/*
 * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.95 2008/08/06 15:38:58 dillon Exp $
 */

#include "hammer.h"

static int hammer_mem_lookup(hammer_cursor_t cursor);
static int hammer_mem_first(hammer_cursor_t cursor);
static int hammer_frontend_trunc_callback(hammer_record_t record,
				void *data __unused);
static int hammer_record_needs_overwrite_delete(hammer_record_t record);
static int hammer_delete_general(hammer_cursor_t cursor, hammer_inode_t ip,
				hammer_btree_leaf_elm_t leaf);

struct rec_trunc_info {
	u_int16_t	rec_type;
	int64_t		trunc_off;
};

/*
 * Red-black tree support.  Comparison code for insertion.
 */
static int
hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2)
{
	if (rec1->leaf.base.rec_type < rec2->leaf.base.rec_type)
		return(-1);
	if (rec1->leaf.base.rec_type > rec2->leaf.base.rec_type)
		return(1);

	if (rec1->leaf.base.key < rec2->leaf.base.key)
		return(-1);
	if (rec1->leaf.base.key > rec2->leaf.base.key)
		return(1);

	/*
	 * Never match against an item deleted by the front-end.
	 *
	 * rec1 is greater than rec2 if rec1 is marked deleted.
	 * rec1 is less than rec2 if rec2 is marked deleted.
	 *
	 * Multiple deleted records may be present, do not return 0
	 * if both are marked deleted.
	 */
	if (rec1->flags & HAMMER_RECF_DELETED_FE)
		return(1);
	if (rec2->flags & HAMMER_RECF_DELETED_FE)
		return(-1);

	return(0);
}
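/*
 * Illustrative note (not from the original source): because a record
 * marked HAMMER_RECF_DELETED_FE compares as strictly greater or less
 * than anything it is tested against, an insertion carrying the same
 * rec_type/key as a deleted record never collides with it; the live
 * and deleted records simply sort adjacent in the RB tree.
 */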
/*
 * Basic record comparison code similar to hammer_btree_cmp().
 */
static int
hammer_rec_cmp(hammer_base_elm_t elm, hammer_record_t rec)
{
	if (elm->rec_type < rec->leaf.base.rec_type)
		return(-3);
	if (elm->rec_type > rec->leaf.base.rec_type)
		return(3);

	if (elm->key < rec->leaf.base.key)
		return(-2);
	if (elm->key > rec->leaf.base.key)
		return(2);

	/*
	 * Never match against an item deleted by the front-end.
	 * elm is less than rec if rec is marked deleted.
	 */
	if (rec->flags & HAMMER_RECF_DELETED_FE)
		return(-1);
	return(0);
}

/*
 * Special LOOKUP_INFO to locate an overlapping record.  This is used by
 * the reservation code to implement small-block records (whose keys will
 * be different depending on data_len, when representing the same base
 * offset).
 *
 * NOTE: The base file offset of a data record is (key - data_len), not (key).
 */
static int
hammer_rec_overlap_compare(hammer_btree_leaf_elm_t leaf, hammer_record_t rec)
{
	if (leaf->base.rec_type < rec->leaf.base.rec_type)
		return(-3);
	if (leaf->base.rec_type > rec->leaf.base.rec_type)
		return(3);

	/*
	 * Overlap compare
	 */
	if (leaf->base.rec_type == HAMMER_RECTYPE_DATA) {
		/* leaf_end <= rec_beg */
		if (leaf->base.key <= rec->leaf.base.key - rec->leaf.data_len)
			return(-2);
		/* leaf_beg >= rec_end */
		if (leaf->base.key - leaf->data_len >= rec->leaf.base.key)
			return(2);
	} else {
		if (leaf->base.key < rec->leaf.base.key)
			return(-2);
		if (leaf->base.key > rec->leaf.base.key)
			return(2);
	}

	/*
	 * Never match against an item deleted by the front-end.
	 * leaf is less than rec if rec is marked deleted.
	 *
	 * We must still return the proper code for the scan to continue
	 * along the correct branches.
	 */
	if (rec->flags & HAMMER_RECF_DELETED_FE) {
		if (leaf->base.key < rec->leaf.base.key)
			return(-2);
		if (leaf->base.key > rec->leaf.base.key)
			return(2);
		return(-1);
	}
	return(0);
}
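/*
 * Worked example (illustrative, not from the original source): a DATA
 * record covering file offsets [0,16384) carries key 16384, so its
 * base offset is key - data_len = 0.  A lookup leaf with key 8192 and
 * data_len 8192 describes [0,8192): neither "leaf_end <= rec_beg"
 * (8192 <= 0) nor "leaf_beg >= rec_end" (0 >= 16384) holds, so the
 * two ranges overlap and the compare above falls through to a match.
 */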
/*
 * RB_SCAN comparison code for hammer_mem_first().  The argument order
 * is reversed so the comparison result has to be negated.  key_beg and
 * key_end are both range-inclusive.
 *
 * Localized deletions are not cached in-memory.
 */
static
int
hammer_rec_scan_cmp(hammer_record_t rec, void *data)
{
	hammer_cursor_t cursor = data;
	int r;

	r = hammer_rec_cmp(&cursor->key_beg, rec);
	if (r > 1)
		return(-1);
	r = hammer_rec_cmp(&cursor->key_end, rec);
	if (r < -1)
		return(1);
	return(0);
}

/*
 * This compare function is used when simply looking up key_beg.
 */
static
int
hammer_rec_find_cmp(hammer_record_t rec, void *data)
{
	hammer_cursor_t cursor = data;
	int r;

	r = hammer_rec_cmp(&cursor->key_beg, rec);
	if (r > 1)
		return(-1);
	if (r < -1)
		return(1);
	return(0);
}

/*
 * Locate blocks within the truncation range.  Partial blocks do not count.
 */
static
int
hammer_rec_trunc_cmp(hammer_record_t rec, void *data)
{
	struct rec_trunc_info *info = data;

	if (rec->leaf.base.rec_type < info->rec_type)
		return(-1);
	if (rec->leaf.base.rec_type > info->rec_type)
		return(1);

	switch(rec->leaf.base.rec_type) {
	case HAMMER_RECTYPE_DB:
		/*
		 * DB record key is not beyond the truncation point, retain.
		 */
		if (rec->leaf.base.key < info->trunc_off)
			return(-1);
		break;
	case HAMMER_RECTYPE_DATA:
		/*
		 * DATA record offset start is not beyond the truncation point,
		 * retain.
		 */
		if (rec->leaf.base.key - rec->leaf.data_len < info->trunc_off)
			return(-1);
		break;
	default:
		panic("hammer_rec_trunc_cmp: unexpected record type");
	}

	/*
	 * The record start is >= the truncation point, return match,
	 * the record should be destroyed.
	 */
	return(0);
}

RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare);
RB_GENERATE_XLOOKUP(hammer_rec_rb_tree, INFO, hammer_record, rb_node,
		    hammer_rec_overlap_compare, hammer_btree_leaf_elm_t);

/*
 * Allocate a record for the caller to finish filling in.  The record is
 * returned referenced.
 */
hammer_record_t
hammer_alloc_mem_record(hammer_inode_t ip, int data_len)
{
	hammer_record_t record;

	++hammer_count_records;
	record = kmalloc(sizeof(*record), M_HAMMER,
			 M_WAITOK | M_ZERO | M_USE_RESERVE);
	record->flush_state = HAMMER_FST_IDLE;
	record->ip = ip;
	record->leaf.base.btype = HAMMER_BTREE_TYPE_RECORD;
	record->leaf.data_len = data_len;
	hammer_ref(&record->lock);

	if (data_len) {
		record->data = kmalloc(data_len, M_HAMMER, M_WAITOK | M_ZERO);
		record->flags |= HAMMER_RECF_ALLOCDATA;
		++hammer_count_record_datas;
	}

	return (record);
}

void
hammer_wait_mem_record_ident(hammer_record_t record, const char *ident)
{
	while (record->flush_state == HAMMER_FST_FLUSH) {
		record->flags |= HAMMER_RECF_WANTED;
		tsleep(record, 0, ident, 0);
	}
}

/*
 * Called from the backend, hammer_inode.c, after a record has been
 * flushed to disk.  The record has been exclusively locked by the
 * caller and interlocked with BE.
 *
 * We clean up the state, unlock, and release the record (the record
 * was referenced by the fact that it was in the HAMMER_FST_FLUSH state).
 */
void
hammer_flush_record_done(hammer_record_t record, int error)
{
	hammer_inode_t target_ip;

	KKASSERT(record->flush_state == HAMMER_FST_FLUSH);
	KKASSERT(record->flags & HAMMER_RECF_INTERLOCK_BE);

	if (error) {
		/*
		 * An error occurred, the backend was unable to sync the
		 * record to its media.  Leave the record intact.
		 */
		hammer_critical_error(record->ip->hmp, record->ip, error,
				      "while flushing record");
	}

	--record->flush_group->refs;
	record->flush_group = NULL;

	if (record->flags & HAMMER_RECF_DELETED_BE) {
		if ((target_ip = record->target_ip) != NULL) {
			TAILQ_REMOVE(&target_ip->target_list, record,
				     target_entry);
			record->target_ip = NULL;
			hammer_test_inode(target_ip);
		}
		record->flush_state = HAMMER_FST_IDLE;
	} else {
		if (record->target_ip) {
			record->flush_state = HAMMER_FST_SETUP;
			hammer_test_inode(record->ip);
			hammer_test_inode(record->target_ip);
		} else {
			record->flush_state = HAMMER_FST_IDLE;
		}
	}
	record->flags &= ~HAMMER_RECF_INTERLOCK_BE;
	if (record->flags & HAMMER_RECF_WANTED) {
		record->flags &= ~HAMMER_RECF_WANTED;
		wakeup(record);
	}
	hammer_rel_mem_record(record);
}
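/*
 * Illustrative note (not from the original source): the WANTED flag and
 * the wakeup() above form the release side of the handshake whose wait
 * side is hammer_wait_mem_record_ident() -- a waiter sets
 * HAMMER_RECF_WANTED and tsleep()s on the record address until the
 * flush completes and HAMMER_FST_FLUSH is cleared here.
 */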
/*
 * Release a memory record.  Records marked for deletion are immediately
 * removed from the RB-Tree but otherwise left intact until the last ref
 * goes away.
 */
void
hammer_rel_mem_record(struct hammer_record *record)
{
	hammer_mount_t hmp;
	hammer_reserve_t resv;
	hammer_inode_t ip;
	hammer_inode_t target_ip;

	hammer_unref(&record->lock);

	if (record->lock.refs == 0) {
		/*
		 * Upon release of the last reference wakeup any waiters.
		 * The record structure may get destroyed so callers will
		 * loop up and do a relookup.
		 *
		 * WARNING! Record must be removed from RB-TREE before we
		 * might possibly block.  hammer_test_inode() can block!
		 */
		ip = record->ip;
		hmp = ip->hmp;

		/*
		 * Upon release of the last reference a record marked deleted
		 * is destroyed.
		 */
		if (record->flags & HAMMER_RECF_DELETED_FE) {
			KKASSERT(ip->lock.refs > 0);
			KKASSERT(record->flush_state != HAMMER_FST_FLUSH);

			/*
			 * target_ip may have zero refs, we have to ref it
			 * to prevent it from being ripped out from under
			 * us.
			 */
			if ((target_ip = record->target_ip) != NULL) {
				TAILQ_REMOVE(&target_ip->target_list,
					     record, target_entry);
				record->target_ip = NULL;
				hammer_ref(&target_ip->lock);
			}

			if (record->flags & HAMMER_RECF_ONRBTREE) {
				RB_REMOVE(hammer_rec_rb_tree,
					  &record->ip->rec_tree,
					  record);
				KKASSERT(ip->rsv_recs > 0);
				--hmp->rsv_recs;
				--ip->rsv_recs;
				hmp->rsv_databytes -= record->leaf.data_len;
				record->flags &= ~HAMMER_RECF_ONRBTREE;

				if (RB_EMPTY(&record->ip->rec_tree)) {
					record->ip->flags &= ~HAMMER_INODE_XDIRTY;
					record->ip->sync_flags &= ~HAMMER_INODE_XDIRTY;
					hammer_test_inode(record->ip);
				}
			}

			/*
			 * We must wait for any direct-IO to complete before
			 * we can destroy the record because the bio may
			 * have a reference to it.
			 */
			if (record->flags &
			   (HAMMER_RECF_DIRECT_IO | HAMMER_RECF_DIRECT_INVAL)) {
				hammer_io_direct_wait(record);
			}

			/*
			 * Do this test after removing record from the RB-Tree.
			 */
			if (target_ip) {
				hammer_test_inode(target_ip);
				hammer_rel_inode(target_ip, 0);
			}

			if (record->flags & HAMMER_RECF_ALLOCDATA) {
				--hammer_count_record_datas;
				kfree(record->data, M_HAMMER);
				record->flags &= ~HAMMER_RECF_ALLOCDATA;
			}

			/*
			 * Release the reservation.  If the record was not
			 * committed return the reservation before
			 * releasing it.
			 */
			if ((resv = record->resv) != NULL) {
				if ((record->flags & HAMMER_RECF_COMMITTED) == 0) {
					hammer_blockmap_reserve_undo(
						resv,
						record->leaf.data_offset,
						record->leaf.data_len);
				}
				hammer_blockmap_reserve_complete(hmp, resv);
				record->resv = NULL;
			}
			record->data = NULL;
			--hammer_count_records;
			kfree(record, M_HAMMER);
		}
	}
}

/*
 * Record visibility depends on whether the record is being accessed by
 * the backend or the frontend.
 *
 * Return non-zero if the record is visible, zero if it isn't or if it is
 * deleted.
 */
static __inline
int
hammer_ip_iterate_mem_good(hammer_cursor_t cursor, hammer_record_t record)
{
	if (cursor->flags & HAMMER_CURSOR_BACKEND) {
		if (record->flags & HAMMER_RECF_DELETED_BE)
			return(0);
	} else {
		if (record->flags & HAMMER_RECF_DELETED_FE)
			return(0);
	}
	return(1);
}
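/*
 * Illustrative example (not from the original source): a record the
 * frontend has marked HAMMER_RECF_DELETED_FE but which the backend has
 * not yet processed is invisible to frontend iterations, yet remains
 * visible to a HAMMER_CURSOR_BACKEND cursor so the flusher can finish
 * syncing or discarding it.
 */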
/*
 * This callback is used as part of the RB_SCAN function for in-memory
 * records.  We terminate it (return -1) as soon as we get a match.
 *
 * This routine is used by frontend code.
 *
 * The primary compare code does not account for ASOF lookups.  This
 * code handles that case as well as a few others.
 */
static
int
hammer_rec_scan_callback(hammer_record_t rec, void *data)
{
	hammer_cursor_t cursor = data;

	/*
	 * We terminate on success, so this should be NULL on entry.
	 */
	KKASSERT(cursor->iprec == NULL);

	/*
	 * Skip if the record was marked deleted.
	 */
	if (hammer_ip_iterate_mem_good(cursor, rec) == 0)
		return(0);

	/*
	 * Skip if not visible due to our as-of TID
	 */
	if (cursor->flags & HAMMER_CURSOR_ASOF) {
		if (cursor->asof < rec->leaf.base.create_tid)
			return(0);
		if (rec->leaf.base.delete_tid &&
		    cursor->asof >= rec->leaf.base.delete_tid) {
			return(0);
		}
	}

	/*
	 * ref the record.  The record is protected from backend B-Tree
	 * interactions by virtue of the cursor's IP lock.
	 */
	hammer_ref(&rec->lock);

	/*
	 * The record may have been deleted while we were blocked.
	 */
	if (hammer_ip_iterate_mem_good(cursor, rec) == 0) {
		hammer_rel_mem_record(rec);
		return(0);
	}

	/*
	 * Set the matching record and stop the scan.
	 */
	cursor->iprec = rec;
	return(-1);
}


/*
 * Lookup an in-memory record given the key specified in the cursor.  Works
 * just like hammer_btree_lookup() but operates on an inode's in-memory
 * record list.
 *
 * The lookup must fail if the record is marked for deferred deletion.
 */
static
int
hammer_mem_lookup(hammer_cursor_t cursor)
{
	int error;

	KKASSERT(cursor->ip);
	if (cursor->iprec) {
		hammer_rel_mem_record(cursor->iprec);
		cursor->iprec = NULL;
	}
	hammer_rec_rb_tree_RB_SCAN(&cursor->ip->rec_tree, hammer_rec_find_cmp,
				   hammer_rec_scan_callback, cursor);

	if (cursor->iprec == NULL)
		error = ENOENT;
	else
		error = 0;
	return(error);
}

/*
 * hammer_mem_first() - locate the first in-memory record matching the
 * cursor within the bounds of the key range.
 */
static
int
hammer_mem_first(hammer_cursor_t cursor)
{
	hammer_inode_t ip;

	ip = cursor->ip;
	KKASSERT(ip != NULL);

	if (cursor->iprec) {
		hammer_rel_mem_record(cursor->iprec);
		cursor->iprec = NULL;
	}

	hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_scan_cmp,
				   hammer_rec_scan_callback, cursor);

	/*
	 * Adjust scan.node and keep it linked into the RB-tree so we can
	 * hold the cursor through third party modifications of the RB-tree.
	 */
	if (cursor->iprec)
		return(0);
	return(ENOENT);
}
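/*
 * Illustrative note (not from the original source): RB_SCAN uses the
 * scan-cmp return value to prune subtrees -- hammer_rec_scan_cmp()
 * returns -1 for records entirely below key_beg and 1 for records
 * entirely above key_end, so only records inside the inclusive
 * [key_beg, key_end] window are handed to the callback above.
 */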
/************************************************************************
 *		     HAMMER IN-MEMORY RECORD FUNCTIONS			*
 ************************************************************************
 *
 * These functions manipulate in-memory records.  Such records typically
 * exist prior to being committed to disk or indexed via the on-disk B-Tree.
 */

/*
 * Add a directory entry (dip,ncp) which references inode (ip).
 *
 * Note that the low 32 bits of the namekey are set temporarily to create
 * a unique in-memory record, and may be modified a second time when the
 * record is synchronized to disk.  In particular, the low 32 bits cannot be
 * all 0's when synching to disk, which is not handled here.
 *
 * NOTE: bytes does not include any terminating \0 on name, and name might
 * not be terminated.
 */
int
hammer_ip_add_directory(struct hammer_transaction *trans,
		     struct hammer_inode *dip, const char *name, int bytes,
		     struct hammer_inode *ip)
{
	struct hammer_cursor cursor;
	hammer_record_t record;
	int error;
	int count;
	u_int32_t iterator;

	record = hammer_alloc_mem_record(dip, HAMMER_ENTRY_SIZE(bytes));
	if (++trans->hmp->namekey_iterator == 0)
		++trans->hmp->namekey_iterator;

	record->type = HAMMER_MEM_RECORD_ADD;
	record->leaf.base.localization = dip->obj_localization +
					 HAMMER_LOCALIZE_MISC;
	record->leaf.base.obj_id = dip->obj_id;
	record->leaf.base.key = hammer_directory_namekey(name, bytes);
	record->leaf.base.key += trans->hmp->namekey_iterator;
	record->leaf.base.rec_type = HAMMER_RECTYPE_DIRENTRY;
	record->leaf.base.obj_type = ip->ino_leaf.base.obj_type;
	record->data->entry.obj_id = ip->obj_id;
	record->data->entry.localization = ip->obj_localization;
	bcopy(name, record->data->entry.name, bytes);

	++ip->ino_data.nlinks;
	hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);

	/*
	 * Find an unused namekey.  Both the in-memory record tree and
	 * the B-Tree are checked.  Exact matches also match create_tid
	 * so use an ASOF search to (mostly) ignore it.
	 *
	 * delete-visibility is set so pending deletions do not give us
	 * a false-negative on our ability to use an iterator.
	 */
	hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
	cursor.key_beg = record->leaf.base;
	cursor.flags |= HAMMER_CURSOR_ASOF;
	cursor.flags |= HAMMER_CURSOR_DELETE_VISIBILITY;
	cursor.asof = ip->obj_asof;

	count = 0;
	while (hammer_ip_lookup(&cursor) == 0) {
		iterator = (u_int32_t)record->leaf.base.key + 1;
		if (iterator == 0)
			iterator = 1;
		record->leaf.base.key &= ~0xFFFFFFFFLL;
		record->leaf.base.key |= iterator;
		cursor.key_beg.key = record->leaf.base.key;
		if (++count == 1000000000) {
			hammer_rel_mem_record(record);
			error = ENOSPC;
			goto failed;
		}
	}

	/*
	 * The target inode and the directory entry are bound together.
	 */
	record->target_ip = ip;
	record->flush_state = HAMMER_FST_SETUP;
	TAILQ_INSERT_TAIL(&ip->target_list, record, target_entry);

	/*
	 * The inode now has a dependency and must be taken out of the idle
	 * state.  An inode not in an idle state is given an extra reference.
	 *
	 * When transitioning to a SETUP state flag for an automatic reflush
	 * when the dependencies are disposed of if someone is waiting on
	 * the inode.
	 */
	if (ip->flush_state == HAMMER_FST_IDLE) {
		hammer_ref(&ip->lock);
		ip->flush_state = HAMMER_FST_SETUP;
		if (ip->flags & HAMMER_INODE_FLUSHW)
			ip->flags |= HAMMER_INODE_REFLUSH;
	}
	error = hammer_mem_add(record);
	if (error == 0) {
		dip->ino_data.mtime = trans->time;
		hammer_modify_inode(dip, HAMMER_INODE_MTIME);
	}
failed:
	hammer_done_cursor(&cursor);
	return(error);
}
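/*
 * Worked example (illustrative, not from the original source): if
 * hammer_directory_namekey() hashes a name to 0x1234567800000000 and
 * namekey_iterator is currently 7, the initial in-memory key becomes
 * 0x1234567800000007.  Should hammer_ip_lookup() report a collision,
 * the loop above bumps only the low 32 bits (wrapping 0 to 1, since an
 * all-zero low half is reserved) until an unused key is found.
 */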
/*
 * Delete the directory entry and update the inode link count.  The
 * cursor must be positioned at the directory entry record being deleted.
 *
 * The related inode should be share-locked by the caller.  The caller is
 * on the frontend.
 *
 * This function can return EDEADLK requiring the caller to terminate
 * the cursor, any locks, wait on the returned record, and retry.
 */
int
hammer_ip_del_directory(struct hammer_transaction *trans,
		     hammer_cursor_t cursor, struct hammer_inode *dip,
		     struct hammer_inode *ip)
{
	hammer_record_t record;
	int error;

	if (hammer_cursor_inmem(cursor)) {
		/*
		 * In-memory (unsynchronized) records can simply be freed.
		 * Even though the HAMMER_RECF_DELETED_FE flag is ignored
		 * by the backend, we must still avoid races against the
		 * backend potentially syncing the record to the media.
		 *
		 * We cannot call hammer_ip_delete_record(), that routine may
		 * only be called from the backend.
		 */
		record = cursor->iprec;
		if (record->flags & HAMMER_RECF_INTERLOCK_BE) {
			KKASSERT(cursor->deadlk_rec == NULL);
			hammer_ref(&record->lock);
			cursor->deadlk_rec = record;
			error = EDEADLK;
		} else {
			KKASSERT(record->type == HAMMER_MEM_RECORD_ADD);
			record->flags |= HAMMER_RECF_DELETED_FE;
			error = 0;
		}
	} else {
		/*
		 * If the record is on-disk we have to queue the deletion by
		 * the record's key.  This also causes lookups to skip the
		 * record.
		 */
		KKASSERT(dip->flags &
			 (HAMMER_INODE_ONDISK | HAMMER_INODE_DONDISK));
		record = hammer_alloc_mem_record(dip, 0);
		record->type = HAMMER_MEM_RECORD_DEL;
		record->leaf.base = cursor->leaf->base;

		record->target_ip = ip;
		record->flush_state = HAMMER_FST_SETUP;
		TAILQ_INSERT_TAIL(&ip->target_list, record, target_entry);

		/*
		 * The inode now has a dependency and must be taken out of
		 * the idle state.  An inode not in an idle state is given
		 * an extra reference.
		 *
		 * When transitioning to a SETUP state flag for an automatic
		 * reflush when the dependencies are disposed of if someone
		 * is waiting on the inode.
		 */
		if (ip->flush_state == HAMMER_FST_IDLE) {
			hammer_ref(&ip->lock);
			ip->flush_state = HAMMER_FST_SETUP;
			if (ip->flags & HAMMER_INODE_FLUSHW)
				ip->flags |= HAMMER_INODE_REFLUSH;
		}

		error = hammer_mem_add(record);
	}

	/*
	 * One less link.  The file may still be open in the OS even after
	 * all links have gone away.
	 *
	 * We have to terminate the cursor before syncing the inode to
	 * avoid deadlocking against ourselves.  XXX this may no longer
	 * be true.
	 *
	 * If nlinks drops to zero and the vnode is inactive (or there is
	 * no vnode), call hammer_inode_unloadable_check() to zonk the
	 * inode.  If we don't do this here the inode will not be destroyed
	 * on-media until we unmount.
	 */
	if (error == 0) {
		--ip->ino_data.nlinks;
		hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
		if (ip->ino_data.nlinks == 0 &&
		    (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))) {
			hammer_done_cursor(cursor);
			hammer_inode_unloadable_check(ip, 1);
			hammer_flush_inode(ip, 0);
		}
		dip->ino_data.mtime = trans->time;
		hammer_modify_inode(dip, HAMMER_INODE_MTIME);
	}
	return(error);
}
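/*
 * Illustrative example (not from the original source): unlink(2) of
 * the last link to a file a process still has open drops nlinks to 0
 * while the vnode remains active, so the unloadable check above is
 * skipped and on-media destruction is deferred; the check is repeated
 * when the vnode is later deactivated or reclaimed.
 */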
/*
 * Add a record to an inode.
 *
 * The caller must allocate the record with hammer_alloc_mem_record(ip) and
 * initialize the following additional fields:
 *
 * The related inode should be share-locked by the caller.  The caller is
 * on the frontend.
 *
 * record->rec.entry.base.base.key
 * record->rec.entry.base.base.rec_type
 * record->rec.entry.base.base.data_len
 * record->data		(a copy will be kmalloc'd if it cannot be embedded)
 */
int
hammer_ip_add_record(struct hammer_transaction *trans, hammer_record_t record)
{
	hammer_inode_t ip = record->ip;
	int error;

	KKASSERT(record->leaf.base.localization != 0);
	record->leaf.base.obj_id = ip->obj_id;
	record->leaf.base.obj_type = ip->ino_leaf.base.obj_type;
	error = hammer_mem_add(record);
	return(error);
}

/*
 * Locate a bulk record in-memory.  Bulk records allow disk space to be
 * reserved so the front-end can flush large data writes without having
 * to queue the BIO to the flusher.  Only the related record gets queued
 * to the flusher.
 */
static hammer_record_t
hammer_ip_get_bulk(hammer_inode_t ip, off_t file_offset, int bytes)
{
	hammer_record_t record;
	struct hammer_btree_leaf_elm leaf;

	bzero(&leaf, sizeof(leaf));
	leaf.base.obj_id = ip->obj_id;
	leaf.base.key = file_offset + bytes;
	leaf.base.create_tid = 0;
	leaf.base.delete_tid = 0;
	leaf.base.rec_type = HAMMER_RECTYPE_DATA;
	leaf.base.obj_type = 0;				/* unused */
	leaf.base.btype = HAMMER_BTREE_TYPE_RECORD;	/* unused */
	leaf.base.localization = ip->obj_localization + HAMMER_LOCALIZE_MISC;
	leaf.data_len = bytes;

	record = hammer_rec_rb_tree_RB_LOOKUP_INFO(&ip->rec_tree, &leaf);
	if (record)
		hammer_ref(&record->lock);
	return(record);
}
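/*
 * Worked example (illustrative, not from the original source): a bulk
 * write of 16384 bytes at file_offset 32768 is looked up with
 * leaf.base.key = 49152 and data_len = 16384.  The RB_LOOKUP_INFO
 * search uses hammer_rec_overlap_compare(), so any in-memory DATA
 * record whose byte range [key - data_len, key) intersects
 * [32768, 49152) is returned, however the conflicting write was sized.
 */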
/*
 * Reserve blockmap space placemarked with an in-memory record.
 *
 * This routine is called by the frontend in order to be able to directly
 * flush a buffer cache buffer.  The frontend has locked the related buffer
 * cache buffers and we should be able to manipulate any overlapping
 * in-memory records.
 *
 * The caller is responsible for adding the returned record.
 */
hammer_record_t
hammer_ip_add_bulk(hammer_inode_t ip, off_t file_offset, void *data, int bytes,
		   int *errorp)
{
	hammer_record_t record;
	hammer_record_t conflict;
	int zone;

	/*
	 * Deal with conflicting in-memory records.  We cannot have multiple
	 * in-memory records for the same offset without seriously confusing
	 * the backend, including but not limited to the backend issuing
	 * delete-create-delete sequences and asserting on the delete_tid
	 * being the same as the create_tid.
	 *
	 * If we encounter a record with the backend interlock set we cannot
	 * immediately delete it without confusing the backend.
	 */
	while ((conflict = hammer_ip_get_bulk(ip, file_offset, bytes)) != NULL) {
		if (conflict->flags & HAMMER_RECF_INTERLOCK_BE) {
			conflict->flags |= HAMMER_RECF_WANTED;
			tsleep(conflict, 0, "hmrrc3", 0);
		} else {
			conflict->flags |= HAMMER_RECF_DELETED_FE;
		}
		hammer_rel_mem_record(conflict);
	}

	/*
	 * Create a record to cover the direct write.  This is called with
	 * the related BIO locked so there should be no possible conflict.
	 *
	 * The backend is responsible for finalizing the space reserved in
	 * this record.
	 *
	 * XXX bytes not aligned, depend on the reservation code to
	 * align the reservation.
	 */
	record = hammer_alloc_mem_record(ip, 0);
	zone = (bytes >= HAMMER_BUFSIZE) ? HAMMER_ZONE_LARGE_DATA_INDEX :
					   HAMMER_ZONE_SMALL_DATA_INDEX;
	record->resv = hammer_blockmap_reserve(ip->hmp, zone, bytes,
					       &record->leaf.data_offset,
					       errorp);
	if (record->resv == NULL) {
		kprintf("hammer_ip_add_bulk: reservation failed\n");
		hammer_rel_mem_record(record);
		return(NULL);
	}
	record->type = HAMMER_MEM_RECORD_DATA;
	record->leaf.base.rec_type = HAMMER_RECTYPE_DATA;
	record->leaf.base.obj_type = ip->ino_leaf.base.obj_type;
	record->leaf.base.obj_id = ip->obj_id;
	record->leaf.base.key = file_offset + bytes;
	record->leaf.base.localization = ip->obj_localization +
					 HAMMER_LOCALIZE_MISC;
	record->leaf.data_len = bytes;
	hammer_crc_set_leaf(data, &record->leaf);
	KKASSERT(*errorp == 0);
	return(record);
}

/*
 * Frontend truncation code.  Scan in-memory records only.  On-disk records
 * and records in a flushing state are handled by the backend.  The vnops
 * setattr code will handle the block containing the truncation point.
 *
 * Partial blocks are not deleted.
 */
int
hammer_ip_frontend_trunc(struct hammer_inode *ip, off_t file_size)
{
	struct rec_trunc_info info;

	switch(ip->ino_data.obj_type) {
	case HAMMER_OBJTYPE_REGFILE:
		info.rec_type = HAMMER_RECTYPE_DATA;
		break;
	case HAMMER_OBJTYPE_DBFILE:
		info.rec_type = HAMMER_RECTYPE_DB;
		break;
	default:
		return(EINVAL);
	}
	info.trunc_off = file_size;
	hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_trunc_cmp,
				   hammer_frontend_trunc_callback, &info);
	return(0);
}

static int
hammer_frontend_trunc_callback(hammer_record_t record, void *data __unused)
{
	if (record->flags & HAMMER_RECF_DELETED_FE)
		return(0);
	if (record->flush_state == HAMMER_FST_FLUSH)
		return(0);
	KKASSERT((record->flags & HAMMER_RECF_INTERLOCK_BE) == 0);
	hammer_ref(&record->lock);
	record->flags |= HAMMER_RECF_DELETED_FE;
	hammer_rel_mem_record(record);
	return(0);
}

/*
 * Return 1 if the caller must check for and delete existing records
 * before writing out a new data record.
 *
 * Return 0 if the caller can just insert the record into the B-Tree without
 * checking.
 */
static int
hammer_record_needs_overwrite_delete(hammer_record_t record)
{
	hammer_inode_t ip = record->ip;
	int64_t file_offset;
	int r;

	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE)
		file_offset = record->leaf.base.key;
	else
		file_offset = record->leaf.base.key - record->leaf.data_len;
	r = (file_offset < ip->save_trunc_off);
	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
		if (ip->save_trunc_off <= record->leaf.base.key)
			ip->save_trunc_off = record->leaf.base.key + 1;
	} else {
		if (ip->save_trunc_off < record->leaf.base.key)
			ip->save_trunc_off = record->leaf.base.key;
	}
	return(r);
}
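/*
 * Worked example (illustrative, not from the original source): suppose
 * save_trunc_off is 4096 for a regular file and a DATA record covering
 * [0,16384) (key 16384) is about to be synced.  Its base offset 0 is
 * below 4096, so the function returns 1 and the caller must first
 * delete any overlapping on-disk records; save_trunc_off then advances
 * to 16384, letting the next sequential record skip the check.
 */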
/*
 * Backend code.  Sync a record to the media.
 */
int
hammer_ip_sync_record_cursor(hammer_cursor_t cursor, hammer_record_t record)
{
	hammer_transaction_t trans = cursor->trans;
	int64_t file_offset;
	int bytes;
	void *bdata;
	int error;
	int doprop;

	KKASSERT(record->flush_state == HAMMER_FST_FLUSH);
	KKASSERT(record->flags & HAMMER_RECF_INTERLOCK_BE);
	KKASSERT(record->leaf.base.localization != 0);

	/*
	 * Any direct-write related to the record must complete before we
	 * can sync the record to the on-disk media.
	 */
	if (record->flags & (HAMMER_RECF_DIRECT_IO | HAMMER_RECF_DIRECT_INVAL))
		hammer_io_direct_wait(record);

	/*
	 * If this is a bulk-data record placemarker there may be an existing
	 * record on-disk, indicating a data overwrite.  If there is the
	 * on-disk record must be deleted before we can insert our new record.
	 *
	 * We've synthesized this record and do not know what the create_tid
	 * on-disk is, nor how much data it represents.
	 *
	 * Keep in mind that (key) for data records is (base_offset + len),
	 * not (base_offset).  Also, we only want to get rid of on-disk
	 * records since we are trying to sync our in-memory record, call
	 * hammer_ip_delete_range() with truncating set to 1 to make sure
	 * it skips in-memory records.
	 *
	 * It is ok for the lookup to return ENOENT.
	 *
	 * NOTE OPTIMIZATION: sync_trunc_off is used to determine if we have
	 * to call hammer_ip_delete_range() or not.  This also means we must
	 * update sync_trunc_off() as we write.
	 */
	if (record->type == HAMMER_MEM_RECORD_DATA &&
	    hammer_record_needs_overwrite_delete(record)) {
		file_offset = record->leaf.base.key - record->leaf.data_len;
		bytes = (record->leaf.data_len + HAMMER_BUFMASK) &
			~HAMMER_BUFMASK;
		KKASSERT((file_offset & HAMMER_BUFMASK) == 0);
		error = hammer_ip_delete_range(
				cursor, record->ip,
				file_offset, file_offset + bytes - 1,
				1);
		if (error && error != ENOENT)
			goto done;
	}

	/*
	 * If this is a general record there may be an on-disk version
	 * that must be deleted before we can insert the new record.
	 */
	if (record->type == HAMMER_MEM_RECORD_GENERAL) {
		error = hammer_delete_general(cursor, record->ip,
					      &record->leaf);
		if (error && error != ENOENT)
			goto done;
	}

	/*
	 * Setup the cursor.
	 */
	hammer_normalize_cursor(cursor);
	cursor->key_beg = record->leaf.base;
	cursor->flags &= ~HAMMER_CURSOR_INITMASK;
	cursor->flags |= HAMMER_CURSOR_BACKEND;
	cursor->flags &= ~HAMMER_CURSOR_INSERT;

	/*
	 * Records can wind up on-media before the inode itself is on-media.
	 * Flag the case.
	 */
	record->ip->flags |= HAMMER_INODE_DONDISK;

	/*
	 * If we are deleting a directory entry an exact match must be
	 * found on-disk.
	 */
	if (record->type == HAMMER_MEM_RECORD_DEL) {
		error = hammer_btree_lookup(cursor);
		if (error == 0) {
			KKASSERT(cursor->iprec == NULL);
			error = hammer_ip_delete_record(cursor, record->ip,
							trans->tid);
			if (error == 0) {
				record->flags |= HAMMER_RECF_DELETED_FE;
				record->flags |= HAMMER_RECF_DELETED_BE;
				record->flags |= HAMMER_RECF_COMMITTED;
			}
		}
		goto done;
	}
	/*
	 * We are inserting.
	 *
	 * Issue a lookup to position the cursor and locate the cluster.  The
	 * target key should not exist.  If we are creating a directory entry
	 * we may have to iterate the low 32 bits of the key to find an unused
	 * key.
	 */
	hammer_sync_lock_sh(trans);
	cursor->flags |= HAMMER_CURSOR_INSERT;
	error = hammer_btree_lookup(cursor);
	if (hammer_debug_inode)
		kprintf("DOINSERT LOOKUP %d\n", error);
	if (error == 0) {
		kprintf("hammer_ip_sync_record: duplicate rec "
			"at (%016llx)\n", record->leaf.base.key);
		Debugger("duplicate record1");
		error = EIO;
	}
#if 0
	if (record->type == HAMMER_MEM_RECORD_DATA)
		kprintf("sync_record %016llx ---------------- %016llx %d\n",
			record->leaf.base.key - record->leaf.data_len,
			record->leaf.data_offset, error);
#endif

	if (error != ENOENT)
		goto done_unlock;

	/*
	 * Allocate the record and data.  The result buffers will be
	 * marked as being modified and further calls to
	 * hammer_modify_buffer() will result in unneeded UNDO records.
	 *
	 * Support zero-fill records (data == NULL and data_len != 0)
	 */
	if (record->type == HAMMER_MEM_RECORD_DATA) {
		/*
		 * The data portion of a bulk-data record has already been
		 * committed to disk, we need only adjust the layer2
		 * statistics in the same transaction as our B-Tree insert.
		 */
		KKASSERT(record->leaf.data_offset != 0);
		error = hammer_blockmap_finalize(trans,
						 record->leaf.data_offset,
						 record->leaf.data_len);
	} else if (record->data && record->leaf.data_len) {
		/*
		 * Wholly cached record, with data.  Allocate the data.
		 */
		bdata = hammer_alloc_data(trans, record->leaf.data_len,
					  record->leaf.base.rec_type,
					  &record->leaf.data_offset,
					  &cursor->data_buffer, &error);
		if (bdata == NULL)
			goto done_unlock;
		hammer_crc_set_leaf(record->data, &record->leaf);
		hammer_modify_buffer(trans, cursor->data_buffer, NULL, 0);
		bcopy(record->data, bdata, record->leaf.data_len);
		hammer_modify_buffer_done(cursor->data_buffer);
	} else {
		/*
		 * Wholly cached record, without data.
		 */
		record->leaf.data_offset = 0;
		record->leaf.data_crc = 0;
	}

	error = hammer_btree_insert(cursor, &record->leaf, &doprop);
	if (hammer_debug_inode && error)
		kprintf("BTREE INSERT error %d @ %016llx:%d key %016llx\n",
			error, cursor->node->node_offset, cursor->index,
			record->leaf.base.key);

	/*
	 * Our record is on-disk, normally mark the in-memory version as
	 * deleted.  If the record represented a directory deletion but
	 * we had to sync a valid directory entry to disk we must convert
	 * the record to a covering delete so the frontend does not have
	 * visibility on the synced entry.
	 */
	if (error == 0) {
		if (doprop) {
			hammer_btree_do_propagation(cursor,
						    record->ip->pfsm,
						    &record->leaf);
		}
		if (record->flags & HAMMER_RECF_CONVERT_DELETE) {
			KKASSERT(record->type == HAMMER_MEM_RECORD_ADD);
			record->flags &= ~HAMMER_RECF_DELETED_FE;
			record->type = HAMMER_MEM_RECORD_DEL;
			KKASSERT(record->flush_state == HAMMER_FST_FLUSH);
			record->flags &= ~HAMMER_RECF_CONVERT_DELETE;
			/* hammer_flush_record_done takes care of the rest */
		} else {
			record->flags |= HAMMER_RECF_DELETED_FE;
			record->flags |= HAMMER_RECF_DELETED_BE;
		}
		record->flags |= HAMMER_RECF_COMMITTED;
	} else {
		if (record->leaf.data_offset) {
			hammer_blockmap_free(trans, record->leaf.data_offset,
					     record->leaf.data_len);
		}
	}
done_unlock:
	hammer_sync_unlock(trans);
done:
	return(error);
}

/*
 * Add the record to the inode's rec_tree.  The low 32 bits of a directory
 * entry's key is used to deal with hash collisions in the upper 32 bits.
 * A unique 64 bit key is generated in-memory and may be regenerated a
 * second time when the directory record is flushed to the on-disk B-Tree.
 *
 * A referenced record is passed to this function.  This function
 * eats the reference.  If an error occurs the record will be deleted.
 *
 * A copy of the temporary record->data pointer provided by the caller
 * will be made.
 */
int
hammer_mem_add(hammer_record_t record)
{
	hammer_mount_t hmp = record->ip->hmp;

	/*
	 * Make a private copy of record->data
	 */
	if (record->data)
		KKASSERT(record->flags & HAMMER_RECF_ALLOCDATA);

	/*
	 * Insert into the RB tree.  A unique key should have already
	 * been selected if this is a directory entry.
	 */
	if (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) {
		record->flags |= HAMMER_RECF_DELETED_FE;
		hammer_rel_mem_record(record);
		return (EEXIST);
	}
	++hmp->count_newrecords;
	++hmp->rsv_recs;
	++record->ip->rsv_recs;
	record->ip->hmp->rsv_databytes += record->leaf.data_len;
	record->flags |= HAMMER_RECF_ONRBTREE;
	hammer_modify_inode(record->ip, HAMMER_INODE_XDIRTY);
	hammer_rel_mem_record(record);
	return(0);
}
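/*
 * Usage note (illustrative, not from the original source): because
 * hammer_mem_add() consumes the caller's reference on both the success
 * and failure paths, callers such as hammer_ip_add_directory() and
 * hammer_ip_add_record() simply propagate its return value and never
 * hammer_rel_mem_record() afterwards.
 */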
/************************************************************************
 *		     HAMMER INODE MERGED-RECORD FUNCTIONS		*
 ************************************************************************
 *
 * These functions augment the B-Tree scanning functions in hammer_btree.c
 * by merging in-memory records with on-disk records.
 */

/*
 * Locate a particular record either in-memory or on-disk.
 *
 * NOTE: This is basically a standalone routine, hammer_ip_next() may
 * NOT be called to iterate results.
 */
int
hammer_ip_lookup(hammer_cursor_t cursor)
{
	int error;

	/*
	 * If the element is in-memory return it without searching the
	 * on-disk B-Tree
	 */
	KKASSERT(cursor->ip);
	error = hammer_mem_lookup(cursor);
	if (error == 0) {
		cursor->leaf = &cursor->iprec->leaf;
		return(error);
	}
	if (error != ENOENT)
		return(error);

	/*
	 * If the inode has on-disk components search the on-disk B-Tree.
	 */
	if ((cursor->ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) == 0)
		return(error);
	error = hammer_btree_lookup(cursor);
	if (error == 0)
		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_LEAF);
	return(error);
}

/*
 * Locate the first record within the cursor's key_beg/key_end range,
 * restricted to a particular inode.  0 is returned on success, ENOENT
 * if no records matched the requested range, or some other error.
 *
 * When 0 is returned hammer_ip_next() may be used to iterate additional
 * records within the requested range.
 *
 * This function can return EDEADLK, requiring the caller to terminate
 * the cursor and try again.
 */
int
hammer_ip_first(hammer_cursor_t cursor)
{
	hammer_inode_t ip = cursor->ip;
	int error;

	KKASSERT(ip != NULL);

	/*
	 * Clean up fields and setup for merged scan
	 */
	cursor->flags &= ~HAMMER_CURSOR_RETEST;
	cursor->flags |= HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM;
	cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_MEMEOF;
	if (cursor->iprec) {
		hammer_rel_mem_record(cursor->iprec);
		cursor->iprec = NULL;
	}

	/*
	 * Search the on-disk B-Tree.  hammer_btree_lookup() only does an
	 * exact lookup so if we get ENOENT we have to call the iterate
	 * function to validate the first record after the begin key.
	 *
	 * The ATEDISK flag is used by hammer_btree_iterate to determine
	 * whether it must index forwards or not.  It is also used here
	 * to select the next record from in-memory or on-disk.
	 *
	 * EDEADLK can only occur if the lookup hit an empty internal
	 * element and couldn't delete it.  Since this could only occur
	 * in-range, we can just iterate from the failure point.
	 */
	if (ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) {
		error = hammer_btree_lookup(cursor);
		if (error == ENOENT || error == EDEADLK) {
			cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
			if (hammer_debug_general & 0x2000)
				kprintf("error %d node %p %016llx index %d\n",
					error, cursor->node,
					cursor->node->node_offset,
					cursor->index);
			error = hammer_btree_iterate(cursor);
		}
		if (error && error != ENOENT)
			return(error);
		if (error == 0) {
			cursor->flags &= ~HAMMER_CURSOR_DISKEOF;
			cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
		} else {
			cursor->flags |= HAMMER_CURSOR_ATEDISK;
		}
	}

	/*
	 * Search the in-memory record list (Red-Black tree).  Unlike the
	 * B-Tree search, mem_first checks for records in the range.
	 */
	error = hammer_mem_first(cursor);
	if (error && error != ENOENT)
		return(error);
	if (error == 0) {
		cursor->flags &= ~HAMMER_CURSOR_MEMEOF;
		cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
		if (hammer_ip_iterate_mem_good(cursor, cursor->iprec) == 0)
			cursor->flags |= HAMMER_CURSOR_ATEMEM;
	}

	/*
	 * This will return the first matching record.
	 */
	return(hammer_ip_next(cursor));
}
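/*
 * Typical merged-scan loop (sketch, not from the original source; the
 * same pattern appears verbatim in hammer_ip_delete_range() and
 * hammer_ip_delete_clean() below):
 *
 *	error = hammer_ip_first(cursor);
 *	while (error == 0) {
 *		...inspect cursor->leaf...
 *		error = hammer_ip_next(cursor);
 *	}
 *	if (error == ENOENT)
 *		error = 0;
 */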
/*
 * Retrieve the next record in a merged iteration within the bounds of the
 * cursor.  This call may be made multiple times after the cursor has been
 * initially searched with hammer_ip_first().
 *
 * 0 is returned on success, ENOENT if no further records match the
 * requested range, or some other error code is returned.
 */
int
hammer_ip_next(hammer_cursor_t cursor)
{
	hammer_btree_elm_t elm;
	hammer_record_t rec, save;
	int error;
	int r;

next_btree:
	/*
	 * Load the current on-disk and in-memory record.  If we ate any
	 * records we have to get the next one.
	 *
	 * If we deleted the last on-disk record we had scanned ATEDISK will
	 * be clear and RETEST will be set, forcing a call to iterate.  The
	 * fact that ATEDISK is clear causes iterate to re-test the 'current'
	 * element.  If ATEDISK is set, iterate will skip the 'current'
	 * element.
	 *
	 * Get the next on-disk record
	 */
	if (cursor->flags & (HAMMER_CURSOR_ATEDISK|HAMMER_CURSOR_RETEST)) {
		if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
			error = hammer_btree_iterate(cursor);
			cursor->flags &= ~HAMMER_CURSOR_RETEST;
			if (error == 0) {
				cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
				hammer_cache_node(&cursor->ip->cache[1],
						  cursor->node);
			} else {
				cursor->flags |= HAMMER_CURSOR_DISKEOF |
						 HAMMER_CURSOR_ATEDISK;
			}
		}
	}

next_memory:
	/*
	 * Get the next in-memory record.
	 *
	 * hammer_rec_scan_cmp:  Is the record still in our general range,
	 *			 (non-inclusive of snapshot exclusions)?
	 * hammer_rec_scan_callback: Is the record in our snapshot?
	 */
	if (cursor->flags & HAMMER_CURSOR_ATEMEM) {
		if ((cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) {
			save = cursor->iprec;
			cursor->iprec = NULL;
			rec = save ? hammer_rec_rb_tree_RB_NEXT(save) : NULL;
			while (rec) {
				if (hammer_rec_scan_cmp(rec, cursor) != 0)
					break;
				if (hammer_rec_scan_callback(rec, cursor) != 0)
					break;
				rec = hammer_rec_rb_tree_RB_NEXT(rec);
			}
			if (save)
				hammer_rel_mem_record(save);
			if (cursor->iprec) {
				KKASSERT(cursor->iprec == rec);
				cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
			} else {
				cursor->flags |= HAMMER_CURSOR_MEMEOF;
			}
		}
	}

	/*
	 * The memory record may have become stale while being held in
	 * cursor->iprec.  We are interlocked against the backend with
	 * regards to B-Tree entries.
	 */
	if ((cursor->flags & HAMMER_CURSOR_ATEMEM) == 0) {
		if (hammer_ip_iterate_mem_good(cursor, cursor->iprec) == 0) {
			cursor->flags |= HAMMER_CURSOR_ATEMEM;
			goto next_memory;
		}
	}

	/*
	 * Extract either the disk or memory record depending on their
	 * relative position.
	 */
	error = 0;
	switch(cursor->flags & (HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM)) {
	case 0:
		/*
		 * Both entries valid.   Compare the entries and nominally
		 * return the first one in the sort order.  Numerous cases
		 * require special attention, however.
		 */
		elm = &cursor->node->ondisk->elms[cursor->index];
		r = hammer_btree_cmp(&elm->base, &cursor->iprec->leaf.base);

		/*
		 * If the two entries differ only by their key (-2/2) or
		 * create_tid (-1/1), and are DATA records, we may have a
		 * nominal match.  We have to calculate the base file
		 * offset of the data.
		 */
		if (r <= 2 && r >= -2 && r != 0 &&
		    cursor->ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE &&
		    cursor->iprec->type == HAMMER_MEM_RECORD_DATA) {
			int64_t base1 = elm->leaf.base.key - elm->leaf.data_len;
			int64_t base2 = cursor->iprec->leaf.base.key -
					cursor->iprec->leaf.data_len;
			if (base1 == base2)
				r = 0;
		}

		if (r < 0) {
			error = hammer_btree_extract(cursor,
						     HAMMER_CURSOR_GET_LEAF);
			cursor->flags |= HAMMER_CURSOR_ATEDISK;
			break;
		}

		/*
		 * If the entries match exactly the memory entry is either
		 * an on-disk directory entry deletion or a bulk data
		 * overwrite.  If it is a directory entry deletion we eat
		 * both entries.
		 *
		 * For the bulk-data overwrite case it is possible to have
		 * visibility into both, which simply means the syncer
		 * hasn't gotten around to doing the delete+insert sequence
		 * on the B-Tree.  Use the memory entry and throw away the
		 * on-disk entry.
		 *
		 * If the in-memory record is not either of these we
		 * probably caught the syncer while it was syncing it to
		 * the media.  Since we hold a shared lock on the cursor,
		 * the in-memory record had better be marked deleted at
		 * this point.
		 */
		if (r == 0) {
			if (cursor->iprec->type == HAMMER_MEM_RECORD_DEL) {
				if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) {
					cursor->flags |= HAMMER_CURSOR_ATEDISK;
					cursor->flags |= HAMMER_CURSOR_ATEMEM;
					goto next_btree;
				}
			} else if (cursor->iprec->type == HAMMER_MEM_RECORD_DATA) {
				if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) {
					cursor->flags |= HAMMER_CURSOR_ATEDISK;
				}
				/* fall through to memory entry */
			} else {
				panic("hammer_ip_next: duplicate mem/b-tree entry %p %d %08x",
				      cursor->iprec, cursor->iprec->type,
				      cursor->iprec->flags);
				cursor->flags |= HAMMER_CURSOR_ATEMEM;
				goto next_memory;
			}
		}
		/* fall through to the memory entry */
	case HAMMER_CURSOR_ATEDISK:
		/*
		 * Only the memory entry is valid.
		 */
		cursor->leaf = &cursor->iprec->leaf;
		cursor->flags |= HAMMER_CURSOR_ATEMEM;

		/*
		 * If the memory entry is an on-disk deletion we should have
		 * also found a B-Tree record.  If the backend beat us to it,
		 * it would have interlocked the cursor and we should have
		 * seen the in-memory record marked DELETED_FE.
		 */
		if (cursor->iprec->type == HAMMER_MEM_RECORD_DEL &&
		    (cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) {
			panic("hammer_ip_next: del-on-disk with no b-tree entry iprec %p flags %08x",
			      cursor->iprec, cursor->iprec->flags);
		}
		break;
	case HAMMER_CURSOR_ATEMEM:
		/*
		 * Only the disk entry is valid
		 */
		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_LEAF);
		cursor->flags |= HAMMER_CURSOR_ATEDISK;
		break;
	default:
		/*
		 * Neither entry is valid
		 *
		 * XXX error not set properly
		 */
		cursor->leaf = NULL;
		error = ENOENT;
		break;
	}
	return(error);
}
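/*
 * State sketch (illustrative, not from the original source): the four
 * flag combinations drive the switch above.  Neither ATEDISK nor
 * ATEMEM set means both sides have a candidate and must be compared;
 * ATEDISK alone means only the memory record remains; ATEMEM alone
 * means only the disk record remains; both set (with both EOF flags)
 * means the merged iteration is exhausted and ENOENT is returned.
 */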
/*
 * Resolve the cursor->data pointer for the current cursor position in
 * a merged iteration.
 */
int
hammer_ip_resolve_data(hammer_cursor_t cursor)
{
	hammer_record_t record;
	int error;

	if (hammer_cursor_inmem(cursor)) {
		/*
		 * The data associated with an in-memory record is usually
		 * kmalloced, but reserve-ahead data records will have an
		 * on-disk reference.
		 *
		 * NOTE: Reserve-ahead data records must be handled in the
		 * context of the related high level buffer cache buffer
		 * to interlock against async writes.
		 */
		record = cursor->iprec;
		cursor->data = record->data;
		error = 0;
		if (cursor->data == NULL) {
			KKASSERT(record->leaf.base.rec_type ==
				 HAMMER_RECTYPE_DATA);
			cursor->data = hammer_bread_ext(cursor->trans->hmp,
						record->leaf.data_offset,
						record->leaf.data_len,
						&error,
						&cursor->data_buffer);
		}
	} else {
		cursor->leaf = &cursor->node->ondisk->elms[cursor->index].leaf;
		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA);
	}
	return(error);
}
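/*
 * Illustrative example (not from the original source): a bulk record
 * created by hammer_ip_add_bulk() has record->data == NULL because its
 * data was written directly into the reserved blockmap space, so the
 * in-memory branch above falls back to hammer_bread_ext() to map the
 * data at leaf.data_offset rather than returning a kmalloced copy.
 */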
/*
 * Backend truncation / record replacement - delete records in range.
 *
 * Delete all records within the specified range for inode ip.  In-memory
 * records still associated with the frontend are ignored.
 *
 * If truncating is non-zero in-memory records associated with the back-end
 * are ignored.  If truncating is > 1 we can return EWOULDBLOCK.
 *
 * NOTES:
 *
 *	* An unaligned range will cause new records to be added to cover
 *	  the edge cases. (XXX not implemented yet).
 *
 *	* Replacement via reservations (see hammer_ip_sync_record_cursor())
 *	  also do not deal with unaligned ranges.
 *
 *	* ran_end is inclusive (e.g. 0,1023 instead of 0,1024).
 *
 *	* Record keys for regular file data have to be special-cased since
 *	  they indicate the end of the range (key = base + bytes).
 *
 *	* This function may be asked to delete ridiculously huge ranges, for
 *	  example if someone truncates or removes a 1TB regular file.  We
 *	  must be very careful on restarts and we may have to stop w/
 *	  EWOULDBLOCK to avoid blowing out the buffer cache.
 */
int
hammer_ip_delete_range(hammer_cursor_t cursor, hammer_inode_t ip,
		       int64_t ran_beg, int64_t ran_end, int truncating)
{
	hammer_transaction_t trans = cursor->trans;
	hammer_btree_leaf_elm_t leaf;
	int error;
	int64_t off;
	int64_t tmp64;

#if 0
	kprintf("delete_range %p %016llx-%016llx\n", ip, ran_beg, ran_end);
#endif

	KKASSERT(trans->type == HAMMER_TRANS_FLS);
retry:
	hammer_normalize_cursor(cursor);
	cursor->key_beg.localization = ip->obj_localization +
				       HAMMER_LOCALIZE_MISC;
	cursor->key_beg.obj_id = ip->obj_id;
	cursor->key_beg.create_tid = 0;
	cursor->key_beg.delete_tid = 0;
	cursor->key_beg.obj_type = 0;

	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
		cursor->key_beg.key = ran_beg;
		cursor->key_beg.rec_type = HAMMER_RECTYPE_DB;
	} else {
		/*
		 * The key in the B-Tree is (base+bytes), so the first possible
		 * matching key is ran_beg + 1.
		 */
		cursor->key_beg.key = ran_beg + 1;
		cursor->key_beg.rec_type = HAMMER_RECTYPE_DATA;
	}

	cursor->key_end = cursor->key_beg;
	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
		cursor->key_end.key = ran_end;
	} else {
		tmp64 = ran_end + MAXPHYS + 1;	/* work around GCC-4 bug */
		if (tmp64 < ran_end)
			cursor->key_end.key = 0x7FFFFFFFFFFFFFFFLL;
		else
			cursor->key_end.key = ran_end + MAXPHYS + 1;
	}

	cursor->asof = ip->obj_asof;
	cursor->flags &= ~HAMMER_CURSOR_INITMASK;
	cursor->flags |= HAMMER_CURSOR_ASOF;
	cursor->flags |= HAMMER_CURSOR_DELETE_VISIBILITY;
	cursor->flags |= HAMMER_CURSOR_BACKEND;
	cursor->flags |= HAMMER_CURSOR_END_INCLUSIVE;

	error = hammer_ip_first(cursor);

	/*
	 * Iterate through matching records and mark them as deleted.
	 */
	while (error == 0) {
		leaf = cursor->leaf;

		KKASSERT(leaf->base.delete_tid == 0);
		KKASSERT(leaf->base.obj_id == ip->obj_id);

		/*
		 * There may be overlap cases for regular file data.  Also
		 * remember the key for a regular file record is (base + len),
		 * NOT (base).
		 *
		 * Note that due to duplicates (mem & media) allowed by
		 * DELETE_VISIBILITY, off can wind up less than ran_beg.
		 */
		if (leaf->base.rec_type == HAMMER_RECTYPE_DATA) {
			off = leaf->base.key - leaf->data_len;
			/*
			 * Check the left edge case.  We currently do not
			 * split existing records.
			 */
			if (off < ran_beg && leaf->base.key > ran_beg) {
				panic("hammer left edge case %016llx %d\n",
				      leaf->base.key, leaf->data_len);
			}

			/*
			 * Check the right edge case.  Note that the
			 * record can be completely out of bounds, which
			 * terminates the search.
			 *
			 * base->key is exclusive of the right edge while
			 * ran_end is inclusive of the right edge.  The
			 * (key - data_len) left boundary is inclusive.
			 *
			 * XXX theory-check this test at some point, are
			 * we missing a + 1 somewhere?  Note that ran_end
			 * could overflow.
			 */
			if (leaf->base.key - 1 > ran_end) {
				if (leaf->base.key - leaf->data_len > ran_end)
					break;
				panic("hammer right edge case\n");
			}
		} else {
			off = leaf->base.key;
		}

		/*
		 * Delete the record.  When truncating we do not delete
		 * in-memory (data) records because they represent data
		 * written after the truncation.
		 *
		 * This will also physically destroy the B-Tree entry and
		 * data if the retention policy dictates.  The function
		 * will set HAMMER_CURSOR_RETEST to cause hammer_ip_next()
		 * to retest the new 'current' element.
		 */
		if (truncating == 0 || hammer_cursor_ondisk(cursor)) {
			error = hammer_ip_delete_record(cursor, ip, trans->tid);
			/*
			 * If we have built up too many meta-buffers we risk
			 * deadlocking the kernel and must stop.  This can
			 * occur when deleting ridiculously huge files.
			 * sync_trunc_off is updated so the next cycle does
			 * not re-iterate records we have already deleted.
			 *
			 * This is only done with formal truncations.
			 */
			if (truncating > 1 && error == 0 &&
			    hammer_flusher_meta_limit(ip->hmp)) {
				ip->sync_trunc_off = off;
				error = EWOULDBLOCK;
			}
		}
		if (error)
			break;
		ran_beg = off;	/* for restart */
		error = hammer_ip_next(cursor);
	}
	if (cursor->node)
		hammer_cache_node(&ip->cache[1], cursor->node);

	if (error == EDEADLK) {
		hammer_done_cursor(cursor);
		error = hammer_init_cursor(trans, cursor, &ip->cache[1], ip);
		if (error == 0)
			goto retry;
	}
	if (error == ENOENT)
		error = 0;
	return(error);
}

/*
 * This backend function deletes the specified record on-disk, similar to
 * delete_range but for a specific record.  Unlike the exact deletions
 * used when deleting a directory entry this function uses an ASOF search
 * like delete_range.
 *
 * This function may be called with ip->obj_asof set for a slave snapshot,
 * so don't use it.  We always delete non-historical records only.
 */
static int
hammer_delete_general(hammer_cursor_t cursor, hammer_inode_t ip,
		      hammer_btree_leaf_elm_t leaf)
{
	hammer_transaction_t trans = cursor->trans;
	int error;

	KKASSERT(trans->type == HAMMER_TRANS_FLS);
retry:
	hammer_normalize_cursor(cursor);
	cursor->key_beg = leaf->base;
	cursor->asof = HAMMER_MAX_TID;
	cursor->flags &= ~HAMMER_CURSOR_INITMASK;
	cursor->flags |= HAMMER_CURSOR_ASOF;
	cursor->flags |= HAMMER_CURSOR_BACKEND;
	cursor->flags &= ~HAMMER_CURSOR_INSERT;

	error = hammer_btree_lookup(cursor);
	if (error == 0) {
		error = hammer_ip_delete_record(cursor, ip, trans->tid);
	}
	if (error == EDEADLK) {
		hammer_done_cursor(cursor);
		error = hammer_init_cursor(trans, cursor, &ip->cache[1], ip);
		if (error == 0)
			goto retry;
	}
	return(error);
}
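/*
 * Illustrative note (not from the original source): both routines
 * above share the backend retry idiom -- when a B-Tree operation
 * returns EDEADLK the cursor is torn down with hammer_done_cursor(),
 * re-initialized against the inode's node cache, and the operation
 * restarts from its 'retry' label with freshly normalized keys.
 */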

/*
 * This function deletes remaining auxiliary records when an inode is
 * being deleted.  This function explicitly does not delete the
 * inode record, directory entry, data, or db records.  Those must be
 * properly disposed of prior to this call.
 */
int
hammer_ip_delete_clean(hammer_cursor_t cursor, hammer_inode_t ip, int *countp)
{
	hammer_transaction_t trans = cursor->trans;
	hammer_btree_leaf_elm_t leaf;
	int error;

	KKASSERT(trans->type == HAMMER_TRANS_FLS);
retry:
	hammer_normalize_cursor(cursor);
	cursor->key_beg.localization = ip->obj_localization +
				       HAMMER_LOCALIZE_MISC;
	cursor->key_beg.obj_id = ip->obj_id;
	cursor->key_beg.create_tid = 0;
	cursor->key_beg.delete_tid = 0;
	cursor->key_beg.obj_type = 0;
	cursor->key_beg.rec_type = HAMMER_RECTYPE_CLEAN_START;
	cursor->key_beg.key = HAMMER_MIN_KEY;

	cursor->key_end = cursor->key_beg;
	cursor->key_end.rec_type = HAMMER_RECTYPE_MAX;
	cursor->key_end.key = HAMMER_MAX_KEY;

	cursor->asof = ip->obj_asof;
	cursor->flags &= ~HAMMER_CURSOR_INITMASK;
	cursor->flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
	cursor->flags |= HAMMER_CURSOR_DELETE_VISIBILITY;
	cursor->flags |= HAMMER_CURSOR_BACKEND;

	error = hammer_ip_first(cursor);

	/*
	 * Iterate through matching records and mark them as deleted.
	 */
	while (error == 0) {
		leaf = cursor->leaf;

		KKASSERT(leaf->base.delete_tid == 0);

		/*
		 * Mark the record and B-Tree entry as deleted.  This will
		 * also physically delete the B-Tree entry, record, and
		 * data if the retention policy dictates.  The function
		 * will set HAMMER_CURSOR_RETEST to cause hammer_ip_next()
		 * to retest the new 'current' element.
		 *
		 * Directory entries (and delete-on-disk directory entries)
		 * must be synced and cannot be deleted.
		 */
		error = hammer_ip_delete_record(cursor, ip, trans->tid);
		++*countp;
		if (error)
			break;
		error = hammer_ip_next(cursor);
	}
	if (cursor->node)
		hammer_cache_node(&ip->cache[1], cursor->node);
	if (error == EDEADLK) {
		hammer_done_cursor(cursor);
		error = hammer_init_cursor(trans, cursor, &ip->cache[1], ip);
		if (error == 0)
			goto retry;
	}
	if (error == ENOENT)
		error = 0;
	return(error);
}

/*
 * Delete the record at the current cursor.  On success the cursor will
 * be positioned appropriately for an iteration but may no longer be at
 * a leaf node.
 *
 * This routine is only called from the backend.
 *
 * NOTE: This can return EDEADLK, requiring the caller to terminate the
 *	 cursor and retry.
 */
int
hammer_ip_delete_record(hammer_cursor_t cursor, hammer_inode_t ip,
			hammer_tid_t tid)
{
	hammer_record_t iprec;
	hammer_mount_t hmp;
	int error;

	KKASSERT(cursor->flags & HAMMER_CURSOR_BACKEND);
	KKASSERT(tid != 0);
	hmp = cursor->node->hmp;

	/*
	 * In-memory (unsynchronized) records can simply be freed.  This
	 * only occurs in range iterations since all other records are
	 * individually synchronized.  Thus there should be no confusion with
	 * the interlock.
	 *
	 * An in-memory record may be deleted before being committed to disk,
	 * but could have been accessed in the meantime.  The reservation
	 * code will deal with the case.
	 */
	if (hammer_cursor_inmem(cursor)) {
		iprec = cursor->iprec;
		KKASSERT((iprec->flags & HAMMER_RECF_INTERLOCK_BE) == 0);
		iprec->flags |= HAMMER_RECF_DELETED_FE;
		iprec->flags |= HAMMER_RECF_DELETED_BE;
		return(0);
	}

	/*
	 * On-disk records are marked as deleted by updating their delete_tid.
	 * This does not affect their position in the B-Tree (which is based
	 * on their create_tid).
	 *
	 * Frontend B-Tree operations track inodes so we tell
	 * hammer_delete_at_cursor() not to.
	 */
	error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_LEAF);

	if (error == 0) {
		error = hammer_delete_at_cursor(
				cursor,
				HAMMER_DELETE_ADJUST | hammer_nohistory(ip),
				cursor->trans->tid,
				cursor->trans->time32,
				0, NULL);
	}
	return(error);
}
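
/*
 * Typical backend usage of hammer_ip_delete_record() above (a sketch
 * following the retry idiom used throughout this file):
 *
 *	error = hammer_ip_delete_record(cursor, ip, trans->tid);
 *	if (error == EDEADLK) {
 *		hammer_done_cursor(cursor);
 *		error = hammer_init_cursor(trans, cursor,
 *					   &ip->cache[1], ip);
 *		if (error == 0)
 *			goto retry;
 *	}
 */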

/*
 * Delete the B-Tree element at the current cursor and do any necessary
 * mirror propagation.
 *
 * The cursor must be properly positioned for an iteration on return but
 * may be pointing at an internal element.
 *
 * An element can be un-deleted by passing a delete_tid of 0 with
 * HAMMER_DELETE_ADJUST.
 */
int
hammer_delete_at_cursor(hammer_cursor_t cursor, int delete_flags,
			hammer_tid_t delete_tid, u_int32_t delete_ts,
			int track, int64_t *stat_bytes)
{
	struct hammer_btree_leaf_elm save_leaf;
	hammer_transaction_t trans;
	hammer_btree_leaf_elm_t leaf;
	hammer_node_t node;
	hammer_btree_elm_t elm;
	hammer_off_t data_offset;
	int32_t data_len;
	u_int16_t rec_type;
	int error;
	int icount;
	int doprop;

	error = hammer_cursor_upgrade(cursor);
	if (error)
		return(error);

	trans = cursor->trans;
	node = cursor->node;
	elm = &node->ondisk->elms[cursor->index];
	leaf = &elm->leaf;
	KKASSERT(elm->base.btype == HAMMER_BTREE_TYPE_RECORD);

	hammer_sync_lock_sh(trans);
	doprop = 0;
	icount = 0;

	/*
	 * Adjust the delete_tid.  Update the mirror_tid propagation field
	 * as well.  delete_tid can be 0 (undelete -- used by mirroring).
	 */
	if (delete_flags & HAMMER_DELETE_ADJUST) {
		if (elm->base.rec_type == HAMMER_RECTYPE_INODE) {
			if (elm->leaf.base.delete_tid == 0 && delete_tid)
				icount = -1;
			if (elm->leaf.base.delete_tid && delete_tid == 0)
				icount = 1;
		}

		hammer_modify_node(trans, node, elm, sizeof(*elm));
		elm->leaf.base.delete_tid = delete_tid;
		elm->leaf.delete_ts = delete_ts;
		hammer_modify_node_done(node);

		if (elm->leaf.base.delete_tid > node->ondisk->mirror_tid) {
			hammer_modify_node_field(trans, node, mirror_tid);
			node->ondisk->mirror_tid = elm->leaf.base.delete_tid;
			hammer_modify_node_done(node);
			doprop = 1;
			if (hammer_debug_general & 0x0002) {
				kprintf("delete_at_cursor: propagate %016llx"
					" @%016llx\n",
					elm->leaf.base.delete_tid,
					node->node_offset);
			}
		}

		/*
		 * Adjust for the iteration.  We have deleted the current
		 * element and want to clear ATEDISK so the iteration does
		 * not skip the element after, which now becomes the current
		 * element.  This element must be re-tested if doing an
		 * iteration, which is handled by the RETEST flag.
		 */
		if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
			cursor->flags |= HAMMER_CURSOR_RETEST;
			cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
		}

		/*
		 * An on-disk record cannot have the same delete_tid
		 * as its create_tid.  In a chain of record updates
		 * this could result in a duplicate record.
		 */
		KKASSERT(elm->leaf.base.delete_tid !=
			 elm->leaf.base.create_tid);
	}
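
	/*
	 * Worked example of the ADJUST path above (illustrative TIDs):
	 * deleting with delete_tid 0x100 in a node whose mirror_tid is
	 * 0x080 bumps the node's mirror_tid to 0x100 and sets doprop,
	 * scheduling the upward propagation performed near the end of
	 * this function.  A later deletion at a TID <= 0x100 finds
	 * mirror_tid already current and skips the propagation.
	 */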

	/*
	 * Destroy the B-Tree element if asked (typically if a nohistory
	 * file or mount, or when called by the pruning code).
	 *
	 * Adjust the ATEDISK flag to properly support iterations.
	 */
	if (delete_flags & HAMMER_DELETE_DESTROY) {
		data_offset = elm->leaf.data_offset;
		data_len = elm->leaf.data_len;
		rec_type = elm->leaf.base.rec_type;
		if (doprop) {
			save_leaf = elm->leaf;
			leaf = &save_leaf;
		}
		if (elm->base.rec_type == HAMMER_RECTYPE_INODE &&
		    elm->leaf.base.delete_tid == 0) {
			icount = -1;
		}

		error = hammer_btree_delete(cursor);
		if (error == 0) {
			/*
			 * The deletion moves the next element (if any) to
			 * the current element position.  We must clear
			 * ATEDISK so this element is not skipped and we
			 * must set RETEST to force any iteration to re-test
			 * the element.
			 */
			if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
				cursor->flags |= HAMMER_CURSOR_RETEST;
				cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
			}
		}
		if (error == 0) {
			switch(data_offset & HAMMER_OFF_ZONE_MASK) {
			case HAMMER_ZONE_LARGE_DATA:
			case HAMMER_ZONE_SMALL_DATA:
			case HAMMER_ZONE_META:
				hammer_blockmap_free(trans,
						     data_offset, data_len);
				break;
			default:
				break;
			}
		}
	}

	/*
	 * Track inode count and next_tid.  This is used by the mirroring
	 * and PFS code.  icount can be negative, zero, or positive.
	 */
	if (error == 0 && track) {
		if (icount) {
			hammer_modify_volume_field(trans, trans->rootvol,
						   vol0_stat_inodes);
			trans->rootvol->ondisk->vol0_stat_inodes += icount;
			hammer_modify_volume_done(trans->rootvol);
		}
		if (trans->rootvol->ondisk->vol0_next_tid < delete_tid) {
			hammer_modify_volume(trans, trans->rootvol, NULL, 0);
			trans->rootvol->ondisk->vol0_next_tid = delete_tid;
			hammer_modify_volume_done(trans->rootvol);
		}
	}

	/*
	 * mirror_tid propagation occurs if the node's mirror_tid had to be
	 * updated while adjusting the delete_tid.
	 *
	 * This occurs when deleting even in nohistory mode, but does not
	 * occur when pruning an already-deleted node.
	 *
	 * cursor->ip is NULL when called from the pruning, mirroring,
	 * and pfs code.  If non-NULL propagation will be conditionalized
	 * on whether the PFS is in no-history mode or not.
	 */
	if (doprop) {
		if (cursor->ip)
			hammer_btree_do_propagation(cursor, cursor->ip->pfsm, leaf);
		else
			hammer_btree_do_propagation(cursor, NULL, leaf);
	}
	hammer_sync_unlock(trans);
	return (error);
}

/*
 * Determine whether we can remove a directory.  This routine checks whether
 * a directory is empty or not and enforces flush connectivity.
 *
 * Flush connectivity requires that we block if the target directory is
 * currently flushing, otherwise it may not end up in the same flush group.
 *
 * Returns 0 on success, ENOTEMPTY or EDEADLK (or other errors) on failure.
 */
int
hammer_ip_check_directory_empty(hammer_transaction_t trans, hammer_inode_t ip)
{
	struct hammer_cursor cursor;
	int error;

	/*
	 * Check directory empty
	 */
	hammer_init_cursor(trans, &cursor, &ip->cache[1], ip);

	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
	cursor.key_beg.key = HAMMER_MIN_KEY;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.rec_type = 0xFFFF;
	cursor.key_end.key = HAMMER_MAX_KEY;

	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	error = hammer_ip_first(&cursor);
	if (error == ENOENT)
		error = 0;
	else if (error == 0)
		error = ENOTEMPTY;
	hammer_done_cursor(&cursor);
	return(error);
}
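
/*
 * Usage sketch for hammer_ip_check_directory_empty() (hypothetical
 * rmdir-style caller; 'dip' is the target directory inode):
 *
 *	error = hammer_ip_check_directory_empty(trans, dip);
 *	if (error == 0)
 *		(empty: safe to proceed with the removal)
 *	else if (error == ENOTEMPTY)
 *		(refuse the removal)
 *
 * EDEADLK and other errors propagate to the caller, which is expected
 * to handle them per the flush connectivity rules described above.
 */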