/*
 * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.26 2008/01/25 21:50:56 dillon Exp $
 */

#include "hammer.h"

static int hammer_mem_add(hammer_transaction_t trans,
			hammer_record_t record);
static int hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip);
static int hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip);

/*
 * Red-black tree support.
 */
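/*
 * The comparators below order records by (rec_type, key, create_tid),
 * with a create_tid of 0 sorting after any non-zero create_tid.
 * Illustrative ordering (hypothetical values):
 *
 *	(rec_type=D, key=5, tid=10) < (rec_type=D, key=5, tid=20)
 *	(rec_type=D, key=5, tid=20) < (rec_type=D, key=5, tid=0)
 *	(rec_type=D, key=5, tid=0)  < (rec_type=D, key=6, tid=10)
 *
 * hammer_rec_compare() additionally spreads its return values (+/-2,
 * +/-3) so hammer_rec_scan_cmp() can distinguish a key mismatch from
 * a mere create_tid mismatch, which must not stop a range scan.
 */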
static int
hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2)
{
	if (rec1->rec.base.base.rec_type < rec2->rec.base.base.rec_type)
		return(-1);
	if (rec1->rec.base.base.rec_type > rec2->rec.base.base.rec_type)
		return(1);

	if (rec1->rec.base.base.key < rec2->rec.base.base.key)
		return(-1);
	if (rec1->rec.base.base.key > rec2->rec.base.base.key)
		return(1);

	if (rec1->rec.base.base.create_tid == 0) {
		if (rec2->rec.base.base.create_tid == 0)
			return(0);
		return(1);
	}
	if (rec2->rec.base.base.create_tid == 0)
		return(-1);

	if (rec1->rec.base.base.create_tid < rec2->rec.base.base.create_tid)
		return(-1);
	if (rec1->rec.base.base.create_tid > rec2->rec.base.base.create_tid)
		return(1);
	return(0);
}

static int
hammer_rec_compare(hammer_base_elm_t info, hammer_record_t rec)
{
	if (info->rec_type < rec->rec.base.base.rec_type)
		return(-3);
	if (info->rec_type > rec->rec.base.base.rec_type)
		return(3);

	if (info->key < rec->rec.base.base.key)
		return(-2);
	if (info->key > rec->rec.base.base.key)
		return(2);

	if (info->create_tid == 0) {
		if (rec->rec.base.base.create_tid == 0)
			return(0);
		return(1);
	}
	if (rec->rec.base.base.create_tid == 0)
		return(-1);
	if (info->create_tid < rec->rec.base.base.create_tid)
		return(-1);
	if (info->create_tid > rec->rec.base.base.create_tid)
		return(1);
	return(0);
}

/*
 * RB_SCAN comparison code for hammer_mem_first().  The argument order
 * is reversed so the comparison result has to be negated.  key_beg and
 * key_end are both range-inclusive.
 *
 * The creation timestamp can cause hammer_rec_compare() to return -1 or
 * +1.  These do not stop the scan.
 *
 * Localized deletions are not cached in-memory.
 */
static
int
hammer_rec_scan_cmp(hammer_record_t rec, void *data)
{
	hammer_cursor_t cursor = data;
	int r;

	r = hammer_rec_compare(&cursor->key_beg, rec);
	if (r > 1)
		return(-1);
	r = hammer_rec_compare(&cursor->key_end, rec);
	if (r < -1)
		return(1);
	return(0);
}

RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare);
RB_GENERATE_XLOOKUP(hammer_rec_rb_tree, INFO, hammer_record, rb_node,
		    hammer_rec_compare, hammer_base_elm_t);

/*
 * Allocate a record for the caller to finish filling in.  The record is
 * returned referenced.
 */
hammer_record_t
hammer_alloc_mem_record(hammer_inode_t ip)
{
	hammer_record_t record;

	++hammer_count_records;
	record = kmalloc(sizeof(*record), M_HAMMER, M_WAITOK|M_ZERO);
	record->ip = ip;
	record->rec.base.base.btype = HAMMER_BTREE_TYPE_RECORD;
	hammer_ref(&record->lock);
	return (record);
}
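/*
 * The usual lifecycle of a record allocated above: the caller fills in
 * the key fields and hands the record to hammer_ip_add_record(), which
 * eats the reference.  Illustrative sketch (not a verbatim caller):
 *
 *	record = hammer_alloc_mem_record(ip);
 *	record->rec.base.base.key = key;
 *	record->rec.base.base.rec_type = rec_type;
 *	record->rec.base.data_len = bytes;
 *	record->data = data;
 *	error = hammer_ip_add_record(trans, record);
 */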
/*
 * Release a memory record.  Records marked for deletion are immediately
 * removed from the RB-Tree but otherwise left intact until the last ref
 * goes away.
 */
void
hammer_rel_mem_record(struct hammer_record *record)
{
	hammer_unref(&record->lock);

	if (record->flags & HAMMER_RECF_DELETED) {
		if (record->flags & HAMMER_RECF_ONRBTREE) {
			RB_REMOVE(hammer_rec_rb_tree, &record->ip->rec_tree,
				  record);
			record->flags &= ~HAMMER_RECF_ONRBTREE;
		}
		if (record->lock.refs == 0) {
			if (record->flags & HAMMER_RECF_ALLOCDATA) {
				--hammer_count_record_datas;
				kfree(record->data, M_HAMMER);
				record->flags &= ~HAMMER_RECF_ALLOCDATA;
			}
			record->data = NULL;
			--hammer_count_records;
			kfree(record, M_HAMMER);
			return;
		}
	}

	/*
	 * If someone wanted the record wake them up.
	 */
	if (record->flags & HAMMER_RECF_WANTED) {
		record->flags &= ~HAMMER_RECF_WANTED;
		wakeup(record);
	}
}

/*
 * Lookup an in-memory record given the key specified in the cursor.  Works
 * just like hammer_btree_lookup() but operates on an inode's in-memory
 * record list.
 *
 * The lookup must fail if the record is marked for deferred deletion.
 */
static
int
hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip)
{
	int error;

	if (cursor->iprec) {
		hammer_rel_mem_record(cursor->iprec);
		cursor->iprec = NULL;
	}
	if (cursor->ip) {
		hammer_rec_rb_tree_scan_info_done(&cursor->scan,
						  &cursor->ip->rec_tree);
	}
	cursor->ip = ip;
	hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
	cursor->scan.node = NULL;
	cursor->iprec = hammer_rec_rb_tree_RB_LOOKUP_INFO(
				&ip->rec_tree, &cursor->key_beg);
	if (cursor->iprec == NULL) {
		error = ENOENT;
	} else {
		hammer_ref(&cursor->iprec->lock);
		error = 0;
	}
	return(error);
}
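/*
 * As-of visibility rule applied by the scan callback below: with
 * HAMMER_CURSOR_ASOF set, a record is visible only if
 * create_tid <= asof and (delete_tid == 0 || asof < delete_tid).
 * For example, a record created at tid 10 and deleted at tid 30 is
 * visible to as-of queries in the range 10-29 and invisible at 30
 * and beyond.
 */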
/*
 * hammer_mem_first() - locate the first in-memory record matching the
 * cursor.
 *
 * The RB_SCAN function we use is designed as a callback.  We terminate
 * it (return -1) as soon as we get a match.
 */
static
int
hammer_rec_scan_callback(hammer_record_t rec, void *data)
{
	hammer_cursor_t cursor = data;

	/*
	 * We terminate on success, so this should be NULL on entry.
	 */
	KKASSERT(cursor->iprec == NULL);

	/*
	 * Skip if the record was marked deleted.
	 */
	if (rec->flags & HAMMER_RECF_DELETED)
		return(0);

	/*
	 * Skip if not visible due to our as-of TID.
	 */
	if (cursor->flags & HAMMER_CURSOR_ASOF) {
		if (cursor->asof < rec->rec.base.base.create_tid)
			return(0);
		if (rec->rec.base.base.delete_tid &&
		    cursor->asof >= rec->rec.base.base.delete_tid) {
			return(0);
		}
	}

	/*
	 * Block if currently being synchronized to disk, otherwise we
	 * may get a duplicate.  Wakeup the syncer if it's stuck on
	 * the record.
	 */
	hammer_ref(&rec->lock);
	++rec->blocked;
	while (rec->flags & HAMMER_RECF_SYNCING) {
		rec->flags |= HAMMER_RECF_WANTED;
		tsleep(rec, 0, "hmrrc2", 0);
	}
	--rec->blocked;

	/*
	 * The record may have been deleted while we were blocked, in
	 * which case we must drop the reference we just acquired.
	 */
	if (rec->flags & HAMMER_RECF_DELETED) {
		hammer_rel_mem_record(rec);
		return(0);
	}

	/*
	 * Set the matching record and stop the scan.
	 */
	cursor->iprec = rec;
	return(-1);
}

static
int
hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip)
{
	if (cursor->iprec) {
		hammer_rel_mem_record(cursor->iprec);
		cursor->iprec = NULL;
	}
	if (cursor->ip) {
		hammer_rec_rb_tree_scan_info_done(&cursor->scan,
						  &cursor->ip->rec_tree);
	}
	cursor->ip = ip;
	hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);

	cursor->scan.node = NULL;
	hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_scan_cmp,
				   hammer_rec_scan_callback, cursor);

	/*
	 * Adjust scan.node and keep it linked into the RB-tree so we can
	 * hold the cursor through third party modifications of the RB-tree.
	 */
	if (cursor->iprec) {
		cursor->scan.node = hammer_rec_rb_tree_RB_NEXT(cursor->iprec);
		return(0);
	}
	return(ENOENT);
}

void
hammer_mem_done(hammer_cursor_t cursor)
{
	if (cursor->ip) {
		hammer_rec_rb_tree_scan_info_done(&cursor->scan,
						  &cursor->ip->rec_tree);
		cursor->ip = NULL;
	}
	if (cursor->iprec) {
		hammer_rel_mem_record(cursor->iprec);
		cursor->iprec = NULL;
	}
}

/************************************************************************
 *		    HAMMER IN-MEMORY RECORD FUNCTIONS			*
 ************************************************************************
 *
 * These functions manipulate in-memory records.  Such records typically
 * exist prior to being committed to disk or indexed via the on-disk B-Tree.
 */

/*
 * Add a directory entry (dip,ncp) which references inode (ip).
 *
 * Note that the low 32 bits of the namekey are set temporarily to create
 * a unique in-memory record, and may be modified a second time when the
 * record is synchronized to disk.  In particular, the low 32 bits cannot be
 * all 0's when synching to disk, which is not handled here.
 */
int
hammer_ip_add_directory(struct hammer_transaction *trans,
			struct hammer_inode *dip, struct namecache *ncp,
			struct hammer_inode *ip)
{
	hammer_record_t record;
	int error;
	int bytes;

	record = hammer_alloc_mem_record(dip);

	bytes = ncp->nc_nlen;	/* NOTE: terminating \0 is NOT included */
	if (++trans->hmp->namekey_iterator == 0)
		++trans->hmp->namekey_iterator;

	record->rec.entry.base.base.obj_id = dip->obj_id;
	record->rec.entry.base.base.key =
		hammer_directory_namekey(ncp->nc_name, bytes);
	record->rec.entry.base.base.key += trans->hmp->namekey_iterator;
	record->rec.entry.base.base.create_tid = trans->tid;
	record->rec.entry.base.base.rec_type = HAMMER_RECTYPE_DIRENTRY;
	record->rec.entry.base.base.obj_type = ip->ino_rec.base.base.obj_type;
	record->rec.entry.obj_id = ip->obj_id;
	if (bytes <= sizeof(record->rec.entry.den_name)) {
		record->data = (void *)record->rec.entry.den_name;
		record->flags |= HAMMER_RECF_EMBEDDED_DATA;
	} else {
		++hammer_count_record_datas;
		record->data = kmalloc(bytes, M_HAMMER, M_WAITOK);
		record->flags |= HAMMER_RECF_ALLOCDATA;
	}
	bcopy(ncp->nc_name, record->data, bytes);
	record->rec.entry.base.data_len = bytes;
	++ip->ino_rec.ino_nlinks;
	hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
	error = hammer_mem_add(trans, record);
	return(error);
}

/*
 * Delete the directory entry and update the inode link count.  The
 * cursor must be positioned at the directory entry record being deleted.
 *
 * NOTE: HAMMER_CURSOR_DELETE may not have been set.  XXX remove flag.
 *
 * This function can return EDEADLK, requiring the caller to terminate
 * the cursor and retry.
 */
int
hammer_ip_del_directory(struct hammer_transaction *trans,
			hammer_cursor_t cursor, struct hammer_inode *dip,
			struct hammer_inode *ip)
{
	int error;

	error = hammer_ip_delete_record(cursor, trans->tid);

	/*
	 * One less link.  The file may still be open in the OS even after
	 * all links have gone away so we only try to sync if the OS has
	 * no references and nlinks falls to 0.
	 *
	 * We have to terminate the cursor before syncing the inode to
	 * avoid deadlocking against ourselves.
	 */
	if (error == 0) {
		--ip->ino_rec.ino_nlinks;
		hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
		if (ip->ino_rec.ino_nlinks == 0 &&
		    (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))) {
			hammer_done_cursor(cursor);
			hammer_sync_inode(ip, MNT_NOWAIT, 1);
		}
	}
	return(error);
}

/*
 * Add a record to an inode.
 *
 * The caller must allocate the record with hammer_alloc_mem_record(ip) and
 * initialize the following additional fields:
 *
 *	record->rec.entry.base.base.key
 *	record->rec.entry.base.base.rec_type
 *	record->rec.entry.base.base.data_len
 *	record->data		(a copy will be kmalloc'd if not embedded)
 */
int
hammer_ip_add_record(struct hammer_transaction *trans, hammer_record_t record)
{
	hammer_inode_t ip = record->ip;
	int error;
	int bytes;
	void *data;

	record->rec.base.base.obj_id = ip->obj_id;
	record->rec.base.base.create_tid = trans->tid;
	record->rec.base.base.obj_type = ip->ino_rec.base.base.obj_type;
	bytes = record->rec.base.data_len;

	if (record->data) {
		if ((char *)record->data < (char *)&record->rec ||
		    (char *)record->data >= (char *)(&record->rec + 1)) {
			++hammer_count_record_datas;
			data = kmalloc(bytes, M_HAMMER, M_WAITOK);
			record->flags |= HAMMER_RECF_ALLOCDATA;
			bcopy(record->data, data, bytes);
			record->data = data;
		} else {
			record->flags |= HAMMER_RECF_EMBEDDED_DATA;
		}
	}
	hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
	error = hammer_mem_add(trans, record);
	return(error);
}
/*
 * Sync data from a buffer cache buffer (typically) to the filesystem.
 * This is called via the strategy code from a cached data source.  This
 * code is responsible for actually writing a data record out to the disk.
 *
 * This can only occur non-historically (i.e. 'current' data only).
 */
int
hammer_ip_sync_data(hammer_transaction_t trans, hammer_inode_t ip,
		    int64_t offset, void *data, int bytes,
		    struct hammer_cursor **spike)
{
	struct hammer_cursor cursor;
	hammer_record_ondisk_t rec;
	union hammer_btree_elm elm;
	void *bdata;
	int error;

	KKASSERT((offset & HAMMER_BUFMASK) == 0);
	KKASSERT((bytes & HAMMER_BUFMASK) == 0);
retry:
	error = hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
	if (error)
		return(error);
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.key = offset + bytes;
	cursor.key_beg.create_tid = trans->tid;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
	cursor.asof = trans->tid;
	cursor.flags |= HAMMER_CURSOR_INSERT;

	/*
	 * Issue a lookup to position the cursor and locate the cluster.
	 */
	error = hammer_btree_lookup(&cursor);
	if (error == 0) {
		kprintf("hammer_ip_sync_data: duplicate data at (%lld,%d)\n",
			offset, bytes);
		hammer_print_btree_elm(&cursor.node->ondisk->elms[cursor.index],
				       HAMMER_BTREE_TYPE_LEAF, cursor.index);
		error = EIO;
	}
	if (error != ENOENT)
		goto done;

	/*
	 * Allocate record and data space now that we know which cluster
	 * the B-Tree node ended up in.
	 */
	bdata = hammer_alloc_data(cursor.node->cluster, bytes, &error,
				  &cursor.data_buffer);
	if (bdata == NULL)
		goto done;
	rec = hammer_alloc_record(cursor.node->cluster, &error,
				  HAMMER_RECTYPE_DATA, &cursor.record_buffer);
	if (rec == NULL)
		goto fail1;

	/*
	 * Fill everything in and insert our B-Tree node.
	 */
	hammer_modify_buffer(cursor.record_buffer);
	rec->base.base.btype = HAMMER_BTREE_TYPE_RECORD;
	rec->base.base.obj_id = ip->obj_id;
	rec->base.base.key = offset + bytes;
	rec->base.base.create_tid = trans->tid;
	rec->base.base.delete_tid = 0;
	rec->base.base.rec_type = HAMMER_RECTYPE_DATA;
	rec->base.data_crc = crc32(data, bytes);
	rec->base.rec_id = hammer_alloc_recid(cursor.node->cluster);
	rec->base.data_offset = hammer_bclu_offset(cursor.data_buffer, bdata);
	rec->base.data_len = bytes;

	hammer_modify_buffer(cursor.data_buffer);
	bcopy(data, bdata, bytes);

	elm.leaf.base = rec->base.base;
	elm.leaf.rec_offset = hammer_bclu_offset(cursor.record_buffer, rec);
	elm.leaf.data_offset = rec->base.data_offset;
	elm.leaf.data_len = bytes;
	elm.leaf.data_crc = rec->base.data_crc;

	/*
	 * Data records can wind up on-disk before the inode itself is
	 * on-disk.  One must assume data records may be on-disk if either
	 * HAMMER_INODE_DONDISK or HAMMER_INODE_ONDISK is set.
	 */
	ip->flags |= HAMMER_INODE_DONDISK;

	error = hammer_btree_insert(&cursor, &elm);
	if (error == 0)
		goto done;

	hammer_free_record_ptr(cursor.record_buffer, rec, HAMMER_RECTYPE_DATA);
fail1:
	hammer_free_data_ptr(cursor.data_buffer, bdata, bytes);
done:
	/*
	 * If ENOSPC in cluster fill in the spike structure and return
	 * ENOSPC.
	 */
	if (error == ENOSPC)
		hammer_load_spike(&cursor, spike);
	hammer_done_cursor(&cursor);
	if (error == EDEADLK)
		goto retry;
	return(error);
}
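/*
 * The ENOSPC/spike protocol used by the sync routines: when the target
 * cluster is full the routine fills in *spike and returns ENOSPC, and
 * the caller is expected to grow the B-Tree and retry.  A minimal
 * caller sketch (assuming the hammer_spike() helper from
 * hammer_spike.c; illustrative only):
 *
 *	struct hammer_cursor *spike = NULL;
 *
 *	while ((error = hammer_ip_sync_data(trans, ip, offset, data,
 *					    bytes, &spike)) == ENOSPC) {
 *		error = hammer_spike(&spike);
 *		if (error)
 *			break;
 *	}
 */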
/*
 * Sync an in-memory record to the disk.  This is typically called via
 * fsync from a cached record source.  This code is responsible for
 * actually writing a record out to the disk.
 */
int
hammer_ip_sync_record(hammer_record_t record, struct hammer_cursor **spike)
{
	struct hammer_cursor cursor;
	hammer_record_ondisk_t rec;
	hammer_mount_t hmp;
	union hammer_btree_elm elm;
	void *bdata;
	int error;

retry:
	/*
	 * If the record has been deleted or is being synchronized, stop.
	 * Interlock with the syncing flag.
	 */
	if (record->flags & (HAMMER_RECF_DELETED | HAMMER_RECF_SYNCING))
		return(0);
	record->flags |= HAMMER_RECF_SYNCING;

	/*
	 * If someone other than us is referencing the record and not
	 * blocking waiting for us, we have to wait until they finish.
	 *
	 * It is possible the record got destroyed while we were blocked,
	 * in which case we must not leave the SYNCING flag set.
	 */
	if (record->lock.refs > record->blocked + 1) {
		while (record->lock.refs > record->blocked + 1) {
			record->flags |= HAMMER_RECF_WANTED;
			tsleep(record, 0, "hmrrc1", 0);
		}
		if (record->flags & HAMMER_RECF_DELETED) {
			record->flags &= ~HAMMER_RECF_SYNCING;
			return(0);
		}
	}

	/*
	 * Get a cursor.
	 */
	error = hammer_init_cursor_hmp(&cursor, &record->ip->cache[0],
				       record->ip->hmp);
	if (error) {
		record->flags &= ~HAMMER_RECF_SYNCING;
		return(error);
	}
	cursor.key_beg = record->rec.base.base;
	cursor.flags |= HAMMER_CURSOR_INSERT;

	/*
	 * Issue a lookup to position the cursor and locate the cluster.  The
	 * target key should not exist.  If we are creating a directory entry
	 * we may have to iterate the low 32 bits of the key to find an unused
	 * key.
	 */
	for (;;) {
		error = hammer_btree_lookup(&cursor);
		if (error)
			break;
		if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY) {
			kprintf("hammer_ip_sync_record: duplicate rec "
				"at (%016llx)\n", record->rec.base.base.key);
			Debugger("duplicate record1");
			error = EIO;
			break;
		}
		hmp = cursor.node->cluster->volume->hmp;
		if (++hmp->namekey_iterator == 0)
			++hmp->namekey_iterator;
		record->rec.base.base.key &= ~(0xFFFFFFFFLL);
		record->rec.base.base.key |= hmp->namekey_iterator;
		cursor.key_beg.key = record->rec.base.base.key;
	}
	if (error != ENOENT)
		goto done;

	/*
	 * Mark the record as undergoing synchronization.  Our cursor is
	 * holding a locked B-Tree node for the insertion which interlocks
	 * anyone trying to access this record.
	 *
	 * XXX There is still a race present related to iterations.  An
	 * iteration may process the record, a sync may occur, and then
	 * later process the B-Tree element for the same record.
	 *
	 * We do not try to synchronize a deleted record.
	 */
	if (record->flags & HAMMER_RECF_DELETED) {
		error = 0;
		goto done;
	}

	/*
	 * Allocate record and data space now that we know which cluster
	 * the B-Tree node ended up in.
	 */
	if (record->data == NULL ||
	    (record->flags & HAMMER_RECF_EMBEDDED_DATA)) {
		bdata = record->data;
	} else {
		bdata = hammer_alloc_data(cursor.node->cluster,
					  record->rec.base.data_len, &error,
					  &cursor.data_buffer);
		if (bdata == NULL)
			goto done;
	}
	rec = hammer_alloc_record(cursor.node->cluster, &error,
				  record->rec.base.base.rec_type,
				  &cursor.record_buffer);
	if (rec == NULL)
		goto fail1;
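	/*
	 * From this point on, failure of the B-Tree insertion below must
	 * unwind through fail1/done so the record and any separately
	 * allocated data are returned to the cluster and the SYNCING
	 * flag is cleared.
	 */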
	/*
	 * Fill everything in and insert our B-Tree node.
	 */
	hammer_modify_buffer(cursor.record_buffer);
	*rec = record->rec;
	if (bdata) {
		rec->base.data_crc = crc32(record->data,
					   record->rec.base.data_len);
		if (record->flags & HAMMER_RECF_EMBEDDED_DATA) {
			/*
			 * Data embedded in record.
			 */
			rec->base.data_offset = ((char *)bdata -
						 (char *)&record->rec);
			KKASSERT(rec->base.data_offset >= 0 &&
				 rec->base.data_offset + rec->base.data_len <=
				  sizeof(*rec));
			rec->base.data_offset +=
				hammer_bclu_offset(cursor.record_buffer, rec);
		} else {
			/*
			 * Data separate from record.
			 */
			rec->base.data_offset =
				hammer_bclu_offset(cursor.data_buffer, bdata);
			hammer_modify_buffer(cursor.data_buffer);
			bcopy(record->data, bdata, rec->base.data_len);
		}
	}
	rec->base.rec_id = hammer_alloc_recid(cursor.node->cluster);

	elm.leaf.base = record->rec.base.base;
	elm.leaf.rec_offset = hammer_bclu_offset(cursor.record_buffer, rec);
	elm.leaf.data_offset = rec->base.data_offset;
	elm.leaf.data_len = rec->base.data_len;
	elm.leaf.data_crc = rec->base.data_crc;

	error = hammer_btree_insert(&cursor, &elm);

	/*
	 * Clean up on success, or fall through on error.
	 */
	if (error == 0) {
		record->flags |= HAMMER_RECF_DELETED;
		goto done;
	}

	hammer_free_record_ptr(cursor.record_buffer, rec,
			       record->rec.base.base.rec_type);
fail1:
	if (record->data && (record->flags & HAMMER_RECF_EMBEDDED_DATA) == 0) {
		hammer_free_data_ptr(cursor.data_buffer, bdata,
				     record->rec.base.data_len);
	}
done:
	record->flags &= ~HAMMER_RECF_SYNCING;
	/*
	 * If ENOSPC in cluster fill in the spike structure and return
	 * ENOSPC.
	 */
	if (error == ENOSPC)
		hammer_load_spike(&cursor, spike);
	hammer_done_cursor(&cursor);
	if (error == EDEADLK)
		goto retry;
	return(error);
}

/*
 * Write out a record using the specified cursor.  The caller does not have
 * to seek the cursor.  The flags are used to determine whether the data
 * (if any) is embedded in the record or not.
 *
 * The cursor must be initialized, including its key_beg element.  The B-Tree
 * key may be different from the base element in the record (e.g. for spikes).
 *
 * NOTE: This can return EDEADLK, requiring the caller to release its cursor
 * and retry the operation.
 */
int
hammer_write_record(hammer_cursor_t cursor, hammer_record_ondisk_t orec,
		    void *data, int cursor_flags)
{
	union hammer_btree_elm elm;
	hammer_record_ondisk_t nrec;
	hammer_cluster_t ncluster;
	hammer_volume_t nvolume;
	int32_t nrec_offset;
	void *bdata;
	int error;

	KKASSERT(cursor->flags & HAMMER_CURSOR_INSERT);

	/*
	 * Issue a lookup to position the cursor and locate the cluster.  The
	 * target key should not exist.
	 *
	 * If we run out of space trying to adjust the B-Tree for the
	 * insert, re-lookup without the insert flag so the cursor
	 * is properly positioned for the spike.
	 */
	error = hammer_btree_lookup(cursor);
	if (error == 0) {
		kprintf("hammer_write_record: duplicate rec at (%016llx)\n",
			orec->base.base.key);
		Debugger("duplicate record2");
		error = EIO;
	}
	if (error != ENOENT)
		goto done;
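	/*
	 * For embedded data the data_offset stored below winds up
	 * pointing inside the record image itself: the data's byte
	 * offset within the record plus the record's offset within its
	 * cluster buffer.  Illustrative arithmetic (hypothetical
	 * numbers): data 40 bytes into a record located at cluster
	 * buffer offset 0x8000 yields data_offset 0x8028.
	 */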
	/*
	 * Allocate record and data space now that we know which cluster
	 * the B-Tree node ended up in.
	 */
	if (data == NULL ||
	    (cursor_flags & HAMMER_RECF_EMBEDDED_DATA)) {
		bdata = data;
	} else {
		bdata = hammer_alloc_data(cursor->node->cluster,
					  orec->base.data_len, &error,
					  &cursor->data_buffer);
		if (bdata == NULL)
			goto done;
	}
	nrec = hammer_alloc_record(cursor->node->cluster, &error,
				   orec->base.base.rec_type,
				   &cursor->record_buffer);
	if (nrec == NULL)
		goto fail1;

	/*
	 * Fill everything in and insert our B-Tree node.
	 */
	hammer_modify_buffer(cursor->record_buffer);
	*nrec = *orec;
	nrec->base.data_offset = 0;
	if (bdata) {
		nrec->base.data_crc = crc32(bdata, nrec->base.data_len);
		if (cursor_flags & HAMMER_RECF_EMBEDDED_DATA) {
			/*
			 * Data embedded in record.
			 */
			nrec->base.data_offset = ((char *)bdata -
						  (char *)orec);
			KKASSERT(nrec->base.data_offset >= 0 &&
				 nrec->base.data_offset + nrec->base.data_len <
				  sizeof(*nrec));
			nrec->base.data_offset +=
				hammer_bclu_offset(cursor->record_buffer, nrec);
		} else {
			/*
			 * Data separate from record.
			 */
			nrec->base.data_offset =
				hammer_bclu_offset(cursor->data_buffer, bdata);
			hammer_modify_buffer(cursor->data_buffer);
			bcopy(data, bdata, nrec->base.data_len);
		}
	}
	nrec->base.rec_id = hammer_alloc_recid(cursor->node->cluster);
	nrec_offset = hammer_bclu_offset(cursor->record_buffer, nrec);

	switch(cursor->key_beg.btype) {
	case HAMMER_BTREE_TYPE_RECORD:
		elm.leaf.base = cursor->key_beg;
		elm.leaf.rec_offset = nrec_offset;
		elm.leaf.data_offset = nrec->base.data_offset;
		elm.leaf.data_len = nrec->base.data_len;
		elm.leaf.data_crc = nrec->base.data_crc;
		error = hammer_btree_insert(cursor, &elm);
		break;
	case HAMMER_BTREE_TYPE_SPIKE_BEG:
#if 0
		Debugger("SpikeSpike");
#endif
		nvolume = hammer_get_volume(cursor->node->cluster->volume->hmp,
					    nrec->spike.vol_no, &error);
		if (error)
			break;
		ncluster = hammer_get_cluster(nvolume, nrec->spike.clu_no,
					      &error, GET_CLUSTER_NORECOVER);
		hammer_rel_volume(nvolume, 0);
		if (error)
			break;
		hammer_modify_cluster(ncluster);
		ncluster->ondisk->clu_btree_parent_offset =
			cursor->node->node_offset;
		hammer_rel_cluster(ncluster, 0);
		error = hammer_btree_insert_cluster(cursor, ncluster,
						    nrec_offset);
		break;
	case HAMMER_BTREE_TYPE_SPIKE_END:
		panic("hammer_write_record: cannot write spike-end elms");
		break;
	default:
		panic("hammer_write_record: unknown btype %02x",
		      cursor->key_beg.btype);
		break;
	}
	if (error == 0)
		goto done;

	hammer_free_record_ptr(cursor->record_buffer, nrec,
			       orec->base.base.rec_type);
fail1:
	if (data && (cursor_flags & HAMMER_RECF_EMBEDDED_DATA) == 0) {
		hammer_free_data_ptr(cursor->data_buffer, bdata,
				     orec->base.data_len);
	}
done:
	/* leave cursor intact */
	return(error);
}
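/*
 * Directory entry keys are 64 bits: the upper 32 bits come from
 * hammer_directory_namekey() (a hash of the filename) and the low 32
 * bits act as an iterator to de-collide names which hash identically.
 * Illustrative collision sequence (hypothetical hash value):
 *
 *	hash    = 0x12345678xxxxxxxx   (upper 32 bits from the name hash)
 *	attempt = (hash & ~0xFFFFFFFFLL) | ++namekey_iterator
 *
 * hammer_ip_sync_record() above and hammer_mem_add() below both perform
 * this low-32-bit iteration when an insertion collides.
 */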
/*
 * Add the record to the inode's rec_tree.  The low 32 bits of a directory
 * entry's key are used to deal with hash collisions in the upper 32 bits.
 * A unique 64 bit key is generated in-memory and may be regenerated a
 * second time when the directory record is flushed to the on-disk B-Tree.
 *
 * A referenced record is passed to this function.  This function
 * eats the reference.  If an error occurs the record will be deleted.
 */
static
int
hammer_mem_add(struct hammer_transaction *trans, hammer_record_t record)
{
	while (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) {
		if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY) {
			record->flags |= HAMMER_RECF_DELETED;
			hammer_rel_mem_record(record);
			return (EEXIST);
		}
		if (++trans->hmp->namekey_iterator == 0)
			++trans->hmp->namekey_iterator;
		record->rec.base.base.key &= ~(0xFFFFFFFFLL);
		record->rec.base.base.key |= trans->hmp->namekey_iterator;
	}
	record->flags |= HAMMER_RECF_ONRBTREE;
	hammer_modify_inode(trans, record->ip, HAMMER_INODE_XDIRTY);
	hammer_rel_mem_record(record);
	return(0);
}

/************************************************************************
 *		   HAMMER INODE MERGED-RECORD FUNCTIONS		*
 ************************************************************************
 *
 * These functions augment the B-Tree scanning functions in hammer_btree.c
 * by merging in-memory records with on-disk records.
 */

/*
 * Locate a particular record either in-memory or on-disk.
 *
 * NOTE: This is basically a standalone routine, hammer_ip_next() may
 * NOT be called to iterate results.
 */
int
hammer_ip_lookup(hammer_cursor_t cursor, struct hammer_inode *ip)
{
	int error;

	/*
	 * If the element is in-memory return it without searching the
	 * on-disk B-Tree.
	 */
	error = hammer_mem_lookup(cursor, ip);
	if (error == 0) {
		cursor->record = &cursor->iprec->rec;
		return(error);
	}
	if (error != ENOENT)
		return(error);

	/*
	 * If the inode has on-disk components search the on-disk B-Tree.
	 */
	if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) == 0)
		return(error);
	error = hammer_btree_lookup(cursor);
	if (error == 0)
		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
	return(error);
}
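/*
 * Note the lookup ordering used above: the in-memory tree is consulted
 * first and wins outright on a hit; the on-disk B-Tree is searched only
 * when the memory lookup misses and the inode actually has an on-disk
 * presence (HAMMER_INODE_ONDISK or HAMMER_INODE_DONDISK).
 */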
/*
 * Locate the first record within the cursor's key_beg/key_end range,
 * restricted to a particular inode.  0 is returned on success, ENOENT
 * if no records matched the requested range, or some other error.
 *
 * When 0 is returned hammer_ip_next() may be used to iterate additional
 * records within the requested range.
 *
 * This function can return EDEADLK, requiring the caller to terminate
 * the cursor and try again.
 */
int
hammer_ip_first(hammer_cursor_t cursor, struct hammer_inode *ip)
{
	int error;

	/*
	 * Clean up fields and setup for merged scan.
	 */
	cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
	cursor->flags |= HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM;
	cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_MEMEOF;
	if (cursor->iprec) {
		hammer_rel_mem_record(cursor->iprec);
		cursor->iprec = NULL;
	}

	/*
	 * Search the on-disk B-Tree.  hammer_btree_lookup() only does an
	 * exact lookup so if we get ENOENT we have to call the iterate
	 * function to validate the first record after the begin key.
	 *
	 * The ATEDISK flag is used by hammer_btree_iterate() to determine
	 * whether it must index forwards or not.  It is also used here
	 * to select the next record from in-memory or on-disk.
	 *
	 * EDEADLK can only occur if the lookup hit an empty internal
	 * element and couldn't delete it.  Since this could only occur
	 * in-range, we can just iterate from the failure point.
	 */
	if (ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) {
		error = hammer_btree_lookup(cursor);
		if (error == ENOENT || error == EDEADLK) {
			cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
			error = hammer_btree_iterate(cursor);
		}
		if (error && error != ENOENT)
			return(error);
		if (error == 0) {
			cursor->flags &= ~HAMMER_CURSOR_DISKEOF;
			cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
		} else {
			cursor->flags |= HAMMER_CURSOR_ATEDISK;
		}
	}

	/*
	 * Search the in-memory record list (Red-Black tree).  Unlike the
	 * B-Tree search, mem_first checks for records in the range.
	 */
	error = hammer_mem_first(cursor, ip);
	if (error && error != ENOENT)
		return(error);
	if (error == 0) {
		cursor->flags &= ~HAMMER_CURSOR_MEMEOF;
		cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
	}

	/*
	 * This will return the first matching record.
	 */
	return(hammer_ip_next(cursor));
}

/*
 * Retrieve the next record in a merged iteration within the bounds of the
 * cursor.  This call may be made multiple times after the cursor has been
 * initially searched with hammer_ip_first().
 *
 * 0 is returned on success, ENOENT if no further records match the
 * requested range, or some other error.
 */
int
hammer_ip_next(hammer_cursor_t cursor)
{
	hammer_btree_elm_t elm;
	hammer_record_t rec;
	int error;
	int r;

	/*
	 * Load the current on-disk and in-memory record.  If we ate any
	 * records we have to get the next one.
	 *
	 * If we deleted the last on-disk record we had scanned ATEDISK will
	 * be clear and DELBTREE will be set, forcing a call to iterate.  The
	 * fact that ATEDISK is clear causes iterate to re-test the 'current'
	 * element.  If ATEDISK is set, iterate will skip the 'current'
	 * element.
	 *
	 * Get the next on-disk record.
	 */
	if (cursor->flags & (HAMMER_CURSOR_ATEDISK|HAMMER_CURSOR_DELBTREE)) {
		if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
			error = hammer_btree_iterate(cursor);
			cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
			if (error == 0)
				cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
			else
				cursor->flags |= HAMMER_CURSOR_DISKEOF |
						 HAMMER_CURSOR_ATEDISK;
		}
	}

	/*
	 * Get the next in-memory record.  The record can be ripped out
	 * of the RB tree so we maintain a scan_info structure to track
	 * the next node.
	 *
	 * hammer_rec_scan_cmp:      Is the record still in our general
	 *			     range (non-inclusive of snapshot
	 *			     exclusions)?
	 * hammer_rec_scan_callback: Is the record in our snapshot?
	 */
	if (cursor->flags & HAMMER_CURSOR_ATEMEM) {
		if ((cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) {
			if (cursor->iprec) {
				hammer_rel_mem_record(cursor->iprec);
				cursor->iprec = NULL;
			}
			rec = cursor->scan.node;	/* next node */
			while (rec) {
				if (hammer_rec_scan_cmp(rec, cursor) != 0)
					break;
				if (hammer_rec_scan_callback(rec, cursor) != 0)
					break;
				rec = hammer_rec_rb_tree_RB_NEXT(rec);
			}
			if (cursor->iprec) {
				KKASSERT(cursor->iprec == rec);
				cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
				cursor->scan.node =
					hammer_rec_rb_tree_RB_NEXT(rec);
			} else {
				cursor->flags |= HAMMER_CURSOR_MEMEOF;
			}
		}
	}
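	/*
	 * Merge rule: when both a disk element and a memory record are
	 * pending, the one whose B-Tree key compares smaller is returned
	 * first.  On a tie the in-memory record is used (r < 0 selects
	 * the disk element, anything else falls through to memory).
	 */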
	/*
	 * Extract either the disk or memory record depending on their
	 * relative position.
	 */
	error = 0;
	switch(cursor->flags & (HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM)) {
	case 0:
		/*
		 * Both entries valid.
		 */
		elm = &cursor->node->ondisk->elms[cursor->index];
		r = hammer_btree_cmp(&elm->base, &cursor->iprec->rec.base.base);
		if (r < 0) {
			error = hammer_btree_extract(cursor,
						     HAMMER_CURSOR_GET_RECORD);
			cursor->flags |= HAMMER_CURSOR_ATEDISK;
			break;
		}
		/* fall through to the memory entry */
	case HAMMER_CURSOR_ATEDISK:
		/*
		 * Only the memory entry is valid.
		 */
		cursor->record = &cursor->iprec->rec;
		cursor->flags |= HAMMER_CURSOR_ATEMEM;
		break;
	case HAMMER_CURSOR_ATEMEM:
		/*
		 * Only the disk entry is valid.
		 */
		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
		cursor->flags |= HAMMER_CURSOR_ATEDISK;
		break;
	default:
		/*
		 * Neither entry is valid.
		 *
		 * XXX error not set properly
		 */
		cursor->record = NULL;
		error = ENOENT;
		break;
	}
	return(error);
}

/*
 * Resolve the cursor->data pointer for the current cursor position in
 * a merged iteration.
 */
int
hammer_ip_resolve_data(hammer_cursor_t cursor)
{
	int error;

	if (cursor->iprec && cursor->record == &cursor->iprec->rec) {
		cursor->data = cursor->iprec->data;
		error = 0;
	} else {
		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA);
	}
	return(error);
}
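/*
 * Keying reminder for the range-deletion code below: the B-Tree key of
 * a regular-file data record is its ending offset (base + len), not
 * its base offset.  For example, a 16K record covering file offsets
 * 0..16383 carries key 16384.  This is why the scan below begins at
 * ran_beg + 1 and why the edge tests subtract data_len to recover a
 * record's base offset.
 */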
/*
 * Delete all records within the specified range for inode ip.
 *
 * NOTE: An unaligned range will cause new records to be added to cover
 * the edge cases.  (XXX not implemented yet).
 *
 * NOTE: ran_end is inclusive (e.g. 0,1023 instead of 0,1024).
 *
 * NOTE: Record keys for regular file data have to be special-cased since
 * they indicate the end of the range (key = base + bytes).
 *
 * NOTE: The spike structure must be filled in if we return ENOSPC.
 */
int
hammer_ip_delete_range(hammer_transaction_t trans, hammer_inode_t ip,
		       int64_t ran_beg, int64_t ran_end,
		       struct hammer_cursor **spike)
{
	struct hammer_cursor cursor;
	hammer_record_ondisk_t rec;
	hammer_base_elm_t base;
	int error;
	int64_t off;

retry:
	hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);

	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.obj_type = 0;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	cursor.key_end = cursor.key_beg;
	if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
		cursor.key_beg.key = ran_beg;
		cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
		cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
		cursor.key_end.key = ran_end;
	} else {
		/*
		 * The key in the B-Tree is (base+bytes), so the first
		 * possible matching key is ran_beg + 1.
		 */
		int64_t tmp64;

		cursor.key_beg.key = ran_beg + 1;
		cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
		cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;

		tmp64 = ran_end + MAXPHYS + 1;	/* work around GCC-4 bug */
		if (tmp64 < ran_end)
			cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
		else
			cursor.key_end.key = ran_end + MAXPHYS + 1;
	}
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;

	error = hammer_ip_first(&cursor, ip);

	/*
	 * Iterate through matching records and mark them as deleted.
	 */
	while (error == 0) {
		rec = cursor.record;
		base = &rec->base.base;

		KKASSERT(base->delete_tid == 0);

		/*
		 * There may be overlap cases for regular file data.  Also
		 * remember the key for a regular file record is
		 * (base + len), one past the offset of the last byte of
		 * the record, NOT the base offset.
		 */
#if 0
		kprintf("delete_range rec_type %02x\n", base->rec_type);
#endif
		if (base->rec_type == HAMMER_RECTYPE_DATA) {
#if 0
			kprintf("delete_range loop key %016llx\n",
				base->key - rec->base.data_len);
#endif
			off = base->key - rec->base.data_len;
			/*
			 * Check the left edge case.  We currently do not
			 * split existing records.
			 */
			if (off < ran_beg) {
				panic("hammer left edge case %016llx %d\n",
				      base->key, rec->base.data_len);
			}

			/*
			 * Check the right edge case.  Note that the
			 * record can be completely out of bounds, which
			 * terminates the search.
			 *
			 * base->key is exclusive of the right edge while
			 * ran_end is inclusive of the right edge.  The
			 * (key - data_len) left boundary is inclusive.
			 *
			 * XXX theory-check this test at some point, are
			 * we missing a + 1 somewhere?  Note that ran_end
			 * could overflow.
			 */
			if (base->key - 1 > ran_end) {
				if (base->key - rec->base.data_len > ran_end)
					break;
				panic("hammer right edge case\n");
			}
		}

		/*
		 * Mark the record and B-Tree entry as deleted.  This will
		 * also physically delete the B-Tree entry, record, and
		 * data if the retention policy dictates.  The function
		 * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
		 * uses to perform a fixup.
		 */
		error = hammer_ip_delete_record(&cursor, trans->tid);
		if (error)
			break;
		error = hammer_ip_next(&cursor);
	}
	hammer_done_cursor(&cursor);
	if (error == EDEADLK)
		goto retry;
	if (error == ENOENT)
		error = 0;
	return(error);
}
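/*
 * Both range-deletion routines share the same convention: EDEADLK from
 * any cursor operation unwinds the pass via hammer_done_cursor() and
 * restarts it from scratch at the retry: label, while ENOENT merely
 * indicates the range was exhausted and is converted to success.
 */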
/*
 * Delete all records associated with an inode except the inode record
 * itself.
 */
int
hammer_ip_delete_range_all(hammer_transaction_t trans, hammer_inode_t ip)
{
	struct hammer_cursor cursor;
	hammer_record_ondisk_t rec;
	hammer_base_elm_t base;
	int error;

retry:
	hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);

	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
	cursor.key_beg.key = HAMMER_MIN_KEY;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.rec_type = 0xFFFF;
	cursor.key_end.key = HAMMER_MAX_KEY;

	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	error = hammer_ip_first(&cursor, ip);

	/*
	 * Iterate through matching records and mark them as deleted.
	 */
	while (error == 0) {
		rec = cursor.record;
		base = &rec->base.base;

		KKASSERT(base->delete_tid == 0);

		/*
		 * Mark the record and B-Tree entry as deleted.  This will
		 * also physically delete the B-Tree entry, record, and
		 * data if the retention policy dictates.  The function
		 * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
		 * uses to perform a fixup.
		 */
		error = hammer_ip_delete_record(&cursor, trans->tid);
		if (error)
			break;
		error = hammer_ip_next(&cursor);
	}
	hammer_done_cursor(&cursor);
	if (error == EDEADLK)
		goto retry;
	if (error == ENOENT)
		error = 0;
	return(error);
}

/*
 * Delete the record at the current cursor.  On success the cursor will
 * be positioned appropriately for an iteration but may no longer be at
 * a leaf node.
 *
 * NOTE: This can return EDEADLK, requiring the caller to terminate the
 * cursor and retry.
 */
int
hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid)
{
	hammer_btree_elm_t elm;
	hammer_mount_t hmp;
	int error;

	/*
	 * In-memory (unsynchronized) records can simply be freed.
	 */
	if (cursor->iprec && cursor->record == &cursor->iprec->rec) {
		cursor->iprec->flags |= HAMMER_RECF_DELETED;
		return(0);
	}

	/*
	 * On-disk records are marked as deleted by updating their delete_tid.
	 * This does not affect their position in the B-Tree (which is based
	 * on their create_tid).
	 */
	error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
	elm = NULL;
	hmp = cursor->node->cluster->volume->hmp;

	if (error == 0) {
		hammer_modify_buffer(cursor->record_buffer);
		cursor->record->base.base.delete_tid = tid;

		error = hammer_cursor_upgrade(cursor);
		if (error == 0) {
			hammer_modify_node(cursor->node);
			elm = &cursor->node->ondisk->elms[cursor->index];
			elm->leaf.base.delete_tid = tid;
		}
	}
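	/*
	 * The deletion above is historical: the record keeps its position
	 * in the B-Tree and merely gains a nonzero delete_tid, so as-of
	 * lookups at a tid T still see it while create_tid <= T <
	 * delete_tid.  Only the nohistory case below destroys the record
	 * outright.
	 */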
	/*
	 * If we were mounted with the nohistory option, we physically
	 * delete the record.
	 */
	if (error == 0 && (hmp->hflags & HMNT_NOHISTORY)) {
		int32_t rec_offset;
		int32_t data_offset;
		int32_t data_len;
		u_int8_t rec_type;
		hammer_cluster_t cluster;

		rec_offset = elm->leaf.rec_offset;
		data_offset = elm->leaf.data_offset;
		data_len = elm->leaf.data_len;
		rec_type = elm->leaf.base.rec_type;
		KKASSERT(rec_type == cursor->record->base.base.rec_type);

		/*
		 * We must ref the cluster to prevent it from being
		 * freed prior to our freeing the last record.
		 */
		cluster = cursor->node->cluster;
		hammer_ref_cluster(cluster);

		error = hammer_btree_delete(cursor);
		if (error == 0) {
			/*
			 * This forces a fixup for the iteration because
			 * the cursor is now either sitting at the 'next'
			 * element or sitting at the end of a leaf.
			 */
			if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
				cursor->flags |= HAMMER_CURSOR_DELBTREE;
				cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
			}
			hammer_free_record(cluster, rec_offset, rec_type);
			if (data_offset && (data_offset - rec_offset < 0 ||
			    data_offset - rec_offset >= HAMMER_RECORD_SIZE)) {
				hammer_free_data(cluster, data_offset,
						 data_len);
			}
		}
#if 0
		kprintf("hammer_ip_delete_record: %d:%d:%08x %08x/%d "
			"(%d remain in cluster)\n",
			cluster->volume->vol_no, cluster->clu_no,
			rec_offset, data_offset, data_len,
			cluster->ondisk->stat_records);
#endif
		hammer_rel_cluster(cluster, 0);
		if (error) {
			panic("hammer_ip_delete_record: unable to physically "
			      "delete the record!\n");
			error = 0;
		}
	}
	return(error);
}

/*
 * Determine whether a directory is empty or not.  Returns 0 if the
 * directory is empty, ENOTEMPTY if it isn't, plus other possible errors.
 */
int
hammer_ip_check_directory_empty(hammer_transaction_t trans, hammer_inode_t ip)
{
	struct hammer_cursor cursor;
	int error;

	hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);

	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
	cursor.key_beg.key = HAMMER_MIN_KEY;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.rec_type = 0xFFFF;
	cursor.key_end.key = HAMMER_MAX_KEY;

	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	error = hammer_ip_first(&cursor, ip);
	if (error == ENOENT)
		error = 0;
	else if (error == 0)
		error = ENOTEMPTY;
	hammer_done_cursor(&cursor);
	return(error);
}