1 /* $OpenBSD: btree.c,v 1.30 2010/09/01 12:13:21 martinh Exp $ */ 2 3 /* 4 * Copyright (c) 2009, 2010 Martin Hedenfalk <martin@bzero.se> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 19 #include <sys/types.h> 20 #include <sys/tree.h> 21 #include <sys/stat.h> 22 #include <sys/queue.h> 23 #include <sys/param.h> 24 #include <sys/uio.h> 25 26 #include <assert.h> 27 #include <err.h> 28 #include <errno.h> 29 #include <fcntl.h> 30 #include <stddef.h> 31 #include <stdint.h> 32 #include <stdio.h> 33 #include <stdlib.h> 34 #include <string.h> 35 #include <time.h> 36 #include <unistd.h> 37 38 #include "btree.h" 39 40 /* #define DEBUG */ 41 42 #ifdef DEBUG 43 # define DPRINTF(...) do { fprintf(stderr, "%s:%d: ", __func__, __LINE__); \ 44 fprintf(stderr, __VA_ARGS__); \ 45 fprintf(stderr, "\n"); } while(0) 46 #else 47 # define DPRINTF(...) 48 #endif 49 50 #define PAGESIZE 4096 51 #define BT_MINKEYS 4 52 #define BT_MAGIC 0xB3DBB3DB 53 #define BT_VERSION 4 54 #define MAXKEYSIZE 255 55 56 #define P_INVALID 0xFFFFFFFF 57 58 #define F_ISSET(w, f) (((w) & (f)) == (f)) 59 60 typedef uint32_t pgno_t; 61 typedef uint16_t indx_t; 62 63 /* There are four page types: meta, index, leaf and overflow. 64 * They all share the same page header. 
65 */ 66 struct page { /* represents an on-disk page */ 67 pgno_t pgno; /* page number */ 68 #define P_BRANCH 0x01 /* branch page */ 69 #define P_LEAF 0x02 /* leaf page */ 70 #define P_OVERFLOW 0x04 /* overflow page */ 71 #define P_META 0x08 /* meta page */ 72 #define P_HEAD 0x10 /* header page */ 73 uint32_t flags; 74 #define lower b.fb.fb_lower 75 #define upper b.fb.fb_upper 76 #define p_next_pgno b.pb_next_pgno 77 union page_bounds { 78 struct { 79 indx_t fb_lower; /* lower bound of free space */ 80 indx_t fb_upper; /* upper bound of free space */ 81 } fb; 82 pgno_t pb_next_pgno; /* overflow page linked list */ 83 } b; 84 indx_t ptrs[1]; /* dynamic size */ 85 } __packed; 86 87 #define PAGEHDRSZ offsetof(struct page, ptrs) 88 89 #define NUMKEYSP(p) (((p)->lower - PAGEHDRSZ) >> 1) 90 #define NUMKEYS(mp) (((mp)->page->lower - PAGEHDRSZ) >> 1) 91 #define SIZELEFT(mp) (indx_t)((mp)->page->upper - (mp)->page->lower) 92 #define PAGEFILL(bt, mp) (1000 * ((bt)->head.psize - PAGEHDRSZ - SIZELEFT(mp)) / \ 93 ((bt)->head.psize - PAGEHDRSZ)) 94 #define IS_LEAF(mp) F_ISSET((mp)->page->flags, P_LEAF) 95 #define IS_BRANCH(mp) F_ISSET((mp)->page->flags, P_BRANCH) 96 #define IS_OVERFLOW(mp) F_ISSET((mp)->page->flags, P_OVERFLOW) 97 98 struct bt_head { /* header page content */ 99 uint32_t magic; 100 uint32_t version; 101 uint32_t flags; 102 uint32_t psize; /* page size */ 103 } __packed; 104 105 struct bt_meta { /* meta (footer) page content */ 106 #define BT_TOMBSTONE 0x01 /* file is replaced */ 107 uint32_t flags; 108 pgno_t root; /* page number of root page */ 109 pgno_t prev_meta; /* previous meta page number */ 110 time_t created_at; 111 uint32_t branch_pages; 112 uint32_t leaf_pages; 113 uint32_t overflow_pages; 114 uint32_t revisions; 115 uint32_t depth; 116 uint64_t entries; 117 unsigned char hash[SHA_DIGEST_LENGTH]; 118 } __packed; 119 120 struct btkey { 121 size_t len; 122 char str[MAXKEYSIZE]; 123 }; 124 125 struct mpage { /* an in-memory cached page */ 126 
RB_ENTRY(mpage) entry; /* page cache entry */ 127 SIMPLEQ_ENTRY(mpage) next; /* queue of dirty pages */ 128 TAILQ_ENTRY(mpage) lru_next; /* LRU queue */ 129 struct mpage *parent; /* NULL if root */ 130 unsigned int parent_index; /* keep track of node index */ 131 struct btkey prefix; 132 struct page *page; 133 pgno_t pgno; /* copy of page->pgno */ 134 short ref; /* increased by cursors */ 135 short dirty; /* 1 if on dirty queue */ 136 }; 137 RB_HEAD(page_cache, mpage); 138 SIMPLEQ_HEAD(dirty_queue, mpage); 139 TAILQ_HEAD(lru_queue, mpage); 140 141 static int mpage_cmp(struct mpage *a, struct mpage *b); 142 static struct mpage *mpage_lookup(struct btree *bt, pgno_t pgno); 143 static void mpage_add(struct btree *bt, struct mpage *mp); 144 static void mpage_free(struct mpage *mp); 145 static void mpage_del(struct btree *bt, struct mpage *mp); 146 static void mpage_flush(struct btree *bt); 147 static struct mpage *mpage_copy(struct btree *bt, struct mpage *mp); 148 static void mpage_prune(struct btree *bt); 149 static void mpage_dirty(struct btree *bt, struct mpage *mp); 150 static struct mpage *mpage_touch(struct btree *bt, struct mpage *mp); 151 152 RB_PROTOTYPE(page_cache, mpage, entry, mpage_cmp); 153 RB_GENERATE(page_cache, mpage, entry, mpage_cmp); 154 155 struct ppage { /* ordered list of pages */ 156 SLIST_ENTRY(ppage) entry; 157 struct mpage *mpage; 158 unsigned int ki; /* cursor index on page */ 159 }; 160 SLIST_HEAD(page_stack, ppage); 161 162 #define CURSOR_EMPTY(c) SLIST_EMPTY(&(c)->stack) 163 #define CURSOR_TOP(c) SLIST_FIRST(&(c)->stack) 164 #define CURSOR_POP(c) SLIST_REMOVE_HEAD(&(c)->stack, entry) 165 #define CURSOR_PUSH(c,p) SLIST_INSERT_HEAD(&(c)->stack, p, entry) 166 167 struct cursor { 168 struct btree *bt; 169 struct btree_txn *txn; 170 struct page_stack stack; /* stack of parent pages */ 171 short initialized; /* 1 if initialized */ 172 short eof; /* 1 if end is reached */ 173 }; 174 175 #define METAHASHLEN offsetof(struct bt_meta, hash) 176 
#define METADATA(p) ((void *)((char *)p + PAGEHDRSZ)) 177 178 struct node { 179 #define n_pgno p.np_pgno 180 #define n_dsize p.np_dsize 181 union { 182 pgno_t np_pgno; /* child page number */ 183 uint32_t np_dsize; /* leaf data size */ 184 } p; 185 uint16_t ksize; /* key size */ 186 #define F_BIGDATA 0x01 /* data put on overflow page */ 187 uint8_t flags; 188 char data[1]; 189 } __packed; 190 191 struct btree_txn { 192 pgno_t root; /* current / new root page */ 193 pgno_t next_pgno; /* next unallocated page */ 194 struct btree *bt; /* btree is ref'd */ 195 struct dirty_queue *dirty_queue; /* modified pages */ 196 #define BT_TXN_RDONLY 0x01 /* read-only transaction */ 197 #define BT_TXN_ERROR 0x02 /* an error has occurred */ 198 unsigned int flags; 199 }; 200 201 struct btree { 202 int fd; 203 char *path; 204 #define BT_FIXPADDING 0x01 /* internal */ 205 unsigned int flags; 206 bt_cmp_func cmp; /* user compare function */ 207 struct bt_head head; 208 struct bt_meta meta; 209 struct page_cache *page_cache; 210 struct lru_queue *lru_queue; 211 struct btree_txn *txn; /* current write transaction */ 212 int ref; /* increased by cursors & txn */ 213 struct btree_stat stat; 214 off_t size; /* current file size */ 215 }; 216 217 #define NODESIZE offsetof(struct node, data) 218 219 #define INDXSIZE(k) (NODESIZE + ((k) == NULL ? 
0 : (k)->size)) 220 #define LEAFSIZE(k, d) (NODESIZE + (k)->size + (d)->size) 221 #define NODEPTRP(p, i) ((struct node *)((char *)(p) + (p)->ptrs[i])) 222 #define NODEPTR(mp, i) NODEPTRP((mp)->page, i) 223 #define NODEKEY(node) (void *)((node)->data) 224 #define NODEDATA(node) (void *)((char *)(node)->data + (node)->ksize) 225 #define NODEPGNO(node) ((node)->p.np_pgno) 226 #define NODEDSZ(node) ((node)->p.np_dsize) 227 228 #define BT_COMMIT_PAGES 64 /* max number of pages to write in one commit */ 229 #define BT_MAXCACHE_DEF 1024 /* max number of pages to keep in cache */ 230 231 static int btree_read_page(struct btree *bt, pgno_t pgno, 232 struct page *page); 233 static struct mpage *btree_get_mpage(struct btree *bt, pgno_t pgno); 234 static int btree_search_page_root(struct btree *bt, 235 struct mpage *root, struct btval *key, 236 struct cursor *cursor, int modify, 237 struct mpage **mpp); 238 static int btree_search_page(struct btree *bt, 239 struct btree_txn *txn, struct btval *key, 240 struct cursor *cursor, int modify, 241 struct mpage **mpp); 242 243 static int btree_write_header(struct btree *bt, int fd); 244 static int btree_read_header(struct btree *bt); 245 static int btree_is_meta_page(struct page *p); 246 static int btree_read_meta(struct btree *bt, pgno_t *p_next); 247 static int btree_write_meta(struct btree *bt, pgno_t root, 248 unsigned int flags); 249 static void btree_ref(struct btree *bt); 250 251 static struct node *btree_search_node(struct btree *bt, struct mpage *mp, 252 struct btval *key, int *exactp, unsigned int *kip); 253 static int btree_add_node(struct btree *bt, struct mpage *mp, 254 indx_t indx, struct btval *key, struct btval *data, 255 pgno_t pgno, uint8_t flags); 256 static void btree_del_node(struct btree *bt, struct mpage *mp, 257 indx_t indx); 258 static int btree_read_data(struct btree *bt, struct mpage *mp, 259 struct node *leaf, struct btval *data); 260 261 static int btree_rebalance(struct btree *bt, struct mpage *mp); 262 
static int btree_update_key(struct btree *bt, struct mpage *mp, 263 indx_t indx, struct btval *key); 264 static int btree_adjust_prefix(struct btree *bt, 265 struct mpage *src, int delta); 266 static int btree_move_node(struct btree *bt, struct mpage *src, 267 indx_t srcindx, struct mpage *dst, indx_t dstindx); 268 static int btree_merge(struct btree *bt, struct mpage *src, 269 struct mpage *dst); 270 static int btree_split(struct btree *bt, struct mpage **mpp, 271 unsigned int *newindxp, struct btval *newkey, 272 struct btval *newdata, pgno_t newpgno); 273 static struct mpage *btree_new_page(struct btree *bt, uint32_t flags); 274 static int btree_write_overflow_data(struct btree *bt, 275 struct page *p, struct btval *data); 276 277 static void cursor_pop_page(struct cursor *cursor); 278 static struct ppage *cursor_push_page(struct cursor *cursor, 279 struct mpage *mp); 280 281 static int bt_set_key(struct btree *bt, struct mpage *mp, 282 struct node *node, struct btval *key); 283 static int btree_sibling(struct cursor *cursor, int move_right); 284 static int btree_cursor_next(struct cursor *cursor, 285 struct btval *key, struct btval *data); 286 static int btree_cursor_set(struct cursor *cursor, 287 struct btval *key, struct btval *data, int *exactp); 288 static int btree_cursor_first(struct cursor *cursor, 289 struct btval *key, struct btval *data); 290 291 static void bt_reduce_separator(struct btree *bt, struct node *min, 292 struct btval *sep); 293 static void remove_prefix(struct btree *bt, struct btval *key, 294 size_t pfxlen); 295 static void expand_prefix(struct btree *bt, struct mpage *mp, 296 indx_t indx, struct btkey *expkey); 297 static void concat_prefix(struct btree *bt, char *s1, size_t n1, 298 char *s2, size_t n2, char *cs, size_t *cn); 299 static void common_prefix(struct btree *bt, struct btkey *min, 300 struct btkey *max, struct btkey *pfx); 301 static void find_common_prefix(struct btree *bt, struct mpage *mp); 302 303 static size_t 
bt_leaf_size(struct btree *bt, struct btval *key, 304 struct btval *data); 305 static size_t bt_branch_size(struct btree *bt, struct btval *key); 306 307 static pgno_t btree_compact_tree(struct btree *bt, pgno_t pgno, 308 struct btree *btc); 309 310 static int memncmp(const void *s1, size_t n1, 311 const void *s2, size_t n2); 312 static int memnrcmp(const void *s1, size_t n1, 313 const void *s2, size_t n2); 314 315 static int 316 memncmp(const void *s1, size_t n1, const void *s2, size_t n2) 317 { 318 if (n1 < n2) { 319 if (memcmp(s1, s2, n1) == 0) 320 return -1; 321 } 322 else if (n1 > n2) { 323 if (memcmp(s1, s2, n2) == 0) 324 return 1; 325 } 326 return memcmp(s1, s2, n1); 327 } 328 329 static int 330 memnrcmp(const void *s1, size_t n1, const void *s2, size_t n2) 331 { 332 const unsigned char *p1; 333 const unsigned char *p2; 334 335 if (n1 == 0) 336 return n2 == 0 ? 0 : -1; 337 338 if (n2 == 0) 339 return n1 == 0 ? 0 : 1; 340 341 p1 = (const unsigned char *)s1 + n1 - 1; 342 p2 = (const unsigned char *)s2 + n2 - 1; 343 344 while (*p1 == *p2) { 345 if (p1 == s1) 346 return (p2 == s2) ? 0 : -1; 347 if (p2 == s2) 348 return (p1 == p2) ? 
0 : 1; 349 p1--; 350 p2--; 351 } 352 return *p1 - *p2; 353 } 354 355 int 356 btree_cmp(struct btree *bt, const struct btval *a, const struct btval *b) 357 { 358 return bt->cmp(a, b); 359 } 360 361 static void 362 common_prefix(struct btree *bt, struct btkey *min, struct btkey *max, 363 struct btkey *pfx) 364 { 365 size_t n = 0; 366 char *p1; 367 char *p2; 368 369 if (min->len == 0 || max->len == 0) { 370 pfx->len = 0; 371 return; 372 } 373 374 if (F_ISSET(bt->flags, BT_REVERSEKEY)) { 375 p1 = min->str + min->len - 1; 376 p2 = max->str + max->len - 1; 377 378 while (*p1 == *p2) { 379 if (p1 < min->str || p2 < max->str) 380 break; 381 p1--; 382 p2--; 383 n++; 384 } 385 386 assert(n <= (int)sizeof(pfx->str)); 387 pfx->len = n; 388 bcopy(p2 + 1, pfx->str, n); 389 } else { 390 p1 = min->str; 391 p2 = max->str; 392 393 while (*p1 == *p2) { 394 if (n == min->len || n == max->len) 395 break; 396 p1++; 397 p2++; 398 n++; 399 } 400 401 assert(n <= (int)sizeof(pfx->str)); 402 pfx->len = n; 403 bcopy(max->str, pfx->str, n); 404 } 405 } 406 407 static void 408 remove_prefix(struct btree *bt, struct btval *key, size_t pfxlen) 409 { 410 if (pfxlen == 0 || bt->cmp != NULL) 411 return; 412 413 DPRINTF("removing %zu bytes of prefix from key [%.*s]", pfxlen, 414 (int)key->size, (char *)key->data); 415 assert(pfxlen <= key->size); 416 key->size -= pfxlen; 417 if (!F_ISSET(bt->flags, BT_REVERSEKEY)) 418 key->data = (char *)key->data + pfxlen; 419 } 420 421 static void 422 expand_prefix(struct btree *bt, struct mpage *mp, indx_t indx, 423 struct btkey *expkey) 424 { 425 struct node *node; 426 427 node = NODEPTR(mp, indx); 428 expkey->len = sizeof(expkey->str); 429 concat_prefix(bt, mp->prefix.str, mp->prefix.len, 430 NODEKEY(node), node->ksize, expkey->str, &expkey->len); 431 } 432 433 static int 434 bt_cmp(struct btree *bt, const struct btval *key1, const struct btval *key2, 435 struct btkey *pfx) 436 { 437 if (F_ISSET(bt->flags, BT_REVERSEKEY)) 438 return memnrcmp(key1->data, 
key1->size - pfx->len, 439 key2->data, key2->size); 440 else 441 return memncmp((char *)key1->data + pfx->len, key1->size - pfx->len, 442 key2->data, key2->size); 443 } 444 445 void 446 btval_reset(struct btval *btv) 447 { 448 if (btv) { 449 if (btv->mp) 450 btv->mp->ref--; 451 if (btv->free_data) 452 free(btv->data); 453 bzero(btv, sizeof(*btv)); 454 } 455 } 456 457 static int 458 mpage_cmp(struct mpage *a, struct mpage *b) 459 { 460 if (a->pgno > b->pgno) 461 return 1; 462 if (a->pgno < b->pgno) 463 return -1; 464 return 0; 465 } 466 467 static struct mpage * 468 mpage_lookup(struct btree *bt, pgno_t pgno) 469 { 470 struct mpage find, *mp; 471 472 find.pgno = pgno; 473 mp = RB_FIND(page_cache, bt->page_cache, &find); 474 if (mp) { 475 bt->stat.hits++; 476 /* Update LRU queue. Move page to the end. */ 477 TAILQ_REMOVE(bt->lru_queue, mp, lru_next); 478 TAILQ_INSERT_TAIL(bt->lru_queue, mp, lru_next); 479 } 480 return mp; 481 } 482 483 static void 484 mpage_add(struct btree *bt, struct mpage *mp) 485 { 486 assert(RB_INSERT(page_cache, bt->page_cache, mp) == NULL); 487 bt->stat.cache_size++; 488 TAILQ_INSERT_TAIL(bt->lru_queue, mp, lru_next); 489 } 490 491 static void 492 mpage_free(struct mpage *mp) 493 { 494 if (mp != NULL) { 495 free(mp->page); 496 free(mp); 497 } 498 } 499 500 static void 501 mpage_del(struct btree *bt, struct mpage *mp) 502 { 503 assert(RB_REMOVE(page_cache, bt->page_cache, mp) == mp); 504 assert(bt->stat.cache_size > 0); 505 bt->stat.cache_size--; 506 TAILQ_REMOVE(bt->lru_queue, mp, lru_next); 507 } 508 509 static void 510 mpage_flush(struct btree *bt) 511 { 512 struct mpage *mp; 513 514 while ((mp = RB_MIN(page_cache, bt->page_cache)) != NULL) { 515 mpage_del(bt, mp); 516 mpage_free(mp); 517 } 518 } 519 520 static struct mpage * 521 mpage_copy(struct btree *bt, struct mpage *mp) 522 { 523 struct mpage *copy; 524 525 if ((copy = calloc(1, sizeof(*copy))) == NULL) 526 return NULL; 527 if ((copy->page = malloc(bt->head.psize)) == NULL) { 528 
free(copy); 529 return NULL; 530 } 531 bcopy(mp->page, copy->page, bt->head.psize); 532 bcopy(&mp->prefix, ©->prefix, sizeof(mp->prefix)); 533 copy->parent = mp->parent; 534 copy->parent_index = mp->parent_index; 535 copy->pgno = mp->pgno; 536 537 return copy; 538 } 539 540 /* Remove the least recently used memory pages until the cache size is 541 * within the configured bounds. Pages referenced by cursors or returned 542 * key/data are not pruned. 543 */ 544 static void 545 mpage_prune(struct btree *bt) 546 { 547 struct mpage *mp, *next; 548 549 for (mp = TAILQ_FIRST(bt->lru_queue); mp; mp = next) { 550 if (bt->stat.cache_size <= bt->stat.max_cache) 551 break; 552 next = TAILQ_NEXT(mp, lru_next); 553 if (!mp->dirty && mp->ref <= 0) { 554 mpage_del(bt, mp); 555 mpage_free(mp); 556 } 557 } 558 } 559 560 /* Mark a page as dirty and push it on the dirty queue. 561 */ 562 static void 563 mpage_dirty(struct btree *bt, struct mpage *mp) 564 { 565 assert(bt != NULL); 566 assert(bt->txn != NULL); 567 568 if (!mp->dirty) { 569 mp->dirty = 1; 570 SIMPLEQ_INSERT_TAIL(bt->txn->dirty_queue, mp, next); 571 } 572 } 573 574 /* Touch a page: make it dirty and re-insert into tree with updated pgno. 575 */ 576 static struct mpage * 577 mpage_touch(struct btree *bt, struct mpage *mp) 578 { 579 assert(bt != NULL); 580 assert(bt->txn != NULL); 581 assert(mp != NULL); 582 583 if (!mp->dirty) { 584 DPRINTF("touching page %u -> %u", mp->pgno, bt->txn->next_pgno); 585 if (mp->ref == 0) 586 mpage_del(bt, mp); 587 else { 588 if ((mp = mpage_copy(bt, mp)) == NULL) 589 return NULL; 590 } 591 mp->pgno = mp->page->pgno = bt->txn->next_pgno++; 592 mpage_dirty(bt, mp); 593 mpage_add(bt, mp); 594 595 /* Update the page number to new touched page. 
		 */
		if (mp->parent != NULL)
			NODEPGNO(NODEPTR(mp->parent,
			    mp->parent_index)) = mp->pgno;
	}

	return mp;
}

/* Read page pgno from disk into the caller-supplied buffer.
 * Returns BT_FAIL with errno set when the page is missing (ENOENT),
 * short (EINVAL) or carries the wrong page number (EINVAL).
 */
static int
btree_read_page(struct btree *bt, pgno_t pgno, struct page *page)
{
	ssize_t		 rc;

	DPRINTF("reading page %u", pgno);
	bt->stat.reads++;
	if ((rc = pread(bt->fd, page, bt->head.psize, (off_t)pgno*bt->head.psize)) == 0) {
		DPRINTF("page %u doesn't exist", pgno);
		errno = ENOENT;
		return BT_FAIL;
	} else if (rc != (ssize_t)bt->head.psize) {
		if (rc > 0)
			errno = EINVAL;
		DPRINTF("read: %s", strerror(errno));
		return BT_FAIL;
	}

	/* Sanity check: the on-disk page records its own number. */
	if (page->pgno != pgno) {
		DPRINTF("page numbers don't match: %u != %u", pgno, page->pgno);
		errno = EINVAL;
		return BT_FAIL;
	}

	DPRINTF("page %u has flags 0x%X", pgno, page->flags);

	return BT_SUCCESS;
}

/* Flush the file to stable storage unless the tree was opened with
 * BT_NOSYNC.  Returns fsync()'s result, or 0 when syncing is disabled.
 */
int
btree_sync(struct btree *bt)
{
	if (!F_ISSET(bt->flags, BT_NOSYNC))
		return fsync(bt->fd);
	return 0;
}

/* Begin a transaction.  Only one write transaction may be active at a
 * time; writers take an exclusive flock() on the file.  The returned
 * transaction holds a reference on bt and must be ended with
 * btree_txn_commit() or btree_txn_abort().
 */
struct btree_txn *
btree_txn_begin(struct btree *bt, int rdonly)
{
	struct btree_txn	*txn;

	if (!rdonly && bt->txn != NULL) {
		DPRINTF("write transaction already begun");
		errno = EBUSY;
		return NULL;
	}

	if ((txn = calloc(1, sizeof(*txn))) == NULL) {
		DPRINTF("calloc: %s", strerror(errno));
		return NULL;
	}

	if (rdonly) {
		txn->flags |= BT_TXN_RDONLY;
	} else {
		txn->dirty_queue = calloc(1, sizeof(*txn->dirty_queue));
		if (txn->dirty_queue == NULL) {
			free(txn);
			return NULL;
		}
		SIMPLEQ_INIT(txn->dirty_queue);

		DPRINTF("taking write lock on txn %p", txn);
		/* Non-blocking: a held lock fails the begin rather than
		 * stalling; errno is forced to EBUSY regardless of the
		 * flock() failure reason.
		 */
		if (flock(bt->fd, LOCK_EX | LOCK_NB) != 0) {
			DPRINTF("flock: %s", strerror(errno));
			errno = EBUSY;
			free(txn->dirty_queue);
			free(txn);
			return NULL;
		}
		bt->txn = txn;
	}

	txn->bt = bt;
	btree_ref(bt);

	/* Pick up the current root and next free page from the newest
	 * valid meta page (other processes may have appended).
	 */
	if (btree_read_meta(bt, &txn->next_pgno) != BT_SUCCESS) {
		btree_txn_abort(txn);
		return NULL;
	}

	txn->root = bt->meta.root;
	DPRINTF("begin transaction on btree %p, root page %u", bt, txn->root);

	return txn;
}

/* Abort a transaction: discard dirty pages, release the write lock and
 * drop the reference on the tree.  Safe to call with NULL.
 */
void
btree_txn_abort(struct btree_txn *txn)
{
	struct mpage	*mp;
	struct btree	*bt;

	if (txn == NULL)
		return;

	bt = txn->bt;
	DPRINTF("abort transaction on btree %p, root page %u", bt, txn->root);

	if (!F_ISSET(txn->flags, BT_TXN_RDONLY)) {
		/* Discard all dirty pages.
		 */
		while (!SIMPLEQ_EMPTY(txn->dirty_queue)) {
			mp = SIMPLEQ_FIRST(txn->dirty_queue);
			assert(mp->ref == 0);	/* cursors should be closed */
			mpage_del(bt, mp);
			SIMPLEQ_REMOVE_HEAD(txn->dirty_queue, next);
			mpage_free(mp);
		}

		DPRINTF("releasing write lock on txn %p", txn);
		txn->bt->txn = NULL;
		if (flock(txn->bt->fd, LOCK_UN) != 0) {
			DPRINTF("failed to unlock fd %d: %s",
			    txn->bt->fd, strerror(errno));
		}
		free(txn->dirty_queue);
	}

	btree_close(txn->bt);
	free(txn);
}

/* Commit a write transaction: write dirty pages in writev() batches,
 * then append and sync a new meta page.  The transaction is consumed
 * (aborted/freed) on every path, success included.
 */
int
btree_txn_commit(struct btree_txn *txn)
{
	int		 n, done;
	ssize_t		 rc;
	off_t		 size;
	struct mpage	*mp;
	struct btree	*bt;
	struct iovec	 iov[BT_COMMIT_PAGES];

	assert(txn != NULL);
	assert(txn->bt != NULL);

	bt = txn->bt;

	if (F_ISSET(txn->flags, BT_TXN_RDONLY)) {
		DPRINTF("attempt to commit read-only transaction");
		btree_txn_abort(txn);
		errno = EPERM;
		return BT_FAIL;
	}

	if (txn != bt->txn) {
		DPRINTF("attempt to commit unknown transaction");
		btree_txn_abort(txn);
		errno = EINVAL;
		return BT_FAIL;
	}

	if (F_ISSET(txn->flags, BT_TXN_ERROR)) {
		DPRINTF("error flag is set, can't commit");
		btree_txn_abort(txn);
		errno = EINVAL;
		return BT_FAIL;
	}

	if (SIMPLEQ_EMPTY(txn->dirty_queue))
		goto done;

	/* A previous crash may have left a partial page at the end of
	 * the file; pad it out so appended pages stay aligned.
	 */
	if (F_ISSET(bt->flags, BT_FIXPADDING)) {
		size = lseek(bt->fd, 0, SEEK_END);
		size += bt->head.psize - (size %
bt->head.psize); 770 DPRINTF("extending to multiple of page size: %llu", size); 771 if (ftruncate(bt->fd, size) != 0) { 772 DPRINTF("ftruncate: %s", strerror(errno)); 773 btree_txn_abort(txn); 774 return BT_FAIL; 775 } 776 bt->flags &= ~BT_FIXPADDING; 777 } 778 779 DPRINTF("committing transaction on btree %p, root page %u", 780 bt, txn->root); 781 782 /* Commit up to BT_COMMIT_PAGES dirty pages to disk until done. 783 */ 784 do { 785 n = 0; 786 done = 1; 787 SIMPLEQ_FOREACH(mp, txn->dirty_queue, next) { 788 DPRINTF("commiting page %u", mp->pgno); 789 iov[n].iov_len = bt->head.psize; 790 iov[n].iov_base = mp->page; 791 if (++n >= BT_COMMIT_PAGES) { 792 done = 0; 793 break; 794 } 795 } 796 797 if (n == 0) 798 break; 799 800 DPRINTF("commiting %u dirty pages", n); 801 rc = writev(bt->fd, iov, n); 802 if (rc != (ssize_t)bt->head.psize*n) { 803 if (rc > 0) 804 DPRINTF("short write, filesystem full?"); 805 else 806 DPRINTF("writev: %s", strerror(errno)); 807 btree_txn_abort(txn); 808 return BT_FAIL; 809 } 810 811 /* Remove the dirty flag from the written pages. 812 */ 813 while (!SIMPLEQ_EMPTY(txn->dirty_queue)) { 814 mp = SIMPLEQ_FIRST(txn->dirty_queue); 815 mp->dirty = 0; 816 SIMPLEQ_REMOVE_HEAD(txn->dirty_queue, next); 817 if (--n == 0) 818 break; 819 } 820 } while (!done); 821 822 if (btree_sync(bt) != 0 || 823 btree_write_meta(bt, txn->root, 0) != BT_SUCCESS || 824 btree_sync(bt) != 0) { 825 btree_txn_abort(txn); 826 return BT_FAIL; 827 } 828 829 done: 830 mpage_prune(bt); 831 btree_txn_abort(txn); 832 833 return BT_SUCCESS; 834 } 835 836 static int 837 btree_write_header(struct btree *bt, int fd) 838 { 839 struct stat sb; 840 struct bt_head *h; 841 struct page *p; 842 ssize_t rc; 843 unsigned int psize; 844 845 DPRINTF("writing header page"); 846 assert(bt != NULL); 847 848 /* Ask stat for 'optimal blocksize for I/O'. 
849 */ 850 if (fstat(fd, &sb) == 0) 851 psize = sb.st_blksize; 852 else 853 psize = PAGESIZE; 854 855 if ((p = calloc(1, psize)) == NULL) 856 return -1; 857 p->flags = P_HEAD; 858 859 h = METADATA(p); 860 h->magic = BT_MAGIC; 861 h->version = BT_VERSION; 862 h->psize = psize; 863 bcopy(h, &bt->head, sizeof(*h)); 864 865 rc = write(fd, p, bt->head.psize); 866 free(p); 867 if (rc != (ssize_t)bt->head.psize) { 868 if (rc > 0) 869 DPRINTF("short write, filesystem full?"); 870 return BT_FAIL; 871 } 872 873 return BT_SUCCESS; 874 } 875 876 static int 877 btree_read_header(struct btree *bt) 878 { 879 char page[PAGESIZE]; 880 struct page *p; 881 struct bt_head *h; 882 int rc; 883 884 assert(bt != NULL); 885 886 /* We don't know the page size yet, so use a minimum value. 887 */ 888 889 if ((rc = pread(bt->fd, page, PAGESIZE, 0)) == 0) { 890 errno = ENOENT; 891 return -1; 892 } else if (rc != PAGESIZE) { 893 if (rc > 0) 894 errno = EINVAL; 895 DPRINTF("read: %s", strerror(errno)); 896 return -1; 897 } 898 899 p = (struct page *)page; 900 901 if (!F_ISSET(p->flags, P_HEAD)) { 902 DPRINTF("page %d not a header page", p->pgno); 903 errno = EINVAL; 904 return -1; 905 } 906 907 h = METADATA(p); 908 if (h->magic != BT_MAGIC) { 909 DPRINTF("header has invalid magic"); 910 errno = EINVAL; 911 return -1; 912 } 913 914 if (h->version != BT_VERSION) { 915 DPRINTF("database is version %u, expected version %u", 916 bt->head.version, BT_VERSION); 917 errno = EINVAL; 918 return -1; 919 } 920 921 bcopy(h, &bt->head, sizeof(*h)); 922 return 0; 923 } 924 925 static int 926 btree_write_meta(struct btree *bt, pgno_t root, unsigned int flags) 927 { 928 struct mpage *mp; 929 struct bt_meta *meta; 930 ssize_t rc; 931 932 DPRINTF("writing meta page for root page %u", root); 933 934 assert(bt != NULL); 935 assert(bt->txn != NULL); 936 937 if ((mp = btree_new_page(bt, P_META)) == NULL) 938 return -1; 939 940 bt->meta.prev_meta = bt->meta.root; 941 bt->meta.root = root; 942 bt->meta.flags = flags; 943 
	bt->meta.created_at = time(0);
	bt->meta.revisions++;
	/* Hash everything up to (not including) the hash field itself. */
	SHA1((unsigned char *)&bt->meta, METAHASHLEN, bt->meta.hash);

	/* Copy the meta data changes to the new meta page. */
	meta = METADATA(mp->page);
	bcopy(&bt->meta, meta, sizeof(*meta));

	/* Meta pages bypass the dirty queue and are written immediately. */
	rc = write(bt->fd, mp->page, bt->head.psize);
	mp->dirty = 0;
	SIMPLEQ_REMOVE_HEAD(bt->txn->dirty_queue, next);
	if (rc != (ssize_t)bt->head.psize) {
		if (rc > 0)
			DPRINTF("short write, filesystem full?");
		return BT_FAIL;
	}

	if ((bt->size = lseek(bt->fd, 0, SEEK_END)) == -1) {
		DPRINTF("failed to update file size: %s", strerror(errno));
		bt->size = 0;
	}

	return BT_SUCCESS;
}

/* Returns true if page p is a valid meta page, false otherwise.
 * Checks the page flags, that the root pointer refers to an earlier
 * page, and that the stored SHA1 digest matches the page contents.
 */
static int
btree_is_meta_page(struct page *p)
{
	struct bt_meta	*m;
	unsigned char	 hash[SHA_DIGEST_LENGTH];

	m = METADATA(p);
	if (!F_ISSET(p->flags, P_META)) {
		DPRINTF("page %d not a meta page", p->pgno);
		errno = EINVAL;
		return 0;
	}

	/* In an append-only file the root must precede the meta page. */
	if (m->root >= p->pgno && m->root != P_INVALID) {
		DPRINTF("page %d points to an invalid root page", p->pgno);
		errno = EINVAL;
		return 0;
	}

	SHA1((unsigned char *)m, METAHASHLEN, hash);
	if (bcmp(hash, m->hash, SHA_DIGEST_LENGTH) != 0) {
		DPRINTF("page %d has an invalid digest", p->pgno);
		errno = EINVAL;
		return 0;
	}

	return 1;
}

/* Locate the newest valid meta page by scanning backwards from the end
 * of the file, and load it into bt->meta.  If p_next is non-NULL the
 * next unallocated page number is stored there.  Returns BT_FAIL with
 * errno set on error (ESTALE if the file carries a tombstone).
 */
static int
btree_read_meta(struct btree *bt, pgno_t *p_next)
{
	struct mpage	*mp;
	struct bt_meta	*meta;
	pgno_t		 meta_pgno, next_pgno;
	off_t		 size;

	assert(bt != NULL);

	if ((size = lseek(bt->fd, 0, SEEK_END)) == -1)
		goto fail;

	DPRINTF("btree_read_meta: size = %llu", size);

	/* The file only ever grows; shrinking means corruption or
	 * replacement underneath us.
	 */
	if (size < bt->size) {
		DPRINTF("file has shrunk!");
		errno = EIO;
		goto fail;
	}

	if (size == bt->head.psize) {		/* there is only the header */
		if (p_next != NULL)
			*p_next = 1;
		return BT_SUCCESS;		/* new file */
	}

	next_pgno = size / bt->head.psize;
	if (next_pgno == 0) {
		DPRINTF("corrupt file");
		errno = EIO;
		goto fail;
	}

	meta_pgno = next_pgno - 1;

	/* A torn final page is skipped; it will be padded out on the
	 * next commit (BT_FIXPADDING).
	 */
	if (size % bt->head.psize != 0) {
		DPRINTF("filesize not a multiple of the page size!");
		bt->flags |= BT_FIXPADDING;
		next_pgno++;
	}

	if (p_next != NULL)
		*p_next = next_pgno;

	if (size == bt->size) {
		DPRINTF("size unchanged, keeping current meta page");
		if (F_ISSET(bt->meta.flags, BT_TOMBSTONE)) {
			DPRINTF("file is dead");
			errno = ESTALE;
			return BT_FAIL;
		} else
			return BT_SUCCESS;
	}
	bt->size = size;

	while (meta_pgno > 0) {
		if ((mp = btree_get_mpage(bt, meta_pgno)) == NULL)
			break;
		if (btree_is_meta_page(mp->page)) {
			meta = METADATA(mp->page);
			DPRINTF("flags = 0x%x", meta->flags);
			if (F_ISSET(meta->flags, BT_TOMBSTONE)) {
				DPRINTF("file is dead");
				errno = ESTALE;
				return BT_FAIL;
			} else {
				/* Make copy of last meta page.
				 */
				bcopy(meta, &bt->meta, sizeof(bt->meta));
				return BT_SUCCESS;
			}
		}
		--meta_pgno;	/* scan backwards to first valid meta page */
	}

	errno = EIO;
fail:
	if (p_next != NULL)
		*p_next = P_INVALID;
	return BT_FAIL;
}

/* Wrap an already-open file descriptor in a btree handle.  The fd is
 * switched to append-only mode; the header is read, or written if the
 * file is new.  Returns NULL with errno set on failure (the fd is not
 * closed in that case).
 */
struct btree *
btree_open_fd(int fd, unsigned int flags)
{
	struct btree	*bt;
	int		 fl;

	fl = fcntl(fd, F_GETFL, 0);
	if (fcntl(fd, F_SETFL, fl | O_APPEND) == -1)
		return NULL;

	if ((bt = calloc(1, sizeof(*bt))) == NULL)
		return NULL;
	bt->fd = fd;
	bt->flags = flags;
	bt->flags &= ~BT_FIXPADDING;
	bt->ref = 1;
	bt->meta.root = P_INVALID;

	if ((bt->page_cache = calloc(1, sizeof(*bt->page_cache))) == NULL)
		goto fail;
	bt->stat.max_cache = BT_MAXCACHE_DEF;
	RB_INIT(bt->page_cache);

	if ((bt->lru_queue = calloc(1, sizeof(*bt->lru_queue))) == NULL)
		goto fail;
	TAILQ_INIT(bt->lru_queue);

	if (btree_read_header(bt) != 0) {
		if (errno != ENOENT)
			goto fail;
		DPRINTF("new database");
		btree_write_header(bt, bt->fd);
	}

	if (btree_read_meta(bt, NULL) != 0)
		goto fail;

	DPRINTF("opened database version %u, pagesize %u",
	    bt->head.version, bt->head.psize);
	DPRINTF("timestamp: %s", ctime(&bt->meta.created_at));
	DPRINTF("depth: %u", bt->meta.depth);
	DPRINTF("entries: %llu", bt->meta.entries);
	DPRINTF("revisions: %u", bt->meta.revisions);
	DPRINTF("branch pages: %u", bt->meta.branch_pages);
	DPRINTF("leaf pages: %u", bt->meta.leaf_pages);
	DPRINTF("overflow pages: %u", bt->meta.overflow_pages);
	DPRINTF("root: %u", bt->meta.root);
	DPRINTF("previous meta page: %u", bt->meta.prev_meta);

	return bt;

fail:
	free(bt->lru_queue);
	free(bt->page_cache);
	free(bt);
	return NULL;
}

/* Open (and if necessary create) the database file at path. */
struct btree *
btree_open(const char *path, unsigned int flags, mode_t mode)
1141 { 1142 int fd, oflags; 1143 struct btree *bt; 1144 1145 if (F_ISSET(flags, BT_RDONLY)) 1146 oflags = O_RDONLY; 1147 else 1148 oflags = O_RDWR | O_CREAT | O_APPEND; 1149 1150 if ((fd = open(path, oflags, mode)) == -1) 1151 return NULL; 1152 1153 if ((bt = btree_open_fd(fd, flags)) == NULL) 1154 close(fd); 1155 else { 1156 bt->path = strdup(path); 1157 DPRINTF("opened btree %p", bt); 1158 } 1159 1160 return bt; 1161 } 1162 1163 static void 1164 btree_ref(struct btree *bt) 1165 { 1166 bt->ref++; 1167 DPRINTF("ref is now %d on btree %p", bt->ref, bt); 1168 } 1169 1170 void 1171 btree_close(struct btree *bt) 1172 { 1173 if (bt == NULL) 1174 return; 1175 1176 if (--bt->ref == 0) { 1177 DPRINTF("ref is zero, closing btree %p", bt); 1178 close(bt->fd); 1179 mpage_flush(bt); 1180 free(bt->page_cache); 1181 free(bt); 1182 } else 1183 DPRINTF("ref is now %d on btree %p", bt->ref, bt); 1184 } 1185 1186 /* Search for key within a leaf page, using binary search. 1187 * Returns the smallest entry larger or equal to the key. 1188 * If exactp is non-null, stores whether the found entry was an exact match 1189 * in *exactp (1 or 0). 1190 * If kip is non-null, stores the index of the found entry in *kip. 1191 * If no entry larger of equal to the key is found, returns NULL. 1192 */ 1193 static struct node * 1194 btree_search_node(struct btree *bt, struct mpage *mp, struct btval *key, 1195 int *exactp, unsigned int *kip) 1196 { 1197 unsigned int i = 0; 1198 int low, high; 1199 int rc = 0; 1200 struct node *node; 1201 struct btval nodekey; 1202 1203 DPRINTF("searching %lu keys in %s page %u with prefix [%.*s]", 1204 NUMKEYS(mp), 1205 IS_LEAF(mp) ? "leaf" : "branch", 1206 mp->pgno, (int)mp->prefix.len, (char *)mp->prefix.str); 1207 1208 assert(NUMKEYS(mp) > 0); 1209 1210 bzero(&nodekey, sizeof(nodekey)); 1211 1212 low = IS_LEAF(mp) ? 
0 : 1; 1213 high = NUMKEYS(mp) - 1; 1214 while (low <= high) { 1215 i = (low + high) >> 1; 1216 node = NODEPTR(mp, i); 1217 1218 nodekey.size = node->ksize; 1219 nodekey.data = NODEKEY(node); 1220 1221 if (bt->cmp) 1222 rc = bt->cmp(key, &nodekey); 1223 else 1224 rc = bt_cmp(bt, key, &nodekey, &mp->prefix); 1225 1226 if (IS_LEAF(mp)) 1227 DPRINTF("found leaf index %u [%.*s], rc = %i", 1228 i, (int)nodekey.size, (char *)nodekey.data, rc); 1229 else 1230 DPRINTF("found branch index %u [%.*s -> %u], rc = %i", 1231 i, (int)node->ksize, (char *)NODEKEY(node), 1232 node->n_pgno, rc); 1233 1234 if (rc == 0) 1235 break; 1236 if (rc > 0) 1237 low = i + 1; 1238 else 1239 high = i - 1; 1240 } 1241 1242 if (rc > 0) { /* Found entry is less than the key. */ 1243 i++; /* Skip to get the smallest entry larger than key. */ 1244 if (i >= NUMKEYS(mp)) 1245 /* There is no entry larger or equal to the key. */ 1246 return NULL; 1247 } 1248 if (exactp) 1249 *exactp = (rc == 0); 1250 if (kip) /* Store the key index if requested. 
 */
		*kip = i;

	return NODEPTR(mp, i);
}

/* Pop the top entry off the cursor's page stack, drop the reference it
 * held on the mpage, and free the stack entry itself.
 */
static void
cursor_pop_page(struct cursor *cursor)
{
	struct ppage	*top;

	top = CURSOR_TOP(cursor);
	CURSOR_POP(cursor);
	top->mpage->ref--;

	DPRINTF("popped page %u off cursor %p", top->mpage->pgno, cursor);

	free(top);
}

/* Push mp on the cursor's page stack, taking a reference on it.
 * Returns the new stack entry, or NULL if allocation fails (mp's
 * refcount is untouched in that case).
 */
static struct ppage *
cursor_push_page(struct cursor *cursor, struct mpage *mp)
{
	struct ppage	*ppage;

	DPRINTF("pushing page %u on cursor %p", mp->pgno, cursor);

	if ((ppage = calloc(1, sizeof(*ppage))) == NULL)
		return NULL;
	ppage->mpage = mp;
	mp->ref++;
	CURSOR_PUSH(cursor, ppage);
	return ppage;
}

/* Return the in-memory page for pgno.  On a cache miss the page is
 * allocated, filled via btree_read_page() and inserted into the cache;
 * on a hit the cached mpage is returned as-is.
 * Returns NULL on allocation or read failure.
 */
static struct mpage *
btree_get_mpage(struct btree *bt, pgno_t pgno)
{
	struct mpage	*mp;

	mp = mpage_lookup(bt, pgno);
	if (mp == NULL) {
		if ((mp = calloc(1, sizeof(*mp))) == NULL)
			return NULL;
		if ((mp->page = malloc(bt->head.psize)) == NULL) {
			free(mp);
			return NULL;
		}
		if (btree_read_page(bt, pgno, mp->page) != BT_SUCCESS) {
			mpage_free(mp);
			return NULL;
		}
		mp->pgno = pgno;
		mpage_add(bt, mp);
	} else
		DPRINTF("returning page %u from cache", pgno);

	return mp;
}

/* Concatenate prefix s1 and suffix s2 into cs, storing the combined
 * length in *cn (which on entry holds the capacity of cs).  For
 * BT_REVERSEKEY trees keys are compared back-to-front, so the shared
 * "prefix" is appended after s2 instead of prepended.
 */
static void
concat_prefix(struct btree *bt, char *s1, size_t n1, char *s2, size_t n2,
    char *cs, size_t *cn)
{
	assert(*cn >= n1 + n2);
	if (F_ISSET(bt->flags, BT_REVERSEKEY)) {
		bcopy(s2, cs, n2);
		bcopy(s1, cs + n2, n1);
	} else {
		bcopy(s1, cs, n1);
		bcopy(s2, cs + n1, n2);
	}
	*cn = n1 + n2;
}

/* Compute the key prefix shared by every key on mp and store it in
 * mp->prefix.  The prefix is derived from the separator keys in the
 * ancestor pages that bound mp from below and above; prefix
 * compression is disabled (length 0) when a custom comparator is set,
 * since the stripped bytes would be invisible to it.
 */
static void
find_common_prefix(struct btree *bt, struct mpage *mp)
{
	indx_t			 lbound = 0, ubound = 0;
	struct mpage		*lp, *up;
	struct btkey		 lprefix, uprefix;

	mp->prefix.len = 0;
	if (bt->cmp != NULL)
		return;

	/* Walk up until an ancestor has a separator to the left of us. */
	lp = mp;
	while (lp->parent != NULL) {
		if
(lp->parent_index > 0) { 1339 lbound = lp->parent_index; 1340 break; 1341 } 1342 lp = lp->parent; 1343 } 1344 1345 up = mp; 1346 while (up->parent != NULL) { 1347 if (up->parent_index + 1 < (indx_t)NUMKEYS(up->parent)) { 1348 ubound = up->parent_index + 1; 1349 break; 1350 } 1351 up = up->parent; 1352 } 1353 1354 if (lp->parent != NULL && up->parent != NULL) { 1355 expand_prefix(bt, lp->parent, lbound, &lprefix); 1356 expand_prefix(bt, up->parent, ubound, &uprefix); 1357 common_prefix(bt, &lprefix, &uprefix, &mp->prefix); 1358 } 1359 else if (mp->parent) 1360 bcopy(&mp->parent->prefix, &mp->prefix, sizeof(mp->prefix)); 1361 1362 DPRINTF("found common prefix [%.*s] (len %zu) for page %u", 1363 (int)mp->prefix.len, mp->prefix.str, mp->prefix.len, mp->pgno); 1364 } 1365 1366 static int 1367 btree_search_page_root(struct btree *bt, struct mpage *root, struct btval *key, 1368 struct cursor *cursor, int modify, struct mpage **mpp) 1369 { 1370 struct mpage *mp, *parent; 1371 1372 if (cursor && cursor_push_page(cursor, root) == NULL) 1373 return BT_FAIL; 1374 1375 mp = root; 1376 while (IS_BRANCH(mp)) { 1377 unsigned int i = 0; 1378 struct node *node; 1379 1380 DPRINTF("branch page %u has %lu keys", mp->pgno, NUMKEYS(mp)); 1381 assert(NUMKEYS(mp) > 1); 1382 DPRINTF("found index 0 to page %u", NODEPGNO(NODEPTR(mp, 0))); 1383 1384 if (key == NULL) /* Initialize cursor to first page. 
*/ 1385 i = 0; 1386 else { 1387 int exact; 1388 node = btree_search_node(bt, mp, key, &exact, &i); 1389 if (node == NULL) 1390 i = NUMKEYS(mp) - 1; 1391 else if (!exact) { 1392 assert(i > 0); 1393 i--; 1394 } 1395 } 1396 1397 if (key) 1398 DPRINTF("following index %u for key %.*s", 1399 i, (int)key->size, (char *)key->data); 1400 assert(i >= 0 && i < NUMKEYS(mp)); 1401 node = NODEPTR(mp, i); 1402 1403 if (cursor) 1404 CURSOR_TOP(cursor)->ki = i; 1405 1406 parent = mp; 1407 if ((mp = btree_get_mpage(bt, NODEPGNO(node))) == NULL) 1408 return BT_FAIL; 1409 mp->parent = parent; 1410 mp->parent_index = i; 1411 find_common_prefix(bt, mp); 1412 1413 if (cursor && cursor_push_page(cursor, mp) == NULL) 1414 return BT_FAIL; 1415 1416 if (modify && (mp = mpage_touch(bt, mp)) == NULL) 1417 return BT_FAIL; 1418 } 1419 1420 if (!IS_LEAF(mp)) { 1421 DPRINTF("internal error, index points to a %02X page!?", 1422 mp->page->flags); 1423 return BT_FAIL; 1424 } 1425 1426 DPRINTF("found leaf page %u for key %.*s", mp->pgno, 1427 key ? (int)key->size : 0, key ? (char *)key->data : NULL); 1428 1429 *mpp = mp; 1430 return BT_SUCCESS; 1431 } 1432 1433 /* Search for the page a given key should be in. 1434 * Stores a pointer to the found page in *mpp. 1435 * If key is NULL, search for the lowest page (used by btree_cursor_first). 1436 * If cursor is non-null, pushes parent pages on the cursor stack. 1437 * If modify is true, visited pages are updated with new page numbers. 1438 */ 1439 static int 1440 btree_search_page(struct btree *bt, struct btree_txn *txn, struct btval *key, 1441 struct cursor *cursor, int modify, struct mpage **mpp) 1442 { 1443 int rc; 1444 pgno_t root; 1445 struct mpage *mp; 1446 1447 /* Can't modify pages outside a transaction. */ 1448 if (txn == NULL && modify) { 1449 errno = EINVAL; 1450 return BT_FAIL; 1451 } 1452 1453 /* Choose which root page to start with. 
If a transaction is given 1454 * use the root page from the transaction, otherwise read the last 1455 * committed root page. 1456 */ 1457 if (txn == NULL) { 1458 if ((rc = btree_read_meta(bt, NULL)) != BT_SUCCESS) 1459 return rc; 1460 root = bt->meta.root; 1461 } else if (F_ISSET(txn->flags, BT_TXN_ERROR)) { 1462 DPRINTF("transaction has failed, must abort"); 1463 errno = EINVAL; 1464 return BT_FAIL; 1465 } else 1466 root = txn->root; 1467 1468 if (root == P_INVALID) { /* Tree is empty. */ 1469 DPRINTF("tree is empty"); 1470 errno = ENOENT; 1471 return BT_FAIL; 1472 } 1473 1474 if ((mp = btree_get_mpage(bt, root)) == NULL) 1475 return BT_FAIL; 1476 1477 DPRINTF("root page has flags 0x%X", mp->page->flags); 1478 1479 assert(mp->parent == NULL); 1480 assert(mp->prefix.len == 0); 1481 1482 if (modify && !mp->dirty) { 1483 if ((mp = mpage_touch(bt, mp)) == NULL) 1484 return BT_FAIL; 1485 txn->root = mp->pgno; 1486 } 1487 1488 return btree_search_page_root(bt, mp, key, cursor, modify, mpp); 1489 } 1490 1491 static int 1492 btree_read_data(struct btree *bt, struct mpage *mp, struct node *leaf, 1493 struct btval *data) 1494 { 1495 struct mpage *omp; /* overflow mpage */ 1496 size_t psz; 1497 size_t max; 1498 size_t sz = 0; 1499 pgno_t pgno; 1500 1501 bzero(data, sizeof(*data)); 1502 max = bt->head.psize - PAGEHDRSZ; 1503 1504 if (!F_ISSET(leaf->flags, F_BIGDATA)) { 1505 data->size = leaf->n_dsize; 1506 if (data->size > 0) { 1507 if (mp == NULL) { 1508 if ((data->data = malloc(data->size)) == NULL) 1509 return BT_FAIL; 1510 bcopy(NODEDATA(leaf), data->data, data->size); 1511 data->free_data = 1; 1512 data->mp = NULL; 1513 } else { 1514 data->data = NODEDATA(leaf); 1515 data->free_data = 0; 1516 data->mp = mp; 1517 mp->ref++; 1518 } 1519 } 1520 return BT_SUCCESS; 1521 } 1522 1523 /* Read overflow data. 
1524 */ 1525 DPRINTF("allocating %u byte for overflow data", leaf->n_dsize); 1526 if ((data->data = malloc(leaf->n_dsize)) == NULL) 1527 return BT_FAIL; 1528 data->size = leaf->n_dsize; 1529 data->free_data = 1; 1530 data->mp = NULL; 1531 bcopy(NODEDATA(leaf), &pgno, sizeof(pgno)); 1532 for (sz = 0; sz < data->size; ) { 1533 if ((omp = btree_get_mpage(bt, pgno)) == NULL || 1534 !F_ISSET(omp->page->flags, P_OVERFLOW)) { 1535 DPRINTF("read overflow page %u failed", pgno); 1536 free(data->data); 1537 mpage_free(omp); 1538 return BT_FAIL; 1539 } 1540 psz = data->size - sz; 1541 if (psz > max) 1542 psz = max; 1543 bcopy(omp->page->ptrs, (char *)data->data + sz, psz); 1544 sz += psz; 1545 pgno = omp->page->p_next_pgno; 1546 } 1547 1548 return BT_SUCCESS; 1549 } 1550 1551 int 1552 btree_txn_get(struct btree *bt, struct btree_txn *txn, 1553 struct btval *key, struct btval *data) 1554 { 1555 int rc, exact; 1556 struct node *leaf; 1557 struct mpage *mp; 1558 1559 assert(key); 1560 assert(data); 1561 DPRINTF("===> get key [%.*s]", (int)key->size, (char *)key->data); 1562 1563 if (bt != NULL && txn != NULL && bt != txn->bt) { 1564 errno = EINVAL; 1565 return BT_FAIL; 1566 } 1567 1568 if (bt == NULL) { 1569 if (txn == NULL) { 1570 errno = EINVAL; 1571 return BT_FAIL; 1572 } 1573 bt = txn->bt; 1574 } 1575 1576 if (key->size == 0 || key->size > MAXKEYSIZE) { 1577 errno = EINVAL; 1578 return BT_FAIL; 1579 } 1580 1581 if ((rc = btree_search_page(bt, txn, key, NULL, 0, &mp)) != BT_SUCCESS) 1582 return rc; 1583 1584 leaf = btree_search_node(bt, mp, key, &exact, NULL); 1585 if (leaf && exact) 1586 rc = btree_read_data(bt, mp, leaf, data); 1587 else { 1588 errno = ENOENT; 1589 rc = BT_FAIL; 1590 } 1591 1592 mpage_prune(bt); 1593 return rc; 1594 } 1595 1596 static int 1597 btree_sibling(struct cursor *cursor, int move_right) 1598 { 1599 int rc; 1600 struct node *indx; 1601 struct ppage *parent, *top; 1602 struct mpage *mp; 1603 1604 top = CURSOR_TOP(cursor); 1605 if ((parent = 
SLIST_NEXT(top, entry)) == NULL) { 1606 errno = ENOENT; 1607 return BT_FAIL; /* root has no siblings */ 1608 } 1609 1610 DPRINTF("parent page is page %u, index %u", 1611 parent->mpage->pgno, parent->ki); 1612 1613 cursor_pop_page(cursor); 1614 if (move_right ? (parent->ki + 1 >= NUMKEYS(parent->mpage)) 1615 : (parent->ki == 0)) { 1616 DPRINTF("no more keys left, moving to %s sibling", 1617 move_right ? "right" : "left"); 1618 if ((rc = btree_sibling(cursor, move_right)) != BT_SUCCESS) 1619 return rc; 1620 parent = CURSOR_TOP(cursor); 1621 } else { 1622 if (move_right) 1623 parent->ki++; 1624 else 1625 parent->ki--; 1626 DPRINTF("just moving to %s index key %u", 1627 move_right ? "right" : "left", parent->ki); 1628 } 1629 assert(IS_BRANCH(parent->mpage)); 1630 1631 indx = NODEPTR(parent->mpage, parent->ki); 1632 if ((mp = btree_get_mpage(cursor->bt, indx->n_pgno)) == NULL) 1633 return BT_FAIL; 1634 mp->parent = parent->mpage; 1635 mp->parent_index = parent->ki; 1636 1637 cursor_push_page(cursor, mp); 1638 find_common_prefix(cursor->bt, mp); 1639 1640 return BT_SUCCESS; 1641 } 1642 1643 static int 1644 bt_set_key(struct btree *bt, struct mpage *mp, struct node *node, 1645 struct btval *key) 1646 { 1647 if (key == NULL) 1648 return 0; 1649 1650 if (mp->prefix.len > 0) { 1651 key->size = node->ksize + mp->prefix.len; 1652 key->data = malloc(key->size); 1653 if (key->data == NULL) 1654 return -1; 1655 concat_prefix(bt, 1656 mp->prefix.str, mp->prefix.len, 1657 NODEKEY(node), node->ksize, 1658 key->data, &key->size); 1659 key->free_data = 1; 1660 } else { 1661 key->size = node->ksize; 1662 key->data = NODEKEY(node); 1663 key->free_data = 0; 1664 key->mp = mp; 1665 mp->ref++; 1666 } 1667 1668 return 0; 1669 } 1670 1671 static int 1672 btree_cursor_next(struct cursor *cursor, struct btval *key, struct btval *data) 1673 { 1674 struct ppage *top; 1675 struct mpage *mp; 1676 struct node *leaf; 1677 1678 if (cursor->eof) { 1679 errno = ENOENT; 1680 return BT_FAIL; 1681 } 1682 
1683 assert(cursor->initialized); 1684 1685 top = CURSOR_TOP(cursor); 1686 mp = top->mpage; 1687 1688 DPRINTF("cursor_next: top page is %u in cursor %p", mp->pgno, cursor); 1689 1690 if (top->ki + 1 >= NUMKEYS(mp)) { 1691 DPRINTF("=====> move to next sibling page"); 1692 if (btree_sibling(cursor, 1) != BT_SUCCESS) { 1693 cursor->eof = 1; 1694 return BT_FAIL; 1695 } 1696 top = CURSOR_TOP(cursor); 1697 mp = top->mpage; 1698 DPRINTF("next page is %u, key index %u", mp->pgno, top->ki); 1699 } else 1700 top->ki++; 1701 1702 DPRINTF("==> cursor points to page %u with %lu keys, key index %u", 1703 mp->pgno, NUMKEYS(mp), top->ki); 1704 1705 assert(IS_LEAF(mp)); 1706 leaf = NODEPTR(mp, top->ki); 1707 1708 if (data && btree_read_data(cursor->bt, mp, leaf, data) != BT_SUCCESS) 1709 return BT_FAIL; 1710 1711 if (bt_set_key(cursor->bt, mp, leaf, key) != 0) 1712 return BT_FAIL; 1713 1714 return BT_SUCCESS; 1715 } 1716 1717 static int 1718 btree_cursor_set(struct cursor *cursor, struct btval *key, struct btval *data, 1719 int *exactp) 1720 { 1721 int rc; 1722 struct node *leaf; 1723 struct mpage *mp; 1724 struct ppage *top; 1725 1726 assert(cursor); 1727 assert(key); 1728 assert(key->size > 0); 1729 1730 rc = btree_search_page(cursor->bt, cursor->txn, key, cursor, 0, &mp); 1731 if (rc != BT_SUCCESS) 1732 return rc; 1733 assert(IS_LEAF(mp)); 1734 1735 top = CURSOR_TOP(cursor); 1736 leaf = btree_search_node(cursor->bt, mp, key, exactp, &top->ki); 1737 if (exactp != NULL && !*exactp) { 1738 /* BT_CURSOR_EXACT specified and not an exact match. 
 */
		errno = ENOENT;
		return BT_FAIL;
	}

	if (leaf == NULL) {
		/* Key is past the last entry of this leaf; the answer
		 * (smallest entry >= key) is the first entry of the
		 * right sibling.
		 */
		DPRINTF("===> inexact leaf not found, goto sibling");
		if (btree_sibling(cursor, 1) != BT_SUCCESS)
			return BT_FAIL;		/* no entries matched */
		top = CURSOR_TOP(cursor);
		top->ki = 0;
		mp = top->mpage;
		assert(IS_LEAF(mp));
		leaf = NODEPTR(mp, 0);
	}

	cursor->initialized = 1;
	cursor->eof = 0;

	if (data && btree_read_data(cursor->bt, mp, leaf, data) != BT_SUCCESS)
		return BT_FAIL;

	if (bt_set_key(cursor->bt, mp, leaf, key) != 0)
		return BT_FAIL;
	DPRINTF("==> cursor placed on key %.*s",
	    (int)key->size, (char *)key->data);

	return BT_SUCCESS;
}

/* Position the cursor on the first (smallest) key of the tree and
 * optionally return that entry's key and data.
 */
static int
btree_cursor_first(struct cursor *cursor, struct btval *key, struct btval *data)
{
	int		 rc;
	struct mpage	*mp;
	struct node	*leaf;

	/* A NULL search key makes btree_search_page() descend to the
	 * leftmost leaf.
	 */
	rc = btree_search_page(cursor->bt, cursor->txn, NULL, cursor, 0, &mp);
	if (rc != BT_SUCCESS)
		return rc;
	assert(IS_LEAF(mp));

	leaf = NODEPTR(mp, 0);
	cursor->initialized = 1;
	cursor->eof = 0;

	if (data && btree_read_data(cursor->bt, mp, leaf, data) != BT_SUCCESS)
		return BT_FAIL;

	if (bt_set_key(cursor->bt, mp, leaf, key) != 0)
		return BT_FAIL;

	return BT_SUCCESS;
}

/* Public cursor operation dispatcher.
 * BT_CURSOR / BT_CURSOR_EXACT reposition on key (inexact match allowed
 * only for BT_CURSOR), BT_FIRST rewinds to the smallest key, BT_NEXT
 * advances (or behaves like BT_FIRST on an uninitialized cursor).
 * Returns BT_SUCCESS or BT_FAIL with errno set.
 */
int
btree_cursor_get(struct cursor *cursor, struct btval *key, struct btval *data,
    enum cursor_op op)
{
	int		 rc;
	int		 exact = 0;

	assert(cursor);

	switch (op) {
	case BT_CURSOR:
	case BT_CURSOR_EXACT:
		/* Repositioning: discard the current page stack. */
		while (CURSOR_TOP(cursor) != NULL)
			cursor_pop_page(cursor);
		if (key == NULL || key->size == 0 || key->size > MAXKEYSIZE) {
			errno = EINVAL;
			rc = BT_FAIL;
		} else if (op == BT_CURSOR_EXACT)
			rc = btree_cursor_set(cursor, key, data, &exact);
		else
			rc = btree_cursor_set(cursor, key, data, NULL);
		break;
	case BT_NEXT:
		if
		    (!cursor->initialized)
			rc = btree_cursor_first(cursor, key, data);
		else
			rc = btree_cursor_next(cursor, key, data);
		break;
	case BT_FIRST:
		while (CURSOR_TOP(cursor) != NULL)
			cursor_pop_page(cursor);
		rc = btree_cursor_first(cursor, key, data);
		break;
	default:
		DPRINTF("unhandled/unimplemented cursor operation %u", op);
		rc = BT_FAIL;
		break;
	}

	mpage_prune(cursor->bt);

	return rc;
}

/* Allocate a fresh page of the given type within the current write
 * transaction: it gets the next page number, empty bounds, is added to
 * the page cache and marked dirty, and the per-type page counter in
 * the meta data is bumped.  Returns NULL on allocation failure.
 */
static struct mpage *
btree_new_page(struct btree *bt, uint32_t flags)
{
	struct mpage	*mp;

	assert(bt != NULL);
	assert(bt->txn != NULL);

	DPRINTF("allocating new mpage %u, page size %u",
	    bt->txn->next_pgno, bt->head.psize);
	if ((mp = calloc(1, sizeof(*mp))) == NULL)
		return NULL;
	if ((mp->page = malloc(bt->head.psize)) == NULL) {
		free(mp);
		return NULL;
	}
	mp->pgno = mp->page->pgno = bt->txn->next_pgno++;
	mp->page->flags = flags;
	/* Free space initially spans the whole page after the header. */
	mp->page->lower = PAGEHDRSZ;
	mp->page->upper = bt->head.psize;

	if (IS_BRANCH(mp))
		bt->meta.branch_pages++;
	else if (IS_LEAF(mp))
		bt->meta.leaf_pages++;
	else if (IS_OVERFLOW(mp))
		bt->meta.overflow_pages++;

	mpage_add(bt, mp);
	mpage_dirty(bt, mp);

	return mp;
}

/* Space a key/data pair will occupy on a leaf page, including its
 * ptrs[] slot.  Data large enough to go to an overflow page only
 * accounts for the overflow page number here.
 */
static size_t
bt_leaf_size(struct btree *bt, struct btval *key, struct btval *data)
{
	size_t		 sz;

	sz = LEAFSIZE(key, data);
	if (data->size >= bt->head.psize / BT_MINKEYS) {
		/* put on overflow page */
		sz -= data->size - sizeof(pgno_t);
	}

	return sz + sizeof(indx_t);
}

/* Space a key will occupy on a branch page, including its ptrs[]
 * slot.  Overflowing branch keys are not implemented.
 */
static size_t
bt_branch_size(struct btree *bt, struct btval *key)
{
	size_t		 sz;

	sz = INDXSIZE(key);
	if (sz >= bt->head.psize / BT_MINKEYS) {
		/* put on overflow page */
		/* not implemented */
		/* sz -= key->size - sizeof(pgno_t); */
	}

	return sz + sizeof(indx_t);
} 1899 1900 static int 1901 btree_write_overflow_data(struct btree *bt, struct page *p, struct btval *data) 1902 { 1903 size_t done = 0; 1904 size_t sz; 1905 size_t max; 1906 pgno_t *linkp; /* linked page stored here */ 1907 struct mpage *next = NULL; 1908 1909 max = bt->head.psize - PAGEHDRSZ; 1910 1911 while (done < data->size) { 1912 if (next != NULL) 1913 p = next->page; 1914 linkp = &p->p_next_pgno; 1915 if (data->size - done > max) { 1916 /* need another overflow page */ 1917 if ((next = btree_new_page(bt, P_OVERFLOW)) == NULL) 1918 return BT_FAIL; 1919 *linkp = next->pgno; 1920 DPRINTF("linking overflow page %u", next->pgno); 1921 } else 1922 *linkp = 0; /* indicates end of list */ 1923 sz = data->size - done; 1924 if (sz > max) 1925 sz = max; 1926 DPRINTF("copying %zu bytes to overflow page %u", sz, p->pgno); 1927 bcopy((char *)data->data + done, p->ptrs, sz); 1928 done += sz; 1929 } 1930 1931 return BT_SUCCESS; 1932 } 1933 1934 /* Key prefix should already be stripped. 1935 */ 1936 static int 1937 btree_add_node(struct btree *bt, struct mpage *mp, indx_t indx, 1938 struct btval *key, struct btval *data, pgno_t pgno, uint8_t flags) 1939 { 1940 unsigned int i; 1941 size_t node_size = NODESIZE; 1942 indx_t ofs; 1943 struct node *node; 1944 struct page *p; 1945 struct mpage *ofp = NULL; /* overflow page */ 1946 1947 p = mp->page; 1948 assert(p->upper >= p->lower); 1949 1950 DPRINTF("add node [%.*s] to %s page %u at index %i, key size %zu", 1951 key ? (int)key->size : 0, key ? (char *)key->data : NULL, 1952 IS_LEAF(mp) ? "leaf" : "branch", 1953 mp->pgno, indx, key ? key->size : 0); 1954 1955 if (key != NULL) 1956 node_size += key->size; 1957 1958 if (IS_LEAF(mp)) { 1959 assert(data); 1960 node_size += data->size; 1961 if (F_ISSET(flags, F_BIGDATA)) { 1962 /* Data already on overflow page. */ 1963 node_size -= data->size - sizeof(pgno_t); 1964 } else if (data->size >= bt->head.psize / BT_MINKEYS) { 1965 /* Put data on overflow page. 
*/ 1966 DPRINTF("data size is %zu, put on overflow page", 1967 data->size); 1968 node_size -= data->size - sizeof(pgno_t); 1969 if ((ofp = btree_new_page(bt, P_OVERFLOW)) == NULL) 1970 return BT_FAIL; 1971 DPRINTF("allocated overflow page %u", ofp->pgno); 1972 flags |= F_BIGDATA; 1973 } 1974 } 1975 1976 if (node_size + sizeof(indx_t) > SIZELEFT(mp)) { 1977 DPRINTF("not enough room in page %u, got %lu ptrs", 1978 mp->pgno, NUMKEYS(mp)); 1979 DPRINTF("upper - lower = %u - %u = %u", p->upper, p->lower, 1980 p->upper - p->lower); 1981 DPRINTF("node size = %zu", node_size); 1982 return BT_FAIL; 1983 } 1984 1985 /* Move higher pointers up one slot. */ 1986 for (i = NUMKEYS(mp); i > indx; i--) 1987 p->ptrs[i] = p->ptrs[i - 1]; 1988 1989 /* Adjust free space offsets. */ 1990 ofs = p->upper - node_size; 1991 assert(ofs >= p->lower + sizeof(indx_t)); 1992 p->ptrs[indx] = ofs; 1993 p->upper = ofs; 1994 p->lower += sizeof(indx_t); 1995 1996 /* Write the node data. */ 1997 node = NODEPTR(mp, indx); 1998 node->ksize = (key == NULL) ? 0 : key->size; 1999 node->flags = flags; 2000 if (IS_LEAF(mp)) 2001 node->n_dsize = data->size; 2002 else 2003 node->n_pgno = pgno; 2004 2005 if (key) 2006 bcopy(key->data, NODEKEY(node), key->size); 2007 2008 if (IS_LEAF(mp)) { 2009 assert(key); 2010 if (ofp == NULL) { 2011 if (F_ISSET(flags, F_BIGDATA)) 2012 bcopy(data->data, node->data + key->size, 2013 sizeof(pgno_t)); 2014 else 2015 bcopy(data->data, node->data + key->size, 2016 data->size); 2017 } else { 2018 bcopy(&ofp->pgno, node->data + key->size, 2019 sizeof(pgno_t)); 2020 if (btree_write_overflow_data(bt, ofp->page, 2021 data) == BT_FAIL) 2022 return BT_FAIL; 2023 } 2024 } 2025 2026 return BT_SUCCESS; 2027 } 2028 2029 static void 2030 btree_del_node(struct btree *bt, struct mpage *mp, indx_t indx) 2031 { 2032 unsigned int sz; 2033 indx_t i, j, numkeys, ptr; 2034 struct node *node; 2035 char *base; 2036 2037 DPRINTF("delete node %u on %s page %u", indx, 2038 IS_LEAF(mp) ? 
	    "leaf" : "branch", mp->pgno);
	assert(indx < NUMKEYS(mp));

	node = NODEPTR(mp, indx);
	sz = NODESIZE + node->ksize;
	if (IS_LEAF(mp)) {
		/* Big data only stores the overflow page number inline. */
		if (F_ISSET(node->flags, F_BIGDATA))
			sz += sizeof(pgno_t);
		else
			sz += NODEDSZ(node);
	}

	/* Close the ptrs[] gap and shift the offsets of all nodes that
	 * live below the deleted one (they move up by sz bytes).
	 */
	ptr = mp->page->ptrs[indx];
	numkeys = NUMKEYS(mp);
	for (i = j = 0; i < numkeys; i++) {
		if (i != indx) {
			mp->page->ptrs[j] = mp->page->ptrs[i];
			if (mp->page->ptrs[i] < ptr)
				mp->page->ptrs[j] += sz;
			j++;
		}
	}

	/* Compact the node heap over the hole left by the deleted node. */
	base = (char *)mp->page + mp->page->upper;
	bcopy(base, base + sz, ptr - mp->page->upper);

	mp->page->lower -= sizeof(indx_t);
	mp->page->upper += sz;
}

/* Open a cursor on the tree, optionally bound to a transaction.
 * Either bt or txn may be NULL (the other supplies the tree); passing
 * both requires them to match.  Takes a reference on the btree handle;
 * release with btree_cursor_close().
 * Returns NULL with errno = EINVAL on bad arguments, or NULL on
 * allocation failure.
 */
struct cursor *
btree_txn_cursor_open(struct btree *bt, struct btree_txn *txn)
{
	struct cursor	*cursor;

	if (bt != NULL && txn != NULL && bt != txn->bt) {
		errno = EINVAL;
		return NULL;
	}

	if (bt == NULL) {
		if (txn == NULL) {
			errno = EINVAL;
			return NULL;
		}
		bt = txn->bt;
	}

	if ((cursor = calloc(1, sizeof(*cursor))) != NULL) {
		SLIST_INIT(&cursor->stack);
		cursor->bt = bt;
		cursor->txn = txn;
		btree_ref(bt);
	}

	return cursor;
}

/* Release a cursor: drop all page references held on its stack and
 * the reference on the btree handle.  NULL is a no-op.
 */
void
btree_cursor_close(struct cursor *cursor)
{
	if (cursor != NULL) {
		while (!CURSOR_EMPTY(cursor))
			cursor_pop_page(cursor);

		btree_close(cursor->bt);
		free(cursor);
	}
}

/* Replace the key of node indx on page mp in place, shifting the node
 * heap when the new key has a different size.  Fails (BT_FAIL) when
 * a larger key does not fit in the page's free space.
 */
static int
btree_update_key(struct btree *bt, struct mpage *mp, indx_t indx,
    struct btval *key)
{
	indx_t		 ptr, i, numkeys;
	int		 delta;
	size_t		 len;
	struct node	*node;
	char		*base;

	node = NODEPTR(mp, indx);
	ptr = mp->page->ptrs[indx];
	DPRINTF("update key %u (ofs %u) [%.*s] to [%.*s] on page %u",
	    indx, ptr,
	    (int)node->ksize, (char *)NODEKEY(node),
	    (int)key->size, (char *)key->data,
	    mp->pgno);

	if
(key->size != node->ksize) { 2127 delta = key->size - node->ksize; 2128 if (delta > 0 && SIZELEFT(mp) < delta) { 2129 DPRINTF("OUCH! Not enough room, delta = %d", delta); 2130 return BT_FAIL; 2131 } 2132 2133 numkeys = NUMKEYS(mp); 2134 for (i = 0; i < numkeys; i++) { 2135 if (mp->page->ptrs[i] <= ptr) 2136 mp->page->ptrs[i] -= delta; 2137 } 2138 2139 base = (char *)mp->page + mp->page->upper; 2140 len = ptr - mp->page->upper + NODESIZE; 2141 bcopy(base, base - delta, len); 2142 mp->page->upper -= delta; 2143 2144 node = NODEPTR(mp, indx); 2145 node->ksize = key->size; 2146 } 2147 2148 bcopy(key->data, NODEKEY(node), key->size); 2149 2150 return BT_SUCCESS; 2151 } 2152 2153 static int 2154 btree_adjust_prefix(struct btree *bt, struct mpage *src, int delta) 2155 { 2156 indx_t i; 2157 struct node *node; 2158 struct btkey tmpkey; 2159 struct btval key; 2160 2161 DPRINTF("adjusting prefix lengths on page %u with delta %d", 2162 src->pgno, delta); 2163 assert(delta != 0); 2164 2165 for (i = 0; i < NUMKEYS(src); i++) { 2166 node = NODEPTR(src, i); 2167 tmpkey.len = node->ksize - delta; 2168 if (delta > 0) { 2169 if (F_ISSET(bt->flags, BT_REVERSEKEY)) 2170 bcopy(NODEKEY(node), tmpkey.str, tmpkey.len); 2171 else 2172 bcopy((char *)NODEKEY(node) + delta, tmpkey.str, 2173 tmpkey.len); 2174 } else { 2175 if (F_ISSET(bt->flags, BT_REVERSEKEY)) { 2176 bcopy(NODEKEY(node), tmpkey.str, node->ksize); 2177 bcopy(src->prefix.str, tmpkey.str + node->ksize, 2178 -delta); 2179 } else { 2180 bcopy(src->prefix.str + src->prefix.len + delta, 2181 tmpkey.str, -delta); 2182 bcopy(NODEKEY(node), tmpkey.str - delta, 2183 node->ksize); 2184 } 2185 } 2186 key.size = tmpkey.len; 2187 key.data = tmpkey.str; 2188 if (btree_update_key(bt, src, i, &key) != BT_SUCCESS) 2189 return BT_FAIL; 2190 } 2191 2192 return BT_SUCCESS; 2193 } 2194 2195 /* Move a node from src to dst. 
2196 */ 2197 static int 2198 btree_move_node(struct btree *bt, struct mpage *src, indx_t srcindx, 2199 struct mpage *dst, indx_t dstindx) 2200 { 2201 int rc; 2202 unsigned int pfxlen, mp_pfxlen = 0; 2203 struct node *srcnode; 2204 struct mpage *mp = NULL; 2205 struct btkey tmpkey, srckey; 2206 struct btval key, data; 2207 2208 assert(src->parent); 2209 assert(dst->parent); 2210 2211 srcnode = NODEPTR(src, srcindx); 2212 DPRINTF("moving %s node %u [%.*s] on page %u to node %u on page %u", 2213 IS_LEAF(src) ? "leaf" : "branch", 2214 srcindx, 2215 (int)srcnode->ksize, (char *)NODEKEY(srcnode), 2216 src->pgno, 2217 dstindx, dst->pgno); 2218 2219 find_common_prefix(bt, src); 2220 2221 if (IS_BRANCH(src)) { 2222 /* Need to check if the page the moved node points to 2223 * changes prefix. 2224 */ 2225 if ((mp = btree_get_mpage(bt, NODEPGNO(srcnode))) == NULL) 2226 return BT_FAIL; 2227 mp->parent = src; 2228 mp->parent_index = srcindx; 2229 find_common_prefix(bt, mp); 2230 mp_pfxlen = mp->prefix.len; 2231 } 2232 2233 /* Mark src and dst as dirty. */ 2234 if ((src = mpage_touch(bt, src)) == NULL || 2235 (dst = mpage_touch(bt, dst)) == NULL) 2236 return BT_FAIL; 2237 2238 find_common_prefix(bt, dst); 2239 2240 /* Check if src node has destination page prefix. Otherwise the 2241 * destination page must expand its prefix on all its nodes. 
2242 */ 2243 srckey.len = srcnode->ksize; 2244 bcopy(NODEKEY(srcnode), srckey.str, srckey.len); 2245 common_prefix(bt, &srckey, &dst->prefix, &tmpkey); 2246 if (tmpkey.len != dst->prefix.len) { 2247 if (btree_adjust_prefix(bt, dst, 2248 tmpkey.len - dst->prefix.len) != BT_SUCCESS) 2249 return BT_FAIL; 2250 bcopy(&tmpkey, &dst->prefix, sizeof(tmpkey)); 2251 } 2252 2253 if (srcindx == 0 && IS_BRANCH(src)) { 2254 struct mpage *low; 2255 2256 /* must find the lowest key below src 2257 */ 2258 assert(btree_search_page_root(bt, src, NULL, NULL, 0, 2259 &low) == BT_SUCCESS); 2260 expand_prefix(bt, low, 0, &srckey); 2261 DPRINTF("found lowest key [%.*s] on leaf page %u", 2262 (int)srckey.len, srckey.str, low->pgno); 2263 } else { 2264 srckey.len = srcnode->ksize; 2265 bcopy(NODEKEY(srcnode), srckey.str, srcnode->ksize); 2266 } 2267 find_common_prefix(bt, src); 2268 2269 /* expand the prefix */ 2270 tmpkey.len = sizeof(tmpkey.str); 2271 concat_prefix(bt, src->prefix.str, src->prefix.len, 2272 srckey.str, srckey.len, tmpkey.str, &tmpkey.len); 2273 2274 /* Add the node to the destination page. Adjust prefix for 2275 * destination page. 2276 */ 2277 key.size = tmpkey.len; 2278 key.data = tmpkey.str; 2279 remove_prefix(bt, &key, dst->prefix.len); 2280 data.size = NODEDSZ(srcnode); 2281 data.data = NODEDATA(srcnode); 2282 rc = btree_add_node(bt, dst, dstindx, &key, &data, NODEPGNO(srcnode), 2283 srcnode->flags); 2284 if (rc != BT_SUCCESS) 2285 return rc; 2286 2287 /* Delete the node from the source page. 2288 */ 2289 btree_del_node(bt, src, srcindx); 2290 2291 /* Update the parent separators. 
2292 */ 2293 if (srcindx == 0 && src->parent_index != 0) { 2294 expand_prefix(bt, src, 0, &tmpkey); 2295 key.size = tmpkey.len; 2296 key.data = tmpkey.str; 2297 remove_prefix(bt, &key, src->parent->prefix.len); 2298 2299 DPRINTF("update separator for source page %u to [%.*s]", 2300 src->pgno, (int)key.size, (char *)key.data); 2301 if (btree_update_key(bt, src->parent, src->parent_index, 2302 &key) != BT_SUCCESS) 2303 return BT_FAIL; 2304 } 2305 2306 if (srcindx == 0 && IS_BRANCH(src)) { 2307 struct btval nullkey; 2308 nullkey.size = 0; 2309 assert(btree_update_key(bt, src, 0, &nullkey) == BT_SUCCESS); 2310 } 2311 2312 if (dstindx == 0 && dst->parent_index != 0) { 2313 expand_prefix(bt, dst, 0, &tmpkey); 2314 key.size = tmpkey.len; 2315 key.data = tmpkey.str; 2316 remove_prefix(bt, &key, dst->parent->prefix.len); 2317 2318 DPRINTF("update separator for destination page %u to [%.*s]", 2319 dst->pgno, (int)key.size, (char *)key.data); 2320 if (btree_update_key(bt, dst->parent, dst->parent_index, 2321 &key) != BT_SUCCESS) 2322 return BT_FAIL; 2323 } 2324 2325 if (dstindx == 0 && IS_BRANCH(dst)) { 2326 struct btval nullkey; 2327 nullkey.size = 0; 2328 assert(btree_update_key(bt, dst, 0, &nullkey) == BT_SUCCESS); 2329 } 2330 2331 /* We can get a new page prefix here! 2332 * Must update keys in all nodes of this page! 
2333 */ 2334 pfxlen = src->prefix.len; 2335 find_common_prefix(bt, src); 2336 if (src->prefix.len != pfxlen) { 2337 if (btree_adjust_prefix(bt, src, 2338 src->prefix.len - pfxlen) != BT_SUCCESS) 2339 return BT_FAIL; 2340 } 2341 2342 pfxlen = dst->prefix.len; 2343 find_common_prefix(bt, dst); 2344 if (dst->prefix.len != pfxlen) { 2345 if (btree_adjust_prefix(bt, dst, 2346 dst->prefix.len - pfxlen) != BT_SUCCESS) 2347 return BT_FAIL; 2348 } 2349 2350 if (IS_BRANCH(dst)) { 2351 assert(mp); 2352 mp->parent = dst; 2353 mp->parent_index = dstindx; 2354 find_common_prefix(bt, mp); 2355 if (mp->prefix.len != mp_pfxlen) { 2356 DPRINTF("moved branch node has changed prefix"); 2357 if ((mp = mpage_touch(bt, mp)) == NULL) 2358 return BT_FAIL; 2359 if (btree_adjust_prefix(bt, mp, 2360 mp->prefix.len - mp_pfxlen) != BT_SUCCESS) 2361 return BT_FAIL; 2362 } 2363 } 2364 2365 return BT_SUCCESS; 2366 } 2367 2368 static int 2369 btree_merge(struct btree *bt, struct mpage *src, struct mpage *dst) 2370 { 2371 int rc; 2372 indx_t i; 2373 unsigned int pfxlen; 2374 struct node *srcnode; 2375 struct btkey tmpkey, dstpfx; 2376 struct btval key, data; 2377 2378 DPRINTF("merging page %u and %u", src->pgno, dst->pgno); 2379 2380 assert(src->parent); /* can't merge root page */ 2381 assert(dst->parent); 2382 assert(bt->txn != NULL); 2383 2384 /* Mark src and dst as dirty. */ 2385 if ((src = mpage_touch(bt, src)) == NULL || 2386 (dst = mpage_touch(bt, dst)) == NULL) 2387 return BT_FAIL; 2388 2389 find_common_prefix(bt, src); 2390 find_common_prefix(bt, dst); 2391 2392 /* Check if source nodes has destination page prefix. Otherwise 2393 * the destination page must expand its prefix on all its nodes. 
2394 */ 2395 common_prefix(bt, &src->prefix, &dst->prefix, &dstpfx); 2396 if (dstpfx.len != dst->prefix.len) { 2397 if (btree_adjust_prefix(bt, dst, 2398 dstpfx.len - dst->prefix.len) != BT_SUCCESS) 2399 return BT_FAIL; 2400 bcopy(&dstpfx, &dst->prefix, sizeof(dstpfx)); 2401 } 2402 2403 /* Move all nodes from src to dst. 2404 */ 2405 for (i = 0; i < NUMKEYS(src); i++) { 2406 srcnode = NODEPTR(src, i); 2407 2408 /* If branch node 0 (implicit key), find the real key. 2409 */ 2410 if (i == 0 && IS_BRANCH(src)) { 2411 struct mpage *low; 2412 2413 /* must find the lowest key below src 2414 */ 2415 assert(btree_search_page_root(bt, src, NULL, NULL, 0, 2416 &low) == BT_SUCCESS); 2417 expand_prefix(bt, low, 0, &tmpkey); 2418 DPRINTF("found lowest key [%.*s] on leaf page %u", 2419 (int)tmpkey.len, tmpkey.str, low->pgno); 2420 } else { 2421 expand_prefix(bt, src, i, &tmpkey); 2422 } 2423 2424 key.size = tmpkey.len; 2425 key.data = tmpkey.str; 2426 2427 remove_prefix(bt, &key, dst->prefix.len); 2428 data.size = NODEDSZ(srcnode); 2429 data.data = NODEDATA(srcnode); 2430 rc = btree_add_node(bt, dst, NUMKEYS(dst), &key, 2431 &data, NODEPGNO(srcnode), srcnode->flags); 2432 if (rc != BT_SUCCESS) 2433 return rc; 2434 } 2435 2436 DPRINTF("dst page %u now has %lu keys (%.1f%% filled)", 2437 dst->pgno, NUMKEYS(dst), (float)PAGEFILL(bt, dst) / 10); 2438 2439 /* Unlink the src page from parent. 
2440 */ 2441 btree_del_node(bt, src->parent, src->parent_index); 2442 if (src->parent_index == 0) { 2443 key.size = 0; 2444 if (btree_update_key(bt, src->parent, 0, &key) != BT_SUCCESS) 2445 return BT_FAIL; 2446 2447 pfxlen = src->prefix.len; 2448 find_common_prefix(bt, src); 2449 assert (src->prefix.len == pfxlen); 2450 } 2451 2452 if (IS_LEAF(src)) 2453 bt->meta.leaf_pages--; 2454 else 2455 bt->meta.branch_pages--; 2456 2457 return btree_rebalance(bt, src->parent); 2458 } 2459 2460 #define FILL_THRESHOLD 250 2461 2462 static int 2463 btree_rebalance(struct btree *bt, struct mpage *mp) 2464 { 2465 struct node *node; 2466 struct mpage *parent; 2467 struct mpage *root; 2468 struct mpage *neighbor = NULL; 2469 indx_t si = 0, di = 0; 2470 2471 assert(bt != NULL); 2472 assert(bt->txn != NULL); 2473 assert(mp != NULL); 2474 2475 DPRINTF("rebalancing %s page %u (has %lu keys, %.1f%% full)", 2476 IS_LEAF(mp) ? "leaf" : "branch", 2477 mp->pgno, NUMKEYS(mp), (float)PAGEFILL(bt, mp) / 10); 2478 2479 if (PAGEFILL(bt, mp) >= FILL_THRESHOLD) { 2480 DPRINTF("no need to rebalance page %u, above fill threshold", 2481 mp->pgno); 2482 return BT_SUCCESS; 2483 } 2484 2485 parent = mp->parent; 2486 2487 if (parent == NULL) { 2488 if (NUMKEYS(mp) == 0) { 2489 DPRINTF("tree is completely empty"); 2490 bt->txn->root = P_INVALID; 2491 bt->meta.depth--; 2492 bt->meta.leaf_pages--; 2493 } else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) { 2494 DPRINTF("collapsing root page!"); 2495 bt->txn->root = NODEPGNO(NODEPTR(mp, 0)); 2496 if ((root = btree_get_mpage(bt, bt->txn->root)) == NULL) 2497 return BT_FAIL; 2498 root->parent = NULL; 2499 bt->meta.depth--; 2500 bt->meta.branch_pages--; 2501 } else 2502 DPRINTF("root page doesn't need rebalancing"); 2503 return BT_SUCCESS; 2504 } 2505 2506 /* The parent (branch page) must have at least 2 pointers, 2507 * otherwise the tree is invalid. 2508 */ 2509 assert(NUMKEYS(parent) > 1); 2510 2511 /* Leaf page fill factor is below the threshold. 
2512 * Try to move keys from left or right neighbor, or 2513 * merge with a neighbor page. 2514 */ 2515 2516 /* Find neighbors. 2517 */ 2518 if (mp->parent_index == 0) { 2519 /* We're the leftmost leaf in our parent. 2520 */ 2521 DPRINTF("reading right neighbor"); 2522 node = NODEPTR(parent, mp->parent_index + 1); 2523 if ((neighbor = btree_get_mpage(bt, NODEPGNO(node))) == NULL) 2524 return BT_FAIL; 2525 neighbor->parent_index = mp->parent_index + 1; 2526 si = 0; 2527 di = NUMKEYS(mp); 2528 } else { 2529 /* There is at least one neighbor to the left. 2530 */ 2531 DPRINTF("reading left neighbor"); 2532 node = NODEPTR(parent, mp->parent_index - 1); 2533 if ((neighbor = btree_get_mpage(bt, NODEPGNO(node))) == NULL) 2534 return BT_FAIL; 2535 neighbor->parent_index = mp->parent_index - 1; 2536 si = NUMKEYS(neighbor) - 1; 2537 di = 0; 2538 } 2539 neighbor->parent = parent; 2540 2541 DPRINTF("found neighbor page %u (%lu keys, %.1f%% full)", 2542 neighbor->pgno, NUMKEYS(neighbor), (float)PAGEFILL(bt, neighbor) / 10); 2543 2544 /* If the neighbor page is above threshold and has at least two 2545 * keys, move one key from it. 2546 * 2547 * Otherwise we should try to merge them, but that might not be 2548 * possible, even if both are below threshold, as prefix expansion 2549 * might make keys larger. 
FIXME: detect this 2550 */ 2551 if (PAGEFILL(bt, neighbor) >= FILL_THRESHOLD && NUMKEYS(neighbor) >= 2) 2552 return btree_move_node(bt, neighbor, si, mp, di); 2553 else { /* FIXME: if (has_enough_room()) */ 2554 if (mp->parent_index == 0) 2555 return btree_merge(bt, neighbor, mp); 2556 else 2557 return btree_merge(bt, mp, neighbor); 2558 } 2559 } 2560 2561 int 2562 btree_txn_del(struct btree *bt, struct btree_txn *txn, 2563 struct btval *key, struct btval *data) 2564 { 2565 int rc, exact, close_txn = 0; 2566 unsigned int ki; 2567 struct node *leaf; 2568 struct mpage *mp; 2569 2570 DPRINTF("========> delete key %.*s", (int)key->size, (char *)key->data); 2571 2572 assert(key != NULL); 2573 2574 if (bt != NULL && txn != NULL && bt != txn->bt) { 2575 errno = EINVAL; 2576 return BT_FAIL; 2577 } 2578 2579 if (txn != NULL && F_ISSET(txn->flags, BT_TXN_RDONLY)) { 2580 errno = EINVAL; 2581 return BT_FAIL; 2582 } 2583 2584 if (bt == NULL) { 2585 if (txn == NULL) { 2586 errno = EINVAL; 2587 return BT_FAIL; 2588 } 2589 bt = txn->bt; 2590 } 2591 2592 if (key->size == 0 || key->size > MAXKEYSIZE) { 2593 errno = EINVAL; 2594 return BT_FAIL; 2595 } 2596 2597 if (txn == NULL) { 2598 close_txn = 1; 2599 if ((txn = btree_txn_begin(bt, 0)) == NULL) 2600 return BT_FAIL; 2601 } 2602 2603 if ((rc = btree_search_page(bt, txn, key, NULL, 1, &mp)) != BT_SUCCESS) 2604 goto done; 2605 2606 leaf = btree_search_node(bt, mp, key, &exact, &ki); 2607 if (leaf == NULL || !exact) { 2608 errno = ENOENT; 2609 rc = BT_FAIL; 2610 goto done; 2611 } 2612 2613 if (data && (rc = btree_read_data(bt, NULL, leaf, data)) != BT_SUCCESS) 2614 goto done; 2615 2616 btree_del_node(bt, mp, ki); 2617 bt->meta.entries--; 2618 rc = btree_rebalance(bt, mp); 2619 if (rc != BT_SUCCESS) 2620 txn->flags |= BT_TXN_ERROR; 2621 2622 done: 2623 if (close_txn) { 2624 if (rc == BT_SUCCESS) 2625 rc = btree_txn_commit(txn); 2626 else 2627 btree_txn_abort(txn); 2628 } 2629 mpage_prune(bt); 2630 return rc; 2631 } 2632 2633 /* Reduce 
the length of the prefix separator <sep> to the minimum length that 2634 * still makes it uniquely distinguishable from <min>. 2635 * 2636 * <min> is guaranteed to be sorted less than <sep> 2637 * 2638 * On return, <sep> is modified to the minimum length. 2639 */ 2640 static void 2641 bt_reduce_separator(struct btree *bt, struct node *min, struct btval *sep) 2642 { 2643 size_t n = 0; 2644 char *p1; 2645 char *p2; 2646 2647 if (F_ISSET(bt->flags, BT_REVERSEKEY)) { 2648 2649 assert(sep->size > 0); 2650 2651 p1 = (char *)NODEKEY(min) + min->ksize - 1; 2652 p2 = (char *)sep->data + sep->size - 1; 2653 2654 while (p1 >= (char *)NODEKEY(min) && *p1 == *p2) { 2655 assert(p2 > (char *)sep->data); 2656 p1--; 2657 p2--; 2658 n++; 2659 } 2660 2661 sep->data = p2; 2662 sep->size = n + 1; 2663 } else { 2664 2665 assert(min->ksize > 0); 2666 assert(sep->size > 0); 2667 2668 p1 = (char *)NODEKEY(min); 2669 p2 = (char *)sep->data; 2670 2671 while (*p1 == *p2) { 2672 p1++; 2673 p2++; 2674 n++; 2675 if (n == min->ksize || n == sep->size) 2676 break; 2677 } 2678 2679 sep->size = n + 1; 2680 } 2681 2682 DPRINTF("reduced separator to [%.*s] > [%.*s]", 2683 (int)sep->size, (char *)sep->data, 2684 (int)min->ksize, (char *)NODEKEY(min)); 2685 } 2686 2687 /* Split page <*mpp>, and insert <key,(data|newpgno)> in either left or 2688 * right sibling, at index <*newindxp> (as if unsplit). Updates *mpp and 2689 * *newindxp with the actual values after split, ie if *mpp and *newindxp 2690 * refer to a node in the new right sibling page. 
2691 */ 2692 static int 2693 btree_split(struct btree *bt, struct mpage **mpp, unsigned int *newindxp, 2694 struct btval *newkey, struct btval *newdata, pgno_t newpgno) 2695 { 2696 uint8_t flags; 2697 int rc = BT_SUCCESS, ins_new = 0; 2698 indx_t newindx; 2699 pgno_t pgno = 0; 2700 size_t orig_pfx_len, left_pfx_diff, right_pfx_diff, pfx_diff; 2701 unsigned int i, j, split_indx; 2702 struct node *node; 2703 struct mpage *pright, *p, *mp; 2704 struct btval sepkey, rkey, rdata; 2705 struct btkey tmpkey; 2706 struct page *copy; 2707 2708 assert(bt != NULL); 2709 assert(bt->txn != NULL); 2710 2711 mp = *mpp; 2712 newindx = *newindxp; 2713 2714 DPRINTF("-----> splitting %s page %u and adding [%.*s] at index %i", 2715 IS_LEAF(mp) ? "leaf" : "branch", mp->pgno, 2716 (int)newkey->size, (char *)newkey->data, *newindxp); 2717 DPRINTF("page %u has prefix [%.*s]", mp->pgno, 2718 (int)mp->prefix.len, (char *)mp->prefix.str); 2719 orig_pfx_len = mp->prefix.len; 2720 2721 if (mp->parent == NULL) { 2722 if ((mp->parent = btree_new_page(bt, P_BRANCH)) == NULL) 2723 return BT_FAIL; 2724 mp->parent_index = 0; 2725 bt->txn->root = mp->parent->pgno; 2726 DPRINTF("root split! new root = %u", mp->parent->pgno); 2727 bt->meta.depth++; 2728 2729 /* Add left (implicit) pointer. */ 2730 if (btree_add_node(bt, mp->parent, 0, NULL, NULL, 2731 mp->pgno, 0) != BT_SUCCESS) 2732 return BT_FAIL; 2733 } else { 2734 DPRINTF("parent branch page is %u", mp->parent->pgno); 2735 } 2736 2737 /* Create a right sibling. */ 2738 if ((pright = btree_new_page(bt, mp->page->flags)) == NULL) 2739 return BT_FAIL; 2740 pright->parent = mp->parent; 2741 pright->parent_index = mp->parent_index + 1; 2742 DPRINTF("new right sibling: page %u", pright->pgno); 2743 2744 /* Move half of the keys to the right sibling. 
*/ 2745 if ((copy = malloc(bt->head.psize)) == NULL) 2746 return BT_FAIL; 2747 bcopy(mp->page, copy, bt->head.psize); 2748 assert(mp->ref == 0); /* XXX */ 2749 bzero(&mp->page->ptrs, bt->head.psize - PAGEHDRSZ); 2750 mp->page->lower = PAGEHDRSZ; 2751 mp->page->upper = bt->head.psize; 2752 2753 split_indx = NUMKEYSP(copy) / 2 + 1; 2754 2755 /* First find the separating key between the split pages. 2756 */ 2757 bzero(&sepkey, sizeof(sepkey)); 2758 if (newindx == split_indx) { 2759 sepkey.size = newkey->size; 2760 sepkey.data = newkey->data; 2761 remove_prefix(bt, &sepkey, mp->prefix.len); 2762 } else { 2763 node = NODEPTRP(copy, split_indx); 2764 sepkey.size = node->ksize; 2765 sepkey.data = NODEKEY(node); 2766 } 2767 2768 if (IS_LEAF(mp) && bt->cmp == NULL) { 2769 /* Find the smallest separator. */ 2770 /* Ref: Prefix B-trees, R. Bayer, K. Unterauer, 1977 */ 2771 node = NODEPTRP(copy, split_indx - 1); 2772 bt_reduce_separator(bt, node, &sepkey); 2773 } 2774 2775 /* Fix separator wrt parent prefix. */ 2776 if (bt->cmp == NULL) { 2777 tmpkey.len = sizeof(tmpkey.str); 2778 concat_prefix(bt, mp->prefix.str, mp->prefix.len, 2779 sepkey.data, sepkey.size, tmpkey.str, &tmpkey.len); 2780 sepkey.data = tmpkey.str; 2781 sepkey.size = tmpkey.len; 2782 } 2783 2784 DPRINTF("separator is [%.*s]", (int)sepkey.size, (char *)sepkey.data); 2785 2786 /* Copy separator key to the parent. 2787 */ 2788 if (SIZELEFT(pright->parent) < bt_branch_size(bt, &sepkey)) { 2789 rc = btree_split(bt, &pright->parent, &pright->parent_index, 2790 &sepkey, NULL, pright->pgno); 2791 2792 /* Right page might now have changed parent. 2793 * Check if left page also changed parent. 
2794 */ 2795 if (pright->parent != mp->parent && 2796 mp->parent_index >= NUMKEYS(mp->parent)) { 2797 mp->parent = pright->parent; 2798 mp->parent_index = pright->parent_index - 1; 2799 } 2800 } else { 2801 remove_prefix(bt, &sepkey, pright->parent->prefix.len); 2802 rc = btree_add_node(bt, pright->parent, pright->parent_index, 2803 &sepkey, NULL, pright->pgno, 0); 2804 } 2805 if (rc != BT_SUCCESS) { 2806 free(copy); 2807 return BT_FAIL; 2808 } 2809 2810 /* Update prefix for right and left page, if the parent was split. 2811 */ 2812 find_common_prefix(bt, pright); 2813 assert(orig_pfx_len <= pright->prefix.len); 2814 right_pfx_diff = pright->prefix.len - orig_pfx_len; 2815 2816 find_common_prefix(bt, mp); 2817 assert(orig_pfx_len <= mp->prefix.len); 2818 left_pfx_diff = mp->prefix.len - orig_pfx_len; 2819 2820 for (i = j = 0; i <= NUMKEYSP(copy); j++) { 2821 if (i < split_indx) { 2822 /* Re-insert in left sibling. */ 2823 p = mp; 2824 pfx_diff = left_pfx_diff; 2825 } else { 2826 /* Insert in right sibling. */ 2827 if (i == split_indx) 2828 /* Reset insert index for right sibling. */ 2829 j = (i == newindx && ins_new); 2830 p = pright; 2831 pfx_diff = right_pfx_diff; 2832 } 2833 2834 if (i == newindx && !ins_new) { 2835 /* Insert the original entry that caused the split. */ 2836 rkey.data = newkey->data; 2837 rkey.size = newkey->size; 2838 if (IS_LEAF(mp)) { 2839 rdata.data = newdata->data; 2840 rdata.size = newdata->size; 2841 } else 2842 pgno = newpgno; 2843 flags = 0; 2844 pfx_diff = p->prefix.len; 2845 2846 ins_new = 1; 2847 2848 /* Update page and index for the new key. 
*/ 2849 *newindxp = j; 2850 *mpp = p; 2851 } else if (i == NUMKEYSP(copy)) { 2852 break; 2853 } else { 2854 node = NODEPTRP(copy, i); 2855 rkey.data = NODEKEY(node); 2856 rkey.size = node->ksize; 2857 if (IS_LEAF(mp)) { 2858 rdata.data = NODEDATA(node); 2859 rdata.size = node->n_dsize; 2860 } else 2861 pgno = node->n_pgno; 2862 flags = node->flags; 2863 2864 i++; 2865 } 2866 2867 if (!IS_LEAF(mp) && j == 0) { 2868 /* First branch index doesn't need key data. */ 2869 rkey.size = 0; 2870 } else 2871 remove_prefix(bt, &rkey, pfx_diff); 2872 2873 rc = btree_add_node(bt, p, j, &rkey, &rdata, pgno,flags); 2874 } 2875 2876 free(copy); 2877 return rc; 2878 } 2879 2880 int 2881 btree_txn_put(struct btree *bt, struct btree_txn *txn, 2882 struct btval *key, struct btval *data, unsigned int flags) 2883 { 2884 int rc = BT_SUCCESS, exact, close_txn = 0; 2885 unsigned int ki; 2886 struct node *leaf; 2887 struct mpage *mp; 2888 struct btval xkey; 2889 2890 assert(key != NULL); 2891 assert(data != NULL); 2892 2893 if (bt != NULL && txn != NULL && bt != txn->bt) { 2894 errno = EINVAL; 2895 return BT_FAIL; 2896 } 2897 2898 if (txn != NULL && F_ISSET(txn->flags, BT_TXN_RDONLY)) { 2899 errno = EINVAL; 2900 return BT_FAIL; 2901 } 2902 2903 if (bt == NULL) { 2904 if (txn == NULL) { 2905 errno = EINVAL; 2906 return BT_FAIL; 2907 } 2908 bt = txn->bt; 2909 } 2910 2911 if (key->size == 0 || key->size > MAXKEYSIZE) { 2912 errno = EINVAL; 2913 return BT_FAIL; 2914 } 2915 2916 DPRINTF("==> put key %.*s, size %zu, data size %zu", 2917 (int)key->size, (char *)key->data, key->size, data->size); 2918 2919 if (txn == NULL) { 2920 close_txn = 1; 2921 if ((txn = btree_txn_begin(bt, 0)) == NULL) 2922 return BT_FAIL; 2923 } 2924 2925 rc = btree_search_page(bt, txn, key, NULL, 1, &mp); 2926 if (rc == BT_SUCCESS) { 2927 leaf = btree_search_node(bt, mp, key, &exact, &ki); 2928 if (leaf && exact) { 2929 if (F_ISSET(flags, BT_NOOVERWRITE)) { 2930 DPRINTF("duplicate key %.*s", 2931 (int)key->size, (char 
*)key->data); 2932 errno = EEXIST; 2933 rc = BT_FAIL; 2934 goto done; 2935 } 2936 btree_del_node(bt, mp, ki); 2937 } 2938 if (leaf == NULL) { /* append if not found */ 2939 ki = NUMKEYS(mp); 2940 DPRINTF("appending key at index %i", ki); 2941 } 2942 } else if (errno == ENOENT) { 2943 /* new file, just write a root leaf page */ 2944 DPRINTF("allocating new root leaf page"); 2945 if ((mp = btree_new_page(bt, P_LEAF)) == NULL) { 2946 rc = BT_FAIL; 2947 goto done; 2948 } 2949 txn->root = mp->pgno; 2950 bt->meta.depth++; 2951 ki = 0; 2952 } 2953 else 2954 goto done; 2955 2956 assert(IS_LEAF(mp)); 2957 DPRINTF("there are %lu keys, should insert new key at index %i", 2958 NUMKEYS(mp), ki); 2959 2960 /* Copy the key pointer as it is modified by the prefix code. The 2961 * caller might have malloc'ed the data. 2962 */ 2963 xkey.data = key->data; 2964 xkey.size = key->size; 2965 2966 if (SIZELEFT(mp) < bt_leaf_size(bt, key, data)) { 2967 rc = btree_split(bt, &mp, &ki, &xkey, data, P_INVALID); 2968 } else { 2969 /* There is room already in this leaf page. */ 2970 remove_prefix(bt, &xkey, mp->prefix.len); 2971 rc = btree_add_node(bt, mp, ki, &xkey, data, 0, 0); 2972 } 2973 2974 if (rc != BT_SUCCESS) 2975 txn->flags |= BT_TXN_ERROR; 2976 else 2977 bt->meta.entries++; 2978 2979 done: 2980 if (close_txn) { 2981 if (rc == BT_SUCCESS) 2982 rc = btree_txn_commit(txn); 2983 else 2984 btree_txn_abort(txn); 2985 } 2986 mpage_prune(bt); 2987 return rc; 2988 } 2989 2990 static pgno_t 2991 btree_compact_tree(struct btree *bt, pgno_t pgno, struct btree *btc) 2992 { 2993 ssize_t rc; 2994 indx_t i; 2995 pgno_t *pnext, next; 2996 struct node *node; 2997 struct page *p; 2998 struct mpage *mp; 2999 3000 /* Get the page and make a copy of it. 
3001 */ 3002 if ((mp = btree_get_mpage(bt, pgno)) == NULL) 3003 return P_INVALID; 3004 if ((p = malloc(bt->head.psize)) == NULL) 3005 return P_INVALID; 3006 bcopy(mp->page, p, bt->head.psize); 3007 3008 /* Go through all nodes in the (copied) page and update the 3009 * page pointers. 3010 */ 3011 if (F_ISSET(p->flags, P_BRANCH)) { 3012 for (i = 0; i < NUMKEYSP(p); i++) { 3013 node = NODEPTRP(p, i); 3014 node->n_pgno = btree_compact_tree(bt, node->n_pgno, btc); 3015 if (node->n_pgno == P_INVALID) { 3016 free(p); 3017 return P_INVALID; 3018 } 3019 } 3020 } else if (F_ISSET(p->flags, P_LEAF)) { 3021 for (i = 0; i < NUMKEYSP(p); i++) { 3022 node = NODEPTRP(p, i); 3023 if (F_ISSET(node->flags, F_BIGDATA)) { 3024 bcopy(NODEDATA(node), &next, sizeof(next)); 3025 next = btree_compact_tree(bt, next, btc); 3026 if (next == P_INVALID) { 3027 free(p); 3028 return P_INVALID; 3029 } 3030 bcopy(&next, NODEDATA(node), sizeof(next)); 3031 } 3032 } 3033 } else if (F_ISSET(p->flags, P_OVERFLOW)) { 3034 pnext = &p->p_next_pgno; 3035 if (*pnext > 0) { 3036 *pnext = btree_compact_tree(bt, *pnext, btc); 3037 if (*pnext == P_INVALID) { 3038 free(p); 3039 return P_INVALID; 3040 } 3041 } 3042 } else 3043 assert(0); 3044 3045 pgno = p->pgno = btc->txn->next_pgno++; 3046 rc = write(btc->fd, p, bt->head.psize); 3047 free(p); 3048 if (rc != (ssize_t)bt->head.psize) 3049 return P_INVALID; 3050 mpage_prune(bt); 3051 return pgno; 3052 } 3053 3054 int 3055 btree_compact(struct btree *bt) 3056 { 3057 char *compact_path = NULL; 3058 struct btree *btc; 3059 struct btree_txn *txn, *txnc = NULL; 3060 int fd; 3061 pgno_t root; 3062 3063 assert(bt != NULL); 3064 3065 DPRINTF("compacting btree %p with path %s", bt, bt->path); 3066 3067 if (bt->path == NULL) { 3068 errno = EINVAL; 3069 return BT_FAIL; 3070 } 3071 3072 if ((txn = btree_txn_begin(bt, 0)) == NULL) 3073 return BT_FAIL; 3074 3075 asprintf(&compact_path, "%s.compact.XXXXXX", bt->path); 3076 fd = mkstemp(compact_path); 3077 if (fd == -1) { 3078 
free(compact_path); 3079 btree_txn_abort(txn); 3080 return BT_FAIL; 3081 } 3082 3083 if ((btc = btree_open_fd(fd, 0)) == NULL) 3084 goto failed; 3085 bcopy(&bt->meta, &btc->meta, sizeof(bt->meta)); 3086 btc->meta.revisions = 0; 3087 3088 if ((txnc = btree_txn_begin(btc, 0)) == NULL) 3089 goto failed; 3090 3091 if (bt->meta.root != P_INVALID) { 3092 root = btree_compact_tree(bt, bt->meta.root, btc); 3093 if (root == P_INVALID) 3094 goto failed; 3095 if (btree_write_meta(btc, root, 0) != BT_SUCCESS) 3096 goto failed; 3097 } 3098 3099 fsync(fd); 3100 3101 DPRINTF("renaming %s to %s", compact_path, bt->path); 3102 if (rename(compact_path, bt->path) != 0) 3103 goto failed; 3104 3105 /* Write a "tombstone" meta page so other processes can pick up 3106 * the change and re-open the file. 3107 */ 3108 if (btree_write_meta(bt, P_INVALID, BT_TOMBSTONE) != BT_SUCCESS) 3109 goto failed; 3110 3111 btree_txn_abort(txn); 3112 btree_txn_abort(txnc); 3113 free(compact_path); 3114 btree_close(btc); 3115 mpage_prune(bt); 3116 return 0; 3117 3118 failed: 3119 btree_txn_abort(txn); 3120 btree_txn_abort(txnc); 3121 unlink(compact_path); 3122 free(compact_path); 3123 btree_close(btc); 3124 mpage_prune(bt); 3125 return BT_FAIL; 3126 } 3127 3128 /* Reverts the last change. Truncates the file at the last root page. 
3129 */ 3130 int 3131 btree_revert(struct btree *bt) 3132 { 3133 if (btree_read_meta(bt, NULL) != 0) 3134 return -1; 3135 3136 DPRINTF("truncating file at page %u", bt->meta.root); 3137 return ftruncate(bt->fd, bt->head.psize * bt->meta.root); 3138 } 3139 3140 void 3141 btree_set_cache_size(struct btree *bt, unsigned int cache_size) 3142 { 3143 bt->stat.max_cache = cache_size; 3144 } 3145 3146 unsigned int 3147 btree_get_flags(struct btree *bt) 3148 { 3149 return (bt->flags & ~BT_FIXPADDING); 3150 } 3151 3152 const char * 3153 btree_get_path(struct btree *bt) 3154 { 3155 return bt->path; 3156 } 3157 3158 const struct btree_stat * 3159 btree_stat(struct btree *bt) 3160 { 3161 if (bt == NULL) 3162 return NULL; 3163 3164 bt->stat.branch_pages = bt->meta.branch_pages; 3165 bt->stat.leaf_pages = bt->meta.leaf_pages; 3166 bt->stat.overflow_pages = bt->meta.overflow_pages; 3167 bt->stat.revisions = bt->meta.revisions; 3168 bt->stat.depth = bt->meta.depth; 3169 bt->stat.entries = bt->meta.entries; 3170 bt->stat.psize = bt->head.psize; 3171 bt->stat.created_at = bt->meta.created_at; 3172 3173 return &bt->stat; 3174 } 3175 3176