1 /* $OpenBSD: btree.c,v 1.38 2017/05/26 21:23:14 sthen Exp $ */ 2 3 /* 4 * Copyright (c) 2009, 2010 Martin Hedenfalk <martin@bzero.se> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 19 #include <sys/types.h> 20 #include <sys/tree.h> 21 #include <sys/stat.h> 22 #include <sys/queue.h> 23 #include <sys/uio.h> 24 25 #include <assert.h> 26 #include <err.h> 27 #include <errno.h> 28 #include <fcntl.h> 29 #include <stddef.h> 30 #include <stdint.h> 31 #include <stdio.h> 32 #include <stdlib.h> 33 #include <string.h> 34 #include <time.h> 35 #include <unistd.h> 36 37 #include "btree.h" 38 39 /* #define DEBUG */ 40 41 #ifdef DEBUG 42 # define DPRINTF(...) do { fprintf(stderr, "%s:%d: ", __func__, __LINE__); \ 43 fprintf(stderr, __VA_ARGS__); \ 44 fprintf(stderr, "\n"); } while(0) 45 #else 46 # define DPRINTF(...) 47 #endif 48 49 #define PAGESIZE 4096 50 #define BT_MINKEYS 4 51 #define BT_MAGIC 0xB3DBB3DB 52 #define BT_VERSION 4 53 #define MAXKEYSIZE 255 54 55 #define P_INVALID 0xFFFFFFFF 56 57 #define F_ISSET(w, f) (((w) & (f)) == (f)) 58 #define MINIMUM(a,b) ((a) < (b) ? (a) : (b)) 59 60 typedef uint32_t pgno_t; 61 typedef uint16_t indx_t; 62 63 /* There are four page types: meta, index, leaf and overflow. 64 * They all share the same page header. 65 */ 66 struct page { /* represents an on-disk page */ 67 pgno_t pgno; /* page number */ 68 #define P_BRANCH 0x01 /* branch page */ 69 #define P_LEAF 0x02 /* leaf page */ 70 #define P_OVERFLOW 0x04 /* overflow page */ 71 #define P_META 0x08 /* meta page */ 72 #define P_HEAD 0x10 /* header page */ 73 uint32_t flags; 74 #define lower b.fb.fb_lower 75 #define upper b.fb.fb_upper 76 #define p_next_pgno b.pb_next_pgno 77 union page_bounds { 78 struct { 79 indx_t fb_lower; /* lower bound of free space */ 80 indx_t fb_upper; /* upper bound of free space */ 81 } fb; 82 pgno_t pb_next_pgno; /* overflow page linked list */ 83 } b; 84 indx_t ptrs[1]; /* dynamic size */ 85 } __packed; 86 87 #define PAGEHDRSZ offsetof(struct page, ptrs) 88 89 #define NUMKEYSP(p) (((p)->lower - PAGEHDRSZ) >> 1) 90 #define NUMKEYS(mp) (((mp)->page->lower - PAGEHDRSZ) >> 1) 91 #define SIZELEFT(mp) (indx_t)((mp)->page->upper - (mp)->page->lower) 92 #define PAGEFILL(bt, mp) (1000 * ((bt)->head.psize - PAGEHDRSZ - SIZELEFT(mp)) / \ 93 ((bt)->head.psize - PAGEHDRSZ)) 94 #define IS_LEAF(mp) F_ISSET((mp)->page->flags, P_LEAF) 95 #define IS_BRANCH(mp) F_ISSET((mp)->page->flags, P_BRANCH) 96 #define IS_OVERFLOW(mp) F_ISSET((mp)->page->flags, P_OVERFLOW) 97 98 struct bt_head { /* header page content */ 99 uint32_t magic; 100 uint32_t version; 101 uint32_t flags; 102 uint32_t psize; /* page size */ 103 } __packed; 104 105 struct bt_meta { /* meta (footer) page content */ 106 #define BT_TOMBSTONE 0x01 /* file is replaced */ 107 uint32_t flags; 108 pgno_t root; /* page number of root page */ 109 pgno_t prev_meta; /* previous meta page number */ 110 time_t created_at; 111 uint32_t branch_pages; 112 uint32_t leaf_pages; 113 uint32_t overflow_pages; 114 uint32_t revisions; 115 uint32_t depth; 116 uint64_t entries; 117 unsigned char hash[SHA_DIGEST_LENGTH]; 118 } __packed; 119 120 struct btkey { 121 size_t len; 122 char str[MAXKEYSIZE]; 123 }; 124 125 struct mpage { /* an in-memory cached page */ 126 RB_ENTRY(mpage) entry; /* page cache entry */ 127 SIMPLEQ_ENTRY(mpage) next; /* queue of dirty pages */ 128 TAILQ_ENTRY(mpage) lru_next; /* LRU queue */ 129 struct mpage *parent; /* NULL if root */ 130 unsigned int parent_index; /* keep track of node index */ 131 struct btkey prefix; 132 struct page *page; 133 pgno_t pgno; /* copy of page->pgno */ 134 short ref; /* increased by cursors */ 135 short dirty; /* 1 if on dirty queue */ 136 }; 137 RB_HEAD(page_cache, mpage); 138 SIMPLEQ_HEAD(dirty_queue, mpage); 139 TAILQ_HEAD(lru_queue, mpage); 140 141 static int mpage_cmp(struct mpage *a, struct mpage *b); 142 static struct mpage *mpage_lookup(struct btree *bt, pgno_t pgno); 143 static void mpage_add(struct btree *bt, struct mpage *mp); 144 static void mpage_free(struct mpage *mp); 145 static void mpage_del(struct btree *bt, struct mpage *mp); 146 static void mpage_flush(struct btree *bt); 147 static struct mpage *mpage_copy(struct btree *bt, struct mpage *mp); 148 static void mpage_prune(struct btree *bt); 149 static void mpage_dirty(struct btree *bt, struct mpage *mp); 150 static struct mpage *mpage_touch(struct btree *bt, struct mpage *mp); 151 152 RB_PROTOTYPE(page_cache, mpage, entry, mpage_cmp); 153 RB_GENERATE(page_cache, mpage, entry, mpage_cmp); 154 155 struct ppage { /* ordered list of pages */ 156 SLIST_ENTRY(ppage) entry; 157 struct mpage *mpage; 158 unsigned int ki; /* cursor index on page */ 159 }; 160 SLIST_HEAD(page_stack, ppage); 161 162 #define CURSOR_EMPTY(c) SLIST_EMPTY(&(c)->stack) 163 #define CURSOR_TOP(c) SLIST_FIRST(&(c)->stack) 164 #define CURSOR_POP(c) SLIST_REMOVE_HEAD(&(c)->stack, entry) 165 #define CURSOR_PUSH(c,p) SLIST_INSERT_HEAD(&(c)->stack, p, entry) 166 167 struct cursor { 168 struct btree *bt; 169 struct btree_txn *txn; 170 struct page_stack stack; /* stack of parent pages */ 171 short initialized; /* 1 if initialized */ 172 short eof; /* 1 if end is reached */ 173 }; 174 175 #define METAHASHLEN offsetof(struct bt_meta, hash) 176 #define METADATA(p) ((void *)((char *)p + PAGEHDRSZ)) 177 178 struct node { 179 #define n_pgno p.np_pgno 180 #define n_dsize p.np_dsize 181 union { 182 pgno_t np_pgno; /* child page number */ 183 uint32_t np_dsize; /* leaf data size */ 184 } p; 185 uint16_t ksize; /* key size */ 186 #define F_BIGDATA 0x01 /* data put on overflow page */ 187 uint8_t flags; 188 char data[1]; 189 } __packed; 190 191 struct btree_txn { 192 pgno_t root; /* current / new root page */ 193 pgno_t next_pgno; /* next unallocated page */ 194 struct btree *bt; /* btree is ref'd */ 195 struct dirty_queue *dirty_queue; /* modified pages */ 196 #define BT_TXN_RDONLY 0x01 /* read-only transaction */ 197 #define BT_TXN_ERROR 0x02 /* an error has occurred */ 198 unsigned int flags; 199 }; 200 201 struct btree { 202 int fd; 203 char *path; 204 #define BT_FIXPADDING 0x01 /* internal */ 205 unsigned int flags; 206 bt_cmp_func cmp; /* user compare function */ 207 struct bt_head head; 208 struct bt_meta meta; 209 struct page_cache *page_cache; 210 struct lru_queue *lru_queue; 211 struct btree_txn *txn; /* current write transaction */ 212 int ref; /* increased by cursors & txn */ 213 struct btree_stat stat; 214 off_t size; /* current file size */ 215 }; 216 217 #define NODESIZE offsetof(struct node, data) 218 219 #define INDXSIZE(k) (NODESIZE + ((k) == NULL ? 0 : (k)->size)) 220 #define LEAFSIZE(k, d) (NODESIZE + (k)->size + (d)->size) 221 #define NODEPTRP(p, i) ((struct node *)((char *)(p) + (p)->ptrs[i])) 222 #define NODEPTR(mp, i) NODEPTRP((mp)->page, i) 223 #define NODEKEY(node) (void *)((node)->data) 224 #define NODEDATA(node) (void *)((char *)(node)->data + (node)->ksize) 225 #define NODEPGNO(node) ((node)->p.np_pgno) 226 #define NODEDSZ(node) ((node)->p.np_dsize) 227 228 #define BT_COMMIT_PAGES 64 /* max number of pages to write in one commit */ 229 #define BT_MAXCACHE_DEF 1024 /* max number of pages to keep in cache */ 230 231 static int btree_read_page(struct btree *bt, pgno_t pgno, 232 struct page *page); 233 static struct mpage *btree_get_mpage(struct btree *bt, pgno_t pgno); 234 static int btree_search_page_root(struct btree *bt, 235 struct mpage *root, struct btval *key, 236 struct cursor *cursor, int modify, 237 struct mpage **mpp); 238 static int btree_search_page(struct btree *bt, 239 struct btree_txn *txn, struct btval *key, 240 struct cursor *cursor, int modify, 241 struct mpage **mpp); 242 243 static int btree_write_header(struct btree *bt, int fd); 244 static int btree_read_header(struct btree *bt); 245 static int btree_is_meta_page(struct page *p); 246 static int btree_read_meta(struct btree *bt, pgno_t *p_next); 247 static int btree_write_meta(struct btree *bt, pgno_t root, 248 unsigned int flags); 249 static void btree_ref(struct btree *bt); 250 251 static struct node *btree_search_node(struct btree *bt, struct mpage *mp, 252 struct btval *key, int *exactp, unsigned int *kip); 253 static int btree_add_node(struct btree *bt, struct mpage *mp, 254 indx_t indx, struct btval *key, struct btval *data, 255 pgno_t pgno, uint8_t flags); 256 static void btree_del_node(struct btree *bt, struct mpage *mp, 257 indx_t indx); 258 static int btree_read_data(struct btree *bt, struct mpage *mp, 259 struct node *leaf, struct btval *data); 260 261 static int btree_rebalance(struct btree *bt, struct mpage *mp); 262 static int btree_update_key(struct btree *bt, struct mpage *mp, 263 indx_t indx, struct btval *key); 264 static int btree_adjust_prefix(struct btree *bt, 265 struct mpage *src, int delta); 266 static int btree_move_node(struct btree *bt, struct mpage *src, 267 indx_t srcindx, struct mpage *dst, indx_t dstindx); 268 static int btree_merge(struct btree *bt, struct mpage *src, 269 struct mpage *dst); 270 static int btree_split(struct btree *bt, struct mpage **mpp, 271 unsigned int *newindxp, struct btval *newkey, 272 struct btval *newdata, pgno_t newpgno); 273 static struct mpage *btree_new_page(struct btree *bt, uint32_t flags); 274 static int btree_write_overflow_data(struct btree *bt, 275 struct page *p, struct btval *data); 276 277 static void cursor_pop_page(struct cursor *cursor); 278 static struct ppage *cursor_push_page(struct cursor *cursor, 279 struct mpage *mp); 280 281 static int bt_set_key(struct btree *bt, struct mpage *mp, 282 struct node *node, struct btval *key); 283 static int btree_sibling(struct cursor *cursor, int move_right); 284 static int btree_cursor_next(struct cursor *cursor, 285 struct btval *key, struct btval *data); 286 static int btree_cursor_set(struct cursor *cursor, 287 struct btval *key, struct btval *data, int *exactp); 288 static int btree_cursor_first(struct cursor *cursor, 289 struct btval *key, struct btval *data); 290 291 static void bt_reduce_separator(struct btree *bt, struct node *min, 292 struct btval *sep); 293 static void remove_prefix(struct btree *bt, struct btval *key, 294 size_t pfxlen); 295 static void expand_prefix(struct btree *bt, struct mpage *mp, 296 indx_t indx, struct btkey *expkey); 297 static void concat_prefix(struct btree *bt, char *s1, size_t n1, 298 char *s2, size_t n2, char *cs, size_t *cn); 299 static void common_prefix(struct btree *bt, struct btkey *min, 300 struct btkey *max, struct btkey *pfx); 301 static void find_common_prefix(struct btree *bt, struct mpage *mp); 302 303 static size_t bt_leaf_size(struct btree *bt, struct btval *key, 304 struct btval *data); 305 static size_t bt_branch_size(struct btree *bt, struct btval *key); 306 307 static pgno_t btree_compact_tree(struct btree *bt, pgno_t pgno, 308 struct btree *btc); 309 310 static int memncmp(const void *s1, size_t n1, 311 const void *s2, size_t n2); 312 static int memnrcmp(const void *s1, size_t n1, 313 const void *s2, size_t n2); 314 315 static int 316 memncmp(const void *s1, size_t n1, const void *s2, size_t n2) 317 { 318 if (n1 < n2) { 319 if (memcmp(s1, s2, n1) == 0) 320 return -1; 321 } 322 else if (n1 > n2) { 323 if (memcmp(s1, s2, n2) == 0) 324 return 1; 325 } 326 return memcmp(s1, s2, n1); 327 } 328 329 static int 330 memnrcmp(const void *s1, size_t n1, const void *s2, size_t n2) 331 { 332 const unsigned char *p1; 333 const unsigned char *p2; 334 335 if (n1 == 0) 336 return n2 == 0 ? 0 : -1; 337 338 if (n2 == 0) 339 return n1 == 0 ? 0 : 1; 340 341 p1 = (const unsigned char *)s1 + n1 - 1; 342 p2 = (const unsigned char *)s2 + n2 - 1; 343 344 while (*p1 == *p2) { 345 if (p1 == s1) 346 return (p2 == s2) ? 0 : -1; 347 if (p2 == s2) 348 return (p1 == p2) ? 0 : 1; 349 p1--; 350 p2--; 351 } 352 return *p1 - *p2; 353 } 354 355 int 356 btree_cmp(struct btree *bt, const struct btval *a, const struct btval *b) 357 { 358 return bt->cmp(a, b); 359 } 360 361 static void 362 common_prefix(struct btree *bt, struct btkey *min, struct btkey *max, 363 struct btkey *pfx) 364 { 365 size_t n = 0; 366 char *p1; 367 char *p2; 368 369 if (min->len == 0 || max->len == 0) { 370 pfx->len = 0; 371 return; 372 } 373 374 if (F_ISSET(bt->flags, BT_REVERSEKEY)) { 375 p1 = min->str + min->len - 1; 376 p2 = max->str + max->len - 1; 377 378 while (*p1 == *p2) { 379 if (p1 < min->str || p2 < max->str) 380 break; 381 p1--; 382 p2--; 383 n++; 384 } 385 386 assert(n <= (int)sizeof(pfx->str)); 387 pfx->len = n; 388 bcopy(p2 + 1, pfx->str, n); 389 } else { 390 p1 = min->str; 391 p2 = max->str; 392 393 while (*p1 == *p2) { 394 if (n == min->len || n == max->len) 395 break; 396 p1++; 397 p2++; 398 n++; 399 } 400 401 assert(n <= (int)sizeof(pfx->str)); 402 pfx->len = n; 403 bcopy(max->str, pfx->str, n); 404 } 405 } 406 407 static void 408 remove_prefix(struct btree *bt, struct btval *key, size_t pfxlen) 409 { 410 if (pfxlen == 0 || bt->cmp != NULL) 411 return; 412 413 DPRINTF("removing %zu bytes of prefix from key [%.*s]", pfxlen, 414 (int)key->size, (char *)key->data); 415 assert(pfxlen <= key->size); 416 key->size -= pfxlen; 417 if (!F_ISSET(bt->flags, BT_REVERSEKEY)) 418 key->data = (char *)key->data + pfxlen; 419 } 420 421 static void 422 expand_prefix(struct btree *bt, struct mpage *mp, indx_t indx, 423 struct btkey *expkey) 424 { 425 struct node *node; 426 427 node = NODEPTR(mp, indx); 428 expkey->len = sizeof(expkey->str); 429 concat_prefix(bt, mp->prefix.str, mp->prefix.len, 430 NODEKEY(node), node->ksize, expkey->str, &expkey->len); 431 } 432 433 static int 434 bt_cmp(struct btree *bt, const struct btval *key1, const struct btval *key2, 435 struct btkey *pfx) 436 { 437 if (F_ISSET(bt->flags, BT_REVERSEKEY)) 438 return memnrcmp(key1->data, key1->size - pfx->len, 439 key2->data, key2->size); 440 else 441 return memncmp((char *)key1->data + pfx->len, key1->size - pfx->len, 442 key2->data, key2->size); 443 } 444 445 void 446 btval_reset(struct btval *btv) 447 { 448 if (btv) { 449 if (btv->mp) 450 btv->mp->ref--; 451 if (btv->free_data) 452 free(btv->data); 453 memset(btv, 0, sizeof(*btv)); 454 } 455 } 456 457 static int 458 mpage_cmp(struct mpage *a, struct mpage *b) 459 { 460 if (a->pgno > b->pgno) 461 return 1; 462 if (a->pgno < b->pgno) 463 return -1; 464 return 0; 465 } 466 467 static struct mpage * 468 mpage_lookup(struct btree *bt, pgno_t pgno) 469 { 470 struct mpage find, *mp; 471 472 find.pgno = pgno; 473 mp = RB_FIND(page_cache, bt->page_cache, &find); 474 if (mp) { 475 bt->stat.hits++; 476 /* Update LRU queue. Move page to the end. */ 477 TAILQ_REMOVE(bt->lru_queue, mp, lru_next); 478 TAILQ_INSERT_TAIL(bt->lru_queue, mp, lru_next); 479 } 480 return mp; 481 } 482 483 static void 484 mpage_add(struct btree *bt, struct mpage *mp) 485 { 486 assert(RB_INSERT(page_cache, bt->page_cache, mp) == NULL); 487 bt->stat.cache_size++; 488 TAILQ_INSERT_TAIL(bt->lru_queue, mp, lru_next); 489 } 490 491 static void 492 mpage_free(struct mpage *mp) 493 { 494 if (mp != NULL) { 495 free(mp->page); 496 free(mp); 497 } 498 } 499 500 static void 501 mpage_del(struct btree *bt, struct mpage *mp) 502 { 503 assert(RB_REMOVE(page_cache, bt->page_cache, mp) == mp); 504 assert(bt->stat.cache_size > 0); 505 bt->stat.cache_size--; 506 TAILQ_REMOVE(bt->lru_queue, mp, lru_next); 507 } 508 509 static void 510 mpage_flush(struct btree *bt) 511 { 512 struct mpage *mp; 513 514 while ((mp = RB_MIN(page_cache, bt->page_cache)) != NULL) { 515 mpage_del(bt, mp); 516 mpage_free(mp); 517 } 518 } 519 520 static struct mpage * 521 mpage_copy(struct btree *bt, struct mpage *mp) 522 { 523 struct mpage *copy; 524 525 if ((copy = calloc(1, sizeof(*copy))) == NULL) 526 return NULL; 527 if ((copy->page = malloc(bt->head.psize)) == NULL) { 528 free(copy); 529 return NULL; 530 } 531 bcopy(mp->page, copy->page, bt->head.psize); 532 bcopy(&mp->prefix, ©->prefix, sizeof(mp->prefix)); 533 copy->parent = mp->parent; 534 copy->parent_index = mp->parent_index; 535 copy->pgno = mp->pgno; 536 537 return copy; 538 } 539 540 /* Remove the least recently used memory pages until the cache size is 541 * within the configured bounds. Pages referenced by cursors or returned 542 * key/data are not pruned. 543 */ 544 static void 545 mpage_prune(struct btree *bt) 546 { 547 struct mpage *mp, *next; 548 549 for (mp = TAILQ_FIRST(bt->lru_queue); mp; mp = next) { 550 if (bt->stat.cache_size <= bt->stat.max_cache) 551 break; 552 next = TAILQ_NEXT(mp, lru_next); 553 if (!mp->dirty && mp->ref <= 0) { 554 mpage_del(bt, mp); 555 mpage_free(mp); 556 } 557 } 558 } 559 560 /* Mark a page as dirty and push it on the dirty queue. 561 */ 562 static void 563 mpage_dirty(struct btree *bt, struct mpage *mp) 564 { 565 assert(bt != NULL); 566 assert(bt->txn != NULL); 567 568 if (!mp->dirty) { 569 mp->dirty = 1; 570 SIMPLEQ_INSERT_TAIL(bt->txn->dirty_queue, mp, next); 571 } 572 } 573 574 /* Touch a page: make it dirty and re-insert into tree with updated pgno. 575 */ 576 static struct mpage * 577 mpage_touch(struct btree *bt, struct mpage *mp) 578 { 579 assert(bt != NULL); 580 assert(bt->txn != NULL); 581 assert(mp != NULL); 582 583 if (!mp->dirty) { 584 DPRINTF("touching page %u -> %u", mp->pgno, bt->txn->next_pgno); 585 if (mp->ref == 0) 586 mpage_del(bt, mp); 587 else { 588 if ((mp = mpage_copy(bt, mp)) == NULL) 589 return NULL; 590 } 591 mp->pgno = mp->page->pgno = bt->txn->next_pgno++; 592 mpage_dirty(bt, mp); 593 mpage_add(bt, mp); 594 595 /* Update the page number to new touched page. */ 596 if (mp->parent != NULL) 597 NODEPGNO(NODEPTR(mp->parent, 598 mp->parent_index)) = mp->pgno; 599 } 600 601 return mp; 602 } 603 604 static int 605 btree_read_page(struct btree *bt, pgno_t pgno, struct page *page) 606 { 607 ssize_t rc; 608 609 DPRINTF("reading page %u", pgno); 610 bt->stat.reads++; 611 if ((rc = pread(bt->fd, page, bt->head.psize, (off_t)pgno*bt->head.psize)) == 0) { 612 DPRINTF("page %u doesn't exist", pgno); 613 errno = ENOENT; 614 return BT_FAIL; 615 } else if (rc != (ssize_t)bt->head.psize) { 616 if (rc > 0) 617 errno = EINVAL; 618 DPRINTF("read: %s", strerror(errno)); 619 return BT_FAIL; 620 } 621 622 if (page->pgno != pgno) { 623 DPRINTF("page numbers don't match: %u != %u", pgno, page->pgno); 624 errno = EINVAL; 625 return BT_FAIL; 626 } 627 628 DPRINTF("page %u has flags 0x%X", pgno, page->flags); 629 630 return BT_SUCCESS; 631 } 632 633 int 634 btree_sync(struct btree *bt) 635 { 636 if (!F_ISSET(bt->flags, BT_NOSYNC)) 637 return fsync(bt->fd); 638 return 0; 639 } 640 641 struct btree_txn * 642 btree_txn_begin(struct btree *bt, int rdonly) 643 { 644 struct btree_txn *txn; 645 646 if (!rdonly && bt->txn != NULL) { 647 DPRINTF("write transaction already begun"); 648 errno = EBUSY; 649 return NULL; 650 } 651 652 if ((txn = calloc(1, sizeof(*txn))) == NULL) { 653 DPRINTF("calloc: %s", strerror(errno)); 654 return NULL; 655 } 656 657 if (rdonly) { 658 txn->flags |= BT_TXN_RDONLY; 659 } else { 660 txn->dirty_queue = calloc(1, sizeof(*txn->dirty_queue)); 661 if (txn->dirty_queue == NULL) { 662 free(txn); 663 return NULL; 664 } 665 SIMPLEQ_INIT(txn->dirty_queue); 666 667 DPRINTF("taking write lock on txn %p", txn); 668 if (flock(bt->fd, LOCK_EX | LOCK_NB) != 0) { 669 DPRINTF("flock: %s", strerror(errno)); 670 errno = EBUSY; 671 free(txn->dirty_queue); 672 free(txn); 673 return NULL; 674 } 675 bt->txn = txn; 676 } 677 678 txn->bt = bt; 679 btree_ref(bt); 680 681 if (btree_read_meta(bt, &txn->next_pgno) != BT_SUCCESS) { 682 btree_txn_abort(txn); 683 return NULL; 684 } 685 686 txn->root = bt->meta.root; 687 DPRINTF("begin transaction on btree %p, root page %u", bt, txn->root); 688 689 return txn; 690 } 691 692 void 693 btree_txn_abort(struct btree_txn *txn) 694 { 695 struct mpage *mp; 696 struct btree *bt; 697 698 if (txn == NULL) 699 return; 700 701 bt = txn->bt; 702 DPRINTF("abort transaction on btree %p, root page %u", bt, txn->root); 703 704 if (!F_ISSET(txn->flags, BT_TXN_RDONLY)) { 705 /* Discard all dirty pages. 706 */ 707 while (!SIMPLEQ_EMPTY(txn->dirty_queue)) { 708 mp = SIMPLEQ_FIRST(txn->dirty_queue); 709 assert(mp->ref == 0); /* cursors should be closed */ 710 mpage_del(bt, mp); 711 SIMPLEQ_REMOVE_HEAD(txn->dirty_queue, next); 712 mpage_free(mp); 713 } 714 715 DPRINTF("releasing write lock on txn %p", txn); 716 txn->bt->txn = NULL; 717 if (flock(txn->bt->fd, LOCK_UN) != 0) { 718 DPRINTF("failed to unlock fd %d: %s", 719 txn->bt->fd, strerror(errno)); 720 } 721 free(txn->dirty_queue); 722 } 723 724 btree_close(txn->bt); 725 free(txn); 726 } 727 728 int 729 btree_txn_commit(struct btree_txn *txn) 730 { 731 int n, done; 732 ssize_t rc; 733 off_t size; 734 struct mpage *mp; 735 struct btree *bt; 736 struct iovec iov[BT_COMMIT_PAGES]; 737 738 assert(txn != NULL); 739 assert(txn->bt != NULL); 740 741 bt = txn->bt; 742 743 if (F_ISSET(txn->flags, BT_TXN_RDONLY)) { 744 DPRINTF("attempt to commit read-only transaction"); 745 btree_txn_abort(txn); 746 errno = EPERM; 747 return BT_FAIL; 748 } 749 750 if (txn != bt->txn) { 751 DPRINTF("attempt to commit unknown transaction"); 752 btree_txn_abort(txn); 753 errno = EINVAL; 754 return BT_FAIL; 755 } 756 757 if (F_ISSET(txn->flags, BT_TXN_ERROR)) { 758 DPRINTF("error flag is set, can't commit"); 759 btree_txn_abort(txn); 760 errno = EINVAL; 761 return BT_FAIL; 762 } 763 764 if (SIMPLEQ_EMPTY(txn->dirty_queue)) 765 goto done; 766 767 if (F_ISSET(bt->flags, BT_FIXPADDING)) { 768 size = lseek(bt->fd, 0, SEEK_END); 769 size += bt->head.psize - (size % bt->head.psize); 770 DPRINTF("extending to multiple of page size: %llu", size); 771 if (ftruncate(bt->fd, size) != 0) { 772 DPRINTF("ftruncate: %s", strerror(errno)); 773 btree_txn_abort(txn); 774 return BT_FAIL; 775 } 776 bt->flags &= ~BT_FIXPADDING; 777 } 778 779 DPRINTF("committing transaction on btree %p, root page %u", 780 bt, txn->root); 781 782 /* Commit up to BT_COMMIT_PAGES dirty pages to disk until done. 783 */ 784 do { 785 n = 0; 786 done = 1; 787 SIMPLEQ_FOREACH(mp, txn->dirty_queue, next) { 788 DPRINTF("committing page %u", mp->pgno); 789 iov[n].iov_len = bt->head.psize; 790 iov[n].iov_base = mp->page; 791 if (++n >= BT_COMMIT_PAGES) { 792 done = 0; 793 break; 794 } 795 } 796 797 if (n == 0) 798 break; 799 800 DPRINTF("committing %u dirty pages", n); 801 rc = writev(bt->fd, iov, n); 802 if (rc != (ssize_t)bt->head.psize*n) { 803 if (rc > 0) 804 DPRINTF("short write, filesystem full?"); 805 else 806 DPRINTF("writev: %s", strerror(errno)); 807 btree_txn_abort(txn); 808 return BT_FAIL; 809 } 810 811 /* Remove the dirty flag from the written pages. 812 */ 813 while (!SIMPLEQ_EMPTY(txn->dirty_queue)) { 814 mp = SIMPLEQ_FIRST(txn->dirty_queue); 815 mp->dirty = 0; 816 SIMPLEQ_REMOVE_HEAD(txn->dirty_queue, next); 817 if (--n == 0) 818 break; 819 } 820 } while (!done); 821 822 if (btree_sync(bt) != 0 || 823 btree_write_meta(bt, txn->root, 0) != BT_SUCCESS || 824 btree_sync(bt) != 0) { 825 btree_txn_abort(txn); 826 return BT_FAIL; 827 } 828 829 done: 830 mpage_prune(bt); 831 btree_txn_abort(txn); 832 833 return BT_SUCCESS; 834 } 835 836 static int 837 btree_write_header(struct btree *bt, int fd) 838 { 839 struct stat sb; 840 struct bt_head *h; 841 struct page *p; 842 ssize_t rc; 843 unsigned int psize; 844 845 DPRINTF("writing header page"); 846 assert(bt != NULL); 847 848 /* 849 * Ask stat for 'optimal blocksize for I/O', but cap to fit in indx_t. 850 */ 851 if (fstat(fd, &sb) == 0) 852 psize = MINIMUM(32*1024, sb.st_blksize); 853 else 854 psize = PAGESIZE; 855 856 if ((p = calloc(1, psize)) == NULL) 857 return -1; 858 p->flags = P_HEAD; 859 860 h = METADATA(p); 861 h->magic = BT_MAGIC; 862 h->version = BT_VERSION; 863 h->psize = psize; 864 bcopy(h, &bt->head, sizeof(*h)); 865 866 rc = write(fd, p, bt->head.psize); 867 free(p); 868 if (rc != (ssize_t)bt->head.psize) { 869 if (rc > 0) 870 DPRINTF("short write, filesystem full?"); 871 return BT_FAIL; 872 } 873 874 return BT_SUCCESS; 875 } 876 877 static int 878 btree_read_header(struct btree *bt) 879 { 880 char page[PAGESIZE]; 881 struct page *p; 882 struct bt_head *h; 883 int rc; 884 885 assert(bt != NULL); 886 887 /* We don't know the page size yet, so use a minimum value. 888 */ 889 890 if ((rc = pread(bt->fd, page, PAGESIZE, 0)) == 0) { 891 errno = ENOENT; 892 return -1; 893 } else if (rc != PAGESIZE) { 894 if (rc > 0) 895 errno = EINVAL; 896 DPRINTF("read: %s", strerror(errno)); 897 return -1; 898 } 899 900 p = (struct page *)page; 901 902 if (!F_ISSET(p->flags, P_HEAD)) { 903 DPRINTF("page %d not a header page", p->pgno); 904 errno = EINVAL; 905 return -1; 906 } 907 908 h = METADATA(p); 909 if (h->magic != BT_MAGIC) { 910 DPRINTF("header has invalid magic"); 911 errno = EINVAL; 912 return -1; 913 } 914 915 if (h->version != BT_VERSION) { 916 DPRINTF("database is version %u, expected version %u", 917 bt->head.version, BT_VERSION); 918 errno = EINVAL; 919 return -1; 920 } 921 922 bcopy(h, &bt->head, sizeof(*h)); 923 return 0; 924 } 925 926 static int 927 btree_write_meta(struct btree *bt, pgno_t root, unsigned int flags) 928 { 929 struct mpage *mp; 930 struct bt_meta *meta; 931 ssize_t rc; 932 933 DPRINTF("writing meta page for root page %u", root); 934 935 assert(bt != NULL); 936 assert(bt->txn != NULL); 937 938 if ((mp = btree_new_page(bt, P_META)) == NULL) 939 return -1; 940 941 bt->meta.prev_meta = bt->meta.root; 942 bt->meta.root = root; 943 bt->meta.flags = flags; 944 bt->meta.created_at = time(0); 945 bt->meta.revisions++; 946 SHA1((unsigned char *)&bt->meta, METAHASHLEN, bt->meta.hash); 947 948 /* Copy the meta data changes to the new meta page. */ 949 meta = METADATA(mp->page); 950 bcopy(&bt->meta, meta, sizeof(*meta)); 951 952 rc = write(bt->fd, mp->page, bt->head.psize); 953 mp->dirty = 0; 954 SIMPLEQ_REMOVE_HEAD(bt->txn->dirty_queue, next); 955 if (rc != (ssize_t)bt->head.psize) { 956 if (rc > 0) 957 DPRINTF("short write, filesystem full?"); 958 return BT_FAIL; 959 } 960 961 if ((bt->size = lseek(bt->fd, 0, SEEK_END)) == -1) { 962 DPRINTF("failed to update file size: %s", strerror(errno)); 963 bt->size = 0; 964 } 965 966 return BT_SUCCESS; 967 } 968 969 /* Returns true if page p is a valid meta page, false otherwise. 970 */ 971 static int 972 btree_is_meta_page(struct page *p) 973 { 974 struct bt_meta *m; 975 unsigned char hash[SHA_DIGEST_LENGTH]; 976 977 m = METADATA(p); 978 if (!F_ISSET(p->flags, P_META)) { 979 DPRINTF("page %d not a meta page", p->pgno); 980 errno = EINVAL; 981 return 0; 982 } 983 984 if (m->root >= p->pgno && m->root != P_INVALID) { 985 DPRINTF("page %d points to an invalid root page", p->pgno); 986 errno = EINVAL; 987 return 0; 988 } 989 990 SHA1((unsigned char *)m, METAHASHLEN, hash); 991 if (bcmp(hash, m->hash, SHA_DIGEST_LENGTH) != 0) { 992 DPRINTF("page %d has an invalid digest", p->pgno); 993 errno = EINVAL; 994 return 0; 995 } 996 997 return 1; 998 } 999 1000 static int 1001 btree_read_meta(struct btree *bt, pgno_t *p_next) 1002 { 1003 struct mpage *mp; 1004 struct bt_meta *meta; 1005 pgno_t meta_pgno, next_pgno; 1006 off_t size; 1007 1008 assert(bt != NULL); 1009 1010 if ((size = lseek(bt->fd, 0, SEEK_END)) == -1) 1011 goto fail; 1012 1013 DPRINTF("btree_read_meta: size = %llu", size); 1014 1015 if (size < bt->size) { 1016 DPRINTF("file has shrunk!"); 1017 errno = EIO; 1018 goto fail; 1019 } 1020 1021 if (size == bt->head.psize) { /* there is only the header */ 1022 if (p_next != NULL) 1023 *p_next = 1; 1024 return BT_SUCCESS; /* new file */ 1025 } 1026 1027 next_pgno = size / bt->head.psize; 1028 if (next_pgno == 0) { 1029 DPRINTF("corrupt file"); 1030 errno = EIO; 1031 goto fail; 1032 } 1033 1034 meta_pgno = next_pgno - 1; 1035 1036 if (size % bt->head.psize != 0) { 1037 DPRINTF("filesize not a multiple of the page size!"); 1038 bt->flags |= BT_FIXPADDING; 1039 next_pgno++; 1040 } 1041 1042 if (p_next != NULL) 1043 *p_next = next_pgno; 1044 1045 if (size == bt->size) { 1046 DPRINTF("size unchanged, keeping current meta page"); 1047 if (F_ISSET(bt->meta.flags, BT_TOMBSTONE)) { 1048 DPRINTF("file is dead"); 1049 errno = ESTALE; 1050 return BT_FAIL; 1051 } else 1052 return BT_SUCCESS; 1053 } 1054 bt->size = size; 1055 1056 while (meta_pgno > 0) { 1057 if ((mp = btree_get_mpage(bt, meta_pgno)) == NULL) 1058 break; 1059 if (btree_is_meta_page(mp->page)) { 1060 meta = METADATA(mp->page); 1061 DPRINTF("flags = 0x%x", meta->flags); 1062 if (F_ISSET(meta->flags, BT_TOMBSTONE)) { 1063 DPRINTF("file is dead"); 1064 errno = ESTALE; 1065 return BT_FAIL; 1066 } else { 1067 /* Make copy of last meta page. */ 1068 bcopy(meta, &bt->meta, sizeof(bt->meta)); 1069 return BT_SUCCESS; 1070 } 1071 } 1072 --meta_pgno; /* scan backwards to first valid meta page */ 1073 } 1074 1075 errno = EIO; 1076 fail: 1077 if (p_next != NULL) 1078 *p_next = P_INVALID; 1079 return BT_FAIL; 1080 } 1081 1082 struct btree * 1083 btree_open_fd(int fd, unsigned int flags) 1084 { 1085 struct btree *bt; 1086 int fl; 1087 1088 fl = fcntl(fd, F_GETFL); 1089 if (fcntl(fd, F_SETFL, fl | O_APPEND) == -1) 1090 return NULL; 1091 1092 if ((bt = calloc(1, sizeof(*bt))) == NULL) 1093 return NULL; 1094 bt->fd = fd; 1095 bt->flags = flags; 1096 bt->flags &= ~BT_FIXPADDING; 1097 bt->ref = 1; 1098 bt->meta.root = P_INVALID; 1099 1100 if ((bt->page_cache = calloc(1, sizeof(*bt->page_cache))) == NULL) 1101 goto fail; 1102 bt->stat.max_cache = BT_MAXCACHE_DEF; 1103 RB_INIT(bt->page_cache); 1104 1105 if ((bt->lru_queue = calloc(1, sizeof(*bt->lru_queue))) == NULL) 1106 goto fail; 1107 TAILQ_INIT(bt->lru_queue); 1108 1109 if (btree_read_header(bt) != 0) { 1110 if (errno != ENOENT) 1111 goto fail; 1112 DPRINTF("new database"); 1113 btree_write_header(bt, bt->fd); 1114 } 1115 1116 if (btree_read_meta(bt, NULL) != 0) 1117 goto fail; 1118 1119 DPRINTF("opened database version %u, pagesize %u", 1120 bt->head.version, bt->head.psize); 1121 DPRINTF("timestamp: %s", ctime(&bt->meta.created_at)); 1122 DPRINTF("depth: %u", bt->meta.depth); 1123 DPRINTF("entries: %llu", bt->meta.entries); 1124 DPRINTF("revisions: %u", bt->meta.revisions); 1125 DPRINTF("branch pages: %u", bt->meta.branch_pages); 1126 DPRINTF("leaf pages: %u", bt->meta.leaf_pages); 1127 DPRINTF("overflow pages: %u", bt->meta.overflow_pages); 1128 DPRINTF("root: %u", bt->meta.root); 1129 DPRINTF("previous meta page: %u", bt->meta.prev_meta); 1130 1131 return bt; 1132 1133 fail: 1134 free(bt->lru_queue); 1135 free(bt->page_cache); 1136 free(bt); 1137 return NULL; 1138 } 1139 1140 struct btree * 1141 btree_open(const char *path, unsigned int flags, mode_t mode) 1142 { 1143 int fd, oflags; 1144 struct btree *bt; 1145 1146 if (F_ISSET(flags, BT_RDONLY)) 1147 oflags = O_RDONLY; 1148 else 1149 oflags = O_RDWR | O_CREAT | O_APPEND; 1150 1151 if ((fd = open(path, oflags, mode)) == -1) 1152 return NULL; 1153 1154 if ((bt = btree_open_fd(fd, flags)) == NULL) 1155 close(fd); 1156 else { 1157 bt->path = strdup(path); 1158 DPRINTF("opened btree %p", bt); 1159 } 1160 1161 return bt; 1162 } 1163 1164 static void 1165 btree_ref(struct btree *bt) 1166 { 1167 bt->ref++; 1168 DPRINTF("ref is now %d on btree %p", bt->ref, bt); 1169 } 1170 1171 void 1172 btree_close(struct btree *bt) 1173 { 1174 if (bt == NULL) 1175 return; 1176 1177 if (--bt->ref == 0) { 1178 DPRINTF("ref is zero, closing btree %p", bt); 1179 close(bt->fd); 1180 mpage_flush(bt); 1181 free(bt->lru_queue); 1182 free(bt->path); 1183 free(bt->page_cache); 1184 free(bt); 1185 } else 1186 DPRINTF("ref is now %d on btree %p", bt->ref, bt); 1187 } 1188 1189 /* Search for key within a leaf page, using binary search. 1190 * Returns the smallest entry larger or equal to the key. 1191 * If exactp is non-null, stores whether the found entry was an exact match 1192 * in *exactp (1 or 0). 1193 * If kip is non-null, stores the index of the found entry in *kip. 1194 * If no entry larger of equal to the key is found, returns NULL. 1195 */ 1196 static struct node * 1197 btree_search_node(struct btree *bt, struct mpage *mp, struct btval *key, 1198 int *exactp, unsigned int *kip) 1199 { 1200 unsigned int i = 0; 1201 int low, high; 1202 int rc = 0; 1203 struct node *node; 1204 struct btval nodekey; 1205 1206 DPRINTF("searching %lu keys in %s page %u with prefix [%.*s]", 1207 NUMKEYS(mp), 1208 IS_LEAF(mp) ? "leaf" : "branch", 1209 mp->pgno, (int)mp->prefix.len, (char *)mp->prefix.str); 1210 1211 assert(NUMKEYS(mp) > 0); 1212 1213 memset(&nodekey, 0, sizeof(nodekey)); 1214 1215 low = IS_LEAF(mp) ? 0 : 1; 1216 high = NUMKEYS(mp) - 1; 1217 while (low <= high) { 1218 i = (low + high) >> 1; 1219 node = NODEPTR(mp, i); 1220 1221 nodekey.size = node->ksize; 1222 nodekey.data = NODEKEY(node); 1223 1224 if (bt->cmp) 1225 rc = bt->cmp(key, &nodekey); 1226 else 1227 rc = bt_cmp(bt, key, &nodekey, &mp->prefix); 1228 1229 if (IS_LEAF(mp)) 1230 DPRINTF("found leaf index %u [%.*s], rc = %d", 1231 i, (int)nodekey.size, (char *)nodekey.data, rc); 1232 else 1233 DPRINTF("found branch index %u [%.*s -> %u], rc = %d", 1234 i, (int)node->ksize, (char *)NODEKEY(node), 1235 node->n_pgno, rc); 1236 1237 if (rc == 0) 1238 break; 1239 if (rc > 0) 1240 low = i + 1; 1241 else 1242 high = i - 1; 1243 } 1244 1245 if (rc > 0) { /* Found entry is less than the key. */ 1246 i++; /* Skip to get the smallest entry larger than key. */ 1247 if (i >= NUMKEYS(mp)) 1248 /* There is no entry larger or equal to the key. */ 1249 return NULL; 1250 } 1251 if (exactp) 1252 *exactp = (rc == 0); 1253 if (kip) /* Store the key index if requested. */ 1254 *kip = i; 1255 1256 return NODEPTR(mp, i); 1257 } 1258 1259 static void 1260 cursor_pop_page(struct cursor *cursor) 1261 { 1262 struct ppage *top; 1263 1264 top = CURSOR_TOP(cursor); 1265 CURSOR_POP(cursor); 1266 top->mpage->ref--; 1267 1268 DPRINTF("popped page %u off cursor %p", top->mpage->pgno, cursor); 1269 1270 free(top); 1271 } 1272 1273 static struct ppage * 1274 cursor_push_page(struct cursor *cursor, struct mpage *mp) 1275 { 1276 struct ppage *ppage; 1277 1278 DPRINTF("pushing page %u on cursor %p", mp->pgno, cursor); 1279 1280 if ((ppage = calloc(1, sizeof(*ppage))) == NULL) 1281 return NULL; 1282 ppage->mpage = mp; 1283 mp->ref++; 1284 CURSOR_PUSH(cursor, ppage); 1285 return ppage; 1286 } 1287 1288 static struct mpage * 1289 btree_get_mpage(struct btree *bt, pgno_t pgno) 1290 { 1291 struct mpage *mp; 1292 1293 mp = mpage_lookup(bt, pgno); 1294 if (mp == NULL) { 1295 if ((mp = calloc(1, sizeof(*mp))) == NULL) 1296 return NULL; 1297 if ((mp->page = malloc(bt->head.psize)) == NULL) { 1298 free(mp); 1299 return NULL; 1300 } 1301 if (btree_read_page(bt, pgno, mp->page) != BT_SUCCESS) { 1302 mpage_free(mp); 1303 return NULL; 1304 } 1305 mp->pgno = pgno; 1306 mpage_add(bt, mp); 1307 } else 1308 DPRINTF("returning page %u from cache", pgno); 1309 1310 return mp; 1311 } 1312 1313 static void 1314 concat_prefix(struct btree *bt, char *s1, size_t n1, char *s2, size_t n2, 1315 char *cs, size_t *cn) 1316 { 1317 assert(*cn >= n1 + n2); 1318 if (F_ISSET(bt->flags, BT_REVERSEKEY)) { 1319 bcopy(s2, cs, n2); 1320 bcopy(s1, cs + n2, n1); 1321 } else { 1322 bcopy(s1, cs, n1); 1323 bcopy(s2, cs + n1, n2); 1324 } 1325 *cn = n1 + n2; 1326 } 1327 1328 static void 1329 find_common_prefix(struct btree *bt, struct mpage *mp) 1330 { 1331 indx_t lbound = 0, ubound = 0; 1332 struct mpage *lp, *up; 1333 struct btkey lprefix, uprefix; 1334 1335 mp->prefix.len = 0; 1336 if (bt->cmp != NULL) 1337 return; 1338 1339 lp = mp; 1340 while (lp->parent != NULL) { 1341 if (lp->parent_index > 0) { 1342 lbound = lp->parent_index; 1343 break; 1344 } 1345 lp = lp->parent; 1346 } 1347 1348 up = mp; 1349 while (up->parent != NULL) { 1350 if (up->parent_index + 1 < (indx_t)NUMKEYS(up->parent)) { 1351 ubound = up->parent_index + 1; 1352 break; 1353 } 1354 up = up->parent; 1355 } 1356 1357 if (lp->parent != NULL && up->parent != NULL) { 1358 expand_prefix(bt, lp->parent, lbound, &lprefix); 1359 expand_prefix(bt, up->parent, ubound, &uprefix); 1360 common_prefix(bt, &lprefix, &uprefix, &mp->prefix); 1361 } 1362 else if (mp->parent) 1363 bcopy(&mp->parent->prefix, &mp->prefix, sizeof(mp->prefix)); 1364 1365 DPRINTF("found common prefix [%.*s] (len %zu) for page %u", 1366 (int)mp->prefix.len, mp->prefix.str, mp->prefix.len, mp->pgno); 1367 } 1368 1369 static int 1370 btree_search_page_root(struct btree *bt, struct mpage *root, struct btval *key, 1371 struct cursor *cursor, int modify, struct mpage **mpp) 1372 { 1373 struct mpage *mp, *parent; 1374 1375 if (cursor && cursor_push_page(cursor, root) == NULL) 1376 return BT_FAIL; 1377 1378 mp = root; 1379 while (IS_BRANCH(mp)) { 1380 unsigned int i = 0; 1381 struct node *node; 1382 1383 DPRINTF("branch page %u has %lu keys", mp->pgno, NUMKEYS(mp)); 1384 assert(NUMKEYS(mp) > 1); 1385 DPRINTF("found index 0 to page %u", NODEPGNO(NODEPTR(mp, 0))); 1386 1387 if (key == NULL) /* Initialize cursor to first page. */ 1388 i = 0; 1389 else { 1390 int exact; 1391 node = btree_search_node(bt, mp, key, &exact, &i); 1392 if (node == NULL) 1393 i = NUMKEYS(mp) - 1; 1394 else if (!exact) { 1395 assert(i > 0); 1396 i--; 1397 } 1398 } 1399 1400 if (key) 1401 DPRINTF("following index %u for key %.*s", 1402 i, (int)key->size, (char *)key->data); 1403 assert(i >= 0 && i < NUMKEYS(mp)); 1404 node = NODEPTR(mp, i); 1405 1406 if (cursor) 1407 CURSOR_TOP(cursor)->ki = i; 1408 1409 parent = mp; 1410 if ((mp = btree_get_mpage(bt, NODEPGNO(node))) == NULL) 1411 return BT_FAIL; 1412 mp->parent = parent; 1413 mp->parent_index = i; 1414 find_common_prefix(bt, mp); 1415 1416 if (cursor && cursor_push_page(cursor, mp) == NULL) 1417 return BT_FAIL; 1418 1419 if (modify && (mp = mpage_touch(bt, mp)) == NULL) 1420 return BT_FAIL; 1421 } 1422 1423 if (!IS_LEAF(mp)) { 1424 DPRINTF("internal error, index points to a %02X page!?", 1425 mp->page->flags); 1426 return BT_FAIL; 1427 } 1428 1429 DPRINTF("found leaf page %u for key %.*s", mp->pgno, 1430 key ? (int)key->size : 0, key ? (char *)key->data : NULL); 1431 1432 *mpp = mp; 1433 return BT_SUCCESS; 1434 } 1435 1436 /* Search for the page a given key should be in. 1437 * Stores a pointer to the found page in *mpp. 1438 * If key is NULL, search for the lowest page (used by btree_cursor_first). 1439 * If cursor is non-null, pushes parent pages on the cursor stack. 1440 * If modify is true, visited pages are updated with new page numbers. 1441 */ 1442 static int 1443 btree_search_page(struct btree *bt, struct btree_txn *txn, struct btval *key, 1444 struct cursor *cursor, int modify, struct mpage **mpp) 1445 { 1446 int rc; 1447 pgno_t root; 1448 struct mpage *mp; 1449 1450 /* Can't modify pages outside a transaction. */ 1451 if (txn == NULL && modify) { 1452 errno = EINVAL; 1453 return BT_FAIL; 1454 } 1455 1456 /* Choose which root page to start with. If a transaction is given 1457 * use the root page from the transaction, otherwise read the last 1458 * committed root page. 1459 */ 1460 if (txn == NULL) { 1461 if ((rc = btree_read_meta(bt, NULL)) != BT_SUCCESS) 1462 return rc; 1463 root = bt->meta.root; 1464 } else if (F_ISSET(txn->flags, BT_TXN_ERROR)) { 1465 DPRINTF("transaction has failed, must abort"); 1466 errno = EINVAL; 1467 return BT_FAIL; 1468 } else 1469 root = txn->root; 1470 1471 if (root == P_INVALID) { /* Tree is empty. */ 1472 DPRINTF("tree is empty"); 1473 errno = ENOENT; 1474 return BT_FAIL; 1475 } 1476 1477 if ((mp = btree_get_mpage(bt, root)) == NULL) 1478 return BT_FAIL; 1479 1480 DPRINTF("root page has flags 0x%X", mp->page->flags); 1481 1482 assert(mp->parent == NULL); 1483 assert(mp->prefix.len == 0); 1484 1485 if (modify && !mp->dirty) { 1486 if ((mp = mpage_touch(bt, mp)) == NULL) 1487 return BT_FAIL; 1488 txn->root = mp->pgno; 1489 } 1490 1491 return btree_search_page_root(bt, mp, key, cursor, modify, mpp); 1492 } 1493 1494 static int 1495 btree_read_data(struct btree *bt, struct mpage *mp, struct node *leaf, 1496 struct btval *data) 1497 { 1498 struct mpage *omp; /* overflow mpage */ 1499 size_t psz; 1500 size_t max; 1501 size_t sz = 0; 1502 pgno_t pgno; 1503 1504 memset(data, 0, sizeof(*data)); 1505 max = bt->head.psize - PAGEHDRSZ; 1506 1507 if (!F_ISSET(leaf->flags, F_BIGDATA)) { 1508 data->size = leaf->n_dsize; 1509 if (data->size > 0) { 1510 if (mp == NULL) { 1511 if ((data->data = malloc(data->size)) == NULL) 1512 return BT_FAIL; 1513 bcopy(NODEDATA(leaf), data->data, data->size); 1514 data->free_data = 1; 1515 data->mp = NULL; 1516 } else { 1517 data->data = NODEDATA(leaf); 1518 data->free_data = 0; 1519 data->mp = mp; 1520 mp->ref++; 1521 } 1522 } 1523 return BT_SUCCESS; 1524 } 1525 1526 /* Read overflow data. 1527 */ 1528 DPRINTF("allocating %u byte for overflow data", leaf->n_dsize); 1529 if ((data->data = malloc(leaf->n_dsize)) == NULL) 1530 return BT_FAIL; 1531 data->size = leaf->n_dsize; 1532 data->free_data = 1; 1533 data->mp = NULL; 1534 bcopy(NODEDATA(leaf), &pgno, sizeof(pgno)); 1535 for (sz = 0; sz < data->size; ) { 1536 if ((omp = btree_get_mpage(bt, pgno)) == NULL || 1537 !F_ISSET(omp->page->flags, P_OVERFLOW)) { 1538 DPRINTF("read overflow page %u failed", pgno); 1539 free(data->data); 1540 mpage_free(omp); 1541 return BT_FAIL; 1542 } 1543 psz = data->size - sz; 1544 if (psz > max) 1545 psz = max; 1546 bcopy(omp->page->ptrs, (char *)data->data + sz, psz); 1547 sz += psz; 1548 pgno = omp->page->p_next_pgno; 1549 } 1550 1551 return BT_SUCCESS; 1552 } 1553 1554 int 1555 btree_txn_get(struct btree *bt, struct btree_txn *txn, 1556 struct btval *key, struct btval *data) 1557 { 1558 int rc, exact; 1559 struct node *leaf; 1560 struct mpage *mp; 1561 1562 assert(key); 1563 assert(data); 1564 DPRINTF("===> get key [%.*s]", (int)key->size, (char *)key->data); 1565 1566 if (bt != NULL && txn != NULL && bt != txn->bt) { 1567 errno = EINVAL; 1568 return BT_FAIL; 1569 } 1570 1571 if (bt == NULL) { 1572 if (txn == NULL) { 1573 errno = EINVAL; 1574 return BT_FAIL; 1575 } 1576 bt = txn->bt; 1577 } 1578 1579 if (key->size == 0 || key->size > MAXKEYSIZE) { 1580 errno = EINVAL; 1581 return BT_FAIL; 1582 } 1583 1584 if ((rc = btree_search_page(bt, txn, key, NULL, 0, &mp)) != BT_SUCCESS) 1585 return rc; 1586 1587 leaf = btree_search_node(bt, mp, key, &exact, NULL); 1588 if (leaf && exact) 1589 rc = btree_read_data(bt, mp, leaf, data); 1590 else { 1591 errno = ENOENT; 1592 rc = BT_FAIL; 1593 } 1594 1595 mpage_prune(bt); 1596 return rc; 1597 } 1598 1599 static int 1600 btree_sibling(struct cursor *cursor, int move_right) 1601 { 1602 int rc; 1603 struct node *indx; 1604 struct ppage *parent, *top; 1605 struct mpage *mp; 1606 1607 top = CURSOR_TOP(cursor); 1608 if ((parent = SLIST_NEXT(top, entry)) == NULL) { 1609 errno = ENOENT; 1610 return BT_FAIL; /* root has no siblings */ 1611 } 1612 1613 DPRINTF("parent page is page %u, index %u", 1614 parent->mpage->pgno, parent->ki); 1615 1616 cursor_pop_page(cursor); 1617 if (move_right ? (parent->ki + 1 >= NUMKEYS(parent->mpage)) 1618 : (parent->ki == 0)) { 1619 DPRINTF("no more keys left, moving to %s sibling", 1620 move_right ? "right" : "left"); 1621 if ((rc = btree_sibling(cursor, move_right)) != BT_SUCCESS) 1622 return rc; 1623 parent = CURSOR_TOP(cursor); 1624 } else { 1625 if (move_right) 1626 parent->ki++; 1627 else 1628 parent->ki--; 1629 DPRINTF("just moving to %s index key %u", 1630 move_right ? "right" : "left", parent->ki); 1631 } 1632 assert(IS_BRANCH(parent->mpage)); 1633 1634 indx = NODEPTR(parent->mpage, parent->ki); 1635 if ((mp = btree_get_mpage(cursor->bt, indx->n_pgno)) == NULL) 1636 return BT_FAIL; 1637 mp->parent = parent->mpage; 1638 mp->parent_index = parent->ki; 1639 1640 cursor_push_page(cursor, mp); 1641 find_common_prefix(cursor->bt, mp); 1642 1643 return BT_SUCCESS; 1644 } 1645 1646 static int 1647 bt_set_key(struct btree *bt, struct mpage *mp, struct node *node, 1648 struct btval *key) 1649 { 1650 if (key == NULL) 1651 return 0; 1652 1653 if (mp->prefix.len > 0) { 1654 key->size = node->ksize + mp->prefix.len; 1655 key->data = malloc(key->size); 1656 if (key->data == NULL) 1657 return -1; 1658 concat_prefix(bt, 1659 mp->prefix.str, mp->prefix.len, 1660 NODEKEY(node), node->ksize, 1661 key->data, &key->size); 1662 key->free_data = 1; 1663 } else { 1664 key->size = node->ksize; 1665 key->data = NODEKEY(node); 1666 key->free_data = 0; 1667 key->mp = mp; 1668 mp->ref++; 1669 } 1670 1671 return 0; 1672 } 1673 1674 static int 1675 btree_cursor_next(struct cursor *cursor, struct btval *key, struct btval *data) 1676 { 1677 struct ppage *top; 1678 struct mpage *mp; 1679 struct node *leaf; 1680 1681 if (cursor->eof) { 1682 errno = ENOENT; 1683 return BT_FAIL; 1684 } 1685 1686 assert(cursor->initialized); 1687 1688 top = CURSOR_TOP(cursor); 1689 mp = top->mpage; 1690 1691 DPRINTF("cursor_next: top page is %u in cursor %p", mp->pgno, cursor); 1692 1693 if (top->ki + 1 >= NUMKEYS(mp)) { 1694 DPRINTF("=====> move to next sibling page"); 1695 if (btree_sibling(cursor, 1) != BT_SUCCESS) { 1696 cursor->eof = 1; 1697 return BT_FAIL; 1698 } 1699 top = CURSOR_TOP(cursor); 1700 mp = top->mpage; 1701 DPRINTF("next page is %u, key index %u", mp->pgno, top->ki); 1702 } else 1703 top->ki++; 1704 1705 DPRINTF("==> cursor points to page %u with %lu keys, key index %u", 1706 mp->pgno, NUMKEYS(mp), top->ki); 1707 1708 assert(IS_LEAF(mp)); 1709 leaf = NODEPTR(mp, top->ki); 1710 1711 if (data && btree_read_data(cursor->bt, mp, leaf, data) != BT_SUCCESS) 1712 return BT_FAIL; 1713 1714 if (bt_set_key(cursor->bt, mp, leaf, key) != 0) 1715 return BT_FAIL; 1716 1717 return BT_SUCCESS; 1718 } 1719 1720 static int 1721 btree_cursor_set(struct cursor *cursor, struct btval *key, struct btval *data, 1722 int *exactp) 1723 { 1724 int rc; 1725 struct node *leaf; 1726 struct mpage *mp; 1727 struct ppage *top; 1728 1729 assert(cursor); 1730 assert(key); 1731 assert(key->size > 0); 1732 1733 rc = btree_search_page(cursor->bt, cursor->txn, key, cursor, 0, &mp); 1734 if (rc != BT_SUCCESS) 1735 return rc; 1736 assert(IS_LEAF(mp)); 1737 1738 top = CURSOR_TOP(cursor); 1739 leaf = btree_search_node(cursor->bt, mp, key, exactp, &top->ki); 1740 if (exactp != NULL && !*exactp) { 1741 /* BT_CURSOR_EXACT specified and not an exact match. */ 1742 errno = ENOENT; 1743 return BT_FAIL; 1744 } 1745 1746 if (leaf == NULL) { 1747 DPRINTF("===> inexact leaf not found, goto sibling"); 1748 if (btree_sibling(cursor, 1) != BT_SUCCESS) 1749 return BT_FAIL; /* no entries matched */ 1750 top = CURSOR_TOP(cursor); 1751 top->ki = 0; 1752 mp = top->mpage; 1753 assert(IS_LEAF(mp)); 1754 leaf = NODEPTR(mp, 0); 1755 } 1756 1757 cursor->initialized = 1; 1758 cursor->eof = 0; 1759 1760 if (data && btree_read_data(cursor->bt, mp, leaf, data) != BT_SUCCESS) 1761 return BT_FAIL; 1762 1763 if (bt_set_key(cursor->bt, mp, leaf, key) != 0) 1764 return BT_FAIL; 1765 DPRINTF("==> cursor placed on key %.*s", 1766 (int)key->size, (char *)key->data); 1767 1768 return BT_SUCCESS; 1769 } 1770 1771 static int 1772 btree_cursor_first(struct cursor *cursor, struct btval *key, struct btval *data) 1773 { 1774 int rc; 1775 struct mpage *mp; 1776 struct node *leaf; 1777 1778 rc = btree_search_page(cursor->bt, cursor->txn, NULL, cursor, 0, &mp); 1779 if (rc != BT_SUCCESS) 1780 return rc; 1781 assert(IS_LEAF(mp)); 1782 1783 leaf = NODEPTR(mp, 0); 1784 cursor->initialized = 1; 1785 cursor->eof = 0; 1786 1787 if (data && btree_read_data(cursor->bt, mp, leaf, data) != BT_SUCCESS) 1788 return BT_FAIL; 1789 1790 if (bt_set_key(cursor->bt, mp, leaf, key) != 0) 1791 return BT_FAIL; 1792 1793 return BT_SUCCESS; 1794 } 1795 1796 int 1797 btree_cursor_get(struct cursor *cursor, struct btval *key, struct btval *data, 1798 enum cursor_op op) 1799 { 1800 int rc; 1801 int exact = 0; 1802 1803 assert(cursor); 1804 1805 switch (op) { 1806 case BT_CURSOR: 1807 case BT_CURSOR_EXACT: 1808 while (CURSOR_TOP(cursor) != NULL) 1809 cursor_pop_page(cursor); 1810 if (key == NULL || key->size == 0 || key->size > MAXKEYSIZE) { 1811 errno = EINVAL; 1812 rc = BT_FAIL; 1813 } else if (op == BT_CURSOR_EXACT) 1814 rc = btree_cursor_set(cursor, key, data, &exact); 1815 else 1816 rc = btree_cursor_set(cursor, key, data, NULL); 1817 break; 1818 case BT_NEXT: 1819 if (!cursor->initialized) 1820 rc = btree_cursor_first(cursor, key, data); 1821 else 1822 rc = btree_cursor_next(cursor, key, data); 1823 break; 1824 case BT_FIRST: 1825 while (CURSOR_TOP(cursor) != NULL) 1826 cursor_pop_page(cursor); 1827 rc = btree_cursor_first(cursor, key, data); 1828 break; 1829 default: 1830 DPRINTF("unhandled/unimplemented cursor operation %u", op); 1831 rc = BT_FAIL; 1832 break; 1833 } 1834 1835 mpage_prune(cursor->bt); 1836 1837 return rc; 1838 } 1839 1840 static struct mpage * 1841 btree_new_page(struct btree *bt, uint32_t flags) 1842 { 1843 struct mpage *mp; 1844 1845 assert(bt != NULL); 1846 assert(bt->txn != NULL); 1847 1848 DPRINTF("allocating new mpage %u, page size %u", 1849 bt->txn->next_pgno, bt->head.psize); 1850 if ((mp = calloc(1, sizeof(*mp))) == NULL) 1851 return NULL; 1852 if ((mp->page = malloc(bt->head.psize)) == NULL) { 1853 free(mp); 1854 return NULL; 1855 } 1856 mp->pgno = mp->page->pgno = bt->txn->next_pgno++; 1857 mp->page->flags = flags; 1858 mp->page->lower = PAGEHDRSZ; 1859 mp->page->upper = bt->head.psize; 1860 1861 if (IS_BRANCH(mp)) 1862 bt->meta.branch_pages++; 1863 else if (IS_LEAF(mp)) 1864 bt->meta.leaf_pages++; 1865 else if (IS_OVERFLOW(mp)) 1866 bt->meta.overflow_pages++; 1867 1868 mpage_add(bt, mp); 1869 mpage_dirty(bt, mp); 1870 1871 return mp; 1872 } 1873 1874 static size_t 1875 bt_leaf_size(struct btree *bt, struct btval *key, struct btval *data) 1876 { 1877 size_t sz; 1878 1879 sz = LEAFSIZE(key, data); 1880 if (data->size >= bt->head.psize / BT_MINKEYS) { 1881 /* put on overflow page */ 1882 sz -= data->size - sizeof(pgno_t); 1883 } 1884 1885 return sz + sizeof(indx_t); 1886 } 1887 1888 static size_t 1889 bt_branch_size(struct btree *bt, struct btval *key) 1890 { 1891 size_t sz; 1892 1893 sz = INDXSIZE(key); 1894 if (sz >= bt->head.psize / BT_MINKEYS) { 1895 /* put on overflow page */ 1896 /* not implemented */ 1897 /* sz -= key->size - sizeof(pgno_t); */ 1898 } 1899 1900 return sz + sizeof(indx_t); 1901 } 1902 1903 static int 1904 btree_write_overflow_data(struct btree *bt, struct page *p, struct btval *data) 1905 { 1906 size_t done = 0; 1907 size_t sz; 1908 size_t max; 1909 pgno_t *linkp; /* linked page stored here */ 1910 struct mpage *next = NULL; 1911 1912 max = bt->head.psize - PAGEHDRSZ; 1913 1914 while (done < data->size) { 1915 if (next != NULL) 1916 p = next->page; 1917 linkp = &p->p_next_pgno; 1918 if (data->size - done > max) { 1919 /* need another overflow page */ 1920 if ((next = btree_new_page(bt, P_OVERFLOW)) == NULL) 1921 return BT_FAIL; 1922 *linkp = next->pgno; 1923 DPRINTF("linking overflow page %u", next->pgno); 1924 } else 1925 *linkp = 0; /* indicates end of list */ 1926 sz = data->size - done; 1927 if (sz > max) 1928 sz = max; 1929 DPRINTF("copying %zu bytes to overflow page %u", sz, p->pgno); 1930 bcopy((char *)data->data + done, p->ptrs, sz); 1931 done += sz; 1932 } 1933 1934 return BT_SUCCESS; 1935 } 1936 1937 /* Key prefix should already be stripped. 1938 */ 1939 static int 1940 btree_add_node(struct btree *bt, struct mpage *mp, indx_t indx, 1941 struct btval *key, struct btval *data, pgno_t pgno, uint8_t flags) 1942 { 1943 unsigned int i; 1944 size_t node_size = NODESIZE; 1945 indx_t ofs; 1946 struct node *node; 1947 struct page *p; 1948 struct mpage *ofp = NULL; /* overflow page */ 1949 1950 p = mp->page; 1951 assert(p->upper >= p->lower); 1952 1953 DPRINTF("add node [%.*s] to %s page %u at index %d, key size %zu", 1954 key ? (int)key->size : 0, key ? (char *)key->data : NULL, 1955 IS_LEAF(mp) ? "leaf" : "branch", 1956 mp->pgno, indx, key ? key->size : 0); 1957 1958 if (key != NULL) 1959 node_size += key->size; 1960 1961 if (IS_LEAF(mp)) { 1962 assert(data); 1963 node_size += data->size; 1964 if (F_ISSET(flags, F_BIGDATA)) { 1965 /* Data already on overflow page. */ 1966 node_size -= data->size - sizeof(pgno_t); 1967 } else if (data->size >= bt->head.psize / BT_MINKEYS) { 1968 /* Put data on overflow page. */ 1969 DPRINTF("data size is %zu, put on overflow page", 1970 data->size); 1971 node_size -= data->size - sizeof(pgno_t); 1972 if ((ofp = btree_new_page(bt, P_OVERFLOW)) == NULL) 1973 return BT_FAIL; 1974 DPRINTF("allocated overflow page %u", ofp->pgno); 1975 flags |= F_BIGDATA; 1976 } 1977 } 1978 1979 if (node_size + sizeof(indx_t) > SIZELEFT(mp)) { 1980 DPRINTF("not enough room in page %u, got %lu ptrs", 1981 mp->pgno, NUMKEYS(mp)); 1982 DPRINTF("upper - lower = %u - %u = %u", p->upper, p->lower, 1983 p->upper - p->lower); 1984 DPRINTF("node size = %zu", node_size); 1985 return BT_FAIL; 1986 } 1987 1988 /* Move higher pointers up one slot. */ 1989 for (i = NUMKEYS(mp); i > indx; i--) 1990 p->ptrs[i] = p->ptrs[i - 1]; 1991 1992 /* Adjust free space offsets. */ 1993 ofs = p->upper - node_size; 1994 assert(ofs >= p->lower + sizeof(indx_t)); 1995 p->ptrs[indx] = ofs; 1996 p->upper = ofs; 1997 p->lower += sizeof(indx_t); 1998 1999 /* Write the node data. */ 2000 node = NODEPTR(mp, indx); 2001 node->ksize = (key == NULL) ? 0 : key->size; 2002 node->flags = flags; 2003 if (IS_LEAF(mp)) 2004 node->n_dsize = data->size; 2005 else 2006 node->n_pgno = pgno; 2007 2008 if (key) 2009 bcopy(key->data, NODEKEY(node), key->size); 2010 2011 if (IS_LEAF(mp)) { 2012 assert(key); 2013 if (ofp == NULL) { 2014 if (F_ISSET(flags, F_BIGDATA)) 2015 bcopy(data->data, node->data + key->size, 2016 sizeof(pgno_t)); 2017 else 2018 bcopy(data->data, node->data + key->size, 2019 data->size); 2020 } else { 2021 bcopy(&ofp->pgno, node->data + key->size, 2022 sizeof(pgno_t)); 2023 if (btree_write_overflow_data(bt, ofp->page, 2024 data) == BT_FAIL) 2025 return BT_FAIL; 2026 } 2027 } 2028 2029 return BT_SUCCESS; 2030 } 2031 2032 static void 2033 btree_del_node(struct btree *bt, struct mpage *mp, indx_t indx) 2034 { 2035 unsigned int sz; 2036 indx_t i, j, numkeys, ptr; 2037 struct node *node; 2038 char *base; 2039 2040 DPRINTF("delete node %u on %s page %u", indx, 2041 IS_LEAF(mp) ? "leaf" : "branch", mp->pgno); 2042 assert(indx < NUMKEYS(mp)); 2043 2044 node = NODEPTR(mp, indx); 2045 sz = NODESIZE + node->ksize; 2046 if (IS_LEAF(mp)) { 2047 if (F_ISSET(node->flags, F_BIGDATA)) 2048 sz += sizeof(pgno_t); 2049 else 2050 sz += NODEDSZ(node); 2051 } 2052 2053 ptr = mp->page->ptrs[indx]; 2054 numkeys = NUMKEYS(mp); 2055 for (i = j = 0; i < numkeys; i++) { 2056 if (i != indx) { 2057 mp->page->ptrs[j] = mp->page->ptrs[i]; 2058 if (mp->page->ptrs[i] < ptr) 2059 mp->page->ptrs[j] += sz; 2060 j++; 2061 } 2062 } 2063 2064 base = (char *)mp->page + mp->page->upper; 2065 bcopy(base, base + sz, ptr - mp->page->upper); 2066 2067 mp->page->lower -= sizeof(indx_t); 2068 mp->page->upper += sz; 2069 } 2070 2071 struct cursor * 2072 btree_txn_cursor_open(struct btree *bt, struct btree_txn *txn) 2073 { 2074 struct cursor *cursor; 2075 2076 if (bt != NULL && txn != NULL && bt != txn->bt) { 2077 errno = EINVAL; 2078 return NULL; 2079 } 2080 2081 if (bt == NULL) { 2082 if (txn == NULL) { 2083 errno = EINVAL; 2084 return NULL; 2085 } 2086 bt = txn->bt; 2087 } 2088 2089 if ((cursor = calloc(1, sizeof(*cursor))) != NULL) { 2090 SLIST_INIT(&cursor->stack); 2091 cursor->bt = bt; 2092 cursor->txn = txn; 2093 btree_ref(bt); 2094 } 2095 2096 return cursor; 2097 } 2098 2099 void 2100 btree_cursor_close(struct cursor *cursor) 2101 { 2102 if (cursor != NULL) { 2103 while (!CURSOR_EMPTY(cursor)) 2104 cursor_pop_page(cursor); 2105 2106 btree_close(cursor->bt); 2107 free(cursor); 2108 } 2109 } 2110 2111 static int 2112 btree_update_key(struct btree *bt, struct mpage *mp, indx_t indx, 2113 struct btval *key) 2114 { 2115 indx_t ptr, i, numkeys; 2116 int delta; 2117 size_t len; 2118 struct node *node; 2119 char *base; 2120 2121 node = NODEPTR(mp, indx); 2122 ptr = mp->page->ptrs[indx]; 2123 DPRINTF("update key %u (ofs %u) [%.*s] to [%.*s] on page %u", 2124 indx, ptr, 2125 (int)node->ksize, (char *)NODEKEY(node), 2126 (int)key->size, (char *)key->data, 2127 mp->pgno); 2128 2129 if (key->size != node->ksize) { 2130 delta = key->size - node->ksize; 2131 if (delta > 0 && SIZELEFT(mp) < delta) { 2132 DPRINTF("OUCH! Not enough room, delta = %d", delta); 2133 return BT_FAIL; 2134 } 2135 2136 numkeys = NUMKEYS(mp); 2137 for (i = 0; i < numkeys; i++) { 2138 if (mp->page->ptrs[i] <= ptr) 2139 mp->page->ptrs[i] -= delta; 2140 } 2141 2142 base = (char *)mp->page + mp->page->upper; 2143 len = ptr - mp->page->upper + NODESIZE; 2144 bcopy(base, base - delta, len); 2145 mp->page->upper -= delta; 2146 2147 node = NODEPTR(mp, indx); 2148 node->ksize = key->size; 2149 } 2150 2151 bcopy(key->data, NODEKEY(node), key->size); 2152 2153 return BT_SUCCESS; 2154 } 2155 2156 static int 2157 btree_adjust_prefix(struct btree *bt, struct mpage *src, int delta) 2158 { 2159 indx_t i; 2160 struct node *node; 2161 struct btkey tmpkey; 2162 struct btval key; 2163 2164 DPRINTF("adjusting prefix lengths on page %u with delta %d", 2165 src->pgno, delta); 2166 assert(delta != 0); 2167 2168 for (i = 0; i < NUMKEYS(src); i++) { 2169 node = NODEPTR(src, i); 2170 tmpkey.len = node->ksize - delta; 2171 if (delta > 0) { 2172 if (F_ISSET(bt->flags, BT_REVERSEKEY)) 2173 bcopy(NODEKEY(node), tmpkey.str, tmpkey.len); 2174 else 2175 bcopy((char *)NODEKEY(node) + delta, tmpkey.str, 2176 tmpkey.len); 2177 } else { 2178 if (F_ISSET(bt->flags, BT_REVERSEKEY)) { 2179 bcopy(NODEKEY(node), tmpkey.str, node->ksize); 2180 bcopy(src->prefix.str, tmpkey.str + node->ksize, 2181 -delta); 2182 } else { 2183 bcopy(src->prefix.str + src->prefix.len + delta, 2184 tmpkey.str, -delta); 2185 bcopy(NODEKEY(node), tmpkey.str - delta, 2186 node->ksize); 2187 } 2188 } 2189 key.size = tmpkey.len; 2190 key.data = tmpkey.str; 2191 if (btree_update_key(bt, src, i, &key) != BT_SUCCESS) 2192 return BT_FAIL; 2193 } 2194 2195 return BT_SUCCESS; 2196 } 2197 2198 /* Move a node from src to dst. 2199 */ 2200 static int 2201 btree_move_node(struct btree *bt, struct mpage *src, indx_t srcindx, 2202 struct mpage *dst, indx_t dstindx) 2203 { 2204 int rc; 2205 unsigned int pfxlen, mp_pfxlen = 0; 2206 struct node *srcnode; 2207 struct mpage *mp = NULL; 2208 struct btkey tmpkey, srckey; 2209 struct btval key, data; 2210 2211 assert(src->parent); 2212 assert(dst->parent); 2213 2214 srcnode = NODEPTR(src, srcindx); 2215 DPRINTF("moving %s node %u [%.*s] on page %u to node %u on page %u", 2216 IS_LEAF(src) ? "leaf" : "branch", 2217 srcindx, 2218 (int)srcnode->ksize, (char *)NODEKEY(srcnode), 2219 src->pgno, 2220 dstindx, dst->pgno); 2221 2222 find_common_prefix(bt, src); 2223 2224 if (IS_BRANCH(src)) { 2225 /* Need to check if the page the moved node points to 2226 * changes prefix. 2227 */ 2228 if ((mp = btree_get_mpage(bt, NODEPGNO(srcnode))) == NULL) 2229 return BT_FAIL; 2230 mp->parent = src; 2231 mp->parent_index = srcindx; 2232 find_common_prefix(bt, mp); 2233 mp_pfxlen = mp->prefix.len; 2234 } 2235 2236 /* Mark src and dst as dirty. */ 2237 if ((src = mpage_touch(bt, src)) == NULL || 2238 (dst = mpage_touch(bt, dst)) == NULL) 2239 return BT_FAIL; 2240 2241 find_common_prefix(bt, dst); 2242 2243 /* Check if src node has destination page prefix. Otherwise the 2244 * destination page must expand its prefix on all its nodes. 2245 */ 2246 srckey.len = srcnode->ksize; 2247 bcopy(NODEKEY(srcnode), srckey.str, srckey.len); 2248 common_prefix(bt, &srckey, &dst->prefix, &tmpkey); 2249 if (tmpkey.len != dst->prefix.len) { 2250 if (btree_adjust_prefix(bt, dst, 2251 tmpkey.len - dst->prefix.len) != BT_SUCCESS) 2252 return BT_FAIL; 2253 bcopy(&tmpkey, &dst->prefix, sizeof(tmpkey)); 2254 } 2255 2256 if (srcindx == 0 && IS_BRANCH(src)) { 2257 struct mpage *low; 2258 2259 /* must find the lowest key below src 2260 */ 2261 assert(btree_search_page_root(bt, src, NULL, NULL, 0, 2262 &low) == BT_SUCCESS); 2263 expand_prefix(bt, low, 0, &srckey); 2264 DPRINTF("found lowest key [%.*s] on leaf page %u", 2265 (int)srckey.len, srckey.str, low->pgno); 2266 } else { 2267 srckey.len = srcnode->ksize; 2268 bcopy(NODEKEY(srcnode), srckey.str, srcnode->ksize); 2269 } 2270 find_common_prefix(bt, src); 2271 2272 /* expand the prefix */ 2273 tmpkey.len = sizeof(tmpkey.str); 2274 concat_prefix(bt, src->prefix.str, src->prefix.len, 2275 srckey.str, srckey.len, tmpkey.str, &tmpkey.len); 2276 2277 /* Add the node to the destination page. Adjust prefix for 2278 * destination page. 2279 */ 2280 key.size = tmpkey.len; 2281 key.data = tmpkey.str; 2282 remove_prefix(bt, &key, dst->prefix.len); 2283 data.size = NODEDSZ(srcnode); 2284 data.data = NODEDATA(srcnode); 2285 rc = btree_add_node(bt, dst, dstindx, &key, &data, NODEPGNO(srcnode), 2286 srcnode->flags); 2287 if (rc != BT_SUCCESS) 2288 return rc; 2289 2290 /* Delete the node from the source page. 2291 */ 2292 btree_del_node(bt, src, srcindx); 2293 2294 /* Update the parent separators. 2295 */ 2296 if (srcindx == 0 && src->parent_index != 0) { 2297 expand_prefix(bt, src, 0, &tmpkey); 2298 key.size = tmpkey.len; 2299 key.data = tmpkey.str; 2300 remove_prefix(bt, &key, src->parent->prefix.len); 2301 2302 DPRINTF("update separator for source page %u to [%.*s]", 2303 src->pgno, (int)key.size, (char *)key.data); 2304 if (btree_update_key(bt, src->parent, src->parent_index, 2305 &key) != BT_SUCCESS) 2306 return BT_FAIL; 2307 } 2308 2309 if (srcindx == 0 && IS_BRANCH(src)) { 2310 struct btval nullkey; 2311 nullkey.size = 0; 2312 assert(btree_update_key(bt, src, 0, &nullkey) == BT_SUCCESS); 2313 } 2314 2315 if (dstindx == 0 && dst->parent_index != 0) { 2316 expand_prefix(bt, dst, 0, &tmpkey); 2317 key.size = tmpkey.len; 2318 key.data = tmpkey.str; 2319 remove_prefix(bt, &key, dst->parent->prefix.len); 2320 2321 DPRINTF("update separator for destination page %u to [%.*s]", 2322 dst->pgno, (int)key.size, (char *)key.data); 2323 if (btree_update_key(bt, dst->parent, dst->parent_index, 2324 &key) != BT_SUCCESS) 2325 return BT_FAIL; 2326 } 2327 2328 if (dstindx == 0 && IS_BRANCH(dst)) { 2329 struct btval nullkey; 2330 nullkey.size = 0; 2331 assert(btree_update_key(bt, dst, 0, &nullkey) == BT_SUCCESS); 2332 } 2333 2334 /* We can get a new page prefix here! 2335 * Must update keys in all nodes of this page! 2336 */ 2337 pfxlen = src->prefix.len; 2338 find_common_prefix(bt, src); 2339 if (src->prefix.len != pfxlen) { 2340 if (btree_adjust_prefix(bt, src, 2341 src->prefix.len - pfxlen) != BT_SUCCESS) 2342 return BT_FAIL; 2343 } 2344 2345 pfxlen = dst->prefix.len; 2346 find_common_prefix(bt, dst); 2347 if (dst->prefix.len != pfxlen) { 2348 if (btree_adjust_prefix(bt, dst, 2349 dst->prefix.len - pfxlen) != BT_SUCCESS) 2350 return BT_FAIL; 2351 } 2352 2353 if (IS_BRANCH(dst)) { 2354 assert(mp); 2355 mp->parent = dst; 2356 mp->parent_index = dstindx; 2357 find_common_prefix(bt, mp); 2358 if (mp->prefix.len != mp_pfxlen) { 2359 DPRINTF("moved branch node has changed prefix"); 2360 if ((mp = mpage_touch(bt, mp)) == NULL) 2361 return BT_FAIL; 2362 if (btree_adjust_prefix(bt, mp, 2363 mp->prefix.len - mp_pfxlen) != BT_SUCCESS) 2364 return BT_FAIL; 2365 } 2366 } 2367 2368 return BT_SUCCESS; 2369 } 2370 2371 static int 2372 btree_merge(struct btree *bt, struct mpage *src, struct mpage *dst) 2373 { 2374 int rc; 2375 indx_t i; 2376 unsigned int pfxlen; 2377 struct node *srcnode; 2378 struct btkey tmpkey, dstpfx; 2379 struct btval key, data; 2380 2381 DPRINTF("merging page %u and %u", src->pgno, dst->pgno); 2382 2383 assert(src->parent); /* can't merge root page */ 2384 assert(dst->parent); 2385 assert(bt->txn != NULL); 2386 2387 /* Mark src and dst as dirty. */ 2388 if ((src = mpage_touch(bt, src)) == NULL || 2389 (dst = mpage_touch(bt, dst)) == NULL) 2390 return BT_FAIL; 2391 2392 find_common_prefix(bt, src); 2393 find_common_prefix(bt, dst); 2394 2395 /* Check if source nodes has destination page prefix. Otherwise 2396 * the destination page must expand its prefix on all its nodes. 2397 */ 2398 common_prefix(bt, &src->prefix, &dst->prefix, &dstpfx); 2399 if (dstpfx.len != dst->prefix.len) { 2400 if (btree_adjust_prefix(bt, dst, 2401 dstpfx.len - dst->prefix.len) != BT_SUCCESS) 2402 return BT_FAIL; 2403 bcopy(&dstpfx, &dst->prefix, sizeof(dstpfx)); 2404 } 2405 2406 /* Move all nodes from src to dst. 2407 */ 2408 for (i = 0; i < NUMKEYS(src); i++) { 2409 srcnode = NODEPTR(src, i); 2410 2411 /* If branch node 0 (implicit key), find the real key. 2412 */ 2413 if (i == 0 && IS_BRANCH(src)) { 2414 struct mpage *low; 2415 2416 /* must find the lowest key below src 2417 */ 2418 assert(btree_search_page_root(bt, src, NULL, NULL, 0, 2419 &low) == BT_SUCCESS); 2420 expand_prefix(bt, low, 0, &tmpkey); 2421 DPRINTF("found lowest key [%.*s] on leaf page %u", 2422 (int)tmpkey.len, tmpkey.str, low->pgno); 2423 } else { 2424 expand_prefix(bt, src, i, &tmpkey); 2425 } 2426 2427 key.size = tmpkey.len; 2428 key.data = tmpkey.str; 2429 2430 remove_prefix(bt, &key, dst->prefix.len); 2431 data.size = NODEDSZ(srcnode); 2432 data.data = NODEDATA(srcnode); 2433 rc = btree_add_node(bt, dst, NUMKEYS(dst), &key, 2434 &data, NODEPGNO(srcnode), srcnode->flags); 2435 if (rc != BT_SUCCESS) 2436 return rc; 2437 } 2438 2439 DPRINTF("dst page %u now has %lu keys (%.1f%% filled)", 2440 dst->pgno, NUMKEYS(dst), (float)PAGEFILL(bt, dst) / 10); 2441 2442 /* Unlink the src page from parent. 2443 */ 2444 btree_del_node(bt, src->parent, src->parent_index); 2445 if (src->parent_index == 0) { 2446 key.size = 0; 2447 if (btree_update_key(bt, src->parent, 0, &key) != BT_SUCCESS) 2448 return BT_FAIL; 2449 2450 pfxlen = src->prefix.len; 2451 find_common_prefix(bt, src); 2452 assert (src->prefix.len == pfxlen); 2453 } 2454 2455 if (IS_LEAF(src)) 2456 bt->meta.leaf_pages--; 2457 else 2458 bt->meta.branch_pages--; 2459 2460 return btree_rebalance(bt, src->parent); 2461 } 2462 2463 #define FILL_THRESHOLD 250 2464 2465 static int 2466 btree_rebalance(struct btree *bt, struct mpage *mp) 2467 { 2468 struct node *node; 2469 struct mpage *parent; 2470 struct mpage *root; 2471 struct mpage *neighbor = NULL; 2472 indx_t si = 0, di = 0; 2473 2474 assert(bt != NULL); 2475 assert(bt->txn != NULL); 2476 assert(mp != NULL); 2477 2478 DPRINTF("rebalancing %s page %u (has %lu keys, %.1f%% full)", 2479 IS_LEAF(mp) ? "leaf" : "branch", 2480 mp->pgno, NUMKEYS(mp), (float)PAGEFILL(bt, mp) / 10); 2481 2482 if (PAGEFILL(bt, mp) >= FILL_THRESHOLD) { 2483 DPRINTF("no need to rebalance page %u, above fill threshold", 2484 mp->pgno); 2485 return BT_SUCCESS; 2486 } 2487 2488 parent = mp->parent; 2489 2490 if (parent == NULL) { 2491 if (NUMKEYS(mp) == 0) { 2492 DPRINTF("tree is completely empty"); 2493 bt->txn->root = P_INVALID; 2494 bt->meta.depth--; 2495 bt->meta.leaf_pages--; 2496 } else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) { 2497 DPRINTF("collapsing root page!"); 2498 bt->txn->root = NODEPGNO(NODEPTR(mp, 0)); 2499 if ((root = btree_get_mpage(bt, bt->txn->root)) == NULL) 2500 return BT_FAIL; 2501 root->parent = NULL; 2502 bt->meta.depth--; 2503 bt->meta.branch_pages--; 2504 } else 2505 DPRINTF("root page doesn't need rebalancing"); 2506 return BT_SUCCESS; 2507 } 2508 2509 /* The parent (branch page) must have at least 2 pointers, 2510 * otherwise the tree is invalid. 2511 */ 2512 assert(NUMKEYS(parent) > 1); 2513 2514 /* Leaf page fill factor is below the threshold. 2515 * Try to move keys from left or right neighbor, or 2516 * merge with a neighbor page. 2517 */ 2518 2519 /* Find neighbors. 2520 */ 2521 if (mp->parent_index == 0) { 2522 /* We're the leftmost leaf in our parent. 2523 */ 2524 DPRINTF("reading right neighbor"); 2525 node = NODEPTR(parent, mp->parent_index + 1); 2526 if ((neighbor = btree_get_mpage(bt, NODEPGNO(node))) == NULL) 2527 return BT_FAIL; 2528 neighbor->parent_index = mp->parent_index + 1; 2529 si = 0; 2530 di = NUMKEYS(mp); 2531 } else { 2532 /* There is at least one neighbor to the left. 2533 */ 2534 DPRINTF("reading left neighbor"); 2535 node = NODEPTR(parent, mp->parent_index - 1); 2536 if ((neighbor = btree_get_mpage(bt, NODEPGNO(node))) == NULL) 2537 return BT_FAIL; 2538 neighbor->parent_index = mp->parent_index - 1; 2539 si = NUMKEYS(neighbor) - 1; 2540 di = 0; 2541 } 2542 neighbor->parent = parent; 2543 2544 DPRINTF("found neighbor page %u (%lu keys, %.1f%% full)", 2545 neighbor->pgno, NUMKEYS(neighbor), (float)PAGEFILL(bt, neighbor) / 10); 2546 2547 /* If the neighbor page is above threshold and has at least two 2548 * keys, move one key from it. 2549 * 2550 * Otherwise we should try to merge them, but that might not be 2551 * possible, even if both are below threshold, as prefix expansion 2552 * might make keys larger. FIXME: detect this 2553 */ 2554 if (PAGEFILL(bt, neighbor) >= FILL_THRESHOLD && NUMKEYS(neighbor) >= 2) 2555 return btree_move_node(bt, neighbor, si, mp, di); 2556 else { /* FIXME: if (has_enough_room()) */ 2557 if (mp->parent_index == 0) 2558 return btree_merge(bt, neighbor, mp); 2559 else 2560 return btree_merge(bt, mp, neighbor); 2561 } 2562 } 2563 2564 int 2565 btree_txn_del(struct btree *bt, struct btree_txn *txn, 2566 struct btval *key, struct btval *data) 2567 { 2568 int rc, exact, close_txn = 0; 2569 unsigned int ki; 2570 struct node *leaf; 2571 struct mpage *mp; 2572 2573 DPRINTF("========> delete key %.*s", (int)key->size, (char *)key->data); 2574 2575 assert(key != NULL); 2576 2577 if (bt != NULL && txn != NULL && bt != txn->bt) { 2578 errno = EINVAL; 2579 return BT_FAIL; 2580 } 2581 2582 if (txn != NULL && F_ISSET(txn->flags, BT_TXN_RDONLY)) { 2583 errno = EINVAL; 2584 return BT_FAIL; 2585 } 2586 2587 if (bt == NULL) { 2588 if (txn == NULL) { 2589 errno = EINVAL; 2590 return BT_FAIL; 2591 } 2592 bt = txn->bt; 2593 } 2594 2595 if (key->size == 0 || key->size > MAXKEYSIZE) { 2596 errno = EINVAL; 2597 return BT_FAIL; 2598 } 2599 2600 if (txn == NULL) { 2601 close_txn = 1; 2602 if ((txn = btree_txn_begin(bt, 0)) == NULL) 2603 return BT_FAIL; 2604 } 2605 2606 if ((rc = btree_search_page(bt, txn, key, NULL, 1, &mp)) != BT_SUCCESS) 2607 goto done; 2608 2609 leaf = btree_search_node(bt, mp, key, &exact, &ki); 2610 if (leaf == NULL || !exact) { 2611 errno = ENOENT; 2612 rc = BT_FAIL; 2613 goto done; 2614 } 2615 2616 if (data && (rc = btree_read_data(bt, NULL, leaf, data)) != BT_SUCCESS) 2617 goto done; 2618 2619 btree_del_node(bt, mp, ki); 2620 bt->meta.entries--; 2621 rc = btree_rebalance(bt, mp); 2622 if (rc != BT_SUCCESS) 2623 txn->flags |= BT_TXN_ERROR; 2624 2625 done: 2626 if (close_txn) { 2627 if (rc == BT_SUCCESS) 2628 rc = btree_txn_commit(txn); 2629 else 2630 btree_txn_abort(txn); 2631 } 2632 mpage_prune(bt); 2633 return rc; 2634 } 2635 2636 /* Reduce the length of the prefix separator <sep> to the minimum length that 2637 * still makes it uniquely distinguishable from <min>. 2638 * 2639 * <min> is guaranteed to be sorted less than <sep> 2640 * 2641 * On return, <sep> is modified to the minimum length. 2642 */ 2643 static void 2644 bt_reduce_separator(struct btree *bt, struct node *min, struct btval *sep) 2645 { 2646 size_t n = 0; 2647 char *p1; 2648 char *p2; 2649 2650 if (F_ISSET(bt->flags, BT_REVERSEKEY)) { 2651 2652 assert(sep->size > 0); 2653 2654 p1 = (char *)NODEKEY(min) + min->ksize - 1; 2655 p2 = (char *)sep->data + sep->size - 1; 2656 2657 while (p1 >= (char *)NODEKEY(min) && *p1 == *p2) { 2658 assert(p2 > (char *)sep->data); 2659 p1--; 2660 p2--; 2661 n++; 2662 } 2663 2664 sep->data = p2; 2665 sep->size = n + 1; 2666 } else { 2667 2668 assert(min->ksize > 0); 2669 assert(sep->size > 0); 2670 2671 p1 = (char *)NODEKEY(min); 2672 p2 = (char *)sep->data; 2673 2674 while (*p1 == *p2) { 2675 p1++; 2676 p2++; 2677 n++; 2678 if (n == min->ksize || n == sep->size) 2679 break; 2680 } 2681 2682 sep->size = n + 1; 2683 } 2684 2685 DPRINTF("reduced separator to [%.*s] > [%.*s]", 2686 (int)sep->size, (char *)sep->data, 2687 (int)min->ksize, (char *)NODEKEY(min)); 2688 } 2689 2690 /* Split page <*mpp>, and insert <key,(data|newpgno)> in either left or 2691 * right sibling, at index <*newindxp> (as if unsplit). Updates *mpp and 2692 * *newindxp with the actual values after split, ie if *mpp and *newindxp 2693 * refer to a node in the new right sibling page. 2694 */ 2695 static int 2696 btree_split(struct btree *bt, struct mpage **mpp, unsigned int *newindxp, 2697 struct btval *newkey, struct btval *newdata, pgno_t newpgno) 2698 { 2699 uint8_t flags; 2700 int rc = BT_SUCCESS, ins_new = 0; 2701 indx_t newindx; 2702 pgno_t pgno = 0; 2703 size_t orig_pfx_len, left_pfx_diff, right_pfx_diff, pfx_diff; 2704 unsigned int i, j, split_indx; 2705 struct node *node; 2706 struct mpage *pright, *p, *mp; 2707 struct btval sepkey, rkey, rdata; 2708 struct btkey tmpkey; 2709 struct page *copy; 2710 2711 assert(bt != NULL); 2712 assert(bt->txn != NULL); 2713 2714 mp = *mpp; 2715 newindx = *newindxp; 2716 2717 DPRINTF("-----> splitting %s page %u and adding [%.*s] at index %d", 2718 IS_LEAF(mp) ? "leaf" : "branch", mp->pgno, 2719 (int)newkey->size, (char *)newkey->data, *newindxp); 2720 DPRINTF("page %u has prefix [%.*s]", mp->pgno, 2721 (int)mp->prefix.len, (char *)mp->prefix.str); 2722 orig_pfx_len = mp->prefix.len; 2723 2724 if (mp->parent == NULL) { 2725 if ((mp->parent = btree_new_page(bt, P_BRANCH)) == NULL) 2726 return BT_FAIL; 2727 mp->parent_index = 0; 2728 bt->txn->root = mp->parent->pgno; 2729 DPRINTF("root split! new root = %u", mp->parent->pgno); 2730 bt->meta.depth++; 2731 2732 /* Add left (implicit) pointer. */ 2733 if (btree_add_node(bt, mp->parent, 0, NULL, NULL, 2734 mp->pgno, 0) != BT_SUCCESS) 2735 return BT_FAIL; 2736 } else { 2737 DPRINTF("parent branch page is %u", mp->parent->pgno); 2738 } 2739 2740 /* Create a right sibling. */ 2741 if ((pright = btree_new_page(bt, mp->page->flags)) == NULL) 2742 return BT_FAIL; 2743 pright->parent = mp->parent; 2744 pright->parent_index = mp->parent_index + 1; 2745 DPRINTF("new right sibling: page %u", pright->pgno); 2746 2747 /* Move half of the keys to the right sibling. */ 2748 if ((copy = malloc(bt->head.psize)) == NULL) 2749 return BT_FAIL; 2750 bcopy(mp->page, copy, bt->head.psize); 2751 assert(mp->ref == 0); /* XXX */ 2752 memset(&mp->page->ptrs, 0, bt->head.psize - PAGEHDRSZ); 2753 mp->page->lower = PAGEHDRSZ; 2754 mp->page->upper = bt->head.psize; 2755 2756 split_indx = NUMKEYSP(copy) / 2 + 1; 2757 2758 /* First find the separating key between the split pages. 2759 */ 2760 memset(&sepkey, 0, sizeof(sepkey)); 2761 if (newindx == split_indx) { 2762 sepkey.size = newkey->size; 2763 sepkey.data = newkey->data; 2764 remove_prefix(bt, &sepkey, mp->prefix.len); 2765 } else { 2766 node = NODEPTRP(copy, split_indx); 2767 sepkey.size = node->ksize; 2768 sepkey.data = NODEKEY(node); 2769 } 2770 2771 if (IS_LEAF(mp) && bt->cmp == NULL) { 2772 /* Find the smallest separator. */ 2773 /* Ref: Prefix B-trees, R. Bayer, K. Unterauer, 1977 */ 2774 node = NODEPTRP(copy, split_indx - 1); 2775 bt_reduce_separator(bt, node, &sepkey); 2776 } 2777 2778 /* Fix separator wrt parent prefix. */ 2779 if (bt->cmp == NULL) { 2780 tmpkey.len = sizeof(tmpkey.str); 2781 concat_prefix(bt, mp->prefix.str, mp->prefix.len, 2782 sepkey.data, sepkey.size, tmpkey.str, &tmpkey.len); 2783 sepkey.data = tmpkey.str; 2784 sepkey.size = tmpkey.len; 2785 } 2786 2787 DPRINTF("separator is [%.*s]", (int)sepkey.size, (char *)sepkey.data); 2788 2789 /* Copy separator key to the parent. 2790 */ 2791 if (SIZELEFT(pright->parent) < bt_branch_size(bt, &sepkey)) { 2792 rc = btree_split(bt, &pright->parent, &pright->parent_index, 2793 &sepkey, NULL, pright->pgno); 2794 2795 /* Right page might now have changed parent. 2796 * Check if left page also changed parent. 2797 */ 2798 if (pright->parent != mp->parent && 2799 mp->parent_index >= NUMKEYS(mp->parent)) { 2800 mp->parent = pright->parent; 2801 mp->parent_index = pright->parent_index - 1; 2802 } 2803 } else { 2804 remove_prefix(bt, &sepkey, pright->parent->prefix.len); 2805 rc = btree_add_node(bt, pright->parent, pright->parent_index, 2806 &sepkey, NULL, pright->pgno, 0); 2807 } 2808 if (rc != BT_SUCCESS) { 2809 free(copy); 2810 return BT_FAIL; 2811 } 2812 2813 /* Update prefix for right and left page, if the parent was split. 2814 */ 2815 find_common_prefix(bt, pright); 2816 assert(orig_pfx_len <= pright->prefix.len); 2817 right_pfx_diff = pright->prefix.len - orig_pfx_len; 2818 2819 find_common_prefix(bt, mp); 2820 assert(orig_pfx_len <= mp->prefix.len); 2821 left_pfx_diff = mp->prefix.len - orig_pfx_len; 2822 2823 for (i = j = 0; i <= NUMKEYSP(copy); j++) { 2824 if (i < split_indx) { 2825 /* Re-insert in left sibling. */ 2826 p = mp; 2827 pfx_diff = left_pfx_diff; 2828 } else { 2829 /* Insert in right sibling. */ 2830 if (i == split_indx) 2831 /* Reset insert index for right sibling. */ 2832 j = (i == newindx && ins_new); 2833 p = pright; 2834 pfx_diff = right_pfx_diff; 2835 } 2836 2837 if (i == newindx && !ins_new) { 2838 /* Insert the original entry that caused the split. */ 2839 rkey.data = newkey->data; 2840 rkey.size = newkey->size; 2841 if (IS_LEAF(mp)) { 2842 rdata.data = newdata->data; 2843 rdata.size = newdata->size; 2844 } else 2845 pgno = newpgno; 2846 flags = 0; 2847 pfx_diff = p->prefix.len; 2848 2849 ins_new = 1; 2850 2851 /* Update page and index for the new key. */ 2852 *newindxp = j; 2853 *mpp = p; 2854 } else if (i == NUMKEYSP(copy)) { 2855 break; 2856 } else { 2857 node = NODEPTRP(copy, i); 2858 rkey.data = NODEKEY(node); 2859 rkey.size = node->ksize; 2860 if (IS_LEAF(mp)) { 2861 rdata.data = NODEDATA(node); 2862 rdata.size = node->n_dsize; 2863 } else 2864 pgno = node->n_pgno; 2865 flags = node->flags; 2866 2867 i++; 2868 } 2869 2870 if (!IS_LEAF(mp) && j == 0) { 2871 /* First branch index doesn't need key data. */ 2872 rkey.size = 0; 2873 } else 2874 remove_prefix(bt, &rkey, pfx_diff); 2875 2876 rc = btree_add_node(bt, p, j, &rkey, &rdata, pgno,flags); 2877 } 2878 2879 free(copy); 2880 return rc; 2881 } 2882 2883 int 2884 btree_txn_put(struct btree *bt, struct btree_txn *txn, 2885 struct btval *key, struct btval *data, unsigned int flags) 2886 { 2887 int rc = BT_SUCCESS, exact, close_txn = 0; 2888 unsigned int ki; 2889 struct node *leaf; 2890 struct mpage *mp; 2891 struct btval xkey; 2892 2893 assert(key != NULL); 2894 assert(data != NULL); 2895 2896 if (bt != NULL && txn != NULL && bt != txn->bt) { 2897 errno = EINVAL; 2898 return BT_FAIL; 2899 } 2900 2901 if (txn != NULL && F_ISSET(txn->flags, BT_TXN_RDONLY)) { 2902 errno = EINVAL; 2903 return BT_FAIL; 2904 } 2905 2906 if (bt == NULL) { 2907 if (txn == NULL) { 2908 errno = EINVAL; 2909 return BT_FAIL; 2910 } 2911 bt = txn->bt; 2912 } 2913 2914 if (key->size == 0 || key->size > MAXKEYSIZE) { 2915 errno = EINVAL; 2916 return BT_FAIL; 2917 } 2918 2919 DPRINTF("==> put key %.*s, size %zu, data size %zu", 2920 (int)key->size, (char *)key->data, key->size, data->size); 2921 2922 if (txn == NULL) { 2923 close_txn = 1; 2924 if ((txn = btree_txn_begin(bt, 0)) == NULL) 2925 return BT_FAIL; 2926 } 2927 2928 rc = btree_search_page(bt, txn, key, NULL, 1, &mp); 2929 if (rc == BT_SUCCESS) { 2930 leaf = btree_search_node(bt, mp, key, &exact, &ki); 2931 if (leaf && exact) { 2932 if (F_ISSET(flags, BT_NOOVERWRITE)) { 2933 DPRINTF("duplicate key %.*s", 2934 (int)key->size, (char *)key->data); 2935 errno = EEXIST; 2936 rc = BT_FAIL; 2937 goto done; 2938 } 2939 btree_del_node(bt, mp, ki); 2940 } 2941 if (leaf == NULL) { /* append if not found */ 2942 ki = NUMKEYS(mp); 2943 DPRINTF("appending key at index %d", ki); 2944 } 2945 } else if (errno == ENOENT) { 2946 /* new file, just write a root leaf page */ 2947 DPRINTF("allocating new root leaf page"); 2948 if ((mp = btree_new_page(bt, P_LEAF)) == NULL) { 2949 rc = BT_FAIL; 2950 goto done; 2951 } 2952 txn->root = mp->pgno; 2953 bt->meta.depth++; 2954 ki = 0; 2955 } 2956 else 2957 goto done; 2958 2959 assert(IS_LEAF(mp)); 2960 DPRINTF("there are %lu keys, should insert new key at index %u", 2961 NUMKEYS(mp), ki); 2962 2963 /* Copy the key pointer as it is modified by the prefix code. The 2964 * caller might have malloc'ed the data. 2965 */ 2966 xkey.data = key->data; 2967 xkey.size = key->size; 2968 2969 if (SIZELEFT(mp) < bt_leaf_size(bt, key, data)) { 2970 rc = btree_split(bt, &mp, &ki, &xkey, data, P_INVALID); 2971 } else { 2972 /* There is room already in this leaf page. */ 2973 remove_prefix(bt, &xkey, mp->prefix.len); 2974 rc = btree_add_node(bt, mp, ki, &xkey, data, 0, 0); 2975 } 2976 2977 if (rc != BT_SUCCESS) 2978 txn->flags |= BT_TXN_ERROR; 2979 else 2980 bt->meta.entries++; 2981 2982 done: 2983 if (close_txn) { 2984 if (rc == BT_SUCCESS) 2985 rc = btree_txn_commit(txn); 2986 else 2987 btree_txn_abort(txn); 2988 } 2989 mpage_prune(bt); 2990 return rc; 2991 } 2992 2993 static pgno_t 2994 btree_compact_tree(struct btree *bt, pgno_t pgno, struct btree *btc) 2995 { 2996 ssize_t rc; 2997 indx_t i; 2998 pgno_t *pnext, next; 2999 struct node *node; 3000 struct page *p; 3001 struct mpage *mp; 3002 3003 /* Get the page and make a copy of it. 3004 */ 3005 if ((mp = btree_get_mpage(bt, pgno)) == NULL) 3006 return P_INVALID; 3007 if ((p = malloc(bt->head.psize)) == NULL) 3008 return P_INVALID; 3009 bcopy(mp->page, p, bt->head.psize); 3010 3011 /* Go through all nodes in the (copied) page and update the 3012 * page pointers. 3013 */ 3014 if (F_ISSET(p->flags, P_BRANCH)) { 3015 for (i = 0; i < NUMKEYSP(p); i++) { 3016 node = NODEPTRP(p, i); 3017 node->n_pgno = btree_compact_tree(bt, node->n_pgno, btc); 3018 if (node->n_pgno == P_INVALID) { 3019 free(p); 3020 return P_INVALID; 3021 } 3022 } 3023 } else if (F_ISSET(p->flags, P_LEAF)) { 3024 for (i = 0; i < NUMKEYSP(p); i++) { 3025 node = NODEPTRP(p, i); 3026 if (F_ISSET(node->flags, F_BIGDATA)) { 3027 bcopy(NODEDATA(node), &next, sizeof(next)); 3028 next = btree_compact_tree(bt, next, btc); 3029 if (next == P_INVALID) { 3030 free(p); 3031 return P_INVALID; 3032 } 3033 bcopy(&next, NODEDATA(node), sizeof(next)); 3034 } 3035 } 3036 } else if (F_ISSET(p->flags, P_OVERFLOW)) { 3037 pnext = &p->p_next_pgno; 3038 if (*pnext > 0) { 3039 *pnext = btree_compact_tree(bt, *pnext, btc); 3040 if (*pnext == P_INVALID) { 3041 free(p); 3042 return P_INVALID; 3043 } 3044 } 3045 } else 3046 assert(0); 3047 3048 pgno = p->pgno = btc->txn->next_pgno++; 3049 rc = write(btc->fd, p, bt->head.psize); 3050 free(p); 3051 if (rc != (ssize_t)bt->head.psize) 3052 return P_INVALID; 3053 mpage_prune(bt); 3054 return pgno; 3055 } 3056 3057 int 3058 btree_compact(struct btree *bt) 3059 { 3060 char *compact_path = NULL; 3061 struct btree *btc; 3062 struct btree_txn *txn, *txnc = NULL; 3063 int fd; 3064 pgno_t root; 3065 3066 assert(bt != NULL); 3067 3068 DPRINTF("compacting btree %p with path %s", bt, bt->path); 3069 3070 if (bt->path == NULL) { 3071 errno = EINVAL; 3072 return BT_FAIL; 3073 } 3074 3075 if ((txn = btree_txn_begin(bt, 0)) == NULL) 3076 return BT_FAIL; 3077 3078 if (asprintf(&compact_path, "%s.compact.XXXXXX", bt->path) == -1) { 3079 btree_txn_abort(txn); 3080 return BT_FAIL; 3081 } 3082 fd = mkstemp(compact_path); 3083 if (fd == -1) { 3084 free(compact_path); 3085 btree_txn_abort(txn); 3086 return BT_FAIL; 3087 } 3088 3089 if ((btc = btree_open_fd(fd, 0)) == NULL) 3090 goto failed; 3091 bcopy(&bt->meta, &btc->meta, sizeof(bt->meta)); 3092 btc->meta.revisions = 0; 3093 3094 if ((txnc = btree_txn_begin(btc, 0)) == NULL) 3095 goto failed; 3096 3097 if (bt->meta.root != P_INVALID) { 3098 root = btree_compact_tree(bt, bt->meta.root, btc); 3099 if (root == P_INVALID) 3100 goto failed; 3101 if (btree_write_meta(btc, root, 0) != BT_SUCCESS) 3102 goto failed; 3103 } 3104 3105 fsync(fd); 3106 3107 DPRINTF("renaming %s to %s", compact_path, bt->path); 3108 if (rename(compact_path, bt->path) != 0) 3109 goto failed; 3110 3111 /* Write a "tombstone" meta page so other processes can pick up 3112 * the change and re-open the file. 3113 */ 3114 if (btree_write_meta(bt, P_INVALID, BT_TOMBSTONE) != BT_SUCCESS) 3115 goto failed; 3116 3117 btree_txn_abort(txn); 3118 btree_txn_abort(txnc); 3119 free(compact_path); 3120 btree_close(btc); 3121 mpage_prune(bt); 3122 return 0; 3123 3124 failed: 3125 btree_txn_abort(txn); 3126 btree_txn_abort(txnc); 3127 unlink(compact_path); 3128 free(compact_path); 3129 btree_close(btc); 3130 mpage_prune(bt); 3131 return BT_FAIL; 3132 } 3133 3134 /* Reverts the last change. Truncates the file at the last root page. 3135 */ 3136 int 3137 btree_revert(struct btree *bt) 3138 { 3139 if (btree_read_meta(bt, NULL) != 0) 3140 return -1; 3141 3142 DPRINTF("truncating file at page %u", bt->meta.root); 3143 return ftruncate(bt->fd, bt->head.psize * bt->meta.root); 3144 } 3145 3146 void 3147 btree_set_cache_size(struct btree *bt, unsigned int cache_size) 3148 { 3149 bt->stat.max_cache = cache_size; 3150 } 3151 3152 unsigned int 3153 btree_get_flags(struct btree *bt) 3154 { 3155 return (bt->flags & ~BT_FIXPADDING); 3156 } 3157 3158 const char * 3159 btree_get_path(struct btree *bt) 3160 { 3161 return bt->path; 3162 } 3163 3164 const struct btree_stat * 3165 btree_stat(struct btree *bt) 3166 { 3167 if (bt == NULL) 3168 return NULL; 3169 3170 bt->stat.branch_pages = bt->meta.branch_pages; 3171 bt->stat.leaf_pages = bt->meta.leaf_pages; 3172 bt->stat.overflow_pages = bt->meta.overflow_pages; 3173 bt->stat.revisions = bt->meta.revisions; 3174 bt->stat.depth = bt->meta.depth; 3175 bt->stat.entries = bt->meta.entries; 3176 bt->stat.psize = bt->head.psize; 3177 bt->stat.created_at = bt->meta.created_at; 3178 3179 return &bt->stat; 3180 } 3181 3182