146134Smao /*- 261196Sbostic * Copyright (c) 1990, 1993 361196Sbostic * The Regents of the University of California. All rights reserved. 446134Smao * 546134Smao * This code is derived from software contributed to Berkeley by 646134Smao * Mike Olson. 746134Smao * 846134Smao * %sccs.include.redist.c% 946134Smao */ 1046134Smao 1146134Smao #if defined(LIBC_SCCS) && !defined(lint) 12*66214Sbostic static char sccsid[] = "@(#)bt_split.c 8.3 (Berkeley) 02/21/94"; 1346134Smao #endif /* LIBC_SCCS and not lint */ 1446134Smao 1546134Smao #include <sys/types.h> 1656741Sbostic 1750989Sbostic #include <limits.h> 1850989Sbostic #include <stdio.h> 1946561Sbostic #include <stdlib.h> 2046561Sbostic #include <string.h> 2156741Sbostic 2257932Sbostic #include <db.h> 2346134Smao #include "btree.h" 2446134Smao 2557440Sbostic static int bt_broot __P((BTREE *, PAGE *, PAGE *, PAGE *)); 2657451Sbostic static PAGE *bt_page 2766192Sbostic __P((BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t)); 2850989Sbostic static int bt_preserve __P((BTREE *, pgno_t)); 2957451Sbostic static PAGE *bt_psplit 3066192Sbostic __P((BTREE *, PAGE *, PAGE *, PAGE *, indx_t *, size_t)); 3157451Sbostic static PAGE *bt_root 3266192Sbostic __P((BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t)); 3350989Sbostic static int bt_rroot __P((BTREE *, PAGE *, PAGE *, PAGE *)); 3450989Sbostic static recno_t rec_total __P((PAGE *)); 3550989Sbostic 3650989Sbostic #ifdef STATISTICS 3750989Sbostic u_long bt_rootsplit, bt_split, bt_sortsplit, bt_pfxsaved; 3850989Sbostic #endif 3950989Sbostic 4046134Smao /* 4150989Sbostic * __BT_SPLIT -- Split the tree. 4246134Smao * 4350989Sbostic * Parameters: 4450989Sbostic * t: tree 4557440Sbostic * sp: page to split 4650989Sbostic * key: key to insert 4750989Sbostic * data: data to insert 4850989Sbostic * flags: BIGKEY/BIGDATA flags 4957451Sbostic * ilen: insert length 5050989Sbostic * skip: index to leave open 5146134Smao * 5250989Sbostic * Returns: 5350989Sbostic * RET_ERROR, RET_SUCCESS 5446134Smao */ 5546134Smao int 5657451Sbostic __bt_split(t, sp, key, data, flags, ilen, skip) 5750989Sbostic BTREE *t; 5857440Sbostic PAGE *sp; 5950989Sbostic const DBT *key, *data; 6066192Sbostic int flags; 6157451Sbostic size_t ilen; 6266192Sbostic indx_t skip; 6346134Smao { 6450989Sbostic BINTERNAL *bi; 6557451Sbostic BLEAF *bl, *tbl; 6650989Sbostic DBT a, b; 6750989Sbostic EPGNO *parent; 6857440Sbostic PAGE *h, *l, *r, *lchild, *rchild; 6957989Sbostic indx_t nxtindex; 7057451Sbostic size_t n, nbytes, nksize; 7160234Sbostic int parentsplit; 7250989Sbostic char *dest; 7346134Smao 7446134Smao /* 7550989Sbostic * Split the page into two pages, l and r. The split routines return 7657440Sbostic * a pointer to the page into which the key should be inserted and with 7757440Sbostic * skip set to the offset which should be used. Additionally, l and r 7857440Sbostic * are pinned. 7946134Smao */ 8057440Sbostic h = sp->pgno == P_ROOT ? 8157451Sbostic bt_root(t, sp, &l, &r, &skip, ilen) : 8257451Sbostic bt_page(t, sp, &l, &r, &skip, ilen); 8350989Sbostic if (h == NULL) 8446134Smao return (RET_ERROR); 8546134Smao 8650989Sbostic /* 8757440Sbostic * Insert the new key/data pair into the leaf page. (Key inserts 8857440Sbostic * always cause a leaf page to split first.) 8950989Sbostic */ 9057451Sbostic h->linp[skip] = h->upper -= ilen; 9150989Sbostic dest = (char *)h + h->upper; 9260048Sbostic if (ISSET(t, R_RECNO)) 9350989Sbostic WR_RLEAF(dest, data, flags) 9451106Sbostic else 9550989Sbostic WR_BLEAF(dest, key, data, flags) 9646134Smao 9757440Sbostic /* If the root page was split, make it look right. */ 9857440Sbostic if (sp->pgno == P_ROOT && 9960048Sbostic (ISSET(t, R_RECNO) ? 10057440Sbostic bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR) 10157440Sbostic goto err2; 10257440Sbostic 10346134Smao /* 10450989Sbostic * Now we walk the parent page stack -- a LIFO stack of the pages that 10550989Sbostic * were traversed when we searched for the page that split. Each stack 10650989Sbostic * entry is a page number and a page index offset. The offset is for 10750989Sbostic * the page traversed on the search. We've just split a page, so we 10850989Sbostic * have to insert a new key into the parent page. 10950989Sbostic * 11050989Sbostic * If the insert into the parent page causes it to split, may have to 11150989Sbostic * continue splitting all the way up the tree. We stop if the root 11250989Sbostic * splits or the page inserted into didn't have to split to hold the 11350989Sbostic * new key. Some algorithms replace the key for the old page as well 11450989Sbostic * as the new page. We don't, as there's no reason to believe that the 11550989Sbostic * first key on the old page is any better than the key we have, and, 11650989Sbostic * in the case of a key being placed at index 0 causing the split, the 11750989Sbostic * key is unavailable. 11850989Sbostic * 11950989Sbostic * There are a maximum of 5 pages pinned at any time. We keep the left 12050989Sbostic * and right pages pinned while working on the parent. The 5 are the 12150989Sbostic * two children, left parent and right parent (when the parent splits) 12250989Sbostic * and the root page or the overflow key page when calling bt_preserve. 12350989Sbostic * This code must make sure that all pins are released other than the 12450989Sbostic * root page or overflow page which is unlocked elsewhere. 12546134Smao */ 12660234Sbostic while ((parent = BT_POP(t)) != NULL) { 12750989Sbostic lchild = l; 12850989Sbostic rchild = r; 12946134Smao 13050989Sbostic /* Get the parent page. */ 13150989Sbostic if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL) 13250989Sbostic goto err2; 13346134Smao 13457451Sbostic /* 13557451Sbostic * The new key goes ONE AFTER the index, because the split 13657451Sbostic * was to the right. 13757451Sbostic */ 13850989Sbostic skip = parent->index + 1; 13946134Smao 14050989Sbostic /* 14150989Sbostic * Calculate the space needed on the parent page. 14250989Sbostic * 14357451Sbostic * Prefix trees: space hack when inserting into BINTERNAL 14457451Sbostic * pages. Retain only what's needed to distinguish between 14557451Sbostic * the new entry and the LAST entry on the page to its left. 14657451Sbostic * If the keys compare equal, retain the entire key. Note, 14757451Sbostic * we don't touch overflow keys, and the entire key must be 14857451Sbostic * retained for the next-to-left most key on the leftmost 14957451Sbostic * page of each level, or the search will fail. Applicable 15057451Sbostic * ONLY to internal pages that have leaf pages as children. 15157451Sbostic * Further reduction of the key between pairs of internal 15257451Sbostic * pages loses too much information. 15350989Sbostic */ 15450989Sbostic switch (rchild->flags & P_TYPE) { 15550989Sbostic case P_BINTERNAL: 15650989Sbostic bi = GETBINTERNAL(rchild, 0); 15750989Sbostic nbytes = NBINTERNAL(bi->ksize); 15850989Sbostic break; 15950989Sbostic case P_BLEAF: 16050989Sbostic bl = GETBLEAF(rchild, 0); 16156401Sbostic nbytes = NBINTERNAL(bl->ksize); 16257451Sbostic if (t->bt_pfx && !(bl->flags & P_BIGKEY) && 16357451Sbostic (h->prevpg != P_INVALID || skip > 1)) { 16450989Sbostic tbl = GETBLEAF(lchild, NEXTINDEX(lchild) - 1); 16550989Sbostic a.size = tbl->ksize; 16650989Sbostic a.data = tbl->bytes; 16750989Sbostic b.size = bl->ksize; 16850989Sbostic b.data = bl->bytes; 16957451Sbostic nksize = t->bt_pfx(&a, &b); 17050989Sbostic n = NBINTERNAL(nksize); 17150989Sbostic if (n < nbytes) { 17250989Sbostic #ifdef STATISTICS 17350989Sbostic bt_pfxsaved += nbytes - n; 17450989Sbostic #endif 17550989Sbostic nbytes = n; 17650989Sbostic } else 17750989Sbostic nksize = 0; 17850989Sbostic } else 17950989Sbostic nksize = 0; 18050989Sbostic break; 18150989Sbostic case P_RINTERNAL: 18250989Sbostic case P_RLEAF: 18350989Sbostic nbytes = NRINTERNAL; 18450989Sbostic break; 18556741Sbostic default: 18656741Sbostic abort(); 18750989Sbostic } 18850989Sbostic 18950989Sbostic /* Split the parent page if necessary or shift the indices. */ 19057989Sbostic if (h->upper - h->lower < nbytes + sizeof(indx_t)) { 19157440Sbostic sp = h; 19250989Sbostic h = h->pgno == P_ROOT ? 19357451Sbostic bt_root(t, h, &l, &r, &skip, nbytes) : 19457451Sbostic bt_page(t, h, &l, &r, &skip, nbytes); 19550989Sbostic if (h == NULL) 19650989Sbostic goto err1; 19760234Sbostic parentsplit = 1; 19846134Smao } else { 19950989Sbostic if (skip < (nxtindex = NEXTINDEX(h))) 20058017Sbostic memmove(h->linp + skip + 1, h->linp + skip, 20157989Sbostic (nxtindex - skip) * sizeof(indx_t)); 20257989Sbostic h->lower += sizeof(indx_t); 20360234Sbostic parentsplit = 0; 20446134Smao } 20546134Smao 20650989Sbostic /* Insert the key into the parent page. */ 20750989Sbostic switch(rchild->flags & P_TYPE) { 20850989Sbostic case P_BINTERNAL: 20950989Sbostic h->linp[skip] = h->upper -= nbytes; 21050989Sbostic dest = (char *)h + h->linp[skip]; 21158017Sbostic memmove(dest, bi, nbytes); 21250989Sbostic ((BINTERNAL *)dest)->pgno = rchild->pgno; 21350989Sbostic break; 21450989Sbostic case P_BLEAF: 21550989Sbostic h->linp[skip] = h->upper -= nbytes; 21650989Sbostic dest = (char *)h + h->linp[skip]; 21750989Sbostic WR_BINTERNAL(dest, nksize ? nksize : bl->ksize, 21851106Sbostic rchild->pgno, bl->flags & P_BIGKEY); 21958017Sbostic memmove(dest, bl->bytes, nksize ? nksize : bl->ksize); 22050989Sbostic if (bl->flags & P_BIGKEY && 22150989Sbostic bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR) 22250989Sbostic goto err1; 22350989Sbostic break; 22450989Sbostic case P_RINTERNAL: 22560234Sbostic /* 22660234Sbostic * Update the left page count. If split 22760234Sbostic * added at index 0, fix the correct page. 22860234Sbostic */ 22960234Sbostic if (skip > 0) 23060234Sbostic dest = (char *)h + h->linp[skip - 1]; 23160234Sbostic else 23260234Sbostic dest = (char *)l + l->linp[NEXTINDEX(l) - 1]; 23360234Sbostic ((RINTERNAL *)dest)->nrecs = rec_total(lchild); 23460234Sbostic ((RINTERNAL *)dest)->pgno = lchild->pgno; 23560234Sbostic 23660234Sbostic /* Update the right page count. */ 23750989Sbostic h->linp[skip] = h->upper -= nbytes; 23850989Sbostic dest = (char *)h + h->linp[skip]; 23950989Sbostic ((RINTERNAL *)dest)->nrecs = rec_total(rchild); 24050989Sbostic ((RINTERNAL *)dest)->pgno = rchild->pgno; 24150989Sbostic break; 24250989Sbostic case P_RLEAF: 24360234Sbostic /* 24460234Sbostic * Update the left page count. If split 24560234Sbostic * added at index 0, fix the correct page. 24660234Sbostic */ 24760234Sbostic if (skip > 0) 24860234Sbostic dest = (char *)h + h->linp[skip - 1]; 24960234Sbostic else 25060234Sbostic dest = (char *)l + l->linp[NEXTINDEX(l) - 1]; 25160234Sbostic ((RINTERNAL *)dest)->nrecs = NEXTINDEX(lchild); 25260234Sbostic ((RINTERNAL *)dest)->pgno = lchild->pgno; 25360234Sbostic 25460234Sbostic /* Update the right page count. */ 25550989Sbostic h->linp[skip] = h->upper -= nbytes; 25650989Sbostic dest = (char *)h + h->linp[skip]; 25750989Sbostic ((RINTERNAL *)dest)->nrecs = NEXTINDEX(rchild); 25850989Sbostic ((RINTERNAL *)dest)->pgno = rchild->pgno; 25950989Sbostic break; 26056741Sbostic default: 26156741Sbostic abort(); 26250989Sbostic } 26346134Smao 26450989Sbostic /* Unpin the held pages. */ 26560234Sbostic if (!parentsplit) { 26650989Sbostic mpool_put(t->bt_mp, h, MPOOL_DIRTY); 26750989Sbostic break; 26850989Sbostic } 26957440Sbostic 27057440Sbostic /* If the root page was split, make it look right. */ 27157440Sbostic if (sp->pgno == P_ROOT && 27260048Sbostic (ISSET(t, R_RECNO) ? 27357440Sbostic bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR) 27457440Sbostic goto err1; 27557440Sbostic 27650989Sbostic mpool_put(t->bt_mp, lchild, MPOOL_DIRTY); 27750989Sbostic mpool_put(t->bt_mp, rchild, MPOOL_DIRTY); 27850989Sbostic } 27946134Smao 28050989Sbostic /* Unpin the held pages. */ 28150989Sbostic mpool_put(t->bt_mp, l, MPOOL_DIRTY); 28250989Sbostic mpool_put(t->bt_mp, r, MPOOL_DIRTY); 28350989Sbostic 28451106Sbostic /* Clear any pages left on the stack. */ 28550989Sbostic return (RET_SUCCESS); 28646134Smao 28746134Smao /* 28850989Sbostic * If something fails in the above loop we were already walking back 28950989Sbostic * up the tree and the tree is now inconsistent. Nothing much we can 29050989Sbostic * do about it but release any memory we're holding. 29146134Smao */ 29250989Sbostic err1: mpool_put(t->bt_mp, lchild, MPOOL_DIRTY); 29350989Sbostic mpool_put(t->bt_mp, rchild, MPOOL_DIRTY); 29446134Smao 29550989Sbostic err2: mpool_put(t->bt_mp, l, 0); 29650989Sbostic mpool_put(t->bt_mp, r, 0); 29750989Sbostic __dbpanic(t->bt_dbp); 29850989Sbostic return (RET_ERROR); 29946134Smao } 30046134Smao 30146134Smao /* 30250989Sbostic * BT_PAGE -- Split a non-root page of a btree. 30346134Smao * 30450989Sbostic * Parameters: 30550989Sbostic * t: tree 30650989Sbostic * h: root page 30750989Sbostic * lp: pointer to left page pointer 30850989Sbostic * rp: pointer to right page pointer 30950989Sbostic * skip: pointer to index to leave open 31057451Sbostic * ilen: insert length 31146134Smao * 31250989Sbostic * Returns: 31350989Sbostic * Pointer to page in which to insert or NULL on error. 31446134Smao */ 31550989Sbostic static PAGE * 31657451Sbostic bt_page(t, h, lp, rp, skip, ilen) 31750989Sbostic BTREE *t; 31850989Sbostic PAGE *h, **lp, **rp; 31966192Sbostic indx_t *skip; 32057451Sbostic size_t ilen; 32146134Smao { 32250989Sbostic PAGE *l, *r, *tp; 32350989Sbostic pgno_t npg; 32446134Smao 32550989Sbostic #ifdef STATISTICS 32650989Sbostic ++bt_split; 32750989Sbostic #endif 32850989Sbostic /* Put the new right page for the split into place. */ 32956492Sbostic if ((r = __bt_new(t, &npg)) == NULL) 33050989Sbostic return (NULL); 33150989Sbostic r->pgno = npg; 33250989Sbostic r->lower = BTDATAOFF; 33350989Sbostic r->upper = t->bt_psize; 33450989Sbostic r->nextpg = h->nextpg; 33550989Sbostic r->prevpg = h->pgno; 33650989Sbostic r->flags = h->flags & P_TYPE; 33746134Smao 33850989Sbostic /* 33950989Sbostic * If we're splitting the last page on a level because we're appending 34050989Sbostic * a key to it (skip is NEXTINDEX()), it's likely that the data is 34150989Sbostic * sorted. Adding an empty page on the side of the level is less work 34250989Sbostic * and can push the fill factor much higher than normal. If we're 34350989Sbostic * wrong it's no big deal, we'll just do the split the right way next 34450989Sbostic * time. It may look like it's equally easy to do a similar hack for 34550989Sbostic * reverse sorted data, that is, split the tree left, but it's not. 34650989Sbostic * Don't even try. 34750989Sbostic */ 34850989Sbostic if (h->nextpg == P_INVALID && *skip == NEXTINDEX(h)) { 34950989Sbostic #ifdef STATISTICS 35050989Sbostic ++bt_sortsplit; 35150989Sbostic #endif 35250989Sbostic h->nextpg = r->pgno; 35357989Sbostic r->lower = BTDATAOFF + sizeof(indx_t); 35450989Sbostic *skip = 0; 35550989Sbostic *lp = h; 35650989Sbostic *rp = r; 35750989Sbostic return (r); 35850989Sbostic } 35946134Smao 36050989Sbostic /* Put the new left page for the split into place. */ 361*66214Sbostic if ((l = (PAGE *)malloc(t->bt_psize)) == NULL) { 36250989Sbostic mpool_put(t->bt_mp, r, 0); 36350989Sbostic return (NULL); 36450989Sbostic } 36550989Sbostic l->pgno = h->pgno; 36650989Sbostic l->nextpg = r->pgno; 36750989Sbostic l->prevpg = h->prevpg; 36850989Sbostic l->lower = BTDATAOFF; 36950989Sbostic l->upper = t->bt_psize; 37050989Sbostic l->flags = h->flags & P_TYPE; 37146134Smao 37250989Sbostic /* Fix up the previous pointer of the page after the split page. */ 37350989Sbostic if (h->nextpg != P_INVALID) { 37450989Sbostic if ((tp = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) { 37550989Sbostic free(l); 37650989Sbostic /* XXX mpool_free(t->bt_mp, r->pgno); */ 37750989Sbostic return (NULL); 37846134Smao } 37950989Sbostic tp->prevpg = r->pgno; 38050989Sbostic mpool_put(t->bt_mp, tp, 0); 38150989Sbostic } 38246134Smao 38350989Sbostic /* 38450989Sbostic * Split right. The key/data pairs aren't sorted in the btree page so 38550989Sbostic * it's simpler to copy the data from the split page onto two new pages 38650989Sbostic * instead of copying half the data to the right page and compacting 38750989Sbostic * the left page in place. Since the left page can't change, we have 38850989Sbostic * to swap the original and the allocated left page after the split. 38950989Sbostic */ 39057451Sbostic tp = bt_psplit(t, h, l, r, skip, ilen); 39146134Smao 39250989Sbostic /* Move the new left page onto the old left page. */ 39358017Sbostic memmove(h, l, t->bt_psize); 39450989Sbostic if (tp == l) 39550989Sbostic tp = h; 39650989Sbostic free(l); 39750989Sbostic 39850989Sbostic *lp = h; 39950989Sbostic *rp = r; 40050989Sbostic return (tp); 40146134Smao } 40246134Smao 40346134Smao /* 40451106Sbostic * BT_ROOT -- Split the root page of a btree. 40546134Smao * 40650989Sbostic * Parameters: 40750989Sbostic * t: tree 40850989Sbostic * h: root page 40950989Sbostic * lp: pointer to left page pointer 41050989Sbostic * rp: pointer to right page pointer 41150989Sbostic * skip: pointer to index to leave open 41257451Sbostic * ilen: insert length 41346134Smao * 41450989Sbostic * Returns: 41550989Sbostic * Pointer to page in which to insert or NULL on error. 41646134Smao */ 41750989Sbostic static PAGE * 41857451Sbostic bt_root(t, h, lp, rp, skip, ilen) 41950989Sbostic BTREE *t; 42050989Sbostic PAGE *h, **lp, **rp; 42166192Sbostic indx_t *skip; 42257451Sbostic size_t ilen; 42346134Smao { 42450989Sbostic PAGE *l, *r, *tp; 42550989Sbostic pgno_t lnpg, rnpg; 42646134Smao 42750989Sbostic #ifdef STATISTICS 42850989Sbostic ++bt_split; 42950989Sbostic ++bt_rootsplit; 43050989Sbostic #endif 43150989Sbostic /* Put the new left and right pages for the split into place. */ 43256492Sbostic if ((l = __bt_new(t, &lnpg)) == NULL || 43356492Sbostic (r = __bt_new(t, &rnpg)) == NULL) 43450989Sbostic return (NULL); 43550989Sbostic l->pgno = lnpg; 43650989Sbostic r->pgno = rnpg; 43750989Sbostic l->nextpg = r->pgno; 43850989Sbostic r->prevpg = l->pgno; 43950989Sbostic l->prevpg = r->nextpg = P_INVALID; 44050989Sbostic l->lower = r->lower = BTDATAOFF; 44150989Sbostic l->upper = r->upper = t->bt_psize; 44250989Sbostic l->flags = r->flags = h->flags & P_TYPE; 44350989Sbostic 44450989Sbostic /* Split the root page. */ 44557451Sbostic tp = bt_psplit(t, h, l, r, skip, ilen); 44650989Sbostic 44750989Sbostic *lp = l; 44850989Sbostic *rp = r; 44950989Sbostic return (tp); 45046134Smao } 45146134Smao 45246134Smao /* 45357440Sbostic * BT_RROOT -- Fix up the recno root page after it has been split. 45446134Smao * 45550989Sbostic * Parameters: 45650989Sbostic * t: tree 45750989Sbostic * h: root page 45857440Sbostic * l: left page 45957440Sbostic * r: right page 46046134Smao * 46150989Sbostic * Returns: 46250989Sbostic * RET_ERROR, RET_SUCCESS 46346134Smao */ 46450989Sbostic static int 46550989Sbostic bt_rroot(t, h, l, r) 46650989Sbostic BTREE *t; 46750989Sbostic PAGE *h, *l, *r; 46846134Smao { 46950989Sbostic char *dest; 47046134Smao 47150989Sbostic /* Insert the left and right keys, set the header information. */ 47250989Sbostic h->linp[0] = h->upper = t->bt_psize - NRINTERNAL; 47350989Sbostic dest = (char *)h + h->upper; 47450989Sbostic WR_RINTERNAL(dest, 47550989Sbostic l->flags & P_RLEAF ? NEXTINDEX(l) : rec_total(l), l->pgno); 47646134Smao 47750989Sbostic h->linp[1] = h->upper -= NRINTERNAL; 47850989Sbostic dest = (char *)h + h->upper; 47950989Sbostic WR_RINTERNAL(dest, 48050989Sbostic r->flags & P_RLEAF ? NEXTINDEX(r) : rec_total(r), r->pgno); 48146134Smao 48257989Sbostic h->lower = BTDATAOFF + 2 * sizeof(indx_t); 48346134Smao 48450989Sbostic /* Unpin the root page, set to recno internal page. */ 48550989Sbostic h->flags &= ~P_TYPE; 48650989Sbostic h->flags |= P_RINTERNAL; 48750989Sbostic mpool_put(t->bt_mp, h, MPOOL_DIRTY); 48846134Smao 48950989Sbostic return (RET_SUCCESS); 49050989Sbostic } 49146134Smao 49250989Sbostic /* 49357440Sbostic * BT_BROOT -- Fix up the btree root page after it has been split. 49450989Sbostic * 49550989Sbostic * Parameters: 49650989Sbostic * t: tree 49750989Sbostic * h: root page 49857440Sbostic * l: left page 49957440Sbostic * r: right page 50050989Sbostic * 50150989Sbostic * Returns: 50250989Sbostic * RET_ERROR, RET_SUCCESS 50350989Sbostic */ 50450989Sbostic static int 50550989Sbostic bt_broot(t, h, l, r) 50650989Sbostic BTREE *t; 50750989Sbostic PAGE *h, *l, *r; 50850989Sbostic { 50950989Sbostic BINTERNAL *bi; 51050989Sbostic BLEAF *bl; 51150989Sbostic size_t nbytes; 51250989Sbostic char *dest; 51346134Smao 51446134Smao /* 51550989Sbostic * If the root page was a leaf page, change it into an internal page. 51650989Sbostic * We copy the key we split on (but not the key's data, in the case of 51756401Sbostic * a leaf page) to the new root page. 51850989Sbostic * 51950989Sbostic * The btree comparison code guarantees that the left-most key on any 52057440Sbostic * level of the tree is never used, so it doesn't need to be filled in. 52146134Smao */ 52256401Sbostic nbytes = NBINTERNAL(0); 52350989Sbostic h->linp[0] = h->upper = t->bt_psize - nbytes; 52450989Sbostic dest = (char *)h + h->upper; 52550989Sbostic WR_BINTERNAL(dest, 0, l->pgno, 0); 52646134Smao 52750989Sbostic switch(h->flags & P_TYPE) { 52850989Sbostic case P_BLEAF: 52950989Sbostic bl = GETBLEAF(r, 0); 53050989Sbostic nbytes = NBINTERNAL(bl->ksize); 53150989Sbostic h->linp[1] = h->upper -= nbytes; 53250989Sbostic dest = (char *)h + h->upper; 53350989Sbostic WR_BINTERNAL(dest, bl->ksize, r->pgno, 0); 53458017Sbostic memmove(dest, bl->bytes, bl->ksize); 53550989Sbostic 53656401Sbostic /* 53756401Sbostic * If the key is on an overflow page, mark the overflow chain 53856401Sbostic * so it isn't deleted when the leaf copy of the key is deleted. 53956401Sbostic */ 54050989Sbostic if (bl->flags & P_BIGKEY && 54150989Sbostic bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR) 54250989Sbostic return (RET_ERROR); 54350989Sbostic break; 54450989Sbostic case P_BINTERNAL: 54550989Sbostic bi = GETBINTERNAL(r, 0); 54650989Sbostic nbytes = NBINTERNAL(bi->ksize); 54750989Sbostic h->linp[1] = h->upper -= nbytes; 54850989Sbostic dest = (char *)h + h->upper; 54958017Sbostic memmove(dest, bi, nbytes); 55050989Sbostic ((BINTERNAL *)dest)->pgno = r->pgno; 55150989Sbostic break; 55256741Sbostic default: 55356741Sbostic abort(); 55446134Smao } 55557440Sbostic 55657440Sbostic /* There are two keys on the page. */ 55757989Sbostic h->lower = BTDATAOFF + 2 * sizeof(indx_t); 55846134Smao 55950989Sbostic /* Unpin the root page, set to btree internal page. */ 56050989Sbostic h->flags &= ~P_TYPE; 56150989Sbostic h->flags |= P_BINTERNAL; 56250989Sbostic mpool_put(t->bt_mp, h, MPOOL_DIRTY); 56346134Smao 56446134Smao return (RET_SUCCESS); 56546134Smao } 56646134Smao 56746134Smao /* 56850989Sbostic * BT_PSPLIT -- Do the real work of splitting the page. 56946134Smao * 57050989Sbostic * Parameters: 57150989Sbostic * t: tree 57250989Sbostic * h: page to be split 57350989Sbostic * l: page to put lower half of data 57450989Sbostic * r: page to put upper half of data 57557440Sbostic * pskip: pointer to index to leave open 57657451Sbostic * ilen: insert length 57746134Smao * 57850989Sbostic * Returns: 57950989Sbostic * Pointer to page in which to insert. 58046134Smao */ 58150989Sbostic static PAGE * 58257451Sbostic bt_psplit(t, h, l, r, pskip, ilen) 58350989Sbostic BTREE *t; 58450989Sbostic PAGE *h, *l, *r; 58566192Sbostic indx_t *pskip; 58657451Sbostic size_t ilen; 58746134Smao { 58850989Sbostic BINTERNAL *bi; 58950989Sbostic BLEAF *bl; 59050989Sbostic RLEAF *rl; 59150989Sbostic EPGNO *c; 59250989Sbostic PAGE *rval; 59358067Sbostic void *src; 59458067Sbostic indx_t full, half, nxt, off, skip, top, used; 59550989Sbostic size_t nbytes; 59658067Sbostic int bigkeycnt, isbigkey; 59746134Smao 59846134Smao /* 59957451Sbostic * Split the data to the left and right pages. Leave the skip index 60057451Sbostic * open. Additionally, make some effort not to split on an overflow 60157451Sbostic * key. This makes internal page processing faster and can save 60257451Sbostic * space as overflow keys used by internal pages are never deleted. 60346134Smao */ 60450989Sbostic bigkeycnt = 0; 60551106Sbostic skip = *pskip; 60657451Sbostic full = t->bt_psize - BTDATAOFF; 60757451Sbostic half = full / 2; 60857451Sbostic used = 0; 60950989Sbostic for (nxt = off = 0, top = NEXTINDEX(h); nxt < top; ++off) { 61057451Sbostic if (skip == off) { 61157451Sbostic nbytes = ilen; 61257451Sbostic isbigkey = 0; /* XXX: not really known. */ 61357451Sbostic } else 61457451Sbostic switch (h->flags & P_TYPE) { 61557451Sbostic case P_BINTERNAL: 61657451Sbostic src = bi = GETBINTERNAL(h, nxt); 61757451Sbostic nbytes = NBINTERNAL(bi->ksize); 61857451Sbostic isbigkey = bi->flags & P_BIGKEY; 61957451Sbostic break; 62057451Sbostic case P_BLEAF: 62157451Sbostic src = bl = GETBLEAF(h, nxt); 62257451Sbostic nbytes = NBLEAF(bl); 62357451Sbostic isbigkey = bl->flags & P_BIGKEY; 62457451Sbostic break; 62557451Sbostic case P_RINTERNAL: 62657451Sbostic src = GETRINTERNAL(h, nxt); 62757451Sbostic nbytes = NRINTERNAL; 62857451Sbostic isbigkey = 0; 62957451Sbostic break; 63057451Sbostic case P_RLEAF: 63157451Sbostic src = rl = GETRLEAF(h, nxt); 63257451Sbostic nbytes = NRLEAF(rl); 63357451Sbostic isbigkey = 0; 63457451Sbostic break; 63557451Sbostic default: 63657451Sbostic abort(); 63757451Sbostic } 63857451Sbostic 63957451Sbostic /* 64057451Sbostic * If the key/data pairs are substantial fractions of the max 64157451Sbostic * possible size for the page, it's possible to get situations 64257451Sbostic * where we decide to try and copy too much onto the left page. 64357451Sbostic * Make sure that doesn't happen. 64457451Sbostic */ 64557451Sbostic if (skip <= off && used + nbytes >= full) { 64657451Sbostic --off; 64750989Sbostic break; 64850989Sbostic } 64946134Smao 65057451Sbostic /* Copy the key/data pair, if not the skipped index. */ 65157451Sbostic if (skip != off) { 65257451Sbostic ++nxt; 65357451Sbostic 65457451Sbostic l->linp[off] = l->upper -= nbytes; 65558017Sbostic memmove((char *)l + l->upper, src, nbytes); 65657451Sbostic } 65757451Sbostic 65857451Sbostic used += nbytes; 65957451Sbostic if (used >= half) { 66057440Sbostic if (!isbigkey || bigkeycnt == 3) 66157440Sbostic break; 66257440Sbostic else 66357440Sbostic ++bigkeycnt; 66457451Sbostic } 66550989Sbostic } 66657451Sbostic 66757451Sbostic /* 66857451Sbostic * Off is the last offset that's valid for the left page. 66957451Sbostic * Nxt is the first offset to be placed on the right page. 67057451Sbostic */ 67157989Sbostic l->lower += (off + 1) * sizeof(indx_t); 67246134Smao 67350989Sbostic /* 67451106Sbostic * If splitting the page that the cursor was on, the cursor has to be 67551106Sbostic * adjusted to point to the same record as before the split. If the 67657451Sbostic * cursor is at or past the skipped slot, the cursor is incremented by 67757451Sbostic * one. If the cursor is on the right page, it is decremented by the 67857451Sbostic * number of records split to the left page. 67951106Sbostic * 68060048Sbostic * Don't bother checking for the B_SEQINIT flag, the page number will 68151106Sbostic * be P_INVALID. 68250989Sbostic */ 68350989Sbostic c = &t->bt_bcursor; 68457451Sbostic if (c->pgno == h->pgno) { 68557451Sbostic if (c->index >= skip) 68657451Sbostic ++c->index; 68757451Sbostic if (c->index < nxt) /* Left page. */ 68850989Sbostic c->pgno = l->pgno; 68957451Sbostic else { /* Right page. */ 69050989Sbostic c->pgno = r->pgno; 69157451Sbostic c->index -= nxt; 69246134Smao } 69357451Sbostic } 69446134Smao 69550989Sbostic /* 69657451Sbostic * If the skipped index was on the left page, just return that page. 69757451Sbostic * Otherwise, adjust the skip index to reflect the new position on 69857451Sbostic * the right page. 69950989Sbostic */ 70057451Sbostic if (skip <= off) { 70157451Sbostic skip = 0; 70257451Sbostic rval = l; 70357451Sbostic } else { 70450989Sbostic rval = r; 70557451Sbostic *pskip -= nxt; 70657451Sbostic } 70746134Smao 70850989Sbostic for (off = 0; nxt < top; ++off) { 70951106Sbostic if (skip == nxt) { 71057451Sbostic ++off; 71151106Sbostic skip = 0; 71246134Smao } 71350989Sbostic switch (h->flags & P_TYPE) { 71450989Sbostic case P_BINTERNAL: 71550989Sbostic src = bi = GETBINTERNAL(h, nxt); 71650989Sbostic nbytes = NBINTERNAL(bi->ksize); 71750989Sbostic break; 71850989Sbostic case P_BLEAF: 71950989Sbostic src = bl = GETBLEAF(h, nxt); 72050989Sbostic nbytes = NBLEAF(bl); 72150989Sbostic break; 72250989Sbostic case P_RINTERNAL: 72350989Sbostic src = GETRINTERNAL(h, nxt); 72450989Sbostic nbytes = NRINTERNAL; 72550989Sbostic break; 72650989Sbostic case P_RLEAF: 72750989Sbostic src = rl = GETRLEAF(h, nxt); 72850989Sbostic nbytes = NRLEAF(rl); 72950989Sbostic break; 73056741Sbostic default: 73156741Sbostic abort(); 73250989Sbostic } 73350989Sbostic ++nxt; 73450989Sbostic r->linp[off] = r->upper -= nbytes; 73558017Sbostic memmove((char *)r + r->upper, src, nbytes); 73650989Sbostic } 73757989Sbostic r->lower += off * sizeof(indx_t); 73846134Smao 73950989Sbostic /* If the key is being appended to the page, adjust the index. */ 74051106Sbostic if (skip == top) 74157989Sbostic r->lower += sizeof(indx_t); 74246134Smao 74350989Sbostic return (rval); 74450989Sbostic } 74546134Smao 74650989Sbostic /* 74750989Sbostic * BT_PRESERVE -- Mark a chain of pages as used by an internal node. 74850989Sbostic * 74950989Sbostic * Chains of indirect blocks pointed to by leaf nodes get reclaimed when the 75050989Sbostic * record that references them gets deleted. Chains pointed to by internal 75150989Sbostic * pages never get deleted. This routine marks a chain as pointed to by an 75250989Sbostic * internal page. 75350989Sbostic * 75450989Sbostic * Parameters: 75550989Sbostic * t: tree 75650989Sbostic * pg: page number of first page in the chain. 75750989Sbostic * 75850989Sbostic * Returns: 75950989Sbostic * RET_SUCCESS, RET_ERROR. 76050989Sbostic */ 76150989Sbostic static int 76250989Sbostic bt_preserve(t, pg) 76350989Sbostic BTREE *t; 76450989Sbostic pgno_t pg; 76550989Sbostic { 76650989Sbostic PAGE *h; 76746134Smao 76850989Sbostic if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 76950989Sbostic return (RET_ERROR); 77050989Sbostic h->flags |= P_PRESERVE; 77150989Sbostic mpool_put(t->bt_mp, h, MPOOL_DIRTY); 77246134Smao return (RET_SUCCESS); 77346134Smao } 77450989Sbostic 77550989Sbostic /* 77650989Sbostic * REC_TOTAL -- Return the number of recno entries below a page. 77750989Sbostic * 77850989Sbostic * Parameters: 77950989Sbostic * h: page 78050989Sbostic * 78150989Sbostic * Returns: 78250989Sbostic * The number of recno entries below a page. 78350989Sbostic * 78450989Sbostic * XXX 78550989Sbostic * These values could be set by the bt_psplit routine. The problem is that the 78650989Sbostic * entry has to be popped off of the stack etc. or the values have to be passed 78750989Sbostic * all the way back to bt_split/bt_rroot and it's not very clean. 78850989Sbostic */ 78950989Sbostic static recno_t 79050989Sbostic rec_total(h) 79150989Sbostic PAGE *h; 79250989Sbostic { 79350989Sbostic recno_t recs; 79457989Sbostic indx_t nxt, top; 79550989Sbostic 79650989Sbostic for (recs = 0, nxt = 0, top = NEXTINDEX(h); nxt < top; ++nxt) 79750989Sbostic recs += GETRINTERNAL(h, nxt)->nrecs; 79850989Sbostic return (recs); 79950989Sbostic } 800