xref: /csrg-svn/lib/libc/db/btree/bt_split.c (revision 57440)
146134Smao /*-
246134Smao  * Copyright (c) 1990 The Regents of the University of California.
346134Smao  * All rights reserved.
446134Smao  *
546134Smao  * This code is derived from software contributed to Berkeley by
646134Smao  * Mike Olson.
746134Smao  *
846134Smao  * %sccs.include.redist.c%
946134Smao  */
1046134Smao 
1146134Smao #if defined(LIBC_SCCS) && !defined(lint)
12*57440Sbostic static char sccsid[] = "@(#)bt_split.c	5.9 (Berkeley) 01/09/93";
1346134Smao #endif /* LIBC_SCCS and not lint */
1446134Smao 
1546134Smao #include <sys/types.h>
1656741Sbostic 
1750989Sbostic #define	__DBINTERFACE_PRIVATE
1846134Smao #include <db.h>
1950989Sbostic #include <limits.h>
2050989Sbostic #include <stdio.h>
2146561Sbostic #include <stdlib.h>
2246561Sbostic #include <string.h>
2356741Sbostic 
2446134Smao #include "btree.h"
2546134Smao 
26*57440Sbostic static int	 bt_broot __P((BTREE *, PAGE *, PAGE *, PAGE *));
27*57440Sbostic static PAGE	*bt_page __P((BTREE *, PAGE *, PAGE **, PAGE **, int *));
2850989Sbostic static int	 bt_preserve __P((BTREE *, pgno_t));
2950989Sbostic static PAGE	*bt_psplit __P((BTREE *, PAGE *, PAGE *, PAGE *, int *));
3050989Sbostic static PAGE	*bt_root __P((BTREE *, PAGE *, PAGE **, PAGE **, int *));
3150989Sbostic static int	 bt_rroot __P((BTREE *, PAGE *, PAGE *, PAGE *));
3250989Sbostic static recno_t	 rec_total __P((PAGE *));
3350989Sbostic 
3450989Sbostic #ifdef STATISTICS
3550989Sbostic u_long	bt_rootsplit, bt_split, bt_sortsplit, bt_pfxsaved;
3650989Sbostic #endif
3750989Sbostic 
3846134Smao /*
3950989Sbostic  * __BT_SPLIT -- Split the tree.
4046134Smao  *
4150989Sbostic  * Parameters:
4250989Sbostic  *	t:	tree
43*57440Sbostic  *	sp:	page to split
4450989Sbostic  *	key:	key to insert
4550989Sbostic  *	data:	data to insert
4650989Sbostic  *	flags:	BIGKEY/BIGDATA flags
4750989Sbostic  *	nbytes:	length of insertion
4850989Sbostic  *	skip:	index to leave open
4946134Smao  *
5050989Sbostic  * Returns:
5150989Sbostic  *	RET_ERROR, RET_SUCCESS
5246134Smao  */
5346134Smao int
54*57440Sbostic __bt_split(t, sp, key, data, flags, nbytes, skip)
5550989Sbostic 	BTREE *t;
56*57440Sbostic 	PAGE *sp;
5750989Sbostic 	const DBT *key, *data;
5850989Sbostic 	u_long flags;
5950989Sbostic 	size_t nbytes;
6050989Sbostic 	int skip;
6146134Smao {
6250989Sbostic 	BINTERNAL *bi;
6350989Sbostic 	BLEAF *bl;
6450989Sbostic 	DBT a, b;
6550989Sbostic 	EPGNO *parent;
66*57440Sbostic 	PAGE *h, *l, *r, *lchild, *rchild;
6750989Sbostic 	index_t nxtindex;
6850989Sbostic 	size_t nksize;
6950989Sbostic 	int nosplit;
7050989Sbostic 	char *dest;
7146134Smao 
7246134Smao 	/*
7350989Sbostic 	 * Split the page into two pages, l and r.  The split routines return
74*57440Sbostic 	 * a pointer to the page into which the key should be inserted and with
75*57440Sbostic 	 * skip set to the offset which should be used.  Additionally, l and r
76*57440Sbostic 	 * are pinned.
7746134Smao 	 */
78*57440Sbostic 	h = sp->pgno == P_ROOT ?
79*57440Sbostic 	    bt_root(t, sp, &l, &r, &skip) : bt_page(t, sp, &l, &r, &skip);
8050989Sbostic 	if (h == NULL)
8146134Smao 		return (RET_ERROR);
8246134Smao 
8350989Sbostic 	/*
84*57440Sbostic 	 * Insert the new key/data pair into the leaf page.  (Key inserts
85*57440Sbostic 	 * always cause a leaf page to split first.)
8650989Sbostic 	 */
8750989Sbostic 	h->linp[skip] = h->upper -= nbytes;
8850989Sbostic 	dest = (char *)h + h->upper;
8951106Sbostic 	if (ISSET(t, BTF_RECNO))
9050989Sbostic 		WR_RLEAF(dest, data, flags)
9151106Sbostic 	else
9250989Sbostic 		WR_BLEAF(dest, key, data, flags)
9346134Smao 
94*57440Sbostic 	/* If the root page was split, make it look right. */
95*57440Sbostic 	if (sp->pgno == P_ROOT &&
96*57440Sbostic 	    (ISSET(t, BTF_RECNO) ?
97*57440Sbostic 	    bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
98*57440Sbostic 		goto err2;
99*57440Sbostic 
10046134Smao 	/*
10150989Sbostic 	 * Now we walk the parent page stack -- a LIFO stack of the pages that
10250989Sbostic 	 * were traversed when we searched for the page that split.  Each stack
10350989Sbostic 	 * entry is a page number and a page index offset.  The offset is for
10450989Sbostic 	 * the page traversed on the search.  We've just split a page, so we
10550989Sbostic 	 * have to insert a new key into the parent page.
10650989Sbostic 	 *
10750989Sbostic 	 * If the insert into the parent page causes it to split, may have to
10850989Sbostic 	 * continue splitting all the way up the tree.  We stop if the root
10950989Sbostic 	 * splits or the page inserted into didn't have to split to hold the
11050989Sbostic 	 * new key.  Some algorithms replace the key for the old page as well
11150989Sbostic 	 * as the new page.  We don't, as there's no reason to believe that the
11250989Sbostic 	 * first key on the old page is any better than the key we have, and,
11350989Sbostic 	 * in the case of a key being placed at index 0 causing the split, the
11450989Sbostic 	 * key is unavailable.
11550989Sbostic 	 *
11650989Sbostic 	 * There are a maximum of 5 pages pinned at any time.  We keep the left
11750989Sbostic 	 * and right pages pinned while working on the parent.   The 5 are the
11850989Sbostic 	 * two children, left parent and right parent (when the parent splits)
11950989Sbostic 	 * and the root page or the overflow key page when calling bt_preserve.
12050989Sbostic 	 * This code must make sure that all pins are released other than the
12150989Sbostic 	 * root page or overflow page which is unlocked elsewhere.
12246134Smao 	 */
12350989Sbostic 	for (nosplit = 0; (parent = BT_POP(t)) != NULL;) {
12450989Sbostic 		lchild = l;
12550989Sbostic 		rchild = r;
12646134Smao 
12750989Sbostic 		/* Get the parent page. */
12850989Sbostic 		if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
12950989Sbostic 			goto err2;
13046134Smao 
13150989Sbostic 	 	/* The new key goes ONE AFTER the index. */
13250989Sbostic 		skip = parent->index + 1;
13346134Smao 
13450989Sbostic 		/*
13550989Sbostic 		 * Calculate the space needed on the parent page.
13650989Sbostic 		 *
13751106Sbostic 		 * Space hack when inserting into BINTERNAL pages.  Only need to
13850989Sbostic 		 * retain the number of bytes that will distinguish between the
13951106Sbostic 		 * new entry and the LAST entry on the page to its left.  If the
14051106Sbostic 		 * keys compare equal, retain the entire key.  Note, we don't
14151106Sbostic 		 * touch overflow keys and the entire key must be retained for
14251106Sbostic 		 * the next-to-leftmost key on the leftmost page of each level,
14351106Sbostic 		 * or the search will fail.
14450989Sbostic 		 */
14550989Sbostic 		switch (rchild->flags & P_TYPE) {
14650989Sbostic 		case P_BINTERNAL:
14750989Sbostic 			bi = GETBINTERNAL(rchild, 0);
14850989Sbostic 			nbytes = NBINTERNAL(bi->ksize);
14950989Sbostic 			if (t->bt_pfx && (h->prevpg != P_INVALID || skip > 1) &&
15050989Sbostic 			    !(bi->flags & P_BIGKEY)) {
15150989Sbostic 				BINTERNAL *tbi;
15250989Sbostic 				tbi =
15350989Sbostic 				    GETBINTERNAL(lchild, NEXTINDEX(lchild) - 1);
15450989Sbostic 				a.size = tbi->ksize;
15550989Sbostic 				a.data = tbi->bytes;
15650989Sbostic 				b.size = bi->ksize;
15750989Sbostic 				b.data = bi->bytes;
15850989Sbostic 				goto prefix;
15956741Sbostic 			} else
16056741Sbostic 				nksize = 0;
16150989Sbostic 			break;
16250989Sbostic 		case P_BLEAF:
16350989Sbostic 			bl = GETBLEAF(rchild, 0);
16456401Sbostic 			nbytes = NBINTERNAL(bl->ksize);
16550989Sbostic 			if (t->bt_pfx && (h->prevpg != P_INVALID || skip > 1) &&
16650989Sbostic 			    !(bl->flags & P_BIGKEY)) {
16750989Sbostic 				BLEAF *tbl;
16850989Sbostic 				size_t n;
16950989Sbostic 
17050989Sbostic 				tbl = GETBLEAF(lchild, NEXTINDEX(lchild) - 1);
17150989Sbostic 				a.size = tbl->ksize;
17250989Sbostic 				a.data = tbl->bytes;
17350989Sbostic 				b.size = bl->ksize;
17450989Sbostic 				b.data = bl->bytes;
17550989Sbostic prefix:				nksize = t->bt_pfx(&a, &b);
17650989Sbostic 				n = NBINTERNAL(nksize);
17750989Sbostic 				if (n < nbytes) {
17850989Sbostic #ifdef STATISTICS
17950989Sbostic 					bt_pfxsaved += nbytes - n;
18050989Sbostic #endif
18150989Sbostic 					nbytes = n;
18250989Sbostic 				} else
18350989Sbostic 					nksize = 0;
18450989Sbostic 			} else
18550989Sbostic 				nksize = 0;
18650989Sbostic 			break;
18750989Sbostic 		case P_RINTERNAL:
18850989Sbostic 		case P_RLEAF:
18950989Sbostic 			nbytes = NRINTERNAL;
19050989Sbostic 			break;
19156741Sbostic 		default:
19256741Sbostic 			abort();
19350989Sbostic 		}
19450989Sbostic 
19550989Sbostic 		/* Split the parent page if necessary or shift the indices. */
19650989Sbostic 		if (h->upper - h->lower < nbytes + sizeof(index_t)) {
197*57440Sbostic 			sp = h;
19850989Sbostic 			h = h->pgno == P_ROOT ?
19950989Sbostic 			    bt_root(t, h, &l, &r, &skip) :
20050989Sbostic 			    bt_page(t, h, &l, &r, &skip);
20150989Sbostic 			if (h == NULL)
20250989Sbostic 				goto err1;
20346134Smao 		} else {
20450989Sbostic 			if (skip < (nxtindex = NEXTINDEX(h)))
20550989Sbostic 				bcopy(h->linp + skip, h->linp + skip + 1,
20650989Sbostic 				    (nxtindex - skip) * sizeof(index_t));
20750989Sbostic 			h->lower += sizeof(index_t);
20850989Sbostic 			nosplit = 1;
20946134Smao 		}
21046134Smao 
21150989Sbostic 		/* Insert the key into the parent page. */
21250989Sbostic 		switch(rchild->flags & P_TYPE) {
21350989Sbostic 		case P_BINTERNAL:
21450989Sbostic 			h->linp[skip] = h->upper -= nbytes;
21550989Sbostic 			dest = (char *)h + h->linp[skip];
21650989Sbostic 			bcopy(bi, dest, nbytes);
21750989Sbostic 			if (nksize)
21850989Sbostic 				((BINTERNAL *)dest)->ksize = nksize;
21950989Sbostic 			((BINTERNAL *)dest)->pgno = rchild->pgno;
22050989Sbostic 			break;
22150989Sbostic 		case P_BLEAF:
22250989Sbostic 			h->linp[skip] = h->upper -= nbytes;
22350989Sbostic 			dest = (char *)h + h->linp[skip];
22450989Sbostic 			WR_BINTERNAL(dest, nksize ? nksize : bl->ksize,
22551106Sbostic 			    rchild->pgno, bl->flags & P_BIGKEY);
22650989Sbostic 			bcopy(bl->bytes, dest, nksize ? nksize : bl->ksize);
22750989Sbostic 			if (bl->flags & P_BIGKEY &&
22850989Sbostic 			    bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR)
22950989Sbostic 				goto err1;
23050989Sbostic 			break;
23150989Sbostic 		case P_RINTERNAL:
23250989Sbostic 			/* Update both left and right page counts. */
23350989Sbostic 			h->linp[skip] = h->upper -= nbytes;
23450989Sbostic 			dest = (char *)h + h->linp[skip];
23550989Sbostic 			((RINTERNAL *)dest)->nrecs = rec_total(rchild);
23650989Sbostic 			((RINTERNAL *)dest)->pgno = rchild->pgno;
23750989Sbostic 			dest = (char *)h + h->linp[skip - 1];
23850989Sbostic 			((RINTERNAL *)dest)->nrecs = rec_total(lchild);
23950989Sbostic 			((RINTERNAL *)dest)->pgno = lchild->pgno;
24050989Sbostic 			break;
24150989Sbostic 		case P_RLEAF:
24250989Sbostic 			/* Update both left and right page counts. */
24350989Sbostic 			h->linp[skip] = h->upper -= nbytes;
24450989Sbostic 			dest = (char *)h + h->linp[skip];
24550989Sbostic 			((RINTERNAL *)dest)->nrecs = NEXTINDEX(rchild);
24650989Sbostic 			((RINTERNAL *)dest)->pgno = rchild->pgno;
24750989Sbostic 			dest = (char *)h + h->linp[skip - 1];
24850989Sbostic 			((RINTERNAL *)dest)->nrecs = NEXTINDEX(lchild);
24950989Sbostic 			((RINTERNAL *)dest)->pgno = lchild->pgno;
25050989Sbostic 			break;
25156741Sbostic 		default:
25256741Sbostic 			abort();
25350989Sbostic 		}
25446134Smao 
25550989Sbostic 		/* Unpin the held pages. */
25650989Sbostic 		if (nosplit) {
25750989Sbostic 			mpool_put(t->bt_mp, h, MPOOL_DIRTY);
25850989Sbostic 			break;
25950989Sbostic 		}
260*57440Sbostic 
261*57440Sbostic 		/* If the root page was split, make it look right. */
262*57440Sbostic 		if (sp->pgno == P_ROOT &&
263*57440Sbostic 		    (ISSET(t, BTF_RECNO) ?
264*57440Sbostic 		    bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
265*57440Sbostic 			goto err1;
266*57440Sbostic 
26750989Sbostic 		mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
26850989Sbostic 		mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
26950989Sbostic 	}
27046134Smao 
27150989Sbostic 	/* Unpin the held pages. */
27250989Sbostic 	mpool_put(t->bt_mp, l, MPOOL_DIRTY);
27350989Sbostic 	mpool_put(t->bt_mp, r, MPOOL_DIRTY);
27450989Sbostic 
27551106Sbostic 	/* Clear any pages left on the stack. */
27651106Sbostic 	BT_CLR(t);
27750989Sbostic 	return (RET_SUCCESS);
27846134Smao 
27946134Smao 	/*
28050989Sbostic 	 * If something fails in the above loop we were already walking back
28150989Sbostic 	 * up the tree and the tree is now inconsistent.  Nothing much we can
28250989Sbostic 	 * do about it but release any memory we're holding.
28346134Smao 	 */
28450989Sbostic err1:	mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
28550989Sbostic 	mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
28646134Smao 
28750989Sbostic err2:	mpool_put(t->bt_mp, l, 0);
28850989Sbostic 	mpool_put(t->bt_mp, r, 0);
28950989Sbostic 	__dbpanic(t->bt_dbp);
29050989Sbostic 	return (RET_ERROR);
29146134Smao }
29246134Smao 
29346134Smao /*
29450989Sbostic  * BT_PAGE -- Split a non-root page of a btree.
29546134Smao  *
29650989Sbostic  * Parameters:
29750989Sbostic  *	t:	tree
29850989Sbostic  *	h:	root page
29950989Sbostic  *	lp:	pointer to left page pointer
30050989Sbostic  *	rp:	pointer to right page pointer
30150989Sbostic  *	skip:	pointer to index to leave open
30246134Smao  *
30350989Sbostic  * Returns:
30450989Sbostic  *	Pointer to page in which to insert or NULL on error.
30546134Smao  */
30650989Sbostic static PAGE *
30750989Sbostic bt_page(t, h, lp, rp, skip)
30850989Sbostic 	BTREE *t;
30950989Sbostic 	PAGE *h, **lp, **rp;
31050989Sbostic 	int *skip;
31146134Smao {
31250989Sbostic 	PAGE *l, *r, *tp;
31350989Sbostic 	pgno_t npg;
31446134Smao 
31550989Sbostic #ifdef STATISTICS
31650989Sbostic 	++bt_split;
31750989Sbostic #endif
31850989Sbostic 	/* Put the new right page for the split into place. */
31956492Sbostic 	if ((r = __bt_new(t, &npg)) == NULL)
32050989Sbostic 		return (NULL);
32150989Sbostic 	r->pgno = npg;
32250989Sbostic 	r->lower = BTDATAOFF;
32350989Sbostic 	r->upper = t->bt_psize;
32450989Sbostic 	r->nextpg = h->nextpg;
32550989Sbostic 	r->prevpg = h->pgno;
32650989Sbostic 	r->flags = h->flags & P_TYPE;
32746134Smao 
32850989Sbostic 	/*
32950989Sbostic 	 * If we're splitting the last page on a level because we're appending
33050989Sbostic 	 * a key to it (skip is NEXTINDEX()), it's likely that the data is
33150989Sbostic 	 * sorted.  Adding an empty page on the side of the level is less work
33250989Sbostic 	 * and can push the fill factor much higher than normal.  If we're
33350989Sbostic 	 * wrong it's no big deal, we'll just do the split the right way next
33450989Sbostic 	 * time.  It may look like it's equally easy to do a similar hack for
33550989Sbostic 	 * reverse sorted data, that is, split the tree left, but it's not.
33650989Sbostic 	 * Don't even try.
33750989Sbostic 	 */
33850989Sbostic 	if (h->nextpg == P_INVALID && *skip == NEXTINDEX(h)) {
33950989Sbostic #ifdef STATISTICS
34050989Sbostic 		++bt_sortsplit;
34150989Sbostic #endif
34250989Sbostic 		h->nextpg = r->pgno;
34350989Sbostic 		r->lower = BTDATAOFF + sizeof(index_t);
34450989Sbostic 		*skip = 0;
34550989Sbostic 		*lp = h;
34650989Sbostic 		*rp = r;
34750989Sbostic 		return (r);
34850989Sbostic 	}
34946134Smao 
35050989Sbostic 	/* Put the new left page for the split into place. */
35150989Sbostic 	if ((l = malloc(t->bt_psize)) == NULL) {
35250989Sbostic 		mpool_put(t->bt_mp, r, 0);
35350989Sbostic 		return (NULL);
35450989Sbostic 	}
35550989Sbostic 	l->pgno = h->pgno;
35650989Sbostic 	l->nextpg = r->pgno;
35750989Sbostic 	l->prevpg = h->prevpg;
35850989Sbostic 	l->lower = BTDATAOFF;
35950989Sbostic 	l->upper = t->bt_psize;
36050989Sbostic 	l->flags = h->flags & P_TYPE;
36146134Smao 
36250989Sbostic 	/* Fix up the previous pointer of the page after the split page. */
36350989Sbostic 	if (h->nextpg != P_INVALID) {
36450989Sbostic 		if ((tp = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) {
36550989Sbostic 			free(l);
36650989Sbostic 			/* XXX mpool_free(t->bt_mp, r->pgno); */
36750989Sbostic 			return (NULL);
36846134Smao 		}
36950989Sbostic 		tp->prevpg = r->pgno;
37050989Sbostic 		mpool_put(t->bt_mp, tp, 0);
37150989Sbostic 	}
37246134Smao 
37350989Sbostic 	/*
37450989Sbostic 	 * Split right.  The key/data pairs aren't sorted in the btree page so
37550989Sbostic 	 * it's simpler to copy the data from the split page onto two new pages
37650989Sbostic 	 * instead of copying half the data to the right page and compacting
37750989Sbostic 	 * the left page in place.  Since the left page can't change, we have
37850989Sbostic 	 * to swap the original and the allocated left page after the split.
37950989Sbostic 	 */
38050989Sbostic 	tp = bt_psplit(t, h, l, r, skip);
38146134Smao 
38250989Sbostic 	/* Move the new left page onto the old left page. */
383*57440Sbostic 	memmove(h, l, t->bt_psize);
38450989Sbostic 	if (tp == l)
38550989Sbostic 		tp = h;
38650989Sbostic 	free(l);
38750989Sbostic 
38850989Sbostic 	*lp = h;
38950989Sbostic 	*rp = r;
39050989Sbostic 	return (tp);
39146134Smao }
39246134Smao 
39346134Smao /*
39451106Sbostic  * BT_ROOT -- Split the root page of a btree.
39546134Smao  *
39650989Sbostic  * Parameters:
39750989Sbostic  *	t:	tree
39850989Sbostic  *	h:	root page
39950989Sbostic  *	lp:	pointer to left page pointer
40050989Sbostic  *	rp:	pointer to right page pointer
40150989Sbostic  *	skip:	pointer to index to leave open
40246134Smao  *
40350989Sbostic  * Returns:
40450989Sbostic  *	Pointer to page in which to insert or NULL on error.
40546134Smao  */
40650989Sbostic static PAGE *
40750989Sbostic bt_root(t, h, lp, rp, skip)
40850989Sbostic 	BTREE *t;
40950989Sbostic 	PAGE *h, **lp, **rp;
41050989Sbostic 	int *skip;
41146134Smao {
41250989Sbostic 	PAGE *l, *r, *tp;
41350989Sbostic 	pgno_t lnpg, rnpg;
41446134Smao 
41550989Sbostic #ifdef STATISTICS
41650989Sbostic 	++bt_split;
41750989Sbostic 	++bt_rootsplit;
41850989Sbostic #endif
41950989Sbostic 	/* Put the new left and right pages for the split into place. */
42056492Sbostic 	if ((l = __bt_new(t, &lnpg)) == NULL ||
42156492Sbostic 	    (r = __bt_new(t, &rnpg)) == NULL)
42250989Sbostic 		return (NULL);
42350989Sbostic 	l->pgno = lnpg;
42450989Sbostic 	r->pgno = rnpg;
42550989Sbostic 	l->nextpg = r->pgno;
42650989Sbostic 	r->prevpg = l->pgno;
42750989Sbostic 	l->prevpg = r->nextpg = P_INVALID;
42850989Sbostic 	l->lower = r->lower = BTDATAOFF;
42950989Sbostic 	l->upper = r->upper = t->bt_psize;
43050989Sbostic 	l->flags = r->flags = h->flags & P_TYPE;
43150989Sbostic 
43250989Sbostic 	/* Split the root page. */
43350989Sbostic 	tp = bt_psplit(t, h, l, r, skip);
43450989Sbostic 
43550989Sbostic 	*lp = l;
43650989Sbostic 	*rp = r;
43750989Sbostic 	return (tp);
43846134Smao }
43946134Smao 
44046134Smao /*
441*57440Sbostic  * BT_RROOT -- Fix up the recno root page after it has been split.
44246134Smao  *
44350989Sbostic  * Parameters:
44450989Sbostic  *	t:	tree
44550989Sbostic  *	h:	root page
446*57440Sbostic  *	l:	left page
447*57440Sbostic  *	r:	right page
44846134Smao  *
44950989Sbostic  * Returns:
45050989Sbostic  *	RET_ERROR, RET_SUCCESS
45146134Smao  */
45250989Sbostic static int
45350989Sbostic bt_rroot(t, h, l, r)
45450989Sbostic 	BTREE *t;
45550989Sbostic 	PAGE *h, *l, *r;
45646134Smao {
45750989Sbostic 	char *dest;
45846134Smao 
45950989Sbostic 	/* Insert the left and right keys, set the header information. */
46050989Sbostic 	h->linp[0] = h->upper = t->bt_psize - NRINTERNAL;
46150989Sbostic 	dest = (char *)h + h->upper;
46250989Sbostic 	WR_RINTERNAL(dest,
46350989Sbostic 	    l->flags & P_RLEAF ? NEXTINDEX(l) : rec_total(l), l->pgno);
46446134Smao 
46550989Sbostic 	h->linp[1] = h->upper -= NRINTERNAL;
46650989Sbostic 	dest = (char *)h + h->upper;
46750989Sbostic 	WR_RINTERNAL(dest,
46850989Sbostic 	    r->flags & P_RLEAF ? NEXTINDEX(r) : rec_total(r), r->pgno);
46946134Smao 
47050989Sbostic 	h->lower = BTDATAOFF + 2 * sizeof(index_t);
47146134Smao 
47250989Sbostic 	/* Unpin the root page, set to recno internal page. */
47350989Sbostic 	h->flags &= ~P_TYPE;
47450989Sbostic 	h->flags |= P_RINTERNAL;
47550989Sbostic 	mpool_put(t->bt_mp, h, MPOOL_DIRTY);
47646134Smao 
47750989Sbostic 	return (RET_SUCCESS);
47850989Sbostic }
47946134Smao 
48050989Sbostic /*
481*57440Sbostic  * BT_BROOT -- Fix up the btree root page after it has been split.
48250989Sbostic  *
48350989Sbostic  * Parameters:
48450989Sbostic  *	t:	tree
48550989Sbostic  *	h:	root page
486*57440Sbostic  *	l:	left page
487*57440Sbostic  *	r:	right page
48850989Sbostic  *
48950989Sbostic  * Returns:
49050989Sbostic  *	RET_ERROR, RET_SUCCESS
49150989Sbostic  */
49250989Sbostic static int
49350989Sbostic bt_broot(t, h, l, r)
49450989Sbostic 	BTREE *t;
49550989Sbostic 	PAGE *h, *l, *r;
49650989Sbostic {
49750989Sbostic 	BINTERNAL *bi;
49850989Sbostic 	BLEAF *bl;
49950989Sbostic 	size_t nbytes;
50050989Sbostic 	char *dest;
50146134Smao 
50246134Smao 	/*
50350989Sbostic 	 * If the root page was a leaf page, change it into an internal page.
50450989Sbostic 	 * We copy the key we split on (but not the key's data, in the case of
50556401Sbostic 	 * a leaf page) to the new root page.
50650989Sbostic 	 *
50750989Sbostic 	 * The btree comparison code guarantees that the left-most key on any
508*57440Sbostic 	 * level of the tree is never used, so it doesn't need to be filled in.
50946134Smao 	 */
51056401Sbostic 	nbytes = NBINTERNAL(0);
51150989Sbostic 	h->linp[0] = h->upper = t->bt_psize - nbytes;
51250989Sbostic 	dest = (char *)h + h->upper;
51350989Sbostic 	WR_BINTERNAL(dest, 0, l->pgno, 0);
51446134Smao 
51550989Sbostic 	switch(h->flags & P_TYPE) {
51650989Sbostic 	case P_BLEAF:
51750989Sbostic 		bl = GETBLEAF(r, 0);
51850989Sbostic 		nbytes = NBINTERNAL(bl->ksize);
51950989Sbostic 		h->linp[1] = h->upper -= nbytes;
52050989Sbostic 		dest = (char *)h + h->upper;
52150989Sbostic 		WR_BINTERNAL(dest, bl->ksize, r->pgno, 0);
52250989Sbostic 		bcopy(bl->bytes, dest, bl->ksize);
52350989Sbostic 
52456401Sbostic 		/*
52556401Sbostic 		 * If the key is on an overflow page, mark the overflow chain
52656401Sbostic 		 * so it isn't deleted when the leaf copy of the key is deleted.
52756401Sbostic 		 */
52850989Sbostic 		if (bl->flags & P_BIGKEY &&
52950989Sbostic 		    bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR)
53050989Sbostic 			return (RET_ERROR);
53150989Sbostic 		break;
53250989Sbostic 	case P_BINTERNAL:
53350989Sbostic 		bi = GETBINTERNAL(r, 0);
53450989Sbostic 		nbytes = NBINTERNAL(bi->ksize);
53550989Sbostic 		h->linp[1] = h->upper -= nbytes;
53650989Sbostic 		dest = (char *)h + h->upper;
53750989Sbostic 		bcopy(bi, dest, nbytes);
53850989Sbostic 		((BINTERNAL *)dest)->pgno = r->pgno;
53950989Sbostic 		break;
54056741Sbostic 	default:
54156741Sbostic 		abort();
54246134Smao 	}
543*57440Sbostic 
544*57440Sbostic 	/* There are two keys on the page. */
54550989Sbostic 	h->lower = BTDATAOFF + 2 * sizeof(index_t);
54646134Smao 
54750989Sbostic 	/* Unpin the root page, set to btree internal page. */
54850989Sbostic 	h->flags &= ~P_TYPE;
54950989Sbostic 	h->flags |= P_BINTERNAL;
55050989Sbostic 	mpool_put(t->bt_mp, h, MPOOL_DIRTY);
55146134Smao 
55246134Smao 	return (RET_SUCCESS);
55346134Smao }
55446134Smao 
55546134Smao /*
55650989Sbostic  * BT_PSPLIT -- Do the real work of splitting the page.
55746134Smao  *
55850989Sbostic  * Parameters:
55950989Sbostic  *	t:	tree
56050989Sbostic  *	h:	page to be split
56150989Sbostic  *	l:	page to put lower half of data
56250989Sbostic  *	r:	page to put upper half of data
563*57440Sbostic  *	pskip:	pointer to index to leave open
56446134Smao  *
56550989Sbostic  * Returns:
56650989Sbostic  *	Pointer to page in which to insert.
56746134Smao  */
56850989Sbostic static PAGE *
56951106Sbostic bt_psplit(t, h, l, r, pskip)
57050989Sbostic 	BTREE *t;
57150989Sbostic 	PAGE *h, *l, *r;
57251106Sbostic 	int *pskip;
57346134Smao {
57450989Sbostic 	BINTERNAL *bi;
57550989Sbostic 	BLEAF *bl;
57650989Sbostic 	RLEAF *rl;
57750989Sbostic 	EPGNO *c;
57850989Sbostic 	PAGE *rval;
57951106Sbostic 	index_t half, skip;
58050989Sbostic 	size_t nbytes;
58150989Sbostic 	void *src;
58250989Sbostic 	int bigkeycnt, isbigkey, nxt, off, top;
58346134Smao 
58446134Smao 	/*
58550989Sbostic 	 * Split the data to the left and right pages. Leave the skip index
586*57440Sbostic 	 * open.  Additionally,
58750989Sbostic 	 * make some effort not to split on an overflow key.  This makes it
58850989Sbostic 	 * faster to process internal pages and can save space since overflow
58950989Sbostic 	 * keys used by internal pages are never deleted.
59046134Smao 	 */
59150989Sbostic 	bigkeycnt = 0;
59251106Sbostic 	skip = *pskip;
59350989Sbostic 	half = (t->bt_psize - BTDATAOFF) / 2;
59450989Sbostic 	for (nxt = off = 0, top = NEXTINDEX(h); nxt < top; ++off) {
59551106Sbostic 		if (skip == off)
59650989Sbostic 			continue;
59750989Sbostic 		switch (h->flags & P_TYPE) {
59850989Sbostic 		case P_BINTERNAL:
59950989Sbostic 			src = bi = GETBINTERNAL(h, nxt);
60050989Sbostic 			nbytes = NBINTERNAL(bi->ksize);
60150989Sbostic 			isbigkey = bi->flags & P_BIGKEY;
60250989Sbostic 			break;
60350989Sbostic 		case P_BLEAF:
60450989Sbostic 			src = bl = GETBLEAF(h, nxt);
60550989Sbostic 			nbytes = NBLEAF(bl);
60650989Sbostic 			isbigkey = bl->flags & P_BIGKEY;
60750989Sbostic 			break;
60850989Sbostic 		case P_RINTERNAL:
60950989Sbostic 			src = GETRINTERNAL(h, nxt);
61050989Sbostic 			nbytes = NRINTERNAL;
61150989Sbostic 			isbigkey = 0;
61250989Sbostic 			break;
61350989Sbostic 		case P_RLEAF:
61450989Sbostic 			src = rl = GETRLEAF(h, nxt);
61550989Sbostic 			nbytes = NRLEAF(rl);
61650989Sbostic 			isbigkey = 0;
61750989Sbostic 			break;
61856741Sbostic 		default:
61956741Sbostic 			abort();
62050989Sbostic 		}
62150989Sbostic 		++nxt;
62250989Sbostic 		l->linp[off] = l->upper -= nbytes;
62350989Sbostic 		bcopy(src, (char *)l + l->upper, nbytes);
62446134Smao 
62550989Sbostic 		/* There's no empirical justification for the '3'. */
62656402Sbostic 		if (half < nbytes) {
627*57440Sbostic 			if (!isbigkey || bigkeycnt == 3)
628*57440Sbostic 				break;
629*57440Sbostic 			else
630*57440Sbostic 				++bigkeycnt;
63156402Sbostic 		} else
63250989Sbostic 			half -= nbytes;
63350989Sbostic 	}
63450989Sbostic 	l->lower += (off + 1) * sizeof(index_t);
63546134Smao 
63650989Sbostic 	/*
63751106Sbostic 	 * If splitting the page that the cursor was on, the cursor has to be
63851106Sbostic 	 * adjusted to point to the same record as before the split.  If the
63951106Sbostic 	 * skipped slot and the cursor are both on the left page and the cursor
64051106Sbostic 	 * is on or past the skipped slot, the cursor is incremented by one.
64151106Sbostic 	 * If the skipped slot and the cursor are both on the right page and
64251106Sbostic 	 * the cursor is on or past the skipped slot, the cursor is incremented
64351106Sbostic 	 * by one.  If the skipped slot and the cursor aren't on the same page,
64451106Sbostic 	 * the cursor isn't changed.  Regardless of the relationship of the
64551106Sbostic 	 * skipped slot and the cursor, if the cursor is on the right page it
64651106Sbostic 	 * is decremented by the number of records split to the left page.
64751106Sbostic 	 *
64851106Sbostic 	 * Don't bother checking for the BTF_SEQINIT flag, the page number will
64951106Sbostic 	 * be P_INVALID.
65050989Sbostic 	 */
65150989Sbostic 	c = &t->bt_bcursor;
65250989Sbostic 	if (c->pgno == h->pgno)
65351106Sbostic 		if (c->index < off) {			/* left page */
65450989Sbostic 			c->pgno = l->pgno;
65551106Sbostic 			if (c->index >= skip)
65651106Sbostic 				++c->index;
65751106Sbostic 		} else {				/* right page */
65850989Sbostic 			c->pgno = r->pgno;
65951106Sbostic 			if (c->index >= skip && skip > off)
66051106Sbostic 				++c->index;
66150989Sbostic 			c->index -= off;
66246134Smao 		}
66346134Smao 
66450989Sbostic 	/*
66550989Sbostic 	 * Decide which page to return, and adjust the skip index if the
66650989Sbostic 	 * to-be-inserted-upon page has changed.
66750989Sbostic 	 */
66851106Sbostic 	if (skip > off) {
66950989Sbostic 		rval = r;
67051106Sbostic 		*pskip -= off + 1;
67150989Sbostic 	} else
67250989Sbostic 		rval = l;
67346134Smao 
67450989Sbostic 	for (off = 0; nxt < top; ++off) {
67551106Sbostic 		if (skip == nxt) {
67651106Sbostic 			skip = 0;
67750989Sbostic 			continue;
67846134Smao 		}
67950989Sbostic 		switch (h->flags & P_TYPE) {
68050989Sbostic 		case P_BINTERNAL:
68150989Sbostic 			src = bi = GETBINTERNAL(h, nxt);
68250989Sbostic 			nbytes = NBINTERNAL(bi->ksize);
68350989Sbostic 			break;
68450989Sbostic 		case P_BLEAF:
68550989Sbostic 			src = bl = GETBLEAF(h, nxt);
68650989Sbostic 			nbytes = NBLEAF(bl);
68750989Sbostic 			break;
68850989Sbostic 		case P_RINTERNAL:
68950989Sbostic 			src = GETRINTERNAL(h, nxt);
69050989Sbostic 			nbytes = NRINTERNAL;
69150989Sbostic 			break;
69250989Sbostic 		case P_RLEAF:
69350989Sbostic 			src = rl = GETRLEAF(h, nxt);
69450989Sbostic 			nbytes = NRLEAF(rl);
69550989Sbostic 			break;
69656741Sbostic 		default:
69756741Sbostic 			abort();
69850989Sbostic 		}
69950989Sbostic 		++nxt;
70050989Sbostic 		r->linp[off] = r->upper -= nbytes;
70150989Sbostic 		bcopy(src, (char *)r + r->upper, nbytes);
70250989Sbostic 	}
70350989Sbostic 	r->lower += off * sizeof(index_t);
70446134Smao 
70550989Sbostic 	/* If the key is being appended to the page, adjust the index. */
70651106Sbostic 	if (skip == top)
70750989Sbostic 		r->lower += sizeof(index_t);
70846134Smao 
70950989Sbostic 	return (rval);
71050989Sbostic }
71146134Smao 
71250989Sbostic /*
71350989Sbostic  * BT_PRESERVE -- Mark a chain of pages as used by an internal node.
71450989Sbostic  *
71550989Sbostic  * Chains of indirect blocks pointed to by leaf nodes get reclaimed when the
71650989Sbostic  * record that references them gets deleted.  Chains pointed to by internal
71750989Sbostic  * pages never get deleted.  This routine marks a chain as pointed to by an
71850989Sbostic  * internal page.
71950989Sbostic  *
72050989Sbostic  * Parameters:
72150989Sbostic  *	t:	tree
72250989Sbostic  *	pg:	page number of first page in the chain.
72350989Sbostic  *
72450989Sbostic  * Returns:
72550989Sbostic  *	RET_SUCCESS, RET_ERROR.
72650989Sbostic  */
72750989Sbostic static int
72850989Sbostic bt_preserve(t, pg)
72950989Sbostic 	BTREE *t;
73050989Sbostic 	pgno_t pg;
73150989Sbostic {
73250989Sbostic 	PAGE *h;
73346134Smao 
73450989Sbostic 	if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
73550989Sbostic 		return (RET_ERROR);
73650989Sbostic 	h->flags |= P_PRESERVE;
73750989Sbostic 	mpool_put(t->bt_mp, h, MPOOL_DIRTY);
73846134Smao 	return (RET_SUCCESS);
73946134Smao }
74050989Sbostic 
74150989Sbostic /*
74250989Sbostic  * REC_TOTAL -- Return the number of recno entries below a page.
74350989Sbostic  *
74450989Sbostic  * Parameters:
74550989Sbostic  *	h:	page
74650989Sbostic  *
74750989Sbostic  * Returns:
74850989Sbostic  *	The number of recno entries below a page.
74950989Sbostic  *
75050989Sbostic  * XXX
75150989Sbostic  * These values could be set by the bt_psplit routine.  The problem is that the
75250989Sbostic  * entry has to be popped off of the stack etc. or the values have to be passed
75350989Sbostic  * all the way back to bt_split/bt_rroot and it's not very clean.
75450989Sbostic  */
75550989Sbostic static recno_t
75650989Sbostic rec_total(h)
75750989Sbostic 	PAGE *h;
75850989Sbostic {
75950989Sbostic 	recno_t recs;
76050989Sbostic 	index_t nxt, top;
76150989Sbostic 
76250989Sbostic 	for (recs = 0, nxt = 0, top = NEXTINDEX(h); nxt < top; ++nxt)
76350989Sbostic 		recs += GETRINTERNAL(h, nxt)->nrecs;
76450989Sbostic 	return (recs);
76550989Sbostic }
766