xref: /csrg-svn/lib/libc/db/btree/bt_split.c (revision 56741)
146134Smao /*-
246134Smao  * Copyright (c) 1990 The Regents of the University of California.
346134Smao  * All rights reserved.
446134Smao  *
546134Smao  * This code is derived from software contributed to Berkeley by
646134Smao  * Mike Olson.
746134Smao  *
846134Smao  * %sccs.include.redist.c%
946134Smao  */
1046134Smao 
1146134Smao #if defined(LIBC_SCCS) && !defined(lint)
12*56741Sbostic static char sccsid[] = "@(#)bt_split.c	5.8 (Berkeley) 11/13/92";
1346134Smao #endif /* LIBC_SCCS and not lint */
1446134Smao 
1546134Smao #include <sys/types.h>
16*56741Sbostic 
1750989Sbostic #define	__DBINTERFACE_PRIVATE
1846134Smao #include <db.h>
1950989Sbostic #include <limits.h>
2050989Sbostic #include <stdio.h>
2146561Sbostic #include <stdlib.h>
2246561Sbostic #include <string.h>
23*56741Sbostic 
2446134Smao #include "btree.h"
2546134Smao 
2650989Sbostic static int	 bt_preserve __P((BTREE *, pgno_t));
2750989Sbostic static PAGE	*bt_psplit __P((BTREE *, PAGE *, PAGE *, PAGE *, int *));
2850989Sbostic static PAGE	*bt_page __P((BTREE *, PAGE *, PAGE **, PAGE **, int *));
2950989Sbostic static PAGE	*bt_root __P((BTREE *, PAGE *, PAGE **, PAGE **, int *));
3050989Sbostic static int	 bt_rroot __P((BTREE *, PAGE *, PAGE *, PAGE *));
3150989Sbostic static int	 bt_broot __P((BTREE *, PAGE *, PAGE *, PAGE *));
3250989Sbostic static recno_t	 rec_total __P((PAGE *));
3350989Sbostic 
3450989Sbostic #ifdef STATISTICS
3550989Sbostic u_long	bt_rootsplit, bt_split, bt_sortsplit, bt_pfxsaved;
3650989Sbostic #endif
3750989Sbostic 
3846134Smao /*
3950989Sbostic  * __BT_SPLIT -- Split the tree.
4046134Smao  *
4150989Sbostic  * Parameters:
4250989Sbostic  *	t:	tree
4350989Sbostic  *	h:	page to split
4450989Sbostic  *	key:	key to insert
4550989Sbostic  *	data:	data to insert
4650989Sbostic  *	flags:	BIGKEY/BIGDATA flags
4750989Sbostic  *	nbytes:	length of insertion
4850989Sbostic  *	skip:	index to leave open
4946134Smao  *
5050989Sbostic  * Returns:
5150989Sbostic  *	RET_ERROR, RET_SUCCESS
5246134Smao  */
5346134Smao int
5450989Sbostic __bt_split(t, h, key, data, flags, nbytes, skip)
5550989Sbostic 	BTREE *t;
5650989Sbostic 	PAGE *h;
5750989Sbostic 	const DBT *key, *data;
5850989Sbostic 	u_long flags;
5950989Sbostic 	size_t nbytes;
6050989Sbostic 	int skip;
6146134Smao {
6250989Sbostic 	BINTERNAL *bi;
6350989Sbostic 	BLEAF *bl;
6450989Sbostic 	DBT a, b;
6550989Sbostic 	EPGNO *parent;
6650989Sbostic 	PAGE *l, *r, *lchild, *rchild;
6750989Sbostic 	index_t nxtindex;
6850989Sbostic 	size_t nksize;
6950989Sbostic 	int nosplit;
7050989Sbostic 	char *dest;
7146134Smao 
7246134Smao 	/*
7350989Sbostic 	 * Split the page into two pages, l and r.  The split routines return
7450989Sbostic 	 * a pointer to the page into which the key should be inserted and skip
7550989Sbostic 	 * set to the offset which should be used.  Additionally, l and r are
7650989Sbostic 	 * pinned.
7746134Smao 	 */
7850989Sbostic 	h = h->pgno == P_ROOT ?
7950989Sbostic 	    bt_root(t, h, &l, &r, &skip) : bt_page(t, h, &l, &r, &skip);
8050989Sbostic 	if (h == NULL)
8146134Smao 		return (RET_ERROR);
8246134Smao 
8350989Sbostic 	/*
8450989Sbostic 	 * Grab the space and insert the [rb]leaf structure.  Always a [rb]leaf
8550989Sbostic 	 * structure since key inserts always cause a leaf page to split first.
8650989Sbostic 	 */
8750989Sbostic 	h->linp[skip] = h->upper -= nbytes;
8850989Sbostic 	dest = (char *)h + h->upper;
8951106Sbostic 	if (ISSET(t, BTF_RECNO))
9050989Sbostic 		WR_RLEAF(dest, data, flags)
9151106Sbostic 	else
9250989Sbostic 		WR_BLEAF(dest, key, data, flags)
9346134Smao 
9446134Smao 	/*
9550989Sbostic 	 * Now we walk the parent page stack -- a LIFO stack of the pages that
9650989Sbostic 	 * were traversed when we searched for the page that split.  Each stack
9750989Sbostic 	 * entry is a page number and a page index offset.  The offset is for
9850989Sbostic 	 * the page traversed on the search.  We've just split a page, so we
9950989Sbostic 	 * have to insert a new key into the parent page.
10050989Sbostic 	 *
10150989Sbostic 	 * If the insert into the parent page causes it to split, may have to
10250989Sbostic 	 * continue splitting all the way up the tree.  We stop if the root
10350989Sbostic 	 * splits or the page inserted into didn't have to split to hold the
10450989Sbostic 	 * new key.  Some algorithms replace the key for the old page as well
10550989Sbostic 	 * as the new page.  We don't, as there's no reason to believe that the
10650989Sbostic 	 * first key on the old page is any better than the key we have, and,
10750989Sbostic 	 * in the case of a key being placed at index 0 causing the split, the
10850989Sbostic 	 * key is unavailable.
10950989Sbostic 	 *
11050989Sbostic 	 * There are a maximum of 5 pages pinned at any time.  We keep the left
11150989Sbostic 	 * and right pages pinned while working on the parent.   The 5 are the
11250989Sbostic 	 * two children, left parent and right parent (when the parent splits)
11350989Sbostic 	 * and the root page or the overflow key page when calling bt_preserve.
11450989Sbostic 	 * This code must make sure that all pins are released other than the
11550989Sbostic 	 * root page or overflow page which is unlocked elsewhere.
11646134Smao 	 */
11750989Sbostic 	for (nosplit = 0; (parent = BT_POP(t)) != NULL;) {
11850989Sbostic 		lchild = l;
11950989Sbostic 		rchild = r;
12046134Smao 
12150989Sbostic 		/* Get the parent page. */
12250989Sbostic 		if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
12350989Sbostic 			goto err2;
12446134Smao 
12550989Sbostic 	 	/* The new key goes ONE AFTER the index. */
12650989Sbostic 		skip = parent->index + 1;
12746134Smao 
12850989Sbostic 		/*
12950989Sbostic 		 * Calculate the space needed on the parent page.
13050989Sbostic 		 *
13151106Sbostic 		 * Space hack when inserting into BINTERNAL pages.  Only need to
13250989Sbostic 		 * retain the number of bytes that will distinguish between the
13351106Sbostic 		 * new entry and the LAST entry on the page to its left.  If the
13451106Sbostic 		 * keys compare equal, retain the entire key.  Note, we don't
13551106Sbostic 		 * touch overflow keys and the entire key must be retained for
13651106Sbostic 		 * the next-to-leftmost key on the leftmost page of each level,
13751106Sbostic 		 * or the search will fail.
13850989Sbostic 		 */
13950989Sbostic 		switch (rchild->flags & P_TYPE) {
14050989Sbostic 		case P_BINTERNAL:
14150989Sbostic 			bi = GETBINTERNAL(rchild, 0);
14250989Sbostic 			nbytes = NBINTERNAL(bi->ksize);
14350989Sbostic 			if (t->bt_pfx && (h->prevpg != P_INVALID || skip > 1) &&
14450989Sbostic 			    !(bi->flags & P_BIGKEY)) {
14550989Sbostic 				BINTERNAL *tbi;
14650989Sbostic 				tbi =
14750989Sbostic 				    GETBINTERNAL(lchild, NEXTINDEX(lchild) - 1);
14850989Sbostic 				a.size = tbi->ksize;
14950989Sbostic 				a.data = tbi->bytes;
15050989Sbostic 				b.size = bi->ksize;
15150989Sbostic 				b.data = bi->bytes;
15250989Sbostic 				goto prefix;
153*56741Sbostic 			} else
154*56741Sbostic 				nksize = 0;
15550989Sbostic 			break;
15650989Sbostic 		case P_BLEAF:
15750989Sbostic 			bl = GETBLEAF(rchild, 0);
15856401Sbostic 			nbytes = NBINTERNAL(bl->ksize);
15950989Sbostic 			if (t->bt_pfx && (h->prevpg != P_INVALID || skip > 1) &&
16050989Sbostic 			    !(bl->flags & P_BIGKEY)) {
16150989Sbostic 				BLEAF *tbl;
16250989Sbostic 				size_t n;
16350989Sbostic 
16450989Sbostic 				tbl = GETBLEAF(lchild, NEXTINDEX(lchild) - 1);
16550989Sbostic 				a.size = tbl->ksize;
16650989Sbostic 				a.data = tbl->bytes;
16750989Sbostic 				b.size = bl->ksize;
16850989Sbostic 				b.data = bl->bytes;
16950989Sbostic prefix:				nksize = t->bt_pfx(&a, &b);
17050989Sbostic 				n = NBINTERNAL(nksize);
17150989Sbostic 				if (n < nbytes) {
17250989Sbostic #ifdef STATISTICS
17350989Sbostic 					bt_pfxsaved += nbytes - n;
17450989Sbostic #endif
17550989Sbostic 					nbytes = n;
17650989Sbostic 				} else
17750989Sbostic 					nksize = 0;
17850989Sbostic 			} else
17950989Sbostic 				nksize = 0;
18050989Sbostic 			break;
18150989Sbostic 		case P_RINTERNAL:
18250989Sbostic 		case P_RLEAF:
18350989Sbostic 			nbytes = NRINTERNAL;
18450989Sbostic 			break;
185*56741Sbostic 		default:
186*56741Sbostic 			abort();
18750989Sbostic 		}
18850989Sbostic 
18950989Sbostic 		/* Split the parent page if necessary or shift the indices. */
19050989Sbostic 		if (h->upper - h->lower < nbytes + sizeof(index_t)) {
19150989Sbostic 			h = h->pgno == P_ROOT ?
19250989Sbostic 			    bt_root(t, h, &l, &r, &skip) :
19350989Sbostic 			    bt_page(t, h, &l, &r, &skip);
19450989Sbostic 			if (h == NULL)
19550989Sbostic 				goto err1;
19646134Smao 		} else {
19750989Sbostic 			if (skip < (nxtindex = NEXTINDEX(h)))
19850989Sbostic 				bcopy(h->linp + skip, h->linp + skip + 1,
19950989Sbostic 				    (nxtindex - skip) * sizeof(index_t));
20050989Sbostic 			h->lower += sizeof(index_t);
20150989Sbostic 			nosplit = 1;
20246134Smao 		}
20346134Smao 
20450989Sbostic 		/* Insert the key into the parent page. */
20550989Sbostic 		switch(rchild->flags & P_TYPE) {
20650989Sbostic 		case P_BINTERNAL:
20750989Sbostic 			h->linp[skip] = h->upper -= nbytes;
20850989Sbostic 			dest = (char *)h + h->linp[skip];
20950989Sbostic 			bcopy(bi, dest, nbytes);
21050989Sbostic 			if (nksize)
21150989Sbostic 				((BINTERNAL *)dest)->ksize = nksize;
21250989Sbostic 			((BINTERNAL *)dest)->pgno = rchild->pgno;
21350989Sbostic 			break;
21450989Sbostic 		case P_BLEAF:
21550989Sbostic 			h->linp[skip] = h->upper -= nbytes;
21650989Sbostic 			dest = (char *)h + h->linp[skip];
21750989Sbostic 			WR_BINTERNAL(dest, nksize ? nksize : bl->ksize,
21851106Sbostic 			    rchild->pgno, bl->flags & P_BIGKEY);
21950989Sbostic 			bcopy(bl->bytes, dest, nksize ? nksize : bl->ksize);
22050989Sbostic 			if (bl->flags & P_BIGKEY &&
22150989Sbostic 			    bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR)
22250989Sbostic 				goto err1;
22350989Sbostic 			break;
22450989Sbostic 		case P_RINTERNAL:
22550989Sbostic 			/* Update both left and right page counts. */
22650989Sbostic 			h->linp[skip] = h->upper -= nbytes;
22750989Sbostic 			dest = (char *)h + h->linp[skip];
22850989Sbostic 			((RINTERNAL *)dest)->nrecs = rec_total(rchild);
22950989Sbostic 			((RINTERNAL *)dest)->pgno = rchild->pgno;
23050989Sbostic 			dest = (char *)h + h->linp[skip - 1];
23150989Sbostic 			((RINTERNAL *)dest)->nrecs = rec_total(lchild);
23250989Sbostic 			((RINTERNAL *)dest)->pgno = lchild->pgno;
23350989Sbostic 			break;
23450989Sbostic 		case P_RLEAF:
23550989Sbostic 			/* Update both left and right page counts. */
23650989Sbostic 			h->linp[skip] = h->upper -= nbytes;
23750989Sbostic 			dest = (char *)h + h->linp[skip];
23850989Sbostic 			((RINTERNAL *)dest)->nrecs = NEXTINDEX(rchild);
23950989Sbostic 			((RINTERNAL *)dest)->pgno = rchild->pgno;
24050989Sbostic 			dest = (char *)h + h->linp[skip - 1];
24150989Sbostic 			((RINTERNAL *)dest)->nrecs = NEXTINDEX(lchild);
24250989Sbostic 			((RINTERNAL *)dest)->pgno = lchild->pgno;
24350989Sbostic 			break;
244*56741Sbostic 		default:
245*56741Sbostic 			abort();
24650989Sbostic 		}
24746134Smao 
24850989Sbostic 		/* Unpin the held pages. */
24950989Sbostic 		if (nosplit) {
25050989Sbostic 			mpool_put(t->bt_mp, h, MPOOL_DIRTY);
25150989Sbostic 			break;
25250989Sbostic 		}
25350989Sbostic 		mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
25450989Sbostic 		mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
25550989Sbostic 	}
25646134Smao 
25750989Sbostic 	/* Unpin the held pages. */
25850989Sbostic 	mpool_put(t->bt_mp, l, MPOOL_DIRTY);
25950989Sbostic 	mpool_put(t->bt_mp, r, MPOOL_DIRTY);
26050989Sbostic 
26151106Sbostic 	/* Clear any pages left on the stack. */
26251106Sbostic 	BT_CLR(t);
26350989Sbostic 	return (RET_SUCCESS);
26446134Smao 
26546134Smao 	/*
26650989Sbostic 	 * If something fails in the above loop we were already walking back
26750989Sbostic 	 * up the tree and the tree is now inconsistent.  Nothing much we can
26850989Sbostic 	 * do about it but release any memory we're holding.
26946134Smao 	 */
27050989Sbostic err1:	mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
27150989Sbostic 	mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
27246134Smao 
27350989Sbostic err2:	mpool_put(t->bt_mp, l, 0);
27450989Sbostic 	mpool_put(t->bt_mp, r, 0);
27550989Sbostic 	__dbpanic(t->bt_dbp);
27650989Sbostic 	return (RET_ERROR);
27746134Smao }
27846134Smao 
27946134Smao /*
28050989Sbostic  * BT_PAGE -- Split a non-root page of a btree.
28146134Smao  *
28250989Sbostic  * Parameters:
28350989Sbostic  *	t:	tree
28450989Sbostic  *	h:	root page
28550989Sbostic  *	lp:	pointer to left page pointer
28650989Sbostic  *	rp:	pointer to right page pointer
28750989Sbostic  *	skip:	pointer to index to leave open
28846134Smao  *
28950989Sbostic  * Returns:
29050989Sbostic  *	Pointer to page in which to insert or NULL on error.
29146134Smao  */
29250989Sbostic static PAGE *
29350989Sbostic bt_page(t, h, lp, rp, skip)
29450989Sbostic 	BTREE *t;
29550989Sbostic 	PAGE *h, **lp, **rp;
29650989Sbostic 	int *skip;
29746134Smao {
29850989Sbostic 	PAGE *l, *r, *tp;
29950989Sbostic 	pgno_t npg;
30046134Smao 
30150989Sbostic #ifdef STATISTICS
30250989Sbostic 	++bt_split;
30350989Sbostic #endif
30450989Sbostic 	/* Put the new right page for the split into place. */
30556492Sbostic 	if ((r = __bt_new(t, &npg)) == NULL)
30650989Sbostic 		return (NULL);
30750989Sbostic 	r->pgno = npg;
30850989Sbostic 	r->lower = BTDATAOFF;
30950989Sbostic 	r->upper = t->bt_psize;
31050989Sbostic 	r->nextpg = h->nextpg;
31150989Sbostic 	r->prevpg = h->pgno;
31250989Sbostic 	r->flags = h->flags & P_TYPE;
31346134Smao 
31450989Sbostic 	/*
31550989Sbostic 	 * If we're splitting the last page on a level because we're appending
31650989Sbostic 	 * a key to it (skip is NEXTINDEX()), it's likely that the data is
31750989Sbostic 	 * sorted.  Adding an empty page on the side of the level is less work
31850989Sbostic 	 * and can push the fill factor much higher than normal.  If we're
31950989Sbostic 	 * wrong it's no big deal, we'll just do the split the right way next
32050989Sbostic 	 * time.  It may look like it's equally easy to do a similar hack for
32150989Sbostic 	 * reverse sorted data, that is, split the tree left, but it's not.
32250989Sbostic 	 * Don't even try.
32350989Sbostic 	 */
32450989Sbostic 	if (h->nextpg == P_INVALID && *skip == NEXTINDEX(h)) {
32550989Sbostic #ifdef STATISTICS
32650989Sbostic 		++bt_sortsplit;
32750989Sbostic #endif
32850989Sbostic 		h->nextpg = r->pgno;
32950989Sbostic 		r->lower = BTDATAOFF + sizeof(index_t);
33050989Sbostic 		*skip = 0;
33150989Sbostic 		*lp = h;
33250989Sbostic 		*rp = r;
33350989Sbostic 		return (r);
33450989Sbostic 	}
33546134Smao 
33650989Sbostic 	/* Put the new left page for the split into place. */
33750989Sbostic 	if ((l = malloc(t->bt_psize)) == NULL) {
33850989Sbostic 		mpool_put(t->bt_mp, r, 0);
33950989Sbostic 		return (NULL);
34050989Sbostic 	}
34150989Sbostic 	l->pgno = h->pgno;
34250989Sbostic 	l->nextpg = r->pgno;
34350989Sbostic 	l->prevpg = h->prevpg;
34450989Sbostic 	l->lower = BTDATAOFF;
34550989Sbostic 	l->upper = t->bt_psize;
34650989Sbostic 	l->flags = h->flags & P_TYPE;
34746134Smao 
34850989Sbostic 	/* Fix up the previous pointer of the page after the split page. */
34950989Sbostic 	if (h->nextpg != P_INVALID) {
35050989Sbostic 		if ((tp = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) {
35150989Sbostic 			free(l);
35250989Sbostic 			/* XXX mpool_free(t->bt_mp, r->pgno); */
35350989Sbostic 			return (NULL);
35446134Smao 		}
35550989Sbostic 		tp->prevpg = r->pgno;
35650989Sbostic 		mpool_put(t->bt_mp, tp, 0);
35750989Sbostic 	}
35846134Smao 
35950989Sbostic 	/*
36050989Sbostic 	 * Split right.  The key/data pairs aren't sorted in the btree page so
36150989Sbostic 	 * it's simpler to copy the data from the split page onto two new pages
36250989Sbostic 	 * instead of copying half the data to the right page and compacting
36350989Sbostic 	 * the left page in place.  Since the left page can't change, we have
36450989Sbostic 	 * to swap the original and the allocated left page after the split.
36550989Sbostic 	 */
36650989Sbostic 	tp = bt_psplit(t, h, l, r, skip);
36746134Smao 
36850989Sbostic 	/* Move the new left page onto the old left page. */
36950989Sbostic 	bcopy(l, h, t->bt_psize);
37050989Sbostic 	if (tp == l)
37150989Sbostic 		tp = h;
37250989Sbostic 	free(l);
37350989Sbostic 
37450989Sbostic 	*lp = h;
37550989Sbostic 	*rp = r;
37650989Sbostic 	return (tp);
37746134Smao }
37846134Smao 
37946134Smao /*
38051106Sbostic  * BT_ROOT -- Split the root page of a btree.
38146134Smao  *
38250989Sbostic  * Parameters:
38350989Sbostic  *	t:	tree
38450989Sbostic  *	h:	root page
38550989Sbostic  *	lp:	pointer to left page pointer
38650989Sbostic  *	rp:	pointer to right page pointer
38750989Sbostic  *	skip:	pointer to index to leave open
38846134Smao  *
38950989Sbostic  * Returns:
39050989Sbostic  *	Pointer to page in which to insert or NULL on error.
39146134Smao  */
39250989Sbostic static PAGE *
39350989Sbostic bt_root(t, h, lp, rp, skip)
39450989Sbostic 	BTREE *t;
39550989Sbostic 	PAGE *h, **lp, **rp;
39650989Sbostic 	int *skip;
39746134Smao {
39850989Sbostic 	PAGE *l, *r, *tp;
39950989Sbostic 	pgno_t lnpg, rnpg;
40046134Smao 
40150989Sbostic #ifdef STATISTICS
40250989Sbostic 	++bt_split;
40350989Sbostic 	++bt_rootsplit;
40450989Sbostic #endif
40550989Sbostic 	/* Put the new left and right pages for the split into place. */
40656492Sbostic 	if ((l = __bt_new(t, &lnpg)) == NULL ||
40756492Sbostic 	    (r = __bt_new(t, &rnpg)) == NULL)
40850989Sbostic 		return (NULL);
40950989Sbostic 	l->pgno = lnpg;
41050989Sbostic 	r->pgno = rnpg;
41150989Sbostic 	l->nextpg = r->pgno;
41250989Sbostic 	r->prevpg = l->pgno;
41350989Sbostic 	l->prevpg = r->nextpg = P_INVALID;
41450989Sbostic 	l->lower = r->lower = BTDATAOFF;
41550989Sbostic 	l->upper = r->upper = t->bt_psize;
41650989Sbostic 	l->flags = r->flags = h->flags & P_TYPE;
41750989Sbostic 
41850989Sbostic 	/* Split the root page. */
41950989Sbostic 	tp = bt_psplit(t, h, l, r, skip);
42050989Sbostic 
42150989Sbostic 	/* Make the root page look right. */
42250989Sbostic 	if ((ISSET(t, BTF_RECNO) ?
42350989Sbostic 	    bt_rroot(t, h, l, r) : bt_broot(t, h, l, r)) == RET_ERROR)
42450989Sbostic 		return (NULL);
42550989Sbostic 
42650989Sbostic 	*lp = l;
42750989Sbostic 	*rp = r;
42850989Sbostic 	return (tp);
42946134Smao }
43046134Smao 
43146134Smao /*
43250989Sbostic  * BT_RROOT -- Fix up the recno root page after the split.
43346134Smao  *
43450989Sbostic  * Parameters:
43550989Sbostic  *	t:	tree
43650989Sbostic  *	h:	root page
43746134Smao  *
43850989Sbostic  * Returns:
43950989Sbostic  *	RET_ERROR, RET_SUCCESS
44046134Smao  */
44150989Sbostic static int
44250989Sbostic bt_rroot(t, h, l, r)
44350989Sbostic 	BTREE *t;
44450989Sbostic 	PAGE *h, *l, *r;
44546134Smao {
44650989Sbostic 	char *dest;
44746134Smao 
44850989Sbostic 	/* Insert the left and right keys, set the header information. */
44950989Sbostic 	h->linp[0] = h->upper = t->bt_psize - NRINTERNAL;
45050989Sbostic 	dest = (char *)h + h->upper;
45150989Sbostic 	WR_RINTERNAL(dest,
45250989Sbostic 	    l->flags & P_RLEAF ? NEXTINDEX(l) : rec_total(l), l->pgno);
45346134Smao 
45450989Sbostic 	h->linp[1] = h->upper -= NRINTERNAL;
45550989Sbostic 	dest = (char *)h + h->upper;
45650989Sbostic 	WR_RINTERNAL(dest,
45750989Sbostic 	    r->flags & P_RLEAF ? NEXTINDEX(r) : rec_total(r), r->pgno);
45846134Smao 
45950989Sbostic 	h->lower = BTDATAOFF + 2 * sizeof(index_t);
46046134Smao 
46150989Sbostic 	/* Unpin the root page, set to recno internal page. */
46250989Sbostic 	h->flags &= ~P_TYPE;
46350989Sbostic 	h->flags |= P_RINTERNAL;
46450989Sbostic 	mpool_put(t->bt_mp, h, MPOOL_DIRTY);
46546134Smao 
46650989Sbostic 	return (RET_SUCCESS);
46750989Sbostic }
46846134Smao 
46950989Sbostic /*
47050989Sbostic  * BT_BROOT -- Fix up the btree root page after the split.
47150989Sbostic  *
47250989Sbostic  * Parameters:
47350989Sbostic  *	t:	tree
47450989Sbostic  *	h:	root page
47550989Sbostic  *
47650989Sbostic  * Returns:
47750989Sbostic  *	RET_ERROR, RET_SUCCESS
47850989Sbostic  */
47950989Sbostic static int
48050989Sbostic bt_broot(t, h, l, r)
48150989Sbostic 	BTREE *t;
48250989Sbostic 	PAGE *h, *l, *r;
48350989Sbostic {
48450989Sbostic 	BINTERNAL *bi;
48550989Sbostic 	BLEAF *bl;
48650989Sbostic 	size_t nbytes;
48750989Sbostic 	char *dest;
48846134Smao 
48946134Smao 	/*
49050989Sbostic 	 * If the root page was a leaf page, change it into an internal page.
49150989Sbostic 	 * We copy the key we split on (but not the key's data, in the case of
49256401Sbostic 	 * a leaf page) to the new root page.
49350989Sbostic 	 *
49450989Sbostic 	 * The btree comparison code guarantees that the left-most key on any
49550989Sbostic 	 * level of the tree is never used, so it doesn't need to be filled
49650989Sbostic 	 * in.  (This is not just convenience -- if the insert index is 0, we
49750989Sbostic 	 * don't *have* a key to fill in.)  The right key is available because
49850989Sbostic 	 * the split code guarantees not to split on the skipped index.
49946134Smao 	 */
50056401Sbostic 	nbytes = NBINTERNAL(0);
50150989Sbostic 	h->linp[0] = h->upper = t->bt_psize - nbytes;
50250989Sbostic 	dest = (char *)h + h->upper;
50350989Sbostic 	WR_BINTERNAL(dest, 0, l->pgno, 0);
50446134Smao 
50550989Sbostic 	switch(h->flags & P_TYPE) {
50650989Sbostic 	case P_BLEAF:
50750989Sbostic 		bl = GETBLEAF(r, 0);
50850989Sbostic 		nbytes = NBINTERNAL(bl->ksize);
50950989Sbostic 		h->linp[1] = h->upper -= nbytes;
51050989Sbostic 		dest = (char *)h + h->upper;
51150989Sbostic 		WR_BINTERNAL(dest, bl->ksize, r->pgno, 0);
51250989Sbostic 		bcopy(bl->bytes, dest, bl->ksize);
51350989Sbostic 
51456401Sbostic 		/*
51556401Sbostic 		 * If the key is on an overflow page, mark the overflow chain
51656401Sbostic 		 * so it isn't deleted when the leaf copy of the key is deleted.
51756401Sbostic 		 */
51850989Sbostic 		if (bl->flags & P_BIGKEY &&
51950989Sbostic 		    bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR)
52050989Sbostic 			return (RET_ERROR);
52150989Sbostic 		break;
52250989Sbostic 	case P_BINTERNAL:
52350989Sbostic 		bi = GETBINTERNAL(r, 0);
52450989Sbostic 		nbytes = NBINTERNAL(bi->ksize);
52550989Sbostic 		h->linp[1] = h->upper -= nbytes;
52650989Sbostic 		dest = (char *)h + h->upper;
52750989Sbostic 		bcopy(bi, dest, nbytes);
52850989Sbostic 		((BINTERNAL *)dest)->pgno = r->pgno;
52950989Sbostic 		break;
530*56741Sbostic 	default:
531*56741Sbostic 		abort();
53246134Smao 	}
53350989Sbostic 	h->lower = BTDATAOFF + 2 * sizeof(index_t);
53446134Smao 
53550989Sbostic 	/* Unpin the root page, set to btree internal page. */
53650989Sbostic 	h->flags &= ~P_TYPE;
53750989Sbostic 	h->flags |= P_BINTERNAL;
53850989Sbostic 	mpool_put(t->bt_mp, h, MPOOL_DIRTY);
53946134Smao 
54046134Smao 	return (RET_SUCCESS);
54146134Smao }
54246134Smao 
54346134Smao /*
54450989Sbostic  * BT_PSPLIT -- Do the real work of splitting the page.
54546134Smao  *
54650989Sbostic  * Parameters:
54750989Sbostic  *	t:	tree
54850989Sbostic  *	h:	page to be split
54950989Sbostic  *	l:	page to put lower half of data
55050989Sbostic  *	r:	page to put upper half of data
55150989Sbostic  *	skip:	pointer to index to leave open
55246134Smao  *
55350989Sbostic  * Returns:
55450989Sbostic  *	Pointer to page in which to insert.
55546134Smao  */
55650989Sbostic static PAGE *
55751106Sbostic bt_psplit(t, h, l, r, pskip)
55850989Sbostic 	BTREE *t;
55950989Sbostic 	PAGE *h, *l, *r;
56051106Sbostic 	int *pskip;
56146134Smao {
56250989Sbostic 	BINTERNAL *bi;
56350989Sbostic 	BLEAF *bl;
56450989Sbostic 	RLEAF *rl;
56550989Sbostic 	EPGNO *c;
56650989Sbostic 	PAGE *rval;
56751106Sbostic 	index_t half, skip;
56850989Sbostic 	size_t nbytes;
56950989Sbostic 	void *src;
57050989Sbostic 	int bigkeycnt, isbigkey, nxt, off, top;
57146134Smao 
57246134Smao 	/*
57350989Sbostic 	 * Split the data to the left and right pages. Leave the skip index
57450989Sbostic 	 * open and guarantee that the split doesn't happen on that index (the
57550989Sbostic 	 * right key must be available for the parent page).  Additionally,
57650989Sbostic 	 * make some effort not to split on an overflow key.  This makes it
57750989Sbostic 	 * faster to process internal pages and can save space since overflow
57850989Sbostic 	 * keys used by internal pages are never deleted.
57946134Smao 	 */
58050989Sbostic 	bigkeycnt = 0;
58151106Sbostic 	skip = *pskip;
58250989Sbostic 	half = (t->bt_psize - BTDATAOFF) / 2;
58350989Sbostic 	for (nxt = off = 0, top = NEXTINDEX(h); nxt < top; ++off) {
58451106Sbostic 		if (skip == off)
58550989Sbostic 			continue;
58650989Sbostic 		switch (h->flags & P_TYPE) {
58750989Sbostic 		case P_BINTERNAL:
58850989Sbostic 			src = bi = GETBINTERNAL(h, nxt);
58950989Sbostic 			nbytes = NBINTERNAL(bi->ksize);
59050989Sbostic 			isbigkey = bi->flags & P_BIGKEY;
59150989Sbostic 			break;
59250989Sbostic 		case P_BLEAF:
59350989Sbostic 			src = bl = GETBLEAF(h, nxt);
59450989Sbostic 			nbytes = NBLEAF(bl);
59550989Sbostic 			isbigkey = bl->flags & P_BIGKEY;
59650989Sbostic 			break;
59750989Sbostic 		case P_RINTERNAL:
59850989Sbostic 			src = GETRINTERNAL(h, nxt);
59950989Sbostic 			nbytes = NRINTERNAL;
60050989Sbostic 			isbigkey = 0;
60150989Sbostic 			break;
60250989Sbostic 		case P_RLEAF:
60350989Sbostic 			src = rl = GETRLEAF(h, nxt);
60450989Sbostic 			nbytes = NRLEAF(rl);
60550989Sbostic 			isbigkey = 0;
60650989Sbostic 			break;
607*56741Sbostic 		default:
608*56741Sbostic 			abort();
60950989Sbostic 		}
61050989Sbostic 		++nxt;
61150989Sbostic 		l->linp[off] = l->upper -= nbytes;
61250989Sbostic 		bcopy(src, (char *)l + l->upper, nbytes);
61346134Smao 
61450989Sbostic 		/* There's no empirical justification for the '3'. */
61556402Sbostic 		if (half < nbytes) {
61656402Sbostic 			if (skip != off + 1)
61756402Sbostic 				if (!isbigkey || bigkeycnt == 3)
61856402Sbostic 					break;
61956402Sbostic 				else
62056402Sbostic 					++bigkeycnt;
62156402Sbostic 		} else
62250989Sbostic 			half -= nbytes;
62350989Sbostic 	}
62450989Sbostic 	l->lower += (off + 1) * sizeof(index_t);
62546134Smao 
62650989Sbostic 	/*
62751106Sbostic 	 * If splitting the page that the cursor was on, the cursor has to be
62851106Sbostic 	 * adjusted to point to the same record as before the split.  If the
62951106Sbostic 	 * skipped slot and the cursor are both on the left page and the cursor
63051106Sbostic 	 * is on or past the skipped slot, the cursor is incremented by one.
63151106Sbostic 	 * If the skipped slot and the cursor are both on the right page and
63251106Sbostic 	 * the cursor is on or past the skipped slot, the cursor is incremented
63351106Sbostic 	 * by one.  If the skipped slot and the cursor aren't on the same page,
63451106Sbostic 	 * the cursor isn't changed.  Regardless of the relationship of the
63551106Sbostic 	 * skipped slot and the cursor, if the cursor is on the right page it
63651106Sbostic 	 * is decremented by the number of records split to the left page.
63751106Sbostic 	 *
63851106Sbostic 	 * Don't bother checking for the BTF_SEQINIT flag, the page number will
63951106Sbostic 	 * be P_INVALID.
64050989Sbostic 	 */
64150989Sbostic 	c = &t->bt_bcursor;
64250989Sbostic 	if (c->pgno == h->pgno)
64351106Sbostic 		if (c->index < off) {			/* left page */
64450989Sbostic 			c->pgno = l->pgno;
64551106Sbostic 			if (c->index >= skip)
64651106Sbostic 				++c->index;
64751106Sbostic 		} else {				/* right page */
64850989Sbostic 			c->pgno = r->pgno;
64951106Sbostic 			if (c->index >= skip && skip > off)
65051106Sbostic 				++c->index;
65150989Sbostic 			c->index -= off;
65246134Smao 		}
65346134Smao 
65450989Sbostic 	/*
65550989Sbostic 	 * Decide which page to return, and adjust the skip index if the
65650989Sbostic 	 * to-be-inserted-upon page has changed.
65750989Sbostic 	 */
65851106Sbostic 	if (skip > off) {
65950989Sbostic 		rval = r;
66051106Sbostic 		*pskip -= off + 1;
66150989Sbostic 	} else
66250989Sbostic 		rval = l;
66346134Smao 
66450989Sbostic 	for (off = 0; nxt < top; ++off) {
66551106Sbostic 		if (skip == nxt) {
66651106Sbostic 			skip = 0;
66750989Sbostic 			continue;
66846134Smao 		}
66950989Sbostic 		switch (h->flags & P_TYPE) {
67050989Sbostic 		case P_BINTERNAL:
67150989Sbostic 			src = bi = GETBINTERNAL(h, nxt);
67250989Sbostic 			nbytes = NBINTERNAL(bi->ksize);
67350989Sbostic 			break;
67450989Sbostic 		case P_BLEAF:
67550989Sbostic 			src = bl = GETBLEAF(h, nxt);
67650989Sbostic 			nbytes = NBLEAF(bl);
67750989Sbostic 			break;
67850989Sbostic 		case P_RINTERNAL:
67950989Sbostic 			src = GETRINTERNAL(h, nxt);
68050989Sbostic 			nbytes = NRINTERNAL;
68150989Sbostic 			break;
68250989Sbostic 		case P_RLEAF:
68350989Sbostic 			src = rl = GETRLEAF(h, nxt);
68450989Sbostic 			nbytes = NRLEAF(rl);
68550989Sbostic 			break;
686*56741Sbostic 		default:
687*56741Sbostic 			abort();
68850989Sbostic 		}
68950989Sbostic 		++nxt;
69050989Sbostic 		r->linp[off] = r->upper -= nbytes;
69150989Sbostic 		bcopy(src, (char *)r + r->upper, nbytes);
69250989Sbostic 	}
69350989Sbostic 	r->lower += off * sizeof(index_t);
69446134Smao 
69550989Sbostic 	/* If the key is being appended to the page, adjust the index. */
69651106Sbostic 	if (skip == top)
69750989Sbostic 		r->lower += sizeof(index_t);
69846134Smao 
69950989Sbostic 	return (rval);
70050989Sbostic }
70146134Smao 
70250989Sbostic /*
70350989Sbostic  * BT_PRESERVE -- Mark a chain of pages as used by an internal node.
70450989Sbostic  *
70550989Sbostic  * Chains of indirect blocks pointed to by leaf nodes get reclaimed when the
70650989Sbostic  * record that references them gets deleted.  Chains pointed to by internal
70750989Sbostic  * pages never get deleted.  This routine marks a chain as pointed to by an
70850989Sbostic  * internal page.
70950989Sbostic  *
71050989Sbostic  * Parameters:
71150989Sbostic  *	t:	tree
71250989Sbostic  *	pg:	page number of first page in the chain.
71350989Sbostic  *
71450989Sbostic  * Returns:
71550989Sbostic  *	RET_SUCCESS, RET_ERROR.
71650989Sbostic  */
71750989Sbostic static int
71850989Sbostic bt_preserve(t, pg)
71950989Sbostic 	BTREE *t;
72050989Sbostic 	pgno_t pg;
72150989Sbostic {
72250989Sbostic 	PAGE *h;
72346134Smao 
72450989Sbostic 	if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
72550989Sbostic 		return (RET_ERROR);
72650989Sbostic 	h->flags |= P_PRESERVE;
72750989Sbostic 	mpool_put(t->bt_mp, h, MPOOL_DIRTY);
72846134Smao 	return (RET_SUCCESS);
72946134Smao }
73050989Sbostic 
73150989Sbostic /*
73250989Sbostic  * REC_TOTAL -- Return the number of recno entries below a page.
73350989Sbostic  *
73450989Sbostic  * Parameters:
73550989Sbostic  *	h:	page
73650989Sbostic  *
73750989Sbostic  * Returns:
73850989Sbostic  *	The number of recno entries below a page.
73950989Sbostic  *
74050989Sbostic  * XXX
74150989Sbostic  * These values could be set by the bt_psplit routine.  The problem is that the
74250989Sbostic  * entry has to be popped off of the stack etc. or the values have to be passed
74350989Sbostic  * all the way back to bt_split/bt_rroot and it's not very clean.
74450989Sbostic  */
74550989Sbostic static recno_t
74650989Sbostic rec_total(h)
74750989Sbostic 	PAGE *h;
74850989Sbostic {
74950989Sbostic 	recno_t recs;
75050989Sbostic 	index_t nxt, top;
75150989Sbostic 
75250989Sbostic 	for (recs = 0, nxt = 0, top = NEXTINDEX(h); nxt < top; ++nxt)
75350989Sbostic 		recs += GETRINTERNAL(h, nxt)->nrecs;
75450989Sbostic 	return (recs);
75550989Sbostic }
756