1*0Sstevel@tonic-gate /*-
2*0Sstevel@tonic-gate  * See the file LICENSE for redistribution information.
3*0Sstevel@tonic-gate  *
4*0Sstevel@tonic-gate  * Copyright (c) 1996, 1997, 1998
5*0Sstevel@tonic-gate  *	Sleepycat Software.  All rights reserved.
6*0Sstevel@tonic-gate  */
7*0Sstevel@tonic-gate /*
8*0Sstevel@tonic-gate  * Copyright (c) 1990, 1993, 1994, 1995, 1996
9*0Sstevel@tonic-gate  *	Keith Bostic.  All rights reserved.
10*0Sstevel@tonic-gate  */
11*0Sstevel@tonic-gate /*
12*0Sstevel@tonic-gate  * Copyright (c) 1990, 1993, 1994, 1995
13*0Sstevel@tonic-gate  *	The Regents of the University of California.  All rights reserved.
14*0Sstevel@tonic-gate  *
15*0Sstevel@tonic-gate  * Redistribution and use in source and binary forms, with or without
16*0Sstevel@tonic-gate  * modification, are permitted provided that the following conditions
17*0Sstevel@tonic-gate  * are met:
18*0Sstevel@tonic-gate  * 1. Redistributions of source code must retain the above copyright
19*0Sstevel@tonic-gate  *    notice, this list of conditions and the following disclaimer.
20*0Sstevel@tonic-gate  * 2. Redistributions in binary form must reproduce the above copyright
21*0Sstevel@tonic-gate  *    notice, this list of conditions and the following disclaimer in the
22*0Sstevel@tonic-gate  *    documentation and/or other materials provided with the distribution.
23*0Sstevel@tonic-gate  * 3. All advertising materials mentioning features or use of this software
24*0Sstevel@tonic-gate  *    must display the following acknowledgement:
25*0Sstevel@tonic-gate  *	This product includes software developed by the University of
26*0Sstevel@tonic-gate  *	California, Berkeley and its contributors.
27*0Sstevel@tonic-gate  * 4. Neither the name of the University nor the names of its contributors
28*0Sstevel@tonic-gate  *    may be used to endorse or promote products derived from this software
29*0Sstevel@tonic-gate  *    without specific prior written permission.
30*0Sstevel@tonic-gate  *
31*0Sstevel@tonic-gate  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
32*0Sstevel@tonic-gate  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33*0Sstevel@tonic-gate  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34*0Sstevel@tonic-gate  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
35*0Sstevel@tonic-gate  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
36*0Sstevel@tonic-gate  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
37*0Sstevel@tonic-gate  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
38*0Sstevel@tonic-gate  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
39*0Sstevel@tonic-gate  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
40*0Sstevel@tonic-gate  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
41*0Sstevel@tonic-gate  * SUCH DAMAGE.
42*0Sstevel@tonic-gate  */
43*0Sstevel@tonic-gate 
44*0Sstevel@tonic-gate #include "config.h"
45*0Sstevel@tonic-gate 
46*0Sstevel@tonic-gate #ifndef lint
47*0Sstevel@tonic-gate static const char sccsid[] = "@(#)bt_split.c	10.33 (Sleepycat) 10/13/98";
48*0Sstevel@tonic-gate #endif /* not lint */
49*0Sstevel@tonic-gate 
50*0Sstevel@tonic-gate #ifndef NO_SYSTEM_INCLUDES
51*0Sstevel@tonic-gate #include <sys/types.h>
52*0Sstevel@tonic-gate 
53*0Sstevel@tonic-gate #include <errno.h>
54*0Sstevel@tonic-gate #include <limits.h>
55*0Sstevel@tonic-gate #include <string.h>
56*0Sstevel@tonic-gate #endif
57*0Sstevel@tonic-gate 
58*0Sstevel@tonic-gate #include "db_int.h"
59*0Sstevel@tonic-gate #include "db_page.h"
60*0Sstevel@tonic-gate #include "btree.h"
61*0Sstevel@tonic-gate 
62*0Sstevel@tonic-gate static int __bam_broot __P((DBC *, PAGE *, PAGE *, PAGE *));
63*0Sstevel@tonic-gate static int __bam_page __P((DBC *, EPG *, EPG *));
64*0Sstevel@tonic-gate static int __bam_pinsert __P((DBC *, EPG *, PAGE *, PAGE *));
65*0Sstevel@tonic-gate static int __bam_psplit __P((DBC *, EPG *, PAGE *, PAGE *, db_indx_t *));
66*0Sstevel@tonic-gate static int __bam_root __P((DBC *, EPG *));
67*0Sstevel@tonic-gate static int __ram_root __P((DBC *, PAGE *, PAGE *, PAGE *));
68*0Sstevel@tonic-gate 
69*0Sstevel@tonic-gate /*
70*0Sstevel@tonic-gate  * __bam_split --
71*0Sstevel@tonic-gate  *	Split a page.
72*0Sstevel@tonic-gate  *
73*0Sstevel@tonic-gate  * PUBLIC: int __bam_split __P((DBC *, void *));
74*0Sstevel@tonic-gate  */
75*0Sstevel@tonic-gate int
76*0Sstevel@tonic-gate __bam_split(dbc, arg)
77*0Sstevel@tonic-gate 	DBC *dbc;
78*0Sstevel@tonic-gate 	void *arg;
79*0Sstevel@tonic-gate {
80*0Sstevel@tonic-gate 	BTREE *t;
81*0Sstevel@tonic-gate 	CURSOR *cp;
82*0Sstevel@tonic-gate 	DB *dbp;
83*0Sstevel@tonic-gate 	enum { UP, DOWN } dir;
84*0Sstevel@tonic-gate 	int exact, level, ret;
85*0Sstevel@tonic-gate 
86*0Sstevel@tonic-gate 	dbp = dbc->dbp;
87*0Sstevel@tonic-gate 	cp = dbc->internal;
88*0Sstevel@tonic-gate 
89*0Sstevel@tonic-gate 	/*
90*0Sstevel@tonic-gate 	 * The locking protocol we use to avoid deadlock to acquire locks by
91*0Sstevel@tonic-gate 	 * walking down the tree, but we do it as lazily as possible, locking
92*0Sstevel@tonic-gate 	 * the root only as a last resort.  We expect all stack pages to have
93*0Sstevel@tonic-gate 	 * been discarded before we're called; we discard all short-term locks.
94*0Sstevel@tonic-gate 	 *
95*0Sstevel@tonic-gate 	 * When __bam_split is first called, we know that a leaf page was too
96*0Sstevel@tonic-gate 	 * full for an insert.  We don't know what leaf page it was, but we
97*0Sstevel@tonic-gate 	 * have the key/recno that caused the problem.  We call XX_search to
98*0Sstevel@tonic-gate 	 * reacquire the leaf page, but this time get both the leaf page and
99*0Sstevel@tonic-gate 	 * its parent, locked.  We then split the leaf page and see if the new
100*0Sstevel@tonic-gate 	 * internal key will fit into the parent page.  If it will, we're done.
101*0Sstevel@tonic-gate 	 *
102*0Sstevel@tonic-gate 	 * If it won't, we discard our current locks and repeat the process,
103*0Sstevel@tonic-gate 	 * only this time acquiring the parent page and its parent, locked.
104*0Sstevel@tonic-gate 	 * This process repeats until we succeed in the split, splitting the
105*0Sstevel@tonic-gate 	 * root page as the final resort.  The entire process then repeats,
106*0Sstevel@tonic-gate 	 * as necessary, until we split a leaf page.
107*0Sstevel@tonic-gate 	 *
108*0Sstevel@tonic-gate 	 * XXX
109*0Sstevel@tonic-gate 	 * A traditional method of speeding this up is to maintain a stack of
110*0Sstevel@tonic-gate 	 * the pages traversed in the original search.  You can detect if the
111*0Sstevel@tonic-gate 	 * stack is correct by storing the page's LSN when it was searched and
112*0Sstevel@tonic-gate 	 * comparing that LSN with the current one when it's locked during the
113*0Sstevel@tonic-gate 	 * split.  This would be an easy change for this code, but I have no
114*0Sstevel@tonic-gate 	 * numbers that indicate it's worthwhile.
115*0Sstevel@tonic-gate 	 */
116*0Sstevel@tonic-gate 	t = dbp->internal;
117*0Sstevel@tonic-gate 	for (dir = UP, level = LEAFLEVEL;; dir == UP ? ++level : --level) {
118*0Sstevel@tonic-gate 		/*
119*0Sstevel@tonic-gate 		 * Acquire a page and its parent, locked.
120*0Sstevel@tonic-gate 		 */
121*0Sstevel@tonic-gate 		if ((ret = (dbp->type == DB_BTREE ?
122*0Sstevel@tonic-gate 		    __bam_search(dbc, arg, S_WRPAIR, level, NULL, &exact) :
123*0Sstevel@tonic-gate 		    __bam_rsearch(dbc,
124*0Sstevel@tonic-gate 		        (db_recno_t *)arg, S_WRPAIR, level, &exact))) != 0)
125*0Sstevel@tonic-gate 			return (ret);
126*0Sstevel@tonic-gate 
127*0Sstevel@tonic-gate 		/*
128*0Sstevel@tonic-gate 		 * Split the page if it still needs it (it's possible another
129*0Sstevel@tonic-gate 		 * thread of control has already split the page).  If we are
130*0Sstevel@tonic-gate 		 * guaranteed that two items will fit on the page, the split
131*0Sstevel@tonic-gate 		 * is no longer necessary.
132*0Sstevel@tonic-gate 		 */
133*0Sstevel@tonic-gate 		if (t->bt_ovflsize * 2 <=
134*0Sstevel@tonic-gate 		    (db_indx_t)P_FREESPACE(cp->csp[0].page)) {
135*0Sstevel@tonic-gate 			__bam_stkrel(dbc, 1);
136*0Sstevel@tonic-gate 			return (0);
137*0Sstevel@tonic-gate 		}
138*0Sstevel@tonic-gate 		ret = cp->csp[0].page->pgno == PGNO_ROOT ?
139*0Sstevel@tonic-gate 		    __bam_root(dbc, &cp->csp[0]) :
140*0Sstevel@tonic-gate 		    __bam_page(dbc, &cp->csp[-1], &cp->csp[0]);
141*0Sstevel@tonic-gate 		BT_STK_CLR(cp);
142*0Sstevel@tonic-gate 
143*0Sstevel@tonic-gate 		switch (ret) {
144*0Sstevel@tonic-gate 		case 0:
145*0Sstevel@tonic-gate 			/* Once we've split the leaf page, we're done. */
146*0Sstevel@tonic-gate 			if (level == LEAFLEVEL)
147*0Sstevel@tonic-gate 				return (0);
148*0Sstevel@tonic-gate 
149*0Sstevel@tonic-gate 			/* Switch directions. */
150*0Sstevel@tonic-gate 			if (dir == UP)
151*0Sstevel@tonic-gate 				dir = DOWN;
152*0Sstevel@tonic-gate 			break;
153*0Sstevel@tonic-gate 		case DB_NEEDSPLIT:
154*0Sstevel@tonic-gate 			/*
155*0Sstevel@tonic-gate 			 * It's possible to fail to split repeatedly, as other
156*0Sstevel@tonic-gate 			 * threads may be modifying the tree, or the page usage
157*0Sstevel@tonic-gate 			 * is sufficiently bad that we don't get enough space
158*0Sstevel@tonic-gate 			 * the first time.
159*0Sstevel@tonic-gate 			 */
160*0Sstevel@tonic-gate 			if (dir == DOWN)
161*0Sstevel@tonic-gate 				dir = UP;
162*0Sstevel@tonic-gate 			break;
163*0Sstevel@tonic-gate 		default:
164*0Sstevel@tonic-gate 			return (ret);
165*0Sstevel@tonic-gate 		}
166*0Sstevel@tonic-gate 	}
167*0Sstevel@tonic-gate 	/* NOTREACHED */
168*0Sstevel@tonic-gate }
169*0Sstevel@tonic-gate 
170*0Sstevel@tonic-gate /*
171*0Sstevel@tonic-gate  * __bam_root --
172*0Sstevel@tonic-gate  *	Split the root page of a btree.
173*0Sstevel@tonic-gate  */
174*0Sstevel@tonic-gate static int
175*0Sstevel@tonic-gate __bam_root(dbc, cp)
176*0Sstevel@tonic-gate 	DBC *dbc;
177*0Sstevel@tonic-gate 	EPG *cp;
178*0Sstevel@tonic-gate {
179*0Sstevel@tonic-gate 	DB *dbp;
180*0Sstevel@tonic-gate 	PAGE *lp, *rp;
181*0Sstevel@tonic-gate 	db_indx_t split;
182*0Sstevel@tonic-gate 	int ret;
183*0Sstevel@tonic-gate 
184*0Sstevel@tonic-gate 	dbp = dbc->dbp;
185*0Sstevel@tonic-gate 
186*0Sstevel@tonic-gate 	/* Yeah, right. */
187*0Sstevel@tonic-gate 	if (cp->page->level >= MAXBTREELEVEL) {
188*0Sstevel@tonic-gate 		ret = ENOSPC;
189*0Sstevel@tonic-gate 		goto err;
190*0Sstevel@tonic-gate 	}
191*0Sstevel@tonic-gate 
192*0Sstevel@tonic-gate 	/* Create new left and right pages for the split. */
193*0Sstevel@tonic-gate 	lp = rp = NULL;
194*0Sstevel@tonic-gate 	if ((ret = __bam_new(dbc, TYPE(cp->page), &lp)) != 0 ||
195*0Sstevel@tonic-gate 	    (ret = __bam_new(dbc, TYPE(cp->page), &rp)) != 0)
196*0Sstevel@tonic-gate 		goto err;
197*0Sstevel@tonic-gate 	P_INIT(lp, dbp->pgsize, lp->pgno,
198*0Sstevel@tonic-gate 	    PGNO_INVALID, ISINTERNAL(cp->page) ? PGNO_INVALID : rp->pgno,
199*0Sstevel@tonic-gate 	    cp->page->level, TYPE(cp->page));
200*0Sstevel@tonic-gate 	P_INIT(rp, dbp->pgsize, rp->pgno,
201*0Sstevel@tonic-gate 	    ISINTERNAL(cp->page) ?  PGNO_INVALID : lp->pgno, PGNO_INVALID,
202*0Sstevel@tonic-gate 	    cp->page->level, TYPE(cp->page));
203*0Sstevel@tonic-gate 
204*0Sstevel@tonic-gate 	/* Split the page. */
205*0Sstevel@tonic-gate 	if ((ret = __bam_psplit(dbc, cp, lp, rp, &split)) != 0)
206*0Sstevel@tonic-gate 		goto err;
207*0Sstevel@tonic-gate 
208*0Sstevel@tonic-gate 	/* Log the change. */
209*0Sstevel@tonic-gate 	if (DB_LOGGING(dbc)) {
210*0Sstevel@tonic-gate 		DBT __a;
211*0Sstevel@tonic-gate 		DB_LSN __lsn;
212*0Sstevel@tonic-gate 		memset(&__a, 0, sizeof(__a));
213*0Sstevel@tonic-gate 		__a.data = cp->page;
214*0Sstevel@tonic-gate 		__a.size = dbp->pgsize;
215*0Sstevel@tonic-gate 		ZERO_LSN(__lsn);
216*0Sstevel@tonic-gate 		if ((ret = __bam_split_log(dbp->dbenv->lg_info, dbc->txn,
217*0Sstevel@tonic-gate 		    &LSN(cp->page), 0, dbp->log_fileid, PGNO(lp), &LSN(lp),
218*0Sstevel@tonic-gate 		    PGNO(rp), &LSN(rp), (u_int32_t)NUM_ENT(lp), 0, &__lsn,
219*0Sstevel@tonic-gate 		    &__a)) != 0)
220*0Sstevel@tonic-gate 			goto err;
221*0Sstevel@tonic-gate 		LSN(lp) = LSN(rp) = LSN(cp->page);
222*0Sstevel@tonic-gate 	}
223*0Sstevel@tonic-gate 
224*0Sstevel@tonic-gate 	/* Clean up the new root page. */
225*0Sstevel@tonic-gate 	if ((ret = (dbp->type == DB_RECNO ?
226*0Sstevel@tonic-gate 	    __ram_root(dbc, cp->page, lp, rp) :
227*0Sstevel@tonic-gate 	    __bam_broot(dbc, cp->page, lp, rp))) != 0)
228*0Sstevel@tonic-gate 		goto err;
229*0Sstevel@tonic-gate 
230*0Sstevel@tonic-gate 	/* Adjust any cursors.  Do it last so we don't have to undo it. */
231*0Sstevel@tonic-gate 	__bam_ca_split(dbp, cp->page->pgno, lp->pgno, rp->pgno, split, 1);
232*0Sstevel@tonic-gate 
233*0Sstevel@tonic-gate 	/* Success -- write the real pages back to the store. */
234*0Sstevel@tonic-gate 	(void)memp_fput(dbp->mpf, cp->page, DB_MPOOL_DIRTY);
235*0Sstevel@tonic-gate 	(void)__BT_TLPUT(dbc, cp->lock);
236*0Sstevel@tonic-gate 	(void)memp_fput(dbp->mpf, lp, DB_MPOOL_DIRTY);
237*0Sstevel@tonic-gate 	(void)memp_fput(dbp->mpf, rp, DB_MPOOL_DIRTY);
238*0Sstevel@tonic-gate 
239*0Sstevel@tonic-gate 	return (0);
240*0Sstevel@tonic-gate 
241*0Sstevel@tonic-gate err:	if (lp != NULL)
242*0Sstevel@tonic-gate 		(void)__bam_free(dbc, lp);
243*0Sstevel@tonic-gate 	if (rp != NULL)
244*0Sstevel@tonic-gate 		(void)__bam_free(dbc, rp);
245*0Sstevel@tonic-gate 	(void)memp_fput(dbp->mpf, cp->page, 0);
246*0Sstevel@tonic-gate 	(void)__BT_TLPUT(dbc, cp->lock);
247*0Sstevel@tonic-gate 	return (ret);
248*0Sstevel@tonic-gate }
249*0Sstevel@tonic-gate 
250*0Sstevel@tonic-gate /*
251*0Sstevel@tonic-gate  * __bam_page --
252*0Sstevel@tonic-gate  *	Split the non-root page of a btree.
253*0Sstevel@tonic-gate  */
254*0Sstevel@tonic-gate static int
255*0Sstevel@tonic-gate __bam_page(dbc, pp, cp)
256*0Sstevel@tonic-gate 	DBC *dbc;
257*0Sstevel@tonic-gate 	EPG *pp, *cp;
258*0Sstevel@tonic-gate {
259*0Sstevel@tonic-gate 	DB *dbp;
260*0Sstevel@tonic-gate 	DB_LOCK tplock;
261*0Sstevel@tonic-gate 	PAGE *lp, *rp, *tp;
262*0Sstevel@tonic-gate 	db_indx_t split;
263*0Sstevel@tonic-gate 	int ret;
264*0Sstevel@tonic-gate 
265*0Sstevel@tonic-gate 	dbp = dbc->dbp;
266*0Sstevel@tonic-gate 	lp = rp = tp = NULL;
267*0Sstevel@tonic-gate 	ret = -1;
268*0Sstevel@tonic-gate 
269*0Sstevel@tonic-gate 	/* Create new right page for the split. */
270*0Sstevel@tonic-gate 	if ((ret = __bam_new(dbc, TYPE(cp->page), &rp)) != 0)
271*0Sstevel@tonic-gate 		goto err;
272*0Sstevel@tonic-gate 	P_INIT(rp, dbp->pgsize, rp->pgno,
273*0Sstevel@tonic-gate 	    ISINTERNAL(cp->page) ? PGNO_INVALID : cp->page->pgno,
274*0Sstevel@tonic-gate 	    ISINTERNAL(cp->page) ? PGNO_INVALID : cp->page->next_pgno,
275*0Sstevel@tonic-gate 	    cp->page->level, TYPE(cp->page));
276*0Sstevel@tonic-gate 
277*0Sstevel@tonic-gate 	/* Create new left page for the split. */
278*0Sstevel@tonic-gate 	if ((ret = __os_malloc(dbp->pgsize, NULL, &lp)) != 0)
279*0Sstevel@tonic-gate 		goto err;
280*0Sstevel@tonic-gate 	P_INIT(lp, dbp->pgsize, cp->page->pgno,
281*0Sstevel@tonic-gate 	    ISINTERNAL(cp->page) ?  PGNO_INVALID : cp->page->prev_pgno,
282*0Sstevel@tonic-gate 	    ISINTERNAL(cp->page) ?  PGNO_INVALID : rp->pgno,
283*0Sstevel@tonic-gate 	    cp->page->level, TYPE(cp->page));
284*0Sstevel@tonic-gate 	ZERO_LSN(lp->lsn);
285*0Sstevel@tonic-gate 
286*0Sstevel@tonic-gate 	/*
287*0Sstevel@tonic-gate 	 * Split right.
288*0Sstevel@tonic-gate 	 *
289*0Sstevel@tonic-gate 	 * Only the indices are sorted on the page, i.e., the key/data pairs
290*0Sstevel@tonic-gate 	 * aren't, so it's simpler to copy the data from the split page onto
291*0Sstevel@tonic-gate 	 * two new pages instead of copying half the data to the right page
292*0Sstevel@tonic-gate 	 * and compacting the left page in place.  Since the left page can't
293*0Sstevel@tonic-gate 	 * change, we swap the original and the allocated left page after the
294*0Sstevel@tonic-gate 	 * split.
295*0Sstevel@tonic-gate 	 */
296*0Sstevel@tonic-gate 	if ((ret = __bam_psplit(dbc, cp, lp, rp, &split)) != 0)
297*0Sstevel@tonic-gate 		goto err;
298*0Sstevel@tonic-gate 
299*0Sstevel@tonic-gate 	/*
300*0Sstevel@tonic-gate 	 * Fix up the previous pointer of any leaf page following the split
301*0Sstevel@tonic-gate 	 * page.
302*0Sstevel@tonic-gate 	 *
303*0Sstevel@tonic-gate 	 * !!!
304*0Sstevel@tonic-gate 	 * There are interesting deadlock situations here as we write-lock a
305*0Sstevel@tonic-gate 	 * page that's not in our direct ancestry.  Consider a cursor walking
306*0Sstevel@tonic-gate 	 * through the leaf pages, that has the previous page read-locked and
307*0Sstevel@tonic-gate 	 * is waiting on a lock for the page we just split.  It will deadlock
308*0Sstevel@tonic-gate 	 * here.  If this is a problem, we can fail in the split; it's not a
309*0Sstevel@tonic-gate 	 * problem as the split will succeed after the cursor passes through
310*0Sstevel@tonic-gate 	 * the page we're splitting.
311*0Sstevel@tonic-gate 	 */
312*0Sstevel@tonic-gate 	if (TYPE(cp->page) == P_LBTREE && rp->next_pgno != PGNO_INVALID) {
313*0Sstevel@tonic-gate 		if ((ret = __bam_lget(dbc,
314*0Sstevel@tonic-gate 		    0, rp->next_pgno, DB_LOCK_WRITE, &tplock)) != 0)
315*0Sstevel@tonic-gate 			goto err;
316*0Sstevel@tonic-gate 		if ((ret = memp_fget(dbp->mpf, &rp->next_pgno, 0, &tp)) != 0)
317*0Sstevel@tonic-gate 			goto err;
318*0Sstevel@tonic-gate 	}
319*0Sstevel@tonic-gate 
320*0Sstevel@tonic-gate 	/* Insert the new pages into the parent page. */
321*0Sstevel@tonic-gate 	if ((ret = __bam_pinsert(dbc, pp, lp, rp)) != 0)
322*0Sstevel@tonic-gate 		goto err;
323*0Sstevel@tonic-gate 
324*0Sstevel@tonic-gate 	/* Log the change. */
325*0Sstevel@tonic-gate 	if (DB_LOGGING(dbc)) {
326*0Sstevel@tonic-gate 		DBT __a;
327*0Sstevel@tonic-gate 		DB_LSN __lsn;
328*0Sstevel@tonic-gate 		memset(&__a, 0, sizeof(__a));
329*0Sstevel@tonic-gate 		__a.data = cp->page;
330*0Sstevel@tonic-gate 		__a.size = dbp->pgsize;
331*0Sstevel@tonic-gate 		if (tp == NULL)
332*0Sstevel@tonic-gate 			ZERO_LSN(__lsn);
333*0Sstevel@tonic-gate 		if ((ret = __bam_split_log(dbp->dbenv->lg_info, dbc->txn,
334*0Sstevel@tonic-gate 		    &cp->page->lsn, 0, dbp->log_fileid, PGNO(cp->page),
335*0Sstevel@tonic-gate 		    &LSN(cp->page), PGNO(rp), &LSN(rp), (u_int32_t)NUM_ENT(lp),
336*0Sstevel@tonic-gate 		    tp == NULL ? 0 : PGNO(tp),
337*0Sstevel@tonic-gate 		    tp == NULL ? &__lsn : &LSN(tp), &__a)) != 0)
338*0Sstevel@tonic-gate 			goto err;
339*0Sstevel@tonic-gate 
340*0Sstevel@tonic-gate 		LSN(lp) = LSN(rp) = LSN(cp->page);
341*0Sstevel@tonic-gate 		if (tp != NULL)
342*0Sstevel@tonic-gate 			LSN(tp) = LSN(cp->page);
343*0Sstevel@tonic-gate 	}
344*0Sstevel@tonic-gate 
345*0Sstevel@tonic-gate 	/* Copy the allocated page into place. */
346*0Sstevel@tonic-gate 	memcpy(cp->page, lp, LOFFSET(lp));
347*0Sstevel@tonic-gate 	memcpy((u_int8_t *)cp->page + HOFFSET(lp),
348*0Sstevel@tonic-gate 	    (u_int8_t *)lp + HOFFSET(lp), dbp->pgsize - HOFFSET(lp));
349*0Sstevel@tonic-gate 	__os_free(lp, dbp->pgsize);
350*0Sstevel@tonic-gate 	lp = NULL;
351*0Sstevel@tonic-gate 
352*0Sstevel@tonic-gate 	/* Finish the next-page link. */
353*0Sstevel@tonic-gate 	if (tp != NULL)
354*0Sstevel@tonic-gate 		tp->prev_pgno = rp->pgno;
355*0Sstevel@tonic-gate 
356*0Sstevel@tonic-gate 	/* Adjust any cursors.  Do so last so we don't have to undo it. */
357*0Sstevel@tonic-gate 	__bam_ca_split(dbp, cp->page->pgno, cp->page->pgno, rp->pgno, split, 0);
358*0Sstevel@tonic-gate 
359*0Sstevel@tonic-gate 	/* Success -- write the real pages back to the store. */
360*0Sstevel@tonic-gate 	(void)memp_fput(dbp->mpf, pp->page, DB_MPOOL_DIRTY);
361*0Sstevel@tonic-gate 	(void)__BT_TLPUT(dbc, pp->lock);
362*0Sstevel@tonic-gate 	(void)memp_fput(dbp->mpf, cp->page, DB_MPOOL_DIRTY);
363*0Sstevel@tonic-gate 	(void)__BT_TLPUT(dbc, cp->lock);
364*0Sstevel@tonic-gate 	(void)memp_fput(dbp->mpf, rp, DB_MPOOL_DIRTY);
365*0Sstevel@tonic-gate 	if (tp != NULL) {
366*0Sstevel@tonic-gate 		(void)memp_fput(dbp->mpf, tp, DB_MPOOL_DIRTY);
367*0Sstevel@tonic-gate 		(void)__BT_TLPUT(dbc, tplock);
368*0Sstevel@tonic-gate 	}
369*0Sstevel@tonic-gate 	return (0);
370*0Sstevel@tonic-gate 
371*0Sstevel@tonic-gate err:	if (lp != NULL)
372*0Sstevel@tonic-gate 		__os_free(lp, dbp->pgsize);
373*0Sstevel@tonic-gate 	if (rp != NULL)
374*0Sstevel@tonic-gate 		(void)__bam_free(dbc, rp);
375*0Sstevel@tonic-gate 	if (tp != NULL) {
376*0Sstevel@tonic-gate 		(void)memp_fput(dbp->mpf, tp, 0);
377*0Sstevel@tonic-gate 		if (ret == DB_NEEDSPLIT)
378*0Sstevel@tonic-gate 			(void)__BT_LPUT(dbc, tplock);
379*0Sstevel@tonic-gate 		else
380*0Sstevel@tonic-gate 			(void)__BT_TLPUT(dbc, tplock);
381*0Sstevel@tonic-gate 	}
382*0Sstevel@tonic-gate 	(void)memp_fput(dbp->mpf, pp->page, 0);
383*0Sstevel@tonic-gate 	if (ret == DB_NEEDSPLIT)
384*0Sstevel@tonic-gate 		(void)__BT_LPUT(dbc, pp->lock);
385*0Sstevel@tonic-gate 	else
386*0Sstevel@tonic-gate 		(void)__BT_TLPUT(dbc, pp->lock);
387*0Sstevel@tonic-gate 	(void)memp_fput(dbp->mpf, cp->page, 0);
388*0Sstevel@tonic-gate 	if (ret == DB_NEEDSPLIT)
389*0Sstevel@tonic-gate 		(void)__BT_LPUT(dbc, cp->lock);
390*0Sstevel@tonic-gate 	else
391*0Sstevel@tonic-gate 		(void)__BT_TLPUT(dbc, cp->lock);
392*0Sstevel@tonic-gate 	return (ret);
393*0Sstevel@tonic-gate }
394*0Sstevel@tonic-gate 
395*0Sstevel@tonic-gate /*
396*0Sstevel@tonic-gate  * __bam_broot --
397*0Sstevel@tonic-gate  *	Fix up the btree root page after it has been split.
398*0Sstevel@tonic-gate  */
399*0Sstevel@tonic-gate static int
400*0Sstevel@tonic-gate __bam_broot(dbc, rootp, lp, rp)
401*0Sstevel@tonic-gate 	DBC *dbc;
402*0Sstevel@tonic-gate 	PAGE *rootp, *lp, *rp;
403*0Sstevel@tonic-gate {
404*0Sstevel@tonic-gate 	BINTERNAL bi, *child_bi;
405*0Sstevel@tonic-gate 	BKEYDATA *child_bk;
406*0Sstevel@tonic-gate 	DB *dbp;
407*0Sstevel@tonic-gate 	DBT hdr, data;
408*0Sstevel@tonic-gate 	int ret;
409*0Sstevel@tonic-gate 
410*0Sstevel@tonic-gate 	dbp = dbc->dbp;
411*0Sstevel@tonic-gate 
412*0Sstevel@tonic-gate 	/*
413*0Sstevel@tonic-gate 	 * If the root page was a leaf page, change it into an internal page.
414*0Sstevel@tonic-gate 	 * We copy the key we split on (but not the key's data, in the case of
415*0Sstevel@tonic-gate 	 * a leaf page) to the new root page.
416*0Sstevel@tonic-gate 	 */
417*0Sstevel@tonic-gate 	P_INIT(rootp, dbp->pgsize,
418*0Sstevel@tonic-gate 	    PGNO_ROOT, PGNO_INVALID, PGNO_INVALID, lp->level + 1, P_IBTREE);
419*0Sstevel@tonic-gate 
420*0Sstevel@tonic-gate 	memset(&data, 0, sizeof(data));
421*0Sstevel@tonic-gate 	memset(&hdr, 0, sizeof(hdr));
422*0Sstevel@tonic-gate 
423*0Sstevel@tonic-gate 	/*
424*0Sstevel@tonic-gate 	 * The btree comparison code guarantees that the left-most key on any
425*0Sstevel@tonic-gate 	 * level of the tree is never used, so it doesn't need to be filled in.
426*0Sstevel@tonic-gate 	 */
427*0Sstevel@tonic-gate 	memset(&bi, 0, sizeof(bi));
428*0Sstevel@tonic-gate 	bi.len = 0;
429*0Sstevel@tonic-gate 	B_TSET(bi.type, B_KEYDATA, 0);
430*0Sstevel@tonic-gate 	bi.pgno = lp->pgno;
431*0Sstevel@tonic-gate 	if (F_ISSET(dbp, DB_BT_RECNUM)) {
432*0Sstevel@tonic-gate 		bi.nrecs = __bam_total(lp);
433*0Sstevel@tonic-gate 		RE_NREC_SET(rootp, bi.nrecs);
434*0Sstevel@tonic-gate 	}
435*0Sstevel@tonic-gate 	hdr.data = &bi;
436*0Sstevel@tonic-gate 	hdr.size = SSZA(BINTERNAL, data);
437*0Sstevel@tonic-gate 	if ((ret =
438*0Sstevel@tonic-gate 	    __db_pitem(dbc, rootp, 0, BINTERNAL_SIZE(0), &hdr, NULL)) != 0)
439*0Sstevel@tonic-gate 		return (ret);
440*0Sstevel@tonic-gate 
441*0Sstevel@tonic-gate 	switch (TYPE(rp)) {
442*0Sstevel@tonic-gate 	case P_IBTREE:
443*0Sstevel@tonic-gate 		/* Copy the first key of the child page onto the root page. */
444*0Sstevel@tonic-gate 		child_bi = GET_BINTERNAL(rp, 0);
445*0Sstevel@tonic-gate 
446*0Sstevel@tonic-gate 		bi.len = child_bi->len;
447*0Sstevel@tonic-gate 		B_TSET(bi.type, child_bi->type, 0);
448*0Sstevel@tonic-gate 		bi.pgno = rp->pgno;
449*0Sstevel@tonic-gate 		if (F_ISSET(dbp, DB_BT_RECNUM)) {
450*0Sstevel@tonic-gate 			bi.nrecs = __bam_total(rp);
451*0Sstevel@tonic-gate 			RE_NREC_ADJ(rootp, bi.nrecs);
452*0Sstevel@tonic-gate 		}
453*0Sstevel@tonic-gate 		hdr.data = &bi;
454*0Sstevel@tonic-gate 		hdr.size = SSZA(BINTERNAL, data);
455*0Sstevel@tonic-gate 		data.data = child_bi->data;
456*0Sstevel@tonic-gate 		data.size = child_bi->len;
457*0Sstevel@tonic-gate 		if ((ret = __db_pitem(dbc, rootp, 1,
458*0Sstevel@tonic-gate 		    BINTERNAL_SIZE(child_bi->len), &hdr, &data)) != 0)
459*0Sstevel@tonic-gate 			return (ret);
460*0Sstevel@tonic-gate 
461*0Sstevel@tonic-gate 		/* Increment the overflow ref count. */
462*0Sstevel@tonic-gate 		if (B_TYPE(child_bi->type) == B_OVERFLOW)
463*0Sstevel@tonic-gate 			if ((ret = __db_ovref(dbc,
464*0Sstevel@tonic-gate 			    ((BOVERFLOW *)(child_bi->data))->pgno, 1)) != 0)
465*0Sstevel@tonic-gate 				return (ret);
466*0Sstevel@tonic-gate 		break;
467*0Sstevel@tonic-gate 	case P_LBTREE:
468*0Sstevel@tonic-gate 		/* Copy the first key of the child page onto the root page. */
469*0Sstevel@tonic-gate 		child_bk = GET_BKEYDATA(rp, 0);
470*0Sstevel@tonic-gate 		switch (B_TYPE(child_bk->type)) {
471*0Sstevel@tonic-gate 		case B_KEYDATA:
472*0Sstevel@tonic-gate 			bi.len = child_bk->len;
473*0Sstevel@tonic-gate 			B_TSET(bi.type, child_bk->type, 0);
474*0Sstevel@tonic-gate 			bi.pgno = rp->pgno;
475*0Sstevel@tonic-gate 			if (F_ISSET(dbp, DB_BT_RECNUM)) {
476*0Sstevel@tonic-gate 				bi.nrecs = __bam_total(rp);
477*0Sstevel@tonic-gate 				RE_NREC_ADJ(rootp, bi.nrecs);
478*0Sstevel@tonic-gate 			}
479*0Sstevel@tonic-gate 			hdr.data = &bi;
480*0Sstevel@tonic-gate 			hdr.size = SSZA(BINTERNAL, data);
481*0Sstevel@tonic-gate 			data.data = child_bk->data;
482*0Sstevel@tonic-gate 			data.size = child_bk->len;
483*0Sstevel@tonic-gate 			if ((ret = __db_pitem(dbc, rootp, 1,
484*0Sstevel@tonic-gate 			    BINTERNAL_SIZE(child_bk->len), &hdr, &data)) != 0)
485*0Sstevel@tonic-gate 				return (ret);
486*0Sstevel@tonic-gate 			break;
487*0Sstevel@tonic-gate 		case B_DUPLICATE:
488*0Sstevel@tonic-gate 		case B_OVERFLOW:
489*0Sstevel@tonic-gate 			bi.len = BOVERFLOW_SIZE;
490*0Sstevel@tonic-gate 			B_TSET(bi.type, child_bk->type, 0);
491*0Sstevel@tonic-gate 			bi.pgno = rp->pgno;
492*0Sstevel@tonic-gate 			if (F_ISSET(dbp, DB_BT_RECNUM)) {
493*0Sstevel@tonic-gate 				bi.nrecs = __bam_total(rp);
494*0Sstevel@tonic-gate 				RE_NREC_ADJ(rootp, bi.nrecs);
495*0Sstevel@tonic-gate 			}
496*0Sstevel@tonic-gate 			hdr.data = &bi;
497*0Sstevel@tonic-gate 			hdr.size = SSZA(BINTERNAL, data);
498*0Sstevel@tonic-gate 			data.data = child_bk;
499*0Sstevel@tonic-gate 			data.size = BOVERFLOW_SIZE;
500*0Sstevel@tonic-gate 			if ((ret = __db_pitem(dbc, rootp, 1,
501*0Sstevel@tonic-gate 			    BINTERNAL_SIZE(BOVERFLOW_SIZE), &hdr, &data)) != 0)
502*0Sstevel@tonic-gate 				return (ret);
503*0Sstevel@tonic-gate 
504*0Sstevel@tonic-gate 			/* Increment the overflow ref count. */
505*0Sstevel@tonic-gate 			if (B_TYPE(child_bk->type) == B_OVERFLOW)
506*0Sstevel@tonic-gate 				if ((ret = __db_ovref(dbc,
507*0Sstevel@tonic-gate 				    ((BOVERFLOW *)child_bk)->pgno, 1)) != 0)
508*0Sstevel@tonic-gate 					return (ret);
509*0Sstevel@tonic-gate 			break;
510*0Sstevel@tonic-gate 		default:
511*0Sstevel@tonic-gate 			return (__db_pgfmt(dbp, rp->pgno));
512*0Sstevel@tonic-gate 		}
513*0Sstevel@tonic-gate 		break;
514*0Sstevel@tonic-gate 	default:
515*0Sstevel@tonic-gate 		return (__db_pgfmt(dbp, rp->pgno));
516*0Sstevel@tonic-gate 	}
517*0Sstevel@tonic-gate 	return (0);
518*0Sstevel@tonic-gate }
519*0Sstevel@tonic-gate 
520*0Sstevel@tonic-gate /*
521*0Sstevel@tonic-gate  * __ram_root --
522*0Sstevel@tonic-gate  *	Fix up the recno root page after it has been split.
523*0Sstevel@tonic-gate  */
524*0Sstevel@tonic-gate static int
525*0Sstevel@tonic-gate __ram_root(dbc, rootp, lp, rp)
526*0Sstevel@tonic-gate 	DBC *dbc;
527*0Sstevel@tonic-gate 	PAGE *rootp, *lp, *rp;
528*0Sstevel@tonic-gate {
529*0Sstevel@tonic-gate 	DB *dbp;
530*0Sstevel@tonic-gate 	DBT hdr;
531*0Sstevel@tonic-gate 	RINTERNAL ri;
532*0Sstevel@tonic-gate 	int ret;
533*0Sstevel@tonic-gate 
534*0Sstevel@tonic-gate 	dbp = dbc->dbp;
535*0Sstevel@tonic-gate 
536*0Sstevel@tonic-gate 	/* Initialize the page. */
537*0Sstevel@tonic-gate 	P_INIT(rootp, dbp->pgsize,
538*0Sstevel@tonic-gate 	    PGNO_ROOT, PGNO_INVALID, PGNO_INVALID, lp->level + 1, P_IRECNO);
539*0Sstevel@tonic-gate 
540*0Sstevel@tonic-gate 	/* Initialize the header. */
541*0Sstevel@tonic-gate 	memset(&hdr, 0, sizeof(hdr));
542*0Sstevel@tonic-gate 	hdr.data = &ri;
543*0Sstevel@tonic-gate 	hdr.size = RINTERNAL_SIZE;
544*0Sstevel@tonic-gate 
545*0Sstevel@tonic-gate 	/* Insert the left and right keys, set the header information. */
546*0Sstevel@tonic-gate 	ri.pgno = lp->pgno;
547*0Sstevel@tonic-gate 	ri.nrecs = __bam_total(lp);
548*0Sstevel@tonic-gate 	if ((ret = __db_pitem(dbc, rootp, 0, RINTERNAL_SIZE, &hdr, NULL)) != 0)
549*0Sstevel@tonic-gate 		return (ret);
550*0Sstevel@tonic-gate 	RE_NREC_SET(rootp, ri.nrecs);
551*0Sstevel@tonic-gate 	ri.pgno = rp->pgno;
552*0Sstevel@tonic-gate 	ri.nrecs = __bam_total(rp);
553*0Sstevel@tonic-gate 	if ((ret = __db_pitem(dbc, rootp, 1, RINTERNAL_SIZE, &hdr, NULL)) != 0)
554*0Sstevel@tonic-gate 		return (ret);
555*0Sstevel@tonic-gate 	RE_NREC_ADJ(rootp, ri.nrecs);
556*0Sstevel@tonic-gate 	return (0);
557*0Sstevel@tonic-gate }
558*0Sstevel@tonic-gate 
559*0Sstevel@tonic-gate /*
560*0Sstevel@tonic-gate  * __bam_pinsert --
561*0Sstevel@tonic-gate  *	Insert a new key into a parent page, completing the split.
562*0Sstevel@tonic-gate  */
563*0Sstevel@tonic-gate static int
564*0Sstevel@tonic-gate __bam_pinsert(dbc, parent, lchild, rchild)
565*0Sstevel@tonic-gate 	DBC *dbc;
566*0Sstevel@tonic-gate 	EPG *parent;
567*0Sstevel@tonic-gate 	PAGE *lchild, *rchild;
568*0Sstevel@tonic-gate {
569*0Sstevel@tonic-gate 	BINTERNAL bi, *child_bi;
570*0Sstevel@tonic-gate 	BKEYDATA *child_bk, *tmp_bk;
571*0Sstevel@tonic-gate 	BTREE *t;
572*0Sstevel@tonic-gate 	DB *dbp;
573*0Sstevel@tonic-gate 	DBT a, b, hdr, data;
574*0Sstevel@tonic-gate 	PAGE *ppage;
575*0Sstevel@tonic-gate 	RINTERNAL ri;
576*0Sstevel@tonic-gate 	db_indx_t off;
577*0Sstevel@tonic-gate 	db_recno_t nrecs;
578*0Sstevel@tonic-gate 	u_int32_t n, nbytes, nksize;
579*0Sstevel@tonic-gate 	int ret;
580*0Sstevel@tonic-gate 
581*0Sstevel@tonic-gate 	dbp = dbc->dbp;
582*0Sstevel@tonic-gate 	t = dbp->internal;
583*0Sstevel@tonic-gate 	ppage = parent->page;
584*0Sstevel@tonic-gate 
585*0Sstevel@tonic-gate 	/* If handling record numbers, count records split to the right page. */
586*0Sstevel@tonic-gate 	nrecs = dbp->type == DB_RECNO || F_ISSET(dbp, DB_BT_RECNUM) ?
587*0Sstevel@tonic-gate 	    __bam_total(rchild) : 0;
588*0Sstevel@tonic-gate 
589*0Sstevel@tonic-gate 	/*
590*0Sstevel@tonic-gate 	 * Now we insert the new page's first key into the parent page, which
591*0Sstevel@tonic-gate 	 * completes the split.  The parent points to a PAGE and a page index
592*0Sstevel@tonic-gate 	 * offset, where the new key goes ONE AFTER the index, because we split
593*0Sstevel@tonic-gate 	 * to the right.
594*0Sstevel@tonic-gate 	 *
595*0Sstevel@tonic-gate 	 * XXX
596*0Sstevel@tonic-gate 	 * Some btree algorithms replace the key for the old page as well as
597*0Sstevel@tonic-gate 	 * the new page.  We don't, as there's no reason to believe that the
598*0Sstevel@tonic-gate 	 * first key on the old page is any better than the key we have, and,
599*0Sstevel@tonic-gate 	 * in the case of a key being placed at index 0 causing the split, the
600*0Sstevel@tonic-gate 	 * key is unavailable.
601*0Sstevel@tonic-gate 	 */
602*0Sstevel@tonic-gate 	off = parent->indx + O_INDX;
603*0Sstevel@tonic-gate 
604*0Sstevel@tonic-gate 	/*
605*0Sstevel@tonic-gate 	 * Calculate the space needed on the parent page.
606*0Sstevel@tonic-gate 	 *
607*0Sstevel@tonic-gate 	 * Prefix trees: space hack used when inserting into BINTERNAL pages.
608*0Sstevel@tonic-gate 	 * Retain only what's needed to distinguish between the new entry and
609*0Sstevel@tonic-gate 	 * the LAST entry on the page to its left.  If the keys compare equal,
610*0Sstevel@tonic-gate 	 * retain the entire key.  We ignore overflow keys, and the entire key
611*0Sstevel@tonic-gate 	 * must be retained for the next-to-leftmost key on the leftmost page
612*0Sstevel@tonic-gate 	 * of each level, or the search will fail.  Applicable ONLY to internal
613*0Sstevel@tonic-gate 	 * pages that have leaf pages as children.  Further reduction of the
614*0Sstevel@tonic-gate 	 * key between pairs of internal pages loses too much information.
615*0Sstevel@tonic-gate 	 */
616*0Sstevel@tonic-gate 	switch (TYPE(rchild)) {
617*0Sstevel@tonic-gate 	case P_IBTREE:
618*0Sstevel@tonic-gate 		child_bi = GET_BINTERNAL(rchild, 0);
619*0Sstevel@tonic-gate 		nbytes = BINTERNAL_PSIZE(child_bi->len);
620*0Sstevel@tonic-gate 
621*0Sstevel@tonic-gate 		if (P_FREESPACE(ppage) < nbytes)
622*0Sstevel@tonic-gate 			return (DB_NEEDSPLIT);
623*0Sstevel@tonic-gate 
624*0Sstevel@tonic-gate 		/* Add a new record for the right page. */
625*0Sstevel@tonic-gate 		memset(&bi, 0, sizeof(bi));
626*0Sstevel@tonic-gate 		bi.len = child_bi->len;
627*0Sstevel@tonic-gate 		B_TSET(bi.type, child_bi->type, 0);
628*0Sstevel@tonic-gate 		bi.pgno = rchild->pgno;
629*0Sstevel@tonic-gate 		bi.nrecs = nrecs;
630*0Sstevel@tonic-gate 		memset(&hdr, 0, sizeof(hdr));
631*0Sstevel@tonic-gate 		hdr.data = &bi;
632*0Sstevel@tonic-gate 		hdr.size = SSZA(BINTERNAL, data);
633*0Sstevel@tonic-gate 		memset(&data, 0, sizeof(data));
634*0Sstevel@tonic-gate 		data.data = child_bi->data;
635*0Sstevel@tonic-gate 		data.size = child_bi->len;
636*0Sstevel@tonic-gate 		if ((ret = __db_pitem(dbc, ppage, off,
637*0Sstevel@tonic-gate 		    BINTERNAL_SIZE(child_bi->len), &hdr, &data)) != 0)
638*0Sstevel@tonic-gate 			return (ret);
639*0Sstevel@tonic-gate 
640*0Sstevel@tonic-gate 		/* Increment the overflow ref count. */
641*0Sstevel@tonic-gate 		if (B_TYPE(child_bi->type) == B_OVERFLOW)
642*0Sstevel@tonic-gate 			if ((ret = __db_ovref(dbc,
643*0Sstevel@tonic-gate 			    ((BOVERFLOW *)(child_bi->data))->pgno, 1)) != 0)
644*0Sstevel@tonic-gate 				return (ret);
645*0Sstevel@tonic-gate 		break;
646*0Sstevel@tonic-gate 	case P_LBTREE:
647*0Sstevel@tonic-gate 		child_bk = GET_BKEYDATA(rchild, 0);
648*0Sstevel@tonic-gate 		switch (B_TYPE(child_bk->type)) {
649*0Sstevel@tonic-gate 		case B_KEYDATA:
650*0Sstevel@tonic-gate 			nbytes = BINTERNAL_PSIZE(child_bk->len);
651*0Sstevel@tonic-gate 			nksize = child_bk->len;
652*0Sstevel@tonic-gate 			if (t->bt_prefix == NULL)
653*0Sstevel@tonic-gate 				goto noprefix;
654*0Sstevel@tonic-gate 			if (ppage->prev_pgno == PGNO_INVALID && off <= 1)
655*0Sstevel@tonic-gate 				goto noprefix;
656*0Sstevel@tonic-gate 			tmp_bk = GET_BKEYDATA(lchild, NUM_ENT(lchild) - P_INDX);
657*0Sstevel@tonic-gate 			if (B_TYPE(tmp_bk->type) != B_KEYDATA)
658*0Sstevel@tonic-gate 				goto noprefix;
659*0Sstevel@tonic-gate 			memset(&a, 0, sizeof(a));
660*0Sstevel@tonic-gate 			a.size = tmp_bk->len;
661*0Sstevel@tonic-gate 			a.data = tmp_bk->data;
662*0Sstevel@tonic-gate 			memset(&b, 0, sizeof(b));
663*0Sstevel@tonic-gate 			b.size = child_bk->len;
664*0Sstevel@tonic-gate 			b.data = child_bk->data;
665*0Sstevel@tonic-gate 			nksize = t->bt_prefix(&a, &b);
666*0Sstevel@tonic-gate 			if ((n = BINTERNAL_PSIZE(nksize)) < nbytes)
667*0Sstevel@tonic-gate 				nbytes = n;
668*0Sstevel@tonic-gate 			else
669*0Sstevel@tonic-gate noprefix:			nksize = child_bk->len;
670*0Sstevel@tonic-gate 
671*0Sstevel@tonic-gate 			if (P_FREESPACE(ppage) < nbytes)
672*0Sstevel@tonic-gate 				return (DB_NEEDSPLIT);
673*0Sstevel@tonic-gate 
674*0Sstevel@tonic-gate 			memset(&bi, 0, sizeof(bi));
675*0Sstevel@tonic-gate 			bi.len = nksize;
676*0Sstevel@tonic-gate 			B_TSET(bi.type, child_bk->type, 0);
677*0Sstevel@tonic-gate 			bi.pgno = rchild->pgno;
678*0Sstevel@tonic-gate 			bi.nrecs = nrecs;
679*0Sstevel@tonic-gate 			memset(&hdr, 0, sizeof(hdr));
680*0Sstevel@tonic-gate 			hdr.data = &bi;
681*0Sstevel@tonic-gate 			hdr.size = SSZA(BINTERNAL, data);
682*0Sstevel@tonic-gate 			memset(&data, 0, sizeof(data));
683*0Sstevel@tonic-gate 			data.data = child_bk->data;
684*0Sstevel@tonic-gate 			data.size = nksize;
685*0Sstevel@tonic-gate 			if ((ret = __db_pitem(dbc, ppage, off,
686*0Sstevel@tonic-gate 			    BINTERNAL_SIZE(nksize), &hdr, &data)) != 0)
687*0Sstevel@tonic-gate 				return (ret);
688*0Sstevel@tonic-gate 			break;
689*0Sstevel@tonic-gate 		case B_DUPLICATE:
690*0Sstevel@tonic-gate 		case B_OVERFLOW:
691*0Sstevel@tonic-gate 			nbytes = BINTERNAL_PSIZE(BOVERFLOW_SIZE);
692*0Sstevel@tonic-gate 
693*0Sstevel@tonic-gate 			if (P_FREESPACE(ppage) < nbytes)
694*0Sstevel@tonic-gate 				return (DB_NEEDSPLIT);
695*0Sstevel@tonic-gate 
696*0Sstevel@tonic-gate 			memset(&bi, 0, sizeof(bi));
697*0Sstevel@tonic-gate 			bi.len = BOVERFLOW_SIZE;
698*0Sstevel@tonic-gate 			B_TSET(bi.type, child_bk->type, 0);
699*0Sstevel@tonic-gate 			bi.pgno = rchild->pgno;
700*0Sstevel@tonic-gate 			bi.nrecs = nrecs;
701*0Sstevel@tonic-gate 			memset(&hdr, 0, sizeof(hdr));
702*0Sstevel@tonic-gate 			hdr.data = &bi;
703*0Sstevel@tonic-gate 			hdr.size = SSZA(BINTERNAL, data);
704*0Sstevel@tonic-gate 			memset(&data, 0, sizeof(data));
705*0Sstevel@tonic-gate 			data.data = child_bk;
706*0Sstevel@tonic-gate 			data.size = BOVERFLOW_SIZE;
707*0Sstevel@tonic-gate 			if ((ret = __db_pitem(dbc, ppage, off,
708*0Sstevel@tonic-gate 			    BINTERNAL_SIZE(BOVERFLOW_SIZE), &hdr, &data)) != 0)
709*0Sstevel@tonic-gate 				return (ret);
710*0Sstevel@tonic-gate 
711*0Sstevel@tonic-gate 			/* Increment the overflow ref count. */
712*0Sstevel@tonic-gate 			if (B_TYPE(child_bk->type) == B_OVERFLOW)
713*0Sstevel@tonic-gate 				if ((ret = __db_ovref(dbc,
714*0Sstevel@tonic-gate 				    ((BOVERFLOW *)child_bk)->pgno, 1)) != 0)
715*0Sstevel@tonic-gate 					return (ret);
716*0Sstevel@tonic-gate 			break;
717*0Sstevel@tonic-gate 		default:
718*0Sstevel@tonic-gate 			return (__db_pgfmt(dbp, rchild->pgno));
719*0Sstevel@tonic-gate 		}
720*0Sstevel@tonic-gate 		break;
721*0Sstevel@tonic-gate 	case P_IRECNO:
722*0Sstevel@tonic-gate 	case P_LRECNO:
723*0Sstevel@tonic-gate 		nbytes = RINTERNAL_PSIZE;
724*0Sstevel@tonic-gate 
725*0Sstevel@tonic-gate 		if (P_FREESPACE(ppage) < nbytes)
726*0Sstevel@tonic-gate 			return (DB_NEEDSPLIT);
727*0Sstevel@tonic-gate 
728*0Sstevel@tonic-gate 		/* Add a new record for the right page. */
729*0Sstevel@tonic-gate 		memset(&hdr, 0, sizeof(hdr));
730*0Sstevel@tonic-gate 		hdr.data = &ri;
731*0Sstevel@tonic-gate 		hdr.size = RINTERNAL_SIZE;
732*0Sstevel@tonic-gate 		ri.pgno = rchild->pgno;
733*0Sstevel@tonic-gate 		ri.nrecs = nrecs;
734*0Sstevel@tonic-gate 		if ((ret = __db_pitem(dbc,
735*0Sstevel@tonic-gate 		    ppage, off, RINTERNAL_SIZE, &hdr, NULL)) != 0)
736*0Sstevel@tonic-gate 			return (ret);
737*0Sstevel@tonic-gate 		break;
738*0Sstevel@tonic-gate 	default:
739*0Sstevel@tonic-gate 		return (__db_pgfmt(dbp, rchild->pgno));
740*0Sstevel@tonic-gate 	}
741*0Sstevel@tonic-gate 
742*0Sstevel@tonic-gate 	/* Adjust the parent page's left page record count. */
743*0Sstevel@tonic-gate 	if (dbp->type == DB_RECNO || F_ISSET(dbp, DB_BT_RECNUM)) {
744*0Sstevel@tonic-gate 		/* Log the change. */
745*0Sstevel@tonic-gate 		if (DB_LOGGING(dbc) &&
746*0Sstevel@tonic-gate 		    (ret = __bam_cadjust_log(dbp->dbenv->lg_info,
747*0Sstevel@tonic-gate 		    dbc->txn, &LSN(ppage), 0, dbp->log_fileid,
748*0Sstevel@tonic-gate 		    PGNO(ppage), &LSN(ppage), (u_int32_t)parent->indx,
749*0Sstevel@tonic-gate 		    -(int32_t)nrecs, (int32_t)0)) != 0)
750*0Sstevel@tonic-gate 			return (ret);
751*0Sstevel@tonic-gate 
752*0Sstevel@tonic-gate 		/* Update the left page count. */
753*0Sstevel@tonic-gate 		if (dbp->type == DB_RECNO)
754*0Sstevel@tonic-gate 			GET_RINTERNAL(ppage, parent->indx)->nrecs -= nrecs;
755*0Sstevel@tonic-gate 		else
756*0Sstevel@tonic-gate 			GET_BINTERNAL(ppage, parent->indx)->nrecs -= nrecs;
757*0Sstevel@tonic-gate 	}
758*0Sstevel@tonic-gate 
759*0Sstevel@tonic-gate 	return (0);
760*0Sstevel@tonic-gate }
761*0Sstevel@tonic-gate 
762*0Sstevel@tonic-gate /*
763*0Sstevel@tonic-gate  * __bam_psplit --
764*0Sstevel@tonic-gate  *	Do the real work of splitting the page.
765*0Sstevel@tonic-gate  */
766*0Sstevel@tonic-gate static int
767*0Sstevel@tonic-gate __bam_psplit(dbc, cp, lp, rp, splitret)
768*0Sstevel@tonic-gate 	DBC *dbc;
769*0Sstevel@tonic-gate 	EPG *cp;
770*0Sstevel@tonic-gate 	PAGE *lp, *rp;
771*0Sstevel@tonic-gate 	db_indx_t *splitret;
772*0Sstevel@tonic-gate {
773*0Sstevel@tonic-gate 	DB *dbp;
774*0Sstevel@tonic-gate 	PAGE *pp;
775*0Sstevel@tonic-gate 	db_indx_t half, nbytes, off, splitp, top;
776*0Sstevel@tonic-gate 	int adjust, cnt, isbigkey, ret;
777*0Sstevel@tonic-gate 
778*0Sstevel@tonic-gate 	dbp = dbc->dbp;
779*0Sstevel@tonic-gate 	pp = cp->page;
780*0Sstevel@tonic-gate 	adjust = TYPE(pp) == P_LBTREE ? P_INDX : O_INDX;
781*0Sstevel@tonic-gate 
782*0Sstevel@tonic-gate 	/*
783*0Sstevel@tonic-gate 	 * If we're splitting the first (last) page on a level because we're
784*0Sstevel@tonic-gate 	 * inserting (appending) a key to it, it's likely that the data is
785*0Sstevel@tonic-gate 	 * sorted.  Moving a single item to the new page is less work and can
786*0Sstevel@tonic-gate 	 * push the fill factor higher than normal.  If we're wrong it's not
787*0Sstevel@tonic-gate 	 * a big deal, we'll just do the split the right way next time.
788*0Sstevel@tonic-gate 	 */
789*0Sstevel@tonic-gate 	off = 0;
790*0Sstevel@tonic-gate 	if (NEXT_PGNO(pp) == PGNO_INVALID &&
791*0Sstevel@tonic-gate 	    ((ISINTERNAL(pp) && cp->indx == NUM_ENT(cp->page) - 1) ||
792*0Sstevel@tonic-gate 	    (!ISINTERNAL(pp) && cp->indx == NUM_ENT(cp->page))))
793*0Sstevel@tonic-gate 		off = NUM_ENT(cp->page) - adjust;
794*0Sstevel@tonic-gate 	else if (PREV_PGNO(pp) == PGNO_INVALID && cp->indx == 0)
795*0Sstevel@tonic-gate 		off = adjust;
796*0Sstevel@tonic-gate 
797*0Sstevel@tonic-gate 	if (off != 0)
798*0Sstevel@tonic-gate 		goto sort;
799*0Sstevel@tonic-gate 
800*0Sstevel@tonic-gate 	/*
801*0Sstevel@tonic-gate 	 * Split the data to the left and right pages.  Try not to split on
802*0Sstevel@tonic-gate 	 * an overflow key.  (Overflow keys on internal pages will slow down
803*0Sstevel@tonic-gate 	 * searches.)  Refuse to split in the middle of a set of duplicates.
804*0Sstevel@tonic-gate 	 *
805*0Sstevel@tonic-gate 	 * First, find the optimum place to split.
806*0Sstevel@tonic-gate 	 *
807*0Sstevel@tonic-gate 	 * It's possible to try and split past the last record on the page if
808*0Sstevel@tonic-gate 	 * there's a very large record at the end of the page.  Make sure this
809*0Sstevel@tonic-gate 	 * doesn't happen by bounding the check at the next-to-last entry on
810*0Sstevel@tonic-gate 	 * the page.
811*0Sstevel@tonic-gate 	 *
812*0Sstevel@tonic-gate 	 * Note, we try and split half the data present on the page.  This is
813*0Sstevel@tonic-gate 	 * because another process may have already split the page and left
814*0Sstevel@tonic-gate 	 * it half empty.  We don't try and skip the split -- we don't know
815*0Sstevel@tonic-gate 	 * how much space we're going to need on the page, and we may need up
816*0Sstevel@tonic-gate 	 * to half the page for a big item, so there's no easy test to decide
817*0Sstevel@tonic-gate 	 * if we need to split or not.  Besides, if two threads are inserting
818*0Sstevel@tonic-gate 	 * data into the same place in the database, we're probably going to
819*0Sstevel@tonic-gate 	 * need more space soon anyway.
820*0Sstevel@tonic-gate 	 */
821*0Sstevel@tonic-gate 	top = NUM_ENT(pp) - adjust;
822*0Sstevel@tonic-gate 	half = (dbp->pgsize - HOFFSET(pp)) / 2;
823*0Sstevel@tonic-gate 	for (nbytes = 0, off = 0; off < top && nbytes < half; ++off)
824*0Sstevel@tonic-gate 		switch (TYPE(pp)) {
825*0Sstevel@tonic-gate 		case P_IBTREE:
826*0Sstevel@tonic-gate 			if (B_TYPE(GET_BINTERNAL(pp, off)->type) == B_KEYDATA)
827*0Sstevel@tonic-gate 				nbytes +=
828*0Sstevel@tonic-gate 				   BINTERNAL_SIZE(GET_BINTERNAL(pp, off)->len);
829*0Sstevel@tonic-gate 			else
830*0Sstevel@tonic-gate 				nbytes += BINTERNAL_SIZE(BOVERFLOW_SIZE);
831*0Sstevel@tonic-gate 			break;
832*0Sstevel@tonic-gate 		case P_LBTREE:
833*0Sstevel@tonic-gate 			if (B_TYPE(GET_BKEYDATA(pp, off)->type) == B_KEYDATA)
834*0Sstevel@tonic-gate 				nbytes +=
835*0Sstevel@tonic-gate 				    BKEYDATA_SIZE(GET_BKEYDATA(pp, off)->len);
836*0Sstevel@tonic-gate 			else
837*0Sstevel@tonic-gate 				nbytes += BOVERFLOW_SIZE;
838*0Sstevel@tonic-gate 
839*0Sstevel@tonic-gate 			++off;
840*0Sstevel@tonic-gate 			if (B_TYPE(GET_BKEYDATA(pp, off)->type) == B_KEYDATA)
841*0Sstevel@tonic-gate 				nbytes +=
842*0Sstevel@tonic-gate 				    BKEYDATA_SIZE(GET_BKEYDATA(pp, off)->len);
843*0Sstevel@tonic-gate 			else
844*0Sstevel@tonic-gate 				nbytes += BOVERFLOW_SIZE;
845*0Sstevel@tonic-gate 			break;
846*0Sstevel@tonic-gate 		case P_IRECNO:
847*0Sstevel@tonic-gate 			nbytes += RINTERNAL_SIZE;
848*0Sstevel@tonic-gate 			break;
849*0Sstevel@tonic-gate 		case P_LRECNO:
850*0Sstevel@tonic-gate 			nbytes += BKEYDATA_SIZE(GET_BKEYDATA(pp, off)->len);
851*0Sstevel@tonic-gate 			break;
852*0Sstevel@tonic-gate 		default:
853*0Sstevel@tonic-gate 			return (__db_pgfmt(dbp, pp->pgno));
854*0Sstevel@tonic-gate 		}
855*0Sstevel@tonic-gate sort:	splitp = off;
856*0Sstevel@tonic-gate 
857*0Sstevel@tonic-gate 	/*
858*0Sstevel@tonic-gate 	 * Splitp is either at or just past the optimum split point.  If
859*0Sstevel@tonic-gate 	 * it's a big key, try and find something close by that's not.
860*0Sstevel@tonic-gate 	 */
861*0Sstevel@tonic-gate 	if (TYPE(pp) == P_IBTREE)
862*0Sstevel@tonic-gate 		isbigkey = B_TYPE(GET_BINTERNAL(pp, off)->type) != B_KEYDATA;
863*0Sstevel@tonic-gate 	else if (TYPE(pp) == P_LBTREE)
864*0Sstevel@tonic-gate 		isbigkey = B_TYPE(GET_BKEYDATA(pp, off)->type) != B_KEYDATA;
865*0Sstevel@tonic-gate 	else
866*0Sstevel@tonic-gate 		isbigkey = 0;
867*0Sstevel@tonic-gate 	if (isbigkey)
868*0Sstevel@tonic-gate 		for (cnt = 1; cnt <= 3; ++cnt) {
869*0Sstevel@tonic-gate 			off = splitp + cnt * adjust;
870*0Sstevel@tonic-gate 			if (off < (db_indx_t)NUM_ENT(pp) &&
871*0Sstevel@tonic-gate 			    ((TYPE(pp) == P_IBTREE &&
872*0Sstevel@tonic-gate 			    B_TYPE(GET_BINTERNAL(pp,off)->type) == B_KEYDATA) ||
873*0Sstevel@tonic-gate 			    B_TYPE(GET_BKEYDATA(pp, off)->type) == B_KEYDATA)) {
874*0Sstevel@tonic-gate 				splitp = off;
875*0Sstevel@tonic-gate 				break;
876*0Sstevel@tonic-gate 			}
877*0Sstevel@tonic-gate 			if (splitp <= (db_indx_t)(cnt * adjust))
878*0Sstevel@tonic-gate 				continue;
879*0Sstevel@tonic-gate 			off = splitp - cnt * adjust;
880*0Sstevel@tonic-gate 			if (TYPE(pp) == P_IBTREE ?
881*0Sstevel@tonic-gate 			    B_TYPE(GET_BINTERNAL(pp, off)->type) == B_KEYDATA :
882*0Sstevel@tonic-gate 			    B_TYPE(GET_BKEYDATA(pp, off)->type) == B_KEYDATA) {
883*0Sstevel@tonic-gate 				splitp = off;
884*0Sstevel@tonic-gate 				break;
885*0Sstevel@tonic-gate 			}
886*0Sstevel@tonic-gate 		}
887*0Sstevel@tonic-gate 
888*0Sstevel@tonic-gate 	/*
889*0Sstevel@tonic-gate 	 * We can't split in the middle a set of duplicates.  We know that
890*0Sstevel@tonic-gate 	 * no duplicate set can take up more than about 25% of the page,
891*0Sstevel@tonic-gate 	 * because that's the point where we push it off onto a duplicate
892*0Sstevel@tonic-gate 	 * page set.  So, this loop can't be unbounded.
893*0Sstevel@tonic-gate 	 */
894*0Sstevel@tonic-gate 	if (F_ISSET(dbp, DB_AM_DUP) && TYPE(pp) == P_LBTREE &&
895*0Sstevel@tonic-gate 	    pp->inp[splitp] == pp->inp[splitp - adjust])
896*0Sstevel@tonic-gate 		for (cnt = 1;; ++cnt) {
897*0Sstevel@tonic-gate 			off = splitp + cnt * adjust;
898*0Sstevel@tonic-gate 			if (off < NUM_ENT(pp) &&
899*0Sstevel@tonic-gate 			    pp->inp[splitp] != pp->inp[off]) {
900*0Sstevel@tonic-gate 				splitp = off;
901*0Sstevel@tonic-gate 				break;
902*0Sstevel@tonic-gate 			}
903*0Sstevel@tonic-gate 			if (splitp <= (db_indx_t)(cnt * adjust))
904*0Sstevel@tonic-gate 				continue;
905*0Sstevel@tonic-gate 			off = splitp - cnt * adjust;
906*0Sstevel@tonic-gate 			if (pp->inp[splitp] != pp->inp[off]) {
907*0Sstevel@tonic-gate 				splitp = off + adjust;
908*0Sstevel@tonic-gate 				break;
909*0Sstevel@tonic-gate 			}
910*0Sstevel@tonic-gate 		}
911*0Sstevel@tonic-gate 
912*0Sstevel@tonic-gate 
913*0Sstevel@tonic-gate 	/* We're going to split at splitp. */
914*0Sstevel@tonic-gate 	if ((ret = __bam_copy(dbp, pp, lp, 0, splitp)) != 0)
915*0Sstevel@tonic-gate 		return (ret);
916*0Sstevel@tonic-gate 	if ((ret = __bam_copy(dbp, pp, rp, splitp, NUM_ENT(pp))) != 0)
917*0Sstevel@tonic-gate 		return (ret);
918*0Sstevel@tonic-gate 
919*0Sstevel@tonic-gate 	*splitret = splitp;
920*0Sstevel@tonic-gate 	return (0);
921*0Sstevel@tonic-gate }
922*0Sstevel@tonic-gate 
923*0Sstevel@tonic-gate /*
924*0Sstevel@tonic-gate  * __bam_copy --
925*0Sstevel@tonic-gate  *	Copy a set of records from one page to another.
926*0Sstevel@tonic-gate  *
927*0Sstevel@tonic-gate  * PUBLIC: int __bam_copy __P((DB *, PAGE *, PAGE *, u_int32_t, u_int32_t));
928*0Sstevel@tonic-gate  */
929*0Sstevel@tonic-gate int
930*0Sstevel@tonic-gate __bam_copy(dbp, pp, cp, nxt, stop)
931*0Sstevel@tonic-gate 	DB *dbp;
932*0Sstevel@tonic-gate 	PAGE *pp, *cp;
933*0Sstevel@tonic-gate 	u_int32_t nxt, stop;
934*0Sstevel@tonic-gate {
935*0Sstevel@tonic-gate 	db_indx_t nbytes, off;
936*0Sstevel@tonic-gate 
937*0Sstevel@tonic-gate 	/*
938*0Sstevel@tonic-gate 	 * Copy the rest of the data to the right page.  Nxt is the next
939*0Sstevel@tonic-gate 	 * offset placed on the target page.
940*0Sstevel@tonic-gate 	 */
941*0Sstevel@tonic-gate 	for (off = 0; nxt < stop; ++nxt, ++NUM_ENT(cp), ++off) {
942*0Sstevel@tonic-gate 		switch (TYPE(pp)) {
943*0Sstevel@tonic-gate 		case P_IBTREE:
944*0Sstevel@tonic-gate 			if (B_TYPE(GET_BINTERNAL(pp, nxt)->type) == B_KEYDATA)
945*0Sstevel@tonic-gate 				nbytes =
946*0Sstevel@tonic-gate 				    BINTERNAL_SIZE(GET_BINTERNAL(pp, nxt)->len);
947*0Sstevel@tonic-gate 			else
948*0Sstevel@tonic-gate 				nbytes = BINTERNAL_SIZE(BOVERFLOW_SIZE);
949*0Sstevel@tonic-gate 			break;
950*0Sstevel@tonic-gate 		case P_LBTREE:
951*0Sstevel@tonic-gate 			/*
952*0Sstevel@tonic-gate 			 * If we're on a key and it's a duplicate, just copy
953*0Sstevel@tonic-gate 			 * the offset.
954*0Sstevel@tonic-gate 			 */
955*0Sstevel@tonic-gate 			if (off != 0 && (nxt % P_INDX) == 0 &&
956*0Sstevel@tonic-gate 			    pp->inp[nxt] == pp->inp[nxt - P_INDX]) {
957*0Sstevel@tonic-gate 				cp->inp[off] = cp->inp[off - P_INDX];
958*0Sstevel@tonic-gate 				continue;
959*0Sstevel@tonic-gate 			}
960*0Sstevel@tonic-gate 			/* FALLTHROUGH */
961*0Sstevel@tonic-gate 		case P_LRECNO:
962*0Sstevel@tonic-gate 			if (B_TYPE(GET_BKEYDATA(pp, nxt)->type) == B_KEYDATA)
963*0Sstevel@tonic-gate 				nbytes =
964*0Sstevel@tonic-gate 				    BKEYDATA_SIZE(GET_BKEYDATA(pp, nxt)->len);
965*0Sstevel@tonic-gate 			else
966*0Sstevel@tonic-gate 				nbytes = BOVERFLOW_SIZE;
967*0Sstevel@tonic-gate 			break;
968*0Sstevel@tonic-gate 		case P_IRECNO:
969*0Sstevel@tonic-gate 			nbytes = RINTERNAL_SIZE;
970*0Sstevel@tonic-gate 			break;
971*0Sstevel@tonic-gate 		default:
972*0Sstevel@tonic-gate 			return (__db_pgfmt(dbp, pp->pgno));
973*0Sstevel@tonic-gate 		}
974*0Sstevel@tonic-gate 		cp->inp[off] = HOFFSET(cp) -= nbytes;
975*0Sstevel@tonic-gate 		memcpy(P_ENTRY(cp, off), P_ENTRY(pp, nxt), nbytes);
976*0Sstevel@tonic-gate 	}
977*0Sstevel@tonic-gate 	return (0);
978*0Sstevel@tonic-gate }
979