xref: /csrg-svn/lib/libc/db/btree/bt_split.c (revision 46134)
1*46134Smao /*-
2*46134Smao  * Copyright (c) 1990 The Regents of the University of California.
3*46134Smao  * All rights reserved.
4*46134Smao  *
5*46134Smao  * This code is derived from software contributed to Berkeley by
6*46134Smao  * Mike Olson.
7*46134Smao  *
8*46134Smao  * %sccs.include.redist.c%
9*46134Smao  */
10*46134Smao 
11*46134Smao #if defined(LIBC_SCCS) && !defined(lint)
12*46134Smao static char sccsid[] = "@(#)bt_split.c	5.1 (Berkeley) 01/23/91";
13*46134Smao #endif /* LIBC_SCCS and not lint */
14*46134Smao 
15*46134Smao #include <sys/types.h>
16*46134Smao #include <db.h>
17*46134Smao #include "btree.h"
18*46134Smao 
19*46134Smao /*
20*46134Smao  *  _BT_SPLIT -- Split a page into two pages.
21*46134Smao  *
22*46134Smao  *	Splits are caused by insertions, and propogate up the tree in
23*46134Smao  *	the usual way.  The root page is always page 1 in the file on
24*46134Smao  *	disk, so root splits are handled specially.  On entry to this
25*46134Smao  *	routine, t->bt_curpage is the page to be split.
26*46134Smao  *
27*46134Smao  *	Parameters:
28*46134Smao  *		t -- btree in which to do split.
29*46134Smao  *
30*46134Smao  *	Returns:
31*46134Smao  *		RET_SUCCESS, RET_ERROR.
32*46134Smao  *
33*46134Smao  *	Side Effects:
34*46134Smao  *		Changes the notion of the current page.
35*46134Smao  */
36*46134Smao 
37*46134Smao int
38*46134Smao _bt_split(t)
39*46134Smao 	BTREE_P t;
40*46134Smao {
41*46134Smao 	BTHEADER *h;
42*46134Smao 	BTHEADER *left, *right;
43*46134Smao 	pgno_t nextpgno, parent;
44*46134Smao 	int nbytes, len;
45*46134Smao 	IDATUM *id;
46*46134Smao 	DATUM *d;
47*46134Smao 	char *src;
48*46134Smao 	IDATUM *new;
49*46134Smao 	pgno_t oldchain;
50*46134Smao 	u_char flags;
51*46134Smao 
52*46134Smao 	h = (BTHEADER *) t->bt_curpage;
53*46134Smao 
54*46134Smao 	/* split root page specially, since it must remain page 1 */
55*46134Smao 	if (h->h_pgno == P_ROOT) {
56*46134Smao 		return (_bt_splitroot(t));
57*46134Smao 	}
58*46134Smao 
59*46134Smao 	/*
60*46134Smao 	 *  This is a little complicated.  We go to some trouble to
61*46134Smao 	 *  figure out which of the three possible cases -- in-memory tree,
62*46134Smao 	 *  disk tree (no cache), and disk tree (cache) -- we have, in order
63*46134Smao 	 *  to avoid unnecessary copying.  If we have a disk cache, then we
64*46134Smao 	 *  have to do some extra copying, though, since the cache code
65*46134Smao 	 *  manages buffers externally to this code.
66*46134Smao 	 */
67*46134Smao 
68*46134Smao 	if (ISDISK(t) && ISCACHE(t)) {
69*46134Smao 		if ((left = (BTHEADER *) malloc((unsigned) t->bt_psize))
70*46134Smao 		    == (BTHEADER *) NULL)
71*46134Smao 			return (RET_ERROR);
72*46134Smao 		left->h_pgno = left->h_prevpg = left->h_nextpg = P_NONE;
73*46134Smao 		left->h_flags = t->bt_curpage->h_flags;
74*46134Smao 		left->h_lower = (index_t)
75*46134Smao 			  (((char *) &(left->h_linp[0])) - ((char *) left));
76*46134Smao 		left->h_upper = t->bt_psize;
77*46134Smao 
78*46134Smao 	} else {
79*46134Smao 		if ((left = _bt_allocpg(t)) == (BTHEADER *) NULL)
80*46134Smao 			return (RET_ERROR);
81*46134Smao 	}
82*46134Smao 	left->h_pgno = h->h_pgno;
83*46134Smao 
84*46134Smao 	if ((right = _bt_allocpg(t)) == (BTHEADER *) NULL)
85*46134Smao 		return (RET_ERROR);
86*46134Smao 	right->h_pgno = ++(t->bt_npages);
87*46134Smao 
88*46134Smao 	/* now do the split */
89*46134Smao 	if (_bt_dopsplit(t, left, right) == RET_ERROR)
90*46134Smao 		return (RET_ERROR);
91*46134Smao 
92*46134Smao 	right->h_prevpg = left->h_pgno;
93*46134Smao 	nextpgno = right->h_nextpg = h->h_nextpg;
94*46134Smao 	left->h_nextpg = right->h_pgno;
95*46134Smao 	left->h_prevpg = h->h_prevpg;
96*46134Smao 
97*46134Smao 	/* okay, now use the left half of the page as the new page */
98*46134Smao 	if (ISDISK(t) && ISCACHE(t)) {
99*46134Smao 		(void) bcopy((char *) left, (char *) t->bt_curpage,
100*46134Smao 			     (int) t->bt_psize);
101*46134Smao 		(void) free ((char *) left);
102*46134Smao 		left = t->bt_curpage;
103*46134Smao 	} else {
104*46134Smao 		(void) free((char *) t->bt_curpage);
105*46134Smao 		t->bt_curpage = left;
106*46134Smao 	}
107*46134Smao 
108*46134Smao 	/*
109*46134Smao 	 *  Write the new pages out.  We need them to stay where they are
110*46134Smao 	 *  until we're done updating the parent pages.
111*46134Smao 	 */
112*46134Smao 
113*46134Smao 	if (_bt_write(t, left, NORELEASE) == RET_ERROR)
114*46134Smao 		return (RET_ERROR);
115*46134Smao 	if (_bt_write(t, right, NORELEASE) == RET_ERROR)
116*46134Smao 		return (RET_ERROR);
117*46134Smao 
118*46134Smao 	/* update 'prev' pointer of old neighbor of left */
119*46134Smao 	if (nextpgno != P_NONE) {
120*46134Smao 		if (_bt_getpage(t, nextpgno) == RET_ERROR)
121*46134Smao 			return (RET_ERROR);
122*46134Smao 		h = t->bt_curpage;
123*46134Smao 		h->h_prevpg = right->h_pgno;
124*46134Smao 		h->h_flags |= F_DIRTY;
125*46134Smao 	}
126*46134Smao 
127*46134Smao 	if ((parent = _bt_pop(t)) != P_NONE) {
128*46134Smao 		if (right->h_flags & F_LEAF) {
129*46134Smao 			d = (DATUM *) GETDATUM(right, 0);
130*46134Smao 			len = d->d_ksize;
131*46134Smao 			if (d->d_flags & D_BIGKEY) {
132*46134Smao 				bcopy(&(d->d_bytes[0]),
133*46134Smao 				      (char *) &oldchain,
134*46134Smao 				      sizeof(oldchain));
135*46134Smao 				if (_bt_markchain(t, oldchain) == RET_ERROR)
136*46134Smao 					return (RET_ERROR);
137*46134Smao 				src = (char *) &oldchain;
138*46134Smao 				flags = D_BIGKEY;
139*46134Smao 			} else {
140*46134Smao 				src = (char *) &(d->d_bytes[0]);
141*46134Smao 				flags = 0;
142*46134Smao 			}
143*46134Smao 		} else {
144*46134Smao 			id = (IDATUM *) GETDATUM(right, 0);
145*46134Smao 			len = id->i_size;
146*46134Smao 			flags = id->i_flags;
147*46134Smao 			src = (char *) &(id->i_bytes[0]);
148*46134Smao 		}
149*46134Smao 		nbytes = len + (sizeof(IDATUM) - sizeof(char));
150*46134Smao 		new = (IDATUM *) malloc((unsigned) nbytes);
151*46134Smao 		if (new == (IDATUM *) NULL)
152*46134Smao 			return (RET_ERROR);
153*46134Smao 		new->i_size = len;
154*46134Smao 		new->i_pgno = right->h_pgno;
155*46134Smao 		new->i_flags = flags;
156*46134Smao 		if (len > 0)
157*46134Smao 			(void) bcopy(src, (char *) &(new->i_bytes[0]), len);
158*46134Smao 
159*46134Smao 		nbytes = LONGALIGN(nbytes) + sizeof(index_t);
160*46134Smao 		if (_bt_getpage(t, parent) == RET_ERROR)
161*46134Smao 			return (RET_ERROR);
162*46134Smao 
163*46134Smao 		h = t->bt_curpage;
164*46134Smao 
165*46134Smao 		/*
166*46134Smao 		 *  Split the parent if we need to, then reposition the
167*46134Smao 		 *  tree's current page pointer for the new datum.
168*46134Smao 		 */
169*46134Smao 		if ((h->h_upper - h->h_lower) < nbytes) {
170*46134Smao 			if (_bt_split(t) == RET_ERROR)
171*46134Smao 				return (RET_ERROR);
172*46134Smao 			if (_bt_reposition(t, new, parent, right->h_prevpg)
173*46134Smao 			      == RET_ERROR)
174*46134Smao 				return (RET_ERROR);
175*46134Smao 		}
176*46134Smao 
177*46134Smao 		/* okay, now insert the new idatum */
178*46134Smao 		if (_bt_inserti(t, new, right->h_prevpg) == RET_ERROR)
179*46134Smao 			return (RET_ERROR);
180*46134Smao 	}
181*46134Smao 
182*46134Smao 	/*
183*46134Smao 	 *  Okay, split is done; don't need right page stapled down anymore.
184*46134Smao 	 *  The page we call 'left' above is the new version of the old
185*46134Smao 	 *  (split) page, so we can't release it.
186*46134Smao 	 */
187*46134Smao 
188*46134Smao 	if (_bt_release(t, right) == RET_ERROR)
189*46134Smao 		return (RET_ERROR);
190*46134Smao 	if (ISDISK(t) && !ISCACHE(t))
191*46134Smao 		(void) free((char *) right);
192*46134Smao 
193*46134Smao 	return (RET_SUCCESS);
194*46134Smao }
195*46134Smao 
196*46134Smao /*
197*46134Smao  *  _BT_REPOSITION -- Reposition the current page pointer of a btree.
198*46134Smao  *
199*46134Smao  *	After splitting a node in the tree in order to make room for
200*46134Smao  *	an insertion, we need to figure out which page after the split
201*46134Smao  *	should get the item we want to insert.  This routine positions
202*46134Smao  *	the tree's current page pointer appropriately.
203*46134Smao  *
204*46134Smao  *	Parameters:
205*46134Smao  *		t -- tree to position
206*46134Smao  *		new -- the item we want to insert
207*46134Smao  *		parent -- parent of the node that we just split
208*46134Smao  *		prev -- page number of item directly to the left of
209*46134Smao  *			new's position in the tree.
210*46134Smao  *
211*46134Smao  *	Returns:
212*46134Smao  *		RET_SUCCESS, RET_ERROR.
213*46134Smao  *
214*46134Smao  *	Side Effects:
215*46134Smao  *		None.
216*46134Smao  */
217*46134Smao 
218*46134Smao int
219*46134Smao _bt_reposition(t, new, parent, prev)
220*46134Smao 	BTREE_P t;
221*46134Smao 	IDATUM *new;
222*46134Smao 	pgno_t parent;
223*46134Smao 	pgno_t prev;
224*46134Smao {
225*46134Smao 	index_t i, next;
226*46134Smao 	IDATUM *idx;
227*46134Smao 
228*46134Smao 	if (parent == P_ROOT) {
229*46134Smao 
230*46134Smao 		/*
231*46134Smao 		 *  If we just split the root page, then there are guaranteed
232*46134Smao 		 *  to be exactly two IDATUMs on it.  Look at both of them
233*46134Smao 		 *  to see if they point to the page that we want.
234*46134Smao 		 */
235*46134Smao 
236*46134Smao 		if (_bt_getpage(t, parent) == RET_ERROR)
237*46134Smao 			return (RET_ERROR);
238*46134Smao 
239*46134Smao 		next = NEXTINDEX(t->bt_curpage);
240*46134Smao 		for (i = 0; i < next; i++) {
241*46134Smao 			idx = (IDATUM *) GETDATUM(t->bt_curpage, i);
242*46134Smao 			if (_bt_getpage(t, idx->i_pgno) == RET_ERROR)
243*46134Smao 				return (RET_ERROR);
244*46134Smao 			if (_bt_isonpage(t, new, prev) == RET_SUCCESS)
245*46134Smao 				return (RET_SUCCESS);
246*46134Smao 			if (_bt_getpage(t, parent) == RET_ERROR)
247*46134Smao 				return (RET_ERROR);
248*46134Smao 		}
249*46134Smao 	} else {
250*46134Smao 
251*46134Smao 		/*
252*46134Smao 		 *  Get the parent page -- which is where the new item would
253*46134Smao 		 *  have gone -- and figure out whether the new item now goes
254*46134Smao 		 *  on the parent, or the page immediately to the right of
255*46134Smao 		 *  the parent.
256*46134Smao 		 */
257*46134Smao 
258*46134Smao 		if (_bt_getpage(t, parent) == RET_ERROR)
259*46134Smao 			return (RET_ERROR);
260*46134Smao 		if (_bt_isonpage(t, new, prev) == RET_SUCCESS)
261*46134Smao 			return (RET_SUCCESS);
262*46134Smao 		if (_bt_getpage(t, t->bt_curpage->h_nextpg) == RET_ERROR)
263*46134Smao 			return (RET_ERROR);
264*46134Smao 		if (_bt_isonpage(t, new, prev) == RET_SUCCESS)
265*46134Smao 			return (RET_SUCCESS);
266*46134Smao 	}
267*46134Smao 	return (RET_ERROR);
268*46134Smao }
269*46134Smao 
270*46134Smao /*
271*46134Smao  *  _BT_ISONPAGE -- Is the IDATUM for a given page number on the current page?
272*46134Smao  *
273*46134Smao  *	This routine is used by _bt_reposition to decide whether the current
274*46134Smao  *	page is the correct one on which to insert a new item.
275*46134Smao  *
276*46134Smao  *	Parameters:
277*46134Smao  *		t -- tree to check
278*46134Smao  *		new -- the item that will be inserted (used for binary search)
279*46134Smao  *		prev -- page number of page whose IDATUM is immediately to
280*46134Smao  *			the left of new's position in the tree.
281*46134Smao  *
282*46134Smao  *	Returns:
283*46134Smao  *		RET_SUCCESS, RET_ERROR (corresponding to TRUE, FALSE).
284*46134Smao  */
285*46134Smao 
286*46134Smao int
287*46134Smao _bt_isonpage(t, new, prev)
288*46134Smao 	BTREE_P t;
289*46134Smao 	IDATUM *new;
290*46134Smao 	pgno_t prev;
291*46134Smao {
292*46134Smao 	BTHEADER *h = (BTHEADER *) t->bt_curpage;
293*46134Smao 	index_t i, next;
294*46134Smao 	IDATUM *idx;
295*46134Smao 
296*46134Smao 	i = _bt_binsrch(t, &(new->i_bytes[0]));
297*46134Smao 	while (i != 0 && _bt_cmp(t, &(new->i_bytes[0]), i) == 0)
298*46134Smao 		--i;
299*46134Smao 	next = NEXTINDEX(h);
300*46134Smao 	idx = (IDATUM *) GETDATUM(h, i);
301*46134Smao 	while (i < next && idx->i_pgno != prev) {
302*46134Smao 		i++;
303*46134Smao 		idx = (IDATUM *) GETDATUM(h, i);
304*46134Smao 	}
305*46134Smao 	if (idx->i_pgno == prev)
306*46134Smao 		return (RET_SUCCESS);
307*46134Smao 	else
308*46134Smao 		return (RET_ERROR);
309*46134Smao }
310*46134Smao 
311*46134Smao /*
312*46134Smao  *  _BT_SPLITROOT -- Split the root of a btree.
313*46134Smao  *
314*46134Smao  *	The root page for a btree is always page one.  This means that in
315*46134Smao  *	order to split the root, we need to do extra work.
316*46134Smao  *
317*46134Smao  *	Parameters:
318*46134Smao  *		t -- tree to split
319*46134Smao  *
320*46134Smao  *	Returns:
321*46134Smao  *		RET_SUCCESS, RET_ERROR.
322*46134Smao  *
323*46134Smao  *	Side Effects:
324*46134Smao  *		Splits root upward in the usual way, adding two new pages
325*46134Smao  *		to the tree (rather than just one, as in usual splits).
326*46134Smao  */
327*46134Smao 
328*46134Smao int
329*46134Smao _bt_splitroot(t)
330*46134Smao 	BTREE_P t;
331*46134Smao {
332*46134Smao 	BTHEADER *h = t->bt_curpage;
333*46134Smao 	BTHEADER *left, *right;
334*46134Smao 	IDATUM *id;
335*46134Smao 	BTHEADER *where_h;
336*46134Smao 	char *src, *dest;
337*46134Smao 	int len, nbytes;
338*46134Smao 	u_long was_leaf;
339*46134Smao 	pgno_t oldchain;
340*46134Smao 	u_char flags;
341*46134Smao 
342*46134Smao 	/* get two new pages for the split */
343*46134Smao 	if ((left = _bt_allocpg(t)) == (BTHEADER *) NULL)
344*46134Smao 		return (RET_ERROR);
345*46134Smao 	left->h_pgno = ++(t->bt_npages);
346*46134Smao 	if ((right = _bt_allocpg(t)) == (BTHEADER *) NULL)
347*46134Smao 		return (RET_ERROR);
348*46134Smao 	right->h_pgno = ++(t->bt_npages);
349*46134Smao 
350*46134Smao 	/* do the split */
351*46134Smao 	if (_bt_dopsplit(t, left, right) == RET_ERROR)
352*46134Smao 		return (RET_ERROR);
353*46134Smao 
354*46134Smao 	/* connect the new pages correctly */
355*46134Smao 	right->h_prevpg = left->h_pgno;
356*46134Smao 	left->h_nextpg = right->h_pgno;
357*46134Smao 
358*46134Smao 	/*
359*46134Smao 	 *  Write the child pages out now.  We need them to remain
360*46134Smao 	 *  where they are until we finish updating parent pages,
361*46134Smao 	 *  however.
362*46134Smao 	 */
363*46134Smao 
364*46134Smao 	if (_bt_write(t, left, NORELEASE) == RET_ERROR)
365*46134Smao 		return (RET_ERROR);
366*46134Smao 	if (_bt_write(t, right, NORELEASE) == RET_ERROR)
367*46134Smao 		return (RET_ERROR);
368*46134Smao 
369*46134Smao 	/* now change the root page into an internal page */
370*46134Smao 	was_leaf = (h->h_flags & F_LEAF);
371*46134Smao 	h->h_flags &= ~F_LEAF;
372*46134Smao 	h->h_lower = (index_t) (((char *) (&(h->h_linp[0]))) - ((char *) h));
373*46134Smao 	h->h_upper = (index_t) t->bt_psize;
374*46134Smao 	(void) bzero((char *) &(h->h_linp[0]), (int) (h->h_upper - h->h_lower));
375*46134Smao 
376*46134Smao 	/* put two new keys on root page */
377*46134Smao 	where_h = left;
378*46134Smao 	while (where_h) {
379*46134Smao 		DATUM *data;
380*46134Smao 		IDATUM *idata;
381*46134Smao 
382*46134Smao 		if (was_leaf) {
383*46134Smao 			data = (DATUM *) GETDATUM(where_h, 0);
384*46134Smao 
385*46134Smao 			if (where_h == left) {
386*46134Smao 				len = 0;	/* first key in tree is null */
387*46134Smao 			} else {
388*46134Smao 				if (data->d_flags & D_BIGKEY) {
389*46134Smao 					bcopy(&(data->d_bytes[0]),
390*46134Smao 					      (char *) &oldchain,
391*46134Smao 					      sizeof(oldchain));
392*46134Smao 					if (_bt_markchain(t, oldchain) == RET_ERROR)
393*46134Smao 						return (RET_ERROR);
394*46134Smao 					src = (char *) &oldchain;
395*46134Smao 					flags = D_BIGKEY;
396*46134Smao 				} else {
397*46134Smao 					src = (char *) &(data->d_bytes[0]);
398*46134Smao 					flags = 0;
399*46134Smao 				}
400*46134Smao 				len = data->d_ksize;
401*46134Smao 			}
402*46134Smao 		} else {
403*46134Smao 			idata = (IDATUM *) GETDATUM(where_h, 0);
404*46134Smao 			len = idata->i_size;
405*46134Smao 			flags = idata->i_flags;
406*46134Smao 			src = &(idata->i_bytes[0]);
407*46134Smao 		}
408*46134Smao 		dest = ((char *) h) + h->h_upper;
409*46134Smao 		nbytes = len + (sizeof (IDATUM) - sizeof(char));
410*46134Smao 		dest -= LONGALIGN(nbytes);
411*46134Smao 		id = (IDATUM *) dest;
412*46134Smao 		id->i_size = len;
413*46134Smao 		id->i_pgno = where_h->h_pgno;
414*46134Smao 		id->i_flags = flags;
415*46134Smao 		if (len > 0)
416*46134Smao 			(void) bcopy((char *) src, (char *) &(id->i_bytes[0]), len);
417*46134Smao 		dest -= ((int) h);
418*46134Smao 		h->h_linp[NEXTINDEX(h)] = (index_t) dest;
419*46134Smao 		h->h_upper = (index_t) dest;
420*46134Smao 		h->h_lower += sizeof(index_t);
421*46134Smao 
422*46134Smao 		/* next page */
423*46134Smao 		if (where_h == left)
424*46134Smao 			where_h = right;
425*46134Smao 		else
426*46134Smao 			where_h = (BTHEADER *) NULL;
427*46134Smao 	}
428*46134Smao 
429*46134Smao 	if (_bt_release(t, left) == RET_ERROR)
430*46134Smao 		return (RET_ERROR);
431*46134Smao 	if (_bt_release(t, right) == RET_ERROR)
432*46134Smao 		return (RET_ERROR);
433*46134Smao 
434*46134Smao 	/*
435*46134Smao 	 *  That's it, split is done.  If we're doing a non-cached disk
436*46134Smao 	 *  btree, we can free up the pages we allocated, as they're on
437*46134Smao 	 *  disk, now.
438*46134Smao 	 */
439*46134Smao 
440*46134Smao 	if (ISDISK(t) && !ISCACHE(t)) {
441*46134Smao 		(void) free ((char *) left);
442*46134Smao 		(void) free ((char *) right);
443*46134Smao 	}
444*46134Smao 
445*46134Smao 	h->h_flags |= F_DIRTY;
446*46134Smao 
447*46134Smao 	return (RET_SUCCESS);
448*46134Smao }
449*46134Smao 
450*46134Smao /*
451*46134Smao  *  _BT_DOPSPLIT -- Do the work of splitting a page
452*46134Smao  *
453*46134Smao  *	This routine takes two page pointers and splits the data on the
454*46134Smao  *	current page of the btree between them.
455*46134Smao  *
456*46134Smao  *	We do a lot of work here to handle duplicate keys on a page; we
457*46134Smao  *	have to place these keys carefully to guarantee that later searches
458*46134Smao  *	will find them correctly.  See comments in the code below for details.
459*46134Smao  *
460*46134Smao  *	Parameters:
461*46134Smao  *		t -- tree to split
462*46134Smao  *		left -- pointer to page to get left half of the data
463*46134Smao  *		right -- pointer to page to get right half of the data
464*46134Smao  *
465*46134Smao  *	Returns:
466*46134Smao  *		None.
467*46134Smao  */
468*46134Smao 
469*46134Smao int
470*46134Smao _bt_dopsplit(t, left, right)
471*46134Smao 	BTREE_P t;
472*46134Smao 	BTHEADER *left;
473*46134Smao 	BTHEADER *right;
474*46134Smao {
475*46134Smao 	BTHEADER *h = t->bt_curpage;
476*46134Smao 	size_t psize;
477*46134Smao 	char *where;
478*46134Smao 	BTHEADER *where_h;
479*46134Smao 	index_t where_i;
480*46134Smao 	int nbytes, dsize, fixedsize, freespc;
481*46134Smao 	index_t i;
482*46134Smao 	index_t save_lower, save_upper, save_i;
483*46134Smao 	index_t switch_i;
484*46134Smao 	char *save_key;
485*46134Smao 	DATUM *d;
486*46134Smao 	CURSOR *c;
487*46134Smao 	index_t top;
488*46134Smao 	int free_save;
489*46134Smao 	pgno_t chain;
490*46134Smao 	int ignore;
491*46134Smao 
492*46134Smao 	/*
493*46134Smao 	 *  Our strategy is to put half the bytes on each page.  We figure
494*46134Smao 	 *  out how many bytes we have total, and then move items until
495*46134Smao 	 *  the last item moved put at least 50% of the data on the left
496*46134Smao 	 *  page.
497*46134Smao 	 */
498*46134Smao 	save_key = (char *) NULL;
499*46134Smao 	psize = (int) t->bt_psize;
500*46134Smao 	where = ((char *) left) + psize;
501*46134Smao 	where_h = left;
502*46134Smao 	where_i = 0;
503*46134Smao 	nbytes = psize - (int) ((char *) &(h->h_linp[0]) - ((char *) h));
504*46134Smao 	freespc = nbytes;
505*46134Smao 
506*46134Smao 	top = NEXTINDEX(h);
507*46134Smao 	if (h->h_flags & F_LEAF)
508*46134Smao 		fixedsize = (sizeof(DATUM) - sizeof(char));
509*46134Smao 	else
510*46134Smao 		fixedsize = (sizeof(IDATUM) - sizeof(char));
511*46134Smao 
512*46134Smao 	save_key = (char *) NULL;
513*46134Smao 
514*46134Smao 	/* move data */
515*46134Smao 	for (i = 0; i < top; i++) {
516*46134Smao 
517*46134Smao 		/*
518*46134Smao 		 *  Internal and leaf pages have different layouts for
519*46134Smao 		 *  data items, but in both cases the first entry in the
520*46134Smao 		 *  data item is a size_t.
521*46134Smao 		 */
522*46134Smao 		d = (DATUM *) GETDATUM(h,i);
523*46134Smao 		if (h->h_flags & F_LEAF) {
524*46134Smao 			dsize = d->d_ksize + d->d_dsize + fixedsize;
525*46134Smao 		} else {
526*46134Smao 			dsize = d->d_ksize + fixedsize;
527*46134Smao 		}
528*46134Smao 
529*46134Smao 		/*
530*46134Smao 		 *  If a page contains duplicate keys, we have to be
531*46134Smao 		 *  careful about splits.  A sequence of duplicate keys
532*46134Smao 		 *  may not begin in the middle of one page and end in
533*46134Smao 		 *  the middle of another; it must begin on a page boundary,
534*46134Smao 		 *  in order for searches on the internal nodes to work
535*46134Smao 		 *  correctly.
536*46134Smao 		 */
537*46134Smao 		if (where_h == left) {
538*46134Smao 			if (save_key == (char *) NULL) {
539*46134Smao 				if (h->h_flags & F_LEAF) {
540*46134Smao 					if (d->d_flags & D_BIGKEY) {
541*46134Smao 						free_save = TRUE;
542*46134Smao 						bcopy(&(d->d_bytes[0]),
543*46134Smao 						     (char *) &chain,
544*46134Smao 						     sizeof(chain));
545*46134Smao 						if (_bt_getbig(t, chain,
546*46134Smao 							       &save_key,
547*46134Smao 							       &ignore)
548*46134Smao 						    == RET_ERROR)
549*46134Smao 							return (RET_ERROR);
550*46134Smao 					} else {
551*46134Smao 						free_save = FALSE;
552*46134Smao 						save_key = (char *) &(d->d_bytes[0]);
553*46134Smao 					}
554*46134Smao 				} else {
555*46134Smao 					IDATUM *id = (IDATUM *) d;
556*46134Smao 
557*46134Smao 					if (id->i_flags & D_BIGKEY) {
558*46134Smao 						free_save = TRUE;
559*46134Smao 						bcopy(&(id->i_bytes[0]),
560*46134Smao 						     (char *) &chain,
561*46134Smao 						     sizeof(chain));
562*46134Smao 						if (_bt_getbig(t, chain,
563*46134Smao 							       &save_key,
564*46134Smao 							       &ignore)
565*46134Smao 						    == RET_ERROR)
566*46134Smao 							return (RET_ERROR);
567*46134Smao 					} else {
568*46134Smao 						free_save = FALSE;
569*46134Smao 						save_key = (char *)
570*46134Smao 							&(id->i_bytes[0]);
571*46134Smao 					}
572*46134Smao 				}
573*46134Smao 				save_i = 0;
574*46134Smao 				save_lower = where_h->h_lower;
575*46134Smao 				save_upper = where_h->h_upper;
576*46134Smao 			} else {
577*46134Smao 				if (_bt_cmp(t, save_key, i) != 0) {
578*46134Smao 					save_lower = where_h->h_lower;
579*46134Smao 					save_upper = where_h->h_upper;
580*46134Smao 					save_i = i;
581*46134Smao 				}
582*46134Smao 				if (h->h_flags & F_LEAF) {
583*46134Smao 					if (free_save)
584*46134Smao 						(void) free(save_key);
585*46134Smao 					if (d->d_flags & D_BIGKEY) {
586*46134Smao 						free_save = TRUE;
587*46134Smao 						bcopy(&(d->d_bytes[0]),
588*46134Smao 						     (char *) &chain,
589*46134Smao 						     sizeof(chain));
590*46134Smao 						if (_bt_getbig(t, chain,
591*46134Smao 							       &save_key,
592*46134Smao 							       &ignore)
593*46134Smao 						    == RET_ERROR)
594*46134Smao 							return (RET_ERROR);
595*46134Smao 					} else {
596*46134Smao 						free_save = FALSE;
597*46134Smao 						save_key = (char *) &(d->d_bytes[0]);
598*46134Smao 					}
599*46134Smao 				} else {
600*46134Smao 					IDATUM *id = (IDATUM *) d;
601*46134Smao 
602*46134Smao 					if (id->i_flags & D_BIGKEY) {
603*46134Smao 						free_save = TRUE;
604*46134Smao 						bcopy(&(id->i_bytes[0]),
605*46134Smao 						     (char *) &chain,
606*46134Smao 						     sizeof(chain));
607*46134Smao 						if (_bt_getbig(t, chain,
608*46134Smao 							       &save_key,
609*46134Smao 							       &ignore)
610*46134Smao 						    == RET_ERROR)
611*46134Smao 							return (RET_ERROR);
612*46134Smao 					} else {
613*46134Smao 						free_save = FALSE;
614*46134Smao 						save_key = (char *)
615*46134Smao 							&(id->i_bytes[0]);
616*46134Smao 					}
617*46134Smao 				}
618*46134Smao 			}
619*46134Smao 		}
620*46134Smao 
621*46134Smao 		/* copy data and update page state */
622*46134Smao 		where -= LONGALIGN(dsize);
623*46134Smao 		(void) bcopy((char *) d, (char *) where, dsize);
624*46134Smao 		where_h->h_upper = where_h->h_linp[where_i] =
625*46134Smao 			(index_t) (where - (int) where_h);
626*46134Smao 		where_h->h_lower += sizeof(index_t);
627*46134Smao 		where_i++;
628*46134Smao 
629*46134Smao 		/* if we've moved half, switch to the right-hand page */
630*46134Smao 		nbytes -= LONGALIGN(dsize) + sizeof(index_t);
631*46134Smao 		if ((freespc - nbytes) > nbytes) {
632*46134Smao 			nbytes = 2 * freespc;
633*46134Smao 
634*46134Smao 			/* identical keys go on the same page */
635*46134Smao 			if (save_i > 0) {
636*46134Smao 				/* i gets incremented at loop bottom... */
637*46134Smao 				i = save_i - 1;
638*46134Smao 				where_h->h_lower = save_lower;
639*46134Smao 				where_h->h_upper = save_upper;
640*46134Smao 			}
641*46134Smao 			where = ((char *) right) + psize;
642*46134Smao 			where_h = right;
643*46134Smao 			switch_i = where_i;
644*46134Smao 			where_i = 0;
645*46134Smao 		}
646*46134Smao 	}
647*46134Smao 
648*46134Smao 	/*
649*46134Smao 	 *  If there was an active scan on the database, and we just
650*46134Smao 	 *  split the page that the cursor was on, we may need to
651*46134Smao 	 *  adjust the cursor to point to the same entry as before the
652*46134Smao 	 *  split.
653*46134Smao 	 */
654*46134Smao 
655*46134Smao 	c = &(t->bt_cursor);
656*46134Smao 	if ((t->bt_flags & BTF_SEQINIT)
657*46134Smao 	    && (c->c_pgno == h->h_pgno)
658*46134Smao 	    && (c->c_index >= switch_i)) {
659*46134Smao 		c->c_pgno = where_h->h_pgno;
660*46134Smao 		c->c_index -= where_i;
661*46134Smao 	}
662*46134Smao 	return (RET_SUCCESS);
663*46134Smao }
664