1*53b37aa9Sespie /* $OpenBSD: bt_split.c,v 1.13 2005/08/05 13:03:00 espie Exp $ */
21b727fc6Smillert
3df930be7Sderaadt /*-
4df930be7Sderaadt * Copyright (c) 1990, 1993, 1994
5df930be7Sderaadt * The Regents of the University of California. All rights reserved.
6df930be7Sderaadt *
7df930be7Sderaadt * This code is derived from software contributed to Berkeley by
8df930be7Sderaadt * Mike Olson.
9df930be7Sderaadt *
10df930be7Sderaadt * Redistribution and use in source and binary forms, with or without
11df930be7Sderaadt * modification, are permitted provided that the following conditions
12df930be7Sderaadt * are met:
13df930be7Sderaadt * 1. Redistributions of source code must retain the above copyright
14df930be7Sderaadt * notice, this list of conditions and the following disclaimer.
15df930be7Sderaadt * 2. Redistributions in binary form must reproduce the above copyright
16df930be7Sderaadt * notice, this list of conditions and the following disclaimer in the
17df930be7Sderaadt * documentation and/or other materials provided with the distribution.
186580fee3Smillert * 3. Neither the name of the University nor the names of its contributors
19df930be7Sderaadt * may be used to endorse or promote products derived from this software
20df930be7Sderaadt * without specific prior written permission.
21df930be7Sderaadt *
22df930be7Sderaadt * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23df930be7Sderaadt * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24df930be7Sderaadt * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25df930be7Sderaadt * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26df930be7Sderaadt * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27df930be7Sderaadt * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28df930be7Sderaadt * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29df930be7Sderaadt * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30df930be7Sderaadt * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31df930be7Sderaadt * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32df930be7Sderaadt * SUCH DAMAGE.
33df930be7Sderaadt */
34df930be7Sderaadt
35df930be7Sderaadt #include <sys/types.h>
36df930be7Sderaadt
37df930be7Sderaadt #include <limits.h>
38df930be7Sderaadt #include <stdio.h>
39df930be7Sderaadt #include <stdlib.h>
40df930be7Sderaadt #include <string.h>
41df930be7Sderaadt
42df930be7Sderaadt #include <db.h>
43df930be7Sderaadt #include "btree.h"
44df930be7Sderaadt
45c72b5b24Smillert static int bt_broot(BTREE *, PAGE *, PAGE *, PAGE *);
46df95a199Smillert static PAGE *bt_page(BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t);
47c72b5b24Smillert static int bt_preserve(BTREE *, pgno_t);
48df95a199Smillert static PAGE *bt_psplit(BTREE *, PAGE *, PAGE *, PAGE *, indx_t *, size_t);
49df95a199Smillert static PAGE *bt_root(BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t);
50c72b5b24Smillert static int bt_rroot(BTREE *, PAGE *, PAGE *, PAGE *);
51c72b5b24Smillert static recno_t rec_total(PAGE *);
52df930be7Sderaadt
53df930be7Sderaadt #ifdef STATISTICS
54df930be7Sderaadt u_long bt_rootsplit, bt_split, bt_sortsplit, bt_pfxsaved;
55df930be7Sderaadt #endif
56df930be7Sderaadt
57df930be7Sderaadt /*
58df930be7Sderaadt * __BT_SPLIT -- Split the tree.
59df930be7Sderaadt *
60df930be7Sderaadt * Parameters:
61df930be7Sderaadt * t: tree
62df930be7Sderaadt * sp: page to split
63df930be7Sderaadt * key: key to insert
64df930be7Sderaadt * data: data to insert
65df930be7Sderaadt * flags: BIGKEY/BIGDATA flags
66df930be7Sderaadt * ilen: insert length
67df930be7Sderaadt * skip: index to leave open
68df930be7Sderaadt *
69df930be7Sderaadt * Returns:
70df930be7Sderaadt * RET_ERROR, RET_SUCCESS
71df930be7Sderaadt */
72df930be7Sderaadt int
__bt_split(BTREE * t,PAGE * sp,const DBT * key,const DBT * data,int flags,size_t ilen,u_int32_t argskip)73e20a56a5Sotto __bt_split(BTREE *t, PAGE *sp, const DBT *key, const DBT *data, int flags,
74e20a56a5Sotto size_t ilen, u_int32_t argskip)
75df930be7Sderaadt {
76df930be7Sderaadt BINTERNAL *bi;
77df930be7Sderaadt BLEAF *bl, *tbl;
78df930be7Sderaadt DBT a, b;
79df930be7Sderaadt EPGNO *parent;
80df930be7Sderaadt PAGE *h, *l, *r, *lchild, *rchild;
81df930be7Sderaadt indx_t nxtindex;
82df930be7Sderaadt u_int16_t skip;
83df930be7Sderaadt u_int32_t n, nbytes, nksize;
84df930be7Sderaadt int parentsplit;
85df930be7Sderaadt char *dest;
86df930be7Sderaadt
87df930be7Sderaadt /*
88df930be7Sderaadt * Split the page into two pages, l and r. The split routines return
89df930be7Sderaadt * a pointer to the page into which the key should be inserted and with
90df930be7Sderaadt * skip set to the offset which should be used. Additionally, l and r
91df930be7Sderaadt * are pinned.
92df930be7Sderaadt */
93df930be7Sderaadt skip = argskip;
94df930be7Sderaadt h = sp->pgno == P_ROOT ?
95df930be7Sderaadt bt_root(t, sp, &l, &r, &skip, ilen) :
96df930be7Sderaadt bt_page(t, sp, &l, &r, &skip, ilen);
97df930be7Sderaadt if (h == NULL)
98df930be7Sderaadt return (RET_ERROR);
99df930be7Sderaadt
100df930be7Sderaadt /*
101df930be7Sderaadt * Insert the new key/data pair into the leaf page. (Key inserts
102df930be7Sderaadt * always cause a leaf page to split first.)
103df930be7Sderaadt */
104df930be7Sderaadt h->linp[skip] = h->upper -= ilen;
105df930be7Sderaadt dest = (char *)h + h->upper;
106bec2d00aSderaadt if (F_ISSET(t, R_RECNO))
107df930be7Sderaadt WR_RLEAF(dest, data, flags)
108df930be7Sderaadt else
109df930be7Sderaadt WR_BLEAF(dest, key, data, flags)
110df930be7Sderaadt
111df930be7Sderaadt /* If the root page was split, make it look right. */
112df930be7Sderaadt if (sp->pgno == P_ROOT &&
113bec2d00aSderaadt (F_ISSET(t, R_RECNO) ?
114df930be7Sderaadt bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
115df930be7Sderaadt goto err2;
116df930be7Sderaadt
117df930be7Sderaadt /*
118df930be7Sderaadt * Now we walk the parent page stack -- a LIFO stack of the pages that
119df930be7Sderaadt * were traversed when we searched for the page that split. Each stack
120df930be7Sderaadt * entry is a page number and a page index offset. The offset is for
121df930be7Sderaadt * the page traversed on the search. We've just split a page, so we
122df930be7Sderaadt * have to insert a new key into the parent page.
123df930be7Sderaadt *
124df930be7Sderaadt * If the insert into the parent page causes it to split, may have to
125df930be7Sderaadt * continue splitting all the way up the tree. We stop if the root
126df930be7Sderaadt * splits or the page inserted into didn't have to split to hold the
127df930be7Sderaadt * new key. Some algorithms replace the key for the old page as well
128df930be7Sderaadt * as the new page. We don't, as there's no reason to believe that the
129df930be7Sderaadt * first key on the old page is any better than the key we have, and,
130df930be7Sderaadt * in the case of a key being placed at index 0 causing the split, the
131df930be7Sderaadt * key is unavailable.
132df930be7Sderaadt *
133df930be7Sderaadt * There are a maximum of 5 pages pinned at any time. We keep the left
134df930be7Sderaadt * and right pages pinned while working on the parent. The 5 are the
135df930be7Sderaadt * two children, left parent and right parent (when the parent splits)
136df930be7Sderaadt * and the root page or the overflow key page when calling bt_preserve.
137df930be7Sderaadt * This code must make sure that all pins are released other than the
138df930be7Sderaadt * root page or overflow page which is unlocked elsewhere.
139df930be7Sderaadt */
140df930be7Sderaadt while ((parent = BT_POP(t)) != NULL) {
141df930be7Sderaadt lchild = l;
142df930be7Sderaadt rchild = r;
143df930be7Sderaadt
144df930be7Sderaadt /* Get the parent page. */
145df930be7Sderaadt if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
146df930be7Sderaadt goto err2;
147df930be7Sderaadt
148df930be7Sderaadt /*
149df930be7Sderaadt * The new key goes ONE AFTER the index, because the split
150df930be7Sderaadt * was to the right.
151df930be7Sderaadt */
152df930be7Sderaadt skip = parent->index + 1;
153df930be7Sderaadt
154df930be7Sderaadt /*
155df930be7Sderaadt * Calculate the space needed on the parent page.
156df930be7Sderaadt *
157df930be7Sderaadt * Prefix trees: space hack when inserting into BINTERNAL
158df930be7Sderaadt * pages. Retain only what's needed to distinguish between
159df930be7Sderaadt * the new entry and the LAST entry on the page to its left.
160df930be7Sderaadt * If the keys compare equal, retain the entire key. Note,
161df930be7Sderaadt * we don't touch overflow keys, and the entire key must be
162df930be7Sderaadt * retained for the next-to-left most key on the leftmost
163df930be7Sderaadt * page of each level, or the search will fail. Applicable
164df930be7Sderaadt * ONLY to internal pages that have leaf pages as children.
165df930be7Sderaadt * Further reduction of the key between pairs of internal
166df930be7Sderaadt * pages loses too much information.
167df930be7Sderaadt */
168df930be7Sderaadt switch (rchild->flags & P_TYPE) {
169df930be7Sderaadt case P_BINTERNAL:
170df930be7Sderaadt bi = GETBINTERNAL(rchild, 0);
171df930be7Sderaadt nbytes = NBINTERNAL(bi->ksize);
172df930be7Sderaadt break;
173df930be7Sderaadt case P_BLEAF:
174df930be7Sderaadt bl = GETBLEAF(rchild, 0);
175df930be7Sderaadt nbytes = NBINTERNAL(bl->ksize);
176df930be7Sderaadt if (t->bt_pfx && !(bl->flags & P_BIGKEY) &&
177df930be7Sderaadt (h->prevpg != P_INVALID || skip > 1)) {
178df930be7Sderaadt tbl = GETBLEAF(lchild, NEXTINDEX(lchild) - 1);
179df930be7Sderaadt a.size = tbl->ksize;
180df930be7Sderaadt a.data = tbl->bytes;
181df930be7Sderaadt b.size = bl->ksize;
182df930be7Sderaadt b.data = bl->bytes;
183df930be7Sderaadt nksize = t->bt_pfx(&a, &b);
184df930be7Sderaadt n = NBINTERNAL(nksize);
185df930be7Sderaadt if (n < nbytes) {
186df930be7Sderaadt #ifdef STATISTICS
187df930be7Sderaadt bt_pfxsaved += nbytes - n;
188df930be7Sderaadt #endif
189df930be7Sderaadt nbytes = n;
190df930be7Sderaadt } else
191df930be7Sderaadt nksize = 0;
192df930be7Sderaadt } else
193df930be7Sderaadt nksize = 0;
194df930be7Sderaadt break;
195df930be7Sderaadt case P_RINTERNAL:
196df930be7Sderaadt case P_RLEAF:
197df930be7Sderaadt nbytes = NRINTERNAL;
198df930be7Sderaadt break;
199df930be7Sderaadt default:
200df930be7Sderaadt abort();
201df930be7Sderaadt }
202df930be7Sderaadt
203df930be7Sderaadt /* Split the parent page if necessary or shift the indices. */
204df930be7Sderaadt if (h->upper - h->lower < nbytes + sizeof(indx_t)) {
205df930be7Sderaadt sp = h;
206df930be7Sderaadt h = h->pgno == P_ROOT ?
207df930be7Sderaadt bt_root(t, h, &l, &r, &skip, nbytes) :
208df930be7Sderaadt bt_page(t, h, &l, &r, &skip, nbytes);
209df930be7Sderaadt if (h == NULL)
210df930be7Sderaadt goto err1;
211df930be7Sderaadt parentsplit = 1;
212df930be7Sderaadt } else {
213df930be7Sderaadt if (skip < (nxtindex = NEXTINDEX(h)))
214df930be7Sderaadt memmove(h->linp + skip + 1, h->linp + skip,
215df930be7Sderaadt (nxtindex - skip) * sizeof(indx_t));
216df930be7Sderaadt h->lower += sizeof(indx_t);
217df930be7Sderaadt parentsplit = 0;
218df930be7Sderaadt }
219df930be7Sderaadt
220df930be7Sderaadt /* Insert the key into the parent page. */
221df930be7Sderaadt switch (rchild->flags & P_TYPE) {
222df930be7Sderaadt case P_BINTERNAL:
223df930be7Sderaadt h->linp[skip] = h->upper -= nbytes;
224df930be7Sderaadt dest = (char *)h + h->linp[skip];
225df930be7Sderaadt memmove(dest, bi, nbytes);
226df930be7Sderaadt ((BINTERNAL *)dest)->pgno = rchild->pgno;
227df930be7Sderaadt break;
228df930be7Sderaadt case P_BLEAF:
229df930be7Sderaadt h->linp[skip] = h->upper -= nbytes;
230df930be7Sderaadt dest = (char *)h + h->linp[skip];
231df930be7Sderaadt WR_BINTERNAL(dest, nksize ? nksize : bl->ksize,
232df930be7Sderaadt rchild->pgno, bl->flags & P_BIGKEY);
233df930be7Sderaadt memmove(dest, bl->bytes, nksize ? nksize : bl->ksize);
234df930be7Sderaadt if (bl->flags & P_BIGKEY &&
235df930be7Sderaadt bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR)
236df930be7Sderaadt goto err1;
237df930be7Sderaadt break;
238df930be7Sderaadt case P_RINTERNAL:
239df930be7Sderaadt /*
240df930be7Sderaadt * Update the left page count. If split
241df930be7Sderaadt * added at index 0, fix the correct page.
242df930be7Sderaadt */
243df930be7Sderaadt if (skip > 0)
244df930be7Sderaadt dest = (char *)h + h->linp[skip - 1];
245df930be7Sderaadt else
246df930be7Sderaadt dest = (char *)l + l->linp[NEXTINDEX(l) - 1];
247df930be7Sderaadt ((RINTERNAL *)dest)->nrecs = rec_total(lchild);
248df930be7Sderaadt ((RINTERNAL *)dest)->pgno = lchild->pgno;
249df930be7Sderaadt
250df930be7Sderaadt /* Update the right page count. */
251df930be7Sderaadt h->linp[skip] = h->upper -= nbytes;
252df930be7Sderaadt dest = (char *)h + h->linp[skip];
253df930be7Sderaadt ((RINTERNAL *)dest)->nrecs = rec_total(rchild);
254df930be7Sderaadt ((RINTERNAL *)dest)->pgno = rchild->pgno;
255df930be7Sderaadt break;
256df930be7Sderaadt case P_RLEAF:
257df930be7Sderaadt /*
258df930be7Sderaadt * Update the left page count. If split
259df930be7Sderaadt * added at index 0, fix the correct page.
260df930be7Sderaadt */
261df930be7Sderaadt if (skip > 0)
262df930be7Sderaadt dest = (char *)h + h->linp[skip - 1];
263df930be7Sderaadt else
264df930be7Sderaadt dest = (char *)l + l->linp[NEXTINDEX(l) - 1];
265df930be7Sderaadt ((RINTERNAL *)dest)->nrecs = NEXTINDEX(lchild);
266df930be7Sderaadt ((RINTERNAL *)dest)->pgno = lchild->pgno;
267df930be7Sderaadt
268df930be7Sderaadt /* Update the right page count. */
269df930be7Sderaadt h->linp[skip] = h->upper -= nbytes;
270df930be7Sderaadt dest = (char *)h + h->linp[skip];
271df930be7Sderaadt ((RINTERNAL *)dest)->nrecs = NEXTINDEX(rchild);
272df930be7Sderaadt ((RINTERNAL *)dest)->pgno = rchild->pgno;
273df930be7Sderaadt break;
274df930be7Sderaadt default:
275df930be7Sderaadt abort();
276df930be7Sderaadt }
277df930be7Sderaadt
278df930be7Sderaadt /* Unpin the held pages. */
279df930be7Sderaadt if (!parentsplit) {
280df930be7Sderaadt mpool_put(t->bt_mp, h, MPOOL_DIRTY);
281df930be7Sderaadt break;
282df930be7Sderaadt }
283df930be7Sderaadt
284df930be7Sderaadt /* If the root page was split, make it look right. */
285df930be7Sderaadt if (sp->pgno == P_ROOT &&
286bec2d00aSderaadt (F_ISSET(t, R_RECNO) ?
287df930be7Sderaadt bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
288df930be7Sderaadt goto err1;
289df930be7Sderaadt
290df930be7Sderaadt mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
291df930be7Sderaadt mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
292df930be7Sderaadt }
293df930be7Sderaadt
294df930be7Sderaadt /* Unpin the held pages. */
295df930be7Sderaadt mpool_put(t->bt_mp, l, MPOOL_DIRTY);
296df930be7Sderaadt mpool_put(t->bt_mp, r, MPOOL_DIRTY);
297df930be7Sderaadt
298df930be7Sderaadt /* Clear any pages left on the stack. */
299df930be7Sderaadt return (RET_SUCCESS);
300df930be7Sderaadt
301df930be7Sderaadt /*
302df930be7Sderaadt * If something fails in the above loop we were already walking back
303df930be7Sderaadt * up the tree and the tree is now inconsistent. Nothing much we can
304df930be7Sderaadt * do about it but release any memory we're holding.
305df930be7Sderaadt */
306df930be7Sderaadt err1: mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
307df930be7Sderaadt mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
308df930be7Sderaadt
309df930be7Sderaadt err2: mpool_put(t->bt_mp, l, 0);
310df930be7Sderaadt mpool_put(t->bt_mp, r, 0);
311df930be7Sderaadt __dbpanic(t->bt_dbp);
312df930be7Sderaadt return (RET_ERROR);
313df930be7Sderaadt }
314df930be7Sderaadt
315df930be7Sderaadt /*
316df930be7Sderaadt * BT_PAGE -- Split a non-root page of a btree.
317df930be7Sderaadt *
318df930be7Sderaadt * Parameters:
319df930be7Sderaadt * t: tree
320df930be7Sderaadt * h: root page
321df930be7Sderaadt * lp: pointer to left page pointer
322df930be7Sderaadt * rp: pointer to right page pointer
323df930be7Sderaadt * skip: pointer to index to leave open
324df930be7Sderaadt * ilen: insert length
325df930be7Sderaadt *
326df930be7Sderaadt * Returns:
327df930be7Sderaadt * Pointer to page in which to insert or NULL on error.
328df930be7Sderaadt */
329df930be7Sderaadt static PAGE *
bt_page(BTREE * t,PAGE * h,PAGE ** lp,PAGE ** rp,indx_t * skip,size_t ilen)330e20a56a5Sotto bt_page(BTREE *t, PAGE *h, PAGE **lp, PAGE **rp, indx_t *skip, size_t ilen)
331df930be7Sderaadt {
332df930be7Sderaadt PAGE *l, *r, *tp;
333df930be7Sderaadt pgno_t npg;
334df930be7Sderaadt
335df930be7Sderaadt #ifdef STATISTICS
336df930be7Sderaadt ++bt_split;
337df930be7Sderaadt #endif
338df930be7Sderaadt /* Put the new right page for the split into place. */
339df930be7Sderaadt if ((r = __bt_new(t, &npg)) == NULL)
340df930be7Sderaadt return (NULL);
341df930be7Sderaadt r->pgno = npg;
342df930be7Sderaadt r->lower = BTDATAOFF;
343df930be7Sderaadt r->upper = t->bt_psize;
344df930be7Sderaadt r->nextpg = h->nextpg;
345df930be7Sderaadt r->prevpg = h->pgno;
346df930be7Sderaadt r->flags = h->flags & P_TYPE;
347df930be7Sderaadt
348df930be7Sderaadt /*
349df930be7Sderaadt * If we're splitting the last page on a level because we're appending
350df930be7Sderaadt * a key to it (skip is NEXTINDEX()), it's likely that the data is
351df930be7Sderaadt * sorted. Adding an empty page on the side of the level is less work
352df930be7Sderaadt * and can push the fill factor much higher than normal. If we're
353df930be7Sderaadt * wrong it's no big deal, we'll just do the split the right way next
354df930be7Sderaadt * time. It may look like it's equally easy to do a similar hack for
355df930be7Sderaadt * reverse sorted data, that is, split the tree left, but it's not.
356df930be7Sderaadt * Don't even try.
357df930be7Sderaadt */
358df930be7Sderaadt if (h->nextpg == P_INVALID && *skip == NEXTINDEX(h)) {
359df930be7Sderaadt #ifdef STATISTICS
360df930be7Sderaadt ++bt_sortsplit;
361df930be7Sderaadt #endif
362df930be7Sderaadt h->nextpg = r->pgno;
363df930be7Sderaadt r->lower = BTDATAOFF + sizeof(indx_t);
364df930be7Sderaadt *skip = 0;
365df930be7Sderaadt *lp = h;
366df930be7Sderaadt *rp = r;
367df930be7Sderaadt return (r);
368df930be7Sderaadt }
369df930be7Sderaadt
370df930be7Sderaadt /* Put the new left page for the split into place. */
371df930be7Sderaadt if ((l = (PAGE *)malloc(t->bt_psize)) == NULL) {
372df930be7Sderaadt mpool_put(t->bt_mp, r, 0);
373df930be7Sderaadt return (NULL);
374df930be7Sderaadt }
375bec2d00aSderaadt memset(l, 0xff, t->bt_psize);
376df930be7Sderaadt l->pgno = h->pgno;
377df930be7Sderaadt l->nextpg = r->pgno;
378df930be7Sderaadt l->prevpg = h->prevpg;
379df930be7Sderaadt l->lower = BTDATAOFF;
380df930be7Sderaadt l->upper = t->bt_psize;
381df930be7Sderaadt l->flags = h->flags & P_TYPE;
382df930be7Sderaadt
383df930be7Sderaadt /* Fix up the previous pointer of the page after the split page. */
384df930be7Sderaadt if (h->nextpg != P_INVALID) {
385df930be7Sderaadt if ((tp = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) {
386df930be7Sderaadt free(l);
387df930be7Sderaadt /* XXX mpool_free(t->bt_mp, r->pgno); */
388df930be7Sderaadt return (NULL);
389df930be7Sderaadt }
390df930be7Sderaadt tp->prevpg = r->pgno;
391bec2d00aSderaadt mpool_put(t->bt_mp, tp, MPOOL_DIRTY);
392df930be7Sderaadt }
393df930be7Sderaadt
394df930be7Sderaadt /*
395df930be7Sderaadt * Split right. The key/data pairs aren't sorted in the btree page so
396df930be7Sderaadt * it's simpler to copy the data from the split page onto two new pages
397df930be7Sderaadt * instead of copying half the data to the right page and compacting
398df930be7Sderaadt * the left page in place. Since the left page can't change, we have
399df930be7Sderaadt * to swap the original and the allocated left page after the split.
400df930be7Sderaadt */
401df930be7Sderaadt tp = bt_psplit(t, h, l, r, skip, ilen);
402df930be7Sderaadt
403df930be7Sderaadt /* Move the new left page onto the old left page. */
404df930be7Sderaadt memmove(h, l, t->bt_psize);
405df930be7Sderaadt if (tp == l)
406df930be7Sderaadt tp = h;
407df930be7Sderaadt free(l);
408df930be7Sderaadt
409df930be7Sderaadt *lp = h;
410df930be7Sderaadt *rp = r;
411df930be7Sderaadt return (tp);
412df930be7Sderaadt }
413df930be7Sderaadt
414df930be7Sderaadt /*
415df930be7Sderaadt * BT_ROOT -- Split the root page of a btree.
416df930be7Sderaadt *
417df930be7Sderaadt * Parameters:
418df930be7Sderaadt * t: tree
419df930be7Sderaadt * h: root page
420df930be7Sderaadt * lp: pointer to left page pointer
421df930be7Sderaadt * rp: pointer to right page pointer
422df930be7Sderaadt * skip: pointer to index to leave open
423df930be7Sderaadt * ilen: insert length
424df930be7Sderaadt *
425df930be7Sderaadt * Returns:
426df930be7Sderaadt * Pointer to page in which to insert or NULL on error.
427df930be7Sderaadt */
428df930be7Sderaadt static PAGE *
bt_root(BTREE * t,PAGE * h,PAGE ** lp,PAGE ** rp,indx_t * skip,size_t ilen)429e20a56a5Sotto bt_root(BTREE *t, PAGE *h, PAGE **lp, PAGE **rp, indx_t *skip, size_t ilen)
430df930be7Sderaadt {
431df930be7Sderaadt PAGE *l, *r, *tp;
432df930be7Sderaadt pgno_t lnpg, rnpg;
433df930be7Sderaadt
434df930be7Sderaadt #ifdef STATISTICS
435df930be7Sderaadt ++bt_split;
436df930be7Sderaadt ++bt_rootsplit;
437df930be7Sderaadt #endif
438df930be7Sderaadt /* Put the new left and right pages for the split into place. */
439df930be7Sderaadt if ((l = __bt_new(t, &lnpg)) == NULL ||
440df930be7Sderaadt (r = __bt_new(t, &rnpg)) == NULL)
441df930be7Sderaadt return (NULL);
442df930be7Sderaadt l->pgno = lnpg;
443df930be7Sderaadt r->pgno = rnpg;
444df930be7Sderaadt l->nextpg = r->pgno;
445df930be7Sderaadt r->prevpg = l->pgno;
446df930be7Sderaadt l->prevpg = r->nextpg = P_INVALID;
447df930be7Sderaadt l->lower = r->lower = BTDATAOFF;
448df930be7Sderaadt l->upper = r->upper = t->bt_psize;
449df930be7Sderaadt l->flags = r->flags = h->flags & P_TYPE;
450df930be7Sderaadt
451df930be7Sderaadt /* Split the root page. */
452df930be7Sderaadt tp = bt_psplit(t, h, l, r, skip, ilen);
453df930be7Sderaadt
454df930be7Sderaadt *lp = l;
455df930be7Sderaadt *rp = r;
456df930be7Sderaadt return (tp);
457df930be7Sderaadt }
458df930be7Sderaadt
459df930be7Sderaadt /*
460df930be7Sderaadt * BT_RROOT -- Fix up the recno root page after it has been split.
461df930be7Sderaadt *
462df930be7Sderaadt * Parameters:
463df930be7Sderaadt * t: tree
464df930be7Sderaadt * h: root page
465df930be7Sderaadt * l: left page
466df930be7Sderaadt * r: right page
467df930be7Sderaadt *
468df930be7Sderaadt * Returns:
469df930be7Sderaadt * RET_ERROR, RET_SUCCESS
470df930be7Sderaadt */
471df930be7Sderaadt static int
bt_rroot(BTREE * t,PAGE * h,PAGE * l,PAGE * r)472e20a56a5Sotto bt_rroot(BTREE *t, PAGE *h, PAGE *l, PAGE *r)
473df930be7Sderaadt {
474df930be7Sderaadt char *dest;
475df930be7Sderaadt
476df930be7Sderaadt /* Insert the left and right keys, set the header information. */
477df930be7Sderaadt h->linp[0] = h->upper = t->bt_psize - NRINTERNAL;
478df930be7Sderaadt dest = (char *)h + h->upper;
479df930be7Sderaadt WR_RINTERNAL(dest,
480df930be7Sderaadt l->flags & P_RLEAF ? NEXTINDEX(l) : rec_total(l), l->pgno);
481df930be7Sderaadt
482df930be7Sderaadt h->linp[1] = h->upper -= NRINTERNAL;
483df930be7Sderaadt dest = (char *)h + h->upper;
484df930be7Sderaadt WR_RINTERNAL(dest,
485df930be7Sderaadt r->flags & P_RLEAF ? NEXTINDEX(r) : rec_total(r), r->pgno);
486df930be7Sderaadt
487df930be7Sderaadt h->lower = BTDATAOFF + 2 * sizeof(indx_t);
488df930be7Sderaadt
489df930be7Sderaadt /* Unpin the root page, set to recno internal page. */
490df930be7Sderaadt h->flags &= ~P_TYPE;
491df930be7Sderaadt h->flags |= P_RINTERNAL;
492df930be7Sderaadt mpool_put(t->bt_mp, h, MPOOL_DIRTY);
493df930be7Sderaadt
494df930be7Sderaadt return (RET_SUCCESS);
495df930be7Sderaadt }
496df930be7Sderaadt
497df930be7Sderaadt /*
498df930be7Sderaadt * BT_BROOT -- Fix up the btree root page after it has been split.
499df930be7Sderaadt *
500df930be7Sderaadt * Parameters:
501df930be7Sderaadt * t: tree
502df930be7Sderaadt * h: root page
503df930be7Sderaadt * l: left page
504df930be7Sderaadt * r: right page
505df930be7Sderaadt *
506df930be7Sderaadt * Returns:
507df930be7Sderaadt * RET_ERROR, RET_SUCCESS
508df930be7Sderaadt */
509df930be7Sderaadt static int
bt_broot(BTREE * t,PAGE * h,PAGE * l,PAGE * r)510e20a56a5Sotto bt_broot(BTREE *t, PAGE *h, PAGE *l, PAGE *r)
511df930be7Sderaadt {
512df930be7Sderaadt BINTERNAL *bi;
513df930be7Sderaadt BLEAF *bl;
514df930be7Sderaadt u_int32_t nbytes;
515df930be7Sderaadt char *dest;
516df930be7Sderaadt
517df930be7Sderaadt /*
518df930be7Sderaadt * If the root page was a leaf page, change it into an internal page.
519df930be7Sderaadt * We copy the key we split on (but not the key's data, in the case of
520df930be7Sderaadt * a leaf page) to the new root page.
521df930be7Sderaadt *
522df930be7Sderaadt * The btree comparison code guarantees that the left-most key on any
523df930be7Sderaadt * level of the tree is never used, so it doesn't need to be filled in.
524df930be7Sderaadt */
525df930be7Sderaadt nbytes = NBINTERNAL(0);
526df930be7Sderaadt h->linp[0] = h->upper = t->bt_psize - nbytes;
527df930be7Sderaadt dest = (char *)h + h->upper;
528df930be7Sderaadt WR_BINTERNAL(dest, 0, l->pgno, 0);
529df930be7Sderaadt
530df930be7Sderaadt switch (h->flags & P_TYPE) {
531df930be7Sderaadt case P_BLEAF:
532df930be7Sderaadt bl = GETBLEAF(r, 0);
533df930be7Sderaadt nbytes = NBINTERNAL(bl->ksize);
534df930be7Sderaadt h->linp[1] = h->upper -= nbytes;
535df930be7Sderaadt dest = (char *)h + h->upper;
536df930be7Sderaadt WR_BINTERNAL(dest, bl->ksize, r->pgno, 0);
537df930be7Sderaadt memmove(dest, bl->bytes, bl->ksize);
538df930be7Sderaadt
539df930be7Sderaadt /*
540df930be7Sderaadt * If the key is on an overflow page, mark the overflow chain
541df930be7Sderaadt * so it isn't deleted when the leaf copy of the key is deleted.
542df930be7Sderaadt */
543df930be7Sderaadt if (bl->flags & P_BIGKEY &&
544df930be7Sderaadt bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR)
545df930be7Sderaadt return (RET_ERROR);
546df930be7Sderaadt break;
547df930be7Sderaadt case P_BINTERNAL:
548df930be7Sderaadt bi = GETBINTERNAL(r, 0);
549df930be7Sderaadt nbytes = NBINTERNAL(bi->ksize);
550df930be7Sderaadt h->linp[1] = h->upper -= nbytes;
551df930be7Sderaadt dest = (char *)h + h->upper;
552df930be7Sderaadt memmove(dest, bi, nbytes);
553df930be7Sderaadt ((BINTERNAL *)dest)->pgno = r->pgno;
554df930be7Sderaadt break;
555df930be7Sderaadt default:
556df930be7Sderaadt abort();
557df930be7Sderaadt }
558df930be7Sderaadt
559df930be7Sderaadt /* There are two keys on the page. */
560df930be7Sderaadt h->lower = BTDATAOFF + 2 * sizeof(indx_t);
561df930be7Sderaadt
562df930be7Sderaadt /* Unpin the root page, set to btree internal page. */
563df930be7Sderaadt h->flags &= ~P_TYPE;
564df930be7Sderaadt h->flags |= P_BINTERNAL;
565df930be7Sderaadt mpool_put(t->bt_mp, h, MPOOL_DIRTY);
566df930be7Sderaadt
567df930be7Sderaadt return (RET_SUCCESS);
568df930be7Sderaadt }
569df930be7Sderaadt
570df930be7Sderaadt /*
571df930be7Sderaadt * BT_PSPLIT -- Do the real work of splitting the page.
572df930be7Sderaadt *
573df930be7Sderaadt * Parameters:
574df930be7Sderaadt * t: tree
575df930be7Sderaadt * h: page to be split
576df930be7Sderaadt * l: page to put lower half of data
577df930be7Sderaadt * r: page to put upper half of data
578df930be7Sderaadt * pskip: pointer to index to leave open
579df930be7Sderaadt * ilen: insert length
580df930be7Sderaadt *
581df930be7Sderaadt * Returns:
582df930be7Sderaadt * Pointer to page in which to insert.
583df930be7Sderaadt */
584df930be7Sderaadt static PAGE *
bt_psplit(BTREE * t,PAGE * h,PAGE * l,PAGE * r,indx_t * pskip,size_t ilen)585e20a56a5Sotto bt_psplit(BTREE *t, PAGE *h, PAGE *l, PAGE *r, indx_t *pskip, size_t ilen)
586df930be7Sderaadt {
587df930be7Sderaadt BINTERNAL *bi;
588df930be7Sderaadt BLEAF *bl;
589bec2d00aSderaadt CURSOR *c;
590df930be7Sderaadt RLEAF *rl;
591df930be7Sderaadt PAGE *rval;
592df930be7Sderaadt void *src;
593df930be7Sderaadt indx_t full, half, nxt, off, skip, top, used;
594df930be7Sderaadt u_int32_t nbytes;
595df930be7Sderaadt int bigkeycnt, isbigkey;
596df930be7Sderaadt
597df930be7Sderaadt /*
598df930be7Sderaadt * Split the data to the left and right pages. Leave the skip index
599df930be7Sderaadt * open. Additionally, make some effort not to split on an overflow
600df930be7Sderaadt * key. This makes internal page processing faster and can save
601df930be7Sderaadt * space as overflow keys used by internal pages are never deleted.
602df930be7Sderaadt */
603df930be7Sderaadt bigkeycnt = 0;
604df930be7Sderaadt skip = *pskip;
605df930be7Sderaadt full = t->bt_psize - BTDATAOFF;
606df930be7Sderaadt half = full / 2;
607df930be7Sderaadt used = 0;
608df930be7Sderaadt for (nxt = off = 0, top = NEXTINDEX(h); nxt < top; ++off) {
609df930be7Sderaadt if (skip == off) {
610df930be7Sderaadt nbytes = ilen;
611df930be7Sderaadt isbigkey = 0; /* XXX: not really known. */
612df930be7Sderaadt } else
613df930be7Sderaadt switch (h->flags & P_TYPE) {
614df930be7Sderaadt case P_BINTERNAL:
615df930be7Sderaadt src = bi = GETBINTERNAL(h, nxt);
616df930be7Sderaadt nbytes = NBINTERNAL(bi->ksize);
617df930be7Sderaadt isbigkey = bi->flags & P_BIGKEY;
618df930be7Sderaadt break;
619df930be7Sderaadt case P_BLEAF:
620df930be7Sderaadt src = bl = GETBLEAF(h, nxt);
621df930be7Sderaadt nbytes = NBLEAF(bl);
622df930be7Sderaadt isbigkey = bl->flags & P_BIGKEY;
623df930be7Sderaadt break;
624df930be7Sderaadt case P_RINTERNAL:
625df930be7Sderaadt src = GETRINTERNAL(h, nxt);
626df930be7Sderaadt nbytes = NRINTERNAL;
627df930be7Sderaadt isbigkey = 0;
628df930be7Sderaadt break;
629df930be7Sderaadt case P_RLEAF:
630df930be7Sderaadt src = rl = GETRLEAF(h, nxt);
631df930be7Sderaadt nbytes = NRLEAF(rl);
632df930be7Sderaadt isbigkey = 0;
633df930be7Sderaadt break;
634df930be7Sderaadt default:
635df930be7Sderaadt abort();
636df930be7Sderaadt }
637df930be7Sderaadt
638df930be7Sderaadt /*
639df930be7Sderaadt * If the key/data pairs are substantial fractions of the max
640df930be7Sderaadt * possible size for the page, it's possible to get situations
641df930be7Sderaadt * where we decide to try and copy too much onto the left page.
642df930be7Sderaadt * Make sure that doesn't happen.
643df930be7Sderaadt */
644eab84c55Sderaadt if ((skip <= off && used + nbytes + sizeof(indx_t) >= full) ||
645eab84c55Sderaadt nxt == top - 1) {
646df930be7Sderaadt --off;
647df930be7Sderaadt break;
648df930be7Sderaadt }
649df930be7Sderaadt
650df930be7Sderaadt /* Copy the key/data pair, if not the skipped index. */
651df930be7Sderaadt if (skip != off) {
652df930be7Sderaadt ++nxt;
653df930be7Sderaadt
654df930be7Sderaadt l->linp[off] = l->upper -= nbytes;
655df930be7Sderaadt memmove((char *)l + l->upper, src, nbytes);
656df930be7Sderaadt }
657df930be7Sderaadt
658eab84c55Sderaadt used += nbytes + sizeof(indx_t);
659df930be7Sderaadt if (used >= half) {
660df930be7Sderaadt if (!isbigkey || bigkeycnt == 3)
661df930be7Sderaadt break;
662df930be7Sderaadt else
663df930be7Sderaadt ++bigkeycnt;
664df930be7Sderaadt }
665df930be7Sderaadt }
666df930be7Sderaadt
667df930be7Sderaadt /*
668df930be7Sderaadt * Off is the last offset that's valid for the left page.
669df930be7Sderaadt * Nxt is the first offset to be placed on the right page.
670df930be7Sderaadt */
671df930be7Sderaadt l->lower += (off + 1) * sizeof(indx_t);
672df930be7Sderaadt
673df930be7Sderaadt /*
674df930be7Sderaadt * If splitting the page that the cursor was on, the cursor has to be
675df930be7Sderaadt * adjusted to point to the same record as before the split. If the
676df930be7Sderaadt * cursor is at or past the skipped slot, the cursor is incremented by
677df930be7Sderaadt * one. If the cursor is on the right page, it is decremented by the
678df930be7Sderaadt * number of records split to the left page.
679df930be7Sderaadt */
680bec2d00aSderaadt c = &t->bt_cursor;
681bec2d00aSderaadt if (F_ISSET(c, CURS_INIT) && c->pg.pgno == h->pgno) {
682bec2d00aSderaadt if (c->pg.index >= skip)
683bec2d00aSderaadt ++c->pg.index;
684bec2d00aSderaadt if (c->pg.index < nxt) /* Left page. */
685bec2d00aSderaadt c->pg.pgno = l->pgno;
686df930be7Sderaadt else { /* Right page. */
687bec2d00aSderaadt c->pg.pgno = r->pgno;
688bec2d00aSderaadt c->pg.index -= nxt;
689df930be7Sderaadt }
690df930be7Sderaadt }
691df930be7Sderaadt
692df930be7Sderaadt /*
693df930be7Sderaadt * If the skipped index was on the left page, just return that page.
694df930be7Sderaadt * Otherwise, adjust the skip index to reflect the new position on
695df930be7Sderaadt * the right page.
696df930be7Sderaadt */
697df930be7Sderaadt if (skip <= off) {
6984da6d40eSmillert skip = MAX_PAGE_OFFSET;
699df930be7Sderaadt rval = l;
700df930be7Sderaadt } else {
701df930be7Sderaadt rval = r;
702df930be7Sderaadt *pskip -= nxt;
703df930be7Sderaadt }
704df930be7Sderaadt
705df930be7Sderaadt for (off = 0; nxt < top; ++off) {
706df930be7Sderaadt if (skip == nxt) {
707df930be7Sderaadt ++off;
7084da6d40eSmillert skip = MAX_PAGE_OFFSET;
709df930be7Sderaadt }
710df930be7Sderaadt switch (h->flags & P_TYPE) {
711df930be7Sderaadt case P_BINTERNAL:
712df930be7Sderaadt src = bi = GETBINTERNAL(h, nxt);
713df930be7Sderaadt nbytes = NBINTERNAL(bi->ksize);
714df930be7Sderaadt break;
715df930be7Sderaadt case P_BLEAF:
716df930be7Sderaadt src = bl = GETBLEAF(h, nxt);
717df930be7Sderaadt nbytes = NBLEAF(bl);
718df930be7Sderaadt break;
719df930be7Sderaadt case P_RINTERNAL:
720df930be7Sderaadt src = GETRINTERNAL(h, nxt);
721df930be7Sderaadt nbytes = NRINTERNAL;
722df930be7Sderaadt break;
723df930be7Sderaadt case P_RLEAF:
724df930be7Sderaadt src = rl = GETRLEAF(h, nxt);
725df930be7Sderaadt nbytes = NRLEAF(rl);
726df930be7Sderaadt break;
727df930be7Sderaadt default:
728df930be7Sderaadt abort();
729df930be7Sderaadt }
730df930be7Sderaadt ++nxt;
731df930be7Sderaadt r->linp[off] = r->upper -= nbytes;
732df930be7Sderaadt memmove((char *)r + r->upper, src, nbytes);
733df930be7Sderaadt }
734df930be7Sderaadt r->lower += off * sizeof(indx_t);
735df930be7Sderaadt
736df930be7Sderaadt /* If the key is being appended to the page, adjust the index. */
737df930be7Sderaadt if (skip == top)
738df930be7Sderaadt r->lower += sizeof(indx_t);
739df930be7Sderaadt
740df930be7Sderaadt return (rval);
741df930be7Sderaadt }
742df930be7Sderaadt
743df930be7Sderaadt /*
744df930be7Sderaadt * BT_PRESERVE -- Mark a chain of pages as used by an internal node.
745df930be7Sderaadt *
746df930be7Sderaadt * Chains of indirect blocks pointed to by leaf nodes get reclaimed when the
747df930be7Sderaadt * record that references them gets deleted. Chains pointed to by internal
748df930be7Sderaadt * pages never get deleted. This routine marks a chain as pointed to by an
749df930be7Sderaadt * internal page.
750df930be7Sderaadt *
751df930be7Sderaadt * Parameters:
752df930be7Sderaadt * t: tree
753df930be7Sderaadt * pg: page number of first page in the chain.
754df930be7Sderaadt *
755df930be7Sderaadt * Returns:
756df930be7Sderaadt * RET_SUCCESS, RET_ERROR.
757df930be7Sderaadt */
758df930be7Sderaadt static int
bt_preserve(BTREE * t,pgno_t pg)759e20a56a5Sotto bt_preserve(BTREE *t, pgno_t pg)
760df930be7Sderaadt {
761df930be7Sderaadt PAGE *h;
762df930be7Sderaadt
763df930be7Sderaadt if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
764df930be7Sderaadt return (RET_ERROR);
765df930be7Sderaadt h->flags |= P_PRESERVE;
766df930be7Sderaadt mpool_put(t->bt_mp, h, MPOOL_DIRTY);
767df930be7Sderaadt return (RET_SUCCESS);
768df930be7Sderaadt }
769df930be7Sderaadt
770df930be7Sderaadt /*
771df930be7Sderaadt * REC_TOTAL -- Return the number of recno entries below a page.
772df930be7Sderaadt *
773df930be7Sderaadt * Parameters:
774df930be7Sderaadt * h: page
775df930be7Sderaadt *
776df930be7Sderaadt * Returns:
777df930be7Sderaadt * The number of recno entries below a page.
778df930be7Sderaadt *
779df930be7Sderaadt * XXX
780df930be7Sderaadt * These values could be set by the bt_psplit routine. The problem is that the
781df930be7Sderaadt * entry has to be popped off of the stack etc. or the values have to be passed
782df930be7Sderaadt * all the way back to bt_split/bt_rroot and it's not very clean.
783df930be7Sderaadt */
784df930be7Sderaadt static recno_t
rec_total(PAGE * h)785e20a56a5Sotto rec_total(PAGE *h)
786df930be7Sderaadt {
787df930be7Sderaadt recno_t recs;
788df930be7Sderaadt indx_t nxt, top;
789df930be7Sderaadt
790df930be7Sderaadt for (recs = 0, nxt = 0, top = NEXTINDEX(h); nxt < top; ++nxt)
791df930be7Sderaadt recs += GETRINTERNAL(h, nxt)->nrecs;
792df930be7Sderaadt return (recs);
793df930be7Sderaadt }
794