1*2639ae9bSBen Gras /* $NetBSD: bt_split.c,v 1.19 2009/04/22 18:44:06 christos Exp $ */ 2*2639ae9bSBen Gras 3*2639ae9bSBen Gras /*- 4*2639ae9bSBen Gras * Copyright (c) 1990, 1993, 1994 5*2639ae9bSBen Gras * The Regents of the University of California. All rights reserved. 6*2639ae9bSBen Gras * 7*2639ae9bSBen Gras * This code is derived from software contributed to Berkeley by 8*2639ae9bSBen Gras * Mike Olson. 9*2639ae9bSBen Gras * 10*2639ae9bSBen Gras * Redistribution and use in source and binary forms, with or without 11*2639ae9bSBen Gras * modification, are permitted provided that the following conditions 12*2639ae9bSBen Gras * are met: 13*2639ae9bSBen Gras * 1. Redistributions of source code must retain the above copyright 14*2639ae9bSBen Gras * notice, this list of conditions and the following disclaimer. 15*2639ae9bSBen Gras * 2. Redistributions in binary form must reproduce the above copyright 16*2639ae9bSBen Gras * notice, this list of conditions and the following disclaimer in the 17*2639ae9bSBen Gras * documentation and/or other materials provided with the distribution. 18*2639ae9bSBen Gras * 3. Neither the name of the University nor the names of its contributors 19*2639ae9bSBen Gras * may be used to endorse or promote products derived from this software 20*2639ae9bSBen Gras * without specific prior written permission. 21*2639ae9bSBen Gras * 22*2639ae9bSBen Gras * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23*2639ae9bSBen Gras * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24*2639ae9bSBen Gras * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25*2639ae9bSBen Gras * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26*2639ae9bSBen Gras * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27*2639ae9bSBen Gras * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28*2639ae9bSBen Gras * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29*2639ae9bSBen Gras * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30*2639ae9bSBen Gras * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31*2639ae9bSBen Gras * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32*2639ae9bSBen Gras * SUCH DAMAGE. 33*2639ae9bSBen Gras */ 34*2639ae9bSBen Gras 35*2639ae9bSBen Gras #if HAVE_NBTOOL_CONFIG_H 36*2639ae9bSBen Gras #include "nbtool_config.h" 37*2639ae9bSBen Gras #endif 38*2639ae9bSBen Gras 39*2639ae9bSBen Gras #include <sys/cdefs.h> 40*2639ae9bSBen Gras #ifndef __minix 41*2639ae9bSBen Gras __RCSID("$NetBSD: bt_split.c,v 1.19 2009/04/22 18:44:06 christos Exp $"); 42*2639ae9bSBen Gras #endif 43*2639ae9bSBen Gras 44*2639ae9bSBen Gras #ifndef __minix 45*2639ae9bSBen Gras #include "namespace.h" 46*2639ae9bSBen Gras #endif 47*2639ae9bSBen Gras #include <sys/types.h> 48*2639ae9bSBen Gras 49*2639ae9bSBen Gras #include <assert.h> 50*2639ae9bSBen Gras #include <limits.h> 51*2639ae9bSBen Gras #include <stdio.h> 52*2639ae9bSBen Gras #include <stdlib.h> 53*2639ae9bSBen Gras #include <string.h> 54*2639ae9bSBen Gras 55*2639ae9bSBen Gras #include <db.h> 56*2639ae9bSBen Gras #include "btree.h" 57*2639ae9bSBen Gras 58*2639ae9bSBen Gras static int bt_broot(BTREE *, PAGE *, PAGE *, PAGE *); 59*2639ae9bSBen Gras static PAGE *bt_page(BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t); 60*2639ae9bSBen Gras static int bt_preserve(BTREE *, pgno_t); 61*2639ae9bSBen Gras static PAGE *bt_psplit(BTREE *, PAGE *, PAGE *, PAGE *, indx_t *, size_t); 62*2639ae9bSBen Gras static PAGE *bt_root(BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t); 63*2639ae9bSBen Gras static int bt_rroot(BTREE *, PAGE *, PAGE *, PAGE *); 64*2639ae9bSBen Gras static recno_t rec_total(PAGE *); 65*2639ae9bSBen Gras 66*2639ae9bSBen Gras #ifdef STATISTICS 67*2639ae9bSBen Gras unsigned long bt_rootsplit, bt_split, bt_sortsplit, bt_pfxsaved; 68*2639ae9bSBen Gras #endif 69*2639ae9bSBen Gras 70*2639ae9bSBen Gras /* 71*2639ae9bSBen Gras * __BT_SPLIT -- Split the tree. 72*2639ae9bSBen Gras * 73*2639ae9bSBen Gras * Parameters: 74*2639ae9bSBen Gras * t: tree 75*2639ae9bSBen Gras * sp: page to split 76*2639ae9bSBen Gras * key: key to insert 77*2639ae9bSBen Gras * data: data to insert 78*2639ae9bSBen Gras * flags: BIGKEY/BIGDATA flags 79*2639ae9bSBen Gras * ilen: insert length 80*2639ae9bSBen Gras * skip: index to leave open 81*2639ae9bSBen Gras * 82*2639ae9bSBen Gras * Returns: 83*2639ae9bSBen Gras * RET_ERROR, RET_SUCCESS 84*2639ae9bSBen Gras */ 85*2639ae9bSBen Gras int 86*2639ae9bSBen Gras __bt_split(BTREE *t, PAGE *sp, const DBT *key, const DBT *data, int flags, 87*2639ae9bSBen Gras size_t ilen, uint32_t argskip) 88*2639ae9bSBen Gras { 89*2639ae9bSBen Gras BINTERNAL *bi = NULL; /* pacify gcc */ 90*2639ae9bSBen Gras BLEAF *bl = NULL, *tbl; /* pacify gcc */ 91*2639ae9bSBen Gras DBT a, b; 92*2639ae9bSBen Gras EPGNO *parent; 93*2639ae9bSBen Gras PAGE *h, *l, *r, *lchild, *rchild; 94*2639ae9bSBen Gras indx_t nxtindex; 95*2639ae9bSBen Gras uint16_t skip; 96*2639ae9bSBen Gras uint32_t n, nbytes, nksize = 0; /* pacify gcc */ 97*2639ae9bSBen Gras int parentsplit; 98*2639ae9bSBen Gras char *dest; 99*2639ae9bSBen Gras 100*2639ae9bSBen Gras /* 101*2639ae9bSBen Gras * Split the page into two pages, l and r. The split routines return 102*2639ae9bSBen Gras * a pointer to the page into which the key should be inserted and with 103*2639ae9bSBen Gras * skip set to the offset which should be used. Additionally, l and r 104*2639ae9bSBen Gras * are pinned. 105*2639ae9bSBen Gras */ 106*2639ae9bSBen Gras skip = argskip; 107*2639ae9bSBen Gras h = sp->pgno == P_ROOT ? 108*2639ae9bSBen Gras bt_root(t, sp, &l, &r, &skip, ilen) : 109*2639ae9bSBen Gras bt_page(t, sp, &l, &r, &skip, ilen); 110*2639ae9bSBen Gras if (h == NULL) 111*2639ae9bSBen Gras return (RET_ERROR); 112*2639ae9bSBen Gras 113*2639ae9bSBen Gras /* 114*2639ae9bSBen Gras * Insert the new key/data pair into the leaf page. (Key inserts 115*2639ae9bSBen Gras * always cause a leaf page to split first.) 116*2639ae9bSBen Gras */ 117*2639ae9bSBen Gras _DBFIT(ilen, indx_t); 118*2639ae9bSBen Gras h->upper -= (indx_t)ilen; 119*2639ae9bSBen Gras h->linp[skip] = h->upper; 120*2639ae9bSBen Gras dest = (char *)(void *)h + h->upper; 121*2639ae9bSBen Gras if (F_ISSET(t, R_RECNO)) 122*2639ae9bSBen Gras WR_RLEAF(dest, data, flags); 123*2639ae9bSBen Gras else 124*2639ae9bSBen Gras WR_BLEAF(dest, key, data, flags); 125*2639ae9bSBen Gras 126*2639ae9bSBen Gras /* If the root page was split, make it look right. */ 127*2639ae9bSBen Gras if (sp->pgno == P_ROOT && 128*2639ae9bSBen Gras (F_ISSET(t, R_RECNO) ? 129*2639ae9bSBen Gras bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR) 130*2639ae9bSBen Gras goto err2; 131*2639ae9bSBen Gras 132*2639ae9bSBen Gras /* 133*2639ae9bSBen Gras * Now we walk the parent page stack -- a LIFO stack of the pages that 134*2639ae9bSBen Gras * were traversed when we searched for the page that split. Each stack 135*2639ae9bSBen Gras * entry is a page number and a page index offset. The offset is for 136*2639ae9bSBen Gras * the page traversed on the search. We've just split a page, so we 137*2639ae9bSBen Gras * have to insert a new key into the parent page. 138*2639ae9bSBen Gras * 139*2639ae9bSBen Gras * If the insert into the parent page causes it to split, may have to 140*2639ae9bSBen Gras * continue splitting all the way up the tree. We stop if the root 141*2639ae9bSBen Gras * splits or the page inserted into didn't have to split to hold the 142*2639ae9bSBen Gras * new key. Some algorithms replace the key for the old page as well 143*2639ae9bSBen Gras * as the new page. We don't, as there's no reason to believe that the 144*2639ae9bSBen Gras * first key on the old page is any better than the key we have, and, 145*2639ae9bSBen Gras * in the case of a key being placed at index 0 causing the split, the 146*2639ae9bSBen Gras * key is unavailable. 147*2639ae9bSBen Gras * 148*2639ae9bSBen Gras * There are a maximum of 5 pages pinned at any time. We keep the left 149*2639ae9bSBen Gras * and right pages pinned while working on the parent. The 5 are the 150*2639ae9bSBen Gras * two children, left parent and right parent (when the parent splits) 151*2639ae9bSBen Gras * and the root page or the overflow key page when calling bt_preserve. 152*2639ae9bSBen Gras * This code must make sure that all pins are released other than the 153*2639ae9bSBen Gras * root page or overflow page which is unlocked elsewhere. 154*2639ae9bSBen Gras */ 155*2639ae9bSBen Gras while ((parent = BT_POP(t)) != NULL) { 156*2639ae9bSBen Gras lchild = l; 157*2639ae9bSBen Gras rchild = r; 158*2639ae9bSBen Gras 159*2639ae9bSBen Gras /* Get the parent page. */ 160*2639ae9bSBen Gras if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL) 161*2639ae9bSBen Gras goto err2; 162*2639ae9bSBen Gras 163*2639ae9bSBen Gras /* 164*2639ae9bSBen Gras * The new key goes ONE AFTER the index, because the split 165*2639ae9bSBen Gras * was to the right. 166*2639ae9bSBen Gras */ 167*2639ae9bSBen Gras skip = parent->index + 1; 168*2639ae9bSBen Gras 169*2639ae9bSBen Gras /* 170*2639ae9bSBen Gras * Calculate the space needed on the parent page. 171*2639ae9bSBen Gras * 172*2639ae9bSBen Gras * Prefix trees: space hack when inserting into BINTERNAL 173*2639ae9bSBen Gras * pages. Retain only what's needed to distinguish between 174*2639ae9bSBen Gras * the new entry and the LAST entry on the page to its left. 175*2639ae9bSBen Gras * If the keys compare equal, retain the entire key. Note, 176*2639ae9bSBen Gras * we don't touch overflow keys, and the entire key must be 177*2639ae9bSBen Gras * retained for the next-to-left most key on the leftmost 178*2639ae9bSBen Gras * page of each level, or the search will fail. Applicable 179*2639ae9bSBen Gras * ONLY to internal pages that have leaf pages as children. 180*2639ae9bSBen Gras * Further reduction of the key between pairs of internal 181*2639ae9bSBen Gras * pages loses too much information. 182*2639ae9bSBen Gras */ 183*2639ae9bSBen Gras switch (rchild->flags & P_TYPE) { 184*2639ae9bSBen Gras case P_BINTERNAL: 185*2639ae9bSBen Gras bi = GETBINTERNAL(rchild, 0); 186*2639ae9bSBen Gras nbytes = NBINTERNAL(bi->ksize); 187*2639ae9bSBen Gras break; 188*2639ae9bSBen Gras case P_BLEAF: 189*2639ae9bSBen Gras bl = GETBLEAF(rchild, 0); 190*2639ae9bSBen Gras nbytes = NBINTERNAL(bl->ksize); 191*2639ae9bSBen Gras if (t->bt_pfx && !(bl->flags & P_BIGKEY) && 192*2639ae9bSBen Gras (h->prevpg != P_INVALID || skip > 1)) { 193*2639ae9bSBen Gras size_t temp; 194*2639ae9bSBen Gras tbl = GETBLEAF(lchild, NEXTINDEX(lchild) - 1); 195*2639ae9bSBen Gras a.size = tbl->ksize; 196*2639ae9bSBen Gras a.data = tbl->bytes; 197*2639ae9bSBen Gras b.size = bl->ksize; 198*2639ae9bSBen Gras b.data = bl->bytes; 199*2639ae9bSBen Gras temp = t->bt_pfx(&a, &b); 200*2639ae9bSBen Gras _DBFIT(temp, uint32_t); 201*2639ae9bSBen Gras nksize = (uint32_t)temp; 202*2639ae9bSBen Gras n = NBINTERNAL(nksize); 203*2639ae9bSBen Gras if (n < nbytes) { 204*2639ae9bSBen Gras #ifdef STATISTICS 205*2639ae9bSBen Gras bt_pfxsaved += nbytes - n; 206*2639ae9bSBen Gras #endif 207*2639ae9bSBen Gras nbytes = n; 208*2639ae9bSBen Gras } else 209*2639ae9bSBen Gras nksize = 0; 210*2639ae9bSBen Gras } else 211*2639ae9bSBen Gras nksize = 0; 212*2639ae9bSBen Gras break; 213*2639ae9bSBen Gras case P_RINTERNAL: 214*2639ae9bSBen Gras case P_RLEAF: 215*2639ae9bSBen Gras nbytes = NRINTERNAL; 216*2639ae9bSBen Gras break; 217*2639ae9bSBen Gras default: 218*2639ae9bSBen Gras abort(); 219*2639ae9bSBen Gras } 220*2639ae9bSBen Gras 221*2639ae9bSBen Gras /* Split the parent page if necessary or shift the indices. */ 222*2639ae9bSBen Gras if ((uint32_t)h->upper - (uint32_t)h->lower < nbytes + sizeof(indx_t)) { 223*2639ae9bSBen Gras sp = h; 224*2639ae9bSBen Gras h = h->pgno == P_ROOT ? 225*2639ae9bSBen Gras bt_root(t, h, &l, &r, &skip, nbytes) : 226*2639ae9bSBen Gras bt_page(t, h, &l, &r, &skip, nbytes); 227*2639ae9bSBen Gras if (h == NULL) 228*2639ae9bSBen Gras goto err1; 229*2639ae9bSBen Gras parentsplit = 1; 230*2639ae9bSBen Gras } else { 231*2639ae9bSBen Gras if (skip < (nxtindex = NEXTINDEX(h))) 232*2639ae9bSBen Gras memmove(h->linp + skip + 1, h->linp + skip, 233*2639ae9bSBen Gras (nxtindex - skip) * sizeof(indx_t)); 234*2639ae9bSBen Gras h->lower += sizeof(indx_t); 235*2639ae9bSBen Gras parentsplit = 0; 236*2639ae9bSBen Gras } 237*2639ae9bSBen Gras 238*2639ae9bSBen Gras /* Insert the key into the parent page. */ 239*2639ae9bSBen Gras switch (rchild->flags & P_TYPE) { 240*2639ae9bSBen Gras case P_BINTERNAL: 241*2639ae9bSBen Gras h->linp[skip] = h->upper -= nbytes; 242*2639ae9bSBen Gras dest = (char *)(void *)h + h->linp[skip]; 243*2639ae9bSBen Gras memmove(dest, bi, nbytes); 244*2639ae9bSBen Gras ((BINTERNAL *)(void *)dest)->pgno = rchild->pgno; 245*2639ae9bSBen Gras break; 246*2639ae9bSBen Gras case P_BLEAF: 247*2639ae9bSBen Gras h->linp[skip] = h->upper -= nbytes; 248*2639ae9bSBen Gras dest = (char *)(void *)h + h->linp[skip]; 249*2639ae9bSBen Gras WR_BINTERNAL(dest, nksize ? nksize : bl->ksize, 250*2639ae9bSBen Gras rchild->pgno, bl->flags & P_BIGKEY); 251*2639ae9bSBen Gras memmove(dest, bl->bytes, nksize ? nksize : bl->ksize); 252*2639ae9bSBen Gras if (bl->flags & P_BIGKEY && 253*2639ae9bSBen Gras bt_preserve(t, *(pgno_t *)(void *)bl->bytes) == 254*2639ae9bSBen Gras RET_ERROR) 255*2639ae9bSBen Gras goto err1; 256*2639ae9bSBen Gras break; 257*2639ae9bSBen Gras case P_RINTERNAL: 258*2639ae9bSBen Gras /* 259*2639ae9bSBen Gras * Update the left page count. If split 260*2639ae9bSBen Gras * added at index 0, fix the correct page. 261*2639ae9bSBen Gras */ 262*2639ae9bSBen Gras if (skip > 0) 263*2639ae9bSBen Gras dest = (char *)(void *)h + h->linp[skip - 1]; 264*2639ae9bSBen Gras else 265*2639ae9bSBen Gras dest = (char *)(void *)l + l->linp[NEXTINDEX(l) - 1]; 266*2639ae9bSBen Gras ((RINTERNAL *)(void *)dest)->nrecs = rec_total(lchild); 267*2639ae9bSBen Gras ((RINTERNAL *)(void *)dest)->pgno = lchild->pgno; 268*2639ae9bSBen Gras 269*2639ae9bSBen Gras /* Update the right page count. */ 270*2639ae9bSBen Gras h->linp[skip] = h->upper -= nbytes; 271*2639ae9bSBen Gras dest = (char *)(void *)h + h->linp[skip]; 272*2639ae9bSBen Gras ((RINTERNAL *)(void *)dest)->nrecs = rec_total(rchild); 273*2639ae9bSBen Gras ((RINTERNAL *)(void *)dest)->pgno = rchild->pgno; 274*2639ae9bSBen Gras break; 275*2639ae9bSBen Gras case P_RLEAF: 276*2639ae9bSBen Gras /* 277*2639ae9bSBen Gras * Update the left page count. If split 278*2639ae9bSBen Gras * added at index 0, fix the correct page. 279*2639ae9bSBen Gras */ 280*2639ae9bSBen Gras if (skip > 0) 281*2639ae9bSBen Gras dest = (char *)(void *)h + h->linp[skip - 1]; 282*2639ae9bSBen Gras else 283*2639ae9bSBen Gras dest = (char *)(void *)l + l->linp[NEXTINDEX(l) - 1]; 284*2639ae9bSBen Gras ((RINTERNAL *)(void *)dest)->nrecs = NEXTINDEX(lchild); 285*2639ae9bSBen Gras ((RINTERNAL *)(void *)dest)->pgno = lchild->pgno; 286*2639ae9bSBen Gras 287*2639ae9bSBen Gras /* Update the right page count. */ 288*2639ae9bSBen Gras h->linp[skip] = h->upper -= nbytes; 289*2639ae9bSBen Gras dest = (char *)(void *)h + h->linp[skip]; 290*2639ae9bSBen Gras ((RINTERNAL *)(void *)dest)->nrecs = NEXTINDEX(rchild); 291*2639ae9bSBen Gras ((RINTERNAL *)(void *)dest)->pgno = rchild->pgno; 292*2639ae9bSBen Gras break; 293*2639ae9bSBen Gras default: 294*2639ae9bSBen Gras abort(); 295*2639ae9bSBen Gras } 296*2639ae9bSBen Gras 297*2639ae9bSBen Gras /* Unpin the held pages. */ 298*2639ae9bSBen Gras if (!parentsplit) { 299*2639ae9bSBen Gras mpool_put(t->bt_mp, h, MPOOL_DIRTY); 300*2639ae9bSBen Gras break; 301*2639ae9bSBen Gras } 302*2639ae9bSBen Gras 303*2639ae9bSBen Gras /* If the root page was split, make it look right. */ 304*2639ae9bSBen Gras if (sp->pgno == P_ROOT && 305*2639ae9bSBen Gras (F_ISSET(t, R_RECNO) ? 306*2639ae9bSBen Gras bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR) 307*2639ae9bSBen Gras goto err1; 308*2639ae9bSBen Gras 309*2639ae9bSBen Gras mpool_put(t->bt_mp, lchild, MPOOL_DIRTY); 310*2639ae9bSBen Gras mpool_put(t->bt_mp, rchild, MPOOL_DIRTY); 311*2639ae9bSBen Gras } 312*2639ae9bSBen Gras 313*2639ae9bSBen Gras /* Unpin the held pages. */ 314*2639ae9bSBen Gras mpool_put(t->bt_mp, l, MPOOL_DIRTY); 315*2639ae9bSBen Gras mpool_put(t->bt_mp, r, MPOOL_DIRTY); 316*2639ae9bSBen Gras 317*2639ae9bSBen Gras /* Clear any pages left on the stack. */ 318*2639ae9bSBen Gras return (RET_SUCCESS); 319*2639ae9bSBen Gras 320*2639ae9bSBen Gras /* 321*2639ae9bSBen Gras * If something fails in the above loop we were already walking back 322*2639ae9bSBen Gras * up the tree and the tree is now inconsistent. Nothing much we can 323*2639ae9bSBen Gras * do about it but release any memory we're holding. 324*2639ae9bSBen Gras */ 325*2639ae9bSBen Gras err1: mpool_put(t->bt_mp, lchild, MPOOL_DIRTY); 326*2639ae9bSBen Gras mpool_put(t->bt_mp, rchild, MPOOL_DIRTY); 327*2639ae9bSBen Gras 328*2639ae9bSBen Gras err2: mpool_put(t->bt_mp, l, 0); 329*2639ae9bSBen Gras mpool_put(t->bt_mp, r, 0); 330*2639ae9bSBen Gras __dbpanic(t->bt_dbp); 331*2639ae9bSBen Gras return (RET_ERROR); 332*2639ae9bSBen Gras } 333*2639ae9bSBen Gras 334*2639ae9bSBen Gras /* 335*2639ae9bSBen Gras * BT_PAGE -- Split a non-root page of a btree. 336*2639ae9bSBen Gras * 337*2639ae9bSBen Gras * Parameters: 338*2639ae9bSBen Gras * t: tree 339*2639ae9bSBen Gras * h: root page 340*2639ae9bSBen Gras * lp: pointer to left page pointer 341*2639ae9bSBen Gras * rp: pointer to right page pointer 342*2639ae9bSBen Gras * skip: pointer to index to leave open 343*2639ae9bSBen Gras * ilen: insert length 344*2639ae9bSBen Gras * 345*2639ae9bSBen Gras * Returns: 346*2639ae9bSBen Gras * Pointer to page in which to insert or NULL on error. 347*2639ae9bSBen Gras */ 348*2639ae9bSBen Gras static PAGE * 349*2639ae9bSBen Gras bt_page(BTREE *t, PAGE *h, PAGE **lp, PAGE **rp, indx_t *skip, size_t ilen) 350*2639ae9bSBen Gras { 351*2639ae9bSBen Gras PAGE *l, *r, *tp; 352*2639ae9bSBen Gras pgno_t npg; 353*2639ae9bSBen Gras 354*2639ae9bSBen Gras #ifdef STATISTICS 355*2639ae9bSBen Gras ++bt_split; 356*2639ae9bSBen Gras #endif 357*2639ae9bSBen Gras /* Put the new right page for the split into place. */ 358*2639ae9bSBen Gras if ((r = __bt_new(t, &npg)) == NULL) 359*2639ae9bSBen Gras return (NULL); 360*2639ae9bSBen Gras r->pgno = npg; 361*2639ae9bSBen Gras r->lower = BTDATAOFF; 362*2639ae9bSBen Gras r->upper = t->bt_psize; 363*2639ae9bSBen Gras r->nextpg = h->nextpg; 364*2639ae9bSBen Gras r->prevpg = h->pgno; 365*2639ae9bSBen Gras r->flags = h->flags & P_TYPE; 366*2639ae9bSBen Gras 367*2639ae9bSBen Gras /* 368*2639ae9bSBen Gras * If we're splitting the last page on a level because we're appending 369*2639ae9bSBen Gras * a key to it (skip is NEXTINDEX()), it's likely that the data is 370*2639ae9bSBen Gras * sorted. Adding an empty page on the side of the level is less work 371*2639ae9bSBen Gras * and can push the fill factor much higher than normal. If we're 372*2639ae9bSBen Gras * wrong it's no big deal, we'll just do the split the right way next 373*2639ae9bSBen Gras * time. It may look like it's equally easy to do a similar hack for 374*2639ae9bSBen Gras * reverse sorted data, that is, split the tree left, but it's not. 375*2639ae9bSBen Gras * Don't even try. 376*2639ae9bSBen Gras */ 377*2639ae9bSBen Gras if (h->nextpg == P_INVALID && *skip == NEXTINDEX(h)) { 378*2639ae9bSBen Gras #ifdef STATISTICS 379*2639ae9bSBen Gras ++bt_sortsplit; 380*2639ae9bSBen Gras #endif 381*2639ae9bSBen Gras h->nextpg = r->pgno; 382*2639ae9bSBen Gras r->lower = BTDATAOFF + sizeof(indx_t); 383*2639ae9bSBen Gras *skip = 0; 384*2639ae9bSBen Gras *lp = h; 385*2639ae9bSBen Gras *rp = r; 386*2639ae9bSBen Gras return (r); 387*2639ae9bSBen Gras } 388*2639ae9bSBen Gras 389*2639ae9bSBen Gras /* Put the new left page for the split into place. */ 390*2639ae9bSBen Gras if ((l = calloc(1, t->bt_psize)) == NULL) { 391*2639ae9bSBen Gras mpool_put(t->bt_mp, r, 0); 392*2639ae9bSBen Gras return (NULL); 393*2639ae9bSBen Gras } 394*2639ae9bSBen Gras #ifdef PURIFY 395*2639ae9bSBen Gras memset(l, 0xff, t->bt_psize); 396*2639ae9bSBen Gras #endif 397*2639ae9bSBen Gras l->pgno = h->pgno; 398*2639ae9bSBen Gras l->nextpg = r->pgno; 399*2639ae9bSBen Gras l->prevpg = h->prevpg; 400*2639ae9bSBen Gras l->lower = BTDATAOFF; 401*2639ae9bSBen Gras l->upper = t->bt_psize; 402*2639ae9bSBen Gras l->flags = h->flags & P_TYPE; 403*2639ae9bSBen Gras 404*2639ae9bSBen Gras /* Fix up the previous pointer of the page after the split page. */ 405*2639ae9bSBen Gras if (h->nextpg != P_INVALID) { 406*2639ae9bSBen Gras if ((tp = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) { 407*2639ae9bSBen Gras free(l); 408*2639ae9bSBen Gras /* XXX mpool_free(t->bt_mp, r->pgno); */ 409*2639ae9bSBen Gras return (NULL); 410*2639ae9bSBen Gras } 411*2639ae9bSBen Gras tp->prevpg = r->pgno; 412*2639ae9bSBen Gras mpool_put(t->bt_mp, tp, MPOOL_DIRTY); 413*2639ae9bSBen Gras } 414*2639ae9bSBen Gras 415*2639ae9bSBen Gras /* 416*2639ae9bSBen Gras * Split right. The key/data pairs aren't sorted in the btree page so 417*2639ae9bSBen Gras * it's simpler to copy the data from the split page onto two new pages 418*2639ae9bSBen Gras * instead of copying half the data to the right page and compacting 419*2639ae9bSBen Gras * the left page in place. Since the left page can't change, we have 420*2639ae9bSBen Gras * to swap the original and the allocated left page after the split. 421*2639ae9bSBen Gras */ 422*2639ae9bSBen Gras tp = bt_psplit(t, h, l, r, skip, ilen); 423*2639ae9bSBen Gras 424*2639ae9bSBen Gras /* Move the new left page onto the old left page. */ 425*2639ae9bSBen Gras memmove(h, l, t->bt_psize); 426*2639ae9bSBen Gras if (tp == l) 427*2639ae9bSBen Gras tp = h; 428*2639ae9bSBen Gras free(l); 429*2639ae9bSBen Gras 430*2639ae9bSBen Gras *lp = h; 431*2639ae9bSBen Gras *rp = r; 432*2639ae9bSBen Gras return (tp); 433*2639ae9bSBen Gras } 434*2639ae9bSBen Gras 435*2639ae9bSBen Gras /* 436*2639ae9bSBen Gras * BT_ROOT -- Split the root page of a btree. 437*2639ae9bSBen Gras * 438*2639ae9bSBen Gras * Parameters: 439*2639ae9bSBen Gras * t: tree 440*2639ae9bSBen Gras * h: root page 441*2639ae9bSBen Gras * lp: pointer to left page pointer 442*2639ae9bSBen Gras * rp: pointer to right page pointer 443*2639ae9bSBen Gras * skip: pointer to index to leave open 444*2639ae9bSBen Gras * ilen: insert length 445*2639ae9bSBen Gras * 446*2639ae9bSBen Gras * Returns: 447*2639ae9bSBen Gras * Pointer to page in which to insert or NULL on error. 448*2639ae9bSBen Gras */ 449*2639ae9bSBen Gras static PAGE * 450*2639ae9bSBen Gras bt_root(BTREE *t, PAGE *h, PAGE **lp, PAGE **rp, indx_t *skip, size_t ilen) 451*2639ae9bSBen Gras { 452*2639ae9bSBen Gras PAGE *l, *r, *tp; 453*2639ae9bSBen Gras pgno_t lnpg, rnpg; 454*2639ae9bSBen Gras 455*2639ae9bSBen Gras #ifdef STATISTICS 456*2639ae9bSBen Gras ++bt_split; 457*2639ae9bSBen Gras ++bt_rootsplit; 458*2639ae9bSBen Gras #endif 459*2639ae9bSBen Gras /* Put the new left and right pages for the split into place. */ 460*2639ae9bSBen Gras if ((l = __bt_new(t, &lnpg)) == NULL || 461*2639ae9bSBen Gras (r = __bt_new(t, &rnpg)) == NULL) 462*2639ae9bSBen Gras return (NULL); 463*2639ae9bSBen Gras l->pgno = lnpg; 464*2639ae9bSBen Gras r->pgno = rnpg; 465*2639ae9bSBen Gras l->nextpg = r->pgno; 466*2639ae9bSBen Gras r->prevpg = l->pgno; 467*2639ae9bSBen Gras l->prevpg = r->nextpg = P_INVALID; 468*2639ae9bSBen Gras l->lower = r->lower = BTDATAOFF; 469*2639ae9bSBen Gras l->upper = r->upper = t->bt_psize; 470*2639ae9bSBen Gras l->flags = r->flags = h->flags & P_TYPE; 471*2639ae9bSBen Gras 472*2639ae9bSBen Gras /* Split the root page. */ 473*2639ae9bSBen Gras tp = bt_psplit(t, h, l, r, skip, ilen); 474*2639ae9bSBen Gras 475*2639ae9bSBen Gras *lp = l; 476*2639ae9bSBen Gras *rp = r; 477*2639ae9bSBen Gras return (tp); 478*2639ae9bSBen Gras } 479*2639ae9bSBen Gras 480*2639ae9bSBen Gras /* 481*2639ae9bSBen Gras * BT_RROOT -- Fix up the recno root page after it has been split. 482*2639ae9bSBen Gras * 483*2639ae9bSBen Gras * Parameters: 484*2639ae9bSBen Gras * t: tree 485*2639ae9bSBen Gras * h: root page 486*2639ae9bSBen Gras * l: left page 487*2639ae9bSBen Gras * r: right page 488*2639ae9bSBen Gras * 489*2639ae9bSBen Gras * Returns: 490*2639ae9bSBen Gras * RET_ERROR, RET_SUCCESS 491*2639ae9bSBen Gras */ 492*2639ae9bSBen Gras static int 493*2639ae9bSBen Gras bt_rroot(BTREE *t, PAGE *h, PAGE *l, PAGE *r) 494*2639ae9bSBen Gras { 495*2639ae9bSBen Gras char *dest; 496*2639ae9bSBen Gras uint32_t sz; 497*2639ae9bSBen Gras size_t temp; 498*2639ae9bSBen Gras 499*2639ae9bSBen Gras temp = t->bt_psize - NRINTERNAL; 500*2639ae9bSBen Gras _DBFIT(temp, uint32_t); 501*2639ae9bSBen Gras sz = (uint32_t)temp; 502*2639ae9bSBen Gras 503*2639ae9bSBen Gras /* Insert the left and right keys, set the header information. */ 504*2639ae9bSBen Gras _DBFIT(sz, indx_t); 505*2639ae9bSBen Gras h->linp[0] = h->upper = (indx_t)sz; 506*2639ae9bSBen Gras dest = (char *)(void *)h + h->upper; 507*2639ae9bSBen Gras WR_RINTERNAL(dest, 508*2639ae9bSBen Gras l->flags & P_RLEAF ? NEXTINDEX(l) : rec_total(l), l->pgno); 509*2639ae9bSBen Gras 510*2639ae9bSBen Gras h->linp[1] = h->upper -= NRINTERNAL; 511*2639ae9bSBen Gras dest = (char *)(void *)h + h->upper; 512*2639ae9bSBen Gras WR_RINTERNAL(dest, 513*2639ae9bSBen Gras r->flags & P_RLEAF ? NEXTINDEX(r) : rec_total(r), r->pgno); 514*2639ae9bSBen Gras 515*2639ae9bSBen Gras h->lower = BTDATAOFF + 2 * sizeof(indx_t); 516*2639ae9bSBen Gras 517*2639ae9bSBen Gras /* Unpin the root page, set to recno internal page. */ 518*2639ae9bSBen Gras h->flags &= ~P_TYPE; 519*2639ae9bSBen Gras h->flags |= P_RINTERNAL; 520*2639ae9bSBen Gras mpool_put(t->bt_mp, h, MPOOL_DIRTY); 521*2639ae9bSBen Gras 522*2639ae9bSBen Gras return (RET_SUCCESS); 523*2639ae9bSBen Gras } 524*2639ae9bSBen Gras 525*2639ae9bSBen Gras /* 526*2639ae9bSBen Gras * BT_BROOT -- Fix up the btree root page after it has been split. 527*2639ae9bSBen Gras * 528*2639ae9bSBen Gras * Parameters: 529*2639ae9bSBen Gras * t: tree 530*2639ae9bSBen Gras * h: root page 531*2639ae9bSBen Gras * l: left page 532*2639ae9bSBen Gras * r: right page 533*2639ae9bSBen Gras * 534*2639ae9bSBen Gras * Returns: 535*2639ae9bSBen Gras * RET_ERROR, RET_SUCCESS 536*2639ae9bSBen Gras */ 537*2639ae9bSBen Gras static int 538*2639ae9bSBen Gras bt_broot(BTREE *t, PAGE *h, PAGE *l, PAGE *r) 539*2639ae9bSBen Gras { 540*2639ae9bSBen Gras BINTERNAL *bi = NULL; /* pacify gcc */ 541*2639ae9bSBen Gras BLEAF *bl; 542*2639ae9bSBen Gras uint32_t nbytes; 543*2639ae9bSBen Gras char *dest; 544*2639ae9bSBen Gras 545*2639ae9bSBen Gras /* 546*2639ae9bSBen Gras * If the root page was a leaf page, change it into an internal page. 547*2639ae9bSBen Gras * We copy the key we split on (but not the key's data, in the case of 548*2639ae9bSBen Gras * a leaf page) to the new root page. 549*2639ae9bSBen Gras * 550*2639ae9bSBen Gras * The btree comparison code guarantees that the left-most key on any 551*2639ae9bSBen Gras * level of the tree is never used, so it doesn't need to be filled in. 552*2639ae9bSBen Gras */ 553*2639ae9bSBen Gras nbytes = NBINTERNAL(0); 554*2639ae9bSBen Gras h->linp[0] = h->upper = t->bt_psize - nbytes; 555*2639ae9bSBen Gras dest = (char *)(void *)h + h->upper; 556*2639ae9bSBen Gras WR_BINTERNAL(dest, 0, l->pgno, 0); 557*2639ae9bSBen Gras 558*2639ae9bSBen Gras switch (h->flags & P_TYPE) { 559*2639ae9bSBen Gras case P_BLEAF: 560*2639ae9bSBen Gras bl = GETBLEAF(r, 0); 561*2639ae9bSBen Gras nbytes = NBINTERNAL(bl->ksize); 562*2639ae9bSBen Gras h->linp[1] = h->upper -= nbytes; 563*2639ae9bSBen Gras dest = (char *)(void *)h + h->upper; 564*2639ae9bSBen Gras WR_BINTERNAL(dest, bl->ksize, r->pgno, 0); 565*2639ae9bSBen Gras memmove(dest, bl->bytes, bl->ksize); 566*2639ae9bSBen Gras 567*2639ae9bSBen Gras /* 568*2639ae9bSBen Gras * If the key is on an overflow page, mark the overflow chain 569*2639ae9bSBen Gras * so it isn't deleted when the leaf copy of the key is deleted. 570*2639ae9bSBen Gras */ 571*2639ae9bSBen Gras if (bl->flags & P_BIGKEY && 572*2639ae9bSBen Gras bt_preserve(t, *(pgno_t *)(void *)bl->bytes) == RET_ERROR) 573*2639ae9bSBen Gras return (RET_ERROR); 574*2639ae9bSBen Gras break; 575*2639ae9bSBen Gras case P_BINTERNAL: 576*2639ae9bSBen Gras bi = GETBINTERNAL(r, 0); 577*2639ae9bSBen Gras nbytes = NBINTERNAL(bi->ksize); 578*2639ae9bSBen Gras h->linp[1] = h->upper -= nbytes; 579*2639ae9bSBen Gras dest = (char *)(void *)h + h->upper; 580*2639ae9bSBen Gras memmove(dest, bi, nbytes); 581*2639ae9bSBen Gras ((BINTERNAL *)(void *)dest)->pgno = r->pgno; 582*2639ae9bSBen Gras break; 583*2639ae9bSBen Gras default: 584*2639ae9bSBen Gras abort(); 585*2639ae9bSBen Gras } 586*2639ae9bSBen Gras 587*2639ae9bSBen Gras /* There are two keys on the page. */ 588*2639ae9bSBen Gras h->lower = BTDATAOFF + 2 * sizeof(indx_t); 589*2639ae9bSBen Gras 590*2639ae9bSBen Gras /* Unpin the root page, set to btree internal page. */ 591*2639ae9bSBen Gras h->flags &= ~P_TYPE; 592*2639ae9bSBen Gras h->flags |= P_BINTERNAL; 593*2639ae9bSBen Gras mpool_put(t->bt_mp, h, MPOOL_DIRTY); 594*2639ae9bSBen Gras 595*2639ae9bSBen Gras return (RET_SUCCESS); 596*2639ae9bSBen Gras } 597*2639ae9bSBen Gras 598*2639ae9bSBen Gras /* 599*2639ae9bSBen Gras * BT_PSPLIT -- Do the real work of splitting the page. 600*2639ae9bSBen Gras * 601*2639ae9bSBen Gras * Parameters: 602*2639ae9bSBen Gras * t: tree 603*2639ae9bSBen Gras * h: page to be split 604*2639ae9bSBen Gras * l: page to put lower half of data 605*2639ae9bSBen Gras * r: page to put upper half of data 606*2639ae9bSBen Gras * pskip: pointer to index to leave open 607*2639ae9bSBen Gras * ilen: insert length 608*2639ae9bSBen Gras * 609*2639ae9bSBen Gras * Returns: 610*2639ae9bSBen Gras * Pointer to page in which to insert. 611*2639ae9bSBen Gras */ 612*2639ae9bSBen Gras static PAGE * 613*2639ae9bSBen Gras bt_psplit(BTREE *t, PAGE *h, PAGE *l, PAGE *r, indx_t *pskip, size_t ilen) 614*2639ae9bSBen Gras { 615*2639ae9bSBen Gras BINTERNAL *bi; 616*2639ae9bSBen Gras BLEAF *bl; 617*2639ae9bSBen Gras CURSOR *c; 618*2639ae9bSBen Gras RLEAF *rl; 619*2639ae9bSBen Gras PAGE *rval; 620*2639ae9bSBen Gras void *src = NULL; /* pacify gcc */ 621*2639ae9bSBen Gras indx_t full, half, nxt, off, skip, top, used; 622*2639ae9bSBen Gras uint32_t nbytes; 623*2639ae9bSBen Gras size_t temp; 624*2639ae9bSBen Gras int bigkeycnt, isbigkey; 625*2639ae9bSBen Gras 626*2639ae9bSBen Gras /* 627*2639ae9bSBen Gras * Split the data to the left and right pages. Leave the skip index 628*2639ae9bSBen Gras * open. Additionally, make some effort not to split on an overflow 629*2639ae9bSBen Gras * key. This makes internal page processing faster and can save 630*2639ae9bSBen Gras * space as overflow keys used by internal pages are never deleted. 631*2639ae9bSBen Gras */ 632*2639ae9bSBen Gras bigkeycnt = 0; 633*2639ae9bSBen Gras skip = *pskip; 634*2639ae9bSBen Gras temp = t->bt_psize - BTDATAOFF; 635*2639ae9bSBen Gras _DBFIT(temp, indx_t); 636*2639ae9bSBen Gras full = (indx_t)temp; 637*2639ae9bSBen Gras half = full / 2; 638*2639ae9bSBen Gras used = 0; 639*2639ae9bSBen Gras for (nxt = off = 0, top = NEXTINDEX(h); nxt < top; ++off) { 640*2639ae9bSBen Gras if (skip == off) { 641*2639ae9bSBen Gras _DBFIT(ilen, uint32_t); 642*2639ae9bSBen Gras nbytes = (uint32_t)ilen; 643*2639ae9bSBen Gras isbigkey = 0; /* XXX: not really known. */ 644*2639ae9bSBen Gras } else 645*2639ae9bSBen Gras switch (h->flags & P_TYPE) { 646*2639ae9bSBen Gras case P_BINTERNAL: 647*2639ae9bSBen Gras src = bi = GETBINTERNAL(h, nxt); 648*2639ae9bSBen Gras nbytes = NBINTERNAL(bi->ksize); 649*2639ae9bSBen Gras isbigkey = bi->flags & P_BIGKEY; 650*2639ae9bSBen Gras break; 651*2639ae9bSBen Gras case P_BLEAF: 652*2639ae9bSBen Gras src = bl = GETBLEAF(h, nxt); 653*2639ae9bSBen Gras nbytes = NBLEAF(bl); 654*2639ae9bSBen Gras isbigkey = bl->flags & P_BIGKEY; 655*2639ae9bSBen Gras break; 656*2639ae9bSBen Gras case P_RINTERNAL: 657*2639ae9bSBen Gras src = GETRINTERNAL(h, nxt); 658*2639ae9bSBen Gras nbytes = NRINTERNAL; 659*2639ae9bSBen Gras isbigkey = 0; 660*2639ae9bSBen Gras break; 661*2639ae9bSBen Gras case P_RLEAF: 662*2639ae9bSBen Gras src = rl = GETRLEAF(h, nxt); 663*2639ae9bSBen Gras nbytes = NRLEAF(rl); 664*2639ae9bSBen Gras isbigkey = 0; 665*2639ae9bSBen Gras break; 666*2639ae9bSBen Gras default: 667*2639ae9bSBen Gras abort(); 668*2639ae9bSBen Gras } 669*2639ae9bSBen Gras 670*2639ae9bSBen Gras /* 671*2639ae9bSBen Gras * If the key/data pairs are substantial fractions of the max 672*2639ae9bSBen Gras * possible size for the page, it's possible to get situations 673*2639ae9bSBen Gras * where we decide to try and copy too much onto the left page. 674*2639ae9bSBen Gras * Make sure that doesn't happen. 675*2639ae9bSBen Gras */ 676*2639ae9bSBen Gras if ((skip <= off && used + nbytes + sizeof(indx_t) >= full) || 677*2639ae9bSBen Gras nxt == top - 1) { 678*2639ae9bSBen Gras --off; 679*2639ae9bSBen Gras break; 680*2639ae9bSBen Gras } 681*2639ae9bSBen Gras 682*2639ae9bSBen Gras /* Copy the key/data pair, if not the skipped index. */ 683*2639ae9bSBen Gras if (skip != off) { 684*2639ae9bSBen Gras ++nxt; 685*2639ae9bSBen Gras 686*2639ae9bSBen Gras l->linp[off] = l->upper -= nbytes; 687*2639ae9bSBen Gras memmove((char *)(void *)l + l->upper, src, nbytes); 688*2639ae9bSBen Gras } 689*2639ae9bSBen Gras 690*2639ae9bSBen Gras temp = nbytes + sizeof(indx_t); 691*2639ae9bSBen Gras _DBFIT(temp, indx_t); 692*2639ae9bSBen Gras used += (indx_t)temp; 693*2639ae9bSBen Gras if (used >= half) { 694*2639ae9bSBen Gras if (!isbigkey || bigkeycnt == 3) 695*2639ae9bSBen Gras break; 696*2639ae9bSBen Gras else 697*2639ae9bSBen Gras ++bigkeycnt; 698*2639ae9bSBen Gras } 699*2639ae9bSBen Gras } 700*2639ae9bSBen Gras 701*2639ae9bSBen Gras /* 702*2639ae9bSBen Gras * Off is the last offset that's valid for the left page. 703*2639ae9bSBen Gras * Nxt is the first offset to be placed on the right page. 704*2639ae9bSBen Gras */ 705*2639ae9bSBen Gras temp = (off + 1) * sizeof(indx_t); 706*2639ae9bSBen Gras _DBFIT(temp, indx_t); 707*2639ae9bSBen Gras l->lower += (indx_t)temp; 708*2639ae9bSBen Gras 709*2639ae9bSBen Gras /* 710*2639ae9bSBen Gras * If splitting the page that the cursor was on, the cursor has to be 711*2639ae9bSBen Gras * adjusted to point to the same record as before the split. If the 712*2639ae9bSBen Gras * cursor is at or past the skipped slot, the cursor is incremented by 713*2639ae9bSBen Gras * one. If the cursor is on the right page, it is decremented by the 714*2639ae9bSBen Gras * number of records split to the left page. 715*2639ae9bSBen Gras */ 716*2639ae9bSBen Gras c = &t->bt_cursor; 717*2639ae9bSBen Gras if (F_ISSET(c, CURS_INIT) && c->pg.pgno == h->pgno) { 718*2639ae9bSBen Gras if (c->pg.index >= skip) 719*2639ae9bSBen Gras ++c->pg.index; 720*2639ae9bSBen Gras if (c->pg.index < nxt) /* Left page. */ 721*2639ae9bSBen Gras c->pg.pgno = l->pgno; 722*2639ae9bSBen Gras else { /* Right page. */ 723*2639ae9bSBen Gras c->pg.pgno = r->pgno; 724*2639ae9bSBen Gras c->pg.index -= nxt; 725*2639ae9bSBen Gras } 726*2639ae9bSBen Gras } 727*2639ae9bSBen Gras 728*2639ae9bSBen Gras /* 729*2639ae9bSBen Gras * If the skipped index was on the left page, just return that page. 730*2639ae9bSBen Gras * Otherwise, adjust the skip index to reflect the new position on 731*2639ae9bSBen Gras * the right page. 732*2639ae9bSBen Gras */ 733*2639ae9bSBen Gras if (skip <= off) { 734*2639ae9bSBen Gras skip = MAX_PAGE_OFFSET; 735*2639ae9bSBen Gras rval = l; 736*2639ae9bSBen Gras } else { 737*2639ae9bSBen Gras rval = r; 738*2639ae9bSBen Gras *pskip -= nxt; 739*2639ae9bSBen Gras } 740*2639ae9bSBen Gras 741*2639ae9bSBen Gras for (off = 0; nxt < top; ++off) { 742*2639ae9bSBen Gras if (skip == nxt) { 743*2639ae9bSBen Gras ++off; 744*2639ae9bSBen Gras skip = MAX_PAGE_OFFSET; 745*2639ae9bSBen Gras } 746*2639ae9bSBen Gras switch (h->flags & P_TYPE) { 747*2639ae9bSBen Gras case P_BINTERNAL: 748*2639ae9bSBen Gras src = bi = GETBINTERNAL(h, nxt); 749*2639ae9bSBen Gras nbytes = NBINTERNAL(bi->ksize); 750*2639ae9bSBen Gras break; 751*2639ae9bSBen Gras case P_BLEAF: 752*2639ae9bSBen Gras src = bl = GETBLEAF(h, nxt); 753*2639ae9bSBen Gras nbytes = NBLEAF(bl); 754*2639ae9bSBen Gras break; 755*2639ae9bSBen Gras case P_RINTERNAL: 756*2639ae9bSBen Gras src = GETRINTERNAL(h, nxt); 757*2639ae9bSBen Gras nbytes = NRINTERNAL; 758*2639ae9bSBen Gras break; 759*2639ae9bSBen Gras case P_RLEAF: 760*2639ae9bSBen Gras src = rl = GETRLEAF(h, nxt); 761*2639ae9bSBen Gras nbytes = NRLEAF(rl); 762*2639ae9bSBen Gras break; 763*2639ae9bSBen Gras default: 764*2639ae9bSBen Gras abort(); 765*2639ae9bSBen Gras } 766*2639ae9bSBen Gras ++nxt; 767*2639ae9bSBen Gras r->linp[off] = r->upper -= nbytes; 768*2639ae9bSBen Gras memmove((char *)(void *)r + r->upper, src, nbytes); 769*2639ae9bSBen Gras } 770*2639ae9bSBen Gras temp = off * sizeof(indx_t); 771*2639ae9bSBen Gras _DBFIT(temp, indx_t); 772*2639ae9bSBen Gras r->lower += (indx_t)temp; 773*2639ae9bSBen Gras 774*2639ae9bSBen Gras /* If the key is being appended to the page, adjust the index. */ 775*2639ae9bSBen Gras if (skip == top) 776*2639ae9bSBen Gras r->lower += sizeof(indx_t); 777*2639ae9bSBen Gras 778*2639ae9bSBen Gras return (rval); 779*2639ae9bSBen Gras } 780*2639ae9bSBen Gras 781*2639ae9bSBen Gras /* 782*2639ae9bSBen Gras * BT_PRESERVE -- Mark a chain of pages as used by an internal node. 783*2639ae9bSBen Gras * 784*2639ae9bSBen Gras * Chains of indirect blocks pointed to by leaf nodes get reclaimed when the 785*2639ae9bSBen Gras * record that references them gets deleted. Chains pointed to by internal 786*2639ae9bSBen Gras * pages never get deleted. This routine marks a chain as pointed to by an 787*2639ae9bSBen Gras * internal page. 788*2639ae9bSBen Gras * 789*2639ae9bSBen Gras * Parameters: 790*2639ae9bSBen Gras * t: tree 791*2639ae9bSBen Gras * pg: page number of first page in the chain. 792*2639ae9bSBen Gras * 793*2639ae9bSBen Gras * Returns: 794*2639ae9bSBen Gras * RET_SUCCESS, RET_ERROR. 795*2639ae9bSBen Gras */ 796*2639ae9bSBen Gras static int 797*2639ae9bSBen Gras bt_preserve(BTREE *t, pgno_t pg) 798*2639ae9bSBen Gras { 799*2639ae9bSBen Gras PAGE *h; 800*2639ae9bSBen Gras 801*2639ae9bSBen Gras if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 802*2639ae9bSBen Gras return (RET_ERROR); 803*2639ae9bSBen Gras h->flags |= P_PRESERVE; 804*2639ae9bSBen Gras mpool_put(t->bt_mp, h, MPOOL_DIRTY); 805*2639ae9bSBen Gras return (RET_SUCCESS); 806*2639ae9bSBen Gras } 807*2639ae9bSBen Gras 808*2639ae9bSBen Gras /* 809*2639ae9bSBen Gras * REC_TOTAL -- Return the number of recno entries below a page. 810*2639ae9bSBen Gras * 811*2639ae9bSBen Gras * Parameters: 812*2639ae9bSBen Gras * h: page 813*2639ae9bSBen Gras * 814*2639ae9bSBen Gras * Returns: 815*2639ae9bSBen Gras * The number of recno entries below a page. 816*2639ae9bSBen Gras * 817*2639ae9bSBen Gras * XXX 818*2639ae9bSBen Gras * These values could be set by the bt_psplit routine. The problem is that the 819*2639ae9bSBen Gras * entry has to be popped off of the stack etc. or the values have to be passed 820*2639ae9bSBen Gras * all the way back to bt_split/bt_rroot and it's not very clean. 821*2639ae9bSBen Gras */ 822*2639ae9bSBen Gras static recno_t 823*2639ae9bSBen Gras rec_total(PAGE *h) 824*2639ae9bSBen Gras { 825*2639ae9bSBen Gras recno_t recs; 826*2639ae9bSBen Gras indx_t nxt, top; 827*2639ae9bSBen Gras 828*2639ae9bSBen Gras for (recs = 0, nxt = 0, top = NEXTINDEX(h); nxt < top; ++nxt) 829*2639ae9bSBen Gras recs += GETRINTERNAL(h, nxt)->nrecs; 830*2639ae9bSBen Gras return (recs); 831*2639ae9bSBen Gras } 832