/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996, 1997, 1998
 *	Sleepycat Software.  All rights reserved.
 */
/*
 * Copyright (c) 1990, 1993, 1994, 1995, 1996
 *	Keith Bostic.  All rights reserved.
 */
/*
 * Copyright (c) 1990, 1993, 1994, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "config.h"

#ifndef lint
static const char sccsid[] = "@(#)bt_split.c	10.33 (Sleepycat) 10/13/98";
#endif /* not lint */

#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>

#include <errno.h>
#include <limits.h>
#include <string.h>
#endif

#include "db_int.h"
#include "db_page.h"
#include "btree.h"

static int __bam_broot __P((DBC *, PAGE *, PAGE *, PAGE *));
static int __bam_page __P((DBC *, EPG *, EPG *));
static int __bam_pinsert __P((DBC *, EPG *, PAGE *, PAGE *));
static int __bam_psplit __P((DBC *, EPG *, PAGE *, PAGE *, db_indx_t *));
static int __bam_root __P((DBC *, EPG *));
static int __ram_root __P((DBC *, PAGE *, PAGE *, PAGE *));

/*
 * __bam_split --
 *	Split a page.
 *
 * PUBLIC: int __bam_split __P((DBC *, void *));
 */
int
__bam_split(dbc, arg)
	DBC *dbc;
	void *arg;
{
	BTREE *t;
	CURSOR *cp;
	DB *dbp;
	enum { UP, DOWN } dir;
	int exact, level, ret;

	dbp = dbc->dbp;
	cp = dbc->internal;

	/*
	 * The locking protocol we use to avoid deadlock is to acquire locks
	 * by walking down the tree, but we do it as lazily as possible,
	 * locking the root only as a last resort.  We expect all stack pages
	 * to have been discarded before we're called; we discard all
	 * short-term locks.
	 *
	 * When __bam_split is first called, we know that a leaf page was too
	 * full for an insert.  We don't know what leaf page it was, but we
	 * have the key/recno that caused the problem.  We call XX_search to
	 * reacquire the leaf page, but this time get both the leaf page and
	 * its parent, locked.  We then split the leaf page and see if the new
	 * internal key will fit into the parent page.  If it will, we're done.
	 *
	 * If it won't, we discard our current locks and repeat the process,
	 * only this time acquiring the parent page and its parent, locked.
	 * This process repeats until we succeed in the split, splitting the
	 * root page as the final resort.  The entire process then repeats,
	 * as necessary, until we split a leaf page.
	 *
	 * XXX
	 * A traditional method of speeding this up is to maintain a stack of
	 * the pages traversed in the original search.  You can detect if the
	 * stack is correct by storing the page's LSN when it was searched and
	 * comparing that LSN with the current one when it's locked during the
	 * split.  This would be an easy change for this code, but I have no
	 * numbers that indicate it's worthwhile.
	 */
	t = dbp->internal;
	for (dir = UP, level = LEAFLEVEL;; dir == UP ? ++level : --level) {
		/*
		 * Acquire a page and its parent, locked.
		 */
		if ((ret = (dbp->type == DB_BTREE ?
		    __bam_search(dbc, arg, S_WRPAIR, level, NULL, &exact) :
		    __bam_rsearch(dbc,
		    (db_recno_t *)arg, S_WRPAIR, level, &exact))) != 0)
			return (ret);

		/*
		 * Split the page if it still needs it (it's possible another
		 * thread of control has already split the page).  If we are
		 * guaranteed that two items will fit on the page, the split
		 * is no longer necessary.
		 */
		if (t->bt_ovflsize * 2 <=
		    (db_indx_t)P_FREESPACE(cp->csp[0].page)) {
			__bam_stkrel(dbc, 1);
			return (0);
		}
		ret = cp->csp[0].page->pgno == PGNO_ROOT ?
		    __bam_root(dbc, &cp->csp[0]) :
		    __bam_page(dbc, &cp->csp[-1], &cp->csp[0]);
		BT_STK_CLR(cp);

		switch (ret) {
		case 0:
			/* Once we've split the leaf page, we're done. */
			if (level == LEAFLEVEL)
				return (0);

			/* Switch directions. */
			if (dir == UP)
				dir = DOWN;
			break;
		case DB_NEEDSPLIT:
			/*
			 * It's possible to fail to split repeatedly, as other
			 * threads may be modifying the tree, or the page usage
			 * is sufficiently bad that we don't get enough space
			 * the first time.
			 */
			if (dir == DOWN)
				dir = UP;
			break;
		default:
			return (ret);
		}
	}
	/* NOTREACHED */
}

/*
 * __bam_root --
 *	Split the root page of a btree.
 */
static int
__bam_root(dbc, cp)
	DBC *dbc;
	EPG *cp;
{
	DB *dbp;
	PAGE *lp, *rp;
	db_indx_t split;
	int ret;

	dbp = dbc->dbp;

	/* Yeah, right. */
	if (cp->page->level >= MAXBTREELEVEL) {
		ret = ENOSPC;
		goto err;
	}

	/* Create new left and right pages for the split. */
	lp = rp = NULL;
	if ((ret = __bam_new(dbc, TYPE(cp->page), &lp)) != 0 ||
	    (ret = __bam_new(dbc, TYPE(cp->page), &rp)) != 0)
		goto err;
	P_INIT(lp, dbp->pgsize, lp->pgno,
	    PGNO_INVALID, ISINTERNAL(cp->page) ? PGNO_INVALID : rp->pgno,
	    cp->page->level, TYPE(cp->page));
	P_INIT(rp, dbp->pgsize, rp->pgno,
	    ISINTERNAL(cp->page) ? PGNO_INVALID : lp->pgno, PGNO_INVALID,
	    cp->page->level, TYPE(cp->page));

	/* Split the page. */
	if ((ret = __bam_psplit(dbc, cp, lp, rp, &split)) != 0)
		goto err;

	/* Log the change. */
	if (DB_LOGGING(dbc)) {
		DBT __a;
		DB_LSN __lsn;
		memset(&__a, 0, sizeof(__a));
		__a.data = cp->page;
		__a.size = dbp->pgsize;
		ZERO_LSN(__lsn);
		if ((ret = __bam_split_log(dbp->dbenv->lg_info, dbc->txn,
		    &LSN(cp->page), 0, dbp->log_fileid, PGNO(lp), &LSN(lp),
		    PGNO(rp), &LSN(rp), (u_int32_t)NUM_ENT(lp), 0, &__lsn,
		    &__a)) != 0)
			goto err;
		LSN(lp) = LSN(rp) = LSN(cp->page);
	}

	/* Clean up the new root page. */
	if ((ret = (dbp->type == DB_RECNO ?
	    __ram_root(dbc, cp->page, lp, rp) :
	    __bam_broot(dbc, cp->page, lp, rp))) != 0)
		goto err;

	/* Adjust any cursors.  Do it last so we don't have to undo it. */
	__bam_ca_split(dbp, cp->page->pgno, lp->pgno, rp->pgno, split, 1);

	/* Success -- write the real pages back to the store. */
	(void)memp_fput(dbp->mpf, cp->page, DB_MPOOL_DIRTY);
	(void)__BT_TLPUT(dbc, cp->lock);
	(void)memp_fput(dbp->mpf, lp, DB_MPOOL_DIRTY);
	(void)memp_fput(dbp->mpf, rp, DB_MPOOL_DIRTY);

	return (0);

err:	if (lp != NULL)
		(void)__bam_free(dbc, lp);
	if (rp != NULL)
		(void)__bam_free(dbc, rp);
	(void)memp_fput(dbp->mpf, cp->page, 0);
	(void)__BT_TLPUT(dbc, cp->lock);
	return (ret);
}

/*
 * __bam_page --
 *	Split the non-root page of a btree.
 */
static int
__bam_page(dbc, pp, cp)
	DBC *dbc;
	EPG *pp, *cp;
{
	DB *dbp;
	DB_LOCK tplock;
	PAGE *lp, *rp, *tp;
	db_indx_t split;
	int ret;

	dbp = dbc->dbp;
	lp = rp = tp = NULL;
	ret = -1;

	/* Create new right page for the split. */
	if ((ret = __bam_new(dbc, TYPE(cp->page), &rp)) != 0)
		goto err;
	P_INIT(rp, dbp->pgsize, rp->pgno,
	    ISINTERNAL(cp->page) ? PGNO_INVALID : cp->page->pgno,
	    ISINTERNAL(cp->page) ? PGNO_INVALID : cp->page->next_pgno,
	    cp->page->level, TYPE(cp->page));

	/* Create new left page for the split. */
	if ((ret = __os_malloc(dbp->pgsize, NULL, &lp)) != 0)
		goto err;
	P_INIT(lp, dbp->pgsize, cp->page->pgno,
	    ISINTERNAL(cp->page) ? PGNO_INVALID : cp->page->prev_pgno,
	    ISINTERNAL(cp->page) ? PGNO_INVALID : rp->pgno,
	    cp->page->level, TYPE(cp->page));
	ZERO_LSN(lp->lsn);

	/*
	 * Split right.
	 *
	 * Only the indices are sorted on the page, i.e., the key/data pairs
	 * aren't, so it's simpler to copy the data from the split page onto
	 * two new pages instead of copying half the data to the right page
	 * and compacting the left page in place.  Since the left page can't
	 * change, we swap the original and the allocated left page after the
	 * split.
	 */
	if ((ret = __bam_psplit(dbc, cp, lp, rp, &split)) != 0)
		goto err;

	/*
	 * Fix up the previous pointer of any leaf page following the split
	 * page.
	 *
	 * !!!
	 * There are interesting deadlock situations here as we write-lock a
	 * page that's not in our direct ancestry.  Consider a cursor walking
	 * through the leaf pages, that has the previous page read-locked and
	 * is waiting on a lock for the page we just split.  It will deadlock
	 * here.  If this is a problem, we can fail in the split; it's not a
	 * problem as the split will succeed after the cursor passes through
	 * the page we're splitting.
	 */
	if (TYPE(cp->page) == P_LBTREE && rp->next_pgno != PGNO_INVALID) {
		if ((ret = __bam_lget(dbc,
		    0, rp->next_pgno, DB_LOCK_WRITE, &tplock)) != 0)
			goto err;
		if ((ret = memp_fget(dbp->mpf, &rp->next_pgno, 0, &tp)) != 0)
			goto err;
	}

	/* Insert the new pages into the parent page. */
	if ((ret = __bam_pinsert(dbc, pp, lp, rp)) != 0)
		goto err;

	/* Log the change. */
	if (DB_LOGGING(dbc)) {
		DBT __a;
		DB_LSN __lsn;
		memset(&__a, 0, sizeof(__a));
		__a.data = cp->page;
		__a.size = dbp->pgsize;
		if (tp == NULL)
			ZERO_LSN(__lsn);
		if ((ret = __bam_split_log(dbp->dbenv->lg_info, dbc->txn,
		    &cp->page->lsn, 0, dbp->log_fileid, PGNO(cp->page),
		    &LSN(cp->page), PGNO(rp), &LSN(rp), (u_int32_t)NUM_ENT(lp),
		    tp == NULL ? 0 : PGNO(tp),
		    tp == NULL ? &__lsn : &LSN(tp), &__a)) != 0)
			goto err;

		LSN(lp) = LSN(rp) = LSN(cp->page);
		if (tp != NULL)
			LSN(tp) = LSN(cp->page);
	}

	/* Copy the allocated page into place. */
	memcpy(cp->page, lp, LOFFSET(lp));
	memcpy((u_int8_t *)cp->page + HOFFSET(lp),
	    (u_int8_t *)lp + HOFFSET(lp), dbp->pgsize - HOFFSET(lp));
	__os_free(lp, dbp->pgsize);
	lp = NULL;

	/* Finish the next-page link. */
	if (tp != NULL)
		tp->prev_pgno = rp->pgno;

	/* Adjust any cursors.  Do so last so we don't have to undo it. */
	__bam_ca_split(dbp, cp->page->pgno, cp->page->pgno, rp->pgno, split, 0);

	/* Success -- write the real pages back to the store. */
	(void)memp_fput(dbp->mpf, pp->page, DB_MPOOL_DIRTY);
	(void)__BT_TLPUT(dbc, pp->lock);
	(void)memp_fput(dbp->mpf, cp->page, DB_MPOOL_DIRTY);
	(void)__BT_TLPUT(dbc, cp->lock);
	(void)memp_fput(dbp->mpf, rp, DB_MPOOL_DIRTY);
	if (tp != NULL) {
		(void)memp_fput(dbp->mpf, tp, DB_MPOOL_DIRTY);
		(void)__BT_TLPUT(dbc, tplock);
	}
	return (0);

err:	if (lp != NULL)
		__os_free(lp, dbp->pgsize);
	if (rp != NULL)
		(void)__bam_free(dbc, rp);
	if (tp != NULL) {
		(void)memp_fput(dbp->mpf, tp, 0);
		if (ret == DB_NEEDSPLIT)
			(void)__BT_LPUT(dbc, tplock);
		else
			(void)__BT_TLPUT(dbc, tplock);
	}
	(void)memp_fput(dbp->mpf, pp->page, 0);
	if (ret == DB_NEEDSPLIT)
		(void)__BT_LPUT(dbc, pp->lock);
	else
		(void)__BT_TLPUT(dbc, pp->lock);
	(void)memp_fput(dbp->mpf, cp->page, 0);
	if (ret == DB_NEEDSPLIT)
		(void)__BT_LPUT(dbc, cp->lock);
	else
		(void)__BT_TLPUT(dbc, cp->lock);
	return (ret);
}

/*
 * __bam_broot --
 *	Fix up the btree root page after it has been split.
 */
static int
__bam_broot(dbc, rootp, lp, rp)
	DBC *dbc;
	PAGE *rootp, *lp, *rp;
{
	BINTERNAL bi, *child_bi;
	BKEYDATA *child_bk;
	DB *dbp;
	DBT hdr, data;
	int ret;

	dbp = dbc->dbp;

	/*
	 * If the root page was a leaf page, change it into an internal page.
	 * We copy the key we split on (but not the key's data, in the case of
	 * a leaf page) to the new root page.
	 */
	P_INIT(rootp, dbp->pgsize,
	    PGNO_ROOT, PGNO_INVALID, PGNO_INVALID, lp->level + 1, P_IBTREE);

	memset(&data, 0, sizeof(data));
	memset(&hdr, 0, sizeof(hdr));

	/*
	 * The btree comparison code guarantees that the left-most key on any
	 * level of the tree is never used, so it doesn't need to be filled in.
	 */
	memset(&bi, 0, sizeof(bi));
	bi.len = 0;
	B_TSET(bi.type, B_KEYDATA, 0);
	bi.pgno = lp->pgno;
	if (F_ISSET(dbp, DB_BT_RECNUM)) {
		bi.nrecs = __bam_total(lp);
		RE_NREC_SET(rootp, bi.nrecs);
	}
	hdr.data = &bi;
	hdr.size = SSZA(BINTERNAL, data);
	if ((ret =
	    __db_pitem(dbc, rootp, 0, BINTERNAL_SIZE(0), &hdr, NULL)) != 0)
		return (ret);

	switch (TYPE(rp)) {
	case P_IBTREE:
		/* Copy the first key of the child page onto the root page. */
		child_bi = GET_BINTERNAL(rp, 0);

		bi.len = child_bi->len;
		B_TSET(bi.type, child_bi->type, 0);
		bi.pgno = rp->pgno;
		if (F_ISSET(dbp, DB_BT_RECNUM)) {
			bi.nrecs = __bam_total(rp);
			RE_NREC_ADJ(rootp, bi.nrecs);
		}
		hdr.data = &bi;
		hdr.size = SSZA(BINTERNAL, data);
		data.data = child_bi->data;
		data.size = child_bi->len;
		if ((ret = __db_pitem(dbc, rootp, 1,
		    BINTERNAL_SIZE(child_bi->len), &hdr, &data)) != 0)
			return (ret);

		/* Increment the overflow ref count. */
		if (B_TYPE(child_bi->type) == B_OVERFLOW)
			if ((ret = __db_ovref(dbc,
			    ((BOVERFLOW *)(child_bi->data))->pgno, 1)) != 0)
				return (ret);
		break;
	case P_LBTREE:
		/* Copy the first key of the child page onto the root page. */
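		/*
		 * A leaf item may be a plain key (B_KEYDATA) or an on-page
		 * reference to an overflow or duplicate chain (B_OVERFLOW,
		 * B_DUPLICATE); the new internal key is built accordingly
		 * in the switch below.
		 */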
		child_bk = GET_BKEYDATA(rp, 0);
		switch (B_TYPE(child_bk->type)) {
		case B_KEYDATA:
			bi.len = child_bk->len;
			B_TSET(bi.type, child_bk->type, 0);
			bi.pgno = rp->pgno;
			if (F_ISSET(dbp, DB_BT_RECNUM)) {
				bi.nrecs = __bam_total(rp);
				RE_NREC_ADJ(rootp, bi.nrecs);
			}
			hdr.data = &bi;
			hdr.size = SSZA(BINTERNAL, data);
			data.data = child_bk->data;
			data.size = child_bk->len;
			if ((ret = __db_pitem(dbc, rootp, 1,
			    BINTERNAL_SIZE(child_bk->len), &hdr, &data)) != 0)
				return (ret);
			break;
		case B_DUPLICATE:
		case B_OVERFLOW:
			bi.len = BOVERFLOW_SIZE;
			B_TSET(bi.type, child_bk->type, 0);
			bi.pgno = rp->pgno;
			if (F_ISSET(dbp, DB_BT_RECNUM)) {
				bi.nrecs = __bam_total(rp);
				RE_NREC_ADJ(rootp, bi.nrecs);
			}
			hdr.data = &bi;
			hdr.size = SSZA(BINTERNAL, data);
			data.data = child_bk;
			data.size = BOVERFLOW_SIZE;
			if ((ret = __db_pitem(dbc, rootp, 1,
			    BINTERNAL_SIZE(BOVERFLOW_SIZE), &hdr, &data)) != 0)
				return (ret);

			/* Increment the overflow ref count. */
			if (B_TYPE(child_bk->type) == B_OVERFLOW)
				if ((ret = __db_ovref(dbc,
				    ((BOVERFLOW *)child_bk)->pgno, 1)) != 0)
					return (ret);
			break;
		default:
			return (__db_pgfmt(dbp, rp->pgno));
		}
		break;
	default:
		return (__db_pgfmt(dbp, rp->pgno));
	}
	return (0);
}

/*
 * __ram_root --
 *	Fix up the recno root page after it has been split.
 */
static int
__ram_root(dbc, rootp, lp, rp)
	DBC *dbc;
	PAGE *rootp, *lp, *rp;
{
	DB *dbp;
	DBT hdr;
	RINTERNAL ri;
	int ret;

	dbp = dbc->dbp;

	/* Initialize the page. */
	P_INIT(rootp, dbp->pgsize,
	    PGNO_ROOT, PGNO_INVALID, PGNO_INVALID, lp->level + 1, P_IRECNO);

	/* Initialize the header. */
	memset(&hdr, 0, sizeof(hdr));
	hdr.data = &ri;
	hdr.size = RINTERNAL_SIZE;

	/* Insert the left and right keys, set the header information. */
	ri.pgno = lp->pgno;
	ri.nrecs = __bam_total(lp);
	if ((ret = __db_pitem(dbc, rootp, 0, RINTERNAL_SIZE, &hdr, NULL)) != 0)
		return (ret);
	RE_NREC_SET(rootp, ri.nrecs);
	ri.pgno = rp->pgno;
	ri.nrecs = __bam_total(rp);
	if ((ret = __db_pitem(dbc, rootp, 1, RINTERNAL_SIZE, &hdr, NULL)) != 0)
		return (ret);
	RE_NREC_ADJ(rootp, ri.nrecs);
	return (0);
}

/*
 * __bam_pinsert --
 *	Insert a new key into a parent page, completing the split.
 */
static int
__bam_pinsert(dbc, parent, lchild, rchild)
	DBC *dbc;
	EPG *parent;
	PAGE *lchild, *rchild;
{
	BINTERNAL bi, *child_bi;
	BKEYDATA *child_bk, *tmp_bk;
	BTREE *t;
	DB *dbp;
	DBT a, b, hdr, data;
	PAGE *ppage;
	RINTERNAL ri;
	db_indx_t off;
	db_recno_t nrecs;
	u_int32_t n, nbytes, nksize;
	int ret;

	dbp = dbc->dbp;
	t = dbp->internal;
	ppage = parent->page;

	/* If handling record numbers, count records split to the right page. */
	nrecs = dbp->type == DB_RECNO || F_ISSET(dbp, DB_BT_RECNUM) ?
	    __bam_total(rchild) : 0;

	/*
	 * Now we insert the new page's first key into the parent page, which
	 * completes the split.  The parent points to a PAGE and a page index
	 * offset, where the new key goes ONE AFTER the index, because we split
	 * to the right.
	 *
	 * XXX
	 * Some btree algorithms replace the key for the old page as well as
	 * the new page.  We don't, as there's no reason to believe that the
	 * first key on the old page is any better than the key we have, and,
	 * in the case of a key being placed at index 0 causing the split, the
	 * key is unavailable.
	 */
	off = parent->indx + O_INDX;

	/*
	 * Calculate the space needed on the parent page.
	 *
	 * Prefix trees: space hack used when inserting into BINTERNAL pages.
	 * Retain only what's needed to distinguish between the new entry and
	 * the LAST entry on the page to its left.  If the keys compare equal,
	 * retain the entire key.  We ignore overflow keys, and the entire key
	 * must be retained for the next-to-leftmost key on the leftmost page
	 * of each level, or the search will fail.
	 * Applicable ONLY to internal pages that have leaf pages as children.
	 * Further reduction of the key between pairs of internal pages loses
	 * too much information.
	 */
	switch (TYPE(rchild)) {
	case P_IBTREE:
		child_bi = GET_BINTERNAL(rchild, 0);
		nbytes = BINTERNAL_PSIZE(child_bi->len);

		if (P_FREESPACE(ppage) < nbytes)
			return (DB_NEEDSPLIT);

		/* Add a new record for the right page. */
		memset(&bi, 0, sizeof(bi));
		bi.len = child_bi->len;
		B_TSET(bi.type, child_bi->type, 0);
		bi.pgno = rchild->pgno;
		bi.nrecs = nrecs;
		memset(&hdr, 0, sizeof(hdr));
		hdr.data = &bi;
		hdr.size = SSZA(BINTERNAL, data);
		memset(&data, 0, sizeof(data));
		data.data = child_bi->data;
		data.size = child_bi->len;
		if ((ret = __db_pitem(dbc, ppage, off,
		    BINTERNAL_SIZE(child_bi->len), &hdr, &data)) != 0)
			return (ret);

		/* Increment the overflow ref count. */
		if (B_TYPE(child_bi->type) == B_OVERFLOW)
			if ((ret = __db_ovref(dbc,
			    ((BOVERFLOW *)(child_bi->data))->pgno, 1)) != 0)
				return (ret);
		break;
	case P_LBTREE:
		child_bk = GET_BKEYDATA(rchild, 0);
		switch (B_TYPE(child_bk->type)) {
		case B_KEYDATA:
			nbytes = BINTERNAL_PSIZE(child_bk->len);
			nksize = child_bk->len;
			if (t->bt_prefix == NULL)
				goto noprefix;
			if (ppage->prev_pgno == PGNO_INVALID && off <= 1)
				goto noprefix;
			tmp_bk = GET_BKEYDATA(lchild, NUM_ENT(lchild) - P_INDX);
			if (B_TYPE(tmp_bk->type) != B_KEYDATA)
				goto noprefix;
			memset(&a, 0, sizeof(a));
			a.size = tmp_bk->len;
			a.data = tmp_bk->data;
			memset(&b, 0, sizeof(b));
			b.size = child_bk->len;
			b.data = child_bk->data;
			nksize = t->bt_prefix(&a, &b);
			if ((n = BINTERNAL_PSIZE(nksize)) < nbytes)
				nbytes = n;
			else
noprefix:			nksize = child_bk->len;

			if (P_FREESPACE(ppage) < nbytes)
				return (DB_NEEDSPLIT);

			memset(&bi, 0, sizeof(bi));
			bi.len = nksize;
			B_TSET(bi.type, child_bk->type, 0);
			bi.pgno = rchild->pgno;
			bi.nrecs = nrecs;
			memset(&hdr, 0, sizeof(hdr));
			hdr.data = &bi;
			hdr.size = SSZA(BINTERNAL, data);
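			/*
			 * The data DBT carries only the first nksize bytes of
			 * the child's key, i.e., the prefix-compressed key
			 * computed above.
			 */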
			memset(&data, 0, sizeof(data));
			data.data = child_bk->data;
			data.size = nksize;
			if ((ret = __db_pitem(dbc, ppage, off,
			    BINTERNAL_SIZE(nksize), &hdr, &data)) != 0)
				return (ret);
			break;
		case B_DUPLICATE:
		case B_OVERFLOW:
			nbytes = BINTERNAL_PSIZE(BOVERFLOW_SIZE);

			if (P_FREESPACE(ppage) < nbytes)
				return (DB_NEEDSPLIT);

			memset(&bi, 0, sizeof(bi));
			bi.len = BOVERFLOW_SIZE;
			B_TSET(bi.type, child_bk->type, 0);
			bi.pgno = rchild->pgno;
			bi.nrecs = nrecs;
			memset(&hdr, 0, sizeof(hdr));
			hdr.data = &bi;
			hdr.size = SSZA(BINTERNAL, data);
			memset(&data, 0, sizeof(data));
			data.data = child_bk;
			data.size = BOVERFLOW_SIZE;
			if ((ret = __db_pitem(dbc, ppage, off,
			    BINTERNAL_SIZE(BOVERFLOW_SIZE), &hdr, &data)) != 0)
				return (ret);

			/* Increment the overflow ref count. */
			if (B_TYPE(child_bk->type) == B_OVERFLOW)
				if ((ret = __db_ovref(dbc,
				    ((BOVERFLOW *)child_bk)->pgno, 1)) != 0)
					return (ret);
			break;
		default:
			return (__db_pgfmt(dbp, rchild->pgno));
		}
		break;
	case P_IRECNO:
	case P_LRECNO:
		nbytes = RINTERNAL_PSIZE;

		if (P_FREESPACE(ppage) < nbytes)
			return (DB_NEEDSPLIT);

		/* Add a new record for the right page. */
		memset(&hdr, 0, sizeof(hdr));
		hdr.data = &ri;
		hdr.size = RINTERNAL_SIZE;
		ri.pgno = rchild->pgno;
		ri.nrecs = nrecs;
		if ((ret = __db_pitem(dbc,
		    ppage, off, RINTERNAL_SIZE, &hdr, NULL)) != 0)
			return (ret);
		break;
	default:
		return (__db_pgfmt(dbp, rchild->pgno));
	}

	/* Adjust the parent page's left page record count. */
	if (dbp->type == DB_RECNO || F_ISSET(dbp, DB_BT_RECNUM)) {
		/* Log the change. */
		if (DB_LOGGING(dbc) &&
		    (ret = __bam_cadjust_log(dbp->dbenv->lg_info,
		    dbc->txn, &LSN(ppage), 0, dbp->log_fileid,
		    PGNO(ppage), &LSN(ppage), (u_int32_t)parent->indx,
		    -(int32_t)nrecs, (int32_t)0)) != 0)
			return (ret);

		/* Update the left page count. */
		if (dbp->type == DB_RECNO)
			GET_RINTERNAL(ppage, parent->indx)->nrecs -= nrecs;
		else
			GET_BINTERNAL(ppage, parent->indx)->nrecs -= nrecs;
	}

	return (0);
}

/*
 * __bam_psplit --
 *	Do the real work of splitting the page.
 */
static int
__bam_psplit(dbc, cp, lp, rp, splitret)
	DBC *dbc;
	EPG *cp;
	PAGE *lp, *rp;
	db_indx_t *splitret;
{
	DB *dbp;
	PAGE *pp;
	db_indx_t half, nbytes, off, splitp, top;
	int adjust, cnt, isbigkey, ret;

	dbp = dbc->dbp;
	pp = cp->page;
	adjust = TYPE(pp) == P_LBTREE ? P_INDX : O_INDX;

	/*
	 * If we're splitting the first (last) page on a level because we're
	 * inserting (appending) a key to it, it's likely that the data is
	 * sorted.  Moving a single item to the new page is less work and can
	 * push the fill factor higher than normal.  If we're wrong it's not
	 * a big deal, we'll just do the split the right way next time.
	 */
	off = 0;
	if (NEXT_PGNO(pp) == PGNO_INVALID &&
	    ((ISINTERNAL(pp) && cp->indx == NUM_ENT(cp->page) - 1) ||
	    (!ISINTERNAL(pp) && cp->indx == NUM_ENT(cp->page))))
		off = NUM_ENT(cp->page) - adjust;
	else if (PREV_PGNO(pp) == PGNO_INVALID && cp->indx == 0)
		off = adjust;

	if (off != 0)
		goto sort;

	/*
	 * Split the data to the left and right pages.  Try not to split on
	 * an overflow key.  (Overflow keys on internal pages will slow down
	 * searches.)  Refuse to split in the middle of a set of duplicates.
	 *
	 * First, find the optimum place to split.
	 *
	 * It's possible to try and split past the last record on the page if
	 * there's a very large record at the end of the page.  Make sure this
	 * doesn't happen by bounding the check at the next-to-last entry on
	 * the page.
	 *
	 * Note, we try and split half the data present on the page.  This is
	 * because another process may have already split the page and left
	 * it half empty.  We don't try and skip the split -- we don't know
	 * how much space we're going to need on the page, and we may need up
	 * to half the page for a big item, so there's no easy test to decide
	 * if we need to split or not.
	 * Besides, if two threads are inserting data into the same place in
	 * the database, we're probably going to need more space soon anyway.
	 */
	top = NUM_ENT(pp) - adjust;
	half = (dbp->pgsize - HOFFSET(pp)) / 2;
	for (nbytes = 0, off = 0; off < top && nbytes < half; ++off)
		switch (TYPE(pp)) {
		case P_IBTREE:
			if (B_TYPE(GET_BINTERNAL(pp, off)->type) == B_KEYDATA)
				nbytes +=
				    BINTERNAL_SIZE(GET_BINTERNAL(pp, off)->len);
			else
				nbytes += BINTERNAL_SIZE(BOVERFLOW_SIZE);
			break;
		case P_LBTREE:
			if (B_TYPE(GET_BKEYDATA(pp, off)->type) == B_KEYDATA)
				nbytes +=
				    BKEYDATA_SIZE(GET_BKEYDATA(pp, off)->len);
			else
				nbytes += BOVERFLOW_SIZE;

			++off;
			if (B_TYPE(GET_BKEYDATA(pp, off)->type) == B_KEYDATA)
				nbytes +=
				    BKEYDATA_SIZE(GET_BKEYDATA(pp, off)->len);
			else
				nbytes += BOVERFLOW_SIZE;
			break;
		case P_IRECNO:
			nbytes += RINTERNAL_SIZE;
			break;
		case P_LRECNO:
			nbytes += BKEYDATA_SIZE(GET_BKEYDATA(pp, off)->len);
			break;
		default:
			return (__db_pgfmt(dbp, pp->pgno));
		}
sort:	splitp = off;

	/*
	 * Splitp is either at or just past the optimum split point.  If
	 * it's a big key, try and find something close by that's not.
	 */
	if (TYPE(pp) == P_IBTREE)
		isbigkey = B_TYPE(GET_BINTERNAL(pp, off)->type) != B_KEYDATA;
	else if (TYPE(pp) == P_LBTREE)
		isbigkey = B_TYPE(GET_BKEYDATA(pp, off)->type) != B_KEYDATA;
	else
		isbigkey = 0;
	if (isbigkey)
		for (cnt = 1; cnt <= 3; ++cnt) {
			off = splitp + cnt * adjust;
			if (off < (db_indx_t)NUM_ENT(pp) &&
			    ((TYPE(pp) == P_IBTREE &&
			    B_TYPE(GET_BINTERNAL(pp,off)->type) == B_KEYDATA) ||
			    B_TYPE(GET_BKEYDATA(pp, off)->type) == B_KEYDATA)) {
				splitp = off;
				break;
			}
			if (splitp <= (db_indx_t)(cnt * adjust))
				continue;
			off = splitp - cnt * adjust;
			if (TYPE(pp) == P_IBTREE ?
			    B_TYPE(GET_BINTERNAL(pp, off)->type) == B_KEYDATA :
			    B_TYPE(GET_BKEYDATA(pp, off)->type) == B_KEYDATA) {
				splitp = off;
				break;
			}
		}

	/*
	 * We can't split in the middle of a set of duplicates.  We know that
	 * no duplicate set can take up more than about 25% of the page,
	 * because that's the point where we push it off onto a duplicate
	 * page set.  So, this loop can't be unbounded.
	 */
	if (F_ISSET(dbp, DB_AM_DUP) && TYPE(pp) == P_LBTREE &&
	    pp->inp[splitp] == pp->inp[splitp - adjust])
		for (cnt = 1;; ++cnt) {
			off = splitp + cnt * adjust;
			if (off < NUM_ENT(pp) &&
			    pp->inp[splitp] != pp->inp[off]) {
				splitp = off;
				break;
			}
			if (splitp <= (db_indx_t)(cnt * adjust))
				continue;
			off = splitp - cnt * adjust;
			if (pp->inp[splitp] != pp->inp[off]) {
				splitp = off + adjust;
				break;
			}
		}

	/* We're going to split at splitp. */
	if ((ret = __bam_copy(dbp, pp, lp, 0, splitp)) != 0)
		return (ret);
	if ((ret = __bam_copy(dbp, pp, rp, splitp, NUM_ENT(pp))) != 0)
		return (ret);

	*splitret = splitp;
	return (0);
}

/*
 * __bam_copy --
 *	Copy a set of records from one page to another.
 *
 * PUBLIC: int __bam_copy __P((DB *, PAGE *, PAGE *, u_int32_t, u_int32_t));
 */
int
__bam_copy(dbp, pp, cp, nxt, stop)
	DB *dbp;
	PAGE *pp, *cp;
	u_int32_t nxt, stop;
{
	db_indx_t nbytes, off;

	/*
	 * Copy the rest of the data to the right page.  Nxt is the next
	 * offset placed on the target page.
	 */
	for (off = 0; nxt < stop; ++nxt, ++NUM_ENT(cp), ++off) {
		switch (TYPE(pp)) {
		case P_IBTREE:
			if (B_TYPE(GET_BINTERNAL(pp, nxt)->type) == B_KEYDATA)
				nbytes =
				    BINTERNAL_SIZE(GET_BINTERNAL(pp, nxt)->len);
			else
				nbytes = BINTERNAL_SIZE(BOVERFLOW_SIZE);
			break;
		case P_LBTREE:
			/*
			 * If we're on a key and it's a duplicate, just copy
			 * the offset.
			 */
			if (off != 0 && (nxt % P_INDX) == 0 &&
			    pp->inp[nxt] == pp->inp[nxt - P_INDX]) {
				cp->inp[off] = cp->inp[off - P_INDX];
				continue;
			}
			/* FALLTHROUGH */
		case P_LRECNO:
			if (B_TYPE(GET_BKEYDATA(pp, nxt)->type) == B_KEYDATA)
				nbytes =
				    BKEYDATA_SIZE(GET_BKEYDATA(pp, nxt)->len);
			else
				nbytes = BOVERFLOW_SIZE;
			break;
		case P_IRECNO:
			nbytes = RINTERNAL_SIZE;
			break;
		default:
			return (__db_pgfmt(dbp, pp->pgno));
		}
		cp->inp[off] = HOFFSET(cp) -= nbytes;
		memcpy(P_ENTRY(cp, off), P_ENTRY(pp, nxt), nbytes);
	}
	return (0);
}