1*aae80e6bSchristos /* $NetBSD: bt_seq.c,v 1.20 2016/09/24 21:31:25 christos Exp $ */
2a954b078Scgd
39f0aa214Scgd /*-
424420c01Scgd * Copyright (c) 1990, 1993, 1994
59f0aa214Scgd * The Regents of the University of California. All rights reserved.
69f0aa214Scgd *
79f0aa214Scgd * This code is derived from software contributed to Berkeley by
89f0aa214Scgd * Mike Olson.
99f0aa214Scgd *
109f0aa214Scgd * Redistribution and use in source and binary forms, with or without
119f0aa214Scgd * modification, are permitted provided that the following conditions
129f0aa214Scgd * are met:
139f0aa214Scgd * 1. Redistributions of source code must retain the above copyright
149f0aa214Scgd * notice, this list of conditions and the following disclaimer.
159f0aa214Scgd * 2. Redistributions in binary form must reproduce the above copyright
169f0aa214Scgd * notice, this list of conditions and the following disclaimer in the
179f0aa214Scgd * documentation and/or other materials provided with the distribution.
18eb7c1594Sagc * 3. Neither the name of the University nor the names of its contributors
199f0aa214Scgd * may be used to endorse or promote products derived from this software
209f0aa214Scgd * without specific prior written permission.
219f0aa214Scgd *
229f0aa214Scgd * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
239f0aa214Scgd * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
249f0aa214Scgd * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
259f0aa214Scgd * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
269f0aa214Scgd * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
279f0aa214Scgd * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
289f0aa214Scgd * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
299f0aa214Scgd * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
309f0aa214Scgd * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
319f0aa214Scgd * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
329f0aa214Scgd * SUCH DAMAGE.
339f0aa214Scgd */
349f0aa214Scgd
35d3595ddfSjoerg #if HAVE_NBTOOL_CONFIG_H
36d3595ddfSjoerg #include "nbtool_config.h"
37d3595ddfSjoerg #endif
38d3595ddfSjoerg
3900ae392dSchristos #include <sys/cdefs.h>
40*aae80e6bSchristos __RCSID("$NetBSD: bt_seq.c,v 1.20 2016/09/24 21:31:25 christos Exp $");
419f0aa214Scgd
4243fa6fe3Sjtc #include "namespace.h"
439f0aa214Scgd #include <sys/types.h>
449f0aa214Scgd
45cb9daf8fSchristos #include <assert.h>
469f0aa214Scgd #include <errno.h>
479f0aa214Scgd #include <stddef.h>
489f0aa214Scgd #include <stdio.h>
499f0aa214Scgd #include <stdlib.h>
509f0aa214Scgd
519f0aa214Scgd #include <db.h>
529f0aa214Scgd #include "btree.h"
539f0aa214Scgd
54cb9daf8fSchristos static int __bt_first(BTREE *, const DBT *, EPG *, int *);
55cb9daf8fSchristos static int __bt_seqadv(BTREE *, EPG *, int);
56cb9daf8fSchristos static int __bt_seqset(BTREE *, EPG *, DBT *, int);
5786147b1cSchristos static int __bt_rseq_next(BTREE *, EPG *);
5886147b1cSchristos static int __bt_rseq_prev(BTREE *, EPG *);
599f0aa214Scgd
609f0aa214Scgd /*
619f0aa214Scgd * Sequential scan support.
629f0aa214Scgd *
6324420c01Scgd * The tree can be scanned sequentially, starting from either end of the
6424420c01Scgd * tree or from any specific key. A scan request before any scanning is
6524420c01Scgd * done is initialized as starting from the least node.
669f0aa214Scgd */
679f0aa214Scgd
689f0aa214Scgd /*
6924420c01Scgd * __bt_seq --
7024420c01Scgd * Btree sequential scan interface.
719f0aa214Scgd *
729f0aa214Scgd * Parameters:
739f0aa214Scgd * dbp: pointer to access method
749f0aa214Scgd * key: key for positioning and return value
759f0aa214Scgd * data: data return value
7686147b1cSchristos * flags: R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV, R_RNEXT, R_RPREV.
779f0aa214Scgd *
789f0aa214Scgd * Returns:
799f0aa214Scgd * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
809f0aa214Scgd */
819f0aa214Scgd int
__bt_seq(const DB * dbp,DBT * key,DBT * data,u_int flags)82cb9daf8fSchristos __bt_seq(const DB *dbp, DBT *key, DBT *data, u_int flags)
839f0aa214Scgd {
849f0aa214Scgd BTREE *t;
859f0aa214Scgd EPG e;
869f0aa214Scgd int status;
879f0aa214Scgd
8845e27c80Scgd t = dbp->internal;
8945e27c80Scgd
9045e27c80Scgd /* Toss any page pinned across calls. */
9145e27c80Scgd if (t->bt_pinned != NULL) {
9245e27c80Scgd mpool_put(t->bt_mp, t->bt_pinned, 0);
9345e27c80Scgd t->bt_pinned = NULL;
9445e27c80Scgd }
9545e27c80Scgd
969f0aa214Scgd /*
9727e03817Sryoon * If scan uninitialized as yet, or starting at a specific record, set
9824420c01Scgd * the scan to a specific key. Both __bt_seqset and __bt_seqadv pin
9924420c01Scgd * the page the cursor references if they're successful.
1009f0aa214Scgd */
1019f0aa214Scgd switch (flags) {
1029f0aa214Scgd case R_NEXT:
1039f0aa214Scgd case R_PREV:
10486147b1cSchristos case R_RNEXT:
10586147b1cSchristos case R_RPREV:
10624420c01Scgd if (F_ISSET(&t->bt_cursor, CURS_INIT)) {
10761238e71Schristos status = __bt_seqadv(t, &e, (int)flags);
1089f0aa214Scgd break;
1099f0aa214Scgd }
1109f0aa214Scgd /* FALLTHROUGH */
1119f0aa214Scgd case R_FIRST:
1129f0aa214Scgd case R_LAST:
11324420c01Scgd case R_CURSOR:
11461238e71Schristos status = __bt_seqset(t, &e, key, (int)flags);
1159f0aa214Scgd break;
1169f0aa214Scgd default:
1179f0aa214Scgd errno = EINVAL;
1189f0aa214Scgd return (RET_ERROR);
1199f0aa214Scgd }
1209f0aa214Scgd
1219f0aa214Scgd if (status == RET_SUCCESS) {
12261238e71Schristos __bt_setcur(t, e.page->pgno, (u_int)e.index);
1239f0aa214Scgd
12424420c01Scgd status =
12524420c01Scgd __bt_ret(t, &e, key, &t->bt_rkey, data, &t->bt_rdata, 0);
12645e27c80Scgd
12745e27c80Scgd /*
12845e27c80Scgd * If the user is doing concurrent access, we copied the
12945e27c80Scgd * key/data, toss the page.
13045e27c80Scgd */
13124420c01Scgd if (F_ISSET(t, B_DB_LOCK))
1329f0aa214Scgd mpool_put(t->bt_mp, e.page, 0);
13345e27c80Scgd else
13445e27c80Scgd t->bt_pinned = e.page;
1359f0aa214Scgd }
1369f0aa214Scgd return (status);
1379f0aa214Scgd }
1389f0aa214Scgd
1399f0aa214Scgd /*
14024420c01Scgd * __bt_seqset --
14124420c01Scgd * Set the sequential scan to a specific key.
1429f0aa214Scgd *
1439f0aa214Scgd * Parameters:
1449f0aa214Scgd * t: tree
1459f0aa214Scgd * ep: storage for returned key
1469f0aa214Scgd * key: key for initial scan position
14786147b1cSchristos * flags: R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV, R_RNEXT, R_RPREV.
1489f0aa214Scgd *
1499f0aa214Scgd * Side effects:
1509f0aa214Scgd * Pins the page the cursor references.
1519f0aa214Scgd *
1529f0aa214Scgd * Returns:
1539f0aa214Scgd * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
1549f0aa214Scgd */
1559f0aa214Scgd static int
__bt_seqset(BTREE * t,EPG * ep,DBT * key,int flags)156cb9daf8fSchristos __bt_seqset(BTREE *t, EPG *ep, DBT *key, int flags)
1579f0aa214Scgd {
1589f0aa214Scgd PAGE *h;
1599f0aa214Scgd pgno_t pg;
1609f0aa214Scgd int exact;
1619f0aa214Scgd
1629f0aa214Scgd /*
16324420c01Scgd * Find the first, last or specific key in the tree and point the
16424420c01Scgd * cursor at it. The cursor may not be moved until a new key has
16524420c01Scgd * been found.
1669f0aa214Scgd */
1679f0aa214Scgd switch (flags) {
1689f0aa214Scgd case R_CURSOR: /* Keyed scan. */
1699f0aa214Scgd /*
17024420c01Scgd * Find the first instance of the key or the smallest key
17124420c01Scgd * which is greater than or equal to the specified key.
1729f0aa214Scgd */
1739f0aa214Scgd if (key->data == NULL || key->size == 0) {
1749f0aa214Scgd errno = EINVAL;
1759f0aa214Scgd return (RET_ERROR);
1769f0aa214Scgd }
17724420c01Scgd return (__bt_first(t, key, ep, &exact));
1789f0aa214Scgd case R_FIRST: /* First record. */
1799f0aa214Scgd case R_NEXT:
18086147b1cSchristos case R_RNEXT:
18186147b1cSchristos BT_CLR(t);
1829f0aa214Scgd /* Walk down the left-hand side of the tree. */
1839f0aa214Scgd for (pg = P_ROOT;;) {
184*aae80e6bSchristos if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
1859f0aa214Scgd return (RET_ERROR);
1869f0aa214Scgd
18724420c01Scgd /* Check for an empty tree. */
1889f0aa214Scgd if (NEXTINDEX(h) == 0) {
1899f0aa214Scgd mpool_put(t->bt_mp, h, 0);
1909f0aa214Scgd return (RET_SPECIAL);
1919f0aa214Scgd }
1929f0aa214Scgd
19324420c01Scgd if (h->flags & (P_BLEAF | P_RLEAF))
19424420c01Scgd break;
19524420c01Scgd pg = GETBINTERNAL(h, 0)->pgno;
19686147b1cSchristos BT_PUSH(t, h->pgno, 0);
19724420c01Scgd mpool_put(t->bt_mp, h, 0);
19824420c01Scgd }
1999f0aa214Scgd ep->page = h;
2009f0aa214Scgd ep->index = 0;
2019f0aa214Scgd break;
2029f0aa214Scgd case R_LAST: /* Last record. */
2039f0aa214Scgd case R_PREV:
20486147b1cSchristos case R_RPREV:
20586147b1cSchristos BT_CLR(t);
2069f0aa214Scgd /* Walk down the right-hand side of the tree. */
2079f0aa214Scgd for (pg = P_ROOT;;) {
208*aae80e6bSchristos if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
2099f0aa214Scgd return (RET_ERROR);
21024420c01Scgd
21124420c01Scgd /* Check for an empty tree. */
21224420c01Scgd if (NEXTINDEX(h) == 0) {
21324420c01Scgd mpool_put(t->bt_mp, h, 0);
21424420c01Scgd return (RET_SPECIAL);
21524420c01Scgd }
21624420c01Scgd
2179f0aa214Scgd if (h->flags & (P_BLEAF | P_RLEAF))
2189f0aa214Scgd break;
2199f0aa214Scgd pg = GETBINTERNAL(h, NEXTINDEX(h) - 1)->pgno;
22086147b1cSchristos BT_PUSH(t, h->pgno, NEXTINDEX(h) - 1);
2219f0aa214Scgd mpool_put(t->bt_mp, h, 0);
2229f0aa214Scgd }
2239f0aa214Scgd
2249f0aa214Scgd ep->page = h;
2259f0aa214Scgd ep->index = NEXTINDEX(h) - 1;
2269f0aa214Scgd break;
2279f0aa214Scgd }
2289f0aa214Scgd return (RET_SUCCESS);
2299f0aa214Scgd }
2309f0aa214Scgd
2319f0aa214Scgd /*
23224420c01Scgd * __bt_seqadvance --
23324420c01Scgd * Advance the sequential scan.
2349f0aa214Scgd *
2359f0aa214Scgd * Parameters:
2369f0aa214Scgd * t: tree
23786147b1cSchristos * flags: R_NEXT, R_PREV, R_RNEXT, R_RPREV
2389f0aa214Scgd *
2399f0aa214Scgd * Side effects:
2409f0aa214Scgd * Pins the page the new key/data record is on.
2419f0aa214Scgd *
2429f0aa214Scgd * Returns:
2439f0aa214Scgd * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
2449f0aa214Scgd */
2459f0aa214Scgd static int
__bt_seqadv(BTREE * t,EPG * ep,int flags)246cb9daf8fSchristos __bt_seqadv(BTREE *t, EPG *ep, int flags)
2479f0aa214Scgd {
24824420c01Scgd CURSOR *c;
2499f0aa214Scgd PAGE *h;
250585dfd61Sthorpej indx_t idx = 0; /* pacify gcc */
2519f0aa214Scgd pgno_t pg;
25286147b1cSchristos int exact, rval;
2539f0aa214Scgd
25424420c01Scgd /*
25524420c01Scgd * There are a couple of states that we can be in. The cursor has
25624420c01Scgd * been initialized by the time we get here, but that's all we know.
25724420c01Scgd */
25824420c01Scgd c = &t->bt_cursor;
2599f0aa214Scgd
26024420c01Scgd /*
26186147b1cSchristos * The cursor was deleted and there weren't any duplicate records,
26286147b1cSchristos * so the cursor's key was saved. Find out where that key would
26386147b1cSchristos * be in the current tree. If the returned key is an exact match,
26486147b1cSchristos * it means that a key/data pair was inserted into the tree after
26586147b1cSchristos * the delete. We could reasonably return the key, but the problem
26686147b1cSchristos * is that this is the access pattern we'll see if the user is
26786147b1cSchristos * doing seq(..., R_NEXT)/put(..., 0) pairs, i.e. the put deletes
26886147b1cSchristos * the cursor record and then replaces it, so the cursor was saved,
26986147b1cSchristos * and we'll simply return the same "new" record until the user
27086147b1cSchristos * notices and doesn't do a put() of it. Since the key is an exact
27186147b1cSchristos * match, we could as easily put the new record before the cursor,
27286147b1cSchristos * and we've made no guarantee to return it. So, move forward or
27386147b1cSchristos * back a record if it's an exact match.
27486147b1cSchristos *
27586147b1cSchristos * XXX
27686147b1cSchristos * In the current implementation, put's to the cursor are done with
27786147b1cSchristos * delete/add pairs. This has two consequences. First, it means
27886147b1cSchristos * that seq(..., R_NEXT)/put(..., R_CURSOR) pairs are going to exhibit
27986147b1cSchristos * the same behavior as above. Second, you can return the same key
28086147b1cSchristos * twice if you have duplicate records. The scenario is that the
28186147b1cSchristos * cursor record is deleted, moving the cursor forward or backward
28286147b1cSchristos * to a duplicate. The add then inserts the new record at a location
28386147b1cSchristos * ahead of the cursor because duplicates aren't sorted in any way,
28486147b1cSchristos * and the new record is later returned. This has to be fixed at some
28586147b1cSchristos * point.
28624420c01Scgd */
28786147b1cSchristos if (F_ISSET(c, CURS_ACQUIRE)) {
28886147b1cSchristos if ((rval = __bt_first(t, &c->key, ep, &exact)) == RET_ERROR)
28986147b1cSchristos return RET_ERROR;
29086147b1cSchristos if (!exact)
29186147b1cSchristos return rval;
29286147b1cSchristos /*
29386147b1cSchristos * XXX
29486147b1cSchristos * Kluge -- get, release, get the page.
29586147b1cSchristos */
29686147b1cSchristos c->pg.pgno = ep->page->pgno;
29786147b1cSchristos c->pg.index = ep->index;
29886147b1cSchristos mpool_put(t->bt_mp, ep->page, 0);
29986147b1cSchristos }
30024420c01Scgd
30124420c01Scgd /* Get the page referenced by the cursor. */
302*aae80e6bSchristos if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL)
3039f0aa214Scgd return (RET_ERROR);
3049f0aa214Scgd
3059f0aa214Scgd /*
30624420c01Scgd * Find the next/previous record in the tree and point the cursor at
30724420c01Scgd * it. The cursor may not be moved until a new key has been found.
3089f0aa214Scgd */
3099f0aa214Scgd switch (flags) {
3109f0aa214Scgd case R_NEXT: /* Next record. */
31186147b1cSchristos case R_RNEXT:
31224420c01Scgd /*
31324420c01Scgd * The cursor was deleted in duplicate records, and moved
31424420c01Scgd * forward to a record that has yet to be returned. Clear
31524420c01Scgd * that flag, and return the record.
31624420c01Scgd */
31724420c01Scgd if (F_ISSET(c, CURS_AFTER))
31824420c01Scgd goto usecurrent;
319585dfd61Sthorpej idx = c->pg.index;
320585dfd61Sthorpej if (++idx == NEXTINDEX(h)) {
32186147b1cSchristos if (flags == R_RNEXT) {
32286147b1cSchristos ep->page = h;
32386147b1cSchristos ep->index = idx;
32486147b1cSchristos return __bt_rseq_next(t, ep);
32586147b1cSchristos }
3269f0aa214Scgd pg = h->nextpg;
3279f0aa214Scgd mpool_put(t->bt_mp, h, 0);
3289f0aa214Scgd if (pg == P_INVALID)
32986147b1cSchristos return RET_SPECIAL;
330*aae80e6bSchristos if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
33186147b1cSchristos return RET_ERROR;
332585dfd61Sthorpej idx = 0;
3339f0aa214Scgd }
3349f0aa214Scgd break;
3359f0aa214Scgd case R_PREV: /* Previous record. */
33686147b1cSchristos case R_RPREV:
33724420c01Scgd /*
33824420c01Scgd * The cursor was deleted in duplicate records, and moved
33924420c01Scgd * backward to a record that has yet to be returned. Clear
34024420c01Scgd * that flag, and return the record.
34124420c01Scgd */
34224420c01Scgd if (F_ISSET(c, CURS_BEFORE)) {
34324420c01Scgd usecurrent: F_CLR(c, CURS_AFTER | CURS_BEFORE);
34424420c01Scgd ep->page = h;
34524420c01Scgd ep->index = c->pg.index;
34624420c01Scgd return (RET_SUCCESS);
34724420c01Scgd }
348585dfd61Sthorpej idx = c->pg.index;
349585dfd61Sthorpej if (idx == 0) {
35086147b1cSchristos if (flags == R_RPREV) {
35186147b1cSchristos ep->page = h;
35286147b1cSchristos ep->index = idx;
35386147b1cSchristos return __bt_rseq_prev(t, ep);
35486147b1cSchristos }
3559f0aa214Scgd pg = h->prevpg;
3569f0aa214Scgd mpool_put(t->bt_mp, h, 0);
3579f0aa214Scgd if (pg == P_INVALID)
35886147b1cSchristos return RET_SPECIAL;
359*aae80e6bSchristos if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
36086147b1cSchristos return RET_ERROR;
361585dfd61Sthorpej idx = NEXTINDEX(h) - 1;
36224420c01Scgd } else
363585dfd61Sthorpej --idx;
3649f0aa214Scgd break;
3659f0aa214Scgd }
3669f0aa214Scgd
36724420c01Scgd ep->page = h;
368585dfd61Sthorpej ep->index = idx;
3699f0aa214Scgd return (RET_SUCCESS);
3709f0aa214Scgd }
37186147b1cSchristos /*
37286147b1cSchristos * Get the first item on the next page, but by going up and down the tree.
37386147b1cSchristos */
37486147b1cSchristos static int
__bt_rseq_next(BTREE * t,EPG * ep)37586147b1cSchristos __bt_rseq_next(BTREE *t, EPG *ep)
37686147b1cSchristos {
37786147b1cSchristos PAGE *h;
37886147b1cSchristos indx_t idx;
37986147b1cSchristos EPGNO *up;
38086147b1cSchristos pgno_t pg;
38186147b1cSchristos
38286147b1cSchristos h = ep->page;
38386147b1cSchristos idx = ep->index;
38486147b1cSchristos do {
38586147b1cSchristos /* Move up the tree. */
38686147b1cSchristos up = BT_POP(t);
38786147b1cSchristos mpool_put(t->bt_mp, h, 0);
38886147b1cSchristos /* Did we hit the right edge of the root? */
38986147b1cSchristos if (up == NULL)
39086147b1cSchristos return RET_SPECIAL;
391*aae80e6bSchristos if ((h = mpool_get(t->bt_mp, up->pgno, 0)) == NULL)
39286147b1cSchristos return RET_ERROR;
39386147b1cSchristos idx = up->index;
39486147b1cSchristos } while (++idx == NEXTINDEX(h));
39586147b1cSchristos
39686147b1cSchristos while (!(h->flags & (P_BLEAF | P_RLEAF))) {
39786147b1cSchristos /* Move back down the tree. */
39886147b1cSchristos BT_PUSH(t, h->pgno, idx);
39986147b1cSchristos pg = GETBINTERNAL(h, idx)->pgno;
40086147b1cSchristos mpool_put(t->bt_mp, h, 0);
401*aae80e6bSchristos if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
40286147b1cSchristos return RET_ERROR;
40386147b1cSchristos idx = 0;
40486147b1cSchristos }
40586147b1cSchristos ep->page = h;
40686147b1cSchristos ep->index = idx;
40786147b1cSchristos return RET_SUCCESS;
40886147b1cSchristos }
40986147b1cSchristos
41086147b1cSchristos /*
41186147b1cSchristos * Get the last item on the previous page, but by going up and down the tree.
41286147b1cSchristos */
41386147b1cSchristos static int
__bt_rseq_prev(BTREE * t,EPG * ep)41486147b1cSchristos __bt_rseq_prev(BTREE *t, EPG *ep)
41586147b1cSchristos {
41686147b1cSchristos PAGE *h;
41786147b1cSchristos indx_t idx;
41886147b1cSchristos EPGNO *up;
41986147b1cSchristos pgno_t pg;
42086147b1cSchristos
42186147b1cSchristos h = ep->page;
42286147b1cSchristos idx = ep->index;
42386147b1cSchristos do {
42486147b1cSchristos /* Move up the tree. */
42586147b1cSchristos up = BT_POP(t);
42686147b1cSchristos mpool_put(t->bt_mp, h, 0);
42786147b1cSchristos /* Did we hit the left edge of the root? */
42886147b1cSchristos if (up == NULL)
42986147b1cSchristos return RET_SPECIAL;
430*aae80e6bSchristos if ((h = mpool_get(t->bt_mp, up->pgno, 0)) == NULL)
43186147b1cSchristos return RET_ERROR;
43286147b1cSchristos idx = up->index;
43386147b1cSchristos } while (idx == 0);
43486147b1cSchristos --idx;
43586147b1cSchristos while (!(h->flags & (P_BLEAF | P_RLEAF))) {
43686147b1cSchristos /* Move back down the tree. */
43786147b1cSchristos BT_PUSH(t, h->pgno, idx);
43886147b1cSchristos pg = GETBINTERNAL(h, idx)->pgno;
43986147b1cSchristos mpool_put(t->bt_mp, h, 0);
440*aae80e6bSchristos if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
44186147b1cSchristos return RET_ERROR;
44286147b1cSchristos idx = NEXTINDEX(h) - 1;
44386147b1cSchristos }
44486147b1cSchristos ep->page = h;
44586147b1cSchristos ep->index = idx;
44686147b1cSchristos return RET_SUCCESS;
44786147b1cSchristos }
4489f0aa214Scgd
4499f0aa214Scgd /*
45024420c01Scgd * __bt_first --
45124420c01Scgd * Find the first entry.
4529f0aa214Scgd *
4539f0aa214Scgd * Parameters:
45424420c01Scgd * t: the tree
45524420c01Scgd * key: the key
45624420c01Scgd * erval: return EPG
45724420c01Scgd * exactp: pointer to exact match flag
4589f0aa214Scgd *
4599f0aa214Scgd * Returns:
46024420c01Scgd * The first entry in the tree greater than or equal to key,
46124420c01Scgd * or RET_SPECIAL if no such key exists.
4629f0aa214Scgd */
46324420c01Scgd static int
__bt_first(BTREE * t,const DBT * key,EPG * erval,int * exactp)464cb9daf8fSchristos __bt_first(BTREE *t, const DBT *key, EPG *erval, int *exactp)
4659f0aa214Scgd {
46686147b1cSchristos PAGE *h, *hprev;
46724420c01Scgd EPG *ep, save;
46824420c01Scgd pgno_t pg;
4699f0aa214Scgd
47024420c01Scgd /*
47124420c01Scgd * Find any matching record; __bt_search pins the page.
47224420c01Scgd *
47324420c01Scgd * If it's an exact match and duplicates are possible, walk backwards
47424420c01Scgd * in the tree until we find the first one. Otherwise, make sure it's
47524420c01Scgd * a valid key (__bt_search may return an index just past the end of a
47624420c01Scgd * page) and return it.
47724420c01Scgd */
47824420c01Scgd if ((ep = __bt_search(t, key, exactp)) == NULL)
47986147b1cSchristos return RET_SPECIAL;
48024420c01Scgd if (*exactp) {
48124420c01Scgd if (F_ISSET(t, B_NODUPS)) {
48224420c01Scgd *erval = *ep;
48324420c01Scgd return (RET_SUCCESS);
48424420c01Scgd }
48524420c01Scgd
48624420c01Scgd /*
48724420c01Scgd * Walk backwards, as long as the entry matches and there are
48824420c01Scgd * keys left in the tree. Save a copy of each match in case
48924420c01Scgd * we go too far.
49024420c01Scgd */
49124420c01Scgd save = *ep;
49224420c01Scgd h = ep->page;
49324420c01Scgd do {
49424420c01Scgd if (save.page->pgno != ep->page->pgno) {
49524420c01Scgd mpool_put(t->bt_mp, save.page, 0);
49624420c01Scgd save = *ep;
49724420c01Scgd } else
49824420c01Scgd save.index = ep->index;
49924420c01Scgd
50024420c01Scgd /*
50124420c01Scgd * Don't unpin the page the last (or original) match
50224420c01Scgd * was on, but make sure it's unpinned if an error
50324420c01Scgd * occurs.
50424420c01Scgd */
50524420c01Scgd if (ep->index == 0) {
50624420c01Scgd if (h->prevpg == P_INVALID)
50724420c01Scgd break;
50824420c01Scgd if (h->pgno != save.page->pgno)
50924420c01Scgd mpool_put(t->bt_mp, h, 0);
510*aae80e6bSchristos if ((hprev = mpool_get(t->bt_mp,
51186147b1cSchristos h->prevpg, 0)) == NULL) {
51286147b1cSchristos if (h->pgno == save.page->pgno)
51386147b1cSchristos mpool_put(t->bt_mp,
51486147b1cSchristos save.page, 0);
51586147b1cSchristos return RET_ERROR;
51686147b1cSchristos }
51786147b1cSchristos ep->page = h = hprev;
51824420c01Scgd ep->index = NEXTINDEX(h);
51924420c01Scgd }
52024420c01Scgd --ep->index;
52124420c01Scgd } while (__bt_cmp(t, key, ep) == 0);
52224420c01Scgd
52324420c01Scgd /*
52424420c01Scgd * Reach here with the last page that was looked at pinned,
52524420c01Scgd * which may or may not be the same as the last (or original)
52624420c01Scgd * match page. If it's not useful, release it.
52724420c01Scgd */
52824420c01Scgd if (h->pgno != save.page->pgno)
52924420c01Scgd mpool_put(t->bt_mp, h, 0);
53024420c01Scgd
53124420c01Scgd *erval = save;
53224420c01Scgd return (RET_SUCCESS);
53324420c01Scgd }
53424420c01Scgd
53524420c01Scgd /* If at the end of a page, find the next entry. */
53624420c01Scgd if (ep->index == NEXTINDEX(ep->page)) {
53724420c01Scgd h = ep->page;
53824420c01Scgd pg = h->nextpg;
53924420c01Scgd mpool_put(t->bt_mp, h, 0);
54024420c01Scgd if (pg == P_INVALID)
54124420c01Scgd return (RET_SPECIAL);
542*aae80e6bSchristos if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
54324420c01Scgd return (RET_ERROR);
54424420c01Scgd ep->index = 0;
54524420c01Scgd ep->page = h;
54624420c01Scgd }
54724420c01Scgd *erval = *ep;
54824420c01Scgd return (RET_SUCCESS);
54924420c01Scgd }
55024420c01Scgd
55124420c01Scgd /*
55224420c01Scgd * __bt_setcur --
55324420c01Scgd * Set the cursor to an entry in the tree.
55424420c01Scgd *
55524420c01Scgd * Parameters:
55624420c01Scgd * t: the tree
55724420c01Scgd * pgno: page number
558585dfd61Sthorpej * idx: page index
55924420c01Scgd */
56024420c01Scgd void
__bt_setcur(BTREE * t,pgno_t pgno,u_int idx)561cb9daf8fSchristos __bt_setcur(BTREE *t, pgno_t pgno, u_int idx)
56224420c01Scgd {
56324420c01Scgd /* Lose any already deleted key. */
56424420c01Scgd if (t->bt_cursor.key.data != NULL) {
56524420c01Scgd free(t->bt_cursor.key.data);
56624420c01Scgd t->bt_cursor.key.size = 0;
56724420c01Scgd t->bt_cursor.key.data = NULL;
56824420c01Scgd }
56924420c01Scgd F_CLR(&t->bt_cursor, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE);
57024420c01Scgd
57124420c01Scgd /* Update the cursor. */
57224420c01Scgd t->bt_cursor.pg.pgno = pgno;
573585dfd61Sthorpej t->bt_cursor.pg.index = idx;
57424420c01Scgd F_SET(&t->bt_cursor, CURS_INIT);
5759f0aa214Scgd }
576