xref: /netbsd-src/lib/libc/db/btree/bt_seq.c (revision aae80e6be7bf8e6ad43f8b63d296d2d3923c4ee5)
1*aae80e6bSchristos /*	$NetBSD: bt_seq.c,v 1.20 2016/09/24 21:31:25 christos Exp $	*/
2a954b078Scgd 
39f0aa214Scgd /*-
424420c01Scgd  * Copyright (c) 1990, 1993, 1994
59f0aa214Scgd  *	The Regents of the University of California.  All rights reserved.
69f0aa214Scgd  *
79f0aa214Scgd  * This code is derived from software contributed to Berkeley by
89f0aa214Scgd  * Mike Olson.
99f0aa214Scgd  *
109f0aa214Scgd  * Redistribution and use in source and binary forms, with or without
119f0aa214Scgd  * modification, are permitted provided that the following conditions
129f0aa214Scgd  * are met:
139f0aa214Scgd  * 1. Redistributions of source code must retain the above copyright
149f0aa214Scgd  *    notice, this list of conditions and the following disclaimer.
159f0aa214Scgd  * 2. Redistributions in binary form must reproduce the above copyright
169f0aa214Scgd  *    notice, this list of conditions and the following disclaimer in the
179f0aa214Scgd  *    documentation and/or other materials provided with the distribution.
18eb7c1594Sagc  * 3. Neither the name of the University nor the names of its contributors
199f0aa214Scgd  *    may be used to endorse or promote products derived from this software
209f0aa214Scgd  *    without specific prior written permission.
219f0aa214Scgd  *
229f0aa214Scgd  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
239f0aa214Scgd  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
249f0aa214Scgd  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
259f0aa214Scgd  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
269f0aa214Scgd  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
279f0aa214Scgd  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
289f0aa214Scgd  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
299f0aa214Scgd  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
309f0aa214Scgd  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
319f0aa214Scgd  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
329f0aa214Scgd  * SUCH DAMAGE.
339f0aa214Scgd  */
349f0aa214Scgd 
35d3595ddfSjoerg #if HAVE_NBTOOL_CONFIG_H
36d3595ddfSjoerg #include "nbtool_config.h"
37d3595ddfSjoerg #endif
38d3595ddfSjoerg 
3900ae392dSchristos #include <sys/cdefs.h>
40*aae80e6bSchristos __RCSID("$NetBSD: bt_seq.c,v 1.20 2016/09/24 21:31:25 christos Exp $");
419f0aa214Scgd 
4243fa6fe3Sjtc #include "namespace.h"
439f0aa214Scgd #include <sys/types.h>
449f0aa214Scgd 
45cb9daf8fSchristos #include <assert.h>
469f0aa214Scgd #include <errno.h>
479f0aa214Scgd #include <stddef.h>
489f0aa214Scgd #include <stdio.h>
499f0aa214Scgd #include <stdlib.h>
509f0aa214Scgd 
519f0aa214Scgd #include <db.h>
529f0aa214Scgd #include "btree.h"
539f0aa214Scgd 
54cb9daf8fSchristos static int __bt_first(BTREE *, const DBT *, EPG *, int *);
55cb9daf8fSchristos static int __bt_seqadv(BTREE *, EPG *, int);
56cb9daf8fSchristos static int __bt_seqset(BTREE *, EPG *, DBT *, int);
5786147b1cSchristos static int __bt_rseq_next(BTREE *, EPG *);
5886147b1cSchristos static int __bt_rseq_prev(BTREE *, EPG *);
599f0aa214Scgd 
609f0aa214Scgd /*
619f0aa214Scgd  * Sequential scan support.
629f0aa214Scgd  *
6324420c01Scgd  * The tree can be scanned sequentially, starting from either end of the
6424420c01Scgd  * tree or from any specific key.  A scan request before any scanning is
6524420c01Scgd  * done is initialized as starting from the least node.
669f0aa214Scgd  */
679f0aa214Scgd 
689f0aa214Scgd /*
6924420c01Scgd  * __bt_seq --
7024420c01Scgd  *	Btree sequential scan interface.
719f0aa214Scgd  *
729f0aa214Scgd  * Parameters:
739f0aa214Scgd  *	dbp:	pointer to access method
749f0aa214Scgd  *	key:	key for positioning and return value
759f0aa214Scgd  *	data:	data return value
7686147b1cSchristos  *	flags:	R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV, R_RNEXT, R_RPREV.
779f0aa214Scgd  *
789f0aa214Scgd  * Returns:
799f0aa214Scgd  *	RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
809f0aa214Scgd  */
819f0aa214Scgd int
__bt_seq(const DB * dbp,DBT * key,DBT * data,u_int flags)82cb9daf8fSchristos __bt_seq(const DB *dbp, DBT *key, DBT *data, u_int flags)
839f0aa214Scgd {
849f0aa214Scgd 	BTREE *t;
859f0aa214Scgd 	EPG e;
869f0aa214Scgd 	int status;
879f0aa214Scgd 
8845e27c80Scgd 	t = dbp->internal;
8945e27c80Scgd 
9045e27c80Scgd 	/* Toss any page pinned across calls. */
9145e27c80Scgd 	if (t->bt_pinned != NULL) {
9245e27c80Scgd 		mpool_put(t->bt_mp, t->bt_pinned, 0);
9345e27c80Scgd 		t->bt_pinned = NULL;
9445e27c80Scgd 	}
9545e27c80Scgd 
969f0aa214Scgd 	/*
9727e03817Sryoon 	 * If scan uninitialized as yet, or starting at a specific record, set
9824420c01Scgd 	 * the scan to a specific key.  Both __bt_seqset and __bt_seqadv pin
9924420c01Scgd 	 * the page the cursor references if they're successful.
1009f0aa214Scgd 	 */
1019f0aa214Scgd 	switch (flags) {
1029f0aa214Scgd 	case R_NEXT:
1039f0aa214Scgd 	case R_PREV:
10486147b1cSchristos 	case R_RNEXT:
10586147b1cSchristos 	case R_RPREV:
10624420c01Scgd 		if (F_ISSET(&t->bt_cursor, CURS_INIT)) {
10761238e71Schristos 			status = __bt_seqadv(t, &e, (int)flags);
1089f0aa214Scgd 			break;
1099f0aa214Scgd 		}
1109f0aa214Scgd 		/* FALLTHROUGH */
1119f0aa214Scgd 	case R_FIRST:
1129f0aa214Scgd 	case R_LAST:
11324420c01Scgd 	case R_CURSOR:
11461238e71Schristos 		status = __bt_seqset(t, &e, key, (int)flags);
1159f0aa214Scgd 		break;
1169f0aa214Scgd 	default:
1179f0aa214Scgd 		errno = EINVAL;
1189f0aa214Scgd 		return (RET_ERROR);
1199f0aa214Scgd 	}
1209f0aa214Scgd 
1219f0aa214Scgd 	if (status == RET_SUCCESS) {
12261238e71Schristos 		__bt_setcur(t, e.page->pgno, (u_int)e.index);
1239f0aa214Scgd 
12424420c01Scgd 		status =
12524420c01Scgd 		    __bt_ret(t, &e, key, &t->bt_rkey, data, &t->bt_rdata, 0);
12645e27c80Scgd 
12745e27c80Scgd 		/*
12845e27c80Scgd 		 * If the user is doing concurrent access, we copied the
12945e27c80Scgd 		 * key/data, toss the page.
13045e27c80Scgd 		 */
13124420c01Scgd 		if (F_ISSET(t, B_DB_LOCK))
1329f0aa214Scgd 			mpool_put(t->bt_mp, e.page, 0);
13345e27c80Scgd 		else
13445e27c80Scgd 			t->bt_pinned = e.page;
1359f0aa214Scgd 	}
1369f0aa214Scgd 	return (status);
1379f0aa214Scgd }
1389f0aa214Scgd 
1399f0aa214Scgd /*
14024420c01Scgd  * __bt_seqset --
14124420c01Scgd  *	Set the sequential scan to a specific key.
1429f0aa214Scgd  *
1439f0aa214Scgd  * Parameters:
1449f0aa214Scgd  *	t:	tree
1459f0aa214Scgd  *	ep:	storage for returned key
1469f0aa214Scgd  *	key:	key for initial scan position
14786147b1cSchristos  *	flags:	R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV, R_RNEXT, R_RPREV.
1489f0aa214Scgd  *
1499f0aa214Scgd  * Side effects:
1509f0aa214Scgd  *	Pins the page the cursor references.
1519f0aa214Scgd  *
1529f0aa214Scgd  * Returns:
1539f0aa214Scgd  *	RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
1549f0aa214Scgd  */
1559f0aa214Scgd static int
__bt_seqset(BTREE * t,EPG * ep,DBT * key,int flags)156cb9daf8fSchristos __bt_seqset(BTREE *t, EPG *ep, DBT *key, int flags)
1579f0aa214Scgd {
1589f0aa214Scgd 	PAGE *h;
1599f0aa214Scgd 	pgno_t pg;
1609f0aa214Scgd 	int exact;
1619f0aa214Scgd 
1629f0aa214Scgd 	/*
16324420c01Scgd 	 * Find the first, last or specific key in the tree and point the
16424420c01Scgd 	 * cursor at it.  The cursor may not be moved until a new key has
16524420c01Scgd 	 * been found.
1669f0aa214Scgd 	 */
1679f0aa214Scgd 	switch (flags) {
1689f0aa214Scgd 	case R_CURSOR:				/* Keyed scan. */
1699f0aa214Scgd 		/*
17024420c01Scgd 		 * Find the first instance of the key or the smallest key
17124420c01Scgd 		 * which is greater than or equal to the specified key.
1729f0aa214Scgd 		 */
1739f0aa214Scgd 		if (key->data == NULL || key->size == 0) {
1749f0aa214Scgd 			errno = EINVAL;
1759f0aa214Scgd 			return (RET_ERROR);
1769f0aa214Scgd 		}
17724420c01Scgd 		return (__bt_first(t, key, ep, &exact));
1789f0aa214Scgd 	case R_FIRST:				/* First record. */
1799f0aa214Scgd 	case R_NEXT:
18086147b1cSchristos 	case R_RNEXT:
18186147b1cSchristos 		BT_CLR(t);
1829f0aa214Scgd 		/* Walk down the left-hand side of the tree. */
1839f0aa214Scgd 		for (pg = P_ROOT;;) {
184*aae80e6bSchristos 			if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
1859f0aa214Scgd 				return (RET_ERROR);
1869f0aa214Scgd 
18724420c01Scgd 			/* Check for an empty tree. */
1889f0aa214Scgd 			if (NEXTINDEX(h) == 0) {
1899f0aa214Scgd 				mpool_put(t->bt_mp, h, 0);
1909f0aa214Scgd 				return (RET_SPECIAL);
1919f0aa214Scgd 			}
1929f0aa214Scgd 
19324420c01Scgd 			if (h->flags & (P_BLEAF | P_RLEAF))
19424420c01Scgd 				break;
19524420c01Scgd 			pg = GETBINTERNAL(h, 0)->pgno;
19686147b1cSchristos 			BT_PUSH(t, h->pgno, 0);
19724420c01Scgd 			mpool_put(t->bt_mp, h, 0);
19824420c01Scgd 		}
1999f0aa214Scgd 		ep->page = h;
2009f0aa214Scgd 		ep->index = 0;
2019f0aa214Scgd 		break;
2029f0aa214Scgd 	case R_LAST:				/* Last record. */
2039f0aa214Scgd 	case R_PREV:
20486147b1cSchristos 	case R_RPREV:
20586147b1cSchristos 		BT_CLR(t);
2069f0aa214Scgd 		/* Walk down the right-hand side of the tree. */
2079f0aa214Scgd 		for (pg = P_ROOT;;) {
208*aae80e6bSchristos 			if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
2099f0aa214Scgd 				return (RET_ERROR);
21024420c01Scgd 
21124420c01Scgd 			/* Check for an empty tree. */
21224420c01Scgd 			if (NEXTINDEX(h) == 0) {
21324420c01Scgd 				mpool_put(t->bt_mp, h, 0);
21424420c01Scgd 				return (RET_SPECIAL);
21524420c01Scgd 			}
21624420c01Scgd 
2179f0aa214Scgd 			if (h->flags & (P_BLEAF | P_RLEAF))
2189f0aa214Scgd 				break;
2199f0aa214Scgd 			pg = GETBINTERNAL(h, NEXTINDEX(h) - 1)->pgno;
22086147b1cSchristos 			BT_PUSH(t, h->pgno, NEXTINDEX(h) - 1);
2219f0aa214Scgd 			mpool_put(t->bt_mp, h, 0);
2229f0aa214Scgd 		}
2239f0aa214Scgd 
2249f0aa214Scgd 		ep->page = h;
2259f0aa214Scgd 		ep->index = NEXTINDEX(h) - 1;
2269f0aa214Scgd 		break;
2279f0aa214Scgd 	}
2289f0aa214Scgd 	return (RET_SUCCESS);
2299f0aa214Scgd }
2309f0aa214Scgd 
2319f0aa214Scgd /*
23224420c01Scgd  * __bt_seqadvance --
23324420c01Scgd  *	Advance the sequential scan.
2349f0aa214Scgd  *
2359f0aa214Scgd  * Parameters:
2369f0aa214Scgd  *	t:	tree
23786147b1cSchristos  *	flags:	R_NEXT, R_PREV, R_RNEXT, R_RPREV
2389f0aa214Scgd  *
2399f0aa214Scgd  * Side effects:
2409f0aa214Scgd  *	Pins the page the new key/data record is on.
2419f0aa214Scgd  *
2429f0aa214Scgd  * Returns:
2439f0aa214Scgd  *	RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
2449f0aa214Scgd  */
2459f0aa214Scgd static int
__bt_seqadv(BTREE * t,EPG * ep,int flags)246cb9daf8fSchristos __bt_seqadv(BTREE *t, EPG *ep, int flags)
2479f0aa214Scgd {
24824420c01Scgd 	CURSOR *c;
2499f0aa214Scgd 	PAGE *h;
250585dfd61Sthorpej 	indx_t idx = 0;	/* pacify gcc */
2519f0aa214Scgd 	pgno_t pg;
25286147b1cSchristos 	int exact, rval;
2539f0aa214Scgd 
25424420c01Scgd 	/*
25524420c01Scgd 	 * There are a couple of states that we can be in.  The cursor has
25624420c01Scgd 	 * been initialized by the time we get here, but that's all we know.
25724420c01Scgd 	 */
25824420c01Scgd 	c = &t->bt_cursor;
2599f0aa214Scgd 
26024420c01Scgd 	/*
26186147b1cSchristos 	 * The cursor was deleted and there weren't any duplicate records,
26286147b1cSchristos 	 * so the cursor's key was saved.  Find out where that key would
26386147b1cSchristos 	 * be in the current tree.  If the returned key is an exact match,
26486147b1cSchristos 	 * it means that a key/data pair was inserted into the tree after
26586147b1cSchristos 	 * the delete.  We could reasonably return the key, but the problem
26686147b1cSchristos 	 * is that this is the access pattern we'll see if the user is
26786147b1cSchristos 	 * doing seq(..., R_NEXT)/put(..., 0) pairs, i.e. the put deletes
26886147b1cSchristos 	 * the cursor record and then replaces it, so the cursor was saved,
26986147b1cSchristos 	 * and we'll simply return the same "new" record until the user
27086147b1cSchristos 	 * notices and doesn't do a put() of it.  Since the key is an exact
27186147b1cSchristos 	 * match, we could as easily put the new record before the cursor,
27286147b1cSchristos 	 * and we've made no guarantee to return it.  So, move forward or
27386147b1cSchristos 	 * back a record if it's an exact match.
27486147b1cSchristos 	 *
27586147b1cSchristos 	 * XXX
27686147b1cSchristos 	 * In the current implementation, put's to the cursor are done with
27786147b1cSchristos 	 * delete/add pairs.  This has two consequences.  First, it means
27886147b1cSchristos 	 * that seq(..., R_NEXT)/put(..., R_CURSOR) pairs are going to exhibit
27986147b1cSchristos 	 * the same behavior as above.  Second, you can return the same key
28086147b1cSchristos 	 * twice if you have duplicate records.  The scenario is that the
28186147b1cSchristos 	 * cursor record is deleted, moving the cursor forward or backward
28286147b1cSchristos 	 * to a duplicate.  The add then inserts the new record at a location
28386147b1cSchristos 	 * ahead of the cursor because duplicates aren't sorted in any way,
28486147b1cSchristos 	 * and the new record is later returned.  This has to be fixed at some
28586147b1cSchristos 	 * point.
28624420c01Scgd 	 */
28786147b1cSchristos 	if (F_ISSET(c, CURS_ACQUIRE)) {
28886147b1cSchristos 		if ((rval = __bt_first(t, &c->key, ep, &exact)) == RET_ERROR)
28986147b1cSchristos 			return RET_ERROR;
29086147b1cSchristos 		if (!exact)
29186147b1cSchristos 			return rval;
29286147b1cSchristos 		/*
29386147b1cSchristos 		 * XXX
29486147b1cSchristos 		 * Kluge -- get, release, get the page.
29586147b1cSchristos 		 */
29686147b1cSchristos 		c->pg.pgno = ep->page->pgno;
29786147b1cSchristos 		c->pg.index = ep->index;
29886147b1cSchristos 		mpool_put(t->bt_mp, ep->page, 0);
29986147b1cSchristos 	}
30024420c01Scgd 
30124420c01Scgd 	/* Get the page referenced by the cursor. */
302*aae80e6bSchristos 	if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL)
3039f0aa214Scgd 		return (RET_ERROR);
3049f0aa214Scgd 
3059f0aa214Scgd 	/*
30624420c01Scgd  	 * Find the next/previous record in the tree and point the cursor at
30724420c01Scgd 	 * it.  The cursor may not be moved until a new key has been found.
3089f0aa214Scgd 	 */
3099f0aa214Scgd 	switch (flags) {
3109f0aa214Scgd 	case R_NEXT:			/* Next record. */
31186147b1cSchristos 	case R_RNEXT:
31224420c01Scgd 		/*
31324420c01Scgd 		 * The cursor was deleted in duplicate records, and moved
31424420c01Scgd 		 * forward to a record that has yet to be returned.  Clear
31524420c01Scgd 		 * that flag, and return the record.
31624420c01Scgd 		 */
31724420c01Scgd 		if (F_ISSET(c, CURS_AFTER))
31824420c01Scgd 			goto usecurrent;
319585dfd61Sthorpej 		idx = c->pg.index;
320585dfd61Sthorpej 		if (++idx == NEXTINDEX(h)) {
32186147b1cSchristos 			if (flags == R_RNEXT) {
32286147b1cSchristos 				ep->page = h;
32386147b1cSchristos 				ep->index = idx;
32486147b1cSchristos 				return __bt_rseq_next(t, ep);
32586147b1cSchristos 			}
3269f0aa214Scgd 			pg = h->nextpg;
3279f0aa214Scgd 			mpool_put(t->bt_mp, h, 0);
3289f0aa214Scgd 			if (pg == P_INVALID)
32986147b1cSchristos 				return RET_SPECIAL;
330*aae80e6bSchristos 			if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
33186147b1cSchristos 				return RET_ERROR;
332585dfd61Sthorpej 			idx = 0;
3339f0aa214Scgd 		}
3349f0aa214Scgd 		break;
3359f0aa214Scgd 	case R_PREV:			/* Previous record. */
33686147b1cSchristos 	case R_RPREV:
33724420c01Scgd 		/*
33824420c01Scgd 		 * The cursor was deleted in duplicate records, and moved
33924420c01Scgd 		 * backward to a record that has yet to be returned.  Clear
34024420c01Scgd 		 * that flag, and return the record.
34124420c01Scgd 		 */
34224420c01Scgd 		if (F_ISSET(c, CURS_BEFORE)) {
34324420c01Scgd usecurrent:		F_CLR(c, CURS_AFTER | CURS_BEFORE);
34424420c01Scgd 			ep->page = h;
34524420c01Scgd 			ep->index = c->pg.index;
34624420c01Scgd 			return (RET_SUCCESS);
34724420c01Scgd 		}
348585dfd61Sthorpej 		idx = c->pg.index;
349585dfd61Sthorpej 		if (idx == 0) {
35086147b1cSchristos 			if (flags == R_RPREV) {
35186147b1cSchristos 				ep->page = h;
35286147b1cSchristos 				ep->index = idx;
35386147b1cSchristos 				return __bt_rseq_prev(t, ep);
35486147b1cSchristos 			}
3559f0aa214Scgd 			pg = h->prevpg;
3569f0aa214Scgd 			mpool_put(t->bt_mp, h, 0);
3579f0aa214Scgd 			if (pg == P_INVALID)
35886147b1cSchristos 				return RET_SPECIAL;
359*aae80e6bSchristos 			if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
36086147b1cSchristos 				return RET_ERROR;
361585dfd61Sthorpej 			idx = NEXTINDEX(h) - 1;
36224420c01Scgd 		} else
363585dfd61Sthorpej 			--idx;
3649f0aa214Scgd 		break;
3659f0aa214Scgd 	}
3669f0aa214Scgd 
36724420c01Scgd 	ep->page = h;
368585dfd61Sthorpej 	ep->index = idx;
3699f0aa214Scgd 	return (RET_SUCCESS);
3709f0aa214Scgd }
37186147b1cSchristos /*
37286147b1cSchristos  * Get the first item on the next page, but by going up and down the tree.
37386147b1cSchristos  */
37486147b1cSchristos static int
__bt_rseq_next(BTREE * t,EPG * ep)37586147b1cSchristos __bt_rseq_next(BTREE *t, EPG *ep)
37686147b1cSchristos {
37786147b1cSchristos 	PAGE *h;
37886147b1cSchristos 	indx_t idx;
37986147b1cSchristos 	EPGNO *up;
38086147b1cSchristos 	pgno_t pg;
38186147b1cSchristos 
38286147b1cSchristos 	h = ep->page;
38386147b1cSchristos 	idx = ep->index;
38486147b1cSchristos 	do {
38586147b1cSchristos 		/* Move up the tree. */
38686147b1cSchristos 		up = BT_POP(t);
38786147b1cSchristos 		mpool_put(t->bt_mp, h, 0);
38886147b1cSchristos 		/* Did we hit the right edge of the root? */
38986147b1cSchristos 		if (up == NULL)
39086147b1cSchristos 			return RET_SPECIAL;
391*aae80e6bSchristos 		if ((h = mpool_get(t->bt_mp, up->pgno, 0)) == NULL)
39286147b1cSchristos 			return RET_ERROR;
39386147b1cSchristos 		idx = up->index;
39486147b1cSchristos 	} while (++idx == NEXTINDEX(h));
39586147b1cSchristos 
39686147b1cSchristos 	while (!(h->flags & (P_BLEAF | P_RLEAF))) {
39786147b1cSchristos 		/* Move back down the tree. */
39886147b1cSchristos 		BT_PUSH(t, h->pgno, idx);
39986147b1cSchristos 		pg = GETBINTERNAL(h, idx)->pgno;
40086147b1cSchristos 		mpool_put(t->bt_mp, h, 0);
401*aae80e6bSchristos 		if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
40286147b1cSchristos 			return RET_ERROR;
40386147b1cSchristos 		idx = 0;
40486147b1cSchristos 	}
40586147b1cSchristos 	ep->page = h;
40686147b1cSchristos 	ep->index = idx;
40786147b1cSchristos 	return RET_SUCCESS;
40886147b1cSchristos }
40986147b1cSchristos 
41086147b1cSchristos /*
41186147b1cSchristos  * Get the last item on the previous page, but by going up and down the tree.
41286147b1cSchristos  */
41386147b1cSchristos static int
__bt_rseq_prev(BTREE * t,EPG * ep)41486147b1cSchristos __bt_rseq_prev(BTREE *t, EPG *ep)
41586147b1cSchristos {
41686147b1cSchristos 	PAGE *h;
41786147b1cSchristos 	indx_t idx;
41886147b1cSchristos 	EPGNO *up;
41986147b1cSchristos 	pgno_t pg;
42086147b1cSchristos 
42186147b1cSchristos 	h = ep->page;
42286147b1cSchristos 	idx = ep->index;
42386147b1cSchristos 	do {
42486147b1cSchristos 		/* Move up the tree. */
42586147b1cSchristos 		up = BT_POP(t);
42686147b1cSchristos 		mpool_put(t->bt_mp, h, 0);
42786147b1cSchristos 		/* Did we hit the left edge of the root? */
42886147b1cSchristos 		if (up == NULL)
42986147b1cSchristos 			return RET_SPECIAL;
430*aae80e6bSchristos 		if ((h = mpool_get(t->bt_mp, up->pgno, 0)) == NULL)
43186147b1cSchristos 			return RET_ERROR;
43286147b1cSchristos 		idx = up->index;
43386147b1cSchristos 	} while (idx == 0);
43486147b1cSchristos 	--idx;
43586147b1cSchristos 	while (!(h->flags & (P_BLEAF | P_RLEAF))) {
43686147b1cSchristos 		/* Move back down the tree. */
43786147b1cSchristos 		BT_PUSH(t, h->pgno, idx);
43886147b1cSchristos 		pg = GETBINTERNAL(h, idx)->pgno;
43986147b1cSchristos 		mpool_put(t->bt_mp, h, 0);
440*aae80e6bSchristos 		if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
44186147b1cSchristos 			return RET_ERROR;
44286147b1cSchristos 		idx = NEXTINDEX(h) - 1;
44386147b1cSchristos 	}
44486147b1cSchristos 	ep->page = h;
44586147b1cSchristos 	ep->index = idx;
44686147b1cSchristos 	return RET_SUCCESS;
44786147b1cSchristos }
4489f0aa214Scgd 
4499f0aa214Scgd /*
45024420c01Scgd  * __bt_first --
45124420c01Scgd  *	Find the first entry.
4529f0aa214Scgd  *
4539f0aa214Scgd  * Parameters:
45424420c01Scgd  *	t:	the tree
45524420c01Scgd  *    key:	the key
45624420c01Scgd  *  erval:	return EPG
45724420c01Scgd  * exactp:	pointer to exact match flag
4589f0aa214Scgd  *
4599f0aa214Scgd  * Returns:
46024420c01Scgd  *	The first entry in the tree greater than or equal to key,
46124420c01Scgd  *	or RET_SPECIAL if no such key exists.
4629f0aa214Scgd  */
46324420c01Scgd static int
__bt_first(BTREE * t,const DBT * key,EPG * erval,int * exactp)464cb9daf8fSchristos __bt_first(BTREE *t, const DBT *key, EPG *erval, int *exactp)
4659f0aa214Scgd {
46686147b1cSchristos 	PAGE *h, *hprev;
46724420c01Scgd 	EPG *ep, save;
46824420c01Scgd 	pgno_t pg;
4699f0aa214Scgd 
47024420c01Scgd 	/*
47124420c01Scgd 	 * Find any matching record; __bt_search pins the page.
47224420c01Scgd 	 *
47324420c01Scgd 	 * If it's an exact match and duplicates are possible, walk backwards
47424420c01Scgd 	 * in the tree until we find the first one.  Otherwise, make sure it's
47524420c01Scgd 	 * a valid key (__bt_search may return an index just past the end of a
47624420c01Scgd 	 * page) and return it.
47724420c01Scgd 	 */
47824420c01Scgd 	if ((ep = __bt_search(t, key, exactp)) == NULL)
47986147b1cSchristos 		return RET_SPECIAL;
48024420c01Scgd 	if (*exactp) {
48124420c01Scgd 		if (F_ISSET(t, B_NODUPS)) {
48224420c01Scgd 			*erval = *ep;
48324420c01Scgd 			return (RET_SUCCESS);
48424420c01Scgd 		}
48524420c01Scgd 
48624420c01Scgd 		/*
48724420c01Scgd 		 * Walk backwards, as long as the entry matches and there are
48824420c01Scgd 		 * keys left in the tree.  Save a copy of each match in case
48924420c01Scgd 		 * we go too far.
49024420c01Scgd 		 */
49124420c01Scgd 		save = *ep;
49224420c01Scgd 		h = ep->page;
49324420c01Scgd 		do {
49424420c01Scgd 			if (save.page->pgno != ep->page->pgno) {
49524420c01Scgd 				mpool_put(t->bt_mp, save.page, 0);
49624420c01Scgd 				save = *ep;
49724420c01Scgd 			} else
49824420c01Scgd 				save.index = ep->index;
49924420c01Scgd 
50024420c01Scgd 			/*
50124420c01Scgd 			 * Don't unpin the page the last (or original) match
50224420c01Scgd 			 * was on, but make sure it's unpinned if an error
50324420c01Scgd 			 * occurs.
50424420c01Scgd 			 */
50524420c01Scgd 			if (ep->index == 0) {
50624420c01Scgd 				if (h->prevpg == P_INVALID)
50724420c01Scgd 					break;
50824420c01Scgd 				if (h->pgno != save.page->pgno)
50924420c01Scgd 					mpool_put(t->bt_mp, h, 0);
510*aae80e6bSchristos 				if ((hprev = mpool_get(t->bt_mp,
51186147b1cSchristos 				    h->prevpg, 0)) == NULL) {
51286147b1cSchristos 					if (h->pgno == save.page->pgno)
51386147b1cSchristos 						mpool_put(t->bt_mp,
51486147b1cSchristos 						    save.page, 0);
51586147b1cSchristos  					return RET_ERROR;
51686147b1cSchristos 				}
51786147b1cSchristos 				ep->page = h = hprev;
51824420c01Scgd 				ep->index = NEXTINDEX(h);
51924420c01Scgd 			}
52024420c01Scgd 			--ep->index;
52124420c01Scgd 		} while (__bt_cmp(t, key, ep) == 0);
52224420c01Scgd 
52324420c01Scgd 		/*
52424420c01Scgd 		 * Reach here with the last page that was looked at pinned,
52524420c01Scgd 		 * which may or may not be the same as the last (or original)
52624420c01Scgd 		 * match page.  If it's not useful, release it.
52724420c01Scgd 		 */
52824420c01Scgd 		if (h->pgno != save.page->pgno)
52924420c01Scgd 			mpool_put(t->bt_mp, h, 0);
53024420c01Scgd 
53124420c01Scgd 		*erval = save;
53224420c01Scgd 		return (RET_SUCCESS);
53324420c01Scgd 	}
53424420c01Scgd 
53524420c01Scgd 	/* If at the end of a page, find the next entry. */
53624420c01Scgd 	if (ep->index == NEXTINDEX(ep->page)) {
53724420c01Scgd 		h = ep->page;
53824420c01Scgd 		pg = h->nextpg;
53924420c01Scgd 		mpool_put(t->bt_mp, h, 0);
54024420c01Scgd 		if (pg == P_INVALID)
54124420c01Scgd 			return (RET_SPECIAL);
542*aae80e6bSchristos 		if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
54324420c01Scgd 			return (RET_ERROR);
54424420c01Scgd 		ep->index = 0;
54524420c01Scgd 		ep->page = h;
54624420c01Scgd 	}
54724420c01Scgd 	*erval = *ep;
54824420c01Scgd 	return (RET_SUCCESS);
54924420c01Scgd }
55024420c01Scgd 
55124420c01Scgd /*
55224420c01Scgd  * __bt_setcur --
55324420c01Scgd  *	Set the cursor to an entry in the tree.
55424420c01Scgd  *
55524420c01Scgd  * Parameters:
55624420c01Scgd  *	t:	the tree
55724420c01Scgd  *   pgno:	page number
558585dfd61Sthorpej  *    idx:	page index
55924420c01Scgd  */
56024420c01Scgd void
__bt_setcur(BTREE * t,pgno_t pgno,u_int idx)561cb9daf8fSchristos __bt_setcur(BTREE *t, pgno_t pgno, u_int idx)
56224420c01Scgd {
56324420c01Scgd 	/* Lose any already deleted key. */
56424420c01Scgd 	if (t->bt_cursor.key.data != NULL) {
56524420c01Scgd 		free(t->bt_cursor.key.data);
56624420c01Scgd 		t->bt_cursor.key.size = 0;
56724420c01Scgd 		t->bt_cursor.key.data = NULL;
56824420c01Scgd 	}
56924420c01Scgd 	F_CLR(&t->bt_cursor, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE);
57024420c01Scgd 
57124420c01Scgd 	/* Update the cursor. */
57224420c01Scgd 	t->bt_cursor.pg.pgno = pgno;
573585dfd61Sthorpej 	t->bt_cursor.pg.index = idx;
57424420c01Scgd 	F_SET(&t->bt_cursor, CURS_INIT);
5759f0aa214Scgd }
576