xref: /onnv-gate/usr/src/cmd/sendmail/db/btree/bt_cursor.c (revision 0:68f95e015346)
1*0Sstevel@tonic-gate /*-
2*0Sstevel@tonic-gate  * See the file LICENSE for redistribution information.
3*0Sstevel@tonic-gate  *
4*0Sstevel@tonic-gate  * Copyright (c) 1996, 1997, 1998
5*0Sstevel@tonic-gate  *	Sleepycat Software.  All rights reserved.
6*0Sstevel@tonic-gate  */
7*0Sstevel@tonic-gate 
8*0Sstevel@tonic-gate #include "config.h"
9*0Sstevel@tonic-gate 
10*0Sstevel@tonic-gate #ifndef lint
11*0Sstevel@tonic-gate static const char sccsid[] = "@(#)bt_cursor.c	10.81 (Sleepycat) 12/16/98";
12*0Sstevel@tonic-gate #endif /* not lint */
13*0Sstevel@tonic-gate 
14*0Sstevel@tonic-gate #ifndef NO_SYSTEM_INCLUDES
15*0Sstevel@tonic-gate #include <sys/types.h>
16*0Sstevel@tonic-gate 
17*0Sstevel@tonic-gate #include <errno.h>
18*0Sstevel@tonic-gate #include <stdlib.h>
19*0Sstevel@tonic-gate #include <string.h>
20*0Sstevel@tonic-gate #endif
21*0Sstevel@tonic-gate 
22*0Sstevel@tonic-gate #include "db_int.h"
23*0Sstevel@tonic-gate #include "db_page.h"
24*0Sstevel@tonic-gate #include "btree.h"
25*0Sstevel@tonic-gate #include "shqueue.h"
26*0Sstevel@tonic-gate #include "db_shash.h"
27*0Sstevel@tonic-gate #include "lock.h"
28*0Sstevel@tonic-gate #include "lock_ext.h"
29*0Sstevel@tonic-gate 
30*0Sstevel@tonic-gate static int __bam_c_close __P((DBC *));
31*0Sstevel@tonic-gate static int __bam_c_del __P((DBC *, u_int32_t));
32*0Sstevel@tonic-gate static int __bam_c_destroy __P((DBC *));
33*0Sstevel@tonic-gate static int __bam_c_first __P((DBC *, CURSOR *));
34*0Sstevel@tonic-gate static int __bam_c_get __P((DBC *, DBT *, DBT *, u_int32_t));
35*0Sstevel@tonic-gate static int __bam_c_getstack __P((DBC *, CURSOR *));
36*0Sstevel@tonic-gate static int __bam_c_last __P((DBC *, CURSOR *));
37*0Sstevel@tonic-gate static int __bam_c_next __P((DBC *, CURSOR *, int));
38*0Sstevel@tonic-gate static int __bam_c_physdel __P((DBC *, CURSOR *, PAGE *));
39*0Sstevel@tonic-gate static int __bam_c_prev __P((DBC *, CURSOR *));
40*0Sstevel@tonic-gate static int __bam_c_put __P((DBC *, DBT *, DBT *, u_int32_t));
41*0Sstevel@tonic-gate static void __bam_c_reset __P((CURSOR *));
42*0Sstevel@tonic-gate static int __bam_c_rget __P((DBC *, DBT *, u_int32_t));
43*0Sstevel@tonic-gate static int __bam_c_search __P((DBC *, CURSOR *, const DBT *, u_int32_t, int *));
44*0Sstevel@tonic-gate static int __bam_dsearch __P((DBC *, CURSOR *,  DBT *, u_int32_t *));
45*0Sstevel@tonic-gate 
46*0Sstevel@tonic-gate /* Discard the current page/lock held by a cursor. */
47*0Sstevel@tonic-gate #undef	DISCARD
48*0Sstevel@tonic-gate #define	DISCARD(dbc, cp) {						\
49*0Sstevel@tonic-gate 	if ((cp)->page != NULL) {					\
50*0Sstevel@tonic-gate 		(void)memp_fput((dbc)->dbp->mpf, (cp)->page, 0);	\
51*0Sstevel@tonic-gate 		(cp)->page = NULL;					\
52*0Sstevel@tonic-gate 	}								\
53*0Sstevel@tonic-gate 	if ((cp)->lock != LOCK_INVALID) {				\
54*0Sstevel@tonic-gate 		(void)__BT_TLPUT((dbc), (cp)->lock);			\
55*0Sstevel@tonic-gate 		(cp)->lock = LOCK_INVALID;				\
56*0Sstevel@tonic-gate 	}								\
57*0Sstevel@tonic-gate }
58*0Sstevel@tonic-gate 
59*0Sstevel@tonic-gate /* If the cursor references a deleted record. */
60*0Sstevel@tonic-gate #undef	IS_CUR_DELETED
61*0Sstevel@tonic-gate #define	IS_CUR_DELETED(cp)						\
62*0Sstevel@tonic-gate 	(((cp)->dpgno == PGNO_INVALID &&				\
63*0Sstevel@tonic-gate 	B_DISSET(GET_BKEYDATA((cp)->page,				\
64*0Sstevel@tonic-gate 	(cp)->indx + O_INDX)->type)) ||					\
65*0Sstevel@tonic-gate 	((cp)->dpgno != PGNO_INVALID &&					\
66*0Sstevel@tonic-gate 	B_DISSET(GET_BKEYDATA((cp)->page, (cp)->dindx)->type)))
67*0Sstevel@tonic-gate 
68*0Sstevel@tonic-gate /* If the cursor and index combination references a deleted record. */
69*0Sstevel@tonic-gate #undef	IS_DELETED
70*0Sstevel@tonic-gate #define	IS_DELETED(cp, indx)						\
71*0Sstevel@tonic-gate 	(((cp)->dpgno == PGNO_INVALID &&				\
72*0Sstevel@tonic-gate 	B_DISSET(GET_BKEYDATA((cp)->page, (indx) + O_INDX)->type)) ||	\
73*0Sstevel@tonic-gate 	((cp)->dpgno != PGNO_INVALID &&					\
74*0Sstevel@tonic-gate 	B_DISSET(GET_BKEYDATA((cp)->page, (indx))->type)))
75*0Sstevel@tonic-gate 
76*0Sstevel@tonic-gate /*
77*0Sstevel@tonic-gate  * Test to see if two cursors could point to duplicates of the same key,
78*0Sstevel@tonic-gate  * whether on-page or off-page.  The leaf page numbers must be the same
79*0Sstevel@tonic-gate  * in both cases.  In the case of off-page duplicates, the key indices
80*0Sstevel@tonic-gate  * on the leaf page will be the same.  In the case of on-page duplicates,
81*0Sstevel@tonic-gate  * the duplicate page number must not be set, and the key index offsets
82*0Sstevel@tonic-gate  * must be the same.  For the last test, as the saved copy of the cursor
83*0Sstevel@tonic-gate  * will not have a valid page pointer, we use the cursor's.
84*0Sstevel@tonic-gate  */
85*0Sstevel@tonic-gate #undef	POSSIBLE_DUPLICATE
86*0Sstevel@tonic-gate #define	POSSIBLE_DUPLICATE(cursor, saved_copy)				\
87*0Sstevel@tonic-gate 	((cursor)->pgno == (saved_copy).pgno &&				\
88*0Sstevel@tonic-gate 	((cursor)->indx == (saved_copy).indx ||				\
89*0Sstevel@tonic-gate 	((cursor)->dpgno == PGNO_INVALID &&				\
90*0Sstevel@tonic-gate 	    (saved_copy).dpgno == PGNO_INVALID &&			\
91*0Sstevel@tonic-gate 	    (cursor)->page->inp[(cursor)->indx] ==			\
92*0Sstevel@tonic-gate 	    (cursor)->page->inp[(saved_copy).indx])))
93*0Sstevel@tonic-gate 
94*0Sstevel@tonic-gate /*
95*0Sstevel@tonic-gate  * __bam_c_reset --
96*0Sstevel@tonic-gate  *	Initialize internal cursor structure.
97*0Sstevel@tonic-gate  */
98*0Sstevel@tonic-gate static void
__bam_c_reset(cp)99*0Sstevel@tonic-gate __bam_c_reset(cp)
100*0Sstevel@tonic-gate 	CURSOR *cp;
101*0Sstevel@tonic-gate {
102*0Sstevel@tonic-gate 	cp->sp = cp->csp = cp->stack;
103*0Sstevel@tonic-gate 	cp->esp = cp->stack + sizeof(cp->stack) / sizeof(cp->stack[0]);
104*0Sstevel@tonic-gate 	cp->page = NULL;
105*0Sstevel@tonic-gate 	cp->pgno = PGNO_INVALID;
106*0Sstevel@tonic-gate 	cp->indx = 0;
107*0Sstevel@tonic-gate 	cp->dpgno = PGNO_INVALID;
108*0Sstevel@tonic-gate 	cp->dindx = 0;
109*0Sstevel@tonic-gate 	cp->lock = LOCK_INVALID;
110*0Sstevel@tonic-gate 	cp->mode = DB_LOCK_NG;
111*0Sstevel@tonic-gate 	cp->recno = RECNO_OOB;
112*0Sstevel@tonic-gate 	cp->flags = 0;
113*0Sstevel@tonic-gate }
114*0Sstevel@tonic-gate 
115*0Sstevel@tonic-gate /*
116*0Sstevel@tonic-gate  * __bam_c_init --
117*0Sstevel@tonic-gate  *	Initialize the access private portion of a cursor
118*0Sstevel@tonic-gate  *
119*0Sstevel@tonic-gate  * PUBLIC: int __bam_c_init __P((DBC *));
120*0Sstevel@tonic-gate  */
121*0Sstevel@tonic-gate int
__bam_c_init(dbc)122*0Sstevel@tonic-gate __bam_c_init(dbc)
123*0Sstevel@tonic-gate 	DBC *dbc;
124*0Sstevel@tonic-gate {
125*0Sstevel@tonic-gate 	DB *dbp;
126*0Sstevel@tonic-gate 	CURSOR *cp;
127*0Sstevel@tonic-gate 	int ret;
128*0Sstevel@tonic-gate 
129*0Sstevel@tonic-gate 	if ((ret = __os_calloc(1, sizeof(CURSOR), &cp)) != 0)
130*0Sstevel@tonic-gate 		return (ret);
131*0Sstevel@tonic-gate 
132*0Sstevel@tonic-gate 	dbp = dbc->dbp;
133*0Sstevel@tonic-gate 	cp->dbc = dbc;
134*0Sstevel@tonic-gate 
135*0Sstevel@tonic-gate 	/*
136*0Sstevel@tonic-gate 	 * Logical record numbers are always the same size, and we don't want
137*0Sstevel@tonic-gate 	 * to have to check for space every time we return one.  Allocate it
138*0Sstevel@tonic-gate 	 * in advance.
139*0Sstevel@tonic-gate 	 */
140*0Sstevel@tonic-gate 	if (dbp->type == DB_RECNO || F_ISSET(dbp, DB_BT_RECNUM)) {
141*0Sstevel@tonic-gate 		if ((ret = __os_malloc(sizeof(db_recno_t),
142*0Sstevel@tonic-gate 		    NULL, &dbc->rkey.data)) != 0) {
143*0Sstevel@tonic-gate 			__os_free(cp, sizeof(CURSOR));
144*0Sstevel@tonic-gate 			return (ret);
145*0Sstevel@tonic-gate 		}
146*0Sstevel@tonic-gate 		dbc->rkey.ulen = sizeof(db_recno_t);
147*0Sstevel@tonic-gate 	}
148*0Sstevel@tonic-gate 
149*0Sstevel@tonic-gate 	/* Initialize methods. */
150*0Sstevel@tonic-gate 	dbc->internal = cp;
151*0Sstevel@tonic-gate 	if (dbp->type == DB_BTREE) {
152*0Sstevel@tonic-gate 		dbc->c_am_close = __bam_c_close;
153*0Sstevel@tonic-gate 		dbc->c_am_destroy = __bam_c_destroy;
154*0Sstevel@tonic-gate 		dbc->c_del = __bam_c_del;
155*0Sstevel@tonic-gate 		dbc->c_get = __bam_c_get;
156*0Sstevel@tonic-gate 		dbc->c_put = __bam_c_put;
157*0Sstevel@tonic-gate 	} else {
158*0Sstevel@tonic-gate 		dbc->c_am_close = __bam_c_close;
159*0Sstevel@tonic-gate 		dbc->c_am_destroy = __bam_c_destroy;
160*0Sstevel@tonic-gate 		dbc->c_del = __ram_c_del;
161*0Sstevel@tonic-gate 		dbc->c_get = __ram_c_get;
162*0Sstevel@tonic-gate 		dbc->c_put = __ram_c_put;
163*0Sstevel@tonic-gate 	}
164*0Sstevel@tonic-gate 
165*0Sstevel@tonic-gate 	/* Initialize dynamic information. */
166*0Sstevel@tonic-gate 	__bam_c_reset(cp);
167*0Sstevel@tonic-gate 
168*0Sstevel@tonic-gate 	return (0);
169*0Sstevel@tonic-gate }
170*0Sstevel@tonic-gate 
171*0Sstevel@tonic-gate /*
172*0Sstevel@tonic-gate  * __bam_c_close --
173*0Sstevel@tonic-gate  *	Close down the cursor from a single use.
174*0Sstevel@tonic-gate  */
175*0Sstevel@tonic-gate static int
__bam_c_close(dbc)176*0Sstevel@tonic-gate __bam_c_close(dbc)
177*0Sstevel@tonic-gate 	DBC *dbc;
178*0Sstevel@tonic-gate {
179*0Sstevel@tonic-gate 	CURSOR *cp;
180*0Sstevel@tonic-gate 	DB *dbp;
181*0Sstevel@tonic-gate 	int ret;
182*0Sstevel@tonic-gate 
183*0Sstevel@tonic-gate 	dbp = dbc->dbp;
184*0Sstevel@tonic-gate 	cp = dbc->internal;
185*0Sstevel@tonic-gate 	ret = 0;
186*0Sstevel@tonic-gate 
187*0Sstevel@tonic-gate 	/*
188*0Sstevel@tonic-gate 	 * If a cursor deleted a btree key, perform the actual deletion.
189*0Sstevel@tonic-gate 	 * (Recno keys are either deleted immediately or never deleted.)
190*0Sstevel@tonic-gate 	 */
191*0Sstevel@tonic-gate 	if (dbp->type == DB_BTREE && F_ISSET(cp, C_DELETED))
192*0Sstevel@tonic-gate 		ret = __bam_c_physdel(dbc, cp, NULL);
193*0Sstevel@tonic-gate 
194*0Sstevel@tonic-gate 	/* Discard any locks not acquired inside of a transaction. */
195*0Sstevel@tonic-gate 	if (cp->lock != LOCK_INVALID) {
196*0Sstevel@tonic-gate 		(void)__BT_TLPUT(dbc, cp->lock);
197*0Sstevel@tonic-gate 		cp->lock = LOCK_INVALID;
198*0Sstevel@tonic-gate 	}
199*0Sstevel@tonic-gate 
200*0Sstevel@tonic-gate 	/* Sanity checks. */
201*0Sstevel@tonic-gate #ifdef DIAGNOSTIC
202*0Sstevel@tonic-gate 	if (cp->csp != cp->stack)
203*0Sstevel@tonic-gate 		__db_err(dbp->dbenv, "btree cursor close: stack not empty");
204*0Sstevel@tonic-gate #endif
205*0Sstevel@tonic-gate 
206*0Sstevel@tonic-gate 	/* Initialize dynamic information. */
207*0Sstevel@tonic-gate 	__bam_c_reset(cp);
208*0Sstevel@tonic-gate 
209*0Sstevel@tonic-gate 	return (ret);
210*0Sstevel@tonic-gate }
211*0Sstevel@tonic-gate 
212*0Sstevel@tonic-gate /*
213*0Sstevel@tonic-gate  * __bam_c_destroy --
214*0Sstevel@tonic-gate  *	Close a single cursor -- internal version.
215*0Sstevel@tonic-gate  */
216*0Sstevel@tonic-gate static int
__bam_c_destroy(dbc)217*0Sstevel@tonic-gate __bam_c_destroy(dbc)
218*0Sstevel@tonic-gate 	DBC *dbc;
219*0Sstevel@tonic-gate {
220*0Sstevel@tonic-gate 	/* Discard the structures. */
221*0Sstevel@tonic-gate 	__os_free(dbc->internal, sizeof(CURSOR));
222*0Sstevel@tonic-gate 
223*0Sstevel@tonic-gate 	return (0);
224*0Sstevel@tonic-gate }
225*0Sstevel@tonic-gate 
226*0Sstevel@tonic-gate /*
227*0Sstevel@tonic-gate  * __bam_c_del --
228*0Sstevel@tonic-gate  *	Delete using a cursor.
229*0Sstevel@tonic-gate  */
230*0Sstevel@tonic-gate static int
__bam_c_del(dbc,flags)231*0Sstevel@tonic-gate __bam_c_del(dbc, flags)
232*0Sstevel@tonic-gate 	DBC *dbc;
233*0Sstevel@tonic-gate 	u_int32_t flags;
234*0Sstevel@tonic-gate {
235*0Sstevel@tonic-gate 	CURSOR *cp;
236*0Sstevel@tonic-gate 	DB *dbp;
237*0Sstevel@tonic-gate 	DB_LOCK lock;
238*0Sstevel@tonic-gate 	PAGE *h;
239*0Sstevel@tonic-gate 	db_pgno_t pgno;
240*0Sstevel@tonic-gate 	db_indx_t indx;
241*0Sstevel@tonic-gate 	int ret;
242*0Sstevel@tonic-gate 
243*0Sstevel@tonic-gate 	dbp = dbc->dbp;
244*0Sstevel@tonic-gate 	cp = dbc->internal;
245*0Sstevel@tonic-gate 	h = NULL;
246*0Sstevel@tonic-gate 
247*0Sstevel@tonic-gate 	DB_PANIC_CHECK(dbp);
248*0Sstevel@tonic-gate 
249*0Sstevel@tonic-gate 	/* Check for invalid flags. */
250*0Sstevel@tonic-gate 	if ((ret = __db_cdelchk(dbp, flags,
251*0Sstevel@tonic-gate 	    F_ISSET(dbp, DB_AM_RDONLY), cp->pgno != PGNO_INVALID)) != 0)
252*0Sstevel@tonic-gate 		return (ret);
253*0Sstevel@tonic-gate 
254*0Sstevel@tonic-gate 	/*
255*0Sstevel@tonic-gate 	 * If we are running CDB, this had better be either a write
256*0Sstevel@tonic-gate 	 * cursor or an immediate writer.
257*0Sstevel@tonic-gate 	 */
258*0Sstevel@tonic-gate 	if (F_ISSET(dbp, DB_AM_CDB))
259*0Sstevel@tonic-gate 		if (!F_ISSET(dbc, DBC_RMW | DBC_WRITER))
260*0Sstevel@tonic-gate 			return (EINVAL);
261*0Sstevel@tonic-gate 
262*0Sstevel@tonic-gate 	DEBUG_LWRITE(dbc, dbc->txn, "bam_c_del", NULL, NULL, flags);
263*0Sstevel@tonic-gate 
264*0Sstevel@tonic-gate 	/* If already deleted, return failure. */
265*0Sstevel@tonic-gate 	if (F_ISSET(cp, C_DELETED))
266*0Sstevel@tonic-gate 		return (DB_KEYEMPTY);
267*0Sstevel@tonic-gate 
268*0Sstevel@tonic-gate 	/*
269*0Sstevel@tonic-gate 	 * We don't physically delete the record until the cursor moves,
270*0Sstevel@tonic-gate 	 * so we have to have a long-lived write lock on the page instead
271*0Sstevel@tonic-gate 	 * of a long-lived read lock.  Note, we have to have a read lock
272*0Sstevel@tonic-gate 	 * to even get here, so we simply discard it.
273*0Sstevel@tonic-gate 	 */
274*0Sstevel@tonic-gate 	if (F_ISSET(dbp, DB_AM_LOCKING) && cp->mode != DB_LOCK_WRITE) {
275*0Sstevel@tonic-gate 		if ((ret = __bam_lget(dbc,
276*0Sstevel@tonic-gate 		    0, cp->pgno, DB_LOCK_WRITE, &lock)) != 0)
277*0Sstevel@tonic-gate 			goto err;
278*0Sstevel@tonic-gate 		(void)__BT_TLPUT(dbc, cp->lock);
279*0Sstevel@tonic-gate 		cp->lock = lock;
280*0Sstevel@tonic-gate 		cp->mode = DB_LOCK_WRITE;
281*0Sstevel@tonic-gate 	}
282*0Sstevel@tonic-gate 
283*0Sstevel@tonic-gate 	/*
284*0Sstevel@tonic-gate 	 * Acquire the underlying page (which may be different from the above
285*0Sstevel@tonic-gate 	 * page because it may be a duplicate page), and set the on-page and
286*0Sstevel@tonic-gate 	 * in-cursor delete flags.  We don't need to lock it as we've already
287*0Sstevel@tonic-gate 	 * write-locked the page leading to it.
288*0Sstevel@tonic-gate 	 */
289*0Sstevel@tonic-gate 	if (cp->dpgno == PGNO_INVALID) {
290*0Sstevel@tonic-gate 		pgno = cp->pgno;
291*0Sstevel@tonic-gate 		indx = cp->indx;
292*0Sstevel@tonic-gate 	} else {
293*0Sstevel@tonic-gate 		pgno = cp->dpgno;
294*0Sstevel@tonic-gate 		indx = cp->dindx;
295*0Sstevel@tonic-gate 	}
296*0Sstevel@tonic-gate 
297*0Sstevel@tonic-gate 	if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
298*0Sstevel@tonic-gate 		goto err;
299*0Sstevel@tonic-gate 
300*0Sstevel@tonic-gate 	/* Log the change. */
301*0Sstevel@tonic-gate 	if (DB_LOGGING(dbc) &&
302*0Sstevel@tonic-gate 	    (ret = __bam_cdel_log(dbp->dbenv->lg_info, dbc->txn, &LSN(h),
303*0Sstevel@tonic-gate 	    0, dbp->log_fileid, PGNO(h), &LSN(h), indx)) != 0) {
304*0Sstevel@tonic-gate 		(void)memp_fput(dbp->mpf, h, 0);
305*0Sstevel@tonic-gate 		goto err;
306*0Sstevel@tonic-gate 	}
307*0Sstevel@tonic-gate 
308*0Sstevel@tonic-gate 	/*
309*0Sstevel@tonic-gate 	 * Set the intent-to-delete flag on the page and update all cursors. */
310*0Sstevel@tonic-gate 	if (cp->dpgno == PGNO_INVALID)
311*0Sstevel@tonic-gate 		B_DSET(GET_BKEYDATA(h, indx + O_INDX)->type);
312*0Sstevel@tonic-gate 	else
313*0Sstevel@tonic-gate 		B_DSET(GET_BKEYDATA(h, indx)->type);
314*0Sstevel@tonic-gate 	(void)__bam_ca_delete(dbp, pgno, indx, 1);
315*0Sstevel@tonic-gate 
316*0Sstevel@tonic-gate 	ret = memp_fput(dbp->mpf, h, DB_MPOOL_DIRTY);
317*0Sstevel@tonic-gate 	h = NULL;
318*0Sstevel@tonic-gate 
319*0Sstevel@tonic-gate 	/*
320*0Sstevel@tonic-gate 	 * If the tree has record numbers, we have to adjust the counts.
321*0Sstevel@tonic-gate 	 *
322*0Sstevel@tonic-gate 	 * !!!
323*0Sstevel@tonic-gate 	 * This test is right -- we don't yet support duplicates and record
324*0Sstevel@tonic-gate 	 * numbers in the same tree, so ignore duplicates if DB_BT_RECNUM
325*0Sstevel@tonic-gate 	 * set.
326*0Sstevel@tonic-gate 	 */
327*0Sstevel@tonic-gate 	if (F_ISSET(dbp, DB_BT_RECNUM)) {
328*0Sstevel@tonic-gate 		if ((ret = __bam_c_getstack(dbc, cp)) != 0)
329*0Sstevel@tonic-gate 			goto err;
330*0Sstevel@tonic-gate 		if ((ret = __bam_adjust(dbc, -1)) != 0)
331*0Sstevel@tonic-gate 			goto err;
332*0Sstevel@tonic-gate 		(void)__bam_stkrel(dbc, 0);
333*0Sstevel@tonic-gate 	}
334*0Sstevel@tonic-gate 
335*0Sstevel@tonic-gate err:	if (h != NULL)
336*0Sstevel@tonic-gate 		(void)memp_fput(dbp->mpf, h, 0);
337*0Sstevel@tonic-gate 	return (ret);
338*0Sstevel@tonic-gate }
339*0Sstevel@tonic-gate 
340*0Sstevel@tonic-gate /*
341*0Sstevel@tonic-gate  * __bam_c_get --
342*0Sstevel@tonic-gate  *	Get using a cursor (btree).
343*0Sstevel@tonic-gate  */
344*0Sstevel@tonic-gate static int
__bam_c_get(dbc,key,data,flags)345*0Sstevel@tonic-gate __bam_c_get(dbc, key, data, flags)
346*0Sstevel@tonic-gate 	DBC *dbc;
347*0Sstevel@tonic-gate 	DBT *key, *data;
348*0Sstevel@tonic-gate 	u_int32_t flags;
349*0Sstevel@tonic-gate {
350*0Sstevel@tonic-gate 	CURSOR *cp, copy, start;
351*0Sstevel@tonic-gate 	DB *dbp;
352*0Sstevel@tonic-gate 	PAGE *h;
353*0Sstevel@tonic-gate 	int exact, ret, tmp_rmw;
354*0Sstevel@tonic-gate 
355*0Sstevel@tonic-gate 	dbp = dbc->dbp;
356*0Sstevel@tonic-gate 	cp = dbc->internal;
357*0Sstevel@tonic-gate 
358*0Sstevel@tonic-gate 	DB_PANIC_CHECK(dbp);
359*0Sstevel@tonic-gate 
360*0Sstevel@tonic-gate 	/* Check for invalid flags. */
361*0Sstevel@tonic-gate 	if ((ret = __db_cgetchk(dbp,
362*0Sstevel@tonic-gate 	    key, data, flags, cp->pgno != PGNO_INVALID)) != 0)
363*0Sstevel@tonic-gate 		return (ret);
364*0Sstevel@tonic-gate 
365*0Sstevel@tonic-gate 	/* Clear OR'd in additional bits so we can check for flag equality. */
366*0Sstevel@tonic-gate 	tmp_rmw = 0;
367*0Sstevel@tonic-gate 	if (LF_ISSET(DB_RMW)) {
368*0Sstevel@tonic-gate 		if (!F_ISSET(dbp, DB_AM_CDB)) {
369*0Sstevel@tonic-gate 			tmp_rmw = 1;
370*0Sstevel@tonic-gate 			F_SET(dbc, DBC_RMW);
371*0Sstevel@tonic-gate 		}
372*0Sstevel@tonic-gate 		LF_CLR(DB_RMW);
373*0Sstevel@tonic-gate 	}
374*0Sstevel@tonic-gate 
375*0Sstevel@tonic-gate 	DEBUG_LREAD(dbc, dbc->txn, "bam_c_get",
376*0Sstevel@tonic-gate 	    flags == DB_SET || flags == DB_SET_RANGE ? key : NULL, NULL, flags);
377*0Sstevel@tonic-gate 
378*0Sstevel@tonic-gate 	/*
379*0Sstevel@tonic-gate 	 * Return a cursor's record number.  It has nothing to do with the
380*0Sstevel@tonic-gate 	 * cursor get code except that it's been rammed into the interface.
381*0Sstevel@tonic-gate 	 */
382*0Sstevel@tonic-gate 	if (flags == DB_GET_RECNO) {
383*0Sstevel@tonic-gate 		ret = __bam_c_rget(dbc, data, flags);
384*0Sstevel@tonic-gate 		if (tmp_rmw)
385*0Sstevel@tonic-gate 			F_CLR(dbc, DBC_RMW);
386*0Sstevel@tonic-gate 		return (ret);
387*0Sstevel@tonic-gate 	}
388*0Sstevel@tonic-gate 
389*0Sstevel@tonic-gate 	/*
390*0Sstevel@tonic-gate 	 * Initialize the cursor for a new retrieval.  Clear the cursor's
391*0Sstevel@tonic-gate 	 * page pointer, it was set before this operation, and no longer
392*0Sstevel@tonic-gate 	 * has any meaning.
393*0Sstevel@tonic-gate 	 */
394*0Sstevel@tonic-gate 	cp->page = NULL;
395*0Sstevel@tonic-gate 	copy = *cp;
396*0Sstevel@tonic-gate 	cp->lock = LOCK_INVALID;
397*0Sstevel@tonic-gate 
398*0Sstevel@tonic-gate 	switch (flags) {
399*0Sstevel@tonic-gate 	case DB_CURRENT:
400*0Sstevel@tonic-gate 		/* It's not possible to return a deleted record. */
401*0Sstevel@tonic-gate 		if (F_ISSET(cp, C_DELETED)) {
402*0Sstevel@tonic-gate 			ret = DB_KEYEMPTY;
403*0Sstevel@tonic-gate 			goto err;
404*0Sstevel@tonic-gate 		}
405*0Sstevel@tonic-gate 
406*0Sstevel@tonic-gate 		/* Acquire the current page. */
407*0Sstevel@tonic-gate 		if ((ret = __bam_lget(dbc,
408*0Sstevel@tonic-gate 		    0, cp->pgno, DB_LOCK_READ, &cp->lock)) == 0)
409*0Sstevel@tonic-gate 			ret = memp_fget(dbp->mpf,
410*0Sstevel@tonic-gate 			    cp->dpgno == PGNO_INVALID ? &cp->pgno : &cp->dpgno,
411*0Sstevel@tonic-gate 			    0, &cp->page);
412*0Sstevel@tonic-gate 		if (ret != 0)
413*0Sstevel@tonic-gate 			goto err;
414*0Sstevel@tonic-gate 		break;
415*0Sstevel@tonic-gate 	case DB_NEXT_DUP:
416*0Sstevel@tonic-gate 		if (cp->pgno == PGNO_INVALID) {
417*0Sstevel@tonic-gate 			ret = EINVAL;
418*0Sstevel@tonic-gate 			goto err;
419*0Sstevel@tonic-gate 		}
420*0Sstevel@tonic-gate 		if ((ret = __bam_c_next(dbc, cp, 1)) != 0)
421*0Sstevel@tonic-gate 			goto err;
422*0Sstevel@tonic-gate 
423*0Sstevel@tonic-gate 		/* Make sure we didn't go past the end of the duplicates. */
424*0Sstevel@tonic-gate 		if (!POSSIBLE_DUPLICATE(cp, copy)) {
425*0Sstevel@tonic-gate 			ret = DB_NOTFOUND;
426*0Sstevel@tonic-gate 			goto err;
427*0Sstevel@tonic-gate 		}
428*0Sstevel@tonic-gate 		break;
429*0Sstevel@tonic-gate 	case DB_NEXT:
430*0Sstevel@tonic-gate 		if (cp->pgno != PGNO_INVALID) {
431*0Sstevel@tonic-gate 			if ((ret = __bam_c_next(dbc, cp, 1)) != 0)
432*0Sstevel@tonic-gate 				goto err;
433*0Sstevel@tonic-gate 			break;
434*0Sstevel@tonic-gate 		}
435*0Sstevel@tonic-gate 		/* FALLTHROUGH */
436*0Sstevel@tonic-gate 	case DB_FIRST:
437*0Sstevel@tonic-gate 		if ((ret = __bam_c_first(dbc, cp)) != 0)
438*0Sstevel@tonic-gate 			goto err;
439*0Sstevel@tonic-gate 		break;
440*0Sstevel@tonic-gate 	case DB_PREV:
441*0Sstevel@tonic-gate 		if (cp->pgno != PGNO_INVALID) {
442*0Sstevel@tonic-gate 			if ((ret = __bam_c_prev(dbc, cp)) != 0)
443*0Sstevel@tonic-gate 				goto err;
444*0Sstevel@tonic-gate 			break;
445*0Sstevel@tonic-gate 		}
446*0Sstevel@tonic-gate 		/* FALLTHROUGH */
447*0Sstevel@tonic-gate 	case DB_LAST:
448*0Sstevel@tonic-gate 		if ((ret = __bam_c_last(dbc, cp)) != 0)
449*0Sstevel@tonic-gate 			goto err;
450*0Sstevel@tonic-gate 		break;
451*0Sstevel@tonic-gate 	case DB_SET:
452*0Sstevel@tonic-gate 		if ((ret = __bam_c_search(dbc, cp, key, flags, &exact)) != 0)
453*0Sstevel@tonic-gate 			goto err;
454*0Sstevel@tonic-gate 
455*0Sstevel@tonic-gate 		/*
456*0Sstevel@tonic-gate 		 * We cannot currently be referencing a deleted record, but we
457*0Sstevel@tonic-gate 		 * may be referencing off-page duplicates.
458*0Sstevel@tonic-gate 		 *
459*0Sstevel@tonic-gate 		 * If we're referencing off-page duplicates, move off-page.
460*0Sstevel@tonic-gate 		 * If we moved off-page, move to the next non-deleted record.
461*0Sstevel@tonic-gate 		 * If we moved to the next non-deleted record, check to make
462*0Sstevel@tonic-gate 		 * sure we didn't switch records because our current record
463*0Sstevel@tonic-gate 		 * had no non-deleted data items.
464*0Sstevel@tonic-gate 		 */
465*0Sstevel@tonic-gate 		start = *cp;
466*0Sstevel@tonic-gate 		if ((ret = __bam_dup(dbc, cp, cp->indx, 0)) != 0)
467*0Sstevel@tonic-gate 			goto err;
468*0Sstevel@tonic-gate 		if (cp->dpgno != PGNO_INVALID && IS_CUR_DELETED(cp)) {
469*0Sstevel@tonic-gate 			if ((ret = __bam_c_next(dbc, cp, 0)) != 0)
470*0Sstevel@tonic-gate 				goto err;
471*0Sstevel@tonic-gate 			if (!POSSIBLE_DUPLICATE(cp, start)) {
472*0Sstevel@tonic-gate 				ret = DB_NOTFOUND;
473*0Sstevel@tonic-gate 				goto err;
474*0Sstevel@tonic-gate 			}
475*0Sstevel@tonic-gate 		}
476*0Sstevel@tonic-gate 		break;
477*0Sstevel@tonic-gate 	case DB_SET_RECNO:
478*0Sstevel@tonic-gate 		if ((ret = __bam_c_search(dbc, cp, key, flags, &exact)) != 0)
479*0Sstevel@tonic-gate 			goto err;
480*0Sstevel@tonic-gate 		break;
481*0Sstevel@tonic-gate 	case DB_GET_BOTH:
482*0Sstevel@tonic-gate 		if (F_ISSET(dbc, DBC_CONTINUE | DBC_KEYSET)) {
483*0Sstevel@tonic-gate 			/* Acquire the current page. */
484*0Sstevel@tonic-gate 			if ((ret = memp_fget(dbp->mpf,
485*0Sstevel@tonic-gate 			    cp->dpgno == PGNO_INVALID ? &cp->pgno : &cp->dpgno,
486*0Sstevel@tonic-gate 			    0, &cp->page)) != 0)
487*0Sstevel@tonic-gate 				goto err;
488*0Sstevel@tonic-gate 
489*0Sstevel@tonic-gate 			/* If DBC_CONTINUE, move to the next item. */
490*0Sstevel@tonic-gate 			if (F_ISSET(dbc, DBC_CONTINUE) &&
491*0Sstevel@tonic-gate 			    (ret = __bam_c_next(dbc, cp, 1)) != 0)
492*0Sstevel@tonic-gate 				goto err;
493*0Sstevel@tonic-gate 		} else {
494*0Sstevel@tonic-gate 			if ((ret =
495*0Sstevel@tonic-gate 			    __bam_c_search(dbc, cp, key, flags, &exact)) != 0)
496*0Sstevel@tonic-gate 				goto err;
497*0Sstevel@tonic-gate 
498*0Sstevel@tonic-gate 			/*
499*0Sstevel@tonic-gate 			 * We may be referencing a duplicates page.  Move to
500*0Sstevel@tonic-gate 			 * the first duplicate.
501*0Sstevel@tonic-gate 			 */
502*0Sstevel@tonic-gate 			if ((ret = __bam_dup(dbc, cp, cp->indx, 0)) != 0)
503*0Sstevel@tonic-gate 				goto err;
504*0Sstevel@tonic-gate 		}
505*0Sstevel@tonic-gate 
506*0Sstevel@tonic-gate 		/* Search for a matching entry. */
507*0Sstevel@tonic-gate 		if ((ret = __bam_dsearch(dbc, cp, data, NULL)) != 0)
508*0Sstevel@tonic-gate 			goto err;
509*0Sstevel@tonic-gate 
510*0Sstevel@tonic-gate 		/* Ignore deleted entries. */
511*0Sstevel@tonic-gate 		if (IS_CUR_DELETED(cp)) {
512*0Sstevel@tonic-gate 			ret = DB_NOTFOUND;
513*0Sstevel@tonic-gate 			goto err;
514*0Sstevel@tonic-gate 		}
515*0Sstevel@tonic-gate 		break;
516*0Sstevel@tonic-gate 	case DB_SET_RANGE:
517*0Sstevel@tonic-gate 		if ((ret = __bam_c_search(dbc, cp, key, flags, &exact)) != 0)
518*0Sstevel@tonic-gate 			goto err;
519*0Sstevel@tonic-gate 
520*0Sstevel@tonic-gate 		/*
521*0Sstevel@tonic-gate 		 * As we didn't require an exact match, the search function
522*0Sstevel@tonic-gate 		 * may have returned an entry past the end of the page.  If
523*0Sstevel@tonic-gate 		 * so, move to the next entry.
524*0Sstevel@tonic-gate 		 */
525*0Sstevel@tonic-gate 		if (cp->indx == NUM_ENT(cp->page) &&
526*0Sstevel@tonic-gate 		    (ret = __bam_c_next(dbc, cp, 0)) != 0)
527*0Sstevel@tonic-gate 			goto err;
528*0Sstevel@tonic-gate 
529*0Sstevel@tonic-gate 		/*
530*0Sstevel@tonic-gate 		 * We may be referencing off-page duplicates, if so, move
531*0Sstevel@tonic-gate 		 * off-page.
532*0Sstevel@tonic-gate 		 */
533*0Sstevel@tonic-gate 		if ((ret = __bam_dup(dbc, cp, cp->indx, 0)) != 0)
534*0Sstevel@tonic-gate 			goto err;
535*0Sstevel@tonic-gate 
536*0Sstevel@tonic-gate 		/*
537*0Sstevel@tonic-gate 		 * We may be referencing a deleted record, if so, move to
538*0Sstevel@tonic-gate 		 * the next non-deleted record.
539*0Sstevel@tonic-gate 		 */
540*0Sstevel@tonic-gate 		if (IS_CUR_DELETED(cp) && (ret = __bam_c_next(dbc, cp, 0)) != 0)
541*0Sstevel@tonic-gate 			goto err;
542*0Sstevel@tonic-gate 		break;
543*0Sstevel@tonic-gate 	}
544*0Sstevel@tonic-gate 
545*0Sstevel@tonic-gate 	/*
546*0Sstevel@tonic-gate 	 * Return the key if the user didn't give us one.  If we've moved to
547*0Sstevel@tonic-gate 	 * a duplicate page, we may no longer have a pointer to the main page,
548*0Sstevel@tonic-gate 	 * so we have to go get it.  We know that it's already read-locked,
549*0Sstevel@tonic-gate 	 * however, so we don't have to acquire a new lock.
550*0Sstevel@tonic-gate 	 */
551*0Sstevel@tonic-gate 	if (flags != DB_SET) {
552*0Sstevel@tonic-gate 		if (cp->dpgno != PGNO_INVALID) {
553*0Sstevel@tonic-gate 			if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &h)) != 0)
554*0Sstevel@tonic-gate 				goto err;
555*0Sstevel@tonic-gate 		} else
556*0Sstevel@tonic-gate 			h = cp->page;
557*0Sstevel@tonic-gate 		ret = __db_ret(dbp,
558*0Sstevel@tonic-gate 		    h, cp->indx, key, &dbc->rkey.data, &dbc->rkey.ulen);
559*0Sstevel@tonic-gate 		if (cp->dpgno != PGNO_INVALID)
560*0Sstevel@tonic-gate 			(void)memp_fput(dbp->mpf, h, 0);
561*0Sstevel@tonic-gate 		if (ret)
562*0Sstevel@tonic-gate 			goto err;
563*0Sstevel@tonic-gate 	}
564*0Sstevel@tonic-gate 
565*0Sstevel@tonic-gate 	/* Return the data. */
566*0Sstevel@tonic-gate 	if ((ret = __db_ret(dbp, cp->page,
567*0Sstevel@tonic-gate 	    cp->dpgno == PGNO_INVALID ? cp->indx + O_INDX : cp->dindx,
568*0Sstevel@tonic-gate 	    data, &dbc->rdata.data, &dbc->rdata.ulen)) != 0)
569*0Sstevel@tonic-gate 		goto err;
570*0Sstevel@tonic-gate 
571*0Sstevel@tonic-gate 	/*
572*0Sstevel@tonic-gate 	 * If the previous cursor record has been deleted, physically delete
573*0Sstevel@tonic-gate 	 * the entry from the page.  We clear the deleted flag before we call
574*0Sstevel@tonic-gate 	 * the underlying delete routine so that, if an error occurs, and we
575*0Sstevel@tonic-gate 	 * restore the cursor, the deleted flag is cleared.  This is because,
576*0Sstevel@tonic-gate 	 * if we manage to physically modify the page, and then restore the
577*0Sstevel@tonic-gate 	 * cursor, we might try to repeat the page modification when closing
578*0Sstevel@tonic-gate 	 * the cursor.
579*0Sstevel@tonic-gate 	 */
580*0Sstevel@tonic-gate 	if (F_ISSET(&copy, C_DELETED)) {
581*0Sstevel@tonic-gate 		F_CLR(&copy, C_DELETED);
582*0Sstevel@tonic-gate 		if ((ret = __bam_c_physdel(dbc, &copy, cp->page)) != 0)
583*0Sstevel@tonic-gate 			goto err;
584*0Sstevel@tonic-gate 	}
585*0Sstevel@tonic-gate 	F_CLR(cp, C_DELETED);
586*0Sstevel@tonic-gate 
587*0Sstevel@tonic-gate 	/* Release the previous lock, if any; the current lock is retained. */
588*0Sstevel@tonic-gate 	if (copy.lock != LOCK_INVALID)
589*0Sstevel@tonic-gate 		(void)__BT_TLPUT(dbc, copy.lock);
590*0Sstevel@tonic-gate 
591*0Sstevel@tonic-gate 	/* Release the current page. */
592*0Sstevel@tonic-gate 	if ((ret = memp_fput(dbp->mpf, cp->page, 0)) != 0)
593*0Sstevel@tonic-gate 		goto err;
594*0Sstevel@tonic-gate 
595*0Sstevel@tonic-gate 	if (0) {
596*0Sstevel@tonic-gate err:		if (cp->page != NULL)
597*0Sstevel@tonic-gate 			(void)memp_fput(dbp->mpf, cp->page, 0);
598*0Sstevel@tonic-gate 		if (cp->lock != LOCK_INVALID)
599*0Sstevel@tonic-gate 			(void)__BT_TLPUT(dbc, cp->lock);
600*0Sstevel@tonic-gate 		*cp = copy;
601*0Sstevel@tonic-gate 	}
602*0Sstevel@tonic-gate 
603*0Sstevel@tonic-gate 	/* Release temporary lock upgrade. */
604*0Sstevel@tonic-gate 	if (tmp_rmw)
605*0Sstevel@tonic-gate 		F_CLR(dbc, DBC_RMW);
606*0Sstevel@tonic-gate 
607*0Sstevel@tonic-gate 	return (ret);
608*0Sstevel@tonic-gate }
609*0Sstevel@tonic-gate 
610*0Sstevel@tonic-gate /*
611*0Sstevel@tonic-gate  * __bam_dsearch --
612*0Sstevel@tonic-gate  *	Search for a matching data item (or the first data item that's
613*0Sstevel@tonic-gate  *	equal to or greater than the one we're searching for).
614*0Sstevel@tonic-gate  */
615*0Sstevel@tonic-gate static int
__bam_dsearch(dbc,cp,data,iflagp)616*0Sstevel@tonic-gate __bam_dsearch(dbc, cp, data, iflagp)
617*0Sstevel@tonic-gate 	DBC *dbc;
618*0Sstevel@tonic-gate 	CURSOR *cp;
619*0Sstevel@tonic-gate 	DBT *data;
620*0Sstevel@tonic-gate 	u_int32_t *iflagp;
621*0Sstevel@tonic-gate {
622*0Sstevel@tonic-gate 	DB *dbp;
623*0Sstevel@tonic-gate 	CURSOR copy, last;
624*0Sstevel@tonic-gate 	int cmp, ret;
625*0Sstevel@tonic-gate 
626*0Sstevel@tonic-gate 	dbp = dbc->dbp;
627*0Sstevel@tonic-gate 
628*0Sstevel@tonic-gate 	/*
629*0Sstevel@tonic-gate 	 * If iflagp is non-NULL, we're doing an insert.
630*0Sstevel@tonic-gate 	 *
631*0Sstevel@tonic-gate 	 * If the duplicates are off-page, use the duplicate search routine.
632*0Sstevel@tonic-gate 	 */
633*0Sstevel@tonic-gate 	if (cp->dpgno != PGNO_INVALID) {
634*0Sstevel@tonic-gate 		if ((ret = __db_dsearch(dbc, iflagp != NULL,
635*0Sstevel@tonic-gate 		    data, cp->dpgno, &cp->dindx, &cp->page, &cmp)) != 0)
636*0Sstevel@tonic-gate 			return (ret);
637*0Sstevel@tonic-gate 		cp->dpgno = cp->page->pgno;
638*0Sstevel@tonic-gate 
639*0Sstevel@tonic-gate 		if (iflagp == NULL) {
640*0Sstevel@tonic-gate 			if (cmp != 0)
641*0Sstevel@tonic-gate 				return (DB_NOTFOUND);
642*0Sstevel@tonic-gate 			return (0);
643*0Sstevel@tonic-gate 		}
644*0Sstevel@tonic-gate 		*iflagp = DB_BEFORE;
645*0Sstevel@tonic-gate 		return (0);
646*0Sstevel@tonic-gate 	}
647*0Sstevel@tonic-gate 
648*0Sstevel@tonic-gate 	/* Otherwise, do the search ourselves. */
649*0Sstevel@tonic-gate 	copy = *cp;
650*0Sstevel@tonic-gate 	for (;;) {
651*0Sstevel@tonic-gate 		/* Save the last interesting cursor position. */
652*0Sstevel@tonic-gate 		last = *cp;
653*0Sstevel@tonic-gate 
654*0Sstevel@tonic-gate 		/* See if the data item matches the one we're looking for. */
655*0Sstevel@tonic-gate 		if ((cmp = __bam_cmp(dbp, data, cp->page, cp->indx + O_INDX,
656*0Sstevel@tonic-gate 		    dbp->dup_compare == NULL ?
657*0Sstevel@tonic-gate 		    __bam_defcmp : dbp->dup_compare)) == 0) {
658*0Sstevel@tonic-gate 			if (iflagp != NULL)
659*0Sstevel@tonic-gate 				*iflagp = DB_AFTER;
660*0Sstevel@tonic-gate 			return (0);
661*0Sstevel@tonic-gate 		}
662*0Sstevel@tonic-gate 
663*0Sstevel@tonic-gate 		/*
664*0Sstevel@tonic-gate 		 * If duplicate entries are sorted, we're done if we find a
665*0Sstevel@tonic-gate 		 * page entry that sorts greater than the application item.
666*0Sstevel@tonic-gate 		 * If doing an insert, return success, otherwise DB_NOTFOUND.
667*0Sstevel@tonic-gate 		 */
668*0Sstevel@tonic-gate 		if (dbp->dup_compare != NULL && cmp < 0) {
669*0Sstevel@tonic-gate 			if (iflagp == NULL)
670*0Sstevel@tonic-gate 				return (DB_NOTFOUND);
671*0Sstevel@tonic-gate 			*iflagp = DB_BEFORE;
672*0Sstevel@tonic-gate 			return (0);
673*0Sstevel@tonic-gate 		}
674*0Sstevel@tonic-gate 
675*0Sstevel@tonic-gate 		/*
676*0Sstevel@tonic-gate 		 * Move to the next item.  If we reach the end of the page and
677*0Sstevel@tonic-gate 		 * we're doing an insert, set the cursor to the last item and
678*0Sstevel@tonic-gate 		 * set the referenced memory location so callers know to insert
679*0Sstevel@tonic-gate 		 * after the item, instead of before it.  If not inserting, we
680*0Sstevel@tonic-gate 		 * return DB_NOTFOUND.
681*0Sstevel@tonic-gate 		 */
682*0Sstevel@tonic-gate 		if ((cp->indx += P_INDX) >= NUM_ENT(cp->page)) {
683*0Sstevel@tonic-gate 			if (iflagp == NULL)
684*0Sstevel@tonic-gate 				return (DB_NOTFOUND);
685*0Sstevel@tonic-gate 			goto use_last;
686*0Sstevel@tonic-gate 		}
687*0Sstevel@tonic-gate 
688*0Sstevel@tonic-gate 		/*
689*0Sstevel@tonic-gate 		 * Make sure we didn't go past the end of the duplicates.  The
690*0Sstevel@tonic-gate 		 * error conditions are the same as above.
691*0Sstevel@tonic-gate 		 */
692*0Sstevel@tonic-gate 		if (!POSSIBLE_DUPLICATE(cp, copy)) {
693*0Sstevel@tonic-gate 			if (iflagp == NULL)
694*0Sstevel@tonic-gate 				 return (DB_NOTFOUND);
695*0Sstevel@tonic-gate use_last:		*cp = last;
696*0Sstevel@tonic-gate 			*iflagp = DB_AFTER;
697*0Sstevel@tonic-gate 			return (0);
698*0Sstevel@tonic-gate 		}
699*0Sstevel@tonic-gate 	}
700*0Sstevel@tonic-gate 	/* NOTREACHED */
701*0Sstevel@tonic-gate }
702*0Sstevel@tonic-gate 
703*0Sstevel@tonic-gate /*
704*0Sstevel@tonic-gate  * __bam_c_rget --
705*0Sstevel@tonic-gate  *	Return the record number for a cursor.
706*0Sstevel@tonic-gate  */
707*0Sstevel@tonic-gate static int
__bam_c_rget(dbc,data,flags)708*0Sstevel@tonic-gate __bam_c_rget(dbc, data, flags)
709*0Sstevel@tonic-gate 	DBC *dbc;
710*0Sstevel@tonic-gate 	DBT *data;
711*0Sstevel@tonic-gate 	u_int32_t flags;
712*0Sstevel@tonic-gate {
713*0Sstevel@tonic-gate 	CURSOR *cp;
714*0Sstevel@tonic-gate 	DB *dbp;
715*0Sstevel@tonic-gate 	DBT dbt;
716*0Sstevel@tonic-gate 	db_recno_t recno;
717*0Sstevel@tonic-gate 	int exact, ret;
718*0Sstevel@tonic-gate 
719*0Sstevel@tonic-gate 	COMPQUIET(flags, 0);
720*0Sstevel@tonic-gate 	dbp = dbc->dbp;
721*0Sstevel@tonic-gate 	cp = dbc->internal;
722*0Sstevel@tonic-gate 
723*0Sstevel@tonic-gate 	/* Get the page with the current item on it. */
724*0Sstevel@tonic-gate 	if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &cp->page)) != 0)
725*0Sstevel@tonic-gate 		return (ret);
726*0Sstevel@tonic-gate 
727*0Sstevel@tonic-gate 	/* Get a copy of the key. */
728*0Sstevel@tonic-gate 	memset(&dbt, 0, sizeof(DBT));
729*0Sstevel@tonic-gate 	dbt.flags = DB_DBT_MALLOC | DB_DBT_INTERNAL;
730*0Sstevel@tonic-gate 	if ((ret = __db_ret(dbp, cp->page, cp->indx, &dbt, NULL, NULL)) != 0)
731*0Sstevel@tonic-gate 		goto err;
732*0Sstevel@tonic-gate 
733*0Sstevel@tonic-gate 	exact = 1;
734*0Sstevel@tonic-gate 	if ((ret = __bam_search(dbc, &dbt,
735*0Sstevel@tonic-gate 	    F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND,
736*0Sstevel@tonic-gate 	    1, &recno, &exact)) != 0)
737*0Sstevel@tonic-gate 		goto err;
738*0Sstevel@tonic-gate 
739*0Sstevel@tonic-gate 	ret = __db_retcopy(data, &recno, sizeof(recno),
740*0Sstevel@tonic-gate 	    &dbc->rdata.data, &dbc->rdata.ulen, dbp->db_malloc);
741*0Sstevel@tonic-gate 
742*0Sstevel@tonic-gate 	/* Release the stack. */
743*0Sstevel@tonic-gate 	__bam_stkrel(dbc, 0);
744*0Sstevel@tonic-gate 
745*0Sstevel@tonic-gate err:	(void)memp_fput(dbp->mpf, cp->page, 0);
746*0Sstevel@tonic-gate 	__os_free(dbt.data, dbt.size);
747*0Sstevel@tonic-gate 	return (ret);
748*0Sstevel@tonic-gate }
749*0Sstevel@tonic-gate 
750*0Sstevel@tonic-gate /*
751*0Sstevel@tonic-gate  * __bam_c_put --
752*0Sstevel@tonic-gate  *	Put using a cursor.
753*0Sstevel@tonic-gate  */
754*0Sstevel@tonic-gate static int
__bam_c_put(dbc,key,data,flags)755*0Sstevel@tonic-gate __bam_c_put(dbc, key, data, flags)
756*0Sstevel@tonic-gate 	DBC *dbc;
757*0Sstevel@tonic-gate 	DBT *key, *data;
758*0Sstevel@tonic-gate 	u_int32_t flags;
759*0Sstevel@tonic-gate {
760*0Sstevel@tonic-gate 	CURSOR *cp, copy;
761*0Sstevel@tonic-gate 	DB *dbp;
762*0Sstevel@tonic-gate 	DBT dbt;
763*0Sstevel@tonic-gate 	db_indx_t indx;
764*0Sstevel@tonic-gate 	db_pgno_t pgno;
765*0Sstevel@tonic-gate 	u_int32_t iiflags, iiop;
766*0Sstevel@tonic-gate 	int exact, needkey, ret, stack;
767*0Sstevel@tonic-gate 	void *arg;
768*0Sstevel@tonic-gate 
769*0Sstevel@tonic-gate 	dbp = dbc->dbp;
770*0Sstevel@tonic-gate 	cp = dbc->internal;
771*0Sstevel@tonic-gate 
772*0Sstevel@tonic-gate 	DB_PANIC_CHECK(dbp);
773*0Sstevel@tonic-gate 
774*0Sstevel@tonic-gate 	DEBUG_LWRITE(dbc, dbc->txn, "bam_c_put",
775*0Sstevel@tonic-gate 	    flags == DB_KEYFIRST || flags == DB_KEYLAST ? key : NULL,
776*0Sstevel@tonic-gate 	    data, flags);
777*0Sstevel@tonic-gate 
778*0Sstevel@tonic-gate 	if ((ret = __db_cputchk(dbp, key, data, flags,
779*0Sstevel@tonic-gate 	    F_ISSET(dbp, DB_AM_RDONLY), cp->pgno != PGNO_INVALID)) != 0)
780*0Sstevel@tonic-gate 		return (ret);
781*0Sstevel@tonic-gate 
782*0Sstevel@tonic-gate 	/*
783*0Sstevel@tonic-gate 	 * If we are running CDB, this had better be either a write
784*0Sstevel@tonic-gate 	 * cursor or an immediate writer.  If it's a regular writer,
785*0Sstevel@tonic-gate 	 * that means we have an IWRITE lock and we need to upgrade
786*0Sstevel@tonic-gate 	 * it to a write lock.
787*0Sstevel@tonic-gate 	 */
788*0Sstevel@tonic-gate 	if (F_ISSET(dbp, DB_AM_CDB)) {
789*0Sstevel@tonic-gate 		if (!F_ISSET(dbc, DBC_RMW | DBC_WRITER))
790*0Sstevel@tonic-gate 			return (EINVAL);
791*0Sstevel@tonic-gate 
792*0Sstevel@tonic-gate 		if (F_ISSET(dbc, DBC_RMW) &&
793*0Sstevel@tonic-gate 		    (ret = lock_get(dbp->dbenv->lk_info, dbc->locker,
794*0Sstevel@tonic-gate 		    DB_LOCK_UPGRADE, &dbc->lock_dbt, DB_LOCK_WRITE,
795*0Sstevel@tonic-gate 		    &dbc->mylock)) != 0)
796*0Sstevel@tonic-gate 			return (EAGAIN);
797*0Sstevel@tonic-gate 	}
798*0Sstevel@tonic-gate 
799*0Sstevel@tonic-gate 	if (0) {
800*0Sstevel@tonic-gate split:		/*
801*0Sstevel@tonic-gate 		 * To split, we need a valid key for the page.  Since it's a
802*0Sstevel@tonic-gate 		 * cursor, we have to build one.
803*0Sstevel@tonic-gate 		 *
804*0Sstevel@tonic-gate 		 * Acquire a copy of a key from the page.
805*0Sstevel@tonic-gate 		 */
806*0Sstevel@tonic-gate 		if (needkey) {
807*0Sstevel@tonic-gate 			memset(&dbt, 0, sizeof(DBT));
808*0Sstevel@tonic-gate 			if ((ret = __db_ret(dbp, cp->page, indx,
809*0Sstevel@tonic-gate 			    &dbt, &dbc->rkey.data, &dbc->rkey.ulen)) != 0)
810*0Sstevel@tonic-gate 				goto err;
811*0Sstevel@tonic-gate 			arg = &dbt;
812*0Sstevel@tonic-gate 		} else
813*0Sstevel@tonic-gate 			arg = key;
814*0Sstevel@tonic-gate 
815*0Sstevel@tonic-gate 		/*
816*0Sstevel@tonic-gate 		 * Discard any locks and pinned pages (the locks are discarded
817*0Sstevel@tonic-gate 		 * even if we're running with transactions, as they lock pages
818*0Sstevel@tonic-gate 		 * that we're sorry we ever acquired).  If stack is set and the
819*0Sstevel@tonic-gate 		 * cursor entries are valid, they point to the same entries as
820*0Sstevel@tonic-gate 		 * the stack, don't free them twice.
821*0Sstevel@tonic-gate 		 */
822*0Sstevel@tonic-gate 		if (stack) {
823*0Sstevel@tonic-gate 			(void)__bam_stkrel(dbc, 1);
824*0Sstevel@tonic-gate 			stack = 0;
825*0Sstevel@tonic-gate 		} else
826*0Sstevel@tonic-gate 			DISCARD(dbc, cp);
827*0Sstevel@tonic-gate 
828*0Sstevel@tonic-gate 		/*
829*0Sstevel@tonic-gate 		 * Restore the cursor to its original value.  This is necessary
830*0Sstevel@tonic-gate 		 * for two reasons.  First, we are about to copy it in case of
831*0Sstevel@tonic-gate 		 * error, again.  Second, we adjust cursors during the split,
832*0Sstevel@tonic-gate 		 * and we have to ensure this cursor is adjusted appropriately,
833*0Sstevel@tonic-gate 		 * along with all the other cursors.
834*0Sstevel@tonic-gate 		 */
835*0Sstevel@tonic-gate 		*cp = copy;
836*0Sstevel@tonic-gate 
837*0Sstevel@tonic-gate 		if ((ret = __bam_split(dbc, arg)) != 0)
838*0Sstevel@tonic-gate 			goto err;
839*0Sstevel@tonic-gate 	}
840*0Sstevel@tonic-gate 
841*0Sstevel@tonic-gate 	/*
842*0Sstevel@tonic-gate 	 * Initialize the cursor for a new retrieval.  Clear the cursor's
843*0Sstevel@tonic-gate 	 * page pointer, it was set before this operation, and no longer
844*0Sstevel@tonic-gate 	 * has any meaning.
845*0Sstevel@tonic-gate 	 */
846*0Sstevel@tonic-gate 	cp->page = NULL;
847*0Sstevel@tonic-gate 	copy = *cp;
848*0Sstevel@tonic-gate 	cp->lock = LOCK_INVALID;
849*0Sstevel@tonic-gate 
850*0Sstevel@tonic-gate 	iiflags = needkey = ret = stack = 0;
851*0Sstevel@tonic-gate 	switch (flags) {
852*0Sstevel@tonic-gate 	case DB_AFTER:
853*0Sstevel@tonic-gate 	case DB_BEFORE:
854*0Sstevel@tonic-gate 	case DB_CURRENT:
855*0Sstevel@tonic-gate 		needkey = 1;
856*0Sstevel@tonic-gate 		if (cp->dpgno == PGNO_INVALID) {
857*0Sstevel@tonic-gate 			pgno = cp->pgno;
858*0Sstevel@tonic-gate 			indx = cp->indx;
859*0Sstevel@tonic-gate 		} else {
860*0Sstevel@tonic-gate 			pgno = cp->dpgno;
861*0Sstevel@tonic-gate 			indx = cp->dindx;
862*0Sstevel@tonic-gate 		}
863*0Sstevel@tonic-gate 
864*0Sstevel@tonic-gate 		/*
865*0Sstevel@tonic-gate 		 * !!!
866*0Sstevel@tonic-gate 		 * This test is right -- we don't yet support duplicates and
867*0Sstevel@tonic-gate 		 * record numbers in the same tree, so ignore duplicates if
868*0Sstevel@tonic-gate 		 * DB_BT_RECNUM set.
869*0Sstevel@tonic-gate 		 */
870*0Sstevel@tonic-gate 		if (F_ISSET(dbp, DB_BT_RECNUM) &&
871*0Sstevel@tonic-gate 		    (flags != DB_CURRENT || F_ISSET(cp, C_DELETED))) {
872*0Sstevel@tonic-gate 			/* Acquire a complete stack. */
873*0Sstevel@tonic-gate 			if ((ret = __bam_c_getstack(dbc, cp)) != 0)
874*0Sstevel@tonic-gate 				goto err;
875*0Sstevel@tonic-gate 			cp->page = cp->csp->page;
876*0Sstevel@tonic-gate 
877*0Sstevel@tonic-gate 			stack = 1;
878*0Sstevel@tonic-gate 			iiflags = BI_DOINCR;
879*0Sstevel@tonic-gate 		} else {
880*0Sstevel@tonic-gate 			/* Acquire the current page. */
881*0Sstevel@tonic-gate 			if ((ret = __bam_lget(dbc,
882*0Sstevel@tonic-gate 			    0, cp->pgno, DB_LOCK_WRITE, &cp->lock)) == 0)
883*0Sstevel@tonic-gate 				ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page);
884*0Sstevel@tonic-gate 			if (ret != 0)
885*0Sstevel@tonic-gate 				goto err;
886*0Sstevel@tonic-gate 
887*0Sstevel@tonic-gate 			iiflags = 0;
888*0Sstevel@tonic-gate 		}
889*0Sstevel@tonic-gate 
890*0Sstevel@tonic-gate 		/*
891*0Sstevel@tonic-gate 		 * If the user has specified a duplicate comparison function,
892*0Sstevel@tonic-gate 		 * we return an error if DB_CURRENT was specified and the
893*0Sstevel@tonic-gate 		 * replacement data doesn't compare equal to the current data.
894*0Sstevel@tonic-gate 		 * This stops apps from screwing up the duplicate sort order.
895*0Sstevel@tonic-gate 		 */
896*0Sstevel@tonic-gate 		if (flags == DB_CURRENT && dbp->dup_compare != NULL)
897*0Sstevel@tonic-gate 			if (__bam_cmp(dbp, data,
898*0Sstevel@tonic-gate 			    cp->page, indx, dbp->dup_compare) != 0) {
899*0Sstevel@tonic-gate 				ret = EINVAL;
900*0Sstevel@tonic-gate 				goto err;
901*0Sstevel@tonic-gate 			}
902*0Sstevel@tonic-gate 
903*0Sstevel@tonic-gate 		iiop = flags;
904*0Sstevel@tonic-gate 		break;
905*0Sstevel@tonic-gate 	case DB_KEYFIRST:
906*0Sstevel@tonic-gate 	case DB_KEYLAST:
907*0Sstevel@tonic-gate 		/*
908*0Sstevel@tonic-gate 		 * If we have a duplicate comparison function, we position to
909*0Sstevel@tonic-gate 		 * the first of any on-page duplicates, and use __bam_dsearch
910*0Sstevel@tonic-gate 		 * to search for the right slot.  Otherwise, we position to
911*0Sstevel@tonic-gate 		 * the first/last of any on-page duplicates based on the flag
912*0Sstevel@tonic-gate 		 * value.
913*0Sstevel@tonic-gate 		 */
914*0Sstevel@tonic-gate 		if ((ret = __bam_c_search(dbc, cp, key,
915*0Sstevel@tonic-gate 		    flags == DB_KEYFIRST || dbp->dup_compare != NULL ?
916*0Sstevel@tonic-gate 		    DB_KEYFIRST : DB_KEYLAST, &exact)) != 0)
917*0Sstevel@tonic-gate 			goto err;
918*0Sstevel@tonic-gate 		stack = 1;
919*0Sstevel@tonic-gate 
920*0Sstevel@tonic-gate 		/*
921*0Sstevel@tonic-gate 		 * If an exact match:
922*0Sstevel@tonic-gate 		 *	If duplicates aren't supported, replace the current
923*0Sstevel@tonic-gate 		 *	item.  (When implementing the DB->put function, our
924*0Sstevel@tonic-gate 		 *	caller has already checked the DB_NOOVERWRITE flag.)
925*0Sstevel@tonic-gate 		 *
926*0Sstevel@tonic-gate 		 *	If there's a duplicate comparison function, find the
927*0Sstevel@tonic-gate 		 *	correct slot for this duplicate item.
928*0Sstevel@tonic-gate 		 *
929*0Sstevel@tonic-gate 		 *	If there's no duplicate comparison function, set the
930*0Sstevel@tonic-gate 		 *	insert flag based on the argument flags.
931*0Sstevel@tonic-gate 		 *
932*0Sstevel@tonic-gate 		 * If there's no match, the search function returned the
933*0Sstevel@tonic-gate 		 * smallest slot greater than the key, use it.
934*0Sstevel@tonic-gate 		 */
935*0Sstevel@tonic-gate 		if (exact) {
936*0Sstevel@tonic-gate 			if (F_ISSET(dbp, DB_AM_DUP)) {
937*0Sstevel@tonic-gate 				/*
938*0Sstevel@tonic-gate 				 * If at off-page duplicate page, move to the
939*0Sstevel@tonic-gate 				 * first or last entry -- if a comparison
940*0Sstevel@tonic-gate 				 * function was specified, start searching at
941*0Sstevel@tonic-gate 				 * the first entry.  Otherwise, move based on
942*0Sstevel@tonic-gate 				 * the DB_KEYFIRST/DB_KEYLAST flags.
943*0Sstevel@tonic-gate 				 */
944*0Sstevel@tonic-gate 				if ((ret = __bam_dup(dbc, cp, cp->indx,
945*0Sstevel@tonic-gate 				    dbp->dup_compare == NULL &&
946*0Sstevel@tonic-gate 				    flags != DB_KEYFIRST)) != 0)
947*0Sstevel@tonic-gate 					goto err;
948*0Sstevel@tonic-gate 
949*0Sstevel@tonic-gate 				/*
950*0Sstevel@tonic-gate 				 * If there's a comparison function, search for
951*0Sstevel@tonic-gate 				 * the correct slot.  Otherwise, set the insert
952*0Sstevel@tonic-gate 				 * flag based on the argment flag.
953*0Sstevel@tonic-gate 				 */
954*0Sstevel@tonic-gate 				if (dbp->dup_compare == NULL)
955*0Sstevel@tonic-gate 					iiop = flags == DB_KEYFIRST ?
956*0Sstevel@tonic-gate 					    DB_BEFORE : DB_AFTER;
957*0Sstevel@tonic-gate 				else
958*0Sstevel@tonic-gate 					if ((ret = __bam_dsearch(dbc,
959*0Sstevel@tonic-gate 					    cp, data, &iiop)) != 0)
960*0Sstevel@tonic-gate 						goto err;
961*0Sstevel@tonic-gate 			} else
962*0Sstevel@tonic-gate 				iiop = DB_CURRENT;
963*0Sstevel@tonic-gate 			iiflags = 0;
964*0Sstevel@tonic-gate 		} else {
965*0Sstevel@tonic-gate 			iiop = DB_BEFORE;
966*0Sstevel@tonic-gate 			iiflags = BI_NEWKEY;
967*0Sstevel@tonic-gate 		}
968*0Sstevel@tonic-gate 
969*0Sstevel@tonic-gate 		if (cp->dpgno == PGNO_INVALID) {
970*0Sstevel@tonic-gate 			pgno = cp->pgno;
971*0Sstevel@tonic-gate 			indx = cp->indx;
972*0Sstevel@tonic-gate 		} else {
973*0Sstevel@tonic-gate 			pgno = cp->dpgno;
974*0Sstevel@tonic-gate 			indx = cp->dindx;
975*0Sstevel@tonic-gate 		}
976*0Sstevel@tonic-gate 		break;
977*0Sstevel@tonic-gate 	}
978*0Sstevel@tonic-gate 
979*0Sstevel@tonic-gate 	ret = __bam_iitem(dbc, &cp->page, &indx, key, data, iiop, iiflags);
980*0Sstevel@tonic-gate 
981*0Sstevel@tonic-gate 	if (ret == DB_NEEDSPLIT)
982*0Sstevel@tonic-gate 		goto split;
983*0Sstevel@tonic-gate 	if (ret != 0)
984*0Sstevel@tonic-gate 		goto err;
985*0Sstevel@tonic-gate 
986*0Sstevel@tonic-gate 	/*
987*0Sstevel@tonic-gate 	 * Reset any cursors referencing this item that might have the item
988*0Sstevel@tonic-gate 	 * marked for deletion.
989*0Sstevel@tonic-gate 	 */
990*0Sstevel@tonic-gate 	if (iiop == DB_CURRENT) {
991*0Sstevel@tonic-gate 		(void)__bam_ca_delete(dbp, pgno, indx, 0);
992*0Sstevel@tonic-gate 
993*0Sstevel@tonic-gate 		/*
994*0Sstevel@tonic-gate 		 * It's also possible that we are the cursor that had the
995*0Sstevel@tonic-gate 		 * item marked for deletion, in which case we want to make
996*0Sstevel@tonic-gate 		 * sure that we don't delete it because we had the delete
997*0Sstevel@tonic-gate 		 * flag set already.
998*0Sstevel@tonic-gate 		 */
999*0Sstevel@tonic-gate 		if (cp->pgno == copy.pgno && cp->indx == copy.indx &&
1000*0Sstevel@tonic-gate 		    cp->dpgno == copy.dpgno && cp->dindx == copy.dindx)
1001*0Sstevel@tonic-gate 			F_CLR(&copy, C_DELETED);
1002*0Sstevel@tonic-gate 	}
1003*0Sstevel@tonic-gate 
1004*0Sstevel@tonic-gate 	/*
1005*0Sstevel@tonic-gate 	 * Update the cursor to point to the new entry.  The new entry was
1006*0Sstevel@tonic-gate 	 * stored on the current page, because we split pages until it was
1007*0Sstevel@tonic-gate 	 * possible.
1008*0Sstevel@tonic-gate 	 */
1009*0Sstevel@tonic-gate 	if (cp->dpgno == PGNO_INVALID)
1010*0Sstevel@tonic-gate 		cp->indx = indx;
1011*0Sstevel@tonic-gate 	else
1012*0Sstevel@tonic-gate 		cp->dindx = indx;
1013*0Sstevel@tonic-gate 
1014*0Sstevel@tonic-gate 	/*
1015*0Sstevel@tonic-gate 	 * If the previous cursor record has been deleted, physically delete
1016*0Sstevel@tonic-gate 	 * the entry from the page.  We clear the deleted flag before we call
1017*0Sstevel@tonic-gate 	 * the underlying delete routine so that, if an error occurs, and we
1018*0Sstevel@tonic-gate 	 * restore the cursor, the deleted flag is cleared.  This is because,
1019*0Sstevel@tonic-gate 	 * if we manage to physically modify the page, and then restore the
1020*0Sstevel@tonic-gate 	 * cursor, we might try to repeat the page modification when closing
1021*0Sstevel@tonic-gate 	 * the cursor.
1022*0Sstevel@tonic-gate 	 */
1023*0Sstevel@tonic-gate 	if (F_ISSET(&copy, C_DELETED)) {
1024*0Sstevel@tonic-gate 		F_CLR(&copy, C_DELETED);
1025*0Sstevel@tonic-gate 		if ((ret = __bam_c_physdel(dbc, &copy, cp->page)) != 0)
1026*0Sstevel@tonic-gate 			goto err;
1027*0Sstevel@tonic-gate 	}
1028*0Sstevel@tonic-gate 	F_CLR(cp, C_DELETED);
1029*0Sstevel@tonic-gate 
1030*0Sstevel@tonic-gate 	/* Release the previous lock, if any; the current lock is retained. */
1031*0Sstevel@tonic-gate 	if (copy.lock != LOCK_INVALID)
1032*0Sstevel@tonic-gate 		(void)__BT_TLPUT(dbc, copy.lock);
1033*0Sstevel@tonic-gate 
1034*0Sstevel@tonic-gate 	/*
1035*0Sstevel@tonic-gate 	 * Discard any pages pinned in the tree and their locks, except for
1036*0Sstevel@tonic-gate 	 * the leaf page, for which we only discard the pin, not the lock.
1037*0Sstevel@tonic-gate 	 *
1038*0Sstevel@tonic-gate 	 * Note, the leaf page participated in the stack we acquired, and so
1039*0Sstevel@tonic-gate 	 * we have to adjust the stack as necessary.  If there was only a
1040*0Sstevel@tonic-gate 	 * single page on the stack, we don't have to free further stack pages.
1041*0Sstevel@tonic-gate 	 */
1042*0Sstevel@tonic-gate 	if (stack && BT_STK_POP(cp) != NULL)
1043*0Sstevel@tonic-gate 		(void)__bam_stkrel(dbc, 0);
1044*0Sstevel@tonic-gate 
1045*0Sstevel@tonic-gate 	/* Release the current page. */
1046*0Sstevel@tonic-gate 	if ((ret = memp_fput(dbp->mpf, cp->page, 0)) != 0)
1047*0Sstevel@tonic-gate 		goto err;
1048*0Sstevel@tonic-gate 
1049*0Sstevel@tonic-gate 	if (0) {
1050*0Sstevel@tonic-gate err:		/* Discard any pinned pages. */
1051*0Sstevel@tonic-gate 		if (stack)
1052*0Sstevel@tonic-gate 			(void)__bam_stkrel(dbc, 0);
1053*0Sstevel@tonic-gate 		else
1054*0Sstevel@tonic-gate 			DISCARD(dbc, cp);
1055*0Sstevel@tonic-gate 		*cp = copy;
1056*0Sstevel@tonic-gate 	}
1057*0Sstevel@tonic-gate 
1058*0Sstevel@tonic-gate 	if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW))
1059*0Sstevel@tonic-gate 		(void)__lock_downgrade(dbp->dbenv->lk_info, dbc->mylock,
1060*0Sstevel@tonic-gate 		    DB_LOCK_IWRITE, 0);
1061*0Sstevel@tonic-gate 
1062*0Sstevel@tonic-gate 	return (ret);
1063*0Sstevel@tonic-gate }
1064*0Sstevel@tonic-gate 
1065*0Sstevel@tonic-gate /*
1066*0Sstevel@tonic-gate  * __bam_c_first --
1067*0Sstevel@tonic-gate  *	Return the first record.
1068*0Sstevel@tonic-gate  */
1069*0Sstevel@tonic-gate static int
__bam_c_first(dbc,cp)1070*0Sstevel@tonic-gate __bam_c_first(dbc, cp)
1071*0Sstevel@tonic-gate 	DBC *dbc;
1072*0Sstevel@tonic-gate 	CURSOR *cp;
1073*0Sstevel@tonic-gate {
1074*0Sstevel@tonic-gate 	DB *dbp;
1075*0Sstevel@tonic-gate 	db_pgno_t pgno;
1076*0Sstevel@tonic-gate 	int ret;
1077*0Sstevel@tonic-gate 
1078*0Sstevel@tonic-gate 	dbp = dbc->dbp;
1079*0Sstevel@tonic-gate 
1080*0Sstevel@tonic-gate 	/* Walk down the left-hand side of the tree. */
1081*0Sstevel@tonic-gate 	for (pgno = PGNO_ROOT;;) {
1082*0Sstevel@tonic-gate 		if ((ret =
1083*0Sstevel@tonic-gate 		    __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
1084*0Sstevel@tonic-gate 			return (ret);
1085*0Sstevel@tonic-gate 		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0)
1086*0Sstevel@tonic-gate 			return (ret);
1087*0Sstevel@tonic-gate 
1088*0Sstevel@tonic-gate 		/* If we find a leaf page, we're done. */
1089*0Sstevel@tonic-gate 		if (ISLEAF(cp->page))
1090*0Sstevel@tonic-gate 			break;
1091*0Sstevel@tonic-gate 
1092*0Sstevel@tonic-gate 		pgno = GET_BINTERNAL(cp->page, 0)->pgno;
1093*0Sstevel@tonic-gate 		DISCARD(dbc, cp);
1094*0Sstevel@tonic-gate 	}
1095*0Sstevel@tonic-gate 
1096*0Sstevel@tonic-gate 	cp->pgno = cp->page->pgno;
1097*0Sstevel@tonic-gate 	cp->indx = 0;
1098*0Sstevel@tonic-gate 	cp->dpgno = PGNO_INVALID;
1099*0Sstevel@tonic-gate 
1100*0Sstevel@tonic-gate 	/* Check for duplicates. */
1101*0Sstevel@tonic-gate 	if ((ret = __bam_dup(dbc, cp, cp->indx, 0)) != 0)
1102*0Sstevel@tonic-gate 		return (ret);
1103*0Sstevel@tonic-gate 
1104*0Sstevel@tonic-gate 	/* If on an empty page or a deleted record, move to the next one. */
1105*0Sstevel@tonic-gate 	if (NUM_ENT(cp->page) == 0 || IS_CUR_DELETED(cp))
1106*0Sstevel@tonic-gate 		if ((ret = __bam_c_next(dbc, cp, 0)) != 0)
1107*0Sstevel@tonic-gate 			return (ret);
1108*0Sstevel@tonic-gate 
1109*0Sstevel@tonic-gate 	return (0);
1110*0Sstevel@tonic-gate }
1111*0Sstevel@tonic-gate 
1112*0Sstevel@tonic-gate /*
1113*0Sstevel@tonic-gate  * __bam_c_last --
1114*0Sstevel@tonic-gate  *	Return the last record.
1115*0Sstevel@tonic-gate  */
1116*0Sstevel@tonic-gate static int
__bam_c_last(dbc,cp)1117*0Sstevel@tonic-gate __bam_c_last(dbc, cp)
1118*0Sstevel@tonic-gate 	DBC *dbc;
1119*0Sstevel@tonic-gate 	CURSOR *cp;
1120*0Sstevel@tonic-gate {
1121*0Sstevel@tonic-gate 	DB *dbp;
1122*0Sstevel@tonic-gate 	db_pgno_t pgno;
1123*0Sstevel@tonic-gate 	int ret;
1124*0Sstevel@tonic-gate 
1125*0Sstevel@tonic-gate 	dbp = dbc->dbp;
1126*0Sstevel@tonic-gate 
1127*0Sstevel@tonic-gate 	/* Walk down the right-hand side of the tree. */
1128*0Sstevel@tonic-gate 	for (pgno = PGNO_ROOT;;) {
1129*0Sstevel@tonic-gate 		if ((ret =
1130*0Sstevel@tonic-gate 		    __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
1131*0Sstevel@tonic-gate 			return (ret);
1132*0Sstevel@tonic-gate 		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0)
1133*0Sstevel@tonic-gate 			return (ret);
1134*0Sstevel@tonic-gate 
1135*0Sstevel@tonic-gate 		/* If we find a leaf page, we're done. */
1136*0Sstevel@tonic-gate 		if (ISLEAF(cp->page))
1137*0Sstevel@tonic-gate 			break;
1138*0Sstevel@tonic-gate 
1139*0Sstevel@tonic-gate 		pgno =
1140*0Sstevel@tonic-gate 		    GET_BINTERNAL(cp->page, NUM_ENT(cp->page) - O_INDX)->pgno;
1141*0Sstevel@tonic-gate 		DISCARD(dbc, cp);
1142*0Sstevel@tonic-gate 	}
1143*0Sstevel@tonic-gate 
1144*0Sstevel@tonic-gate 	cp->pgno = cp->page->pgno;
1145*0Sstevel@tonic-gate 	cp->indx = NUM_ENT(cp->page) == 0 ? 0 : NUM_ENT(cp->page) - P_INDX;
1146*0Sstevel@tonic-gate 	cp->dpgno = PGNO_INVALID;
1147*0Sstevel@tonic-gate 
1148*0Sstevel@tonic-gate 	/* Check for duplicates. */
1149*0Sstevel@tonic-gate 	if ((ret = __bam_dup(dbc, cp, cp->indx, 1)) != 0)
1150*0Sstevel@tonic-gate 		return (ret);
1151*0Sstevel@tonic-gate 
1152*0Sstevel@tonic-gate 	/* If on an empty page or a deleted record, move to the next one. */
1153*0Sstevel@tonic-gate 	if (NUM_ENT(cp->page) == 0 || IS_CUR_DELETED(cp))
1154*0Sstevel@tonic-gate 		if ((ret = __bam_c_prev(dbc, cp)) != 0)
1155*0Sstevel@tonic-gate 			return (ret);
1156*0Sstevel@tonic-gate 
1157*0Sstevel@tonic-gate 	return (0);
1158*0Sstevel@tonic-gate }
1159*0Sstevel@tonic-gate 
1160*0Sstevel@tonic-gate /*
1161*0Sstevel@tonic-gate  * __bam_c_next --
1162*0Sstevel@tonic-gate  *	Move to the next record.
1163*0Sstevel@tonic-gate  */
1164*0Sstevel@tonic-gate static int
__bam_c_next(dbc,cp,initial_move)1165*0Sstevel@tonic-gate __bam_c_next(dbc, cp, initial_move)
1166*0Sstevel@tonic-gate 	DBC *dbc;
1167*0Sstevel@tonic-gate 	CURSOR *cp;
1168*0Sstevel@tonic-gate 	int initial_move;
1169*0Sstevel@tonic-gate {
1170*0Sstevel@tonic-gate 	DB *dbp;
1171*0Sstevel@tonic-gate 	db_indx_t adjust, indx;
1172*0Sstevel@tonic-gate 	db_pgno_t pgno;
1173*0Sstevel@tonic-gate 	int ret;
1174*0Sstevel@tonic-gate 
1175*0Sstevel@tonic-gate 	dbp = dbc->dbp;
1176*0Sstevel@tonic-gate 
1177*0Sstevel@tonic-gate 	/*
1178*0Sstevel@tonic-gate 	 * We're either moving through a page of duplicates or a btree leaf
1179*0Sstevel@tonic-gate 	 * page.
1180*0Sstevel@tonic-gate 	 */
1181*0Sstevel@tonic-gate 	if (cp->dpgno == PGNO_INVALID) {
1182*0Sstevel@tonic-gate 		adjust = dbp->type == DB_BTREE ? P_INDX : O_INDX;
1183*0Sstevel@tonic-gate 		pgno = cp->pgno;
1184*0Sstevel@tonic-gate 		indx = cp->indx;
1185*0Sstevel@tonic-gate 	} else {
1186*0Sstevel@tonic-gate 		adjust = O_INDX;
1187*0Sstevel@tonic-gate 		pgno = cp->dpgno;
1188*0Sstevel@tonic-gate 		indx = cp->dindx;
1189*0Sstevel@tonic-gate 	}
1190*0Sstevel@tonic-gate 	if (cp->page == NULL) {
1191*0Sstevel@tonic-gate 		if ((ret =
1192*0Sstevel@tonic-gate 		    __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
1193*0Sstevel@tonic-gate 			return (ret);
1194*0Sstevel@tonic-gate 		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0)
1195*0Sstevel@tonic-gate 			return (ret);
1196*0Sstevel@tonic-gate 	}
1197*0Sstevel@tonic-gate 
1198*0Sstevel@tonic-gate 	/*
1199*0Sstevel@tonic-gate 	 * If at the end of the page, move to a subsequent page.
1200*0Sstevel@tonic-gate 	 *
1201*0Sstevel@tonic-gate 	 * !!!
1202*0Sstevel@tonic-gate 	 * Check for >= NUM_ENT.  If we're here as the result of a search that
1203*0Sstevel@tonic-gate 	 * landed us on NUM_ENT, we'll increment indx before we test.
1204*0Sstevel@tonic-gate 	 *
1205*0Sstevel@tonic-gate 	 * !!!
1206*0Sstevel@tonic-gate 	 * This code handles empty pages and pages with only deleted entries.
1207*0Sstevel@tonic-gate 	 */
1208*0Sstevel@tonic-gate 	if (initial_move)
1209*0Sstevel@tonic-gate 		indx += adjust;
1210*0Sstevel@tonic-gate 	for (;;) {
1211*0Sstevel@tonic-gate 		if (indx >= NUM_ENT(cp->page)) {
1212*0Sstevel@tonic-gate 			/*
1213*0Sstevel@tonic-gate 			 * If we're in a btree leaf page, we've reached the end
1214*0Sstevel@tonic-gate 			 * of the tree.  If we've reached the end of a page of
1215*0Sstevel@tonic-gate 			 * duplicates, continue from the btree leaf page where
1216*0Sstevel@tonic-gate 			 * we found this page of duplicates.
1217*0Sstevel@tonic-gate 			 */
1218*0Sstevel@tonic-gate 			pgno = cp->page->next_pgno;
1219*0Sstevel@tonic-gate 			if (pgno == PGNO_INVALID) {
1220*0Sstevel@tonic-gate 				/* If in a btree leaf page, it's EOF. */
1221*0Sstevel@tonic-gate 				if (cp->dpgno == PGNO_INVALID)
1222*0Sstevel@tonic-gate 					return (DB_NOTFOUND);
1223*0Sstevel@tonic-gate 
1224*0Sstevel@tonic-gate 				/* Continue from the last btree leaf page. */
1225*0Sstevel@tonic-gate 				cp->dpgno = PGNO_INVALID;
1226*0Sstevel@tonic-gate 
1227*0Sstevel@tonic-gate 				adjust = P_INDX;
1228*0Sstevel@tonic-gate 				pgno = cp->pgno;
1229*0Sstevel@tonic-gate 				indx = cp->indx + P_INDX;
1230*0Sstevel@tonic-gate 			} else
1231*0Sstevel@tonic-gate 				indx = 0;
1232*0Sstevel@tonic-gate 
1233*0Sstevel@tonic-gate 			DISCARD(dbc, cp);
1234*0Sstevel@tonic-gate 			if ((ret = __bam_lget(dbc,
1235*0Sstevel@tonic-gate 			    0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
1236*0Sstevel@tonic-gate 				return (ret);
1237*0Sstevel@tonic-gate 			if ((ret =
1238*0Sstevel@tonic-gate 			    memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0)
1239*0Sstevel@tonic-gate 				return (ret);
1240*0Sstevel@tonic-gate 			continue;
1241*0Sstevel@tonic-gate 		}
1242*0Sstevel@tonic-gate 
1243*0Sstevel@tonic-gate 		/* Ignore deleted records. */
1244*0Sstevel@tonic-gate 		if (IS_DELETED(cp, indx)) {
1245*0Sstevel@tonic-gate 			indx += adjust;
1246*0Sstevel@tonic-gate 			continue;
1247*0Sstevel@tonic-gate 		}
1248*0Sstevel@tonic-gate 
1249*0Sstevel@tonic-gate 		/*
1250*0Sstevel@tonic-gate 		 * If we're not in a duplicates page, check to see if we've
1251*0Sstevel@tonic-gate 		 * found a page of duplicates, in which case we move to the
1252*0Sstevel@tonic-gate 		 * first entry.
1253*0Sstevel@tonic-gate 		 */
1254*0Sstevel@tonic-gate 		if (cp->dpgno == PGNO_INVALID) {
1255*0Sstevel@tonic-gate 			cp->pgno = cp->page->pgno;
1256*0Sstevel@tonic-gate 			cp->indx = indx;
1257*0Sstevel@tonic-gate 
1258*0Sstevel@tonic-gate 			if ((ret = __bam_dup(dbc, cp, indx, 0)) != 0)
1259*0Sstevel@tonic-gate 				return (ret);
1260*0Sstevel@tonic-gate 			if (cp->dpgno != PGNO_INVALID) {
1261*0Sstevel@tonic-gate 				indx = cp->dindx;
1262*0Sstevel@tonic-gate 				adjust = O_INDX;
1263*0Sstevel@tonic-gate 				continue;
1264*0Sstevel@tonic-gate 			}
1265*0Sstevel@tonic-gate 		} else {
1266*0Sstevel@tonic-gate 			cp->dpgno = cp->page->pgno;
1267*0Sstevel@tonic-gate 			cp->dindx = indx;
1268*0Sstevel@tonic-gate 		}
1269*0Sstevel@tonic-gate 		break;
1270*0Sstevel@tonic-gate 	}
1271*0Sstevel@tonic-gate 	return (0);
1272*0Sstevel@tonic-gate }
1273*0Sstevel@tonic-gate 
1274*0Sstevel@tonic-gate /*
1275*0Sstevel@tonic-gate  * __bam_c_prev --
1276*0Sstevel@tonic-gate  *	Move to the previous record.
1277*0Sstevel@tonic-gate  */
1278*0Sstevel@tonic-gate static int
__bam_c_prev(dbc,cp)1279*0Sstevel@tonic-gate __bam_c_prev(dbc, cp)
1280*0Sstevel@tonic-gate 	DBC *dbc;
1281*0Sstevel@tonic-gate 	CURSOR *cp;
1282*0Sstevel@tonic-gate {
1283*0Sstevel@tonic-gate 	DB *dbp;
1284*0Sstevel@tonic-gate 	db_indx_t indx, adjust;
1285*0Sstevel@tonic-gate 	db_pgno_t pgno;
1286*0Sstevel@tonic-gate 	int ret, set_indx;
1287*0Sstevel@tonic-gate 
1288*0Sstevel@tonic-gate 	dbp = dbc->dbp;
1289*0Sstevel@tonic-gate 
1290*0Sstevel@tonic-gate 	/*
1291*0Sstevel@tonic-gate 	 * We're either moving through a page of duplicates or a btree leaf
1292*0Sstevel@tonic-gate 	 * page.
1293*0Sstevel@tonic-gate 	 */
1294*0Sstevel@tonic-gate 	if (cp->dpgno == PGNO_INVALID) {
1295*0Sstevel@tonic-gate 		adjust = dbp->type == DB_BTREE ? P_INDX : O_INDX;
1296*0Sstevel@tonic-gate 		pgno = cp->pgno;
1297*0Sstevel@tonic-gate 		indx = cp->indx;
1298*0Sstevel@tonic-gate 	} else {
1299*0Sstevel@tonic-gate 		adjust = O_INDX;
1300*0Sstevel@tonic-gate 		pgno = cp->dpgno;
1301*0Sstevel@tonic-gate 		indx = cp->dindx;
1302*0Sstevel@tonic-gate 	}
1303*0Sstevel@tonic-gate 	if (cp->page == NULL) {
1304*0Sstevel@tonic-gate 		if ((ret =
1305*0Sstevel@tonic-gate 		    __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
1306*0Sstevel@tonic-gate 			return (ret);
1307*0Sstevel@tonic-gate 		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0)
1308*0Sstevel@tonic-gate 			return (ret);
1309*0Sstevel@tonic-gate 	}
1310*0Sstevel@tonic-gate 
1311*0Sstevel@tonic-gate 	/*
1312*0Sstevel@tonic-gate 	 * If at the beginning of the page, move to any previous one.
1313*0Sstevel@tonic-gate 	 *
1314*0Sstevel@tonic-gate 	 * !!!
1315*0Sstevel@tonic-gate 	 * This code handles empty pages and pages with only deleted entries.
1316*0Sstevel@tonic-gate 	 */
1317*0Sstevel@tonic-gate 	for (;;) {
1318*0Sstevel@tonic-gate 		if (indx == 0) {
1319*0Sstevel@tonic-gate 			/*
1320*0Sstevel@tonic-gate 			 * If we're in a btree leaf page, we've reached the
1321*0Sstevel@tonic-gate 			 * beginning of the tree.  If we've reached the first
1322*0Sstevel@tonic-gate 			 * of a page of duplicates, continue from the btree
1323*0Sstevel@tonic-gate 			 * leaf page where we found this page of duplicates.
1324*0Sstevel@tonic-gate 			 */
1325*0Sstevel@tonic-gate 			pgno = cp->page->prev_pgno;
1326*0Sstevel@tonic-gate 			if (pgno == PGNO_INVALID) {
1327*0Sstevel@tonic-gate 				/* If in a btree leaf page, it's SOF. */
1328*0Sstevel@tonic-gate 				if (cp->dpgno == PGNO_INVALID)
1329*0Sstevel@tonic-gate 					return (DB_NOTFOUND);
1330*0Sstevel@tonic-gate 
1331*0Sstevel@tonic-gate 				/* Continue from the last btree leaf page. */
1332*0Sstevel@tonic-gate 				cp->dpgno = PGNO_INVALID;
1333*0Sstevel@tonic-gate 
1334*0Sstevel@tonic-gate 				adjust = P_INDX;
1335*0Sstevel@tonic-gate 				pgno = cp->pgno;
1336*0Sstevel@tonic-gate 				indx = cp->indx;
1337*0Sstevel@tonic-gate 				set_indx = 0;
1338*0Sstevel@tonic-gate 			} else
1339*0Sstevel@tonic-gate 				set_indx = 1;
1340*0Sstevel@tonic-gate 
1341*0Sstevel@tonic-gate 			DISCARD(dbc, cp);
1342*0Sstevel@tonic-gate 			if ((ret = __bam_lget(dbc,
1343*0Sstevel@tonic-gate 			    0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
1344*0Sstevel@tonic-gate 				return (ret);
1345*0Sstevel@tonic-gate 			if ((ret =
1346*0Sstevel@tonic-gate 			    memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0)
1347*0Sstevel@tonic-gate 				return (ret);
1348*0Sstevel@tonic-gate 
1349*0Sstevel@tonic-gate 			if (set_indx)
1350*0Sstevel@tonic-gate 				indx = NUM_ENT(cp->page);
1351*0Sstevel@tonic-gate 			if (indx == 0)
1352*0Sstevel@tonic-gate 				continue;
1353*0Sstevel@tonic-gate 		}
1354*0Sstevel@tonic-gate 
1355*0Sstevel@tonic-gate 		/* Ignore deleted records. */
1356*0Sstevel@tonic-gate 		indx -= adjust;
1357*0Sstevel@tonic-gate 		if (IS_DELETED(cp, indx))
1358*0Sstevel@tonic-gate 			continue;
1359*0Sstevel@tonic-gate 
1360*0Sstevel@tonic-gate 		/*
1361*0Sstevel@tonic-gate 		 * If we're not in a duplicates page, check to see if we've
1362*0Sstevel@tonic-gate 		 * found a page of duplicates, in which case we move to the
1363*0Sstevel@tonic-gate 		 * last entry.
1364*0Sstevel@tonic-gate 		 */
1365*0Sstevel@tonic-gate 		if (cp->dpgno == PGNO_INVALID) {
1366*0Sstevel@tonic-gate 			cp->pgno = cp->page->pgno;
1367*0Sstevel@tonic-gate 			cp->indx = indx;
1368*0Sstevel@tonic-gate 
1369*0Sstevel@tonic-gate 			if ((ret = __bam_dup(dbc, cp, indx, 1)) != 0)
1370*0Sstevel@tonic-gate 				return (ret);
1371*0Sstevel@tonic-gate 			if (cp->dpgno != PGNO_INVALID) {
1372*0Sstevel@tonic-gate 				indx = cp->dindx + O_INDX;
1373*0Sstevel@tonic-gate 				adjust = O_INDX;
1374*0Sstevel@tonic-gate 				continue;
1375*0Sstevel@tonic-gate 			}
1376*0Sstevel@tonic-gate 		} else {
1377*0Sstevel@tonic-gate 			cp->dpgno = cp->page->pgno;
1378*0Sstevel@tonic-gate 			cp->dindx = indx;
1379*0Sstevel@tonic-gate 		}
1380*0Sstevel@tonic-gate 		break;
1381*0Sstevel@tonic-gate 	}
1382*0Sstevel@tonic-gate 	return (0);
1383*0Sstevel@tonic-gate }
1384*0Sstevel@tonic-gate 
1385*0Sstevel@tonic-gate /*
1386*0Sstevel@tonic-gate  * __bam_c_search --
1387*0Sstevel@tonic-gate  *	Move to a specified record.
1388*0Sstevel@tonic-gate  */
1389*0Sstevel@tonic-gate static int
__bam_c_search(dbc,cp,key,flags,exactp)1390*0Sstevel@tonic-gate __bam_c_search(dbc, cp, key, flags, exactp)
1391*0Sstevel@tonic-gate 	DBC *dbc;
1392*0Sstevel@tonic-gate 	CURSOR *cp;
1393*0Sstevel@tonic-gate 	const DBT *key;
1394*0Sstevel@tonic-gate 	u_int32_t flags;
1395*0Sstevel@tonic-gate 	int *exactp;
1396*0Sstevel@tonic-gate {
1397*0Sstevel@tonic-gate 	BTREE *t;
1398*0Sstevel@tonic-gate 	DB *dbp;
1399*0Sstevel@tonic-gate 	DB_LOCK lock;
1400*0Sstevel@tonic-gate 	PAGE *h;
1401*0Sstevel@tonic-gate 	db_recno_t recno;
1402*0Sstevel@tonic-gate 	db_indx_t indx;
1403*0Sstevel@tonic-gate 	u_int32_t sflags;
1404*0Sstevel@tonic-gate 	int cmp, needexact, ret;
1405*0Sstevel@tonic-gate 
1406*0Sstevel@tonic-gate 	dbp = dbc->dbp;
1407*0Sstevel@tonic-gate 	t = dbp->internal;
1408*0Sstevel@tonic-gate 
1409*0Sstevel@tonic-gate 	/* Find an entry in the database. */
1410*0Sstevel@tonic-gate 	switch (flags) {
1411*0Sstevel@tonic-gate 	case DB_SET_RECNO:
1412*0Sstevel@tonic-gate 		if ((ret = __ram_getno(dbc, key, &recno, 0)) != 0)
1413*0Sstevel@tonic-gate 			return (ret);
1414*0Sstevel@tonic-gate 		sflags = F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND;
1415*0Sstevel@tonic-gate 		needexact = *exactp = 1;
1416*0Sstevel@tonic-gate 		ret = __bam_rsearch(dbc, &recno, sflags, 1, exactp);
1417*0Sstevel@tonic-gate 		break;
1418*0Sstevel@tonic-gate 	case DB_SET:
1419*0Sstevel@tonic-gate 	case DB_GET_BOTH:
1420*0Sstevel@tonic-gate 		sflags = F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND;
1421*0Sstevel@tonic-gate 		needexact = *exactp = 1;
1422*0Sstevel@tonic-gate 		goto search;
1423*0Sstevel@tonic-gate 	case DB_SET_RANGE:
1424*0Sstevel@tonic-gate 		sflags = F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND;
1425*0Sstevel@tonic-gate 		needexact = *exactp = 0;
1426*0Sstevel@tonic-gate 		goto search;
1427*0Sstevel@tonic-gate 	case DB_KEYFIRST:
1428*0Sstevel@tonic-gate 		sflags = S_KEYFIRST;
1429*0Sstevel@tonic-gate 		goto fast_search;
1430*0Sstevel@tonic-gate 	case DB_KEYLAST:
1431*0Sstevel@tonic-gate 		sflags = S_KEYLAST;
1432*0Sstevel@tonic-gate fast_search:	needexact = *exactp = 0;
1433*0Sstevel@tonic-gate 		/*
1434*0Sstevel@tonic-gate 		 * If the application has a history of inserting into the first
1435*0Sstevel@tonic-gate 		 * or last pages of the database, we check those pages first to
1436*0Sstevel@tonic-gate 		 * avoid doing a full search.
1437*0Sstevel@tonic-gate 		 *
1438*0Sstevel@tonic-gate 		 * Record numbers can't be fast-tracked, the entire tree has to
1439*0Sstevel@tonic-gate 		 * be locked.
1440*0Sstevel@tonic-gate 		 */
1441*0Sstevel@tonic-gate 		h = NULL;
1442*0Sstevel@tonic-gate 		lock = LOCK_INVALID;
1443*0Sstevel@tonic-gate 		if (F_ISSET(dbp, DB_BT_RECNUM))
1444*0Sstevel@tonic-gate 			goto search;
1445*0Sstevel@tonic-gate 
1446*0Sstevel@tonic-gate 		/* Check if the application has a history of sorted input. */
1447*0Sstevel@tonic-gate 		if (t->bt_lpgno == PGNO_INVALID)
1448*0Sstevel@tonic-gate 			goto search;
1449*0Sstevel@tonic-gate 
1450*0Sstevel@tonic-gate 		/*
1451*0Sstevel@tonic-gate 		 * Lock and retrieve the page on which we did the last insert.
1452*0Sstevel@tonic-gate 		 * It's okay if it doesn't exist, or if it's not the page type
1453*0Sstevel@tonic-gate 		 * we expected, it just means that the world changed.
1454*0Sstevel@tonic-gate 		 */
1455*0Sstevel@tonic-gate 		if (__bam_lget(dbc, 0, t->bt_lpgno, DB_LOCK_WRITE, &lock))
1456*0Sstevel@tonic-gate 			goto fast_miss;
1457*0Sstevel@tonic-gate 		if (memp_fget(dbp->mpf, &t->bt_lpgno, 0, &h))
1458*0Sstevel@tonic-gate 			goto fast_miss;
1459*0Sstevel@tonic-gate 		if (TYPE(h) != P_LBTREE)
1460*0Sstevel@tonic-gate 			goto fast_miss;
1461*0Sstevel@tonic-gate 		if (NUM_ENT(h) == 0)
1462*0Sstevel@tonic-gate 			goto fast_miss;
1463*0Sstevel@tonic-gate 
1464*0Sstevel@tonic-gate 		/*
1465*0Sstevel@tonic-gate 		 * What we do here is test to see if we're at the beginning or
1466*0Sstevel@tonic-gate 		 * end of the tree and if the new item sorts before/after the
1467*0Sstevel@tonic-gate 		 * first/last page entry.  We don't try and catch inserts into
1468*0Sstevel@tonic-gate 		 * the middle of the tree (although we could, as long as there
1469*0Sstevel@tonic-gate 		 * were two keys on the page and we saved both the index and
1470*0Sstevel@tonic-gate 		 * the page number of the last insert).
1471*0Sstevel@tonic-gate 		 */
1472*0Sstevel@tonic-gate 		if (h->next_pgno == PGNO_INVALID) {
1473*0Sstevel@tonic-gate 			indx = NUM_ENT(h) - P_INDX;
1474*0Sstevel@tonic-gate 			if ((cmp =
1475*0Sstevel@tonic-gate 			    __bam_cmp(dbp, key, h, indx, t->bt_compare)) < 0)
1476*0Sstevel@tonic-gate 				goto try_begin;
1477*0Sstevel@tonic-gate 			if (cmp > 0) {
1478*0Sstevel@tonic-gate 				indx += P_INDX;
1479*0Sstevel@tonic-gate 				goto fast_hit;
1480*0Sstevel@tonic-gate 			}
1481*0Sstevel@tonic-gate 
1482*0Sstevel@tonic-gate 			/*
1483*0Sstevel@tonic-gate 			 * Found a duplicate.  If doing DB_KEYLAST, we're at
1484*0Sstevel@tonic-gate 			 * the correct position, otherwise, move to the first
1485*0Sstevel@tonic-gate 			 * of the duplicates.
1486*0Sstevel@tonic-gate 			 */
1487*0Sstevel@tonic-gate 			if (flags == DB_KEYLAST)
1488*0Sstevel@tonic-gate 				goto fast_hit;
1489*0Sstevel@tonic-gate 			for (;
1490*0Sstevel@tonic-gate 			    indx > 0 && h->inp[indx - P_INDX] == h->inp[indx];
1491*0Sstevel@tonic-gate 			    indx -= P_INDX)
1492*0Sstevel@tonic-gate 				;
1493*0Sstevel@tonic-gate 			goto fast_hit;
1494*0Sstevel@tonic-gate 		}
1495*0Sstevel@tonic-gate try_begin:	if (h->prev_pgno == PGNO_INVALID) {
1496*0Sstevel@tonic-gate 			indx = 0;
1497*0Sstevel@tonic-gate 			if ((cmp =
1498*0Sstevel@tonic-gate 			    __bam_cmp(dbp, key, h, indx, t->bt_compare)) > 0)
1499*0Sstevel@tonic-gate 				goto fast_miss;
1500*0Sstevel@tonic-gate 			if (cmp < 0)
1501*0Sstevel@tonic-gate 				goto fast_hit;
1502*0Sstevel@tonic-gate 			/*
1503*0Sstevel@tonic-gate 			 * Found a duplicate.  If doing DB_KEYFIRST, we're at
1504*0Sstevel@tonic-gate 			 * the correct position, otherwise, move to the last
1505*0Sstevel@tonic-gate 			 * of the duplicates.
1506*0Sstevel@tonic-gate 			 */
1507*0Sstevel@tonic-gate 			if (flags == DB_KEYFIRST)
1508*0Sstevel@tonic-gate 				goto fast_hit;
1509*0Sstevel@tonic-gate 			for (;
1510*0Sstevel@tonic-gate 			    indx < (db_indx_t)(NUM_ENT(h) - P_INDX) &&
1511*0Sstevel@tonic-gate 			    h->inp[indx] == h->inp[indx + P_INDX];
1512*0Sstevel@tonic-gate 			    indx += P_INDX)
1513*0Sstevel@tonic-gate 				;
1514*0Sstevel@tonic-gate 			goto fast_hit;
1515*0Sstevel@tonic-gate 		}
1516*0Sstevel@tonic-gate 		goto fast_miss;
1517*0Sstevel@tonic-gate 
1518*0Sstevel@tonic-gate fast_hit:	/* Set the exact match flag, we may have found a duplicate. */
1519*0Sstevel@tonic-gate 		*exactp = cmp == 0;
1520*0Sstevel@tonic-gate 
1521*0Sstevel@tonic-gate 		/* Enter the entry in the stack. */
1522*0Sstevel@tonic-gate 		BT_STK_CLR(cp);
1523*0Sstevel@tonic-gate 		BT_STK_ENTER(cp, h, indx, lock, ret);
1524*0Sstevel@tonic-gate 		break;
1525*0Sstevel@tonic-gate 
1526*0Sstevel@tonic-gate fast_miss:	if (h != NULL)
1527*0Sstevel@tonic-gate 			(void)memp_fput(dbp->mpf, h, 0);
1528*0Sstevel@tonic-gate 		if (lock != LOCK_INVALID)
1529*0Sstevel@tonic-gate 			(void)__BT_LPUT(dbc, lock);
1530*0Sstevel@tonic-gate 
1531*0Sstevel@tonic-gate search:		ret = __bam_search(dbc, key, sflags, 1, NULL, exactp);
1532*0Sstevel@tonic-gate 		break;
1533*0Sstevel@tonic-gate 	default:				/* XXX: Impossible. */
1534*0Sstevel@tonic-gate 		abort();
1535*0Sstevel@tonic-gate 		/* NOTREACHED */
1536*0Sstevel@tonic-gate 	}
1537*0Sstevel@tonic-gate 	if (ret != 0)
1538*0Sstevel@tonic-gate 		return (ret);
1539*0Sstevel@tonic-gate 
1540*0Sstevel@tonic-gate 	/*
1541*0Sstevel@tonic-gate 	 * Initialize the cursor to reference it.  This has to be done
1542*0Sstevel@tonic-gate 	 * before we return (even with DB_NOTFOUND) because we have to
1543*0Sstevel@tonic-gate 	 * free the page(s) we locked in __bam_search.
1544*0Sstevel@tonic-gate 	 */
1545*0Sstevel@tonic-gate 	cp->page = cp->csp->page;
1546*0Sstevel@tonic-gate 	cp->pgno = cp->csp->page->pgno;
1547*0Sstevel@tonic-gate 	cp->indx = cp->csp->indx;
1548*0Sstevel@tonic-gate 	cp->lock = cp->csp->lock;
1549*0Sstevel@tonic-gate 	cp->dpgno = PGNO_INVALID;
1550*0Sstevel@tonic-gate 
1551*0Sstevel@tonic-gate 	/*
1552*0Sstevel@tonic-gate 	 * If we inserted a key into the first or last slot of the tree,
1553*0Sstevel@tonic-gate 	 * remember where it was so we can do it more quickly next time.
1554*0Sstevel@tonic-gate 	 */
1555*0Sstevel@tonic-gate 	if (flags == DB_KEYFIRST || flags == DB_KEYLAST)
1556*0Sstevel@tonic-gate 		t->bt_lpgno =
1557*0Sstevel@tonic-gate 		    ((cp->page->next_pgno == PGNO_INVALID &&
1558*0Sstevel@tonic-gate 		    cp->indx >= NUM_ENT(cp->page)) ||
1559*0Sstevel@tonic-gate 		    (cp->page->prev_pgno == PGNO_INVALID && cp->indx == 0)) ?
1560*0Sstevel@tonic-gate 		    cp->pgno : PGNO_INVALID;
1561*0Sstevel@tonic-gate 
1562*0Sstevel@tonic-gate 	/* If we need an exact match and didn't find one, we're done. */
1563*0Sstevel@tonic-gate 	if (needexact && *exactp == 0)
1564*0Sstevel@tonic-gate 		return (DB_NOTFOUND);
1565*0Sstevel@tonic-gate 
1566*0Sstevel@tonic-gate 	return (0);
1567*0Sstevel@tonic-gate }
1568*0Sstevel@tonic-gate 
1569*0Sstevel@tonic-gate /*
1570*0Sstevel@tonic-gate  * __bam_dup --
1571*0Sstevel@tonic-gate  *	Check for an off-page duplicates entry, and if found, move to the
1572*0Sstevel@tonic-gate  *	first or last entry.
1573*0Sstevel@tonic-gate  *
1574*0Sstevel@tonic-gate  * PUBLIC: int __bam_dup __P((DBC *, CURSOR *, u_int32_t, int));
1575*0Sstevel@tonic-gate  */
1576*0Sstevel@tonic-gate int
__bam_dup(dbc,cp,indx,last_dup)1577*0Sstevel@tonic-gate __bam_dup(dbc, cp, indx, last_dup)
1578*0Sstevel@tonic-gate 	DBC *dbc;
1579*0Sstevel@tonic-gate 	CURSOR *cp;
1580*0Sstevel@tonic-gate 	u_int32_t indx;
1581*0Sstevel@tonic-gate 	int last_dup;
1582*0Sstevel@tonic-gate {
1583*0Sstevel@tonic-gate 	BOVERFLOW *bo;
1584*0Sstevel@tonic-gate 	DB *dbp;
1585*0Sstevel@tonic-gate 	db_pgno_t pgno;
1586*0Sstevel@tonic-gate 	int ret;
1587*0Sstevel@tonic-gate 
1588*0Sstevel@tonic-gate 	dbp = dbc->dbp;
1589*0Sstevel@tonic-gate 
1590*0Sstevel@tonic-gate 	/*
1591*0Sstevel@tonic-gate 	 * Check for an overflow entry.  If we find one, move to the
1592*0Sstevel@tonic-gate 	 * duplicates page, and optionally move to the last record on
1593*0Sstevel@tonic-gate 	 * that page.
1594*0Sstevel@tonic-gate 	 *
1595*0Sstevel@tonic-gate 	 * !!!
1596*0Sstevel@tonic-gate 	 * We don't lock duplicates pages, we've already got the correct
1597*0Sstevel@tonic-gate 	 * lock on the main page.
1598*0Sstevel@tonic-gate 	 */
1599*0Sstevel@tonic-gate 	bo = GET_BOVERFLOW(cp->page, indx + O_INDX);
1600*0Sstevel@tonic-gate 	if (B_TYPE(bo->type) != B_DUPLICATE)
1601*0Sstevel@tonic-gate 		return (0);
1602*0Sstevel@tonic-gate 
1603*0Sstevel@tonic-gate 	pgno = bo->pgno;
1604*0Sstevel@tonic-gate 	if ((ret = memp_fput(dbp->mpf, cp->page, 0)) != 0)
1605*0Sstevel@tonic-gate 		return (ret);
1606*0Sstevel@tonic-gate 	cp->page = NULL;
1607*0Sstevel@tonic-gate 	if (last_dup) {
1608*0Sstevel@tonic-gate 		if ((ret = __db_dend(dbc, pgno, &cp->page)) != 0)
1609*0Sstevel@tonic-gate 			return (ret);
1610*0Sstevel@tonic-gate 		indx = NUM_ENT(cp->page) - O_INDX;
1611*0Sstevel@tonic-gate 	} else {
1612*0Sstevel@tonic-gate 		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0)
1613*0Sstevel@tonic-gate 			return (ret);
1614*0Sstevel@tonic-gate 		indx = 0;
1615*0Sstevel@tonic-gate 	}
1616*0Sstevel@tonic-gate 
1617*0Sstevel@tonic-gate 	/* Update the cursor's duplicate information. */
1618*0Sstevel@tonic-gate 	cp->dpgno = cp->page->pgno;
1619*0Sstevel@tonic-gate 	cp->dindx = indx;
1620*0Sstevel@tonic-gate 
1621*0Sstevel@tonic-gate 	return (0);
1622*0Sstevel@tonic-gate }
1623*0Sstevel@tonic-gate 
1624*0Sstevel@tonic-gate /*
1625*0Sstevel@tonic-gate  * __bam_c_physdel --
1626*0Sstevel@tonic-gate  *	Actually do the cursor deletion.
1627*0Sstevel@tonic-gate  */
1628*0Sstevel@tonic-gate static int
__bam_c_physdel(dbc,cp,h)1629*0Sstevel@tonic-gate __bam_c_physdel(dbc, cp, h)
1630*0Sstevel@tonic-gate 	DBC *dbc;
1631*0Sstevel@tonic-gate 	CURSOR *cp;
1632*0Sstevel@tonic-gate 	PAGE *h;
1633*0Sstevel@tonic-gate {
1634*0Sstevel@tonic-gate 	enum { DELETE_ITEM, DELETE_PAGE, NOTHING_FURTHER } cmd;
1635*0Sstevel@tonic-gate 	BOVERFLOW bo;
1636*0Sstevel@tonic-gate 	DB *dbp;
1637*0Sstevel@tonic-gate 	DBT dbt;
1638*0Sstevel@tonic-gate 	DB_LOCK lock;
1639*0Sstevel@tonic-gate 	db_indx_t indx;
1640*0Sstevel@tonic-gate 	db_pgno_t pgno, next_pgno, prev_pgno;
1641*0Sstevel@tonic-gate 	int delete_page, local_page, ret;
1642*0Sstevel@tonic-gate 
1643*0Sstevel@tonic-gate 	dbp = dbc->dbp;
1644*0Sstevel@tonic-gate 
1645*0Sstevel@tonic-gate 	delete_page = ret = 0;
1646*0Sstevel@tonic-gate 
1647*0Sstevel@tonic-gate 	/* Figure out what we're deleting. */
1648*0Sstevel@tonic-gate 	if (cp->dpgno == PGNO_INVALID) {
1649*0Sstevel@tonic-gate 		pgno = cp->pgno;
1650*0Sstevel@tonic-gate 		indx = cp->indx;
1651*0Sstevel@tonic-gate 	} else {
1652*0Sstevel@tonic-gate 		pgno = cp->dpgno;
1653*0Sstevel@tonic-gate 		indx = cp->dindx;
1654*0Sstevel@tonic-gate 	}
1655*0Sstevel@tonic-gate 
1656*0Sstevel@tonic-gate 	/*
1657*0Sstevel@tonic-gate 	 * If the item is referenced by another cursor, set that cursor's
1658*0Sstevel@tonic-gate 	 * delete flag and leave it up to it to do the delete.
1659*0Sstevel@tonic-gate 	 *
1660*0Sstevel@tonic-gate 	 * !!!
1661*0Sstevel@tonic-gate 	 * This test for > 0 is a tricky.  There are two ways that we can
1662*0Sstevel@tonic-gate 	 * be called here.  Either we are closing the cursor or we've moved
1663*0Sstevel@tonic-gate 	 * off the page with the deleted entry.  In the first case, we've
1664*0Sstevel@tonic-gate 	 * already removed the cursor from the active queue, so we won't see
1665*0Sstevel@tonic-gate 	 * it in __bam_ca_delete. In the second case, it will be on a different
1666*0Sstevel@tonic-gate 	 * item, so we won't bother with it in __bam_ca_delete.
1667*0Sstevel@tonic-gate 	 */
1668*0Sstevel@tonic-gate 	if (__bam_ca_delete(dbp, pgno, indx, 1) > 0)
1669*0Sstevel@tonic-gate 		return (0);
1670*0Sstevel@tonic-gate 
1671*0Sstevel@tonic-gate 	/*
1672*0Sstevel@tonic-gate 	 * If this is concurrent DB, upgrade the lock if necessary.
1673*0Sstevel@tonic-gate 	 */
1674*0Sstevel@tonic-gate 	if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW) &&
1675*0Sstevel@tonic-gate 	    (ret = lock_get(dbp->dbenv->lk_info,
1676*0Sstevel@tonic-gate 	    dbc->locker, DB_LOCK_UPGRADE, &dbc->lock_dbt, DB_LOCK_WRITE,
1677*0Sstevel@tonic-gate 	    &dbc->mylock)) != 0)
1678*0Sstevel@tonic-gate 		return (EAGAIN);
1679*0Sstevel@tonic-gate 
1680*0Sstevel@tonic-gate 	/*
1681*0Sstevel@tonic-gate 	 * If we don't already have the page locked, get it and delete the
1682*0Sstevel@tonic-gate 	 * items.
1683*0Sstevel@tonic-gate 	 */
1684*0Sstevel@tonic-gate 	if ((h == NULL || h->pgno != pgno)) {
1685*0Sstevel@tonic-gate 		if ((ret = __bam_lget(dbc, 0, pgno, DB_LOCK_WRITE, &lock)) != 0)
1686*0Sstevel@tonic-gate 			return (ret);
1687*0Sstevel@tonic-gate 		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
1688*0Sstevel@tonic-gate 			return (ret);
1689*0Sstevel@tonic-gate 		local_page = 1;
1690*0Sstevel@tonic-gate 	} else
1691*0Sstevel@tonic-gate 		local_page = 0;
1692*0Sstevel@tonic-gate 
1693*0Sstevel@tonic-gate 	/*
1694*0Sstevel@tonic-gate 	 * If we're deleting a duplicate entry and there are other duplicate
1695*0Sstevel@tonic-gate 	 * entries remaining, call the common code to do the work and fix up
1696*0Sstevel@tonic-gate 	 * the parent page as necessary.  Otherwise, do a normal btree delete.
1697*0Sstevel@tonic-gate 	 *
1698*0Sstevel@tonic-gate 	 * There are 5 possible cases:
1699*0Sstevel@tonic-gate 	 *
1700*0Sstevel@tonic-gate 	 * 1. It's not a duplicate item: do a normal btree delete.
1701*0Sstevel@tonic-gate 	 * 2. It's a duplicate item:
1702*0Sstevel@tonic-gate 	 *	2a: We delete an item from a page of duplicates, but there are
1703*0Sstevel@tonic-gate 	 *	    more items on the page.
1704*0Sstevel@tonic-gate 	 *      2b: We delete the last item from a page of duplicates, deleting
1705*0Sstevel@tonic-gate 	 *	    the last duplicate.
1706*0Sstevel@tonic-gate 	 *      2c: We delete the last item from a page of duplicates, but there
1707*0Sstevel@tonic-gate 	 *	    is a previous page of duplicates.
1708*0Sstevel@tonic-gate 	 *      2d: We delete the last item from a page of duplicates, but there
1709*0Sstevel@tonic-gate 	 *	    is a following page of duplicates.
1710*0Sstevel@tonic-gate 	 *
1711*0Sstevel@tonic-gate 	 * In the case of:
1712*0Sstevel@tonic-gate 	 *
1713*0Sstevel@tonic-gate 	 *  1: There's nothing further to do.
1714*0Sstevel@tonic-gate 	 * 2a: There's nothing further to do.
1715*0Sstevel@tonic-gate 	 * 2b: Do the normal btree delete instead of a duplicate delete, as
1716*0Sstevel@tonic-gate 	 *     that deletes both the duplicate chain and the parent page's
1717*0Sstevel@tonic-gate 	 *     entry.
1718*0Sstevel@tonic-gate 	 * 2c: There's nothing further to do.
1719*0Sstevel@tonic-gate 	 * 2d: Delete the duplicate, and update the parent page's entry.
1720*0Sstevel@tonic-gate 	 */
1721*0Sstevel@tonic-gate 	if (TYPE(h) == P_DUPLICATE) {
1722*0Sstevel@tonic-gate 		pgno = PGNO(h);
1723*0Sstevel@tonic-gate 		prev_pgno = PREV_PGNO(h);
1724*0Sstevel@tonic-gate 		next_pgno = NEXT_PGNO(h);
1725*0Sstevel@tonic-gate 
1726*0Sstevel@tonic-gate 		if (NUM_ENT(h) == 1 &&
1727*0Sstevel@tonic-gate 		    prev_pgno == PGNO_INVALID && next_pgno == PGNO_INVALID)
1728*0Sstevel@tonic-gate 			cmd = DELETE_PAGE;
1729*0Sstevel@tonic-gate 		else {
1730*0Sstevel@tonic-gate 			cmd = DELETE_ITEM;
1731*0Sstevel@tonic-gate 
1732*0Sstevel@tonic-gate 			/* Delete the duplicate. */
1733*0Sstevel@tonic-gate 			if ((ret = __db_drem(dbc, &h, indx, __bam_free)) != 0)
1734*0Sstevel@tonic-gate 				goto err;
1735*0Sstevel@tonic-gate 
1736*0Sstevel@tonic-gate 			/*
1737*0Sstevel@tonic-gate 			 * 2a: h != NULL, h->pgno == pgno
1738*0Sstevel@tonic-gate 			 * 2b: We don't reach this clause, as the above test
1739*0Sstevel@tonic-gate 			 *     was true.
1740*0Sstevel@tonic-gate 			 * 2c: h == NULL, prev_pgno != PGNO_INVALID
1741*0Sstevel@tonic-gate 			 * 2d: h != NULL, next_pgno != PGNO_INVALID
1742*0Sstevel@tonic-gate 			 *
1743*0Sstevel@tonic-gate 			 * Test for 2a and 2c: if we didn't empty the current
1744*0Sstevel@tonic-gate 			 * page or there was a previous page of duplicates, we
1745*0Sstevel@tonic-gate 			 * don't need to touch the parent page.
1746*0Sstevel@tonic-gate 			 */
1747*0Sstevel@tonic-gate 			if ((h != NULL && pgno == h->pgno) ||
1748*0Sstevel@tonic-gate 			    prev_pgno != PGNO_INVALID)
1749*0Sstevel@tonic-gate 				cmd = NOTHING_FURTHER;
1750*0Sstevel@tonic-gate 		}
1751*0Sstevel@tonic-gate 
1752*0Sstevel@tonic-gate 		/*
1753*0Sstevel@tonic-gate 		 * Release any page we're holding and its lock.
1754*0Sstevel@tonic-gate 		 *
1755*0Sstevel@tonic-gate 		 * !!!
1756*0Sstevel@tonic-gate 		 * If there is no subsequent page in the duplicate chain, then
1757*0Sstevel@tonic-gate 		 * __db_drem will have put page "h" and set it to NULL.
1758*0Sstevel@tonic-gate 		*/
1759*0Sstevel@tonic-gate 		if (local_page) {
1760*0Sstevel@tonic-gate 			if (h != NULL)
1761*0Sstevel@tonic-gate 				(void)memp_fput(dbp->mpf, h, 0);
1762*0Sstevel@tonic-gate 			(void)__BT_TLPUT(dbc, lock);
1763*0Sstevel@tonic-gate 			local_page = 0;
1764*0Sstevel@tonic-gate 		}
1765*0Sstevel@tonic-gate 
1766*0Sstevel@tonic-gate 		if (cmd == NOTHING_FURTHER)
1767*0Sstevel@tonic-gate 			goto done;
1768*0Sstevel@tonic-gate 
1769*0Sstevel@tonic-gate 		/* Acquire the parent page and switch the index to its entry. */
1770*0Sstevel@tonic-gate 		if ((ret =
1771*0Sstevel@tonic-gate 		    __bam_lget(dbc, 0, cp->pgno, DB_LOCK_WRITE, &lock)) != 0)
1772*0Sstevel@tonic-gate 			goto err;
1773*0Sstevel@tonic-gate 		if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &h)) != 0) {
1774*0Sstevel@tonic-gate 			(void)__BT_TLPUT(dbc, lock);
1775*0Sstevel@tonic-gate 			goto err;
1776*0Sstevel@tonic-gate 		}
1777*0Sstevel@tonic-gate 		local_page = 1;
1778*0Sstevel@tonic-gate 		indx = cp->indx;
1779*0Sstevel@tonic-gate 
1780*0Sstevel@tonic-gate 		if (cmd == DELETE_PAGE)
1781*0Sstevel@tonic-gate 			goto btd;
1782*0Sstevel@tonic-gate 
1783*0Sstevel@tonic-gate 		/*
1784*0Sstevel@tonic-gate 		 * Copy, delete, update, add-back the parent page's data entry.
1785*0Sstevel@tonic-gate 		 *
1786*0Sstevel@tonic-gate 		 * XXX
1787*0Sstevel@tonic-gate 		 * This may be a performance/logging problem.  We should add a
1788*0Sstevel@tonic-gate 		 * log message which simply logs/updates a random set of bytes
1789*0Sstevel@tonic-gate 		 * on a page, and use it instead of doing a delete/add pair.
1790*0Sstevel@tonic-gate 		 */
1791*0Sstevel@tonic-gate 		indx += O_INDX;
1792*0Sstevel@tonic-gate 		bo = *GET_BOVERFLOW(h, indx);
1793*0Sstevel@tonic-gate 		(void)__db_ditem(dbc, h, indx, BOVERFLOW_SIZE);
1794*0Sstevel@tonic-gate 		bo.pgno = next_pgno;
1795*0Sstevel@tonic-gate 		memset(&dbt, 0, sizeof(dbt));
1796*0Sstevel@tonic-gate 		dbt.data = &bo;
1797*0Sstevel@tonic-gate 		dbt.size = BOVERFLOW_SIZE;
1798*0Sstevel@tonic-gate 		(void)__db_pitem(dbc, h, indx, BOVERFLOW_SIZE, &dbt, NULL);
1799*0Sstevel@tonic-gate 		(void)memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY);
1800*0Sstevel@tonic-gate 		goto done;
1801*0Sstevel@tonic-gate 	}
1802*0Sstevel@tonic-gate 
1803*0Sstevel@tonic-gate btd:	/*
1804*0Sstevel@tonic-gate 	 * If the page is going to be emptied, delete it.  To delete a leaf
1805*0Sstevel@tonic-gate 	 * page we need a copy of a key from the page.  We use the 0th page
1806*0Sstevel@tonic-gate 	 * index since it's the last key that the page held.
1807*0Sstevel@tonic-gate 	 *
1808*0Sstevel@tonic-gate 	 * We malloc the page information instead of using the return key/data
1809*0Sstevel@tonic-gate 	 * memory because we've already set them -- the reason we've already
1810*0Sstevel@tonic-gate 	 * set them is because we're (potentially) about to do a reverse split,
1811*0Sstevel@tonic-gate 	 * which would make our saved page information useless.
1812*0Sstevel@tonic-gate 	 *
1813*0Sstevel@tonic-gate 	 * !!!
1814*0Sstevel@tonic-gate 	 * The following operations to delete a page might deadlock.  I think
1815*0Sstevel@tonic-gate 	 * that's OK.  The problem is if we're deleting an item because we're
1816*0Sstevel@tonic-gate 	 * closing cursors because we've already deadlocked and want to call
1817*0Sstevel@tonic-gate 	 * txn_abort().  If we fail due to deadlock, we leave a locked empty
1818*0Sstevel@tonic-gate 	 * page in the tree, which won't be empty long because we're going to
1819*0Sstevel@tonic-gate 	 * undo the delete.
1820*0Sstevel@tonic-gate 	 */
1821*0Sstevel@tonic-gate 	if (NUM_ENT(h) == 2 && h->pgno != PGNO_ROOT) {
1822*0Sstevel@tonic-gate 		memset(&dbt, 0, sizeof(DBT));
1823*0Sstevel@tonic-gate 		dbt.flags = DB_DBT_MALLOC | DB_DBT_INTERNAL;
1824*0Sstevel@tonic-gate 		if ((ret = __db_ret(dbp, h, 0, &dbt, NULL, NULL)) != 0)
1825*0Sstevel@tonic-gate 			goto err;
1826*0Sstevel@tonic-gate 		delete_page = 1;
1827*0Sstevel@tonic-gate 	}
1828*0Sstevel@tonic-gate 
1829*0Sstevel@tonic-gate 	/*
1830*0Sstevel@tonic-gate 	 * Do a normal btree delete.
1831*0Sstevel@tonic-gate 	 *
1832*0Sstevel@tonic-gate 	 * !!!
1833*0Sstevel@tonic-gate 	 * Delete the key item first, otherwise the duplicate checks in
1834*0Sstevel@tonic-gate 	 * __bam_ditem() won't work!
1835*0Sstevel@tonic-gate 	 */
1836*0Sstevel@tonic-gate 	if ((ret = __bam_ditem(dbc, h, indx)) != 0)
1837*0Sstevel@tonic-gate 		goto err;
1838*0Sstevel@tonic-gate 	if ((ret = __bam_ditem(dbc, h, indx)) != 0)
1839*0Sstevel@tonic-gate 		goto err;
1840*0Sstevel@tonic-gate 
1841*0Sstevel@tonic-gate 	/* Discard any remaining locks/pages. */
1842*0Sstevel@tonic-gate 	if (local_page) {
1843*0Sstevel@tonic-gate 		(void)memp_fput(dbp->mpf, h, 0);
1844*0Sstevel@tonic-gate 		(void)__BT_TLPUT(dbc, lock);
1845*0Sstevel@tonic-gate 		local_page = 0;
1846*0Sstevel@tonic-gate 	}
1847*0Sstevel@tonic-gate 
1848*0Sstevel@tonic-gate 	/* Delete the page if it was emptied. */
1849*0Sstevel@tonic-gate 	if (delete_page)
1850*0Sstevel@tonic-gate 		ret = __bam_dpage(dbc, &dbt);
1851*0Sstevel@tonic-gate 
1852*0Sstevel@tonic-gate err:
1853*0Sstevel@tonic-gate done:	if (delete_page)
1854*0Sstevel@tonic-gate 		__os_free(dbt.data, dbt.size);
1855*0Sstevel@tonic-gate 
1856*0Sstevel@tonic-gate 	if (local_page) {
1857*0Sstevel@tonic-gate 		/*
1858*0Sstevel@tonic-gate 		 * It's possible for h to be NULL, as __db_drem may have
1859*0Sstevel@tonic-gate 		 * been relinking pages by the time that it deadlocked.
1860*0Sstevel@tonic-gate 		 */
1861*0Sstevel@tonic-gate 		if (h != NULL)
1862*0Sstevel@tonic-gate 			(void)memp_fput(dbp->mpf, h, 0);
1863*0Sstevel@tonic-gate 		(void)__BT_TLPUT(dbc, lock);
1864*0Sstevel@tonic-gate 	}
1865*0Sstevel@tonic-gate 
1866*0Sstevel@tonic-gate 	if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW))
1867*0Sstevel@tonic-gate 		(void)__lock_downgrade(dbp->dbenv->lk_info, dbc->mylock,
1868*0Sstevel@tonic-gate 		    DB_LOCK_IWRITE, 0);
1869*0Sstevel@tonic-gate 
1870*0Sstevel@tonic-gate 	return (ret);
1871*0Sstevel@tonic-gate }
1872*0Sstevel@tonic-gate 
1873*0Sstevel@tonic-gate /*
1874*0Sstevel@tonic-gate  * __bam_c_getstack --
1875*0Sstevel@tonic-gate  *	Acquire a full stack for a cursor.
1876*0Sstevel@tonic-gate  */
1877*0Sstevel@tonic-gate static int
__bam_c_getstack(dbc,cp)1878*0Sstevel@tonic-gate __bam_c_getstack(dbc, cp)
1879*0Sstevel@tonic-gate 	DBC *dbc;
1880*0Sstevel@tonic-gate 	CURSOR *cp;
1881*0Sstevel@tonic-gate {
1882*0Sstevel@tonic-gate 	DB *dbp;
1883*0Sstevel@tonic-gate 	DBT dbt;
1884*0Sstevel@tonic-gate 	PAGE *h;
1885*0Sstevel@tonic-gate 	db_pgno_t pgno;
1886*0Sstevel@tonic-gate 	int exact, ret;
1887*0Sstevel@tonic-gate 
1888*0Sstevel@tonic-gate 	dbp = dbc->dbp;
1889*0Sstevel@tonic-gate 	h = NULL;
1890*0Sstevel@tonic-gate 	memset(&dbt, 0, sizeof(DBT));
1891*0Sstevel@tonic-gate 	ret = 0;
1892*0Sstevel@tonic-gate 
1893*0Sstevel@tonic-gate 	/* Get the page with the current item on it. */
1894*0Sstevel@tonic-gate 	pgno = cp->pgno;
1895*0Sstevel@tonic-gate 	if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
1896*0Sstevel@tonic-gate 		return (ret);
1897*0Sstevel@tonic-gate 
1898*0Sstevel@tonic-gate 	/* Get a copy of a key from the page. */
1899*0Sstevel@tonic-gate 	dbt.flags = DB_DBT_MALLOC | DB_DBT_INTERNAL;
1900*0Sstevel@tonic-gate 	if ((ret = __db_ret(dbp, h, 0, &dbt, NULL, NULL)) != 0)
1901*0Sstevel@tonic-gate 		goto err;
1902*0Sstevel@tonic-gate 
1903*0Sstevel@tonic-gate 	/* Get a write-locked stack for that page. */
1904*0Sstevel@tonic-gate 	exact = 0;
1905*0Sstevel@tonic-gate 	ret = __bam_search(dbc, &dbt, S_KEYFIRST, 1, NULL, &exact);
1906*0Sstevel@tonic-gate 
1907*0Sstevel@tonic-gate 	/* We no longer need the key or the page. */
1908*0Sstevel@tonic-gate err:	if (h != NULL)
1909*0Sstevel@tonic-gate 		(void)memp_fput(dbp->mpf, h, 0);
1910*0Sstevel@tonic-gate 	if (dbt.data != NULL)
1911*0Sstevel@tonic-gate 		__os_free(dbt.data, dbt.size);
1912*0Sstevel@tonic-gate 	return (ret);
1913*0Sstevel@tonic-gate }
1914