xref: /netbsd-src/lib/libc/db/btree/bt_seq.c (revision aae80e6be7bf8e6ad43f8b63d296d2d3923c4ee5)
1 /*	$NetBSD: bt_seq.c,v 1.20 2016/09/24 21:31:25 christos Exp $	*/
2 
3 /*-
4  * Copyright (c) 1990, 1993, 1994
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * Mike Olson.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 #if HAVE_NBTOOL_CONFIG_H
36 #include "nbtool_config.h"
37 #endif
38 
39 #include <sys/cdefs.h>
40 __RCSID("$NetBSD: bt_seq.c,v 1.20 2016/09/24 21:31:25 christos Exp $");
41 
42 #include "namespace.h"
43 #include <sys/types.h>
44 
45 #include <assert.h>
46 #include <errno.h>
47 #include <stddef.h>
48 #include <stdio.h>
49 #include <stdlib.h>
50 
51 #include <db.h>
52 #include "btree.h"
53 
54 static int __bt_first(BTREE *, const DBT *, EPG *, int *);
55 static int __bt_seqadv(BTREE *, EPG *, int);
56 static int __bt_seqset(BTREE *, EPG *, DBT *, int);
57 static int __bt_rseq_next(BTREE *, EPG *);
58 static int __bt_rseq_prev(BTREE *, EPG *);
59 
60 /*
61  * Sequential scan support.
62  *
63  * The tree can be scanned sequentially, starting from either end of the
64  * tree or from any specific key.  A scan request before any scanning is
65  * done is initialized as starting from the least node.
66  */
67 
68 /*
69  * __bt_seq --
70  *	Btree sequential scan interface.
71  *
72  * Parameters:
73  *	dbp:	pointer to access method
74  *	key:	key for positioning and return value
75  *	data:	data return value
76  *	flags:	R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV, R_RNEXT, R_RPREV.
77  *
78  * Returns:
79  *	RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
80  */
81 int
__bt_seq(const DB * dbp,DBT * key,DBT * data,u_int flags)82 __bt_seq(const DB *dbp, DBT *key, DBT *data, u_int flags)
83 {
84 	BTREE *t;
85 	EPG e;
86 	int status;
87 
88 	t = dbp->internal;
89 
90 	/* Toss any page pinned across calls. */
91 	if (t->bt_pinned != NULL) {
92 		mpool_put(t->bt_mp, t->bt_pinned, 0);
93 		t->bt_pinned = NULL;
94 	}
95 
96 	/*
97 	 * If scan uninitialized as yet, or starting at a specific record, set
98 	 * the scan to a specific key.  Both __bt_seqset and __bt_seqadv pin
99 	 * the page the cursor references if they're successful.
100 	 */
101 	switch (flags) {
102 	case R_NEXT:
103 	case R_PREV:
104 	case R_RNEXT:
105 	case R_RPREV:
106 		if (F_ISSET(&t->bt_cursor, CURS_INIT)) {
107 			status = __bt_seqadv(t, &e, (int)flags);
108 			break;
109 		}
110 		/* FALLTHROUGH */
111 	case R_FIRST:
112 	case R_LAST:
113 	case R_CURSOR:
114 		status = __bt_seqset(t, &e, key, (int)flags);
115 		break;
116 	default:
117 		errno = EINVAL;
118 		return (RET_ERROR);
119 	}
120 
121 	if (status == RET_SUCCESS) {
122 		__bt_setcur(t, e.page->pgno, (u_int)e.index);
123 
124 		status =
125 		    __bt_ret(t, &e, key, &t->bt_rkey, data, &t->bt_rdata, 0);
126 
127 		/*
128 		 * If the user is doing concurrent access, we copied the
129 		 * key/data, toss the page.
130 		 */
131 		if (F_ISSET(t, B_DB_LOCK))
132 			mpool_put(t->bt_mp, e.page, 0);
133 		else
134 			t->bt_pinned = e.page;
135 	}
136 	return (status);
137 }
138 
139 /*
140  * __bt_seqset --
141  *	Set the sequential scan to a specific key.
142  *
143  * Parameters:
144  *	t:	tree
145  *	ep:	storage for returned key
146  *	key:	key for initial scan position
147  *	flags:	R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV, R_RNEXT, R_RPREV.
148  *
149  * Side effects:
150  *	Pins the page the cursor references.
151  *
152  * Returns:
153  *	RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
154  */
155 static int
__bt_seqset(BTREE * t,EPG * ep,DBT * key,int flags)156 __bt_seqset(BTREE *t, EPG *ep, DBT *key, int flags)
157 {
158 	PAGE *h;
159 	pgno_t pg;
160 	int exact;
161 
162 	/*
163 	 * Find the first, last or specific key in the tree and point the
164 	 * cursor at it.  The cursor may not be moved until a new key has
165 	 * been found.
166 	 */
167 	switch (flags) {
168 	case R_CURSOR:				/* Keyed scan. */
169 		/*
170 		 * Find the first instance of the key or the smallest key
171 		 * which is greater than or equal to the specified key.
172 		 */
173 		if (key->data == NULL || key->size == 0) {
174 			errno = EINVAL;
175 			return (RET_ERROR);
176 		}
177 		return (__bt_first(t, key, ep, &exact));
178 	case R_FIRST:				/* First record. */
179 	case R_NEXT:
180 	case R_RNEXT:
181 		BT_CLR(t);
182 		/* Walk down the left-hand side of the tree. */
183 		for (pg = P_ROOT;;) {
184 			if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
185 				return (RET_ERROR);
186 
187 			/* Check for an empty tree. */
188 			if (NEXTINDEX(h) == 0) {
189 				mpool_put(t->bt_mp, h, 0);
190 				return (RET_SPECIAL);
191 			}
192 
193 			if (h->flags & (P_BLEAF | P_RLEAF))
194 				break;
195 			pg = GETBINTERNAL(h, 0)->pgno;
196 			BT_PUSH(t, h->pgno, 0);
197 			mpool_put(t->bt_mp, h, 0);
198 		}
199 		ep->page = h;
200 		ep->index = 0;
201 		break;
202 	case R_LAST:				/* Last record. */
203 	case R_PREV:
204 	case R_RPREV:
205 		BT_CLR(t);
206 		/* Walk down the right-hand side of the tree. */
207 		for (pg = P_ROOT;;) {
208 			if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
209 				return (RET_ERROR);
210 
211 			/* Check for an empty tree. */
212 			if (NEXTINDEX(h) == 0) {
213 				mpool_put(t->bt_mp, h, 0);
214 				return (RET_SPECIAL);
215 			}
216 
217 			if (h->flags & (P_BLEAF | P_RLEAF))
218 				break;
219 			pg = GETBINTERNAL(h, NEXTINDEX(h) - 1)->pgno;
220 			BT_PUSH(t, h->pgno, NEXTINDEX(h) - 1);
221 			mpool_put(t->bt_mp, h, 0);
222 		}
223 
224 		ep->page = h;
225 		ep->index = NEXTINDEX(h) - 1;
226 		break;
227 	}
228 	return (RET_SUCCESS);
229 }
230 
231 /*
232  * __bt_seqadvance --
233  *	Advance the sequential scan.
234  *
235  * Parameters:
236  *	t:	tree
237  *	flags:	R_NEXT, R_PREV, R_RNEXT, R_RPREV
238  *
239  * Side effects:
240  *	Pins the page the new key/data record is on.
241  *
242  * Returns:
243  *	RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
244  */
245 static int
__bt_seqadv(BTREE * t,EPG * ep,int flags)246 __bt_seqadv(BTREE *t, EPG *ep, int flags)
247 {
248 	CURSOR *c;
249 	PAGE *h;
250 	indx_t idx = 0;	/* pacify gcc */
251 	pgno_t pg;
252 	int exact, rval;
253 
254 	/*
255 	 * There are a couple of states that we can be in.  The cursor has
256 	 * been initialized by the time we get here, but that's all we know.
257 	 */
258 	c = &t->bt_cursor;
259 
260 	/*
261 	 * The cursor was deleted and there weren't any duplicate records,
262 	 * so the cursor's key was saved.  Find out where that key would
263 	 * be in the current tree.  If the returned key is an exact match,
264 	 * it means that a key/data pair was inserted into the tree after
265 	 * the delete.  We could reasonably return the key, but the problem
266 	 * is that this is the access pattern we'll see if the user is
267 	 * doing seq(..., R_NEXT)/put(..., 0) pairs, i.e. the put deletes
268 	 * the cursor record and then replaces it, so the cursor was saved,
269 	 * and we'll simply return the same "new" record until the user
270 	 * notices and doesn't do a put() of it.  Since the key is an exact
271 	 * match, we could as easily put the new record before the cursor,
272 	 * and we've made no guarantee to return it.  So, move forward or
273 	 * back a record if it's an exact match.
274 	 *
275 	 * XXX
276 	 * In the current implementation, put's to the cursor are done with
277 	 * delete/add pairs.  This has two consequences.  First, it means
278 	 * that seq(..., R_NEXT)/put(..., R_CURSOR) pairs are going to exhibit
279 	 * the same behavior as above.  Second, you can return the same key
280 	 * twice if you have duplicate records.  The scenario is that the
281 	 * cursor record is deleted, moving the cursor forward or backward
282 	 * to a duplicate.  The add then inserts the new record at a location
283 	 * ahead of the cursor because duplicates aren't sorted in any way,
284 	 * and the new record is later returned.  This has to be fixed at some
285 	 * point.
286 	 */
287 	if (F_ISSET(c, CURS_ACQUIRE)) {
288 		if ((rval = __bt_first(t, &c->key, ep, &exact)) == RET_ERROR)
289 			return RET_ERROR;
290 		if (!exact)
291 			return rval;
292 		/*
293 		 * XXX
294 		 * Kluge -- get, release, get the page.
295 		 */
296 		c->pg.pgno = ep->page->pgno;
297 		c->pg.index = ep->index;
298 		mpool_put(t->bt_mp, ep->page, 0);
299 	}
300 
301 	/* Get the page referenced by the cursor. */
302 	if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL)
303 		return (RET_ERROR);
304 
305 	/*
306  	 * Find the next/previous record in the tree and point the cursor at
307 	 * it.  The cursor may not be moved until a new key has been found.
308 	 */
309 	switch (flags) {
310 	case R_NEXT:			/* Next record. */
311 	case R_RNEXT:
312 		/*
313 		 * The cursor was deleted in duplicate records, and moved
314 		 * forward to a record that has yet to be returned.  Clear
315 		 * that flag, and return the record.
316 		 */
317 		if (F_ISSET(c, CURS_AFTER))
318 			goto usecurrent;
319 		idx = c->pg.index;
320 		if (++idx == NEXTINDEX(h)) {
321 			if (flags == R_RNEXT) {
322 				ep->page = h;
323 				ep->index = idx;
324 				return __bt_rseq_next(t, ep);
325 			}
326 			pg = h->nextpg;
327 			mpool_put(t->bt_mp, h, 0);
328 			if (pg == P_INVALID)
329 				return RET_SPECIAL;
330 			if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
331 				return RET_ERROR;
332 			idx = 0;
333 		}
334 		break;
335 	case R_PREV:			/* Previous record. */
336 	case R_RPREV:
337 		/*
338 		 * The cursor was deleted in duplicate records, and moved
339 		 * backward to a record that has yet to be returned.  Clear
340 		 * that flag, and return the record.
341 		 */
342 		if (F_ISSET(c, CURS_BEFORE)) {
343 usecurrent:		F_CLR(c, CURS_AFTER | CURS_BEFORE);
344 			ep->page = h;
345 			ep->index = c->pg.index;
346 			return (RET_SUCCESS);
347 		}
348 		idx = c->pg.index;
349 		if (idx == 0) {
350 			if (flags == R_RPREV) {
351 				ep->page = h;
352 				ep->index = idx;
353 				return __bt_rseq_prev(t, ep);
354 			}
355 			pg = h->prevpg;
356 			mpool_put(t->bt_mp, h, 0);
357 			if (pg == P_INVALID)
358 				return RET_SPECIAL;
359 			if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
360 				return RET_ERROR;
361 			idx = NEXTINDEX(h) - 1;
362 		} else
363 			--idx;
364 		break;
365 	}
366 
367 	ep->page = h;
368 	ep->index = idx;
369 	return (RET_SUCCESS);
370 }
371 /*
372  * Get the first item on the next page, but by going up and down the tree.
373  */
374 static int
__bt_rseq_next(BTREE * t,EPG * ep)375 __bt_rseq_next(BTREE *t, EPG *ep)
376 {
377 	PAGE *h;
378 	indx_t idx;
379 	EPGNO *up;
380 	pgno_t pg;
381 
382 	h = ep->page;
383 	idx = ep->index;
384 	do {
385 		/* Move up the tree. */
386 		up = BT_POP(t);
387 		mpool_put(t->bt_mp, h, 0);
388 		/* Did we hit the right edge of the root? */
389 		if (up == NULL)
390 			return RET_SPECIAL;
391 		if ((h = mpool_get(t->bt_mp, up->pgno, 0)) == NULL)
392 			return RET_ERROR;
393 		idx = up->index;
394 	} while (++idx == NEXTINDEX(h));
395 
396 	while (!(h->flags & (P_BLEAF | P_RLEAF))) {
397 		/* Move back down the tree. */
398 		BT_PUSH(t, h->pgno, idx);
399 		pg = GETBINTERNAL(h, idx)->pgno;
400 		mpool_put(t->bt_mp, h, 0);
401 		if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
402 			return RET_ERROR;
403 		idx = 0;
404 	}
405 	ep->page = h;
406 	ep->index = idx;
407 	return RET_SUCCESS;
408 }
409 
410 /*
411  * Get the last item on the previous page, but by going up and down the tree.
412  */
413 static int
__bt_rseq_prev(BTREE * t,EPG * ep)414 __bt_rseq_prev(BTREE *t, EPG *ep)
415 {
416 	PAGE *h;
417 	indx_t idx;
418 	EPGNO *up;
419 	pgno_t pg;
420 
421 	h = ep->page;
422 	idx = ep->index;
423 	do {
424 		/* Move up the tree. */
425 		up = BT_POP(t);
426 		mpool_put(t->bt_mp, h, 0);
427 		/* Did we hit the left edge of the root? */
428 		if (up == NULL)
429 			return RET_SPECIAL;
430 		if ((h = mpool_get(t->bt_mp, up->pgno, 0)) == NULL)
431 			return RET_ERROR;
432 		idx = up->index;
433 	} while (idx == 0);
434 	--idx;
435 	while (!(h->flags & (P_BLEAF | P_RLEAF))) {
436 		/* Move back down the tree. */
437 		BT_PUSH(t, h->pgno, idx);
438 		pg = GETBINTERNAL(h, idx)->pgno;
439 		mpool_put(t->bt_mp, h, 0);
440 		if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
441 			return RET_ERROR;
442 		idx = NEXTINDEX(h) - 1;
443 	}
444 	ep->page = h;
445 	ep->index = idx;
446 	return RET_SUCCESS;
447 }
448 
449 /*
450  * __bt_first --
451  *	Find the first entry.
452  *
453  * Parameters:
454  *	t:	the tree
455  *    key:	the key
456  *  erval:	return EPG
457  * exactp:	pointer to exact match flag
458  *
459  * Returns:
460  *	The first entry in the tree greater than or equal to key,
461  *	or RET_SPECIAL if no such key exists.
462  */
463 static int
__bt_first(BTREE * t,const DBT * key,EPG * erval,int * exactp)464 __bt_first(BTREE *t, const DBT *key, EPG *erval, int *exactp)
465 {
466 	PAGE *h, *hprev;
467 	EPG *ep, save;
468 	pgno_t pg;
469 
470 	/*
471 	 * Find any matching record; __bt_search pins the page.
472 	 *
473 	 * If it's an exact match and duplicates are possible, walk backwards
474 	 * in the tree until we find the first one.  Otherwise, make sure it's
475 	 * a valid key (__bt_search may return an index just past the end of a
476 	 * page) and return it.
477 	 */
478 	if ((ep = __bt_search(t, key, exactp)) == NULL)
479 		return RET_SPECIAL;
480 	if (*exactp) {
481 		if (F_ISSET(t, B_NODUPS)) {
482 			*erval = *ep;
483 			return (RET_SUCCESS);
484 		}
485 
486 		/*
487 		 * Walk backwards, as long as the entry matches and there are
488 		 * keys left in the tree.  Save a copy of each match in case
489 		 * we go too far.
490 		 */
491 		save = *ep;
492 		h = ep->page;
493 		do {
494 			if (save.page->pgno != ep->page->pgno) {
495 				mpool_put(t->bt_mp, save.page, 0);
496 				save = *ep;
497 			} else
498 				save.index = ep->index;
499 
500 			/*
501 			 * Don't unpin the page the last (or original) match
502 			 * was on, but make sure it's unpinned if an error
503 			 * occurs.
504 			 */
505 			if (ep->index == 0) {
506 				if (h->prevpg == P_INVALID)
507 					break;
508 				if (h->pgno != save.page->pgno)
509 					mpool_put(t->bt_mp, h, 0);
510 				if ((hprev = mpool_get(t->bt_mp,
511 				    h->prevpg, 0)) == NULL) {
512 					if (h->pgno == save.page->pgno)
513 						mpool_put(t->bt_mp,
514 						    save.page, 0);
515  					return RET_ERROR;
516 				}
517 				ep->page = h = hprev;
518 				ep->index = NEXTINDEX(h);
519 			}
520 			--ep->index;
521 		} while (__bt_cmp(t, key, ep) == 0);
522 
523 		/*
524 		 * Reach here with the last page that was looked at pinned,
525 		 * which may or may not be the same as the last (or original)
526 		 * match page.  If it's not useful, release it.
527 		 */
528 		if (h->pgno != save.page->pgno)
529 			mpool_put(t->bt_mp, h, 0);
530 
531 		*erval = save;
532 		return (RET_SUCCESS);
533 	}
534 
535 	/* If at the end of a page, find the next entry. */
536 	if (ep->index == NEXTINDEX(ep->page)) {
537 		h = ep->page;
538 		pg = h->nextpg;
539 		mpool_put(t->bt_mp, h, 0);
540 		if (pg == P_INVALID)
541 			return (RET_SPECIAL);
542 		if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
543 			return (RET_ERROR);
544 		ep->index = 0;
545 		ep->page = h;
546 	}
547 	*erval = *ep;
548 	return (RET_SUCCESS);
549 }
550 
551 /*
552  * __bt_setcur --
553  *	Set the cursor to an entry in the tree.
554  *
555  * Parameters:
556  *	t:	the tree
557  *   pgno:	page number
558  *    idx:	page index
559  */
560 void
__bt_setcur(BTREE * t,pgno_t pgno,u_int idx)561 __bt_setcur(BTREE *t, pgno_t pgno, u_int idx)
562 {
563 	/* Lose any already deleted key. */
564 	if (t->bt_cursor.key.data != NULL) {
565 		free(t->bt_cursor.key.data);
566 		t->bt_cursor.key.size = 0;
567 		t->bt_cursor.key.data = NULL;
568 	}
569 	F_CLR(&t->bt_cursor, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE);
570 
571 	/* Update the cursor. */
572 	t->bt_cursor.pg.pgno = pgno;
573 	t->bt_cursor.pg.index = idx;
574 	F_SET(&t->bt_cursor, CURS_INIT);
575 }
576