xref: /csrg-svn/lib/libc/db/btree/bt_open.c (revision 46453)
1 /*-
2  * Copyright (c) 1990 The Regents of the University of California.
3  * All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * Mike Olson.
7  *
8  * %sccs.include.redist.c%
9  */
10 
11 #if defined(LIBC_SCCS) && !defined(lint)
12 static char sccsid[] = "@(#)bt_open.c	5.4 (Berkeley) 02/18/91";
13 #endif /* LIBC_SCCS and not lint */
14 
15 /*
16  *  btree.c -- implementation of btree access method for 4.4BSD.
17  *
18  *	The design here is based on that of the btree access method used
19  *	in the Postgres database system at UC Berkeley.  The implementation
20  *	is wholly independent of the Postgres code.
21  *
22  *	This implementation supports btrees on disk (supply a filename) or
23  *	in memory (don't).  Public interfaces defined here are:
24  *
25  *		btree_open()	-- wrapper; returns a filled DB struct for
26  *				   a btree.
27  *
28  *		bt_open()	-- open a new or existing btree.
29  *		bt_get()	-- fetch data from a tree by key.
30  *		bt_seq()	-- do a sequential scan on a tree.
31  *		bt_put()	-- add data to a tree by key.
32  *		bt_delete()	-- remove data from a tree by key.
33  *		bt_close()	-- close a btree.
34  *		bt_sync()	-- force btree pages to disk (disk trees only).
35  */
36 
37 #include <sys/param.h>
38 #include <sys/file.h>
39 #include <sys/stat.h>
40 #include <sys/signal.h>
41 #include <sys/errno.h>
42 #include <sys/types.h>
43 #include <db.h>
44 #include "btree.h"
45 
46 BTREEINFO _DefaultBTInfo = {
47 	0,	/* flags */
48 	0,	/* cachesize */
49 	0,	/* psize */
50 	strcmp,	/* compare */
51 	0
52 };
53 
54 /*
55  *  BTREE_OPEN -- Wrapper routine to open a btree.
56  *
57  *	Creates and fills a DB struct, and calls the routine that actually
58  *	opens the btree.
59  *
60  *	Parameters:
61  *		f:  filename to open
62  *		flags:  flag bits passed to open
63  *		mode:  permission bits, used if O_CREAT specified
64  *		b:  BTREEINFO pointer
65  *
66  *	Returns:
67  *		Filled-in DBT on success; NULL on failure, with errno
68  *		set as appropriate.
69  *
70  *	Side Effects:
71  *		Allocates memory for the DB struct.
72  */
73 
74 DB *
75 btree_open(f, flags, mode, b)
76 	char *f;
77 	int flags;
78 	int mode;
79 	BTREEINFO *b;
80 {
81 	DB *db;
82 	BTREE t;
83 
84 	if ((db = (DB *) malloc((unsigned) sizeof(DB))) == (DB *) NULL)
85 		return ((DB *) NULL);
86 
87 	if ((t = bt_open(f, flags, mode, b)) == (BTREE) NULL) {
88 		(void) free ((char *) db);
89 		return ((DB *) NULL);
90 	}
91 
92 	db->internal = (char *) t;
93 	db->close = bt_close;
94 	db->delete = bt_delete;
95 	db->get = bt_get;
96 	db->put = bt_put;
97 	db->seq = bt_seq;
98 	db->sync = bt_sync;
99 
100 	return (db);
101 }
102 
103 /*
104  *  BT_OPEN -- Open a btree.
105  *
106  *	This routine creates the correct kind (disk or in-memory) of
107  *	btree and initializes it as required.
108  *
109  *	Parameters:
110  *		f -- name of btree (NULL for in-memory btrees)
111  *		flags -- flags passed to open()
112  *		mode -- mode passed to open ()
113  *		b -- BTREEINFO structure, describing btree
114  *
115  *	Returns:
116  *		(Opaque) pointer to the btree.  On failure, returns NULL
117  *		with errno set as appropriate.
118  *
119  *	Side Effects:
120  *		Allocates memory, opens files.
121  */
122 
123 BTREE
124 bt_open(f, flags, mode, b)
125 	char *f;
126 	int flags;
127 	int mode;
128 	BTREEINFO *b;
129 {
130 	BTREE_P t;
131 	HTABLE ht;
132 	int nbytes;
133 	int fd;
134 	CURSOR *c;
135 	BTMETA m;
136 	struct stat buf;
137 
138 	/* use the default info if none was provided */
139 	if (b == (BTREEINFO *) NULL)
140 		b = &_DefaultBTInfo;
141 
142 	if ((t = (BTREE_P) malloc((unsigned) sizeof *t)) == (BTREE_P) NULL)
143 		return ((BTREE) NULL);
144 
145 	if (b->compare)
146 		t->bt_compare = b->compare;
147 	else
148 		t->bt_compare = strcmp;
149 
150 	t->bt_fname = f;
151 	t->bt_curpage = (BTHEADER *) NULL;
152 	t->bt_free = P_NONE;
153 	c = &(t->bt_cursor);
154 	c->c_pgno = P_NONE;
155 	c->c_index = 0;
156 	c->c_flags = (u_char) NULL;
157 	c->c_key = (char *) NULL;
158 	t->bt_stack = (BTSTACK *) NULL;
159 	t->bt_flags = 0;
160 
161 	/*
162 	 *  If no file name was supplied, this is an in-memory btree.
163 	 *  Otherwise, it's a disk-based btree.
164 	 */
165 	if (f == (char *) NULL) {
166 		/* in memory */
167 		if ((t->bt_psize = b->psize) < MINPSIZE) {
168 			if (t->bt_psize != 0) {
169 				(void) free ((char *) t);
170 				errno = EINVAL;
171 				return ((BTREE) NULL);
172 			}
173 			t->bt_psize = getpagesize();
174 		}
175 
176 		nbytes = HTSIZE * sizeof(HTBUCKET *);
177 		if ((ht = (HTABLE) malloc((unsigned) nbytes))
178 		    == (HTABLE) NULL) {
179 			(void) free((char *) t);
180 			return ((BTREE) NULL);
181 		}
182 		(void) bzero((char *) ht, nbytes);
183 		t->bt_s.bt_ht = ht;
184 		t->bt_npages = 0;
185 		t->bt_lorder = BYTE_ORDER;
186 		if (!(b->flags & R_DUP))
187 			t->bt_flags |= BTF_NODUPS;
188 	} else {
189 		/* on disk */
190 		if ((fd = open(f, O_RDONLY, 0)) < 0) {
191 			/* doesn't exist yet, be sure page is big enough */
192 			if ((t->bt_psize = b->psize) < sizeof(BTHEADER)
193 			    && b->psize != 0) {
194 				(void) free((char *) t);
195 				errno = EINVAL;
196 				return ((BTREE) NULL);
197 			}
198 			if (b->lorder == 0)
199 				b->lorder = BYTE_ORDER;
200 
201 			if (b->lorder != BIG_ENDIAN
202 			    && b->lorder != LITTLE_ENDIAN) {
203 				(void) free((char *) t);
204 				errno = EINVAL;
205 				return ((BTREE) NULL);
206 			}
207 			t->bt_lorder = b->lorder;
208 			if (!(b->flags & R_DUP))
209 				t->bt_flags |= BTF_NODUPS;
210 		} else {
211 			/* exists, get page size from file */
212 			if (read(fd, &m, sizeof(m)) < sizeof(m)) {
213 				(void) close(fd);
214 				(void) free((char *) t);
215 				errno = EINVAL;
216 				return ((BTREE) NULL);
217 			}
218 
219 			/* lorder always stored in host-independent format */
220 			NTOHL(m.m_lorder);
221 			if (m.m_lorder != BIG_ENDIAN
222 			    && m.m_lorder != LITTLE_ENDIAN) {
223 				(void) free((char *) t);
224 				errno = EINVAL;
225 				return ((BTREE) NULL);
226 			}
227 			t->bt_lorder = m.m_lorder;
228 
229 			if (t->bt_lorder != BYTE_ORDER) {
230 				BLSWAP(m.m_magic);
231 				BLSWAP(m.m_version);
232 				BLSWAP(m.m_psize);
233 				BLSWAP(m.m_free);
234 				BLSWAP(m.m_flags);
235 			}
236 
237 			if (m.m_magic != BTREEMAGIC
238 			    || m.m_version != BTREEVERSION
239 			    || m.m_psize < MINPSIZE) {
240 				(void) close(fd);
241 				(void) free((char *) t);
242 #ifndef EFTYPE
243 #define EFTYPE	-100
244 #endif
245 				errno = EFTYPE;
246 				return ((BTREE) NULL);
247 			}
248 			t->bt_psize = m.m_psize;
249 			t->bt_free = m.m_free;
250 			t->bt_flags |= (m.m_flags & BTF_NODUPS) | BTF_METAOK;
251 			(void) close(fd);
252 		}
253 
254 		/* now open the file the way the user wanted it */
255 		if ((t->bt_s.bt_d.d_fd = open(f, flags, mode)) < 0) {
256 			(void) free ((char *) t);
257 			return ((BTREE) NULL);
258 		}
259 
260 		/* access method files are always close-on-exec */
261 		if (fcntl(t->bt_s.bt_d.d_fd, F_SETFL, 1) == -1) {
262 			(void) close(t->bt_s.bt_d.d_fd);
263 			(void) free ((char *) t);
264 			return ((BTREE) NULL);
265 		}
266 
267 		/* get number of pages, page size if necessary */
268 		(void) fstat(t->bt_s.bt_d.d_fd, &buf);
269 		if (t->bt_psize == 0)
270 			t->bt_psize = buf.st_blksize;
271 		t->bt_npages = (pgno_t) (buf.st_size / t->bt_psize);
272 
273 		/* page zero is metadata, doesn't count */
274 		if (t->bt_npages > 0)
275 			--(t->bt_npages);
276 
277 		if (b->cachesize == 0)
278 			b->cachesize = DEFCACHE;
279 
280 		/* get an lru buffer cache, if the user asked for one */
281 		if ((b->cachesize / t->bt_psize) > 0) {
282 			BTDISK *d = &(t->bt_s.bt_d);
283 
284 			d->d_cache = lruinit(d->d_fd,
285 					     (int) (b->cachesize / t->bt_psize),
286 					     (int) t->bt_psize,
287 					     (char *) t->bt_lorder,
288 					     _bt_pgin, _bt_pgout);
289 
290 			if (d->d_cache == (char *) NULL) {
291 				(void) free((char *) t);
292 				return ((BTREE) NULL);
293 			}
294 		}
295 	}
296 
297 	/* remember if tree was opened for write */
298 	if (((flags & O_WRONLY) == O_WRONLY)
299 	    || ((flags & O_RDWR) == O_RDWR))
300 		t->bt_flags |= BTF_ISWRITE;
301 
302 	return ((BTREE) t);
303 }
304 
305 /*
306  *  BT_GET -- Get an entry from a btree.
307  *
308  *	Does a key lookup in the tree to find the specified key, and returns
309  *	the key/data pair found.
310  *
311  *	Parameters:
312  *		tree -- btree in which to do lookup
313  *		key -- key to find
314  *		data -- pointer to DBT in which to return data
315  *		flag -- ignored
316  *
317  *	Returns:
318  *		RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the key is not
319  *		found.  If key is not found, nothing is stored in the
320  *		return DBT 'data'.
321  *
322  *	Side Effects:
323  *		None.
324  *
325  *	Warnings:
326  *		Return data is statically allocated, and will be overwritten
327  *		at the next call.
328  */
329 
330 int
331 bt_get(tree, key, data, flag)
332 	BTREE tree;
333 	DBT *key;
334 	DBT *data;
335 	int flag;
336 {
337 	BTREE_P t = (BTREE_P) tree;
338 	BTHEADER *h;
339 	DATUM *d;
340 	BTITEM *item;
341 
342 #ifdef lint
343 	flag = flag;
344 #endif /* lint */
345 
346 	/* lookup */
347 	item = _bt_search(t, key);
348 
349 	/* clear parent pointer stack */
350 	while (_bt_pop(t) != P_NONE)
351 		continue;
352 
353 	if (item == (BTITEM *) NULL)
354 		return (RET_ERROR);
355 
356 	h = (BTHEADER *) t->bt_curpage;
357 	data->size = 0;
358 	data->data = (char *) NULL;
359 
360 	/* match? */
361 	if (VALIDITEM(t, item)
362 	    && _bt_cmp(t, key->data, item->bti_index) == 0) {
363 		d = (DATUM *) GETDATUM(h, item->bti_index);
364 		return (_bt_buildret(t, d, data, key));
365 	}
366 
367 	/* nope */
368 	return (RET_SPECIAL);
369 }
370 
371 /*
372  *  BT_PUT -- Add an entry to a btree.
373  *
374  *	The specified (key, data) pair is added to the tree.  If the tree
375  *	was created for unique keys only, then duplicates will not be
376  *	entered.  If the requested key exists in the tree, it will be over-
377  *	written unless the flags parameter is R_NOOVERWRITE, in which case
378  *	the update will not be done.  If duplicate keys are permitted in the
379  *	tree, duplicates will be inserted and will not overwrite existing
380  *	keys.  Nodes are split as required.
381  *
382  *	Parameters:
383  *		tree -- btree in which to put the new entry
384  *		key -- key component to add
385  *		data -- data corresponding to key
386  *		flag -- R_NOOVERWRITE or zero.
387  *
388  *	Returns:
389  *		RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the
390  *		NOOVERWRITE flag was set and the specified key exists
391  *		in the database.
392  *
393  *	Side Effects:
394  *		None.
395  */
396 
397 int
398 bt_put(tree, key, data, flag)
399 	BTREE tree;
400 	DBT *key;
401 	DBT *data;
402 	int flag;
403 {
404 	BTREE_P t = (BTREE_P) tree;
405 	BTITEM *item;
406 
407 	/* look for this key in the tree */
408 	item = _bt_search(t, key);
409 
410 	/*
411 	 *  If this tree was originally created without R_DUP, then duplicate
412 	 *  keys are not allowed.  We need to check this at insertion time.
413 	 */
414 
415 	if (VALIDITEM(t, item) && _bt_cmp(t, key->data, item->bti_index) == 0) {
416 		if ((t->bt_flags & BTF_NODUPS) && flag == R_NOOVERWRITE) {
417 			if (_bt_delone(t, item->bti_index) == RET_ERROR) {
418 				while (_bt_pop(t) != P_NONE)
419 					continue;
420 				return (RET_ERROR);
421 			}
422 		}
423 	}
424 
425 	return (_bt_insert(t, item, key, data, flag));
426 }
427 
428 /*
429  *  BT_DELETE -- delete a key from the tree.
430  *
431  *	Deletes all keys (and their associated data items) matching the
432  *	supplied key from the tree.  If the flags entry is R_CURSOR, then
433  *	the current item in the active scan is deleted.
434  *
435  *	Parameters:
436  *		tree -- btree from which to delete key
437  *		key -- key to delete
438  *		flags -- R_CURSOR or zero
439  *
440  *	Returns:
441  *		RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the specified
442  *		key was not in the tree.
443  *
444  *	Side Effects:
445  *		None.
446  */
447 
448 int
449 bt_delete(tree, key, flags)
450 	BTREE tree;
451 	DBT *key;
452 	int flags;
453 {
454 	BTREE_P t = (BTREE_P) tree;
455 	BTHEADER *h;
456 	BTITEM *item;
457 	int ndel = 0;
458 
459 	if (flags == R_CURSOR)
460 		return (_bt_crsrdel(t));
461 
462 	/* find the first matching key in the tree */
463 	item = _bt_first(t, key);
464 	h = t->bt_curpage;
465 
466 	/* don't need the descent stack for deletes */
467 	while (_bt_pop(t) != P_NONE)
468 		continue;
469 
470 	/* delete all matching keys */
471 	for (;;) {
472 		while (VALIDITEM(t, item)
473 		       && (_bt_cmp(t, key->data, item->bti_index) == 0)) {
474 			if (_bt_delone(t, item->bti_index) == RET_ERROR)
475 				return (RET_ERROR);
476 			ndel++;
477 		}
478 
479 		if (VALIDITEM(t, item) || h->h_nextpg == P_NONE)
480 			break;
481 
482 		/* next page, if necessary */
483 		do {
484 			if (_bt_getpage(t, h->h_nextpg) == RET_ERROR)
485 				return (RET_ERROR);
486 			h = t->bt_curpage;
487 		} while (NEXTINDEX(h) == 0 && h->h_nextpg != P_NONE);
488 
489 		item->bti_pgno = h->h_pgno;
490 		item->bti_index = 0;
491 
492 		if (!VALIDITEM(t, item)
493 		    || _bt_cmp(t, key->data, item->bti_index) != 0)
494 			break;
495 	}
496 
497 	/* flush changes to disk */
498 	if (ISDISK(t)) {
499 		if (h->h_flags & F_DIRTY) {
500 			if (_bt_write(t, t->bt_curpage, NORELEASE) == RET_ERROR)
501 				return (RET_ERROR);
502 		}
503 	}
504 
505 	if (ndel == 0)
506 		return (RET_SPECIAL);
507 
508 	return (RET_SUCCESS);
509 }
510 
511 /*
512  *  BT_SYNC -- sync the btree to disk.
513  *
514  *	Parameters:
515  *		tree -- btree to sync.
516  *
517  *	Returns:
518  *		RET_SUCCESS, RET_ERROR.
519  */
520 
521 bt_sync(tree)
522 	BTREE tree;
523 {
524 	BTREE_P t = (BTREE_P) tree;
525 	BTHEADER *h;
526 	pgno_t pgno;
527 
528 	/* if this is an in-memory btree, syncing is a no-op */
529 	if (!ISDISK(t))
530 		return (RET_SUCCESS);
531 
532 	h = (BTHEADER *) t->bt_curpage;
533 	h->h_flags &= ~F_DIRTY;
534 
535 	if (ISCACHE(t)) {
536 		pgno = t->bt_curpage->h_pgno;
537 		if (_bt_write(t, h, RELEASE) == RET_ERROR)
538 			return(RET_ERROR);
539 		if (lrusync(t->bt_s.bt_d.d_cache) < RET_ERROR)
540 			return (RET_ERROR);
541 		if (_bt_getpage(t, pgno) == RET_ERROR)
542 			return (RET_ERROR);
543 	} else {
544 		if (_bt_write(t, h, NORELEASE) == RET_ERROR)
545 			return (RET_ERROR);
546 	}
547 
548 	return (fsync(t->bt_s.bt_d.d_fd));
549 }
550 
551 /*
552  *  BT_SEQ -- Sequential scan interface.
553  *
554  *	This routine supports sequential scans on the btree.  If called with
555  *	flags set to R_CURSOR, or if no seq scan has been initialized in the
556  *	current tree, we initialize the scan.  Otherwise, we advance the scan
557  *	and return the next item.
558  *
559  *	Scans can be either keyed or non-keyed.  Keyed scans basically have
560  *	a starting point somewhere in the middle of the tree.  Non-keyed
561  *	scans start at an endpoint.  Also, scans can be backward (descending
562  *	order), forward (ascending order), or no movement (keep returning
563  *	the same item).
564  *
565  *	Flags is checked every time we enter the routine, so the user can
566  *	change directions on an active scan if desired.  The key argument
567  *	is examined only when we initialize the scan, in order to position
568  *	it properly.
569  *
570  *	Items are returned via the key and data arguments passed in.
571  *
572  *	Parameters:
573  *		tree -- btree in which to do scan
574  *		key -- key, used to position scan on initialization, and
575  *		       used to return key components to the user.
576  *		data -- used to return data components to the user.
577  *		flags -- specify R_CURSOR, R_FIRST, R_LAST, R_NEXT, or
578  *			 R_PREV.
579  *
580  *	Returns:
581  *		RET_SUCCESS, RET_ERROR, or RET_SPECIAL if no more data
582  *		exists in the tree in the specified direction.
583  *
584  *	Side Effects:
585  *		Changes the btree's notion of the current position in the
586  *		scan.
587  *
588  *	Warnings:
589  *		The key and data items returned are static and will be
590  *		overwritten by the next bt_get or bt_seq.
591  */
592 
593 int
594 bt_seq(tree, key, data, flags)
595 	BTREE tree;
596 	DBT *key;
597 	DBT *data;
598 	int flags;
599 {
600 	BTREE_P t = (BTREE_P) tree;
601 	BTHEADER *h;
602 	DATUM *d;
603 	int status;
604 
605 	/* do we need to initialize the scan? */
606 	if (flags == R_CURSOR || flags == R_LAST || flags == R_FIRST
607 	    || !(t->bt_flags & BTF_SEQINIT)) {
608 
609 		/* initialize it */
610 		status = _bt_seqinit(t, key, flags);
611 	} else {
612 		/* just advance the current scan pointer */
613 		status = _bt_seqadvance(t, flags);
614 	}
615 
616 	key->size = data->size = 0;
617 	key->data = data->data = (char *) NULL;
618 
619 	h = t->bt_curpage;
620 
621 	/* is there a valid item at the current scan location? */
622 	if (status == RET_SPECIAL) {
623 		if (flags == R_NEXT) {
624 			if (t->bt_cursor.c_index >= NEXTINDEX(h)) {
625 				if (NEXTINDEX(h) > 0)
626 					t->bt_cursor.c_index = NEXTINDEX(h) - 1;
627 				else
628 					t->bt_cursor.c_index = 0;
629 			}
630 		} else {
631 			t->bt_cursor.c_index = 0;
632 		}
633 		return (RET_SPECIAL);
634 	} else if (status == RET_ERROR)
635 		return (RET_ERROR);
636 
637 	/* okay, return the data */
638 	d = (DATUM *) GETDATUM(h, t->bt_cursor.c_index);
639 
640 	return (_bt_buildret(t, d, data, key));
641 }
642 
643 /*
644  *  BT_CLOSE -- Close a btree
645  *
646  *	Parameters:
647  *		tree -- tree to close
648  *
649  *	Returns:
650  *		RET_SUCCESS, RET_ERROR.
651  *
652  *	Side Effects:
653  *		Frees space occupied by the tree.
654  */
655 
656 int
657 bt_close(tree)
658 	BTREE tree;
659 {
660 	BTREE_P t = (BTREE_P) tree;
661 	int i;
662 	BTHEADER *h;
663 	char *cache;
664 	struct HTBUCKET *b, *sb;
665 	HTABLE ht;
666 	int fd;
667 
668 	if (t->bt_cursor.c_key != (char *) NULL)
669 		(void) free(t->bt_cursor.c_key);
670 
671 	if (!ISDISK(t)) {
672 		/* in-memory tree, release hash table memory */
673 		ht = t->bt_s.bt_ht;
674 		for (i = 0; i < HTSIZE; i++) {
675 			if ((b = ht[i]) == (struct HTBUCKET *) NULL)
676 				break;
677 			do {
678 				sb = b;
679 				(void) free((char *) b->ht_page);
680 				b = b->ht_next;
681 				(void) free((char *) sb);
682 			} while (b != (struct HTBUCKET *) NULL);
683 		}
684 		(void) free ((char *) ht);
685 		(void) free ((char *) t);
686 		return (RET_SUCCESS);
687 	}
688 
689 	if ((t->bt_flags & BTF_ISWRITE) && !(t->bt_flags & BTF_METAOK)) {
690 		if (_bt_wrtmeta(t) == RET_ERROR)
691 			return (RET_ERROR);
692 	}
693 
694 	if (t->bt_curpage != (BTHEADER *) NULL) {
695 		h = t->bt_curpage;
696 		if (h->h_flags & F_DIRTY) {
697 			if (_bt_write(t, h, RELEASE) == RET_ERROR)
698 				return (RET_ERROR);
699 		} else {
700 			if (_bt_release(t, h) == RET_ERROR)
701 				return (RET_ERROR);
702 		}
703 
704 		/* flush and free the cache, if there is one */
705 		if (ISCACHE(t)) {
706 			cache = t->bt_s.bt_d.d_cache;
707 			if (lrusync(cache) == RET_ERROR)
708 				return (RET_ERROR);
709 			lrufree(cache);
710 		}
711 		(void) free ((char *) h);
712 	}
713 
714 	fd = t->bt_s.bt_d.d_fd;
715 	(void) free ((char *) t);
716 	return (close(fd));
717 }
718