xref: /csrg-svn/lib/libc/db/btree/bt_open.c (revision 46127)
1 /*-
2  * Copyright (c) 1990 The Regents of the University of California.
3  * All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * Mike Olson.
7  *
8  * %sccs.include.redist.c%
9  */
10 
11 #if defined(LIBC_SCCS) && !defined(lint)
12 static char sccsid[] = "@(#)bt_open.c	5.3 (Berkeley) 01/23/91";
13 #endif /* LIBC_SCCS and not lint */
14 
15 /*
16  *  btree.c -- implementation of btree access method for 4.4BSD.
17  *
18  *	The design here is based on that of the btree access method used
19  *	in the Postgres database system at UC Berkeley.  The implementation
20  *	is wholly independent of the Postgres code.
21  *
22  *	This implementation supports btrees on disk (supply a filename) or
23  *	in memory (don't).  Public interfaces defined here are:
24  *
25  *		btree_open()	-- wrapper; returns a filled DB struct for
26  *				   a btree.
27  *
28  *		bt_open()	-- open a new or existing btree.
29  *		bt_get()	-- fetch data from a tree by key.
30  *		bt_seq()	-- do a sequential scan on a tree.
31  *		bt_put()	-- add data to a tree by key.
32  *		bt_delete()	-- remove data from a tree by key.
33  *		bt_close()	-- close a btree.
34  *		bt_sync()	-- force btree pages to disk (disk trees only).
35  */
36 
37 #include <sys/param.h>
38 #include <sys/file.h>
39 #include <sys/stat.h>
40 #include <sys/signal.h>
41 #include <sys/errno.h>
42 #include <sys/types.h>
43 #include <db.h>
44 #include "btree.h"
45 
46 BTREEINFO _DefaultBTInfo = {
47 	0,	/* flags */
48 	0,	/* cachesize */
49 	0,	/* psize */
50 	strcmp,	/* compare */
51 	0
52 };
53 
54 /*
55  *  BTREE_OPEN -- Wrapper routine to open a btree.
56  *
57  *	Creates and fills a DB struct, and calls the routine that actually
58  *	opens the btree.
59  *
60  *	Parameters:
61  *		f:  filename to open
62  *		flags:  flag bits passed to open
63  *		mode:  permission bits, used if O_CREAT specified
64  *		b:  BTREEINFO pointer
65  *
66  *	Returns:
67  *		Filled-in DBT on success; NULL on failure, with errno
68  *		set as appropriate.
69  *
70  *	Side Effects:
71  *		Allocates memory for the DB struct.
72  */
73 
74 DB *
75 btree_open(f, flags, mode, b)
76 	char *f;
77 	int flags;
78 	int mode;
79 	BTREEINFO *b;
80 {
81 	DB *db;
82 	BTREE t;
83 
84 	if ((db = (DB *) malloc((unsigned) sizeof(DB))) == (DB *) NULL)
85 		return ((DB *) NULL);
86 
87 	if ((t = bt_open(f, flags, mode, b)) == (BTREE) NULL) {
88 		(void) free ((char *) db);
89 		return ((DB *) NULL);
90 	}
91 
92 	db->internal = (char *) t;
93 	db->close = bt_close;
94 	db->delete = bt_delete;
95 	db->get = bt_get;
96 	db->put = bt_put;
97 	db->seq = bt_seq;
98 	db->sync = bt_sync;
99 
100 	return (db);
101 }
102 
103 /*
104  *  BT_OPEN -- Open a btree.
105  *
106  *	This routine creates the correct kind (disk or in-memory) of
107  *	btree and initializes it as required.
108  *
109  *	Parameters:
110  *		f -- name of btree (NULL for in-memory btrees)
111  *		flags -- flags passed to open()
112  *		mode -- mode passed to open ()
113  *		b -- BTREEINFO structure, describing btree
114  *
115  *	Returns:
116  *		(Opaque) pointer to the btree.  On failure, returns NULL
117  *		with errno set as appropriate.
118  *
119  *	Side Effects:
120  *		Allocates memory, opens files.
121  */
122 
123 BTREE
124 bt_open(f, flags, mode, b)
125 	char *f;
126 	int flags;
127 	int mode;
128 	BTREEINFO *b;
129 {
130 	BTREE_P t;
131 	HTABLE ht;
132 	int nbytes;
133 	int fd;
134 	CURSOR *c;
135 	BTMETA m;
136 	struct stat buf;
137 
138 	/* use the default info if none was provided */
139 	if (b == (BTREEINFO *) NULL)
140 		b = &_DefaultBTInfo;
141 
142 	if ((t = (BTREE_P) malloc((unsigned) sizeof *t)) == (BTREE_P) NULL)
143 		return ((BTREE) NULL);
144 
145 	if (b->compare)
146 		t->bt_compare = b->compare;
147 	else
148 		t->bt_compare = strcmp;
149 
150 	t->bt_fname = f;
151 	t->bt_curpage = (BTHEADER *) NULL;
152 	t->bt_free = P_NONE;
153 	c = &(t->bt_cursor);
154 	c->c_pgno = P_NONE;
155 	c->c_index = 0;
156 	c->c_flags = (u_char) NULL;
157 	c->c_key = (char *) NULL;
158 	t->bt_stack = (BTSTACK *) NULL;
159 	t->bt_flags = 0;
160 
161 	/*
162 	 *  If no file name was supplied, this is an in-memory btree.
163 	 *  Otherwise, it's a disk-based btree.
164 	 */
165 	if (f == (char *) NULL) {
166 		/* in memory */
167 		if ((t->bt_psize = b->psize) < MINPSIZE) {
168 			if (t->bt_psize != 0) {
169 				(void) free ((char *) t);
170 				errno = EINVAL;
171 				return ((BTREE) NULL);
172 			}
173 			t->bt_psize = getpagesize();
174 		}
175 
176 		nbytes = HTSIZE * sizeof(HTBUCKET *);
177 		if ((ht = (HTABLE) malloc((unsigned) nbytes))
178 		    == (HTABLE) NULL) {
179 			(void) free((char *) t);
180 			return ((BTREE) NULL);
181 		}
182 		(void) bzero((char *) ht, nbytes);
183 		t->bt_s.bt_ht = ht;
184 		t->bt_npages = 0;
185 		t->bt_lorder = BYTE_ORDER;
186 		if (!(b->flags & R_DUP))
187 			t->bt_flags |= BTF_NODUPS;
188 	} else {
189 		/* on disk */
190 		if ((fd = open(f, O_RDONLY, 0)) < 0) {
191 			/* doesn't exist yet, be sure page is big enough */
192 			if ((t->bt_psize = b->psize) < sizeof(BTHEADER)
193 			    && b->psize != 0) {
194 				(void) free((char *) t);
195 				errno = EINVAL;
196 				return ((BTREE) NULL);
197 			}
198 			if (b->lorder == 0)
199 				b->lorder = BYTE_ORDER;
200 
201 			if (b->lorder != BIG_ENDIAN
202 			    && b->lorder != LITTLE_ENDIAN) {
203 				(void) free((char *) t);
204 				errno = EINVAL;
205 				return ((BTREE) NULL);
206 			}
207 			t->bt_lorder = b->lorder;
208 			if (!(b->flags & R_DUP))
209 				t->bt_flags |= BTF_NODUPS;
210 		} else {
211 			/* exists, get page size from file */
212 			if (read(fd, &m, sizeof(m)) < sizeof(m)) {
213 				(void) close(fd);
214 				(void) free((char *) t);
215 				errno = EINVAL;
216 				return ((BTREE) NULL);
217 			}
218 
219 			/* lorder always stored in host-independent format */
220 			NTOHL(m.m_lorder);
221 			if (m.m_lorder != BIG_ENDIAN
222 			    && m.m_lorder != LITTLE_ENDIAN) {
223 				(void) free((char *) t);
224 				errno = EINVAL;
225 				return ((BTREE) NULL);
226 			}
227 			t->bt_lorder = m.m_lorder;
228 
229 			if (t->bt_lorder != BYTE_ORDER) {
230 				BLSWAP(m.m_magic);
231 				BLSWAP(m.m_version);
232 				BLSWAP(m.m_psize);
233 				BLSWAP(m.m_free);
234 				BLSWAP(m.m_flags);
235 			}
236 
237 			if (m.m_magic != BTREEMAGIC
238 			    || m.m_version != BTREEVERSION
239 			    || m.m_psize < MINPSIZE) {
240 				(void) close(fd);
241 				(void) free((char *) t);
242 #ifndef EFTYPE
243 #define EFTYPE	-100
244 #endif
245 				errno = EFTYPE;
246 				return ((BTREE) NULL);
247 			}
248 			t->bt_psize = m.m_psize;
249 			t->bt_free = m.m_free;
250 			t->bt_flags |= (m.m_flags & BTF_NODUPS) | BTF_METAOK;
251 			(void) close(fd);
252 		}
253 
254 		/* now open the file the way the user wanted it */
255 		if ((t->bt_s.bt_d.d_fd = open(f, flags, mode)) < 0) {
256 			(void) free ((char *) t);
257 			return ((BTREE) NULL);
258 		}
259 
260 		/* get number of pages, page size if necessary */
261 		(void) fstat(t->bt_s.bt_d.d_fd, &buf);
262 		if (t->bt_psize == 0)
263 			t->bt_psize = buf.st_blksize;
264 		t->bt_npages = (pgno_t) (buf.st_size / t->bt_psize);
265 
266 		/* page zero is metadata, doesn't count */
267 		if (t->bt_npages > 0)
268 			--(t->bt_npages);
269 
270 		if (b->cachesize == 0)
271 			b->cachesize = DEFCACHE;
272 
273 		/* get an lru buffer cache, if the user asked for one */
274 		if ((b->cachesize / t->bt_psize) > 0) {
275 			BTDISK *d = &(t->bt_s.bt_d);
276 
277 			d->d_cache = lruinit(d->d_fd,
278 					     (int) (b->cachesize / t->bt_psize),
279 					     (int) t->bt_psize,
280 					     (char *) t->bt_lorder,
281 					     _bt_pgin, _bt_pgout);
282 
283 			if (d->d_cache == (char *) NULL) {
284 				(void) free((char *) t);
285 				return ((BTREE) NULL);
286 			}
287 		}
288 	}
289 
290 	/* remember if tree was opened for write */
291 	if (((flags & O_WRONLY) == O_WRONLY)
292 	    || ((flags & O_RDWR) == O_RDWR))
293 		t->bt_flags |= BTF_ISWRITE;
294 
295 	return ((BTREE) t);
296 }
297 
298 /*
299  *  BT_GET -- Get an entry from a btree.
300  *
301  *	Does a key lookup in the tree to find the specified key, and returns
302  *	the key/data pair found.
303  *
304  *	Parameters:
305  *		tree -- btree in which to do lookup
306  *		key -- key to find
307  *		data -- pointer to DBT in which to return data
308  *		flag -- ignored
309  *
310  *	Returns:
311  *		RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the key is not
312  *		found.  If key is not found, nothing is stored in the
313  *		return DBT 'data'.
314  *
315  *	Side Effects:
316  *		None.
317  *
318  *	Warnings:
319  *		Return data is statically allocated, and will be overwritten
320  *		at the next call.
321  */
322 
323 int
324 bt_get(tree, key, data, flag)
325 	BTREE tree;
326 	DBT *key;
327 	DBT *data;
328 	int flag;
329 {
330 	BTREE_P t = (BTREE_P) tree;
331 	BTHEADER *h;
332 	DATUM *d;
333 	BTITEM *item;
334 
335 #ifdef lint
336 	flag = flag;
337 #endif /* lint */
338 
339 	/* lookup */
340 	item = _bt_search(t, key);
341 	if (item == (BTITEM *) NULL)
342 		return (RET_ERROR);
343 
344 	/* clear parent pointer stack */
345 	while (_bt_pop(t) != P_NONE)
346 		continue;
347 
348 	h = (BTHEADER *) t->bt_curpage;
349 	data->size = 0;
350 	data->data = (char *) NULL;
351 
352 	/* match? */
353 	if (VALIDITEM(t, item)
354 	    && _bt_cmp(t, key->data, item->bti_index) == 0) {
355 		d = (DATUM *) GETDATUM(h, item->bti_index);
356 		return (_bt_buildret(t, d, data, key));
357 	}
358 
359 	/* nope */
360 	return (RET_SPECIAL);
361 }
362 
363 /*
364  *  BT_PUT -- Add an entry to a btree.
365  *
366  *	The specified (key, data) pair is added to the tree.  If the tree
367  *	was created for unique keys only, then duplicates will not be
368  *	entered.  If the requested key exists in the tree, it will be over-
369  *	written unless the flags parameter is R_NOOVERWRITE, in which case
370  *	the update will not be done.  If duplicate keys are permitted in the
371  *	tree, duplicates will be inserted and will not overwrite existing
372  *	keys.  Nodes are split as required.
373  *
374  *	Parameters:
375  *		tree -- btree in which to put the new entry
376  *		key -- key component to add
377  *		data -- data corresponding to key
378  *		flag -- R_NOOVERWRITE or zero.
379  *
380  *	Returns:
381  *		RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the
382  *		NOOVERWRITE flag was set and the specified key exists
383  *		in the database.
384  *
385  *	Side Effects:
386  *		None.
387  */
388 
389 int
390 bt_put(tree, key, data, flag)
391 	BTREE tree;
392 	DBT *key;
393 	DBT *data;
394 	int flag;
395 {
396 	BTREE_P t = (BTREE_P) tree;
397 	BTITEM *item;
398 
399 	/* look for this key in the tree */
400 	item = _bt_search(t, key);
401 
402 	/*
403 	 *  If this tree was originally created without R_DUP, then duplicate
404 	 *  keys are not allowed.  We need to check this at insertion time.
405 	 */
406 
407 	if (VALIDITEM(t, item) && _bt_cmp(t, key->data, item->bti_index) == 0) {
408 		if ((t->bt_flags & BTF_NODUPS) && flag == R_NOOVERWRITE) {
409 			if (_bt_delone(t, item->bti_index) == RET_ERROR)
410 				return (RET_ERROR);
411 		}
412 	}
413 
414 	return (_bt_insert(t, item, key, data, flag));
415 }
416 
417 /*
418  *  BT_DELETE -- delete a key from the tree.
419  *
420  *	Deletes all keys (and their associated data items) matching the
421  *	supplied key from the tree.  If the flags entry is R_CURSOR, then
422  *	the current item in the active scan is deleted.
423  *
424  *	Parameters:
425  *		tree -- btree from which to delete key
426  *		key -- key to delete
427  *		flags -- R_CURSOR or zero
428  *
429  *	Returns:
430  *		RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the specified
431  *		key was not in the tree.
432  *
433  *	Side Effects:
434  *		None.
435  */
436 
437 int
438 bt_delete(tree, key, flags)
439 	BTREE tree;
440 	DBT *key;
441 	int flags;
442 {
443 	BTREE_P t = (BTREE_P) tree;
444 	BTHEADER *h;
445 	BTITEM *item;
446 	int ndel = 0;
447 
448 	if (flags == R_CURSOR)
449 		return (_bt_crsrdel(t));
450 
451 	/* find the first matching key in the tree */
452 	item = _bt_first(t, key);
453 	h = t->bt_curpage;
454 
455 	/* delete all matching keys */
456 	for (;;) {
457 		while (VALIDITEM(t, item)
458 		       && (_bt_cmp(t, key->data, item->bti_index) == 0)) {
459 			if (_bt_delone(t, item->bti_index) == RET_ERROR)
460 				return (RET_ERROR);
461 			ndel++;
462 		}
463 
464 		if (VALIDITEM(t, item) || h->h_nextpg == P_NONE)
465 			break;
466 
467 		/* next page, if necessary */
468 		do {
469 			if (_bt_getpage(t, h->h_nextpg) == RET_ERROR)
470 				return (RET_ERROR);
471 			h = t->bt_curpage;
472 		} while (NEXTINDEX(h) == 0 && h->h_nextpg != P_NONE);
473 
474 		item->bti_pgno = h->h_pgno;
475 		item->bti_index = 0;
476 
477 		if (!VALIDITEM(t, item)
478 		    || _bt_cmp(t, key->data, item->bti_index) != 0)
479 			break;
480 	}
481 
482 	/* clean up the parent stack */
483 	while (_bt_pop(t) != P_NONE)
484 		continue;
485 
486 	/* flush changes to disk */
487 	if (ISDISK(t)) {
488 		if (h->h_flags & F_DIRTY) {
489 			if (_bt_write(t, t->bt_curpage, NORELEASE) == RET_ERROR)
490 				return (RET_ERROR);
491 		}
492 	}
493 
494 	if (ndel == 0)
495 		return (RET_SPECIAL);
496 
497 	return (RET_SUCCESS);
498 }
499 
500 /*
501  *  BT_SYNC -- sync the btree to disk.
502  *
503  *	Parameters:
504  *		tree -- btree to sync.
505  *
506  *	Returns:
507  *		RET_SUCCESS, RET_ERROR.
508  */
509 
510 bt_sync(tree)
511 	BTREE tree;
512 {
513 	BTREE_P t = (BTREE_P) tree;
514 	BTHEADER *h;
515 	pgno_t pgno;
516 
517 	/* if this is an in-memory btree, syncing is a no-op */
518 	if (!ISDISK(t))
519 		return (RET_SUCCESS);
520 
521 	h = (BTHEADER *) t->bt_curpage;
522 	h->h_flags &= ~F_DIRTY;
523 
524 	if (ISCACHE(t)) {
525 		pgno = t->bt_curpage->h_pgno;
526 		if (_bt_write(t, h, RELEASE) == RET_ERROR)
527 			return(RET_ERROR);
528 		if (lrusync(t->bt_s.bt_d.d_cache) < RET_ERROR)
529 			return (RET_ERROR);
530 		if (_bt_getpage(t, pgno) == RET_ERROR)
531 			return (RET_ERROR);
532 	} else {
533 		if (_bt_write(t, h, NORELEASE) == RET_ERROR)
534 			return (RET_ERROR);
535 	}
536 
537 	return (fsync(t->bt_s.bt_d.d_fd));
538 }
539 
540 /*
541  *  BT_SEQ -- Sequential scan interface.
542  *
543  *	This routine supports sequential scans on the btree.  If called with
544  *	flags set to R_CURSOR, or if no seq scan has been initialized in the
545  *	current tree, we initialize the scan.  Otherwise, we advance the scan
546  *	and return the next item.
547  *
548  *	Scans can be either keyed or non-keyed.  Keyed scans basically have
549  *	a starting point somewhere in the middle of the tree.  Non-keyed
550  *	scans start at an endpoint.  Also, scans can be backward (descending
551  *	order), forward (ascending order), or no movement (keep returning
552  *	the same item).
553  *
554  *	Flags is checked every time we enter the routine, so the user can
555  *	change directions on an active scan if desired.  The key argument
556  *	is examined only when we initialize the scan, in order to position
557  *	it properly.
558  *
559  *	Items are returned via the key and data arguments passed in.
560  *
561  *	Parameters:
562  *		tree -- btree in which to do scan
563  *		key -- key, used to position scan on initialization, and
564  *		       used to return key components to the user.
565  *		data -- used to return data components to the user.
566  *		flags -- specify R_CURSOR, R_FIRST, R_LAST, R_NEXT, or
567  *			 R_PREV.
568  *
569  *	Returns:
570  *		RET_SUCCESS, RET_ERROR, or RET_SPECIAL if no more data
571  *		exists in the tree in the specified direction.
572  *
573  *	Side Effects:
574  *		Changes the btree's notion of the current position in the
575  *		scan.
576  *
577  *	Warnings:
578  *		The key and data items returned are static and will be
579  *		overwritten by the next bt_get or bt_seq.
580  */
581 
582 int
583 bt_seq(tree, key, data, flags)
584 	BTREE tree;
585 	DBT *key;
586 	DBT *data;
587 	int flags;
588 {
589 	BTREE_P t = (BTREE_P) tree;
590 	BTHEADER *h;
591 	DATUM *d;
592 	int status;
593 
594 	/* do we need to initialize the scan? */
595 	if (flags == R_CURSOR || flags == R_LAST || flags == R_FIRST
596 	    || !(t->bt_flags & BTF_SEQINIT)) {
597 
598 		/* initialize it */
599 		status = _bt_seqinit(t, key, flags);
600 	} else {
601 		/* just advance the current scan pointer */
602 		status = _bt_seqadvance(t, flags);
603 	}
604 
605 	key->size = data->size = 0;
606 	key->data = data->data = (char *) NULL;
607 
608 	h = t->bt_curpage;
609 
610 	/* is there a valid item at the current scan location? */
611 	if (status == RET_SPECIAL) {
612 		if (flags == R_NEXT) {
613 			if (t->bt_cursor.c_index >= NEXTINDEX(h)) {
614 				if (NEXTINDEX(h) > 0)
615 					t->bt_cursor.c_index = NEXTINDEX(h) - 1;
616 				else
617 					t->bt_cursor.c_index = 0;
618 			}
619 		} else {
620 			t->bt_cursor.c_index = 0;
621 		}
622 		return (RET_SPECIAL);
623 	} else if (status == RET_ERROR)
624 		return (RET_ERROR);
625 
626 	/* okay, return the data */
627 	d = (DATUM *) GETDATUM(h, t->bt_cursor.c_index);
628 
629 	return (_bt_buildret(t, d, data, key));
630 }
631 
632 /*
633  *  BT_CLOSE -- Close a btree
634  *
635  *	Parameters:
636  *		tree -- tree to close
637  *
638  *	Returns:
639  *		RET_SUCCESS, RET_ERROR.
640  *
641  *	Side Effects:
642  *		Frees space occupied by the tree.
643  */
644 
645 int
646 bt_close(tree)
647 	BTREE tree;
648 {
649 	BTREE_P t = (BTREE_P) tree;
650 	int i;
651 	BTHEADER *h;
652 	char *cache;
653 	struct HTBUCKET *b, *sb;
654 	HTABLE ht;
655 	int fd;
656 
657 	if (t->bt_cursor.c_key != (char *) NULL)
658 		(void) free(t->bt_cursor.c_key);
659 
660 	if (!ISDISK(t)) {
661 		/* in-memory tree, release hash table memory */
662 		ht = t->bt_s.bt_ht;
663 		for (i = 0; i < HTSIZE; i++) {
664 			if ((b = ht[i]) == (struct HTBUCKET *) NULL)
665 				break;
666 			do {
667 				sb = b;
668 				(void) free((char *) b->ht_page);
669 				b = b->ht_next;
670 				(void) free((char *) sb);
671 			} while (b != (struct HTBUCKET *) NULL);
672 		}
673 		(void) free ((char *) ht);
674 		(void) free ((char *) t);
675 		return (RET_SUCCESS);
676 	}
677 
678 	if ((t->bt_flags & BTF_ISWRITE) && !(t->bt_flags & BTF_METAOK)) {
679 		if (_bt_wrtmeta(t) == RET_ERROR)
680 			return (RET_ERROR);
681 	}
682 
683 	if (t->bt_curpage != (BTHEADER *) NULL) {
684 		h = t->bt_curpage;
685 		if (h->h_flags & F_DIRTY) {
686 			if (_bt_write(t, h, RELEASE) == RET_ERROR)
687 				return (RET_ERROR);
688 		} else {
689 			if (_bt_release(t, h) == RET_ERROR)
690 				return (RET_ERROR);
691 		}
692 
693 		/* flush and free the cache, if there is one */
694 		if (ISCACHE(t)) {
695 			cache = t->bt_s.bt_d.d_cache;
696 			if (lrusync(cache) == RET_ERROR)
697 				return (RET_ERROR);
698 			lrufree(cache);
699 		}
700 		(void) free ((char *) h);
701 	}
702 
703 	fd = t->bt_s.bt_d.d_fd;
704 	(void) free ((char *) t);
705 	return (close(fd));
706 }
707