xref: /csrg-svn/lib/libc/db/btree/bt_open.c (revision 45879)
1 /*-
2  * Copyright (c) 1990 The Regents of the University of California.
3  * All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * Mike Olson.
7  *
8  * %sccs.include.redist.c%
9  */
10 
11 #if defined(LIBC_SCCS) && !defined(lint)
12 static char sccsid[] = "@(#)bt_open.c	5.2 (Berkeley) 01/08/91";
13 #endif /* LIBC_SCCS and not lint */
14 
15 /*
16  *  btree.c -- implementation of btree access method for 4.4BSD.
17  *
18  *	The design here is based on that of the btree access method used
19  *	in the Postgres database system at UC Berkeley.  The implementation
20  *	is wholly independent of the Postgres code.
21  *
22  *	This implementation supports btrees on disk (supply a filename) or
23  *	in memory (don't).  Public interfaces defined here are:
24  *
25  *		btree_open()	-- wrapper; returns a filled DB struct for
26  *				   a btree.
27  *
28  *		bt_open()	-- open a new or existing btree.
29  *		bt_get()	-- fetch data from a tree by key.
30  *		bt_seq()	-- do a sequential scan on a tree.
31  *		bt_put()	-- add data to a tree by key.
32  *		bt_delete()	-- remove data from a tree by key.
33  *		bt_close()	-- close a btree.
34  *		bt_sync()	-- force btree pages to disk (disk trees only).
35  */
36 
37 #include <sys/param.h>
38 #include <sys/file.h>
39 #include <sys/stat.h>
40 #include <sys/signal.h>
41 #include <sys/errno.h>
42 #include <db.h>
43 
44 #ifndef BLSWAP
45 #define BLSWAP(a) { \
46         register unsigned long _t; \
47         _t = (a); \
48         (a) >>= 24; \
49         (a) |= (_t & 0x00ff0000) >>  8; \
50         (a) |= (_t & 0x0000ff00) <<  8; \
51         (a) |= (_t & 0x000000ff) << 24; \
52 }
53 #endif /* ndef BLSWAP */
54 
55 typedef char	*BTREE;		/* should really be (void *) */
56 
57 /* #define	DEBUG */
58 
59 #define RET_ERROR	-1
60 #define RET_SUCCESS	 0
61 #define RET_SPECIAL	 1
62 
63 #ifndef TRUE
64 #define TRUE	1
65 #define FALSE	0
66 #endif /* ndef TRUE */
67 
68 /* libc */
69 extern char *malloc();
70 
71 /* these are defined in lrucache.c */
72 extern char	*lruinit();
73 extern char	*lruget();
74 extern char	*lrugetnew();
75 extern int	lrusync();
76 extern int	lruwrite();
77 extern int	lrurelease();
78 extern void	lrufree();
79 
80 /* these are defined here */
81 BTREE	bt_open();
82 int	bt_close();
83 int	bt_delete();
84 int	bt_get();
85 int	bt_put();
86 int	bt_seq();
87 int	bt_sync();
88 
89 /*
90  *  Private types.  What you choose for these depends on how big you
91  *  want to let files get, and how big you want to let pages get.
92  */
93 
94 typedef u_long	index_t;	/* so # bytes on a page fits in a long */
95 typedef u_long	pgno_t;		/* so # of pages in a btree fits in a long */
96 
97 /*
98  *  When we do searches, we push the parent page numbers onto a stack
99  *  as we descend the tree.  This is so that for insertions, we can
100  *  find our way back up to do internal page insertions and splits.
101  */
102 
103 typedef struct BTSTACK {
104 	pgno_t		bts_pgno;
105 	struct BTSTACK	*bts_next;
106 } BTSTACK;
107 
108 /*
109  *  Every btree page has a header that looks like this.  Flags are given
110  *  in the #define's for the F_ flags (see below).
111  */
112 
113 typedef struct BTHEADER {
114 	pgno_t h_pgno;		/* page number of this page */
115 	pgno_t h_prevpg;	/* left sibling */
116 	pgno_t h_nextpg;	/* right sibling */
117 
118 #define F_LEAF		0x01	/* leaf page, contains user data */
119 #define F_CONT		0x02	/* continuation page (large items) */
120 #define F_DIRTY		0x04	/* need to write to disk */
121 #define F_PRESERVE	0x08	/* never delete this chain of pages */
122 
123 	u_long h_flags;		/* page state */
124 	index_t h_lower;	/* lower bound of free space on page */
125 	index_t h_upper;	/* upper bound of free space on page */
126 	index_t h_linp[1];	/* VARIABLE LENGTH DATA AT END OF STRUCT */
127 } BTHEADER;
128 
129 /*
130  *  HTBUCKETs are hash table buckets for looking up pages of in-memory
131  *  btrees by page number.  We use this indirection, rather than direct
132  *  pointers, so that the code for manipulating in-memory trees is the
133  *  same as that for manipulating on-disk trees.
134  */
135 
136 typedef struct HTBUCKET {
137 	pgno_t		ht_pgno;
138 	BTHEADER	*ht_page;
139 	struct HTBUCKET	*ht_next;
140 } HTBUCKET;
141 
142 typedef HTBUCKET	**HTABLE;
143 
144 /* minimum size we'll let a page be */
145 #define MINPSIZE	512
146 
147 /* default cache size, in bytes */
148 #define DEFCACHE	(20 * 1024)
149 
150 /* hash table size for in-memory trees */
151 #define	HTSIZE		128
152 
153 /* generate a hash key from a page number */
154 #define HASHKEY(pgno)	((pgno - 1) % HTSIZE)
155 
156 /*
157  *  Disk btrees have a file descriptor, and may also have an lru buffer
158  *  cache, if the user asked for one.
159  */
160 
161 typedef struct BTDISK {
162 	int	d_fd;
163 	char	*d_cache;
164 } BTDISK;
165 
166 /*
167  *  Cursors keep track of the current location in a sequential scan of
168  *  the database.  Since btrees impose a total ordering on keys, we can
169  *  walk forward or backward through the database from any point.  Cursors
170  *  survive updates to the tree, and can be used to delete a particular
171  *  record.
172  */
173 
174 typedef struct CURSOR {
175 	pgno_t		c_pgno;		/* pgno of current item in scan */
176 	index_t		c_index;	/* index of current item in scan */
177 	char		*c_key;		/* current key, used for updates */
178 
179 #define CRSR_BEFORE	0x01
180 
181 	u_char		c_flags;	/* to handle updates properly */
182 } CURSOR;
183 
184 /*
185  *  The private btree data structure.  The user passes a pointer to one of
186  *  these when we are to manipulate a tree, but the BTREE type is opaque
187  *  to him.
188  */
189 
190 typedef struct BTREEDATA_P {
191 	char		*bt_fname;		/* NULL for in-memory trees */
192 	union {
193 		BTDISK	bt_d;			/* for on-disk btrees */
194 		HTABLE	bt_ht;			/* hash table for mem trees */
195 	} bt_s;
196 	size_t		bt_psize;		/* page size for btree pages */
197 	int		(*bt_compare)();	/* key comparison function */
198 	pgno_t		bt_npages;		/* number of pages in tree */
199 	BTHEADER	*bt_curpage;		/* current page contents */
200 	pgno_t		bt_free;		/* free pg list for big data */
201 	CURSOR		bt_cursor;		/* cursor for scans */
202 	BTSTACK		*bt_stack;		/* parent stack for inserts */
203 	u_long		bt_lorder;		/* byte order (endian.h) */
204 
205 #define BTF_METAOK	0x01	/* meta-data written to start of file */
206 #define BTF_SEQINIT	0x02	/* we have called bt_seq */
207 #define BTF_ISWRITE	0x04	/* tree was opened for write */
208 #define BTF_NODUPS	0x08	/* tree created for unique keys */
209 
210 	u_long		bt_flags;		/* btree state */
211 } BTREEDATA_P;
212 
213 typedef BTREEDATA_P	*BTREE_P;
214 
215 /*
216  *  The first thing in a btree file is a BTMETA structure.  The rest of
217  *  the first page is empty, so that all disk operations are page-aligned.
218  */
219 
220 typedef struct BTMETA {
221 	u_long	m_magic;
222 	u_long	m_version;
223 	size_t	m_psize;
224 	pgno_t	m_free;
225 	u_long	m_flags;
226 	u_long	m_lorder;
227 } BTMETA;
228 
229 #define P_NONE		0		/* invalid page number in tree */
230 #define P_ROOT		1		/* page number of root pg in btree */
231 
232 #define NORELEASE	0		/* don't release a page during write */
233 #define RELEASE		1		/* release a page during write */
234 
235 #define INSERT		0		/* doing an insert operation */
236 #define DELETE		1		/* doing a delete operation */
237 
238 /* get the next free index on a btree page */
239 #define NEXTINDEX(p)	((((int)(p)->h_lower) - ((int)((((char *)(&(p)->h_linp[0]))) - ((char *) (p)))))/(sizeof(index_t)))
240 
241 /* is a BTITEM actually on the btree page? */
242 #define VALIDITEM(t, i)	((i)->bti_index < NEXTINDEX((t)->bt_curpage))
243 
244 /* guarantee longword alignment so structure refs work */
245 #define LONGALIGN(p) (((long)(p) + 3) & ~ 0x03)
246 
247 /* get a particular datum (or idatum) off a page */
248 #define GETDATUM(h,i)	 (((char *) h) + h->h_linp[i])
249 
250 /* is a {key,datum} too big to put on a single page? */
251 #define TOOBIG(t, sz)	(sz >= t->bt_psize / 5)
252 
253 /* is this a disk tree or a memory tree? */
254 #define ISDISK(t)	(t->bt_fname != (char *) NULL)
255 
256 /* does the disk tree use a cache? */
257 #define ISCACHE(t)	(t->bt_s.bt_d.d_cache != (char *) NULL)
258 
259 /*
260  *  DATUMs are for user data -- one appears on leaf pages for every
261  *  tree entry.  The d_bytes[] array contains the key first, then the data.
262  *
263  *  If either the key or the datum is too big to store on a single page,
264  *  a bit is set in the flags entry, and the d_bytes[] array contains a
265  *  pgno pointing to the page at which the data is actually stored.
266  *
267  *  Note on alignment:  every DATUM is guaranteed to be longword aligned
268  *  on the disk page.  In order to force longword alignment of user key
269  *  and data values, we must guarantee that the d_bytes[] array starts
270  *  on a longword boundary.  This is the reason that d_flags is a u_long,
271  *  rather than a u_char (it really only needs to be two bits big).  This
272  *  is necessary because we call the user's comparison function with a
273  *  pointer to the start of the d_bytes array.  We don't need to force
274  *  longword alignment of the data following the key, since that is copied
275  *  to a longword-aligned buffer before being returned to the user.
276  */
277 
278 typedef struct DATUM {
279 	size_t d_ksize;		/* size of key */
280 	size_t d_dsize;		/* size of data */
281 
282 #define D_BIGDATA	0x01	/* indirect datum ptr flag */
283 #define D_BIGKEY	0x02	/* indirect key ptr flag */
284 
285 	u_long d_flags;		/* flags (indirect bit) */
286 	char d_bytes[1];	/* VARIABLE LENGTH DATA AT END OF STRUCT */
287 } DATUM;
288 
289 /* BTITEMs are used to return (page, index, datum) tuples from searches */
290 typedef struct BTITEM {
291 	pgno_t bti_pgno;
292 	index_t bti_index;
293 	DATUM *bti_datum;
294 } BTITEM;
295 
296 /*
297  *  IDATUMs are for data stored on internal pages.  This is the (key, pgno)
298  *  pair, such that key 'key' is the first entry on page 'pgno'.  If our
299  *  internal page contains keys (a) and (b) next to each other, then all
300  *  items >= to (a) and < (b) go on the same page as (a).  There are some
301  *  gotchas with duplicate keys, however.  See the split code for details.
302  *
303  *  If a key is too big to fit on a single page, then the i_bytes[] array
304  *  contains a pgno pointing to the start of a chain that actually stores
305  *  the bytes.  Since items on internal pages are never deleted from the
306  *  tree, these indirect chains are marked as special, so that they won't
307  *  be deleted if the corresponding leaf item is deleted.
308  *
309  *  As for DATUMs, IDATUMs have a u_long flag entry (rather than u_char)
310  *  in order to guarantee that user keys are longword aligned on the disk
311  *  page.
312  */
313 
314 typedef struct IDATUM {
315 	size_t i_size;
316 	pgno_t i_pgno;
317 	u_long i_flags;		/* see DATUM.d_flags, above */
318 	char i_bytes[1];	/* VARIABLE LENGTH DATA AT END OF STRUCT */
319 } IDATUM;
320 
321 /* all private interfaces have a leading _ in their names */
322 static BTITEM	*_bt_search();
323 static BTITEM	*_bt_searchr();
324 static BTHEADER	*_bt_allocpg();
325 static index_t	_bt_binsrch();
326 static int	_bt_isonpage();
327 static BTITEM	*_bt_first();
328 static int	_bt_release();
329 static int	_bt_wrtmeta();
330 static int	_bt_delindir();
331 static int	_bt_pgout();
332 static int	_bt_pgin();
333 static int	_bt_fixscan();
334 static int	_bt_indirect();
335 static int	_bt_crsrdel();
336 
337 extern int strcmp();
338 
339 static BTREEINFO _DefaultBTInfo = {
340 	0,	/* flags */
341 	0,	/* cachesize */
342 	0,	/* psize */
343 	strcmp,	/* compare */
344 	0
345 };
346 
347 /*
348  *  BTREE_OPEN -- Wrapper routine to open a btree.
349  *
350  *	Creates and fills a DB struct, and calls the routine that actually
351  *	opens the btree.
352  *
353  *	Parameters:
354  *		f:  filename to open
355  *		flags:  flag bits passed to open
356  *		mode:  permission bits, used if O_CREAT specified
357  *		b:  BTREEINFO pointer
358  *
359  *	Returns:
360  *		Filled-in DBT on success; NULL on failure, with errno
361  *		set as appropriate.
362  *
363  *	Side Effects:
364  *		Allocates memory for the DB struct.
365  */
366 
367 DB *
368 btree_open(f, flags, mode, b)
369 	char *f;
370 	int flags;
371 	int mode;
372 	BTREEINFO *b;
373 {
374 	DB *db;
375 	BTREE t;
376 
377 	if ((db = (DB *) malloc((unsigned) sizeof(DB))) == (DB *) NULL)
378 		return ((DB *) NULL);
379 
380 	if ((t = bt_open(f, flags, mode, b)) == (BTREE) NULL) {
381 		(void) free ((char *) db);
382 		return ((DB *) NULL);
383 	}
384 
385 	db->internal = (char *) t;
386 	db->close = bt_close;
387 	db->delete = bt_delete;
388 	db->get = bt_get;
389 	db->put = bt_put;
390 	db->seq = bt_seq;
391 	db->sync = bt_sync;
392 
393 	return (db);
394 }
395 
396 /*
397  *  BT_OPEN -- Open a btree.
398  *
399  *	This routine creates the correct kind (disk or in-memory) of
400  *	btree and initializes it as required.
401  *
402  *	Parameters:
403  *		f -- name of btree (NULL for in-memory btrees)
404  *		flags -- flags passed to open()
405  *		mode -- mode passed to open ()
406  *		b -- BTREEINFO structure, describing btree
407  *
408  *	Returns:
409  *		(Opaque) pointer to the btree.  On failure, returns NULL
410  *		with errno set as appropriate.
411  *
412  *	Side Effects:
413  *		Allocates memory, opens files.
414  */
415 
416 BTREE
417 bt_open(f, flags, mode, b)
418 	char *f;
419 	int flags;
420 	int mode;
421 	BTREEINFO *b;
422 {
423 	BTREE_P t;
424 	HTABLE ht;
425 	int nbytes;
426 	int fd;
427 	CURSOR *c;
428 	BTMETA m;
429 	struct stat buf;
430 
431 	/* use the default info if none was provided */
432 	if (b == (BTREEINFO *) NULL)
433 		b = &_DefaultBTInfo;
434 
435 	if ((t = (BTREE_P) malloc((unsigned) sizeof *t)) == (BTREE_P) NULL)
436 		return ((BTREE) NULL);
437 
438 	if (b->compare)
439 		t->bt_compare = b->compare;
440 	else
441 		t->bt_compare = strcmp;
442 
443 	t->bt_fname = f;
444 	t->bt_curpage = (BTHEADER *) NULL;
445 	t->bt_free = P_NONE;
446 	c = &(t->bt_cursor);
447 	c->c_pgno = P_NONE;
448 	c->c_index = 0;
449 	c->c_flags = (u_char) NULL;
450 	c->c_key = (char *) NULL;
451 	t->bt_stack = (BTSTACK *) NULL;
452 	t->bt_flags = 0;
453 
454 	/*
455 	 *  If no file name was supplied, this is an in-memory btree.
456 	 *  Otherwise, it's a disk-based btree.
457 	 */
458 	if (f == (char *) NULL) {
459 		/* in memory */
460 		if ((t->bt_psize = b->psize) < MINPSIZE) {
461 			if (t->bt_psize != 0) {
462 				(void) free ((char *) t);
463 				errno = EINVAL;
464 				return ((BTREE) NULL);
465 			}
466 			t->bt_psize = getpagesize();
467 		}
468 
469 		nbytes = HTSIZE * sizeof(HTBUCKET *);
470 		if ((ht = (HTABLE) malloc((unsigned) nbytes))
471 		    == (HTABLE) NULL) {
472 			(void) free((char *) t);
473 			return ((BTREE) NULL);
474 		}
475 		(void) bzero((char *) ht, nbytes);
476 		t->bt_s.bt_ht = ht;
477 		t->bt_npages = 0;
478 		t->bt_lorder = BYTE_ORDER;
479 		if (!(b->flags & R_DUP))
480 			t->bt_flags |= BTF_NODUPS;
481 	} else {
482 		/* on disk */
483 		if ((fd = open(f, O_RDONLY, 0)) < 0) {
484 			/* doesn't exist yet, be sure page is big enough */
485 			if ((t->bt_psize = b->psize) < sizeof(BTHEADER)
486 			    && b->psize != 0) {
487 				(void) free((char *) t);
488 				errno = EINVAL;
489 				return ((BTREE) NULL);
490 			}
491 			if (b->lorder == 0)
492 				b->lorder = BYTE_ORDER;
493 
494 			if (b->lorder != BIG_ENDIAN
495 			    && b->lorder != LITTLE_ENDIAN) {
496 				(void) free((char *) t);
497 				errno = EINVAL;
498 				return ((BTREE) NULL);
499 			}
500 			t->bt_lorder = b->lorder;
501 			if (!(b->flags & R_DUP))
502 				t->bt_flags |= BTF_NODUPS;
503 		} else {
504 			/* exists, get page size from file */
505 			if (read(fd, &m, sizeof(m)) < sizeof(m)) {
506 				(void) close(fd);
507 				(void) free((char *) t);
508 				errno = EINVAL;
509 				return ((BTREE) NULL);
510 			}
511 
512 			/* lorder always stored in host-independent format */
513 			NTOHL(m.m_lorder);
514 			if (m.m_lorder != BIG_ENDIAN
515 			    && m.m_lorder != LITTLE_ENDIAN) {
516 				(void) free((char *) t);
517 				errno = EINVAL;
518 				return ((BTREE) NULL);
519 			}
520 			t->bt_lorder = m.m_lorder;
521 
522 			if (t->bt_lorder != BYTE_ORDER) {
523 				BLSWAP(m.m_magic);
524 				BLSWAP(m.m_version);
525 				BLSWAP(m.m_psize);
526 				BLSWAP(m.m_free);
527 				BLSWAP(m.m_flags);
528 			}
529 
530 			if (m.m_magic != BTREEMAGIC
531 			    || m.m_version != BTREEVERSION
532 			    || m.m_psize < MINPSIZE) {
533 				(void) close(fd);
534 				(void) free((char *) t);
535 #ifndef EFTYPE
536 #define EFTYPE	-100
537 #endif
538 				errno = EFTYPE;
539 				return ((BTREE) NULL);
540 			}
541 			t->bt_psize = m.m_psize;
542 			t->bt_free = m.m_free;
543 			t->bt_flags |= (m.m_flags & BTF_NODUPS) | BTF_METAOK;
544 			(void) close(fd);
545 		}
546 
547 		/* now open the file the way the user wanted it */
548 		if ((t->bt_s.bt_d.d_fd = open(f, flags, mode)) < 0) {
549 			(void) free ((char *) t);
550 			return ((BTREE) NULL);
551 		}
552 
553 		/* get number of pages, page size if necessary */
554 		(void) fstat(t->bt_s.bt_d.d_fd, &buf);
555 		if (t->bt_psize == 0)
556 			t->bt_psize = buf.st_blksize;
557 		t->bt_npages = (pgno_t) (buf.st_size / t->bt_psize);
558 
559 		/* page zero is metadata, doesn't count */
560 		if (t->bt_npages > 0)
561 			--(t->bt_npages);
562 
563 		if (b->cachesize == 0)
564 			b->cachesize = DEFCACHE;
565 
566 		/* get an lru buffer cache, if the user asked for one */
567 		if ((b->cachesize / t->bt_psize) > 0) {
568 			BTDISK *d = &(t->bt_s.bt_d);
569 
570 			d->d_cache = lruinit(d->d_fd,
571 					     b->cachesize / t->bt_psize,
572 					     t->bt_psize,
573 					     (char *) t->bt_lorder,
574 					     _bt_pgin, _bt_pgout);
575 
576 			if (d->d_cache == (char *) NULL) {
577 				(void) free((char *) t);
578 				return ((BTREE) NULL);
579 			}
580 		}
581 	}
582 
583 	/* remember if tree was opened for write */
584 	if (((flags & O_WRONLY) == O_WRONLY)
585 	    || ((flags & O_RDWR) == O_RDWR))
586 		t->bt_flags |= BTF_ISWRITE;
587 
588 	return ((BTREE) t);
589 }
590 
591 /*
592  *  BT_GET -- Get an entry from a btree.
593  *
594  *	Does a key lookup in the tree to find the specified key, and returns
595  *	the key/data pair found.
596  *
597  *	Parameters:
598  *		tree -- btree in which to do lookup
599  *		key -- key to find
600  *		data -- pointer to DBT in which to return data
601  *		flag -- ignored
602  *
603  *	Returns:
604  *		RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the key is not
605  *		found.  If key is not found, nothing is stored in the
606  *		return DBT 'data'.
607  *
608  *	Side Effects:
609  *		None.
610  *
611  *	Warnings:
612  *		Return data is statically allocated, and will be overwritten
613  *		at the next call.
614  */
615 
616 int
617 bt_get(tree, key, data, flag)
618 	BTREE tree;
619 	DBT *key;
620 	DBT *data;
621 	int flag;
622 {
623 	BTREE_P t = (BTREE_P) tree;
624 	BTHEADER *h;
625 	DATUM *d;
626 	BTITEM *item;
627 
628 	/* lookup */
629 	item = _bt_search(t, key);
630 	if (item == (BTITEM *) NULL)
631 		return (RET_ERROR);
632 
633 	/* clear parent pointer stack */
634 	while (_bt_pop(t) != P_NONE)
635 		continue;
636 
637 	h = (BTHEADER *) t->bt_curpage;
638 	data->size = 0;
639 	data->data = (char *) NULL;
640 
641 	/* match? */
642 	if (VALIDITEM(t, item)
643 	    && _bt_cmp(t, key->data, item->bti_index) == 0) {
644 		d = (DATUM *) GETDATUM(h, item->bti_index);
645 		return (_bt_buildret(t, d, data, key));
646 	}
647 
648 	/* nope */
649 	return (RET_SPECIAL);
650 }
651 
652 /*
653  *  BT_PUT -- Add an entry to a btree.
654  *
655  *	The specified (key, data) pair is added to the tree.  If the tree
656  *	was created for unique keys only, then duplicates will not be
657  *	entered.  If the requested key exists in the tree, it will be over-
658  *	written unless the flags parameter is R_NOOVERWRITE, in which case
659  *	the update will not be done.  If duplicate keys are permitted in the
660  *	tree, duplicates will be inserted and will not overwrite existing
661  *	keys.  Nodes are split as required.
662  *
663  *	Parameters:
664  *		tree -- btree in which to put the new entry
665  *		key -- key component to add
666  *		data -- data corresponding to key
667  *		flag -- R_NOOVERWRITE or zero.
668  *
669  *	Returns:
670  *		RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the
671  *		NOOVERWRITE flag was set and the specified key exists
672  *		in the database.
673  *
674  *	Side Effects:
675  *		None.
676  */
677 
678 int
679 bt_put(tree, key, data, flag)
680 	BTREE tree;
681 	DBT *key;
682 	DBT *data;
683 	int flag;
684 {
685 	BTREE_P t = (BTREE_P) tree;
686 	BTITEM *item;
687 
688 	/* look for this key in the tree */
689 	item = _bt_search(t, key);
690 
691 	/*
692 	 *  If this tree was originally created without R_DUP, then duplicate
693 	 *  keys are not allowed.  We need to check this at insertion time.
694 	 */
695 
696 	if (VALIDITEM(t, item) && _bt_cmp(t, key->data, item->bti_index) == 0) {
697 		if ((t->bt_flags & BTF_NODUPS) && flag == R_NOOVERWRITE) {
698 			if (_bt_delone(t, item->bti_index) == RET_ERROR)
699 				return (RET_ERROR);
700 		}
701 	}
702 
703 	return (_bt_insert(t, item, key, data, flag));
704 }
705 
706 /*
707  *  BT_DELETE -- delete a key from the tree.
708  *
709  *	Deletes all keys (and their associated data items) matching the
710  *	supplied key from the tree.  If the flags entry is R_CURSOR, then
711  *	the current item in the active scan is deleted.
712  *
713  *	Parameters:
714  *		tree -- btree from which to delete key
715  *		key -- key to delete
716  *		flags -- R_CURSOR or zero
717  *
718  *	Returns:
719  *		RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the specified
720  *		key was not in the tree.
721  *
722  *	Side Effects:
723  *		None.
724  */
725 
726 int
727 bt_delete(tree, key, flags)
728 	BTREE tree;
729 	DBT *key;
730 	int flags;
731 {
732 	BTREE_P t = (BTREE_P) tree;
733 	BTHEADER *h;
734 	BTITEM *item;
735 	int ndel = 0;
736 
737 	if (flags == R_CURSOR)
738 		return (_bt_crsrdel(t));
739 
740 	/* find the first matching key in the tree */
741 	item = _bt_first(t, key);
742 	h = t->bt_curpage;
743 
744 	/* delete all matching keys */
745 	for (;;) {
746 		while (VALIDITEM(t, item)
747 		       && (_bt_cmp(t, key->data, item->bti_index) == 0)) {
748 			if (_bt_delone(t, item->bti_index) == RET_ERROR)
749 				return (RET_ERROR);
750 			ndel++;
751 		}
752 
753 		if (VALIDITEM(t, item) || h->h_nextpg == P_NONE)
754 			break;
755 
756 		/* next page, if necessary */
757 		do {
758 			if (_bt_getpage(t, h->h_nextpg) == RET_ERROR)
759 				return (RET_ERROR);
760 			h = t->bt_curpage;
761 		} while (NEXTINDEX(h) == 0 && h->h_nextpg != P_NONE);
762 
763 		item->bti_pgno = h->h_pgno;
764 		item->bti_index = 0;
765 
766 		if (!VALIDITEM(t, item)
767 		    || _bt_cmp(t, key->data, item->bti_index) != 0)
768 			break;
769 	}
770 
771 	/* clean up the parent stack */
772 	while (_bt_pop(t) != P_NONE)
773 		continue;
774 
775 	/* flush changes to disk */
776 	if (ISDISK(t)) {
777 		if (h->h_flags & F_DIRTY) {
778 			if (_bt_write(t, t->bt_curpage, NORELEASE) == RET_ERROR)
779 				return (RET_ERROR);
780 		}
781 	}
782 
783 	if (ndel == 0)
784 		return (RET_SPECIAL);
785 
786 	return (RET_SUCCESS);
787 }
788 
789 /*
790  *  _BT_CRSRDEL -- Delete the item pointed to by the cursor.
791  *
792  *	This routine deletes the item most recently returned by a scan
793  *	through the tree.  Since it only makes sense to delete the current
794  *	record once, we make sure that we don't try to delete twice without
795  *	advancing the scan.
796  *
797  *	Parameters:
798  *		t -- tree in which to do deletion
799  *
800  *	Returns:
801  *		RET_SUCCESS, RET_ERROR.
802  *
803  *	Side Effects:
804  *		The call to _bt_delone marks the cursor, so we can tell that
805  *		the current record has been deleted.
806  */
807 
808 static int
809 _bt_crsrdel(t)
810 	BTREE_P t;
811 {
812 	CURSOR *c;
813 
814 	c = &(t->bt_cursor);
815 
816 	/* a cursor must exist, and can't have deleted the current key yet */
817 	if (!(t->bt_flags & BTF_SEQINIT) || (c->c_flags & CRSR_BEFORE)) {
818 		errno = EINVAL;
819 		return (RET_ERROR);
820 	}
821 
822 	if (_bt_getpage(t, c->c_pgno) == RET_ERROR)
823 		return (RET_ERROR);
824 
825 	if (c->c_index >= NEXTINDEX(t->bt_curpage)) {
826 		errno = EINVAL;
827 		return (RET_ERROR);
828 	}
829 
830 	return (_bt_delone(t, c->c_index));
831 }
832 
833 /*
834  *  _BT_DELONE -- Delete a single entry from a btree.
835  *
836  *	This routine physically removes a btree entry from a leaf page.
837  *	IDATUM items are *never* removed from internal nodes, regardless
838  *	of whether the entries that originally caused them to be added
839  *	are removed from the tree or not.  In addition, pages made empty
840  *	by element deletion are not actually reclaimed.  They are,
841  *	however, made available for reuse.
842  *
843  *	To delete an item from a page, we pack the remaining items at
844  *	the end of the page, overwriting the deleted item's entry.  We
845  *	move the line pointers backward on the page, overwriting the
846  *	original item's line pointer.  This guarantees that the space in
847  *	the middle of the page is free -- a property that our insertion
848  *	strategy relies on.
849  *
850  *	This routine doesn't reclaim pages all of whose entries have
851  *	been deleted.  These pages are available for reuse, however.
852  *	If an item is deleted that was too big to fit on a page, then
853  *	the blocks that it occupies are put on a free list for reuse.
854  *
855  *	Parameters:
856  *		t -- btree from which to delete item
857  *		index -- index of entry on current page to delete
858  *
859  *	Returns:
860  *		RET_SUCCESS, RET_ERROR.
861  *
862  *	Side Effects:
863  *		Physically changes page layout, adjusts internal page
864  *		state to reflect the deletion of the item, and updates
865  *		the list of free pages for this tree.
866  */
867 
868 static int
869 _bt_delone(t, index)
870 	BTREE_P t;
871 	index_t index;
872 {
873 	char *src, *dest;
874 	int nbytes, nmoved;
875 	index_t off;
876 	index_t top;
877 	index_t i;
878 	pgno_t chain;
879 	BTHEADER *h;
880 	CURSOR *c;
881 	DATUM *d;
882 
883 	/* deletion may confuse an active scan.  fix it.  */
884 	c = &(t->bt_cursor);
885 	if (t->bt_flags & BTF_SEQINIT && t->bt_curpage->h_pgno == c->c_pgno)
886 		if (_bt_fixscan(t, index, (DATUM *) NULL, DELETE) == RET_ERROR)
887 			return (RET_ERROR);
888 
889 	h = t->bt_curpage;
890 	off = h->h_linp[index];
891 	d = (DATUM *) GETDATUM(h, index);
892 
893 	/* if this is a big item, reclaim the space it occupies */
894 	if (d->d_flags & D_BIGKEY) {
895 		bcopy(&(d->d_bytes[0]),
896 		      (char *) &chain,
897 		      sizeof(chain));
898 		if (_bt_delindir(t, chain) == RET_ERROR)
899 			return (RET_ERROR);
900 		h = t->bt_curpage;
901 		d = (DATUM *) GETDATUM(h, index);
902 	}
903 	if (d->d_flags & D_BIGDATA) {
904 		bcopy(&(d->d_bytes[d->d_ksize]),
905 		      (char *) &chain,
906 		      sizeof(chain));
907 		if (_bt_delindir(t, chain) == RET_ERROR)
908 			return (RET_ERROR);
909 		h = t->bt_curpage;
910 		d = (DATUM *) GETDATUM(h, index);
911 	}
912 
913 	/* move the data down on the page */
914 	nbytes = d->d_ksize + d->d_dsize
915 		 + (sizeof(DATUM) - sizeof(char));
916 	nbytes = LONGALIGN(nbytes);
917 	src = ((char *) h) + h->h_upper;
918 	dest = src + nbytes;
919 	nmoved = (int) (((char *) d) - src);
920 	(void) bcopy(src, dest, nmoved);
921 
922 	/* next move the line pointers up */
923 	src = (char *) &(h->h_linp[index + 1]);
924 	dest = (char *) &(h->h_linp[index]);
925 	nmoved = (int) (((char *) &(h->h_linp[NEXTINDEX(h)])) - src);
926 	(void) bcopy(src, dest, nmoved);
927 
928 	/* remember that we freed up some space */
929 	h->h_upper += nbytes;
930 	h->h_lower -= sizeof(index_t);
931 
932 	/* adjust offsets in line pointers affected by moving the data */
933 	top = NEXTINDEX(h);
934 	for (i = 0; i < top; i++) {
935 		if (h->h_linp[i] < off)
936 			h->h_linp[i] += nbytes;
937 	}
938 
939 	/* it's gone */
940 	h->h_flags |= F_DIRTY;
941 
942 	return (RET_SUCCESS);
943 }
944 
945 /*
946  *  _BT_FIXSCAN -- Adjust a scan to cope with a change in tree structure.
947  *
948  *	If the user has an active scan on the database, and we delete an
949  *	item from the page the cursor is pointing at, we need to figure
950  *	out what to do about it.  Basically, the solution is to point
951  *	"between" keys in the tree, using the CRSR_BEFORE flag.  The
952  *	requirement is that the user should not miss any data in the
953  *	tree during a scan, just because he happened to do some deletions
954  *	or insertions while it was active.
955  *
956  *	In order to guarantee that the scan progresses properly, we need
957  *	to save the key of any deleted item we were pointing at, so that
958  *	we can check later insertions against it.
959  *
960  *	Parameters:
961  *		t -- tree to adjust
962  *		index -- index of item at which change was made
963  *		newd -- new datum (for insertions only)
964  *		op -- operation (DELETE or INSERT) causing change
965  *
966  *	Returns:
967  *		RET_SUCCESS, RET_ERROR (errno is set).
968  *
969  *	Side Effects:
970  *		None.
971  */
972 
973 static int
974 _bt_fixscan(t, index, newd, op)
975 	BTREE_P t;
976 	index_t index;
977 	DATUM *newd;
978 	int op;
979 {
980 	BTHEADER *h;
981 	CURSOR *c;
982 	DATUM *d;
983 
984 	h = t->bt_curpage;
985 	c = &(t->bt_cursor);
986 
987 	if (op == DELETE) {
988 		if (index < c->c_index)
989 			c->c_index--;
990 		else if (index == c->c_index) {
991 			if (!(c->c_flags & CRSR_BEFORE)) {
992 				if (_bt_crsrkey(t) == RET_ERROR)
993 					return (RET_ERROR);
994 				c->c_flags |= CRSR_BEFORE;
995 			}
996 		}
997 	} else {
998 		/*
999 		 *  If we previously deleted the object at this location,
1000 		 *  and now we're inserting a new one, we need to do the
1001 		 *  right thing -- the cursor should come either before
1002 		 *  or after the new item, depending on the key that was
1003 		 *  here, and the new one.
1004 		 */
1005 
1006 		if (c->c_flags & CRSR_BEFORE) {
1007 			if (index <= c->c_index) {
1008 				char *tmp;
1009 				int itmp;
1010 				pgno_t chain;
1011 				int r;
1012 
1013 				d = (DATUM *) GETDATUM(t->bt_curpage, index);
1014 				if (d->d_flags & D_BIGKEY) {
1015 					bcopy(&(newd->d_bytes[0]),
1016 					      (char *) &chain,
1017 					      sizeof(chain));
1018 					if (_bt_getbig(t, chain, &tmp, &itmp)
1019 					     == RET_ERROR)
1020 						return (RET_ERROR);
1021 				} else
1022 					tmp = &(newd->d_bytes[0]);
1023 
1024 				r = (*(t->bt_compare))(tmp, c->c_key);
1025 				if (r < 0)
1026 					c->c_index++;
1027 
1028 				if (d->d_flags & D_BIGKEY)
1029 					(void) free (tmp);
1030 			}
1031 		} else if (index <= c->c_index)
1032 			c->c_index++;
1033 	}
1034 	return (RET_SUCCESS);
1035 }
1036 
1037 /*
1038  *  _BT_CRSRKEY -- Save a copy of the key of the record that the cursor
1039  *		   is pointing to.  The record is about to be deleted.
1040  *
1041  *	Parameters:
1042  *		t -- btree
1043  *
1044  *	Returns:
1045  *		RET_SUCCESS, RET_ERROR.
1046  *
1047  *	Side Effects:
1048  *		Allocates memory for the copy which should be released when
1049  *		it is no longer needed.
1050  */
1051 
1052 static int
1053 _bt_crsrkey(t)
1054 	BTREE_P t;
1055 {
1056 	CURSOR *c;
1057 	DATUM *d;
1058 	pgno_t pgno;
1059 	int ignore;
1060 
1061 	c = &(t->bt_cursor);
1062 	d = (DATUM *) GETDATUM(t->bt_curpage, c->c_index);
1063 
1064 	if (d->d_flags & D_BIGKEY) {
1065 		bcopy(&(d->d_bytes[0]), (char *) &pgno, sizeof(pgno));
1066 		return (_bt_getbig(t, pgno, &(c->c_key), &ignore));
1067 	} else {
1068 		if ((c->c_key = (char *) malloc(d->d_ksize)) == (char *) NULL)
1069 			return (RET_ERROR);
1070 
1071 		bcopy(&(d->d_bytes[0]), c->c_key, d->d_ksize);
1072 	}
1073 	return (RET_SUCCESS);
1074 }
1075 
1076 /*
1077  *  _BT_GETBIG -- Get big data from indirect pages.
1078  *
1079  *	This routine chases indirect blocks for the big object at the
1080  *	specified page to a buffer, and return the address of the buffer.
1081  *
1082  *	Parameters:
1083  *		t -- btree with the indirect blocks
1084  *		pgno -- page number that starts the chain
1085  *		p -- (char **) to get the address of the buffer containing
1086  *		     the key or datum.
1087  *		sz -- pointer to an int to get the size of the instantiated
1088  *		      object.
1089  *
1090  *	Returns:
1091  *		RET_SUCCESS, RET_ERROR.
1092  *
1093  *	Side Effects:
1094  *		None.
1095  */
1096 
1097 static int
1098 _bt_getbig(t, pgno, p, sz)
1099 	BTREE_P t;
1100 	pgno_t pgno;
1101 	char **p;
1102 	int *sz;
1103 {
1104 	pgno_t save;
1105 	size_t nbytes;
1106 	size_t nhere;
1107 	BTHEADER *h;
1108 	char *top, *from, *where;
1109 
1110 	save = t->bt_curpage->h_pgno;
1111 	if (_bt_getpage(t, pgno) == RET_ERROR)
1112 		return (RET_ERROR);
1113 
1114 	h = t->bt_curpage;
1115 
1116 	bcopy((char *) &(h->h_linp[0]), (char *) &nbytes, sizeof(nbytes));
1117 
1118 	if ((*p = (char *) malloc(nbytes)) == (char *) NULL)
1119 		return (RET_ERROR);
1120 
1121 	*sz = nbytes;
1122 	from = ((char *) (&h->h_linp[0])) + sizeof(nbytes);
1123 	top = ((char *) h) + t->bt_psize;
1124 
1125 	/* need more space for data? */
1126 
1127 	where = *p;
1128 
1129 	while (nbytes > 0) {
1130 		nhere = (int) (top - from);
1131 		if (nhere > nbytes) {
1132 			(void) bcopy(from, where, nbytes);
1133 			nbytes = 0;
1134 		} else {
1135 			(void) bcopy(from, where, nhere);
1136 			where += nhere;
1137 			nbytes -= nhere;
1138 			if (_bt_getpage(t, h->h_nextpg) == RET_ERROR)
1139 				return (RET_ERROR);
1140 			h = t->bt_curpage;
1141 			top = ((char *) h) + t->bt_psize;
1142 			from = (char *) &(h->h_linp[0]);
1143 		}
1144 	}
1145 
1146 	if (_bt_getpage(t, save) == RET_ERROR)
1147 		return (RET_ERROR);
1148 
1149 	return (RET_SUCCESS);
1150 }
1151 
1152 /*
1153  *  _BT_DELINDIR -- Delete a chain of indirect blocks from the btree.
1154  *
1155  *	When a large item is deleted from the tree, this routine puts the
1156  *	space that it occupied onto the free list for later reuse.  The
1157  *	bt_free entry in the btree structure points at the head of this list.
1158  *	This value is also stored on disk in the btree's metadata.
1159  *
1160  *	Parameters:
1161  *		t -- btree from which to delete pages
1162  *		chain -- page number that starts the chain.
1163  *
1164  *	Returns:
1165  *		RET_SUCCESS, RET_ERROR.
1166  *
1167  *	Side Effects:
1168  *		Invalidates the current on-disk version of the btree's
1169  *		metadata (if any), and updates the free list appropriately.
1170  */
1171 
1172 static int
1173 _bt_delindir(t, chain)
1174 	BTREE_P t;
1175 	pgno_t chain;
1176 {
1177 	BTHEADER *h;
1178 	pgno_t save;
1179 	pgno_t oldfree;
1180 
1181 	h = t->bt_curpage;
1182 	save = h->h_pgno;
1183 	if (_bt_getpage(t, chain) == RET_ERROR)
1184 		return (RET_ERROR);
1185 
1186 	/*
1187 	 *  If some internal node is pointing at this chain, don't
1188 	 *  delete it.
1189 	 */
1190 
1191 	if (t->bt_curpage->h_flags & F_PRESERVE) {
1192 		if (_bt_getpage(t, save) == RET_ERROR)
1193 			return (RET_ERROR);
1194 		return (RET_SUCCESS);
1195 	}
1196 
1197 	/* if there's nothing on the free list, this is easy... */
1198 	if (t->bt_free == P_NONE) {
1199 		t->bt_free = chain;
1200 	} else {
1201 		oldfree = t->bt_free;
1202 
1203 		/* find the end of the data chain for the deleted datum */
1204 		t->bt_free = chain;
1205 		do {
1206 			if (_bt_getpage(t, chain) == RET_ERROR)
1207 				return (RET_ERROR);
1208 			h = t->bt_curpage;
1209 			if (h->h_nextpg != P_NONE)
1210 				chain = h->h_nextpg;
1211 		} while (h->h_nextpg != P_NONE);
1212 
1213 		/* link freed pages into free list */
1214 		h->h_nextpg = oldfree;
1215 		if (_bt_write(t, h, RELEASE) == RET_ERROR)
1216 			return (RET_ERROR);
1217 		if (_bt_getpage(t, oldfree) == RET_ERROR)
1218 			return (RET_ERROR);
1219 		h = t->bt_curpage;
1220 		h->h_prevpg = chain;
1221 		if (_bt_write(t, h, RELEASE) == RET_ERROR)
1222 			return (RET_ERROR);
1223 	}
1224 
1225 	/* restore the tree's current page pointer */
1226 	if (_bt_getpage(t, save) == RET_ERROR)
1227 		return (RET_ERROR);
1228 
1229 	/* we have trashed the tree metadata; rewrite it later */
1230 	t->bt_flags &= ~BTF_METAOK;
1231 
1232 	return (RET_SUCCESS);
1233 }
1234 
1235 /*
1236  *  _BT_FIRST -- Find the first item in the tree that matches the supplied
1237  *		 key.
1238  *
1239  *	This routine supports deletion.  When the user supplies a key to
1240  *	be deleted, we find the first one, and iteratively delete all the
1241  *	matching ones that follow it.
1242  *
1243  *	Parameters:
1244  *		t -- btree in which to find first occurrence
1245  *		key -- key to find
1246  *
1247  *	Returns:
1248  *		The BTITEM for the matching item.  If there's no match,
1249  *		this may point to the first item > than the supplied key,
1250  *		or off the end of the page.
1251  *
1252  *	Warnings:
1253  *		The BTITEM returned is in static space and will be overwritten
1254  *		by the next search of any kind in any btree.
1255  */
1256 
1257 static BTITEM *
1258 _bt_first(t, key)
1259 	BTREE_P t;
1260 	DBT *key;
1261 {
1262 	BTHEADER *h;
1263 	BTITEM *item;
1264 	index_t next;
1265 	int r;
1266 
1267 	/* find any matching item */
1268 	item = _bt_search(t, key);
1269 	h = t->bt_curpage;
1270 	next = NEXTINDEX(h);
1271 
1272 	/* if we're off the end of the page, search failed and we're done */
1273 	if (item->bti_index >= next)
1274 		return (item);
1275 
1276 	/* as long as we have an exact match, walk backwards */
1277 	while ((r = _bt_cmp(t, key->data, item->bti_index)) == 0) {
1278 
1279 		/* at start of page? */
1280 		if (item->bti_index == 0) {
1281 
1282 			/* if no prev page, we're done */
1283 			if (h->h_prevpg == P_NONE)
1284 				return (item);
1285 
1286 			/* walk backward, skipping empty pages */
1287 			do {
1288 				if (_bt_getpage(t, h->h_prevpg) == RET_ERROR)
1289 					return ((BTITEM *) NULL);
1290 				h = t->bt_curpage;
1291 			} while (NEXTINDEX(h) == 0 && h->h_prevpg != P_NONE);
1292 
1293 			if (NEXTINDEX(h) != 0)
1294 				item->bti_index = NEXTINDEX(h) - 1;
1295 			else
1296 				item->bti_index = 0;
1297 
1298 			item->bti_pgno = h->h_pgno;
1299 		} else {
1300 			item->bti_index--;
1301 		}
1302 	}
1303 
1304 	/* if we went too far backwards, step forward one entry */
1305 	if (r > 0) {
1306 		if (++(item->bti_index) >= NEXTINDEX(h)
1307 		    && h->h_nextpg != P_NONE) {
1308 
1309 			/* walk forward, skipping empty pages */
1310 			do {
1311 				if (_bt_getpage(t, h->h_nextpg) == RET_ERROR)
1312 					return ((BTITEM *) NULL);
1313 				h = t->bt_curpage;
1314 			} while (h->h_nextpg != P_NONE && NEXTINDEX(h) == 0);
1315 
1316 			item->bti_index = 0;
1317 			item->bti_pgno = h->h_pgno;
1318 		}
1319 	}
1320 
1321 	/* got it */
1322 	return (item);
1323 }
1324 
1325 /*
1326  *  _BT_SEARCH, _BT_SEARCHR -- Search for a particular key in the tree.
1327  *
1328  *	Parameters:
1329  *		t -- btree in which to search
1330  *		key -- key to find
1331  *
1332  *	Returns:
1333  *		BTITEM for matching item, if any, or the BTITEM for the
1334  *		location of the key, if it were in the tree.
1335  *
1336  *	Warnings:
1337  *		The BTITEM returned is in static memory, and will be
1338  *		overwritten by the next search of any kind in any tree.
1339  */
1340 
1341 static BTITEM *
1342 _bt_search(t, key)
1343 	BTREE_P t;
1344 	DBT *key;
1345 {
1346 	/* we want to start all of our searches at the root */
1347 	if (_bt_getpage(t, P_ROOT) == RET_ERROR)
1348 		return ((BTITEM *) NULL);
1349 
1350 	return (_bt_searchr(t, key));
1351 }
1352 
1353 static BTITEM *
1354 _bt_searchr(t, key)
1355 	BTREE_P t;
1356 	DBT *key;
1357 {
1358 	BTHEADER *h = t->bt_curpage;
1359 	index_t index;
1360 	IDATUM *id;
1361 	DATUM *d;
1362 	static BTITEM item;
1363 
1364 	/* do a binary search on the current page */
1365 	index = _bt_binsrch(t, &(key->data[0]));
1366 
1367 	/*
1368 	 *  At this point, the binary search terminated because the endpoints
1369 	 *  got too close together, or we have a match.  Figure out which
1370 	 *  case applies and decide what to do based on the page type.
1371 	 */
1372 	if (h->h_flags & F_LEAF) {
1373 		item.bti_pgno = h->h_pgno;
1374 		item.bti_index = index;
1375 		if (index < NEXTINDEX(h))
1376 			d = (DATUM *) GETDATUM(h,index);
1377 		else
1378 			d = (DATUM *) NULL;
1379 
1380 		item.bti_datum = d;
1381 		return(&item);
1382 	} else {
1383 		id = (IDATUM *) GETDATUM(h, index);
1384 		_bt_push(t, h->h_pgno);
1385 		if (_bt_getpage(t, id->i_pgno) == RET_ERROR)
1386 			return ((BTITEM *) NULL);
1387 		return (_bt_searchr(t, key));
1388 	}
1389 }
1390 
1391 /*
1392  *  BT_GETPAGE -- Make pgno the current page of the btree.
1393  *
1394  *	This routine is just a wrapper that decides whether to call the
1395  *	memory or disk-based routine to do the work.
1396  *
1397  *	Parameters:
1398  *		t -- btree in which to get page
1399  *		pgno -- page number to get
1400  *
1401  *	Returns:
1402  *		RET_SUCCESS or RET_ERROR.
1403  */
1404 
1405 static int
1406 _bt_getpage(t, pgno)
1407 	BTREE_P t;
1408 	pgno_t pgno;
1409 {
1410 #ifdef DEBUG
1411 	if (pgno == P_NONE)
1412 		_punt();
1413 #endif /* DEBUG */
1414 
1415 	/* see if we can get away without doing any work */
1416 	if (t->bt_curpage != (BTHEADER *) NULL) {
1417 		if (t->bt_curpage->h_pgno == pgno)
1418 			return (RET_SUCCESS);
1419 	}
1420 
1421 	if (t->bt_fname == (char *) NULL)
1422 		return (_bt_getmpage(t, pgno));
1423 	else
1424 		return (_bt_getdpage(t, pgno));
1425 }
1426 
1427 /*
1428  *  _BT_GETMPAGE -- Make pgno the current page of the btree.
1429  *
1430  *	This routine gets pages for in-memory btrees.
1431  *
1432  *	Parameters:
1433  *		t -- btree in which to get page
1434  *		pgno -- page number to get
1435  *
1436  *	Returns:
1437  *		RET_SUCCESS or RET_ERROR.
1438  */
1439 
1440 static int
1441 _bt_getmpage(t, pgno)
1442 	register BTREE_P t;
1443 	pgno_t pgno;
1444 {
1445 	int htindex;
1446 	int nbytes;
1447 	BTHEADER *h;
1448 	HTBUCKET *b;
1449 
1450 	if (t->bt_curpage == (BTHEADER *) NULL) {
1451 		if (pgno != P_ROOT) {
1452 			errno = EBADF;
1453 			return (RET_ERROR);
1454 		}
1455 
1456 		t->bt_npages++;
1457 		h = (BTHEADER *) malloc((unsigned) t->bt_psize);
1458 		if (h == (BTHEADER *) NULL)
1459 			return (RET_ERROR);
1460 
1461 		h->h_pgno = P_ROOT;
1462 		h->h_flags = F_LEAF;
1463 		h->h_lower = (index_t)
1464 				(((char *) &(h->h_linp[0])) - ((char *) h));
1465 		h->h_upper = t->bt_psize;
1466 		h->h_prevpg = h->h_nextpg = P_NONE;
1467 
1468 		t->bt_curpage = h;
1469 
1470 		/* get the root page into the hash table */
1471 		if (_bt_write(t, h, RELEASE) == RET_ERROR)
1472 			return (RET_ERROR);
1473 	}
1474 
1475 	htindex = HASHKEY(pgno);
1476 
1477 	for (b = t->bt_s.bt_ht[htindex];
1478 	     b != (HTBUCKET *) NULL;
1479 	     b = b->ht_next) {
1480 		if (b->ht_pgno == pgno) {
1481 			t->bt_curpage = b->ht_page;
1482 			return (RET_SUCCESS);
1483 		}
1484 	}
1485 	return (RET_ERROR);
1486 }
1487 
1488 /*
1489  *  _BT_GETDPAGE -- Make pgno the current page of the btree.
1490  *
1491  *	This routine gets pages for disk btrees.
1492  *
1493  *	Because disk btree pages must be readable across machine architectures,
1494  *	the btree code writes integers out in network format.  This routine
1495  *	converts them back to host format before returning the page.
1496  *
1497  *	Parameters:
1498  *		t -- btree in which to get page
1499  *		pgno -- page number to get
1500  *
1501  *	Returns:
1502  *		RET_SUCCESS, RET_ERROR.
1503  */
1504 
1505 static int
1506 _bt_getdpage(t, pgno)
1507 	register BTREE_P t;
1508 	pgno_t pgno;
1509 {
1510 	BTHEADER *h;
1511 	char *cache;
1512 	long pos;
1513 	int n, nbytes;
1514 
1515 	/* if we have an lru cache, let the cache code do the work */
1516 	if (ISCACHE(t)) {
1517 		cache = t->bt_s.bt_d.d_cache;
1518 
1519 		/* release the old page */
1520 		if (t->bt_curpage != (BTHEADER *) NULL) {
1521 			pgno_t opgno = t->bt_curpage->h_pgno;
1522 			t->bt_curpage->h_flags &= ~F_DIRTY;
1523 
1524 			if (lruwrite(cache, opgno) < 0)
1525 				return (RET_ERROR);
1526 
1527 			lrurelease(cache, opgno);
1528 		}
1529 
1530 		if (pgno > t->bt_npages) {
1531 			if ((h = (BTHEADER *) lrugetnew(cache, pgno, &nbytes))
1532 			    == (BTHEADER *) NULL)
1533 				return (RET_ERROR);
1534 			t->bt_npages = pgno;
1535 		} else {
1536 			if ((h = (BTHEADER *) lruget(cache, pgno, &nbytes))
1537 			    == (BTHEADER *) NULL)
1538 				return (RET_ERROR);
1539 		}
1540 
1541 		/* init this page, if necessary */
1542 		if (nbytes == 0) {
1543 			h->h_pgno = pgno;
1544 			h->h_flags = F_LEAF;
1545 			h->h_lower = (index_t)
1546 				(((char *) &(h->h_linp[0])) - ((char *) h));
1547 			h->h_upper = t->bt_psize;
1548 			h->h_prevpg = h->h_nextpg = P_NONE;
1549 		}
1550 
1551 		t->bt_curpage = h;
1552 
1553 		return (RET_SUCCESS);
1554 	}
1555 
1556 	/* sync the current page, if necessary */
1557 	if (t->bt_curpage != (BTHEADER *) NULL) {
1558 		if (t->bt_curpage->h_flags & F_DIRTY)
1559 			if (_bt_write(t, t->bt_curpage, RELEASE) == RET_ERROR)
1560 				return (RET_ERROR);
1561 	} else {
1562 		if (t->bt_npages == 0)
1563 			t->bt_npages = 1;
1564 
1565 		/* if no current page, get space for one */
1566 		if ((t->bt_curpage = (BTHEADER *) malloc((unsigned) t->bt_psize))
1567 		    == (BTHEADER *) NULL) {
1568 			return (RET_ERROR);
1569 		}
1570 	}
1571 
1572 	n = t->bt_psize;
1573 	pos = (long) (pgno * n);
1574 
1575 	/* seek to correct location in file */
1576 	if (lseek(t->bt_s.bt_d.d_fd, pos, L_SET) != pos) {
1577 		return (RET_ERROR);
1578 	}
1579 
1580 	/* read the page */
1581 	if ((nbytes = read(t->bt_s.bt_d.d_fd, t->bt_curpage, n)) < n) {
1582 
1583 		/*
1584 		 *  If we didn't get a full page, we must have gotten no
1585 		 *  data at all -- in which case we're trying to read a
1586 		 *  root page that doesn't exist yet.  This is the only
1587 		 *  case in which this is okay.  If this happens, construct
1588 		 *  an empty root page by hand.
1589 		 */
1590 		if (nbytes != 0 || pgno != P_ROOT) {
1591 			errno = EBADF;
1592 			return (RET_ERROR);
1593 		}
1594 
1595 		h = (BTHEADER *) t->bt_curpage;
1596 		h->h_pgno = pgno;
1597 		h->h_flags = F_LEAF;
1598 		h->h_lower = (index_t)
1599 				(((char *) &(h->h_linp[0])) - ((char *) h));
1600 		h->h_upper = t->bt_psize;
1601 		h->h_prevpg = h->h_nextpg = P_NONE;
1602 	} else
1603 		(void) _bt_pgin(t->bt_curpage, (char *) t->bt_lorder);
1604 
1605 	return (RET_SUCCESS);
1606 }
1607 
1608 /*
1609  *  _BT_PGOUT, _BT_PGIN -- Convert host-specific number layout to/from
1610  *			   the host-independent format stored on disk.
1611  *
1612  *	Parameters:
1613  *		h -- page to convert
1614  *		_lorder -- byte order for pages (stored as a char * in the
1615  *			   cache, and passed around as a magic cookie).
1616  *
1617  *	Returns:
1618  *		RET_SUCCESS (lru code requires a return value).
1619  *
1620  *	Side Effects:
1621  *		Layout of tree metadata on the page is changed in place.
1622  *
1623  *	Warnings:
1624  *		Everywhere else in the code, the types pgno_t and index_t
1625  *		are opaque.  These two routines know what they really
1626  *		are.
1627  */
1628 
1629 static int
1630 _bt_pgout(h, _lorder)
1631 	BTHEADER *h;
1632 	char *_lorder;
1633 {
1634 	int i;
1635 	int top;
1636 	int lorder;
1637 	DATUM *d;
1638 	IDATUM *id;
1639 	size_t chain;
1640 
1641 	lorder = (int) _lorder;
1642 	if (lorder == BYTE_ORDER)
1643 		return (RET_SUCCESS);
1644 
1645 	if (h->h_flags & F_LEAF) {
1646 		if (h->h_flags & F_CONT) {
1647 			if (h->h_prevpg == P_NONE) {
1648 				size_t longsz;
1649 
1650 				(void) bcopy((char *) &(h->h_linp[0]),
1651 					      (char *) &longsz,
1652 					      sizeof(longsz));
1653 				BLSWAP(longsz);
1654 				(void) bcopy((char *) &longsz,
1655 					      (char *) &(h->h_linp[0]),
1656 					      sizeof(longsz));
1657 			}
1658 		} else {
1659 			top = NEXTINDEX(h);
1660 			for (i = 0; i < top; i++) {
1661 				d = (DATUM *) GETDATUM(h, i);
1662 				if (d->d_flags & D_BIGKEY) {
1663 					(void) bcopy((char *) &(d->d_bytes[0]),
1664 						      (char *) &chain,
1665 						      sizeof(chain));
1666 					BLSWAP(chain);
1667 					(void) bcopy((char *) &chain,
1668 						      (char *) &(d->d_bytes[0]),
1669 						      sizeof(chain));
1670 				}
1671 				if (d->d_flags & D_BIGDATA) {
1672 					(void) bcopy((char *) &(d->d_bytes[d->d_ksize]),
1673 						      (char *) &chain,
1674 						      sizeof(chain));
1675 					BLSWAP(chain);
1676 					(void) bcopy((char *) &chain,
1677 						      (char *) &(d->d_bytes[d->d_ksize]),
1678 						      sizeof(chain));
1679 				}
1680 				BLSWAP(d->d_dsize);
1681 				BLSWAP(d->d_ksize);
1682 				BLSWAP(d->d_flags);
1683 				BLSWAP(h->h_linp[i]);
1684 			}
1685 		}
1686 	} else {
1687 		top = NEXTINDEX(h);
1688 		for (i = 0; i < top; i++) {
1689 			id = (IDATUM *) GETDATUM(h, i);
1690 			BLSWAP(id->i_size);
1691 			BLSWAP(id->i_pgno);
1692 			BLSWAP(id->i_flags);
1693 			if (id->i_flags & D_BIGKEY) {
1694 				(void) bcopy((char *) &(id->i_bytes[0]),
1695 					      (char *) &chain,
1696 					      sizeof(chain));
1697 				BLSWAP(chain);
1698 				(void) bcopy((char *) &chain,
1699 					      (char *) &(id->i_bytes[0]),
1700 					      sizeof(chain));
1701 			}
1702 			BLSWAP(h->h_linp[i]);
1703 		}
1704 	}
1705 	BLSWAP(h->h_flags);
1706 	BLSWAP(h->h_pgno);
1707 	BLSWAP(h->h_prevpg);
1708 	BLSWAP(h->h_nextpg);
1709 	BLSWAP(h->h_lower);
1710 	BLSWAP(h->h_upper);
1711 
1712 	return (RET_SUCCESS);
1713 }
1714 
1715 static int
1716 _bt_pgin(h, _lorder)
1717 	BTHEADER *h;
1718 	char *_lorder;
1719 {
1720 	int i;
1721 	int top;
1722 	int lorder;
1723 	DATUM *d;
1724 	IDATUM *id;
1725 	size_t chain;
1726 
1727 	/*
1728 	 *  If btree pages are to be stored in the host byte order, don't
1729 	 *  bother swapping.
1730 	 */
1731 	lorder = (int) _lorder;
1732 	if (lorder == BYTE_ORDER)
1733 		return (RET_SUCCESS);
1734 
1735 	BLSWAP(h->h_upper);
1736 	BLSWAP(h->h_lower);
1737 	BLSWAP(h->h_nextpg);
1738 	BLSWAP(h->h_prevpg);
1739 	BLSWAP(h->h_pgno);
1740 	BLSWAP(h->h_flags);
1741 
1742 	if (h->h_flags & F_LEAF) {
1743 		if (h->h_flags & F_CONT) {
1744 			if (h->h_prevpg == P_NONE) {
1745 				size_t longsz;
1746 
1747 				(void) bcopy((char *) &(h->h_linp[0]),
1748 					      (char *) &longsz,
1749 					      sizeof(longsz));
1750 				BLSWAP(longsz);
1751 				(void) bcopy((char *) &longsz,
1752 					      (char *) &(h->h_linp[0]),
1753 					      sizeof(longsz));
1754 			}
1755 		} else {
1756 			top = NEXTINDEX(h);
1757 			for (i = 0; i < top; i++) {
1758 				BLSWAP(h->h_linp[i]);
1759 				d = (DATUM *) GETDATUM(h, i);
1760 				BLSWAP(d->d_dsize);
1761 				BLSWAP(d->d_ksize);
1762 				BLSWAP(d->d_flags);
1763 				if (d->d_flags & D_BIGKEY) {
1764 					(void) bcopy((char *) &(d->d_bytes[0]),
1765 						      (char *) &chain,
1766 						      sizeof(chain));
1767 					BLSWAP(chain);
1768 					(void) bcopy((char *) &chain,
1769 						      (char *) &(d->d_bytes[0]),
1770 						      sizeof(chain));
1771 				}
1772 				if (d->d_flags & D_BIGDATA) {
1773 					(void) bcopy((char *) &(d->d_bytes[d->d_ksize]),
1774 						      (char *) &chain,
1775 						      sizeof(chain));
1776 					BLSWAP(chain);
1777 					(void) bcopy((char *) &chain,
1778 						      (char *) &(d->d_bytes[d->d_ksize]),
1779 						      sizeof(chain));
1780 				}
1781 			}
1782 		}
1783 	} else {
1784 		top = NEXTINDEX(h);
1785 		for (i = 0; i < top; i++) {
1786 			BLSWAP(h->h_linp[i]);
1787 			id = (IDATUM *) GETDATUM(h, i);
1788 			BLSWAP(id->i_size);
1789 			BLSWAP(id->i_pgno);
1790 			BLSWAP(id->i_flags);
1791 			if (id->i_flags & D_BIGKEY) {
1792 				(void) bcopy((char *) &(id->i_bytes[0]),
1793 					      (char *) &chain,
1794 					      sizeof(chain));
1795 				BLSWAP(chain);
1796 				(void) bcopy((char *) &chain,
1797 					      (char *) &(id->i_bytes[0]),
1798 					      sizeof(chain));
1799 			}
1800 		}
1801 	}
1802 	return (RET_SUCCESS);
1803 }
1804 
1805 /*
1806  *  BT_SYNC -- sync the btree to disk.
1807  *
1808  *	Parameters:
1809  *		tree -- btree to sync.
1810  *
1811  *	Returns:
1812  *		RET_SUCCESS, RET_ERROR.
1813  */
1814 
1815 bt_sync(tree)
1816 	BTREE tree;
1817 {
1818 	BTREE_P t = (BTREE_P) tree;
1819 	BTHEADER *h;
1820 	pgno_t pgno;
1821 
1822 	/* if this is an in-memory btree, syncing is a no-op */
1823 	if (!ISDISK(t))
1824 		return (RET_SUCCESS);
1825 
1826 	h = (BTHEADER *) t->bt_curpage;
1827 	h->h_flags &= ~F_DIRTY;
1828 
1829 	if (ISCACHE(t)) {
1830 		pgno = t->bt_curpage->h_pgno;
1831 		if (_bt_write(t, h, RELEASE) == RET_ERROR)
1832 			return(RET_ERROR);
1833 		if (lrusync(t->bt_s.bt_d.d_cache) < RET_ERROR)
1834 			return (RET_ERROR);
1835 		if (_bt_getpage(t, pgno) == RET_ERROR)
1836 			return (RET_ERROR);
1837 	} else {
1838 		if (_bt_write(t, h, NORELEASE) == RET_ERROR)
1839 			return (RET_ERROR);
1840 	}
1841 
1842 	return (fsync(t->bt_s.bt_d.d_fd));
1843 }
1844 
1845 /*
1846  *  _BT_INSERT -- Insert a new user datum in the btree.
1847  *
1848  *	This routine is called by bt_put, the public interface, once the
1849  *	location for the new item is known.  We do the work here, and
1850  *	handle splits if necessary.
1851  *
1852  *	Parameters:
1853  *		t -- btree in which to do the insertion.
1854  *		item -- BTITEM describing location of new datum
1855  *		key -- key to insert
1856  *		data -- data associated with key
1857  *		flag -- magic cookie passed recursively to bt_put if we
1858  *			have to do a split
1859  *
1860  *	Returns:
1861  *		RET_SUCCESS, RET_ERROR.
1862  */
1863 
1864 static int
1865 _bt_insert(t, item, key, data, flag)
1866 	BTREE_P t;
1867 	BTITEM *item;
1868 	DBT *key;
1869 	DBT *data;
1870 	int flag;
1871 {
1872 	index_t index;
1873 	BTHEADER *h;
1874 	DATUM *d;
1875 	int nbytes;
1876 	int status;
1877 	pgno_t pgno;
1878 	int keysize, datasize;
1879 	int bigkey, bigdata;
1880 
1881 	if (_bt_getpage(t, item->bti_pgno) == RET_ERROR)
1882 		return (RET_ERROR);
1883 	h = t->bt_curpage;
1884 
1885 	if (TOOBIG(t, data->size)) {
1886 		bigdata = TRUE;
1887 		datasize = sizeof(pgno_t);
1888 	} else {
1889 		bigdata = FALSE;
1890 		datasize = data->size;
1891 	}
1892 
1893 	if (TOOBIG(t, key->size)) {
1894 		bigkey = TRUE;
1895 		keysize = sizeof(pgno_t);
1896 	} else {
1897 		bigkey = FALSE;
1898 		keysize = key->size;
1899 	}
1900 
1901 	nbytes = keysize + datasize + (sizeof(DATUM) - sizeof(char));
1902 	nbytes = LONGALIGN(nbytes) + sizeof(index_t);
1903 
1904 	/* if there's not enough room here, split the page */
1905 	if ((h->h_upper - h->h_lower) < nbytes) {
1906 		if (_bt_split(t) == RET_ERROR)
1907 			return (RET_ERROR);
1908 
1909 		/* okay, try again */
1910 		return (bt_put(t, key, data, flag));
1911 	}
1912 
1913 	/* put together a leaf page datum from the key/data pair */
1914 	index = item->bti_index;
1915 	nbytes = keysize + datasize + (sizeof(DATUM) - sizeof(char));
1916 
1917 	if ((d = (DATUM *) malloc((unsigned) nbytes)) == (DATUM *) NULL)
1918 		return (RET_ERROR);
1919 
1920 	d->d_ksize = keysize;
1921 	d->d_dsize = datasize;
1922 	d->d_flags = 0;
1923 
1924 	if (bigkey) {
1925 		if (_bt_indirect(t, key, &pgno) == RET_ERROR)
1926 			return (RET_ERROR);
1927 		(void) bcopy((char *) &pgno, &(d->d_bytes[0]), sizeof(pgno));
1928 		d->d_flags |= D_BIGKEY;
1929 		if (_bt_getpage(t, item->bti_pgno) == RET_ERROR)
1930 			return (RET_ERROR);
1931 	} else {
1932 		if (d->d_ksize > 0) {
1933 			(void) bcopy((char *) key->data,
1934 				      (char *) &(d->d_bytes[0]),
1935 				      (int) d->d_ksize);
1936 		}
1937 	}
1938 
1939 	if (bigdata) {
1940 		if (_bt_indirect(t, data, &pgno) == RET_ERROR)
1941 			return (RET_ERROR);
1942 		(void) bcopy((char *) &pgno,
1943 			     &(d->d_bytes[keysize]),
1944 			     sizeof(pgno));
1945 		d->d_flags |= D_BIGDATA;
1946 		if (_bt_getpage(t, item->bti_pgno) == RET_ERROR)
1947 			return (RET_ERROR);
1948 	} else {
1949 		if (d->d_dsize > 0) {
1950 			(void) bcopy((char *) data->data,
1951 				      (char *) &(d->d_bytes[keysize]),
1952 				      (int) d->d_dsize);
1953 		}
1954 	}
1955 
1956 	/* do the insertion */
1957 	status = _bt_insertat(t, d, index);
1958 
1959 	(void) free((char *) d);
1960 
1961 	return (status);
1962 }
1963 
1964 /*
1965  *  _BT_INDIRECT -- Write a series of indirect pages for big objects.
1966  *
1967  *	A chain of indirect pages looks like
1968  *
1969  *	   +-------------------+   +---------------------+
1970  *	   |hdr|size|	       |   |hdr|		 |
1971  *	   +---+----+	       |   +---+		 |
1972  *	   |   ... data ...    |   |   ... data ...	 |    ...
1973  *	   |		       |   |			 |
1974  *	   +-------------------+   +---------------------+
1975  *
1976  *	where hdr is a standard btree page header (with the indirect bit
1977  *	set), size on the first page is the real size of the datum, and
1978  *	data are bytes of the datum, split across as many pages as necessary.
1979  *	Indirect pages are chained together with the h_prevpg and h_nextpg
1980  *	entries of the page header struct.
1981  *
1982  *	A single DBT is written per chain, so space on the last page is
1983  *	wasted.
1984  *
1985  *	We return the page number of the start of the chain.
1986  *
1987  *	When a big object is deleted from a tree, the space that it occupied
1988  *	is placed on a free list for later reuse.  This routine checks that
1989  *	free list before allocating new pages to the big datum being inserted.
1990  *
1991  *	Parameters:
1992  *		t -- btree in which to store indirect blocks
1993  *		data -- DBT with the big datum in it
1994  *		pgno -- place to put the starting page number of the chain
1995  *
1996  *	Returns:
1997  *		RET_SUCCESS, RET_ERROR.
1998  *
1999  *	Side Effects:
2000  *		Current page is changed on return.
2001  */
2002 
2003 static int
2004 _bt_indirect(t, data, pgno)
2005 	BTREE_P t;
2006 	DBT *data;
2007 	pgno_t *pgno;
2008 {
2009 	pgno_t save;
2010 	pgno_t prev;
2011 	char *top;
2012 	char *where;
2013 	char *from;
2014 	size_t dsize;
2015 	pgno_t nextchn;
2016 	int ischain;
2017 	BTHEADER *cur;
2018 
2019 	/* get set for first page in chain */
2020 	prev = P_NONE;
2021 	dsize = data->size;
2022 	from = (char *) data->data;
2023 
2024 	/* if there are blocks on the free list, use them first */
2025 	if ((*pgno = t->bt_free) == P_NONE) {
2026 		if ((cur = _bt_allocpg(t)) == (BTHEADER *) NULL)
2027 			return (RET_ERROR);
2028 
2029 		ischain = 0;
2030 		*pgno = cur->h_pgno = ++(t->bt_npages);
2031 	} else {
2032 		if (_bt_getpage(t, *pgno) == RET_ERROR)
2033 			return (RET_ERROR);
2034 		ischain = 1;
2035 		cur = t->bt_curpage;
2036 	}
2037 
2038 	cur->h_flags = F_CONT|F_LEAF;
2039 	(void) bcopy((char *) &dsize, (char *) &cur->h_linp[0], sizeof(size_t));
2040 	where = ((char *) (&cur->h_linp[0])) + sizeof(size_t);
2041 
2042 	/* fill and write pages in the chain */
2043 	for (;;) {
2044 		int nhere;
2045 
2046 		top = ((char *) cur) + t->bt_psize;
2047 		cur->h_prevpg = prev;
2048 		nextchn = cur->h_nextpg;
2049 		nhere = (int) (top - where);
2050 
2051 		if (nhere >= dsize) {
2052 			(void) bcopy(from, where, (int) dsize);
2053 			cur->h_nextpg = P_NONE;
2054 			dsize = 0;
2055 		} else {
2056 			(void) bcopy(from, where, (int) nhere);
2057 			dsize -= nhere;
2058 			from += nhere;
2059 			if (nextchn == P_NONE)
2060 				cur->h_nextpg = t->bt_npages + 1;
2061 			prev = cur->h_pgno;
2062 		}
2063 
2064 		/* current page is ready to go; write it out */
2065 		if (_bt_write(t, cur, RELEASE) == RET_ERROR)
2066 			return (RET_ERROR);
2067 
2068 		/* free the current page, if appropriate */
2069 		if (ISDISK(t) && !ISCACHE(t) && !ischain) {
2070 			(void) free ((char *) cur);
2071 		}
2072 
2073 		/* done? */
2074 		if (dsize == 0)
2075 			break;
2076 
2077 		/* allocate another page */
2078 		if (nextchn == P_NONE) {
2079 			if ((cur = _bt_allocpg(t)) == (BTHEADER *) NULL)
2080 				return (RET_ERROR);
2081 			ischain = 0;
2082 			cur->h_pgno = ++(t->bt_npages);
2083 		} else {
2084 			if (_bt_getpage(t, nextchn) == RET_ERROR)
2085 				return (RET_ERROR);
2086 			ischain = 1;
2087 			cur = t->bt_curpage;
2088 		}
2089 		cur->h_flags = F_LEAF | F_CONT;
2090 
2091 		where = (char *) (&cur->h_linp[0]);
2092 	}
2093 
2094 	/* if we used pages from the free list, record changes to it */
2095 	if (*pgno == t->bt_free) {
2096 		t->bt_free = nextchn;
2097 		t->bt_flags &= ~BTF_METAOK;
2098 	}
2099 
2100 	return (RET_SUCCESS);
2101 }
2102 
2103 /*
2104  *  _BT_SPLIT -- Split a page into two pages.
2105  *
2106  *	Splits are caused by insertions, and propogate up the tree in
2107  *	the usual way.  The root page is always page 1 in the file on
2108  *	disk, so root splits are handled specially.  On entry to this
2109  *	routine, t->bt_curpage is the page to be split.
2110  *
2111  *	Parameters:
2112  *		t -- btree in which to do split.
2113  *
2114  *	Returns:
2115  *		RET_SUCCESS, RET_ERROR.
2116  *
2117  *	Side Effects:
2118  *		Changes the notion of the current page.
2119  */
2120 
2121 static
2122 _bt_split(t)
2123 	BTREE_P t;
2124 {
2125 	BTHEADER *h;
2126 	BTHEADER *left, *right;
2127 	BTHEADER *next;
2128 	pgno_t nextpgno, parent;
2129 	int nbytes, len;
2130 	IDATUM *id;
2131 	DATUM *d;
2132 	char *src;
2133 	IDATUM *new;
2134 	pgno_t oldchain;
2135 	u_char flags;
2136 
2137 	h = (BTHEADER *) t->bt_curpage;
2138 
2139 	/* split root page specially, since it must remain page 1 */
2140 	if (h->h_pgno == P_ROOT) {
2141 		return (_bt_splitroot(t));
2142 	}
2143 
2144 	/*
2145 	 *  This is a little complicated.  We go to some trouble to
2146 	 *  figure out which of the three possible cases -- in-memory tree,
2147 	 *  disk tree (no cache), and disk tree (cache) -- we have, in order
2148 	 *  to avoid unnecessary copying.  If we have a disk cache, then we
2149 	 *  have to do some extra copying, though, since the cache code
2150 	 *  manages buffers externally to this code.
2151 	 */
2152 
2153 	if (ISDISK(t) && ISCACHE(t)) {
2154 		if ((left = (BTHEADER *) malloc((unsigned) t->bt_psize))
2155 		    == (BTHEADER *) NULL)
2156 			return (RET_ERROR);
2157 		left->h_pgno = left->h_prevpg = left->h_nextpg = P_NONE;
2158 		left->h_flags = t->bt_curpage->h_flags;
2159 		left->h_lower = (index_t)
2160 			  (((char *) &(left->h_linp[0])) - ((char *) left));
2161 		left->h_upper = t->bt_psize;
2162 
2163 	} else {
2164 		if ((left = _bt_allocpg(t)) == (BTHEADER *) NULL)
2165 			return (RET_ERROR);
2166 	}
2167 	left->h_pgno = h->h_pgno;
2168 
2169 	if ((right = _bt_allocpg(t)) == (BTHEADER *) NULL)
2170 		return (RET_ERROR);
2171 	right->h_pgno = ++(t->bt_npages);
2172 
2173 	/* now do the split */
2174 	if (_bt_dopsplit(t, left, right) == RET_ERROR)
2175 		return (RET_ERROR);
2176 
2177 	right->h_prevpg = left->h_pgno;
2178 	nextpgno = right->h_nextpg = h->h_nextpg;
2179 	left->h_nextpg = right->h_pgno;
2180 	left->h_prevpg = h->h_prevpg;
2181 
2182 	/* okay, now use the left half of the page as the new page */
2183 	if (ISDISK(t) && ISCACHE(t)) {
2184 		(void) bcopy((char *) left, (char *) t->bt_curpage,
2185 			     (int) t->bt_psize);
2186 		(void) free ((char *) left);
2187 		left = t->bt_curpage;
2188 	} else {
2189 		(void) free((char *) t->bt_curpage);
2190 		t->bt_curpage = left;
2191 	}
2192 
2193 	/*
2194 	 *  Write the new pages out.  We need them to stay where they are
2195 	 *  until we're done updating the parent pages.
2196 	 */
2197 
2198 	if (_bt_write(t, left, NORELEASE) == RET_ERROR)
2199 		return (RET_ERROR);
2200 	if (_bt_write(t, right, NORELEASE) == RET_ERROR)
2201 		return (RET_ERROR);
2202 
2203 	/* update 'prev' pointer of old neighbor of left */
2204 	if (nextpgno != P_NONE) {
2205 		if (_bt_getpage(t, nextpgno) == RET_ERROR)
2206 			return (RET_ERROR);
2207 		h = t->bt_curpage;
2208 		h->h_prevpg = right->h_pgno;
2209 		h->h_flags |= F_DIRTY;
2210 	}
2211 
2212 	if ((parent = _bt_pop(t)) != P_NONE) {
2213 		if (right->h_flags & F_LEAF) {
2214 			d = (DATUM *) GETDATUM(right, 0);
2215 			len = d->d_ksize;
2216 			if (d->d_flags & D_BIGKEY) {
2217 				bcopy(&(d->d_bytes[0]),
2218 				      (char *) &oldchain,
2219 				      sizeof(oldchain));
2220 				if (_bt_markchain(t, oldchain) == RET_ERROR)
2221 					return (RET_ERROR);
2222 				src = (char *) &oldchain;
2223 				flags = D_BIGKEY;
2224 			} else {
2225 				src = (char *) &(d->d_bytes[0]);
2226 				flags = 0;
2227 			}
2228 		} else {
2229 			id = (IDATUM *) GETDATUM(right, 0);
2230 			len = id->i_size;
2231 			flags = id->i_flags;
2232 			src = (char *) &(id->i_bytes[0]);
2233 		}
2234 		nbytes = len + (sizeof(IDATUM) - sizeof(char));
2235 		new = (IDATUM *) malloc((unsigned) nbytes);
2236 		if (new == (IDATUM *) NULL)
2237 			return (RET_ERROR);
2238 		new->i_size = len;
2239 		new->i_pgno = right->h_pgno;
2240 		new->i_flags = flags;
2241 		if (len > 0)
2242 			(void) bcopy(src, (char *) &(new->i_bytes[0]), len);
2243 
2244 		nbytes = LONGALIGN(nbytes) + sizeof(index_t);
2245 		if (_bt_getpage(t, parent) == RET_ERROR)
2246 			return (RET_ERROR);
2247 
2248 		h = t->bt_curpage;
2249 
2250 		/*
2251 		 *  Split the parent if we need to, then reposition the
2252 		 *  tree's current page pointer for the new datum.
2253 		 */
2254 		if ((h->h_upper - h->h_lower) < nbytes) {
2255 			if (_bt_split(t) == RET_ERROR)
2256 				return (RET_ERROR);
2257 			if (_bt_reposition(t, new, parent, right->h_prevpg)
2258 			      == RET_ERROR)
2259 				return (RET_ERROR);
2260 		}
2261 
2262 		/* okay, now insert the new idatum */
2263 		if (_bt_inserti(t, new, right->h_prevpg) == RET_ERROR)
2264 			return (RET_ERROR);
2265 	}
2266 
2267 	/*
2268 	 *  Okay, split is done; don't need right page stapled down anymore.
2269 	 *  The page we call 'left' above is the new version of the old
2270 	 *  (split) page, so we can't release it.
2271 	 */
2272 
2273 	if (_bt_release(t, right) == RET_ERROR)
2274 		return (RET_ERROR);
2275 	if (ISDISK(t) && !ISCACHE(t))
2276 		(void) free((char *) right);
2277 
2278 	return (RET_SUCCESS);
2279 }
2280 
2281 /*
2282  *  _BT_MARKCHAIN -- Mark a chain of pages as used by an internal node.
2283  *
2284  *	Chains of indirect blocks pointed to by leaf nodes get reclaimed
2285  *	when the item that points to them gets deleted.  Chains pointed
2286  *	to by internal nodes never get deleted.  This routine marks a
2287  *	chain as pointed to by an internal node.
2288  *
2289  *	Parameters:
2290  *		t -- tree in which to mark
2291  *		chain -- number of first page in chain
2292  *
2293  *	Returns:
2294  *		RET_SUCCESS, RET_ERROR.
2295  *
2296  *	Side Effects:
2297  *		None.
2298  */
2299 
2300 static int
2301 _bt_markchain(t, chain)
2302 	BTREE_P t;
2303 	pgno_t chain;
2304 {
2305 	pgno_t save;
2306 
2307 	save = t->bt_curpage->h_pgno;
2308 
2309 	if (_bt_getpage(t, chain) == RET_ERROR)
2310 		return (RET_ERROR);
2311 
2312 	t->bt_curpage->h_flags |= (F_DIRTY|F_PRESERVE);
2313 
2314 	if (_bt_getpage(t, save) == RET_ERROR)
2315 		return (RET_ERROR);
2316 
2317 	return (RET_SUCCESS);
2318 }
2319 
2320 /*
2321  *  _BT_REPOSITION -- Reposition the current page pointer of a btree.
2322  *
2323  *	After splitting a node in the tree in order to make room for
2324  *	an insertion, we need to figure out which page after the split
2325  *	should get the item we want to insert.  This routine positions
2326  *	the tree's current page pointer appropriately.
2327  *
2328  *	Parameters:
2329  *		t -- tree to position
2330  *		new -- the item we want to insert
2331  *		parent -- parent of the node that we just split
2332  *		prev -- page number of item directly to the left of
2333  *			new's position in the tree.
2334  *
2335  *	Returns:
2336  *		RET_SUCCESS, RET_ERROR.
2337  *
2338  *	Side Effects:
2339  *		None.
2340  */
2341 
2342 static int
2343 _bt_reposition(t, new, parent, prev)
2344 	BTREE_P t;
2345 	IDATUM *new;
2346 	pgno_t parent;
2347 	pgno_t prev;
2348 {
2349 	index_t i, next;
2350 	IDATUM *idx;
2351 
2352 	if (parent == P_ROOT) {
2353 
2354 		/*
2355 		 *  If we just split the root page, then there are guaranteed
2356 		 *  to be exactly two IDATUMs on it.  Look at both of them
2357 		 *  to see if they point to the page that we want.
2358 		 */
2359 
2360 		if (_bt_getpage(t, parent) == RET_ERROR)
2361 			return (RET_ERROR);
2362 
2363 		next = NEXTINDEX(t->bt_curpage);
2364 		for (i = 0; i < next; i++) {
2365 			idx = (IDATUM *) GETDATUM(t->bt_curpage, i);
2366 			if (_bt_getpage(t, idx->i_pgno) == RET_ERROR)
2367 				return (RET_ERROR);
2368 			if (_bt_isonpage(t, new, prev) == RET_SUCCESS)
2369 				return (RET_SUCCESS);
2370 			if (_bt_getpage(t, parent) == RET_ERROR)
2371 				return (RET_ERROR);
2372 		}
2373 	} else {
2374 
2375 		/*
2376 		 *  Get the parent page -- which is where the new item would
2377 		 *  have gone -- and figure out whether the new item now goes
2378 		 *  on the parent, or the page immediately to the right of
2379 		 *  the parent.
2380 		 */
2381 
2382 		if (_bt_getpage(t, parent) == RET_ERROR)
2383 			return (RET_ERROR);
2384 		if (_bt_isonpage(t, new, prev) == RET_SUCCESS)
2385 			return (RET_SUCCESS);
2386 		if (_bt_getpage(t, t->bt_curpage->h_nextpg) == RET_ERROR)
2387 			return (RET_ERROR);
2388 		if (_bt_isonpage(t, new, prev) == RET_SUCCESS)
2389 			return (RET_SUCCESS);
2390 	}
2391 	return (RET_ERROR);
2392 }
2393 
2394 /*
2395  *  _BT_ISONPAGE -- Is the IDATUM for a given page number on the current page?
2396  *
2397  *	This routine is used by _bt_reposition to decide whether the current
2398  *	page is the correct one on which to insert a new item.
2399  *
2400  *	Parameters:
2401  *		t -- tree to check
2402  *		new -- the item that will be inserted (used for binary search)
2403  *		prev -- page number of page whose IDATUM is immediately to
2404  *			the left of new's position in the tree.
2405  *
2406  *	Returns:
2407  *		RET_SUCCESS, RET_ERROR (corresponding to TRUE, FALSE).
2408  */
2409 
2410 static int
2411 _bt_isonpage(t, new, prev)
2412 	BTREE_P t;
2413 	IDATUM *new;
2414 	pgno_t prev;
2415 {
2416 	BTHEADER *h = (BTHEADER *) t->bt_curpage;
2417 	index_t i, next;
2418 	IDATUM *idx;
2419 
2420 	i = _bt_binsrch(t, &(new->i_bytes[0]));
2421 	while (i != 0 && _bt_cmp(t, &(new->i_bytes[0]), i) == 0)
2422 		--i;
2423 	next = NEXTINDEX(h);
2424 	idx = (IDATUM *) GETDATUM(h, i);
2425 	while (i < next && idx->i_pgno != prev) {
2426 		i++;
2427 		idx = (IDATUM *) GETDATUM(h, i);
2428 	}
2429 	if (idx->i_pgno == prev)
2430 		return (RET_SUCCESS);
2431 	else
2432 		return (RET_ERROR);
2433 }
2434 
2435 /*
2436  *  _BT_SPLITROOT -- Split the root of a btree.
2437  *
2438  *	The root page for a btree is always page one.  This means that in
2439  *	order to split the root, we need to do extra work.
2440  *
2441  *	Parameters:
2442  *		t -- tree to split
2443  *
2444  *	Returns:
2445  *		RET_SUCCESS, RET_ERROR.
2446  *
2447  *	Side Effects:
2448  *		Splits root upward in the usual way, adding two new pages
2449  *		to the tree (rather than just one, as in usual splits).
2450  */
2451 
2452 static
2453 _bt_splitroot(t)
2454 	BTREE_P t;
2455 {
2456 	BTHEADER *h = t->bt_curpage;
2457 	BTHEADER *left, *right;
2458 	DATUM *d;
2459 	IDATUM *id;
2460 	char *where;
2461 	BTHEADER *where_h;
2462 	pgno_t lastpg;
2463 	char *src, *dest;
2464 	int len, nbytes;
2465 	u_long was_leaf;
2466 	pgno_t oldchain;
2467 	u_char flags;
2468 
2469 	/* get two new pages for the split */
2470 	if ((left = _bt_allocpg(t)) == (BTHEADER *) NULL)
2471 		return (RET_ERROR);
2472 	left->h_pgno = ++(t->bt_npages);
2473 	if ((right = _bt_allocpg(t)) == (BTHEADER *) NULL)
2474 		return (RET_ERROR);
2475 	right->h_pgno = ++(t->bt_npages);
2476 
2477 	/* do the split */
2478 	if (_bt_dopsplit(t, left, right) == RET_ERROR)
2479 		return (RET_ERROR);
2480 
2481 	/* connect the new pages correctly */
2482 	right->h_prevpg = left->h_pgno;
2483 	left->h_nextpg = right->h_pgno;
2484 
2485 	/*
2486 	 *  Write the child pages out now.  We need them to remain
2487 	 *  where they are until we finish updating parent pages,
2488 	 *  however.
2489 	 */
2490 
2491 	if (_bt_write(t, left, NORELEASE) == RET_ERROR)
2492 		return (RET_ERROR);
2493 	if (_bt_write(t, right, NORELEASE) == RET_ERROR)
2494 		return (RET_ERROR);
2495 
2496 	/* now change the root page into an internal page */
2497 	was_leaf = (h->h_flags & F_LEAF);
2498 	h->h_flags &= ~F_LEAF;
2499 	h->h_lower = (index_t) (((char *) (&(h->h_linp[0]))) - ((char *) h));
2500 	h->h_upper = (index_t) t->bt_psize;
2501 	(void) bzero((char *) &(h->h_linp[0]), (int) (h->h_upper - h->h_lower));
2502 
2503 	/* put two new keys on root page */
2504 	where_h = left;
2505 	while (where_h) {
2506 		DATUM *data;
2507 		IDATUM *idata;
2508 
2509 		if (was_leaf) {
2510 			data = (DATUM *) GETDATUM(where_h, 0);
2511 
2512 			if (where_h == left) {
2513 				len = 0;	/* first key in tree is null */
2514 			} else {
2515 				if (data->d_flags & D_BIGKEY) {
2516 					bcopy(&(data->d_bytes[0]),
2517 					      (char *) &oldchain,
2518 					      sizeof(oldchain));
2519 					if (_bt_markchain(t, oldchain) == RET_ERROR)
2520 						return (RET_ERROR);
2521 					src = (char *) &oldchain;
2522 					flags = D_BIGKEY;
2523 				} else {
2524 					src = (char *) &(data->d_bytes[0]);
2525 					flags = 0;
2526 				}
2527 				len = data->d_ksize;
2528 			}
2529 		} else {
2530 			idata = (IDATUM *) GETDATUM(where_h, 0);
2531 			len = idata->i_size;
2532 			flags = idata->i_flags;
2533 			src = &(idata->i_bytes[0]);
2534 		}
2535 		dest = ((char *) h) + h->h_upper;
2536 		nbytes = len + (sizeof (IDATUM) - sizeof(char));
2537 		dest -= LONGALIGN(nbytes);
2538 		id = (IDATUM *) dest;
2539 		id->i_size = len;
2540 		id->i_pgno = where_h->h_pgno;
2541 		id->i_flags = flags;
2542 		if (len > 0)
2543 			(void) bcopy((char *) src, (char *) &(id->i_bytes[0]), len);
2544 		dest -= ((int) h);
2545 		h->h_linp[NEXTINDEX(h)] = (index_t) dest;
2546 		h->h_upper = (index_t) dest;
2547 		h->h_lower += sizeof(index_t);
2548 
2549 		/* next page */
2550 		if (where_h == left)
2551 			where_h = right;
2552 		else
2553 			where_h = (BTHEADER *) NULL;
2554 	}
2555 
2556 	if (_bt_release(t, left) == RET_ERROR)
2557 		return (RET_ERROR);
2558 	if (_bt_release(t, right) == RET_ERROR)
2559 		return (RET_ERROR);
2560 
2561 	/*
2562 	 *  That's it, split is done.  If we're doing a non-cached disk
2563 	 *  btree, we can free up the pages we allocated, as they're on
2564 	 *  disk, now.
2565 	 */
2566 
2567 	if (ISDISK(t) && !ISCACHE(t)) {
2568 		(void) free ((char *) left);
2569 		(void) free ((char *) right);
2570 	}
2571 
2572 	h->h_flags |= F_DIRTY;
2573 
2574 	return (RET_SUCCESS);
2575 }
2576 
2577 /*
2578  *  _BT_DOPSPLIT -- Do the work of splitting a page
2579  *
2580  *	This routine takes two page pointers and splits the data on the
2581  *	current page of the btree between them.
2582  *
2583  *	We do a lot of work here to handle duplicate keys on a page; we
2584  *	have to place these keys carefully to guarantee that later searches
2585  *	will find them correctly.  See comments in the code below for details.
2586  *
2587  *	Parameters:
2588  *		t -- tree to split
2589  *		left -- pointer to page to get left half of the data
2590  *		right -- pointer to page to get right half of the data
2591  *
2592  *	Returns:
2593  *		None.
2594  */
2595 
2596 static
2597 _bt_dopsplit(t, left, right)
2598 	BTREE_P t;
2599 	BTHEADER *left;
2600 	BTHEADER *right;
2601 {
2602 	BTHEADER *h = t->bt_curpage;
2603 	size_t psize;
2604 	char *where;
2605 	BTHEADER *where_h;
2606 	index_t where_i;
2607 	int nbytes, dsize, fixedsize, freespc;
2608 	index_t i;
2609 	index_t save_lower, save_upper, save_i;
2610 	index_t switch_i;
2611 	char *save_key;
2612 	DATUM *d;
2613 	CURSOR *c;
2614 	index_t top;
2615 	int free_save;
2616 	pgno_t chain;
2617 	int ignore;
2618 
2619 	/*
2620 	 *  Our strategy is to put half the bytes on each page.  We figure
2621 	 *  out how many bytes we have total, and then move items until
2622 	 *  the last item moved put at least 50% of the data on the left
2623 	 *  page.
2624 	 */
2625 	save_key = (char *) NULL;
2626 	psize = (int) t->bt_psize;
2627 	where = ((char *) left) + psize;
2628 	where_h = left;
2629 	where_i = 0;
2630 	nbytes = psize - (int) ((char *) &(h->h_linp[0]) - ((char *) h));
2631 	freespc = nbytes;
2632 
2633 	top = NEXTINDEX(h);
2634 	if (h->h_flags & F_LEAF)
2635 		fixedsize = (sizeof(DATUM) - sizeof(char));
2636 	else
2637 		fixedsize = (sizeof(IDATUM) - sizeof(char));
2638 
2639 	save_key = (char *) NULL;
2640 
2641 	/* move data */
2642 	for (i = 0; i < top; i++) {
2643 
2644 		/*
2645 		 *  Internal and leaf pages have different layouts for
2646 		 *  data items, but in both cases the first entry in the
2647 		 *  data item is a size_t.
2648 		 */
2649 		d = (DATUM *) GETDATUM(h,i);
2650 		if (h->h_flags & F_LEAF) {
2651 			dsize = d->d_ksize + d->d_dsize + fixedsize;
2652 		} else {
2653 			dsize = d->d_ksize + fixedsize;
2654 		}
2655 
2656 		/*
2657 		 *  If a page contains duplicate keys, we have to be
2658 		 *  careful about splits.  A sequence of duplicate keys
2659 		 *  may not begin in the middle of one page and end in
2660 		 *  the middle of another; it must begin on a page boundary,
2661 		 *  in order for searches on the internal nodes to work
2662 		 *  correctly.
2663 		 */
2664 		if (where_h == left) {
2665 			if (save_key == (char *) NULL) {
2666 				if (h->h_flags & F_LEAF) {
2667 					if (d->d_flags & D_BIGKEY) {
2668 						free_save = TRUE;
2669 						bcopy(&(d->d_bytes[0]),
2670 						     (char *) &chain,
2671 						     sizeof(chain));
2672 						if (_bt_getbig(t, chain,
2673 							       &save_key,
2674 							       &ignore)
2675 						    == RET_ERROR)
2676 							return (RET_ERROR);
2677 					} else {
2678 						free_save = FALSE;
2679 						save_key = (char *) &(d->d_bytes[0]);
2680 					}
2681 				} else {
2682 					IDATUM *id = (IDATUM *) d;
2683 
2684 					if (id->i_flags & D_BIGKEY) {
2685 						free_save = TRUE;
2686 						bcopy(&(id->i_bytes[0]),
2687 						     (char *) &chain,
2688 						     sizeof(chain));
2689 						if (_bt_getbig(t, chain,
2690 							       &save_key,
2691 							       &ignore)
2692 						    == RET_ERROR)
2693 							return (RET_ERROR);
2694 					} else {
2695 						free_save = FALSE;
2696 						save_key = (char *)
2697 							&(id->i_bytes[0]);
2698 					}
2699 				}
2700 				save_i = 0;
2701 				save_lower = where_h->h_lower;
2702 				save_upper = where_h->h_upper;
2703 			} else {
2704 				if (_bt_cmp(t, save_key, i) != 0) {
2705 					save_lower = where_h->h_lower;
2706 					save_upper = where_h->h_upper;
2707 					save_i = i;
2708 				}
2709 				if (h->h_flags & F_LEAF) {
2710 					if (free_save)
2711 						(void) free(save_key);
2712 					if (d->d_flags & D_BIGKEY) {
2713 						free_save = TRUE;
2714 						bcopy(&(d->d_bytes[0]),
2715 						     (char *) &chain,
2716 						     sizeof(chain));
2717 						if (_bt_getbig(t, chain,
2718 							       &save_key,
2719 							       &ignore)
2720 						    == RET_ERROR)
2721 							return (RET_ERROR);
2722 					} else {
2723 						free_save = FALSE;
2724 						save_key = (char *) &(d->d_bytes[0]);
2725 					}
2726 				} else {
2727 					IDATUM *id = (IDATUM *) d;
2728 
2729 					if (id->i_flags & D_BIGKEY) {
2730 						free_save = TRUE;
2731 						bcopy(&(id->i_bytes[0]),
2732 						     (char *) &chain,
2733 						     sizeof(chain));
2734 						if (_bt_getbig(t, chain,
2735 							       &save_key,
2736 							       &ignore)
2737 						    == RET_ERROR)
2738 							return (RET_ERROR);
2739 					} else {
2740 						free_save = FALSE;
2741 						save_key = (char *)
2742 							&(id->i_bytes[0]);
2743 					}
2744 				}
2745 			}
2746 		}
2747 
2748 		/* copy data and update page state */
2749 		where -= LONGALIGN(dsize);
2750 		(void) bcopy((char *) d, (char *) where, dsize);
2751 		where_h->h_upper = where_h->h_linp[where_i] =
2752 			(index_t) (where - (int) where_h);
2753 		where_h->h_lower += sizeof(index_t);
2754 		where_i++;
2755 
2756 		/* if we've moved half, switch to the right-hand page */
2757 		nbytes -= LONGALIGN(dsize) + sizeof(index_t);
2758 		if ((freespc - nbytes) > nbytes) {
2759 			nbytes = 2 * freespc;
2760 
2761 			/* identical keys go on the same page */
2762 			if (save_i > 0) {
2763 				/* i gets incremented at loop bottom... */
2764 				i = save_i - 1;
2765 				where_h->h_lower = save_lower;
2766 				where_h->h_upper = save_upper;
2767 			}
2768 			where = ((char *) right) + psize;
2769 			where_h = right;
2770 			switch_i = where_i;
2771 			where_i = 0;
2772 		}
2773 	}
2774 
2775 	/*
2776 	 *  If there was an active scan on the database, and we just
2777 	 *  split the page that the cursor was on, we may need to
2778 	 *  adjust the cursor to point to the same entry as before the
2779 	 *  split.
2780 	 */
2781 
2782 	c = &(t->bt_cursor);
2783 	if ((t->bt_flags & BTF_SEQINIT)
2784 	    && (c->c_pgno == h->h_pgno)
2785 	    && (c->c_index >= switch_i)) {
2786 		c->c_pgno = where_h->h_pgno;
2787 		c->c_index -= where_i;
2788 	}
2789 	return (RET_SUCCESS);
2790 }
2791 
2792 /*
2793  *  _BT_INSERTI -- Insert IDATUM on current page in the btree.
2794  *
2795  *	This routine handles insertions to internal pages after splits
2796  *	lower in the tree.  On entry, t->bt_curpage is the page to get
2797  *	the new IDATUM.  We are also given pgno, the page number of the
2798  *	IDATUM that is immediately left of the new IDATUM's position.
2799  *	This guarantees that the IDATUM for the right half of the page
2800  *	after a split goes next to the IDATUM for its left half.
2801  *
2802  *	Parameters:
2803  *		t -- tree in which to do insertion.
2804  *		id -- new IDATUM to insert
2805  *		pgno -- page number of IDATUM left of id's position
2806  *
2807  *	Returns:
2808  *		RET_SUCCESS, RET_ERROR.
2809  */
2810 
2811 static int
2812 _bt_inserti(t, id, pgno)
2813 	BTREE_P t;
2814 	IDATUM *id;
2815 	pgno_t pgno;
2816 {
2817 	BTHEADER *h = t->bt_curpage;
2818 	index_t next, i;
2819 	IDATUM *idx;
2820 	char *key;
2821 	pgno_t chain;
2822 	int free_key;
2823 	int ignore;
2824 
2825 	if (id->i_flags & D_BIGKEY) {
2826 		free_key = TRUE;
2827 		bcopy(&(id->i_bytes[0]), (char *) &chain, sizeof(chain));
2828 		if (_bt_getbig(t, chain, &key, &ignore) == RET_ERROR)
2829 			return (RET_ERROR);
2830 	} else {
2831 		free_key = FALSE;
2832 		key = &(id->i_bytes[0]);
2833 	}
2834 	i = _bt_binsrch(t, key);
2835 
2836 	next = NEXTINDEX(h);
2837 	while (i < next && _bt_cmp(t, key, i) >= 0)
2838 		i++;
2839 
2840 	if (free_key)
2841 		(void) free(key);
2842 
2843 	/* okay, now we're close; find adjacent IDATUM */
2844 	for (;;) {
2845 		idx = (IDATUM *) GETDATUM(h,i);
2846 		if (idx->i_pgno == pgno) {
2847 			i++;
2848 			break;
2849 		}
2850 		--i;
2851 	}
2852 
2853 	/* correctly positioned, do the insertion */
2854 	return (_bt_insertat(t, id, i));
2855 }
2856 
2857 /*
2858  *  _BT_INSERTAT -- Insert a datum at a given location on the current page.
2859  *
2860  *	This routine does insertions on both leaf and internal pages.
2861  *
2862  *	Parameters:
2863  *		t -- tree in which to do insertion.
2864  *		p -- DATUM or IDATUM to insert.
2865  *		index -- index in line pointer array to put this item.
2866  *
2867  *	Returns:
2868  *		RET_SUCCESS, RET_ERROR.
2869  *
2870  *	Side Effects:
2871  *		Will rearrange line pointers to make space for the new
2872  *		entry.  This means that any scans currently active are
2873  *		invalid after this.
2874  *
2875  *	Warnings:
2876  *		There must be sufficient room for the new item on the page.
2877  */
2878 
2879 static int
2880 _bt_insertat(t, p, index)
2881 	BTREE_P t;
2882 	char *p;
2883 	index_t index;
2884 {
2885 	IDATUM *id = (IDATUM *) p;
2886 	DATUM *d = (DATUM *) p;
2887 	BTHEADER *h;
2888 	CURSOR *c;
2889 	index_t nxtindex;
2890 	char *src, *dest;
2891 	int nbytes;
2892 
2893 	/* insertion may confuse an active scan.  fix it. */
2894 	c = &(t->bt_cursor);
2895 	if (t->bt_flags & BTF_SEQINIT && t->bt_curpage->h_pgno == c->c_pgno)
2896 		if (_bt_fixscan(t, index, d, INSERT) == RET_ERROR)
2897 			return (RET_ERROR);
2898 
2899 	h = t->bt_curpage;
2900 	nxtindex = (index_t) NEXTINDEX(h);
2901 
2902 	/*
2903 	 *  If we're inserting at the middle of the line pointer array,
2904 	 *  copy pointers that will follow the new one up on the page.
2905 	 */
2906 
2907 	if (index < nxtindex) {
2908 		src = (char *) &(h->h_linp[index]);
2909 		dest = (char *) &(h->h_linp[index + 1]);
2910 		nbytes = (h->h_lower - (src - ((char *) h)))
2911 			 + sizeof(h->h_linp[0]);
2912 		(void) bcopy(src, dest, nbytes);
2913 	}
2914 
2915 	/* compute size and copy data to page */
2916 	if (h->h_flags & F_LEAF) {
2917 		nbytes = d->d_ksize + d->d_dsize
2918 			 + (sizeof(DATUM) - sizeof(char));
2919 	} else {
2920 		nbytes = id->i_size + (sizeof(IDATUM) - sizeof(char));
2921 	}
2922 	dest = (((char *) h) + h->h_upper) - LONGALIGN(nbytes);
2923 	(void) bcopy((char *) p, dest, nbytes);
2924 
2925 	/* update statistics */
2926 	dest -= (int) h;
2927 	h->h_linp[index] = (index_t) dest;
2928 	h->h_upper = (index_t) dest;
2929 	h->h_lower += sizeof(index_t);
2930 
2931 	/* we're done */
2932 	h->h_flags |= F_DIRTY;
2933 
2934 	return (RET_SUCCESS);
2935 }
2936 
2937 /*
2938  *  _BT_BINSRCH -- Do a binary search for a given key on the current page.
2939  *
2940  *	Searches on internal pages are handled slightly differently from
2941  *	searches on leaf pages.  This is because internal page searches
2942  *	find the largest item <= key in the tree, and leaf searches find
2943  *	the smallest item >= key.  This guarantees that leaf page searches
2944  *	leave us pointing at the item's correct position, and internal
2945  *	searches descend the tree correctly.
2946  *
2947  *	Parameters:
2948  *		t -- tree to search
2949  *		key -- key we're looking for
2950  *
2951  *	Returns:
2952  *		Index of the line pointer array entry for the (closest)
2953  *		match to key on the current page, with "closest" as defined
2954  *		above.
2955  */
2956 
2957 static index_t
2958 _bt_binsrch(t, key)
2959 	BTREE_P t;
2960 	char *key;
2961 {
2962 	index_t lbound, ubound, cur;
2963 	BTHEADER *h = t->bt_curpage;
2964 	IDATUM *id;
2965 	int match = 0;
2966 	int r;
2967 
2968 	lbound = 0;
2969 	ubound = NEXTINDEX(h);
2970 	if (ubound > 0)
2971 		--ubound;
2972 
2973 	/* do a binary search on the current page */
2974 	while ((ubound - lbound) > 1) {
2975 		cur = lbound + ((ubound - lbound) / 2);
2976 		r = _bt_cmp(t, key, cur);
2977 
2978 		if (r > 0)
2979 			lbound = cur + 1;
2980 		else if (r < 0)
2981 			ubound = cur;
2982 		else {
2983 			match++;
2984 			break;
2985 		}
2986 	}
2987 
2988 	/*
2989 	 *  At this point, the binary search terminated because the endpoints
2990 	 *  got too close together, or we have a match.  Figure out which
2991 	 *  case applies, decide what to do based on the page type (leaf or
2992 	 *  internal), and do the right thing.
2993 	 */
2994 	if (match) {
2995 		return (cur);
2996 	} else if (ubound != lbound) {
2997 		if (h->h_flags & F_LEAF) {
2998 			r = _bt_cmp(t, key, lbound);
2999 			if (r <= 0) {
3000 				return (lbound);
3001 			}
3002 		} else {
3003 			r = _bt_cmp(t, key, ubound);
3004 
3005 			/* for internal nodes, move as far left as possible */
3006 			if (r < 0) {
3007 				r = _bt_cmp(t, key, lbound);
3008 				if (r < 0 && lbound > 0)
3009 					--lbound;
3010 				return (lbound);
3011 			} else {
3012 				return (ubound);
3013 			}
3014 		}
3015 	}
3016 
3017 	if (h->h_flags & F_LEAF) {
3018 		if (ubound < NEXTINDEX(h)) {
3019 			r = _bt_cmp(t, key, ubound);
3020 			if (r > 0)
3021 				ubound++;
3022 		}
3023 	} else {
3024 		/* for internal pages, move as far left as possible */
3025 		if (ubound == NEXTINDEX(h))
3026 			ubound--;
3027 
3028 		while (_bt_cmp(t, key, ubound) < 0)
3029 			ubound--;
3030 	}
3031 	return (ubound);
3032 }
3033 
3034 /*
3035  *  BT_SEQ -- Sequential scan interface.
3036  *
3037  *	This routine supports sequential scans on the btree.  If called with
3038  *	flags set to R_CURSOR, or if no seq scan has been initialized in the
3039  *	current tree, we initialize the scan.  Otherwise, we advance the scan
3040  *	and return the next item.
3041  *
3042  *	Scans can be either keyed or non-keyed.  Keyed scans basically have
3043  *	a starting point somewhere in the middle of the tree.  Non-keyed
3044  *	scans start at an endpoint.  Also, scans can be backward (descending
3045  *	order), forward (ascending order), or no movement (keep returning
3046  *	the same item).
3047  *
3048  *	Flags is checked every time we enter the routine, so the user can
3049  *	change directions on an active scan if desired.  The key argument
3050  *	is examined only when we initialize the scan, in order to position
3051  *	it properly.
3052  *
3053  *	Items are returned via the key and data arguments passed in.
3054  *
3055  *	Parameters:
3056  *		tree -- btree in which to do scan
3057  *		key -- key, used to position scan on initialization, and
3058  *		       used to return key components to the user.
3059  *		data -- used to return data components to the user.
3060  *		flags -- specify R_CURSOR, R_FIRST, R_LAST, R_NEXT, or
3061  *			 R_PREV.
3062  *
3063  *	Returns:
3064  *		RET_SUCCESS, RET_ERROR, or RET_SPECIAL if no more data
3065  *		exists in the tree in the specified direction.
3066  *
3067  *	Side Effects:
3068  *		Changes the btree's notion of the current position in the
3069  *		scan.
3070  *
3071  *	Warnings:
3072  *		The key and data items returned are static and will be
3073  *		overwritten by the next bt_get or bt_seq.
3074  */
3075 
3076 bt_seq(tree, key, data, flags)
3077 	BTREE tree;
3078 	DBT *key;
3079 	DBT *data;
3080 	int flags;
3081 {
3082 	BTREE_P t = (BTREE_P) tree;
3083 	BTHEADER *h;
3084 	DATUM *d;
3085 	int status;
3086 
3087 	/* do we need to initialize the scan? */
3088 	if (flags == R_CURSOR || flags == R_LAST || flags == R_FIRST
3089 	    || !(t->bt_flags & BTF_SEQINIT)) {
3090 
3091 		/* initialize it */
3092 		status = _bt_seqinit(t, key, flags);
3093 	} else {
3094 		/* just advance the current scan pointer */
3095 		status = _bt_seqadvance(t, flags);
3096 	}
3097 
3098 	key->size = data->size = 0;
3099 	key->data = data->data = (char *) NULL;
3100 
3101 	h = t->bt_curpage;
3102 
3103 	/* is there a valid item at the current scan location? */
3104 	if (status == RET_SPECIAL) {
3105 		if (flags == R_NEXT) {
3106 			if (t->bt_cursor.c_index >= NEXTINDEX(h)) {
3107 				if (NEXTINDEX(h) > 0)
3108 					t->bt_cursor.c_index = NEXTINDEX(h) - 1;
3109 				else
3110 					t->bt_cursor.c_index = 0;
3111 			}
3112 		} else {
3113 			t->bt_cursor.c_index = 0;
3114 		}
3115 		return (RET_SPECIAL);
3116 	} else if (status == RET_ERROR)
3117 		return (RET_ERROR);
3118 
3119 	/* okay, return the data */
3120 	d = (DATUM *) GETDATUM(h, t->bt_cursor.c_index);
3121 
3122 	return (_bt_buildret(t, d, data, key));
3123 }
3124 
3125 /*
3126  *  _BT_BUILDRET -- Build return key/data pair as a result of search or scan.
3127  *
3128  *	This routine manages the statically allocated buffers in which we
3129  *	return data to the user.
3130  *
3131  *	Parameters:
3132  *		t -- btree from which to return datum
3133  *		d -- DATUM to be returned to the user.
3134  *		data -- data argument supplied by user for return
3135  *		key -- key argument supplied by user for return
3136  *
3137  *	Returns:
3138  *		RET_SUCCESS, RET_ERROR.
3139  *
3140  *	Side Effects:
3141  *		May free and reallocate static buffers, if the data
3142  *		we want to return is bigger than the space we have to
3143  *		do so.
3144  */
3145 
3146 static int
3147 _bt_buildret(t, d, data, key)
3148 	BTREE_P t;
3149 	DATUM *d;
3150 	DBT *data;
3151 	DBT *key;
3152 {
3153 	static int _data_s = 0;
3154 	static int _key_s = 0;
3155 	static char *_data = (char *) NULL;
3156 	static char *_key = (char *) NULL;
3157 	char *from, *where, *top;
3158 	pgno_t save;
3159 	pgno_t chain;
3160 	size_t nbytes;
3161 	size_t nhere;
3162 	BTHEADER *h;
3163 
3164 	if (d->d_flags & D_BIGKEY) {
3165 		if (_key != (char *) NULL)
3166 			(void) free(_key);
3167 		(void) bcopy((char *) &(d->d_bytes[0]),
3168 		      	     (char *) &chain,
3169 		      	     sizeof(chain));
3170 		if (_bt_getbig(t, chain, &_key, &_key_s) == RET_ERROR)
3171 			return (RET_ERROR);
3172 		key->data = _key;
3173 		key->size = _key_s;
3174 	} else {
3175 		/* need more space for key? */
3176 		if (d->d_ksize > _key_s) {
3177 			if (_key != (char *) NULL)
3178 				(void) free (_key);
3179 			if ((_key = (char *) malloc((unsigned) d->d_ksize))
3180 			    == (char *) NULL)
3181 				return (RET_ERROR);
3182 			_key_s = d->d_ksize;
3183 		}
3184 
3185 		key->data = _key;
3186 		if ((key->size = d->d_ksize) > 0)
3187 			(void) bcopy(&(d->d_bytes[0]),
3188 				     _key,
3189 				     (int) d->d_ksize);
3190 	}
3191 
3192 	if (d->d_flags & D_BIGDATA) {
3193 		if (_data != (char *) NULL)
3194 			(void) free(_data);
3195 		(void) bcopy(&(d->d_bytes[d->d_ksize]),
3196 		      	     (char *) &chain,
3197 		      	     sizeof(chain));
3198 		if (_bt_getbig(t, chain, &_data, &_data_s) == RET_ERROR)
3199 			return (RET_ERROR);
3200 		data->data = _data;
3201 		data->size = _data_s;
3202 	} else {
3203 		/* need more space for data? */
3204 		if (d->d_dsize > _data_s) {
3205 			if (_data != (char *) NULL)
3206 				(void) free (_data);
3207 			if ((_data = (char *) malloc((unsigned) d->d_dsize))
3208 			    == (char *) NULL)
3209 				return (RET_ERROR);
3210 			_data_s = d->d_dsize;
3211 		}
3212 
3213 		data->data = _data;
3214 
3215 		if ((data->size = d->d_dsize) > 0)
3216 			(void) bcopy(&(d->d_bytes[d->d_ksize]),
3217 				      _data,
3218 				      d->d_dsize);
3219 	}
3220 
3221 	return (RET_SUCCESS);
3222 }
3223 
3224 /*
3225  *  _BT_SEQINIT -- Initialize a sequential scan on the btree.
3226  *
3227  *	Sets the tree's notion of the current scan location correctly
3228  *	given a key and a direction.
3229  *
3230  *	Parameters:
3231  *		t -- tree in which to initialize scan
3232  *		key -- key for initial scan position
3233  *		flags -- R_NEXT, R_PREV
3234  *
3235  *	Returns:
3236  *		RET_SUCCESS, RET_ERROR, or RET_SPECIAL if there's no data
3237  *		in the tree to scan.
3238  *
3239  *	Side Effects:
3240  *		Changes current scan position for the tree.  Almost certainly
3241  *		changes current page, as well.  Sets BTF_SEQINIT bit in tree
3242  *		flags, so that we know we've initialized a scan.
3243  */
3244 
3245 static int
3246 _bt_seqinit(t, key, flags)
3247 	BTREE_P t;
3248 	DBT *key;
3249 	int flags;
3250 {
3251 	BTITEM *item;
3252 	BTHEADER *h;
3253 	CURSOR *c;
3254 	IDATUM *id;
3255 	pgno_t pgno;
3256 	index_t index;
3257 	index_t last;
3258 
3259 	/*
3260 	 *  Figure out if we really have to search for the key that the
3261 	 *  user supplied.  If key is null, then this is an unkeyed scan
3262 	 *  and we can just start from an endpoint.
3263 	 */
3264 
3265 	c = &(t->bt_cursor);
3266 
3267 	if (flags == R_CURSOR) {
3268 		if (key->data != (char *) NULL) {
3269 
3270 			/* key supplied, find it */
3271 			item = _bt_search(t, key);
3272 			c->c_index = item->bti_index;
3273 			c->c_pgno = t->bt_curpage->h_pgno;
3274 		} else {
3275 			errno = EINVAL;
3276 			return (RET_ERROR);
3277 		}
3278 
3279 	} else {
3280 
3281 		/*
3282 		 *  Unkeyed scan.  For backward scans, find the last item
3283 		 *  in the tree; for forward scans, find the first item.
3284 		 */
3285 
3286 		if (_bt_getpage(t, P_ROOT) == RET_ERROR)
3287 			return (RET_ERROR);
3288 		h = t->bt_curpage;
3289 		if (flags == R_LAST || flags == R_PREV) {
3290 
3291 			/* backward scan */
3292 			while (!(h->h_flags & F_LEAF)) {
3293 				last = NEXTINDEX(h) - 1;
3294 				id = (IDATUM *) GETDATUM(h,last);
3295 				if (_bt_getpage(t, id->i_pgno) == RET_ERROR)
3296 					return (RET_ERROR);
3297 				h = t->bt_curpage;
3298 			}
3299 
3300 			/* skip empty pages */
3301 			while (NEXTINDEX(h) == 0 && h->h_prevpg != P_NONE) {
3302 				if (_bt_getpage(t, h->h_prevpg) == RET_ERROR)
3303 					return (RET_ERROR);
3304 				h = t->bt_curpage;
3305 			}
3306 
3307 			c->c_pgno = h->h_pgno;
3308 			if (NEXTINDEX(h) > 0)
3309 				c->c_index = NEXTINDEX(h) - 1;
3310 			else
3311 				c->c_index = 0;
3312 		} else if (flags == R_FIRST || flags == R_NEXT) {
3313 			/* forward scan */
3314 			while (!(h->h_flags & F_LEAF)) {
3315 				id = (IDATUM *) GETDATUM(h,0);
3316 				if (_bt_getpage(t, id->i_pgno) == RET_ERROR)
3317 					return (RET_ERROR);
3318 				h = t->bt_curpage;
3319 			}
3320 
3321 			/* skip empty pages */
3322 			while (NEXTINDEX(h) == 0 && h->h_nextpg != P_NONE) {
3323 				if (_bt_getpage(t, h->h_nextpg) == RET_ERROR)
3324 					return (RET_ERROR);
3325 				h = t->bt_curpage;
3326 			}
3327 
3328 			c->c_pgno = h->h_pgno;
3329 			c->c_index = 0;
3330 		} else {
3331 			/* no flags passed in */
3332 			errno = EINVAL;
3333 			return (RET_ERROR);
3334 		}
3335 	}
3336 
3337 	/* okay, scan is initialized */
3338 	t->bt_flags |= BTF_SEQINIT;
3339 
3340 	if (c->c_index == NEXTINDEX(t->bt_curpage))
3341 		return (RET_SPECIAL);
3342 
3343 	return (RET_SUCCESS);
3344 }
3345 
3346 /*
3347  *  _BT_SEQADVANCE -- Advance the sequential scan on this tree.
3348  *
3349  *	Moves the current location pointer for the scan on this tree one
3350  *	spot in the requested direction.
3351  *
3352  *	Parameters:
3353  *		t -- btree being scanned
3354  *		flags -- for R_NEXT, R_PREV
3355  *
3356  *	Returns:
3357  *		RET_SUCCESS, RET_ERROR, or RET_SPECIAL if there is no
3358  *		more data in the specified direction.
3359  *
3360  *	Side Effects:
3361  *		May change current page.
3362  */
3363 
3364 static int
3365 _bt_seqadvance(t, flags)
3366 	BTREE_P t;
3367 	int flags;
3368 {
3369 	BTHEADER *h;
3370 	CURSOR *c;
3371 	index_t index;
3372 
3373 	c = &(t->bt_cursor);
3374 	index = c->c_index;
3375 
3376 	if (_bt_getpage(t, c->c_pgno) == RET_ERROR)
3377 		return (RET_ERROR);
3378 	h = t->bt_curpage;
3379 
3380 	/* by the time we get here, don't need the cursor key anymore */
3381 	if (c->c_key != (char *) NULL)
3382 		(void) free(c->c_key);
3383 
3384 	if (flags == R_NEXT) {
3385 
3386 		/*
3387 		 *  This is a forward scan.  If the cursor is pointing
3388 		 *  at a virtual record (that is, it was pointing at
3389 		 *  a record that got deleted), then we should return
3390 		 *  the record it's pointing at now.  Otherwise, we
3391 		 *  should advance the scan.  In either case, we need
3392 		 *  to be careful not to run off the end of the current
3393 		 *  page.
3394 		 */
3395 
3396 		if (c->c_flags & CRSR_BEFORE) {
3397 
3398 			if (index >= NEXTINDEX(h)) {
3399 				/* out of items on this page, get next page */
3400 				if (h->h_nextpg == P_NONE) {
3401 					/* tell caller we're done... */
3402 					c->c_index = NEXTINDEX(h);
3403 					return (RET_SPECIAL);
3404 				}
3405 
3406 				/* skip empty pages */
3407 				do {
3408 					if (_bt_getpage(t, h->h_nextpg)
3409 					    == RET_ERROR) {
3410 						c->c_index = NEXTINDEX(h);
3411 						return (RET_ERROR);
3412 					}
3413 					h = t->bt_curpage;
3414 					c->c_pgno = h->h_pgno;
3415 				} while (NEXTINDEX(h) == 0
3416 					 && h->h_nextpg != P_NONE);
3417 
3418 				if (NEXTINDEX(h) == 0) {
3419 					/* tell caller we're done */
3420 					c->c_index = NEXTINDEX(h);
3421 					return (RET_SPECIAL);
3422 				}
3423 				index = 0;
3424 			}
3425 			c->c_flags &= ~CRSR_BEFORE;
3426 
3427 		} else if (++index >= NEXTINDEX(h)) {
3428 
3429 			/* out of items on this page, get next page */
3430 			if (h->h_nextpg == P_NONE) {
3431 				/* tell caller we're done... */
3432 				c->c_index = NEXTINDEX(h);
3433 				return (RET_SPECIAL);
3434 			}
3435 
3436 			/* skip empty pages */
3437 			do {
3438 				if (_bt_getpage(t, h->h_nextpg) == RET_ERROR) {
3439 					c->c_index = NEXTINDEX(h);
3440 					return (RET_ERROR);
3441 				}
3442 				h = t->bt_curpage;
3443 				c->c_pgno = h->h_pgno;
3444 			} while (NEXTINDEX(h) == 0 && h->h_nextpg != P_NONE);
3445 
3446 			if (NEXTINDEX(h) == 0) {
3447 				/* tell caller we're done */
3448 				c->c_index = NEXTINDEX(h);
3449 				return (RET_SPECIAL);
3450 			}
3451 			index = 0;
3452 		}
3453 	} else if (flags == R_PREV) {
3454 
3455 		/* for backward scans, life is substantially easier */
3456 		c->c_flags &= ~CRSR_BEFORE;
3457 		if (c->c_key != (char *) NULL) {
3458 			(void) free(c->c_key);
3459 			c->c_key = (char *) NULL;
3460 		}
3461 
3462 		if (index == 0) {
3463 
3464 			/* we may be done */
3465 			c->c_index = 0;
3466 
3467 			/* out of items on this page, get next page */
3468 			if (h->h_prevpg == P_NONE)
3469 				return (RET_SPECIAL);
3470 
3471 			/* skip empty pages */
3472 			do {
3473 				if (_bt_getpage(t, h->h_prevpg) == RET_ERROR)
3474 					return (RET_ERROR);
3475 				h = t->bt_curpage;
3476 				c->c_pgno = h->h_pgno;
3477 			} while (NEXTINDEX(h) == 0 && h->h_prevpg != P_NONE);
3478 
3479 			if (NEXTINDEX(h) == 0)
3480 				return (RET_SPECIAL);
3481 
3482 			index = NEXTINDEX(h) - 1;
3483 		} else
3484 			--index;
3485 	} else {
3486 		/* must specify a direction */
3487 		errno = EINVAL;
3488 		return (RET_ERROR);
3489 	}
3490 
3491 	c->c_index = index;
3492 	return (RET_SUCCESS);
3493 }
3494 
3495 /*
3496  *  BT_CLOSE -- Close a btree
3497  *
3498  *	Parameters:
3499  *		tree -- tree to close
3500  *
3501  *	Returns:
3502  *		RET_SUCCESS, RET_ERROR.
3503  *
3504  *	Side Effects:
3505  *		Frees space occupied by the tree.
3506  */
3507 
3508 int
3509 bt_close(tree)
3510 	BTREE tree;
3511 {
3512 	BTREE_P t = (BTREE_P) tree;
3513 	int i;
3514 	BTHEADER *h;
3515 	char *cache;
3516 	struct HTBUCKET *b, *sb;
3517 	HTABLE ht;
3518 	int fd;
3519 
3520 	if (t->bt_cursor.c_key != (char *) NULL)
3521 		(void) free(t->bt_cursor.c_key);
3522 
3523 	if (!ISDISK(t)) {
3524 		/* in-memory tree, release hash table memory */
3525 		ht = t->bt_s.bt_ht;
3526 		for (i = 0; i < HTSIZE; i++) {
3527 			if ((b = ht[i]) == (struct HTBUCKET *) NULL)
3528 				break;
3529 			do {
3530 				sb = b;
3531 				(void) free((char *) b->ht_page);
3532 				b = b->ht_next;
3533 				(void) free((char *) sb);
3534 			} while (b != (struct HTBUCKET *) NULL);
3535 		}
3536 		(void) free ((char *) ht);
3537 		(void) free ((char *) t);
3538 		return (RET_SUCCESS);
3539 	}
3540 
3541 	if ((t->bt_flags & BTF_ISWRITE) && !(t->bt_flags & BTF_METAOK)) {
3542 		if (_bt_wrtmeta(t) == RET_ERROR)
3543 			return (RET_ERROR);
3544 	}
3545 
3546 	if (t->bt_curpage != (BTHEADER *) NULL) {
3547 		h = t->bt_curpage;
3548 		if (h->h_flags & F_DIRTY) {
3549 			if (_bt_write(t, h, RELEASE) == RET_ERROR)
3550 				return (RET_ERROR);
3551 		} else {
3552 			if (_bt_release(t, h) == RET_ERROR)
3553 				return (RET_ERROR);
3554 		}
3555 
3556 		/* flush and free the cache, if there is one */
3557 		if (ISCACHE(t)) {
3558 			cache = t->bt_s.bt_d.d_cache;
3559 			lrusync(cache);
3560 			lrufree(cache);
3561 		}
3562 		(void) free ((char *) h);
3563 	}
3564 
3565 	fd = t->bt_s.bt_d.d_fd;
3566 	(void) free ((char *) t);
3567 	return (close(fd));
3568 }
3569 
3570 /*
3571  *  _BT_ALLOCPG -- allocate a new page in the btree.
3572  *
3573  *	This is called when we split a page, to get space to do the split.
3574  *	For disk btrees, these pages are released when the split is done.
3575  *	For memory btrees, they are not.
3576  *
3577  *	Parameters:
3578  *		t -- tree in which to do split
3579  *
3580  *	Returns:
3581  *		Pointer to the newly-allocated page
3582  */
3583 
3584 static BTHEADER *
3585 _bt_allocpg(t)
3586 	BTREE_P t;
3587 {
3588 	BTHEADER *h = t->bt_curpage;
3589 	BTHEADER *nh;
3590 	char *cache;
3591 	int nbytes;
3592 
3593 	/* if we have a cache, let the cache code do the work */
3594 	if (ISDISK(t) && ISCACHE(t)) {
3595 		nh = (BTHEADER *) lrugetnew(t->bt_s.bt_d.d_cache,
3596 					    t->bt_npages + 1,
3597 					    &nbytes);
3598 	} else {
3599 		nh = (BTHEADER *) malloc((unsigned) t->bt_psize);
3600 	}
3601 
3602 	if (nh != (BTHEADER *) NULL) {
3603 		nh->h_pgno = nh->h_prevpg = nh->h_nextpg = P_NONE;
3604 		nh->h_flags = h->h_flags;
3605 		nh->h_lower = (index_t)
3606 				(((char *) &(nh->h_linp[0])) - ((char *) nh));
3607 		nh->h_upper = t->bt_psize;
3608 	}
3609 
3610 	return (nh);
3611 }
3612 
3613 /*
3614  *  _BT_WRITE -- Write a specific page to a btree file.
3615  *
3616  *	Parameters:
3617  *		t -- btree to get the page
3618  *		h -- page to write
3619  *		relflag -- (int) this page may/may not be released
3620  *
3621  *	Returns:
3622  *		RET_SUCCESS, RET_ERROR.
3623  *
3624  *	Side Effects:
3625  *		Writes a metadata page if none has been written yet.
3626  */
3627 
3628 static int
3629 _bt_write(t, h, relflag)
3630 	BTREE_P t;
3631 	BTHEADER *h;
3632 	int relflag;
3633 {
3634 	long pos;
3635 	int htindex;
3636 	HTBUCKET *b;
3637 	char *cache;
3638 	BTMETA m;
3639 	pgno_t pgno;
3640 
3641 	h->h_flags &= ~F_DIRTY;
3642 	if (ISDISK(t)) {
3643 
3644 		/* if we haven't done so yet, write the metadata */
3645 		if (!(t->bt_flags & BTF_METAOK)) {
3646 			if (_bt_wrtmeta(t) == RET_ERROR)
3647 				return (RET_ERROR);
3648 		}
3649 
3650 		pgno = h->h_pgno;
3651 
3652 
3653 		/* if we have a cache, let the cache code do the work */
3654 		if ((cache = t->bt_s.bt_d.d_cache) != (char *) NULL) {
3655 			if (lruwrite(cache, pgno) == RET_ERROR)
3656 				return (RET_ERROR);
3657 			if (relflag && lrurelease(cache, pgno) == RET_ERROR)
3658 				return (RET_ERROR);
3659 
3660 		} else {
3661 			(void) _bt_pgout(h, (char *) t->bt_lorder);
3662 			/* now write the current page */
3663 			pos = (long) (pgno * t->bt_psize);
3664 			if (lseek(t->bt_s.bt_d.d_fd, pos, L_SET) != pos)
3665 				return (RET_ERROR);
3666 			if (write(t->bt_s.bt_d.d_fd, h, t->bt_psize)
3667 			    < t->bt_psize)
3668 				return (RET_ERROR);
3669 			if (!relflag)
3670 				(void) _bt_pgin(h, (char *) t->bt_lorder);
3671 		}
3672 	} else {
3673 		/* in-memory btree */
3674 		htindex = HASHKEY(h->h_pgno);
3675 
3676 		/* see if we need to overwrite existing entry */
3677 		for (b = t->bt_s.bt_ht[htindex];
3678 		     b != (HTBUCKET *) NULL;
3679 		     b = b->ht_next) {
3680 			if (b->ht_pgno == h->h_pgno) {
3681 				b->ht_page = h;
3682 				return (RET_SUCCESS);
3683 			}
3684 		}
3685 
3686 		/* new entry, write it */
3687 		b = (HTBUCKET *) malloc((unsigned) sizeof (HTBUCKET));
3688 		if (b == (HTBUCKET *) NULL)
3689 			return (RET_ERROR);
3690 
3691 		b->ht_pgno = h->h_pgno;
3692 		b->ht_page = h;
3693 		b->ht_next = t->bt_s.bt_ht[htindex];
3694 		t->bt_s.bt_ht[htindex] = b;
3695 	}
3696 	return (RET_SUCCESS);
3697 }
3698 
3699 /*
3700  *  _BT_RELEASE -- Release a locked-down cache page
3701  *
3702  *	During page splits, we want to force pages out to the cache
3703  *	before we actually release them, in some cases.  This routine
3704  *	releases such a page when it is no longer needed.
3705  *
3706  *	Parameters:
3707  *		t -- btree in which to release page
3708  *		h -- page to release
3709  *
3710  *	Returns:
3711  *		RET_SUCCESS, RET_ERROR.
3712  *
3713  *	Side Effects:
3714  *		None.
3715  */
3716 
3717 static int
3718 _bt_release(t, h)
3719 	BTREE_P t;
3720 	BTHEADER *h;
3721 {
3722 	if (ISDISK(t) && ISCACHE(t)) {
3723 		if (lrurelease(t->bt_s.bt_d.d_cache, h->h_pgno) < 0)
3724 			return (RET_ERROR);
3725 	}
3726 	return (RET_SUCCESS);
3727 }
3728 
3729 /*
3730  *  _BT_WRTMETA -- Write metadata to the btree.
3731  *
3732  *	Parameters:
3733  *		t -- tree to which to write
3734  *
3735  *	Returns:
3736  *		RET_SUCCESS, RET_ERROR.
3737  */
3738 
3739 static int
3740 _bt_wrtmeta(t)
3741 	BTREE_P t;
3742 {
3743 	BTMETA m;
3744 
3745 	if (lseek(t->bt_s.bt_d.d_fd, 0l, L_SET) != 0l)
3746 		return (RET_ERROR);
3747 
3748 	/* lorder has to be in host-independent format */
3749 	m.m_lorder = (u_long) htonl((long) t->bt_lorder);
3750 
3751 	m.m_magic = BTREEMAGIC;
3752 	m.m_version = BTREEVERSION;
3753 	m.m_psize = t->bt_psize;
3754 	m.m_free = t->bt_free;
3755 	m.m_flags = t->bt_flags & BTF_NODUPS;
3756 
3757 	if (t->bt_lorder != BYTE_ORDER) {
3758 		BLSWAP(m.m_magic);
3759 		BLSWAP(m.m_version);
3760 		BLSWAP(m.m_psize);
3761 		BLSWAP(m.m_free);
3762 		BLSWAP(m.m_flags);
3763 	}
3764 
3765 	if (write(t->bt_s.bt_d.d_fd, &m, sizeof(m))
3766 	    != sizeof(m)) {
3767 		return (RET_ERROR);
3768 	}
3769 
3770 	t->bt_flags |= BTF_METAOK;
3771 
3772 	return (RET_SUCCESS);
3773 }
3774 
3775 #ifdef DEBUG
3776 void
3777 _btdump(tree)
3778 	BTREE tree;
3779 {
3780 	BTREE_P t = (BTREE_P) tree;
3781 	DATUM *d;
3782 	IDATUM *id;
3783 	BTHEADER *h;
3784 	pgno_t npages;
3785 	pgno_t i;
3786 	index_t cur, top;
3787 
3788 	npages = t->bt_npages;
3789 	(void) printf("\"%s\" fd %d pgsz %d curpg %d @ 0x%lx",
3790 		t->bt_fname, t->bt_s.bt_d.d_fd,
3791 		t->bt_psize, t->bt_curpage);
3792 	(void) printf("npg %d cmp 0x%lx flags=(", npages, t->bt_compare);
3793 	if (t->bt_flags & BTF_SEQINIT)
3794 		(void) printf("BTF_SEQINIT");
3795 	(void) printf(")\n");
3796 
3797 	for (i = P_ROOT; i <= npages; i++) {
3798 		if (_bt_getpage(t, i) == RET_ERROR)
3799 			_punt();
3800 		h = t->bt_curpage;
3801 		top = NEXTINDEX(h);
3802 		(void) printf("    page %d:\n", i);
3803 		(void) printf("\tpgno %d prev %d next %d\n",
3804 			h->h_pgno, h->h_prevpg, h->h_nextpg);
3805 		(void) printf("\tlower %d upper %d nextind %d flags (",
3806 			h->h_lower, h->h_upper, top);
3807 		if (h->h_flags & F_LEAF)
3808 			(void) printf("F_LEAF");
3809 		else
3810 			(void) printf("<internal>");
3811 		if (h->h_flags & F_DIRTY)
3812 			(void) printf("|F_DIRTY");
3813 		if (h->h_flags & F_PRESERVE)
3814 			(void) printf("|F_PRESERVE");
3815 		if (h->h_flags & F_CONT) {
3816 			(void) printf("|F_CONT)");
3817 			if (h->h_prevpg == P_NONE) {
3818 				size_t longsz;
3819 				(void) bcopy((char *) &(h->h_linp[0]),
3820 					      (char *) &longsz,
3821 					      sizeof(longsz));
3822 				printf("\n\t\t(chain start, data length %ld)",
3823 					longsz);
3824 			}
3825 			printf("\n");
3826 			continue;
3827 		}
3828 		(void) printf(")\n");
3829 		for (cur = 0; cur < top; cur++) {
3830 			(void) printf("\t  [%d] off %d ", cur, h->h_linp[cur]);
3831 			if (h->h_flags & F_LEAF) {
3832 				d = (DATUM *) GETDATUM(h,cur);
3833 				(void) printf("ksize %d", d->d_ksize);
3834 				if (d->d_flags & D_BIGKEY)
3835 					(void) printf(" (indirect)");
3836 				(void) printf("; dsize %d", d->d_dsize);
3837 				if (d->d_flags & D_BIGDATA)
3838 					(void) printf(" (indirect)");
3839 			} else {
3840 				id = (IDATUM *) GETDATUM(h,cur);
3841 				(void) printf("size %d pgno %d",
3842 					id->i_size, id->i_pgno);
3843 				if (id->i_flags & D_BIGKEY)
3844 					(void) printf(" (indirect)");
3845 			}
3846 			(void) printf("\n");
3847 		}
3848 		(void) printf("\n");
3849 	}
3850 }
3851 #endif /* DEBUG */
3852 
3853 /*
3854  *  _BT_CMP -- Compare a key to a given item on the current page.
3855  *
3856  *	This routine is a wrapper for the user's comparison function.
3857  *
3858  *	Parameters:
3859  *		t -- tree in which to do comparison
3860  *		p -- pointer to one argument for the comparison function
3861  *		n -- index of item to supply second arg to comparison function
3862  *
3863  *	Returns:
3864  *		< 0 if p is < item at n
3865  *		= 0 if p is = item at n
3866  *		> 0 if p is > item at n
3867  */
3868 
3869 static int
3870 _bt_cmp(t, p, n)
3871 	BTREE_P t;
3872 	char *p;
3873 	index_t n;
3874 {
3875 	BTHEADER *h;
3876 	IDATUM *id;
3877 	DATUM *d;
3878 	char *arg;
3879 	int ignore;
3880 	int free_arg;
3881 	pgno_t chain;
3882 	int r;
3883 
3884 	h = t->bt_curpage;
3885 
3886 	/*
3887 	 *  The left-most key at any level of the tree on internal pages
3888 	 *  is guaranteed (artificially, by the code here) to be less than
3889 	 *  any user key.  This saves us from having to update the leftmost
3890 	 *  key when the user inserts a new key in the tree smaller than
3891 	 *  anything we've seen yet.
3892 	 */
3893 
3894 	if (h->h_prevpg == P_NONE && !(h->h_flags & F_LEAF) && n == 0)
3895 		return (1);
3896 
3897 	if (h->h_flags & F_LEAF) {
3898 		d = (DATUM *) GETDATUM(h,n);
3899 		if (d->d_flags & D_BIGKEY) {
3900 			free_arg = TRUE;
3901 			bcopy(&(d->d_bytes[0]), (char *) &chain, sizeof(chain));
3902 			if (_bt_getbig(t, chain, &arg, &ignore) == RET_ERROR)
3903 				return (RET_ERROR);
3904 		} else {
3905 			free_arg = FALSE;
3906 			arg = &(d->d_bytes[0]);
3907 		}
3908 	} else {
3909 		id = (IDATUM *) GETDATUM(h,n);
3910 		if (id->i_flags & D_BIGKEY) {
3911 			free_arg = TRUE;
3912 			bcopy(&(id->i_bytes[0]),
3913 			      (char *) &chain,
3914 			      sizeof(chain));
3915 			if (_bt_getbig(t, chain, &arg, &ignore) == RET_ERROR)
3916 				return (RET_ERROR);
3917 		} else {
3918 			free_arg = FALSE;
3919 			arg = &(id->i_bytes[0]);
3920 		}
3921 	}
3922 	r = (*(t->bt_compare))(p, arg);
3923 
3924 	if (free_arg)
3925 		(void) free(arg);
3926 
3927 	return (r);
3928 }
3929 
3930 /*
3931  *  _BT_PUSH/_BT_POP -- Push/pop a parent page number on the parent stack.
3932  *
3933  *	When we descend the tree, we keep track of parent pages in order
3934  *	to handle splits on insertions.
3935  *
3936  *	Parameters:
3937  *		t -- tree for which to push parent
3938  *		pgno -- page number to push (_bt_push only)
3939  *
3940  *	Returns:
3941  *		None.
3942  */
3943 
3944 static
3945 _bt_push(t, pgno)
3946 	BTREE_P t;
3947 	pgno_t pgno;
3948 {
3949 	BTSTACK *new;
3950 
3951 	new = (BTSTACK *) malloc((unsigned) sizeof(BTSTACK));
3952 	new->bts_pgno = pgno;
3953 	new->bts_next = t->bt_stack;
3954 	t->bt_stack = new;
3955 }
3956 
3957 static
3958 _bt_pop(t)
3959 	BTREE_P t;
3960 {
3961 	BTSTACK *s;
3962 	pgno_t p = P_NONE;
3963 
3964 	if ((s = t->bt_stack) != (BTSTACK *) NULL) {
3965 		p = s->bts_pgno;
3966 		t->bt_stack = s->bts_next;
3967 		(void) free ((char *) s);
3968 	}
3969 	return (p);
3970 }
3971 
3972 #ifdef DEBUG
3973 _punt()
3974 {
3975 	int pid;
3976 
3977 	pid = getpid();
3978 	(void) kill(pid, SIGILL);
3979 }
3980 #endif /* DEBUG */
3981