xref: /csrg-svn/lib/libc/db/btree/bt_open.c (revision 45876)
1 /*-
2  * Copyright (c) 1990 The Regents of the University of California.
3  * All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * Mike Olson.
7  *
8  * %sccs.include.redist.c%
9  */
10 
11 #if defined(LIBC_SCCS) && !defined(lint)
12 static char sccsid[] = "@(#)bt_open.c	5.1 (Berkeley) 01/07/91";
13 #endif /* LIBC_SCCS and not lint */
14 
15 /*
16  *  btree.c -- implementation of btree access method for 4.4BSD.
17  *
18  *	The design here is based on that of the btree access method used
19  *	in the Postgres database system at UC Berkeley.  The implementation
20  *	is wholly independent of the Postgres code.
21  *
22  *	This implementation supports btrees on disk (supply a filename) or
23  *	in memory (don't).  Public interfaces defined here are:
24  *
25  *		btree_open()	-- wrapper; returns a filled DB struct for
26  *				   a btree.
27  *
28  *		bt_open()	-- open a new or existing btree.
29  *		bt_get()	-- fetch data from a tree by key.
30  *		bt_seq()	-- do a sequential scan on a tree.
31  *		bt_put()	-- add data to a tree by key.
32  *		bt_delete()	-- remove data from a tree by key.
33  *		bt_close()	-- close a btree.
34  *		bt_sync()	-- force btree pages to disk (disk trees only).
35  */
36 
37 #include <sys/param.h>
38 #include <sys/file.h>
39 #include <sys/stat.h>
40 #include <sys/signal.h>
41 #include <sys/errno.h>
42 #include <db.h>
43 
44 #ifndef BLSWAP
45 #define BLSWAP(a) { \
46         register unsigned long _t; \
47         _t = (a); \
48         (a) >>= 24; \
49         (a) |= (_t & 0x00ff0000) >>  8; \
50         (a) |= (_t & 0x0000ff00) <<  8; \
51         (a) |= (_t & 0x000000ff) << 24; \
52 }
53 #endif /* ndef BLSWAP */
54 
55 typedef char	*BTREE;		/* should really be (void *) */
56 
57 /* #define	DEBUG */
58 
59 #define RET_ERROR	-1
60 #define RET_SUCCESS	 0
61 #define RET_SPECIAL	 1
62 
63 #ifndef TRUE
64 #define TRUE	1
65 #define FALSE	0
66 #endif /* ndef TRUE */
67 
68 extern int errno;
69 
70 /* libc */
71 extern char *malloc();
72 
73 /* these are defined in lrucache.c */
74 extern char	*lruinit();
75 extern char	*lruget();
76 extern char	*lrugetnew();
77 extern int	lrusync();
78 extern int	lruwrite();
79 extern int	lrurelease();
80 extern void	lrufree();
81 
82 /* these are defined here */
83 BTREE	bt_open();
84 int	bt_close();
85 int	bt_delete();
86 int	bt_get();
87 int	bt_put();
88 int	bt_seq();
89 int	bt_sync();
90 
91 /*
92  *  Private types.  What you choose for these depends on how big you
93  *  want to let files get, and how big you want to let pages get.
94  */
95 
96 typedef u_long	index_t;	/* so # bytes on a page fits in a long */
97 typedef u_long	pgno_t;		/* so # of pages in a btree fits in a long */
98 
99 /*
100  *  When we do searches, we push the parent page numbers onto a stack
101  *  as we descend the tree.  This is so that for insertions, we can
102  *  find our way back up to do internal page insertions and splits.
103  */
104 
105 typedef struct BTSTACK {
106 	pgno_t		bts_pgno;
107 	struct BTSTACK	*bts_next;
108 } BTSTACK;
109 
110 /*
111  *  Every btree page has a header that looks like this.  Flags are given
112  *  in the #define's for the F_ flags (see below).
113  */
114 
115 typedef struct BTHEADER {
116 	pgno_t h_pgno;		/* page number of this page */
117 	pgno_t h_prevpg;	/* left sibling */
118 	pgno_t h_nextpg;	/* right sibling */
119 
120 #define F_LEAF		0x01	/* leaf page, contains user data */
121 #define F_CONT		0x02	/* continuation page (large items) */
122 #define F_DIRTY		0x04	/* need to write to disk */
123 #define F_PRESERVE	0x08	/* never delete this chain of pages */
124 
125 	u_long h_flags;		/* page state */
126 	index_t h_lower;	/* lower bound of free space on page */
127 	index_t h_upper;	/* upper bound of free space on page */
128 	index_t h_linp[1];	/* VARIABLE LENGTH DATA AT END OF STRUCT */
129 } BTHEADER;
130 
131 /*
132  *  HTBUCKETs are hash table buckets for looking up pages of in-memory
133  *  btrees by page number.  We use this indirection, rather than direct
134  *  pointers, so that the code for manipulating in-memory trees is the
135  *  same as that for manipulating on-disk trees.
136  */
137 
138 typedef struct HTBUCKET {
139 	pgno_t		ht_pgno;
140 	BTHEADER	*ht_page;
141 	struct HTBUCKET	*ht_next;
142 } HTBUCKET;
143 
144 typedef HTBUCKET	**HTABLE;
145 
146 /* minimum size we'll let a page be */
147 #define MINPSIZE	512
148 
149 /* default cache size, in bytes */
150 #define DEFCACHE	(20 * 1024)
151 
152 /* hash table size for in-memory trees */
153 #define	HTSIZE		128
154 
155 /* generate a hash key from a page number */
156 #define HASHKEY(pgno)	((pgno - 1) % HTSIZE)
157 
158 /*
159  *  Disk btrees have a file descriptor, and may also have an lru buffer
160  *  cache, if the user asked for one.
161  */
162 
163 typedef struct BTDISK {
164 	int	d_fd;
165 	char	*d_cache;
166 } BTDISK;
167 
168 /*
169  *  Cursors keep track of the current location in a sequential scan of
170  *  the database.  Since btrees impose a total ordering on keys, we can
171  *  walk forward or backward through the database from any point.  Cursors
172  *  survive updates to the tree, and can be used to delete a particular
173  *  record.
174  */
175 
176 typedef struct CURSOR {
177 	pgno_t		c_pgno;		/* pgno of current item in scan */
178 	index_t		c_index;	/* index of current item in scan */
179 	char		*c_key;		/* current key, used for updates */
180 
181 #define CRSR_BEFORE	0x01
182 
183 	u_char		c_flags;	/* to handle updates properly */
184 } CURSOR;
185 
186 /*
187  *  The private btree data structure.  The user passes a pointer to one of
188  *  these when we are to manipulate a tree, but the BTREE type is opaque
189  *  to him.
190  */
191 
192 typedef struct BTREEDATA_P {
193 	char		*bt_fname;		/* NULL for in-memory trees */
194 	union {
195 		BTDISK	bt_d;			/* for on-disk btrees */
196 		HTABLE	bt_ht;			/* hash table for mem trees */
197 	} bt_s;
198 	size_t		bt_psize;		/* page size for btree pages */
199 	int		(*bt_compare)();	/* key comparison function */
200 	pgno_t		bt_npages;		/* number of pages in tree */
201 	BTHEADER	*bt_curpage;		/* current page contents */
202 	pgno_t		bt_free;		/* free pg list for big data */
203 	CURSOR		bt_cursor;		/* cursor for scans */
204 	BTSTACK		*bt_stack;		/* parent stack for inserts */
205 	u_long		bt_lorder;		/* byte order (endian.h) */
206 
207 #define BTF_METAOK	0x01	/* meta-data written to start of file */
208 #define BTF_SEQINIT	0x02	/* we have called bt_seq */
209 #define BTF_ISWRITE	0x04	/* tree was opened for write */
210 #define BTF_NODUPS	0x08	/* tree created for unique keys */
211 
212 	u_long		bt_flags;		/* btree state */
213 } BTREEDATA_P;
214 
215 typedef BTREEDATA_P	*BTREE_P;
216 
217 /*
218  *  The first thing in a btree file is a BTMETA structure.  The rest of
219  *  the first page is empty, so that all disk operations are page-aligned.
220  */
221 
222 typedef struct BTMETA {
223 	u_long	m_magic;
224 	u_long	m_version;
225 	size_t	m_psize;
226 	pgno_t	m_free;
227 	u_long	m_flags;
228 	u_long	m_lorder;
229 } BTMETA;
230 
231 #define P_NONE		0		/* invalid page number in tree */
232 #define P_ROOT		1		/* page number of root pg in btree */
233 
234 #define NORELEASE	0		/* don't release a page during write */
235 #define RELEASE		1		/* release a page during write */
236 
237 #define INSERT		0		/* doing an insert operation */
238 #define DELETE		1		/* doing a delete operation */
239 
240 /* magic number for identifying btree files */
241 #define BTREEMAGIC	053162L		/* magic */
242 #define BTREEVERSION	2		/* last changed 6 jan 1991 */
243 
244 /* get the next free index on a btree page */
245 #define NEXTINDEX(p)	((((int)(p)->h_lower) - ((int)((((char *)(&(p)->h_linp[0]))) - ((char *) (p)))))/(sizeof(index_t)))
246 
247 /* is a BTITEM actually on the btree page? */
248 #define VALIDITEM(t, i)	((i)->bti_index < NEXTINDEX((t)->bt_curpage))
249 
250 /* guarantee longword alignment so structure refs work */
251 #define LONGALIGN(p) (((long)(p) + 3) & ~ 0x03)
252 
253 /* get a particular datum (or idatum) off a page */
254 #define GETDATUM(h,i)	 (((char *) h) + h->h_linp[i])
255 
256 /* is a {key,datum} too big to put on a single page? */
257 #define TOOBIG(t, sz)	(sz >= t->bt_psize / 5)
258 
259 /* is this a disk tree or a memory tree? */
260 #define ISDISK(t)	(t->bt_fname != (char *) NULL)
261 
262 /* does the disk tree use a cache? */
263 #define ISCACHE(t)	(t->bt_s.bt_d.d_cache != (char *) NULL)
264 
265 /*
266  *  DATUMs are for user data -- one appears on leaf pages for every
267  *  tree entry.  The d_bytes[] array contains the key first, then the data.
268  *
269  *  If either the key or the datum is too big to store on a single page,
270  *  a bit is set in the flags entry, and the d_bytes[] array contains a
271  *  pgno pointing to the page at which the data is actually stored.
272  *
273  *  Note on alignment:  every DATUM is guaranteed to be longword aligned
274  *  on the disk page.  In order to force longword alignment of user key
275  *  and data values, we must guarantee that the d_bytes[] array starts
276  *  on a longword boundary.  This is the reason that d_flags is a u_long,
277  *  rather than a u_char (it really only needs to be two bits big).  This
278  *  is necessary because we call the user's comparison function with a
279  *  pointer to the start of the d_bytes array.  We don't need to force
280  *  longword alignment of the data following the key, since that is copied
281  *  to a longword-aligned buffer before being returned to the user.
282  */
283 
284 typedef struct DATUM {
285 	size_t d_ksize;		/* size of key */
286 	size_t d_dsize;		/* size of data */
287 
288 #define D_BIGDATA	0x01	/* indirect datum ptr flag */
289 #define D_BIGKEY	0x02	/* indirect key ptr flag */
290 
291 	u_long d_flags;		/* flags (indirect bit) */
292 	char d_bytes[1];	/* VARIABLE LENGTH DATA AT END OF STRUCT */
293 } DATUM;
294 
295 /* BTITEMs are used to return (page, index, datum) tuples from searches */
296 typedef struct BTITEM {
297 	pgno_t bti_pgno;
298 	index_t bti_index;
299 	DATUM *bti_datum;
300 } BTITEM;
301 
302 /*
303  *  IDATUMs are for data stored on internal pages.  This is the (key, pgno)
304  *  pair, such that key 'key' is the first entry on page 'pgno'.  If our
305  *  internal page contains keys (a) and (b) next to each other, then all
306  *  items >= to (a) and < (b) go on the same page as (a).  There are some
307  *  gotchas with duplicate keys, however.  See the split code for details.
308  *
309  *  If a key is too big to fit on a single page, then the i_bytes[] array
310  *  contains a pgno pointing to the start of a chain that actually stores
311  *  the bytes.  Since items on internal pages are never deleted from the
312  *  tree, these indirect chains are marked as special, so that they won't
313  *  be deleted if the corresponding leaf item is deleted.
314  *
315  *  As for DATUMs, IDATUMs have a u_long flag entry (rather than u_char)
316  *  in order to guarantee that user keys are longword aligned on the disk
317  *  page.
318  */
319 
320 typedef struct IDATUM {
321 	size_t i_size;
322 	pgno_t i_pgno;
323 	u_long i_flags;		/* see DATUM.d_flags, above */
324 	char i_bytes[1];	/* VARIABLE LENGTH DATA AT END OF STRUCT */
325 } IDATUM;
326 
327 /* all private interfaces have a leading _ in their names */
328 static BTITEM	*_bt_search();
329 static BTITEM	*_bt_searchr();
330 static BTHEADER	*_bt_allocpg();
331 static index_t	_bt_binsrch();
332 static int	_bt_isonpage();
333 static BTITEM	*_bt_first();
334 static int	_bt_release();
335 static int	_bt_wrtmeta();
336 static int	_bt_delindir();
337 static int	_bt_pgout();
338 static int	_bt_pgin();
339 static int	_bt_fixscan();
340 static int	_bt_indirect();
341 static int	_bt_crsrdel();
342 
343 extern int strcmp();
344 
345 static BTREEINFO _DefaultBTInfo = {
346 	0,	/* flags */
347 	0,	/* cachesize */
348 	0,	/* psize */
349 	strcmp,	/* compare */
350 	0
351 };
352 
353 /*
354  *  BTREE_OPEN -- Wrapper routine to open a btree.
355  *
356  *	Creates and fills a DB struct, and calls the routine that actually
357  *	opens the btree.
358  *
359  *	Parameters:
360  *		f:  filename to open
361  *		flags:  flag bits passed to open
362  *		mode:  permission bits, used if O_CREAT specified
363  *		b:  BTREEINFO pointer
364  *
365  *	Returns:
366  *		Filled-in DBT on success; NULL on failure, with errno
367  *		set as appropriate.
368  *
369  *	Side Effects:
370  *		Allocates memory for the DB struct.
371  */
372 
373 DB *
374 btree_open(f, flags, mode, b)
375 	char *f;
376 	int flags;
377 	int mode;
378 	BTREEINFO *b;
379 {
380 	DB *db;
381 	BTREE t;
382 
383 	if ((db = (DB *) malloc((unsigned) sizeof(DB))) == (DB *) NULL)
384 		return ((DB *) NULL);
385 
386 	if ((t = bt_open(f, flags, mode, b)) == (BTREE) NULL) {
387 		(void) free ((char *) db);
388 		return ((DB *) NULL);
389 	}
390 
391 	db->internal = (char *) t;
392 	db->close = bt_close;
393 	db->delete = bt_delete;
394 	db->get = bt_get;
395 	db->put = bt_put;
396 	db->seq = bt_seq;
397 	db->sync = bt_sync;
398 
399 	return (db);
400 }
401 
402 /*
403  *  BT_OPEN -- Open a btree.
404  *
405  *	This routine creates the correct kind (disk or in-memory) of
406  *	btree and initializes it as required.
407  *
408  *	Parameters:
409  *		f -- name of btree (NULL for in-memory btrees)
410  *		flags -- flags passed to open()
411  *		mode -- mode passed to open ()
412  *		b -- BTREEINFO structure, describing btree
413  *
414  *	Returns:
415  *		(Opaque) pointer to the btree.  On failure, returns NULL
416  *		with errno set as appropriate.
417  *
418  *	Side Effects:
419  *		Allocates memory, opens files.
420  */
421 
422 BTREE
423 bt_open(f, flags, mode, b)
424 	char *f;
425 	int flags;
426 	int mode;
427 	BTREEINFO *b;
428 {
429 	BTREE_P t;
430 	HTABLE ht;
431 	int nbytes;
432 	int fd;
433 	CURSOR *c;
434 	BTMETA m;
435 	struct stat buf;
436 
437 	/* use the default info if none was provided */
438 	if (b == (BTREEINFO *) NULL)
439 		b = &_DefaultBTInfo;
440 
441 	if ((t = (BTREE_P) malloc((unsigned) sizeof *t)) == (BTREE_P) NULL)
442 		return ((BTREE) NULL);
443 
444 	if (b->compare)
445 		t->bt_compare = b->compare;
446 	else
447 		t->bt_compare = strcmp;
448 
449 	t->bt_fname = f;
450 	t->bt_curpage = (BTHEADER *) NULL;
451 	t->bt_free = P_NONE;
452 	c = &(t->bt_cursor);
453 	c->c_pgno = P_NONE;
454 	c->c_index = 0;
455 	c->c_flags = (u_char) NULL;
456 	c->c_key = (char *) NULL;
457 	t->bt_stack = (BTSTACK *) NULL;
458 	t->bt_flags = 0;
459 
460 	/*
461 	 *  If no file name was supplied, this is an in-memory btree.
462 	 *  Otherwise, it's a disk-based btree.
463 	 */
464 	if (f == (char *) NULL) {
465 		/* in memory */
466 		if ((t->bt_psize = b->psize) < MINPSIZE) {
467 			if (t->bt_psize != 0) {
468 				(void) free ((char *) t);
469 				errno = EINVAL;
470 				return ((BTREE) NULL);
471 			}
472 			t->bt_psize = getpagesize();
473 		}
474 
475 		nbytes = HTSIZE * sizeof(HTBUCKET *);
476 		if ((ht = (HTABLE) malloc((unsigned) nbytes))
477 		    == (HTABLE) NULL) {
478 			(void) free((char *) t);
479 			return ((BTREE) NULL);
480 		}
481 		(void) bzero((char *) ht, nbytes);
482 		t->bt_s.bt_ht = ht;
483 		t->bt_npages = 0;
484 		t->bt_lorder = BYTE_ORDER;
485 		if (!(b->flags & R_DUP))
486 			t->bt_flags |= BTF_NODUPS;
487 	} else {
488 		/* on disk */
489 		if ((fd = open(f, O_RDONLY, 0)) < 0) {
490 			/* doesn't exist yet, be sure page is big enough */
491 			if ((t->bt_psize = b->psize) < sizeof(BTHEADER)
492 			    && b->psize != 0) {
493 				(void) free((char *) t);
494 				errno = EINVAL;
495 				return ((BTREE) NULL);
496 			}
497 			if (b->lorder == 0)
498 				b->lorder = BYTE_ORDER;
499 
500 			if (b->lorder != BIG_ENDIAN
501 			    && b->lorder != LITTLE_ENDIAN) {
502 				(void) free((char *) t);
503 				errno = EINVAL;
504 				return ((BTREE) NULL);
505 			}
506 			t->bt_lorder = b->lorder;
507 			if (!(b->flags & R_DUP))
508 				t->bt_flags |= BTF_NODUPS;
509 		} else {
510 			/* exists, get page size from file */
511 			if (read(fd, &m, sizeof(m)) < sizeof(m)) {
512 				(void) close(fd);
513 				(void) free((char *) t);
514 				errno = EINVAL;
515 				return ((BTREE) NULL);
516 			}
517 
518 			/* lorder always stored in host-independent format */
519 			NTOHL(m.m_lorder);
520 			if (m.m_lorder != BIG_ENDIAN
521 			    && m.m_lorder != LITTLE_ENDIAN) {
522 				(void) free((char *) t);
523 				errno = EINVAL;
524 				return ((BTREE) NULL);
525 			}
526 			t->bt_lorder = m.m_lorder;
527 
528 			if (t->bt_lorder != BYTE_ORDER) {
529 				BLSWAP(m.m_magic);
530 				BLSWAP(m.m_version);
531 				BLSWAP(m.m_psize);
532 				BLSWAP(m.m_free);
533 				BLSWAP(m.m_flags);
534 			}
535 
536 			if (m.m_magic != BTREEMAGIC
537 			    || m.m_version != BTREEVERSION
538 			    || m.m_psize < MINPSIZE) {
539 				(void) close(fd);
540 				(void) free((char *) t);
541 #ifndef EFTYPE
542 #define EFTYPE	-100
543 #endif
544 				errno = EFTYPE;
545 				return ((BTREE) NULL);
546 			}
547 			t->bt_psize = m.m_psize;
548 			t->bt_free = m.m_free;
549 			t->bt_flags |= (m.m_flags & BTF_NODUPS) | BTF_METAOK;
550 			(void) close(fd);
551 		}
552 
553 		/* now open the file the way the user wanted it */
554 		if ((t->bt_s.bt_d.d_fd = open(f, flags, mode)) < 0) {
555 			(void) free ((char *) t);
556 			return ((BTREE) NULL);
557 		}
558 
559 		/* get number of pages, page size if necessary */
560 		(void) fstat(t->bt_s.bt_d.d_fd, &buf);
561 		if (t->bt_psize == 0)
562 			t->bt_psize = buf.st_blksize;
563 		t->bt_npages = (pgno_t) (buf.st_size / t->bt_psize);
564 
565 		/* page zero is metadata, doesn't count */
566 		if (t->bt_npages > 0)
567 			--(t->bt_npages);
568 
569 		if (b->cachesize == 0)
570 			b->cachesize = DEFCACHE;
571 
572 		/* get an lru buffer cache, if the user asked for one */
573 		if ((b->cachesize / t->bt_psize) > 0) {
574 			BTDISK *d = &(t->bt_s.bt_d);
575 
576 			d->d_cache = lruinit(d->d_fd,
577 					     b->cachesize / t->bt_psize,
578 					     t->bt_psize,
579 					     (char *) t->bt_lorder,
580 					     _bt_pgin, _bt_pgout);
581 
582 			if (d->d_cache == (char *) NULL) {
583 				(void) free((char *) t);
584 				return ((BTREE) NULL);
585 			}
586 		}
587 	}
588 
589 	/* remember if tree was opened for write */
590 	if (((flags & O_WRONLY) == O_WRONLY)
591 	    || ((flags & O_RDWR) == O_RDWR))
592 		t->bt_flags |= BTF_ISWRITE;
593 
594 	return ((BTREE) t);
595 }
596 
597 /*
598  *  BT_GET -- Get an entry from a btree.
599  *
600  *	Does a key lookup in the tree to find the specified key, and returns
601  *	the key/data pair found.
602  *
603  *	Parameters:
604  *		tree -- btree in which to do lookup
605  *		key -- key to find
606  *		data -- pointer to DBT in which to return data
607  *		flag -- ignored
608  *
609  *	Returns:
610  *		RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the key is not
611  *		found.  If key is not found, nothing is stored in the
612  *		return DBT 'data'.
613  *
614  *	Side Effects:
615  *		None.
616  *
617  *	Warnings:
618  *		Return data is statically allocated, and will be overwritten
619  *		at the next call.
620  */
621 
622 int
623 bt_get(tree, key, data, flag)
624 	BTREE tree;
625 	DBT *key;
626 	DBT *data;
627 	int flag;
628 {
629 	BTREE_P t = (BTREE_P) tree;
630 	BTHEADER *h;
631 	DATUM *d;
632 	BTITEM *item;
633 
634 	/* lookup */
635 	item = _bt_search(t, key);
636 	if (item == (BTITEM *) NULL)
637 		return (RET_ERROR);
638 
639 	/* clear parent pointer stack */
640 	while (_bt_pop(t) != P_NONE)
641 		continue;
642 
643 	h = (BTHEADER *) t->bt_curpage;
644 	data->size = 0;
645 	data->data = (char *) NULL;
646 
647 	/* match? */
648 	if (VALIDITEM(t, item)
649 	    && _bt_cmp(t, key->data, item->bti_index) == 0) {
650 		d = (DATUM *) GETDATUM(h, item->bti_index);
651 		return (_bt_buildret(t, d, data, key));
652 	}
653 
654 	/* nope */
655 	return (RET_SPECIAL);
656 }
657 
658 /*
659  *  BT_PUT -- Add an entry to a btree.
660  *
661  *	The specified (key, data) pair is added to the tree.  If the tree
662  *	was created for unique keys only, then duplicates will not be
663  *	entered.  If the requested key exists in the tree, it will be over-
664  *	written unless the flags parameter is R_NOOVERWRITE, in which case
665  *	the update will not be done.  If duplicate keys are permitted in the
666  *	tree, duplicates will be inserted and will not overwrite existing
667  *	keys.  Nodes are split as required.
668  *
669  *	Parameters:
670  *		tree -- btree in which to put the new entry
671  *		key -- key component to add
672  *		data -- data corresponding to key
673  *		flag -- R_NOOVERWRITE or zero.
674  *
675  *	Returns:
676  *		RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the
677  *		NOOVERWRITE flag was set and the specified key exists
678  *		in the database.
679  *
680  *	Side Effects:
681  *		None.
682  */
683 
684 int
685 bt_put(tree, key, data, flag)
686 	BTREE tree;
687 	DBT *key;
688 	DBT *data;
689 	int flag;
690 {
691 	BTREE_P t = (BTREE_P) tree;
692 	BTITEM *item;
693 
694 	/* look for this key in the tree */
695 	item = _bt_search(t, key);
696 
697 	/*
698 	 *  If this tree was originally created without R_DUP, then duplicate
699 	 *  keys are not allowed.  We need to check this at insertion time.
700 	 */
701 
702 	if (VALIDITEM(t, item) && _bt_cmp(t, key->data, item->bti_index) == 0) {
703 		if ((t->bt_flags & BTF_NODUPS) && flag == R_NOOVERWRITE) {
704 			if (_bt_delone(t, item->bti_index) == RET_ERROR)
705 				return (RET_ERROR);
706 		}
707 	}
708 
709 	return (_bt_insert(t, item, key, data, flag));
710 }
711 
712 /*
713  *  BT_DELETE -- delete a key from the tree.
714  *
715  *	Deletes all keys (and their associated data items) matching the
716  *	supplied key from the tree.  If the flags entry is R_CURSOR, then
717  *	the current item in the active scan is deleted.
718  *
719  *	Parameters:
720  *		tree -- btree from which to delete key
721  *		key -- key to delete
722  *		flags -- R_CURSOR or zero
723  *
724  *	Returns:
725  *		RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the specified
726  *		key was not in the tree.
727  *
728  *	Side Effects:
729  *		None.
730  */
731 
732 int
733 bt_delete(tree, key, flags)
734 	BTREE tree;
735 	DBT *key;
736 	int flags;
737 {
738 	BTREE_P t = (BTREE_P) tree;
739 	BTHEADER *h;
740 	BTITEM *item;
741 	int ndel = 0;
742 
743 	if (flags == R_CURSOR)
744 		return (_bt_crsrdel(t));
745 
746 	/* find the first matching key in the tree */
747 	item = _bt_first(t, key);
748 	h = t->bt_curpage;
749 
750 	/* delete all matching keys */
751 	for (;;) {
752 		while (VALIDITEM(t, item)
753 		       && (_bt_cmp(t, key->data, item->bti_index) == 0)) {
754 			if (_bt_delone(t, item->bti_index) == RET_ERROR)
755 				return (RET_ERROR);
756 			ndel++;
757 		}
758 
759 		if (VALIDITEM(t, item) || h->h_nextpg == P_NONE)
760 			break;
761 
762 		/* next page, if necessary */
763 		do {
764 			if (_bt_getpage(t, h->h_nextpg) == RET_ERROR)
765 				return (RET_ERROR);
766 			h = t->bt_curpage;
767 		} while (NEXTINDEX(h) == 0 && h->h_nextpg != P_NONE);
768 
769 		item->bti_pgno = h->h_pgno;
770 		item->bti_index = 0;
771 
772 		if (!VALIDITEM(t, item)
773 		    || _bt_cmp(t, key->data, item->bti_index) != 0)
774 			break;
775 	}
776 
777 	/* clean up the parent stack */
778 	while (_bt_pop(t) != P_NONE)
779 		continue;
780 
781 	/* flush changes to disk */
782 	if (ISDISK(t)) {
783 		if (h->h_flags & F_DIRTY) {
784 			if (_bt_write(t, t->bt_curpage, NORELEASE) == RET_ERROR)
785 				return (RET_ERROR);
786 		}
787 	}
788 
789 	if (ndel == 0)
790 		return (RET_SPECIAL);
791 
792 	return (RET_SUCCESS);
793 }
794 
795 /*
796  *  _BT_CRSRDEL -- Delete the item pointed to by the cursor.
797  *
798  *	This routine deletes the item most recently returned by a scan
799  *	through the tree.  Since it only makes sense to delete the current
800  *	record once, we make sure that we don't try to delete twice without
801  *	advancing the scan.
802  *
803  *	Parameters:
804  *		t -- tree in which to do deletion
805  *
806  *	Returns:
807  *		RET_SUCCESS, RET_ERROR.
808  *
809  *	Side Effects:
810  *		The call to _bt_delone marks the cursor, so we can tell that
811  *		the current record has been deleted.
812  */
813 
814 static int
815 _bt_crsrdel(t)
816 	BTREE_P t;
817 {
818 	CURSOR *c;
819 
820 	c = &(t->bt_cursor);
821 
822 	/* a cursor must exist, and can't have deleted the current key yet */
823 	if (!(t->bt_flags & BTF_SEQINIT) || (c->c_flags & CRSR_BEFORE)) {
824 		errno = EINVAL;
825 		return (RET_ERROR);
826 	}
827 
828 	if (_bt_getpage(t, c->c_pgno) == RET_ERROR)
829 		return (RET_ERROR);
830 
831 	if (c->c_index >= NEXTINDEX(t->bt_curpage)) {
832 		errno = EINVAL;
833 		return (RET_ERROR);
834 	}
835 
836 	return (_bt_delone(t, c->c_index));
837 }
838 
839 /*
840  *  _BT_DELONE -- Delete a single entry from a btree.
841  *
842  *	This routine physically removes a btree entry from a leaf page.
843  *	IDATUM items are *never* removed from internal nodes, regardless
844  *	of whether the entries that originally caused them to be added
845  *	are removed from the tree or not.  In addition, pages made empty
846  *	by element deletion are not actually reclaimed.  They are,
847  *	however, made available for reuse.
848  *
849  *	To delete an item from a page, we pack the remaining items at
850  *	the end of the page, overwriting the deleted item's entry.  We
851  *	move the line pointers backward on the page, overwriting the
852  *	original item's line pointer.  This guarantees that the space in
853  *	the middle of the page is free -- a property that our insertion
854  *	strategy relies on.
855  *
856  *	This routine doesn't reclaim pages all of whose entries have
857  *	been deleted.  These pages are available for reuse, however.
858  *	If an item is deleted that was too big to fit on a page, then
859  *	the blocks that it occupies are put on a free list for reuse.
860  *
861  *	Parameters:
862  *		t -- btree from which to delete item
863  *		index -- index of entry on current page to delete
864  *
865  *	Returns:
866  *		RET_SUCCESS, RET_ERROR.
867  *
868  *	Side Effects:
869  *		Physically changes page layout, adjusts internal page
870  *		state to reflect the deletion of the item, and updates
871  *		the list of free pages for this tree.
872  */
873 
874 static int
875 _bt_delone(t, index)
876 	BTREE_P t;
877 	index_t index;
878 {
879 	char *src, *dest;
880 	int nbytes, nmoved;
881 	index_t off;
882 	index_t top;
883 	index_t i;
884 	pgno_t chain;
885 	BTHEADER *h;
886 	CURSOR *c;
887 	DATUM *d;
888 
889 	/* deletion may confuse an active scan.  fix it.  */
890 	c = &(t->bt_cursor);
891 	if (t->bt_flags & BTF_SEQINIT && t->bt_curpage->h_pgno == c->c_pgno)
892 		if (_bt_fixscan(t, index, (DATUM *) NULL, DELETE) == RET_ERROR)
893 			return (RET_ERROR);
894 
895 	h = t->bt_curpage;
896 	off = h->h_linp[index];
897 	d = (DATUM *) GETDATUM(h, index);
898 
899 	/* if this is a big item, reclaim the space it occupies */
900 	if (d->d_flags & D_BIGKEY) {
901 		bcopy(&(d->d_bytes[0]),
902 		      (char *) &chain,
903 		      sizeof(chain));
904 		if (_bt_delindir(t, chain) == RET_ERROR)
905 			return (RET_ERROR);
906 		h = t->bt_curpage;
907 		d = (DATUM *) GETDATUM(h, index);
908 	}
909 	if (d->d_flags & D_BIGDATA) {
910 		bcopy(&(d->d_bytes[d->d_ksize]),
911 		      (char *) &chain,
912 		      sizeof(chain));
913 		if (_bt_delindir(t, chain) == RET_ERROR)
914 			return (RET_ERROR);
915 		h = t->bt_curpage;
916 		d = (DATUM *) GETDATUM(h, index);
917 	}
918 
919 	/* move the data down on the page */
920 	nbytes = d->d_ksize + d->d_dsize
921 		 + (sizeof(DATUM) - sizeof(char));
922 	nbytes = LONGALIGN(nbytes);
923 	src = ((char *) h) + h->h_upper;
924 	dest = src + nbytes;
925 	nmoved = (int) (((char *) d) - src);
926 	(void) bcopy(src, dest, nmoved);
927 
928 	/* next move the line pointers up */
929 	src = (char *) &(h->h_linp[index + 1]);
930 	dest = (char *) &(h->h_linp[index]);
931 	nmoved = (int) (((char *) &(h->h_linp[NEXTINDEX(h)])) - src);
932 	(void) bcopy(src, dest, nmoved);
933 
934 	/* remember that we freed up some space */
935 	h->h_upper += nbytes;
936 	h->h_lower -= sizeof(index_t);
937 
938 	/* adjust offsets in line pointers affected by moving the data */
939 	top = NEXTINDEX(h);
940 	for (i = 0; i < top; i++) {
941 		if (h->h_linp[i] < off)
942 			h->h_linp[i] += nbytes;
943 	}
944 
945 	/* it's gone */
946 	h->h_flags |= F_DIRTY;
947 
948 	return (RET_SUCCESS);
949 }
950 
951 /*
952  *  _BT_FIXSCAN -- Adjust a scan to cope with a change in tree structure.
953  *
954  *	If the user has an active scan on the database, and we delete an
955  *	item from the page the cursor is pointing at, we need to figure
956  *	out what to do about it.  Basically, the solution is to point
957  *	"between" keys in the tree, using the CRSR_BEFORE flag.  The
958  *	requirement is that the user should not miss any data in the
959  *	tree during a scan, just because he happened to do some deletions
960  *	or insertions while it was active.
961  *
962  *	In order to guarantee that the scan progresses properly, we need
963  *	to save the key of any deleted item we were pointing at, so that
964  *	we can check later insertions against it.
965  *
966  *	Parameters:
967  *		t -- tree to adjust
968  *		index -- index of item at which change was made
969  *		newd -- new datum (for insertions only)
970  *		op -- operation (DELETE or INSERT) causing change
971  *
972  *	Returns:
973  *		RET_SUCCESS, RET_ERROR (errno is set).
974  *
975  *	Side Effects:
976  *		None.
977  */
978 
979 static int
980 _bt_fixscan(t, index, newd, op)
981 	BTREE_P t;
982 	index_t index;
983 	DATUM *newd;
984 	int op;
985 {
986 	BTHEADER *h;
987 	CURSOR *c;
988 	DATUM *d;
989 
990 	h = t->bt_curpage;
991 	c = &(t->bt_cursor);
992 
993 	if (op == DELETE) {
994 		if (index < c->c_index)
995 			c->c_index--;
996 		else if (index == c->c_index) {
997 			if (!(c->c_flags & CRSR_BEFORE)) {
998 				if (_bt_crsrkey(t) == RET_ERROR)
999 					return (RET_ERROR);
1000 				c->c_flags |= CRSR_BEFORE;
1001 			}
1002 		}
1003 	} else {
1004 		/*
1005 		 *  If we previously deleted the object at this location,
1006 		 *  and now we're inserting a new one, we need to do the
1007 		 *  right thing -- the cursor should come either before
1008 		 *  or after the new item, depending on the key that was
1009 		 *  here, and the new one.
1010 		 */
1011 
1012 		if (c->c_flags & CRSR_BEFORE) {
1013 			if (index <= c->c_index) {
1014 				char *tmp;
1015 				int itmp;
1016 				pgno_t chain;
1017 				int r;
1018 
1019 				d = (DATUM *) GETDATUM(t->bt_curpage, index);
1020 				if (d->d_flags & D_BIGKEY) {
1021 					bcopy(&(newd->d_bytes[0]),
1022 					      (char *) &chain,
1023 					      sizeof(chain));
1024 					if (_bt_getbig(t, chain, &tmp, &itmp)
1025 					     == RET_ERROR)
1026 						return (RET_ERROR);
1027 				} else
1028 					tmp = &(newd->d_bytes[0]);
1029 
1030 				r = (*(t->bt_compare))(tmp, c->c_key);
1031 				if (r < 0)
1032 					c->c_index++;
1033 
1034 				if (d->d_flags & D_BIGKEY)
1035 					(void) free (tmp);
1036 			}
1037 		} else if (index <= c->c_index)
1038 			c->c_index++;
1039 	}
1040 	return (RET_SUCCESS);
1041 }
1042 
1043 /*
1044  *  _BT_CRSRKEY -- Save a copy of the key of the record that the cursor
1045  *		   is pointing to.  The record is about to be deleted.
1046  *
1047  *	Parameters:
1048  *		t -- btree
1049  *
1050  *	Returns:
1051  *		RET_SUCCESS, RET_ERROR.
1052  *
1053  *	Side Effects:
1054  *		Allocates memory for the copy which should be released when
1055  *		it is no longer needed.
1056  */
1057 
1058 static int
1059 _bt_crsrkey(t)
1060 	BTREE_P t;
1061 {
1062 	CURSOR *c;
1063 	DATUM *d;
1064 	pgno_t pgno;
1065 	int ignore;
1066 
1067 	c = &(t->bt_cursor);
1068 	d = (DATUM *) GETDATUM(t->bt_curpage, c->c_index);
1069 
1070 	if (d->d_flags & D_BIGKEY) {
1071 		bcopy(&(d->d_bytes[0]), (char *) &pgno, sizeof(pgno));
1072 		return (_bt_getbig(t, pgno, &(c->c_key), &ignore));
1073 	} else {
1074 		if ((c->c_key = (char *) malloc(d->d_ksize)) == (char *) NULL)
1075 			return (RET_ERROR);
1076 
1077 		bcopy(&(d->d_bytes[0]), c->c_key, d->d_ksize);
1078 	}
1079 	return (RET_SUCCESS);
1080 }
1081 
1082 /*
1083  *  _BT_GETBIG -- Get big data from indirect pages.
1084  *
1085  *	This routine chases indirect blocks for the big object at the
1086  *	specified page to a buffer, and return the address of the buffer.
1087  *
1088  *	Parameters:
1089  *		t -- btree with the indirect blocks
1090  *		pgno -- page number that starts the chain
1091  *		p -- (char **) to get the address of the buffer containing
1092  *		     the key or datum.
1093  *		sz -- pointer to an int to get the size of the instantiated
1094  *		      object.
1095  *
1096  *	Returns:
1097  *		RET_SUCCESS, RET_ERROR.
1098  *
1099  *	Side Effects:
1100  *		None.
1101  */
1102 
1103 static int
1104 _bt_getbig(t, pgno, p, sz)
1105 	BTREE_P t;
1106 	pgno_t pgno;
1107 	char **p;
1108 	int *sz;
1109 {
1110 	pgno_t save;
1111 	size_t nbytes;
1112 	size_t nhere;
1113 	BTHEADER *h;
1114 	char *top, *from, *where;
1115 
1116 	save = t->bt_curpage->h_pgno;
1117 	if (_bt_getpage(t, pgno) == RET_ERROR)
1118 		return (RET_ERROR);
1119 
1120 	h = t->bt_curpage;
1121 
1122 	bcopy((char *) &(h->h_linp[0]), (char *) &nbytes, sizeof(nbytes));
1123 
1124 	if ((*p = (char *) malloc(nbytes)) == (char *) NULL)
1125 		return (RET_ERROR);
1126 
1127 	*sz = nbytes;
1128 	from = ((char *) (&h->h_linp[0])) + sizeof(nbytes);
1129 	top = ((char *) h) + t->bt_psize;
1130 
1131 	/* need more space for data? */
1132 
1133 	where = *p;
1134 
1135 	while (nbytes > 0) {
1136 		nhere = (int) (top - from);
1137 		if (nhere > nbytes) {
1138 			(void) bcopy(from, where, nbytes);
1139 			nbytes = 0;
1140 		} else {
1141 			(void) bcopy(from, where, nhere);
1142 			where += nhere;
1143 			nbytes -= nhere;
1144 			if (_bt_getpage(t, h->h_nextpg) == RET_ERROR)
1145 				return (RET_ERROR);
1146 			h = t->bt_curpage;
1147 			top = ((char *) h) + t->bt_psize;
1148 			from = (char *) &(h->h_linp[0]);
1149 		}
1150 	}
1151 
1152 	if (_bt_getpage(t, save) == RET_ERROR)
1153 		return (RET_ERROR);
1154 
1155 	return (RET_SUCCESS);
1156 }
1157 
1158 /*
1159  *  _BT_DELINDIR -- Delete a chain of indirect blocks from the btree.
1160  *
1161  *	When a large item is deleted from the tree, this routine puts the
1162  *	space that it occupied onto the free list for later reuse.  The
1163  *	bt_free entry in the btree structure points at the head of this list.
1164  *	This value is also stored on disk in the btree's metadata.
1165  *
1166  *	Parameters:
1167  *		t -- btree from which to delete pages
1168  *		chain -- page number that starts the chain.
1169  *
1170  *	Returns:
1171  *		RET_SUCCESS, RET_ERROR.
1172  *
1173  *	Side Effects:
1174  *		Invalidates the current on-disk version of the btree's
1175  *		metadata (if any), and updates the free list appropriately.
1176  */
1177 
1178 static int
1179 _bt_delindir(t, chain)
1180 	BTREE_P t;
1181 	pgno_t chain;
1182 {
1183 	BTHEADER *h;
1184 	pgno_t save;
1185 	pgno_t oldfree;
1186 
1187 	h = t->bt_curpage;
1188 	save = h->h_pgno;
1189 	if (_bt_getpage(t, chain) == RET_ERROR)
1190 		return (RET_ERROR);
1191 
1192 	/*
1193 	 *  If some internal node is pointing at this chain, don't
1194 	 *  delete it.
1195 	 */
1196 
1197 	if (t->bt_curpage->h_flags & F_PRESERVE) {
1198 		if (_bt_getpage(t, save) == RET_ERROR)
1199 			return (RET_ERROR);
1200 		return (RET_SUCCESS);
1201 	}
1202 
1203 	/* if there's nothing on the free list, this is easy... */
1204 	if (t->bt_free == P_NONE) {
1205 		t->bt_free = chain;
1206 	} else {
1207 		oldfree = t->bt_free;
1208 
1209 		/* find the end of the data chain for the deleted datum */
1210 		t->bt_free = chain;
1211 		do {
1212 			if (_bt_getpage(t, chain) == RET_ERROR)
1213 				return (RET_ERROR);
1214 			h = t->bt_curpage;
1215 			if (h->h_nextpg != P_NONE)
1216 				chain = h->h_nextpg;
1217 		} while (h->h_nextpg != P_NONE);
1218 
1219 		/* link freed pages into free list */
1220 		h->h_nextpg = oldfree;
1221 		if (_bt_write(t, h, RELEASE) == RET_ERROR)
1222 			return (RET_ERROR);
1223 		if (_bt_getpage(t, oldfree) == RET_ERROR)
1224 			return (RET_ERROR);
1225 		h = t->bt_curpage;
1226 		h->h_prevpg = chain;
1227 		if (_bt_write(t, h, RELEASE) == RET_ERROR)
1228 			return (RET_ERROR);
1229 	}
1230 
1231 	/* restore the tree's current page pointer */
1232 	if (_bt_getpage(t, save) == RET_ERROR)
1233 		return (RET_ERROR);
1234 
1235 	/* we have trashed the tree metadata; rewrite it later */
1236 	t->bt_flags &= ~BTF_METAOK;
1237 
1238 	return (RET_SUCCESS);
1239 }
1240 
1241 /*
1242  *  _BT_FIRST -- Find the first item in the tree that matches the supplied
1243  *		 key.
1244  *
1245  *	This routine supports deletion.  When the user supplies a key to
1246  *	be deleted, we find the first one, and iteratively delete all the
1247  *	matching ones that follow it.
1248  *
1249  *	Parameters:
1250  *		t -- btree in which to find first occurrence
1251  *		key -- key to find
1252  *
1253  *	Returns:
1254  *		The BTITEM for the matching item.  If there's no match,
1255  *		this may point to the first item > than the supplied key,
1256  *		or off the end of the page.
1257  *
1258  *	Warnings:
1259  *		The BTITEM returned is in static space and will be overwritten
1260  *		by the next search of any kind in any btree.
1261  */
1262 
1263 static BTITEM *
1264 _bt_first(t, key)
1265 	BTREE_P t;
1266 	DBT *key;
1267 {
1268 	BTHEADER *h;
1269 	BTITEM *item;
1270 	index_t next;
1271 	int r;
1272 
1273 	/* find any matching item */
1274 	item = _bt_search(t, key);
1275 	h = t->bt_curpage;
1276 	next = NEXTINDEX(h);
1277 
1278 	/* if we're off the end of the page, search failed and we're done */
1279 	if (item->bti_index >= next)
1280 		return (item);
1281 
1282 	/* as long as we have an exact match, walk backwards */
1283 	while ((r = _bt_cmp(t, key->data, item->bti_index)) == 0) {
1284 
1285 		/* at start of page? */
1286 		if (item->bti_index == 0) {
1287 
1288 			/* if no prev page, we're done */
1289 			if (h->h_prevpg == P_NONE)
1290 				return (item);
1291 
1292 			/* walk backward, skipping empty pages */
1293 			do {
1294 				if (_bt_getpage(t, h->h_prevpg) == RET_ERROR)
1295 					return ((BTITEM *) NULL);
1296 				h = t->bt_curpage;
1297 			} while (NEXTINDEX(h) == 0 && h->h_prevpg != P_NONE);
1298 
1299 			if (NEXTINDEX(h) != 0)
1300 				item->bti_index = NEXTINDEX(h) - 1;
1301 			else
1302 				item->bti_index = 0;
1303 
1304 			item->bti_pgno = h->h_pgno;
1305 		} else {
1306 			item->bti_index--;
1307 		}
1308 	}
1309 
1310 	/* if we went too far backwards, step forward one entry */
1311 	if (r > 0) {
1312 		if (++(item->bti_index) >= NEXTINDEX(h)
1313 		    && h->h_nextpg != P_NONE) {
1314 
1315 			/* walk forward, skipping empty pages */
1316 			do {
1317 				if (_bt_getpage(t, h->h_nextpg) == RET_ERROR)
1318 					return ((BTITEM *) NULL);
1319 				h = t->bt_curpage;
1320 			} while (h->h_nextpg != P_NONE && NEXTINDEX(h) == 0);
1321 
1322 			item->bti_index = 0;
1323 			item->bti_pgno = h->h_pgno;
1324 		}
1325 	}
1326 
1327 	/* got it */
1328 	return (item);
1329 }
1330 
1331 /*
1332  *  _BT_SEARCH, _BT_SEARCHR -- Search for a particular key in the tree.
1333  *
1334  *	Parameters:
1335  *		t -- btree in which to search
1336  *		key -- key to find
1337  *
1338  *	Returns:
1339  *		BTITEM for matching item, if any, or the BTITEM for the
1340  *		location of the key, if it were in the tree.
1341  *
1342  *	Warnings:
1343  *		The BTITEM returned is in static memory, and will be
1344  *		overwritten by the next search of any kind in any tree.
1345  */
1346 
1347 static BTITEM *
1348 _bt_search(t, key)
1349 	BTREE_P t;
1350 	DBT *key;
1351 {
1352 	/* we want to start all of our searches at the root */
1353 	if (_bt_getpage(t, P_ROOT) == RET_ERROR)
1354 		return ((BTITEM *) NULL);
1355 
1356 	return (_bt_searchr(t, key));
1357 }
1358 
1359 static BTITEM *
1360 _bt_searchr(t, key)
1361 	BTREE_P t;
1362 	DBT *key;
1363 {
1364 	BTHEADER *h = t->bt_curpage;
1365 	index_t index;
1366 	IDATUM *id;
1367 	DATUM *d;
1368 	static BTITEM item;
1369 
1370 	/* do a binary search on the current page */
1371 	index = _bt_binsrch(t, &(key->data[0]));
1372 
1373 	/*
1374 	 *  At this point, the binary search terminated because the endpoints
1375 	 *  got too close together, or we have a match.  Figure out which
1376 	 *  case applies and decide what to do based on the page type.
1377 	 */
1378 	if (h->h_flags & F_LEAF) {
1379 		item.bti_pgno = h->h_pgno;
1380 		item.bti_index = index;
1381 		if (index < NEXTINDEX(h))
1382 			d = (DATUM *) GETDATUM(h,index);
1383 		else
1384 			d = (DATUM *) NULL;
1385 
1386 		item.bti_datum = d;
1387 		return(&item);
1388 	} else {
1389 		id = (IDATUM *) GETDATUM(h, index);
1390 		_bt_push(t, h->h_pgno);
1391 		if (_bt_getpage(t, id->i_pgno) == RET_ERROR)
1392 			return ((BTITEM *) NULL);
1393 		return (_bt_searchr(t, key));
1394 	}
1395 }
1396 
1397 /*
1398  *  BT_GETPAGE -- Make pgno the current page of the btree.
1399  *
1400  *	This routine is just a wrapper that decides whether to call the
1401  *	memory or disk-based routine to do the work.
1402  *
1403  *	Parameters:
1404  *		t -- btree in which to get page
1405  *		pgno -- page number to get
1406  *
1407  *	Returns:
1408  *		RET_SUCCESS or RET_ERROR.
1409  */
1410 
1411 static int
1412 _bt_getpage(t, pgno)
1413 	BTREE_P t;
1414 	pgno_t pgno;
1415 {
1416 #ifdef DEBUG
1417 	if (pgno == P_NONE)
1418 		_punt();
1419 #endif /* DEBUG */
1420 
1421 	/* see if we can get away without doing any work */
1422 	if (t->bt_curpage != (BTHEADER *) NULL) {
1423 		if (t->bt_curpage->h_pgno == pgno)
1424 			return (RET_SUCCESS);
1425 	}
1426 
1427 	if (t->bt_fname == (char *) NULL)
1428 		return (_bt_getmpage(t, pgno));
1429 	else
1430 		return (_bt_getdpage(t, pgno));
1431 }
1432 
1433 /*
1434  *  _BT_GETMPAGE -- Make pgno the current page of the btree.
1435  *
1436  *	This routine gets pages for in-memory btrees.
1437  *
1438  *	Parameters:
1439  *		t -- btree in which to get page
1440  *		pgno -- page number to get
1441  *
1442  *	Returns:
1443  *		RET_SUCCESS or RET_ERROR.
1444  */
1445 
1446 static int
1447 _bt_getmpage(t, pgno)
1448 	register BTREE_P t;
1449 	pgno_t pgno;
1450 {
1451 	int htindex;
1452 	int nbytes;
1453 	BTHEADER *h;
1454 	HTBUCKET *b;
1455 
1456 	if (t->bt_curpage == (BTHEADER *) NULL) {
1457 		if (pgno != P_ROOT) {
1458 			errno = EBADF;
1459 			return (RET_ERROR);
1460 		}
1461 
1462 		t->bt_npages++;
1463 		h = (BTHEADER *) malloc((unsigned) t->bt_psize);
1464 		if (h == (BTHEADER *) NULL)
1465 			return (RET_ERROR);
1466 
1467 		h->h_pgno = P_ROOT;
1468 		h->h_flags = F_LEAF;
1469 		h->h_lower = (index_t)
1470 				(((char *) &(h->h_linp[0])) - ((char *) h));
1471 		h->h_upper = t->bt_psize;
1472 		h->h_prevpg = h->h_nextpg = P_NONE;
1473 
1474 		t->bt_curpage = h;
1475 
1476 		/* get the root page into the hash table */
1477 		if (_bt_write(t, h, RELEASE) == RET_ERROR)
1478 			return (RET_ERROR);
1479 	}
1480 
1481 	htindex = HASHKEY(pgno);
1482 
1483 	for (b = t->bt_s.bt_ht[htindex];
1484 	     b != (HTBUCKET *) NULL;
1485 	     b = b->ht_next) {
1486 		if (b->ht_pgno == pgno) {
1487 			t->bt_curpage = b->ht_page;
1488 			return (RET_SUCCESS);
1489 		}
1490 	}
1491 	return (RET_ERROR);
1492 }
1493 
1494 /*
1495  *  _BT_GETDPAGE -- Make pgno the current page of the btree.
1496  *
1497  *	This routine gets pages for disk btrees.
1498  *
1499  *	Because disk btree pages must be readable across machine architectures,
1500  *	the btree code writes integers out in network format.  This routine
1501  *	converts them back to host format before returning the page.
1502  *
1503  *	Parameters:
1504  *		t -- btree in which to get page
1505  *		pgno -- page number to get
1506  *
1507  *	Returns:
1508  *		RET_SUCCESS, RET_ERROR.
1509  */
1510 
1511 static int
1512 _bt_getdpage(t, pgno)
1513 	register BTREE_P t;
1514 	pgno_t pgno;
1515 {
1516 	BTHEADER *h;
1517 	char *cache;
1518 	long pos;
1519 	int n, nbytes;
1520 
1521 	/* if we have an lru cache, let the cache code do the work */
1522 	if (ISCACHE(t)) {
1523 		cache = t->bt_s.bt_d.d_cache;
1524 
1525 		/* release the old page */
1526 		if (t->bt_curpage != (BTHEADER *) NULL) {
1527 			pgno_t opgno = t->bt_curpage->h_pgno;
1528 			t->bt_curpage->h_flags &= ~F_DIRTY;
1529 
1530 			if (lruwrite(cache, opgno) < 0)
1531 				return (RET_ERROR);
1532 
1533 			lrurelease(cache, opgno);
1534 		}
1535 
1536 		if (pgno > t->bt_npages) {
1537 			if ((h = (BTHEADER *) lrugetnew(cache, pgno, &nbytes))
1538 			    == (BTHEADER *) NULL)
1539 				return (RET_ERROR);
1540 			t->bt_npages = pgno;
1541 		} else {
1542 			if ((h = (BTHEADER *) lruget(cache, pgno, &nbytes))
1543 			    == (BTHEADER *) NULL)
1544 				return (RET_ERROR);
1545 		}
1546 
1547 		/* init this page, if necessary */
1548 		if (nbytes == 0) {
1549 			h->h_pgno = pgno;
1550 			h->h_flags = F_LEAF;
1551 			h->h_lower = (index_t)
1552 				(((char *) &(h->h_linp[0])) - ((char *) h));
1553 			h->h_upper = t->bt_psize;
1554 			h->h_prevpg = h->h_nextpg = P_NONE;
1555 		}
1556 
1557 		t->bt_curpage = h;
1558 
1559 		return (RET_SUCCESS);
1560 	}
1561 
1562 	/* sync the current page, if necessary */
1563 	if (t->bt_curpage != (BTHEADER *) NULL) {
1564 		if (t->bt_curpage->h_flags & F_DIRTY)
1565 			if (_bt_write(t, t->bt_curpage, RELEASE) == RET_ERROR)
1566 				return (RET_ERROR);
1567 	} else {
1568 		if (t->bt_npages == 0)
1569 			t->bt_npages = 1;
1570 
1571 		/* if no current page, get space for one */
1572 		if ((t->bt_curpage = (BTHEADER *) malloc((unsigned) t->bt_psize))
1573 		    == (BTHEADER *) NULL) {
1574 			return (RET_ERROR);
1575 		}
1576 	}
1577 
1578 	n = t->bt_psize;
1579 	pos = (long) (pgno * n);
1580 
1581 	/* seek to correct location in file */
1582 	if (lseek(t->bt_s.bt_d.d_fd, pos, L_SET) != pos) {
1583 		return (RET_ERROR);
1584 	}
1585 
1586 	/* read the page */
1587 	if ((nbytes = read(t->bt_s.bt_d.d_fd, t->bt_curpage, n)) < n) {
1588 
1589 		/*
1590 		 *  If we didn't get a full page, we must have gotten no
1591 		 *  data at all -- in which case we're trying to read a
1592 		 *  root page that doesn't exist yet.  This is the only
1593 		 *  case in which this is okay.  If this happens, construct
1594 		 *  an empty root page by hand.
1595 		 */
1596 		if (nbytes != 0 || pgno != P_ROOT) {
1597 			errno = EBADF;
1598 			return (RET_ERROR);
1599 		}
1600 
1601 		h = (BTHEADER *) t->bt_curpage;
1602 		h->h_pgno = pgno;
1603 		h->h_flags = F_LEAF;
1604 		h->h_lower = (index_t)
1605 				(((char *) &(h->h_linp[0])) - ((char *) h));
1606 		h->h_upper = t->bt_psize;
1607 		h->h_prevpg = h->h_nextpg = P_NONE;
1608 	} else
1609 		(void) _bt_pgin(t->bt_curpage, (char *) t->bt_lorder);
1610 
1611 	return (RET_SUCCESS);
1612 }
1613 
1614 /*
1615  *  _BT_PGOUT, _BT_PGIN -- Convert host-specific number layout to/from
1616  *			   the host-independent format stored on disk.
1617  *
1618  *	Parameters:
1619  *		h -- page to convert
1620  *		_lorder -- byte order for pages (stored as a char * in the
1621  *			   cache, and passed around as a magic cookie).
1622  *
1623  *	Returns:
1624  *		RET_SUCCESS (lru code requires a return value).
1625  *
1626  *	Side Effects:
1627  *		Layout of tree metadata on the page is changed in place.
1628  *
1629  *	Warnings:
1630  *		Everywhere else in the code, the types pgno_t and index_t
1631  *		are opaque.  These two routines know what they really
1632  *		are.
1633  */
1634 
1635 static int
1636 _bt_pgout(h, _lorder)
1637 	BTHEADER *h;
1638 	char *_lorder;
1639 {
1640 	int i;
1641 	int top;
1642 	int lorder;
1643 	DATUM *d;
1644 	IDATUM *id;
1645 	size_t chain;
1646 
1647 	lorder = (int) _lorder;
1648 	if (lorder == BYTE_ORDER)
1649 		return (RET_SUCCESS);
1650 
1651 	if (h->h_flags & F_LEAF) {
1652 		if (h->h_flags & F_CONT) {
1653 			if (h->h_prevpg == P_NONE) {
1654 				size_t longsz;
1655 
1656 				(void) bcopy((char *) &(h->h_linp[0]),
1657 					      (char *) &longsz,
1658 					      sizeof(longsz));
1659 				BLSWAP(longsz);
1660 				(void) bcopy((char *) &longsz,
1661 					      (char *) &(h->h_linp[0]),
1662 					      sizeof(longsz));
1663 			}
1664 		} else {
1665 			top = NEXTINDEX(h);
1666 			for (i = 0; i < top; i++) {
1667 				d = (DATUM *) GETDATUM(h, i);
1668 				if (d->d_flags & D_BIGKEY) {
1669 					(void) bcopy((char *) &(d->d_bytes[0]),
1670 						      (char *) &chain,
1671 						      sizeof(chain));
1672 					BLSWAP(chain);
1673 					(void) bcopy((char *) &chain,
1674 						      (char *) &(d->d_bytes[0]),
1675 						      sizeof(chain));
1676 				}
1677 				if (d->d_flags & D_BIGDATA) {
1678 					(void) bcopy((char *) &(d->d_bytes[d->d_ksize]),
1679 						      (char *) &chain,
1680 						      sizeof(chain));
1681 					BLSWAP(chain);
1682 					(void) bcopy((char *) &chain,
1683 						      (char *) &(d->d_bytes[d->d_ksize]),
1684 						      sizeof(chain));
1685 				}
1686 				BLSWAP(d->d_dsize);
1687 				BLSWAP(d->d_ksize);
1688 				BLSWAP(d->d_flags);
1689 				BLSWAP(h->h_linp[i]);
1690 			}
1691 		}
1692 	} else {
1693 		top = NEXTINDEX(h);
1694 		for (i = 0; i < top; i++) {
1695 			id = (IDATUM *) GETDATUM(h, i);
1696 			BLSWAP(id->i_size);
1697 			BLSWAP(id->i_pgno);
1698 			BLSWAP(id->i_flags);
1699 			if (id->i_flags & D_BIGKEY) {
1700 				(void) bcopy((char *) &(id->i_bytes[0]),
1701 					      (char *) &chain,
1702 					      sizeof(chain));
1703 				BLSWAP(chain);
1704 				(void) bcopy((char *) &chain,
1705 					      (char *) &(id->i_bytes[0]),
1706 					      sizeof(chain));
1707 			}
1708 			BLSWAP(h->h_linp[i]);
1709 		}
1710 	}
1711 	BLSWAP(h->h_flags);
1712 	BLSWAP(h->h_pgno);
1713 	BLSWAP(h->h_prevpg);
1714 	BLSWAP(h->h_nextpg);
1715 	BLSWAP(h->h_lower);
1716 	BLSWAP(h->h_upper);
1717 
1718 	return (RET_SUCCESS);
1719 }
1720 
1721 static int
1722 _bt_pgin(h, _lorder)
1723 	BTHEADER *h;
1724 	char *_lorder;
1725 {
1726 	int i;
1727 	int top;
1728 	int lorder;
1729 	DATUM *d;
1730 	IDATUM *id;
1731 	size_t chain;
1732 
1733 	/*
1734 	 *  If btree pages are to be stored in the host byte order, don't
1735 	 *  bother swapping.
1736 	 */
1737 	lorder = (int) _lorder;
1738 	if (lorder == BYTE_ORDER)
1739 		return (RET_SUCCESS);
1740 
1741 	BLSWAP(h->h_upper);
1742 	BLSWAP(h->h_lower);
1743 	BLSWAP(h->h_nextpg);
1744 	BLSWAP(h->h_prevpg);
1745 	BLSWAP(h->h_pgno);
1746 	BLSWAP(h->h_flags);
1747 
1748 	if (h->h_flags & F_LEAF) {
1749 		if (h->h_flags & F_CONT) {
1750 			if (h->h_prevpg == P_NONE) {
1751 				size_t longsz;
1752 
1753 				(void) bcopy((char *) &(h->h_linp[0]),
1754 					      (char *) &longsz,
1755 					      sizeof(longsz));
1756 				BLSWAP(longsz);
1757 				(void) bcopy((char *) &longsz,
1758 					      (char *) &(h->h_linp[0]),
1759 					      sizeof(longsz));
1760 			}
1761 		} else {
1762 			top = NEXTINDEX(h);
1763 			for (i = 0; i < top; i++) {
1764 				BLSWAP(h->h_linp[i]);
1765 				d = (DATUM *) GETDATUM(h, i);
1766 				BLSWAP(d->d_dsize);
1767 				BLSWAP(d->d_ksize);
1768 				BLSWAP(d->d_flags);
1769 				if (d->d_flags & D_BIGKEY) {
1770 					(void) bcopy((char *) &(d->d_bytes[0]),
1771 						      (char *) &chain,
1772 						      sizeof(chain));
1773 					BLSWAP(chain);
1774 					(void) bcopy((char *) &chain,
1775 						      (char *) &(d->d_bytes[0]),
1776 						      sizeof(chain));
1777 				}
1778 				if (d->d_flags & D_BIGDATA) {
1779 					(void) bcopy((char *) &(d->d_bytes[d->d_ksize]),
1780 						      (char *) &chain,
1781 						      sizeof(chain));
1782 					BLSWAP(chain);
1783 					(void) bcopy((char *) &chain,
1784 						      (char *) &(d->d_bytes[d->d_ksize]),
1785 						      sizeof(chain));
1786 				}
1787 			}
1788 		}
1789 	} else {
1790 		top = NEXTINDEX(h);
1791 		for (i = 0; i < top; i++) {
1792 			BLSWAP(h->h_linp[i]);
1793 			id = (IDATUM *) GETDATUM(h, i);
1794 			BLSWAP(id->i_size);
1795 			BLSWAP(id->i_pgno);
1796 			BLSWAP(id->i_flags);
1797 			if (id->i_flags & D_BIGKEY) {
1798 				(void) bcopy((char *) &(id->i_bytes[0]),
1799 					      (char *) &chain,
1800 					      sizeof(chain));
1801 				BLSWAP(chain);
1802 				(void) bcopy((char *) &chain,
1803 					      (char *) &(id->i_bytes[0]),
1804 					      sizeof(chain));
1805 			}
1806 		}
1807 	}
1808 	return (RET_SUCCESS);
1809 }
1810 
1811 /*
1812  *  BT_SYNC -- sync the btree to disk.
1813  *
1814  *	Parameters:
1815  *		tree -- btree to sync.
1816  *
1817  *	Returns:
1818  *		RET_SUCCESS, RET_ERROR.
1819  */
1820 
1821 bt_sync(tree)
1822 	BTREE tree;
1823 {
1824 	BTREE_P t = (BTREE_P) tree;
1825 	BTHEADER *h;
1826 	pgno_t pgno;
1827 
1828 	/* if this is an in-memory btree, syncing is a no-op */
1829 	if (!ISDISK(t))
1830 		return (RET_SUCCESS);
1831 
1832 	h = (BTHEADER *) t->bt_curpage;
1833 	h->h_flags &= ~F_DIRTY;
1834 
1835 	if (ISCACHE(t)) {
1836 		pgno = t->bt_curpage->h_pgno;
1837 		if (_bt_write(t, h, RELEASE) == RET_ERROR)
1838 			return(RET_ERROR);
1839 		if (lrusync(t->bt_s.bt_d.d_cache) < RET_ERROR)
1840 			return (RET_ERROR);
1841 		if (_bt_getpage(t, pgno) == RET_ERROR)
1842 			return (RET_ERROR);
1843 	} else {
1844 		if (_bt_write(t, h, NORELEASE) == RET_ERROR)
1845 			return (RET_ERROR);
1846 	}
1847 
1848 	return (fsync(t->bt_s.bt_d.d_fd));
1849 }
1850 
1851 /*
1852  *  _BT_INSERT -- Insert a new user datum in the btree.
1853  *
1854  *	This routine is called by bt_put, the public interface, once the
1855  *	location for the new item is known.  We do the work here, and
1856  *	handle splits if necessary.
1857  *
1858  *	Parameters:
1859  *		t -- btree in which to do the insertion.
1860  *		item -- BTITEM describing location of new datum
1861  *		key -- key to insert
1862  *		data -- data associated with key
1863  *		flag -- magic cookie passed recursively to bt_put if we
1864  *			have to do a split
1865  *
1866  *	Returns:
1867  *		RET_SUCCESS, RET_ERROR.
1868  */
1869 
1870 static int
1871 _bt_insert(t, item, key, data, flag)
1872 	BTREE_P t;
1873 	BTITEM *item;
1874 	DBT *key;
1875 	DBT *data;
1876 	int flag;
1877 {
1878 	index_t index;
1879 	BTHEADER *h;
1880 	DATUM *d;
1881 	int nbytes;
1882 	int status;
1883 	pgno_t pgno;
1884 	int keysize, datasize;
1885 	int bigkey, bigdata;
1886 
1887 	if (_bt_getpage(t, item->bti_pgno) == RET_ERROR)
1888 		return (RET_ERROR);
1889 	h = t->bt_curpage;
1890 
1891 	if (TOOBIG(t, data->size)) {
1892 		bigdata = TRUE;
1893 		datasize = sizeof(pgno_t);
1894 	} else {
1895 		bigdata = FALSE;
1896 		datasize = data->size;
1897 	}
1898 
1899 	if (TOOBIG(t, key->size)) {
1900 		bigkey = TRUE;
1901 		keysize = sizeof(pgno_t);
1902 	} else {
1903 		bigkey = FALSE;
1904 		keysize = key->size;
1905 	}
1906 
1907 	nbytes = keysize + datasize + (sizeof(DATUM) - sizeof(char));
1908 	nbytes = LONGALIGN(nbytes) + sizeof(index_t);
1909 
1910 	/* if there's not enough room here, split the page */
1911 	if ((h->h_upper - h->h_lower) < nbytes) {
1912 		if (_bt_split(t) == RET_ERROR)
1913 			return (RET_ERROR);
1914 
1915 		/* okay, try again */
1916 		return (bt_put(t, key, data, flag));
1917 	}
1918 
1919 	/* put together a leaf page datum from the key/data pair */
1920 	index = item->bti_index;
1921 	nbytes = keysize + datasize + (sizeof(DATUM) - sizeof(char));
1922 
1923 	if ((d = (DATUM *) malloc((unsigned) nbytes)) == (DATUM *) NULL)
1924 		return (RET_ERROR);
1925 
1926 	d->d_ksize = keysize;
1927 	d->d_dsize = datasize;
1928 	d->d_flags = 0;
1929 
1930 	if (bigkey) {
1931 		if (_bt_indirect(t, key, &pgno) == RET_ERROR)
1932 			return (RET_ERROR);
1933 		(void) bcopy((char *) &pgno, &(d->d_bytes[0]), sizeof(pgno));
1934 		d->d_flags |= D_BIGKEY;
1935 		if (_bt_getpage(t, item->bti_pgno) == RET_ERROR)
1936 			return (RET_ERROR);
1937 	} else {
1938 		if (d->d_ksize > 0) {
1939 			(void) bcopy((char *) key->data,
1940 				      (char *) &(d->d_bytes[0]),
1941 				      (int) d->d_ksize);
1942 		}
1943 	}
1944 
1945 	if (bigdata) {
1946 		if (_bt_indirect(t, data, &pgno) == RET_ERROR)
1947 			return (RET_ERROR);
1948 		(void) bcopy((char *) &pgno,
1949 			     &(d->d_bytes[keysize]),
1950 			     sizeof(pgno));
1951 		d->d_flags |= D_BIGDATA;
1952 		if (_bt_getpage(t, item->bti_pgno) == RET_ERROR)
1953 			return (RET_ERROR);
1954 	} else {
1955 		if (d->d_dsize > 0) {
1956 			(void) bcopy((char *) data->data,
1957 				      (char *) &(d->d_bytes[keysize]),
1958 				      (int) d->d_dsize);
1959 		}
1960 	}
1961 
1962 	/* do the insertion */
1963 	status = _bt_insertat(t, d, index);
1964 
1965 	(void) free((char *) d);
1966 
1967 	return (status);
1968 }
1969 
1970 /*
1971  *  _BT_INDIRECT -- Write a series of indirect pages for big objects.
1972  *
1973  *	A chain of indirect pages looks like
1974  *
1975  *	   +-------------------+   +---------------------+
1976  *	   |hdr|size|	       |   |hdr|		 |
1977  *	   +---+----+	       |   +---+		 |
1978  *	   |   ... data ...    |   |   ... data ...	 |    ...
1979  *	   |		       |   |			 |
1980  *	   +-------------------+   +---------------------+
1981  *
1982  *	where hdr is a standard btree page header (with the indirect bit
1983  *	set), size on the first page is the real size of the datum, and
1984  *	data are bytes of the datum, split across as many pages as necessary.
1985  *	Indirect pages are chained together with the h_prevpg and h_nextpg
1986  *	entries of the page header struct.
1987  *
1988  *	A single DBT is written per chain, so space on the last page is
1989  *	wasted.
1990  *
1991  *	We return the page number of the start of the chain.
1992  *
1993  *	When a big object is deleted from a tree, the space that it occupied
1994  *	is placed on a free list for later reuse.  This routine checks that
1995  *	free list before allocating new pages to the big datum being inserted.
1996  *
1997  *	Parameters:
1998  *		t -- btree in which to store indirect blocks
1999  *		data -- DBT with the big datum in it
2000  *		pgno -- place to put the starting page number of the chain
2001  *
2002  *	Returns:
2003  *		RET_SUCCESS, RET_ERROR.
2004  *
2005  *	Side Effects:
2006  *		Current page is changed on return.
2007  */
2008 
2009 static int
2010 _bt_indirect(t, data, pgno)
2011 	BTREE_P t;
2012 	DBT *data;
2013 	pgno_t *pgno;
2014 {
2015 	pgno_t save;
2016 	pgno_t prev;
2017 	char *top;
2018 	char *where;
2019 	char *from;
2020 	size_t dsize;
2021 	pgno_t nextchn;
2022 	int ischain;
2023 	BTHEADER *cur;
2024 
2025 	/* get set for first page in chain */
2026 	prev = P_NONE;
2027 	dsize = data->size;
2028 	from = (char *) data->data;
2029 
2030 	/* if there are blocks on the free list, use them first */
2031 	if ((*pgno = t->bt_free) == P_NONE) {
2032 		if ((cur = _bt_allocpg(t)) == (BTHEADER *) NULL)
2033 			return (RET_ERROR);
2034 
2035 		ischain = 0;
2036 		*pgno = cur->h_pgno = ++(t->bt_npages);
2037 	} else {
2038 		if (_bt_getpage(t, *pgno) == RET_ERROR)
2039 			return (RET_ERROR);
2040 		ischain = 1;
2041 		cur = t->bt_curpage;
2042 	}
2043 
2044 	cur->h_flags = F_CONT|F_LEAF;
2045 	(void) bcopy((char *) &dsize, (char *) &cur->h_linp[0], sizeof(size_t));
2046 	where = ((char *) (&cur->h_linp[0])) + sizeof(size_t);
2047 
2048 	/* fill and write pages in the chain */
2049 	for (;;) {
2050 		int nhere;
2051 
2052 		top = ((char *) cur) + t->bt_psize;
2053 		cur->h_prevpg = prev;
2054 		nextchn = cur->h_nextpg;
2055 		nhere = (int) (top - where);
2056 
2057 		if (nhere >= dsize) {
2058 			(void) bcopy(from, where, (int) dsize);
2059 			cur->h_nextpg = P_NONE;
2060 			dsize = 0;
2061 		} else {
2062 			(void) bcopy(from, where, (int) nhere);
2063 			dsize -= nhere;
2064 			from += nhere;
2065 			if (nextchn == P_NONE)
2066 				cur->h_nextpg = t->bt_npages + 1;
2067 			prev = cur->h_pgno;
2068 		}
2069 
2070 		/* current page is ready to go; write it out */
2071 		if (_bt_write(t, cur, RELEASE) == RET_ERROR)
2072 			return (RET_ERROR);
2073 
2074 		/* free the current page, if appropriate */
2075 		if (ISDISK(t) && !ISCACHE(t) && !ischain) {
2076 			(void) free ((char *) cur);
2077 		}
2078 
2079 		/* done? */
2080 		if (dsize == 0)
2081 			break;
2082 
2083 		/* allocate another page */
2084 		if (nextchn == P_NONE) {
2085 			if ((cur = _bt_allocpg(t)) == (BTHEADER *) NULL)
2086 				return (RET_ERROR);
2087 			ischain = 0;
2088 			cur->h_pgno = ++(t->bt_npages);
2089 		} else {
2090 			if (_bt_getpage(t, nextchn) == RET_ERROR)
2091 				return (RET_ERROR);
2092 			ischain = 1;
2093 			cur = t->bt_curpage;
2094 		}
2095 		cur->h_flags = F_LEAF | F_CONT;
2096 
2097 		where = (char *) (&cur->h_linp[0]);
2098 	}
2099 
2100 	/* if we used pages from the free list, record changes to it */
2101 	if (*pgno == t->bt_free) {
2102 		t->bt_free = nextchn;
2103 		t->bt_flags &= ~BTF_METAOK;
2104 	}
2105 
2106 	return (RET_SUCCESS);
2107 }
2108 
2109 /*
2110  *  _BT_SPLIT -- Split a page into two pages.
2111  *
2112  *	Splits are caused by insertions, and propogate up the tree in
2113  *	the usual way.  The root page is always page 1 in the file on
2114  *	disk, so root splits are handled specially.  On entry to this
2115  *	routine, t->bt_curpage is the page to be split.
2116  *
2117  *	Parameters:
2118  *		t -- btree in which to do split.
2119  *
2120  *	Returns:
2121  *		RET_SUCCESS, RET_ERROR.
2122  *
2123  *	Side Effects:
2124  *		Changes the notion of the current page.
2125  */
2126 
2127 static
2128 _bt_split(t)
2129 	BTREE_P t;
2130 {
2131 	BTHEADER *h;
2132 	BTHEADER *left, *right;
2133 	BTHEADER *next;
2134 	pgno_t nextpgno, parent;
2135 	int nbytes, len;
2136 	IDATUM *id;
2137 	DATUM *d;
2138 	char *src;
2139 	IDATUM *new;
2140 	pgno_t oldchain;
2141 	u_char flags;
2142 
2143 	h = (BTHEADER *) t->bt_curpage;
2144 
2145 	/* split root page specially, since it must remain page 1 */
2146 	if (h->h_pgno == P_ROOT) {
2147 		return (_bt_splitroot(t));
2148 	}
2149 
2150 	/*
2151 	 *  This is a little complicated.  We go to some trouble to
2152 	 *  figure out which of the three possible cases -- in-memory tree,
2153 	 *  disk tree (no cache), and disk tree (cache) -- we have, in order
2154 	 *  to avoid unnecessary copying.  If we have a disk cache, then we
2155 	 *  have to do some extra copying, though, since the cache code
2156 	 *  manages buffers externally to this code.
2157 	 */
2158 
2159 	if (ISDISK(t) && ISCACHE(t)) {
2160 		if ((left = (BTHEADER *) malloc((unsigned) t->bt_psize))
2161 		    == (BTHEADER *) NULL)
2162 			return (RET_ERROR);
2163 		left->h_pgno = left->h_prevpg = left->h_nextpg = P_NONE;
2164 		left->h_flags = t->bt_curpage->h_flags;
2165 		left->h_lower = (index_t)
2166 			  (((char *) &(left->h_linp[0])) - ((char *) left));
2167 		left->h_upper = t->bt_psize;
2168 
2169 	} else {
2170 		if ((left = _bt_allocpg(t)) == (BTHEADER *) NULL)
2171 			return (RET_ERROR);
2172 	}
2173 	left->h_pgno = h->h_pgno;
2174 
2175 	if ((right = _bt_allocpg(t)) == (BTHEADER *) NULL)
2176 		return (RET_ERROR);
2177 	right->h_pgno = ++(t->bt_npages);
2178 
2179 	/* now do the split */
2180 	if (_bt_dopsplit(t, left, right) == RET_ERROR)
2181 		return (RET_ERROR);
2182 
2183 	right->h_prevpg = left->h_pgno;
2184 	nextpgno = right->h_nextpg = h->h_nextpg;
2185 	left->h_nextpg = right->h_pgno;
2186 	left->h_prevpg = h->h_prevpg;
2187 
2188 	/* okay, now use the left half of the page as the new page */
2189 	if (ISDISK(t) && ISCACHE(t)) {
2190 		(void) bcopy((char *) left, (char *) t->bt_curpage,
2191 			     (int) t->bt_psize);
2192 		(void) free ((char *) left);
2193 		left = t->bt_curpage;
2194 	} else {
2195 		(void) free((char *) t->bt_curpage);
2196 		t->bt_curpage = left;
2197 	}
2198 
2199 	/*
2200 	 *  Write the new pages out.  We need them to stay where they are
2201 	 *  until we're done updating the parent pages.
2202 	 */
2203 
2204 	if (_bt_write(t, left, NORELEASE) == RET_ERROR)
2205 		return (RET_ERROR);
2206 	if (_bt_write(t, right, NORELEASE) == RET_ERROR)
2207 		return (RET_ERROR);
2208 
2209 	/* update 'prev' pointer of old neighbor of left */
2210 	if (nextpgno != P_NONE) {
2211 		if (_bt_getpage(t, nextpgno) == RET_ERROR)
2212 			return (RET_ERROR);
2213 		h = t->bt_curpage;
2214 		h->h_prevpg = right->h_pgno;
2215 		h->h_flags |= F_DIRTY;
2216 	}
2217 
2218 	if ((parent = _bt_pop(t)) != P_NONE) {
2219 		if (right->h_flags & F_LEAF) {
2220 			d = (DATUM *) GETDATUM(right, 0);
2221 			len = d->d_ksize;
2222 			if (d->d_flags & D_BIGKEY) {
2223 				bcopy(&(d->d_bytes[0]),
2224 				      (char *) &oldchain,
2225 				      sizeof(oldchain));
2226 				if (_bt_markchain(t, oldchain) == RET_ERROR)
2227 					return (RET_ERROR);
2228 				src = (char *) &oldchain;
2229 				flags = D_BIGKEY;
2230 			} else {
2231 				src = (char *) &(d->d_bytes[0]);
2232 				flags = 0;
2233 			}
2234 		} else {
2235 			id = (IDATUM *) GETDATUM(right, 0);
2236 			len = id->i_size;
2237 			flags = id->i_flags;
2238 			src = (char *) &(id->i_bytes[0]);
2239 		}
2240 		nbytes = len + (sizeof(IDATUM) - sizeof(char));
2241 		new = (IDATUM *) malloc((unsigned) nbytes);
2242 		if (new == (IDATUM *) NULL)
2243 			return (RET_ERROR);
2244 		new->i_size = len;
2245 		new->i_pgno = right->h_pgno;
2246 		new->i_flags = flags;
2247 		if (len > 0)
2248 			(void) bcopy(src, (char *) &(new->i_bytes[0]), len);
2249 
2250 		nbytes = LONGALIGN(nbytes) + sizeof(index_t);
2251 		if (_bt_getpage(t, parent) == RET_ERROR)
2252 			return (RET_ERROR);
2253 
2254 		h = t->bt_curpage;
2255 
2256 		/*
2257 		 *  Split the parent if we need to, then reposition the
2258 		 *  tree's current page pointer for the new datum.
2259 		 */
2260 		if ((h->h_upper - h->h_lower) < nbytes) {
2261 			if (_bt_split(t) == RET_ERROR)
2262 				return (RET_ERROR);
2263 			if (_bt_reposition(t, new, parent, right->h_prevpg)
2264 			      == RET_ERROR)
2265 				return (RET_ERROR);
2266 		}
2267 
2268 		/* okay, now insert the new idatum */
2269 		if (_bt_inserti(t, new, right->h_prevpg) == RET_ERROR)
2270 			return (RET_ERROR);
2271 	}
2272 
2273 	/*
2274 	 *  Okay, split is done; don't need right page stapled down anymore.
2275 	 *  The page we call 'left' above is the new version of the old
2276 	 *  (split) page, so we can't release it.
2277 	 */
2278 
2279 	if (_bt_release(t, right) == RET_ERROR)
2280 		return (RET_ERROR);
2281 	if (ISDISK(t) && !ISCACHE(t))
2282 		(void) free((char *) right);
2283 
2284 	return (RET_SUCCESS);
2285 }
2286 
2287 /*
2288  *  _BT_MARKCHAIN -- Mark a chain of pages as used by an internal node.
2289  *
2290  *	Chains of indirect blocks pointed to by leaf nodes get reclaimed
2291  *	when the item that points to them gets deleted.  Chains pointed
2292  *	to by internal nodes never get deleted.  This routine marks a
2293  *	chain as pointed to by an internal node.
2294  *
2295  *	Parameters:
2296  *		t -- tree in which to mark
2297  *		chain -- number of first page in chain
2298  *
2299  *	Returns:
2300  *		RET_SUCCESS, RET_ERROR.
2301  *
2302  *	Side Effects:
2303  *		None.
2304  */
2305 
2306 static int
2307 _bt_markchain(t, chain)
2308 	BTREE_P t;
2309 	pgno_t chain;
2310 {
2311 	pgno_t save;
2312 
2313 	save = t->bt_curpage->h_pgno;
2314 
2315 	if (_bt_getpage(t, chain) == RET_ERROR)
2316 		return (RET_ERROR);
2317 
2318 	t->bt_curpage->h_flags |= (F_DIRTY|F_PRESERVE);
2319 
2320 	if (_bt_getpage(t, save) == RET_ERROR)
2321 		return (RET_ERROR);
2322 
2323 	return (RET_SUCCESS);
2324 }
2325 
2326 /*
2327  *  _BT_REPOSITION -- Reposition the current page pointer of a btree.
2328  *
2329  *	After splitting a node in the tree in order to make room for
2330  *	an insertion, we need to figure out which page after the split
2331  *	should get the item we want to insert.  This routine positions
2332  *	the tree's current page pointer appropriately.
2333  *
2334  *	Parameters:
2335  *		t -- tree to position
2336  *		new -- the item we want to insert
2337  *		parent -- parent of the node that we just split
2338  *		prev -- page number of item directly to the left of
2339  *			new's position in the tree.
2340  *
2341  *	Returns:
2342  *		RET_SUCCESS, RET_ERROR.
2343  *
2344  *	Side Effects:
2345  *		None.
2346  */
2347 
2348 static int
2349 _bt_reposition(t, new, parent, prev)
2350 	BTREE_P t;
2351 	IDATUM *new;
2352 	pgno_t parent;
2353 	pgno_t prev;
2354 {
2355 	index_t i, next;
2356 	IDATUM *idx;
2357 
2358 	if (parent == P_ROOT) {
2359 
2360 		/*
2361 		 *  If we just split the root page, then there are guaranteed
2362 		 *  to be exactly two IDATUMs on it.  Look at both of them
2363 		 *  to see if they point to the page that we want.
2364 		 */
2365 
2366 		if (_bt_getpage(t, parent) == RET_ERROR)
2367 			return (RET_ERROR);
2368 
2369 		next = NEXTINDEX(t->bt_curpage);
2370 		for (i = 0; i < next; i++) {
2371 			idx = (IDATUM *) GETDATUM(t->bt_curpage, i);
2372 			if (_bt_getpage(t, idx->i_pgno) == RET_ERROR)
2373 				return (RET_ERROR);
2374 			if (_bt_isonpage(t, new, prev) == RET_SUCCESS)
2375 				return (RET_SUCCESS);
2376 			if (_bt_getpage(t, parent) == RET_ERROR)
2377 				return (RET_ERROR);
2378 		}
2379 	} else {
2380 
2381 		/*
2382 		 *  Get the parent page -- which is where the new item would
2383 		 *  have gone -- and figure out whether the new item now goes
2384 		 *  on the parent, or the page immediately to the right of
2385 		 *  the parent.
2386 		 */
2387 
2388 		if (_bt_getpage(t, parent) == RET_ERROR)
2389 			return (RET_ERROR);
2390 		if (_bt_isonpage(t, new, prev) == RET_SUCCESS)
2391 			return (RET_SUCCESS);
2392 		if (_bt_getpage(t, t->bt_curpage->h_nextpg) == RET_ERROR)
2393 			return (RET_ERROR);
2394 		if (_bt_isonpage(t, new, prev) == RET_SUCCESS)
2395 			return (RET_SUCCESS);
2396 	}
2397 	return (RET_ERROR);
2398 }
2399 
2400 /*
2401  *  _BT_ISONPAGE -- Is the IDATUM for a given page number on the current page?
2402  *
2403  *	This routine is used by _bt_reposition to decide whether the current
2404  *	page is the correct one on which to insert a new item.
2405  *
2406  *	Parameters:
2407  *		t -- tree to check
2408  *		new -- the item that will be inserted (used for binary search)
2409  *		prev -- page number of page whose IDATUM is immediately to
2410  *			the left of new's position in the tree.
2411  *
2412  *	Returns:
2413  *		RET_SUCCESS, RET_ERROR (corresponding to TRUE, FALSE).
2414  */
2415 
2416 static int
2417 _bt_isonpage(t, new, prev)
2418 	BTREE_P t;
2419 	IDATUM *new;
2420 	pgno_t prev;
2421 {
2422 	BTHEADER *h = (BTHEADER *) t->bt_curpage;
2423 	index_t i, next;
2424 	IDATUM *idx;
2425 
2426 	i = _bt_binsrch(t, &(new->i_bytes[0]));
2427 	while (i != 0 && _bt_cmp(t, &(new->i_bytes[0]), i) == 0)
2428 		--i;
2429 	next = NEXTINDEX(h);
2430 	idx = (IDATUM *) GETDATUM(h, i);
2431 	while (i < next && idx->i_pgno != prev) {
2432 		i++;
2433 		idx = (IDATUM *) GETDATUM(h, i);
2434 	}
2435 	if (idx->i_pgno == prev)
2436 		return (RET_SUCCESS);
2437 	else
2438 		return (RET_ERROR);
2439 }
2440 
2441 /*
2442  *  _BT_SPLITROOT -- Split the root of a btree.
2443  *
2444  *	The root page for a btree is always page one.  This means that in
2445  *	order to split the root, we need to do extra work.
2446  *
2447  *	Parameters:
2448  *		t -- tree to split
2449  *
2450  *	Returns:
2451  *		RET_SUCCESS, RET_ERROR.
2452  *
2453  *	Side Effects:
2454  *		Splits root upward in the usual way, adding two new pages
2455  *		to the tree (rather than just one, as in usual splits).
2456  */
2457 
2458 static
2459 _bt_splitroot(t)
2460 	BTREE_P t;
2461 {
2462 	BTHEADER *h = t->bt_curpage;
2463 	BTHEADER *left, *right;
2464 	DATUM *d;
2465 	IDATUM *id;
2466 	char *where;
2467 	BTHEADER *where_h;
2468 	pgno_t lastpg;
2469 	char *src, *dest;
2470 	int len, nbytes;
2471 	u_long was_leaf;
2472 	pgno_t oldchain;
2473 	u_char flags;
2474 
2475 	/* get two new pages for the split */
2476 	if ((left = _bt_allocpg(t)) == (BTHEADER *) NULL)
2477 		return (RET_ERROR);
2478 	left->h_pgno = ++(t->bt_npages);
2479 	if ((right = _bt_allocpg(t)) == (BTHEADER *) NULL)
2480 		return (RET_ERROR);
2481 	right->h_pgno = ++(t->bt_npages);
2482 
2483 	/* do the split */
2484 	if (_bt_dopsplit(t, left, right) == RET_ERROR)
2485 		return (RET_ERROR);
2486 
2487 	/* connect the new pages correctly */
2488 	right->h_prevpg = left->h_pgno;
2489 	left->h_nextpg = right->h_pgno;
2490 
2491 	/*
2492 	 *  Write the child pages out now.  We need them to remain
2493 	 *  where they are until we finish updating parent pages,
2494 	 *  however.
2495 	 */
2496 
2497 	if (_bt_write(t, left, NORELEASE) == RET_ERROR)
2498 		return (RET_ERROR);
2499 	if (_bt_write(t, right, NORELEASE) == RET_ERROR)
2500 		return (RET_ERROR);
2501 
2502 	/* now change the root page into an internal page */
2503 	was_leaf = (h->h_flags & F_LEAF);
2504 	h->h_flags &= ~F_LEAF;
2505 	h->h_lower = (index_t) (((char *) (&(h->h_linp[0]))) - ((char *) h));
2506 	h->h_upper = (index_t) t->bt_psize;
2507 	(void) bzero((char *) &(h->h_linp[0]), (int) (h->h_upper - h->h_lower));
2508 
2509 	/* put two new keys on root page */
2510 	where_h = left;
2511 	while (where_h) {
2512 		DATUM *data;
2513 		IDATUM *idata;
2514 
2515 		if (was_leaf) {
2516 			data = (DATUM *) GETDATUM(where_h, 0);
2517 
2518 			if (where_h == left) {
2519 				len = 0;	/* first key in tree is null */
2520 			} else {
2521 				if (data->d_flags & D_BIGKEY) {
2522 					bcopy(&(data->d_bytes[0]),
2523 					      (char *) &oldchain,
2524 					      sizeof(oldchain));
2525 					if (_bt_markchain(t, oldchain) == RET_ERROR)
2526 						return (RET_ERROR);
2527 					src = (char *) &oldchain;
2528 					flags = D_BIGKEY;
2529 				} else {
2530 					src = (char *) &(data->d_bytes[0]);
2531 					flags = 0;
2532 				}
2533 				len = data->d_ksize;
2534 			}
2535 		} else {
2536 			idata = (IDATUM *) GETDATUM(where_h, 0);
2537 			len = idata->i_size;
2538 			flags = idata->i_flags;
2539 			src = &(idata->i_bytes[0]);
2540 		}
2541 		dest = ((char *) h) + h->h_upper;
2542 		nbytes = len + (sizeof (IDATUM) - sizeof(char));
2543 		dest -= LONGALIGN(nbytes);
2544 		id = (IDATUM *) dest;
2545 		id->i_size = len;
2546 		id->i_pgno = where_h->h_pgno;
2547 		id->i_flags = flags;
2548 		if (len > 0)
2549 			(void) bcopy((char *) src, (char *) &(id->i_bytes[0]), len);
2550 		dest -= ((int) h);
2551 		h->h_linp[NEXTINDEX(h)] = (index_t) dest;
2552 		h->h_upper = (index_t) dest;
2553 		h->h_lower += sizeof(index_t);
2554 
2555 		/* next page */
2556 		if (where_h == left)
2557 			where_h = right;
2558 		else
2559 			where_h = (BTHEADER *) NULL;
2560 	}
2561 
2562 	if (_bt_release(t, left) == RET_ERROR)
2563 		return (RET_ERROR);
2564 	if (_bt_release(t, right) == RET_ERROR)
2565 		return (RET_ERROR);
2566 
2567 	/*
2568 	 *  That's it, split is done.  If we're doing a non-cached disk
2569 	 *  btree, we can free up the pages we allocated, as they're on
2570 	 *  disk, now.
2571 	 */
2572 
2573 	if (ISDISK(t) && !ISCACHE(t)) {
2574 		(void) free ((char *) left);
2575 		(void) free ((char *) right);
2576 	}
2577 
2578 	h->h_flags |= F_DIRTY;
2579 
2580 	return (RET_SUCCESS);
2581 }
2582 
2583 /*
2584  *  _BT_DOPSPLIT -- Do the work of splitting a page
2585  *
2586  *	This routine takes two page pointers and splits the data on the
2587  *	current page of the btree between them.
2588  *
2589  *	We do a lot of work here to handle duplicate keys on a page; we
2590  *	have to place these keys carefully to guarantee that later searches
2591  *	will find them correctly.  See comments in the code below for details.
2592  *
2593  *	Parameters:
2594  *		t -- tree to split
2595  *		left -- pointer to page to get left half of the data
2596  *		right -- pointer to page to get right half of the data
2597  *
2598  *	Returns:
2599  *		None.
2600  */
2601 
2602 static
2603 _bt_dopsplit(t, left, right)
2604 	BTREE_P t;
2605 	BTHEADER *left;
2606 	BTHEADER *right;
2607 {
2608 	BTHEADER *h = t->bt_curpage;
2609 	size_t psize;
2610 	char *where;
2611 	BTHEADER *where_h;
2612 	index_t where_i;
2613 	int nbytes, dsize, fixedsize, freespc;
2614 	index_t i;
2615 	index_t save_lower, save_upper, save_i;
2616 	index_t switch_i;
2617 	char *save_key;
2618 	DATUM *d;
2619 	CURSOR *c;
2620 	index_t top;
2621 	int free_save;
2622 	pgno_t chain;
2623 	int ignore;
2624 
2625 	/*
2626 	 *  Our strategy is to put half the bytes on each page.  We figure
2627 	 *  out how many bytes we have total, and then move items until
2628 	 *  the last item moved put at least 50% of the data on the left
2629 	 *  page.
2630 	 */
2631 	save_key = (char *) NULL;
2632 	psize = (int) t->bt_psize;
2633 	where = ((char *) left) + psize;
2634 	where_h = left;
2635 	where_i = 0;
2636 	nbytes = psize - (int) ((char *) &(h->h_linp[0]) - ((char *) h));
2637 	freespc = nbytes;
2638 
2639 	top = NEXTINDEX(h);
2640 	if (h->h_flags & F_LEAF)
2641 		fixedsize = (sizeof(DATUM) - sizeof(char));
2642 	else
2643 		fixedsize = (sizeof(IDATUM) - sizeof(char));
2644 
2645 	save_key = (char *) NULL;
2646 
2647 	/* move data */
2648 	for (i = 0; i < top; i++) {
2649 
2650 		/*
2651 		 *  Internal and leaf pages have different layouts for
2652 		 *  data items, but in both cases the first entry in the
2653 		 *  data item is a size_t.
2654 		 */
2655 		d = (DATUM *) GETDATUM(h,i);
2656 		if (h->h_flags & F_LEAF) {
2657 			dsize = d->d_ksize + d->d_dsize + fixedsize;
2658 		} else {
2659 			dsize = d->d_ksize + fixedsize;
2660 		}
2661 
2662 		/*
2663 		 *  If a page contains duplicate keys, we have to be
2664 		 *  careful about splits.  A sequence of duplicate keys
2665 		 *  may not begin in the middle of one page and end in
2666 		 *  the middle of another; it must begin on a page boundary,
2667 		 *  in order for searches on the internal nodes to work
2668 		 *  correctly.
2669 		 */
2670 		if (where_h == left) {
2671 			if (save_key == (char *) NULL) {
2672 				if (h->h_flags & F_LEAF) {
2673 					if (d->d_flags & D_BIGKEY) {
2674 						free_save = TRUE;
2675 						bcopy(&(d->d_bytes[0]),
2676 						     (char *) &chain,
2677 						     sizeof(chain));
2678 						if (_bt_getbig(t, chain,
2679 							       &save_key,
2680 							       &ignore)
2681 						    == RET_ERROR)
2682 							return (RET_ERROR);
2683 					} else {
2684 						free_save = FALSE;
2685 						save_key = (char *) &(d->d_bytes[0]);
2686 					}
2687 				} else {
2688 					IDATUM *id = (IDATUM *) d;
2689 
2690 					if (id->i_flags & D_BIGKEY) {
2691 						free_save = TRUE;
2692 						bcopy(&(id->i_bytes[0]),
2693 						     (char *) &chain,
2694 						     sizeof(chain));
2695 						if (_bt_getbig(t, chain,
2696 							       &save_key,
2697 							       &ignore)
2698 						    == RET_ERROR)
2699 							return (RET_ERROR);
2700 					} else {
2701 						free_save = FALSE;
2702 						save_key = (char *)
2703 							&(id->i_bytes[0]);
2704 					}
2705 				}
2706 				save_i = 0;
2707 				save_lower = where_h->h_lower;
2708 				save_upper = where_h->h_upper;
2709 			} else {
2710 				if (_bt_cmp(t, save_key, i) != 0) {
2711 					save_lower = where_h->h_lower;
2712 					save_upper = where_h->h_upper;
2713 					save_i = i;
2714 				}
2715 				if (h->h_flags & F_LEAF) {
2716 					if (free_save)
2717 						(void) free(save_key);
2718 					if (d->d_flags & D_BIGKEY) {
2719 						free_save = TRUE;
2720 						bcopy(&(d->d_bytes[0]),
2721 						     (char *) &chain,
2722 						     sizeof(chain));
2723 						if (_bt_getbig(t, chain,
2724 							       &save_key,
2725 							       &ignore)
2726 						    == RET_ERROR)
2727 							return (RET_ERROR);
2728 					} else {
2729 						free_save = FALSE;
2730 						save_key = (char *) &(d->d_bytes[0]);
2731 					}
2732 				} else {
2733 					IDATUM *id = (IDATUM *) d;
2734 
2735 					if (id->i_flags & D_BIGKEY) {
2736 						free_save = TRUE;
2737 						bcopy(&(id->i_bytes[0]),
2738 						     (char *) &chain,
2739 						     sizeof(chain));
2740 						if (_bt_getbig(t, chain,
2741 							       &save_key,
2742 							       &ignore)
2743 						    == RET_ERROR)
2744 							return (RET_ERROR);
2745 					} else {
2746 						free_save = FALSE;
2747 						save_key = (char *)
2748 							&(id->i_bytes[0]);
2749 					}
2750 				}
2751 			}
2752 		}
2753 
2754 		/* copy data and update page state */
2755 		where -= LONGALIGN(dsize);
2756 		(void) bcopy((char *) d, (char *) where, dsize);
2757 		where_h->h_upper = where_h->h_linp[where_i] =
2758 			(index_t) (where - (int) where_h);
2759 		where_h->h_lower += sizeof(index_t);
2760 		where_i++;
2761 
2762 		/* if we've moved half, switch to the right-hand page */
2763 		nbytes -= LONGALIGN(dsize) + sizeof(index_t);
2764 		if ((freespc - nbytes) > nbytes) {
2765 			nbytes = 2 * freespc;
2766 
2767 			/* identical keys go on the same page */
2768 			if (save_i > 0) {
2769 				/* i gets incremented at loop bottom... */
2770 				i = save_i - 1;
2771 				where_h->h_lower = save_lower;
2772 				where_h->h_upper = save_upper;
2773 			}
2774 			where = ((char *) right) + psize;
2775 			where_h = right;
2776 			switch_i = where_i;
2777 			where_i = 0;
2778 		}
2779 	}
2780 
2781 	/*
2782 	 *  If there was an active scan on the database, and we just
2783 	 *  split the page that the cursor was on, we may need to
2784 	 *  adjust the cursor to point to the same entry as before the
2785 	 *  split.
2786 	 */
2787 
2788 	c = &(t->bt_cursor);
2789 	if ((t->bt_flags & BTF_SEQINIT)
2790 	    && (c->c_pgno == h->h_pgno)
2791 	    && (c->c_index >= switch_i)) {
2792 		c->c_pgno = where_h->h_pgno;
2793 		c->c_index -= where_i;
2794 	}
2795 	return (RET_SUCCESS);
2796 }
2797 
2798 /*
2799  *  _BT_INSERTI -- Insert IDATUM on current page in the btree.
2800  *
2801  *	This routine handles insertions to internal pages after splits
2802  *	lower in the tree.  On entry, t->bt_curpage is the page to get
2803  *	the new IDATUM.  We are also given pgno, the page number of the
2804  *	IDATUM that is immediately left of the new IDATUM's position.
2805  *	This guarantees that the IDATUM for the right half of the page
2806  *	after a split goes next to the IDATUM for its left half.
2807  *
2808  *	Parameters:
2809  *		t -- tree in which to do insertion.
2810  *		id -- new IDATUM to insert
2811  *		pgno -- page number of IDATUM left of id's position
2812  *
2813  *	Returns:
2814  *		RET_SUCCESS, RET_ERROR.
2815  */
2816 
2817 static int
2818 _bt_inserti(t, id, pgno)
2819 	BTREE_P t;
2820 	IDATUM *id;
2821 	pgno_t pgno;
2822 {
2823 	BTHEADER *h = t->bt_curpage;
2824 	index_t next, i;
2825 	IDATUM *idx;
2826 	char *key;
2827 	pgno_t chain;
2828 	int free_key;
2829 	int ignore;
2830 
2831 	if (id->i_flags & D_BIGKEY) {
2832 		free_key = TRUE;
2833 		bcopy(&(id->i_bytes[0]), (char *) &chain, sizeof(chain));
2834 		if (_bt_getbig(t, chain, &key, &ignore) == RET_ERROR)
2835 			return (RET_ERROR);
2836 	} else {
2837 		free_key = FALSE;
2838 		key = &(id->i_bytes[0]);
2839 	}
2840 	i = _bt_binsrch(t, key);
2841 
2842 	next = NEXTINDEX(h);
2843 	while (i < next && _bt_cmp(t, key, i) >= 0)
2844 		i++;
2845 
2846 	if (free_key)
2847 		(void) free(key);
2848 
2849 	/* okay, now we're close; find adjacent IDATUM */
2850 	for (;;) {
2851 		idx = (IDATUM *) GETDATUM(h,i);
2852 		if (idx->i_pgno == pgno) {
2853 			i++;
2854 			break;
2855 		}
2856 		--i;
2857 	}
2858 
2859 	/* correctly positioned, do the insertion */
2860 	return (_bt_insertat(t, id, i));
2861 }
2862 
2863 /*
2864  *  _BT_INSERTAT -- Insert a datum at a given location on the current page.
2865  *
2866  *	This routine does insertions on both leaf and internal pages.
2867  *
2868  *	Parameters:
2869  *		t -- tree in which to do insertion.
2870  *		p -- DATUM or IDATUM to insert.
2871  *		index -- index in line pointer array to put this item.
2872  *
2873  *	Returns:
2874  *		RET_SUCCESS, RET_ERROR.
2875  *
2876  *	Side Effects:
2877  *		Will rearrange line pointers to make space for the new
2878  *		entry.  This means that any scans currently active are
2879  *		invalid after this.
2880  *
2881  *	Warnings:
2882  *		There must be sufficient room for the new item on the page.
2883  */
2884 
2885 static int
2886 _bt_insertat(t, p, index)
2887 	BTREE_P t;
2888 	char *p;
2889 	index_t index;
2890 {
2891 	IDATUM *id = (IDATUM *) p;
2892 	DATUM *d = (DATUM *) p;
2893 	BTHEADER *h;
2894 	CURSOR *c;
2895 	index_t nxtindex;
2896 	char *src, *dest;
2897 	int nbytes;
2898 
2899 	/* insertion may confuse an active scan.  fix it. */
2900 	c = &(t->bt_cursor);
2901 	if (t->bt_flags & BTF_SEQINIT && t->bt_curpage->h_pgno == c->c_pgno)
2902 		if (_bt_fixscan(t, index, d, INSERT) == RET_ERROR)
2903 			return (RET_ERROR);
2904 
2905 	h = t->bt_curpage;
2906 	nxtindex = (index_t) NEXTINDEX(h);
2907 
2908 	/*
2909 	 *  If we're inserting at the middle of the line pointer array,
2910 	 *  copy pointers that will follow the new one up on the page.
2911 	 */
2912 
2913 	if (index < nxtindex) {
2914 		src = (char *) &(h->h_linp[index]);
2915 		dest = (char *) &(h->h_linp[index + 1]);
2916 		nbytes = (h->h_lower - (src - ((char *) h)))
2917 			 + sizeof(h->h_linp[0]);
2918 		(void) bcopy(src, dest, nbytes);
2919 	}
2920 
2921 	/* compute size and copy data to page */
2922 	if (h->h_flags & F_LEAF) {
2923 		nbytes = d->d_ksize + d->d_dsize
2924 			 + (sizeof(DATUM) - sizeof(char));
2925 	} else {
2926 		nbytes = id->i_size + (sizeof(IDATUM) - sizeof(char));
2927 	}
2928 	dest = (((char *) h) + h->h_upper) - LONGALIGN(nbytes);
2929 	(void) bcopy((char *) p, dest, nbytes);
2930 
2931 	/* update statistics */
2932 	dest -= (int) h;
2933 	h->h_linp[index] = (index_t) dest;
2934 	h->h_upper = (index_t) dest;
2935 	h->h_lower += sizeof(index_t);
2936 
2937 	/* we're done */
2938 	h->h_flags |= F_DIRTY;
2939 
2940 	return (RET_SUCCESS);
2941 }
2942 
2943 /*
2944  *  _BT_BINSRCH -- Do a binary search for a given key on the current page.
2945  *
2946  *	Searches on internal pages are handled slightly differently from
2947  *	searches on leaf pages.  This is because internal page searches
2948  *	find the largest item <= key in the tree, and leaf searches find
2949  *	the smallest item >= key.  This guarantees that leaf page searches
2950  *	leave us pointing at the item's correct position, and internal
2951  *	searches descend the tree correctly.
2952  *
2953  *	Parameters:
2954  *		t -- tree to search
2955  *		key -- key we're looking for
2956  *
2957  *	Returns:
2958  *		Index of the line pointer array entry for the (closest)
2959  *		match to key on the current page, with "closest" as defined
2960  *		above.
2961  */
2962 
2963 static index_t
2964 _bt_binsrch(t, key)
2965 	BTREE_P t;
2966 	char *key;
2967 {
2968 	index_t lbound, ubound, cur;
2969 	BTHEADER *h = t->bt_curpage;
2970 	IDATUM *id;
2971 	int match = 0;
2972 	int r;
2973 
2974 	lbound = 0;
2975 	ubound = NEXTINDEX(h);
2976 	if (ubound > 0)
2977 		--ubound;
2978 
2979 	/* do a binary search on the current page */
2980 	while ((ubound - lbound) > 1) {
2981 		cur = lbound + ((ubound - lbound) / 2);
2982 		r = _bt_cmp(t, key, cur);
2983 
2984 		if (r > 0)
2985 			lbound = cur + 1;
2986 		else if (r < 0)
2987 			ubound = cur;
2988 		else {
2989 			match++;
2990 			break;
2991 		}
2992 	}
2993 
2994 	/*
2995 	 *  At this point, the binary search terminated because the endpoints
2996 	 *  got too close together, or we have a match.  Figure out which
2997 	 *  case applies, decide what to do based on the page type (leaf or
2998 	 *  internal), and do the right thing.
2999 	 */
3000 	if (match) {
3001 		return (cur);
3002 	} else if (ubound != lbound) {
3003 		if (h->h_flags & F_LEAF) {
3004 			r = _bt_cmp(t, key, lbound);
3005 			if (r <= 0) {
3006 				return (lbound);
3007 			}
3008 		} else {
3009 			r = _bt_cmp(t, key, ubound);
3010 
3011 			/* for internal nodes, move as far left as possible */
3012 			if (r < 0) {
3013 				r = _bt_cmp(t, key, lbound);
3014 				if (r < 0 && lbound > 0)
3015 					--lbound;
3016 				return (lbound);
3017 			} else {
3018 				return (ubound);
3019 			}
3020 		}
3021 	}
3022 
3023 	if (h->h_flags & F_LEAF) {
3024 		if (ubound < NEXTINDEX(h)) {
3025 			r = _bt_cmp(t, key, ubound);
3026 			if (r > 0)
3027 				ubound++;
3028 		}
3029 	} else {
3030 		/* for internal pages, move as far left as possible */
3031 		if (ubound == NEXTINDEX(h))
3032 			ubound--;
3033 
3034 		while (_bt_cmp(t, key, ubound) < 0)
3035 			ubound--;
3036 	}
3037 	return (ubound);
3038 }
3039 
3040 /*
3041  *  BT_SEQ -- Sequential scan interface.
3042  *
3043  *	This routine supports sequential scans on the btree.  If called with
3044  *	flags set to R_CURSOR, or if no seq scan has been initialized in the
3045  *	current tree, we initialize the scan.  Otherwise, we advance the scan
3046  *	and return the next item.
3047  *
3048  *	Scans can be either keyed or non-keyed.  Keyed scans basically have
3049  *	a starting point somewhere in the middle of the tree.  Non-keyed
3050  *	scans start at an endpoint.  Also, scans can be backward (descending
3051  *	order), forward (ascending order), or no movement (keep returning
3052  *	the same item).
3053  *
3054  *	Flags is checked every time we enter the routine, so the user can
3055  *	change directions on an active scan if desired.  The key argument
3056  *	is examined only when we initialize the scan, in order to position
3057  *	it properly.
3058  *
3059  *	Items are returned via the key and data arguments passed in.
3060  *
3061  *	Parameters:
3062  *		tree -- btree in which to do scan
3063  *		key -- key, used to position scan on initialization, and
3064  *		       used to return key components to the user.
3065  *		data -- used to return data components to the user.
3066  *		flags -- specify R_CURSOR, R_FIRST, R_LAST, R_NEXT, or
3067  *			 R_PREV.
3068  *
3069  *	Returns:
3070  *		RET_SUCCESS, RET_ERROR, or RET_SPECIAL if no more data
3071  *		exists in the tree in the specified direction.
3072  *
3073  *	Side Effects:
3074  *		Changes the btree's notion of the current position in the
3075  *		scan.
3076  *
3077  *	Warnings:
3078  *		The key and data items returned are static and will be
3079  *		overwritten by the next bt_get or bt_seq.
3080  */
3081 
3082 bt_seq(tree, key, data, flags)
3083 	BTREE tree;
3084 	DBT *key;
3085 	DBT *data;
3086 	int flags;
3087 {
3088 	BTREE_P t = (BTREE_P) tree;
3089 	BTHEADER *h;
3090 	DATUM *d;
3091 	int status;
3092 
3093 	/* do we need to initialize the scan? */
3094 	if (flags == R_CURSOR || flags == R_LAST || flags == R_FIRST
3095 	    || !(t->bt_flags & BTF_SEQINIT)) {
3096 
3097 		/* initialize it */
3098 		status = _bt_seqinit(t, key, flags);
3099 	} else {
3100 		/* just advance the current scan pointer */
3101 		status = _bt_seqadvance(t, flags);
3102 	}
3103 
3104 	key->size = data->size = 0;
3105 	key->data = data->data = (char *) NULL;
3106 
3107 	h = t->bt_curpage;
3108 
3109 	/* is there a valid item at the current scan location? */
3110 	if (status == RET_SPECIAL) {
3111 		if (flags == R_NEXT) {
3112 			if (t->bt_cursor.c_index >= NEXTINDEX(h)) {
3113 				if (NEXTINDEX(h) > 0)
3114 					t->bt_cursor.c_index = NEXTINDEX(h) - 1;
3115 				else
3116 					t->bt_cursor.c_index = 0;
3117 			}
3118 		} else {
3119 			t->bt_cursor.c_index = 0;
3120 		}
3121 		return (RET_SPECIAL);
3122 	} else if (status == RET_ERROR)
3123 		return (RET_ERROR);
3124 
3125 	/* okay, return the data */
3126 	d = (DATUM *) GETDATUM(h, t->bt_cursor.c_index);
3127 
3128 	return (_bt_buildret(t, d, data, key));
3129 }
3130 
3131 /*
3132  *  _BT_BUILDRET -- Build return key/data pair as a result of search or scan.
3133  *
3134  *	This routine manages the statically allocated buffers in which we
3135  *	return data to the user.
3136  *
3137  *	Parameters:
3138  *		t -- btree from which to return datum
3139  *		d -- DATUM to be returned to the user.
3140  *		data -- data argument supplied by user for return
3141  *		key -- key argument supplied by user for return
3142  *
3143  *	Returns:
3144  *		RET_SUCCESS, RET_ERROR.
3145  *
3146  *	Side Effects:
3147  *		May free and reallocate static buffers, if the data
3148  *		we want to return is bigger than the space we have to
3149  *		do so.
3150  */
3151 
3152 static int
3153 _bt_buildret(t, d, data, key)
3154 	BTREE_P t;
3155 	DATUM *d;
3156 	DBT *data;
3157 	DBT *key;
3158 {
3159 	static int _data_s = 0;
3160 	static int _key_s = 0;
3161 	static char *_data = (char *) NULL;
3162 	static char *_key = (char *) NULL;
3163 	char *from, *where, *top;
3164 	pgno_t save;
3165 	pgno_t chain;
3166 	size_t nbytes;
3167 	size_t nhere;
3168 	BTHEADER *h;
3169 
3170 	if (d->d_flags & D_BIGKEY) {
3171 		if (_key != (char *) NULL)
3172 			(void) free(_key);
3173 		(void) bcopy((char *) &(d->d_bytes[0]),
3174 		      	     (char *) &chain,
3175 		      	     sizeof(chain));
3176 		if (_bt_getbig(t, chain, &_key, &_key_s) == RET_ERROR)
3177 			return (RET_ERROR);
3178 		key->data = _key;
3179 		key->size = _key_s;
3180 	} else {
3181 		/* need more space for key? */
3182 		if (d->d_ksize > _key_s) {
3183 			if (_key != (char *) NULL)
3184 				(void) free (_key);
3185 			if ((_key = (char *) malloc((unsigned) d->d_ksize))
3186 			    == (char *) NULL)
3187 				return (RET_ERROR);
3188 			_key_s = d->d_ksize;
3189 		}
3190 
3191 		key->data = _key;
3192 		if ((key->size = d->d_ksize) > 0)
3193 			(void) bcopy(&(d->d_bytes[0]),
3194 				     _key,
3195 				     (int) d->d_ksize);
3196 	}
3197 
3198 	if (d->d_flags & D_BIGDATA) {
3199 		if (_data != (char *) NULL)
3200 			(void) free(_data);
3201 		(void) bcopy(&(d->d_bytes[d->d_ksize]),
3202 		      	     (char *) &chain,
3203 		      	     sizeof(chain));
3204 		if (_bt_getbig(t, chain, &_data, &_data_s) == RET_ERROR)
3205 			return (RET_ERROR);
3206 		data->data = _data;
3207 		data->size = _data_s;
3208 	} else {
3209 		/* need more space for data? */
3210 		if (d->d_dsize > _data_s) {
3211 			if (_data != (char *) NULL)
3212 				(void) free (_data);
3213 			if ((_data = (char *) malloc((unsigned) d->d_dsize))
3214 			    == (char *) NULL)
3215 				return (RET_ERROR);
3216 			_data_s = d->d_dsize;
3217 		}
3218 
3219 		data->data = _data;
3220 
3221 		if ((data->size = d->d_dsize) > 0)
3222 			(void) bcopy(&(d->d_bytes[d->d_ksize]),
3223 				      _data,
3224 				      d->d_dsize);
3225 	}
3226 
3227 	return (RET_SUCCESS);
3228 }
3229 
3230 /*
3231  *  _BT_SEQINIT -- Initialize a sequential scan on the btree.
3232  *
3233  *	Sets the tree's notion of the current scan location correctly
3234  *	given a key and a direction.
3235  *
3236  *	Parameters:
3237  *		t -- tree in which to initialize scan
3238  *		key -- key for initial scan position
3239  *		flags -- R_NEXT, R_PREV
3240  *
3241  *	Returns:
3242  *		RET_SUCCESS, RET_ERROR, or RET_SPECIAL if there's no data
3243  *		in the tree to scan.
3244  *
3245  *	Side Effects:
3246  *		Changes current scan position for the tree.  Almost certainly
3247  *		changes current page, as well.  Sets BTF_SEQINIT bit in tree
3248  *		flags, so that we know we've initialized a scan.
3249  */
3250 
3251 static int
3252 _bt_seqinit(t, key, flags)
3253 	BTREE_P t;
3254 	DBT *key;
3255 	int flags;
3256 {
3257 	BTITEM *item;
3258 	BTHEADER *h;
3259 	CURSOR *c;
3260 	IDATUM *id;
3261 	pgno_t pgno;
3262 	index_t index;
3263 	index_t last;
3264 
3265 	/*
3266 	 *  Figure out if we really have to search for the key that the
3267 	 *  user supplied.  If key is null, then this is an unkeyed scan
3268 	 *  and we can just start from an endpoint.
3269 	 */
3270 
3271 	c = &(t->bt_cursor);
3272 
3273 	if (flags == R_CURSOR) {
3274 		if (key->data != (char *) NULL) {
3275 
3276 			/* key supplied, find it */
3277 			item = _bt_search(t, key);
3278 			c->c_index = item->bti_index;
3279 			c->c_pgno = t->bt_curpage->h_pgno;
3280 		} else {
3281 			errno = EINVAL;
3282 			return (RET_ERROR);
3283 		}
3284 
3285 	} else {
3286 
3287 		/*
3288 		 *  Unkeyed scan.  For backward scans, find the last item
3289 		 *  in the tree; for forward scans, find the first item.
3290 		 */
3291 
3292 		if (_bt_getpage(t, P_ROOT) == RET_ERROR)
3293 			return (RET_ERROR);
3294 		h = t->bt_curpage;
3295 		if (flags == R_LAST || flags == R_PREV) {
3296 
3297 			/* backward scan */
3298 			while (!(h->h_flags & F_LEAF)) {
3299 				last = NEXTINDEX(h) - 1;
3300 				id = (IDATUM *) GETDATUM(h,last);
3301 				if (_bt_getpage(t, id->i_pgno) == RET_ERROR)
3302 					return (RET_ERROR);
3303 				h = t->bt_curpage;
3304 			}
3305 
3306 			/* skip empty pages */
3307 			while (NEXTINDEX(h) == 0 && h->h_prevpg != P_NONE) {
3308 				if (_bt_getpage(t, h->h_prevpg) == RET_ERROR)
3309 					return (RET_ERROR);
3310 				h = t->bt_curpage;
3311 			}
3312 
3313 			c->c_pgno = h->h_pgno;
3314 			if (NEXTINDEX(h) > 0)
3315 				c->c_index = NEXTINDEX(h) - 1;
3316 			else
3317 				c->c_index = 0;
3318 		} else if (flags == R_FIRST || flags == R_NEXT) {
3319 			/* forward scan */
3320 			while (!(h->h_flags & F_LEAF)) {
3321 				id = (IDATUM *) GETDATUM(h,0);
3322 				if (_bt_getpage(t, id->i_pgno) == RET_ERROR)
3323 					return (RET_ERROR);
3324 				h = t->bt_curpage;
3325 			}
3326 
3327 			/* skip empty pages */
3328 			while (NEXTINDEX(h) == 0 && h->h_nextpg != P_NONE) {
3329 				if (_bt_getpage(t, h->h_nextpg) == RET_ERROR)
3330 					return (RET_ERROR);
3331 				h = t->bt_curpage;
3332 			}
3333 
3334 			c->c_pgno = h->h_pgno;
3335 			c->c_index = 0;
3336 		} else {
3337 			/* no flags passed in */
3338 			errno = EINVAL;
3339 			return (RET_ERROR);
3340 		}
3341 	}
3342 
3343 	/* okay, scan is initialized */
3344 	t->bt_flags |= BTF_SEQINIT;
3345 
3346 	if (c->c_index == NEXTINDEX(t->bt_curpage))
3347 		return (RET_SPECIAL);
3348 
3349 	return (RET_SUCCESS);
3350 }
3351 
3352 /*
3353  *  _BT_SEQADVANCE -- Advance the sequential scan on this tree.
3354  *
3355  *	Moves the current location pointer for the scan on this tree one
3356  *	spot in the requested direction.
3357  *
3358  *	Parameters:
3359  *		t -- btree being scanned
3360  *		flags -- for R_NEXT, R_PREV
3361  *
3362  *	Returns:
3363  *		RET_SUCCESS, RET_ERROR, or RET_SPECIAL if there is no
3364  *		more data in the specified direction.
3365  *
3366  *	Side Effects:
3367  *		May change current page.
3368  */
3369 
3370 static int
3371 _bt_seqadvance(t, flags)
3372 	BTREE_P t;
3373 	int flags;
3374 {
3375 	BTHEADER *h;
3376 	CURSOR *c;
3377 	index_t index;
3378 
3379 	c = &(t->bt_cursor);
3380 	index = c->c_index;
3381 
3382 	if (_bt_getpage(t, c->c_pgno) == RET_ERROR)
3383 		return (RET_ERROR);
3384 	h = t->bt_curpage;
3385 
3386 	/* by the time we get here, don't need the cursor key anymore */
3387 	if (c->c_key != (char *) NULL)
3388 		(void) free(c->c_key);
3389 
3390 	if (flags == R_NEXT) {
3391 
3392 		/*
3393 		 *  This is a forward scan.  If the cursor is pointing
3394 		 *  at a virtual record (that is, it was pointing at
3395 		 *  a record that got deleted), then we should return
3396 		 *  the record it's pointing at now.  Otherwise, we
3397 		 *  should advance the scan.  In either case, we need
3398 		 *  to be careful not to run off the end of the current
3399 		 *  page.
3400 		 */
3401 
3402 		if (c->c_flags & CRSR_BEFORE) {
3403 
3404 			if (index >= NEXTINDEX(h)) {
3405 				/* out of items on this page, get next page */
3406 				if (h->h_nextpg == P_NONE) {
3407 					/* tell caller we're done... */
3408 					c->c_index = NEXTINDEX(h);
3409 					return (RET_SPECIAL);
3410 				}
3411 
3412 				/* skip empty pages */
3413 				do {
3414 					if (_bt_getpage(t, h->h_nextpg)
3415 					    == RET_ERROR) {
3416 						c->c_index = NEXTINDEX(h);
3417 						return (RET_ERROR);
3418 					}
3419 					h = t->bt_curpage;
3420 					c->c_pgno = h->h_pgno;
3421 				} while (NEXTINDEX(h) == 0
3422 					 && h->h_nextpg != P_NONE);
3423 
3424 				if (NEXTINDEX(h) == 0) {
3425 					/* tell caller we're done */
3426 					c->c_index = NEXTINDEX(h);
3427 					return (RET_SPECIAL);
3428 				}
3429 				index = 0;
3430 			}
3431 			c->c_flags &= ~CRSR_BEFORE;
3432 
3433 		} else if (++index >= NEXTINDEX(h)) {
3434 
3435 			/* out of items on this page, get next page */
3436 			if (h->h_nextpg == P_NONE) {
3437 				/* tell caller we're done... */
3438 				c->c_index = NEXTINDEX(h);
3439 				return (RET_SPECIAL);
3440 			}
3441 
3442 			/* skip empty pages */
3443 			do {
3444 				if (_bt_getpage(t, h->h_nextpg) == RET_ERROR) {
3445 					c->c_index = NEXTINDEX(h);
3446 					return (RET_ERROR);
3447 				}
3448 				h = t->bt_curpage;
3449 				c->c_pgno = h->h_pgno;
3450 			} while (NEXTINDEX(h) == 0 && h->h_nextpg != P_NONE);
3451 
3452 			if (NEXTINDEX(h) == 0) {
3453 				/* tell caller we're done */
3454 				c->c_index = NEXTINDEX(h);
3455 				return (RET_SPECIAL);
3456 			}
3457 			index = 0;
3458 		}
3459 	} else if (flags == R_PREV) {
3460 
3461 		/* for backward scans, life is substantially easier */
3462 		c->c_flags &= ~CRSR_BEFORE;
3463 		if (c->c_key != (char *) NULL) {
3464 			(void) free(c->c_key);
3465 			c->c_key = (char *) NULL;
3466 		}
3467 
3468 		if (index == 0) {
3469 
3470 			/* we may be done */
3471 			c->c_index = 0;
3472 
3473 			/* out of items on this page, get next page */
3474 			if (h->h_prevpg == P_NONE)
3475 				return (RET_SPECIAL);
3476 
3477 			/* skip empty pages */
3478 			do {
3479 				if (_bt_getpage(t, h->h_prevpg) == RET_ERROR)
3480 					return (RET_ERROR);
3481 				h = t->bt_curpage;
3482 				c->c_pgno = h->h_pgno;
3483 			} while (NEXTINDEX(h) == 0 && h->h_prevpg != P_NONE);
3484 
3485 			if (NEXTINDEX(h) == 0)
3486 				return (RET_SPECIAL);
3487 
3488 			index = NEXTINDEX(h) - 1;
3489 		} else
3490 			--index;
3491 	} else {
3492 		/* must specify a direction */
3493 		errno = EINVAL;
3494 		return (RET_ERROR);
3495 	}
3496 
3497 	c->c_index = index;
3498 	return (RET_SUCCESS);
3499 }
3500 
3501 /*
3502  *  BT_CLOSE -- Close a btree
3503  *
3504  *	Parameters:
3505  *		tree -- tree to close
3506  *
3507  *	Returns:
3508  *		RET_SUCCESS, RET_ERROR.
3509  *
3510  *	Side Effects:
3511  *		Frees space occupied by the tree.
3512  */
3513 
3514 int
3515 bt_close(tree)
3516 	BTREE tree;
3517 {
3518 	BTREE_P t = (BTREE_P) tree;
3519 	int i;
3520 	BTHEADER *h;
3521 	char *cache;
3522 	struct HTBUCKET *b, *sb;
3523 	HTABLE ht;
3524 	int fd;
3525 
3526 	if (t->bt_cursor.c_key != (char *) NULL)
3527 		(void) free(t->bt_cursor.c_key);
3528 
3529 	if (!ISDISK(t)) {
3530 		/* in-memory tree, release hash table memory */
3531 		ht = t->bt_s.bt_ht;
3532 		for (i = 0; i < HTSIZE; i++) {
3533 			if ((b = ht[i]) == (struct HTBUCKET *) NULL)
3534 				break;
3535 			do {
3536 				sb = b;
3537 				(void) free((char *) b->ht_page);
3538 				b = b->ht_next;
3539 				(void) free((char *) sb);
3540 			} while (b != (struct HTBUCKET *) NULL);
3541 		}
3542 		(void) free ((char *) ht);
3543 		(void) free ((char *) t);
3544 		return (RET_SUCCESS);
3545 	}
3546 
3547 	if ((t->bt_flags & BTF_ISWRITE) && !(t->bt_flags & BTF_METAOK)) {
3548 		if (_bt_wrtmeta(t) == RET_ERROR)
3549 			return (RET_ERROR);
3550 	}
3551 
3552 	if (t->bt_curpage != (BTHEADER *) NULL) {
3553 		h = t->bt_curpage;
3554 		if (h->h_flags & F_DIRTY) {
3555 			if (_bt_write(t, h, RELEASE) == RET_ERROR)
3556 				return (RET_ERROR);
3557 		} else {
3558 			if (_bt_release(t, h) == RET_ERROR)
3559 				return (RET_ERROR);
3560 		}
3561 
3562 		/* flush and free the cache, if there is one */
3563 		if (ISCACHE(t)) {
3564 			cache = t->bt_s.bt_d.d_cache;
3565 			lrusync(cache);
3566 			lrufree(cache);
3567 		}
3568 		(void) free ((char *) h);
3569 	}
3570 
3571 	fd = t->bt_s.bt_d.d_fd;
3572 	(void) free ((char *) t);
3573 	return (close(fd));
3574 }
3575 
3576 /*
3577  *  _BT_ALLOCPG -- allocate a new page in the btree.
3578  *
3579  *	This is called when we split a page, to get space to do the split.
3580  *	For disk btrees, these pages are released when the split is done.
3581  *	For memory btrees, they are not.
3582  *
3583  *	Parameters:
3584  *		t -- tree in which to do split
3585  *
3586  *	Returns:
3587  *		Pointer to the newly-allocated page
3588  */
3589 
3590 static BTHEADER *
3591 _bt_allocpg(t)
3592 	BTREE_P t;
3593 {
3594 	BTHEADER *h = t->bt_curpage;
3595 	BTHEADER *nh;
3596 	char *cache;
3597 	int nbytes;
3598 
3599 	/* if we have a cache, let the cache code do the work */
3600 	if (ISDISK(t) && ISCACHE(t)) {
3601 		nh = (BTHEADER *) lrugetnew(t->bt_s.bt_d.d_cache,
3602 					    t->bt_npages + 1,
3603 					    &nbytes);
3604 	} else {
3605 		nh = (BTHEADER *) malloc((unsigned) t->bt_psize);
3606 	}
3607 
3608 	if (nh != (BTHEADER *) NULL) {
3609 		nh->h_pgno = nh->h_prevpg = nh->h_nextpg = P_NONE;
3610 		nh->h_flags = h->h_flags;
3611 		nh->h_lower = (index_t)
3612 				(((char *) &(nh->h_linp[0])) - ((char *) nh));
3613 		nh->h_upper = t->bt_psize;
3614 	}
3615 
3616 	return (nh);
3617 }
3618 
3619 /*
3620  *  _BT_WRITE -- Write a specific page to a btree file.
3621  *
3622  *	Parameters:
3623  *		t -- btree to get the page
3624  *		h -- page to write
3625  *		relflag -- (int) this page may/may not be released
3626  *
3627  *	Returns:
3628  *		RET_SUCCESS, RET_ERROR.
3629  *
3630  *	Side Effects:
3631  *		Writes a metadata page if none has been written yet.
3632  */
3633 
3634 static int
3635 _bt_write(t, h, relflag)
3636 	BTREE_P t;
3637 	BTHEADER *h;
3638 	int relflag;
3639 {
3640 	long pos;
3641 	int htindex;
3642 	HTBUCKET *b;
3643 	char *cache;
3644 	BTMETA m;
3645 	pgno_t pgno;
3646 
3647 	h->h_flags &= ~F_DIRTY;
3648 	if (ISDISK(t)) {
3649 
3650 		/* if we haven't done so yet, write the metadata */
3651 		if (!(t->bt_flags & BTF_METAOK)) {
3652 			if (_bt_wrtmeta(t) == RET_ERROR)
3653 				return (RET_ERROR);
3654 		}
3655 
3656 		pgno = h->h_pgno;
3657 
3658 
3659 		/* if we have a cache, let the cache code do the work */
3660 		if ((cache = t->bt_s.bt_d.d_cache) != (char *) NULL) {
3661 			if (lruwrite(cache, pgno) == RET_ERROR)
3662 				return (RET_ERROR);
3663 			if (relflag && lrurelease(cache, pgno) == RET_ERROR)
3664 				return (RET_ERROR);
3665 
3666 		} else {
3667 			(void) _bt_pgout(h, (char *) t->bt_lorder);
3668 			/* now write the current page */
3669 			pos = (long) (pgno * t->bt_psize);
3670 			if (lseek(t->bt_s.bt_d.d_fd, pos, L_SET) != pos)
3671 				return (RET_ERROR);
3672 			if (write(t->bt_s.bt_d.d_fd, h, t->bt_psize)
3673 			    < t->bt_psize)
3674 				return (RET_ERROR);
3675 			if (!relflag)
3676 				(void) _bt_pgin(h, (char *) t->bt_lorder);
3677 		}
3678 	} else {
3679 		/* in-memory btree */
3680 		htindex = HASHKEY(h->h_pgno);
3681 
3682 		/* see if we need to overwrite existing entry */
3683 		for (b = t->bt_s.bt_ht[htindex];
3684 		     b != (HTBUCKET *) NULL;
3685 		     b = b->ht_next) {
3686 			if (b->ht_pgno == h->h_pgno) {
3687 				b->ht_page = h;
3688 				return (RET_SUCCESS);
3689 			}
3690 		}
3691 
3692 		/* new entry, write it */
3693 		b = (HTBUCKET *) malloc((unsigned) sizeof (HTBUCKET));
3694 		if (b == (HTBUCKET *) NULL)
3695 			return (RET_ERROR);
3696 
3697 		b->ht_pgno = h->h_pgno;
3698 		b->ht_page = h;
3699 		b->ht_next = t->bt_s.bt_ht[htindex];
3700 		t->bt_s.bt_ht[htindex] = b;
3701 	}
3702 	return (RET_SUCCESS);
3703 }
3704 
3705 /*
3706  *  _BT_RELEASE -- Release a locked-down cache page
3707  *
3708  *	During page splits, we want to force pages out to the cache
3709  *	before we actually release them, in some cases.  This routine
3710  *	releases such a page when it is no longer needed.
3711  *
3712  *	Parameters:
3713  *		t -- btree in which to release page
3714  *		h -- page to release
3715  *
3716  *	Returns:
3717  *		RET_SUCCESS, RET_ERROR.
3718  *
3719  *	Side Effects:
3720  *		None.
3721  */
3722 
3723 static int
3724 _bt_release(t, h)
3725 	BTREE_P t;
3726 	BTHEADER *h;
3727 {
3728 	if (ISDISK(t) && ISCACHE(t)) {
3729 		if (lrurelease(t->bt_s.bt_d.d_cache, h->h_pgno) < 0)
3730 			return (RET_ERROR);
3731 	}
3732 	return (RET_SUCCESS);
3733 }
3734 
3735 /*
3736  *  _BT_WRTMETA -- Write metadata to the btree.
3737  *
3738  *	Parameters:
3739  *		t -- tree to which to write
3740  *
3741  *	Returns:
3742  *		RET_SUCCESS, RET_ERROR.
3743  */
3744 
3745 static int
3746 _bt_wrtmeta(t)
3747 	BTREE_P t;
3748 {
3749 	BTMETA m;
3750 
3751 	if (lseek(t->bt_s.bt_d.d_fd, 0l, L_SET) != 0l)
3752 		return (RET_ERROR);
3753 
3754 	/* lorder has to be in host-independent format */
3755 	m.m_lorder = (u_long) htonl((long) t->bt_lorder);
3756 
3757 	m.m_magic = BTREEMAGIC;
3758 	m.m_version = BTREEVERSION;
3759 	m.m_psize = t->bt_psize;
3760 	m.m_free = t->bt_free;
3761 	m.m_flags = t->bt_flags & BTF_NODUPS;
3762 
3763 	if (t->bt_lorder != BYTE_ORDER) {
3764 		BLSWAP(m.m_magic);
3765 		BLSWAP(m.m_version);
3766 		BLSWAP(m.m_psize);
3767 		BLSWAP(m.m_free);
3768 		BLSWAP(m.m_flags);
3769 	}
3770 
3771 	if (write(t->bt_s.bt_d.d_fd, &m, sizeof(m))
3772 	    != sizeof(m)) {
3773 		return (RET_ERROR);
3774 	}
3775 
3776 	t->bt_flags |= BTF_METAOK;
3777 
3778 	return (RET_SUCCESS);
3779 }
3780 
3781 #ifdef DEBUG
3782 void
3783 _btdump(tree)
3784 	BTREE tree;
3785 {
3786 	BTREE_P t = (BTREE_P) tree;
3787 	DATUM *d;
3788 	IDATUM *id;
3789 	BTHEADER *h;
3790 	pgno_t npages;
3791 	pgno_t i;
3792 	index_t cur, top;
3793 
3794 	npages = t->bt_npages;
3795 	(void) printf("\"%s\" fd %d pgsz %d curpg %d @ 0x%lx",
3796 		t->bt_fname, t->bt_s.bt_d.d_fd,
3797 		t->bt_psize, t->bt_curpage);
3798 	(void) printf("npg %d cmp 0x%lx flags=(", npages, t->bt_compare);
3799 	if (t->bt_flags & BTF_SEQINIT)
3800 		(void) printf("BTF_SEQINIT");
3801 	(void) printf(")\n");
3802 
3803 	for (i = P_ROOT; i <= npages; i++) {
3804 		if (_bt_getpage(t, i) == RET_ERROR)
3805 			_punt();
3806 		h = t->bt_curpage;
3807 		top = NEXTINDEX(h);
3808 		(void) printf("    page %d:\n", i);
3809 		(void) printf("\tpgno %d prev %d next %d\n",
3810 			h->h_pgno, h->h_prevpg, h->h_nextpg);
3811 		(void) printf("\tlower %d upper %d nextind %d flags (",
3812 			h->h_lower, h->h_upper, top);
3813 		if (h->h_flags & F_LEAF)
3814 			(void) printf("F_LEAF");
3815 		else
3816 			(void) printf("<internal>");
3817 		if (h->h_flags & F_DIRTY)
3818 			(void) printf("|F_DIRTY");
3819 		if (h->h_flags & F_PRESERVE)
3820 			(void) printf("|F_PRESERVE");
3821 		if (h->h_flags & F_CONT) {
3822 			(void) printf("|F_CONT)");
3823 			if (h->h_prevpg == P_NONE) {
3824 				size_t longsz;
3825 				(void) bcopy((char *) &(h->h_linp[0]),
3826 					      (char *) &longsz,
3827 					      sizeof(longsz));
3828 				printf("\n\t\t(chain start, data length %ld)",
3829 					longsz);
3830 			}
3831 			printf("\n");
3832 			continue;
3833 		}
3834 		(void) printf(")\n");
3835 		for (cur = 0; cur < top; cur++) {
3836 			(void) printf("\t  [%d] off %d ", cur, h->h_linp[cur]);
3837 			if (h->h_flags & F_LEAF) {
3838 				d = (DATUM *) GETDATUM(h,cur);
3839 				(void) printf("ksize %d", d->d_ksize);
3840 				if (d->d_flags & D_BIGKEY)
3841 					(void) printf(" (indirect)");
3842 				(void) printf("; dsize %d", d->d_dsize);
3843 				if (d->d_flags & D_BIGDATA)
3844 					(void) printf(" (indirect)");
3845 			} else {
3846 				id = (IDATUM *) GETDATUM(h,cur);
3847 				(void) printf("size %d pgno %d",
3848 					id->i_size, id->i_pgno);
3849 				if (id->i_flags & D_BIGKEY)
3850 					(void) printf(" (indirect)");
3851 			}
3852 			(void) printf("\n");
3853 		}
3854 		(void) printf("\n");
3855 	}
3856 }
3857 #endif /* DEBUG */
3858 
3859 /*
3860  *  _BT_CMP -- Compare a key to a given item on the current page.
3861  *
3862  *	This routine is a wrapper for the user's comparison function.
3863  *
3864  *	Parameters:
3865  *		t -- tree in which to do comparison
3866  *		p -- pointer to one argument for the comparison function
3867  *		n -- index of item to supply second arg to comparison function
3868  *
3869  *	Returns:
3870  *		< 0 if p is < item at n
3871  *		= 0 if p is = item at n
3872  *		> 0 if p is > item at n
3873  */
3874 
3875 static int
3876 _bt_cmp(t, p, n)
3877 	BTREE_P t;
3878 	char *p;
3879 	index_t n;
3880 {
3881 	BTHEADER *h;
3882 	IDATUM *id;
3883 	DATUM *d;
3884 	char *arg;
3885 	int ignore;
3886 	int free_arg;
3887 	pgno_t chain;
3888 	int r;
3889 
3890 	h = t->bt_curpage;
3891 
3892 	/*
3893 	 *  The left-most key at any level of the tree on internal pages
3894 	 *  is guaranteed (artificially, by the code here) to be less than
3895 	 *  any user key.  This saves us from having to update the leftmost
3896 	 *  key when the user inserts a new key in the tree smaller than
3897 	 *  anything we've seen yet.
3898 	 */
3899 
3900 	if (h->h_prevpg == P_NONE && !(h->h_flags & F_LEAF) && n == 0)
3901 		return (1);
3902 
3903 	if (h->h_flags & F_LEAF) {
3904 		d = (DATUM *) GETDATUM(h,n);
3905 		if (d->d_flags & D_BIGKEY) {
3906 			free_arg = TRUE;
3907 			bcopy(&(d->d_bytes[0]), (char *) &chain, sizeof(chain));
3908 			if (_bt_getbig(t, chain, &arg, &ignore) == RET_ERROR)
3909 				return (RET_ERROR);
3910 		} else {
3911 			free_arg = FALSE;
3912 			arg = &(d->d_bytes[0]);
3913 		}
3914 	} else {
3915 		id = (IDATUM *) GETDATUM(h,n);
3916 		if (id->i_flags & D_BIGKEY) {
3917 			free_arg = TRUE;
3918 			bcopy(&(id->i_bytes[0]),
3919 			      (char *) &chain,
3920 			      sizeof(chain));
3921 			if (_bt_getbig(t, chain, &arg, &ignore) == RET_ERROR)
3922 				return (RET_ERROR);
3923 		} else {
3924 			free_arg = FALSE;
3925 			arg = &(id->i_bytes[0]);
3926 		}
3927 	}
3928 	r = (*(t->bt_compare))(p, arg);
3929 
3930 	if (free_arg)
3931 		(void) free(arg);
3932 
3933 	return (r);
3934 }
3935 
3936 /*
3937  *  _BT_PUSH/_BT_POP -- Push/pop a parent page number on the parent stack.
3938  *
3939  *	When we descend the tree, we keep track of parent pages in order
3940  *	to handle splits on insertions.
3941  *
3942  *	Parameters:
3943  *		t -- tree for which to push parent
3944  *		pgno -- page number to push (_bt_push only)
3945  *
3946  *	Returns:
3947  *		None.
3948  */
3949 
3950 static
3951 _bt_push(t, pgno)
3952 	BTREE_P t;
3953 	pgno_t pgno;
3954 {
3955 	BTSTACK *new;
3956 
3957 	new = (BTSTACK *) malloc((unsigned) sizeof(BTSTACK));
3958 	new->bts_pgno = pgno;
3959 	new->bts_next = t->bt_stack;
3960 	t->bt_stack = new;
3961 }
3962 
3963 static
3964 _bt_pop(t)
3965 	BTREE_P t;
3966 {
3967 	BTSTACK *s;
3968 	pgno_t p = P_NONE;
3969 
3970 	if ((s = t->bt_stack) != (BTSTACK *) NULL) {
3971 		p = s->bts_pgno;
3972 		t->bt_stack = s->bts_next;
3973 		(void) free ((char *) s);
3974 	}
3975 	return (p);
3976 }
3977 
3978 #ifdef DEBUG
3979 _punt()
3980 {
3981 	int pid;
3982 
3983 	pid = getpid();
3984 	(void) kill(pid, SIGILL);
3985 }
3986 #endif /* DEBUG */
3987