xref: /csrg-svn/lib/libc/db/btree/bt_open.c (revision 45876)
1*45876Sbostic /*-
2*45876Sbostic  * Copyright (c) 1990 The Regents of the University of California.
3*45876Sbostic  * All rights reserved.
4*45876Sbostic  *
5*45876Sbostic  * This code is derived from software contributed to Berkeley by
6*45876Sbostic  * Mike Olson.
7*45876Sbostic  *
8*45876Sbostic  * %sccs.include.redist.c%
9*45876Sbostic  */
10*45876Sbostic 
11*45876Sbostic #if defined(LIBC_SCCS) && !defined(lint)
12*45876Sbostic static char sccsid[] = "@(#)bt_open.c	5.1 (Berkeley) 01/07/91";
13*45876Sbostic #endif /* LIBC_SCCS and not lint */
14*45876Sbostic 
15*45876Sbostic /*
16*45876Sbostic  *  btree.c -- implementation of btree access method for 4.4BSD.
17*45876Sbostic  *
18*45876Sbostic  *	The design here is based on that of the btree access method used
19*45876Sbostic  *	in the Postgres database system at UC Berkeley.  The implementation
20*45876Sbostic  *	is wholly independent of the Postgres code.
21*45876Sbostic  *
22*45876Sbostic  *	This implementation supports btrees on disk (supply a filename) or
23*45876Sbostic  *	in memory (don't).  Public interfaces defined here are:
24*45876Sbostic  *
25*45876Sbostic  *		btree_open()	-- wrapper; returns a filled DB struct for
26*45876Sbostic  *				   a btree.
27*45876Sbostic  *
28*45876Sbostic  *		bt_open()	-- open a new or existing btree.
29*45876Sbostic  *		bt_get()	-- fetch data from a tree by key.
30*45876Sbostic  *		bt_seq()	-- do a sequential scan on a tree.
31*45876Sbostic  *		bt_put()	-- add data to a tree by key.
32*45876Sbostic  *		bt_delete()	-- remove data from a tree by key.
33*45876Sbostic  *		bt_close()	-- close a btree.
34*45876Sbostic  *		bt_sync()	-- force btree pages to disk (disk trees only).
35*45876Sbostic  */
36*45876Sbostic 
37*45876Sbostic #include <sys/param.h>
38*45876Sbostic #include <sys/file.h>
39*45876Sbostic #include <sys/stat.h>
40*45876Sbostic #include <sys/signal.h>
41*45876Sbostic #include <sys/errno.h>
42*45876Sbostic #include <db.h>
43*45876Sbostic 
44*45876Sbostic #ifndef BLSWAP
45*45876Sbostic #define BLSWAP(a) { \
46*45876Sbostic         register unsigned long _t; \
47*45876Sbostic         _t = (a); \
48*45876Sbostic         (a) >>= 24; \
49*45876Sbostic         (a) |= (_t & 0x00ff0000) >>  8; \
50*45876Sbostic         (a) |= (_t & 0x0000ff00) <<  8; \
51*45876Sbostic         (a) |= (_t & 0x000000ff) << 24; \
52*45876Sbostic }
53*45876Sbostic #endif /* ndef BLSWAP */
54*45876Sbostic 
55*45876Sbostic typedef char	*BTREE;		/* should really be (void *) */
56*45876Sbostic 
57*45876Sbostic /* #define	DEBUG */
58*45876Sbostic 
59*45876Sbostic #define RET_ERROR	-1
60*45876Sbostic #define RET_SUCCESS	 0
61*45876Sbostic #define RET_SPECIAL	 1
62*45876Sbostic 
63*45876Sbostic #ifndef TRUE
64*45876Sbostic #define TRUE	1
65*45876Sbostic #define FALSE	0
66*45876Sbostic #endif /* ndef TRUE */
67*45876Sbostic 
68*45876Sbostic extern int errno;
69*45876Sbostic 
70*45876Sbostic /* libc */
71*45876Sbostic extern char *malloc();
72*45876Sbostic 
73*45876Sbostic /* these are defined in lrucache.c */
74*45876Sbostic extern char	*lruinit();
75*45876Sbostic extern char	*lruget();
76*45876Sbostic extern char	*lrugetnew();
77*45876Sbostic extern int	lrusync();
78*45876Sbostic extern int	lruwrite();
79*45876Sbostic extern int	lrurelease();
80*45876Sbostic extern void	lrufree();
81*45876Sbostic 
82*45876Sbostic /* these are defined here */
83*45876Sbostic BTREE	bt_open();
84*45876Sbostic int	bt_close();
85*45876Sbostic int	bt_delete();
86*45876Sbostic int	bt_get();
87*45876Sbostic int	bt_put();
88*45876Sbostic int	bt_seq();
89*45876Sbostic int	bt_sync();
90*45876Sbostic 
91*45876Sbostic /*
92*45876Sbostic  *  Private types.  What you choose for these depends on how big you
93*45876Sbostic  *  want to let files get, and how big you want to let pages get.
94*45876Sbostic  */
95*45876Sbostic 
96*45876Sbostic typedef u_long	index_t;	/* so # bytes on a page fits in a long */
97*45876Sbostic typedef u_long	pgno_t;		/* so # of pages in a btree fits in a long */
98*45876Sbostic 
99*45876Sbostic /*
100*45876Sbostic  *  When we do searches, we push the parent page numbers onto a stack
101*45876Sbostic  *  as we descend the tree.  This is so that for insertions, we can
102*45876Sbostic  *  find our way back up to do internal page insertions and splits.
103*45876Sbostic  */
104*45876Sbostic 
105*45876Sbostic typedef struct BTSTACK {
106*45876Sbostic 	pgno_t		bts_pgno;
107*45876Sbostic 	struct BTSTACK	*bts_next;
108*45876Sbostic } BTSTACK;
109*45876Sbostic 
110*45876Sbostic /*
111*45876Sbostic  *  Every btree page has a header that looks like this.  Flags are given
112*45876Sbostic  *  in the #define's for the F_ flags (see below).
113*45876Sbostic  */
114*45876Sbostic 
115*45876Sbostic typedef struct BTHEADER {
116*45876Sbostic 	pgno_t h_pgno;		/* page number of this page */
117*45876Sbostic 	pgno_t h_prevpg;	/* left sibling */
118*45876Sbostic 	pgno_t h_nextpg;	/* right sibling */
119*45876Sbostic 
120*45876Sbostic #define F_LEAF		0x01	/* leaf page, contains user data */
121*45876Sbostic #define F_CONT		0x02	/* continuation page (large items) */
122*45876Sbostic #define F_DIRTY		0x04	/* need to write to disk */
123*45876Sbostic #define F_PRESERVE	0x08	/* never delete this chain of pages */
124*45876Sbostic 
125*45876Sbostic 	u_long h_flags;		/* page state */
126*45876Sbostic 	index_t h_lower;	/* lower bound of free space on page */
127*45876Sbostic 	index_t h_upper;	/* upper bound of free space on page */
128*45876Sbostic 	index_t h_linp[1];	/* VARIABLE LENGTH DATA AT END OF STRUCT */
129*45876Sbostic } BTHEADER;
130*45876Sbostic 
131*45876Sbostic /*
132*45876Sbostic  *  HTBUCKETs are hash table buckets for looking up pages of in-memory
133*45876Sbostic  *  btrees by page number.  We use this indirection, rather than direct
134*45876Sbostic  *  pointers, so that the code for manipulating in-memory trees is the
135*45876Sbostic  *  same as that for manipulating on-disk trees.
136*45876Sbostic  */
137*45876Sbostic 
138*45876Sbostic typedef struct HTBUCKET {
139*45876Sbostic 	pgno_t		ht_pgno;
140*45876Sbostic 	BTHEADER	*ht_page;
141*45876Sbostic 	struct HTBUCKET	*ht_next;
142*45876Sbostic } HTBUCKET;
143*45876Sbostic 
144*45876Sbostic typedef HTBUCKET	**HTABLE;
145*45876Sbostic 
146*45876Sbostic /* minimum size we'll let a page be */
147*45876Sbostic #define MINPSIZE	512
148*45876Sbostic 
149*45876Sbostic /* default cache size, in bytes */
150*45876Sbostic #define DEFCACHE	(20 * 1024)
151*45876Sbostic 
152*45876Sbostic /* hash table size for in-memory trees */
153*45876Sbostic #define	HTSIZE		128
154*45876Sbostic 
155*45876Sbostic /* generate a hash key from a page number */
156*45876Sbostic #define HASHKEY(pgno)	((pgno - 1) % HTSIZE)
157*45876Sbostic 
158*45876Sbostic /*
159*45876Sbostic  *  Disk btrees have a file descriptor, and may also have an lru buffer
160*45876Sbostic  *  cache, if the user asked for one.
161*45876Sbostic  */
162*45876Sbostic 
163*45876Sbostic typedef struct BTDISK {
164*45876Sbostic 	int	d_fd;
165*45876Sbostic 	char	*d_cache;
166*45876Sbostic } BTDISK;
167*45876Sbostic 
168*45876Sbostic /*
169*45876Sbostic  *  Cursors keep track of the current location in a sequential scan of
170*45876Sbostic  *  the database.  Since btrees impose a total ordering on keys, we can
171*45876Sbostic  *  walk forward or backward through the database from any point.  Cursors
172*45876Sbostic  *  survive updates to the tree, and can be used to delete a particular
173*45876Sbostic  *  record.
174*45876Sbostic  */
175*45876Sbostic 
176*45876Sbostic typedef struct CURSOR {
177*45876Sbostic 	pgno_t		c_pgno;		/* pgno of current item in scan */
178*45876Sbostic 	index_t		c_index;	/* index of current item in scan */
179*45876Sbostic 	char		*c_key;		/* current key, used for updates */
180*45876Sbostic 
181*45876Sbostic #define CRSR_BEFORE	0x01
182*45876Sbostic 
183*45876Sbostic 	u_char		c_flags;	/* to handle updates properly */
184*45876Sbostic } CURSOR;
185*45876Sbostic 
186*45876Sbostic /*
187*45876Sbostic  *  The private btree data structure.  The user passes a pointer to one of
188*45876Sbostic  *  these when we are to manipulate a tree, but the BTREE type is opaque
189*45876Sbostic  *  to him.
190*45876Sbostic  */
191*45876Sbostic 
192*45876Sbostic typedef struct BTREEDATA_P {
193*45876Sbostic 	char		*bt_fname;		/* NULL for in-memory trees */
194*45876Sbostic 	union {
195*45876Sbostic 		BTDISK	bt_d;			/* for on-disk btrees */
196*45876Sbostic 		HTABLE	bt_ht;			/* hash table for mem trees */
197*45876Sbostic 	} bt_s;
198*45876Sbostic 	size_t		bt_psize;		/* page size for btree pages */
199*45876Sbostic 	int		(*bt_compare)();	/* key comparison function */
200*45876Sbostic 	pgno_t		bt_npages;		/* number of pages in tree */
201*45876Sbostic 	BTHEADER	*bt_curpage;		/* current page contents */
202*45876Sbostic 	pgno_t		bt_free;		/* free pg list for big data */
203*45876Sbostic 	CURSOR		bt_cursor;		/* cursor for scans */
204*45876Sbostic 	BTSTACK		*bt_stack;		/* parent stack for inserts */
205*45876Sbostic 	u_long		bt_lorder;		/* byte order (endian.h) */
206*45876Sbostic 
207*45876Sbostic #define BTF_METAOK	0x01	/* meta-data written to start of file */
208*45876Sbostic #define BTF_SEQINIT	0x02	/* we have called bt_seq */
209*45876Sbostic #define BTF_ISWRITE	0x04	/* tree was opened for write */
210*45876Sbostic #define BTF_NODUPS	0x08	/* tree created for unique keys */
211*45876Sbostic 
212*45876Sbostic 	u_long		bt_flags;		/* btree state */
213*45876Sbostic } BTREEDATA_P;
214*45876Sbostic 
215*45876Sbostic typedef BTREEDATA_P	*BTREE_P;
216*45876Sbostic 
217*45876Sbostic /*
218*45876Sbostic  *  The first thing in a btree file is a BTMETA structure.  The rest of
219*45876Sbostic  *  the first page is empty, so that all disk operations are page-aligned.
220*45876Sbostic  */
221*45876Sbostic 
222*45876Sbostic typedef struct BTMETA {
223*45876Sbostic 	u_long	m_magic;
224*45876Sbostic 	u_long	m_version;
225*45876Sbostic 	size_t	m_psize;
226*45876Sbostic 	pgno_t	m_free;
227*45876Sbostic 	u_long	m_flags;
228*45876Sbostic 	u_long	m_lorder;
229*45876Sbostic } BTMETA;
230*45876Sbostic 
231*45876Sbostic #define P_NONE		0		/* invalid page number in tree */
232*45876Sbostic #define P_ROOT		1		/* page number of root pg in btree */
233*45876Sbostic 
234*45876Sbostic #define NORELEASE	0		/* don't release a page during write */
235*45876Sbostic #define RELEASE		1		/* release a page during write */
236*45876Sbostic 
237*45876Sbostic #define INSERT		0		/* doing an insert operation */
238*45876Sbostic #define DELETE		1		/* doing a delete operation */
239*45876Sbostic 
240*45876Sbostic /* magic number for identifying btree files */
241*45876Sbostic #define BTREEMAGIC	053162L		/* magic */
242*45876Sbostic #define BTREEVERSION	2		/* last changed 6 jan 1991 */
243*45876Sbostic 
244*45876Sbostic /* get the next free index on a btree page */
245*45876Sbostic #define NEXTINDEX(p)	((((int)(p)->h_lower) - ((int)((((char *)(&(p)->h_linp[0]))) - ((char *) (p)))))/(sizeof(index_t)))
246*45876Sbostic 
247*45876Sbostic /* is a BTITEM actually on the btree page? */
248*45876Sbostic #define VALIDITEM(t, i)	((i)->bti_index < NEXTINDEX((t)->bt_curpage))
249*45876Sbostic 
250*45876Sbostic /* guarantee longword alignment so structure refs work */
251*45876Sbostic #define LONGALIGN(p) (((long)(p) + 3) & ~ 0x03)
252*45876Sbostic 
253*45876Sbostic /* get a particular datum (or idatum) off a page */
254*45876Sbostic #define GETDATUM(h,i)	 (((char *) h) + h->h_linp[i])
255*45876Sbostic 
256*45876Sbostic /* is a {key,datum} too big to put on a single page? */
257*45876Sbostic #define TOOBIG(t, sz)	(sz >= t->bt_psize / 5)
258*45876Sbostic 
259*45876Sbostic /* is this a disk tree or a memory tree? */
260*45876Sbostic #define ISDISK(t)	(t->bt_fname != (char *) NULL)
261*45876Sbostic 
262*45876Sbostic /* does the disk tree use a cache? */
263*45876Sbostic #define ISCACHE(t)	(t->bt_s.bt_d.d_cache != (char *) NULL)
264*45876Sbostic 
265*45876Sbostic /*
266*45876Sbostic  *  DATUMs are for user data -- one appears on leaf pages for every
267*45876Sbostic  *  tree entry.  The d_bytes[] array contains the key first, then the data.
268*45876Sbostic  *
269*45876Sbostic  *  If either the key or the datum is too big to store on a single page,
270*45876Sbostic  *  a bit is set in the flags entry, and the d_bytes[] array contains a
271*45876Sbostic  *  pgno pointing to the page at which the data is actually stored.
272*45876Sbostic  *
273*45876Sbostic  *  Note on alignment:  every DATUM is guaranteed to be longword aligned
274*45876Sbostic  *  on the disk page.  In order to force longword alignment of user key
275*45876Sbostic  *  and data values, we must guarantee that the d_bytes[] array starts
276*45876Sbostic  *  on a longword boundary.  This is the reason that d_flags is a u_long,
277*45876Sbostic  *  rather than a u_char (it really only needs to be two bits big).  This
278*45876Sbostic  *  is necessary because we call the user's comparison function with a
279*45876Sbostic  *  pointer to the start of the d_bytes array.  We don't need to force
280*45876Sbostic  *  longword alignment of the data following the key, since that is copied
281*45876Sbostic  *  to a longword-aligned buffer before being returned to the user.
282*45876Sbostic  */
283*45876Sbostic 
284*45876Sbostic typedef struct DATUM {
285*45876Sbostic 	size_t d_ksize;		/* size of key */
286*45876Sbostic 	size_t d_dsize;		/* size of data */
287*45876Sbostic 
288*45876Sbostic #define D_BIGDATA	0x01	/* indirect datum ptr flag */
289*45876Sbostic #define D_BIGKEY	0x02	/* indirect key ptr flag */
290*45876Sbostic 
291*45876Sbostic 	u_long d_flags;		/* flags (indirect bit) */
292*45876Sbostic 	char d_bytes[1];	/* VARIABLE LENGTH DATA AT END OF STRUCT */
293*45876Sbostic } DATUM;
294*45876Sbostic 
295*45876Sbostic /* BTITEMs are used to return (page, index, datum) tuples from searches */
296*45876Sbostic typedef struct BTITEM {
297*45876Sbostic 	pgno_t bti_pgno;
298*45876Sbostic 	index_t bti_index;
299*45876Sbostic 	DATUM *bti_datum;
300*45876Sbostic } BTITEM;
301*45876Sbostic 
302*45876Sbostic /*
303*45876Sbostic  *  IDATUMs are for data stored on internal pages.  This is the (key, pgno)
304*45876Sbostic  *  pair, such that key 'key' is the first entry on page 'pgno'.  If our
305*45876Sbostic  *  internal page contains keys (a) and (b) next to each other, then all
306*45876Sbostic  *  items >= to (a) and < (b) go on the same page as (a).  There are some
307*45876Sbostic  *  gotchas with duplicate keys, however.  See the split code for details.
308*45876Sbostic  *
309*45876Sbostic  *  If a key is too big to fit on a single page, then the i_bytes[] array
310*45876Sbostic  *  contains a pgno pointing to the start of a chain that actually stores
311*45876Sbostic  *  the bytes.  Since items on internal pages are never deleted from the
312*45876Sbostic  *  tree, these indirect chains are marked as special, so that they won't
313*45876Sbostic  *  be deleted if the corresponding leaf item is deleted.
314*45876Sbostic  *
315*45876Sbostic  *  As for DATUMs, IDATUMs have a u_long flag entry (rather than u_char)
316*45876Sbostic  *  in order to guarantee that user keys are longword aligned on the disk
317*45876Sbostic  *  page.
318*45876Sbostic  */
319*45876Sbostic 
320*45876Sbostic typedef struct IDATUM {
321*45876Sbostic 	size_t i_size;
322*45876Sbostic 	pgno_t i_pgno;
323*45876Sbostic 	u_long i_flags;		/* see DATUM.d_flags, above */
324*45876Sbostic 	char i_bytes[1];	/* VARIABLE LENGTH DATA AT END OF STRUCT */
325*45876Sbostic } IDATUM;
326*45876Sbostic 
327*45876Sbostic /* all private interfaces have a leading _ in their names */
328*45876Sbostic static BTITEM	*_bt_search();
329*45876Sbostic static BTITEM	*_bt_searchr();
330*45876Sbostic static BTHEADER	*_bt_allocpg();
331*45876Sbostic static index_t	_bt_binsrch();
332*45876Sbostic static int	_bt_isonpage();
333*45876Sbostic static BTITEM	*_bt_first();
334*45876Sbostic static int	_bt_release();
335*45876Sbostic static int	_bt_wrtmeta();
336*45876Sbostic static int	_bt_delindir();
337*45876Sbostic static int	_bt_pgout();
338*45876Sbostic static int	_bt_pgin();
339*45876Sbostic static int	_bt_fixscan();
340*45876Sbostic static int	_bt_indirect();
341*45876Sbostic static int	_bt_crsrdel();
342*45876Sbostic 
343*45876Sbostic extern int strcmp();
344*45876Sbostic 
345*45876Sbostic static BTREEINFO _DefaultBTInfo = {
346*45876Sbostic 	0,	/* flags */
347*45876Sbostic 	0,	/* cachesize */
348*45876Sbostic 	0,	/* psize */
349*45876Sbostic 	strcmp,	/* compare */
350*45876Sbostic 	0
351*45876Sbostic };
352*45876Sbostic 
353*45876Sbostic /*
354*45876Sbostic  *  BTREE_OPEN -- Wrapper routine to open a btree.
355*45876Sbostic  *
356*45876Sbostic  *	Creates and fills a DB struct, and calls the routine that actually
357*45876Sbostic  *	opens the btree.
358*45876Sbostic  *
359*45876Sbostic  *	Parameters:
360*45876Sbostic  *		f:  filename to open
361*45876Sbostic  *		flags:  flag bits passed to open
362*45876Sbostic  *		mode:  permission bits, used if O_CREAT specified
363*45876Sbostic  *		b:  BTREEINFO pointer
364*45876Sbostic  *
365*45876Sbostic  *	Returns:
366*45876Sbostic  *		Filled-in DBT on success; NULL on failure, with errno
367*45876Sbostic  *		set as appropriate.
368*45876Sbostic  *
369*45876Sbostic  *	Side Effects:
370*45876Sbostic  *		Allocates memory for the DB struct.
371*45876Sbostic  */
372*45876Sbostic 
373*45876Sbostic DB *
374*45876Sbostic btree_open(f, flags, mode, b)
375*45876Sbostic 	char *f;
376*45876Sbostic 	int flags;
377*45876Sbostic 	int mode;
378*45876Sbostic 	BTREEINFO *b;
379*45876Sbostic {
380*45876Sbostic 	DB *db;
381*45876Sbostic 	BTREE t;
382*45876Sbostic 
383*45876Sbostic 	if ((db = (DB *) malloc((unsigned) sizeof(DB))) == (DB *) NULL)
384*45876Sbostic 		return ((DB *) NULL);
385*45876Sbostic 
386*45876Sbostic 	if ((t = bt_open(f, flags, mode, b)) == (BTREE) NULL) {
387*45876Sbostic 		(void) free ((char *) db);
388*45876Sbostic 		return ((DB *) NULL);
389*45876Sbostic 	}
390*45876Sbostic 
391*45876Sbostic 	db->internal = (char *) t;
392*45876Sbostic 	db->close = bt_close;
393*45876Sbostic 	db->delete = bt_delete;
394*45876Sbostic 	db->get = bt_get;
395*45876Sbostic 	db->put = bt_put;
396*45876Sbostic 	db->seq = bt_seq;
397*45876Sbostic 	db->sync = bt_sync;
398*45876Sbostic 
399*45876Sbostic 	return (db);
400*45876Sbostic }
401*45876Sbostic 
402*45876Sbostic /*
403*45876Sbostic  *  BT_OPEN -- Open a btree.
404*45876Sbostic  *
405*45876Sbostic  *	This routine creates the correct kind (disk or in-memory) of
406*45876Sbostic  *	btree and initializes it as required.
407*45876Sbostic  *
408*45876Sbostic  *	Parameters:
409*45876Sbostic  *		f -- name of btree (NULL for in-memory btrees)
410*45876Sbostic  *		flags -- flags passed to open()
411*45876Sbostic  *		mode -- mode passed to open ()
412*45876Sbostic  *		b -- BTREEINFO structure, describing btree
413*45876Sbostic  *
414*45876Sbostic  *	Returns:
415*45876Sbostic  *		(Opaque) pointer to the btree.  On failure, returns NULL
416*45876Sbostic  *		with errno set as appropriate.
417*45876Sbostic  *
418*45876Sbostic  *	Side Effects:
419*45876Sbostic  *		Allocates memory, opens files.
420*45876Sbostic  */
421*45876Sbostic 
422*45876Sbostic BTREE
423*45876Sbostic bt_open(f, flags, mode, b)
424*45876Sbostic 	char *f;
425*45876Sbostic 	int flags;
426*45876Sbostic 	int mode;
427*45876Sbostic 	BTREEINFO *b;
428*45876Sbostic {
429*45876Sbostic 	BTREE_P t;
430*45876Sbostic 	HTABLE ht;
431*45876Sbostic 	int nbytes;
432*45876Sbostic 	int fd;
433*45876Sbostic 	CURSOR *c;
434*45876Sbostic 	BTMETA m;
435*45876Sbostic 	struct stat buf;
436*45876Sbostic 
437*45876Sbostic 	/* use the default info if none was provided */
438*45876Sbostic 	if (b == (BTREEINFO *) NULL)
439*45876Sbostic 		b = &_DefaultBTInfo;
440*45876Sbostic 
441*45876Sbostic 	if ((t = (BTREE_P) malloc((unsigned) sizeof *t)) == (BTREE_P) NULL)
442*45876Sbostic 		return ((BTREE) NULL);
443*45876Sbostic 
444*45876Sbostic 	if (b->compare)
445*45876Sbostic 		t->bt_compare = b->compare;
446*45876Sbostic 	else
447*45876Sbostic 		t->bt_compare = strcmp;
448*45876Sbostic 
449*45876Sbostic 	t->bt_fname = f;
450*45876Sbostic 	t->bt_curpage = (BTHEADER *) NULL;
451*45876Sbostic 	t->bt_free = P_NONE;
452*45876Sbostic 	c = &(t->bt_cursor);
453*45876Sbostic 	c->c_pgno = P_NONE;
454*45876Sbostic 	c->c_index = 0;
455*45876Sbostic 	c->c_flags = (u_char) NULL;
456*45876Sbostic 	c->c_key = (char *) NULL;
457*45876Sbostic 	t->bt_stack = (BTSTACK *) NULL;
458*45876Sbostic 	t->bt_flags = 0;
459*45876Sbostic 
460*45876Sbostic 	/*
461*45876Sbostic 	 *  If no file name was supplied, this is an in-memory btree.
462*45876Sbostic 	 *  Otherwise, it's a disk-based btree.
463*45876Sbostic 	 */
464*45876Sbostic 	if (f == (char *) NULL) {
465*45876Sbostic 		/* in memory */
466*45876Sbostic 		if ((t->bt_psize = b->psize) < MINPSIZE) {
467*45876Sbostic 			if (t->bt_psize != 0) {
468*45876Sbostic 				(void) free ((char *) t);
469*45876Sbostic 				errno = EINVAL;
470*45876Sbostic 				return ((BTREE) NULL);
471*45876Sbostic 			}
472*45876Sbostic 			t->bt_psize = getpagesize();
473*45876Sbostic 		}
474*45876Sbostic 
475*45876Sbostic 		nbytes = HTSIZE * sizeof(HTBUCKET *);
476*45876Sbostic 		if ((ht = (HTABLE) malloc((unsigned) nbytes))
477*45876Sbostic 		    == (HTABLE) NULL) {
478*45876Sbostic 			(void) free((char *) t);
479*45876Sbostic 			return ((BTREE) NULL);
480*45876Sbostic 		}
481*45876Sbostic 		(void) bzero((char *) ht, nbytes);
482*45876Sbostic 		t->bt_s.bt_ht = ht;
483*45876Sbostic 		t->bt_npages = 0;
484*45876Sbostic 		t->bt_lorder = BYTE_ORDER;
485*45876Sbostic 		if (!(b->flags & R_DUP))
486*45876Sbostic 			t->bt_flags |= BTF_NODUPS;
487*45876Sbostic 	} else {
488*45876Sbostic 		/* on disk */
489*45876Sbostic 		if ((fd = open(f, O_RDONLY, 0)) < 0) {
490*45876Sbostic 			/* doesn't exist yet, be sure page is big enough */
491*45876Sbostic 			if ((t->bt_psize = b->psize) < sizeof(BTHEADER)
492*45876Sbostic 			    && b->psize != 0) {
493*45876Sbostic 				(void) free((char *) t);
494*45876Sbostic 				errno = EINVAL;
495*45876Sbostic 				return ((BTREE) NULL);
496*45876Sbostic 			}
497*45876Sbostic 			if (b->lorder == 0)
498*45876Sbostic 				b->lorder = BYTE_ORDER;
499*45876Sbostic 
500*45876Sbostic 			if (b->lorder != BIG_ENDIAN
501*45876Sbostic 			    && b->lorder != LITTLE_ENDIAN) {
502*45876Sbostic 				(void) free((char *) t);
503*45876Sbostic 				errno = EINVAL;
504*45876Sbostic 				return ((BTREE) NULL);
505*45876Sbostic 			}
506*45876Sbostic 			t->bt_lorder = b->lorder;
507*45876Sbostic 			if (!(b->flags & R_DUP))
508*45876Sbostic 				t->bt_flags |= BTF_NODUPS;
509*45876Sbostic 		} else {
510*45876Sbostic 			/* exists, get page size from file */
511*45876Sbostic 			if (read(fd, &m, sizeof(m)) < sizeof(m)) {
512*45876Sbostic 				(void) close(fd);
513*45876Sbostic 				(void) free((char *) t);
514*45876Sbostic 				errno = EINVAL;
515*45876Sbostic 				return ((BTREE) NULL);
516*45876Sbostic 			}
517*45876Sbostic 
518*45876Sbostic 			/* lorder always stored in host-independent format */
519*45876Sbostic 			NTOHL(m.m_lorder);
520*45876Sbostic 			if (m.m_lorder != BIG_ENDIAN
521*45876Sbostic 			    && m.m_lorder != LITTLE_ENDIAN) {
522*45876Sbostic 				(void) free((char *) t);
523*45876Sbostic 				errno = EINVAL;
524*45876Sbostic 				return ((BTREE) NULL);
525*45876Sbostic 			}
526*45876Sbostic 			t->bt_lorder = m.m_lorder;
527*45876Sbostic 
528*45876Sbostic 			if (t->bt_lorder != BYTE_ORDER) {
529*45876Sbostic 				BLSWAP(m.m_magic);
530*45876Sbostic 				BLSWAP(m.m_version);
531*45876Sbostic 				BLSWAP(m.m_psize);
532*45876Sbostic 				BLSWAP(m.m_free);
533*45876Sbostic 				BLSWAP(m.m_flags);
534*45876Sbostic 			}
535*45876Sbostic 
536*45876Sbostic 			if (m.m_magic != BTREEMAGIC
537*45876Sbostic 			    || m.m_version != BTREEVERSION
538*45876Sbostic 			    || m.m_psize < MINPSIZE) {
539*45876Sbostic 				(void) close(fd);
540*45876Sbostic 				(void) free((char *) t);
541*45876Sbostic #ifndef EFTYPE
542*45876Sbostic #define EFTYPE	-100
543*45876Sbostic #endif
544*45876Sbostic 				errno = EFTYPE;
545*45876Sbostic 				return ((BTREE) NULL);
546*45876Sbostic 			}
547*45876Sbostic 			t->bt_psize = m.m_psize;
548*45876Sbostic 			t->bt_free = m.m_free;
549*45876Sbostic 			t->bt_flags |= (m.m_flags & BTF_NODUPS) | BTF_METAOK;
550*45876Sbostic 			(void) close(fd);
551*45876Sbostic 		}
552*45876Sbostic 
553*45876Sbostic 		/* now open the file the way the user wanted it */
554*45876Sbostic 		if ((t->bt_s.bt_d.d_fd = open(f, flags, mode)) < 0) {
555*45876Sbostic 			(void) free ((char *) t);
556*45876Sbostic 			return ((BTREE) NULL);
557*45876Sbostic 		}
558*45876Sbostic 
559*45876Sbostic 		/* get number of pages, page size if necessary */
560*45876Sbostic 		(void) fstat(t->bt_s.bt_d.d_fd, &buf);
561*45876Sbostic 		if (t->bt_psize == 0)
562*45876Sbostic 			t->bt_psize = buf.st_blksize;
563*45876Sbostic 		t->bt_npages = (pgno_t) (buf.st_size / t->bt_psize);
564*45876Sbostic 
565*45876Sbostic 		/* page zero is metadata, doesn't count */
566*45876Sbostic 		if (t->bt_npages > 0)
567*45876Sbostic 			--(t->bt_npages);
568*45876Sbostic 
569*45876Sbostic 		if (b->cachesize == 0)
570*45876Sbostic 			b->cachesize = DEFCACHE;
571*45876Sbostic 
572*45876Sbostic 		/* get an lru buffer cache, if the user asked for one */
573*45876Sbostic 		if ((b->cachesize / t->bt_psize) > 0) {
574*45876Sbostic 			BTDISK *d = &(t->bt_s.bt_d);
575*45876Sbostic 
576*45876Sbostic 			d->d_cache = lruinit(d->d_fd,
577*45876Sbostic 					     b->cachesize / t->bt_psize,
578*45876Sbostic 					     t->bt_psize,
579*45876Sbostic 					     (char *) t->bt_lorder,
580*45876Sbostic 					     _bt_pgin, _bt_pgout);
581*45876Sbostic 
582*45876Sbostic 			if (d->d_cache == (char *) NULL) {
583*45876Sbostic 				(void) free((char *) t);
584*45876Sbostic 				return ((BTREE) NULL);
585*45876Sbostic 			}
586*45876Sbostic 		}
587*45876Sbostic 	}
588*45876Sbostic 
589*45876Sbostic 	/* remember if tree was opened for write */
590*45876Sbostic 	if (((flags & O_WRONLY) == O_WRONLY)
591*45876Sbostic 	    || ((flags & O_RDWR) == O_RDWR))
592*45876Sbostic 		t->bt_flags |= BTF_ISWRITE;
593*45876Sbostic 
594*45876Sbostic 	return ((BTREE) t);
595*45876Sbostic }
596*45876Sbostic 
597*45876Sbostic /*
598*45876Sbostic  *  BT_GET -- Get an entry from a btree.
599*45876Sbostic  *
600*45876Sbostic  *	Does a key lookup in the tree to find the specified key, and returns
601*45876Sbostic  *	the key/data pair found.
602*45876Sbostic  *
603*45876Sbostic  *	Parameters:
604*45876Sbostic  *		tree -- btree in which to do lookup
605*45876Sbostic  *		key -- key to find
606*45876Sbostic  *		data -- pointer to DBT in which to return data
607*45876Sbostic  *		flag -- ignored
608*45876Sbostic  *
609*45876Sbostic  *	Returns:
610*45876Sbostic  *		RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the key is not
611*45876Sbostic  *		found.  If key is not found, nothing is stored in the
612*45876Sbostic  *		return DBT 'data'.
613*45876Sbostic  *
614*45876Sbostic  *	Side Effects:
615*45876Sbostic  *		None.
616*45876Sbostic  *
617*45876Sbostic  *	Warnings:
618*45876Sbostic  *		Return data is statically allocated, and will be overwritten
619*45876Sbostic  *		at the next call.
620*45876Sbostic  */
621*45876Sbostic 
622*45876Sbostic int
623*45876Sbostic bt_get(tree, key, data, flag)
624*45876Sbostic 	BTREE tree;
625*45876Sbostic 	DBT *key;
626*45876Sbostic 	DBT *data;
627*45876Sbostic 	int flag;
628*45876Sbostic {
629*45876Sbostic 	BTREE_P t = (BTREE_P) tree;
630*45876Sbostic 	BTHEADER *h;
631*45876Sbostic 	DATUM *d;
632*45876Sbostic 	BTITEM *item;
633*45876Sbostic 
634*45876Sbostic 	/* lookup */
635*45876Sbostic 	item = _bt_search(t, key);
636*45876Sbostic 	if (item == (BTITEM *) NULL)
637*45876Sbostic 		return (RET_ERROR);
638*45876Sbostic 
639*45876Sbostic 	/* clear parent pointer stack */
640*45876Sbostic 	while (_bt_pop(t) != P_NONE)
641*45876Sbostic 		continue;
642*45876Sbostic 
643*45876Sbostic 	h = (BTHEADER *) t->bt_curpage;
644*45876Sbostic 	data->size = 0;
645*45876Sbostic 	data->data = (char *) NULL;
646*45876Sbostic 
647*45876Sbostic 	/* match? */
648*45876Sbostic 	if (VALIDITEM(t, item)
649*45876Sbostic 	    && _bt_cmp(t, key->data, item->bti_index) == 0) {
650*45876Sbostic 		d = (DATUM *) GETDATUM(h, item->bti_index);
651*45876Sbostic 		return (_bt_buildret(t, d, data, key));
652*45876Sbostic 	}
653*45876Sbostic 
654*45876Sbostic 	/* nope */
655*45876Sbostic 	return (RET_SPECIAL);
656*45876Sbostic }
657*45876Sbostic 
658*45876Sbostic /*
659*45876Sbostic  *  BT_PUT -- Add an entry to a btree.
660*45876Sbostic  *
661*45876Sbostic  *	The specified (key, data) pair is added to the tree.  If the tree
662*45876Sbostic  *	was created for unique keys only, then duplicates will not be
663*45876Sbostic  *	entered.  If the requested key exists in the tree, it will be over-
664*45876Sbostic  *	written unless the flags parameter is R_NOOVERWRITE, in which case
665*45876Sbostic  *	the update will not be done.  If duplicate keys are permitted in the
666*45876Sbostic  *	tree, duplicates will be inserted and will not overwrite existing
667*45876Sbostic  *	keys.  Nodes are split as required.
668*45876Sbostic  *
669*45876Sbostic  *	Parameters:
670*45876Sbostic  *		tree -- btree in which to put the new entry
671*45876Sbostic  *		key -- key component to add
672*45876Sbostic  *		data -- data corresponding to key
673*45876Sbostic  *		flag -- R_NOOVERWRITE or zero.
674*45876Sbostic  *
675*45876Sbostic  *	Returns:
676*45876Sbostic  *		RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the
677*45876Sbostic  *		NOOVERWRITE flag was set and the specified key exists
678*45876Sbostic  *		in the database.
679*45876Sbostic  *
680*45876Sbostic  *	Side Effects:
681*45876Sbostic  *		None.
682*45876Sbostic  */
683*45876Sbostic 
684*45876Sbostic int
685*45876Sbostic bt_put(tree, key, data, flag)
686*45876Sbostic 	BTREE tree;
687*45876Sbostic 	DBT *key;
688*45876Sbostic 	DBT *data;
689*45876Sbostic 	int flag;
690*45876Sbostic {
691*45876Sbostic 	BTREE_P t = (BTREE_P) tree;
692*45876Sbostic 	BTITEM *item;
693*45876Sbostic 
694*45876Sbostic 	/* look for this key in the tree */
695*45876Sbostic 	item = _bt_search(t, key);
696*45876Sbostic 
697*45876Sbostic 	/*
698*45876Sbostic 	 *  If this tree was originally created without R_DUP, then duplicate
699*45876Sbostic 	 *  keys are not allowed.  We need to check this at insertion time.
700*45876Sbostic 	 */
701*45876Sbostic 
702*45876Sbostic 	if (VALIDITEM(t, item) && _bt_cmp(t, key->data, item->bti_index) == 0) {
703*45876Sbostic 		if ((t->bt_flags & BTF_NODUPS) && flag == R_NOOVERWRITE) {
704*45876Sbostic 			if (_bt_delone(t, item->bti_index) == RET_ERROR)
705*45876Sbostic 				return (RET_ERROR);
706*45876Sbostic 		}
707*45876Sbostic 	}
708*45876Sbostic 
709*45876Sbostic 	return (_bt_insert(t, item, key, data, flag));
710*45876Sbostic }
711*45876Sbostic 
712*45876Sbostic /*
713*45876Sbostic  *  BT_DELETE -- delete a key from the tree.
714*45876Sbostic  *
715*45876Sbostic  *	Deletes all keys (and their associated data items) matching the
716*45876Sbostic  *	supplied key from the tree.  If the flags entry is R_CURSOR, then
717*45876Sbostic  *	the current item in the active scan is deleted.
718*45876Sbostic  *
719*45876Sbostic  *	Parameters:
720*45876Sbostic  *		tree -- btree from which to delete key
721*45876Sbostic  *		key -- key to delete
722*45876Sbostic  *		flags -- R_CURSOR or zero
723*45876Sbostic  *
724*45876Sbostic  *	Returns:
725*45876Sbostic  *		RET_SUCCESS, RET_ERROR, or RET_SPECIAL if the specified
726*45876Sbostic  *		key was not in the tree.
727*45876Sbostic  *
728*45876Sbostic  *	Side Effects:
729*45876Sbostic  *		None.
730*45876Sbostic  */
731*45876Sbostic 
732*45876Sbostic int
733*45876Sbostic bt_delete(tree, key, flags)
734*45876Sbostic 	BTREE tree;
735*45876Sbostic 	DBT *key;
736*45876Sbostic 	int flags;
737*45876Sbostic {
738*45876Sbostic 	BTREE_P t = (BTREE_P) tree;
739*45876Sbostic 	BTHEADER *h;
740*45876Sbostic 	BTITEM *item;
741*45876Sbostic 	int ndel = 0;
742*45876Sbostic 
743*45876Sbostic 	if (flags == R_CURSOR)
744*45876Sbostic 		return (_bt_crsrdel(t));
745*45876Sbostic 
746*45876Sbostic 	/* find the first matching key in the tree */
747*45876Sbostic 	item = _bt_first(t, key);
748*45876Sbostic 	h = t->bt_curpage;
749*45876Sbostic 
750*45876Sbostic 	/* delete all matching keys */
751*45876Sbostic 	for (;;) {
752*45876Sbostic 		while (VALIDITEM(t, item)
753*45876Sbostic 		       && (_bt_cmp(t, key->data, item->bti_index) == 0)) {
754*45876Sbostic 			if (_bt_delone(t, item->bti_index) == RET_ERROR)
755*45876Sbostic 				return (RET_ERROR);
756*45876Sbostic 			ndel++;
757*45876Sbostic 		}
758*45876Sbostic 
759*45876Sbostic 		if (VALIDITEM(t, item) || h->h_nextpg == P_NONE)
760*45876Sbostic 			break;
761*45876Sbostic 
762*45876Sbostic 		/* next page, if necessary */
763*45876Sbostic 		do {
764*45876Sbostic 			if (_bt_getpage(t, h->h_nextpg) == RET_ERROR)
765*45876Sbostic 				return (RET_ERROR);
766*45876Sbostic 			h = t->bt_curpage;
767*45876Sbostic 		} while (NEXTINDEX(h) == 0 && h->h_nextpg != P_NONE);
768*45876Sbostic 
769*45876Sbostic 		item->bti_pgno = h->h_pgno;
770*45876Sbostic 		item->bti_index = 0;
771*45876Sbostic 
772*45876Sbostic 		if (!VALIDITEM(t, item)
773*45876Sbostic 		    || _bt_cmp(t, key->data, item->bti_index) != 0)
774*45876Sbostic 			break;
775*45876Sbostic 	}
776*45876Sbostic 
777*45876Sbostic 	/* clean up the parent stack */
778*45876Sbostic 	while (_bt_pop(t) != P_NONE)
779*45876Sbostic 		continue;
780*45876Sbostic 
781*45876Sbostic 	/* flush changes to disk */
782*45876Sbostic 	if (ISDISK(t)) {
783*45876Sbostic 		if (h->h_flags & F_DIRTY) {
784*45876Sbostic 			if (_bt_write(t, t->bt_curpage, NORELEASE) == RET_ERROR)
785*45876Sbostic 				return (RET_ERROR);
786*45876Sbostic 		}
787*45876Sbostic 	}
788*45876Sbostic 
789*45876Sbostic 	if (ndel == 0)
790*45876Sbostic 		return (RET_SPECIAL);
791*45876Sbostic 
792*45876Sbostic 	return (RET_SUCCESS);
793*45876Sbostic }
794*45876Sbostic 
795*45876Sbostic /*
796*45876Sbostic  *  _BT_CRSRDEL -- Delete the item pointed to by the cursor.
797*45876Sbostic  *
798*45876Sbostic  *	This routine deletes the item most recently returned by a scan
799*45876Sbostic  *	through the tree.  Since it only makes sense to delete the current
800*45876Sbostic  *	record once, we make sure that we don't try to delete twice without
801*45876Sbostic  *	advancing the scan.
802*45876Sbostic  *
803*45876Sbostic  *	Parameters:
804*45876Sbostic  *		t -- tree in which to do deletion
805*45876Sbostic  *
806*45876Sbostic  *	Returns:
807*45876Sbostic  *		RET_SUCCESS, RET_ERROR.
808*45876Sbostic  *
809*45876Sbostic  *	Side Effects:
810*45876Sbostic  *		The call to _bt_delone marks the cursor, so we can tell that
811*45876Sbostic  *		the current record has been deleted.
812*45876Sbostic  */
813*45876Sbostic 
814*45876Sbostic static int
815*45876Sbostic _bt_crsrdel(t)
816*45876Sbostic 	BTREE_P t;
817*45876Sbostic {
818*45876Sbostic 	CURSOR *c;
819*45876Sbostic 
820*45876Sbostic 	c = &(t->bt_cursor);
821*45876Sbostic 
822*45876Sbostic 	/* a cursor must exist, and can't have deleted the current key yet */
823*45876Sbostic 	if (!(t->bt_flags & BTF_SEQINIT) || (c->c_flags & CRSR_BEFORE)) {
824*45876Sbostic 		errno = EINVAL;
825*45876Sbostic 		return (RET_ERROR);
826*45876Sbostic 	}
827*45876Sbostic 
828*45876Sbostic 	if (_bt_getpage(t, c->c_pgno) == RET_ERROR)
829*45876Sbostic 		return (RET_ERROR);
830*45876Sbostic 
831*45876Sbostic 	if (c->c_index >= NEXTINDEX(t->bt_curpage)) {
832*45876Sbostic 		errno = EINVAL;
833*45876Sbostic 		return (RET_ERROR);
834*45876Sbostic 	}
835*45876Sbostic 
836*45876Sbostic 	return (_bt_delone(t, c->c_index));
837*45876Sbostic }
838*45876Sbostic 
839*45876Sbostic /*
840*45876Sbostic  *  _BT_DELONE -- Delete a single entry from a btree.
841*45876Sbostic  *
842*45876Sbostic  *	This routine physically removes a btree entry from a leaf page.
843*45876Sbostic  *	IDATUM items are *never* removed from internal nodes, regardless
844*45876Sbostic  *	of whether the entries that originally caused them to be added
845*45876Sbostic  *	are removed from the tree or not.  In addition, pages made empty
846*45876Sbostic  *	by element deletion are not actually reclaimed.  They are,
847*45876Sbostic  *	however, made available for reuse.
848*45876Sbostic  *
849*45876Sbostic  *	To delete an item from a page, we pack the remaining items at
850*45876Sbostic  *	the end of the page, overwriting the deleted item's entry.  We
851*45876Sbostic  *	move the line pointers backward on the page, overwriting the
852*45876Sbostic  *	original item's line pointer.  This guarantees that the space in
853*45876Sbostic  *	the middle of the page is free -- a property that our insertion
854*45876Sbostic  *	strategy relies on.
855*45876Sbostic  *
856*45876Sbostic  *	This routine doesn't reclaim pages all of whose entries have
857*45876Sbostic  *	been deleted.  These pages are available for reuse, however.
858*45876Sbostic  *	If an item is deleted that was too big to fit on a page, then
859*45876Sbostic  *	the blocks that it occupies are put on a free list for reuse.
860*45876Sbostic  *
861*45876Sbostic  *	Parameters:
862*45876Sbostic  *		t -- btree from which to delete item
863*45876Sbostic  *		index -- index of entry on current page to delete
864*45876Sbostic  *
865*45876Sbostic  *	Returns:
866*45876Sbostic  *		RET_SUCCESS, RET_ERROR.
867*45876Sbostic  *
868*45876Sbostic  *	Side Effects:
869*45876Sbostic  *		Physically changes page layout, adjusts internal page
870*45876Sbostic  *		state to reflect the deletion of the item, and updates
871*45876Sbostic  *		the list of free pages for this tree.
872*45876Sbostic  */
873*45876Sbostic 
874*45876Sbostic static int
875*45876Sbostic _bt_delone(t, index)
876*45876Sbostic 	BTREE_P t;
877*45876Sbostic 	index_t index;
878*45876Sbostic {
879*45876Sbostic 	char *src, *dest;
880*45876Sbostic 	int nbytes, nmoved;
881*45876Sbostic 	index_t off;
882*45876Sbostic 	index_t top;
883*45876Sbostic 	index_t i;
884*45876Sbostic 	pgno_t chain;
885*45876Sbostic 	BTHEADER *h;
886*45876Sbostic 	CURSOR *c;
887*45876Sbostic 	DATUM *d;
888*45876Sbostic 
889*45876Sbostic 	/* deletion may confuse an active scan.  fix it.  */
890*45876Sbostic 	c = &(t->bt_cursor);
891*45876Sbostic 	if (t->bt_flags & BTF_SEQINIT && t->bt_curpage->h_pgno == c->c_pgno)
892*45876Sbostic 		if (_bt_fixscan(t, index, (DATUM *) NULL, DELETE) == RET_ERROR)
893*45876Sbostic 			return (RET_ERROR);
894*45876Sbostic 
895*45876Sbostic 	h = t->bt_curpage;
896*45876Sbostic 	off = h->h_linp[index];
897*45876Sbostic 	d = (DATUM *) GETDATUM(h, index);
898*45876Sbostic 
899*45876Sbostic 	/* if this is a big item, reclaim the space it occupies */
900*45876Sbostic 	if (d->d_flags & D_BIGKEY) {
901*45876Sbostic 		bcopy(&(d->d_bytes[0]),
902*45876Sbostic 		      (char *) &chain,
903*45876Sbostic 		      sizeof(chain));
904*45876Sbostic 		if (_bt_delindir(t, chain) == RET_ERROR)
905*45876Sbostic 			return (RET_ERROR);
906*45876Sbostic 		h = t->bt_curpage;
907*45876Sbostic 		d = (DATUM *) GETDATUM(h, index);
908*45876Sbostic 	}
909*45876Sbostic 	if (d->d_flags & D_BIGDATA) {
910*45876Sbostic 		bcopy(&(d->d_bytes[d->d_ksize]),
911*45876Sbostic 		      (char *) &chain,
912*45876Sbostic 		      sizeof(chain));
913*45876Sbostic 		if (_bt_delindir(t, chain) == RET_ERROR)
914*45876Sbostic 			return (RET_ERROR);
915*45876Sbostic 		h = t->bt_curpage;
916*45876Sbostic 		d = (DATUM *) GETDATUM(h, index);
917*45876Sbostic 	}
918*45876Sbostic 
919*45876Sbostic 	/* move the data down on the page */
920*45876Sbostic 	nbytes = d->d_ksize + d->d_dsize
921*45876Sbostic 		 + (sizeof(DATUM) - sizeof(char));
922*45876Sbostic 	nbytes = LONGALIGN(nbytes);
923*45876Sbostic 	src = ((char *) h) + h->h_upper;
924*45876Sbostic 	dest = src + nbytes;
925*45876Sbostic 	nmoved = (int) (((char *) d) - src);
926*45876Sbostic 	(void) bcopy(src, dest, nmoved);
927*45876Sbostic 
928*45876Sbostic 	/* next move the line pointers up */
929*45876Sbostic 	src = (char *) &(h->h_linp[index + 1]);
930*45876Sbostic 	dest = (char *) &(h->h_linp[index]);
931*45876Sbostic 	nmoved = (int) (((char *) &(h->h_linp[NEXTINDEX(h)])) - src);
932*45876Sbostic 	(void) bcopy(src, dest, nmoved);
933*45876Sbostic 
934*45876Sbostic 	/* remember that we freed up some space */
935*45876Sbostic 	h->h_upper += nbytes;
936*45876Sbostic 	h->h_lower -= sizeof(index_t);
937*45876Sbostic 
938*45876Sbostic 	/* adjust offsets in line pointers affected by moving the data */
939*45876Sbostic 	top = NEXTINDEX(h);
940*45876Sbostic 	for (i = 0; i < top; i++) {
941*45876Sbostic 		if (h->h_linp[i] < off)
942*45876Sbostic 			h->h_linp[i] += nbytes;
943*45876Sbostic 	}
944*45876Sbostic 
945*45876Sbostic 	/* it's gone */
946*45876Sbostic 	h->h_flags |= F_DIRTY;
947*45876Sbostic 
948*45876Sbostic 	return (RET_SUCCESS);
949*45876Sbostic }
950*45876Sbostic 
951*45876Sbostic /*
952*45876Sbostic  *  _BT_FIXSCAN -- Adjust a scan to cope with a change in tree structure.
953*45876Sbostic  *
954*45876Sbostic  *	If the user has an active scan on the database, and we delete an
955*45876Sbostic  *	item from the page the cursor is pointing at, we need to figure
956*45876Sbostic  *	out what to do about it.  Basically, the solution is to point
957*45876Sbostic  *	"between" keys in the tree, using the CRSR_BEFORE flag.  The
958*45876Sbostic  *	requirement is that the user should not miss any data in the
959*45876Sbostic  *	tree during a scan, just because he happened to do some deletions
960*45876Sbostic  *	or insertions while it was active.
961*45876Sbostic  *
962*45876Sbostic  *	In order to guarantee that the scan progresses properly, we need
963*45876Sbostic  *	to save the key of any deleted item we were pointing at, so that
964*45876Sbostic  *	we can check later insertions against it.
965*45876Sbostic  *
966*45876Sbostic  *	Parameters:
967*45876Sbostic  *		t -- tree to adjust
968*45876Sbostic  *		index -- index of item at which change was made
969*45876Sbostic  *		newd -- new datum (for insertions only)
970*45876Sbostic  *		op -- operation (DELETE or INSERT) causing change
971*45876Sbostic  *
972*45876Sbostic  *	Returns:
973*45876Sbostic  *		RET_SUCCESS, RET_ERROR (errno is set).
974*45876Sbostic  *
975*45876Sbostic  *	Side Effects:
976*45876Sbostic  *		None.
977*45876Sbostic  */
978*45876Sbostic 
979*45876Sbostic static int
980*45876Sbostic _bt_fixscan(t, index, newd, op)
981*45876Sbostic 	BTREE_P t;
982*45876Sbostic 	index_t index;
983*45876Sbostic 	DATUM *newd;
984*45876Sbostic 	int op;
985*45876Sbostic {
986*45876Sbostic 	BTHEADER *h;
987*45876Sbostic 	CURSOR *c;
988*45876Sbostic 	DATUM *d;
989*45876Sbostic 
990*45876Sbostic 	h = t->bt_curpage;
991*45876Sbostic 	c = &(t->bt_cursor);
992*45876Sbostic 
993*45876Sbostic 	if (op == DELETE) {
994*45876Sbostic 		if (index < c->c_index)
995*45876Sbostic 			c->c_index--;
996*45876Sbostic 		else if (index == c->c_index) {
997*45876Sbostic 			if (!(c->c_flags & CRSR_BEFORE)) {
998*45876Sbostic 				if (_bt_crsrkey(t) == RET_ERROR)
999*45876Sbostic 					return (RET_ERROR);
1000*45876Sbostic 				c->c_flags |= CRSR_BEFORE;
1001*45876Sbostic 			}
1002*45876Sbostic 		}
1003*45876Sbostic 	} else {
1004*45876Sbostic 		/*
1005*45876Sbostic 		 *  If we previously deleted the object at this location,
1006*45876Sbostic 		 *  and now we're inserting a new one, we need to do the
1007*45876Sbostic 		 *  right thing -- the cursor should come either before
1008*45876Sbostic 		 *  or after the new item, depending on the key that was
1009*45876Sbostic 		 *  here, and the new one.
1010*45876Sbostic 		 */
1011*45876Sbostic 
1012*45876Sbostic 		if (c->c_flags & CRSR_BEFORE) {
1013*45876Sbostic 			if (index <= c->c_index) {
1014*45876Sbostic 				char *tmp;
1015*45876Sbostic 				int itmp;
1016*45876Sbostic 				pgno_t chain;
1017*45876Sbostic 				int r;
1018*45876Sbostic 
1019*45876Sbostic 				d = (DATUM *) GETDATUM(t->bt_curpage, index);
1020*45876Sbostic 				if (d->d_flags & D_BIGKEY) {
1021*45876Sbostic 					bcopy(&(newd->d_bytes[0]),
1022*45876Sbostic 					      (char *) &chain,
1023*45876Sbostic 					      sizeof(chain));
1024*45876Sbostic 					if (_bt_getbig(t, chain, &tmp, &itmp)
1025*45876Sbostic 					     == RET_ERROR)
1026*45876Sbostic 						return (RET_ERROR);
1027*45876Sbostic 				} else
1028*45876Sbostic 					tmp = &(newd->d_bytes[0]);
1029*45876Sbostic 
1030*45876Sbostic 				r = (*(t->bt_compare))(tmp, c->c_key);
1031*45876Sbostic 				if (r < 0)
1032*45876Sbostic 					c->c_index++;
1033*45876Sbostic 
1034*45876Sbostic 				if (d->d_flags & D_BIGKEY)
1035*45876Sbostic 					(void) free (tmp);
1036*45876Sbostic 			}
1037*45876Sbostic 		} else if (index <= c->c_index)
1038*45876Sbostic 			c->c_index++;
1039*45876Sbostic 	}
1040*45876Sbostic 	return (RET_SUCCESS);
1041*45876Sbostic }
1042*45876Sbostic 
1043*45876Sbostic /*
1044*45876Sbostic  *  _BT_CRSRKEY -- Save a copy of the key of the record that the cursor
1045*45876Sbostic  *		   is pointing to.  The record is about to be deleted.
1046*45876Sbostic  *
1047*45876Sbostic  *	Parameters:
1048*45876Sbostic  *		t -- btree
1049*45876Sbostic  *
1050*45876Sbostic  *	Returns:
1051*45876Sbostic  *		RET_SUCCESS, RET_ERROR.
1052*45876Sbostic  *
1053*45876Sbostic  *	Side Effects:
1054*45876Sbostic  *		Allocates memory for the copy which should be released when
1055*45876Sbostic  *		it is no longer needed.
1056*45876Sbostic  */
1057*45876Sbostic 
1058*45876Sbostic static int
1059*45876Sbostic _bt_crsrkey(t)
1060*45876Sbostic 	BTREE_P t;
1061*45876Sbostic {
1062*45876Sbostic 	CURSOR *c;
1063*45876Sbostic 	DATUM *d;
1064*45876Sbostic 	pgno_t pgno;
1065*45876Sbostic 	int ignore;
1066*45876Sbostic 
1067*45876Sbostic 	c = &(t->bt_cursor);
1068*45876Sbostic 	d = (DATUM *) GETDATUM(t->bt_curpage, c->c_index);
1069*45876Sbostic 
1070*45876Sbostic 	if (d->d_flags & D_BIGKEY) {
1071*45876Sbostic 		bcopy(&(d->d_bytes[0]), (char *) &pgno, sizeof(pgno));
1072*45876Sbostic 		return (_bt_getbig(t, pgno, &(c->c_key), &ignore));
1073*45876Sbostic 	} else {
1074*45876Sbostic 		if ((c->c_key = (char *) malloc(d->d_ksize)) == (char *) NULL)
1075*45876Sbostic 			return (RET_ERROR);
1076*45876Sbostic 
1077*45876Sbostic 		bcopy(&(d->d_bytes[0]), c->c_key, d->d_ksize);
1078*45876Sbostic 	}
1079*45876Sbostic 	return (RET_SUCCESS);
1080*45876Sbostic }
1081*45876Sbostic 
1082*45876Sbostic /*
1083*45876Sbostic  *  _BT_GETBIG -- Get big data from indirect pages.
1084*45876Sbostic  *
1085*45876Sbostic  *	This routine chases indirect blocks for the big object at the
1086*45876Sbostic  *	specified page to a buffer, and return the address of the buffer.
1087*45876Sbostic  *
1088*45876Sbostic  *	Parameters:
1089*45876Sbostic  *		t -- btree with the indirect blocks
1090*45876Sbostic  *		pgno -- page number that starts the chain
1091*45876Sbostic  *		p -- (char **) to get the address of the buffer containing
1092*45876Sbostic  *		     the key or datum.
1093*45876Sbostic  *		sz -- pointer to an int to get the size of the instantiated
1094*45876Sbostic  *		      object.
1095*45876Sbostic  *
1096*45876Sbostic  *	Returns:
1097*45876Sbostic  *		RET_SUCCESS, RET_ERROR.
1098*45876Sbostic  *
1099*45876Sbostic  *	Side Effects:
1100*45876Sbostic  *		None.
1101*45876Sbostic  */
1102*45876Sbostic 
1103*45876Sbostic static int
1104*45876Sbostic _bt_getbig(t, pgno, p, sz)
1105*45876Sbostic 	BTREE_P t;
1106*45876Sbostic 	pgno_t pgno;
1107*45876Sbostic 	char **p;
1108*45876Sbostic 	int *sz;
1109*45876Sbostic {
1110*45876Sbostic 	pgno_t save;
1111*45876Sbostic 	size_t nbytes;
1112*45876Sbostic 	size_t nhere;
1113*45876Sbostic 	BTHEADER *h;
1114*45876Sbostic 	char *top, *from, *where;
1115*45876Sbostic 
1116*45876Sbostic 	save = t->bt_curpage->h_pgno;
1117*45876Sbostic 	if (_bt_getpage(t, pgno) == RET_ERROR)
1118*45876Sbostic 		return (RET_ERROR);
1119*45876Sbostic 
1120*45876Sbostic 	h = t->bt_curpage;
1121*45876Sbostic 
1122*45876Sbostic 	bcopy((char *) &(h->h_linp[0]), (char *) &nbytes, sizeof(nbytes));
1123*45876Sbostic 
1124*45876Sbostic 	if ((*p = (char *) malloc(nbytes)) == (char *) NULL)
1125*45876Sbostic 		return (RET_ERROR);
1126*45876Sbostic 
1127*45876Sbostic 	*sz = nbytes;
1128*45876Sbostic 	from = ((char *) (&h->h_linp[0])) + sizeof(nbytes);
1129*45876Sbostic 	top = ((char *) h) + t->bt_psize;
1130*45876Sbostic 
1131*45876Sbostic 	/* need more space for data? */
1132*45876Sbostic 
1133*45876Sbostic 	where = *p;
1134*45876Sbostic 
1135*45876Sbostic 	while (nbytes > 0) {
1136*45876Sbostic 		nhere = (int) (top - from);
1137*45876Sbostic 		if (nhere > nbytes) {
1138*45876Sbostic 			(void) bcopy(from, where, nbytes);
1139*45876Sbostic 			nbytes = 0;
1140*45876Sbostic 		} else {
1141*45876Sbostic 			(void) bcopy(from, where, nhere);
1142*45876Sbostic 			where += nhere;
1143*45876Sbostic 			nbytes -= nhere;
1144*45876Sbostic 			if (_bt_getpage(t, h->h_nextpg) == RET_ERROR)
1145*45876Sbostic 				return (RET_ERROR);
1146*45876Sbostic 			h = t->bt_curpage;
1147*45876Sbostic 			top = ((char *) h) + t->bt_psize;
1148*45876Sbostic 			from = (char *) &(h->h_linp[0]);
1149*45876Sbostic 		}
1150*45876Sbostic 	}
1151*45876Sbostic 
1152*45876Sbostic 	if (_bt_getpage(t, save) == RET_ERROR)
1153*45876Sbostic 		return (RET_ERROR);
1154*45876Sbostic 
1155*45876Sbostic 	return (RET_SUCCESS);
1156*45876Sbostic }
1157*45876Sbostic 
1158*45876Sbostic /*
1159*45876Sbostic  *  _BT_DELINDIR -- Delete a chain of indirect blocks from the btree.
1160*45876Sbostic  *
1161*45876Sbostic  *	When a large item is deleted from the tree, this routine puts the
1162*45876Sbostic  *	space that it occupied onto the free list for later reuse.  The
1163*45876Sbostic  *	bt_free entry in the btree structure points at the head of this list.
1164*45876Sbostic  *	This value is also stored on disk in the btree's metadata.
1165*45876Sbostic  *
1166*45876Sbostic  *	Parameters:
1167*45876Sbostic  *		t -- btree from which to delete pages
1168*45876Sbostic  *		chain -- page number that starts the chain.
1169*45876Sbostic  *
1170*45876Sbostic  *	Returns:
1171*45876Sbostic  *		RET_SUCCESS, RET_ERROR.
1172*45876Sbostic  *
1173*45876Sbostic  *	Side Effects:
1174*45876Sbostic  *		Invalidates the current on-disk version of the btree's
1175*45876Sbostic  *		metadata (if any), and updates the free list appropriately.
1176*45876Sbostic  */
1177*45876Sbostic 
1178*45876Sbostic static int
1179*45876Sbostic _bt_delindir(t, chain)
1180*45876Sbostic 	BTREE_P t;
1181*45876Sbostic 	pgno_t chain;
1182*45876Sbostic {
1183*45876Sbostic 	BTHEADER *h;
1184*45876Sbostic 	pgno_t save;
1185*45876Sbostic 	pgno_t oldfree;
1186*45876Sbostic 
1187*45876Sbostic 	h = t->bt_curpage;
1188*45876Sbostic 	save = h->h_pgno;
1189*45876Sbostic 	if (_bt_getpage(t, chain) == RET_ERROR)
1190*45876Sbostic 		return (RET_ERROR);
1191*45876Sbostic 
1192*45876Sbostic 	/*
1193*45876Sbostic 	 *  If some internal node is pointing at this chain, don't
1194*45876Sbostic 	 *  delete it.
1195*45876Sbostic 	 */
1196*45876Sbostic 
1197*45876Sbostic 	if (t->bt_curpage->h_flags & F_PRESERVE) {
1198*45876Sbostic 		if (_bt_getpage(t, save) == RET_ERROR)
1199*45876Sbostic 			return (RET_ERROR);
1200*45876Sbostic 		return (RET_SUCCESS);
1201*45876Sbostic 	}
1202*45876Sbostic 
1203*45876Sbostic 	/* if there's nothing on the free list, this is easy... */
1204*45876Sbostic 	if (t->bt_free == P_NONE) {
1205*45876Sbostic 		t->bt_free = chain;
1206*45876Sbostic 	} else {
1207*45876Sbostic 		oldfree = t->bt_free;
1208*45876Sbostic 
1209*45876Sbostic 		/* find the end of the data chain for the deleted datum */
1210*45876Sbostic 		t->bt_free = chain;
1211*45876Sbostic 		do {
1212*45876Sbostic 			if (_bt_getpage(t, chain) == RET_ERROR)
1213*45876Sbostic 				return (RET_ERROR);
1214*45876Sbostic 			h = t->bt_curpage;
1215*45876Sbostic 			if (h->h_nextpg != P_NONE)
1216*45876Sbostic 				chain = h->h_nextpg;
1217*45876Sbostic 		} while (h->h_nextpg != P_NONE);
1218*45876Sbostic 
1219*45876Sbostic 		/* link freed pages into free list */
1220*45876Sbostic 		h->h_nextpg = oldfree;
1221*45876Sbostic 		if (_bt_write(t, h, RELEASE) == RET_ERROR)
1222*45876Sbostic 			return (RET_ERROR);
1223*45876Sbostic 		if (_bt_getpage(t, oldfree) == RET_ERROR)
1224*45876Sbostic 			return (RET_ERROR);
1225*45876Sbostic 		h = t->bt_curpage;
1226*45876Sbostic 		h->h_prevpg = chain;
1227*45876Sbostic 		if (_bt_write(t, h, RELEASE) == RET_ERROR)
1228*45876Sbostic 			return (RET_ERROR);
1229*45876Sbostic 	}
1230*45876Sbostic 
1231*45876Sbostic 	/* restore the tree's current page pointer */
1232*45876Sbostic 	if (_bt_getpage(t, save) == RET_ERROR)
1233*45876Sbostic 		return (RET_ERROR);
1234*45876Sbostic 
1235*45876Sbostic 	/* we have trashed the tree metadata; rewrite it later */
1236*45876Sbostic 	t->bt_flags &= ~BTF_METAOK;
1237*45876Sbostic 
1238*45876Sbostic 	return (RET_SUCCESS);
1239*45876Sbostic }
1240*45876Sbostic 
1241*45876Sbostic /*
1242*45876Sbostic  *  _BT_FIRST -- Find the first item in the tree that matches the supplied
1243*45876Sbostic  *		 key.
1244*45876Sbostic  *
1245*45876Sbostic  *	This routine supports deletion.  When the user supplies a key to
1246*45876Sbostic  *	be deleted, we find the first one, and iteratively delete all the
1247*45876Sbostic  *	matching ones that follow it.
1248*45876Sbostic  *
1249*45876Sbostic  *	Parameters:
1250*45876Sbostic  *		t -- btree in which to find first occurrence
1251*45876Sbostic  *		key -- key to find
1252*45876Sbostic  *
1253*45876Sbostic  *	Returns:
1254*45876Sbostic  *		The BTITEM for the matching item.  If there's no match,
1255*45876Sbostic  *		this may point to the first item > than the supplied key,
1256*45876Sbostic  *		or off the end of the page.
1257*45876Sbostic  *
1258*45876Sbostic  *	Warnings:
1259*45876Sbostic  *		The BTITEM returned is in static space and will be overwritten
1260*45876Sbostic  *		by the next search of any kind in any btree.
1261*45876Sbostic  */
1262*45876Sbostic 
1263*45876Sbostic static BTITEM *
1264*45876Sbostic _bt_first(t, key)
1265*45876Sbostic 	BTREE_P t;
1266*45876Sbostic 	DBT *key;
1267*45876Sbostic {
1268*45876Sbostic 	BTHEADER *h;
1269*45876Sbostic 	BTITEM *item;
1270*45876Sbostic 	index_t next;
1271*45876Sbostic 	int r;
1272*45876Sbostic 
1273*45876Sbostic 	/* find any matching item */
1274*45876Sbostic 	item = _bt_search(t, key);
1275*45876Sbostic 	h = t->bt_curpage;
1276*45876Sbostic 	next = NEXTINDEX(h);
1277*45876Sbostic 
1278*45876Sbostic 	/* if we're off the end of the page, search failed and we're done */
1279*45876Sbostic 	if (item->bti_index >= next)
1280*45876Sbostic 		return (item);
1281*45876Sbostic 
1282*45876Sbostic 	/* as long as we have an exact match, walk backwards */
1283*45876Sbostic 	while ((r = _bt_cmp(t, key->data, item->bti_index)) == 0) {
1284*45876Sbostic 
1285*45876Sbostic 		/* at start of page? */
1286*45876Sbostic 		if (item->bti_index == 0) {
1287*45876Sbostic 
1288*45876Sbostic 			/* if no prev page, we're done */
1289*45876Sbostic 			if (h->h_prevpg == P_NONE)
1290*45876Sbostic 				return (item);
1291*45876Sbostic 
1292*45876Sbostic 			/* walk backward, skipping empty pages */
1293*45876Sbostic 			do {
1294*45876Sbostic 				if (_bt_getpage(t, h->h_prevpg) == RET_ERROR)
1295*45876Sbostic 					return ((BTITEM *) NULL);
1296*45876Sbostic 				h = t->bt_curpage;
1297*45876Sbostic 			} while (NEXTINDEX(h) == 0 && h->h_prevpg != P_NONE);
1298*45876Sbostic 
1299*45876Sbostic 			if (NEXTINDEX(h) != 0)
1300*45876Sbostic 				item->bti_index = NEXTINDEX(h) - 1;
1301*45876Sbostic 			else
1302*45876Sbostic 				item->bti_index = 0;
1303*45876Sbostic 
1304*45876Sbostic 			item->bti_pgno = h->h_pgno;
1305*45876Sbostic 		} else {
1306*45876Sbostic 			item->bti_index--;
1307*45876Sbostic 		}
1308*45876Sbostic 	}
1309*45876Sbostic 
1310*45876Sbostic 	/* if we went too far backwards, step forward one entry */
1311*45876Sbostic 	if (r > 0) {
1312*45876Sbostic 		if (++(item->bti_index) >= NEXTINDEX(h)
1313*45876Sbostic 		    && h->h_nextpg != P_NONE) {
1314*45876Sbostic 
1315*45876Sbostic 			/* walk forward, skipping empty pages */
1316*45876Sbostic 			do {
1317*45876Sbostic 				if (_bt_getpage(t, h->h_nextpg) == RET_ERROR)
1318*45876Sbostic 					return ((BTITEM *) NULL);
1319*45876Sbostic 				h = t->bt_curpage;
1320*45876Sbostic 			} while (h->h_nextpg != P_NONE && NEXTINDEX(h) == 0);
1321*45876Sbostic 
1322*45876Sbostic 			item->bti_index = 0;
1323*45876Sbostic 			item->bti_pgno = h->h_pgno;
1324*45876Sbostic 		}
1325*45876Sbostic 	}
1326*45876Sbostic 
1327*45876Sbostic 	/* got it */
1328*45876Sbostic 	return (item);
1329*45876Sbostic }
1330*45876Sbostic 
1331*45876Sbostic /*
1332*45876Sbostic  *  _BT_SEARCH, _BT_SEARCHR -- Search for a particular key in the tree.
1333*45876Sbostic  *
1334*45876Sbostic  *	Parameters:
1335*45876Sbostic  *		t -- btree in which to search
1336*45876Sbostic  *		key -- key to find
1337*45876Sbostic  *
1338*45876Sbostic  *	Returns:
1339*45876Sbostic  *		BTITEM for matching item, if any, or the BTITEM for the
1340*45876Sbostic  *		location of the key, if it were in the tree.
1341*45876Sbostic  *
1342*45876Sbostic  *	Warnings:
1343*45876Sbostic  *		The BTITEM returned is in static memory, and will be
1344*45876Sbostic  *		overwritten by the next search of any kind in any tree.
1345*45876Sbostic  */
1346*45876Sbostic 
1347*45876Sbostic static BTITEM *
1348*45876Sbostic _bt_search(t, key)
1349*45876Sbostic 	BTREE_P t;
1350*45876Sbostic 	DBT *key;
1351*45876Sbostic {
1352*45876Sbostic 	/* we want to start all of our searches at the root */
1353*45876Sbostic 	if (_bt_getpage(t, P_ROOT) == RET_ERROR)
1354*45876Sbostic 		return ((BTITEM *) NULL);
1355*45876Sbostic 
1356*45876Sbostic 	return (_bt_searchr(t, key));
1357*45876Sbostic }
1358*45876Sbostic 
1359*45876Sbostic static BTITEM *
1360*45876Sbostic _bt_searchr(t, key)
1361*45876Sbostic 	BTREE_P t;
1362*45876Sbostic 	DBT *key;
1363*45876Sbostic {
1364*45876Sbostic 	BTHEADER *h = t->bt_curpage;
1365*45876Sbostic 	index_t index;
1366*45876Sbostic 	IDATUM *id;
1367*45876Sbostic 	DATUM *d;
1368*45876Sbostic 	static BTITEM item;
1369*45876Sbostic 
1370*45876Sbostic 	/* do a binary search on the current page */
1371*45876Sbostic 	index = _bt_binsrch(t, &(key->data[0]));
1372*45876Sbostic 
1373*45876Sbostic 	/*
1374*45876Sbostic 	 *  At this point, the binary search terminated because the endpoints
1375*45876Sbostic 	 *  got too close together, or we have a match.  Figure out which
1376*45876Sbostic 	 *  case applies and decide what to do based on the page type.
1377*45876Sbostic 	 */
1378*45876Sbostic 	if (h->h_flags & F_LEAF) {
1379*45876Sbostic 		item.bti_pgno = h->h_pgno;
1380*45876Sbostic 		item.bti_index = index;
1381*45876Sbostic 		if (index < NEXTINDEX(h))
1382*45876Sbostic 			d = (DATUM *) GETDATUM(h,index);
1383*45876Sbostic 		else
1384*45876Sbostic 			d = (DATUM *) NULL;
1385*45876Sbostic 
1386*45876Sbostic 		item.bti_datum = d;
1387*45876Sbostic 		return(&item);
1388*45876Sbostic 	} else {
1389*45876Sbostic 		id = (IDATUM *) GETDATUM(h, index);
1390*45876Sbostic 		_bt_push(t, h->h_pgno);
1391*45876Sbostic 		if (_bt_getpage(t, id->i_pgno) == RET_ERROR)
1392*45876Sbostic 			return ((BTITEM *) NULL);
1393*45876Sbostic 		return (_bt_searchr(t, key));
1394*45876Sbostic 	}
1395*45876Sbostic }
1396*45876Sbostic 
1397*45876Sbostic /*
1398*45876Sbostic  *  BT_GETPAGE -- Make pgno the current page of the btree.
1399*45876Sbostic  *
1400*45876Sbostic  *	This routine is just a wrapper that decides whether to call the
1401*45876Sbostic  *	memory or disk-based routine to do the work.
1402*45876Sbostic  *
1403*45876Sbostic  *	Parameters:
1404*45876Sbostic  *		t -- btree in which to get page
1405*45876Sbostic  *		pgno -- page number to get
1406*45876Sbostic  *
1407*45876Sbostic  *	Returns:
1408*45876Sbostic  *		RET_SUCCESS or RET_ERROR.
1409*45876Sbostic  */
1410*45876Sbostic 
1411*45876Sbostic static int
1412*45876Sbostic _bt_getpage(t, pgno)
1413*45876Sbostic 	BTREE_P t;
1414*45876Sbostic 	pgno_t pgno;
1415*45876Sbostic {
1416*45876Sbostic #ifdef DEBUG
1417*45876Sbostic 	if (pgno == P_NONE)
1418*45876Sbostic 		_punt();
1419*45876Sbostic #endif /* DEBUG */
1420*45876Sbostic 
1421*45876Sbostic 	/* see if we can get away without doing any work */
1422*45876Sbostic 	if (t->bt_curpage != (BTHEADER *) NULL) {
1423*45876Sbostic 		if (t->bt_curpage->h_pgno == pgno)
1424*45876Sbostic 			return (RET_SUCCESS);
1425*45876Sbostic 	}
1426*45876Sbostic 
1427*45876Sbostic 	if (t->bt_fname == (char *) NULL)
1428*45876Sbostic 		return (_bt_getmpage(t, pgno));
1429*45876Sbostic 	else
1430*45876Sbostic 		return (_bt_getdpage(t, pgno));
1431*45876Sbostic }
1432*45876Sbostic 
1433*45876Sbostic /*
1434*45876Sbostic  *  _BT_GETMPAGE -- Make pgno the current page of the btree.
1435*45876Sbostic  *
1436*45876Sbostic  *	This routine gets pages for in-memory btrees.
1437*45876Sbostic  *
1438*45876Sbostic  *	Parameters:
1439*45876Sbostic  *		t -- btree in which to get page
1440*45876Sbostic  *		pgno -- page number to get
1441*45876Sbostic  *
1442*45876Sbostic  *	Returns:
1443*45876Sbostic  *		RET_SUCCESS or RET_ERROR.
1444*45876Sbostic  */
1445*45876Sbostic 
1446*45876Sbostic static int
1447*45876Sbostic _bt_getmpage(t, pgno)
1448*45876Sbostic 	register BTREE_P t;
1449*45876Sbostic 	pgno_t pgno;
1450*45876Sbostic {
1451*45876Sbostic 	int htindex;
1452*45876Sbostic 	int nbytes;
1453*45876Sbostic 	BTHEADER *h;
1454*45876Sbostic 	HTBUCKET *b;
1455*45876Sbostic 
1456*45876Sbostic 	if (t->bt_curpage == (BTHEADER *) NULL) {
1457*45876Sbostic 		if (pgno != P_ROOT) {
1458*45876Sbostic 			errno = EBADF;
1459*45876Sbostic 			return (RET_ERROR);
1460*45876Sbostic 		}
1461*45876Sbostic 
1462*45876Sbostic 		t->bt_npages++;
1463*45876Sbostic 		h = (BTHEADER *) malloc((unsigned) t->bt_psize);
1464*45876Sbostic 		if (h == (BTHEADER *) NULL)
1465*45876Sbostic 			return (RET_ERROR);
1466*45876Sbostic 
1467*45876Sbostic 		h->h_pgno = P_ROOT;
1468*45876Sbostic 		h->h_flags = F_LEAF;
1469*45876Sbostic 		h->h_lower = (index_t)
1470*45876Sbostic 				(((char *) &(h->h_linp[0])) - ((char *) h));
1471*45876Sbostic 		h->h_upper = t->bt_psize;
1472*45876Sbostic 		h->h_prevpg = h->h_nextpg = P_NONE;
1473*45876Sbostic 
1474*45876Sbostic 		t->bt_curpage = h;
1475*45876Sbostic 
1476*45876Sbostic 		/* get the root page into the hash table */
1477*45876Sbostic 		if (_bt_write(t, h, RELEASE) == RET_ERROR)
1478*45876Sbostic 			return (RET_ERROR);
1479*45876Sbostic 	}
1480*45876Sbostic 
1481*45876Sbostic 	htindex = HASHKEY(pgno);
1482*45876Sbostic 
1483*45876Sbostic 	for (b = t->bt_s.bt_ht[htindex];
1484*45876Sbostic 	     b != (HTBUCKET *) NULL;
1485*45876Sbostic 	     b = b->ht_next) {
1486*45876Sbostic 		if (b->ht_pgno == pgno) {
1487*45876Sbostic 			t->bt_curpage = b->ht_page;
1488*45876Sbostic 			return (RET_SUCCESS);
1489*45876Sbostic 		}
1490*45876Sbostic 	}
1491*45876Sbostic 	return (RET_ERROR);
1492*45876Sbostic }
1493*45876Sbostic 
1494*45876Sbostic /*
1495*45876Sbostic  *  _BT_GETDPAGE -- Make pgno the current page of the btree.
1496*45876Sbostic  *
1497*45876Sbostic  *	This routine gets pages for disk btrees.
1498*45876Sbostic  *
1499*45876Sbostic  *	Because disk btree pages must be readable across machine architectures,
1500*45876Sbostic  *	the btree code writes integers out in network format.  This routine
1501*45876Sbostic  *	converts them back to host format before returning the page.
1502*45876Sbostic  *
1503*45876Sbostic  *	Parameters:
1504*45876Sbostic  *		t -- btree in which to get page
1505*45876Sbostic  *		pgno -- page number to get
1506*45876Sbostic  *
1507*45876Sbostic  *	Returns:
1508*45876Sbostic  *		RET_SUCCESS, RET_ERROR.
1509*45876Sbostic  */
1510*45876Sbostic 
1511*45876Sbostic static int
1512*45876Sbostic _bt_getdpage(t, pgno)
1513*45876Sbostic 	register BTREE_P t;
1514*45876Sbostic 	pgno_t pgno;
1515*45876Sbostic {
1516*45876Sbostic 	BTHEADER *h;
1517*45876Sbostic 	char *cache;
1518*45876Sbostic 	long pos;
1519*45876Sbostic 	int n, nbytes;
1520*45876Sbostic 
1521*45876Sbostic 	/* if we have an lru cache, let the cache code do the work */
1522*45876Sbostic 	if (ISCACHE(t)) {
1523*45876Sbostic 		cache = t->bt_s.bt_d.d_cache;
1524*45876Sbostic 
1525*45876Sbostic 		/* release the old page */
1526*45876Sbostic 		if (t->bt_curpage != (BTHEADER *) NULL) {
1527*45876Sbostic 			pgno_t opgno = t->bt_curpage->h_pgno;
1528*45876Sbostic 			t->bt_curpage->h_flags &= ~F_DIRTY;
1529*45876Sbostic 
1530*45876Sbostic 			if (lruwrite(cache, opgno) < 0)
1531*45876Sbostic 				return (RET_ERROR);
1532*45876Sbostic 
1533*45876Sbostic 			lrurelease(cache, opgno);
1534*45876Sbostic 		}
1535*45876Sbostic 
1536*45876Sbostic 		if (pgno > t->bt_npages) {
1537*45876Sbostic 			if ((h = (BTHEADER *) lrugetnew(cache, pgno, &nbytes))
1538*45876Sbostic 			    == (BTHEADER *) NULL)
1539*45876Sbostic 				return (RET_ERROR);
1540*45876Sbostic 			t->bt_npages = pgno;
1541*45876Sbostic 		} else {
1542*45876Sbostic 			if ((h = (BTHEADER *) lruget(cache, pgno, &nbytes))
1543*45876Sbostic 			    == (BTHEADER *) NULL)
1544*45876Sbostic 				return (RET_ERROR);
1545*45876Sbostic 		}
1546*45876Sbostic 
1547*45876Sbostic 		/* init this page, if necessary */
1548*45876Sbostic 		if (nbytes == 0) {
1549*45876Sbostic 			h->h_pgno = pgno;
1550*45876Sbostic 			h->h_flags = F_LEAF;
1551*45876Sbostic 			h->h_lower = (index_t)
1552*45876Sbostic 				(((char *) &(h->h_linp[0])) - ((char *) h));
1553*45876Sbostic 			h->h_upper = t->bt_psize;
1554*45876Sbostic 			h->h_prevpg = h->h_nextpg = P_NONE;
1555*45876Sbostic 		}
1556*45876Sbostic 
1557*45876Sbostic 		t->bt_curpage = h;
1558*45876Sbostic 
1559*45876Sbostic 		return (RET_SUCCESS);
1560*45876Sbostic 	}
1561*45876Sbostic 
1562*45876Sbostic 	/* sync the current page, if necessary */
1563*45876Sbostic 	if (t->bt_curpage != (BTHEADER *) NULL) {
1564*45876Sbostic 		if (t->bt_curpage->h_flags & F_DIRTY)
1565*45876Sbostic 			if (_bt_write(t, t->bt_curpage, RELEASE) == RET_ERROR)
1566*45876Sbostic 				return (RET_ERROR);
1567*45876Sbostic 	} else {
1568*45876Sbostic 		if (t->bt_npages == 0)
1569*45876Sbostic 			t->bt_npages = 1;
1570*45876Sbostic 
1571*45876Sbostic 		/* if no current page, get space for one */
1572*45876Sbostic 		if ((t->bt_curpage = (BTHEADER *) malloc((unsigned) t->bt_psize))
1573*45876Sbostic 		    == (BTHEADER *) NULL) {
1574*45876Sbostic 			return (RET_ERROR);
1575*45876Sbostic 		}
1576*45876Sbostic 	}
1577*45876Sbostic 
1578*45876Sbostic 	n = t->bt_psize;
1579*45876Sbostic 	pos = (long) (pgno * n);
1580*45876Sbostic 
1581*45876Sbostic 	/* seek to correct location in file */
1582*45876Sbostic 	if (lseek(t->bt_s.bt_d.d_fd, pos, L_SET) != pos) {
1583*45876Sbostic 		return (RET_ERROR);
1584*45876Sbostic 	}
1585*45876Sbostic 
1586*45876Sbostic 	/* read the page */
1587*45876Sbostic 	if ((nbytes = read(t->bt_s.bt_d.d_fd, t->bt_curpage, n)) < n) {
1588*45876Sbostic 
1589*45876Sbostic 		/*
1590*45876Sbostic 		 *  If we didn't get a full page, we must have gotten no
1591*45876Sbostic 		 *  data at all -- in which case we're trying to read a
1592*45876Sbostic 		 *  root page that doesn't exist yet.  This is the only
1593*45876Sbostic 		 *  case in which this is okay.  If this happens, construct
1594*45876Sbostic 		 *  an empty root page by hand.
1595*45876Sbostic 		 */
1596*45876Sbostic 		if (nbytes != 0 || pgno != P_ROOT) {
1597*45876Sbostic 			errno = EBADF;
1598*45876Sbostic 			return (RET_ERROR);
1599*45876Sbostic 		}
1600*45876Sbostic 
1601*45876Sbostic 		h = (BTHEADER *) t->bt_curpage;
1602*45876Sbostic 		h->h_pgno = pgno;
1603*45876Sbostic 		h->h_flags = F_LEAF;
1604*45876Sbostic 		h->h_lower = (index_t)
1605*45876Sbostic 				(((char *) &(h->h_linp[0])) - ((char *) h));
1606*45876Sbostic 		h->h_upper = t->bt_psize;
1607*45876Sbostic 		h->h_prevpg = h->h_nextpg = P_NONE;
1608*45876Sbostic 	} else
1609*45876Sbostic 		(void) _bt_pgin(t->bt_curpage, (char *) t->bt_lorder);
1610*45876Sbostic 
1611*45876Sbostic 	return (RET_SUCCESS);
1612*45876Sbostic }
1613*45876Sbostic 
1614*45876Sbostic /*
1615*45876Sbostic  *  _BT_PGOUT, _BT_PGIN -- Convert host-specific number layout to/from
1616*45876Sbostic  *			   the host-independent format stored on disk.
1617*45876Sbostic  *
1618*45876Sbostic  *	Parameters:
1619*45876Sbostic  *		h -- page to convert
1620*45876Sbostic  *		_lorder -- byte order for pages (stored as a char * in the
1621*45876Sbostic  *			   cache, and passed around as a magic cookie).
1622*45876Sbostic  *
1623*45876Sbostic  *	Returns:
1624*45876Sbostic  *		RET_SUCCESS (lru code requires a return value).
1625*45876Sbostic  *
1626*45876Sbostic  *	Side Effects:
1627*45876Sbostic  *		Layout of tree metadata on the page is changed in place.
1628*45876Sbostic  *
1629*45876Sbostic  *	Warnings:
1630*45876Sbostic  *		Everywhere else in the code, the types pgno_t and index_t
1631*45876Sbostic  *		are opaque.  These two routines know what they really
1632*45876Sbostic  *		are.
1633*45876Sbostic  */
1634*45876Sbostic 
1635*45876Sbostic static int
1636*45876Sbostic _bt_pgout(h, _lorder)
1637*45876Sbostic 	BTHEADER *h;
1638*45876Sbostic 	char *_lorder;
1639*45876Sbostic {
1640*45876Sbostic 	int i;
1641*45876Sbostic 	int top;
1642*45876Sbostic 	int lorder;
1643*45876Sbostic 	DATUM *d;
1644*45876Sbostic 	IDATUM *id;
1645*45876Sbostic 	size_t chain;
1646*45876Sbostic 
1647*45876Sbostic 	lorder = (int) _lorder;
1648*45876Sbostic 	if (lorder == BYTE_ORDER)
1649*45876Sbostic 		return (RET_SUCCESS);
1650*45876Sbostic 
1651*45876Sbostic 	if (h->h_flags & F_LEAF) {
1652*45876Sbostic 		if (h->h_flags & F_CONT) {
1653*45876Sbostic 			if (h->h_prevpg == P_NONE) {
1654*45876Sbostic 				size_t longsz;
1655*45876Sbostic 
1656*45876Sbostic 				(void) bcopy((char *) &(h->h_linp[0]),
1657*45876Sbostic 					      (char *) &longsz,
1658*45876Sbostic 					      sizeof(longsz));
1659*45876Sbostic 				BLSWAP(longsz);
1660*45876Sbostic 				(void) bcopy((char *) &longsz,
1661*45876Sbostic 					      (char *) &(h->h_linp[0]),
1662*45876Sbostic 					      sizeof(longsz));
1663*45876Sbostic 			}
1664*45876Sbostic 		} else {
1665*45876Sbostic 			top = NEXTINDEX(h);
1666*45876Sbostic 			for (i = 0; i < top; i++) {
1667*45876Sbostic 				d = (DATUM *) GETDATUM(h, i);
1668*45876Sbostic 				if (d->d_flags & D_BIGKEY) {
1669*45876Sbostic 					(void) bcopy((char *) &(d->d_bytes[0]),
1670*45876Sbostic 						      (char *) &chain,
1671*45876Sbostic 						      sizeof(chain));
1672*45876Sbostic 					BLSWAP(chain);
1673*45876Sbostic 					(void) bcopy((char *) &chain,
1674*45876Sbostic 						      (char *) &(d->d_bytes[0]),
1675*45876Sbostic 						      sizeof(chain));
1676*45876Sbostic 				}
1677*45876Sbostic 				if (d->d_flags & D_BIGDATA) {
1678*45876Sbostic 					(void) bcopy((char *) &(d->d_bytes[d->d_ksize]),
1679*45876Sbostic 						      (char *) &chain,
1680*45876Sbostic 						      sizeof(chain));
1681*45876Sbostic 					BLSWAP(chain);
1682*45876Sbostic 					(void) bcopy((char *) &chain,
1683*45876Sbostic 						      (char *) &(d->d_bytes[d->d_ksize]),
1684*45876Sbostic 						      sizeof(chain));
1685*45876Sbostic 				}
1686*45876Sbostic 				BLSWAP(d->d_dsize);
1687*45876Sbostic 				BLSWAP(d->d_ksize);
1688*45876Sbostic 				BLSWAP(d->d_flags);
1689*45876Sbostic 				BLSWAP(h->h_linp[i]);
1690*45876Sbostic 			}
1691*45876Sbostic 		}
1692*45876Sbostic 	} else {
1693*45876Sbostic 		top = NEXTINDEX(h);
1694*45876Sbostic 		for (i = 0; i < top; i++) {
1695*45876Sbostic 			id = (IDATUM *) GETDATUM(h, i);
1696*45876Sbostic 			BLSWAP(id->i_size);
1697*45876Sbostic 			BLSWAP(id->i_pgno);
1698*45876Sbostic 			BLSWAP(id->i_flags);
1699*45876Sbostic 			if (id->i_flags & D_BIGKEY) {
1700*45876Sbostic 				(void) bcopy((char *) &(id->i_bytes[0]),
1701*45876Sbostic 					      (char *) &chain,
1702*45876Sbostic 					      sizeof(chain));
1703*45876Sbostic 				BLSWAP(chain);
1704*45876Sbostic 				(void) bcopy((char *) &chain,
1705*45876Sbostic 					      (char *) &(id->i_bytes[0]),
1706*45876Sbostic 					      sizeof(chain));
1707*45876Sbostic 			}
1708*45876Sbostic 			BLSWAP(h->h_linp[i]);
1709*45876Sbostic 		}
1710*45876Sbostic 	}
1711*45876Sbostic 	BLSWAP(h->h_flags);
1712*45876Sbostic 	BLSWAP(h->h_pgno);
1713*45876Sbostic 	BLSWAP(h->h_prevpg);
1714*45876Sbostic 	BLSWAP(h->h_nextpg);
1715*45876Sbostic 	BLSWAP(h->h_lower);
1716*45876Sbostic 	BLSWAP(h->h_upper);
1717*45876Sbostic 
1718*45876Sbostic 	return (RET_SUCCESS);
1719*45876Sbostic }
1720*45876Sbostic 
1721*45876Sbostic static int
1722*45876Sbostic _bt_pgin(h, _lorder)
1723*45876Sbostic 	BTHEADER *h;
1724*45876Sbostic 	char *_lorder;
1725*45876Sbostic {
1726*45876Sbostic 	int i;
1727*45876Sbostic 	int top;
1728*45876Sbostic 	int lorder;
1729*45876Sbostic 	DATUM *d;
1730*45876Sbostic 	IDATUM *id;
1731*45876Sbostic 	size_t chain;
1732*45876Sbostic 
1733*45876Sbostic 	/*
1734*45876Sbostic 	 *  If btree pages are to be stored in the host byte order, don't
1735*45876Sbostic 	 *  bother swapping.
1736*45876Sbostic 	 */
1737*45876Sbostic 	lorder = (int) _lorder;
1738*45876Sbostic 	if (lorder == BYTE_ORDER)
1739*45876Sbostic 		return (RET_SUCCESS);
1740*45876Sbostic 
1741*45876Sbostic 	BLSWAP(h->h_upper);
1742*45876Sbostic 	BLSWAP(h->h_lower);
1743*45876Sbostic 	BLSWAP(h->h_nextpg);
1744*45876Sbostic 	BLSWAP(h->h_prevpg);
1745*45876Sbostic 	BLSWAP(h->h_pgno);
1746*45876Sbostic 	BLSWAP(h->h_flags);
1747*45876Sbostic 
1748*45876Sbostic 	if (h->h_flags & F_LEAF) {
1749*45876Sbostic 		if (h->h_flags & F_CONT) {
1750*45876Sbostic 			if (h->h_prevpg == P_NONE) {
1751*45876Sbostic 				size_t longsz;
1752*45876Sbostic 
1753*45876Sbostic 				(void) bcopy((char *) &(h->h_linp[0]),
1754*45876Sbostic 					      (char *) &longsz,
1755*45876Sbostic 					      sizeof(longsz));
1756*45876Sbostic 				BLSWAP(longsz);
1757*45876Sbostic 				(void) bcopy((char *) &longsz,
1758*45876Sbostic 					      (char *) &(h->h_linp[0]),
1759*45876Sbostic 					      sizeof(longsz));
1760*45876Sbostic 			}
1761*45876Sbostic 		} else {
1762*45876Sbostic 			top = NEXTINDEX(h);
1763*45876Sbostic 			for (i = 0; i < top; i++) {
1764*45876Sbostic 				BLSWAP(h->h_linp[i]);
1765*45876Sbostic 				d = (DATUM *) GETDATUM(h, i);
1766*45876Sbostic 				BLSWAP(d->d_dsize);
1767*45876Sbostic 				BLSWAP(d->d_ksize);
1768*45876Sbostic 				BLSWAP(d->d_flags);
1769*45876Sbostic 				if (d->d_flags & D_BIGKEY) {
1770*45876Sbostic 					(void) bcopy((char *) &(d->d_bytes[0]),
1771*45876Sbostic 						      (char *) &chain,
1772*45876Sbostic 						      sizeof(chain));
1773*45876Sbostic 					BLSWAP(chain);
1774*45876Sbostic 					(void) bcopy((char *) &chain,
1775*45876Sbostic 						      (char *) &(d->d_bytes[0]),
1776*45876Sbostic 						      sizeof(chain));
1777*45876Sbostic 				}
1778*45876Sbostic 				if (d->d_flags & D_BIGDATA) {
1779*45876Sbostic 					(void) bcopy((char *) &(d->d_bytes[d->d_ksize]),
1780*45876Sbostic 						      (char *) &chain,
1781*45876Sbostic 						      sizeof(chain));
1782*45876Sbostic 					BLSWAP(chain);
1783*45876Sbostic 					(void) bcopy((char *) &chain,
1784*45876Sbostic 						      (char *) &(d->d_bytes[d->d_ksize]),
1785*45876Sbostic 						      sizeof(chain));
1786*45876Sbostic 				}
1787*45876Sbostic 			}
1788*45876Sbostic 		}
1789*45876Sbostic 	} else {
1790*45876Sbostic 		top = NEXTINDEX(h);
1791*45876Sbostic 		for (i = 0; i < top; i++) {
1792*45876Sbostic 			BLSWAP(h->h_linp[i]);
1793*45876Sbostic 			id = (IDATUM *) GETDATUM(h, i);
1794*45876Sbostic 			BLSWAP(id->i_size);
1795*45876Sbostic 			BLSWAP(id->i_pgno);
1796*45876Sbostic 			BLSWAP(id->i_flags);
1797*45876Sbostic 			if (id->i_flags & D_BIGKEY) {
1798*45876Sbostic 				(void) bcopy((char *) &(id->i_bytes[0]),
1799*45876Sbostic 					      (char *) &chain,
1800*45876Sbostic 					      sizeof(chain));
1801*45876Sbostic 				BLSWAP(chain);
1802*45876Sbostic 				(void) bcopy((char *) &chain,
1803*45876Sbostic 					      (char *) &(id->i_bytes[0]),
1804*45876Sbostic 					      sizeof(chain));
1805*45876Sbostic 			}
1806*45876Sbostic 		}
1807*45876Sbostic 	}
1808*45876Sbostic 	return (RET_SUCCESS);
1809*45876Sbostic }
1810*45876Sbostic 
1811*45876Sbostic /*
1812*45876Sbostic  *  BT_SYNC -- sync the btree to disk.
1813*45876Sbostic  *
1814*45876Sbostic  *	Parameters:
1815*45876Sbostic  *		tree -- btree to sync.
1816*45876Sbostic  *
1817*45876Sbostic  *	Returns:
1818*45876Sbostic  *		RET_SUCCESS, RET_ERROR.
1819*45876Sbostic  */
1820*45876Sbostic 
1821*45876Sbostic bt_sync(tree)
1822*45876Sbostic 	BTREE tree;
1823*45876Sbostic {
1824*45876Sbostic 	BTREE_P t = (BTREE_P) tree;
1825*45876Sbostic 	BTHEADER *h;
1826*45876Sbostic 	pgno_t pgno;
1827*45876Sbostic 
1828*45876Sbostic 	/* if this is an in-memory btree, syncing is a no-op */
1829*45876Sbostic 	if (!ISDISK(t))
1830*45876Sbostic 		return (RET_SUCCESS);
1831*45876Sbostic 
1832*45876Sbostic 	h = (BTHEADER *) t->bt_curpage;
1833*45876Sbostic 	h->h_flags &= ~F_DIRTY;
1834*45876Sbostic 
1835*45876Sbostic 	if (ISCACHE(t)) {
1836*45876Sbostic 		pgno = t->bt_curpage->h_pgno;
1837*45876Sbostic 		if (_bt_write(t, h, RELEASE) == RET_ERROR)
1838*45876Sbostic 			return(RET_ERROR);
1839*45876Sbostic 		if (lrusync(t->bt_s.bt_d.d_cache) < RET_ERROR)
1840*45876Sbostic 			return (RET_ERROR);
1841*45876Sbostic 		if (_bt_getpage(t, pgno) == RET_ERROR)
1842*45876Sbostic 			return (RET_ERROR);
1843*45876Sbostic 	} else {
1844*45876Sbostic 		if (_bt_write(t, h, NORELEASE) == RET_ERROR)
1845*45876Sbostic 			return (RET_ERROR);
1846*45876Sbostic 	}
1847*45876Sbostic 
1848*45876Sbostic 	return (fsync(t->bt_s.bt_d.d_fd));
1849*45876Sbostic }
1850*45876Sbostic 
1851*45876Sbostic /*
1852*45876Sbostic  *  _BT_INSERT -- Insert a new user datum in the btree.
1853*45876Sbostic  *
1854*45876Sbostic  *	This routine is called by bt_put, the public interface, once the
1855*45876Sbostic  *	location for the new item is known.  We do the work here, and
1856*45876Sbostic  *	handle splits if necessary.
1857*45876Sbostic  *
1858*45876Sbostic  *	Parameters:
1859*45876Sbostic  *		t -- btree in which to do the insertion.
1860*45876Sbostic  *		item -- BTITEM describing location of new datum
1861*45876Sbostic  *		key -- key to insert
1862*45876Sbostic  *		data -- data associated with key
1863*45876Sbostic  *		flag -- magic cookie passed recursively to bt_put if we
1864*45876Sbostic  *			have to do a split
1865*45876Sbostic  *
1866*45876Sbostic  *	Returns:
1867*45876Sbostic  *		RET_SUCCESS, RET_ERROR.
1868*45876Sbostic  */
1869*45876Sbostic 
1870*45876Sbostic static int
1871*45876Sbostic _bt_insert(t, item, key, data, flag)
1872*45876Sbostic 	BTREE_P t;
1873*45876Sbostic 	BTITEM *item;
1874*45876Sbostic 	DBT *key;
1875*45876Sbostic 	DBT *data;
1876*45876Sbostic 	int flag;
1877*45876Sbostic {
1878*45876Sbostic 	index_t index;
1879*45876Sbostic 	BTHEADER *h;
1880*45876Sbostic 	DATUM *d;
1881*45876Sbostic 	int nbytes;
1882*45876Sbostic 	int status;
1883*45876Sbostic 	pgno_t pgno;
1884*45876Sbostic 	int keysize, datasize;
1885*45876Sbostic 	int bigkey, bigdata;
1886*45876Sbostic 
1887*45876Sbostic 	if (_bt_getpage(t, item->bti_pgno) == RET_ERROR)
1888*45876Sbostic 		return (RET_ERROR);
1889*45876Sbostic 	h = t->bt_curpage;
1890*45876Sbostic 
1891*45876Sbostic 	if (TOOBIG(t, data->size)) {
1892*45876Sbostic 		bigdata = TRUE;
1893*45876Sbostic 		datasize = sizeof(pgno_t);
1894*45876Sbostic 	} else {
1895*45876Sbostic 		bigdata = FALSE;
1896*45876Sbostic 		datasize = data->size;
1897*45876Sbostic 	}
1898*45876Sbostic 
1899*45876Sbostic 	if (TOOBIG(t, key->size)) {
1900*45876Sbostic 		bigkey = TRUE;
1901*45876Sbostic 		keysize = sizeof(pgno_t);
1902*45876Sbostic 	} else {
1903*45876Sbostic 		bigkey = FALSE;
1904*45876Sbostic 		keysize = key->size;
1905*45876Sbostic 	}
1906*45876Sbostic 
1907*45876Sbostic 	nbytes = keysize + datasize + (sizeof(DATUM) - sizeof(char));
1908*45876Sbostic 	nbytes = LONGALIGN(nbytes) + sizeof(index_t);
1909*45876Sbostic 
1910*45876Sbostic 	/* if there's not enough room here, split the page */
1911*45876Sbostic 	if ((h->h_upper - h->h_lower) < nbytes) {
1912*45876Sbostic 		if (_bt_split(t) == RET_ERROR)
1913*45876Sbostic 			return (RET_ERROR);
1914*45876Sbostic 
1915*45876Sbostic 		/* okay, try again */
1916*45876Sbostic 		return (bt_put(t, key, data, flag));
1917*45876Sbostic 	}
1918*45876Sbostic 
1919*45876Sbostic 	/* put together a leaf page datum from the key/data pair */
1920*45876Sbostic 	index = item->bti_index;
1921*45876Sbostic 	nbytes = keysize + datasize + (sizeof(DATUM) - sizeof(char));
1922*45876Sbostic 
1923*45876Sbostic 	if ((d = (DATUM *) malloc((unsigned) nbytes)) == (DATUM *) NULL)
1924*45876Sbostic 		return (RET_ERROR);
1925*45876Sbostic 
1926*45876Sbostic 	d->d_ksize = keysize;
1927*45876Sbostic 	d->d_dsize = datasize;
1928*45876Sbostic 	d->d_flags = 0;
1929*45876Sbostic 
1930*45876Sbostic 	if (bigkey) {
1931*45876Sbostic 		if (_bt_indirect(t, key, &pgno) == RET_ERROR)
1932*45876Sbostic 			return (RET_ERROR);
1933*45876Sbostic 		(void) bcopy((char *) &pgno, &(d->d_bytes[0]), sizeof(pgno));
1934*45876Sbostic 		d->d_flags |= D_BIGKEY;
1935*45876Sbostic 		if (_bt_getpage(t, item->bti_pgno) == RET_ERROR)
1936*45876Sbostic 			return (RET_ERROR);
1937*45876Sbostic 	} else {
1938*45876Sbostic 		if (d->d_ksize > 0) {
1939*45876Sbostic 			(void) bcopy((char *) key->data,
1940*45876Sbostic 				      (char *) &(d->d_bytes[0]),
1941*45876Sbostic 				      (int) d->d_ksize);
1942*45876Sbostic 		}
1943*45876Sbostic 	}
1944*45876Sbostic 
1945*45876Sbostic 	if (bigdata) {
1946*45876Sbostic 		if (_bt_indirect(t, data, &pgno) == RET_ERROR)
1947*45876Sbostic 			return (RET_ERROR);
1948*45876Sbostic 		(void) bcopy((char *) &pgno,
1949*45876Sbostic 			     &(d->d_bytes[keysize]),
1950*45876Sbostic 			     sizeof(pgno));
1951*45876Sbostic 		d->d_flags |= D_BIGDATA;
1952*45876Sbostic 		if (_bt_getpage(t, item->bti_pgno) == RET_ERROR)
1953*45876Sbostic 			return (RET_ERROR);
1954*45876Sbostic 	} else {
1955*45876Sbostic 		if (d->d_dsize > 0) {
1956*45876Sbostic 			(void) bcopy((char *) data->data,
1957*45876Sbostic 				      (char *) &(d->d_bytes[keysize]),
1958*45876Sbostic 				      (int) d->d_dsize);
1959*45876Sbostic 		}
1960*45876Sbostic 	}
1961*45876Sbostic 
1962*45876Sbostic 	/* do the insertion */
1963*45876Sbostic 	status = _bt_insertat(t, d, index);
1964*45876Sbostic 
1965*45876Sbostic 	(void) free((char *) d);
1966*45876Sbostic 
1967*45876Sbostic 	return (status);
1968*45876Sbostic }
1969*45876Sbostic 
1970*45876Sbostic /*
1971*45876Sbostic  *  _BT_INDIRECT -- Write a series of indirect pages for big objects.
1972*45876Sbostic  *
1973*45876Sbostic  *	A chain of indirect pages looks like
1974*45876Sbostic  *
1975*45876Sbostic  *	   +-------------------+   +---------------------+
1976*45876Sbostic  *	   |hdr|size|	       |   |hdr|		 |
1977*45876Sbostic  *	   +---+----+	       |   +---+		 |
1978*45876Sbostic  *	   |   ... data ...    |   |   ... data ...	 |    ...
1979*45876Sbostic  *	   |		       |   |			 |
1980*45876Sbostic  *	   +-------------------+   +---------------------+
1981*45876Sbostic  *
1982*45876Sbostic  *	where hdr is a standard btree page header (with the indirect bit
1983*45876Sbostic  *	set), size on the first page is the real size of the datum, and
1984*45876Sbostic  *	data are bytes of the datum, split across as many pages as necessary.
1985*45876Sbostic  *	Indirect pages are chained together with the h_prevpg and h_nextpg
1986*45876Sbostic  *	entries of the page header struct.
1987*45876Sbostic  *
1988*45876Sbostic  *	A single DBT is written per chain, so space on the last page is
1989*45876Sbostic  *	wasted.
1990*45876Sbostic  *
1991*45876Sbostic  *	We return the page number of the start of the chain.
1992*45876Sbostic  *
1993*45876Sbostic  *	When a big object is deleted from a tree, the space that it occupied
1994*45876Sbostic  *	is placed on a free list for later reuse.  This routine checks that
1995*45876Sbostic  *	free list before allocating new pages to the big datum being inserted.
1996*45876Sbostic  *
1997*45876Sbostic  *	Parameters:
1998*45876Sbostic  *		t -- btree in which to store indirect blocks
1999*45876Sbostic  *		data -- DBT with the big datum in it
2000*45876Sbostic  *		pgno -- place to put the starting page number of the chain
2001*45876Sbostic  *
2002*45876Sbostic  *	Returns:
2003*45876Sbostic  *		RET_SUCCESS, RET_ERROR.
2004*45876Sbostic  *
2005*45876Sbostic  *	Side Effects:
2006*45876Sbostic  *		Current page is changed on return.
2007*45876Sbostic  */
2008*45876Sbostic 
2009*45876Sbostic static int
2010*45876Sbostic _bt_indirect(t, data, pgno)
2011*45876Sbostic 	BTREE_P t;
2012*45876Sbostic 	DBT *data;
2013*45876Sbostic 	pgno_t *pgno;
2014*45876Sbostic {
2015*45876Sbostic 	pgno_t save;
2016*45876Sbostic 	pgno_t prev;
2017*45876Sbostic 	char *top;
2018*45876Sbostic 	char *where;
2019*45876Sbostic 	char *from;
2020*45876Sbostic 	size_t dsize;
2021*45876Sbostic 	pgno_t nextchn;
2022*45876Sbostic 	int ischain;
2023*45876Sbostic 	BTHEADER *cur;
2024*45876Sbostic 
2025*45876Sbostic 	/* get set for first page in chain */
2026*45876Sbostic 	prev = P_NONE;
2027*45876Sbostic 	dsize = data->size;
2028*45876Sbostic 	from = (char *) data->data;
2029*45876Sbostic 
2030*45876Sbostic 	/* if there are blocks on the free list, use them first */
2031*45876Sbostic 	if ((*pgno = t->bt_free) == P_NONE) {
2032*45876Sbostic 		if ((cur = _bt_allocpg(t)) == (BTHEADER *) NULL)
2033*45876Sbostic 			return (RET_ERROR);
2034*45876Sbostic 
2035*45876Sbostic 		ischain = 0;
2036*45876Sbostic 		*pgno = cur->h_pgno = ++(t->bt_npages);
2037*45876Sbostic 	} else {
2038*45876Sbostic 		if (_bt_getpage(t, *pgno) == RET_ERROR)
2039*45876Sbostic 			return (RET_ERROR);
2040*45876Sbostic 		ischain = 1;
2041*45876Sbostic 		cur = t->bt_curpage;
2042*45876Sbostic 	}
2043*45876Sbostic 
2044*45876Sbostic 	cur->h_flags = F_CONT|F_LEAF;
2045*45876Sbostic 	(void) bcopy((char *) &dsize, (char *) &cur->h_linp[0], sizeof(size_t));
2046*45876Sbostic 	where = ((char *) (&cur->h_linp[0])) + sizeof(size_t);
2047*45876Sbostic 
2048*45876Sbostic 	/* fill and write pages in the chain */
2049*45876Sbostic 	for (;;) {
2050*45876Sbostic 		int nhere;
2051*45876Sbostic 
2052*45876Sbostic 		top = ((char *) cur) + t->bt_psize;
2053*45876Sbostic 		cur->h_prevpg = prev;
2054*45876Sbostic 		nextchn = cur->h_nextpg;
2055*45876Sbostic 		nhere = (int) (top - where);
2056*45876Sbostic 
2057*45876Sbostic 		if (nhere >= dsize) {
2058*45876Sbostic 			(void) bcopy(from, where, (int) dsize);
2059*45876Sbostic 			cur->h_nextpg = P_NONE;
2060*45876Sbostic 			dsize = 0;
2061*45876Sbostic 		} else {
2062*45876Sbostic 			(void) bcopy(from, where, (int) nhere);
2063*45876Sbostic 			dsize -= nhere;
2064*45876Sbostic 			from += nhere;
2065*45876Sbostic 			if (nextchn == P_NONE)
2066*45876Sbostic 				cur->h_nextpg = t->bt_npages + 1;
2067*45876Sbostic 			prev = cur->h_pgno;
2068*45876Sbostic 		}
2069*45876Sbostic 
2070*45876Sbostic 		/* current page is ready to go; write it out */
2071*45876Sbostic 		if (_bt_write(t, cur, RELEASE) == RET_ERROR)
2072*45876Sbostic 			return (RET_ERROR);
2073*45876Sbostic 
2074*45876Sbostic 		/* free the current page, if appropriate */
2075*45876Sbostic 		if (ISDISK(t) && !ISCACHE(t) && !ischain) {
2076*45876Sbostic 			(void) free ((char *) cur);
2077*45876Sbostic 		}
2078*45876Sbostic 
2079*45876Sbostic 		/* done? */
2080*45876Sbostic 		if (dsize == 0)
2081*45876Sbostic 			break;
2082*45876Sbostic 
2083*45876Sbostic 		/* allocate another page */
2084*45876Sbostic 		if (nextchn == P_NONE) {
2085*45876Sbostic 			if ((cur = _bt_allocpg(t)) == (BTHEADER *) NULL)
2086*45876Sbostic 				return (RET_ERROR);
2087*45876Sbostic 			ischain = 0;
2088*45876Sbostic 			cur->h_pgno = ++(t->bt_npages);
2089*45876Sbostic 		} else {
2090*45876Sbostic 			if (_bt_getpage(t, nextchn) == RET_ERROR)
2091*45876Sbostic 				return (RET_ERROR);
2092*45876Sbostic 			ischain = 1;
2093*45876Sbostic 			cur = t->bt_curpage;
2094*45876Sbostic 		}
2095*45876Sbostic 		cur->h_flags = F_LEAF | F_CONT;
2096*45876Sbostic 
2097*45876Sbostic 		where = (char *) (&cur->h_linp[0]);
2098*45876Sbostic 	}
2099*45876Sbostic 
2100*45876Sbostic 	/* if we used pages from the free list, record changes to it */
2101*45876Sbostic 	if (*pgno == t->bt_free) {
2102*45876Sbostic 		t->bt_free = nextchn;
2103*45876Sbostic 		t->bt_flags &= ~BTF_METAOK;
2104*45876Sbostic 	}
2105*45876Sbostic 
2106*45876Sbostic 	return (RET_SUCCESS);
2107*45876Sbostic }
2108*45876Sbostic 
2109*45876Sbostic /*
2110*45876Sbostic  *  _BT_SPLIT -- Split a page into two pages.
2111*45876Sbostic  *
2112*45876Sbostic  *	Splits are caused by insertions, and propogate up the tree in
2113*45876Sbostic  *	the usual way.  The root page is always page 1 in the file on
2114*45876Sbostic  *	disk, so root splits are handled specially.  On entry to this
2115*45876Sbostic  *	routine, t->bt_curpage is the page to be split.
2116*45876Sbostic  *
2117*45876Sbostic  *	Parameters:
2118*45876Sbostic  *		t -- btree in which to do split.
2119*45876Sbostic  *
2120*45876Sbostic  *	Returns:
2121*45876Sbostic  *		RET_SUCCESS, RET_ERROR.
2122*45876Sbostic  *
2123*45876Sbostic  *	Side Effects:
2124*45876Sbostic  *		Changes the notion of the current page.
2125*45876Sbostic  */
2126*45876Sbostic 
2127*45876Sbostic static
2128*45876Sbostic _bt_split(t)
2129*45876Sbostic 	BTREE_P t;
2130*45876Sbostic {
2131*45876Sbostic 	BTHEADER *h;
2132*45876Sbostic 	BTHEADER *left, *right;
2133*45876Sbostic 	BTHEADER *next;
2134*45876Sbostic 	pgno_t nextpgno, parent;
2135*45876Sbostic 	int nbytes, len;
2136*45876Sbostic 	IDATUM *id;
2137*45876Sbostic 	DATUM *d;
2138*45876Sbostic 	char *src;
2139*45876Sbostic 	IDATUM *new;
2140*45876Sbostic 	pgno_t oldchain;
2141*45876Sbostic 	u_char flags;
2142*45876Sbostic 
2143*45876Sbostic 	h = (BTHEADER *) t->bt_curpage;
2144*45876Sbostic 
2145*45876Sbostic 	/* split root page specially, since it must remain page 1 */
2146*45876Sbostic 	if (h->h_pgno == P_ROOT) {
2147*45876Sbostic 		return (_bt_splitroot(t));
2148*45876Sbostic 	}
2149*45876Sbostic 
2150*45876Sbostic 	/*
2151*45876Sbostic 	 *  This is a little complicated.  We go to some trouble to
2152*45876Sbostic 	 *  figure out which of the three possible cases -- in-memory tree,
2153*45876Sbostic 	 *  disk tree (no cache), and disk tree (cache) -- we have, in order
2154*45876Sbostic 	 *  to avoid unnecessary copying.  If we have a disk cache, then we
2155*45876Sbostic 	 *  have to do some extra copying, though, since the cache code
2156*45876Sbostic 	 *  manages buffers externally to this code.
2157*45876Sbostic 	 */
2158*45876Sbostic 
2159*45876Sbostic 	if (ISDISK(t) && ISCACHE(t)) {
2160*45876Sbostic 		if ((left = (BTHEADER *) malloc((unsigned) t->bt_psize))
2161*45876Sbostic 		    == (BTHEADER *) NULL)
2162*45876Sbostic 			return (RET_ERROR);
2163*45876Sbostic 		left->h_pgno = left->h_prevpg = left->h_nextpg = P_NONE;
2164*45876Sbostic 		left->h_flags = t->bt_curpage->h_flags;
2165*45876Sbostic 		left->h_lower = (index_t)
2166*45876Sbostic 			  (((char *) &(left->h_linp[0])) - ((char *) left));
2167*45876Sbostic 		left->h_upper = t->bt_psize;
2168*45876Sbostic 
2169*45876Sbostic 	} else {
2170*45876Sbostic 		if ((left = _bt_allocpg(t)) == (BTHEADER *) NULL)
2171*45876Sbostic 			return (RET_ERROR);
2172*45876Sbostic 	}
2173*45876Sbostic 	left->h_pgno = h->h_pgno;
2174*45876Sbostic 
2175*45876Sbostic 	if ((right = _bt_allocpg(t)) == (BTHEADER *) NULL)
2176*45876Sbostic 		return (RET_ERROR);
2177*45876Sbostic 	right->h_pgno = ++(t->bt_npages);
2178*45876Sbostic 
2179*45876Sbostic 	/* now do the split */
2180*45876Sbostic 	if (_bt_dopsplit(t, left, right) == RET_ERROR)
2181*45876Sbostic 		return (RET_ERROR);
2182*45876Sbostic 
2183*45876Sbostic 	right->h_prevpg = left->h_pgno;
2184*45876Sbostic 	nextpgno = right->h_nextpg = h->h_nextpg;
2185*45876Sbostic 	left->h_nextpg = right->h_pgno;
2186*45876Sbostic 	left->h_prevpg = h->h_prevpg;
2187*45876Sbostic 
2188*45876Sbostic 	/* okay, now use the left half of the page as the new page */
2189*45876Sbostic 	if (ISDISK(t) && ISCACHE(t)) {
2190*45876Sbostic 		(void) bcopy((char *) left, (char *) t->bt_curpage,
2191*45876Sbostic 			     (int) t->bt_psize);
2192*45876Sbostic 		(void) free ((char *) left);
2193*45876Sbostic 		left = t->bt_curpage;
2194*45876Sbostic 	} else {
2195*45876Sbostic 		(void) free((char *) t->bt_curpage);
2196*45876Sbostic 		t->bt_curpage = left;
2197*45876Sbostic 	}
2198*45876Sbostic 
2199*45876Sbostic 	/*
2200*45876Sbostic 	 *  Write the new pages out.  We need them to stay where they are
2201*45876Sbostic 	 *  until we're done updating the parent pages.
2202*45876Sbostic 	 */
2203*45876Sbostic 
2204*45876Sbostic 	if (_bt_write(t, left, NORELEASE) == RET_ERROR)
2205*45876Sbostic 		return (RET_ERROR);
2206*45876Sbostic 	if (_bt_write(t, right, NORELEASE) == RET_ERROR)
2207*45876Sbostic 		return (RET_ERROR);
2208*45876Sbostic 
2209*45876Sbostic 	/* update 'prev' pointer of old neighbor of left */
2210*45876Sbostic 	if (nextpgno != P_NONE) {
2211*45876Sbostic 		if (_bt_getpage(t, nextpgno) == RET_ERROR)
2212*45876Sbostic 			return (RET_ERROR);
2213*45876Sbostic 		h = t->bt_curpage;
2214*45876Sbostic 		h->h_prevpg = right->h_pgno;
2215*45876Sbostic 		h->h_flags |= F_DIRTY;
2216*45876Sbostic 	}
2217*45876Sbostic 
2218*45876Sbostic 	if ((parent = _bt_pop(t)) != P_NONE) {
2219*45876Sbostic 		if (right->h_flags & F_LEAF) {
2220*45876Sbostic 			d = (DATUM *) GETDATUM(right, 0);
2221*45876Sbostic 			len = d->d_ksize;
2222*45876Sbostic 			if (d->d_flags & D_BIGKEY) {
2223*45876Sbostic 				bcopy(&(d->d_bytes[0]),
2224*45876Sbostic 				      (char *) &oldchain,
2225*45876Sbostic 				      sizeof(oldchain));
2226*45876Sbostic 				if (_bt_markchain(t, oldchain) == RET_ERROR)
2227*45876Sbostic 					return (RET_ERROR);
2228*45876Sbostic 				src = (char *) &oldchain;
2229*45876Sbostic 				flags = D_BIGKEY;
2230*45876Sbostic 			} else {
2231*45876Sbostic 				src = (char *) &(d->d_bytes[0]);
2232*45876Sbostic 				flags = 0;
2233*45876Sbostic 			}
2234*45876Sbostic 		} else {
2235*45876Sbostic 			id = (IDATUM *) GETDATUM(right, 0);
2236*45876Sbostic 			len = id->i_size;
2237*45876Sbostic 			flags = id->i_flags;
2238*45876Sbostic 			src = (char *) &(id->i_bytes[0]);
2239*45876Sbostic 		}
2240*45876Sbostic 		nbytes = len + (sizeof(IDATUM) - sizeof(char));
2241*45876Sbostic 		new = (IDATUM *) malloc((unsigned) nbytes);
2242*45876Sbostic 		if (new == (IDATUM *) NULL)
2243*45876Sbostic 			return (RET_ERROR);
2244*45876Sbostic 		new->i_size = len;
2245*45876Sbostic 		new->i_pgno = right->h_pgno;
2246*45876Sbostic 		new->i_flags = flags;
2247*45876Sbostic 		if (len > 0)
2248*45876Sbostic 			(void) bcopy(src, (char *) &(new->i_bytes[0]), len);
2249*45876Sbostic 
2250*45876Sbostic 		nbytes = LONGALIGN(nbytes) + sizeof(index_t);
2251*45876Sbostic 		if (_bt_getpage(t, parent) == RET_ERROR)
2252*45876Sbostic 			return (RET_ERROR);
2253*45876Sbostic 
2254*45876Sbostic 		h = t->bt_curpage;
2255*45876Sbostic 
2256*45876Sbostic 		/*
2257*45876Sbostic 		 *  Split the parent if we need to, then reposition the
2258*45876Sbostic 		 *  tree's current page pointer for the new datum.
2259*45876Sbostic 		 */
2260*45876Sbostic 		if ((h->h_upper - h->h_lower) < nbytes) {
2261*45876Sbostic 			if (_bt_split(t) == RET_ERROR)
2262*45876Sbostic 				return (RET_ERROR);
2263*45876Sbostic 			if (_bt_reposition(t, new, parent, right->h_prevpg)
2264*45876Sbostic 			      == RET_ERROR)
2265*45876Sbostic 				return (RET_ERROR);
2266*45876Sbostic 		}
2267*45876Sbostic 
2268*45876Sbostic 		/* okay, now insert the new idatum */
2269*45876Sbostic 		if (_bt_inserti(t, new, right->h_prevpg) == RET_ERROR)
2270*45876Sbostic 			return (RET_ERROR);
2271*45876Sbostic 	}
2272*45876Sbostic 
2273*45876Sbostic 	/*
2274*45876Sbostic 	 *  Okay, split is done; don't need right page stapled down anymore.
2275*45876Sbostic 	 *  The page we call 'left' above is the new version of the old
2276*45876Sbostic 	 *  (split) page, so we can't release it.
2277*45876Sbostic 	 */
2278*45876Sbostic 
2279*45876Sbostic 	if (_bt_release(t, right) == RET_ERROR)
2280*45876Sbostic 		return (RET_ERROR);
2281*45876Sbostic 	if (ISDISK(t) && !ISCACHE(t))
2282*45876Sbostic 		(void) free((char *) right);
2283*45876Sbostic 
2284*45876Sbostic 	return (RET_SUCCESS);
2285*45876Sbostic }
2286*45876Sbostic 
2287*45876Sbostic /*
2288*45876Sbostic  *  _BT_MARKCHAIN -- Mark a chain of pages as used by an internal node.
2289*45876Sbostic  *
2290*45876Sbostic  *	Chains of indirect blocks pointed to by leaf nodes get reclaimed
2291*45876Sbostic  *	when the item that points to them gets deleted.  Chains pointed
2292*45876Sbostic  *	to by internal nodes never get deleted.  This routine marks a
2293*45876Sbostic  *	chain as pointed to by an internal node.
2294*45876Sbostic  *
2295*45876Sbostic  *	Parameters:
2296*45876Sbostic  *		t -- tree in which to mark
2297*45876Sbostic  *		chain -- number of first page in chain
2298*45876Sbostic  *
2299*45876Sbostic  *	Returns:
2300*45876Sbostic  *		RET_SUCCESS, RET_ERROR.
2301*45876Sbostic  *
2302*45876Sbostic  *	Side Effects:
2303*45876Sbostic  *		None.
2304*45876Sbostic  */
2305*45876Sbostic 
2306*45876Sbostic static int
2307*45876Sbostic _bt_markchain(t, chain)
2308*45876Sbostic 	BTREE_P t;
2309*45876Sbostic 	pgno_t chain;
2310*45876Sbostic {
2311*45876Sbostic 	pgno_t save;
2312*45876Sbostic 
2313*45876Sbostic 	save = t->bt_curpage->h_pgno;
2314*45876Sbostic 
2315*45876Sbostic 	if (_bt_getpage(t, chain) == RET_ERROR)
2316*45876Sbostic 		return (RET_ERROR);
2317*45876Sbostic 
2318*45876Sbostic 	t->bt_curpage->h_flags |= (F_DIRTY|F_PRESERVE);
2319*45876Sbostic 
2320*45876Sbostic 	if (_bt_getpage(t, save) == RET_ERROR)
2321*45876Sbostic 		return (RET_ERROR);
2322*45876Sbostic 
2323*45876Sbostic 	return (RET_SUCCESS);
2324*45876Sbostic }
2325*45876Sbostic 
2326*45876Sbostic /*
2327*45876Sbostic  *  _BT_REPOSITION -- Reposition the current page pointer of a btree.
2328*45876Sbostic  *
2329*45876Sbostic  *	After splitting a node in the tree in order to make room for
2330*45876Sbostic  *	an insertion, we need to figure out which page after the split
2331*45876Sbostic  *	should get the item we want to insert.  This routine positions
2332*45876Sbostic  *	the tree's current page pointer appropriately.
2333*45876Sbostic  *
2334*45876Sbostic  *	Parameters:
2335*45876Sbostic  *		t -- tree to position
2336*45876Sbostic  *		new -- the item we want to insert
2337*45876Sbostic  *		parent -- parent of the node that we just split
2338*45876Sbostic  *		prev -- page number of item directly to the left of
2339*45876Sbostic  *			new's position in the tree.
2340*45876Sbostic  *
2341*45876Sbostic  *	Returns:
2342*45876Sbostic  *		RET_SUCCESS, RET_ERROR.
2343*45876Sbostic  *
2344*45876Sbostic  *	Side Effects:
2345*45876Sbostic  *		None.
2346*45876Sbostic  */
2347*45876Sbostic 
2348*45876Sbostic static int
2349*45876Sbostic _bt_reposition(t, new, parent, prev)
2350*45876Sbostic 	BTREE_P t;
2351*45876Sbostic 	IDATUM *new;
2352*45876Sbostic 	pgno_t parent;
2353*45876Sbostic 	pgno_t prev;
2354*45876Sbostic {
2355*45876Sbostic 	index_t i, next;
2356*45876Sbostic 	IDATUM *idx;
2357*45876Sbostic 
2358*45876Sbostic 	if (parent == P_ROOT) {
2359*45876Sbostic 
2360*45876Sbostic 		/*
2361*45876Sbostic 		 *  If we just split the root page, then there are guaranteed
2362*45876Sbostic 		 *  to be exactly two IDATUMs on it.  Look at both of them
2363*45876Sbostic 		 *  to see if they point to the page that we want.
2364*45876Sbostic 		 */
2365*45876Sbostic 
2366*45876Sbostic 		if (_bt_getpage(t, parent) == RET_ERROR)
2367*45876Sbostic 			return (RET_ERROR);
2368*45876Sbostic 
2369*45876Sbostic 		next = NEXTINDEX(t->bt_curpage);
2370*45876Sbostic 		for (i = 0; i < next; i++) {
2371*45876Sbostic 			idx = (IDATUM *) GETDATUM(t->bt_curpage, i);
2372*45876Sbostic 			if (_bt_getpage(t, idx->i_pgno) == RET_ERROR)
2373*45876Sbostic 				return (RET_ERROR);
2374*45876Sbostic 			if (_bt_isonpage(t, new, prev) == RET_SUCCESS)
2375*45876Sbostic 				return (RET_SUCCESS);
2376*45876Sbostic 			if (_bt_getpage(t, parent) == RET_ERROR)
2377*45876Sbostic 				return (RET_ERROR);
2378*45876Sbostic 		}
2379*45876Sbostic 	} else {
2380*45876Sbostic 
2381*45876Sbostic 		/*
2382*45876Sbostic 		 *  Get the parent page -- which is where the new item would
2383*45876Sbostic 		 *  have gone -- and figure out whether the new item now goes
2384*45876Sbostic 		 *  on the parent, or the page immediately to the right of
2385*45876Sbostic 		 *  the parent.
2386*45876Sbostic 		 */
2387*45876Sbostic 
2388*45876Sbostic 		if (_bt_getpage(t, parent) == RET_ERROR)
2389*45876Sbostic 			return (RET_ERROR);
2390*45876Sbostic 		if (_bt_isonpage(t, new, prev) == RET_SUCCESS)
2391*45876Sbostic 			return (RET_SUCCESS);
2392*45876Sbostic 		if (_bt_getpage(t, t->bt_curpage->h_nextpg) == RET_ERROR)
2393*45876Sbostic 			return (RET_ERROR);
2394*45876Sbostic 		if (_bt_isonpage(t, new, prev) == RET_SUCCESS)
2395*45876Sbostic 			return (RET_SUCCESS);
2396*45876Sbostic 	}
2397*45876Sbostic 	return (RET_ERROR);
2398*45876Sbostic }
2399*45876Sbostic 
2400*45876Sbostic /*
2401*45876Sbostic  *  _BT_ISONPAGE -- Is the IDATUM for a given page number on the current page?
2402*45876Sbostic  *
2403*45876Sbostic  *	This routine is used by _bt_reposition to decide whether the current
2404*45876Sbostic  *	page is the correct one on which to insert a new item.
2405*45876Sbostic  *
2406*45876Sbostic  *	Parameters:
2407*45876Sbostic  *		t -- tree to check
2408*45876Sbostic  *		new -- the item that will be inserted (used for binary search)
2409*45876Sbostic  *		prev -- page number of page whose IDATUM is immediately to
2410*45876Sbostic  *			the left of new's position in the tree.
2411*45876Sbostic  *
2412*45876Sbostic  *	Returns:
2413*45876Sbostic  *		RET_SUCCESS, RET_ERROR (corresponding to TRUE, FALSE).
2414*45876Sbostic  */
2415*45876Sbostic 
2416*45876Sbostic static int
2417*45876Sbostic _bt_isonpage(t, new, prev)
2418*45876Sbostic 	BTREE_P t;
2419*45876Sbostic 	IDATUM *new;
2420*45876Sbostic 	pgno_t prev;
2421*45876Sbostic {
2422*45876Sbostic 	BTHEADER *h = (BTHEADER *) t->bt_curpage;
2423*45876Sbostic 	index_t i, next;
2424*45876Sbostic 	IDATUM *idx;
2425*45876Sbostic 
2426*45876Sbostic 	i = _bt_binsrch(t, &(new->i_bytes[0]));
2427*45876Sbostic 	while (i != 0 && _bt_cmp(t, &(new->i_bytes[0]), i) == 0)
2428*45876Sbostic 		--i;
2429*45876Sbostic 	next = NEXTINDEX(h);
2430*45876Sbostic 	idx = (IDATUM *) GETDATUM(h, i);
2431*45876Sbostic 	while (i < next && idx->i_pgno != prev) {
2432*45876Sbostic 		i++;
2433*45876Sbostic 		idx = (IDATUM *) GETDATUM(h, i);
2434*45876Sbostic 	}
2435*45876Sbostic 	if (idx->i_pgno == prev)
2436*45876Sbostic 		return (RET_SUCCESS);
2437*45876Sbostic 	else
2438*45876Sbostic 		return (RET_ERROR);
2439*45876Sbostic }
2440*45876Sbostic 
2441*45876Sbostic /*
2442*45876Sbostic  *  _BT_SPLITROOT -- Split the root of a btree.
2443*45876Sbostic  *
2444*45876Sbostic  *	The root page for a btree is always page one.  This means that in
2445*45876Sbostic  *	order to split the root, we need to do extra work.
2446*45876Sbostic  *
2447*45876Sbostic  *	Parameters:
2448*45876Sbostic  *		t -- tree to split
2449*45876Sbostic  *
2450*45876Sbostic  *	Returns:
2451*45876Sbostic  *		RET_SUCCESS, RET_ERROR.
2452*45876Sbostic  *
2453*45876Sbostic  *	Side Effects:
2454*45876Sbostic  *		Splits root upward in the usual way, adding two new pages
2455*45876Sbostic  *		to the tree (rather than just one, as in usual splits).
2456*45876Sbostic  */
2457*45876Sbostic 
2458*45876Sbostic static
2459*45876Sbostic _bt_splitroot(t)
2460*45876Sbostic 	BTREE_P t;
2461*45876Sbostic {
2462*45876Sbostic 	BTHEADER *h = t->bt_curpage;
2463*45876Sbostic 	BTHEADER *left, *right;
2464*45876Sbostic 	DATUM *d;
2465*45876Sbostic 	IDATUM *id;
2466*45876Sbostic 	char *where;
2467*45876Sbostic 	BTHEADER *where_h;
2468*45876Sbostic 	pgno_t lastpg;
2469*45876Sbostic 	char *src, *dest;
2470*45876Sbostic 	int len, nbytes;
2471*45876Sbostic 	u_long was_leaf;
2472*45876Sbostic 	pgno_t oldchain;
2473*45876Sbostic 	u_char flags;
2474*45876Sbostic 
2475*45876Sbostic 	/* get two new pages for the split */
2476*45876Sbostic 	if ((left = _bt_allocpg(t)) == (BTHEADER *) NULL)
2477*45876Sbostic 		return (RET_ERROR);
2478*45876Sbostic 	left->h_pgno = ++(t->bt_npages);
2479*45876Sbostic 	if ((right = _bt_allocpg(t)) == (BTHEADER *) NULL)
2480*45876Sbostic 		return (RET_ERROR);
2481*45876Sbostic 	right->h_pgno = ++(t->bt_npages);
2482*45876Sbostic 
2483*45876Sbostic 	/* do the split */
2484*45876Sbostic 	if (_bt_dopsplit(t, left, right) == RET_ERROR)
2485*45876Sbostic 		return (RET_ERROR);
2486*45876Sbostic 
2487*45876Sbostic 	/* connect the new pages correctly */
2488*45876Sbostic 	right->h_prevpg = left->h_pgno;
2489*45876Sbostic 	left->h_nextpg = right->h_pgno;
2490*45876Sbostic 
2491*45876Sbostic 	/*
2492*45876Sbostic 	 *  Write the child pages out now.  We need them to remain
2493*45876Sbostic 	 *  where they are until we finish updating parent pages,
2494*45876Sbostic 	 *  however.
2495*45876Sbostic 	 */
2496*45876Sbostic 
2497*45876Sbostic 	if (_bt_write(t, left, NORELEASE) == RET_ERROR)
2498*45876Sbostic 		return (RET_ERROR);
2499*45876Sbostic 	if (_bt_write(t, right, NORELEASE) == RET_ERROR)
2500*45876Sbostic 		return (RET_ERROR);
2501*45876Sbostic 
2502*45876Sbostic 	/* now change the root page into an internal page */
2503*45876Sbostic 	was_leaf = (h->h_flags & F_LEAF);
2504*45876Sbostic 	h->h_flags &= ~F_LEAF;
2505*45876Sbostic 	h->h_lower = (index_t) (((char *) (&(h->h_linp[0]))) - ((char *) h));
2506*45876Sbostic 	h->h_upper = (index_t) t->bt_psize;
2507*45876Sbostic 	(void) bzero((char *) &(h->h_linp[0]), (int) (h->h_upper - h->h_lower));
2508*45876Sbostic 
2509*45876Sbostic 	/* put two new keys on root page */
2510*45876Sbostic 	where_h = left;
2511*45876Sbostic 	while (where_h) {
2512*45876Sbostic 		DATUM *data;
2513*45876Sbostic 		IDATUM *idata;
2514*45876Sbostic 
2515*45876Sbostic 		if (was_leaf) {
2516*45876Sbostic 			data = (DATUM *) GETDATUM(where_h, 0);
2517*45876Sbostic 
2518*45876Sbostic 			if (where_h == left) {
2519*45876Sbostic 				len = 0;	/* first key in tree is null */
2520*45876Sbostic 			} else {
2521*45876Sbostic 				if (data->d_flags & D_BIGKEY) {
2522*45876Sbostic 					bcopy(&(data->d_bytes[0]),
2523*45876Sbostic 					      (char *) &oldchain,
2524*45876Sbostic 					      sizeof(oldchain));
2525*45876Sbostic 					if (_bt_markchain(t, oldchain) == RET_ERROR)
2526*45876Sbostic 						return (RET_ERROR);
2527*45876Sbostic 					src = (char *) &oldchain;
2528*45876Sbostic 					flags = D_BIGKEY;
2529*45876Sbostic 				} else {
2530*45876Sbostic 					src = (char *) &(data->d_bytes[0]);
2531*45876Sbostic 					flags = 0;
2532*45876Sbostic 				}
2533*45876Sbostic 				len = data->d_ksize;
2534*45876Sbostic 			}
2535*45876Sbostic 		} else {
2536*45876Sbostic 			idata = (IDATUM *) GETDATUM(where_h, 0);
2537*45876Sbostic 			len = idata->i_size;
2538*45876Sbostic 			flags = idata->i_flags;
2539*45876Sbostic 			src = &(idata->i_bytes[0]);
2540*45876Sbostic 		}
2541*45876Sbostic 		dest = ((char *) h) + h->h_upper;
2542*45876Sbostic 		nbytes = len + (sizeof (IDATUM) - sizeof(char));
2543*45876Sbostic 		dest -= LONGALIGN(nbytes);
2544*45876Sbostic 		id = (IDATUM *) dest;
2545*45876Sbostic 		id->i_size = len;
2546*45876Sbostic 		id->i_pgno = where_h->h_pgno;
2547*45876Sbostic 		id->i_flags = flags;
2548*45876Sbostic 		if (len > 0)
2549*45876Sbostic 			(void) bcopy((char *) src, (char *) &(id->i_bytes[0]), len);
2550*45876Sbostic 		dest -= ((int) h);
2551*45876Sbostic 		h->h_linp[NEXTINDEX(h)] = (index_t) dest;
2552*45876Sbostic 		h->h_upper = (index_t) dest;
2553*45876Sbostic 		h->h_lower += sizeof(index_t);
2554*45876Sbostic 
2555*45876Sbostic 		/* next page */
2556*45876Sbostic 		if (where_h == left)
2557*45876Sbostic 			where_h = right;
2558*45876Sbostic 		else
2559*45876Sbostic 			where_h = (BTHEADER *) NULL;
2560*45876Sbostic 	}
2561*45876Sbostic 
2562*45876Sbostic 	if (_bt_release(t, left) == RET_ERROR)
2563*45876Sbostic 		return (RET_ERROR);
2564*45876Sbostic 	if (_bt_release(t, right) == RET_ERROR)
2565*45876Sbostic 		return (RET_ERROR);
2566*45876Sbostic 
2567*45876Sbostic 	/*
2568*45876Sbostic 	 *  That's it, split is done.  If we're doing a non-cached disk
2569*45876Sbostic 	 *  btree, we can free up the pages we allocated, as they're on
2570*45876Sbostic 	 *  disk, now.
2571*45876Sbostic 	 */
2572*45876Sbostic 
2573*45876Sbostic 	if (ISDISK(t) && !ISCACHE(t)) {
2574*45876Sbostic 		(void) free ((char *) left);
2575*45876Sbostic 		(void) free ((char *) right);
2576*45876Sbostic 	}
2577*45876Sbostic 
2578*45876Sbostic 	h->h_flags |= F_DIRTY;
2579*45876Sbostic 
2580*45876Sbostic 	return (RET_SUCCESS);
2581*45876Sbostic }
2582*45876Sbostic 
2583*45876Sbostic /*
2584*45876Sbostic  *  _BT_DOPSPLIT -- Do the work of splitting a page
2585*45876Sbostic  *
2586*45876Sbostic  *	This routine takes two page pointers and splits the data on the
2587*45876Sbostic  *	current page of the btree between them.
2588*45876Sbostic  *
2589*45876Sbostic  *	We do a lot of work here to handle duplicate keys on a page; we
2590*45876Sbostic  *	have to place these keys carefully to guarantee that later searches
2591*45876Sbostic  *	will find them correctly.  See comments in the code below for details.
2592*45876Sbostic  *
2593*45876Sbostic  *	Parameters:
2594*45876Sbostic  *		t -- tree to split
2595*45876Sbostic  *		left -- pointer to page to get left half of the data
2596*45876Sbostic  *		right -- pointer to page to get right half of the data
2597*45876Sbostic  *
2598*45876Sbostic  *	Returns:
2599*45876Sbostic  *		None.
2600*45876Sbostic  */
2601*45876Sbostic 
2602*45876Sbostic static
2603*45876Sbostic _bt_dopsplit(t, left, right)
2604*45876Sbostic 	BTREE_P t;
2605*45876Sbostic 	BTHEADER *left;
2606*45876Sbostic 	BTHEADER *right;
2607*45876Sbostic {
2608*45876Sbostic 	BTHEADER *h = t->bt_curpage;
2609*45876Sbostic 	size_t psize;
2610*45876Sbostic 	char *where;
2611*45876Sbostic 	BTHEADER *where_h;
2612*45876Sbostic 	index_t where_i;
2613*45876Sbostic 	int nbytes, dsize, fixedsize, freespc;
2614*45876Sbostic 	index_t i;
2615*45876Sbostic 	index_t save_lower, save_upper, save_i;
2616*45876Sbostic 	index_t switch_i;
2617*45876Sbostic 	char *save_key;
2618*45876Sbostic 	DATUM *d;
2619*45876Sbostic 	CURSOR *c;
2620*45876Sbostic 	index_t top;
2621*45876Sbostic 	int free_save;
2622*45876Sbostic 	pgno_t chain;
2623*45876Sbostic 	int ignore;
2624*45876Sbostic 
2625*45876Sbostic 	/*
2626*45876Sbostic 	 *  Our strategy is to put half the bytes on each page.  We figure
2627*45876Sbostic 	 *  out how many bytes we have total, and then move items until
2628*45876Sbostic 	 *  the last item moved put at least 50% of the data on the left
2629*45876Sbostic 	 *  page.
2630*45876Sbostic 	 */
2631*45876Sbostic 	save_key = (char *) NULL;
2632*45876Sbostic 	psize = (int) t->bt_psize;
2633*45876Sbostic 	where = ((char *) left) + psize;
2634*45876Sbostic 	where_h = left;
2635*45876Sbostic 	where_i = 0;
2636*45876Sbostic 	nbytes = psize - (int) ((char *) &(h->h_linp[0]) - ((char *) h));
2637*45876Sbostic 	freespc = nbytes;
2638*45876Sbostic 
2639*45876Sbostic 	top = NEXTINDEX(h);
2640*45876Sbostic 	if (h->h_flags & F_LEAF)
2641*45876Sbostic 		fixedsize = (sizeof(DATUM) - sizeof(char));
2642*45876Sbostic 	else
2643*45876Sbostic 		fixedsize = (sizeof(IDATUM) - sizeof(char));
2644*45876Sbostic 
2645*45876Sbostic 	save_key = (char *) NULL;
2646*45876Sbostic 
2647*45876Sbostic 	/* move data */
2648*45876Sbostic 	for (i = 0; i < top; i++) {
2649*45876Sbostic 
2650*45876Sbostic 		/*
2651*45876Sbostic 		 *  Internal and leaf pages have different layouts for
2652*45876Sbostic 		 *  data items, but in both cases the first entry in the
2653*45876Sbostic 		 *  data item is a size_t.
2654*45876Sbostic 		 */
2655*45876Sbostic 		d = (DATUM *) GETDATUM(h,i);
2656*45876Sbostic 		if (h->h_flags & F_LEAF) {
2657*45876Sbostic 			dsize = d->d_ksize + d->d_dsize + fixedsize;
2658*45876Sbostic 		} else {
2659*45876Sbostic 			dsize = d->d_ksize + fixedsize;
2660*45876Sbostic 		}
2661*45876Sbostic 
2662*45876Sbostic 		/*
2663*45876Sbostic 		 *  If a page contains duplicate keys, we have to be
2664*45876Sbostic 		 *  careful about splits.  A sequence of duplicate keys
2665*45876Sbostic 		 *  may not begin in the middle of one page and end in
2666*45876Sbostic 		 *  the middle of another; it must begin on a page boundary,
2667*45876Sbostic 		 *  in order for searches on the internal nodes to work
2668*45876Sbostic 		 *  correctly.
2669*45876Sbostic 		 */
2670*45876Sbostic 		if (where_h == left) {
2671*45876Sbostic 			if (save_key == (char *) NULL) {
2672*45876Sbostic 				if (h->h_flags & F_LEAF) {
2673*45876Sbostic 					if (d->d_flags & D_BIGKEY) {
2674*45876Sbostic 						free_save = TRUE;
2675*45876Sbostic 						bcopy(&(d->d_bytes[0]),
2676*45876Sbostic 						     (char *) &chain,
2677*45876Sbostic 						     sizeof(chain));
2678*45876Sbostic 						if (_bt_getbig(t, chain,
2679*45876Sbostic 							       &save_key,
2680*45876Sbostic 							       &ignore)
2681*45876Sbostic 						    == RET_ERROR)
2682*45876Sbostic 							return (RET_ERROR);
2683*45876Sbostic 					} else {
2684*45876Sbostic 						free_save = FALSE;
2685*45876Sbostic 						save_key = (char *) &(d->d_bytes[0]);
2686*45876Sbostic 					}
2687*45876Sbostic 				} else {
2688*45876Sbostic 					IDATUM *id = (IDATUM *) d;
2689*45876Sbostic 
2690*45876Sbostic 					if (id->i_flags & D_BIGKEY) {
2691*45876Sbostic 						free_save = TRUE;
2692*45876Sbostic 						bcopy(&(id->i_bytes[0]),
2693*45876Sbostic 						     (char *) &chain,
2694*45876Sbostic 						     sizeof(chain));
2695*45876Sbostic 						if (_bt_getbig(t, chain,
2696*45876Sbostic 							       &save_key,
2697*45876Sbostic 							       &ignore)
2698*45876Sbostic 						    == RET_ERROR)
2699*45876Sbostic 							return (RET_ERROR);
2700*45876Sbostic 					} else {
2701*45876Sbostic 						free_save = FALSE;
2702*45876Sbostic 						save_key = (char *)
2703*45876Sbostic 							&(id->i_bytes[0]);
2704*45876Sbostic 					}
2705*45876Sbostic 				}
2706*45876Sbostic 				save_i = 0;
2707*45876Sbostic 				save_lower = where_h->h_lower;
2708*45876Sbostic 				save_upper = where_h->h_upper;
2709*45876Sbostic 			} else {
2710*45876Sbostic 				if (_bt_cmp(t, save_key, i) != 0) {
2711*45876Sbostic 					save_lower = where_h->h_lower;
2712*45876Sbostic 					save_upper = where_h->h_upper;
2713*45876Sbostic 					save_i = i;
2714*45876Sbostic 				}
2715*45876Sbostic 				if (h->h_flags & F_LEAF) {
2716*45876Sbostic 					if (free_save)
2717*45876Sbostic 						(void) free(save_key);
2718*45876Sbostic 					if (d->d_flags & D_BIGKEY) {
2719*45876Sbostic 						free_save = TRUE;
2720*45876Sbostic 						bcopy(&(d->d_bytes[0]),
2721*45876Sbostic 						     (char *) &chain,
2722*45876Sbostic 						     sizeof(chain));
2723*45876Sbostic 						if (_bt_getbig(t, chain,
2724*45876Sbostic 							       &save_key,
2725*45876Sbostic 							       &ignore)
2726*45876Sbostic 						    == RET_ERROR)
2727*45876Sbostic 							return (RET_ERROR);
2728*45876Sbostic 					} else {
2729*45876Sbostic 						free_save = FALSE;
2730*45876Sbostic 						save_key = (char *) &(d->d_bytes[0]);
2731*45876Sbostic 					}
2732*45876Sbostic 				} else {
2733*45876Sbostic 					IDATUM *id = (IDATUM *) d;
2734*45876Sbostic 
2735*45876Sbostic 					if (id->i_flags & D_BIGKEY) {
2736*45876Sbostic 						free_save = TRUE;
2737*45876Sbostic 						bcopy(&(id->i_bytes[0]),
2738*45876Sbostic 						     (char *) &chain,
2739*45876Sbostic 						     sizeof(chain));
2740*45876Sbostic 						if (_bt_getbig(t, chain,
2741*45876Sbostic 							       &save_key,
2742*45876Sbostic 							       &ignore)
2743*45876Sbostic 						    == RET_ERROR)
2744*45876Sbostic 							return (RET_ERROR);
2745*45876Sbostic 					} else {
2746*45876Sbostic 						free_save = FALSE;
2747*45876Sbostic 						save_key = (char *)
2748*45876Sbostic 							&(id->i_bytes[0]);
2749*45876Sbostic 					}
2750*45876Sbostic 				}
2751*45876Sbostic 			}
2752*45876Sbostic 		}
2753*45876Sbostic 
2754*45876Sbostic 		/* copy data and update page state */
2755*45876Sbostic 		where -= LONGALIGN(dsize);
2756*45876Sbostic 		(void) bcopy((char *) d, (char *) where, dsize);
2757*45876Sbostic 		where_h->h_upper = where_h->h_linp[where_i] =
2758*45876Sbostic 			(index_t) (where - (int) where_h);
2759*45876Sbostic 		where_h->h_lower += sizeof(index_t);
2760*45876Sbostic 		where_i++;
2761*45876Sbostic 
2762*45876Sbostic 		/* if we've moved half, switch to the right-hand page */
2763*45876Sbostic 		nbytes -= LONGALIGN(dsize) + sizeof(index_t);
2764*45876Sbostic 		if ((freespc - nbytes) > nbytes) {
2765*45876Sbostic 			nbytes = 2 * freespc;
2766*45876Sbostic 
2767*45876Sbostic 			/* identical keys go on the same page */
2768*45876Sbostic 			if (save_i > 0) {
2769*45876Sbostic 				/* i gets incremented at loop bottom... */
2770*45876Sbostic 				i = save_i - 1;
2771*45876Sbostic 				where_h->h_lower = save_lower;
2772*45876Sbostic 				where_h->h_upper = save_upper;
2773*45876Sbostic 			}
2774*45876Sbostic 			where = ((char *) right) + psize;
2775*45876Sbostic 			where_h = right;
2776*45876Sbostic 			switch_i = where_i;
2777*45876Sbostic 			where_i = 0;
2778*45876Sbostic 		}
2779*45876Sbostic 	}
2780*45876Sbostic 
2781*45876Sbostic 	/*
2782*45876Sbostic 	 *  If there was an active scan on the database, and we just
2783*45876Sbostic 	 *  split the page that the cursor was on, we may need to
2784*45876Sbostic 	 *  adjust the cursor to point to the same entry as before the
2785*45876Sbostic 	 *  split.
2786*45876Sbostic 	 */
2787*45876Sbostic 
2788*45876Sbostic 	c = &(t->bt_cursor);
2789*45876Sbostic 	if ((t->bt_flags & BTF_SEQINIT)
2790*45876Sbostic 	    && (c->c_pgno == h->h_pgno)
2791*45876Sbostic 	    && (c->c_index >= switch_i)) {
2792*45876Sbostic 		c->c_pgno = where_h->h_pgno;
2793*45876Sbostic 		c->c_index -= where_i;
2794*45876Sbostic 	}
2795*45876Sbostic 	return (RET_SUCCESS);
2796*45876Sbostic }
2797*45876Sbostic 
2798*45876Sbostic /*
2799*45876Sbostic  *  _BT_INSERTI -- Insert IDATUM on current page in the btree.
2800*45876Sbostic  *
2801*45876Sbostic  *	This routine handles insertions to internal pages after splits
2802*45876Sbostic  *	lower in the tree.  On entry, t->bt_curpage is the page to get
2803*45876Sbostic  *	the new IDATUM.  We are also given pgno, the page number of the
2804*45876Sbostic  *	IDATUM that is immediately left of the new IDATUM's position.
2805*45876Sbostic  *	This guarantees that the IDATUM for the right half of the page
2806*45876Sbostic  *	after a split goes next to the IDATUM for its left half.
2807*45876Sbostic  *
2808*45876Sbostic  *	Parameters:
2809*45876Sbostic  *		t -- tree in which to do insertion.
2810*45876Sbostic  *		id -- new IDATUM to insert
2811*45876Sbostic  *		pgno -- page number of IDATUM left of id's position
2812*45876Sbostic  *
2813*45876Sbostic  *	Returns:
2814*45876Sbostic  *		RET_SUCCESS, RET_ERROR.
2815*45876Sbostic  */
2816*45876Sbostic 
2817*45876Sbostic static int
2818*45876Sbostic _bt_inserti(t, id, pgno)
2819*45876Sbostic 	BTREE_P t;
2820*45876Sbostic 	IDATUM *id;
2821*45876Sbostic 	pgno_t pgno;
2822*45876Sbostic {
2823*45876Sbostic 	BTHEADER *h = t->bt_curpage;
2824*45876Sbostic 	index_t next, i;
2825*45876Sbostic 	IDATUM *idx;
2826*45876Sbostic 	char *key;
2827*45876Sbostic 	pgno_t chain;
2828*45876Sbostic 	int free_key;
2829*45876Sbostic 	int ignore;
2830*45876Sbostic 
2831*45876Sbostic 	if (id->i_flags & D_BIGKEY) {
2832*45876Sbostic 		free_key = TRUE;
2833*45876Sbostic 		bcopy(&(id->i_bytes[0]), (char *) &chain, sizeof(chain));
2834*45876Sbostic 		if (_bt_getbig(t, chain, &key, &ignore) == RET_ERROR)
2835*45876Sbostic 			return (RET_ERROR);
2836*45876Sbostic 	} else {
2837*45876Sbostic 		free_key = FALSE;
2838*45876Sbostic 		key = &(id->i_bytes[0]);
2839*45876Sbostic 	}
2840*45876Sbostic 	i = _bt_binsrch(t, key);
2841*45876Sbostic 
2842*45876Sbostic 	next = NEXTINDEX(h);
2843*45876Sbostic 	while (i < next && _bt_cmp(t, key, i) >= 0)
2844*45876Sbostic 		i++;
2845*45876Sbostic 
2846*45876Sbostic 	if (free_key)
2847*45876Sbostic 		(void) free(key);
2848*45876Sbostic 
2849*45876Sbostic 	/* okay, now we're close; find adjacent IDATUM */
2850*45876Sbostic 	for (;;) {
2851*45876Sbostic 		idx = (IDATUM *) GETDATUM(h,i);
2852*45876Sbostic 		if (idx->i_pgno == pgno) {
2853*45876Sbostic 			i++;
2854*45876Sbostic 			break;
2855*45876Sbostic 		}
2856*45876Sbostic 		--i;
2857*45876Sbostic 	}
2858*45876Sbostic 
2859*45876Sbostic 	/* correctly positioned, do the insertion */
2860*45876Sbostic 	return (_bt_insertat(t, id, i));
2861*45876Sbostic }
2862*45876Sbostic 
2863*45876Sbostic /*
2864*45876Sbostic  *  _BT_INSERTAT -- Insert a datum at a given location on the current page.
2865*45876Sbostic  *
2866*45876Sbostic  *	This routine does insertions on both leaf and internal pages.
2867*45876Sbostic  *
2868*45876Sbostic  *	Parameters:
2869*45876Sbostic  *		t -- tree in which to do insertion.
2870*45876Sbostic  *		p -- DATUM or IDATUM to insert.
2871*45876Sbostic  *		index -- index in line pointer array to put this item.
2872*45876Sbostic  *
2873*45876Sbostic  *	Returns:
2874*45876Sbostic  *		RET_SUCCESS, RET_ERROR.
2875*45876Sbostic  *
2876*45876Sbostic  *	Side Effects:
2877*45876Sbostic  *		Will rearrange line pointers to make space for the new
2878*45876Sbostic  *		entry.  This means that any scans currently active are
2879*45876Sbostic  *		invalid after this.
2880*45876Sbostic  *
2881*45876Sbostic  *	Warnings:
2882*45876Sbostic  *		There must be sufficient room for the new item on the page.
2883*45876Sbostic  */
2884*45876Sbostic 
2885*45876Sbostic static int
2886*45876Sbostic _bt_insertat(t, p, index)
2887*45876Sbostic 	BTREE_P t;
2888*45876Sbostic 	char *p;
2889*45876Sbostic 	index_t index;
2890*45876Sbostic {
2891*45876Sbostic 	IDATUM *id = (IDATUM *) p;
2892*45876Sbostic 	DATUM *d = (DATUM *) p;
2893*45876Sbostic 	BTHEADER *h;
2894*45876Sbostic 	CURSOR *c;
2895*45876Sbostic 	index_t nxtindex;
2896*45876Sbostic 	char *src, *dest;
2897*45876Sbostic 	int nbytes;
2898*45876Sbostic 
2899*45876Sbostic 	/* insertion may confuse an active scan.  fix it. */
2900*45876Sbostic 	c = &(t->bt_cursor);
2901*45876Sbostic 	if (t->bt_flags & BTF_SEQINIT && t->bt_curpage->h_pgno == c->c_pgno)
2902*45876Sbostic 		if (_bt_fixscan(t, index, d, INSERT) == RET_ERROR)
2903*45876Sbostic 			return (RET_ERROR);
2904*45876Sbostic 
2905*45876Sbostic 	h = t->bt_curpage;
2906*45876Sbostic 	nxtindex = (index_t) NEXTINDEX(h);
2907*45876Sbostic 
2908*45876Sbostic 	/*
2909*45876Sbostic 	 *  If we're inserting at the middle of the line pointer array,
2910*45876Sbostic 	 *  copy pointers that will follow the new one up on the page.
2911*45876Sbostic 	 */
2912*45876Sbostic 
2913*45876Sbostic 	if (index < nxtindex) {
2914*45876Sbostic 		src = (char *) &(h->h_linp[index]);
2915*45876Sbostic 		dest = (char *) &(h->h_linp[index + 1]);
2916*45876Sbostic 		nbytes = (h->h_lower - (src - ((char *) h)))
2917*45876Sbostic 			 + sizeof(h->h_linp[0]);
2918*45876Sbostic 		(void) bcopy(src, dest, nbytes);
2919*45876Sbostic 	}
2920*45876Sbostic 
2921*45876Sbostic 	/* compute size and copy data to page */
2922*45876Sbostic 	if (h->h_flags & F_LEAF) {
2923*45876Sbostic 		nbytes = d->d_ksize + d->d_dsize
2924*45876Sbostic 			 + (sizeof(DATUM) - sizeof(char));
2925*45876Sbostic 	} else {
2926*45876Sbostic 		nbytes = id->i_size + (sizeof(IDATUM) - sizeof(char));
2927*45876Sbostic 	}
2928*45876Sbostic 	dest = (((char *) h) + h->h_upper) - LONGALIGN(nbytes);
2929*45876Sbostic 	(void) bcopy((char *) p, dest, nbytes);
2930*45876Sbostic 
2931*45876Sbostic 	/* update statistics */
2932*45876Sbostic 	dest -= (int) h;
2933*45876Sbostic 	h->h_linp[index] = (index_t) dest;
2934*45876Sbostic 	h->h_upper = (index_t) dest;
2935*45876Sbostic 	h->h_lower += sizeof(index_t);
2936*45876Sbostic 
2937*45876Sbostic 	/* we're done */
2938*45876Sbostic 	h->h_flags |= F_DIRTY;
2939*45876Sbostic 
2940*45876Sbostic 	return (RET_SUCCESS);
2941*45876Sbostic }
2942*45876Sbostic 
2943*45876Sbostic /*
2944*45876Sbostic  *  _BT_BINSRCH -- Do a binary search for a given key on the current page.
2945*45876Sbostic  *
2946*45876Sbostic  *	Searches on internal pages are handled slightly differently from
2947*45876Sbostic  *	searches on leaf pages.  This is because internal page searches
2948*45876Sbostic  *	find the largest item <= key in the tree, and leaf searches find
2949*45876Sbostic  *	the smallest item >= key.  This guarantees that leaf page searches
2950*45876Sbostic  *	leave us pointing at the item's correct position, and internal
2951*45876Sbostic  *	searches descend the tree correctly.
2952*45876Sbostic  *
2953*45876Sbostic  *	Parameters:
2954*45876Sbostic  *		t -- tree to search
2955*45876Sbostic  *		key -- key we're looking for
2956*45876Sbostic  *
2957*45876Sbostic  *	Returns:
2958*45876Sbostic  *		Index of the line pointer array entry for the (closest)
2959*45876Sbostic  *		match to key on the current page, with "closest" as defined
2960*45876Sbostic  *		above.
2961*45876Sbostic  */
2962*45876Sbostic 
2963*45876Sbostic static index_t
2964*45876Sbostic _bt_binsrch(t, key)
2965*45876Sbostic 	BTREE_P t;
2966*45876Sbostic 	char *key;
2967*45876Sbostic {
2968*45876Sbostic 	index_t lbound, ubound, cur;
2969*45876Sbostic 	BTHEADER *h = t->bt_curpage;
2970*45876Sbostic 	IDATUM *id;
2971*45876Sbostic 	int match = 0;
2972*45876Sbostic 	int r;
2973*45876Sbostic 
2974*45876Sbostic 	lbound = 0;
2975*45876Sbostic 	ubound = NEXTINDEX(h);
2976*45876Sbostic 	if (ubound > 0)
2977*45876Sbostic 		--ubound;
2978*45876Sbostic 
2979*45876Sbostic 	/* do a binary search on the current page */
2980*45876Sbostic 	while ((ubound - lbound) > 1) {
2981*45876Sbostic 		cur = lbound + ((ubound - lbound) / 2);
2982*45876Sbostic 		r = _bt_cmp(t, key, cur);
2983*45876Sbostic 
2984*45876Sbostic 		if (r > 0)
2985*45876Sbostic 			lbound = cur + 1;
2986*45876Sbostic 		else if (r < 0)
2987*45876Sbostic 			ubound = cur;
2988*45876Sbostic 		else {
2989*45876Sbostic 			match++;
2990*45876Sbostic 			break;
2991*45876Sbostic 		}
2992*45876Sbostic 	}
2993*45876Sbostic 
2994*45876Sbostic 	/*
2995*45876Sbostic 	 *  At this point, the binary search terminated because the endpoints
2996*45876Sbostic 	 *  got too close together, or we have a match.  Figure out which
2997*45876Sbostic 	 *  case applies, decide what to do based on the page type (leaf or
2998*45876Sbostic 	 *  internal), and do the right thing.
2999*45876Sbostic 	 */
3000*45876Sbostic 	if (match) {
3001*45876Sbostic 		return (cur);
3002*45876Sbostic 	} else if (ubound != lbound) {
3003*45876Sbostic 		if (h->h_flags & F_LEAF) {
3004*45876Sbostic 			r = _bt_cmp(t, key, lbound);
3005*45876Sbostic 			if (r <= 0) {
3006*45876Sbostic 				return (lbound);
3007*45876Sbostic 			}
3008*45876Sbostic 		} else {
3009*45876Sbostic 			r = _bt_cmp(t, key, ubound);
3010*45876Sbostic 
3011*45876Sbostic 			/* for internal nodes, move as far left as possible */
3012*45876Sbostic 			if (r < 0) {
3013*45876Sbostic 				r = _bt_cmp(t, key, lbound);
3014*45876Sbostic 				if (r < 0 && lbound > 0)
3015*45876Sbostic 					--lbound;
3016*45876Sbostic 				return (lbound);
3017*45876Sbostic 			} else {
3018*45876Sbostic 				return (ubound);
3019*45876Sbostic 			}
3020*45876Sbostic 		}
3021*45876Sbostic 	}
3022*45876Sbostic 
3023*45876Sbostic 	if (h->h_flags & F_LEAF) {
3024*45876Sbostic 		if (ubound < NEXTINDEX(h)) {
3025*45876Sbostic 			r = _bt_cmp(t, key, ubound);
3026*45876Sbostic 			if (r > 0)
3027*45876Sbostic 				ubound++;
3028*45876Sbostic 		}
3029*45876Sbostic 	} else {
3030*45876Sbostic 		/* for internal pages, move as far left as possible */
3031*45876Sbostic 		if (ubound == NEXTINDEX(h))
3032*45876Sbostic 			ubound--;
3033*45876Sbostic 
3034*45876Sbostic 		while (_bt_cmp(t, key, ubound) < 0)
3035*45876Sbostic 			ubound--;
3036*45876Sbostic 	}
3037*45876Sbostic 	return (ubound);
3038*45876Sbostic }
3039*45876Sbostic 
3040*45876Sbostic /*
3041*45876Sbostic  *  BT_SEQ -- Sequential scan interface.
3042*45876Sbostic  *
3043*45876Sbostic  *	This routine supports sequential scans on the btree.  If called with
3044*45876Sbostic  *	flags set to R_CURSOR, or if no seq scan has been initialized in the
3045*45876Sbostic  *	current tree, we initialize the scan.  Otherwise, we advance the scan
3046*45876Sbostic  *	and return the next item.
3047*45876Sbostic  *
3048*45876Sbostic  *	Scans can be either keyed or non-keyed.  Keyed scans basically have
3049*45876Sbostic  *	a starting point somewhere in the middle of the tree.  Non-keyed
3050*45876Sbostic  *	scans start at an endpoint.  Also, scans can be backward (descending
3051*45876Sbostic  *	order), forward (ascending order), or no movement (keep returning
3052*45876Sbostic  *	the same item).
3053*45876Sbostic  *
3054*45876Sbostic  *	Flags is checked every time we enter the routine, so the user can
3055*45876Sbostic  *	change directions on an active scan if desired.  The key argument
3056*45876Sbostic  *	is examined only when we initialize the scan, in order to position
3057*45876Sbostic  *	it properly.
3058*45876Sbostic  *
3059*45876Sbostic  *	Items are returned via the key and data arguments passed in.
3060*45876Sbostic  *
3061*45876Sbostic  *	Parameters:
3062*45876Sbostic  *		tree -- btree in which to do scan
3063*45876Sbostic  *		key -- key, used to position scan on initialization, and
3064*45876Sbostic  *		       used to return key components to the user.
3065*45876Sbostic  *		data -- used to return data components to the user.
3066*45876Sbostic  *		flags -- specify R_CURSOR, R_FIRST, R_LAST, R_NEXT, or
3067*45876Sbostic  *			 R_PREV.
3068*45876Sbostic  *
3069*45876Sbostic  *	Returns:
3070*45876Sbostic  *		RET_SUCCESS, RET_ERROR, or RET_SPECIAL if no more data
3071*45876Sbostic  *		exists in the tree in the specified direction.
3072*45876Sbostic  *
3073*45876Sbostic  *	Side Effects:
3074*45876Sbostic  *		Changes the btree's notion of the current position in the
3075*45876Sbostic  *		scan.
3076*45876Sbostic  *
3077*45876Sbostic  *	Warnings:
3078*45876Sbostic  *		The key and data items returned are static and will be
3079*45876Sbostic  *		overwritten by the next bt_get or bt_seq.
3080*45876Sbostic  */
3081*45876Sbostic 
3082*45876Sbostic bt_seq(tree, key, data, flags)
3083*45876Sbostic 	BTREE tree;
3084*45876Sbostic 	DBT *key;
3085*45876Sbostic 	DBT *data;
3086*45876Sbostic 	int flags;
3087*45876Sbostic {
3088*45876Sbostic 	BTREE_P t = (BTREE_P) tree;
3089*45876Sbostic 	BTHEADER *h;
3090*45876Sbostic 	DATUM *d;
3091*45876Sbostic 	int status;
3092*45876Sbostic 
3093*45876Sbostic 	/* do we need to initialize the scan? */
3094*45876Sbostic 	if (flags == R_CURSOR || flags == R_LAST || flags == R_FIRST
3095*45876Sbostic 	    || !(t->bt_flags & BTF_SEQINIT)) {
3096*45876Sbostic 
3097*45876Sbostic 		/* initialize it */
3098*45876Sbostic 		status = _bt_seqinit(t, key, flags);
3099*45876Sbostic 	} else {
3100*45876Sbostic 		/* just advance the current scan pointer */
3101*45876Sbostic 		status = _bt_seqadvance(t, flags);
3102*45876Sbostic 	}
3103*45876Sbostic 
3104*45876Sbostic 	key->size = data->size = 0;
3105*45876Sbostic 	key->data = data->data = (char *) NULL;
3106*45876Sbostic 
3107*45876Sbostic 	h = t->bt_curpage;
3108*45876Sbostic 
3109*45876Sbostic 	/* is there a valid item at the current scan location? */
3110*45876Sbostic 	if (status == RET_SPECIAL) {
3111*45876Sbostic 		if (flags == R_NEXT) {
3112*45876Sbostic 			if (t->bt_cursor.c_index >= NEXTINDEX(h)) {
3113*45876Sbostic 				if (NEXTINDEX(h) > 0)
3114*45876Sbostic 					t->bt_cursor.c_index = NEXTINDEX(h) - 1;
3115*45876Sbostic 				else
3116*45876Sbostic 					t->bt_cursor.c_index = 0;
3117*45876Sbostic 			}
3118*45876Sbostic 		} else {
3119*45876Sbostic 			t->bt_cursor.c_index = 0;
3120*45876Sbostic 		}
3121*45876Sbostic 		return (RET_SPECIAL);
3122*45876Sbostic 	} else if (status == RET_ERROR)
3123*45876Sbostic 		return (RET_ERROR);
3124*45876Sbostic 
3125*45876Sbostic 	/* okay, return the data */
3126*45876Sbostic 	d = (DATUM *) GETDATUM(h, t->bt_cursor.c_index);
3127*45876Sbostic 
3128*45876Sbostic 	return (_bt_buildret(t, d, data, key));
3129*45876Sbostic }
3130*45876Sbostic 
3131*45876Sbostic /*
3132*45876Sbostic  *  _BT_BUILDRET -- Build return key/data pair as a result of search or scan.
3133*45876Sbostic  *
3134*45876Sbostic  *	This routine manages the statically allocated buffers in which we
3135*45876Sbostic  *	return data to the user.
3136*45876Sbostic  *
3137*45876Sbostic  *	Parameters:
3138*45876Sbostic  *		t -- btree from which to return datum
3139*45876Sbostic  *		d -- DATUM to be returned to the user.
3140*45876Sbostic  *		data -- data argument supplied by user for return
3141*45876Sbostic  *		key -- key argument supplied by user for return
3142*45876Sbostic  *
3143*45876Sbostic  *	Returns:
3144*45876Sbostic  *		RET_SUCCESS, RET_ERROR.
3145*45876Sbostic  *
3146*45876Sbostic  *	Side Effects:
3147*45876Sbostic  *		May free and reallocate static buffers, if the data
3148*45876Sbostic  *		we want to return is bigger than the space we have to
3149*45876Sbostic  *		do so.
3150*45876Sbostic  */
3151*45876Sbostic 
3152*45876Sbostic static int
3153*45876Sbostic _bt_buildret(t, d, data, key)
3154*45876Sbostic 	BTREE_P t;
3155*45876Sbostic 	DATUM *d;
3156*45876Sbostic 	DBT *data;
3157*45876Sbostic 	DBT *key;
3158*45876Sbostic {
3159*45876Sbostic 	static int _data_s = 0;
3160*45876Sbostic 	static int _key_s = 0;
3161*45876Sbostic 	static char *_data = (char *) NULL;
3162*45876Sbostic 	static char *_key = (char *) NULL;
3163*45876Sbostic 	char *from, *where, *top;
3164*45876Sbostic 	pgno_t save;
3165*45876Sbostic 	pgno_t chain;
3166*45876Sbostic 	size_t nbytes;
3167*45876Sbostic 	size_t nhere;
3168*45876Sbostic 	BTHEADER *h;
3169*45876Sbostic 
3170*45876Sbostic 	if (d->d_flags & D_BIGKEY) {
3171*45876Sbostic 		if (_key != (char *) NULL)
3172*45876Sbostic 			(void) free(_key);
3173*45876Sbostic 		(void) bcopy((char *) &(d->d_bytes[0]),
3174*45876Sbostic 		      	     (char *) &chain,
3175*45876Sbostic 		      	     sizeof(chain));
3176*45876Sbostic 		if (_bt_getbig(t, chain, &_key, &_key_s) == RET_ERROR)
3177*45876Sbostic 			return (RET_ERROR);
3178*45876Sbostic 		key->data = _key;
3179*45876Sbostic 		key->size = _key_s;
3180*45876Sbostic 	} else {
3181*45876Sbostic 		/* need more space for key? */
3182*45876Sbostic 		if (d->d_ksize > _key_s) {
3183*45876Sbostic 			if (_key != (char *) NULL)
3184*45876Sbostic 				(void) free (_key);
3185*45876Sbostic 			if ((_key = (char *) malloc((unsigned) d->d_ksize))
3186*45876Sbostic 			    == (char *) NULL)
3187*45876Sbostic 				return (RET_ERROR);
3188*45876Sbostic 			_key_s = d->d_ksize;
3189*45876Sbostic 		}
3190*45876Sbostic 
3191*45876Sbostic 		key->data = _key;
3192*45876Sbostic 		if ((key->size = d->d_ksize) > 0)
3193*45876Sbostic 			(void) bcopy(&(d->d_bytes[0]),
3194*45876Sbostic 				     _key,
3195*45876Sbostic 				     (int) d->d_ksize);
3196*45876Sbostic 	}
3197*45876Sbostic 
3198*45876Sbostic 	if (d->d_flags & D_BIGDATA) {
3199*45876Sbostic 		if (_data != (char *) NULL)
3200*45876Sbostic 			(void) free(_data);
3201*45876Sbostic 		(void) bcopy(&(d->d_bytes[d->d_ksize]),
3202*45876Sbostic 		      	     (char *) &chain,
3203*45876Sbostic 		      	     sizeof(chain));
3204*45876Sbostic 		if (_bt_getbig(t, chain, &_data, &_data_s) == RET_ERROR)
3205*45876Sbostic 			return (RET_ERROR);
3206*45876Sbostic 		data->data = _data;
3207*45876Sbostic 		data->size = _data_s;
3208*45876Sbostic 	} else {
3209*45876Sbostic 		/* need more space for data? */
3210*45876Sbostic 		if (d->d_dsize > _data_s) {
3211*45876Sbostic 			if (_data != (char *) NULL)
3212*45876Sbostic 				(void) free (_data);
3213*45876Sbostic 			if ((_data = (char *) malloc((unsigned) d->d_dsize))
3214*45876Sbostic 			    == (char *) NULL)
3215*45876Sbostic 				return (RET_ERROR);
3216*45876Sbostic 			_data_s = d->d_dsize;
3217*45876Sbostic 		}
3218*45876Sbostic 
3219*45876Sbostic 		data->data = _data;
3220*45876Sbostic 
3221*45876Sbostic 		if ((data->size = d->d_dsize) > 0)
3222*45876Sbostic 			(void) bcopy(&(d->d_bytes[d->d_ksize]),
3223*45876Sbostic 				      _data,
3224*45876Sbostic 				      d->d_dsize);
3225*45876Sbostic 	}
3226*45876Sbostic 
3227*45876Sbostic 	return (RET_SUCCESS);
3228*45876Sbostic }
3229*45876Sbostic 
3230*45876Sbostic /*
3231*45876Sbostic  *  _BT_SEQINIT -- Initialize a sequential scan on the btree.
3232*45876Sbostic  *
3233*45876Sbostic  *	Sets the tree's notion of the current scan location correctly
3234*45876Sbostic  *	given a key and a direction.
3235*45876Sbostic  *
3236*45876Sbostic  *	Parameters:
3237*45876Sbostic  *		t -- tree in which to initialize scan
3238*45876Sbostic  *		key -- key for initial scan position
3239*45876Sbostic  *		flags -- R_NEXT, R_PREV
3240*45876Sbostic  *
3241*45876Sbostic  *	Returns:
3242*45876Sbostic  *		RET_SUCCESS, RET_ERROR, or RET_SPECIAL if there's no data
3243*45876Sbostic  *		in the tree to scan.
3244*45876Sbostic  *
3245*45876Sbostic  *	Side Effects:
3246*45876Sbostic  *		Changes current scan position for the tree.  Almost certainly
3247*45876Sbostic  *		changes current page, as well.  Sets BTF_SEQINIT bit in tree
3248*45876Sbostic  *		flags, so that we know we've initialized a scan.
3249*45876Sbostic  */
3250*45876Sbostic 
3251*45876Sbostic static int
3252*45876Sbostic _bt_seqinit(t, key, flags)
3253*45876Sbostic 	BTREE_P t;
3254*45876Sbostic 	DBT *key;
3255*45876Sbostic 	int flags;
3256*45876Sbostic {
3257*45876Sbostic 	BTITEM *item;
3258*45876Sbostic 	BTHEADER *h;
3259*45876Sbostic 	CURSOR *c;
3260*45876Sbostic 	IDATUM *id;
3261*45876Sbostic 	pgno_t pgno;
3262*45876Sbostic 	index_t index;
3263*45876Sbostic 	index_t last;
3264*45876Sbostic 
3265*45876Sbostic 	/*
3266*45876Sbostic 	 *  Figure out if we really have to search for the key that the
3267*45876Sbostic 	 *  user supplied.  If key is null, then this is an unkeyed scan
3268*45876Sbostic 	 *  and we can just start from an endpoint.
3269*45876Sbostic 	 */
3270*45876Sbostic 
3271*45876Sbostic 	c = &(t->bt_cursor);
3272*45876Sbostic 
3273*45876Sbostic 	if (flags == R_CURSOR) {
3274*45876Sbostic 		if (key->data != (char *) NULL) {
3275*45876Sbostic 
3276*45876Sbostic 			/* key supplied, find it */
3277*45876Sbostic 			item = _bt_search(t, key);
3278*45876Sbostic 			c->c_index = item->bti_index;
3279*45876Sbostic 			c->c_pgno = t->bt_curpage->h_pgno;
3280*45876Sbostic 		} else {
3281*45876Sbostic 			errno = EINVAL;
3282*45876Sbostic 			return (RET_ERROR);
3283*45876Sbostic 		}
3284*45876Sbostic 
3285*45876Sbostic 	} else {
3286*45876Sbostic 
3287*45876Sbostic 		/*
3288*45876Sbostic 		 *  Unkeyed scan.  For backward scans, find the last item
3289*45876Sbostic 		 *  in the tree; for forward scans, find the first item.
3290*45876Sbostic 		 */
3291*45876Sbostic 
3292*45876Sbostic 		if (_bt_getpage(t, P_ROOT) == RET_ERROR)
3293*45876Sbostic 			return (RET_ERROR);
3294*45876Sbostic 		h = t->bt_curpage;
3295*45876Sbostic 		if (flags == R_LAST || flags == R_PREV) {
3296*45876Sbostic 
3297*45876Sbostic 			/* backward scan */
3298*45876Sbostic 			while (!(h->h_flags & F_LEAF)) {
3299*45876Sbostic 				last = NEXTINDEX(h) - 1;
3300*45876Sbostic 				id = (IDATUM *) GETDATUM(h,last);
3301*45876Sbostic 				if (_bt_getpage(t, id->i_pgno) == RET_ERROR)
3302*45876Sbostic 					return (RET_ERROR);
3303*45876Sbostic 				h = t->bt_curpage;
3304*45876Sbostic 			}
3305*45876Sbostic 
3306*45876Sbostic 			/* skip empty pages */
3307*45876Sbostic 			while (NEXTINDEX(h) == 0 && h->h_prevpg != P_NONE) {
3308*45876Sbostic 				if (_bt_getpage(t, h->h_prevpg) == RET_ERROR)
3309*45876Sbostic 					return (RET_ERROR);
3310*45876Sbostic 				h = t->bt_curpage;
3311*45876Sbostic 			}
3312*45876Sbostic 
3313*45876Sbostic 			c->c_pgno = h->h_pgno;
3314*45876Sbostic 			if (NEXTINDEX(h) > 0)
3315*45876Sbostic 				c->c_index = NEXTINDEX(h) - 1;
3316*45876Sbostic 			else
3317*45876Sbostic 				c->c_index = 0;
3318*45876Sbostic 		} else if (flags == R_FIRST || flags == R_NEXT) {
3319*45876Sbostic 			/* forward scan */
3320*45876Sbostic 			while (!(h->h_flags & F_LEAF)) {
3321*45876Sbostic 				id = (IDATUM *) GETDATUM(h,0);
3322*45876Sbostic 				if (_bt_getpage(t, id->i_pgno) == RET_ERROR)
3323*45876Sbostic 					return (RET_ERROR);
3324*45876Sbostic 				h = t->bt_curpage;
3325*45876Sbostic 			}
3326*45876Sbostic 
3327*45876Sbostic 			/* skip empty pages */
3328*45876Sbostic 			while (NEXTINDEX(h) == 0 && h->h_nextpg != P_NONE) {
3329*45876Sbostic 				if (_bt_getpage(t, h->h_nextpg) == RET_ERROR)
3330*45876Sbostic 					return (RET_ERROR);
3331*45876Sbostic 				h = t->bt_curpage;
3332*45876Sbostic 			}
3333*45876Sbostic 
3334*45876Sbostic 			c->c_pgno = h->h_pgno;
3335*45876Sbostic 			c->c_index = 0;
3336*45876Sbostic 		} else {
3337*45876Sbostic 			/* no flags passed in */
3338*45876Sbostic 			errno = EINVAL;
3339*45876Sbostic 			return (RET_ERROR);
3340*45876Sbostic 		}
3341*45876Sbostic 	}
3342*45876Sbostic 
3343*45876Sbostic 	/* okay, scan is initialized */
3344*45876Sbostic 	t->bt_flags |= BTF_SEQINIT;
3345*45876Sbostic 
3346*45876Sbostic 	if (c->c_index == NEXTINDEX(t->bt_curpage))
3347*45876Sbostic 		return (RET_SPECIAL);
3348*45876Sbostic 
3349*45876Sbostic 	return (RET_SUCCESS);
3350*45876Sbostic }
3351*45876Sbostic 
3352*45876Sbostic /*
3353*45876Sbostic  *  _BT_SEQADVANCE -- Advance the sequential scan on this tree.
3354*45876Sbostic  *
3355*45876Sbostic  *	Moves the current location pointer for the scan on this tree one
3356*45876Sbostic  *	spot in the requested direction.
3357*45876Sbostic  *
3358*45876Sbostic  *	Parameters:
3359*45876Sbostic  *		t -- btree being scanned
3360*45876Sbostic  *		flags -- for R_NEXT, R_PREV
3361*45876Sbostic  *
3362*45876Sbostic  *	Returns:
3363*45876Sbostic  *		RET_SUCCESS, RET_ERROR, or RET_SPECIAL if there is no
3364*45876Sbostic  *		more data in the specified direction.
3365*45876Sbostic  *
3366*45876Sbostic  *	Side Effects:
3367*45876Sbostic  *		May change current page.
3368*45876Sbostic  */
3369*45876Sbostic 
3370*45876Sbostic static int
3371*45876Sbostic _bt_seqadvance(t, flags)
3372*45876Sbostic 	BTREE_P t;
3373*45876Sbostic 	int flags;
3374*45876Sbostic {
3375*45876Sbostic 	BTHEADER *h;
3376*45876Sbostic 	CURSOR *c;
3377*45876Sbostic 	index_t index;
3378*45876Sbostic 
3379*45876Sbostic 	c = &(t->bt_cursor);
3380*45876Sbostic 	index = c->c_index;
3381*45876Sbostic 
3382*45876Sbostic 	if (_bt_getpage(t, c->c_pgno) == RET_ERROR)
3383*45876Sbostic 		return (RET_ERROR);
3384*45876Sbostic 	h = t->bt_curpage;
3385*45876Sbostic 
3386*45876Sbostic 	/* by the time we get here, don't need the cursor key anymore */
3387*45876Sbostic 	if (c->c_key != (char *) NULL)
3388*45876Sbostic 		(void) free(c->c_key);
3389*45876Sbostic 
3390*45876Sbostic 	if (flags == R_NEXT) {
3391*45876Sbostic 
3392*45876Sbostic 		/*
3393*45876Sbostic 		 *  This is a forward scan.  If the cursor is pointing
3394*45876Sbostic 		 *  at a virtual record (that is, it was pointing at
3395*45876Sbostic 		 *  a record that got deleted), then we should return
3396*45876Sbostic 		 *  the record it's pointing at now.  Otherwise, we
3397*45876Sbostic 		 *  should advance the scan.  In either case, we need
3398*45876Sbostic 		 *  to be careful not to run off the end of the current
3399*45876Sbostic 		 *  page.
3400*45876Sbostic 		 */
3401*45876Sbostic 
3402*45876Sbostic 		if (c->c_flags & CRSR_BEFORE) {
3403*45876Sbostic 
3404*45876Sbostic 			if (index >= NEXTINDEX(h)) {
3405*45876Sbostic 				/* out of items on this page, get next page */
3406*45876Sbostic 				if (h->h_nextpg == P_NONE) {
3407*45876Sbostic 					/* tell caller we're done... */
3408*45876Sbostic 					c->c_index = NEXTINDEX(h);
3409*45876Sbostic 					return (RET_SPECIAL);
3410*45876Sbostic 				}
3411*45876Sbostic 
3412*45876Sbostic 				/* skip empty pages */
3413*45876Sbostic 				do {
3414*45876Sbostic 					if (_bt_getpage(t, h->h_nextpg)
3415*45876Sbostic 					    == RET_ERROR) {
3416*45876Sbostic 						c->c_index = NEXTINDEX(h);
3417*45876Sbostic 						return (RET_ERROR);
3418*45876Sbostic 					}
3419*45876Sbostic 					h = t->bt_curpage;
3420*45876Sbostic 					c->c_pgno = h->h_pgno;
3421*45876Sbostic 				} while (NEXTINDEX(h) == 0
3422*45876Sbostic 					 && h->h_nextpg != P_NONE);
3423*45876Sbostic 
3424*45876Sbostic 				if (NEXTINDEX(h) == 0) {
3425*45876Sbostic 					/* tell caller we're done */
3426*45876Sbostic 					c->c_index = NEXTINDEX(h);
3427*45876Sbostic 					return (RET_SPECIAL);
3428*45876Sbostic 				}
3429*45876Sbostic 				index = 0;
3430*45876Sbostic 			}
3431*45876Sbostic 			c->c_flags &= ~CRSR_BEFORE;
3432*45876Sbostic 
3433*45876Sbostic 		} else if (++index >= NEXTINDEX(h)) {
3434*45876Sbostic 
3435*45876Sbostic 			/* out of items on this page, get next page */
3436*45876Sbostic 			if (h->h_nextpg == P_NONE) {
3437*45876Sbostic 				/* tell caller we're done... */
3438*45876Sbostic 				c->c_index = NEXTINDEX(h);
3439*45876Sbostic 				return (RET_SPECIAL);
3440*45876Sbostic 			}
3441*45876Sbostic 
3442*45876Sbostic 			/* skip empty pages */
3443*45876Sbostic 			do {
3444*45876Sbostic 				if (_bt_getpage(t, h->h_nextpg) == RET_ERROR) {
3445*45876Sbostic 					c->c_index = NEXTINDEX(h);
3446*45876Sbostic 					return (RET_ERROR);
3447*45876Sbostic 				}
3448*45876Sbostic 				h = t->bt_curpage;
3449*45876Sbostic 				c->c_pgno = h->h_pgno;
3450*45876Sbostic 			} while (NEXTINDEX(h) == 0 && h->h_nextpg != P_NONE);
3451*45876Sbostic 
3452*45876Sbostic 			if (NEXTINDEX(h) == 0) {
3453*45876Sbostic 				/* tell caller we're done */
3454*45876Sbostic 				c->c_index = NEXTINDEX(h);
3455*45876Sbostic 				return (RET_SPECIAL);
3456*45876Sbostic 			}
3457*45876Sbostic 			index = 0;
3458*45876Sbostic 		}
3459*45876Sbostic 	} else if (flags == R_PREV) {
3460*45876Sbostic 
3461*45876Sbostic 		/* for backward scans, life is substantially easier */
3462*45876Sbostic 		c->c_flags &= ~CRSR_BEFORE;
3463*45876Sbostic 		if (c->c_key != (char *) NULL) {
3464*45876Sbostic 			(void) free(c->c_key);
3465*45876Sbostic 			c->c_key = (char *) NULL;
3466*45876Sbostic 		}
3467*45876Sbostic 
3468*45876Sbostic 		if (index == 0) {
3469*45876Sbostic 
3470*45876Sbostic 			/* we may be done */
3471*45876Sbostic 			c->c_index = 0;
3472*45876Sbostic 
3473*45876Sbostic 			/* out of items on this page, get next page */
3474*45876Sbostic 			if (h->h_prevpg == P_NONE)
3475*45876Sbostic 				return (RET_SPECIAL);
3476*45876Sbostic 
3477*45876Sbostic 			/* skip empty pages */
3478*45876Sbostic 			do {
3479*45876Sbostic 				if (_bt_getpage(t, h->h_prevpg) == RET_ERROR)
3480*45876Sbostic 					return (RET_ERROR);
3481*45876Sbostic 				h = t->bt_curpage;
3482*45876Sbostic 				c->c_pgno = h->h_pgno;
3483*45876Sbostic 			} while (NEXTINDEX(h) == 0 && h->h_prevpg != P_NONE);
3484*45876Sbostic 
3485*45876Sbostic 			if (NEXTINDEX(h) == 0)
3486*45876Sbostic 				return (RET_SPECIAL);
3487*45876Sbostic 
3488*45876Sbostic 			index = NEXTINDEX(h) - 1;
3489*45876Sbostic 		} else
3490*45876Sbostic 			--index;
3491*45876Sbostic 	} else {
3492*45876Sbostic 		/* must specify a direction */
3493*45876Sbostic 		errno = EINVAL;
3494*45876Sbostic 		return (RET_ERROR);
3495*45876Sbostic 	}
3496*45876Sbostic 
3497*45876Sbostic 	c->c_index = index;
3498*45876Sbostic 	return (RET_SUCCESS);
3499*45876Sbostic }
3500*45876Sbostic 
3501*45876Sbostic /*
3502*45876Sbostic  *  BT_CLOSE -- Close a btree
3503*45876Sbostic  *
3504*45876Sbostic  *	Parameters:
3505*45876Sbostic  *		tree -- tree to close
3506*45876Sbostic  *
3507*45876Sbostic  *	Returns:
3508*45876Sbostic  *		RET_SUCCESS, RET_ERROR.
3509*45876Sbostic  *
3510*45876Sbostic  *	Side Effects:
3511*45876Sbostic  *		Frees space occupied by the tree.
3512*45876Sbostic  */
3513*45876Sbostic 
3514*45876Sbostic int
3515*45876Sbostic bt_close(tree)
3516*45876Sbostic 	BTREE tree;
3517*45876Sbostic {
3518*45876Sbostic 	BTREE_P t = (BTREE_P) tree;
3519*45876Sbostic 	int i;
3520*45876Sbostic 	BTHEADER *h;
3521*45876Sbostic 	char *cache;
3522*45876Sbostic 	struct HTBUCKET *b, *sb;
3523*45876Sbostic 	HTABLE ht;
3524*45876Sbostic 	int fd;
3525*45876Sbostic 
3526*45876Sbostic 	if (t->bt_cursor.c_key != (char *) NULL)
3527*45876Sbostic 		(void) free(t->bt_cursor.c_key);
3528*45876Sbostic 
3529*45876Sbostic 	if (!ISDISK(t)) {
3530*45876Sbostic 		/* in-memory tree, release hash table memory */
3531*45876Sbostic 		ht = t->bt_s.bt_ht;
3532*45876Sbostic 		for (i = 0; i < HTSIZE; i++) {
3533*45876Sbostic 			if ((b = ht[i]) == (struct HTBUCKET *) NULL)
3534*45876Sbostic 				break;
3535*45876Sbostic 			do {
3536*45876Sbostic 				sb = b;
3537*45876Sbostic 				(void) free((char *) b->ht_page);
3538*45876Sbostic 				b = b->ht_next;
3539*45876Sbostic 				(void) free((char *) sb);
3540*45876Sbostic 			} while (b != (struct HTBUCKET *) NULL);
3541*45876Sbostic 		}
3542*45876Sbostic 		(void) free ((char *) ht);
3543*45876Sbostic 		(void) free ((char *) t);
3544*45876Sbostic 		return (RET_SUCCESS);
3545*45876Sbostic 	}
3546*45876Sbostic 
3547*45876Sbostic 	if ((t->bt_flags & BTF_ISWRITE) && !(t->bt_flags & BTF_METAOK)) {
3548*45876Sbostic 		if (_bt_wrtmeta(t) == RET_ERROR)
3549*45876Sbostic 			return (RET_ERROR);
3550*45876Sbostic 	}
3551*45876Sbostic 
3552*45876Sbostic 	if (t->bt_curpage != (BTHEADER *) NULL) {
3553*45876Sbostic 		h = t->bt_curpage;
3554*45876Sbostic 		if (h->h_flags & F_DIRTY) {
3555*45876Sbostic 			if (_bt_write(t, h, RELEASE) == RET_ERROR)
3556*45876Sbostic 				return (RET_ERROR);
3557*45876Sbostic 		} else {
3558*45876Sbostic 			if (_bt_release(t, h) == RET_ERROR)
3559*45876Sbostic 				return (RET_ERROR);
3560*45876Sbostic 		}
3561*45876Sbostic 
3562*45876Sbostic 		/* flush and free the cache, if there is one */
3563*45876Sbostic 		if (ISCACHE(t)) {
3564*45876Sbostic 			cache = t->bt_s.bt_d.d_cache;
3565*45876Sbostic 			lrusync(cache);
3566*45876Sbostic 			lrufree(cache);
3567*45876Sbostic 		}
3568*45876Sbostic 		(void) free ((char *) h);
3569*45876Sbostic 	}
3570*45876Sbostic 
3571*45876Sbostic 	fd = t->bt_s.bt_d.d_fd;
3572*45876Sbostic 	(void) free ((char *) t);
3573*45876Sbostic 	return (close(fd));
3574*45876Sbostic }
3575*45876Sbostic 
3576*45876Sbostic /*
3577*45876Sbostic  *  _BT_ALLOCPG -- allocate a new page in the btree.
3578*45876Sbostic  *
3579*45876Sbostic  *	This is called when we split a page, to get space to do the split.
3580*45876Sbostic  *	For disk btrees, these pages are released when the split is done.
3581*45876Sbostic  *	For memory btrees, they are not.
3582*45876Sbostic  *
3583*45876Sbostic  *	Parameters:
3584*45876Sbostic  *		t -- tree in which to do split
3585*45876Sbostic  *
3586*45876Sbostic  *	Returns:
3587*45876Sbostic  *		Pointer to the newly-allocated page
3588*45876Sbostic  */
3589*45876Sbostic 
3590*45876Sbostic static BTHEADER *
3591*45876Sbostic _bt_allocpg(t)
3592*45876Sbostic 	BTREE_P t;
3593*45876Sbostic {
3594*45876Sbostic 	BTHEADER *h = t->bt_curpage;
3595*45876Sbostic 	BTHEADER *nh;
3596*45876Sbostic 	char *cache;
3597*45876Sbostic 	int nbytes;
3598*45876Sbostic 
3599*45876Sbostic 	/* if we have a cache, let the cache code do the work */
3600*45876Sbostic 	if (ISDISK(t) && ISCACHE(t)) {
3601*45876Sbostic 		nh = (BTHEADER *) lrugetnew(t->bt_s.bt_d.d_cache,
3602*45876Sbostic 					    t->bt_npages + 1,
3603*45876Sbostic 					    &nbytes);
3604*45876Sbostic 	} else {
3605*45876Sbostic 		nh = (BTHEADER *) malloc((unsigned) t->bt_psize);
3606*45876Sbostic 	}
3607*45876Sbostic 
3608*45876Sbostic 	if (nh != (BTHEADER *) NULL) {
3609*45876Sbostic 		nh->h_pgno = nh->h_prevpg = nh->h_nextpg = P_NONE;
3610*45876Sbostic 		nh->h_flags = h->h_flags;
3611*45876Sbostic 		nh->h_lower = (index_t)
3612*45876Sbostic 				(((char *) &(nh->h_linp[0])) - ((char *) nh));
3613*45876Sbostic 		nh->h_upper = t->bt_psize;
3614*45876Sbostic 	}
3615*45876Sbostic 
3616*45876Sbostic 	return (nh);
3617*45876Sbostic }
3618*45876Sbostic 
3619*45876Sbostic /*
3620*45876Sbostic  *  _BT_WRITE -- Write a specific page to a btree file.
3621*45876Sbostic  *
3622*45876Sbostic  *	Parameters:
3623*45876Sbostic  *		t -- btree to get the page
3624*45876Sbostic  *		h -- page to write
3625*45876Sbostic  *		relflag -- (int) this page may/may not be released
3626*45876Sbostic  *
3627*45876Sbostic  *	Returns:
3628*45876Sbostic  *		RET_SUCCESS, RET_ERROR.
3629*45876Sbostic  *
3630*45876Sbostic  *	Side Effects:
3631*45876Sbostic  *		Writes a metadata page if none has been written yet.
3632*45876Sbostic  */
3633*45876Sbostic 
3634*45876Sbostic static int
3635*45876Sbostic _bt_write(t, h, relflag)
3636*45876Sbostic 	BTREE_P t;
3637*45876Sbostic 	BTHEADER *h;
3638*45876Sbostic 	int relflag;
3639*45876Sbostic {
3640*45876Sbostic 	long pos;
3641*45876Sbostic 	int htindex;
3642*45876Sbostic 	HTBUCKET *b;
3643*45876Sbostic 	char *cache;
3644*45876Sbostic 	BTMETA m;
3645*45876Sbostic 	pgno_t pgno;
3646*45876Sbostic 
3647*45876Sbostic 	h->h_flags &= ~F_DIRTY;
3648*45876Sbostic 	if (ISDISK(t)) {
3649*45876Sbostic 
3650*45876Sbostic 		/* if we haven't done so yet, write the metadata */
3651*45876Sbostic 		if (!(t->bt_flags & BTF_METAOK)) {
3652*45876Sbostic 			if (_bt_wrtmeta(t) == RET_ERROR)
3653*45876Sbostic 				return (RET_ERROR);
3654*45876Sbostic 		}
3655*45876Sbostic 
3656*45876Sbostic 		pgno = h->h_pgno;
3657*45876Sbostic 
3658*45876Sbostic 
3659*45876Sbostic 		/* if we have a cache, let the cache code do the work */
3660*45876Sbostic 		if ((cache = t->bt_s.bt_d.d_cache) != (char *) NULL) {
3661*45876Sbostic 			if (lruwrite(cache, pgno) == RET_ERROR)
3662*45876Sbostic 				return (RET_ERROR);
3663*45876Sbostic 			if (relflag && lrurelease(cache, pgno) == RET_ERROR)
3664*45876Sbostic 				return (RET_ERROR);
3665*45876Sbostic 
3666*45876Sbostic 		} else {
3667*45876Sbostic 			(void) _bt_pgout(h, (char *) t->bt_lorder);
3668*45876Sbostic 			/* now write the current page */
3669*45876Sbostic 			pos = (long) (pgno * t->bt_psize);
3670*45876Sbostic 			if (lseek(t->bt_s.bt_d.d_fd, pos, L_SET) != pos)
3671*45876Sbostic 				return (RET_ERROR);
3672*45876Sbostic 			if (write(t->bt_s.bt_d.d_fd, h, t->bt_psize)
3673*45876Sbostic 			    < t->bt_psize)
3674*45876Sbostic 				return (RET_ERROR);
3675*45876Sbostic 			if (!relflag)
3676*45876Sbostic 				(void) _bt_pgin(h, (char *) t->bt_lorder);
3677*45876Sbostic 		}
3678*45876Sbostic 	} else {
3679*45876Sbostic 		/* in-memory btree */
3680*45876Sbostic 		htindex = HASHKEY(h->h_pgno);
3681*45876Sbostic 
3682*45876Sbostic 		/* see if we need to overwrite existing entry */
3683*45876Sbostic 		for (b = t->bt_s.bt_ht[htindex];
3684*45876Sbostic 		     b != (HTBUCKET *) NULL;
3685*45876Sbostic 		     b = b->ht_next) {
3686*45876Sbostic 			if (b->ht_pgno == h->h_pgno) {
3687*45876Sbostic 				b->ht_page = h;
3688*45876Sbostic 				return (RET_SUCCESS);
3689*45876Sbostic 			}
3690*45876Sbostic 		}
3691*45876Sbostic 
3692*45876Sbostic 		/* new entry, write it */
3693*45876Sbostic 		b = (HTBUCKET *) malloc((unsigned) sizeof (HTBUCKET));
3694*45876Sbostic 		if (b == (HTBUCKET *) NULL)
3695*45876Sbostic 			return (RET_ERROR);
3696*45876Sbostic 
3697*45876Sbostic 		b->ht_pgno = h->h_pgno;
3698*45876Sbostic 		b->ht_page = h;
3699*45876Sbostic 		b->ht_next = t->bt_s.bt_ht[htindex];
3700*45876Sbostic 		t->bt_s.bt_ht[htindex] = b;
3701*45876Sbostic 	}
3702*45876Sbostic 	return (RET_SUCCESS);
3703*45876Sbostic }
3704*45876Sbostic 
3705*45876Sbostic /*
3706*45876Sbostic  *  _BT_RELEASE -- Release a locked-down cache page
3707*45876Sbostic  *
3708*45876Sbostic  *	During page splits, we want to force pages out to the cache
3709*45876Sbostic  *	before we actually release them, in some cases.  This routine
3710*45876Sbostic  *	releases such a page when it is no longer needed.
3711*45876Sbostic  *
3712*45876Sbostic  *	Parameters:
3713*45876Sbostic  *		t -- btree in which to release page
3714*45876Sbostic  *		h -- page to release
3715*45876Sbostic  *
3716*45876Sbostic  *	Returns:
3717*45876Sbostic  *		RET_SUCCESS, RET_ERROR.
3718*45876Sbostic  *
3719*45876Sbostic  *	Side Effects:
3720*45876Sbostic  *		None.
3721*45876Sbostic  */
3722*45876Sbostic 
3723*45876Sbostic static int
3724*45876Sbostic _bt_release(t, h)
3725*45876Sbostic 	BTREE_P t;
3726*45876Sbostic 	BTHEADER *h;
3727*45876Sbostic {
3728*45876Sbostic 	if (ISDISK(t) && ISCACHE(t)) {
3729*45876Sbostic 		if (lrurelease(t->bt_s.bt_d.d_cache, h->h_pgno) < 0)
3730*45876Sbostic 			return (RET_ERROR);
3731*45876Sbostic 	}
3732*45876Sbostic 	return (RET_SUCCESS);
3733*45876Sbostic }
3734*45876Sbostic 
3735*45876Sbostic /*
3736*45876Sbostic  *  _BT_WRTMETA -- Write metadata to the btree.
3737*45876Sbostic  *
3738*45876Sbostic  *	Parameters:
3739*45876Sbostic  *		t -- tree to which to write
3740*45876Sbostic  *
3741*45876Sbostic  *	Returns:
3742*45876Sbostic  *		RET_SUCCESS, RET_ERROR.
3743*45876Sbostic  */
3744*45876Sbostic 
3745*45876Sbostic static int
3746*45876Sbostic _bt_wrtmeta(t)
3747*45876Sbostic 	BTREE_P t;
3748*45876Sbostic {
3749*45876Sbostic 	BTMETA m;
3750*45876Sbostic 
3751*45876Sbostic 	if (lseek(t->bt_s.bt_d.d_fd, 0l, L_SET) != 0l)
3752*45876Sbostic 		return (RET_ERROR);
3753*45876Sbostic 
3754*45876Sbostic 	/* lorder has to be in host-independent format */
3755*45876Sbostic 	m.m_lorder = (u_long) htonl((long) t->bt_lorder);
3756*45876Sbostic 
3757*45876Sbostic 	m.m_magic = BTREEMAGIC;
3758*45876Sbostic 	m.m_version = BTREEVERSION;
3759*45876Sbostic 	m.m_psize = t->bt_psize;
3760*45876Sbostic 	m.m_free = t->bt_free;
3761*45876Sbostic 	m.m_flags = t->bt_flags & BTF_NODUPS;
3762*45876Sbostic 
3763*45876Sbostic 	if (t->bt_lorder != BYTE_ORDER) {
3764*45876Sbostic 		BLSWAP(m.m_magic);
3765*45876Sbostic 		BLSWAP(m.m_version);
3766*45876Sbostic 		BLSWAP(m.m_psize);
3767*45876Sbostic 		BLSWAP(m.m_free);
3768*45876Sbostic 		BLSWAP(m.m_flags);
3769*45876Sbostic 	}
3770*45876Sbostic 
3771*45876Sbostic 	if (write(t->bt_s.bt_d.d_fd, &m, sizeof(m))
3772*45876Sbostic 	    != sizeof(m)) {
3773*45876Sbostic 		return (RET_ERROR);
3774*45876Sbostic 	}
3775*45876Sbostic 
3776*45876Sbostic 	t->bt_flags |= BTF_METAOK;
3777*45876Sbostic 
3778*45876Sbostic 	return (RET_SUCCESS);
3779*45876Sbostic }
3780*45876Sbostic 
3781*45876Sbostic #ifdef DEBUG
3782*45876Sbostic void
3783*45876Sbostic _btdump(tree)
3784*45876Sbostic 	BTREE tree;
3785*45876Sbostic {
3786*45876Sbostic 	BTREE_P t = (BTREE_P) tree;
3787*45876Sbostic 	DATUM *d;
3788*45876Sbostic 	IDATUM *id;
3789*45876Sbostic 	BTHEADER *h;
3790*45876Sbostic 	pgno_t npages;
3791*45876Sbostic 	pgno_t i;
3792*45876Sbostic 	index_t cur, top;
3793*45876Sbostic 
3794*45876Sbostic 	npages = t->bt_npages;
3795*45876Sbostic 	(void) printf("\"%s\" fd %d pgsz %d curpg %d @ 0x%lx",
3796*45876Sbostic 		t->bt_fname, t->bt_s.bt_d.d_fd,
3797*45876Sbostic 		t->bt_psize, t->bt_curpage);
3798*45876Sbostic 	(void) printf("npg %d cmp 0x%lx flags=(", npages, t->bt_compare);
3799*45876Sbostic 	if (t->bt_flags & BTF_SEQINIT)
3800*45876Sbostic 		(void) printf("BTF_SEQINIT");
3801*45876Sbostic 	(void) printf(")\n");
3802*45876Sbostic 
3803*45876Sbostic 	for (i = P_ROOT; i <= npages; i++) {
3804*45876Sbostic 		if (_bt_getpage(t, i) == RET_ERROR)
3805*45876Sbostic 			_punt();
3806*45876Sbostic 		h = t->bt_curpage;
3807*45876Sbostic 		top = NEXTINDEX(h);
3808*45876Sbostic 		(void) printf("    page %d:\n", i);
3809*45876Sbostic 		(void) printf("\tpgno %d prev %d next %d\n",
3810*45876Sbostic 			h->h_pgno, h->h_prevpg, h->h_nextpg);
3811*45876Sbostic 		(void) printf("\tlower %d upper %d nextind %d flags (",
3812*45876Sbostic 			h->h_lower, h->h_upper, top);
3813*45876Sbostic 		if (h->h_flags & F_LEAF)
3814*45876Sbostic 			(void) printf("F_LEAF");
3815*45876Sbostic 		else
3816*45876Sbostic 			(void) printf("<internal>");
3817*45876Sbostic 		if (h->h_flags & F_DIRTY)
3818*45876Sbostic 			(void) printf("|F_DIRTY");
3819*45876Sbostic 		if (h->h_flags & F_PRESERVE)
3820*45876Sbostic 			(void) printf("|F_PRESERVE");
3821*45876Sbostic 		if (h->h_flags & F_CONT) {
3822*45876Sbostic 			(void) printf("|F_CONT)");
3823*45876Sbostic 			if (h->h_prevpg == P_NONE) {
3824*45876Sbostic 				size_t longsz;
3825*45876Sbostic 				(void) bcopy((char *) &(h->h_linp[0]),
3826*45876Sbostic 					      (char *) &longsz,
3827*45876Sbostic 					      sizeof(longsz));
3828*45876Sbostic 				printf("\n\t\t(chain start, data length %ld)",
3829*45876Sbostic 					longsz);
3830*45876Sbostic 			}
3831*45876Sbostic 			printf("\n");
3832*45876Sbostic 			continue;
3833*45876Sbostic 		}
3834*45876Sbostic 		(void) printf(")\n");
3835*45876Sbostic 		for (cur = 0; cur < top; cur++) {
3836*45876Sbostic 			(void) printf("\t  [%d] off %d ", cur, h->h_linp[cur]);
3837*45876Sbostic 			if (h->h_flags & F_LEAF) {
3838*45876Sbostic 				d = (DATUM *) GETDATUM(h,cur);
3839*45876Sbostic 				(void) printf("ksize %d", d->d_ksize);
3840*45876Sbostic 				if (d->d_flags & D_BIGKEY)
3841*45876Sbostic 					(void) printf(" (indirect)");
3842*45876Sbostic 				(void) printf("; dsize %d", d->d_dsize);
3843*45876Sbostic 				if (d->d_flags & D_BIGDATA)
3844*45876Sbostic 					(void) printf(" (indirect)");
3845*45876Sbostic 			} else {
3846*45876Sbostic 				id = (IDATUM *) GETDATUM(h,cur);
3847*45876Sbostic 				(void) printf("size %d pgno %d",
3848*45876Sbostic 					id->i_size, id->i_pgno);
3849*45876Sbostic 				if (id->i_flags & D_BIGKEY)
3850*45876Sbostic 					(void) printf(" (indirect)");
3851*45876Sbostic 			}
3852*45876Sbostic 			(void) printf("\n");
3853*45876Sbostic 		}
3854*45876Sbostic 		(void) printf("\n");
3855*45876Sbostic 	}
3856*45876Sbostic }
3857*45876Sbostic #endif /* DEBUG */
3858*45876Sbostic 
3859*45876Sbostic /*
3860*45876Sbostic  *  _BT_CMP -- Compare a key to a given item on the current page.
3861*45876Sbostic  *
3862*45876Sbostic  *	This routine is a wrapper for the user's comparison function.
3863*45876Sbostic  *
3864*45876Sbostic  *	Parameters:
3865*45876Sbostic  *		t -- tree in which to do comparison
3866*45876Sbostic  *		p -- pointer to one argument for the comparison function
3867*45876Sbostic  *		n -- index of item to supply second arg to comparison function
3868*45876Sbostic  *
3869*45876Sbostic  *	Returns:
3870*45876Sbostic  *		< 0 if p is < item at n
3871*45876Sbostic  *		= 0 if p is = item at n
3872*45876Sbostic  *		> 0 if p is > item at n
3873*45876Sbostic  */
3874*45876Sbostic 
3875*45876Sbostic static int
3876*45876Sbostic _bt_cmp(t, p, n)
3877*45876Sbostic 	BTREE_P t;
3878*45876Sbostic 	char *p;
3879*45876Sbostic 	index_t n;
3880*45876Sbostic {
3881*45876Sbostic 	BTHEADER *h;
3882*45876Sbostic 	IDATUM *id;
3883*45876Sbostic 	DATUM *d;
3884*45876Sbostic 	char *arg;
3885*45876Sbostic 	int ignore;
3886*45876Sbostic 	int free_arg;
3887*45876Sbostic 	pgno_t chain;
3888*45876Sbostic 	int r;
3889*45876Sbostic 
3890*45876Sbostic 	h = t->bt_curpage;
3891*45876Sbostic 
3892*45876Sbostic 	/*
3893*45876Sbostic 	 *  The left-most key at any level of the tree on internal pages
3894*45876Sbostic 	 *  is guaranteed (artificially, by the code here) to be less than
3895*45876Sbostic 	 *  any user key.  This saves us from having to update the leftmost
3896*45876Sbostic 	 *  key when the user inserts a new key in the tree smaller than
3897*45876Sbostic 	 *  anything we've seen yet.
3898*45876Sbostic 	 */
3899*45876Sbostic 
3900*45876Sbostic 	if (h->h_prevpg == P_NONE && !(h->h_flags & F_LEAF) && n == 0)
3901*45876Sbostic 		return (1);
3902*45876Sbostic 
3903*45876Sbostic 	if (h->h_flags & F_LEAF) {
3904*45876Sbostic 		d = (DATUM *) GETDATUM(h,n);
3905*45876Sbostic 		if (d->d_flags & D_BIGKEY) {
3906*45876Sbostic 			free_arg = TRUE;
3907*45876Sbostic 			bcopy(&(d->d_bytes[0]), (char *) &chain, sizeof(chain));
3908*45876Sbostic 			if (_bt_getbig(t, chain, &arg, &ignore) == RET_ERROR)
3909*45876Sbostic 				return (RET_ERROR);
3910*45876Sbostic 		} else {
3911*45876Sbostic 			free_arg = FALSE;
3912*45876Sbostic 			arg = &(d->d_bytes[0]);
3913*45876Sbostic 		}
3914*45876Sbostic 	} else {
3915*45876Sbostic 		id = (IDATUM *) GETDATUM(h,n);
3916*45876Sbostic 		if (id->i_flags & D_BIGKEY) {
3917*45876Sbostic 			free_arg = TRUE;
3918*45876Sbostic 			bcopy(&(id->i_bytes[0]),
3919*45876Sbostic 			      (char *) &chain,
3920*45876Sbostic 			      sizeof(chain));
3921*45876Sbostic 			if (_bt_getbig(t, chain, &arg, &ignore) == RET_ERROR)
3922*45876Sbostic 				return (RET_ERROR);
3923*45876Sbostic 		} else {
3924*45876Sbostic 			free_arg = FALSE;
3925*45876Sbostic 			arg = &(id->i_bytes[0]);
3926*45876Sbostic 		}
3927*45876Sbostic 	}
3928*45876Sbostic 	r = (*(t->bt_compare))(p, arg);
3929*45876Sbostic 
3930*45876Sbostic 	if (free_arg)
3931*45876Sbostic 		(void) free(arg);
3932*45876Sbostic 
3933*45876Sbostic 	return (r);
3934*45876Sbostic }
3935*45876Sbostic 
3936*45876Sbostic /*
3937*45876Sbostic  *  _BT_PUSH/_BT_POP -- Push/pop a parent page number on the parent stack.
3938*45876Sbostic  *
3939*45876Sbostic  *	When we descend the tree, we keep track of parent pages in order
3940*45876Sbostic  *	to handle splits on insertions.
3941*45876Sbostic  *
3942*45876Sbostic  *	Parameters:
3943*45876Sbostic  *		t -- tree for which to push parent
3944*45876Sbostic  *		pgno -- page number to push (_bt_push only)
3945*45876Sbostic  *
3946*45876Sbostic  *	Returns:
3947*45876Sbostic  *		None.
3948*45876Sbostic  */
3949*45876Sbostic 
3950*45876Sbostic static
3951*45876Sbostic _bt_push(t, pgno)
3952*45876Sbostic 	BTREE_P t;
3953*45876Sbostic 	pgno_t pgno;
3954*45876Sbostic {
3955*45876Sbostic 	BTSTACK *new;
3956*45876Sbostic 
3957*45876Sbostic 	new = (BTSTACK *) malloc((unsigned) sizeof(BTSTACK));
3958*45876Sbostic 	new->bts_pgno = pgno;
3959*45876Sbostic 	new->bts_next = t->bt_stack;
3960*45876Sbostic 	t->bt_stack = new;
3961*45876Sbostic }
3962*45876Sbostic 
3963*45876Sbostic static
3964*45876Sbostic _bt_pop(t)
3965*45876Sbostic 	BTREE_P t;
3966*45876Sbostic {
3967*45876Sbostic 	BTSTACK *s;
3968*45876Sbostic 	pgno_t p = P_NONE;
3969*45876Sbostic 
3970*45876Sbostic 	if ((s = t->bt_stack) != (BTSTACK *) NULL) {
3971*45876Sbostic 		p = s->bts_pgno;
3972*45876Sbostic 		t->bt_stack = s->bts_next;
3973*45876Sbostic 		(void) free ((char *) s);
3974*45876Sbostic 	}
3975*45876Sbostic 	return (p);
3976*45876Sbostic }
3977*45876Sbostic 
3978*45876Sbostic #ifdef DEBUG
3979*45876Sbostic _punt()
3980*45876Sbostic {
3981*45876Sbostic 	int pid;
3982*45876Sbostic 
3983*45876Sbostic 	pid = getpid();
3984*45876Sbostic 	(void) kill(pid, SIGILL);
3985*45876Sbostic }
3986*45876Sbostic #endif /* DEBUG */
3987