xref: /csrg-svn/lib/libc/db/btree/btree.h (revision 46139)
1*46139Smao /*-
2*46139Smao  * Copyright (c) 1990 The Regents of the University of California.
3*46139Smao  * All rights reserved.
4*46139Smao  *
5*46139Smao  * This code is derived from software contributed to Berkeley by
6*46139Smao  * Mike Olson.
7*46139Smao  *
8*46139Smao  * %sccs.include.redist.c%
9*46139Smao  */
10*46139Smao 
11*46139Smao /*
12*46139Smao  *  @(#)btree.h	5.1 (Berkeley) 01/23/91
13*46139Smao  */
14*46139Smao 
15*46139Smao typedef char	*BTREE;		/* should really be (void *) */
16*46139Smao 
17*46139Smao /* #define	DEBUG */
18*46139Smao 
19*46139Smao #define RET_ERROR	-1
20*46139Smao #define RET_SUCCESS	 0
21*46139Smao #define RET_SPECIAL	 1
22*46139Smao 
23*46139Smao #ifndef TRUE
24*46139Smao #define TRUE	1
25*46139Smao #define FALSE	0
26*46139Smao #endif /* ndef TRUE */
27*46139Smao 
28*46139Smao #ifndef NULL
29*46139Smao #define NULL	0
30*46139Smao #endif /* ndef NULL */
31*46139Smao 
32*46139Smao /* libc */
33*46139Smao extern char *malloc();
34*46139Smao 
35*46139Smao /* these are defined in lrucache.c */
36*46139Smao extern char	*lruinit();
37*46139Smao extern char	*lruget();
38*46139Smao extern char	*lrugetnew();
39*46139Smao extern int	lrusync();
40*46139Smao extern int	lruwrite();
41*46139Smao extern int	lrurelease();
42*46139Smao extern void	lrufree();
43*46139Smao 
44*46139Smao /* these are defined here */
45*46139Smao extern BTREE	bt_open();
46*46139Smao extern int	bt_close();
47*46139Smao extern int	bt_delete();
48*46139Smao extern int	bt_get();
49*46139Smao extern int	bt_put();
50*46139Smao extern int	bt_seq();
51*46139Smao extern int	bt_sync();
52*46139Smao 
53*46139Smao /*
54*46139Smao  *  Private types.  What you choose for these depends on how big you
55*46139Smao  *  want to let files get, and how big you want to let pages get.
56*46139Smao  */
57*46139Smao 
58*46139Smao typedef u_long	index_t;	/* so # bytes on a page fits in a long */
59*46139Smao typedef u_long	pgno_t;		/* so # of pages in a btree fits in a long */
60*46139Smao 
61*46139Smao /*
62*46139Smao  *  When we do searches, we push the parent page numbers onto a stack
63*46139Smao  *  as we descend the tree.  This is so that for insertions, we can
64*46139Smao  *  find our way back up to do internal page insertions and splits.
65*46139Smao  */
66*46139Smao 
67*46139Smao typedef struct BTSTACK {
68*46139Smao 	pgno_t		bts_pgno;
69*46139Smao 	struct BTSTACK	*bts_next;
70*46139Smao } BTSTACK;
71*46139Smao 
72*46139Smao /*
73*46139Smao  *  Every btree page has a header that looks like this.  Flags are given
74*46139Smao  *  in the #define's for the F_ flags (see below).
75*46139Smao  */
76*46139Smao 
77*46139Smao typedef struct BTHEADER {
78*46139Smao 	pgno_t h_pgno;		/* page number of this page */
79*46139Smao 	pgno_t h_prevpg;	/* left sibling */
80*46139Smao 	pgno_t h_nextpg;	/* right sibling */
81*46139Smao 
82*46139Smao #define F_LEAF		0x01	/* leaf page, contains user data */
83*46139Smao #define F_CONT		0x02	/* continuation page (large items) */
84*46139Smao #define F_DIRTY		0x04	/* need to write to disk */
85*46139Smao #define F_PRESERVE	0x08	/* never delete this chain of pages */
86*46139Smao 
87*46139Smao 	u_long h_flags;		/* page state */
88*46139Smao 	index_t h_lower;	/* lower bound of free space on page */
89*46139Smao 	index_t h_upper;	/* upper bound of free space on page */
90*46139Smao 	index_t h_linp[1];	/* VARIABLE LENGTH DATA AT END OF STRUCT */
91*46139Smao } BTHEADER;
92*46139Smao 
93*46139Smao /*
94*46139Smao  *  HTBUCKETs are hash table buckets for looking up pages of in-memory
95*46139Smao  *  btrees by page number.  We use this indirection, rather than direct
96*46139Smao  *  pointers, so that the code for manipulating in-memory trees is the
97*46139Smao  *  same as that for manipulating on-disk trees.
98*46139Smao  */
99*46139Smao 
100*46139Smao typedef struct HTBUCKET {
101*46139Smao 	pgno_t		ht_pgno;
102*46139Smao 	BTHEADER	*ht_page;
103*46139Smao 	struct HTBUCKET	*ht_next;
104*46139Smao } HTBUCKET;
105*46139Smao 
106*46139Smao typedef HTBUCKET	**HTABLE;
107*46139Smao 
108*46139Smao /* minimum size we'll let a page be */
109*46139Smao #define MINPSIZE	512
110*46139Smao 
111*46139Smao /* default cache size, in bytes */
112*46139Smao #define DEFCACHE	(20 * 1024)
113*46139Smao 
114*46139Smao /* hash table size for in-memory trees */
115*46139Smao #define	HTSIZE		128
116*46139Smao 
117*46139Smao /* generate a hash key from a page number */
118*46139Smao #define HASHKEY(pgno)	((pgno - 1) % HTSIZE)
119*46139Smao 
120*46139Smao /*
121*46139Smao  *  Disk btrees have a file descriptor, and may also have an lru buffer
122*46139Smao  *  cache, if the user asked for one.
123*46139Smao  */
124*46139Smao 
125*46139Smao typedef struct BTDISK {
126*46139Smao 	int	d_fd;
127*46139Smao 	char	*d_cache;
128*46139Smao } BTDISK;
129*46139Smao 
130*46139Smao /*
131*46139Smao  *  Cursors keep track of the current location in a sequential scan of
132*46139Smao  *  the database.  Since btrees impose a total ordering on keys, we can
133*46139Smao  *  walk forward or backward through the database from any point.  Cursors
134*46139Smao  *  survive updates to the tree, and can be used to delete a particular
135*46139Smao  *  record.
136*46139Smao  */
137*46139Smao 
138*46139Smao typedef struct CURSOR {
139*46139Smao 	pgno_t		c_pgno;		/* pgno of current item in scan */
140*46139Smao 	index_t		c_index;	/* index of current item in scan */
141*46139Smao 	char		*c_key;		/* current key, used for updates */
142*46139Smao 
143*46139Smao #define CRSR_BEFORE	0x01
144*46139Smao 
145*46139Smao 	u_char		c_flags;	/* to handle updates properly */
146*46139Smao } CURSOR;
147*46139Smao 
148*46139Smao /*
149*46139Smao  *  The private btree data structure.  The user passes a pointer to one of
150*46139Smao  *  these when we are to manipulate a tree, but the BTREE type is opaque
151*46139Smao  *  to him.
152*46139Smao  */
153*46139Smao 
154*46139Smao typedef struct BTREEDATA_P {
155*46139Smao 	char		*bt_fname;		/* NULL for in-memory trees */
156*46139Smao 	union {
157*46139Smao 		BTDISK	bt_d;			/* for on-disk btrees */
158*46139Smao 		HTABLE	bt_ht;			/* hash table for mem trees */
159*46139Smao 	} bt_s;
160*46139Smao 	size_t		bt_psize;		/* page size for btree pages */
161*46139Smao 	int		(*bt_compare)();	/* key comparison function */
162*46139Smao 	pgno_t		bt_npages;		/* number of pages in tree */
163*46139Smao 	BTHEADER	*bt_curpage;		/* current page contents */
164*46139Smao 	pgno_t		bt_free;		/* free pg list for big data */
165*46139Smao 	CURSOR		bt_cursor;		/* cursor for scans */
166*46139Smao 	BTSTACK		*bt_stack;		/* parent stack for inserts */
167*46139Smao 	u_long		bt_lorder;		/* byte order (endian.h) */
168*46139Smao 
169*46139Smao #define BTF_METAOK	0x01	/* meta-data written to start of file */
170*46139Smao #define BTF_SEQINIT	0x02	/* we have called bt_seq */
171*46139Smao #define BTF_ISWRITE	0x04	/* tree was opened for write */
172*46139Smao #define BTF_NODUPS	0x08	/* tree created for unique keys */
173*46139Smao 
174*46139Smao 	u_long		bt_flags;		/* btree state */
175*46139Smao } BTREEDATA_P;
176*46139Smao 
177*46139Smao typedef BTREEDATA_P	*BTREE_P;
178*46139Smao 
179*46139Smao /*
180*46139Smao  *  The first thing in a btree file is a BTMETA structure.  The rest of
181*46139Smao  *  the first page is empty, so that all disk operations are page-aligned.
182*46139Smao  */
183*46139Smao 
184*46139Smao typedef struct BTMETA {
185*46139Smao 	u_long	m_magic;
186*46139Smao 	u_long	m_version;
187*46139Smao 	size_t	m_psize;
188*46139Smao 	pgno_t	m_free;
189*46139Smao 	u_long	m_flags;
190*46139Smao 	u_long	m_lorder;
191*46139Smao } BTMETA;
192*46139Smao 
193*46139Smao #define P_NONE		0		/* invalid page number in tree */
194*46139Smao #define P_ROOT		1		/* page number of root pg in btree */
195*46139Smao 
196*46139Smao #define NORELEASE	0		/* don't release a page during write */
197*46139Smao #define RELEASE		1		/* release a page during write */
198*46139Smao 
199*46139Smao #define INSERT		0		/* doing an insert operation */
200*46139Smao #define DELETE		1		/* doing a delete operation */
201*46139Smao 
202*46139Smao /* get the next free index on a btree page */
203*46139Smao #define NEXTINDEX(p)	((((int)(p)->h_lower) - ((int)((((char *)(&(p)->h_linp[0]))) - ((char *) (p)))))/(sizeof(index_t)))
204*46139Smao 
205*46139Smao /* is a BTITEM actually on the btree page? */
206*46139Smao #define VALIDITEM(t, i)	((i)->bti_index < NEXTINDEX((t)->bt_curpage))
207*46139Smao 
208*46139Smao /* guarantee longword alignment so structure refs work */
209*46139Smao #define LONGALIGN(p) (((long)(p) + 3) & ~ 0x03)
210*46139Smao 
211*46139Smao /* get a particular datum (or idatum) off a page */
212*46139Smao #define GETDATUM(h,i)	 (((char *) h) + h->h_linp[i])
213*46139Smao 
214*46139Smao /* is a {key,datum} too big to put on a single page? */
215*46139Smao #define TOOBIG(t, sz)	(sz >= t->bt_psize / 5)
216*46139Smao 
217*46139Smao /* is this a disk tree or a memory tree? */
218*46139Smao #define ISDISK(t)	(t->bt_fname != (char *) NULL)
219*46139Smao 
220*46139Smao /* does the disk tree use a cache? */
221*46139Smao #define ISCACHE(t)	(t->bt_s.bt_d.d_cache != (char *) NULL)
222*46139Smao 
223*46139Smao /*
224*46139Smao  *  DATUMs are for user data -- one appears on leaf pages for every
225*46139Smao  *  tree entry.  The d_bytes[] array contains the key first, then the data.
226*46139Smao  *
227*46139Smao  *  If either the key or the datum is too big to store on a single page,
228*46139Smao  *  a bit is set in the flags entry, and the d_bytes[] array contains a
229*46139Smao  *  pgno pointing to the page at which the data is actually stored.
230*46139Smao  *
231*46139Smao  *  Note on alignment:  every DATUM is guaranteed to be longword aligned
232*46139Smao  *  on the disk page.  In order to force longword alignment of user key
233*46139Smao  *  and data values, we must guarantee that the d_bytes[] array starts
234*46139Smao  *  on a longword boundary.  This is the reason that d_flags is a u_long,
235*46139Smao  *  rather than a u_char (it really only needs to be two bits big).  This
236*46139Smao  *  is necessary because we call the user's comparison function with a
237*46139Smao  *  pointer to the start of the d_bytes array.  We don't need to force
238*46139Smao  *  longword alignment of the data following the key, since that is copied
239*46139Smao  *  to a longword-aligned buffer before being returned to the user.
240*46139Smao  */
241*46139Smao 
242*46139Smao typedef struct DATUM {
243*46139Smao 	size_t d_ksize;		/* size of key */
244*46139Smao 	size_t d_dsize;		/* size of data */
245*46139Smao 
246*46139Smao #define D_BIGDATA	0x01	/* indirect datum ptr flag */
247*46139Smao #define D_BIGKEY	0x02	/* indirect key ptr flag */
248*46139Smao 
249*46139Smao 	u_long d_flags;		/* flags (indirect bit) */
250*46139Smao 	char d_bytes[1];	/* VARIABLE LENGTH DATA AT END OF STRUCT */
251*46139Smao } DATUM;
252*46139Smao 
253*46139Smao /* BTITEMs are used to return (page, index, datum) tuples from searches */
254*46139Smao typedef struct BTITEM {
255*46139Smao 	pgno_t bti_pgno;
256*46139Smao 	index_t bti_index;
257*46139Smao 	DATUM *bti_datum;
258*46139Smao } BTITEM;
259*46139Smao 
260*46139Smao /*
261*46139Smao  *  IDATUMs are for data stored on internal pages.  This is the (key, pgno)
262*46139Smao  *  pair, such that key 'key' is the first entry on page 'pgno'.  If our
263*46139Smao  *  internal page contains keys (a) and (b) next to each other, then all
264*46139Smao  *  items >= to (a) and < (b) go on the same page as (a).  There are some
265*46139Smao  *  gotchas with duplicate keys, however.  See the split code for details.
266*46139Smao  *
267*46139Smao  *  If a key is too big to fit on a single page, then the i_bytes[] array
268*46139Smao  *  contains a pgno pointing to the start of a chain that actually stores
269*46139Smao  *  the bytes.  Since items on internal pages are never deleted from the
270*46139Smao  *  tree, these indirect chains are marked as special, so that they won't
271*46139Smao  *  be deleted if the corresponding leaf item is deleted.
272*46139Smao  *
273*46139Smao  *  As for DATUMs, IDATUMs have a u_long flag entry (rather than u_char)
274*46139Smao  *  in order to guarantee that user keys are longword aligned on the disk
275*46139Smao  *  page.
276*46139Smao  */
277*46139Smao 
278*46139Smao typedef struct IDATUM {
279*46139Smao 	size_t i_size;
280*46139Smao 	pgno_t i_pgno;
281*46139Smao 	u_long i_flags;		/* see DATUM.d_flags, above */
282*46139Smao 	char i_bytes[1];	/* VARIABLE LENGTH DATA AT END OF STRUCT */
283*46139Smao } IDATUM;
284*46139Smao 
285*46139Smao /* all private interfaces have a leading _ in their names */
286*46139Smao extern BTITEM	*_bt_search();
287*46139Smao extern BTITEM	*_bt_searchr();
288*46139Smao extern BTHEADER	*_bt_allocpg();
289*46139Smao extern index_t	_bt_binsrch();
290*46139Smao extern int	_bt_isonpage();
291*46139Smao extern BTITEM	*_bt_first();
292*46139Smao extern int	_bt_release();
293*46139Smao extern int	_bt_wrtmeta();
294*46139Smao extern int	_bt_delindir();
295*46139Smao extern int	_bt_pgout();
296*46139Smao extern int	_bt_pgin();
297*46139Smao extern int	_bt_fixscan();
298*46139Smao extern int	_bt_indirect();
299*46139Smao extern int	_bt_crsrdel();
300*46139Smao extern int	_bt_push();
301*46139Smao extern pgno_t	_bt_pop();
302*46139Smao extern int	strcmp();
303*46139Smao 
304