xref: /csrg-svn/lib/libc/db/btree/btree.h (revision 50989)
146139Smao /*-
2*50989Sbostic  * Copyright (c) 1991 The Regents of the University of California.
346139Smao  * All rights reserved.
446139Smao  *
546139Smao  * This code is derived from software contributed to Berkeley by
646139Smao  * Mike Olson.
746139Smao  *
846139Smao  * %sccs.include.redist.c%
9*50989Sbostic  *
10*50989Sbostic  *	@(#)btree.h	5.3 (Berkeley) 09/04/91
1146139Smao  */
1246139Smao 
13*50989Sbostic #include <mpool.h>
14*50989Sbostic 
15*50989Sbostic #define	DEFMAXKEYPAGE	(0)		/* Maximum keys per page */
16*50989Sbostic #define	DEFMINKEYPAGE	(2)		/* Minimum keys per page */
17*50989Sbostic #define	MINCACHE	(5)		/* Minimum cached pages */
18*50989Sbostic #define	MINPSIZE	(512)		/* Minimum page size */
19*50989Sbostic 
2046139Smao /*
21*50989Sbostic  * Page 0 of a btree file contains a BTMETA structure.  The rest of the first
22*50989Sbostic  * page is empty, so that all disk operations are page-aligned.  This page is
23*50989Sbostic  * also used as an out-of-band page, i.e. page pointers that point to nowhere
24*50989Sbostic  * point to page 0.  The m_nrecs field is used only the RECNO code.  This is
25*50989Sbostic  * because the btree doesn't really need it and it requires that put or delete
26*50989Sbostic  * calls modify the meta data.
2746139Smao  */
28*50989Sbostic #define	P_INVALID	 0		/* Invalid tree page number. */
29*50989Sbostic #define	P_META		 0		/* Tree meta-info page number. */
30*50989Sbostic #define	P_ROOT		 1		/* Tree root page number. */
3146139Smao 
32*50989Sbostic typedef struct BTMETA {
33*50989Sbostic 	u_long	m_magic;		/* magic number */
34*50989Sbostic 	u_long	m_version;		/* version */
35*50989Sbostic 	u_long	m_psize;		/* page size */
36*50989Sbostic 	u_long	m_free;			/* page number of first free page */
37*50989Sbostic 	u_long	m_nrecs;		/* R: number of records */
38*50989Sbostic #define	SAVEMETA	(BTF_NODUPS | BTF_RECNO)
39*50989Sbostic 	u_long	m_flags;		/* bt_flags & SAVEMETA */
40*50989Sbostic 	u_long	m_lorder;		/* byte order */
41*50989Sbostic } BTMETA;
4246139Smao 
43*50989Sbostic /*
44*50989Sbostic  * There are five page layouts in the btree: btree internal pages, btree leaf
45*50989Sbostic  * pages, recno internal pages, recno leaf pages and overflow pages.  Each type
46*50989Sbostic  * of page starts with a page header as typed by PAGE.
47*50989Sbostic  */
48*50989Sbostic typedef struct PAGE {
49*50989Sbostic 	pgno_t	pgno;			/* this page's page number */
50*50989Sbostic 	pgno_t	prevpg;			/* left sibling */
51*50989Sbostic 	pgno_t	nextpg;			/* right sibling */
5246139Smao 
53*50989Sbostic #define	P_BINTERNAL	0x01		/* btree internal page */
54*50989Sbostic #define	P_BLEAF		0x02		/* leaf page */
55*50989Sbostic #define	P_OVERFLOW	0x04		/* overflow page */
56*50989Sbostic #define	P_RINTERNAL	0x08		/* recno internal page */
57*50989Sbostic #define	P_RLEAF		0x10		/* leaf page */
58*50989Sbostic #define P_TYPE		0x1f		/* type mask */
5946139Smao 
60*50989Sbostic #define	P_PRESERVE	0x20		/* never delete this chain of pages */
61*50989Sbostic 	u_long	flags;
6246139Smao 
63*50989Sbostic 	index_t	lower;			/* lower bound of free space on page */
64*50989Sbostic 	index_t	upper;			/* upper bound of free space on page */
65*50989Sbostic 	index_t	linp[1];		/* long-aligned VARIABLE LENGTH DATA */
66*50989Sbostic } PAGE;
6746139Smao 
68*50989Sbostic /* First and next index. */
69*50989Sbostic #define	BTDATAOFF	(sizeof(PAGE) - sizeof(index_t))
70*50989Sbostic #define	NEXTINDEX(p)	(((p)->lower - BTDATAOFF) / sizeof(index_t))
7146139Smao 
7246139Smao /*
73*50989Sbostic  * For pages other than overflow pages, there is an array of offsets into the
74*50989Sbostic  * rest of the page immediately following the page header.  Each offset is to
75*50989Sbostic  * an item which is unique to the type of page.  The h_lower offset is just
76*50989Sbostic  * past the last filled-in index.  The h_upper offset is the first item on the
77*50989Sbostic  * page.  Offsets are from the beginning of the page.
78*50989Sbostic  *
79*50989Sbostic  * If an item is too big to store on a single page, a flag is set and the item
80*50989Sbostic  * is a { page, size } pair such that the page is the first page of an overflow
81*50989Sbostic  * chain with size bytes of item.  Overflow pages are simply bytes without any
82*50989Sbostic  * external structure.
83*50989Sbostic  *
84*50989Sbostic  * The size and page number fields in the items are long aligned so they can be
85*50989Sbostic  * manipulated without copying.
8646139Smao  */
87*50989Sbostic #define	LALIGN(l)	(((l) + sizeof(u_long) - 1) & ~(sizeof(u_long) - 1))
88*50989Sbostic #define	NOVFLSIZE	(sizeof(pgno_t) + sizeof(size_t))
8946139Smao 
9046139Smao /*
91*50989Sbostic  * For the btree internal pages, the item is a key.  BINTERNALs are {key, pgno}
92*50989Sbostic  * pairs, such that the key compares less than or equal to all of the records
93*50989Sbostic  * on that page.  For a tree without duplicate keys, an internal page with two
94*50989Sbostic  * consecutive keys, a and b, will have all records greater than or equal to a
95*50989Sbostic  * and less than b stored on the page associated with a.  Duplicate keys are
96*50989Sbostic  * somewhat special and can cause duplicate internal and leaf page records and
97*50989Sbostic  * some minor modifications of the above rule.
9846139Smao  */
99*50989Sbostic typedef struct BINTERNAL {
100*50989Sbostic 	size_t	ksize;			/* key size */
101*50989Sbostic 	pgno_t	pgno;			/* page number stored on */
102*50989Sbostic #define	P_BIGDATA	0x01		/* overflow data */
103*50989Sbostic #define	P_BIGKEY	0x02		/* overflow key */
104*50989Sbostic 	u_char	flags;
105*50989Sbostic 	char	bytes[1];		/* data */
106*50989Sbostic } BINTERNAL;
10746139Smao 
108*50989Sbostic /* Get the page's BINTERNAL structure at index indx. */
109*50989Sbostic #define	GETBINTERNAL(pg, indx) \
110*50989Sbostic 	((BINTERNAL *)((char *)(pg) + (pg)->linp[indx]))
11146139Smao 
112*50989Sbostic /* Get the number of bytes in the entry. */
113*50989Sbostic #define NBINTERNAL(len) \
114*50989Sbostic 	LALIGN(sizeof(size_t) + sizeof(pgno_t) + sizeof(u_char) + (len))
11546139Smao 
116*50989Sbostic /* Copy a BINTERNAL entry to the page. */
117*50989Sbostic #define	WR_BINTERNAL(p, size, pgno, flags) { \
118*50989Sbostic 	*(size_t *)p = size; \
119*50989Sbostic 	p += sizeof(size_t); \
120*50989Sbostic 	*(pgno_t *)p = pgno; \
121*50989Sbostic 	p += sizeof(pgno_t); \
122*50989Sbostic 	*(u_char *)p = flags; \
123*50989Sbostic 	p += sizeof(u_char); \
124*50989Sbostic }
12546139Smao 
12646139Smao /*
127*50989Sbostic  * For the recno internal pages, the item is a page number with the number of
128*50989Sbostic  * keys found on that page and below.
12946139Smao  */
130*50989Sbostic typedef struct RINTERNAL {
131*50989Sbostic 	recno_t	nrecs;			/* number of records */
132*50989Sbostic 	pgno_t	pgno;			/* page number stored below */
133*50989Sbostic } RINTERNAL;
13446139Smao 
135*50989Sbostic /* Get the page's RINTERNAL structure at index indx. */
136*50989Sbostic #define	GETRINTERNAL(pg, indx) \
137*50989Sbostic 	((RINTERNAL *)((char *)(pg) + (pg)->linp[indx]))
13846139Smao 
139*50989Sbostic /* Get the number of bytes in the entry. */
140*50989Sbostic #define NRINTERNAL \
141*50989Sbostic 	LALIGN(sizeof(recno_t) + sizeof(pgno_t))
14246139Smao 
143*50989Sbostic /* Copy a RINTERAL entry to the page. */
144*50989Sbostic #define	WR_RINTERNAL(p, nrecs, pgno) { \
145*50989Sbostic 	*(size_t *)p = nrecs; \
146*50989Sbostic 	p += sizeof(recno_t); \
147*50989Sbostic 	*(pgno_t *)p = pgno; \
148*50989Sbostic }
14946139Smao 
150*50989Sbostic /* For the btree leaf pages, the item is a key and data pair. */
151*50989Sbostic typedef struct BLEAF {
152*50989Sbostic 	size_t	ksize;			/* size of key */
153*50989Sbostic 	size_t	dsize;			/* size of data */
154*50989Sbostic 	u_char	flags;			/* P_BIGDATA, P_BIGKEY */
155*50989Sbostic 	char	bytes[1];		/* data */
156*50989Sbostic } BLEAF;
15746139Smao 
158*50989Sbostic /* Get the page's BLEAF structure at index indx. */
159*50989Sbostic #define	GETBLEAF(pg, indx) \
160*50989Sbostic 	((BLEAF *)((char *)(pg) + (pg)->linp[indx]))
16146139Smao 
162*50989Sbostic /* Get the number of bytes in the entry. */
163*50989Sbostic #define NBLEAF(p) \
164*50989Sbostic 	LALIGN(sizeof(size_t) + sizeof(size_t) + sizeof(u_char) + \
165*50989Sbostic 	    (p)->ksize + (p)->dsize)
16646139Smao 
167*50989Sbostic /* Get the number of bytes in the user's key/data pair. */
168*50989Sbostic #define NBLEAFDBT(ksize, dsize) \
169*50989Sbostic 	LALIGN(sizeof(size_t) + sizeof(size_t) + sizeof(u_char) + \
170*50989Sbostic 	    (ksize) + (dsize))
17146139Smao 
172*50989Sbostic /* Copy a BLEAF entry to the page. */
173*50989Sbostic #define	WR_BLEAF(p, key, data, flags) { \
174*50989Sbostic 	*(size_t *)p = key->size; \
175*50989Sbostic 	p += sizeof(size_t); \
176*50989Sbostic 	*(size_t *)p = data->size; \
177*50989Sbostic 	p += sizeof(size_t); \
178*50989Sbostic 	*(u_char *)p = flags; \
179*50989Sbostic 	p += sizeof(u_char); \
180*50989Sbostic 	bcopy(key->data, p, key->size); \
181*50989Sbostic 	p += key->size; \
182*50989Sbostic 	bcopy(data->data, p, data->size); \
183*50989Sbostic }
18446139Smao 
185*50989Sbostic /* For the recno leaf pages, the item is a data entry. */
186*50989Sbostic typedef struct RLEAF {
187*50989Sbostic 	size_t	dsize;			/* size of data */
188*50989Sbostic 	u_char	flags;			/* P_BIGDATA */
189*50989Sbostic 	char	bytes[1];
190*50989Sbostic } RLEAF;
19146139Smao 
192*50989Sbostic /* Get the page's RLEAF structure at index indx. */
193*50989Sbostic #define	GETRLEAF(pg, indx) \
194*50989Sbostic 	((RLEAF *)((char *)(pg) + (pg)->linp[indx]))
19546139Smao 
196*50989Sbostic /* Get the number of bytes in the entry. */
197*50989Sbostic #define NRLEAF(p) \
198*50989Sbostic 	LALIGN(sizeof(size_t) + sizeof(u_char) + (p)->dsize)
19946139Smao 
200*50989Sbostic /* Get the number of bytes from the user's data. */
201*50989Sbostic #define	NRLEAFDBT(dsize) \
202*50989Sbostic 	LALIGN(sizeof(size_t) + sizeof(u_char) + (dsize))
20346139Smao 
204*50989Sbostic /* Copy a RLEAF entry to the page. */
205*50989Sbostic #define	WR_RLEAF(p, data, flags) { \
206*50989Sbostic 	*(size_t *)p = data->size; \
207*50989Sbostic 	p += sizeof(size_t); \
208*50989Sbostic 	*(u_char *)p = flags; \
209*50989Sbostic 	p += sizeof(u_char); \
210*50989Sbostic 	bcopy(data->data, p, data->size); \
211*50989Sbostic }
21246139Smao 
21346139Smao /*
214*50989Sbostic  * A record in the tree is either a pointer to a page and an index in the page
215*50989Sbostic  * or a page number and an index.  These structures are used as a cursor, stack
216*50989Sbostic  * entry and search returns as well as to pass records to other routines.
217*50989Sbostic  *
218*50989Sbostic  * One comment about searches.  Internal page searches must find the largest
219*50989Sbostic  * record less than key in the tree so that descents work.  Leaf page searches
220*50989Sbostic  * must find the smallest record greater than key so that the returned index
221*50989Sbostic  * is the record's correct position for insertion.
22246139Smao  */
223*50989Sbostic typedef struct EPGNO {
224*50989Sbostic 	pgno_t	pgno;			/* the page number */
225*50989Sbostic 	index_t	index;			/* the index on the page */
226*50989Sbostic } EPGNO;
22746139Smao 
228*50989Sbostic typedef struct EPG {
229*50989Sbostic 	PAGE	*page;			/* the (pinned) page */
230*50989Sbostic 	index_t	 index;			/* the index on the page */
231*50989Sbostic } EPG;
23246139Smao 
233*50989Sbostic /* The in-memory btree/recno data structure. */
234*50989Sbostic typedef struct BTREE {
235*50989Sbostic 	MPOOL	*bt_mp;			/* memory pool cookie */
23646139Smao 
237*50989Sbostic 	DB	*bt_dbp;		/* pointer to enclosing DB */
23846139Smao 
239*50989Sbostic 	EPGNO	bt_bcursor;		/* btree cursor */
240*50989Sbostic 	recno_t	bt_rcursor;		/* R: recno cursor */
24146139Smao 
242*50989Sbostic #define	BT_POP(t)	(t->bt_sp ? t->bt_stack + --t->bt_sp : NULL)
243*50989Sbostic #define	BT_CLR(t)	(t->bt_sp = 0)
244*50989Sbostic 	EPGNO	*bt_stack;		/* stack of parent pages */
245*50989Sbostic 	u_int	bt_sp;			/* current stack pointer */
246*50989Sbostic 	u_int	bt_maxstack;		/* largest stack */
24746139Smao 
248*50989Sbostic 	char	*bt_kbuf;		/* key buffer */
249*50989Sbostic 	size_t	bt_kbufsz;		/* key buffer size */
250*50989Sbostic 	char	*bt_dbuf;		/* data buffer */
251*50989Sbostic 	size_t	bt_dbufsz;		/* data buffer size */
25246139Smao 
253*50989Sbostic 	int	bt_fd;			/* tree file descriptor */
254*50989Sbostic 	FILE	*bt_rfp;		/* R: record FILE pointer */
255*50989Sbostic 	int	bt_rfd;			/* R: record file descriptor */
25646139Smao 
257*50989Sbostic 	pgno_t	bt_free;		/* next free page */
258*50989Sbostic 	size_t	bt_psize;		/* page size */
259*50989Sbostic 	int	bt_maxkeypage;		/* maximum keys per page */
260*50989Sbostic 	size_t	bt_minkeypage;		/* minimum keys per page */
261*50989Sbostic 	int	bt_lorder;		/* byte order */
26246139Smao 
263*50989Sbostic 					/* sorted order */
264*50989Sbostic 	enum { NOT, BACK, FORWARD, } bt_order;
265*50989Sbostic 	EPGNO	bt_last;		/* last insert */
26646139Smao 
267*50989Sbostic 					/* B: key comparison function */
268*50989Sbostic 	int	(*bt_cmp) __P((const DBT *, const DBT *));
269*50989Sbostic 					/* B: prefix comparison function */
270*50989Sbostic 	int	(*bt_pfx) __P((const DBT *, const DBT *));
27146139Smao 
272*50989Sbostic 					/* R: recno input function */
273*50989Sbostic 	int	(*bt_irec) __P((struct BTREE *, recno_t));
274*50989Sbostic 	recno_t	bt_nrecs;		/* R: number of records in the tree */
275*50989Sbostic 	caddr_t	bt_smap;		/* R: start of mapped space */
276*50989Sbostic 	caddr_t bt_emap;		/* R: end of mapped space */
277*50989Sbostic 	size_t	bt_reclen;		/* R: fixed record length */
278*50989Sbostic 	u_char	bt_bval;		/* R: delimiting byte/pad character */
27946139Smao 
280*50989Sbostic #define	BTF_DELCRSR	0x001		/* B: delete cursor when closes/moves */
281*50989Sbostic #define	BTF_FIXEDLEN	0x002		/* fixed length records */
282*50989Sbostic #define	BTF_INMEM	0x004		/* in-memory tree */
283*50989Sbostic #define	BTF_METADIRTY	0x008		/* B: need to write meta-data */
284*50989Sbostic #define	BTF_MODIFIED	0x010		/* tree modified */
285*50989Sbostic #define	BTF_NODUPS	0x020		/* no duplicate keys permitted */
286*50989Sbostic #define	BTF_RDONLY	0x040		/* read-only tree */
287*50989Sbostic #define	BTF_RECNO	0x080		/* record oriented tree */
288*50989Sbostic #define	BTF_SEQINIT	0x100		/* sequential scan initialized */
289*50989Sbostic 	u_long		bt_flags;	/* btree state */
290*50989Sbostic } BTREE;
29146139Smao 
292*50989Sbostic #define	ISSET(t, f)	((t)->bt_flags & (f))
293*50989Sbostic #define	NOTSET(t, f)	(!((t)->bt_flags & (f)))
294*50989Sbostic #define	SET(t, f)	((t)->bt_flags |= (f))
295*50989Sbostic #define	UNSET(t, f)	((t)->bt_flags &= ~(f))
29646139Smao 
297*50989Sbostic #include "extern.h"
298