xref: /csrg-svn/lib/libc/db/btree/btree.h (revision 50989)
1 /*-
2  * Copyright (c) 1991 The Regents of the University of California.
3  * All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * Mike Olson.
7  *
8  * %sccs.include.redist.c%
9  *
10  *	@(#)btree.h	5.3 (Berkeley) 09/04/91
11  */
12 
13 #include <mpool.h>
14 
15 #define	DEFMAXKEYPAGE	(0)		/* Maximum keys per page */
16 #define	DEFMINKEYPAGE	(2)		/* Minimum keys per page */
17 #define	MINCACHE	(5)		/* Minimum cached pages */
18 #define	MINPSIZE	(512)		/* Minimum page size */
19 
20 /*
21  * Page 0 of a btree file contains a BTMETA structure.  The rest of the first
22  * page is empty, so that all disk operations are page-aligned.  This page is
23  * also used as an out-of-band page, i.e. page pointers that point to nowhere
24  * point to page 0.  The m_nrecs field is used only the RECNO code.  This is
25  * because the btree doesn't really need it and it requires that put or delete
26  * calls modify the meta data.
27  */
28 #define	P_INVALID	 0		/* Invalid tree page number. */
29 #define	P_META		 0		/* Tree meta-info page number. */
30 #define	P_ROOT		 1		/* Tree root page number. */
31 
32 typedef struct BTMETA {
33 	u_long	m_magic;		/* magic number */
34 	u_long	m_version;		/* version */
35 	u_long	m_psize;		/* page size */
36 	u_long	m_free;			/* page number of first free page */
37 	u_long	m_nrecs;		/* R: number of records */
38 #define	SAVEMETA	(BTF_NODUPS | BTF_RECNO)
39 	u_long	m_flags;		/* bt_flags & SAVEMETA */
40 	u_long	m_lorder;		/* byte order */
41 } BTMETA;
42 
43 /*
44  * There are five page layouts in the btree: btree internal pages, btree leaf
45  * pages, recno internal pages, recno leaf pages and overflow pages.  Each type
46  * of page starts with a page header as typed by PAGE.
47  */
48 typedef struct PAGE {
49 	pgno_t	pgno;			/* this page's page number */
50 	pgno_t	prevpg;			/* left sibling */
51 	pgno_t	nextpg;			/* right sibling */
52 
53 #define	P_BINTERNAL	0x01		/* btree internal page */
54 #define	P_BLEAF		0x02		/* leaf page */
55 #define	P_OVERFLOW	0x04		/* overflow page */
56 #define	P_RINTERNAL	0x08		/* recno internal page */
57 #define	P_RLEAF		0x10		/* leaf page */
58 #define P_TYPE		0x1f		/* type mask */
59 
60 #define	P_PRESERVE	0x20		/* never delete this chain of pages */
61 	u_long	flags;
62 
63 	index_t	lower;			/* lower bound of free space on page */
64 	index_t	upper;			/* upper bound of free space on page */
65 	index_t	linp[1];		/* long-aligned VARIABLE LENGTH DATA */
66 } PAGE;
67 
68 /* First and next index. */
69 #define	BTDATAOFF	(sizeof(PAGE) - sizeof(index_t))
70 #define	NEXTINDEX(p)	(((p)->lower - BTDATAOFF) / sizeof(index_t))
71 
72 /*
73  * For pages other than overflow pages, there is an array of offsets into the
74  * rest of the page immediately following the page header.  Each offset is to
75  * an item which is unique to the type of page.  The h_lower offset is just
76  * past the last filled-in index.  The h_upper offset is the first item on the
77  * page.  Offsets are from the beginning of the page.
78  *
79  * If an item is too big to store on a single page, a flag is set and the item
80  * is a { page, size } pair such that the page is the first page of an overflow
81  * chain with size bytes of item.  Overflow pages are simply bytes without any
82  * external structure.
83  *
84  * The size and page number fields in the items are long aligned so they can be
85  * manipulated without copying.
86  */
87 #define	LALIGN(l)	(((l) + sizeof(u_long) - 1) & ~(sizeof(u_long) - 1))
88 #define	NOVFLSIZE	(sizeof(pgno_t) + sizeof(size_t))
89 
90 /*
91  * For the btree internal pages, the item is a key.  BINTERNALs are {key, pgno}
92  * pairs, such that the key compares less than or equal to all of the records
93  * on that page.  For a tree without duplicate keys, an internal page with two
94  * consecutive keys, a and b, will have all records greater than or equal to a
95  * and less than b stored on the page associated with a.  Duplicate keys are
96  * somewhat special and can cause duplicate internal and leaf page records and
97  * some minor modifications of the above rule.
98  */
99 typedef struct BINTERNAL {
100 	size_t	ksize;			/* key size */
101 	pgno_t	pgno;			/* page number stored on */
102 #define	P_BIGDATA	0x01		/* overflow data */
103 #define	P_BIGKEY	0x02		/* overflow key */
104 	u_char	flags;
105 	char	bytes[1];		/* data */
106 } BINTERNAL;
107 
108 /* Get the page's BINTERNAL structure at index indx. */
109 #define	GETBINTERNAL(pg, indx) \
110 	((BINTERNAL *)((char *)(pg) + (pg)->linp[indx]))
111 
112 /* Get the number of bytes in the entry. */
113 #define NBINTERNAL(len) \
114 	LALIGN(sizeof(size_t) + sizeof(pgno_t) + sizeof(u_char) + (len))
115 
116 /* Copy a BINTERNAL entry to the page. */
117 #define	WR_BINTERNAL(p, size, pgno, flags) { \
118 	*(size_t *)p = size; \
119 	p += sizeof(size_t); \
120 	*(pgno_t *)p = pgno; \
121 	p += sizeof(pgno_t); \
122 	*(u_char *)p = flags; \
123 	p += sizeof(u_char); \
124 }
125 
126 /*
127  * For the recno internal pages, the item is a page number with the number of
128  * keys found on that page and below.
129  */
130 typedef struct RINTERNAL {
131 	recno_t	nrecs;			/* number of records */
132 	pgno_t	pgno;			/* page number stored below */
133 } RINTERNAL;
134 
135 /* Get the page's RINTERNAL structure at index indx. */
136 #define	GETRINTERNAL(pg, indx) \
137 	((RINTERNAL *)((char *)(pg) + (pg)->linp[indx]))
138 
139 /* Get the number of bytes in the entry. */
140 #define NRINTERNAL \
141 	LALIGN(sizeof(recno_t) + sizeof(pgno_t))
142 
143 /* Copy a RINTERAL entry to the page. */
144 #define	WR_RINTERNAL(p, nrecs, pgno) { \
145 	*(size_t *)p = nrecs; \
146 	p += sizeof(recno_t); \
147 	*(pgno_t *)p = pgno; \
148 }
149 
150 /* For the btree leaf pages, the item is a key and data pair. */
151 typedef struct BLEAF {
152 	size_t	ksize;			/* size of key */
153 	size_t	dsize;			/* size of data */
154 	u_char	flags;			/* P_BIGDATA, P_BIGKEY */
155 	char	bytes[1];		/* data */
156 } BLEAF;
157 
158 /* Get the page's BLEAF structure at index indx. */
159 #define	GETBLEAF(pg, indx) \
160 	((BLEAF *)((char *)(pg) + (pg)->linp[indx]))
161 
162 /* Get the number of bytes in the entry. */
163 #define NBLEAF(p) \
164 	LALIGN(sizeof(size_t) + sizeof(size_t) + sizeof(u_char) + \
165 	    (p)->ksize + (p)->dsize)
166 
167 /* Get the number of bytes in the user's key/data pair. */
168 #define NBLEAFDBT(ksize, dsize) \
169 	LALIGN(sizeof(size_t) + sizeof(size_t) + sizeof(u_char) + \
170 	    (ksize) + (dsize))
171 
172 /* Copy a BLEAF entry to the page. */
173 #define	WR_BLEAF(p, key, data, flags) { \
174 	*(size_t *)p = key->size; \
175 	p += sizeof(size_t); \
176 	*(size_t *)p = data->size; \
177 	p += sizeof(size_t); \
178 	*(u_char *)p = flags; \
179 	p += sizeof(u_char); \
180 	bcopy(key->data, p, key->size); \
181 	p += key->size; \
182 	bcopy(data->data, p, data->size); \
183 }
184 
185 /* For the recno leaf pages, the item is a data entry. */
186 typedef struct RLEAF {
187 	size_t	dsize;			/* size of data */
188 	u_char	flags;			/* P_BIGDATA */
189 	char	bytes[1];
190 } RLEAF;
191 
192 /* Get the page's RLEAF structure at index indx. */
193 #define	GETRLEAF(pg, indx) \
194 	((RLEAF *)((char *)(pg) + (pg)->linp[indx]))
195 
196 /* Get the number of bytes in the entry. */
197 #define NRLEAF(p) \
198 	LALIGN(sizeof(size_t) + sizeof(u_char) + (p)->dsize)
199 
200 /* Get the number of bytes from the user's data. */
201 #define	NRLEAFDBT(dsize) \
202 	LALIGN(sizeof(size_t) + sizeof(u_char) + (dsize))
203 
204 /* Copy a RLEAF entry to the page. */
205 #define	WR_RLEAF(p, data, flags) { \
206 	*(size_t *)p = data->size; \
207 	p += sizeof(size_t); \
208 	*(u_char *)p = flags; \
209 	p += sizeof(u_char); \
210 	bcopy(data->data, p, data->size); \
211 }
212 
213 /*
214  * A record in the tree is either a pointer to a page and an index in the page
215  * or a page number and an index.  These structures are used as a cursor, stack
216  * entry and search returns as well as to pass records to other routines.
217  *
218  * One comment about searches.  Internal page searches must find the largest
219  * record less than key in the tree so that descents work.  Leaf page searches
220  * must find the smallest record greater than key so that the returned index
221  * is the record's correct position for insertion.
222  */
223 typedef struct EPGNO {
224 	pgno_t	pgno;			/* the page number */
225 	index_t	index;			/* the index on the page */
226 } EPGNO;
227 
228 typedef struct EPG {
229 	PAGE	*page;			/* the (pinned) page */
230 	index_t	 index;			/* the index on the page */
231 } EPG;
232 
233 /* The in-memory btree/recno data structure. */
234 typedef struct BTREE {
235 	MPOOL	*bt_mp;			/* memory pool cookie */
236 
237 	DB	*bt_dbp;		/* pointer to enclosing DB */
238 
239 	EPGNO	bt_bcursor;		/* btree cursor */
240 	recno_t	bt_rcursor;		/* R: recno cursor */
241 
242 #define	BT_POP(t)	(t->bt_sp ? t->bt_stack + --t->bt_sp : NULL)
243 #define	BT_CLR(t)	(t->bt_sp = 0)
244 	EPGNO	*bt_stack;		/* stack of parent pages */
245 	u_int	bt_sp;			/* current stack pointer */
246 	u_int	bt_maxstack;		/* largest stack */
247 
248 	char	*bt_kbuf;		/* key buffer */
249 	size_t	bt_kbufsz;		/* key buffer size */
250 	char	*bt_dbuf;		/* data buffer */
251 	size_t	bt_dbufsz;		/* data buffer size */
252 
253 	int	bt_fd;			/* tree file descriptor */
254 	FILE	*bt_rfp;		/* R: record FILE pointer */
255 	int	bt_rfd;			/* R: record file descriptor */
256 
257 	pgno_t	bt_free;		/* next free page */
258 	size_t	bt_psize;		/* page size */
259 	int	bt_maxkeypage;		/* maximum keys per page */
260 	size_t	bt_minkeypage;		/* minimum keys per page */
261 	int	bt_lorder;		/* byte order */
262 
263 					/* sorted order */
264 	enum { NOT, BACK, FORWARD, } bt_order;
265 	EPGNO	bt_last;		/* last insert */
266 
267 					/* B: key comparison function */
268 	int	(*bt_cmp) __P((const DBT *, const DBT *));
269 					/* B: prefix comparison function */
270 	int	(*bt_pfx) __P((const DBT *, const DBT *));
271 
272 					/* R: recno input function */
273 	int	(*bt_irec) __P((struct BTREE *, recno_t));
274 	recno_t	bt_nrecs;		/* R: number of records in the tree */
275 	caddr_t	bt_smap;		/* R: start of mapped space */
276 	caddr_t bt_emap;		/* R: end of mapped space */
277 	size_t	bt_reclen;		/* R: fixed record length */
278 	u_char	bt_bval;		/* R: delimiting byte/pad character */
279 
280 #define	BTF_DELCRSR	0x001		/* B: delete cursor when closes/moves */
281 #define	BTF_FIXEDLEN	0x002		/* fixed length records */
282 #define	BTF_INMEM	0x004		/* in-memory tree */
283 #define	BTF_METADIRTY	0x008		/* B: need to write meta-data */
284 #define	BTF_MODIFIED	0x010		/* tree modified */
285 #define	BTF_NODUPS	0x020		/* no duplicate keys permitted */
286 #define	BTF_RDONLY	0x040		/* read-only tree */
287 #define	BTF_RECNO	0x080		/* record oriented tree */
288 #define	BTF_SEQINIT	0x100		/* sequential scan initialized */
289 	u_long		bt_flags;	/* btree state */
290 } BTREE;
291 
292 #define	ISSET(t, f)	((t)->bt_flags & (f))
293 #define	NOTSET(t, f)	(!((t)->bt_flags & (f)))
294 #define	SET(t, f)	((t)->bt_flags |= (f))
295 #define	UNSET(t, f)	((t)->bt_flags &= ~(f))
296 
297 #include "extern.h"
298