xref: /netbsd-src/external/bsd/openldap/dist/libraries/liblmdb/mdb.c (revision b7b7574d3bf8eeb51a1fa3977b59142ec6434a55)
1 /*	$NetBSD: mdb.c,v 1.1.1.1 2014/05/28 09:58:43 tron Exp $	*/
2 
3 /** @file mdb.c
4  *	@brief memory-mapped database library
5  *
6  *	A Btree-based database management library modeled loosely on the
7  *	BerkeleyDB API, but much simplified.
8  */
9 /*
10  * Copyright 2011-2013 Howard Chu, Symas Corp.
11  * All rights reserved.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted only as authorized by the OpenLDAP
15  * Public License.
16  *
17  * A copy of this license is available in the file LICENSE in the
18  * top-level directory of the distribution or, alternatively, at
19  * <http://www.OpenLDAP.org/license.html>.
20  *
21  * This code is derived from btree.c written by Martin Hedenfalk.
22  *
23  * Copyright (c) 2009, 2010 Martin Hedenfalk <martin@bzero.se>
24  *
25  * Permission to use, copy, modify, and distribute this software for any
26  * purpose with or without fee is hereby granted, provided that the above
27  * copyright notice and this permission notice appear in all copies.
28  *
29  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
30  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
31  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
32  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
33  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
34  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
35  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
36  */
37 #ifndef _GNU_SOURCE
38 #define _GNU_SOURCE 1
39 #endif
40 #include <sys/types.h>
41 #include <sys/stat.h>
42 #ifdef _WIN32
43 #include <windows.h>
44 /** getpid() returns int; MinGW defines pid_t but MinGW64 typedefs it
45  *  as int64 which is wrong. MSVC doesn't define it at all, so just
46  *  don't use it.
47  */
48 #define MDB_PID_T	int
49 #ifdef __GNUC__
50 # include <sys/param.h>
51 #else
52 # define LITTLE_ENDIAN	1234
53 # define BIG_ENDIAN	4321
54 # define BYTE_ORDER	LITTLE_ENDIAN
55 # ifndef SSIZE_MAX
56 #  define SSIZE_MAX	INT_MAX
57 # endif
58 #endif
59 #else
60 #define MDB_PID_T	pid_t
61 #include <sys/param.h>
62 #include <sys/uio.h>
63 #include <sys/mman.h>
64 #ifdef HAVE_SYS_FILE_H
65 #include <sys/file.h>
66 #endif
67 #include <fcntl.h>
68 #endif
69 
70 #include <errno.h>
71 #include <limits.h>
72 #include <stddef.h>
73 #include <inttypes.h>
74 #include <stdio.h>
75 #include <stdlib.h>
76 #include <string.h>
77 #include <time.h>
78 #include <unistd.h>
79 
80 #if !(defined(BYTE_ORDER) || defined(__BYTE_ORDER))
81 #include <netinet/in.h>
82 #include <resolv.h>	/* defines BYTE_ORDER on HPUX and Solaris */
83 #endif
84 
85 #if defined(__APPLE__) || defined (BSD)
86 # define MDB_USE_POSIX_SEM	1
87 # define MDB_FDATASYNC		fsync
88 #elif defined(ANDROID)
89 # define MDB_FDATASYNC		fsync
90 #endif
91 
92 #ifndef _WIN32
93 #include <pthread.h>
94 #ifdef MDB_USE_POSIX_SEM
95 # define MDB_USE_HASH		1
96 #include <semaphore.h>
97 #endif
98 #endif
99 
100 #ifdef USE_VALGRIND
101 #include <valgrind/memcheck.h>
102 #define VGMEMP_CREATE(h,r,z)    VALGRIND_CREATE_MEMPOOL(h,r,z)
103 #define VGMEMP_ALLOC(h,a,s) VALGRIND_MEMPOOL_ALLOC(h,a,s)
104 #define VGMEMP_FREE(h,a) VALGRIND_MEMPOOL_FREE(h,a)
105 #define VGMEMP_DESTROY(h)	VALGRIND_DESTROY_MEMPOOL(h)
106 #define VGMEMP_DEFINED(a,s)	VALGRIND_MAKE_MEM_DEFINED(a,s)
107 #else
108 #define VGMEMP_CREATE(h,r,z)
109 #define VGMEMP_ALLOC(h,a,s)
110 #define VGMEMP_FREE(h,a)
111 #define VGMEMP_DESTROY(h)
112 #define VGMEMP_DEFINED(a,s)
113 #endif
114 
115 #ifndef BYTE_ORDER
116 # if (defined(_LITTLE_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN))
117 /* Solaris just defines one or the other */
118 #  define LITTLE_ENDIAN	1234
119 #  define BIG_ENDIAN	4321
120 #  ifdef _LITTLE_ENDIAN
121 #   define BYTE_ORDER  LITTLE_ENDIAN
122 #  else
123 #   define BYTE_ORDER  BIG_ENDIAN
124 #  endif
125 # else
126 #  define BYTE_ORDER   __BYTE_ORDER
127 # endif
128 #endif
129 
130 #ifndef LITTLE_ENDIAN
131 #define LITTLE_ENDIAN	__LITTLE_ENDIAN
132 #endif
133 #ifndef BIG_ENDIAN
134 #define BIG_ENDIAN	__BIG_ENDIAN
135 #endif
136 
137 #if defined(__i386) || defined(__x86_64) || defined(_M_IX86)
138 #define MISALIGNED_OK	1
139 #endif
140 
141 #include "lmdb.h"
142 #include "midl.h"
143 
144 #if (BYTE_ORDER == LITTLE_ENDIAN) == (BYTE_ORDER == BIG_ENDIAN)
145 # error "Unknown or unsupported endianness (BYTE_ORDER)"
146 #elif (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF
147 # error "Two's complement, reasonably sized integer types, please"
148 #endif
149 
150 /** @defgroup internal	MDB Internals
151  *	@{
152  */
153 /** @defgroup compat	Compatibility Macros
154  *	A bunch of macros to minimize the amount of platform-specific ifdefs
155  *	needed throughout the rest of the code. When the features this library
156  *	needs are similar enough to POSIX to be hidden in a one-or-two line
157  *	replacement, this macro approach is used.
158  *	@{
159  */
160 
161 	/** Wrapper around __func__, which is a C99 feature */
162 #if __STDC_VERSION__ >= 199901L
163 # define mdb_func_	__func__
164 #elif __GNUC__ >= 2 || _MSC_VER >= 1300
165 # define mdb_func_	__FUNCTION__
166 #else
167 /* If a debug message says <mdb_unknown>(), update the #if statements above */
168 # define mdb_func_	"<mdb_unknown>"
169 #endif
170 
171 #ifdef _WIN32
	/* Windows branch: map the few pthread/POSIX names used in this file
	 * onto Win32 primitives so the rest of the code needs no ifdefs.
	 */
172 #define MDB_USE_HASH	1	/* NOTE(review): presumably enables the pathname hash used for mutex names - confirm */
173 #define MDB_PIDLOCK	0	/**< For MDB_LOCK_FORMAT: readers take no pid lock on Windows */
174 #define pthread_t	DWORD
175 #define pthread_mutex_t	HANDLE
176 #define pthread_key_t	DWORD
177 #define pthread_self()	GetCurrentThreadId()
	/* Yields 0 when TlsAlloc() returns a usable index, else ErrCode().
	 * The second argument (y) is not used by the expansion.
	 */
178 #define pthread_key_create(x,y)	\
179 	((*(x) = TlsAlloc()) == TLS_OUT_OF_INDEXES ? ErrCode() : 0)
180 #define pthread_key_delete(x)	TlsFree(x)
181 #define pthread_getspecific(x)	TlsGetValue(x)
182 #define pthread_setspecific(x,y)	(TlsSetValue(x,y) ? 0 : ErrCode())
	/* These two expand to the raw Win32 calls, so their results follow
	 * the Win32 conventions of ReleaseMutex()/WaitForSingleObject()
	 * rather than the pthread 0-on-success convention.
	 */
183 #define pthread_mutex_unlock(x)	ReleaseMutex(x)
184 #define pthread_mutex_lock(x)	WaitForSingleObject(x, INFINITE)
	/* Reader and writer locks operate on the me_rmutex / me_wmutex members of env. */
185 #define LOCK_MUTEX_R(env)	pthread_mutex_lock((env)->me_rmutex)
186 #define UNLOCK_MUTEX_R(env)	pthread_mutex_unlock((env)->me_rmutex)
187 #define LOCK_MUTEX_W(env)	pthread_mutex_lock((env)->me_wmutex)
188 #define UNLOCK_MUTEX_W(env)	pthread_mutex_unlock((env)->me_wmutex)
189 #define getpid()	GetCurrentProcessId()
	/* The Win32 flush calls are logically negated, so the macros yield 0
	 * when the underlying call returns nonzero. MDB_MSYNC ignores flags.
	 */
190 #define	MDB_FDATASYNC(fd)	(!FlushFileBuffers(fd))
191 #define	MDB_MSYNC(addr,len,flags)	(!FlushViewOfFile(addr,len))
192 #define	ErrCode()	GetLastError()
	/* Expands to a braced block (not an expression): use it as a statement. */
193 #define GET_PAGESIZE(x) {SYSTEM_INFO si; GetSystemInfo(&si); (x) = si.dwPageSize;}
194 #define	close(fd)	(CloseHandle(fd) ? 0 : -1)	/* 0 if CloseHandle() returned nonzero, else -1 */
195 #define	munmap(ptr,len)	UnmapViewOfFile(ptr)	/* len is ignored */
	/* Use the SDK's constant when the headers provide it, else fall back to 0x1000. */
196 #ifdef PROCESS_QUERY_LIMITED_INFORMATION
197 #define MDB_PROCESS_QUERY_LIMITED_INFORMATION PROCESS_QUERY_LIMITED_INFORMATION
198 #else
199 #define MDB_PROCESS_QUERY_LIMITED_INFORMATION 0x1000
200 #endif
201 #define	Z	"I"	/**< printf format modifier for size_t (Windows counterpart of "z") */
202 #else
203 
204 #define	Z	"z"			/**< printf format modifier for size_t */
205 
206 	/** For MDB_LOCK_FORMAT: True if readers take a pid lock in the lockfile */
207 #define MDB_PIDLOCK			1
208 
209 #ifdef MDB_USE_POSIX_SEM
210 
211 #define LOCK_MUTEX_R(env)	mdb_sem_wait((env)->me_rmutex)	/* lock = wait on the semaphore */
212 #define UNLOCK_MUTEX_R(env)	sem_post((env)->me_rmutex)	/* unlock = post the semaphore */
213 #define LOCK_MUTEX_W(env)	mdb_sem_wait((env)->me_wmutex)
214 #define UNLOCK_MUTEX_W(env)	sem_post((env)->me_wmutex)
215 
	/** Wait on a semaphore, retrying when the wait is interrupted.
	 *	sem_wait() is called again for as long as it fails with errno == EINTR.
	 *	@param[in] sem the semaphore to wait on.
	 *	@return 0 once sem_wait() succeeds, otherwise the errno value left
	 *	by the failing sem_wait() call (never EINTR).
	 */
216 static int
217 mdb_sem_wait(sem_t *sem)
218 {
219    int rc;
220    while ((rc = sem_wait(sem)) && (rc = errno) == EINTR) ;	/* empty body: rc ends as 0 or a non-EINTR errno */
221    return rc;
222 }
223 
224 #else
225 	/** Lock the reader mutex.
226 	 */
227 #define LOCK_MUTEX_R(env)	pthread_mutex_lock(&(env)->me_txns->mti_mutex)
228 	/** Unlock the reader mutex.
229 	 */
230 #define UNLOCK_MUTEX_R(env)	pthread_mutex_unlock(&(env)->me_txns->mti_mutex)
231 
232 	/** Lock the writer mutex.
233 	 *	Only a single write transaction is allowed at a time. Other writers
234 	 *	will block waiting for this mutex.
235 	 */
236 #define LOCK_MUTEX_W(env)	pthread_mutex_lock(&(env)->me_txns->mti_wmutex)
237 	/** Unlock the writer mutex.
238 	 */
239 #define UNLOCK_MUTEX_W(env)	pthread_mutex_unlock(&(env)->me_txns->mti_wmutex)
240 #endif	/* MDB_USE_POSIX_SEM */
241 
242 	/** Get the error code for the last failed system function.
243 	 */
244 #define	ErrCode()	errno
245 
246 	/** An abstraction for a file handle.
247 	 *	On POSIX systems file handles are small integers. On Windows
248 	 *	they're opaque pointers.
249 	 */
250 #define	HANDLE	int
251 
252 	/**	A value for an invalid file handle.
253 	 *	Mainly used to initialize file variables and signify that they are
254 	 *	unused.
255 	 */
256 #define INVALID_HANDLE_VALUE	(-1)
257 
258 	/** Get the size of a memory page for the system.
259 	 *	This is the basic size that the platform's memory manager uses, and is
260 	 *	fundamental to the use of memory-mapped files.
261 	 */
262 #define	GET_PAGESIZE(x)	((x) = sysconf(_SC_PAGE_SIZE))
263 #endif
264 
265 #if defined(_WIN32) || defined(MDB_USE_POSIX_SEM)
266 #define MNAME_LEN	32
267 #else
268 #define MNAME_LEN	(sizeof(pthread_mutex_t))
269 #endif
270 
271 /** @} */
272 
273 #ifndef _WIN32
274 /**	A flag for opening a file and requesting synchronous data writes.
275  *	This is only used when writing a meta page. It's not strictly needed;
276  *	we could just do a normal write and then immediately perform a flush.
277  *	But if this flag is available it saves us an extra system call.
278  *
279  *	@note If O_DSYNC is undefined but exists in /usr/include,
280  * preferably set some compiler flag to get the definition.
281  * Otherwise compile with the less efficient -DMDB_DSYNC=O_SYNC.
282  */
283 #ifndef MDB_DSYNC
284 # define MDB_DSYNC	O_DSYNC
285 #endif
286 #endif
287 
288 /** Function for flushing the data of a file. Define this to fsync
289  *	if fdatasync() is not supported.
290  */
291 #ifndef MDB_FDATASYNC
292 # define MDB_FDATASYNC	fdatasync
293 #endif
294 
295 #ifndef MDB_MSYNC
296 # define MDB_MSYNC(addr,len,flags)	msync(addr,len,flags)
297 #endif
298 
299 #ifndef MS_SYNC
300 #define	MS_SYNC	1
301 #endif
302 
303 #ifndef MS_ASYNC
304 #define	MS_ASYNC	0
305 #endif
306 
307 	/** A page number in the database.
308 	 *	Note that 64 bit page numbers are overkill, since pages themselves
309 	 *	already represent 12-13 bits of addressable memory, and the OS will
310 	 *	always limit applications to a maximum of 63 bits of address space.
311 	 *
312 	 *	@note In the #MDB_node structure, we only store 48 bits of this value,
313 	 *	which thus limits us to only 60 bits of addressable data.
314 	 */
315 typedef MDB_ID	pgno_t;
316 
317 	/** A transaction ID.
318 	 *	See struct MDB_txn.mt_txnid for details.
319 	 */
320 typedef MDB_ID	txnid_t;
321 
322 /** @defgroup debug	Debug Macros
323  *	@{
324  */
325 #ifndef MDB_DEBUG
326 	/**	Enable debug output.  Needs variable argument macros (a C99 feature).
327 	 *	Set this to 1 for copious tracing. Set to 2 to add dumps of all IDLs
328 	 *	read from and written to the database (used for free space management).
329 	 */
330 #define MDB_DEBUG 0
331 #endif
332 
333 #if MDB_DEBUG
334 static int mdb_debug;
335 static txnid_t mdb_debug_start;
336 
337 	/**	Print a debug message with printf formatting.
338 	 *	Requires double parentheses around 2 or more args.
339 	 */
340 # define DPRINTF(args) ((void) ((mdb_debug) && DPRINTF0 args))
341 # define DPRINTF0(fmt, ...) \
342 	fprintf(stderr, "%s:%d " fmt "\n", mdb_func_, __LINE__, __VA_ARGS__)
343 #else
344 # define DPRINTF(args)	((void) 0)
345 #endif
346 	/**	Print a debug string.
347 	 *	The string is printed literally, with no format processing.
348 	 */
349 #define DPUTS(arg)	DPRINTF(("%s", arg))
350 	/** Debugging output value of a cursor DBI: Negative in a sub-cursor. */
351 #define DDBI(mc) \
352 	(((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi)
353 /** @} */
354 
355 	/**	@brief The maximum size of a database page.
356 	 *
357 	 *	This is 32k, since it must fit in #MDB_page.#mp_upper.
358 	 *
359 	 *	LMDB will use database pages < OS pages if needed.
360 	 *	That causes more I/O in write transactions: The OS must
361 	 *	know (read) the whole page before writing a partial page.
362 	 *
363 	 *	Note that we don't currently support Huge pages. On Linux,
364 	 *	regular data files cannot use Huge pages, and in general
365 	 *	Huge pages aren't actually pageable. We rely on the OS
366 	 *	demand-pager to read our data and page it out when memory
367 	 *	pressure from other processes is high. So until OSs have
368 	 *	actual paging support for Huge pages, they're not viable.
369 	 */
370 #define MAX_PAGESIZE	 0x8000
371 
372 	/** The minimum number of keys required in a database page.
373 	 *	Setting this to a larger value will place a smaller bound on the
374 	 *	maximum size of a data item. Data items larger than this size will
375 	 *	be pushed into overflow pages instead of being stored directly in
376 	 *	the B-tree node. This value used to default to 4. With a page size
377 	 *	of 4096 bytes that meant that any item larger than 1024 bytes would
378 	 *	go into an overflow page. That also meant that on average 2-3KB of
379 	 *	each overflow page was wasted space. The value cannot be lower than
380 	 *	2 because then there would no longer be a tree structure. With this
381 	 *	value, items larger than 2KB will go into overflow pages, and on
382 	 *	average only 1KB will be wasted.
383 	 */
384 #define MDB_MINKEYS	 2
385 
386 	/**	A stamp that identifies a file as an MDB file.
387 	 *	There's nothing special about this value other than that it is easily
388 	 *	recognizable, and it will reflect any byte order mismatches.
389 	 */
390 #define MDB_MAGIC	 0xBEEFC0DE
391 
392 	/**	The version number for a database's datafile format. */
393 #define MDB_DATA_VERSION	 1
394 	/**	The version number for a database's lockfile format. */
395 #define MDB_LOCK_VERSION	 1
396 
397 	/**	@brief The max size of a key we can write, or 0 for dynamic max.
398 	 *
399 	 *	Define this as 0 to compute the max from the page size.  511
400 	 *	is default for backwards compat: liblmdb <= 0.9.10 can break
401 	 *	when modifying a DB with keys/dupsort data bigger than its max.
402 	 *
403 	 *	Data items in an #MDB_DUPSORT database are also limited to
404 	 *	this size, since they're actually keys of a sub-DB.  Keys and
405 	 *	#MDB_DUPSORT data items must fit on a node in a regular page.
406 	 */
407 #ifndef MDB_MAXKEYSIZE
408 #define MDB_MAXKEYSIZE	 511
409 #endif
410 
411 	/**	The maximum size of a key we can write to the environment. */
412 #if MDB_MAXKEYSIZE
413 #define ENV_MAXKEY(env)	(MDB_MAXKEYSIZE)
414 #else
415 #define ENV_MAXKEY(env)	((env)->me_maxkey)
416 #endif
417 
418 	/**	@brief The maximum size of a data item.
419 	 *
420 	 *	We only store a 32 bit value for node sizes.
421 	 */
422 #define MAXDATASIZE	0xffffffffUL
423 
424 #if MDB_DEBUG
425 	/**	Key size which fits in a #DKBUF.
426 	 *	@ingroup debug
427 	 */
428 #define DKBUF_MAXKEYSIZE ((MDB_MAXKEYSIZE) > 0 ? (MDB_MAXKEYSIZE) : 511)
429 	/**	A key buffer.
430 	 *	@ingroup debug
431 	 *	This is used for printing a hex dump of a key's contents.
432 	 */
433 #define DKBUF	char kbuf[DKBUF_MAXKEYSIZE*2+1]
434 	/**	Display a key in hex.
435 	 *	@ingroup debug
436 	 *	Invoke a function to display a key in hex.
437 	 */
438 #define	DKEY(x)	mdb_dkey(x, kbuf)
439 #else
440 #define	DKBUF
441 #define DKEY(x)	0
442 #endif
443 
444 	/** An invalid page number.
445 	 *	Mainly used to denote an empty tree.
446 	 */
447 #define P_INVALID	 (~(pgno_t)0)
448 
449 	/** Test if the flags \b f are set in a flag word \b w. */
450 #define F_ISSET(w, f)	 (((w) & (f)) == (f))
451 
452 	/** Round \b n up to an even number. */
453 #define EVEN(n)		(((n) + 1U) & -2) /* sign-extending -2 to match n+1U */
454 
455 	/**	Used for offsets within a single page.
456 	 *	Since memory pages are typically 4 or 8KB in size, 12-13 bits,
457 	 *	this is plenty.
458 	 */
459 typedef uint16_t	 indx_t;
460 
461 	/**	Default size of memory map.
462 	 *	This is certainly too small for any actual applications. Apps should always set
463 	 *	the size explicitly using #mdb_env_set_mapsize().
464 	 */
465 #define DEFAULT_MAPSIZE	1048576
466 
467 /**	@defgroup readers	Reader Lock Table
468  *	Readers don't acquire any locks for their data access. Instead, they
469  *	simply record their transaction ID in the reader table. The reader
470  *	mutex is needed just to find an empty slot in the reader table. The
471  *	slot's address is saved in thread-specific data so that subsequent read
472  *	transactions started by the same thread need no further locking to proceed.
473  *
474  *	If #MDB_NOTLS is set, the slot address is not saved in thread-specific data.
475  *
476  *	No reader table is used if the database is on a read-only filesystem, or
477  *	if #MDB_NOLOCK is set.
478  *
479  *	Since the database uses multi-version concurrency control, readers don't
480  *	actually need any locking. This table is used to keep track of which
481  *	readers are using data from which old transactions, so that we'll know
482  *	when a particular old transaction is no longer in use. Old transactions
483  *	that have discarded any data pages can then have those pages reclaimed
484  *	for use by a later write transaction.
485  *
486  *	The lock table is constructed such that reader slots are aligned with the
487  *	processor's cache line size. Any slot is only ever used by one thread.
488  *	This alignment guarantees that there will be no contention or cache
489  *	thrashing as threads update their own slot info, and also eliminates
490  *	any need for locking when accessing a slot.
491  *
492  *	A writer thread will scan every slot in the table to determine the oldest
493  *	outstanding reader transaction. Any freed pages older than this will be
494  *	reclaimed by the writer. The writer doesn't use any locks when scanning
495  *	this table. This means that there's no guarantee that the writer will
496  *	see the most up-to-date reader info, but that's not required for correct
497  *	operation - all we need is to know the upper bound on the oldest reader,
498  *	we don't care at all about the newest reader. So the only consequence of
499  *	reading stale information here is that old pages might hang around a
500  *	while longer before being reclaimed. That's actually good anyway, because
501  *	the longer we delay reclaiming old pages, the more likely it is that a
502  *	string of contiguous pages can be found after coalescing old pages from
503  *	many old transactions together.
504  *	@{
505  */
506 	/**	Number of slots in the reader table.
507 	 *	This value was chosen somewhat arbitrarily. 126 readers plus a
508 	 *	couple mutexes fit exactly into 8KB on my development machine.
509 	 *	Applications should set the table size using #mdb_env_set_maxreaders().
510 	 */
511 #define DEFAULT_READERS	126
512 
513 	/**	The size of a CPU cache line in bytes. We want our lock structures
514 	 *	aligned to this size to avoid false cache line sharing in the
515 	 *	lock table.
516 	 *	This value works for most CPUs. For Itanium this should be 128.
517 	 */
518 #ifndef CACHELINE
519 #define CACHELINE	64
520 #endif
521 
522 	/**	The information we store in a single slot of the reader table.
523 	 *	In addition to a transaction ID, we also record the process and
524 	 *	thread ID that owns a slot, so that we can detect stale information,
525 	 *	e.g. threads or processes that went away without cleaning up.
526 	 *	@note We currently don't check for stale records. We simply re-init
527 	 *	the table when we know that we're the only process opening the
528 	 *	lock file.
529 	 */
530 typedef struct MDB_rxbody {
531 	/**	Current Transaction ID when this transaction began, or (txnid_t)-1.
532 	 *	Multiple readers that start at the same time will probably have the
533 	 *	same ID here. Again, it's not important to exclude them from
534 	 *	anything; all we need to know is which version of the DB they
535 	 *	started from so we can avoid overwriting any data used in that
536 	 *	particular version.
537 	 */
538 	txnid_t		mrb_txnid;
539 	/** The process ID of the process owning this reader txn (#MDB_PID_T is int on Windows, pid_t elsewhere). */
540 	MDB_PID_T	mrb_pid;
541 	/** The thread ID of the thread owning this txn (pthread_t is defined as DWORD on Windows). */
542 	pthread_t	mrb_tid;
543 } MDB_rxbody;
544 
545 	/** The actual reader record, with cacheline padding. */
546 typedef struct MDB_reader {
547 	union {
548 		MDB_rxbody mrx;	/**< the slot's actual contents */
549 		/** shorthands for the members of mrx (txnid, pid, tid) */
550 #define	mr_txnid	mru.mrx.mrb_txnid
551 #define	mr_pid	mru.mrx.mrb_pid
552 #define	mr_tid	mru.mrx.mrb_tid
553 		/** cache line alignment: sizeof(MDB_rxbody) rounded up to a multiple of #CACHELINE (assumes a power of two) */
554 		char pad[(sizeof(MDB_rxbody)+CACHELINE-1) & ~(CACHELINE-1)];
555 	} mru;
556 } MDB_reader;
557 
558 	/** The header for the reader table.
559 	 *	The table resides in a memory-mapped file. (This is a different file
560 	 *	than is used for the main database.)
561 	 *
562 	 *	For POSIX the actual mutexes reside in the shared memory of this
563 	 *	mapped file. On Windows, mutexes are named objects allocated by the
564 	 *	kernel; we store the mutex names in this mapped file so that other
565 	 *	processes can grab them. This same approach is also used on
566 	 *	MacOSX/Darwin (using named semaphores) since MacOSX doesn't support
567 	 *	process-shared POSIX mutexes. For these cases where a named object
568 	 *	is used, the object name is derived from a 64 bit FNV hash of the
569 	 *	environment pathname. As such, naming collisions are extremely
570 	 *	unlikely. If a collision occurs, the results are unpredictable.
571 	 */
572 typedef struct MDB_txbody {
573 		/** Stamp identifying this as an MDB file. It must be set
574 		 *	to #MDB_MAGIC. */
575 	uint32_t	mtb_magic;
576 		/** Format of this lock file. Must be set to #MDB_LOCK_FORMAT. */
577 	uint32_t	mtb_format;
578 #if defined(_WIN32) || defined(MDB_USE_POSIX_SEM)
		/** #MNAME_LEN (32) byte buffer that takes the place of the in-file
		 *	mutex in these builds.
		 *	NOTE(review): presumably holds the name of the reader
		 *	mutex/semaphore object - confirm against the code that fills it.
		 */
579 	char	mtb_rmname[MNAME_LEN];
580 #else
581 		/** Mutex protecting access to this table.
582 		 *	This is the reader lock that #LOCK_MUTEX_R acquires.
583 		 */
584 	pthread_mutex_t	mtb_mutex;
585 #endif
586 		/**	The ID of the last transaction committed to the database.
587 		 *	This is recorded here only for convenience; the value can always
588 		 *	be determined by reading the main database meta pages.
589 		 */
590 	txnid_t		mtb_txnid;
591 		/** The number of slots that have been used in the reader table.
592 		 *	This always records the maximum count, it is not decremented
593 		 *	when readers release their slots.
594 		 */
595 	unsigned	mtb_numreaders;
596 } MDB_txbody;
597 
598 	/** The actual reader table definition. */
599 typedef struct MDB_txninfo {
	/** Table header, padded up to a multiple of #CACHELINE bytes. */
600 	union {
601 		MDB_txbody mtb;
	/* Shorthands for the header fields. mti_mutex and mti_rmname are both
	 * defined unconditionally, but only one of the two members exists in
	 * any given build (see MDB_txbody).
	 */
602 #define mti_magic	mt1.mtb.mtb_magic
603 #define mti_format	mt1.mtb.mtb_format
604 #define mti_mutex	mt1.mtb.mtb_mutex
605 #define mti_rmname	mt1.mtb.mtb_rmname
606 #define mti_txnid	mt1.mtb.mtb_txnid
607 #define mti_numreaders	mt1.mtb.mtb_numreaders
608 		char pad[(sizeof(MDB_txbody)+CACHELINE-1) & ~(CACHELINE-1)];
609 	} mt1;
	/** Writer lock storage, padded separately from the header: a name
	 *	buffer on Windows / POSIX-semaphore builds, otherwise the mutex
	 *	that #LOCK_MUTEX_W locks.
	 */
610 	union {
611 #if defined(_WIN32) || defined(MDB_USE_POSIX_SEM)
612 		char mt2_wmname[MNAME_LEN];
613 #define	mti_wmname	mt2.mt2_wmname
614 #else
615 		pthread_mutex_t	mt2_wmutex;
616 #define mti_wmutex	mt2.mt2_wmutex
617 #endif
		/* #MNAME_LEN is sizeof(pthread_mutex_t) in the mutex build, so this pads either variant. */
618 		char pad[(MNAME_LEN+CACHELINE-1) & ~(CACHELINE-1)];
619 	} mt2;
	/** The reader slots. Declared with one element.
	 *	NOTE(review): presumably the mapping is sized for the configured
	 *	number of readers and indexed past [0] - confirm where it is allocated.
	 */
620 	MDB_reader	mti_readers[1];
621 } MDB_txninfo;
622 
623 	/** Lockfile format signature: version, features and field layout */
624 #define MDB_LOCK_FORMAT \
625 	((uint32_t) \
626 	 ((MDB_LOCK_VERSION) \
627 	  /* Flags which describe functionality */ \
628 	  + (((MDB_PIDLOCK) != 0) << 16)))
629 /** @} */
630 
631 /** Common header for all page types.
632  * Overflow records occupy a number of contiguous pages with no
633  * headers on any page after the first.
634  */
635 typedef struct MDB_page {
	/* Shorthands for the members of the mp_p union below. */
636 #define	mp_pgno	mp_p.p_pgno
637 #define	mp_next	mp_p.p_next
638 	union {
639 		pgno_t		p_pgno;	/**< page number */
640 		void *		p_next;	/**< for in-memory list of freed structs */
641 	} mp_p;
642 	uint16_t	mp_pad;	/* NOTE(review): no use of this field is visible in this part of the file */
643 /**	@defgroup mdb_page	Page Flags
644  *	@ingroup internal
645  *	Flags for the page headers.
646  *	@{
647  */
648 #define	P_BRANCH	 0x01		/**< branch page */
649 #define	P_LEAF		 0x02		/**< leaf page */
650 #define	P_OVERFLOW	 0x04		/**< overflow page */
651 #define	P_META		 0x08		/**< meta page */
652 #define	P_DIRTY		 0x10		/**< dirty page, also set for #P_SUBP pages */
653 #define	P_LEAF2		 0x20		/**< for #MDB_DUPFIXED records */
654 #define	P_SUBP		 0x40		/**< for #MDB_DUPSORT sub-pages */
655 #define	P_KEEP		 0x8000		/**< leave this page alone during spill */
656 /** @} */
657 	uint16_t	mp_flags;		/**< @ref mdb_page */
	/* Shorthands for the members of the mp_pb union below. */
658 #define mp_lower	mp_pb.pb.pb_lower
659 #define mp_upper	mp_pb.pb.pb_upper
660 #define mp_pages	mp_pb.pb_pages
	/** Either the free-space bounds (pb) or an overflow page count
	 *	(pb_pages); both views share the same 32 bits.
	 */
661 	union {
662 		struct {
663 			indx_t		pb_lower;		/**< lower bound of free space */
664 			indx_t		pb_upper;		/**< upper bound of free space */
665 		} pb;
666 		uint32_t	pb_pages;	/**< number of overflow pages */
667 	} mp_pb;
668 	indx_t		mp_ptrs[1];		/**< dynamic size; entry i is the byte offset of node i from the page start (see #NODEPTR) */
669 } MDB_page;
670 
671 	/** Size of the page header, excluding dynamic data at the end */
672 #define PAGEHDRSZ	 ((unsigned) offsetof(MDB_page, mp_ptrs))
673 
674 	/** Address of first usable data byte in a page, after the header */
675 #define METADATA(p)	 ((void *)((char *)(p) + PAGEHDRSZ))
676 
677 	/** Number of nodes on a page */
678 #define NUMKEYS(p)	 (((p)->mp_lower - PAGEHDRSZ) >> 1)
679 
680 	/** The amount of space remaining in the page */
681 #define SIZELEFT(p)	 (indx_t)((p)->mp_upper - (p)->mp_lower)
682 
683 	/** The percentage of space used in the page, in tenths of a percent. */
684 #define PAGEFILL(env, p) (1000L * ((env)->me_psize - PAGEHDRSZ - SIZELEFT(p)) / \
685 				((env)->me_psize - PAGEHDRSZ))
686 	/** The minimum page fill factor, in tenths of a percent.
687 	 *	Pages emptier than this are candidates for merging.
688 	 */
689 #define FILL_THRESHOLD	 250
690 
691 	/** Test if a page is a leaf page */
692 #define IS_LEAF(p)	 F_ISSET((p)->mp_flags, P_LEAF)
693 	/** Test if a page is a LEAF2 page */
694 #define IS_LEAF2(p)	 F_ISSET((p)->mp_flags, P_LEAF2)
695 	/** Test if a page is a branch page */
696 #define IS_BRANCH(p)	 F_ISSET((p)->mp_flags, P_BRANCH)
697 	/** Test if a page is an overflow page */
698 #define IS_OVERFLOW(p)	 F_ISSET((p)->mp_flags, P_OVERFLOW)
699 	/** Test if a page is a sub page */
700 #define IS_SUBP(p)	 F_ISSET((p)->mp_flags, P_SUBP)
701 
702 	/** The number of overflow pages needed to store the given size. */
703 #define OVPAGES(size, psize)	((PAGEHDRSZ-1 + (size)) / (psize) + 1)
704 
705 	/** Header for a single key/data pair within a page.
706 	 * Used in pages of type #P_BRANCH and #P_LEAF without #P_LEAF2.
707 	 * We guarantee 2-byte alignment for 'MDB_node's.
708 	 */
709 typedef struct MDB_node {
710 	/** lo and hi are used for data size on leaf nodes and for
711 	 * child pgno on branch nodes. On 64 bit platforms, flags
712 	 * is also used for pgno. (Branch nodes have no flags).
713 	 * They are in host byte order in case that lets some
714 	 * accesses be optimized into a 32-bit word access.
715 	 */
716 #if BYTE_ORDER == LITTLE_ENDIAN
717 	unsigned short	mn_lo, mn_hi;	/**< part of data size or pgno */
718 #else
719 	unsigned short	mn_hi, mn_lo;	/**< same two fields, declared in the opposite order on big-endian hosts */
720 #endif
721 /** @defgroup mdb_node Node Flags
722  *	@ingroup internal
723  *	Flags for node headers.
724  *	@{
725  */
726 #define F_BIGDATA	 0x01			/**< data put on overflow page */
727 #define F_SUBDATA	 0x02			/**< data is a sub-database */
728 #define F_DUPDATA	 0x04			/**< data has duplicates */
729 
730 /** valid flags for #mdb_node_add() */
731 #define	NODE_ADD_FLAGS	(F_DUPDATA|F_SUBDATA|MDB_RESERVE|MDB_APPEND)
732 
733 /** @} */
734 	unsigned short	mn_flags;		/**< @ref mdb_node */
735 	unsigned short	mn_ksize;		/**< key size */
736 	char		mn_data[1];			/**< key and data are appended here: key at mn_data (#NODEKEY), data mn_ksize bytes later (#NODEDATA) */
737 } MDB_node;
738 
739 	/** Size of the node header, excluding dynamic data at the end */
740 #define NODESIZE	 offsetof(MDB_node, mn_data)
741 
742 	/** Bit position of top word in page number, for shifting mn_flags */
743 #define PGNO_TOPWORD ((pgno_t)-1 > 0xffffffffu ? 32 : 0)
744 
745 	/** Size of a node in a branch page with a given key.
746 	 *	This is just the node header plus the key, there is no data.
747 	 */
748 #define INDXSIZE(k)	 (NODESIZE + ((k) == NULL ? 0 : (k)->mv_size))
749 
750 	/** Size of a node in a leaf page with a given key and data.
751 	 *	This is node header plus key plus data size.
752 	 */
753 #define LEAFSIZE(k, d)	 (NODESIZE + (k)->mv_size + (d)->mv_size)
754 
755 	/** Address of node \b i in page \b p */
756 #define NODEPTR(p, i)	 ((MDB_node *)((char *)(p) + (p)->mp_ptrs[i]))
757 
758 	/** Address of the key for the node */
759 #define NODEKEY(node)	 (void *)((node)->mn_data)
760 
761 	/** Address of the data for a node */
762 #define NODEDATA(node)	 (void *)((char *)(node)->mn_data + (node)->mn_ksize)
763 
764 	/** Get the page number pointed to by a branch node */
765 #define NODEPGNO(node) \
766 	((node)->mn_lo | ((pgno_t) (node)->mn_hi << 16) | \
767 	 (PGNO_TOPWORD ? ((pgno_t) (node)->mn_flags << PGNO_TOPWORD) : 0))
768 	/** Set the page number in a branch node */
769 #define SETPGNO(node,pgno)	do { \
770 	(node)->mn_lo = (pgno) & 0xffff; (node)->mn_hi = (pgno) >> 16; \
771 	if (PGNO_TOPWORD) (node)->mn_flags = (pgno) >> PGNO_TOPWORD; } while(0)
772 
773 	/** Get the size of the data in a leaf node */
774 #define NODEDSZ(node)	 ((node)->mn_lo | ((unsigned)(node)->mn_hi << 16))
775 	/** Set the size of the data for a leaf node */
776 #define SETDSZ(node,size)	do { \
777 	(node)->mn_lo = (size) & 0xffff; (node)->mn_hi = (size) >> 16;} while(0)
778 	/** The size of a key in a node */
779 #define NODEKSZ(node)	 ((node)->mn_ksize)
780 
781 	/** Copy a page number from src to dst.
	 *	With #MISALIGNED_OK this is a plain assignment; note that the
	 *	expansion is not parenthesized. Otherwise the value is copied in
	 *	unsigned short units, four of them when SIZE_MAX exceeds 32 bits
	 *	and two otherwise, so no access is wider than 2 bytes.
	 *	In the non-MISALIGNED_OK variants both arguments must be lvalues
	 *	(their addresses are taken) and must not be named s or d, which
	 *	the macro declares as locals.
	 *	NOTE(review): assumes pgno_t is as wide as size_t; its underlying
	 *	type MDB_ID is declared outside this part of the file - confirm.
	 */
782 #ifdef MISALIGNED_OK
783 #define COPY_PGNO(dst,src)	dst = src
784 #else
785 #if SIZE_MAX > 4294967295UL
786 #define COPY_PGNO(dst,src)	do { \
787 	unsigned short *s, *d;	\
788 	s = (unsigned short *)&(src);	\
789 	d = (unsigned short *)&(dst);	\
790 	*d++ = *s++;	\
791 	*d++ = *s++;	\
792 	*d++ = *s++;	\
793 	*d = *s;	\
794 } while (0)
795 #else
796 #define COPY_PGNO(dst,src)	do { \
797 	unsigned short *s, *d;	\
798 	s = (unsigned short *)&(src);	\
799 	d = (unsigned short *)&(dst);	\
800 	*d++ = *s++;	\
801 	*d = *s;	\
802 } while (0)
803 #endif
804 #endif
805 	/** The address of a key in a LEAF2 page.
806 	 *	LEAF2 pages are used for #MDB_DUPFIXED sorted-duplicate sub-DBs.
807 	 *	There are no node headers, keys are stored contiguously.
808 	 */
809 #define LEAF2KEY(p, i, ks)	((char *)(p) + PAGEHDRSZ + ((i)*(ks)))
810 
811 	/** Set the \b node's key into \b keyptr, if requested. */
812 #define MDB_GET_KEY(node, keyptr)	{ if ((keyptr) != NULL) { \
813 	(keyptr)->mv_size = NODEKSZ(node); (keyptr)->mv_data = NODEKEY(node); } }
814 
815 	/** Set the \b node's key into \b key. */
816 #define MDB_GET_KEY2(node, key)	{ key.mv_size = NODEKSZ(node); key.mv_data = NODEKEY(node); }
817 
818 	/** Information about a single database in the environment. */
819 typedef struct MDB_db {
820 	uint32_t	md_pad;		/**< also ksize for LEAF2 pages; aliased as mm_psize for mm_dbs[0] in MDB_meta */
821 	uint16_t	md_flags;	/**< @ref mdb_dbi_open; aliased as mm_flags for mm_dbs[0] in MDB_meta */
822 	uint16_t	md_depth;	/**< depth of this tree */
823 	pgno_t		md_branch_pages;	/**< number of internal pages */
824 	pgno_t		md_leaf_pages;		/**< number of leaf pages */
825 	pgno_t		md_overflow_pages;	/**< number of overflow pages */
826 	size_t		md_entries;		/**< number of data items */
827 	pgno_t		md_root;		/**< the root page of this tree */
828 } MDB_db;
829 
830 	/** mdb_dbi_open flags */
831 #define MDB_VALID	0x8000		/**< DB handle is valid, for me_dbflags */
832 #define PERSISTENT_FLAGS	(0xffff & ~(MDB_VALID))
833 #define VALID_FLAGS	(MDB_REVERSEKEY|MDB_DUPSORT|MDB_INTEGERKEY|MDB_DUPFIXED|\
834 	MDB_INTEGERDUP|MDB_REVERSEDUP|MDB_CREATE)
835 
836 	/** Handle for the DB used to track free pages. */
837 #define	FREE_DBI	0
838 	/** Handle for the default DB. */
839 #define	MAIN_DBI	1
840 
841 	/** Meta page content.
842 	 *	A meta page is the start point for accessing a database snapshot.
843 	 *	Pages 0-1 are meta pages. Transaction N writes meta page #(N % 2).
844 	 */
845 typedef struct MDB_meta {
846 		/** Stamp identifying this as an MDB file. It must be set
847 		 *	to #MDB_MAGIC. */
848 	uint32_t	mm_magic;
849 		/** Version number of this lock file. Must be set to #MDB_DATA_VERSION. */
850 	uint32_t	mm_version;
851 	void		*mm_address;		/**< address for fixed mapping */
852 	size_t		mm_mapsize;			/**< size of mmap region */
853 	MDB_db		mm_dbs[2];			/**< first is free space, 2nd is main db */
854 	/** The size of pages used in this DB */
855 #define	mm_psize	mm_dbs[0].md_pad
856 	/** Any persistent environment flags. @ref mdb_env */
857 #define	mm_flags	mm_dbs[0].md_flags
858 	pgno_t		mm_last_pg;			/**< last used page in file */
859 	txnid_t		mm_txnid;			/**< txnid that committed this page */
860 } MDB_meta;
861 
862 	/** Buffer for a stack-allocated meta page.
863 	 *	The members define size and alignment, and silence type
864 	 *	aliasing warnings.  They are not used directly; that could
865 	 *	mean incorrectly using several union members in parallel.
866 	 */
867 typedef union MDB_metabuf {
868 	MDB_page	mb_page;
869 	struct {
870 		char		mm_pad[PAGEHDRSZ];
871 		MDB_meta	mm_meta;
872 	} mb_metabuf;
873 } MDB_metabuf;
874 
875 	/** Auxiliary DB info.
876 	 *	The information here is mostly static/read-only. There is
877 	 *	only a single copy of this record in the environment.
878 	 */
879 typedef struct MDB_dbx {
880 	MDB_val		md_name;		/**< name of the database */
881 	MDB_cmp_func	*md_cmp;	/**< function for comparing keys */
882 	MDB_cmp_func	*md_dcmp;	/**< function for comparing data items */
883 	MDB_rel_func	*md_rel;	/**< user relocate function */
884 	void		*md_relctx;		/**< user-provided context for md_rel */
885 } MDB_dbx;
886 
887 	/** A database transaction.
888 	 *	Every operation requires a transaction handle.
889 	 */
890 struct MDB_txn {
891 	MDB_txn		*mt_parent;		/**< parent of a nested txn */
892 	MDB_txn		*mt_child;		/**< nested txn under this txn */
893 	pgno_t		mt_next_pgno;	/**< next unallocated page */
894 	/** The ID of this transaction. IDs are integers incrementing from 1.
895 	 *	Only committed write transactions increment the ID. If a transaction
896 	 *	aborts, the ID may be re-used by the next writer.
897 	 */
898 	txnid_t		mt_txnid;
899 	MDB_env		*mt_env;		/**< the DB environment */
900 	/** The list of pages that became unused during this transaction.
901 	 */
902 	MDB_IDL		mt_free_pgs;
903 	/** The sorted list of dirty pages we temporarily wrote to disk
904 	 *	because the dirty list was full. page numbers in here are
905 	 *	shifted left by 1, deleted slots have the LSB set.
906 	 */
907 	MDB_IDL		mt_spill_pgs;
908 	union {
909 		/** For write txns: Modified pages. Sorted when not MDB_WRITEMAP. */
910 		MDB_ID2L	dirty_list;
911 		/** For read txns: This thread/txn's reader table slot, or NULL. */
912 		MDB_reader	*reader;
913 	} mt_u;
914 	/** Array of records for each DB known in the environment. */
915 	MDB_dbx		*mt_dbxs;
916 	/** Array of MDB_db records for each known DB */
917 	MDB_db		*mt_dbs;
918 /** @defgroup mt_dbflag	Transaction DB Flags
919  *	@ingroup internal
920  * @{
921  */
922 #define DB_DIRTY	0x01		/**< DB was modified or is DUPSORT data */
923 #define DB_STALE	0x02		/**< Named-DB record is older than txnID */
924 #define DB_NEW		0x04		/**< Named-DB handle opened in this txn */
925 #define DB_VALID	0x08		/**< DB handle is valid, see also #MDB_VALID */
926 /** @} */
927 	/** In write txns, array of cursors for each DB */
928 	MDB_cursor	**mt_cursors;
929 	/** Array of flags for each DB */
930 	unsigned char	*mt_dbflags;
931 	/**	Number of DB records in use. This number only ever increments;
932 	 *	we don't decrement it when individual DB handles are closed.
933 	 */
934 	MDB_dbi		mt_numdbs;
935 
936 /** @defgroup mdb_txn	Transaction Flags
937  *	@ingroup internal
938  *	@{
939  */
940 #define MDB_TXN_RDONLY		0x01		/**< read-only transaction */
941 #define MDB_TXN_ERROR		0x02		/**< an error has occurred */
942 #define MDB_TXN_DIRTY		0x04		/**< must write, even if dirty list is empty */
943 #define MDB_TXN_SPILLS		0x08		/**< txn or a parent has spilled pages */
944 /** @} */
945 	unsigned int	mt_flags;		/**< @ref mdb_txn */
946 	/** dirty_list room: Array size - #dirty pages visible to this txn.
947 	 *	Includes ancestor txns' dirty pages not hidden by other txns'
948 	 *	dirty/spilled pages. Thus commit(nested txn) has room to merge
949 	 *	dirty_list into mt_parent after freeing hidden mt_parent pages.
950 	 */
951 	unsigned int	mt_dirty_room;
952 };
953 
/** Depth of a cursor's page stack.
 * With at least 2 keys per node this is enough for 2^32 nodes, i.e. plenty.
 * At 4 keys per node it already covers 2^64 nodes, so a 64 bit machine
 * probably has no need for a larger value.
 */
#define CURSOR_STACK	32

/* Declared here, defined after MDB_cursor, which points to it. */
struct MDB_xcursor;
961 
962 	/** Cursors are used for all DB operations.
963 	 *	A cursor holds a path of (page pointer, key index) from the DB
964 	 *	root to a position in the DB, plus other state. #MDB_DUPSORT
965 	 *	cursors include an xcursor to the current data item. Write txns
966 	 *	track their cursors and keep them up to date when data moves.
967 	 *	Exception: An xcursor's pointer to a #P_SUBP page can be stale.
968 	 *	(A node with #F_DUPDATA but no #F_SUBDATA contains a subpage).
969 	 */
970 struct MDB_cursor {
971 	/** Next cursor on this DB in this txn */
972 	MDB_cursor	*mc_next;
973 	/** Backup of the original cursor if this cursor is a shadow */
974 	MDB_cursor	*mc_backup;
975 	/** Context used for databases with #MDB_DUPSORT, otherwise NULL */
976 	struct MDB_xcursor	*mc_xcursor;
977 	/** The transaction that owns this cursor */
978 	MDB_txn		*mc_txn;
979 	/** The database handle this cursor operates on */
980 	MDB_dbi		mc_dbi;
981 	/** The database record for this cursor */
982 	MDB_db		*mc_db;
983 	/** The database auxiliary record for this cursor */
984 	MDB_dbx		*mc_dbx;
985 	/** The @ref mt_dbflag for this database */
986 	unsigned char	*mc_dbflag;
987 	unsigned short 	mc_snum;	/**< number of pushed pages */
988 	unsigned short	mc_top;		/**< index of top page, normally mc_snum-1 */
989 /** @defgroup mdb_cursor	Cursor Flags
990  *	@ingroup internal
991  *	Cursor state flags.
992  *	@{
993  */
994 #define C_INITIALIZED	0x01	/**< cursor has been initialized and is valid */
995 #define C_EOF	0x02			/**< No more data */
996 #define C_SUB	0x04			/**< Cursor is a sub-cursor */
997 #define C_DEL	0x08			/**< last op was a cursor_del */
998 #define C_SPLITTING	0x20		/**< Cursor is in page_split */
999 #define C_UNTRACK	0x40		/**< Un-track cursor when closing */
1000 /** @} */
1001 	unsigned int	mc_flags;	/**< @ref mdb_cursor */
1002 	MDB_page	*mc_pg[CURSOR_STACK];	/**< stack of pushed pages */
1003 	indx_t		mc_ki[CURSOR_STACK];	/**< stack of page indices */
1004 };
1005 
1006 	/** Context for sorted-dup records.
1007 	 *	We could have gone to a fully recursive design, with arbitrarily
1008 	 *	deep nesting of sub-databases. But for now we only handle these
1009 	 *	levels - main DB, optional sub-DB, sorted-duplicate DB.
1010 	 */
1011 typedef struct MDB_xcursor {
1012 	/** A sub-cursor for traversing the Dup DB */
1013 	MDB_cursor mx_cursor;
1014 	/** The database record for this Dup DB */
1015 	MDB_db	mx_db;
1016 	/**	The auxiliary DB record for this Dup DB */
1017 	MDB_dbx	mx_dbx;
1018 	/** The @ref mt_dbflag for this Dup DB */
1019 	unsigned char mx_dbflag;
1020 } MDB_xcursor;
1021 
1022 	/** State of FreeDB old pages, stored in the MDB_env */
1023 typedef struct MDB_pgstate {
1024 	pgno_t		*mf_pghead;	/**< Reclaimed freeDB pages, or NULL before use */
1025 	txnid_t		mf_pglast;	/**< ID of last used record, or 0 if !mf_pghead */
1026 } MDB_pgstate;
1027 
1028 	/** The database environment. */
1029 struct MDB_env {
1030 	HANDLE		me_fd;		/**< The main data file */
1031 	HANDLE		me_lfd;		/**< The lock file */
1032 	HANDLE		me_mfd;			/**< just for writing the meta pages */
1033 	/** Failed to update the meta page. Probably an I/O error. */
1034 #define	MDB_FATAL_ERROR	0x80000000U
1035 	/** Some fields are initialized. */
1036 #define	MDB_ENV_ACTIVE	0x20000000U
1037 	/** me_txkey is set */
1038 #define	MDB_ENV_TXKEY	0x10000000U
1039 	/** Have liveness lock in reader table */
1040 #define	MDB_LIVE_READER	0x08000000U
1041 	uint32_t 	me_flags;		/**< @ref mdb_env */
1042 	unsigned int	me_psize;	/**< DB page size, inited from me_os_psize */
1043 	unsigned int	me_os_psize;	/**< OS page size, from #GET_PAGESIZE */
1044 	unsigned int	me_maxreaders;	/**< size of the reader table */
1045 	unsigned int	me_numreaders;	/**< max numreaders set by this env */
1046 	MDB_dbi		me_numdbs;		/**< number of DBs opened */
1047 	MDB_dbi		me_maxdbs;		/**< size of the DB table */
1048 	MDB_PID_T	me_pid;		/**< process ID of this env */
1049 	char		*me_path;		/**< path to the DB files */
1050 	char		*me_map;		/**< the memory map of the data file */
1051 	MDB_txninfo	*me_txns;		/**< the memory map of the lock file or NULL */
1052 	MDB_meta	*me_metas[2];	/**< pointers to the two meta pages */
1053 	void		*me_pbuf;		/**< scratch area for DUPSORT put() */
1054 	MDB_txn		*me_txn;		/**< current write transaction */
1055 	size_t		me_mapsize;		/**< size of the data memory map */
1056 	off_t		me_size;		/**< current file size */
1057 	pgno_t		me_maxpg;		/**< me_mapsize / me_psize */
1058 	MDB_dbx		*me_dbxs;		/**< array of static DB info */
1059 	uint16_t	*me_dbflags;	/**< array of flags from MDB_db.md_flags */
1060 	pthread_key_t	me_txkey;	/**< thread-key for readers */
1061 	MDB_pgstate	me_pgstate;		/**< state of old pages from freeDB */
1062 #	define		me_pglast	me_pgstate.mf_pglast
1063 #	define		me_pghead	me_pgstate.mf_pghead
1064 	MDB_page	*me_dpages;		/**< list of malloc'd blocks for re-use */
1065 	/** IDL of pages that became unused in a write txn */
1066 	MDB_IDL		me_free_pgs;
1067 	/** ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE. */
1068 	MDB_ID2L	me_dirty_list;
1069 	/** Max number of freelist items that can fit in a single overflow page */
1070 	int			me_maxfree_1pg;
1071 	/** Max size of a node on a page */
1072 	unsigned int	me_nodemax;
1073 #if !(MDB_MAXKEYSIZE)
1074 	unsigned int	me_maxkey;	/**< max size of a key */
1075 #endif
1076 #ifdef _WIN32
1077 	int		me_pidquery;		/**< Used in OpenProcess */
1078 	HANDLE		me_rmutex;		/* Windows mutexes don't reside in shared mem */
1079 	HANDLE		me_wmutex;
1080 #elif defined(MDB_USE_POSIX_SEM)
1081 	sem_t		*me_rmutex;		/* Shared mutexes are not supported */
1082 	sem_t		*me_wmutex;
1083 #endif
1084 	void		*me_userctx;	 /**< User-settable context */
1085 	MDB_assert_func *me_assert_func; /**< Callback for assertion failures */
1086 };
1087 
1088 	/** Nested transaction */
1089 typedef struct MDB_ntxn {
1090 	MDB_txn		mnt_txn;		/**< the transaction */
1091 	MDB_pgstate	mnt_pgstate;	/**< parent transaction's saved freestate */
1092 } MDB_ntxn;
1093 
1094 	/** max number of pages to commit in one writev() call */
1095 #define MDB_COMMIT_PAGES	 64
1096 #if defined(IOV_MAX) && IOV_MAX < MDB_COMMIT_PAGES
1097 #undef MDB_COMMIT_PAGES
1098 #define MDB_COMMIT_PAGES	IOV_MAX
1099 #endif
1100 
1101 	/* max bytes to write in one call */
1102 #define MAX_WRITE		(0x80000000U >> (sizeof(ssize_t) == 4))
1103 
1104 static int  mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp);
1105 static int  mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp);
1106 static int  mdb_page_touch(MDB_cursor *mc);
1107 
1108 static int  mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **mp, int *lvl);
1109 static int  mdb_page_search_root(MDB_cursor *mc,
1110 			    MDB_val *key, int modify);
1111 #define MDB_PS_MODIFY	1
1112 #define MDB_PS_ROOTONLY	2
1113 #define MDB_PS_FIRST	4
1114 #define MDB_PS_LAST		8
1115 static int  mdb_page_search(MDB_cursor *mc,
1116 			    MDB_val *key, int flags);
1117 static int	mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst);
1118 
1119 #define MDB_SPLIT_REPLACE	MDB_APPENDDUP	/**< newkey is not new */
1120 static int	mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata,
1121 				pgno_t newpgno, unsigned int nflags);
1122 
1123 static int  mdb_env_read_header(MDB_env *env, MDB_meta *meta);
1124 static int  mdb_env_pick_meta(const MDB_env *env);
1125 static int  mdb_env_write_meta(MDB_txn *txn);
1126 #if !(defined(_WIN32) || defined(MDB_USE_POSIX_SEM)) /* Drop unused excl arg */
1127 # define mdb_env_close0(env, excl) mdb_env_close1(env)
1128 #endif
1129 static void mdb_env_close0(MDB_env *env, int excl);
1130 
1131 static MDB_node *mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp);
1132 static int  mdb_node_add(MDB_cursor *mc, indx_t indx,
1133 			    MDB_val *key, MDB_val *data, pgno_t pgno, unsigned int flags);
1134 static void mdb_node_del(MDB_cursor *mc, int ksize);
1135 static void mdb_node_shrink(MDB_page *mp, indx_t indx);
1136 static int	mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst);
1137 static int  mdb_node_read(MDB_txn *txn, MDB_node *leaf, MDB_val *data);
1138 static size_t	mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data);
1139 static size_t	mdb_branch_size(MDB_env *env, MDB_val *key);
1140 
1141 static int	mdb_rebalance(MDB_cursor *mc);
1142 static int	mdb_update_key(MDB_cursor *mc, MDB_val *key);
1143 
1144 static void	mdb_cursor_pop(MDB_cursor *mc);
1145 static int	mdb_cursor_push(MDB_cursor *mc, MDB_page *mp);
1146 
1147 static int	mdb_cursor_del0(MDB_cursor *mc, MDB_node *leaf);
1148 static int	mdb_cursor_sibling(MDB_cursor *mc, int move_right);
1149 static int	mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op);
1150 static int	mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op);
1151 static int	mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op,
1152 				int *exactp);
1153 static int	mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data);
1154 static int	mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data);
1155 
1156 static void	mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx);
1157 static void	mdb_xcursor_init0(MDB_cursor *mc);
1158 static void	mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node);
1159 
1160 static int	mdb_drop0(MDB_cursor *mc, int subs);
1161 static void mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi);
1162 
1163 /** @cond */
1164 static MDB_cmp_func	mdb_cmp_memn, mdb_cmp_memnr, mdb_cmp_int, mdb_cmp_cint, mdb_cmp_long;
1165 /** @endcond */
1166 
1167 #ifdef _WIN32
1168 static SECURITY_DESCRIPTOR mdb_null_sd;
1169 static SECURITY_ATTRIBUTES mdb_all_sa;
1170 static int mdb_sec_inited;
1171 #endif
1172 
1173 /** Return the library version info. */
1174 char *
1175 mdb_version(int *major, int *minor, int *patch)
1176 {
1177 	if (major) *major = MDB_VERSION_MAJOR;
1178 	if (minor) *minor = MDB_VERSION_MINOR;
1179 	if (patch) *patch = MDB_VERSION_PATCH;
1180 	return MDB_VERSION_STRING;
1181 }
1182 
1183 /** Table of descriptions for MDB @ref errors */
1184 static char *const mdb_errstr[] = {
1185 	"MDB_KEYEXIST: Key/data pair already exists",
1186 	"MDB_NOTFOUND: No matching key/data pair found",
1187 	"MDB_PAGE_NOTFOUND: Requested page not found",
1188 	"MDB_CORRUPTED: Located page was wrong type",
1189 	"MDB_PANIC: Update of meta page failed",
1190 	"MDB_VERSION_MISMATCH: Database environment version mismatch",
1191 	"MDB_INVALID: File is not an MDB file",
1192 	"MDB_MAP_FULL: Environment mapsize limit reached",
1193 	"MDB_DBS_FULL: Environment maxdbs limit reached",
1194 	"MDB_READERS_FULL: Environment maxreaders limit reached",
1195 	"MDB_TLS_FULL: Thread-local storage keys full - too many environments open",
1196 	"MDB_TXN_FULL: Transaction has too many dirty pages - transaction too big",
1197 	"MDB_CURSOR_FULL: Internal error - cursor stack limit reached",
1198 	"MDB_PAGE_FULL: Internal error - page has no more space",
1199 	"MDB_MAP_RESIZED: Database contents grew beyond environment mapsize",
1200 	"MDB_INCOMPATIBLE: Operation and DB incompatible, or DB flags changed",
1201 	"MDB_BAD_RSLOT: Invalid reuse of reader locktable slot",
1202 	"MDB_BAD_TXN: Transaction cannot recover - it must be aborted",
1203 	"MDB_BAD_VALSIZE: Too big key/data, key is empty, or wrong DUPFIXED size",
1204 };
1205 
1206 char *
1207 mdb_strerror(int err)
1208 {
1209 	int i;
1210 	if (!err)
1211 		return ("Successful return: 0");
1212 
1213 	if (err >= MDB_KEYEXIST && err <= MDB_LAST_ERRCODE) {
1214 		i = err - MDB_KEYEXIST;
1215 		return mdb_errstr[i];
1216 	}
1217 
1218 	return strerror(err);
1219 }
1220 
1221 /** assert(3) variant in cursor context */
1222 #define mdb_cassert(mc, expr)	mdb_assert0((mc)->mc_txn->mt_env, expr, #expr)
1223 /** assert(3) variant in transaction context */
1224 #define mdb_tassert(mc, expr)	mdb_assert0((txn)->mt_env, expr, #expr)
1225 /** assert(3) variant in environment context */
1226 #define mdb_eassert(env, expr)	mdb_assert0(env, expr, #expr)
1227 
1228 #ifndef NDEBUG
1229 # define mdb_assert0(env, expr, expr_txt) ((expr) ? (void)0 : \
1230 		mdb_assert_fail(env, expr_txt, mdb_func_, __FILE__, __LINE__))
1231 
1232 static void
1233 mdb_assert_fail(MDB_env *env, const char *expr_txt,
1234 	const char *func, const char *file, int line)
1235 {
1236 	char buf[400];
1237 	sprintf(buf, "%.100s:%d: Assertion '%.200s' failed in %.40s()",
1238 		file, line, expr_txt, func);
1239 	if (env->me_assert_func)
1240 		env->me_assert_func(env, buf);
1241 	fprintf(stderr, "%s\n", buf);
1242 	abort();
1243 }
1244 #else
1245 # define mdb_assert0(env, expr, expr_txt) ((void) 0)
1246 #endif /* NDEBUG */
1247 
1248 #if MDB_DEBUG
1249 /** Return the page number of \b mp which may be sub-page, for debug output */
1250 static pgno_t
1251 mdb_dbg_pgno(MDB_page *mp)
1252 {
1253 	pgno_t ret;
1254 	COPY_PGNO(ret, mp->mp_pgno);
1255 	return ret;
1256 }
1257 
1258 /** Display a key in hexadecimal and return the address of the result.
1259  * @param[in] key the key to display
1260  * @param[in] buf the buffer to write into. Should always be #DKBUF.
1261  * @return The key in hexadecimal form.
1262  */
1263 char *
1264 mdb_dkey(MDB_val *key, char *buf)
1265 {
1266 	char *ptr = buf;
1267 	unsigned char *c = key->mv_data;
1268 	unsigned int i;
1269 
1270 	if (!key)
1271 		return "";
1272 
1273 	if (key->mv_size > DKBUF_MAXKEYSIZE)
1274 		return "MDB_MAXKEYSIZE";
1275 	/* may want to make this a dynamic check: if the key is mostly
1276 	 * printable characters, print it as-is instead of converting to hex.
1277 	 */
1278 #if 1
1279 	buf[0] = '\0';
1280 	for (i=0; i<key->mv_size; i++)
1281 		ptr += sprintf(ptr, "%02x", *c++);
1282 #else
1283 	sprintf(buf, "%.*s", key->mv_size, key->mv_data);
1284 #endif
1285 	return buf;
1286 }
1287 
1288 /** Display all the keys in the page. */
1289 void
1290 mdb_page_list(MDB_page *mp)
1291 {
1292 	MDB_node *node;
1293 	unsigned int i, nkeys, nsize, total = 0;
1294 	MDB_val key;
1295 	DKBUF;
1296 
1297 	nkeys = NUMKEYS(mp);
1298 	fprintf(stderr, "Page %"Z"u numkeys %d\n", mdb_dbg_pgno(mp), nkeys);
1299 	for (i=0; i<nkeys; i++) {
1300 		node = NODEPTR(mp, i);
1301 		key.mv_size = node->mn_ksize;
1302 		key.mv_data = node->mn_data;
1303 		nsize = NODESIZE + key.mv_size;
1304 		if (IS_BRANCH(mp)) {
1305 			fprintf(stderr, "key %d: page %"Z"u, %s\n", i, NODEPGNO(node),
1306 				DKEY(&key));
1307 			total += nsize;
1308 		} else {
1309 			if (F_ISSET(node->mn_flags, F_BIGDATA))
1310 				nsize += sizeof(pgno_t);
1311 			else
1312 				nsize += NODEDSZ(node);
1313 			total += nsize;
1314 			nsize += sizeof(indx_t);
1315 			fprintf(stderr, "key %d: nsize %d, %s\n", i, nsize, DKEY(&key));
1316 		}
1317 		total = EVEN(total);
1318 	}
1319 	fprintf(stderr, "Total: %d\n", total);
1320 }
1321 
1322 void
1323 mdb_cursor_chk(MDB_cursor *mc)
1324 {
1325 	unsigned int i;
1326 	MDB_node *node;
1327 	MDB_page *mp;
1328 
1329 	if (!mc->mc_snum && !(mc->mc_flags & C_INITIALIZED)) return;
1330 	for (i=0; i<mc->mc_top; i++) {
1331 		mp = mc->mc_pg[i];
1332 		node = NODEPTR(mp, mc->mc_ki[i]);
1333 		if (NODEPGNO(node) != mc->mc_pg[i+1]->mp_pgno)
1334 			printf("oops!\n");
1335 	}
1336 	if (mc->mc_ki[i] >= NUMKEYS(mc->mc_pg[i]))
1337 		printf("ack!\n");
1338 }
1339 #endif
1340 
#if (MDB_DEBUG) > 2
/** Count all the pages in each DB and in the freelist
 *  and make sure it matches the actual number of pages
 *  being used.
 *
 *  The freelist count is the sum of the leading MDB_ID of every FREE_DBI
 *  record. The per-DB count adds branch, leaf and overflow pages of each
 *  DB with a valid root and, for #MDB_DUPSORT DBs, of every #F_SUBDATA
 *  sub-database found in its leaf pages. A line is printed to stderr
 *  when freelist + DB pages + the 2 meta pages differ from mt_next_pgno.
 */
static void mdb_audit(MDB_txn *txn)
{
	MDB_cursor mc;
	MDB_val key, data;
	MDB_ID freecount, count;
	MDB_dbi i;
	int rc;

	freecount = 0;
	mdb_cursor_init(&mc, txn, FREE_DBI, NULL);
	while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0)
		freecount += *(MDB_ID *)data.mv_data;

	count = 0;
	for (i = 0; i<txn->mt_numdbs; i++) {
		MDB_xcursor mx;
		mdb_cursor_init(&mc, txn, i, &mx);
		if (txn->mt_dbs[i].md_root == P_INVALID)
			continue;
		count += txn->mt_dbs[i].md_branch_pages +
			txn->mt_dbs[i].md_leaf_pages +
			txn->mt_dbs[i].md_overflow_pages;
		if (txn->mt_dbs[i].md_flags & MDB_DUPSORT) {
			/* Walk the leaf pages left to right. The cursor's page
			 * stack is only looked at after a successful search or
			 * sibling move.
			 */
			rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST);
			for (; rc == MDB_SUCCESS; rc = mdb_cursor_sibling(&mc, 1)) {
				unsigned j;
				MDB_page *mp;
				mp = mc.mc_pg[mc.mc_top];
				for (j=0; j<NUMKEYS(mp); j++) {
					MDB_node *leaf = NODEPTR(mp, j);
					if (leaf->mn_flags & F_SUBDATA) {
						MDB_db db;
						/* copy out: node data may be unaligned for MDB_db */
						memcpy(&db, NODEDATA(leaf), sizeof(db));
						count += db.md_branch_pages + db.md_leaf_pages +
							db.md_overflow_pages;
					}
				}
			}
		}
	}
	if (freecount + count + 2 /* metapages */ != txn->mt_next_pgno) {
		fprintf(stderr, "audit: %"Z"u freecount: %"Z"u count: %"Z"u total: %"Z"u next_pgno: %"Z"u\n",
			txn->mt_txnid, freecount, count+2, freecount+count+2, txn->mt_next_pgno);
	}
}
#endif
1393 
1394 int
1395 mdb_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b)
1396 {
1397 	return txn->mt_dbxs[dbi].md_cmp(a, b);
1398 }
1399 
1400 int
1401 mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b)
1402 {
1403 	return txn->mt_dbxs[dbi].md_dcmp(a, b);
1404 }
1405 
1406 /** Allocate memory for a page.
1407  * Re-use old malloc'd pages first for singletons, otherwise just malloc.
1408  */
1409 static MDB_page *
1410 mdb_page_malloc(MDB_txn *txn, unsigned num)
1411 {
1412 	MDB_env *env = txn->mt_env;
1413 	MDB_page *ret = env->me_dpages;
1414 	size_t psize = env->me_psize, sz = psize, off;
1415 	/* For ! #MDB_NOMEMINIT, psize counts how much to init.
1416 	 * For a single page alloc, we init everything after the page header.
1417 	 * For multi-page, we init the final page; if the caller needed that
1418 	 * many pages they will be filling in at least up to the last page.
1419 	 */
1420 	if (num == 1) {
1421 		if (ret) {
1422 			VGMEMP_ALLOC(env, ret, sz);
1423 			VGMEMP_DEFINED(ret, sizeof(ret->mp_next));
1424 			env->me_dpages = ret->mp_next;
1425 			return ret;
1426 		}
1427 		psize -= off = PAGEHDRSZ;
1428 	} else {
1429 		sz *= num;
1430 		off = sz - psize;
1431 	}
1432 	if ((ret = malloc(sz)) != NULL) {
1433 		VGMEMP_ALLOC(env, ret, sz);
1434 		if (!(env->me_flags & MDB_NOMEMINIT)) {
1435 			memset((char *)ret + off, 0, psize);
1436 			ret->mp_pad = 0;
1437 		}
1438 	} else {
1439 		txn->mt_flags |= MDB_TXN_ERROR;
1440 	}
1441 	return ret;
1442 }
1443 
1444 /** Free a single page.
1445  * Saves single pages to a list, for future reuse.
1446  * (This is not used for multi-page overflow pages.)
1447  */
1448 static void
1449 mdb_page_free(MDB_env *env, MDB_page *mp)
1450 {
1451 	mp->mp_next = env->me_dpages;
1452 	VGMEMP_FREE(env, mp);
1453 	env->me_dpages = mp;
1454 }
1455 
1456 /** Free a dirty page */
1457 static void
1458 mdb_dpage_free(MDB_env *env, MDB_page *dp)
1459 {
1460 	if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) {
1461 		mdb_page_free(env, dp);
1462 	} else {
1463 		/* large pages just get freed directly */
1464 		VGMEMP_FREE(env, dp);
1465 		free(dp);
1466 	}
1467 }
1468 
1469 /**	Return all dirty pages to dpage list */
1470 static void
1471 mdb_dlist_free(MDB_txn *txn)
1472 {
1473 	MDB_env *env = txn->mt_env;
1474 	MDB_ID2L dl = txn->mt_u.dirty_list;
1475 	unsigned i, n = dl[0].mid;
1476 
1477 	for (i = 1; i <= n; i++) {
1478 		mdb_dpage_free(env, dl[i].mptr);
1479 	}
1480 	dl[0].mid = 0;
1481 }
1482 
/** Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn.
 *
 * Two groups of pages are visited: the pages on the stacks of the txn's
 * cursors (including \b mc itself and, where positioned on a
 * sub-database, their xcursors), and -- only when \b all is set -- the
 * root pages of DBs flagged #DB_DIRTY. P_KEEP is toggled on a page only
 * when its P_SUBP/P_DIRTY/P_KEEP bits equal \b pflags exactly.
 *
 * @param[in] mc A cursor handle for the current operation.
 * @param[in] pflags Flags of the pages to update:
 * P_DIRTY to set P_KEEP, P_DIRTY|P_KEEP to clear it.
 * @param[in] all No shortcuts. Needed except after a full #mdb_page_flush().
 * @return 0 on success, non-zero on failure.
 */
static int
mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all)
{
	/* P_SUBP is part of the mask but never part of pflags, so
	 * sub-pages can never match.
	 */
	enum { Mask = P_SUBP|P_DIRTY|P_KEEP };
	MDB_txn *txn = mc->mc_txn;
	MDB_cursor *m3;
	MDB_xcursor *mx;
	MDB_page *dp, *mp;
	MDB_node *leaf;
	unsigned i, j;
	int rc = MDB_SUCCESS, level;

	/* Mark pages seen by cursors */
	if (mc->mc_flags & C_UNTRACK)
		mc = NULL;				/* will find mc in mt_cursors */
	/* First pass handles mc (if still set), then each cursor list
	 * txn->mt_cursors[mt_numdbs-1] down to txn->mt_cursors[0].
	 */
	for (i = txn->mt_numdbs;; mc = txn->mt_cursors[--i]) {
		for (; mc; mc=mc->mc_next) {
			if (!(mc->mc_flags & C_INITIALIZED))
				continue;
			/* m3 starts at the cursor and moves on to its xcursor */
			for (m3 = mc;; m3 = &mx->mx_cursor) {
				mp = NULL;
				for (j=0; j<m3->mc_snum; j++) {
					mp = m3->mc_pg[j];
					if ((mp->mp_flags & Mask) == pflags)
						mp->mp_flags ^= P_KEEP;
				}
				/* mp is now the last page of the stack, or NULL if the stack is empty */
				mx = m3->mc_xcursor;
				/* Proceed to mx if it is at a sub-database */
				if (! (mx && (mx->mx_cursor.mc_flags & C_INITIALIZED)))
					break;
				if (! (mp && (mp->mp_flags & P_LEAF)))
					break;
				/* j-1 indexes the last stack entry, i.e. the one mp came from */
				leaf = NODEPTR(mp, m3->mc_ki[j-1]);
				if (!(leaf->mn_flags & F_SUBDATA))
					break;
			}
		}
		if (i == 0)
			break;
	}

	if (all) {
		/* Mark dirty root pages */
		for (i=0; i<txn->mt_numdbs; i++) {
			if (txn->mt_dbflags[i] & DB_DIRTY) {
				pgno_t pgno = txn->mt_dbs[i].md_root;
				if (pgno == P_INVALID)
					continue;
				if ((rc = mdb_page_get(txn, pgno, &dp, &level)) != MDB_SUCCESS)
					break;
				/* NOTE(review): level comes from mdb_page_get(); "<= 1"
				 * presumably restricts this to pages found in this txn
				 * itself rather than in an ancestor -- confirm there.
				 */
				if ((dp->mp_flags & Mask) == pflags && level <= 1)
					dp->mp_flags ^= P_KEEP;
			}
		}
	}

	return rc;
}
1548 
/* Defined later in the file; needed by mdb_page_spill() below. */
static int mdb_page_flush(MDB_txn *txn, int keep);

/**	Spill pages from the dirty list back to disk.
 * This is intended to prevent running into #MDB_TXN_FULL situations,
 * but note that they may still occur in a few cases:
 *	1) our estimate of the txn size could be too small. Currently this
 *	 seems unlikely, except with a large number of #MDB_MULTIPLE items.
 *	2) child txns may run out of space if their parents dirtied a
 *	 lot of pages and never spilled them. TODO: we probably should do
 *	 a preemptive spill during #mdb_txn_begin() of a child txn, if
 *	 the parent's dirty_room is below a given threshold.
 *
 * Otherwise, if not using nested txns, it is expected that apps will
 * not run into #MDB_TXN_FULL any more. The pages are flushed to disk
 * the same way as for a txn commit, e.g. their P_DIRTY flag is cleared.
 * If the txn never references them again, they can be left alone.
 * If the txn only reads them, they can be used without any fuss.
 * If the txn writes them again, they can be dirtied immediately without
 * going thru all of the work of #mdb_page_touch(). Such references are
 * handled by #mdb_page_unspill().
 *
 * Also note, we never spill DB root pages, nor pages of active cursors,
 * because we'll need these back again soon anyway. And in nested txns,
 * we can't spill a page in a child txn if it was already spilled in a
 * parent txn. That would alter the parent txns' data even though
 * the child hasn't committed yet, and we'd have no way to undo it if
 * the child aborted.
 *
 * On return the txn is flagged #MDB_TXN_SPILLS when a spill pass ran
 * successfully and #MDB_TXN_ERROR when it failed. Neither flag is
 * touched on the early returns (sub-cursor, enough room, or failure to
 * allocate the spill list).
 *
 * @param[in] m0 cursor A cursor handle identifying the transaction and
 *	database for which we are checking space.
 * @param[in] key For a put operation, the key being stored.
 * @param[in] data For a put operation, the data being stored.
 * @return 0 on success, non-zero on failure.
 */
static int
mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data)
{
	MDB_txn *txn = m0->mc_txn;
	MDB_page *dp;
	MDB_ID2L dl = txn->mt_u.dirty_list;
	unsigned int i, j, need;
	int rc;

	/* Nothing is done on behalf of sub-cursors */
	if (m0->mc_flags & C_SUB)
		return MDB_SUCCESS;

	/* Estimate how much space this op will take */
	i = m0->mc_db->md_depth;
	/* Named DBs also dirty the main DB */
	if (m0->mc_dbi > MAIN_DBI)
		i += txn->mt_dbs[MAIN_DBI].md_depth;
	/* For puts, roughly factor in the key+data size */
	if (key)
		i += (LEAFSIZE(key, data) + txn->mt_env->me_psize) / txn->mt_env->me_psize;
	i += i;	/* double it for good measure */
	need = i;

	/* Enough room left in the dirty list: nothing to spill */
	if (txn->mt_dirty_room > i)
		return MDB_SUCCESS;

	if (!txn->mt_spill_pgs) {
		txn->mt_spill_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX);
		if (!txn->mt_spill_pgs)
			return ENOMEM;
	} else {
		/* purge deleted slots */
		/* (entries with the LSB set); the survivors are compacted in place */
		MDB_IDL sl = txn->mt_spill_pgs;
		unsigned int num = sl[0];
		j=0;
		for (i=1; i<=num; i++) {
			if (!(sl[i] & 1))
				sl[++j] = sl[i];
		}
		sl[0] = j;
	}

	/* Preserve pages which may soon be dirtied again */
	if ((rc = mdb_pages_xkeep(m0, P_DIRTY, 1)) != MDB_SUCCESS)
		goto done;

	/* Less aggressive spill - we originally spilled the entire dirty list,
	 * with a few exceptions for cursor pages and DB root pages. But this
	 * turns out to be a lot of wasted effort because in a large txn many
	 * of those pages will need to be used again. So now we spill only 1/8th
	 * of the dirty pages. Testing revealed this to be a good tradeoff,
	 * better than 1/2, 1/4, or 1/10.
	 */
	/* i.e. the number of pages to spill is at least MDB_IDL_UM_MAX / 8 */
	if (need < MDB_IDL_UM_MAX / 8)
		need = MDB_IDL_UM_MAX / 8;

	/* Save the page IDs of all the pages we're flushing */
	/* flush from the tail forward, this saves a lot of shifting later on. */
	for (i=dl[0].mid; i && need; i--) {
		/* Spill-list entries hold the page number shifted left by 1 */
		MDB_ID pn = dl[i].mid << 1;
		dp = dl[i].mptr;
		if (dp->mp_flags & P_KEEP)
			continue;
		/* Can't spill twice, make sure it's not already in a parent's
		 * spill list.
		 */
		if (txn->mt_parent) {
			MDB_txn *tx2;
			for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) {
				if (tx2->mt_spill_pgs) {
					j = mdb_midl_search(tx2->mt_spill_pgs, pn);
					if (j <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[j] == pn) {
						/* Found in an ancestor: mark it to be kept */
						dp->mp_flags |= P_KEEP;
						break;
					}
				}
			}
			/* tx2 is non-NULL only when the search above hit */
			if (tx2)
				continue;
		}
		if ((rc = mdb_midl_append(&txn->mt_spill_pgs, pn)))
			goto done;
		need--;
	}
	mdb_midl_sort(txn->mt_spill_pgs);

	/* Flush the spilled part of dirty list */
	/* i is the dirty-list index at which the scan above stopped
	 * (0 when the whole list was scanned); it is passed as the
	 * "keep" argument -- see mdb_page_flush() for its exact meaning.
	 */
	if ((rc = mdb_page_flush(txn, i)) != MDB_SUCCESS)
		goto done;

	/* Reset any dirty pages we kept that page_flush didn't see */
	/* A non-zero i means the flush was not a full one, so the "all"
	 * pass of mdb_pages_xkeep() is requested as well.
	 */
	rc = mdb_pages_xkeep(m0, P_DIRTY|P_KEEP, i);

done:
	txn->mt_flags |= rc ? MDB_TXN_ERROR : MDB_TXN_SPILLS;
	return rc;
}
1680 
1681 /** Find oldest txnid still referenced. Expects txn->mt_txnid > 0. */
1682 static txnid_t
1683 mdb_find_oldest(MDB_txn *txn)
1684 {
1685 	int i;
1686 	txnid_t mr, oldest = txn->mt_txnid - 1;
1687 	if (txn->mt_env->me_txns) {
1688 		MDB_reader *r = txn->mt_env->me_txns->mti_readers;
1689 		for (i = txn->mt_env->me_txns->mti_numreaders; --i >= 0; ) {
1690 			if (r[i].mr_pid) {
1691 				mr = r[i].mr_txnid;
1692 				if (oldest > mr)
1693 					oldest = mr;
1694 			}
1695 		}
1696 	}
1697 	return oldest;
1698 }
1699 
1700 /** Add a page to the txn's dirty list */
1701 static void
1702 mdb_page_dirty(MDB_txn *txn, MDB_page *mp)
1703 {
1704 	MDB_ID2 mid;
1705 	int rc, (*insert)(MDB_ID2L, MDB_ID2 *);
1706 
1707 	if (txn->mt_env->me_flags & MDB_WRITEMAP) {
1708 		insert = mdb_mid2l_append;
1709 	} else {
1710 		insert = mdb_mid2l_insert;
1711 	}
1712 	mid.mid = mp->mp_pgno;
1713 	mid.mptr = mp;
1714 	rc = insert(txn->mt_u.dirty_list, &mid);
1715 	mdb_tassert(txn, rc == 0);
1716 	txn->mt_dirty_room--;
1717 }
1718 
1719 /** Allocate page numbers and memory for writing.  Maintain me_pglast,
1720  * me_pghead and mt_next_pgno.
1721  *
1722  * If there are free pages available from older transactions, they
1723  * are re-used first. Otherwise allocate a new page at mt_next_pgno.
1724  * Do not modify the freedB, just merge freeDB records into me_pghead[]
1725  * and move me_pglast to say which records were consumed.  Only this
1726  * function can create me_pghead and move me_pglast/mt_next_pgno.
1727  * @param[in] mc cursor A cursor handle identifying the transaction and
1728  *	database for which we are allocating.
1729  * @param[in] num the number of pages to allocate.
1730  * @param[out] mp Address of the allocated page(s). Requests for multiple pages
1731  *  will always be satisfied by a single contiguous chunk of memory.
1732  * @return 0 on success, non-zero on failure.
1733  */
1734 static int
1735 mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp)
1736 {
1737 #ifdef MDB_PARANOID	/* Seems like we can ignore this now */
1738 	/* Get at most <Max_retries> more freeDB records once me_pghead
1739 	 * has enough pages.  If not enough, use new pages from the map.
1740 	 * If <Paranoid> and mc is updating the freeDB, only get new
1741 	 * records if me_pghead is empty. Then the freelist cannot play
1742 	 * catch-up with itself by growing while trying to save it.
1743 	 */
1744 	enum { Paranoid = 1, Max_retries = 500 };
1745 #else
1746 	enum { Paranoid = 0, Max_retries = INT_MAX /*infinite*/ };
1747 #endif
1748 	int rc, retry = Max_retries;
1749 	MDB_txn *txn = mc->mc_txn;
1750 	MDB_env *env = txn->mt_env;
1751 	pgno_t pgno, *mop = env->me_pghead;
1752 	unsigned i, j, k, mop_len = mop ? mop[0] : 0, n2 = num-1;
1753 	MDB_page *np;
1754 	txnid_t oldest = 0, last;
1755 	MDB_cursor_op op;
1756 	MDB_cursor m2;
1757 
1758 	*mp = NULL;
1759 
1760 	/* If our dirty list is already full, we can't do anything */
1761 	if (txn->mt_dirty_room == 0) {
1762 		rc = MDB_TXN_FULL;
1763 		goto fail;
1764 	}
1765 
1766 	for (op = MDB_FIRST;; op = MDB_NEXT) {
1767 		MDB_val key, data;
1768 		MDB_node *leaf;
1769 		pgno_t *idl, old_id, new_id;
1770 
1771 		/* Seek a big enough contiguous page range. Prefer
1772 		 * pages at the tail, just truncating the list.
1773 		 */
1774 		if (mop_len > n2) {
1775 			i = mop_len;
1776 			do {
1777 				pgno = mop[i];
1778 				if (mop[i-n2] == pgno+n2)
1779 					goto search_done;
1780 			} while (--i > n2);
1781 			if (Max_retries < INT_MAX && --retry < 0)
1782 				break;
1783 		}
1784 
1785 		if (op == MDB_FIRST) {	/* 1st iteration */
1786 			/* Prepare to fetch more and coalesce */
1787 			oldest = mdb_find_oldest(txn);
1788 			last = env->me_pglast;
1789 			mdb_cursor_init(&m2, txn, FREE_DBI, NULL);
1790 			if (last) {
1791 				op = MDB_SET_RANGE;
1792 				key.mv_data = &last; /* will look up last+1 */
1793 				key.mv_size = sizeof(last);
1794 			}
1795 			if (Paranoid && mc->mc_dbi == FREE_DBI)
1796 				retry = -1;
1797 		}
1798 		if (Paranoid && retry < 0 && mop_len)
1799 			break;
1800 
1801 		last++;
1802 		/* Do not fetch more if the record will be too recent */
1803 		if (oldest <= last)
1804 			break;
1805 		rc = mdb_cursor_get(&m2, &key, NULL, op);
1806 		if (rc) {
1807 			if (rc == MDB_NOTFOUND)
1808 				break;
1809 			goto fail;
1810 		}
1811 		last = *(txnid_t*)key.mv_data;
1812 		if (oldest <= last)
1813 			break;
1814 		np = m2.mc_pg[m2.mc_top];
1815 		leaf = NODEPTR(np, m2.mc_ki[m2.mc_top]);
1816 		if ((rc = mdb_node_read(txn, leaf, &data)) != MDB_SUCCESS)
1817 			return rc;
1818 
1819 		idl = (MDB_ID *) data.mv_data;
1820 		i = idl[0];
1821 		if (!mop) {
1822 			if (!(env->me_pghead = mop = mdb_midl_alloc(i))) {
1823 				rc = ENOMEM;
1824 				goto fail;
1825 			}
1826 		} else {
1827 			if ((rc = mdb_midl_need(&env->me_pghead, i)) != 0)
1828 				goto fail;
1829 			mop = env->me_pghead;
1830 		}
1831 		env->me_pglast = last;
1832 #if (MDB_DEBUG) > 1
1833 		DPRINTF(("IDL read txn %"Z"u root %"Z"u num %u",
1834 			last, txn->mt_dbs[FREE_DBI].md_root, i));
1835 		for (k = i; k; k--)
1836 			DPRINTF(("IDL %"Z"u", idl[k]));
1837 #endif
1838 		/* Merge in descending sorted order */
1839 		j = mop_len;
1840 		k = mop_len += i;
1841 		mop[0] = (pgno_t)-1;
1842 		old_id = mop[j];
1843 		while (i) {
1844 			new_id = idl[i--];
1845 			for (; old_id < new_id; old_id = mop[--j])
1846 				mop[k--] = old_id;
1847 			mop[k--] = new_id;
1848 		}
1849 		mop[0] = mop_len;
1850 	}
1851 
1852 	/* Use new pages from the map when nothing suitable in the freeDB */
1853 	i = 0;
1854 	pgno = txn->mt_next_pgno;
1855 	if (pgno + num >= env->me_maxpg) {
1856 			DPUTS("DB size maxed out");
1857 			rc = MDB_MAP_FULL;
1858 			goto fail;
1859 	}
1860 
1861 search_done:
1862 	if (env->me_flags & MDB_WRITEMAP) {
1863 		np = (MDB_page *)(env->me_map + env->me_psize * pgno);
1864 	} else {
1865 		if (!(np = mdb_page_malloc(txn, num))) {
1866 			rc = ENOMEM;
1867 			goto fail;
1868 		}
1869 	}
1870 	if (i) {
1871 		mop[0] = mop_len -= num;
1872 		/* Move any stragglers down */
1873 		for (j = i-num; j < mop_len; )
1874 			mop[++j] = mop[++i];
1875 	} else {
1876 		txn->mt_next_pgno = pgno + num;
1877 	}
1878 	np->mp_pgno = pgno;
1879 	mdb_page_dirty(txn, np);
1880 	*mp = np;
1881 
1882 	return MDB_SUCCESS;
1883 
1884 fail:
1885 	txn->mt_flags |= MDB_TXN_ERROR;
1886 	return rc;
1887 }
1888 
1889 /** Copy the used portions of a non-overflow page.
1890  * @param[in] dst page to copy into
1891  * @param[in] src page to copy from
1892  * @param[in] psize size of a page
1893  */
1894 static void
1895 mdb_page_copy(MDB_page *dst, MDB_page *src, unsigned int psize)
1896 {
1897 	enum { Align = sizeof(pgno_t) };
1898 	indx_t upper = src->mp_upper, lower = src->mp_lower, unused = upper-lower;
1899 
1900 	/* If page isn't full, just copy the used portion. Adjust
1901 	 * alignment so memcpy may copy words instead of bytes.
1902 	 */
1903 	if ((unused &= -Align) && !IS_LEAF2(src)) {
1904 		upper &= -Align;
1905 		memcpy(dst, src, (lower + (Align-1)) & -Align);
1906 		memcpy((pgno_t *)((char *)dst+upper), (pgno_t *)((char *)src+upper),
1907 			psize - upper);
1908 	} else {
1909 		memcpy(dst, src, psize - unused);
1910 	}
1911 }
1912 
1913 /** Pull a page off the txn's spill list, if present.
1914  * If a page being referenced was spilled to disk in this txn, bring
1915  * it back and make it dirty/writable again.
1916  * @param[in] txn the transaction handle.
1917  * @param[in] mp the page being referenced. It must not be dirty.
1918  * @param[out] ret the writable page, if any. ret is unchanged if
1919  * mp wasn't spilled.
1920  */
1921 static int
1922 mdb_page_unspill(MDB_txn *txn, MDB_page *mp, MDB_page **ret)
1923 {
1924 	MDB_env *env = txn->mt_env;
1925 	const MDB_txn *tx2;
1926 	unsigned x;
1927 	pgno_t pgno = mp->mp_pgno, pn = pgno << 1;
1928 
1929 	for (tx2 = txn; tx2; tx2=tx2->mt_parent) {
1930 		if (!tx2->mt_spill_pgs)
1931 			continue;
1932 		x = mdb_midl_search(tx2->mt_spill_pgs, pn);
1933 		if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) {
1934 			MDB_page *np;
1935 			int num;
1936 			if (txn->mt_dirty_room == 0)
1937 				return MDB_TXN_FULL;
1938 			if (IS_OVERFLOW(mp))
1939 				num = mp->mp_pages;
1940 			else
1941 				num = 1;
1942 			if (env->me_flags & MDB_WRITEMAP) {
1943 				np = mp;
1944 			} else {
1945 				np = mdb_page_malloc(txn, num);
1946 				if (!np)
1947 					return ENOMEM;
1948 				if (num > 1)
1949 					memcpy(np, mp, num * env->me_psize);
1950 				else
1951 					mdb_page_copy(np, mp, env->me_psize);
1952 			}
1953 			if (tx2 == txn) {
1954 				/* If in current txn, this page is no longer spilled.
1955 				 * If it happens to be the last page, truncate the spill list.
1956 				 * Otherwise mark it as deleted by setting the LSB.
1957 				 */
1958 				if (x == txn->mt_spill_pgs[0])
1959 					txn->mt_spill_pgs[0]--;
1960 				else
1961 					txn->mt_spill_pgs[x] |= 1;
1962 			}	/* otherwise, if belonging to a parent txn, the
1963 				 * page remains spilled until child commits
1964 				 */
1965 
1966 			mdb_page_dirty(txn, np);
1967 			np->mp_flags |= P_DIRTY;
1968 			*ret = np;
1969 			break;
1970 		}
1971 	}
1972 	return MDB_SUCCESS;
1973 }
1974 
1975 /** Touch a page: make it dirty and re-insert into tree with updated pgno.
1976  * @param[in] mc cursor pointing to the page to be touched
1977  * @return 0 on success, non-zero on failure.
1978  */
1979 static int
1980 mdb_page_touch(MDB_cursor *mc)
1981 {
1982 	MDB_page *mp = mc->mc_pg[mc->mc_top], *np;
1983 	MDB_txn *txn = mc->mc_txn;
1984 	MDB_cursor *m2, *m3;
1985 	pgno_t	pgno;
1986 	int rc;
1987 
1988 	if (!F_ISSET(mp->mp_flags, P_DIRTY)) {
1989 		if (txn->mt_flags & MDB_TXN_SPILLS) {
1990 			np = NULL;
1991 			rc = mdb_page_unspill(txn, mp, &np);
1992 			if (rc)
1993 				goto fail;
1994 			if (np)
1995 				goto done;
1996 		}
1997 		if ((rc = mdb_midl_need(&txn->mt_free_pgs, 1)) ||
1998 			(rc = mdb_page_alloc(mc, 1, &np)))
1999 			goto fail;
2000 		pgno = np->mp_pgno;
2001 		DPRINTF(("touched db %d page %"Z"u -> %"Z"u", DDBI(mc),
2002 			mp->mp_pgno, pgno));
2003 		mdb_cassert(mc, mp->mp_pgno != pgno);
2004 		mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno);
2005 		/* Update the parent page, if any, to point to the new page */
2006 		if (mc->mc_top) {
2007 			MDB_page *parent = mc->mc_pg[mc->mc_top-1];
2008 			MDB_node *node = NODEPTR(parent, mc->mc_ki[mc->mc_top-1]);
2009 			SETPGNO(node, pgno);
2010 		} else {
2011 			mc->mc_db->md_root = pgno;
2012 		}
2013 	} else if (txn->mt_parent && !IS_SUBP(mp)) {
2014 		MDB_ID2 mid, *dl = txn->mt_u.dirty_list;
2015 		pgno = mp->mp_pgno;
2016 		/* If txn has a parent, make sure the page is in our
2017 		 * dirty list.
2018 		 */
2019 		if (dl[0].mid) {
2020 			unsigned x = mdb_mid2l_search(dl, pgno);
2021 			if (x <= dl[0].mid && dl[x].mid == pgno) {
2022 				if (mp != dl[x].mptr) { /* bad cursor? */
2023 					mc->mc_flags &= ~(C_INITIALIZED|C_EOF);
2024 					txn->mt_flags |= MDB_TXN_ERROR;
2025 					return MDB_CORRUPTED;
2026 				}
2027 				return 0;
2028 			}
2029 		}
2030 		mdb_cassert(mc, dl[0].mid < MDB_IDL_UM_MAX);
2031 		/* No - copy it */
2032 		np = mdb_page_malloc(txn, 1);
2033 		if (!np)
2034 			return ENOMEM;
2035 		mid.mid = pgno;
2036 		mid.mptr = np;
2037 		rc = mdb_mid2l_insert(dl, &mid);
2038 		mdb_cassert(mc, rc == 0);
2039 	} else {
2040 		return 0;
2041 	}
2042 
2043 	mdb_page_copy(np, mp, txn->mt_env->me_psize);
2044 	np->mp_pgno = pgno;
2045 	np->mp_flags |= P_DIRTY;
2046 
2047 done:
2048 	/* Adjust cursors pointing to mp */
2049 	mc->mc_pg[mc->mc_top] = np;
2050 	m2 = txn->mt_cursors[mc->mc_dbi];
2051 	if (mc->mc_flags & C_SUB) {
2052 		for (; m2; m2=m2->mc_next) {
2053 			m3 = &m2->mc_xcursor->mx_cursor;
2054 			if (m3->mc_snum < mc->mc_snum) continue;
2055 			if (m3->mc_pg[mc->mc_top] == mp)
2056 				m3->mc_pg[mc->mc_top] = np;
2057 		}
2058 	} else {
2059 		for (; m2; m2=m2->mc_next) {
2060 			if (m2->mc_snum < mc->mc_snum) continue;
2061 			if (m2->mc_pg[mc->mc_top] == mp) {
2062 				m2->mc_pg[mc->mc_top] = np;
2063 				if ((mc->mc_db->md_flags & MDB_DUPSORT) &&
2064 					m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top])
2065 				{
2066 					MDB_node *leaf = NODEPTR(np, mc->mc_ki[mc->mc_top]);
2067 					if (!(leaf->mn_flags & F_SUBDATA))
2068 						m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
2069 				}
2070 			}
2071 		}
2072 	}
2073 	return 0;
2074 
2075 fail:
2076 	txn->mt_flags |= MDB_TXN_ERROR;
2077 	return rc;
2078 }
2079 
2080 int
2081 mdb_env_sync(MDB_env *env, int force)
2082 {
2083 	int rc = 0;
2084 	if (force || !F_ISSET(env->me_flags, MDB_NOSYNC)) {
2085 		if (env->me_flags & MDB_WRITEMAP) {
2086 			int flags = ((env->me_flags & MDB_MAPASYNC) && !force)
2087 				? MS_ASYNC : MS_SYNC;
2088 			if (MDB_MSYNC(env->me_map, env->me_mapsize, flags))
2089 				rc = ErrCode();
2090 #ifdef _WIN32
2091 			else if (flags == MS_SYNC && MDB_FDATASYNC(env->me_fd))
2092 				rc = ErrCode();
2093 #endif
2094 		} else {
2095 			if (MDB_FDATASYNC(env->me_fd))
2096 				rc = ErrCode();
2097 		}
2098 	}
2099 	return rc;
2100 }
2101 
2102 /** Back up parent txn's cursors, then grab the originals for tracking */
2103 static int
2104 mdb_cursor_shadow(MDB_txn *src, MDB_txn *dst)
2105 {
2106 	MDB_cursor *mc, *bk;
2107 	MDB_xcursor *mx;
2108 	size_t size;
2109 	int i;
2110 
2111 	for (i = src->mt_numdbs; --i >= 0; ) {
2112 		if ((mc = src->mt_cursors[i]) != NULL) {
2113 			size = sizeof(MDB_cursor);
2114 			if (mc->mc_xcursor)
2115 				size += sizeof(MDB_xcursor);
2116 			for (; mc; mc = bk->mc_next) {
2117 				bk = malloc(size);
2118 				if (!bk)
2119 					return ENOMEM;
2120 				*bk = *mc;
2121 				mc->mc_backup = bk;
2122 				mc->mc_db = &dst->mt_dbs[i];
2123 				/* Kill pointers into src - and dst to reduce abuse: The
2124 				 * user may not use mc until dst ends. Otherwise we'd...
2125 				 */
2126 				mc->mc_txn    = NULL;	/* ...set this to dst */
2127 				mc->mc_dbflag = NULL;	/* ...and &dst->mt_dbflags[i] */
2128 				if ((mx = mc->mc_xcursor) != NULL) {
2129 					*(MDB_xcursor *)(bk+1) = *mx;
2130 					mx->mx_cursor.mc_txn = NULL; /* ...and dst. */
2131 				}
2132 				mc->mc_next = dst->mt_cursors[i];
2133 				dst->mt_cursors[i] = mc;
2134 			}
2135 		}
2136 	}
2137 	return MDB_SUCCESS;
2138 }
2139 
2140 /** Close this write txn's cursors, give parent txn's cursors back to parent.
2141  * @param[in] txn the transaction handle.
2142  * @param[in] merge true to keep changes to parent cursors, false to revert.
2143  * @return 0 on success, non-zero on failure.
2144  */
2145 static void
2146 mdb_cursors_close(MDB_txn *txn, unsigned merge)
2147 {
2148 	MDB_cursor **cursors = txn->mt_cursors, *mc, *next, *bk;
2149 	MDB_xcursor *mx;
2150 	int i;
2151 
2152 	for (i = txn->mt_numdbs; --i >= 0; ) {
2153 		for (mc = cursors[i]; mc; mc = next) {
2154 			next = mc->mc_next;
2155 			if ((bk = mc->mc_backup) != NULL) {
2156 				if (merge) {
2157 					/* Commit changes to parent txn */
2158 					mc->mc_next = bk->mc_next;
2159 					mc->mc_backup = bk->mc_backup;
2160 					mc->mc_txn = bk->mc_txn;
2161 					mc->mc_db = bk->mc_db;
2162 					mc->mc_dbflag = bk->mc_dbflag;
2163 					if ((mx = mc->mc_xcursor) != NULL)
2164 						mx->mx_cursor.mc_txn = bk->mc_txn;
2165 				} else {
2166 					/* Abort nested txn */
2167 					*mc = *bk;
2168 					if ((mx = mc->mc_xcursor) != NULL)
2169 						*mx = *(MDB_xcursor *)(bk+1);
2170 				}
2171 				mc = bk;
2172 			}
2173 			/* Only malloced cursors are permanently tracked. */
2174 			free(mc);
2175 		}
2176 		cursors[i] = NULL;
2177 	}
2178 }
2179 
/* When MDB_DEBUG is off, this function-like macro rewrites every use of
 * mdb_txn_reset0 - calls as well as the prototype and the definition -
 * to a one-argument form, so the "act" string is never passed around.
 */
#if !(MDB_DEBUG)
#define mdb_txn_reset0(txn, act) mdb_txn_reset0(txn)
#endif
/* Forward declaration: called before the definition appears */
static void
mdb_txn_reset0(MDB_txn *txn, const char *act);
2185 
/** Operation selector for mdb_reader_pid(): set or check a pid lock. */
#if !(MDB_PIDLOCK)		/* Currently the same as defined(_WIN32) */
enum Pidlock_op {
	Pidset, Pidcheck
};
#else
/* These values are handed to fcntl() unchanged as the command */
enum Pidlock_op {
	Pidset = F_SETLK, Pidcheck = F_GETLK
};
#endif
2195 
2196 /** Set or check a pid lock. Set returns 0 on success.
2197  * Check returns 0 if the process is certainly dead, nonzero if it may
2198  * be alive (the lock exists or an error happened so we do not know).
2199  *
2200  * On Windows Pidset is a no-op, we merely check for the existence
2201  * of the process with the given pid. On POSIX we use a single byte
2202  * lock on the lockfile, set at an offset equal to the pid.
2203  */
2204 static int
2205 mdb_reader_pid(MDB_env *env, enum Pidlock_op op, MDB_PID_T pid)
2206 {
2207 #if !(MDB_PIDLOCK)		/* Currently the same as defined(_WIN32) */
2208 	int ret = 0;
2209 	HANDLE h;
2210 	if (op == Pidcheck) {
2211 		h = OpenProcess(env->me_pidquery, FALSE, pid);
2212 		/* No documented "no such process" code, but other program use this: */
2213 		if (!h)
2214 			return ErrCode() != ERROR_INVALID_PARAMETER;
2215 		/* A process exists until all handles to it close. Has it exited? */
2216 		ret = WaitForSingleObject(h, 0) != 0;
2217 		CloseHandle(h);
2218 	}
2219 	return ret;
2220 #else
2221 	for (;;) {
2222 		int rc;
2223 		struct flock lock_info;
2224 		memset(&lock_info, 0, sizeof(lock_info));
2225 		lock_info.l_type = F_WRLCK;
2226 		lock_info.l_whence = SEEK_SET;
2227 		lock_info.l_start = pid;
2228 		lock_info.l_len = 1;
2229 		if ((rc = fcntl(env->me_lfd, op, &lock_info)) == 0) {
2230 			if (op == F_GETLK && lock_info.l_type != F_UNLCK)
2231 				rc = -1;
2232 		} else if ((rc = ErrCode()) == EINTR) {
2233 			continue;
2234 		}
2235 		return rc;
2236 	}
2237 #endif
2238 }
2239 
2240 /** Common code for #mdb_txn_begin() and #mdb_txn_renew().
2241  * @param[in] txn the transaction handle to initialize
2242  * @return 0 on success, non-zero on failure.
2243  */
2244 static int
2245 mdb_txn_renew0(MDB_txn *txn)
2246 {
2247 	MDB_env *env = txn->mt_env;
2248 	MDB_txninfo *ti = env->me_txns;
2249 	MDB_meta *meta;
2250 	unsigned int i, nr;
2251 	uint16_t x;
2252 	int rc, new_notls = 0;
2253 
2254 	/* Setup db info */
2255 	txn->mt_numdbs = env->me_numdbs;
2256 	txn->mt_dbxs = env->me_dbxs;	/* mostly static anyway */
2257 
2258 	if (txn->mt_flags & MDB_TXN_RDONLY) {
2259 		if (!ti) {
2260 			meta = env->me_metas[ mdb_env_pick_meta(env) ];
2261 			txn->mt_txnid = meta->mm_txnid;
2262 			txn->mt_u.reader = NULL;
2263 		} else {
2264 			MDB_reader *r = (env->me_flags & MDB_NOTLS) ? txn->mt_u.reader :
2265 				pthread_getspecific(env->me_txkey);
2266 			if (r) {
2267 				if (r->mr_pid != env->me_pid || r->mr_txnid != (txnid_t)-1)
2268 					return MDB_BAD_RSLOT;
2269 			} else {
2270 				MDB_PID_T pid = env->me_pid;
2271 				pthread_t tid = pthread_self();
2272 
2273 				if (!(env->me_flags & MDB_LIVE_READER)) {
2274 					rc = mdb_reader_pid(env, Pidset, pid);
2275 					if (rc)
2276 						return rc;
2277 					env->me_flags |= MDB_LIVE_READER;
2278 				}
2279 
2280 				LOCK_MUTEX_R(env);
2281 				nr = ti->mti_numreaders;
2282 				for (i=0; i<nr; i++)
2283 					if (ti->mti_readers[i].mr_pid == 0)
2284 						break;
2285 				if (i == env->me_maxreaders) {
2286 					UNLOCK_MUTEX_R(env);
2287 					return MDB_READERS_FULL;
2288 				}
2289 				ti->mti_readers[i].mr_pid = pid;
2290 				ti->mti_readers[i].mr_tid = tid;
2291 				if (i == nr)
2292 					ti->mti_numreaders = ++nr;
2293 				/* Save numreaders for un-mutexed mdb_env_close() */
2294 				env->me_numreaders = nr;
2295 				UNLOCK_MUTEX_R(env);
2296 
2297 				r = &ti->mti_readers[i];
2298 				new_notls = (env->me_flags & MDB_NOTLS);
2299 				if (!new_notls && (rc=pthread_setspecific(env->me_txkey, r))) {
2300 					r->mr_pid = 0;
2301 					return rc;
2302 				}
2303 			}
2304 			txn->mt_txnid = r->mr_txnid = ti->mti_txnid;
2305 			txn->mt_u.reader = r;
2306 			meta = env->me_metas[txn->mt_txnid & 1];
2307 		}
2308 	} else {
2309 		if (ti) {
2310 			LOCK_MUTEX_W(env);
2311 
2312 			txn->mt_txnid = ti->mti_txnid;
2313 			meta = env->me_metas[txn->mt_txnid & 1];
2314 		} else {
2315 			meta = env->me_metas[ mdb_env_pick_meta(env) ];
2316 			txn->mt_txnid = meta->mm_txnid;
2317 		}
2318 		txn->mt_txnid++;
2319 #if MDB_DEBUG
2320 		if (txn->mt_txnid == mdb_debug_start)
2321 			mdb_debug = 1;
2322 #endif
2323 		txn->mt_dirty_room = MDB_IDL_UM_MAX;
2324 		txn->mt_u.dirty_list = env->me_dirty_list;
2325 		txn->mt_u.dirty_list[0].mid = 0;
2326 		txn->mt_free_pgs = env->me_free_pgs;
2327 		txn->mt_free_pgs[0] = 0;
2328 		txn->mt_spill_pgs = NULL;
2329 		env->me_txn = txn;
2330 	}
2331 
2332 	/* Copy the DB info and flags */
2333 	memcpy(txn->mt_dbs, meta->mm_dbs, 2 * sizeof(MDB_db));
2334 
2335 	/* Moved to here to avoid a data race in read TXNs */
2336 	txn->mt_next_pgno = meta->mm_last_pg+1;
2337 
2338 	for (i=2; i<txn->mt_numdbs; i++) {
2339 		x = env->me_dbflags[i];
2340 		txn->mt_dbs[i].md_flags = x & PERSISTENT_FLAGS;
2341 		txn->mt_dbflags[i] = (x & MDB_VALID) ? DB_VALID|DB_STALE : 0;
2342 	}
2343 	txn->mt_dbflags[0] = txn->mt_dbflags[1] = DB_VALID;
2344 
2345 	if (env->me_maxpg < txn->mt_next_pgno) {
2346 		mdb_txn_reset0(txn, "renew0-mapfail");
2347 		if (new_notls) {
2348 			txn->mt_u.reader->mr_pid = 0;
2349 			txn->mt_u.reader = NULL;
2350 		}
2351 		return MDB_MAP_RESIZED;
2352 	}
2353 
2354 	return MDB_SUCCESS;
2355 }
2356 
2357 int
2358 mdb_txn_renew(MDB_txn *txn)
2359 {
2360 	int rc;
2361 
2362 	if (!txn || txn->mt_dbxs)	/* A reset txn has mt_dbxs==NULL */
2363 		return EINVAL;
2364 
2365 	if (txn->mt_env->me_flags & MDB_FATAL_ERROR) {
2366 		DPUTS("environment had fatal error, must shutdown!");
2367 		return MDB_PANIC;
2368 	}
2369 
2370 	rc = mdb_txn_renew0(txn);
2371 	if (rc == MDB_SUCCESS) {
2372 		DPRINTF(("renew txn %"Z"u%c %p on mdbenv %p, root page %"Z"u",
2373 			txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w',
2374 			(void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root));
2375 	}
2376 	return rc;
2377 }
2378 
2379 int
2380 mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret)
2381 {
2382 	MDB_txn *txn;
2383 	MDB_ntxn *ntxn;
2384 	int rc, size, tsize = sizeof(MDB_txn);
2385 
2386 	if (env->me_flags & MDB_FATAL_ERROR) {
2387 		DPUTS("environment had fatal error, must shutdown!");
2388 		return MDB_PANIC;
2389 	}
2390 	if ((env->me_flags & MDB_RDONLY) && !(flags & MDB_RDONLY))
2391 		return EACCES;
2392 	if (parent) {
2393 		/* Nested transactions: Max 1 child, write txns only, no writemap */
2394 		if (parent->mt_child ||
2395 			(flags & MDB_RDONLY) ||
2396 			(parent->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) ||
2397 			(env->me_flags & MDB_WRITEMAP))
2398 		{
2399 			return (parent->mt_flags & MDB_TXN_RDONLY) ? EINVAL : MDB_BAD_TXN;
2400 		}
2401 		tsize = sizeof(MDB_ntxn);
2402 	}
2403 	size = tsize + env->me_maxdbs * (sizeof(MDB_db)+1);
2404 	if (!(flags & MDB_RDONLY))
2405 		size += env->me_maxdbs * sizeof(MDB_cursor *);
2406 
2407 	if ((txn = calloc(1, size)) == NULL) {
2408 		DPRINTF(("calloc: %s", strerror(ErrCode())));
2409 		return ENOMEM;
2410 	}
2411 	txn->mt_dbs = (MDB_db *) ((char *)txn + tsize);
2412 	if (flags & MDB_RDONLY) {
2413 		txn->mt_flags |= MDB_TXN_RDONLY;
2414 		txn->mt_dbflags = (unsigned char *)(txn->mt_dbs + env->me_maxdbs);
2415 	} else {
2416 		txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs);
2417 		txn->mt_dbflags = (unsigned char *)(txn->mt_cursors + env->me_maxdbs);
2418 	}
2419 	txn->mt_env = env;
2420 
2421 	if (parent) {
2422 		unsigned int i;
2423 		txn->mt_u.dirty_list = malloc(sizeof(MDB_ID2)*MDB_IDL_UM_SIZE);
2424 		if (!txn->mt_u.dirty_list ||
2425 			!(txn->mt_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX)))
2426 		{
2427 			free(txn->mt_u.dirty_list);
2428 			free(txn);
2429 			return ENOMEM;
2430 		}
2431 		txn->mt_txnid = parent->mt_txnid;
2432 		txn->mt_dirty_room = parent->mt_dirty_room;
2433 		txn->mt_u.dirty_list[0].mid = 0;
2434 		txn->mt_spill_pgs = NULL;
2435 		txn->mt_next_pgno = parent->mt_next_pgno;
2436 		parent->mt_child = txn;
2437 		txn->mt_parent = parent;
2438 		txn->mt_numdbs = parent->mt_numdbs;
2439 		txn->mt_flags = parent->mt_flags;
2440 		txn->mt_dbxs = parent->mt_dbxs;
2441 		memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDB_db));
2442 		/* Copy parent's mt_dbflags, but clear DB_NEW */
2443 		for (i=0; i<txn->mt_numdbs; i++)
2444 			txn->mt_dbflags[i] = parent->mt_dbflags[i] & ~DB_NEW;
2445 		rc = 0;
2446 		ntxn = (MDB_ntxn *)txn;
2447 		ntxn->mnt_pgstate = env->me_pgstate; /* save parent me_pghead & co */
2448 		if (env->me_pghead) {
2449 			size = MDB_IDL_SIZEOF(env->me_pghead);
2450 			env->me_pghead = mdb_midl_alloc(env->me_pghead[0]);
2451 			if (env->me_pghead)
2452 				memcpy(env->me_pghead, ntxn->mnt_pgstate.mf_pghead, size);
2453 			else
2454 				rc = ENOMEM;
2455 		}
2456 		if (!rc)
2457 			rc = mdb_cursor_shadow(parent, txn);
2458 		if (rc)
2459 			mdb_txn_reset0(txn, "beginchild-fail");
2460 	} else {
2461 		rc = mdb_txn_renew0(txn);
2462 	}
2463 	if (rc)
2464 		free(txn);
2465 	else {
2466 		*ret = txn;
2467 		DPRINTF(("begin txn %"Z"u%c %p on mdbenv %p, root page %"Z"u",
2468 			txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w',
2469 			(void *) txn, (void *) env, txn->mt_dbs[MAIN_DBI].md_root));
2470 	}
2471 
2472 	return rc;
2473 }
2474 
2475 MDB_env *
2476 mdb_txn_env(MDB_txn *txn)
2477 {
2478 	if(!txn) return NULL;
2479 	return txn->mt_env;
2480 }
2481 
2482 /** Export or close DBI handles opened in this txn. */
2483 static void
2484 mdb_dbis_update(MDB_txn *txn, int keep)
2485 {
2486 	int i;
2487 	MDB_dbi n = txn->mt_numdbs;
2488 	MDB_env *env = txn->mt_env;
2489 	unsigned char *tdbflags = txn->mt_dbflags;
2490 
2491 	for (i = n; --i >= 2;) {
2492 		if (tdbflags[i] & DB_NEW) {
2493 			if (keep) {
2494 				env->me_dbflags[i] = txn->mt_dbs[i].md_flags | MDB_VALID;
2495 			} else {
2496 				char *ptr = env->me_dbxs[i].md_name.mv_data;
2497 				env->me_dbxs[i].md_name.mv_data = NULL;
2498 				env->me_dbxs[i].md_name.mv_size = 0;
2499 				env->me_dbflags[i] = 0;
2500 				free(ptr);
2501 			}
2502 		}
2503 	}
2504 	if (keep && env->me_numdbs < n)
2505 		env->me_numdbs = n;
2506 }
2507 
2508 /** Common code for #mdb_txn_reset() and #mdb_txn_abort().
2509  * May be called twice for readonly txns: First reset it, then abort.
2510  * @param[in] txn the transaction handle to reset
2511  * @param[in] act why the transaction is being reset
2512  */
2513 static void
2514 mdb_txn_reset0(MDB_txn *txn, const char *act)
2515 {
2516 	MDB_env	*env = txn->mt_env;
2517 
2518 	/* Close any DBI handles opened in this txn */
2519 	mdb_dbis_update(txn, 0);
2520 
2521 	DPRINTF(("%s txn %"Z"u%c %p on mdbenv %p, root page %"Z"u",
2522 		act, txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w',
2523 		(void *) txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root));
2524 
2525 	if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
2526 		if (txn->mt_u.reader) {
2527 			txn->mt_u.reader->mr_txnid = (txnid_t)-1;
2528 			if (!(env->me_flags & MDB_NOTLS))
2529 				txn->mt_u.reader = NULL; /* txn does not own reader */
2530 		}
2531 		txn->mt_numdbs = 0;		/* close nothing if called again */
2532 		txn->mt_dbxs = NULL;	/* mark txn as reset */
2533 	} else {
2534 		mdb_cursors_close(txn, 0);
2535 
2536 		if (!(env->me_flags & MDB_WRITEMAP)) {
2537 			mdb_dlist_free(txn);
2538 		}
2539 		mdb_midl_free(env->me_pghead);
2540 
2541 		if (txn->mt_parent) {
2542 			txn->mt_parent->mt_child = NULL;
2543 			env->me_pgstate = ((MDB_ntxn *)txn)->mnt_pgstate;
2544 			mdb_midl_free(txn->mt_free_pgs);
2545 			mdb_midl_free(txn->mt_spill_pgs);
2546 			free(txn->mt_u.dirty_list);
2547 			return;
2548 		}
2549 
2550 		if (mdb_midl_shrink(&txn->mt_free_pgs))
2551 			env->me_free_pgs = txn->mt_free_pgs;
2552 		env->me_pghead = NULL;
2553 		env->me_pglast = 0;
2554 
2555 		env->me_txn = NULL;
2556 		/* The writer mutex was locked in mdb_txn_begin. */
2557 		if (env->me_txns)
2558 			UNLOCK_MUTEX_W(env);
2559 	}
2560 }
2561 
2562 void
2563 mdb_txn_reset(MDB_txn *txn)
2564 {
2565 	if (txn == NULL)
2566 		return;
2567 
2568 	/* This call is only valid for read-only txns */
2569 	if (!(txn->mt_flags & MDB_TXN_RDONLY))
2570 		return;
2571 
2572 	mdb_txn_reset0(txn, "reset");
2573 }
2574 
2575 void
2576 mdb_txn_abort(MDB_txn *txn)
2577 {
2578 	if (txn == NULL)
2579 		return;
2580 
2581 	if (txn->mt_child)
2582 		mdb_txn_abort(txn->mt_child);
2583 
2584 	mdb_txn_reset0(txn, "abort");
2585 	/* Free reader slot tied to this txn (if MDB_NOTLS && writable FS) */
2586 	if ((txn->mt_flags & MDB_TXN_RDONLY) && txn->mt_u.reader)
2587 		txn->mt_u.reader->mr_pid = 0;
2588 
2589 	free(txn);
2590 }
2591 
2592 /** Save the freelist as of this transaction to the freeDB.
2593  * This changes the freelist. Keep trying until it stabilizes.
2594  */
2595 static int
2596 mdb_freelist_save(MDB_txn *txn)
2597 {
2598 	/* env->me_pghead[] can grow and shrink during this call.
2599 	 * env->me_pglast and txn->mt_free_pgs[] can only grow.
2600 	 * Page numbers cannot disappear from txn->mt_free_pgs[].
2601 	 */
2602 	MDB_cursor mc;
2603 	MDB_env	*env = txn->mt_env;
2604 	int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1;
2605 	txnid_t	pglast = 0, head_id = 0;
2606 	pgno_t	freecnt = 0, *free_pgs, *mop;
2607 	ssize_t	head_room = 0, total_room = 0, mop_len, clean_limit;
2608 
2609 	mdb_cursor_init(&mc, txn, FREE_DBI, NULL);
2610 
2611 	if (env->me_pghead) {
2612 		/* Make sure first page of freeDB is touched and on freelist */
2613 		rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST|MDB_PS_MODIFY);
2614 		if (rc && rc != MDB_NOTFOUND)
2615 			return rc;
2616 	}
2617 
2618 	/* MDB_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP) */
2619 	clean_limit = (env->me_flags & (MDB_NOMEMINIT|MDB_WRITEMAP))
2620 		? SSIZE_MAX : maxfree_1pg;
2621 
2622 	for (;;) {
2623 		/* Come back here after each Put() in case freelist changed */
2624 		MDB_val key, data;
2625 		pgno_t *pgs;
2626 		ssize_t j;
2627 
2628 		/* If using records from freeDB which we have not yet
2629 		 * deleted, delete them and any we reserved for me_pghead.
2630 		 */
2631 		while (pglast < env->me_pglast) {
2632 			rc = mdb_cursor_first(&mc, &key, NULL);
2633 			if (rc)
2634 				return rc;
2635 			pglast = head_id = *(txnid_t *)key.mv_data;
2636 			total_room = head_room = 0;
2637 			mdb_tassert(txn, pglast <= env->me_pglast);
2638 			rc = mdb_cursor_del(&mc, 0);
2639 			if (rc)
2640 				return rc;
2641 		}
2642 
2643 		/* Save the IDL of pages freed by this txn, to a single record */
2644 		if (freecnt < txn->mt_free_pgs[0]) {
2645 			if (!freecnt) {
2646 				/* Make sure last page of freeDB is touched and on freelist */
2647 				rc = mdb_page_search(&mc, NULL, MDB_PS_LAST|MDB_PS_MODIFY);
2648 				if (rc && rc != MDB_NOTFOUND)
2649 					return rc;
2650 			}
2651 			free_pgs = txn->mt_free_pgs;
2652 			/* Write to last page of freeDB */
2653 			key.mv_size = sizeof(txn->mt_txnid);
2654 			key.mv_data = &txn->mt_txnid;
2655 			do {
2656 				freecnt = free_pgs[0];
2657 				data.mv_size = MDB_IDL_SIZEOF(free_pgs);
2658 				rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE);
2659 				if (rc)
2660 					return rc;
2661 				/* Retry if mt_free_pgs[] grew during the Put() */
2662 				free_pgs = txn->mt_free_pgs;
2663 			} while (freecnt < free_pgs[0]);
2664 			mdb_midl_sort(free_pgs);
2665 			memcpy(data.mv_data, free_pgs, data.mv_size);
2666 #if (MDB_DEBUG) > 1
2667 			{
2668 				unsigned int i = free_pgs[0];
2669 				DPRINTF(("IDL write txn %"Z"u root %"Z"u num %u",
2670 					txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i));
2671 				for (; i; i--)
2672 					DPRINTF(("IDL %"Z"u", free_pgs[i]));
2673 			}
2674 #endif
2675 			continue;
2676 		}
2677 
2678 		mop = env->me_pghead;
2679 		mop_len = mop ? mop[0] : 0;
2680 
2681 		/* Reserve records for me_pghead[]. Split it if multi-page,
2682 		 * to avoid searching freeDB for a page range. Use keys in
2683 		 * range [1,me_pglast]: Smaller than txnid of oldest reader.
2684 		 */
2685 		if (total_room >= mop_len) {
2686 			if (total_room == mop_len || --more < 0)
2687 				break;
2688 		} else if (head_room >= maxfree_1pg && head_id > 1) {
2689 			/* Keep current record (overflow page), add a new one */
2690 			head_id--;
2691 			head_room = 0;
2692 		}
2693 		/* (Re)write {key = head_id, IDL length = head_room} */
2694 		total_room -= head_room;
2695 		head_room = mop_len - total_room;
2696 		if (head_room > maxfree_1pg && head_id > 1) {
2697 			/* Overflow multi-page for part of me_pghead */
2698 			head_room /= head_id; /* amortize page sizes */
2699 			head_room += maxfree_1pg - head_room % (maxfree_1pg + 1);
2700 		} else if (head_room < 0) {
2701 			/* Rare case, not bothering to delete this record */
2702 			head_room = 0;
2703 		}
2704 		key.mv_size = sizeof(head_id);
2705 		key.mv_data = &head_id;
2706 		data.mv_size = (head_room + 1) * sizeof(pgno_t);
2707 		rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE);
2708 		if (rc)
2709 			return rc;
2710 		/* IDL is initially empty, zero out at least the length */
2711 		pgs = (pgno_t *)data.mv_data;
2712 		j = head_room > clean_limit ? head_room : 0;
2713 		do {
2714 			pgs[j] = 0;
2715 		} while (--j >= 0);
2716 		total_room += head_room;
2717 	}
2718 
2719 	/* Fill in the reserved me_pghead records */
2720 	rc = MDB_SUCCESS;
2721 	if (mop_len) {
2722 		MDB_val key, data;
2723 
2724 		mop += mop_len;
2725 		rc = mdb_cursor_first(&mc, &key, &data);
2726 		for (; !rc; rc = mdb_cursor_next(&mc, &key, &data, MDB_NEXT)) {
2727 			unsigned flags = MDB_CURRENT;
2728 			txnid_t id = *(txnid_t *)key.mv_data;
2729 			ssize_t	len = (ssize_t)(data.mv_size / sizeof(MDB_ID)) - 1;
2730 			MDB_ID save;
2731 
2732 			mdb_tassert(txn, len >= 0 && id <= env->me_pglast);
2733 			key.mv_data = &id;
2734 			if (len > mop_len) {
2735 				len = mop_len;
2736 				data.mv_size = (len + 1) * sizeof(MDB_ID);
2737 				flags = 0;
2738 			}
2739 			data.mv_data = mop -= len;
2740 			save = mop[0];
2741 			mop[0] = len;
2742 			rc = mdb_cursor_put(&mc, &key, &data, flags);
2743 			mop[0] = save;
2744 			if (rc || !(mop_len -= len))
2745 				break;
2746 		}
2747 	}
2748 	return rc;
2749 }
2750 
2751 /** Flush (some) dirty pages to the map, after clearing their dirty flag.
 *
 *	Entries 1..keep of the dirty list are left untouched. Of the remaining
 *	entries, pages flagged #P_KEEP only have that flag cleared and stay in
 *	the list, compacted to directly follow the kept prefix; all other
 *	pages get #P_DIRTY cleared and are dropped from the list.
 *	With #MDB_WRITEMAP no write is issued here. Otherwise every dropped
 *	page is written at file offset pgno * psize (one WriteFile() per page
 *	on Windows, batches of contiguous pages elsewhere) and then released
 *	with mdb_dpage_free().
 *	On success dl[0].mid holds the number of entries still listed and
 *	mt_dirty_room has grown by the number of entries removed.
 *	On a write error the function returns at once; flag and mid changes
 *	made up to that point are not undone.
2752  * @param[in] txn the transaction that's being committed
2753  * @param[in] keep number of initial pages in dirty_list to keep dirty.
2754  * @return 0 on success, non-zero on failure.
 *	Failures are the system error of a failed write/seek, or EIO after
 *	a short write.
2755  */
2756 static int
2757 mdb_page_flush(MDB_txn *txn, int keep)
2758 {
2759 	MDB_env		*env = txn->mt_env;
2760 	MDB_ID2L	dl = txn->mt_u.dirty_list;
2761 	unsigned	psize = env->me_psize, j;
2762 	int			i, pagecount = dl[0].mid, rc;
2763 	size_t		size = 0, pos = 0;
2764 	pgno_t		pgno = 0;
2765 	MDB_page	*dp = NULL;
2766 #ifdef _WIN32
2767 	OVERLAPPED	ov;
2768 #else
	/* iov[0..n-1] is the pending batch: it starts at file offset wpos
	 * and totals wsize bytes; next_pos is the offset just past it.
	 */
2769 	struct iovec iov[MDB_COMMIT_PAGES];
2770 	ssize_t		wpos = 0, wsize = 0, wres;
2771 	size_t		next_pos = 1; /* impossible pos, so pos != next_pos */
2772 	int			n = 0;
2773 #endif
2774 
	/* i scans the list, j is the index of the last entry kept so far */
2775 	j = i = keep;
2776 
2777 	if (env->me_flags & MDB_WRITEMAP) {
2778 		/* Clear dirty flags */
2779 		while (++i <= pagecount) {
2780 			dp = dl[i].mptr;
2781 			/* Don't flush this page yet */
2782 			if (dp->mp_flags & P_KEEP) {
2783 				dp->mp_flags ^= P_KEEP;
				/* compact: move the kept entry down to slot j+1 */
2784 				dl[++j] = dl[i];
2785 				continue;
2786 			}
2787 			dp->mp_flags &= ~P_DIRTY;
2788 		}
2789 		goto done;
2790 	}
2791 
2792 	/* Write the pages */
2793 	for (;;) {
2794 		if (++i <= pagecount) {
2795 			dp = dl[i].mptr;
2796 			/* Don't flush this page yet */
2797 			if (dp->mp_flags & P_KEEP) {
2798 				dp->mp_flags ^= P_KEEP;
				/* mid 0 marks "skipped"; the real page number is
				 * restored from mp_pgno in the loop after the writes.
				 */
2799 				dl[i].mid = 0;
2800 				continue;
2801 			}
2802 			pgno = dl[i].mid;
2803 			/* clear dirty flag */
2804 			dp->mp_flags &= ~P_DIRTY;
2805 			pos = pgno * psize;
2806 			size = psize;
			/* an overflow page spans mp_pages consecutive pages */
2807 			if (IS_OVERFLOW(dp)) size *= dp->mp_pages;
2808 		}
2809 #ifdef _WIN32
2810 		else break;
2811 
2812 		/* Windows actually supports scatter/gather I/O, but only on
2813 		 * unbuffered file handles. Since we're relying on the OS page
2814 		 * cache for all our data, that's self-defeating. So we just
2815 		 * write pages one at a time. We use the ov structure to set
2816 		 * the write offset, to at least save the overhead of a Seek
2817 		 * system call.
2818 		 */
2819 		DPRINTF(("committing page %"Z"u", pgno));
2820 		memset(&ov, 0, sizeof(ov));
2821 		ov.Offset = pos & 0xffffffff;
2822 		ov.OffsetHigh = pos >> 16 >> 16;
2823 		if (!WriteFile(env->me_fd, dp, size, NULL, &ov)) {
2824 			rc = ErrCode();
2825 			DPRINTF(("WriteFile: %d", rc));
2826 			return rc;
2827 		}
2828 #else
2829 		/* Write up to MDB_COMMIT_PAGES dirty pages at a time. */
		/* The pending batch is written out when the current page does
		 * not start where the batch ends, the iovec array is full, or
		 * the batch would exceed MAX_WRITE. Once i has passed pagecount,
		 * pos keeps its last value and so differs from next_pos, which
		 * sends the final batch through this branch before the break.
		 */
2830 		if (pos!=next_pos || n==MDB_COMMIT_PAGES || wsize+size>MAX_WRITE) {
2831 			if (n) {
2832 				/* Write previous page(s) */
2833 #ifdef MDB_USE_PWRITEV
2834 				wres = pwritev(env->me_fd, iov, n, wpos);
2835 #else
2836 				if (n == 1) {
2837 					wres = pwrite(env->me_fd, iov[0].iov_base, wsize, wpos);
2838 				} else {
					/* no positional gather-write here: seek, then writev */
2839 					if (lseek(env->me_fd, wpos, SEEK_SET) == -1) {
2840 						rc = ErrCode();
2841 						DPRINTF(("lseek: %s", strerror(rc)));
2842 						return rc;
2843 					}
2844 					wres = writev(env->me_fd, iov, n);
2845 				}
2846 #endif
2847 				if (wres != wsize) {
2848 					if (wres < 0) {
2849 						rc = ErrCode();
2850 						DPRINTF(("Write error: %s", strerror(rc)));
2851 					} else {
2852 						rc = EIO; /* TODO: Use which error code? */
2853 						DPUTS("short write, filesystem full?");
2854 					}
2855 					return rc;
2856 				}
2857 				n = 0;
2858 			}
2859 			if (i > pagecount)
2860 				break;
			/* start a new batch at the current page */
2861 			wpos = pos;
2862 			wsize = 0;
2863 		}
2864 		DPRINTF(("committing page %"Z"u", pgno));
		/* append the current page to the pending batch */
2865 		next_pos = pos + size;
2866 		iov[n].iov_len = size;
2867 		iov[n].iov_base = (char *)dp;
2868 		wsize += size;
2869 		n++;
2870 #endif	/* _WIN32 */
2871 	}
2872 
	/* Second pass: compact skipped entries, free the written pages */
2873 	for (i = keep; ++i <= pagecount; ) {
2874 		dp = dl[i].mptr;
2875 		/* This is a page we skipped above */
2876 		if (!dl[i].mid) {
2877 			dl[++j] = dl[i];
2878 			dl[j].mid = dp->mp_pgno;
2879 			continue;
2880 		}
2881 		mdb_dpage_free(env, dp);
2882 	}
2883 
2884 done:
	/* Both paths arrive with i one past the last scanned index, so after
	 * the decrement i - j is the number of entries dropped from the list.
	 */
2885 	i--;
2886 	txn->mt_dirty_room += i - j;
2887 	dl[0].mid = j;
2888 	return MDB_SUCCESS;
2889 }
2890 
/** Commit a transaction.
 *
 *	A child transaction, if any, is committed first.
 *	A read-only transaction is finished through mdb_txn_abort() after
 *	mdb_dbis_update() and always reports success.
 *	A nested write transaction is folded into its parent: free-page list,
 *	next page number, flags, cursors, DB table, dirty list and spill list
 *	are merged, then the child is freed.
 *	A top-level write transaction stores the records of dirty named DBs
 *	in the main DB, saves the freelist, flushes the dirty pages, syncs
 *	the environment and writes the meta page. It then clears env->me_txn,
 *	releases the writer mutex when env->me_txns is set, and frees the txn.
 * @param[in] txn the transaction to commit; it must not be used afterwards
 *	when the call gets past the initial argument check, because every
 *	later path frees it or hands it to mdb_txn_abort().
 * @return 0 on success, non-zero on failure: EINVAL for a NULL txn, a txn
 *	without environment or a write txn that is not the environment's
 *	current one, MDB_BAD_TXN when the error flag is set, otherwise the
 *	error of the step that failed.
 */
2891 int
2892 mdb_txn_commit(MDB_txn *txn)
2893 {
2894 	int		rc;
2895 	unsigned int i;
2896 	MDB_env	*env;
2897 
2898 	if (txn == NULL || txn->mt_env == NULL)
2899 		return EINVAL;
2900 
	/* Commit the innermost transaction first */
2901 	if (txn->mt_child) {
2902 		rc = mdb_txn_commit(txn->mt_child);
2903 		txn->mt_child = NULL;
2904 		if (rc)
2905 			goto fail;
2906 	}
2907 
2908 	env = txn->mt_env;
2909 
2910 	if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
2911 		mdb_dbis_update(txn, 1);
2912 		txn->mt_numdbs = 2; /* so txn_abort() doesn't close any new handles */
2913 		mdb_txn_abort(txn);
2914 		return MDB_SUCCESS;
2915 	}
2916 
2917 	if (F_ISSET(txn->mt_flags, MDB_TXN_ERROR)) {
2918 		DPUTS("error flag is set, can't commit");
		/* a failed child poisons its parent as well */
2919 		if (txn->mt_parent)
2920 			txn->mt_parent->mt_flags |= MDB_TXN_ERROR;
2921 		rc = MDB_BAD_TXN;
2922 		goto fail;
2923 	}
2924 
	/* Nested transaction: merge everything into the parent */
2925 	if (txn->mt_parent) {
2926 		MDB_txn *parent = txn->mt_parent;
2927 		MDB_ID2L dst, src;
2928 		MDB_IDL pspill;
2929 		unsigned x, y, len, ps_len;
2930 
2931 		/* Append our free list to parent's */
2932 		rc = mdb_midl_append_list(&parent->mt_free_pgs, txn->mt_free_pgs);
2933 		if (rc)
2934 			goto fail;
2935 		mdb_midl_free(txn->mt_free_pgs);
2936 		/* Failures after this must either undo the changes
2937 		 * to the parent or set MDB_TXN_ERROR in the parent.
2938 		 */
2939 
2940 		parent->mt_next_pgno = txn->mt_next_pgno;
2941 		parent->mt_flags = txn->mt_flags;
2942 
2943 		/* Merge our cursors into parent's and close them */
2944 		mdb_cursors_close(txn, 1);
2945 
2946 		/* Update parent's DB table. */
2947 		memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDB_db));
2948 		parent->mt_numdbs = txn->mt_numdbs;
2949 		parent->mt_dbflags[0] = txn->mt_dbflags[0];
2950 		parent->mt_dbflags[1] = txn->mt_dbflags[1];
2951 		for (i=2; i<txn->mt_numdbs; i++) {
2952 			/* preserve parent's DB_NEW status */
2953 			x = parent->mt_dbflags[i] & DB_NEW;
2954 			parent->mt_dbflags[i] = txn->mt_dbflags[i] | x;
2955 		}
2956 
2957 		dst = parent->mt_u.dirty_list;
2958 		src = txn->mt_u.dirty_list;
2959 		/* Remove anything in our dirty list from parent's spill list */
2960 		if ((pspill = parent->mt_spill_pgs) && (ps_len = pspill[0])) {
2961 			x = y = ps_len;
			/* Temporary sentinel in slot 0: no pn compares greater, so
			 * the downward scan below always stops by index 0.
			 */
2962 			pspill[0] = (pgno_t)-1;
2963 			/* Mark our dirty pages as deleted in parent spill list */
2964 			for (i=0, len=src[0].mid; ++i <= len; ) {
				/* spill entries are compared against pgno << 1; the low
				 * bit is used below as the "deleted" mark
				 */
2965 				MDB_ID pn = src[i].mid << 1;
2966 				while (pn > pspill[x])
2967 					x--;
2968 				if (pn == pspill[x]) {
2969 					pspill[x] = 1;
					/* y tracks the slot just below the lowest deletion */
2970 					y = --x;
2971 				}
2972 			}
2973 			/* Squash deleted pagenums if we deleted any */
2974 			for (x=y; ++x <= ps_len; )
2975 				if (!(pspill[x] & 1))
2976 					pspill[++y] = pspill[x];
			/* slot 0 becomes the (possibly reduced) length again */
2977 			pspill[0] = y;
2978 		}
2979 
2980 		/* Find len = length of merging our dirty list with parent's */
2981 		x = dst[0].mid;
2982 		dst[0].mid = 0;		/* simplify loops */
2983 		if (parent->mt_parent) {
			/* Start from the sum and subtract one for every page
			 * number present in both lists.
			 */
2984 			len = x + src[0].mid;
2985 			y = mdb_mid2l_search(src, dst[x].mid + 1) - 1;
2986 			for (i = x; y && i; y--) {
2987 				pgno_t yp = src[y].mid;
2988 				while (yp < dst[i].mid)
2989 					i--;
2990 				if (yp == dst[i].mid) {
2991 					i--;
2992 					len--;
2993 				}
2994 			}
2995 		} else { /* Simplify the above for single-ancestor case */
2996 			len = MDB_IDL_UM_MAX - txn->mt_dirty_room;
2997 		}
2998 		/* Merge our dirty list with parent's */
		/* The merge fills dst from index len downwards. When a page
		 * number is in both lists the child's entry is kept and the
		 * parent's page buffer is freed.
		 */
2999 		y = src[0].mid;
3000 		for (i = len; y; dst[i--] = src[y--]) {
3001 			pgno_t yp = src[y].mid;
3002 			while (yp < dst[x].mid)
3003 				dst[i--] = dst[x--];
3004 			if (yp == dst[x].mid)
3005 				free(dst[x--].mptr);
3006 		}
3007 		mdb_tassert(txn, i == x);
3008 		dst[0].mid = len;
3009 		free(txn->mt_u.dirty_list);
3010 		parent->mt_dirty_room = txn->mt_dirty_room;
3011 		if (txn->mt_spill_pgs) {
3012 			if (parent->mt_spill_pgs) {
3013 				/* TODO: Prevent failure here, so parent does not fail */
3014 				rc = mdb_midl_append_list(&parent->mt_spill_pgs, txn->mt_spill_pgs);
3015 				if (rc)
3016 					parent->mt_flags |= MDB_TXN_ERROR;
3017 				mdb_midl_free(txn->mt_spill_pgs);
3018 				mdb_midl_sort(parent->mt_spill_pgs);
3019 			} else {
				/* parent had no spill list: hand ours over */
3020 				parent->mt_spill_pgs = txn->mt_spill_pgs;
3021 			}
3022 		}
3023 
3024 		parent->mt_child = NULL;
3025 		mdb_midl_free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pghead);
3026 		free(txn);
		/* rc is 0 here unless appending the spill list failed above */
3027 		return rc;
3028 	}
3029 
	/* Top-level write transaction from here on */
3030 	if (txn != env->me_txn) {
3031 		DPUTS("attempt to commit unknown transaction");
3032 		rc = EINVAL;
3033 		goto fail;
3034 	}
3035 
3036 	mdb_cursors_close(txn, 0);
3037 
	/* Nothing dirty and nothing spilled: skip straight to cleanup */
3038 	if (!txn->mt_u.dirty_list[0].mid &&
3039 		!(txn->mt_flags & (MDB_TXN_DIRTY|MDB_TXN_SPILLS)))
3040 		goto done;
3041 
3042 	DPRINTF(("committing txn %"Z"u %p on mdbenv %p, root page %"Z"u",
3043 	    txn->mt_txnid, (void*)txn, (void*)env, txn->mt_dbs[MAIN_DBI].md_root));
3044 
3045 	/* Update DB root pointers */
3046 	if (txn->mt_numdbs > 2) {
3047 		MDB_cursor mc;
		/* NOTE: this i shadows the function-level unsigned int i */
3048 		MDB_dbi i;
3049 		MDB_val data;
3050 		data.mv_size = sizeof(MDB_db);
3051 
3052 		mdb_cursor_init(&mc, txn, MAIN_DBI, NULL);
3053 		for (i = 2; i < txn->mt_numdbs; i++) {
3054 			if (txn->mt_dbflags[i] & DB_DIRTY) {
				/* store the MDB_db record under the DB's name */
3055 				data.mv_data = &txn->mt_dbs[i];
3056 				rc = mdb_cursor_put(&mc, &txn->mt_dbxs[i].md_name, &data, 0);
3057 				if (rc)
3058 					goto fail;
3059 			}
3060 		}
3061 	}
3062 
3063 	rc = mdb_freelist_save(txn);
3064 	if (rc)
3065 		goto fail;
3066 
3067 	mdb_midl_free(env->me_pghead);
3068 	env->me_pghead = NULL;
	/* NOTE(review): presumably mdb_midl_shrink() returns non-zero when it
	 * changed the list pointer, so the env's copy is refreshed -- confirm.
	 */
3069 	if (mdb_midl_shrink(&txn->mt_free_pgs))
3070 		env->me_free_pgs = txn->mt_free_pgs;
3071 
3072 #if (MDB_DEBUG) > 2
3073 	mdb_audit(txn);
3074 #endif
3075 
	/* Order matters: data pages, then sync, then the meta page */
3076 	if ((rc = mdb_page_flush(txn, 0)) ||
3077 		(rc = mdb_env_sync(env, 0)) ||
3078 		(rc = mdb_env_write_meta(txn)))
3079 		goto fail;
3080 
3081 done:
3082 	env->me_pglast = 0;
3083 	env->me_txn = NULL;
3084 	mdb_dbis_update(txn, 1);
3085 
3086 	if (env->me_txns)
3087 		UNLOCK_MUTEX_W(env);
3088 	free(txn);
3089 
3090 	return MDB_SUCCESS;
3091 
3092 fail:
	/* every failure path ends the transaction through mdb_txn_abort() */
3093 	mdb_txn_abort(txn);
3094 	return rc;
3095 }
3096 
3097 /** Read the environment parameters of a DB environment before
3098  * mapping it into memory.
3099  * @param[in] env the environment handle
3100  * @param[out] meta address of where to store the meta information
3101  * @return 0 on success, non-zero on failure.
3102  */
3103 static int
3104 mdb_env_read_header(MDB_env *env, MDB_meta *meta)
3105 {
3106 	MDB_metabuf	pbuf;
3107 	MDB_page	*p;
3108 	MDB_meta	*m;
3109 	int			i, rc, off;
3110 	enum { Size = sizeof(pbuf) };
3111 
3112 	/* We don't know the page size yet, so use a minimum value.
3113 	 * Read both meta pages so we can use the latest one.
3114 	 */
3115 
3116 	for (i=off=0; i<2; i++, off = meta->mm_psize) {
3117 #ifdef _WIN32
3118 		DWORD len;
3119 		OVERLAPPED ov;
3120 		memset(&ov, 0, sizeof(ov));
3121 		ov.Offset = off;
3122 		rc = ReadFile(env->me_fd, &pbuf, Size, &len, &ov) ? (int)len : -1;
3123 		if (rc == -1 && ErrCode() == ERROR_HANDLE_EOF)
3124 			rc = 0;
3125 #else
3126 		rc = pread(env->me_fd, &pbuf, Size, off);
3127 #endif
3128 		if (rc != Size) {
3129 			if (rc == 0 && off == 0)
3130 				return ENOENT;
3131 			rc = rc < 0 ? (int) ErrCode() : MDB_INVALID;
3132 			DPRINTF(("read: %s", mdb_strerror(rc)));
3133 			return rc;
3134 		}
3135 
3136 		p = (MDB_page *)&pbuf;
3137 
3138 		if (!F_ISSET(p->mp_flags, P_META)) {
3139 			DPRINTF(("page %"Z"u not a meta page", p->mp_pgno));
3140 			return MDB_INVALID;
3141 		}
3142 
3143 		m = METADATA(p);
3144 		if (m->mm_magic != MDB_MAGIC) {
3145 			DPUTS("meta has invalid magic");
3146 			return MDB_INVALID;
3147 		}
3148 
3149 		if (m->mm_version != MDB_DATA_VERSION) {
3150 			DPRINTF(("database is version %u, expected version %u",
3151 				m->mm_version, MDB_DATA_VERSION));
3152 			return MDB_VERSION_MISMATCH;
3153 		}
3154 
3155 		if (off == 0 || m->mm_txnid > meta->mm_txnid)
3156 			*meta = *m;
3157 	}
3158 	return 0;
3159 }
3160 
3161 /** Write the environment parameters of a freshly created DB environment.
3162  * @param[in] env the environment handle
3163  * @param[out] meta address of where to store the meta information
3164  * @return 0 on success, non-zero on failure.
3165  */
3166 static int
3167 mdb_env_init_meta(MDB_env *env, MDB_meta *meta)
3168 {
3169 	MDB_page *p, *q;
3170 	int rc;
3171 	unsigned int	 psize;
3172 #ifdef _WIN32
3173 	DWORD len;
3174 	OVERLAPPED ov;
3175 	memset(&ov, 0, sizeof(ov));
3176 #define DO_PWRITE(rc, fd, ptr, size, len, pos)	do { \
3177 	ov.Offset = pos;	\
3178 	rc = WriteFile(fd, ptr, size, &len, &ov);	} while(0)
3179 #else
3180 	int len;
3181 #define DO_PWRITE(rc, fd, ptr, size, len, pos)	do { \
3182 	len = pwrite(fd, ptr, size, pos);	\
3183 	rc = (len >= 0); } while(0)
3184 #endif
3185 
3186 	DPUTS("writing new meta page");
3187 
3188 	psize = env->me_psize;
3189 
3190 	meta->mm_magic = MDB_MAGIC;
3191 	meta->mm_version = MDB_DATA_VERSION;
3192 	meta->mm_mapsize = env->me_mapsize;
3193 	meta->mm_psize = psize;
3194 	meta->mm_last_pg = 1;
3195 	meta->mm_flags = env->me_flags & 0xffff;
3196 	meta->mm_flags |= MDB_INTEGERKEY;
3197 	meta->mm_dbs[0].md_root = P_INVALID;
3198 	meta->mm_dbs[1].md_root = P_INVALID;
3199 
3200 	p = calloc(2, psize);
3201 	p->mp_pgno = 0;
3202 	p->mp_flags = P_META;
3203 	*(MDB_meta *)METADATA(p) = *meta;
3204 
3205 	q = (MDB_page *)((char *)p + psize);
3206 	q->mp_pgno = 1;
3207 	q->mp_flags = P_META;
3208 	*(MDB_meta *)METADATA(q) = *meta;
3209 
3210 	DO_PWRITE(rc, env->me_fd, p, psize * 2, len, 0);
3211 	if (!rc)
3212 		rc = ErrCode();
3213 	else if ((unsigned) len == psize * 2)
3214 		rc = MDB_SUCCESS;
3215 	else
3216 		rc = ENOSPC;
3217 	free(p);
3218 	return rc;
3219 }
3220 
3221 /** Update the environment info to commit a transaction.
3222  * @param[in] txn the transaction that's being committed
3223  * @return 0 on success, non-zero on failure.
3224  */
3225 static int
3226 mdb_env_write_meta(MDB_txn *txn)
3227 {
3228 	MDB_env *env;
3229 	MDB_meta	meta, metab, *mp;
3230 	off_t off;
3231 	int rc, len, toggle;
3232 	char *ptr;
3233 	HANDLE mfd;
3234 #ifdef _WIN32
3235 	OVERLAPPED ov;
3236 #else
3237 	int r2;
3238 #endif
3239 
3240 	toggle = txn->mt_txnid & 1;
3241 	DPRINTF(("writing meta page %d for root page %"Z"u",
3242 		toggle, txn->mt_dbs[MAIN_DBI].md_root));
3243 
3244 	env = txn->mt_env;
3245 	mp = env->me_metas[toggle];
3246 
3247 	if (env->me_flags & MDB_WRITEMAP) {
3248 		/* Persist any increases of mapsize config */
3249 		if (env->me_mapsize > mp->mm_mapsize)
3250 			mp->mm_mapsize = env->me_mapsize;
3251 		mp->mm_dbs[0] = txn->mt_dbs[0];
3252 		mp->mm_dbs[1] = txn->mt_dbs[1];
3253 		mp->mm_last_pg = txn->mt_next_pgno - 1;
3254 		mp->mm_txnid = txn->mt_txnid;
3255 		if (!(env->me_flags & (MDB_NOMETASYNC|MDB_NOSYNC))) {
3256 			unsigned meta_size = env->me_psize;
3257 			rc = (env->me_flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC;
3258 			ptr = env->me_map;
3259 			if (toggle) {
3260 #ifndef _WIN32	/* POSIX msync() requires ptr = start of OS page */
3261 				if (meta_size < env->me_os_psize)
3262 					meta_size += meta_size;
3263 				else
3264 #endif
3265 					ptr += meta_size;
3266 			}
3267 			if (MDB_MSYNC(ptr, meta_size, rc)) {
3268 				rc = ErrCode();
3269 				goto fail;
3270 			}
3271 		}
3272 		goto done;
3273 	}
3274 	metab.mm_txnid = env->me_metas[toggle]->mm_txnid;
3275 	metab.mm_last_pg = env->me_metas[toggle]->mm_last_pg;
3276 
3277 	ptr = (char *)&meta;
3278 	if (env->me_mapsize > mp->mm_mapsize) {
3279 		/* Persist any increases of mapsize config */
3280 		meta.mm_mapsize = env->me_mapsize;
3281 		off = offsetof(MDB_meta, mm_mapsize);
3282 	} else {
3283 		off = offsetof(MDB_meta, mm_dbs[0].md_depth);
3284 	}
3285 	len = sizeof(MDB_meta) - off;
3286 
3287 	ptr += off;
3288 	meta.mm_dbs[0] = txn->mt_dbs[0];
3289 	meta.mm_dbs[1] = txn->mt_dbs[1];
3290 	meta.mm_last_pg = txn->mt_next_pgno - 1;
3291 	meta.mm_txnid = txn->mt_txnid;
3292 
3293 	if (toggle)
3294 		off += env->me_psize;
3295 	off += PAGEHDRSZ;
3296 
3297 	/* Write to the SYNC fd */
3298 	mfd = env->me_flags & (MDB_NOSYNC|MDB_NOMETASYNC) ?
3299 		env->me_fd : env->me_mfd;
3300 #ifdef _WIN32
3301 	{
3302 		memset(&ov, 0, sizeof(ov));
3303 		ov.Offset = off;
3304 		if (!WriteFile(mfd, ptr, len, (DWORD *)&rc, &ov))
3305 			rc = -1;
3306 	}
3307 #else
3308 	rc = pwrite(mfd, ptr, len, off);
3309 #endif
3310 	if (rc != len) {
3311 		rc = rc < 0 ? ErrCode() : EIO;
3312 		DPUTS("write failed, disk error?");
3313 		/* On a failure, the pagecache still contains the new data.
3314 		 * Write some old data back, to prevent it from being used.
3315 		 * Use the non-SYNC fd; we know it will fail anyway.
3316 		 */
3317 		meta.mm_last_pg = metab.mm_last_pg;
3318 		meta.mm_txnid = metab.mm_txnid;
3319 #ifdef _WIN32
3320 		memset(&ov, 0, sizeof(ov));
3321 		ov.Offset = off;
3322 		WriteFile(env->me_fd, ptr, len, NULL, &ov);
3323 #else
3324 		r2 = pwrite(env->me_fd, ptr, len, off);
3325 		(void)r2;	/* Silence warnings. We don't care about pwrite's return value */
3326 #endif
3327 fail:
3328 		env->me_flags |= MDB_FATAL_ERROR;
3329 		return rc;
3330 	}
3331 done:
3332 	/* Memory ordering issues are irrelevant; since the entire writer
3333 	 * is wrapped by wmutex, all of these changes will become visible
3334 	 * after the wmutex is unlocked. Since the DB is multi-version,
3335 	 * readers will get consistent data regardless of how fresh or
3336 	 * how stale their view of these values is.
3337 	 */
3338 	if (env->me_txns)
3339 		env->me_txns->mti_txnid = txn->mt_txnid;
3340 
3341 	return MDB_SUCCESS;
3342 }
3343 
3344 /** Check both meta pages to see which one is newer.
3345  * @param[in] env the environment handle
3346  * @return meta toggle (0 or 1).
3347  */
3348 static int
3349 mdb_env_pick_meta(const MDB_env *env)
3350 {
3351 	return (env->me_metas[0]->mm_txnid < env->me_metas[1]->mm_txnid);
3352 }
3353 
3354 int
3355 mdb_env_create(MDB_env **env)
3356 {
3357 	MDB_env *e;
3358 
3359 	e = calloc(1, sizeof(MDB_env));
3360 	if (!e)
3361 		return ENOMEM;
3362 
3363 	e->me_maxreaders = DEFAULT_READERS;
3364 	e->me_maxdbs = e->me_numdbs = 2;
3365 	e->me_fd = INVALID_HANDLE_VALUE;
3366 	e->me_lfd = INVALID_HANDLE_VALUE;
3367 	e->me_mfd = INVALID_HANDLE_VALUE;
3368 #ifdef MDB_USE_POSIX_SEM
3369 	e->me_rmutex = SEM_FAILED;
3370 	e->me_wmutex = SEM_FAILED;
3371 #endif
3372 	e->me_pid = getpid();
3373 	GET_PAGESIZE(e->me_os_psize);
3374 	VGMEMP_CREATE(e,0,0);
3375 	*env = e;
3376 	return MDB_SUCCESS;
3377 }
3378 
3379 static int
3380 mdb_env_map(MDB_env *env, void *addr, int newsize)
3381 {
3382 	MDB_page *p;
3383 	unsigned int flags = env->me_flags;
3384 #ifdef _WIN32
3385 	int rc;
3386 	HANDLE mh;
3387 	LONG sizelo, sizehi;
3388 	sizelo = env->me_mapsize & 0xffffffff;
3389 	sizehi = env->me_mapsize >> 16 >> 16; /* only needed on Win64 */
3390 
3391 	/* Windows won't create mappings for zero length files.
3392 	 * Just allocate the maxsize right now.
3393 	 */
3394 	if (newsize) {
3395 		if (SetFilePointer(env->me_fd, sizelo, &sizehi, 0) != (DWORD)sizelo
3396 			|| !SetEndOfFile(env->me_fd)
3397 			|| SetFilePointer(env->me_fd, 0, NULL, 0) != 0)
3398 			return ErrCode();
3399 	}
3400 	mh = CreateFileMapping(env->me_fd, NULL, flags & MDB_WRITEMAP ?
3401 		PAGE_READWRITE : PAGE_READONLY,
3402 		sizehi, sizelo, NULL);
3403 	if (!mh)
3404 		return ErrCode();
3405 	env->me_map = MapViewOfFileEx(mh, flags & MDB_WRITEMAP ?
3406 		FILE_MAP_WRITE : FILE_MAP_READ,
3407 		0, 0, env->me_mapsize, addr);
3408 	rc = env->me_map ? 0 : ErrCode();
3409 	CloseHandle(mh);
3410 	if (rc)
3411 		return rc;
3412 #else
3413 	int prot = PROT_READ;
3414 	if (flags & MDB_WRITEMAP) {
3415 		prot |= PROT_WRITE;
3416 		if (ftruncate(env->me_fd, env->me_mapsize) < 0)
3417 			return ErrCode();
3418 	}
3419 	env->me_map = mmap(addr, env->me_mapsize, prot, MAP_SHARED,
3420 		env->me_fd, 0);
3421 	if (env->me_map == MAP_FAILED) {
3422 		env->me_map = NULL;
3423 		return ErrCode();
3424 	}
3425 
3426 	if (flags & MDB_NORDAHEAD) {
3427 		/* Turn off readahead. It's harmful when the DB is larger than RAM. */
3428 #ifdef MADV_RANDOM
3429 		madvise(env->me_map, env->me_mapsize, MADV_RANDOM);
3430 #else
3431 #ifdef POSIX_MADV_RANDOM
3432 		posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_RANDOM);
3433 #endif /* POSIX_MADV_RANDOM */
3434 #endif /* MADV_RANDOM */
3435 	}
3436 #endif /* _WIN32 */
3437 
3438 	/* Can happen because the address argument to mmap() is just a
3439 	 * hint.  mmap() can pick another, e.g. if the range is in use.
3440 	 * The MAP_FIXED flag would prevent that, but then mmap could
3441 	 * instead unmap existing pages to make room for the new map.
3442 	 */
3443 	if (addr && env->me_map != addr)
3444 		return EBUSY;	/* TODO: Make a new MDB_* error code? */
3445 
3446 	p = (MDB_page *)env->me_map;
3447 	env->me_metas[0] = METADATA(p);
3448 	env->me_metas[1] = (MDB_meta *)((char *)env->me_metas[0] + env->me_psize);
3449 
3450 	return MDB_SUCCESS;
3451 }
3452 
3453 int
3454 mdb_env_set_mapsize(MDB_env *env, size_t size)
3455 {
3456 	/* If env is already open, caller is responsible for making
3457 	 * sure there are no active txns.
3458 	 */
3459 	if (env->me_map) {
3460 		int rc;
3461 		void *old;
3462 		if (env->me_txn)
3463 			return EINVAL;
3464 		if (!size)
3465 			size = env->me_metas[mdb_env_pick_meta(env)]->mm_mapsize;
3466 		else if (size < env->me_mapsize) {
3467 			/* If the configured size is smaller, make sure it's
3468 			 * still big enough. Silently round up to minimum if not.
3469 			 */
3470 			size_t minsize = (env->me_metas[mdb_env_pick_meta(env)]->mm_last_pg + 1) * env->me_psize;
3471 			if (size < minsize)
3472 				size = minsize;
3473 		}
3474 		munmap(env->me_map, env->me_mapsize);
3475 		env->me_mapsize = size;
3476 		old = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : NULL;
3477 		rc = mdb_env_map(env, old, 1);
3478 		if (rc)
3479 			return rc;
3480 	}
3481 	env->me_mapsize = size;
3482 	if (env->me_psize)
3483 		env->me_maxpg = env->me_mapsize / env->me_psize;
3484 	return MDB_SUCCESS;
3485 }
3486 
3487 int
3488 mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs)
3489 {
3490 	if (env->me_map)
3491 		return EINVAL;
3492 	env->me_maxdbs = dbs + 2; /* Named databases + main and free DB */
3493 	return MDB_SUCCESS;
3494 }
3495 
3496 int
3497 mdb_env_set_maxreaders(MDB_env *env, unsigned int readers)
3498 {
3499 	if (env->me_map || readers < 1)
3500 		return EINVAL;
3501 	env->me_maxreaders = readers;
3502 	return MDB_SUCCESS;
3503 }
3504 
3505 int
3506 mdb_env_get_maxreaders(MDB_env *env, unsigned int *readers)
3507 {
3508 	if (!env || !readers)
3509 		return EINVAL;
3510 	*readers = env->me_maxreaders;
3511 	return MDB_SUCCESS;
3512 }
3513 
3514 /** Further setup required for opening an MDB environment
3515  */
3516 static int
3517 mdb_env_open2(MDB_env *env)
3518 {
3519 	unsigned int flags = env->me_flags;
3520 	int i, newenv = 0, rc;
3521 	MDB_meta meta;
3522 
3523 #ifdef _WIN32
3524 	/* See if we should use QueryLimited */
3525 	rc = GetVersion();
3526 	if ((rc & 0xff) > 5)
3527 		env->me_pidquery = MDB_PROCESS_QUERY_LIMITED_INFORMATION;
3528 	else
3529 		env->me_pidquery = PROCESS_QUERY_INFORMATION;
3530 #endif /* _WIN32 */
3531 
3532 	memset(&meta, 0, sizeof(meta));
3533 
3534 	if ((i = mdb_env_read_header(env, &meta)) != 0) {
3535 		if (i != ENOENT)
3536 			return i;
3537 		DPUTS("new mdbenv");
3538 		newenv = 1;
3539 		env->me_psize = env->me_os_psize;
3540 		if (env->me_psize > MAX_PAGESIZE)
3541 			env->me_psize = MAX_PAGESIZE;
3542 	} else {
3543 		env->me_psize = meta.mm_psize;
3544 	}
3545 
3546 	/* Was a mapsize configured? */
3547 	if (!env->me_mapsize) {
3548 		/* If this is a new environment, take the default,
3549 		 * else use the size recorded in the existing env.
3550 		 */
3551 		env->me_mapsize = newenv ? DEFAULT_MAPSIZE : meta.mm_mapsize;
3552 	} else if (env->me_mapsize < meta.mm_mapsize) {
3553 		/* If the configured size is smaller, make sure it's
3554 		 * still big enough. Silently round up to minimum if not.
3555 		 */
3556 		size_t minsize = (meta.mm_last_pg + 1) * meta.mm_psize;
3557 		if (env->me_mapsize < minsize)
3558 			env->me_mapsize = minsize;
3559 	}
3560 
3561 	rc = mdb_env_map(env, meta.mm_address, newenv || env->me_mapsize != meta.mm_mapsize);
3562 	if (rc)
3563 		return rc;
3564 
3565 	if (newenv) {
3566 		if (flags & MDB_FIXEDMAP)
3567 			meta.mm_address = env->me_map;
3568 		i = mdb_env_init_meta(env, &meta);
3569 		if (i != MDB_SUCCESS) {
3570 			return i;
3571 		}
3572 	}
3573 
3574 	env->me_maxfree_1pg = (env->me_psize - PAGEHDRSZ) / sizeof(pgno_t) - 1;
3575 	env->me_nodemax = (((env->me_psize - PAGEHDRSZ) / MDB_MINKEYS) & -2)
3576 		- sizeof(indx_t);
3577 #if !(MDB_MAXKEYSIZE)
3578 	env->me_maxkey = env->me_nodemax - (NODESIZE + sizeof(MDB_db));
3579 #endif
3580 	env->me_maxpg = env->me_mapsize / env->me_psize;
3581 
3582 #if MDB_DEBUG
3583 	{
3584 		int toggle = mdb_env_pick_meta(env);
3585 		MDB_db *db = &env->me_metas[toggle]->mm_dbs[MAIN_DBI];
3586 
3587 		DPRINTF(("opened database version %u, pagesize %u",
3588 			env->me_metas[0]->mm_version, env->me_psize));
3589 		DPRINTF(("using meta page %d",    toggle));
3590 		DPRINTF(("depth: %u",             db->md_depth));
3591 		DPRINTF(("entries: %"Z"u",        db->md_entries));
3592 		DPRINTF(("branch pages: %"Z"u",   db->md_branch_pages));
3593 		DPRINTF(("leaf pages: %"Z"u",     db->md_leaf_pages));
3594 		DPRINTF(("overflow pages: %"Z"u", db->md_overflow_pages));
3595 		DPRINTF(("root: %"Z"u",           db->md_root));
3596 	}
3597 #endif
3598 
3599 	return MDB_SUCCESS;
3600 }
3601 
3602 
3603 /** Release a reader thread's slot in the reader lock table.
3604  *	This function is called automatically when a thread exits.
3605  * @param[in] ptr This points to the slot in the reader lock table.
3606  */
3607 static void
3608 mdb_env_reader_dest(void *ptr)
3609 {
3610 	MDB_reader *reader = ptr;
3611 
3612 	reader->mr_pid = 0;
3613 }
3614 
#ifdef _WIN32
/** Junk for arranging thread-specific callbacks on Windows. This is
 *	necessarily platform and compiler-specific. Windows supports up
 *	to 1088 keys. Let's assume nobody opens more than 64 environments
 *	in a single process, for now. They can override this if needed.
 */
#ifndef MAX_TLS_KEYS
#define MAX_TLS_KEYS	64
#endif
/** TLS keys whose per-thread reader slots are released on thread exit */
static pthread_key_t mdb_tls_keys[MAX_TLS_KEYS];
/** Number of entries of #mdb_tls_keys in use */
static int mdb_tls_nkeys;

/** Thread/process notification hook placed in the TLS callback table.
 *	On thread detach, releases the reader slot stored under each
 *	registered key. A thread that never stored a slot under a key is
 *	skipped, since mdb_env_reader_dest() dereferences its argument.
 * @param[in] module unused
 * @param[in] reason one of the DLL_* notification codes
 * @param[in] ptr unused
 */
static void NTAPI mdb_tls_callback(PVOID module, DWORD reason, PVOID ptr)
{
	int i;
	switch(reason) {
	case DLL_PROCESS_ATTACH: break;
	case DLL_THREAD_ATTACH: break;
	case DLL_THREAD_DETACH:
		for (i=0; i<mdb_tls_nkeys; i++) {
			MDB_reader *r = pthread_getspecific(mdb_tls_keys[i]);
			/* No slot was stored for this key in this thread */
			if (r) {
				mdb_env_reader_dest(r);
			}
		}
		break;
	case DLL_PROCESS_DETACH: break;
	}
}
#ifdef __GNUC__
#ifdef _WIN64
const PIMAGE_TLS_CALLBACK mdb_tls_cbp __attribute__((section (".CRT$XLB"))) = mdb_tls_callback;
#else
PIMAGE_TLS_CALLBACK mdb_tls_cbp __attribute__((section (".CRT$XLB"))) = mdb_tls_callback;
#endif
#else
#ifdef _WIN64
/* Force some symbol references.
 *	_tls_used forces the linker to create the TLS directory if not already done
 *	mdb_tls_cbp prevents whole-program-optimizer from dropping the symbol.
 */
#pragma comment(linker, "/INCLUDE:_tls_used")
#pragma comment(linker, "/INCLUDE:mdb_tls_cbp")
#pragma const_seg(".CRT$XLB")
extern const PIMAGE_TLS_CALLBACK mdb_tls_cbp;
const PIMAGE_TLS_CALLBACK mdb_tls_cbp = mdb_tls_callback;
#pragma const_seg()
#else	/* WIN32 */
#pragma comment(linker, "/INCLUDE:__tls_used")
#pragma comment(linker, "/INCLUDE:_mdb_tls_cbp")
#pragma data_seg(".CRT$XLB")
PIMAGE_TLS_CALLBACK mdb_tls_cbp = mdb_tls_callback;
#pragma data_seg()
#endif	/* WIN 32/64 */
#endif	/* !__GNUC__ */
#endif
3669 
3670 /** Downgrade the exclusive lock on the region back to shared */
3671 static int
3672 mdb_env_share_locks(MDB_env *env, int *excl)
3673 {
3674 	int rc = 0, toggle = mdb_env_pick_meta(env);
3675 
3676 	env->me_txns->mti_txnid = env->me_metas[toggle]->mm_txnid;
3677 
3678 #ifdef _WIN32
3679 	{
3680 		OVERLAPPED ov;
3681 		/* First acquire a shared lock. The Unlock will
3682 		 * then release the existing exclusive lock.
3683 		 */
3684 		memset(&ov, 0, sizeof(ov));
3685 		if (!LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) {
3686 			rc = ErrCode();
3687 		} else {
3688 			UnlockFile(env->me_lfd, 0, 0, 1, 0);
3689 			*excl = 0;
3690 		}
3691 	}
3692 #else
3693 	{
3694 		struct flock lock_info;
3695 		/* The shared lock replaces the existing lock */
3696 		memset((void *)&lock_info, 0, sizeof(lock_info));
3697 		lock_info.l_type = F_RDLCK;
3698 		lock_info.l_whence = SEEK_SET;
3699 		lock_info.l_start = 0;
3700 		lock_info.l_len = 1;
3701 		while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) &&
3702 				(rc = ErrCode()) == EINTR) ;
3703 		*excl = rc ? -1 : 0;	/* error may mean we lost the lock */
3704 	}
3705 #endif
3706 
3707 	return rc;
3708 }
3709 
3710 /** Try to get exlusive lock, otherwise shared.
3711  *	Maintain *excl = -1: no/unknown lock, 0: shared, 1: exclusive.
3712  */
3713 static int
3714 mdb_env_excl_lock(MDB_env *env, int *excl)
3715 {
3716 	int rc = 0;
3717 #ifdef _WIN32
3718 	if (LockFile(env->me_lfd, 0, 0, 1, 0)) {
3719 		*excl = 1;
3720 	} else {
3721 		OVERLAPPED ov;
3722 		memset(&ov, 0, sizeof(ov));
3723 		if (LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) {
3724 			*excl = 0;
3725 		} else {
3726 			rc = ErrCode();
3727 		}
3728 	}
3729 #else
3730 	struct flock lock_info;
3731 	memset((void *)&lock_info, 0, sizeof(lock_info));
3732 	lock_info.l_type = F_WRLCK;
3733 	lock_info.l_whence = SEEK_SET;
3734 	lock_info.l_start = 0;
3735 	lock_info.l_len = 1;
3736 	while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) &&
3737 			(rc = ErrCode()) == EINTR) ;
3738 	if (!rc) {
3739 		*excl = 1;
3740 	} else
3741 # ifdef MDB_USE_POSIX_SEM
3742 	if (*excl < 0) /* always true when !MDB_USE_POSIX_SEM */
3743 # endif
3744 	{
3745 		lock_info.l_type = F_RDLCK;
3746 		while ((rc = fcntl(env->me_lfd, F_SETLKW, &lock_info)) &&
3747 				(rc = ErrCode()) == EINTR) ;
3748 		if (rc == 0)
3749 			*excl = 0;
3750 	}
3751 #endif
3752 	return rc;
3753 }
3754 
3755 #ifdef MDB_USE_HASH
3756 /*
3757  * hash_64 - 64 bit Fowler/Noll/Vo-0 FNV-1a hash code
3758  *
3759  * @(#) Revision: 5.1
3760  * @(#) Id: hash_64a.c,v 5.1 2009/06/30 09:01:38 chongo Exp
3761  * @(#) Source: /usr/local/src/cmd/fnv/RCS/hash_64a.c,v
3762  *
3763  *	  http://www.isthe.com/chongo/tech/comp/fnv/index.html
3764  *
3765  ***
3766  *
3767  * Please do not copyright this code.  This code is in the public domain.
3768  *
3769  * LANDON CURT NOLL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
3770  * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO
3771  * EVENT SHALL LANDON CURT NOLL BE LIABLE FOR ANY SPECIAL, INDIRECT OR
3772  * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
3773  * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
3774  * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
3775  * PERFORMANCE OF THIS SOFTWARE.
3776  *
3777  * By:
3778  *	chongo <Landon Curt Noll> /\oo/\
3779  *	  http://www.isthe.com/chongo/
3780  *
3781  * Share and Enjoy!	:-)
3782  */
3783 
3784 typedef unsigned long long	mdb_hash_t;
3785 #define MDB_HASH_INIT ((mdb_hash_t)0xcbf29ce484222325ULL)
3786 
3787 /** perform a 64 bit Fowler/Noll/Vo FNV-1a hash on a buffer
3788  * @param[in] val	value to hash
3789  * @param[in] hval	initial value for hash
3790  * @return 64 bit hash
3791  *
3792  * NOTE: To use the recommended 64 bit FNV-1a hash, use MDB_HASH_INIT as the
3793  * 	 hval arg on the first call.
3794  */
3795 static mdb_hash_t
3796 mdb_hash_val(MDB_val *val, mdb_hash_t hval)
3797 {
3798 	unsigned char *s = (unsigned char *)val->mv_data;	/* unsigned string */
3799 	unsigned char *end = s + val->mv_size;
3800 	/*
3801 	 * FNV-1a hash each octet of the string
3802 	 */
3803 	while (s < end) {
3804 		/* xor the bottom with the current octet */
3805 		hval ^= (mdb_hash_t)*s++;
3806 
3807 		/* multiply by the 64 bit FNV magic prime mod 2^64 */
3808 		hval += (hval << 1) + (hval << 4) + (hval << 5) +
3809 			(hval << 7) + (hval << 8) + (hval << 40);
3810 	}
3811 	/* return our new hash value */
3812 	return hval;
3813 }
3814 
/** Alphabet used to print hashes.
 * Hashes are written in a modified RFC1924 Ascii85 encoding to accommodate
 * systems with very short name limits. The encoding does not have to be
 * reversible; it only has to preserve as many bits of the input as
 * possible in a small printable string.
 */
static const char mdb_a85[]= "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~";

/** Write 5 base-85 digits of a value, least significant digit first.
 * @param[in] l the value to encode
 * @param[out] out receives exactly 5 chars; no terminating NUL is added
 */
static void
mdb_pack85(unsigned long l, char *out)
{
	char *const stop = out + 5;

	while (out != stop) {
		*out++ = mdb_a85[l % 85];
		l /= 85;
	}
}
3835 
3836 static void
3837 mdb_hash_enc(MDB_val *val, char *encbuf)
3838 {
3839 	mdb_hash_t h = mdb_hash_val(val, MDB_HASH_INIT);
3840 
3841 	mdb_pack85(h, encbuf);
3842 	mdb_pack85(h>>32, encbuf+5);
3843 	encbuf[10] = '\0';
3844 }
3845 #endif
3846 
3847 /** Open and/or initialize the lock region for the environment.
3848  * @param[in] env The MDB environment.
3849  * @param[in] lpath The pathname of the file used for the lock region.
3850  * @param[in] mode The Unix permissions for the file, if we create it.
3851  * @param[out] excl Resulting file lock type: -1 none, 0 shared, 1 exclusive
3852  * @param[in,out] excl In -1, out lock type: -1 none, 0 shared, 1 exclusive
3853  * @return 0 on success, non-zero on failure.
3854  */
3855 static int
3856 mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl)
3857 {
3858 #ifdef _WIN32
3859 #	define MDB_ERRCODE_ROFS	ERROR_WRITE_PROTECT
3860 #else
3861 #	define MDB_ERRCODE_ROFS	EROFS
3862 #ifdef O_CLOEXEC	/* Linux: Open file and set FD_CLOEXEC atomically */
3863 #	define MDB_CLOEXEC		O_CLOEXEC
3864 #else
3865 	int fdflags;
3866 #	define MDB_CLOEXEC		0
3867 #endif
3868 #endif
3869 	int rc;
3870 	off_t size, rsize;
3871 
3872 #ifdef _WIN32
3873 	env->me_lfd = CreateFile(lpath, GENERIC_READ|GENERIC_WRITE,
3874 		FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_ALWAYS,
3875 		FILE_ATTRIBUTE_NORMAL, NULL);
3876 #else
3877 	env->me_lfd = open(lpath, O_RDWR|O_CREAT|MDB_CLOEXEC, mode);
3878 #endif
3879 	if (env->me_lfd == INVALID_HANDLE_VALUE) {
3880 		rc = ErrCode();
3881 		if (rc == MDB_ERRCODE_ROFS && (env->me_flags & MDB_RDONLY)) {
3882 			return MDB_SUCCESS;
3883 		}
3884 		goto fail_errno;
3885 	}
3886 #if ! ((MDB_CLOEXEC) || defined(_WIN32))
3887 	/* Lose record locks when exec*() */
3888 	if ((fdflags = fcntl(env->me_lfd, F_GETFD) | FD_CLOEXEC) >= 0)
3889 			fcntl(env->me_lfd, F_SETFD, fdflags);
3890 #endif
3891 
3892 	if (!(env->me_flags & MDB_NOTLS)) {
3893 		rc = pthread_key_create(&env->me_txkey, mdb_env_reader_dest);
3894 		if (rc)
3895 			goto fail;
3896 		env->me_flags |= MDB_ENV_TXKEY;
3897 #ifdef _WIN32
3898 		/* Windows TLS callbacks need help finding their TLS info. */
3899 		if (mdb_tls_nkeys >= MAX_TLS_KEYS) {
3900 			rc = MDB_TLS_FULL;
3901 			goto fail;
3902 		}
3903 		mdb_tls_keys[mdb_tls_nkeys++] = env->me_txkey;
3904 #endif
3905 	}
3906 
3907 	/* Try to get exclusive lock. If we succeed, then
3908 	 * nobody is using the lock region and we should initialize it.
3909 	 */
3910 	if ((rc = mdb_env_excl_lock(env, excl))) goto fail;
3911 
3912 #ifdef _WIN32
3913 	size = GetFileSize(env->me_lfd, NULL);
3914 #else
3915 	size = lseek(env->me_lfd, 0, SEEK_END);
3916 	if (size == -1) goto fail_errno;
3917 #endif
3918 	rsize = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo);
3919 	if (size < rsize && *excl > 0) {
3920 #ifdef _WIN32
3921 		if (SetFilePointer(env->me_lfd, rsize, NULL, FILE_BEGIN) != (DWORD)rsize
3922 			|| !SetEndOfFile(env->me_lfd))
3923 			goto fail_errno;
3924 #else
3925 		if (ftruncate(env->me_lfd, rsize) != 0) goto fail_errno;
3926 #endif
3927 	} else {
3928 		rsize = size;
3929 		size = rsize - sizeof(MDB_txninfo);
3930 		env->me_maxreaders = size/sizeof(MDB_reader) + 1;
3931 	}
3932 	{
3933 #ifdef _WIN32
3934 		HANDLE mh;
3935 		mh = CreateFileMapping(env->me_lfd, NULL, PAGE_READWRITE,
3936 			0, 0, NULL);
3937 		if (!mh) goto fail_errno;
3938 		env->me_txns = MapViewOfFileEx(mh, FILE_MAP_WRITE, 0, 0, rsize, NULL);
3939 		CloseHandle(mh);
3940 		if (!env->me_txns) goto fail_errno;
3941 #else
3942 		void *m = mmap(NULL, rsize, PROT_READ|PROT_WRITE, MAP_SHARED,
3943 			env->me_lfd, 0);
3944 		if (m == MAP_FAILED) goto fail_errno;
3945 		env->me_txns = m;
3946 #endif
3947 	}
3948 	if (*excl > 0) {
3949 #ifdef _WIN32
3950 		BY_HANDLE_FILE_INFORMATION stbuf;
3951 		struct {
3952 			DWORD volume;
3953 			DWORD nhigh;
3954 			DWORD nlow;
3955 		} idbuf;
3956 		MDB_val val;
3957 		char encbuf[11];
3958 
3959 		if (!mdb_sec_inited) {
3960 			InitializeSecurityDescriptor(&mdb_null_sd,
3961 				SECURITY_DESCRIPTOR_REVISION);
3962 			SetSecurityDescriptorDacl(&mdb_null_sd, TRUE, 0, FALSE);
3963 			mdb_all_sa.nLength = sizeof(SECURITY_ATTRIBUTES);
3964 			mdb_all_sa.bInheritHandle = FALSE;
3965 			mdb_all_sa.lpSecurityDescriptor = &mdb_null_sd;
3966 			mdb_sec_inited = 1;
3967 		}
3968 		if (!GetFileInformationByHandle(env->me_lfd, &stbuf)) goto fail_errno;
3969 		idbuf.volume = stbuf.dwVolumeSerialNumber;
3970 		idbuf.nhigh  = stbuf.nFileIndexHigh;
3971 		idbuf.nlow   = stbuf.nFileIndexLow;
3972 		val.mv_data = &idbuf;
3973 		val.mv_size = sizeof(idbuf);
3974 		mdb_hash_enc(&val, encbuf);
3975 		sprintf(env->me_txns->mti_rmname, "Global\\MDBr%s", encbuf);
3976 		sprintf(env->me_txns->mti_wmname, "Global\\MDBw%s", encbuf);
3977 		env->me_rmutex = CreateMutex(&mdb_all_sa, FALSE, env->me_txns->mti_rmname);
3978 		if (!env->me_rmutex) goto fail_errno;
3979 		env->me_wmutex = CreateMutex(&mdb_all_sa, FALSE, env->me_txns->mti_wmname);
3980 		if (!env->me_wmutex) goto fail_errno;
3981 #elif defined(MDB_USE_POSIX_SEM)
3982 		struct stat stbuf;
3983 		struct {
3984 			dev_t dev;
3985 			ino_t ino;
3986 		} idbuf;
3987 		MDB_val val;
3988 		char encbuf[11];
3989 
3990 #if defined(__NetBSD__)
3991 #define	MDB_SHORT_SEMNAMES	1	/* limited to 14 chars */
3992 #endif
3993 		if (fstat(env->me_lfd, &stbuf)) goto fail_errno;
3994 		idbuf.dev = stbuf.st_dev;
3995 		idbuf.ino = stbuf.st_ino;
3996 		val.mv_data = &idbuf;
3997 		val.mv_size = sizeof(idbuf);
3998 		mdb_hash_enc(&val, encbuf);
3999 #ifdef MDB_SHORT_SEMNAMES
4000 		encbuf[9] = '\0';	/* drop name from 15 chars to 14 chars */
4001 #endif
4002 		sprintf(env->me_txns->mti_rmname, "/MDBr%s", encbuf);
4003 		sprintf(env->me_txns->mti_wmname, "/MDBw%s", encbuf);
4004 		/* Clean up after a previous run, if needed:  Try to
4005 		 * remove both semaphores before doing anything else.
4006 		 */
4007 		sem_unlink(env->me_txns->mti_rmname);
4008 		sem_unlink(env->me_txns->mti_wmname);
4009 		env->me_rmutex = sem_open(env->me_txns->mti_rmname,
4010 			O_CREAT|O_EXCL, mode, 1);
4011 		if (env->me_rmutex == SEM_FAILED) goto fail_errno;
4012 		env->me_wmutex = sem_open(env->me_txns->mti_wmname,
4013 			O_CREAT|O_EXCL, mode, 1);
4014 		if (env->me_wmutex == SEM_FAILED) goto fail_errno;
4015 #else	/* MDB_USE_POSIX_SEM */
4016 		pthread_mutexattr_t mattr;
4017 
4018 		if ((rc = pthread_mutexattr_init(&mattr))
4019 			|| (rc = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED))
4020 			|| (rc = pthread_mutex_init(&env->me_txns->mti_mutex, &mattr))
4021 			|| (rc = pthread_mutex_init(&env->me_txns->mti_wmutex, &mattr)))
4022 			goto fail;
4023 		pthread_mutexattr_destroy(&mattr);
4024 #endif	/* _WIN32 || MDB_USE_POSIX_SEM */
4025 
4026 		env->me_txns->mti_magic = MDB_MAGIC;
4027 		env->me_txns->mti_format = MDB_LOCK_FORMAT;
4028 		env->me_txns->mti_txnid = 0;
4029 		env->me_txns->mti_numreaders = 0;
4030 
4031 	} else {
4032 		if (env->me_txns->mti_magic != MDB_MAGIC) {
4033 			DPUTS("lock region has invalid magic");
4034 			rc = MDB_INVALID;
4035 			goto fail;
4036 		}
4037 		if (env->me_txns->mti_format != MDB_LOCK_FORMAT) {
4038 			DPRINTF(("lock region has format+version 0x%x, expected 0x%x",
4039 				env->me_txns->mti_format, MDB_LOCK_FORMAT));
4040 			rc = MDB_VERSION_MISMATCH;
4041 			goto fail;
4042 		}
4043 		rc = ErrCode();
4044 		if (rc && rc != EACCES && rc != EAGAIN) {
4045 			goto fail;
4046 		}
4047 #ifdef _WIN32
4048 		env->me_rmutex = OpenMutex(SYNCHRONIZE, FALSE, env->me_txns->mti_rmname);
4049 		if (!env->me_rmutex) goto fail_errno;
4050 		env->me_wmutex = OpenMutex(SYNCHRONIZE, FALSE, env->me_txns->mti_wmname);
4051 		if (!env->me_wmutex) goto fail_errno;
4052 #elif defined(MDB_USE_POSIX_SEM)
4053 		env->me_rmutex = sem_open(env->me_txns->mti_rmname, 0);
4054 		if (env->me_rmutex == SEM_FAILED) goto fail_errno;
4055 		env->me_wmutex = sem_open(env->me_txns->mti_wmname, 0);
4056 		if (env->me_wmutex == SEM_FAILED) goto fail_errno;
4057 #endif
4058 	}
4059 	return MDB_SUCCESS;
4060 
4061 fail_errno:
4062 	rc = ErrCode();
4063 fail:
4064 	return rc;
4065 }
4066 
	/** The name of the lock file in the DB environment */
#define LOCKNAME	"/lock.mdb"
	/** The name of the data file in the DB environment */
#define DATANAME	"/data.mdb"
	/** The suffix of the lock file when no subdir is used */
#define LOCKSUFF	"-lock"
	/** Only a subset of the @ref mdb_env flags can be changed
	 *	at runtime. Changing other flags requires closing the
	 *	environment and re-opening it with the new flags.
	 */
#define	CHANGEABLE	(MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC|MDB_NOMEMINIT)
	/** The remaining environment flags accepted by mdb_env_open().
	 *	Any flag outside CHANGEABLE|CHANGELESS makes it return EINVAL.
	 */
#define	CHANGELESS	(MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY|MDB_WRITEMAP| \
	MDB_NOTLS|MDB_NOLOCK|MDB_NORDAHEAD)

	/* Compile-time check: persistent DB flags and environment flags
	 * are both stored in mm_flags, so their bits must not collide.
	 */
#if VALID_FLAGS & PERSISTENT_FLAGS & (CHANGEABLE|CHANGELESS)
# error "Persistent DB flags & env flags overlap, but both go in mm_flags"
#endif
4084 
/** Open the environment located at @p path.
 * Builds the lock-file and data-file names from @p path, allocates the
 * per-environment tables, opens the data file and (unless #MDB_NOLOCK)
 * sets up the lock region, then finishes the job via mdb_env_open2().
 * @param[in] env The environment handle; its data file must not be open.
 * @param[in] path Directory holding the files, or with #MDB_NOSUBDIR the
 *	data file itself (the lock file is then @p path plus LOCKSUFF).
 * @param[in] flags Environment flags; only bits in CHANGEABLE|CHANGELESS
 *	are accepted. They are OR'ed with the flags already in env->me_flags.
 * @param[in] mode Permissions passed on when files are created.
 * @return 0 on success. EINVAL if the data file is already open or
 *	@p flags has unknown bits, ENOMEM on allocation failure, otherwise
 *	the error of the failing step. Failures that reach the leave label
 *	release everything acquired so far through mdb_env_close0().
 */
int
mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode)
{
	int		oflags, rc, len, excl = -1;
	char *lpath, *dpath;

	if (env->me_fd!=INVALID_HANDLE_VALUE || (flags & ~(CHANGEABLE|CHANGELESS)))
		return EINVAL;

	/* One allocation holds both names: lock path first, data path after
	 * it. Each sizeof() already counts that string's terminating NUL.
	 */
	len = strlen(path);
	if (flags & MDB_NOSUBDIR) {
		rc = len + sizeof(LOCKSUFF) + len + 1;
	} else {
		rc = len + sizeof(LOCKNAME) + len + sizeof(DATANAME);
	}
	lpath = malloc(rc);
	if (!lpath)
		return ENOMEM;
	if (flags & MDB_NOSUBDIR) {
		dpath = lpath + len + sizeof(LOCKSUFF);
		sprintf(lpath, "%s" LOCKSUFF, path);
		strcpy(dpath, path);
	} else {
		dpath = lpath + len + sizeof(LOCKNAME);
		sprintf(lpath, "%s" LOCKNAME, path);
		sprintf(dpath, "%s" DATANAME, path);
	}

	rc = MDB_SUCCESS;
	flags |= env->me_flags;
	if (flags & MDB_RDONLY) {
		/* silently ignore WRITEMAP when we're only getting read access */
		flags &= ~MDB_WRITEMAP;
	} else {
		/* Writers need the free-page list and the dirty-page list */
		if (!((env->me_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX)) &&
			  (env->me_dirty_list = calloc(MDB_IDL_UM_SIZE, sizeof(MDB_ID2)))))
			rc = ENOMEM;
	}
	/* Mark the environment active before bailing out, so that
	 * mdb_env_close0() does not return early and frees what was allocated.
	 */
	env->me_flags = flags |= MDB_ENV_ACTIVE;
	if (rc)
		goto leave;

	env->me_path = strdup(path);
	env->me_dbxs = calloc(env->me_maxdbs, sizeof(MDB_dbx));
	env->me_dbflags = calloc(env->me_maxdbs, sizeof(uint16_t));
	if (!(env->me_dbxs && env->me_path && env->me_dbflags)) {
		rc = ENOMEM;
		goto leave;
	}

	/* For RDONLY, get lockfile after we know datafile exists */
	if (!(flags & (MDB_RDONLY|MDB_NOLOCK))) {
		rc = mdb_env_setup_locks(env, lpath, mode, &excl);
		if (rc)
			goto leave;
	}

#ifdef _WIN32
	/* len is reused here as the CreateFile creation disposition */
	if (F_ISSET(flags, MDB_RDONLY)) {
		oflags = GENERIC_READ;
		len = OPEN_EXISTING;
	} else {
		oflags = GENERIC_READ|GENERIC_WRITE;
		len = OPEN_ALWAYS;
	}
	mode = FILE_ATTRIBUTE_NORMAL;
	env->me_fd = CreateFile(dpath, oflags, FILE_SHARE_READ|FILE_SHARE_WRITE,
		NULL, len, mode, NULL);
#else
	if (F_ISSET(flags, MDB_RDONLY))
		oflags = O_RDONLY;
	else
		oflags = O_RDWR | O_CREAT;

	env->me_fd = open(dpath, oflags, mode);
#endif
	if (env->me_fd == INVALID_HANDLE_VALUE) {
		rc = ErrCode();
		goto leave;
	}

	/* Read-only with locking: the data file exists, now get the lockfile */
	if ((flags & (MDB_RDONLY|MDB_NOLOCK)) == MDB_RDONLY) {
		rc = mdb_env_setup_locks(env, lpath, mode, &excl);
		if (rc)
			goto leave;
	}

	if ((rc = mdb_env_open2(env)) == MDB_SUCCESS) {
		if (flags & (MDB_RDONLY|MDB_WRITEMAP)) {
			/* No separate descriptor for meta writes in these modes */
			env->me_mfd = env->me_fd;
		} else {
			/* Synchronous fd for meta writes. Needed even with
			 * MDB_NOSYNC/MDB_NOMETASYNC, in case these get reset.
			 */
#ifdef _WIN32
			len = OPEN_EXISTING;
			env->me_mfd = CreateFile(dpath, oflags,
				FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, len,
				mode | FILE_FLAG_WRITE_THROUGH, NULL);
#else
			oflags &= ~O_CREAT;
			env->me_mfd = open(dpath, oflags | MDB_DSYNC, mode);
#endif
			if (env->me_mfd == INVALID_HANDLE_VALUE) {
				rc = ErrCode();
				goto leave;
			}
		}
		DPRINTF(("opened dbenv %p", (void *) env));
		/* Setup is done: downgrade an exclusive file lock to shared */
		if (excl > 0) {
			rc = mdb_env_share_locks(env, &excl);
			if (rc)
				goto leave;
		}
		/* Writers also get a zeroed buffer of one page size */
		if (!((flags & MDB_RDONLY) ||
			  (env->me_pbuf = calloc(1, env->me_psize))))
			rc = ENOMEM;
	}

leave:
	if (rc) {
		mdb_env_close0(env, excl);
	}
	/* dpath points into the same allocation as lpath */
	free(lpath);
	return rc;
}
4211 
4212 /** Destroy resources from mdb_env_open(), clear our readers & DBIs */
4213 static void
4214 mdb_env_close0(MDB_env *env, int excl)
4215 {
4216 	int i;
4217 
4218 	if (!(env->me_flags & MDB_ENV_ACTIVE))
4219 		return;
4220 
4221 	/* Doing this here since me_dbxs may not exist during mdb_env_close */
4222 	for (i = env->me_maxdbs; --i > MAIN_DBI; )
4223 		free(env->me_dbxs[i].md_name.mv_data);
4224 
4225 	free(env->me_pbuf);
4226 	free(env->me_dbflags);
4227 	free(env->me_dbxs);
4228 	free(env->me_path);
4229 	free(env->me_dirty_list);
4230 	mdb_midl_free(env->me_free_pgs);
4231 
4232 	if (env->me_flags & MDB_ENV_TXKEY) {
4233 		pthread_key_delete(env->me_txkey);
4234 #ifdef _WIN32
4235 		/* Delete our key from the global list */
4236 		for (i=0; i<mdb_tls_nkeys; i++)
4237 			if (mdb_tls_keys[i] == env->me_txkey) {
4238 				mdb_tls_keys[i] = mdb_tls_keys[mdb_tls_nkeys-1];
4239 				mdb_tls_nkeys--;
4240 				break;
4241 			}
4242 #endif
4243 	}
4244 
4245 	if (env->me_map) {
4246 		munmap(env->me_map, env->me_mapsize);
4247 	}
4248 	if (env->me_mfd != env->me_fd && env->me_mfd != INVALID_HANDLE_VALUE)
4249 		(void) close(env->me_mfd);
4250 	if (env->me_fd != INVALID_HANDLE_VALUE)
4251 		(void) close(env->me_fd);
4252 	if (env->me_txns) {
4253 		MDB_PID_T pid = env->me_pid;
4254 		/* Clearing readers is done in this function because
4255 		 * me_txkey with its destructor must be disabled first.
4256 		 */
4257 		for (i = env->me_numreaders; --i >= 0; )
4258 			if (env->me_txns->mti_readers[i].mr_pid == pid)
4259 				env->me_txns->mti_readers[i].mr_pid = 0;
4260 #ifdef _WIN32
4261 		if (env->me_rmutex) {
4262 			CloseHandle(env->me_rmutex);
4263 			if (env->me_wmutex) CloseHandle(env->me_wmutex);
4264 		}
4265 		/* Windows automatically destroys the mutexes when
4266 		 * the last handle closes.
4267 		 */
4268 #elif defined(MDB_USE_POSIX_SEM)
4269 		if (env->me_rmutex != SEM_FAILED) {
4270 			sem_close(env->me_rmutex);
4271 			if (env->me_wmutex != SEM_FAILED)
4272 				sem_close(env->me_wmutex);
4273 			/* If we have the filelock:  If we are the
4274 			 * only remaining user, clean up semaphores.
4275 			 */
4276 			if (excl == 0)
4277 				mdb_env_excl_lock(env, &excl);
4278 			if (excl > 0) {
4279 				sem_unlink(env->me_txns->mti_rmname);
4280 				sem_unlink(env->me_txns->mti_wmname);
4281 			}
4282 		}
4283 #endif
4284 		munmap((void *)env->me_txns, (env->me_maxreaders-1)*sizeof(MDB_reader)+sizeof(MDB_txninfo));
4285 	}
4286 	if (env->me_lfd != INVALID_HANDLE_VALUE) {
4287 #ifdef _WIN32
4288 		if (excl >= 0) {
4289 			/* Unlock the lockfile.  Windows would have unlocked it
4290 			 * after closing anyway, but not necessarily at once.
4291 			 */
4292 			UnlockFile(env->me_lfd, 0, 0, 1, 0);
4293 		}
4294 #endif
4295 		(void) close(env->me_lfd);
4296 	}
4297 
4298 	env->me_flags &= ~(MDB_ENV_ACTIVE|MDB_ENV_TXKEY);
4299 }
4300 
/** Copy the environment's data pages to an already open file handle.
 * The copy runs inside a read-only transaction. The first two pages of
 * the map (the meta pages, going by the comment below) are written while
 * writers are blocked; the remaining pages, up to the transaction's
 * mt_next_pgno, are written afterwards in chunks of at most MAX_WRITE.
 * @param[in] env the environment to copy
 * @param[in] fd the handle to write to; the caller keeps ownership of it
 * @return 0 on success; the error from starting or renewing the
 *	transaction; ErrCode() of a failed write; EIO if a write reports
 *	success without writing anything.
 */
int
mdb_env_copyfd(MDB_env *env, HANDLE fd)
{
	MDB_txn *txn = NULL;
	int rc;
	size_t wsize;
	char *ptr;
	/* DO_WRITE leaves rc non-zero on success and stores the number of
	 * bytes written in len, on both platforms.
	 */
#ifdef _WIN32
	DWORD len, w2;
#define DO_WRITE(rc, fd, ptr, w2, len)	rc = WriteFile(fd, ptr, w2, &len, NULL)
#else
	ssize_t len;
	size_t w2;
#define DO_WRITE(rc, fd, ptr, w2, len)	len = write(fd, ptr, w2); rc = (len >= 0)
#endif

	/* Do the lock/unlock of the reader mutex before starting the
	 * write txn.  Otherwise other read txns could block writers.
	 */
	rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn);
	if (rc)
		return rc;

	/* Only when there is a lock region */
	if (env->me_txns) {
		/* We must start the actual read txn after blocking writers */
		mdb_txn_reset0(txn, "reset-stage1");

		/* Temporarily block writers until we snapshot the meta pages */
		LOCK_MUTEX_W(env);

		rc = mdb_txn_renew0(txn);
		if (rc) {
			UNLOCK_MUTEX_W(env);
			goto leave;
		}
	}

	/* Stage 1: the first two pages, looping over short writes */
	wsize = env->me_psize * 2;
	ptr = env->me_map;
	w2 = wsize;
	while (w2 > 0) {
		DO_WRITE(rc, fd, ptr, w2, len);
		if (!rc) {
			rc = ErrCode();
			break;
		} else if (len > 0) {
			rc = MDB_SUCCESS;
			ptr += len;
			w2 -= len;
			continue;
		} else {
			/* Non-blocking or async handles are not supported */
			rc = EIO;
			break;
		}
	}
	if (env->me_txns)
		UNLOCK_MUTEX_W(env);

	if (rc)
		goto leave;

	/* Stage 2: everything after those two pages that the txn can see */
	wsize = txn->mt_next_pgno * env->me_psize - wsize;
	while (wsize > 0) {
		if (wsize > MAX_WRITE)
			w2 = MAX_WRITE;
		else
			w2 = wsize;
		DO_WRITE(rc, fd, ptr, w2, len);
		if (!rc) {
			rc = ErrCode();
			break;
		} else if (len > 0) {
			rc = MDB_SUCCESS;
			ptr += len;
			wsize -= len;
			continue;
		} else {
			/* Success with nothing written: treated as an I/O error */
			rc = EIO;
			break;
		}
	}

leave:
	mdb_txn_abort(txn);
	return rc;
}
4388 
4389 int
4390 mdb_env_copy(MDB_env *env, const char *path)
4391 {
4392 	int rc, len;
4393 	char *lpath;
4394 	HANDLE newfd = INVALID_HANDLE_VALUE;
4395 
4396 	if (env->me_flags & MDB_NOSUBDIR) {
4397 		lpath = (char *)path;
4398 	} else {
4399 		len = strlen(path);
4400 		len += sizeof(DATANAME);
4401 		lpath = malloc(len);
4402 		if (!lpath)
4403 			return ENOMEM;
4404 		sprintf(lpath, "%s" DATANAME, path);
4405 	}
4406 
4407 	/* The destination path must exist, but the destination file must not.
4408 	 * We don't want the OS to cache the writes, since the source data is
4409 	 * already in the OS cache.
4410 	 */
4411 #ifdef _WIN32
4412 	newfd = CreateFile(lpath, GENERIC_WRITE, 0, NULL, CREATE_NEW,
4413 				FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH, NULL);
4414 #else
4415 	newfd = open(lpath, O_WRONLY|O_CREAT|O_EXCL, 0666);
4416 #endif
4417 	if (newfd == INVALID_HANDLE_VALUE) {
4418 		rc = ErrCode();
4419 		goto leave;
4420 	}
4421 
4422 #ifdef O_DIRECT
4423 	/* Set O_DIRECT if the file system supports it */
4424 	if ((rc = fcntl(newfd, F_GETFL)) != -1)
4425 		(void) fcntl(newfd, F_SETFL, rc | O_DIRECT);
4426 #endif
4427 #ifdef F_NOCACHE	/* __APPLE__ */
4428 	rc = fcntl(newfd, F_NOCACHE, 1);
4429 	if (rc) {
4430 		rc = ErrCode();
4431 		goto leave;
4432 	}
4433 #endif
4434 
4435 	rc = mdb_env_copyfd(env, newfd);
4436 
4437 leave:
4438 	if (!(env->me_flags & MDB_NOSUBDIR))
4439 		free(lpath);
4440 	if (newfd != INVALID_HANDLE_VALUE)
4441 		if (close(newfd) < 0 && rc == MDB_SUCCESS)
4442 			rc = ErrCode();
4443 
4444 	return rc;
4445 }
4446 
4447 void
4448 mdb_env_close(MDB_env *env)
4449 {
4450 	MDB_page *dp;
4451 
4452 	if (env == NULL)
4453 		return;
4454 
4455 	VGMEMP_DESTROY(env);
4456 	while ((dp = env->me_dpages) != NULL) {
4457 		VGMEMP_DEFINED(&dp->mp_next, sizeof(dp->mp_next));
4458 		env->me_dpages = dp->mp_next;
4459 		free(dp);
4460 	}
4461 
4462 	mdb_env_close0(env, 0);
4463 	free(env);
4464 }
4465 
4466 /** Compare two items pointing at aligned size_t's */
4467 static int
4468 mdb_cmp_long(const MDB_val *a, const MDB_val *b)
4469 {
4470 	return (*(size_t *)a->mv_data < *(size_t *)b->mv_data) ? -1 :
4471 		*(size_t *)a->mv_data > *(size_t *)b->mv_data;
4472 }
4473 
4474 /** Compare two items pointing at aligned unsigned int's */
4475 static int
4476 mdb_cmp_int(const MDB_val *a, const MDB_val *b)
4477 {
4478 	return (*(unsigned int *)a->mv_data < *(unsigned int *)b->mv_data) ? -1 :
4479 		*(unsigned int *)a->mv_data > *(unsigned int *)b->mv_data;
4480 }
4481 
4482 /** Compare two items pointing at unsigned ints of unknown alignment.
4483  *	Nodes and keys are guaranteed to be 2-byte aligned.
4484  */
4485 static int
4486 mdb_cmp_cint(const MDB_val *a, const MDB_val *b)
4487 {
4488 #if BYTE_ORDER == LITTLE_ENDIAN
4489 	unsigned short *u, *c;
4490 	int x;
4491 
4492 	u = (unsigned short *) ((char *) a->mv_data + a->mv_size);
4493 	c = (unsigned short *) ((char *) b->mv_data + a->mv_size);
4494 	do {
4495 		x = *--u - *--c;
4496 	} while(!x && u > (unsigned short *)a->mv_data);
4497 	return x;
4498 #else
4499 	return memcmp(a->mv_data, b->mv_data, a->mv_size);
4500 #endif
4501 }
4502 
4503 /** Compare two items lexically */
4504 static int
4505 mdb_cmp_memn(const MDB_val *a, const MDB_val *b)
4506 {
4507 	int diff;
4508 	ssize_t len_diff;
4509 	unsigned int len;
4510 
4511 	len = a->mv_size;
4512 	len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size;
4513 	if (len_diff > 0) {
4514 		len = b->mv_size;
4515 		len_diff = 1;
4516 	}
4517 
4518 	diff = memcmp(a->mv_data, b->mv_data, len);
4519 	return diff ? diff : len_diff<0 ? -1 : len_diff;
4520 }
4521 
4522 /** Compare two items in reverse byte order */
4523 static int
4524 mdb_cmp_memnr(const MDB_val *a, const MDB_val *b)
4525 {
4526 	const unsigned char	*p1, *p2, *p1_lim;
4527 	ssize_t len_diff;
4528 	int diff;
4529 
4530 	p1_lim = (const unsigned char *)a->mv_data;
4531 	p1 = (const unsigned char *)a->mv_data + a->mv_size;
4532 	p2 = (const unsigned char *)b->mv_data + b->mv_size;
4533 
4534 	len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size;
4535 	if (len_diff > 0) {
4536 		p1_lim += len_diff;
4537 		len_diff = 1;
4538 	}
4539 
4540 	while (p1 > p1_lim) {
4541 		diff = *--p1 - *--p2;
4542 		if (diff)
4543 			return diff;
4544 	}
4545 	return len_diff<0 ? -1 : len_diff;
4546 }
4547 
/** Search for key within a page, using binary search.
 * Returns the smallest entry larger or equal to the key.
 * If exactp is non-null, stores whether the found entry was an exact match
 * in *exactp (1 or 0).
 * Updates the cursor index with the index of the found entry.
 * If no entry larger or equal to the key is found, returns NULL.
 * @param[in,out] mc cursor whose top page is searched; mc_ki of the top
 *	level receives the resulting index
 * @param[in] key the key to look for
 * @param[out] exactp optional, see above
 * @return the node at the resulting index, or NULL. For LEAF2 pages the
 *	returned pointer is only a placeholder (node 0).
 */
static MDB_node *
mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp)
{
	unsigned int	 i = 0, nkeys;
	int		 low, high;
	int		 rc = 0;
	MDB_page *mp = mc->mc_pg[mc->mc_top];
	MDB_node	*node = NULL;
	MDB_val	 nodekey;
	MDB_cmp_func *cmp;
	DKBUF;

	nkeys = NUMKEYS(mp);

	DPRINTF(("searching %u keys in %s %spage %"Z"u",
	    nkeys, IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "",
	    mdb_dbg_pgno(mp)));

	/* On branch pages the search starts at index 1, skipping node 0 */
	low = IS_LEAF(mp) ? 0 : 1;
	high = nkeys - 1;
	cmp = mc->mc_dbx->md_cmp;

	/* Branch pages have no data, so if using integer keys,
	 * alignment is guaranteed. Use faster mdb_cmp_int.
	 */
	if (cmp == mdb_cmp_cint && IS_BRANCH(mp)) {
		if (NODEPTR(mp, 1)->mn_ksize == sizeof(size_t))
			cmp = mdb_cmp_long;
		else
			cmp = mdb_cmp_int;
	}

	if (IS_LEAF2(mp)) {
		/* LEAF2: keys of md_pad bytes each, addressed by index */
		nodekey.mv_size = mc->mc_db->md_pad;
		node = NODEPTR(mp, 0);	/* fake */
		while (low <= high) {
			i = (low + high) >> 1;
			nodekey.mv_data = LEAF2KEY(mp, i, nodekey.mv_size);
			rc = cmp(key, &nodekey);
			DPRINTF(("found leaf index %u [%s], rc = %i",
			    i, DKEY(&nodekey), rc));
			if (rc == 0)
				break;
			if (rc > 0)
				low = i + 1;
			else
				high = i - 1;
		}
	} else {
		while (low <= high) {
			i = (low + high) >> 1;

			node = NODEPTR(mp, i);
			nodekey.mv_size = NODEKSZ(node);
			nodekey.mv_data = NODEKEY(node);

			rc = cmp(key, &nodekey);
#if MDB_DEBUG
			if (IS_LEAF(mp))
				DPRINTF(("found leaf index %u [%s], rc = %i",
				    i, DKEY(&nodekey), rc));
			else
				DPRINTF(("found branch index %u [%s -> %"Z"u], rc = %i",
				    i, DKEY(&nodekey), NODEPGNO(node), rc));
#endif
			if (rc == 0)
				break;
			if (rc > 0)
				low = i + 1;
			else
				high = i - 1;
		}
	}

	/* rc still holds the result of the last comparison made (0 if
	 * the loop never ran).
	 */
	if (rc > 0) {	/* Found entry is less than the key. */
		i++;	/* Skip to get the smallest entry larger than key. */
		if (!IS_LEAF2(mp))
			node = NODEPTR(mp, i);
	}
	if (exactp)
		*exactp = (rc == 0 && nkeys > 0);
	/* store the key index */
	mc->mc_ki[mc->mc_top] = i;
	if (i >= nkeys)
		/* There is no entry larger or equal to the key. */
		return NULL;

	/* nodeptr is fake for LEAF2 */
	return node;
}
4645 
#if 0
/* Disabled code, excluded from the build by the "#if 0" above.
 * It walks the transaction's cursor list for mc's DBI and calls func on
 * every cursor whose top page is the same as mc's top page.
 * NOTE(review): the func parameter has no declared type, so this would
 * need work before it could be enabled.
 */
static void
mdb_cursor_adjust(MDB_cursor *mc, func)
{
	MDB_cursor *m2;

	for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
		if (m2->mc_pg[m2->mc_top] == mc->mc_pg[mc->mc_top]) {
			func(mc, m2);
		}
	}
}
#endif
4659 
4660 /** Pop a page off the top of the cursor's stack. */
4661 static void
4662 mdb_cursor_pop(MDB_cursor *mc)
4663 {
4664 	if (mc->mc_snum) {
4665 #if MDB_DEBUG
4666 		MDB_page	*top = mc->mc_pg[mc->mc_top];
4667 #endif
4668 		mc->mc_snum--;
4669 		if (mc->mc_snum)
4670 			mc->mc_top--;
4671 
4672 		DPRINTF(("popped page %"Z"u off db %d cursor %p", top->mp_pgno,
4673 			DDBI(mc), (void *) mc));
4674 	}
4675 }
4676 
4677 /** Push a page onto the top of the cursor's stack. */
4678 static int
4679 mdb_cursor_push(MDB_cursor *mc, MDB_page *mp)
4680 {
4681 	DPRINTF(("pushing page %"Z"u on db %d cursor %p", mp->mp_pgno,
4682 		DDBI(mc), (void *) mc));
4683 
4684 	if (mc->mc_snum >= CURSOR_STACK) {
4685 		mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
4686 		return MDB_CURSOR_FULL;
4687 	}
4688 
4689 	mc->mc_top = mc->mc_snum++;
4690 	mc->mc_pg[mc->mc_top] = mp;
4691 	mc->mc_ki[mc->mc_top] = 0;
4692 
4693 	return MDB_SUCCESS;
4694 }
4695 
4696 /** Find the address of the page corresponding to a given page number.
4697  * @param[in] txn the transaction for this access.
4698  * @param[in] pgno the page number for the page to retrieve.
4699  * @param[out] ret address of a pointer where the page's address will be stored.
4700  * @param[out] lvl dirty_list inheritance level of found page. 1=current txn, 0=mapped page.
4701  * @return 0 on success, non-zero on failure.
4702  */
4703 static int
4704 mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **ret, int *lvl)
4705 {
4706 	MDB_env *env = txn->mt_env;
4707 	MDB_page *p = NULL;
4708 	int level;
4709 
4710 	if (!((txn->mt_flags & MDB_TXN_RDONLY) | (env->me_flags & MDB_WRITEMAP))) {
4711 		MDB_txn *tx2 = txn;
4712 		level = 1;
4713 		do {
4714 			MDB_ID2L dl = tx2->mt_u.dirty_list;
4715 			unsigned x;
4716 			/* Spilled pages were dirtied in this txn and flushed
4717 			 * because the dirty list got full. Bring this page
4718 			 * back in from the map (but don't unspill it here,
4719 			 * leave that unless page_touch happens again).
4720 			 */
4721 			if (tx2->mt_spill_pgs) {
4722 				MDB_ID pn = pgno << 1;
4723 				x = mdb_midl_search(tx2->mt_spill_pgs, pn);
4724 				if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) {
4725 					p = (MDB_page *)(env->me_map + env->me_psize * pgno);
4726 					goto done;
4727 				}
4728 			}
4729 			if (dl[0].mid) {
4730 				unsigned x = mdb_mid2l_search(dl, pgno);
4731 				if (x <= dl[0].mid && dl[x].mid == pgno) {
4732 					p = dl[x].mptr;
4733 					goto done;
4734 				}
4735 			}
4736 			level++;
4737 		} while ((tx2 = tx2->mt_parent) != NULL);
4738 	}
4739 
4740 	if (pgno < txn->mt_next_pgno) {
4741 		level = 0;
4742 		p = (MDB_page *)(env->me_map + env->me_psize * pgno);
4743 	} else {
4744 		DPRINTF(("page %"Z"u not found", pgno));
4745 		txn->mt_flags |= MDB_TXN_ERROR;
4746 		return MDB_PAGE_NOTFOUND;
4747 	}
4748 
4749 done:
4750 	*ret = p;
4751 	if (lvl)
4752 		*lvl = level;
4753 	return MDB_SUCCESS;
4754 }
4755 
/** Finish #mdb_page_search() / #mdb_page_search_lowest().
 *	The cursor is at the root page, set up the rest of it.
 *	Descends from the cursor's top page through branch pages, pushing
 *	each visited page onto the cursor stack, until a leaf is reached.
 * @param[in,out] mc the cursor, positioned on the page to start from
 * @param[in] key the key to descend towards; not used for choosing the
 *	path when MDB_PS_FIRST or MDB_PS_LAST is given
 * @param[in] flags MDB_PS_FIRST / MDB_PS_LAST follow the first / last
 *	child of every branch page; MDB_PS_MODIFY touches each page that
 *	gets pushed via mdb_page_touch()
 * @return 0 on success, non-zero on failure. #MDB_CORRUPTED (with the
 *	transaction flagged MDB_TXN_ERROR) if the descent ends on a page
 *	that is not a leaf.
 */
static int
mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int flags)
{
	MDB_page	*mp = mc->mc_pg[mc->mc_top];
	int rc;
	DKBUF;

	while (IS_BRANCH(mp)) {
		MDB_node	*node;
		indx_t		i;

		DPRINTF(("branch page %"Z"u has %u keys", mp->mp_pgno, NUMKEYS(mp)));
		mdb_cassert(mc, NUMKEYS(mp) > 1);
		DPRINTF(("found index 0 to page %"Z"u", NODEPGNO(NODEPTR(mp, 0))));

		if (flags & (MDB_PS_FIRST|MDB_PS_LAST)) {
			i = 0;
			if (flags & MDB_PS_LAST)
				i = NUMKEYS(mp) - 1;
		} else {
			int	 exact;
			node = mdb_node_search(mc, key, &exact);
			if (node == NULL)
				/* No entry >= key on this page: take the last child */
				i = NUMKEYS(mp) - 1;
			else {
				i = mc->mc_ki[mc->mc_top];
				if (!exact) {
					/* The entry found is larger than the key,
					 * so step back to the one before it.
					 */
					mdb_cassert(mc, i > 0);
					i--;
				}
			}
			DPRINTF(("following index %u for key [%s]", i, DKEY(key)));
		}

		mdb_cassert(mc, i < NUMKEYS(mp));
		node = NODEPTR(mp, i);

		if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mp, NULL)) != 0)
			return rc;

		/* Record the index followed on this level before the push
		 * moves mc_top to the child.
		 */
		mc->mc_ki[mc->mc_top] = i;
		if ((rc = mdb_cursor_push(mc, mp)))
			return rc;

		if (flags & MDB_PS_MODIFY) {
			if ((rc = mdb_page_touch(mc)) != 0)
				return rc;
			/* Re-read the top page after touching it */
			mp = mc->mc_pg[mc->mc_top];
		}
	}

	if (!IS_LEAF(mp)) {
		DPRINTF(("internal error, index points to a %02X page!?",
		    mp->mp_flags));
		mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
		return MDB_CORRUPTED;
	}

	DPRINTF(("found leaf page %"Z"u for key [%s]", mp->mp_pgno,
	    key ? DKEY(key) : "null"));
	mc->mc_flags |= C_INITIALIZED;
	mc->mc_flags &= ~C_EOF;

	return MDB_SUCCESS;
}
4824 
4825 /** Search for the lowest key under the current branch page.
4826  * This just bypasses a NUMKEYS check in the current page
4827  * before calling mdb_page_search_root(), because the callers
4828  * are all in situations where the current page is known to
4829  * be underfilled.
4830  */
4831 static int
4832 mdb_page_search_lowest(MDB_cursor *mc)
4833 {
4834 	MDB_page	*mp = mc->mc_pg[mc->mc_top];
4835 	MDB_node	*node = NODEPTR(mp, 0);
4836 	int rc;
4837 
4838 	if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mp, NULL)) != 0)
4839 		return rc;
4840 
4841 	mc->mc_ki[mc->mc_top] = 0;
4842 	if ((rc = mdb_cursor_push(mc, mp)))
4843 		return rc;
4844 	return mdb_page_search_root(mc, NULL, MDB_PS_FIRST);
4845 }
4846 
4847 /** Search for the page a given key should be in.
4848  * Push it and its parent pages on the cursor stack.
4849  * @param[in,out] mc the cursor for this operation.
4850  * @param[in] key the key to search for, or NULL for first/last page.
4851  * @param[in] flags If MDB_PS_MODIFY is set, visited pages in the DB
4852  *   are touched (updated with new page numbers).
4853  *   If MDB_PS_FIRST or MDB_PS_LAST is set, find first or last leaf.
4854  *   This is used by #mdb_cursor_first() and #mdb_cursor_last().
4855  *   If MDB_PS_ROOTONLY set, just fetch root node, no further lookups.
4856  * @return 0 on success, non-zero on failure.
4857  */
4858 static int
4859 mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags)
4860 {
4861 	int		 rc;
4862 	pgno_t		 root;
4863 
4864 	/* Make sure the txn is still viable, then find the root from
4865 	 * the txn's db table and set it as the root of the cursor's stack.
4866 	 */
4867 	if (F_ISSET(mc->mc_txn->mt_flags, MDB_TXN_ERROR)) {
4868 		DPUTS("transaction has failed, must abort");
4869 		return MDB_BAD_TXN;
4870 	} else {
4871 		/* Make sure we're using an up-to-date root */
4872 		if (*mc->mc_dbflag & DB_STALE) {
4873 				MDB_cursor mc2;
4874 				mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL);
4875 				rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, 0);
4876 				if (rc)
4877 					return rc;
4878 				{
4879 					MDB_val data;
4880 					int exact = 0;
4881 					uint16_t flags;
4882 					MDB_node *leaf = mdb_node_search(&mc2,
4883 						&mc->mc_dbx->md_name, &exact);
4884 					if (!exact)
4885 						return MDB_NOTFOUND;
4886 					rc = mdb_node_read(mc->mc_txn, leaf, &data);
4887 					if (rc)
4888 						return rc;
4889 					memcpy(&flags, ((char *) data.mv_data + offsetof(MDB_db, md_flags)),
4890 						sizeof(uint16_t));
4891 					/* The txn may not know this DBI, or another process may
4892 					 * have dropped and recreated the DB with other flags.
4893 					 */
4894 					if ((mc->mc_db->md_flags & PERSISTENT_FLAGS) != flags)
4895 						return MDB_INCOMPATIBLE;
4896 					memcpy(mc->mc_db, data.mv_data, sizeof(MDB_db));
4897 				}
4898 				*mc->mc_dbflag &= ~DB_STALE;
4899 		}
4900 		root = mc->mc_db->md_root;
4901 
4902 		if (root == P_INVALID) {		/* Tree is empty. */
4903 			DPUTS("tree is empty");
4904 			return MDB_NOTFOUND;
4905 		}
4906 	}
4907 
4908 	mdb_cassert(mc, root > 1);
4909 	if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root)
4910 		if ((rc = mdb_page_get(mc->mc_txn, root, &mc->mc_pg[0], NULL)) != 0)
4911 			return rc;
4912 
4913 	mc->mc_snum = 1;
4914 	mc->mc_top = 0;
4915 
4916 	DPRINTF(("db %d root page %"Z"u has flags 0x%X",
4917 		DDBI(mc), root, mc->mc_pg[0]->mp_flags));
4918 
4919 	if (flags & MDB_PS_MODIFY) {
4920 		if ((rc = mdb_page_touch(mc)))
4921 			return rc;
4922 	}
4923 
4924 	if (flags & MDB_PS_ROOTONLY)
4925 		return MDB_SUCCESS;
4926 
4927 	return mdb_page_search_root(mc, key, flags);
4928 }
4929 
4930 static int
4931 mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp)
4932 {
4933 	MDB_txn *txn = mc->mc_txn;
4934 	pgno_t pg = mp->mp_pgno;
4935 	unsigned x = 0, ovpages = mp->mp_pages;
4936 	MDB_env *env = txn->mt_env;
4937 	MDB_IDL sl = txn->mt_spill_pgs;
4938 	MDB_ID pn = pg << 1;
4939 	int rc;
4940 
4941 	DPRINTF(("free ov page %"Z"u (%d)", pg, ovpages));
4942 	/* If the page is dirty or on the spill list we just acquired it,
4943 	 * so we should give it back to our current free list, if any.
4944 	 * Otherwise put it onto the list of pages we freed in this txn.
4945 	 *
4946 	 * Won't create me_pghead: me_pglast must be inited along with it.
4947 	 * Unsupported in nested txns: They would need to hide the page
4948 	 * range in ancestor txns' dirty and spilled lists.
4949 	 */
4950 	if (env->me_pghead &&
4951 		!txn->mt_parent &&
4952 		((mp->mp_flags & P_DIRTY) ||
4953 		 (sl && (x = mdb_midl_search(sl, pn)) <= sl[0] && sl[x] == pn)))
4954 	{
4955 		unsigned i, j;
4956 		pgno_t *mop;
4957 		MDB_ID2 *dl, ix, iy;
4958 		rc = mdb_midl_need(&env->me_pghead, ovpages);
4959 		if (rc)
4960 			return rc;
4961 		if (!(mp->mp_flags & P_DIRTY)) {
4962 			/* This page is no longer spilled */
4963 			if (x == sl[0])
4964 				sl[0]--;
4965 			else
4966 				sl[x] |= 1;
4967 			goto release;
4968 		}
4969 		/* Remove from dirty list */
4970 		dl = txn->mt_u.dirty_list;
4971 		x = dl[0].mid--;
4972 		for (ix = dl[x]; ix.mptr != mp; ix = iy) {
4973 			if (x > 1) {
4974 				x--;
4975 				iy = dl[x];
4976 				dl[x] = ix;
4977 			} else {
4978 				mdb_cassert(mc, x > 1);
4979 				j = ++(dl[0].mid);
4980 				dl[j] = ix;		/* Unsorted. OK when MDB_TXN_ERROR. */
4981 				txn->mt_flags |= MDB_TXN_ERROR;
4982 				return MDB_CORRUPTED;
4983 			}
4984 		}
4985 		if (!(env->me_flags & MDB_WRITEMAP))
4986 			mdb_dpage_free(env, mp);
4987 release:
4988 		/* Insert in me_pghead */
4989 		mop = env->me_pghead;
4990 		j = mop[0] + ovpages;
4991 		for (i = mop[0]; i && mop[i] < pg; i--)
4992 			mop[j--] = mop[i];
4993 		while (j>i)
4994 			mop[j--] = pg++;
4995 		mop[0] += ovpages;
4996 	} else {
4997 		rc = mdb_midl_append_range(&txn->mt_free_pgs, pg, ovpages);
4998 		if (rc)
4999 			return rc;
5000 	}
5001 	mc->mc_db->md_overflow_pages -= ovpages;
5002 	return 0;
5003 }
5004 
5005 /** Return the data associated with a given node.
5006  * @param[in] txn The transaction for this operation.
5007  * @param[in] leaf The node being read.
5008  * @param[out] data Updated to point to the node's data.
5009  * @return 0 on success, non-zero on failure.
5010  */
5011 static int
5012 mdb_node_read(MDB_txn *txn, MDB_node *leaf, MDB_val *data)
5013 {
5014 	MDB_page	*omp;		/* overflow page */
5015 	pgno_t		 pgno;
5016 	int rc;
5017 
5018 	if (!F_ISSET(leaf->mn_flags, F_BIGDATA)) {
5019 		data->mv_size = NODEDSZ(leaf);
5020 		data->mv_data = NODEDATA(leaf);
5021 		return MDB_SUCCESS;
5022 	}
5023 
5024 	/* Read overflow data.
5025 	 */
5026 	data->mv_size = NODEDSZ(leaf);
5027 	memcpy(&pgno, NODEDATA(leaf), sizeof(pgno));
5028 	if ((rc = mdb_page_get(txn, pgno, &omp, NULL)) != 0) {
5029 		DPRINTF(("read overflow page %"Z"u failed", pgno));
5030 		return rc;
5031 	}
5032 	data->mv_data = METADATA(omp);
5033 
5034 	return MDB_SUCCESS;
5035 }
5036 
5037 int
5038 mdb_get(MDB_txn *txn, MDB_dbi dbi,
5039     MDB_val *key, MDB_val *data)
5040 {
5041 	MDB_cursor	mc;
5042 	MDB_xcursor	mx;
5043 	int exact = 0;
5044 	DKBUF;
5045 
5046 	if (key == NULL || data == NULL)
5047 		return EINVAL;
5048 
5049 	DPRINTF(("===> get db %u key [%s]", dbi, DKEY(key)));
5050 
5051 	if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID))
5052 		return EINVAL;
5053 
5054 	if (txn->mt_flags & MDB_TXN_ERROR)
5055 		return MDB_BAD_TXN;
5056 
5057 	mdb_cursor_init(&mc, txn, dbi, &mx);
5058 	return mdb_cursor_set(&mc, key, data, MDB_SET, &exact);
5059 }
5060 
5061 /** Find a sibling for a page.
5062  * Replaces the page at the top of the cursor's stack with the
5063  * specified sibling, if one exists.
5064  * @param[in] mc The cursor for this operation.
5065  * @param[in] move_right Non-zero if the right sibling is requested,
5066  * otherwise the left sibling.
5067  * @return 0 on success, non-zero on failure.
5068  */
5069 static int
5070 mdb_cursor_sibling(MDB_cursor *mc, int move_right)
5071 {
5072 	int		 rc;
5073 	MDB_node	*indx;
5074 	MDB_page	*mp;
5075 
5076 	if (mc->mc_snum < 2) {
5077 		return MDB_NOTFOUND;		/* root has no siblings */
5078 	}
5079 
5080 	mdb_cursor_pop(mc);
5081 	DPRINTF(("parent page is page %"Z"u, index %u",
5082 		mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top]));
5083 
5084 	if (move_right ? (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mc->mc_pg[mc->mc_top]))
5085 		       : (mc->mc_ki[mc->mc_top] == 0)) {
5086 		DPRINTF(("no more keys left, moving to %s sibling",
5087 		    move_right ? "right" : "left"));
5088 		if ((rc = mdb_cursor_sibling(mc, move_right)) != MDB_SUCCESS) {
5089 			/* undo cursor_pop before returning */
5090 			mc->mc_top++;
5091 			mc->mc_snum++;
5092 			return rc;
5093 		}
5094 	} else {
5095 		if (move_right)
5096 			mc->mc_ki[mc->mc_top]++;
5097 		else
5098 			mc->mc_ki[mc->mc_top]--;
5099 		DPRINTF(("just moving to %s index key %u",
5100 		    move_right ? "right" : "left", mc->mc_ki[mc->mc_top]));
5101 	}
5102 	mdb_cassert(mc, IS_BRANCH(mc->mc_pg[mc->mc_top]));
5103 
5104 	indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
5105 	if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(indx), &mp, NULL)) != 0) {
5106 		/* mc will be inconsistent if caller does mc_snum++ as above */
5107 		mc->mc_flags &= ~(C_INITIALIZED|C_EOF);
5108 		return rc;
5109 	}
5110 
5111 	mdb_cursor_push(mc, mp);
5112 	if (!move_right)
5113 		mc->mc_ki[mc->mc_top] = NUMKEYS(mp)-1;
5114 
5115 	return MDB_SUCCESS;
5116 }
5117 
5118 /** Move the cursor to the next data item. */
5119 static int
5120 mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op)
5121 {
5122 	MDB_page	*mp;
5123 	MDB_node	*leaf;
5124 	int rc;
5125 
5126 	if (mc->mc_flags & C_EOF) {
5127 		return MDB_NOTFOUND;
5128 	}
5129 
5130 	mdb_cassert(mc, mc->mc_flags & C_INITIALIZED);
5131 
5132 	mp = mc->mc_pg[mc->mc_top];
5133 
5134 	if (mc->mc_db->md_flags & MDB_DUPSORT) {
5135 		leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
5136 		if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
5137 			if (op == MDB_NEXT || op == MDB_NEXT_DUP) {
5138 				rc = mdb_cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_NEXT);
5139 				if (op != MDB_NEXT || rc != MDB_NOTFOUND) {
5140 					if (rc == MDB_SUCCESS)
5141 						MDB_GET_KEY(leaf, key);
5142 					return rc;
5143 				}
5144 			}
5145 		} else {
5146 			mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
5147 			if (op == MDB_NEXT_DUP)
5148 				return MDB_NOTFOUND;
5149 		}
5150 	}
5151 
5152 	DPRINTF(("cursor_next: top page is %"Z"u in cursor %p",
5153 		mdb_dbg_pgno(mp), (void *) mc));
5154 	if (mc->mc_flags & C_DEL)
5155 		goto skip;
5156 
5157 	if (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mp)) {
5158 		DPUTS("=====> move to next sibling page");
5159 		if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS) {
5160 			mc->mc_flags |= C_EOF;
5161 			return rc;
5162 		}
5163 		mp = mc->mc_pg[mc->mc_top];
5164 		DPRINTF(("next page is %"Z"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]));
5165 	} else
5166 		mc->mc_ki[mc->mc_top]++;
5167 
5168 skip:
5169 	DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u",
5170 	    mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top]));
5171 
5172 	if (IS_LEAF2(mp)) {
5173 		key->mv_size = mc->mc_db->md_pad;
5174 		key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
5175 		return MDB_SUCCESS;
5176 	}
5177 
5178 	mdb_cassert(mc, IS_LEAF(mp));
5179 	leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
5180 
5181 	if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
5182 		mdb_xcursor_init1(mc, leaf);
5183 	}
5184 	if (data) {
5185 		if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS)
5186 			return rc;
5187 
5188 		if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
5189 			rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
5190 			if (rc != MDB_SUCCESS)
5191 				return rc;
5192 		}
5193 	}
5194 
5195 	MDB_GET_KEY(leaf, key);
5196 	return MDB_SUCCESS;
5197 }
5198 
5199 /** Move the cursor to the previous data item. */
5200 static int
5201 mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op)
5202 {
5203 	MDB_page	*mp;
5204 	MDB_node	*leaf;
5205 	int rc;
5206 
5207 	mdb_cassert(mc, mc->mc_flags & C_INITIALIZED);
5208 
5209 	mp = mc->mc_pg[mc->mc_top];
5210 
5211 	if (mc->mc_db->md_flags & MDB_DUPSORT) {
5212 		leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
5213 		if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
5214 			if (op == MDB_PREV || op == MDB_PREV_DUP) {
5215 				rc = mdb_cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_PREV);
5216 				if (op != MDB_PREV || rc != MDB_NOTFOUND) {
5217 					if (rc == MDB_SUCCESS)
5218 						MDB_GET_KEY(leaf, key);
5219 					return rc;
5220 				}
5221 			} else {
5222 				mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
5223 				if (op == MDB_PREV_DUP)
5224 					return MDB_NOTFOUND;
5225 			}
5226 		}
5227 	}
5228 
5229 	DPRINTF(("cursor_prev: top page is %"Z"u in cursor %p",
5230 		mdb_dbg_pgno(mp), (void *) mc));
5231 
5232 	if (mc->mc_ki[mc->mc_top] == 0)  {
5233 		DPUTS("=====> move to prev sibling page");
5234 		if ((rc = mdb_cursor_sibling(mc, 0)) != MDB_SUCCESS) {
5235 			return rc;
5236 		}
5237 		mp = mc->mc_pg[mc->mc_top];
5238 		mc->mc_ki[mc->mc_top] = NUMKEYS(mp) - 1;
5239 		DPRINTF(("prev page is %"Z"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]));
5240 	} else
5241 		mc->mc_ki[mc->mc_top]--;
5242 
5243 	mc->mc_flags &= ~C_EOF;
5244 
5245 	DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u",
5246 	    mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top]));
5247 
5248 	if (IS_LEAF2(mp)) {
5249 		key->mv_size = mc->mc_db->md_pad;
5250 		key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
5251 		return MDB_SUCCESS;
5252 	}
5253 
5254 	mdb_cassert(mc, IS_LEAF(mp));
5255 	leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
5256 
5257 	if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
5258 		mdb_xcursor_init1(mc, leaf);
5259 	}
5260 	if (data) {
5261 		if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS)
5262 			return rc;
5263 
5264 		if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
5265 			rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL);
5266 			if (rc != MDB_SUCCESS)
5267 				return rc;
5268 		}
5269 	}
5270 
5271 	MDB_GET_KEY(leaf, key);
5272 	return MDB_SUCCESS;
5273 }
5274 
5275 /** Set the cursor on a specific data item. */
5276 static int
5277 mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data,
5278     MDB_cursor_op op, int *exactp)
5279 {
5280 	int		 rc;
5281 	MDB_page	*mp;
5282 	MDB_node	*leaf = NULL;
5283 	DKBUF;
5284 
5285 	if (key->mv_size == 0)
5286 		return MDB_BAD_VALSIZE;
5287 
5288 	if (mc->mc_xcursor)
5289 		mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
5290 
5291 	/* See if we're already on the right page */
5292 	if (mc->mc_flags & C_INITIALIZED) {
5293 		MDB_val nodekey;
5294 
5295 		mp = mc->mc_pg[mc->mc_top];
5296 		if (!NUMKEYS(mp)) {
5297 			mc->mc_ki[mc->mc_top] = 0;
5298 			return MDB_NOTFOUND;
5299 		}
5300 		if (mp->mp_flags & P_LEAF2) {
5301 			nodekey.mv_size = mc->mc_db->md_pad;
5302 			nodekey.mv_data = LEAF2KEY(mp, 0, nodekey.mv_size);
5303 		} else {
5304 			leaf = NODEPTR(mp, 0);
5305 			MDB_GET_KEY2(leaf, nodekey);
5306 		}
5307 		rc = mc->mc_dbx->md_cmp(key, &nodekey);
5308 		if (rc == 0) {
5309 			/* Probably happens rarely, but first node on the page
5310 			 * was the one we wanted.
5311 			 */
5312 			mc->mc_ki[mc->mc_top] = 0;
5313 			if (exactp)
5314 				*exactp = 1;
5315 			goto set1;
5316 		}
5317 		if (rc > 0) {
5318 			unsigned int i;
5319 			unsigned int nkeys = NUMKEYS(mp);
5320 			if (nkeys > 1) {
5321 				if (mp->mp_flags & P_LEAF2) {
5322 					nodekey.mv_data = LEAF2KEY(mp,
5323 						 nkeys-1, nodekey.mv_size);
5324 				} else {
5325 					leaf = NODEPTR(mp, nkeys-1);
5326 					MDB_GET_KEY2(leaf, nodekey);
5327 				}
5328 				rc = mc->mc_dbx->md_cmp(key, &nodekey);
5329 				if (rc == 0) {
5330 					/* last node was the one we wanted */
5331 					mc->mc_ki[mc->mc_top] = nkeys-1;
5332 					if (exactp)
5333 						*exactp = 1;
5334 					goto set1;
5335 				}
5336 				if (rc < 0) {
5337 					if (mc->mc_ki[mc->mc_top] < NUMKEYS(mp)) {
5338 						/* This is definitely the right page, skip search_page */
5339 						if (mp->mp_flags & P_LEAF2) {
5340 							nodekey.mv_data = LEAF2KEY(mp,
5341 								 mc->mc_ki[mc->mc_top], nodekey.mv_size);
5342 						} else {
5343 							leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
5344 							MDB_GET_KEY2(leaf, nodekey);
5345 						}
5346 						rc = mc->mc_dbx->md_cmp(key, &nodekey);
5347 						if (rc == 0) {
5348 							/* current node was the one we wanted */
5349 							if (exactp)
5350 								*exactp = 1;
5351 							goto set1;
5352 						}
5353 					}
5354 					rc = 0;
5355 					goto set2;
5356 				}
5357 			}
5358 			/* If any parents have right-sibs, search.
5359 			 * Otherwise, there's nothing further.
5360 			 */
5361 			for (i=0; i<mc->mc_top; i++)
5362 				if (mc->mc_ki[i] <
5363 					NUMKEYS(mc->mc_pg[i])-1)
5364 					break;
5365 			if (i == mc->mc_top) {
5366 				/* There are no other pages */
5367 				mc->mc_ki[mc->mc_top] = nkeys;
5368 				return MDB_NOTFOUND;
5369 			}
5370 		}
5371 		if (!mc->mc_top) {
5372 			/* There are no other pages */
5373 			mc->mc_ki[mc->mc_top] = 0;
5374 			if (op == MDB_SET_RANGE) {
5375 				rc = 0;
5376 				goto set1;
5377 			} else
5378 				return MDB_NOTFOUND;
5379 		}
5380 	}
5381 
5382 	rc = mdb_page_search(mc, key, 0);
5383 	if (rc != MDB_SUCCESS)
5384 		return rc;
5385 
5386 	mp = mc->mc_pg[mc->mc_top];
5387 	mdb_cassert(mc, IS_LEAF(mp));
5388 
5389 set2:
5390 	leaf = mdb_node_search(mc, key, exactp);
5391 	if (exactp != NULL && !*exactp) {
5392 		/* MDB_SET specified and not an exact match. */
5393 		return MDB_NOTFOUND;
5394 	}
5395 
5396 	if (leaf == NULL) {
5397 		DPUTS("===> inexact leaf not found, goto sibling");
5398 		if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS)
5399 			return rc;		/* no entries matched */
5400 		mp = mc->mc_pg[mc->mc_top];
5401 		mdb_cassert(mc, IS_LEAF(mp));
5402 		leaf = NODEPTR(mp, 0);
5403 	}
5404 
5405 set1:
5406 	mc->mc_flags |= C_INITIALIZED;
5407 	mc->mc_flags &= ~C_EOF;
5408 
5409 	if (IS_LEAF2(mp)) {
5410 		key->mv_size = mc->mc_db->md_pad;
5411 		key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
5412 		return MDB_SUCCESS;
5413 	}
5414 
5415 	if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
5416 		mdb_xcursor_init1(mc, leaf);
5417 	}
5418 	if (data) {
5419 		if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
5420 			if (op == MDB_SET || op == MDB_SET_KEY || op == MDB_SET_RANGE) {
5421 				rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
5422 			} else {
5423 				int ex2, *ex2p;
5424 				if (op == MDB_GET_BOTH) {
5425 					ex2p = &ex2;
5426 					ex2 = 0;
5427 				} else {
5428 					ex2p = NULL;
5429 				}
5430 				rc = mdb_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_SET_RANGE, ex2p);
5431 				if (rc != MDB_SUCCESS)
5432 					return rc;
5433 			}
5434 		} else if (op == MDB_GET_BOTH || op == MDB_GET_BOTH_RANGE) {
5435 			MDB_val d2;
5436 			if ((rc = mdb_node_read(mc->mc_txn, leaf, &d2)) != MDB_SUCCESS)
5437 				return rc;
5438 			rc = mc->mc_dbx->md_dcmp(data, &d2);
5439 			if (rc) {
5440 				if (op == MDB_GET_BOTH || rc > 0)
5441 					return MDB_NOTFOUND;
5442 				rc = 0;
5443 				*data = d2;
5444 			}
5445 
5446 		} else {
5447 			if (mc->mc_xcursor)
5448 				mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
5449 			if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS)
5450 				return rc;
5451 		}
5452 	}
5453 
5454 	/* The key already matches in all other cases */
5455 	if (op == MDB_SET_RANGE || op == MDB_SET_KEY)
5456 		MDB_GET_KEY(leaf, key);
5457 	DPRINTF(("==> cursor placed on key [%s]", DKEY(key)));
5458 
5459 	return rc;
5460 }
5461 
5462 /** Move the cursor to the first item in the database. */
5463 static int
5464 mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data)
5465 {
5466 	int		 rc;
5467 	MDB_node	*leaf;
5468 
5469 	if (mc->mc_xcursor)
5470 		mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
5471 
5472 	if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) {
5473 		rc = mdb_page_search(mc, NULL, MDB_PS_FIRST);
5474 		if (rc != MDB_SUCCESS)
5475 			return rc;
5476 	}
5477 	mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top]));
5478 
5479 	leaf = NODEPTR(mc->mc_pg[mc->mc_top], 0);
5480 	mc->mc_flags |= C_INITIALIZED;
5481 	mc->mc_flags &= ~C_EOF;
5482 
5483 	mc->mc_ki[mc->mc_top] = 0;
5484 
5485 	if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
5486 		key->mv_size = mc->mc_db->md_pad;
5487 		key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], 0, key->mv_size);
5488 		return MDB_SUCCESS;
5489 	}
5490 
5491 	if (data) {
5492 		if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
5493 			mdb_xcursor_init1(mc, leaf);
5494 			rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
5495 			if (rc)
5496 				return rc;
5497 		} else {
5498 			if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS)
5499 				return rc;
5500 		}
5501 	}
5502 	MDB_GET_KEY(leaf, key);
5503 	return MDB_SUCCESS;
5504 }
5505 
5506 /** Move the cursor to the last item in the database. */
5507 static int
5508 mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data)
5509 {
5510 	int		 rc;
5511 	MDB_node	*leaf;
5512 
5513 	if (mc->mc_xcursor)
5514 		mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
5515 
5516 	if (!(mc->mc_flags & C_EOF)) {
5517 
5518 		if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) {
5519 			rc = mdb_page_search(mc, NULL, MDB_PS_LAST);
5520 			if (rc != MDB_SUCCESS)
5521 				return rc;
5522 		}
5523 		mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top]));
5524 
5525 	}
5526 	mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]) - 1;
5527 	mc->mc_flags |= C_INITIALIZED|C_EOF;
5528 	leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
5529 
5530 	if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
5531 		key->mv_size = mc->mc_db->md_pad;
5532 		key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], key->mv_size);
5533 		return MDB_SUCCESS;
5534 	}
5535 
5536 	if (data) {
5537 		if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
5538 			mdb_xcursor_init1(mc, leaf);
5539 			rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL);
5540 			if (rc)
5541 				return rc;
5542 		} else {
5543 			if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS)
5544 				return rc;
5545 		}
5546 	}
5547 
5548 	MDB_GET_KEY(leaf, key);
5549 	return MDB_SUCCESS;
5550 }
5551 
5552 int
5553 mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data,
5554     MDB_cursor_op op)
5555 {
5556 	int		 rc;
5557 	int		 exact = 0;
5558 	int		 (*mfunc)(MDB_cursor *mc, MDB_val *key, MDB_val *data);
5559 
5560 	if (mc == NULL)
5561 		return EINVAL;
5562 
5563 	if (mc->mc_txn->mt_flags & MDB_TXN_ERROR)
5564 		return MDB_BAD_TXN;
5565 
5566 	switch (op) {
5567 	case MDB_GET_CURRENT:
5568 		if (!(mc->mc_flags & C_INITIALIZED)) {
5569 			rc = EINVAL;
5570 		} else {
5571 			MDB_page *mp = mc->mc_pg[mc->mc_top];
5572 			int nkeys = NUMKEYS(mp);
5573 			if (!nkeys || mc->mc_ki[mc->mc_top] >= nkeys) {
5574 				mc->mc_ki[mc->mc_top] = nkeys;
5575 				rc = MDB_NOTFOUND;
5576 				break;
5577 			}
5578 			rc = MDB_SUCCESS;
5579 			if (IS_LEAF2(mp)) {
5580 				key->mv_size = mc->mc_db->md_pad;
5581 				key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
5582 			} else {
5583 				MDB_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
5584 				MDB_GET_KEY(leaf, key);
5585 				if (data) {
5586 					if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
5587 						if (mc->mc_flags & C_DEL)
5588 							mdb_xcursor_init1(mc, leaf);
5589 						rc = mdb_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_GET_CURRENT);
5590 					} else {
5591 						rc = mdb_node_read(mc->mc_txn, leaf, data);
5592 					}
5593 				}
5594 			}
5595 		}
5596 		break;
5597 	case MDB_GET_BOTH:
5598 	case MDB_GET_BOTH_RANGE:
5599 		if (data == NULL) {
5600 			rc = EINVAL;
5601 			break;
5602 		}
5603 		if (mc->mc_xcursor == NULL) {
5604 			rc = MDB_INCOMPATIBLE;
5605 			break;
5606 		}
5607 		/* FALLTHRU */
5608 	case MDB_SET:
5609 	case MDB_SET_KEY:
5610 	case MDB_SET_RANGE:
5611 		if (key == NULL) {
5612 			rc = EINVAL;
5613 		} else {
5614 			rc = mdb_cursor_set(mc, key, data, op,
5615 				op == MDB_SET_RANGE ? NULL : &exact);
5616 		}
5617 		break;
5618 	case MDB_GET_MULTIPLE:
5619 		if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) {
5620 			rc = EINVAL;
5621 			break;
5622 		}
5623 		if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) {
5624 			rc = MDB_INCOMPATIBLE;
5625 			break;
5626 		}
5627 		rc = MDB_SUCCESS;
5628 		if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) ||
5629 			(mc->mc_xcursor->mx_cursor.mc_flags & C_EOF))
5630 			break;
5631 		goto fetchm;
5632 	case MDB_NEXT_MULTIPLE:
5633 		if (data == NULL) {
5634 			rc = EINVAL;
5635 			break;
5636 		}
5637 		if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) {
5638 			rc = MDB_INCOMPATIBLE;
5639 			break;
5640 		}
5641 		if (!(mc->mc_flags & C_INITIALIZED))
5642 			rc = mdb_cursor_first(mc, key, data);
5643 		else
5644 			rc = mdb_cursor_next(mc, key, data, MDB_NEXT_DUP);
5645 		if (rc == MDB_SUCCESS) {
5646 			if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) {
5647 				MDB_cursor *mx;
5648 fetchm:
5649 				mx = &mc->mc_xcursor->mx_cursor;
5650 				data->mv_size = NUMKEYS(mx->mc_pg[mx->mc_top]) *
5651 					mx->mc_db->md_pad;
5652 				data->mv_data = METADATA(mx->mc_pg[mx->mc_top]);
5653 				mx->mc_ki[mx->mc_top] = NUMKEYS(mx->mc_pg[mx->mc_top])-1;
5654 			} else {
5655 				rc = MDB_NOTFOUND;
5656 			}
5657 		}
5658 		break;
5659 	case MDB_NEXT:
5660 	case MDB_NEXT_DUP:
5661 	case MDB_NEXT_NODUP:
5662 		if (!(mc->mc_flags & C_INITIALIZED))
5663 			rc = mdb_cursor_first(mc, key, data);
5664 		else
5665 			rc = mdb_cursor_next(mc, key, data, op);
5666 		break;
5667 	case MDB_PREV:
5668 	case MDB_PREV_DUP:
5669 	case MDB_PREV_NODUP:
5670 		if (!(mc->mc_flags & C_INITIALIZED)) {
5671 			rc = mdb_cursor_last(mc, key, data);
5672 			if (rc)
5673 				break;
5674 			mc->mc_flags |= C_INITIALIZED;
5675 			mc->mc_ki[mc->mc_top]++;
5676 		}
5677 		rc = mdb_cursor_prev(mc, key, data, op);
5678 		break;
5679 	case MDB_FIRST:
5680 		rc = mdb_cursor_first(mc, key, data);
5681 		break;
5682 	case MDB_FIRST_DUP:
5683 		mfunc = mdb_cursor_first;
5684 	mmove:
5685 		if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) {
5686 			rc = EINVAL;
5687 			break;
5688 		}
5689 		if (mc->mc_xcursor == NULL) {
5690 			rc = MDB_INCOMPATIBLE;
5691 			break;
5692 		}
5693 		if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) {
5694 			rc = EINVAL;
5695 			break;
5696 		}
5697 		rc = mfunc(&mc->mc_xcursor->mx_cursor, data, NULL);
5698 		break;
5699 	case MDB_LAST:
5700 		rc = mdb_cursor_last(mc, key, data);
5701 		break;
5702 	case MDB_LAST_DUP:
5703 		mfunc = mdb_cursor_last;
5704 		goto mmove;
5705 	default:
5706 		DPRINTF(("unhandled/unimplemented cursor operation %u", op));
5707 		rc = EINVAL;
5708 		break;
5709 	}
5710 
5711 	if (mc->mc_flags & C_DEL)
5712 		mc->mc_flags ^= C_DEL;
5713 
5714 	return rc;
5715 }
5716 
5717 /** Touch all the pages in the cursor stack. Set mc_top.
5718  *	Makes sure all the pages are writable, before attempting a write operation.
5719  * @param[in] mc The cursor to operate on.
5720  */
5721 static int
5722 mdb_cursor_touch(MDB_cursor *mc)
5723 {
5724 	int rc = MDB_SUCCESS;
5725 
5726 	if (mc->mc_dbi > MAIN_DBI && !(*mc->mc_dbflag & DB_DIRTY)) {
5727 		MDB_cursor mc2;
5728 		MDB_xcursor mcx;
5729 		mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, &mcx);
5730 		rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, MDB_PS_MODIFY);
5731 		if (rc)
5732 			 return rc;
5733 		*mc->mc_dbflag |= DB_DIRTY;
5734 	}
5735 	mc->mc_top = 0;
5736 	if (mc->mc_snum) {
5737 		do {
5738 			rc = mdb_page_touch(mc);
5739 		} while (!rc && ++(mc->mc_top) < mc->mc_snum);
5740 		mc->mc_top = mc->mc_snum-1;
5741 	}
5742 	return rc;
5743 }
5744 
/** Flag for #mdb_cursor_put(): do not spill pages to disk if the txn
 *	is getting full; the operation may fail instead.
 *	mdb_cursor_put() removes this bit from its flags argument before
 *	examining the other flags, and skips its mdb_page_spill() call
 *	when the bit was set.
 */
#define MDB_NOSPILL	0x8000
5747 
5748 int
5749 mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
5750     unsigned int flags)
5751 {
5752 	enum { MDB_NO_ROOT = MDB_LAST_ERRCODE+10 }; /* internal code */
5753 	MDB_env		*env;
5754 	MDB_node	*leaf = NULL;
5755 	MDB_page	*fp, *mp;
5756 	uint16_t	fp_flags;
5757 	MDB_val		xdata, *rdata, dkey, olddata;
5758 	MDB_db dummy;
5759 	int do_sub = 0, insert;
5760 	unsigned int mcount = 0, dcount = 0, nospill;
5761 	size_t nsize;
5762 	int rc, rc2;
5763 	unsigned int nflags;
5764 	DKBUF;
5765 
5766 	if (mc == NULL || key == NULL)
5767 		return EINVAL;
5768 
5769 	env = mc->mc_txn->mt_env;
5770 
5771 	/* Check this first so counter will always be zero on any
5772 	 * early failures.
5773 	 */
5774 	if (flags & MDB_MULTIPLE) {
5775 		dcount = data[1].mv_size;
5776 		data[1].mv_size = 0;
5777 		if (!F_ISSET(mc->mc_db->md_flags, MDB_DUPFIXED))
5778 			return MDB_INCOMPATIBLE;
5779 	}
5780 
5781 	nospill = flags & MDB_NOSPILL;
5782 	flags &= ~MDB_NOSPILL;
5783 
5784 	if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR))
5785 		return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN;
5786 
5787 	if (flags != MDB_CURRENT && key->mv_size-1 >= ENV_MAXKEY(env))
5788 		return MDB_BAD_VALSIZE;
5789 
5790 #if SIZE_MAX > MAXDATASIZE
5791 	if (data->mv_size > ((mc->mc_db->md_flags & MDB_DUPSORT) ? ENV_MAXKEY(env) : MAXDATASIZE))
5792 		return MDB_BAD_VALSIZE;
5793 #else
5794 	if ((mc->mc_db->md_flags & MDB_DUPSORT) && data->mv_size > ENV_MAXKEY(env))
5795 		return MDB_BAD_VALSIZE;
5796 #endif
5797 
5798 	DPRINTF(("==> put db %d key [%s], size %"Z"u, data size %"Z"u",
5799 		DDBI(mc), DKEY(key), key ? key->mv_size : 0, data->mv_size));
5800 
5801 	dkey.mv_size = 0;
5802 
5803 	if (flags == MDB_CURRENT) {
5804 		if (!(mc->mc_flags & C_INITIALIZED))
5805 			return EINVAL;
5806 		rc = MDB_SUCCESS;
5807 	} else if (mc->mc_db->md_root == P_INVALID) {
5808 		/* new database, cursor has nothing to point to */
5809 		mc->mc_snum = 0;
5810 		mc->mc_top = 0;
5811 		mc->mc_flags &= ~C_INITIALIZED;
5812 		rc = MDB_NO_ROOT;
5813 	} else {
5814 		int exact = 0;
5815 		MDB_val d2;
5816 		if (flags & MDB_APPEND) {
5817 			MDB_val k2;
5818 			rc = mdb_cursor_last(mc, &k2, &d2);
5819 			if (rc == 0) {
5820 				rc = mc->mc_dbx->md_cmp(key, &k2);
5821 				if (rc > 0) {
5822 					rc = MDB_NOTFOUND;
5823 					mc->mc_ki[mc->mc_top]++;
5824 				} else {
5825 					/* new key is <= last key */
5826 					rc = MDB_KEYEXIST;
5827 				}
5828 			}
5829 		} else {
5830 			rc = mdb_cursor_set(mc, key, &d2, MDB_SET, &exact);
5831 		}
5832 		if ((flags & MDB_NOOVERWRITE) && rc == 0) {
5833 			DPRINTF(("duplicate key [%s]", DKEY(key)));
5834 			*data = d2;
5835 			return MDB_KEYEXIST;
5836 		}
5837 		if (rc && rc != MDB_NOTFOUND)
5838 			return rc;
5839 	}
5840 
5841 	if (mc->mc_flags & C_DEL)
5842 		mc->mc_flags ^= C_DEL;
5843 
5844 	/* Cursor is positioned, check for room in the dirty list */
5845 	if (!nospill) {
5846 		if (flags & MDB_MULTIPLE) {
5847 			rdata = &xdata;
5848 			xdata.mv_size = data->mv_size * dcount;
5849 		} else {
5850 			rdata = data;
5851 		}
5852 		if ((rc2 = mdb_page_spill(mc, key, rdata)))
5853 			return rc2;
5854 	}
5855 
5856 	if (rc == MDB_NO_ROOT) {
5857 		MDB_page *np;
5858 		/* new database, write a root leaf page */
5859 		DPUTS("allocating new root leaf page");
5860 		if ((rc2 = mdb_page_new(mc, P_LEAF, 1, &np))) {
5861 			return rc2;
5862 		}
5863 		mdb_cursor_push(mc, np);
5864 		mc->mc_db->md_root = np->mp_pgno;
5865 		mc->mc_db->md_depth++;
5866 		*mc->mc_dbflag |= DB_DIRTY;
5867 		if ((mc->mc_db->md_flags & (MDB_DUPSORT|MDB_DUPFIXED))
5868 			== MDB_DUPFIXED)
5869 			np->mp_flags |= P_LEAF2;
5870 		mc->mc_flags |= C_INITIALIZED;
5871 	} else {
5872 		/* make sure all cursor pages are writable */
5873 		rc2 = mdb_cursor_touch(mc);
5874 		if (rc2)
5875 			return rc2;
5876 	}
5877 
5878 	insert = rc;
5879 	if (insert) {
5880 		/* The key does not exist */
5881 		DPRINTF(("inserting key at index %i", mc->mc_ki[mc->mc_top]));
5882 		if ((mc->mc_db->md_flags & MDB_DUPSORT) &&
5883 			LEAFSIZE(key, data) > env->me_nodemax)
5884 		{
5885 			/* Too big for a node, insert in sub-DB */
5886 			fp_flags = P_LEAF|P_DIRTY;
5887 			fp = env->me_pbuf;
5888 			fp->mp_pad = data->mv_size; /* used if MDB_DUPFIXED */
5889 			fp->mp_lower = fp->mp_upper = olddata.mv_size = PAGEHDRSZ;
5890 			goto prep_subDB;
5891 		}
5892 	} else {
5893 		/* there's only a key anyway, so this is a no-op */
5894 		if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
5895 			unsigned int ksize = mc->mc_db->md_pad;
5896 			if (key->mv_size != ksize)
5897 				return MDB_BAD_VALSIZE;
5898 			if (flags == MDB_CURRENT) {
5899 				char *ptr = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize);
5900 				memcpy(ptr, key->mv_data, ksize);
5901 			}
5902 			return MDB_SUCCESS;
5903 		}
5904 
5905 more:
5906 		leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
5907 		olddata.mv_size = NODEDSZ(leaf);
5908 		olddata.mv_data = NODEDATA(leaf);
5909 
5910 		/* DB has dups? */
5911 		if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) {
5912 			/* Prepare (sub-)page/sub-DB to accept the new item,
5913 			 * if needed.  fp: old sub-page or a header faking
5914 			 * it.  mp: new (sub-)page.  offset: growth in page
5915 			 * size.  xdata: node data with new page or DB.
5916 			 */
5917 			unsigned	i, offset = 0;
5918 			mp = fp = xdata.mv_data = env->me_pbuf;
5919 			mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
5920 
5921 			/* Was a single item before, must convert now */
5922 			if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
5923 				/* Just overwrite the current item */
5924 				if (flags == MDB_CURRENT)
5925 					goto current;
5926 
5927 #if UINT_MAX < SIZE_MAX
5928 				if (mc->mc_dbx->md_dcmp == mdb_cmp_int && olddata.mv_size == sizeof(size_t))
5929 #ifdef MISALIGNED_OK
5930 					mc->mc_dbx->md_dcmp = mdb_cmp_long;
5931 #else
5932 					mc->mc_dbx->md_dcmp = mdb_cmp_cint;
5933 #endif
5934 #endif
5935 				/* if data matches, skip it */
5936 				if (!mc->mc_dbx->md_dcmp(data, &olddata)) {
5937 					if (flags & MDB_NODUPDATA)
5938 						rc = MDB_KEYEXIST;
5939 					else if (flags & MDB_MULTIPLE)
5940 						goto next_mult;
5941 					else
5942 						rc = MDB_SUCCESS;
5943 					return rc;
5944 				}
5945 
5946 				/* Back up original data item */
5947 				dkey.mv_size = olddata.mv_size;
5948 				dkey.mv_data = memcpy(fp+1, olddata.mv_data, olddata.mv_size);
5949 
5950 				/* Make sub-page header for the dup items, with dummy body */
5951 				fp->mp_flags = P_LEAF|P_DIRTY|P_SUBP;
5952 				fp->mp_lower = PAGEHDRSZ;
5953 				xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size;
5954 				if (mc->mc_db->md_flags & MDB_DUPFIXED) {
5955 					fp->mp_flags |= P_LEAF2;
5956 					fp->mp_pad = data->mv_size;
5957 					xdata.mv_size += 2 * data->mv_size;	/* leave space for 2 more */
5958 				} else {
5959 					xdata.mv_size += 2 * (sizeof(indx_t) + NODESIZE) +
5960 						(dkey.mv_size & 1) + (data->mv_size & 1);
5961 				}
5962 				fp->mp_upper = xdata.mv_size;
5963 				olddata.mv_size = fp->mp_upper; /* pretend olddata is fp */
5964 			} else if (leaf->mn_flags & F_SUBDATA) {
5965 				/* Data is on sub-DB, just store it */
5966 				flags |= F_DUPDATA|F_SUBDATA;
5967 				goto put_sub;
5968 			} else {
5969 				/* Data is on sub-page */
5970 				fp = olddata.mv_data;
5971 				switch (flags) {
5972 				default:
5973 					if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) {
5974 						offset = EVEN(NODESIZE + sizeof(indx_t) +
5975 							data->mv_size);
5976 						break;
5977 					}
5978 					offset = fp->mp_pad;
5979 					if (SIZELEFT(fp) < offset) {
5980 						offset *= 4; /* space for 4 more */
5981 						break;
5982 					}
5983 					/* FALLTHRU: Big enough MDB_DUPFIXED sub-page */
5984 				case MDB_CURRENT:
5985 					fp->mp_flags |= P_DIRTY;
5986 					COPY_PGNO(fp->mp_pgno, mp->mp_pgno);
5987 					mc->mc_xcursor->mx_cursor.mc_pg[0] = fp;
5988 					flags |= F_DUPDATA;
5989 					goto put_sub;
5990 				}
5991 				xdata.mv_size = olddata.mv_size + offset;
5992 			}
5993 
5994 			fp_flags = fp->mp_flags;
5995 			if (NODESIZE + NODEKSZ(leaf) + xdata.mv_size > env->me_nodemax) {
5996 					/* Too big for a sub-page, convert to sub-DB */
5997 					fp_flags &= ~P_SUBP;
5998 prep_subDB:
5999 					if (mc->mc_db->md_flags & MDB_DUPFIXED) {
6000 						fp_flags |= P_LEAF2;
6001 						dummy.md_pad = fp->mp_pad;
6002 						dummy.md_flags = MDB_DUPFIXED;
6003 						if (mc->mc_db->md_flags & MDB_INTEGERDUP)
6004 							dummy.md_flags |= MDB_INTEGERKEY;
6005 					} else {
6006 						dummy.md_pad = 0;
6007 						dummy.md_flags = 0;
6008 					}
6009 					dummy.md_depth = 1;
6010 					dummy.md_branch_pages = 0;
6011 					dummy.md_leaf_pages = 1;
6012 					dummy.md_overflow_pages = 0;
6013 					dummy.md_entries = NUMKEYS(fp);
6014 					xdata.mv_size = sizeof(MDB_db);
6015 					xdata.mv_data = &dummy;
6016 					if ((rc = mdb_page_alloc(mc, 1, &mp)))
6017 						return rc;
6018 					offset = env->me_psize - olddata.mv_size;
6019 					flags |= F_DUPDATA|F_SUBDATA;
6020 					dummy.md_root = mp->mp_pgno;
6021 			}
6022 			if (mp != fp) {
6023 				mp->mp_flags = fp_flags | P_DIRTY;
6024 				mp->mp_pad   = fp->mp_pad;
6025 				mp->mp_lower = fp->mp_lower;
6026 				mp->mp_upper = fp->mp_upper + offset;
6027 				if (fp_flags & P_LEAF2) {
6028 					memcpy(METADATA(mp), METADATA(fp), NUMKEYS(fp) * fp->mp_pad);
6029 				} else {
6030 					memcpy((char *)mp + mp->mp_upper, (char *)fp + fp->mp_upper,
6031 						olddata.mv_size - fp->mp_upper);
6032 					for (i=0; i<NUMKEYS(fp); i++)
6033 						mp->mp_ptrs[i] = fp->mp_ptrs[i] + offset;
6034 				}
6035 			}
6036 
6037 			rdata = &xdata;
6038 			flags |= F_DUPDATA;
6039 			do_sub = 1;
6040 			if (!insert)
6041 				mdb_node_del(mc, 0);
6042 			goto new_sub;
6043 		}
6044 current:
6045 		/* overflow page overwrites need special handling */
6046 		if (F_ISSET(leaf->mn_flags, F_BIGDATA)) {
6047 			MDB_page *omp;
6048 			pgno_t pg;
6049 			int level, ovpages, dpages = OVPAGES(data->mv_size, env->me_psize);
6050 
6051 			memcpy(&pg, olddata.mv_data, sizeof(pg));
6052 			if ((rc2 = mdb_page_get(mc->mc_txn, pg, &omp, &level)) != 0)
6053 				return rc2;
6054 			ovpages = omp->mp_pages;
6055 
6056 			/* Is the ov page large enough? */
6057 			if (ovpages >= dpages) {
6058 			  if (!(omp->mp_flags & P_DIRTY) &&
6059 				  (level || (env->me_flags & MDB_WRITEMAP)))
6060 			  {
6061 				rc = mdb_page_unspill(mc->mc_txn, omp, &omp);
6062 				if (rc)
6063 					return rc;
6064 				level = 0;		/* dirty in this txn or clean */
6065 			  }
6066 			  /* Is it dirty? */
6067 			  if (omp->mp_flags & P_DIRTY) {
6068 				/* yes, overwrite it. Note in this case we don't
6069 				 * bother to try shrinking the page if the new data
6070 				 * is smaller than the overflow threshold.
6071 				 */
6072 				if (level > 1) {
6073 					/* It is writable only in a parent txn */
6074 					size_t sz = (size_t) env->me_psize * ovpages, off;
6075 					MDB_page *np = mdb_page_malloc(mc->mc_txn, ovpages);
6076 					MDB_ID2 id2;
6077 					if (!np)
6078 						return ENOMEM;
6079 					id2.mid = pg;
6080 					id2.mptr = np;
6081 					rc = mdb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2);
6082 					mdb_cassert(mc, rc == 0);
6083 					if (!(flags & MDB_RESERVE)) {
6084 						/* Copy end of page, adjusting alignment so
6085 						 * compiler may copy words instead of bytes.
6086 						 */
6087 						off = (PAGEHDRSZ + data->mv_size) & -sizeof(size_t);
6088 						memcpy((size_t *)((char *)np + off),
6089 							(size_t *)((char *)omp + off), sz - off);
6090 						sz = PAGEHDRSZ;
6091 					}
6092 					memcpy(np, omp, sz); /* Copy beginning of page */
6093 					omp = np;
6094 				}
6095 				SETDSZ(leaf, data->mv_size);
6096 				if (F_ISSET(flags, MDB_RESERVE))
6097 					data->mv_data = METADATA(omp);
6098 				else
6099 					memcpy(METADATA(omp), data->mv_data, data->mv_size);
6100 				goto done;
6101 			  }
6102 			}
6103 			if ((rc2 = mdb_ovpage_free(mc, omp)) != MDB_SUCCESS)
6104 				return rc2;
6105 		} else if (data->mv_size == olddata.mv_size) {
6106 			/* same size, just replace it. Note that we could
6107 			 * also reuse this node if the new data is smaller,
6108 			 * but instead we opt to shrink the node in that case.
6109 			 */
6110 			if (F_ISSET(flags, MDB_RESERVE))
6111 				data->mv_data = olddata.mv_data;
6112 			else if (data->mv_size)
6113 				memcpy(olddata.mv_data, data->mv_data, data->mv_size);
6114 			else
6115 				memcpy(NODEKEY(leaf), key->mv_data, key->mv_size);
6116 			goto done;
6117 		}
6118 		mdb_node_del(mc, 0);
6119 		mc->mc_db->md_entries--;
6120 	}
6121 
6122 	rdata = data;
6123 
6124 new_sub:
6125 	nflags = flags & NODE_ADD_FLAGS;
6126 	nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(env, key, rdata);
6127 	if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) {
6128 		if (( flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA )
6129 			nflags &= ~MDB_APPEND;
6130 		if (!insert)
6131 			nflags |= MDB_SPLIT_REPLACE;
6132 		rc = mdb_page_split(mc, key, rdata, P_INVALID, nflags);
6133 	} else {
6134 		/* There is room already in this leaf page. */
6135 		rc = mdb_node_add(mc, mc->mc_ki[mc->mc_top], key, rdata, 0, nflags);
6136 		if (rc == 0 && !do_sub && insert) {
6137 			/* Adjust other cursors pointing to mp */
6138 			MDB_cursor *m2, *m3;
6139 			MDB_dbi dbi = mc->mc_dbi;
6140 			unsigned i = mc->mc_top;
6141 			MDB_page *mp = mc->mc_pg[i];
6142 
6143 			for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
6144 				if (mc->mc_flags & C_SUB)
6145 					m3 = &m2->mc_xcursor->mx_cursor;
6146 				else
6147 					m3 = m2;
6148 				if (m3 == mc || m3->mc_snum < mc->mc_snum) continue;
6149 				if (m3->mc_pg[i] == mp && m3->mc_ki[i] >= mc->mc_ki[i]) {
6150 					m3->mc_ki[i]++;
6151 				}
6152 			}
6153 		}
6154 	}
6155 
6156 	if (rc != MDB_SUCCESS)
6157 		mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
6158 	else {
6159 		/* Now store the actual data in the child DB. Note that we're
6160 		 * storing the user data in the keys field, so there are strict
6161 		 * size limits on dupdata. The actual data fields of the child
6162 		 * DB are all zero size.
6163 		 */
6164 		if (do_sub) {
6165 			int xflags;
6166 put_sub:
6167 			xdata.mv_size = 0;
6168 			xdata.mv_data = "";
6169 			leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
6170 			if (flags & MDB_CURRENT) {
6171 				xflags = MDB_CURRENT|MDB_NOSPILL;
6172 			} else {
6173 				mdb_xcursor_init1(mc, leaf);
6174 				xflags = (flags & MDB_NODUPDATA) ?
6175 					MDB_NOOVERWRITE|MDB_NOSPILL : MDB_NOSPILL;
6176 			}
6177 			/* converted, write the original data first */
6178 			if (dkey.mv_size) {
6179 				rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, xflags);
6180 				if (rc)
6181 					return rc;
6182 				{
6183 					/* Adjust other cursors pointing to mp */
6184 					MDB_cursor *m2;
6185 					unsigned i = mc->mc_top;
6186 					MDB_page *mp = mc->mc_pg[i];
6187 
6188 					for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
6189 						if (m2 == mc || m2->mc_snum < mc->mc_snum) continue;
6190 						if (!(m2->mc_flags & C_INITIALIZED)) continue;
6191 						if (m2->mc_pg[i] == mp && m2->mc_ki[i] == mc->mc_ki[i]) {
6192 							mdb_xcursor_init1(m2, leaf);
6193 						}
6194 					}
6195 				}
6196 				/* we've done our job */
6197 				dkey.mv_size = 0;
6198 			}
6199 			if (flags & MDB_APPENDDUP)
6200 				xflags |= MDB_APPEND;
6201 			rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags);
6202 			if (flags & F_SUBDATA) {
6203 				void *db = NODEDATA(leaf);
6204 				memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db));
6205 			}
6206 		}
6207 		/* sub-writes might have failed so check rc again.
6208 		 * Don't increment count if we just replaced an existing item.
6209 		 */
6210 		if (!rc && !(flags & MDB_CURRENT))
6211 			mc->mc_db->md_entries++;
6212 		if (flags & MDB_MULTIPLE) {
6213 			if (!rc) {
6214 next_mult:
6215 				mcount++;
6216 				/* let caller know how many succeeded, if any */
6217 				data[1].mv_size = mcount;
6218 				if (mcount < dcount) {
6219 					data[0].mv_data = (char *)data[0].mv_data + data[0].mv_size;
6220 					goto more;
6221 				}
6222 			}
6223 		}
6224 	}
6225 done:
6226 	/* If we succeeded and the key didn't exist before, make sure
6227 	 * the cursor is marked valid.
6228 	 */
6229 	if (!rc && insert)
6230 		mc->mc_flags |= C_INITIALIZED;
6231 	return rc;
6232 }
6233 
6234 /** Delete the item at the cursor position.
6235  * Only the current duplicate of a key is removed unless #MDB_NODUPDATA
6236  * is set, in which case the node with all of its duplicates is deleted.
6237  * @param[in] mc The cursor; it must be positioned, else EINVAL.
6238  * @param[in] flags #MDB_NODUPDATA and/or #MDB_NOSPILL, or 0.
6239  * @return 0 on success, non-zero on failure.
6240  */
6241 int
6242 mdb_cursor_del(MDB_cursor *mc, unsigned int flags)
6243 {
6244 	MDB_cursor	*sub, *m2;
6245 	MDB_node	*leaf;
6246 	MDB_page	*mp;
6247 	int rc;
6248 
6249 	if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR))
6250 		return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN;
6251 	if (!(mc->mc_flags & C_INITIALIZED))
6252 		return EINVAL;
6253 	if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top]))
6254 		return MDB_NOTFOUND;
6255 	if (!(flags & MDB_NOSPILL) && (rc = mdb_page_spill(mc, NULL, NULL)))
6256 		return rc;
6257 	if ((rc = mdb_cursor_touch(mc)) != 0)
6258 		return rc;
6259 
6260 	mp = mc->mc_pg[mc->mc_top];
6261 	leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
6262 	if (IS_LEAF2(mp) || !F_ISSET(leaf->mn_flags, F_DUPDATA))
6263 		return mdb_cursor_del0(mc, leaf);	/* no duplicates here */
6264 	sub = &mc->mc_xcursor->mx_cursor;
6265 	if (!(flags & MDB_NODUPDATA)) {
6266 		if (!F_ISSET(leaf->mn_flags, F_SUBDATA))
6267 			sub->mc_pg[0] = NODEDATA(leaf);
6268 		rc = mdb_cursor_del(sub, MDB_NOSPILL);
6269 		if (rc)	/* failed: leave md_entries and C_DEL untouched */
6270 			return rc;
6271 		/* If sub-DB still has entries, we're done */
6272 		if (mc->mc_xcursor->mx_db.md_entries) {
6273 			if (leaf->mn_flags & F_SUBDATA) {
6274 				/* update subDB info */
6275 				memcpy(NODEDATA(leaf), &mc->mc_xcursor->mx_db, sizeof(MDB_db));
6276 			} else {
6277 				/* shrink fake page */
6278 				mdb_node_shrink(mp, mc->mc_ki[mc->mc_top]);
6279 				leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
6280 				sub->mc_pg[0] = NODEDATA(leaf);
6281 				/* fix other sub-DB cursors pointed at this fake page */
6282 				for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
6283 					if (m2 == mc || m2->mc_snum < mc->mc_snum) continue;
6284 					if (m2->mc_pg[mc->mc_top] == mp && m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top])
6285 						m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
6286 				}
6287 			}
6288 			mc->mc_db->md_entries--;
6289 			mc->mc_flags |= C_DEL;
6290 			return rc;
6291 		}
6292 		/* otherwise fall thru and delete the sub-DB */
6293 	}
6294 	if (leaf->mn_flags & F_SUBDATA) {
6295 		/* add all the child DB's pages to the free list */
6296 		rc = mdb_drop0(sub, 0);
6297 		if (rc) {	/* sub-DB not dropped: fail the txn, keep its node */
6298 			mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
6299 			return rc;
6300 		}
6301 		mc->mc_db->md_entries -= mc->mc_xcursor->mx_db.md_entries;
6302 	}
6303 	return mdb_cursor_del0(mc, leaf);
6304 }
6305 
6306 /** Obtain @p num fresh pages and set them up as a single empty page.
6307  * The page is marked dirty and the matching page counter of the
6308  * cursor's database is incremented.
6309  * @param[in] mc cursor on the database that will own the page(s).
6310  * @param[in] flags page type bits, e.g. #P_LEAF or #P_OVERFLOW.
6311  * @param[in] num number of pages; recorded in the page only for overflow.
6312  * @param[out] mp receives the new page; written only on success.
6313  * @return 0 on success, else the error from mdb_page_alloc().
6314  */
6315 static int
6316 mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp)
6317 {
6318 	MDB_page	*page;
6319 	int err = mdb_page_alloc(mc, num, &page);
6320 
6321 	if (err != 0)
6322 		return err;
6323 	DPRINTF(("allocated new mpage %"Z"u, page size %u",
6324 	    page->mp_pgno, mc->mc_txn->mt_env->me_psize));
6325 	/* Empty page: everything past the header is free space */
6326 	page->mp_flags = flags | P_DIRTY;
6327 	page->mp_upper = mc->mc_txn->mt_env->me_psize;
6328 	page->mp_lower = PAGEHDRSZ;
6329 	if (IS_BRANCH(page)) {
6330 		mc->mc_db->md_branch_pages++;
6331 	} else if (IS_LEAF(page)) {
6332 		mc->mc_db->md_leaf_pages++;
6333 	} else if (IS_OVERFLOW(page)) {
6334 		mc->mc_db->md_overflow_pages += num;
6335 		page->mp_pages = num;
6336 	}
6337 	*mp = page;
6338 	return 0;
6339 }
6340 
6341 /** Compute the space a leaf node needs on its page.
6342  * When key plus data exceed the environment's node size limit
6343  * (me_nodemax), the data goes to an overflow page and the node holds
6344  * a page number in its place. The result includes one slot of the
6345  * page's pointer array and is rounded up to an even number of bytes,
6346  * to guarantee 2-byte alignment of the #MDB_node headers.
6347  * @param[in] env The environment handle.
6348  * @param[in] key The key for the node.
6349  * @param[in] data The data for the node.
6350  * @return The number of bytes needed to store the node.
6351  */
6352 static size_t
6353 mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data)
6354 {
6355 	size_t need = LEAFSIZE(key, data);
6356 
6357 	/* Oversized: swap the data's size for that of a page number,
6358 	 * since only the overflow page number stays in the node.
6359 	 */
6360 	if (need > env->me_nodemax)
6361 		need = need - data->mv_size + sizeof(pgno_t);
6362 
6363 	return EVEN(need + sizeof(indx_t));
6364 }
6365 
6366 /** Compute the space a branch node needs on its page.
6367  * Spilling large keys onto overflow pages is not implemented, so
6368  * the environment's node size limit has no effect on the result:
6369  * it is always the #INDXSIZE of the key plus one slot of the
6370  * page's pointer array.
6371  * (No rounding is applied here, unlike in mdb_leaf_size().)
6372  * @param[in] env The environment handle.
6373  * @param[in] key The key for the node.
6374  * @return The number of bytes needed to store the node.
6375  */
6376 static size_t
6377 mdb_branch_size(MDB_env *env, MDB_val *key)
6378 {
6379 	size_t need = INDXSIZE(key);
6380 
6381 	if (need > env->me_nodemax) {
6382 		/* The key ought to go on an overflow page, but that
6383 		 * is not implemented; it would amount to:
6384 		 *	need -= key->size - sizeof(pgno_t);
6385 		 */
6386 	}
6387 
6388 	return sizeof(indx_t) + need;
6389 }
6390 
6391 /** Insert a new node into the page at the top of the cursor's stack.
6392  * On #P_LEAF2 pages only the fixed-size key is stored. On other leaf
6393  * pages, data too large for a node is placed on newly allocated
6394  * overflow pages and the node keeps just their page number.
6395  * @param[in] mc The cursor for this operation.
6396  * @param[in] indx The index on the page where the new node should be added.
6397  * @param[in] key The key for the new node.
6398  * @param[in] data The data for the new node, if any.
6399  * @param[in] pgno The page number, if adding a branch node.
6400  * @param[in] flags Flags for the node.
6401  * @return 0 on success, non-zero on failure. Possible errors are:
6402  * <ul>
6403  *	<li>the error from mdb_page_new(), if overflow pages can't be allocated.
6404  *	<li>MDB_PAGE_FULL - there is insufficient room in the page; the
6405  *	transaction is then flagged with MDB_TXN_ERROR.
6406  * </ul>
6407  */
6408 static int
6409 mdb_node_add(MDB_cursor *mc, indx_t indx,
6410     MDB_val *key, MDB_val *data, pgno_t pgno, unsigned int flags)
6411 {
6412 	MDB_page	*mp = mc->mc_pg[mc->mc_top];
6413 	MDB_page	*ofp = NULL;		/* overflow page, once allocated */
6414 	MDB_node	*node;
6415 	void		*payload;
6416 	size_t		 node_size = NODESIZE;
6417 	ssize_t		 room;
6418 	unsigned int	 slot;
6419 	indx_t		 ofs;
6420 	int		 spill = 0;	/* data must go to overflow pages */
6421 	DKBUF;
6422 
6423 	mdb_cassert(mc, mp->mp_upper >= mp->mp_lower);
6424 	DPRINTF(("add to %s %spage %"Z"u index %i, data size %"Z"u key size %"Z"u [%s]",
6425 	    IS_LEAF(mp) ? "leaf" : "branch",
6426 		IS_SUBP(mp) ? "sub-" : "",
6427 		mdb_dbg_pgno(mp), indx, data ? data->mv_size : 0,
6428 		key ? key->mv_size : 0, key ? DKEY(key) : "null"));
6429 
6430 	if (IS_LEAF2(mp)) {
6431 		/* Keys are packed back to back: open a gap at indx. */
6432 		int ksize = mc->mc_db->md_pad;
6433 		int tail = NUMKEYS(mp) - indx;
6434 		char *gap = LEAF2KEY(mp, indx, ksize);
6435 		if (tail > 0)
6436 			memmove(gap + ksize, gap, tail * ksize);
6437 		memcpy(gap, key->mv_data, ksize);
6438 
6439 		/* Just using these for counting */
6440 		mp->mp_lower += sizeof(indx_t);
6441 		mp->mp_upper -= ksize - sizeof(indx_t);
6442 		return MDB_SUCCESS;
6443 	}
6444 
6445 	room = (ssize_t)SIZELEFT(mp) - (ssize_t)sizeof(indx_t);
6446 	if (key != NULL)
6447 		node_size += key->mv_size;
6448 
6449 	if (IS_LEAF(mp)) {
6450 		mdb_cassert(mc, data);
6451 		if (F_ISSET(flags, F_BIGDATA)) {
6452 			/* Data already on overflow page. */
6453 			node_size += sizeof(pgno_t);
6454 		} else if (node_size + data->mv_size > mc->mc_txn->mt_env->me_nodemax) {
6455 			/* Put data on overflow page, allocated below. */
6456 			DPRINTF(("data size is %"Z"u, node would be %"Z"u, put data on overflow page",
6457 			    data->mv_size, node_size+data->mv_size));
6458 			node_size += sizeof(pgno_t);
6459 			spill = 1;
6460 		} else {
6461 			node_size += data->mv_size;
6462 		}
6463 	}
6464 	node_size = EVEN(node_size);
6465 	if ((ssize_t)node_size > room)
6466 		goto full;
6467 
6468 	if (spill) {
6469 		/* Allocate only now that the node is known to fit. */
6470 		int ovpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize);
6471 		int rc = mdb_page_new(mc, P_OVERFLOW, ovpages, &ofp);
6472 		if (rc)
6473 			return rc;
6474 		DPRINTF(("allocated overflow page %"Z"u", ofp->mp_pgno));
6475 		flags |= F_BIGDATA;
6476 	}
6477 
6478 	/* Open up slot indx in the pointer array. */
6479 	for (slot = NUMKEYS(mp); slot > indx; slot--)
6480 		mp->mp_ptrs[slot] = mp->mp_ptrs[slot - 1];
6481 
6482 	/* Carve the node out of the top of the free area. */
6483 	ofs = mp->mp_upper - node_size;
6484 	mdb_cassert(mc, ofs >= mp->mp_lower + sizeof(indx_t));
6485 	mp->mp_ptrs[indx] = ofs;
6486 	mp->mp_upper = ofs;
6487 	mp->mp_lower += sizeof(indx_t);
6488 
6489 	/* Fill in the node header and the key. */
6490 	node = NODEPTR(mp, indx);
6491 	node->mn_ksize = (key == NULL) ? 0 : key->mv_size;
6492 	node->mn_flags = flags;
6493 	if (IS_LEAF(mp))
6494 		SETDSZ(node,data->mv_size);
6495 	else
6496 		SETPGNO(node,pgno);
6497 	if (key)
6498 		memcpy(NODEKEY(node), key->mv_data, key->mv_size);
6499 	if (!IS_LEAF(mp))
6500 		return MDB_SUCCESS;
6501 
6502 	/* Leaf only: store the data, or the page number standing in for it. */
6503 	mdb_cassert(mc, key);
6504 	payload = node->mn_data + key->mv_size;
6505 	if (ofp != NULL) {
6506 		memcpy(payload, &ofp->mp_pgno, sizeof(pgno_t));
6507 		if (F_ISSET(flags, MDB_RESERVE))
6508 			data->mv_data = METADATA(ofp);
6509 		else
6510 			memcpy(METADATA(ofp), data->mv_data, data->mv_size);
6511 	} else if (F_ISSET(flags, F_BIGDATA)) {
6512 		memcpy(payload, data->mv_data, sizeof(pgno_t));
6513 	} else if (F_ISSET(flags, MDB_RESERVE)) {
6514 		data->mv_data = payload;
6515 	} else {
6516 		memcpy(payload, data->mv_data, data->mv_size);
6517 	}
6518 	return MDB_SUCCESS;
6519 
6520 full:
6521 	DPRINTF(("not enough room in page %"Z"u, got %u ptrs",
6522 		mdb_dbg_pgno(mp), NUMKEYS(mp)));
6523 	DPRINTF(("upper-lower = %u - %u = %"Z"d", mp->mp_upper,mp->mp_lower,room));
6524 	DPRINTF(("node size = %"Z"u", node_size));
6525 	mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
6526 	return MDB_PAGE_FULL;
6527 }
6528 
6529 /** Delete the node the cursor points at from its page.
6530  * That is node mc_ki[mc_top] of page mc_pg[mc_top]; the nodes left
6531  * on the page are packed again so the free space stays contiguous.
6532  * @param[in] mc Cursor pointing to the node to delete.
6533  * @param[in] ksize The size of a node. Only used if the page is
6534  * a #P_LEAF2 page (part of a #MDB_DUPFIXED database).
6535  */
6536 static void
6537 mdb_node_del(MDB_cursor *mc, int ksize)
6538 {
6539 	MDB_page *mp = mc->mc_pg[mc->mc_top];
6540 	indx_t	indx = mc->mc_ki[mc->mc_top];
6541 	indx_t	numkeys, src, dst, ptr;
6542 	unsigned int	 sz;
6543 	MDB_node	*node;
6544 	char		*base;
6545 
6546 	DPRINTF(("delete node %u on %s page %"Z"u", indx,
6547 	    IS_LEAF(mp) ? "leaf" : "branch", mdb_dbg_pgno(mp)));
6548 	numkeys = NUMKEYS(mp);
6549 	mdb_cassert(mc, indx < numkeys);
6550 
6551 	if (IS_LEAF2(mp)) {
6552 		/* Fixed-size keys: slide the following keys down one slot */
6553 		int following = numkeys - 1 - indx;
6554 		base = LEAF2KEY(mp, indx, ksize);
6555 		if (following != 0)
6556 			memmove(base, base + ksize, following * ksize);
6557 		mp->mp_lower -= sizeof(indx_t);
6558 		mp->mp_upper += ksize - sizeof(indx_t);
6559 		return;
6560 	}
6561 	/* Size of the doomed node, computed the way mdb_node_add() does */
6562 	node = NODEPTR(mp, indx);
6563 	sz = NODESIZE + node->mn_ksize;
6564 	if (IS_LEAF(mp))
6565 		sz += F_ISSET(node->mn_flags, F_BIGDATA) ? sizeof(pgno_t) : NODEDSZ(node);
6566 	sz = EVEN(sz);
6567 	/* Drop slot indx from the pointer array; nodes stored below the
6568 	 * deleted one are about to move up by sz, so bump their offsets.
6569 	 */
6570 	ptr = mp->mp_ptrs[indx];
6571 	for (src = dst = 0; src < numkeys; src++) {
6572 		indx_t off = mp->mp_ptrs[src];
6573 		if (src == indx)
6574 			continue;
6575 		if (off < ptr)
6576 			off += sz;
6577 		mp->mp_ptrs[dst++] = off;
6578 	}
6579 	/* Close the hole: shift the bytes between mp_upper and the node */
6580 	base = (char *)mp + mp->mp_upper;
6581 	memmove(base + sz, base, ptr - mp->mp_upper);
6582 
6583 	mp->mp_lower -= sizeof(indx_t);
6584 	mp->mp_upper += sz;
6585 }
6586 
6587 /** Compact the main page after deleting a node on a subpage, by squeezing the sub-page's free space out of the node that holds it.
6588  * @param[in] mp The main page to operate on.
6589  * @param[in] indx The index of the subpage on the main page (the node whose data is the subpage).
6590  */
6591 static void
6592 mdb_node_shrink(MDB_page *mp, indx_t indx)
6593 {
6594 	MDB_node *node;
6595 	MDB_page *sp, *xp;
6596 	char *base;
6597 	int nsize, delta;
6598 	indx_t		 i, numkeys, ptr;
6599 
6600 	node = NODEPTR(mp, indx);
6601 	sp = (MDB_page *)NODEDATA(node);	/* the sub-page is this node's data */
6602 	delta = SIZELEFT(sp);	/* amount the node will shrink by */
6603 	xp = (MDB_page *)((char *)sp + delta);	/* sub-page header position after the shift */
6604 
6605 	/* shift subpage upward. xp is sp + delta, so source and destination may overlap -- NOTE(review): the statement order below looks significant, confirm before reordering */
6606 	if (IS_LEAF2(sp)) {
6607 		nsize = NUMKEYS(sp) * sp->mp_pad;	/* bytes taken by the packed keys */
6608 		if (nsize & 1)
6609 			return;		/* do not make the node uneven-sized; nothing has been changed yet */
6610 		memmove(METADATA(xp), METADATA(sp), nsize);
6611 	} else {
6612 		int i;	/* signed so the countdown can end below zero; shadows the outer indx_t i */
6613 		numkeys = NUMKEYS(sp);
6614 		for (i=numkeys-1; i>=0; i--)	/* highest slot first */
6615 			xp->mp_ptrs[i] = sp->mp_ptrs[i] - delta;	/* each offset drops by delta */
6616 	}
6617 	xp->mp_upper = sp->mp_lower;	/* upper == lower afterwards */
6618 	xp->mp_lower = sp->mp_lower;
6619 	xp->mp_flags = sp->mp_flags;
6620 	xp->mp_pad = sp->mp_pad;
6621 	COPY_PGNO(xp->mp_pgno, mp->mp_pgno);	/* sub-page takes the main page's page number */
6622 
6623 	nsize = NODEDSZ(node) - delta;	/* node data size without the squeezed-out bytes */
6624 	SETDSZ(node, nsize);
6625 
6626 	/* shift lower nodes upward: every node at or below this node's offset moves up by delta */
6627 	ptr = mp->mp_ptrs[indx];
6628 	numkeys = NUMKEYS(mp);
6629 	for (i = 0; i < numkeys; i++) {
6630 		if (mp->mp_ptrs[i] <= ptr)
6631 			mp->mp_ptrs[i] += delta;
6632 	}
6633 
6634 	base = (char *)mp + mp->mp_upper;
6635 	memmove(base + delta, base, ptr - mp->mp_upper + NODESIZE + NODEKSZ(node));	/* from mp_upper through this node's header and key */
6636 	mp->mp_upper += delta;
6637 }
6638 
6639 /** First-stage setup of the sub-cursor used for sorted duplicates.
6640  * The duplicates of a key are kept as the keys of a sub-database and
6641  * are reached through the sub-cursor inside the main cursor's
6642  * #MDB_xcursor. Only fields derivable from the parent cursor are set
6643  * here; mdb_xcursor_init1() finishes the job for a specific node.
6644  * @param[in] mc The main cursor whose sorted-dups cursor is to be initialized.
6645  */
6646 static void
6647 mdb_xcursor_init0(MDB_cursor *mc)
6648 {
6649 	MDB_xcursor *mx = mc->mc_xcursor;
6650 	MDB_cursor *sub = &mx->mx_cursor;
6651 
6652 	/* Same txn and dbi as the parent, but its own DB record */
6653 	sub->mc_txn = mc->mc_txn;
6654 	sub->mc_dbi = mc->mc_dbi;
6655 	sub->mc_db = &mx->mx_db;
6656 	sub->mc_dbx = &mx->mx_dbx;
6657 	sub->mc_dbflag = &mx->mx_dbflag;
6658 	sub->mc_xcursor = NULL;
6659 	sub->mc_snum = 0;
6660 	sub->mc_top = 0;
6661 	sub->mc_flags = C_SUB;
6662 	mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp;	/* dups sort by the data comparator */
6663 	mx->mx_dbx.md_dcmp = NULL;
6664 	mx->mx_dbx.md_rel = mc->mc_dbx->md_rel;
6665 	mx->mx_dbx.md_name.mv_size = 0;
6666 	mx->mx_dbx.md_name.mv_data = NULL;
6667 }
6668 
6669 /** Second-stage setup of a sorted-dups cursor, for one node.
6670  * An #F_SUBDATA node holds an #MDB_db record, which is copied; else the
6671  * data is a sub-page and a one-leaf DB record is made up to describe it.
6672  * @param[in] mc The main cursor whose sorted-dups cursor is to be initialized.
6673  * @param[in] node The node whose data is the #MDB_db record or sub-page.
6674  */
6675 static void
6676 mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node)
6677 {
6678 	MDB_xcursor *mx = mc->mc_xcursor;
6679 	MDB_cursor *sub = &mx->mx_cursor;
6680 	MDB_db *subdb = &mx->mx_db;
6681 	if (node->mn_flags & F_SUBDATA) {	/* real sub-DB, cursor starts unpositioned */
6682 		memcpy(subdb, NODEDATA(node), sizeof(MDB_db));
6683 		sub->mc_pg[0] = 0;
6684 		sub->mc_snum = 0;
6685 		sub->mc_top = 0;
6686 		sub->mc_flags = C_SUB;
6687 	} else {	/* sub-page: make up a DB of depth 1 whose only leaf is fp */
6688 		MDB_page *fp = NODEDATA(node);
6689 		subdb->md_pad = mc->mc_pg[mc->mc_top]->mp_pad;
6690 		subdb->md_flags = 0;
6691 		if (mc->mc_db->md_flags & MDB_DUPFIXED) {
6692 			subdb->md_pad = fp->mp_pad;
6693 			subdb->md_flags = MDB_DUPFIXED;
6694 			if (mc->mc_db->md_flags & MDB_INTEGERDUP) subdb->md_flags |= MDB_INTEGERKEY;
6695 		}
6696 		subdb->md_depth = 1;
6697 		subdb->md_branch_pages = 0;
6698 		subdb->md_leaf_pages = 1;
6699 		subdb->md_overflow_pages = 0;
6700 		subdb->md_entries = NUMKEYS(fp);
6701 		COPY_PGNO(subdb->md_root, fp->mp_pgno);
6702 		sub->mc_pg[0] = fp;
6703 		sub->mc_ki[0] = 0;
6704 		sub->mc_snum = 1;
6705 		sub->mc_top = 0;
6706 		sub->mc_flags = C_INITIALIZED|C_SUB;
6707 	}
6708 	DPRINTF(("Sub-db -%u root page %"Z"u", mx->mx_cursor.mc_dbi,
6709 		mx->mx_db.md_root));
6710 	mx->mx_dbflag = DB_VALID|DB_DIRTY; /* DB_DIRTY guides mdb_cursor_touch */
6711 #if UINT_MAX < SIZE_MAX
6712 	if (mx->mx_dbx.md_cmp == mdb_cmp_int && mx->mx_db.md_pad == sizeof(size_t))
6713 #ifdef MISALIGNED_OK
6714 		mx->mx_dbx.md_cmp = mdb_cmp_long;
6715 #else
6716 		mx->mx_dbx.md_cmp = mdb_cmp_cint;
6717 #endif
6718 #endif
6719 }
6720 
6721 /** Initialize a cursor for a given transaction and database.
6722  * @p mx is only used, and must not be NULL, for #MDB_DUPSORT databases. */
6723 static void
6724 mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx)
6725 {
6726 	mc->mc_txn = txn;
6727 	mc->mc_dbi = dbi;
6728 	mc->mc_db = &txn->mt_dbs[dbi];
6729 	mc->mc_dbx = &txn->mt_dbxs[dbi];
6730 	mc->mc_dbflag = &txn->mt_dbflags[dbi];
6731 	mc->mc_next = NULL;
6732 	mc->mc_backup = NULL;
6733 	mc->mc_pg[0] = 0;
6734 	mc->mc_snum = 0;
6735 	mc->mc_top = 0;
6736 	mc->mc_flags = 0;
6737 	if (!(mc->mc_db->md_flags & MDB_DUPSORT)) {
6738 		mc->mc_xcursor = NULL;
6739 	} else {
6740 		mdb_tassert(txn, mx != NULL);
6741 		mc->mc_xcursor = mx;
6742 		mdb_xcursor_init0(mc);
6743 	}
6744 	if (*mc->mc_dbflag & DB_STALE)	/* stale DB record */
6745 		mdb_page_search(mc, NULL, MDB_PS_ROOTONLY);
6746 }
6747 
6748 /* Create a cursor on database dbi of txn and hand it back in *ret.
6749  * If the txn keeps a cursor list (mt_cursors) the new cursor is
6750  * linked into it and flagged C_UNTRACK. */
6751 int
6752 mdb_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **ret)
6753 {
6754 	MDB_cursor	*mc;
6755 	size_t size;
6756 
6757 	if (txn == NULL || ret == NULL)
6758 		return EINVAL;
6759 	if (dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID))
6760 		return EINVAL;
6761 	if (txn->mt_flags & MDB_TXN_ERROR)
6762 		return MDB_BAD_TXN;
6763 	/* Allow read access to the freelist */
6764 	if (dbi == 0 && !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))
6765 		return EINVAL;
6766 
6767 	size = sizeof(MDB_cursor);
6768 	if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT)
6769 		size += sizeof(MDB_xcursor);	/* sub-cursor sits right behind */
6770 	mc = malloc(size);
6771 	if (mc == NULL)
6772 		return ENOMEM;
6773 	mdb_cursor_init(mc, txn, dbi, (MDB_xcursor *)(mc + 1));
6774 	if (txn->mt_cursors) {
6775 		mc->mc_next = txn->mt_cursors[dbi];
6776 		txn->mt_cursors[dbi] = mc;
6777 		mc->mc_flags |= C_UNTRACK;
6778 	}
6779 	*ret = mc;
6780 	return MDB_SUCCESS;
6781 }
6782 
6783 int
6784 mdb_cursor_renew(MDB_txn *txn, MDB_cursor *mc)
6785 {
6786 	if (txn == NULL || mc == NULL || mc->mc_dbi >= txn->mt_numdbs)
6787 		return EINVAL;
6788 
6789 	if ((mc->mc_flags & C_UNTRACK) || txn->mt_cursors)
6790 		return EINVAL;
6791 
6792 	mdb_cursor_init(mc, txn, mc->mc_dbi, mc->mc_xcursor);
6793 	return MDB_SUCCESS;
6794 }
6795 
6796 /* Return the count of duplicate data items for the current key */
6797 int
6798 mdb_cursor_count(MDB_cursor *mc, size_t *countp)
6799 {
6800 	MDB_node	*leaf;
6801 
6802 	if (mc == NULL || countp == NULL)
6803 		return EINVAL;
6804 
6805 	if (mc->mc_xcursor == NULL)
6806 		return MDB_INCOMPATIBLE;
6807 
6808 	leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
6809 	if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
6810 		*countp = 1;
6811 	} else {
6812 		if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))
6813 			return EINVAL;
6814 
6815 		*countp = mc->mc_xcursor->mx_db.md_entries;
6816 	}
6817 	return MDB_SUCCESS;
6818 }
6819 
6820 void
6821 mdb_cursor_close(MDB_cursor *mc)
6822 {
6823 	if (mc && !mc->mc_backup) {
6824 		/* remove from txn, if tracked */
6825 		if ((mc->mc_flags & C_UNTRACK) && mc->mc_txn->mt_cursors) {
6826 			MDB_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi];
6827 			while (*prev && *prev != mc) prev = &(*prev)->mc_next;
6828 			if (*prev == mc)
6829 				*prev = mc->mc_next;
6830 		}
6831 		free(mc);
6832 	}
6833 }
6834 
6835 MDB_txn *
6836 mdb_cursor_txn(MDB_cursor *mc)
6837 {
6838 	if (!mc) return NULL;
6839 	return mc->mc_txn;
6840 }
6841 
6842 MDB_dbi
6843 mdb_cursor_dbi(MDB_cursor *mc)
6844 {
6845 	return mc->mc_dbi;
6846 }
6847 
/** Replace the key for a branch node with a new key.
 *	The node addressed by the cursor's top page and index gets its key
 *	overwritten in place. If the even-rounded key length changes, the
 *	node header and everything stored below it in the page are shifted
 *	to grow or shrink the key area. If the page lacks room for a longer
 *	key, the node is deleted and re-inserted through a page split.
 * @param[in] mc Cursor pointing to the node to operate on.
 * @param[in] key The new key to use.
 * @return 0 on success, non-zero on failure.
 */
static int
mdb_update_key(MDB_cursor *mc, MDB_val *key)
{
	MDB_page		*mp;
	MDB_node		*node;
	char			*base;
	size_t			 len;
	int				 delta, ksize, oksize;
	indx_t			 ptr, i, numkeys, indx;
	DKBUF;

	indx = mc->mc_ki[mc->mc_top];
	mp = mc->mc_pg[mc->mc_top];
	node = NODEPTR(mp, indx);
	/* ptr is the node's current byte offset within the page */
	ptr = mp->mp_ptrs[indx];
#if MDB_DEBUG
	{
		MDB_val	k2;
		char kbuf2[DKBUF_MAXKEYSIZE*2+1];
		k2.mv_data = NODEKEY(node);
		k2.mv_size = node->mn_ksize;
		DPRINTF(("update key %u (ofs %u) [%s] to [%s] on page %"Z"u",
			indx, ptr,
			mdb_dkey(&k2, kbuf2),
			DKEY(key),
			mp->mp_pgno));
	}
#endif

	/* Sizes must be 2-byte aligned. */
	ksize = EVEN(key->mv_size);
	oksize = EVEN(node->mn_ksize);
	/* positive delta: the key grows; negative: it shrinks */
	delta = ksize - oksize;

	/* Shift node contents if EVEN(key length) changed. */
	if (delta) {
		if (delta > 0 && SIZELEFT(mp) < delta) {
			pgno_t pgno;
			/* not enough space left, do a delete and split */
			DPRINTF(("Not enough room, delta = %d, splitting...", delta));
			/* remember the child page number before the node is removed */
			pgno = NODEPGNO(node);
			mdb_node_del(mc, 0);
			return mdb_page_split(mc, key, NULL, pgno, MDB_SPLIT_REPLACE);
		}

		/* Every node stored at or below this node's offset is about to
		 * move by delta bytes, so adjust its slot in the pointer array.
		 */
		numkeys = NUMKEYS(mp);
		for (i = 0; i < numkeys; i++) {
			if (mp->mp_ptrs[i] <= ptr)
				mp->mp_ptrs[i] -= delta;
		}

		/* Slide the bytes from mp_upper through this node's header by
		 * delta; the key area following the header changes size
		 * accordingly.
		 */
		base = (char *)mp + mp->mp_upper;
		len = ptr - mp->mp_upper + NODESIZE;
		memmove(base - delta, base, len);
		mp->mp_upper -= delta;

		/* the node moved; look it up again through the updated pointer */
		node = NODEPTR(mp, indx);
	}

	/* But even if no shift was needed, update ksize */
	if (node->mn_ksize != key->mv_size)
		node->mn_ksize = key->mv_size;

	/* An empty key leaves nothing to copy. */
	if (key->mv_size)
		memcpy(NODEKEY(node), key->mv_data, key->mv_size);

	return MDB_SUCCESS;
}
6921 
/* Forward declaration: mdb_node_move() below calls mdb_cursor_copy(),
 * whose definition appears later in this file.
 */
static void
mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst);
6924 
6925 /** Move a node from csrc to cdst.
6926  */
6927 static int
6928 mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst)
6929 {
6930 	MDB_node		*srcnode;
6931 	MDB_val		 key, data;
6932 	pgno_t	srcpg;
6933 	MDB_cursor mn;
6934 	int			 rc;
6935 	unsigned short flags;
6936 
6937 	DKBUF;
6938 
6939 	/* Mark src and dst as dirty. */
6940 	if ((rc = mdb_page_touch(csrc)) ||
6941 	    (rc = mdb_page_touch(cdst)))
6942 		return rc;
6943 
6944 	if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
6945 		key.mv_size = csrc->mc_db->md_pad;
6946 		key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top], key.mv_size);
6947 		data.mv_size = 0;
6948 		data.mv_data = NULL;
6949 		srcpg = 0;
6950 		flags = 0;
6951 	} else {
6952 		srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top]);
6953 		mdb_cassert(csrc, !((size_t)srcnode & 1));
6954 		srcpg = NODEPGNO(srcnode);
6955 		flags = srcnode->mn_flags;
6956 		if (csrc->mc_ki[csrc->mc_top] == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
6957 			unsigned int snum = csrc->mc_snum;
6958 			MDB_node *s2;
6959 			/* must find the lowest key below src */
6960 			mdb_page_search_lowest(csrc);
6961 			if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
6962 				key.mv_size = csrc->mc_db->md_pad;
6963 				key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size);
6964 			} else {
6965 				s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0);
6966 				key.mv_size = NODEKSZ(s2);
6967 				key.mv_data = NODEKEY(s2);
6968 			}
6969 			csrc->mc_snum = snum--;
6970 			csrc->mc_top = snum;
6971 		} else {
6972 			key.mv_size = NODEKSZ(srcnode);
6973 			key.mv_data = NODEKEY(srcnode);
6974 		}
6975 		data.mv_size = NODEDSZ(srcnode);
6976 		data.mv_data = NODEDATA(srcnode);
6977 	}
6978 	if (IS_BRANCH(cdst->mc_pg[cdst->mc_top]) && cdst->mc_ki[cdst->mc_top] == 0) {
6979 		unsigned int snum = cdst->mc_snum;
6980 		MDB_node *s2;
6981 		MDB_val bkey;
6982 		/* must find the lowest key below dst */
6983 		mdb_page_search_lowest(cdst);
6984 		if (IS_LEAF2(cdst->mc_pg[cdst->mc_top])) {
6985 			bkey.mv_size = cdst->mc_db->md_pad;
6986 			bkey.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, bkey.mv_size);
6987 		} else {
6988 			s2 = NODEPTR(cdst->mc_pg[cdst->mc_top], 0);
6989 			bkey.mv_size = NODEKSZ(s2);
6990 			bkey.mv_data = NODEKEY(s2);
6991 		}
6992 		cdst->mc_snum = snum--;
6993 		cdst->mc_top = snum;
6994 		mdb_cursor_copy(cdst, &mn);
6995 		mn.mc_ki[snum] = 0;
6996 		rc = mdb_update_key(&mn, &bkey);
6997 		if (rc)
6998 			return rc;
6999 	}
7000 
7001 	DPRINTF(("moving %s node %u [%s] on page %"Z"u to node %u on page %"Z"u",
7002 	    IS_LEAF(csrc->mc_pg[csrc->mc_top]) ? "leaf" : "branch",
7003 	    csrc->mc_ki[csrc->mc_top],
7004 		DKEY(&key),
7005 	    csrc->mc_pg[csrc->mc_top]->mp_pgno,
7006 	    cdst->mc_ki[cdst->mc_top], cdst->mc_pg[cdst->mc_top]->mp_pgno));
7007 
7008 	/* Add the node to the destination page.
7009 	 */
7010 	rc = mdb_node_add(cdst, cdst->mc_ki[cdst->mc_top], &key, &data, srcpg, flags);
7011 	if (rc != MDB_SUCCESS)
7012 		return rc;
7013 
7014 	/* Delete the node from the source page.
7015 	 */
7016 	mdb_node_del(csrc, key.mv_size);
7017 
7018 	{
7019 		/* Adjust other cursors pointing to mp */
7020 		MDB_cursor *m2, *m3;
7021 		MDB_dbi dbi = csrc->mc_dbi;
7022 		MDB_page *mp = csrc->mc_pg[csrc->mc_top];
7023 
7024 		for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
7025 			if (csrc->mc_flags & C_SUB)
7026 				m3 = &m2->mc_xcursor->mx_cursor;
7027 			else
7028 				m3 = m2;
7029 			if (m3 == csrc) continue;
7030 			if (m3->mc_pg[csrc->mc_top] == mp && m3->mc_ki[csrc->mc_top] ==
7031 				csrc->mc_ki[csrc->mc_top]) {
7032 				m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top];
7033 				m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top];
7034 			}
7035 		}
7036 	}
7037 
7038 	/* Update the parent separators.
7039 	 */
7040 	if (csrc->mc_ki[csrc->mc_top] == 0) {
7041 		if (csrc->mc_ki[csrc->mc_top-1] != 0) {
7042 			if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
7043 				key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size);
7044 			} else {
7045 				srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], 0);
7046 				key.mv_size = NODEKSZ(srcnode);
7047 				key.mv_data = NODEKEY(srcnode);
7048 			}
7049 			DPRINTF(("update separator for source page %"Z"u to [%s]",
7050 				csrc->mc_pg[csrc->mc_top]->mp_pgno, DKEY(&key)));
7051 			mdb_cursor_copy(csrc, &mn);
7052 			mn.mc_snum--;
7053 			mn.mc_top--;
7054 			if ((rc = mdb_update_key(&mn, &key)) != MDB_SUCCESS)
7055 				return rc;
7056 		}
7057 		if (IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
7058 			MDB_val	 nullkey;
7059 			indx_t	ix = csrc->mc_ki[csrc->mc_top];
7060 			nullkey.mv_size = 0;
7061 			csrc->mc_ki[csrc->mc_top] = 0;
7062 			rc = mdb_update_key(csrc, &nullkey);
7063 			csrc->mc_ki[csrc->mc_top] = ix;
7064 			mdb_cassert(csrc, rc == MDB_SUCCESS);
7065 		}
7066 	}
7067 
7068 	if (cdst->mc_ki[cdst->mc_top] == 0) {
7069 		if (cdst->mc_ki[cdst->mc_top-1] != 0) {
7070 			if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
7071 				key.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, key.mv_size);
7072 			} else {
7073 				srcnode = NODEPTR(cdst->mc_pg[cdst->mc_top], 0);
7074 				key.mv_size = NODEKSZ(srcnode);
7075 				key.mv_data = NODEKEY(srcnode);
7076 			}
7077 			DPRINTF(("update separator for destination page %"Z"u to [%s]",
7078 				cdst->mc_pg[cdst->mc_top]->mp_pgno, DKEY(&key)));
7079 			mdb_cursor_copy(cdst, &mn);
7080 			mn.mc_snum--;
7081 			mn.mc_top--;
7082 			if ((rc = mdb_update_key(&mn, &key)) != MDB_SUCCESS)
7083 				return rc;
7084 		}
7085 		if (IS_BRANCH(cdst->mc_pg[cdst->mc_top])) {
7086 			MDB_val	 nullkey;
7087 			indx_t	ix = cdst->mc_ki[cdst->mc_top];
7088 			nullkey.mv_size = 0;
7089 			cdst->mc_ki[cdst->mc_top] = 0;
7090 			rc = mdb_update_key(cdst, &nullkey);
7091 			cdst->mc_ki[cdst->mc_top] = ix;
7092 			mdb_cassert(csrc, rc == MDB_SUCCESS);
7093 		}
7094 	}
7095 
7096 	return MDB_SUCCESS;
7097 }
7098 
7099 /** Merge one page into another.
7100  *  The nodes from the page pointed to by \b csrc will
7101  *	be copied to the page pointed to by \b cdst and then
7102  *	the \b csrc page will be freed.
7103  * @param[in] csrc Cursor pointing to the source page.
7104  * @param[in] cdst Cursor pointing to the destination page.
7105  */
7106 static int
7107 mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst)
7108 {
7109 	int			 rc;
7110 	indx_t			 i, j;
7111 	MDB_node		*srcnode;
7112 	MDB_val		 key, data;
7113 	unsigned	nkeys;
7114 
7115 	DPRINTF(("merging page %"Z"u into %"Z"u", csrc->mc_pg[csrc->mc_top]->mp_pgno,
7116 		cdst->mc_pg[cdst->mc_top]->mp_pgno));
7117 
7118 	mdb_cassert(csrc, csrc->mc_snum > 1);	/* can't merge root page */
7119 	mdb_cassert(csrc, cdst->mc_snum > 1);
7120 
7121 	/* Mark dst as dirty. */
7122 	if ((rc = mdb_page_touch(cdst)))
7123 		return rc;
7124 
7125 	/* Move all nodes from src to dst.
7126 	 */
7127 	j = nkeys = NUMKEYS(cdst->mc_pg[cdst->mc_top]);
7128 	if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
7129 		key.mv_size = csrc->mc_db->md_pad;
7130 		key.mv_data = METADATA(csrc->mc_pg[csrc->mc_top]);
7131 		for (i = 0; i < NUMKEYS(csrc->mc_pg[csrc->mc_top]); i++, j++) {
7132 			rc = mdb_node_add(cdst, j, &key, NULL, 0, 0);
7133 			if (rc != MDB_SUCCESS)
7134 				return rc;
7135 			key.mv_data = (char *)key.mv_data + key.mv_size;
7136 		}
7137 	} else {
7138 		for (i = 0; i < NUMKEYS(csrc->mc_pg[csrc->mc_top]); i++, j++) {
7139 			srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], i);
7140 			if (i == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
7141 				unsigned int snum = csrc->mc_snum;
7142 				MDB_node *s2;
7143 				/* must find the lowest key below src */
7144 				mdb_page_search_lowest(csrc);
7145 				if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
7146 					key.mv_size = csrc->mc_db->md_pad;
7147 					key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size);
7148 				} else {
7149 					s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0);
7150 					key.mv_size = NODEKSZ(s2);
7151 					key.mv_data = NODEKEY(s2);
7152 				}
7153 				csrc->mc_snum = snum--;
7154 				csrc->mc_top = snum;
7155 			} else {
7156 				key.mv_size = srcnode->mn_ksize;
7157 				key.mv_data = NODEKEY(srcnode);
7158 			}
7159 
7160 			data.mv_size = NODEDSZ(srcnode);
7161 			data.mv_data = NODEDATA(srcnode);
7162 			rc = mdb_node_add(cdst, j, &key, &data, NODEPGNO(srcnode), srcnode->mn_flags);
7163 			if (rc != MDB_SUCCESS)
7164 				return rc;
7165 		}
7166 	}
7167 
7168 	DPRINTF(("dst page %"Z"u now has %u keys (%.1f%% filled)",
7169 	    cdst->mc_pg[cdst->mc_top]->mp_pgno, NUMKEYS(cdst->mc_pg[cdst->mc_top]),
7170 		(float)PAGEFILL(cdst->mc_txn->mt_env, cdst->mc_pg[cdst->mc_top]) / 10));
7171 
7172 	/* Unlink the src page from parent and add to free list.
7173 	 */
7174 	csrc->mc_top--;
7175 	mdb_node_del(csrc, 0);
7176 	if (csrc->mc_ki[csrc->mc_top] == 0) {
7177 		key.mv_size = 0;
7178 		rc = mdb_update_key(csrc, &key);
7179 		if (rc) {
7180 			csrc->mc_top++;
7181 			return rc;
7182 		}
7183 	}
7184 	csrc->mc_top++;
7185 
7186 	rc = mdb_midl_append(&csrc->mc_txn->mt_free_pgs,
7187 		csrc->mc_pg[csrc->mc_top]->mp_pgno);
7188 	if (rc)
7189 		return rc;
7190 	if (IS_LEAF(csrc->mc_pg[csrc->mc_top]))
7191 		csrc->mc_db->md_leaf_pages--;
7192 	else
7193 		csrc->mc_db->md_branch_pages--;
7194 	{
7195 		/* Adjust other cursors pointing to mp */
7196 		MDB_cursor *m2, *m3;
7197 		MDB_dbi dbi = csrc->mc_dbi;
7198 		MDB_page *mp = cdst->mc_pg[cdst->mc_top];
7199 
7200 		for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
7201 			if (csrc->mc_flags & C_SUB)
7202 				m3 = &m2->mc_xcursor->mx_cursor;
7203 			else
7204 				m3 = m2;
7205 			if (m3 == csrc) continue;
7206 			if (m3->mc_snum < csrc->mc_snum) continue;
7207 			if (m3->mc_pg[csrc->mc_top] == csrc->mc_pg[csrc->mc_top]) {
7208 				m3->mc_pg[csrc->mc_top] = mp;
7209 				m3->mc_ki[csrc->mc_top] += nkeys;
7210 			}
7211 		}
7212 	}
7213 	mdb_cursor_pop(csrc);
7214 
7215 	return mdb_rebalance(csrc);
7216 }
7217 
7218 /** Copy the contents of a cursor.
7219  * @param[in] csrc The cursor to copy from.
7220  * @param[out] cdst The cursor to copy to.
7221  */
7222 static void
7223 mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst)
7224 {
7225 	unsigned int i;
7226 
7227 	cdst->mc_txn = csrc->mc_txn;
7228 	cdst->mc_dbi = csrc->mc_dbi;
7229 	cdst->mc_db  = csrc->mc_db;
7230 	cdst->mc_dbx = csrc->mc_dbx;
7231 	cdst->mc_snum = csrc->mc_snum;
7232 	cdst->mc_top = csrc->mc_top;
7233 	cdst->mc_flags = csrc->mc_flags;
7234 
7235 	for (i=0; i<csrc->mc_snum; i++) {
7236 		cdst->mc_pg[i] = csrc->mc_pg[i];
7237 		cdst->mc_ki[i] = csrc->mc_ki[i];
7238 	}
7239 }
7240 
/** Rebalance the tree after a delete operation.
 *	Nothing is done if the cursor's top page is still at or above the
 *	fill threshold and holds the minimum number of keys. A root page is
 *	either freed (tree became empty), collapsed into its only child, or
 *	left alone. Any other page borrows one node from a sibling or is
 *	merged with it.
 * @param[in] mc Cursor pointing to the page where rebalancing
 * should begin.
 * @return 0 on success, non-zero on failure.
 */
static int
mdb_rebalance(MDB_cursor *mc)
{
	MDB_node	*node;
	int rc;
	unsigned int ptop, minkeys;
	MDB_cursor	mn;

	/* a branch page needs at least 2 keys, a leaf page at least 1 */
	minkeys = 1 + (IS_BRANCH(mc->mc_pg[mc->mc_top]));
	DPRINTF(("rebalancing %s page %"Z"u (has %u keys, %.1f%% full)",
	    IS_LEAF(mc->mc_pg[mc->mc_top]) ? "leaf" : "branch",
	    mdb_dbg_pgno(mc->mc_pg[mc->mc_top]), NUMKEYS(mc->mc_pg[mc->mc_top]),
		(float)PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) / 10));

	if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= FILL_THRESHOLD &&
		NUMKEYS(mc->mc_pg[mc->mc_top]) >= minkeys) {
		DPRINTF(("no need to rebalance page %"Z"u, above fill threshold",
		    mdb_dbg_pgno(mc->mc_pg[mc->mc_top])));
		return MDB_SUCCESS;
	}

	/* Fewer than two stack levels: the page is the root (or a subpage). */
	if (mc->mc_snum < 2) {
		MDB_page *mp = mc->mc_pg[0];
		if (IS_SUBP(mp)) {
			DPUTS("Can't rebalance a subpage, ignoring");
			return MDB_SUCCESS;
		}
		if (NUMKEYS(mp) == 0) {
			DPUTS("tree is completely empty");
			mc->mc_db->md_root = P_INVALID;
			mc->mc_db->md_depth = 0;
			mc->mc_db->md_leaf_pages = 0;
			rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno);
			if (rc)
				return rc;
			/* Adjust cursors pointing to mp */
			mc->mc_snum = 0;
			mc->mc_top = 0;
			mc->mc_flags &= ~C_INITIALIZED;
			{
				MDB_cursor *m2, *m3;
				MDB_dbi dbi = mc->mc_dbi;

				for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
					/* for a sub-database, compare against the sub-cursors */
					if (mc->mc_flags & C_SUB)
						m3 = &m2->mc_xcursor->mx_cursor;
					else
						m3 = m2;
					/* mc_snum was just set to 0, so this never skips */
					if (m3->mc_snum < mc->mc_snum) continue;
					if (m3->mc_pg[0] == mp) {
						m3->mc_snum = 0;
						m3->mc_top = 0;
						m3->mc_flags &= ~C_INITIALIZED;
					}
				}
			}
		} else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) {
			DPUTS("collapsing root page!");
			rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno);
			if (rc)
				return rc;
			/* the only child becomes the new root */
			mc->mc_db->md_root = NODEPGNO(NODEPTR(mp, 0));
			rc = mdb_page_get(mc->mc_txn,mc->mc_db->md_root,&mc->mc_pg[0],NULL);
			if (rc)
				return rc;
			mc->mc_db->md_depth--;
			mc->mc_db->md_branch_pages--;
			mc->mc_ki[0] = mc->mc_ki[1];
			{
				/* Adjust other cursors pointing to mp */
				MDB_cursor *m2, *m3;
				MDB_dbi dbi = mc->mc_dbi;

				for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
					if (mc->mc_flags & C_SUB)
						m3 = &m2->mc_xcursor->mx_cursor;
					else
						m3 = m2;
					if (m3 == mc || m3->mc_snum < mc->mc_snum) continue;
					if (m3->mc_pg[0] == mp) {
						int i;
						/* drop the old root: shift every level up by one */
						m3->mc_snum--;
						m3->mc_top--;
						for (i=0; i<m3->mc_snum; i++) {
							m3->mc_pg[i] = m3->mc_pg[i+1];
							m3->mc_ki[i] = m3->mc_ki[i+1];
						}
					}
				}
			}
		} else
			DPUTS("root page doesn't need rebalancing");
		return MDB_SUCCESS;
	}

	/* The parent (branch page) must have at least 2 pointers,
	 * otherwise the tree is invalid.
	 */
	ptop = mc->mc_top-1;
	mdb_cassert(mc, NUMKEYS(mc->mc_pg[ptop]) > 1);

	/* Leaf page fill factor is below the threshold.
	 * Try to move keys from left or right neighbor, or
	 * merge with a neighbor page.
	 */

	/* Find neighbors.
	 * mn is a scratch copy of mc that gets pointed at the sibling page.
	 */
	mdb_cursor_copy(mc, &mn);
	mn.mc_xcursor = NULL;

	if (mc->mc_ki[ptop] == 0) {
		/* We're the leftmost leaf in our parent.
		 */
		DPUTS("reading right neighbor");
		mn.mc_ki[ptop]++;
		node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]);
		rc = mdb_page_get(mc->mc_txn,NODEPGNO(node),&mn.mc_pg[mn.mc_top],NULL);
		if (rc)
			return rc;
		/* take the neighbor's first node; it goes at the end of our page */
		mn.mc_ki[mn.mc_top] = 0;
		mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]);
	} else {
		/* There is at least one neighbor to the left.
		 */
		DPUTS("reading left neighbor");
		mn.mc_ki[ptop]--;
		node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]);
		rc = mdb_page_get(mc->mc_txn,NODEPGNO(node),&mn.mc_pg[mn.mc_top],NULL);
		if (rc)
			return rc;
		/* take the neighbor's last node; it goes at the front of our page */
		mn.mc_ki[mn.mc_top] = NUMKEYS(mn.mc_pg[mn.mc_top]) - 1;
		mc->mc_ki[mc->mc_top] = 0;
	}

	DPRINTF(("found neighbor page %"Z"u (%u keys, %.1f%% full)",
	    mn.mc_pg[mn.mc_top]->mp_pgno, NUMKEYS(mn.mc_pg[mn.mc_top]),
		(float)PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) / 10));

	/* If the neighbor page is above threshold and has enough keys,
	 * move one key from it. Otherwise we should try to merge them.
	 * (A branch page must never have less than 2 keys.)
	 */
	minkeys = 1 + (IS_BRANCH(mn.mc_pg[mn.mc_top]));
	if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= FILL_THRESHOLD && NUMKEYS(mn.mc_pg[mn.mc_top]) > minkeys)
		return mdb_node_move(&mn, mc);
	else {
		if (mc->mc_ki[ptop] == 0)
			/* right neighbor is merged into our page */
			rc = mdb_page_merge(&mn, mc);
		else {
			/* our page is merged into the left neighbor, and the
			 * cursor then continues from the neighbor's position
			 */
			mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1;
			rc = mdb_page_merge(mc, &mn);
			mdb_cursor_copy(&mn, mc);
		}
		mc->mc_flags &= ~(C_INITIALIZED|C_EOF);
	}
	return rc;
}
7404 
/** Complete a delete operation started by #mdb_cursor_del().
 *	Frees the overflow page of a big-data node, removes the node from
 *	its page, decrements the entry count and rebalances the tree. On
 *	success the other cursors in the txn that point at the same page
 *	are adjusted; on failure the txn is flagged with #MDB_TXN_ERROR.
 * @param[in] mc Cursor positioned at the node to delete.
 * @param[in] leaf The node being deleted; only examined for #F_BIGDATA
 * when the page is not a LEAF2 page.
 * @return 0 on success, non-zero on failure.
 */
static int
mdb_cursor_del0(MDB_cursor *mc, MDB_node *leaf)
{
	int rc;
	MDB_page *mp;
	indx_t ki;
	unsigned int nkeys;

	/* remember the page and index of the deleted node for the fix-ups below */
	mp = mc->mc_pg[mc->mc_top];
	ki = mc->mc_ki[mc->mc_top];

	/* add overflow pages to free list */
	if (!IS_LEAF2(mp) && F_ISSET(leaf->mn_flags, F_BIGDATA)) {
		MDB_page *omp;
		pgno_t pg;

		/* the node's data area holds the overflow page number */
		memcpy(&pg, NODEDATA(leaf), sizeof(pg));
		if ((rc = mdb_page_get(mc->mc_txn, pg, &omp, NULL)) ||
			(rc = mdb_ovpage_free(mc, omp)))
			return rc;
	}
	mdb_node_del(mc, mc->mc_db->md_pad);
	mc->mc_db->md_entries--;
	rc = mdb_rebalance(mc);
	if (rc != MDB_SUCCESS)
		mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
	else {
		MDB_cursor *m2, *m3;
		MDB_dbi dbi = mc->mc_dbi;

		/* re-read the top page after the rebalance */
		mp = mc->mc_pg[mc->mc_top];
		nkeys = NUMKEYS(mp);

		/* if mc points past last node in page, find next sibling */
		if (mc->mc_ki[mc->mc_top] >= nkeys)
			mdb_cursor_sibling(mc, 1);

		/* Adjust other cursors pointing to mp */
		for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
			m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2;
			/* skip unless both the cursor and the one compared are initialized */
			if (! (m2->mc_flags & m3->mc_flags & C_INITIALIZED))
				continue;
			if (m3 == mc || m3->mc_snum < mc->mc_snum)
				continue;
			if (m3->mc_pg[mc->mc_top] == mp) {
				if (m3->mc_ki[mc->mc_top] >= ki) {
					/* at or after the deleted slot: flag it, and shift
					 * down by one if it was strictly after
					 */
					m3->mc_flags |= C_DEL;
					if (m3->mc_ki[mc->mc_top] > ki)
						m3->mc_ki[mc->mc_top]--;
				}
				if (m3->mc_ki[mc->mc_top] >= nkeys)
					mdb_cursor_sibling(m3, 1);
			}
		}
		mc->mc_flags |= C_DEL;
	}

	return rc;
}
7465 
7466 int
7467 mdb_del(MDB_txn *txn, MDB_dbi dbi,
7468     MDB_val *key, MDB_val *data)
7469 {
7470 	MDB_cursor mc;
7471 	MDB_xcursor mx;
7472 	MDB_cursor_op op;
7473 	MDB_val rdata, *xdata;
7474 	int		 rc, exact;
7475 	DKBUF;
7476 
7477 	if (key == NULL)
7478 		return EINVAL;
7479 
7480 	DPRINTF(("====> delete db %u key [%s]", dbi, DKEY(key)));
7481 
7482 	if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID))
7483 		return EINVAL;
7484 
7485 	if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR))
7486 		return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN;
7487 
7488 	mdb_cursor_init(&mc, txn, dbi, &mx);
7489 
7490 	exact = 0;
7491 	if (!F_ISSET(txn->mt_dbs[dbi].md_flags, MDB_DUPSORT)) {
7492 		/* must ignore any data */
7493 		data = NULL;
7494 	}
7495 	if (data) {
7496 		op = MDB_GET_BOTH;
7497 		rdata = *data;
7498 		xdata = &rdata;
7499 	} else {
7500 		op = MDB_SET;
7501 		xdata = NULL;
7502 	}
7503 	rc = mdb_cursor_set(&mc, key, xdata, op, &exact);
7504 	if (rc == 0) {
7505 		/* let mdb_page_split know about this cursor if needed:
7506 		 * delete will trigger a rebalance; if it needs to move
7507 		 * a node from one page to another, it will have to
7508 		 * update the parent's separator key(s). If the new sepkey
7509 		 * is larger than the current one, the parent page may
7510 		 * run out of space, triggering a split. We need this
7511 		 * cursor to be consistent until the end of the rebalance.
7512 		 */
7513 		mc.mc_flags |= C_UNTRACK;
7514 		mc.mc_next = txn->mt_cursors[dbi];
7515 		txn->mt_cursors[dbi] = &mc;
7516 		rc = mdb_cursor_del(&mc, data ? 0 : MDB_NODUPDATA);
7517 		txn->mt_cursors[dbi] = mc.mc_next;
7518 	}
7519 	return rc;
7520 }
7521 
7522 /** Split a page and insert a new node.
7523  * @param[in,out] mc Cursor pointing to the page and desired insertion index.
7524  * The cursor will be updated to point to the actual page and index where
7525  * the node got inserted after the split.
7526  * @param[in] newkey The key for the newly inserted node.
7527  * @param[in] newdata The data for the newly inserted node.
7528  * @param[in] newpgno The page number, if the new node is a branch node.
7529  * @param[in] nflags The #NODE_ADD_FLAGS for the new node.
7530  * @return 0 on success, non-zero on failure.
7531  */
7532 static int
7533 mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno,
7534 	unsigned int nflags)
7535 {
7536 	unsigned int flags;
7537 	int		 rc = MDB_SUCCESS, new_root = 0, did_split = 0;
7538 	indx_t		 newindx;
7539 	pgno_t		 pgno = 0;
7540 	int	 i, j, split_indx, nkeys, pmax;
7541 	MDB_env 	*env = mc->mc_txn->mt_env;
7542 	MDB_node	*node;
7543 	MDB_val	 sepkey, rkey, xdata, *rdata = &xdata;
7544 	MDB_page	*copy = NULL;
7545 	MDB_page	*mp, *rp, *pp;
7546 	int ptop;
7547 	MDB_cursor	mn;
7548 	DKBUF;
7549 
7550 	mp = mc->mc_pg[mc->mc_top];
7551 	newindx = mc->mc_ki[mc->mc_top];
7552 	nkeys = NUMKEYS(mp);
7553 
7554 	DPRINTF(("-----> splitting %s page %"Z"u and adding [%s] at index %i/%i",
7555 	    IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno,
7556 	    DKEY(newkey), mc->mc_ki[mc->mc_top], nkeys));
7557 
7558 	/* Create a right sibling. */
7559 	if ((rc = mdb_page_new(mc, mp->mp_flags, 1, &rp)))
7560 		return rc;
7561 	DPRINTF(("new right sibling: page %"Z"u", rp->mp_pgno));
7562 
7563 	if (mc->mc_snum < 2) {
7564 		if ((rc = mdb_page_new(mc, P_BRANCH, 1, &pp)))
7565 			return rc;
7566 		/* shift current top to make room for new parent */
7567 		mc->mc_pg[1] = mc->mc_pg[0];
7568 		mc->mc_ki[1] = mc->mc_ki[0];
7569 		mc->mc_pg[0] = pp;
7570 		mc->mc_ki[0] = 0;
7571 		mc->mc_db->md_root = pp->mp_pgno;
7572 		DPRINTF(("root split! new root = %"Z"u", pp->mp_pgno));
7573 		mc->mc_db->md_depth++;
7574 		new_root = 1;
7575 
7576 		/* Add left (implicit) pointer. */
7577 		if ((rc = mdb_node_add(mc, 0, NULL, NULL, mp->mp_pgno, 0)) != MDB_SUCCESS) {
7578 			/* undo the pre-push */
7579 			mc->mc_pg[0] = mc->mc_pg[1];
7580 			mc->mc_ki[0] = mc->mc_ki[1];
7581 			mc->mc_db->md_root = mp->mp_pgno;
7582 			mc->mc_db->md_depth--;
7583 			return rc;
7584 		}
7585 		mc->mc_snum = 2;
7586 		mc->mc_top = 1;
7587 		ptop = 0;
7588 	} else {
7589 		ptop = mc->mc_top-1;
7590 		DPRINTF(("parent branch page is %"Z"u", mc->mc_pg[ptop]->mp_pgno));
7591 	}
7592 
7593 	mc->mc_flags |= C_SPLITTING;
7594 	mdb_cursor_copy(mc, &mn);
7595 	mn.mc_pg[mn.mc_top] = rp;
7596 	mn.mc_ki[ptop] = mc->mc_ki[ptop]+1;
7597 
7598 	if (nflags & MDB_APPEND) {
7599 		mn.mc_ki[mn.mc_top] = 0;
7600 		sepkey = *newkey;
7601 		split_indx = newindx;
7602 		nkeys = 0;
7603 	} else {
7604 
7605 		split_indx = (nkeys+1) / 2;
7606 
7607 		if (IS_LEAF2(rp)) {
7608 			char *split, *ins;
7609 			int x;
7610 			unsigned int lsize, rsize, ksize;
7611 			/* Move half of the keys to the right sibling */
7612 			copy = NULL;
7613 			x = mc->mc_ki[mc->mc_top] - split_indx;
7614 			ksize = mc->mc_db->md_pad;
7615 			split = LEAF2KEY(mp, split_indx, ksize);
7616 			rsize = (nkeys - split_indx) * ksize;
7617 			lsize = (nkeys - split_indx) * sizeof(indx_t);
7618 			mp->mp_lower -= lsize;
7619 			rp->mp_lower += lsize;
7620 			mp->mp_upper += rsize - lsize;
7621 			rp->mp_upper -= rsize - lsize;
7622 			sepkey.mv_size = ksize;
7623 			if (newindx == split_indx) {
7624 				sepkey.mv_data = newkey->mv_data;
7625 			} else {
7626 				sepkey.mv_data = split;
7627 			}
7628 			if (x<0) {
7629 				ins = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], ksize);
7630 				memcpy(rp->mp_ptrs, split, rsize);
7631 				sepkey.mv_data = rp->mp_ptrs;
7632 				memmove(ins+ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize);
7633 				memcpy(ins, newkey->mv_data, ksize);
7634 				mp->mp_lower += sizeof(indx_t);
7635 				mp->mp_upper -= ksize - sizeof(indx_t);
7636 			} else {
7637 				if (x)
7638 					memcpy(rp->mp_ptrs, split, x * ksize);
7639 				ins = LEAF2KEY(rp, x, ksize);
7640 				memcpy(ins, newkey->mv_data, ksize);
7641 				memcpy(ins+ksize, split + x * ksize, rsize - x * ksize);
7642 				rp->mp_lower += sizeof(indx_t);
7643 				rp->mp_upper -= ksize - sizeof(indx_t);
7644 				mc->mc_ki[mc->mc_top] = x;
7645 				mc->mc_pg[mc->mc_top] = rp;
7646 			}
7647 		} else {
7648 			int psize, nsize, k;
7649 			/* Maximum free space in an empty page */
7650 			pmax = env->me_psize - PAGEHDRSZ;
7651 			if (IS_LEAF(mp))
7652 				nsize = mdb_leaf_size(env, newkey, newdata);
7653 			else
7654 				nsize = mdb_branch_size(env, newkey);
7655 			nsize = EVEN(nsize);
7656 
7657 			/* grab a page to hold a temporary copy */
7658 			copy = mdb_page_malloc(mc->mc_txn, 1);
7659 			if (copy == NULL)
7660 				return ENOMEM;
7661 			copy->mp_pgno  = mp->mp_pgno;
7662 			copy->mp_flags = mp->mp_flags;
7663 			copy->mp_lower = PAGEHDRSZ;
7664 			copy->mp_upper = env->me_psize;
7665 
7666 			/* prepare to insert */
7667 			for (i=0, j=0; i<nkeys; i++) {
7668 				if (i == newindx) {
7669 					copy->mp_ptrs[j++] = 0;
7670 				}
7671 				copy->mp_ptrs[j++] = mp->mp_ptrs[i];
7672 			}
7673 
7674 			/* When items are relatively large the split point needs
7675 			 * to be checked, because being off-by-one will make the
7676 			 * difference between success or failure in mdb_node_add.
7677 			 *
7678 			 * It's also relevant if a page happens to be laid out
7679 			 * such that one half of its nodes are all "small" and
7680 			 * the other half of its nodes are "large." If the new
7681 			 * item is also "large" and falls on the half with
7682 			 * "large" nodes, it also may not fit.
7683 			 *
7684 			 * As a final tweak, if the new item goes on the last
7685 			 * spot on the page (and thus, onto the new page), bias
7686 			 * the split so the new page is emptier than the old page.
7687 			 * This yields better packing during sequential inserts.
7688 			 */
7689 			if (nkeys < 20 || nsize > pmax/16 || newindx >= nkeys) {
7690 				/* Find split point */
7691 				psize = 0;
7692 				if (newindx <= split_indx || newindx >= nkeys) {
7693 					i = 0; j = 1;
7694 					k = newindx >= nkeys ? nkeys : split_indx+2;
7695 				} else {
7696 					i = nkeys; j = -1;
7697 					k = split_indx-1;
7698 				}
7699 				for (; i!=k; i+=j) {
7700 					if (i == newindx) {
7701 						psize += nsize;
7702 						node = NULL;
7703 					} else {
7704 						node = (MDB_node *)((char *)mp + copy->mp_ptrs[i]);
7705 						psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t);
7706 						if (IS_LEAF(mp)) {
7707 							if (F_ISSET(node->mn_flags, F_BIGDATA))
7708 								psize += sizeof(pgno_t);
7709 							else
7710 								psize += NODEDSZ(node);
7711 						}
7712 						psize = EVEN(psize);
7713 					}
7714 					if (psize > pmax || i == k-j) {
7715 						split_indx = i + (j<0);
7716 						break;
7717 					}
7718 				}
7719 			}
7720 			if (split_indx == newindx) {
7721 				sepkey.mv_size = newkey->mv_size;
7722 				sepkey.mv_data = newkey->mv_data;
7723 			} else {
7724 				node = (MDB_node *)((char *)mp + copy->mp_ptrs[split_indx]);
7725 				sepkey.mv_size = node->mn_ksize;
7726 				sepkey.mv_data = NODEKEY(node);
7727 			}
7728 		}
7729 	}
7730 
7731 	DPRINTF(("separator is %d [%s]", split_indx, DKEY(&sepkey)));
7732 
7733 	/* Copy separator key to the parent.
7734 	 */
7735 	if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(env, &sepkey)) {
7736 		mn.mc_snum--;
7737 		mn.mc_top--;
7738 		did_split = 1;
7739 		rc = mdb_page_split(&mn, &sepkey, NULL, rp->mp_pgno, 0);
7740 
7741 		/* root split? */
7742 		if (mn.mc_snum == mc->mc_snum) {
7743 			mc->mc_pg[mc->mc_snum] = mc->mc_pg[mc->mc_top];
7744 			mc->mc_ki[mc->mc_snum] = mc->mc_ki[mc->mc_top];
7745 			mc->mc_pg[mc->mc_top] = mc->mc_pg[ptop];
7746 			mc->mc_ki[mc->mc_top] = mc->mc_ki[ptop];
7747 			mc->mc_snum++;
7748 			mc->mc_top++;
7749 			ptop++;
7750 		}
7751 		/* Right page might now have changed parent.
7752 		 * Check if left page also changed parent.
7753 		 */
7754 		if (mn.mc_pg[ptop] != mc->mc_pg[ptop] &&
7755 		    mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) {
7756 			for (i=0; i<ptop; i++) {
7757 				mc->mc_pg[i] = mn.mc_pg[i];
7758 				mc->mc_ki[i] = mn.mc_ki[i];
7759 			}
7760 			mc->mc_pg[ptop] = mn.mc_pg[ptop];
7761 			mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1;
7762 		}
7763 	} else {
7764 		mn.mc_top--;
7765 		rc = mdb_node_add(&mn, mn.mc_ki[ptop], &sepkey, NULL, rp->mp_pgno, 0);
7766 		mn.mc_top++;
7767 	}
7768 	mc->mc_flags ^= C_SPLITTING;
7769 	if (rc != MDB_SUCCESS) {
7770 		return rc;
7771 	}
7772 	if (nflags & MDB_APPEND) {
7773 		mc->mc_pg[mc->mc_top] = rp;
7774 		mc->mc_ki[mc->mc_top] = 0;
7775 		rc = mdb_node_add(mc, 0, newkey, newdata, newpgno, nflags);
7776 		if (rc)
7777 			return rc;
7778 		for (i=0; i<mc->mc_top; i++)
7779 			mc->mc_ki[i] = mn.mc_ki[i];
7780 	} else if (!IS_LEAF2(mp)) {
7781 		/* Move nodes */
7782 		mc->mc_pg[mc->mc_top] = rp;
7783 		i = split_indx;
7784 		j = 0;
7785 		do {
7786 			if (i == newindx) {
7787 				rkey.mv_data = newkey->mv_data;
7788 				rkey.mv_size = newkey->mv_size;
7789 				if (IS_LEAF(mp)) {
7790 					rdata = newdata;
7791 				} else
7792 					pgno = newpgno;
7793 				flags = nflags;
7794 				/* Update index for the new key. */
7795 				mc->mc_ki[mc->mc_top] = j;
7796 			} else {
7797 				node = (MDB_node *)((char *)mp + copy->mp_ptrs[i]);
7798 				rkey.mv_data = NODEKEY(node);
7799 				rkey.mv_size = node->mn_ksize;
7800 				if (IS_LEAF(mp)) {
7801 					xdata.mv_data = NODEDATA(node);
7802 					xdata.mv_size = NODEDSZ(node);
7803 					rdata = &xdata;
7804 				} else
7805 					pgno = NODEPGNO(node);
7806 				flags = node->mn_flags;
7807 			}
7808 
7809 			if (!IS_LEAF(mp) && j == 0) {
7810 				/* First branch index doesn't need key data. */
7811 				rkey.mv_size = 0;
7812 			}
7813 
7814 			rc = mdb_node_add(mc, j, &rkey, rdata, pgno, flags);
7815 			if (rc) {
7816 				/* return tmp page to freelist */
7817 				mdb_page_free(env, copy);
7818 				return rc;
7819 			}
7820 			if (i == nkeys) {
7821 				i = 0;
7822 				j = 0;
7823 				mc->mc_pg[mc->mc_top] = copy;
7824 			} else {
7825 				i++;
7826 				j++;
7827 			}
7828 		} while (i != split_indx);
7829 
7830 		nkeys = NUMKEYS(copy);
7831 		for (i=0; i<nkeys; i++)
7832 			mp->mp_ptrs[i] = copy->mp_ptrs[i];
7833 		mp->mp_lower = copy->mp_lower;
7834 		mp->mp_upper = copy->mp_upper;
7835 		memcpy(NODEPTR(mp, nkeys-1), NODEPTR(copy, nkeys-1),
7836 			env->me_psize - copy->mp_upper);
7837 
7838 		/* reset back to original page */
7839 		if (newindx < split_indx) {
7840 			mc->mc_pg[mc->mc_top] = mp;
7841 			if (nflags & MDB_RESERVE) {
7842 				node = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
7843 				if (!(node->mn_flags & F_BIGDATA))
7844 					newdata->mv_data = NODEDATA(node);
7845 			}
7846 		} else {
7847 			mc->mc_pg[mc->mc_top] = rp;
7848 			mc->mc_ki[ptop]++;
7849 			/* Make sure mc_ki is still valid.
7850 			 */
7851 			if (mn.mc_pg[ptop] != mc->mc_pg[ptop] &&
7852 				mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) {
7853 				for (i=0; i<ptop; i++) {
7854 					mc->mc_pg[i] = mn.mc_pg[i];
7855 					mc->mc_ki[i] = mn.mc_ki[i];
7856 				}
7857 				mc->mc_pg[ptop] = mn.mc_pg[ptop];
7858 				mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1;
7859 			}
7860 		}
7861 		/* return tmp page to freelist */
7862 		mdb_page_free(env, copy);
7863 	}
7864 
7865 	{
7866 		/* Adjust other cursors pointing to mp */
7867 		MDB_cursor *m2, *m3;
7868 		MDB_dbi dbi = mc->mc_dbi;
7869 		int fixup = NUMKEYS(mp);
7870 
7871 		for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
7872 			if (mc->mc_flags & C_SUB)
7873 				m3 = &m2->mc_xcursor->mx_cursor;
7874 			else
7875 				m3 = m2;
7876 			if (m3 == mc)
7877 				continue;
7878 			if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED))
7879 				continue;
7880 			if (m3->mc_flags & C_SPLITTING)
7881 				continue;
7882 			if (new_root) {
7883 				int k;
7884 				/* root split */
7885 				for (k=m3->mc_top; k>=0; k--) {
7886 					m3->mc_ki[k+1] = m3->mc_ki[k];
7887 					m3->mc_pg[k+1] = m3->mc_pg[k];
7888 				}
7889 				if (m3->mc_ki[0] >= split_indx) {
7890 					m3->mc_ki[0] = 1;
7891 				} else {
7892 					m3->mc_ki[0] = 0;
7893 				}
7894 				m3->mc_pg[0] = mc->mc_pg[0];
7895 				m3->mc_snum++;
7896 				m3->mc_top++;
7897 			}
7898 			if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp) {
7899 				if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & MDB_SPLIT_REPLACE))
7900 					m3->mc_ki[mc->mc_top]++;
7901 				if (m3->mc_ki[mc->mc_top] >= fixup) {
7902 					m3->mc_pg[mc->mc_top] = rp;
7903 					m3->mc_ki[mc->mc_top] -= fixup;
7904 					m3->mc_ki[ptop] = mn.mc_ki[ptop];
7905 				}
7906 			} else if (!did_split && m3->mc_top >= ptop && m3->mc_pg[ptop] == mc->mc_pg[ptop] &&
7907 				m3->mc_ki[ptop] >= mc->mc_ki[ptop]) {
7908 				m3->mc_ki[ptop]++;
7909 			}
7910 		}
7911 	}
7912 	DPRINTF(("mp left: %d, rp left: %d", SIZELEFT(mp), SIZELEFT(rp)));
7913 	return rc;
7914 }
7915 
7916 int
7917 mdb_put(MDB_txn *txn, MDB_dbi dbi,
7918     MDB_val *key, MDB_val *data, unsigned int flags)
7919 {
7920 	MDB_cursor mc;
7921 	MDB_xcursor mx;
7922 
7923 	if (key == NULL || data == NULL)
7924 		return EINVAL;
7925 
7926 	if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID))
7927 		return EINVAL;
7928 
7929 	if ((flags & (MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND|MDB_APPENDDUP)) != flags)
7930 		return EINVAL;
7931 
7932 	mdb_cursor_init(&mc, txn, dbi, &mx);
7933 	return mdb_cursor_put(&mc, key, data, flags);
7934 }
7935 
7936 int
7937 mdb_env_set_flags(MDB_env *env, unsigned int flag, int onoff)
7938 {
7939 	if ((flag & CHANGEABLE) != flag)
7940 		return EINVAL;
7941 	if (onoff)
7942 		env->me_flags |= flag;
7943 	else
7944 		env->me_flags &= ~flag;
7945 	return MDB_SUCCESS;
7946 }
7947 
7948 int
7949 mdb_env_get_flags(MDB_env *env, unsigned int *arg)
7950 {
7951 	if (!env || !arg)
7952 		return EINVAL;
7953 
7954 	*arg = env->me_flags;
7955 	return MDB_SUCCESS;
7956 }
7957 
7958 int
7959 mdb_env_set_userctx(MDB_env *env, void *ctx)
7960 {
7961 	if (!env)
7962 		return EINVAL;
7963 	env->me_userctx = ctx;
7964 	return MDB_SUCCESS;
7965 }
7966 
7967 void *
7968 mdb_env_get_userctx(MDB_env *env)
7969 {
7970 	return env ? env->me_userctx : NULL;
7971 }
7972 
7973 int
7974 mdb_env_set_assert(MDB_env *env, MDB_assert_func *func)
7975 {
7976 	if (!env)
7977 		return EINVAL;
7978 #ifndef NDEBUG
7979 	env->me_assert_func = func;
7980 #endif
7981 	return MDB_SUCCESS;
7982 }
7983 
7984 int
7985 mdb_env_get_path(MDB_env *env, const char **arg)
7986 {
7987 	if (!env || !arg)
7988 		return EINVAL;
7989 
7990 	*arg = env->me_path;
7991 	return MDB_SUCCESS;
7992 }
7993 
7994 int
7995 mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *arg)
7996 {
7997 	if (!env || !arg)
7998 		return EINVAL;
7999 
8000 	*arg = env->me_fd;
8001 	return MDB_SUCCESS;
8002 }
8003 
8004 /** Common code for #mdb_stat() and #mdb_env_stat().
8005  * @param[in] env the environment to operate in.
8006  * @param[in] db the #MDB_db record containing the stats to return.
8007  * @param[out] arg the address of an #MDB_stat structure to receive the stats.
8008  * @return 0, this function always succeeds.
8009  */
8010 static int
8011 mdb_stat0(MDB_env *env, MDB_db *db, MDB_stat *arg)
8012 {
8013 	arg->ms_psize = env->me_psize;
8014 	arg->ms_depth = db->md_depth;
8015 	arg->ms_branch_pages = db->md_branch_pages;
8016 	arg->ms_leaf_pages = db->md_leaf_pages;
8017 	arg->ms_overflow_pages = db->md_overflow_pages;
8018 	arg->ms_entries = db->md_entries;
8019 
8020 	return MDB_SUCCESS;
8021 }
8022 int
8023 mdb_env_stat(MDB_env *env, MDB_stat *arg)
8024 {
8025 	int toggle;
8026 
8027 	if (env == NULL || arg == NULL)
8028 		return EINVAL;
8029 
8030 	toggle = mdb_env_pick_meta(env);
8031 
8032 	return mdb_stat0(env, &env->me_metas[toggle]->mm_dbs[MAIN_DBI], arg);
8033 }
8034 
8035 int
8036 mdb_env_info(MDB_env *env, MDB_envinfo *arg)
8037 {
8038 	int toggle;
8039 
8040 	if (env == NULL || arg == NULL)
8041 		return EINVAL;
8042 
8043 	toggle = mdb_env_pick_meta(env);
8044 	arg->me_mapaddr = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : 0;
8045 	arg->me_mapsize = env->me_mapsize;
8046 	arg->me_maxreaders = env->me_maxreaders;
8047 
8048 	/* me_numreaders may be zero if this process never used any readers. Use
8049 	 * the shared numreader count if it exists.
8050 	 */
8051 	arg->me_numreaders = env->me_txns ? env->me_txns->mti_numreaders : env->me_numreaders;
8052 
8053 	arg->me_last_pgno = env->me_metas[toggle]->mm_last_pg;
8054 	arg->me_last_txnid = env->me_metas[toggle]->mm_txnid;
8055 	return MDB_SUCCESS;
8056 }
8057 
8058 /** Set the default comparison functions for a database.
8059  * Called immediately after a database is opened to set the defaults.
8060  * The user can then override them with #mdb_set_compare() or
8061  * #mdb_set_dupsort().
8062  * @param[in] txn A transaction handle returned by #mdb_txn_begin()
8063  * @param[in] dbi A database handle returned by #mdb_dbi_open()
8064  */
8065 static void
8066 mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi)
8067 {
8068 	uint16_t f = txn->mt_dbs[dbi].md_flags;
8069 
8070 	txn->mt_dbxs[dbi].md_cmp =
8071 		(f & MDB_REVERSEKEY) ? mdb_cmp_memnr :
8072 		(f & MDB_INTEGERKEY) ? mdb_cmp_cint  : mdb_cmp_memn;
8073 
8074 	txn->mt_dbxs[dbi].md_dcmp =
8075 		!(f & MDB_DUPSORT) ? 0 :
8076 		((f & MDB_INTEGERDUP)
8077 		 ? ((f & MDB_DUPFIXED)   ? mdb_cmp_int   : mdb_cmp_cint)
8078 		 : ((f & MDB_REVERSEDUP) ? mdb_cmp_memnr : mdb_cmp_memn));
8079 }
8080 
8081 int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *dbi)
8082 {
8083 	MDB_val key, data;
8084 	MDB_dbi i;
8085 	MDB_cursor mc;
8086 	int rc, dbflag, exact;
8087 	unsigned int unused = 0;
8088 	size_t len;
8089 
8090 	if (txn->mt_dbxs[FREE_DBI].md_cmp == NULL) {
8091 		mdb_default_cmp(txn, FREE_DBI);
8092 	}
8093 
8094 	if ((flags & VALID_FLAGS) != flags)
8095 		return EINVAL;
8096 	if (txn->mt_flags & MDB_TXN_ERROR)
8097 		return MDB_BAD_TXN;
8098 
8099 	/* main DB? */
8100 	if (!name) {
8101 		*dbi = MAIN_DBI;
8102 		if (flags & PERSISTENT_FLAGS) {
8103 			uint16_t f2 = flags & PERSISTENT_FLAGS;
8104 			/* make sure flag changes get committed */
8105 			if ((txn->mt_dbs[MAIN_DBI].md_flags | f2) != txn->mt_dbs[MAIN_DBI].md_flags) {
8106 				txn->mt_dbs[MAIN_DBI].md_flags |= f2;
8107 				txn->mt_flags |= MDB_TXN_DIRTY;
8108 			}
8109 		}
8110 		mdb_default_cmp(txn, MAIN_DBI);
8111 		return MDB_SUCCESS;
8112 	}
8113 
8114 	if (txn->mt_dbxs[MAIN_DBI].md_cmp == NULL) {
8115 		mdb_default_cmp(txn, MAIN_DBI);
8116 	}
8117 
8118 	/* Is the DB already open? */
8119 	len = strlen(name);
8120 	for (i=2; i<txn->mt_numdbs; i++) {
8121 		if (!txn->mt_dbxs[i].md_name.mv_size) {
8122 			/* Remember this free slot */
8123 			if (!unused) unused = i;
8124 			continue;
8125 		}
8126 		if (len == txn->mt_dbxs[i].md_name.mv_size &&
8127 			!strncmp(name, txn->mt_dbxs[i].md_name.mv_data, len)) {
8128 			*dbi = i;
8129 			return MDB_SUCCESS;
8130 		}
8131 	}
8132 
8133 	/* If no free slot and max hit, fail */
8134 	if (!unused && txn->mt_numdbs >= txn->mt_env->me_maxdbs)
8135 		return MDB_DBS_FULL;
8136 
8137 	/* Cannot mix named databases with some mainDB flags */
8138 	if (txn->mt_dbs[MAIN_DBI].md_flags & (MDB_DUPSORT|MDB_INTEGERKEY))
8139 		return (flags & MDB_CREATE) ? MDB_INCOMPATIBLE : MDB_NOTFOUND;
8140 
8141 	/* Find the DB info */
8142 	dbflag = DB_NEW|DB_VALID;
8143 	exact = 0;
8144 	key.mv_size = len;
8145 	key.mv_data = (void *)name;
8146 	mdb_cursor_init(&mc, txn, MAIN_DBI, NULL);
8147 	rc = mdb_cursor_set(&mc, &key, &data, MDB_SET, &exact);
8148 	if (rc == MDB_SUCCESS) {
8149 		/* make sure this is actually a DB */
8150 		MDB_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]);
8151 		if (!(node->mn_flags & F_SUBDATA))
8152 			return MDB_INCOMPATIBLE;
8153 	} else if (rc == MDB_NOTFOUND && (flags & MDB_CREATE)) {
8154 		/* Create if requested */
8155 		MDB_db dummy;
8156 		data.mv_size = sizeof(MDB_db);
8157 		data.mv_data = &dummy;
8158 		memset(&dummy, 0, sizeof(dummy));
8159 		dummy.md_root = P_INVALID;
8160 		dummy.md_flags = flags & PERSISTENT_FLAGS;
8161 		rc = mdb_cursor_put(&mc, &key, &data, F_SUBDATA);
8162 		dbflag |= DB_DIRTY;
8163 	}
8164 
8165 	/* OK, got info, add to table */
8166 	if (rc == MDB_SUCCESS) {
8167 		unsigned int slot = unused ? unused : txn->mt_numdbs;
8168 		txn->mt_dbxs[slot].md_name.mv_data = strdup(name);
8169 		txn->mt_dbxs[slot].md_name.mv_size = len;
8170 		txn->mt_dbxs[slot].md_rel = NULL;
8171 		txn->mt_dbflags[slot] = dbflag;
8172 		memcpy(&txn->mt_dbs[slot], data.mv_data, sizeof(MDB_db));
8173 		*dbi = slot;
8174 		mdb_default_cmp(txn, slot);
8175 		if (!unused) {
8176 			txn->mt_numdbs++;
8177 		}
8178 	}
8179 
8180 	return rc;
8181 }
8182 
8183 int mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *arg)
8184 {
8185 	if (txn == NULL || arg == NULL || dbi >= txn->mt_numdbs)
8186 		return EINVAL;
8187 
8188 	if (txn->mt_dbflags[dbi] & DB_STALE) {
8189 		MDB_cursor mc;
8190 		MDB_xcursor mx;
8191 		/* Stale, must read the DB's root. cursor_init does it for us. */
8192 		mdb_cursor_init(&mc, txn, dbi, &mx);
8193 	}
8194 	return mdb_stat0(txn->mt_env, &txn->mt_dbs[dbi], arg);
8195 }
8196 
8197 void mdb_dbi_close(MDB_env *env, MDB_dbi dbi)
8198 {
8199 	char *ptr;
8200 	if (dbi <= MAIN_DBI || dbi >= env->me_maxdbs)
8201 		return;
8202 	ptr = env->me_dbxs[dbi].md_name.mv_data;
8203 	env->me_dbxs[dbi].md_name.mv_data = NULL;
8204 	env->me_dbxs[dbi].md_name.mv_size = 0;
8205 	env->me_dbflags[dbi] = 0;
8206 	free(ptr);
8207 }
8208 
8209 int mdb_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned int *flags)
8210 {
8211 	/* We could return the flags for the FREE_DBI too but what's the point? */
8212 	if (txn == NULL || dbi < MAIN_DBI || dbi >= txn->mt_numdbs)
8213 		return EINVAL;
8214 	*flags = txn->mt_dbs[dbi].md_flags & PERSISTENT_FLAGS;
8215 	return MDB_SUCCESS;
8216 }
8217 
8218 /** Add all the DB's pages to the free list.
8219  * @param[in] mc Cursor on the DB to free.
8220  * @param[in] subs non-Zero to check for sub-DBs in this DB.
8221  * @return 0 on success, non-zero on failure.
8222  */
8223 static int
8224 mdb_drop0(MDB_cursor *mc, int subs)
8225 {
8226 	int rc;
8227 
8228 	rc = mdb_page_search(mc, NULL, MDB_PS_FIRST);
8229 	if (rc == MDB_SUCCESS) {
8230 		MDB_txn *txn = mc->mc_txn;
8231 		MDB_node *ni;
8232 		MDB_cursor mx;
8233 		unsigned int i;
8234 
8235 		/* LEAF2 pages have no nodes, cannot have sub-DBs */
8236 		if (IS_LEAF2(mc->mc_pg[mc->mc_top]))
8237 			mdb_cursor_pop(mc);
8238 
8239 		mdb_cursor_copy(mc, &mx);
8240 		while (mc->mc_snum > 0) {
8241 			MDB_page *mp = mc->mc_pg[mc->mc_top];
8242 			unsigned n = NUMKEYS(mp);
8243 			if (IS_LEAF(mp)) {
8244 				for (i=0; i<n; i++) {
8245 					ni = NODEPTR(mp, i);
8246 					if (ni->mn_flags & F_BIGDATA) {
8247 						MDB_page *omp;
8248 						pgno_t pg;
8249 						memcpy(&pg, NODEDATA(ni), sizeof(pg));
8250 						rc = mdb_page_get(txn, pg, &omp, NULL);
8251 						if (rc != 0)
8252 							return rc;
8253 						mdb_cassert(mc, IS_OVERFLOW(omp));
8254 						rc = mdb_midl_append_range(&txn->mt_free_pgs,
8255 							pg, omp->mp_pages);
8256 						if (rc)
8257 							return rc;
8258 					} else if (subs && (ni->mn_flags & F_SUBDATA)) {
8259 						mdb_xcursor_init1(mc, ni);
8260 						rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0);
8261 						if (rc)
8262 							return rc;
8263 					}
8264 				}
8265 			} else {
8266 				if ((rc = mdb_midl_need(&txn->mt_free_pgs, n)) != 0)
8267 					return rc;
8268 				for (i=0; i<n; i++) {
8269 					pgno_t pg;
8270 					ni = NODEPTR(mp, i);
8271 					pg = NODEPGNO(ni);
8272 					/* free it */
8273 					mdb_midl_xappend(txn->mt_free_pgs, pg);
8274 				}
8275 			}
8276 			if (!mc->mc_top)
8277 				break;
8278 			mc->mc_ki[mc->mc_top] = i;
8279 			rc = mdb_cursor_sibling(mc, 1);
8280 			if (rc) {
8281 				/* no more siblings, go back to beginning
8282 				 * of previous level.
8283 				 */
8284 				mdb_cursor_pop(mc);
8285 				mc->mc_ki[0] = 0;
8286 				for (i=1; i<mc->mc_snum; i++) {
8287 					mc->mc_ki[i] = 0;
8288 					mc->mc_pg[i] = mx.mc_pg[i];
8289 				}
8290 			}
8291 		}
8292 		/* free it */
8293 		rc = mdb_midl_append(&txn->mt_free_pgs, mc->mc_db->md_root);
8294 	} else if (rc == MDB_NOTFOUND) {
8295 		rc = MDB_SUCCESS;
8296 	}
8297 	return rc;
8298 }
8299 
8300 int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del)
8301 {
8302 	MDB_cursor *mc, *m2;
8303 	int rc;
8304 
8305 	if (!txn || !dbi || dbi >= txn->mt_numdbs || (unsigned)del > 1 || !(txn->mt_dbflags[dbi] & DB_VALID))
8306 		return EINVAL;
8307 
8308 	if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))
8309 		return EACCES;
8310 
8311 	rc = mdb_cursor_open(txn, dbi, &mc);
8312 	if (rc)
8313 		return rc;
8314 
8315 	rc = mdb_drop0(mc, mc->mc_db->md_flags & MDB_DUPSORT);
8316 	/* Invalidate the dropped DB's cursors */
8317 	for (m2 = txn->mt_cursors[dbi]; m2; m2 = m2->mc_next)
8318 		m2->mc_flags &= ~(C_INITIALIZED|C_EOF);
8319 	if (rc)
8320 		goto leave;
8321 
8322 	/* Can't delete the main DB */
8323 	if (del && dbi > MAIN_DBI) {
8324 		rc = mdb_del(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL);
8325 		if (!rc) {
8326 			txn->mt_dbflags[dbi] = DB_STALE;
8327 			mdb_dbi_close(txn->mt_env, dbi);
8328 		}
8329 	} else {
8330 		/* reset the DB record, mark it dirty */
8331 		txn->mt_dbflags[dbi] |= DB_DIRTY;
8332 		txn->mt_dbs[dbi].md_depth = 0;
8333 		txn->mt_dbs[dbi].md_branch_pages = 0;
8334 		txn->mt_dbs[dbi].md_leaf_pages = 0;
8335 		txn->mt_dbs[dbi].md_overflow_pages = 0;
8336 		txn->mt_dbs[dbi].md_entries = 0;
8337 		txn->mt_dbs[dbi].md_root = P_INVALID;
8338 
8339 		txn->mt_flags |= MDB_TXN_DIRTY;
8340 	}
8341 leave:
8342 	mdb_cursor_close(mc);
8343 	return rc;
8344 }
8345 
8346 int mdb_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp)
8347 {
8348 	if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID))
8349 		return EINVAL;
8350 
8351 	txn->mt_dbxs[dbi].md_cmp = cmp;
8352 	return MDB_SUCCESS;
8353 }
8354 
8355 int mdb_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp)
8356 {
8357 	if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID))
8358 		return EINVAL;
8359 
8360 	txn->mt_dbxs[dbi].md_dcmp = cmp;
8361 	return MDB_SUCCESS;
8362 }
8363 
8364 int mdb_set_relfunc(MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel)
8365 {
8366 	if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID))
8367 		return EINVAL;
8368 
8369 	txn->mt_dbxs[dbi].md_rel = rel;
8370 	return MDB_SUCCESS;
8371 }
8372 
8373 int mdb_set_relctx(MDB_txn *txn, MDB_dbi dbi, void *ctx)
8374 {
8375 	if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID))
8376 		return EINVAL;
8377 
8378 	txn->mt_dbxs[dbi].md_relctx = ctx;
8379 	return MDB_SUCCESS;
8380 }
8381 
8382 int mdb_env_get_maxkeysize(MDB_env *env)
8383 {
8384 	return ENV_MAXKEY(env);
8385 }
8386 
8387 int mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx)
8388 {
8389 	unsigned int i, rdrs;
8390 	MDB_reader *mr;
8391 	char buf[64];
8392 	int rc = 0, first = 1;
8393 
8394 	if (!env || !func)
8395 		return -1;
8396 	if (!env->me_txns) {
8397 		return func("(no reader locks)\n", ctx);
8398 	}
8399 	rdrs = env->me_txns->mti_numreaders;
8400 	mr = env->me_txns->mti_readers;
8401 	for (i=0; i<rdrs; i++) {
8402 		if (mr[i].mr_pid) {
8403 			txnid_t	txnid = mr[i].mr_txnid;
8404 			sprintf(buf, txnid == (txnid_t)-1 ?
8405 				"%10d %"Z"x -\n" : "%10d %"Z"x %"Z"u\n",
8406 				(int)mr[i].mr_pid, (size_t)mr[i].mr_tid, txnid);
8407 			if (first) {
8408 				first = 0;
8409 				rc = func("    pid     thread     txnid\n", ctx);
8410 				if (rc < 0)
8411 					break;
8412 			}
8413 			rc = func(buf, ctx);
8414 			if (rc < 0)
8415 				break;
8416 		}
8417 	}
8418 	if (first) {
8419 		rc = func("(no active readers)\n", ctx);
8420 	}
8421 	return rc;
8422 }
8423 
8424 /** Insert pid into list if not already present.
8425  * return -1 if already present.
8426  */
8427 static int mdb_pid_insert(MDB_PID_T *ids, MDB_PID_T pid)
8428 {
8429 	/* binary search of pid in list */
8430 	unsigned base = 0;
8431 	unsigned cursor = 1;
8432 	int val = 0;
8433 	unsigned n = ids[0];
8434 
8435 	while( 0 < n ) {
8436 		unsigned pivot = n >> 1;
8437 		cursor = base + pivot + 1;
8438 		val = pid - ids[cursor];
8439 
8440 		if( val < 0 ) {
8441 			n = pivot;
8442 
8443 		} else if ( val > 0 ) {
8444 			base = cursor;
8445 			n -= pivot + 1;
8446 
8447 		} else {
8448 			/* found, so it's a duplicate */
8449 			return -1;
8450 		}
8451 	}
8452 
8453 	if( val > 0 ) {
8454 		++cursor;
8455 	}
8456 	ids[0]++;
8457 	for (n = ids[0]; n > cursor; n--)
8458 		ids[n] = ids[n-1];
8459 	ids[n] = pid;
8460 	return 0;
8461 }
8462 
8463 int mdb_reader_check(MDB_env *env, int *dead)
8464 {
8465 	unsigned int i, j, rdrs;
8466 	MDB_reader *mr;
8467 	MDB_PID_T *pids, pid;
8468 	int count = 0;
8469 
8470 	if (!env)
8471 		return EINVAL;
8472 	if (dead)
8473 		*dead = 0;
8474 	if (!env->me_txns)
8475 		return MDB_SUCCESS;
8476 	rdrs = env->me_txns->mti_numreaders;
8477 	pids = malloc((rdrs+1) * sizeof(MDB_PID_T));
8478 	if (!pids)
8479 		return ENOMEM;
8480 	pids[0] = 0;
8481 	mr = env->me_txns->mti_readers;
8482 	for (i=0; i<rdrs; i++) {
8483 		if (mr[i].mr_pid && mr[i].mr_pid != env->me_pid) {
8484 			pid = mr[i].mr_pid;
8485 			if (mdb_pid_insert(pids, pid) == 0) {
8486 				if (!mdb_reader_pid(env, Pidcheck, pid)) {
8487 					LOCK_MUTEX_R(env);
8488 					/* Recheck, a new process may have reused pid */
8489 					if (!mdb_reader_pid(env, Pidcheck, pid)) {
8490 						for (j=i; j<rdrs; j++)
8491 							if (mr[j].mr_pid == pid) {
8492 								DPRINTF(("clear stale reader pid %u txn %"Z"d",
8493 									(unsigned) pid, mr[j].mr_txnid));
8494 								mr[j].mr_pid = 0;
8495 								count++;
8496 							}
8497 					}
8498 					UNLOCK_MUTEX_R(env);
8499 				}
8500 			}
8501 		}
8502 	}
8503 	free(pids);
8504 	if (dead)
8505 		*dead = count;
8506 	return MDB_SUCCESS;
8507 }
8508 /** @} */
8509