1 /* $NetBSD: mdb.c,v 1.1.1.1 2014/05/28 09:58:43 tron Exp $ */ 2 3 /** @file mdb.c 4 * @brief memory-mapped database library 5 * 6 * A Btree-based database management library modeled loosely on the 7 * BerkeleyDB API, but much simplified. 8 */ 9 /* 10 * Copyright 2011-2013 Howard Chu, Symas Corp. 11 * All rights reserved. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted only as authorized by the OpenLDAP 15 * Public License. 16 * 17 * A copy of this license is available in the file LICENSE in the 18 * top-level directory of the distribution or, alternatively, at 19 * <http://www.OpenLDAP.org/license.html>. 20 * 21 * This code is derived from btree.c written by Martin Hedenfalk. 22 * 23 * Copyright (c) 2009, 2010 Martin Hedenfalk <martin@bzero.se> 24 * 25 * Permission to use, copy, modify, and distribute this software for any 26 * purpose with or without fee is hereby granted, provided that the above 27 * copyright notice and this permission notice appear in all copies. 28 * 29 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 30 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 31 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 32 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 33 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 34 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 35 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 36 */ 37 #ifndef _GNU_SOURCE 38 #define _GNU_SOURCE 1 39 #endif 40 #include <sys/types.h> 41 #include <sys/stat.h> 42 #ifdef _WIN32 43 #include <windows.h> 44 /** getpid() returns int; MinGW defines pid_t but MinGW64 typedefs it 45 * as int64 which is wrong. MSVC doesn't define it at all, so just 46 * don't use it. 
47 */ 48 #define MDB_PID_T int 49 #ifdef __GNUC__ 50 # include <sys/param.h> 51 #else 52 # define LITTLE_ENDIAN 1234 53 # define BIG_ENDIAN 4321 54 # define BYTE_ORDER LITTLE_ENDIAN 55 # ifndef SSIZE_MAX 56 # define SSIZE_MAX INT_MAX 57 # endif 58 #endif 59 #else 60 #define MDB_PID_T pid_t 61 #include <sys/param.h> 62 #include <sys/uio.h> 63 #include <sys/mman.h> 64 #ifdef HAVE_SYS_FILE_H 65 #include <sys/file.h> 66 #endif 67 #include <fcntl.h> 68 #endif 69 70 #include <errno.h> 71 #include <limits.h> 72 #include <stddef.h> 73 #include <inttypes.h> 74 #include <stdio.h> 75 #include <stdlib.h> 76 #include <string.h> 77 #include <time.h> 78 #include <unistd.h> 79 80 #if !(defined(BYTE_ORDER) || defined(__BYTE_ORDER)) 81 #include <netinet/in.h> 82 #include <resolv.h> /* defines BYTE_ORDER on HPUX and Solaris */ 83 #endif 84 85 #if defined(__APPLE__) || defined (BSD) 86 # define MDB_USE_POSIX_SEM 1 87 # define MDB_FDATASYNC fsync 88 #elif defined(ANDROID) 89 # define MDB_FDATASYNC fsync 90 #endif 91 92 #ifndef _WIN32 93 #include <pthread.h> 94 #ifdef MDB_USE_POSIX_SEM 95 # define MDB_USE_HASH 1 96 #include <semaphore.h> 97 #endif 98 #endif 99 100 #ifdef USE_VALGRIND 101 #include <valgrind/memcheck.h> 102 #define VGMEMP_CREATE(h,r,z) VALGRIND_CREATE_MEMPOOL(h,r,z) 103 #define VGMEMP_ALLOC(h,a,s) VALGRIND_MEMPOOL_ALLOC(h,a,s) 104 #define VGMEMP_FREE(h,a) VALGRIND_MEMPOOL_FREE(h,a) 105 #define VGMEMP_DESTROY(h) VALGRIND_DESTROY_MEMPOOL(h) 106 #define VGMEMP_DEFINED(a,s) VALGRIND_MAKE_MEM_DEFINED(a,s) 107 #else 108 #define VGMEMP_CREATE(h,r,z) 109 #define VGMEMP_ALLOC(h,a,s) 110 #define VGMEMP_FREE(h,a) 111 #define VGMEMP_DESTROY(h) 112 #define VGMEMP_DEFINED(a,s) 113 #endif 114 115 #ifndef BYTE_ORDER 116 # if (defined(_LITTLE_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN)) 117 /* Solaris just defines one or the other */ 118 # define LITTLE_ENDIAN 1234 119 # define BIG_ENDIAN 4321 120 # ifdef _LITTLE_ENDIAN 121 # define BYTE_ORDER 
LITTLE_ENDIAN 122 # else 123 # define BYTE_ORDER BIG_ENDIAN 124 # endif 125 # else 126 # define BYTE_ORDER __BYTE_ORDER 127 # endif 128 #endif 129 130 #ifndef LITTLE_ENDIAN 131 #define LITTLE_ENDIAN __LITTLE_ENDIAN 132 #endif 133 #ifndef BIG_ENDIAN 134 #define BIG_ENDIAN __BIG_ENDIAN 135 #endif 136 137 #if defined(__i386) || defined(__x86_64) || defined(_M_IX86) 138 #define MISALIGNED_OK 1 139 #endif 140 141 #include "lmdb.h" 142 #include "midl.h" 143 144 #if (BYTE_ORDER == LITTLE_ENDIAN) == (BYTE_ORDER == BIG_ENDIAN) 145 # error "Unknown or unsupported endianness (BYTE_ORDER)" 146 #elif (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF 147 # error "Two's complement, reasonably sized integer types, please" 148 #endif 149 150 /** @defgroup internal MDB Internals 151 * @{ 152 */ 153 /** @defgroup compat Compatibility Macros 154 * A bunch of macros to minimize the amount of platform-specific ifdefs 155 * needed throughout the rest of the code. When the features this library 156 * needs are similar enough to POSIX to be hidden in a one-or-two line 157 * replacement, this macro approach is used. 158 * @{ 159 */ 160 161 /** Wrapper around __func__, which is a C99 feature */ 162 #if __STDC_VERSION__ >= 199901L 163 # define mdb_func_ __func__ 164 #elif __GNUC__ >= 2 || _MSC_VER >= 1300 165 # define mdb_func_ __FUNCTION__ 166 #else 167 /* If a debug message says <mdb_unknown>(), update the #if statements above */ 168 # define mdb_func_ "<mdb_unknown>" 169 #endif 170 171 #ifdef _WIN32 172 #define MDB_USE_HASH 1 173 #define MDB_PIDLOCK 0 174 #define pthread_t DWORD 175 #define pthread_mutex_t HANDLE 176 #define pthread_key_t DWORD 177 #define pthread_self() GetCurrentThreadId() 178 #define pthread_key_create(x,y) \ 179 ((*(x) = TlsAlloc()) == TLS_OUT_OF_INDEXES ? ErrCode() : 0) 180 #define pthread_key_delete(x) TlsFree(x) 181 #define pthread_getspecific(x) TlsGetValue(x) 182 #define pthread_setspecific(x,y) (TlsSetValue(x,y) ? 
0 : ErrCode()) 183 #define pthread_mutex_unlock(x) ReleaseMutex(x) 184 #define pthread_mutex_lock(x) WaitForSingleObject(x, INFINITE) 185 #define LOCK_MUTEX_R(env) pthread_mutex_lock((env)->me_rmutex) 186 #define UNLOCK_MUTEX_R(env) pthread_mutex_unlock((env)->me_rmutex) 187 #define LOCK_MUTEX_W(env) pthread_mutex_lock((env)->me_wmutex) 188 #define UNLOCK_MUTEX_W(env) pthread_mutex_unlock((env)->me_wmutex) 189 #define getpid() GetCurrentProcessId() 190 #define MDB_FDATASYNC(fd) (!FlushFileBuffers(fd)) 191 #define MDB_MSYNC(addr,len,flags) (!FlushViewOfFile(addr,len)) 192 #define ErrCode() GetLastError() 193 #define GET_PAGESIZE(x) {SYSTEM_INFO si; GetSystemInfo(&si); (x) = si.dwPageSize;} 194 #define close(fd) (CloseHandle(fd) ? 0 : -1) 195 #define munmap(ptr,len) UnmapViewOfFile(ptr) 196 #ifdef PROCESS_QUERY_LIMITED_INFORMATION 197 #define MDB_PROCESS_QUERY_LIMITED_INFORMATION PROCESS_QUERY_LIMITED_INFORMATION 198 #else 199 #define MDB_PROCESS_QUERY_LIMITED_INFORMATION 0x1000 200 #endif 201 #define Z "I" 202 #else 203 204 #define Z "z" /**< printf format modifier for size_t */ 205 206 /** For MDB_LOCK_FORMAT: True if readers take a pid lock in the lockfile */ 207 #define MDB_PIDLOCK 1 208 209 #ifdef MDB_USE_POSIX_SEM 210 211 #define LOCK_MUTEX_R(env) mdb_sem_wait((env)->me_rmutex) 212 #define UNLOCK_MUTEX_R(env) sem_post((env)->me_rmutex) 213 #define LOCK_MUTEX_W(env) mdb_sem_wait((env)->me_wmutex) 214 #define UNLOCK_MUTEX_W(env) sem_post((env)->me_wmutex) 215 216 static int 217 mdb_sem_wait(sem_t *sem) 218 { 219 int rc; 220 while ((rc = sem_wait(sem)) && (rc = errno) == EINTR) ; 221 return rc; 222 } 223 224 #else 225 /** Lock the reader mutex. 226 */ 227 #define LOCK_MUTEX_R(env) pthread_mutex_lock(&(env)->me_txns->mti_mutex) 228 /** Unlock the reader mutex. 229 */ 230 #define UNLOCK_MUTEX_R(env) pthread_mutex_unlock(&(env)->me_txns->mti_mutex) 231 232 /** Lock the writer mutex. 233 * Only a single write transaction is allowed at a time. 
Other writers 234 * will block waiting for this mutex. 235 */ 236 #define LOCK_MUTEX_W(env) pthread_mutex_lock(&(env)->me_txns->mti_wmutex) 237 /** Unlock the writer mutex. 238 */ 239 #define UNLOCK_MUTEX_W(env) pthread_mutex_unlock(&(env)->me_txns->mti_wmutex) 240 #endif /* MDB_USE_POSIX_SEM */ 241 242 /** Get the error code for the last failed system function. 243 */ 244 #define ErrCode() errno 245 246 /** An abstraction for a file handle. 247 * On POSIX systems file handles are small integers. On Windows 248 * they're opaque pointers. 249 */ 250 #define HANDLE int 251 252 /** A value for an invalid file handle. 253 * Mainly used to initialize file variables and signify that they are 254 * unused. 255 */ 256 #define INVALID_HANDLE_VALUE (-1) 257 258 /** Get the size of a memory page for the system. 259 * This is the basic size that the platform's memory manager uses, and is 260 * fundamental to the use of memory-mapped files. 261 */ 262 #define GET_PAGESIZE(x) ((x) = sysconf(_SC_PAGE_SIZE)) 263 #endif 264 265 #if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) 266 #define MNAME_LEN 32 267 #else 268 #define MNAME_LEN (sizeof(pthread_mutex_t)) 269 #endif 270 271 /** @} */ 272 273 #ifndef _WIN32 274 /** A flag for opening a file and requesting synchronous data writes. 275 * This is only used when writing a meta page. It's not strictly needed; 276 * we could just do a normal write and then immediately perform a flush. 277 * But if this flag is available it saves us an extra system call. 278 * 279 * @note If O_DSYNC is undefined but exists in /usr/include, 280 * preferably set some compiler flag to get the definition. 281 * Otherwise compile with the less efficient -DMDB_DSYNC=O_SYNC. 282 */ 283 #ifndef MDB_DSYNC 284 # define MDB_DSYNC O_DSYNC 285 #endif 286 #endif 287 288 /** Function for flushing the data of a file. Define this to fsync 289 * if fdatasync() is not supported. 
290 */ 291 #ifndef MDB_FDATASYNC 292 # define MDB_FDATASYNC fdatasync 293 #endif 294 295 #ifndef MDB_MSYNC 296 # define MDB_MSYNC(addr,len,flags) msync(addr,len,flags) 297 #endif 298 299 #ifndef MS_SYNC 300 #define MS_SYNC 1 301 #endif 302 303 #ifndef MS_ASYNC 304 #define MS_ASYNC 0 305 #endif 306 307 /** A page number in the database. 308 * Note that 64 bit page numbers are overkill, since pages themselves 309 * already represent 12-13 bits of addressable memory, and the OS will 310 * always limit applications to a maximum of 63 bits of address space. 311 * 312 * @note In the #MDB_node structure, we only store 48 bits of this value, 313 * which thus limits us to only 60 bits of addressable data. 314 */ 315 typedef MDB_ID pgno_t; 316 317 /** A transaction ID. 318 * See struct MDB_txn.mt_txnid for details. 319 */ 320 typedef MDB_ID txnid_t; 321 322 /** @defgroup debug Debug Macros 323 * @{ 324 */ 325 #ifndef MDB_DEBUG 326 /** Enable debug output. Needs variable argument macros (a C99 feature). 327 * Set this to 1 for copious tracing. Set to 2 to add dumps of all IDLs 328 * read from and written to the database (used for free space management). 329 */ 330 #define MDB_DEBUG 0 331 #endif 332 333 #if MDB_DEBUG 334 static int mdb_debug; 335 static txnid_t mdb_debug_start; 336 337 /** Print a debug message with printf formatting. 338 * Requires double parenthesis around 2 or more args. 339 */ 340 # define DPRINTF(args) ((void) ((mdb_debug) && DPRINTF0 args)) 341 # define DPRINTF0(fmt, ...) \ 342 fprintf(stderr, "%s:%d " fmt "\n", mdb_func_, __LINE__, __VA_ARGS__) 343 #else 344 # define DPRINTF(args) ((void) 0) 345 #endif 346 /** Print a debug string. 347 * The string is printed literally, with no format processing. 348 */ 349 #define DPUTS(arg) DPRINTF(("%s", arg)) 350 /** Debuging output value of a cursor DBI: Negative in a sub-cursor. */ 351 #define DDBI(mc) \ 352 (((mc)->mc_flags & C_SUB) ? 
-(int)(mc)->mc_dbi : (int)(mc)->mc_dbi) 353 /** @} */ 354 355 /** @brief The maximum size of a database page. 356 * 357 * This is 32k, since it must fit in #MDB_page.#mp_upper. 358 * 359 * LMDB will use database pages < OS pages if needed. 360 * That causes more I/O in write transactions: The OS must 361 * know (read) the whole page before writing a partial page. 362 * 363 * Note that we don't currently support Huge pages. On Linux, 364 * regular data files cannot use Huge pages, and in general 365 * Huge pages aren't actually pageable. We rely on the OS 366 * demand-pager to read our data and page it out when memory 367 * pressure from other processes is high. So until OSs have 368 * actual paging support for Huge pages, they're not viable. 369 */ 370 #define MAX_PAGESIZE 0x8000 371 372 /** The minimum number of keys required in a database page. 373 * Setting this to a larger value will place a smaller bound on the 374 * maximum size of a data item. Data items larger than this size will 375 * be pushed into overflow pages instead of being stored directly in 376 * the B-tree node. This value used to default to 4. With a page size 377 * of 4096 bytes that meant that any item larger than 1024 bytes would 378 * go into an overflow page. That also meant that on average 2-3KB of 379 * each overflow page was wasted space. The value cannot be lower than 380 * 2 because then there would no longer be a tree structure. With this 381 * value, items larger than 2KB will go into overflow pages, and on 382 * average only 1KB will be wasted. 383 */ 384 #define MDB_MINKEYS 2 385 386 /** A stamp that identifies a file as an MDB file. 387 * There's nothing special about this value other than that it is easily 388 * recognizable, and it will reflect any byte order mismatches. 389 */ 390 #define MDB_MAGIC 0xBEEFC0DE 391 392 /** The version number for a database's datafile format. */ 393 #define MDB_DATA_VERSION 1 394 /** The version number for a database's lockfile format. 
*/ 395 #define MDB_LOCK_VERSION 1 396 397 /** @brief The max size of a key we can write, or 0 for dynamic max. 398 * 399 * Define this as 0 to compute the max from the page size. 511 400 * is default for backwards compat: liblmdb <= 0.9.10 can break 401 * when modifying a DB with keys/dupsort data bigger than its max. 402 * 403 * Data items in an #MDB_DUPSORT database are also limited to 404 * this size, since they're actually keys of a sub-DB. Keys and 405 * #MDB_DUPSORT data items must fit on a node in a regular page. 406 */ 407 #ifndef MDB_MAXKEYSIZE 408 #define MDB_MAXKEYSIZE 511 409 #endif 410 411 /** The maximum size of a key we can write to the environment. */ 412 #if MDB_MAXKEYSIZE 413 #define ENV_MAXKEY(env) (MDB_MAXKEYSIZE) 414 #else 415 #define ENV_MAXKEY(env) ((env)->me_maxkey) 416 #endif 417 418 /** @brief The maximum size of a data item. 419 * 420 * We only store a 32 bit value for node sizes. 421 */ 422 #define MAXDATASIZE 0xffffffffUL 423 424 #if MDB_DEBUG 425 /** Key size which fits in a #DKBUF. 426 * @ingroup debug 427 */ 428 #define DKBUF_MAXKEYSIZE ((MDB_MAXKEYSIZE) > 0 ? (MDB_MAXKEYSIZE) : 511) 429 /** A key buffer. 430 * @ingroup debug 431 * This is used for printing a hex dump of a key's contents. 432 */ 433 #define DKBUF char kbuf[DKBUF_MAXKEYSIZE*2+1] 434 /** Display a key in hex. 435 * @ingroup debug 436 * Invoke a function to display a key in hex. 437 */ 438 #define DKEY(x) mdb_dkey(x, kbuf) 439 #else 440 #define DKBUF 441 #define DKEY(x) 0 442 #endif 443 444 /** An invalid page number. 445 * Mainly used to denote an empty tree. 446 */ 447 #define P_INVALID (~(pgno_t)0) 448 449 /** Test if the flags \b f are set in a flag word \b w. */ 450 #define F_ISSET(w, f) (((w) & (f)) == (f)) 451 452 /** Round \b n up to an even number. */ 453 #define EVEN(n) (((n) + 1U) & -2) /* sign-extending -2 to match n+1U */ 454 455 /** Used for offsets within a single page. 
456 * Since memory pages are typically 4 or 8KB in size, 12-13 bits, 457 * this is plenty. 458 */ 459 typedef uint16_t indx_t; 460 461 /** Default size of memory map. 462 * This is certainly too small for any actual applications. Apps should always set 463 * the size explicitly using #mdb_env_set_mapsize(). 464 */ 465 #define DEFAULT_MAPSIZE 1048576 466 467 /** @defgroup readers Reader Lock Table 468 * Readers don't acquire any locks for their data access. Instead, they 469 * simply record their transaction ID in the reader table. The reader 470 * mutex is needed just to find an empty slot in the reader table. The 471 * slot's address is saved in thread-specific data so that subsequent read 472 * transactions started by the same thread need no further locking to proceed. 473 * 474 * If #MDB_NOTLS is set, the slot address is not saved in thread-specific data. 475 * 476 * No reader table is used if the database is on a read-only filesystem, or 477 * if #MDB_NOLOCK is set. 478 * 479 * Since the database uses multi-version concurrency control, readers don't 480 * actually need any locking. This table is used to keep track of which 481 * readers are using data from which old transactions, so that we'll know 482 * when a particular old transaction is no longer in use. Old transactions 483 * that have discarded any data pages can then have those pages reclaimed 484 * for use by a later write transaction. 485 * 486 * The lock table is constructed such that reader slots are aligned with the 487 * processor's cache line size. Any slot is only ever used by one thread. 488 * This alignment guarantees that there will be no contention or cache 489 * thrashing as threads update their own slot info, and also eliminates 490 * any need for locking when accessing a slot. 491 * 492 * A writer thread will scan every slot in the table to determine the oldest 493 * outstanding reader transaction. Any freed pages older than this will be 494 * reclaimed by the writer. 
The writer doesn't use any locks when scanning 495 * this table. This means that there's no guarantee that the writer will 496 * see the most up-to-date reader info, but that's not required for correct 497 * operation - all we need is to know the upper bound on the oldest reader, 498 * we don't care at all about the newest reader. So the only consequence of 499 * reading stale information here is that old pages might hang around a 500 * while longer before being reclaimed. That's actually good anyway, because 501 * the longer we delay reclaiming old pages, the more likely it is that a 502 * string of contiguous pages can be found after coalescing old pages from 503 * many old transactions together. 504 * @{ 505 */ 506 /** Number of slots in the reader table. 507 * This value was chosen somewhat arbitrarily. 126 readers plus a 508 * couple mutexes fit exactly into 8KB on my development machine. 509 * Applications should set the table size using #mdb_env_set_maxreaders(). 510 */ 511 #define DEFAULT_READERS 126 512 513 /** The size of a CPU cache line in bytes. We want our lock structures 514 * aligned to this size to avoid false cache line sharing in the 515 * lock table. 516 * This value works for most CPUs. For Itanium this should be 128. 517 */ 518 #ifndef CACHELINE 519 #define CACHELINE 64 520 #endif 521 522 /** The information we store in a single slot of the reader table. 523 * In addition to a transaction ID, we also record the process and 524 * thread ID that owns a slot, so that we can detect stale information, 525 * e.g. threads or processes that went away without cleaning up. 526 * @note We currently don't check for stale records. We simply re-init 527 * the table when we know that we're the only process opening the 528 * lock file. 529 */ 530 typedef struct MDB_rxbody { 531 /** Current Transaction ID when this transaction began, or (txnid_t)-1. 532 * Multiple readers that start at the same time will probably have the 533 * same ID here. 
Again, it's not important to exclude them from 534 * anything; all we need to know is which version of the DB they 535 * started from so we can avoid overwriting any data used in that 536 * particular version. 537 */ 538 txnid_t mrb_txnid; 539 /** The process ID of the process owning this reader txn. */ 540 MDB_PID_T mrb_pid; 541 /** The thread ID of the thread owning this txn. */ 542 pthread_t mrb_tid; 543 } MDB_rxbody; 544 545 /** The actual reader record, with cacheline padding. */ 546 typedef struct MDB_reader { 547 union { 548 MDB_rxbody mrx; 549 /** shorthand for mrb_txnid */ 550 #define mr_txnid mru.mrx.mrb_txnid 551 #define mr_pid mru.mrx.mrb_pid 552 #define mr_tid mru.mrx.mrb_tid 553 /** cache line alignment */ 554 char pad[(sizeof(MDB_rxbody)+CACHELINE-1) & ~(CACHELINE-1)]; 555 } mru; 556 } MDB_reader; 557 558 /** The header for the reader table. 559 * The table resides in a memory-mapped file. (This is a different file 560 * than is used for the main database.) 561 * 562 * For POSIX the actual mutexes reside in the shared memory of this 563 * mapped file. On Windows, mutexes are named objects allocated by the 564 * kernel; we store the mutex names in this mapped file so that other 565 * processes can grab them. This same approach is also used on 566 * MacOSX/Darwin (using named semaphores) since MacOSX doesn't support 567 * process-shared POSIX mutexes. For these cases where a named object 568 * is used, the object name is derived from a 64 bit FNV hash of the 569 * environment pathname. As such, naming collisions are extremely 570 * unlikely. If a collision occurs, the results are unpredictable. 571 */ 572 typedef struct MDB_txbody { 573 /** Stamp identifying this as an MDB file. It must be set 574 * to #MDB_MAGIC. */ 575 uint32_t mtb_magic; 576 /** Format of this lock file. Must be set to #MDB_LOCK_FORMAT. 
*/ 577 uint32_t mtb_format; 578 #if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) 579 char mtb_rmname[MNAME_LEN]; 580 #else 581 /** Mutex protecting access to this table. 582 * This is the reader lock that #LOCK_MUTEX_R acquires. 583 */ 584 pthread_mutex_t mtb_mutex; 585 #endif 586 /** The ID of the last transaction committed to the database. 587 * This is recorded here only for convenience; the value can always 588 * be determined by reading the main database meta pages. 589 */ 590 txnid_t mtb_txnid; 591 /** The number of slots that have been used in the reader table. 592 * This always records the maximum count, it is not decremented 593 * when readers release their slots. 594 */ 595 unsigned mtb_numreaders; 596 } MDB_txbody; 597 598 /** The actual reader table definition. */ 599 typedef struct MDB_txninfo { 600 union { 601 MDB_txbody mtb; 602 #define mti_magic mt1.mtb.mtb_magic 603 #define mti_format mt1.mtb.mtb_format 604 #define mti_mutex mt1.mtb.mtb_mutex 605 #define mti_rmname mt1.mtb.mtb_rmname 606 #define mti_txnid mt1.mtb.mtb_txnid 607 #define mti_numreaders mt1.mtb.mtb_numreaders 608 char pad[(sizeof(MDB_txbody)+CACHELINE-1) & ~(CACHELINE-1)]; 609 } mt1; 610 union { 611 #if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) 612 char mt2_wmname[MNAME_LEN]; 613 #define mti_wmname mt2.mt2_wmname 614 #else 615 pthread_mutex_t mt2_wmutex; 616 #define mti_wmutex mt2.mt2_wmutex 617 #endif 618 char pad[(MNAME_LEN+CACHELINE-1) & ~(CACHELINE-1)]; 619 } mt2; 620 MDB_reader mti_readers[1]; 621 } MDB_txninfo; 622 623 /** Lockfile format signature: version, features and field layout */ 624 #define MDB_LOCK_FORMAT \ 625 ((uint32_t) \ 626 ((MDB_LOCK_VERSION) \ 627 /* Flags which describe functionality */ \ 628 + (((MDB_PIDLOCK) != 0) << 16))) 629 /** @} */ 630 631 /** Common header for all page types. 632 * Overflow records occupy a number of contiguous pages with no 633 * headers on any page after the first. 
634 */ 635 typedef struct MDB_page { 636 #define mp_pgno mp_p.p_pgno 637 #define mp_next mp_p.p_next 638 union { 639 pgno_t p_pgno; /**< page number */ 640 void * p_next; /**< for in-memory list of freed structs */ 641 } mp_p; 642 uint16_t mp_pad; 643 /** @defgroup mdb_page Page Flags 644 * @ingroup internal 645 * Flags for the page headers. 646 * @{ 647 */ 648 #define P_BRANCH 0x01 /**< branch page */ 649 #define P_LEAF 0x02 /**< leaf page */ 650 #define P_OVERFLOW 0x04 /**< overflow page */ 651 #define P_META 0x08 /**< meta page */ 652 #define P_DIRTY 0x10 /**< dirty page, also set for #P_SUBP pages */ 653 #define P_LEAF2 0x20 /**< for #MDB_DUPFIXED records */ 654 #define P_SUBP 0x40 /**< for #MDB_DUPSORT sub-pages */ 655 #define P_KEEP 0x8000 /**< leave this page alone during spill */ 656 /** @} */ 657 uint16_t mp_flags; /**< @ref mdb_page */ 658 #define mp_lower mp_pb.pb.pb_lower 659 #define mp_upper mp_pb.pb.pb_upper 660 #define mp_pages mp_pb.pb_pages 661 union { 662 struct { 663 indx_t pb_lower; /**< lower bound of free space */ 664 indx_t pb_upper; /**< upper bound of free space */ 665 } pb; 666 uint32_t pb_pages; /**< number of overflow pages */ 667 } mp_pb; 668 indx_t mp_ptrs[1]; /**< dynamic size */ 669 } MDB_page; 670 671 /** Size of the page header, excluding dynamic data at the end */ 672 #define PAGEHDRSZ ((unsigned) offsetof(MDB_page, mp_ptrs)) 673 674 /** Address of first usable data byte in a page, after the header */ 675 #define METADATA(p) ((void *)((char *)(p) + PAGEHDRSZ)) 676 677 /** Number of nodes on a page */ 678 #define NUMKEYS(p) (((p)->mp_lower - PAGEHDRSZ) >> 1) 679 680 /** The amount of space remaining in the page */ 681 #define SIZELEFT(p) (indx_t)((p)->mp_upper - (p)->mp_lower) 682 683 /** The percentage of space used in the page, in tenths of a percent. 
*/ 684 #define PAGEFILL(env, p) (1000L * ((env)->me_psize - PAGEHDRSZ - SIZELEFT(p)) / \ 685 ((env)->me_psize - PAGEHDRSZ)) 686 /** The minimum page fill factor, in tenths of a percent. 687 * Pages emptier than this are candidates for merging. 688 */ 689 #define FILL_THRESHOLD 250 690 691 /** Test if a page is a leaf page */ 692 #define IS_LEAF(p) F_ISSET((p)->mp_flags, P_LEAF) 693 /** Test if a page is a LEAF2 page */ 694 #define IS_LEAF2(p) F_ISSET((p)->mp_flags, P_LEAF2) 695 /** Test if a page is a branch page */ 696 #define IS_BRANCH(p) F_ISSET((p)->mp_flags, P_BRANCH) 697 /** Test if a page is an overflow page */ 698 #define IS_OVERFLOW(p) F_ISSET((p)->mp_flags, P_OVERFLOW) 699 /** Test if a page is a sub page */ 700 #define IS_SUBP(p) F_ISSET((p)->mp_flags, P_SUBP) 701 702 /** The number of overflow pages needed to store the given size. */ 703 #define OVPAGES(size, psize) ((PAGEHDRSZ-1 + (size)) / (psize) + 1) 704 705 /** Header for a single key/data pair within a page. 706 * Used in pages of type #P_BRANCH and #P_LEAF without #P_LEAF2. 707 * We guarantee 2-byte alignment for 'MDB_node's. 708 */ 709 typedef struct MDB_node { 710 /** lo and hi are used for data size on leaf nodes and for 711 * child pgno on branch nodes. On 64 bit platforms, flags 712 * is also used for pgno. (Branch nodes have no flags). 713 * They are in host byte order in case that lets some 714 * accesses be optimized into a 32-bit word access. 715 */ 716 #if BYTE_ORDER == LITTLE_ENDIAN 717 unsigned short mn_lo, mn_hi; /**< part of data size or pgno */ 718 #else 719 unsigned short mn_hi, mn_lo; 720 #endif 721 /** @defgroup mdb_node Node Flags 722 * @ingroup internal 723 * Flags for node headers. 
724 * @{ 725 */ 726 #define F_BIGDATA 0x01 /**< data put on overflow page */ 727 #define F_SUBDATA 0x02 /**< data is a sub-database */ 728 #define F_DUPDATA 0x04 /**< data has duplicates */ 729 730 /** valid flags for #mdb_node_add() */ 731 #define NODE_ADD_FLAGS (F_DUPDATA|F_SUBDATA|MDB_RESERVE|MDB_APPEND) 732 733 /** @} */ 734 unsigned short mn_flags; /**< @ref mdb_node */ 735 unsigned short mn_ksize; /**< key size */ 736 char mn_data[1]; /**< key and data are appended here */ 737 } MDB_node; 738 739 /** Size of the node header, excluding dynamic data at the end */ 740 #define NODESIZE offsetof(MDB_node, mn_data) 741 742 /** Bit position of top word in page number, for shifting mn_flags */ 743 #define PGNO_TOPWORD ((pgno_t)-1 > 0xffffffffu ? 32 : 0) 744 745 /** Size of a node in a branch page with a given key. 746 * This is just the node header plus the key, there is no data. 747 */ 748 #define INDXSIZE(k) (NODESIZE + ((k) == NULL ? 0 : (k)->mv_size)) 749 750 /** Size of a node in a leaf page with a given key and data. 751 * This is node header plus key plus data size. 752 */ 753 #define LEAFSIZE(k, d) (NODESIZE + (k)->mv_size + (d)->mv_size) 754 755 /** Address of node \b i in page \b p */ 756 #define NODEPTR(p, i) ((MDB_node *)((char *)(p) + (p)->mp_ptrs[i])) 757 758 /** Address of the key for the node */ 759 #define NODEKEY(node) (void *)((node)->mn_data) 760 761 /** Address of the data for a node */ 762 #define NODEDATA(node) (void *)((char *)(node)->mn_data + (node)->mn_ksize) 763 764 /** Get the page number pointed to by a branch node */ 765 #define NODEPGNO(node) \ 766 ((node)->mn_lo | ((pgno_t) (node)->mn_hi << 16) | \ 767 (PGNO_TOPWORD ? 
((pgno_t) (node)->mn_flags << PGNO_TOPWORD) : 0)) 768 /** Set the page number in a branch node */ 769 #define SETPGNO(node,pgno) do { \ 770 (node)->mn_lo = (pgno) & 0xffff; (node)->mn_hi = (pgno) >> 16; \ 771 if (PGNO_TOPWORD) (node)->mn_flags = (pgno) >> PGNO_TOPWORD; } while(0) 772 773 /** Get the size of the data in a leaf node */ 774 #define NODEDSZ(node) ((node)->mn_lo | ((unsigned)(node)->mn_hi << 16)) 775 /** Set the size of the data for a leaf node */ 776 #define SETDSZ(node,size) do { \ 777 (node)->mn_lo = (size) & 0xffff; (node)->mn_hi = (size) >> 16;} while(0) 778 /** The size of a key in a node */ 779 #define NODEKSZ(node) ((node)->mn_ksize) 780 781 /** Copy a page number from src to dst */ 782 #ifdef MISALIGNED_OK 783 #define COPY_PGNO(dst,src) dst = src 784 #else 785 #if SIZE_MAX > 4294967295UL 786 #define COPY_PGNO(dst,src) do { \ 787 unsigned short *s, *d; \ 788 s = (unsigned short *)&(src); \ 789 d = (unsigned short *)&(dst); \ 790 *d++ = *s++; \ 791 *d++ = *s++; \ 792 *d++ = *s++; \ 793 *d = *s; \ 794 } while (0) 795 #else 796 #define COPY_PGNO(dst,src) do { \ 797 unsigned short *s, *d; \ 798 s = (unsigned short *)&(src); \ 799 d = (unsigned short *)&(dst); \ 800 *d++ = *s++; \ 801 *d = *s; \ 802 } while (0) 803 #endif 804 #endif 805 /** The address of a key in a LEAF2 page. 806 * LEAF2 pages are used for #MDB_DUPFIXED sorted-duplicate sub-DBs. 807 * There are no node headers, keys are stored contiguously. 808 */ 809 #define LEAF2KEY(p, i, ks) ((char *)(p) + PAGEHDRSZ + ((i)*(ks))) 810 811 /** Set the \b node's key into \b keyptr, if requested. */ 812 #define MDB_GET_KEY(node, keyptr) { if ((keyptr) != NULL) { \ 813 (keyptr)->mv_size = NODEKSZ(node); (keyptr)->mv_data = NODEKEY(node); } } 814 815 /** Set the \b node's key into \b key. */ 816 #define MDB_GET_KEY2(node, key) { key.mv_size = NODEKSZ(node); key.mv_data = NODEKEY(node); } 817 818 /** Information about a single database in the environment. 
*/ 819 typedef struct MDB_db { 820 uint32_t md_pad; /**< also ksize for LEAF2 pages */ 821 uint16_t md_flags; /**< @ref mdb_dbi_open */ 822 uint16_t md_depth; /**< depth of this tree */ 823 pgno_t md_branch_pages; /**< number of internal pages */ 824 pgno_t md_leaf_pages; /**< number of leaf pages */ 825 pgno_t md_overflow_pages; /**< number of overflow pages */ 826 size_t md_entries; /**< number of data items */ 827 pgno_t md_root; /**< the root page of this tree */ 828 } MDB_db; 829 830 /** mdb_dbi_open flags */ 831 #define MDB_VALID 0x8000 /**< DB handle is valid, for me_dbflags */ 832 #define PERSISTENT_FLAGS (0xffff & ~(MDB_VALID)) 833 #define VALID_FLAGS (MDB_REVERSEKEY|MDB_DUPSORT|MDB_INTEGERKEY|MDB_DUPFIXED|\ 834 MDB_INTEGERDUP|MDB_REVERSEDUP|MDB_CREATE) 835 836 /** Handle for the DB used to track free pages. */ 837 #define FREE_DBI 0 838 /** Handle for the default DB. */ 839 #define MAIN_DBI 1 840 841 /** Meta page content. 842 * A meta page is the start point for accessing a database snapshot. 843 * Pages 0-1 are meta pages. Transaction N writes meta page #(N % 2). 844 */ 845 typedef struct MDB_meta { 846 /** Stamp identifying this as an MDB file. It must be set 847 * to #MDB_MAGIC. */ 848 uint32_t mm_magic; 849 /** Version number of this lock file. Must be set to #MDB_DATA_VERSION. */ 850 uint32_t mm_version; 851 void *mm_address; /**< address for fixed mapping */ 852 size_t mm_mapsize; /**< size of mmap region */ 853 MDB_db mm_dbs[2]; /**< first is free space, 2nd is main db */ 854 /** The size of pages used in this DB */ 855 #define mm_psize mm_dbs[0].md_pad 856 /** Any persistent environment flags. @ref mdb_env */ 857 #define mm_flags mm_dbs[0].md_flags 858 pgno_t mm_last_pg; /**< last used page in file */ 859 txnid_t mm_txnid; /**< txnid that committed this page */ 860 } MDB_meta; 861 862 /** Buffer for a stack-allocated meta page. 863 * The members define size and alignment, and silence type 864 * aliasing warnings. 
They are not used directly; that could 865 * mean incorrectly using several union members in parallel. 866 */ 867 typedef union MDB_metabuf { 868 MDB_page mb_page; 869 struct { 870 char mm_pad[PAGEHDRSZ]; 871 MDB_meta mm_meta; 872 } mb_metabuf; 873 } MDB_metabuf; 874 875 /** Auxiliary DB info. 876 * The information here is mostly static/read-only. There is 877 * only a single copy of this record in the environment. 878 */ 879 typedef struct MDB_dbx { 880 MDB_val md_name; /**< name of the database */ 881 MDB_cmp_func *md_cmp; /**< function for comparing keys */ 882 MDB_cmp_func *md_dcmp; /**< function for comparing data items */ 883 MDB_rel_func *md_rel; /**< user relocate function */ 884 void *md_relctx; /**< user-provided context for md_rel */ 885 } MDB_dbx; 886 887 /** A database transaction. 888 * Every operation requires a transaction handle. 889 */ 890 struct MDB_txn { 891 MDB_txn *mt_parent; /**< parent of a nested txn */ 892 MDB_txn *mt_child; /**< nested txn under this txn */ 893 pgno_t mt_next_pgno; /**< next unallocated page */ 894 /** The ID of this transaction. IDs are integers incrementing from 1. 895 * Only committed write transactions increment the ID. If a transaction 896 * aborts, the ID may be re-used by the next writer. 897 */ 898 txnid_t mt_txnid; 899 MDB_env *mt_env; /**< the DB environment */ 900 /** The list of pages that became unused during this transaction. 901 */ 902 MDB_IDL mt_free_pgs; 903 /** The sorted list of dirty pages we temporarily wrote to disk 904 * because the dirty list was full. page numbers in here are 905 * shifted left by 1, deleted slots have the LSB set. 906 */ 907 MDB_IDL mt_spill_pgs; 908 union { 909 /** For write txns: Modified pages. Sorted when not MDB_WRITEMAP. */ 910 MDB_ID2L dirty_list; 911 /** For read txns: This thread/txn's reader table slot, or NULL. */ 912 MDB_reader *reader; 913 } mt_u; 914 /** Array of records for each DB known in the environment. 
*/ 915 MDB_dbx *mt_dbxs; 916 /** Array of MDB_db records for each known DB */ 917 MDB_db *mt_dbs; 918 /** @defgroup mt_dbflag Transaction DB Flags 919 * @ingroup internal 920 * @{ 921 */ 922 #define DB_DIRTY 0x01 /**< DB was modified or is DUPSORT data */ 923 #define DB_STALE 0x02 /**< Named-DB record is older than txnID */ 924 #define DB_NEW 0x04 /**< Named-DB handle opened in this txn */ 925 #define DB_VALID 0x08 /**< DB handle is valid, see also #MDB_VALID */ 926 /** @} */ 927 /** In write txns, array of cursors for each DB */ 928 MDB_cursor **mt_cursors; 929 /** Array of flags for each DB */ 930 unsigned char *mt_dbflags; 931 /** Number of DB records in use. This number only ever increments; 932 * we don't decrement it when individual DB handles are closed. 933 */ 934 MDB_dbi mt_numdbs; 935 936 /** @defgroup mdb_txn Transaction Flags 937 * @ingroup internal 938 * @{ 939 */ 940 #define MDB_TXN_RDONLY 0x01 /**< read-only transaction */ 941 #define MDB_TXN_ERROR 0x02 /**< an error has occurred */ 942 #define MDB_TXN_DIRTY 0x04 /**< must write, even if dirty list is empty */ 943 #define MDB_TXN_SPILLS 0x08 /**< txn or a parent has spilled pages */ 944 /** @} */ 945 unsigned int mt_flags; /**< @ref mdb_txn */ 946 /** dirty_list room: Array size - #dirty pages visible to this txn. 947 * Includes ancestor txns' dirty pages not hidden by other txns' 948 * dirty/spilled pages. Thus commit(nested txn) has room to merge 949 * dirty_list into mt_parent after freeing hidden mt_parent pages. 950 */ 951 unsigned int mt_dirty_room; 952 }; 953 954 /** Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty. 955 * At 4 keys per node, enough for 2^64 nodes, so there's probably no need to 956 * raise this on a 64 bit machine. 957 */ 958 #define CURSOR_STACK 32 959 960 struct MDB_xcursor; 961 962 /** Cursors are used for all DB operations. 963 * A cursor holds a path of (page pointer, key index) from the DB 964 * root to a position in the DB, plus other state. 
#MDB_DUPSORT 965 * cursors include an xcursor to the current data item. Write txns 966 * track their cursors and keep them up to date when data moves. 967 * Exception: An xcursor's pointer to a #P_SUBP page can be stale. 968 * (A node with #F_DUPDATA but no #F_SUBDATA contains a subpage). 969 */ 970 struct MDB_cursor { 971 /** Next cursor on this DB in this txn */ 972 MDB_cursor *mc_next; 973 /** Backup of the original cursor if this cursor is a shadow */ 974 MDB_cursor *mc_backup; 975 /** Context used for databases with #MDB_DUPSORT, otherwise NULL */ 976 struct MDB_xcursor *mc_xcursor; 977 /** The transaction that owns this cursor */ 978 MDB_txn *mc_txn; 979 /** The database handle this cursor operates on */ 980 MDB_dbi mc_dbi; 981 /** The database record for this cursor */ 982 MDB_db *mc_db; 983 /** The database auxiliary record for this cursor */ 984 MDB_dbx *mc_dbx; 985 /** The @ref mt_dbflag for this database */ 986 unsigned char *mc_dbflag; 987 unsigned short mc_snum; /**< number of pushed pages */ 988 unsigned short mc_top; /**< index of top page, normally mc_snum-1 */ 989 /** @defgroup mdb_cursor Cursor Flags 990 * @ingroup internal 991 * Cursor state flags. 992 * @{ 993 */ 994 #define C_INITIALIZED 0x01 /**< cursor has been initialized and is valid */ 995 #define C_EOF 0x02 /**< No more data */ 996 #define C_SUB 0x04 /**< Cursor is a sub-cursor */ 997 #define C_DEL 0x08 /**< last op was a cursor_del */ 998 #define C_SPLITTING 0x20 /**< Cursor is in page_split */ 999 #define C_UNTRACK 0x40 /**< Un-track cursor when closing */ 1000 /** @} */ 1001 unsigned int mc_flags; /**< @ref mdb_cursor */ 1002 MDB_page *mc_pg[CURSOR_STACK]; /**< stack of pushed pages */ 1003 indx_t mc_ki[CURSOR_STACK]; /**< stack of page indices */ 1004 }; 1005 1006 /** Context for sorted-dup records. 1007 * We could have gone to a fully recursive design, with arbitrarily 1008 * deep nesting of sub-databases. 
But for now we only handle these 1009 * levels - main DB, optional sub-DB, sorted-duplicate DB. 1010 */ 1011 typedef struct MDB_xcursor { 1012 /** A sub-cursor for traversing the Dup DB */ 1013 MDB_cursor mx_cursor; 1014 /** The database record for this Dup DB */ 1015 MDB_db mx_db; 1016 /** The auxiliary DB record for this Dup DB */ 1017 MDB_dbx mx_dbx; 1018 /** The @ref mt_dbflag for this Dup DB */ 1019 unsigned char mx_dbflag; 1020 } MDB_xcursor; 1021 1022 /** State of FreeDB old pages, stored in the MDB_env */ 1023 typedef struct MDB_pgstate { 1024 pgno_t *mf_pghead; /**< Reclaimed freeDB pages, or NULL before use */ 1025 txnid_t mf_pglast; /**< ID of last used record, or 0 if !mf_pghead */ 1026 } MDB_pgstate; 1027 1028 /** The database environment. */ 1029 struct MDB_env { 1030 HANDLE me_fd; /**< The main data file */ 1031 HANDLE me_lfd; /**< The lock file */ 1032 HANDLE me_mfd; /**< just for writing the meta pages */ 1033 /** Failed to update the meta page. Probably an I/O error. */ 1034 #define MDB_FATAL_ERROR 0x80000000U 1035 /** Some fields are initialized. 
*/ 1036 #define MDB_ENV_ACTIVE 0x20000000U 1037 /** me_txkey is set */ 1038 #define MDB_ENV_TXKEY 0x10000000U 1039 /** Have liveness lock in reader table */ 1040 #define MDB_LIVE_READER 0x08000000U 1041 uint32_t me_flags; /**< @ref mdb_env */ 1042 unsigned int me_psize; /**< DB page size, inited from me_os_psize */ 1043 unsigned int me_os_psize; /**< OS page size, from #GET_PAGESIZE */ 1044 unsigned int me_maxreaders; /**< size of the reader table */ 1045 unsigned int me_numreaders; /**< max numreaders set by this env */ 1046 MDB_dbi me_numdbs; /**< number of DBs opened */ 1047 MDB_dbi me_maxdbs; /**< size of the DB table */ 1048 MDB_PID_T me_pid; /**< process ID of this env */ 1049 char *me_path; /**< path to the DB files */ 1050 char *me_map; /**< the memory map of the data file */ 1051 MDB_txninfo *me_txns; /**< the memory map of the lock file or NULL */ 1052 MDB_meta *me_metas[2]; /**< pointers to the two meta pages */ 1053 void *me_pbuf; /**< scratch area for DUPSORT put() */ 1054 MDB_txn *me_txn; /**< current write transaction */ 1055 size_t me_mapsize; /**< size of the data memory map */ 1056 off_t me_size; /**< current file size */ 1057 pgno_t me_maxpg; /**< me_mapsize / me_psize */ 1058 MDB_dbx *me_dbxs; /**< array of static DB info */ 1059 uint16_t *me_dbflags; /**< array of flags from MDB_db.md_flags */ 1060 pthread_key_t me_txkey; /**< thread-key for readers */ 1061 MDB_pgstate me_pgstate; /**< state of old pages from freeDB */ 1062 # define me_pglast me_pgstate.mf_pglast 1063 # define me_pghead me_pgstate.mf_pghead 1064 MDB_page *me_dpages; /**< list of malloc'd blocks for re-use */ 1065 /** IDL of pages that became unused in a write txn */ 1066 MDB_IDL me_free_pgs; 1067 /** ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE. 
*/ 1068 MDB_ID2L me_dirty_list; 1069 /** Max number of freelist items that can fit in a single overflow page */ 1070 int me_maxfree_1pg; 1071 /** Max size of a node on a page */ 1072 unsigned int me_nodemax; 1073 #if !(MDB_MAXKEYSIZE) 1074 unsigned int me_maxkey; /**< max size of a key */ 1075 #endif 1076 #ifdef _WIN32 1077 int me_pidquery; /**< Used in OpenProcess */ 1078 HANDLE me_rmutex; /* Windows mutexes don't reside in shared mem */ 1079 HANDLE me_wmutex; 1080 #elif defined(MDB_USE_POSIX_SEM) 1081 sem_t *me_rmutex; /* Shared mutexes are not supported */ 1082 sem_t *me_wmutex; 1083 #endif 1084 void *me_userctx; /**< User-settable context */ 1085 MDB_assert_func *me_assert_func; /**< Callback for assertion failures */ 1086 }; 1087 1088 /** Nested transaction */ 1089 typedef struct MDB_ntxn { 1090 MDB_txn mnt_txn; /**< the transaction */ 1091 MDB_pgstate mnt_pgstate; /**< parent transaction's saved freestate */ 1092 } MDB_ntxn; 1093 1094 /** max number of pages to commit in one writev() call */ 1095 #define MDB_COMMIT_PAGES 64 1096 #if defined(IOV_MAX) && IOV_MAX < MDB_COMMIT_PAGES 1097 #undef MDB_COMMIT_PAGES 1098 #define MDB_COMMIT_PAGES IOV_MAX 1099 #endif 1100 1101 /* max bytes to write in one call */ 1102 #define MAX_WRITE (0x80000000U >> (sizeof(ssize_t) == 4)) 1103 1104 static int mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp); 1105 static int mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp); 1106 static int mdb_page_touch(MDB_cursor *mc); 1107 1108 static int mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **mp, int *lvl); 1109 static int mdb_page_search_root(MDB_cursor *mc, 1110 MDB_val *key, int modify); 1111 #define MDB_PS_MODIFY 1 1112 #define MDB_PS_ROOTONLY 2 1113 #define MDB_PS_FIRST 4 1114 #define MDB_PS_LAST 8 1115 static int mdb_page_search(MDB_cursor *mc, 1116 MDB_val *key, int flags); 1117 static int mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst); 1118 1119 #define MDB_SPLIT_REPLACE MDB_APPENDDUP /**< newkey 
is not new */ 1120 static int mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, 1121 pgno_t newpgno, unsigned int nflags); 1122 1123 static int mdb_env_read_header(MDB_env *env, MDB_meta *meta); 1124 static int mdb_env_pick_meta(const MDB_env *env); 1125 static int mdb_env_write_meta(MDB_txn *txn); 1126 #if !(defined(_WIN32) || defined(MDB_USE_POSIX_SEM)) /* Drop unused excl arg */ 1127 # define mdb_env_close0(env, excl) mdb_env_close1(env) 1128 #endif 1129 static void mdb_env_close0(MDB_env *env, int excl); 1130 1131 static MDB_node *mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp); 1132 static int mdb_node_add(MDB_cursor *mc, indx_t indx, 1133 MDB_val *key, MDB_val *data, pgno_t pgno, unsigned int flags); 1134 static void mdb_node_del(MDB_cursor *mc, int ksize); 1135 static void mdb_node_shrink(MDB_page *mp, indx_t indx); 1136 static int mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst); 1137 static int mdb_node_read(MDB_txn *txn, MDB_node *leaf, MDB_val *data); 1138 static size_t mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data); 1139 static size_t mdb_branch_size(MDB_env *env, MDB_val *key); 1140 1141 static int mdb_rebalance(MDB_cursor *mc); 1142 static int mdb_update_key(MDB_cursor *mc, MDB_val *key); 1143 1144 static void mdb_cursor_pop(MDB_cursor *mc); 1145 static int mdb_cursor_push(MDB_cursor *mc, MDB_page *mp); 1146 1147 static int mdb_cursor_del0(MDB_cursor *mc, MDB_node *leaf); 1148 static int mdb_cursor_sibling(MDB_cursor *mc, int move_right); 1149 static int mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op); 1150 static int mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op); 1151 static int mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op, 1152 int *exactp); 1153 static int mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data); 1154 static int mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data); 1155 1156 static void 
mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx); 1157 static void mdb_xcursor_init0(MDB_cursor *mc); 1158 static void mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node); 1159 1160 static int mdb_drop0(MDB_cursor *mc, int subs); 1161 static void mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi); 1162 1163 /** @cond */ 1164 static MDB_cmp_func mdb_cmp_memn, mdb_cmp_memnr, mdb_cmp_int, mdb_cmp_cint, mdb_cmp_long; 1165 /** @endcond */ 1166 1167 #ifdef _WIN32 1168 static SECURITY_DESCRIPTOR mdb_null_sd; 1169 static SECURITY_ATTRIBUTES mdb_all_sa; 1170 static int mdb_sec_inited; 1171 #endif 1172 1173 /** Return the library version info. */ 1174 char * 1175 mdb_version(int *major, int *minor, int *patch) 1176 { 1177 if (major) *major = MDB_VERSION_MAJOR; 1178 if (minor) *minor = MDB_VERSION_MINOR; 1179 if (patch) *patch = MDB_VERSION_PATCH; 1180 return MDB_VERSION_STRING; 1181 } 1182 1183 /** Table of descriptions for MDB @ref errors */ 1184 static char *const mdb_errstr[] = { 1185 "MDB_KEYEXIST: Key/data pair already exists", 1186 "MDB_NOTFOUND: No matching key/data pair found", 1187 "MDB_PAGE_NOTFOUND: Requested page not found", 1188 "MDB_CORRUPTED: Located page was wrong type", 1189 "MDB_PANIC: Update of meta page failed", 1190 "MDB_VERSION_MISMATCH: Database environment version mismatch", 1191 "MDB_INVALID: File is not an MDB file", 1192 "MDB_MAP_FULL: Environment mapsize limit reached", 1193 "MDB_DBS_FULL: Environment maxdbs limit reached", 1194 "MDB_READERS_FULL: Environment maxreaders limit reached", 1195 "MDB_TLS_FULL: Thread-local storage keys full - too many environments open", 1196 "MDB_TXN_FULL: Transaction has too many dirty pages - transaction too big", 1197 "MDB_CURSOR_FULL: Internal error - cursor stack limit reached", 1198 "MDB_PAGE_FULL: Internal error - page has no more space", 1199 "MDB_MAP_RESIZED: Database contents grew beyond environment mapsize", 1200 "MDB_INCOMPATIBLE: Operation and DB incompatible, or DB flags changed", 1201 
"MDB_BAD_RSLOT: Invalid reuse of reader locktable slot", 1202 "MDB_BAD_TXN: Transaction cannot recover - it must be aborted", 1203 "MDB_BAD_VALSIZE: Too big key/data, key is empty, or wrong DUPFIXED size", 1204 }; 1205 1206 char * 1207 mdb_strerror(int err) 1208 { 1209 int i; 1210 if (!err) 1211 return ("Successful return: 0"); 1212 1213 if (err >= MDB_KEYEXIST && err <= MDB_LAST_ERRCODE) { 1214 i = err - MDB_KEYEXIST; 1215 return mdb_errstr[i]; 1216 } 1217 1218 return strerror(err); 1219 } 1220 1221 /** assert(3) variant in cursor context */ 1222 #define mdb_cassert(mc, expr) mdb_assert0((mc)->mc_txn->mt_env, expr, #expr) 1223 /** assert(3) variant in transaction context */ 1224 #define mdb_tassert(mc, expr) mdb_assert0((txn)->mt_env, expr, #expr) 1225 /** assert(3) variant in environment context */ 1226 #define mdb_eassert(env, expr) mdb_assert0(env, expr, #expr) 1227 1228 #ifndef NDEBUG 1229 # define mdb_assert0(env, expr, expr_txt) ((expr) ? (void)0 : \ 1230 mdb_assert_fail(env, expr_txt, mdb_func_, __FILE__, __LINE__)) 1231 1232 static void 1233 mdb_assert_fail(MDB_env *env, const char *expr_txt, 1234 const char *func, const char *file, int line) 1235 { 1236 char buf[400]; 1237 sprintf(buf, "%.100s:%d: Assertion '%.200s' failed in %.40s()", 1238 file, line, expr_txt, func); 1239 if (env->me_assert_func) 1240 env->me_assert_func(env, buf); 1241 fprintf(stderr, "%s\n", buf); 1242 abort(); 1243 } 1244 #else 1245 # define mdb_assert0(env, expr, expr_txt) ((void) 0) 1246 #endif /* NDEBUG */ 1247 1248 #if MDB_DEBUG 1249 /** Return the page number of \b mp which may be sub-page, for debug output */ 1250 static pgno_t 1251 mdb_dbg_pgno(MDB_page *mp) 1252 { 1253 pgno_t ret; 1254 COPY_PGNO(ret, mp->mp_pgno); 1255 return ret; 1256 } 1257 1258 /** Display a key in hexadecimal and return the address of the result. 1259 * @param[in] key the key to display 1260 * @param[in] buf the buffer to write into. Should always be #DKBUF. 1261 * @return The key in hexadecimal form. 
1262 */ 1263 char * 1264 mdb_dkey(MDB_val *key, char *buf) 1265 { 1266 char *ptr = buf; 1267 unsigned char *c = key->mv_data; 1268 unsigned int i; 1269 1270 if (!key) 1271 return ""; 1272 1273 if (key->mv_size > DKBUF_MAXKEYSIZE) 1274 return "MDB_MAXKEYSIZE"; 1275 /* may want to make this a dynamic check: if the key is mostly 1276 * printable characters, print it as-is instead of converting to hex. 1277 */ 1278 #if 1 1279 buf[0] = '\0'; 1280 for (i=0; i<key->mv_size; i++) 1281 ptr += sprintf(ptr, "%02x", *c++); 1282 #else 1283 sprintf(buf, "%.*s", key->mv_size, key->mv_data); 1284 #endif 1285 return buf; 1286 } 1287 1288 /** Display all the keys in the page. */ 1289 void 1290 mdb_page_list(MDB_page *mp) 1291 { 1292 MDB_node *node; 1293 unsigned int i, nkeys, nsize, total = 0; 1294 MDB_val key; 1295 DKBUF; 1296 1297 nkeys = NUMKEYS(mp); 1298 fprintf(stderr, "Page %"Z"u numkeys %d\n", mdb_dbg_pgno(mp), nkeys); 1299 for (i=0; i<nkeys; i++) { 1300 node = NODEPTR(mp, i); 1301 key.mv_size = node->mn_ksize; 1302 key.mv_data = node->mn_data; 1303 nsize = NODESIZE + key.mv_size; 1304 if (IS_BRANCH(mp)) { 1305 fprintf(stderr, "key %d: page %"Z"u, %s\n", i, NODEPGNO(node), 1306 DKEY(&key)); 1307 total += nsize; 1308 } else { 1309 if (F_ISSET(node->mn_flags, F_BIGDATA)) 1310 nsize += sizeof(pgno_t); 1311 else 1312 nsize += NODEDSZ(node); 1313 total += nsize; 1314 nsize += sizeof(indx_t); 1315 fprintf(stderr, "key %d: nsize %d, %s\n", i, nsize, DKEY(&key)); 1316 } 1317 total = EVEN(total); 1318 } 1319 fprintf(stderr, "Total: %d\n", total); 1320 } 1321 1322 void 1323 mdb_cursor_chk(MDB_cursor *mc) 1324 { 1325 unsigned int i; 1326 MDB_node *node; 1327 MDB_page *mp; 1328 1329 if (!mc->mc_snum && !(mc->mc_flags & C_INITIALIZED)) return; 1330 for (i=0; i<mc->mc_top; i++) { 1331 mp = mc->mc_pg[i]; 1332 node = NODEPTR(mp, mc->mc_ki[i]); 1333 if (NODEPGNO(node) != mc->mc_pg[i+1]->mp_pgno) 1334 printf("oops!\n"); 1335 } 1336 if (mc->mc_ki[i] >= NUMKEYS(mc->mc_pg[i])) 1337 
printf("ack!\n"); 1338 } 1339 #endif 1340 1341 #if (MDB_DEBUG) > 2 1342 /** Count all the pages in each DB and in the freelist 1343 * and make sure it matches the actual number of pages 1344 * being used. 1345 */ 1346 static void mdb_audit(MDB_txn *txn) 1347 { 1348 MDB_cursor mc; 1349 MDB_val key, data; 1350 MDB_ID freecount, count; 1351 MDB_dbi i; 1352 int rc; 1353 1354 freecount = 0; 1355 mdb_cursor_init(&mc, txn, FREE_DBI, NULL); 1356 while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0) 1357 freecount += *(MDB_ID *)data.mv_data; 1358 1359 count = 0; 1360 for (i = 0; i<txn->mt_numdbs; i++) { 1361 MDB_xcursor mx; 1362 mdb_cursor_init(&mc, txn, i, &mx); 1363 if (txn->mt_dbs[i].md_root == P_INVALID) 1364 continue; 1365 count += txn->mt_dbs[i].md_branch_pages + 1366 txn->mt_dbs[i].md_leaf_pages + 1367 txn->mt_dbs[i].md_overflow_pages; 1368 if (txn->mt_dbs[i].md_flags & MDB_DUPSORT) { 1369 mdb_page_search(&mc, NULL, MDB_PS_FIRST); 1370 do { 1371 unsigned j; 1372 MDB_page *mp; 1373 mp = mc.mc_pg[mc.mc_top]; 1374 for (j=0; j<NUMKEYS(mp); j++) { 1375 MDB_node *leaf = NODEPTR(mp, j); 1376 if (leaf->mn_flags & F_SUBDATA) { 1377 MDB_db db; 1378 memcpy(&db, NODEDATA(leaf), sizeof(db)); 1379 count += db.md_branch_pages + db.md_leaf_pages + 1380 db.md_overflow_pages; 1381 } 1382 } 1383 } 1384 while (mdb_cursor_sibling(&mc, 1) == 0); 1385 } 1386 } 1387 if (freecount + count + 2 /* metapages */ != txn->mt_next_pgno) { 1388 fprintf(stderr, "audit: %lu freecount: %lu count: %lu total: %lu next_pgno: %lu\n", 1389 txn->mt_txnid, freecount, count+2, freecount+count+2, txn->mt_next_pgno); 1390 } 1391 } 1392 #endif 1393 1394 int 1395 mdb_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) 1396 { 1397 return txn->mt_dbxs[dbi].md_cmp(a, b); 1398 } 1399 1400 int 1401 mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) 1402 { 1403 return txn->mt_dbxs[dbi].md_dcmp(a, b); 1404 } 1405 1406 /** Allocate memory for a page. 
1407 * Re-use old malloc'd pages first for singletons, otherwise just malloc. 1408 */ 1409 static MDB_page * 1410 mdb_page_malloc(MDB_txn *txn, unsigned num) 1411 { 1412 MDB_env *env = txn->mt_env; 1413 MDB_page *ret = env->me_dpages; 1414 size_t psize = env->me_psize, sz = psize, off; 1415 /* For ! #MDB_NOMEMINIT, psize counts how much to init. 1416 * For a single page alloc, we init everything after the page header. 1417 * For multi-page, we init the final page; if the caller needed that 1418 * many pages they will be filling in at least up to the last page. 1419 */ 1420 if (num == 1) { 1421 if (ret) { 1422 VGMEMP_ALLOC(env, ret, sz); 1423 VGMEMP_DEFINED(ret, sizeof(ret->mp_next)); 1424 env->me_dpages = ret->mp_next; 1425 return ret; 1426 } 1427 psize -= off = PAGEHDRSZ; 1428 } else { 1429 sz *= num; 1430 off = sz - psize; 1431 } 1432 if ((ret = malloc(sz)) != NULL) { 1433 VGMEMP_ALLOC(env, ret, sz); 1434 if (!(env->me_flags & MDB_NOMEMINIT)) { 1435 memset((char *)ret + off, 0, psize); 1436 ret->mp_pad = 0; 1437 } 1438 } else { 1439 txn->mt_flags |= MDB_TXN_ERROR; 1440 } 1441 return ret; 1442 } 1443 1444 /** Free a single page. 1445 * Saves single pages to a list, for future reuse. 1446 * (This is not used for multi-page overflow pages.) 
1447 */ 1448 static void 1449 mdb_page_free(MDB_env *env, MDB_page *mp) 1450 { 1451 mp->mp_next = env->me_dpages; 1452 VGMEMP_FREE(env, mp); 1453 env->me_dpages = mp; 1454 } 1455 1456 /** Free a dirty page */ 1457 static void 1458 mdb_dpage_free(MDB_env *env, MDB_page *dp) 1459 { 1460 if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) { 1461 mdb_page_free(env, dp); 1462 } else { 1463 /* large pages just get freed directly */ 1464 VGMEMP_FREE(env, dp); 1465 free(dp); 1466 } 1467 } 1468 1469 /** Return all dirty pages to dpage list */ 1470 static void 1471 mdb_dlist_free(MDB_txn *txn) 1472 { 1473 MDB_env *env = txn->mt_env; 1474 MDB_ID2L dl = txn->mt_u.dirty_list; 1475 unsigned i, n = dl[0].mid; 1476 1477 for (i = 1; i <= n; i++) { 1478 mdb_dpage_free(env, dl[i].mptr); 1479 } 1480 dl[0].mid = 0; 1481 } 1482 1483 /** Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn. 1484 * @param[in] mc A cursor handle for the current operation. 1485 * @param[in] pflags Flags of the pages to update: 1486 * P_DIRTY to set P_KEEP, P_DIRTY|P_KEEP to clear it. 1487 * @param[in] all No shortcuts. Needed except after a full #mdb_page_flush(). 1488 * @return 0 on success, non-zero on failure. 
1489 */ 1490 static int 1491 mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all) 1492 { 1493 enum { Mask = P_SUBP|P_DIRTY|P_KEEP }; 1494 MDB_txn *txn = mc->mc_txn; 1495 MDB_cursor *m3; 1496 MDB_xcursor *mx; 1497 MDB_page *dp, *mp; 1498 MDB_node *leaf; 1499 unsigned i, j; 1500 int rc = MDB_SUCCESS, level; 1501 1502 /* Mark pages seen by cursors */ 1503 if (mc->mc_flags & C_UNTRACK) 1504 mc = NULL; /* will find mc in mt_cursors */ 1505 for (i = txn->mt_numdbs;; mc = txn->mt_cursors[--i]) { 1506 for (; mc; mc=mc->mc_next) { 1507 if (!(mc->mc_flags & C_INITIALIZED)) 1508 continue; 1509 for (m3 = mc;; m3 = &mx->mx_cursor) { 1510 mp = NULL; 1511 for (j=0; j<m3->mc_snum; j++) { 1512 mp = m3->mc_pg[j]; 1513 if ((mp->mp_flags & Mask) == pflags) 1514 mp->mp_flags ^= P_KEEP; 1515 } 1516 mx = m3->mc_xcursor; 1517 /* Proceed to mx if it is at a sub-database */ 1518 if (! (mx && (mx->mx_cursor.mc_flags & C_INITIALIZED))) 1519 break; 1520 if (! (mp && (mp->mp_flags & P_LEAF))) 1521 break; 1522 leaf = NODEPTR(mp, m3->mc_ki[j-1]); 1523 if (!(leaf->mn_flags & F_SUBDATA)) 1524 break; 1525 } 1526 } 1527 if (i == 0) 1528 break; 1529 } 1530 1531 if (all) { 1532 /* Mark dirty root pages */ 1533 for (i=0; i<txn->mt_numdbs; i++) { 1534 if (txn->mt_dbflags[i] & DB_DIRTY) { 1535 pgno_t pgno = txn->mt_dbs[i].md_root; 1536 if (pgno == P_INVALID) 1537 continue; 1538 if ((rc = mdb_page_get(txn, pgno, &dp, &level)) != MDB_SUCCESS) 1539 break; 1540 if ((dp->mp_flags & Mask) == pflags && level <= 1) 1541 dp->mp_flags ^= P_KEEP; 1542 } 1543 } 1544 } 1545 1546 return rc; 1547 } 1548 1549 static int mdb_page_flush(MDB_txn *txn, int keep); 1550 1551 /** Spill pages from the dirty list back to disk. 1552 * This is intended to prevent running into #MDB_TXN_FULL situations, 1553 * but note that they may still occur in a few cases: 1554 * 1) our estimate of the txn size could be too small. Currently this 1555 * seems unlikely, except with a large number of #MDB_MULTIPLE items. 
1556 * 2) child txns may run out of space if their parents dirtied a 1557 * lot of pages and never spilled them. TODO: we probably should do 1558 * a preemptive spill during #mdb_txn_begin() of a child txn, if 1559 * the parent's dirty_room is below a given threshold. 1560 * 1561 * Otherwise, if not using nested txns, it is expected that apps will 1562 * not run into #MDB_TXN_FULL any more. The pages are flushed to disk 1563 * the same way as for a txn commit, e.g. their P_DIRTY flag is cleared. 1564 * If the txn never references them again, they can be left alone. 1565 * If the txn only reads them, they can be used without any fuss. 1566 * If the txn writes them again, they can be dirtied immediately without 1567 * going thru all of the work of #mdb_page_touch(). Such references are 1568 * handled by #mdb_page_unspill(). 1569 * 1570 * Also note, we never spill DB root pages, nor pages of active cursors, 1571 * because we'll need these back again soon anyway. And in nested txns, 1572 * we can't spill a page in a child txn if it was already spilled in a 1573 * parent txn. That would alter the parent txns' data even though 1574 * the child hasn't committed yet, and we'd have no way to undo it if 1575 * the child aborted. 1576 * 1577 * @param[in] m0 cursor A cursor handle identifying the transaction and 1578 * database for which we are checking space. 1579 * @param[in] key For a put operation, the key being stored. 1580 * @param[in] data For a put operation, the data being stored. 1581 * @return 0 on success, non-zero on failure. 
1582 */ 1583 static int 1584 mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) 1585 { 1586 MDB_txn *txn = m0->mc_txn; 1587 MDB_page *dp; 1588 MDB_ID2L dl = txn->mt_u.dirty_list; 1589 unsigned int i, j, need; 1590 int rc; 1591 1592 if (m0->mc_flags & C_SUB) 1593 return MDB_SUCCESS; 1594 1595 /* Estimate how much space this op will take */ 1596 i = m0->mc_db->md_depth; 1597 /* Named DBs also dirty the main DB */ 1598 if (m0->mc_dbi > MAIN_DBI) 1599 i += txn->mt_dbs[MAIN_DBI].md_depth; 1600 /* For puts, roughly factor in the key+data size */ 1601 if (key) 1602 i += (LEAFSIZE(key, data) + txn->mt_env->me_psize) / txn->mt_env->me_psize; 1603 i += i; /* double it for good measure */ 1604 need = i; 1605 1606 if (txn->mt_dirty_room > i) 1607 return MDB_SUCCESS; 1608 1609 if (!txn->mt_spill_pgs) { 1610 txn->mt_spill_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX); 1611 if (!txn->mt_spill_pgs) 1612 return ENOMEM; 1613 } else { 1614 /* purge deleted slots */ 1615 MDB_IDL sl = txn->mt_spill_pgs; 1616 unsigned int num = sl[0]; 1617 j=0; 1618 for (i=1; i<=num; i++) { 1619 if (!(sl[i] & 1)) 1620 sl[++j] = sl[i]; 1621 } 1622 sl[0] = j; 1623 } 1624 1625 /* Preserve pages which may soon be dirtied again */ 1626 if ((rc = mdb_pages_xkeep(m0, P_DIRTY, 1)) != MDB_SUCCESS) 1627 goto done; 1628 1629 /* Less aggressive spill - we originally spilled the entire dirty list, 1630 * with a few exceptions for cursor pages and DB root pages. But this 1631 * turns out to be a lot of wasted effort because in a large txn many 1632 * of those pages will need to be used again. So now we spill only 1/8th 1633 * of the dirty pages. Testing revealed this to be a good tradeoff, 1634 * better than 1/2, 1/4, or 1/10. 1635 */ 1636 if (need < MDB_IDL_UM_MAX / 8) 1637 need = MDB_IDL_UM_MAX / 8; 1638 1639 /* Save the page IDs of all the pages we're flushing */ 1640 /* flush from the tail forward, this saves a lot of shifting later on. 
*/ 1641 for (i=dl[0].mid; i && need; i--) { 1642 MDB_ID pn = dl[i].mid << 1; 1643 dp = dl[i].mptr; 1644 if (dp->mp_flags & P_KEEP) 1645 continue; 1646 /* Can't spill twice, make sure it's not already in a parent's 1647 * spill list. 1648 */ 1649 if (txn->mt_parent) { 1650 MDB_txn *tx2; 1651 for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) { 1652 if (tx2->mt_spill_pgs) { 1653 j = mdb_midl_search(tx2->mt_spill_pgs, pn); 1654 if (j <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[j] == pn) { 1655 dp->mp_flags |= P_KEEP; 1656 break; 1657 } 1658 } 1659 } 1660 if (tx2) 1661 continue; 1662 } 1663 if ((rc = mdb_midl_append(&txn->mt_spill_pgs, pn))) 1664 goto done; 1665 need--; 1666 } 1667 mdb_midl_sort(txn->mt_spill_pgs); 1668 1669 /* Flush the spilled part of dirty list */ 1670 if ((rc = mdb_page_flush(txn, i)) != MDB_SUCCESS) 1671 goto done; 1672 1673 /* Reset any dirty pages we kept that page_flush didn't see */ 1674 rc = mdb_pages_xkeep(m0, P_DIRTY|P_KEEP, i); 1675 1676 done: 1677 txn->mt_flags |= rc ? MDB_TXN_ERROR : MDB_TXN_SPILLS; 1678 return rc; 1679 } 1680 1681 /** Find oldest txnid still referenced. Expects txn->mt_txnid > 0. 
*/ 1682 static txnid_t 1683 mdb_find_oldest(MDB_txn *txn) 1684 { 1685 int i; 1686 txnid_t mr, oldest = txn->mt_txnid - 1; 1687 if (txn->mt_env->me_txns) { 1688 MDB_reader *r = txn->mt_env->me_txns->mti_readers; 1689 for (i = txn->mt_env->me_txns->mti_numreaders; --i >= 0; ) { 1690 if (r[i].mr_pid) { 1691 mr = r[i].mr_txnid; 1692 if (oldest > mr) 1693 oldest = mr; 1694 } 1695 } 1696 } 1697 return oldest; 1698 } 1699 1700 /** Add a page to the txn's dirty list */ 1701 static void 1702 mdb_page_dirty(MDB_txn *txn, MDB_page *mp) 1703 { 1704 MDB_ID2 mid; 1705 int rc, (*insert)(MDB_ID2L, MDB_ID2 *); 1706 1707 if (txn->mt_env->me_flags & MDB_WRITEMAP) { 1708 insert = mdb_mid2l_append; 1709 } else { 1710 insert = mdb_mid2l_insert; 1711 } 1712 mid.mid = mp->mp_pgno; 1713 mid.mptr = mp; 1714 rc = insert(txn->mt_u.dirty_list, &mid); 1715 mdb_tassert(txn, rc == 0); 1716 txn->mt_dirty_room--; 1717 } 1718 1719 /** Allocate page numbers and memory for writing. Maintain me_pglast, 1720 * me_pghead and mt_next_pgno. 1721 * 1722 * If there are free pages available from older transactions, they 1723 * are re-used first. Otherwise allocate a new page at mt_next_pgno. 1724 * Do not modify the freedB, just merge freeDB records into me_pghead[] 1725 * and move me_pglast to say which records were consumed. Only this 1726 * function can create me_pghead and move me_pglast/mt_next_pgno. 1727 * @param[in] mc cursor A cursor handle identifying the transaction and 1728 * database for which we are allocating. 1729 * @param[in] num the number of pages to allocate. 1730 * @param[out] mp Address of the allocated page(s). Requests for multiple pages 1731 * will always be satisfied by a single contiguous chunk of memory. 1732 * @return 0 on success, non-zero on failure. 
1733 */ 1734 static int 1735 mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) 1736 { 1737 #ifdef MDB_PARANOID /* Seems like we can ignore this now */ 1738 /* Get at most <Max_retries> more freeDB records once me_pghead 1739 * has enough pages. If not enough, use new pages from the map. 1740 * If <Paranoid> and mc is updating the freeDB, only get new 1741 * records if me_pghead is empty. Then the freelist cannot play 1742 * catch-up with itself by growing while trying to save it. 1743 */ 1744 enum { Paranoid = 1, Max_retries = 500 }; 1745 #else 1746 enum { Paranoid = 0, Max_retries = INT_MAX /*infinite*/ }; 1747 #endif 1748 int rc, retry = Max_retries; 1749 MDB_txn *txn = mc->mc_txn; 1750 MDB_env *env = txn->mt_env; 1751 pgno_t pgno, *mop = env->me_pghead; 1752 unsigned i, j, k, mop_len = mop ? mop[0] : 0, n2 = num-1; 1753 MDB_page *np; 1754 txnid_t oldest = 0, last; 1755 MDB_cursor_op op; 1756 MDB_cursor m2; 1757 1758 *mp = NULL; 1759 1760 /* If our dirty list is already full, we can't do anything */ 1761 if (txn->mt_dirty_room == 0) { 1762 rc = MDB_TXN_FULL; 1763 goto fail; 1764 } 1765 1766 for (op = MDB_FIRST;; op = MDB_NEXT) { 1767 MDB_val key, data; 1768 MDB_node *leaf; 1769 pgno_t *idl, old_id, new_id; 1770 1771 /* Seek a big enough contiguous page range. Prefer 1772 * pages at the tail, just truncating the list. 
1773 */ 1774 if (mop_len > n2) { 1775 i = mop_len; 1776 do { 1777 pgno = mop[i]; 1778 if (mop[i-n2] == pgno+n2) 1779 goto search_done; 1780 } while (--i > n2); 1781 if (Max_retries < INT_MAX && --retry < 0) 1782 break; 1783 } 1784 1785 if (op == MDB_FIRST) { /* 1st iteration */ 1786 /* Prepare to fetch more and coalesce */ 1787 oldest = mdb_find_oldest(txn); 1788 last = env->me_pglast; 1789 mdb_cursor_init(&m2, txn, FREE_DBI, NULL); 1790 if (last) { 1791 op = MDB_SET_RANGE; 1792 key.mv_data = &last; /* will look up last+1 */ 1793 key.mv_size = sizeof(last); 1794 } 1795 if (Paranoid && mc->mc_dbi == FREE_DBI) 1796 retry = -1; 1797 } 1798 if (Paranoid && retry < 0 && mop_len) 1799 break; 1800 1801 last++; 1802 /* Do not fetch more if the record will be too recent */ 1803 if (oldest <= last) 1804 break; 1805 rc = mdb_cursor_get(&m2, &key, NULL, op); 1806 if (rc) { 1807 if (rc == MDB_NOTFOUND) 1808 break; 1809 goto fail; 1810 } 1811 last = *(txnid_t*)key.mv_data; 1812 if (oldest <= last) 1813 break; 1814 np = m2.mc_pg[m2.mc_top]; 1815 leaf = NODEPTR(np, m2.mc_ki[m2.mc_top]); 1816 if ((rc = mdb_node_read(txn, leaf, &data)) != MDB_SUCCESS) 1817 return rc; 1818 1819 idl = (MDB_ID *) data.mv_data; 1820 i = idl[0]; 1821 if (!mop) { 1822 if (!(env->me_pghead = mop = mdb_midl_alloc(i))) { 1823 rc = ENOMEM; 1824 goto fail; 1825 } 1826 } else { 1827 if ((rc = mdb_midl_need(&env->me_pghead, i)) != 0) 1828 goto fail; 1829 mop = env->me_pghead; 1830 } 1831 env->me_pglast = last; 1832 #if (MDB_DEBUG) > 1 1833 DPRINTF(("IDL read txn %"Z"u root %"Z"u num %u", 1834 last, txn->mt_dbs[FREE_DBI].md_root, i)); 1835 for (k = i; k; k--) 1836 DPRINTF(("IDL %"Z"u", idl[k])); 1837 #endif 1838 /* Merge in descending sorted order */ 1839 j = mop_len; 1840 k = mop_len += i; 1841 mop[0] = (pgno_t)-1; 1842 old_id = mop[j]; 1843 while (i) { 1844 new_id = idl[i--]; 1845 for (; old_id < new_id; old_id = mop[--j]) 1846 mop[k--] = old_id; 1847 mop[k--] = new_id; 1848 } 1849 mop[0] = mop_len; 1850 } 1851 
1852 /* Use new pages from the map when nothing suitable in the freeDB */ 1853 i = 0; 1854 pgno = txn->mt_next_pgno; 1855 if (pgno + num >= env->me_maxpg) { 1856 DPUTS("DB size maxed out"); 1857 rc = MDB_MAP_FULL; 1858 goto fail; 1859 } 1860 1861 search_done: 1862 if (env->me_flags & MDB_WRITEMAP) { 1863 np = (MDB_page *)(env->me_map + env->me_psize * pgno); 1864 } else { 1865 if (!(np = mdb_page_malloc(txn, num))) { 1866 rc = ENOMEM; 1867 goto fail; 1868 } 1869 } 1870 if (i) { 1871 mop[0] = mop_len -= num; 1872 /* Move any stragglers down */ 1873 for (j = i-num; j < mop_len; ) 1874 mop[++j] = mop[++i]; 1875 } else { 1876 txn->mt_next_pgno = pgno + num; 1877 } 1878 np->mp_pgno = pgno; 1879 mdb_page_dirty(txn, np); 1880 *mp = np; 1881 1882 return MDB_SUCCESS; 1883 1884 fail: 1885 txn->mt_flags |= MDB_TXN_ERROR; 1886 return rc; 1887 } 1888 1889 /** Copy the used portions of a non-overflow page. 1890 * @param[in] dst page to copy into 1891 * @param[in] src page to copy from 1892 * @param[in] psize size of a page 1893 */ 1894 static void 1895 mdb_page_copy(MDB_page *dst, MDB_page *src, unsigned int psize) 1896 { 1897 enum { Align = sizeof(pgno_t) }; 1898 indx_t upper = src->mp_upper, lower = src->mp_lower, unused = upper-lower; 1899 1900 /* If page isn't full, just copy the used portion. Adjust 1901 * alignment so memcpy may copy words instead of bytes. 1902 */ 1903 if ((unused &= -Align) && !IS_LEAF2(src)) { 1904 upper &= -Align; 1905 memcpy(dst, src, (lower + (Align-1)) & -Align); 1906 memcpy((pgno_t *)((char *)dst+upper), (pgno_t *)((char *)src+upper), 1907 psize - upper); 1908 } else { 1909 memcpy(dst, src, psize - unused); 1910 } 1911 } 1912 1913 /** Pull a page off the txn's spill list, if present. 1914 * If a page being referenced was spilled to disk in this txn, bring 1915 * it back and make it dirty/writable again. 1916 * @param[in] txn the transaction handle. 1917 * @param[in] mp the page being referenced. It must not be dirty. 
1918 * @param[out] ret the writable page, if any. ret is unchanged if 1919 * mp wasn't spilled. 1920 */ 1921 static int 1922 mdb_page_unspill(MDB_txn *txn, MDB_page *mp, MDB_page **ret) 1923 { 1924 MDB_env *env = txn->mt_env; 1925 const MDB_txn *tx2; 1926 unsigned x; 1927 pgno_t pgno = mp->mp_pgno, pn = pgno << 1; 1928 1929 for (tx2 = txn; tx2; tx2=tx2->mt_parent) { 1930 if (!tx2->mt_spill_pgs) 1931 continue; 1932 x = mdb_midl_search(tx2->mt_spill_pgs, pn); 1933 if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) { 1934 MDB_page *np; 1935 int num; 1936 if (txn->mt_dirty_room == 0) 1937 return MDB_TXN_FULL; 1938 if (IS_OVERFLOW(mp)) 1939 num = mp->mp_pages; 1940 else 1941 num = 1; 1942 if (env->me_flags & MDB_WRITEMAP) { 1943 np = mp; 1944 } else { 1945 np = mdb_page_malloc(txn, num); 1946 if (!np) 1947 return ENOMEM; 1948 if (num > 1) 1949 memcpy(np, mp, num * env->me_psize); 1950 else 1951 mdb_page_copy(np, mp, env->me_psize); 1952 } 1953 if (tx2 == txn) { 1954 /* If in current txn, this page is no longer spilled. 1955 * If it happens to be the last page, truncate the spill list. 1956 * Otherwise mark it as deleted by setting the LSB. 1957 */ 1958 if (x == txn->mt_spill_pgs[0]) 1959 txn->mt_spill_pgs[0]--; 1960 else 1961 txn->mt_spill_pgs[x] |= 1; 1962 } /* otherwise, if belonging to a parent txn, the 1963 * page remains spilled until child commits 1964 */ 1965 1966 mdb_page_dirty(txn, np); 1967 np->mp_flags |= P_DIRTY; 1968 *ret = np; 1969 break; 1970 } 1971 } 1972 return MDB_SUCCESS; 1973 } 1974 1975 /** Touch a page: make it dirty and re-insert into tree with updated pgno. 1976 * @param[in] mc cursor pointing to the page to be touched 1977 * @return 0 on success, non-zero on failure. 
*/
static int
mdb_page_touch(MDB_cursor *mc)
{
	MDB_page *mp = mc->mc_pg[mc->mc_top], *np;
	MDB_txn *txn = mc->mc_txn;
	MDB_cursor *m2, *m3;
	pgno_t	pgno;
	int rc;

	/* Three cases:
	 *  - page is clean: unspill it if possible, else allocate a new
	 *    page, free the old page number and repoint the parent/root;
	 *  - page is dirty, txn is nested and page is not a sub-page:
	 *    make sure this txn has its own copy in its dirty list;
	 *  - otherwise nothing to do.
	 */
	if (!F_ISSET(mp->mp_flags, P_DIRTY)) {
		if (txn->mt_flags & MDB_TXN_SPILLS) {
			np = NULL;
			rc = mdb_page_unspill(txn, mp, &np);
			if (rc)
				goto fail;
			/* np stays NULL when mp was not on a spill list */
			if (np)
				goto done;
		}
		/* Reserve room in mt_free_pgs first so the xappend below cannot fail */
		if ((rc = mdb_midl_need(&txn->mt_free_pgs, 1)) ||
			(rc = mdb_page_alloc(mc, 1, &np)))
			goto fail;
		pgno = np->mp_pgno;
		DPRINTF(("touched db %d page %"Z"u -> %"Z"u", DDBI(mc),
			mp->mp_pgno, pgno));
		mdb_cassert(mc, mp->mp_pgno != pgno);
		mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno);
		/* Update the parent page, if any, to point to the new page */
		if (mc->mc_top) {
			MDB_page *parent = mc->mc_pg[mc->mc_top-1];
			MDB_node *node = NODEPTR(parent, mc->mc_ki[mc->mc_top-1]);
			SETPGNO(node, pgno);
		} else {
			mc->mc_db->md_root = pgno;
		}
	} else if (txn->mt_parent && !IS_SUBP(mp)) {
		MDB_ID2 mid, *dl = txn->mt_u.dirty_list;
		pgno = mp->mp_pgno;
		/* If txn has a parent, make sure the page is in our
		 * dirty list.
		 */
		if (dl[0].mid) {
			unsigned x = mdb_mid2l_search(dl, pgno);
			if (x <= dl[0].mid && dl[x].mid == pgno) {
				if (mp != dl[x].mptr) { /* bad cursor? */
					mc->mc_flags &= ~(C_INITIALIZED|C_EOF);
					txn->mt_flags |= MDB_TXN_ERROR;
					return MDB_CORRUPTED;
				}
				return 0;
			}
		}
		mdb_cassert(mc, dl[0].mid < MDB_IDL_UM_MAX);
		/* No - copy it */
		np = mdb_page_malloc(txn, 1);
		/* NOTE(review): unlike the "fail" path, this return does not
		 * set MDB_TXN_ERROR - confirm that is intended.
		 */
		if (!np)
			return ENOMEM;
		mid.mid = pgno;
		mid.mptr = np;
		rc = mdb_mid2l_insert(dl, &mid);
		mdb_cassert(mc, rc == 0);
	} else {
		return 0;
	}

	/* Fill the new page from the old one, then give it its own identity */
	mdb_page_copy(np, mp, txn->mt_env->me_psize);
	np->mp_pgno = pgno;
	np->mp_flags |= P_DIRTY;

done:
	/* Adjust cursors pointing to mp */
	mc->mc_pg[mc->mc_top] = np;
	m2 = txn->mt_cursors[mc->mc_dbi];
	if (mc->mc_flags & C_SUB) {
		/* mc is a sub-cursor: fix up the sub-cursors of the tracked cursors */
		for (; m2; m2=m2->mc_next) {
			m3 = &m2->mc_xcursor->mx_cursor;
			if (m3->mc_snum < mc->mc_snum) continue;
			if (m3->mc_pg[mc->mc_top] == mp)
				m3->mc_pg[mc->mc_top] = np;
		}
	} else {
		for (; m2; m2=m2->mc_next) {
			if (m2->mc_snum < mc->mc_snum) continue;
			if (m2->mc_pg[mc->mc_top] == mp) {
				m2->mc_pg[mc->mc_top] = np;
				if ((mc->mc_db->md_flags & MDB_DUPSORT) &&
					m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top])
				{
					/* Node data without F_SUBDATA lives inside the page
					 * just replaced, so repoint the sub-cursor at the copy.
					 */
					MDB_node *leaf = NODEPTR(np, mc->mc_ki[mc->mc_top]);
					if (!(leaf->mn_flags & F_SUBDATA))
						m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
				}
			}
		}
	}
	return 0;

fail:
	txn->mt_flags |= MDB_TXN_ERROR;
	return rc;
}

/** Flush the environment's data to disk.
 * Does nothing when MDB_NOSYNC is set, unless force is non-zero.
 * With MDB_WRITEMAP the map is synced with MDB_MSYNC (asynchronously
 * only if MDB_MAPASYNC is set and force is zero); otherwise the data
 * file descriptor is synced with MDB_FDATASYNC.
 * @param[in] env the environment handle
 * @param[in] force non-zero to sync regardless of MDB_NOSYNC/MDB_MAPASYNC
 * @return 0 on success, otherwise the value of ErrCode().
 */
int
mdb_env_sync(MDB_env *env, int force)
{
	int rc = 0;
	if (force || !F_ISSET(env->me_flags, MDB_NOSYNC)) {
		if (env->me_flags & MDB_WRITEMAP) {
			int flags = ((env->me_flags & MDB_MAPASYNC) && !force)
				? MS_ASYNC : MS_SYNC;
			if (MDB_MSYNC(env->me_map, env->me_mapsize, flags))
				rc = ErrCode();
#ifdef _WIN32
			/* On Windows a synchronous map flush is followed by a file sync */
			else if (flags == MS_SYNC && MDB_FDATASYNC(env->me_fd))
				rc = ErrCode();
#endif
		} else {
			if (MDB_FDATASYNC(env->me_fd))
				rc = ErrCode();
		}
	}
	return rc;
}

/** Back up parent txn's cursors, then grab the originals for tracking.
 * @param[in] src the parent transaction whose cursors are backed up
 * @param[in] dst the child transaction that takes over tracking them
 * @return MDB_SUCCESS, or ENOMEM if a backup could not be allocated.
 */
static int
mdb_cursor_shadow(MDB_txn *src, MDB_txn *dst)
{
	MDB_cursor *mc, *bk;
	MDB_xcursor *mx;
	size_t size;
	int i;

	for (i = src->mt_numdbs; --i >= 0; ) {
		if ((mc = src->mt_cursors[i]) != NULL) {
			/* One allocation holds the cursor backup and, if the first
			 * cursor in this list has an xcursor, room for its backup
			 * right after.
			 */
			size = sizeof(MDB_cursor);
			if (mc->mc_xcursor)
				size += sizeof(MDB_xcursor);
			/* bk->mc_next still holds the original (src) list link */
			for (; mc; mc = bk->mc_next) {
				bk = malloc(size);
				if (!bk)
					return ENOMEM;
				*bk = *mc;
				mc->mc_backup = bk;
				mc->mc_db = &dst->mt_dbs[i];
				/* Kill pointers into src - and dst to reduce abuse: The
				 * user may not use mc until dst ends. Otherwise we'd...
				 */
				mc->mc_txn    = NULL;	/* ...set this to dst */
				mc->mc_dbflag = NULL;	/* ...and &dst->mt_dbflags[i] */
				if ((mx = mc->mc_xcursor) != NULL) {
					*(MDB_xcursor *)(bk+1) = *mx;
					mx->mx_cursor.mc_txn = NULL; /* ...and dst. */
				}
				mc->mc_next = dst->mt_cursors[i];
				dst->mt_cursors[i] = mc;
			}
		}
	}
	return MDB_SUCCESS;
}

/** Close this write txn's cursors, give parent txn's cursors back to parent.
 * @param[in] txn the transaction handle.
 * @param[in] merge true to keep changes to parent cursors, false to revert.
 * No value is returned.
2144 */ 2145 static void 2146 mdb_cursors_close(MDB_txn *txn, unsigned merge) 2147 { 2148 MDB_cursor **cursors = txn->mt_cursors, *mc, *next, *bk; 2149 MDB_xcursor *mx; 2150 int i; 2151 2152 for (i = txn->mt_numdbs; --i >= 0; ) { 2153 for (mc = cursors[i]; mc; mc = next) { 2154 next = mc->mc_next; 2155 if ((bk = mc->mc_backup) != NULL) { 2156 if (merge) { 2157 /* Commit changes to parent txn */ 2158 mc->mc_next = bk->mc_next; 2159 mc->mc_backup = bk->mc_backup; 2160 mc->mc_txn = bk->mc_txn; 2161 mc->mc_db = bk->mc_db; 2162 mc->mc_dbflag = bk->mc_dbflag; 2163 if ((mx = mc->mc_xcursor) != NULL) 2164 mx->mx_cursor.mc_txn = bk->mc_txn; 2165 } else { 2166 /* Abort nested txn */ 2167 *mc = *bk; 2168 if ((mx = mc->mc_xcursor) != NULL) 2169 *mx = *(MDB_xcursor *)(bk+1); 2170 } 2171 mc = bk; 2172 } 2173 /* Only malloced cursors are permanently tracked. */ 2174 free(mc); 2175 } 2176 cursors[i] = NULL; 2177 } 2178 } 2179 2180 #if !(MDB_DEBUG) 2181 #define mdb_txn_reset0(txn, act) mdb_txn_reset0(txn) 2182 #endif 2183 static void 2184 mdb_txn_reset0(MDB_txn *txn, const char *act); 2185 2186 #if !(MDB_PIDLOCK) /* Currently the same as defined(_WIN32) */ 2187 enum Pidlock_op { 2188 Pidset, Pidcheck 2189 }; 2190 #else 2191 enum Pidlock_op { 2192 Pidset = F_SETLK, Pidcheck = F_GETLK 2193 }; 2194 #endif 2195 2196 /** Set or check a pid lock. Set returns 0 on success. 2197 * Check returns 0 if the process is certainly dead, nonzero if it may 2198 * be alive (the lock exists or an error happened so we do not know). 2199 * 2200 * On Windows Pidset is a no-op, we merely check for the existence 2201 * of the process with the given pid. On POSIX we use a single byte 2202 * lock on the lockfile, set at an offset equal to the pid. 
*/
static int
mdb_reader_pid(MDB_env *env, enum Pidlock_op op, MDB_PID_T pid)
{
#if !(MDB_PIDLOCK)		/* Currently the same as defined(_WIN32) */
	int ret = 0;
	HANDLE h;
	if (op == Pidcheck) {
		h = OpenProcess(env->me_pidquery, FALSE, pid);
		/* No documented "no such process" code, but other programs use this: */
		if (!h)
			return ErrCode() != ERROR_INVALID_PARAMETER;
		/* A process exists until all handles to it close. Has it exited? */
		ret = WaitForSingleObject(h, 0) != 0;
		CloseHandle(h);
	}
	return ret;
#else
	/* Loop only to retry fcntl() when it is interrupted (EINTR) */
	for (;;) {
		int rc;
		struct flock lock_info;
		memset(&lock_info, 0, sizeof(lock_info));
		lock_info.l_type = F_WRLCK;
		lock_info.l_whence = SEEK_SET;
		lock_info.l_start = pid;
		lock_info.l_len = 1;
		if ((rc = fcntl(env->me_lfd, op, &lock_info)) == 0) {
			/* F_GETLK succeeded: anything but F_UNLCK means a lock exists */
			if (op == F_GETLK && lock_info.l_type != F_UNLCK)
				rc = -1;
		} else if ((rc = ErrCode()) == EINTR) {
			continue;
		}
		return rc;
	}
#endif
}

/** Common code for #mdb_txn_begin() and #mdb_txn_renew().
 * @param[in] txn the transaction handle to initialize
 * @return 0 on success, non-zero on failure.
 */
static int
mdb_txn_renew0(MDB_txn *txn)
{
	MDB_env *env = txn->mt_env;
	MDB_txninfo *ti = env->me_txns;
	MDB_meta *meta;
	unsigned int i, nr;
	uint16_t x;
	int rc, new_notls = 0;

	/* Setup db info */
	txn->mt_numdbs = env->me_numdbs;
	txn->mt_dbxs = env->me_dbxs;	/* mostly static anyway */

	if (txn->mt_flags & MDB_TXN_RDONLY) {
		if (!ti) {
			/* No reader table: read the txnid from the chosen meta page */
			meta = env->me_metas[ mdb_env_pick_meta(env) ];
			txn->mt_txnid = meta->mm_txnid;
			txn->mt_u.reader = NULL;
		} else {
			/* With MDB_NOTLS the slot is kept in the txn, otherwise in
			 * thread-specific storage.
			 */
			MDB_reader *r = (env->me_flags & MDB_NOTLS) ? txn->mt_u.reader :
				pthread_getspecific(env->me_txkey);
			if (r) {
				/* An existing slot must be ours and not currently in use */
				if (r->mr_pid != env->me_pid || r->mr_txnid != (txnid_t)-1)
					return MDB_BAD_RSLOT;
			} else {
				MDB_PID_T pid = env->me_pid;
				pthread_t tid = pthread_self();

				/* Take the pid lock once per environment */
				if (!(env->me_flags & MDB_LIVE_READER)) {
					rc = mdb_reader_pid(env, Pidset, pid);
					if (rc)
						return rc;
					env->me_flags |= MDB_LIVE_READER;
				}

				LOCK_MUTEX_R(env);
				/* Find the first slot with mr_pid == 0, or one past the end */
				nr = ti->mti_numreaders;
				for (i=0; i<nr; i++)
					if (ti->mti_readers[i].mr_pid == 0)
						break;
				if (i == env->me_maxreaders) {
					UNLOCK_MUTEX_R(env);
					return MDB_READERS_FULL;
				}
				ti->mti_readers[i].mr_pid = pid;
				ti->mti_readers[i].mr_tid = tid;
				if (i == nr)
					ti->mti_numreaders = ++nr;
				/* Save numreaders for un-mutexed mdb_env_close() */
				env->me_numreaders = nr;
				UNLOCK_MUTEX_R(env);

				r = &ti->mti_readers[i];
				new_notls = (env->me_flags & MDB_NOTLS);
				if (!new_notls && (rc=pthread_setspecific(env->me_txkey, r))) {
					/* Could not remember the slot: release it again */
					r->mr_pid = 0;
					return rc;
				}
			}
			txn->mt_txnid = r->mr_txnid = ti->mti_txnid;
			txn->mt_u.reader = r;
			meta = env->me_metas[txn->mt_txnid & 1];
		}
	} else {
		if (ti) {
			/* Released again in mdb_txn_reset0() */
			LOCK_MUTEX_W(env);

			txn->mt_txnid = ti->mti_txnid;
			meta = env->me_metas[txn->mt_txnid & 1];
		} else {
			meta = env->me_metas[ mdb_env_pick_meta(env) ];
			txn->mt_txnid = meta->mm_txnid;
		}
		txn->mt_txnid++;
#if MDB_DEBUG
		if (txn->mt_txnid == mdb_debug_start)
			mdb_debug = 1;
#endif
		/* A top-level write txn borrows the environment's lists, emptied */
		txn->mt_dirty_room = MDB_IDL_UM_MAX;
		txn->mt_u.dirty_list = env->me_dirty_list;
		txn->mt_u.dirty_list[0].mid = 0;
		txn->mt_free_pgs = env->me_free_pgs;
		txn->mt_free_pgs[0] = 0;
		txn->mt_spill_pgs = NULL;
		env->me_txn = txn;
	}

	/* Copy the DB info and flags */
	memcpy(txn->mt_dbs, meta->mm_dbs, 2 * sizeof(MDB_db));

	/* Moved to here to avoid a data race in read TXNs */
	txn->mt_next_pgno = meta->mm_last_pg+1;

	for (i=2; i<txn->mt_numdbs; i++) {
		x = env->me_dbflags[i];
		txn->mt_dbs[i].md_flags = x & PERSISTENT_FLAGS;
		txn->mt_dbflags[i] = (x & MDB_VALID) ? DB_VALID|DB_STALE : 0;
	}
	txn->mt_dbflags[0] = txn->mt_dbflags[1] = DB_VALID;

	if (env->me_maxpg < txn->mt_next_pgno) {
		/* The meta page refers to pages beyond this environment's
		 * me_maxpg: undo the setup above and report it.
		 */
		mdb_txn_reset0(txn, "renew0-mapfail");
		if (new_notls) {
			txn->mt_u.reader->mr_pid = 0;
			txn->mt_u.reader = NULL;
		}
		return MDB_MAP_RESIZED;
	}

	return MDB_SUCCESS;
}

/** Renew a reset transaction.
 * @param[in] txn the transaction handle; must be non-NULL and reset
 * @return EINVAL for a NULL or non-reset txn, MDB_PANIC if the
 * environment has MDB_FATAL_ERROR set, otherwise the result of
 * mdb_txn_renew0().
 */
int
mdb_txn_renew(MDB_txn *txn)
{
	int rc;

	if (!txn || txn->mt_dbxs)	/* A reset txn has mt_dbxs==NULL */
		return EINVAL;

	if (txn->mt_env->me_flags & MDB_FATAL_ERROR) {
		DPUTS("environment had fatal error, must shutdown!");
		return MDB_PANIC;
	}

	rc = mdb_txn_renew0(txn);
	if (rc == MDB_SUCCESS) {
		DPRINTF(("renew txn %"Z"u%c %p on mdbenv %p, root page %"Z"u",
			txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w',
			(void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root));
	}
	return rc;
}

/** Create a transaction, optionally nested inside a parent write txn.
 * @param[in] env the environment handle
 * @param[in] parent parent transaction for a nested txn, or NULL
 * @param[in] flags MDB_RDONLY for a read-only transaction
 * @param[out] ret receives the new transaction on success only
 * @return 0 on success. MDB_PANIC if the environment has
 * MDB_FATAL_ERROR set, EACCES for a write txn on an MDB_RDONLY
 * environment, EINVAL or MDB_BAD_TXN if nesting is not allowed,
 * ENOMEM on allocation failure, or an error from mdb_txn_renew0().
 */
int
mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret)
{
	MDB_txn *txn;
	MDB_ntxn *ntxn;
	int rc, size, tsize = sizeof(MDB_txn);

	if (env->me_flags & MDB_FATAL_ERROR) {
		DPUTS("environment had fatal error, must shutdown!");
		return MDB_PANIC;
	}
	if ((env->me_flags & MDB_RDONLY) && !(flags & MDB_RDONLY))
		return EACCES;
	if (parent) {
		/* Nested transactions: Max 1 child, write txns only, no writemap */
		if (parent->mt_child ||
			(flags & MDB_RDONLY) ||
			(parent->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) ||
			(env->me_flags & MDB_WRITEMAP))
		{
			return (parent->mt_flags & MDB_TXN_RDONLY) ? EINVAL : MDB_BAD_TXN;
		}
		tsize = sizeof(MDB_ntxn);
	}
	/* One allocation: the txn struct, then per-DBI MDB_db records,
	 * then (write txns only) per-DBI cursor list heads, then one
	 * flag byte per DBI.
	 */
	size = tsize + env->me_maxdbs * (sizeof(MDB_db)+1);
	if (!(flags & MDB_RDONLY))
		size += env->me_maxdbs * sizeof(MDB_cursor *);

	if ((txn = calloc(1, size)) == NULL) {
		DPRINTF(("calloc: %s", strerror(ErrCode())));
		return ENOMEM;
	}
	txn->mt_dbs = (MDB_db *) ((char *)txn + tsize);
	if (flags & MDB_RDONLY) {
		txn->mt_flags |= MDB_TXN_RDONLY;
		txn->mt_dbflags = (unsigned char *)(txn->mt_dbs + env->me_maxdbs);
	} else {
		txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs);
		txn->mt_dbflags = (unsigned char *)(txn->mt_cursors + env->me_maxdbs);
	}
	txn->mt_env = env;

	if (parent) {
		unsigned int i;
		/* A nested txn gets its own dirty list and free-page list */
		txn->mt_u.dirty_list = malloc(sizeof(MDB_ID2)*MDB_IDL_UM_SIZE);
		if (!txn->mt_u.dirty_list ||
			!(txn->mt_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX)))
		{
			free(txn->mt_u.dirty_list);
			free(txn);
			return ENOMEM;
		}
		txn->mt_txnid = parent->mt_txnid;
		txn->mt_dirty_room = parent->mt_dirty_room;
		txn->mt_u.dirty_list[0].mid = 0;
		txn->mt_spill_pgs = NULL;
		txn->mt_next_pgno = parent->mt_next_pgno;
		parent->mt_child = txn;
		txn->mt_parent = parent;
		txn->mt_numdbs = parent->mt_numdbs;
		txn->mt_flags = parent->mt_flags;
		txn->mt_dbxs = parent->mt_dbxs;
		memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDB_db));
		/* Copy parent's mt_dbflags, but clear DB_NEW */
		for (i=0; i<txn->mt_numdbs; i++)
			txn->mt_dbflags[i] = parent->mt_dbflags[i] & ~DB_NEW;
		rc = 0;
		ntxn = (MDB_ntxn *)txn;
		ntxn->mnt_pgstate = env->me_pgstate; /* save parent me_pghead & co */
		if (env->me_pghead) {
			/* The child works on a private copy of me_pghead; the saved
			 * state is put back by mdb_txn_reset0().
			 */
			size = MDB_IDL_SIZEOF(env->me_pghead);
			env->me_pghead = mdb_midl_alloc(env->me_pghead[0]);
			if (env->me_pghead)
				memcpy(env->me_pghead, ntxn->mnt_pgstate.mf_pghead, size);
			else
				rc = ENOMEM;
		}
		if (!rc)
			rc = mdb_cursor_shadow(parent, txn);
		if (rc)
			mdb_txn_reset0(txn, "beginchild-fail");
	} else {
		rc = mdb_txn_renew0(txn);
	}
	if (rc)
		free(txn);
	else {
		*ret = txn;
		DPRINTF(("begin txn %"Z"u%c %p on mdbenv %p, root page %"Z"u",
			txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w',
			(void *) txn, (void *) env, txn->mt_dbs[MAIN_DBI].md_root));
	}

	return rc;
}

/** Return the environment a transaction belongs to, or NULL for a NULL txn. */
MDB_env *
mdb_txn_env(MDB_txn *txn)
{
	if(!txn) return NULL;
	return txn->mt_env;
}

/** Export or close DBI handles opened in this txn. */
static void
mdb_dbis_update(MDB_txn *txn, int keep)
{
	int i;
	MDB_dbi n = txn->mt_numdbs;
	MDB_env *env = txn->mt_env;
	unsigned char *tdbflags = txn->mt_dbflags;

	/* DBIs 0 and 1 are never touched here */
	for (i = n; --i >= 2;) {
		if (tdbflags[i] & DB_NEW) {
			if (keep) {
				env->me_dbflags[i] = txn->mt_dbs[i].md_flags | MDB_VALID;
			} else {
				/* Clear the environment's entry before freeing the name */
				char *ptr = env->me_dbxs[i].md_name.mv_data;
				env->me_dbxs[i].md_name.mv_data = NULL;
				env->me_dbxs[i].md_name.mv_size = 0;
				env->me_dbflags[i] = 0;
				free(ptr);
			}
		}
	}
	if (keep && env->me_numdbs < n)
		env->me_numdbs = n;
}

/** Common code for #mdb_txn_reset() and #mdb_txn_abort().
 * May be called twice for readonly txns: First reset it, then abort.
 * @param[in] txn the transaction handle to reset
 * @param[in] act why the transaction is being reset
 */
static void
mdb_txn_reset0(MDB_txn *txn, const char *act)
{
	MDB_env	*env = txn->mt_env;

	/* Close any DBI handles opened in this txn */
	mdb_dbis_update(txn, 0);

	DPRINTF(("%s txn %"Z"u%c %p on mdbenv %p, root page %"Z"u",
		act, txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w',
		(void *) txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root));

	if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
		if (txn->mt_u.reader) {
			/* (txnid_t)-1 is the value mdb_txn_renew0() requires
			 * before it will reuse the slot.
			 */
			txn->mt_u.reader->mr_txnid = (txnid_t)-1;
			if (!(env->me_flags & MDB_NOTLS))
				txn->mt_u.reader = NULL; /* txn does not own reader */
		}
		txn->mt_numdbs = 0;		/* close nothing if called again */
		txn->mt_dbxs = NULL;	/* mark txn as reset */
	} else {
		mdb_cursors_close(txn, 0);

		if (!(env->me_flags & MDB_WRITEMAP)) {
			mdb_dlist_free(txn);
		}
		mdb_midl_free(env->me_pghead);

		if (txn->mt_parent) {
			/* Nested txn: detach from the parent, restore the page
			 * state saved in mdb_txn_begin() and free the lists this
			 * txn allocated.
			 */
			txn->mt_parent->mt_child = NULL;
			env->me_pgstate = ((MDB_ntxn *)txn)->mnt_pgstate;
			mdb_midl_free(txn->mt_free_pgs);
			mdb_midl_free(txn->mt_spill_pgs);
			free(txn->mt_u.dirty_list);
			return;
		}

		if (mdb_midl_shrink(&txn->mt_free_pgs))
			env->me_free_pgs = txn->mt_free_pgs;
		env->me_pghead = NULL;
		env->me_pglast = 0;

		env->me_txn = NULL;
		/* The writer mutex was locked in mdb_txn_begin. */
		if (env->me_txns)
			UNLOCK_MUTEX_W(env);
	}
}

/** Reset a read-only transaction. Does nothing for a NULL txn or a
 * write txn.
 */
void
mdb_txn_reset(MDB_txn *txn)
{
	if (txn == NULL)
		return;

	/* This call is only valid for read-only txns */
	if (!(txn->mt_flags & MDB_TXN_RDONLY))
		return;

	mdb_txn_reset0(txn, "reset");
}

/** Abort a transaction, first aborting its child if it has one, and
 * free the handle. Does nothing for a NULL txn.
 */
void
mdb_txn_abort(MDB_txn *txn)
{
	if (txn == NULL)
		return;

	if (txn->mt_child)
		mdb_txn_abort(txn->mt_child);

	mdb_txn_reset0(txn, "abort");
	/* Free reader slot tied to this txn (if MDB_NOTLS && writable FS) */
	if ((txn->mt_flags & MDB_TXN_RDONLY) && txn->mt_u.reader)
		txn->mt_u.reader->mr_pid = 0;

	free(txn);
}

/** Save the freelist as of this transaction to the freeDB.
 * This changes the freelist. Keep trying until it stabilizes.
*/
static int
mdb_freelist_save(MDB_txn *txn)
{
	/* env->me_pghead[] can grow and shrink during this call.
	 * env->me_pglast and txn->mt_free_pgs[] can only grow.
	 * Page numbers cannot disappear from txn->mt_free_pgs[].
	 */
	MDB_cursor mc;
	MDB_env	*env = txn->mt_env;
	int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1;
	txnid_t	pglast = 0, head_id = 0;
	pgno_t	freecnt = 0, *free_pgs, *mop;
	ssize_t	head_room = 0, total_room = 0, mop_len, clean_limit;

	mdb_cursor_init(&mc, txn, FREE_DBI, NULL);

	if (env->me_pghead) {
		/* Make sure first page of freeDB is touched and on freelist */
		rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST|MDB_PS_MODIFY);
		if (rc && rc != MDB_NOTFOUND)
			return rc;
	}

	/* MDB_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP) */
	clean_limit = (env->me_flags & (MDB_NOMEMINIT|MDB_WRITEMAP))
		? SSIZE_MAX : maxfree_1pg;

	for (;;) {
		/* Come back here after each Put() in case freelist changed */
		MDB_val key, data;
		pgno_t *pgs;
		ssize_t j;

		/* If using records from freeDB which we have not yet
		 * deleted, delete them and any we reserved for me_pghead.
		 */
		while (pglast < env->me_pglast) {
			rc = mdb_cursor_first(&mc, &key, NULL);
			if (rc)
				return rc;
			pglast = head_id = *(txnid_t *)key.mv_data;
			/* Deleting a record invalidates the room reserved so far */
			total_room = head_room = 0;
			mdb_tassert(txn, pglast <= env->me_pglast);
			rc = mdb_cursor_del(&mc, 0);
			if (rc)
				return rc;
		}

		/* Save the IDL of pages freed by this txn, to a single record */
		if (freecnt < txn->mt_free_pgs[0]) {
			if (!freecnt) {
				/* Make sure last page of freeDB is touched and on freelist */
				rc = mdb_page_search(&mc, NULL, MDB_PS_LAST|MDB_PS_MODIFY);
				if (rc && rc != MDB_NOTFOUND)
					return rc;
			}
			free_pgs = txn->mt_free_pgs;
			/* Write to last page of freeDB */
			key.mv_size = sizeof(txn->mt_txnid);
			key.mv_data = &txn->mt_txnid;
			do {
				freecnt = free_pgs[0];
				data.mv_size = MDB_IDL_SIZEOF(free_pgs);
				rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE);
				if (rc)
					return rc;
				/* Retry if mt_free_pgs[] grew during the Put() */
				free_pgs = txn->mt_free_pgs;
			} while (freecnt < free_pgs[0]);
			/* The reserved space is now large enough: sort and copy */
			mdb_midl_sort(free_pgs);
			memcpy(data.mv_data, free_pgs, data.mv_size);
#if (MDB_DEBUG) > 1
			{
				unsigned int i = free_pgs[0];
				DPRINTF(("IDL write txn %"Z"u root %"Z"u num %u",
					txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i));
				for (; i; i--)
					DPRINTF(("IDL %"Z"u", free_pgs[i]));
			}
#endif
			continue;
		}

		mop = env->me_pghead;
		mop_len = mop ? mop[0] : 0;

		/* Reserve records for me_pghead[]. Split it if multi-page,
		 * to avoid searching freeDB for a page range. Use keys in
		 * range [1,me_pglast]: Smaller than txnid of oldest reader.
		 */
		if (total_room >= mop_len) {
			/* Stop on an exact fit, or after "more" extra passes */
			if (total_room == mop_len || --more < 0)
				break;
		} else if (head_room >= maxfree_1pg && head_id > 1) {
			/* Keep current record (overflow page), add a new one */
			head_id--;
			head_room = 0;
		}
		/* (Re)write {key = head_id, IDL length = head_room} */
		total_room -= head_room;
		head_room = mop_len - total_room;
		if (head_room > maxfree_1pg && head_id > 1) {
			/* Overflow multi-page for part of me_pghead */
			head_room /= head_id; /* amortize page sizes */
			head_room += maxfree_1pg - head_room % (maxfree_1pg + 1);
		} else if (head_room < 0) {
			/* Rare case, not bothering to delete this record */
			head_room = 0;
		}
		key.mv_size = sizeof(head_id);
		key.mv_data = &head_id;
		data.mv_size = (head_room + 1) * sizeof(pgno_t);
		rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE);
		if (rc)
			return rc;
		/* IDL is initially empty, zero out at least the length */
		pgs = (pgno_t *)data.mv_data;
		/* Zero the whole record only when head_room exceeds clean_limit,
		 * otherwise just pgs[0].
		 */
		j = head_room > clean_limit ? head_room : 0;
		do {
			pgs[j] = 0;
		} while (--j >= 0);
		total_room += head_room;
	}

	/* Fill in the reserved me_pghead records */
	rc = MDB_SUCCESS;
	if (mop_len) {
		MDB_val key, data;

		/* Work backwards from the end of me_pghead[] */
		mop += mop_len;
		rc = mdb_cursor_first(&mc, &key, &data);
		for (; !rc; rc = mdb_cursor_next(&mc, &key, &data, MDB_NEXT)) {
			unsigned flags = MDB_CURRENT;
			txnid_t id = *(txnid_t *)key.mv_data;
			ssize_t	len = (ssize_t)(data.mv_size / sizeof(MDB_ID)) - 1;
			MDB_ID save;

			mdb_tassert(txn, len >= 0 && id <= env->me_pglast);
			key.mv_data = &id;
			if (len > mop_len) {
				/* Record is larger than what remains: shrink it, which
				 * needs a normal put rather than MDB_CURRENT.
				 */
				len = mop_len;
				data.mv_size = (len + 1) * sizeof(MDB_ID);
				flags = 0;
			}
			data.mv_data = mop -= len;
			/* Borrow the slot before the chunk as its IDL length word
			 * for the duration of the put, then restore it.
			 */
			save = mop[0];
			mop[0] = len;
			rc = mdb_cursor_put(&mc, &key, &data, flags);
			mop[0] = save;
			if (rc || !(mop_len -= len))
				break;
		}
	}
	return rc;
}

/** Flush (some) dirty pages to the map, after clearing their dirty flag.
 * @param[in] txn the transaction that's being committed
 * @param[in] keep number of initial pages in dirty_list to keep dirty.
 * @return 0 on success, non-zero on failure.
*/
static int
mdb_page_flush(MDB_txn *txn, int keep)
{
	MDB_env		*env = txn->mt_env;
	MDB_ID2L	dl = txn->mt_u.dirty_list;
	unsigned	psize = env->me_psize, j;
	int			i, pagecount = dl[0].mid, rc;
	size_t		size = 0, pos = 0;
	pgno_t		pgno = 0;
	MDB_page	*dp = NULL;
#ifdef _WIN32
	OVERLAPPED	ov;
#else
	struct iovec iov[MDB_COMMIT_PAGES];
	ssize_t		wpos = 0, wsize = 0, wres;
	size_t		next_pos = 1; /* impossible pos, so pos != next_pos */
	int			n = 0;
#endif

	/* i scans the dirty list, j is the index of the last entry kept */
	j = i = keep;

	if (env->me_flags & MDB_WRITEMAP) {
		/* Clear dirty flags */
		while (++i <= pagecount) {
			dp = dl[i].mptr;
			/* Don't flush this page yet */
			if (dp->mp_flags & P_KEEP) {
				dp->mp_flags ^= P_KEEP;
				dl[++j] = dl[i];
				continue;
			}
			dp->mp_flags &= ~P_DIRTY;
		}
		goto done;
	}

	/* Write the pages */
	for (;;) {
		if (++i <= pagecount) {
			dp = dl[i].mptr;
			/* Don't flush this page yet */
			if (dp->mp_flags & P_KEEP) {
				dp->mp_flags ^= P_KEEP;
				/* mid == 0 marks the entry as skipped for the loop below */
				dl[i].mid = 0;
				continue;
			}
			pgno = dl[i].mid;
			/* clear dirty flag */
			dp->mp_flags &= ~P_DIRTY;
			pos = pgno * psize;
			size = psize;
			if (IS_OVERFLOW(dp)) size *= dp->mp_pages;
		}
#ifdef _WIN32
		else break;

		/* Windows actually supports scatter/gather I/O, but only on
		 * unbuffered file handles. Since we're relying on the OS page
		 * cache for all our data, that's self-defeating. So we just
		 * write pages one at a time. We use the ov structure to set
		 * the write offset, to at least save the overhead of a Seek
		 * system call.
		 */
		DPRINTF(("committing page %"Z"u", pgno));
		memset(&ov, 0, sizeof(ov));
		ov.Offset = pos & 0xffffffff;
		ov.OffsetHigh = pos >> 16 >> 16;
		if (!WriteFile(env->me_fd, dp, size, NULL, &ov)) {
			rc = ErrCode();
			DPRINTF(("WriteFile: %d", rc));
			return rc;
		}
#else
		/* Write up to MDB_COMMIT_PAGES dirty pages at a time. */
		/* The pending batch is written out when the next page is not
		 * contiguous with it, the batch is full, or it would exceed
		 * MAX_WRITE.  Once i has passed pagecount, pos still holds
		 * the last page's offset and so differs from next_pos, which
		 * flushes the final batch.
		 */
		if (pos!=next_pos || n==MDB_COMMIT_PAGES || wsize+size>MAX_WRITE) {
			if (n) {
				/* Write previous page(s) */
#ifdef MDB_USE_PWRITEV
				wres = pwritev(env->me_fd, iov, n, wpos);
#else
				if (n == 1) {
					wres = pwrite(env->me_fd, iov[0].iov_base, wsize, wpos);
				} else {
					if (lseek(env->me_fd, wpos, SEEK_SET) == -1) {
						rc = ErrCode();
						DPRINTF(("lseek: %s", strerror(rc)));
						return rc;
					}
					wres = writev(env->me_fd, iov, n);
				}
#endif
				if (wres != wsize) {
					if (wres < 0) {
						rc = ErrCode();
						DPRINTF(("Write error: %s", strerror(rc)));
					} else {
						rc = EIO; /* TODO: Use which error code? */
						DPUTS("short write, filesystem full?");
					}
					return rc;
				}
				n = 0;
			}
			if (i > pagecount)
				break;
			wpos = pos;
			wsize = 0;
		}
		DPRINTF(("committing page %"Z"u", pgno));
		next_pos = pos + size;
		iov[n].iov_len = size;
		iov[n].iov_base = (char *)dp;
		wsize += size;
		n++;
#endif	/* _WIN32 */
	}

	/* Second pass: compact the kept pages and free the written ones */
	for (i = keep; ++i <= pagecount; ) {
		dp = dl[i].mptr;
		/* This is a page we skipped above */
		if (!dl[i].mid) {
			dl[++j] = dl[i];
			dl[j].mid = dp->mp_pgno;
			continue;
		}
		mdb_dpage_free(env, dp);
	}

done:
	/* After either loop i == pagecount + 1, so i - j below is the
	 * number of entries removed from the dirty list.
	 */
	i--;
	txn->mt_dirty_room += i - j;
	dl[0].mid = j;
	return MDB_SUCCESS;
}

int
mdb_txn_commit(MDB_txn *txn)
{
	int		rc;
	unsigned int i;
	MDB_env	*env;

	if (txn == NULL || txn->mt_env == NULL)
		return EINVAL;

	if (txn->mt_child) {
		rc = mdb_txn_commit(txn->mt_child);
		txn->mt_child = NULL;
		if (rc)
			goto fail;
	}

	env = txn->mt_env;

	if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
		mdb_dbis_update(txn, 1);
		txn->mt_numdbs = 2; /* so txn_abort() doesn't close any new handles */
		mdb_txn_abort(txn);
		return MDB_SUCCESS;
	}

	if (F_ISSET(txn->mt_flags, MDB_TXN_ERROR)) {
		DPUTS("error flag is set, can't commit");
		if (txn->mt_parent)
			txn->mt_parent->mt_flags |= MDB_TXN_ERROR;
		rc = MDB_BAD_TXN;
		goto fail;
	}

	if (txn->mt_parent) {
		MDB_txn *parent = txn->mt_parent;
		MDB_ID2L dst, src;
		MDB_IDL pspill;
		unsigned x, y, len, ps_len;

		/* Append our free list to parent's */
		rc = mdb_midl_append_list(&parent->mt_free_pgs, txn->mt_free_pgs);
		if (rc)
			goto fail;
		mdb_midl_free(txn->mt_free_pgs);
		/* Failures after this must either undo the changes
		 * to the parent or set MDB_TXN_ERROR in the parent.
2938 */ 2939 2940 parent->mt_next_pgno = txn->mt_next_pgno; 2941 parent->mt_flags = txn->mt_flags; 2942 2943 /* Merge our cursors into parent's and close them */ 2944 mdb_cursors_close(txn, 1); 2945 2946 /* Update parent's DB table. */ 2947 memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); 2948 parent->mt_numdbs = txn->mt_numdbs; 2949 parent->mt_dbflags[0] = txn->mt_dbflags[0]; 2950 parent->mt_dbflags[1] = txn->mt_dbflags[1]; 2951 for (i=2; i<txn->mt_numdbs; i++) { 2952 /* preserve parent's DB_NEW status */ 2953 x = parent->mt_dbflags[i] & DB_NEW; 2954 parent->mt_dbflags[i] = txn->mt_dbflags[i] | x; 2955 } 2956 2957 dst = parent->mt_u.dirty_list; 2958 src = txn->mt_u.dirty_list; 2959 /* Remove anything in our dirty list from parent's spill list */ 2960 if ((pspill = parent->mt_spill_pgs) && (ps_len = pspill[0])) { 2961 x = y = ps_len; 2962 pspill[0] = (pgno_t)-1; 2963 /* Mark our dirty pages as deleted in parent spill list */ 2964 for (i=0, len=src[0].mid; ++i <= len; ) { 2965 MDB_ID pn = src[i].mid << 1; 2966 while (pn > pspill[x]) 2967 x--; 2968 if (pn == pspill[x]) { 2969 pspill[x] = 1; 2970 y = --x; 2971 } 2972 } 2973 /* Squash deleted pagenums if we deleted any */ 2974 for (x=y; ++x <= ps_len; ) 2975 if (!(pspill[x] & 1)) 2976 pspill[++y] = pspill[x]; 2977 pspill[0] = y; 2978 } 2979 2980 /* Find len = length of merging our dirty list with parent's */ 2981 x = dst[0].mid; 2982 dst[0].mid = 0; /* simplify loops */ 2983 if (parent->mt_parent) { 2984 len = x + src[0].mid; 2985 y = mdb_mid2l_search(src, dst[x].mid + 1) - 1; 2986 for (i = x; y && i; y--) { 2987 pgno_t yp = src[y].mid; 2988 while (yp < dst[i].mid) 2989 i--; 2990 if (yp == dst[i].mid) { 2991 i--; 2992 len--; 2993 } 2994 } 2995 } else { /* Simplify the above for single-ancestor case */ 2996 len = MDB_IDL_UM_MAX - txn->mt_dirty_room; 2997 } 2998 /* Merge our dirty list with parent's */ 2999 y = src[0].mid; 3000 for (i = len; y; dst[i--] = src[y--]) { 3001 pgno_t yp = src[y].mid; 3002 
while (yp < dst[x].mid) 3003 dst[i--] = dst[x--]; 3004 if (yp == dst[x].mid) 3005 free(dst[x--].mptr); 3006 } 3007 mdb_tassert(txn, i == x); 3008 dst[0].mid = len; 3009 free(txn->mt_u.dirty_list); 3010 parent->mt_dirty_room = txn->mt_dirty_room; 3011 if (txn->mt_spill_pgs) { 3012 if (parent->mt_spill_pgs) { 3013 /* TODO: Prevent failure here, so parent does not fail */ 3014 rc = mdb_midl_append_list(&parent->mt_spill_pgs, txn->mt_spill_pgs); 3015 if (rc) 3016 parent->mt_flags |= MDB_TXN_ERROR; 3017 mdb_midl_free(txn->mt_spill_pgs); 3018 mdb_midl_sort(parent->mt_spill_pgs); 3019 } else { 3020 parent->mt_spill_pgs = txn->mt_spill_pgs; 3021 } 3022 } 3023 3024 parent->mt_child = NULL; 3025 mdb_midl_free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pghead); 3026 free(txn); 3027 return rc; 3028 } 3029 3030 if (txn != env->me_txn) { 3031 DPUTS("attempt to commit unknown transaction"); 3032 rc = EINVAL; 3033 goto fail; 3034 } 3035 3036 mdb_cursors_close(txn, 0); 3037 3038 if (!txn->mt_u.dirty_list[0].mid && 3039 !(txn->mt_flags & (MDB_TXN_DIRTY|MDB_TXN_SPILLS))) 3040 goto done; 3041 3042 DPRINTF(("committing txn %"Z"u %p on mdbenv %p, root page %"Z"u", 3043 txn->mt_txnid, (void*)txn, (void*)env, txn->mt_dbs[MAIN_DBI].md_root)); 3044 3045 /* Update DB root pointers */ 3046 if (txn->mt_numdbs > 2) { 3047 MDB_cursor mc; 3048 MDB_dbi i; 3049 MDB_val data; 3050 data.mv_size = sizeof(MDB_db); 3051 3052 mdb_cursor_init(&mc, txn, MAIN_DBI, NULL); 3053 for (i = 2; i < txn->mt_numdbs; i++) { 3054 if (txn->mt_dbflags[i] & DB_DIRTY) { 3055 data.mv_data = &txn->mt_dbs[i]; 3056 rc = mdb_cursor_put(&mc, &txn->mt_dbxs[i].md_name, &data, 0); 3057 if (rc) 3058 goto fail; 3059 } 3060 } 3061 } 3062 3063 rc = mdb_freelist_save(txn); 3064 if (rc) 3065 goto fail; 3066 3067 mdb_midl_free(env->me_pghead); 3068 env->me_pghead = NULL; 3069 if (mdb_midl_shrink(&txn->mt_free_pgs)) 3070 env->me_free_pgs = txn->mt_free_pgs; 3071 3072 #if (MDB_DEBUG) > 2 3073 mdb_audit(txn); 3074 #endif 3075 3076 if ((rc = 
mdb_page_flush(txn, 0)) || 3077 (rc = mdb_env_sync(env, 0)) || 3078 (rc = mdb_env_write_meta(txn))) 3079 goto fail; 3080 3081 done: 3082 env->me_pglast = 0; 3083 env->me_txn = NULL; 3084 mdb_dbis_update(txn, 1); 3085 3086 if (env->me_txns) 3087 UNLOCK_MUTEX_W(env); 3088 free(txn); 3089 3090 return MDB_SUCCESS; 3091 3092 fail: 3093 mdb_txn_abort(txn); 3094 return rc; 3095 } 3096 3097 /** Read the environment parameters of a DB environment before 3098 * mapping it into memory. 3099 * @param[in] env the environment handle 3100 * @param[out] meta address of where to store the meta information 3101 * @return 0 on success, non-zero on failure. 3102 */ 3103 static int 3104 mdb_env_read_header(MDB_env *env, MDB_meta *meta) 3105 { 3106 MDB_metabuf pbuf; 3107 MDB_page *p; 3108 MDB_meta *m; 3109 int i, rc, off; 3110 enum { Size = sizeof(pbuf) }; 3111 3112 /* We don't know the page size yet, so use a minimum value. 3113 * Read both meta pages so we can use the latest one. 3114 */ 3115 3116 for (i=off=0; i<2; i++, off = meta->mm_psize) { 3117 #ifdef _WIN32 3118 DWORD len; 3119 OVERLAPPED ov; 3120 memset(&ov, 0, sizeof(ov)); 3121 ov.Offset = off; 3122 rc = ReadFile(env->me_fd, &pbuf, Size, &len, &ov) ? (int)len : -1; 3123 if (rc == -1 && ErrCode() == ERROR_HANDLE_EOF) 3124 rc = 0; 3125 #else 3126 rc = pread(env->me_fd, &pbuf, Size, off); 3127 #endif 3128 if (rc != Size) { 3129 if (rc == 0 && off == 0) 3130 return ENOENT; 3131 rc = rc < 0 ? 
(int) ErrCode() : MDB_INVALID; 3132 DPRINTF(("read: %s", mdb_strerror(rc))); 3133 return rc; 3134 } 3135 3136 p = (MDB_page *)&pbuf; 3137 3138 if (!F_ISSET(p->mp_flags, P_META)) { 3139 DPRINTF(("page %"Z"u not a meta page", p->mp_pgno)); 3140 return MDB_INVALID; 3141 } 3142 3143 m = METADATA(p); 3144 if (m->mm_magic != MDB_MAGIC) { 3145 DPUTS("meta has invalid magic"); 3146 return MDB_INVALID; 3147 } 3148 3149 if (m->mm_version != MDB_DATA_VERSION) { 3150 DPRINTF(("database is version %u, expected version %u", 3151 m->mm_version, MDB_DATA_VERSION)); 3152 return MDB_VERSION_MISMATCH; 3153 } 3154 3155 if (off == 0 || m->mm_txnid > meta->mm_txnid) 3156 *meta = *m; 3157 } 3158 return 0; 3159 } 3160 3161 /** Write the environment parameters of a freshly created DB environment. 3162 * @param[in] env the environment handle 3163 * @param[out] meta address of where to store the meta information 3164 * @return 0 on success, non-zero on failure. 3165 */ 3166 static int 3167 mdb_env_init_meta(MDB_env *env, MDB_meta *meta) 3168 { 3169 MDB_page *p, *q; 3170 int rc; 3171 unsigned int psize; 3172 #ifdef _WIN32 3173 DWORD len; 3174 OVERLAPPED ov; 3175 memset(&ov, 0, sizeof(ov)); 3176 #define DO_PWRITE(rc, fd, ptr, size, len, pos) do { \ 3177 ov.Offset = pos; \ 3178 rc = WriteFile(fd, ptr, size, &len, &ov); } while(0) 3179 #else 3180 int len; 3181 #define DO_PWRITE(rc, fd, ptr, size, len, pos) do { \ 3182 len = pwrite(fd, ptr, size, pos); \ 3183 rc = (len >= 0); } while(0) 3184 #endif 3185 3186 DPUTS("writing new meta page"); 3187 3188 psize = env->me_psize; 3189 3190 meta->mm_magic = MDB_MAGIC; 3191 meta->mm_version = MDB_DATA_VERSION; 3192 meta->mm_mapsize = env->me_mapsize; 3193 meta->mm_psize = psize; 3194 meta->mm_last_pg = 1; 3195 meta->mm_flags = env->me_flags & 0xffff; 3196 meta->mm_flags |= MDB_INTEGERKEY; 3197 meta->mm_dbs[0].md_root = P_INVALID; 3198 meta->mm_dbs[1].md_root = P_INVALID; 3199 3200 p = calloc(2, psize); 3201 p->mp_pgno = 0; 3202 p->mp_flags = P_META; 3203 
*(MDB_meta *)METADATA(p) = *meta; 3204 3205 q = (MDB_page *)((char *)p + psize); 3206 q->mp_pgno = 1; 3207 q->mp_flags = P_META; 3208 *(MDB_meta *)METADATA(q) = *meta; 3209 3210 DO_PWRITE(rc, env->me_fd, p, psize * 2, len, 0); 3211 if (!rc) 3212 rc = ErrCode(); 3213 else if ((unsigned) len == psize * 2) 3214 rc = MDB_SUCCESS; 3215 else 3216 rc = ENOSPC; 3217 free(p); 3218 return rc; 3219 } 3220 3221 /** Update the environment info to commit a transaction. 3222 * @param[in] txn the transaction that's being committed 3223 * @return 0 on success, non-zero on failure. 3224 */ 3225 static int 3226 mdb_env_write_meta(MDB_txn *txn) 3227 { 3228 MDB_env *env; 3229 MDB_meta meta, metab, *mp; 3230 off_t off; 3231 int rc, len, toggle; 3232 char *ptr; 3233 HANDLE mfd; 3234 #ifdef _WIN32 3235 OVERLAPPED ov; 3236 #else 3237 int r2; 3238 #endif 3239 3240 toggle = txn->mt_txnid & 1; 3241 DPRINTF(("writing meta page %d for root page %"Z"u", 3242 toggle, txn->mt_dbs[MAIN_DBI].md_root)); 3243 3244 env = txn->mt_env; 3245 mp = env->me_metas[toggle]; 3246 3247 if (env->me_flags & MDB_WRITEMAP) { 3248 /* Persist any increases of mapsize config */ 3249 if (env->me_mapsize > mp->mm_mapsize) 3250 mp->mm_mapsize = env->me_mapsize; 3251 mp->mm_dbs[0] = txn->mt_dbs[0]; 3252 mp->mm_dbs[1] = txn->mt_dbs[1]; 3253 mp->mm_last_pg = txn->mt_next_pgno - 1; 3254 mp->mm_txnid = txn->mt_txnid; 3255 if (!(env->me_flags & (MDB_NOMETASYNC|MDB_NOSYNC))) { 3256 unsigned meta_size = env->me_psize; 3257 rc = (env->me_flags & MDB_MAPASYNC) ? 
MS_ASYNC : MS_SYNC; 3258 ptr = env->me_map; 3259 if (toggle) { 3260 #ifndef _WIN32 /* POSIX msync() requires ptr = start of OS page */ 3261 if (meta_size < env->me_os_psize) 3262 meta_size += meta_size; 3263 else 3264 #endif 3265 ptr += meta_size; 3266 } 3267 if (MDB_MSYNC(ptr, meta_size, rc)) { 3268 rc = ErrCode(); 3269 goto fail; 3270 } 3271 } 3272 goto done; 3273 } 3274 metab.mm_txnid = env->me_metas[toggle]->mm_txnid; 3275 metab.mm_last_pg = env->me_metas[toggle]->mm_last_pg; 3276 3277 ptr = (char *)&meta; 3278 if (env->me_mapsize > mp->mm_mapsize) { 3279 /* Persist any increases of mapsize config */ 3280 meta.mm_mapsize = env->me_mapsize; 3281 off = offsetof(MDB_meta, mm_mapsize); 3282 } else { 3283 off = offsetof(MDB_meta, mm_dbs[0].md_depth); 3284 } 3285 len = sizeof(MDB_meta) - off; 3286 3287 ptr += off; 3288 meta.mm_dbs[0] = txn->mt_dbs[0]; 3289 meta.mm_dbs[1] = txn->mt_dbs[1]; 3290 meta.mm_last_pg = txn->mt_next_pgno - 1; 3291 meta.mm_txnid = txn->mt_txnid; 3292 3293 if (toggle) 3294 off += env->me_psize; 3295 off += PAGEHDRSZ; 3296 3297 /* Write to the SYNC fd */ 3298 mfd = env->me_flags & (MDB_NOSYNC|MDB_NOMETASYNC) ? 3299 env->me_fd : env->me_mfd; 3300 #ifdef _WIN32 3301 { 3302 memset(&ov, 0, sizeof(ov)); 3303 ov.Offset = off; 3304 if (!WriteFile(mfd, ptr, len, (DWORD *)&rc, &ov)) 3305 rc = -1; 3306 } 3307 #else 3308 rc = pwrite(mfd, ptr, len, off); 3309 #endif 3310 if (rc != len) { 3311 rc = rc < 0 ? ErrCode() : EIO; 3312 DPUTS("write failed, disk error?"); 3313 /* On a failure, the pagecache still contains the new data. 3314 * Write some old data back, to prevent it from being used. 3315 * Use the non-SYNC fd; we know it will fail anyway. 3316 */ 3317 meta.mm_last_pg = metab.mm_last_pg; 3318 meta.mm_txnid = metab.mm_txnid; 3319 #ifdef _WIN32 3320 memset(&ov, 0, sizeof(ov)); 3321 ov.Offset = off; 3322 WriteFile(env->me_fd, ptr, len, NULL, &ov); 3323 #else 3324 r2 = pwrite(env->me_fd, ptr, len, off); 3325 (void)r2; /* Silence warnings. 
We don't care about pwrite's return value */ 3326 #endif 3327 fail: 3328 env->me_flags |= MDB_FATAL_ERROR; 3329 return rc; 3330 } 3331 done: 3332 /* Memory ordering issues are irrelevant; since the entire writer 3333 * is wrapped by wmutex, all of these changes will become visible 3334 * after the wmutex is unlocked. Since the DB is multi-version, 3335 * readers will get consistent data regardless of how fresh or 3336 * how stale their view of these values is. 3337 */ 3338 if (env->me_txns) 3339 env->me_txns->mti_txnid = txn->mt_txnid; 3340 3341 return MDB_SUCCESS; 3342 } 3343 3344 /** Check both meta pages to see which one is newer. 3345 * @param[in] env the environment handle 3346 * @return meta toggle (0 or 1). 3347 */ 3348 static int 3349 mdb_env_pick_meta(const MDB_env *env) 3350 { 3351 return (env->me_metas[0]->mm_txnid < env->me_metas[1]->mm_txnid); 3352 } 3353 3354 int 3355 mdb_env_create(MDB_env **env) 3356 { 3357 MDB_env *e; 3358 3359 e = calloc(1, sizeof(MDB_env)); 3360 if (!e) 3361 return ENOMEM; 3362 3363 e->me_maxreaders = DEFAULT_READERS; 3364 e->me_maxdbs = e->me_numdbs = 2; 3365 e->me_fd = INVALID_HANDLE_VALUE; 3366 e->me_lfd = INVALID_HANDLE_VALUE; 3367 e->me_mfd = INVALID_HANDLE_VALUE; 3368 #ifdef MDB_USE_POSIX_SEM 3369 e->me_rmutex = SEM_FAILED; 3370 e->me_wmutex = SEM_FAILED; 3371 #endif 3372 e->me_pid = getpid(); 3373 GET_PAGESIZE(e->me_os_psize); 3374 VGMEMP_CREATE(e,0,0); 3375 *env = e; 3376 return MDB_SUCCESS; 3377 } 3378 3379 static int 3380 mdb_env_map(MDB_env *env, void *addr, int newsize) 3381 { 3382 MDB_page *p; 3383 unsigned int flags = env->me_flags; 3384 #ifdef _WIN32 3385 int rc; 3386 HANDLE mh; 3387 LONG sizelo, sizehi; 3388 sizelo = env->me_mapsize & 0xffffffff; 3389 sizehi = env->me_mapsize >> 16 >> 16; /* only needed on Win64 */ 3390 3391 /* Windows won't create mappings for zero length files. 3392 * Just allocate the maxsize right now. 
3393 */ 3394 if (newsize) { 3395 if (SetFilePointer(env->me_fd, sizelo, &sizehi, 0) != (DWORD)sizelo 3396 || !SetEndOfFile(env->me_fd) 3397 || SetFilePointer(env->me_fd, 0, NULL, 0) != 0) 3398 return ErrCode(); 3399 } 3400 mh = CreateFileMapping(env->me_fd, NULL, flags & MDB_WRITEMAP ? 3401 PAGE_READWRITE : PAGE_READONLY, 3402 sizehi, sizelo, NULL); 3403 if (!mh) 3404 return ErrCode(); 3405 env->me_map = MapViewOfFileEx(mh, flags & MDB_WRITEMAP ? 3406 FILE_MAP_WRITE : FILE_MAP_READ, 3407 0, 0, env->me_mapsize, addr); 3408 rc = env->me_map ? 0 : ErrCode(); 3409 CloseHandle(mh); 3410 if (rc) 3411 return rc; 3412 #else 3413 int prot = PROT_READ; 3414 if (flags & MDB_WRITEMAP) { 3415 prot |= PROT_WRITE; 3416 if (ftruncate(env->me_fd, env->me_mapsize) < 0) 3417 return ErrCode(); 3418 } 3419 env->me_map = mmap(addr, env->me_mapsize, prot, MAP_SHARED, 3420 env->me_fd, 0); 3421 if (env->me_map == MAP_FAILED) { 3422 env->me_map = NULL; 3423 return ErrCode(); 3424 } 3425 3426 if (flags & MDB_NORDAHEAD) { 3427 /* Turn off readahead. It's harmful when the DB is larger than RAM. */ 3428 #ifdef MADV_RANDOM 3429 madvise(env->me_map, env->me_mapsize, MADV_RANDOM); 3430 #else 3431 #ifdef POSIX_MADV_RANDOM 3432 posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_RANDOM); 3433 #endif /* POSIX_MADV_RANDOM */ 3434 #endif /* MADV_RANDOM */ 3435 } 3436 #endif /* _WIN32 */ 3437 3438 /* Can happen because the address argument to mmap() is just a 3439 * hint. mmap() can pick another, e.g. if the range is in use. 3440 * The MAP_FIXED flag would prevent that, but then mmap could 3441 * instead unmap existing pages to make room for the new map. 3442 */ 3443 if (addr && env->me_map != addr) 3444 return EBUSY; /* TODO: Make a new MDB_* error code? 
*/ 3445 3446 p = (MDB_page *)env->me_map; 3447 env->me_metas[0] = METADATA(p); 3448 env->me_metas[1] = (MDB_meta *)((char *)env->me_metas[0] + env->me_psize); 3449 3450 return MDB_SUCCESS; 3451 } 3452 3453 int 3454 mdb_env_set_mapsize(MDB_env *env, size_t size) 3455 { 3456 /* If env is already open, caller is responsible for making 3457 * sure there are no active txns. 3458 */ 3459 if (env->me_map) { 3460 int rc; 3461 void *old; 3462 if (env->me_txn) 3463 return EINVAL; 3464 if (!size) 3465 size = env->me_metas[mdb_env_pick_meta(env)]->mm_mapsize; 3466 else if (size < env->me_mapsize) { 3467 /* If the configured size is smaller, make sure it's 3468 * still big enough. Silently round up to minimum if not. 3469 */ 3470 size_t minsize = (env->me_metas[mdb_env_pick_meta(env)]->mm_last_pg + 1) * env->me_psize; 3471 if (size < minsize) 3472 size = minsize; 3473 } 3474 munmap(env->me_map, env->me_mapsize); 3475 env->me_mapsize = size; 3476 old = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : NULL; 3477 rc = mdb_env_map(env, old, 1); 3478 if (rc) 3479 return rc; 3480 } 3481 env->me_mapsize = size; 3482 if (env->me_psize) 3483 env->me_maxpg = env->me_mapsize / env->me_psize; 3484 return MDB_SUCCESS; 3485 } 3486 3487 int 3488 mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs) 3489 { 3490 if (env->me_map) 3491 return EINVAL; 3492 env->me_maxdbs = dbs + 2; /* Named databases + main and free DB */ 3493 return MDB_SUCCESS; 3494 } 3495 3496 int 3497 mdb_env_set_maxreaders(MDB_env *env, unsigned int readers) 3498 { 3499 if (env->me_map || readers < 1) 3500 return EINVAL; 3501 env->me_maxreaders = readers; 3502 return MDB_SUCCESS; 3503 } 3504 3505 int 3506 mdb_env_get_maxreaders(MDB_env *env, unsigned int *readers) 3507 { 3508 if (!env || !readers) 3509 return EINVAL; 3510 *readers = env->me_maxreaders; 3511 return MDB_SUCCESS; 3512 } 3513 3514 /** Further setup required for opening an MDB environment 3515 */ 3516 static int 3517 mdb_env_open2(MDB_env *env) 3518 { 3519 unsigned int 
flags = env->me_flags; 3520 int i, newenv = 0, rc; 3521 MDB_meta meta; 3522 3523 #ifdef _WIN32 3524 /* See if we should use QueryLimited */ 3525 rc = GetVersion(); 3526 if ((rc & 0xff) > 5) 3527 env->me_pidquery = MDB_PROCESS_QUERY_LIMITED_INFORMATION; 3528 else 3529 env->me_pidquery = PROCESS_QUERY_INFORMATION; 3530 #endif /* _WIN32 */ 3531 3532 memset(&meta, 0, sizeof(meta)); 3533 3534 if ((i = mdb_env_read_header(env, &meta)) != 0) { 3535 if (i != ENOENT) 3536 return i; 3537 DPUTS("new mdbenv"); 3538 newenv = 1; 3539 env->me_psize = env->me_os_psize; 3540 if (env->me_psize > MAX_PAGESIZE) 3541 env->me_psize = MAX_PAGESIZE; 3542 } else { 3543 env->me_psize = meta.mm_psize; 3544 } 3545 3546 /* Was a mapsize configured? */ 3547 if (!env->me_mapsize) { 3548 /* If this is a new environment, take the default, 3549 * else use the size recorded in the existing env. 3550 */ 3551 env->me_mapsize = newenv ? DEFAULT_MAPSIZE : meta.mm_mapsize; 3552 } else if (env->me_mapsize < meta.mm_mapsize) { 3553 /* If the configured size is smaller, make sure it's 3554 * still big enough. Silently round up to minimum if not. 
3555 */ 3556 size_t minsize = (meta.mm_last_pg + 1) * meta.mm_psize; 3557 if (env->me_mapsize < minsize) 3558 env->me_mapsize = minsize; 3559 } 3560 3561 rc = mdb_env_map(env, meta.mm_address, newenv || env->me_mapsize != meta.mm_mapsize); 3562 if (rc) 3563 return rc; 3564 3565 if (newenv) { 3566 if (flags & MDB_FIXEDMAP) 3567 meta.mm_address = env->me_map; 3568 i = mdb_env_init_meta(env, &meta); 3569 if (i != MDB_SUCCESS) { 3570 return i; 3571 } 3572 } 3573 3574 env->me_maxfree_1pg = (env->me_psize - PAGEHDRSZ) / sizeof(pgno_t) - 1; 3575 env->me_nodemax = (((env->me_psize - PAGEHDRSZ) / MDB_MINKEYS) & -2) 3576 - sizeof(indx_t); 3577 #if !(MDB_MAXKEYSIZE) 3578 env->me_maxkey = env->me_nodemax - (NODESIZE + sizeof(MDB_db)); 3579 #endif 3580 env->me_maxpg = env->me_mapsize / env->me_psize; 3581 3582 #if MDB_DEBUG 3583 { 3584 int toggle = mdb_env_pick_meta(env); 3585 MDB_db *db = &env->me_metas[toggle]->mm_dbs[MAIN_DBI]; 3586 3587 DPRINTF(("opened database version %u, pagesize %u", 3588 env->me_metas[0]->mm_version, env->me_psize)); 3589 DPRINTF(("using meta page %d", toggle)); 3590 DPRINTF(("depth: %u", db->md_depth)); 3591 DPRINTF(("entries: %"Z"u", db->md_entries)); 3592 DPRINTF(("branch pages: %"Z"u", db->md_branch_pages)); 3593 DPRINTF(("leaf pages: %"Z"u", db->md_leaf_pages)); 3594 DPRINTF(("overflow pages: %"Z"u", db->md_overflow_pages)); 3595 DPRINTF(("root: %"Z"u", db->md_root)); 3596 } 3597 #endif 3598 3599 return MDB_SUCCESS; 3600 } 3601 3602 3603 /** Release a reader thread's slot in the reader lock table. 3604 * This function is called automatically when a thread exits. 3605 * @param[in] ptr This points to the slot in the reader lock table. 3606 */ 3607 static void 3608 mdb_env_reader_dest(void *ptr) 3609 { 3610 MDB_reader *reader = ptr; 3611 3612 reader->mr_pid = 0; 3613 } 3614 3615 #ifdef _WIN32 3616 /** Junk for arranging thread-specific callbacks on Windows. This is 3617 * necessarily platform and compiler-specific. 
Windows supports up 3618 * to 1088 keys. Let's assume nobody opens more than 64 environments 3619 * in a single process, for now. They can override this if needed. 3620 */ 3621 #ifndef MAX_TLS_KEYS 3622 #define MAX_TLS_KEYS 64 3623 #endif 3624 static pthread_key_t mdb_tls_keys[MAX_TLS_KEYS]; 3625 static int mdb_tls_nkeys; 3626 3627 static void NTAPI mdb_tls_callback(PVOID module, DWORD reason, PVOID ptr) 3628 { 3629 int i; 3630 switch(reason) { 3631 case DLL_PROCESS_ATTACH: break; 3632 case DLL_THREAD_ATTACH: break; 3633 case DLL_THREAD_DETACH: 3634 for (i=0; i<mdb_tls_nkeys; i++) { 3635 MDB_reader *r = pthread_getspecific(mdb_tls_keys[i]); 3636 mdb_env_reader_dest(r); 3637 } 3638 break; 3639 case DLL_PROCESS_DETACH: break; 3640 } 3641 } 3642 #ifdef __GNUC__ 3643 #ifdef _WIN64 3644 const PIMAGE_TLS_CALLBACK mdb_tls_cbp __attribute__((section (".CRT$XLB"))) = mdb_tls_callback; 3645 #else 3646 PIMAGE_TLS_CALLBACK mdb_tls_cbp __attribute__((section (".CRT$XLB"))) = mdb_tls_callback; 3647 #endif 3648 #else 3649 #ifdef _WIN64 3650 /* Force some symbol references. 3651 * _tls_used forces the linker to create the TLS directory if not already done 3652 * mdb_tls_cbp prevents whole-program-optimizer from dropping the symbol. 
3653 */ 3654 #pragma comment(linker, "/INCLUDE:_tls_used") 3655 #pragma comment(linker, "/INCLUDE:mdb_tls_cbp") 3656 #pragma const_seg(".CRT$XLB") 3657 extern const PIMAGE_TLS_CALLBACK mdb_tls_cbp; 3658 const PIMAGE_TLS_CALLBACK mdb_tls_cbp = mdb_tls_callback; 3659 #pragma const_seg() 3660 #else /* WIN32 */ 3661 #pragma comment(linker, "/INCLUDE:__tls_used") 3662 #pragma comment(linker, "/INCLUDE:_mdb_tls_cbp") 3663 #pragma data_seg(".CRT$XLB") 3664 PIMAGE_TLS_CALLBACK mdb_tls_cbp = mdb_tls_callback; 3665 #pragma data_seg() 3666 #endif /* WIN 32/64 */ 3667 #endif /* !__GNUC__ */ 3668 #endif 3669 3670 /** Downgrade the exclusive lock on the region back to shared */ 3671 static int 3672 mdb_env_share_locks(MDB_env *env, int *excl) 3673 { 3674 int rc = 0, toggle = mdb_env_pick_meta(env); 3675 3676 env->me_txns->mti_txnid = env->me_metas[toggle]->mm_txnid; 3677 3678 #ifdef _WIN32 3679 { 3680 OVERLAPPED ov; 3681 /* First acquire a shared lock. The Unlock will 3682 * then release the existing exclusive lock. 3683 */ 3684 memset(&ov, 0, sizeof(ov)); 3685 if (!LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) { 3686 rc = ErrCode(); 3687 } else { 3688 UnlockFile(env->me_lfd, 0, 0, 1, 0); 3689 *excl = 0; 3690 } 3691 } 3692 #else 3693 { 3694 struct flock lock_info; 3695 /* The shared lock replaces the existing lock */ 3696 memset((void *)&lock_info, 0, sizeof(lock_info)); 3697 lock_info.l_type = F_RDLCK; 3698 lock_info.l_whence = SEEK_SET; 3699 lock_info.l_start = 0; 3700 lock_info.l_len = 1; 3701 while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) && 3702 (rc = ErrCode()) == EINTR) ; 3703 *excl = rc ? -1 : 0; /* error may mean we lost the lock */ 3704 } 3705 #endif 3706 3707 return rc; 3708 } 3709 3710 /** Try to get exlusive lock, otherwise shared. 3711 * Maintain *excl = -1: no/unknown lock, 0: shared, 1: exclusive. 
3712 */ 3713 static int 3714 mdb_env_excl_lock(MDB_env *env, int *excl) 3715 { 3716 int rc = 0; 3717 #ifdef _WIN32 3718 if (LockFile(env->me_lfd, 0, 0, 1, 0)) { 3719 *excl = 1; 3720 } else { 3721 OVERLAPPED ov; 3722 memset(&ov, 0, sizeof(ov)); 3723 if (LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) { 3724 *excl = 0; 3725 } else { 3726 rc = ErrCode(); 3727 } 3728 } 3729 #else 3730 struct flock lock_info; 3731 memset((void *)&lock_info, 0, sizeof(lock_info)); 3732 lock_info.l_type = F_WRLCK; 3733 lock_info.l_whence = SEEK_SET; 3734 lock_info.l_start = 0; 3735 lock_info.l_len = 1; 3736 while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) && 3737 (rc = ErrCode()) == EINTR) ; 3738 if (!rc) { 3739 *excl = 1; 3740 } else 3741 # ifdef MDB_USE_POSIX_SEM 3742 if (*excl < 0) /* always true when !MDB_USE_POSIX_SEM */ 3743 # endif 3744 { 3745 lock_info.l_type = F_RDLCK; 3746 while ((rc = fcntl(env->me_lfd, F_SETLKW, &lock_info)) && 3747 (rc = ErrCode()) == EINTR) ; 3748 if (rc == 0) 3749 *excl = 0; 3750 } 3751 #endif 3752 return rc; 3753 } 3754 3755 #ifdef MDB_USE_HASH 3756 /* 3757 * hash_64 - 64 bit Fowler/Noll/Vo-0 FNV-1a hash code 3758 * 3759 * @(#) Revision: 5.1 3760 * @(#) Id: hash_64a.c,v 5.1 2009/06/30 09:01:38 chongo Exp 3761 * @(#) Source: /usr/local/src/cmd/fnv/RCS/hash_64a.c,v 3762 * 3763 * http://www.isthe.com/chongo/tech/comp/fnv/index.html 3764 * 3765 *** 3766 * 3767 * Please do not copyright this code. This code is in the public domain. 3768 * 3769 * LANDON CURT NOLL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 3770 * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO 3771 * EVENT SHALL LANDON CURT NOLL BE LIABLE FOR ANY SPECIAL, INDIRECT OR 3772 * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF 3773 * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 3774 * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 3775 * PERFORMANCE OF THIS SOFTWARE. 
3776 * 3777 * By: 3778 * chongo <Landon Curt Noll> /\oo/\ 3779 * http://www.isthe.com/chongo/ 3780 * 3781 * Share and Enjoy! :-) 3782 */ 3783 3784 typedef unsigned long long mdb_hash_t; 3785 #define MDB_HASH_INIT ((mdb_hash_t)0xcbf29ce484222325ULL) 3786 3787 /** perform a 64 bit Fowler/Noll/Vo FNV-1a hash on a buffer 3788 * @param[in] val value to hash 3789 * @param[in] hval initial value for hash 3790 * @return 64 bit hash 3791 * 3792 * NOTE: To use the recommended 64 bit FNV-1a hash, use MDB_HASH_INIT as the 3793 * hval arg on the first call. 3794 */ 3795 static mdb_hash_t 3796 mdb_hash_val(MDB_val *val, mdb_hash_t hval) 3797 { 3798 unsigned char *s = (unsigned char *)val->mv_data; /* unsigned string */ 3799 unsigned char *end = s + val->mv_size; 3800 /* 3801 * FNV-1a hash each octet of the string 3802 */ 3803 while (s < end) { 3804 /* xor the bottom with the current octet */ 3805 hval ^= (mdb_hash_t)*s++; 3806 3807 /* multiply by the 64 bit FNV magic prime mod 2^64 */ 3808 hval += (hval << 1) + (hval << 4) + (hval << 5) + 3809 (hval << 7) + (hval << 8) + (hval << 40); 3810 } 3811 /* return our new hash value */ 3812 return hval; 3813 } 3814 3815 /** Hash the string and output the encoded hash. 3816 * This uses modified RFC1924 Ascii85 encoding to accommodate systems with 3817 * very short name limits. We don't care about the encoding being reversible, 3818 * we just want to preserve as many bits of the input as possible in a 3819 * small printable string. 
3820 * @param[in] str string to hash 3821 * @param[out] encbuf an array of 11 chars to hold the hash 3822 */ 3823 static const char mdb_a85[]= "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~"; 3824 3825 static void 3826 mdb_pack85(unsigned long l, char *out) 3827 { 3828 int i; 3829 3830 for (i=0; i<5; i++) { 3831 *out++ = mdb_a85[l % 85]; 3832 l /= 85; 3833 } 3834 } 3835 3836 static void 3837 mdb_hash_enc(MDB_val *val, char *encbuf) 3838 { 3839 mdb_hash_t h = mdb_hash_val(val, MDB_HASH_INIT); 3840 3841 mdb_pack85(h, encbuf); 3842 mdb_pack85(h>>32, encbuf+5); 3843 encbuf[10] = '\0'; 3844 } 3845 #endif 3846 3847 /** Open and/or initialize the lock region for the environment. 3848 * @param[in] env The MDB environment. 3849 * @param[in] lpath The pathname of the file used for the lock region. 3850 * @param[in] mode The Unix permissions for the file, if we create it. 3851 * @param[out] excl Resulting file lock type: -1 none, 0 shared, 1 exclusive 3852 * @param[in,out] excl In -1, out lock type: -1 none, 0 shared, 1 exclusive 3853 * @return 0 on success, non-zero on failure. 
3854 */ 3855 static int 3856 mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) 3857 { 3858 #ifdef _WIN32 3859 # define MDB_ERRCODE_ROFS ERROR_WRITE_PROTECT 3860 #else 3861 # define MDB_ERRCODE_ROFS EROFS 3862 #ifdef O_CLOEXEC /* Linux: Open file and set FD_CLOEXEC atomically */ 3863 # define MDB_CLOEXEC O_CLOEXEC 3864 #else 3865 int fdflags; 3866 # define MDB_CLOEXEC 0 3867 #endif 3868 #endif 3869 int rc; 3870 off_t size, rsize; 3871 3872 #ifdef _WIN32 3873 env->me_lfd = CreateFile(lpath, GENERIC_READ|GENERIC_WRITE, 3874 FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_ALWAYS, 3875 FILE_ATTRIBUTE_NORMAL, NULL); 3876 #else 3877 env->me_lfd = open(lpath, O_RDWR|O_CREAT|MDB_CLOEXEC, mode); 3878 #endif 3879 if (env->me_lfd == INVALID_HANDLE_VALUE) { 3880 rc = ErrCode(); 3881 if (rc == MDB_ERRCODE_ROFS && (env->me_flags & MDB_RDONLY)) { 3882 return MDB_SUCCESS; 3883 } 3884 goto fail_errno; 3885 } 3886 #if ! ((MDB_CLOEXEC) || defined(_WIN32)) 3887 /* Lose record locks when exec*() */ 3888 if ((fdflags = fcntl(env->me_lfd, F_GETFD) | FD_CLOEXEC) >= 0) 3889 fcntl(env->me_lfd, F_SETFD, fdflags); 3890 #endif 3891 3892 if (!(env->me_flags & MDB_NOTLS)) { 3893 rc = pthread_key_create(&env->me_txkey, mdb_env_reader_dest); 3894 if (rc) 3895 goto fail; 3896 env->me_flags |= MDB_ENV_TXKEY; 3897 #ifdef _WIN32 3898 /* Windows TLS callbacks need help finding their TLS info. */ 3899 if (mdb_tls_nkeys >= MAX_TLS_KEYS) { 3900 rc = MDB_TLS_FULL; 3901 goto fail; 3902 } 3903 mdb_tls_keys[mdb_tls_nkeys++] = env->me_txkey; 3904 #endif 3905 } 3906 3907 /* Try to get exclusive lock. If we succeed, then 3908 * nobody is using the lock region and we should initialize it. 
3909 */ 3910 if ((rc = mdb_env_excl_lock(env, excl))) goto fail; 3911 3912 #ifdef _WIN32 3913 size = GetFileSize(env->me_lfd, NULL); 3914 #else 3915 size = lseek(env->me_lfd, 0, SEEK_END); 3916 if (size == -1) goto fail_errno; 3917 #endif 3918 rsize = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo); 3919 if (size < rsize && *excl > 0) { 3920 #ifdef _WIN32 3921 if (SetFilePointer(env->me_lfd, rsize, NULL, FILE_BEGIN) != (DWORD)rsize 3922 || !SetEndOfFile(env->me_lfd)) 3923 goto fail_errno; 3924 #else 3925 if (ftruncate(env->me_lfd, rsize) != 0) goto fail_errno; 3926 #endif 3927 } else { 3928 rsize = size; 3929 size = rsize - sizeof(MDB_txninfo); 3930 env->me_maxreaders = size/sizeof(MDB_reader) + 1; 3931 } 3932 { 3933 #ifdef _WIN32 3934 HANDLE mh; 3935 mh = CreateFileMapping(env->me_lfd, NULL, PAGE_READWRITE, 3936 0, 0, NULL); 3937 if (!mh) goto fail_errno; 3938 env->me_txns = MapViewOfFileEx(mh, FILE_MAP_WRITE, 0, 0, rsize, NULL); 3939 CloseHandle(mh); 3940 if (!env->me_txns) goto fail_errno; 3941 #else 3942 void *m = mmap(NULL, rsize, PROT_READ|PROT_WRITE, MAP_SHARED, 3943 env->me_lfd, 0); 3944 if (m == MAP_FAILED) goto fail_errno; 3945 env->me_txns = m; 3946 #endif 3947 } 3948 if (*excl > 0) { 3949 #ifdef _WIN32 3950 BY_HANDLE_FILE_INFORMATION stbuf; 3951 struct { 3952 DWORD volume; 3953 DWORD nhigh; 3954 DWORD nlow; 3955 } idbuf; 3956 MDB_val val; 3957 char encbuf[11]; 3958 3959 if (!mdb_sec_inited) { 3960 InitializeSecurityDescriptor(&mdb_null_sd, 3961 SECURITY_DESCRIPTOR_REVISION); 3962 SetSecurityDescriptorDacl(&mdb_null_sd, TRUE, 0, FALSE); 3963 mdb_all_sa.nLength = sizeof(SECURITY_ATTRIBUTES); 3964 mdb_all_sa.bInheritHandle = FALSE; 3965 mdb_all_sa.lpSecurityDescriptor = &mdb_null_sd; 3966 mdb_sec_inited = 1; 3967 } 3968 if (!GetFileInformationByHandle(env->me_lfd, &stbuf)) goto fail_errno; 3969 idbuf.volume = stbuf.dwVolumeSerialNumber; 3970 idbuf.nhigh = stbuf.nFileIndexHigh; 3971 idbuf.nlow = stbuf.nFileIndexLow; 3972 val.mv_data = 
&idbuf; 3973 val.mv_size = sizeof(idbuf); 3974 mdb_hash_enc(&val, encbuf); 3975 sprintf(env->me_txns->mti_rmname, "Global\\MDBr%s", encbuf); 3976 sprintf(env->me_txns->mti_wmname, "Global\\MDBw%s", encbuf); 3977 env->me_rmutex = CreateMutex(&mdb_all_sa, FALSE, env->me_txns->mti_rmname); 3978 if (!env->me_rmutex) goto fail_errno; 3979 env->me_wmutex = CreateMutex(&mdb_all_sa, FALSE, env->me_txns->mti_wmname); 3980 if (!env->me_wmutex) goto fail_errno; 3981 #elif defined(MDB_USE_POSIX_SEM) 3982 struct stat stbuf; 3983 struct { 3984 dev_t dev; 3985 ino_t ino; 3986 } idbuf; 3987 MDB_val val; 3988 char encbuf[11]; 3989 3990 #if defined(__NetBSD__) 3991 #define MDB_SHORT_SEMNAMES 1 /* limited to 14 chars */ 3992 #endif 3993 if (fstat(env->me_lfd, &stbuf)) goto fail_errno; 3994 idbuf.dev = stbuf.st_dev; 3995 idbuf.ino = stbuf.st_ino; 3996 val.mv_data = &idbuf; 3997 val.mv_size = sizeof(idbuf); 3998 mdb_hash_enc(&val, encbuf); 3999 #ifdef MDB_SHORT_SEMNAMES 4000 encbuf[9] = '\0'; /* drop name from 15 chars to 14 chars */ 4001 #endif 4002 sprintf(env->me_txns->mti_rmname, "/MDBr%s", encbuf); 4003 sprintf(env->me_txns->mti_wmname, "/MDBw%s", encbuf); 4004 /* Clean up after a previous run, if needed: Try to 4005 * remove both semaphores before doing anything else. 
4006 */ 4007 sem_unlink(env->me_txns->mti_rmname); 4008 sem_unlink(env->me_txns->mti_wmname); 4009 env->me_rmutex = sem_open(env->me_txns->mti_rmname, 4010 O_CREAT|O_EXCL, mode, 1); 4011 if (env->me_rmutex == SEM_FAILED) goto fail_errno; 4012 env->me_wmutex = sem_open(env->me_txns->mti_wmname, 4013 O_CREAT|O_EXCL, mode, 1); 4014 if (env->me_wmutex == SEM_FAILED) goto fail_errno; 4015 #else /* MDB_USE_POSIX_SEM */ 4016 pthread_mutexattr_t mattr; 4017 4018 if ((rc = pthread_mutexattr_init(&mattr)) 4019 || (rc = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED)) 4020 || (rc = pthread_mutex_init(&env->me_txns->mti_mutex, &mattr)) 4021 || (rc = pthread_mutex_init(&env->me_txns->mti_wmutex, &mattr))) 4022 goto fail; 4023 pthread_mutexattr_destroy(&mattr); 4024 #endif /* _WIN32 || MDB_USE_POSIX_SEM */ 4025 4026 env->me_txns->mti_magic = MDB_MAGIC; 4027 env->me_txns->mti_format = MDB_LOCK_FORMAT; 4028 env->me_txns->mti_txnid = 0; 4029 env->me_txns->mti_numreaders = 0; 4030 4031 } else { 4032 if (env->me_txns->mti_magic != MDB_MAGIC) { 4033 DPUTS("lock region has invalid magic"); 4034 rc = MDB_INVALID; 4035 goto fail; 4036 } 4037 if (env->me_txns->mti_format != MDB_LOCK_FORMAT) { 4038 DPRINTF(("lock region has format+version 0x%x, expected 0x%x", 4039 env->me_txns->mti_format, MDB_LOCK_FORMAT)); 4040 rc = MDB_VERSION_MISMATCH; 4041 goto fail; 4042 } 4043 rc = ErrCode(); 4044 if (rc && rc != EACCES && rc != EAGAIN) { 4045 goto fail; 4046 } 4047 #ifdef _WIN32 4048 env->me_rmutex = OpenMutex(SYNCHRONIZE, FALSE, env->me_txns->mti_rmname); 4049 if (!env->me_rmutex) goto fail_errno; 4050 env->me_wmutex = OpenMutex(SYNCHRONIZE, FALSE, env->me_txns->mti_wmname); 4051 if (!env->me_wmutex) goto fail_errno; 4052 #elif defined(MDB_USE_POSIX_SEM) 4053 env->me_rmutex = sem_open(env->me_txns->mti_rmname, 0); 4054 if (env->me_rmutex == SEM_FAILED) goto fail_errno; 4055 env->me_wmutex = sem_open(env->me_txns->mti_wmname, 0); 4056 if (env->me_wmutex == SEM_FAILED) goto fail_errno; 
4057 #endif 4058 } 4059 return MDB_SUCCESS; 4060 4061 fail_errno: 4062 rc = ErrCode(); 4063 fail: 4064 return rc; 4065 } 4066 4067 /** The name of the lock file in the DB environment */ 4068 #define LOCKNAME "/lock.mdb" 4069 /** The name of the data file in the DB environment */ 4070 #define DATANAME "/data.mdb" 4071 /** The suffix of the lock file when no subdir is used */ 4072 #define LOCKSUFF "-lock" 4073 /** Only a subset of the @ref mdb_env flags can be changed 4074 * at runtime. Changing other flags requires closing the 4075 * environment and re-opening it with the new flags. 4076 */ 4077 #define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC|MDB_NOMEMINIT) 4078 #define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY|MDB_WRITEMAP| \ 4079 MDB_NOTLS|MDB_NOLOCK|MDB_NORDAHEAD) 4080 4081 #if VALID_FLAGS & PERSISTENT_FLAGS & (CHANGEABLE|CHANGELESS) 4082 # error "Persistent DB flags & env flags overlap, but both go in mm_flags" 4083 #endif 4084 4085 int 4086 mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode) 4087 { 4088 int oflags, rc, len, excl = -1; 4089 char *lpath, *dpath; 4090 4091 if (env->me_fd!=INVALID_HANDLE_VALUE || (flags & ~(CHANGEABLE|CHANGELESS))) 4092 return EINVAL; 4093 4094 len = strlen(path); 4095 if (flags & MDB_NOSUBDIR) { 4096 rc = len + sizeof(LOCKSUFF) + len + 1; 4097 } else { 4098 rc = len + sizeof(LOCKNAME) + len + sizeof(DATANAME); 4099 } 4100 lpath = malloc(rc); 4101 if (!lpath) 4102 return ENOMEM; 4103 if (flags & MDB_NOSUBDIR) { 4104 dpath = lpath + len + sizeof(LOCKSUFF); 4105 sprintf(lpath, "%s" LOCKSUFF, path); 4106 strcpy(dpath, path); 4107 } else { 4108 dpath = lpath + len + sizeof(LOCKNAME); 4109 sprintf(lpath, "%s" LOCKNAME, path); 4110 sprintf(dpath, "%s" DATANAME, path); 4111 } 4112 4113 rc = MDB_SUCCESS; 4114 flags |= env->me_flags; 4115 if (flags & MDB_RDONLY) { 4116 /* silently ignore WRITEMAP when we're only getting read access */ 4117 flags &= ~MDB_WRITEMAP; 4118 } else { 4119 if 
(!((env->me_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX)) && 4120 (env->me_dirty_list = calloc(MDB_IDL_UM_SIZE, sizeof(MDB_ID2))))) 4121 rc = ENOMEM; 4122 } 4123 env->me_flags = flags |= MDB_ENV_ACTIVE; 4124 if (rc) 4125 goto leave; 4126 4127 env->me_path = strdup(path); 4128 env->me_dbxs = calloc(env->me_maxdbs, sizeof(MDB_dbx)); 4129 env->me_dbflags = calloc(env->me_maxdbs, sizeof(uint16_t)); 4130 if (!(env->me_dbxs && env->me_path && env->me_dbflags)) { 4131 rc = ENOMEM; 4132 goto leave; 4133 } 4134 4135 /* For RDONLY, get lockfile after we know datafile exists */ 4136 if (!(flags & (MDB_RDONLY|MDB_NOLOCK))) { 4137 rc = mdb_env_setup_locks(env, lpath, mode, &excl); 4138 if (rc) 4139 goto leave; 4140 } 4141 4142 #ifdef _WIN32 4143 if (F_ISSET(flags, MDB_RDONLY)) { 4144 oflags = GENERIC_READ; 4145 len = OPEN_EXISTING; 4146 } else { 4147 oflags = GENERIC_READ|GENERIC_WRITE; 4148 len = OPEN_ALWAYS; 4149 } 4150 mode = FILE_ATTRIBUTE_NORMAL; 4151 env->me_fd = CreateFile(dpath, oflags, FILE_SHARE_READ|FILE_SHARE_WRITE, 4152 NULL, len, mode, NULL); 4153 #else 4154 if (F_ISSET(flags, MDB_RDONLY)) 4155 oflags = O_RDONLY; 4156 else 4157 oflags = O_RDWR | O_CREAT; 4158 4159 env->me_fd = open(dpath, oflags, mode); 4160 #endif 4161 if (env->me_fd == INVALID_HANDLE_VALUE) { 4162 rc = ErrCode(); 4163 goto leave; 4164 } 4165 4166 if ((flags & (MDB_RDONLY|MDB_NOLOCK)) == MDB_RDONLY) { 4167 rc = mdb_env_setup_locks(env, lpath, mode, &excl); 4168 if (rc) 4169 goto leave; 4170 } 4171 4172 if ((rc = mdb_env_open2(env)) == MDB_SUCCESS) { 4173 if (flags & (MDB_RDONLY|MDB_WRITEMAP)) { 4174 env->me_mfd = env->me_fd; 4175 } else { 4176 /* Synchronous fd for meta writes. Needed even with 4177 * MDB_NOSYNC/MDB_NOMETASYNC, in case these get reset. 
4178 */ 4179 #ifdef _WIN32 4180 len = OPEN_EXISTING; 4181 env->me_mfd = CreateFile(dpath, oflags, 4182 FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, len, 4183 mode | FILE_FLAG_WRITE_THROUGH, NULL); 4184 #else 4185 oflags &= ~O_CREAT; 4186 env->me_mfd = open(dpath, oflags | MDB_DSYNC, mode); 4187 #endif 4188 if (env->me_mfd == INVALID_HANDLE_VALUE) { 4189 rc = ErrCode(); 4190 goto leave; 4191 } 4192 } 4193 DPRINTF(("opened dbenv %p", (void *) env)); 4194 if (excl > 0) { 4195 rc = mdb_env_share_locks(env, &excl); 4196 if (rc) 4197 goto leave; 4198 } 4199 if (!((flags & MDB_RDONLY) || 4200 (env->me_pbuf = calloc(1, env->me_psize)))) 4201 rc = ENOMEM; 4202 } 4203 4204 leave: 4205 if (rc) { 4206 mdb_env_close0(env, excl); 4207 } 4208 free(lpath); 4209 return rc; 4210 } 4211 4212 /** Destroy resources from mdb_env_open(), clear our readers & DBIs */ 4213 static void 4214 mdb_env_close0(MDB_env *env, int excl) 4215 { 4216 int i; 4217 4218 if (!(env->me_flags & MDB_ENV_ACTIVE)) 4219 return; 4220 4221 /* Doing this here since me_dbxs may not exist during mdb_env_close */ 4222 for (i = env->me_maxdbs; --i > MAIN_DBI; ) 4223 free(env->me_dbxs[i].md_name.mv_data); 4224 4225 free(env->me_pbuf); 4226 free(env->me_dbflags); 4227 free(env->me_dbxs); 4228 free(env->me_path); 4229 free(env->me_dirty_list); 4230 mdb_midl_free(env->me_free_pgs); 4231 4232 if (env->me_flags & MDB_ENV_TXKEY) { 4233 pthread_key_delete(env->me_txkey); 4234 #ifdef _WIN32 4235 /* Delete our key from the global list */ 4236 for (i=0; i<mdb_tls_nkeys; i++) 4237 if (mdb_tls_keys[i] == env->me_txkey) { 4238 mdb_tls_keys[i] = mdb_tls_keys[mdb_tls_nkeys-1]; 4239 mdb_tls_nkeys--; 4240 break; 4241 } 4242 #endif 4243 } 4244 4245 if (env->me_map) { 4246 munmap(env->me_map, env->me_mapsize); 4247 } 4248 if (env->me_mfd != env->me_fd && env->me_mfd != INVALID_HANDLE_VALUE) 4249 (void) close(env->me_mfd); 4250 if (env->me_fd != INVALID_HANDLE_VALUE) 4251 (void) close(env->me_fd); 4252 if (env->me_txns) { 4253 MDB_PID_T pid = 
env->me_pid; 4254 /* Clearing readers is done in this function because 4255 * me_txkey with its destructor must be disabled first. 4256 */ 4257 for (i = env->me_numreaders; --i >= 0; ) 4258 if (env->me_txns->mti_readers[i].mr_pid == pid) 4259 env->me_txns->mti_readers[i].mr_pid = 0; 4260 #ifdef _WIN32 4261 if (env->me_rmutex) { 4262 CloseHandle(env->me_rmutex); 4263 if (env->me_wmutex) CloseHandle(env->me_wmutex); 4264 } 4265 /* Windows automatically destroys the mutexes when 4266 * the last handle closes. 4267 */ 4268 #elif defined(MDB_USE_POSIX_SEM) 4269 if (env->me_rmutex != SEM_FAILED) { 4270 sem_close(env->me_rmutex); 4271 if (env->me_wmutex != SEM_FAILED) 4272 sem_close(env->me_wmutex); 4273 /* If we have the filelock: If we are the 4274 * only remaining user, clean up semaphores. 4275 */ 4276 if (excl == 0) 4277 mdb_env_excl_lock(env, &excl); 4278 if (excl > 0) { 4279 sem_unlink(env->me_txns->mti_rmname); 4280 sem_unlink(env->me_txns->mti_wmname); 4281 } 4282 } 4283 #endif 4284 munmap((void *)env->me_txns, (env->me_maxreaders-1)*sizeof(MDB_reader)+sizeof(MDB_txninfo)); 4285 } 4286 if (env->me_lfd != INVALID_HANDLE_VALUE) { 4287 #ifdef _WIN32 4288 if (excl >= 0) { 4289 /* Unlock the lockfile. Windows would have unlocked it 4290 * after closing anyway, but not necessarily at once. 4291 */ 4292 UnlockFile(env->me_lfd, 0, 0, 1, 0); 4293 } 4294 #endif 4295 (void) close(env->me_lfd); 4296 } 4297 4298 env->me_flags &= ~(MDB_ENV_ACTIVE|MDB_ENV_TXKEY); 4299 } 4300 4301 int 4302 mdb_env_copyfd(MDB_env *env, HANDLE fd) 4303 { 4304 MDB_txn *txn = NULL; 4305 int rc; 4306 size_t wsize; 4307 char *ptr; 4308 #ifdef _WIN32 4309 DWORD len, w2; 4310 #define DO_WRITE(rc, fd, ptr, w2, len) rc = WriteFile(fd, ptr, w2, &len, NULL) 4311 #else 4312 ssize_t len; 4313 size_t w2; 4314 #define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0) 4315 #endif 4316 4317 /* Do the lock/unlock of the reader mutex before starting the 4318 * write txn. 
Otherwise other read txns could block writers. 4319 */ 4320 rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); 4321 if (rc) 4322 return rc; 4323 4324 if (env->me_txns) { 4325 /* We must start the actual read txn after blocking writers */ 4326 mdb_txn_reset0(txn, "reset-stage1"); 4327 4328 /* Temporarily block writers until we snapshot the meta pages */ 4329 LOCK_MUTEX_W(env); 4330 4331 rc = mdb_txn_renew0(txn); 4332 if (rc) { 4333 UNLOCK_MUTEX_W(env); 4334 goto leave; 4335 } 4336 } 4337 4338 wsize = env->me_psize * 2; 4339 ptr = env->me_map; 4340 w2 = wsize; 4341 while (w2 > 0) { 4342 DO_WRITE(rc, fd, ptr, w2, len); 4343 if (!rc) { 4344 rc = ErrCode(); 4345 break; 4346 } else if (len > 0) { 4347 rc = MDB_SUCCESS; 4348 ptr += len; 4349 w2 -= len; 4350 continue; 4351 } else { 4352 /* Non-blocking or async handles are not supported */ 4353 rc = EIO; 4354 break; 4355 } 4356 } 4357 if (env->me_txns) 4358 UNLOCK_MUTEX_W(env); 4359 4360 if (rc) 4361 goto leave; 4362 4363 wsize = txn->mt_next_pgno * env->me_psize - wsize; 4364 while (wsize > 0) { 4365 if (wsize > MAX_WRITE) 4366 w2 = MAX_WRITE; 4367 else 4368 w2 = wsize; 4369 DO_WRITE(rc, fd, ptr, w2, len); 4370 if (!rc) { 4371 rc = ErrCode(); 4372 break; 4373 } else if (len > 0) { 4374 rc = MDB_SUCCESS; 4375 ptr += len; 4376 wsize -= len; 4377 continue; 4378 } else { 4379 rc = EIO; 4380 break; 4381 } 4382 } 4383 4384 leave: 4385 mdb_txn_abort(txn); 4386 return rc; 4387 } 4388 4389 int 4390 mdb_env_copy(MDB_env *env, const char *path) 4391 { 4392 int rc, len; 4393 char *lpath; 4394 HANDLE newfd = INVALID_HANDLE_VALUE; 4395 4396 if (env->me_flags & MDB_NOSUBDIR) { 4397 lpath = (char *)path; 4398 } else { 4399 len = strlen(path); 4400 len += sizeof(DATANAME); 4401 lpath = malloc(len); 4402 if (!lpath) 4403 return ENOMEM; 4404 sprintf(lpath, "%s" DATANAME, path); 4405 } 4406 4407 /* The destination path must exist, but the destination file must not. 
4408 * We don't want the OS to cache the writes, since the source data is 4409 * already in the OS cache. 4410 */ 4411 #ifdef _WIN32 4412 newfd = CreateFile(lpath, GENERIC_WRITE, 0, NULL, CREATE_NEW, 4413 FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH, NULL); 4414 #else 4415 newfd = open(lpath, O_WRONLY|O_CREAT|O_EXCL, 0666); 4416 #endif 4417 if (newfd == INVALID_HANDLE_VALUE) { 4418 rc = ErrCode(); 4419 goto leave; 4420 } 4421 4422 #ifdef O_DIRECT 4423 /* Set O_DIRECT if the file system supports it */ 4424 if ((rc = fcntl(newfd, F_GETFL)) != -1) 4425 (void) fcntl(newfd, F_SETFL, rc | O_DIRECT); 4426 #endif 4427 #ifdef F_NOCACHE /* __APPLE__ */ 4428 rc = fcntl(newfd, F_NOCACHE, 1); 4429 if (rc) { 4430 rc = ErrCode(); 4431 goto leave; 4432 } 4433 #endif 4434 4435 rc = mdb_env_copyfd(env, newfd); 4436 4437 leave: 4438 if (!(env->me_flags & MDB_NOSUBDIR)) 4439 free(lpath); 4440 if (newfd != INVALID_HANDLE_VALUE) 4441 if (close(newfd) < 0 && rc == MDB_SUCCESS) 4442 rc = ErrCode(); 4443 4444 return rc; 4445 } 4446 4447 void 4448 mdb_env_close(MDB_env *env) 4449 { 4450 MDB_page *dp; 4451 4452 if (env == NULL) 4453 return; 4454 4455 VGMEMP_DESTROY(env); 4456 while ((dp = env->me_dpages) != NULL) { 4457 VGMEMP_DEFINED(&dp->mp_next, sizeof(dp->mp_next)); 4458 env->me_dpages = dp->mp_next; 4459 free(dp); 4460 } 4461 4462 mdb_env_close0(env, 0); 4463 free(env); 4464 } 4465 4466 /** Compare two items pointing at aligned size_t's */ 4467 static int 4468 mdb_cmp_long(const MDB_val *a, const MDB_val *b) 4469 { 4470 return (*(size_t *)a->mv_data < *(size_t *)b->mv_data) ? -1 : 4471 *(size_t *)a->mv_data > *(size_t *)b->mv_data; 4472 } 4473 4474 /** Compare two items pointing at aligned unsigned int's */ 4475 static int 4476 mdb_cmp_int(const MDB_val *a, const MDB_val *b) 4477 { 4478 return (*(unsigned int *)a->mv_data < *(unsigned int *)b->mv_data) ? 
-1 : 4479 *(unsigned int *)a->mv_data > *(unsigned int *)b->mv_data; 4480 } 4481 4482 /** Compare two items pointing at unsigned ints of unknown alignment. 4483 * Nodes and keys are guaranteed to be 2-byte aligned. 4484 */ 4485 static int 4486 mdb_cmp_cint(const MDB_val *a, const MDB_val *b) 4487 { 4488 #if BYTE_ORDER == LITTLE_ENDIAN 4489 unsigned short *u, *c; 4490 int x; 4491 4492 u = (unsigned short *) ((char *) a->mv_data + a->mv_size); 4493 c = (unsigned short *) ((char *) b->mv_data + a->mv_size); 4494 do { 4495 x = *--u - *--c; 4496 } while(!x && u > (unsigned short *)a->mv_data); 4497 return x; 4498 #else 4499 return memcmp(a->mv_data, b->mv_data, a->mv_size); 4500 #endif 4501 } 4502 4503 /** Compare two items lexically */ 4504 static int 4505 mdb_cmp_memn(const MDB_val *a, const MDB_val *b) 4506 { 4507 int diff; 4508 ssize_t len_diff; 4509 unsigned int len; 4510 4511 len = a->mv_size; 4512 len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size; 4513 if (len_diff > 0) { 4514 len = b->mv_size; 4515 len_diff = 1; 4516 } 4517 4518 diff = memcmp(a->mv_data, b->mv_data, len); 4519 return diff ? diff : len_diff<0 ? -1 : len_diff; 4520 } 4521 4522 /** Compare two items in reverse byte order */ 4523 static int 4524 mdb_cmp_memnr(const MDB_val *a, const MDB_val *b) 4525 { 4526 const unsigned char *p1, *p2, *p1_lim; 4527 ssize_t len_diff; 4528 int diff; 4529 4530 p1_lim = (const unsigned char *)a->mv_data; 4531 p1 = (const unsigned char *)a->mv_data + a->mv_size; 4532 p2 = (const unsigned char *)b->mv_data + b->mv_size; 4533 4534 len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size; 4535 if (len_diff > 0) { 4536 p1_lim += len_diff; 4537 len_diff = 1; 4538 } 4539 4540 while (p1 > p1_lim) { 4541 diff = *--p1 - *--p2; 4542 if (diff) 4543 return diff; 4544 } 4545 return len_diff<0 ? -1 : len_diff; 4546 } 4547 4548 /** Search for key within a page, using binary search. 4549 * Returns the smallest entry larger or equal to the key. 
4550 * If exactp is non-null, stores whether the found entry was an exact match 4551 * in *exactp (1 or 0). 4552 * Updates the cursor index with the index of the found entry. 4553 * If no entry larger or equal to the key is found, returns NULL. 4554 */ 4555 static MDB_node * 4556 mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp) 4557 { 4558 unsigned int i = 0, nkeys; 4559 int low, high; 4560 int rc = 0; 4561 MDB_page *mp = mc->mc_pg[mc->mc_top]; 4562 MDB_node *node = NULL; 4563 MDB_val nodekey; 4564 MDB_cmp_func *cmp; 4565 DKBUF; 4566 4567 nkeys = NUMKEYS(mp); 4568 4569 DPRINTF(("searching %u keys in %s %spage %"Z"u", 4570 nkeys, IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "", 4571 mdb_dbg_pgno(mp))); 4572 4573 low = IS_LEAF(mp) ? 0 : 1; 4574 high = nkeys - 1; 4575 cmp = mc->mc_dbx->md_cmp; 4576 4577 /* Branch pages have no data, so if using integer keys, 4578 * alignment is guaranteed. Use faster mdb_cmp_int. 4579 */ 4580 if (cmp == mdb_cmp_cint && IS_BRANCH(mp)) { 4581 if (NODEPTR(mp, 1)->mn_ksize == sizeof(size_t)) 4582 cmp = mdb_cmp_long; 4583 else 4584 cmp = mdb_cmp_int; 4585 } 4586 4587 if (IS_LEAF2(mp)) { 4588 nodekey.mv_size = mc->mc_db->md_pad; 4589 node = NODEPTR(mp, 0); /* fake */ 4590 while (low <= high) { 4591 i = (low + high) >> 1; 4592 nodekey.mv_data = LEAF2KEY(mp, i, nodekey.mv_size); 4593 rc = cmp(key, &nodekey); 4594 DPRINTF(("found leaf index %u [%s], rc = %i", 4595 i, DKEY(&nodekey), rc)); 4596 if (rc == 0) 4597 break; 4598 if (rc > 0) 4599 low = i + 1; 4600 else 4601 high = i - 1; 4602 } 4603 } else { 4604 while (low <= high) { 4605 i = (low + high) >> 1; 4606 4607 node = NODEPTR(mp, i); 4608 nodekey.mv_size = NODEKSZ(node); 4609 nodekey.mv_data = NODEKEY(node); 4610 4611 rc = cmp(key, &nodekey); 4612 #if MDB_DEBUG 4613 if (IS_LEAF(mp)) 4614 DPRINTF(("found leaf index %u [%s], rc = %i", 4615 i, DKEY(&nodekey), rc)); 4616 else 4617 DPRINTF(("found branch index %u [%s -> %"Z"u], rc = %i", 4618 i, DKEY(&nodekey), 
NODEPGNO(node), rc)); 4619 #endif 4620 if (rc == 0) 4621 break; 4622 if (rc > 0) 4623 low = i + 1; 4624 else 4625 high = i - 1; 4626 } 4627 } 4628 4629 if (rc > 0) { /* Found entry is less than the key. */ 4630 i++; /* Skip to get the smallest entry larger than key. */ 4631 if (!IS_LEAF2(mp)) 4632 node = NODEPTR(mp, i); 4633 } 4634 if (exactp) 4635 *exactp = (rc == 0 && nkeys > 0); 4636 /* store the key index */ 4637 mc->mc_ki[mc->mc_top] = i; 4638 if (i >= nkeys) 4639 /* There is no entry larger or equal to the key. */ 4640 return NULL; 4641 4642 /* nodeptr is fake for LEAF2 */ 4643 return node; 4644 } 4645 4646 #if 0 4647 static void 4648 mdb_cursor_adjust(MDB_cursor *mc, func) 4649 { 4650 MDB_cursor *m2; 4651 4652 for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { 4653 if (m2->mc_pg[m2->mc_top] == mc->mc_pg[mc->mc_top]) { 4654 func(mc, m2); 4655 } 4656 } 4657 } 4658 #endif 4659 4660 /** Pop a page off the top of the cursor's stack. */ 4661 static void 4662 mdb_cursor_pop(MDB_cursor *mc) 4663 { 4664 if (mc->mc_snum) { 4665 #if MDB_DEBUG 4666 MDB_page *top = mc->mc_pg[mc->mc_top]; 4667 #endif 4668 mc->mc_snum--; 4669 if (mc->mc_snum) 4670 mc->mc_top--; 4671 4672 DPRINTF(("popped page %"Z"u off db %d cursor %p", top->mp_pgno, 4673 DDBI(mc), (void *) mc)); 4674 } 4675 } 4676 4677 /** Push a page onto the top of the cursor's stack. */ 4678 static int 4679 mdb_cursor_push(MDB_cursor *mc, MDB_page *mp) 4680 { 4681 DPRINTF(("pushing page %"Z"u on db %d cursor %p", mp->mp_pgno, 4682 DDBI(mc), (void *) mc)); 4683 4684 if (mc->mc_snum >= CURSOR_STACK) { 4685 mc->mc_txn->mt_flags |= MDB_TXN_ERROR; 4686 return MDB_CURSOR_FULL; 4687 } 4688 4689 mc->mc_top = mc->mc_snum++; 4690 mc->mc_pg[mc->mc_top] = mp; 4691 mc->mc_ki[mc->mc_top] = 0; 4692 4693 return MDB_SUCCESS; 4694 } 4695 4696 /** Find the address of the page corresponding to a given page number. 4697 * @param[in] txn the transaction for this access. 
4698 * @param[in] pgno the page number for the page to retrieve. 4699 * @param[out] ret address of a pointer where the page's address will be stored. 4700 * @param[out] lvl dirty_list inheritance level of found page. 1=current txn, 0=mapped page. 4701 * @return 0 on success, non-zero on failure. 4702 */ 4703 static int 4704 mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **ret, int *lvl) 4705 { 4706 MDB_env *env = txn->mt_env; 4707 MDB_page *p = NULL; 4708 int level; 4709 4710 if (!((txn->mt_flags & MDB_TXN_RDONLY) | (env->me_flags & MDB_WRITEMAP))) { 4711 MDB_txn *tx2 = txn; 4712 level = 1; 4713 do { 4714 MDB_ID2L dl = tx2->mt_u.dirty_list; 4715 unsigned x; 4716 /* Spilled pages were dirtied in this txn and flushed 4717 * because the dirty list got full. Bring this page 4718 * back in from the map (but don't unspill it here, 4719 * leave that unless page_touch happens again). 4720 */ 4721 if (tx2->mt_spill_pgs) { 4722 MDB_ID pn = pgno << 1; 4723 x = mdb_midl_search(tx2->mt_spill_pgs, pn); 4724 if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) { 4725 p = (MDB_page *)(env->me_map + env->me_psize * pgno); 4726 goto done; 4727 } 4728 } 4729 if (dl[0].mid) { 4730 unsigned x = mdb_mid2l_search(dl, pgno); 4731 if (x <= dl[0].mid && dl[x].mid == pgno) { 4732 p = dl[x].mptr; 4733 goto done; 4734 } 4735 } 4736 level++; 4737 } while ((tx2 = tx2->mt_parent) != NULL); 4738 } 4739 4740 if (pgno < txn->mt_next_pgno) { 4741 level = 0; 4742 p = (MDB_page *)(env->me_map + env->me_psize * pgno); 4743 } else { 4744 DPRINTF(("page %"Z"u not found", pgno)); 4745 txn->mt_flags |= MDB_TXN_ERROR; 4746 return MDB_PAGE_NOTFOUND; 4747 } 4748 4749 done: 4750 *ret = p; 4751 if (lvl) 4752 *lvl = level; 4753 return MDB_SUCCESS; 4754 } 4755 4756 /** Finish #mdb_page_search() / #mdb_page_search_lowest(). 4757 * The cursor is at the root page, set up the rest of it. 
4758 */ 4759 static int 4760 mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) 4761 { 4762 MDB_page *mp = mc->mc_pg[mc->mc_top]; 4763 int rc; 4764 DKBUF; 4765 4766 while (IS_BRANCH(mp)) { 4767 MDB_node *node; 4768 indx_t i; 4769 4770 DPRINTF(("branch page %"Z"u has %u keys", mp->mp_pgno, NUMKEYS(mp))); 4771 mdb_cassert(mc, NUMKEYS(mp) > 1); 4772 DPRINTF(("found index 0 to page %"Z"u", NODEPGNO(NODEPTR(mp, 0)))); 4773 4774 if (flags & (MDB_PS_FIRST|MDB_PS_LAST)) { 4775 i = 0; 4776 if (flags & MDB_PS_LAST) 4777 i = NUMKEYS(mp) - 1; 4778 } else { 4779 int exact; 4780 node = mdb_node_search(mc, key, &exact); 4781 if (node == NULL) 4782 i = NUMKEYS(mp) - 1; 4783 else { 4784 i = mc->mc_ki[mc->mc_top]; 4785 if (!exact) { 4786 mdb_cassert(mc, i > 0); 4787 i--; 4788 } 4789 } 4790 DPRINTF(("following index %u for key [%s]", i, DKEY(key))); 4791 } 4792 4793 mdb_cassert(mc, i < NUMKEYS(mp)); 4794 node = NODEPTR(mp, i); 4795 4796 if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mp, NULL)) != 0) 4797 return rc; 4798 4799 mc->mc_ki[mc->mc_top] = i; 4800 if ((rc = mdb_cursor_push(mc, mp))) 4801 return rc; 4802 4803 if (flags & MDB_PS_MODIFY) { 4804 if ((rc = mdb_page_touch(mc)) != 0) 4805 return rc; 4806 mp = mc->mc_pg[mc->mc_top]; 4807 } 4808 } 4809 4810 if (!IS_LEAF(mp)) { 4811 DPRINTF(("internal error, index points to a %02X page!?", 4812 mp->mp_flags)); 4813 mc->mc_txn->mt_flags |= MDB_TXN_ERROR; 4814 return MDB_CORRUPTED; 4815 } 4816 4817 DPRINTF(("found leaf page %"Z"u for key [%s]", mp->mp_pgno, 4818 key ? DKEY(key) : "null")); 4819 mc->mc_flags |= C_INITIALIZED; 4820 mc->mc_flags &= ~C_EOF; 4821 4822 return MDB_SUCCESS; 4823 } 4824 4825 /** Search for the lowest key under the current branch page. 4826 * This just bypasses a NUMKEYS check in the current page 4827 * before calling mdb_page_search_root(), because the callers 4828 * are all in situations where the current page is known to 4829 * be underfilled. 
4830 */ 4831 static int 4832 mdb_page_search_lowest(MDB_cursor *mc) 4833 { 4834 MDB_page *mp = mc->mc_pg[mc->mc_top]; 4835 MDB_node *node = NODEPTR(mp, 0); 4836 int rc; 4837 4838 if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mp, NULL)) != 0) 4839 return rc; 4840 4841 mc->mc_ki[mc->mc_top] = 0; 4842 if ((rc = mdb_cursor_push(mc, mp))) 4843 return rc; 4844 return mdb_page_search_root(mc, NULL, MDB_PS_FIRST); 4845 } 4846 4847 /** Search for the page a given key should be in. 4848 * Push it and its parent pages on the cursor stack. 4849 * @param[in,out] mc the cursor for this operation. 4850 * @param[in] key the key to search for, or NULL for first/last page. 4851 * @param[in] flags If MDB_PS_MODIFY is set, visited pages in the DB 4852 * are touched (updated with new page numbers). 4853 * If MDB_PS_FIRST or MDB_PS_LAST is set, find first or last leaf. 4854 * This is used by #mdb_cursor_first() and #mdb_cursor_last(). 4855 * If MDB_PS_ROOTONLY set, just fetch root node, no further lookups. 4856 * @return 0 on success, non-zero on failure. 4857 */ 4858 static int 4859 mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags) 4860 { 4861 int rc; 4862 pgno_t root; 4863 4864 /* Make sure the txn is still viable, then find the root from 4865 * the txn's db table and set it as the root of the cursor's stack. 
4866 */ 4867 if (F_ISSET(mc->mc_txn->mt_flags, MDB_TXN_ERROR)) { 4868 DPUTS("transaction has failed, must abort"); 4869 return MDB_BAD_TXN; 4870 } else { 4871 /* Make sure we're using an up-to-date root */ 4872 if (*mc->mc_dbflag & DB_STALE) { 4873 MDB_cursor mc2; 4874 mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL); 4875 rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, 0); 4876 if (rc) 4877 return rc; 4878 { 4879 MDB_val data; 4880 int exact = 0; 4881 uint16_t flags; 4882 MDB_node *leaf = mdb_node_search(&mc2, 4883 &mc->mc_dbx->md_name, &exact); 4884 if (!exact) 4885 return MDB_NOTFOUND; 4886 rc = mdb_node_read(mc->mc_txn, leaf, &data); 4887 if (rc) 4888 return rc; 4889 memcpy(&flags, ((char *) data.mv_data + offsetof(MDB_db, md_flags)), 4890 sizeof(uint16_t)); 4891 /* The txn may not know this DBI, or another process may 4892 * have dropped and recreated the DB with other flags. 4893 */ 4894 if ((mc->mc_db->md_flags & PERSISTENT_FLAGS) != flags) 4895 return MDB_INCOMPATIBLE; 4896 memcpy(mc->mc_db, data.mv_data, sizeof(MDB_db)); 4897 } 4898 *mc->mc_dbflag &= ~DB_STALE; 4899 } 4900 root = mc->mc_db->md_root; 4901 4902 if (root == P_INVALID) { /* Tree is empty. 
*/ 4903 DPUTS("tree is empty"); 4904 return MDB_NOTFOUND; 4905 } 4906 } 4907 4908 mdb_cassert(mc, root > 1); 4909 if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root) 4910 if ((rc = mdb_page_get(mc->mc_txn, root, &mc->mc_pg[0], NULL)) != 0) 4911 return rc; 4912 4913 mc->mc_snum = 1; 4914 mc->mc_top = 0; 4915 4916 DPRINTF(("db %d root page %"Z"u has flags 0x%X", 4917 DDBI(mc), root, mc->mc_pg[0]->mp_flags)); 4918 4919 if (flags & MDB_PS_MODIFY) { 4920 if ((rc = mdb_page_touch(mc))) 4921 return rc; 4922 } 4923 4924 if (flags & MDB_PS_ROOTONLY) 4925 return MDB_SUCCESS; 4926 4927 return mdb_page_search_root(mc, key, flags); 4928 } 4929 4930 static int 4931 mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp) 4932 { 4933 MDB_txn *txn = mc->mc_txn; 4934 pgno_t pg = mp->mp_pgno; 4935 unsigned x = 0, ovpages = mp->mp_pages; 4936 MDB_env *env = txn->mt_env; 4937 MDB_IDL sl = txn->mt_spill_pgs; 4938 MDB_ID pn = pg << 1; 4939 int rc; 4940 4941 DPRINTF(("free ov page %"Z"u (%d)", pg, ovpages)); 4942 /* If the page is dirty or on the spill list we just acquired it, 4943 * so we should give it back to our current free list, if any. 4944 * Otherwise put it onto the list of pages we freed in this txn. 4945 * 4946 * Won't create me_pghead: me_pglast must be inited along with it. 4947 * Unsupported in nested txns: They would need to hide the page 4948 * range in ancestor txns' dirty and spilled lists. 
4949 */ 4950 if (env->me_pghead && 4951 !txn->mt_parent && 4952 ((mp->mp_flags & P_DIRTY) || 4953 (sl && (x = mdb_midl_search(sl, pn)) <= sl[0] && sl[x] == pn))) 4954 { 4955 unsigned i, j; 4956 pgno_t *mop; 4957 MDB_ID2 *dl, ix, iy; 4958 rc = mdb_midl_need(&env->me_pghead, ovpages); 4959 if (rc) 4960 return rc; 4961 if (!(mp->mp_flags & P_DIRTY)) { 4962 /* This page is no longer spilled */ 4963 if (x == sl[0]) 4964 sl[0]--; 4965 else 4966 sl[x] |= 1; 4967 goto release; 4968 } 4969 /* Remove from dirty list */ 4970 dl = txn->mt_u.dirty_list; 4971 x = dl[0].mid--; 4972 for (ix = dl[x]; ix.mptr != mp; ix = iy) { 4973 if (x > 1) { 4974 x--; 4975 iy = dl[x]; 4976 dl[x] = ix; 4977 } else { 4978 mdb_cassert(mc, x > 1); 4979 j = ++(dl[0].mid); 4980 dl[j] = ix; /* Unsorted. OK when MDB_TXN_ERROR. */ 4981 txn->mt_flags |= MDB_TXN_ERROR; 4982 return MDB_CORRUPTED; 4983 } 4984 } 4985 if (!(env->me_flags & MDB_WRITEMAP)) 4986 mdb_dpage_free(env, mp); 4987 release: 4988 /* Insert in me_pghead */ 4989 mop = env->me_pghead; 4990 j = mop[0] + ovpages; 4991 for (i = mop[0]; i && mop[i] < pg; i--) 4992 mop[j--] = mop[i]; 4993 while (j>i) 4994 mop[j--] = pg++; 4995 mop[0] += ovpages; 4996 } else { 4997 rc = mdb_midl_append_range(&txn->mt_free_pgs, pg, ovpages); 4998 if (rc) 4999 return rc; 5000 } 5001 mc->mc_db->md_overflow_pages -= ovpages; 5002 return 0; 5003 } 5004 5005 /** Return the data associated with a given node. 5006 * @param[in] txn The transaction for this operation. 5007 * @param[in] leaf The node being read. 5008 * @param[out] data Updated to point to the node's data. 5009 * @return 0 on success, non-zero on failure. 5010 */ 5011 static int 5012 mdb_node_read(MDB_txn *txn, MDB_node *leaf, MDB_val *data) 5013 { 5014 MDB_page *omp; /* overflow page */ 5015 pgno_t pgno; 5016 int rc; 5017 5018 if (!F_ISSET(leaf->mn_flags, F_BIGDATA)) { 5019 data->mv_size = NODEDSZ(leaf); 5020 data->mv_data = NODEDATA(leaf); 5021 return MDB_SUCCESS; 5022 } 5023 5024 /* Read overflow data. 
5025 */ 5026 data->mv_size = NODEDSZ(leaf); 5027 memcpy(&pgno, NODEDATA(leaf), sizeof(pgno)); 5028 if ((rc = mdb_page_get(txn, pgno, &omp, NULL)) != 0) { 5029 DPRINTF(("read overflow page %"Z"u failed", pgno)); 5030 return rc; 5031 } 5032 data->mv_data = METADATA(omp); 5033 5034 return MDB_SUCCESS; 5035 } 5036 5037 int 5038 mdb_get(MDB_txn *txn, MDB_dbi dbi, 5039 MDB_val *key, MDB_val *data) 5040 { 5041 MDB_cursor mc; 5042 MDB_xcursor mx; 5043 int exact = 0; 5044 DKBUF; 5045 5046 if (key == NULL || data == NULL) 5047 return EINVAL; 5048 5049 DPRINTF(("===> get db %u key [%s]", dbi, DKEY(key))); 5050 5051 if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) 5052 return EINVAL; 5053 5054 if (txn->mt_flags & MDB_TXN_ERROR) 5055 return MDB_BAD_TXN; 5056 5057 mdb_cursor_init(&mc, txn, dbi, &mx); 5058 return mdb_cursor_set(&mc, key, data, MDB_SET, &exact); 5059 } 5060 5061 /** Find a sibling for a page. 5062 * Replaces the page at the top of the cursor's stack with the 5063 * specified sibling, if one exists. 5064 * @param[in] mc The cursor for this operation. 5065 * @param[in] move_right Non-zero if the right sibling is requested, 5066 * otherwise the left sibling. 5067 * @return 0 on success, non-zero on failure. 5068 */ 5069 static int 5070 mdb_cursor_sibling(MDB_cursor *mc, int move_right) 5071 { 5072 int rc; 5073 MDB_node *indx; 5074 MDB_page *mp; 5075 5076 if (mc->mc_snum < 2) { 5077 return MDB_NOTFOUND; /* root has no siblings */ 5078 } 5079 5080 mdb_cursor_pop(mc); 5081 DPRINTF(("parent page is page %"Z"u, index %u", 5082 mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top])); 5083 5084 if (move_right ? (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mc->mc_pg[mc->mc_top])) 5085 : (mc->mc_ki[mc->mc_top] == 0)) { 5086 DPRINTF(("no more keys left, moving to %s sibling", 5087 move_right ? 
"right" : "left"));
		if ((rc = mdb_cursor_sibling(mc, move_right)) != MDB_SUCCESS) {
			/* undo cursor_pop before returning */
			mc->mc_top++;
			mc->mc_snum++;
			return rc;
		}
	} else {
		if (move_right)
			mc->mc_ki[mc->mc_top]++;
		else
			mc->mc_ki[mc->mc_top]--;
		DPRINTF(("just moving to %s index key %u",
		    move_right ? "right" : "left", mc->mc_ki[mc->mc_top]));
	}
	mdb_cassert(mc, IS_BRANCH(mc->mc_pg[mc->mc_top]));

	indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
	if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(indx), &mp, NULL)) != 0) {
		/* mc will be inconsistent if caller does mc_snum++ as above */
		mc->mc_flags &= ~(C_INITIALIZED|C_EOF);
		return rc;
	}

	mdb_cursor_push(mc, mp);
	if (!move_right)
		mc->mc_ki[mc->mc_top] = NUMKEYS(mp)-1;

	return MDB_SUCCESS;
}

/** Move the cursor to the next data item.
 * @param[in,out] mc The cursor to advance. It must already be initialized
 *	(asserted below); a cursor flagged #C_EOF is not moved.
 * @param[out] key Receives the key at the new position.
 * @param[out] data If non-NULL, receives the data at the new position; for a
 *	node carrying #F_DUPDATA this is the first duplicate.
 * @param[in] op The caller in this file passes #MDB_NEXT, #MDB_NEXT_DUP or
 *	#MDB_NEXT_NODUP.
 * @return #MDB_SUCCESS, #MDB_NOTFOUND when there is nothing further, or the
 *	error returned by mdb_cursor_sibling() / mdb_node_read() /
 *	the sub-cursor operation.
 */
static int
mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op)
{
	MDB_page	*mp;
	MDB_node	*leaf;
	int rc;

	/* Already flagged as being at the end: nothing to step to */
	if (mc->mc_flags & C_EOF) {
		return MDB_NOTFOUND;
	}

	mdb_cassert(mc, mc->mc_flags & C_INITIALIZED);

	mp = mc->mc_pg[mc->mc_top];

	if (mc->mc_db->md_flags & MDB_DUPSORT) {
		leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
		if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
			/* Current key has duplicates: try stepping the sub-cursor
			 * first. The sub-cursor's "key" is the outer data item.
			 */
			if (op == MDB_NEXT || op == MDB_NEXT_DUP) {
				rc = mdb_cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_NEXT);
				/* Only MDB_NEXT with an exhausted dup list falls
				 * through to the next key; everything else returns
				 * the sub-cursor's result.
				 */
				if (op != MDB_NEXT || rc != MDB_NOTFOUND) {
					if (rc == MDB_SUCCESS)
						MDB_GET_KEY(leaf, key);
					return rc;
				}
			}
		} else {
			/* Single-valued key: invalidate the sub-cursor; there is
			 * no "next duplicate" to return.
			 */
			mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
			if (op == MDB_NEXT_DUP)
				return MDB_NOTFOUND;
		}
	}

	DPRINTF(("cursor_next: top page is %"Z"u in cursor %p",
		mdb_dbg_pgno(mp), (void *) mc));
	/* With C_DEL set the index is not advanced; the entry currently under
	 * the cursor is returned as the "next" one.
	 * NOTE(review): presumably because a preceding delete already moved the
	 * following entry into this slot - confirm against mdb_cursor_del0().
	 */
	if (mc->mc_flags & C_DEL)
		goto skip;

	if (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mp)) {
		DPUTS("=====> move to next sibling page");
		if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS) {
			/* No right sibling (or error): remember we hit the end */
			mc->mc_flags |= C_EOF;
			return rc;
		}
		mp = mc->mc_pg[mc->mc_top];
		DPRINTF(("next page is %"Z"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]));
	} else
		mc->mc_ki[mc->mc_top]++;

skip:
	DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u",
	    mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top]));

	/* LEAF2 pages hold only fixed-size keys (size md_pad); data is not set */
	if (IS_LEAF2(mp)) {
		key->mv_size = mc->mc_db->md_pad;
		key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
		return MDB_SUCCESS;
	}

	mdb_cassert(mc, IS_LEAF(mp));
	leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);

	/* Re-bind the sub-cursor to the duplicates of the new node */
	if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
		mdb_xcursor_init1(mc, leaf);
	}
	if (data) {
		if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS)
			return rc;

		/* For a dup node, replace data with the first duplicate */
		if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
			rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
			if (rc != MDB_SUCCESS)
				return rc;
		}
	}

	MDB_GET_KEY(leaf, key);
	return MDB_SUCCESS;
}

/** Move the cursor to the previous data item.
*/ 5200 static int 5201 mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) 5202 { 5203 MDB_page *mp; 5204 MDB_node *leaf; 5205 int rc; 5206 5207 mdb_cassert(mc, mc->mc_flags & C_INITIALIZED); 5208 5209 mp = mc->mc_pg[mc->mc_top]; 5210 5211 if (mc->mc_db->md_flags & MDB_DUPSORT) { 5212 leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); 5213 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { 5214 if (op == MDB_PREV || op == MDB_PREV_DUP) { 5215 rc = mdb_cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_PREV); 5216 if (op != MDB_PREV || rc != MDB_NOTFOUND) { 5217 if (rc == MDB_SUCCESS) 5218 MDB_GET_KEY(leaf, key); 5219 return rc; 5220 } 5221 } else { 5222 mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); 5223 if (op == MDB_PREV_DUP) 5224 return MDB_NOTFOUND; 5225 } 5226 } 5227 } 5228 5229 DPRINTF(("cursor_prev: top page is %"Z"u in cursor %p", 5230 mdb_dbg_pgno(mp), (void *) mc)); 5231 5232 if (mc->mc_ki[mc->mc_top] == 0) { 5233 DPUTS("=====> move to prev sibling page"); 5234 if ((rc = mdb_cursor_sibling(mc, 0)) != MDB_SUCCESS) { 5235 return rc; 5236 } 5237 mp = mc->mc_pg[mc->mc_top]; 5238 mc->mc_ki[mc->mc_top] = NUMKEYS(mp) - 1; 5239 DPRINTF(("prev page is %"Z"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top])); 5240 } else 5241 mc->mc_ki[mc->mc_top]--; 5242 5243 mc->mc_flags &= ~C_EOF; 5244 5245 DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u", 5246 mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top])); 5247 5248 if (IS_LEAF2(mp)) { 5249 key->mv_size = mc->mc_db->md_pad; 5250 key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); 5251 return MDB_SUCCESS; 5252 } 5253 5254 mdb_cassert(mc, IS_LEAF(mp)); 5255 leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); 5256 5257 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { 5258 mdb_xcursor_init1(mc, leaf); 5259 } 5260 if (data) { 5261 if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) 5262 return rc; 5263 5264 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { 5265 rc = 
mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); 5266 if (rc != MDB_SUCCESS) 5267 return rc; 5268 } 5269 } 5270 5271 MDB_GET_KEY(leaf, key); 5272 return MDB_SUCCESS; 5273 } 5274 5275 /** Set the cursor on a specific data item. */ 5276 static int 5277 mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, 5278 MDB_cursor_op op, int *exactp) 5279 { 5280 int rc; 5281 MDB_page *mp; 5282 MDB_node *leaf = NULL; 5283 DKBUF; 5284 5285 if (key->mv_size == 0) 5286 return MDB_BAD_VALSIZE; 5287 5288 if (mc->mc_xcursor) 5289 mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); 5290 5291 /* See if we're already on the right page */ 5292 if (mc->mc_flags & C_INITIALIZED) { 5293 MDB_val nodekey; 5294 5295 mp = mc->mc_pg[mc->mc_top]; 5296 if (!NUMKEYS(mp)) { 5297 mc->mc_ki[mc->mc_top] = 0; 5298 return MDB_NOTFOUND; 5299 } 5300 if (mp->mp_flags & P_LEAF2) { 5301 nodekey.mv_size = mc->mc_db->md_pad; 5302 nodekey.mv_data = LEAF2KEY(mp, 0, nodekey.mv_size); 5303 } else { 5304 leaf = NODEPTR(mp, 0); 5305 MDB_GET_KEY2(leaf, nodekey); 5306 } 5307 rc = mc->mc_dbx->md_cmp(key, &nodekey); 5308 if (rc == 0) { 5309 /* Probably happens rarely, but first node on the page 5310 * was the one we wanted. 
5311 */ 5312 mc->mc_ki[mc->mc_top] = 0; 5313 if (exactp) 5314 *exactp = 1; 5315 goto set1; 5316 } 5317 if (rc > 0) { 5318 unsigned int i; 5319 unsigned int nkeys = NUMKEYS(mp); 5320 if (nkeys > 1) { 5321 if (mp->mp_flags & P_LEAF2) { 5322 nodekey.mv_data = LEAF2KEY(mp, 5323 nkeys-1, nodekey.mv_size); 5324 } else { 5325 leaf = NODEPTR(mp, nkeys-1); 5326 MDB_GET_KEY2(leaf, nodekey); 5327 } 5328 rc = mc->mc_dbx->md_cmp(key, &nodekey); 5329 if (rc == 0) { 5330 /* last node was the one we wanted */ 5331 mc->mc_ki[mc->mc_top] = nkeys-1; 5332 if (exactp) 5333 *exactp = 1; 5334 goto set1; 5335 } 5336 if (rc < 0) { 5337 if (mc->mc_ki[mc->mc_top] < NUMKEYS(mp)) { 5338 /* This is definitely the right page, skip search_page */ 5339 if (mp->mp_flags & P_LEAF2) { 5340 nodekey.mv_data = LEAF2KEY(mp, 5341 mc->mc_ki[mc->mc_top], nodekey.mv_size); 5342 } else { 5343 leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); 5344 MDB_GET_KEY2(leaf, nodekey); 5345 } 5346 rc = mc->mc_dbx->md_cmp(key, &nodekey); 5347 if (rc == 0) { 5348 /* current node was the one we wanted */ 5349 if (exactp) 5350 *exactp = 1; 5351 goto set1; 5352 } 5353 } 5354 rc = 0; 5355 goto set2; 5356 } 5357 } 5358 /* If any parents have right-sibs, search. 5359 * Otherwise, there's nothing further. 
5360 */ 5361 for (i=0; i<mc->mc_top; i++) 5362 if (mc->mc_ki[i] < 5363 NUMKEYS(mc->mc_pg[i])-1) 5364 break; 5365 if (i == mc->mc_top) { 5366 /* There are no other pages */ 5367 mc->mc_ki[mc->mc_top] = nkeys; 5368 return MDB_NOTFOUND; 5369 } 5370 } 5371 if (!mc->mc_top) { 5372 /* There are no other pages */ 5373 mc->mc_ki[mc->mc_top] = 0; 5374 if (op == MDB_SET_RANGE) { 5375 rc = 0; 5376 goto set1; 5377 } else 5378 return MDB_NOTFOUND; 5379 } 5380 } 5381 5382 rc = mdb_page_search(mc, key, 0); 5383 if (rc != MDB_SUCCESS) 5384 return rc; 5385 5386 mp = mc->mc_pg[mc->mc_top]; 5387 mdb_cassert(mc, IS_LEAF(mp)); 5388 5389 set2: 5390 leaf = mdb_node_search(mc, key, exactp); 5391 if (exactp != NULL && !*exactp) { 5392 /* MDB_SET specified and not an exact match. */ 5393 return MDB_NOTFOUND; 5394 } 5395 5396 if (leaf == NULL) { 5397 DPUTS("===> inexact leaf not found, goto sibling"); 5398 if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS) 5399 return rc; /* no entries matched */ 5400 mp = mc->mc_pg[mc->mc_top]; 5401 mdb_cassert(mc, IS_LEAF(mp)); 5402 leaf = NODEPTR(mp, 0); 5403 } 5404 5405 set1: 5406 mc->mc_flags |= C_INITIALIZED; 5407 mc->mc_flags &= ~C_EOF; 5408 5409 if (IS_LEAF2(mp)) { 5410 key->mv_size = mc->mc_db->md_pad; 5411 key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); 5412 return MDB_SUCCESS; 5413 } 5414 5415 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { 5416 mdb_xcursor_init1(mc, leaf); 5417 } 5418 if (data) { 5419 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { 5420 if (op == MDB_SET || op == MDB_SET_KEY || op == MDB_SET_RANGE) { 5421 rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); 5422 } else { 5423 int ex2, *ex2p; 5424 if (op == MDB_GET_BOTH) { 5425 ex2p = &ex2; 5426 ex2 = 0; 5427 } else { 5428 ex2p = NULL; 5429 } 5430 rc = mdb_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_SET_RANGE, ex2p); 5431 if (rc != MDB_SUCCESS) 5432 return rc; 5433 } 5434 } else if (op == MDB_GET_BOTH || op == MDB_GET_BOTH_RANGE) { 5435 MDB_val d2; 
5436 if ((rc = mdb_node_read(mc->mc_txn, leaf, &d2)) != MDB_SUCCESS) 5437 return rc; 5438 rc = mc->mc_dbx->md_dcmp(data, &d2); 5439 if (rc) { 5440 if (op == MDB_GET_BOTH || rc > 0) 5441 return MDB_NOTFOUND; 5442 rc = 0; 5443 *data = d2; 5444 } 5445 5446 } else { 5447 if (mc->mc_xcursor) 5448 mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); 5449 if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) 5450 return rc; 5451 } 5452 } 5453 5454 /* The key already matches in all other cases */ 5455 if (op == MDB_SET_RANGE || op == MDB_SET_KEY) 5456 MDB_GET_KEY(leaf, key); 5457 DPRINTF(("==> cursor placed on key [%s]", DKEY(key))); 5458 5459 return rc; 5460 } 5461 5462 /** Move the cursor to the first item in the database. */ 5463 static int 5464 mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data) 5465 { 5466 int rc; 5467 MDB_node *leaf; 5468 5469 if (mc->mc_xcursor) 5470 mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); 5471 5472 if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { 5473 rc = mdb_page_search(mc, NULL, MDB_PS_FIRST); 5474 if (rc != MDB_SUCCESS) 5475 return rc; 5476 } 5477 mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); 5478 5479 leaf = NODEPTR(mc->mc_pg[mc->mc_top], 0); 5480 mc->mc_flags |= C_INITIALIZED; 5481 mc->mc_flags &= ~C_EOF; 5482 5483 mc->mc_ki[mc->mc_top] = 0; 5484 5485 if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { 5486 key->mv_size = mc->mc_db->md_pad; 5487 key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], 0, key->mv_size); 5488 return MDB_SUCCESS; 5489 } 5490 5491 if (data) { 5492 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { 5493 mdb_xcursor_init1(mc, leaf); 5494 rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); 5495 if (rc) 5496 return rc; 5497 } else { 5498 if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) 5499 return rc; 5500 } 5501 } 5502 MDB_GET_KEY(leaf, key); 5503 return MDB_SUCCESS; 5504 } 5505 5506 /** Move the cursor to the last item in the database. 
*/
/* mdb_cursor_last() details:
 * mc   - cursor to position.
 * key  - receives the last key.
 * data - if non-NULL, receives the last data item (the last duplicate when
 *        the node carries F_DUPDATA).
 * Returns MDB_SUCCESS or the error from mdb_page_search(), the sub-cursor
 * or mdb_node_read(). On success the cursor is flagged C_EOF.
 */
static int
mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data)
{
	int		 rc;
	MDB_node	*leaf;

	if (mc->mc_xcursor)
		mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);

	/* A cursor already flagged C_EOF is taken to be on the last page, so
	 * the tree descent is skipped.
	 */
	if (!(mc->mc_flags & C_EOF)) {

		/* An initialized cursor whose stack is just the root page is
		 * already on the right page; otherwise descend to the
		 * rightmost leaf.
		 */
		if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) {
			rc = mdb_page_search(mc, NULL, MDB_PS_LAST);
			if (rc != MDB_SUCCESS)
				return rc;
		}
		mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top]));

	}
	mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]) - 1;
	mc->mc_flags |= C_INITIALIZED|C_EOF;
	leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);

	/* LEAF2 pages hold only fixed-size keys; data is not set */
	if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
		key->mv_size = mc->mc_db->md_pad;
		key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], key->mv_size);
		return MDB_SUCCESS;
	}

	if (data) {
		if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
			mdb_xcursor_init1(mc, leaf);
			rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL);
			if (rc)
				return rc;
		} else {
			if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS)
				return rc;
		}
	}

	MDB_GET_KEY(leaf, key);
	return MDB_SUCCESS;
}

/* Retrieve by cursor: dispatch a cursor operation to the positioning helpers.
 * mc   - the cursor; NULL yields EINVAL.
 * key  - key input/output; which of the two depends on op.
 * data - data input/output; required (non-NULL) for the GET_BOTH*, *MULTIPLE
 *        and *_DUP (first/last) operations.
 * op   - the cursor operation; unknown values yield EINVAL.
 * Returns MDB_SUCCESS, MDB_NOTFOUND, EINVAL, MDB_INCOMPATIBLE (operation needs
 * a sub-cursor / MDB_DUPFIXED the database does not have), MDB_BAD_TXN when
 * the transaction carries MDB_TXN_ERROR, or an error from a helper.
 * The C_DEL flag is cleared on every path that reaches the end of the switch.
 */
int
mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data,
    MDB_cursor_op op)
{
	int		 rc;
	int		 exact = 0;
	int		 (*mfunc)(MDB_cursor *mc, MDB_val *key, MDB_val *data);

	if (mc == NULL)
		return EINVAL;

	if (mc->mc_txn->mt_flags & MDB_TXN_ERROR)
		return MDB_BAD_TXN;

	switch (op) {
	case MDB_GET_CURRENT:
		if (!(mc->mc_flags & C_INITIALIZED)) {
			rc = EINVAL;
		} else {
			MDB_page *mp = mc->mc_pg[mc->mc_top];
			int nkeys = NUMKEYS(mp);
			/* Empty page or index past the last node: nothing here */
			if (!nkeys || mc->mc_ki[mc->mc_top] >= nkeys) {
				mc->mc_ki[mc->mc_top] = nkeys;
				rc = MDB_NOTFOUND;
				break;
			}
			rc = MDB_SUCCESS;
			if (IS_LEAF2(mp)) {
				key->mv_size = mc->mc_db->md_pad;
				key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
			} else {
				MDB_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
				MDB_GET_KEY(leaf, key);
				if (data) {
					if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
						/* After a delete the sub-cursor is
						 * re-bound to the node now under the
						 * cursor before it is queried.
						 */
						if (mc->mc_flags & C_DEL)
							mdb_xcursor_init1(mc, leaf);
						rc = mdb_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_GET_CURRENT);
					} else {
						rc = mdb_node_read(mc->mc_txn, leaf, data);
					}
				}
			}
		}
		break;
	case MDB_GET_BOTH:
	case MDB_GET_BOTH_RANGE:
		if (data == NULL) {
			rc = EINVAL;
			break;
		}
		if (mc->mc_xcursor == NULL) {
			rc = MDB_INCOMPATIBLE;
			break;
		}
		/* FALLTHRU */
	case MDB_SET:
	case MDB_SET_KEY:
	case MDB_SET_RANGE:
		if (key == NULL) {
			rc = EINVAL;
		} else {
			/* Only MDB_SET_RANGE accepts an inexact key match */
			rc = mdb_cursor_set(mc, key, data, op,
				op == MDB_SET_RANGE ? NULL : &exact);
		}
		break;
	case MDB_GET_MULTIPLE:
		if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) {
			rc = EINVAL;
			break;
		}
		if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) {
			rc = MDB_INCOMPATIBLE;
			break;
		}
		rc = MDB_SUCCESS;
		/* No usable sub-cursor: succeed without touching data */
		if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) ||
			(mc->mc_xcursor->mx_cursor.mc_flags & C_EOF))
			break;
		/* Shares the page-fetch code inside MDB_NEXT_MULTIPLE below */
		goto fetchm;
	case MDB_NEXT_MULTIPLE:
		if (data == NULL) {
			rc = EINVAL;
			break;
		}
		if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) {
			rc = MDB_INCOMPATIBLE;
			break;
		}
		if (!(mc->mc_flags & C_INITIALIZED))
			rc = mdb_cursor_first(mc, key, data);
		else
			rc = mdb_cursor_next(mc, key, data, MDB_NEXT_DUP);
		if (rc == MDB_SUCCESS) {
			if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) {
				MDB_cursor *mx;
fetchm:
				/* Hand back every fixed-size item on the
				 * sub-cursor's current page in one block and
				 * park the sub-cursor on that page's last item.
				 */
				mx = &mc->mc_xcursor->mx_cursor;
				data->mv_size = NUMKEYS(mx->mc_pg[mx->mc_top]) *
					mx->mc_db->md_pad;
				data->mv_data = METADATA(mx->mc_pg[mx->mc_top]);
				mx->mc_ki[mx->mc_top] = NUMKEYS(mx->mc_pg[mx->mc_top])-1;
			} else {
				rc = MDB_NOTFOUND;
			}
		}
		break;
	case MDB_NEXT:
	case MDB_NEXT_DUP:
	case MDB_NEXT_NODUP:
		/* "Next" on a fresh cursor means "first" */
		if (!(mc->mc_flags & C_INITIALIZED))
			rc = mdb_cursor_first(mc, key, data);
		else
			rc = mdb_cursor_next(mc, key, data, op);
		break;
	case MDB_PREV:
	case MDB_PREV_DUP:
	case MDB_PREV_NODUP:
		if (!(mc->mc_flags & C_INITIALIZED)) {
			/* "Prev" on a fresh cursor: go to the last item, then
			 * step the index one past it so that the
			 * mdb_cursor_prev() call below lands on the last item.
			 */
			rc = mdb_cursor_last(mc, key, data);
			if (rc)
				break;
			mc->mc_flags |= C_INITIALIZED;
			mc->mc_ki[mc->mc_top]++;
		}
		rc = mdb_cursor_prev(mc, key, data, op);
		break;
	case MDB_FIRST:
		rc = mdb_cursor_first(mc, key, data);
		break;
	case MDB_FIRST_DUP:
		mfunc = mdb_cursor_first;
	mmove:
		/* Shared by MDB_FIRST_DUP and MDB_LAST_DUP: move the sub-cursor
		 * within the duplicates of the current key.
		 */
		if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) {
			rc = EINVAL;
			break;
		}
		if (mc->mc_xcursor == NULL) {
			rc = MDB_INCOMPATIBLE;
			break;
		}
		if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) {
			rc = EINVAL;
			break;
		}
		rc = mfunc(&mc->mc_xcursor->mx_cursor, data, NULL);
		break;
	case MDB_LAST:
		rc = mdb_cursor_last(mc, key, data);
		break;
	case MDB_LAST_DUP:
		mfunc = mdb_cursor_last;
		goto mmove;
	default:
		DPRINTF(("unhandled/unimplemented cursor operation %u", op));
		rc = EINVAL;
		break;
	}

	/* Any cursor get consumes the "just deleted" marker */
	if (mc->mc_flags & C_DEL)
		mc->mc_flags ^= C_DEL;

	return rc;
}

/** Touch all the pages in the cursor stack. Set mc_top.
 *	Makes sure all the pages are writable, before attempting a write operation.
 * @param[in] mc The cursor to operate on.
5720 */ 5721 static int 5722 mdb_cursor_touch(MDB_cursor *mc) 5723 { 5724 int rc = MDB_SUCCESS; 5725 5726 if (mc->mc_dbi > MAIN_DBI && !(*mc->mc_dbflag & DB_DIRTY)) { 5727 MDB_cursor mc2; 5728 MDB_xcursor mcx; 5729 mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, &mcx); 5730 rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, MDB_PS_MODIFY); 5731 if (rc) 5732 return rc; 5733 *mc->mc_dbflag |= DB_DIRTY; 5734 } 5735 mc->mc_top = 0; 5736 if (mc->mc_snum) { 5737 do { 5738 rc = mdb_page_touch(mc); 5739 } while (!rc && ++(mc->mc_top) < mc->mc_snum); 5740 mc->mc_top = mc->mc_snum-1; 5741 } 5742 return rc; 5743 } 5744 5745 /** Do not spill pages to disk if txn is getting full, may fail instead */ 5746 #define MDB_NOSPILL 0x8000 5747 5748 int 5749 mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, 5750 unsigned int flags) 5751 { 5752 enum { MDB_NO_ROOT = MDB_LAST_ERRCODE+10 }; /* internal code */ 5753 MDB_env *env; 5754 MDB_node *leaf = NULL; 5755 MDB_page *fp, *mp; 5756 uint16_t fp_flags; 5757 MDB_val xdata, *rdata, dkey, olddata; 5758 MDB_db dummy; 5759 int do_sub = 0, insert; 5760 unsigned int mcount = 0, dcount = 0, nospill; 5761 size_t nsize; 5762 int rc, rc2; 5763 unsigned int nflags; 5764 DKBUF; 5765 5766 if (mc == NULL || key == NULL) 5767 return EINVAL; 5768 5769 env = mc->mc_txn->mt_env; 5770 5771 /* Check this first so counter will always be zero on any 5772 * early failures. 5773 */ 5774 if (flags & MDB_MULTIPLE) { 5775 dcount = data[1].mv_size; 5776 data[1].mv_size = 0; 5777 if (!F_ISSET(mc->mc_db->md_flags, MDB_DUPFIXED)) 5778 return MDB_INCOMPATIBLE; 5779 } 5780 5781 nospill = flags & MDB_NOSPILL; 5782 flags &= ~MDB_NOSPILL; 5783 5784 if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) 5785 return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? 
EACCES : MDB_BAD_TXN; 5786 5787 if (flags != MDB_CURRENT && key->mv_size-1 >= ENV_MAXKEY(env)) 5788 return MDB_BAD_VALSIZE; 5789 5790 #if SIZE_MAX > MAXDATASIZE 5791 if (data->mv_size > ((mc->mc_db->md_flags & MDB_DUPSORT) ? ENV_MAXKEY(env) : MAXDATASIZE)) 5792 return MDB_BAD_VALSIZE; 5793 #else 5794 if ((mc->mc_db->md_flags & MDB_DUPSORT) && data->mv_size > ENV_MAXKEY(env)) 5795 return MDB_BAD_VALSIZE; 5796 #endif 5797 5798 DPRINTF(("==> put db %d key [%s], size %"Z"u, data size %"Z"u", 5799 DDBI(mc), DKEY(key), key ? key->mv_size : 0, data->mv_size)); 5800 5801 dkey.mv_size = 0; 5802 5803 if (flags == MDB_CURRENT) { 5804 if (!(mc->mc_flags & C_INITIALIZED)) 5805 return EINVAL; 5806 rc = MDB_SUCCESS; 5807 } else if (mc->mc_db->md_root == P_INVALID) { 5808 /* new database, cursor has nothing to point to */ 5809 mc->mc_snum = 0; 5810 mc->mc_top = 0; 5811 mc->mc_flags &= ~C_INITIALIZED; 5812 rc = MDB_NO_ROOT; 5813 } else { 5814 int exact = 0; 5815 MDB_val d2; 5816 if (flags & MDB_APPEND) { 5817 MDB_val k2; 5818 rc = mdb_cursor_last(mc, &k2, &d2); 5819 if (rc == 0) { 5820 rc = mc->mc_dbx->md_cmp(key, &k2); 5821 if (rc > 0) { 5822 rc = MDB_NOTFOUND; 5823 mc->mc_ki[mc->mc_top]++; 5824 } else { 5825 /* new key is <= last key */ 5826 rc = MDB_KEYEXIST; 5827 } 5828 } 5829 } else { 5830 rc = mdb_cursor_set(mc, key, &d2, MDB_SET, &exact); 5831 } 5832 if ((flags & MDB_NOOVERWRITE) && rc == 0) { 5833 DPRINTF(("duplicate key [%s]", DKEY(key))); 5834 *data = d2; 5835 return MDB_KEYEXIST; 5836 } 5837 if (rc && rc != MDB_NOTFOUND) 5838 return rc; 5839 } 5840 5841 if (mc->mc_flags & C_DEL) 5842 mc->mc_flags ^= C_DEL; 5843 5844 /* Cursor is positioned, check for room in the dirty list */ 5845 if (!nospill) { 5846 if (flags & MDB_MULTIPLE) { 5847 rdata = &xdata; 5848 xdata.mv_size = data->mv_size * dcount; 5849 } else { 5850 rdata = data; 5851 } 5852 if ((rc2 = mdb_page_spill(mc, key, rdata))) 5853 return rc2; 5854 } 5855 5856 if (rc == MDB_NO_ROOT) { 5857 MDB_page *np; 5858 /* new 
database, write a root leaf page */ 5859 DPUTS("allocating new root leaf page"); 5860 if ((rc2 = mdb_page_new(mc, P_LEAF, 1, &np))) { 5861 return rc2; 5862 } 5863 mdb_cursor_push(mc, np); 5864 mc->mc_db->md_root = np->mp_pgno; 5865 mc->mc_db->md_depth++; 5866 *mc->mc_dbflag |= DB_DIRTY; 5867 if ((mc->mc_db->md_flags & (MDB_DUPSORT|MDB_DUPFIXED)) 5868 == MDB_DUPFIXED) 5869 np->mp_flags |= P_LEAF2; 5870 mc->mc_flags |= C_INITIALIZED; 5871 } else { 5872 /* make sure all cursor pages are writable */ 5873 rc2 = mdb_cursor_touch(mc); 5874 if (rc2) 5875 return rc2; 5876 } 5877 5878 insert = rc; 5879 if (insert) { 5880 /* The key does not exist */ 5881 DPRINTF(("inserting key at index %i", mc->mc_ki[mc->mc_top])); 5882 if ((mc->mc_db->md_flags & MDB_DUPSORT) && 5883 LEAFSIZE(key, data) > env->me_nodemax) 5884 { 5885 /* Too big for a node, insert in sub-DB */ 5886 fp_flags = P_LEAF|P_DIRTY; 5887 fp = env->me_pbuf; 5888 fp->mp_pad = data->mv_size; /* used if MDB_DUPFIXED */ 5889 fp->mp_lower = fp->mp_upper = olddata.mv_size = PAGEHDRSZ; 5890 goto prep_subDB; 5891 } 5892 } else { 5893 /* there's only a key anyway, so this is a no-op */ 5894 if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { 5895 unsigned int ksize = mc->mc_db->md_pad; 5896 if (key->mv_size != ksize) 5897 return MDB_BAD_VALSIZE; 5898 if (flags == MDB_CURRENT) { 5899 char *ptr = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize); 5900 memcpy(ptr, key->mv_data, ksize); 5901 } 5902 return MDB_SUCCESS; 5903 } 5904 5905 more: 5906 leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); 5907 olddata.mv_size = NODEDSZ(leaf); 5908 olddata.mv_data = NODEDATA(leaf); 5909 5910 /* DB has dups? */ 5911 if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) { 5912 /* Prepare (sub-)page/sub-DB to accept the new item, 5913 * if needed. fp: old sub-page or a header faking 5914 * it. mp: new (sub-)page. offset: growth in page 5915 * size. xdata: node data with new page or DB. 
5916 */ 5917 unsigned i, offset = 0; 5918 mp = fp = xdata.mv_data = env->me_pbuf; 5919 mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno; 5920 5921 /* Was a single item before, must convert now */ 5922 if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { 5923 /* Just overwrite the current item */ 5924 if (flags == MDB_CURRENT) 5925 goto current; 5926 5927 #if UINT_MAX < SIZE_MAX 5928 if (mc->mc_dbx->md_dcmp == mdb_cmp_int && olddata.mv_size == sizeof(size_t)) 5929 #ifdef MISALIGNED_OK 5930 mc->mc_dbx->md_dcmp = mdb_cmp_long; 5931 #else 5932 mc->mc_dbx->md_dcmp = mdb_cmp_cint; 5933 #endif 5934 #endif 5935 /* if data matches, skip it */ 5936 if (!mc->mc_dbx->md_dcmp(data, &olddata)) { 5937 if (flags & MDB_NODUPDATA) 5938 rc = MDB_KEYEXIST; 5939 else if (flags & MDB_MULTIPLE) 5940 goto next_mult; 5941 else 5942 rc = MDB_SUCCESS; 5943 return rc; 5944 } 5945 5946 /* Back up original data item */ 5947 dkey.mv_size = olddata.mv_size; 5948 dkey.mv_data = memcpy(fp+1, olddata.mv_data, olddata.mv_size); 5949 5950 /* Make sub-page header for the dup items, with dummy body */ 5951 fp->mp_flags = P_LEAF|P_DIRTY|P_SUBP; 5952 fp->mp_lower = PAGEHDRSZ; 5953 xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size; 5954 if (mc->mc_db->md_flags & MDB_DUPFIXED) { 5955 fp->mp_flags |= P_LEAF2; 5956 fp->mp_pad = data->mv_size; 5957 xdata.mv_size += 2 * data->mv_size; /* leave space for 2 more */ 5958 } else { 5959 xdata.mv_size += 2 * (sizeof(indx_t) + NODESIZE) + 5960 (dkey.mv_size & 1) + (data->mv_size & 1); 5961 } 5962 fp->mp_upper = xdata.mv_size; 5963 olddata.mv_size = fp->mp_upper; /* pretend olddata is fp */ 5964 } else if (leaf->mn_flags & F_SUBDATA) { 5965 /* Data is on sub-DB, just store it */ 5966 flags |= F_DUPDATA|F_SUBDATA; 5967 goto put_sub; 5968 } else { 5969 /* Data is on sub-page */ 5970 fp = olddata.mv_data; 5971 switch (flags) { 5972 default: 5973 if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { 5974 offset = EVEN(NODESIZE + sizeof(indx_t) + 5975 data->mv_size); 5976 break; 5977 } 
5978 offset = fp->mp_pad; 5979 if (SIZELEFT(fp) < offset) { 5980 offset *= 4; /* space for 4 more */ 5981 break; 5982 } 5983 /* FALLTHRU: Big enough MDB_DUPFIXED sub-page */ 5984 case MDB_CURRENT: 5985 fp->mp_flags |= P_DIRTY; 5986 COPY_PGNO(fp->mp_pgno, mp->mp_pgno); 5987 mc->mc_xcursor->mx_cursor.mc_pg[0] = fp; 5988 flags |= F_DUPDATA; 5989 goto put_sub; 5990 } 5991 xdata.mv_size = olddata.mv_size + offset; 5992 } 5993 5994 fp_flags = fp->mp_flags; 5995 if (NODESIZE + NODEKSZ(leaf) + xdata.mv_size > env->me_nodemax) { 5996 /* Too big for a sub-page, convert to sub-DB */ 5997 fp_flags &= ~P_SUBP; 5998 prep_subDB: 5999 if (mc->mc_db->md_flags & MDB_DUPFIXED) { 6000 fp_flags |= P_LEAF2; 6001 dummy.md_pad = fp->mp_pad; 6002 dummy.md_flags = MDB_DUPFIXED; 6003 if (mc->mc_db->md_flags & MDB_INTEGERDUP) 6004 dummy.md_flags |= MDB_INTEGERKEY; 6005 } else { 6006 dummy.md_pad = 0; 6007 dummy.md_flags = 0; 6008 } 6009 dummy.md_depth = 1; 6010 dummy.md_branch_pages = 0; 6011 dummy.md_leaf_pages = 1; 6012 dummy.md_overflow_pages = 0; 6013 dummy.md_entries = NUMKEYS(fp); 6014 xdata.mv_size = sizeof(MDB_db); 6015 xdata.mv_data = &dummy; 6016 if ((rc = mdb_page_alloc(mc, 1, &mp))) 6017 return rc; 6018 offset = env->me_psize - olddata.mv_size; 6019 flags |= F_DUPDATA|F_SUBDATA; 6020 dummy.md_root = mp->mp_pgno; 6021 } 6022 if (mp != fp) { 6023 mp->mp_flags = fp_flags | P_DIRTY; 6024 mp->mp_pad = fp->mp_pad; 6025 mp->mp_lower = fp->mp_lower; 6026 mp->mp_upper = fp->mp_upper + offset; 6027 if (fp_flags & P_LEAF2) { 6028 memcpy(METADATA(mp), METADATA(fp), NUMKEYS(fp) * fp->mp_pad); 6029 } else { 6030 memcpy((char *)mp + mp->mp_upper, (char *)fp + fp->mp_upper, 6031 olddata.mv_size - fp->mp_upper); 6032 for (i=0; i<NUMKEYS(fp); i++) 6033 mp->mp_ptrs[i] = fp->mp_ptrs[i] + offset; 6034 } 6035 } 6036 6037 rdata = &xdata; 6038 flags |= F_DUPDATA; 6039 do_sub = 1; 6040 if (!insert) 6041 mdb_node_del(mc, 0); 6042 goto new_sub; 6043 } 6044 current: 6045 /* overflow page overwrites need 
special handling */ 6046 if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { 6047 MDB_page *omp; 6048 pgno_t pg; 6049 int level, ovpages, dpages = OVPAGES(data->mv_size, env->me_psize); 6050 6051 memcpy(&pg, olddata.mv_data, sizeof(pg)); 6052 if ((rc2 = mdb_page_get(mc->mc_txn, pg, &omp, &level)) != 0) 6053 return rc2; 6054 ovpages = omp->mp_pages; 6055 6056 /* Is the ov page large enough? */ 6057 if (ovpages >= dpages) { 6058 if (!(omp->mp_flags & P_DIRTY) && 6059 (level || (env->me_flags & MDB_WRITEMAP))) 6060 { 6061 rc = mdb_page_unspill(mc->mc_txn, omp, &omp); 6062 if (rc) 6063 return rc; 6064 level = 0; /* dirty in this txn or clean */ 6065 } 6066 /* Is it dirty? */ 6067 if (omp->mp_flags & P_DIRTY) { 6068 /* yes, overwrite it. Note in this case we don't 6069 * bother to try shrinking the page if the new data 6070 * is smaller than the overflow threshold. 6071 */ 6072 if (level > 1) { 6073 /* It is writable only in a parent txn */ 6074 size_t sz = (size_t) env->me_psize * ovpages, off; 6075 MDB_page *np = mdb_page_malloc(mc->mc_txn, ovpages); 6076 MDB_ID2 id2; 6077 if (!np) 6078 return ENOMEM; 6079 id2.mid = pg; 6080 id2.mptr = np; 6081 rc = mdb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2); 6082 mdb_cassert(mc, rc == 0); 6083 if (!(flags & MDB_RESERVE)) { 6084 /* Copy end of page, adjusting alignment so 6085 * compiler may copy words instead of bytes. 
6086 */ 6087 off = (PAGEHDRSZ + data->mv_size) & -sizeof(size_t); 6088 memcpy((size_t *)((char *)np + off), 6089 (size_t *)((char *)omp + off), sz - off); 6090 sz = PAGEHDRSZ; 6091 } 6092 memcpy(np, omp, sz); /* Copy beginning of page */ 6093 omp = np; 6094 } 6095 SETDSZ(leaf, data->mv_size); 6096 if (F_ISSET(flags, MDB_RESERVE)) 6097 data->mv_data = METADATA(omp); 6098 else 6099 memcpy(METADATA(omp), data->mv_data, data->mv_size); 6100 goto done; 6101 } 6102 } 6103 if ((rc2 = mdb_ovpage_free(mc, omp)) != MDB_SUCCESS) 6104 return rc2; 6105 } else if (data->mv_size == olddata.mv_size) { 6106 /* same size, just replace it. Note that we could 6107 * also reuse this node if the new data is smaller, 6108 * but instead we opt to shrink the node in that case. 6109 */ 6110 if (F_ISSET(flags, MDB_RESERVE)) 6111 data->mv_data = olddata.mv_data; 6112 else if (data->mv_size) 6113 memcpy(olddata.mv_data, data->mv_data, data->mv_size); 6114 else 6115 memcpy(NODEKEY(leaf), key->mv_data, key->mv_size); 6116 goto done; 6117 } 6118 mdb_node_del(mc, 0); 6119 mc->mc_db->md_entries--; 6120 } 6121 6122 rdata = data; 6123 6124 new_sub: 6125 nflags = flags & NODE_ADD_FLAGS; 6126 nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(env, key, rdata); 6127 if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) { 6128 if (( flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA ) 6129 nflags &= ~MDB_APPEND; 6130 if (!insert) 6131 nflags |= MDB_SPLIT_REPLACE; 6132 rc = mdb_page_split(mc, key, rdata, P_INVALID, nflags); 6133 } else { 6134 /* There is room already in this leaf page. 
*/ 6135 rc = mdb_node_add(mc, mc->mc_ki[mc->mc_top], key, rdata, 0, nflags); 6136 if (rc == 0 && !do_sub && insert) { 6137 /* Adjust other cursors pointing to mp */ 6138 MDB_cursor *m2, *m3; 6139 MDB_dbi dbi = mc->mc_dbi; 6140 unsigned i = mc->mc_top; 6141 MDB_page *mp = mc->mc_pg[i]; 6142 6143 for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { 6144 if (mc->mc_flags & C_SUB) 6145 m3 = &m2->mc_xcursor->mx_cursor; 6146 else 6147 m3 = m2; 6148 if (m3 == mc || m3->mc_snum < mc->mc_snum) continue; 6149 if (m3->mc_pg[i] == mp && m3->mc_ki[i] >= mc->mc_ki[i]) { 6150 m3->mc_ki[i]++; 6151 } 6152 } 6153 } 6154 } 6155 6156 if (rc != MDB_SUCCESS) 6157 mc->mc_txn->mt_flags |= MDB_TXN_ERROR; 6158 else { 6159 /* Now store the actual data in the child DB. Note that we're 6160 * storing the user data in the keys field, so there are strict 6161 * size limits on dupdata. The actual data fields of the child 6162 * DB are all zero size. 6163 */ 6164 if (do_sub) { 6165 int xflags; 6166 put_sub: 6167 xdata.mv_size = 0; 6168 xdata.mv_data = ""; 6169 leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); 6170 if (flags & MDB_CURRENT) { 6171 xflags = MDB_CURRENT|MDB_NOSPILL; 6172 } else { 6173 mdb_xcursor_init1(mc, leaf); 6174 xflags = (flags & MDB_NODUPDATA) ? 
6175 MDB_NOOVERWRITE|MDB_NOSPILL : MDB_NOSPILL; 6176 } 6177 /* converted, write the original data first */ 6178 if (dkey.mv_size) { 6179 rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, xflags); 6180 if (rc) 6181 return rc; 6182 { 6183 /* Adjust other cursors pointing to mp */ 6184 MDB_cursor *m2; 6185 unsigned i = mc->mc_top; 6186 MDB_page *mp = mc->mc_pg[i]; 6187 6188 for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { 6189 if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; 6190 if (!(m2->mc_flags & C_INITIALIZED)) continue; 6191 if (m2->mc_pg[i] == mp && m2->mc_ki[i] == mc->mc_ki[i]) { 6192 mdb_xcursor_init1(m2, leaf); 6193 } 6194 } 6195 } 6196 /* we've done our job */ 6197 dkey.mv_size = 0; 6198 } 6199 if (flags & MDB_APPENDDUP) 6200 xflags |= MDB_APPEND; 6201 rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags); 6202 if (flags & F_SUBDATA) { 6203 void *db = NODEDATA(leaf); 6204 memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db)); 6205 } 6206 } 6207 /* sub-writes might have failed so check rc again. 6208 * Don't increment count if we just replaced an existing item. 6209 */ 6210 if (!rc && !(flags & MDB_CURRENT)) 6211 mc->mc_db->md_entries++; 6212 if (flags & MDB_MULTIPLE) { 6213 if (!rc) { 6214 next_mult: 6215 mcount++; 6216 /* let caller know how many succeeded, if any */ 6217 data[1].mv_size = mcount; 6218 if (mcount < dcount) { 6219 data[0].mv_data = (char *)data[0].mv_data + data[0].mv_size; 6220 goto more; 6221 } 6222 } 6223 } 6224 } 6225 done: 6226 /* If we succeeded and the key didn't exist before, make sure 6227 * the cursor is marked valid. 6228 */ 6229 if (!rc && insert) 6230 mc->mc_flags |= C_INITIALIZED; 6231 return rc; 6232 } 6233 6234 int 6235 mdb_cursor_del(MDB_cursor *mc, unsigned int flags) 6236 { 6237 MDB_node *leaf; 6238 MDB_page *mp; 6239 int rc; 6240 6241 if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) 6242 return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? 
EACCES : MDB_BAD_TXN; 6243 6244 if (!(mc->mc_flags & C_INITIALIZED)) 6245 return EINVAL; 6246 6247 if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) 6248 return MDB_NOTFOUND; 6249 6250 if (!(flags & MDB_NOSPILL) && (rc = mdb_page_spill(mc, NULL, NULL))) 6251 return rc; 6252 6253 rc = mdb_cursor_touch(mc); 6254 if (rc) 6255 return rc; 6256 6257 mp = mc->mc_pg[mc->mc_top]; 6258 leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); 6259 6260 if (!IS_LEAF2(mp) && F_ISSET(leaf->mn_flags, F_DUPDATA)) { 6261 if (!(flags & MDB_NODUPDATA)) { 6262 if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) { 6263 mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); 6264 } 6265 rc = mdb_cursor_del(&mc->mc_xcursor->mx_cursor, MDB_NOSPILL); 6266 /* If sub-DB still has entries, we're done */ 6267 if (mc->mc_xcursor->mx_db.md_entries) { 6268 if (leaf->mn_flags & F_SUBDATA) { 6269 /* update subDB info */ 6270 void *db = NODEDATA(leaf); 6271 memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db)); 6272 } else { 6273 MDB_cursor *m2; 6274 /* shrink fake page */ 6275 mdb_node_shrink(mp, mc->mc_ki[mc->mc_top]); 6276 leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); 6277 mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); 6278 /* fix other sub-DB cursors pointed at this fake page */ 6279 for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { 6280 if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; 6281 if (m2->mc_pg[mc->mc_top] == mp && 6282 m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top]) 6283 m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); 6284 } 6285 } 6286 mc->mc_db->md_entries--; 6287 mc->mc_flags |= C_DEL; 6288 return rc; 6289 } 6290 /* otherwise fall thru and delete the sub-DB */ 6291 } 6292 6293 if (leaf->mn_flags & F_SUBDATA) { 6294 /* add all the child DB's pages to the free list */ 6295 rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0); 6296 if (rc == MDB_SUCCESS) { 6297 mc->mc_db->md_entries -= 6298 mc->mc_xcursor->mx_db.md_entries; 6299 } 6300 } 6301 } 6302 6303 return mdb_cursor_del0(mc, 
leaf); 6304 } 6305 6306 /** Allocate and initialize new pages for a database. 6307 * @param[in] mc a cursor on the database being added to. 6308 * @param[in] flags flags defining what type of page is being allocated. 6309 * @param[in] num the number of pages to allocate. This is usually 1, 6310 * unless allocating overflow pages for a large record. 6311 * @param[out] mp Address of a page, or NULL on failure. 6312 * @return 0 on success, non-zero on failure. 6313 */ 6314 static int 6315 mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp) 6316 { 6317 MDB_page *np; 6318 int rc; 6319 6320 if ((rc = mdb_page_alloc(mc, num, &np))) 6321 return rc; 6322 DPRINTF(("allocated new mpage %"Z"u, page size %u", 6323 np->mp_pgno, mc->mc_txn->mt_env->me_psize)); 6324 np->mp_flags = flags | P_DIRTY; 6325 np->mp_lower = PAGEHDRSZ; 6326 np->mp_upper = mc->mc_txn->mt_env->me_psize; 6327 6328 if (IS_BRANCH(np)) 6329 mc->mc_db->md_branch_pages++; 6330 else if (IS_LEAF(np)) 6331 mc->mc_db->md_leaf_pages++; 6332 else if (IS_OVERFLOW(np)) { 6333 mc->mc_db->md_overflow_pages += num; 6334 np->mp_pages = num; 6335 } 6336 *mp = np; 6337 6338 return 0; 6339 } 6340 6341 /** Calculate the size of a leaf node. 6342 * The size depends on the environment's page size; if a data item 6343 * is too large it will be put onto an overflow page and the node 6344 * size will only include the key and not the data. Sizes are always 6345 * rounded up to an even number of bytes, to guarantee 2-byte alignment 6346 * of the #MDB_node headers. 6347 * @param[in] env The environment handle. 6348 * @param[in] key The key for the node. 6349 * @param[in] data The data for the node. 6350 * @return The number of bytes needed to store the node. 
6351 */ 6352 static size_t 6353 mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data) 6354 { 6355 size_t sz; 6356 6357 sz = LEAFSIZE(key, data); 6358 if (sz > env->me_nodemax) { 6359 /* put on overflow page */ 6360 sz -= data->mv_size - sizeof(pgno_t); 6361 } 6362 6363 return EVEN(sz + sizeof(indx_t)); 6364 } 6365 6366 /** Calculate the size of a branch node. 6367 * The size should depend on the environment's page size but since 6368 * we currently don't support spilling large keys onto overflow 6369 * pages, it's simply the size of the #MDB_node header plus the 6370 * size of the key. Sizes are always rounded up to an even number 6371 * of bytes, to guarantee 2-byte alignment of the #MDB_node headers. 6372 * @param[in] env The environment handle. 6373 * @param[in] key The key for the node. 6374 * @return The number of bytes needed to store the node. 6375 */ 6376 static size_t 6377 mdb_branch_size(MDB_env *env, MDB_val *key) 6378 { 6379 size_t sz; 6380 6381 sz = INDXSIZE(key); 6382 if (sz > env->me_nodemax) { 6383 /* put on overflow page */ 6384 /* not implemented */ 6385 /* sz -= key->size - sizeof(pgno_t); */ 6386 } 6387 6388 return sz + sizeof(indx_t); 6389 } 6390 6391 /** Add a node to the page pointed to by the cursor. 6392 * @param[in] mc The cursor for this operation. 6393 * @param[in] indx The index on the page where the new node should be added. 6394 * @param[in] key The key for the new node. 6395 * @param[in] data The data for the new node, if any. 6396 * @param[in] pgno The page number, if adding a branch node. 6397 * @param[in] flags Flags for the node. 6398 * @return 0 on success, non-zero on failure. Possible errors are: 6399 * <ul> 6400 * <li>ENOMEM - failed to allocate overflow pages for the node. 6401 * <li>MDB_PAGE_FULL - there is insufficient room in the page. This error 6402 * should never happen since all callers already calculate the 6403 * page's free space before calling this function. 
* </ul>
 */
static int
mdb_node_add(MDB_cursor *mc, indx_t indx,
    MDB_val *key, MDB_val *data, pgno_t pgno, unsigned int flags)
{
	unsigned int	 i;
	size_t		 node_size = NODESIZE;
	ssize_t		 room;
	indx_t		 ofs;
	MDB_node	*node;
	MDB_page	*mp = mc->mc_pg[mc->mc_top];	/* page at the top of the cursor stack */
	MDB_page	*ofp = NULL;		/* overflow page */
	DKBUF;

	mdb_cassert(mc, mp->mp_upper >= mp->mp_lower);

	DPRINTF(("add to %s %spage %"Z"u index %i, data size %"Z"u key size %"Z"u [%s]",
	    IS_LEAF(mp) ? "leaf" : "branch",
		IS_SUBP(mp) ? "sub-" : "",
		mdb_dbg_pgno(mp), indx, data ? data->mv_size : 0,
		key ? key->mv_size : 0, key ? DKEY(key) : "null"));

	if (IS_LEAF2(mp)) {
		/* LEAF2 path: only the key bytes are stored, each md_pad bytes
		 * wide; data, pgno and flags are not used here.
		 */
		/* Move higher keys up one slot. */
		int ksize = mc->mc_db->md_pad, dif;
		char *ptr = LEAF2KEY(mp, indx, ksize);
		dif = NUMKEYS(mp) - indx;
		if (dif > 0)
			memmove(ptr+ksize, ptr, dif*ksize);
		/* insert new key */
		memcpy(ptr, key->mv_data, ksize);

		/* Just using these for counting */
		mp->mp_lower += sizeof(indx_t);
		mp->mp_upper -= ksize - sizeof(indx_t);
		return MDB_SUCCESS;
	}

	/* Free space that remains once the new pointer-array slot is paid for. */
	room = (ssize_t)SIZELEFT(mp) - (ssize_t)sizeof(indx_t);
	if (key != NULL)
		node_size += key->mv_size;
	if (IS_LEAF(mp)) {
		mdb_cassert(mc, data);
		if (F_ISSET(flags, F_BIGDATA)) {
			/* Data already on overflow page. */
			node_size += sizeof(pgno_t);
		} else if (node_size + data->mv_size > mc->mc_txn->mt_env->me_nodemax) {
			int ovpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize);
			int rc;
			/* Put data on overflow page. */
			DPRINTF(("data size is %"Z"u, node would be %"Z"u, put data on overflow page",
			    data->mv_size, node_size+data->mv_size));
			node_size = EVEN(node_size + sizeof(pgno_t));
			/* Check the fit before allocating, so a full page costs no pages. */
			if ((ssize_t)node_size > room)
				goto full;
			if ((rc = mdb_page_new(mc, P_OVERFLOW, ovpages, &ofp)))
				return rc;
			DPRINTF(("allocated overflow page %"Z"u", ofp->mp_pgno));
			flags |= F_BIGDATA;
			/* Size was already rounded and checked above. */
			goto update;
		} else {
			node_size += data->mv_size;
		}
	}
	node_size = EVEN(node_size);
	if ((ssize_t)node_size > room)
		goto full;

update:
	/* Move higher pointers up one slot. */
	for (i = NUMKEYS(mp); i > indx; i--)
		mp->mp_ptrs[i] = mp->mp_ptrs[i - 1];

	/* Adjust free space offsets. */
	ofs = mp->mp_upper - node_size;
	mdb_cassert(mc, ofs >= mp->mp_lower + sizeof(indx_t));
	mp->mp_ptrs[indx] = ofs;
	mp->mp_upper = ofs;
	mp->mp_lower += sizeof(indx_t);

	/* Write the node data. */
	node = NODEPTR(mp, indx);
	node->mn_ksize = (key == NULL) ? 0 : key->mv_size;
	node->mn_flags = flags;
	/* Leaf nodes record the data size, branch nodes the child page number. */
	if (IS_LEAF(mp))
		SETDSZ(node,data->mv_size);
	else
		SETPGNO(node,pgno);

	if (key)
		memcpy(NODEKEY(node), key->mv_data, key->mv_size);

	if (IS_LEAF(mp)) {
		mdb_cassert(mc, key);
		if (ofp == NULL) {
			/* No overflow page was allocated by this call. */
			if (F_ISSET(flags, F_BIGDATA))
				/* data->mv_data supplies the overflow page number. */
				memcpy(node->mn_data + key->mv_size, data->mv_data,
				    sizeof(pgno_t));
			else if (F_ISSET(flags, MDB_RESERVE))
				/* Nothing is copied; the caller gets the in-page address. */
				data->mv_data = node->mn_data + key->mv_size;
			else
				memcpy(node->mn_data + key->mv_size, data->mv_data,
				    data->mv_size);
		} else {
			/* Node holds the new overflow page number; data lives on ofp. */
			memcpy(node->mn_data + key->mv_size, &ofp->mp_pgno,
			    sizeof(pgno_t));
			if (F_ISSET(flags, MDB_RESERVE))
				data->mv_data = METADATA(ofp);
			else
				memcpy(METADATA(ofp), data->mv_data, data->mv_size);
		}
	}

	return MDB_SUCCESS;

full:
	DPRINTF(("not enough room in page %"Z"u, got %u ptrs",
		mdb_dbg_pgno(mp), NUMKEYS(mp)));
	DPRINTF(("upper-lower = %u - %u = %"Z"d", mp->mp_upper,mp->mp_lower,room));
	DPRINTF(("node size = %"Z"u", node_size));
	/* A full page here flags the whole transaction as failed. */
	mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
	return MDB_PAGE_FULL;
}

/** Delete the node the cursor currently points at from its page.
 * The page and the node index are taken from the top of the cursor's
 * page stack (mc_pg[mc_top] / mc_ki[mc_top]).
 * @param[in] mc Cursor positioned on the node to delete.
 * @param[in] ksize The size of a key. Only used if the page is a
 * LEAF2 page, i.e. part of a #MDB_DUPFIXED database.
 */
static void
mdb_node_del(MDB_cursor *mc, int ksize)
{
	MDB_page *mp = mc->mc_pg[mc->mc_top];
	indx_t	indx = mc->mc_ki[mc->mc_top];
	unsigned int	 sz;
	indx_t		 i, j, numkeys, ptr;
	MDB_node	*node;
	char		*base;

	DPRINTF(("delete node %u on %s page %"Z"u", indx,
	    IS_LEAF(mp) ? "leaf" : "branch", mdb_dbg_pgno(mp)));
	numkeys = NUMKEYS(mp);
	mdb_cassert(mc, indx < numkeys);

	if (IS_LEAF2(mp)) {
		/* Fixed-size keys: close the gap by sliding the x later keys down. */
		int x = numkeys - 1 - indx;
		base = LEAF2KEY(mp, indx, ksize);
		if (x)
			memmove(base, base + ksize, x * ksize);
		/* Mirror of the bookkeeping done by the LEAF2 path of mdb_node_add(). */
		mp->mp_lower -= sizeof(indx_t);
		mp->mp_upper += ksize - sizeof(indx_t);
		return;
	}

	/* Recompute the stored (even-rounded) size of the node being removed. */
	node = NODEPTR(mp, indx);
	sz = NODESIZE + node->mn_ksize;
	if (IS_LEAF(mp)) {
		if (F_ISSET(node->mn_flags, F_BIGDATA))
			sz += sizeof(pgno_t);
		else
			sz += NODEDSZ(node);
	}
	sz = EVEN(sz);

	/* Compact the pointer array, dropping slot indx. Nodes stored at
	 * lower offsets than the deleted one are about to be shifted up by
	 * sz bytes, so their offsets are adjusted here.
	 */
	ptr = mp->mp_ptrs[indx];
	for (i = j = 0; i < numkeys; i++) {
		if (i != indx) {
			mp->mp_ptrs[j] = mp->mp_ptrs[i];
			if (mp->mp_ptrs[i] < ptr)
				mp->mp_ptrs[j] += sz;
			j++;
		}
	}

	/* Slide everything between mp_upper and the deleted node up by sz. */
	base = (char *)mp + mp->mp_upper;
	memmove(base + sz, base, ptr - mp->mp_upper);

	mp->mp_lower -= sizeof(indx_t);
	mp->mp_upper += sz;
}

/** Compact the main page after deleting a node on a subpage.
 * @param[in] mp The main page to operate on.
 * @param[in] indx The index of the subpage on the main page.
*/
static void
mdb_node_shrink(MDB_page *mp, indx_t indx)
{
	MDB_node *node;
	MDB_page *sp, *xp;
	char *base;
	int nsize, delta;
	indx_t		 i, numkeys, ptr;

	node = NODEPTR(mp, indx);
	sp = (MDB_page *)NODEDATA(node);	/* subpage stored as the node's data */
	delta = SIZELEFT(sp);			/* unused bytes inside the subpage */
	/* New subpage location: delta bytes further up. xp and sp can
	 * overlap, which is why the copies below use memmove, a descending
	 * loop, and a fixed order for the header fields.
	 */
	xp = (MDB_page *)((char *)sp + delta);

	/* shift subpage upward */
	if (IS_LEAF2(sp)) {
		nsize = NUMKEYS(sp) * sp->mp_pad;
		if (nsize & 1)
			return;		/* do not make the node uneven-sized */
		memmove(METADATA(xp), METADATA(sp), nsize);
	} else {
		int i;	/* signed counter for the descending loop; shadows the outer i */
		numkeys = NUMKEYS(sp);
		/* The header moves up by delta, so offsets measured from it shrink by delta. */
		for (i=numkeys-1; i>=0; i--)
			xp->mp_ptrs[i] = sp->mp_ptrs[i] - delta;
	}
	/* upper == lower: the relocated subpage has no free space left. */
	xp->mp_upper = sp->mp_lower;
	xp->mp_lower = sp->mp_lower;
	xp->mp_flags = sp->mp_flags;
	xp->mp_pad = sp->mp_pad;
	COPY_PGNO(xp->mp_pgno, mp->mp_pgno);

	/* The node's data (the subpage) is now delta bytes smaller. */
	nsize = NODEDSZ(node) - delta;
	SETDSZ(node, nsize);

	/* shift lower nodes upward */
	ptr = mp->mp_ptrs[indx];
	numkeys = NUMKEYS(mp);
	for (i = 0; i < numkeys; i++) {
		/* "<=" includes the shrunk node itself. */
		if (mp->mp_ptrs[i] <= ptr)
			mp->mp_ptrs[i] += delta;
	}

	/* Move the lower nodes plus this node's header and key up by delta. */
	base = (char *)mp + mp->mp_upper;
	memmove(base + delta, base, ptr - mp->mp_upper + NODESIZE + NODEKSZ(node));
	mp->mp_upper += delta;
}

/** Initial setup of a sorted-dups cursor.
 * Sorted duplicates are implemented as a sub-database for the given key.
 * The duplicate data items are actually keys of the sub-database.
 * Operations on the duplicate data items are performed using a sub-cursor
 * initialized when the sub-database is first accessed. This function does
 * the preliminary setup of the sub-cursor, filling in the fields that
 * depend only on the parent DB.
 * @param[in] mc The main cursor whose sorted-dups cursor is to be initialized.
6647 */ 6648 static void 6649 mdb_xcursor_init0(MDB_cursor *mc) 6650 { 6651 MDB_xcursor *mx = mc->mc_xcursor; 6652 6653 mx->mx_cursor.mc_xcursor = NULL; 6654 mx->mx_cursor.mc_txn = mc->mc_txn; 6655 mx->mx_cursor.mc_db = &mx->mx_db; 6656 mx->mx_cursor.mc_dbx = &mx->mx_dbx; 6657 mx->mx_cursor.mc_dbi = mc->mc_dbi; 6658 mx->mx_cursor.mc_dbflag = &mx->mx_dbflag; 6659 mx->mx_cursor.mc_snum = 0; 6660 mx->mx_cursor.mc_top = 0; 6661 mx->mx_cursor.mc_flags = C_SUB; 6662 mx->mx_dbx.md_name.mv_size = 0; 6663 mx->mx_dbx.md_name.mv_data = NULL; 6664 mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp; 6665 mx->mx_dbx.md_dcmp = NULL; 6666 mx->mx_dbx.md_rel = mc->mc_dbx->md_rel; 6667 } 6668 6669 /** Final setup of a sorted-dups cursor. 6670 * Sets up the fields that depend on the data from the main cursor. 6671 * @param[in] mc The main cursor whose sorted-dups cursor is to be initialized. 6672 * @param[in] node The data containing the #MDB_db record for the 6673 * sorted-dup database. 6674 */ 6675 static void 6676 mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node) 6677 { 6678 MDB_xcursor *mx = mc->mc_xcursor; 6679 6680 if (node->mn_flags & F_SUBDATA) { 6681 memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDB_db)); 6682 mx->mx_cursor.mc_pg[0] = 0; 6683 mx->mx_cursor.mc_snum = 0; 6684 mx->mx_cursor.mc_top = 0; 6685 mx->mx_cursor.mc_flags = C_SUB; 6686 } else { 6687 MDB_page *fp = NODEDATA(node); 6688 mx->mx_db.md_pad = mc->mc_pg[mc->mc_top]->mp_pad; 6689 mx->mx_db.md_flags = 0; 6690 mx->mx_db.md_depth = 1; 6691 mx->mx_db.md_branch_pages = 0; 6692 mx->mx_db.md_leaf_pages = 1; 6693 mx->mx_db.md_overflow_pages = 0; 6694 mx->mx_db.md_entries = NUMKEYS(fp); 6695 COPY_PGNO(mx->mx_db.md_root, fp->mp_pgno); 6696 mx->mx_cursor.mc_snum = 1; 6697 mx->mx_cursor.mc_top = 0; 6698 mx->mx_cursor.mc_flags = C_INITIALIZED|C_SUB; 6699 mx->mx_cursor.mc_pg[0] = fp; 6700 mx->mx_cursor.mc_ki[0] = 0; 6701 if (mc->mc_db->md_flags & MDB_DUPFIXED) { 6702 mx->mx_db.md_flags = MDB_DUPFIXED; 6703 mx->mx_db.md_pad = fp->mp_pad; 
6704 if (mc->mc_db->md_flags & MDB_INTEGERDUP) 6705 mx->mx_db.md_flags |= MDB_INTEGERKEY; 6706 } 6707 } 6708 DPRINTF(("Sub-db -%u root page %"Z"u", mx->mx_cursor.mc_dbi, 6709 mx->mx_db.md_root)); 6710 mx->mx_dbflag = DB_VALID|DB_DIRTY; /* DB_DIRTY guides mdb_cursor_touch */ 6711 #if UINT_MAX < SIZE_MAX 6712 if (mx->mx_dbx.md_cmp == mdb_cmp_int && mx->mx_db.md_pad == sizeof(size_t)) 6713 #ifdef MISALIGNED_OK 6714 mx->mx_dbx.md_cmp = mdb_cmp_long; 6715 #else 6716 mx->mx_dbx.md_cmp = mdb_cmp_cint; 6717 #endif 6718 #endif 6719 } 6720 6721 /** Initialize a cursor for a given transaction and database. */ 6722 static void 6723 mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx) 6724 { 6725 mc->mc_next = NULL; 6726 mc->mc_backup = NULL; 6727 mc->mc_dbi = dbi; 6728 mc->mc_txn = txn; 6729 mc->mc_db = &txn->mt_dbs[dbi]; 6730 mc->mc_dbx = &txn->mt_dbxs[dbi]; 6731 mc->mc_dbflag = &txn->mt_dbflags[dbi]; 6732 mc->mc_snum = 0; 6733 mc->mc_top = 0; 6734 mc->mc_pg[0] = 0; 6735 mc->mc_flags = 0; 6736 if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) { 6737 mdb_tassert(txn, mx != NULL); 6738 mc->mc_xcursor = mx; 6739 mdb_xcursor_init0(mc); 6740 } else { 6741 mc->mc_xcursor = NULL; 6742 } 6743 if (*mc->mc_dbflag & DB_STALE) { 6744 mdb_page_search(mc, NULL, MDB_PS_ROOTONLY); 6745 } 6746 } 6747 6748 int 6749 mdb_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **ret) 6750 { 6751 MDB_cursor *mc; 6752 size_t size = sizeof(MDB_cursor); 6753 6754 if (txn == NULL || ret == NULL || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) 6755 return EINVAL; 6756 6757 if (txn->mt_flags & MDB_TXN_ERROR) 6758 return MDB_BAD_TXN; 6759 6760 /* Allow read access to the freelist */ 6761 if (!dbi && !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) 6762 return EINVAL; 6763 6764 if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) 6765 size += sizeof(MDB_xcursor); 6766 6767 if ((mc = malloc(size)) != NULL) { 6768 mdb_cursor_init(mc, txn, dbi, (MDB_xcursor *)(mc + 1)); 6769 if (txn->mt_cursors) { 
6770 mc->mc_next = txn->mt_cursors[dbi]; 6771 txn->mt_cursors[dbi] = mc; 6772 mc->mc_flags |= C_UNTRACK; 6773 } 6774 } else { 6775 return ENOMEM; 6776 } 6777 6778 *ret = mc; 6779 6780 return MDB_SUCCESS; 6781 } 6782 6783 int 6784 mdb_cursor_renew(MDB_txn *txn, MDB_cursor *mc) 6785 { 6786 if (txn == NULL || mc == NULL || mc->mc_dbi >= txn->mt_numdbs) 6787 return EINVAL; 6788 6789 if ((mc->mc_flags & C_UNTRACK) || txn->mt_cursors) 6790 return EINVAL; 6791 6792 mdb_cursor_init(mc, txn, mc->mc_dbi, mc->mc_xcursor); 6793 return MDB_SUCCESS; 6794 } 6795 6796 /* Return the count of duplicate data items for the current key */ 6797 int 6798 mdb_cursor_count(MDB_cursor *mc, size_t *countp) 6799 { 6800 MDB_node *leaf; 6801 6802 if (mc == NULL || countp == NULL) 6803 return EINVAL; 6804 6805 if (mc->mc_xcursor == NULL) 6806 return MDB_INCOMPATIBLE; 6807 6808 leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); 6809 if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { 6810 *countp = 1; 6811 } else { 6812 if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) 6813 return EINVAL; 6814 6815 *countp = mc->mc_xcursor->mx_db.md_entries; 6816 } 6817 return MDB_SUCCESS; 6818 } 6819 6820 void 6821 mdb_cursor_close(MDB_cursor *mc) 6822 { 6823 if (mc && !mc->mc_backup) { 6824 /* remove from txn, if tracked */ 6825 if ((mc->mc_flags & C_UNTRACK) && mc->mc_txn->mt_cursors) { 6826 MDB_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; 6827 while (*prev && *prev != mc) prev = &(*prev)->mc_next; 6828 if (*prev == mc) 6829 *prev = mc->mc_next; 6830 } 6831 free(mc); 6832 } 6833 } 6834 6835 MDB_txn * 6836 mdb_cursor_txn(MDB_cursor *mc) 6837 { 6838 if (!mc) return NULL; 6839 return mc->mc_txn; 6840 } 6841 6842 MDB_dbi 6843 mdb_cursor_dbi(MDB_cursor *mc) 6844 { 6845 return mc->mc_dbi; 6846 } 6847 6848 /** Replace the key for a branch node with a new key. 6849 * @param[in] mc Cursor pointing to the node to operate on. 6850 * @param[in] key The new key to use. 
* @return 0 on success, non-zero on failure.
 */
static int
mdb_update_key(MDB_cursor *mc, MDB_val *key)
{
	MDB_page		*mp;
	MDB_node		*node;
	char			*base;
	size_t			 len;
	int				 delta, ksize, oksize;
	indx_t			 ptr, i, numkeys, indx;
	DKBUF;

	/* The node to update is the one at the top of the cursor stack. */
	indx = mc->mc_ki[mc->mc_top];
	mp = mc->mc_pg[mc->mc_top];
	node = NODEPTR(mp, indx);
	ptr = mp->mp_ptrs[indx];
#if MDB_DEBUG
	{
		MDB_val	k2;
		char kbuf2[DKBUF_MAXKEYSIZE*2+1];
		k2.mv_data = NODEKEY(node);
		k2.mv_size = node->mn_ksize;
		DPRINTF(("update key %u (ofs %u) [%s] to [%s] on page %"Z"u",
			indx, ptr,
			mdb_dkey(&k2, kbuf2),
			DKEY(key),
			mp->mp_pgno));
	}
#endif

	/* Sizes must be 2-byte aligned. */
	ksize = EVEN(key->mv_size);
	oksize = EVEN(node->mn_ksize);
	delta = ksize - oksize;	/* >0: node grows, <0: node shrinks */

	/* Shift node contents if EVEN(key length) changed. */
	if (delta) {
		if (delta > 0 && SIZELEFT(mp) < delta) {
			pgno_t pgno;
			/* not enough space left, do a delete and split */
			DPRINTF(("Not enough room, delta = %d, splitting...", delta));
			/* Save the child page number before the node is removed. */
			pgno = NODEPGNO(node);
			mdb_node_del(mc, 0);
			return mdb_page_split(mc, key, NULL, pgno, MDB_SPLIT_REPLACE);
		}

		/* This node and every node stored below it move by delta bytes;
		 * "<=" includes the node being updated.
		 */
		numkeys = NUMKEYS(mp);
		for (i = 0; i < numkeys; i++) {
			if (mp->mp_ptrs[i] <= ptr)
				mp->mp_ptrs[i] -= delta;
		}

		/* Move the lower nodes plus this node's header; the key bytes
		 * themselves are rewritten below.
		 */
		base = (char *)mp + mp->mp_upper;
		len = ptr - mp->mp_upper + NODESIZE;
		memmove(base - delta, base, len);
		mp->mp_upper -= delta;

		/* The node moved: re-read its address from the updated pointer array. */
		node = NODEPTR(mp, indx);
	}

	/* But even if no shift was needed, update ksize */
	if (node->mn_ksize != key->mv_size)
		node->mn_ksize = key->mv_size;

	/* A zero-length key copies nothing; key->mv_data is not read then. */
	if (key->mv_size)
		memcpy(NODEKEY(node), key->mv_data, key->mv_size);

	return MDB_SUCCESS;
}

/* Forward declaration; mdb_cursor_copy() is defined later in this file. */
static void
mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst);

/** Move a node from csrc to
cdst.
 * Both pages are touched first. The node at csrc's current index is
 * added at cdst's current index and then deleted from the source page.
 * Tracked cursors that pointed at the moved node are redirected to its
 * new location, and the separator keys for both pages are refreshed
 * when the move changed a page's first entry.
 * @param[in] csrc Cursor positioned on the node to move.
 * @param[in] cdst Cursor positioned on the slot that receives the node.
 * @return 0 on success, non-zero on failure.
 */
static int
mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst)
{
	MDB_node		*srcnode;
	MDB_val		 key, data;
	pgno_t	srcpg;
	MDB_cursor mn;
	int			 rc;
	unsigned short flags;

	DKBUF;

	/* Mark src and dst as dirty. */
	if ((rc = mdb_page_touch(csrc)) ||
	    (rc = mdb_page_touch(cdst)))
		return rc;

	if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
		/* LEAF2: the entry is just a key of md_pad bytes; no data, pgno or flags. */
		key.mv_size = csrc->mc_db->md_pad;
		key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top], key.mv_size);
		data.mv_size = 0;
		data.mv_data = NULL;
		srcpg = 0;
		flags = 0;
	} else {
		srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top]);
		/* Node addresses must be 2-byte aligned. */
		mdb_cassert(csrc, !((size_t)srcnode & 1));
		srcpg = NODEPGNO(srcnode);
		flags = srcnode->mn_flags;
		if (csrc->mc_ki[csrc->mc_top] == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
			/* The key for slot 0 of a branch page is not taken from the
			 * node itself; it is looked up below the source page instead.
			 */
			unsigned int snum = csrc->mc_snum;
			MDB_node *s2;
			/* must find the lowest key below src */
			mdb_page_search_lowest(csrc);
			if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
				key.mv_size = csrc->mc_db->md_pad;
				key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size);
			} else {
				s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0);
				key.mv_size = NODEKSZ(s2);
				key.mv_data = NODEKEY(s2);
			}
			/* Put the cursor stack depth back to what it was before the search. */
			csrc->mc_snum = snum--;
			csrc->mc_top = snum;
		} else {
			key.mv_size = NODEKSZ(srcnode);
			key.mv_data = NODEKEY(srcnode);
		}
		data.mv_size = NODEDSZ(srcnode);
		data.mv_data = NODEDATA(srcnode);
	}
	if (IS_BRANCH(cdst->mc_pg[cdst->mc_top]) && cdst->mc_ki[cdst->mc_top] == 0) {
		/* A node is about to be inserted at slot 0 of a branch page:
		 * first give the node currently in slot 0 an explicit key (bkey).
		 */
		unsigned int snum = cdst->mc_snum;
		MDB_node *s2;
		MDB_val bkey;
		/* must find the lowest key below dst */
		mdb_page_search_lowest(cdst);
		if (IS_LEAF2(cdst->mc_pg[cdst->mc_top])) {
			bkey.mv_size = cdst->mc_db->md_pad;
			bkey.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, bkey.mv_size);
		} else {
			s2 = NODEPTR(cdst->mc_pg[cdst->mc_top], 0);
			bkey.mv_size = NODEKSZ(s2);
			bkey.mv_data = NODEKEY(s2);
		}
		/* Put the cursor stack depth back to what it was before the search. */
		cdst->mc_snum = snum--;
		cdst->mc_top = snum;
		/* NOTE(review): mdb_cursor_copy() does not set mn.mc_xcursor, and
		 * unlike mdb_rebalance() nothing NULLs it here; confirm that no
		 * path under mdb_update_key() reads it.
		 */
		mdb_cursor_copy(cdst, &mn);
		mn.mc_ki[snum] = 0;
		rc = mdb_update_key(&mn, &bkey);
		if (rc)
			return rc;
	}

	DPRINTF(("moving %s node %u [%s] on page %"Z"u to node %u on page %"Z"u",
	    IS_LEAF(csrc->mc_pg[csrc->mc_top]) ? "leaf" : "branch",
	    csrc->mc_ki[csrc->mc_top],
		DKEY(&key),
	    csrc->mc_pg[csrc->mc_top]->mp_pgno,
	    cdst->mc_ki[cdst->mc_top], cdst->mc_pg[cdst->mc_top]->mp_pgno));

	/* Add the node to the destination page.
	 */
	rc = mdb_node_add(cdst, cdst->mc_ki[cdst->mc_top], &key, &data, srcpg, flags);
	if (rc != MDB_SUCCESS)
		return rc;

	/* Delete the node from the source page.
	 */
	mdb_node_del(csrc, key.mv_size);

	{
		/* Adjust other cursors pointing to mp */
		MDB_cursor *m2, *m3;
		MDB_dbi dbi = csrc->mc_dbi;
		MDB_page *mp = csrc->mc_pg[csrc->mc_top];

		for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
			/* For a sub-cursor, compare against the tracked cursors' sub-cursors. */
			if (csrc->mc_flags & C_SUB)
				m3 = &m2->mc_xcursor->mx_cursor;
			else
				m3 = m2;
			if (m3 == csrc) continue;
			/* Cursors that sat on the moved node follow it to the destination. */
			if (m3->mc_pg[csrc->mc_top] == mp && m3->mc_ki[csrc->mc_top] ==
				csrc->mc_ki[csrc->mc_top]) {
				m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top];
				m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top];
			}
		}
	}

	/* Update the parent separators.
	 */
	if (csrc->mc_ki[csrc->mc_top] == 0) {
		/* The source page lost its first entry. */
		if (csrc->mc_ki[csrc->mc_top-1] != 0) {
			if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
				key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size);
			} else {
				srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], 0);
				key.mv_size = NODEKSZ(srcnode);
				key.mv_data = NODEKEY(srcnode);
			}
			DPRINTF(("update separator for source page %"Z"u to [%s]",
				csrc->mc_pg[csrc->mc_top]->mp_pgno, DKEY(&key)));
			/* Work on a copy moved one level up, i.e. on the parent page. */
			mdb_cursor_copy(csrc, &mn);
			mn.mc_snum--;
			mn.mc_top--;
			if ((rc = mdb_update_key(&mn, &key)) != MDB_SUCCESS)
				return rc;
		}
		if (IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
			/* Set the key of the new slot-0 node to zero length, with the
			 * cursor index temporarily forced to 0.
			 */
			MDB_val	 nullkey;
			indx_t	ix = csrc->mc_ki[csrc->mc_top];
			nullkey.mv_size = 0;
			csrc->mc_ki[csrc->mc_top] = 0;
			rc = mdb_update_key(csrc, &nullkey);
			csrc->mc_ki[csrc->mc_top] = ix;
			mdb_cassert(csrc, rc == MDB_SUCCESS);
		}
	}

	if (cdst->mc_ki[cdst->mc_top] == 0) {
		/* The destination page received a new first entry. */
		if (cdst->mc_ki[cdst->mc_top-1] != 0) {
			/* NOTE(review): the LEAF2 test looks at the source page while the
			 * key is read from the destination page; presumably both pages
			 * are always of the same kind. Confirm.
			 */
			if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
				key.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, key.mv_size);
			} else {
				srcnode = NODEPTR(cdst->mc_pg[cdst->mc_top], 0);
				key.mv_size = NODEKSZ(srcnode);
				key.mv_data = NODEKEY(srcnode);
			}
			DPRINTF(("update separator for destination page %"Z"u to [%s]",
				cdst->mc_pg[cdst->mc_top]->mp_pgno, DKEY(&key)));
			/* Work on a copy moved one level up, i.e. on the parent page. */
			mdb_cursor_copy(cdst, &mn);
			mn.mc_snum--;
			mn.mc_top--;
			if ((rc = mdb_update_key(&mn, &key)) != MDB_SUCCESS)
				return rc;
		}
		if (IS_BRANCH(cdst->mc_pg[cdst->mc_top])) {
			MDB_val	 nullkey;
			indx_t	ix = cdst->mc_ki[cdst->mc_top];
			nullkey.mv_size = 0;
			cdst->mc_ki[cdst->mc_top] = 0;
			rc = mdb_update_key(cdst, &nullkey);
			cdst->mc_ki[cdst->mc_top] = ix;
			/* The assert is reported through csrc although cdst was updated. */
			mdb_cassert(csrc, rc == MDB_SUCCESS);
		}
	}

	return MDB_SUCCESS;
}

/** Merge one page into another.
* The nodes from the page pointed to by \b csrc will
 * be copied to the page pointed to by \b cdst and then
 * the \b csrc page will be freed.
 * Afterwards \b csrc is popped one level and mdb_rebalance() is run on it.
 * @param[in] csrc Cursor pointing to the source page.
 * @param[in] cdst Cursor pointing to the destination page.
 * @return 0 on success, non-zero on failure.
 */
static int
mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst)
{
	int			 rc;
	indx_t			 i, j;
	MDB_node		*srcnode;
	MDB_val		 key, data;
	unsigned	nkeys;

	DPRINTF(("merging page %"Z"u into %"Z"u", csrc->mc_pg[csrc->mc_top]->mp_pgno,
		cdst->mc_pg[cdst->mc_top]->mp_pgno));

	mdb_cassert(csrc, csrc->mc_snum > 1);	/* can't merge root page */
	mdb_cassert(csrc, cdst->mc_snum > 1);

	/* Mark dst as dirty. */
	if ((rc = mdb_page_touch(cdst)))
		return rc;

	/* Move all nodes from src to dst.
	 */
	/* nkeys remembers how many entries dst held before the merge; the
	 * source entries are appended starting at that index.
	 */
	j = nkeys = NUMKEYS(cdst->mc_pg[cdst->mc_top]);
	if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
		/* Fixed-size keys are walked in place, md_pad bytes at a time. */
		key.mv_size = csrc->mc_db->md_pad;
		key.mv_data = METADATA(csrc->mc_pg[csrc->mc_top]);
		for (i = 0; i < NUMKEYS(csrc->mc_pg[csrc->mc_top]); i++, j++) {
			rc = mdb_node_add(cdst, j, &key, NULL, 0, 0);
			if (rc != MDB_SUCCESS)
				return rc;
			key.mv_data = (char *)key.mv_data + key.mv_size;
		}
	} else {
		for (i = 0; i < NUMKEYS(csrc->mc_pg[csrc->mc_top]); i++, j++) {
			srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], i);
			if (i == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
				/* The key for slot 0 of a branch page is looked up below
				 * the source page rather than read from the node.
				 */
				unsigned int snum = csrc->mc_snum;
				MDB_node *s2;
				/* must find the lowest key below src */
				mdb_page_search_lowest(csrc);
				if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
					key.mv_size = csrc->mc_db->md_pad;
					key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size);
				} else {
					s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0);
					key.mv_size = NODEKSZ(s2);
					key.mv_data = NODEKEY(s2);
				}
				/* Put the cursor stack depth back to what it was before the search. */
				csrc->mc_snum = snum--;
				csrc->mc_top = snum;
			} else {
				key.mv_size = srcnode->mn_ksize;
				key.mv_data = NODEKEY(srcnode);
			}

			data.mv_size = NODEDSZ(srcnode);
			data.mv_data = NODEDATA(srcnode);
			rc = mdb_node_add(cdst, j, &key, &data, NODEPGNO(srcnode), srcnode->mn_flags);
			if (rc != MDB_SUCCESS)
				return rc;
		}
	}

	DPRINTF(("dst page %"Z"u now has %u keys (%.1f%% filled)",
	    cdst->mc_pg[cdst->mc_top]->mp_pgno, NUMKEYS(cdst->mc_pg[cdst->mc_top]),
		(float)PAGEFILL(cdst->mc_txn->mt_env, cdst->mc_pg[cdst->mc_top]) / 10));

	/* Unlink the src page from parent and add to free list.
	 */
	/* Temporarily step up one level so the helpers act on the parent page. */
	csrc->mc_top--;
	mdb_node_del(csrc, 0);
	if (csrc->mc_ki[csrc->mc_top] == 0) {
		/* The parent's slot 0 was removed: give the node now in slot 0
		 * a zero-length key.
		 */
		key.mv_size = 0;
		rc = mdb_update_key(csrc, &key);
		if (rc) {
			csrc->mc_top++;
			return rc;
		}
	}
	csrc->mc_top++;

	rc = mdb_midl_append(&csrc->mc_txn->mt_free_pgs,
		csrc->mc_pg[csrc->mc_top]->mp_pgno);
	if (rc)
		return rc;
	if (IS_LEAF(csrc->mc_pg[csrc->mc_top]))
		csrc->mc_db->md_leaf_pages--;
	else
		csrc->mc_db->md_branch_pages--;
	{
		/* Adjust other cursors pointing to mp */
		MDB_cursor *m2, *m3;
		MDB_dbi dbi = csrc->mc_dbi;
		MDB_page *mp = cdst->mc_pg[cdst->mc_top];

		for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
			/* For a sub-cursor, compare against the tracked cursors' sub-cursors. */
			if (csrc->mc_flags & C_SUB)
				m3 = &m2->mc_xcursor->mx_cursor;
			else
				m3 = m2;
			if (m3 == csrc) continue;
			if (m3->mc_snum < csrc->mc_snum) continue;
			/* Cursors on the freed source page now point into the
			 * destination page, after its nkeys original entries.
			 */
			if (m3->mc_pg[csrc->mc_top] == csrc->mc_pg[csrc->mc_top]) {
				m3->mc_pg[csrc->mc_top] = mp;
				m3->mc_ki[csrc->mc_top] += nkeys;
			}
		}
	}
	/* Pop csrc and rebalance from there; the parent just lost an entry. */
	mdb_cursor_pop(csrc);

	return mdb_rebalance(csrc);
}

/** Copy the contents of a cursor.
 * @param[in] csrc The cursor to copy from.
 * @param[out] cdst The cursor to copy to.
*/
static void
mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst)
{
	unsigned int i;

	/* Only the fields assigned below are copied; mc_next, mc_backup,
	 * mc_xcursor and mc_dbflag keep whatever value cdst already had.
	 */
	cdst->mc_txn = csrc->mc_txn;
	cdst->mc_dbi = csrc->mc_dbi;
	cdst->mc_db  = csrc->mc_db;
	cdst->mc_dbx = csrc->mc_dbx;
	cdst->mc_snum = csrc->mc_snum;
	cdst->mc_top = csrc->mc_top;
	cdst->mc_flags = csrc->mc_flags;

	/* Copy the used part of the page stack (mc_snum levels). */
	for (i=0; i<csrc->mc_snum; i++) {
		cdst->mc_pg[i] = csrc->mc_pg[i];
		cdst->mc_ki[i] = csrc->mc_ki[i];
	}
}

/** Rebalance the tree after a delete operation.
 * @param[in] mc Cursor pointing to the page where rebalancing
 * should begin.
 * @return 0 on success, non-zero on failure.
 */
static int
mdb_rebalance(MDB_cursor *mc)
{
	MDB_node	*node;
	int rc;
	unsigned int ptop, minkeys;
	MDB_cursor	mn;

	/* Minimum entries: 1 for a leaf page, 2 for a branch page. */
	minkeys = 1 + (IS_BRANCH(mc->mc_pg[mc->mc_top]));
	DPRINTF(("rebalancing %s page %"Z"u (has %u keys, %.1f%% full)",
	    IS_LEAF(mc->mc_pg[mc->mc_top]) ? "leaf" : "branch",
	    mdb_dbg_pgno(mc->mc_pg[mc->mc_top]), NUMKEYS(mc->mc_pg[mc->mc_top]),
		(float)PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) / 10));

	if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= FILL_THRESHOLD &&
		NUMKEYS(mc->mc_pg[mc->mc_top]) >= minkeys) {
		DPRINTF(("no need to rebalance page %"Z"u, above fill threshold",
		    mdb_dbg_pgno(mc->mc_pg[mc->mc_top])));
		return MDB_SUCCESS;
	}

	/* Fewer than two stack levels: the cursor is on the root page
	 * (or on a subpage), which has no siblings.
	 */
	if (mc->mc_snum < 2) {
		MDB_page *mp = mc->mc_pg[0];
		if (IS_SUBP(mp)) {
			DPUTS("Can't rebalance a subpage, ignoring");
			return MDB_SUCCESS;
		}
		if (NUMKEYS(mp) == 0) {
			DPUTS("tree is completely empty");
			mc->mc_db->md_root = P_INVALID;
			mc->mc_db->md_depth = 0;
			mc->mc_db->md_leaf_pages = 0;
			rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno);
			if (rc)
				return rc;
			/* Adjust cursors pointing to mp */
			mc->mc_snum = 0;
			mc->mc_top = 0;
			mc->mc_flags &= ~C_INITIALIZED;
			{
				MDB_cursor *m2, *m3;
				MDB_dbi dbi = mc->mc_dbi;

				for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
					if (mc->mc_flags & C_SUB)
						m3 = &m2->mc_xcursor->mx_cursor;
					else
						m3 = m2;
					/* mc_snum was just set to 0, so this test never skips. */
					if (m3->mc_snum < mc->mc_snum) continue;
					if (m3->mc_pg[0] == mp) {
						m3->mc_snum = 0;
						m3->mc_top = 0;
						m3->mc_flags &= ~C_INITIALIZED;
					}
				}
			}
		} else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) {
			/* A branch root with a single child: that child becomes the root. */
			DPUTS("collapsing root page!");
			rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno);
			if (rc)
				return rc;
			mc->mc_db->md_root = NODEPGNO(NODEPTR(mp, 0));
			rc = mdb_page_get(mc->mc_txn,mc->mc_db->md_root,&mc->mc_pg[0],NULL);
			if (rc)
				return rc;
			mc->mc_db->md_depth--;
			mc->mc_db->md_branch_pages--;
			mc->mc_ki[0] = mc->mc_ki[1];
			{
				/* Adjust other cursors pointing to mp */
				MDB_cursor *m2, *m3;
				MDB_dbi dbi = mc->mc_dbi;

				for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
					if (mc->mc_flags & C_SUB)
						m3 = &m2->mc_xcursor->mx_cursor;
					else
						m3 = m2;
					if (m3 == mc || m3->mc_snum < mc->mc_snum) continue;
					if (m3->mc_pg[0] == mp) {
						/* Drop the old root level by shifting the stack down one. */
						int i;
						m3->mc_snum--;
						m3->mc_top--;
						for (i=0; i<m3->mc_snum; i++) {
							m3->mc_pg[i] = m3->mc_pg[i+1];
							m3->mc_ki[i] = m3->mc_ki[i+1];
						}
					}
				}
			}
		} else
			DPUTS("root page doesn't need rebalancing");
		return MDB_SUCCESS;
	}

	/* The parent (branch page) must have at least 2 pointers,
	 * otherwise the tree is invalid.
	 */
	ptop = mc->mc_top-1;
	mdb_cassert(mc, NUMKEYS(mc->mc_pg[ptop]) > 1);

	/* Leaf page fill factor is below the threshold.
	 * Try to move keys from left or right neighbor, or
	 * merge with a neighbor page.
	 */

	/* Find neighbors.
	 */
	/* mn becomes a cursor on the neighbor; mdb_cursor_copy() leaves
	 * mc_xcursor untouched, so it is cleared explicitly.
	 */
	mdb_cursor_copy(mc, &mn);
	mn.mc_xcursor = NULL;

	if (mc->mc_ki[ptop] == 0) {
		/* We're the leftmost leaf in our parent.
		 */
		DPUTS("reading right neighbor");
		mn.mc_ki[ptop]++;
		node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]);
		rc = mdb_page_get(mc->mc_txn,NODEPGNO(node),&mn.mc_pg[mn.mc_top],NULL);
		if (rc)
			return rc;
		/* Take from the neighbor's first entry; insert after our last entry. */
		mn.mc_ki[mn.mc_top] = 0;
		mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]);
	} else {
		/* There is at least one neighbor to the left.
		 */
		DPUTS("reading left neighbor");
		mn.mc_ki[ptop]--;
		node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]);
		rc = mdb_page_get(mc->mc_txn,NODEPGNO(node),&mn.mc_pg[mn.mc_top],NULL);
		if (rc)
			return rc;
		/* Take from the neighbor's last entry; insert before our first entry. */
		mn.mc_ki[mn.mc_top] = NUMKEYS(mn.mc_pg[mn.mc_top]) - 1;
		mc->mc_ki[mc->mc_top] = 0;
	}

	DPRINTF(("found neighbor page %"Z"u (%u keys, %.1f%% full)",
	    mn.mc_pg[mn.mc_top]->mp_pgno, NUMKEYS(mn.mc_pg[mn.mc_top]),
		(float)PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) / 10));

	/* If the neighbor page is above threshold and has enough keys,
	 * move one key from it. Otherwise we should try to merge them.
	 * (A branch page must never have less than 2 keys.)
	 */
	minkeys = 1 + (IS_BRANCH(mn.mc_pg[mn.mc_top]));
	if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= FILL_THRESHOLD && NUMKEYS(mn.mc_pg[mn.mc_top]) > minkeys)
		return mdb_node_move(&mn, mc);
	else {
		if (mc->mc_ki[ptop] == 0)
			/* Leftmost page: the right neighbor is merged into this page. */
			rc = mdb_page_merge(&mn, mc);
		else {
			/* Otherwise this page is merged into its left neighbor, and
			 * mc is then repositioned from the neighbor cursor.
			 */
			mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1;
			rc = mdb_page_merge(mc, &mn);
			mdb_cursor_copy(&mn, mc);
		}
		mc->mc_flags &= ~(C_INITIALIZED|C_EOF);
	}
	return rc;
}

/** Complete a delete operation started by #mdb_cursor_del().
*/ 7406 static int 7407 mdb_cursor_del0(MDB_cursor *mc, MDB_node *leaf) 7408 { 7409 int rc; 7410 MDB_page *mp; 7411 indx_t ki; 7412 unsigned int nkeys; 7413 7414 mp = mc->mc_pg[mc->mc_top]; 7415 ki = mc->mc_ki[mc->mc_top]; 7416 7417 /* add overflow pages to free list */ 7418 if (!IS_LEAF2(mp) && F_ISSET(leaf->mn_flags, F_BIGDATA)) { 7419 MDB_page *omp; 7420 pgno_t pg; 7421 7422 memcpy(&pg, NODEDATA(leaf), sizeof(pg)); 7423 if ((rc = mdb_page_get(mc->mc_txn, pg, &omp, NULL)) || 7424 (rc = mdb_ovpage_free(mc, omp))) 7425 return rc; 7426 } 7427 mdb_node_del(mc, mc->mc_db->md_pad); 7428 mc->mc_db->md_entries--; 7429 rc = mdb_rebalance(mc); 7430 if (rc != MDB_SUCCESS) 7431 mc->mc_txn->mt_flags |= MDB_TXN_ERROR; 7432 else { 7433 MDB_cursor *m2, *m3; 7434 MDB_dbi dbi = mc->mc_dbi; 7435 7436 mp = mc->mc_pg[mc->mc_top]; 7437 nkeys = NUMKEYS(mp); 7438 7439 /* if mc points past last node in page, find next sibling */ 7440 if (mc->mc_ki[mc->mc_top] >= nkeys) 7441 mdb_cursor_sibling(mc, 1); 7442 7443 /* Adjust other cursors pointing to mp */ 7444 for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { 7445 m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; 7446 if (! 
(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) 7447 continue; 7448 if (m3 == mc || m3->mc_snum < mc->mc_snum) 7449 continue; 7450 if (m3->mc_pg[mc->mc_top] == mp) { 7451 if (m3->mc_ki[mc->mc_top] >= ki) { 7452 m3->mc_flags |= C_DEL; 7453 if (m3->mc_ki[mc->mc_top] > ki) 7454 m3->mc_ki[mc->mc_top]--; 7455 } 7456 if (m3->mc_ki[mc->mc_top] >= nkeys) 7457 mdb_cursor_sibling(m3, 1); 7458 } 7459 } 7460 mc->mc_flags |= C_DEL; 7461 } 7462 7463 return rc; 7464 } 7465 7466 int 7467 mdb_del(MDB_txn *txn, MDB_dbi dbi, 7468 MDB_val *key, MDB_val *data) 7469 { 7470 MDB_cursor mc; 7471 MDB_xcursor mx; 7472 MDB_cursor_op op; 7473 MDB_val rdata, *xdata; 7474 int rc, exact; 7475 DKBUF; 7476 7477 if (key == NULL) 7478 return EINVAL; 7479 7480 DPRINTF(("====> delete db %u key [%s]", dbi, DKEY(key))); 7481 7482 if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) 7483 return EINVAL; 7484 7485 if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) 7486 return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; 7487 7488 mdb_cursor_init(&mc, txn, dbi, &mx); 7489 7490 exact = 0; 7491 if (!F_ISSET(txn->mt_dbs[dbi].md_flags, MDB_DUPSORT)) { 7492 /* must ignore any data */ 7493 data = NULL; 7494 } 7495 if (data) { 7496 op = MDB_GET_BOTH; 7497 rdata = *data; 7498 xdata = &rdata; 7499 } else { 7500 op = MDB_SET; 7501 xdata = NULL; 7502 } 7503 rc = mdb_cursor_set(&mc, key, xdata, op, &exact); 7504 if (rc == 0) { 7505 /* let mdb_page_split know about this cursor if needed: 7506 * delete will trigger a rebalance; if it needs to move 7507 * a node from one page to another, it will have to 7508 * update the parent's separator key(s). If the new sepkey 7509 * is larger than the current one, the parent page may 7510 * run out of space, triggering a split. We need this 7511 * cursor to be consistent until the end of the rebalance. 
7512 */ 7513 mc.mc_flags |= C_UNTRACK; 7514 mc.mc_next = txn->mt_cursors[dbi]; 7515 txn->mt_cursors[dbi] = &mc; 7516 rc = mdb_cursor_del(&mc, data ? 0 : MDB_NODUPDATA); 7517 txn->mt_cursors[dbi] = mc.mc_next; 7518 } 7519 return rc; 7520 } 7521 7522 /** Split a page and insert a new node. 7523 * @param[in,out] mc Cursor pointing to the page and desired insertion index. 7524 * The cursor will be updated to point to the actual page and index where 7525 * the node got inserted after the split. 7526 * @param[in] newkey The key for the newly inserted node. 7527 * @param[in] newdata The data for the newly inserted node. 7528 * @param[in] newpgno The page number, if the new node is a branch node. 7529 * @param[in] nflags The #NODE_ADD_FLAGS for the new node. 7530 * @return 0 on success, non-zero on failure. 7531 */ 7532 static int 7533 mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno, 7534 unsigned int nflags) 7535 { 7536 unsigned int flags; 7537 int rc = MDB_SUCCESS, new_root = 0, did_split = 0; 7538 indx_t newindx; 7539 pgno_t pgno = 0; 7540 int i, j, split_indx, nkeys, pmax; 7541 MDB_env *env = mc->mc_txn->mt_env; 7542 MDB_node *node; 7543 MDB_val sepkey, rkey, xdata, *rdata = &xdata; 7544 MDB_page *copy = NULL; 7545 MDB_page *mp, *rp, *pp; 7546 int ptop; 7547 MDB_cursor mn; 7548 DKBUF; 7549 7550 mp = mc->mc_pg[mc->mc_top]; 7551 newindx = mc->mc_ki[mc->mc_top]; 7552 nkeys = NUMKEYS(mp); 7553 7554 DPRINTF(("-----> splitting %s page %"Z"u and adding [%s] at index %i/%i", 7555 IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno, 7556 DKEY(newkey), mc->mc_ki[mc->mc_top], nkeys)); 7557 7558 /* Create a right sibling. 
*/ 7559 if ((rc = mdb_page_new(mc, mp->mp_flags, 1, &rp))) 7560 return rc; 7561 DPRINTF(("new right sibling: page %"Z"u", rp->mp_pgno)); 7562 7563 if (mc->mc_snum < 2) { 7564 if ((rc = mdb_page_new(mc, P_BRANCH, 1, &pp))) 7565 return rc; 7566 /* shift current top to make room for new parent */ 7567 mc->mc_pg[1] = mc->mc_pg[0]; 7568 mc->mc_ki[1] = mc->mc_ki[0]; 7569 mc->mc_pg[0] = pp; 7570 mc->mc_ki[0] = 0; 7571 mc->mc_db->md_root = pp->mp_pgno; 7572 DPRINTF(("root split! new root = %"Z"u", pp->mp_pgno)); 7573 mc->mc_db->md_depth++; 7574 new_root = 1; 7575 7576 /* Add left (implicit) pointer. */ 7577 if ((rc = mdb_node_add(mc, 0, NULL, NULL, mp->mp_pgno, 0)) != MDB_SUCCESS) { 7578 /* undo the pre-push */ 7579 mc->mc_pg[0] = mc->mc_pg[1]; 7580 mc->mc_ki[0] = mc->mc_ki[1]; 7581 mc->mc_db->md_root = mp->mp_pgno; 7582 mc->mc_db->md_depth--; 7583 return rc; 7584 } 7585 mc->mc_snum = 2; 7586 mc->mc_top = 1; 7587 ptop = 0; 7588 } else { 7589 ptop = mc->mc_top-1; 7590 DPRINTF(("parent branch page is %"Z"u", mc->mc_pg[ptop]->mp_pgno)); 7591 } 7592 7593 mc->mc_flags |= C_SPLITTING; 7594 mdb_cursor_copy(mc, &mn); 7595 mn.mc_pg[mn.mc_top] = rp; 7596 mn.mc_ki[ptop] = mc->mc_ki[ptop]+1; 7597 7598 if (nflags & MDB_APPEND) { 7599 mn.mc_ki[mn.mc_top] = 0; 7600 sepkey = *newkey; 7601 split_indx = newindx; 7602 nkeys = 0; 7603 } else { 7604 7605 split_indx = (nkeys+1) / 2; 7606 7607 if (IS_LEAF2(rp)) { 7608 char *split, *ins; 7609 int x; 7610 unsigned int lsize, rsize, ksize; 7611 /* Move half of the keys to the right sibling */ 7612 copy = NULL; 7613 x = mc->mc_ki[mc->mc_top] - split_indx; 7614 ksize = mc->mc_db->md_pad; 7615 split = LEAF2KEY(mp, split_indx, ksize); 7616 rsize = (nkeys - split_indx) * ksize; 7617 lsize = (nkeys - split_indx) * sizeof(indx_t); 7618 mp->mp_lower -= lsize; 7619 rp->mp_lower += lsize; 7620 mp->mp_upper += rsize - lsize; 7621 rp->mp_upper -= rsize - lsize; 7622 sepkey.mv_size = ksize; 7623 if (newindx == split_indx) { 7624 sepkey.mv_data = 
newkey->mv_data; 7625 } else { 7626 sepkey.mv_data = split; 7627 } 7628 if (x<0) { 7629 ins = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], ksize); 7630 memcpy(rp->mp_ptrs, split, rsize); 7631 sepkey.mv_data = rp->mp_ptrs; 7632 memmove(ins+ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize); 7633 memcpy(ins, newkey->mv_data, ksize); 7634 mp->mp_lower += sizeof(indx_t); 7635 mp->mp_upper -= ksize - sizeof(indx_t); 7636 } else { 7637 if (x) 7638 memcpy(rp->mp_ptrs, split, x * ksize); 7639 ins = LEAF2KEY(rp, x, ksize); 7640 memcpy(ins, newkey->mv_data, ksize); 7641 memcpy(ins+ksize, split + x * ksize, rsize - x * ksize); 7642 rp->mp_lower += sizeof(indx_t); 7643 rp->mp_upper -= ksize - sizeof(indx_t); 7644 mc->mc_ki[mc->mc_top] = x; 7645 mc->mc_pg[mc->mc_top] = rp; 7646 } 7647 } else { 7648 int psize, nsize, k; 7649 /* Maximum free space in an empty page */ 7650 pmax = env->me_psize - PAGEHDRSZ; 7651 if (IS_LEAF(mp)) 7652 nsize = mdb_leaf_size(env, newkey, newdata); 7653 else 7654 nsize = mdb_branch_size(env, newkey); 7655 nsize = EVEN(nsize); 7656 7657 /* grab a page to hold a temporary copy */ 7658 copy = mdb_page_malloc(mc->mc_txn, 1); 7659 if (copy == NULL) 7660 return ENOMEM; 7661 copy->mp_pgno = mp->mp_pgno; 7662 copy->mp_flags = mp->mp_flags; 7663 copy->mp_lower = PAGEHDRSZ; 7664 copy->mp_upper = env->me_psize; 7665 7666 /* prepare to insert */ 7667 for (i=0, j=0; i<nkeys; i++) { 7668 if (i == newindx) { 7669 copy->mp_ptrs[j++] = 0; 7670 } 7671 copy->mp_ptrs[j++] = mp->mp_ptrs[i]; 7672 } 7673 7674 /* When items are relatively large the split point needs 7675 * to be checked, because being off-by-one will make the 7676 * difference between success or failure in mdb_node_add. 7677 * 7678 * It's also relevant if a page happens to be laid out 7679 * such that one half of its nodes are all "small" and 7680 * the other half of its nodes are "large." If the new 7681 * item is also "large" and falls on the half with 7682 * "large" nodes, it also may not fit. 
7683 * 7684 * As a final tweak, if the new item goes on the last 7685 * spot on the page (and thus, onto the new page), bias 7686 * the split so the new page is emptier than the old page. 7687 * This yields better packing during sequential inserts. 7688 */ 7689 if (nkeys < 20 || nsize > pmax/16 || newindx >= nkeys) { 7690 /* Find split point */ 7691 psize = 0; 7692 if (newindx <= split_indx || newindx >= nkeys) { 7693 i = 0; j = 1; 7694 k = newindx >= nkeys ? nkeys : split_indx+2; 7695 } else { 7696 i = nkeys; j = -1; 7697 k = split_indx-1; 7698 } 7699 for (; i!=k; i+=j) { 7700 if (i == newindx) { 7701 psize += nsize; 7702 node = NULL; 7703 } else { 7704 node = (MDB_node *)((char *)mp + copy->mp_ptrs[i]); 7705 psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t); 7706 if (IS_LEAF(mp)) { 7707 if (F_ISSET(node->mn_flags, F_BIGDATA)) 7708 psize += sizeof(pgno_t); 7709 else 7710 psize += NODEDSZ(node); 7711 } 7712 psize = EVEN(psize); 7713 } 7714 if (psize > pmax || i == k-j) { 7715 split_indx = i + (j<0); 7716 break; 7717 } 7718 } 7719 } 7720 if (split_indx == newindx) { 7721 sepkey.mv_size = newkey->mv_size; 7722 sepkey.mv_data = newkey->mv_data; 7723 } else { 7724 node = (MDB_node *)((char *)mp + copy->mp_ptrs[split_indx]); 7725 sepkey.mv_size = node->mn_ksize; 7726 sepkey.mv_data = NODEKEY(node); 7727 } 7728 } 7729 } 7730 7731 DPRINTF(("separator is %d [%s]", split_indx, DKEY(&sepkey))); 7732 7733 /* Copy separator key to the parent. 7734 */ 7735 if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(env, &sepkey)) { 7736 mn.mc_snum--; 7737 mn.mc_top--; 7738 did_split = 1; 7739 rc = mdb_page_split(&mn, &sepkey, NULL, rp->mp_pgno, 0); 7740 7741 /* root split? 
*/ 7742 if (mn.mc_snum == mc->mc_snum) { 7743 mc->mc_pg[mc->mc_snum] = mc->mc_pg[mc->mc_top]; 7744 mc->mc_ki[mc->mc_snum] = mc->mc_ki[mc->mc_top]; 7745 mc->mc_pg[mc->mc_top] = mc->mc_pg[ptop]; 7746 mc->mc_ki[mc->mc_top] = mc->mc_ki[ptop]; 7747 mc->mc_snum++; 7748 mc->mc_top++; 7749 ptop++; 7750 } 7751 /* Right page might now have changed parent. 7752 * Check if left page also changed parent. 7753 */ 7754 if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && 7755 mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { 7756 for (i=0; i<ptop; i++) { 7757 mc->mc_pg[i] = mn.mc_pg[i]; 7758 mc->mc_ki[i] = mn.mc_ki[i]; 7759 } 7760 mc->mc_pg[ptop] = mn.mc_pg[ptop]; 7761 mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1; 7762 } 7763 } else { 7764 mn.mc_top--; 7765 rc = mdb_node_add(&mn, mn.mc_ki[ptop], &sepkey, NULL, rp->mp_pgno, 0); 7766 mn.mc_top++; 7767 } 7768 mc->mc_flags ^= C_SPLITTING; 7769 if (rc != MDB_SUCCESS) { 7770 return rc; 7771 } 7772 if (nflags & MDB_APPEND) { 7773 mc->mc_pg[mc->mc_top] = rp; 7774 mc->mc_ki[mc->mc_top] = 0; 7775 rc = mdb_node_add(mc, 0, newkey, newdata, newpgno, nflags); 7776 if (rc) 7777 return rc; 7778 for (i=0; i<mc->mc_top; i++) 7779 mc->mc_ki[i] = mn.mc_ki[i]; 7780 } else if (!IS_LEAF2(mp)) { 7781 /* Move nodes */ 7782 mc->mc_pg[mc->mc_top] = rp; 7783 i = split_indx; 7784 j = 0; 7785 do { 7786 if (i == newindx) { 7787 rkey.mv_data = newkey->mv_data; 7788 rkey.mv_size = newkey->mv_size; 7789 if (IS_LEAF(mp)) { 7790 rdata = newdata; 7791 } else 7792 pgno = newpgno; 7793 flags = nflags; 7794 /* Update index for the new key. */ 7795 mc->mc_ki[mc->mc_top] = j; 7796 } else { 7797 node = (MDB_node *)((char *)mp + copy->mp_ptrs[i]); 7798 rkey.mv_data = NODEKEY(node); 7799 rkey.mv_size = node->mn_ksize; 7800 if (IS_LEAF(mp)) { 7801 xdata.mv_data = NODEDATA(node); 7802 xdata.mv_size = NODEDSZ(node); 7803 rdata = &xdata; 7804 } else 7805 pgno = NODEPGNO(node); 7806 flags = node->mn_flags; 7807 } 7808 7809 if (!IS_LEAF(mp) && j == 0) { 7810 /* First branch index doesn't need key data. 
*/ 7811 rkey.mv_size = 0; 7812 } 7813 7814 rc = mdb_node_add(mc, j, &rkey, rdata, pgno, flags); 7815 if (rc) { 7816 /* return tmp page to freelist */ 7817 mdb_page_free(env, copy); 7818 return rc; 7819 } 7820 if (i == nkeys) { 7821 i = 0; 7822 j = 0; 7823 mc->mc_pg[mc->mc_top] = copy; 7824 } else { 7825 i++; 7826 j++; 7827 } 7828 } while (i != split_indx); 7829 7830 nkeys = NUMKEYS(copy); 7831 for (i=0; i<nkeys; i++) 7832 mp->mp_ptrs[i] = copy->mp_ptrs[i]; 7833 mp->mp_lower = copy->mp_lower; 7834 mp->mp_upper = copy->mp_upper; 7835 memcpy(NODEPTR(mp, nkeys-1), NODEPTR(copy, nkeys-1), 7836 env->me_psize - copy->mp_upper); 7837 7838 /* reset back to original page */ 7839 if (newindx < split_indx) { 7840 mc->mc_pg[mc->mc_top] = mp; 7841 if (nflags & MDB_RESERVE) { 7842 node = NODEPTR(mp, mc->mc_ki[mc->mc_top]); 7843 if (!(node->mn_flags & F_BIGDATA)) 7844 newdata->mv_data = NODEDATA(node); 7845 } 7846 } else { 7847 mc->mc_pg[mc->mc_top] = rp; 7848 mc->mc_ki[ptop]++; 7849 /* Make sure mc_ki is still valid. 
7850 */ 7851 if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && 7852 mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { 7853 for (i=0; i<ptop; i++) { 7854 mc->mc_pg[i] = mn.mc_pg[i]; 7855 mc->mc_ki[i] = mn.mc_ki[i]; 7856 } 7857 mc->mc_pg[ptop] = mn.mc_pg[ptop]; 7858 mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1; 7859 } 7860 } 7861 /* return tmp page to freelist */ 7862 mdb_page_free(env, copy); 7863 } 7864 7865 { 7866 /* Adjust other cursors pointing to mp */ 7867 MDB_cursor *m2, *m3; 7868 MDB_dbi dbi = mc->mc_dbi; 7869 int fixup = NUMKEYS(mp); 7870 7871 for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { 7872 if (mc->mc_flags & C_SUB) 7873 m3 = &m2->mc_xcursor->mx_cursor; 7874 else 7875 m3 = m2; 7876 if (m3 == mc) 7877 continue; 7878 if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) 7879 continue; 7880 if (m3->mc_flags & C_SPLITTING) 7881 continue; 7882 if (new_root) { 7883 int k; 7884 /* root split */ 7885 for (k=m3->mc_top; k>=0; k--) { 7886 m3->mc_ki[k+1] = m3->mc_ki[k]; 7887 m3->mc_pg[k+1] = m3->mc_pg[k]; 7888 } 7889 if (m3->mc_ki[0] >= split_indx) { 7890 m3->mc_ki[0] = 1; 7891 } else { 7892 m3->mc_ki[0] = 0; 7893 } 7894 m3->mc_pg[0] = mc->mc_pg[0]; 7895 m3->mc_snum++; 7896 m3->mc_top++; 7897 } 7898 if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp) { 7899 if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & MDB_SPLIT_REPLACE)) 7900 m3->mc_ki[mc->mc_top]++; 7901 if (m3->mc_ki[mc->mc_top] >= fixup) { 7902 m3->mc_pg[mc->mc_top] = rp; 7903 m3->mc_ki[mc->mc_top] -= fixup; 7904 m3->mc_ki[ptop] = mn.mc_ki[ptop]; 7905 } 7906 } else if (!did_split && m3->mc_top >= ptop && m3->mc_pg[ptop] == mc->mc_pg[ptop] && 7907 m3->mc_ki[ptop] >= mc->mc_ki[ptop]) { 7908 m3->mc_ki[ptop]++; 7909 } 7910 } 7911 } 7912 DPRINTF(("mp left: %d, rp left: %d", SIZELEFT(mp), SIZELEFT(rp))); 7913 return rc; 7914 } 7915 7916 int 7917 mdb_put(MDB_txn *txn, MDB_dbi dbi, 7918 MDB_val *key, MDB_val *data, unsigned int flags) 7919 { 7920 MDB_cursor mc; 7921 MDB_xcursor mx; 7922 7923 if (key == NULL || 
data == NULL) 7924 return EINVAL; 7925 7926 if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) 7927 return EINVAL; 7928 7929 if ((flags & (MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND|MDB_APPENDDUP)) != flags) 7930 return EINVAL; 7931 7932 mdb_cursor_init(&mc, txn, dbi, &mx); 7933 return mdb_cursor_put(&mc, key, data, flags); 7934 } 7935 7936 int 7937 mdb_env_set_flags(MDB_env *env, unsigned int flag, int onoff) 7938 { 7939 if ((flag & CHANGEABLE) != flag) 7940 return EINVAL; 7941 if (onoff) 7942 env->me_flags |= flag; 7943 else 7944 env->me_flags &= ~flag; 7945 return MDB_SUCCESS; 7946 } 7947 7948 int 7949 mdb_env_get_flags(MDB_env *env, unsigned int *arg) 7950 { 7951 if (!env || !arg) 7952 return EINVAL; 7953 7954 *arg = env->me_flags; 7955 return MDB_SUCCESS; 7956 } 7957 7958 int 7959 mdb_env_set_userctx(MDB_env *env, void *ctx) 7960 { 7961 if (!env) 7962 return EINVAL; 7963 env->me_userctx = ctx; 7964 return MDB_SUCCESS; 7965 } 7966 7967 void * 7968 mdb_env_get_userctx(MDB_env *env) 7969 { 7970 return env ? env->me_userctx : NULL; 7971 } 7972 7973 int 7974 mdb_env_set_assert(MDB_env *env, MDB_assert_func *func) 7975 { 7976 if (!env) 7977 return EINVAL; 7978 #ifndef NDEBUG 7979 env->me_assert_func = func; 7980 #endif 7981 return MDB_SUCCESS; 7982 } 7983 7984 int 7985 mdb_env_get_path(MDB_env *env, const char **arg) 7986 { 7987 if (!env || !arg) 7988 return EINVAL; 7989 7990 *arg = env->me_path; 7991 return MDB_SUCCESS; 7992 } 7993 7994 int 7995 mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *arg) 7996 { 7997 if (!env || !arg) 7998 return EINVAL; 7999 8000 *arg = env->me_fd; 8001 return MDB_SUCCESS; 8002 } 8003 8004 /** Common code for #mdb_stat() and #mdb_env_stat(). 8005 * @param[in] env the environment to operate in. 8006 * @param[in] db the #MDB_db record containing the stats to return. 8007 * @param[out] arg the address of an #MDB_stat structure to receive the stats. 8008 * @return 0, this function always succeeds. 
8009 */ 8010 static int 8011 mdb_stat0(MDB_env *env, MDB_db *db, MDB_stat *arg) 8012 { 8013 arg->ms_psize = env->me_psize; 8014 arg->ms_depth = db->md_depth; 8015 arg->ms_branch_pages = db->md_branch_pages; 8016 arg->ms_leaf_pages = db->md_leaf_pages; 8017 arg->ms_overflow_pages = db->md_overflow_pages; 8018 arg->ms_entries = db->md_entries; 8019 8020 return MDB_SUCCESS; 8021 } 8022 int 8023 mdb_env_stat(MDB_env *env, MDB_stat *arg) 8024 { 8025 int toggle; 8026 8027 if (env == NULL || arg == NULL) 8028 return EINVAL; 8029 8030 toggle = mdb_env_pick_meta(env); 8031 8032 return mdb_stat0(env, &env->me_metas[toggle]->mm_dbs[MAIN_DBI], arg); 8033 } 8034 8035 int 8036 mdb_env_info(MDB_env *env, MDB_envinfo *arg) 8037 { 8038 int toggle; 8039 8040 if (env == NULL || arg == NULL) 8041 return EINVAL; 8042 8043 toggle = mdb_env_pick_meta(env); 8044 arg->me_mapaddr = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : 0; 8045 arg->me_mapsize = env->me_mapsize; 8046 arg->me_maxreaders = env->me_maxreaders; 8047 8048 /* me_numreaders may be zero if this process never used any readers. Use 8049 * the shared numreader count if it exists. 8050 */ 8051 arg->me_numreaders = env->me_txns ? env->me_txns->mti_numreaders : env->me_numreaders; 8052 8053 arg->me_last_pgno = env->me_metas[toggle]->mm_last_pg; 8054 arg->me_last_txnid = env->me_metas[toggle]->mm_txnid; 8055 return MDB_SUCCESS; 8056 } 8057 8058 /** Set the default comparison functions for a database. 8059 * Called immediately after a database is opened to set the defaults. 8060 * The user can then override them with #mdb_set_compare() or 8061 * #mdb_set_dupsort(). 8062 * @param[in] txn A transaction handle returned by #mdb_txn_begin() 8063 * @param[in] dbi A database handle returned by #mdb_dbi_open() 8064 */ 8065 static void 8066 mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi) 8067 { 8068 uint16_t f = txn->mt_dbs[dbi].md_flags; 8069 8070 txn->mt_dbxs[dbi].md_cmp = 8071 (f & MDB_REVERSEKEY) ? mdb_cmp_memnr : 8072 (f & MDB_INTEGERKEY) ? 
mdb_cmp_cint : mdb_cmp_memn; 8073 8074 txn->mt_dbxs[dbi].md_dcmp = 8075 !(f & MDB_DUPSORT) ? 0 : 8076 ((f & MDB_INTEGERDUP) 8077 ? ((f & MDB_DUPFIXED) ? mdb_cmp_int : mdb_cmp_cint) 8078 : ((f & MDB_REVERSEDUP) ? mdb_cmp_memnr : mdb_cmp_memn)); 8079 } 8080 8081 int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *dbi) 8082 { 8083 MDB_val key, data; 8084 MDB_dbi i; 8085 MDB_cursor mc; 8086 int rc, dbflag, exact; 8087 unsigned int unused = 0; 8088 size_t len; 8089 8090 if (txn->mt_dbxs[FREE_DBI].md_cmp == NULL) { 8091 mdb_default_cmp(txn, FREE_DBI); 8092 } 8093 8094 if ((flags & VALID_FLAGS) != flags) 8095 return EINVAL; 8096 if (txn->mt_flags & MDB_TXN_ERROR) 8097 return MDB_BAD_TXN; 8098 8099 /* main DB? */ 8100 if (!name) { 8101 *dbi = MAIN_DBI; 8102 if (flags & PERSISTENT_FLAGS) { 8103 uint16_t f2 = flags & PERSISTENT_FLAGS; 8104 /* make sure flag changes get committed */ 8105 if ((txn->mt_dbs[MAIN_DBI].md_flags | f2) != txn->mt_dbs[MAIN_DBI].md_flags) { 8106 txn->mt_dbs[MAIN_DBI].md_flags |= f2; 8107 txn->mt_flags |= MDB_TXN_DIRTY; 8108 } 8109 } 8110 mdb_default_cmp(txn, MAIN_DBI); 8111 return MDB_SUCCESS; 8112 } 8113 8114 if (txn->mt_dbxs[MAIN_DBI].md_cmp == NULL) { 8115 mdb_default_cmp(txn, MAIN_DBI); 8116 } 8117 8118 /* Is the DB already open? */ 8119 len = strlen(name); 8120 for (i=2; i<txn->mt_numdbs; i++) { 8121 if (!txn->mt_dbxs[i].md_name.mv_size) { 8122 /* Remember this free slot */ 8123 if (!unused) unused = i; 8124 continue; 8125 } 8126 if (len == txn->mt_dbxs[i].md_name.mv_size && 8127 !strncmp(name, txn->mt_dbxs[i].md_name.mv_data, len)) { 8128 *dbi = i; 8129 return MDB_SUCCESS; 8130 } 8131 } 8132 8133 /* If no free slot and max hit, fail */ 8134 if (!unused && txn->mt_numdbs >= txn->mt_env->me_maxdbs) 8135 return MDB_DBS_FULL; 8136 8137 /* Cannot mix named databases with some mainDB flags */ 8138 if (txn->mt_dbs[MAIN_DBI].md_flags & (MDB_DUPSORT|MDB_INTEGERKEY)) 8139 return (flags & MDB_CREATE) ? 
MDB_INCOMPATIBLE : MDB_NOTFOUND; 8140 8141 /* Find the DB info */ 8142 dbflag = DB_NEW|DB_VALID; 8143 exact = 0; 8144 key.mv_size = len; 8145 key.mv_data = (void *)name; 8146 mdb_cursor_init(&mc, txn, MAIN_DBI, NULL); 8147 rc = mdb_cursor_set(&mc, &key, &data, MDB_SET, &exact); 8148 if (rc == MDB_SUCCESS) { 8149 /* make sure this is actually a DB */ 8150 MDB_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); 8151 if (!(node->mn_flags & F_SUBDATA)) 8152 return MDB_INCOMPATIBLE; 8153 } else if (rc == MDB_NOTFOUND && (flags & MDB_CREATE)) { 8154 /* Create if requested */ 8155 MDB_db dummy; 8156 data.mv_size = sizeof(MDB_db); 8157 data.mv_data = &dummy; 8158 memset(&dummy, 0, sizeof(dummy)); 8159 dummy.md_root = P_INVALID; 8160 dummy.md_flags = flags & PERSISTENT_FLAGS; 8161 rc = mdb_cursor_put(&mc, &key, &data, F_SUBDATA); 8162 dbflag |= DB_DIRTY; 8163 } 8164 8165 /* OK, got info, add to table */ 8166 if (rc == MDB_SUCCESS) { 8167 unsigned int slot = unused ? unused : txn->mt_numdbs; 8168 txn->mt_dbxs[slot].md_name.mv_data = strdup(name); 8169 txn->mt_dbxs[slot].md_name.mv_size = len; 8170 txn->mt_dbxs[slot].md_rel = NULL; 8171 txn->mt_dbflags[slot] = dbflag; 8172 memcpy(&txn->mt_dbs[slot], data.mv_data, sizeof(MDB_db)); 8173 *dbi = slot; 8174 mdb_default_cmp(txn, slot); 8175 if (!unused) { 8176 txn->mt_numdbs++; 8177 } 8178 } 8179 8180 return rc; 8181 } 8182 8183 int mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *arg) 8184 { 8185 if (txn == NULL || arg == NULL || dbi >= txn->mt_numdbs) 8186 return EINVAL; 8187 8188 if (txn->mt_dbflags[dbi] & DB_STALE) { 8189 MDB_cursor mc; 8190 MDB_xcursor mx; 8191 /* Stale, must read the DB's root. cursor_init does it for us. 
*/ 8192 mdb_cursor_init(&mc, txn, dbi, &mx); 8193 } 8194 return mdb_stat0(txn->mt_env, &txn->mt_dbs[dbi], arg); 8195 } 8196 8197 void mdb_dbi_close(MDB_env *env, MDB_dbi dbi) 8198 { 8199 char *ptr; 8200 if (dbi <= MAIN_DBI || dbi >= env->me_maxdbs) 8201 return; 8202 ptr = env->me_dbxs[dbi].md_name.mv_data; 8203 env->me_dbxs[dbi].md_name.mv_data = NULL; 8204 env->me_dbxs[dbi].md_name.mv_size = 0; 8205 env->me_dbflags[dbi] = 0; 8206 free(ptr); 8207 } 8208 8209 int mdb_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned int *flags) 8210 { 8211 /* We could return the flags for the FREE_DBI too but what's the point? */ 8212 if (txn == NULL || dbi < MAIN_DBI || dbi >= txn->mt_numdbs) 8213 return EINVAL; 8214 *flags = txn->mt_dbs[dbi].md_flags & PERSISTENT_FLAGS; 8215 return MDB_SUCCESS; 8216 } 8217 8218 /** Add all the DB's pages to the free list. 8219 * @param[in] mc Cursor on the DB to free. 8220 * @param[in] subs non-Zero to check for sub-DBs in this DB. 8221 * @return 0 on success, non-zero on failure. 
8222 */ 8223 static int 8224 mdb_drop0(MDB_cursor *mc, int subs) 8225 { 8226 int rc; 8227 8228 rc = mdb_page_search(mc, NULL, MDB_PS_FIRST); 8229 if (rc == MDB_SUCCESS) { 8230 MDB_txn *txn = mc->mc_txn; 8231 MDB_node *ni; 8232 MDB_cursor mx; 8233 unsigned int i; 8234 8235 /* LEAF2 pages have no nodes, cannot have sub-DBs */ 8236 if (IS_LEAF2(mc->mc_pg[mc->mc_top])) 8237 mdb_cursor_pop(mc); 8238 8239 mdb_cursor_copy(mc, &mx); 8240 while (mc->mc_snum > 0) { 8241 MDB_page *mp = mc->mc_pg[mc->mc_top]; 8242 unsigned n = NUMKEYS(mp); 8243 if (IS_LEAF(mp)) { 8244 for (i=0; i<n; i++) { 8245 ni = NODEPTR(mp, i); 8246 if (ni->mn_flags & F_BIGDATA) { 8247 MDB_page *omp; 8248 pgno_t pg; 8249 memcpy(&pg, NODEDATA(ni), sizeof(pg)); 8250 rc = mdb_page_get(txn, pg, &omp, NULL); 8251 if (rc != 0) 8252 return rc; 8253 mdb_cassert(mc, IS_OVERFLOW(omp)); 8254 rc = mdb_midl_append_range(&txn->mt_free_pgs, 8255 pg, omp->mp_pages); 8256 if (rc) 8257 return rc; 8258 } else if (subs && (ni->mn_flags & F_SUBDATA)) { 8259 mdb_xcursor_init1(mc, ni); 8260 rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0); 8261 if (rc) 8262 return rc; 8263 } 8264 } 8265 } else { 8266 if ((rc = mdb_midl_need(&txn->mt_free_pgs, n)) != 0) 8267 return rc; 8268 for (i=0; i<n; i++) { 8269 pgno_t pg; 8270 ni = NODEPTR(mp, i); 8271 pg = NODEPGNO(ni); 8272 /* free it */ 8273 mdb_midl_xappend(txn->mt_free_pgs, pg); 8274 } 8275 } 8276 if (!mc->mc_top) 8277 break; 8278 mc->mc_ki[mc->mc_top] = i; 8279 rc = mdb_cursor_sibling(mc, 1); 8280 if (rc) { 8281 /* no more siblings, go back to beginning 8282 * of previous level. 
8283 */ 8284 mdb_cursor_pop(mc); 8285 mc->mc_ki[0] = 0; 8286 for (i=1; i<mc->mc_snum; i++) { 8287 mc->mc_ki[i] = 0; 8288 mc->mc_pg[i] = mx.mc_pg[i]; 8289 } 8290 } 8291 } 8292 /* free it */ 8293 rc = mdb_midl_append(&txn->mt_free_pgs, mc->mc_db->md_root); 8294 } else if (rc == MDB_NOTFOUND) { 8295 rc = MDB_SUCCESS; 8296 } 8297 return rc; 8298 } 8299 8300 int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del) 8301 { 8302 MDB_cursor *mc, *m2; 8303 int rc; 8304 8305 if (!txn || !dbi || dbi >= txn->mt_numdbs || (unsigned)del > 1 || !(txn->mt_dbflags[dbi] & DB_VALID)) 8306 return EINVAL; 8307 8308 if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) 8309 return EACCES; 8310 8311 rc = mdb_cursor_open(txn, dbi, &mc); 8312 if (rc) 8313 return rc; 8314 8315 rc = mdb_drop0(mc, mc->mc_db->md_flags & MDB_DUPSORT); 8316 /* Invalidate the dropped DB's cursors */ 8317 for (m2 = txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) 8318 m2->mc_flags &= ~(C_INITIALIZED|C_EOF); 8319 if (rc) 8320 goto leave; 8321 8322 /* Can't delete the main DB */ 8323 if (del && dbi > MAIN_DBI) { 8324 rc = mdb_del(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL); 8325 if (!rc) { 8326 txn->mt_dbflags[dbi] = DB_STALE; 8327 mdb_dbi_close(txn->mt_env, dbi); 8328 } 8329 } else { 8330 /* reset the DB record, mark it dirty */ 8331 txn->mt_dbflags[dbi] |= DB_DIRTY; 8332 txn->mt_dbs[dbi].md_depth = 0; 8333 txn->mt_dbs[dbi].md_branch_pages = 0; 8334 txn->mt_dbs[dbi].md_leaf_pages = 0; 8335 txn->mt_dbs[dbi].md_overflow_pages = 0; 8336 txn->mt_dbs[dbi].md_entries = 0; 8337 txn->mt_dbs[dbi].md_root = P_INVALID; 8338 8339 txn->mt_flags |= MDB_TXN_DIRTY; 8340 } 8341 leave: 8342 mdb_cursor_close(mc); 8343 return rc; 8344 } 8345 8346 int mdb_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) 8347 { 8348 if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) 8349 return EINVAL; 8350 8351 txn->mt_dbxs[dbi].md_cmp = cmp; 8352 return MDB_SUCCESS; 8353 } 8354 8355 int mdb_set_dupsort(MDB_txn *txn, MDB_dbi dbi, 
MDB_cmp_func *cmp) 8356 { 8357 if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) 8358 return EINVAL; 8359 8360 txn->mt_dbxs[dbi].md_dcmp = cmp; 8361 return MDB_SUCCESS; 8362 } 8363 8364 int mdb_set_relfunc(MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel) 8365 { 8366 if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) 8367 return EINVAL; 8368 8369 txn->mt_dbxs[dbi].md_rel = rel; 8370 return MDB_SUCCESS; 8371 } 8372 8373 int mdb_set_relctx(MDB_txn *txn, MDB_dbi dbi, void *ctx) 8374 { 8375 if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) 8376 return EINVAL; 8377 8378 txn->mt_dbxs[dbi].md_relctx = ctx; 8379 return MDB_SUCCESS; 8380 } 8381 8382 int mdb_env_get_maxkeysize(MDB_env *env) 8383 { 8384 return ENV_MAXKEY(env); 8385 } 8386 8387 int mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) 8388 { 8389 unsigned int i, rdrs; 8390 MDB_reader *mr; 8391 char buf[64]; 8392 int rc = 0, first = 1; 8393 8394 if (!env || !func) 8395 return -1; 8396 if (!env->me_txns) { 8397 return func("(no reader locks)\n", ctx); 8398 } 8399 rdrs = env->me_txns->mti_numreaders; 8400 mr = env->me_txns->mti_readers; 8401 for (i=0; i<rdrs; i++) { 8402 if (mr[i].mr_pid) { 8403 txnid_t txnid = mr[i].mr_txnid; 8404 sprintf(buf, txnid == (txnid_t)-1 ? 8405 "%10d %"Z"x -\n" : "%10d %"Z"x %"Z"u\n", 8406 (int)mr[i].mr_pid, (size_t)mr[i].mr_tid, txnid); 8407 if (first) { 8408 first = 0; 8409 rc = func(" pid thread txnid\n", ctx); 8410 if (rc < 0) 8411 break; 8412 } 8413 rc = func(buf, ctx); 8414 if (rc < 0) 8415 break; 8416 } 8417 } 8418 if (first) { 8419 rc = func("(no active readers)\n", ctx); 8420 } 8421 return rc; 8422 } 8423 8424 /** Insert pid into list if not already present. 8425 * return -1 if already present. 
8426 */ 8427 static int mdb_pid_insert(MDB_PID_T *ids, MDB_PID_T pid) 8428 { 8429 /* binary search of pid in list */ 8430 unsigned base = 0; 8431 unsigned cursor = 1; 8432 int val = 0; 8433 unsigned n = ids[0]; 8434 8435 while( 0 < n ) { 8436 unsigned pivot = n >> 1; 8437 cursor = base + pivot + 1; 8438 val = pid - ids[cursor]; 8439 8440 if( val < 0 ) { 8441 n = pivot; 8442 8443 } else if ( val > 0 ) { 8444 base = cursor; 8445 n -= pivot + 1; 8446 8447 } else { 8448 /* found, so it's a duplicate */ 8449 return -1; 8450 } 8451 } 8452 8453 if( val > 0 ) { 8454 ++cursor; 8455 } 8456 ids[0]++; 8457 for (n = ids[0]; n > cursor; n--) 8458 ids[n] = ids[n-1]; 8459 ids[n] = pid; 8460 return 0; 8461 } 8462 8463 int mdb_reader_check(MDB_env *env, int *dead) 8464 { 8465 unsigned int i, j, rdrs; 8466 MDB_reader *mr; 8467 MDB_PID_T *pids, pid; 8468 int count = 0; 8469 8470 if (!env) 8471 return EINVAL; 8472 if (dead) 8473 *dead = 0; 8474 if (!env->me_txns) 8475 return MDB_SUCCESS; 8476 rdrs = env->me_txns->mti_numreaders; 8477 pids = malloc((rdrs+1) * sizeof(MDB_PID_T)); 8478 if (!pids) 8479 return ENOMEM; 8480 pids[0] = 0; 8481 mr = env->me_txns->mti_readers; 8482 for (i=0; i<rdrs; i++) { 8483 if (mr[i].mr_pid && mr[i].mr_pid != env->me_pid) { 8484 pid = mr[i].mr_pid; 8485 if (mdb_pid_insert(pids, pid) == 0) { 8486 if (!mdb_reader_pid(env, Pidcheck, pid)) { 8487 LOCK_MUTEX_R(env); 8488 /* Recheck, a new process may have reused pid */ 8489 if (!mdb_reader_pid(env, Pidcheck, pid)) { 8490 for (j=i; j<rdrs; j++) 8491 if (mr[j].mr_pid == pid) { 8492 DPRINTF(("clear stale reader pid %u txn %"Z"d", 8493 (unsigned) pid, mr[j].mr_txnid)); 8494 mr[j].mr_pid = 0; 8495 count++; 8496 } 8497 } 8498 UNLOCK_MUTEX_R(env); 8499 } 8500 } 8501 } 8502 } 8503 free(pids); 8504 if (dead) 8505 *dead = count; 8506 return MDB_SUCCESS; 8507 } 8508 /** @} */ 8509