/*	$NetBSD: mdb.c,v 1.1.1.3 2018/02/06 01:53:08 christos Exp $	*/

/** @file mdb.c
 * @brief Lightning memory-mapped database library
 *
 * A Btree-based database management library modeled loosely on the
 * BerkeleyDB API, but much simplified.
 */
/*
 * Copyright 2011-2017 Howard Chu, Symas Corp.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted only as authorized by the OpenLDAP
 * Public License.
 *
 * A copy of this license is available in the file LICENSE in the
 * top-level directory of the distribution or, alternatively, at
 * <http://www.OpenLDAP.org/license.html>.
 *
 * This code is derived from btree.c written by Martin Hedenfalk.
 *
 * Copyright (c) 2009, 2010 Martin Hedenfalk <martin@bzero.se>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE 1
#endif
#if defined(__WIN64__)
#define _FILE_OFFSET_BITS 64
#endif
#ifdef _WIN32
#include <malloc.h>
#include <windows.h>
#include <wchar.h> /* get wcscpy() */

/** getpid() returns int; MinGW defines pid_t but MinGW64 typedefs it
 * as int64 which is wrong. MSVC doesn't define it at all, so just
 * don't use it.
 */
#define MDB_PID_T int
#define MDB_THR_T DWORD
#include <sys/types.h>
#include <sys/stat.h>
#ifdef __GNUC__
# include <sys/param.h>
#else
# define LITTLE_ENDIAN 1234
# define BIG_ENDIAN 4321
# define BYTE_ORDER LITTLE_ENDIAN
# ifndef SSIZE_MAX
#  define SSIZE_MAX INT_MAX
# endif
#endif
#else
#include <sys/types.h>
#include <sys/stat.h>
#define MDB_PID_T pid_t
#define MDB_THR_T pthread_t
#include <sys/param.h>
#include <sys/uio.h>
#include <sys/mman.h>
#ifdef HAVE_SYS_FILE_H
#include <sys/file.h>
#endif
#include <fcntl.h>
#endif

#if defined(__mips) && defined(__linux)
/* MIPS has cache coherency issues, requires explicit cache control */
#include <asm/cachectl.h>
extern int cacheflush(char *addr, int nbytes, int cache);
#define CACHEFLUSH(addr, bytes, cache) cacheflush(addr, bytes, cache)
#else
#define CACHEFLUSH(addr, bytes, cache)
#endif

#if defined(__linux) && !defined(MDB_FDATASYNC_WORKS)
/** fdatasync is broken on ext3/ext4fs on older kernels, see
 * description in #mdb_env_open2 comments. You can safely
 * define MDB_FDATASYNC_WORKS if this code will only be run
 * on kernels 3.6 and newer.
 */
#define BROKEN_FDATASYNC
#endif

#include <errno.h>
#include <limits.h>
#include <stddef.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#ifdef _MSC_VER
#include <io.h>
typedef SSIZE_T ssize_t;
#else
#include <unistd.h>
#endif

#if defined(__sun) || defined(ANDROID)
/* Most platforms have posix_memalign, older may only have memalign */
#define HAVE_MEMALIGN 1
#include <malloc.h>
#endif

#if !(defined(BYTE_ORDER) || defined(__BYTE_ORDER))
#include <netinet/in.h>
#include <resolv.h> /* defines BYTE_ORDER on HPUX and Solaris */
#endif

#if defined(__APPLE__) || defined (BSD) || defined(__FreeBSD_kernel__)
# define MDB_USE_POSIX_SEM 1
# define MDB_FDATASYNC fsync
#elif defined(ANDROID)
# define MDB_FDATASYNC fsync
#endif

#ifndef _WIN32
#include <pthread.h>
#include <signal.h>
#ifdef MDB_USE_POSIX_SEM
# define MDB_USE_HASH 1
#include <semaphore.h>
#else
#define MDB_USE_POSIX_MUTEX 1
#endif
#endif

#if defined(_WIN32) + defined(MDB_USE_POSIX_SEM) \
	+ defined(MDB_USE_POSIX_MUTEX) != 1
# error "Ambiguous shared-lock implementation"
#endif

#ifdef USE_VALGRIND
#include <valgrind/memcheck.h>
#define VGMEMP_CREATE(h,r,z) VALGRIND_CREATE_MEMPOOL(h,r,z)
#define VGMEMP_ALLOC(h,a,s) VALGRIND_MEMPOOL_ALLOC(h,a,s)
#define VGMEMP_FREE(h,a) VALGRIND_MEMPOOL_FREE(h,a)
#define VGMEMP_DESTROY(h) VALGRIND_DESTROY_MEMPOOL(h)
#define VGMEMP_DEFINED(a,s) VALGRIND_MAKE_MEM_DEFINED(a,s)
#else
#define VGMEMP_CREATE(h,r,z)
#define VGMEMP_ALLOC(h,a,s)
#define VGMEMP_FREE(h,a)
#define VGMEMP_DESTROY(h)
#define VGMEMP_DEFINED(a,s)
#endif

#ifndef BYTE_ORDER
# if (defined(_LITTLE_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN))
/* Solaris just defines one or the other */
#  define LITTLE_ENDIAN 1234
#  define BIG_ENDIAN 4321
#  ifdef _LITTLE_ENDIAN
#   define BYTE_ORDER LITTLE_ENDIAN
#  else
#   define BYTE_ORDER BIG_ENDIAN
#  endif
# else
#  define BYTE_ORDER __BYTE_ORDER
# endif
#endif

#ifndef LITTLE_ENDIAN
#define LITTLE_ENDIAN __LITTLE_ENDIAN
#endif
#ifndef BIG_ENDIAN
#define BIG_ENDIAN __BIG_ENDIAN
#endif

#if defined(__i386) || defined(__x86_64) || defined(_M_IX86)
#define MISALIGNED_OK 1
#endif

#include "lmdb.h"
#include "midl.h"

#if (BYTE_ORDER == LITTLE_ENDIAN) == (BYTE_ORDER == BIG_ENDIAN)
# error "Unknown or unsupported endianness (BYTE_ORDER)"
#elif (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF
# error "Two's complement, reasonably sized integer types, please"
#endif

#ifdef __GNUC__
/** Put infrequently used env functions in separate section */
# ifdef __APPLE__
#  define ESECT __attribute__ ((section("__TEXT,text_env")))
# else
#  define ESECT __attribute__ ((section("text_env")))
# endif
#else
#define ESECT
#endif

#ifdef _WIN32
#define CALL_CONV WINAPI
#else
#define CALL_CONV
#endif

/** @defgroup internal LMDB Internals
 * @{
 */
/** @defgroup compat Compatibility Macros
 * A bunch of macros to minimize the amount of platform-specific ifdefs
 * needed throughout the rest of the code. When the features this library
 * needs are similar enough to POSIX to be hidden in a one-or-two line
 * replacement, this macro approach is used.
 * @{
 */

/** Features under development */
#ifndef MDB_DEVEL
#define MDB_DEVEL 0
#endif

/** Wrapper around __func__, which is a C99 feature */
#if __STDC_VERSION__ >= 199901L
# define mdb_func_ __func__
#elif __GNUC__ >= 2 || _MSC_VER >= 1300
# define mdb_func_ __FUNCTION__
#else
/* If a debug message says <mdb_unknown>(), update the #if statements above */
# define mdb_func_ "<mdb_unknown>"
#endif

/* Internal error codes, not exposed outside liblmdb */
#define MDB_NO_ROOT (MDB_LAST_ERRCODE + 10)
#ifdef _WIN32
#define MDB_OWNERDEAD ((int) WAIT_ABANDONED)
#elif defined(MDB_USE_POSIX_MUTEX) && defined(EOWNERDEAD)
#define MDB_OWNERDEAD EOWNERDEAD /**< #LOCK_MUTEX0() result if dead owner */
#endif

#ifdef __GLIBC__
#define GLIBC_VER ((__GLIBC__ << 16 )| __GLIBC_MINOR__)
#endif
/** Some platforms define the EOWNERDEAD error code
 * even though they don't support Robust Mutexes.
 * Compile with -DMDB_USE_ROBUST=0, or use some other
 * mechanism like -DMDB_USE_POSIX_SEM instead of
 * -DMDB_USE_POSIX_MUTEX.
 * (Posix semaphores are not robust.)
 */
#ifndef MDB_USE_ROBUST
/* Android currently lacks Robust Mutex support. So does glibc < 2.4. */
# if defined(MDB_USE_POSIX_MUTEX) && (defined(ANDROID) || \
	(defined(__GLIBC__) && GLIBC_VER < 0x020004))
#  define MDB_USE_ROBUST 0
# else
#  define MDB_USE_ROBUST 1
# endif
#endif /* !MDB_USE_ROBUST */

#if defined(MDB_USE_POSIX_MUTEX) && (MDB_USE_ROBUST)
/* glibc < 2.12 only provided _np API */
# if (defined(__GLIBC__) && GLIBC_VER < 0x02000c) || \
	(defined(PTHREAD_MUTEX_ROBUST_NP) && !defined(PTHREAD_MUTEX_ROBUST))
#  define PTHREAD_MUTEX_ROBUST PTHREAD_MUTEX_ROBUST_NP
#  define pthread_mutexattr_setrobust(attr, flag) pthread_mutexattr_setrobust_np(attr, flag)
#  define pthread_mutex_consistent(mutex) pthread_mutex_consistent_np(mutex)
# endif
#endif /* MDB_USE_POSIX_MUTEX && MDB_USE_ROBUST */

#if defined(MDB_OWNERDEAD) && (MDB_USE_ROBUST)
#define MDB_ROBUST_SUPPORTED 1
#endif

#ifdef _WIN32
#define MDB_USE_HASH 1
#define MDB_PIDLOCK 0
#define THREAD_RET DWORD
#define pthread_t HANDLE
#define pthread_mutex_t HANDLE
#define pthread_cond_t HANDLE
typedef HANDLE mdb_mutex_t, mdb_mutexref_t;
#define pthread_key_t DWORD
#define pthread_self() GetCurrentThreadId()
#define pthread_key_create(x,y) \
	((*(x) = TlsAlloc()) == TLS_OUT_OF_INDEXES ? ErrCode() : 0)
#define pthread_key_delete(x) TlsFree(x)
#define pthread_getspecific(x) TlsGetValue(x)
#define pthread_setspecific(x,y) (TlsSetValue(x,y) ? 0 : ErrCode())
#define pthread_mutex_unlock(x) ReleaseMutex(*x)
#define pthread_mutex_lock(x) WaitForSingleObject(*x, INFINITE)
#define pthread_cond_signal(x) SetEvent(*x)
#define pthread_cond_wait(cond,mutex) do{SignalObjectAndWait(*mutex, *cond, INFINITE, FALSE); WaitForSingleObject(*mutex, INFINITE);}while(0)
#define THREAD_CREATE(thr,start,arg) \
	(((thr) = CreateThread(NULL, 0, start, arg, 0, NULL)) ? 0 : ErrCode())
#define THREAD_FINISH(thr) \
	(WaitForSingleObject(thr, INFINITE) ? ErrCode() : 0)
#define LOCK_MUTEX0(mutex) WaitForSingleObject(mutex, INFINITE)
#define UNLOCK_MUTEX(mutex) ReleaseMutex(mutex)
#define mdb_mutex_consistent(mutex) 0
#define getpid() GetCurrentProcessId()
#define MDB_FDATASYNC(fd) (!FlushFileBuffers(fd))
#define MDB_MSYNC(addr,len,flags) (!FlushViewOfFile(addr,len))
#define ErrCode() GetLastError()
#define GET_PAGESIZE(x) {SYSTEM_INFO si; GetSystemInfo(&si); (x) = si.dwPageSize;}
#define close(fd) (CloseHandle(fd) ? 0 : -1)
#define munmap(ptr,len) UnmapViewOfFile(ptr)
#ifdef PROCESS_QUERY_LIMITED_INFORMATION
#define MDB_PROCESS_QUERY_LIMITED_INFORMATION PROCESS_QUERY_LIMITED_INFORMATION
#else
#define MDB_PROCESS_QUERY_LIMITED_INFORMATION 0x1000
#endif
#define Z "I"
#else
#define THREAD_RET void *
#define THREAD_CREATE(thr,start,arg) pthread_create(&thr,NULL,start,arg)
#define THREAD_FINISH(thr) pthread_join(thr,NULL)
#define Z "z" /**< printf format modifier for size_t */

/** For MDB_LOCK_FORMAT: True if readers take a pid lock in the lockfile */
#define MDB_PIDLOCK 1

#ifdef MDB_USE_POSIX_SEM

typedef sem_t *mdb_mutex_t, *mdb_mutexref_t;
#define LOCK_MUTEX0(mutex) mdb_sem_wait(mutex)
#define UNLOCK_MUTEX(mutex) sem_post(mutex)

static int
mdb_sem_wait(sem_t *sem)
{
	int rc;
	while ((rc = sem_wait(sem)) && (rc = errno) == EINTR) ;
	return rc;
}

#else /* MDB_USE_POSIX_MUTEX: */
/** Shared mutex/semaphore as the original is stored.
 *
 * Not for copies. Instead it can be assigned to an #mdb_mutexref_t.
 * When mdb_mutexref_t is a pointer and mdb_mutex_t is not, then it
 * is array[size 1] so it can be assigned to the pointer.
 */
typedef pthread_mutex_t mdb_mutex_t[1];
/** Reference to an #mdb_mutex_t */
typedef pthread_mutex_t *mdb_mutexref_t;
/** Lock the reader or writer mutex.
 * Returns 0 or a code to give #mdb_mutex_failed(), as in #LOCK_MUTEX().
 */
#define LOCK_MUTEX0(mutex) pthread_mutex_lock(mutex)
/** Unlock the reader or writer mutex.
 */
#define UNLOCK_MUTEX(mutex) pthread_mutex_unlock(mutex)
/** Mark mutex-protected data as repaired, after death of previous owner.
 */
#define mdb_mutex_consistent(mutex) pthread_mutex_consistent(mutex)
#endif /* MDB_USE_POSIX_SEM */

/** Get the error code for the last failed system function.
 */
#define ErrCode() errno

/** An abstraction for a file handle.
 * On POSIX systems file handles are small integers. On Windows
 * they're opaque pointers.
 */
#define HANDLE int

/** A value for an invalid file handle.
 * Mainly used to initialize file variables and signify that they are
 * unused.
 */
#define INVALID_HANDLE_VALUE (-1)

/** Get the size of a memory page for the system.
 * This is the basic size that the platform's memory manager uses, and is
 * fundamental to the use of memory-mapped files.
 */
#define GET_PAGESIZE(x) ((x) = sysconf(_SC_PAGE_SIZE))
#endif

#if defined(_WIN32) || defined(MDB_USE_POSIX_SEM)
#define MNAME_LEN 32
#else
#define MNAME_LEN (sizeof(pthread_mutex_t))
#endif

/** @} */

#ifdef MDB_ROBUST_SUPPORTED
/** Lock mutex, handle any error, set rc = result.
 * Return 0 on success, nonzero (not rc) on error.
 */
#define LOCK_MUTEX(rc, env, mutex) \
	(((rc) = LOCK_MUTEX0(mutex)) && \
	 ((rc) = mdb_mutex_failed(env, mutex, rc)))
static int mdb_mutex_failed(MDB_env *env, mdb_mutexref_t mutex, int rc);
#else
#define LOCK_MUTEX(rc, env, mutex) ((rc) = LOCK_MUTEX0(mutex))
#define mdb_mutex_failed(env, mutex, rc) (rc)
#endif
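
/* Illustrative sketch (not part of the original source): typical use of the
 * LOCK_MUTEX()/UNLOCK_MUTEX() pair described above. On a robust build,
 * LOCK_MUTEX() routes an owner-death result through mdb_mutex_failed() and
 * returns nonzero when the lock could not be taken safely. The env/me_wmutex
 * names follow conventions used elsewhere in this file.
 *
 *	int rc;
 *	if (LOCK_MUTEX(rc, env, env->me_wmutex))
 *		return rc;
 *	... modify state protected by the writer mutex ...
 *	UNLOCK_MUTEX(env->me_wmutex);
 */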
#ifndef _WIN32
/** A flag for opening a file and requesting synchronous data writes.
 * This is only used when writing a meta page. It's not strictly needed;
 * we could just do a normal write and then immediately perform a flush.
 * But if this flag is available it saves us an extra system call.
 *
 * @note If O_DSYNC is undefined but exists in /usr/include,
 * preferably set some compiler flag to get the definition.
 */
#ifndef MDB_DSYNC
# ifdef O_DSYNC
# define MDB_DSYNC O_DSYNC
# else
# define MDB_DSYNC O_SYNC
# endif
#endif
#endif

/** Function for flushing the data of a file. Define this to fsync
 * if fdatasync() is not supported.
 */
#ifndef MDB_FDATASYNC
# define MDB_FDATASYNC fdatasync
#endif

#ifndef MDB_MSYNC
# define MDB_MSYNC(addr,len,flags) msync(addr,len,flags)
#endif

#ifndef MS_SYNC
#define MS_SYNC 1
#endif

#ifndef MS_ASYNC
#define MS_ASYNC 0
#endif

/** A page number in the database.
 * Note that 64 bit page numbers are overkill, since pages themselves
 * already represent 12-13 bits of addressable memory, and the OS will
 * always limit applications to a maximum of 63 bits of address space.
 *
 * @note In the #MDB_node structure, we only store 48 bits of this value,
 * which thus limits us to only 60 bits of addressable data.
 */
typedef MDB_ID pgno_t;

/** A transaction ID.
 * See struct MDB_txn.mt_txnid for details.
 */
typedef MDB_ID txnid_t;

/** @defgroup debug Debug Macros
 * @{
 */
#ifndef MDB_DEBUG
/** Enable debug output. Needs variable argument macros (a C99 feature).
 * Set this to 1 for copious tracing. Set to 2 to add dumps of all IDLs
 * read from and written to the database (used for free space management).
 */
#define MDB_DEBUG 0
#endif

#if MDB_DEBUG
static int mdb_debug;
static txnid_t mdb_debug_start;

/** Print a debug message with printf formatting.
 * Requires double parenthesis around 2 or more args.
 */
# define DPRINTF(args) ((void) ((mdb_debug) && DPRINTF0 args))
# define DPRINTF0(fmt, ...) \
	fprintf(stderr, "%s:%d " fmt "\n", mdb_func_, __LINE__, __VA_ARGS__)
#else
# define DPRINTF(args) ((void) 0)
#endif
/** Print a debug string.
 * The string is printed literally, with no format processing.
 */
#define DPUTS(arg) DPRINTF(("%s", arg))
/** Debugging output value of a cursor DBI: Negative in a sub-cursor. */
#define DDBI(mc) \
	(((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi)
/** @} */
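
/* Illustrative sketch (not part of the original source): because DPRINTF()
 * takes its whole argument list as one parenthesized group, calls with a
 * format string plus arguments need double parentheses, as noted above.
 * The cursor mc and value key below are hypothetical; any in-scope names
 * would do.
 *
 *	DPUTS("entering put");
 *	DPRINTF(("db %d, key size %"Z"u", DDBI(mc), key->mv_size));
 */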
/** @brief The maximum size of a database page.
 *
 * It is 32k or 64k, since value-PAGEBASE must fit in
 * #MDB_page.%mp_upper.
 *
 * LMDB will use database pages < OS pages if needed.
 * That causes more I/O in write transactions: The OS must
 * know (read) the whole page before writing a partial page.
 *
 * Note that we don't currently support Huge pages. On Linux,
 * regular data files cannot use Huge pages, and in general
 * Huge pages aren't actually pageable. We rely on the OS
 * demand-pager to read our data and page it out when memory
 * pressure from other processes is high. So until OSs have
 * actual paging support for Huge pages, they're not viable.
 */
#define MAX_PAGESIZE (PAGEBASE ? 0x10000 : 0x8000)

/** The minimum number of keys required in a database page.
 * Setting this to a larger value will place a smaller bound on the
 * maximum size of a data item. Data items larger than this size will
 * be pushed into overflow pages instead of being stored directly in
 * the B-tree node. This value used to default to 4. With a page size
 * of 4096 bytes that meant that any item larger than 1024 bytes would
 * go into an overflow page. That also meant that on average 2-3KB of
 * each overflow page was wasted space. The value cannot be lower than
 * 2 because then there would no longer be a tree structure. With this
 * value, items larger than 2KB will go into overflow pages, and on
 * average only 1KB will be wasted.
 */
#define MDB_MINKEYS 2

/** A stamp that identifies a file as an LMDB file.
 * There's nothing special about this value other than that it is easily
 * recognizable, and it will reflect any byte order mismatches.
 */
#define MDB_MAGIC 0xBEEFC0DE

/** The version number for a database's datafile format. */
#define MDB_DATA_VERSION ((MDB_DEVEL) ? 999 : 1)
/** The version number for a database's lockfile format. */
#define MDB_LOCK_VERSION 1

/** @brief The max size of a key we can write, or 0 for computed max.
 *
 * This macro should normally be left alone or set to 0.
 * Note that a database with big keys or dupsort data cannot be
 * reliably modified by a liblmdb which uses a smaller max.
 * The default is 511 for backwards compat, or 0 when #MDB_DEVEL.
 *
 * Other values are allowed, for backwards compat. However:
 * A value bigger than the computed max can break if you do not
 * know what you are doing, and liblmdb <= 0.9.10 can break when
 * modifying a DB with keys/dupsort data bigger than its max.
 *
 * Data items in an #MDB_DUPSORT database are also limited to
 * this size, since they're actually keys of a sub-DB. Keys and
 * #MDB_DUPSORT data items must fit on a node in a regular page.
 */
#ifndef MDB_MAXKEYSIZE
#define MDB_MAXKEYSIZE ((MDB_DEVEL) ? 0 : 511)
#endif

/** The maximum size of a key we can write to the environment. */
#if MDB_MAXKEYSIZE
#define ENV_MAXKEY(env) (MDB_MAXKEYSIZE)
#else
#define ENV_MAXKEY(env) ((env)->me_maxkey)
#endif

/** @brief The maximum size of a data item.
 *
 * We only store a 32 bit value for node sizes.
 */
#define MAXDATASIZE 0xffffffffUL

#if MDB_DEBUG
/** Key size which fits in a #DKBUF.
 * @ingroup debug
 */
#define DKBUF_MAXKEYSIZE ((MDB_MAXKEYSIZE) > 0 ? (MDB_MAXKEYSIZE) : 511)
/** A key buffer.
 * @ingroup debug
 * This is used for printing a hex dump of a key's contents.
 */
#define DKBUF char kbuf[DKBUF_MAXKEYSIZE*2+1]
/** Display a key in hex.
 * @ingroup debug
 * Invoke a function to display a key in hex.
 */
#define DKEY(x) mdb_dkey(x, kbuf)
#else
#define DKBUF
#define DKEY(x) 0
#endif

/** An invalid page number.
 * Mainly used to denote an empty tree.
 */
#define P_INVALID (~(pgno_t)0)

/** Test if the flags \b f are set in a flag word \b w. */
#define F_ISSET(w, f) (((w) & (f)) == (f))

/** Round \b n up to an even number. */
#define EVEN(n) (((n) + 1U) & -2) /* sign-extending -2 to match n+1U */

/** Used for offsets within a single page.
 * Since memory pages are typically 4 or 8KB in size, 12-13 bits,
 * this is plenty.
 */
typedef uint16_t indx_t;

/** Default size of memory map.
 * This is certainly too small for any actual applications. Apps should always set
 * the size explicitly using #mdb_env_set_mapsize().
 */
#define DEFAULT_MAPSIZE 1048576

/** @defgroup readers Reader Lock Table
 * Readers don't acquire any locks for their data access. Instead, they
 * simply record their transaction ID in the reader table. The reader
 * mutex is needed just to find an empty slot in the reader table. The
 * slot's address is saved in thread-specific data so that subsequent read
 * transactions started by the same thread need no further locking to proceed.
 *
 * If #MDB_NOTLS is set, the slot address is not saved in thread-specific data.
 *
 * No reader table is used if the database is on a read-only filesystem, or
 * if #MDB_NOLOCK is set.
 *
 * Since the database uses multi-version concurrency control, readers don't
 * actually need any locking. This table is used to keep track of which
 * readers are using data from which old transactions, so that we'll know
 * when a particular old transaction is no longer in use. Old transactions
 * that have discarded any data pages can then have those pages reclaimed
 * for use by a later write transaction.
 *
 * The lock table is constructed such that reader slots are aligned with the
 * processor's cache line size. Any slot is only ever used by one thread.
 * This alignment guarantees that there will be no contention or cache
 * thrashing as threads update their own slot info, and also eliminates
 * any need for locking when accessing a slot.
 *
 * A writer thread will scan every slot in the table to determine the oldest
 * outstanding reader transaction. Any freed pages older than this will be
 * reclaimed by the writer. The writer doesn't use any locks when scanning
 * this table. This means that there's no guarantee that the writer will
 * see the most up-to-date reader info, but that's not required for correct
 * operation - all we need is to know the upper bound on the oldest reader,
 * we don't care at all about the newest reader. So the only consequence of
 * reading stale information here is that old pages might hang around a
 * while longer before being reclaimed. That's actually good anyway, because
 * the longer we delay reclaiming old pages, the more likely it is that a
 * string of contiguous pages can be found after coalescing old pages from
 * many old transactions together.
 * @{
 */
/** Number of slots in the reader table.
 * This value was chosen somewhat arbitrarily. 126 readers plus a
 * couple mutexes fit exactly into 8KB on my development machine.
 * Applications should set the table size using #mdb_env_set_maxreaders().
 */
#define DEFAULT_READERS 126
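
/* Illustrative sketch (not part of the original source): the writer's
 * oldest-reader scan described above amounts to taking the minimum of the
 * live mr_txnid slots (MDB_reader is defined below). Freed pages older than
 * that txnid are safe to reclaim. A minimal form, ignoring any refinements
 * the library's own scan applies:
 *
 *	txnid_t oldest = env->me_txn->mt_txnid - 1;
 *	MDB_reader *r = env->me_txns->mti_readers;
 *	unsigned i, nr = env->me_txns->mti_numreaders;
 *	for (i = 0; i < nr; i++)
 *		if (r[i].mr_pid && r[i].mr_txnid < oldest)
 *			oldest = r[i].mr_txnid;
 */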
/** The size of a CPU cache line in bytes. We want our lock structures
 * aligned to this size to avoid false cache line sharing in the
 * lock table.
 * This value works for most CPUs. For Itanium this should be 128.
 */
#ifndef CACHELINE
#define CACHELINE 64
#endif

/** The information we store in a single slot of the reader table.
 * In addition to a transaction ID, we also record the process and
 * thread ID that owns a slot, so that we can detect stale information,
 * e.g. threads or processes that went away without cleaning up.
 * @note We currently don't check for stale records. We simply re-init
 * the table when we know that we're the only process opening the
 * lock file.
 */
typedef struct MDB_rxbody {
	/** Current Transaction ID when this transaction began, or (txnid_t)-1.
	 * Multiple readers that start at the same time will probably have the
	 * same ID here. Again, it's not important to exclude them from
	 * anything; all we need to know is which version of the DB they
	 * started from so we can avoid overwriting any data used in that
	 * particular version.
	 */
	volatile txnid_t mrb_txnid;
	/** The process ID of the process owning this reader txn. */
	volatile MDB_PID_T mrb_pid;
	/** The thread ID of the thread owning this txn. */
	volatile MDB_THR_T mrb_tid;
} MDB_rxbody;

/** The actual reader record, with cacheline padding. */
typedef struct MDB_reader {
	union {
		MDB_rxbody mrx;
		/** shorthand for mrb_txnid */
#define mr_txnid mru.mrx.mrb_txnid
#define mr_pid mru.mrx.mrb_pid
#define mr_tid mru.mrx.mrb_tid
		/** cache line alignment */
		char pad[(sizeof(MDB_rxbody)+CACHELINE-1) & ~(CACHELINE-1)];
	} mru;
} MDB_reader;

/** The header for the reader table.
 * The table resides in a memory-mapped file. (This is a different file
 * than is used for the main database.)
 *
 * For POSIX the actual mutexes reside in the shared memory of this
 * mapped file. On Windows, mutexes are named objects allocated by the
 * kernel; we store the mutex names in this mapped file so that other
 * processes can grab them. This same approach is also used on
 * MacOSX/Darwin (using named semaphores) since MacOSX doesn't support
 * process-shared POSIX mutexes. For these cases where a named object
 * is used, the object name is derived from a 64 bit FNV hash of the
 * environment pathname. As such, naming collisions are extremely
 * unlikely. If a collision occurs, the results are unpredictable.
 */
typedef struct MDB_txbody {
	/** Stamp identifying this as an LMDB file. It must be set
	 * to #MDB_MAGIC. */
	uint32_t mtb_magic;
	/** Format of this lock file. Must be set to #MDB_LOCK_FORMAT. */
	uint32_t mtb_format;
#if defined(_WIN32) || defined(MDB_USE_POSIX_SEM)
	char mtb_rmname[MNAME_LEN];
#else
	/** Mutex protecting access to this table.
	 * This is the reader table lock used with LOCK_MUTEX().
	 */
	mdb_mutex_t mtb_rmutex;
#endif
	/** The ID of the last transaction committed to the database.
	 * This is recorded here only for convenience; the value can always
	 * be determined by reading the main database meta pages.
	 */
	volatile txnid_t mtb_txnid;
	/** The number of slots that have been used in the reader table.
	 * This always records the maximum count, it is not decremented
	 * when readers release their slots.
	 */
	volatile unsigned mtb_numreaders;
} MDB_txbody;

/** The actual reader table definition. */
typedef struct MDB_txninfo {
	union {
		MDB_txbody mtb;
#define mti_magic mt1.mtb.mtb_magic
#define mti_format mt1.mtb.mtb_format
#define mti_rmutex mt1.mtb.mtb_rmutex
#define mti_rmname mt1.mtb.mtb_rmname
#define mti_txnid mt1.mtb.mtb_txnid
#define mti_numreaders mt1.mtb.mtb_numreaders
		char pad[(sizeof(MDB_txbody)+CACHELINE-1) & ~(CACHELINE-1)];
	} mt1;
	union {
#if defined(_WIN32) || defined(MDB_USE_POSIX_SEM)
		char mt2_wmname[MNAME_LEN];
#define mti_wmname mt2.mt2_wmname
#else
		mdb_mutex_t mt2_wmutex;
#define mti_wmutex mt2.mt2_wmutex
#endif
		char pad[(MNAME_LEN+CACHELINE-1) & ~(CACHELINE-1)];
	} mt2;
	MDB_reader mti_readers[1];
} MDB_txninfo;

/** Lockfile format signature: version, features and field layout */
#define MDB_LOCK_FORMAT \
	((uint32_t) \
	 ((MDB_LOCK_VERSION) \
	  /* Flags which describe functionality */ \
	  + (((MDB_PIDLOCK) != 0) << 16)))
/** @} */

/** Common header for all page types. The page type depends on #mp_flags.
 *
 * #P_BRANCH and #P_LEAF pages have unsorted '#MDB_node's at the end, with
 * sorted #mp_ptrs[] entries referring to them. Exception: #P_LEAF2 pages
 * omit mp_ptrs and pack sorted #MDB_DUPFIXED values after the page header.
 *
 * #P_OVERFLOW records occupy one or more contiguous pages where only the
 * first has a page header. They hold the real data of #F_BIGDATA nodes.
 *
 * #P_SUBP sub-pages are small leaf "pages" with duplicate data.
 * A node with flag #F_DUPDATA but not #F_SUBDATA contains a sub-page.
 * (Duplicate data can also go in sub-databases, which use normal pages.)
 *
 * #P_META pages contain #MDB_meta, the start point of an LMDB snapshot.
 *
 * Each non-metapage up to #MDB_meta.%mm_last_pg is reachable exactly once
 * in the snapshot: Either used by a database or listed in a freeDB record.
 */
typedef struct MDB_page {
#define mp_pgno mp_p.p_pgno
#define mp_next mp_p.p_next
	union {
		pgno_t p_pgno; /**< page number */
		struct MDB_page *p_next; /**< for in-memory list of freed pages */
	} mp_p;
	uint16_t mp_pad; /**< key size if this is a LEAF2 page */
/** @defgroup mdb_page Page Flags
 * @ingroup internal
 * Flags for the page headers.
 * @{
 */
#define P_BRANCH 0x01 /**< branch page */
#define P_LEAF 0x02 /**< leaf page */
#define P_OVERFLOW 0x04 /**< overflow page */
#define P_META 0x08 /**< meta page */
#define P_DIRTY 0x10 /**< dirty page, also set for #P_SUBP pages */
#define P_LEAF2 0x20 /**< for #MDB_DUPFIXED records */
#define P_SUBP 0x40 /**< for #MDB_DUPSORT sub-pages */
#define P_LOOSE 0x4000 /**< page was dirtied then freed, can be reused */
#define P_KEEP 0x8000 /**< leave this page alone during spill */
/** @} */
	uint16_t mp_flags; /**< @ref mdb_page */
#define mp_lower mp_pb.pb.pb_lower
#define mp_upper mp_pb.pb.pb_upper
#define mp_pages mp_pb.pb_pages
	union {
		struct {
			indx_t pb_lower; /**< lower bound of free space */
			indx_t pb_upper; /**< upper bound of free space */
		} pb;
		uint32_t pb_pages; /**< number of overflow pages */
	} mp_pb;
	indx_t mp_ptrs[1]; /**< dynamic size */
} MDB_page;

/** Size of the page header, excluding dynamic data at the end */
#define PAGEHDRSZ ((unsigned) offsetof(MDB_page, mp_ptrs))

/** Address of first usable data byte in a page, after the header */
#define METADATA(p) ((void *)((char *)(p) + PAGEHDRSZ))

/** ITS#7713, change PAGEBASE to handle 65536 byte pages */
#define PAGEBASE ((MDB_DEVEL) ? PAGEHDRSZ : 0)

/** Number of nodes on a page */
#define NUMKEYS(p) (((p)->mp_lower - (PAGEHDRSZ-PAGEBASE)) >> 1)

/** The amount of space remaining in the page */
#define SIZELEFT(p) (indx_t)((p)->mp_upper - (p)->mp_lower)

/** The percentage of space used in the page, in tenths of a percent. */
#define PAGEFILL(env, p) (1000L * ((env)->me_psize - PAGEHDRSZ - SIZELEFT(p)) / \
				((env)->me_psize - PAGEHDRSZ))
/** The minimum page fill factor, in tenths of a percent.
 * Pages emptier than this are candidates for merging.
 */
#define FILL_THRESHOLD 250

/** Test if a page is a leaf page */
#define IS_LEAF(p) F_ISSET((p)->mp_flags, P_LEAF)
/** Test if a page is a LEAF2 page */
#define IS_LEAF2(p) F_ISSET((p)->mp_flags, P_LEAF2)
/** Test if a page is a branch page */
#define IS_BRANCH(p) F_ISSET((p)->mp_flags, P_BRANCH)
/** Test if a page is an overflow page */
#define IS_OVERFLOW(p) F_ISSET((p)->mp_flags, P_OVERFLOW)
/** Test if a page is a sub page */
#define IS_SUBP(p) F_ISSET((p)->mp_flags, P_SUBP)

/** The number of overflow pages needed to store the given size. */
#define OVPAGES(size, psize) ((PAGEHDRSZ-1 + (size)) / (psize) + 1)

/** Link in #MDB_txn.%mt_loose_pgs list.
 * Kept outside the page header, which is needed when reusing the page.
 */
#define NEXT_LOOSE_PAGE(p) (*(MDB_page **)((p) + 2))
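
/* Illustrative sketch (not part of the original source): how the macros above
 * relate to the page layout. mp_ptrs[] grows upward from the header while
 * node bodies grow downward from the end of the page, so [mp_lower, mp_upper)
 * is the free gap in the middle. On a freshly initialized page in a
 * non-MDB_DEVEL build (PAGEBASE == 0) one would expect roughly:
 *
 *	p->mp_lower = PAGEHDRSZ;     // NUMKEYS(p) == 0
 *	p->mp_upper = env->me_psize; // SIZELEFT(p) == me_psize - PAGEHDRSZ
 *
 * Appending a node adds one 2-byte mp_ptrs[] slot (mp_lower += 2, so
 * NUMKEYS() grows by one) and moves mp_upper down by the node's size.
 */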
/** Header for a single key/data pair within a page.
 * Used in pages of type #P_BRANCH and #P_LEAF without #P_LEAF2.
 * We guarantee 2-byte alignment for 'MDB_node's.
 *
 * #mn_lo and #mn_hi are used for data size on leaf nodes, and for child
 * pgno on branch nodes. On 64 bit platforms, #mn_flags is also used
 * for pgno. (Branch nodes have no flags). Lo and hi are in host byte
 * order in case some accesses can be optimized to 32-bit word access.
 *
 * Leaf node flags describe node contents. #F_BIGDATA says the node's
 * data part is the page number of an overflow page with actual data.
 * #F_DUPDATA and #F_SUBDATA can be combined giving duplicate data in
 * a sub-page/sub-database, and named databases (just #F_SUBDATA).
 */
typedef struct MDB_node {
	/** part of data size or pgno
	 * @{ */
#if BYTE_ORDER == LITTLE_ENDIAN
	unsigned short mn_lo, mn_hi;
#else
	unsigned short mn_hi, mn_lo;
#endif
	/** @} */
/** @defgroup mdb_node Node Flags
 * @ingroup internal
 * Flags for node headers.
 * @{
 */
#define F_BIGDATA 0x01 /**< data put on overflow page */
#define F_SUBDATA 0x02 /**< data is a sub-database */
#define F_DUPDATA 0x04 /**< data has duplicates */

/** valid flags for #mdb_node_add() */
#define NODE_ADD_FLAGS (F_DUPDATA|F_SUBDATA|MDB_RESERVE|MDB_APPEND)

/** @} */
	unsigned short mn_flags; /**< @ref mdb_node */
	unsigned short mn_ksize; /**< key size */
	char mn_data[1]; /**< key and data are appended here */
} MDB_node;

/** Size of the node header, excluding dynamic data at the end */
#define NODESIZE offsetof(MDB_node, mn_data)

/** Bit position of top word in page number, for shifting mn_flags */
#define PGNO_TOPWORD ((pgno_t)-1 > 0xffffffffu ? 32 : 0)

/** Size of a node in a branch page with a given key.
 * This is just the node header plus the key, there is no data.
 */
#define INDXSIZE(k) (NODESIZE + ((k) == NULL ? 0 : (k)->mv_size))

/** Size of a node in a leaf page with a given key and data.
 * This is node header plus key plus data size.
 */
#define LEAFSIZE(k, d) (NODESIZE + (k)->mv_size + (d)->mv_size)

/** Address of node \b i in page \b p */
#define NODEPTR(p, i) ((MDB_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEBASE))

/** Address of the key for the node */
#define NODEKEY(node) (void *)((node)->mn_data)

/** Address of the data for a node */
#define NODEDATA(node) (void *)((char *)(node)->mn_data + (node)->mn_ksize)

/** Get the page number pointed to by a branch node */
#define NODEPGNO(node) \
	((node)->mn_lo | ((pgno_t) (node)->mn_hi << 16) | \
	 (PGNO_TOPWORD ? ((pgno_t) (node)->mn_flags << PGNO_TOPWORD) : 0))
/** Set the page number in a branch node */
#define SETPGNO(node,pgno) do { \
	(node)->mn_lo = (pgno) & 0xffff; (node)->mn_hi = (pgno) >> 16; \
	if (PGNO_TOPWORD) (node)->mn_flags = (pgno) >> PGNO_TOPWORD; } while(0)

/** Get the size of the data in a leaf node */
#define NODEDSZ(node) ((node)->mn_lo | ((unsigned)(node)->mn_hi << 16))
/** Set the size of the data for a leaf node */
#define SETDSZ(node,size) do { \
	(node)->mn_lo = (size) & 0xffff; (node)->mn_hi = (size) >> 16;} while(0)
/** The size of a key in a node */
#define NODEKSZ(node) ((node)->mn_ksize)

/** Copy a page number from src to dst */
#ifdef MISALIGNED_OK
#define COPY_PGNO(dst,src) dst = src
#else
#if SIZE_MAX > 4294967295UL
#define COPY_PGNO(dst,src) do { \
	unsigned short *s, *d; \
	s = (unsigned short *)&(src); \
	d = (unsigned short *)&(dst); \
	*d++ = *s++; \
	*d++ = *s++; \
	*d++ = *s++; \
	*d = *s; \
} while (0)
#else
#define COPY_PGNO(dst,src) do { \
	unsigned short *s, *d; \
	s = (unsigned short *)&(src); \
	d = (unsigned short *)&(dst); \
	*d++ = *s++; \
	*d = *s; \
} while (0)
#endif
#endif
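
/* Illustrative sketch (not part of the original source): a worked example of
 * the 48-bit branch-page pgno packing done by SETPGNO()/NODEPGNO() above.
 * On a 64-bit build (PGNO_TOPWORD == 32), pgno 0x123456789A is stored as
 *
 *	node->mn_lo    = 0x789A;  // bits  0..15
 *	node->mn_hi    = 0x3456;  // bits 16..31
 *	node->mn_flags = 0x0012;  // bits 32..47 (branch nodes have no flags)
 *
 * and NODEPGNO() reassembles 0x789A | (0x3456 << 16) | (0x12 << 32).
 * On a 32-bit build PGNO_TOPWORD is 0 and mn_flags is left untouched.
 */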
/** The address of a key in a LEAF2 page.
 * LEAF2 pages are used for #MDB_DUPFIXED sorted-duplicate sub-DBs.
 * There are no node headers, keys are stored contiguously.
 */
#define LEAF2KEY(p, i, ks) ((char *)(p) + PAGEHDRSZ + ((i)*(ks)))

/** Set the \b node's key into \b keyptr, if requested. */
#define MDB_GET_KEY(node, keyptr) { if ((keyptr) != NULL) { \
	(keyptr)->mv_size = NODEKSZ(node); (keyptr)->mv_data = NODEKEY(node); } }

/** Set the \b node's key into \b key. */
#define MDB_GET_KEY2(node, key) { key.mv_size = NODEKSZ(node); key.mv_data = NODEKEY(node); }

/** Information about a single database in the environment. */
typedef struct MDB_db {
	uint32_t md_pad; /**< also ksize for LEAF2 pages */
	uint16_t md_flags; /**< @ref mdb_dbi_open */
	uint16_t md_depth; /**< depth of this tree */
	pgno_t md_branch_pages; /**< number of internal pages */
	pgno_t md_leaf_pages; /**< number of leaf pages */
	pgno_t md_overflow_pages; /**< number of overflow pages */
	size_t md_entries; /**< number of data items */
	pgno_t md_root; /**< the root page of this tree */
} MDB_db;

#define MDB_VALID 0x8000 /**< DB handle is valid, for me_dbflags */
#define PERSISTENT_FLAGS (0xffff & ~(MDB_VALID))
/** #mdb_dbi_open() flags */
#define VALID_FLAGS (MDB_REVERSEKEY|MDB_DUPSORT|MDB_INTEGERKEY|MDB_DUPFIXED|\
	MDB_INTEGERDUP|MDB_REVERSEDUP|MDB_CREATE)

/** Handle for the DB used to track free pages. */
#define FREE_DBI 0
/** Handle for the default DB. */
#define MAIN_DBI 1
/** Number of DBs in metapage (free and main) - also hardcoded elsewhere */
#define CORE_DBS 2

/** Number of meta pages - also hardcoded elsewhere */
#define NUM_METAS 2

/** Meta page content.
 * A meta page is the start point for accessing a database snapshot.
 * Pages 0-1 are meta pages. Transaction N writes meta page #(N % 2).
 */
typedef struct MDB_meta {
	/** Stamp identifying this as an LMDB file. It must be set
	 * to #MDB_MAGIC. */
	uint32_t mm_magic;
	/** Version number of this file. Must be set to #MDB_DATA_VERSION. */
	uint32_t mm_version;
	void *mm_address; /**< address for fixed mapping */
	size_t mm_mapsize; /**< size of mmap region */
	MDB_db mm_dbs[CORE_DBS]; /**< first is free space, 2nd is main db */
	/** The size of pages used in this DB */
#define mm_psize mm_dbs[FREE_DBI].md_pad
	/** Any persistent environment flags. @ref mdb_env */
#define mm_flags mm_dbs[FREE_DBI].md_flags
	/** Last used page in the datafile.
	 * Actually the file may be shorter if the freeDB lists the final pages.
	 */
	pgno_t mm_last_pg;
	volatile txnid_t mm_txnid; /**< txnid that committed this page */
} MDB_meta;

/** Buffer for a stack-allocated meta page.
 * The members define size and alignment, and silence type
 * aliasing warnings. They are not used directly; that could
 * mean incorrectly using several union members in parallel.
 */
typedef union MDB_metabuf {
	MDB_page mb_page;
	struct {
		char mm_pad[PAGEHDRSZ];
		MDB_meta mm_meta;
	} mb_metabuf;
} MDB_metabuf;
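
/* Illustrative sketch (not part of the original source): the double-buffered
 * meta pages described above. Since transaction N writes meta page N % 2,
 * the latest durable snapshot is simply the meta page with the larger
 * mm_txnid, roughly what one would expect mdb_env_pick_meta() (declared
 * further below) to compute:
 *
 *	MDB_meta *m0 = env->me_metas[0], *m1 = env->me_metas[1];
 *	MDB_meta *newest = (m0->mm_txnid > m1->mm_txnid) ? m0 : m1;
 *
 * A failed write to one meta page leaves the other, older snapshot intact.
 */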
/** Auxiliary DB info.
 * The information here is mostly static/read-only. There is
 * only a single copy of this record in the environment.
 */
typedef struct MDB_dbx {
	MDB_val md_name; /**< name of the database */
	MDB_cmp_func *md_cmp; /**< function for comparing keys */
	MDB_cmp_func *md_dcmp; /**< function for comparing data items */
	MDB_rel_func *md_rel; /**< user relocate function */
	void *md_relctx; /**< user-provided context for md_rel */
} MDB_dbx;

/** A database transaction.
 * Every operation requires a transaction handle.
 */
struct MDB_txn {
	MDB_txn *mt_parent; /**< parent of a nested txn */
	/** Nested txn under this txn, set together with flag #MDB_TXN_HAS_CHILD */
	MDB_txn *mt_child;
	pgno_t mt_next_pgno; /**< next unallocated page */
	/** The ID of this transaction. IDs are integers incrementing from 1.
	 * Only committed write transactions increment the ID. If a transaction
	 * aborts, the ID may be re-used by the next writer.
	 */
	txnid_t mt_txnid;
	MDB_env *mt_env; /**< the DB environment */
	/** The list of pages that became unused during this transaction.
	 */
	MDB_IDL mt_free_pgs;
	/** The list of loose pages that became unused and may be reused
	 * in this transaction, linked through #NEXT_LOOSE_PAGE(page).
	 */
	MDB_page *mt_loose_pgs;
	/** Number of loose pages (#mt_loose_pgs) */
	int mt_loose_count;
	/** The sorted list of dirty pages we temporarily wrote to disk
	 * because the dirty list was full. page numbers in here are
	 * shifted left by 1, deleted slots have the LSB set.
	 */
	MDB_IDL mt_spill_pgs;
	union {
		/** For write txns: Modified pages. Sorted when not MDB_WRITEMAP. */
		MDB_ID2L dirty_list;
		/** For read txns: This thread/txn's reader table slot, or NULL. */
		MDB_reader *reader;
	} mt_u;
	/** Array of records for each DB known in the environment. */
	MDB_dbx *mt_dbxs;
	/** Array of MDB_db records for each known DB */
	MDB_db *mt_dbs;
	/** Array of sequence numbers for each DB handle */
	unsigned int *mt_dbiseqs;
/** @defgroup mt_dbflag Transaction DB Flags
 * @ingroup internal
 * @{
 */
#define DB_DIRTY 0x01 /**< DB was written in this txn */
#define DB_STALE 0x02 /**< Named-DB record is older than txnID */
#define DB_NEW 0x04 /**< Named-DB handle opened in this txn */
#define DB_VALID 0x08 /**< DB handle is valid, see also #MDB_VALID */
#define DB_USRVALID 0x10 /**< As #DB_VALID, but not set for #FREE_DBI */
#define DB_DUPDATA 0x20 /**< DB is #MDB_DUPSORT data */
/** @} */
	/** In write txns, array of cursors for each DB */
	MDB_cursor **mt_cursors;
	/** Array of flags for each DB */
	unsigned char *mt_dbflags;
	/** Number of DB records in use, or 0 when the txn is finished.
	 * This number only ever increments until the txn finishes; we
	 * don't decrement it when individual DB handles are closed.
	 */
	MDB_dbi mt_numdbs;

/** @defgroup mdb_txn Transaction Flags
 * @ingroup internal
 * @{
 */
	/** #mdb_txn_begin() flags */
#define MDB_TXN_BEGIN_FLAGS MDB_RDONLY
#define MDB_TXN_RDONLY MDB_RDONLY /**< read-only transaction */
	/* internal txn flags */
#define MDB_TXN_WRITEMAP MDB_WRITEMAP /**< copy of #MDB_env flag in writers */
#define MDB_TXN_FINISHED 0x01 /**< txn is finished or never began */
#define MDB_TXN_ERROR 0x02 /**< txn is unusable after an error */
#define MDB_TXN_DIRTY 0x04 /**< must write, even if dirty list is empty */
#define MDB_TXN_SPILLS 0x08 /**< txn or a parent has spilled pages */
#define MDB_TXN_HAS_CHILD 0x10 /**< txn has an #MDB_txn.%mt_child */
	/** most operations on the txn are currently illegal */
#define MDB_TXN_BLOCKED (MDB_TXN_FINISHED|MDB_TXN_ERROR|MDB_TXN_HAS_CHILD)
/** @} */
	unsigned int mt_flags; /**< @ref mdb_txn */
	/** #dirty_list room: Array size - \#dirty pages visible to this txn.
	 * Includes ancestor txns' dirty pages not hidden by other txns'
	 * dirty/spilled pages. Thus commit(nested txn) has room to merge
	 * dirty_list into mt_parent after freeing hidden mt_parent pages.
	 */
	unsigned int mt_dirty_room;
};

/** Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty.
 * At 4 keys per node, enough for 2^64 nodes, so there's probably no need to
 * raise this on a 64 bit machine.
 */
#define CURSOR_STACK 32

struct MDB_xcursor;

/** Cursors are used for all DB operations.
 * A cursor holds a path of (page pointer, key index) from the DB
 * root to a position in the DB, plus other state. #MDB_DUPSORT
 * cursors include an xcursor to the current data item. Write txns
 * track their cursors and keep them up to date when data moves.
 * Exception: An xcursor's pointer to a #P_SUBP page can be stale.
 * (A node with #F_DUPDATA but no #F_SUBDATA contains a subpage).
 */
struct MDB_cursor {
	/** Next cursor on this DB in this txn */
	MDB_cursor *mc_next;
	/** Backup of the original cursor if this cursor is a shadow */
	MDB_cursor *mc_backup;
	/** Context used for databases with #MDB_DUPSORT, otherwise NULL */
	struct MDB_xcursor *mc_xcursor;
	/** The transaction that owns this cursor */
	MDB_txn *mc_txn;
	/** The database handle this cursor operates on */
	MDB_dbi mc_dbi;
	/** The database record for this cursor */
	MDB_db *mc_db;
	/** The database auxiliary record for this cursor */
	MDB_dbx *mc_dbx;
	/** The @ref mt_dbflag for this database */
	unsigned char *mc_dbflag;
	unsigned short mc_snum; /**< number of pushed pages */
	unsigned short mc_top; /**< index of top page, normally mc_snum-1 */
/** @defgroup mdb_cursor Cursor Flags
 * @ingroup internal
 * Cursor state flags.
 * @{
 */
#define C_INITIALIZED 0x01 /**< cursor has been initialized and is valid */
#define C_EOF 0x02 /**< No more data */
#define C_SUB 0x04 /**< Cursor is a sub-cursor */
#define C_DEL 0x08 /**< last op was a cursor_del */
#define C_UNTRACK 0x40 /**< Un-track cursor when closing */
/** @} */
	unsigned int mc_flags; /**< @ref mdb_cursor */
	MDB_page *mc_pg[CURSOR_STACK]; /**< stack of pushed pages */
	indx_t mc_ki[CURSOR_STACK]; /**< stack of page indices */
};

/** Context for sorted-dup records.
 * We could have gone to a fully recursive design, with arbitrarily
 * deep nesting of sub-databases. But for now we only handle these
 * levels - main DB, optional sub-DB, sorted-duplicate DB.
 */
typedef struct MDB_xcursor {
	/** A sub-cursor for traversing the Dup DB */
	MDB_cursor mx_cursor;
	/** The database record for this Dup DB */
	MDB_db mx_db;
	/** The auxiliary DB record for this Dup DB */
	MDB_dbx mx_dbx;
	/** The @ref mt_dbflag for this Dup DB */
	unsigned char mx_dbflag;
} MDB_xcursor;

/** Check if there is an inited xcursor, so #XCURSOR_REFRESH() is proper */
#define XCURSOR_INITED(mc) \
	((mc)->mc_xcursor && ((mc)->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))

/** Update sub-page pointer, if any, in \b mc->mc_xcursor. Needed
 * when the node which contains the sub-page may have moved. Called
 * with \b mp = mc->mc_pg[mc->mc_top], \b ki = mc->mc_ki[mc->mc_top].
 */
#define XCURSOR_REFRESH(mc, mp, ki) do { \
	MDB_page *xr_pg = (mp); \
	MDB_node *xr_node = NODEPTR(xr_pg, ki); \
	if ((xr_node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) \
		(mc)->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(xr_node); \
} while (0)

/** State of FreeDB old pages, stored in the MDB_env */
typedef struct MDB_pgstate {
	pgno_t *mf_pghead; /**< Reclaimed freeDB pages, or NULL before use */
	txnid_t mf_pglast; /**< ID of last used record, or 0 if !mf_pghead */
} MDB_pgstate;

/** The database environment. */
struct MDB_env {
	HANDLE me_fd; /**< The main data file */
	HANDLE me_lfd; /**< The lock file */
	HANDLE me_mfd; /**< For writing and syncing the meta pages */
	/** Failed to update the meta page. Probably an I/O error. */
#define MDB_FATAL_ERROR 0x80000000U
	/** Some fields are initialized. */
#define MDB_ENV_ACTIVE 0x20000000U
	/** me_txkey is set */
#define MDB_ENV_TXKEY 0x10000000U
	/** fdatasync is unreliable */
#define MDB_FSYNCONLY 0x08000000U
	uint32_t me_flags; /**< @ref mdb_env */
	unsigned int me_psize; /**< DB page size, inited from me_os_psize */
	unsigned int me_os_psize; /**< OS page size, from #GET_PAGESIZE */
	unsigned int me_maxreaders; /**< size of the reader table */
	/** Max #MDB_txninfo.%mti_numreaders of interest to #mdb_env_close() */
	volatile int me_close_readers;
	MDB_dbi me_numdbs; /**< number of DBs opened */
	MDB_dbi me_maxdbs; /**< size of the DB table */
	MDB_PID_T me_pid; /**< process ID of this env */
	char *me_path; /**< path to the DB files */
	char *me_map; /**< the memory map of the data file */
	MDB_txninfo *me_txns; /**< the memory map of the lock file or NULL */
	MDB_meta *me_metas[NUM_METAS]; /**< pointers to the two meta pages */
	void *me_pbuf; /**< scratch area for DUPSORT put() */
	MDB_txn *me_txn; /**< current write transaction */
	MDB_txn *me_txn0; /**< prealloc'd write transaction */
	size_t me_mapsize; /**< size of the data memory map */
	off_t me_size; /**< current file size */
	pgno_t me_maxpg; /**< me_mapsize / me_psize */
	MDB_dbx *me_dbxs; /**< array of static DB info */
	uint16_t *me_dbflags; /**< array of flags from MDB_db.md_flags */
	unsigned int *me_dbiseqs; /**< array of dbi sequence numbers */
	pthread_key_t me_txkey; /**< thread-key for readers */
	txnid_t me_pgoldest; /**< ID of oldest reader last time we looked */
	MDB_pgstate me_pgstate; /**< state of old pages from freeDB */
# define me_pglast me_pgstate.mf_pglast
# define me_pghead me_pgstate.mf_pghead
	MDB_page *me_dpages; /**< list of malloc'd blocks for re-use */
	/** IDL of pages that became unused in a write txn */
	MDB_IDL me_free_pgs;
	/** ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE. */
	MDB_ID2L me_dirty_list;
	/** Max number of freelist items that can fit in a single overflow page */
	int me_maxfree_1pg;
	/** Max size of a node on a page */
	unsigned int me_nodemax;
#if !(MDB_MAXKEYSIZE)
	unsigned int me_maxkey; /**< max size of a key */
#endif
	int me_live_reader; /**< have liveness lock in reader table */
#ifdef _WIN32
	int me_pidquery; /**< Used in OpenProcess */
#endif
#ifdef MDB_USE_POSIX_MUTEX /* Posix mutexes reside in shared mem */
# define me_rmutex me_txns->mti_rmutex /**< Shared reader lock */
# define me_wmutex me_txns->mti_wmutex /**< Shared writer lock */
#else
	mdb_mutex_t me_rmutex;
	mdb_mutex_t me_wmutex;
#endif
	void *me_userctx; /**< User-settable context */
	MDB_assert_func *me_assert_func; /**< Callback for assertion failures */
};

/** Nested transaction */
typedef struct MDB_ntxn {
	MDB_txn mnt_txn; /**< the transaction */
	MDB_pgstate mnt_pgstate; /**< parent transaction's saved freestate */
} MDB_ntxn;

/** max number of pages to commit in one writev() call */
#define MDB_COMMIT_PAGES 64
#if defined(IOV_MAX) && IOV_MAX < MDB_COMMIT_PAGES
#undef MDB_COMMIT_PAGES
#define MDB_COMMIT_PAGES IOV_MAX
#endif

/** max bytes to write in one call */
#define MAX_WRITE (0x40000000U >> (sizeof(ssize_t) == 4))

/** Check \b txn and \b dbi arguments to a function */
#define TXN_DBI_EXIST(txn, dbi, validity) \
	((txn) && (dbi)<(txn)->mt_numdbs && ((txn)->mt_dbflags[dbi] & (validity)))

/** Check for misused \b dbi handles */
#define TXN_DBI_CHANGED(txn, dbi) \
	((txn)->mt_dbiseqs[dbi] != (txn)->mt_env->me_dbiseqs[dbi])

static int mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp);
static int mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp);
static int mdb_page_touch(MDB_cursor *mc);

#define MDB_END_NAMES {"committed", "empty-commit", "abort", "reset", \
	"reset-tmp", "fail-begin", "fail-beginchild"}
enum {
	/* mdb_txn_end operation number, for logging */
	MDB_END_COMMITTED, MDB_END_EMPTY_COMMIT, MDB_END_ABORT, MDB_END_RESET,
	MDB_END_RESET_TMP, MDB_END_FAIL_BEGIN, MDB_END_FAIL_BEGINCHILD
};
#define MDB_END_OPMASK 0x0F /**< mask for #mdb_txn_end() operation number */
#define MDB_END_UPDATE 0x10 /**< update env state (DBIs) */
#define MDB_END_FREE 0x20 /**< free txn unless it is #MDB_env.%me_txn0 */
#define MDB_END_SLOT MDB_NOTLS /**< release any reader slot if #MDB_NOTLS */
static void mdb_txn_end(MDB_txn *txn, unsigned mode);

static int mdb_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **mp, int *lvl);
static int mdb_page_search_root(MDB_cursor *mc,
	MDB_val *key, int modify);
#define MDB_PS_MODIFY 1
#define MDB_PS_ROOTONLY 2
#define MDB_PS_FIRST 4
#define MDB_PS_LAST 8
static int mdb_page_search(MDB_cursor *mc,
	MDB_val *key, int flags);
static int mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst);

#define MDB_SPLIT_REPLACE MDB_APPENDDUP /**< newkey is not new */
static int mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata,
	pgno_t newpgno, unsigned int nflags);

static int mdb_env_read_header(MDB_env *env, MDB_meta *meta);
static MDB_meta *mdb_env_pick_meta(const MDB_env *env);
static int mdb_env_write_meta(MDB_txn *txn);
#ifdef MDB_USE_POSIX_MUTEX /* Drop unused excl arg */
# define mdb_env_close0(env, excl) mdb_env_close1(env)
#endif
static void mdb_env_close0(MDB_env *env, int excl);

static MDB_node *mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp);
static int mdb_node_add(MDB_cursor *mc, indx_t indx,
	MDB_val *key, MDB_val *data, pgno_t pgno, unsigned int flags);
static void mdb_node_del(MDB_cursor *mc, int ksize);
static void mdb_node_shrink(MDB_page *mp, indx_t indx);
static int mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft);
static int mdb_node_read(MDB_cursor *mc, MDB_node *leaf, MDB_val *data);
static size_t mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data);
static size_t mdb_branch_size(MDB_env *env, MDB_val *key);

static int mdb_rebalance(MDB_cursor *mc);
static int mdb_update_key(MDB_cursor *mc, MDB_val *key);

static void mdb_cursor_pop(MDB_cursor *mc);
static int mdb_cursor_push(MDB_cursor *mc, MDB_page *mp);

static int mdb_cursor_del0(MDB_cursor *mc);
static int mdb_del0(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, unsigned flags);
static int mdb_cursor_sibling(MDB_cursor *mc, int move_right);
static int mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op);
static int mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op);
static int mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op,
	int *exactp);
static int mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data);
static int mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data);

static void mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx);
static void mdb_xcursor_init0(MDB_cursor *mc);
static void mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node);
static void mdb_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int force);

static int mdb_drop0(MDB_cursor *mc, int subs);
static void mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi);
static int mdb_reader_check0(MDB_env *env, int rlocked, int *dead);

/** @cond */
static MDB_cmp_func mdb_cmp_memn, mdb_cmp_memnr, mdb_cmp_int, mdb_cmp_cint, mdb_cmp_long;
/** @endcond */

/** Compare two items pointing at size_t's of unknown alignment. */
#ifdef MISALIGNED_OK
# define mdb_cmp_clong mdb_cmp_long
#else
# define mdb_cmp_clong mdb_cmp_cint
#endif

#ifdef _WIN32
static SECURITY_DESCRIPTOR mdb_null_sd;
static SECURITY_ATTRIBUTES mdb_all_sa;
static int mdb_sec_inited;

struct MDB_name;
static int utf8_to_utf16(const char *src, struct MDB_name *dst, int xtra);
#endif

/** Return the library version info. */
char * ESECT
mdb_version(int *major, int *minor, int *patch)
{
	if (major) *major = MDB_VERSION_MAJOR;
	if (minor) *minor = MDB_VERSION_MINOR;
	if (patch) *patch = MDB_VERSION_PATCH;
	return MDB_VERSION_STRING;
}
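
/* Illustrative sketch (not part of the original source): callers typically
 * use mdb_version() either for its version string or for the three numeric
 * components; any of the output pointers may be NULL, as the checks above
 * show.
 *
 *	int maj, min, pat;
 *	printf("using %s\n", mdb_version(&maj, &min, &pat));
 */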
*/ 1435 char * ESECT 1436 mdb_version(int *major, int *minor, int *patch) 1437 { 1438 if (major) *major = MDB_VERSION_MAJOR; 1439 if (minor) *minor = MDB_VERSION_MINOR; 1440 if (patch) *patch = MDB_VERSION_PATCH; 1441 return MDB_VERSION_STRING; 1442 } 1443 1444 /** Table of descriptions for LMDB @ref errors */ 1445 static char *const mdb_errstr[] = { 1446 "MDB_KEYEXIST: Key/data pair already exists", 1447 "MDB_NOTFOUND: No matching key/data pair found", 1448 "MDB_PAGE_NOTFOUND: Requested page not found", 1449 "MDB_CORRUPTED: Located page was wrong type", 1450 "MDB_PANIC: Update of meta page failed or environment had fatal error", 1451 "MDB_VERSION_MISMATCH: Database environment version mismatch", 1452 "MDB_INVALID: File is not an LMDB file", 1453 "MDB_MAP_FULL: Environment mapsize limit reached", 1454 "MDB_DBS_FULL: Environment maxdbs limit reached", 1455 "MDB_READERS_FULL: Environment maxreaders limit reached", 1456 "MDB_TLS_FULL: Thread-local storage keys full - too many environments open", 1457 "MDB_TXN_FULL: Transaction has too many dirty pages - transaction too big", 1458 "MDB_CURSOR_FULL: Internal error - cursor stack limit reached", 1459 "MDB_PAGE_FULL: Internal error - page has no more space", 1460 "MDB_MAP_RESIZED: Database contents grew beyond environment mapsize", 1461 "MDB_INCOMPATIBLE: Operation and DB incompatible, or DB flags changed", 1462 "MDB_BAD_RSLOT: Invalid reuse of reader locktable slot", 1463 "MDB_BAD_TXN: Transaction must abort, has a child, or is invalid", 1464 "MDB_BAD_VALSIZE: Unsupported size of key/DB name/data, or wrong DUPFIXED size", 1465 "MDB_BAD_DBI: The specified DBI handle was closed/changed unexpectedly", 1466 }; 1467 1468 char * 1469 mdb_strerror(int err) 1470 { 1471 #ifdef _WIN32 1472 /** HACK: pad 4KB on stack over the buf. Return system msgs in buf. 1473 * This works as long as no function between the call to mdb_strerror 1474 * and the actual use of the message uses more than 4K of stack. 1475 */ 1476 #define MSGSIZE 1024 1477 #define PADSIZE 4096 1478 char buf[MSGSIZE+PADSIZE], *ptr = buf; 1479 #endif 1480 int i; 1481 if (!err) 1482 return ("Successful return: 0"); 1483 1484 if (err >= MDB_KEYEXIST && err <= MDB_LAST_ERRCODE) { 1485 i = err - MDB_KEYEXIST; 1486 return mdb_errstr[i]; 1487 } 1488 1489 #ifdef _WIN32 1490 /* These are the C-runtime error codes we use. The comment indicates 1491 * their numeric value, and the Win32 error they would correspond to 1492 * if the error actually came from a Win32 API. A major mess, we should 1493 * have used LMDB-specific error codes for everything. 
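 * Callers should therefore treat the result purely as display text. A
 * hedged usage sketch (env, path and rc are illustrative names only):
 *
 *	if ((rc = mdb_env_open(env, path, 0, 0664)) != MDB_SUCCESS)
 *		fprintf(stderr, "mdb_env_open: %s\n", mdb_strerror(rc));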
 */
	switch(err) {
	case ENOENT:	/* 2, FILE_NOT_FOUND */
	case EIO:		/* 5, ACCESS_DENIED */
	case ENOMEM:	/* 12, INVALID_ACCESS */
	case EACCES:	/* 13, INVALID_DATA */
	case EBUSY:		/* 16, CURRENT_DIRECTORY */
	case EINVAL:	/* 22, BAD_COMMAND */
	case ENOSPC:	/* 28, OUT_OF_PAPER */
		return strerror(err);
	default:
		;
	}
	buf[0] = 0;
	FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM |
		FORMAT_MESSAGE_IGNORE_INSERTS,
		NULL, err, 0, ptr, MSGSIZE, (va_list *)buf+MSGSIZE);
	return ptr;
#else
	return strerror(err);
#endif
}

/** assert(3) variant in cursor context */
#define mdb_cassert(mc, expr)	mdb_assert0((mc)->mc_txn->mt_env, expr, #expr)
/** assert(3) variant in transaction context */
#define mdb_tassert(txn, expr)	mdb_assert0((txn)->mt_env, expr, #expr)
/** assert(3) variant in environment context */
#define mdb_eassert(env, expr)	mdb_assert0(env, expr, #expr)

#ifndef NDEBUG
# define mdb_assert0(env, expr, expr_txt) ((expr) ? (void)0 : \
		mdb_assert_fail(env, expr_txt, mdb_func_, __FILE__, __LINE__))

static void ESECT
mdb_assert_fail(MDB_env *env, const char *expr_txt,
	const char *func, const char *file, int line)
{
	char buf[400];
	sprintf(buf, "%.100s:%d: Assertion '%.200s' failed in %.40s()",
		file, line, expr_txt, func);
	if (env->me_assert_func)
		env->me_assert_func(env, buf);
	fprintf(stderr, "%s\n", buf);
	abort();
}
#else
# define mdb_assert0(env, expr, expr_txt) ((void) 0)
#endif /* NDEBUG */

#if MDB_DEBUG
/** Return the page number of \b mp which may be sub-page, for debug output */
static pgno_t
mdb_dbg_pgno(MDB_page *mp)
{
	pgno_t ret;
	COPY_PGNO(ret, mp->mp_pgno);
	return ret;
}

/** Display a key in hexadecimal and return the address of the result.
 * @param[in] key the key to display
 * @param[in] buf the buffer to write into. Should always be #DKBUF.
 * @return The key in hexadecimal form.
 */
char *
mdb_dkey(MDB_val *key, char *buf)
{
	char *ptr = buf;
	unsigned char *c;
	unsigned int i;

	if (!key)
		return "";
	c = key->mv_data;

	if (key->mv_size > DKBUF_MAXKEYSIZE)
		return "MDB_MAXKEYSIZE";
	/* may want to make this a dynamic check: if the key is mostly
	 * printable characters, print it as-is instead of converting to hex.
	 */
#if 1
	buf[0] = '\0';
	for (i=0; i<key->mv_size; i++)
		ptr += sprintf(ptr, "%02x", *c++);
#else
	sprintf(buf, "%.*s", key->mv_size, key->mv_data);
#endif
	return buf;
}

static const char *
mdb_leafnode_type(MDB_node *n)
{
	static char *const tp[2][2] = {{"", ": DB"}, {": sub-page", ": sub-DB"}};
	return F_ISSET(n->mn_flags, F_BIGDATA) ? ": overflow page" :
		tp[F_ISSET(n->mn_flags, F_DUPDATA)][F_ISSET(n->mn_flags, F_SUBDATA)];
}

/** Display all the keys in the page. */
void
mdb_page_list(MDB_page *mp)
{
	pgno_t pgno = mdb_dbg_pgno(mp);
	const char *type, *state = (mp->mp_flags & P_DIRTY) ?
", dirty" : ""; 1598 MDB_node *node; 1599 unsigned int i, nkeys, nsize, total = 0; 1600 MDB_val key; 1601 DKBUF; 1602 1603 switch (mp->mp_flags & (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP)) { 1604 case P_BRANCH: type = "Branch page"; break; 1605 case P_LEAF: type = "Leaf page"; break; 1606 case P_LEAF|P_SUBP: type = "Sub-page"; break; 1607 case P_LEAF|P_LEAF2: type = "LEAF2 page"; break; 1608 case P_LEAF|P_LEAF2|P_SUBP: type = "LEAF2 sub-page"; break; 1609 case P_OVERFLOW: 1610 fprintf(stderr, "Overflow page %"Z"u pages %u%s\n", 1611 pgno, mp->mp_pages, state); 1612 return; 1613 case P_META: 1614 fprintf(stderr, "Meta-page %"Z"u txnid %"Z"u\n", 1615 pgno, ((MDB_meta *)METADATA(mp))->mm_txnid); 1616 return; 1617 default: 1618 fprintf(stderr, "Bad page %"Z"u flags 0x%X\n", pgno, mp->mp_flags); 1619 return; 1620 } 1621 1622 nkeys = NUMKEYS(mp); 1623 fprintf(stderr, "%s %"Z"u numkeys %d%s\n", type, pgno, nkeys, state); 1624 1625 for (i=0; i<nkeys; i++) { 1626 if (IS_LEAF2(mp)) { /* LEAF2 pages have no mp_ptrs[] or node headers */ 1627 key.mv_size = nsize = mp->mp_pad; 1628 key.mv_data = LEAF2KEY(mp, i, nsize); 1629 total += nsize; 1630 fprintf(stderr, "key %d: nsize %d, %s\n", i, nsize, DKEY(&key)); 1631 continue; 1632 } 1633 node = NODEPTR(mp, i); 1634 key.mv_size = node->mn_ksize; 1635 key.mv_data = node->mn_data; 1636 nsize = NODESIZE + key.mv_size; 1637 if (IS_BRANCH(mp)) { 1638 fprintf(stderr, "key %d: page %"Z"u, %s\n", i, NODEPGNO(node), 1639 DKEY(&key)); 1640 total += nsize; 1641 } else { 1642 if (F_ISSET(node->mn_flags, F_BIGDATA)) 1643 nsize += sizeof(pgno_t); 1644 else 1645 nsize += NODEDSZ(node); 1646 total += nsize; 1647 nsize += sizeof(indx_t); 1648 fprintf(stderr, "key %d: nsize %d, %s%s\n", 1649 i, nsize, DKEY(&key), mdb_leafnode_type(node)); 1650 } 1651 total = EVEN(total); 1652 } 1653 fprintf(stderr, "Total: header %d + contents %d + unused %d\n", 1654 IS_LEAF2(mp) ? PAGEHDRSZ : PAGEBASE + mp->mp_lower, total, SIZELEFT(mp)); 1655 } 1656 1657 void 1658 mdb_cursor_chk(MDB_cursor *mc) 1659 { 1660 unsigned int i; 1661 MDB_node *node; 1662 MDB_page *mp; 1663 1664 if (!mc->mc_snum || !(mc->mc_flags & C_INITIALIZED)) return; 1665 for (i=0; i<mc->mc_top; i++) { 1666 mp = mc->mc_pg[i]; 1667 node = NODEPTR(mp, mc->mc_ki[i]); 1668 if (NODEPGNO(node) != mc->mc_pg[i+1]->mp_pgno) 1669 printf("oops!\n"); 1670 } 1671 if (mc->mc_ki[i] >= NUMKEYS(mc->mc_pg[i])) 1672 printf("ack!\n"); 1673 if (XCURSOR_INITED(mc)) { 1674 node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); 1675 if (((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) && 1676 mc->mc_xcursor->mx_cursor.mc_pg[0] != NODEDATA(node)) { 1677 printf("blah!\n"); 1678 } 1679 } 1680 } 1681 #endif 1682 1683 #if (MDB_DEBUG) > 2 1684 /** Count all the pages in each DB and in the freelist 1685 * and make sure it matches the actual number of pages 1686 * being used. 1687 * All named DBs must be open for a correct count. 
1688 */ 1689 static void mdb_audit(MDB_txn *txn) 1690 { 1691 MDB_cursor mc; 1692 MDB_val key, data; 1693 MDB_ID freecount, count; 1694 MDB_dbi i; 1695 int rc; 1696 1697 freecount = 0; 1698 mdb_cursor_init(&mc, txn, FREE_DBI, NULL); 1699 while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0) 1700 freecount += *(MDB_ID *)data.mv_data; 1701 mdb_tassert(txn, rc == MDB_NOTFOUND); 1702 1703 count = 0; 1704 for (i = 0; i<txn->mt_numdbs; i++) { 1705 MDB_xcursor mx; 1706 if (!(txn->mt_dbflags[i] & DB_VALID)) 1707 continue; 1708 mdb_cursor_init(&mc, txn, i, &mx); 1709 if (txn->mt_dbs[i].md_root == P_INVALID) 1710 continue; 1711 count += txn->mt_dbs[i].md_branch_pages + 1712 txn->mt_dbs[i].md_leaf_pages + 1713 txn->mt_dbs[i].md_overflow_pages; 1714 if (txn->mt_dbs[i].md_flags & MDB_DUPSORT) { 1715 rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST); 1716 for (; rc == MDB_SUCCESS; rc = mdb_cursor_sibling(&mc, 1)) { 1717 unsigned j; 1718 MDB_page *mp; 1719 mp = mc.mc_pg[mc.mc_top]; 1720 for (j=0; j<NUMKEYS(mp); j++) { 1721 MDB_node *leaf = NODEPTR(mp, j); 1722 if (leaf->mn_flags & F_SUBDATA) { 1723 MDB_db db; 1724 memcpy(&db, NODEDATA(leaf), sizeof(db)); 1725 count += db.md_branch_pages + db.md_leaf_pages + 1726 db.md_overflow_pages; 1727 } 1728 } 1729 } 1730 mdb_tassert(txn, rc == MDB_NOTFOUND); 1731 } 1732 } 1733 if (freecount + count + NUM_METAS != txn->mt_next_pgno) { 1734 fprintf(stderr, "audit: %"Z"u freecount: %"Z"u count: %"Z"u total: %"Z"u next_pgno: %"Z"u\n", 1735 txn->mt_txnid, freecount, count+NUM_METAS, 1736 freecount+count+NUM_METAS, txn->mt_next_pgno); 1737 } 1738 } 1739 #endif 1740 1741 int 1742 mdb_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) 1743 { 1744 return txn->mt_dbxs[dbi].md_cmp(a, b); 1745 } 1746 1747 int 1748 mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) 1749 { 1750 MDB_cmp_func *dcmp = txn->mt_dbxs[dbi].md_dcmp; 1751 #if UINT_MAX < SIZE_MAX 1752 if (dcmp == mdb_cmp_int && a->mv_size == sizeof(size_t)) 1753 dcmp = mdb_cmp_clong; 1754 #endif 1755 return dcmp(a, b); 1756 } 1757 1758 /** Allocate memory for a page. 1759 * Re-use old malloc'd pages first for singletons, otherwise just malloc. 1760 * Set #MDB_TXN_ERROR on failure. 1761 */ 1762 static MDB_page * 1763 mdb_page_malloc(MDB_txn *txn, unsigned num) 1764 { 1765 MDB_env *env = txn->mt_env; 1766 MDB_page *ret = env->me_dpages; 1767 size_t psize = env->me_psize, sz = psize, off; 1768 /* For ! #MDB_NOMEMINIT, psize counts how much to init. 1769 * For a single page alloc, we init everything after the page header. 1770 * For multi-page, we init the final page; if the caller needed that 1771 * many pages they will be filling in at least up to the last page. 1772 */ 1773 if (num == 1) { 1774 if (ret) { 1775 VGMEMP_ALLOC(env, ret, sz); 1776 VGMEMP_DEFINED(ret, sizeof(ret->mp_next)); 1777 env->me_dpages = ret->mp_next; 1778 return ret; 1779 } 1780 psize -= off = PAGEHDRSZ; 1781 } else { 1782 sz *= num; 1783 off = sz - psize; 1784 } 1785 if ((ret = malloc(sz)) != NULL) { 1786 VGMEMP_ALLOC(env, ret, sz); 1787 if (!(env->me_flags & MDB_NOMEMINIT)) { 1788 memset((char *)ret + off, 0, psize); 1789 ret->mp_pad = 0; 1790 } 1791 } else { 1792 txn->mt_flags |= MDB_TXN_ERROR; 1793 } 1794 return ret; 1795 } 1796 /** Free a single page. 1797 * Saves single pages to a list, for future reuse. 1798 * (This is not used for multi-page overflow pages.) 
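 * The reuse list is a simple LIFO threaded through mp_next and rooted at
 * env->me_dpages; mdb_page_malloc() pops it before falling back to malloc().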
1799 */ 1800 static void 1801 mdb_page_free(MDB_env *env, MDB_page *mp) 1802 { 1803 mp->mp_next = env->me_dpages; 1804 VGMEMP_FREE(env, mp); 1805 env->me_dpages = mp; 1806 } 1807 1808 /** Free a dirty page */ 1809 static void 1810 mdb_dpage_free(MDB_env *env, MDB_page *dp) 1811 { 1812 if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) { 1813 mdb_page_free(env, dp); 1814 } else { 1815 /* large pages just get freed directly */ 1816 VGMEMP_FREE(env, dp); 1817 free(dp); 1818 } 1819 } 1820 1821 /** Return all dirty pages to dpage list */ 1822 static void 1823 mdb_dlist_free(MDB_txn *txn) 1824 { 1825 MDB_env *env = txn->mt_env; 1826 MDB_ID2L dl = txn->mt_u.dirty_list; 1827 unsigned i, n = dl[0].mid; 1828 1829 for (i = 1; i <= n; i++) { 1830 mdb_dpage_free(env, dl[i].mptr); 1831 } 1832 dl[0].mid = 0; 1833 } 1834 1835 /** Loosen or free a single page. 1836 * Saves single pages to a list for future reuse 1837 * in this same txn. It has been pulled from the freeDB 1838 * and already resides on the dirty list, but has been 1839 * deleted. Use these pages first before pulling again 1840 * from the freeDB. 1841 * 1842 * If the page wasn't dirtied in this txn, just add it 1843 * to this txn's free list. 1844 */ 1845 static int 1846 mdb_page_loose(MDB_cursor *mc, MDB_page *mp) 1847 { 1848 int loose = 0; 1849 pgno_t pgno = mp->mp_pgno; 1850 MDB_txn *txn = mc->mc_txn; 1851 1852 if ((mp->mp_flags & P_DIRTY) && mc->mc_dbi != FREE_DBI) { 1853 if (txn->mt_parent) { 1854 MDB_ID2 *dl = txn->mt_u.dirty_list; 1855 /* If txn has a parent, make sure the page is in our 1856 * dirty list. 1857 */ 1858 if (dl[0].mid) { 1859 unsigned x = mdb_mid2l_search(dl, pgno); 1860 if (x <= dl[0].mid && dl[x].mid == pgno) { 1861 if (mp != dl[x].mptr) { /* bad cursor? */ 1862 mc->mc_flags &= ~(C_INITIALIZED|C_EOF); 1863 txn->mt_flags |= MDB_TXN_ERROR; 1864 return MDB_CORRUPTED; 1865 } 1866 /* ok, it's ours */ 1867 loose = 1; 1868 } 1869 } 1870 } else { 1871 /* no parent txn, so it's just ours */ 1872 loose = 1; 1873 } 1874 } 1875 if (loose) { 1876 DPRINTF(("loosen db %d page %"Z"u", DDBI(mc), 1877 mp->mp_pgno)); 1878 NEXT_LOOSE_PAGE(mp) = txn->mt_loose_pgs; 1879 txn->mt_loose_pgs = mp; 1880 txn->mt_loose_count++; 1881 mp->mp_flags |= P_LOOSE; 1882 } else { 1883 int rc = mdb_midl_append(&txn->mt_free_pgs, pgno); 1884 if (rc) 1885 return rc; 1886 } 1887 1888 return MDB_SUCCESS; 1889 } 1890 1891 /** Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn. 1892 * @param[in] mc A cursor handle for the current operation. 1893 * @param[in] pflags Flags of the pages to update: 1894 * P_DIRTY to set P_KEEP, P_DIRTY|P_KEEP to clear it. 1895 * @param[in] all No shortcuts. Needed except after a full #mdb_page_flush(). 1896 * @return 0 on success, non-zero on failure. 
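 * Only pages whose flags, masked with P_SUBP|P_DIRTY|P_LOOSE|P_KEEP, equal
 * \b pflags exactly have P_KEEP toggled, so loose pages and sub-pages are
 * always left untouched.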
1897 */ 1898 static int 1899 mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all) 1900 { 1901 enum { Mask = P_SUBP|P_DIRTY|P_LOOSE|P_KEEP }; 1902 MDB_txn *txn = mc->mc_txn; 1903 MDB_cursor *m3, *m0 = mc; 1904 MDB_xcursor *mx; 1905 MDB_page *dp, *mp; 1906 MDB_node *leaf; 1907 unsigned i, j; 1908 int rc = MDB_SUCCESS, level; 1909 1910 /* Mark pages seen by cursors */ 1911 if (mc->mc_flags & C_UNTRACK) 1912 mc = NULL; /* will find mc in mt_cursors */ 1913 for (i = txn->mt_numdbs;; mc = txn->mt_cursors[--i]) { 1914 for (; mc; mc=mc->mc_next) { 1915 if (!(mc->mc_flags & C_INITIALIZED)) 1916 continue; 1917 for (m3 = mc;; m3 = &mx->mx_cursor) { 1918 mp = NULL; 1919 for (j=0; j<m3->mc_snum; j++) { 1920 mp = m3->mc_pg[j]; 1921 if ((mp->mp_flags & Mask) == pflags) 1922 mp->mp_flags ^= P_KEEP; 1923 } 1924 mx = m3->mc_xcursor; 1925 /* Proceed to mx if it is at a sub-database */ 1926 if (! (mx && (mx->mx_cursor.mc_flags & C_INITIALIZED))) 1927 break; 1928 if (! (mp && (mp->mp_flags & P_LEAF))) 1929 break; 1930 leaf = NODEPTR(mp, m3->mc_ki[j-1]); 1931 if (!(leaf->mn_flags & F_SUBDATA)) 1932 break; 1933 } 1934 } 1935 if (i == 0) 1936 break; 1937 } 1938 1939 if (all) { 1940 /* Mark dirty root pages */ 1941 for (i=0; i<txn->mt_numdbs; i++) { 1942 if (txn->mt_dbflags[i] & DB_DIRTY) { 1943 pgno_t pgno = txn->mt_dbs[i].md_root; 1944 if (pgno == P_INVALID) 1945 continue; 1946 if ((rc = mdb_page_get(m0, pgno, &dp, &level)) != MDB_SUCCESS) 1947 break; 1948 if ((dp->mp_flags & Mask) == pflags && level <= 1) 1949 dp->mp_flags ^= P_KEEP; 1950 } 1951 } 1952 } 1953 1954 return rc; 1955 } 1956 1957 static int mdb_page_flush(MDB_txn *txn, int keep); 1958 1959 /** Spill pages from the dirty list back to disk. 1960 * This is intended to prevent running into #MDB_TXN_FULL situations, 1961 * but note that they may still occur in a few cases: 1962 * 1) our estimate of the txn size could be too small. Currently this 1963 * seems unlikely, except with a large number of #MDB_MULTIPLE items. 1964 * 2) child txns may run out of space if their parents dirtied a 1965 * lot of pages and never spilled them. TODO: we probably should do 1966 * a preemptive spill during #mdb_txn_begin() of a child txn, if 1967 * the parent's dirty_room is below a given threshold. 1968 * 1969 * Otherwise, if not using nested txns, it is expected that apps will 1970 * not run into #MDB_TXN_FULL any more. The pages are flushed to disk 1971 * the same way as for a txn commit, e.g. their P_DIRTY flag is cleared. 1972 * If the txn never references them again, they can be left alone. 1973 * If the txn only reads them, they can be used without any fuss. 1974 * If the txn writes them again, they can be dirtied immediately without 1975 * going thru all of the work of #mdb_page_touch(). Such references are 1976 * handled by #mdb_page_unspill(). 1977 * 1978 * Also note, we never spill DB root pages, nor pages of active cursors, 1979 * because we'll need these back again soon anyway. And in nested txns, 1980 * we can't spill a page in a child txn if it was already spilled in a 1981 * parent txn. That would alter the parent txns' data even though 1982 * the child hasn't committed yet, and we'd have no way to undo it if 1983 * the child aborted. 1984 * 1985 * @param[in] m0 cursor A cursor handle identifying the transaction and 1986 * database for which we are checking space. 1987 * @param[in] key For a put operation, the key being stored. 1988 * @param[in] data For a put operation, the data being stored. 1989 * @return 0 on success, non-zero on failure. 
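 * The space estimate used below is deliberately rough: the tree depth (plus
 * the main DB's depth for named DBs), plus pages for the key/data being put,
 * all doubled; nothing is spilled while mt_dirty_room still exceeds it.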
1990 */ 1991 static int 1992 mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) 1993 { 1994 MDB_txn *txn = m0->mc_txn; 1995 MDB_page *dp; 1996 MDB_ID2L dl = txn->mt_u.dirty_list; 1997 unsigned int i, j, need; 1998 int rc; 1999 2000 if (m0->mc_flags & C_SUB) 2001 return MDB_SUCCESS; 2002 2003 /* Estimate how much space this op will take */ 2004 i = m0->mc_db->md_depth; 2005 /* Named DBs also dirty the main DB */ 2006 if (m0->mc_dbi >= CORE_DBS) 2007 i += txn->mt_dbs[MAIN_DBI].md_depth; 2008 /* For puts, roughly factor in the key+data size */ 2009 if (key) 2010 i += (LEAFSIZE(key, data) + txn->mt_env->me_psize) / txn->mt_env->me_psize; 2011 i += i; /* double it for good measure */ 2012 need = i; 2013 2014 if (txn->mt_dirty_room > i) 2015 return MDB_SUCCESS; 2016 2017 if (!txn->mt_spill_pgs) { 2018 txn->mt_spill_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX); 2019 if (!txn->mt_spill_pgs) 2020 return ENOMEM; 2021 } else { 2022 /* purge deleted slots */ 2023 MDB_IDL sl = txn->mt_spill_pgs; 2024 unsigned int num = sl[0]; 2025 j=0; 2026 for (i=1; i<=num; i++) { 2027 if (!(sl[i] & 1)) 2028 sl[++j] = sl[i]; 2029 } 2030 sl[0] = j; 2031 } 2032 2033 /* Preserve pages which may soon be dirtied again */ 2034 if ((rc = mdb_pages_xkeep(m0, P_DIRTY, 1)) != MDB_SUCCESS) 2035 goto done; 2036 2037 /* Less aggressive spill - we originally spilled the entire dirty list, 2038 * with a few exceptions for cursor pages and DB root pages. But this 2039 * turns out to be a lot of wasted effort because in a large txn many 2040 * of those pages will need to be used again. So now we spill only 1/8th 2041 * of the dirty pages. Testing revealed this to be a good tradeoff, 2042 * better than 1/2, 1/4, or 1/10. 2043 */ 2044 if (need < MDB_IDL_UM_MAX / 8) 2045 need = MDB_IDL_UM_MAX / 8; 2046 2047 /* Save the page IDs of all the pages we're flushing */ 2048 /* flush from the tail forward, this saves a lot of shifting later on. */ 2049 for (i=dl[0].mid; i && need; i--) { 2050 MDB_ID pn = dl[i].mid << 1; 2051 dp = dl[i].mptr; 2052 if (dp->mp_flags & (P_LOOSE|P_KEEP)) 2053 continue; 2054 /* Can't spill twice, make sure it's not already in a parent's 2055 * spill list. 2056 */ 2057 if (txn->mt_parent) { 2058 MDB_txn *tx2; 2059 for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) { 2060 if (tx2->mt_spill_pgs) { 2061 j = mdb_midl_search(tx2->mt_spill_pgs, pn); 2062 if (j <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[j] == pn) { 2063 dp->mp_flags |= P_KEEP; 2064 break; 2065 } 2066 } 2067 } 2068 if (tx2) 2069 continue; 2070 } 2071 if ((rc = mdb_midl_append(&txn->mt_spill_pgs, pn))) 2072 goto done; 2073 need--; 2074 } 2075 mdb_midl_sort(txn->mt_spill_pgs); 2076 2077 /* Flush the spilled part of dirty list */ 2078 if ((rc = mdb_page_flush(txn, i)) != MDB_SUCCESS) 2079 goto done; 2080 2081 /* Reset any dirty pages we kept that page_flush didn't see */ 2082 rc = mdb_pages_xkeep(m0, P_DIRTY|P_KEEP, i); 2083 2084 done: 2085 txn->mt_flags |= rc ? MDB_TXN_ERROR : MDB_TXN_SPILLS; 2086 return rc; 2087 } 2088 2089 /** Find oldest txnid still referenced. Expects txn->mt_txnid > 0. 
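 * With no live readers the result is simply txn->mt_txnid - 1; otherwise it
 * is the smallest mr_txnid among reader slots with a nonzero mr_pid, capped
 * at that value.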
*/ 2090 static txnid_t 2091 mdb_find_oldest(MDB_txn *txn) 2092 { 2093 int i; 2094 txnid_t mr, oldest = txn->mt_txnid - 1; 2095 if (txn->mt_env->me_txns) { 2096 MDB_reader *r = txn->mt_env->me_txns->mti_readers; 2097 for (i = txn->mt_env->me_txns->mti_numreaders; --i >= 0; ) { 2098 if (r[i].mr_pid) { 2099 mr = r[i].mr_txnid; 2100 if (oldest > mr) 2101 oldest = mr; 2102 } 2103 } 2104 } 2105 return oldest; 2106 } 2107 2108 /** Add a page to the txn's dirty list */ 2109 static void 2110 mdb_page_dirty(MDB_txn *txn, MDB_page *mp) 2111 { 2112 MDB_ID2 mid; 2113 int rc, (*insert)(MDB_ID2L, MDB_ID2 *); 2114 2115 if (txn->mt_flags & MDB_TXN_WRITEMAP) { 2116 insert = mdb_mid2l_append; 2117 } else { 2118 insert = mdb_mid2l_insert; 2119 } 2120 mid.mid = mp->mp_pgno; 2121 mid.mptr = mp; 2122 rc = insert(txn->mt_u.dirty_list, &mid); 2123 mdb_tassert(txn, rc == 0); 2124 txn->mt_dirty_room--; 2125 } 2126 2127 /** Allocate page numbers and memory for writing. Maintain me_pglast, 2128 * me_pghead and mt_next_pgno. Set #MDB_TXN_ERROR on failure. 2129 * 2130 * If there are free pages available from older transactions, they 2131 * are re-used first. Otherwise allocate a new page at mt_next_pgno. 2132 * Do not modify the freedB, just merge freeDB records into me_pghead[] 2133 * and move me_pglast to say which records were consumed. Only this 2134 * function can create me_pghead and move me_pglast/mt_next_pgno. 2135 * @param[in] mc cursor A cursor handle identifying the transaction and 2136 * database for which we are allocating. 2137 * @param[in] num the number of pages to allocate. 2138 * @param[out] mp Address of the allocated page(s). Requests for multiple pages 2139 * will always be satisfied by a single contiguous chunk of memory. 2140 * @return 0 on success, non-zero on failure. 2141 */ 2142 static int 2143 mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) 2144 { 2145 #ifdef MDB_PARANOID /* Seems like we can ignore this now */ 2146 /* Get at most <Max_retries> more freeDB records once me_pghead 2147 * has enough pages. If not enough, use new pages from the map. 2148 * If <Paranoid> and mc is updating the freeDB, only get new 2149 * records if me_pghead is empty. Then the freelist cannot play 2150 * catch-up with itself by growing while trying to save it. 2151 */ 2152 enum { Paranoid = 1, Max_retries = 500 }; 2153 #else 2154 enum { Paranoid = 0, Max_retries = INT_MAX /*infinite*/ }; 2155 #endif 2156 int rc, retry = num * 60; 2157 MDB_txn *txn = mc->mc_txn; 2158 MDB_env *env = txn->mt_env; 2159 pgno_t pgno, *mop = env->me_pghead; 2160 unsigned i, j, mop_len = mop ? mop[0] : 0, n2 = num-1; 2161 MDB_page *np; 2162 txnid_t oldest = 0, last; 2163 MDB_cursor_op op; 2164 MDB_cursor m2; 2165 int found_old = 0; 2166 2167 /* If there are any loose pages, just use them */ 2168 if (num == 1 && txn->mt_loose_pgs) { 2169 np = txn->mt_loose_pgs; 2170 txn->mt_loose_pgs = NEXT_LOOSE_PAGE(np); 2171 txn->mt_loose_count--; 2172 DPRINTF(("db %d use loose page %"Z"u", DDBI(mc), 2173 np->mp_pgno)); 2174 *mp = np; 2175 return MDB_SUCCESS; 2176 } 2177 2178 *mp = NULL; 2179 2180 /* If our dirty list is already full, we can't do anything */ 2181 if (txn->mt_dirty_room == 0) { 2182 rc = MDB_TXN_FULL; 2183 goto fail; 2184 } 2185 2186 for (op = MDB_FIRST;; op = MDB_NEXT) { 2187 MDB_val key, data; 2188 MDB_node *leaf; 2189 pgno_t *idl; 2190 2191 /* Seek a big enough contiguous page range. Prefer 2192 * pages at the tail, just truncating the list. 
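 * me_pghead[] is kept sorted in descending order, so for a request of num
 * pages (n2 = num-1) the test mop[i-n2] == pgno+n2 spots a run of num
 * consecutive page numbers; e.g. for num=3, entries ... 9 8 7 ... match
 * because mop[i-2] == mop[i]+2.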
2193 */ 2194 if (mop_len > n2) { 2195 i = mop_len; 2196 do { 2197 pgno = mop[i]; 2198 if (mop[i-n2] == pgno+n2) 2199 goto search_done; 2200 } while (--i > n2); 2201 if (--retry < 0) 2202 break; 2203 } 2204 2205 if (op == MDB_FIRST) { /* 1st iteration */ 2206 /* Prepare to fetch more and coalesce */ 2207 last = env->me_pglast; 2208 oldest = env->me_pgoldest; 2209 mdb_cursor_init(&m2, txn, FREE_DBI, NULL); 2210 if (last) { 2211 op = MDB_SET_RANGE; 2212 key.mv_data = &last; /* will look up last+1 */ 2213 key.mv_size = sizeof(last); 2214 } 2215 if (Paranoid && mc->mc_dbi == FREE_DBI) 2216 retry = -1; 2217 } 2218 if (Paranoid && retry < 0 && mop_len) 2219 break; 2220 2221 last++; 2222 /* Do not fetch more if the record will be too recent */ 2223 if (oldest <= last) { 2224 if (!found_old) { 2225 oldest = mdb_find_oldest(txn); 2226 env->me_pgoldest = oldest; 2227 found_old = 1; 2228 } 2229 if (oldest <= last) 2230 break; 2231 } 2232 rc = mdb_cursor_get(&m2, &key, NULL, op); 2233 if (rc) { 2234 if (rc == MDB_NOTFOUND) 2235 break; 2236 goto fail; 2237 } 2238 last = *(txnid_t*)key.mv_data; 2239 if (oldest <= last) { 2240 if (!found_old) { 2241 oldest = mdb_find_oldest(txn); 2242 env->me_pgoldest = oldest; 2243 found_old = 1; 2244 } 2245 if (oldest <= last) 2246 break; 2247 } 2248 np = m2.mc_pg[m2.mc_top]; 2249 leaf = NODEPTR(np, m2.mc_ki[m2.mc_top]); 2250 if ((rc = mdb_node_read(&m2, leaf, &data)) != MDB_SUCCESS) 2251 goto fail; 2252 2253 idl = (MDB_ID *) data.mv_data; 2254 i = idl[0]; 2255 if (!mop) { 2256 if (!(env->me_pghead = mop = mdb_midl_alloc(i))) { 2257 rc = ENOMEM; 2258 goto fail; 2259 } 2260 } else { 2261 if ((rc = mdb_midl_need(&env->me_pghead, i)) != 0) 2262 goto fail; 2263 mop = env->me_pghead; 2264 } 2265 env->me_pglast = last; 2266 #if (MDB_DEBUG) > 1 2267 DPRINTF(("IDL read txn %"Z"u root %"Z"u num %u", 2268 last, txn->mt_dbs[FREE_DBI].md_root, i)); 2269 for (j = i; j; j--) 2270 DPRINTF(("IDL %"Z"u", idl[j])); 2271 #endif 2272 /* Merge in descending sorted order */ 2273 mdb_midl_xmerge(mop, idl); 2274 mop_len = mop[0]; 2275 } 2276 2277 /* Use new pages from the map when nothing suitable in the freeDB */ 2278 i = 0; 2279 pgno = txn->mt_next_pgno; 2280 if (pgno + num >= env->me_maxpg) { 2281 DPUTS("DB size maxed out"); 2282 rc = MDB_MAP_FULL; 2283 goto fail; 2284 } 2285 2286 search_done: 2287 if (env->me_flags & MDB_WRITEMAP) { 2288 np = (MDB_page *)(env->me_map + env->me_psize * pgno); 2289 } else { 2290 if (!(np = mdb_page_malloc(txn, num))) { 2291 rc = ENOMEM; 2292 goto fail; 2293 } 2294 } 2295 if (i) { 2296 mop[0] = mop_len -= num; 2297 /* Move any stragglers down */ 2298 for (j = i-num; j < mop_len; ) 2299 mop[++j] = mop[++i]; 2300 } else { 2301 txn->mt_next_pgno = pgno + num; 2302 } 2303 np->mp_pgno = pgno; 2304 mdb_page_dirty(txn, np); 2305 *mp = np; 2306 2307 return MDB_SUCCESS; 2308 2309 fail: 2310 txn->mt_flags |= MDB_TXN_ERROR; 2311 return rc; 2312 } 2313 2314 /** Copy the used portions of a non-overflow page. 2315 * @param[in] dst page to copy into 2316 * @param[in] src page to copy from 2317 * @param[in] psize size of a page 2318 */ 2319 static void 2320 mdb_page_copy(MDB_page *dst, MDB_page *src, unsigned int psize) 2321 { 2322 enum { Align = sizeof(pgno_t) }; 2323 indx_t upper = src->mp_upper, lower = src->mp_lower, unused = upper-lower; 2324 2325 /* If page isn't full, just copy the used portion. Adjust 2326 * alignment so memcpy may copy words instead of bytes. 
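 * i.e. copy the page header plus the mp_ptrs[] slots growing up from the
 * bottom (through mp_lower) and the node data packed down from the top
 * (from mp_upper to the end of the page), skipping the unused gap between.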
2327 */ 2328 if ((unused &= -Align) && !IS_LEAF2(src)) { 2329 upper = (upper + PAGEBASE) & -Align; 2330 memcpy(dst, src, (lower + PAGEBASE + (Align-1)) & -Align); 2331 memcpy((pgno_t *)((char *)dst+upper), (pgno_t *)((char *)src+upper), 2332 psize - upper); 2333 } else { 2334 memcpy(dst, src, psize - unused); 2335 } 2336 } 2337 2338 /** Pull a page off the txn's spill list, if present. 2339 * If a page being referenced was spilled to disk in this txn, bring 2340 * it back and make it dirty/writable again. 2341 * @param[in] txn the transaction handle. 2342 * @param[in] mp the page being referenced. It must not be dirty. 2343 * @param[out] ret the writable page, if any. ret is unchanged if 2344 * mp wasn't spilled. 2345 */ 2346 static int 2347 mdb_page_unspill(MDB_txn *txn, MDB_page *mp, MDB_page **ret) 2348 { 2349 MDB_env *env = txn->mt_env; 2350 const MDB_txn *tx2; 2351 unsigned x; 2352 pgno_t pgno = mp->mp_pgno, pn = pgno << 1; 2353 2354 for (tx2 = txn; tx2; tx2=tx2->mt_parent) { 2355 if (!tx2->mt_spill_pgs) 2356 continue; 2357 x = mdb_midl_search(tx2->mt_spill_pgs, pn); 2358 if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) { 2359 MDB_page *np; 2360 int num; 2361 if (txn->mt_dirty_room == 0) 2362 return MDB_TXN_FULL; 2363 if (IS_OVERFLOW(mp)) 2364 num = mp->mp_pages; 2365 else 2366 num = 1; 2367 if (env->me_flags & MDB_WRITEMAP) { 2368 np = mp; 2369 } else { 2370 np = mdb_page_malloc(txn, num); 2371 if (!np) 2372 return ENOMEM; 2373 if (num > 1) 2374 memcpy(np, mp, num * env->me_psize); 2375 else 2376 mdb_page_copy(np, mp, env->me_psize); 2377 } 2378 if (tx2 == txn) { 2379 /* If in current txn, this page is no longer spilled. 2380 * If it happens to be the last page, truncate the spill list. 2381 * Otherwise mark it as deleted by setting the LSB. 2382 */ 2383 if (x == txn->mt_spill_pgs[0]) 2384 txn->mt_spill_pgs[0]--; 2385 else 2386 txn->mt_spill_pgs[x] |= 1; 2387 } /* otherwise, if belonging to a parent txn, the 2388 * page remains spilled until child commits 2389 */ 2390 2391 mdb_page_dirty(txn, np); 2392 np->mp_flags |= P_DIRTY; 2393 *ret = np; 2394 break; 2395 } 2396 } 2397 return MDB_SUCCESS; 2398 } 2399 2400 /** Touch a page: make it dirty and re-insert into tree with updated pgno. 2401 * Set #MDB_TXN_ERROR on failure. 2402 * @param[in] mc cursor pointing to the page to be touched 2403 * @return 0 on success, non-zero on failure. 
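 * This is the copy-on-write step: a clean page is reallocated under a new
 * pgno and its old pgno is added to mt_free_pgs, while in a child txn a page
 * already dirtied by the parent is duplicated under its original pgno.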
2404 */ 2405 static int 2406 mdb_page_touch(MDB_cursor *mc) 2407 { 2408 MDB_page *mp = mc->mc_pg[mc->mc_top], *np; 2409 MDB_txn *txn = mc->mc_txn; 2410 MDB_cursor *m2, *m3; 2411 pgno_t pgno; 2412 int rc; 2413 2414 if (!F_ISSET(mp->mp_flags, P_DIRTY)) { 2415 if (txn->mt_flags & MDB_TXN_SPILLS) { 2416 np = NULL; 2417 rc = mdb_page_unspill(txn, mp, &np); 2418 if (rc) 2419 goto fail; 2420 if (np) 2421 goto done; 2422 } 2423 if ((rc = mdb_midl_need(&txn->mt_free_pgs, 1)) || 2424 (rc = mdb_page_alloc(mc, 1, &np))) 2425 goto fail; 2426 pgno = np->mp_pgno; 2427 DPRINTF(("touched db %d page %"Z"u -> %"Z"u", DDBI(mc), 2428 mp->mp_pgno, pgno)); 2429 mdb_cassert(mc, mp->mp_pgno != pgno); 2430 mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); 2431 /* Update the parent page, if any, to point to the new page */ 2432 if (mc->mc_top) { 2433 MDB_page *parent = mc->mc_pg[mc->mc_top-1]; 2434 MDB_node *node = NODEPTR(parent, mc->mc_ki[mc->mc_top-1]); 2435 SETPGNO(node, pgno); 2436 } else { 2437 mc->mc_db->md_root = pgno; 2438 } 2439 } else if (txn->mt_parent && !IS_SUBP(mp)) { 2440 MDB_ID2 mid, *dl = txn->mt_u.dirty_list; 2441 pgno = mp->mp_pgno; 2442 /* If txn has a parent, make sure the page is in our 2443 * dirty list. 2444 */ 2445 if (dl[0].mid) { 2446 unsigned x = mdb_mid2l_search(dl, pgno); 2447 if (x <= dl[0].mid && dl[x].mid == pgno) { 2448 if (mp != dl[x].mptr) { /* bad cursor? */ 2449 mc->mc_flags &= ~(C_INITIALIZED|C_EOF); 2450 txn->mt_flags |= MDB_TXN_ERROR; 2451 return MDB_CORRUPTED; 2452 } 2453 return 0; 2454 } 2455 } 2456 mdb_cassert(mc, dl[0].mid < MDB_IDL_UM_MAX); 2457 /* No - copy it */ 2458 np = mdb_page_malloc(txn, 1); 2459 if (!np) 2460 return ENOMEM; 2461 mid.mid = pgno; 2462 mid.mptr = np; 2463 rc = mdb_mid2l_insert(dl, &mid); 2464 mdb_cassert(mc, rc == 0); 2465 } else { 2466 return 0; 2467 } 2468 2469 mdb_page_copy(np, mp, txn->mt_env->me_psize); 2470 np->mp_pgno = pgno; 2471 np->mp_flags |= P_DIRTY; 2472 2473 done: 2474 /* Adjust cursors pointing to mp */ 2475 mc->mc_pg[mc->mc_top] = np; 2476 m2 = txn->mt_cursors[mc->mc_dbi]; 2477 if (mc->mc_flags & C_SUB) { 2478 for (; m2; m2=m2->mc_next) { 2479 m3 = &m2->mc_xcursor->mx_cursor; 2480 if (m3->mc_snum < mc->mc_snum) continue; 2481 if (m3->mc_pg[mc->mc_top] == mp) 2482 m3->mc_pg[mc->mc_top] = np; 2483 } 2484 } else { 2485 for (; m2; m2=m2->mc_next) { 2486 if (m2->mc_snum < mc->mc_snum) continue; 2487 if (m2 == mc) continue; 2488 if (m2->mc_pg[mc->mc_top] == mp) { 2489 m2->mc_pg[mc->mc_top] = np; 2490 if (XCURSOR_INITED(m2) && IS_LEAF(np)) 2491 XCURSOR_REFRESH(m2, np, m2->mc_ki[mc->mc_top]); 2492 } 2493 } 2494 } 2495 return 0; 2496 2497 fail: 2498 txn->mt_flags |= MDB_TXN_ERROR; 2499 return rc; 2500 } 2501 2502 int 2503 mdb_env_sync(MDB_env *env, int force) 2504 { 2505 int rc = 0; 2506 if (env->me_flags & MDB_RDONLY) 2507 return EACCES; 2508 if (force || !F_ISSET(env->me_flags, MDB_NOSYNC)) { 2509 if (env->me_flags & MDB_WRITEMAP) { 2510 int flags = ((env->me_flags & MDB_MAPASYNC) && !force) 2511 ? 
MS_ASYNC : MS_SYNC; 2512 if (MDB_MSYNC(env->me_map, env->me_mapsize, flags)) 2513 rc = ErrCode(); 2514 #ifdef _WIN32 2515 else if (flags == MS_SYNC && MDB_FDATASYNC(env->me_fd)) 2516 rc = ErrCode(); 2517 #endif 2518 } else { 2519 #ifdef BROKEN_FDATASYNC 2520 if (env->me_flags & MDB_FSYNCONLY) { 2521 if (fsync(env->me_fd)) 2522 rc = ErrCode(); 2523 } else 2524 #endif 2525 if (MDB_FDATASYNC(env->me_fd)) 2526 rc = ErrCode(); 2527 } 2528 } 2529 return rc; 2530 } 2531 2532 /** Back up parent txn's cursors, then grab the originals for tracking */ 2533 static int 2534 mdb_cursor_shadow(MDB_txn *src, MDB_txn *dst) 2535 { 2536 MDB_cursor *mc, *bk; 2537 MDB_xcursor *mx; 2538 size_t size; 2539 int i; 2540 2541 for (i = src->mt_numdbs; --i >= 0; ) { 2542 if ((mc = src->mt_cursors[i]) != NULL) { 2543 size = sizeof(MDB_cursor); 2544 if (mc->mc_xcursor) 2545 size += sizeof(MDB_xcursor); 2546 for (; mc; mc = bk->mc_next) { 2547 bk = malloc(size); 2548 if (!bk) 2549 return ENOMEM; 2550 *bk = *mc; 2551 mc->mc_backup = bk; 2552 mc->mc_db = &dst->mt_dbs[i]; 2553 /* Kill pointers into src to reduce abuse: The 2554 * user may not use mc until dst ends. But we need a valid 2555 * txn pointer here for cursor fixups to keep working. 2556 */ 2557 mc->mc_txn = dst; 2558 mc->mc_dbflag = &dst->mt_dbflags[i]; 2559 if ((mx = mc->mc_xcursor) != NULL) { 2560 *(MDB_xcursor *)(bk+1) = *mx; 2561 mx->mx_cursor.mc_txn = dst; 2562 } 2563 mc->mc_next = dst->mt_cursors[i]; 2564 dst->mt_cursors[i] = mc; 2565 } 2566 } 2567 } 2568 return MDB_SUCCESS; 2569 } 2570 2571 /** Close this write txn's cursors, give parent txn's cursors back to parent. 2572 * @param[in] txn the transaction handle. 2573 * @param[in] merge true to keep changes to parent cursors, false to revert. 2574 * @return 0 on success, non-zero on failure. 2575 */ 2576 static void 2577 mdb_cursors_close(MDB_txn *txn, unsigned merge) 2578 { 2579 MDB_cursor **cursors = txn->mt_cursors, *mc, *next, *bk; 2580 MDB_xcursor *mx; 2581 int i; 2582 2583 for (i = txn->mt_numdbs; --i >= 0; ) { 2584 for (mc = cursors[i]; mc; mc = next) { 2585 next = mc->mc_next; 2586 if ((bk = mc->mc_backup) != NULL) { 2587 if (merge) { 2588 /* Commit changes to parent txn */ 2589 mc->mc_next = bk->mc_next; 2590 mc->mc_backup = bk->mc_backup; 2591 mc->mc_txn = bk->mc_txn; 2592 mc->mc_db = bk->mc_db; 2593 mc->mc_dbflag = bk->mc_dbflag; 2594 if ((mx = mc->mc_xcursor) != NULL) 2595 mx->mx_cursor.mc_txn = bk->mc_txn; 2596 } else { 2597 /* Abort nested txn */ 2598 *mc = *bk; 2599 if ((mx = mc->mc_xcursor) != NULL) 2600 *mx = *(MDB_xcursor *)(bk+1); 2601 } 2602 mc = bk; 2603 } 2604 /* Only malloced cursors are permanently tracked. */ 2605 free(mc); 2606 } 2607 cursors[i] = NULL; 2608 } 2609 } 2610 2611 #if !(MDB_PIDLOCK) /* Currently the same as defined(_WIN32) */ 2612 enum Pidlock_op { 2613 Pidset, Pidcheck 2614 }; 2615 #else 2616 enum Pidlock_op { 2617 Pidset = F_SETLK, Pidcheck = F_GETLK 2618 }; 2619 #endif 2620 2621 /** Set or check a pid lock. Set returns 0 on success. 2622 * Check returns 0 if the process is certainly dead, nonzero if it may 2623 * be alive (the lock exists or an error happened so we do not know). 2624 * 2625 * On Windows Pidset is a no-op, we merely check for the existence 2626 * of the process with the given pid. On POSIX we use a single byte 2627 * lock on the lockfile, set at an offset equal to the pid. 
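 * Each live reader process keeps an F_WRLCK on the single byte at offset
 * pid; Pidcheck issues F_GETLK on that byte and reports the process dead
 * only when the kernel says the range is unlocked.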
2628 */ 2629 static int 2630 mdb_reader_pid(MDB_env *env, enum Pidlock_op op, MDB_PID_T pid) 2631 { 2632 #if !(MDB_PIDLOCK) /* Currently the same as defined(_WIN32) */ 2633 int ret = 0; 2634 HANDLE h; 2635 if (op == Pidcheck) { 2636 h = OpenProcess(env->me_pidquery, FALSE, pid); 2637 /* No documented "no such process" code, but other program use this: */ 2638 if (!h) 2639 return ErrCode() != ERROR_INVALID_PARAMETER; 2640 /* A process exists until all handles to it close. Has it exited? */ 2641 ret = WaitForSingleObject(h, 0) != 0; 2642 CloseHandle(h); 2643 } 2644 return ret; 2645 #else 2646 for (;;) { 2647 int rc; 2648 struct flock lock_info; 2649 memset(&lock_info, 0, sizeof(lock_info)); 2650 lock_info.l_type = F_WRLCK; 2651 lock_info.l_whence = SEEK_SET; 2652 lock_info.l_start = pid; 2653 lock_info.l_len = 1; 2654 if ((rc = fcntl(env->me_lfd, op, &lock_info)) == 0) { 2655 if (op == F_GETLK && lock_info.l_type != F_UNLCK) 2656 rc = -1; 2657 } else if ((rc = ErrCode()) == EINTR) { 2658 continue; 2659 } 2660 return rc; 2661 } 2662 #endif 2663 } 2664 2665 /** Common code for #mdb_txn_begin() and #mdb_txn_renew(). 2666 * @param[in] txn the transaction handle to initialize 2667 * @return 0 on success, non-zero on failure. 2668 */ 2669 static int 2670 mdb_txn_renew0(MDB_txn *txn) 2671 { 2672 MDB_env *env = txn->mt_env; 2673 MDB_txninfo *ti = env->me_txns; 2674 MDB_meta *meta; 2675 unsigned int i, nr, flags = txn->mt_flags; 2676 uint16_t x; 2677 int rc, new_notls = 0; 2678 2679 if ((flags &= MDB_TXN_RDONLY) != 0) { 2680 if (!ti) { 2681 meta = mdb_env_pick_meta(env); 2682 txn->mt_txnid = meta->mm_txnid; 2683 txn->mt_u.reader = NULL; 2684 } else { 2685 MDB_reader *r = (env->me_flags & MDB_NOTLS) ? txn->mt_u.reader : 2686 pthread_getspecific(env->me_txkey); 2687 if (r) { 2688 if (r->mr_pid != env->me_pid || r->mr_txnid != (txnid_t)-1) 2689 return MDB_BAD_RSLOT; 2690 } else { 2691 MDB_PID_T pid = env->me_pid; 2692 MDB_THR_T tid = pthread_self(); 2693 mdb_mutexref_t rmutex = env->me_rmutex; 2694 2695 if (!env->me_live_reader) { 2696 rc = mdb_reader_pid(env, Pidset, pid); 2697 if (rc) 2698 return rc; 2699 env->me_live_reader = 1; 2700 } 2701 2702 if (LOCK_MUTEX(rc, env, rmutex)) 2703 return rc; 2704 nr = ti->mti_numreaders; 2705 for (i=0; i<nr; i++) 2706 if (ti->mti_readers[i].mr_pid == 0) 2707 break; 2708 if (i == env->me_maxreaders) { 2709 UNLOCK_MUTEX(rmutex); 2710 return MDB_READERS_FULL; 2711 } 2712 r = &ti->mti_readers[i]; 2713 /* Claim the reader slot, carefully since other code 2714 * uses the reader table un-mutexed: First reset the 2715 * slot, next publish it in mti_numreaders. After 2716 * that, it is safe for mdb_env_close() to touch it. 2717 * When it will be closed, we can finally claim it. 2718 */ 2719 r->mr_pid = 0; 2720 r->mr_txnid = (txnid_t)-1; 2721 r->mr_tid = tid; 2722 if (i == nr) 2723 ti->mti_numreaders = ++nr; 2724 env->me_close_readers = nr; 2725 r->mr_pid = pid; 2726 UNLOCK_MUTEX(rmutex); 2727 2728 new_notls = (env->me_flags & MDB_NOTLS); 2729 if (!new_notls && (rc=pthread_setspecific(env->me_txkey, r))) { 2730 r->mr_pid = 0; 2731 return rc; 2732 } 2733 } 2734 do /* LY: Retry on a race, ITS#7970. 
*/ 2735 r->mr_txnid = ti->mti_txnid; 2736 while(r->mr_txnid != ti->mti_txnid); 2737 txn->mt_txnid = r->mr_txnid; 2738 txn->mt_u.reader = r; 2739 meta = env->me_metas[txn->mt_txnid & 1]; 2740 } 2741 2742 } else { 2743 /* Not yet touching txn == env->me_txn0, it may be active */ 2744 if (ti) { 2745 if (LOCK_MUTEX(rc, env, env->me_wmutex)) 2746 return rc; 2747 txn->mt_txnid = ti->mti_txnid; 2748 meta = env->me_metas[txn->mt_txnid & 1]; 2749 } else { 2750 meta = mdb_env_pick_meta(env); 2751 txn->mt_txnid = meta->mm_txnid; 2752 } 2753 txn->mt_txnid++; 2754 #if MDB_DEBUG 2755 if (txn->mt_txnid == mdb_debug_start) 2756 mdb_debug = 1; 2757 #endif 2758 txn->mt_child = NULL; 2759 txn->mt_loose_pgs = NULL; 2760 txn->mt_loose_count = 0; 2761 txn->mt_dirty_room = MDB_IDL_UM_MAX; 2762 txn->mt_u.dirty_list = env->me_dirty_list; 2763 txn->mt_u.dirty_list[0].mid = 0; 2764 txn->mt_free_pgs = env->me_free_pgs; 2765 txn->mt_free_pgs[0] = 0; 2766 txn->mt_spill_pgs = NULL; 2767 env->me_txn = txn; 2768 memcpy(txn->mt_dbiseqs, env->me_dbiseqs, env->me_maxdbs * sizeof(unsigned int)); 2769 } 2770 2771 /* Copy the DB info and flags */ 2772 memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDB_db)); 2773 2774 /* Moved to here to avoid a data race in read TXNs */ 2775 txn->mt_next_pgno = meta->mm_last_pg+1; 2776 2777 txn->mt_flags = flags; 2778 2779 /* Setup db info */ 2780 txn->mt_numdbs = env->me_numdbs; 2781 for (i=CORE_DBS; i<txn->mt_numdbs; i++) { 2782 x = env->me_dbflags[i]; 2783 txn->mt_dbs[i].md_flags = x & PERSISTENT_FLAGS; 2784 txn->mt_dbflags[i] = (x & MDB_VALID) ? DB_VALID|DB_USRVALID|DB_STALE : 0; 2785 } 2786 txn->mt_dbflags[MAIN_DBI] = DB_VALID|DB_USRVALID; 2787 txn->mt_dbflags[FREE_DBI] = DB_VALID; 2788 2789 if (env->me_flags & MDB_FATAL_ERROR) { 2790 DPUTS("environment had fatal error, must shutdown!"); 2791 rc = MDB_PANIC; 2792 } else if (env->me_maxpg < txn->mt_next_pgno) { 2793 rc = MDB_MAP_RESIZED; 2794 } else { 2795 return MDB_SUCCESS; 2796 } 2797 mdb_txn_end(txn, new_notls /*0 or MDB_END_SLOT*/ | MDB_END_FAIL_BEGIN); 2798 return rc; 2799 } 2800 2801 int 2802 mdb_txn_renew(MDB_txn *txn) 2803 { 2804 int rc; 2805 2806 if (!txn || !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY|MDB_TXN_FINISHED)) 2807 return EINVAL; 2808 2809 rc = mdb_txn_renew0(txn); 2810 if (rc == MDB_SUCCESS) { 2811 DPRINTF(("renew txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", 2812 txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', 2813 (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root)); 2814 } 2815 return rc; 2816 } 2817 2818 int 2819 mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) 2820 { 2821 MDB_txn *txn; 2822 MDB_ntxn *ntxn; 2823 int rc, size, tsize; 2824 2825 flags &= MDB_TXN_BEGIN_FLAGS; 2826 flags |= env->me_flags & MDB_WRITEMAP; 2827 2828 if (env->me_flags & MDB_RDONLY & ~flags) /* write txn in RDONLY env */ 2829 return EACCES; 2830 2831 if (parent) { 2832 /* Nested transactions: Max 1 child, write txns only, no writemap */ 2833 flags |= parent->mt_flags; 2834 if (flags & (MDB_RDONLY|MDB_WRITEMAP|MDB_TXN_BLOCKED)) { 2835 return (parent->mt_flags & MDB_TXN_RDONLY) ? EINVAL : MDB_BAD_TXN; 2836 } 2837 /* Child txns save MDB_pgstate and use own copy of cursors */ 2838 size = env->me_maxdbs * (sizeof(MDB_db)+sizeof(MDB_cursor *)+1); 2839 size += tsize = sizeof(MDB_ntxn); 2840 } else if (flags & MDB_RDONLY) { 2841 size = env->me_maxdbs * (sizeof(MDB_db)+1); 2842 size += tsize = sizeof(MDB_txn); 2843 } else { 2844 /* Reuse preallocated write txn. 
However, do not touch it until 2845 * mdb_txn_renew0() succeeds, since it currently may be active. 2846 */ 2847 txn = env->me_txn0; 2848 goto renew; 2849 } 2850 if ((txn = calloc(1, size)) == NULL) { 2851 DPRINTF(("calloc: %s", strerror(errno))); 2852 return ENOMEM; 2853 } 2854 txn->mt_dbxs = env->me_dbxs; /* static */ 2855 txn->mt_dbs = (MDB_db *) ((char *)txn + tsize); 2856 txn->mt_dbflags = (unsigned char *)txn + size - env->me_maxdbs; 2857 txn->mt_flags = flags; 2858 txn->mt_env = env; 2859 2860 if (parent) { 2861 unsigned int i; 2862 txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs); 2863 txn->mt_dbiseqs = parent->mt_dbiseqs; 2864 txn->mt_u.dirty_list = malloc(sizeof(MDB_ID2)*MDB_IDL_UM_SIZE); 2865 if (!txn->mt_u.dirty_list || 2866 !(txn->mt_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX))) 2867 { 2868 free(txn->mt_u.dirty_list); 2869 free(txn); 2870 return ENOMEM; 2871 } 2872 txn->mt_txnid = parent->mt_txnid; 2873 txn->mt_dirty_room = parent->mt_dirty_room; 2874 txn->mt_u.dirty_list[0].mid = 0; 2875 txn->mt_spill_pgs = NULL; 2876 txn->mt_next_pgno = parent->mt_next_pgno; 2877 parent->mt_flags |= MDB_TXN_HAS_CHILD; 2878 parent->mt_child = txn; 2879 txn->mt_parent = parent; 2880 txn->mt_numdbs = parent->mt_numdbs; 2881 memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); 2882 /* Copy parent's mt_dbflags, but clear DB_NEW */ 2883 for (i=0; i<txn->mt_numdbs; i++) 2884 txn->mt_dbflags[i] = parent->mt_dbflags[i] & ~DB_NEW; 2885 rc = 0; 2886 ntxn = (MDB_ntxn *)txn; 2887 ntxn->mnt_pgstate = env->me_pgstate; /* save parent me_pghead & co */ 2888 if (env->me_pghead) { 2889 size = MDB_IDL_SIZEOF(env->me_pghead); 2890 env->me_pghead = mdb_midl_alloc(env->me_pghead[0]); 2891 if (env->me_pghead) 2892 memcpy(env->me_pghead, ntxn->mnt_pgstate.mf_pghead, size); 2893 else 2894 rc = ENOMEM; 2895 } 2896 if (!rc) 2897 rc = mdb_cursor_shadow(parent, txn); 2898 if (rc) 2899 mdb_txn_end(txn, MDB_END_FAIL_BEGINCHILD); 2900 } else { /* MDB_RDONLY */ 2901 txn->mt_dbiseqs = env->me_dbiseqs; 2902 renew: 2903 rc = mdb_txn_renew0(txn); 2904 } 2905 if (rc) { 2906 if (txn != env->me_txn0) 2907 free(txn); 2908 } else { 2909 txn->mt_flags |= flags; /* could not change txn=me_txn0 earlier */ 2910 *ret = txn; 2911 DPRINTF(("begin txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", 2912 txn->mt_txnid, (flags & MDB_RDONLY) ? 'r' : 'w', 2913 (void *) txn, (void *) env, txn->mt_dbs[MAIN_DBI].md_root)); 2914 } 2915 2916 return rc; 2917 } 2918 2919 MDB_env * 2920 mdb_txn_env(MDB_txn *txn) 2921 { 2922 if(!txn) return NULL; 2923 return txn->mt_env; 2924 } 2925 2926 size_t 2927 mdb_txn_id(MDB_txn *txn) 2928 { 2929 if(!txn) return 0; 2930 return txn->mt_txnid; 2931 } 2932 2933 /** Export or close DBI handles opened in this txn. */ 2934 static void 2935 mdb_dbis_update(MDB_txn *txn, int keep) 2936 { 2937 int i; 2938 MDB_dbi n = txn->mt_numdbs; 2939 MDB_env *env = txn->mt_env; 2940 unsigned char *tdbflags = txn->mt_dbflags; 2941 2942 for (i = n; --i >= CORE_DBS;) { 2943 if (tdbflags[i] & DB_NEW) { 2944 if (keep) { 2945 env->me_dbflags[i] = txn->mt_dbs[i].md_flags | MDB_VALID; 2946 } else { 2947 char *ptr = env->me_dbxs[i].md_name.mv_data; 2948 if (ptr) { 2949 env->me_dbxs[i].md_name.mv_data = NULL; 2950 env->me_dbxs[i].md_name.mv_size = 0; 2951 env->me_dbflags[i] = 0; 2952 env->me_dbiseqs[i]++; 2953 free(ptr); 2954 } 2955 } 2956 } 2957 } 2958 if (keep && env->me_numdbs < n) 2959 env->me_numdbs = n; 2960 } 2961 2962 /** End a transaction, except successful commit of a nested transaction. 
2963 * May be called twice for readonly txns: First reset it, then abort. 2964 * @param[in] txn the transaction handle to end 2965 * @param[in] mode why and how to end the transaction 2966 */ 2967 static void 2968 mdb_txn_end(MDB_txn *txn, unsigned mode) 2969 { 2970 MDB_env *env = txn->mt_env; 2971 #if MDB_DEBUG 2972 static const char *const names[] = MDB_END_NAMES; 2973 #endif 2974 2975 /* Export or close DBI handles opened in this txn */ 2976 mdb_dbis_update(txn, mode & MDB_END_UPDATE); 2977 2978 DPRINTF(("%s txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", 2979 names[mode & MDB_END_OPMASK], 2980 txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', 2981 (void *) txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root)); 2982 2983 if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { 2984 if (txn->mt_u.reader) { 2985 txn->mt_u.reader->mr_txnid = (txnid_t)-1; 2986 if (!(env->me_flags & MDB_NOTLS)) { 2987 txn->mt_u.reader = NULL; /* txn does not own reader */ 2988 } else if (mode & MDB_END_SLOT) { 2989 txn->mt_u.reader->mr_pid = 0; 2990 txn->mt_u.reader = NULL; 2991 } /* else txn owns the slot until it does MDB_END_SLOT */ 2992 } 2993 txn->mt_numdbs = 0; /* prevent further DBI activity */ 2994 txn->mt_flags |= MDB_TXN_FINISHED; 2995 2996 } else if (!F_ISSET(txn->mt_flags, MDB_TXN_FINISHED)) { 2997 pgno_t *pghead = env->me_pghead; 2998 2999 if (!(mode & MDB_END_UPDATE)) /* !(already closed cursors) */ 3000 mdb_cursors_close(txn, 0); 3001 if (!(env->me_flags & MDB_WRITEMAP)) { 3002 mdb_dlist_free(txn); 3003 } 3004 3005 txn->mt_numdbs = 0; 3006 txn->mt_flags = MDB_TXN_FINISHED; 3007 3008 if (!txn->mt_parent) { 3009 mdb_midl_shrink(&txn->mt_free_pgs); 3010 env->me_free_pgs = txn->mt_free_pgs; 3011 /* me_pgstate: */ 3012 env->me_pghead = NULL; 3013 env->me_pglast = 0; 3014 3015 env->me_txn = NULL; 3016 mode = 0; /* txn == env->me_txn0, do not free() it */ 3017 3018 /* The writer mutex was locked in mdb_txn_begin. */ 3019 if (env->me_txns) 3020 UNLOCK_MUTEX(env->me_wmutex); 3021 } else { 3022 txn->mt_parent->mt_child = NULL; 3023 txn->mt_parent->mt_flags &= ~MDB_TXN_HAS_CHILD; 3024 env->me_pgstate = ((MDB_ntxn *)txn)->mnt_pgstate; 3025 mdb_midl_free(txn->mt_free_pgs); 3026 mdb_midl_free(txn->mt_spill_pgs); 3027 free(txn->mt_u.dirty_list); 3028 } 3029 3030 mdb_midl_free(pghead); 3031 } 3032 3033 if (mode & MDB_END_FREE) 3034 free(txn); 3035 } 3036 3037 void 3038 mdb_txn_reset(MDB_txn *txn) 3039 { 3040 if (txn == NULL) 3041 return; 3042 3043 /* This call is only valid for read-only txns */ 3044 if (!(txn->mt_flags & MDB_TXN_RDONLY)) 3045 return; 3046 3047 mdb_txn_end(txn, MDB_END_RESET); 3048 } 3049 3050 void 3051 mdb_txn_abort(MDB_txn *txn) 3052 { 3053 if (txn == NULL) 3054 return; 3055 3056 if (txn->mt_child) 3057 mdb_txn_abort(txn->mt_child); 3058 3059 mdb_txn_end(txn, MDB_END_ABORT|MDB_END_SLOT|MDB_END_FREE); 3060 } 3061 3062 /** Save the freelist as of this transaction to the freeDB. 3063 * This changes the freelist. Keep trying until it stabilizes. 3064 */ 3065 static int 3066 mdb_freelist_save(MDB_txn *txn) 3067 { 3068 /* env->me_pghead[] can grow and shrink during this call. 3069 * env->me_pglast and txn->mt_free_pgs[] can only grow. 3070 * Page numbers cannot disappear from txn->mt_free_pgs[]. 
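 * That is why the retry loop below re-reads them after every cursor
 * Put()/Del() and only exits once enough room has been reserved for a
 * freelist that has stopped changing.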
3071 */ 3072 MDB_cursor mc; 3073 MDB_env *env = txn->mt_env; 3074 int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1; 3075 txnid_t pglast = 0, head_id = 0; 3076 pgno_t freecnt = 0, *free_pgs, *mop; 3077 ssize_t head_room = 0, total_room = 0, mop_len, clean_limit; 3078 3079 mdb_cursor_init(&mc, txn, FREE_DBI, NULL); 3080 3081 if (env->me_pghead) { 3082 /* Make sure first page of freeDB is touched and on freelist */ 3083 rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST|MDB_PS_MODIFY); 3084 if (rc && rc != MDB_NOTFOUND) 3085 return rc; 3086 } 3087 3088 if (!env->me_pghead && txn->mt_loose_pgs) { 3089 /* Put loose page numbers in mt_free_pgs, since 3090 * we may be unable to return them to me_pghead. 3091 */ 3092 MDB_page *mp = txn->mt_loose_pgs; 3093 if ((rc = mdb_midl_need(&txn->mt_free_pgs, txn->mt_loose_count)) != 0) 3094 return rc; 3095 for (; mp; mp = NEXT_LOOSE_PAGE(mp)) 3096 mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); 3097 txn->mt_loose_pgs = NULL; 3098 txn->mt_loose_count = 0; 3099 } 3100 3101 /* MDB_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP) */ 3102 clean_limit = (env->me_flags & (MDB_NOMEMINIT|MDB_WRITEMAP)) 3103 ? SSIZE_MAX : maxfree_1pg; 3104 3105 for (;;) { 3106 /* Come back here after each Put() in case freelist changed */ 3107 MDB_val key, data; 3108 pgno_t *pgs; 3109 ssize_t j; 3110 3111 /* If using records from freeDB which we have not yet 3112 * deleted, delete them and any we reserved for me_pghead. 3113 */ 3114 while (pglast < env->me_pglast) { 3115 rc = mdb_cursor_first(&mc, &key, NULL); 3116 if (rc) 3117 return rc; 3118 pglast = head_id = *(txnid_t *)key.mv_data; 3119 total_room = head_room = 0; 3120 mdb_tassert(txn, pglast <= env->me_pglast); 3121 rc = mdb_cursor_del(&mc, 0); 3122 if (rc) 3123 return rc; 3124 } 3125 3126 /* Save the IDL of pages freed by this txn, to a single record */ 3127 if (freecnt < txn->mt_free_pgs[0]) { 3128 if (!freecnt) { 3129 /* Make sure last page of freeDB is touched and on freelist */ 3130 rc = mdb_page_search(&mc, NULL, MDB_PS_LAST|MDB_PS_MODIFY); 3131 if (rc && rc != MDB_NOTFOUND) 3132 return rc; 3133 } 3134 free_pgs = txn->mt_free_pgs; 3135 /* Write to last page of freeDB */ 3136 key.mv_size = sizeof(txn->mt_txnid); 3137 key.mv_data = &txn->mt_txnid; 3138 do { 3139 freecnt = free_pgs[0]; 3140 data.mv_size = MDB_IDL_SIZEOF(free_pgs); 3141 rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); 3142 if (rc) 3143 return rc; 3144 /* Retry if mt_free_pgs[] grew during the Put() */ 3145 free_pgs = txn->mt_free_pgs; 3146 } while (freecnt < free_pgs[0]); 3147 mdb_midl_sort(free_pgs); 3148 memcpy(data.mv_data, free_pgs, data.mv_size); 3149 #if (MDB_DEBUG) > 1 3150 { 3151 unsigned int i = free_pgs[0]; 3152 DPRINTF(("IDL write txn %"Z"u root %"Z"u num %u", 3153 txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i)); 3154 for (; i; i--) 3155 DPRINTF(("IDL %"Z"u", free_pgs[i])); 3156 } 3157 #endif 3158 continue; 3159 } 3160 3161 mop = env->me_pghead; 3162 mop_len = (mop ? mop[0] : 0) + txn->mt_loose_count; 3163 3164 /* Reserve records for me_pghead[]. Split it if multi-page, 3165 * to avoid searching freeDB for a page range. Use keys in 3166 * range [1,me_pglast]: Smaller than txnid of oldest reader. 
3167 */ 3168 if (total_room >= mop_len) { 3169 if (total_room == mop_len || --more < 0) 3170 break; 3171 } else if (head_room >= maxfree_1pg && head_id > 1) { 3172 /* Keep current record (overflow page), add a new one */ 3173 head_id--; 3174 head_room = 0; 3175 } 3176 /* (Re)write {key = head_id, IDL length = head_room} */ 3177 total_room -= head_room; 3178 head_room = mop_len - total_room; 3179 if (head_room > maxfree_1pg && head_id > 1) { 3180 /* Overflow multi-page for part of me_pghead */ 3181 head_room /= head_id; /* amortize page sizes */ 3182 head_room += maxfree_1pg - head_room % (maxfree_1pg + 1); 3183 } else if (head_room < 0) { 3184 /* Rare case, not bothering to delete this record */ 3185 head_room = 0; 3186 } 3187 key.mv_size = sizeof(head_id); 3188 key.mv_data = &head_id; 3189 data.mv_size = (head_room + 1) * sizeof(pgno_t); 3190 rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); 3191 if (rc) 3192 return rc; 3193 /* IDL is initially empty, zero out at least the length */ 3194 pgs = (pgno_t *)data.mv_data; 3195 j = head_room > clean_limit ? head_room : 0; 3196 do { 3197 pgs[j] = 0; 3198 } while (--j >= 0); 3199 total_room += head_room; 3200 } 3201 3202 /* Return loose page numbers to me_pghead, though usually none are 3203 * left at this point. The pages themselves remain in dirty_list. 3204 */ 3205 if (txn->mt_loose_pgs) { 3206 MDB_page *mp = txn->mt_loose_pgs; 3207 unsigned count = txn->mt_loose_count; 3208 MDB_IDL loose; 3209 /* Room for loose pages + temp IDL with same */ 3210 if ((rc = mdb_midl_need(&env->me_pghead, 2*count+1)) != 0) 3211 return rc; 3212 mop = env->me_pghead; 3213 loose = mop + MDB_IDL_ALLOCLEN(mop) - count; 3214 for (count = 0; mp; mp = NEXT_LOOSE_PAGE(mp)) 3215 loose[ ++count ] = mp->mp_pgno; 3216 loose[0] = count; 3217 mdb_midl_sort(loose); 3218 mdb_midl_xmerge(mop, loose); 3219 txn->mt_loose_pgs = NULL; 3220 txn->mt_loose_count = 0; 3221 mop_len = mop[0]; 3222 } 3223 3224 /* Fill in the reserved me_pghead records */ 3225 rc = MDB_SUCCESS; 3226 if (mop_len) { 3227 MDB_val key, data; 3228 3229 mop += mop_len; 3230 rc = mdb_cursor_first(&mc, &key, &data); 3231 for (; !rc; rc = mdb_cursor_next(&mc, &key, &data, MDB_NEXT)) { 3232 txnid_t id = *(txnid_t *)key.mv_data; 3233 ssize_t len = (ssize_t)(data.mv_size / sizeof(MDB_ID)) - 1; 3234 MDB_ID save; 3235 3236 mdb_tassert(txn, len >= 0 && id <= env->me_pglast); 3237 key.mv_data = &id; 3238 if (len > mop_len) { 3239 len = mop_len; 3240 data.mv_size = (len + 1) * sizeof(MDB_ID); 3241 } 3242 data.mv_data = mop -= len; 3243 save = mop[0]; 3244 mop[0] = len; 3245 rc = mdb_cursor_put(&mc, &key, &data, MDB_CURRENT); 3246 mop[0] = save; 3247 if (rc || !(mop_len -= len)) 3248 break; 3249 } 3250 } 3251 return rc; 3252 } 3253 3254 /** Flush (some) dirty pages to the map, after clearing their dirty flag. 3255 * @param[in] txn the transaction that's being committed 3256 * @param[in] keep number of initial pages in dirty_list to keep dirty. 3257 * @return 0 on success, non-zero on failure. 
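 * On POSIX builds, runs of consecutive dirty pages are batched into an iovec
 * of at most MDB_COMMIT_PAGES entries and written with writev()/pwritev();
 * with MDB_WRITEMAP nothing is written here, only the dirty flags are
 * cleared.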
3258 */ 3259 static int 3260 mdb_page_flush(MDB_txn *txn, int keep) 3261 { 3262 MDB_env *env = txn->mt_env; 3263 MDB_ID2L dl = txn->mt_u.dirty_list; 3264 unsigned psize = env->me_psize, j; 3265 int i, pagecount = dl[0].mid, rc; 3266 size_t size = 0, pos = 0; 3267 pgno_t pgno = 0; 3268 MDB_page *dp = NULL; 3269 #ifdef _WIN32 3270 OVERLAPPED ov; 3271 #else 3272 struct iovec iov[MDB_COMMIT_PAGES]; 3273 ssize_t wpos = 0, wsize = 0, wres; 3274 size_t next_pos = 1; /* impossible pos, so pos != next_pos */ 3275 int n = 0; 3276 #endif 3277 3278 j = i = keep; 3279 3280 if (env->me_flags & MDB_WRITEMAP) { 3281 /* Clear dirty flags */ 3282 while (++i <= pagecount) { 3283 dp = dl[i].mptr; 3284 /* Don't flush this page yet */ 3285 if (dp->mp_flags & (P_LOOSE|P_KEEP)) { 3286 dp->mp_flags &= ~P_KEEP; 3287 dl[++j] = dl[i]; 3288 continue; 3289 } 3290 dp->mp_flags &= ~P_DIRTY; 3291 } 3292 goto done; 3293 } 3294 3295 /* Write the pages */ 3296 for (;;) { 3297 if (++i <= pagecount) { 3298 dp = dl[i].mptr; 3299 /* Don't flush this page yet */ 3300 if (dp->mp_flags & (P_LOOSE|P_KEEP)) { 3301 dp->mp_flags &= ~P_KEEP; 3302 dl[i].mid = 0; 3303 continue; 3304 } 3305 pgno = dl[i].mid; 3306 /* clear dirty flag */ 3307 dp->mp_flags &= ~P_DIRTY; 3308 pos = pgno * psize; 3309 size = psize; 3310 if (IS_OVERFLOW(dp)) size *= dp->mp_pages; 3311 } 3312 #ifdef _WIN32 3313 else break; 3314 3315 /* Windows actually supports scatter/gather I/O, but only on 3316 * unbuffered file handles. Since we're relying on the OS page 3317 * cache for all our data, that's self-defeating. So we just 3318 * write pages one at a time. We use the ov structure to set 3319 * the write offset, to at least save the overhead of a Seek 3320 * system call. 3321 */ 3322 DPRINTF(("committing page %"Z"u", pgno)); 3323 memset(&ov, 0, sizeof(ov)); 3324 ov.Offset = pos & 0xffffffff; 3325 ov.OffsetHigh = pos >> 16 >> 16; 3326 if (!WriteFile(env->me_fd, dp, size, NULL, &ov)) { 3327 rc = ErrCode(); 3328 DPRINTF(("WriteFile: %d", rc)); 3329 return rc; 3330 } 3331 #else 3332 /* Write up to MDB_COMMIT_PAGES dirty pages at a time. */ 3333 if (pos!=next_pos || n==MDB_COMMIT_PAGES || wsize+size>MAX_WRITE) { 3334 if (n) { 3335 retry_write: 3336 /* Write previous page(s) */ 3337 #ifdef MDB_USE_PWRITEV 3338 wres = pwritev(env->me_fd, iov, n, wpos); 3339 #else 3340 if (n == 1) { 3341 wres = pwrite(env->me_fd, iov[0].iov_base, wsize, wpos); 3342 } else { 3343 retry_seek: 3344 if (lseek(env->me_fd, wpos, SEEK_SET) == -1) { 3345 rc = ErrCode(); 3346 if (rc == EINTR) 3347 goto retry_seek; 3348 DPRINTF(("lseek: %s", strerror(rc))); 3349 return rc; 3350 } 3351 wres = writev(env->me_fd, iov, n); 3352 } 3353 #endif 3354 if (wres != wsize) { 3355 if (wres < 0) { 3356 rc = ErrCode(); 3357 if (rc == EINTR) 3358 goto retry_write; 3359 DPRINTF(("Write error: %s", strerror(rc))); 3360 } else { 3361 rc = EIO; /* TODO: Use which error code? */ 3362 DPUTS("short write, filesystem full?"); 3363 } 3364 return rc; 3365 } 3366 n = 0; 3367 } 3368 if (i > pagecount) 3369 break; 3370 wpos = pos; 3371 wsize = 0; 3372 } 3373 DPRINTF(("committing page %"Z"u", pgno)); 3374 next_pos = pos + size; 3375 iov[n].iov_len = size; 3376 iov[n].iov_base = (char *)dp; 3377 wsize += size; 3378 n++; 3379 #endif /* _WIN32 */ 3380 } 3381 3382 /* MIPS has cache coherency issues, this is a no-op everywhere else 3383 * Note: for any size >= on-chip cache size, entire on-chip cache is 3384 * flushed. 
3385 */ 3386 CACHEFLUSH(env->me_map, txn->mt_next_pgno * env->me_psize, DCACHE); 3387 3388 for (i = keep; ++i <= pagecount; ) { 3389 dp = dl[i].mptr; 3390 /* This is a page we skipped above */ 3391 if (!dl[i].mid) { 3392 dl[++j] = dl[i]; 3393 dl[j].mid = dp->mp_pgno; 3394 continue; 3395 } 3396 mdb_dpage_free(env, dp); 3397 } 3398 3399 done: 3400 i--; 3401 txn->mt_dirty_room += i - j; 3402 dl[0].mid = j; 3403 return MDB_SUCCESS; 3404 } 3405 3406 int 3407 mdb_txn_commit(MDB_txn *txn) 3408 { 3409 int rc; 3410 unsigned int i, end_mode; 3411 MDB_env *env; 3412 3413 if (txn == NULL) 3414 return EINVAL; 3415 3416 /* mdb_txn_end() mode for a commit which writes nothing */ 3417 end_mode = MDB_END_EMPTY_COMMIT|MDB_END_UPDATE|MDB_END_SLOT|MDB_END_FREE; 3418 3419 if (txn->mt_child) { 3420 rc = mdb_txn_commit(txn->mt_child); 3421 if (rc) 3422 goto fail; 3423 } 3424 3425 env = txn->mt_env; 3426 3427 if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { 3428 goto done; 3429 } 3430 3431 if (txn->mt_flags & (MDB_TXN_FINISHED|MDB_TXN_ERROR)) { 3432 DPUTS("txn has failed/finished, can't commit"); 3433 if (txn->mt_parent) 3434 txn->mt_parent->mt_flags |= MDB_TXN_ERROR; 3435 rc = MDB_BAD_TXN; 3436 goto fail; 3437 } 3438 3439 if (txn->mt_parent) { 3440 MDB_txn *parent = txn->mt_parent; 3441 MDB_page **lp; 3442 MDB_ID2L dst, src; 3443 MDB_IDL pspill; 3444 unsigned x, y, len, ps_len; 3445 3446 /* Append our free list to parent's */ 3447 rc = mdb_midl_append_list(&parent->mt_free_pgs, txn->mt_free_pgs); 3448 if (rc) 3449 goto fail; 3450 mdb_midl_free(txn->mt_free_pgs); 3451 /* Failures after this must either undo the changes 3452 * to the parent or set MDB_TXN_ERROR in the parent. 3453 */ 3454 3455 parent->mt_next_pgno = txn->mt_next_pgno; 3456 parent->mt_flags = txn->mt_flags; 3457 3458 /* Merge our cursors into parent's and close them */ 3459 mdb_cursors_close(txn, 1); 3460 3461 /* Update parent's DB table. 
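 * (i.e. copy the child txn's MDB_db records and per-DB flags into the
 * parent, so the parent sees the subtransaction's new B-tree roots and
 * entry counts -- note added for clarity)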
*/ 3462 memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); 3463 parent->mt_numdbs = txn->mt_numdbs; 3464 parent->mt_dbflags[FREE_DBI] = txn->mt_dbflags[FREE_DBI]; 3465 parent->mt_dbflags[MAIN_DBI] = txn->mt_dbflags[MAIN_DBI]; 3466 for (i=CORE_DBS; i<txn->mt_numdbs; i++) { 3467 /* preserve parent's DB_NEW status */ 3468 x = parent->mt_dbflags[i] & DB_NEW; 3469 parent->mt_dbflags[i] = txn->mt_dbflags[i] | x; 3470 } 3471 3472 dst = parent->mt_u.dirty_list; 3473 src = txn->mt_u.dirty_list; 3474 /* Remove anything in our dirty list from parent's spill list */ 3475 if ((pspill = parent->mt_spill_pgs) && (ps_len = pspill[0])) { 3476 x = y = ps_len; 3477 pspill[0] = (pgno_t)-1; 3478 /* Mark our dirty pages as deleted in parent spill list */ 3479 for (i=0, len=src[0].mid; ++i <= len; ) { 3480 MDB_ID pn = src[i].mid << 1; 3481 while (pn > pspill[x]) 3482 x--; 3483 if (pn == pspill[x]) { 3484 pspill[x] = 1; 3485 y = --x; 3486 } 3487 } 3488 /* Squash deleted pagenums if we deleted any */ 3489 for (x=y; ++x <= ps_len; ) 3490 if (!(pspill[x] & 1)) 3491 pspill[++y] = pspill[x]; 3492 pspill[0] = y; 3493 } 3494 3495 /* Remove anything in our spill list from parent's dirty list */ 3496 if (txn->mt_spill_pgs && txn->mt_spill_pgs[0]) { 3497 for (i=1; i<=txn->mt_spill_pgs[0]; i++) { 3498 MDB_ID pn = txn->mt_spill_pgs[i]; 3499 if (pn & 1) 3500 continue; /* deleted spillpg */ 3501 pn >>= 1; 3502 y = mdb_mid2l_search(dst, pn); 3503 if (y <= dst[0].mid && dst[y].mid == pn) { 3504 free(dst[y].mptr); 3505 while (y < dst[0].mid) { 3506 dst[y] = dst[y+1]; 3507 y++; 3508 } 3509 dst[0].mid--; 3510 } 3511 } 3512 } 3513 3514 /* Find len = length of merging our dirty list with parent's */ 3515 x = dst[0].mid; 3516 dst[0].mid = 0; /* simplify loops */ 3517 if (parent->mt_parent) { 3518 len = x + src[0].mid; 3519 y = mdb_mid2l_search(src, dst[x].mid + 1) - 1; 3520 for (i = x; y && i; y--) { 3521 pgno_t yp = src[y].mid; 3522 while (yp < dst[i].mid) 3523 i--; 3524 if (yp == dst[i].mid) { 3525 i--; 3526 len--; 3527 } 3528 } 3529 } else { /* Simplify the above for single-ancestor case */ 3530 len = MDB_IDL_UM_MAX - txn->mt_dirty_room; 3531 } 3532 /* Merge our dirty list with parent's */ 3533 y = src[0].mid; 3534 for (i = len; y; dst[i--] = src[y--]) { 3535 pgno_t yp = src[y].mid; 3536 while (yp < dst[x].mid) 3537 dst[i--] = dst[x--]; 3538 if (yp == dst[x].mid) 3539 free(dst[x--].mptr); 3540 } 3541 mdb_tassert(txn, i == x); 3542 dst[0].mid = len; 3543 free(txn->mt_u.dirty_list); 3544 parent->mt_dirty_room = txn->mt_dirty_room; 3545 if (txn->mt_spill_pgs) { 3546 if (parent->mt_spill_pgs) { 3547 /* TODO: Prevent failure here, so parent does not fail */ 3548 rc = mdb_midl_append_list(&parent->mt_spill_pgs, txn->mt_spill_pgs); 3549 if (rc) 3550 parent->mt_flags |= MDB_TXN_ERROR; 3551 mdb_midl_free(txn->mt_spill_pgs); 3552 mdb_midl_sort(parent->mt_spill_pgs); 3553 } else { 3554 parent->mt_spill_pgs = txn->mt_spill_pgs; 3555 } 3556 } 3557 3558 /* Append our loose page list to parent's */ 3559 for (lp = &parent->mt_loose_pgs; *lp; lp = &NEXT_LOOSE_PAGE(*lp)) 3560 ; 3561 *lp = txn->mt_loose_pgs; 3562 parent->mt_loose_count += txn->mt_loose_count; 3563 3564 parent->mt_child = NULL; 3565 mdb_midl_free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pghead); 3566 free(txn); 3567 return rc; 3568 } 3569 3570 if (txn != env->me_txn) { 3571 DPUTS("attempt to commit unknown transaction"); 3572 rc = EINVAL; 3573 goto fail; 3574 } 3575 3576 mdb_cursors_close(txn, 0); 3577 3578 if (!txn->mt_u.dirty_list[0].mid && 3579 !(txn->mt_flags & 
(MDB_TXN_DIRTY|MDB_TXN_SPILLS))) 3580 goto done; 3581 3582 DPRINTF(("committing txn %"Z"u %p on mdbenv %p, root page %"Z"u", 3583 txn->mt_txnid, (void*)txn, (void*)env, txn->mt_dbs[MAIN_DBI].md_root)); 3584 3585 /* Update DB root pointers */ 3586 if (txn->mt_numdbs > CORE_DBS) { 3587 MDB_cursor mc; 3588 MDB_dbi i; 3589 MDB_val data; 3590 data.mv_size = sizeof(MDB_db); 3591 3592 mdb_cursor_init(&mc, txn, MAIN_DBI, NULL); 3593 for (i = CORE_DBS; i < txn->mt_numdbs; i++) { 3594 if (txn->mt_dbflags[i] & DB_DIRTY) { 3595 if (TXN_DBI_CHANGED(txn, i)) { 3596 rc = MDB_BAD_DBI; 3597 goto fail; 3598 } 3599 data.mv_data = &txn->mt_dbs[i]; 3600 rc = mdb_cursor_put(&mc, &txn->mt_dbxs[i].md_name, &data, 3601 F_SUBDATA); 3602 if (rc) 3603 goto fail; 3604 } 3605 } 3606 } 3607 3608 rc = mdb_freelist_save(txn); 3609 if (rc) 3610 goto fail; 3611 3612 mdb_midl_free(env->me_pghead); 3613 env->me_pghead = NULL; 3614 mdb_midl_shrink(&txn->mt_free_pgs); 3615 3616 #if (MDB_DEBUG) > 2 3617 mdb_audit(txn); 3618 #endif 3619 3620 if ((rc = mdb_page_flush(txn, 0)) || 3621 (rc = mdb_env_sync(env, 0)) || 3622 (rc = mdb_env_write_meta(txn))) 3623 goto fail; 3624 end_mode = MDB_END_COMMITTED|MDB_END_UPDATE; 3625 3626 done: 3627 mdb_txn_end(txn, end_mode); 3628 return MDB_SUCCESS; 3629 3630 fail: 3631 mdb_txn_abort(txn); 3632 return rc; 3633 } 3634 3635 /** Read the environment parameters of a DB environment before 3636 * mapping it into memory. 3637 * @param[in] env the environment handle 3638 * @param[out] meta address of where to store the meta information 3639 * @return 0 on success, non-zero on failure. 3640 */ 3641 static int ESECT 3642 mdb_env_read_header(MDB_env *env, MDB_meta *meta) 3643 { 3644 MDB_metabuf pbuf; 3645 MDB_page *p; 3646 MDB_meta *m; 3647 int i, rc, off; 3648 enum { Size = sizeof(pbuf) }; 3649 3650 /* We don't know the page size yet, so use a minimum value. 3651 * Read both meta pages so we can use the latest one. 3652 */ 3653 3654 for (i=off=0; i<NUM_METAS; i++, off += meta->mm_psize) { 3655 #ifdef _WIN32 3656 DWORD len; 3657 OVERLAPPED ov; 3658 memset(&ov, 0, sizeof(ov)); 3659 ov.Offset = off; 3660 rc = ReadFile(env->me_fd, &pbuf, Size, &len, &ov) ? (int)len : -1; 3661 if (rc == -1 && ErrCode() == ERROR_HANDLE_EOF) 3662 rc = 0; 3663 #else 3664 rc = pread(env->me_fd, &pbuf, Size, off); 3665 #endif 3666 if (rc != Size) { 3667 if (rc == 0 && off == 0) 3668 return ENOENT; 3669 rc = rc < 0 ? 
(int) ErrCode() : MDB_INVALID; 3670 DPRINTF(("read: %s", mdb_strerror(rc))); 3671 return rc; 3672 } 3673 3674 p = (MDB_page *)&pbuf; 3675 3676 if (!F_ISSET(p->mp_flags, P_META)) { 3677 DPRINTF(("page %"Z"u not a meta page", p->mp_pgno)); 3678 return MDB_INVALID; 3679 } 3680 3681 m = METADATA(p); 3682 if (m->mm_magic != MDB_MAGIC) { 3683 DPUTS("meta has invalid magic"); 3684 return MDB_INVALID; 3685 } 3686 3687 if (m->mm_version != MDB_DATA_VERSION) { 3688 DPRINTF(("database is version %u, expected version %u", 3689 m->mm_version, MDB_DATA_VERSION)); 3690 return MDB_VERSION_MISMATCH; 3691 } 3692 3693 if (off == 0 || m->mm_txnid > meta->mm_txnid) 3694 *meta = *m; 3695 } 3696 return 0; 3697 } 3698 3699 /** Fill in most of the zeroed #MDB_meta for an empty database environment */ 3700 static void ESECT 3701 mdb_env_init_meta0(MDB_env *env, MDB_meta *meta) 3702 { 3703 meta->mm_magic = MDB_MAGIC; 3704 meta->mm_version = MDB_DATA_VERSION; 3705 meta->mm_mapsize = env->me_mapsize; 3706 meta->mm_psize = env->me_psize; 3707 meta->mm_last_pg = NUM_METAS-1; 3708 meta->mm_flags = env->me_flags & 0xffff; 3709 meta->mm_flags |= MDB_INTEGERKEY; /* this is mm_dbs[FREE_DBI].md_flags */ 3710 meta->mm_dbs[FREE_DBI].md_root = P_INVALID; 3711 meta->mm_dbs[MAIN_DBI].md_root = P_INVALID; 3712 } 3713 3714 /** Write the environment parameters of a freshly created DB environment. 3715 * @param[in] env the environment handle 3716 * @param[in] meta the #MDB_meta to write 3717 * @return 0 on success, non-zero on failure. 3718 */ 3719 static int ESECT 3720 mdb_env_init_meta(MDB_env *env, MDB_meta *meta) 3721 { 3722 MDB_page *p, *q; 3723 int rc; 3724 unsigned int psize; 3725 #ifdef _WIN32 3726 DWORD len; 3727 OVERLAPPED ov; 3728 memset(&ov, 0, sizeof(ov)); 3729 #define DO_PWRITE(rc, fd, ptr, size, len, pos) do { \ 3730 ov.Offset = pos; \ 3731 rc = WriteFile(fd, ptr, size, &len, &ov); } while(0) 3732 #else 3733 int len; 3734 #define DO_PWRITE(rc, fd, ptr, size, len, pos) do { \ 3735 len = pwrite(fd, ptr, size, pos); \ 3736 if (len == -1 && ErrCode() == EINTR) continue; \ 3737 rc = (len >= 0); break; } while(1) 3738 #endif 3739 3740 DPUTS("writing new meta page"); 3741 3742 psize = env->me_psize; 3743 3744 p = calloc(NUM_METAS, psize); 3745 if (!p) 3746 return ENOMEM; 3747 3748 p->mp_pgno = 0; 3749 p->mp_flags = P_META; 3750 *(MDB_meta *)METADATA(p) = *meta; 3751 3752 q = (MDB_page *)((char *)p + psize); 3753 q->mp_pgno = 1; 3754 q->mp_flags = P_META; 3755 *(MDB_meta *)METADATA(q) = *meta; 3756 3757 DO_PWRITE(rc, env->me_fd, p, psize * NUM_METAS, len, 0); 3758 if (!rc) 3759 rc = ErrCode(); 3760 else if ((unsigned) len == psize * NUM_METAS) 3761 rc = MDB_SUCCESS; 3762 else 3763 rc = ENOSPC; 3764 free(p); 3765 return rc; 3766 } 3767 3768 /** Update the environment info to commit a transaction. 3769 * @param[in] txn the transaction that's being committed 3770 * @return 0 on success, non-zero on failure. 
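 * Added explanatory note (not in the original header comment): LMDB keeps
 * two meta pages and alternates between them.  The page written here is
 * chosen by
 *
 *	toggle = txn->mt_txnid & 1;
 *
 * e.g. txn 12 overwrites meta page 0 while meta page 1 still describes
 * txn 11.  If this write never reaches disk, the previous meta (and the
 * tree it points to) remains intact, which is what makes the commit
 * crash-safe.  Readers later use whichever meta page carries the larger
 * mm_txnid (see mdb_env_pick_meta()).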
3771 */ 3772 static int 3773 mdb_env_write_meta(MDB_txn *txn) 3774 { 3775 MDB_env *env; 3776 MDB_meta meta, metab, *mp; 3777 unsigned flags; 3778 size_t mapsize; 3779 off_t off; 3780 int rc, len, toggle; 3781 char *ptr; 3782 HANDLE mfd; 3783 #ifdef _WIN32 3784 OVERLAPPED ov; 3785 #else 3786 int r2; 3787 #endif 3788 3789 toggle = txn->mt_txnid & 1; 3790 DPRINTF(("writing meta page %d for root page %"Z"u", 3791 toggle, txn->mt_dbs[MAIN_DBI].md_root)); 3792 3793 env = txn->mt_env; 3794 flags = env->me_flags; 3795 mp = env->me_metas[toggle]; 3796 mapsize = env->me_metas[toggle ^ 1]->mm_mapsize; 3797 /* Persist any increases of mapsize config */ 3798 if (mapsize < env->me_mapsize) 3799 mapsize = env->me_mapsize; 3800 3801 if (flags & MDB_WRITEMAP) { 3802 mp->mm_mapsize = mapsize; 3803 mp->mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI]; 3804 mp->mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; 3805 mp->mm_last_pg = txn->mt_next_pgno - 1; 3806 #if (__GNUC__ * 100 + __GNUC_MINOR__ >= 404) && /* TODO: portability */ \ 3807 !(defined(__i386__) || defined(__x86_64__)) 3808 /* LY: issue a memory barrier, if not x86. ITS#7969 */ 3809 __sync_synchronize(); 3810 #endif 3811 mp->mm_txnid = txn->mt_txnid; 3812 if (!(flags & (MDB_NOMETASYNC|MDB_NOSYNC))) { 3813 unsigned meta_size = env->me_psize; 3814 rc = (env->me_flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC; 3815 ptr = (char *)mp - PAGEHDRSZ; 3816 #ifndef _WIN32 /* POSIX msync() requires ptr = start of OS page */ 3817 r2 = (ptr - env->me_map) & (env->me_os_psize - 1); 3818 ptr -= r2; 3819 meta_size += r2; 3820 #endif 3821 if (MDB_MSYNC(ptr, meta_size, rc)) { 3822 rc = ErrCode(); 3823 goto fail; 3824 } 3825 } 3826 goto done; 3827 } 3828 metab.mm_txnid = mp->mm_txnid; 3829 metab.mm_last_pg = mp->mm_last_pg; 3830 3831 meta.mm_mapsize = mapsize; 3832 meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI]; 3833 meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; 3834 meta.mm_last_pg = txn->mt_next_pgno - 1; 3835 meta.mm_txnid = txn->mt_txnid; 3836 3837 off = offsetof(MDB_meta, mm_mapsize); 3838 ptr = (char *)&meta + off; 3839 len = sizeof(MDB_meta) - off; 3840 off += (char *)mp - env->me_map; 3841 3842 /* Write to the SYNC fd unless MDB_NOSYNC/MDB_NOMETASYNC. 3843 * (me_mfd goes to the same file as me_fd, but writing to it 3844 * also syncs to disk. Avoids a separate fdatasync() call.) 3845 */ 3846 mfd = (flags & (MDB_NOSYNC|MDB_NOMETASYNC)) ? env->me_fd : env->me_mfd; 3847 #ifdef _WIN32 3848 { 3849 memset(&ov, 0, sizeof(ov)); 3850 ov.Offset = off; 3851 if (!WriteFile(mfd, ptr, len, (DWORD *)&rc, &ov)) 3852 rc = -1; 3853 } 3854 #else 3855 retry_write: 3856 rc = pwrite(mfd, ptr, len, off); 3857 #endif 3858 if (rc != len) { 3859 rc = rc < 0 ? ErrCode() : EIO; 3860 #ifndef _WIN32 3861 if (rc == EINTR) 3862 goto retry_write; 3863 #endif 3864 DPUTS("write failed, disk error?"); 3865 /* On a failure, the pagecache still contains the new data. 3866 * Write some old data back, to prevent it from being used. 3867 * Use the non-SYNC fd; we know it will fail anyway. 3868 */ 3869 meta.mm_last_pg = metab.mm_last_pg; 3870 meta.mm_txnid = metab.mm_txnid; 3871 #ifdef _WIN32 3872 memset(&ov, 0, sizeof(ov)); 3873 ov.Offset = off; 3874 WriteFile(env->me_fd, ptr, len, NULL, &ov); 3875 #else 3876 r2 = pwrite(env->me_fd, ptr, len, off); 3877 (void)r2; /* Silence warnings. 
We don't care about pwrite's return value */ 3878 #endif 3879 fail: 3880 env->me_flags |= MDB_FATAL_ERROR; 3881 return rc; 3882 } 3883 /* MIPS has cache coherency issues, this is a no-op everywhere else */ 3884 CACHEFLUSH(env->me_map + off, len, DCACHE); 3885 done: 3886 /* Memory ordering issues are irrelevant; since the entire writer 3887 * is wrapped by wmutex, all of these changes will become visible 3888 * after the wmutex is unlocked. Since the DB is multi-version, 3889 * readers will get consistent data regardless of how fresh or 3890 * how stale their view of these values is. 3891 */ 3892 if (env->me_txns) 3893 env->me_txns->mti_txnid = txn->mt_txnid; 3894 3895 return MDB_SUCCESS; 3896 } 3897 3898 /** Check both meta pages to see which one is newer. 3899 * @param[in] env the environment handle 3900 * @return newest #MDB_meta. 3901 */ 3902 static MDB_meta * 3903 mdb_env_pick_meta(const MDB_env *env) 3904 { 3905 MDB_meta *const *metas = env->me_metas; 3906 return metas[ metas[0]->mm_txnid < metas[1]->mm_txnid ]; 3907 } 3908 3909 int ESECT 3910 mdb_env_create(MDB_env **env) 3911 { 3912 MDB_env *e; 3913 3914 e = calloc(1, sizeof(MDB_env)); 3915 if (!e) 3916 return ENOMEM; 3917 3918 e->me_maxreaders = DEFAULT_READERS; 3919 e->me_maxdbs = e->me_numdbs = CORE_DBS; 3920 e->me_fd = INVALID_HANDLE_VALUE; 3921 e->me_lfd = INVALID_HANDLE_VALUE; 3922 e->me_mfd = INVALID_HANDLE_VALUE; 3923 #ifdef MDB_USE_POSIX_SEM 3924 e->me_rmutex = SEM_FAILED; 3925 e->me_wmutex = SEM_FAILED; 3926 #endif 3927 e->me_pid = getpid(); 3928 GET_PAGESIZE(e->me_os_psize); 3929 VGMEMP_CREATE(e,0,0); 3930 *env = e; 3931 return MDB_SUCCESS; 3932 } 3933 3934 static int ESECT 3935 mdb_env_map(MDB_env *env, void *addr) 3936 { 3937 MDB_page *p; 3938 unsigned int flags = env->me_flags; 3939 #ifdef _WIN32 3940 int rc; 3941 HANDLE mh; 3942 LONG sizelo, sizehi; 3943 size_t msize; 3944 3945 if (flags & MDB_RDONLY) { 3946 /* Don't set explicit map size, use whatever exists */ 3947 msize = 0; 3948 sizelo = 0; 3949 sizehi = 0; 3950 } else { 3951 msize = env->me_mapsize; 3952 sizelo = msize & 0xffffffff; 3953 sizehi = msize >> 16 >> 16; /* only needed on Win64 */ 3954 3955 /* Windows won't create mappings for zero length files. 3956 * and won't map more than the file size. 3957 * Just set the maxsize right now. 3958 */ 3959 if (SetFilePointer(env->me_fd, sizelo, &sizehi, 0) != (DWORD)sizelo 3960 || !SetEndOfFile(env->me_fd) 3961 || SetFilePointer(env->me_fd, 0, NULL, 0) != 0) 3962 return ErrCode(); 3963 } 3964 3965 mh = CreateFileMapping(env->me_fd, NULL, flags & MDB_WRITEMAP ? 3966 PAGE_READWRITE : PAGE_READONLY, 3967 sizehi, sizelo, NULL); 3968 if (!mh) 3969 return ErrCode(); 3970 env->me_map = MapViewOfFileEx(mh, flags & MDB_WRITEMAP ? 3971 FILE_MAP_WRITE : FILE_MAP_READ, 3972 0, 0, msize, addr); 3973 rc = env->me_map ? 0 : ErrCode(); 3974 CloseHandle(mh); 3975 if (rc) 3976 return rc; 3977 #else 3978 int prot = PROT_READ; 3979 if (flags & MDB_WRITEMAP) { 3980 prot |= PROT_WRITE; 3981 if (ftruncate(env->me_fd, env->me_mapsize) < 0) 3982 return ErrCode(); 3983 } 3984 env->me_map = mmap(addr, env->me_mapsize, prot, MAP_SHARED, 3985 env->me_fd, 0); 3986 if (env->me_map == MAP_FAILED) { 3987 env->me_map = NULL; 3988 return ErrCode(); 3989 } 3990 3991 if (flags & MDB_NORDAHEAD) { 3992 /* Turn off readahead. It's harmful when the DB is larger than RAM. 
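 * (Added note: madvise(MADV_RANDOM) / posix_madvise(POSIX_MADV_RANDOM)
 * below are only hints to the kernel; readahead may still happen, but the
 * hint keeps a DB bigger than RAM from constantly evicting useful pages.)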
*/ 3993 #ifdef MADV_RANDOM 3994 madvise(env->me_map, env->me_mapsize, MADV_RANDOM); 3995 #else 3996 #ifdef POSIX_MADV_RANDOM 3997 posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_RANDOM); 3998 #endif /* POSIX_MADV_RANDOM */ 3999 #endif /* MADV_RANDOM */ 4000 } 4001 #endif /* _WIN32 */ 4002 4003 /* Can happen because the address argument to mmap() is just a 4004 * hint. mmap() can pick another, e.g. if the range is in use. 4005 * The MAP_FIXED flag would prevent that, but then mmap could 4006 * instead unmap existing pages to make room for the new map. 4007 */ 4008 if (addr && env->me_map != addr) 4009 return EBUSY; /* TODO: Make a new MDB_* error code? */ 4010 4011 p = (MDB_page *)env->me_map; 4012 env->me_metas[0] = METADATA(p); 4013 env->me_metas[1] = (MDB_meta *)((char *)env->me_metas[0] + env->me_psize); 4014 4015 return MDB_SUCCESS; 4016 } 4017 4018 int ESECT 4019 mdb_env_set_mapsize(MDB_env *env, size_t size) 4020 { 4021 /* If env is already open, caller is responsible for making 4022 * sure there are no active txns. 4023 */ 4024 if (env->me_map) { 4025 int rc; 4026 MDB_meta *meta; 4027 void *old; 4028 if (env->me_txn) 4029 return EINVAL; 4030 meta = mdb_env_pick_meta(env); 4031 if (!size) 4032 size = meta->mm_mapsize; 4033 { 4034 /* Silently round up to minimum if the size is too small */ 4035 size_t minsize = (meta->mm_last_pg + 1) * env->me_psize; 4036 if (size < minsize) 4037 size = minsize; 4038 } 4039 munmap(env->me_map, env->me_mapsize); 4040 env->me_mapsize = size; 4041 old = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : NULL; 4042 rc = mdb_env_map(env, old); 4043 if (rc) 4044 return rc; 4045 } 4046 env->me_mapsize = size; 4047 if (env->me_psize) 4048 env->me_maxpg = env->me_mapsize / env->me_psize; 4049 return MDB_SUCCESS; 4050 } 4051 4052 int ESECT 4053 mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs) 4054 { 4055 if (env->me_map) 4056 return EINVAL; 4057 env->me_maxdbs = dbs + CORE_DBS; 4058 return MDB_SUCCESS; 4059 } 4060 4061 int ESECT 4062 mdb_env_set_maxreaders(MDB_env *env, unsigned int readers) 4063 { 4064 if (env->me_map || readers < 1) 4065 return EINVAL; 4066 env->me_maxreaders = readers; 4067 return MDB_SUCCESS; 4068 } 4069 4070 int ESECT 4071 mdb_env_get_maxreaders(MDB_env *env, unsigned int *readers) 4072 { 4073 if (!env || !readers) 4074 return EINVAL; 4075 *readers = env->me_maxreaders; 4076 return MDB_SUCCESS; 4077 } 4078 4079 static int ESECT 4080 mdb_fsize(HANDLE fd, size_t *size) 4081 { 4082 #ifdef _WIN32 4083 LARGE_INTEGER fsize; 4084 4085 if (!GetFileSizeEx(fd, &fsize)) 4086 return ErrCode(); 4087 4088 *size = fsize.QuadPart; 4089 #else 4090 struct stat st; 4091 4092 if (fstat(fd, &st)) 4093 return ErrCode(); 4094 4095 *size = st.st_size; 4096 #endif 4097 return MDB_SUCCESS; 4098 } 4099 4100 4101 #ifdef _WIN32 4102 typedef wchar_t mdb_nchar_t; 4103 # define MDB_NAME(str) L##str 4104 # define mdb_name_cpy wcscpy 4105 #else 4106 /** Character type for file names: char on Unix, wchar_t on Windows */ 4107 typedef char mdb_nchar_t; 4108 # define MDB_NAME(str) str /**< #mdb_nchar_t[] string literal */ 4109 # define mdb_name_cpy strcpy /**< Copy name (#mdb_nchar_t string) */ 4110 #endif 4111 4112 /** Filename - string of #mdb_nchar_t[] */ 4113 typedef struct MDB_name { 4114 int mn_len; /**< Length */ 4115 int mn_alloced; /**< True if #mn_val was malloced */ 4116 mdb_nchar_t *mn_val; /**< Contents */ 4117 } MDB_name; 4118 4119 /** Filename suffixes [datafile,lockfile][without,with MDB_NOSUBDIR] */ 4120 static const mdb_nchar_t *const mdb_suffixes[2][2] = { 
4121 { MDB_NAME("/data.mdb"), MDB_NAME("") }, 4122 { MDB_NAME("/lock.mdb"), MDB_NAME("-lock") } 4123 }; 4124 4125 #define MDB_SUFFLEN 9 /**< Max string length in #mdb_suffixes[] */ 4126 4127 /** Set up filename + scratch area for filename suffix, for opening files. 4128 * It should be freed with #mdb_fname_destroy(). 4129 * On Windows, paths are converted from char *UTF-8 to wchar_t *UTF-16. 4130 * 4131 * @param[in] path Pathname for #mdb_env_open(). 4132 * @param[in] envflags Whether a subdir and/or lockfile will be used. 4133 * @param[out] fname Resulting filename, with room for a suffix if necessary. 4134 */ 4135 static int ESECT 4136 mdb_fname_init(const char *path, unsigned envflags, MDB_name *fname) 4137 { 4138 int no_suffix = F_ISSET(envflags, MDB_NOSUBDIR|MDB_NOLOCK); 4139 fname->mn_alloced = 0; 4140 #ifdef _WIN32 4141 return utf8_to_utf16(path, fname, no_suffix ? 0 : MDB_SUFFLEN); 4142 #else 4143 fname->mn_len = strlen(path); 4144 if (no_suffix) 4145 fname->mn_val = (char *) path; 4146 else if ((fname->mn_val = malloc(fname->mn_len + MDB_SUFFLEN+1)) != NULL) { 4147 fname->mn_alloced = 1; 4148 strcpy(fname->mn_val, path); 4149 } 4150 else 4151 return ENOMEM; 4152 return MDB_SUCCESS; 4153 #endif 4154 } 4155 4156 /** Destroy \b fname from #mdb_fname_init() */ 4157 #define mdb_fname_destroy(fname) \ 4158 do { if ((fname).mn_alloced) free((fname).mn_val); } while (0) 4159 4160 #ifdef O_CLOEXEC /* POSIX.1-2008: Set FD_CLOEXEC atomically at open() */ 4161 # define MDB_CLOEXEC O_CLOEXEC 4162 #else 4163 # define MDB_CLOEXEC 0 4164 #endif 4165 4166 /** File type, access mode etc. for #mdb_fopen() */ 4167 enum mdb_fopen_type { 4168 #ifdef _WIN32 4169 MDB_O_RDONLY, MDB_O_RDWR, MDB_O_META, MDB_O_COPY, MDB_O_LOCKS 4170 #else 4171 /* A comment in mdb_fopen() explains some O_* flag choices. */ 4172 MDB_O_RDONLY= O_RDONLY, /**< for RDONLY me_fd */ 4173 MDB_O_RDWR = O_RDWR |O_CREAT, /**< for me_fd */ 4174 MDB_O_META = O_WRONLY|MDB_DSYNC |MDB_CLOEXEC, /**< for me_mfd */ 4175 MDB_O_COPY = O_WRONLY|O_CREAT|O_EXCL|MDB_CLOEXEC, /**< for #mdb_env_copy() */ 4176 /** Bitmask for open() flags in enum #mdb_fopen_type. The other bits 4177 * distinguish otherwise-equal MDB_O_* constants from each other. 4178 */ 4179 MDB_O_MASK = MDB_O_RDWR|MDB_CLOEXEC | MDB_O_RDONLY|MDB_O_META|MDB_O_COPY, 4180 MDB_O_LOCKS = MDB_O_RDWR|MDB_CLOEXEC | ((MDB_O_MASK+1) & ~MDB_O_MASK) /**< for me_lfd */ 4181 #endif 4182 }; 4183 4184 /** Open an LMDB file. 4185 * @param[in] env The LMDB environment. 4186 * @param[in,out] fname Path from from #mdb_fname_init(). A suffix is 4187 * appended if necessary to create the filename, without changing mn_len. 4188 * @param[in] which Determines file type, access mode, etc. 4189 * @param[in] mode The Unix permissions for the file, if we create it. 4190 * @param[out] res Resulting file handle. 4191 * @return 0 on success, non-zero on failure. 4192 */ 4193 static int ESECT 4194 mdb_fopen(const MDB_env *env, MDB_name *fname, 4195 enum mdb_fopen_type which, mdb_mode_t mode, 4196 HANDLE *res) 4197 { 4198 int rc = MDB_SUCCESS; 4199 HANDLE fd; 4200 #ifdef _WIN32 4201 DWORD acc, share, disp, attrs; 4202 #else 4203 int flags; 4204 #endif 4205 4206 if (fname->mn_alloced) /* modifiable copy */ 4207 mdb_name_cpy(fname->mn_val + fname->mn_len, 4208 mdb_suffixes[which==MDB_O_LOCKS][F_ISSET(env->me_flags, MDB_NOSUBDIR)]); 4209 4210 /* The directory must already exist. Usually the file need not. 4211 * MDB_O_META requires the file because we already created it using 4212 * MDB_O_RDWR. 
MDB_O_COPY must not overwrite an existing file. 4213 * 4214 * With MDB_O_COPY we do not want the OS to cache the writes, since 4215 * the source data is already in the OS cache. 4216 * 4217 * The lockfile needs FD_CLOEXEC (close file descriptor on exec*()) 4218 * to avoid the flock() issues noted under Caveats in lmdb.h. 4219 * Also set it for other filehandles which the user cannot get at 4220 * and close himself, which he may need after fork(). I.e. all but 4221 * me_fd, which programs do use via mdb_env_get_fd(). 4222 */ 4223 4224 #ifdef _WIN32 4225 acc = GENERIC_READ|GENERIC_WRITE; 4226 share = FILE_SHARE_READ|FILE_SHARE_WRITE; 4227 disp = OPEN_ALWAYS; 4228 attrs = FILE_ATTRIBUTE_NORMAL; 4229 switch (which) { 4230 case MDB_O_RDONLY: /* read-only datafile */ 4231 acc = GENERIC_READ; 4232 disp = OPEN_EXISTING; 4233 break; 4234 case MDB_O_META: /* for writing metapages */ 4235 acc = GENERIC_WRITE; 4236 disp = OPEN_EXISTING; 4237 attrs = FILE_ATTRIBUTE_NORMAL|FILE_FLAG_WRITE_THROUGH; 4238 break; 4239 case MDB_O_COPY: /* mdb_env_copy() & co */ 4240 acc = GENERIC_WRITE; 4241 share = 0; 4242 disp = CREATE_NEW; 4243 attrs = FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH; 4244 break; 4245 default: break; /* silence gcc -Wswitch (not all enum values handled) */ 4246 } 4247 fd = CreateFileW(fname->mn_val, acc, share, NULL, disp, attrs, NULL); 4248 #else 4249 fd = open(fname->mn_val, which & MDB_O_MASK, mode); 4250 #endif 4251 4252 if (fd == INVALID_HANDLE_VALUE) 4253 rc = ErrCode(); 4254 #ifndef _WIN32 4255 else { 4256 if (which != MDB_O_RDONLY && which != MDB_O_RDWR) { 4257 /* Set CLOEXEC if we could not pass it to open() */ 4258 if (!MDB_CLOEXEC && (flags = fcntl(fd, F_GETFD)) != -1) 4259 (void) fcntl(fd, F_SETFD, flags | FD_CLOEXEC); 4260 } 4261 if (which == MDB_O_COPY && env->me_psize >= env->me_os_psize) { 4262 /* This may require buffer alignment. There is no portable 4263 * way to ask how much, so we require OS pagesize alignment. 4264 */ 4265 # ifdef F_NOCACHE /* __APPLE__ */ 4266 (void) fcntl(fd, F_NOCACHE, 1); 4267 # elif defined O_DIRECT 4268 /* open(...O_DIRECT...) would break on filesystems without 4269 * O_DIRECT support (ITS#7682). Try to set it here instead. 4270 */ 4271 if ((flags = fcntl(fd, F_GETFL)) != -1) 4272 (void) fcntl(fd, F_SETFL, flags | O_DIRECT); 4273 # endif 4274 } 4275 } 4276 #endif /* !_WIN32 */ 4277 4278 *res = fd; 4279 return rc; 4280 } 4281 4282 4283 #ifdef BROKEN_FDATASYNC 4284 #include <sys/utsname.h> 4285 #include <sys/vfs.h> 4286 #endif 4287 4288 /** Further setup required for opening an LMDB environment 4289 */ 4290 static int ESECT 4291 mdb_env_open2(MDB_env *env) 4292 { 4293 unsigned int flags = env->me_flags; 4294 int i, newenv = 0, rc; 4295 MDB_meta meta; 4296 4297 #ifdef _WIN32 4298 /* See if we should use QueryLimited */ 4299 rc = GetVersion(); 4300 if ((rc & 0xff) > 5) 4301 env->me_pidquery = MDB_PROCESS_QUERY_LIMITED_INFORMATION; 4302 else 4303 env->me_pidquery = PROCESS_QUERY_INFORMATION; 4304 #endif /* _WIN32 */ 4305 4306 #ifdef BROKEN_FDATASYNC 4307 /* ext3/ext4 fdatasync is broken on some older Linux kernels. 4308 * https://lkml.org/lkml/2012/9/3/83 4309 * Kernels after 3.6-rc6 are known good. 4310 * https://lkml.org/lkml/2012/9/10/556 4311 * See if the DB is on ext3/ext4, then check for new enough kernel 4312 * Kernels 2.6.32.60, 2.6.34.15, 3.2.30, and 3.5.4 are also known 4313 * to be patched. 
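 * Worked example (added for clarity): a uts.release of "3.2.45" parses
 * below as major 3, minor 2, patch 45; since 45 >= 30 the kernel is
 * treated as patched and fdatasync() is trusted.  A release such as
 * "3.3.8" matches none of the known-good cases, so MDB_FSYNCONLY is set
 * and a full fsync() is used instead.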
4314 */ 4315 { 4316 struct statfs st; 4317 fstatfs(env->me_fd, &st); 4318 while (st.f_type == 0xEF53) { 4319 struct utsname uts; 4320 int i; 4321 uname(&uts); 4322 if (uts.release[0] < '3') { 4323 if (!strncmp(uts.release, "2.6.32.", 7)) { 4324 i = atoi(uts.release+7); 4325 if (i >= 60) 4326 break; /* 2.6.32.60 and newer is OK */ 4327 } else if (!strncmp(uts.release, "2.6.34.", 7)) { 4328 i = atoi(uts.release+7); 4329 if (i >= 15) 4330 break; /* 2.6.34.15 and newer is OK */ 4331 } 4332 } else if (uts.release[0] == '3') { 4333 i = atoi(uts.release+2); 4334 if (i > 5) 4335 break; /* 3.6 and newer is OK */ 4336 if (i == 5) { 4337 i = atoi(uts.release+4); 4338 if (i >= 4) 4339 break; /* 3.5.4 and newer is OK */ 4340 } else if (i == 2) { 4341 i = atoi(uts.release+4); 4342 if (i >= 30) 4343 break; /* 3.2.30 and newer is OK */ 4344 } 4345 } else { /* 4.x and newer is OK */ 4346 break; 4347 } 4348 env->me_flags |= MDB_FSYNCONLY; 4349 break; 4350 } 4351 } 4352 #endif 4353 4354 if ((i = mdb_env_read_header(env, &meta)) != 0) { 4355 if (i != ENOENT) 4356 return i; 4357 DPUTS("new mdbenv"); 4358 newenv = 1; 4359 env->me_psize = env->me_os_psize; 4360 if (env->me_psize > MAX_PAGESIZE) 4361 env->me_psize = MAX_PAGESIZE; 4362 memset(&meta, 0, sizeof(meta)); 4363 mdb_env_init_meta0(env, &meta); 4364 meta.mm_mapsize = DEFAULT_MAPSIZE; 4365 } else { 4366 env->me_psize = meta.mm_psize; 4367 } 4368 4369 /* Was a mapsize configured? */ 4370 if (!env->me_mapsize) { 4371 env->me_mapsize = meta.mm_mapsize; 4372 } 4373 { 4374 /* Make sure mapsize >= committed data size. Even when using 4375 * mm_mapsize, which could be broken in old files (ITS#7789). 4376 */ 4377 size_t minsize = (meta.mm_last_pg + 1) * meta.mm_psize; 4378 if (env->me_mapsize < minsize) 4379 env->me_mapsize = minsize; 4380 } 4381 meta.mm_mapsize = env->me_mapsize; 4382 4383 if (newenv && !(flags & MDB_FIXEDMAP)) { 4384 /* mdb_env_map() may grow the datafile. Write the metapages 4385 * first, so the file will be valid if initialization fails. 4386 * Except with FIXEDMAP, since we do not yet know mm_address. 4387 * We could fill in mm_address later, but then a different 4388 * program might end up doing that - one with a memory layout 4389 * and map address which does not suit the main program. 4390 */ 4391 rc = mdb_env_init_meta(env, &meta); 4392 if (rc) 4393 return rc; 4394 newenv = 0; 4395 } 4396 4397 rc = mdb_env_map(env, (flags & MDB_FIXEDMAP) ? 
meta.mm_address : NULL); 4398 if (rc) 4399 return rc; 4400 4401 if (newenv) { 4402 if (flags & MDB_FIXEDMAP) 4403 meta.mm_address = env->me_map; 4404 i = mdb_env_init_meta(env, &meta); 4405 if (i != MDB_SUCCESS) { 4406 return i; 4407 } 4408 } 4409 4410 env->me_maxfree_1pg = (env->me_psize - PAGEHDRSZ) / sizeof(pgno_t) - 1; 4411 env->me_nodemax = (((env->me_psize - PAGEHDRSZ) / MDB_MINKEYS) & -2) 4412 - sizeof(indx_t); 4413 #if !(MDB_MAXKEYSIZE) 4414 env->me_maxkey = env->me_nodemax - (NODESIZE + sizeof(MDB_db)); 4415 #endif 4416 env->me_maxpg = env->me_mapsize / env->me_psize; 4417 4418 #if MDB_DEBUG 4419 { 4420 MDB_meta *meta = mdb_env_pick_meta(env); 4421 MDB_db *db = &meta->mm_dbs[MAIN_DBI]; 4422 4423 DPRINTF(("opened database version %u, pagesize %u", 4424 meta->mm_version, env->me_psize)); 4425 DPRINTF(("using meta page %d", (int) (meta->mm_txnid & 1))); 4426 DPRINTF(("depth: %u", db->md_depth)); 4427 DPRINTF(("entries: %"Z"u", db->md_entries)); 4428 DPRINTF(("branch pages: %"Z"u", db->md_branch_pages)); 4429 DPRINTF(("leaf pages: %"Z"u", db->md_leaf_pages)); 4430 DPRINTF(("overflow pages: %"Z"u", db->md_overflow_pages)); 4431 DPRINTF(("root: %"Z"u", db->md_root)); 4432 } 4433 #endif 4434 4435 return MDB_SUCCESS; 4436 } 4437 4438 4439 /** Release a reader thread's slot in the reader lock table. 4440 * This function is called automatically when a thread exits. 4441 * @param[in] ptr This points to the slot in the reader lock table. 4442 */ 4443 static void 4444 mdb_env_reader_dest(void *ptr) 4445 { 4446 MDB_reader *reader = ptr; 4447 4448 #ifndef _WIN32 4449 if (reader->mr_pid == getpid()) /* catch pthread_exit() in child process */ 4450 #endif 4451 /* We omit the mutex, so do this atomically (i.e. skip mr_txnid) */ 4452 reader->mr_pid = 0; 4453 } 4454 4455 #ifdef _WIN32 4456 /** Junk for arranging thread-specific callbacks on Windows. This is 4457 * necessarily platform and compiler-specific. Windows supports up 4458 * to 1088 keys. Let's assume nobody opens more than 64 environments 4459 * in a single process, for now. They can override this if needed. 4460 */ 4461 #ifndef MAX_TLS_KEYS 4462 #define MAX_TLS_KEYS 64 4463 #endif 4464 static pthread_key_t mdb_tls_keys[MAX_TLS_KEYS]; 4465 static int mdb_tls_nkeys; 4466 4467 static void NTAPI mdb_tls_callback(PVOID module, DWORD reason, PVOID ptr) 4468 { 4469 int i; 4470 switch(reason) { 4471 case DLL_PROCESS_ATTACH: break; 4472 case DLL_THREAD_ATTACH: break; 4473 case DLL_THREAD_DETACH: 4474 for (i=0; i<mdb_tls_nkeys; i++) { 4475 MDB_reader *r = pthread_getspecific(mdb_tls_keys[i]); 4476 if (r) { 4477 mdb_env_reader_dest(r); 4478 } 4479 } 4480 break; 4481 case DLL_PROCESS_DETACH: break; 4482 } 4483 } 4484 #ifdef __GNUC__ 4485 #ifdef _WIN64 4486 const PIMAGE_TLS_CALLBACK mdb_tls_cbp __attribute__((section (".CRT$XLB"))) = mdb_tls_callback; 4487 #else 4488 PIMAGE_TLS_CALLBACK mdb_tls_cbp __attribute__((section (".CRT$XLB"))) = mdb_tls_callback; 4489 #endif 4490 #else 4491 #ifdef _WIN64 4492 /* Force some symbol references. 4493 * _tls_used forces the linker to create the TLS directory if not already done 4494 * mdb_tls_cbp prevents whole-program-optimizer from dropping the symbol. 
4495 */ 4496 #pragma comment(linker, "/INCLUDE:_tls_used") 4497 #pragma comment(linker, "/INCLUDE:mdb_tls_cbp") 4498 #pragma const_seg(".CRT$XLB") 4499 extern const PIMAGE_TLS_CALLBACK mdb_tls_cbp; 4500 const PIMAGE_TLS_CALLBACK mdb_tls_cbp = mdb_tls_callback; 4501 #pragma const_seg() 4502 #else /* _WIN32 */ 4503 #pragma comment(linker, "/INCLUDE:__tls_used") 4504 #pragma comment(linker, "/INCLUDE:_mdb_tls_cbp") 4505 #pragma data_seg(".CRT$XLB") 4506 PIMAGE_TLS_CALLBACK mdb_tls_cbp = mdb_tls_callback; 4507 #pragma data_seg() 4508 #endif /* WIN 32/64 */ 4509 #endif /* !__GNUC__ */ 4510 #endif 4511 4512 /** Downgrade the exclusive lock on the region back to shared */ 4513 static int ESECT 4514 mdb_env_share_locks(MDB_env *env, int *excl) 4515 { 4516 int rc = 0; 4517 MDB_meta *meta = mdb_env_pick_meta(env); 4518 4519 env->me_txns->mti_txnid = meta->mm_txnid; 4520 4521 #ifdef _WIN32 4522 { 4523 OVERLAPPED ov; 4524 /* First acquire a shared lock. The Unlock will 4525 * then release the existing exclusive lock. 4526 */ 4527 memset(&ov, 0, sizeof(ov)); 4528 if (!LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) { 4529 rc = ErrCode(); 4530 } else { 4531 UnlockFile(env->me_lfd, 0, 0, 1, 0); 4532 *excl = 0; 4533 } 4534 } 4535 #else 4536 { 4537 struct flock lock_info; 4538 /* The shared lock replaces the existing lock */ 4539 memset((void *)&lock_info, 0, sizeof(lock_info)); 4540 lock_info.l_type = F_RDLCK; 4541 lock_info.l_whence = SEEK_SET; 4542 lock_info.l_start = 0; 4543 lock_info.l_len = 1; 4544 while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) && 4545 (rc = ErrCode()) == EINTR) ; 4546 *excl = rc ? -1 : 0; /* error may mean we lost the lock */ 4547 } 4548 #endif 4549 4550 return rc; 4551 } 4552 4553 /** Try to get exclusive lock, otherwise shared. 4554 * Maintain *excl = -1: no/unknown lock, 0: shared, 1: exclusive. 4555 */ 4556 static int ESECT 4557 mdb_env_excl_lock(MDB_env *env, int *excl) 4558 { 4559 int rc = 0; 4560 #ifdef _WIN32 4561 if (LockFile(env->me_lfd, 0, 0, 1, 0)) { 4562 *excl = 1; 4563 } else { 4564 OVERLAPPED ov; 4565 memset(&ov, 0, sizeof(ov)); 4566 if (LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) { 4567 *excl = 0; 4568 } else { 4569 rc = ErrCode(); 4570 } 4571 } 4572 #else 4573 struct flock lock_info; 4574 memset((void *)&lock_info, 0, sizeof(lock_info)); 4575 lock_info.l_type = F_WRLCK; 4576 lock_info.l_whence = SEEK_SET; 4577 lock_info.l_start = 0; 4578 lock_info.l_len = 1; 4579 while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) && 4580 (rc = ErrCode()) == EINTR) ; 4581 if (!rc) { 4582 *excl = 1; 4583 } else 4584 # ifndef MDB_USE_POSIX_MUTEX 4585 if (*excl < 0) /* always true when MDB_USE_POSIX_MUTEX */ 4586 # endif 4587 { 4588 lock_info.l_type = F_RDLCK; 4589 while ((rc = fcntl(env->me_lfd, F_SETLKW, &lock_info)) && 4590 (rc = ErrCode()) == EINTR) ; 4591 if (rc == 0) 4592 *excl = 0; 4593 } 4594 #endif 4595 return rc; 4596 } 4597 4598 #ifdef MDB_USE_HASH 4599 /* 4600 * hash_64 - 64 bit Fowler/Noll/Vo-0 FNV-1a hash code 4601 * 4602 * @(#) Revision: 5.1 4603 * @(#) Id: hash_64a.c,v 5.1 2009/06/30 09:01:38 chongo Exp 4604 * @(#) Source: /usr/local/src/cmd/fnv/RCS/hash_64a.c,v 4605 * 4606 * http://www.isthe.com/chongo/tech/comp/fnv/index.html 4607 * 4608 *** 4609 * 4610 * Please do not copyright this code. This code is in the public domain. 4611 * 4612 * LANDON CURT NOLL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 4613 * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 
IN NO 4614 * EVENT SHALL LANDON CURT NOLL BE LIABLE FOR ANY SPECIAL, INDIRECT OR 4615 * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF 4616 * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 4617 * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 4618 * PERFORMANCE OF THIS SOFTWARE. 4619 * 4620 * By: 4621 * chongo <Landon Curt Noll> /\oo/\ 4622 * http://www.isthe.com/chongo/ 4623 * 4624 * Share and Enjoy! :-) 4625 */ 4626 4627 typedef unsigned long long mdb_hash_t; 4628 #define MDB_HASH_INIT ((mdb_hash_t)0xcbf29ce484222325ULL) 4629 4630 /** perform a 64 bit Fowler/Noll/Vo FNV-1a hash on a buffer 4631 * @param[in] val value to hash 4632 * @param[in] hval initial value for hash 4633 * @return 64 bit hash 4634 * 4635 * NOTE: To use the recommended 64 bit FNV-1a hash, use MDB_HASH_INIT as the 4636 * hval arg on the first call. 4637 */ 4638 static mdb_hash_t 4639 mdb_hash_val(MDB_val *val, mdb_hash_t hval) 4640 { 4641 unsigned char *s = (unsigned char *)val->mv_data; /* unsigned string */ 4642 unsigned char *end = s + val->mv_size; 4643 /* 4644 * FNV-1a hash each octet of the string 4645 */ 4646 while (s < end) { 4647 /* xor the bottom with the current octet */ 4648 hval ^= (mdb_hash_t)*s++; 4649 4650 /* multiply by the 64 bit FNV magic prime mod 2^64 */ 4651 hval += (hval << 1) + (hval << 4) + (hval << 5) + 4652 (hval << 7) + (hval << 8) + (hval << 40); 4653 } 4654 /* return our new hash value */ 4655 return hval; 4656 } 4657 4658 /** Hash the string and output the encoded hash. 4659 * This uses modified RFC1924 Ascii85 encoding to accommodate systems with 4660 * very short name limits. We don't care about the encoding being reversible, 4661 * we just want to preserve as many bits of the input as possible in a 4662 * small printable string. 4663 * @param[in] str string to hash 4664 * @param[out] encbuf an array of 11 chars to hold the hash 4665 */ 4666 static const char mdb_a85[]= "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~"; 4667 4668 static void ESECT 4669 mdb_pack85(unsigned long l, char *out) 4670 { 4671 int i; 4672 4673 for (i=0; i<5; i++) { 4674 *out++ = mdb_a85[l % 85]; 4675 l /= 85; 4676 } 4677 } 4678 4679 static void ESECT 4680 mdb_hash_enc(MDB_val *val, char *encbuf) 4681 { 4682 mdb_hash_t h = mdb_hash_val(val, MDB_HASH_INIT); 4683 4684 mdb_pack85(h, encbuf); 4685 mdb_pack85(h>>32, encbuf+5); 4686 encbuf[10] = '\0'; 4687 } 4688 #endif 4689 4690 /** Open and/or initialize the lock region for the environment. 4691 * @param[in] env The LMDB environment. 4692 * @param[in] fname Filename + scratch area, from #mdb_fname_init(). 4693 * @param[in] mode The Unix permissions for the file, if we create it. 4694 * @param[in,out] excl In -1, out lock type: -1 none, 0 shared, 1 exclusive 4695 * @return 0 on success, non-zero on failure. 
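 * Added sizing note (derived from the code below): the lock region is one
 * MDB_txninfo header, which already accounts for the first reader slot,
 * followed by the remaining slots, hence
 *
 *	rsize = (me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo);
 *
 * When the file is already large enough (or we do not hold the exclusive
 * lock), the same formula is inverted to recover me_maxreaders from the
 * existing file size.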
4696 */ 4697 static int ESECT 4698 mdb_env_setup_locks(MDB_env *env, MDB_name *fname, int mode, int *excl) 4699 { 4700 #ifdef _WIN32 4701 # define MDB_ERRCODE_ROFS ERROR_WRITE_PROTECT 4702 #else 4703 # define MDB_ERRCODE_ROFS EROFS 4704 #endif 4705 int rc; 4706 off_t size, rsize; 4707 4708 rc = mdb_fopen(env, fname, MDB_O_LOCKS, mode, &env->me_lfd); 4709 if (rc) { 4710 /* Omit lockfile if read-only env on read-only filesystem */ 4711 if (rc == MDB_ERRCODE_ROFS && (env->me_flags & MDB_RDONLY)) { 4712 return MDB_SUCCESS; 4713 } 4714 goto fail; 4715 } 4716 4717 if (!(env->me_flags & MDB_NOTLS)) { 4718 rc = pthread_key_create(&env->me_txkey, mdb_env_reader_dest); 4719 if (rc) 4720 goto fail; 4721 env->me_flags |= MDB_ENV_TXKEY; 4722 #ifdef _WIN32 4723 /* Windows TLS callbacks need help finding their TLS info. */ 4724 if (mdb_tls_nkeys >= MAX_TLS_KEYS) { 4725 rc = MDB_TLS_FULL; 4726 goto fail; 4727 } 4728 mdb_tls_keys[mdb_tls_nkeys++] = env->me_txkey; 4729 #endif 4730 } 4731 4732 /* Try to get exclusive lock. If we succeed, then 4733 * nobody is using the lock region and we should initialize it. 4734 */ 4735 if ((rc = mdb_env_excl_lock(env, excl))) goto fail; 4736 4737 #ifdef _WIN32 4738 size = GetFileSize(env->me_lfd, NULL); 4739 #else 4740 size = lseek(env->me_lfd, 0, SEEK_END); 4741 if (size == -1) goto fail_errno; 4742 #endif 4743 rsize = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo); 4744 if (size < rsize && *excl > 0) { 4745 #ifdef _WIN32 4746 if (SetFilePointer(env->me_lfd, rsize, NULL, FILE_BEGIN) != (DWORD)rsize 4747 || !SetEndOfFile(env->me_lfd)) 4748 goto fail_errno; 4749 #else 4750 if (ftruncate(env->me_lfd, rsize) != 0) goto fail_errno; 4751 #endif 4752 } else { 4753 rsize = size; 4754 size = rsize - sizeof(MDB_txninfo); 4755 env->me_maxreaders = size/sizeof(MDB_reader) + 1; 4756 } 4757 { 4758 #ifdef _WIN32 4759 HANDLE mh; 4760 mh = CreateFileMapping(env->me_lfd, NULL, PAGE_READWRITE, 4761 0, 0, NULL); 4762 if (!mh) goto fail_errno; 4763 env->me_txns = MapViewOfFileEx(mh, FILE_MAP_WRITE, 0, 0, rsize, NULL); 4764 CloseHandle(mh); 4765 if (!env->me_txns) goto fail_errno; 4766 #else 4767 void *m = mmap(NULL, rsize, PROT_READ|PROT_WRITE, MAP_SHARED, 4768 env->me_lfd, 0); 4769 if (m == MAP_FAILED) goto fail_errno; 4770 env->me_txns = m; 4771 #endif 4772 } 4773 if (*excl > 0) { 4774 #ifdef _WIN32 4775 BY_HANDLE_FILE_INFORMATION stbuf; 4776 struct { 4777 DWORD volume; 4778 DWORD nhigh; 4779 DWORD nlow; 4780 } idbuf; 4781 MDB_val val; 4782 char encbuf[11]; 4783 4784 if (!mdb_sec_inited) { 4785 InitializeSecurityDescriptor(&mdb_null_sd, 4786 SECURITY_DESCRIPTOR_REVISION); 4787 SetSecurityDescriptorDacl(&mdb_null_sd, TRUE, 0, FALSE); 4788 mdb_all_sa.nLength = sizeof(SECURITY_ATTRIBUTES); 4789 mdb_all_sa.bInheritHandle = FALSE; 4790 mdb_all_sa.lpSecurityDescriptor = &mdb_null_sd; 4791 mdb_sec_inited = 1; 4792 } 4793 if (!GetFileInformationByHandle(env->me_lfd, &stbuf)) goto fail_errno; 4794 idbuf.volume = stbuf.dwVolumeSerialNumber; 4795 idbuf.nhigh = stbuf.nFileIndexHigh; 4796 idbuf.nlow = stbuf.nFileIndexLow; 4797 val.mv_data = &idbuf; 4798 val.mv_size = sizeof(idbuf); 4799 mdb_hash_enc(&val, encbuf); 4800 sprintf(env->me_txns->mti_rmname, "Global\\MDBr%s", encbuf); 4801 sprintf(env->me_txns->mti_wmname, "Global\\MDBw%s", encbuf); 4802 env->me_rmutex = CreateMutexA(&mdb_all_sa, FALSE, env->me_txns->mti_rmname); 4803 if (!env->me_rmutex) goto fail_errno; 4804 env->me_wmutex = CreateMutexA(&mdb_all_sa, FALSE, env->me_txns->mti_wmname); 4805 if (!env->me_wmutex) goto 
fail_errno; 4806 #elif defined(MDB_USE_POSIX_SEM) 4807 struct stat stbuf; 4808 struct { 4809 dev_t dev; 4810 ino_t ino; 4811 } idbuf; 4812 MDB_val val; 4813 char encbuf[11]; 4814 4815 #if defined(__NetBSD__) 4816 #define MDB_SHORT_SEMNAMES 1 /* limited to 14 chars */ 4817 #endif 4818 if (fstat(env->me_lfd, &stbuf)) goto fail_errno; 4819 idbuf.dev = stbuf.st_dev; 4820 idbuf.ino = stbuf.st_ino; 4821 val.mv_data = &idbuf; 4822 val.mv_size = sizeof(idbuf); 4823 mdb_hash_enc(&val, encbuf); 4824 #ifdef MDB_SHORT_SEMNAMES 4825 encbuf[9] = '\0'; /* drop name from 15 chars to 14 chars */ 4826 #endif 4827 sprintf(env->me_txns->mti_rmname, "/MDBr%s", encbuf); 4828 sprintf(env->me_txns->mti_wmname, "/MDBw%s", encbuf); 4829 /* Clean up after a previous run, if needed: Try to 4830 * remove both semaphores before doing anything else. 4831 */ 4832 sem_unlink(env->me_txns->mti_rmname); 4833 sem_unlink(env->me_txns->mti_wmname); 4834 env->me_rmutex = sem_open(env->me_txns->mti_rmname, 4835 O_CREAT|O_EXCL, mode, 1); 4836 if (env->me_rmutex == SEM_FAILED) goto fail_errno; 4837 env->me_wmutex = sem_open(env->me_txns->mti_wmname, 4838 O_CREAT|O_EXCL, mode, 1); 4839 if (env->me_wmutex == SEM_FAILED) goto fail_errno; 4840 #else /* MDB_USE_POSIX_MUTEX: */ 4841 pthread_mutexattr_t mattr; 4842 4843 /* Solaris needs this before initing a robust mutex. Otherwise 4844 * it may skip the init and return EBUSY "seems someone already 4845 * inited" or EINVAL "it was inited differently". 4846 */ 4847 memset(env->me_txns->mti_rmutex, 0, sizeof(*env->me_txns->mti_rmutex)); 4848 memset(env->me_txns->mti_wmutex, 0, sizeof(*env->me_txns->mti_wmutex)); 4849 4850 if ((rc = pthread_mutexattr_init(&mattr))) 4851 goto fail; 4852 4853 rc = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED); 4854 #ifdef MDB_ROBUST_SUPPORTED 4855 if (!rc) rc = pthread_mutexattr_setrobust(&mattr, PTHREAD_MUTEX_ROBUST); 4856 #endif 4857 if (!rc) rc = pthread_mutex_init(env->me_txns->mti_rmutex, &mattr); 4858 if (!rc) rc = pthread_mutex_init(env->me_txns->mti_wmutex, &mattr); 4859 pthread_mutexattr_destroy(&mattr); 4860 if (rc) 4861 goto fail; 4862 #endif /* _WIN32 || MDB_USE_POSIX_SEM */ 4863 4864 env->me_txns->mti_magic = MDB_MAGIC; 4865 env->me_txns->mti_format = MDB_LOCK_FORMAT; 4866 env->me_txns->mti_txnid = 0; 4867 env->me_txns->mti_numreaders = 0; 4868 4869 } else { 4870 if (env->me_txns->mti_magic != MDB_MAGIC) { 4871 DPUTS("lock region has invalid magic"); 4872 rc = MDB_INVALID; 4873 goto fail; 4874 } 4875 if (env->me_txns->mti_format != MDB_LOCK_FORMAT) { 4876 DPRINTF(("lock region has format+version 0x%x, expected 0x%x", 4877 env->me_txns->mti_format, MDB_LOCK_FORMAT)); 4878 rc = MDB_VERSION_MISMATCH; 4879 goto fail; 4880 } 4881 rc = ErrCode(); 4882 if (rc && rc != EACCES && rc != EAGAIN) { 4883 goto fail; 4884 } 4885 #ifdef _WIN32 4886 env->me_rmutex = OpenMutexA(SYNCHRONIZE, FALSE, env->me_txns->mti_rmname); 4887 if (!env->me_rmutex) goto fail_errno; 4888 env->me_wmutex = OpenMutexA(SYNCHRONIZE, FALSE, env->me_txns->mti_wmname); 4889 if (!env->me_wmutex) goto fail_errno; 4890 #elif defined(MDB_USE_POSIX_SEM) 4891 env->me_rmutex = sem_open(env->me_txns->mti_rmname, 0); 4892 if (env->me_rmutex == SEM_FAILED) goto fail_errno; 4893 env->me_wmutex = sem_open(env->me_txns->mti_wmname, 0); 4894 if (env->me_wmutex == SEM_FAILED) goto fail_errno; 4895 #endif 4896 } 4897 return MDB_SUCCESS; 4898 4899 fail_errno: 4900 rc = ErrCode(); 4901 fail: 4902 return rc; 4903 } 4904 4905 /** Only a subset of the @ref mdb_env flags can be changed 4906 * at 
runtime. Changing other flags requires closing the 4907 * environment and re-opening it with the new flags. 4908 */ 4909 #define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC|MDB_NOMEMINIT) 4910 #define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY| \ 4911 MDB_WRITEMAP|MDB_NOTLS|MDB_NOLOCK|MDB_NORDAHEAD) 4912 4913 #if VALID_FLAGS & PERSISTENT_FLAGS & (CHANGEABLE|CHANGELESS) 4914 # error "Persistent DB flags & env flags overlap, but both go in mm_flags" 4915 #endif 4916 4917 int ESECT 4918 mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode) 4919 { 4920 int rc, excl = -1; 4921 MDB_name fname; 4922 4923 if (env->me_fd!=INVALID_HANDLE_VALUE || (flags & ~(CHANGEABLE|CHANGELESS))) 4924 return EINVAL; 4925 4926 flags |= env->me_flags; 4927 4928 rc = mdb_fname_init(path, flags, &fname); 4929 if (rc) 4930 return rc; 4931 4932 if (flags & MDB_RDONLY) { 4933 /* silently ignore WRITEMAP when we're only getting read access */ 4934 flags &= ~MDB_WRITEMAP; 4935 } else { 4936 if (!((env->me_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX)) && 4937 (env->me_dirty_list = calloc(MDB_IDL_UM_SIZE, sizeof(MDB_ID2))))) 4938 rc = ENOMEM; 4939 } 4940 env->me_flags = flags |= MDB_ENV_ACTIVE; 4941 if (rc) 4942 goto leave; 4943 4944 env->me_path = strdup(path); 4945 env->me_dbxs = calloc(env->me_maxdbs, sizeof(MDB_dbx)); 4946 env->me_dbflags = calloc(env->me_maxdbs, sizeof(uint16_t)); 4947 env->me_dbiseqs = calloc(env->me_maxdbs, sizeof(unsigned int)); 4948 if (!(env->me_dbxs && env->me_path && env->me_dbflags && env->me_dbiseqs)) { 4949 rc = ENOMEM; 4950 goto leave; 4951 } 4952 env->me_dbxs[FREE_DBI].md_cmp = mdb_cmp_long; /* aligned MDB_INTEGERKEY */ 4953 4954 /* For RDONLY, get lockfile after we know datafile exists */ 4955 if (!(flags & (MDB_RDONLY|MDB_NOLOCK))) { 4956 rc = mdb_env_setup_locks(env, &fname, mode, &excl); 4957 if (rc) 4958 goto leave; 4959 } 4960 4961 rc = mdb_fopen(env, &fname, 4962 (flags & MDB_RDONLY) ? MDB_O_RDONLY : MDB_O_RDWR, 4963 mode, &env->me_fd); 4964 if (rc) 4965 goto leave; 4966 4967 if ((flags & (MDB_RDONLY|MDB_NOLOCK)) == MDB_RDONLY) { 4968 rc = mdb_env_setup_locks(env, &fname, mode, &excl); 4969 if (rc) 4970 goto leave; 4971 } 4972 4973 if ((rc = mdb_env_open2(env)) == MDB_SUCCESS) { 4974 if (!(flags & (MDB_RDONLY|MDB_WRITEMAP))) { 4975 /* Synchronous fd for meta writes. Needed even with 4976 * MDB_NOSYNC/MDB_NOMETASYNC, in case these get reset. 
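 * (Added note: the sync flags can be toggled later with
 * mdb_env_set_flags(), so me_mfd has to be opened up front even though
 * the environment may currently be running with MDB_NOSYNC.)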
4977 */ 4978 rc = mdb_fopen(env, &fname, MDB_O_META, mode, &env->me_mfd); 4979 if (rc) 4980 goto leave; 4981 } 4982 DPRINTF(("opened dbenv %p", (void *) env)); 4983 if (excl > 0) { 4984 rc = mdb_env_share_locks(env, &excl); 4985 if (rc) 4986 goto leave; 4987 } 4988 if (!(flags & MDB_RDONLY)) { 4989 MDB_txn *txn; 4990 int tsize = sizeof(MDB_txn), size = tsize + env->me_maxdbs * 4991 (sizeof(MDB_db)+sizeof(MDB_cursor *)+sizeof(unsigned int)+1); 4992 if ((env->me_pbuf = calloc(1, env->me_psize)) && 4993 (txn = calloc(1, size))) 4994 { 4995 txn->mt_dbs = (MDB_db *)((char *)txn + tsize); 4996 txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs); 4997 txn->mt_dbiseqs = (unsigned int *)(txn->mt_cursors + env->me_maxdbs); 4998 txn->mt_dbflags = (unsigned char *)(txn->mt_dbiseqs + env->me_maxdbs); 4999 txn->mt_env = env; 5000 txn->mt_dbxs = env->me_dbxs; 5001 txn->mt_flags = MDB_TXN_FINISHED; 5002 env->me_txn0 = txn; 5003 } else { 5004 rc = ENOMEM; 5005 } 5006 } 5007 } 5008 5009 leave: 5010 if (rc) { 5011 mdb_env_close0(env, excl); 5012 } 5013 mdb_fname_destroy(fname); 5014 return rc; 5015 } 5016 5017 /** Destroy resources from mdb_env_open(), clear our readers & DBIs */ 5018 static void ESECT 5019 mdb_env_close0(MDB_env *env, int excl) 5020 { 5021 int i; 5022 5023 if (!(env->me_flags & MDB_ENV_ACTIVE)) 5024 return; 5025 5026 /* Doing this here since me_dbxs may not exist during mdb_env_close */ 5027 if (env->me_dbxs) { 5028 for (i = env->me_maxdbs; --i >= CORE_DBS; ) 5029 free(env->me_dbxs[i].md_name.mv_data); 5030 free(env->me_dbxs); 5031 } 5032 5033 free(env->me_pbuf); 5034 free(env->me_dbiseqs); 5035 free(env->me_dbflags); 5036 free(env->me_path); 5037 free(env->me_dirty_list); 5038 free(env->me_txn0); 5039 mdb_midl_free(env->me_free_pgs); 5040 5041 if (env->me_flags & MDB_ENV_TXKEY) { 5042 pthread_key_delete(env->me_txkey); 5043 #ifdef _WIN32 5044 /* Delete our key from the global list */ 5045 for (i=0; i<mdb_tls_nkeys; i++) 5046 if (mdb_tls_keys[i] == env->me_txkey) { 5047 mdb_tls_keys[i] = mdb_tls_keys[mdb_tls_nkeys-1]; 5048 mdb_tls_nkeys--; 5049 break; 5050 } 5051 #endif 5052 } 5053 5054 if (env->me_map) { 5055 munmap(env->me_map, env->me_mapsize); 5056 } 5057 if (env->me_mfd != INVALID_HANDLE_VALUE) 5058 (void) close(env->me_mfd); 5059 if (env->me_fd != INVALID_HANDLE_VALUE) 5060 (void) close(env->me_fd); 5061 if (env->me_txns) { 5062 MDB_PID_T pid = env->me_pid; 5063 /* Clearing readers is done in this function because 5064 * me_txkey with its destructor must be disabled first. 5065 * 5066 * We skip the the reader mutex, so we touch only 5067 * data owned by this process (me_close_readers and 5068 * our readers), and clear each reader atomically. 5069 */ 5070 for (i = env->me_close_readers; --i >= 0; ) 5071 if (env->me_txns->mti_readers[i].mr_pid == pid) 5072 env->me_txns->mti_readers[i].mr_pid = 0; 5073 #ifdef _WIN32 5074 if (env->me_rmutex) { 5075 CloseHandle(env->me_rmutex); 5076 if (env->me_wmutex) CloseHandle(env->me_wmutex); 5077 } 5078 /* Windows automatically destroys the mutexes when 5079 * the last handle closes. 5080 */ 5081 #elif defined(MDB_USE_POSIX_SEM) 5082 if (env->me_rmutex != SEM_FAILED) { 5083 sem_close(env->me_rmutex); 5084 if (env->me_wmutex != SEM_FAILED) 5085 sem_close(env->me_wmutex); 5086 /* If we have the filelock: If we are the 5087 * only remaining user, clean up semaphores. 
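 * (Added note on POSIX semantics: sem_unlink() removes only the name;
 * a process that still holds the semaphore open keeps using it until it
 * calls sem_close().)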
5088 */ 5089 if (excl == 0) 5090 mdb_env_excl_lock(env, &excl); 5091 if (excl > 0) { 5092 sem_unlink(env->me_txns->mti_rmname); 5093 sem_unlink(env->me_txns->mti_wmname); 5094 } 5095 } 5096 #endif 5097 munmap((void *)env->me_txns, (env->me_maxreaders-1)*sizeof(MDB_reader)+sizeof(MDB_txninfo)); 5098 } 5099 if (env->me_lfd != INVALID_HANDLE_VALUE) { 5100 #ifdef _WIN32 5101 if (excl >= 0) { 5102 /* Unlock the lockfile. Windows would have unlocked it 5103 * after closing anyway, but not necessarily at once. 5104 */ 5105 UnlockFile(env->me_lfd, 0, 0, 1, 0); 5106 } 5107 #endif 5108 (void) close(env->me_lfd); 5109 } 5110 5111 env->me_flags &= ~(MDB_ENV_ACTIVE|MDB_ENV_TXKEY); 5112 } 5113 5114 void ESECT 5115 mdb_env_close(MDB_env *env) 5116 { 5117 MDB_page *dp; 5118 5119 if (env == NULL) 5120 return; 5121 5122 VGMEMP_DESTROY(env); 5123 while ((dp = env->me_dpages) != NULL) { 5124 VGMEMP_DEFINED(&dp->mp_next, sizeof(dp->mp_next)); 5125 env->me_dpages = dp->mp_next; 5126 free(dp); 5127 } 5128 5129 mdb_env_close0(env, 0); 5130 free(env); 5131 } 5132 5133 /** Compare two items pointing at aligned size_t's */ 5134 static int 5135 mdb_cmp_long(const MDB_val *a, const MDB_val *b) 5136 { 5137 return (*(size_t *)a->mv_data < *(size_t *)b->mv_data) ? -1 : 5138 *(size_t *)a->mv_data > *(size_t *)b->mv_data; 5139 } 5140 5141 /** Compare two items pointing at aligned unsigned int's. 5142 * 5143 * This is also set as #MDB_INTEGERDUP|#MDB_DUPFIXED's #MDB_dbx.%md_dcmp, 5144 * but #mdb_cmp_clong() is called instead if the data type is size_t. 5145 */ 5146 static int 5147 mdb_cmp_int(const MDB_val *a, const MDB_val *b) 5148 { 5149 return (*(unsigned int *)a->mv_data < *(unsigned int *)b->mv_data) ? -1 : 5150 *(unsigned int *)a->mv_data > *(unsigned int *)b->mv_data; 5151 } 5152 5153 /** Compare two items pointing at unsigned ints of unknown alignment. 5154 * Nodes and keys are guaranteed to be 2-byte aligned. 5155 */ 5156 static int 5157 mdb_cmp_cint(const MDB_val *a, const MDB_val *b) 5158 { 5159 #if BYTE_ORDER == LITTLE_ENDIAN 5160 unsigned short *u, *c; 5161 int x; 5162 5163 u = (unsigned short *) ((char *) a->mv_data + a->mv_size); 5164 c = (unsigned short *) ((char *) b->mv_data + a->mv_size); 5165 do { 5166 x = *--u - *--c; 5167 } while(!x && u > (unsigned short *)a->mv_data); 5168 return x; 5169 #else 5170 unsigned short *u, *c, *end; 5171 int x; 5172 5173 end = (unsigned short *) ((char *) a->mv_data + a->mv_size); 5174 u = (unsigned short *)a->mv_data; 5175 c = (unsigned short *)b->mv_data; 5176 do { 5177 x = *u++ - *c++; 5178 } while(!x && u < end); 5179 return x; 5180 #endif 5181 } 5182 5183 /** Compare two items lexically */ 5184 static int 5185 mdb_cmp_memn(const MDB_val *a, const MDB_val *b) 5186 { 5187 int diff; 5188 ssize_t len_diff; 5189 unsigned int len; 5190 5191 len = a->mv_size; 5192 len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size; 5193 if (len_diff > 0) { 5194 len = b->mv_size; 5195 len_diff = 1; 5196 } 5197 5198 diff = memcmp(a->mv_data, b->mv_data, len); 5199 return diff ? diff : len_diff<0 ? 
-1 : len_diff; 5200 } 5201 5202 /** Compare two items in reverse byte order */ 5203 static int 5204 mdb_cmp_memnr(const MDB_val *a, const MDB_val *b) 5205 { 5206 const unsigned char *p1, *p2, *p1_lim; 5207 ssize_t len_diff; 5208 int diff; 5209 5210 p1_lim = (const unsigned char *)a->mv_data; 5211 p1 = (const unsigned char *)a->mv_data + a->mv_size; 5212 p2 = (const unsigned char *)b->mv_data + b->mv_size; 5213 5214 len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size; 5215 if (len_diff > 0) { 5216 p1_lim += len_diff; 5217 len_diff = 1; 5218 } 5219 5220 while (p1 > p1_lim) { 5221 diff = *--p1 - *--p2; 5222 if (diff) 5223 return diff; 5224 } 5225 return len_diff<0 ? -1 : len_diff; 5226 } 5227 5228 /** Search for key within a page, using binary search. 5229 * Returns the smallest entry larger or equal to the key. 5230 * If exactp is non-null, stores whether the found entry was an exact match 5231 * in *exactp (1 or 0). 5232 * Updates the cursor index with the index of the found entry. 5233 * If no entry larger or equal to the key is found, returns NULL. 5234 */ 5235 static MDB_node * 5236 mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp) 5237 { 5238 unsigned int i = 0, nkeys; 5239 int low, high; 5240 int rc = 0; 5241 MDB_page *mp = mc->mc_pg[mc->mc_top]; 5242 MDB_node *node = NULL; 5243 MDB_val nodekey; 5244 MDB_cmp_func *cmp; 5245 DKBUF; 5246 5247 nkeys = NUMKEYS(mp); 5248 5249 DPRINTF(("searching %u keys in %s %spage %"Z"u", 5250 nkeys, IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "", 5251 mdb_dbg_pgno(mp))); 5252 5253 low = IS_LEAF(mp) ? 0 : 1; 5254 high = nkeys - 1; 5255 cmp = mc->mc_dbx->md_cmp; 5256 5257 /* Branch pages have no data, so if using integer keys, 5258 * alignment is guaranteed. Use faster mdb_cmp_int. 5259 */ 5260 if (cmp == mdb_cmp_cint && IS_BRANCH(mp)) { 5261 if (NODEPTR(mp, 1)->mn_ksize == sizeof(size_t)) 5262 cmp = mdb_cmp_long; 5263 else 5264 cmp = mdb_cmp_int; 5265 } 5266 5267 if (IS_LEAF2(mp)) { 5268 nodekey.mv_size = mc->mc_db->md_pad; 5269 node = NODEPTR(mp, 0); /* fake */ 5270 while (low <= high) { 5271 i = (low + high) >> 1; 5272 nodekey.mv_data = LEAF2KEY(mp, i, nodekey.mv_size); 5273 rc = cmp(key, &nodekey); 5274 DPRINTF(("found leaf index %u [%s], rc = %i", 5275 i, DKEY(&nodekey), rc)); 5276 if (rc == 0) 5277 break; 5278 if (rc > 0) 5279 low = i + 1; 5280 else 5281 high = i - 1; 5282 } 5283 } else { 5284 while (low <= high) { 5285 i = (low + high) >> 1; 5286 5287 node = NODEPTR(mp, i); 5288 nodekey.mv_size = NODEKSZ(node); 5289 nodekey.mv_data = NODEKEY(node); 5290 5291 rc = cmp(key, &nodekey); 5292 #if MDB_DEBUG 5293 if (IS_LEAF(mp)) 5294 DPRINTF(("found leaf index %u [%s], rc = %i", 5295 i, DKEY(&nodekey), rc)); 5296 else 5297 DPRINTF(("found branch index %u [%s -> %"Z"u], rc = %i", 5298 i, DKEY(&nodekey), NODEPGNO(node), rc)); 5299 #endif 5300 if (rc == 0) 5301 break; 5302 if (rc > 0) 5303 low = i + 1; 5304 else 5305 high = i - 1; 5306 } 5307 } 5308 5309 if (rc > 0) { /* Found entry is less than the key. */ 5310 i++; /* Skip to get the smallest entry larger than key. */ 5311 if (!IS_LEAF2(mp)) 5312 node = NODEPTR(mp, i); 5313 } 5314 if (exactp) 5315 *exactp = (rc == 0 && nkeys > 0); 5316 /* store the key index */ 5317 mc->mc_ki[mc->mc_top] = i; 5318 if (i >= nkeys) 5319 /* There is no entry larger or equal to the key. 
*/ 5320 return NULL; 5321 5322 /* nodeptr is fake for LEAF2 */ 5323 return node; 5324 } 5325 5326 #if 0 5327 static void 5328 mdb_cursor_adjust(MDB_cursor *mc, func) 5329 { 5330 MDB_cursor *m2; 5331 5332 for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { 5333 if (m2->mc_pg[m2->mc_top] == mc->mc_pg[mc->mc_top]) { 5334 func(mc, m2); 5335 } 5336 } 5337 } 5338 #endif 5339 5340 /** Pop a page off the top of the cursor's stack. */ 5341 static void 5342 mdb_cursor_pop(MDB_cursor *mc) 5343 { 5344 if (mc->mc_snum) { 5345 DPRINTF(("popping page %"Z"u off db %d cursor %p", 5346 mc->mc_pg[mc->mc_top]->mp_pgno, DDBI(mc), (void *) mc)); 5347 5348 mc->mc_snum--; 5349 if (mc->mc_snum) { 5350 mc->mc_top--; 5351 } else { 5352 mc->mc_flags &= ~C_INITIALIZED; 5353 } 5354 } 5355 } 5356 5357 /** Push a page onto the top of the cursor's stack. 5358 * Set #MDB_TXN_ERROR on failure. 5359 */ 5360 static int 5361 mdb_cursor_push(MDB_cursor *mc, MDB_page *mp) 5362 { 5363 DPRINTF(("pushing page %"Z"u on db %d cursor %p", mp->mp_pgno, 5364 DDBI(mc), (void *) mc)); 5365 5366 if (mc->mc_snum >= CURSOR_STACK) { 5367 mc->mc_txn->mt_flags |= MDB_TXN_ERROR; 5368 return MDB_CURSOR_FULL; 5369 } 5370 5371 mc->mc_top = mc->mc_snum++; 5372 mc->mc_pg[mc->mc_top] = mp; 5373 mc->mc_ki[mc->mc_top] = 0; 5374 5375 return MDB_SUCCESS; 5376 } 5377 5378 /** Find the address of the page corresponding to a given page number. 5379 * Set #MDB_TXN_ERROR on failure. 5380 * @param[in] mc the cursor accessing the page. 5381 * @param[in] pgno the page number for the page to retrieve. 5382 * @param[out] ret address of a pointer where the page's address will be stored. 5383 * @param[out] lvl dirty_list inheritance level of found page. 1=current txn, 0=mapped page. 5384 * @return 0 on success, non-zero on failure. 5385 */ 5386 static int 5387 mdb_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **ret, int *lvl) 5388 { 5389 MDB_txn *txn = mc->mc_txn; 5390 MDB_env *env = txn->mt_env; 5391 MDB_page *p = NULL; 5392 int level; 5393 5394 if (! (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_WRITEMAP))) { 5395 MDB_txn *tx2 = txn; 5396 level = 1; 5397 do { 5398 MDB_ID2L dl = tx2->mt_u.dirty_list; 5399 unsigned x; 5400 /* Spilled pages were dirtied in this txn and flushed 5401 * because the dirty list got full. Bring this page 5402 * back in from the map (but don't unspill it here, 5403 * leave that unless page_touch happens again). 5404 */ 5405 if (tx2->mt_spill_pgs) { 5406 MDB_ID pn = pgno << 1; 5407 x = mdb_midl_search(tx2->mt_spill_pgs, pn); 5408 if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) { 5409 p = (MDB_page *)(env->me_map + env->me_psize * pgno); 5410 goto done; 5411 } 5412 } 5413 if (dl[0].mid) { 5414 unsigned x = mdb_mid2l_search(dl, pgno); 5415 if (x <= dl[0].mid && dl[x].mid == pgno) { 5416 p = dl[x].mptr; 5417 goto done; 5418 } 5419 } 5420 level++; 5421 } while ((tx2 = tx2->mt_parent) != NULL); 5422 } 5423 5424 if (pgno < txn->mt_next_pgno) { 5425 level = 0; 5426 p = (MDB_page *)(env->me_map + env->me_psize * pgno); 5427 } else { 5428 DPRINTF(("page %"Z"u not found", pgno)); 5429 txn->mt_flags |= MDB_TXN_ERROR; 5430 return MDB_PAGE_NOTFOUND; 5431 } 5432 5433 done: 5434 *ret = p; 5435 if (lvl) 5436 *lvl = level; 5437 return MDB_SUCCESS; 5438 } 5439 5440 /** Finish #mdb_page_search() / #mdb_page_search_lowest(). 5441 * The cursor is at the root page, set up the rest of it. 
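 * Starting from the root, each branch page is examined with
 * #mdb_node_search() (or the first/last child is taken for
 * MDB_PS_FIRST/MDB_PS_LAST), the chosen child page is fetched with
 * #mdb_page_get() and pushed onto the cursor stack with
 * #mdb_cursor_push(), until a leaf page is reached.  With
 * MDB_PS_MODIFY every visited page is also made writable through
 * #mdb_page_touch().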
5442 */ 5443 static int 5444 mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) 5445 { 5446 MDB_page *mp = mc->mc_pg[mc->mc_top]; 5447 int rc; 5448 DKBUF; 5449 5450 while (IS_BRANCH(mp)) { 5451 MDB_node *node; 5452 indx_t i; 5453 5454 DPRINTF(("branch page %"Z"u has %u keys", mp->mp_pgno, NUMKEYS(mp))); 5455 /* Don't assert on branch pages in the FreeDB. We can get here 5456 * while in the process of rebalancing a FreeDB branch page; we must 5457 * let that proceed. ITS#8336 5458 */ 5459 mdb_cassert(mc, !mc->mc_dbi || NUMKEYS(mp) > 1); 5460 DPRINTF(("found index 0 to page %"Z"u", NODEPGNO(NODEPTR(mp, 0)))); 5461 5462 if (flags & (MDB_PS_FIRST|MDB_PS_LAST)) { 5463 i = 0; 5464 if (flags & MDB_PS_LAST) { 5465 i = NUMKEYS(mp) - 1; 5466 /* if already init'd, see if we're already in right place */ 5467 if (mc->mc_flags & C_INITIALIZED) { 5468 if (mc->mc_ki[mc->mc_top] == i) { 5469 mc->mc_top = mc->mc_snum++; 5470 mp = mc->mc_pg[mc->mc_top]; 5471 goto ready; 5472 } 5473 } 5474 } 5475 } else { 5476 int exact; 5477 node = mdb_node_search(mc, key, &exact); 5478 if (node == NULL) 5479 i = NUMKEYS(mp) - 1; 5480 else { 5481 i = mc->mc_ki[mc->mc_top]; 5482 if (!exact) { 5483 mdb_cassert(mc, i > 0); 5484 i--; 5485 } 5486 } 5487 DPRINTF(("following index %u for key [%s]", i, DKEY(key))); 5488 } 5489 5490 mdb_cassert(mc, i < NUMKEYS(mp)); 5491 node = NODEPTR(mp, i); 5492 5493 if ((rc = mdb_page_get(mc, NODEPGNO(node), &mp, NULL)) != 0) 5494 return rc; 5495 5496 mc->mc_ki[mc->mc_top] = i; 5497 if ((rc = mdb_cursor_push(mc, mp))) 5498 return rc; 5499 5500 ready: 5501 if (flags & MDB_PS_MODIFY) { 5502 if ((rc = mdb_page_touch(mc)) != 0) 5503 return rc; 5504 mp = mc->mc_pg[mc->mc_top]; 5505 } 5506 } 5507 5508 if (!IS_LEAF(mp)) { 5509 DPRINTF(("internal error, index points to a %02X page!?", 5510 mp->mp_flags)); 5511 mc->mc_txn->mt_flags |= MDB_TXN_ERROR; 5512 return MDB_CORRUPTED; 5513 } 5514 5515 DPRINTF(("found leaf page %"Z"u for key [%s]", mp->mp_pgno, 5516 key ? DKEY(key) : "null")); 5517 mc->mc_flags |= C_INITIALIZED; 5518 mc->mc_flags &= ~C_EOF; 5519 5520 return MDB_SUCCESS; 5521 } 5522 5523 /** Search for the lowest key under the current branch page. 5524 * This just bypasses a NUMKEYS check in the current page 5525 * before calling mdb_page_search_root(), because the callers 5526 * are all in situations where the current page is known to 5527 * be underfilled. 5528 */ 5529 static int 5530 mdb_page_search_lowest(MDB_cursor *mc) 5531 { 5532 MDB_page *mp = mc->mc_pg[mc->mc_top]; 5533 MDB_node *node = NODEPTR(mp, 0); 5534 int rc; 5535 5536 if ((rc = mdb_page_get(mc, NODEPGNO(node), &mp, NULL)) != 0) 5537 return rc; 5538 5539 mc->mc_ki[mc->mc_top] = 0; 5540 if ((rc = mdb_cursor_push(mc, mp))) 5541 return rc; 5542 return mdb_page_search_root(mc, NULL, MDB_PS_FIRST); 5543 } 5544 5545 /** Search for the page a given key should be in. 5546 * Push it and its parent pages on the cursor stack. 5547 * @param[in,out] mc the cursor for this operation. 5548 * @param[in] key the key to search for, or NULL for first/last page. 5549 * @param[in] flags If MDB_PS_MODIFY is set, visited pages in the DB 5550 * are touched (updated with new page numbers). 5551 * If MDB_PS_FIRST or MDB_PS_LAST is set, find first or last leaf. 5552 * This is used by #mdb_cursor_first() and #mdb_cursor_last(). 5553 * If MDB_PS_ROOTONLY set, just fetch root node, no further lookups. 5554 * @return 0 on success, non-zero on failure. 
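 *
 * For illustration, these are the call patterns used elsewhere in
 * this file (existing call sites, shown here only as a sketch):
 *
 *	mdb_page_search(mc, key, 0);              // keyed lookup, mdb_cursor_set()
 *	mdb_page_search(mc, NULL, MDB_PS_FIRST);  // leftmost leaf, mdb_cursor_first()
 *	mdb_page_search(mc, NULL, MDB_PS_LAST);   // rightmost leaf, mdb_cursor_last()
 *	mdb_page_search(&mc2, &mc->mc_dbx->md_name, MDB_PS_MODIFY); // touch path, mdb_cursor_touch()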
5555 */ 5556 static int 5557 mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags) 5558 { 5559 int rc; 5560 pgno_t root; 5561 5562 /* Make sure the txn is still viable, then find the root from 5563 * the txn's db table and set it as the root of the cursor's stack. 5564 */ 5565 if (mc->mc_txn->mt_flags & MDB_TXN_BLOCKED) { 5566 DPUTS("transaction may not be used now"); 5567 return MDB_BAD_TXN; 5568 } else { 5569 /* Make sure we're using an up-to-date root */ 5570 if (*mc->mc_dbflag & DB_STALE) { 5571 MDB_cursor mc2; 5572 if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi)) 5573 return MDB_BAD_DBI; 5574 mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL); 5575 rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, 0); 5576 if (rc) 5577 return rc; 5578 { 5579 MDB_val data; 5580 int exact = 0; 5581 uint16_t flags; 5582 MDB_node *leaf = mdb_node_search(&mc2, 5583 &mc->mc_dbx->md_name, &exact); 5584 if (!exact) 5585 return MDB_NOTFOUND; 5586 if ((leaf->mn_flags & (F_DUPDATA|F_SUBDATA)) != F_SUBDATA) 5587 return MDB_INCOMPATIBLE; /* not a named DB */ 5588 rc = mdb_node_read(&mc2, leaf, &data); 5589 if (rc) 5590 return rc; 5591 memcpy(&flags, ((char *) data.mv_data + offsetof(MDB_db, md_flags)), 5592 sizeof(uint16_t)); 5593 /* The txn may not know this DBI, or another process may 5594 * have dropped and recreated the DB with other flags. 5595 */ 5596 if ((mc->mc_db->md_flags & PERSISTENT_FLAGS) != flags) 5597 return MDB_INCOMPATIBLE; 5598 memcpy(mc->mc_db, data.mv_data, sizeof(MDB_db)); 5599 } 5600 *mc->mc_dbflag &= ~DB_STALE; 5601 } 5602 root = mc->mc_db->md_root; 5603 5604 if (root == P_INVALID) { /* Tree is empty. */ 5605 DPUTS("tree is empty"); 5606 return MDB_NOTFOUND; 5607 } 5608 } 5609 5610 mdb_cassert(mc, root > 1); 5611 if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root) 5612 if ((rc = mdb_page_get(mc, root, &mc->mc_pg[0], NULL)) != 0) 5613 return rc; 5614 5615 mc->mc_snum = 1; 5616 mc->mc_top = 0; 5617 5618 DPRINTF(("db %d root page %"Z"u has flags 0x%X", 5619 DDBI(mc), root, mc->mc_pg[0]->mp_flags)); 5620 5621 if (flags & MDB_PS_MODIFY) { 5622 if ((rc = mdb_page_touch(mc))) 5623 return rc; 5624 } 5625 5626 if (flags & MDB_PS_ROOTONLY) 5627 return MDB_SUCCESS; 5628 5629 return mdb_page_search_root(mc, key, flags); 5630 } 5631 5632 static int 5633 mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp) 5634 { 5635 MDB_txn *txn = mc->mc_txn; 5636 pgno_t pg = mp->mp_pgno; 5637 unsigned x = 0, ovpages = mp->mp_pages; 5638 MDB_env *env = txn->mt_env; 5639 MDB_IDL sl = txn->mt_spill_pgs; 5640 MDB_ID pn = pg << 1; 5641 int rc; 5642 5643 DPRINTF(("free ov page %"Z"u (%d)", pg, ovpages)); 5644 /* If the page is dirty or on the spill list we just acquired it, 5645 * so we should give it back to our current free list, if any. 5646 * Otherwise put it onto the list of pages we freed in this txn. 5647 * 5648 * Won't create me_pghead: me_pglast must be inited along with it. 5649 * Unsupported in nested txns: They would need to hide the page 5650 * range in ancestor txns' dirty and spilled lists. 
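	 * In short: a dirty or spilled overflow chunk in a top-level txn
	 * (once me_pghead has been loaded) is unlinked from the dirty or
	 * spill list and re-inserted into me_pghead so it can be reused at
	 * once; any other chunk is appended to mt_free_pgs instead.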
5651 */ 5652 if (env->me_pghead && 5653 !txn->mt_parent && 5654 ((mp->mp_flags & P_DIRTY) || 5655 (sl && (x = mdb_midl_search(sl, pn)) <= sl[0] && sl[x] == pn))) 5656 { 5657 unsigned i, j; 5658 pgno_t *mop; 5659 MDB_ID2 *dl, ix, iy; 5660 rc = mdb_midl_need(&env->me_pghead, ovpages); 5661 if (rc) 5662 return rc; 5663 if (!(mp->mp_flags & P_DIRTY)) { 5664 /* This page is no longer spilled */ 5665 if (x == sl[0]) 5666 sl[0]--; 5667 else 5668 sl[x] |= 1; 5669 goto release; 5670 } 5671 /* Remove from dirty list */ 5672 dl = txn->mt_u.dirty_list; 5673 x = dl[0].mid--; 5674 for (ix = dl[x]; ix.mptr != mp; ix = iy) { 5675 if (x > 1) { 5676 x--; 5677 iy = dl[x]; 5678 dl[x] = ix; 5679 } else { 5680 mdb_cassert(mc, x > 1); 5681 j = ++(dl[0].mid); 5682 dl[j] = ix; /* Unsorted. OK when MDB_TXN_ERROR. */ 5683 txn->mt_flags |= MDB_TXN_ERROR; 5684 return MDB_CORRUPTED; 5685 } 5686 } 5687 txn->mt_dirty_room++; 5688 if (!(env->me_flags & MDB_WRITEMAP)) 5689 mdb_dpage_free(env, mp); 5690 release: 5691 /* Insert in me_pghead */ 5692 mop = env->me_pghead; 5693 j = mop[0] + ovpages; 5694 for (i = mop[0]; i && mop[i] < pg; i--) 5695 mop[j--] = mop[i]; 5696 while (j>i) 5697 mop[j--] = pg++; 5698 mop[0] += ovpages; 5699 } else { 5700 rc = mdb_midl_append_range(&txn->mt_free_pgs, pg, ovpages); 5701 if (rc) 5702 return rc; 5703 } 5704 mc->mc_db->md_overflow_pages -= ovpages; 5705 return 0; 5706 } 5707 5708 /** Return the data associated with a given node. 5709 * @param[in] mc The cursor for this operation. 5710 * @param[in] leaf The node being read. 5711 * @param[out] data Updated to point to the node's data. 5712 * @return 0 on success, non-zero on failure. 5713 */ 5714 static int 5715 mdb_node_read(MDB_cursor *mc, MDB_node *leaf, MDB_val *data) 5716 { 5717 MDB_page *omp; /* overflow page */ 5718 pgno_t pgno; 5719 int rc; 5720 5721 if (!F_ISSET(leaf->mn_flags, F_BIGDATA)) { 5722 data->mv_size = NODEDSZ(leaf); 5723 data->mv_data = NODEDATA(leaf); 5724 return MDB_SUCCESS; 5725 } 5726 5727 /* Read overflow data. 5728 */ 5729 data->mv_size = NODEDSZ(leaf); 5730 memcpy(&pgno, NODEDATA(leaf), sizeof(pgno)); 5731 if ((rc = mdb_page_get(mc, pgno, &omp, NULL)) != 0) { 5732 DPRINTF(("read overflow page %"Z"u failed", pgno)); 5733 return rc; 5734 } 5735 data->mv_data = METADATA(omp); 5736 5737 return MDB_SUCCESS; 5738 } 5739 5740 int 5741 mdb_get(MDB_txn *txn, MDB_dbi dbi, 5742 MDB_val *key, MDB_val *data) 5743 { 5744 MDB_cursor mc; 5745 MDB_xcursor mx; 5746 int exact = 0; 5747 DKBUF; 5748 5749 DPRINTF(("===> get db %u key [%s]", dbi, DKEY(key))); 5750 5751 if (!key || !data || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) 5752 return EINVAL; 5753 5754 if (txn->mt_flags & MDB_TXN_BLOCKED) 5755 return MDB_BAD_TXN; 5756 5757 mdb_cursor_init(&mc, txn, dbi, &mx); 5758 return mdb_cursor_set(&mc, key, data, MDB_SET, &exact); 5759 } 5760 5761 /** Find a sibling for a page. 5762 * Replaces the page at the top of the cursor's stack with the 5763 * specified sibling, if one exists. 5764 * @param[in] mc The cursor for this operation. 5765 * @param[in] move_right Non-zero if the right sibling is requested, 5766 * otherwise the left sibling. 5767 * @return 0 on success, non-zero on failure. 
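 *
 * Called when a cursor runs off the end of its current leaf, e.g.
 * (existing call sites, shown for illustration):
 *
 *	rc = mdb_cursor_sibling(mc, 1);	// mdb_cursor_next(): step to the right leaf
 *	rc = mdb_cursor_sibling(mc, 0);	// mdb_cursor_prev(): step to the left leaf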
5768 */ 5769 static int 5770 mdb_cursor_sibling(MDB_cursor *mc, int move_right) 5771 { 5772 int rc; 5773 MDB_node *indx; 5774 MDB_page *mp; 5775 5776 if (mc->mc_snum < 2) { 5777 return MDB_NOTFOUND; /* root has no siblings */ 5778 } 5779 5780 mdb_cursor_pop(mc); 5781 DPRINTF(("parent page is page %"Z"u, index %u", 5782 mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top])); 5783 5784 if (move_right ? (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mc->mc_pg[mc->mc_top])) 5785 : (mc->mc_ki[mc->mc_top] == 0)) { 5786 DPRINTF(("no more keys left, moving to %s sibling", 5787 move_right ? "right" : "left")); 5788 if ((rc = mdb_cursor_sibling(mc, move_right)) != MDB_SUCCESS) { 5789 /* undo cursor_pop before returning */ 5790 mc->mc_top++; 5791 mc->mc_snum++; 5792 return rc; 5793 } 5794 } else { 5795 if (move_right) 5796 mc->mc_ki[mc->mc_top]++; 5797 else 5798 mc->mc_ki[mc->mc_top]--; 5799 DPRINTF(("just moving to %s index key %u", 5800 move_right ? "right" : "left", mc->mc_ki[mc->mc_top])); 5801 } 5802 mdb_cassert(mc, IS_BRANCH(mc->mc_pg[mc->mc_top])); 5803 5804 indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); 5805 if ((rc = mdb_page_get(mc, NODEPGNO(indx), &mp, NULL)) != 0) { 5806 /* mc will be inconsistent if caller does mc_snum++ as above */ 5807 mc->mc_flags &= ~(C_INITIALIZED|C_EOF); 5808 return rc; 5809 } 5810 5811 mdb_cursor_push(mc, mp); 5812 if (!move_right) 5813 mc->mc_ki[mc->mc_top] = NUMKEYS(mp)-1; 5814 5815 return MDB_SUCCESS; 5816 } 5817 5818 /** Move the cursor to the next data item. */ 5819 static int 5820 mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) 5821 { 5822 MDB_page *mp; 5823 MDB_node *leaf; 5824 int rc; 5825 5826 if ((mc->mc_flags & C_DEL && op == MDB_NEXT_DUP)) 5827 return MDB_NOTFOUND; 5828 5829 if (!(mc->mc_flags & C_INITIALIZED)) 5830 return mdb_cursor_first(mc, key, data); 5831 5832 mp = mc->mc_pg[mc->mc_top]; 5833 5834 if (mc->mc_flags & C_EOF) { 5835 if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mp)-1) 5836 return MDB_NOTFOUND; 5837 mc->mc_flags ^= C_EOF; 5838 } 5839 5840 if (mc->mc_db->md_flags & MDB_DUPSORT) { 5841 leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); 5842 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { 5843 if (op == MDB_NEXT || op == MDB_NEXT_DUP) { 5844 rc = mdb_cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_NEXT); 5845 if (op != MDB_NEXT || rc != MDB_NOTFOUND) { 5846 if (rc == MDB_SUCCESS) 5847 MDB_GET_KEY(leaf, key); 5848 return rc; 5849 } 5850 } 5851 } else { 5852 mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); 5853 if (op == MDB_NEXT_DUP) 5854 return MDB_NOTFOUND; 5855 } 5856 } 5857 5858 DPRINTF(("cursor_next: top page is %"Z"u in cursor %p", 5859 mdb_dbg_pgno(mp), (void *) mc)); 5860 if (mc->mc_flags & C_DEL) { 5861 mc->mc_flags ^= C_DEL; 5862 goto skip; 5863 } 5864 5865 if (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mp)) { 5866 DPUTS("=====> move to next sibling page"); 5867 if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS) { 5868 mc->mc_flags |= C_EOF; 5869 return rc; 5870 } 5871 mp = mc->mc_pg[mc->mc_top]; 5872 DPRINTF(("next page is %"Z"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top])); 5873 } else 5874 mc->mc_ki[mc->mc_top]++; 5875 5876 skip: 5877 DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u", 5878 mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top])); 5879 5880 if (IS_LEAF2(mp)) { 5881 key->mv_size = mc->mc_db->md_pad; 5882 key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); 5883 return MDB_SUCCESS; 5884 } 5885 5886 mdb_cassert(mc, IS_LEAF(mp)); 5887 leaf = 
NODEPTR(mp, mc->mc_ki[mc->mc_top]); 5888 5889 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { 5890 mdb_xcursor_init1(mc, leaf); 5891 } 5892 if (data) { 5893 if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) 5894 return rc; 5895 5896 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { 5897 rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); 5898 if (rc != MDB_SUCCESS) 5899 return rc; 5900 } 5901 } 5902 5903 MDB_GET_KEY(leaf, key); 5904 return MDB_SUCCESS; 5905 } 5906 5907 /** Move the cursor to the previous data item. */ 5908 static int 5909 mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) 5910 { 5911 MDB_page *mp; 5912 MDB_node *leaf; 5913 int rc; 5914 5915 if (!(mc->mc_flags & C_INITIALIZED)) { 5916 rc = mdb_cursor_last(mc, key, data); 5917 if (rc) 5918 return rc; 5919 mc->mc_ki[mc->mc_top]++; 5920 } 5921 5922 mp = mc->mc_pg[mc->mc_top]; 5923 5924 if (mc->mc_db->md_flags & MDB_DUPSORT) { 5925 leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); 5926 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { 5927 if (op == MDB_PREV || op == MDB_PREV_DUP) { 5928 rc = mdb_cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_PREV); 5929 if (op != MDB_PREV || rc != MDB_NOTFOUND) { 5930 if (rc == MDB_SUCCESS) { 5931 MDB_GET_KEY(leaf, key); 5932 mc->mc_flags &= ~C_EOF; 5933 } 5934 return rc; 5935 } 5936 } 5937 } else { 5938 mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); 5939 if (op == MDB_PREV_DUP) 5940 return MDB_NOTFOUND; 5941 } 5942 } 5943 5944 DPRINTF(("cursor_prev: top page is %"Z"u in cursor %p", 5945 mdb_dbg_pgno(mp), (void *) mc)); 5946 5947 mc->mc_flags &= ~(C_EOF|C_DEL); 5948 5949 if (mc->mc_ki[mc->mc_top] == 0) { 5950 DPUTS("=====> move to prev sibling page"); 5951 if ((rc = mdb_cursor_sibling(mc, 0)) != MDB_SUCCESS) { 5952 return rc; 5953 } 5954 mp = mc->mc_pg[mc->mc_top]; 5955 mc->mc_ki[mc->mc_top] = NUMKEYS(mp) - 1; 5956 DPRINTF(("prev page is %"Z"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top])); 5957 } else 5958 mc->mc_ki[mc->mc_top]--; 5959 5960 DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u", 5961 mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top])); 5962 5963 if (IS_LEAF2(mp)) { 5964 key->mv_size = mc->mc_db->md_pad; 5965 key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); 5966 return MDB_SUCCESS; 5967 } 5968 5969 mdb_cassert(mc, IS_LEAF(mp)); 5970 leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); 5971 5972 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { 5973 mdb_xcursor_init1(mc, leaf); 5974 } 5975 if (data) { 5976 if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) 5977 return rc; 5978 5979 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { 5980 rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); 5981 if (rc != MDB_SUCCESS) 5982 return rc; 5983 } 5984 } 5985 5986 MDB_GET_KEY(leaf, key); 5987 return MDB_SUCCESS; 5988 } 5989 5990 /** Set the cursor on a specific data item. 
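 * This backs the MDB_SET, MDB_SET_KEY, MDB_SET_RANGE, MDB_GET_BOTH and
 * MDB_GET_BOTH_RANGE operations of #mdb_cursor_get(); #mdb_get() is a
 * thin wrapper that calls it with MDB_SET.
 *
 * A keyed lookup through the public API looks roughly like this
 * (illustrative sketch only; assumes a valid txn and dbi, and the
 * key "foo" is just an example):
 *
 *	MDB_val key, data;
 *	key.mv_size = sizeof("foo") - 1;
 *	key.mv_data = (void *)"foo";
 *	int rc = mdb_get(txn, dbi, &key, &data);
 *	// on MDB_SUCCESS, data.mv_data points into the database map;
 *	// do not modify it, and do not use it after the txn ends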
*/ 5991 static int 5992 mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, 5993 MDB_cursor_op op, int *exactp) 5994 { 5995 int rc; 5996 MDB_page *mp; 5997 MDB_node *leaf = NULL; 5998 DKBUF; 5999 6000 if (key->mv_size == 0) 6001 return MDB_BAD_VALSIZE; 6002 6003 if (mc->mc_xcursor) 6004 mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); 6005 6006 /* See if we're already on the right page */ 6007 if (mc->mc_flags & C_INITIALIZED) { 6008 MDB_val nodekey; 6009 6010 mp = mc->mc_pg[mc->mc_top]; 6011 if (!NUMKEYS(mp)) { 6012 mc->mc_ki[mc->mc_top] = 0; 6013 return MDB_NOTFOUND; 6014 } 6015 if (mp->mp_flags & P_LEAF2) { 6016 nodekey.mv_size = mc->mc_db->md_pad; 6017 nodekey.mv_data = LEAF2KEY(mp, 0, nodekey.mv_size); 6018 } else { 6019 leaf = NODEPTR(mp, 0); 6020 MDB_GET_KEY2(leaf, nodekey); 6021 } 6022 rc = mc->mc_dbx->md_cmp(key, &nodekey); 6023 if (rc == 0) { 6024 /* Probably happens rarely, but first node on the page 6025 * was the one we wanted. 6026 */ 6027 mc->mc_ki[mc->mc_top] = 0; 6028 if (exactp) 6029 *exactp = 1; 6030 goto set1; 6031 } 6032 if (rc > 0) { 6033 unsigned int i; 6034 unsigned int nkeys = NUMKEYS(mp); 6035 if (nkeys > 1) { 6036 if (mp->mp_flags & P_LEAF2) { 6037 nodekey.mv_data = LEAF2KEY(mp, 6038 nkeys-1, nodekey.mv_size); 6039 } else { 6040 leaf = NODEPTR(mp, nkeys-1); 6041 MDB_GET_KEY2(leaf, nodekey); 6042 } 6043 rc = mc->mc_dbx->md_cmp(key, &nodekey); 6044 if (rc == 0) { 6045 /* last node was the one we wanted */ 6046 mc->mc_ki[mc->mc_top] = nkeys-1; 6047 if (exactp) 6048 *exactp = 1; 6049 goto set1; 6050 } 6051 if (rc < 0) { 6052 if (mc->mc_ki[mc->mc_top] < NUMKEYS(mp)) { 6053 /* This is definitely the right page, skip search_page */ 6054 if (mp->mp_flags & P_LEAF2) { 6055 nodekey.mv_data = LEAF2KEY(mp, 6056 mc->mc_ki[mc->mc_top], nodekey.mv_size); 6057 } else { 6058 leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); 6059 MDB_GET_KEY2(leaf, nodekey); 6060 } 6061 rc = mc->mc_dbx->md_cmp(key, &nodekey); 6062 if (rc == 0) { 6063 /* current node was the one we wanted */ 6064 if (exactp) 6065 *exactp = 1; 6066 goto set1; 6067 } 6068 } 6069 rc = 0; 6070 mc->mc_flags &= ~C_EOF; 6071 goto set2; 6072 } 6073 } 6074 /* If any parents have right-sibs, search. 6075 * Otherwise, there's nothing further. 6076 */ 6077 for (i=0; i<mc->mc_top; i++) 6078 if (mc->mc_ki[i] < 6079 NUMKEYS(mc->mc_pg[i])-1) 6080 break; 6081 if (i == mc->mc_top) { 6082 /* There are no other pages */ 6083 mc->mc_ki[mc->mc_top] = nkeys; 6084 return MDB_NOTFOUND; 6085 } 6086 } 6087 if (!mc->mc_top) { 6088 /* There are no other pages */ 6089 mc->mc_ki[mc->mc_top] = 0; 6090 if (op == MDB_SET_RANGE && !exactp) { 6091 rc = 0; 6092 goto set1; 6093 } else 6094 return MDB_NOTFOUND; 6095 } 6096 } else { 6097 mc->mc_pg[0] = 0; 6098 } 6099 6100 rc = mdb_page_search(mc, key, 0); 6101 if (rc != MDB_SUCCESS) 6102 return rc; 6103 6104 mp = mc->mc_pg[mc->mc_top]; 6105 mdb_cassert(mc, IS_LEAF(mp)); 6106 6107 set2: 6108 leaf = mdb_node_search(mc, key, exactp); 6109 if (exactp != NULL && !*exactp) { 6110 /* MDB_SET specified and not an exact match. 
*/ 6111 return MDB_NOTFOUND; 6112 } 6113 6114 if (leaf == NULL) { 6115 DPUTS("===> inexact leaf not found, goto sibling"); 6116 if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS) { 6117 mc->mc_flags |= C_EOF; 6118 return rc; /* no entries matched */ 6119 } 6120 mp = mc->mc_pg[mc->mc_top]; 6121 mdb_cassert(mc, IS_LEAF(mp)); 6122 leaf = NODEPTR(mp, 0); 6123 } 6124 6125 set1: 6126 mc->mc_flags |= C_INITIALIZED; 6127 mc->mc_flags &= ~C_EOF; 6128 6129 if (IS_LEAF2(mp)) { 6130 if (op == MDB_SET_RANGE || op == MDB_SET_KEY) { 6131 key->mv_size = mc->mc_db->md_pad; 6132 key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); 6133 } 6134 return MDB_SUCCESS; 6135 } 6136 6137 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { 6138 mdb_xcursor_init1(mc, leaf); 6139 } 6140 if (data) { 6141 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { 6142 if (op == MDB_SET || op == MDB_SET_KEY || op == MDB_SET_RANGE) { 6143 rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); 6144 } else { 6145 int ex2, *ex2p; 6146 if (op == MDB_GET_BOTH) { 6147 ex2p = &ex2; 6148 ex2 = 0; 6149 } else { 6150 ex2p = NULL; 6151 } 6152 rc = mdb_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_SET_RANGE, ex2p); 6153 if (rc != MDB_SUCCESS) 6154 return rc; 6155 } 6156 } else if (op == MDB_GET_BOTH || op == MDB_GET_BOTH_RANGE) { 6157 MDB_val olddata; 6158 MDB_cmp_func *dcmp; 6159 if ((rc = mdb_node_read(mc, leaf, &olddata)) != MDB_SUCCESS) 6160 return rc; 6161 dcmp = mc->mc_dbx->md_dcmp; 6162 #if UINT_MAX < SIZE_MAX 6163 if (dcmp == mdb_cmp_int && olddata.mv_size == sizeof(size_t)) 6164 dcmp = mdb_cmp_clong; 6165 #endif 6166 rc = dcmp(data, &olddata); 6167 if (rc) { 6168 if (op == MDB_GET_BOTH || rc > 0) 6169 return MDB_NOTFOUND; 6170 rc = 0; 6171 } 6172 *data = olddata; 6173 6174 } else { 6175 if (mc->mc_xcursor) 6176 mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); 6177 if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) 6178 return rc; 6179 } 6180 } 6181 6182 /* The key already matches in all other cases */ 6183 if (op == MDB_SET_RANGE || op == MDB_SET_KEY) 6184 MDB_GET_KEY(leaf, key); 6185 DPRINTF(("==> cursor placed on key [%s]", DKEY(key))); 6186 6187 return rc; 6188 } 6189 6190 /** Move the cursor to the first item in the database. */ 6191 static int 6192 mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data) 6193 { 6194 int rc; 6195 MDB_node *leaf; 6196 6197 if (mc->mc_xcursor) 6198 mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); 6199 6200 if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { 6201 rc = mdb_page_search(mc, NULL, MDB_PS_FIRST); 6202 if (rc != MDB_SUCCESS) 6203 return rc; 6204 } 6205 mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); 6206 6207 leaf = NODEPTR(mc->mc_pg[mc->mc_top], 0); 6208 mc->mc_flags |= C_INITIALIZED; 6209 mc->mc_flags &= ~C_EOF; 6210 6211 mc->mc_ki[mc->mc_top] = 0; 6212 6213 if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { 6214 key->mv_size = mc->mc_db->md_pad; 6215 key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], 0, key->mv_size); 6216 return MDB_SUCCESS; 6217 } 6218 6219 if (data) { 6220 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { 6221 mdb_xcursor_init1(mc, leaf); 6222 rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); 6223 if (rc) 6224 return rc; 6225 } else { 6226 if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) 6227 return rc; 6228 } 6229 } 6230 MDB_GET_KEY(leaf, key); 6231 return MDB_SUCCESS; 6232 } 6233 6234 /** Move the cursor to the last item in the database. 
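 * Like #mdb_cursor_first(), this is normally reached through
 * #mdb_cursor_get() (op MDB_LAST or MDB_FIRST).  A full forward scan
 * with the public API looks roughly like the sketch below
 * (illustrative only; mdb_cursor_open()/mdb_cursor_close() are
 * declared in lmdb.h, not in this section, and error checks are
 * omitted; txn and dbi are assumed valid):
 *
 *	MDB_cursor *cur;
 *	MDB_val key, data;
 *	int rc = mdb_cursor_open(txn, dbi, &cur);
 *	for (rc = mdb_cursor_get(cur, &key, &data, MDB_FIRST);
 *	     rc == MDB_SUCCESS;
 *	     rc = mdb_cursor_get(cur, &key, &data, MDB_NEXT)) {
 *		// key and data describe one item here
 *	}
 *	mdb_cursor_close(cur);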
*/ 6235 static int 6236 mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data) 6237 { 6238 int rc; 6239 MDB_node *leaf; 6240 6241 if (mc->mc_xcursor) 6242 mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); 6243 6244 if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { 6245 rc = mdb_page_search(mc, NULL, MDB_PS_LAST); 6246 if (rc != MDB_SUCCESS) 6247 return rc; 6248 } 6249 mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); 6250 6251 mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]) - 1; 6252 mc->mc_flags |= C_INITIALIZED|C_EOF; 6253 leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); 6254 6255 if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { 6256 key->mv_size = mc->mc_db->md_pad; 6257 key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], key->mv_size); 6258 return MDB_SUCCESS; 6259 } 6260 6261 if (data) { 6262 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { 6263 mdb_xcursor_init1(mc, leaf); 6264 rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); 6265 if (rc) 6266 return rc; 6267 } else { 6268 if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) 6269 return rc; 6270 } 6271 } 6272 6273 MDB_GET_KEY(leaf, key); 6274 return MDB_SUCCESS; 6275 } 6276 6277 int 6278 mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, 6279 MDB_cursor_op op) 6280 { 6281 int rc; 6282 int exact = 0; 6283 int (*mfunc)(MDB_cursor *mc, MDB_val *key, MDB_val *data); 6284 6285 if (mc == NULL) 6286 return EINVAL; 6287 6288 if (mc->mc_txn->mt_flags & MDB_TXN_BLOCKED) 6289 return MDB_BAD_TXN; 6290 6291 switch (op) { 6292 case MDB_GET_CURRENT: 6293 if (!(mc->mc_flags & C_INITIALIZED)) { 6294 rc = EINVAL; 6295 } else { 6296 MDB_page *mp = mc->mc_pg[mc->mc_top]; 6297 int nkeys = NUMKEYS(mp); 6298 if (!nkeys || mc->mc_ki[mc->mc_top] >= nkeys) { 6299 mc->mc_ki[mc->mc_top] = nkeys; 6300 rc = MDB_NOTFOUND; 6301 break; 6302 } 6303 rc = MDB_SUCCESS; 6304 if (IS_LEAF2(mp)) { 6305 key->mv_size = mc->mc_db->md_pad; 6306 key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); 6307 } else { 6308 MDB_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); 6309 MDB_GET_KEY(leaf, key); 6310 if (data) { 6311 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { 6312 rc = mdb_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_GET_CURRENT); 6313 } else { 6314 rc = mdb_node_read(mc, leaf, data); 6315 } 6316 } 6317 } 6318 } 6319 break; 6320 case MDB_GET_BOTH: 6321 case MDB_GET_BOTH_RANGE: 6322 if (data == NULL) { 6323 rc = EINVAL; 6324 break; 6325 } 6326 if (mc->mc_xcursor == NULL) { 6327 rc = MDB_INCOMPATIBLE; 6328 break; 6329 } 6330 /* FALLTHRU */ 6331 case MDB_SET: 6332 case MDB_SET_KEY: 6333 case MDB_SET_RANGE: 6334 if (key == NULL) { 6335 rc = EINVAL; 6336 } else { 6337 rc = mdb_cursor_set(mc, key, data, op, 6338 op == MDB_SET_RANGE ? 
NULL : &exact); 6339 } 6340 break; 6341 case MDB_GET_MULTIPLE: 6342 if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) { 6343 rc = EINVAL; 6344 break; 6345 } 6346 if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { 6347 rc = MDB_INCOMPATIBLE; 6348 break; 6349 } 6350 rc = MDB_SUCCESS; 6351 if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) || 6352 (mc->mc_xcursor->mx_cursor.mc_flags & C_EOF)) 6353 break; 6354 goto fetchm; 6355 case MDB_NEXT_MULTIPLE: 6356 if (data == NULL) { 6357 rc = EINVAL; 6358 break; 6359 } 6360 if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { 6361 rc = MDB_INCOMPATIBLE; 6362 break; 6363 } 6364 rc = mdb_cursor_next(mc, key, data, MDB_NEXT_DUP); 6365 if (rc == MDB_SUCCESS) { 6366 if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { 6367 MDB_cursor *mx; 6368 fetchm: 6369 mx = &mc->mc_xcursor->mx_cursor; 6370 data->mv_size = NUMKEYS(mx->mc_pg[mx->mc_top]) * 6371 mx->mc_db->md_pad; 6372 data->mv_data = METADATA(mx->mc_pg[mx->mc_top]); 6373 mx->mc_ki[mx->mc_top] = NUMKEYS(mx->mc_pg[mx->mc_top])-1; 6374 } else { 6375 rc = MDB_NOTFOUND; 6376 } 6377 } 6378 break; 6379 case MDB_PREV_MULTIPLE: 6380 if (data == NULL) { 6381 rc = EINVAL; 6382 break; 6383 } 6384 if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { 6385 rc = MDB_INCOMPATIBLE; 6386 break; 6387 } 6388 if (!(mc->mc_flags & C_INITIALIZED)) 6389 rc = mdb_cursor_last(mc, key, data); 6390 else 6391 rc = MDB_SUCCESS; 6392 if (rc == MDB_SUCCESS) { 6393 MDB_cursor *mx = &mc->mc_xcursor->mx_cursor; 6394 if (mx->mc_flags & C_INITIALIZED) { 6395 rc = mdb_cursor_sibling(mx, 0); 6396 if (rc == MDB_SUCCESS) 6397 goto fetchm; 6398 } else { 6399 rc = MDB_NOTFOUND; 6400 } 6401 } 6402 break; 6403 case MDB_NEXT: 6404 case MDB_NEXT_DUP: 6405 case MDB_NEXT_NODUP: 6406 rc = mdb_cursor_next(mc, key, data, op); 6407 break; 6408 case MDB_PREV: 6409 case MDB_PREV_DUP: 6410 case MDB_PREV_NODUP: 6411 rc = mdb_cursor_prev(mc, key, data, op); 6412 break; 6413 case MDB_FIRST: 6414 rc = mdb_cursor_first(mc, key, data); 6415 break; 6416 case MDB_FIRST_DUP: 6417 mfunc = mdb_cursor_first; 6418 mmove: 6419 if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) { 6420 rc = EINVAL; 6421 break; 6422 } 6423 if (mc->mc_xcursor == NULL) { 6424 rc = MDB_INCOMPATIBLE; 6425 break; 6426 } 6427 { 6428 MDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); 6429 if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { 6430 MDB_GET_KEY(leaf, key); 6431 rc = mdb_node_read(mc, leaf, data); 6432 break; 6433 } 6434 } 6435 if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) { 6436 rc = EINVAL; 6437 break; 6438 } 6439 rc = mfunc(&mc->mc_xcursor->mx_cursor, data, NULL); 6440 break; 6441 case MDB_LAST: 6442 rc = mdb_cursor_last(mc, key, data); 6443 break; 6444 case MDB_LAST_DUP: 6445 mfunc = mdb_cursor_last; 6446 goto mmove; 6447 default: 6448 DPRINTF(("unhandled/unimplemented cursor operation %u", op)); 6449 rc = EINVAL; 6450 break; 6451 } 6452 6453 if (mc->mc_flags & C_DEL) 6454 mc->mc_flags ^= C_DEL; 6455 6456 return rc; 6457 } 6458 6459 /** Touch all the pages in the cursor stack. Set mc_top. 6460 * Makes sure all the pages are writable, before attempting a write operation. 6461 * @param[in] mc The cursor to operate on. 
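 *
 * "Touching" here means copy-on-write: for a named DB the MAIN_DBI
 * record is first located with MDB_PS_MODIFY and flagged DB_DIRTY,
 * then #mdb_page_touch() is applied to every level of the cursor
 * stack from the root down, so the following update never writes to
 * a page that an older snapshot may still be reading.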
6462 */ 6463 static int 6464 mdb_cursor_touch(MDB_cursor *mc) 6465 { 6466 int rc = MDB_SUCCESS; 6467 6468 if (mc->mc_dbi >= CORE_DBS && !(*mc->mc_dbflag & (DB_DIRTY|DB_DUPDATA))) { 6469 /* Touch DB record of named DB */ 6470 MDB_cursor mc2; 6471 MDB_xcursor mcx; 6472 if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi)) 6473 return MDB_BAD_DBI; 6474 mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, &mcx); 6475 rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, MDB_PS_MODIFY); 6476 if (rc) 6477 return rc; 6478 *mc->mc_dbflag |= DB_DIRTY; 6479 } 6480 mc->mc_top = 0; 6481 if (mc->mc_snum) { 6482 do { 6483 rc = mdb_page_touch(mc); 6484 } while (!rc && ++(mc->mc_top) < mc->mc_snum); 6485 mc->mc_top = mc->mc_snum-1; 6486 } 6487 return rc; 6488 } 6489 6490 /** Do not spill pages to disk if txn is getting full, may fail instead */ 6491 #define MDB_NOSPILL 0x8000 6492 6493 int 6494 mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, 6495 unsigned int flags) 6496 { 6497 MDB_env *env; 6498 MDB_node *leaf = NULL; 6499 MDB_page *fp, *mp, *sub_root = NULL; 6500 uint16_t fp_flags; 6501 MDB_val xdata, *rdata, dkey, olddata; 6502 MDB_db dummy; 6503 int do_sub = 0, insert_key, insert_data; 6504 unsigned int mcount = 0, dcount = 0, nospill; 6505 size_t nsize; 6506 int rc, rc2; 6507 unsigned int nflags; 6508 DKBUF; 6509 6510 if (mc == NULL || key == NULL) 6511 return EINVAL; 6512 6513 env = mc->mc_txn->mt_env; 6514 6515 /* Check this first so counter will always be zero on any 6516 * early failures. 6517 */ 6518 if (flags & MDB_MULTIPLE) { 6519 dcount = data[1].mv_size; 6520 data[1].mv_size = 0; 6521 if (!F_ISSET(mc->mc_db->md_flags, MDB_DUPFIXED)) 6522 return MDB_INCOMPATIBLE; 6523 } 6524 6525 nospill = flags & MDB_NOSPILL; 6526 flags &= ~MDB_NOSPILL; 6527 6528 if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED)) 6529 return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; 6530 6531 if (key->mv_size-1 >= ENV_MAXKEY(env)) 6532 return MDB_BAD_VALSIZE; 6533 6534 #if SIZE_MAX > MAXDATASIZE 6535 if (data->mv_size > ((mc->mc_db->md_flags & MDB_DUPSORT) ? ENV_MAXKEY(env) : MAXDATASIZE)) 6536 return MDB_BAD_VALSIZE; 6537 #else 6538 if ((mc->mc_db->md_flags & MDB_DUPSORT) && data->mv_size > ENV_MAXKEY(env)) 6539 return MDB_BAD_VALSIZE; 6540 #endif 6541 6542 DPRINTF(("==> put db %d key [%s], size %"Z"u, data size %"Z"u", 6543 DDBI(mc), DKEY(key), key ? 
key->mv_size : 0, data->mv_size)); 6544 6545 dkey.mv_size = 0; 6546 6547 if (flags == MDB_CURRENT) { 6548 if (!(mc->mc_flags & C_INITIALIZED)) 6549 return EINVAL; 6550 rc = MDB_SUCCESS; 6551 } else if (mc->mc_db->md_root == P_INVALID) { 6552 /* new database, cursor has nothing to point to */ 6553 mc->mc_snum = 0; 6554 mc->mc_top = 0; 6555 mc->mc_flags &= ~C_INITIALIZED; 6556 rc = MDB_NO_ROOT; 6557 } else { 6558 int exact = 0; 6559 MDB_val d2; 6560 if (flags & MDB_APPEND) { 6561 MDB_val k2; 6562 rc = mdb_cursor_last(mc, &k2, &d2); 6563 if (rc == 0) { 6564 rc = mc->mc_dbx->md_cmp(key, &k2); 6565 if (rc > 0) { 6566 rc = MDB_NOTFOUND; 6567 mc->mc_ki[mc->mc_top]++; 6568 } else { 6569 /* new key is <= last key */ 6570 rc = MDB_KEYEXIST; 6571 } 6572 } 6573 } else { 6574 rc = mdb_cursor_set(mc, key, &d2, MDB_SET, &exact); 6575 } 6576 if ((flags & MDB_NOOVERWRITE) && rc == 0) { 6577 DPRINTF(("duplicate key [%s]", DKEY(key))); 6578 *data = d2; 6579 return MDB_KEYEXIST; 6580 } 6581 if (rc && rc != MDB_NOTFOUND) 6582 return rc; 6583 } 6584 6585 if (mc->mc_flags & C_DEL) 6586 mc->mc_flags ^= C_DEL; 6587 6588 /* Cursor is positioned, check for room in the dirty list */ 6589 if (!nospill) { 6590 if (flags & MDB_MULTIPLE) { 6591 rdata = &xdata; 6592 xdata.mv_size = data->mv_size * dcount; 6593 } else { 6594 rdata = data; 6595 } 6596 if ((rc2 = mdb_page_spill(mc, key, rdata))) 6597 return rc2; 6598 } 6599 6600 if (rc == MDB_NO_ROOT) { 6601 MDB_page *np; 6602 /* new database, write a root leaf page */ 6603 DPUTS("allocating new root leaf page"); 6604 if ((rc2 = mdb_page_new(mc, P_LEAF, 1, &np))) { 6605 return rc2; 6606 } 6607 mdb_cursor_push(mc, np); 6608 mc->mc_db->md_root = np->mp_pgno; 6609 mc->mc_db->md_depth++; 6610 *mc->mc_dbflag |= DB_DIRTY; 6611 if ((mc->mc_db->md_flags & (MDB_DUPSORT|MDB_DUPFIXED)) 6612 == MDB_DUPFIXED) 6613 np->mp_flags |= P_LEAF2; 6614 mc->mc_flags |= C_INITIALIZED; 6615 } else { 6616 /* make sure all cursor pages are writable */ 6617 rc2 = mdb_cursor_touch(mc); 6618 if (rc2) 6619 return rc2; 6620 } 6621 6622 insert_key = insert_data = rc; 6623 if (insert_key) { 6624 /* The key does not exist */ 6625 DPRINTF(("inserting key at index %i", mc->mc_ki[mc->mc_top])); 6626 if ((mc->mc_db->md_flags & MDB_DUPSORT) && 6627 LEAFSIZE(key, data) > env->me_nodemax) 6628 { 6629 /* Too big for a node, insert in sub-DB. Set up an empty 6630 * "old sub-page" for prep_subDB to expand to a full page. 
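			 * (This branch is taken when LEAFSIZE(key, data) exceeds
			 * me_nodemax for a DUPSORT DB, so the duplicate data will live
			 * in its own sub-database instead of inline in the leaf node.)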
6631 */ 6632 fp_flags = P_LEAF|P_DIRTY; 6633 fp = env->me_pbuf; 6634 fp->mp_pad = data->mv_size; /* used if MDB_DUPFIXED */ 6635 fp->mp_lower = fp->mp_upper = (PAGEHDRSZ-PAGEBASE); 6636 olddata.mv_size = PAGEHDRSZ; 6637 goto prep_subDB; 6638 } 6639 } else { 6640 /* there's only a key anyway, so this is a no-op */ 6641 if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { 6642 char *ptr; 6643 unsigned int ksize = mc->mc_db->md_pad; 6644 if (key->mv_size != ksize) 6645 return MDB_BAD_VALSIZE; 6646 ptr = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize); 6647 memcpy(ptr, key->mv_data, ksize); 6648 fix_parent: 6649 /* if overwriting slot 0 of leaf, need to 6650 * update branch key if there is a parent page 6651 */ 6652 if (mc->mc_top && !mc->mc_ki[mc->mc_top]) { 6653 unsigned short dtop = 1; 6654 mc->mc_top--; 6655 /* slot 0 is always an empty key, find real slot */ 6656 while (mc->mc_top && !mc->mc_ki[mc->mc_top]) { 6657 mc->mc_top--; 6658 dtop++; 6659 } 6660 if (mc->mc_ki[mc->mc_top]) 6661 rc2 = mdb_update_key(mc, key); 6662 else 6663 rc2 = MDB_SUCCESS; 6664 mc->mc_top += dtop; 6665 if (rc2) 6666 return rc2; 6667 } 6668 return MDB_SUCCESS; 6669 } 6670 6671 more: 6672 leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); 6673 olddata.mv_size = NODEDSZ(leaf); 6674 olddata.mv_data = NODEDATA(leaf); 6675 6676 /* DB has dups? */ 6677 if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) { 6678 /* Prepare (sub-)page/sub-DB to accept the new item, 6679 * if needed. fp: old sub-page or a header faking 6680 * it. mp: new (sub-)page. offset: growth in page 6681 * size. xdata: node data with new page or DB. 6682 */ 6683 unsigned i, offset = 0; 6684 mp = fp = xdata.mv_data = env->me_pbuf; 6685 mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno; 6686 6687 /* Was a single item before, must convert now */ 6688 if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { 6689 MDB_cmp_func *dcmp; 6690 /* Just overwrite the current item */ 6691 if (flags == MDB_CURRENT) 6692 goto current; 6693 dcmp = mc->mc_dbx->md_dcmp; 6694 #if UINT_MAX < SIZE_MAX 6695 if (dcmp == mdb_cmp_int && olddata.mv_size == sizeof(size_t)) 6696 dcmp = mdb_cmp_clong; 6697 #endif 6698 /* does data match? 
*/ 6699 if (!dcmp(data, &olddata)) { 6700 if (flags & (MDB_NODUPDATA|MDB_APPENDDUP)) 6701 return MDB_KEYEXIST; 6702 /* overwrite it */ 6703 goto current; 6704 } 6705 6706 /* Back up original data item */ 6707 dkey.mv_size = olddata.mv_size; 6708 dkey.mv_data = memcpy(fp+1, olddata.mv_data, olddata.mv_size); 6709 6710 /* Make sub-page header for the dup items, with dummy body */ 6711 fp->mp_flags = P_LEAF|P_DIRTY|P_SUBP; 6712 fp->mp_lower = (PAGEHDRSZ-PAGEBASE); 6713 xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size; 6714 if (mc->mc_db->md_flags & MDB_DUPFIXED) { 6715 fp->mp_flags |= P_LEAF2; 6716 fp->mp_pad = data->mv_size; 6717 xdata.mv_size += 2 * data->mv_size; /* leave space for 2 more */ 6718 } else { 6719 xdata.mv_size += 2 * (sizeof(indx_t) + NODESIZE) + 6720 (dkey.mv_size & 1) + (data->mv_size & 1); 6721 } 6722 fp->mp_upper = xdata.mv_size - PAGEBASE; 6723 olddata.mv_size = xdata.mv_size; /* pretend olddata is fp */ 6724 } else if (leaf->mn_flags & F_SUBDATA) { 6725 /* Data is on sub-DB, just store it */ 6726 flags |= F_DUPDATA|F_SUBDATA; 6727 goto put_sub; 6728 } else { 6729 /* Data is on sub-page */ 6730 fp = olddata.mv_data; 6731 switch (flags) { 6732 default: 6733 if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { 6734 offset = EVEN(NODESIZE + sizeof(indx_t) + 6735 data->mv_size); 6736 break; 6737 } 6738 offset = fp->mp_pad; 6739 if (SIZELEFT(fp) < offset) { 6740 offset *= 4; /* space for 4 more */ 6741 break; 6742 } 6743 /* FALLTHRU: Big enough MDB_DUPFIXED sub-page */ 6744 case MDB_CURRENT: 6745 fp->mp_flags |= P_DIRTY; 6746 COPY_PGNO(fp->mp_pgno, mp->mp_pgno); 6747 mc->mc_xcursor->mx_cursor.mc_pg[0] = fp; 6748 flags |= F_DUPDATA; 6749 goto put_sub; 6750 } 6751 xdata.mv_size = olddata.mv_size + offset; 6752 } 6753 6754 fp_flags = fp->mp_flags; 6755 if (NODESIZE + NODEKSZ(leaf) + xdata.mv_size > env->me_nodemax) { 6756 /* Too big for a sub-page, convert to sub-DB */ 6757 fp_flags &= ~P_SUBP; 6758 prep_subDB: 6759 if (mc->mc_db->md_flags & MDB_DUPFIXED) { 6760 fp_flags |= P_LEAF2; 6761 dummy.md_pad = fp->mp_pad; 6762 dummy.md_flags = MDB_DUPFIXED; 6763 if (mc->mc_db->md_flags & MDB_INTEGERDUP) 6764 dummy.md_flags |= MDB_INTEGERKEY; 6765 } else { 6766 dummy.md_pad = 0; 6767 dummy.md_flags = 0; 6768 } 6769 dummy.md_depth = 1; 6770 dummy.md_branch_pages = 0; 6771 dummy.md_leaf_pages = 1; 6772 dummy.md_overflow_pages = 0; 6773 dummy.md_entries = NUMKEYS(fp); 6774 xdata.mv_size = sizeof(MDB_db); 6775 xdata.mv_data = &dummy; 6776 if ((rc = mdb_page_alloc(mc, 1, &mp))) 6777 return rc; 6778 offset = env->me_psize - olddata.mv_size; 6779 flags |= F_DUPDATA|F_SUBDATA; 6780 dummy.md_root = mp->mp_pgno; 6781 sub_root = mp; 6782 } 6783 if (mp != fp) { 6784 mp->mp_flags = fp_flags | P_DIRTY; 6785 mp->mp_pad = fp->mp_pad; 6786 mp->mp_lower = fp->mp_lower; 6787 mp->mp_upper = fp->mp_upper + offset; 6788 if (fp_flags & P_LEAF2) { 6789 memcpy(METADATA(mp), METADATA(fp), NUMKEYS(fp) * fp->mp_pad); 6790 } else { 6791 memcpy((char *)mp + mp->mp_upper + PAGEBASE, (char *)fp + fp->mp_upper + PAGEBASE, 6792 olddata.mv_size - fp->mp_upper - PAGEBASE); 6793 for (i=0; i<NUMKEYS(fp); i++) 6794 mp->mp_ptrs[i] = fp->mp_ptrs[i] + offset; 6795 } 6796 } 6797 6798 rdata = &xdata; 6799 flags |= F_DUPDATA; 6800 do_sub = 1; 6801 if (!insert_key) 6802 mdb_node_del(mc, 0); 6803 goto new_sub; 6804 } 6805 current: 6806 /* LMDB passes F_SUBDATA in 'flags' to write a DB record */ 6807 if ((leaf->mn_flags ^ flags) & F_SUBDATA) 6808 return MDB_INCOMPATIBLE; 6809 /* overflow page overwrites need special handling */ 6810 
if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { 6811 MDB_page *omp; 6812 pgno_t pg; 6813 int level, ovpages, dpages = OVPAGES(data->mv_size, env->me_psize); 6814 6815 memcpy(&pg, olddata.mv_data, sizeof(pg)); 6816 if ((rc2 = mdb_page_get(mc, pg, &omp, &level)) != 0) 6817 return rc2; 6818 ovpages = omp->mp_pages; 6819 6820 /* Is the ov page large enough? */ 6821 if (ovpages >= dpages) { 6822 if (!(omp->mp_flags & P_DIRTY) && 6823 (level || (env->me_flags & MDB_WRITEMAP))) 6824 { 6825 rc = mdb_page_unspill(mc->mc_txn, omp, &omp); 6826 if (rc) 6827 return rc; 6828 level = 0; /* dirty in this txn or clean */ 6829 } 6830 /* Is it dirty? */ 6831 if (omp->mp_flags & P_DIRTY) { 6832 /* yes, overwrite it. Note in this case we don't 6833 * bother to try shrinking the page if the new data 6834 * is smaller than the overflow threshold. 6835 */ 6836 if (level > 1) { 6837 /* It is writable only in a parent txn */ 6838 size_t sz = (size_t) env->me_psize * ovpages, off; 6839 MDB_page *np = mdb_page_malloc(mc->mc_txn, ovpages); 6840 MDB_ID2 id2; 6841 if (!np) 6842 return ENOMEM; 6843 id2.mid = pg; 6844 id2.mptr = np; 6845 /* Note - this page is already counted in parent's dirty_room */ 6846 rc2 = mdb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2); 6847 mdb_cassert(mc, rc2 == 0); 6848 /* Currently we make the page look as with put() in the 6849 * parent txn, in case the user peeks at MDB_RESERVEd 6850 * or unused parts. Some users treat ovpages specially. 6851 */ 6852 if (!(flags & MDB_RESERVE)) { 6853 /* Skip the part where LMDB will put *data. 6854 * Copy end of page, adjusting alignment so 6855 * compiler may copy words instead of bytes. 6856 */ 6857 off = (PAGEHDRSZ + data->mv_size) & -sizeof(size_t); 6858 memcpy((size_t *)((char *)np + off), 6859 (size_t *)((char *)omp + off), sz - off); 6860 sz = PAGEHDRSZ; 6861 } 6862 memcpy(np, omp, sz); /* Copy beginning of page */ 6863 omp = np; 6864 } 6865 SETDSZ(leaf, data->mv_size); 6866 if (F_ISSET(flags, MDB_RESERVE)) 6867 data->mv_data = METADATA(omp); 6868 else 6869 memcpy(METADATA(omp), data->mv_data, data->mv_size); 6870 return MDB_SUCCESS; 6871 } 6872 } 6873 if ((rc2 = mdb_ovpage_free(mc, omp)) != MDB_SUCCESS) 6874 return rc2; 6875 } else if (data->mv_size == olddata.mv_size) { 6876 /* same size, just replace it. Note that we could 6877 * also reuse this node if the new data is smaller, 6878 * but instead we opt to shrink the node in that case. 6879 */ 6880 if (F_ISSET(flags, MDB_RESERVE)) 6881 data->mv_data = olddata.mv_data; 6882 else if (!(mc->mc_flags & C_SUB)) 6883 memcpy(olddata.mv_data, data->mv_data, data->mv_size); 6884 else { 6885 memcpy(NODEKEY(leaf), key->mv_data, key->mv_size); 6886 goto fix_parent; 6887 } 6888 return MDB_SUCCESS; 6889 } 6890 mdb_node_del(mc, 0); 6891 } 6892 6893 rdata = data; 6894 6895 new_sub: 6896 nflags = flags & NODE_ADD_FLAGS; 6897 nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(env, key, rdata); 6898 if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) { 6899 if (( flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA ) 6900 nflags &= ~MDB_APPEND; /* sub-page may need room to grow */ 6901 if (!insert_key) 6902 nflags |= MDB_SPLIT_REPLACE; 6903 rc = mdb_page_split(mc, key, rdata, P_INVALID, nflags); 6904 } else { 6905 /* There is room already in this leaf page. 
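		 * (The node size computed above fits within SIZELEFT() of the
		 * current leaf, so no page split is needed and mdb_node_add()
		 * inserts the new node directly at the cursor's current index.)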
*/ 6906 rc = mdb_node_add(mc, mc->mc_ki[mc->mc_top], key, rdata, 0, nflags); 6907 if (rc == 0) { 6908 /* Adjust other cursors pointing to mp */ 6909 MDB_cursor *m2, *m3; 6910 MDB_dbi dbi = mc->mc_dbi; 6911 unsigned i = mc->mc_top; 6912 MDB_page *mp = mc->mc_pg[i]; 6913 6914 for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { 6915 if (mc->mc_flags & C_SUB) 6916 m3 = &m2->mc_xcursor->mx_cursor; 6917 else 6918 m3 = m2; 6919 if (m3 == mc || m3->mc_snum < mc->mc_snum || m3->mc_pg[i] != mp) continue; 6920 if (m3->mc_ki[i] >= mc->mc_ki[i] && insert_key) { 6921 m3->mc_ki[i]++; 6922 } 6923 if (XCURSOR_INITED(m3)) 6924 XCURSOR_REFRESH(m3, mp, m3->mc_ki[i]); 6925 } 6926 } 6927 } 6928 6929 if (rc == MDB_SUCCESS) { 6930 /* Now store the actual data in the child DB. Note that we're 6931 * storing the user data in the keys field, so there are strict 6932 * size limits on dupdata. The actual data fields of the child 6933 * DB are all zero size. 6934 */ 6935 if (do_sub) { 6936 int xflags, new_dupdata; 6937 size_t ecount; 6938 put_sub: 6939 xdata.mv_size = 0; 6940 xdata.mv_data = ""; 6941 leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); 6942 if (flags & MDB_CURRENT) { 6943 xflags = MDB_CURRENT|MDB_NOSPILL; 6944 } else { 6945 mdb_xcursor_init1(mc, leaf); 6946 xflags = (flags & MDB_NODUPDATA) ? 6947 MDB_NOOVERWRITE|MDB_NOSPILL : MDB_NOSPILL; 6948 } 6949 if (sub_root) 6950 mc->mc_xcursor->mx_cursor.mc_pg[0] = sub_root; 6951 new_dupdata = (int)dkey.mv_size; 6952 /* converted, write the original data first */ 6953 if (dkey.mv_size) { 6954 rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, xflags); 6955 if (rc) 6956 goto bad_sub; 6957 /* we've done our job */ 6958 dkey.mv_size = 0; 6959 } 6960 if (!(leaf->mn_flags & F_SUBDATA) || sub_root) { 6961 /* Adjust other cursors pointing to mp */ 6962 MDB_cursor *m2; 6963 MDB_xcursor *mx = mc->mc_xcursor; 6964 unsigned i = mc->mc_top; 6965 MDB_page *mp = mc->mc_pg[i]; 6966 int nkeys = NUMKEYS(mp); 6967 6968 for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { 6969 if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; 6970 if (!(m2->mc_flags & C_INITIALIZED)) continue; 6971 if (m2->mc_pg[i] == mp) { 6972 if (m2->mc_ki[i] == mc->mc_ki[i]) { 6973 mdb_xcursor_init2(m2, mx, new_dupdata); 6974 } else if (!insert_key && m2->mc_ki[i] < nkeys) { 6975 XCURSOR_REFRESH(m2, mp, m2->mc_ki[i]); 6976 } 6977 } 6978 } 6979 } 6980 ecount = mc->mc_xcursor->mx_db.md_entries; 6981 if (flags & MDB_APPENDDUP) 6982 xflags |= MDB_APPEND; 6983 rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags); 6984 if (flags & F_SUBDATA) { 6985 void *db = NODEDATA(leaf); 6986 memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db)); 6987 } 6988 insert_data = mc->mc_xcursor->mx_db.md_entries - ecount; 6989 } 6990 /* Increment count unless we just replaced an existing item. */ 6991 if (insert_data) 6992 mc->mc_db->md_entries++; 6993 if (insert_key) { 6994 /* Invalidate txn if we created an empty sub-DB */ 6995 if (rc) 6996 goto bad_sub; 6997 /* If we succeeded and the key didn't exist before, 6998 * make sure the cursor is marked valid. 
6999 */ 7000 mc->mc_flags |= C_INITIALIZED; 7001 } 7002 if (flags & MDB_MULTIPLE) { 7003 if (!rc) { 7004 mcount++; 7005 /* let caller know how many succeeded, if any */ 7006 data[1].mv_size = mcount; 7007 if (mcount < dcount) { 7008 data[0].mv_data = (char *)data[0].mv_data + data[0].mv_size; 7009 insert_key = insert_data = 0; 7010 goto more; 7011 } 7012 } 7013 } 7014 return rc; 7015 bad_sub: 7016 if (rc == MDB_KEYEXIST) /* should not happen, we deleted that item */ 7017 rc = MDB_CORRUPTED; 7018 } 7019 mc->mc_txn->mt_flags |= MDB_TXN_ERROR; 7020 return rc; 7021 } 7022 7023 int 7024 mdb_cursor_del(MDB_cursor *mc, unsigned int flags) 7025 { 7026 MDB_node *leaf; 7027 MDB_page *mp; 7028 int rc; 7029 7030 if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED)) 7031 return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; 7032 7033 if (!(mc->mc_flags & C_INITIALIZED)) 7034 return EINVAL; 7035 7036 if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) 7037 return MDB_NOTFOUND; 7038 7039 if (!(flags & MDB_NOSPILL) && (rc = mdb_page_spill(mc, NULL, NULL))) 7040 return rc; 7041 7042 rc = mdb_cursor_touch(mc); 7043 if (rc) 7044 return rc; 7045 7046 mp = mc->mc_pg[mc->mc_top]; 7047 if (IS_LEAF2(mp)) 7048 goto del_key; 7049 leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); 7050 7051 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { 7052 if (flags & MDB_NODUPDATA) { 7053 /* mdb_cursor_del0() will subtract the final entry */ 7054 mc->mc_db->md_entries -= mc->mc_xcursor->mx_db.md_entries - 1; 7055 mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; 7056 } else { 7057 if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) { 7058 mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); 7059 } 7060 rc = mdb_cursor_del(&mc->mc_xcursor->mx_cursor, MDB_NOSPILL); 7061 if (rc) 7062 return rc; 7063 /* If sub-DB still has entries, we're done */ 7064 if (mc->mc_xcursor->mx_db.md_entries) { 7065 if (leaf->mn_flags & F_SUBDATA) { 7066 /* update subDB info */ 7067 void *db = NODEDATA(leaf); 7068 memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db)); 7069 } else { 7070 MDB_cursor *m2; 7071 /* shrink fake page */ 7072 mdb_node_shrink(mp, mc->mc_ki[mc->mc_top]); 7073 leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); 7074 mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); 7075 /* fix other sub-DB cursors pointed at fake pages on this page */ 7076 for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { 7077 if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; 7078 if (!(m2->mc_flags & C_INITIALIZED)) continue; 7079 if (m2->mc_pg[mc->mc_top] == mp) { 7080 MDB_node *n2 = leaf; 7081 if (m2->mc_ki[mc->mc_top] != mc->mc_ki[mc->mc_top]) { 7082 n2 = NODEPTR(mp, m2->mc_ki[mc->mc_top]); 7083 if (n2->mn_flags & F_SUBDATA) continue; 7084 } 7085 m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(n2); 7086 } 7087 } 7088 } 7089 mc->mc_db->md_entries--; 7090 return rc; 7091 } else { 7092 mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; 7093 } 7094 /* otherwise fall thru and delete the sub-DB */ 7095 } 7096 7097 if (leaf->mn_flags & F_SUBDATA) { 7098 /* add all the child DB's pages to the free list */ 7099 rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0); 7100 if (rc) 7101 goto fail; 7102 } 7103 } 7104 /* LMDB passes F_SUBDATA in 'flags' to delete a DB record */ 7105 else if ((leaf->mn_flags ^ flags) & F_SUBDATA) { 7106 rc = MDB_INCOMPATIBLE; 7107 goto fail; 7108 } 7109 7110 /* add overflow pages to free list */ 7111 if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { 7112 MDB_page *omp; 7113 pgno_t pg; 7114 7115 memcpy(&pg, NODEDATA(leaf), sizeof(pg)); 
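		/* pg now holds the first page number of the overflow chain;
		 * fetch its header to learn the chain length and hand the whole
		 * range back to the free list before the node itself is deleted
		 * below via mdb_cursor_del0().
		 */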
7116 if ((rc = mdb_page_get(mc, pg, &omp, NULL)) || 7117 (rc = mdb_ovpage_free(mc, omp))) 7118 goto fail; 7119 } 7120 7121 del_key: 7122 return mdb_cursor_del0(mc); 7123 7124 fail: 7125 mc->mc_txn->mt_flags |= MDB_TXN_ERROR; 7126 return rc; 7127 } 7128 7129 /** Allocate and initialize new pages for a database. 7130 * Set #MDB_TXN_ERROR on failure. 7131 * @param[in] mc a cursor on the database being added to. 7132 * @param[in] flags flags defining what type of page is being allocated. 7133 * @param[in] num the number of pages to allocate. This is usually 1, 7134 * unless allocating overflow pages for a large record. 7135 * @param[out] mp Address of a page, or NULL on failure. 7136 * @return 0 on success, non-zero on failure. 7137 */ 7138 static int 7139 mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp) 7140 { 7141 MDB_page *np; 7142 int rc; 7143 7144 if ((rc = mdb_page_alloc(mc, num, &np))) 7145 return rc; 7146 DPRINTF(("allocated new mpage %"Z"u, page size %u", 7147 np->mp_pgno, mc->mc_txn->mt_env->me_psize)); 7148 np->mp_flags = flags | P_DIRTY; 7149 np->mp_lower = (PAGEHDRSZ-PAGEBASE); 7150 np->mp_upper = mc->mc_txn->mt_env->me_psize - PAGEBASE; 7151 7152 if (IS_BRANCH(np)) 7153 mc->mc_db->md_branch_pages++; 7154 else if (IS_LEAF(np)) 7155 mc->mc_db->md_leaf_pages++; 7156 else if (IS_OVERFLOW(np)) { 7157 mc->mc_db->md_overflow_pages += num; 7158 np->mp_pages = num; 7159 } 7160 *mp = np; 7161 7162 return 0; 7163 } 7164 7165 /** Calculate the size of a leaf node. 7166 * The size depends on the environment's page size; if a data item 7167 * is too large it will be put onto an overflow page and the node 7168 * size will only include the key and not the data. Sizes are always 7169 * rounded up to an even number of bytes, to guarantee 2-byte alignment 7170 * of the #MDB_node headers. 7171 * @param[in] env The environment handle. 7172 * @param[in] key The key for the node. 7173 * @param[in] data The data for the node. 7174 * @return The number of bytes needed to store the node. 7175 */ 7176 static size_t 7177 mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data) 7178 { 7179 size_t sz; 7180 7181 sz = LEAFSIZE(key, data); 7182 if (sz > env->me_nodemax) { 7183 /* put on overflow page */ 7184 sz -= data->mv_size - sizeof(pgno_t); 7185 } 7186 7187 return EVEN(sz + sizeof(indx_t)); 7188 } 7189 7190 /** Calculate the size of a branch node. 7191 * The size should depend on the environment's page size but since 7192 * we currently don't support spilling large keys onto overflow 7193 * pages, it's simply the size of the #MDB_node header plus the 7194 * size of the key. Sizes are always rounded up to an even number 7195 * of bytes, to guarantee 2-byte alignment of the #MDB_node headers. 7196 * @param[in] env The environment handle. 7197 * @param[in] key The key for the node. 7198 * @return The number of bytes needed to store the node. 7199 */ 7200 static size_t 7201 mdb_branch_size(MDB_env *env, MDB_val *key) 7202 { 7203 size_t sz; 7204 7205 sz = INDXSIZE(key); 7206 if (sz > env->me_nodemax) { 7207 /* put on overflow page */ 7208 /* not implemented */ 7209 /* sz -= key->size - sizeof(pgno_t); */ 7210 } 7211 7212 return sz + sizeof(indx_t); 7213 } 7214 7215 /** Add a node to the page pointed to by the cursor. 7216 * Set #MDB_TXN_ERROR on failure. 7217 * @param[in] mc The cursor for this operation. 7218 * @param[in] indx The index on the page where the new node should be added. 7219 * @param[in] key The key for the new node. 7220 * @param[in] data The data for the new node, if any. 
7221 * @param[in] pgno The page number, if adding a branch node. 7222 * @param[in] flags Flags for the node. 7223 * @return 0 on success, non-zero on failure. Possible errors are: 7224 * <ul> 7225 * <li>ENOMEM - failed to allocate overflow pages for the node. 7226 * <li>MDB_PAGE_FULL - there is insufficient room in the page. This error 7227 * should never happen since all callers already calculate the 7228 * page's free space before calling this function. 7229 * </ul> 7230 */ 7231 static int 7232 mdb_node_add(MDB_cursor *mc, indx_t indx, 7233 MDB_val *key, MDB_val *data, pgno_t pgno, unsigned int flags) 7234 { 7235 unsigned int i; 7236 size_t node_size = NODESIZE; 7237 ssize_t room; 7238 indx_t ofs; 7239 MDB_node *node; 7240 MDB_page *mp = mc->mc_pg[mc->mc_top]; 7241 MDB_page *ofp = NULL; /* overflow page */ 7242 void *ndata; 7243 DKBUF; 7244 7245 mdb_cassert(mc, mp->mp_upper >= mp->mp_lower); 7246 7247 DPRINTF(("add to %s %spage %"Z"u index %i, data size %"Z"u key size %"Z"u [%s]", 7248 IS_LEAF(mp) ? "leaf" : "branch", 7249 IS_SUBP(mp) ? "sub-" : "", 7250 mdb_dbg_pgno(mp), indx, data ? data->mv_size : 0, 7251 key ? key->mv_size : 0, key ? DKEY(key) : "null")); 7252 7253 if (IS_LEAF2(mp)) { 7254 /* Move higher keys up one slot. */ 7255 int ksize = mc->mc_db->md_pad, dif; 7256 char *ptr = LEAF2KEY(mp, indx, ksize); 7257 dif = NUMKEYS(mp) - indx; 7258 if (dif > 0) 7259 memmove(ptr+ksize, ptr, dif*ksize); 7260 /* insert new key */ 7261 memcpy(ptr, key->mv_data, ksize); 7262 7263 /* Just using these for counting */ 7264 mp->mp_lower += sizeof(indx_t); 7265 mp->mp_upper -= ksize - sizeof(indx_t); 7266 return MDB_SUCCESS; 7267 } 7268 7269 room = (ssize_t)SIZELEFT(mp) - (ssize_t)sizeof(indx_t); 7270 if (key != NULL) 7271 node_size += key->mv_size; 7272 if (IS_LEAF(mp)) { 7273 mdb_cassert(mc, key && data); 7274 if (F_ISSET(flags, F_BIGDATA)) { 7275 /* Data already on overflow page. */ 7276 node_size += sizeof(pgno_t); 7277 } else if (node_size + data->mv_size > mc->mc_txn->mt_env->me_nodemax) { 7278 int ovpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize); 7279 int rc; 7280 /* Put data on overflow page. */ 7281 DPRINTF(("data size is %"Z"u, node would be %"Z"u, put data on overflow page", 7282 data->mv_size, node_size+data->mv_size)); 7283 node_size = EVEN(node_size + sizeof(pgno_t)); 7284 if ((ssize_t)node_size > room) 7285 goto full; 7286 if ((rc = mdb_page_new(mc, P_OVERFLOW, ovpages, &ofp))) 7287 return rc; 7288 DPRINTF(("allocated overflow page %"Z"u", ofp->mp_pgno)); 7289 flags |= F_BIGDATA; 7290 goto update; 7291 } else { 7292 node_size += data->mv_size; 7293 } 7294 } 7295 node_size = EVEN(node_size); 7296 if ((ssize_t)node_size > room) 7297 goto full; 7298 7299 update: 7300 /* Move higher pointers up one slot. */ 7301 for (i = NUMKEYS(mp); i > indx; i--) 7302 mp->mp_ptrs[i] = mp->mp_ptrs[i - 1]; 7303 7304 /* Adjust free space offsets. */ 7305 ofs = mp->mp_upper - node_size; 7306 mdb_cassert(mc, ofs >= mp->mp_lower + sizeof(indx_t)); 7307 mp->mp_ptrs[indx] = ofs; 7308 mp->mp_upper = ofs; 7309 mp->mp_lower += sizeof(indx_t); 7310 7311 /* Write the node data. */ 7312 node = NODEPTR(mp, indx); 7313 node->mn_ksize = (key == NULL) ? 
0 : key->mv_size; 7314 node->mn_flags = flags; 7315 if (IS_LEAF(mp)) 7316 SETDSZ(node,data->mv_size); 7317 else 7318 SETPGNO(node,pgno); 7319 7320 if (key) 7321 memcpy(NODEKEY(node), key->mv_data, key->mv_size); 7322 7323 if (IS_LEAF(mp)) { 7324 ndata = NODEDATA(node); 7325 if (ofp == NULL) { 7326 if (F_ISSET(flags, F_BIGDATA)) 7327 memcpy(ndata, data->mv_data, sizeof(pgno_t)); 7328 else if (F_ISSET(flags, MDB_RESERVE)) 7329 data->mv_data = ndata; 7330 else 7331 memcpy(ndata, data->mv_data, data->mv_size); 7332 } else { 7333 memcpy(ndata, &ofp->mp_pgno, sizeof(pgno_t)); 7334 ndata = METADATA(ofp); 7335 if (F_ISSET(flags, MDB_RESERVE)) 7336 data->mv_data = ndata; 7337 else 7338 memcpy(ndata, data->mv_data, data->mv_size); 7339 } 7340 } 7341 7342 return MDB_SUCCESS; 7343 7344 full: 7345 DPRINTF(("not enough room in page %"Z"u, got %u ptrs", 7346 mdb_dbg_pgno(mp), NUMKEYS(mp))); 7347 DPRINTF(("upper-lower = %u - %u = %"Z"d", mp->mp_upper,mp->mp_lower,room)); 7348 DPRINTF(("node size = %"Z"u", node_size)); 7349 mc->mc_txn->mt_flags |= MDB_TXN_ERROR; 7350 return MDB_PAGE_FULL; 7351 } 7352 7353 /** Delete the specified node from a page. 7354 * @param[in] mc Cursor pointing to the node to delete. 7355 * @param[in] ksize The size of a node. Only used if the page is 7356 * part of a #MDB_DUPFIXED database. 7357 */ 7358 static void 7359 mdb_node_del(MDB_cursor *mc, int ksize) 7360 { 7361 MDB_page *mp = mc->mc_pg[mc->mc_top]; 7362 indx_t indx = mc->mc_ki[mc->mc_top]; 7363 unsigned int sz; 7364 indx_t i, j, numkeys, ptr; 7365 MDB_node *node; 7366 char *base; 7367 7368 DPRINTF(("delete node %u on %s page %"Z"u", indx, 7369 IS_LEAF(mp) ? "leaf" : "branch", mdb_dbg_pgno(mp))); 7370 numkeys = NUMKEYS(mp); 7371 mdb_cassert(mc, indx < numkeys); 7372 7373 if (IS_LEAF2(mp)) { 7374 int x = numkeys - 1 - indx; 7375 base = LEAF2KEY(mp, indx, ksize); 7376 if (x) 7377 memmove(base, base + ksize, x * ksize); 7378 mp->mp_lower -= sizeof(indx_t); 7379 mp->mp_upper += ksize - sizeof(indx_t); 7380 return; 7381 } 7382 7383 node = NODEPTR(mp, indx); 7384 sz = NODESIZE + node->mn_ksize; 7385 if (IS_LEAF(mp)) { 7386 if (F_ISSET(node->mn_flags, F_BIGDATA)) 7387 sz += sizeof(pgno_t); 7388 else 7389 sz += NODEDSZ(node); 7390 } 7391 sz = EVEN(sz); 7392 7393 ptr = mp->mp_ptrs[indx]; 7394 for (i = j = 0; i < numkeys; i++) { 7395 if (i != indx) { 7396 mp->mp_ptrs[j] = mp->mp_ptrs[i]; 7397 if (mp->mp_ptrs[i] < ptr) 7398 mp->mp_ptrs[j] += sz; 7399 j++; 7400 } 7401 } 7402 7403 base = (char *)mp + mp->mp_upper + PAGEBASE; 7404 memmove(base + sz, base, ptr - mp->mp_upper); 7405 7406 mp->mp_lower -= sizeof(indx_t); 7407 mp->mp_upper += sz; 7408 } 7409 7410 /** Compact the main page after deleting a node on a subpage. 7411 * @param[in] mp The main page to operate on. 7412 * @param[in] indx The index of the subpage on the main page. 
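 * @note Called from #mdb_cursor_del() after a duplicate has been removed
 * from an inline (non-#F_SUBDATA) sub-page, so that the space freed inside
 * the sub-page is returned to the main page.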
7413 */ 7414 static void 7415 mdb_node_shrink(MDB_page *mp, indx_t indx) 7416 { 7417 MDB_node *node; 7418 MDB_page *sp, *xp; 7419 char *base; 7420 indx_t delta, nsize, len, ptr; 7421 int i; 7422 7423 node = NODEPTR(mp, indx); 7424 sp = (MDB_page *)NODEDATA(node); 7425 delta = SIZELEFT(sp); 7426 nsize = NODEDSZ(node) - delta; 7427 7428 /* Prepare to shift upward, set len = length(subpage part to shift) */ 7429 if (IS_LEAF2(sp)) { 7430 len = nsize; 7431 if (nsize & 1) 7432 return; /* do not make the node uneven-sized */ 7433 } else { 7434 xp = (MDB_page *)((char *)sp + delta); /* destination subpage */ 7435 for (i = NUMKEYS(sp); --i >= 0; ) 7436 xp->mp_ptrs[i] = sp->mp_ptrs[i] - delta; 7437 len = PAGEHDRSZ; 7438 } 7439 sp->mp_upper = sp->mp_lower; 7440 COPY_PGNO(sp->mp_pgno, mp->mp_pgno); 7441 SETDSZ(node, nsize); 7442 7443 /* Shift <lower nodes...initial part of subpage> upward */ 7444 base = (char *)mp + mp->mp_upper + PAGEBASE; 7445 memmove(base + delta, base, (char *)sp + len - base); 7446 7447 ptr = mp->mp_ptrs[indx]; 7448 for (i = NUMKEYS(mp); --i >= 0; ) { 7449 if (mp->mp_ptrs[i] <= ptr) 7450 mp->mp_ptrs[i] += delta; 7451 } 7452 mp->mp_upper += delta; 7453 } 7454 7455 /** Initial setup of a sorted-dups cursor. 7456 * Sorted duplicates are implemented as a sub-database for the given key. 7457 * The duplicate data items are actually keys of the sub-database. 7458 * Operations on the duplicate data items are performed using a sub-cursor 7459 * initialized when the sub-database is first accessed. This function does 7460 * the preliminary setup of the sub-cursor, filling in the fields that 7461 * depend only on the parent DB. 7462 * @param[in] mc The main cursor whose sorted-dups cursor is to be initialized. 7463 */ 7464 static void 7465 mdb_xcursor_init0(MDB_cursor *mc) 7466 { 7467 MDB_xcursor *mx = mc->mc_xcursor; 7468 7469 mx->mx_cursor.mc_xcursor = NULL; 7470 mx->mx_cursor.mc_txn = mc->mc_txn; 7471 mx->mx_cursor.mc_db = &mx->mx_db; 7472 mx->mx_cursor.mc_dbx = &mx->mx_dbx; 7473 mx->mx_cursor.mc_dbi = mc->mc_dbi; 7474 mx->mx_cursor.mc_dbflag = &mx->mx_dbflag; 7475 mx->mx_cursor.mc_snum = 0; 7476 mx->mx_cursor.mc_top = 0; 7477 mx->mx_cursor.mc_flags = C_SUB; 7478 mx->mx_dbx.md_name.mv_size = 0; 7479 mx->mx_dbx.md_name.mv_data = NULL; 7480 mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp; 7481 mx->mx_dbx.md_dcmp = NULL; 7482 mx->mx_dbx.md_rel = mc->mc_dbx->md_rel; 7483 } 7484 7485 /** Final setup of a sorted-dups cursor. 7486 * Sets up the fields that depend on the data from the main cursor. 7487 * @param[in] mc The main cursor whose sorted-dups cursor is to be initialized. 7488 * @param[in] node The data containing the #MDB_db record for the 7489 * sorted-dup database. 
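 * @note When the duplicates are kept on an inline sub-page (no #F_SUBDATA),
 * mc_pg[0] of the sub-cursor points directly at the node data inside the
 * parent leaf, so the sub-cursor must be refreshed whenever that leaf
 * changes (see #mdb_xcursor_init2()).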
7490 */ 7491 static void 7492 mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node) 7493 { 7494 MDB_xcursor *mx = mc->mc_xcursor; 7495 7496 if (node->mn_flags & F_SUBDATA) { 7497 memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDB_db)); 7498 mx->mx_cursor.mc_pg[0] = 0; 7499 mx->mx_cursor.mc_snum = 0; 7500 mx->mx_cursor.mc_top = 0; 7501 mx->mx_cursor.mc_flags = C_SUB; 7502 } else { 7503 MDB_page *fp = NODEDATA(node); 7504 mx->mx_db.md_pad = 0; 7505 mx->mx_db.md_flags = 0; 7506 mx->mx_db.md_depth = 1; 7507 mx->mx_db.md_branch_pages = 0; 7508 mx->mx_db.md_leaf_pages = 1; 7509 mx->mx_db.md_overflow_pages = 0; 7510 mx->mx_db.md_entries = NUMKEYS(fp); 7511 COPY_PGNO(mx->mx_db.md_root, fp->mp_pgno); 7512 mx->mx_cursor.mc_snum = 1; 7513 mx->mx_cursor.mc_top = 0; 7514 mx->mx_cursor.mc_flags = C_INITIALIZED|C_SUB; 7515 mx->mx_cursor.mc_pg[0] = fp; 7516 mx->mx_cursor.mc_ki[0] = 0; 7517 if (mc->mc_db->md_flags & MDB_DUPFIXED) { 7518 mx->mx_db.md_flags = MDB_DUPFIXED; 7519 mx->mx_db.md_pad = fp->mp_pad; 7520 if (mc->mc_db->md_flags & MDB_INTEGERDUP) 7521 mx->mx_db.md_flags |= MDB_INTEGERKEY; 7522 } 7523 } 7524 DPRINTF(("Sub-db -%u root page %"Z"u", mx->mx_cursor.mc_dbi, 7525 mx->mx_db.md_root)); 7526 mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DUPDATA; 7527 #if UINT_MAX < SIZE_MAX 7528 if (mx->mx_dbx.md_cmp == mdb_cmp_int && mx->mx_db.md_pad == sizeof(size_t)) 7529 mx->mx_dbx.md_cmp = mdb_cmp_clong; 7530 #endif 7531 } 7532 7533 7534 /** Fixup a sorted-dups cursor due to underlying update. 7535 * Sets up some fields that depend on the data from the main cursor. 7536 * Almost the same as init1, but skips initialization steps if the 7537 * xcursor had already been used. 7538 * @param[in] mc The main cursor whose sorted-dups cursor is to be fixed up. 7539 * @param[in] src_mx The xcursor of an up-to-date cursor. 7540 * @param[in] new_dupdata True if converting from a non-#F_DUPDATA item. 7541 */ 7542 static void 7543 mdb_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int new_dupdata) 7544 { 7545 MDB_xcursor *mx = mc->mc_xcursor; 7546 7547 if (new_dupdata) { 7548 mx->mx_cursor.mc_snum = 1; 7549 mx->mx_cursor.mc_top = 0; 7550 mx->mx_cursor.mc_flags |= C_INITIALIZED; 7551 mx->mx_cursor.mc_ki[0] = 0; 7552 mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DUPDATA; 7553 #if UINT_MAX < SIZE_MAX 7554 mx->mx_dbx.md_cmp = src_mx->mx_dbx.md_cmp; 7555 #endif 7556 } else if (!(mx->mx_cursor.mc_flags & C_INITIALIZED)) { 7557 return; 7558 } 7559 mx->mx_db = src_mx->mx_db; 7560 mx->mx_cursor.mc_pg[0] = src_mx->mx_cursor.mc_pg[0]; 7561 DPRINTF(("Sub-db -%u root page %"Z"u", mx->mx_cursor.mc_dbi, 7562 mx->mx_db.md_root)); 7563 } 7564 7565 /** Initialize a cursor for a given transaction and database. 
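 * This is the internal initializer behind #mdb_cursor_open() and
 * #mdb_cursor_renew(). A minimal sketch of how the public cursor API is
 * normally driven (illustrative only, most error handling omitted):
 * @code
 *	MDB_cursor *cur;
 *	MDB_val key, data;
 *	int rc = mdb_cursor_open(txn, dbi, &cur);
 *	if (rc == MDB_SUCCESS) {
 *		for (rc = mdb_cursor_get(cur, &key, &data, MDB_FIRST);
 *				rc == MDB_SUCCESS;
 *				rc = mdb_cursor_get(cur, &key, &data, MDB_NEXT))
 *			;
 *		mdb_cursor_close(cur);
 *	}
 * @endcode
 * The returned key/data point directly into the memory map and remain
 * valid at most until the end of the transaction.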
*/ 7566 static void 7567 mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx) 7568 { 7569 mc->mc_next = NULL; 7570 mc->mc_backup = NULL; 7571 mc->mc_dbi = dbi; 7572 mc->mc_txn = txn; 7573 mc->mc_db = &txn->mt_dbs[dbi]; 7574 mc->mc_dbx = &txn->mt_dbxs[dbi]; 7575 mc->mc_dbflag = &txn->mt_dbflags[dbi]; 7576 mc->mc_snum = 0; 7577 mc->mc_top = 0; 7578 mc->mc_pg[0] = 0; 7579 mc->mc_ki[0] = 0; 7580 mc->mc_flags = 0; 7581 if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) { 7582 mdb_tassert(txn, mx != NULL); 7583 mc->mc_xcursor = mx; 7584 mdb_xcursor_init0(mc); 7585 } else { 7586 mc->mc_xcursor = NULL; 7587 } 7588 if (*mc->mc_dbflag & DB_STALE) { 7589 mdb_page_search(mc, NULL, MDB_PS_ROOTONLY); 7590 } 7591 } 7592 7593 int 7594 mdb_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **ret) 7595 { 7596 MDB_cursor *mc; 7597 size_t size = sizeof(MDB_cursor); 7598 7599 if (!ret || !TXN_DBI_EXIST(txn, dbi, DB_VALID)) 7600 return EINVAL; 7601 7602 if (txn->mt_flags & MDB_TXN_BLOCKED) 7603 return MDB_BAD_TXN; 7604 7605 if (dbi == FREE_DBI && !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) 7606 return EINVAL; 7607 7608 if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) 7609 size += sizeof(MDB_xcursor); 7610 7611 if ((mc = malloc(size)) != NULL) { 7612 mdb_cursor_init(mc, txn, dbi, (MDB_xcursor *)(mc + 1)); 7613 if (txn->mt_cursors) { 7614 mc->mc_next = txn->mt_cursors[dbi]; 7615 txn->mt_cursors[dbi] = mc; 7616 mc->mc_flags |= C_UNTRACK; 7617 } 7618 } else { 7619 return ENOMEM; 7620 } 7621 7622 *ret = mc; 7623 7624 return MDB_SUCCESS; 7625 } 7626 7627 int 7628 mdb_cursor_renew(MDB_txn *txn, MDB_cursor *mc) 7629 { 7630 if (!mc || !TXN_DBI_EXIST(txn, mc->mc_dbi, DB_VALID)) 7631 return EINVAL; 7632 7633 if ((mc->mc_flags & C_UNTRACK) || txn->mt_cursors) 7634 return EINVAL; 7635 7636 if (txn->mt_flags & MDB_TXN_BLOCKED) 7637 return MDB_BAD_TXN; 7638 7639 mdb_cursor_init(mc, txn, mc->mc_dbi, mc->mc_xcursor); 7640 return MDB_SUCCESS; 7641 } 7642 7643 /* Return the count of duplicate data items for the current key */ 7644 int 7645 mdb_cursor_count(MDB_cursor *mc, size_t *countp) 7646 { 7647 MDB_node *leaf; 7648 7649 if (mc == NULL || countp == NULL) 7650 return EINVAL; 7651 7652 if (mc->mc_xcursor == NULL) 7653 return MDB_INCOMPATIBLE; 7654 7655 if (mc->mc_txn->mt_flags & MDB_TXN_BLOCKED) 7656 return MDB_BAD_TXN; 7657 7658 if (!(mc->mc_flags & C_INITIALIZED)) 7659 return EINVAL; 7660 7661 if (!mc->mc_snum) 7662 return MDB_NOTFOUND; 7663 7664 if (mc->mc_flags & C_EOF) { 7665 if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) 7666 return MDB_NOTFOUND; 7667 mc->mc_flags ^= C_EOF; 7668 } 7669 7670 leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); 7671 if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { 7672 *countp = 1; 7673 } else { 7674 if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) 7675 return EINVAL; 7676 7677 *countp = mc->mc_xcursor->mx_db.md_entries; 7678 } 7679 return MDB_SUCCESS; 7680 } 7681 7682 void 7683 mdb_cursor_close(MDB_cursor *mc) 7684 { 7685 if (mc && !mc->mc_backup) { 7686 /* remove from txn, if tracked */ 7687 if ((mc->mc_flags & C_UNTRACK) && mc->mc_txn->mt_cursors) { 7688 MDB_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; 7689 while (*prev && *prev != mc) prev = &(*prev)->mc_next; 7690 if (*prev == mc) 7691 *prev = mc->mc_next; 7692 } 7693 free(mc); 7694 } 7695 } 7696 7697 MDB_txn * 7698 mdb_cursor_txn(MDB_cursor *mc) 7699 { 7700 if (!mc) return NULL; 7701 return mc->mc_txn; 7702 } 7703 7704 MDB_dbi 7705 mdb_cursor_dbi(MDB_cursor *mc) 7706 { 7707 return 
mc->mc_dbi; 7708 } 7709 7710 /** Replace the key for a branch node with a new key. 7711 * Set #MDB_TXN_ERROR on failure. 7712 * @param[in] mc Cursor pointing to the node to operate on. 7713 * @param[in] key The new key to use. 7714 * @return 0 on success, non-zero on failure. 7715 */ 7716 static int 7717 mdb_update_key(MDB_cursor *mc, MDB_val *key) 7718 { 7719 MDB_page *mp; 7720 MDB_node *node; 7721 char *base; 7722 size_t len; 7723 int delta, ksize, oksize; 7724 indx_t ptr, i, numkeys, indx; 7725 DKBUF; 7726 7727 indx = mc->mc_ki[mc->mc_top]; 7728 mp = mc->mc_pg[mc->mc_top]; 7729 node = NODEPTR(mp, indx); 7730 ptr = mp->mp_ptrs[indx]; 7731 #if MDB_DEBUG 7732 { 7733 MDB_val k2; 7734 char kbuf2[DKBUF_MAXKEYSIZE*2+1]; 7735 k2.mv_data = NODEKEY(node); 7736 k2.mv_size = node->mn_ksize; 7737 DPRINTF(("update key %u (ofs %u) [%s] to [%s] on page %"Z"u", 7738 indx, ptr, 7739 mdb_dkey(&k2, kbuf2), 7740 DKEY(key), 7741 mp->mp_pgno)); 7742 } 7743 #endif 7744 7745 /* Sizes must be 2-byte aligned. */ 7746 ksize = EVEN(key->mv_size); 7747 oksize = EVEN(node->mn_ksize); 7748 delta = ksize - oksize; 7749 7750 /* Shift node contents if EVEN(key length) changed. */ 7751 if (delta) { 7752 if (delta > 0 && SIZELEFT(mp) < delta) { 7753 pgno_t pgno; 7754 /* not enough space left, do a delete and split */ 7755 DPRINTF(("Not enough room, delta = %d, splitting...", delta)); 7756 pgno = NODEPGNO(node); 7757 mdb_node_del(mc, 0); 7758 return mdb_page_split(mc, key, NULL, pgno, MDB_SPLIT_REPLACE); 7759 } 7760 7761 numkeys = NUMKEYS(mp); 7762 for (i = 0; i < numkeys; i++) { 7763 if (mp->mp_ptrs[i] <= ptr) 7764 mp->mp_ptrs[i] -= delta; 7765 } 7766 7767 base = (char *)mp + mp->mp_upper + PAGEBASE; 7768 len = ptr - mp->mp_upper + NODESIZE; 7769 memmove(base - delta, base, len); 7770 mp->mp_upper -= delta; 7771 7772 node = NODEPTR(mp, indx); 7773 } 7774 7775 /* But even if no shift was needed, update ksize */ 7776 if (node->mn_ksize != key->mv_size) 7777 node->mn_ksize = key->mv_size; 7778 7779 if (key->mv_size) 7780 memcpy(NODEKEY(node), key->mv_data, key->mv_size); 7781 7782 return MDB_SUCCESS; 7783 } 7784 7785 static void 7786 mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst); 7787 7788 /** Perform \b act while tracking temporary cursor \b mn */ 7789 #define WITH_CURSOR_TRACKING(mn, act) do { \ 7790 MDB_cursor dummy, *tracked, **tp = &(mn).mc_txn->mt_cursors[mn.mc_dbi]; \ 7791 if ((mn).mc_flags & C_SUB) { \ 7792 dummy.mc_flags = C_INITIALIZED; \ 7793 dummy.mc_xcursor = (MDB_xcursor *)&(mn); \ 7794 tracked = &dummy; \ 7795 } else { \ 7796 tracked = &(mn); \ 7797 } \ 7798 tracked->mc_next = *tp; \ 7799 *tp = tracked; \ 7800 { act; } \ 7801 *tp = tracked->mc_next; \ 7802 } while (0) 7803 7804 /** Move a node from csrc to cdst. 7805 */ 7806 static int 7807 mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) 7808 { 7809 MDB_node *srcnode; 7810 MDB_val key, data; 7811 pgno_t srcpg; 7812 MDB_cursor mn; 7813 int rc; 7814 unsigned short flags; 7815 7816 DKBUF; 7817 7818 /* Mark src and dst as dirty. 
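	 * Touching replaces a still-clean page with a dirty, writable copy
	 * belonging to this txn (copy-on-write), so both can be modified safely.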
*/ 7819 if ((rc = mdb_page_touch(csrc)) || 7820 (rc = mdb_page_touch(cdst))) 7821 return rc; 7822 7823 if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { 7824 key.mv_size = csrc->mc_db->md_pad; 7825 key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top], key.mv_size); 7826 data.mv_size = 0; 7827 data.mv_data = NULL; 7828 srcpg = 0; 7829 flags = 0; 7830 } else { 7831 srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top]); 7832 mdb_cassert(csrc, !((size_t)srcnode & 1)); 7833 srcpg = NODEPGNO(srcnode); 7834 flags = srcnode->mn_flags; 7835 if (csrc->mc_ki[csrc->mc_top] == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { 7836 unsigned int snum = csrc->mc_snum; 7837 MDB_node *s2; 7838 /* must find the lowest key below src */ 7839 rc = mdb_page_search_lowest(csrc); 7840 if (rc) 7841 return rc; 7842 if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { 7843 key.mv_size = csrc->mc_db->md_pad; 7844 key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); 7845 } else { 7846 s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0); 7847 key.mv_size = NODEKSZ(s2); 7848 key.mv_data = NODEKEY(s2); 7849 } 7850 csrc->mc_snum = snum--; 7851 csrc->mc_top = snum; 7852 } else { 7853 key.mv_size = NODEKSZ(srcnode); 7854 key.mv_data = NODEKEY(srcnode); 7855 } 7856 data.mv_size = NODEDSZ(srcnode); 7857 data.mv_data = NODEDATA(srcnode); 7858 } 7859 mn.mc_xcursor = NULL; 7860 if (IS_BRANCH(cdst->mc_pg[cdst->mc_top]) && cdst->mc_ki[cdst->mc_top] == 0) { 7861 unsigned int snum = cdst->mc_snum; 7862 MDB_node *s2; 7863 MDB_val bkey; 7864 /* must find the lowest key below dst */ 7865 mdb_cursor_copy(cdst, &mn); 7866 rc = mdb_page_search_lowest(&mn); 7867 if (rc) 7868 return rc; 7869 if (IS_LEAF2(mn.mc_pg[mn.mc_top])) { 7870 bkey.mv_size = mn.mc_db->md_pad; 7871 bkey.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, bkey.mv_size); 7872 } else { 7873 s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0); 7874 bkey.mv_size = NODEKSZ(s2); 7875 bkey.mv_data = NODEKEY(s2); 7876 } 7877 mn.mc_snum = snum--; 7878 mn.mc_top = snum; 7879 mn.mc_ki[snum] = 0; 7880 rc = mdb_update_key(&mn, &bkey); 7881 if (rc) 7882 return rc; 7883 } 7884 7885 DPRINTF(("moving %s node %u [%s] on page %"Z"u to node %u on page %"Z"u", 7886 IS_LEAF(csrc->mc_pg[csrc->mc_top]) ? "leaf" : "branch", 7887 csrc->mc_ki[csrc->mc_top], 7888 DKEY(&key), 7889 csrc->mc_pg[csrc->mc_top]->mp_pgno, 7890 cdst->mc_ki[cdst->mc_top], cdst->mc_pg[cdst->mc_top]->mp_pgno)); 7891 7892 /* Add the node to the destination page. 7893 */ 7894 rc = mdb_node_add(cdst, cdst->mc_ki[cdst->mc_top], &key, &data, srcpg, flags); 7895 if (rc != MDB_SUCCESS) 7896 return rc; 7897 7898 /* Delete the node from the source page. 
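	 * mdb_node_del() only uses its size argument for LEAF2 (#MDB_DUPFIXED)
	 * pages; key.mv_size was set to the fixed key width above in that case.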
7899 */ 7900 mdb_node_del(csrc, key.mv_size); 7901 7902 { 7903 /* Adjust other cursors pointing to mp */ 7904 MDB_cursor *m2, *m3; 7905 MDB_dbi dbi = csrc->mc_dbi; 7906 MDB_page *mpd, *mps; 7907 7908 mps = csrc->mc_pg[csrc->mc_top]; 7909 /* If we're adding on the left, bump others up */ 7910 if (fromleft) { 7911 mpd = cdst->mc_pg[csrc->mc_top]; 7912 for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { 7913 if (csrc->mc_flags & C_SUB) 7914 m3 = &m2->mc_xcursor->mx_cursor; 7915 else 7916 m3 = m2; 7917 if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top) 7918 continue; 7919 if (m3 != cdst && 7920 m3->mc_pg[csrc->mc_top] == mpd && 7921 m3->mc_ki[csrc->mc_top] >= cdst->mc_ki[csrc->mc_top]) { 7922 m3->mc_ki[csrc->mc_top]++; 7923 } 7924 if (m3 !=csrc && 7925 m3->mc_pg[csrc->mc_top] == mps && 7926 m3->mc_ki[csrc->mc_top] == csrc->mc_ki[csrc->mc_top]) { 7927 m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top]; 7928 m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; 7929 m3->mc_ki[csrc->mc_top-1]++; 7930 } 7931 if (XCURSOR_INITED(m3) && IS_LEAF(mps)) 7932 XCURSOR_REFRESH(m3, m3->mc_pg[csrc->mc_top], m3->mc_ki[csrc->mc_top]); 7933 } 7934 } else 7935 /* Adding on the right, bump others down */ 7936 { 7937 for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { 7938 if (csrc->mc_flags & C_SUB) 7939 m3 = &m2->mc_xcursor->mx_cursor; 7940 else 7941 m3 = m2; 7942 if (m3 == csrc) continue; 7943 if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top) 7944 continue; 7945 if (m3->mc_pg[csrc->mc_top] == mps) { 7946 if (!m3->mc_ki[csrc->mc_top]) { 7947 m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top]; 7948 m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; 7949 m3->mc_ki[csrc->mc_top-1]--; 7950 } else { 7951 m3->mc_ki[csrc->mc_top]--; 7952 } 7953 if (XCURSOR_INITED(m3) && IS_LEAF(mps)) 7954 XCURSOR_REFRESH(m3, m3->mc_pg[csrc->mc_top], m3->mc_ki[csrc->mc_top]); 7955 } 7956 } 7957 } 7958 } 7959 7960 /* Update the parent separators. 
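	 * If the node at index 0 moved, the lowest key of that page changed,
	 * so the separator key in the parent must be rewritten; for branch
	 * pages, slot 0 then also gets its implicit empty key reinstated.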
7961 */ 7962 if (csrc->mc_ki[csrc->mc_top] == 0) { 7963 if (csrc->mc_ki[csrc->mc_top-1] != 0) { 7964 if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { 7965 key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); 7966 } else { 7967 srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], 0); 7968 key.mv_size = NODEKSZ(srcnode); 7969 key.mv_data = NODEKEY(srcnode); 7970 } 7971 DPRINTF(("update separator for source page %"Z"u to [%s]", 7972 csrc->mc_pg[csrc->mc_top]->mp_pgno, DKEY(&key))); 7973 mdb_cursor_copy(csrc, &mn); 7974 mn.mc_snum--; 7975 mn.mc_top--; 7976 /* We want mdb_rebalance to find mn when doing fixups */ 7977 WITH_CURSOR_TRACKING(mn, 7978 rc = mdb_update_key(&mn, &key)); 7979 if (rc) 7980 return rc; 7981 } 7982 if (IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { 7983 MDB_val nullkey; 7984 indx_t ix = csrc->mc_ki[csrc->mc_top]; 7985 nullkey.mv_size = 0; 7986 csrc->mc_ki[csrc->mc_top] = 0; 7987 rc = mdb_update_key(csrc, &nullkey); 7988 csrc->mc_ki[csrc->mc_top] = ix; 7989 mdb_cassert(csrc, rc == MDB_SUCCESS); 7990 } 7991 } 7992 7993 if (cdst->mc_ki[cdst->mc_top] == 0) { 7994 if (cdst->mc_ki[cdst->mc_top-1] != 0) { 7995 if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { 7996 key.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, key.mv_size); 7997 } else { 7998 srcnode = NODEPTR(cdst->mc_pg[cdst->mc_top], 0); 7999 key.mv_size = NODEKSZ(srcnode); 8000 key.mv_data = NODEKEY(srcnode); 8001 } 8002 DPRINTF(("update separator for destination page %"Z"u to [%s]", 8003 cdst->mc_pg[cdst->mc_top]->mp_pgno, DKEY(&key))); 8004 mdb_cursor_copy(cdst, &mn); 8005 mn.mc_snum--; 8006 mn.mc_top--; 8007 /* We want mdb_rebalance to find mn when doing fixups */ 8008 WITH_CURSOR_TRACKING(mn, 8009 rc = mdb_update_key(&mn, &key)); 8010 if (rc) 8011 return rc; 8012 } 8013 if (IS_BRANCH(cdst->mc_pg[cdst->mc_top])) { 8014 MDB_val nullkey; 8015 indx_t ix = cdst->mc_ki[cdst->mc_top]; 8016 nullkey.mv_size = 0; 8017 cdst->mc_ki[cdst->mc_top] = 0; 8018 rc = mdb_update_key(cdst, &nullkey); 8019 cdst->mc_ki[cdst->mc_top] = ix; 8020 mdb_cassert(cdst, rc == MDB_SUCCESS); 8021 } 8022 } 8023 8024 return MDB_SUCCESS; 8025 } 8026 8027 /** Merge one page into another. 8028 * The nodes from the page pointed to by \b csrc will 8029 * be copied to the page pointed to by \b cdst and then 8030 * the \b csrc page will be freed. 8031 * @param[in] csrc Cursor pointing to the source page. 8032 * @param[in] cdst Cursor pointing to the destination page. 8033 * @return 0 on success, non-zero on failure. 8034 */ 8035 static int 8036 mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) 8037 { 8038 MDB_page *psrc, *pdst; 8039 MDB_node *srcnode; 8040 MDB_val key, data; 8041 unsigned nkeys; 8042 int rc; 8043 indx_t i, j; 8044 8045 psrc = csrc->mc_pg[csrc->mc_top]; 8046 pdst = cdst->mc_pg[cdst->mc_top]; 8047 8048 DPRINTF(("merging page %"Z"u into %"Z"u", psrc->mp_pgno, pdst->mp_pgno)); 8049 8050 mdb_cassert(csrc, csrc->mc_snum > 1); /* can't merge root page */ 8051 mdb_cassert(csrc, cdst->mc_snum > 1); 8052 8053 /* Mark dst as dirty. */ 8054 if ((rc = mdb_page_touch(cdst))) 8055 return rc; 8056 8057 /* get dst page again now that we've touched it. */ 8058 pdst = cdst->mc_pg[cdst->mc_top]; 8059 8060 /* Move all nodes from src to dst. 
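	 * For the first node of a source branch page the separator key must
	 * be taken from the lowest leaf below it, since slot 0 of a branch
	 * page carries no key of its own.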
8061 */ 8062 j = nkeys = NUMKEYS(pdst); 8063 if (IS_LEAF2(psrc)) { 8064 key.mv_size = csrc->mc_db->md_pad; 8065 key.mv_data = METADATA(psrc); 8066 for (i = 0; i < NUMKEYS(psrc); i++, j++) { 8067 rc = mdb_node_add(cdst, j, &key, NULL, 0, 0); 8068 if (rc != MDB_SUCCESS) 8069 return rc; 8070 key.mv_data = (char *)key.mv_data + key.mv_size; 8071 } 8072 } else { 8073 for (i = 0; i < NUMKEYS(psrc); i++, j++) { 8074 srcnode = NODEPTR(psrc, i); 8075 if (i == 0 && IS_BRANCH(psrc)) { 8076 MDB_cursor mn; 8077 MDB_node *s2; 8078 mdb_cursor_copy(csrc, &mn); 8079 mn.mc_xcursor = NULL; 8080 /* must find the lowest key below src */ 8081 rc = mdb_page_search_lowest(&mn); 8082 if (rc) 8083 return rc; 8084 if (IS_LEAF2(mn.mc_pg[mn.mc_top])) { 8085 key.mv_size = mn.mc_db->md_pad; 8086 key.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, key.mv_size); 8087 } else { 8088 s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0); 8089 key.mv_size = NODEKSZ(s2); 8090 key.mv_data = NODEKEY(s2); 8091 } 8092 } else { 8093 key.mv_size = srcnode->mn_ksize; 8094 key.mv_data = NODEKEY(srcnode); 8095 } 8096 8097 data.mv_size = NODEDSZ(srcnode); 8098 data.mv_data = NODEDATA(srcnode); 8099 rc = mdb_node_add(cdst, j, &key, &data, NODEPGNO(srcnode), srcnode->mn_flags); 8100 if (rc != MDB_SUCCESS) 8101 return rc; 8102 } 8103 } 8104 8105 DPRINTF(("dst page %"Z"u now has %u keys (%.1f%% filled)", 8106 pdst->mp_pgno, NUMKEYS(pdst), 8107 (float)PAGEFILL(cdst->mc_txn->mt_env, pdst) / 10)); 8108 8109 /* Unlink the src page from parent and add to free list. 8110 */ 8111 csrc->mc_top--; 8112 mdb_node_del(csrc, 0); 8113 if (csrc->mc_ki[csrc->mc_top] == 0) { 8114 key.mv_size = 0; 8115 rc = mdb_update_key(csrc, &key); 8116 if (rc) { 8117 csrc->mc_top++; 8118 return rc; 8119 } 8120 } 8121 csrc->mc_top++; 8122 8123 psrc = csrc->mc_pg[csrc->mc_top]; 8124 /* If not operating on FreeDB, allow this page to be reused 8125 * in this txn. Otherwise just add to free list. 8126 */ 8127 rc = mdb_page_loose(csrc, psrc); 8128 if (rc) 8129 return rc; 8130 if (IS_LEAF(psrc)) 8131 csrc->mc_db->md_leaf_pages--; 8132 else 8133 csrc->mc_db->md_branch_pages--; 8134 { 8135 /* Adjust other cursors pointing to mp */ 8136 MDB_cursor *m2, *m3; 8137 MDB_dbi dbi = csrc->mc_dbi; 8138 unsigned int top = csrc->mc_top; 8139 8140 for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { 8141 if (csrc->mc_flags & C_SUB) 8142 m3 = &m2->mc_xcursor->mx_cursor; 8143 else 8144 m3 = m2; 8145 if (m3 == csrc) continue; 8146 if (m3->mc_snum < csrc->mc_snum) continue; 8147 if (m3->mc_pg[top] == psrc) { 8148 m3->mc_pg[top] = pdst; 8149 m3->mc_ki[top] += nkeys; 8150 m3->mc_ki[top-1] = cdst->mc_ki[top-1]; 8151 } else if (m3->mc_pg[top-1] == csrc->mc_pg[top-1] && 8152 m3->mc_ki[top-1] > csrc->mc_ki[top-1]) { 8153 m3->mc_ki[top-1]--; 8154 } 8155 if (XCURSOR_INITED(m3) && IS_LEAF(psrc)) 8156 XCURSOR_REFRESH(m3, m3->mc_pg[top], m3->mc_ki[top]); 8157 } 8158 } 8159 { 8160 unsigned int snum = cdst->mc_snum; 8161 uint16_t depth = cdst->mc_db->md_depth; 8162 mdb_cursor_pop(cdst); 8163 rc = mdb_rebalance(cdst); 8164 /* Did the tree height change? */ 8165 if (depth != cdst->mc_db->md_depth) 8166 snum += cdst->mc_db->md_depth - depth; 8167 cdst->mc_snum = snum; 8168 cdst->mc_top = snum-1; 8169 } 8170 return rc; 8171 } 8172 8173 /** Copy the contents of a cursor. 8174 * @param[in] csrc The cursor to copy from. 8175 * @param[out] cdst The cursor to copy to. 
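 * @note mc_xcursor is not copied; callers that need a temporary cursor
 * (#mdb_node_move(), #mdb_page_merge(), #mdb_rebalance(), #mdb_page_split())
 * make sure it is NULL on the copy themselves.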
8176 */ 8177 static void 8178 mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst) 8179 { 8180 unsigned int i; 8181 8182 cdst->mc_txn = csrc->mc_txn; 8183 cdst->mc_dbi = csrc->mc_dbi; 8184 cdst->mc_db = csrc->mc_db; 8185 cdst->mc_dbx = csrc->mc_dbx; 8186 cdst->mc_snum = csrc->mc_snum; 8187 cdst->mc_top = csrc->mc_top; 8188 cdst->mc_flags = csrc->mc_flags; 8189 8190 for (i=0; i<csrc->mc_snum; i++) { 8191 cdst->mc_pg[i] = csrc->mc_pg[i]; 8192 cdst->mc_ki[i] = csrc->mc_ki[i]; 8193 } 8194 } 8195 8196 /** Rebalance the tree after a delete operation. 8197 * @param[in] mc Cursor pointing to the page where rebalancing 8198 * should begin. 8199 * @return 0 on success, non-zero on failure. 8200 */ 8201 static int 8202 mdb_rebalance(MDB_cursor *mc) 8203 { 8204 MDB_node *node; 8205 int rc, fromleft; 8206 unsigned int ptop, minkeys, thresh; 8207 MDB_cursor mn; 8208 indx_t oldki; 8209 8210 if (IS_BRANCH(mc->mc_pg[mc->mc_top])) { 8211 minkeys = 2; 8212 thresh = 1; 8213 } else { 8214 minkeys = 1; 8215 thresh = FILL_THRESHOLD; 8216 } 8217 DPRINTF(("rebalancing %s page %"Z"u (has %u keys, %.1f%% full)", 8218 IS_LEAF(mc->mc_pg[mc->mc_top]) ? "leaf" : "branch", 8219 mdb_dbg_pgno(mc->mc_pg[mc->mc_top]), NUMKEYS(mc->mc_pg[mc->mc_top]), 8220 (float)PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) / 10)); 8221 8222 if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= thresh && 8223 NUMKEYS(mc->mc_pg[mc->mc_top]) >= minkeys) { 8224 DPRINTF(("no need to rebalance page %"Z"u, above fill threshold", 8225 mdb_dbg_pgno(mc->mc_pg[mc->mc_top]))); 8226 return MDB_SUCCESS; 8227 } 8228 8229 if (mc->mc_snum < 2) { 8230 MDB_page *mp = mc->mc_pg[0]; 8231 if (IS_SUBP(mp)) { 8232 DPUTS("Can't rebalance a subpage, ignoring"); 8233 return MDB_SUCCESS; 8234 } 8235 if (NUMKEYS(mp) == 0) { 8236 DPUTS("tree is completely empty"); 8237 mc->mc_db->md_root = P_INVALID; 8238 mc->mc_db->md_depth = 0; 8239 mc->mc_db->md_leaf_pages = 0; 8240 rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); 8241 if (rc) 8242 return rc; 8243 /* Adjust cursors pointing to mp */ 8244 mc->mc_snum = 0; 8245 mc->mc_top = 0; 8246 mc->mc_flags &= ~C_INITIALIZED; 8247 { 8248 MDB_cursor *m2, *m3; 8249 MDB_dbi dbi = mc->mc_dbi; 8250 8251 for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { 8252 if (mc->mc_flags & C_SUB) 8253 m3 = &m2->mc_xcursor->mx_cursor; 8254 else 8255 m3 = m2; 8256 if (!(m3->mc_flags & C_INITIALIZED) || (m3->mc_snum < mc->mc_snum)) 8257 continue; 8258 if (m3->mc_pg[0] == mp) { 8259 m3->mc_snum = 0; 8260 m3->mc_top = 0; 8261 m3->mc_flags &= ~C_INITIALIZED; 8262 } 8263 } 8264 } 8265 } else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) { 8266 int i; 8267 DPUTS("collapsing root page!"); 8268 rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); 8269 if (rc) 8270 return rc; 8271 mc->mc_db->md_root = NODEPGNO(NODEPTR(mp, 0)); 8272 rc = mdb_page_get(mc, mc->mc_db->md_root, &mc->mc_pg[0], NULL); 8273 if (rc) 8274 return rc; 8275 mc->mc_db->md_depth--; 8276 mc->mc_db->md_branch_pages--; 8277 mc->mc_ki[0] = mc->mc_ki[1]; 8278 for (i = 1; i<mc->mc_db->md_depth; i++) { 8279 mc->mc_pg[i] = mc->mc_pg[i+1]; 8280 mc->mc_ki[i] = mc->mc_ki[i+1]; 8281 } 8282 { 8283 /* Adjust other cursors pointing to mp */ 8284 MDB_cursor *m2, *m3; 8285 MDB_dbi dbi = mc->mc_dbi; 8286 8287 for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { 8288 if (mc->mc_flags & C_SUB) 8289 m3 = &m2->mc_xcursor->mx_cursor; 8290 else 8291 m3 = m2; 8292 if (m3 == mc) continue; 8293 if (!(m3->mc_flags & C_INITIALIZED)) 8294 continue; 8295 if (m3->mc_pg[0] == 
mp) { 8296 for (i=0; i<mc->mc_db->md_depth; i++) { 8297 m3->mc_pg[i] = m3->mc_pg[i+1]; 8298 m3->mc_ki[i] = m3->mc_ki[i+1]; 8299 } 8300 m3->mc_snum--; 8301 m3->mc_top--; 8302 } 8303 } 8304 } 8305 } else 8306 DPUTS("root page doesn't need rebalancing"); 8307 return MDB_SUCCESS; 8308 } 8309 8310 /* The parent (branch page) must have at least 2 pointers, 8311 * otherwise the tree is invalid. 8312 */ 8313 ptop = mc->mc_top-1; 8314 mdb_cassert(mc, NUMKEYS(mc->mc_pg[ptop]) > 1); 8315 8316 /* Leaf page fill factor is below the threshold. 8317 * Try to move keys from left or right neighbor, or 8318 * merge with a neighbor page. 8319 */ 8320 8321 /* Find neighbors. 8322 */ 8323 mdb_cursor_copy(mc, &mn); 8324 mn.mc_xcursor = NULL; 8325 8326 oldki = mc->mc_ki[mc->mc_top]; 8327 if (mc->mc_ki[ptop] == 0) { 8328 /* We're the leftmost leaf in our parent. 8329 */ 8330 DPUTS("reading right neighbor"); 8331 mn.mc_ki[ptop]++; 8332 node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); 8333 rc = mdb_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL); 8334 if (rc) 8335 return rc; 8336 mn.mc_ki[mn.mc_top] = 0; 8337 mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]); 8338 fromleft = 0; 8339 } else { 8340 /* There is at least one neighbor to the left. 8341 */ 8342 DPUTS("reading left neighbor"); 8343 mn.mc_ki[ptop]--; 8344 node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); 8345 rc = mdb_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL); 8346 if (rc) 8347 return rc; 8348 mn.mc_ki[mn.mc_top] = NUMKEYS(mn.mc_pg[mn.mc_top]) - 1; 8349 mc->mc_ki[mc->mc_top] = 0; 8350 fromleft = 1; 8351 } 8352 8353 DPRINTF(("found neighbor page %"Z"u (%u keys, %.1f%% full)", 8354 mn.mc_pg[mn.mc_top]->mp_pgno, NUMKEYS(mn.mc_pg[mn.mc_top]), 8355 (float)PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) / 10)); 8356 8357 /* If the neighbor page is above threshold and has enough keys, 8358 * move one key from it. Otherwise we should try to merge them. 8359 * (A branch page must never have less than 2 keys.) 8360 */ 8361 if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= thresh && NUMKEYS(mn.mc_pg[mn.mc_top]) > minkeys) { 8362 rc = mdb_node_move(&mn, mc, fromleft); 8363 if (fromleft) { 8364 /* if we inserted on left, bump position up */ 8365 oldki++; 8366 } 8367 } else { 8368 if (!fromleft) { 8369 rc = mdb_page_merge(&mn, mc); 8370 } else { 8371 oldki += NUMKEYS(mn.mc_pg[mn.mc_top]); 8372 mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1; 8373 /* We want mdb_rebalance to find mn when doing fixups */ 8374 WITH_CURSOR_TRACKING(mn, 8375 rc = mdb_page_merge(mc, &mn)); 8376 mdb_cursor_copy(&mn, mc); 8377 } 8378 mc->mc_flags &= ~C_EOF; 8379 } 8380 mc->mc_ki[mc->mc_top] = oldki; 8381 return rc; 8382 } 8383 8384 /** Complete a delete operation started by #mdb_cursor_del(). */ 8385 static int 8386 mdb_cursor_del0(MDB_cursor *mc) 8387 { 8388 int rc; 8389 MDB_page *mp; 8390 indx_t ki; 8391 unsigned int nkeys; 8392 MDB_cursor *m2, *m3; 8393 MDB_dbi dbi = mc->mc_dbi; 8394 8395 ki = mc->mc_ki[mc->mc_top]; 8396 mp = mc->mc_pg[mc->mc_top]; 8397 mdb_node_del(mc, mc->mc_db->md_pad); 8398 mc->mc_db->md_entries--; 8399 { 8400 /* Adjust other cursors pointing to mp */ 8401 for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { 8402 m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; 8403 if (! 
(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) 8404 continue; 8405 if (m3 == mc || m3->mc_snum < mc->mc_snum) 8406 continue; 8407 if (m3->mc_pg[mc->mc_top] == mp) { 8408 if (m3->mc_ki[mc->mc_top] == ki) { 8409 m3->mc_flags |= C_DEL; 8410 if (mc->mc_db->md_flags & MDB_DUPSORT) { 8411 /* Sub-cursor referred into dataset which is gone */ 8412 m3->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); 8413 } 8414 continue; 8415 } else if (m3->mc_ki[mc->mc_top] > ki) { 8416 m3->mc_ki[mc->mc_top]--; 8417 } 8418 if (XCURSOR_INITED(m3)) 8419 XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); 8420 } 8421 } 8422 } 8423 rc = mdb_rebalance(mc); 8424 8425 if (rc == MDB_SUCCESS) { 8426 /* DB is totally empty now, just bail out. 8427 * Other cursors adjustments were already done 8428 * by mdb_rebalance and aren't needed here. 8429 */ 8430 if (!mc->mc_snum) 8431 return rc; 8432 8433 mp = mc->mc_pg[mc->mc_top]; 8434 nkeys = NUMKEYS(mp); 8435 8436 /* Adjust other cursors pointing to mp */ 8437 for (m2 = mc->mc_txn->mt_cursors[dbi]; !rc && m2; m2=m2->mc_next) { 8438 m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; 8439 if (! (m2->mc_flags & m3->mc_flags & C_INITIALIZED)) 8440 continue; 8441 if (m3->mc_snum < mc->mc_snum) 8442 continue; 8443 if (m3->mc_pg[mc->mc_top] == mp) { 8444 /* if m3 points past last node in page, find next sibling */ 8445 if (m3->mc_ki[mc->mc_top] >= mc->mc_ki[mc->mc_top]) { 8446 if (m3->mc_ki[mc->mc_top] >= nkeys) { 8447 rc = mdb_cursor_sibling(m3, 1); 8448 if (rc == MDB_NOTFOUND) { 8449 m3->mc_flags |= C_EOF; 8450 rc = MDB_SUCCESS; 8451 continue; 8452 } 8453 } 8454 if (mc->mc_db->md_flags & MDB_DUPSORT) { 8455 MDB_node *node = NODEPTR(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]); 8456 /* If this node has dupdata, it may need to be reinited 8457 * because its data has moved. 8458 * If the xcursor was not initd it must be reinited. 8459 * Else if node points to a subDB, nothing is needed. 8460 * Else (xcursor was initd, not a subDB) needs mc_pg[0] reset. 8461 */ 8462 if (node->mn_flags & F_DUPDATA) { 8463 if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { 8464 if (!(node->mn_flags & F_SUBDATA)) 8465 m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); 8466 } else 8467 mdb_xcursor_init1(m3, node); 8468 } 8469 } 8470 } 8471 } 8472 } 8473 mc->mc_flags |= C_DEL; 8474 } 8475 8476 if (rc) 8477 mc->mc_txn->mt_flags |= MDB_TXN_ERROR; 8478 return rc; 8479 } 8480 8481 int 8482 mdb_del(MDB_txn *txn, MDB_dbi dbi, 8483 MDB_val *key, MDB_val *data) 8484 { 8485 if (!key || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) 8486 return EINVAL; 8487 8488 if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED)) 8489 return (txn->mt_flags & MDB_TXN_RDONLY) ? 
EACCES : MDB_BAD_TXN; 8490 8491 if (!F_ISSET(txn->mt_dbs[dbi].md_flags, MDB_DUPSORT)) { 8492 /* must ignore any data */ 8493 data = NULL; 8494 } 8495 8496 return mdb_del0(txn, dbi, key, data, 0); 8497 } 8498 8499 static int 8500 mdb_del0(MDB_txn *txn, MDB_dbi dbi, 8501 MDB_val *key, MDB_val *data, unsigned flags) 8502 { 8503 MDB_cursor mc; 8504 MDB_xcursor mx; 8505 MDB_cursor_op op; 8506 MDB_val rdata, *xdata; 8507 int rc, exact = 0; 8508 DKBUF; 8509 8510 DPRINTF(("====> delete db %u key [%s]", dbi, DKEY(key))); 8511 8512 mdb_cursor_init(&mc, txn, dbi, &mx); 8513 8514 if (data) { 8515 op = MDB_GET_BOTH; 8516 rdata = *data; 8517 xdata = &rdata; 8518 } else { 8519 op = MDB_SET; 8520 xdata = NULL; 8521 flags |= MDB_NODUPDATA; 8522 } 8523 rc = mdb_cursor_set(&mc, key, xdata, op, &exact); 8524 if (rc == 0) { 8525 /* let mdb_page_split know about this cursor if needed: 8526 * delete will trigger a rebalance; if it needs to move 8527 * a node from one page to another, it will have to 8528 * update the parent's separator key(s). If the new sepkey 8529 * is larger than the current one, the parent page may 8530 * run out of space, triggering a split. We need this 8531 * cursor to be consistent until the end of the rebalance. 8532 */ 8533 mc.mc_flags |= C_UNTRACK; 8534 mc.mc_next = txn->mt_cursors[dbi]; 8535 txn->mt_cursors[dbi] = &mc; 8536 rc = mdb_cursor_del(&mc, flags); 8537 txn->mt_cursors[dbi] = mc.mc_next; 8538 } 8539 return rc; 8540 } 8541 8542 /** Split a page and insert a new node. 8543 * Set #MDB_TXN_ERROR on failure. 8544 * @param[in,out] mc Cursor pointing to the page and desired insertion index. 8545 * The cursor will be updated to point to the actual page and index where 8546 * the node got inserted after the split. 8547 * @param[in] newkey The key for the newly inserted node. 8548 * @param[in] newdata The data for the newly inserted node. 8549 * @param[in] newpgno The page number, if the new node is a branch node. 8550 * @param[in] nflags The #NODE_ADD_FLAGS for the new node. 8551 * @return 0 on success, non-zero on failure. 8552 */ 8553 static int 8554 mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno, 8555 unsigned int nflags) 8556 { 8557 unsigned int flags; 8558 int rc = MDB_SUCCESS, new_root = 0, did_split = 0; 8559 indx_t newindx; 8560 pgno_t pgno = 0; 8561 int i, j, split_indx, nkeys, pmax; 8562 MDB_env *env = mc->mc_txn->mt_env; 8563 MDB_node *node; 8564 MDB_val sepkey, rkey, xdata, *rdata = &xdata; 8565 MDB_page *copy = NULL; 8566 MDB_page *mp, *rp, *pp; 8567 int ptop; 8568 MDB_cursor mn; 8569 DKBUF; 8570 8571 mp = mc->mc_pg[mc->mc_top]; 8572 newindx = mc->mc_ki[mc->mc_top]; 8573 nkeys = NUMKEYS(mp); 8574 8575 DPRINTF(("-----> splitting %s page %"Z"u and adding [%s] at index %i/%i", 8576 IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno, 8577 DKEY(newkey), mc->mc_ki[mc->mc_top], nkeys)); 8578 8579 /* Create a right sibling. */ 8580 if ((rc = mdb_page_new(mc, mp->mp_flags, 1, &rp))) 8581 return rc; 8582 rp->mp_pad = mp->mp_pad; 8583 DPRINTF(("new right sibling: page %"Z"u", rp->mp_pgno)); 8584 8585 /* Usually when splitting the root page, the cursor 8586 * height is 1. But when called from mdb_update_key, 8587 * the cursor height may be greater because it walks 8588 * up the stack while finding the branch slot to update. 
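	 * Either way, when mc_top is 0 a new root branch page is allocated
	 * here and the existing page stack is pushed down one level.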
8589 */ 8590 if (mc->mc_top < 1) { 8591 if ((rc = mdb_page_new(mc, P_BRANCH, 1, &pp))) 8592 goto done; 8593 /* shift current top to make room for new parent */ 8594 for (i=mc->mc_snum; i>0; i--) { 8595 mc->mc_pg[i] = mc->mc_pg[i-1]; 8596 mc->mc_ki[i] = mc->mc_ki[i-1]; 8597 } 8598 mc->mc_pg[0] = pp; 8599 mc->mc_ki[0] = 0; 8600 mc->mc_db->md_root = pp->mp_pgno; 8601 DPRINTF(("root split! new root = %"Z"u", pp->mp_pgno)); 8602 new_root = mc->mc_db->md_depth++; 8603 8604 /* Add left (implicit) pointer. */ 8605 if ((rc = mdb_node_add(mc, 0, NULL, NULL, mp->mp_pgno, 0)) != MDB_SUCCESS) { 8606 /* undo the pre-push */ 8607 mc->mc_pg[0] = mc->mc_pg[1]; 8608 mc->mc_ki[0] = mc->mc_ki[1]; 8609 mc->mc_db->md_root = mp->mp_pgno; 8610 mc->mc_db->md_depth--; 8611 goto done; 8612 } 8613 mc->mc_snum++; 8614 mc->mc_top++; 8615 ptop = 0; 8616 } else { 8617 ptop = mc->mc_top-1; 8618 DPRINTF(("parent branch page is %"Z"u", mc->mc_pg[ptop]->mp_pgno)); 8619 } 8620 8621 mdb_cursor_copy(mc, &mn); 8622 mn.mc_xcursor = NULL; 8623 mn.mc_pg[mn.mc_top] = rp; 8624 mn.mc_ki[ptop] = mc->mc_ki[ptop]+1; 8625 8626 if (nflags & MDB_APPEND) { 8627 mn.mc_ki[mn.mc_top] = 0; 8628 sepkey = *newkey; 8629 split_indx = newindx; 8630 nkeys = 0; 8631 } else { 8632 8633 split_indx = (nkeys+1) / 2; 8634 8635 if (IS_LEAF2(rp)) { 8636 char *split, *ins; 8637 int x; 8638 unsigned int lsize, rsize, ksize; 8639 /* Move half of the keys to the right sibling */ 8640 x = mc->mc_ki[mc->mc_top] - split_indx; 8641 ksize = mc->mc_db->md_pad; 8642 split = LEAF2KEY(mp, split_indx, ksize); 8643 rsize = (nkeys - split_indx) * ksize; 8644 lsize = (nkeys - split_indx) * sizeof(indx_t); 8645 mp->mp_lower -= lsize; 8646 rp->mp_lower += lsize; 8647 mp->mp_upper += rsize - lsize; 8648 rp->mp_upper -= rsize - lsize; 8649 sepkey.mv_size = ksize; 8650 if (newindx == split_indx) { 8651 sepkey.mv_data = newkey->mv_data; 8652 } else { 8653 sepkey.mv_data = split; 8654 } 8655 if (x<0) { 8656 ins = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], ksize); 8657 memcpy(rp->mp_ptrs, split, rsize); 8658 sepkey.mv_data = rp->mp_ptrs; 8659 memmove(ins+ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize); 8660 memcpy(ins, newkey->mv_data, ksize); 8661 mp->mp_lower += sizeof(indx_t); 8662 mp->mp_upper -= ksize - sizeof(indx_t); 8663 } else { 8664 if (x) 8665 memcpy(rp->mp_ptrs, split, x * ksize); 8666 ins = LEAF2KEY(rp, x, ksize); 8667 memcpy(ins, newkey->mv_data, ksize); 8668 memcpy(ins+ksize, split + x * ksize, rsize - x * ksize); 8669 rp->mp_lower += sizeof(indx_t); 8670 rp->mp_upper -= ksize - sizeof(indx_t); 8671 mc->mc_ki[mc->mc_top] = x; 8672 } 8673 } else { 8674 int psize, nsize, k; 8675 /* Maximum free space in an empty page */ 8676 pmax = env->me_psize - PAGEHDRSZ; 8677 if (IS_LEAF(mp)) 8678 nsize = mdb_leaf_size(env, newkey, newdata); 8679 else 8680 nsize = mdb_branch_size(env, newkey); 8681 nsize = EVEN(nsize); 8682 8683 /* grab a page to hold a temporary copy */ 8684 copy = mdb_page_malloc(mc->mc_txn, 1); 8685 if (copy == NULL) { 8686 rc = ENOMEM; 8687 goto done; 8688 } 8689 copy->mp_pgno = mp->mp_pgno; 8690 copy->mp_flags = mp->mp_flags; 8691 copy->mp_lower = (PAGEHDRSZ-PAGEBASE); 8692 copy->mp_upper = env->me_psize - PAGEBASE; 8693 8694 /* prepare to insert */ 8695 for (i=0, j=0; i<nkeys; i++) { 8696 if (i == newindx) { 8697 copy->mp_ptrs[j++] = 0; 8698 } 8699 copy->mp_ptrs[j++] = mp->mp_ptrs[i]; 8700 } 8701 8702 /* When items are relatively large the split point needs 8703 * to be checked, because being off-by-one will make the 8704 * difference between success or 
failure in mdb_node_add. 8705 * 8706 * It's also relevant if a page happens to be laid out 8707 * such that one half of its nodes are all "small" and 8708 * the other half of its nodes are "large." If the new 8709 * item is also "large" and falls on the half with 8710 * "large" nodes, it also may not fit. 8711 * 8712 * As a final tweak, if the new item goes on the last 8713 * spot on the page (and thus, onto the new page), bias 8714 * the split so the new page is emptier than the old page. 8715 * This yields better packing during sequential inserts. 8716 */ 8717 if (nkeys < 20 || nsize > pmax/16 || newindx >= nkeys) { 8718 /* Find split point */ 8719 psize = 0; 8720 if (newindx <= split_indx || newindx >= nkeys) { 8721 i = 0; j = 1; 8722 k = newindx >= nkeys ? nkeys : split_indx+1+IS_LEAF(mp); 8723 } else { 8724 i = nkeys; j = -1; 8725 k = split_indx-1; 8726 } 8727 for (; i!=k; i+=j) { 8728 if (i == newindx) { 8729 psize += nsize; 8730 node = NULL; 8731 } else { 8732 node = (MDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE); 8733 psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t); 8734 if (IS_LEAF(mp)) { 8735 if (F_ISSET(node->mn_flags, F_BIGDATA)) 8736 psize += sizeof(pgno_t); 8737 else 8738 psize += NODEDSZ(node); 8739 } 8740 psize = EVEN(psize); 8741 } 8742 if (psize > pmax || i == k-j) { 8743 split_indx = i + (j<0); 8744 break; 8745 } 8746 } 8747 } 8748 if (split_indx == newindx) { 8749 sepkey.mv_size = newkey->mv_size; 8750 sepkey.mv_data = newkey->mv_data; 8751 } else { 8752 node = (MDB_node *)((char *)mp + copy->mp_ptrs[split_indx] + PAGEBASE); 8753 sepkey.mv_size = node->mn_ksize; 8754 sepkey.mv_data = NODEKEY(node); 8755 } 8756 } 8757 } 8758 8759 DPRINTF(("separator is %d [%s]", split_indx, DKEY(&sepkey))); 8760 8761 /* Copy separator key to the parent. 8762 */ 8763 if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(env, &sepkey)) { 8764 int snum = mc->mc_snum; 8765 mn.mc_snum--; 8766 mn.mc_top--; 8767 did_split = 1; 8768 /* We want other splits to find mn when doing fixups */ 8769 WITH_CURSOR_TRACKING(mn, 8770 rc = mdb_page_split(&mn, &sepkey, NULL, rp->mp_pgno, 0)); 8771 if (rc) 8772 goto done; 8773 8774 /* root split? */ 8775 if (mc->mc_snum > snum) { 8776 ptop++; 8777 } 8778 /* Right page might now have changed parent. 8779 * Check if left page also changed parent. 
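		 * If so, adopt the parent chain that mn now holds and step back
		 * to the slot just left of mn (or to its left sibling when mn
		 * sits at slot 0).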
8780 */ 8781 if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && 8782 mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { 8783 for (i=0; i<ptop; i++) { 8784 mc->mc_pg[i] = mn.mc_pg[i]; 8785 mc->mc_ki[i] = mn.mc_ki[i]; 8786 } 8787 mc->mc_pg[ptop] = mn.mc_pg[ptop]; 8788 if (mn.mc_ki[ptop]) { 8789 mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1; 8790 } else { 8791 /* find right page's left sibling */ 8792 mc->mc_ki[ptop] = mn.mc_ki[ptop]; 8793 mdb_cursor_sibling(mc, 0); 8794 } 8795 } 8796 } else { 8797 mn.mc_top--; 8798 rc = mdb_node_add(&mn, mn.mc_ki[ptop], &sepkey, NULL, rp->mp_pgno, 0); 8799 mn.mc_top++; 8800 } 8801 if (rc != MDB_SUCCESS) { 8802 goto done; 8803 } 8804 if (nflags & MDB_APPEND) { 8805 mc->mc_pg[mc->mc_top] = rp; 8806 mc->mc_ki[mc->mc_top] = 0; 8807 rc = mdb_node_add(mc, 0, newkey, newdata, newpgno, nflags); 8808 if (rc) 8809 goto done; 8810 for (i=0; i<mc->mc_top; i++) 8811 mc->mc_ki[i] = mn.mc_ki[i]; 8812 } else if (!IS_LEAF2(mp)) { 8813 /* Move nodes */ 8814 mc->mc_pg[mc->mc_top] = rp; 8815 i = split_indx; 8816 j = 0; 8817 do { 8818 if (i == newindx) { 8819 rkey.mv_data = newkey->mv_data; 8820 rkey.mv_size = newkey->mv_size; 8821 if (IS_LEAF(mp)) { 8822 rdata = newdata; 8823 } else 8824 pgno = newpgno; 8825 flags = nflags; 8826 /* Update index for the new key. */ 8827 mc->mc_ki[mc->mc_top] = j; 8828 } else { 8829 node = (MDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE); 8830 rkey.mv_data = NODEKEY(node); 8831 rkey.mv_size = node->mn_ksize; 8832 if (IS_LEAF(mp)) { 8833 xdata.mv_data = NODEDATA(node); 8834 xdata.mv_size = NODEDSZ(node); 8835 rdata = &xdata; 8836 } else 8837 pgno = NODEPGNO(node); 8838 flags = node->mn_flags; 8839 } 8840 8841 if (!IS_LEAF(mp) && j == 0) { 8842 /* First branch index doesn't need key data. */ 8843 rkey.mv_size = 0; 8844 } 8845 8846 rc = mdb_node_add(mc, j, &rkey, rdata, pgno, flags); 8847 if (rc) 8848 goto done; 8849 if (i == nkeys) { 8850 i = 0; 8851 j = 0; 8852 mc->mc_pg[mc->mc_top] = copy; 8853 } else { 8854 i++; 8855 j++; 8856 } 8857 } while (i != split_indx); 8858 8859 nkeys = NUMKEYS(copy); 8860 for (i=0; i<nkeys; i++) 8861 mp->mp_ptrs[i] = copy->mp_ptrs[i]; 8862 mp->mp_lower = copy->mp_lower; 8863 mp->mp_upper = copy->mp_upper; 8864 memcpy(NODEPTR(mp, nkeys-1), NODEPTR(copy, nkeys-1), 8865 env->me_psize - copy->mp_upper - PAGEBASE); 8866 8867 /* reset back to original page */ 8868 if (newindx < split_indx) { 8869 mc->mc_pg[mc->mc_top] = mp; 8870 } else { 8871 mc->mc_pg[mc->mc_top] = rp; 8872 mc->mc_ki[ptop]++; 8873 /* Make sure mc_ki is still valid. 8874 */ 8875 if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && 8876 mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { 8877 for (i=0; i<=ptop; i++) { 8878 mc->mc_pg[i] = mn.mc_pg[i]; 8879 mc->mc_ki[i] = mn.mc_ki[i]; 8880 } 8881 } 8882 } 8883 if (nflags & MDB_RESERVE) { 8884 node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); 8885 if (!(node->mn_flags & F_BIGDATA)) 8886 newdata->mv_data = NODEDATA(node); 8887 } 8888 } else { 8889 if (newindx >= split_indx) { 8890 mc->mc_pg[mc->mc_top] = rp; 8891 mc->mc_ki[ptop]++; 8892 /* Make sure mc_ki is still valid. 
8893 */ 8894 if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && 8895 mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { 8896 for (i=0; i<=ptop; i++) { 8897 mc->mc_pg[i] = mn.mc_pg[i]; 8898 mc->mc_ki[i] = mn.mc_ki[i]; 8899 } 8900 } 8901 } 8902 } 8903 8904 { 8905 /* Adjust other cursors pointing to mp */ 8906 MDB_cursor *m2, *m3; 8907 MDB_dbi dbi = mc->mc_dbi; 8908 nkeys = NUMKEYS(mp); 8909 8910 for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { 8911 if (mc->mc_flags & C_SUB) 8912 m3 = &m2->mc_xcursor->mx_cursor; 8913 else 8914 m3 = m2; 8915 if (m3 == mc) 8916 continue; 8917 if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) 8918 continue; 8919 if (new_root) { 8920 int k; 8921 /* sub cursors may be on different DB */ 8922 if (m3->mc_pg[0] != mp) 8923 continue; 8924 /* root split */ 8925 for (k=new_root; k>=0; k--) { 8926 m3->mc_ki[k+1] = m3->mc_ki[k]; 8927 m3->mc_pg[k+1] = m3->mc_pg[k]; 8928 } 8929 if (m3->mc_ki[0] >= nkeys) { 8930 m3->mc_ki[0] = 1; 8931 } else { 8932 m3->mc_ki[0] = 0; 8933 } 8934 m3->mc_pg[0] = mc->mc_pg[0]; 8935 m3->mc_snum++; 8936 m3->mc_top++; 8937 } 8938 if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp) { 8939 if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & MDB_SPLIT_REPLACE)) 8940 m3->mc_ki[mc->mc_top]++; 8941 if (m3->mc_ki[mc->mc_top] >= nkeys) { 8942 m3->mc_pg[mc->mc_top] = rp; 8943 m3->mc_ki[mc->mc_top] -= nkeys; 8944 for (i=0; i<mc->mc_top; i++) { 8945 m3->mc_ki[i] = mn.mc_ki[i]; 8946 m3->mc_pg[i] = mn.mc_pg[i]; 8947 } 8948 } 8949 } else if (!did_split && m3->mc_top >= ptop && m3->mc_pg[ptop] == mc->mc_pg[ptop] && 8950 m3->mc_ki[ptop] >= mc->mc_ki[ptop]) { 8951 m3->mc_ki[ptop]++; 8952 } 8953 if (XCURSOR_INITED(m3) && IS_LEAF(mp)) 8954 XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); 8955 } 8956 } 8957 DPRINTF(("mp left: %d, rp left: %d", SIZELEFT(mp), SIZELEFT(rp))); 8958 8959 done: 8960 if (copy) /* tmp page */ 8961 mdb_page_free(env, copy); 8962 if (rc) 8963 mc->mc_txn->mt_flags |= MDB_TXN_ERROR; 8964 return rc; 8965 } 8966 8967 int 8968 mdb_put(MDB_txn *txn, MDB_dbi dbi, 8969 MDB_val *key, MDB_val *data, unsigned int flags) 8970 { 8971 MDB_cursor mc; 8972 MDB_xcursor mx; 8973 int rc; 8974 8975 if (!key || !data || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) 8976 return EINVAL; 8977 8978 if (flags & ~(MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND|MDB_APPENDDUP)) 8979 return EINVAL; 8980 8981 if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED)) 8982 return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; 8983 8984 mdb_cursor_init(&mc, txn, dbi, &mx); 8985 mc.mc_next = txn->mt_cursors[dbi]; 8986 txn->mt_cursors[dbi] = &mc; 8987 rc = mdb_cursor_put(&mc, key, data, flags); 8988 txn->mt_cursors[dbi] = mc.mc_next; 8989 return rc; 8990 } 8991 8992 #ifndef MDB_WBUF 8993 #define MDB_WBUF (1024*1024) 8994 #endif 8995 #define MDB_EOF 0x10 /**< #mdb_env_copyfd1() is done reading */ 8996 8997 /** State needed for a double-buffering compacting copy. */ 8998 typedef struct mdb_copy { 8999 MDB_env *mc_env; 9000 MDB_txn *mc_txn; 9001 pthread_mutex_t mc_mutex; 9002 pthread_cond_t mc_cond; /**< Condition variable for #mc_new */ 9003 char *mc_wbuf[2]; 9004 char *mc_over[2]; 9005 int mc_wlen[2]; 9006 int mc_olen[2]; 9007 pgno_t mc_next_pgno; 9008 HANDLE mc_fd; 9009 int mc_toggle; /**< Buffer number in provider */ 9010 int mc_new; /**< (0-2 buffers to write) | (#MDB_EOF at end) */ 9011 /** Error code. Never cleared if set. Both threads can set nonzero 9012 * to fail the copy. Not mutex-protected, LMDB expects atomic int. 
9013 */ 9014 volatile int mc_error; 9015 } mdb_copy; 9016 9017 /** Dedicated writer thread for compacting copy. */ 9018 static THREAD_RET ESECT CALL_CONV 9019 mdb_env_copythr(void *arg) 9020 { 9021 mdb_copy *my = arg; 9022 char *ptr; 9023 int toggle = 0, wsize, rc; 9024 #ifdef _WIN32 9025 DWORD len; 9026 #define DO_WRITE(rc, fd, ptr, w2, len) rc = WriteFile(fd, ptr, w2, &len, NULL) 9027 #else 9028 int len; 9029 #define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0) 9030 #ifdef SIGPIPE 9031 sigset_t set; 9032 sigemptyset(&set); 9033 sigaddset(&set, SIGPIPE); 9034 if ((rc = pthread_sigmask(SIG_BLOCK, &set, NULL)) != 0) 9035 my->mc_error = rc; 9036 #endif 9037 #endif 9038 9039 pthread_mutex_lock(&my->mc_mutex); 9040 for(;;) { 9041 while (!my->mc_new) 9042 pthread_cond_wait(&my->mc_cond, &my->mc_mutex); 9043 if (my->mc_new == 0 + MDB_EOF) /* 0 buffers, just EOF */ 9044 break; 9045 wsize = my->mc_wlen[toggle]; 9046 ptr = my->mc_wbuf[toggle]; 9047 again: 9048 rc = MDB_SUCCESS; 9049 while (wsize > 0 && !my->mc_error) { 9050 DO_WRITE(rc, my->mc_fd, ptr, wsize, len); 9051 if (!rc) { 9052 rc = ErrCode(); 9053 #if defined(SIGPIPE) && !defined(_WIN32) 9054 if (rc == EPIPE) { 9055 /* Collect the pending SIGPIPE, otherwise at least OS X 9056 * gives it to the process on thread-exit (ITS#8504). 9057 */ 9058 int tmp; 9059 sigwait(&set, &tmp); 9060 } 9061 #endif 9062 break; 9063 } else if (len > 0) { 9064 rc = MDB_SUCCESS; 9065 ptr += len; 9066 wsize -= len; 9067 continue; 9068 } else { 9069 rc = EIO; 9070 break; 9071 } 9072 } 9073 if (rc) { 9074 my->mc_error = rc; 9075 } 9076 /* If there's an overflow page tail, write it too */ 9077 if (my->mc_olen[toggle]) { 9078 wsize = my->mc_olen[toggle]; 9079 ptr = my->mc_over[toggle]; 9080 my->mc_olen[toggle] = 0; 9081 goto again; 9082 } 9083 my->mc_wlen[toggle] = 0; 9084 toggle ^= 1; 9085 /* Return the empty buffer to provider */ 9086 my->mc_new--; 9087 pthread_cond_signal(&my->mc_cond); 9088 } 9089 pthread_mutex_unlock(&my->mc_mutex); 9090 return (THREAD_RET)0; 9091 #undef DO_WRITE 9092 } 9093 9094 /** Give buffer and/or #MDB_EOF to writer thread, await unused buffer. 9095 * 9096 * @param[in] my control structure. 9097 * @param[in] adjust (1 to hand off 1 buffer) | (MDB_EOF when ending). 9098 */ 9099 static int ESECT 9100 mdb_env_cthr_toggle(mdb_copy *my, int adjust) 9101 { 9102 pthread_mutex_lock(&my->mc_mutex); 9103 my->mc_new += adjust; 9104 pthread_cond_signal(&my->mc_cond); 9105 while (my->mc_new & 2) /* both buffers in use */ 9106 pthread_cond_wait(&my->mc_cond, &my->mc_mutex); 9107 pthread_mutex_unlock(&my->mc_mutex); 9108 9109 my->mc_toggle ^= (adjust & 1); 9110 /* Both threads reset mc_wlen, to be safe from threading errors */ 9111 my->mc_wlen[my->mc_toggle] = 0; 9112 return my->mc_error; 9113 } 9114 9115 /** Depth-first tree traversal for compacting copy. 9116 * @param[in] my control structure. 9117 * @param[in,out] pg database root. 9118 * @param[in] flags includes #F_DUPDATA if it is a sorted-duplicate sub-DB. 
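 * @note Pages are emitted in depth-first order and renumbered sequentially,
 * which is what makes the resulting copy compact. In the public API this
 * walk is normally reached through mdb_env_copy2() with the #MDB_CP_COMPACT
 * flag, e.g. (path illustrative):
 * @code
 *	rc = mdb_env_copy2(env, "/path/to/backup", MDB_CP_COMPACT);
 * @endcode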

/** Depth-first tree traversal for compacting copy.
 * @param[in] my control structure.
 * @param[in,out] pg database root.
 * @param[in] flags includes #F_DUPDATA if it is a sorted-duplicate sub-DB.
 */
static int ESECT
mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags)
{
	MDB_cursor mc = {0};
	MDB_node *ni;
	MDB_page *mo, *mp, *leaf;
	char *buf, *ptr;
	int rc, toggle;
	unsigned int i;

	/* Empty DB, nothing to do */
	if (*pg == P_INVALID)
		return MDB_SUCCESS;

	mc.mc_snum = 1;
	mc.mc_txn = my->mc_txn;

	rc = mdb_page_get(&mc, *pg, &mc.mc_pg[0], NULL);
	if (rc)
		return rc;
	rc = mdb_page_search_root(&mc, NULL, MDB_PS_FIRST);
	if (rc)
		return rc;

	/* Make cursor pages writable */
	buf = ptr = malloc(my->mc_env->me_psize * mc.mc_snum);
	if (buf == NULL)
		return ENOMEM;

	for (i=0; i<mc.mc_top; i++) {
		mdb_page_copy((MDB_page *)ptr, mc.mc_pg[i], my->mc_env->me_psize);
		mc.mc_pg[i] = (MDB_page *)ptr;
		ptr += my->mc_env->me_psize;
	}

	/* This is writable space for a leaf page. Usually not needed. */
	leaf = (MDB_page *)ptr;

	toggle = my->mc_toggle;
	while (mc.mc_snum > 0) {
		unsigned n;
		mp = mc.mc_pg[mc.mc_top];
		n = NUMKEYS(mp);

		if (IS_LEAF(mp)) {
			if (!IS_LEAF2(mp) && !(flags & F_DUPDATA)) {
				for (i=0; i<n; i++) {
					ni = NODEPTR(mp, i);
					if (ni->mn_flags & F_BIGDATA) {
						MDB_page *omp;
						pgno_t pg;

						/* Need writable leaf */
						if (mp != leaf) {
							mc.mc_pg[mc.mc_top] = leaf;
							mdb_page_copy(leaf, mp, my->mc_env->me_psize);
							mp = leaf;
							ni = NODEPTR(mp, i);
						}

						memcpy(&pg, NODEDATA(ni), sizeof(pg));
						memcpy(NODEDATA(ni), &my->mc_next_pgno, sizeof(pgno_t));
						rc = mdb_page_get(&mc, pg, &omp, NULL);
						if (rc)
							goto done;
						if (my->mc_wlen[toggle] >= MDB_WBUF) {
							rc = mdb_env_cthr_toggle(my, 1);
							if (rc)
								goto done;
							toggle = my->mc_toggle;
						}
						mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]);
						memcpy(mo, omp, my->mc_env->me_psize);
						mo->mp_pgno = my->mc_next_pgno;
						my->mc_next_pgno += omp->mp_pages;
						my->mc_wlen[toggle] += my->mc_env->me_psize;
						if (omp->mp_pages > 1) {
							my->mc_olen[toggle] = my->mc_env->me_psize * (omp->mp_pages - 1);
							my->mc_over[toggle] = (char *)omp + my->mc_env->me_psize;
							rc = mdb_env_cthr_toggle(my, 1);
							if (rc)
								goto done;
							toggle = my->mc_toggle;
						}
					} else if (ni->mn_flags & F_SUBDATA) {
						MDB_db db;

						/* Need writable leaf */
						if (mp != leaf) {
							mc.mc_pg[mc.mc_top] = leaf;
							mdb_page_copy(leaf, mp, my->mc_env->me_psize);
							mp = leaf;
							ni = NODEPTR(mp, i);
						}

						memcpy(&db, NODEDATA(ni), sizeof(db));
						my->mc_toggle = toggle;
						rc = mdb_env_cwalk(my, &db.md_root, ni->mn_flags & F_DUPDATA);
						if (rc)
							goto done;
						toggle = my->mc_toggle;
						memcpy(NODEDATA(ni), &db, sizeof(db));
					}
				}
			}
		} else {
			mc.mc_ki[mc.mc_top]++;
			if (mc.mc_ki[mc.mc_top] < n) {
				pgno_t pg;
again:
				ni = NODEPTR(mp, mc.mc_ki[mc.mc_top]);
				pg = NODEPGNO(ni);
				rc = mdb_page_get(&mc, pg, &mp, NULL);
				if (rc)
					goto done;
				mc.mc_top++;
				mc.mc_snum++;
				mc.mc_ki[mc.mc_top] = 0;
				if (IS_BRANCH(mp)) {
					/* Whenever we advance to a sibling branch page,
					 * we must proceed all the way down to its first leaf.
					 */
					mdb_page_copy(mc.mc_pg[mc.mc_top], mp, my->mc_env->me_psize);
					goto again;
				} else
					mc.mc_pg[mc.mc_top] = mp;
				continue;
			}
		}
		if (my->mc_wlen[toggle] >= MDB_WBUF) {
			rc = mdb_env_cthr_toggle(my, 1);
			if (rc)
				goto done;
			toggle = my->mc_toggle;
		}
		mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]);
		mdb_page_copy(mo, mp, my->mc_env->me_psize);
		mo->mp_pgno = my->mc_next_pgno++;
		my->mc_wlen[toggle] += my->mc_env->me_psize;
		if (mc.mc_top) {
			/* Update parent if there is one */
			ni = NODEPTR(mc.mc_pg[mc.mc_top-1], mc.mc_ki[mc.mc_top-1]);
			SETPGNO(ni, mo->mp_pgno);
			mdb_cursor_pop(&mc);
		} else {
			/* Otherwise we're done */
			*pg = mo->mp_pgno;
			break;
		}
	}
done:
	free(buf);
	return rc;
}

/** Copy environment with compaction. */
static int ESECT
mdb_env_copyfd1(MDB_env *env, HANDLE fd)
{
	MDB_meta *mm;
	MDB_page *mp;
	mdb_copy my = {0};
	MDB_txn *txn = NULL;
	pthread_t thr;
	pgno_t root, new_root;
	int rc = MDB_SUCCESS;

#ifdef _WIN32
	if (!(my.mc_mutex = CreateMutex(NULL, FALSE, NULL)) ||
		!(my.mc_cond = CreateEvent(NULL, FALSE, FALSE, NULL))) {
		rc = ErrCode();
		goto done;
	}
	my.mc_wbuf[0] = _aligned_malloc(MDB_WBUF*2, env->me_os_psize);
	if (my.mc_wbuf[0] == NULL) {
		/* _aligned_malloc() sets errno, but we use Windows error codes */
		rc = ERROR_NOT_ENOUGH_MEMORY;
		goto done;
	}
#else
	if ((rc = pthread_mutex_init(&my.mc_mutex, NULL)) != 0)
		return rc;
	if ((rc = pthread_cond_init(&my.mc_cond, NULL)) != 0)
		goto done2;
#ifdef HAVE_MEMALIGN
	my.mc_wbuf[0] = memalign(env->me_os_psize, MDB_WBUF*2);
	if (my.mc_wbuf[0] == NULL) {
		rc = errno;
		goto done;
	}
#else
	{
		void *p;
		if ((rc = posix_memalign(&p, env->me_os_psize, MDB_WBUF*2)) != 0)
			goto done;
		my.mc_wbuf[0] = p;
	}
#endif
#endif
	memset(my.mc_wbuf[0], 0, MDB_WBUF*2);
	my.mc_wbuf[1] = my.mc_wbuf[0] + MDB_WBUF;
	my.mc_next_pgno = NUM_METAS;
	my.mc_env = env;
	my.mc_fd = fd;
	rc = THREAD_CREATE(thr, mdb_env_copythr, &my);
	if (rc)
		goto done;

	rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn);
	if (rc)
		goto finish;

	mp = (MDB_page *)my.mc_wbuf[0];
	memset(mp, 0, NUM_METAS * env->me_psize);
	mp->mp_pgno = 0;
	mp->mp_flags = P_META;
	mm = (MDB_meta *)METADATA(mp);
	mdb_env_init_meta0(env, mm);
	mm->mm_address = env->me_metas[0]->mm_address;

	mp = (MDB_page *)(my.mc_wbuf[0] + env->me_psize);
	mp->mp_pgno = 1;
	mp->mp_flags = P_META;
	*(MDB_meta *)METADATA(mp) = *mm;
	mm = (MDB_meta *)METADATA(mp);

	/* Set metapage 1 with current main DB */
	root = new_root = txn->mt_dbs[MAIN_DBI].md_root;
	if (root != P_INVALID) {
		/* Count free pages + freeDB pages. Subtract from last_pg
		 * to find the new last_pg, which also becomes the new root.
		 */
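		/* Illustrative numbers (not from the source): if mt_next_pgno is
		 * 1000, the freeDB entries record 150 reusable pages and the freeDB
		 * tree itself uses 8 branch/leaf/overflow pages, then freecount is
		 * 158 and new_root = 1000 - 1 - 158 = 841, which becomes both
		 * mm_last_pg and the root of the copied main DB.
		 */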
		MDB_ID freecount = 0;
		MDB_cursor mc;
		MDB_val key, data;
		mdb_cursor_init(&mc, txn, FREE_DBI, NULL);
		while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0)
			freecount += *(MDB_ID *)data.mv_data;
		if (rc != MDB_NOTFOUND)
			goto finish;
		freecount += txn->mt_dbs[FREE_DBI].md_branch_pages +
			txn->mt_dbs[FREE_DBI].md_leaf_pages +
			txn->mt_dbs[FREE_DBI].md_overflow_pages;

		new_root = txn->mt_next_pgno - 1 - freecount;
		mm->mm_last_pg = new_root;
		mm->mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI];
		mm->mm_dbs[MAIN_DBI].md_root = new_root;
	} else {
		/* When the DB is empty, handle it specially to
		 * fix any breakage like page leaks from ITS#8174.
		 */
		mm->mm_dbs[MAIN_DBI].md_flags = txn->mt_dbs[MAIN_DBI].md_flags;
	}
	if (root != P_INVALID || mm->mm_dbs[MAIN_DBI].md_flags) {
		mm->mm_txnid = 1; /* use metapage 1 */
	}

	my.mc_wlen[0] = env->me_psize * NUM_METAS;
	my.mc_txn = txn;
	rc = mdb_env_cwalk(&my, &root, 0);
	if (rc == MDB_SUCCESS && root != new_root) {
		rc = MDB_INCOMPATIBLE; /* page leak or corrupt DB */
	}

finish:
	if (rc)
		my.mc_error = rc;
	mdb_env_cthr_toggle(&my, 1 | MDB_EOF);
	rc = THREAD_FINISH(thr);
	mdb_txn_abort(txn);

done:
#ifdef _WIN32
	if (my.mc_wbuf[0]) _aligned_free(my.mc_wbuf[0]);
	if (my.mc_cond) CloseHandle(my.mc_cond);
	if (my.mc_mutex) CloseHandle(my.mc_mutex);
#else
	free(my.mc_wbuf[0]);
	pthread_cond_destroy(&my.mc_cond);
done2:
	pthread_mutex_destroy(&my.mc_mutex);
#endif
	return rc ? rc : my.mc_error;
}

/** Copy environment as-is. */
static int ESECT
mdb_env_copyfd0(MDB_env *env, HANDLE fd)
{
	MDB_txn *txn = NULL;
	mdb_mutexref_t wmutex = NULL;
	int rc;
	size_t wsize, w3;
	char *ptr;
#ifdef _WIN32
	DWORD len, w2;
#define DO_WRITE(rc, fd, ptr, w2, len) rc = WriteFile(fd, ptr, w2, &len, NULL)
#else
	ssize_t len;
	size_t w2;
#define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0)
#endif

	/* Do the lock/unlock of the reader mutex before starting the
	 * write txn. Otherwise other read txns could block writers.
	 */
	rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn);
	if (rc)
		return rc;

	if (env->me_txns) {
		/* We must start the actual read txn after blocking writers */
		mdb_txn_end(txn, MDB_END_RESET_TMP);

		/* Temporarily block writers until we snapshot the meta pages */
		wmutex = env->me_wmutex;
		if (LOCK_MUTEX(rc, env, wmutex))
			goto leave;

		rc = mdb_txn_renew0(txn);
		if (rc) {
			UNLOCK_MUTEX(wmutex);
			goto leave;
		}
	}

	wsize = env->me_psize * NUM_METAS;
	ptr = env->me_map;
	w2 = wsize;
	while (w2 > 0) {
		DO_WRITE(rc, fd, ptr, w2, len);
		if (!rc) {
			rc = ErrCode();
			break;
		} else if (len > 0) {
			rc = MDB_SUCCESS;
			ptr += len;
			w2 -= len;
			continue;
		} else {
			/* Non-blocking or async handles are not supported */
			rc = EIO;
			break;
		}
	}
	if (wmutex)
		UNLOCK_MUTEX(wmutex);

	if (rc)
		goto leave;

	w3 = txn->mt_next_pgno * env->me_psize;
	{
		size_t fsize = 0;
		if ((rc = mdb_fsize(env->me_fd, &fsize)))
			goto leave;
		if (w3 > fsize)
			w3 = fsize;
	}
	wsize = w3 - wsize;
	while (wsize > 0) {
		if (wsize > MAX_WRITE)
			w2 = MAX_WRITE;
		else
			w2 = wsize;
		DO_WRITE(rc, fd, ptr, w2, len);
		if (!rc) {
			rc = ErrCode();
			break;
		} else if (len > 0) {
			rc = MDB_SUCCESS;
			ptr += len;
			wsize -= len;
			continue;
		} else {
			rc = EIO;
			break;
		}
	}

leave:
	mdb_txn_abort(txn);
	return rc;
}

int ESECT
mdb_env_copyfd2(MDB_env *env, HANDLE fd, unsigned int flags)
{
	if (flags & MDB_CP_COMPACT)
		return mdb_env_copyfd1(env, fd);
	else
		return mdb_env_copyfd0(env, fd);
}

int ESECT
mdb_env_copyfd(MDB_env *env, HANDLE fd)
{
	return mdb_env_copyfd2(env, fd, 0);
}

int ESECT
mdb_env_copy2(MDB_env *env, const char *path, unsigned int flags)
{
	int rc;
	MDB_name fname;
	HANDLE newfd = INVALID_HANDLE_VALUE;

	rc = mdb_fname_init(path, env->me_flags | MDB_NOLOCK, &fname);
	if (rc == MDB_SUCCESS) {
		rc = mdb_fopen(env, &fname, MDB_O_COPY, 0666, &newfd);
		mdb_fname_destroy(fname);
	}
	if (rc == MDB_SUCCESS) {
		rc = mdb_env_copyfd2(env, newfd, flags);
		if (close(newfd) < 0 && rc == MDB_SUCCESS)
			rc = ErrCode();
	}
	return rc;
}

int ESECT
mdb_env_copy(MDB_env *env, const char *path)
{
	return mdb_env_copy2(env, path, 0);
}
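
/* Illustrative usage sketch (not part of LMDB, not compiled): taking a hot
 * backup of an open environment.  MDB_CP_COMPACT selects the compacting copy
 * path (mdb_env_copyfd1) above; passing 0 instead would use the straight
 * page-by-page copy.  The destination path is an assumed example value and,
 * unless the environment uses MDB_NOSUBDIR, must name an existing empty
 * directory.
 */
#if 0
static int example_backup(MDB_env *env)
{
	return mdb_env_copy2(env, "/tmp/lmdb-backup", MDB_CP_COMPACT);
}
#endif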

int ESECT
mdb_env_set_flags(MDB_env *env, unsigned int flag, int onoff)
{
	if (flag & ~CHANGEABLE)
		return EINVAL;
	if (onoff)
		env->me_flags |= flag;
	else
		env->me_flags &= ~flag;
	return MDB_SUCCESS;
}

int ESECT
mdb_env_get_flags(MDB_env *env, unsigned int *arg)
{
	if (!env || !arg)
		return EINVAL;

	*arg = env->me_flags & (CHANGEABLE|CHANGELESS);
	return MDB_SUCCESS;
}

int ESECT
mdb_env_set_userctx(MDB_env *env, void *ctx)
{
	if (!env)
		return EINVAL;
	env->me_userctx = ctx;
	return MDB_SUCCESS;
}

void * ESECT
mdb_env_get_userctx(MDB_env *env)
{
	return env ? env->me_userctx : NULL;
}

int ESECT
mdb_env_set_assert(MDB_env *env, MDB_assert_func *func)
{
	if (!env)
		return EINVAL;
#ifndef NDEBUG
	env->me_assert_func = func;
#endif
	return MDB_SUCCESS;
}

int ESECT
mdb_env_get_path(MDB_env *env, const char **arg)
{
	if (!env || !arg)
		return EINVAL;

	*arg = env->me_path;
	return MDB_SUCCESS;
}

int ESECT
mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *arg)
{
	if (!env || !arg)
		return EINVAL;

	*arg = env->me_fd;
	return MDB_SUCCESS;
}

/** Common code for #mdb_stat() and #mdb_env_stat().
 * @param[in] env the environment to operate in.
 * @param[in] db the #MDB_db record containing the stats to return.
 * @param[out] arg the address of an #MDB_stat structure to receive the stats.
 * @return 0, this function always succeeds.
 */
static int ESECT
mdb_stat0(MDB_env *env, MDB_db *db, MDB_stat *arg)
{
	arg->ms_psize = env->me_psize;
	arg->ms_depth = db->md_depth;
	arg->ms_branch_pages = db->md_branch_pages;
	arg->ms_leaf_pages = db->md_leaf_pages;
	arg->ms_overflow_pages = db->md_overflow_pages;
	arg->ms_entries = db->md_entries;

	return MDB_SUCCESS;
}

int ESECT
mdb_env_stat(MDB_env *env, MDB_stat *arg)
{
	MDB_meta *meta;

	if (env == NULL || arg == NULL)
		return EINVAL;

	meta = mdb_env_pick_meta(env);

	return mdb_stat0(env, &meta->mm_dbs[MAIN_DBI], arg);
}

int ESECT
mdb_env_info(MDB_env *env, MDB_envinfo *arg)
{
	MDB_meta *meta;

	if (env == NULL || arg == NULL)
		return EINVAL;

	meta = mdb_env_pick_meta(env);
	arg->me_mapaddr = meta->mm_address;
	arg->me_last_pgno = meta->mm_last_pg;
	arg->me_last_txnid = meta->mm_txnid;

	arg->me_mapsize = env->me_mapsize;
	arg->me_maxreaders = env->me_maxreaders;
	arg->me_numreaders = env->me_txns ? env->me_txns->mti_numreaders : 0;
	return MDB_SUCCESS;
}
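
/* Illustrative usage sketch (not part of LMDB, not compiled): reading the
 * statistics and environment info exposed above, e.g. to report the entry
 * count of the main DB and how much of the map has been used.
 */
#if 0
static void example_report(MDB_env *env)
{
	MDB_stat st;
	MDB_envinfo info;

	if (mdb_env_stat(env, &st) == MDB_SUCCESS)
		printf("main DB: %zu entries, tree depth %u\n",
			st.ms_entries, st.ms_depth);
	if (mdb_env_info(env, &info) == MDB_SUCCESS)
		printf("map size %zu bytes, last used page %zu\n",
			info.me_mapsize, info.me_last_pgno);
}
#endif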

/** Set the default comparison functions for a database.
 * Called immediately after a database is opened to set the defaults.
 * The user can then override them with #mdb_set_compare() or
 * #mdb_set_dupsort().
 * @param[in] txn A transaction handle returned by #mdb_txn_begin()
 * @param[in] dbi A database handle returned by #mdb_dbi_open()
 */
static void
mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi)
{
	uint16_t f = txn->mt_dbs[dbi].md_flags;

	txn->mt_dbxs[dbi].md_cmp =
		(f & MDB_REVERSEKEY) ? mdb_cmp_memnr :
		(f & MDB_INTEGERKEY) ? mdb_cmp_cint : mdb_cmp_memn;

	txn->mt_dbxs[dbi].md_dcmp =
		!(f & MDB_DUPSORT) ? 0 :
		((f & MDB_INTEGERDUP)
		 ? ((f & MDB_DUPFIXED) ? mdb_cmp_int : mdb_cmp_cint)
		 : ((f & MDB_REVERSEDUP) ? mdb_cmp_memnr : mdb_cmp_memn));
}

int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *dbi)
{
	MDB_val key, data;
	MDB_dbi i;
	MDB_cursor mc;
	MDB_db dummy;
	int rc, dbflag, exact;
	unsigned int unused = 0, seq;
	char *namedup;
	size_t len;

	if (flags & ~VALID_FLAGS)
		return EINVAL;
	if (txn->mt_flags & MDB_TXN_BLOCKED)
		return MDB_BAD_TXN;

	/* main DB? */
	if (!name) {
		*dbi = MAIN_DBI;
		if (flags & PERSISTENT_FLAGS) {
			uint16_t f2 = flags & PERSISTENT_FLAGS;
			/* make sure flag changes get committed */
			if ((txn->mt_dbs[MAIN_DBI].md_flags | f2) != txn->mt_dbs[MAIN_DBI].md_flags) {
				txn->mt_dbs[MAIN_DBI].md_flags |= f2;
				txn->mt_flags |= MDB_TXN_DIRTY;
			}
		}
		mdb_default_cmp(txn, MAIN_DBI);
		return MDB_SUCCESS;
	}

	if (txn->mt_dbxs[MAIN_DBI].md_cmp == NULL) {
		mdb_default_cmp(txn, MAIN_DBI);
	}

	/* Is the DB already open? */
	len = strlen(name);
	for (i=CORE_DBS; i<txn->mt_numdbs; i++) {
		if (!txn->mt_dbxs[i].md_name.mv_size) {
			/* Remember this free slot */
			if (!unused) unused = i;
			continue;
		}
		if (len == txn->mt_dbxs[i].md_name.mv_size &&
			!strncmp(name, txn->mt_dbxs[i].md_name.mv_data, len)) {
			*dbi = i;
			return MDB_SUCCESS;
		}
	}

	/* If no free slot and max hit, fail */
	if (!unused && txn->mt_numdbs >= txn->mt_env->me_maxdbs)
		return MDB_DBS_FULL;

	/* Cannot mix named databases with some mainDB flags */
	if (txn->mt_dbs[MAIN_DBI].md_flags & (MDB_DUPSORT|MDB_INTEGERKEY))
		return (flags & MDB_CREATE) ? MDB_INCOMPATIBLE : MDB_NOTFOUND;

	/* Find the DB info */
	dbflag = DB_NEW|DB_VALID|DB_USRVALID;
	exact = 0;
	key.mv_size = len;
	key.mv_data = (void *)name;
	mdb_cursor_init(&mc, txn, MAIN_DBI, NULL);
	rc = mdb_cursor_set(&mc, &key, &data, MDB_SET, &exact);
	if (rc == MDB_SUCCESS) {
		/* make sure this is actually a DB */
		MDB_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]);
		if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) != F_SUBDATA)
			return MDB_INCOMPATIBLE;
	} else if (! (rc == MDB_NOTFOUND && (flags & MDB_CREATE))) {
		return rc;
	}

	/* Done here so we cannot fail after creating a new DB */
	if ((namedup = strdup(name)) == NULL)
		return ENOMEM;

	if (rc) {
		/* MDB_NOTFOUND and MDB_CREATE: Create new DB */
		data.mv_size = sizeof(MDB_db);
		data.mv_data = &dummy;
		memset(&dummy, 0, sizeof(dummy));
		dummy.md_root = P_INVALID;
		dummy.md_flags = flags & PERSISTENT_FLAGS;
		WITH_CURSOR_TRACKING(mc,
			rc = mdb_cursor_put(&mc, &key, &data, F_SUBDATA));
		dbflag |= DB_DIRTY;
	}

	if (rc) {
		free(namedup);
	} else {
		/* Got info, register DBI in this txn */
		unsigned int slot = unused ? unused : txn->mt_numdbs;
		txn->mt_dbxs[slot].md_name.mv_data = namedup;
		txn->mt_dbxs[slot].md_name.mv_size = len;
		txn->mt_dbxs[slot].md_rel = NULL;
		txn->mt_dbflags[slot] = dbflag;
		/* txn-> and env-> are the same in read txns, use
		 * tmp variable to avoid undefined assignment
		 */
		seq = ++txn->mt_env->me_dbiseqs[slot];
		txn->mt_dbiseqs[slot] = seq;

		memcpy(&txn->mt_dbs[slot], data.mv_data, sizeof(MDB_db));
		*dbi = slot;
		mdb_default_cmp(txn, slot);
		if (!unused) {
			txn->mt_numdbs++;
		}
	}

	return rc;
}
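
/* Illustrative usage sketch (not part of LMDB, not compiled): opening a named
 * sub-database with MDB_CREATE.  This requires that mdb_env_set_maxdbs() was
 * called with a large enough value before mdb_env_open(); the name "accounts"
 * is an assumed example value.
 */
#if 0
static int example_open_named(MDB_env *env, MDB_dbi *out)
{
	MDB_txn *txn;
	int rc;

	if ((rc = mdb_txn_begin(env, NULL, 0, &txn)) != MDB_SUCCESS)
		return rc;
	rc = mdb_dbi_open(txn, "accounts", MDB_CREATE, out);
	if (rc == MDB_SUCCESS)
		rc = mdb_txn_commit(txn);	/* commit keeps the DBI handle valid */
	else
		mdb_txn_abort(txn);
	return rc;
}
#endif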

int ESECT
mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *arg)
{
	if (!arg || !TXN_DBI_EXIST(txn, dbi, DB_VALID))
		return EINVAL;

	if (txn->mt_flags & MDB_TXN_BLOCKED)
		return MDB_BAD_TXN;

	if (txn->mt_dbflags[dbi] & DB_STALE) {
		MDB_cursor mc;
		MDB_xcursor mx;
		/* Stale, must read the DB's root. cursor_init does it for us. */
		mdb_cursor_init(&mc, txn, dbi, &mx);
	}
	return mdb_stat0(txn->mt_env, &txn->mt_dbs[dbi], arg);
}

void mdb_dbi_close(MDB_env *env, MDB_dbi dbi)
{
	char *ptr;
	if (dbi < CORE_DBS || dbi >= env->me_maxdbs)
		return;
	ptr = env->me_dbxs[dbi].md_name.mv_data;
	/* If there was no name, this was already closed */
	if (ptr) {
		env->me_dbxs[dbi].md_name.mv_data = NULL;
		env->me_dbxs[dbi].md_name.mv_size = 0;
		env->me_dbflags[dbi] = 0;
		env->me_dbiseqs[dbi]++;
		free(ptr);
	}
}

int mdb_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned int *flags)
{
	/* We could return the flags for the FREE_DBI too but what's the point? */
	if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))
		return EINVAL;
	*flags = txn->mt_dbs[dbi].md_flags & PERSISTENT_FLAGS;
	return MDB_SUCCESS;
}

/** Add all the DB's pages to the free list.
 * @param[in] mc Cursor on the DB to free.
 * @param[in] subs non-zero to check for sub-DBs in this DB.
 * @return 0 on success, non-zero on failure.
 */
static int
mdb_drop0(MDB_cursor *mc, int subs)
{
	int rc;

	rc = mdb_page_search(mc, NULL, MDB_PS_FIRST);
	if (rc == MDB_SUCCESS) {
		MDB_txn *txn = mc->mc_txn;
		MDB_node *ni;
		MDB_cursor mx;
		unsigned int i;

		/* DUPSORT sub-DBs have no ovpages/DBs. Omit scanning leaves.
		 * This also avoids any P_LEAF2 pages, which have no nodes.
		 * Also if the DB doesn't have sub-DBs and has no overflow
		 * pages, omit scanning leaves.
		 */
		if ((mc->mc_flags & C_SUB) ||
			(!subs && !mc->mc_db->md_overflow_pages))
			mdb_cursor_pop(mc);

		mdb_cursor_copy(mc, &mx);
		while (mc->mc_snum > 0) {
			MDB_page *mp = mc->mc_pg[mc->mc_top];
			unsigned n = NUMKEYS(mp);
			if (IS_LEAF(mp)) {
				for (i=0; i<n; i++) {
					ni = NODEPTR(mp, i);
					if (ni->mn_flags & F_BIGDATA) {
						MDB_page *omp;
						pgno_t pg;
						memcpy(&pg, NODEDATA(ni), sizeof(pg));
						rc = mdb_page_get(mc, pg, &omp, NULL);
						if (rc != 0)
							goto done;
						mdb_cassert(mc, IS_OVERFLOW(omp));
						rc = mdb_midl_append_range(&txn->mt_free_pgs,
							pg, omp->mp_pages);
						if (rc)
							goto done;
						mc->mc_db->md_overflow_pages -= omp->mp_pages;
						if (!mc->mc_db->md_overflow_pages && !subs)
							break;
					} else if (subs && (ni->mn_flags & F_SUBDATA)) {
						mdb_xcursor_init1(mc, ni);
						rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0);
						if (rc)
							goto done;
					}
				}
				if (!subs && !mc->mc_db->md_overflow_pages)
					goto pop;
			} else {
				if ((rc = mdb_midl_need(&txn->mt_free_pgs, n)) != 0)
					goto done;
				for (i=0; i<n; i++) {
					pgno_t pg;
					ni = NODEPTR(mp, i);
					pg = NODEPGNO(ni);
					/* free it */
					mdb_midl_xappend(txn->mt_free_pgs, pg);
				}
			}
			if (!mc->mc_top)
				break;
			mc->mc_ki[mc->mc_top] = i;
			rc = mdb_cursor_sibling(mc, 1);
			if (rc) {
				if (rc != MDB_NOTFOUND)
					goto done;
				/* no more siblings, go back to beginning
				 * of previous level.
				 */
pop:
				mdb_cursor_pop(mc);
				mc->mc_ki[0] = 0;
				for (i=1; i<mc->mc_snum; i++) {
					mc->mc_ki[i] = 0;
					mc->mc_pg[i] = mx.mc_pg[i];
				}
			}
		}
		/* free it */
		rc = mdb_midl_append(&txn->mt_free_pgs, mc->mc_db->md_root);
done:
		if (rc)
			txn->mt_flags |= MDB_TXN_ERROR;
	} else if (rc == MDB_NOTFOUND) {
		rc = MDB_SUCCESS;
	}
	mc->mc_flags &= ~C_INITIALIZED;
	return rc;
}

int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del)
{
	MDB_cursor *mc, *m2;
	int rc;

	if ((unsigned)del > 1 || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID))
		return EINVAL;

	if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))
		return EACCES;

	if (TXN_DBI_CHANGED(txn, dbi))
		return MDB_BAD_DBI;

	rc = mdb_cursor_open(txn, dbi, &mc);
	if (rc)
		return rc;

	rc = mdb_drop0(mc, mc->mc_db->md_flags & MDB_DUPSORT);
	/* Invalidate the dropped DB's cursors */
	for (m2 = txn->mt_cursors[dbi]; m2; m2 = m2->mc_next)
		m2->mc_flags &= ~(C_INITIALIZED|C_EOF);
	if (rc)
		goto leave;

	/* Can't delete the main DB */
	if (del && dbi >= CORE_DBS) {
		rc = mdb_del0(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL, F_SUBDATA);
		if (!rc) {
			txn->mt_dbflags[dbi] = DB_STALE;
			mdb_dbi_close(txn->mt_env, dbi);
		} else {
			txn->mt_flags |= MDB_TXN_ERROR;
		}
	} else {
		/* reset the DB record, mark it dirty */
		txn->mt_dbflags[dbi] |= DB_DIRTY;
		txn->mt_dbs[dbi].md_depth = 0;
		txn->mt_dbs[dbi].md_branch_pages = 0;
		txn->mt_dbs[dbi].md_leaf_pages = 0;
		txn->mt_dbs[dbi].md_overflow_pages = 0;
		txn->mt_dbs[dbi].md_entries = 0;
		txn->mt_dbs[dbi].md_root = P_INVALID;

		txn->mt_flags |= MDB_TXN_DIRTY;
	}
leave:
	mdb_cursor_close(mc);
	return rc;
}
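
/* Illustrative usage sketch (not part of LMDB, not compiled): emptying a
 * named database in a write transaction.  Passing 0 as the last argument
 * keeps the database and only frees its contents; passing 1 would also delete
 * the database record from the main DB and close its handle.
 */
#if 0
static int example_empty_db(MDB_env *env, MDB_dbi dbi)
{
	MDB_txn *txn;
	int rc;

	if ((rc = mdb_txn_begin(env, NULL, 0, &txn)) != MDB_SUCCESS)
		return rc;
	rc = mdb_drop(txn, dbi, 0);	/* 0: keep the DB, delete all entries */
	if (rc == MDB_SUCCESS)
		rc = mdb_txn_commit(txn);
	else
		mdb_txn_abort(txn);
	return rc;
}
#endif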

int mdb_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp)
{
	if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))
		return EINVAL;

	txn->mt_dbxs[dbi].md_cmp = cmp;
	return MDB_SUCCESS;
}

int mdb_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp)
{
	if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))
		return EINVAL;

	txn->mt_dbxs[dbi].md_dcmp = cmp;
	return MDB_SUCCESS;
}
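
/* Illustrative usage sketch (not part of LMDB, not compiled): installing a
 * custom key comparator with mdb_set_compare().  The comparator must define a
 * total order and, per the API contract, should be set in each transaction
 * before the database is otherwise accessed.  "example_len_cmp" is a
 * hypothetical order: shorter keys first, ties broken by byte comparison.
 */
#if 0
static int example_len_cmp(const MDB_val *a, const MDB_val *b)
{
	if (a->mv_size != b->mv_size)
		return a->mv_size < b->mv_size ? -1 : 1;
	return memcmp(a->mv_data, b->mv_data, a->mv_size);
}

static int example_use_cmp(MDB_txn *txn, MDB_dbi dbi)
{
	return mdb_set_compare(txn, dbi, example_len_cmp);
}
#endif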
10059 "%10d %"Z"x -\n" : "%10d %"Z"x %"Z"u\n", 10060 (int)mr[i].mr_pid, (size_t)mr[i].mr_tid, txnid); 10061 if (first) { 10062 first = 0; 10063 rc = func(" pid thread txnid\n", ctx); 10064 if (rc < 0) 10065 break; 10066 } 10067 rc = func(buf, ctx); 10068 if (rc < 0) 10069 break; 10070 } 10071 } 10072 if (first) { 10073 rc = func("(no active readers)\n", ctx); 10074 } 10075 return rc; 10076 } 10077 10078 /** Insert pid into list if not already present. 10079 * return -1 if already present. 10080 */ 10081 static int ESECT 10082 mdb_pid_insert(MDB_PID_T *ids, MDB_PID_T pid) 10083 { 10084 /* binary search of pid in list */ 10085 unsigned base = 0; 10086 unsigned cursor = 1; 10087 int val = 0; 10088 unsigned n = ids[0]; 10089 10090 while( 0 < n ) { 10091 unsigned pivot = n >> 1; 10092 cursor = base + pivot + 1; 10093 val = pid - ids[cursor]; 10094 10095 if( val < 0 ) { 10096 n = pivot; 10097 10098 } else if ( val > 0 ) { 10099 base = cursor; 10100 n -= pivot + 1; 10101 10102 } else { 10103 /* found, so it's a duplicate */ 10104 return -1; 10105 } 10106 } 10107 10108 if( val > 0 ) { 10109 ++cursor; 10110 } 10111 ids[0]++; 10112 for (n = ids[0]; n > cursor; n--) 10113 ids[n] = ids[n-1]; 10114 ids[n] = pid; 10115 return 0; 10116 } 10117 10118 int ESECT 10119 mdb_reader_check(MDB_env *env, int *dead) 10120 { 10121 if (!env) 10122 return EINVAL; 10123 if (dead) 10124 *dead = 0; 10125 return env->me_txns ? mdb_reader_check0(env, 0, dead) : MDB_SUCCESS; 10126 } 10127 10128 /** As #mdb_reader_check(). \b rlocked is set if caller locked #me_rmutex. */ 10129 static int ESECT 10130 mdb_reader_check0(MDB_env *env, int rlocked, int *dead) 10131 { 10132 mdb_mutexref_t rmutex = rlocked ? NULL : env->me_rmutex; 10133 unsigned int i, j, rdrs; 10134 MDB_reader *mr; 10135 MDB_PID_T *pids, pid; 10136 int rc = MDB_SUCCESS, count = 0; 10137 10138 rdrs = env->me_txns->mti_numreaders; 10139 pids = malloc((rdrs+1) * sizeof(MDB_PID_T)); 10140 if (!pids) 10141 return ENOMEM; 10142 pids[0] = 0; 10143 mr = env->me_txns->mti_readers; 10144 for (i=0; i<rdrs; i++) { 10145 pid = mr[i].mr_pid; 10146 if (pid && pid != env->me_pid) { 10147 if (mdb_pid_insert(pids, pid) == 0) { 10148 if (!mdb_reader_pid(env, Pidcheck, pid)) { 10149 /* Stale reader found */ 10150 j = i; 10151 if (rmutex) { 10152 if ((rc = LOCK_MUTEX0(rmutex)) != 0) { 10153 if ((rc = mdb_mutex_failed(env, rmutex, rc))) 10154 break; 10155 rdrs = 0; /* the above checked all readers */ 10156 } else { 10157 /* Recheck, a new process may have reused pid */ 10158 if (mdb_reader_pid(env, Pidcheck, pid)) 10159 j = rdrs; 10160 } 10161 } 10162 for (; j<rdrs; j++) 10163 if (mr[j].mr_pid == pid) { 10164 DPRINTF(("clear stale reader pid %u txn %"Z"d", 10165 (unsigned) pid, mr[j].mr_txnid)); 10166 mr[j].mr_pid = 0; 10167 count++; 10168 } 10169 if (rmutex) 10170 UNLOCK_MUTEX(rmutex); 10171 } 10172 } 10173 } 10174 } 10175 free(pids); 10176 if (dead) 10177 *dead = count; 10178 return rc; 10179 } 10180 10181 #ifdef MDB_ROBUST_SUPPORTED 10182 /** Handle #LOCK_MUTEX0() failure. 10183 * Try to repair the lock file if the mutex owner died. 10184 * @param[in] env the environment handle 10185 * @param[in] mutex LOCK_MUTEX0() mutex 10186 * @param[in] rc LOCK_MUTEX0() error (nonzero) 10187 * @return 0 on success with the mutex locked, or an error code on failure. 10188 */ 10189 static int ESECT 10190 mdb_mutex_failed(MDB_env *env, mdb_mutexref_t mutex, int rc) 10191 { 10192 int rlocked, rc2; 10193 MDB_meta *meta; 10194 10195 if (rc == MDB_OWNERDEAD) { 10196 /* We own the mutex. 

#ifdef MDB_ROBUST_SUPPORTED
/** Handle #LOCK_MUTEX0() failure.
 * Try to repair the lock file if the mutex owner died.
 * @param[in] env the environment handle
 * @param[in] mutex LOCK_MUTEX0() mutex
 * @param[in] rc LOCK_MUTEX0() error (nonzero)
 * @return 0 on success with the mutex locked, or an error code on failure.
 */
static int ESECT
mdb_mutex_failed(MDB_env *env, mdb_mutexref_t mutex, int rc)
{
	int rlocked, rc2;
	MDB_meta *meta;

	if (rc == MDB_OWNERDEAD) {
		/* We own the mutex. Clean up after dead previous owner. */
		rc = MDB_SUCCESS;
		rlocked = (mutex == env->me_rmutex);
		if (!rlocked) {
			/* Keep mti_txnid updated, otherwise next writer can
			 * overwrite data which latest meta page refers to.
			 */
			meta = mdb_env_pick_meta(env);
			env->me_txns->mti_txnid = meta->mm_txnid;
			/* env is hosed if the dead thread was ours */
			if (env->me_txn) {
				env->me_flags |= MDB_FATAL_ERROR;
				env->me_txn = NULL;
				rc = MDB_PANIC;
			}
		}
		DPRINTF(("%cmutex owner died, %s", (rlocked ? 'r' : 'w'),
			(rc ? "this process' env is hosed" : "recovering")));
		rc2 = mdb_reader_check0(env, rlocked, NULL);
		if (rc2 == 0)
			rc2 = mdb_mutex_consistent(mutex);
		if (rc || (rc = rc2)) {
			DPRINTF(("LOCK_MUTEX recovery failed, %s", mdb_strerror(rc)));
			UNLOCK_MUTEX(mutex);
		}
	} else {
#ifdef _WIN32
		rc = ErrCode();
#endif
		DPRINTF(("LOCK_MUTEX failed, %s", mdb_strerror(rc)));
	}

	return rc;
}
#endif /* MDB_ROBUST_SUPPORTED */

#if defined(_WIN32)
/** Convert \b src to new wchar_t[] string with room for \b xtra extra chars */
static int ESECT
utf8_to_utf16(const char *src, MDB_name *dst, int xtra)
{
	int rc, need = 0;
	wchar_t *result = NULL;
	for (;;) { /* malloc result, then fill it in */
		need = MultiByteToWideChar(CP_UTF8, 0, src, -1, result, need);
		if (!need) {
			rc = ErrCode();
			free(result);
			return rc;
		}
		if (!result) {
			result = malloc(sizeof(wchar_t) * (need + xtra));
			if (!result)
				return ENOMEM;
			continue;
		}
		dst->mn_alloced = 1;
		dst->mn_len = need - 1;
		dst->mn_val = result;
		return MDB_SUCCESS;
	}
}
#endif /* defined(_WIN32) */
/** @} */