1*0Sstevel@tonic-gate /*- 2*0Sstevel@tonic-gate * See the file LICENSE for redistribution information. 3*0Sstevel@tonic-gate * 4*0Sstevel@tonic-gate * Copyright (c) 1996, 1997, 1998 5*0Sstevel@tonic-gate * Sleepycat Software. All rights reserved. 6*0Sstevel@tonic-gate * 7*0Sstevel@tonic-gate * @(#)mp.h 10.37 (Sleepycat) 1/1/99 8*0Sstevel@tonic-gate */ 9*0Sstevel@tonic-gate 10*0Sstevel@tonic-gate struct __bh; typedef struct __bh BH; 11*0Sstevel@tonic-gate struct __db_mpreg; typedef struct __db_mpreg DB_MPREG; 12*0Sstevel@tonic-gate struct __mpool; typedef struct __mpool MPOOL; 13*0Sstevel@tonic-gate struct __mpoolfile; typedef struct __mpoolfile MPOOLFILE; 14*0Sstevel@tonic-gate 15*0Sstevel@tonic-gate /* Default mpool name. */ 16*0Sstevel@tonic-gate #define DB_DEFAULT_MPOOL_FILE "__db_mpool.share" 17*0Sstevel@tonic-gate 18*0Sstevel@tonic-gate /* 19*0Sstevel@tonic-gate * We default to 256K (32 8K pages) if the user doesn't specify, and 20*0Sstevel@tonic-gate * require a minimum of 20K. 21*0Sstevel@tonic-gate */ 22*0Sstevel@tonic-gate #ifndef DB_CACHESIZE_DEF 23*0Sstevel@tonic-gate #define DB_CACHESIZE_DEF (256 * 1024) 24*0Sstevel@tonic-gate #endif 25*0Sstevel@tonic-gate #define DB_CACHESIZE_MIN ( 20 * 1024) 26*0Sstevel@tonic-gate 27*0Sstevel@tonic-gate #define INVALID 0 /* Invalid shared memory offset. */ 28*0Sstevel@tonic-gate 29*0Sstevel@tonic-gate /* 30*0Sstevel@tonic-gate * There are three ways we do locking in the mpool code: 31*0Sstevel@tonic-gate * 32*0Sstevel@tonic-gate * Locking a handle mutex to provide concurrency for DB_THREAD operations. 33*0Sstevel@tonic-gate * Locking the region mutex to provide mutual exclusion while reading and 34*0Sstevel@tonic-gate * writing structures in the shared region. 35*0Sstevel@tonic-gate * Locking buffer header mutexes during I/O. 36*0Sstevel@tonic-gate * 37*0Sstevel@tonic-gate * The first will not be further described here. We use the shared mpool 38*0Sstevel@tonic-gate * region lock to provide mutual exclusion while reading/modifying all of 39*0Sstevel@tonic-gate * the data structures, including the buffer headers. We use a per-buffer 40*0Sstevel@tonic-gate * header lock to wait on buffer I/O. The order of locking is as follows: 41*0Sstevel@tonic-gate * 42*0Sstevel@tonic-gate * Searching for a buffer: 43*0Sstevel@tonic-gate * Acquire the region lock. 44*0Sstevel@tonic-gate * Find the buffer header. 45*0Sstevel@tonic-gate * Increment the reference count (guarantee the buffer stays). 46*0Sstevel@tonic-gate * While the BH_LOCKED flag is set (I/O is going on) { 47*0Sstevel@tonic-gate * Release the region lock. 48*0Sstevel@tonic-gate * Explicitly yield the processor if it's not the first pass 49*0Sstevel@tonic-gate * through this loop, otherwise, we can simply spin because 50*0Sstevel@tonic-gate * we'll be simply switching between the two locks. 51*0Sstevel@tonic-gate * Request the buffer lock. 52*0Sstevel@tonic-gate * The I/O will complete... 53*0Sstevel@tonic-gate * Acquire the buffer lock. 54*0Sstevel@tonic-gate * Release the buffer lock. 55*0Sstevel@tonic-gate * Acquire the region lock. 56*0Sstevel@tonic-gate * } 57*0Sstevel@tonic-gate * Return the buffer. 58*0Sstevel@tonic-gate * 59*0Sstevel@tonic-gate * Reading/writing a buffer: 60*0Sstevel@tonic-gate * Acquire the region lock. 61*0Sstevel@tonic-gate * Find/create the buffer header. 62*0Sstevel@tonic-gate * If reading, increment the reference count (guarantee the buffer stays). 63*0Sstevel@tonic-gate * Set the BH_LOCKED flag. 64*0Sstevel@tonic-gate * Acquire the buffer lock (guaranteed not to block). 65*0Sstevel@tonic-gate * Release the region lock. 66*0Sstevel@tonic-gate * Do the I/O and/or initialize the buffer contents. 67*0Sstevel@tonic-gate * Release the buffer lock. 68*0Sstevel@tonic-gate * At this point, the buffer lock is available, but the logical 69*0Sstevel@tonic-gate * operation (flagged by BH_LOCKED) is not yet completed. For 70*0Sstevel@tonic-gate * this reason, among others, threads checking the BH_LOCKED flag 71*0Sstevel@tonic-gate * must loop around their test. 72*0Sstevel@tonic-gate * Acquire the region lock. 73*0Sstevel@tonic-gate * Clear the BH_LOCKED flag. 74*0Sstevel@tonic-gate * Release the region lock. 75*0Sstevel@tonic-gate * Return/discard the buffer. 76*0Sstevel@tonic-gate * 77*0Sstevel@tonic-gate * Pointers to DB_MPOOL, MPOOL, DB_MPOOLFILE and MPOOLFILE structures are not 78*0Sstevel@tonic-gate * reacquired when a region lock is reacquired because they couldn't have been 79*0Sstevel@tonic-gate * closed/discarded and because they never move in memory. 80*0Sstevel@tonic-gate */ 81*0Sstevel@tonic-gate #define LOCKINIT(dbmp, mutexp) \ 82*0Sstevel@tonic-gate if (F_ISSET(dbmp, MP_LOCKHANDLE | MP_LOCKREGION)) \ 83*0Sstevel@tonic-gate (void)__db_mutex_init(mutexp, \ 84*0Sstevel@tonic-gate MUTEX_LOCK_OFFSET((dbmp)->reginfo.addr, mutexp)) 85*0Sstevel@tonic-gate 86*0Sstevel@tonic-gate #define LOCKHANDLE(dbmp, mutexp) \ 87*0Sstevel@tonic-gate if (F_ISSET(dbmp, MP_LOCKHANDLE)) \ 88*0Sstevel@tonic-gate (void)__db_mutex_lock(mutexp, (dbmp)->reginfo.fd) 89*0Sstevel@tonic-gate #define UNLOCKHANDLE(dbmp, mutexp) \ 90*0Sstevel@tonic-gate if (F_ISSET(dbmp, MP_LOCKHANDLE)) \ 91*0Sstevel@tonic-gate (void)__db_mutex_unlock(mutexp, (dbmp)->reginfo.fd) 92*0Sstevel@tonic-gate 93*0Sstevel@tonic-gate #define LOCKREGION(dbmp) \ 94*0Sstevel@tonic-gate if (F_ISSET(dbmp, MP_LOCKREGION)) \ 95*0Sstevel@tonic-gate (void)__db_mutex_lock(&((RLAYOUT *)(dbmp)->mp)->lock, \ 96*0Sstevel@tonic-gate (dbmp)->reginfo.fd) 97*0Sstevel@tonic-gate #define UNLOCKREGION(dbmp) \ 98*0Sstevel@tonic-gate if (F_ISSET(dbmp, MP_LOCKREGION)) \ 99*0Sstevel@tonic-gate (void)__db_mutex_unlock(&((RLAYOUT *)(dbmp)->mp)->lock, \ 100*0Sstevel@tonic-gate (dbmp)->reginfo.fd) 101*0Sstevel@tonic-gate 102*0Sstevel@tonic-gate #define LOCKBUFFER(dbmp, bhp) \ 103*0Sstevel@tonic-gate if (F_ISSET(dbmp, MP_LOCKREGION)) \ 104*0Sstevel@tonic-gate (void)__db_mutex_lock(&(bhp)->mutex, (dbmp)->reginfo.fd) 105*0Sstevel@tonic-gate #define UNLOCKBUFFER(dbmp, bhp) \ 106*0Sstevel@tonic-gate if (F_ISSET(dbmp, MP_LOCKREGION)) \ 107*0Sstevel@tonic-gate (void)__db_mutex_unlock(&(bhp)->mutex, (dbmp)->reginfo.fd) 108*0Sstevel@tonic-gate 109*0Sstevel@tonic-gate /* Check for region catastrophic shutdown. */ 110*0Sstevel@tonic-gate #define MP_PANIC_CHECK(dbmp) { \ 111*0Sstevel@tonic-gate if ((dbmp)->mp->rlayout.panic) \ 112*0Sstevel@tonic-gate return (DB_RUNRECOVERY); \ 113*0Sstevel@tonic-gate } 114*0Sstevel@tonic-gate 115*0Sstevel@tonic-gate /* 116*0Sstevel@tonic-gate * DB_MPOOL -- 117*0Sstevel@tonic-gate * Per-process memory pool structure. 118*0Sstevel@tonic-gate */ 119*0Sstevel@tonic-gate struct __db_mpool { 120*0Sstevel@tonic-gate /* These fields need to be protected for multi-threaded support. */ 121*0Sstevel@tonic-gate db_mutex_t *mutexp; /* Structure lock. */ 122*0Sstevel@tonic-gate 123*0Sstevel@tonic-gate /* List of pgin/pgout routines. */ 124*0Sstevel@tonic-gate LIST_HEAD(__db_mpregh, __db_mpreg) dbregq; 125*0Sstevel@tonic-gate 126*0Sstevel@tonic-gate /* List of DB_MPOOLFILE's. */ 127*0Sstevel@tonic-gate TAILQ_HEAD(__db_mpoolfileh, __db_mpoolfile) dbmfq; 128*0Sstevel@tonic-gate 129*0Sstevel@tonic-gate /* These fields are not protected. */ 130*0Sstevel@tonic-gate DB_ENV *dbenv; /* Reference to error information. */ 131*0Sstevel@tonic-gate REGINFO reginfo; /* Region information. */ 132*0Sstevel@tonic-gate 133*0Sstevel@tonic-gate MPOOL *mp; /* Address of the shared MPOOL. */ 134*0Sstevel@tonic-gate 135*0Sstevel@tonic-gate void *addr; /* Address of shalloc() region. */ 136*0Sstevel@tonic-gate 137*0Sstevel@tonic-gate DB_HASHTAB *htab; /* Hash table of bucket headers. */ 138*0Sstevel@tonic-gate 139*0Sstevel@tonic-gate #define MP_LOCKHANDLE 0x01 /* Threaded, lock handles and region. */ 140*0Sstevel@tonic-gate #define MP_LOCKREGION 0x02 /* Concurrent access, lock region. */ 141*0Sstevel@tonic-gate u_int32_t flags; 142*0Sstevel@tonic-gate }; 143*0Sstevel@tonic-gate 144*0Sstevel@tonic-gate /* 145*0Sstevel@tonic-gate * DB_MPREG -- 146*0Sstevel@tonic-gate * DB_MPOOL registry of pgin/pgout functions. 147*0Sstevel@tonic-gate */ 148*0Sstevel@tonic-gate struct __db_mpreg { 149*0Sstevel@tonic-gate LIST_ENTRY(__db_mpreg) q; /* Linked list. */ 150*0Sstevel@tonic-gate 151*0Sstevel@tonic-gate int ftype; /* File type. */ 152*0Sstevel@tonic-gate /* Pgin, pgout routines. */ 153*0Sstevel@tonic-gate int (DB_CALLBACK *pgin) __P((db_pgno_t, void *, DBT *)); 154*0Sstevel@tonic-gate int (DB_CALLBACK *pgout) __P((db_pgno_t, void *, DBT *)); 155*0Sstevel@tonic-gate }; 156*0Sstevel@tonic-gate 157*0Sstevel@tonic-gate /* 158*0Sstevel@tonic-gate * DB_MPOOLFILE -- 159*0Sstevel@tonic-gate * Per-process DB_MPOOLFILE information. 160*0Sstevel@tonic-gate */ 161*0Sstevel@tonic-gate struct __db_mpoolfile { 162*0Sstevel@tonic-gate /* These fields need to be protected for multi-threaded support. */ 163*0Sstevel@tonic-gate db_mutex_t *mutexp; /* Structure lock. */ 164*0Sstevel@tonic-gate 165*0Sstevel@tonic-gate int fd; /* Underlying file descriptor. */ 166*0Sstevel@tonic-gate 167*0Sstevel@tonic-gate u_int32_t ref; /* Reference count. */ 168*0Sstevel@tonic-gate 169*0Sstevel@tonic-gate /* 170*0Sstevel@tonic-gate * !!! 171*0Sstevel@tonic-gate * This field is a special case -- it's protected by the region lock 172*0Sstevel@tonic-gate * NOT the thread lock. The reason for this is that we always have 173*0Sstevel@tonic-gate * the region lock immediately before or after we modify the field, 174*0Sstevel@tonic-gate * and we don't want to use the structure lock to protect it because 175*0Sstevel@tonic-gate * then I/O (which is done with the structure lock held because of 176*0Sstevel@tonic-gate * the race between the seek and write of the file descriptor) will 177*0Sstevel@tonic-gate * block any other put/get calls using this DB_MPOOLFILE structure. 178*0Sstevel@tonic-gate */ 179*0Sstevel@tonic-gate u_int32_t pinref; /* Pinned block reference count. */ 180*0Sstevel@tonic-gate 181*0Sstevel@tonic-gate /* These fields are not protected. */ 182*0Sstevel@tonic-gate TAILQ_ENTRY(__db_mpoolfile) q; /* Linked list of DB_MPOOLFILE's. */ 183*0Sstevel@tonic-gate 184*0Sstevel@tonic-gate DB_MPOOL *dbmp; /* Overlying DB_MPOOL. */ 185*0Sstevel@tonic-gate MPOOLFILE *mfp; /* Underlying MPOOLFILE. */ 186*0Sstevel@tonic-gate 187*0Sstevel@tonic-gate void *addr; /* Address of mmap'd region. */ 188*0Sstevel@tonic-gate size_t len; /* Length of mmap'd region. */ 189*0Sstevel@tonic-gate 190*0Sstevel@tonic-gate /* These fields need to be protected for multi-threaded support. */ 191*0Sstevel@tonic-gate #define MP_READONLY 0x01 /* File is readonly. */ 192*0Sstevel@tonic-gate #define MP_UPGRADE 0x02 /* File descriptor is readwrite. */ 193*0Sstevel@tonic-gate #define MP_UPGRADE_FAIL 0x04 /* Upgrade wasn't possible. */ 194*0Sstevel@tonic-gate u_int32_t flags; 195*0Sstevel@tonic-gate }; 196*0Sstevel@tonic-gate 197*0Sstevel@tonic-gate /* 198*0Sstevel@tonic-gate * MPOOL -- 199*0Sstevel@tonic-gate * Shared memory pool region. One of these is allocated in shared 200*0Sstevel@tonic-gate * memory, and describes the pool. 201*0Sstevel@tonic-gate */ 202*0Sstevel@tonic-gate struct __mpool { 203*0Sstevel@tonic-gate RLAYOUT rlayout; /* General region information. */ 204*0Sstevel@tonic-gate 205*0Sstevel@tonic-gate SH_TAILQ_HEAD(__bhq) bhq; /* LRU list of buckets. */ 206*0Sstevel@tonic-gate SH_TAILQ_HEAD(__bhfq) bhfq; /* Free buckets. */ 207*0Sstevel@tonic-gate SH_TAILQ_HEAD(__mpfq) mpfq; /* List of MPOOLFILEs. */ 208*0Sstevel@tonic-gate 209*0Sstevel@tonic-gate /* 210*0Sstevel@tonic-gate * We make the assumption that the early pages of the file are far 211*0Sstevel@tonic-gate * more likely to be retrieved than the later pages, which means 212*0Sstevel@tonic-gate * that the top bits are more interesting for hashing since they're 213*0Sstevel@tonic-gate * less likely to collide. On the other hand, since 512 4K pages 214*0Sstevel@tonic-gate * represents a 2MB file, only the bottom 9 bits of the page number 215*0Sstevel@tonic-gate * are likely to be set. We XOR in the offset in the MPOOL of the 216*0Sstevel@tonic-gate * MPOOLFILE that backs this particular page, since that should also 217*0Sstevel@tonic-gate * be unique for the page. 218*0Sstevel@tonic-gate */ 219*0Sstevel@tonic-gate #define BUCKET(mp, mf_offset, pgno) \ 220*0Sstevel@tonic-gate (((pgno) ^ ((mf_offset) << 9)) % (mp)->htab_buckets) 221*0Sstevel@tonic-gate 222*0Sstevel@tonic-gate size_t htab; /* Hash table offset. */ 223*0Sstevel@tonic-gate size_t htab_buckets; /* Number of hash table entries. */ 224*0Sstevel@tonic-gate 225*0Sstevel@tonic-gate DB_LSN lsn; /* Maximum checkpoint LSN. */ 226*0Sstevel@tonic-gate u_int32_t lsn_cnt; /* Checkpoint buffers left to write. */ 227*0Sstevel@tonic-gate 228*0Sstevel@tonic-gate DB_MPOOL_STAT stat; /* Global mpool statistics. */ 229*0Sstevel@tonic-gate 230*0Sstevel@tonic-gate #define MP_LSN_RETRY 0x01 /* Retry all BH_WRITE buffers. */ 231*0Sstevel@tonic-gate u_int32_t flags; 232*0Sstevel@tonic-gate }; 233*0Sstevel@tonic-gate 234*0Sstevel@tonic-gate /* 235*0Sstevel@tonic-gate * MPOOLFILE -- 236*0Sstevel@tonic-gate * Shared DB_MPOOLFILE information. 237*0Sstevel@tonic-gate */ 238*0Sstevel@tonic-gate struct __mpoolfile { 239*0Sstevel@tonic-gate SH_TAILQ_ENTRY q; /* List of MPOOLFILEs */ 240*0Sstevel@tonic-gate 241*0Sstevel@tonic-gate u_int32_t ref; /* Reference count. */ 242*0Sstevel@tonic-gate 243*0Sstevel@tonic-gate int ftype; /* File type. */ 244*0Sstevel@tonic-gate 245*0Sstevel@tonic-gate int32_t lsn_off; /* Page's LSN offset. */ 246*0Sstevel@tonic-gate u_int32_t clear_len; /* Bytes to clear on page create. */ 247*0Sstevel@tonic-gate 248*0Sstevel@tonic-gate size_t path_off; /* File name location. */ 249*0Sstevel@tonic-gate size_t fileid_off; /* File identification location. */ 250*0Sstevel@tonic-gate 251*0Sstevel@tonic-gate size_t pgcookie_len; /* Pgin/pgout cookie length. */ 252*0Sstevel@tonic-gate size_t pgcookie_off; /* Pgin/pgout cookie location. */ 253*0Sstevel@tonic-gate 254*0Sstevel@tonic-gate u_int32_t lsn_cnt; /* Checkpoint buffers left to write. */ 255*0Sstevel@tonic-gate 256*0Sstevel@tonic-gate db_pgno_t last_pgno; /* Last page in the file. */ 257*0Sstevel@tonic-gate db_pgno_t orig_last_pgno; /* Original last page in the file. */ 258*0Sstevel@tonic-gate 259*0Sstevel@tonic-gate #define MP_CAN_MMAP 0x01 /* If the file can be mmap'd. */ 260*0Sstevel@tonic-gate #define MP_TEMP 0x02 /* Backing file is a temporary. */ 261*0Sstevel@tonic-gate u_int32_t flags; 262*0Sstevel@tonic-gate 263*0Sstevel@tonic-gate DB_MPOOL_FSTAT stat; /* Per-file mpool statistics. */ 264*0Sstevel@tonic-gate }; 265*0Sstevel@tonic-gate 266*0Sstevel@tonic-gate /* 267*0Sstevel@tonic-gate * BH -- 268*0Sstevel@tonic-gate * Buffer header. 269*0Sstevel@tonic-gate */ 270*0Sstevel@tonic-gate struct __bh { 271*0Sstevel@tonic-gate db_mutex_t mutex; /* Structure lock. */ 272*0Sstevel@tonic-gate 273*0Sstevel@tonic-gate u_int16_t ref; /* Reference count. */ 274*0Sstevel@tonic-gate 275*0Sstevel@tonic-gate #define BH_CALLPGIN 0x001 /* Page needs to be reworked... */ 276*0Sstevel@tonic-gate #define BH_DIRTY 0x002 /* Page was modified. */ 277*0Sstevel@tonic-gate #define BH_DISCARD 0x004 /* Page is useless. */ 278*0Sstevel@tonic-gate #define BH_LOCKED 0x008 /* Page is locked (I/O in progress). */ 279*0Sstevel@tonic-gate #define BH_TRASH 0x010 /* Page is garbage. */ 280*0Sstevel@tonic-gate #define BH_WRITE 0x020 /* Page scheduled for writing. */ 281*0Sstevel@tonic-gate u_int16_t flags; 282*0Sstevel@tonic-gate 283*0Sstevel@tonic-gate SH_TAILQ_ENTRY q; /* LRU queue. */ 284*0Sstevel@tonic-gate SH_TAILQ_ENTRY hq; /* MPOOL hash bucket queue. */ 285*0Sstevel@tonic-gate 286*0Sstevel@tonic-gate db_pgno_t pgno; /* Underlying MPOOLFILE page number. */ 287*0Sstevel@tonic-gate size_t mf_offset; /* Associated MPOOLFILE offset. */ 288*0Sstevel@tonic-gate 289*0Sstevel@tonic-gate /* 290*0Sstevel@tonic-gate * !!! 291*0Sstevel@tonic-gate * This array must be size_t aligned -- the DB access methods put PAGE 292*0Sstevel@tonic-gate * and other structures into it, and expect to be able to access them 293*0Sstevel@tonic-gate * directly. (We guarantee size_t alignment in the db_mpool(3) manual 294*0Sstevel@tonic-gate * page as well.) 295*0Sstevel@tonic-gate */ 296*0Sstevel@tonic-gate u_int8_t buf[1]; /* Variable length data. */ 297*0Sstevel@tonic-gate }; 298*0Sstevel@tonic-gate 299*0Sstevel@tonic-gate #include "mp_ext.h" 300