1*0Sstevel@tonic-gate /*- 2*0Sstevel@tonic-gate * See the file LICENSE for redistribution information. 3*0Sstevel@tonic-gate * 4*0Sstevel@tonic-gate * Copyright (c) 1996, 1997, 1998 5*0Sstevel@tonic-gate * Sleepycat Software. All rights reserved. 6*0Sstevel@tonic-gate */ 7*0Sstevel@tonic-gate #include "config.h" 8*0Sstevel@tonic-gate 9*0Sstevel@tonic-gate #ifndef lint 10*0Sstevel@tonic-gate static const char sccsid[] = "@(#)mp_bh.c 10.45 (Sleepycat) 11/25/98"; 11*0Sstevel@tonic-gate #endif /* not lint */ 12*0Sstevel@tonic-gate 13*0Sstevel@tonic-gate #ifndef NO_SYSTEM_INCLUDES 14*0Sstevel@tonic-gate #include <sys/types.h> 15*0Sstevel@tonic-gate 16*0Sstevel@tonic-gate #include <errno.h> 17*0Sstevel@tonic-gate #include <string.h> 18*0Sstevel@tonic-gate #include <unistd.h> 19*0Sstevel@tonic-gate #endif 20*0Sstevel@tonic-gate 21*0Sstevel@tonic-gate #include "db_int.h" 22*0Sstevel@tonic-gate #include "shqueue.h" 23*0Sstevel@tonic-gate #include "db_shash.h" 24*0Sstevel@tonic-gate #include "mp.h" 25*0Sstevel@tonic-gate #include "common_ext.h" 26*0Sstevel@tonic-gate 27*0Sstevel@tonic-gate static int __memp_upgrade __P((DB_MPOOL *, DB_MPOOLFILE *, MPOOLFILE *)); 28*0Sstevel@tonic-gate 29*0Sstevel@tonic-gate /* 30*0Sstevel@tonic-gate * __memp_bhwrite -- 31*0Sstevel@tonic-gate * Write the page associated with a given bucket header. 32*0Sstevel@tonic-gate * 33*0Sstevel@tonic-gate * PUBLIC: int __memp_bhwrite 34*0Sstevel@tonic-gate * PUBLIC: __P((DB_MPOOL *, MPOOLFILE *, BH *, int *, int *)); 35*0Sstevel@tonic-gate */ 36*0Sstevel@tonic-gate int 37*0Sstevel@tonic-gate __memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep) 38*0Sstevel@tonic-gate DB_MPOOL *dbmp; 39*0Sstevel@tonic-gate MPOOLFILE *mfp; 40*0Sstevel@tonic-gate BH *bhp; 41*0Sstevel@tonic-gate int *restartp, *wrotep; 42*0Sstevel@tonic-gate { 43*0Sstevel@tonic-gate DB_MPOOLFILE *dbmfp; 44*0Sstevel@tonic-gate DB_MPREG *mpreg; 45*0Sstevel@tonic-gate int incremented, ret; 46*0Sstevel@tonic-gate 47*0Sstevel@tonic-gate if (restartp != NULL) 48*0Sstevel@tonic-gate *restartp = 0; 49*0Sstevel@tonic-gate if (wrotep != NULL) 50*0Sstevel@tonic-gate *wrotep = 0; 51*0Sstevel@tonic-gate incremented = 0; 52*0Sstevel@tonic-gate 53*0Sstevel@tonic-gate /* 54*0Sstevel@tonic-gate * Walk the process' DB_MPOOLFILE list and find a file descriptor for 55*0Sstevel@tonic-gate * the file. We also check that the descriptor is open for writing. 56*0Sstevel@tonic-gate * If we find a descriptor on the file that's not open for writing, we 57*0Sstevel@tonic-gate * try and upgrade it to make it writeable. If that fails, we're done. 58*0Sstevel@tonic-gate */ 59*0Sstevel@tonic-gate LOCKHANDLE(dbmp, dbmp->mutexp); 60*0Sstevel@tonic-gate for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq); 61*0Sstevel@tonic-gate dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q)) 62*0Sstevel@tonic-gate if (dbmfp->mfp == mfp) { 63*0Sstevel@tonic-gate if (F_ISSET(dbmfp, MP_READONLY) && 64*0Sstevel@tonic-gate __memp_upgrade(dbmp, dbmfp, mfp)) { 65*0Sstevel@tonic-gate UNLOCKHANDLE(dbmp, dbmp->mutexp); 66*0Sstevel@tonic-gate return (0); 67*0Sstevel@tonic-gate } 68*0Sstevel@tonic-gate 69*0Sstevel@tonic-gate /* 70*0Sstevel@tonic-gate * Increment the reference count -- see the comment in 71*0Sstevel@tonic-gate * memp_fclose(). 72*0Sstevel@tonic-gate */ 73*0Sstevel@tonic-gate ++dbmfp->ref; 74*0Sstevel@tonic-gate incremented = 1; 75*0Sstevel@tonic-gate break; 76*0Sstevel@tonic-gate } 77*0Sstevel@tonic-gate UNLOCKHANDLE(dbmp, dbmp->mutexp); 78*0Sstevel@tonic-gate if (dbmfp != NULL) 79*0Sstevel@tonic-gate goto found; 80*0Sstevel@tonic-gate 81*0Sstevel@tonic-gate /* 82*0Sstevel@tonic-gate * It's not a page from a file we've opened. If the file requires 83*0Sstevel@tonic-gate * input/output processing, see if this process has ever registered 84*0Sstevel@tonic-gate * information as to how to write this type of file. If not, there's 85*0Sstevel@tonic-gate * nothing we can do. 86*0Sstevel@tonic-gate */ 87*0Sstevel@tonic-gate if (mfp->ftype != 0) { 88*0Sstevel@tonic-gate LOCKHANDLE(dbmp, dbmp->mutexp); 89*0Sstevel@tonic-gate for (mpreg = LIST_FIRST(&dbmp->dbregq); 90*0Sstevel@tonic-gate mpreg != NULL; mpreg = LIST_NEXT(mpreg, q)) 91*0Sstevel@tonic-gate if (mpreg->ftype == mfp->ftype) 92*0Sstevel@tonic-gate break; 93*0Sstevel@tonic-gate UNLOCKHANDLE(dbmp, dbmp->mutexp); 94*0Sstevel@tonic-gate if (mpreg == NULL) 95*0Sstevel@tonic-gate return (0); 96*0Sstevel@tonic-gate } 97*0Sstevel@tonic-gate 98*0Sstevel@tonic-gate /* 99*0Sstevel@tonic-gate * Try and open the file, attaching to the underlying shared area. 100*0Sstevel@tonic-gate * 101*0Sstevel@tonic-gate * XXX 102*0Sstevel@tonic-gate * Don't try to attach to temporary files. There are two problems in 103*0Sstevel@tonic-gate * trying to do that. First, if we have different privileges than the 104*0Sstevel@tonic-gate * process that "owns" the temporary file, we might create the backing 105*0Sstevel@tonic-gate * disk file such that the owning process couldn't read/write its own 106*0Sstevel@tonic-gate * buffers, e.g., memp_trickle() running as root creating a file owned 107*0Sstevel@tonic-gate * as root, mode 600. Second, if the temporary file has already been 108*0Sstevel@tonic-gate * created, we don't have any way of finding out what its real name is, 109*0Sstevel@tonic-gate * and, even if we did, it was already unlinked (so that it won't be 110*0Sstevel@tonic-gate * left if the process dies horribly). This decision causes a problem, 111*0Sstevel@tonic-gate * however: if the temporary file consumes the entire buffer cache, 112*0Sstevel@tonic-gate * and the owner doesn't flush the buffers to disk, we could end up 113*0Sstevel@tonic-gate * with resource starvation, and the memp_trickle() thread couldn't do 114*0Sstevel@tonic-gate * anything about it. That's a pretty unlikely scenario, though. 115*0Sstevel@tonic-gate * 116*0Sstevel@tonic-gate * XXX 117*0Sstevel@tonic-gate * There's no negative cache, so we may repeatedly try and open files 118*0Sstevel@tonic-gate * that we have previously tried (and failed) to open. 119*0Sstevel@tonic-gate * 120*0Sstevel@tonic-gate * Ignore any error, assume it's a permissions problem. 121*0Sstevel@tonic-gate */ 122*0Sstevel@tonic-gate if (F_ISSET(mfp, MP_TEMP)) 123*0Sstevel@tonic-gate return (0); 124*0Sstevel@tonic-gate 125*0Sstevel@tonic-gate if (__memp_fopen(dbmp, mfp, R_ADDR(dbmp, mfp->path_off), 126*0Sstevel@tonic-gate 0, 0, mfp->stat.st_pagesize, 0, NULL, &dbmfp) != 0) 127*0Sstevel@tonic-gate return (0); 128*0Sstevel@tonic-gate 129*0Sstevel@tonic-gate found: ret = __memp_pgwrite(dbmfp, bhp, restartp, wrotep); 130*0Sstevel@tonic-gate 131*0Sstevel@tonic-gate if (incremented) { 132*0Sstevel@tonic-gate LOCKHANDLE(dbmp, dbmp->mutexp); 133*0Sstevel@tonic-gate --dbmfp->ref; 134*0Sstevel@tonic-gate UNLOCKHANDLE(dbmp, dbmp->mutexp); 135*0Sstevel@tonic-gate } 136*0Sstevel@tonic-gate 137*0Sstevel@tonic-gate return (ret); 138*0Sstevel@tonic-gate } 139*0Sstevel@tonic-gate 140*0Sstevel@tonic-gate /* 141*0Sstevel@tonic-gate * __memp_pgread -- 142*0Sstevel@tonic-gate * Read a page from a file. 143*0Sstevel@tonic-gate * 144*0Sstevel@tonic-gate * PUBLIC: int __memp_pgread __P((DB_MPOOLFILE *, BH *, int)); 145*0Sstevel@tonic-gate */ 146*0Sstevel@tonic-gate int 147*0Sstevel@tonic-gate __memp_pgread(dbmfp, bhp, can_create) 148*0Sstevel@tonic-gate DB_MPOOLFILE *dbmfp; 149*0Sstevel@tonic-gate BH *bhp; 150*0Sstevel@tonic-gate int can_create; 151*0Sstevel@tonic-gate { 152*0Sstevel@tonic-gate DB_IO db_io; 153*0Sstevel@tonic-gate DB_MPOOL *dbmp; 154*0Sstevel@tonic-gate MPOOLFILE *mfp; 155*0Sstevel@tonic-gate size_t len, pagesize; 156*0Sstevel@tonic-gate ssize_t nr; 157*0Sstevel@tonic-gate int created, ret; 158*0Sstevel@tonic-gate 159*0Sstevel@tonic-gate dbmp = dbmfp->dbmp; 160*0Sstevel@tonic-gate mfp = dbmfp->mfp; 161*0Sstevel@tonic-gate pagesize = mfp->stat.st_pagesize; 162*0Sstevel@tonic-gate 163*0Sstevel@tonic-gate F_SET(bhp, BH_LOCKED | BH_TRASH); 164*0Sstevel@tonic-gate LOCKBUFFER(dbmp, bhp); 165*0Sstevel@tonic-gate UNLOCKREGION(dbmp); 166*0Sstevel@tonic-gate 167*0Sstevel@tonic-gate /* 168*0Sstevel@tonic-gate * Temporary files may not yet have been created. We don't create 169*0Sstevel@tonic-gate * them now, we create them when the pages have to be flushed. 170*0Sstevel@tonic-gate */ 171*0Sstevel@tonic-gate nr = 0; 172*0Sstevel@tonic-gate if (dbmfp->fd == -1) 173*0Sstevel@tonic-gate ret = 0; 174*0Sstevel@tonic-gate else { 175*0Sstevel@tonic-gate /* 176*0Sstevel@tonic-gate * Ignore read errors if we have permission to create the page. 177*0Sstevel@tonic-gate * Assume that the page doesn't exist, and that we'll create it 178*0Sstevel@tonic-gate * when we write it out. 179*0Sstevel@tonic-gate */ 180*0Sstevel@tonic-gate db_io.fd_io = dbmfp->fd; 181*0Sstevel@tonic-gate db_io.fd_lock = dbmp->reginfo.fd; 182*0Sstevel@tonic-gate db_io.mutexp = 183*0Sstevel@tonic-gate F_ISSET(dbmp, MP_LOCKHANDLE) ? dbmfp->mutexp : NULL; 184*0Sstevel@tonic-gate db_io.pagesize = db_io.bytes = pagesize; 185*0Sstevel@tonic-gate db_io.pgno = bhp->pgno; 186*0Sstevel@tonic-gate db_io.buf = bhp->buf; 187*0Sstevel@tonic-gate 188*0Sstevel@tonic-gate ret = __os_io(&db_io, DB_IO_READ, &nr); 189*0Sstevel@tonic-gate } 190*0Sstevel@tonic-gate 191*0Sstevel@tonic-gate created = 0; 192*0Sstevel@tonic-gate if (nr < (ssize_t)pagesize) 193*0Sstevel@tonic-gate if (can_create) 194*0Sstevel@tonic-gate created = 1; 195*0Sstevel@tonic-gate else { 196*0Sstevel@tonic-gate /* If we had a short read, ret may be 0. */ 197*0Sstevel@tonic-gate if (ret == 0) 198*0Sstevel@tonic-gate ret = EIO; 199*0Sstevel@tonic-gate __db_err(dbmp->dbenv, 200*0Sstevel@tonic-gate "%s: page %lu doesn't exist, create flag not set", 201*0Sstevel@tonic-gate __memp_fn(dbmfp), (u_long)bhp->pgno); 202*0Sstevel@tonic-gate goto err; 203*0Sstevel@tonic-gate } 204*0Sstevel@tonic-gate 205*0Sstevel@tonic-gate /* 206*0Sstevel@tonic-gate * Clear any bytes we didn't read that need to be cleared. If we're 207*0Sstevel@tonic-gate * running in diagnostic mode, smash any bytes on the page that are 208*0Sstevel@tonic-gate * unknown quantities for the caller. 209*0Sstevel@tonic-gate */ 210*0Sstevel@tonic-gate if (nr != (ssize_t)pagesize) { 211*0Sstevel@tonic-gate len = mfp->clear_len == 0 ? pagesize : mfp->clear_len; 212*0Sstevel@tonic-gate if (nr < (ssize_t)len) 213*0Sstevel@tonic-gate memset(bhp->buf + nr, 0, len - nr); 214*0Sstevel@tonic-gate #ifdef DIAGNOSTIC 215*0Sstevel@tonic-gate if (nr > (ssize_t)len) 216*0Sstevel@tonic-gate len = nr; 217*0Sstevel@tonic-gate if (len < pagesize) 218*0Sstevel@tonic-gate memset(bhp->buf + len, 0xdb, pagesize - len); 219*0Sstevel@tonic-gate #endif 220*0Sstevel@tonic-gate } 221*0Sstevel@tonic-gate 222*0Sstevel@tonic-gate /* Call any pgin function. */ 223*0Sstevel@tonic-gate ret = mfp->ftype == 0 ? 0 : __memp_pg(dbmfp, bhp, 1); 224*0Sstevel@tonic-gate 225*0Sstevel@tonic-gate /* Unlock the buffer and reacquire the region lock. */ 226*0Sstevel@tonic-gate err: UNLOCKBUFFER(dbmp, bhp); 227*0Sstevel@tonic-gate LOCKREGION(dbmp); 228*0Sstevel@tonic-gate 229*0Sstevel@tonic-gate /* 230*0Sstevel@tonic-gate * If no errors occurred, the data is now valid, clear the BH_TRASH 231*0Sstevel@tonic-gate * flag; regardless, clear the lock bit and let other threads proceed. 232*0Sstevel@tonic-gate */ 233*0Sstevel@tonic-gate F_CLR(bhp, BH_LOCKED); 234*0Sstevel@tonic-gate if (ret == 0) { 235*0Sstevel@tonic-gate F_CLR(bhp, BH_TRASH); 236*0Sstevel@tonic-gate 237*0Sstevel@tonic-gate /* Update the statistics. */ 238*0Sstevel@tonic-gate if (created) { 239*0Sstevel@tonic-gate ++dbmp->mp->stat.st_page_create; 240*0Sstevel@tonic-gate ++mfp->stat.st_page_create; 241*0Sstevel@tonic-gate } else { 242*0Sstevel@tonic-gate ++dbmp->mp->stat.st_page_in; 243*0Sstevel@tonic-gate ++mfp->stat.st_page_in; 244*0Sstevel@tonic-gate } 245*0Sstevel@tonic-gate } 246*0Sstevel@tonic-gate 247*0Sstevel@tonic-gate return (ret); 248*0Sstevel@tonic-gate } 249*0Sstevel@tonic-gate 250*0Sstevel@tonic-gate /* 251*0Sstevel@tonic-gate * __memp_pgwrite -- 252*0Sstevel@tonic-gate * Write a page to a file. 253*0Sstevel@tonic-gate * 254*0Sstevel@tonic-gate * PUBLIC: int __memp_pgwrite __P((DB_MPOOLFILE *, BH *, int *, int *)); 255*0Sstevel@tonic-gate */ 256*0Sstevel@tonic-gate int 257*0Sstevel@tonic-gate __memp_pgwrite(dbmfp, bhp, restartp, wrotep) 258*0Sstevel@tonic-gate DB_MPOOLFILE *dbmfp; 259*0Sstevel@tonic-gate BH *bhp; 260*0Sstevel@tonic-gate int *restartp, *wrotep; 261*0Sstevel@tonic-gate { 262*0Sstevel@tonic-gate DB_ENV *dbenv; 263*0Sstevel@tonic-gate DB_IO db_io; 264*0Sstevel@tonic-gate DB_LOG *lg_info; 265*0Sstevel@tonic-gate DB_LSN lsn; 266*0Sstevel@tonic-gate DB_MPOOL *dbmp; 267*0Sstevel@tonic-gate MPOOL *mp; 268*0Sstevel@tonic-gate MPOOLFILE *mfp; 269*0Sstevel@tonic-gate ssize_t nw; 270*0Sstevel@tonic-gate int callpgin, dosync, ret, syncfail; 271*0Sstevel@tonic-gate const char *fail; 272*0Sstevel@tonic-gate 273*0Sstevel@tonic-gate dbmp = dbmfp->dbmp; 274*0Sstevel@tonic-gate dbenv = dbmp->dbenv; 275*0Sstevel@tonic-gate mp = dbmp->mp; 276*0Sstevel@tonic-gate mfp = dbmfp->mfp; 277*0Sstevel@tonic-gate 278*0Sstevel@tonic-gate if (restartp != NULL) 279*0Sstevel@tonic-gate *restartp = 0; 280*0Sstevel@tonic-gate if (wrotep != NULL) 281*0Sstevel@tonic-gate *wrotep = 0; 282*0Sstevel@tonic-gate callpgin = 0; 283*0Sstevel@tonic-gate 284*0Sstevel@tonic-gate /* 285*0Sstevel@tonic-gate * Check the dirty bit -- this buffer may have been written since we 286*0Sstevel@tonic-gate * decided to write it. 287*0Sstevel@tonic-gate */ 288*0Sstevel@tonic-gate if (!F_ISSET(bhp, BH_DIRTY)) { 289*0Sstevel@tonic-gate if (wrotep != NULL) 290*0Sstevel@tonic-gate *wrotep = 1; 291*0Sstevel@tonic-gate return (0); 292*0Sstevel@tonic-gate } 293*0Sstevel@tonic-gate 294*0Sstevel@tonic-gate LOCKBUFFER(dbmp, bhp); 295*0Sstevel@tonic-gate 296*0Sstevel@tonic-gate /* 297*0Sstevel@tonic-gate * If there were two writers, we may have just been waiting while the 298*0Sstevel@tonic-gate * other writer completed I/O on this buffer. Check the dirty bit one 299*0Sstevel@tonic-gate * more time. 300*0Sstevel@tonic-gate */ 301*0Sstevel@tonic-gate if (!F_ISSET(bhp, BH_DIRTY)) { 302*0Sstevel@tonic-gate UNLOCKBUFFER(dbmp, bhp); 303*0Sstevel@tonic-gate 304*0Sstevel@tonic-gate if (wrotep != NULL) 305*0Sstevel@tonic-gate *wrotep = 1; 306*0Sstevel@tonic-gate return (0); 307*0Sstevel@tonic-gate } 308*0Sstevel@tonic-gate 309*0Sstevel@tonic-gate F_SET(bhp, BH_LOCKED); 310*0Sstevel@tonic-gate UNLOCKREGION(dbmp); 311*0Sstevel@tonic-gate 312*0Sstevel@tonic-gate if (restartp != NULL) 313*0Sstevel@tonic-gate *restartp = 1; 314*0Sstevel@tonic-gate 315*0Sstevel@tonic-gate /* Copy the LSN off the page if we're going to need it. */ 316*0Sstevel@tonic-gate lg_info = dbenv->lg_info; 317*0Sstevel@tonic-gate if (lg_info != NULL || F_ISSET(bhp, BH_WRITE)) 318*0Sstevel@tonic-gate memcpy(&lsn, bhp->buf + mfp->lsn_off, sizeof(DB_LSN)); 319*0Sstevel@tonic-gate 320*0Sstevel@tonic-gate /* Ensure the appropriate log records are on disk. */ 321*0Sstevel@tonic-gate if (lg_info != NULL && (ret = log_flush(lg_info, &lsn)) != 0) 322*0Sstevel@tonic-gate goto err; 323*0Sstevel@tonic-gate 324*0Sstevel@tonic-gate /* 325*0Sstevel@tonic-gate * Call any pgout function. We set the callpgin flag so that we flag 326*0Sstevel@tonic-gate * that the contents of the buffer will need to be passed through pgin 327*0Sstevel@tonic-gate * before they are reused. 328*0Sstevel@tonic-gate */ 329*0Sstevel@tonic-gate if (mfp->ftype == 0) 330*0Sstevel@tonic-gate ret = 0; 331*0Sstevel@tonic-gate else { 332*0Sstevel@tonic-gate callpgin = 1; 333*0Sstevel@tonic-gate if ((ret = __memp_pg(dbmfp, bhp, 0)) != 0) 334*0Sstevel@tonic-gate goto err; 335*0Sstevel@tonic-gate } 336*0Sstevel@tonic-gate 337*0Sstevel@tonic-gate /* Temporary files may not yet have been created. */ 338*0Sstevel@tonic-gate if (dbmfp->fd == -1) { 339*0Sstevel@tonic-gate LOCKHANDLE(dbmp, dbmfp->mutexp); 340*0Sstevel@tonic-gate if (dbmfp->fd == -1 && ((ret = __db_appname(dbenv, 341*0Sstevel@tonic-gate DB_APP_TMP, NULL, NULL, DB_CREATE | DB_EXCL | DB_TEMPORARY, 342*0Sstevel@tonic-gate &dbmfp->fd, NULL)) != 0 || dbmfp->fd == -1)) { 343*0Sstevel@tonic-gate UNLOCKHANDLE(dbmp, dbmfp->mutexp); 344*0Sstevel@tonic-gate __db_err(dbenv, 345*0Sstevel@tonic-gate "unable to create temporary backing file"); 346*0Sstevel@tonic-gate goto err; 347*0Sstevel@tonic-gate } 348*0Sstevel@tonic-gate UNLOCKHANDLE(dbmp, dbmfp->mutexp); 349*0Sstevel@tonic-gate } 350*0Sstevel@tonic-gate 351*0Sstevel@tonic-gate /* Write the page. */ 352*0Sstevel@tonic-gate db_io.fd_io = dbmfp->fd; 353*0Sstevel@tonic-gate db_io.fd_lock = dbmp->reginfo.fd; 354*0Sstevel@tonic-gate db_io.mutexp = F_ISSET(dbmp, MP_LOCKHANDLE) ? dbmfp->mutexp : NULL; 355*0Sstevel@tonic-gate db_io.pagesize = db_io.bytes = mfp->stat.st_pagesize; 356*0Sstevel@tonic-gate db_io.pgno = bhp->pgno; 357*0Sstevel@tonic-gate db_io.buf = bhp->buf; 358*0Sstevel@tonic-gate if ((ret = __os_io(&db_io, DB_IO_WRITE, &nw)) != 0) { 359*0Sstevel@tonic-gate __db_panic(dbenv, ret); 360*0Sstevel@tonic-gate fail = "write"; 361*0Sstevel@tonic-gate goto syserr; 362*0Sstevel@tonic-gate } 363*0Sstevel@tonic-gate if (nw != (ssize_t)mfp->stat.st_pagesize) { 364*0Sstevel@tonic-gate ret = EIO; 365*0Sstevel@tonic-gate fail = "write"; 366*0Sstevel@tonic-gate goto syserr; 367*0Sstevel@tonic-gate } 368*0Sstevel@tonic-gate 369*0Sstevel@tonic-gate if (wrotep != NULL) 370*0Sstevel@tonic-gate *wrotep = 1; 371*0Sstevel@tonic-gate 372*0Sstevel@tonic-gate /* Unlock the buffer and reacquire the region lock. */ 373*0Sstevel@tonic-gate UNLOCKBUFFER(dbmp, bhp); 374*0Sstevel@tonic-gate LOCKREGION(dbmp); 375*0Sstevel@tonic-gate 376*0Sstevel@tonic-gate /* 377*0Sstevel@tonic-gate * Clean up the flags based on a successful write. 378*0Sstevel@tonic-gate * 379*0Sstevel@tonic-gate * If we rewrote the page, it will need processing by the pgin 380*0Sstevel@tonic-gate * routine before reuse. 381*0Sstevel@tonic-gate */ 382*0Sstevel@tonic-gate if (callpgin) 383*0Sstevel@tonic-gate F_SET(bhp, BH_CALLPGIN); 384*0Sstevel@tonic-gate F_CLR(bhp, BH_DIRTY | BH_LOCKED); 385*0Sstevel@tonic-gate 386*0Sstevel@tonic-gate /* 387*0Sstevel@tonic-gate * If we write a buffer for which a checkpoint is waiting, update 388*0Sstevel@tonic-gate * the count of pending buffers (both in the mpool as a whole and 389*0Sstevel@tonic-gate * for this file). If the count for this file goes to zero, set a 390*0Sstevel@tonic-gate * flag so we flush the writes. 391*0Sstevel@tonic-gate */ 392*0Sstevel@tonic-gate if (F_ISSET(bhp, BH_WRITE)) { 393*0Sstevel@tonic-gate F_CLR(bhp, BH_WRITE); 394*0Sstevel@tonic-gate 395*0Sstevel@tonic-gate --mp->lsn_cnt; 396*0Sstevel@tonic-gate dosync = --mfp->lsn_cnt == 0 ? 1 : 0; 397*0Sstevel@tonic-gate } else 398*0Sstevel@tonic-gate dosync = 0; 399*0Sstevel@tonic-gate 400*0Sstevel@tonic-gate /* Update the page clean/dirty statistics. */ 401*0Sstevel@tonic-gate ++mp->stat.st_page_clean; 402*0Sstevel@tonic-gate --mp->stat.st_page_dirty; 403*0Sstevel@tonic-gate 404*0Sstevel@tonic-gate /* Update I/O statistics. */ 405*0Sstevel@tonic-gate ++mp->stat.st_page_out; 406*0Sstevel@tonic-gate ++mfp->stat.st_page_out; 407*0Sstevel@tonic-gate 408*0Sstevel@tonic-gate /* 409*0Sstevel@tonic-gate * Do the sync after everything else has been updated, so any incoming 410*0Sstevel@tonic-gate * checkpoint doesn't see inconsistent information. 411*0Sstevel@tonic-gate * 412*0Sstevel@tonic-gate * XXX: 413*0Sstevel@tonic-gate * Don't lock the region around the sync, fsync(2) has no atomicity 414*0Sstevel@tonic-gate * issues. 415*0Sstevel@tonic-gate * 416*0Sstevel@tonic-gate * XXX: 417*0Sstevel@tonic-gate * We ignore errors from the sync -- it makes no sense to return an 418*0Sstevel@tonic-gate * error to the calling process, so set a flag causing the checkpoint 419*0Sstevel@tonic-gate * to be retried later. There is a possibility, of course, that a 420*0Sstevel@tonic-gate * subsequent checkpoint was started and that we're going to force it 421*0Sstevel@tonic-gate * to fail. That should be unlikely, and fixing it would be difficult. 422*0Sstevel@tonic-gate */ 423*0Sstevel@tonic-gate if (dosync) { 424*0Sstevel@tonic-gate UNLOCKREGION(dbmp); 425*0Sstevel@tonic-gate syncfail = __os_fsync(dbmfp->fd) != 0; 426*0Sstevel@tonic-gate LOCKREGION(dbmp); 427*0Sstevel@tonic-gate if (syncfail) 428*0Sstevel@tonic-gate F_SET(mp, MP_LSN_RETRY); 429*0Sstevel@tonic-gate } 430*0Sstevel@tonic-gate 431*0Sstevel@tonic-gate return (0); 432*0Sstevel@tonic-gate 433*0Sstevel@tonic-gate syserr: __db_err(dbenv, "%s: %s failed for page %lu", 434*0Sstevel@tonic-gate __memp_fn(dbmfp), fail, (u_long)bhp->pgno); 435*0Sstevel@tonic-gate 436*0Sstevel@tonic-gate err: /* Unlock the buffer and reacquire the region lock. */ 437*0Sstevel@tonic-gate UNLOCKBUFFER(dbmp, bhp); 438*0Sstevel@tonic-gate LOCKREGION(dbmp); 439*0Sstevel@tonic-gate 440*0Sstevel@tonic-gate /* 441*0Sstevel@tonic-gate * Clean up the flags based on a failure. 442*0Sstevel@tonic-gate * 443*0Sstevel@tonic-gate * The page remains dirty but we remove our lock. If we rewrote the 444*0Sstevel@tonic-gate * page, it will need processing by the pgin routine before reuse. 445*0Sstevel@tonic-gate */ 446*0Sstevel@tonic-gate if (callpgin) 447*0Sstevel@tonic-gate F_SET(bhp, BH_CALLPGIN); 448*0Sstevel@tonic-gate F_CLR(bhp, BH_LOCKED); 449*0Sstevel@tonic-gate 450*0Sstevel@tonic-gate return (ret); 451*0Sstevel@tonic-gate } 452*0Sstevel@tonic-gate 453*0Sstevel@tonic-gate /* 454*0Sstevel@tonic-gate * __memp_pg -- 455*0Sstevel@tonic-gate * Call the pgin/pgout routine. 456*0Sstevel@tonic-gate * 457*0Sstevel@tonic-gate * PUBLIC: int __memp_pg __P((DB_MPOOLFILE *, BH *, int)); 458*0Sstevel@tonic-gate */ 459*0Sstevel@tonic-gate int 460*0Sstevel@tonic-gate __memp_pg(dbmfp, bhp, is_pgin) 461*0Sstevel@tonic-gate DB_MPOOLFILE *dbmfp; 462*0Sstevel@tonic-gate BH *bhp; 463*0Sstevel@tonic-gate int is_pgin; 464*0Sstevel@tonic-gate { 465*0Sstevel@tonic-gate DBT dbt, *dbtp; 466*0Sstevel@tonic-gate DB_MPOOL *dbmp; 467*0Sstevel@tonic-gate DB_MPREG *mpreg; 468*0Sstevel@tonic-gate MPOOLFILE *mfp; 469*0Sstevel@tonic-gate int ftype, ret; 470*0Sstevel@tonic-gate 471*0Sstevel@tonic-gate dbmp = dbmfp->dbmp; 472*0Sstevel@tonic-gate mfp = dbmfp->mfp; 473*0Sstevel@tonic-gate 474*0Sstevel@tonic-gate LOCKHANDLE(dbmp, dbmp->mutexp); 475*0Sstevel@tonic-gate 476*0Sstevel@tonic-gate ftype = mfp->ftype; 477*0Sstevel@tonic-gate for (mpreg = LIST_FIRST(&dbmp->dbregq); 478*0Sstevel@tonic-gate mpreg != NULL; mpreg = LIST_NEXT(mpreg, q)) { 479*0Sstevel@tonic-gate if (ftype != mpreg->ftype) 480*0Sstevel@tonic-gate continue; 481*0Sstevel@tonic-gate if (mfp->pgcookie_len == 0) 482*0Sstevel@tonic-gate dbtp = NULL; 483*0Sstevel@tonic-gate else { 484*0Sstevel@tonic-gate dbt.size = mfp->pgcookie_len; 485*0Sstevel@tonic-gate dbt.data = R_ADDR(dbmp, mfp->pgcookie_off); 486*0Sstevel@tonic-gate dbtp = &dbt; 487*0Sstevel@tonic-gate } 488*0Sstevel@tonic-gate UNLOCKHANDLE(dbmp, dbmp->mutexp); 489*0Sstevel@tonic-gate 490*0Sstevel@tonic-gate if (is_pgin) { 491*0Sstevel@tonic-gate if (mpreg->pgin != NULL && (ret = 492*0Sstevel@tonic-gate mpreg->pgin(bhp->pgno, bhp->buf, dbtp)) != 0) 493*0Sstevel@tonic-gate goto err; 494*0Sstevel@tonic-gate } else 495*0Sstevel@tonic-gate if (mpreg->pgout != NULL && (ret = 496*0Sstevel@tonic-gate mpreg->pgout(bhp->pgno, bhp->buf, dbtp)) != 0) 497*0Sstevel@tonic-gate goto err; 498*0Sstevel@tonic-gate break; 499*0Sstevel@tonic-gate } 500*0Sstevel@tonic-gate 501*0Sstevel@tonic-gate if (mpreg == NULL) 502*0Sstevel@tonic-gate UNLOCKHANDLE(dbmp, dbmp->mutexp); 503*0Sstevel@tonic-gate 504*0Sstevel@tonic-gate return (0); 505*0Sstevel@tonic-gate 506*0Sstevel@tonic-gate err: UNLOCKHANDLE(dbmp, dbmp->mutexp); 507*0Sstevel@tonic-gate __db_err(dbmp->dbenv, "%s: %s failed for page %lu", 508*0Sstevel@tonic-gate __memp_fn(dbmfp), is_pgin ? "pgin" : "pgout", (u_long)bhp->pgno); 509*0Sstevel@tonic-gate return (ret); 510*0Sstevel@tonic-gate } 511*0Sstevel@tonic-gate 512*0Sstevel@tonic-gate /* 513*0Sstevel@tonic-gate * __memp_bhfree -- 514*0Sstevel@tonic-gate * Free a bucket header and its referenced data. 515*0Sstevel@tonic-gate * 516*0Sstevel@tonic-gate * PUBLIC: void __memp_bhfree __P((DB_MPOOL *, MPOOLFILE *, BH *, int)); 517*0Sstevel@tonic-gate */ 518*0Sstevel@tonic-gate void 519*0Sstevel@tonic-gate __memp_bhfree(dbmp, mfp, bhp, free_mem) 520*0Sstevel@tonic-gate DB_MPOOL *dbmp; 521*0Sstevel@tonic-gate MPOOLFILE *mfp; 522*0Sstevel@tonic-gate BH *bhp; 523*0Sstevel@tonic-gate int free_mem; 524*0Sstevel@tonic-gate { 525*0Sstevel@tonic-gate size_t off; 526*0Sstevel@tonic-gate 527*0Sstevel@tonic-gate /* Delete the buffer header from the hash bucket queue. */ 528*0Sstevel@tonic-gate off = BUCKET(dbmp->mp, R_OFFSET(dbmp, mfp), bhp->pgno); 529*0Sstevel@tonic-gate SH_TAILQ_REMOVE(&dbmp->htab[off], bhp, hq, __bh); 530*0Sstevel@tonic-gate 531*0Sstevel@tonic-gate /* Delete the buffer header from the LRU queue. */ 532*0Sstevel@tonic-gate SH_TAILQ_REMOVE(&dbmp->mp->bhq, bhp, q, __bh); 533*0Sstevel@tonic-gate 534*0Sstevel@tonic-gate /* 535*0Sstevel@tonic-gate * If we're not reusing it immediately, free the buffer header 536*0Sstevel@tonic-gate * and data for real. 537*0Sstevel@tonic-gate */ 538*0Sstevel@tonic-gate if (free_mem) { 539*0Sstevel@tonic-gate __db_shalloc_free(dbmp->addr, bhp); 540*0Sstevel@tonic-gate --dbmp->mp->stat.st_page_clean; 541*0Sstevel@tonic-gate } 542*0Sstevel@tonic-gate } 543*0Sstevel@tonic-gate 544*0Sstevel@tonic-gate /* 545*0Sstevel@tonic-gate * __memp_upgrade -- 546*0Sstevel@tonic-gate * Upgrade a file descriptor from readonly to readwrite. 547*0Sstevel@tonic-gate */ 548*0Sstevel@tonic-gate static int 549*0Sstevel@tonic-gate __memp_upgrade(dbmp, dbmfp, mfp) 550*0Sstevel@tonic-gate DB_MPOOL *dbmp; 551*0Sstevel@tonic-gate DB_MPOOLFILE *dbmfp; 552*0Sstevel@tonic-gate MPOOLFILE *mfp; 553*0Sstevel@tonic-gate { 554*0Sstevel@tonic-gate int fd, ret; 555*0Sstevel@tonic-gate char *rpath; 556*0Sstevel@tonic-gate 557*0Sstevel@tonic-gate /* 558*0Sstevel@tonic-gate * !!! 559*0Sstevel@tonic-gate * We expect the handle to already be locked. 560*0Sstevel@tonic-gate */ 561*0Sstevel@tonic-gate 562*0Sstevel@tonic-gate /* Check to see if we've already upgraded. */ 563*0Sstevel@tonic-gate if (F_ISSET(dbmfp, MP_UPGRADE)) 564*0Sstevel@tonic-gate return (0); 565*0Sstevel@tonic-gate 566*0Sstevel@tonic-gate /* Check to see if we've already failed. */ 567*0Sstevel@tonic-gate if (F_ISSET(dbmfp, MP_UPGRADE_FAIL)) 568*0Sstevel@tonic-gate return (1); 569*0Sstevel@tonic-gate 570*0Sstevel@tonic-gate /* 571*0Sstevel@tonic-gate * Calculate the real name for this file and try to open it read/write. 572*0Sstevel@tonic-gate * We know we have a valid pathname for the file because it's the only 573*0Sstevel@tonic-gate * way we could have gotten a file descriptor of any kind. 574*0Sstevel@tonic-gate */ 575*0Sstevel@tonic-gate if ((ret = __db_appname(dbmp->dbenv, DB_APP_DATA, 576*0Sstevel@tonic-gate NULL, R_ADDR(dbmp, mfp->path_off), 0, NULL, &rpath)) != 0) 577*0Sstevel@tonic-gate return (ret); 578*0Sstevel@tonic-gate if (__db_open(rpath, 0, 0, 0, &fd) != 0) { 579*0Sstevel@tonic-gate F_SET(dbmfp, MP_UPGRADE_FAIL); 580*0Sstevel@tonic-gate ret = 1; 581*0Sstevel@tonic-gate } else { 582*0Sstevel@tonic-gate /* Swap the descriptors and set the upgrade flag. */ 583*0Sstevel@tonic-gate (void)__os_close(dbmfp->fd); 584*0Sstevel@tonic-gate dbmfp->fd = fd; 585*0Sstevel@tonic-gate F_SET(dbmfp, MP_UPGRADE); 586*0Sstevel@tonic-gate ret = 0; 587*0Sstevel@tonic-gate } 588*0Sstevel@tonic-gate __os_freestr(rpath); 589*0Sstevel@tonic-gate return (ret); 590*0Sstevel@tonic-gate } 591