1*0Sstevel@tonic-gate /*-
2*0Sstevel@tonic-gate * See the file LICENSE for redistribution information.
3*0Sstevel@tonic-gate *
4*0Sstevel@tonic-gate * Copyright (c) 1996, 1997, 1998
5*0Sstevel@tonic-gate * Sleepycat Software. All rights reserved.
6*0Sstevel@tonic-gate */
7*0Sstevel@tonic-gate
8*0Sstevel@tonic-gate #include "config.h"
9*0Sstevel@tonic-gate
10*0Sstevel@tonic-gate #ifndef lint
11*0Sstevel@tonic-gate static const char sccsid[] = "@(#)db_region.c 10.53 (Sleepycat) 11/10/98";
12*0Sstevel@tonic-gate #endif /* not lint */
13*0Sstevel@tonic-gate
14*0Sstevel@tonic-gate #ifndef NO_SYSTEM_INCLUDES
15*0Sstevel@tonic-gate #include <sys/types.h>
16*0Sstevel@tonic-gate
17*0Sstevel@tonic-gate #include <errno.h>
18*0Sstevel@tonic-gate #include <string.h>
19*0Sstevel@tonic-gate #include <unistd.h>
20*0Sstevel@tonic-gate #endif
21*0Sstevel@tonic-gate
22*0Sstevel@tonic-gate #include "db_int.h"
23*0Sstevel@tonic-gate #include "common_ext.h"
24*0Sstevel@tonic-gate
25*0Sstevel@tonic-gate static int __db_growregion __P((REGINFO *, size_t));
26*0Sstevel@tonic-gate
27*0Sstevel@tonic-gate /*
28*0Sstevel@tonic-gate * __db_rattach --
29*0Sstevel@tonic-gate * Optionally create and attach to a shared memory region.
30*0Sstevel@tonic-gate *
31*0Sstevel@tonic-gate * PUBLIC: int __db_rattach __P((REGINFO *));
32*0Sstevel@tonic-gate */
33*0Sstevel@tonic-gate int
__db_rattach(infop)34*0Sstevel@tonic-gate __db_rattach(infop)
35*0Sstevel@tonic-gate REGINFO *infop;
36*0Sstevel@tonic-gate {
37*0Sstevel@tonic-gate RLAYOUT *rlp, rl;
38*0Sstevel@tonic-gate size_t grow_region, size;
39*0Sstevel@tonic-gate ssize_t nr, nw;
40*0Sstevel@tonic-gate u_int32_t flags, mbytes, bytes;
41*0Sstevel@tonic-gate u_int8_t *p;
42*0Sstevel@tonic-gate int malloc_possible, ret, retry_cnt;
43*0Sstevel@tonic-gate
44*0Sstevel@tonic-gate grow_region = 0;
45*0Sstevel@tonic-gate malloc_possible = 1;
46*0Sstevel@tonic-gate ret = retry_cnt = 0;
47*0Sstevel@tonic-gate
48*0Sstevel@tonic-gate /* Round off the requested size to the next page boundary. */
49*0Sstevel@tonic-gate DB_ROUNDOFF(infop->size, DB_VMPAGESIZE);
50*0Sstevel@tonic-gate
51*0Sstevel@tonic-gate /* Some architectures have hard limits on the maximum region size. */
52*0Sstevel@tonic-gate #ifdef DB_REGIONSIZE_MAX
53*0Sstevel@tonic-gate if (infop->size > DB_REGIONSIZE_MAX) {
54*0Sstevel@tonic-gate __db_err(infop->dbenv, "__db_rattach: cache size too large");
55*0Sstevel@tonic-gate return (EINVAL);
56*0Sstevel@tonic-gate }
57*0Sstevel@tonic-gate #endif
58*0Sstevel@tonic-gate
59*0Sstevel@tonic-gate /* Intialize the return information in the REGINFO structure. */
60*0Sstevel@tonic-gate loop: infop->addr = NULL;
61*0Sstevel@tonic-gate infop->fd = -1;
62*0Sstevel@tonic-gate infop->segid = INVALID_SEGID;
63*0Sstevel@tonic-gate if (infop->name != NULL) {
64*0Sstevel@tonic-gate __os_freestr(infop->name);
65*0Sstevel@tonic-gate infop->name = NULL;
66*0Sstevel@tonic-gate }
67*0Sstevel@tonic-gate F_CLR(infop, REGION_CANGROW | REGION_CREATED);
68*0Sstevel@tonic-gate
69*0Sstevel@tonic-gate #ifndef HAVE_SPINLOCKS
70*0Sstevel@tonic-gate /*
71*0Sstevel@tonic-gate * XXX
72*0Sstevel@tonic-gate * Lacking spinlocks, we must have a file descriptor for fcntl(2)
73*0Sstevel@tonic-gate * locking, which implies using mmap(2) to map in a regular file.
74*0Sstevel@tonic-gate * (Theoretically, we could probably get a file descriptor to lock
75*0Sstevel@tonic-gate * other types of shared regions, but I don't see any reason to
76*0Sstevel@tonic-gate * bother.)
77*0Sstevel@tonic-gate *
78*0Sstevel@tonic-gate * Since we may be using shared memory regions, e.g., shmget(2),
79*0Sstevel@tonic-gate * and not mmap of regular files, the backing file may be only a
80*0Sstevel@tonic-gate * few tens of bytes in length. So, this depends on the ability
81*0Sstevel@tonic-gate * to fcntl lock file offsets much larger than the physical file.
82*0Sstevel@tonic-gate */
83*0Sstevel@tonic-gate malloc_possible = 0;
84*0Sstevel@tonic-gate #endif
85*0Sstevel@tonic-gate
86*0Sstevel@tonic-gate #ifdef __hppa
87*0Sstevel@tonic-gate /*
88*0Sstevel@tonic-gate * XXX
89*0Sstevel@tonic-gate * HP-UX won't permit mutexes to live in anything but shared memory.
90*0Sstevel@tonic-gate * Instantiate a shared region file on that architecture, regardless.
91*0Sstevel@tonic-gate */
92*0Sstevel@tonic-gate malloc_possible = 0;
93*0Sstevel@tonic-gate #endif
94*0Sstevel@tonic-gate /*
95*0Sstevel@tonic-gate * If a region is truly private, malloc the memory. That's faster
96*0Sstevel@tonic-gate * than either anonymous memory or a shared file.
97*0Sstevel@tonic-gate */
98*0Sstevel@tonic-gate if (malloc_possible && F_ISSET(infop, REGION_PRIVATE)) {
99*0Sstevel@tonic-gate if ((ret = __os_malloc(infop->size, NULL, &infop->addr)) != 0)
100*0Sstevel@tonic-gate return (ret);
101*0Sstevel@tonic-gate
102*0Sstevel@tonic-gate /*
103*0Sstevel@tonic-gate * It's sometimes significantly faster to page-fault in all of
104*0Sstevel@tonic-gate * the region's pages before we run the application, as we see
105*0Sstevel@tonic-gate * nasty side-effects when we page-fault while holding various
106*0Sstevel@tonic-gate * locks, i.e., the lock takes a long time to acquire because
107*0Sstevel@tonic-gate * of the underlying page fault, and the other threads convoy
108*0Sstevel@tonic-gate * behind the lock holder.
109*0Sstevel@tonic-gate */
110*0Sstevel@tonic-gate if (DB_GLOBAL(db_region_init))
111*0Sstevel@tonic-gate for (p = infop->addr;
112*0Sstevel@tonic-gate p < (u_int8_t *)infop->addr + infop->size;
113*0Sstevel@tonic-gate p += DB_VMPAGESIZE)
114*0Sstevel@tonic-gate p[0] = '\0';
115*0Sstevel@tonic-gate
116*0Sstevel@tonic-gate F_SET(infop, REGION_CREATED | REGION_MALLOC);
117*0Sstevel@tonic-gate goto region_init;
118*0Sstevel@tonic-gate }
119*0Sstevel@tonic-gate
120*0Sstevel@tonic-gate /*
121*0Sstevel@tonic-gate * Get the name of the region (creating the file if a temporary file
122*0Sstevel@tonic-gate * is being used). The dbenv contains the current DB environment,
123*0Sstevel@tonic-gate * including naming information. The path argument may be a file or
124*0Sstevel@tonic-gate * a directory. If path is a directory, it must exist and file is the
125*0Sstevel@tonic-gate * file name to be created inside the directory. If path is a file,
126*0Sstevel@tonic-gate * then file must be NULL.
127*0Sstevel@tonic-gate */
128*0Sstevel@tonic-gate if ((ret = __db_appname(infop->dbenv, infop->appname, infop->path,
129*0Sstevel@tonic-gate infop->file, infop->dbflags, &infop->fd, &infop->name)) != 0)
130*0Sstevel@tonic-gate return (ret);
131*0Sstevel@tonic-gate if (infop->fd != -1)
132*0Sstevel@tonic-gate F_SET(infop, REGION_CREATED);
133*0Sstevel@tonic-gate
134*0Sstevel@tonic-gate /*
135*0Sstevel@tonic-gate * Try to create the file, if we have authority. We have to make sure
136*0Sstevel@tonic-gate * that multiple threads/processes attempting to simultaneously create
137*0Sstevel@tonic-gate * the region are properly ordered, so we open it using DB_CREATE and
138*0Sstevel@tonic-gate * DB_EXCL, so two attempts to create the region will return failure in
139*0Sstevel@tonic-gate * one.
140*0Sstevel@tonic-gate */
141*0Sstevel@tonic-gate if (infop->fd == -1 && infop->dbflags & DB_CREATE) {
142*0Sstevel@tonic-gate flags = infop->dbflags;
143*0Sstevel@tonic-gate LF_SET(DB_EXCL);
144*0Sstevel@tonic-gate if ((ret = __db_open(infop->name,
145*0Sstevel@tonic-gate flags, flags, infop->mode, &infop->fd)) == 0)
146*0Sstevel@tonic-gate F_SET(infop, REGION_CREATED);
147*0Sstevel@tonic-gate else
148*0Sstevel@tonic-gate if (ret != EEXIST)
149*0Sstevel@tonic-gate goto errmsg;
150*0Sstevel@tonic-gate }
151*0Sstevel@tonic-gate
152*0Sstevel@tonic-gate /* If we couldn't create the file, try and open it. */
153*0Sstevel@tonic-gate if (infop->fd == -1) {
154*0Sstevel@tonic-gate flags = infop->dbflags;
155*0Sstevel@tonic-gate LF_CLR(DB_CREATE | DB_EXCL);
156*0Sstevel@tonic-gate if ((ret = __db_open(infop->name,
157*0Sstevel@tonic-gate flags, flags, infop->mode, &infop->fd)) != 0)
158*0Sstevel@tonic-gate goto errmsg;
159*0Sstevel@tonic-gate }
160*0Sstevel@tonic-gate
161*0Sstevel@tonic-gate /*
162*0Sstevel@tonic-gate * There are three cases we support:
163*0Sstevel@tonic-gate * 1. Named anonymous memory (shmget(2)).
164*0Sstevel@tonic-gate * 2. Unnamed anonymous memory (mmap(2): MAP_ANON/MAP_ANONYMOUS).
165*0Sstevel@tonic-gate * 3. Memory backed by a regular file (mmap(2)).
166*0Sstevel@tonic-gate *
167*0Sstevel@tonic-gate * We instantiate a backing file in all cases, which contains at least
168*0Sstevel@tonic-gate * the RLAYOUT structure, and in case #3, contains the actual region.
169*0Sstevel@tonic-gate * This is necessary for a couple of reasons:
170*0Sstevel@tonic-gate *
171*0Sstevel@tonic-gate * First, the mpool region uses temporary files to name regions, and
172*0Sstevel@tonic-gate * since you may have multiple regions in the same directory, we need
173*0Sstevel@tonic-gate * a filesystem name to ensure that they don't collide.
174*0Sstevel@tonic-gate *
175*0Sstevel@tonic-gate * Second, applications are allowed to forcibly remove regions, even
176*0Sstevel@tonic-gate * if they don't know anything about them other than the name. If a
177*0Sstevel@tonic-gate * region is backed by anonymous memory, there has to be some way for
178*0Sstevel@tonic-gate * the application to find out that information, and, in some cases,
179*0Sstevel@tonic-gate * determine ID information for the anonymous memory.
180*0Sstevel@tonic-gate */
181*0Sstevel@tonic-gate if (F_ISSET(infop, REGION_CREATED)) {
182*0Sstevel@tonic-gate /*
183*0Sstevel@tonic-gate * If we're using anonymous memory to back this region, set
184*0Sstevel@tonic-gate * the flag.
185*0Sstevel@tonic-gate */
186*0Sstevel@tonic-gate if (DB_GLOBAL(db_region_anon))
187*0Sstevel@tonic-gate F_SET(infop, REGION_ANONYMOUS);
188*0Sstevel@tonic-gate
189*0Sstevel@tonic-gate /*
190*0Sstevel@tonic-gate * If we're using a regular file to back a region we created,
191*0Sstevel@tonic-gate * grow it to the specified size.
192*0Sstevel@tonic-gate */
193*0Sstevel@tonic-gate if (!DB_GLOBAL(db_region_anon) &&
194*0Sstevel@tonic-gate (ret = __db_growregion(infop, infop->size)) != 0)
195*0Sstevel@tonic-gate goto err;
196*0Sstevel@tonic-gate } else {
197*0Sstevel@tonic-gate /*
198*0Sstevel@tonic-gate * If we're joining a region, figure out what it looks like.
199*0Sstevel@tonic-gate *
200*0Sstevel@tonic-gate * XXX
201*0Sstevel@tonic-gate * We have to figure out if the file is a regular file backing
202*0Sstevel@tonic-gate * a region that we want to map into our address space, or a
203*0Sstevel@tonic-gate * file with the information we need to find a shared anonymous
204*0Sstevel@tonic-gate * region that we want to map into our address space.
205*0Sstevel@tonic-gate *
206*0Sstevel@tonic-gate * All this noise is because some systems don't have a coherent
207*0Sstevel@tonic-gate * VM and buffer cache, and worse, if you mix operations on the
208*0Sstevel@tonic-gate * VM and buffer cache, half the time you hang the system.
209*0Sstevel@tonic-gate *
210*0Sstevel@tonic-gate * There are two possibilities. If the file is the size of an
211*0Sstevel@tonic-gate * RLAYOUT structure, then we know that the real region is in
212*0Sstevel@tonic-gate * shared memory, because otherwise it would be bigger. (As
213*0Sstevel@tonic-gate * the RLAYOUT structure size is smaller than a disk sector,
214*0Sstevel@tonic-gate * the only way it can be this size is if deliberately written
215*0Sstevel@tonic-gate * that way.) In which case, retrieve the information we need
216*0Sstevel@tonic-gate * from the RLAYOUT structure and use it to acquire the shared
217*0Sstevel@tonic-gate * memory.
218*0Sstevel@tonic-gate *
219*0Sstevel@tonic-gate * If the structure is larger than an RLAYOUT structure, then
220*0Sstevel@tonic-gate * the file is backing the shared memory region, and we use
221*0Sstevel@tonic-gate * the current size of the file without reading any information
222*0Sstevel@tonic-gate * from the file itself so that we don't confuse the VM.
223*0Sstevel@tonic-gate *
224*0Sstevel@tonic-gate * And yes, this makes me want to take somebody and kill them,
225*0Sstevel@tonic-gate * but I can't think of any other solution.
226*0Sstevel@tonic-gate */
227*0Sstevel@tonic-gate if ((ret = __os_ioinfo(infop->name,
228*0Sstevel@tonic-gate infop->fd, &mbytes, &bytes, NULL)) != 0)
229*0Sstevel@tonic-gate goto errmsg;
230*0Sstevel@tonic-gate size = mbytes * MEGABYTE + bytes;
231*0Sstevel@tonic-gate
232*0Sstevel@tonic-gate if (size <= sizeof(RLAYOUT)) {
233*0Sstevel@tonic-gate /*
234*0Sstevel@tonic-gate * If the size is too small, the read fails or the
235*0Sstevel@tonic-gate * valid flag is incorrect, assume it's because the
236*0Sstevel@tonic-gate * RLAYOUT information hasn't been written out yet,
237*0Sstevel@tonic-gate * and retry.
238*0Sstevel@tonic-gate */
239*0Sstevel@tonic-gate if (size < sizeof(RLAYOUT))
240*0Sstevel@tonic-gate goto retry;
241*0Sstevel@tonic-gate if ((ret =
242*0Sstevel@tonic-gate __os_read(infop->fd, &rl, sizeof(rl), &nr)) != 0)
243*0Sstevel@tonic-gate goto retry;
244*0Sstevel@tonic-gate if (rl.valid != DB_REGIONMAGIC)
245*0Sstevel@tonic-gate goto retry;
246*0Sstevel@tonic-gate
247*0Sstevel@tonic-gate /* Copy the size, memory id and characteristics. */
248*0Sstevel@tonic-gate size = rl.size;
249*0Sstevel@tonic-gate infop->segid = rl.segid;
250*0Sstevel@tonic-gate if (F_ISSET(&rl, REGION_ANONYMOUS))
251*0Sstevel@tonic-gate F_SET(infop, REGION_ANONYMOUS);
252*0Sstevel@tonic-gate }
253*0Sstevel@tonic-gate
254*0Sstevel@tonic-gate /*
255*0Sstevel@tonic-gate * If the region is larger than we think, that's okay, use the
256*0Sstevel@tonic-gate * current size. If it's smaller than we think, and we were
257*0Sstevel@tonic-gate * just using the default size, that's okay, use the current
258*0Sstevel@tonic-gate * size. If it's smaller than we think and we really care,
259*0Sstevel@tonic-gate * save the size and we'll catch that further down -- we can't
260*0Sstevel@tonic-gate * correct it here because we have to have a lock to grow the
261*0Sstevel@tonic-gate * region.
262*0Sstevel@tonic-gate */
263*0Sstevel@tonic-gate if (infop->size > size && !F_ISSET(infop, REGION_SIZEDEF))
264*0Sstevel@tonic-gate grow_region = infop->size;
265*0Sstevel@tonic-gate infop->size = size;
266*0Sstevel@tonic-gate }
267*0Sstevel@tonic-gate
268*0Sstevel@tonic-gate /*
269*0Sstevel@tonic-gate * Map the region into our address space. If we're creating it, the
270*0Sstevel@tonic-gate * underlying routines will make it the right size.
271*0Sstevel@tonic-gate *
272*0Sstevel@tonic-gate * There are at least two cases where we can "reasonably" fail when
273*0Sstevel@tonic-gate * we attempt to map in the region. On Windows/95, closing the last
274*0Sstevel@tonic-gate * reference to a region causes it to be zeroed out. On UNIX, when
275*0Sstevel@tonic-gate * using the shmget(2) interfaces, the region will no longer exist
276*0Sstevel@tonic-gate * if the system was rebooted. In these cases, the underlying map call
277*0Sstevel@tonic-gate * returns EAGAIN, and we *remove* our file and try again. There are
278*0Sstevel@tonic-gate * obvious races in doing this, but it should eventually settle down
279*0Sstevel@tonic-gate * to a winner and then things should proceed normally.
280*0Sstevel@tonic-gate */
281*0Sstevel@tonic-gate if ((ret = __db_mapregion(infop->name, infop)) != 0)
282*0Sstevel@tonic-gate if (ret == EAGAIN) {
283*0Sstevel@tonic-gate /*
284*0Sstevel@tonic-gate * Pretend we created the region even if we didn't so
285*0Sstevel@tonic-gate * that our error processing unlinks it.
286*0Sstevel@tonic-gate */
287*0Sstevel@tonic-gate F_SET(infop, REGION_CREATED);
288*0Sstevel@tonic-gate ret = 0;
289*0Sstevel@tonic-gate goto retry;
290*0Sstevel@tonic-gate } else
291*0Sstevel@tonic-gate goto err;
292*0Sstevel@tonic-gate
293*0Sstevel@tonic-gate region_init:
294*0Sstevel@tonic-gate /*
295*0Sstevel@tonic-gate * Initialize the common region information.
296*0Sstevel@tonic-gate *
297*0Sstevel@tonic-gate * !!!
298*0Sstevel@tonic-gate * We have to order the region creates so that two processes don't try
299*0Sstevel@tonic-gate * to simultaneously create the region. This is handled by using the
300*0Sstevel@tonic-gate * DB_CREATE and DB_EXCL flags when we create the "backing" region file.
301*0Sstevel@tonic-gate *
302*0Sstevel@tonic-gate * We also have to order region joins so that processes joining regions
303*0Sstevel@tonic-gate * never see inconsistent data. We'd like to play permissions games
304*0Sstevel@tonic-gate * with the backing file, but we can't because WNT filesystems won't
305*0Sstevel@tonic-gate * open a file mode 0.
306*0Sstevel@tonic-gate */
307*0Sstevel@tonic-gate rlp = (RLAYOUT *)infop->addr;
308*0Sstevel@tonic-gate if (F_ISSET(infop, REGION_CREATED)) {
309*0Sstevel@tonic-gate /*
310*0Sstevel@tonic-gate * The process creating the region acquires a lock before it
311*0Sstevel@tonic-gate * sets the valid flag. Any processes joining the region will
312*0Sstevel@tonic-gate * check the valid flag before acquiring the lock.
313*0Sstevel@tonic-gate *
314*0Sstevel@tonic-gate * Check the return of __db_mutex_init() and __db_mutex_lock(),
315*0Sstevel@tonic-gate * even though we don't usually check elsewhere. This is the
316*0Sstevel@tonic-gate * first lock we initialize and acquire, and we have to know if
317*0Sstevel@tonic-gate * it fails. (It CAN fail, e.g., SunOS, when using fcntl(2)
318*0Sstevel@tonic-gate * for locking, with an in-memory filesystem specified as the
319*0Sstevel@tonic-gate * database home.)
320*0Sstevel@tonic-gate */
321*0Sstevel@tonic-gate if ((ret = __db_mutex_init(&rlp->lock,
322*0Sstevel@tonic-gate MUTEX_LOCK_OFFSET(rlp, &rlp->lock))) != 0 ||
323*0Sstevel@tonic-gate (ret = __db_mutex_lock(&rlp->lock, infop->fd)) != 0)
324*0Sstevel@tonic-gate goto err;
325*0Sstevel@tonic-gate
326*0Sstevel@tonic-gate /* Initialize the remaining region information. */
327*0Sstevel@tonic-gate rlp->refcnt = 1;
328*0Sstevel@tonic-gate rlp->size = infop->size;
329*0Sstevel@tonic-gate db_version(&rlp->majver, &rlp->minver, &rlp->patch);
330*0Sstevel@tonic-gate rlp->panic = 0;
331*0Sstevel@tonic-gate rlp->segid = infop->segid;
332*0Sstevel@tonic-gate rlp->flags = 0;
333*0Sstevel@tonic-gate if (F_ISSET(infop, REGION_ANONYMOUS))
334*0Sstevel@tonic-gate F_SET(rlp, REGION_ANONYMOUS);
335*0Sstevel@tonic-gate
336*0Sstevel@tonic-gate /*
337*0Sstevel@tonic-gate * Fill in the valid field last -- use a magic number, memory
338*0Sstevel@tonic-gate * may not be zero-filled, and we want to minimize the chance
339*0Sstevel@tonic-gate * for collision.
340*0Sstevel@tonic-gate */
341*0Sstevel@tonic-gate rlp->valid = DB_REGIONMAGIC;
342*0Sstevel@tonic-gate
343*0Sstevel@tonic-gate /*
344*0Sstevel@tonic-gate * If the region is anonymous, write the RLAYOUT information
345*0Sstevel@tonic-gate * into the backing file so that future region join and unlink
346*0Sstevel@tonic-gate * calls can find it.
347*0Sstevel@tonic-gate *
348*0Sstevel@tonic-gate * XXX
349*0Sstevel@tonic-gate * We MUST do the seek before we do the write. On Win95, while
350*0Sstevel@tonic-gate * closing the last reference to an anonymous shared region
351*0Sstevel@tonic-gate * doesn't discard the region, it does zero it out. So, the
352*0Sstevel@tonic-gate * REGION_CREATED may be set, but the file may have already
353*0Sstevel@tonic-gate * been written and the file descriptor may be at the end of
354*0Sstevel@tonic-gate * the file.
355*0Sstevel@tonic-gate */
356*0Sstevel@tonic-gate if (F_ISSET(infop, REGION_ANONYMOUS)) {
357*0Sstevel@tonic-gate if ((ret = __os_seek(infop->fd, 0, 0, 0, 0, 0)) != 0)
358*0Sstevel@tonic-gate goto err;
359*0Sstevel@tonic-gate if ((ret =
360*0Sstevel@tonic-gate __os_write(infop->fd, rlp, sizeof(*rlp), &nw)) != 0)
361*0Sstevel@tonic-gate goto err;
362*0Sstevel@tonic-gate }
363*0Sstevel@tonic-gate } else {
364*0Sstevel@tonic-gate /* Check to see if the region has had catastrophic failure. */
365*0Sstevel@tonic-gate if (rlp->panic) {
366*0Sstevel@tonic-gate ret = DB_RUNRECOVERY;
367*0Sstevel@tonic-gate goto err;
368*0Sstevel@tonic-gate }
369*0Sstevel@tonic-gate
370*0Sstevel@tonic-gate /*
371*0Sstevel@tonic-gate * Check the valid flag to ensure the region is initialized.
372*0Sstevel@tonic-gate * If the valid flag has not been set, the mutex may not have
373*0Sstevel@tonic-gate * been initialized, and an attempt to get it could lead to
374*0Sstevel@tonic-gate * random behavior.
375*0Sstevel@tonic-gate */
376*0Sstevel@tonic-gate if (rlp->valid != DB_REGIONMAGIC)
377*0Sstevel@tonic-gate goto retry;
378*0Sstevel@tonic-gate
379*0Sstevel@tonic-gate /* Get the region lock. */
380*0Sstevel@tonic-gate (void)__db_mutex_lock(&rlp->lock, infop->fd);
381*0Sstevel@tonic-gate
382*0Sstevel@tonic-gate /*
383*0Sstevel@tonic-gate * We now own the region. There are a couple of things that
384*0Sstevel@tonic-gate * may have gone wrong, however.
385*0Sstevel@tonic-gate *
386*0Sstevel@tonic-gate * Problem #1: while we were waiting for the lock, the region
387*0Sstevel@tonic-gate * was deleted. Detected by re-checking the valid flag, since
388*0Sstevel@tonic-gate * it's cleared by the delete region routines.
389*0Sstevel@tonic-gate */
390*0Sstevel@tonic-gate if (rlp->valid != DB_REGIONMAGIC) {
391*0Sstevel@tonic-gate (void)__db_mutex_unlock(&rlp->lock, infop->fd);
392*0Sstevel@tonic-gate goto retry;
393*0Sstevel@tonic-gate }
394*0Sstevel@tonic-gate
395*0Sstevel@tonic-gate /*
396*0Sstevel@tonic-gate * Problem #3: when we checked the size of the file, it was
397*0Sstevel@tonic-gate * still growing as part of creation. Detected by the fact
398*0Sstevel@tonic-gate * that infop->size isn't the same size as the region.
399*0Sstevel@tonic-gate */
400*0Sstevel@tonic-gate if (infop->size != rlp->size) {
401*0Sstevel@tonic-gate (void)__db_mutex_unlock(&rlp->lock, infop->fd);
402*0Sstevel@tonic-gate goto retry;
403*0Sstevel@tonic-gate }
404*0Sstevel@tonic-gate
405*0Sstevel@tonic-gate /* Increment the reference count. */
406*0Sstevel@tonic-gate ++rlp->refcnt;
407*0Sstevel@tonic-gate }
408*0Sstevel@tonic-gate
409*0Sstevel@tonic-gate /* Return the region in a locked condition. */
410*0Sstevel@tonic-gate
411*0Sstevel@tonic-gate if (0) {
412*0Sstevel@tonic-gate errmsg: __db_err(infop->dbenv, "%s: %s", infop->name, strerror(ret));
413*0Sstevel@tonic-gate
414*0Sstevel@tonic-gate err:
415*0Sstevel@tonic-gate retry: /* Discard the region. */
416*0Sstevel@tonic-gate if (infop->addr != NULL) {
417*0Sstevel@tonic-gate (void)__db_unmapregion(infop);
418*0Sstevel@tonic-gate infop->addr = NULL;
419*0Sstevel@tonic-gate }
420*0Sstevel@tonic-gate
421*0Sstevel@tonic-gate /* Discard the backing file. */
422*0Sstevel@tonic-gate if (infop->fd != -1) {
423*0Sstevel@tonic-gate (void)__os_close(infop->fd);
424*0Sstevel@tonic-gate infop->fd = -1;
425*0Sstevel@tonic-gate
426*0Sstevel@tonic-gate if (F_ISSET(infop, REGION_CREATED))
427*0Sstevel@tonic-gate (void)__os_unlink(infop->name);
428*0Sstevel@tonic-gate }
429*0Sstevel@tonic-gate
430*0Sstevel@tonic-gate /* Discard the name. */
431*0Sstevel@tonic-gate if (infop->name != NULL) {
432*0Sstevel@tonic-gate __os_freestr(infop->name);
433*0Sstevel@tonic-gate infop->name = NULL;
434*0Sstevel@tonic-gate }
435*0Sstevel@tonic-gate
436*0Sstevel@tonic-gate /*
437*0Sstevel@tonic-gate * If we had a temporary error, wait a few seconds and
438*0Sstevel@tonic-gate * try again.
439*0Sstevel@tonic-gate */
440*0Sstevel@tonic-gate if (ret == 0) {
441*0Sstevel@tonic-gate if (++retry_cnt <= 3) {
442*0Sstevel@tonic-gate __os_sleep(retry_cnt * 2, 0);
443*0Sstevel@tonic-gate goto loop;
444*0Sstevel@tonic-gate }
445*0Sstevel@tonic-gate ret = EAGAIN;
446*0Sstevel@tonic-gate }
447*0Sstevel@tonic-gate }
448*0Sstevel@tonic-gate
449*0Sstevel@tonic-gate /*
450*0Sstevel@tonic-gate * XXX
451*0Sstevel@tonic-gate * HP-UX won't permit mutexes to live in anything but shared memory.
452*0Sstevel@tonic-gate * Instantiate a shared region file on that architecture, regardless.
453*0Sstevel@tonic-gate *
454*0Sstevel@tonic-gate * XXX
455*0Sstevel@tonic-gate * There's a problem in cleaning this up on application exit, or on
456*0Sstevel@tonic-gate * application failure. If an application opens a database without
457*0Sstevel@tonic-gate * an environment, we create a temporary backing mpool region for it.
458*0Sstevel@tonic-gate * That region is marked REGION_PRIVATE, but as HP-UX won't permit
459*0Sstevel@tonic-gate * mutexes to live in anything but shared memory, we instantiate a
460*0Sstevel@tonic-gate * real file plus a memory region of some form. If the application
461*0Sstevel@tonic-gate * crashes, the necessary information to delete the backing file and
462*0Sstevel@tonic-gate * any system region (e.g., the shmget(2) segment ID) is no longer
463*0Sstevel@tonic-gate * available. We can't completely fix the problem, but we try.
464*0Sstevel@tonic-gate *
465*0Sstevel@tonic-gate * The underlying UNIX __db_mapregion() code preferentially uses the
466*0Sstevel@tonic-gate * mmap(2) interface with the MAP_ANON/MAP_ANONYMOUS flags for regions
467*0Sstevel@tonic-gate * that are marked REGION_PRIVATE. This means that we normally aren't
468*0Sstevel@tonic-gate * holding any system resources when we get here, in which case we can
469*0Sstevel@tonic-gate * delete the backing file. This results in a short race, from the
470*0Sstevel@tonic-gate * __db_open() call above to here.
471*0Sstevel@tonic-gate *
472*0Sstevel@tonic-gate * If, for some reason, we are holding system resources when we get
473*0Sstevel@tonic-gate * here, we don't have any choice -- we can't delete the backing file
474*0Sstevel@tonic-gate * because we may need it to detach from the resources. Set the
475*0Sstevel@tonic-gate * REGION_LASTDETACH flag, so that we do all necessary cleanup when
476*0Sstevel@tonic-gate * the application closes the region.
477*0Sstevel@tonic-gate */
478*0Sstevel@tonic-gate if (F_ISSET(infop, REGION_PRIVATE) && !F_ISSET(infop, REGION_MALLOC))
479*0Sstevel@tonic-gate if (F_ISSET(infop, REGION_HOLDINGSYS))
480*0Sstevel@tonic-gate F_SET(infop, REGION_LASTDETACH);
481*0Sstevel@tonic-gate else {
482*0Sstevel@tonic-gate F_SET(infop, REGION_REMOVED);
483*0Sstevel@tonic-gate F_CLR(infop, REGION_CANGROW);
484*0Sstevel@tonic-gate
485*0Sstevel@tonic-gate (void)__os_close(infop->fd);
486*0Sstevel@tonic-gate (void)__os_unlink(infop->name);
487*0Sstevel@tonic-gate }
488*0Sstevel@tonic-gate
489*0Sstevel@tonic-gate return (ret);
490*0Sstevel@tonic-gate }
491*0Sstevel@tonic-gate
492*0Sstevel@tonic-gate /*
493*0Sstevel@tonic-gate * __db_rdetach --
494*0Sstevel@tonic-gate * De-attach from a shared memory region.
495*0Sstevel@tonic-gate *
496*0Sstevel@tonic-gate * PUBLIC: int __db_rdetach __P((REGINFO *));
497*0Sstevel@tonic-gate */
498*0Sstevel@tonic-gate int
__db_rdetach(infop)499*0Sstevel@tonic-gate __db_rdetach(infop)
500*0Sstevel@tonic-gate REGINFO *infop;
501*0Sstevel@tonic-gate {
502*0Sstevel@tonic-gate RLAYOUT *rlp;
503*0Sstevel@tonic-gate int detach, ret, t_ret;
504*0Sstevel@tonic-gate
505*0Sstevel@tonic-gate ret = 0;
506*0Sstevel@tonic-gate
507*0Sstevel@tonic-gate /*
508*0Sstevel@tonic-gate * If the region was removed when it was created, no further action
509*0Sstevel@tonic-gate * is required.
510*0Sstevel@tonic-gate */
511*0Sstevel@tonic-gate if (F_ISSET(infop, REGION_REMOVED))
512*0Sstevel@tonic-gate goto done;
513*0Sstevel@tonic-gate /*
514*0Sstevel@tonic-gate * If the region was created in memory returned by malloc, the only
515*0Sstevel@tonic-gate * action required is freeing the memory.
516*0Sstevel@tonic-gate */
517*0Sstevel@tonic-gate if (F_ISSET(infop, REGION_MALLOC)) {
518*0Sstevel@tonic-gate __os_free(infop->addr, 0);
519*0Sstevel@tonic-gate goto done;
520*0Sstevel@tonic-gate }
521*0Sstevel@tonic-gate
522*0Sstevel@tonic-gate /* Otherwise, attach to the region and optionally delete it. */
523*0Sstevel@tonic-gate rlp = infop->addr;
524*0Sstevel@tonic-gate
525*0Sstevel@tonic-gate /* Get the lock. */
526*0Sstevel@tonic-gate (void)__db_mutex_lock(&rlp->lock, infop->fd);
527*0Sstevel@tonic-gate
528*0Sstevel@tonic-gate /* Decrement the reference count. */
529*0Sstevel@tonic-gate if (rlp->refcnt == 0)
530*0Sstevel@tonic-gate __db_err(infop->dbenv,
531*0Sstevel@tonic-gate "region rdetach: reference count went to zero!");
532*0Sstevel@tonic-gate else
533*0Sstevel@tonic-gate --rlp->refcnt;
534*0Sstevel@tonic-gate
535*0Sstevel@tonic-gate /*
536*0Sstevel@tonic-gate * If we're going to remove the region, clear the valid flag so
537*0Sstevel@tonic-gate * that any region join that's blocked waiting for us will know
538*0Sstevel@tonic-gate * what happened.
539*0Sstevel@tonic-gate */
540*0Sstevel@tonic-gate detach = 0;
541*0Sstevel@tonic-gate if (F_ISSET(infop, REGION_LASTDETACH))
542*0Sstevel@tonic-gate if (rlp->refcnt == 0) {
543*0Sstevel@tonic-gate detach = 1;
544*0Sstevel@tonic-gate rlp->valid = 0;
545*0Sstevel@tonic-gate } else
546*0Sstevel@tonic-gate ret = EBUSY;
547*0Sstevel@tonic-gate
548*0Sstevel@tonic-gate /* Release the lock. */
549*0Sstevel@tonic-gate (void)__db_mutex_unlock(&rlp->lock, infop->fd);
550*0Sstevel@tonic-gate
551*0Sstevel@tonic-gate /* Close the backing file descriptor. */
552*0Sstevel@tonic-gate (void)__os_close(infop->fd);
553*0Sstevel@tonic-gate infop->fd = -1;
554*0Sstevel@tonic-gate
555*0Sstevel@tonic-gate /* Discard our mapping of the region. */
556*0Sstevel@tonic-gate if ((t_ret = __db_unmapregion(infop)) != 0 && ret == 0)
557*0Sstevel@tonic-gate ret = t_ret;
558*0Sstevel@tonic-gate
559*0Sstevel@tonic-gate /* Discard the region itself. */
560*0Sstevel@tonic-gate if (detach) {
561*0Sstevel@tonic-gate if ((t_ret =
562*0Sstevel@tonic-gate __db_unlinkregion(infop->name, infop) != 0) && ret == 0)
563*0Sstevel@tonic-gate ret = t_ret;
564*0Sstevel@tonic-gate if ((t_ret = __os_unlink(infop->name) != 0) && ret == 0)
565*0Sstevel@tonic-gate ret = t_ret;
566*0Sstevel@tonic-gate }
567*0Sstevel@tonic-gate
568*0Sstevel@tonic-gate done: /* Discard the name. */
569*0Sstevel@tonic-gate if (infop->name != NULL) {
570*0Sstevel@tonic-gate __os_freestr(infop->name);
571*0Sstevel@tonic-gate infop->name = NULL;
572*0Sstevel@tonic-gate }
573*0Sstevel@tonic-gate
574*0Sstevel@tonic-gate return (ret);
575*0Sstevel@tonic-gate }
576*0Sstevel@tonic-gate
577*0Sstevel@tonic-gate /*
578*0Sstevel@tonic-gate * __db_runlink --
579*0Sstevel@tonic-gate * Remove a region.
580*0Sstevel@tonic-gate *
581*0Sstevel@tonic-gate * PUBLIC: int __db_runlink __P((REGINFO *, int));
582*0Sstevel@tonic-gate */
583*0Sstevel@tonic-gate int
__db_runlink(infop,force)584*0Sstevel@tonic-gate __db_runlink(infop, force)
585*0Sstevel@tonic-gate REGINFO *infop;
586*0Sstevel@tonic-gate int force;
587*0Sstevel@tonic-gate {
588*0Sstevel@tonic-gate RLAYOUT rl, *rlp;
589*0Sstevel@tonic-gate size_t size;
590*0Sstevel@tonic-gate ssize_t nr;
591*0Sstevel@tonic-gate u_int32_t mbytes, bytes;
592*0Sstevel@tonic-gate int fd, ret, t_ret;
593*0Sstevel@tonic-gate char *name;
594*0Sstevel@tonic-gate
595*0Sstevel@tonic-gate /*
596*0Sstevel@tonic-gate * XXX
597*0Sstevel@tonic-gate * We assume that we've created a new REGINFO structure for this
598*0Sstevel@tonic-gate * call, not used one that was already initialized. Regardless,
599*0Sstevel@tonic-gate * if anyone is planning to use it after we're done, they're going
600*0Sstevel@tonic-gate * to be sorely disappointed.
601*0Sstevel@tonic-gate *
602*0Sstevel@tonic-gate * If force isn't set, we attach to the region, set a flag to delete
603*0Sstevel@tonic-gate * the region on last close, and let the region delete code do the
604*0Sstevel@tonic-gate * work.
605*0Sstevel@tonic-gate */
606*0Sstevel@tonic-gate if (!force) {
607*0Sstevel@tonic-gate if ((ret = __db_rattach(infop)) != 0)
608*0Sstevel@tonic-gate return (ret);
609*0Sstevel@tonic-gate
610*0Sstevel@tonic-gate rlp = (RLAYOUT *)infop->addr;
611*0Sstevel@tonic-gate (void)__db_mutex_unlock(&rlp->lock, infop->fd);
612*0Sstevel@tonic-gate
613*0Sstevel@tonic-gate F_SET(infop, REGION_LASTDETACH);
614*0Sstevel@tonic-gate
615*0Sstevel@tonic-gate return (__db_rdetach(infop));
616*0Sstevel@tonic-gate }
617*0Sstevel@tonic-gate
618*0Sstevel@tonic-gate /*
619*0Sstevel@tonic-gate * Otherwise, we don't want to attach to the region. We may have been
620*0Sstevel@tonic-gate * called to clean up if a process died leaving a region locked and/or
621*0Sstevel@tonic-gate * corrupted, which could cause the attach to hang.
622*0Sstevel@tonic-gate */
623*0Sstevel@tonic-gate if ((ret = __db_appname(infop->dbenv, infop->appname,
624*0Sstevel@tonic-gate infop->path, infop->file, infop->dbflags, NULL, &name)) != 0)
625*0Sstevel@tonic-gate return (ret);
626*0Sstevel@tonic-gate
627*0Sstevel@tonic-gate /*
628*0Sstevel@tonic-gate * An underlying file is created for all regions other than private
629*0Sstevel@tonic-gate * (REGION_PRIVATE) ones, regardless of whether or not it's used to
630*0Sstevel@tonic-gate * back the region. If that file doesn't exist, we're done.
631*0Sstevel@tonic-gate */
632*0Sstevel@tonic-gate if (__os_exists(name, NULL) != 0) {
633*0Sstevel@tonic-gate __os_freestr(name);
634*0Sstevel@tonic-gate return (0);
635*0Sstevel@tonic-gate }
636*0Sstevel@tonic-gate
637*0Sstevel@tonic-gate /*
638*0Sstevel@tonic-gate * See the comments in __db_rattach -- figure out if this is a regular
639*0Sstevel@tonic-gate * file backing a region or if it's a regular file with information
640*0Sstevel@tonic-gate * about a region.
641*0Sstevel@tonic-gate */
642*0Sstevel@tonic-gate if ((ret = __db_open(name, DB_RDONLY, DB_RDONLY, 0, &fd)) != 0)
643*0Sstevel@tonic-gate goto errmsg;
644*0Sstevel@tonic-gate if ((ret = __os_ioinfo(name, fd, &mbytes, &bytes, NULL)) != 0)
645*0Sstevel@tonic-gate goto errmsg;
646*0Sstevel@tonic-gate size = mbytes * MEGABYTE + bytes;
647*0Sstevel@tonic-gate
648*0Sstevel@tonic-gate if (size <= sizeof(RLAYOUT)) {
649*0Sstevel@tonic-gate if ((ret = __os_read(fd, &rl, sizeof(rl), &nr)) != 0)
650*0Sstevel@tonic-gate goto errmsg;
651*0Sstevel@tonic-gate if (rl.valid != DB_REGIONMAGIC) {
652*0Sstevel@tonic-gate __db_err(infop->dbenv,
653*0Sstevel@tonic-gate "%s: illegal region magic number", name);
654*0Sstevel@tonic-gate ret = EINVAL;
655*0Sstevel@tonic-gate goto err;
656*0Sstevel@tonic-gate }
657*0Sstevel@tonic-gate
658*0Sstevel@tonic-gate /* Set the size, memory id and characteristics. */
659*0Sstevel@tonic-gate infop->size = rl.size;
660*0Sstevel@tonic-gate infop->segid = rl.segid;
661*0Sstevel@tonic-gate if (F_ISSET(&rl, REGION_ANONYMOUS))
662*0Sstevel@tonic-gate F_SET(infop, REGION_ANONYMOUS);
663*0Sstevel@tonic-gate } else {
664*0Sstevel@tonic-gate infop->size = size;
665*0Sstevel@tonic-gate infop->segid = INVALID_SEGID;
666*0Sstevel@tonic-gate }
667*0Sstevel@tonic-gate
668*0Sstevel@tonic-gate /* Remove the underlying region. */
669*0Sstevel@tonic-gate ret = __db_unlinkregion(name, infop);
670*0Sstevel@tonic-gate
671*0Sstevel@tonic-gate /*
672*0Sstevel@tonic-gate * Unlink the backing file. Close the open file descriptor first,
673*0Sstevel@tonic-gate * because some architectures (e.g., Win32) won't unlink a file if
674*0Sstevel@tonic-gate * open file descriptors remain.
675*0Sstevel@tonic-gate */
676*0Sstevel@tonic-gate (void)__os_close(fd);
677*0Sstevel@tonic-gate if ((t_ret = __os_unlink(name)) != 0 && ret == 0)
678*0Sstevel@tonic-gate ret = t_ret;
679*0Sstevel@tonic-gate
680*0Sstevel@tonic-gate if (0) {
681*0Sstevel@tonic-gate errmsg: __db_err(infop->dbenv, "%s: %s", name, strerror(ret));
682*0Sstevel@tonic-gate err: (void)__os_close(fd);
683*0Sstevel@tonic-gate }
684*0Sstevel@tonic-gate
685*0Sstevel@tonic-gate __os_freestr(name);
686*0Sstevel@tonic-gate return (ret);
687*0Sstevel@tonic-gate }
688*0Sstevel@tonic-gate
689*0Sstevel@tonic-gate /*
690*0Sstevel@tonic-gate * __db_rgrow --
691*0Sstevel@tonic-gate * Extend a region.
692*0Sstevel@tonic-gate *
693*0Sstevel@tonic-gate * PUBLIC: int __db_rgrow __P((REGINFO *, size_t));
694*0Sstevel@tonic-gate */
695*0Sstevel@tonic-gate int
__db_rgrow(infop,new_size)696*0Sstevel@tonic-gate __db_rgrow(infop, new_size)
697*0Sstevel@tonic-gate REGINFO *infop;
698*0Sstevel@tonic-gate size_t new_size;
699*0Sstevel@tonic-gate {
700*0Sstevel@tonic-gate RLAYOUT *rlp;
701*0Sstevel@tonic-gate size_t increment;
702*0Sstevel@tonic-gate int ret;
703*0Sstevel@tonic-gate
704*0Sstevel@tonic-gate /*
705*0Sstevel@tonic-gate * !!!
706*0Sstevel@tonic-gate * This routine MUST be called with the region already locked.
707*0Sstevel@tonic-gate */
708*0Sstevel@tonic-gate
709*0Sstevel@tonic-gate /* The underlying routines have flagged if this region can grow. */
710*0Sstevel@tonic-gate if (!F_ISSET(infop, REGION_CANGROW))
711*0Sstevel@tonic-gate return (EINVAL);
712*0Sstevel@tonic-gate
713*0Sstevel@tonic-gate /*
714*0Sstevel@tonic-gate * Round off the requested size to the next page boundary, and
715*0Sstevel@tonic-gate * determine the additional space required.
716*0Sstevel@tonic-gate */
717*0Sstevel@tonic-gate rlp = (RLAYOUT *)infop->addr;
718*0Sstevel@tonic-gate DB_ROUNDOFF(new_size, DB_VMPAGESIZE);
719*0Sstevel@tonic-gate increment = new_size - rlp->size;
720*0Sstevel@tonic-gate
721*0Sstevel@tonic-gate if ((ret = __db_growregion(infop, increment)) != 0)
722*0Sstevel@tonic-gate return (ret);
723*0Sstevel@tonic-gate
724*0Sstevel@tonic-gate /* Update the on-disk region size. */
725*0Sstevel@tonic-gate rlp->size = new_size;
726*0Sstevel@tonic-gate
727*0Sstevel@tonic-gate /* Detach from and reattach to the region. */
728*0Sstevel@tonic-gate return (__db_rreattach(infop, new_size));
729*0Sstevel@tonic-gate }
730*0Sstevel@tonic-gate
731*0Sstevel@tonic-gate /*
732*0Sstevel@tonic-gate * __db_growregion --
733*0Sstevel@tonic-gate * Grow a shared memory region.
734*0Sstevel@tonic-gate */
735*0Sstevel@tonic-gate static int
__db_growregion(infop,increment)736*0Sstevel@tonic-gate __db_growregion(infop, increment)
737*0Sstevel@tonic-gate REGINFO *infop;
738*0Sstevel@tonic-gate size_t increment;
739*0Sstevel@tonic-gate {
740*0Sstevel@tonic-gate db_pgno_t pages;
741*0Sstevel@tonic-gate size_t i;
742*0Sstevel@tonic-gate ssize_t nr, nw;
743*0Sstevel@tonic-gate u_int32_t relative;
744*0Sstevel@tonic-gate int ret;
745*0Sstevel@tonic-gate char buf[DB_VMPAGESIZE];
746*0Sstevel@tonic-gate
747*0Sstevel@tonic-gate /* Seek to the end of the region. */
748*0Sstevel@tonic-gate if ((ret = __os_seek(infop->fd, 0, 0, 0, 0, SEEK_END)) != 0)
749*0Sstevel@tonic-gate goto err;
750*0Sstevel@tonic-gate
751*0Sstevel@tonic-gate /* Write nuls to the new bytes. */
752*0Sstevel@tonic-gate memset(buf, 0, sizeof(buf));
753*0Sstevel@tonic-gate
754*0Sstevel@tonic-gate /*
755*0Sstevel@tonic-gate * Some systems require that all of the bytes of the region be
756*0Sstevel@tonic-gate * written before it can be mapped and accessed randomly, and
757*0Sstevel@tonic-gate * other systems don't zero out the pages.
758*0Sstevel@tonic-gate */
759*0Sstevel@tonic-gate if (__db_mapinit())
760*0Sstevel@tonic-gate /* Extend the region by writing each new page. */
761*0Sstevel@tonic-gate for (i = 0; i < increment; i += DB_VMPAGESIZE) {
762*0Sstevel@tonic-gate if ((ret =
763*0Sstevel@tonic-gate __os_write(infop->fd, buf, sizeof(buf), &nw)) != 0)
764*0Sstevel@tonic-gate goto err;
765*0Sstevel@tonic-gate if (nw != sizeof(buf))
766*0Sstevel@tonic-gate goto eio;
767*0Sstevel@tonic-gate }
768*0Sstevel@tonic-gate else {
769*0Sstevel@tonic-gate /*
770*0Sstevel@tonic-gate * Extend the region by writing the last page. If the region
771*0Sstevel@tonic-gate * is >4Gb, increment may be larger than the maximum possible
772*0Sstevel@tonic-gate * seek "relative" argument, as it's an unsigned 32-bit value.
773*0Sstevel@tonic-gate * Break the offset into pages of 1MB each so that we don't
774*0Sstevel@tonic-gate * overflow (2^20 + 2^32 is bigger than any memory I expect
775*0Sstevel@tonic-gate * to see for awhile).
776*0Sstevel@tonic-gate */
777*0Sstevel@tonic-gate pages = (increment - DB_VMPAGESIZE) / MEGABYTE;
778*0Sstevel@tonic-gate relative = (increment - DB_VMPAGESIZE) % MEGABYTE;
779*0Sstevel@tonic-gate if ((ret = __os_seek(infop->fd,
780*0Sstevel@tonic-gate MEGABYTE, pages, relative, 0, SEEK_CUR)) != 0)
781*0Sstevel@tonic-gate goto err;
782*0Sstevel@tonic-gate if ((ret = __os_write(infop->fd, buf, sizeof(buf), &nw)) != 0)
783*0Sstevel@tonic-gate goto err;
784*0Sstevel@tonic-gate if (nw != sizeof(buf))
785*0Sstevel@tonic-gate goto eio;
786*0Sstevel@tonic-gate
787*0Sstevel@tonic-gate /*
788*0Sstevel@tonic-gate * It's sometimes significantly faster to page-fault in all of
789*0Sstevel@tonic-gate * the region's pages before we run the application, as we see
790*0Sstevel@tonic-gate * nasty side-effects when we page-fault while holding various
791*0Sstevel@tonic-gate * locks, i.e., the lock takes a long time to acquire because
792*0Sstevel@tonic-gate * of the underlying page fault, and the other threads convoy
793*0Sstevel@tonic-gate * behind the lock holder.
794*0Sstevel@tonic-gate *
795*0Sstevel@tonic-gate * We also use REGION_INIT to guarantee that there is enough
796*0Sstevel@tonic-gate * disk space for the region, so we also write a byte to each
797*0Sstevel@tonic-gate * page. Reading the byte is insufficient as some systems
798*0Sstevel@tonic-gate * (e.g., Solaris) do not instantiate disk pages to satisfy
799*0Sstevel@tonic-gate * a read, and so we don't know if there is enough disk space
800*0Sstevel@tonic-gate * or not.
801*0Sstevel@tonic-gate */
802*0Sstevel@tonic-gate if (DB_GLOBAL(db_region_init)) {
803*0Sstevel@tonic-gate pages = increment / MEGABYTE;
804*0Sstevel@tonic-gate relative = increment % MEGABYTE;
805*0Sstevel@tonic-gate if ((ret = __os_seek(infop->fd,
806*0Sstevel@tonic-gate MEGABYTE, pages, relative, 1, SEEK_END)) != 0)
807*0Sstevel@tonic-gate goto err;
808*0Sstevel@tonic-gate
809*0Sstevel@tonic-gate /* Write a byte to each page. */
810*0Sstevel@tonic-gate for (i = 0; i < increment; i += DB_VMPAGESIZE) {
811*0Sstevel@tonic-gate if ((ret =
812*0Sstevel@tonic-gate __os_write(infop->fd, buf, 1, &nr)) != 0)
813*0Sstevel@tonic-gate goto err;
814*0Sstevel@tonic-gate if (nr != 1)
815*0Sstevel@tonic-gate goto eio;
816*0Sstevel@tonic-gate if ((ret = __os_seek(infop->fd,
817*0Sstevel@tonic-gate 0, 0, DB_VMPAGESIZE - 1, 0, SEEK_CUR)) != 0)
818*0Sstevel@tonic-gate goto err;
819*0Sstevel@tonic-gate }
820*0Sstevel@tonic-gate }
821*0Sstevel@tonic-gate }
822*0Sstevel@tonic-gate return (0);
823*0Sstevel@tonic-gate
824*0Sstevel@tonic-gate eio: ret = EIO;
825*0Sstevel@tonic-gate err: __db_err(infop->dbenv, "region grow: %s", strerror(ret));
826*0Sstevel@tonic-gate return (ret);
827*0Sstevel@tonic-gate }
828*0Sstevel@tonic-gate
829*0Sstevel@tonic-gate /*
830*0Sstevel@tonic-gate * __db_rreattach --
831*0Sstevel@tonic-gate * Detach from and reattach to a region.
832*0Sstevel@tonic-gate *
833*0Sstevel@tonic-gate * PUBLIC: int __db_rreattach __P((REGINFO *, size_t));
834*0Sstevel@tonic-gate */
835*0Sstevel@tonic-gate int
__db_rreattach(infop,new_size)836*0Sstevel@tonic-gate __db_rreattach(infop, new_size)
837*0Sstevel@tonic-gate REGINFO *infop;
838*0Sstevel@tonic-gate size_t new_size;
839*0Sstevel@tonic-gate {
840*0Sstevel@tonic-gate int ret;
841*0Sstevel@tonic-gate
842*0Sstevel@tonic-gate #ifdef DIAGNOSTIC
843*0Sstevel@tonic-gate if (infop->name == NULL) {
844*0Sstevel@tonic-gate __db_err(infop->dbenv, "__db_rreattach: name was NULL");
845*0Sstevel@tonic-gate return (EINVAL);
846*0Sstevel@tonic-gate }
847*0Sstevel@tonic-gate #endif
848*0Sstevel@tonic-gate /*
849*0Sstevel@tonic-gate * If we're growing an already mapped region, we have to unmap it
850*0Sstevel@tonic-gate * and get it back. We have it locked, so nobody else can get in,
851*0Sstevel@tonic-gate * which makes it fairly straight-forward to do, as everybody else
852*0Sstevel@tonic-gate * is going to block while we do the unmap/remap. NB: if we fail
853*0Sstevel@tonic-gate * to get it back, the pooch is genuinely screwed, because we can
854*0Sstevel@tonic-gate * never release the lock we're holding.
855*0Sstevel@tonic-gate *
856*0Sstevel@tonic-gate * Detach from the region. We have to do this first so architectures
857*0Sstevel@tonic-gate * that don't permit a file to be mapped into different places in the
858*0Sstevel@tonic-gate * address space simultaneously, e.g., HP's PaRisc, will work.
859*0Sstevel@tonic-gate */
860*0Sstevel@tonic-gate if ((ret = __db_unmapregion(infop)) != 0)
861*0Sstevel@tonic-gate return (ret);
862*0Sstevel@tonic-gate
863*0Sstevel@tonic-gate /* Update the caller's REGINFO size to the new map size. */
864*0Sstevel@tonic-gate infop->size = new_size;
865*0Sstevel@tonic-gate
866*0Sstevel@tonic-gate /* Attach to the region. */
867*0Sstevel@tonic-gate ret = __db_mapregion(infop->name, infop);
868*0Sstevel@tonic-gate
869*0Sstevel@tonic-gate return (ret);
870*0Sstevel@tonic-gate }
871