xref: /onnv-gate/usr/src/cmd/sendmail/db/db/db_region.c (revision 0:68f95e015346)
1*0Sstevel@tonic-gate /*-
2*0Sstevel@tonic-gate  * See the file LICENSE for redistribution information.
3*0Sstevel@tonic-gate  *
4*0Sstevel@tonic-gate  * Copyright (c) 1996, 1997, 1998
5*0Sstevel@tonic-gate  *	Sleepycat Software.  All rights reserved.
6*0Sstevel@tonic-gate  */
7*0Sstevel@tonic-gate 
8*0Sstevel@tonic-gate #include "config.h"
9*0Sstevel@tonic-gate 
10*0Sstevel@tonic-gate #ifndef lint
11*0Sstevel@tonic-gate static const char sccsid[] = "@(#)db_region.c	10.53 (Sleepycat) 11/10/98";
12*0Sstevel@tonic-gate #endif /* not lint */
13*0Sstevel@tonic-gate 
14*0Sstevel@tonic-gate #ifndef NO_SYSTEM_INCLUDES
15*0Sstevel@tonic-gate #include <sys/types.h>
16*0Sstevel@tonic-gate 
17*0Sstevel@tonic-gate #include <errno.h>
18*0Sstevel@tonic-gate #include <string.h>
19*0Sstevel@tonic-gate #include <unistd.h>
20*0Sstevel@tonic-gate #endif
21*0Sstevel@tonic-gate 
22*0Sstevel@tonic-gate #include "db_int.h"
23*0Sstevel@tonic-gate #include "common_ext.h"
24*0Sstevel@tonic-gate 
25*0Sstevel@tonic-gate static int __db_growregion __P((REGINFO *, size_t));
26*0Sstevel@tonic-gate 
27*0Sstevel@tonic-gate /*
28*0Sstevel@tonic-gate  * __db_rattach --
29*0Sstevel@tonic-gate  *	Optionally create and attach to a shared memory region.
30*0Sstevel@tonic-gate  *
31*0Sstevel@tonic-gate  * PUBLIC: int __db_rattach __P((REGINFO *));
32*0Sstevel@tonic-gate  */
33*0Sstevel@tonic-gate int
__db_rattach(infop)34*0Sstevel@tonic-gate __db_rattach(infop)
35*0Sstevel@tonic-gate 	REGINFO *infop;
36*0Sstevel@tonic-gate {
37*0Sstevel@tonic-gate 	RLAYOUT *rlp, rl;
38*0Sstevel@tonic-gate 	size_t grow_region, size;
39*0Sstevel@tonic-gate 	ssize_t nr, nw;
40*0Sstevel@tonic-gate 	u_int32_t flags, mbytes, bytes;
41*0Sstevel@tonic-gate 	u_int8_t *p;
42*0Sstevel@tonic-gate 	int malloc_possible, ret, retry_cnt;
43*0Sstevel@tonic-gate 
44*0Sstevel@tonic-gate 	grow_region = 0;
45*0Sstevel@tonic-gate 	malloc_possible = 1;
46*0Sstevel@tonic-gate 	ret = retry_cnt = 0;
47*0Sstevel@tonic-gate 
48*0Sstevel@tonic-gate 	/* Round off the requested size to the next page boundary. */
49*0Sstevel@tonic-gate 	DB_ROUNDOFF(infop->size, DB_VMPAGESIZE);
50*0Sstevel@tonic-gate 
51*0Sstevel@tonic-gate 	/* Some architectures have hard limits on the maximum region size. */
52*0Sstevel@tonic-gate #ifdef DB_REGIONSIZE_MAX
53*0Sstevel@tonic-gate 	if (infop->size > DB_REGIONSIZE_MAX) {
54*0Sstevel@tonic-gate 		__db_err(infop->dbenv, "__db_rattach: cache size too large");
55*0Sstevel@tonic-gate 		return (EINVAL);
56*0Sstevel@tonic-gate 	}
57*0Sstevel@tonic-gate #endif
58*0Sstevel@tonic-gate 
59*0Sstevel@tonic-gate 	/* Intialize the return information in the REGINFO structure. */
60*0Sstevel@tonic-gate loop:	infop->addr = NULL;
61*0Sstevel@tonic-gate 	infop->fd = -1;
62*0Sstevel@tonic-gate 	infop->segid = INVALID_SEGID;
63*0Sstevel@tonic-gate 	if (infop->name != NULL) {
64*0Sstevel@tonic-gate 		__os_freestr(infop->name);
65*0Sstevel@tonic-gate 		infop->name = NULL;
66*0Sstevel@tonic-gate 	}
67*0Sstevel@tonic-gate 	F_CLR(infop, REGION_CANGROW | REGION_CREATED);
68*0Sstevel@tonic-gate 
69*0Sstevel@tonic-gate #ifndef HAVE_SPINLOCKS
70*0Sstevel@tonic-gate 	/*
71*0Sstevel@tonic-gate 	 * XXX
72*0Sstevel@tonic-gate 	 * Lacking spinlocks, we must have a file descriptor for fcntl(2)
73*0Sstevel@tonic-gate 	 * locking, which implies using mmap(2) to map in a regular file.
74*0Sstevel@tonic-gate 	 * (Theoretically, we could probably get a file descriptor to lock
75*0Sstevel@tonic-gate 	 * other types of shared regions, but I don't see any reason to
76*0Sstevel@tonic-gate 	 * bother.)
77*0Sstevel@tonic-gate 	 *
78*0Sstevel@tonic-gate 	 * Since we may be using shared memory regions, e.g., shmget(2),
79*0Sstevel@tonic-gate 	 * and not mmap of regular files, the backing file may be only a
80*0Sstevel@tonic-gate 	 * few tens of bytes in length.  So, this depends on the ability
81*0Sstevel@tonic-gate 	 * to fcntl lock file offsets much larger than the physical file.
82*0Sstevel@tonic-gate 	 */
83*0Sstevel@tonic-gate 	malloc_possible = 0;
84*0Sstevel@tonic-gate #endif
85*0Sstevel@tonic-gate 
86*0Sstevel@tonic-gate #ifdef __hppa
87*0Sstevel@tonic-gate 	/*
88*0Sstevel@tonic-gate 	 * XXX
89*0Sstevel@tonic-gate 	 * HP-UX won't permit mutexes to live in anything but shared memory.
90*0Sstevel@tonic-gate 	 * Instantiate a shared region file on that architecture, regardless.
91*0Sstevel@tonic-gate 	 */
92*0Sstevel@tonic-gate 	malloc_possible = 0;
93*0Sstevel@tonic-gate #endif
94*0Sstevel@tonic-gate 	/*
95*0Sstevel@tonic-gate 	 * If a region is truly private, malloc the memory.  That's faster
96*0Sstevel@tonic-gate 	 * than either anonymous memory or a shared file.
97*0Sstevel@tonic-gate 	 */
98*0Sstevel@tonic-gate 	if (malloc_possible && F_ISSET(infop, REGION_PRIVATE)) {
99*0Sstevel@tonic-gate 		if ((ret = __os_malloc(infop->size, NULL, &infop->addr)) != 0)
100*0Sstevel@tonic-gate 			return (ret);
101*0Sstevel@tonic-gate 
102*0Sstevel@tonic-gate 		/*
103*0Sstevel@tonic-gate 		 * It's sometimes significantly faster to page-fault in all of
104*0Sstevel@tonic-gate 		 * the region's pages before we run the application, as we see
105*0Sstevel@tonic-gate 		 * nasty side-effects when we page-fault while holding various
106*0Sstevel@tonic-gate 		 * locks, i.e., the lock takes a long time to acquire because
107*0Sstevel@tonic-gate 		 * of the underlying page fault, and the other threads convoy
108*0Sstevel@tonic-gate 		 * behind the lock holder.
109*0Sstevel@tonic-gate 		 */
110*0Sstevel@tonic-gate 		if (DB_GLOBAL(db_region_init))
111*0Sstevel@tonic-gate 			for (p = infop->addr;
112*0Sstevel@tonic-gate 			    p < (u_int8_t *)infop->addr + infop->size;
113*0Sstevel@tonic-gate 			    p += DB_VMPAGESIZE)
114*0Sstevel@tonic-gate 				p[0] = '\0';
115*0Sstevel@tonic-gate 
116*0Sstevel@tonic-gate 		F_SET(infop, REGION_CREATED | REGION_MALLOC);
117*0Sstevel@tonic-gate 		goto region_init;
118*0Sstevel@tonic-gate 	}
119*0Sstevel@tonic-gate 
120*0Sstevel@tonic-gate 	/*
121*0Sstevel@tonic-gate 	 * Get the name of the region (creating the file if a temporary file
122*0Sstevel@tonic-gate 	 * is being used).  The dbenv contains the current DB environment,
123*0Sstevel@tonic-gate 	 * including naming information.  The path argument may be a file or
124*0Sstevel@tonic-gate 	 * a directory.  If path is a directory, it must exist and file is the
125*0Sstevel@tonic-gate 	 * file name to be created inside the directory.  If path is a file,
126*0Sstevel@tonic-gate 	 * then file must be NULL.
127*0Sstevel@tonic-gate 	 */
128*0Sstevel@tonic-gate 	if ((ret = __db_appname(infop->dbenv, infop->appname, infop->path,
129*0Sstevel@tonic-gate 	    infop->file, infop->dbflags, &infop->fd, &infop->name)) != 0)
130*0Sstevel@tonic-gate 		return (ret);
131*0Sstevel@tonic-gate 	if (infop->fd != -1)
132*0Sstevel@tonic-gate 		F_SET(infop, REGION_CREATED);
133*0Sstevel@tonic-gate 
134*0Sstevel@tonic-gate 	/*
135*0Sstevel@tonic-gate 	 * Try to create the file, if we have authority.  We have to make sure
136*0Sstevel@tonic-gate 	 * that multiple threads/processes attempting to simultaneously create
137*0Sstevel@tonic-gate 	 * the region are properly ordered, so we open it using DB_CREATE and
138*0Sstevel@tonic-gate 	 * DB_EXCL, so two attempts to create the region will return failure in
139*0Sstevel@tonic-gate 	 * one.
140*0Sstevel@tonic-gate 	 */
141*0Sstevel@tonic-gate 	if (infop->fd == -1 && infop->dbflags & DB_CREATE) {
142*0Sstevel@tonic-gate 		flags = infop->dbflags;
143*0Sstevel@tonic-gate 		LF_SET(DB_EXCL);
144*0Sstevel@tonic-gate 		if ((ret = __db_open(infop->name,
145*0Sstevel@tonic-gate 		    flags, flags, infop->mode, &infop->fd)) == 0)
146*0Sstevel@tonic-gate 			F_SET(infop, REGION_CREATED);
147*0Sstevel@tonic-gate 		else
148*0Sstevel@tonic-gate 			if (ret != EEXIST)
149*0Sstevel@tonic-gate 				goto errmsg;
150*0Sstevel@tonic-gate 	}
151*0Sstevel@tonic-gate 
152*0Sstevel@tonic-gate 	/* If we couldn't create the file, try and open it. */
153*0Sstevel@tonic-gate 	if (infop->fd == -1) {
154*0Sstevel@tonic-gate 		flags = infop->dbflags;
155*0Sstevel@tonic-gate 		LF_CLR(DB_CREATE | DB_EXCL);
156*0Sstevel@tonic-gate 		if ((ret = __db_open(infop->name,
157*0Sstevel@tonic-gate 		    flags, flags, infop->mode, &infop->fd)) != 0)
158*0Sstevel@tonic-gate 			goto errmsg;
159*0Sstevel@tonic-gate 	}
160*0Sstevel@tonic-gate 
161*0Sstevel@tonic-gate 	/*
162*0Sstevel@tonic-gate 	 * There are three cases we support:
163*0Sstevel@tonic-gate 	 *    1. Named anonymous memory (shmget(2)).
164*0Sstevel@tonic-gate 	 *    2. Unnamed anonymous memory (mmap(2): MAP_ANON/MAP_ANONYMOUS).
165*0Sstevel@tonic-gate 	 *    3. Memory backed by a regular file (mmap(2)).
166*0Sstevel@tonic-gate 	 *
167*0Sstevel@tonic-gate 	 * We instantiate a backing file in all cases, which contains at least
168*0Sstevel@tonic-gate 	 * the RLAYOUT structure, and in case #3, contains the actual region.
169*0Sstevel@tonic-gate 	 * This is necessary for a couple of reasons:
170*0Sstevel@tonic-gate 	 *
171*0Sstevel@tonic-gate 	 * First, the mpool region uses temporary files to name regions, and
172*0Sstevel@tonic-gate 	 * since you may have multiple regions in the same directory, we need
173*0Sstevel@tonic-gate 	 * a filesystem name to ensure that they don't collide.
174*0Sstevel@tonic-gate 	 *
175*0Sstevel@tonic-gate 	 * Second, applications are allowed to forcibly remove regions, even
176*0Sstevel@tonic-gate 	 * if they don't know anything about them other than the name.  If a
177*0Sstevel@tonic-gate 	 * region is backed by anonymous memory, there has to be some way for
178*0Sstevel@tonic-gate 	 * the application to find out that information, and, in some cases,
179*0Sstevel@tonic-gate 	 * determine ID information for the anonymous memory.
180*0Sstevel@tonic-gate 	 */
181*0Sstevel@tonic-gate 	if (F_ISSET(infop, REGION_CREATED)) {
182*0Sstevel@tonic-gate 		/*
183*0Sstevel@tonic-gate 		 * If we're using anonymous memory to back this region, set
184*0Sstevel@tonic-gate 		 * the flag.
185*0Sstevel@tonic-gate 		 */
186*0Sstevel@tonic-gate 		if (DB_GLOBAL(db_region_anon))
187*0Sstevel@tonic-gate 			F_SET(infop, REGION_ANONYMOUS);
188*0Sstevel@tonic-gate 
189*0Sstevel@tonic-gate 		/*
190*0Sstevel@tonic-gate 		 * If we're using a regular file to back a region we created,
191*0Sstevel@tonic-gate 		 * grow it to the specified size.
192*0Sstevel@tonic-gate 		 */
193*0Sstevel@tonic-gate 		if (!DB_GLOBAL(db_region_anon) &&
194*0Sstevel@tonic-gate 		    (ret = __db_growregion(infop, infop->size)) != 0)
195*0Sstevel@tonic-gate 			goto err;
196*0Sstevel@tonic-gate 	} else {
197*0Sstevel@tonic-gate 		/*
198*0Sstevel@tonic-gate 		 * If we're joining a region, figure out what it looks like.
199*0Sstevel@tonic-gate 		 *
200*0Sstevel@tonic-gate 		 * XXX
201*0Sstevel@tonic-gate 		 * We have to figure out if the file is a regular file backing
202*0Sstevel@tonic-gate 		 * a region that we want to map into our address space, or a
203*0Sstevel@tonic-gate 		 * file with the information we need to find a shared anonymous
204*0Sstevel@tonic-gate 		 * region that we want to map into our address space.
205*0Sstevel@tonic-gate 		 *
206*0Sstevel@tonic-gate 		 * All this noise is because some systems don't have a coherent
207*0Sstevel@tonic-gate 		 * VM and buffer cache, and worse, if you mix operations on the
208*0Sstevel@tonic-gate 		 * VM and buffer cache, half the time you hang the system.
209*0Sstevel@tonic-gate 		 *
210*0Sstevel@tonic-gate 		 * There are two possibilities.  If the file is the size of an
211*0Sstevel@tonic-gate 		 * RLAYOUT structure, then we know that the real region is in
212*0Sstevel@tonic-gate 		 * shared memory, because otherwise it would be bigger.  (As
213*0Sstevel@tonic-gate 		 * the RLAYOUT structure size is smaller than a disk sector,
214*0Sstevel@tonic-gate 		 * the only way it can be this size is if deliberately written
215*0Sstevel@tonic-gate 		 * that way.)  In which case, retrieve the information we need
216*0Sstevel@tonic-gate 		 * from the RLAYOUT structure and use it to acquire the shared
217*0Sstevel@tonic-gate 		 * memory.
218*0Sstevel@tonic-gate 		 *
219*0Sstevel@tonic-gate 		 * If the structure is larger than an RLAYOUT structure, then
220*0Sstevel@tonic-gate 		 * the file is backing the shared memory region, and we use
221*0Sstevel@tonic-gate 		 * the current size of the file without reading any information
222*0Sstevel@tonic-gate 		 * from the file itself so that we don't confuse the VM.
223*0Sstevel@tonic-gate 		 *
224*0Sstevel@tonic-gate 		 * And yes, this makes me want to take somebody and kill them,
225*0Sstevel@tonic-gate 		 * but I can't think of any other solution.
226*0Sstevel@tonic-gate 		 */
227*0Sstevel@tonic-gate 		if ((ret = __os_ioinfo(infop->name,
228*0Sstevel@tonic-gate 		    infop->fd, &mbytes, &bytes, NULL)) != 0)
229*0Sstevel@tonic-gate 			goto errmsg;
230*0Sstevel@tonic-gate 		size = mbytes * MEGABYTE + bytes;
231*0Sstevel@tonic-gate 
232*0Sstevel@tonic-gate 		if (size <= sizeof(RLAYOUT)) {
233*0Sstevel@tonic-gate 			/*
234*0Sstevel@tonic-gate 			 * If the size is too small, the read fails or the
235*0Sstevel@tonic-gate 			 * valid flag is incorrect, assume it's because the
236*0Sstevel@tonic-gate 			 * RLAYOUT information hasn't been written out yet,
237*0Sstevel@tonic-gate 			 * and retry.
238*0Sstevel@tonic-gate 			 */
239*0Sstevel@tonic-gate 			if (size < sizeof(RLAYOUT))
240*0Sstevel@tonic-gate 				goto retry;
241*0Sstevel@tonic-gate 			if ((ret =
242*0Sstevel@tonic-gate 			    __os_read(infop->fd, &rl, sizeof(rl), &nr)) != 0)
243*0Sstevel@tonic-gate 				goto retry;
244*0Sstevel@tonic-gate 			if (rl.valid != DB_REGIONMAGIC)
245*0Sstevel@tonic-gate 				goto retry;
246*0Sstevel@tonic-gate 
247*0Sstevel@tonic-gate 			/* Copy the size, memory id and characteristics. */
248*0Sstevel@tonic-gate 			size = rl.size;
249*0Sstevel@tonic-gate 			infop->segid = rl.segid;
250*0Sstevel@tonic-gate 			if (F_ISSET(&rl, REGION_ANONYMOUS))
251*0Sstevel@tonic-gate 				F_SET(infop, REGION_ANONYMOUS);
252*0Sstevel@tonic-gate 		}
253*0Sstevel@tonic-gate 
254*0Sstevel@tonic-gate 		/*
255*0Sstevel@tonic-gate 		 * If the region is larger than we think, that's okay, use the
256*0Sstevel@tonic-gate 		 * current size.  If it's smaller than we think, and we were
257*0Sstevel@tonic-gate 		 * just using the default size, that's okay, use the current
258*0Sstevel@tonic-gate 		 * size.  If it's smaller than we think and we really care,
259*0Sstevel@tonic-gate 		 * save the size and we'll catch that further down -- we can't
260*0Sstevel@tonic-gate 		 * correct it here because we have to have a lock to grow the
261*0Sstevel@tonic-gate 		 * region.
262*0Sstevel@tonic-gate 		 */
263*0Sstevel@tonic-gate 		if (infop->size > size && !F_ISSET(infop, REGION_SIZEDEF))
264*0Sstevel@tonic-gate 			grow_region = infop->size;
265*0Sstevel@tonic-gate 		infop->size = size;
266*0Sstevel@tonic-gate 	}
267*0Sstevel@tonic-gate 
268*0Sstevel@tonic-gate 	/*
269*0Sstevel@tonic-gate 	 * Map the region into our address space.  If we're creating it, the
270*0Sstevel@tonic-gate 	 * underlying routines will make it the right size.
271*0Sstevel@tonic-gate 	 *
272*0Sstevel@tonic-gate 	 * There are at least two cases where we can "reasonably" fail when
273*0Sstevel@tonic-gate 	 * we attempt to map in the region.  On Windows/95, closing the last
274*0Sstevel@tonic-gate 	 * reference to a region causes it to be zeroed out.  On UNIX, when
275*0Sstevel@tonic-gate 	 * using the shmget(2) interfaces, the region will no longer exist
276*0Sstevel@tonic-gate 	 * if the system was rebooted.  In these cases, the underlying map call
277*0Sstevel@tonic-gate 	 * returns EAGAIN, and we *remove* our file and try again.  There are
278*0Sstevel@tonic-gate 	 * obvious races in doing this, but it should eventually settle down
279*0Sstevel@tonic-gate 	 * to a winner and then things should proceed normally.
280*0Sstevel@tonic-gate 	 */
281*0Sstevel@tonic-gate 	if ((ret = __db_mapregion(infop->name, infop)) != 0)
282*0Sstevel@tonic-gate 		if (ret == EAGAIN) {
283*0Sstevel@tonic-gate 			/*
284*0Sstevel@tonic-gate 			 * Pretend we created the region even if we didn't so
285*0Sstevel@tonic-gate 			 * that our error processing unlinks it.
286*0Sstevel@tonic-gate 			 */
287*0Sstevel@tonic-gate 			F_SET(infop, REGION_CREATED);
288*0Sstevel@tonic-gate 			ret = 0;
289*0Sstevel@tonic-gate 			goto retry;
290*0Sstevel@tonic-gate 		} else
291*0Sstevel@tonic-gate 			goto err;
292*0Sstevel@tonic-gate 
293*0Sstevel@tonic-gate region_init:
294*0Sstevel@tonic-gate 	/*
295*0Sstevel@tonic-gate 	 * Initialize the common region information.
296*0Sstevel@tonic-gate 	 *
297*0Sstevel@tonic-gate 	 * !!!
298*0Sstevel@tonic-gate 	 * We have to order the region creates so that two processes don't try
299*0Sstevel@tonic-gate 	 * to simultaneously create the region.  This is handled by using the
300*0Sstevel@tonic-gate 	 * DB_CREATE and DB_EXCL flags when we create the "backing" region file.
301*0Sstevel@tonic-gate 	 *
302*0Sstevel@tonic-gate 	 * We also have to order region joins so that processes joining regions
303*0Sstevel@tonic-gate 	 * never see inconsistent data.  We'd like to play permissions games
304*0Sstevel@tonic-gate 	 * with the backing file, but we can't because WNT filesystems won't
305*0Sstevel@tonic-gate 	 * open a file mode 0.
306*0Sstevel@tonic-gate 	 */
307*0Sstevel@tonic-gate 	rlp = (RLAYOUT *)infop->addr;
308*0Sstevel@tonic-gate 	if (F_ISSET(infop, REGION_CREATED)) {
309*0Sstevel@tonic-gate 		/*
310*0Sstevel@tonic-gate 		 * The process creating the region acquires a lock before it
311*0Sstevel@tonic-gate 		 * sets the valid flag.  Any processes joining the region will
312*0Sstevel@tonic-gate 		 * check the valid flag before acquiring the lock.
313*0Sstevel@tonic-gate 		 *
314*0Sstevel@tonic-gate 		 * Check the return of __db_mutex_init() and __db_mutex_lock(),
315*0Sstevel@tonic-gate 		 * even though we don't usually check elsewhere.  This is the
316*0Sstevel@tonic-gate 		 * first lock we initialize and acquire, and we have to know if
317*0Sstevel@tonic-gate 		 * it fails.  (It CAN fail, e.g., SunOS, when using fcntl(2)
318*0Sstevel@tonic-gate 		 * for locking, with an in-memory filesystem specified as the
319*0Sstevel@tonic-gate 		 * database home.)
320*0Sstevel@tonic-gate 		 */
321*0Sstevel@tonic-gate 		if ((ret = __db_mutex_init(&rlp->lock,
322*0Sstevel@tonic-gate 		    MUTEX_LOCK_OFFSET(rlp, &rlp->lock))) != 0 ||
323*0Sstevel@tonic-gate 		    (ret = __db_mutex_lock(&rlp->lock, infop->fd)) != 0)
324*0Sstevel@tonic-gate 			goto err;
325*0Sstevel@tonic-gate 
326*0Sstevel@tonic-gate 		/* Initialize the remaining region information. */
327*0Sstevel@tonic-gate 		rlp->refcnt = 1;
328*0Sstevel@tonic-gate 		rlp->size = infop->size;
329*0Sstevel@tonic-gate 		db_version(&rlp->majver, &rlp->minver, &rlp->patch);
330*0Sstevel@tonic-gate 		rlp->panic = 0;
331*0Sstevel@tonic-gate 		rlp->segid = infop->segid;
332*0Sstevel@tonic-gate 		rlp->flags = 0;
333*0Sstevel@tonic-gate 		if (F_ISSET(infop, REGION_ANONYMOUS))
334*0Sstevel@tonic-gate 			F_SET(rlp, REGION_ANONYMOUS);
335*0Sstevel@tonic-gate 
336*0Sstevel@tonic-gate 		/*
337*0Sstevel@tonic-gate 		 * Fill in the valid field last -- use a magic number, memory
338*0Sstevel@tonic-gate 		 * may not be zero-filled, and we want to minimize the chance
339*0Sstevel@tonic-gate 		 * for collision.
340*0Sstevel@tonic-gate 		 */
341*0Sstevel@tonic-gate 		rlp->valid = DB_REGIONMAGIC;
342*0Sstevel@tonic-gate 
343*0Sstevel@tonic-gate 		/*
344*0Sstevel@tonic-gate 		 * If the region is anonymous, write the RLAYOUT information
345*0Sstevel@tonic-gate 		 * into the backing file so that future region join and unlink
346*0Sstevel@tonic-gate 		 * calls can find it.
347*0Sstevel@tonic-gate 		 *
348*0Sstevel@tonic-gate 		 * XXX
349*0Sstevel@tonic-gate 		 * We MUST do the seek before we do the write.  On Win95, while
350*0Sstevel@tonic-gate 		 * closing the last reference to an anonymous shared region
351*0Sstevel@tonic-gate 		 * doesn't discard the region, it does zero it out.  So, the
352*0Sstevel@tonic-gate 		 * REGION_CREATED may be set, but the file may have already
353*0Sstevel@tonic-gate 		 * been written and the file descriptor may be at the end of
354*0Sstevel@tonic-gate 		 * the file.
355*0Sstevel@tonic-gate 		 */
356*0Sstevel@tonic-gate 		if (F_ISSET(infop, REGION_ANONYMOUS)) {
357*0Sstevel@tonic-gate 			if ((ret = __os_seek(infop->fd, 0, 0, 0, 0, 0)) != 0)
358*0Sstevel@tonic-gate 				goto err;
359*0Sstevel@tonic-gate 			if ((ret =
360*0Sstevel@tonic-gate 			    __os_write(infop->fd, rlp, sizeof(*rlp), &nw)) != 0)
361*0Sstevel@tonic-gate 				goto err;
362*0Sstevel@tonic-gate 		}
363*0Sstevel@tonic-gate 	} else {
364*0Sstevel@tonic-gate 		/* Check to see if the region has had catastrophic failure. */
365*0Sstevel@tonic-gate 		if (rlp->panic) {
366*0Sstevel@tonic-gate 			ret = DB_RUNRECOVERY;
367*0Sstevel@tonic-gate 			goto err;
368*0Sstevel@tonic-gate 		}
369*0Sstevel@tonic-gate 
370*0Sstevel@tonic-gate 		/*
371*0Sstevel@tonic-gate 		 * Check the valid flag to ensure the region is initialized.
372*0Sstevel@tonic-gate 		 * If the valid flag has not been set, the mutex may not have
373*0Sstevel@tonic-gate 		 * been initialized, and an attempt to get it could lead to
374*0Sstevel@tonic-gate 		 * random behavior.
375*0Sstevel@tonic-gate 		 */
376*0Sstevel@tonic-gate 		if (rlp->valid != DB_REGIONMAGIC)
377*0Sstevel@tonic-gate 			goto retry;
378*0Sstevel@tonic-gate 
379*0Sstevel@tonic-gate 		/* Get the region lock. */
380*0Sstevel@tonic-gate 		(void)__db_mutex_lock(&rlp->lock, infop->fd);
381*0Sstevel@tonic-gate 
382*0Sstevel@tonic-gate 		/*
383*0Sstevel@tonic-gate 		 * We now own the region.  There are a couple of things that
384*0Sstevel@tonic-gate 		 * may have gone wrong, however.
385*0Sstevel@tonic-gate 		 *
386*0Sstevel@tonic-gate 		 * Problem #1: while we were waiting for the lock, the region
387*0Sstevel@tonic-gate 		 * was deleted.  Detected by re-checking the valid flag, since
388*0Sstevel@tonic-gate 		 * it's cleared by the delete region routines.
389*0Sstevel@tonic-gate 		 */
390*0Sstevel@tonic-gate 		if (rlp->valid != DB_REGIONMAGIC) {
391*0Sstevel@tonic-gate 			(void)__db_mutex_unlock(&rlp->lock, infop->fd);
392*0Sstevel@tonic-gate 			goto retry;
393*0Sstevel@tonic-gate 		}
394*0Sstevel@tonic-gate 
395*0Sstevel@tonic-gate 		/*
396*0Sstevel@tonic-gate 		 * Problem #3: when we checked the size of the file, it was
397*0Sstevel@tonic-gate 		 * still growing as part of creation.  Detected by the fact
398*0Sstevel@tonic-gate 		 * that infop->size isn't the same size as the region.
399*0Sstevel@tonic-gate 		 */
400*0Sstevel@tonic-gate 		if (infop->size != rlp->size) {
401*0Sstevel@tonic-gate 			(void)__db_mutex_unlock(&rlp->lock, infop->fd);
402*0Sstevel@tonic-gate 			goto retry;
403*0Sstevel@tonic-gate 		}
404*0Sstevel@tonic-gate 
405*0Sstevel@tonic-gate 		/* Increment the reference count. */
406*0Sstevel@tonic-gate 		++rlp->refcnt;
407*0Sstevel@tonic-gate 	}
408*0Sstevel@tonic-gate 
409*0Sstevel@tonic-gate 	/* Return the region in a locked condition. */
410*0Sstevel@tonic-gate 
411*0Sstevel@tonic-gate 	if (0) {
412*0Sstevel@tonic-gate errmsg:		__db_err(infop->dbenv, "%s: %s", infop->name, strerror(ret));
413*0Sstevel@tonic-gate 
414*0Sstevel@tonic-gate err:
415*0Sstevel@tonic-gate retry:		/* Discard the region. */
416*0Sstevel@tonic-gate 		if (infop->addr != NULL) {
417*0Sstevel@tonic-gate 			(void)__db_unmapregion(infop);
418*0Sstevel@tonic-gate 			infop->addr = NULL;
419*0Sstevel@tonic-gate 		}
420*0Sstevel@tonic-gate 
421*0Sstevel@tonic-gate 		/* Discard the backing file. */
422*0Sstevel@tonic-gate 		if (infop->fd != -1) {
423*0Sstevel@tonic-gate 			(void)__os_close(infop->fd);
424*0Sstevel@tonic-gate 			infop->fd = -1;
425*0Sstevel@tonic-gate 
426*0Sstevel@tonic-gate 			if (F_ISSET(infop, REGION_CREATED))
427*0Sstevel@tonic-gate 				(void)__os_unlink(infop->name);
428*0Sstevel@tonic-gate 		}
429*0Sstevel@tonic-gate 
430*0Sstevel@tonic-gate 		/* Discard the name. */
431*0Sstevel@tonic-gate 		if (infop->name != NULL) {
432*0Sstevel@tonic-gate 			__os_freestr(infop->name);
433*0Sstevel@tonic-gate 			infop->name = NULL;
434*0Sstevel@tonic-gate 		}
435*0Sstevel@tonic-gate 
436*0Sstevel@tonic-gate 		/*
437*0Sstevel@tonic-gate 		 * If we had a temporary error, wait a few seconds and
438*0Sstevel@tonic-gate 		 * try again.
439*0Sstevel@tonic-gate 		 */
440*0Sstevel@tonic-gate 		if (ret == 0) {
441*0Sstevel@tonic-gate 			if (++retry_cnt <= 3) {
442*0Sstevel@tonic-gate 				__os_sleep(retry_cnt * 2, 0);
443*0Sstevel@tonic-gate 				goto loop;
444*0Sstevel@tonic-gate 			}
445*0Sstevel@tonic-gate 			ret = EAGAIN;
446*0Sstevel@tonic-gate 		}
447*0Sstevel@tonic-gate 	}
448*0Sstevel@tonic-gate 
449*0Sstevel@tonic-gate 	/*
450*0Sstevel@tonic-gate 	 * XXX
451*0Sstevel@tonic-gate 	 * HP-UX won't permit mutexes to live in anything but shared memory.
452*0Sstevel@tonic-gate 	 * Instantiate a shared region file on that architecture, regardless.
453*0Sstevel@tonic-gate 	 *
454*0Sstevel@tonic-gate 	 * XXX
455*0Sstevel@tonic-gate 	 * There's a problem in cleaning this up on application exit, or on
456*0Sstevel@tonic-gate 	 * application failure.  If an application opens a database without
457*0Sstevel@tonic-gate 	 * an environment, we create a temporary backing mpool region for it.
458*0Sstevel@tonic-gate 	 * That region is marked REGION_PRIVATE, but as HP-UX won't permit
459*0Sstevel@tonic-gate 	 * mutexes to live in anything but shared memory, we instantiate a
460*0Sstevel@tonic-gate 	 * real file plus a memory region of some form.  If the application
461*0Sstevel@tonic-gate 	 * crashes, the necessary information to delete the backing file and
462*0Sstevel@tonic-gate 	 * any system region (e.g., the shmget(2) segment ID) is no longer
463*0Sstevel@tonic-gate 	 * available.  We can't completely fix the problem, but we try.
464*0Sstevel@tonic-gate 	 *
465*0Sstevel@tonic-gate 	 * The underlying UNIX __db_mapregion() code preferentially uses the
466*0Sstevel@tonic-gate 	 * mmap(2) interface with the MAP_ANON/MAP_ANONYMOUS flags for regions
467*0Sstevel@tonic-gate 	 * that are marked REGION_PRIVATE.  This means that we normally aren't
468*0Sstevel@tonic-gate 	 * holding any system resources when we get here, in which case we can
469*0Sstevel@tonic-gate 	 * delete the backing file.  This results in a short race, from the
470*0Sstevel@tonic-gate 	 * __db_open() call above to here.
471*0Sstevel@tonic-gate 	 *
472*0Sstevel@tonic-gate 	 * If, for some reason, we are holding system resources when we get
473*0Sstevel@tonic-gate 	 * here, we don't have any choice -- we can't delete the backing file
474*0Sstevel@tonic-gate 	 * because we may need it to detach from the resources.  Set the
475*0Sstevel@tonic-gate 	 * REGION_LASTDETACH flag, so that we do all necessary cleanup when
476*0Sstevel@tonic-gate 	 * the application closes the region.
477*0Sstevel@tonic-gate 	 */
478*0Sstevel@tonic-gate 	if (F_ISSET(infop, REGION_PRIVATE) && !F_ISSET(infop, REGION_MALLOC))
479*0Sstevel@tonic-gate 		if (F_ISSET(infop, REGION_HOLDINGSYS))
480*0Sstevel@tonic-gate 			F_SET(infop, REGION_LASTDETACH);
481*0Sstevel@tonic-gate 		else {
482*0Sstevel@tonic-gate 			F_SET(infop, REGION_REMOVED);
483*0Sstevel@tonic-gate 			F_CLR(infop, REGION_CANGROW);
484*0Sstevel@tonic-gate 
485*0Sstevel@tonic-gate 			(void)__os_close(infop->fd);
486*0Sstevel@tonic-gate 			(void)__os_unlink(infop->name);
487*0Sstevel@tonic-gate 		}
488*0Sstevel@tonic-gate 
489*0Sstevel@tonic-gate 	return (ret);
490*0Sstevel@tonic-gate }
491*0Sstevel@tonic-gate 
492*0Sstevel@tonic-gate /*
493*0Sstevel@tonic-gate  * __db_rdetach --
494*0Sstevel@tonic-gate  *	De-attach from a shared memory region.
495*0Sstevel@tonic-gate  *
496*0Sstevel@tonic-gate  * PUBLIC: int __db_rdetach __P((REGINFO *));
497*0Sstevel@tonic-gate  */
498*0Sstevel@tonic-gate int
__db_rdetach(infop)499*0Sstevel@tonic-gate __db_rdetach(infop)
500*0Sstevel@tonic-gate 	REGINFO *infop;
501*0Sstevel@tonic-gate {
502*0Sstevel@tonic-gate 	RLAYOUT *rlp;
503*0Sstevel@tonic-gate 	int detach, ret, t_ret;
504*0Sstevel@tonic-gate 
505*0Sstevel@tonic-gate 	ret = 0;
506*0Sstevel@tonic-gate 
507*0Sstevel@tonic-gate 	/*
508*0Sstevel@tonic-gate 	 * If the region was removed when it was created, no further action
509*0Sstevel@tonic-gate 	 * is required.
510*0Sstevel@tonic-gate 	 */
511*0Sstevel@tonic-gate 	if (F_ISSET(infop, REGION_REMOVED))
512*0Sstevel@tonic-gate 		goto done;
513*0Sstevel@tonic-gate 	/*
514*0Sstevel@tonic-gate 	 * If the region was created in memory returned by malloc, the only
515*0Sstevel@tonic-gate 	 * action required is freeing the memory.
516*0Sstevel@tonic-gate 	 */
517*0Sstevel@tonic-gate 	if (F_ISSET(infop, REGION_MALLOC)) {
518*0Sstevel@tonic-gate 		__os_free(infop->addr, 0);
519*0Sstevel@tonic-gate 		goto done;
520*0Sstevel@tonic-gate 	}
521*0Sstevel@tonic-gate 
522*0Sstevel@tonic-gate 	/* Otherwise, attach to the region and optionally delete it. */
523*0Sstevel@tonic-gate 	rlp = infop->addr;
524*0Sstevel@tonic-gate 
525*0Sstevel@tonic-gate 	/* Get the lock. */
526*0Sstevel@tonic-gate 	(void)__db_mutex_lock(&rlp->lock, infop->fd);
527*0Sstevel@tonic-gate 
528*0Sstevel@tonic-gate 	/* Decrement the reference count. */
529*0Sstevel@tonic-gate 	if (rlp->refcnt == 0)
530*0Sstevel@tonic-gate 		__db_err(infop->dbenv,
531*0Sstevel@tonic-gate 		    "region rdetach: reference count went to zero!");
532*0Sstevel@tonic-gate 	else
533*0Sstevel@tonic-gate 		--rlp->refcnt;
534*0Sstevel@tonic-gate 
535*0Sstevel@tonic-gate 	/*
536*0Sstevel@tonic-gate 	 * If we're going to remove the region, clear the valid flag so
537*0Sstevel@tonic-gate 	 * that any region join that's blocked waiting for us will know
538*0Sstevel@tonic-gate 	 * what happened.
539*0Sstevel@tonic-gate 	 */
540*0Sstevel@tonic-gate 	detach = 0;
541*0Sstevel@tonic-gate 	if (F_ISSET(infop, REGION_LASTDETACH))
542*0Sstevel@tonic-gate 		if (rlp->refcnt == 0) {
543*0Sstevel@tonic-gate 			detach = 1;
544*0Sstevel@tonic-gate 			rlp->valid = 0;
545*0Sstevel@tonic-gate 		} else
546*0Sstevel@tonic-gate 			ret = EBUSY;
547*0Sstevel@tonic-gate 
548*0Sstevel@tonic-gate 	/* Release the lock. */
549*0Sstevel@tonic-gate 	(void)__db_mutex_unlock(&rlp->lock, infop->fd);
550*0Sstevel@tonic-gate 
551*0Sstevel@tonic-gate 	/* Close the backing file descriptor. */
552*0Sstevel@tonic-gate 	(void)__os_close(infop->fd);
553*0Sstevel@tonic-gate 	infop->fd = -1;
554*0Sstevel@tonic-gate 
555*0Sstevel@tonic-gate 	/* Discard our mapping of the region. */
556*0Sstevel@tonic-gate 	if ((t_ret = __db_unmapregion(infop)) != 0 && ret == 0)
557*0Sstevel@tonic-gate 		ret = t_ret;
558*0Sstevel@tonic-gate 
559*0Sstevel@tonic-gate 	/* Discard the region itself. */
560*0Sstevel@tonic-gate 	if (detach) {
561*0Sstevel@tonic-gate 		if ((t_ret =
562*0Sstevel@tonic-gate 		    __db_unlinkregion(infop->name, infop) != 0) && ret == 0)
563*0Sstevel@tonic-gate 			ret = t_ret;
564*0Sstevel@tonic-gate 		if ((t_ret = __os_unlink(infop->name) != 0) && ret == 0)
565*0Sstevel@tonic-gate 			ret = t_ret;
566*0Sstevel@tonic-gate 	}
567*0Sstevel@tonic-gate 
568*0Sstevel@tonic-gate done:	/* Discard the name. */
569*0Sstevel@tonic-gate 	if (infop->name != NULL) {
570*0Sstevel@tonic-gate 		__os_freestr(infop->name);
571*0Sstevel@tonic-gate 		infop->name = NULL;
572*0Sstevel@tonic-gate 	}
573*0Sstevel@tonic-gate 
574*0Sstevel@tonic-gate 	return (ret);
575*0Sstevel@tonic-gate }
576*0Sstevel@tonic-gate 
577*0Sstevel@tonic-gate /*
578*0Sstevel@tonic-gate  * __db_runlink --
579*0Sstevel@tonic-gate  *	Remove a region.
580*0Sstevel@tonic-gate  *
581*0Sstevel@tonic-gate  * PUBLIC: int __db_runlink __P((REGINFO *, int));
582*0Sstevel@tonic-gate  */
583*0Sstevel@tonic-gate int
__db_runlink(infop,force)584*0Sstevel@tonic-gate __db_runlink(infop, force)
585*0Sstevel@tonic-gate 	REGINFO *infop;
586*0Sstevel@tonic-gate 	int force;
587*0Sstevel@tonic-gate {
588*0Sstevel@tonic-gate 	RLAYOUT rl, *rlp;
589*0Sstevel@tonic-gate 	size_t size;
590*0Sstevel@tonic-gate 	ssize_t nr;
591*0Sstevel@tonic-gate 	u_int32_t mbytes, bytes;
592*0Sstevel@tonic-gate 	int fd, ret, t_ret;
593*0Sstevel@tonic-gate 	char *name;
594*0Sstevel@tonic-gate 
595*0Sstevel@tonic-gate 	/*
596*0Sstevel@tonic-gate 	 * XXX
597*0Sstevel@tonic-gate 	 * We assume that we've created a new REGINFO structure for this
598*0Sstevel@tonic-gate 	 * call, not used one that was already initialized.  Regardless,
599*0Sstevel@tonic-gate 	 * if anyone is planning to use it after we're done, they're going
600*0Sstevel@tonic-gate 	 * to be sorely disappointed.
601*0Sstevel@tonic-gate 	 *
602*0Sstevel@tonic-gate 	 * If force isn't set, we attach to the region, set a flag to delete
603*0Sstevel@tonic-gate 	 * the region on last close, and let the region delete code do the
604*0Sstevel@tonic-gate 	 * work.
605*0Sstevel@tonic-gate 	 */
606*0Sstevel@tonic-gate 	if (!force) {
607*0Sstevel@tonic-gate 		if ((ret = __db_rattach(infop)) != 0)
608*0Sstevel@tonic-gate 			return (ret);
609*0Sstevel@tonic-gate 
610*0Sstevel@tonic-gate 		rlp = (RLAYOUT *)infop->addr;
611*0Sstevel@tonic-gate 		(void)__db_mutex_unlock(&rlp->lock, infop->fd);
612*0Sstevel@tonic-gate 
613*0Sstevel@tonic-gate 		F_SET(infop, REGION_LASTDETACH);
614*0Sstevel@tonic-gate 
615*0Sstevel@tonic-gate 		return (__db_rdetach(infop));
616*0Sstevel@tonic-gate 	}
617*0Sstevel@tonic-gate 
618*0Sstevel@tonic-gate 	/*
619*0Sstevel@tonic-gate 	 * Otherwise, we don't want to attach to the region.  We may have been
620*0Sstevel@tonic-gate 	 * called to clean up if a process died leaving a region locked and/or
621*0Sstevel@tonic-gate 	 * corrupted, which could cause the attach to hang.
622*0Sstevel@tonic-gate 	 */
623*0Sstevel@tonic-gate 	if ((ret = __db_appname(infop->dbenv, infop->appname,
624*0Sstevel@tonic-gate 	    infop->path, infop->file, infop->dbflags, NULL, &name)) != 0)
625*0Sstevel@tonic-gate 		return (ret);
626*0Sstevel@tonic-gate 
627*0Sstevel@tonic-gate 	/*
628*0Sstevel@tonic-gate 	 * An underlying file is created for all regions other than private
629*0Sstevel@tonic-gate 	 * (REGION_PRIVATE) ones, regardless of whether or not it's used to
630*0Sstevel@tonic-gate 	 * back the region.  If that file doesn't exist, we're done.
631*0Sstevel@tonic-gate 	 */
632*0Sstevel@tonic-gate 	if (__os_exists(name, NULL) != 0) {
633*0Sstevel@tonic-gate 		__os_freestr(name);
634*0Sstevel@tonic-gate 		return (0);
635*0Sstevel@tonic-gate 	}
636*0Sstevel@tonic-gate 
637*0Sstevel@tonic-gate 	/*
638*0Sstevel@tonic-gate 	 * See the comments in __db_rattach -- figure out if this is a regular
639*0Sstevel@tonic-gate 	 * file backing a region or if it's a regular file with information
640*0Sstevel@tonic-gate 	 * about a region.
641*0Sstevel@tonic-gate 	 */
642*0Sstevel@tonic-gate 	if ((ret = __db_open(name, DB_RDONLY, DB_RDONLY, 0, &fd)) != 0)
643*0Sstevel@tonic-gate 		goto errmsg;
644*0Sstevel@tonic-gate 	if ((ret = __os_ioinfo(name, fd, &mbytes, &bytes, NULL)) != 0)
645*0Sstevel@tonic-gate 		goto errmsg;
646*0Sstevel@tonic-gate 	size = mbytes * MEGABYTE + bytes;
647*0Sstevel@tonic-gate 
648*0Sstevel@tonic-gate 	if (size <= sizeof(RLAYOUT)) {
649*0Sstevel@tonic-gate 		if ((ret = __os_read(fd, &rl, sizeof(rl), &nr)) != 0)
650*0Sstevel@tonic-gate 			goto errmsg;
651*0Sstevel@tonic-gate 		if (rl.valid != DB_REGIONMAGIC) {
652*0Sstevel@tonic-gate 			__db_err(infop->dbenv,
653*0Sstevel@tonic-gate 			    "%s: illegal region magic number", name);
654*0Sstevel@tonic-gate 			ret = EINVAL;
655*0Sstevel@tonic-gate 			goto err;
656*0Sstevel@tonic-gate 		}
657*0Sstevel@tonic-gate 
658*0Sstevel@tonic-gate 		/* Set the size, memory id and characteristics. */
659*0Sstevel@tonic-gate 		infop->size = rl.size;
660*0Sstevel@tonic-gate 		infop->segid = rl.segid;
661*0Sstevel@tonic-gate 		if (F_ISSET(&rl, REGION_ANONYMOUS))
662*0Sstevel@tonic-gate 			F_SET(infop, REGION_ANONYMOUS);
663*0Sstevel@tonic-gate 	} else {
664*0Sstevel@tonic-gate 		infop->size = size;
665*0Sstevel@tonic-gate 		infop->segid = INVALID_SEGID;
666*0Sstevel@tonic-gate 	}
667*0Sstevel@tonic-gate 
668*0Sstevel@tonic-gate 	/* Remove the underlying region. */
669*0Sstevel@tonic-gate 	ret = __db_unlinkregion(name, infop);
670*0Sstevel@tonic-gate 
671*0Sstevel@tonic-gate 	/*
672*0Sstevel@tonic-gate 	 * Unlink the backing file.  Close the open file descriptor first,
673*0Sstevel@tonic-gate 	 * because some architectures (e.g., Win32) won't unlink a file if
674*0Sstevel@tonic-gate 	 * open file descriptors remain.
675*0Sstevel@tonic-gate 	 */
676*0Sstevel@tonic-gate 	(void)__os_close(fd);
677*0Sstevel@tonic-gate 	if ((t_ret = __os_unlink(name)) != 0 && ret == 0)
678*0Sstevel@tonic-gate 		ret = t_ret;
679*0Sstevel@tonic-gate 
680*0Sstevel@tonic-gate 	if (0) {
681*0Sstevel@tonic-gate errmsg:		__db_err(infop->dbenv, "%s: %s", name, strerror(ret));
682*0Sstevel@tonic-gate err:		(void)__os_close(fd);
683*0Sstevel@tonic-gate 	}
684*0Sstevel@tonic-gate 
685*0Sstevel@tonic-gate 	__os_freestr(name);
686*0Sstevel@tonic-gate 	return (ret);
687*0Sstevel@tonic-gate }
688*0Sstevel@tonic-gate 
689*0Sstevel@tonic-gate /*
690*0Sstevel@tonic-gate  * __db_rgrow --
691*0Sstevel@tonic-gate  *	Extend a region.
692*0Sstevel@tonic-gate  *
693*0Sstevel@tonic-gate  * PUBLIC: int __db_rgrow __P((REGINFO *, size_t));
694*0Sstevel@tonic-gate  */
695*0Sstevel@tonic-gate int
__db_rgrow(infop,new_size)696*0Sstevel@tonic-gate __db_rgrow(infop, new_size)
697*0Sstevel@tonic-gate 	REGINFO *infop;
698*0Sstevel@tonic-gate 	size_t new_size;
699*0Sstevel@tonic-gate {
700*0Sstevel@tonic-gate 	RLAYOUT *rlp;
701*0Sstevel@tonic-gate 	size_t increment;
702*0Sstevel@tonic-gate 	int ret;
703*0Sstevel@tonic-gate 
704*0Sstevel@tonic-gate 	/*
705*0Sstevel@tonic-gate 	 * !!!
706*0Sstevel@tonic-gate 	 * This routine MUST be called with the region already locked.
707*0Sstevel@tonic-gate 	 */
708*0Sstevel@tonic-gate 
709*0Sstevel@tonic-gate 	/* The underlying routines have flagged if this region can grow. */
710*0Sstevel@tonic-gate 	if (!F_ISSET(infop, REGION_CANGROW))
711*0Sstevel@tonic-gate 		return (EINVAL);
712*0Sstevel@tonic-gate 
713*0Sstevel@tonic-gate 	/*
714*0Sstevel@tonic-gate 	 * Round off the requested size to the next page boundary, and
715*0Sstevel@tonic-gate 	 * determine the additional space required.
716*0Sstevel@tonic-gate 	 */
717*0Sstevel@tonic-gate 	rlp = (RLAYOUT *)infop->addr;
718*0Sstevel@tonic-gate 	DB_ROUNDOFF(new_size, DB_VMPAGESIZE);
719*0Sstevel@tonic-gate 	increment = new_size - rlp->size;
720*0Sstevel@tonic-gate 
721*0Sstevel@tonic-gate 	if ((ret = __db_growregion(infop, increment)) != 0)
722*0Sstevel@tonic-gate 		return (ret);
723*0Sstevel@tonic-gate 
724*0Sstevel@tonic-gate 	/* Update the on-disk region size. */
725*0Sstevel@tonic-gate 	rlp->size = new_size;
726*0Sstevel@tonic-gate 
727*0Sstevel@tonic-gate 	/* Detach from and reattach to the region. */
728*0Sstevel@tonic-gate 	return (__db_rreattach(infop, new_size));
729*0Sstevel@tonic-gate }
730*0Sstevel@tonic-gate 
731*0Sstevel@tonic-gate /*
732*0Sstevel@tonic-gate  * __db_growregion --
733*0Sstevel@tonic-gate  *	Grow a shared memory region.
734*0Sstevel@tonic-gate  */
735*0Sstevel@tonic-gate static int
__db_growregion(infop,increment)736*0Sstevel@tonic-gate __db_growregion(infop, increment)
737*0Sstevel@tonic-gate 	REGINFO *infop;
738*0Sstevel@tonic-gate 	size_t increment;
739*0Sstevel@tonic-gate {
740*0Sstevel@tonic-gate 	db_pgno_t pages;
741*0Sstevel@tonic-gate 	size_t i;
742*0Sstevel@tonic-gate 	ssize_t nr, nw;
743*0Sstevel@tonic-gate 	u_int32_t relative;
744*0Sstevel@tonic-gate 	int ret;
745*0Sstevel@tonic-gate 	char buf[DB_VMPAGESIZE];
746*0Sstevel@tonic-gate 
747*0Sstevel@tonic-gate 	/* Seek to the end of the region. */
748*0Sstevel@tonic-gate 	if ((ret = __os_seek(infop->fd, 0, 0, 0, 0, SEEK_END)) != 0)
749*0Sstevel@tonic-gate 		goto err;
750*0Sstevel@tonic-gate 
751*0Sstevel@tonic-gate 	/* Write nuls to the new bytes. */
752*0Sstevel@tonic-gate 	memset(buf, 0, sizeof(buf));
753*0Sstevel@tonic-gate 
754*0Sstevel@tonic-gate 	/*
755*0Sstevel@tonic-gate 	 * Some systems require that all of the bytes of the region be
756*0Sstevel@tonic-gate 	 * written before it can be mapped and accessed randomly, and
757*0Sstevel@tonic-gate 	 * other systems don't zero out the pages.
758*0Sstevel@tonic-gate 	 */
759*0Sstevel@tonic-gate 	if (__db_mapinit())
760*0Sstevel@tonic-gate 		/* Extend the region by writing each new page. */
761*0Sstevel@tonic-gate 		for (i = 0; i < increment; i += DB_VMPAGESIZE) {
762*0Sstevel@tonic-gate 			if ((ret =
763*0Sstevel@tonic-gate 			    __os_write(infop->fd, buf, sizeof(buf), &nw)) != 0)
764*0Sstevel@tonic-gate 				goto err;
765*0Sstevel@tonic-gate 			if (nw != sizeof(buf))
766*0Sstevel@tonic-gate 				goto eio;
767*0Sstevel@tonic-gate 		}
768*0Sstevel@tonic-gate 	else {
769*0Sstevel@tonic-gate 		/*
770*0Sstevel@tonic-gate 		 * Extend the region by writing the last page.  If the region
771*0Sstevel@tonic-gate 		 * is >4Gb, increment may be larger than the maximum possible
772*0Sstevel@tonic-gate 		 * seek "relative" argument, as it's an unsigned 32-bit value.
773*0Sstevel@tonic-gate 		 * Break the offset into pages of 1MB each so that we don't
774*0Sstevel@tonic-gate 		 * overflow (2^20 + 2^32 is bigger than any memory I expect
775*0Sstevel@tonic-gate 		 * to see for awhile).
776*0Sstevel@tonic-gate 		 */
777*0Sstevel@tonic-gate 		pages = (increment - DB_VMPAGESIZE) / MEGABYTE;
778*0Sstevel@tonic-gate 		relative = (increment - DB_VMPAGESIZE) % MEGABYTE;
779*0Sstevel@tonic-gate 		if ((ret = __os_seek(infop->fd,
780*0Sstevel@tonic-gate 		    MEGABYTE, pages, relative, 0, SEEK_CUR)) != 0)
781*0Sstevel@tonic-gate 			goto err;
782*0Sstevel@tonic-gate 		if ((ret = __os_write(infop->fd, buf, sizeof(buf), &nw)) != 0)
783*0Sstevel@tonic-gate 			goto err;
784*0Sstevel@tonic-gate 		if (nw != sizeof(buf))
785*0Sstevel@tonic-gate 			goto eio;
786*0Sstevel@tonic-gate 
787*0Sstevel@tonic-gate 		/*
788*0Sstevel@tonic-gate 		 * It's sometimes significantly faster to page-fault in all of
789*0Sstevel@tonic-gate 		 * the region's pages before we run the application, as we see
790*0Sstevel@tonic-gate 		 * nasty side-effects when we page-fault while holding various
791*0Sstevel@tonic-gate 		 * locks, i.e., the lock takes a long time to acquire because
792*0Sstevel@tonic-gate 		 * of the underlying page fault, and the other threads convoy
793*0Sstevel@tonic-gate 		 * behind the lock holder.
794*0Sstevel@tonic-gate 		 *
795*0Sstevel@tonic-gate 		 * We also use REGION_INIT to guarantee that there is enough
796*0Sstevel@tonic-gate 		 * disk space for the region, so we also write a byte to each
797*0Sstevel@tonic-gate 		 * page.  Reading the byte is insufficient as some systems
798*0Sstevel@tonic-gate 		 * (e.g., Solaris) do not instantiate disk pages to satisfy
799*0Sstevel@tonic-gate 		 * a read, and so we don't know if there is enough disk space
800*0Sstevel@tonic-gate 		 * or not.
801*0Sstevel@tonic-gate 		 */
802*0Sstevel@tonic-gate 		if (DB_GLOBAL(db_region_init)) {
803*0Sstevel@tonic-gate 			pages = increment / MEGABYTE;
804*0Sstevel@tonic-gate 			relative = increment % MEGABYTE;
805*0Sstevel@tonic-gate 			if ((ret = __os_seek(infop->fd,
806*0Sstevel@tonic-gate 			    MEGABYTE, pages, relative, 1, SEEK_END)) != 0)
807*0Sstevel@tonic-gate 				goto err;
808*0Sstevel@tonic-gate 
809*0Sstevel@tonic-gate 			/* Write a byte to each page. */
810*0Sstevel@tonic-gate 			for (i = 0; i < increment; i += DB_VMPAGESIZE) {
811*0Sstevel@tonic-gate 				if ((ret =
812*0Sstevel@tonic-gate 				    __os_write(infop->fd, buf, 1, &nr)) != 0)
813*0Sstevel@tonic-gate 					goto err;
814*0Sstevel@tonic-gate 				if (nr != 1)
815*0Sstevel@tonic-gate 					goto eio;
816*0Sstevel@tonic-gate 				if ((ret = __os_seek(infop->fd,
817*0Sstevel@tonic-gate 				    0, 0, DB_VMPAGESIZE - 1, 0, SEEK_CUR)) != 0)
818*0Sstevel@tonic-gate 					goto err;
819*0Sstevel@tonic-gate 			}
820*0Sstevel@tonic-gate 		}
821*0Sstevel@tonic-gate 	}
822*0Sstevel@tonic-gate 	return (0);
823*0Sstevel@tonic-gate 
824*0Sstevel@tonic-gate eio:	ret = EIO;
825*0Sstevel@tonic-gate err:	__db_err(infop->dbenv, "region grow: %s", strerror(ret));
826*0Sstevel@tonic-gate 	return (ret);
827*0Sstevel@tonic-gate }
828*0Sstevel@tonic-gate 
829*0Sstevel@tonic-gate /*
830*0Sstevel@tonic-gate  * __db_rreattach --
831*0Sstevel@tonic-gate  *	Detach from and reattach to a region.
832*0Sstevel@tonic-gate  *
833*0Sstevel@tonic-gate  * PUBLIC: int __db_rreattach __P((REGINFO *, size_t));
834*0Sstevel@tonic-gate  */
835*0Sstevel@tonic-gate int
__db_rreattach(infop,new_size)836*0Sstevel@tonic-gate __db_rreattach(infop, new_size)
837*0Sstevel@tonic-gate 	REGINFO *infop;
838*0Sstevel@tonic-gate 	size_t new_size;
839*0Sstevel@tonic-gate {
840*0Sstevel@tonic-gate 	int ret;
841*0Sstevel@tonic-gate 
842*0Sstevel@tonic-gate #ifdef DIAGNOSTIC
843*0Sstevel@tonic-gate 	if (infop->name == NULL) {
844*0Sstevel@tonic-gate 		__db_err(infop->dbenv, "__db_rreattach: name was NULL");
845*0Sstevel@tonic-gate 		return (EINVAL);
846*0Sstevel@tonic-gate 	}
847*0Sstevel@tonic-gate #endif
848*0Sstevel@tonic-gate 	/*
849*0Sstevel@tonic-gate 	 * If we're growing an already mapped region, we have to unmap it
850*0Sstevel@tonic-gate 	 * and get it back.  We have it locked, so nobody else can get in,
851*0Sstevel@tonic-gate 	 * which makes it fairly straight-forward to do, as everybody else
852*0Sstevel@tonic-gate 	 * is going to block while we do the unmap/remap.  NB: if we fail
853*0Sstevel@tonic-gate 	 * to get it back, the pooch is genuinely screwed, because we can
854*0Sstevel@tonic-gate 	 * never release the lock we're holding.
855*0Sstevel@tonic-gate 	 *
856*0Sstevel@tonic-gate 	 * Detach from the region.  We have to do this first so architectures
857*0Sstevel@tonic-gate 	 * that don't permit a file to be mapped into different places in the
858*0Sstevel@tonic-gate 	 * address space simultaneously, e.g., HP's PaRisc, will work.
859*0Sstevel@tonic-gate 	 */
860*0Sstevel@tonic-gate 	if ((ret = __db_unmapregion(infop)) != 0)
861*0Sstevel@tonic-gate 		return (ret);
862*0Sstevel@tonic-gate 
863*0Sstevel@tonic-gate 	/* Update the caller's REGINFO size to the new map size. */
864*0Sstevel@tonic-gate 	infop->size = new_size;
865*0Sstevel@tonic-gate 
866*0Sstevel@tonic-gate 	/* Attach to the region. */
867*0Sstevel@tonic-gate 	ret = __db_mapregion(infop->name, infop);
868*0Sstevel@tonic-gate 
869*0Sstevel@tonic-gate 	return (ret);
870*0Sstevel@tonic-gate }
871