1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996, 1997, 1998
5 *	Sleepycat Software.  All rights reserved.
6 */
7
8#include "config.h"
9
10#ifndef lint
11static const char sccsid[] = "@(#)db_region.c	10.53 (Sleepycat) 11/10/98";
12#endif /* not lint */
13
14#ifndef NO_SYSTEM_INCLUDES
15#include <sys/types.h>
16
17#include <errno.h>
18#include <string.h>
19#include <unistd.h>
20#endif
21
22#include "db_int.h"
23#include "common_ext.h"
24
25static int __db_growregion __P((REGINFO *, size_t));
26
27/*
28 * __db_rattach --
29 *	Optionally create and attach to a shared memory region.
30 *
31 * PUBLIC: int __db_rattach __P((REGINFO *));
32 */
33int
34__db_rattach(infop)
35	REGINFO *infop;
36{
37	RLAYOUT *rlp, rl;
38	size_t grow_region, size;
39	ssize_t nr, nw;
40	u_int32_t flags, mbytes, bytes;
41	u_int8_t *p;
42	int malloc_possible, ret, retry_cnt;
43
44	grow_region = 0;
45	malloc_possible = 1;
46	ret = retry_cnt = 0;
47
48	/* Round off the requested size to the next page boundary. */
49	DB_ROUNDOFF(infop->size, DB_VMPAGESIZE);
50
51	/* Some architectures have hard limits on the maximum region size. */
52#ifdef DB_REGIONSIZE_MAX
53	if (infop->size > DB_REGIONSIZE_MAX) {
54		__db_err(infop->dbenv, "__db_rattach: cache size too large");
55		return (EINVAL);
56	}
57#endif
58
59	/* Intialize the return information in the REGINFO structure. */
60loop:	infop->addr = NULL;
61	infop->fd = -1;
62	infop->segid = INVALID_SEGID;
63	if (infop->name != NULL) {
64		__os_freestr(infop->name);
65		infop->name = NULL;
66	}
67	F_CLR(infop, REGION_CANGROW | REGION_CREATED);
68
69#ifndef HAVE_SPINLOCKS
70	/*
71	 * XXX
72	 * Lacking spinlocks, we must have a file descriptor for fcntl(2)
73	 * locking, which implies using mmap(2) to map in a regular file.
74	 * (Theoretically, we could probably get a file descriptor to lock
75	 * other types of shared regions, but I don't see any reason to
76	 * bother.)
77	 *
78	 * Since we may be using shared memory regions, e.g., shmget(2),
79	 * and not mmap of regular files, the backing file may be only a
80	 * few tens of bytes in length.  So, this depends on the ability
81	 * to fcntl lock file offsets much larger than the physical file.
82	 */
83	malloc_possible = 0;
84#endif
85
86#ifdef __hppa
87	/*
88	 * XXX
89	 * HP-UX won't permit mutexes to live in anything but shared memory.
90	 * Instantiate a shared region file on that architecture, regardless.
91	 */
92	malloc_possible = 0;
93#endif
94	/*
95	 * If a region is truly private, malloc the memory.  That's faster
96	 * than either anonymous memory or a shared file.
97	 */
98	if (malloc_possible && F_ISSET(infop, REGION_PRIVATE)) {
99		if ((ret = __os_malloc(infop->size, NULL, &infop->addr)) != 0)
100			return (ret);
101
102		/*
103		 * It's sometimes significantly faster to page-fault in all of
104		 * the region's pages before we run the application, as we see
105		 * nasty side-effects when we page-fault while holding various
106		 * locks, i.e., the lock takes a long time to acquire because
107		 * of the underlying page fault, and the other threads convoy
108		 * behind the lock holder.
109		 */
110		if (DB_GLOBAL(db_region_init))
111			for (p = infop->addr;
112			    p < (u_int8_t *)infop->addr + infop->size;
113			    p += DB_VMPAGESIZE)
114				p[0] = '\0';
115
116		F_SET(infop, REGION_CREATED | REGION_MALLOC);
117		goto region_init;
118	}
119
120	/*
121	 * Get the name of the region (creating the file if a temporary file
122	 * is being used).  The dbenv contains the current DB environment,
123	 * including naming information.  The path argument may be a file or
124	 * a directory.  If path is a directory, it must exist and file is the
125	 * file name to be created inside the directory.  If path is a file,
126	 * then file must be NULL.
127	 */
128	if ((ret = __db_appname(infop->dbenv, infop->appname, infop->path,
129	    infop->file, infop->dbflags, &infop->fd, &infop->name)) != 0)
130		return (ret);
131	if (infop->fd != -1)
132		F_SET(infop, REGION_CREATED);
133
134	/*
135	 * Try to create the file, if we have authority.  We have to make sure
136	 * that multiple threads/processes attempting to simultaneously create
137	 * the region are properly ordered, so we open it using DB_CREATE and
138	 * DB_EXCL, so two attempts to create the region will return failure in
139	 * one.
140	 */
141	if (infop->fd == -1 && infop->dbflags & DB_CREATE) {
142		flags = infop->dbflags;
143		LF_SET(DB_EXCL);
144		if ((ret = __db_open(infop->name,
145		    flags, flags, infop->mode, &infop->fd)) == 0)
146			F_SET(infop, REGION_CREATED);
147		else
148			if (ret != EEXIST)
149				goto errmsg;
150	}
151
152	/* If we couldn't create the file, try and open it. */
153	if (infop->fd == -1) {
154		flags = infop->dbflags;
155		LF_CLR(DB_CREATE | DB_EXCL);
156		if ((ret = __db_open(infop->name,
157		    flags, flags, infop->mode, &infop->fd)) != 0)
158			goto errmsg;
159	}
160
161	/*
162	 * There are three cases we support:
163	 *    1. Named anonymous memory (shmget(2)).
164	 *    2. Unnamed anonymous memory (mmap(2): MAP_ANON/MAP_ANONYMOUS).
165	 *    3. Memory backed by a regular file (mmap(2)).
166	 *
167	 * We instantiate a backing file in all cases, which contains at least
168	 * the RLAYOUT structure, and in case #3, contains the actual region.
169	 * This is necessary for a couple of reasons:
170	 *
171	 * First, the mpool region uses temporary files to name regions, and
172	 * since you may have multiple regions in the same directory, we need
173	 * a filesystem name to ensure that they don't collide.
174	 *
175	 * Second, applications are allowed to forcibly remove regions, even
176	 * if they don't know anything about them other than the name.  If a
177	 * region is backed by anonymous memory, there has to be some way for
178	 * the application to find out that information, and, in some cases,
179	 * determine ID information for the anonymous memory.
180	 */
181	if (F_ISSET(infop, REGION_CREATED)) {
182		/*
183		 * If we're using anonymous memory to back this region, set
184		 * the flag.
185		 */
186		if (DB_GLOBAL(db_region_anon))
187			F_SET(infop, REGION_ANONYMOUS);
188
189		/*
190		 * If we're using a regular file to back a region we created,
191		 * grow it to the specified size.
192		 */
193		if (!DB_GLOBAL(db_region_anon) &&
194		    (ret = __db_growregion(infop, infop->size)) != 0)
195			goto err;
196	} else {
197		/*
198		 * If we're joining a region, figure out what it looks like.
199		 *
200		 * XXX
201		 * We have to figure out if the file is a regular file backing
202		 * a region that we want to map into our address space, or a
203		 * file with the information we need to find a shared anonymous
204		 * region that we want to map into our address space.
205		 *
206		 * All this noise is because some systems don't have a coherent
207		 * VM and buffer cache, and worse, if you mix operations on the
208		 * VM and buffer cache, half the time you hang the system.
209		 *
210		 * There are two possibilities.  If the file is the size of an
211		 * RLAYOUT structure, then we know that the real region is in
212		 * shared memory, because otherwise it would be bigger.  (As
213		 * the RLAYOUT structure size is smaller than a disk sector,
214		 * the only way it can be this size is if deliberately written
215		 * that way.)  In which case, retrieve the information we need
216		 * from the RLAYOUT structure and use it to acquire the shared
217		 * memory.
218		 *
219		 * If the structure is larger than an RLAYOUT structure, then
220		 * the file is backing the shared memory region, and we use
221		 * the current size of the file without reading any information
222		 * from the file itself so that we don't confuse the VM.
223		 *
224		 * And yes, this makes me want to take somebody and kill them,
225		 * but I can't think of any other solution.
226		 */
227		if ((ret = __os_ioinfo(infop->name,
228		    infop->fd, &mbytes, &bytes, NULL)) != 0)
229			goto errmsg;
230		size = mbytes * MEGABYTE + bytes;
231
232		if (size <= sizeof(RLAYOUT)) {
233			/*
234			 * If the size is too small, the read fails or the
235			 * valid flag is incorrect, assume it's because the
236			 * RLAYOUT information hasn't been written out yet,
237			 * and retry.
238			 */
239			if (size < sizeof(RLAYOUT))
240				goto retry;
241			if ((ret =
242			    __os_read(infop->fd, &rl, sizeof(rl), &nr)) != 0)
243				goto retry;
244			if (rl.valid != DB_REGIONMAGIC)
245				goto retry;
246
247			/* Copy the size, memory id and characteristics. */
248			size = rl.size;
249			infop->segid = rl.segid;
250			if (F_ISSET(&rl, REGION_ANONYMOUS))
251				F_SET(infop, REGION_ANONYMOUS);
252		}
253
254		/*
255		 * If the region is larger than we think, that's okay, use the
256		 * current size.  If it's smaller than we think, and we were
257		 * just using the default size, that's okay, use the current
258		 * size.  If it's smaller than we think and we really care,
259		 * save the size and we'll catch that further down -- we can't
260		 * correct it here because we have to have a lock to grow the
261		 * region.
262		 */
263		if (infop->size > size && !F_ISSET(infop, REGION_SIZEDEF))
264			grow_region = infop->size;
265		infop->size = size;
266	}
267
268	/*
269	 * Map the region into our address space.  If we're creating it, the
270	 * underlying routines will make it the right size.
271	 *
272	 * There are at least two cases where we can "reasonably" fail when
273	 * we attempt to map in the region.  On Windows/95, closing the last
274	 * reference to a region causes it to be zeroed out.  On UNIX, when
275	 * using the shmget(2) interfaces, the region will no longer exist
276	 * if the system was rebooted.  In these cases, the underlying map call
277	 * returns EAGAIN, and we *remove* our file and try again.  There are
278	 * obvious races in doing this, but it should eventually settle down
279	 * to a winner and then things should proceed normally.
280	 */
281	if ((ret = __db_mapregion(infop->name, infop)) != 0)
282		if (ret == EAGAIN) {
283			/*
284			 * Pretend we created the region even if we didn't so
285			 * that our error processing unlinks it.
286			 */
287			F_SET(infop, REGION_CREATED);
288			ret = 0;
289			goto retry;
290		} else
291			goto err;
292
293region_init:
294	/*
295	 * Initialize the common region information.
296	 *
297	 * !!!
298	 * We have to order the region creates so that two processes don't try
299	 * to simultaneously create the region.  This is handled by using the
300	 * DB_CREATE and DB_EXCL flags when we create the "backing" region file.
301	 *
302	 * We also have to order region joins so that processes joining regions
303	 * never see inconsistent data.  We'd like to play permissions games
304	 * with the backing file, but we can't because WNT filesystems won't
305	 * open a file mode 0.
306	 */
307	rlp = (RLAYOUT *)infop->addr;
308	if (F_ISSET(infop, REGION_CREATED)) {
309		/*
310		 * The process creating the region acquires a lock before it
311		 * sets the valid flag.  Any processes joining the region will
312		 * check the valid flag before acquiring the lock.
313		 *
314		 * Check the return of __db_mutex_init() and __db_mutex_lock(),
315		 * even though we don't usually check elsewhere.  This is the
316		 * first lock we initialize and acquire, and we have to know if
317		 * it fails.  (It CAN fail, e.g., SunOS, when using fcntl(2)
318		 * for locking, with an in-memory filesystem specified as the
319		 * database home.)
320		 */
321		if ((ret = __db_mutex_init(&rlp->lock,
322		    MUTEX_LOCK_OFFSET(rlp, &rlp->lock))) != 0 ||
323		    (ret = __db_mutex_lock(&rlp->lock, infop->fd)) != 0)
324			goto err;
325
326		/* Initialize the remaining region information. */
327		rlp->refcnt = 1;
328		rlp->size = infop->size;
329		db_version(&rlp->majver, &rlp->minver, &rlp->patch);
330		rlp->panic = 0;
331		rlp->segid = infop->segid;
332		rlp->flags = 0;
333		if (F_ISSET(infop, REGION_ANONYMOUS))
334			F_SET(rlp, REGION_ANONYMOUS);
335
336		/*
337		 * Fill in the valid field last -- use a magic number, memory
338		 * may not be zero-filled, and we want to minimize the chance
339		 * for collision.
340		 */
341		rlp->valid = DB_REGIONMAGIC;
342
343		/*
344		 * If the region is anonymous, write the RLAYOUT information
345		 * into the backing file so that future region join and unlink
346		 * calls can find it.
347		 *
348		 * XXX
349		 * We MUST do the seek before we do the write.  On Win95, while
350		 * closing the last reference to an anonymous shared region
351		 * doesn't discard the region, it does zero it out.  So, the
352		 * REGION_CREATED may be set, but the file may have already
353		 * been written and the file descriptor may be at the end of
354		 * the file.
355		 */
356		if (F_ISSET(infop, REGION_ANONYMOUS)) {
357			if ((ret = __os_seek(infop->fd, 0, 0, 0, 0, 0)) != 0)
358				goto err;
359			if ((ret =
360			    __os_write(infop->fd, rlp, sizeof(*rlp), &nw)) != 0)
361				goto err;
362		}
363	} else {
364		/* Check to see if the region has had catastrophic failure. */
365		if (rlp->panic) {
366			ret = DB_RUNRECOVERY;
367			goto err;
368		}
369
370		/*
371		 * Check the valid flag to ensure the region is initialized.
372		 * If the valid flag has not been set, the mutex may not have
373		 * been initialized, and an attempt to get it could lead to
374		 * random behavior.
375		 */
376		if (rlp->valid != DB_REGIONMAGIC)
377			goto retry;
378
379		/* Get the region lock. */
380		(void)__db_mutex_lock(&rlp->lock, infop->fd);
381
382		/*
383		 * We now own the region.  There are a couple of things that
384		 * may have gone wrong, however.
385		 *
386		 * Problem #1: while we were waiting for the lock, the region
387		 * was deleted.  Detected by re-checking the valid flag, since
388		 * it's cleared by the delete region routines.
389		 */
390		if (rlp->valid != DB_REGIONMAGIC) {
391			(void)__db_mutex_unlock(&rlp->lock, infop->fd);
392			goto retry;
393		}
394
395		/*
396		 * Problem #3: when we checked the size of the file, it was
397		 * still growing as part of creation.  Detected by the fact
398		 * that infop->size isn't the same size as the region.
399		 */
400		if (infop->size != rlp->size) {
401			(void)__db_mutex_unlock(&rlp->lock, infop->fd);
402			goto retry;
403		}
404
405		/* Increment the reference count. */
406		++rlp->refcnt;
407	}
408
409	/* Return the region in a locked condition. */
410
411	if (0) {
412errmsg:		__db_err(infop->dbenv, "%s: %s", infop->name, strerror(ret));
413
414err:
415retry:		/* Discard the region. */
416		if (infop->addr != NULL) {
417			(void)__db_unmapregion(infop);
418			infop->addr = NULL;
419		}
420
421		/* Discard the backing file. */
422		if (infop->fd != -1) {
423			(void)__os_close(infop->fd);
424			infop->fd = -1;
425
426			if (F_ISSET(infop, REGION_CREATED))
427				(void)__os_unlink(infop->name);
428		}
429
430		/* Discard the name. */
431		if (infop->name != NULL) {
432			__os_freestr(infop->name);
433			infop->name = NULL;
434		}
435
436		/*
437		 * If we had a temporary error, wait a few seconds and
438		 * try again.
439		 */
440		if (ret == 0) {
441			if (++retry_cnt <= 3) {
442				__os_sleep(retry_cnt * 2, 0);
443				goto loop;
444			}
445			ret = EAGAIN;
446		}
447	}
448
449	/*
450	 * XXX
451	 * HP-UX won't permit mutexes to live in anything but shared memory.
452	 * Instantiate a shared region file on that architecture, regardless.
453	 *
454	 * XXX
455	 * There's a problem in cleaning this up on application exit, or on
456	 * application failure.  If an application opens a database without
457	 * an environment, we create a temporary backing mpool region for it.
458	 * That region is marked REGION_PRIVATE, but as HP-UX won't permit
459	 * mutexes to live in anything but shared memory, we instantiate a
460	 * real file plus a memory region of some form.  If the application
461	 * crashes, the necessary information to delete the backing file and
462	 * any system region (e.g., the shmget(2) segment ID) is no longer
463	 * available.  We can't completely fix the problem, but we try.
464	 *
465	 * The underlying UNIX __db_mapregion() code preferentially uses the
466	 * mmap(2) interface with the MAP_ANON/MAP_ANONYMOUS flags for regions
467	 * that are marked REGION_PRIVATE.  This means that we normally aren't
468	 * holding any system resources when we get here, in which case we can
469	 * delete the backing file.  This results in a short race, from the
470	 * __db_open() call above to here.
471	 *
472	 * If, for some reason, we are holding system resources when we get
473	 * here, we don't have any choice -- we can't delete the backing file
474	 * because we may need it to detach from the resources.  Set the
475	 * REGION_LASTDETACH flag, so that we do all necessary cleanup when
476	 * the application closes the region.
477	 */
478	if (F_ISSET(infop, REGION_PRIVATE) && !F_ISSET(infop, REGION_MALLOC))
479		if (F_ISSET(infop, REGION_HOLDINGSYS))
480			F_SET(infop, REGION_LASTDETACH);
481		else {
482			F_SET(infop, REGION_REMOVED);
483			F_CLR(infop, REGION_CANGROW);
484
485			(void)__os_close(infop->fd);
486			(void)__os_unlink(infop->name);
487		}
488
489	return (ret);
490}
491
492/*
493 * __db_rdetach --
494 *	De-attach from a shared memory region.
495 *
496 * PUBLIC: int __db_rdetach __P((REGINFO *));
497 */
498int
499__db_rdetach(infop)
500	REGINFO *infop;
501{
502	RLAYOUT *rlp;
503	int detach, ret, t_ret;
504
505	ret = 0;
506
507	/*
508	 * If the region was removed when it was created, no further action
509	 * is required.
510	 */
511	if (F_ISSET(infop, REGION_REMOVED))
512		goto done;
513	/*
514	 * If the region was created in memory returned by malloc, the only
515	 * action required is freeing the memory.
516	 */
517	if (F_ISSET(infop, REGION_MALLOC)) {
518		__os_free(infop->addr, 0);
519		goto done;
520	}
521
522	/* Otherwise, attach to the region and optionally delete it. */
523	rlp = infop->addr;
524
525	/* Get the lock. */
526	(void)__db_mutex_lock(&rlp->lock, infop->fd);
527
528	/* Decrement the reference count. */
529	if (rlp->refcnt == 0)
530		__db_err(infop->dbenv,
531		    "region rdetach: reference count went to zero!");
532	else
533		--rlp->refcnt;
534
535	/*
536	 * If we're going to remove the region, clear the valid flag so
537	 * that any region join that's blocked waiting for us will know
538	 * what happened.
539	 */
540	detach = 0;
541	if (F_ISSET(infop, REGION_LASTDETACH))
542		if (rlp->refcnt == 0) {
543			detach = 1;
544			rlp->valid = 0;
545		} else
546			ret = EBUSY;
547
548	/* Release the lock. */
549	(void)__db_mutex_unlock(&rlp->lock, infop->fd);
550
551	/* Close the backing file descriptor. */
552	(void)__os_close(infop->fd);
553	infop->fd = -1;
554
555	/* Discard our mapping of the region. */
556	if ((t_ret = __db_unmapregion(infop)) != 0 && ret == 0)
557		ret = t_ret;
558
559	/* Discard the region itself. */
560	if (detach) {
561		if ((t_ret =
562		    __db_unlinkregion(infop->name, infop) != 0) && ret == 0)
563			ret = t_ret;
564		if ((t_ret = __os_unlink(infop->name) != 0) && ret == 0)
565			ret = t_ret;
566	}
567
568done:	/* Discard the name. */
569	if (infop->name != NULL) {
570		__os_freestr(infop->name);
571		infop->name = NULL;
572	}
573
574	return (ret);
575}
576
577/*
578 * __db_runlink --
579 *	Remove a region.
580 *
581 * PUBLIC: int __db_runlink __P((REGINFO *, int));
582 */
583int
584__db_runlink(infop, force)
585	REGINFO *infop;
586	int force;
587{
588	RLAYOUT rl, *rlp;
589	size_t size;
590	ssize_t nr;
591	u_int32_t mbytes, bytes;
592	int fd, ret, t_ret;
593	char *name;
594
595	/*
596	 * XXX
597	 * We assume that we've created a new REGINFO structure for this
598	 * call, not used one that was already initialized.  Regardless,
599	 * if anyone is planning to use it after we're done, they're going
600	 * to be sorely disappointed.
601	 *
602	 * If force isn't set, we attach to the region, set a flag to delete
603	 * the region on last close, and let the region delete code do the
604	 * work.
605	 */
606	if (!force) {
607		if ((ret = __db_rattach(infop)) != 0)
608			return (ret);
609
610		rlp = (RLAYOUT *)infop->addr;
611		(void)__db_mutex_unlock(&rlp->lock, infop->fd);
612
613		F_SET(infop, REGION_LASTDETACH);
614
615		return (__db_rdetach(infop));
616	}
617
618	/*
619	 * Otherwise, we don't want to attach to the region.  We may have been
620	 * called to clean up if a process died leaving a region locked and/or
621	 * corrupted, which could cause the attach to hang.
622	 */
623	if ((ret = __db_appname(infop->dbenv, infop->appname,
624	    infop->path, infop->file, infop->dbflags, NULL, &name)) != 0)
625		return (ret);
626
627	/*
628	 * An underlying file is created for all regions other than private
629	 * (REGION_PRIVATE) ones, regardless of whether or not it's used to
630	 * back the region.  If that file doesn't exist, we're done.
631	 */
632	if (__os_exists(name, NULL) != 0) {
633		__os_freestr(name);
634		return (0);
635	}
636
637	/*
638	 * See the comments in __db_rattach -- figure out if this is a regular
639	 * file backing a region or if it's a regular file with information
640	 * about a region.
641	 */
642	if ((ret = __db_open(name, DB_RDONLY, DB_RDONLY, 0, &fd)) != 0)
643		goto errmsg;
644	if ((ret = __os_ioinfo(name, fd, &mbytes, &bytes, NULL)) != 0)
645		goto errmsg;
646	size = mbytes * MEGABYTE + bytes;
647
648	if (size <= sizeof(RLAYOUT)) {
649		if ((ret = __os_read(fd, &rl, sizeof(rl), &nr)) != 0)
650			goto errmsg;
651		if (rl.valid != DB_REGIONMAGIC) {
652			__db_err(infop->dbenv,
653			    "%s: illegal region magic number", name);
654			ret = EINVAL;
655			goto err;
656		}
657
658		/* Set the size, memory id and characteristics. */
659		infop->size = rl.size;
660		infop->segid = rl.segid;
661		if (F_ISSET(&rl, REGION_ANONYMOUS))
662			F_SET(infop, REGION_ANONYMOUS);
663	} else {
664		infop->size = size;
665		infop->segid = INVALID_SEGID;
666	}
667
668	/* Remove the underlying region. */
669	ret = __db_unlinkregion(name, infop);
670
671	/*
672	 * Unlink the backing file.  Close the open file descriptor first,
673	 * because some architectures (e.g., Win32) won't unlink a file if
674	 * open file descriptors remain.
675	 */
676	(void)__os_close(fd);
677	if ((t_ret = __os_unlink(name)) != 0 && ret == 0)
678		ret = t_ret;
679
680	if (0) {
681errmsg:		__db_err(infop->dbenv, "%s: %s", name, strerror(ret));
682err:		(void)__os_close(fd);
683	}
684
685	__os_freestr(name);
686	return (ret);
687}
688
689/*
690 * __db_rgrow --
691 *	Extend a region.
692 *
693 * PUBLIC: int __db_rgrow __P((REGINFO *, size_t));
694 */
695int
696__db_rgrow(infop, new_size)
697	REGINFO *infop;
698	size_t new_size;
699{
700	RLAYOUT *rlp;
701	size_t increment;
702	int ret;
703
704	/*
705	 * !!!
706	 * This routine MUST be called with the region already locked.
707	 */
708
709	/* The underlying routines have flagged if this region can grow. */
710	if (!F_ISSET(infop, REGION_CANGROW))
711		return (EINVAL);
712
713	/*
714	 * Round off the requested size to the next page boundary, and
715	 * determine the additional space required.
716	 */
717	rlp = (RLAYOUT *)infop->addr;
718	DB_ROUNDOFF(new_size, DB_VMPAGESIZE);
719	increment = new_size - rlp->size;
720
721	if ((ret = __db_growregion(infop, increment)) != 0)
722		return (ret);
723
724	/* Update the on-disk region size. */
725	rlp->size = new_size;
726
727	/* Detach from and reattach to the region. */
728	return (__db_rreattach(infop, new_size));
729}
730
731/*
732 * __db_growregion --
733 *	Grow a shared memory region.
734 */
735static int
736__db_growregion(infop, increment)
737	REGINFO *infop;
738	size_t increment;
739{
740	db_pgno_t pages;
741	size_t i;
742	ssize_t nr, nw;
743	u_int32_t relative;
744	int ret;
745	char buf[DB_VMPAGESIZE];
746
747	/* Seek to the end of the region. */
748	if ((ret = __os_seek(infop->fd, 0, 0, 0, 0, SEEK_END)) != 0)
749		goto err;
750
751	/* Write nuls to the new bytes. */
752	memset(buf, 0, sizeof(buf));
753
754	/*
755	 * Some systems require that all of the bytes of the region be
756	 * written before it can be mapped and accessed randomly, and
757	 * other systems don't zero out the pages.
758	 */
759	if (__db_mapinit())
760		/* Extend the region by writing each new page. */
761		for (i = 0; i < increment; i += DB_VMPAGESIZE) {
762			if ((ret =
763			    __os_write(infop->fd, buf, sizeof(buf), &nw)) != 0)
764				goto err;
765			if (nw != sizeof(buf))
766				goto eio;
767		}
768	else {
769		/*
770		 * Extend the region by writing the last page.  If the region
771		 * is >4Gb, increment may be larger than the maximum possible
772		 * seek "relative" argument, as it's an unsigned 32-bit value.
773		 * Break the offset into pages of 1MB each so that we don't
774		 * overflow (2^20 + 2^32 is bigger than any memory I expect
775		 * to see for awhile).
776		 */
777		pages = (increment - DB_VMPAGESIZE) / MEGABYTE;
778		relative = (increment - DB_VMPAGESIZE) % MEGABYTE;
779		if ((ret = __os_seek(infop->fd,
780		    MEGABYTE, pages, relative, 0, SEEK_CUR)) != 0)
781			goto err;
782		if ((ret = __os_write(infop->fd, buf, sizeof(buf), &nw)) != 0)
783			goto err;
784		if (nw != sizeof(buf))
785			goto eio;
786
787		/*
788		 * It's sometimes significantly faster to page-fault in all of
789		 * the region's pages before we run the application, as we see
790		 * nasty side-effects when we page-fault while holding various
791		 * locks, i.e., the lock takes a long time to acquire because
792		 * of the underlying page fault, and the other threads convoy
793		 * behind the lock holder.
794		 *
795		 * We also use REGION_INIT to guarantee that there is enough
796		 * disk space for the region, so we also write a byte to each
797		 * page.  Reading the byte is insufficient as some systems
798		 * (e.g., Solaris) do not instantiate disk pages to satisfy
799		 * a read, and so we don't know if there is enough disk space
800		 * or not.
801		 */
802		if (DB_GLOBAL(db_region_init)) {
803			pages = increment / MEGABYTE;
804			relative = increment % MEGABYTE;
805			if ((ret = __os_seek(infop->fd,
806			    MEGABYTE, pages, relative, 1, SEEK_END)) != 0)
807				goto err;
808
809			/* Write a byte to each page. */
810			for (i = 0; i < increment; i += DB_VMPAGESIZE) {
811				if ((ret =
812				    __os_write(infop->fd, buf, 1, &nr)) != 0)
813					goto err;
814				if (nr != 1)
815					goto eio;
816				if ((ret = __os_seek(infop->fd,
817				    0, 0, DB_VMPAGESIZE - 1, 0, SEEK_CUR)) != 0)
818					goto err;
819			}
820		}
821	}
822	return (0);
823
824eio:	ret = EIO;
825err:	__db_err(infop->dbenv, "region grow: %s", strerror(ret));
826	return (ret);
827}
828
829/*
830 * __db_rreattach --
831 *	Detach from and reattach to a region.
832 *
833 * PUBLIC: int __db_rreattach __P((REGINFO *, size_t));
834 */
835int
836__db_rreattach(infop, new_size)
837	REGINFO *infop;
838	size_t new_size;
839{
840	int ret;
841
842#ifdef DIAGNOSTIC
843	if (infop->name == NULL) {
844		__db_err(infop->dbenv, "__db_rreattach: name was NULL");
845		return (EINVAL);
846	}
847#endif
848	/*
849	 * If we're growing an already mapped region, we have to unmap it
850	 * and get it back.  We have it locked, so nobody else can get in,
851	 * which makes it fairly straight-forward to do, as everybody else
852	 * is going to block while we do the unmap/remap.  NB: if we fail
853	 * to get it back, the pooch is genuinely screwed, because we can
854	 * never release the lock we're holding.
855	 *
856	 * Detach from the region.  We have to do this first so architectures
857	 * that don't permit a file to be mapped into different places in the
858	 * address space simultaneously, e.g., HP's PaRisc, will work.
859	 */
860	if ((ret = __db_unmapregion(infop)) != 0)
861		return (ret);
862
863	/* Update the caller's REGINFO size to the new map size. */
864	infop->size = new_size;
865
866	/* Attach to the region. */
867	ret = __db_mapregion(infop->name, infop);
868
869	return (ret);
870}
871