1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996,2008 Oracle.  All rights reserved.
5 *
6 * $Id: env_region.c,v 12.45 2008/01/31 18:40:43 bostic Exp $
7 */
8
9#include "db_config.h"
10
11#include "db_int.h"
12#include "dbinc/mp.h"
13
/*
 * Prototypes for the static (file-local) helper routines used by the
 * public region functions below.
 */
static int  __env_des_get __P((ENV *, REGINFO *, REGINFO *, REGION **));
static int  __env_faultmem __P((ENV *, void *, size_t, int));
static int  __env_sys_attach __P((ENV *, REGINFO *, REGION *));
static int  __env_sys_detach __P((ENV *, REGINFO *, int));
static void __env_des_destroy __P((ENV *, REGION *));
static void __env_remove_file __P((ENV *));
20
21/*
22 * __env_attach
23 *	Join/create the environment
24 *
25 * PUBLIC: int __env_attach __P((ENV *, u_int32_t *, int, int));
26 */
27int
28__env_attach(env, init_flagsp, create_ok, retry_ok)
29	ENV *env;
30	u_int32_t *init_flagsp;
31	int create_ok, retry_ok;
32{
33	DB_ENV *dbenv;
34	REGENV *renv;
35	REGENV_REF ref;
36	REGINFO *infop;
37	REGION *rp, tregion;
38	size_t nrw, size;
39	u_int32_t bytes, i, mbytes, nregions, signature;
40	u_int retry_cnt;
41	int majver, minver, patchver, ret, segid;
42	char buf[sizeof(DB_REGION_FMT) + 20];
43
44	/* Initialization */
45	dbenv = env->dbenv;
46	retry_cnt = 0;
47	signature = __env_struct_sig();
48
49	/* Repeated initialization. */
50loop:	renv = NULL;
51
52	/* Set up the ENV's REG_INFO structure. */
53	if ((ret = __os_calloc(env, 1, sizeof(REGINFO), &infop)) != 0)
54		return (ret);
55	infop->env = env;
56	infop->type = REGION_TYPE_ENV;
57	infop->id = REGION_ID_ENV;
58	infop->flags = REGION_JOIN_OK;
59	if (create_ok)
60		F_SET(infop, REGION_CREATE_OK);
61
62	/* Build the region name. */
63	if (F_ISSET(env, ENV_PRIVATE))
64		ret = __os_strdup(env, "process-private", &infop->name);
65	else {
66		(void)snprintf(buf, sizeof(buf), "%s", DB_REGION_ENV);
67		ret =
68		    __db_appname(env, DB_APP_NONE, buf, 0, NULL, &infop->name);
69	}
70	if (ret != 0)
71		goto err;
72
73	/*
74	 * We have to single-thread the creation of the REGENV region.  Once
75	 * it exists, we can serialize using region mutexes, but until then
76	 * we have to be the only player in the game.
77	 *
78	 * If this is a private environment, we are only called once and there
79	 * are no possible race conditions.
80	 *
81	 * If this is a public environment, we use the filesystem to ensure
82	 * the creation of the environment file is single-threaded.
83	 *
84	 * If the application has specified their own mapping functions, try
85	 * and create the region.  The application will have to let us know if
86	 * it's actually a creation or not, and we'll have to fall-back to a
87	 * join if it's not a create.
88	 */
89	if (F_ISSET(env, ENV_PRIVATE) || DB_GLOBAL(j_region_map) != NULL)
90		goto creation;
91
92	/*
93	 * Try to create the file, if we have the authority.  We have to ensure
94	 * that multiple threads/processes attempting to simultaneously create
95	 * the file are properly ordered.  Open using the O_CREAT and O_EXCL
96	 * flags so that multiple attempts to create the region will return
97	 * failure in all but one.  POSIX 1003.1 requires that EEXIST be the
98	 * errno return value -- I sure hope they're right.
99	 */
100	if (create_ok) {
101		if ((ret = __os_open(env, infop->name, 0,
102		    DB_OSO_CREATE | DB_OSO_EXCL | DB_OSO_REGION,
103		    env->db_mode, &env->lockfhp)) == 0)
104			goto creation;
105		if (ret != EEXIST) {
106			__db_err(env, ret, "%s", infop->name);
107			goto err;
108		}
109	}
110
111	/* The region must exist, it's not okay to recreate it. */
112	F_CLR(infop, REGION_CREATE_OK);
113
114	/*
115	 * If we couldn't create the file, try and open it.  (If that fails,
116	 * we're done.)
117	 */
118	if ((ret = __os_open(
119	    env, infop->name, 0, DB_OSO_REGION, 0, &env->lockfhp)) != 0)
120		goto err;
121
122	/*
123	 * !!!
124	 * The region may be in system memory not backed by the filesystem
125	 * (more specifically, not backed by this file), and we're joining
126	 * it.  In that case, the process that created it will have written
127	 * out a REGENV_REF structure as its only contents.  We read that
128	 * structure before we do anything further, e.g., we can't just map
129	 * that file in and then figure out what's going on.
130	 *
131	 * All of this noise is because some systems don't have a coherent VM
132	 * and buffer cache, and what's worse, when you mix operations on the
133	 * VM and buffer cache, half the time you hang the system.
134	 *
135	 * If the file is the size of an REGENV_REF structure, then we know
136	 * the real region is in some other memory.  (The only way you get a
137	 * file that size is to deliberately write it, as it's smaller than
138	 * any possible disk sector created by writing a file or mapping the
139	 * file into memory.)  In which case, retrieve the structure from the
140	 * file and use it to acquire the referenced memory.
141	 *
142	 * If the structure is larger than a REGENV_REF structure, then this
143	 * file is backing the shared memory region, and we just map it into
144	 * memory.
145	 *
146	 * And yes, this makes me want to take somebody and kill them.  (I
147	 * digress -- but you have no freakin' idea.  This is unbelievably
148	 * stupid and gross, and I've probably spent six months of my life,
149	 * now, trying to make different versions of it work.)
150	 */
151	if ((ret = __os_ioinfo(env, infop->name,
152	    env->lockfhp, &mbytes, &bytes, NULL)) != 0) {
153		__db_err(env, ret, "%s", infop->name);
154		goto err;
155	}
156
157	/*
158	 * !!!
159	 * A size_t is OK -- regions get mapped into memory, and so can't
160	 * be larger than a size_t.
161	 */
162	size = mbytes * MEGABYTE + bytes;
163
164	/*
165	 * If the size is less than the size of a REGENV_REF structure, the
166	 * region (or, possibly, the REGENV_REF structure) has not yet been
167	 * completely written.  Shouldn't be possible, but there's no reason
168	 * not to wait awhile and try again.
169	 *
170	 * Otherwise, if the size is the size of a REGENV_REF structure,
171	 * read it into memory and use it as a reference to the real region.
172	 */
173	if (size <= sizeof(ref)) {
174		if (size != sizeof(ref))
175			goto retry;
176
177		if ((ret = __os_read(env, env->lockfhp, &ref,
178		    sizeof(ref), &nrw)) != 0 || nrw < (size_t)sizeof(ref)) {
179			if (ret == 0)
180				ret = EIO;
181			__db_err(env, ret,
182		    "%s: unable to read system-memory information",
183			    infop->name);
184			goto err;
185		}
186		size = ref.size;
187		segid = ref.segid;
188
189		F_SET(env, ENV_SYSTEM_MEM);
190	} else if (F_ISSET(env, ENV_SYSTEM_MEM)) {
191		ret = EINVAL;
192		__db_err(env, ret,
193		    "%s: existing environment not created in system memory",
194		    infop->name);
195		goto err;
196	} else
197		segid = INVALID_REGION_SEGID;
198
199#ifndef HAVE_MUTEX_FCNTL
200	/*
201	 * If we're not doing fcntl locking, we can close the file handle.  We
202	 * no longer need it and the less contact between the buffer cache and
203	 * the VM, the better.
204	 */
205	 (void)__os_closehandle(env, env->lockfhp);
206	 env->lockfhp = NULL;
207#endif
208
209	/* Call the region join routine to acquire the region. */
210	memset(&tregion, 0, sizeof(tregion));
211	tregion.size = (roff_t)size;
212	tregion.segid = segid;
213	if ((ret = __env_sys_attach(env, infop, &tregion)) != 0)
214		goto err;
215
216user_map_functions:
217	/*
218	 * The environment's REGENV structure has to live at offset 0 instead
219	 * of the usual alloc information.  Set the primary reference and
220	 * correct the "addr" value to reference the alloc region.  Note,
221	 * this means that all of our offsets (R_ADDR/R_OFFSET) get shifted
222	 * as well, but that should be fine.
223	 */
224	infop->primary = infop->addr;
225	infop->addr = (u_int8_t *)infop->addr + sizeof(REGENV);
226	renv = infop->primary;
227
228	/*
229	 * Make sure the region matches our build.  Special case a region
230	 * that's all nul bytes, just treat it like any other corruption.
231	 */
232	if (renv->majver != DB_VERSION_MAJOR ||
233	    renv->minver != DB_VERSION_MINOR) {
234		if (renv->majver != 0 || renv->minver != 0) {
235			__db_errx(env,
236	"Program version %d.%d doesn't match environment version %d.%d",
237			    DB_VERSION_MAJOR, DB_VERSION_MINOR,
238			    renv->majver, renv->minver);
239			ret = DB_VERSION_MISMATCH;
240		} else
241			ret = EINVAL;
242		goto err;
243	}
244	if (renv->signature != signature) {
245		__db_errx(env, "Build signature doesn't match environment");
246		ret = DB_VERSION_MISMATCH;
247		goto err;
248	}
249
250	/*
251	 * Check if the environment has had a catastrophic failure.
252	 *
253	 * Check the magic number to ensure the region is initialized.  If the
254	 * magic number isn't set, the lock may not have been initialized, and
255	 * an attempt to use it could lead to random behavior.
256	 *
257	 * The panic and magic values aren't protected by any lock, so we never
258	 * use them in any check that's more complex than set/not-set.
259	 *
260	 * !!!
261	 * I'd rather play permissions games using the underlying file, but I
262	 * can't because Windows/NT filesystems won't open files mode 0.
263	 */
264	if (renv->panic && !F_ISSET(dbenv, DB_ENV_NOPANIC)) {
265		ret = __env_panic_msg(env);
266		goto err;
267	}
268	if (renv->magic != DB_REGION_MAGIC)
269		goto retry;
270
271	/*
272	 * Get a reference to the underlying REGION information for this
273	 * environment.
274	 */
275	if ((ret = __env_des_get(env, infop, infop, &rp)) != 0 || rp == NULL)
276		goto find_err;
277	infop->rp = rp;
278
279	/*
280	 * There's still a possibility for inconsistent data.  When we acquired
281	 * the size of the region and attached to it, it might have still been
282	 * growing as part of its creation.  We can detect this by checking the
283	 * size we originally found against the region's current size.  (The
284	 * region's current size has to be final, the creator finished growing
285	 * it before setting the magic number in the region.)
286	 *
287	 * !!!
288	 * Skip this test when the application specified its own map functions.
289	 * The size of the region is essentially unknown in that case: some
290	 * other process asked the application's map function for some bytes,
291	 * but we were never told the final size of the region.  We could get
292	 * a size back from the map function, but for all we know, our process'
293	 * map function only knows how to join regions, it has no clue how big
294	 * those regions are.
295	 */
296	if (DB_GLOBAL(j_region_map) == NULL && rp->size != size)
297		goto retry;
298
299	/*
300	 * Check our callers configuration flags, it's an error to configure
301	 * incompatible or additional subsystems in an existing environment.
302	 * Return the total set of flags to the caller so they initialize the
303	 * correct set of subsystems.
304	 */
305	if (init_flagsp != NULL) {
306		FLD_CLR(*init_flagsp, renv->init_flags);
307		if (*init_flagsp != 0) {
308			__db_errx(env,
309    "configured environment flags incompatible with existing environment");
310			ret = EINVAL;
311			goto err;
312		}
313		*init_flagsp = renv->init_flags;
314	}
315
316	/*
317	 * Fault the pages into memory.  Note, do this AFTER releasing the
318	 * lock, because we're only reading the pages, not writing them.
319	 */
320	(void)__env_faultmem(env, infop->primary, rp->size, 0);
321
322	/* Everything looks good, we're done. */
323	env->reginfo = infop;
324	return (0);
325
326creation:
327	/* Create the environment region. */
328	F_SET(infop, REGION_CREATE);
329
330	/*
331	 * Allocate room for REGION structures plus overhead.
332	 *
333	 * XXX
334	 * Overhead is so high because encryption passwds, replication vote
335	 * arrays and the thread control block table are all stored in the
336	 * base environment region.  This is a bug, at the least replication
337	 * should have its own region.
338	 *
339	 * Allocate space for thread info blocks.  Max is only advisory,
340	 * so we allocate 25% more.
341	 */
342	memset(&tregion, 0, sizeof(tregion));
343	nregions = __memp_max_regions(env) + 10;
344	size = nregions * sizeof(REGION);
345	size += dbenv->passwd_len;
346	size += (dbenv->thr_max + dbenv->thr_max / 4) *
347	    __env_alloc_size(sizeof(DB_THREAD_INFO));
348	size += env->thr_nbucket * __env_alloc_size(sizeof(DB_HASHTAB));
349	size += 16 * 1024;
350	tregion.size = size;
351	tregion.segid = INVALID_REGION_SEGID;
352	if ((ret = __env_sys_attach(env, infop, &tregion)) != 0)
353		goto err;
354
355	/*
356	 * If the application has specified its own mapping functions, we don't
357	 * know until we get here if we are creating the region or not.   The
358	 * way we find out is underlying functions clear the REGION_CREATE flag.
359	 */
360	if (!F_ISSET(infop, REGION_CREATE))
361		goto user_map_functions;
362
363	/*
364	 * Fault the pages into memory.  Note, do this BEFORE we initialize
365	 * anything, because we're writing the pages, not just reading them.
366	 */
367	(void)__env_faultmem(env, infop->addr, tregion.size, 1);
368
369	/*
370	 * The first object in the region is the REGENV structure.  This is
371	 * different from the other regions, and, from everything else in
372	 * this region, where all objects are allocated from the pool, i.e.,
373	 * there aren't any fixed locations.  The remaining space is made
374	 * available for later allocation.
375	 *
376	 * The allocation space must be size_t aligned, because that's what
377	 * the initialization routine is going to store there.  To make sure
378	 * that happens, the REGENV structure was padded with a final size_t.
379	 * No other region needs to worry about it because all of them treat
380	 * the entire region as allocation space.
381	 *
382	 * Set the primary reference and correct the "addr" value to reference
383	 * the alloc region.  Note, this requires that we "uncorrect" it at
384	 * region detach, and that all of our offsets (R_ADDR/R_OFFSET) will be
385	 * shifted as well, but that should be fine.
386	 */
387	infop->primary = infop->addr;
388	infop->addr = (u_int8_t *)infop->addr + sizeof(REGENV);
389	__env_alloc_init(infop, tregion.size - sizeof(REGENV));
390
391	/*
392	 * Initialize the rest of the REGENV structure.  (Don't set the magic
393	 * number to the correct value, that would validate the environment).
394	 */
395	renv = infop->primary;
396	renv->magic = 0;
397	renv->panic = 0;
398
399	(void)db_version(&majver, &minver, &patchver);
400	renv->majver = (u_int32_t)majver;
401	renv->minver = (u_int32_t)minver;
402	renv->patchver = (u_int32_t)patchver;
403	renv->signature = signature;
404
405	(void)time(&renv->timestamp);
406	__os_unique_id(env, &renv->envid);
407
408	/*
409	 * Initialize init_flags to store the flags that any other environment
410	 * handle that uses DB_JOINENV to join this environment will need.
411	 */
412	renv->init_flags = (init_flagsp == NULL) ? 0 : *init_flagsp;
413
414	/*
415	 * Set up the region array.  We use an array rather than a linked list
416	 * as we have to traverse this list after failure in some cases, and
417	 * we don't want to infinitely loop should the application fail while
418	 * we're manipulating the list.
419	 */
420	renv->region_cnt = nregions;
421	if ((ret = __env_alloc(infop, nregions * sizeof(REGION), &rp)) != 0) {
422		__db_err(
423		    env, ret, "unable to create new master region array");
424		goto err;
425	}
426	renv->region_off = R_OFFSET(infop, rp);
427	for (i = 0; i < nregions; ++i, ++rp)
428		rp->id = INVALID_REGION_ID;
429
430	renv->cipher_off = renv->thread_off = renv->rep_off = INVALID_ROFF;
431	renv->flags = 0;
432	renv->op_timestamp = renv->rep_timestamp = 0;
433	renv->mtx_regenv = MUTEX_INVALID;
434
435	/*
436	 * Get the underlying REGION structure for this environment.  Note,
437	 * we created the underlying OS region before we acquired the REGION
438	 * structure, which is backwards from the normal procedure.  Update
439	 * the REGION structure.
440	 */
441	if ((ret = __env_des_get(env, infop, infop, &rp)) != 0) {
442find_err:	__db_errx(env, "%s: unable to find environment", infop->name);
443		if (ret == 0)
444			ret = EINVAL;
445		goto err;
446	}
447	infop->rp = rp;
448	rp->size = tregion.size;
449	rp->segid = tregion.segid;
450
451	/*
452	 * !!!
453	 * If we create an environment where regions are public and in system
454	 * memory, we have to inform processes joining the environment how to
455	 * attach to the shared memory segment.  So, we write the shared memory
456	 * identifier into the file, to be read by those other processes.
457	 *
458	 * XXX
459	 * This is really OS-layer information, but I can't see any easy way
460	 * to move it down there without passing down information that it has
461	 * no right to know, e.g., that this is the one-and-only REGENV region
462	 * and not some other random region.
463	 */
464	if (tregion.segid != INVALID_REGION_SEGID) {
465		ref.size = tregion.size;
466		ref.segid = tregion.segid;
467		if ((ret = __os_write(
468		    env, env->lockfhp, &ref, sizeof(ref), &nrw)) != 0) {
469			__db_err(env, ret,
470			    "%s: unable to write out public environment ID",
471			    infop->name);
472			goto err;
473		}
474	}
475
476#ifndef HAVE_MUTEX_FCNTL
477	/*
478	 * If we're not doing fcntl locking, we can close the file handle.  We
479	 * no longer need it and the less contact between the buffer cache and
480	 * the VM, the better.
481	 */
482	if (env->lockfhp != NULL) {
483		 (void)__os_closehandle(env, env->lockfhp);
484		 env->lockfhp = NULL;
485	}
486#endif
487
488	/* Everything looks good, we're done. */
489	env->reginfo = infop;
490	return (0);
491
492err:
493retry:	/* Close any open file handle. */
494	if (env->lockfhp != NULL) {
495		(void)__os_closehandle(env, env->lockfhp);
496		env->lockfhp = NULL;
497	}
498
499	/*
500	 * If we joined or created the region, detach from it.  If we created
501	 * it, destroy it.  Note, there's a path in the above code where we're
502	 * using a temporary REGION structure because we haven't yet allocated
503	 * the real one.  In that case the region address (addr) will be filled
504	 * in, but the REGION pointer (rp) won't.  Fix it.
505	 */
506	if (infop->addr != NULL) {
507		if (infop->rp == NULL)
508			infop->rp = &tregion;
509
510		/* Reset the addr value that we "corrected" above. */
511		infop->addr = infop->primary;
512		(void)__env_sys_detach(env,
513		    infop, F_ISSET(infop, REGION_CREATE));
514	}
515
516	/* Free the allocated name and/or REGINFO structure. */
517	if (infop->name != NULL)
518		__os_free(env, infop->name);
519	__os_free(env, infop);
520
521	/* If we had a temporary error, wait awhile and try again. */
522	if (ret == 0) {
523		if (!retry_ok || ++retry_cnt > 3) {
524			__db_errx(env, "unable to join the environment");
525			ret = EAGAIN;
526		} else {
527			__os_yield(env, retry_cnt * 3, 0);
528			goto loop;
529		}
530	}
531
532	return (ret);
533}
534
535/*
536 * __env_turn_on --
537 *	Turn on the created environment.
538 *
539 * PUBLIC: int __env_turn_on __P((ENV *));
540 */
541int
542__env_turn_on(env)
543	ENV *env;
544{
545	REGENV *renv;
546	REGINFO *infop;
547
548	infop = env->reginfo;
549	renv = infop->primary;
550
551	/* If we didn't create the region, there's no need for further work. */
552	if (!F_ISSET(infop, REGION_CREATE))
553		return (0);
554
555	/*
556	 * Validate the file.  All other threads of control are waiting
557	 * on this value to be written -- "Let slip the hounds of war!"
558	 */
559	renv->magic = DB_REGION_MAGIC;
560
561	return (0);
562}
563
564/*
565 * __env_turn_off --
566 *	Turn off the environment.
567 *
568 * PUBLIC: int __env_turn_off __P((ENV *, u_int32_t));
569 */
570int
571__env_turn_off(env, flags)
572	ENV *env;
573	u_int32_t flags;
574{
575	REGENV *renv;
576	REGINFO *infop;
577	int ret, t_ret;
578
579	ret = 0;
580
581	/*
582	 * Connect to the environment: If we can't join the environment, we
583	 * guess it's because it doesn't exist and we're done.
584	 *
585	 * If the environment exists, attach and lock the environment.
586	 */
587	if (__env_attach(env, NULL, 0, 1) != 0)
588		return (0);
589
590	infop = env->reginfo;
591	renv = infop->primary;
592
593	MUTEX_LOCK(env, renv->mtx_regenv);
594
595	/*
596	 * If the environment is in use, we're done unless we're forcing the
597	 * issue or the environment has panic'd.  (If the environment panic'd,
598	 * the thread holding the reference count may not have cleaned up, so
599	 * we clean up.  It's possible the application didn't plan on removing
600	 * the environment in this particular call, but panic'd environments
601	 * aren't useful to anyone.)
602	 *
603	 * Otherwise, panic the environment and overwrite the magic number so
604	 * any thread of control attempting to connect (or racing with us) will
605	 * back off and retry, or just die.
606	 */
607	if (renv->refcnt > 0 && !LF_ISSET(DB_FORCE) && !renv->panic)
608		ret = EBUSY;
609	else
610		renv->panic = 1;
611
612	/*
613	 * Unlock the environment (nobody should need this lock because
614	 * we've poisoned the pool) and detach from the environment.
615	 */
616	MUTEX_UNLOCK(env, renv->mtx_regenv);
617
618	if ((t_ret = __env_detach(env, 0)) != 0 && ret == 0)
619		ret = t_ret;
620
621	return (ret);
622}
623
624/*
625 * __env_panic_set --
626 *	Set/clear unrecoverable error.
627 *
628 * PUBLIC: void __env_panic_set __P((ENV *, int));
629 */
630void
631__env_panic_set(env, on)
632	ENV *env;
633	int on;
634{
635	if (env != NULL && env->reginfo != NULL)
636		((REGENV *)env->reginfo->primary)->panic = on ? 1 : 0;
637}
638
639/*
640 * __env_ref_increment --
641 *	Increment the environment's reference count.
642 *
643 * PUBLIC: int __env_ref_increment __P((ENV *));
644 */
645int
646__env_ref_increment(env)
647	ENV *env;
648{
649	REGENV *renv;
650	REGINFO *infop;
651	int ret;
652
653	infop = env->reginfo;
654	renv = infop->primary;
655
656	/* If we're creating the primary region, allocate a mutex. */
657	if (F_ISSET(infop, REGION_CREATE)) {
658		if ((ret = __mutex_alloc(
659		    env, MTX_ENV_REGION, 0, &renv->mtx_regenv)) != 0)
660			return (ret);
661		renv->refcnt = 1;
662	} else {
663		/* Lock the environment, increment the reference, unlock. */
664		MUTEX_LOCK(env, renv->mtx_regenv);
665		++renv->refcnt;
666		MUTEX_UNLOCK(env, renv->mtx_regenv);
667	}
668
669	F_SET(env, ENV_REF_COUNTED);
670	return (0);
671}
672
673/*
674 * __env_ref_decrement --
675 *	Decrement the environment's reference count.
676 *
677 * PUBLIC: int __env_ref_decrement __P((ENV *));
678 */
679int
680__env_ref_decrement(env)
681	ENV *env;
682{
683	REGENV *renv;
684	REGINFO *infop;
685
686	/* Be cautious -- we may not have an environment. */
687	if ((infop = env->reginfo) == NULL)
688		return (0);
689
690	renv = infop->primary;
691
692	/* Even if we have an environment, may not have reference counted it. */
693	if (F_ISSET(env, ENV_REF_COUNTED)) {
694		/* Lock the environment, decrement the reference, unlock. */
695		MUTEX_LOCK(env, renv->mtx_regenv);
696		if (renv->refcnt == 0)
697			__db_errx(env,
698			    "environment reference count went negative");
699		else
700			--renv->refcnt;
701		MUTEX_UNLOCK(env, renv->mtx_regenv);
702
703		F_CLR(env, ENV_REF_COUNTED);
704	}
705
706	/* If a private environment, we're done with the mutex, destroy it. */
707	return (F_ISSET(env, ENV_PRIVATE) ?
708	    __mutex_free(env, &renv->mtx_regenv) : 0);
709}
710
711/*
712 * __env_detach --
713 *	Detach from the environment.
714 *
715 * PUBLIC: int __env_detach __P((ENV *, int));
716 */
717int
718__env_detach(env, destroy)
719	ENV *env;
720	int destroy;
721{
722	REGENV *renv;
723	REGINFO *infop;
724	REGION rp;
725	int ret, t_ret;
726
727	infop = env->reginfo;
728	renv = infop->primary;
729	ret = 0;
730
731	/* Close the locking file handle. */
732	if (env->lockfhp != NULL) {
733		if ((t_ret =
734		    __os_closehandle(env, env->lockfhp)) != 0 && ret == 0)
735			ret = t_ret;
736		env->lockfhp = NULL;
737	}
738
739	/*
740	 * If a private region, return the memory to the heap.  Not needed for
741	 * filesystem-backed or system shared memory regions, that memory isn't
742	 * owned by any particular process.
743	 */
744	if (destroy) {
745		/*
746		 * Free the REGION array.
747		 *
748		 * The actual underlying region structure is allocated from the
749		 * primary shared region, and we're about to free it.  Save a
750		 * copy on our stack for the REGINFO to reference when it calls
751		 * down into the OS layer to release the shared memory segment.
752		 */
753		rp = *infop->rp;
754		infop->rp = &rp;
755
756		if (renv->region_off != INVALID_ROFF)
757			__env_alloc_free(
758			   infop, R_ADDR(infop, renv->region_off));
759	}
760
761	/*
762	 * Set the ENV->reginfo field to NULL.  BDB uses the ENV->reginfo
763	 * field to decide if the underlying region can be accessed or needs
764	 * cleanup.  We're about to destroy what it references, so it needs to
765	 * be cleared.
766	 */
767	env->reginfo = NULL;
768
769	/* Reset the addr value that we "corrected" above. */
770	infop->addr = infop->primary;
771
772	if ((t_ret = __env_sys_detach(env, infop, destroy)) != 0 && ret == 0)
773		ret = t_ret;
774	if (infop->name != NULL)
775		__os_free(env, infop->name);
776
777	/* Discard the ENV->reginfo field's memory. */
778	__os_free(env, infop);
779
780	return (ret);
781}
782
783/*
784 * __env_remove_env --
785 *	Remove an environment.
786 *
787 * PUBLIC: int __env_remove_env __P((ENV *));
788 */
789int
790__env_remove_env(env)
791	ENV *env;
792{
793	DB_ENV *dbenv;
794	REGENV *renv;
795	REGINFO *infop, reginfo;
796	REGION *rp;
797	u_int32_t flags_orig, i;
798
799	dbenv = env->dbenv;
800
801	/*
802	 * We do not want to hang on a mutex request, nor do we care about
803	 * panics.
804	 */
805	flags_orig = F_ISSET(dbenv, DB_ENV_NOLOCKING | DB_ENV_NOPANIC);
806	F_SET(dbenv, DB_ENV_NOLOCKING | DB_ENV_NOPANIC);
807
808	/*
809	 * This routine has to walk a nasty line between not looking into the
810	 * environment (which may be corrupted after an app or system crash),
811	 * and removing everything that needs removing.
812	 *
813	 * Connect to the environment: If we can't join the environment, we
814	 * guess it's because it doesn't exist.  Remove the underlying files,
815	 * at least.
816	 */
817	if (__env_attach(env, NULL, 0, 0) != 0)
818		goto remfiles;
819
820	infop = env->reginfo;
821	renv = infop->primary;
822
823	/*
824	 * Kill the environment, if it's not already dead.
825	 */
826	renv->panic = 1;
827
828	/*
829	 * Walk the array of regions.  Connect to each region and disconnect
830	 * with the destroy flag set.  This shouldn't cause any problems, even
831	 * if the region is corrupted, because we never look inside the region
832	 * (with the single exception of mutex regions on systems where we have
833	 * to return resources to the underlying system).
834	 */
835	for (rp = R_ADDR(infop, renv->region_off),
836	    i = 0; i < renv->region_cnt; ++i, ++rp) {
837		if (rp->id == INVALID_REGION_ID || rp->type == REGION_TYPE_ENV)
838			continue;
839		/*
840		 * !!!
841		 * The REGION_CREATE_OK flag is set for Windows/95 -- regions
842		 * are zero'd out when the last reference to the region goes
843		 * away, in which case the underlying OS region code requires
844		 * callers be prepared to create the region in order to join it.
845		 */
846		memset(&reginfo, 0, sizeof(reginfo));
847		reginfo.id = rp->id;
848		reginfo.flags = REGION_CREATE_OK;
849
850		/*
851		 * If we get here and can't attach and/or detach to the
852		 * region, it's a mess.  Ignore errors, there's nothing
853		 * we can do about them.
854		 */
855		if (__env_region_attach(env, &reginfo, 0) != 0)
856			continue;
857
858#ifdef  HAVE_MUTEX_SYSTEM_RESOURCES
859		/*
860		 * If destroying the mutex region, return any system
861		 * resources to the system.
862		 */
863		if (reginfo.type == REGION_TYPE_MUTEX)
864			__mutex_resource_return(env, &reginfo);
865#endif
866		(void)__env_region_detach(env, &reginfo, 1);
867	}
868
869	/* Detach from the environment's primary region. */
870	(void)__env_detach(env, 1);
871
872remfiles:
873	/*
874	 * Walk the list of files in the directory, unlinking files in the
875	 * Berkeley DB name space.
876	 */
877	__env_remove_file(env);
878
879	F_CLR(dbenv, DB_ENV_NOLOCKING | DB_ENV_NOPANIC);
880	F_SET(dbenv, flags_orig);
881
882	return (0);
883}
884
885/*
886 * __env_remove_file --
887 *	Discard any region files in the filesystem.
888 */
889static void
890__env_remove_file(env)
891	ENV *env;
892{
893	int cnt, fcnt, lastrm, ret;
894	const char *dir;
895	char saved_char, *p, **names, *path, buf[sizeof(DB_REGION_FMT) + 20];
896
897	/* Get the full path of a file in the environment. */
898	(void)snprintf(buf, sizeof(buf), "%s", DB_REGION_ENV);
899	if ((ret = __db_appname(env, DB_APP_NONE, buf, 0, NULL, &path)) != 0)
900		return;
901
902	/* Get the parent directory for the environment. */
903	if ((p = __db_rpath(path)) == NULL) {
904		p = path;
905		saved_char = *p;
906
907		dir = PATH_DOT;
908	} else {
909		saved_char = *p;
910		*p = '\0';
911
912		dir = path;
913	}
914
915	/* Get the list of file names. */
916	if ((ret = __os_dirlist(env, dir, 0, &names, &fcnt)) != 0)
917		__db_err(env, ret, "%s", dir);
918
919	/* Restore the path, and free it. */
920	*p = saved_char;
921	__os_free(env, path);
922
923	if (ret != 0)
924		return;
925
926	/*
927	 * Remove files from the region directory.
928	 */
929	for (lastrm = -1, cnt = fcnt; --cnt >= 0;) {
930		/* Skip anything outside our name space. */
931		if (strncmp(names[cnt],
932		    DB_REGION_PREFIX, sizeof(DB_REGION_PREFIX) - 1))
933			continue;
934
935		/* Skip queue extent files. */
936		if (strncmp(names[cnt], "__dbq.", 6) == 0)
937			continue;
938
939		/* Skip registry files. */
940		if (strncmp(names[cnt], "__db.register", 13) == 0)
941			continue;
942
943		/* Skip replication files. */
944		if (strncmp(names[cnt], "__db.rep", 8) == 0)
945			continue;
946
947		/*
948		 * Remove the primary environment region last, because it's
949		 * the key to this whole mess.
950		 */
951		if (strcmp(names[cnt], DB_REGION_ENV) == 0) {
952			lastrm = cnt;
953			continue;
954		}
955
956		/* Remove the file. */
957		if (__db_appname(env,
958		    DB_APP_NONE, names[cnt], 0, NULL, &path) == 0) {
959			/*
960			 * Overwrite region files.  Temporary files would have
961			 * been maintained in encrypted format, so there's no
962			 * reason to overwrite them.  This is not an exact
963			 * check on the file being a region file, but it's
964			 * not likely to be wrong, and the worst thing that can
965			 * happen is we overwrite a file that didn't need to be
966			 * overwritten.
967			 */
968			(void)__os_unlink(env, path, 1);
969			__os_free(env, path);
970		}
971	}
972
973	if (lastrm != -1)
974		if (__db_appname(env,
975		    DB_APP_NONE, names[lastrm], 0, NULL, &path) == 0) {
976			(void)__os_unlink(env, path, 1);
977			__os_free(env, path);
978		}
979	__os_dirfree(env, names, fcnt);
980}
981
982/*
983 * __env_region_attach
984 *	Join/create a region.
985 *
986 * PUBLIC: int __env_region_attach __P((ENV *, REGINFO *, size_t));
987 */
988int
989__env_region_attach(env, infop, size)
990	ENV *env;
991	REGINFO *infop;
992	size_t size;
993{
994	REGION *rp;
995	int ret;
996	char buf[sizeof(DB_REGION_FMT) + 20];
997
998	/*
999	 * Find or create a REGION structure for this region.  If we create
1000	 * it, the REGION_CREATE flag will be set in the infop structure.
1001	 */
1002	F_CLR(infop, REGION_CREATE);
1003	if ((ret = __env_des_get(env, env->reginfo, infop, &rp)) != 0)
1004		return (ret);
1005	infop->env = env;
1006	infop->rp = rp;
1007	infop->type = rp->type;
1008	infop->id = rp->id;
1009
1010	/*
1011	 * __env_des_get may have created the region and reset the create
1012	 * flag.  If we're creating the region, set the desired size.
1013	 */
1014	if (F_ISSET(infop, REGION_CREATE))
1015		rp->size = (roff_t)size;
1016
1017	/* Join/create the underlying region. */
1018	(void)snprintf(buf, sizeof(buf), DB_REGION_FMT, infop->id);
1019	if ((ret = __db_appname(env,
1020	    DB_APP_NONE, buf, 0, NULL, &infop->name)) != 0)
1021		goto err;
1022	if ((ret = __env_sys_attach(env, infop, rp)) != 0)
1023		goto err;
1024
1025	/*
1026	 * Fault the pages into memory.  Note, do this BEFORE we initialize
1027	 * anything because we're writing pages in created regions, not just
1028	 * reading them.
1029	 */
1030	(void)__env_faultmem(env,
1031	    infop->addr, rp->size, F_ISSET(infop, REGION_CREATE));
1032
1033	/*
1034	 * !!!
1035	 * The underlying layer may have just decided that we are going
1036	 * to create the region.  There are various system issues that
1037	 * can result in a useless region that requires re-initialization.
1038	 *
1039	 * If we created the region, initialize it for allocation.
1040	 */
1041	if (F_ISSET(infop, REGION_CREATE))
1042		__env_alloc_init(infop, rp->size);
1043
1044	return (0);
1045
1046err:	/* Discard the underlying region. */
1047	if (infop->addr != NULL)
1048		(void)__env_sys_detach(env,
1049		    infop, F_ISSET(infop, REGION_CREATE));
1050	infop->rp = NULL;
1051	infop->id = INVALID_REGION_ID;
1052
1053	/* Discard the REGION structure if we created it. */
1054	if (F_ISSET(infop, REGION_CREATE)) {
1055		__env_des_destroy(env, rp);
1056		F_CLR(infop, REGION_CREATE);
1057	}
1058
1059	return (ret);
1060}
1061
1062/*
1063 * __env_region_detach --
1064 *	Detach from a region.
1065 *
1066 * PUBLIC: int __env_region_detach __P((ENV *, REGINFO *, int));
1067 */
1068int
1069__env_region_detach(env, infop, destroy)
1070	ENV *env;
1071	REGINFO *infop;
1072	int destroy;
1073{
1074	REGION *rp;
1075	int ret;
1076
1077	rp = infop->rp;
1078	if (F_ISSET(env, ENV_PRIVATE))
1079		destroy = 1;
1080
1081	/*
1082	 * When discarding the regions as we shut down a database environment,
1083	 * discard any allocated shared memory segments.  This is the last time
1084	 * we use them, and db_region_destroy is the last region-specific call
1085	 * we make.
1086	 */
1087	if (F_ISSET(env, ENV_PRIVATE) && infop->primary != NULL)
1088		__env_alloc_free(infop, infop->primary);
1089
1090	/* Detach from the underlying OS region. */
1091	ret = __env_sys_detach(env, infop, destroy);
1092
1093	/* If we destroyed the region, discard the REGION structure. */
1094	if (destroy)
1095		__env_des_destroy(env, rp);
1096
1097	/* Destroy the structure. */
1098	if (infop->name != NULL)
1099		__os_free(env, infop->name);
1100
1101	return (ret);
1102}
1103
1104/*
1105 * __env_sys_attach --
1106 *	Prep and call the underlying OS attach function.
1107 */
1108static int
1109__env_sys_attach(env, infop, rp)
1110	ENV *env;
1111	REGINFO *infop;
1112	REGION *rp;
1113{
1114	int ret;
1115
1116	/*
1117	 * All regions are created on 8K boundaries out of sheer paranoia,
1118	 * so we don't make some underlying VM unhappy. Make sure we don't
1119	 * overflow or underflow.
1120	 */
1121#define	OS_VMPAGESIZE		(8 * 1024)
1122#define	OS_VMROUNDOFF(i) {						\
1123	if ((i) <							\
1124	    (UINT32_MAX - OS_VMPAGESIZE) + 1 || (i) < OS_VMPAGESIZE)	\
1125		(i) += OS_VMPAGESIZE - 1;				\
1126	(i) -= (i) % OS_VMPAGESIZE;					\
1127}
1128	OS_VMROUNDOFF(rp->size);
1129
1130#ifdef DB_REGIONSIZE_MAX
1131	/* Some architectures have hard limits on the maximum region size. */
1132	if (rp->size > DB_REGIONSIZE_MAX) {
1133		__db_errx(env, "region size %lu is too large; maximum is %lu",
1134		    (u_long)rp->size, (u_long)DB_REGIONSIZE_MAX);
1135		return (EINVAL);
1136	}
1137#endif
1138
1139	/*
1140	 * If a region is private, malloc the memory.
1141	 *
1142	 * !!!
1143	 * If this fails because the region is too large to malloc, mmap(2)
1144	 * using the MAP_ANON or MAP_ANONYMOUS flags would be an alternative.
1145	 * I don't know of any architectures (yet!) where malloc is a problem.
1146	 */
1147	if (F_ISSET(env, ENV_PRIVATE)) {
1148#if defined(HAVE_MUTEX_HPPA_MSEM_INIT)
1149		/*
1150		 * !!!
1151		 * There exist spinlocks that don't work in malloc memory, e.g.,
1152		 * the HP/UX msemaphore interface.  If we don't have locks that
1153		 * will work in malloc memory, we better not be private or not
1154		 * be threaded.
1155		 */
1156		if (F_ISSET(env, ENV_THREAD)) {
1157			__db_errx(env, "%s",
1158    "architecture does not support locks inside process-local (malloc) memory");
1159			__db_errx(env, "%s",
1160    "application may not specify both DB_PRIVATE and DB_THREAD");
1161			return (EINVAL);
1162		}
1163#endif
1164		if ((ret = __os_malloc(
1165		    env, sizeof(REGENV), &infop->addr)) != 0)
1166			return (ret);
1167
1168		infop->max_alloc = rp->size;
1169	} else
1170		if ((ret = __os_attach(env, infop, rp)) != 0)
1171			return (ret);
1172
1173	/*
1174	 * We may require alignment the underlying system or heap allocation
1175	 * library doesn't supply.  Align the address if necessary, saving
1176	 * the original values for restoration when the region is discarded.
1177	 */
1178	infop->addr_orig = infop->addr;
1179	infop->addr = ALIGNP_INC(infop->addr_orig, sizeof(size_t));
1180
1181	rp->size_orig = rp->size;
1182	if (infop->addr != infop->addr_orig)
1183		rp->size -= (roff_t)
1184		    ((u_int8_t *)infop->addr - (u_int8_t *)infop->addr_orig);
1185
1186	return (0);
1187}
1188
1189/*
1190 * __env_sys_detach --
1191 *	Prep and call the underlying OS detach function.
1192 */
1193static int
1194__env_sys_detach(env, infop, destroy)
1195	ENV *env;
1196	REGINFO *infop;
1197	int destroy;
1198{
1199	REGION *rp;
1200
1201	rp = infop->rp;
1202
1203	/* Restore any address/size altered for alignment reasons. */
1204	if (infop->addr != infop->addr_orig) {
1205		infop->addr = infop->addr_orig;
1206		rp->size = rp->size_orig;
1207	}
1208
1209	/* If a region is private, free the memory. */
1210	if (F_ISSET(env, ENV_PRIVATE)) {
1211		__os_free(env, infop->addr);
1212		return (0);
1213	}
1214
1215	return (__os_detach(env, infop, destroy));
1216}
1217
1218/*
1219 * __env_des_get --
1220 *	Return a reference to the shared information for a REGION,
1221 *	optionally creating a new entry.
1222 */
1223static int
1224__env_des_get(env, env_infop, infop, rpp)
1225	ENV *env;
1226	REGINFO *env_infop, *infop;
1227	REGION **rpp;
1228{
1229	REGENV *renv;
1230	REGION *rp, *empty_slot, *first_type;
1231	u_int32_t i, maxid;
1232
1233	*rpp = NULL;
1234	renv = env_infop->primary;
1235
1236	/*
1237	 * If the caller wants to join a region, walk through the existing
1238	 * regions looking for a matching ID (if ID specified) or matching
1239	 * type (if type specified).  If we return based on a matching type
1240	 * return the "primary" region, that is, the first region that was
1241	 * created of this type.
1242	 *
1243	 * Track the first empty slot and maximum region ID for new region
1244	 * allocation.
1245	 *
1246	 * MaxID starts at REGION_ID_ENV, the ID of the primary environment.
1247	 */
1248	maxid = REGION_ID_ENV;
1249	empty_slot = first_type = NULL;
1250	for (rp = R_ADDR(env_infop, renv->region_off),
1251	    i = 0; i < renv->region_cnt; ++i, ++rp) {
1252		if (rp->id == INVALID_REGION_ID) {
1253			if (empty_slot == NULL)
1254				empty_slot = rp;
1255			continue;
1256		}
1257		if (infop->id != INVALID_REGION_ID) {
1258			if (infop->id == rp->id)
1259				break;
1260			continue;
1261		}
1262		if (infop->type == rp->type &&
1263		    F_ISSET(infop, REGION_JOIN_OK) &&
1264		    (first_type == NULL || first_type->id > rp->id))
1265			first_type = rp;
1266
1267		if (rp->id > maxid)
1268			maxid = rp->id;
1269	}
1270
1271	/* If we found a matching ID (or a matching type), return it. */
1272	if (i >= renv->region_cnt)
1273		rp = first_type;
1274	if (rp != NULL) {
1275		*rpp = rp;
1276		return (0);
1277	}
1278
1279	/*
1280	 * If we didn't find a region and we don't have permission to create
1281	 * the region, fail.  The caller generates any error message.
1282	 */
1283	if (!F_ISSET(infop, REGION_CREATE_OK))
1284		return (ENOENT);
1285
1286	/*
1287	 * If we didn't find a region and don't have room to create the region
1288	 * fail with an error message, there's a sizing problem.
1289	 */
1290	if (empty_slot == NULL) {
1291		__db_errx(env, "no room remaining for additional REGIONs");
1292		return (ENOENT);
1293	}
1294
1295	/*
1296	 * Initialize a REGION structure for the caller.  If id was set, use
1297	 * that value, otherwise we use the next available ID.
1298	 */
1299	memset(empty_slot, 0, sizeof(REGION));
1300	empty_slot->segid = INVALID_REGION_SEGID;
1301
1302	/*
1303	 * Set the type and ID; if no region ID was specified,
1304	 * allocate one.
1305	 */
1306	empty_slot->type = infop->type;
1307	empty_slot->id = infop->id == INVALID_REGION_ID ? maxid + 1 : infop->id;
1308
1309	F_SET(infop, REGION_CREATE);
1310
1311	*rpp = empty_slot;
1312	return (0);
1313}
1314
1315/*
1316 * __env_des_destroy --
1317 *	Destroy a reference to a REGION.
1318 */
1319static void
1320__env_des_destroy(env, rp)
1321	ENV *env;
1322	REGION *rp;
1323{
1324	COMPQUIET(env, NULL);
1325
1326	rp->id = INVALID_REGION_ID;
1327}
1328
1329/*
1330 * __env_faultmem --
1331 *	Fault the region into memory.
1332 */
1333static int
1334__env_faultmem(env, addr, size, created)
1335	ENV *env;
1336	void *addr;
1337	size_t size;
1338	int created;
1339{
1340	int ret;
1341	u_int8_t *p, *t;
1342
1343	/* Ignore heap regions. */
1344	if (F_ISSET(env, ENV_PRIVATE))
1345		return (0);
1346
1347	/*
1348	 * It's sometimes significantly faster to page-fault in all of the
1349	 * region's pages before we run the application, as we see nasty
1350	 * side-effects when we page-fault while holding various locks, i.e.,
1351	 * the lock takes a long time to acquire because of the underlying
1352	 * page fault, and the other threads convoy behind the lock holder.
1353	 *
1354	 * If we created the region, we write a non-zero value so that the
1355	 * system can't cheat.  If we're just joining the region, we can
1356	 * only read the value and try to confuse the compiler sufficiently
1357	 * that it doesn't figure out that we're never really using it.
1358	 *
1359	 * Touch every page (assuming pages are 512B, the smallest VM page
1360	 * size used in any general purpose processor).
1361	 */
1362	ret = 0;
1363	if (F_ISSET(env->dbenv, DB_ENV_REGION_INIT)) {
1364		if (created)
1365			for (p = addr,
1366			    t = (u_int8_t *)addr + size; p < t; p += 512)
1367				p[0] = 0xdb;
1368		else
1369			for (p = addr,
1370			    t = (u_int8_t *)addr + size; p < t; p += 512)
1371				ret |= p[0];
1372	}
1373
1374	return (ret);
1375}
1376