1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996,2008 Oracle.  All rights reserved.
5 *
6 * $Id: os_map.c,v 12.25 2008/05/07 12:27:35 bschmeck Exp $
7 */
8
9#include "db_config.h"
10
11#include "db_int.h"
12
13static int __os_map
14  __P((ENV *, char *, REGINFO *, DB_FH *, size_t, int, int, int, void **));
15static int __os_unique_name __P((_TCHAR *, HANDLE, _TCHAR *, size_t));
16
17/*
18 * __os_attach --
19 *	Create/join a shared memory region.
20 */
21int
22__os_attach(env, infop, rp)
23	ENV *env;
24	REGINFO *infop;
25	REGION *rp;
26{
27	DB_FH *fhp;
28	int ret;
29
30	/*
31	 * On Windows/9X, files that are opened by multiple processes do not
32	 * share data correctly.  For this reason, we require that DB_PRIVATE
33	 * be specified on that platform.
34	 */
35	if (!F_ISSET(env, ENV_PRIVATE) && __os_is_winnt() == 0) {
36		__db_err(env,
37		    EINVAL, "Windows 9X systems must specify DB_PRIVATE");
38		return (EINVAL);
39	}
40
41	/*
42	 * Try to open/create the file.  We DO NOT need to ensure that multiple
43	 * threads/processes attempting to simultaneously create the region are
44	 * properly ordered, our caller has already taken care of that.
45	 */
46	if ((ret = __os_open(env, infop->name, 0, DB_OSO_REGION |
47	    (F_ISSET(infop, REGION_CREATE_OK) ? DB_OSO_CREATE : 0),
48	    env->db_mode, &fhp)) != 0) {
49		__db_err(env, ret, "%s", infop->name);
50		return (ret);
51	}
52
53	/*
54	 * Map the file in.  If we're creating an in-system-memory region,
55	 * specify a segment ID (which is never used again) so that the
56	 * calling code writes out the REGENV_REF structure to the primary
57	 * environment file.
58	 */
59	ret = __os_map(env, infop->name, infop, fhp, rp->size,
60	   1, F_ISSET(env, ENV_SYSTEM_MEM), 0, &infop->addr);
61	if (ret == 0 && F_ISSET(env, ENV_SYSTEM_MEM))
62		rp->segid = 1;
63
64	(void)__os_closehandle(env, fhp);
65
66	return (ret);
67}
68
69/*
70 * __os_detach --
71 *	Detach from a shared memory region.
72 */
73int
74__os_detach(env, infop, destroy)
75	ENV *env;
76	REGINFO *infop;
77	int destroy;
78{
79	DB_ENV *dbenv;
80	int ret, t_ret;
81
82	dbenv = env->dbenv;
83
84	if (infop->wnt_handle != NULL) {
85		(void)CloseHandle(infop->wnt_handle);
86		infop->wnt_handle = NULL;
87	}
88
89	ret = !UnmapViewOfFile(infop->addr) ? __os_get_syserr() : 0;
90	if (ret != 0) {
91		__db_syserr(env, ret, "UnmapViewOfFile");
92		ret = __os_posix_err(ret);
93	}
94
95	if (!F_ISSET(env, ENV_SYSTEM_MEM) && destroy &&
96	    (t_ret = __os_unlink(env, infop->name, 1)) != 0 && ret == 0)
97		ret = t_ret;
98
99	return (ret);
100}
101
102/*
103 * __os_mapfile --
104 *	Map in a shared memory file.
105 */
106int
107__os_mapfile(env, path, fhp, len, is_rdonly, addr)
108	ENV *env;
109	char *path;
110	DB_FH *fhp;
111	int is_rdonly;
112	size_t len;
113	void **addr;
114{
115#ifdef DB_WINCE
116	/*
117	 * Windows CE has special requirements for file mapping to work.
118	 * * The input handle needs to be opened using CreateFileForMapping
119	 * * Concurrent access via a non mapped file is not supported.
120	 * So we disable support for memory mapping files on Windows CE. It is
121	 * currently only used as an optimization in mpool for small read only
122	 * databases.
123	 */
124	return (EFAULT);
125#else
126	DB_ENV *dbenv;
127
128	dbenv = env == NULL ? NULL : env->dbenv;
129
130	if (dbenv != NULL &&
131	    FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
132		__db_msg(env, "fileops: mmap %s", path);
133	return (__os_map(env, path, NULL, fhp, len, 0, 0, is_rdonly, addr));
134#endif
135}
136
137/*
138 * __os_unmapfile --
139 *	Unmap the shared memory file.
140 */
141int
142__os_unmapfile(env, addr, len)
143	ENV *env;
144	void *addr;
145	size_t len;
146{
147	DB_ENV *dbenv;
148
149	dbenv = env == NULL ? NULL : env->dbenv;
150
151	if (dbenv != NULL &&
152	    FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
153		__db_msg(env, "fileops: munmap");
154
155	return (!UnmapViewOfFile(addr) ? __os_posix_err(__os_get_syserr()) : 0);
156}
157
158/*
159 * __os_unique_name --
160 *	Create a unique identifying name from a pathname (may be absolute or
161 *	relative) and/or a file descriptor.
162 *
163 *	The name returned must be unique (different files map to different
164 *	names), and repeatable (same files, map to same names).  It's not
165 *	so easy to do by name.  Should handle not only:
166 *
167 *		foo.bar == ./foo.bar == c:/whatever_path/foo.bar
168 *
169 *	but also understand that:
170 *
171 *		foo.bar == Foo.Bar	(FAT file system)
172 *		foo.bar != Foo.Bar	(NTFS)
173 *
174 *	The best solution is to use the file index, found in the file
175 *	information structure (similar to UNIX inode #).
176 *
177 *	When a file is deleted, its file index may be reused,
178 *	but if the unique name has not gone from its namespace,
179 *	we may get a conflict.  So to ensure some tie in to the
180 *	original pathname, we also use the creation time and the
181 *	file basename.  This is not a perfect system, but it
182 *	should work for all but anamolous test cases.
183 *
184 */
185static int
186__os_unique_name(orig_path, hfile, result_path, result_path_len)
187	_TCHAR *orig_path, *result_path;
188	HANDLE hfile;
189	size_t result_path_len;
190{
191	BY_HANDLE_FILE_INFORMATION fileinfo;
192	_TCHAR *basename, *p;
193
194	/*
195	 * In Windows, pathname components are delimited by '/' or '\', and
196	 * if neither is present, we need to strip off leading drive letter
197	 * (e.g. c:foo.txt).
198	 */
199	basename = _tcsrchr(orig_path, '/');
200	p = _tcsrchr(orig_path, '\\');
201	if (basename == NULL || (p != NULL && p > basename))
202		basename = p;
203	if (basename == NULL)
204		basename = _tcsrchr(orig_path, ':');
205
206	if (basename == NULL)
207		basename = orig_path;
208	else
209		basename++;
210
211	if (!GetFileInformationByHandle(hfile, &fileinfo))
212		return (__os_posix_err(__os_get_syserr()));
213
214	(void)_sntprintf(result_path, result_path_len,
215	    _T("__db_shmem.%8.8lx.%8.8lx.%8.8lx.%8.8lx.%8.8lx.%s"),
216	    fileinfo.dwVolumeSerialNumber,
217	    fileinfo.nFileIndexHigh,
218	    fileinfo.nFileIndexLow,
219	    fileinfo.ftCreationTime.dwHighDateTime,
220	    fileinfo.ftCreationTime.dwHighDateTime,
221	    basename);
222
223	return (0);
224}
225
226/*
227 * __os_map --
228 *	The mmap(2) function for Windows.
229 */
230static int
231__os_map(env, path, infop, fhp, len, is_region, is_system, is_rdonly, addr)
232	ENV *env;
233	REGINFO *infop;
234	char *path;
235	DB_FH *fhp;
236	int is_region, is_system, is_rdonly;
237	size_t len;
238	void **addr;
239{
240	HANDLE hMemory;
241	int ret, use_pagefile;
242	_TCHAR *tpath, shmem_name[DB_MAXPATHLEN];
243	void *pMemory;
244	unsigned __int64 len64;
245
246	ret = 0;
247	if (infop != NULL)
248		infop->wnt_handle = NULL;
249
250	/*
251	 * On 64 bit systems, len is already a 64 bit value.
252	 * On 32 bit systems len is a 32 bit value.
253	 * Always convert to a 64 bit value, so that the high order
254	 * DWORD can be simply extracted on 64 bit platforms.
255	 */
256	len64 = len;
257
258	use_pagefile = is_region && is_system;
259
260	/*
261	 * If creating a region in system space, get a matching name in the
262	 * paging file namespace.
263	 */
264	if (use_pagefile) {
265#ifdef DB_WINCE
266		__db_errx(env, "Unable to memory map regions using system "
267		    "memory on WinCE.");
268		return (EFAULT);
269#endif
270		TO_TSTRING(env, path, tpath, ret);
271		if (ret != 0)
272			return (ret);
273		ret = __os_unique_name(tpath, fhp->handle,
274		    shmem_name, sizeof(shmem_name));
275		FREE_STRING(env, tpath);
276		if (ret != 0)
277			return (ret);
278	}
279
280	/*
281	 * XXX
282	 * DB: We have not implemented copy-on-write here.
283	 *
284	 * If this is an region in system memory, we try to open it using the
285	 * OpenFileMapping() first, and only call CreateFileMapping() if we're
286	 * really creating the section.  There are two reasons:
287	 *
288	 * 1) We only create the mapping if we have newly created the region.
289	 *    This avoids a long-running problem caused by Windows reference
290	 *    counting, where regions that are closed by all processes are
291	 *    deleted.  It turns out that just checking for a zeroed region
292	 *    is not good enough. See [#4882] and [#7127] for the details.
293	 *
294	 * 2) CreateFileMapping seems to mess up making the commit charge to
295	 *    the process. It thinks, incorrectly, that when we want to join a
296	 *    previously existing section, that it should make a commit charge
297	 *    for the whole section.  In fact, there is no new committed memory
298	 *    whatever.  The call can fail if there is insufficient memory free
299	 *    to handle the erroneous commit charge.  So, we find that the
300	 *    bogus commit is not made if we call OpenFileMapping.
301	 */
302	hMemory = NULL;
303	if (use_pagefile) {
304#ifndef DB_WINCE
305		hMemory = OpenFileMapping(
306		    is_rdonly ? FILE_MAP_READ : FILE_MAP_ALL_ACCESS,
307		    0, shmem_name);
308
309		if (hMemory == NULL && F_ISSET(infop, REGION_CREATE_OK))
310			hMemory = CreateFileMapping((HANDLE)-1, 0,
311			    is_rdonly ? PAGE_READONLY : PAGE_READWRITE,
312			    (DWORD)(len64 >> 32), (DWORD)len64, shmem_name);
313#endif
314	} else {
315		hMemory = CreateFileMapping(fhp->handle, 0,
316		    is_rdonly ? PAGE_READONLY : PAGE_READWRITE,
317		    (DWORD)(len64 >> 32), (DWORD)len64, NULL);
318#ifdef DB_WINCE
319		/*
320		 * WinCE automatically closes the handle passed in.
321		 * Ensure DB does not attempt to close the handle again.
322		 */
323		fhp->handle = INVALID_HANDLE_VALUE;
324		F_CLR(fhp, DB_FH_OPENED);
325#endif
326	}
327
328	if (hMemory == NULL) {
329		ret = __os_get_syserr();
330		__db_syserr(env, ret, "OpenFileMapping");
331		return (__env_panic(env, __os_posix_err(ret)));
332	}
333
334	pMemory = MapViewOfFile(hMemory,
335	    (is_rdonly ? FILE_MAP_READ : FILE_MAP_ALL_ACCESS), 0, 0, len);
336	if (pMemory == NULL) {
337		ret = __os_get_syserr();
338		__db_syserr(env, ret, "MapViewOfFile");
339		return (__env_panic(env, __os_posix_err(ret)));
340	}
341
342	/*
343	 * XXX
344	 * It turns out that the kernel object underlying the named section
345	 * is reference counted, but that the call to MapViewOfFile() above
346	 * does NOT increment the reference count! So, if we close the handle
347	 * here, the kernel deletes the object from the kernel namespace.
348	 * When a second process comes along to join the region, the kernel
349	 * happily creates a new object with the same name, but completely
350	 * different identity. The two processes then have distinct isolated
351	 * mapped sections, not at all what was wanted. Not closing the handle
352	 * here fixes this problem.  We carry the handle around in the region
353	 * structure so we can close it when unmap is called.
354	 */
355	if (use_pagefile && infop != NULL)
356		infop->wnt_handle = hMemory;
357	else
358		CloseHandle(hMemory);
359
360	*addr = pMemory;
361	return (ret);
362}
363