1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1997,2008 Oracle.  All rights reserved.
5 *
6 * $Id: os_open.c,v 12.29 2008/03/26 04:11:35 david Exp $
7 */
8
9#include "db_config.h"
10
11#include "db_int.h"
12
13/*
14 * __os_open --
15 *	Open a file descriptor (including page size and log size information).
16 */
17int
18__os_open(env, name, page_size, flags, mode, fhpp)
19	ENV *env;
20	const char *name;
21	u_int32_t page_size, flags;
22	int mode;
23	DB_FH **fhpp;
24{
25	DB_ENV *dbenv;
26	DB_FH *fhp;
27#ifndef DB_WINCE
28	DWORD cluster_size, sector_size, free_clusters, total_clusters;
29	_TCHAR *drive, dbuf[4]; /* <letter><colon><slash><nul> */
30#endif
31	int access, attr, createflag, nrepeat, ret, share;
32	_TCHAR *tname;
33
34	dbenv = env == NULL ? NULL : env->dbenv;
35	*fhpp = NULL;
36	tname = NULL;
37
38	if (dbenv != NULL &&
39	    FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS | DB_VERB_FILEOPS_ALL))
40		__db_msg(env, "fileops: open %s", name);
41
42#define	OKFLAGS								\
43	(DB_OSO_ABSMODE | DB_OSO_CREATE | DB_OSO_DIRECT | DB_OSO_DSYNC |\
44	DB_OSO_EXCL | DB_OSO_RDONLY | DB_OSO_REGION |	DB_OSO_SEQ |	\
45	DB_OSO_TEMP | DB_OSO_TRUNC)
46	if ((ret = __db_fchk(env, "__os_open", flags, OKFLAGS)) != 0)
47		return (ret);
48
49	TO_TSTRING(env, name, tname, ret);
50	if (ret != 0)
51		goto err;
52
53	/*
54	 * Allocate the file handle and copy the file name.  We generally only
55	 * use the name for verbose or error messages, but on systems where we
56	 * can't unlink temporary files immediately, we use the name to unlink
57	 * the temporary file when the file handle is closed.
58	 *
59	 * Lock the ENV handle and insert the new file handle on the list.
60	 */
61	if ((ret = __os_calloc(env, 1, sizeof(DB_FH), &fhp)) != 0)
62		return (ret);
63	if ((ret = __os_strdup(env, name, &fhp->name)) != 0)
64		goto err;
65	if (env != NULL) {
66		MUTEX_LOCK(env, env->mtx_env);
67		TAILQ_INSERT_TAIL(&env->fdlist, fhp, q);
68		MUTEX_UNLOCK(env, env->mtx_env);
69		F_SET(fhp, DB_FH_ENVLINK);
70	}
71
72	/*
73	 * Otherwise, use the Windows/32 CreateFile interface so that we can
74	 * play magic games with files to get data flush effects similar to
75	 * the POSIX O_DSYNC flag.
76	 *
77	 * !!!
78	 * We currently ignore the 'mode' argument.  It would be possible
79	 * to construct a set of security attributes that we could pass to
80	 * CreateFile that would accurately represents the mode.  In worst
81	 * case, this would require looking up user and all group names and
82	 * creating an entry for each.  Alternatively, we could call the
83	 * _chmod (partial emulation) function after file creation, although
84	 * this leaves us with an obvious race.  However, these efforts are
85	 * largely meaningless on FAT, the most common file system, which
86	 * only has a "readable" and "writeable" flag, applying to all users.
87	 */
88	access = GENERIC_READ;
89	if (!LF_ISSET(DB_OSO_RDONLY))
90		access |= GENERIC_WRITE;
91
92#ifdef DB_WINCE
93	/*
94	 * WinCE translates these flags into share flags for
95	 * CreateFileForMapping.
96	 * Also WinCE does not support the FILE_SHARE_DELETE flag.
97	 */
98	if (LF_ISSET(DB_OSO_REGION))
99		share = GENERIC_READ | GENERIC_WRITE;
100	else
101		share = FILE_SHARE_READ | FILE_SHARE_WRITE;
102#else
103	share = FILE_SHARE_READ | FILE_SHARE_WRITE;
104	if (__os_is_winnt())
105		share |= FILE_SHARE_DELETE;
106#endif
107	attr = FILE_ATTRIBUTE_NORMAL;
108
109	/*
110	 * Reproduce POSIX 1003.1 semantics: if O_CREATE and O_EXCL are both
111	 * specified, fail, returning EEXIST, unless we create the file.
112	 */
113	if (LF_ISSET(DB_OSO_CREATE) && LF_ISSET(DB_OSO_EXCL))
114		createflag = CREATE_NEW;	/* create only if !exist*/
115	else if (!LF_ISSET(DB_OSO_CREATE) && LF_ISSET(DB_OSO_TRUNC))
116		createflag = TRUNCATE_EXISTING; /* truncate, fail if !exist */
117	else if (LF_ISSET(DB_OSO_TRUNC))
118		createflag = CREATE_ALWAYS;	/* create and truncate */
119	else if (LF_ISSET(DB_OSO_CREATE))
120		createflag = OPEN_ALWAYS;	/* open or create */
121	else
122		createflag = OPEN_EXISTING;	/* open only if existing */
123
124	if (LF_ISSET(DB_OSO_DSYNC)) {
125		F_SET(fhp, DB_FH_NOSYNC);
126		attr |= FILE_FLAG_WRITE_THROUGH;
127	}
128
129#ifndef DB_WINCE
130	if (LF_ISSET(DB_OSO_SEQ))
131		attr |= FILE_FLAG_SEQUENTIAL_SCAN;
132	else
133		attr |= FILE_FLAG_RANDOM_ACCESS;
134#endif
135
136	if (LF_ISSET(DB_OSO_TEMP))
137		attr |= FILE_FLAG_DELETE_ON_CLOSE;
138
139	/*
140	 * We can turn filesystem buffering off if the page size is a
141	 * multiple of the disk's sector size. To find the sector size,
142	 * we call GetDiskFreeSpace, which expects a drive name like "d:\\"
143	 * or NULL for the current disk (i.e., a relative path).
144	 *
145	 * WinCE only has GetDiskFreeSpaceEx which does not
146	 * return the sector size.
147	 */
148#ifndef DB_WINCE
149	if (LF_ISSET(DB_OSO_DIRECT) && page_size != 0 && name[0] != '\0') {
150		if (name[1] == ':') {
151			drive = dbuf;
152			_sntprintf(dbuf, sizeof(dbuf), _T("%c:\\"), tname[0]);
153		} else
154			drive = NULL;
155
156		/*
157		 * We ignore all results except sectorsize, but some versions
158		 * of Windows require that the parameters are non-NULL.
159		 */
160		if (GetDiskFreeSpace(drive, &cluster_size,
161		    &sector_size, &free_clusters, &total_clusters) &&
162		    page_size % sector_size == 0)
163			attr |= FILE_FLAG_NO_BUFFERING;
164	}
165#endif
166
167	fhp->handle = fhp->trunc_handle = INVALID_HANDLE_VALUE;
168	for (nrepeat = 1;; ++nrepeat) {
169		if (fhp->handle == INVALID_HANDLE_VALUE) {
170#ifdef DB_WINCE
171			if (LF_ISSET(DB_OSO_REGION))
172				fhp->handle = CreateFileForMapping(tname,
173				    access, share, NULL, createflag, attr, 0);
174			else
175#endif
176				fhp->handle = CreateFile(tname,
177				    access, share, NULL, createflag, attr, 0);
178		}
179
180		/*
181		 * Since WinCE does not support truncate, we don't
182		 * need to open this second handle.
183		 * This code will not work unaltered on WinCE, the
184		 * creation of the second handle fails.
185		 */
186#ifndef DB_WINCE
187		/*
188		 * Windows does not provide truncate directly.  There is no
189		 * safe way to use a handle for truncate concurrently with
190		 * reads or writes.  To deal with this, we open a second handle
191		 * used just for truncating.
192		 */
193		if (fhp->handle != INVALID_HANDLE_VALUE &&
194		    !LF_ISSET(DB_OSO_RDONLY | DB_OSO_TEMP) &&
195		    fhp->trunc_handle == INVALID_HANDLE_VALUE)
196			fhp->trunc_handle = CreateFile(
197			    tname, access, share, NULL, OPEN_EXISTING, attr, 0);
198
199		if (fhp->handle == INVALID_HANDLE_VALUE ||
200		    (!LF_ISSET(DB_OSO_RDONLY | DB_OSO_TEMP) &&
201		    fhp->trunc_handle == INVALID_HANDLE_VALUE))
202#else
203		if (fhp->handle == INVALID_HANDLE_VALUE)
204#endif
205		{
206			/*
207			 * If it's a "temporary" error, we retry up to 3 times,
208			 * waiting up to 12 seconds.  While it's not a problem
209			 * if we can't open a database, an inability to open a
210			 * log file is cause for serious dismay.
211			 */
212			ret = __os_posix_err(__os_get_syserr());
213			if ((ret != ENFILE && ret != EMFILE && ret != ENOSPC) ||
214			    nrepeat > 3)
215				goto err;
216
217			__os_yield(env, nrepeat * 2, 0);
218		} else
219			break;
220	}
221
222	FREE_STRING(env, tname);
223
224	if (LF_ISSET(DB_OSO_REGION))
225		F_SET(fhp, DB_FH_REGION);
226	F_SET(fhp, DB_FH_OPENED);
227	*fhpp = fhp;
228	return (0);
229
230err:	FREE_STRING(env, tname);
231	if (fhp != NULL)
232		(void)__os_closehandle(env, fhp);
233	return (ret);
234}
235