1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1997,2008 Oracle.  All rights reserved.
5 *
6 * $Id: os_rw.c,v 12.32 2008/02/12 16:08:51 bostic Exp $
7 */
8
9#include "db_config.h"
10
11#include "db_int.h"
12
13/*
14 * __os_io --
15 *	Do an I/O.
16 *
17 * PUBLIC: int __os_io __P((ENV *, int, DB_FH *, db_pgno_t,
18 * PUBLIC:     u_int32_t, u_int32_t, u_int32_t, u_int8_t *, size_t *));
19 */
20int
21__os_io(env, op, fhp, pgno, pgsize, relative, io_len, buf, niop)
22	ENV *env;
23	int op;
24	DB_FH *fhp;
25	db_pgno_t pgno;
26	u_int32_t pgsize, relative, io_len;
27	u_int8_t *buf;
28	size_t *niop;
29{
30#if defined(HAVE_PREAD) && defined(HAVE_PWRITE)
31	DB_ENV *dbenv;
32	off_t offset;
33	ssize_t nio;
34#endif
35	int ret;
36
37	/*
38	 * Check for illegal usage.
39	 *
40	 * This routine is used in one of two ways: reading bytes from an
41	 * absolute offset and reading a specific database page.  All of
42	 * our absolute offsets are known to fit into a u_int32_t, while
43	 * our database pages might be at offsets larger than a u_int32_t.
44	 * We don't want to specify an absolute offset in our caller as we
45	 * aren't exactly sure what size an off_t might be.
46	 */
47	DB_ASSERT(env, F_ISSET(fhp, DB_FH_OPENED) && fhp->fd != -1);
48	DB_ASSERT(env, (pgno == 0 && pgsize == 0) || relative == 0);
49
50#if defined(HAVE_PREAD) && defined(HAVE_PWRITE)
51	dbenv = env == NULL ? NULL : env->dbenv;
52
53	if ((offset = relative) == 0)
54		offset = (off_t)pgno * pgsize;
55	switch (op) {
56	case DB_IO_READ:
57		if (DB_GLOBAL(j_read) != NULL)
58			goto slow;
59#if defined(HAVE_STATISTICS)
60		++fhp->read_count;
61#endif
62		if (dbenv != NULL &&
63		    FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS_ALL))
64			__db_msg(env,
65			    "fileops: read %s: %lu bytes at offset %lu",
66			    fhp->name, (u_long)io_len, (u_long)offset);
67
68		LAST_PANIC_CHECK_BEFORE_IO(env);
69		nio = DB_GLOBAL(j_pread) != NULL ?
70		    DB_GLOBAL(j_pread)(fhp->fd, buf, io_len, offset) :
71		    pread(fhp->fd, buf, io_len, offset);
72		break;
73	case DB_IO_WRITE:
74		if (DB_GLOBAL(j_write) != NULL)
75			goto slow;
76#ifdef HAVE_FILESYSTEM_NOTZERO
77		if (__os_fs_notzero())
78			goto slow;
79#endif
80#if defined(HAVE_STATISTICS)
81		++fhp->write_count;
82#endif
83		if (dbenv != NULL &&
84		    FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS_ALL))
85			__db_msg(env,
86			    "fileops: write %s: %lu bytes at offset %lu",
87			    fhp->name, (u_long)io_len, (u_long)offset);
88
89		LAST_PANIC_CHECK_BEFORE_IO(env);
90		nio = DB_GLOBAL(j_pwrite) != NULL ?
91		    DB_GLOBAL(j_pwrite)(fhp->fd, buf, io_len, offset) :
92		    pwrite(fhp->fd, buf, io_len, offset);
93		break;
94	default:
95		return (EINVAL);
96	}
97	if (nio == (ssize_t)io_len) {
98		*niop = io_len;
99		return (0);
100	}
101slow:
102#endif
103	MUTEX_LOCK(env, fhp->mtx_fh);
104
105	if ((ret = __os_seek(env, fhp, pgno, pgsize, relative)) != 0)
106		goto err;
107	switch (op) {
108	case DB_IO_READ:
109		ret = __os_read(env, fhp, buf, io_len, niop);
110		break;
111	case DB_IO_WRITE:
112		ret = __os_write(env, fhp, buf, io_len, niop);
113		break;
114	default:
115		ret = EINVAL;
116		break;
117	}
118
119err:	MUTEX_UNLOCK(env, fhp->mtx_fh);
120
121	return (ret);
122
123}
124
125/*
126 * __os_read --
127 *	Read from a file handle.
128 *
129 * PUBLIC: int __os_read __P((ENV *, DB_FH *, void *, size_t, size_t *));
130 */
131int
132__os_read(env, fhp, addr, len, nrp)
133	ENV *env;
134	DB_FH *fhp;
135	void *addr;
136	size_t len;
137	size_t *nrp;
138{
139	DB_ENV *dbenv;
140	size_t offset;
141	ssize_t nr;
142	int ret;
143	u_int8_t *taddr;
144
145	dbenv = env == NULL ? NULL : env->dbenv;
146	ret = 0;
147
148	DB_ASSERT(env, F_ISSET(fhp, DB_FH_OPENED) && fhp->fd != -1);
149
150#if defined(HAVE_STATISTICS)
151	++fhp->read_count;
152#endif
153	if (dbenv != NULL && FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS_ALL))
154		__db_msg(env,
155		    "fileops: read %s: %lu bytes", fhp->name, (u_long)len);
156
157	if (DB_GLOBAL(j_read) != NULL) {
158		*nrp = len;
159		LAST_PANIC_CHECK_BEFORE_IO(env);
160		if (DB_GLOBAL(j_read)(fhp->fd, addr, len) != (ssize_t)len) {
161			ret = __os_get_syserr();
162			__db_syserr(env, ret, "read: %#lx, %lu",
163			    P_TO_ULONG(addr), (u_long)len);
164			ret = __os_posix_err(ret);
165		}
166		return (ret);
167	}
168
169	for (taddr = addr, offset = 0;
170	    offset < len; taddr += nr, offset += (u_int32_t)nr) {
171		LAST_PANIC_CHECK_BEFORE_IO(env);
172		RETRY_CHK(((nr = read(fhp->fd,
173		    CHAR_STAR_CAST taddr, len - offset)) < 0 ? 1 : 0), ret);
174		if (nr == 0 || ret != 0)
175			break;
176	}
177	*nrp = (size_t)(taddr - (u_int8_t *)addr);
178	if (ret != 0) {
179		__db_syserr(env, ret, "read: %#lx, %lu",
180		    P_TO_ULONG(taddr), (u_long)len - offset);
181		ret = __os_posix_err(ret);
182	}
183	return (ret);
184}
185
186/*
187 * __os_write --
188 *	Write to a file handle.
189 *
190 * PUBLIC: int __os_write __P((ENV *, DB_FH *, void *, size_t, size_t *));
191 */
192int
193__os_write(env, fhp, addr, len, nwp)
194	ENV *env;
195	DB_FH *fhp;
196	void *addr;
197	size_t len;
198	size_t *nwp;
199{
200	DB_ASSERT(env, F_ISSET(fhp, DB_FH_OPENED) && fhp->fd != -1);
201
202#ifdef HAVE_FILESYSTEM_NOTZERO
203	/* Zero-fill as necessary. */
204	if (__os_fs_notzero()) {
205		int ret;
206		if ((ret = __db_zero_fill(env, fhp)) != 0)
207			return (ret);
208	}
209#endif
210	return (__os_physwrite(env, fhp, addr, len, nwp));
211}
212
213/*
214 * __os_physwrite --
215 *	Physical write to a file handle.
216 *
217 * PUBLIC: int __os_physwrite
218 * PUBLIC:     __P((ENV *, DB_FH *, void *, size_t, size_t *));
219 */
220int
221__os_physwrite(env, fhp, addr, len, nwp)
222	ENV *env;
223	DB_FH *fhp;
224	void *addr;
225	size_t len;
226	size_t *nwp;
227{
228	DB_ENV *dbenv;
229	size_t offset;
230	ssize_t nw;
231	int ret;
232	u_int8_t *taddr;
233
234	dbenv = env == NULL ? NULL : env->dbenv;
235	ret = 0;
236
237#if defined(HAVE_STATISTICS)
238	++fhp->write_count;
239#endif
240	if (dbenv != NULL && FLD_ISSET(dbenv->verbose, DB_VERB_FILEOPS_ALL))
241		__db_msg(env,
242		    "fileops: write %s: %lu bytes", fhp->name, (u_long)len);
243
244#if defined(HAVE_FILESYSTEM_NOTZERO) && defined(DIAGNOSTIC)
245	if (__os_fs_notzero()) {
246		struct stat sb;
247		off_t cur_off;
248
249		DB_ASSERT(env, fstat(fhp->fd, &sb) != -1 &&
250		    (cur_off = lseek(fhp->fd, (off_t)0, SEEK_CUR)) != -1 &&
251		    cur_off <= sb.st_size);
252	}
253#endif
254	if (DB_GLOBAL(j_write) != NULL) {
255		*nwp = len;
256		LAST_PANIC_CHECK_BEFORE_IO(env);
257		if (DB_GLOBAL(j_write)(fhp->fd, addr, len) != (ssize_t)len) {
258			ret = __os_get_syserr();
259			__db_syserr(env, ret, "write: %#lx, %lu",
260			    P_TO_ULONG(addr), (u_long)len);
261			ret = __os_posix_err(ret);
262
263			DB_EVENT(env, DB_EVENT_WRITE_FAILED, NULL);
264		}
265		return (ret);
266	}
267
268	for (taddr = addr, offset = 0;
269	    offset < len; taddr += nw, offset += (u_int32_t)nw) {
270		LAST_PANIC_CHECK_BEFORE_IO(env);
271		RETRY_CHK(((nw = write(fhp->fd,
272		    CHAR_STAR_CAST taddr, len - offset)) < 0 ? 1 : 0), ret);
273		if (ret != 0)
274			break;
275	}
276	*nwp = len;
277	if (ret != 0) {
278		__db_syserr(env, ret, "write: %#lx, %lu",
279		    P_TO_ULONG(taddr), (u_long)len - offset);
280		ret = __os_posix_err(ret);
281
282		DB_EVENT(env, DB_EVENT_WRITE_FAILED, NULL);
283	}
284	return (ret);
285}
286