1/*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 2001,2008 Oracle. All rights reserved. 5 * 6 * $Id: fop_basic.c,v 12.30 2008/01/11 20:50:00 bostic Exp $ 7 */ 8 9#include "db_config.h" 10 11#include "db_int.h" 12#include "dbinc/db_page.h" 13#include "dbinc/fop.h" 14#include "dbinc/log.h" 15#include "dbinc/mp.h" 16#include "dbinc/txn.h" 17#include "dbinc/db_am.h" 18 19/* 20 * The transactional guarantees Berkeley DB provides for file 21 * system level operations (database physical file create, delete, 22 * rename) are based on our understanding of current file system 23 * semantics; a system that does not provide these semantics and 24 * guarantees could be in danger. 25 * 26 * First, as in standard database changes, fsync and fdatasync must 27 * work: when applied to the log file, the records written into the 28 * log must be transferred to stable storage. 29 * 30 * Second, it must not be possible for the log file to be removed 31 * without previous file system level operations being flushed to 32 * stable storage. Berkeley DB applications write log records 33 * describing file system operations into the log, then perform the 34 * file system operation, then commit the enclosing transaction 35 * (which flushes the log file to stable storage). Subsequently, 36 * a database environment checkpoint may make it possible for the 37 * application to remove the log file containing the record of the 38 * file system operation. DB's transactional guarantees for file 39 * system operations require the log file removal not succeed until 40 * all previous filesystem operations have been flushed to stable 41 * storage. In other words, the flush of the log file, or the 42 * removal of the log file, must block until all previous 43 * filesystem operations have been flushed to stable storage. This 44 * semantic is not, as far as we know, required by any existing 45 * standards document, but we have never seen a filesystem where 46 * it does not apply. 47 */ 48 49/* 50 * __fop_create -- 51 * Create a (transactionally protected) file system object. This is used 52 * to create DB files now, potentially blobs, queue extents and anything 53 * else you wish to store in a file system object. 54 * 55 * PUBLIC: int __fop_create __P((ENV *, 56 * PUBLIC: DB_TXN *, DB_FH **, const char *, APPNAME, int, u_int32_t)); 57 */ 58int 59__fop_create(env, txn, fhpp, name, appname, mode, flags) 60 ENV *env; 61 DB_TXN *txn; 62 DB_FH **fhpp; 63 const char *name; 64 APPNAME appname; 65 int mode; 66 u_int32_t flags; 67{ 68 DBT data; 69 DB_FH *fhp; 70 DB_LSN lsn; 71 int ret; 72 char *real_name; 73 74 real_name = NULL; 75 fhp = NULL; 76 77 if ((ret = 78 __db_appname(env, appname, name, 0, NULL, &real_name)) != 0) 79 return (ret); 80 81 if (mode == 0) 82 mode = DB_MODE_600; 83 84 if (DBENV_LOGGING(env) 85#if !defined(DEBUG_WOP) 86 && txn != NULL 87#endif 88 ) { 89 DB_INIT_DBT(data, name, strlen(name) + 1); 90 if ((ret = __fop_create_log(env, txn, &lsn, 91 flags | DB_FLUSH, 92 &data, (u_int32_t)appname, (u_int32_t)mode)) != 0) 93 goto err; 94 } 95 96 DB_ENV_TEST_RECOVERY(env, DB_TEST_POSTLOG, ret, name); 97 98 if (fhpp == NULL) 99 fhpp = &fhp; 100 ret = __os_open( 101 env, real_name, 0, DB_OSO_CREATE | DB_OSO_EXCL, mode, fhpp); 102 103err: 104DB_TEST_RECOVERY_LABEL 105 if (fhpp == &fhp && fhp != NULL) 106 (void)__os_closehandle(env, fhp); 107 if (real_name != NULL) 108 __os_free(env, real_name); 109 return (ret); 110} 111 112/* 113 * __fop_remove -- 114 * Remove a file system object. 115 * 116 * PUBLIC: int __fop_remove __P((ENV *, 117 * PUBLIC: DB_TXN *, u_int8_t *, const char *, APPNAME, u_int32_t)); 118 */ 119int 120__fop_remove(env, txn, fileid, name, appname, flags) 121 ENV *env; 122 DB_TXN *txn; 123 u_int8_t *fileid; 124 const char *name; 125 APPNAME appname; 126 u_int32_t flags; 127{ 128 DBT fdbt, ndbt; 129 DB_LSN lsn; 130 char *real_name; 131 int ret; 132 133 real_name = NULL; 134 135 if ((ret = 136 __db_appname(env, appname, name, 0, NULL, &real_name)) != 0) 137 goto err; 138 139 if (!IS_REAL_TXN(txn)) { 140 if (fileid != NULL && (ret = __memp_nameop( 141 env, fileid, NULL, real_name, NULL, 0)) != 0) 142 goto err; 143 } else { 144 if (DBENV_LOGGING(env) 145#if !defined(DEBUG_WOP) 146 && txn != NULL 147#endif 148 ) { 149 memset(&fdbt, 0, sizeof(ndbt)); 150 fdbt.data = fileid; 151 fdbt.size = fileid == NULL ? 0 : DB_FILE_ID_LEN; 152 DB_INIT_DBT(ndbt, name, strlen(name) + 1); 153 if ((ret = __fop_remove_log(env, txn, &lsn, 154 flags, &ndbt, &fdbt, (u_int32_t)appname)) != 0) 155 goto err; 156 } 157 ret = __txn_remevent(env, txn, real_name, fileid, 0); 158 } 159 160err: if (real_name != NULL) 161 __os_free(env, real_name); 162 return (ret); 163} 164 165/* 166 * __fop_write 167 * 168 * Write "size" bytes from "buf" to file "name" beginning at offset "off." 169 * If the file is open, supply a handle in fhp. Istmp indicate if this is 170 * an operation that needs to be undone in the face of failure (i.e., if 171 * this is a write to a temporary file, we're simply going to remove the 172 * file, so don't worry about undoing the write). 173 * 174 * Currently, we *only* use this with istmp true. If we need more general 175 * handling, then we'll have to zero out regions on abort (and possibly 176 * log the before image of the data in the log record). 177 * 178 * PUBLIC: int __fop_write __P((ENV *, 179 * PUBLIC: DB_TXN *, const char *, APPNAME, DB_FH *, u_int32_t, db_pgno_t, 180 * PUBLIC: u_int32_t, void *, u_int32_t, u_int32_t, u_int32_t)); 181 */ 182int 183__fop_write(env, 184 txn, name, appname, fhp, pgsize, pageno, off, buf, size, istmp, flags) 185 ENV *env; 186 DB_TXN *txn; 187 const char *name; 188 APPNAME appname; 189 DB_FH *fhp; 190 u_int32_t pgsize; 191 db_pgno_t pageno; 192 u_int32_t off; 193 void *buf; 194 u_int32_t size, istmp, flags; 195{ 196 DBT data, namedbt; 197 DB_LSN lsn; 198 size_t nbytes; 199 int local_open, ret, t_ret; 200 char *real_name; 201 202 DB_ASSERT(env, istmp != 0); 203 204 ret = local_open = 0; 205 real_name = NULL; 206 207 if ((ret = 208 __db_appname(env, appname, name, 0, NULL, &real_name)) != 0) 209 return (ret); 210 211 if (DBENV_LOGGING(env) 212#if !defined(DEBUG_WOP) 213 && txn != NULL 214#endif 215 ) { 216 memset(&data, 0, sizeof(data)); 217 data.data = buf; 218 data.size = size; 219 DB_INIT_DBT(namedbt, name, strlen(name) + 1); 220 if ((ret = __fop_write_log(env, txn, 221 &lsn, flags, &namedbt, (u_int32_t)appname, 222 pgsize, pageno, off, &data, istmp)) != 0) 223 goto err; 224 } 225 226 if (fhp == NULL) { 227 /* File isn't open; we need to reopen it. */ 228 if ((ret = __os_open(env, real_name, 0, 0, 0, &fhp)) != 0) 229 goto err; 230 local_open = 1; 231 } 232 233 /* Seek to offset. */ 234 if ((ret = __os_seek(env, fhp, pageno, pgsize, off)) != 0) 235 goto err; 236 237 /* Now do the write. */ 238 if ((ret = __os_write(env, fhp, buf, size, &nbytes)) != 0) 239 goto err; 240 241err: if (local_open && 242 (t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0) 243 ret = t_ret; 244 245 if (real_name != NULL) 246 __os_free(env, real_name); 247 return (ret); 248} 249 250/* 251 * __fop_rename -- 252 * Change a file's name. 253 * 254 * PUBLIC: int __fop_rename __P((ENV *, DB_TXN *, const char *, 255 * PUBLIC: const char *, u_int8_t *, APPNAME, int, u_int32_t)); 256 */ 257int 258__fop_rename(env, txn, oldname, newname, fid, appname, with_undo, flags) 259 ENV *env; 260 DB_TXN *txn; 261 const char *oldname; 262 const char *newname; 263 u_int8_t *fid; 264 APPNAME appname; 265 int with_undo; 266 u_int32_t flags; 267{ 268 DBT fiddbt, new, old; 269 DB_LSN lsn; 270 int ret; 271 char *n, *o; 272 273 o = n = NULL; 274 if ((ret = __db_appname(env, appname, oldname, 0, NULL, &o)) != 0) 275 goto err; 276 if ((ret = __db_appname(env, appname, newname, 0, NULL, &n)) != 0) 277 goto err; 278 279 if (DBENV_LOGGING(env) 280#if !defined(DEBUG_WOP) 281 && txn != NULL 282#endif 283 ) { 284 DB_INIT_DBT(old, oldname, strlen(oldname) + 1); 285 DB_INIT_DBT(new, newname, strlen(newname) + 1); 286 memset(&fiddbt, 0, sizeof(fiddbt)); 287 fiddbt.data = fid; 288 fiddbt.size = DB_FILE_ID_LEN; 289 if (with_undo) 290 ret = __fop_rename_log(env, 291 txn, &lsn, flags | DB_FLUSH, 292 &old, &new, &fiddbt, (u_int32_t)appname); 293 else 294 ret = __fop_rename_noundo_log(env, 295 txn, &lsn, flags | DB_FLUSH, 296 &old, &new, &fiddbt, (u_int32_t)appname); 297 if (ret != 0) 298 goto err; 299 } 300 301 ret = __memp_nameop(env, fid, newname, o, n, 0); 302 303err: if (o != NULL) 304 __os_free(env, o); 305 if (n != NULL) 306 __os_free(env, n); 307 return (ret); 308} 309