1/* -*- mode: c; c-basic-offset: 8; -*- 2 * vim: noexpandtab sw=8 ts=8 sts=0: 3 * 4 * dlmfs.c 5 * 6 * Code which implements the kernel side of a minimal userspace 7 * interface to our DLM. This file handles the virtual file system 8 * used for communication with userspace. Credit should go to ramfs, 9 * which was a template for the fs side of this module. 10 * 11 * Copyright (C) 2003, 2004 Oracle. All rights reserved. 12 * 13 * This program is free software; you can redistribute it and/or 14 * modify it under the terms of the GNU General Public 15 * License as published by the Free Software Foundation; either 16 * version 2 of the License, or (at your option) any later version. 17 * 18 * This program is distributed in the hope that it will be useful, 19 * but WITHOUT ANY WARRANTY; without even the implied warranty of 20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 21 * General Public License for more details. 22 * 23 * You should have received a copy of the GNU General Public 24 * License along with this program; if not, write to the 25 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 26 * Boston, MA 021110-1307, USA. 27 */ 28 29/* Simple VFS hooks based on: */ 30/* 31 * Resizable simple ram filesystem for Linux. 32 * 33 * Copyright (C) 2000 Linus Torvalds. 34 * 2000 Transmeta Corp. 35 */ 36 37#include <linux/module.h> 38#include <linux/fs.h> 39#include <linux/pagemap.h> 40#include <linux/types.h> 41#include <linux/slab.h> 42#include <linux/highmem.h> 43#include <linux/init.h> 44#include <linux/string.h> 45#include <linux/backing-dev.h> 46 47#include <asm/uaccess.h> 48 49 50#include "cluster/nodemanager.h" 51#include "cluster/heartbeat.h" 52#include "cluster/tcp.h" 53 54#include "dlmapi.h" 55 56#include "userdlm.h" 57 58#include "dlmfsver.h" 59 60#define MLOG_MASK_PREFIX ML_DLMFS 61#include "cluster/masklog.h" 62 63static const struct super_operations dlmfs_ops; 64static const struct file_operations dlmfs_file_operations; 65static const struct inode_operations dlmfs_dir_inode_operations; 66static const struct inode_operations dlmfs_root_inode_operations; 67static const struct inode_operations dlmfs_file_inode_operations; 68static struct kmem_cache *dlmfs_inode_cache; 69 70struct workqueue_struct *user_dlm_worker; 71 72/* 73 * decodes a set of open flags into a valid lock level and a set of flags. 74 * returns < 0 if we have invalid flags 75 * flags which mean something to us: 76 * O_RDONLY -> PRMODE level 77 * O_WRONLY -> EXMODE level 78 * 79 * O_NONBLOCK -> LKM_NOQUEUE 80 */ 81static int dlmfs_decode_open_flags(int open_flags, 82 int *level, 83 int *flags) 84{ 85 if (open_flags & (O_WRONLY|O_RDWR)) 86 *level = LKM_EXMODE; 87 else 88 *level = LKM_PRMODE; 89 90 *flags = 0; 91 if (open_flags & O_NONBLOCK) 92 *flags |= LKM_NOQUEUE; 93 94 return 0; 95} 96 97static int dlmfs_file_open(struct inode *inode, 98 struct file *file) 99{ 100 int status, level, flags; 101 struct dlmfs_filp_private *fp = NULL; 102 struct dlmfs_inode_private *ip; 103 104 if (S_ISDIR(inode->i_mode)) 105 BUG(); 106 107 mlog(0, "open called on inode %lu, flags 0x%x\n", inode->i_ino, 108 file->f_flags); 109 110 status = dlmfs_decode_open_flags(file->f_flags, &level, &flags); 111 if (status < 0) 112 goto bail; 113 114 /* We don't want to honor O_APPEND at read/write time as it 115 * doesn't make sense for LVB writes. */ 116 file->f_flags &= ~O_APPEND; 117 118 fp = kmalloc(sizeof(*fp), GFP_NOFS); 119 if (!fp) { 120 status = -ENOMEM; 121 goto bail; 122 } 123 fp->fp_lock_level = level; 124 125 ip = DLMFS_I(inode); 126 127 status = user_dlm_cluster_lock(&ip->ip_lockres, level, flags); 128 if (status < 0) { 129 /* this is a strange error to return here but I want 130 * to be able userspace to be able to distinguish a 131 * valid lock request from one that simply couldn't be 132 * granted. */ 133 if (flags & LKM_NOQUEUE && status == -EAGAIN) 134 status = -ETXTBSY; 135 kfree(fp); 136 goto bail; 137 } 138 139 file->private_data = fp; 140bail: 141 return status; 142} 143 144static int dlmfs_file_release(struct inode *inode, 145 struct file *file) 146{ 147 int level, status; 148 struct dlmfs_inode_private *ip = DLMFS_I(inode); 149 struct dlmfs_filp_private *fp = 150 (struct dlmfs_filp_private *) file->private_data; 151 152 if (S_ISDIR(inode->i_mode)) 153 BUG(); 154 155 mlog(0, "close called on inode %lu\n", inode->i_ino); 156 157 status = 0; 158 if (fp) { 159 level = fp->fp_lock_level; 160 if (level != LKM_IVMODE) 161 user_dlm_cluster_unlock(&ip->ip_lockres, level); 162 163 kfree(fp); 164 file->private_data = NULL; 165 } 166 167 return 0; 168} 169 170static ssize_t dlmfs_file_read(struct file *filp, 171 char __user *buf, 172 size_t count, 173 loff_t *ppos) 174{ 175 int bytes_left; 176 ssize_t readlen; 177 char *lvb_buf; 178 struct inode *inode = filp->f_path.dentry->d_inode; 179 180 mlog(0, "inode %lu, count = %zu, *ppos = %llu\n", 181 inode->i_ino, count, *ppos); 182 183 if (*ppos >= i_size_read(inode)) 184 return 0; 185 186 if (!count) 187 return 0; 188 189 if (!access_ok(VERIFY_WRITE, buf, count)) 190 return -EFAULT; 191 192 /* don't read past the lvb */ 193 if ((count + *ppos) > i_size_read(inode)) 194 readlen = i_size_read(inode) - *ppos; 195 else 196 readlen = count - *ppos; 197 198 lvb_buf = kmalloc(readlen, GFP_NOFS); 199 if (!lvb_buf) 200 return -ENOMEM; 201 202 user_dlm_read_lvb(inode, lvb_buf, readlen); 203 bytes_left = __copy_to_user(buf, lvb_buf, readlen); 204 readlen -= bytes_left; 205 206 kfree(lvb_buf); 207 208 *ppos = *ppos + readlen; 209 210 mlog(0, "read %zd bytes\n", readlen); 211 return readlen; 212} 213 214static ssize_t dlmfs_file_write(struct file *filp, 215 const char __user *buf, 216 size_t count, 217 loff_t *ppos) 218{ 219 int bytes_left; 220 ssize_t writelen; 221 char *lvb_buf; 222 struct inode *inode = filp->f_path.dentry->d_inode; 223 224 mlog(0, "inode %lu, count = %zu, *ppos = %llu\n", 225 inode->i_ino, count, *ppos); 226 227 if (*ppos >= i_size_read(inode)) 228 return -ENOSPC; 229 230 if (!count) 231 return 0; 232 233 if (!access_ok(VERIFY_READ, buf, count)) 234 return -EFAULT; 235 236 /* don't write past the lvb */ 237 if ((count + *ppos) > i_size_read(inode)) 238 writelen = i_size_read(inode) - *ppos; 239 else 240 writelen = count - *ppos; 241 242 lvb_buf = kmalloc(writelen, GFP_NOFS); 243 if (!lvb_buf) 244 return -ENOMEM; 245 246 bytes_left = copy_from_user(lvb_buf, buf, writelen); 247 writelen -= bytes_left; 248 if (writelen) 249 user_dlm_write_lvb(inode, lvb_buf, writelen); 250 251 kfree(lvb_buf); 252 253 *ppos = *ppos + writelen; 254 mlog(0, "wrote %zd bytes\n", writelen); 255 return writelen; 256} 257 258static void dlmfs_init_once(void *foo, 259 struct kmem_cache *cachep, 260 unsigned long flags) 261{ 262 struct dlmfs_inode_private *ip = 263 (struct dlmfs_inode_private *) foo; 264 265 ip->ip_dlm = NULL; 266 ip->ip_parent = NULL; 267 268 inode_init_once(&ip->ip_vfs_inode); 269} 270 271static struct inode *dlmfs_alloc_inode(struct super_block *sb) 272{ 273 struct dlmfs_inode_private *ip; 274 275 ip = kmem_cache_alloc(dlmfs_inode_cache, GFP_NOFS); 276 if (!ip) 277 return NULL; 278 279 return &ip->ip_vfs_inode; 280} 281 282static void dlmfs_destroy_inode(struct inode *inode) 283{ 284 kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode)); 285} 286 287static void dlmfs_clear_inode(struct inode *inode) 288{ 289 int status; 290 struct dlmfs_inode_private *ip; 291 292 if (!inode) 293 return; 294 295 mlog(0, "inode %lu\n", inode->i_ino); 296 297 ip = DLMFS_I(inode); 298 299 if (S_ISREG(inode->i_mode)) { 300 status = user_dlm_destroy_lock(&ip->ip_lockres); 301 if (status < 0) 302 mlog_errno(status); 303 iput(ip->ip_parent); 304 goto clear_fields; 305 } 306 307 mlog(0, "we're a directory, ip->ip_dlm = 0x%p\n", ip->ip_dlm); 308 /* we must be a directory. If required, lets unregister the 309 * dlm context now. */ 310 if (ip->ip_dlm) 311 user_dlm_unregister_context(ip->ip_dlm); 312clear_fields: 313 ip->ip_parent = NULL; 314 ip->ip_dlm = NULL; 315} 316 317static struct backing_dev_info dlmfs_backing_dev_info = { 318 .ra_pages = 0, /* No readahead */ 319 .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, 320}; 321 322static struct inode *dlmfs_get_root_inode(struct super_block *sb) 323{ 324 struct inode *inode = new_inode(sb); 325 int mode = S_IFDIR | 0755; 326 struct dlmfs_inode_private *ip; 327 328 if (inode) { 329 ip = DLMFS_I(inode); 330 331 inode->i_mode = mode; 332 inode->i_uid = current->fsuid; 333 inode->i_gid = current->fsgid; 334 inode->i_blocks = 0; 335 inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; 336 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 337 inc_nlink(inode); 338 339 inode->i_fop = &simple_dir_operations; 340 inode->i_op = &dlmfs_root_inode_operations; 341 } 342 343 return inode; 344} 345 346static struct inode *dlmfs_get_inode(struct inode *parent, 347 struct dentry *dentry, 348 int mode) 349{ 350 struct super_block *sb = parent->i_sb; 351 struct inode * inode = new_inode(sb); 352 struct dlmfs_inode_private *ip; 353 354 if (!inode) 355 return NULL; 356 357 inode->i_mode = mode; 358 inode->i_uid = current->fsuid; 359 inode->i_gid = current->fsgid; 360 inode->i_blocks = 0; 361 inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; 362 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 363 364 ip = DLMFS_I(inode); 365 ip->ip_dlm = DLMFS_I(parent)->ip_dlm; 366 367 switch (mode & S_IFMT) { 368 default: 369 /* for now we don't support anything other than 370 * directories and regular files. */ 371 BUG(); 372 break; 373 case S_IFREG: 374 inode->i_op = &dlmfs_file_inode_operations; 375 inode->i_fop = &dlmfs_file_operations; 376 377 i_size_write(inode, DLM_LVB_LEN); 378 379 user_dlm_lock_res_init(&ip->ip_lockres, dentry); 380 381 /* released at clear_inode time, this insures that we 382 * get to drop the dlm reference on each lock *before* 383 * we call the unregister code for releasing parent 384 * directories. */ 385 ip->ip_parent = igrab(parent); 386 BUG_ON(!ip->ip_parent); 387 break; 388 case S_IFDIR: 389 inode->i_op = &dlmfs_dir_inode_operations; 390 inode->i_fop = &simple_dir_operations; 391 392 /* directory inodes start off with i_nlink == 393 * 2 (for "." entry) */ 394 inc_nlink(inode); 395 break; 396 } 397 398 if (parent->i_mode & S_ISGID) { 399 inode->i_gid = parent->i_gid; 400 if (S_ISDIR(mode)) 401 inode->i_mode |= S_ISGID; 402 } 403 404 return inode; 405} 406 407/* 408 * File creation. Allocate an inode, and we're done.. 409 */ 410/* SMP-safe */ 411static int dlmfs_mkdir(struct inode * dir, 412 struct dentry * dentry, 413 int mode) 414{ 415 int status; 416 struct inode *inode = NULL; 417 struct qstr *domain = &dentry->d_name; 418 struct dlmfs_inode_private *ip; 419 struct dlm_ctxt *dlm; 420 421 mlog(0, "mkdir %.*s\n", domain->len, domain->name); 422 423 /* verify that we have a proper domain */ 424 if (domain->len >= O2NM_MAX_NAME_LEN) { 425 status = -EINVAL; 426 mlog(ML_ERROR, "invalid domain name for directory.\n"); 427 goto bail; 428 } 429 430 inode = dlmfs_get_inode(dir, dentry, mode | S_IFDIR); 431 if (!inode) { 432 status = -ENOMEM; 433 mlog_errno(status); 434 goto bail; 435 } 436 437 ip = DLMFS_I(inode); 438 439 dlm = user_dlm_register_context(domain); 440 if (IS_ERR(dlm)) { 441 status = PTR_ERR(dlm); 442 mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n", 443 status, domain->len, domain->name); 444 goto bail; 445 } 446 ip->ip_dlm = dlm; 447 448 inc_nlink(dir); 449 d_instantiate(dentry, inode); 450 dget(dentry); /* Extra count - pin the dentry in core */ 451 452 status = 0; 453bail: 454 if (status < 0) 455 iput(inode); 456 return status; 457} 458 459static int dlmfs_create(struct inode *dir, 460 struct dentry *dentry, 461 int mode, 462 struct nameidata *nd) 463{ 464 int status = 0; 465 struct inode *inode; 466 struct qstr *name = &dentry->d_name; 467 468 mlog(0, "create %.*s\n", name->len, name->name); 469 470 /* verify name is valid and doesn't contain any dlm reserved 471 * characters */ 472 if (name->len >= USER_DLM_LOCK_ID_MAX_LEN || 473 name->name[0] == '$') { 474 status = -EINVAL; 475 mlog(ML_ERROR, "invalid lock name, %.*s\n", name->len, 476 name->name); 477 goto bail; 478 } 479 480 inode = dlmfs_get_inode(dir, dentry, mode | S_IFREG); 481 if (!inode) { 482 status = -ENOMEM; 483 mlog_errno(status); 484 goto bail; 485 } 486 487 d_instantiate(dentry, inode); 488 dget(dentry); /* Extra count - pin the dentry in core */ 489bail: 490 return status; 491} 492 493static int dlmfs_unlink(struct inode *dir, 494 struct dentry *dentry) 495{ 496 int status; 497 struct inode *inode = dentry->d_inode; 498 499 mlog(0, "unlink inode %lu\n", inode->i_ino); 500 501 /* if there are no current holders, or none that are waiting 502 * to acquire a lock, this basically destroys our lockres. */ 503 status = user_dlm_destroy_lock(&DLMFS_I(inode)->ip_lockres); 504 if (status < 0) { 505 mlog(ML_ERROR, "unlink %.*s, error %d from destroy\n", 506 dentry->d_name.len, dentry->d_name.name, status); 507 goto bail; 508 } 509 status = simple_unlink(dir, dentry); 510bail: 511 return status; 512} 513 514static int dlmfs_fill_super(struct super_block * sb, 515 void * data, 516 int silent) 517{ 518 struct inode * inode; 519 struct dentry * root; 520 521 sb->s_maxbytes = MAX_LFS_FILESIZE; 522 sb->s_blocksize = PAGE_CACHE_SIZE; 523 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 524 sb->s_magic = DLMFS_MAGIC; 525 sb->s_op = &dlmfs_ops; 526 inode = dlmfs_get_root_inode(sb); 527 if (!inode) 528 return -ENOMEM; 529 530 root = d_alloc_root(inode); 531 if (!root) { 532 iput(inode); 533 return -ENOMEM; 534 } 535 sb->s_root = root; 536 return 0; 537} 538 539static const struct file_operations dlmfs_file_operations = { 540 .open = dlmfs_file_open, 541 .release = dlmfs_file_release, 542 .read = dlmfs_file_read, 543 .write = dlmfs_file_write, 544}; 545 546static const struct inode_operations dlmfs_dir_inode_operations = { 547 .create = dlmfs_create, 548 .lookup = simple_lookup, 549 .unlink = dlmfs_unlink, 550}; 551 552/* this way we can restrict mkdir to only the toplevel of the fs. */ 553static const struct inode_operations dlmfs_root_inode_operations = { 554 .lookup = simple_lookup, 555 .mkdir = dlmfs_mkdir, 556 .rmdir = simple_rmdir, 557}; 558 559static const struct super_operations dlmfs_ops = { 560 .statfs = simple_statfs, 561 .alloc_inode = dlmfs_alloc_inode, 562 .destroy_inode = dlmfs_destroy_inode, 563 .clear_inode = dlmfs_clear_inode, 564 .drop_inode = generic_delete_inode, 565}; 566 567static const struct inode_operations dlmfs_file_inode_operations = { 568 .getattr = simple_getattr, 569}; 570 571static int dlmfs_get_sb(struct file_system_type *fs_type, 572 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 573{ 574 return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super, mnt); 575} 576 577static struct file_system_type dlmfs_fs_type = { 578 .owner = THIS_MODULE, 579 .name = "ocfs2_dlmfs", 580 .get_sb = dlmfs_get_sb, 581 .kill_sb = kill_litter_super, 582}; 583 584static int __init init_dlmfs_fs(void) 585{ 586 int status; 587 int cleanup_inode = 0, cleanup_worker = 0; 588 589 dlmfs_print_version(); 590 591 dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache", 592 sizeof(struct dlmfs_inode_private), 593 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| 594 SLAB_MEM_SPREAD), 595 dlmfs_init_once, NULL); 596 if (!dlmfs_inode_cache) 597 return -ENOMEM; 598 cleanup_inode = 1; 599 600 user_dlm_worker = create_singlethread_workqueue("user_dlm"); 601 if (!user_dlm_worker) { 602 status = -ENOMEM; 603 goto bail; 604 } 605 cleanup_worker = 1; 606 607 status = register_filesystem(&dlmfs_fs_type); 608bail: 609 if (status) { 610 if (cleanup_inode) 611 kmem_cache_destroy(dlmfs_inode_cache); 612 if (cleanup_worker) 613 destroy_workqueue(user_dlm_worker); 614 } else 615 printk("OCFS2 User DLM kernel interface loaded\n"); 616 return status; 617} 618 619static void __exit exit_dlmfs_fs(void) 620{ 621 unregister_filesystem(&dlmfs_fs_type); 622 623 flush_workqueue(user_dlm_worker); 624 destroy_workqueue(user_dlm_worker); 625 626 kmem_cache_destroy(dlmfs_inode_cache); 627} 628 629MODULE_AUTHOR("Oracle"); 630MODULE_LICENSE("GPL"); 631 632module_init(init_dlmfs_fs) 633module_exit(exit_dlmfs_fs) 634