1/* 2 * linux/fs/nfs/direct.c 3 * 4 * Copyright (C) 2003 by Chuck Lever <cel@netapp.com> 5 * 6 * High-performance uncached I/O for the Linux NFS client 7 * 8 * There are important applications whose performance or correctness 9 * depends on uncached access to file data. Database clusters 10 * (multiple copies of the same instance running on separate hosts) 11 * implement their own cache coherency protocol that subsumes file 12 * system cache protocols. Applications that process datasets 13 * considerably larger than the client's memory do not always benefit 14 * from a local cache. A streaming video server, for instance, has no 15 * need to cache the contents of a file. 16 * 17 * When an application requests uncached I/O, all read and write requests 18 * are made directly to the server; data stored or fetched via these 19 * requests is not cached in the Linux page cache. The client does not 20 * correct unaligned requests from applications. All requested bytes are 21 * held on permanent storage before a direct write system call returns to 22 * an application. 23 * 24 * Solaris implements an uncached I/O facility called directio() that 25 * is used for backups and sequential I/O to very large files. Solaris 26 * also supports uncaching whole NFS partitions with "-o forcedirectio," 27 * an undocumented mount option. 28 * 29 * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with 30 * help from Andrew Morton. 31 * 32 * 18 Dec 2001 Initial implementation for 2.4 --cel 33 * 08 Jul 2002 Version for 2.4.19, with bug fixes --trondmy 34 * 08 Jun 2003 Port to 2.5 APIs --cel 35 * 31 Mar 2004 Handle direct I/O without VFS support --cel 36 * 15 Sep 2004 Parallel async reads --cel 37 * 04 May 2005 support O_DIRECT with aio --cel 38 * 39 */ 40 41#include <linux/errno.h> 42#include <linux/sched.h> 43#include <linux/kernel.h> 44#include <linux/file.h> 45#include <linux/pagemap.h> 46#include <linux/kref.h> 47 48#include <linux/nfs_fs.h> 49#include <linux/nfs_page.h> 50#include <linux/sunrpc/clnt.h> 51 52#include <asm/system.h> 53#include <asm/uaccess.h> 54#include <asm/atomic.h> 55 56#include "internal.h" 57#include "iostat.h" 58 59#define NFSDBG_FACILITY NFSDBG_VFS 60 61static struct kmem_cache *nfs_direct_cachep; 62 63/* 64 * This represents a set of asynchronous requests that we're waiting on 65 */ 66struct nfs_direct_req { 67 struct kref kref; /* release manager */ 68 69 /* I/O parameters */ 70 struct nfs_open_context *ctx; /* file open context info */ 71 struct kiocb * iocb; /* controlling i/o request */ 72 struct inode * inode; /* target file of i/o */ 73 74 /* completion state */ 75 atomic_t io_count; /* i/os we're waiting for */ 76 spinlock_t lock; /* protect completion state */ 77 ssize_t count, /* bytes actually processed */ 78 error; /* any reported error */ 79 struct completion completion; /* wait for i/o completion */ 80 81 /* commit state */ 82 struct list_head rewrite_list; /* saved nfs_write_data structs */ 83 struct nfs_write_data * commit_data; /* special write_data for commits */ 84 int flags; 85#define NFS_ODIRECT_DO_COMMIT (1) /* an unstable reply was received */ 86#define NFS_ODIRECT_RESCHED_WRITES (2) /* write verification failed */ 87 struct nfs_writeverf verf; /* unstable write verifier */ 88}; 89 90static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode); 91static const struct rpc_call_ops nfs_write_direct_ops; 92 93static inline void get_dreq(struct nfs_direct_req *dreq) 94{ 95 atomic_inc(&dreq->io_count); 96} 97 98static inline int put_dreq(struct nfs_direct_req *dreq) 99{ 100 return atomic_dec_and_test(&dreq->io_count); 101} 102 103/** 104 * nfs_direct_IO - NFS address space operation for direct I/O 105 * @rw: direction (read or write) 106 * @iocb: target I/O control block 107 * @iov: array of vectors that define I/O buffer 108 * @pos: offset in file to begin the operation 109 * @nr_segs: size of iovec array 110 * 111 * The presence of this routine in the address space ops vector means 112 * the NFS client supports direct I/O. However, we shunt off direct 113 * read and write requests before the VFS gets them, so this method 114 * should never be called. 115 */ 116ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs) 117{ 118 dprintk("NFS: nfs_direct_IO (%s) off/no(%Ld/%lu) EINVAL\n", 119 iocb->ki_filp->f_path.dentry->d_name.name, 120 (long long) pos, nr_segs); 121 122 return -EINVAL; 123} 124 125static void nfs_direct_dirty_pages(struct page **pages, unsigned int pgbase, size_t count) 126{ 127 unsigned int npages; 128 unsigned int i; 129 130 if (count == 0) 131 return; 132 pages += (pgbase >> PAGE_SHIFT); 133 npages = (count + (pgbase & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT; 134 for (i = 0; i < npages; i++) { 135 struct page *page = pages[i]; 136 if (!PageCompound(page)) 137 set_page_dirty(page); 138 } 139} 140 141static void nfs_direct_release_pages(struct page **pages, unsigned int npages) 142{ 143 unsigned int i; 144 for (i = 0; i < npages; i++) 145 page_cache_release(pages[i]); 146} 147 148static inline struct nfs_direct_req *nfs_direct_req_alloc(void) 149{ 150 struct nfs_direct_req *dreq; 151 152 dreq = kmem_cache_alloc(nfs_direct_cachep, GFP_KERNEL); 153 if (!dreq) 154 return NULL; 155 156 kref_init(&dreq->kref); 157 kref_get(&dreq->kref); 158 init_completion(&dreq->completion); 159 INIT_LIST_HEAD(&dreq->rewrite_list); 160 dreq->iocb = NULL; 161 dreq->ctx = NULL; 162 spin_lock_init(&dreq->lock); 163 atomic_set(&dreq->io_count, 0); 164 dreq->count = 0; 165 dreq->error = 0; 166 dreq->flags = 0; 167 168 return dreq; 169} 170 171static void nfs_direct_req_free(struct kref *kref) 172{ 173 struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref); 174 175 if (dreq->ctx != NULL) 176 put_nfs_open_context(dreq->ctx); 177 kmem_cache_free(nfs_direct_cachep, dreq); 178} 179 180static void nfs_direct_req_release(struct nfs_direct_req *dreq) 181{ 182 kref_put(&dreq->kref, nfs_direct_req_free); 183} 184 185/* 186 * Collects and returns the final error value/byte-count. 187 */ 188static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq) 189{ 190 ssize_t result = -EIOCBQUEUED; 191 192 /* Async requests don't wait here */ 193 if (dreq->iocb) 194 goto out; 195 196 result = wait_for_completion_interruptible(&dreq->completion); 197 198 if (!result) 199 result = dreq->error; 200 if (!result) 201 result = dreq->count; 202 203out: 204 return (ssize_t) result; 205} 206 207/* 208 * Synchronous I/O uses a stack-allocated iocb. Thus we can't trust 209 * the iocb is still valid here if this is a synchronous request. 210 */ 211static void nfs_direct_complete(struct nfs_direct_req *dreq) 212{ 213 if (dreq->iocb) { 214 long res = (long) dreq->error; 215 if (!res) 216 res = (long) dreq->count; 217 aio_complete(dreq->iocb, res, 0); 218 } 219 complete_all(&dreq->completion); 220 221 nfs_direct_req_release(dreq); 222} 223 224/* 225 * We must hold a reference to all the pages in this direct read request 226 * until the RPCs complete. This could be long *after* we are woken up in 227 * nfs_direct_wait (for instance, if someone hits ^C on a slow server). 228 */ 229static void nfs_direct_read_result(struct rpc_task *task, void *calldata) 230{ 231 struct nfs_read_data *data = calldata; 232 struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; 233 234 if (nfs_readpage_result(task, data) != 0) 235 return; 236 237 spin_lock(&dreq->lock); 238 if (unlikely(task->tk_status < 0)) { 239 dreq->error = task->tk_status; 240 spin_unlock(&dreq->lock); 241 } else { 242 dreq->count += data->res.count; 243 spin_unlock(&dreq->lock); 244 nfs_direct_dirty_pages(data->pagevec, 245 data->args.pgbase, 246 data->res.count); 247 } 248 nfs_direct_release_pages(data->pagevec, data->npages); 249 250 if (put_dreq(dreq)) 251 nfs_direct_complete(dreq); 252} 253 254static const struct rpc_call_ops nfs_read_direct_ops = { 255 .rpc_call_done = nfs_direct_read_result, 256 .rpc_release = nfs_readdata_release, 257}; 258 259/* 260 * For each rsize'd chunk of the user's buffer, dispatch an NFS READ 261 * operation. If nfs_readdata_alloc() or get_user_pages() fails, 262 * bail and stop sending more reads. Read length accounting is 263 * handled automatically by nfs_direct_read_result(). Otherwise, if 264 * no requests have been sent, just return an error. 265 */ 266static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos) 267{ 268 struct nfs_open_context *ctx = dreq->ctx; 269 struct inode *inode = ctx->dentry->d_inode; 270 size_t rsize = NFS_SERVER(inode)->rsize; 271 unsigned int pgbase; 272 int result; 273 ssize_t started = 0; 274 275 get_dreq(dreq); 276 277 do { 278 struct nfs_read_data *data; 279 size_t bytes; 280 281 pgbase = user_addr & ~PAGE_MASK; 282 bytes = min(rsize,count); 283 284 result = -ENOMEM; 285 data = nfs_readdata_alloc(nfs_page_array_len(pgbase, bytes)); 286 if (unlikely(!data)) 287 break; 288 289 down_read(¤t->mm->mmap_sem); 290 result = get_user_pages(current, current->mm, user_addr, 291 data->npages, 1, 0, data->pagevec, NULL); 292 up_read(¤t->mm->mmap_sem); 293 if (result < 0) { 294 nfs_readdata_release(data); 295 break; 296 } 297 if ((unsigned)result < data->npages) { 298 nfs_direct_release_pages(data->pagevec, result); 299 nfs_readdata_release(data); 300 break; 301 } 302 303 get_dreq(dreq); 304 305 data->req = (struct nfs_page *) dreq; 306 data->inode = inode; 307 data->cred = ctx->cred; 308 data->args.fh = NFS_FH(inode); 309 data->args.context = ctx; 310 data->args.offset = pos; 311 data->args.pgbase = pgbase; 312 data->args.pages = data->pagevec; 313 data->args.count = bytes; 314 data->res.fattr = &data->fattr; 315 data->res.eof = 0; 316 data->res.count = bytes; 317 318 rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC, 319 &nfs_read_direct_ops, data); 320 NFS_PROTO(inode)->read_setup(data); 321 322 data->task.tk_cookie = (unsigned long) inode; 323 324 rpc_execute(&data->task); 325 326 dprintk("NFS: %5u initiated direct read call " 327 "(req %s/%Ld, %zu bytes @ offset %Lu)\n", 328 data->task.tk_pid, 329 inode->i_sb->s_id, 330 (long long)NFS_FILEID(inode), 331 bytes, 332 (unsigned long long)data->args.offset); 333 334 started += bytes; 335 user_addr += bytes; 336 pos += bytes; 337 pgbase += bytes; 338 pgbase &= ~PAGE_MASK; 339 BUG_ON(pgbase != (user_addr & ~PAGE_MASK)); 340 341 count -= bytes; 342 } while (count != 0); 343 344 if (put_dreq(dreq)) 345 nfs_direct_complete(dreq); 346 347 if (started) 348 return 0; 349 return result < 0 ? (ssize_t) result : -EFAULT; 350} 351 352static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos) 353{ 354 ssize_t result = 0; 355 sigset_t oldset; 356 struct inode *inode = iocb->ki_filp->f_mapping->host; 357 struct rpc_clnt *clnt = NFS_CLIENT(inode); 358 struct nfs_direct_req *dreq; 359 360 dreq = nfs_direct_req_alloc(); 361 if (!dreq) 362 return -ENOMEM; 363 364 dreq->inode = inode; 365 dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data); 366 if (!is_sync_kiocb(iocb)) 367 dreq->iocb = iocb; 368 369 nfs_add_stats(inode, NFSIOS_DIRECTREADBYTES, count); 370 rpc_clnt_sigmask(clnt, &oldset); 371 result = nfs_direct_read_schedule(dreq, user_addr, count, pos); 372 if (!result) 373 result = nfs_direct_wait(dreq); 374 rpc_clnt_sigunmask(clnt, &oldset); 375 nfs_direct_req_release(dreq); 376 377 return result; 378} 379 380static void nfs_direct_free_writedata(struct nfs_direct_req *dreq) 381{ 382 while (!list_empty(&dreq->rewrite_list)) { 383 struct nfs_write_data *data = list_entry(dreq->rewrite_list.next, struct nfs_write_data, pages); 384 list_del(&data->pages); 385 nfs_direct_release_pages(data->pagevec, data->npages); 386 nfs_writedata_release(data); 387 } 388} 389 390#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 391static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) 392{ 393 struct inode *inode = dreq->inode; 394 struct list_head *p; 395 struct nfs_write_data *data; 396 397 dreq->count = 0; 398 get_dreq(dreq); 399 400 list_for_each(p, &dreq->rewrite_list) { 401 data = list_entry(p, struct nfs_write_data, pages); 402 403 get_dreq(dreq); 404 405 /* 406 * Reset data->res. 407 */ 408 nfs_fattr_init(&data->fattr); 409 data->res.count = data->args.count; 410 memset(&data->verf, 0, sizeof(data->verf)); 411 412 /* 413 * Reuse data->task; data->args should not have changed 414 * since the original request was sent. 415 */ 416 rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC, 417 &nfs_write_direct_ops, data); 418 NFS_PROTO(inode)->write_setup(data, FLUSH_STABLE); 419 420 data->task.tk_priority = RPC_PRIORITY_NORMAL; 421 data->task.tk_cookie = (unsigned long) inode; 422 423 /* 424 * We're called via an RPC callback, so BKL is already held. 425 */ 426 rpc_execute(&data->task); 427 428 dprintk("NFS: %5u rescheduled direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n", 429 data->task.tk_pid, 430 inode->i_sb->s_id, 431 (long long)NFS_FILEID(inode), 432 data->args.count, 433 (unsigned long long)data->args.offset); 434 } 435 436 if (put_dreq(dreq)) 437 nfs_direct_write_complete(dreq, inode); 438} 439 440static void nfs_direct_commit_result(struct rpc_task *task, void *calldata) 441{ 442 struct nfs_write_data *data = calldata; 443 struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; 444 445 /* Call the NFS version-specific code */ 446 if (NFS_PROTO(data->inode)->commit_done(task, data) != 0) 447 return; 448 if (unlikely(task->tk_status < 0)) { 449 dprintk("NFS: %5u commit failed with error %d.\n", 450 task->tk_pid, task->tk_status); 451 dreq->flags = NFS_ODIRECT_RESCHED_WRITES; 452 } else if (memcmp(&dreq->verf, &data->verf, sizeof(data->verf))) { 453 dprintk("NFS: %5u commit verify failed\n", task->tk_pid); 454 dreq->flags = NFS_ODIRECT_RESCHED_WRITES; 455 } 456 457 dprintk("NFS: %5u commit returned %d\n", task->tk_pid, task->tk_status); 458 nfs_direct_write_complete(dreq, data->inode); 459} 460 461static const struct rpc_call_ops nfs_commit_direct_ops = { 462 .rpc_call_done = nfs_direct_commit_result, 463 .rpc_release = nfs_commit_release, 464}; 465 466static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) 467{ 468 struct nfs_write_data *data = dreq->commit_data; 469 470 data->inode = dreq->inode; 471 data->cred = dreq->ctx->cred; 472 473 data->args.fh = NFS_FH(data->inode); 474 data->args.offset = 0; 475 data->args.count = 0; 476 data->res.count = 0; 477 data->res.fattr = &data->fattr; 478 data->res.verf = &data->verf; 479 480 rpc_init_task(&data->task, NFS_CLIENT(dreq->inode), RPC_TASK_ASYNC, 481 &nfs_commit_direct_ops, data); 482 NFS_PROTO(data->inode)->commit_setup(data, 0); 483 484 data->task.tk_priority = RPC_PRIORITY_NORMAL; 485 data->task.tk_cookie = (unsigned long)data->inode; 486 /* Note: task.tk_ops->rpc_release will free dreq->commit_data */ 487 dreq->commit_data = NULL; 488 489 dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); 490 491 rpc_execute(&data->task); 492} 493 494static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) 495{ 496 int flags = dreq->flags; 497 498 dreq->flags = 0; 499 switch (flags) { 500 case NFS_ODIRECT_DO_COMMIT: 501 nfs_direct_commit_schedule(dreq); 502 break; 503 case NFS_ODIRECT_RESCHED_WRITES: 504 nfs_direct_write_reschedule(dreq); 505 break; 506 default: 507 nfs_end_data_update(inode); 508 if (dreq->commit_data != NULL) 509 nfs_commit_free(dreq->commit_data); 510 nfs_direct_free_writedata(dreq); 511 nfs_zap_mapping(inode, inode->i_mapping); 512 nfs_direct_complete(dreq); 513 } 514} 515 516static void nfs_alloc_commit_data(struct nfs_direct_req *dreq) 517{ 518 dreq->commit_data = nfs_commit_alloc(); 519 if (dreq->commit_data != NULL) 520 dreq->commit_data->req = (struct nfs_page *) dreq; 521} 522#else 523static inline void nfs_alloc_commit_data(struct nfs_direct_req *dreq) 524{ 525 dreq->commit_data = NULL; 526} 527 528static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) 529{ 530 nfs_end_data_update(inode); 531 nfs_direct_free_writedata(dreq); 532 nfs_zap_mapping(inode, inode->i_mapping); 533 nfs_direct_complete(dreq); 534} 535#endif 536 537static void nfs_direct_write_result(struct rpc_task *task, void *calldata) 538{ 539 struct nfs_write_data *data = calldata; 540 struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; 541 int status = task->tk_status; 542 543 if (nfs_writeback_done(task, data) != 0) 544 return; 545 546 spin_lock(&dreq->lock); 547 548 if (unlikely(dreq->error != 0)) 549 goto out_unlock; 550 if (unlikely(status < 0)) { 551 /* An error has occured, so we should not commit */ 552 dreq->flags = 0; 553 dreq->error = status; 554 } 555 556 dreq->count += data->res.count; 557 558 if (data->res.verf->committed != NFS_FILE_SYNC) { 559 switch (dreq->flags) { 560 case 0: 561 memcpy(&dreq->verf, &data->verf, sizeof(dreq->verf)); 562 dreq->flags = NFS_ODIRECT_DO_COMMIT; 563 break; 564 case NFS_ODIRECT_DO_COMMIT: 565 if (memcmp(&dreq->verf, &data->verf, sizeof(dreq->verf))) { 566 dprintk("NFS: %5u write verify failed\n", task->tk_pid); 567 dreq->flags = NFS_ODIRECT_RESCHED_WRITES; 568 } 569 } 570 } 571out_unlock: 572 spin_unlock(&dreq->lock); 573} 574 575/* 576 * NB: Return the value of the first error return code. Subsequent 577 * errors after the first one are ignored. 578 */ 579static void nfs_direct_write_release(void *calldata) 580{ 581 struct nfs_write_data *data = calldata; 582 struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; 583 584 if (put_dreq(dreq)) 585 nfs_direct_write_complete(dreq, data->inode); 586} 587 588static const struct rpc_call_ops nfs_write_direct_ops = { 589 .rpc_call_done = nfs_direct_write_result, 590 .rpc_release = nfs_direct_write_release, 591}; 592 593/* 594 * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE 595 * operation. If nfs_writedata_alloc() or get_user_pages() fails, 596 * bail and stop sending more writes. Write length accounting is 597 * handled automatically by nfs_direct_write_result(). Otherwise, if 598 * no requests have been sent, just return an error. 599 */ 600static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos, int sync) 601{ 602 struct nfs_open_context *ctx = dreq->ctx; 603 struct inode *inode = ctx->dentry->d_inode; 604 size_t wsize = NFS_SERVER(inode)->wsize; 605 unsigned int pgbase; 606 int result; 607 ssize_t started = 0; 608 609 get_dreq(dreq); 610 611 do { 612 struct nfs_write_data *data; 613 size_t bytes; 614 615 pgbase = user_addr & ~PAGE_MASK; 616 bytes = min(wsize,count); 617 618 result = -ENOMEM; 619 data = nfs_writedata_alloc(nfs_page_array_len(pgbase, bytes)); 620 if (unlikely(!data)) 621 break; 622 623 down_read(¤t->mm->mmap_sem); 624 result = get_user_pages(current, current->mm, user_addr, 625 data->npages, 0, 0, data->pagevec, NULL); 626 up_read(¤t->mm->mmap_sem); 627 if (result < 0) { 628 nfs_writedata_release(data); 629 break; 630 } 631 if ((unsigned)result < data->npages) { 632 nfs_direct_release_pages(data->pagevec, result); 633 nfs_writedata_release(data); 634 break; 635 } 636 637 get_dreq(dreq); 638 639 list_move_tail(&data->pages, &dreq->rewrite_list); 640 641 data->req = (struct nfs_page *) dreq; 642 data->inode = inode; 643 data->cred = ctx->cred; 644 data->args.fh = NFS_FH(inode); 645 data->args.context = ctx; 646 data->args.offset = pos; 647 data->args.pgbase = pgbase; 648 data->args.pages = data->pagevec; 649 data->args.count = bytes; 650 data->res.fattr = &data->fattr; 651 data->res.count = bytes; 652 data->res.verf = &data->verf; 653 654 rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC, 655 &nfs_write_direct_ops, data); 656 NFS_PROTO(inode)->write_setup(data, sync); 657 658 data->task.tk_priority = RPC_PRIORITY_NORMAL; 659 data->task.tk_cookie = (unsigned long) inode; 660 661 rpc_execute(&data->task); 662 663 dprintk("NFS: %5u initiated direct write call " 664 "(req %s/%Ld, %zu bytes @ offset %Lu)\n", 665 data->task.tk_pid, 666 inode->i_sb->s_id, 667 (long long)NFS_FILEID(inode), 668 bytes, 669 (unsigned long long)data->args.offset); 670 671 started += bytes; 672 user_addr += bytes; 673 pos += bytes; 674 675 pgbase += bytes; 676 pgbase &= ~PAGE_MASK; 677 BUG_ON(pgbase != (user_addr & ~PAGE_MASK)); 678 679 count -= bytes; 680 } while (count != 0); 681 682 if (put_dreq(dreq)) 683 nfs_direct_write_complete(dreq, inode); 684 685 if (started) 686 return 0; 687 return result < 0 ? (ssize_t) result : -EFAULT; 688} 689 690static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos) 691{ 692 ssize_t result = 0; 693 sigset_t oldset; 694 struct inode *inode = iocb->ki_filp->f_mapping->host; 695 struct rpc_clnt *clnt = NFS_CLIENT(inode); 696 struct nfs_direct_req *dreq; 697 size_t wsize = NFS_SERVER(inode)->wsize; 698 int sync = 0; 699 700 dreq = nfs_direct_req_alloc(); 701 if (!dreq) 702 return -ENOMEM; 703 nfs_alloc_commit_data(dreq); 704 705 if (dreq->commit_data == NULL || count < wsize) 706 sync = FLUSH_STABLE; 707 708 dreq->inode = inode; 709 dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data); 710 if (!is_sync_kiocb(iocb)) 711 dreq->iocb = iocb; 712 713 nfs_add_stats(inode, NFSIOS_DIRECTWRITTENBYTES, count); 714 715 nfs_begin_data_update(inode); 716 717 rpc_clnt_sigmask(clnt, &oldset); 718 result = nfs_direct_write_schedule(dreq, user_addr, count, pos, sync); 719 if (!result) 720 result = nfs_direct_wait(dreq); 721 rpc_clnt_sigunmask(clnt, &oldset); 722 nfs_direct_req_release(dreq); 723 724 return result; 725} 726 727/** 728 * nfs_file_direct_read - file direct read operation for NFS files 729 * @iocb: target I/O control block 730 * @iov: vector of user buffers into which to read data 731 * @nr_segs: size of iov vector 732 * @pos: byte offset in file where reading starts 733 * 734 * We use this function for direct reads instead of calling 735 * generic_file_aio_read() in order to avoid gfar's check to see if 736 * the request starts before the end of the file. For that check 737 * to work, we must generate a GETATTR before each direct read, and 738 * even then there is a window between the GETATTR and the subsequent 739 * READ where the file size could change. Our preference is simply 740 * to do all reads the application wants, and the server will take 741 * care of managing the end of file boundary. 742 * 743 * This function also eliminates unnecessarily updating the file's 744 * atime locally, as the NFS server sets the file's atime, and this 745 * client must read the updated atime from the server back into its 746 * cache. 747 */ 748ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, 749 unsigned long nr_segs, loff_t pos) 750{ 751 ssize_t retval = -EINVAL; 752 struct file *file = iocb->ki_filp; 753 struct address_space *mapping = file->f_mapping; 754 const char __user *buf = iov[0].iov_base; 755 size_t count = iov[0].iov_len; 756 757 dprintk("nfs: direct read(%s/%s, %lu@%Ld)\n", 758 file->f_path.dentry->d_parent->d_name.name, 759 file->f_path.dentry->d_name.name, 760 (unsigned long) count, (long long) pos); 761 762 if (nr_segs != 1) 763 return -EINVAL; 764 765 if (count < 0) 766 goto out; 767 retval = -EFAULT; 768 if (!access_ok(VERIFY_WRITE, buf, count)) 769 goto out; 770 retval = 0; 771 if (!count) 772 goto out; 773 774 retval = nfs_sync_mapping(mapping); 775 if (retval) 776 goto out; 777 778 retval = nfs_direct_read(iocb, (unsigned long) buf, count, pos); 779 if (retval > 0) 780 iocb->ki_pos = pos + retval; 781 782out: 783 return retval; 784} 785 786/** 787 * nfs_file_direct_write - file direct write operation for NFS files 788 * @iocb: target I/O control block 789 * @iov: vector of user buffers from which to write data 790 * @nr_segs: size of iov vector 791 * @pos: byte offset in file where writing starts 792 * 793 * We use this function for direct writes instead of calling 794 * generic_file_aio_write() in order to avoid taking the inode 795 * semaphore and updating the i_size. The NFS server will set 796 * the new i_size and this client must read the updated size 797 * back into its cache. We let the server do generic write 798 * parameter checking and report problems. 799 * 800 * We also avoid an unnecessary invocation of generic_osync_inode(), 801 * as it is fairly meaningless to sync the metadata of an NFS file. 802 * 803 * We eliminate local atime updates, see direct read above. 804 * 805 * We avoid unnecessary page cache invalidations for normal cached 806 * readers of this file. 807 * 808 * Note that O_APPEND is not supported for NFS direct writes, as there 809 * is no atomic O_APPEND write facility in the NFS protocol. 810 */ 811ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, 812 unsigned long nr_segs, loff_t pos) 813{ 814 ssize_t retval; 815 struct file *file = iocb->ki_filp; 816 struct address_space *mapping = file->f_mapping; 817 const char __user *buf = iov[0].iov_base; 818 size_t count = iov[0].iov_len; 819 820 dprintk("nfs: direct write(%s/%s, %lu@%Ld)\n", 821 file->f_path.dentry->d_parent->d_name.name, 822 file->f_path.dentry->d_name.name, 823 (unsigned long) count, (long long) pos); 824 825 if (nr_segs != 1) 826 return -EINVAL; 827 828 retval = generic_write_checks(file, &pos, &count, 0); 829 if (retval) 830 goto out; 831 832 retval = -EINVAL; 833 if ((ssize_t) count < 0) 834 goto out; 835 retval = 0; 836 if (!count) 837 goto out; 838 839 retval = -EFAULT; 840 if (!access_ok(VERIFY_READ, buf, count)) 841 goto out; 842 843 retval = nfs_sync_mapping(mapping); 844 if (retval) 845 goto out; 846 847 retval = nfs_direct_write(iocb, (unsigned long) buf, count, pos); 848 849 if (retval > 0) 850 iocb->ki_pos = pos + retval; 851 852out: 853 return retval; 854} 855 856/** 857 * nfs_init_directcache - create a slab cache for nfs_direct_req structures 858 * 859 */ 860int __init nfs_init_directcache(void) 861{ 862 nfs_direct_cachep = kmem_cache_create("nfs_direct_cache", 863 sizeof(struct nfs_direct_req), 864 0, (SLAB_RECLAIM_ACCOUNT| 865 SLAB_MEM_SPREAD), 866 NULL, NULL); 867 if (nfs_direct_cachep == NULL) 868 return -ENOMEM; 869 870 return 0; 871} 872 873/** 874 * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures 875 * 876 */ 877void nfs_destroy_directcache(void) 878{ 879 kmem_cache_destroy(nfs_direct_cachep); 880} 881