/*
 * fs/direct-io.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
 * O_DIRECT
 *
 * 04Jul2002	akpm@zip.com.au
 *		Initial version
 * 11Sep2002	janetinc@us.ibm.com
 * 		added readv/writev support.
 * 29Oct2002	akpm@zip.com.au
 *		rewrote bio_add_page() support.
 * 30Oct2002	pbadari@us.ibm.com
 *		added support for non-aligned IO.
 * 06Nov2002	pbadari@us.ibm.com
 *		added asynchronous IO support.
 * 21Jul2003	nathans@sgi.com
 *		added IO completion notifier.
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/bio.h>
#include <linux/wait.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/rwsem.h>
#include <linux/uio.h>
#include <asm/atomic.h>

/*
 * How many user pages to map in one call to get_user_pages().  This determines
 * the size of the pages[] array embedded in struct dio (which is kmalloc()ed
 * in __blockdev_direct_IO()).
 */
#define DIO_PAGES	64

/*
 * This code generally works in units of "dio_blocks".  A dio_block is
 * somewhere between the hard sector size and the filesystem block size.  It
 * is determined on a per-invocation basis.  When talking to the filesystem
 * we need to convert dio_blocks to fs_blocks by shifting the dio_block
 * quantity right by dio->blkfactor.  Similarly, fs-blocksize quantities are
 * converted to dio_block quantities by shifting left by blkfactor.
 *
 * If blkfactor is zero then the user's request was aligned to the filesystem's
 * blocksize.
 *
 * lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems.
 * This determines whether we need to do the fancy locking which prevents
 * direct-IO from being able to read uninitialised disk blocks.  If it is zero
 * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_mutex is
 * not held for the entire direct write (it is taken briefly, initially, during
 * a direct read, but it is never held for the duration of a direct-IO).
 */

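/*
 * Example: a filesystem with 4096-byte blocks (i_blkbits = 12) accessed with
 * 512-byte alignment (blkbits = 9) gives blkfactor = 3, i.e. eight dio_blocks
 * per fs block.
 */
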
struct dio {
	/* BIO submission state */
	struct bio *bio;		/* bio under assembly */
	struct inode *inode;
	int rw;
	loff_t i_size;			/* i_size when submitted */
	int lock_type;			/* doesn't change */
	unsigned blkbits;		/* doesn't change */
	unsigned blkfactor;		/* When we're using an alignment which
					   is finer than the filesystem's soft
					   blocksize, this specifies how much
					   finer.  blkfactor=2 means 1/4-block
					   alignment.  Does not change */
	unsigned start_zero_done;	/* flag: sub-blocksize zeroing has
					   been performed at the start of a
					   write */
	int pages_in_io;		/* approximate total IO pages */
	size_t	size;			/* total request size (doesn't change)*/
	sector_t block_in_file;		/* Current offset into the underlying
					   file in dio_block units. */
	unsigned blocks_available;	/* At block_in_file.  changes */
	sector_t final_block_in_request;/* doesn't change */
	unsigned first_block_in_page;	/* doesn't change.  Used only once */
	int boundary;			/* prev block is at a boundary */
	int reap_counter;		/* rate limit reaping */
	get_block_t *get_block;		/* block mapping function */
	dio_iodone_t *end_io;		/* IO completion function */
	sector_t final_block_in_bio;	/* current final block in bio + 1 */
	sector_t next_block_for_io;	/* next block to be put under IO,
					   in dio_blocks units */
	struct buffer_head map_bh;	/* last get_block() result */

	/*
	 * Deferred addition of a page to the dio.  These variables are
	 * private to dio_send_cur_page(), submit_page_section() and
	 * dio_bio_add_page().
	 */
	struct page *cur_page;		/* The page */
	unsigned cur_page_offset;	/* Offset into it, in bytes */
	unsigned cur_page_len;		/* Nr of bytes at cur_page_offset */
	sector_t cur_page_block;	/* Where it starts */

	/*
	 * Page fetching state. These variables belong to dio_refill_pages().
	 */
	int curr_page;			/* changes */
	int total_pages;		/* doesn't change */
	unsigned long curr_user_address;/* changes */

	/*
	 * Page queue.  These variables belong to dio_refill_pages() and
	 * dio_get_page().
	 */
	struct page *pages[DIO_PAGES];	/* page buffer */
	unsigned head;			/* next page to process */
	unsigned tail;			/* last valid page + 1 */
	int page_errors;		/* errno from get_user_pages() */

	/* BIO completion state */
	spinlock_t bio_lock;		/* protects BIO fields below */
	unsigned long refcount;		/* direct_io_worker() and bios */
	struct bio *bio_list;		/* singly linked via bi_private */
	struct task_struct *waiter;	/* waiting task (NULL if none) */

	/* AIO related stuff */
	struct kiocb *iocb;		/* kiocb */
	int is_async;			/* is IO async ? */
	int io_error;			/* IO error in completion path */
	ssize_t result;                 /* IO result */
};
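
/*
 * A dio is allocated in __blockdev_direct_IO() and freed by whoever drops
 * the final reference: direct_io_worker() for sync requests, or
 * dio_bio_end_aio() when the last bio of an async request completes.
 */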

/*
 * How many pages are in the queue?
 */
static inline unsigned dio_pages_present(struct dio *dio)
{
	return dio->tail - dio->head;
}

/*
 * Go grab and pin some userspace pages.   Typically we'll get 64 at a time.
 */
static int dio_refill_pages(struct dio *dio)
{
	int ret;
	int nr_pages;

	nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES);
	down_read(&current->mm->mmap_sem);
	ret = get_user_pages(
		current,			/* Task for fault accounting */
		current->mm,			/* whose pages? */
		dio->curr_user_address,		/* Where from? */
		nr_pages,			/* How many pages? */
		dio->rw == READ,		/* Write to memory? */
		0,				/* force (?) */
		&dio->pages[0],
		NULL);				/* vmas */
	up_read(&current->mm->mmap_sem);

	if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) {
		struct page *page = ZERO_PAGE(dio->curr_user_address);
		/*
		 * A memory fault, but the filesystem has some outstanding
		 * mapped blocks.  We need to use those blocks up to avoid
		 * leaking stale data in the file.
		 */
		if (dio->page_errors == 0)
			dio->page_errors = ret;
		page_cache_get(page);
		dio->pages[0] = page;
		dio->head = 0;
		dio->tail = 1;
		ret = 0;
		goto out;
	}

	if (ret >= 0) {
		dio->curr_user_address += ret * PAGE_SIZE;
		dio->curr_page += ret;
		dio->head = 0;
		dio->tail = ret;
		ret = 0;
	}
out:
	return ret;
}

/*
 * Get another userspace page.  Returns an ERR_PTR on error.  Pages are
 * buffered inside the dio so that we can call get_user_pages() against a
 * decent number of pages, less frequently, which also provides nicer use
 * of the L1 cache.
 */
static struct page *dio_get_page(struct dio *dio)
{
	if (dio_pages_present(dio) == 0) {
		int ret;

		ret = dio_refill_pages(dio);
		if (ret)
			return ERR_PTR(ret);
		BUG_ON(dio_pages_present(dio) == 0);
	}
	return dio->pages[dio->head++];
}

/**
 * dio_complete() - called when all DIO BIO I/O has been completed
 * @dio: the dio structure describing this operation
 * @offset: the byte offset in the file of the completed operation
 * @ret: the error or status code accumulated so far
 *
 * This releases locks as dictated by the locking type, lets interested parties
 * know that a DIO operation has completed, and calculates the resulting return
 * code for the operation.
 *
 * It lets the filesystem know if it registered an interest earlier via
 * get_block.  Pass the private field of the map buffer_head so that
 * filesystems can use it to hold additional state between get_block calls and
 * dio_complete.
 */
static int dio_complete(struct dio *dio, loff_t offset, int ret)
{
	ssize_t transferred = 0;

	/*
	 * AIO submission can race with bio completion to get here while
	 * expecting to have the last io completed by bio completion.
	 * In that case -EIOCBQUEUED is in fact not an error we want
	 * to preserve through this call.
	 */
	if (ret == -EIOCBQUEUED)
		ret = 0;

	if (dio->result) {
		transferred = dio->result;

		/* Check for short read case */
		if ((dio->rw == READ) && ((offset + transferred) > dio->i_size))
			transferred = dio->i_size - offset;
	}

	if (dio->end_io && dio->result)
		dio->end_io(dio->iocb, offset, transferred,
			    dio->map_bh.b_private);
	if (dio->lock_type == DIO_LOCKING)
		/* lockdep: non-owner release */
		up_read_non_owner(&dio->inode->i_alloc_sem);

	if (ret == 0)
		ret = dio->page_errors;
	if (ret == 0)
		ret = dio->io_error;
	if (ret == 0)
		ret = transferred;

	return ret;
}

static int dio_bio_complete(struct dio *dio, struct bio *bio);
/*
 * Asynchronous IO callback.
 */
static int dio_bio_end_aio(struct bio *bio, unsigned int bytes_done, int error)
{
	struct dio *dio = bio->bi_private;
	unsigned long remaining;
	unsigned long flags;

	if (bio->bi_size)
		return 1;

	/* cleanup the bio */
	dio_bio_complete(dio, bio);

	spin_lock_irqsave(&dio->bio_lock, flags);
	remaining = --dio->refcount;
	if (remaining == 1 && dio->waiter)
		wake_up_process(dio->waiter);
	spin_unlock_irqrestore(&dio->bio_lock, flags);

	if (remaining == 0) {
		int ret = dio_complete(dio, dio->iocb->ki_pos, 0);
		aio_complete(dio->iocb, ret, 0);
		kfree(dio);
	}

	return 0;
}

/*
 * The BIO completion handler simply queues the BIO up for the process-context
 * handler.
 *
 * During I/O bi_private points at the dio.  After I/O, bi_private is used to
 * implement a singly-linked list of completed BIOs, at dio->bio_list.
 */
static int dio_bio_end_io(struct bio *bio, unsigned int bytes_done, int error)
{
	struct dio *dio = bio->bi_private;
	unsigned long flags;

	if (bio->bi_size)
		return 1;

	spin_lock_irqsave(&dio->bio_lock, flags);
	bio->bi_private = dio->bio_list;
	dio->bio_list = bio;
	if (--dio->refcount == 1 && dio->waiter)
		wake_up_process(dio->waiter);
	spin_unlock_irqrestore(&dio->bio_lock, flags);
	return 0;
}

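/*
 * Allocate a bio for the next chunk of this IO, pointed at the given device
 * sector, and install the completion handler which matches whether the dio
 * is synchronous or async.  Returns -ENOMEM if the bio cannot be allocated.
 */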
static int
dio_bio_alloc(struct dio *dio, struct block_device *bdev,
		sector_t first_sector, int nr_vecs)
{
	struct bio *bio;

	bio = bio_alloc(GFP_KERNEL, nr_vecs);
	if (bio == NULL)
		return -ENOMEM;

	bio->bi_bdev = bdev;
	bio->bi_sector = first_sector;
	if (dio->is_async)
		bio->bi_end_io = dio_bio_end_aio;
	else
		bio->bi_end_io = dio_bio_end_io;

	dio->bio = bio;
	return 0;
}

/*
 * In the AIO read case we speculatively dirty the pages before starting IO.
 * During IO completion, any of these pages which happen to have been written
 * back will be redirtied by bio_check_pages_dirty().
 *
 * bios hold a dio reference between submit_bio and ->end_io.
 */
static void dio_bio_submit(struct dio *dio)
{
	struct bio *bio = dio->bio;
	unsigned long flags;

	bio->bi_private = dio;

	spin_lock_irqsave(&dio->bio_lock, flags);
	dio->refcount++;
	spin_unlock_irqrestore(&dio->bio_lock, flags);

	if (dio->is_async && dio->rw == READ)
		bio_set_pages_dirty(bio);

	submit_bio(dio->rw, bio);

	dio->bio = NULL;
	dio->boundary = 0;
}

/*
 * Drop the references on any pages still sitting in the dio's page queue.
 * This happens after an error, or when a read hits end-of-file and the
 * remaining pinned pages were never put under IO.
 */
static void dio_cleanup(struct dio *dio)
{
	while (dio_pages_present(dio))
		page_cache_release(dio_get_page(dio));
}

/*
 * Wait for the next BIO to complete.  Remove it and return it.  NULL is
 * returned once all BIOs have been completed.  This must only be called once
 * all bios have been issued so that dio->refcount can only decrease.  This
 * requires that the caller hold a reference on the dio.
 */
static struct bio *dio_await_one(struct dio *dio)
{
	unsigned long flags;
	struct bio *bio = NULL;

	spin_lock_irqsave(&dio->bio_lock, flags);

	/*
	 * Wait as long as the list is empty and there are bios in flight.  bio
	 * completion drops the count, maybe adds to the list, and wakes while
	 * holding the bio_lock so we don't need set_current_state()'s barrier
	 * and can call it after testing our condition.
	 */
	while (dio->refcount > 1 && dio->bio_list == NULL) {
		__set_current_state(TASK_UNINTERRUPTIBLE);
		dio->waiter = current;
		spin_unlock_irqrestore(&dio->bio_lock, flags);
		io_schedule();
		/* wake up sets us TASK_RUNNING */
		spin_lock_irqsave(&dio->bio_lock, flags);
		dio->waiter = NULL;
	}
	if (dio->bio_list) {
		bio = dio->bio_list;
		dio->bio_list = bio->bi_private;
	}
	spin_unlock_irqrestore(&dio->bio_lock, flags);
	return bio;
}

/*
 * Process one completed BIO.  No locks are held.
 */
static int dio_bio_complete(struct dio *dio, struct bio *bio)
{
	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct bio_vec *bvec = bio->bi_io_vec;
	int page_no;

	if (!uptodate)
		dio->io_error = -EIO;

	if (dio->is_async && dio->rw == READ) {
		bio_check_pages_dirty(bio);	/* transfers ownership */
	} else {
		for (page_no = 0; page_no < bio->bi_vcnt; page_no++) {
			struct page *page = bvec[page_no].bv_page;

			if (dio->rw == READ && !PageCompound(page))
				set_page_dirty_lock(page);
			page_cache_release(page);
		}
		bio_put(bio);
	}
	return uptodate ? 0 : -EIO;
}

/*
 * Wait on and process all in-flight BIOs.  This must only be called once
 * all bios have been issued so that the refcount can only decrease.
 * This just waits for all bios to make it through dio_bio_complete.  IO
 * errors are propagated through dio->io_error and should be propagated via
 * dio_complete().
 */
static void dio_await_completion(struct dio *dio)
{
	struct bio *bio;
	do {
		bio = dio_await_one(dio);
		if (bio)
			dio_bio_complete(dio, bio);
	} while (bio);
}

/*
 * A really large O_DIRECT read or write can generate a lot of BIOs.  So
 * to keep the memory consumption sane we periodically reap any completed BIOs
 * during the BIO generation phase.
 *
 * This also helps to limit the peak amount of pinned userspace memory.
 */
static int dio_bio_reap(struct dio *dio)
{
	int ret = 0;

	if (dio->reap_counter++ >= 64) {
		while (dio->bio_list) {
			unsigned long flags;
			struct bio *bio;
			int ret2;

			spin_lock_irqsave(&dio->bio_lock, flags);
			bio = dio->bio_list;
			dio->bio_list = bio->bi_private;
			spin_unlock_irqrestore(&dio->bio_lock, flags);
			ret2 = dio_bio_complete(dio, bio);
			if (ret == 0)
				ret = ret2;
		}
		dio->reap_counter = 0;
	}
	return ret;
}

/*
 * Call into the fs to map some more disk blocks.  We record the current number
 * of available blocks at dio->blocks_available.  These are in units of the
 * fs blocksize, (1 << inode->i_blkbits).
 *
 * The fs is allowed to map lots of blocks at once.  If it wants to do that,
 * it uses the passed inode-relative block number as the file offset, as usual.
 *
 * get_block() is passed the number of i_blkbits-sized blocks which direct_io
 * has remaining to do.  The fs should not map more than this number of blocks.
 *
 * If the fs has mapped a lot of blocks, it should populate bh->b_size to
 * indicate how much contiguous disk space has been made available at
 * bh->b_blocknr.
 *
 * If *any* of the mapped blocks are new, then the fs must set buffer_new().
 * This isn't very efficient...
 *
 * In the case of filesystem holes: the fs may return an arbitrarily-large
 * hole by returning an appropriate value in b_size and by clearing
 * buffer_mapped().  However the direct-io code will only process holes one
 * block at a time - it will repeatedly call get_block() as it walks the hole.
 */
static int get_more_blocks(struct dio *dio)
{
	int ret;
	struct buffer_head *map_bh = &dio->map_bh;
	sector_t fs_startblk;	/* Into file, in filesystem-sized blocks */
	unsigned long fs_count;	/* Number of filesystem-sized blocks */
	unsigned long dio_count;/* Number of dio_block-sized blocks */
	unsigned long blkmask;
	int create;

	/*
	 * If there was a memory error and we've overwritten all the
	 * mapped blocks then we can now return that memory error
	 */
	ret = dio->page_errors;
	if (ret == 0) {
		BUG_ON(dio->block_in_file >= dio->final_block_in_request);
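		/*
		 * Convert the remaining dio_blocks to fs blocks, rounding up
		 * so that a request which ends partway into an fs block still
		 * has that final block mapped.
		 */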
		fs_startblk = dio->block_in_file >> dio->blkfactor;
		dio_count = dio->final_block_in_request - dio->block_in_file;
		fs_count = dio_count >> dio->blkfactor;
		blkmask = (1 << dio->blkfactor) - 1;
		if (dio_count & blkmask)
			fs_count++;

		map_bh->b_state = 0;
		map_bh->b_size = fs_count << dio->inode->i_blkbits;

		create = dio->rw & WRITE;
		if (dio->lock_type == DIO_LOCKING) {
			if (dio->block_in_file < (i_size_read(dio->inode) >>
							dio->blkbits))
				create = 0;
		} else if (dio->lock_type == DIO_NO_LOCKING) {
			create = 0;
		}

		/*
		 * For writes inside i_size we forbid block creations: only
		 * overwrites are permitted.  We fall back to buffered writes
		 * at a higher level for inside-i_size block-instantiating
		 * writes.
		 */
		ret = (*dio->get_block)(dio->inode, fs_startblk,
						map_bh, create);
	}
	return ret;
}

/*
 * There is no bio.  Make one now.
 */
static int dio_new_bio(struct dio *dio, sector_t start_sector)
{
	sector_t sector;
	int ret, nr_pages;

	ret = dio_bio_reap(dio);
	if (ret)
		goto out;
	sector = start_sector << (dio->blkbits - 9);
	nr_pages = min(dio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev));
	BUG_ON(nr_pages <= 0);
	ret = dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages);
	dio->boundary = 0;
out:
	return ret;
}

/*
 * Attempt to put the current chunk of 'cur_page' into the current BIO.  If
 * that was successful then update final_block_in_bio and take a ref against
 * the just-added page.
 *
 * Return zero on success.  Non-zero means the caller needs to start a new BIO.
 */
static int dio_bio_add_page(struct dio *dio)
{
	int ret;

	ret = bio_add_page(dio->bio, dio->cur_page,
			dio->cur_page_len, dio->cur_page_offset);
	if (ret == dio->cur_page_len) {
		/*
		 * Decrement count only, if we are done with this page
		 */
		if ((dio->cur_page_len + dio->cur_page_offset) == PAGE_SIZE)
			dio->pages_in_io--;
		page_cache_get(dio->cur_page);
		dio->final_block_in_bio = dio->cur_page_block +
			(dio->cur_page_len >> dio->blkbits);
		ret = 0;
	} else {
		ret = 1;
	}
	return ret;
}

/*
 * Put cur_page under IO.  The section of cur_page which is described by
 * cur_page_offset,cur_page_len is put into a BIO.  The section of cur_page
 * starts on-disk at cur_page_block.
 *
 * We take a ref against the page here (on behalf of its presence in the bio).
 *
 * The caller of this function is responsible for removing cur_page from the
 * dio, and for dropping the refcount which came from that presence.
 */
static int dio_send_cur_page(struct dio *dio)
{
	int ret = 0;

	if (dio->bio) {
		/*
		 * See whether this new request is contiguous with the old
		 */
		if (dio->final_block_in_bio != dio->cur_page_block)
			dio_bio_submit(dio);
		/*
		 * Submit now if the underlying fs is about to perform a
		 * metadata read
		 */
		if (dio->boundary)
			dio_bio_submit(dio);
	}

	if (dio->bio == NULL) {
		ret = dio_new_bio(dio, dio->cur_page_block);
		if (ret)
			goto out;
	}

	if (dio_bio_add_page(dio) != 0) {
		dio_bio_submit(dio);
		ret = dio_new_bio(dio, dio->cur_page_block);
		if (ret == 0) {
			ret = dio_bio_add_page(dio);
			BUG_ON(ret != 0);
		}
	}
out:
	return ret;
}

/*
 * An autonomous function to put a chunk of a page under deferred IO.
 *
 * The caller doesn't actually know (or care) whether this piece of page is in
 * a BIO, or is under IO or whatever.  We just take care of all possible
 * situations here.  The separation between the logic of do_direct_IO() and
 * that of submit_page_section() is important for clarity.  Please don't break
 * it.
 *
 * The chunk of page starts on-disk at blocknr.
 *
 * We perform deferred IO, by recording the last-submitted page inside our
 * private part of the dio structure.  If possible, we just expand the IO
 * across that page here.
 *
 * If that doesn't work out then we put the old page into the bio and add this
 * page to the dio instead.
 */
static int
submit_page_section(struct dio *dio, struct page *page,
		unsigned offset, unsigned len, sector_t blocknr)
{
	int ret = 0;

	if (dio->rw & WRITE) {
		/*
		 * Read accounting is performed in submit_bio()
		 */
		task_io_account_write(len);
	}

	/*
	 * Can we just grow the current page's presence in the dio?
	 */
	if (	(dio->cur_page == page) &&
		(dio->cur_page_offset + dio->cur_page_len == offset) &&
		(dio->cur_page_block +
			(dio->cur_page_len >> dio->blkbits) == blocknr)) {
		dio->cur_page_len += len;

		/*
		 * If dio->boundary then we want to schedule the IO now to
		 * avoid metadata seeks.
		 */
		if (dio->boundary) {
			ret = dio_send_cur_page(dio);
			page_cache_release(dio->cur_page);
			dio->cur_page = NULL;
		}
		goto out;
	}

	/*
	 * If there's a deferred page already there then send it.
	 */
	if (dio->cur_page) {
		ret = dio_send_cur_page(dio);
		page_cache_release(dio->cur_page);
		dio->cur_page = NULL;
		if (ret)
			goto out;
	}

	page_cache_get(page);		/* It is in dio */
	dio->cur_page = page;
	dio->cur_page_offset = offset;
	dio->cur_page_len = len;
	dio->cur_page_block = blocknr;
out:
	return ret;
}

/*
 * Clean any dirty buffers in the blockdev mapping which alias newly-created
 * file blocks.  Only called for S_ISREG files - blockdevs do not set
 * buffer_new
 */
static void clean_blockdev_aliases(struct dio *dio)
{
	unsigned i;
	unsigned nblocks;

	nblocks = dio->map_bh.b_size >> dio->inode->i_blkbits;

	for (i = 0; i < nblocks; i++) {
		unmap_underlying_metadata(dio->map_bh.b_bdev,
					dio->map_bh.b_blocknr + i);
	}
}

/*
 * If we are not writing the entire block and get_block() allocated
 * the block for us, we need to fill in the unused portion of the
 * block with zeros.  This happens only if the user buffer, file offset or
 * IO length is not a multiple of the filesystem block size.
 *
 * `end' is zero if we're doing the start of the IO, 1 at the end of the
 * IO.
 */
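/*
 * For example, a 512-byte write at file offset 512 into a freshly allocated
 * 4096-byte fs block leaves bytes 0-511 and 1024-4095 of that block
 * unwritten; dio_zero_block() submits the matching ranges of ZERO_PAGE so
 * that the new block does not expose stale disk contents.
 */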
static void dio_zero_block(struct dio *dio, int end)
{
	unsigned dio_blocks_per_fs_block;
	unsigned this_chunk_blocks;	/* In dio_blocks */
	unsigned this_chunk_bytes;
	struct page *page;

	dio->start_zero_done = 1;
	if (!dio->blkfactor || !buffer_new(&dio->map_bh))
		return;

	dio_blocks_per_fs_block = 1 << dio->blkfactor;
	this_chunk_blocks = dio->block_in_file & (dio_blocks_per_fs_block - 1);

	if (!this_chunk_blocks)
		return;

	/*
	 * We need to zero out part of an fs block.  It is either at the
	 * beginning or the end of the fs block.
	 */
	if (end)
		this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks;

	this_chunk_bytes = this_chunk_blocks << dio->blkbits;

	page = ZERO_PAGE(dio->curr_user_address);
	if (submit_page_section(dio, page, 0, this_chunk_bytes,
				dio->next_block_for_io))
		return;

	dio->next_block_for_io += this_chunk_blocks;
}

/*
 * Walk the user pages, and the file, mapping blocks to disk and generating
 * a sequence of (page,offset,len,block) mappings.  These mappings are injected
 * into submit_page_section(), which takes care of the next stage of submission.
 *
 * Direct IO against a blockdev is different from a file because we can
 * happily perform page-sized but 512-byte aligned IOs.  It is important that
 * blockdev IO be able to have fine alignment and large sizes.
 *
 * So what we do is to permit the ->get_block function to populate bh.b_size
 * with the size of IO which is permitted at this offset and this i_blkbits.
 *
 * For best results, the blockdev should be set up with 512-byte i_blkbits and
 * it should set b_size to PAGE_SIZE or more inside get_block().  This gives
 * fine alignment but still allows this function to work in PAGE_SIZE units.
 */
static int do_direct_IO(struct dio *dio)
{
	const unsigned blkbits = dio->blkbits;
	const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
	struct page *page;
	unsigned block_in_page;
	struct buffer_head *map_bh = &dio->map_bh;
	int ret = 0;

	/* The I/O can start at any block offset within the first page */
	block_in_page = dio->first_block_in_page;

	while (dio->block_in_file < dio->final_block_in_request) {
		page = dio_get_page(dio);
		if (IS_ERR(page)) {
			ret = PTR_ERR(page);
			goto out;
		}

		while (block_in_page < blocks_per_page) {
			unsigned offset_in_page = block_in_page << blkbits;
			unsigned this_chunk_bytes;	/* # of bytes mapped */
			unsigned this_chunk_blocks;	/* # of blocks */
			unsigned u;

			if (dio->blocks_available == 0) {
				/*
				 * Need to go and map some more disk
				 */
				unsigned long blkmask;
				unsigned long dio_remainder;

				ret = get_more_blocks(dio);
				if (ret) {
					page_cache_release(page);
					goto out;
				}
				if (!buffer_mapped(map_bh))
					goto do_holes;

				dio->blocks_available =
						map_bh->b_size >> dio->blkbits;
				dio->next_block_for_io =
					map_bh->b_blocknr << dio->blkfactor;
				if (buffer_new(map_bh))
					clean_blockdev_aliases(dio);

				if (!dio->blkfactor)
					goto do_holes;

				blkmask = (1 << dio->blkfactor) - 1;
				dio_remainder = (dio->block_in_file & blkmask);

				/*
				 * If we are at the start of IO and that IO
				 * starts partway into a fs-block,
				 * dio_remainder will be non-zero.  If the IO
				 * is a read then we can simply advance the IO
				 * cursor to the first block which is to be
				 * read.  But if the IO is a write and the
				 * block was newly allocated we cannot do that;
				 * the start of the fs block must be zeroed out
				 * on-disk
				 */
				if (!buffer_new(map_bh))
					dio->next_block_for_io += dio_remainder;
				dio->blocks_available -= dio_remainder;
			}
do_holes:
			/* Handle holes */
			if (!buffer_mapped(map_bh)) {
				loff_t i_size_aligned;

				/* AKPM: eargh, -ENOTBLK is a hack */
				if (dio->rw & WRITE) {
					page_cache_release(page);
					return -ENOTBLK;
				}

				/*
				 * Be sure to account for a partial block as the
				 * last block in the file
				 */
				i_size_aligned = ALIGN(i_size_read(dio->inode),
							1 << blkbits);
				if (dio->block_in_file >=
						i_size_aligned >> blkbits) {
					/* We hit eof */
					page_cache_release(page);
					goto out;
				}
				zero_user_page(page, block_in_page << blkbits,
						1 << blkbits, KM_USER0);
				dio->block_in_file++;
				block_in_page++;
				goto next_block;
			}

			/*
			 * If we're performing IO which has an alignment which
			 * is finer than the underlying fs, go check to see if
			 * we must zero out the start of this block.
			 */
			if (unlikely(dio->blkfactor && !dio->start_zero_done))
				dio_zero_block(dio, 0);

			/*
			 * Work out, in this_chunk_blocks, how much disk we
			 * can add to this page
			 */
			this_chunk_blocks = dio->blocks_available;
			u = (PAGE_SIZE - offset_in_page) >> blkbits;
			if (this_chunk_blocks > u)
				this_chunk_blocks = u;
			u = dio->final_block_in_request - dio->block_in_file;
			if (this_chunk_blocks > u)
				this_chunk_blocks = u;
			this_chunk_bytes = this_chunk_blocks << blkbits;
			BUG_ON(this_chunk_bytes == 0);

			dio->boundary = buffer_boundary(map_bh);
			ret = submit_page_section(dio, page, offset_in_page,
				this_chunk_bytes, dio->next_block_for_io);
			if (ret) {
				page_cache_release(page);
				goto out;
			}
			dio->next_block_for_io += this_chunk_blocks;

			dio->block_in_file += this_chunk_blocks;
			block_in_page += this_chunk_blocks;
			dio->blocks_available -= this_chunk_blocks;
next_block:
			BUG_ON(dio->block_in_file > dio->final_block_in_request);
			if (dio->block_in_file == dio->final_block_in_request)
				break;
		}

		/* Drop the ref which was taken in get_user_pages() */
		page_cache_release(page);
		block_in_page = 0;
	}
out:
	return ret;
}

/*
 * Releases both i_mutex and i_alloc_sem
 */
static ssize_t
direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
	const struct iovec *iov, loff_t offset, unsigned long nr_segs,
	unsigned blkbits, get_block_t get_block, dio_iodone_t end_io,
	struct dio *dio)
{
	unsigned long user_addr;
	unsigned long flags;
	int seg;
	ssize_t ret = 0;
	ssize_t ret2;
	size_t bytes;

	dio->bio = NULL;
	dio->inode = inode;
	dio->rw = rw;
	dio->blkbits = blkbits;
	dio->blkfactor = inode->i_blkbits - blkbits;
	dio->start_zero_done = 0;
	dio->size = 0;
	dio->block_in_file = offset >> blkbits;
	dio->blocks_available = 0;
	dio->cur_page = NULL;

	dio->boundary = 0;
	dio->reap_counter = 0;
	dio->get_block = get_block;
	dio->end_io = end_io;
	dio->map_bh.b_private = NULL;
	dio->final_block_in_bio = -1;
	dio->next_block_for_io = -1;

	dio->page_errors = 0;
	dio->io_error = 0;
	dio->result = 0;
	dio->iocb = iocb;
	dio->i_size = i_size_read(inode);

	spin_lock_init(&dio->bio_lock);
	dio->refcount = 1;
	dio->bio_list = NULL;
	dio->waiter = NULL;

	/*
	 * In case of non-aligned buffers, we may need 2 more
	 * pages since we need to zero out first and last block.
	 */
	if (unlikely(dio->blkfactor))
		dio->pages_in_io = 2;
	else
		dio->pages_in_io = 0;

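	/*
	 * Estimate the worst-case number of pages this IO can touch, so
	 * that dio_new_bio() knows how large a BIO it is allowed to build.
	 */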
	for (seg = 0; seg < nr_segs; seg++) {
		user_addr = (unsigned long)iov[seg].iov_base;
		dio->pages_in_io +=
			((user_addr + iov[seg].iov_len + PAGE_SIZE - 1) /
				PAGE_SIZE - user_addr / PAGE_SIZE);
	}

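	/*
	 * Walk the iovec: for each segment, pin the user pages and map and
	 * submit the corresponding file blocks.
	 */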
	for (seg = 0; seg < nr_segs; seg++) {
		user_addr = (unsigned long)iov[seg].iov_base;
		dio->size += bytes = iov[seg].iov_len;

		/* Index into the first page of the first block */
		dio->first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits;
		dio->final_block_in_request = dio->block_in_file +
						(bytes >> blkbits);
		/* Page fetching state */
		dio->head = 0;
		dio->tail = 0;
		dio->curr_page = 0;

		dio->total_pages = 0;
		if (user_addr & (PAGE_SIZE-1)) {
			dio->total_pages++;
			bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
		}
		dio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
		dio->curr_user_address = user_addr;

		ret = do_direct_IO(dio);

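		/*
		 * Credit dio->result with the bytes of this segment which
		 * were actually submitted; any blocks left between
		 * block_in_file and final_block_in_request were not.
		 */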
		dio->result += iov[seg].iov_len -
			((dio->final_block_in_request - dio->block_in_file) <<
					blkbits);

		if (ret) {
			dio_cleanup(dio);
			break;
		}
	} /* end iovec loop */

	if (ret == -ENOTBLK && (rw & WRITE)) {
		/*
		 * The remaining part of the request will be
		 * handled by buffered I/O when we return
		 */
		ret = 0;
	}
	/*
	 * There may be some unwritten disk at the end of a part-written
	 * fs-block-sized block.  Go zero that now.
	 */
	dio_zero_block(dio, 1);

	if (dio->cur_page) {
		ret2 = dio_send_cur_page(dio);
		if (ret == 0)
			ret = ret2;
		page_cache_release(dio->cur_page);
		dio->cur_page = NULL;
	}
	if (dio->bio)
		dio_bio_submit(dio);

	/* All IO is now issued, send it on its way */
	blk_run_address_space(inode->i_mapping);

	/*
	 * It is possible that we return short IO due to end of file.
	 * In that case, we need to release all the pages we got hold of.
	 */
	dio_cleanup(dio);

	/*
	 * All block lookups have been performed. For READ requests
	 * we can let i_mutex go now that it has achieved its purpose
	 * of protecting us from looking up uninitialized blocks.
	 */
	if ((rw == READ) && (dio->lock_type == DIO_LOCKING))
		mutex_unlock(&dio->inode->i_mutex);

	/*
	 * The only time we want to leave bios in flight is when a successful
	 * partial aio read or full aio write has been set up.  In that case
	 * bio completion will call aio_complete.  The only time it's safe to
	 * call aio_complete is when we return -EIOCBQUEUED, so we key on that.
	 * This had *better* be the only place that raises -EIOCBQUEUED.
	 */
	BUG_ON(ret == -EIOCBQUEUED);
	if (dio->is_async && ret == 0 && dio->result &&
	    ((rw & READ) || (dio->result == dio->size)))
		ret = -EIOCBQUEUED;

	if (ret != -EIOCBQUEUED)
		dio_await_completion(dio);

	/*
	 * Sync will always be dropping the final ref and completing the
	 * operation.  AIO can if it was a broken operation described above or
	 * in fact if all the bios race to complete before we get here.  In
	 * that case dio_complete() translates the EIOCBQUEUED into the proper
	 * return code that the caller will hand to aio_complete().
	 *
	 * This is managed by the bio_lock instead of being an atomic_t so that
	 * completion paths can drop their ref and use the remaining count to
	 * decide to wake the submission path atomically.
	 */
	spin_lock_irqsave(&dio->bio_lock, flags);
	ret2 = --dio->refcount;
	spin_unlock_irqrestore(&dio->bio_lock, flags);

	if (ret2 == 0) {
		ret = dio_complete(dio, offset, ret);
		kfree(dio);
	} else
		BUG_ON(ret != -EIOCBQUEUED);

	return ret;
}

/*
 * This is a library function for use by filesystem drivers.
 * The locking rules are governed by the dio_lock_type parameter.
 *
 * DIO_NO_LOCKING (no locking, for raw block device access)
 * For writes, i_mutex is not held on entry; it is never taken.
 *
 * DIO_LOCKING (simple locking for regular files)
 * For writes we are called under i_mutex and return with i_mutex held, even
 * though it is internally dropped.
 * For reads, i_mutex is not held on entry, but it is taken and dropped before
 * returning.
 *
 * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of
 *	uninitialised data, allowing parallel direct readers and writers)
 * For writes we are called without i_mutex, return without it, never touch it.
 * For reads we are called under i_mutex and return with i_mutex held, even
 * though it may be internally dropped.
 *
 * Additional i_alloc_sem locking requirements described inline below.
 */
ssize_t
__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
	struct block_device *bdev, const struct iovec *iov, loff_t offset,
	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
	int dio_lock_type)
{
	int seg;
	size_t size;
	unsigned long addr;
	unsigned blkbits = inode->i_blkbits;
	unsigned bdev_blkbits = 0;
	unsigned blocksize_mask = (1 << blkbits) - 1;
	ssize_t retval = -EINVAL;
	loff_t end = offset;
	struct dio *dio;
	int release_i_mutex = 0;
	int acquire_i_mutex = 0;

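	/*
	 * The submitter is typically waiting on a direct write, so issue it
	 * as WRITE_SYNC and let the block layer treat it as synchronous IO.
	 */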
	if (rw & WRITE)
		rw = WRITE_SYNC;

	if (bdev)
		bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev));

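	/*
	 * The file offset and each iovec element must be aligned to the fs
	 * blocksize.  If they are not, fall back to the device's hard sector
	 * size; sub-fs-block alignment is then handled via dio->blkfactor.
	 */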
	if (offset & blocksize_mask) {
		if (bdev)
			 blkbits = bdev_blkbits;
		blocksize_mask = (1 << blkbits) - 1;
		if (offset & blocksize_mask)
			goto out;
	}

	/* Check the memory alignment.  Blocks cannot straddle pages */
	for (seg = 0; seg < nr_segs; seg++) {
		addr = (unsigned long)iov[seg].iov_base;
		size = iov[seg].iov_len;
		end += size;
		if ((addr & blocksize_mask) || (size & blocksize_mask)) {
			if (bdev)
				 blkbits = bdev_blkbits;
			blocksize_mask = (1 << blkbits) - 1;
			if ((addr & blocksize_mask) || (size & blocksize_mask))
				goto out;
		}
	}

	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
	retval = -ENOMEM;
	if (!dio)
		goto out;

	/*
	 * For block device access DIO_NO_LOCKING is used,
	 *	neither readers nor writers do any locking at all
	 * For regular files using DIO_LOCKING,
	 *	readers need to grab i_mutex and i_alloc_sem
	 *	writers need to grab i_alloc_sem only (i_mutex is already held)
	 * For regular files using DIO_OWN_LOCKING,
	 *	neither readers nor writers take any locks here
	 */
	dio->lock_type = dio_lock_type;
	if (dio_lock_type != DIO_NO_LOCKING) {
		/* watch out for a 0 len io from a tricksy fs */
		if (rw == READ && end > offset) {
			struct address_space *mapping;

			mapping = iocb->ki_filp->f_mapping;
			if (dio_lock_type != DIO_OWN_LOCKING) {
				mutex_lock(&inode->i_mutex);
				release_i_mutex = 1;
			}

			retval = filemap_write_and_wait_range(mapping, offset,
							      end - 1);
			if (retval) {
				kfree(dio);
				goto out;
			}

			if (dio_lock_type == DIO_OWN_LOCKING) {
				mutex_unlock(&inode->i_mutex);
				acquire_i_mutex = 1;
			}
		}

		if (dio_lock_type == DIO_LOCKING)
			/* lockdep: not the owner will release it */
			down_read_non_owner(&inode->i_alloc_sem);
	}

	/*
	 * For file extending writes updating i_size before data
	 * writeouts complete can expose uninitialized blocks. So
	 * even for AIO, we need to wait for i/o to complete before
	 * returning in this case.
	 */
	dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) &&
		(end > i_size_read(inode)));

	retval = direct_io_worker(rw, iocb, inode, iov, offset,
				nr_segs, blkbits, get_block, end_io, dio);

	if (rw == READ && dio_lock_type == DIO_LOCKING)
		release_i_mutex = 0;

out:
	if (release_i_mutex)
		mutex_unlock(&inode->i_mutex);
	else if (acquire_i_mutex)
		mutex_lock(&inode->i_mutex);
	return retval;
}
EXPORT_SYMBOL(__blockdev_direct_IO);