/* Modified by Broadcom Corp. Portions Copyright (c) Broadcom Corp, 2012. */
/*
 * fs/mpage.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
 * Contains functions related to preparing and submitting BIOs which contain
 * multiple pagecache pages.
 *
 * 15May2002	Andrew Morton
 *		Initial version
 * 27Jun2002	axboe@suse.de
 *		use bio_add_page() to build bio's just the right size
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/kdev_t.h>
#include <linux/gfp.h>
#include <linux/bio.h>
#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/highmem.h>
#include <linux/prefetch.h>
#include <linux/mpage.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>

#include <typedefs.h>
#include <bcmdefs.h>

/*
 * I/O completion handler for multipage BIOs.
 *
 * The mpage code never puts partial pages into a BIO (except for end-of-file).
 * If a page does not map to a contiguous run of blocks then it simply falls
 * back to block_read_full_page().
 *
 * Why is this?  If a page's completion depends on a number of different BIOs
 * which can complete in any order (or at the same time) then determining the
 * status of that page is hard.  See end_buffer_async_read() for the details.
 * There is no point in duplicating all that complexity.
 */
static void mpage_end_io_read(struct bio *bio, int err)
{
	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;

	do {
		struct page *page = bvec->bv_page;

		if (--bvec >= bio->bi_io_vec)
			prefetchw(&bvec->bv_page->flags);

		if (uptodate) {
			SetPageUptodate(page);
		} else {
			ClearPageUptodate(page);
			SetPageError(page);
		}
		unlock_page(page);
	} while (bvec >= bio->bi_io_vec);
	bio_put(bio);
}

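/*
 * I/O completion handler for multipage write BIOs.  On failure, flag each
 * page and its mapping (AS_EIO) so the error can be reported back later
 * (e.g. by fsync()); in all cases end writeback on the page.
 */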
static void mpage_end_io_write(struct bio *bio, int err)
{
	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;

	do {
		struct page *page = bvec->bv_page;

		if (--bvec >= bio->bi_io_vec)
			prefetchw(&bvec->bv_page->flags);

		if (!uptodate) {
			SetPageError(page);
			if (page->mapping)
				set_bit(AS_EIO, &page->mapping->flags);
		}
		end_page_writeback(page);
	} while (bvec >= bio->bi_io_vec);
	bio_put(bio);
}

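/*
 * Attach the completion handler appropriate for the transfer direction and
 * submit the BIO.  Always returns NULL so that callers can simply assign the
 * result back to their bio pointer and start building a fresh one.
 */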
static struct bio *mpage_bio_submit(int rw, struct bio *bio)
{
	bio->bi_end_io = mpage_end_io_read;
	if (rw == WRITE)
		bio->bi_end_io = mpage_end_io_write;
	submit_bio(rw, bio);
	return NULL;
}

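/*
 * Allocate a BIO with room for @nr_vecs pages, starting at @first_sector.
 * If the allocation fails while we are in memory reclaim (PF_MEMALLOC),
 * keep retrying with half as many vecs so a smaller BIO can still make
 * forward progress.
 */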
static struct bio *
mpage_alloc(struct block_device *bdev,
		sector_t first_sector, int nr_vecs,
		gfp_t gfp_flags)
{
	struct bio *bio;

	bio = bio_alloc(gfp_flags, nr_vecs);

	if (bio == NULL && (current->flags & PF_MEMALLOC)) {
		while (!bio && (nr_vecs /= 2))
			bio = bio_alloc(gfp_flags, nr_vecs);
	}

	if (bio) {
		bio->bi_bdev = bdev;
		bio->bi_sector = first_sector;
	}
	return bio;
}

/*
 * support function for mpage_readpages.  The fs-supplied get_block might
 * return an up-to-date buffer.  This is used to map that buffer into
 * the page, which allows readpage to avoid triggering a duplicate call
 * to get_block.
 *
 * The idea is to avoid adding buffers to pages that don't already have
 * them.  So when the buffer is up to date and the page size == block size,
 * this marks the page up to date instead of adding new buffers.
 */
static void
map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block)
{
	struct inode *inode = page->mapping->host;
	struct buffer_head *page_bh, *head;
	int block = 0;

	if (!page_has_buffers(page)) {
		/*
		 * don't make any buffers if there is only one buffer on
		 * the page and the page just needs to be set up to date
		 */
		if (inode->i_blkbits == PAGE_CACHE_SHIFT &&
		    buffer_uptodate(bh)) {
			SetPageUptodate(page);
			return;
		}
		create_empty_buffers(page, 1 << inode->i_blkbits, 0);
	}
	head = page_buffers(page);
	page_bh = head;
	do {
		if (block == page_block) {
			page_bh->b_state = bh->b_state;
			page_bh->b_bdev = bh->b_bdev;
			page_bh->b_blocknr = bh->b_blocknr;
			break;
		}
		page_bh = page_bh->b_this_page;
		block++;
	} while (page_bh != head);
}

/*
 * This is the worker routine which does all the work of mapping the disk
 * blocks and constructing the largest possible BIOs, submitting them for I/O
 * whenever the blocks stop being contiguous on disk.
 *
 * We pass a buffer_head back and forth and use its buffer_mapped() flag to
 * represent the validity of its disk mapping and to decide when to do the next
 * get_block() call.
 */
static struct bio * BCMFASTPATH_HOST
do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
		sector_t *last_block_in_bio, struct buffer_head *map_bh,
		unsigned long *first_logical_block, get_block_t get_block)
{
	struct inode *inode = page->mapping->host;
	const unsigned blkbits = inode->i_blkbits;
	const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
	const unsigned blocksize = 1 << blkbits;
	sector_t block_in_file;
	sector_t last_block;
	sector_t last_block_in_file;
	sector_t blocks[MAX_BUF_PER_PAGE];
	unsigned page_block;
	unsigned first_hole = blocks_per_page;
	struct block_device *bdev = NULL;
	int length;
	int fully_mapped = 1;
	unsigned nblocks;
	unsigned relative_block;

	if (page_has_buffers(page))
		goto confused;

	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
	last_block = block_in_file + nr_pages * blocks_per_page;
	last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
	if (last_block > last_block_in_file)
		last_block = last_block_in_file;
	page_block = 0;

	/*
	 * Map blocks using the result from the previous get_blocks call first.
	 */
	nblocks = map_bh->b_size >> blkbits;
	if (buffer_mapped(map_bh) && block_in_file > *first_logical_block &&
			block_in_file < (*first_logical_block + nblocks)) {
		unsigned map_offset = block_in_file - *first_logical_block;
		unsigned last = nblocks - map_offset;

		for (relative_block = 0; ; relative_block++) {
			if (relative_block == last) {
				clear_buffer_mapped(map_bh);
				break;
			}
			if (page_block == blocks_per_page)
				break;
			blocks[page_block] = map_bh->b_blocknr + map_offset +
						relative_block;
			page_block++;
			block_in_file++;
		}
		bdev = map_bh->b_bdev;
	}

	/*
	 * Then do more get_blocks calls until we are done with this page.
	 */
	map_bh->b_page = page;
	while (page_block < blocks_per_page) {
		map_bh->b_state = 0;
		map_bh->b_size = 0;

		if (block_in_file < last_block) {
			map_bh->b_size = (last_block-block_in_file) << blkbits;
			if (get_block(inode, block_in_file, map_bh, 0))
				goto confused;
			*first_logical_block = block_in_file;
		}

		if (!buffer_mapped(map_bh)) {
			fully_mapped = 0;
			if (first_hole == blocks_per_page)
				first_hole = page_block;
			page_block++;
			block_in_file++;
			continue;
		}

		/* some filesystems will copy data into the page during
		 * the get_block call, in which case we don't want to
		 * read it again.  map_buffer_to_page copies the data
		 * we just collected from get_block into the page's buffers
		 * so readpage doesn't have to repeat the get_block call
		 */
		if (buffer_uptodate(map_bh)) {
			map_buffer_to_page(page, map_bh, page_block);
			goto confused;
		}

		if (first_hole != blocks_per_page)
			goto confused;		/* hole -> non-hole */

		/* Contiguous blocks? */
		if (page_block && blocks[page_block-1] != map_bh->b_blocknr-1)
			goto confused;
		nblocks = map_bh->b_size >> blkbits;
		for (relative_block = 0; ; relative_block++) {
			if (relative_block == nblocks) {
				clear_buffer_mapped(map_bh);
				break;
			} else if (page_block == blocks_per_page)
				break;
			blocks[page_block] = map_bh->b_blocknr+relative_block;
			page_block++;
			block_in_file++;
		}
		bdev = map_bh->b_bdev;
	}

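	/*
	 * If the page ends in a hole (the common end-of-file case when
	 * blocksize < PAGE_CACHE_SIZE), zero the unmapped tail now.  A page
	 * that is entirely a hole needs no I/O at all and can be marked
	 * up to date immediately.
	 */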
	if (first_hole != blocks_per_page) {
		zero_user_segment(page, first_hole << blkbits, PAGE_CACHE_SIZE);
		if (first_hole == 0) {
			SetPageUptodate(page);
			unlock_page(page);
			goto out;
		}
	} else if (fully_mapped) {
		SetPageMappedToDisk(page);
	}

	/*
	 * This page will go to BIO.  Do we need to send this BIO off first?
	 */
	if (bio && (*last_block_in_bio != blocks[0] - 1))
		bio = mpage_bio_submit(READ, bio);

alloc_new:
	if (bio == NULL) {
		bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
				min_t(int, nr_pages, bio_get_nr_vecs(bdev)),
				GFP_KERNEL);
		if (bio == NULL)
			goto confused;
	}

	length = first_hole << blkbits;
	if (bio_add_page(bio, page, length, 0) < length) {
		bio = mpage_bio_submit(READ, bio);
		goto alloc_new;
	}

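	/*
	 * Submit the BIO right away if the last mapped block was a
	 * BH_Boundary block whose mapping run is now exhausted, or if the
	 * page ends in a hole; otherwise remember the last block so the next
	 * page can try to append to this BIO.
	 */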
	relative_block = block_in_file - *first_logical_block;
	nblocks = map_bh->b_size >> blkbits;
	if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
	    (first_hole != blocks_per_page))
		bio = mpage_bio_submit(READ, bio);
	else
		*last_block_in_bio = blocks[blocks_per_page - 1];
out:
	return bio;

confused:
	if (bio)
		bio = mpage_bio_submit(READ, bio);
	if (!PageUptodate(page))
		block_read_full_page(page, get_block);
	else
		unlock_page(page);
	goto out;
}

/**
 * mpage_readpages - populate an address space with some pages & start reads against them
 * @mapping: the address_space
 * @pages: The address of a list_head which contains the target pages.  These
 *   pages have their ->index populated and are otherwise uninitialised.
 *   The page at @pages->prev has the lowest file offset, and reads should be
 *   issued in @pages->prev to @pages->next order.
 * @nr_pages: The number of pages at *@pages
 * @get_block: The filesystem's block mapper function.
 *
 * This function walks the pages and the blocks within each page, building and
 * emitting large BIOs.
 *
 * If anything unusual happens, such as:
 *
 * - encountering a page which has buffers
 * - encountering a page which has a non-hole after a hole
 * - encountering a page with non-contiguous blocks
 *
 * then this code just gives up and calls the buffer_head-based read function.
 * It does handle a page which has holes at the end - that is a common case:
 * the end-of-file on blocksize < PAGE_CACHE_SIZE setups.
 *
 * BH_Boundary explanation:
 *
 * There is a problem.  The mpage read code assembles several pages, gets all
 * their disk mappings, and then submits them all.  That's fine, but obtaining
 * the disk mappings may require I/O.  Reads of indirect blocks, for example.
 *
 * So an mpage read of the first 16 blocks of an ext2 file will cause I/O to be
 * submitted in the following order:
 * 	12 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16
 *
 * because the indirect block has to be read to get the mappings of blocks
 * 13,14,15,16.  Obviously, this impacts performance.
 *
 * So what we do is allow the filesystem's get_block() function to set
 * BH_Boundary when it maps block 11.  BH_Boundary says: the mapping of the
 * block after this one will require I/O against a block which is probably
 * close to this one.  So you should push whatever I/O you have currently
 * accumulated.
 *
 * This all causes the disk requests to be issued in the correct order.
 */
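/*
 * Illustrative usage (hypothetical names, not part of this file): a
 * filesystem typically calls this from its ->readpages() address_space
 * operation, passing its own get_block_t mapper, e.g.
 *
 *	static int myfs_readpages(struct file *file,
 *			struct address_space *mapping,
 *			struct list_head *pages, unsigned nr_pages)
 *	{
 *		return mpage_readpages(mapping, pages, nr_pages,
 *					myfs_get_block);
 *	}
 */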
int
mpage_readpages(struct address_space *mapping, struct list_head *pages,
				unsigned nr_pages, get_block_t get_block)
{
	struct bio *bio = NULL;
	unsigned page_idx;
	sector_t last_block_in_bio = 0;
	struct buffer_head map_bh;
	unsigned long first_logical_block = 0;

	map_bh.b_state = 0;
	map_bh.b_size = 0;
	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
		struct page *page = list_entry(pages->prev, struct page, lru);

		prefetchw(&page->flags);
		list_del(&page->lru);
		if (!add_to_page_cache_lru(page, mapping,
					page->index, GFP_KERNEL)) {
			bio = do_mpage_readpage(bio, page,
					nr_pages - page_idx,
					&last_block_in_bio, &map_bh,
					&first_logical_block,
					get_block);
		}
		page_cache_release(page);
	}
	BUG_ON(!list_empty(pages));
	if (bio)
		mpage_bio_submit(READ, bio);
	return 0;
}
EXPORT_SYMBOL(mpage_readpages);

/*
 * This isn't called much at all
 */
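/*
 * Illustrative usage (hypothetical names): a filesystem's ->readpage()
 * method can simply forward to this helper, e.g.
 *
 *	static int myfs_readpage(struct file *file, struct page *page)
 *	{
 *		return mpage_readpage(page, myfs_get_block);
 *	}
 */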
int mpage_readpage(struct page *page, get_block_t get_block)
{
	struct bio *bio = NULL;
	sector_t last_block_in_bio = 0;
	struct buffer_head map_bh;
	unsigned long first_logical_block = 0;

	map_bh.b_state = 0;
	map_bh.b_size = 0;
	bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio,
			&map_bh, &first_logical_block, get_block);
	if (bio)
		mpage_bio_submit(READ, bio);
	return 0;
}
EXPORT_SYMBOL(mpage_readpage);

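/*
 * Writeback state threaded through write_cache_pages(): the BIO currently
 * being assembled, the last block already placed in that BIO, the
 * filesystem's block mapper, and whether the confused path should fall back
 * to ->writepage() instead of returning -EAGAIN.
 */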
struct mpage_data {
	struct bio *bio;
	sector_t last_block_in_bio;
	get_block_t *get_block;
	unsigned use_writepage;
};

static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
		      void *data)
{
	struct mpage_data *mpd = data;
	struct bio *bio = mpd->bio;
	struct address_space *mapping = page->mapping;
	struct inode *inode = page->mapping->host;
	const unsigned blkbits = inode->i_blkbits;
	unsigned long end_index;
	const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
	sector_t last_block;
	sector_t block_in_file;
	sector_t blocks[MAX_BUF_PER_PAGE];
	unsigned page_block;
	unsigned first_unmapped = blocks_per_page;
	struct block_device *bdev = NULL;
	int boundary = 0;
	sector_t boundary_block = 0;
	struct block_device *boundary_bdev = NULL;
	int length;
	struct buffer_head map_bh;
	loff_t i_size = i_size_read(inode);
	int ret = 0;

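	/*
	 * If the page already has buffer_heads, reuse their existing disk
	 * mappings.  Any irregularity (an unmapped dirty buffer, a mapped
	 * buffer after a hole, a clean or non-uptodate buffer, or a
	 * non-contiguous block) sends us to the confused path.
	 */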
	if (page_has_buffers(page)) {
		struct buffer_head *head = page_buffers(page);
		struct buffer_head *bh = head;

		/* If they're all mapped and dirty, do it */
		page_block = 0;
		do {
			BUG_ON(buffer_locked(bh));
			if (!buffer_mapped(bh)) {
				/*
				 * unmapped dirty buffers are created by
				 * __set_page_dirty_buffers -> mmapped data
				 */
				if (buffer_dirty(bh))
					goto confused;
				if (first_unmapped == blocks_per_page)
					first_unmapped = page_block;
				continue;
			}

			if (first_unmapped != blocks_per_page)
				goto confused;	/* hole -> non-hole */

			if (!buffer_dirty(bh) || !buffer_uptodate(bh))
				goto confused;
			if (page_block) {
				if (bh->b_blocknr != blocks[page_block-1] + 1)
					goto confused;
			}
			blocks[page_block++] = bh->b_blocknr;
			boundary = buffer_boundary(bh);
			if (boundary) {
				boundary_block = bh->b_blocknr;
				boundary_bdev = bh->b_bdev;
			}
			bdev = bh->b_bdev;
		} while ((bh = bh->b_this_page) != head);

		if (first_unmapped)
			goto page_is_mapped;

		/*
		 * Page has buffers, but they are all unmapped. The page was
		 * created by pagein or read over a hole which was handled by
		 * block_read_full_page().  If this address_space is also
		 * using mpage_readpages then this can rarely happen.
		 */
		goto confused;
	}

	/*
	 * The page has no buffers: map it to disk
	 */
	BUG_ON(!PageUptodate(page));
	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
	last_block = (i_size - 1) >> blkbits;
	map_bh.b_page = page;
	for (page_block = 0; page_block < blocks_per_page; ) {

		map_bh.b_state = 0;
		map_bh.b_size = 1 << blkbits;
		if (mpd->get_block(inode, block_in_file, &map_bh, 1))
			goto confused;
		if (buffer_new(&map_bh))
			unmap_underlying_metadata(map_bh.b_bdev,
						map_bh.b_blocknr);
		if (buffer_boundary(&map_bh)) {
			boundary_block = map_bh.b_blocknr;
			boundary_bdev = map_bh.b_bdev;
		}
		if (page_block) {
			if (map_bh.b_blocknr != blocks[page_block-1] + 1)
				goto confused;
		}
		blocks[page_block++] = map_bh.b_blocknr;
		boundary = buffer_boundary(&map_bh);
		bdev = map_bh.b_bdev;
		if (block_in_file == last_block)
			break;
		block_in_file++;
	}
	BUG_ON(page_block == 0);

	first_unmapped = page_block;

page_is_mapped:
	end_index = i_size >> PAGE_CACHE_SHIFT;
	if (page->index >= end_index) {
		/*
		 * The page straddles i_size.  It must be zeroed out on each
		 * and every writepage invocation because it may be mmapped.
		 * "A file is mapped in multiples of the page size.  For a file
		 * that is not a multiple of the page size, the remaining memory
		 * is zeroed when mapped, and writes to that region are not
		 * written out to the file."
		 */
		unsigned offset = i_size & (PAGE_CACHE_SIZE - 1);

		if (page->index > end_index || !offset)
			goto confused;
		zero_user_segment(page, offset, PAGE_CACHE_SIZE);
	}

	/*
	 * This page will go to BIO.  Do we need to send this BIO off first?
	 */
	if (bio && mpd->last_block_in_bio != blocks[0] - 1)
		bio = mpage_bio_submit(WRITE, bio);

alloc_new:
	if (bio == NULL) {
		bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
				bio_get_nr_vecs(bdev), GFP_NOFS|__GFP_HIGH);
		if (bio == NULL)
			goto confused;
	}

	/*
	 * Must try to add the page before marking the buffer clean or
	 * the confused fail path above (OOM) will be very confused when
	 * it finds all bh marked clean (i.e. it will not write anything)
	 */
	length = first_unmapped << blkbits;
	if (bio_add_page(bio, page, length, 0) < length) {
		bio = mpage_bio_submit(WRITE, bio);
		goto alloc_new;
	}

	/*
	 * OK, we have our BIO, so we can now mark the buffers clean.  Make
	 * sure to only clean buffers which we know we'll be writing.
	 */
	if (page_has_buffers(page)) {
		struct buffer_head *head = page_buffers(page);
		struct buffer_head *bh = head;
		unsigned buffer_counter = 0;

		do {
			if (buffer_counter++ == first_unmapped)
				break;
			clear_buffer_dirty(bh);
			bh = bh->b_this_page;
		} while (bh != head);

		/*
		 * we cannot drop the bh if the page is not uptodate
		 * or a concurrent readpage would fail to serialize with the bh
		 * and it would read from disk before we reach the platter.
		 */
		if (buffer_heads_over_limit && PageUptodate(page))
			try_to_free_buffers(page);
	}

	BUG_ON(PageWriteback(page));
	set_page_writeback(page);
	unlock_page(page);
	if (boundary || (first_unmapped != blocks_per_page)) {
		bio = mpage_bio_submit(WRITE, bio);
		if (boundary_block) {
			write_boundary_block(boundary_bdev,
					boundary_block, 1 << blkbits);
		}
	} else {
		mpd->last_block_in_bio = blocks[blocks_per_page - 1];
	}
	goto out;

confused:
	if (bio)
		bio = mpage_bio_submit(WRITE, bio);

	if (mpd->use_writepage) {
		ret = mapping->a_ops->writepage(page, wbc);
	} else {
		ret = -EAGAIN;
		goto out;
	}
	/*
	 * The caller has a ref on the inode, so *mapping is stable
	 */
	mapping_set_error(mapping, ret);
out:
	mpd->bio = bio;
	return ret;
}

/**
 * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them
 * @mapping: address space structure to write
 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
 * @get_block: the filesystem's block mapper function.
 *             If this is NULL then use a_ops->writepage.  Otherwise, go
 *             direct-to-BIO.
 *
 * This is a library function, which implements the writepages()
 * address_space_operation.
 *
 * If a page is already under I/O, generic_writepages() skips it, even
 * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
 * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
 * and msync() need to guarantee that all the data which was dirty at the time
 * the call was made gets new I/O started against it.  If wbc->sync_mode is
 * WB_SYNC_ALL then we were called for data integrity and we must wait for
 * existing IO to complete.
 */
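/*
 * Illustrative usage (hypothetical names): a filesystem's ->writepages()
 * method usually just forwards to this helper with its block mapper, e.g.
 *
 *	static int myfs_writepages(struct address_space *mapping,
 *			struct writeback_control *wbc)
 *	{
 *		return mpage_writepages(mapping, wbc, myfs_get_block);
 *	}
 */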
int
mpage_writepages(struct address_space *mapping,
		struct writeback_control *wbc, get_block_t get_block)
{
	int ret;

	if (!get_block)
		ret = generic_writepages(mapping, wbc);
	else {
		struct mpage_data mpd = {
			.bio = NULL,
			.last_block_in_bio = 0,
			.get_block = get_block,
			.use_writepage = 1,
		};

		ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
		if (mpd.bio)
			mpage_bio_submit(WRITE, mpd.bio);
	}
	return ret;
}
EXPORT_SYMBOL(mpage_writepages);

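/*
 * Write a single page directly to BIO.  Because use_writepage is 0 here,
 * the confused path does not fall back to ->writepage(); it returns -EAGAIN
 * and the caller is expected to handle the page with its own
 * buffer_head-based writeout.
 */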
int mpage_writepage(struct page *page, get_block_t get_block,
	struct writeback_control *wbc)
{
	struct mpage_data mpd = {
		.bio = NULL,
		.last_block_in_bio = 0,
		.get_block = get_block,
		.use_writepage = 0,
	};
	int ret = __mpage_writepage(page, wbc, &mpd);
	if (mpd.bio)
		mpage_bio_submit(WRITE, mpd.bio);
	return ret;
}
EXPORT_SYMBOL(mpage_writepage);