1/*
2 *  linux/fs/buffer.c
3 *
4 *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
5 */
6
7/*
8 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
9 *
10 * Removed a lot of unnecessary code and simplified things now that
11 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
12 *
13 * Speed up hash, lru, and free list operations.  Use gfp() for allocating
14 * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
15 *
 * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
17 *
18 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
19 */
20
21#include <linux/kernel.h>
22#include <linux/syscalls.h>
23#include <linux/fs.h>
24#include <linux/mm.h>
25#include <linux/percpu.h>
26#include <linux/slab.h>
27#include <linux/capability.h>
28#include <linux/blkdev.h>
29#include <linux/file.h>
30#include <linux/quotaops.h>
31#include <linux/highmem.h>
32#include <linux/module.h>
33#include <linux/writeback.h>
34#include <linux/hash.h>
35#include <linux/suspend.h>
36#include <linux/buffer_head.h>
37#include <linux/task_io_accounting_ops.h>
38#include <linux/bio.h>
39#include <linux/notifier.h>
40#include <linux/cpu.h>
41#include <linux/bitops.h>
42#include <linux/mpage.h>
43#include <linux/bit_spinlock.h>
44
45static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
46
47#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
48
49inline void
50init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
51{
52	bh->b_end_io = handler;
53	bh->b_private = private;
54}
55EXPORT_SYMBOL(init_buffer);
56
57static int sync_buffer(void *word)
58{
59	struct block_device *bd;
60	struct buffer_head *bh
61		= container_of(word, struct buffer_head, b_state);
62
63	smp_mb();
64	bd = bh->b_bdev;
65	if (bd)
66		blk_run_address_space(bd->bd_inode->i_mapping);
67	io_schedule();
68	return 0;
69}
70
71void __lock_buffer(struct buffer_head *bh)
72{
73	wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
74							TASK_UNINTERRUPTIBLE);
75}
76EXPORT_SYMBOL(__lock_buffer);
77
78void unlock_buffer(struct buffer_head *bh)
79{
80	clear_bit_unlock(BH_Lock, &bh->b_state);
81	smp_mb__after_clear_bit();
82	wake_up_bit(&bh->b_state, BH_Lock);
83}
84EXPORT_SYMBOL(unlock_buffer);
85
86/*
87 * Block until a buffer comes unlocked.  This doesn't stop it
88 * from becoming locked again - you have to lock it yourself
89 * if you want to preserve its state.
90 */
91void __wait_on_buffer(struct buffer_head * bh)
92{
93	wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
94}
95EXPORT_SYMBOL(__wait_on_buffer);
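/*
 * Illustrative sketch (not part of this file's logic): the usual patterns
 * built on the primitives above are to wait for outstanding I/O and then
 * test the result, or to take the buffer lock around a modification.  The
 * variables here are hypothetical.
 *
 *	wait_on_buffer(bh);
 *	if (!buffer_uptodate(bh))
 *		err = -EIO;
 *
 *	lock_buffer(bh);
 *	... modify bh->b_data ...
 *	unlock_buffer(bh);
 */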
96
97static void
98__clear_page_buffers(struct page *page)
99{
100	ClearPagePrivate(page);
101	set_page_private(page, 0);
102	page_cache_release(page);
103}
104
105
106static int quiet_error(struct buffer_head *bh)
107{
108	if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
109		return 0;
110	return 1;
111}
112
113
114static void buffer_io_error(struct buffer_head *bh)
115{
116	char b[BDEVNAME_SIZE];
117	printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
118			bdevname(bh->b_bdev, b),
119			(unsigned long long)bh->b_blocknr);
120}
121
122/*
123 * End-of-IO handler helper function which does not touch the bh after
124 * unlocking it.
125 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
 * a race there is benign: unlock_buffer() only uses the bh's address for
127 * hashing after unlocking the buffer, so it doesn't actually touch the bh
128 * itself.
129 */
130static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
131{
132	if (uptodate) {
133		set_buffer_uptodate(bh);
134	} else {
135		/* This happens, due to failed READA attempts. */
136		clear_buffer_uptodate(bh);
137	}
138	unlock_buffer(bh);
139}
140
141/*
142 * Default synchronous end-of-IO handler..  Just mark it up-to-date and
143 * unlock the buffer. This is what ll_rw_block uses too.
144 */
145void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
146{
147	__end_buffer_read_notouch(bh, uptodate);
148	put_bh(bh);
149}
150EXPORT_SYMBOL(end_buffer_read_sync);
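/*
 * Illustrative sketch, mirroring __bread_slow() later in this file: a caller
 * that wants a synchronous read of an already-allocated bh can use this
 * handler directly.  Local variables are hypothetical; error handling is
 * omitted.
 *
 *	lock_buffer(bh);
 *	get_bh(bh);
 *	bh->b_end_io = end_buffer_read_sync;
 *	submit_bh(READ, bh);
 *	wait_on_buffer(bh);
 *	if (buffer_uptodate(bh))
 *		... bh->b_data now holds the block contents ...
 */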
151
152void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
153{
154	char b[BDEVNAME_SIZE];
155
156	if (uptodate) {
157		set_buffer_uptodate(bh);
158	} else {
159		if (!buffer_eopnotsupp(bh) && !quiet_error(bh)) {
160			buffer_io_error(bh);
161			printk(KERN_WARNING "lost page write due to "
162					"I/O error on %s\n",
163				       bdevname(bh->b_bdev, b));
164		}
165		set_buffer_write_io_error(bh);
166		clear_buffer_uptodate(bh);
167	}
168	unlock_buffer(bh);
169	put_bh(bh);
170}
171EXPORT_SYMBOL(end_buffer_write_sync);
172
173/*
174 * Various filesystems appear to want __find_get_block to be non-blocking.
175 * But it's the page lock which protects the buffers.  To get around this,
176 * we get exclusion from try_to_free_buffers with the blockdev mapping's
177 * private_lock.
178 *
179 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
180 * may be quite high.  This code could TryLock the page, and if that
181 * succeeds, there is no need to take private_lock. (But if
182 * private_lock is contended then so is mapping->tree_lock).
183 */
184static struct buffer_head *
185__find_get_block_slow(struct block_device *bdev, sector_t block)
186{
187	struct inode *bd_inode = bdev->bd_inode;
188	struct address_space *bd_mapping = bd_inode->i_mapping;
189	struct buffer_head *ret = NULL;
190	pgoff_t index;
191	struct buffer_head *bh;
192	struct buffer_head *head;
193	struct page *page;
194	int all_mapped = 1;
195
196	index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
197	page = find_get_page(bd_mapping, index);
198	if (!page)
199		goto out;
200
201	spin_lock(&bd_mapping->private_lock);
202	if (!page_has_buffers(page))
203		goto out_unlock;
204	head = page_buffers(page);
205	bh = head;
206	do {
207		if (!buffer_mapped(bh))
208			all_mapped = 0;
209		else if (bh->b_blocknr == block) {
210			ret = bh;
211			get_bh(bh);
212			goto out_unlock;
213		}
214		bh = bh->b_this_page;
215	} while (bh != head);
216
217	/* we might be here because some of the buffers on this page are
218	 * not mapped.  This is due to various races between
219	 * file io on the block device and getblk.  It gets dealt with
220	 * elsewhere, don't buffer_error if we had some unmapped buffers
221	 */
222	if (all_mapped) {
223		printk("__find_get_block_slow() failed. "
224			"block=%llu, b_blocknr=%llu\n",
225			(unsigned long long)block,
226			(unsigned long long)bh->b_blocknr);
227		printk("b_state=0x%08lx, b_size=%zu\n",
228			bh->b_state, bh->b_size);
229		printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
230	}
231out_unlock:
232	spin_unlock(&bd_mapping->private_lock);
233	page_cache_release(page);
234out:
235	return ret;
236}
237
/* If invalidate_buffers() will trash dirty buffers, it means some kind
   of fs corruption is going on. Trashing dirty data always implies losing
   information that was supposed to be just stored on the physical layer
   by the user.

   Thus invalidate_buffers in general usage is not allowed to trash
   dirty buffers. For example, ioctl(BLKFLSBUF) expects dirty data to
   be preserved.  These buffers are simply skipped.

   We also skip buffers which are still in use.  For example this can
   happen if a userspace program is reading the block device.

   NOTE: in the case where the user removed a removable-media disk even
   though there is still dirty data not synced to disk (due to a bug in the
   device driver or to an error of the user), by not destroying the dirty
   buffers we could generate corruption also on the next media inserted.
   Thus a parameter is necessary to handle this case in the safest way
   possible (trying not to corrupt the newly inserted disk with data
   belonging to the old, now corrupted, disk). Also for the ramdisk, the
   natural way to release the ramdisk memory is to destroy its dirty buffers.

   These are two special cases. Normal usage implies that the device driver
   issues a sync on the device (without waiting for I/O completion) and
   then an invalidate_buffers call that doesn't trash dirty buffers.

   For handling cache coherency with the blkdev pagecache the 'update' case
   has been introduced. It is needed to re-read from disk any pinned
   buffer. NOTE: re-reading from disk is destructive, so we can do it only
   when we assume nobody is changing the buffercache under our I/O and when
   we think the disk contains more recent information than the buffercache.
   The update == 1 pass marks the buffers we need to update, the update == 2
   pass does the actual I/O. */
270void invalidate_bdev(struct block_device *bdev)
271{
272	struct address_space *mapping = bdev->bd_inode->i_mapping;
273
274	if (mapping->nrpages == 0)
275		return;
276
277	invalidate_bh_lrus();
278	lru_add_drain_all();	/* make sure all lru add caches are flushed */
279	invalidate_mapping_pages(mapping, 0, -1);
280}
281EXPORT_SYMBOL(invalidate_bdev);
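/*
 * Illustrative sketch (hypothetical helper): as the comment above says, the
 * normal sequence is to write back whatever can be written and only then
 * drop the now-clean pagecache pages, e.g. when old media goes away:
 *
 *	static void example_forget_old_media(struct block_device *bdev)
 *	{
 *		sync_blockdev(bdev);
 *		invalidate_bdev(bdev);
 *	}
 */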
282
283/*
284 * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
285 */
286static void free_more_memory(void)
287{
288	struct zone *zone;
289	int nid;
290
291	wakeup_flusher_threads(1024);
292	yield();
293
294	for_each_online_node(nid) {
295		(void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
296						gfp_zone(GFP_NOFS), NULL,
297						&zone);
298		if (zone)
299			try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
300						GFP_NOFS, NULL);
301	}
302}
303
304/*
305 * I/O completion handler for block_read_full_page() - pages
306 * which come unlocked at the end of I/O.
307 */
308static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
309{
310	unsigned long flags;
311	struct buffer_head *first;
312	struct buffer_head *tmp;
313	struct page *page;
314	int page_uptodate = 1;
315
316	BUG_ON(!buffer_async_read(bh));
317
318	page = bh->b_page;
319	if (uptodate) {
320		set_buffer_uptodate(bh);
321	} else {
322		clear_buffer_uptodate(bh);
323		if (!quiet_error(bh))
324			buffer_io_error(bh);
325		SetPageError(page);
326	}
327
328	/*
329	 * Be _very_ careful from here on. Bad things can happen if
330	 * two buffer heads end IO at almost the same time and both
331	 * decide that the page is now completely done.
332	 */
333	first = page_buffers(page);
334	local_irq_save(flags);
335	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
336	clear_buffer_async_read(bh);
337	unlock_buffer(bh);
338	tmp = bh;
339	do {
340		if (!buffer_uptodate(tmp))
341			page_uptodate = 0;
342		if (buffer_async_read(tmp)) {
343			BUG_ON(!buffer_locked(tmp));
344			goto still_busy;
345		}
346		tmp = tmp->b_this_page;
347	} while (tmp != bh);
348	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
349	local_irq_restore(flags);
350
351	/*
352	 * If none of the buffers had errors and they are all
353	 * uptodate then we can set the page uptodate.
354	 */
355	if (page_uptodate && !PageError(page))
356		SetPageUptodate(page);
357	unlock_page(page);
358	return;
359
360still_busy:
361	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
362	local_irq_restore(flags);
363	return;
364}
365
366/*
367 * Completion handler for block_write_full_page() - pages which are unlocked
368 * during I/O, and which have PageWriteback cleared upon I/O completion.
369 */
370void end_buffer_async_write(struct buffer_head *bh, int uptodate)
371{
372	char b[BDEVNAME_SIZE];
373	unsigned long flags;
374	struct buffer_head *first;
375	struct buffer_head *tmp;
376	struct page *page;
377
378	BUG_ON(!buffer_async_write(bh));
379
380	page = bh->b_page;
381	if (uptodate) {
382		set_buffer_uptodate(bh);
383	} else {
384		if (!quiet_error(bh)) {
385			buffer_io_error(bh);
386			printk(KERN_WARNING "lost page write due to "
387					"I/O error on %s\n",
388			       bdevname(bh->b_bdev, b));
389		}
390		set_bit(AS_EIO, &page->mapping->flags);
391		set_buffer_write_io_error(bh);
392		clear_buffer_uptodate(bh);
393		SetPageError(page);
394	}
395
396	first = page_buffers(page);
397	local_irq_save(flags);
398	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
399
400	clear_buffer_async_write(bh);
401	unlock_buffer(bh);
402	tmp = bh->b_this_page;
403	while (tmp != bh) {
404		if (buffer_async_write(tmp)) {
405			BUG_ON(!buffer_locked(tmp));
406			goto still_busy;
407		}
408		tmp = tmp->b_this_page;
409	}
410	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
411	local_irq_restore(flags);
412	end_page_writeback(page);
413	return;
414
415still_busy:
416	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
417	local_irq_restore(flags);
418	return;
419}
420EXPORT_SYMBOL(end_buffer_async_write);
421
422/*
423 * If a page's buffers are under async readin (end_buffer_async_read
424 * completion) then there is a possibility that another thread of
425 * control could lock one of the buffers after it has completed
426 * but while some of the other buffers have not completed.  This
427 * locked buffer would confuse end_buffer_async_read() into not unlocking
428 * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
429 * that this buffer is not under async I/O.
430 *
431 * The page comes unlocked when it has no locked buffer_async buffers
432 * left.
433 *
 * PageLocked prevents anyone from starting new async I/O against any of
 * the buffers.
436 *
437 * PageWriteback is used to prevent simultaneous writeout of the same
438 * page.
439 *
440 * PageLocked prevents anyone from starting writeback of a page which is
441 * under read I/O (PageWriteback is only ever set against a locked page).
442 */
443static void mark_buffer_async_read(struct buffer_head *bh)
444{
445	bh->b_end_io = end_buffer_async_read;
446	set_buffer_async_read(bh);
447}
448
449static void mark_buffer_async_write_endio(struct buffer_head *bh,
450					  bh_end_io_t *handler)
451{
452	bh->b_end_io = handler;
453	set_buffer_async_write(bh);
454}
455
456void mark_buffer_async_write(struct buffer_head *bh)
457{
458	mark_buffer_async_write_endio(bh, end_buffer_async_write);
459}
460EXPORT_SYMBOL(mark_buffer_async_write);
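/*
 * Illustrative sketch of how the async-read marking is used.  This is
 * roughly what block_read_full_page() does, simplified; "arr", "nr" and
 * "i" are hypothetical locals.  All buffers are marked before any I/O is
 * submitted, so end_buffer_async_read() cannot see a half-marked page:
 *
 *	for (i = 0; i < nr; i++) {
 *		bh = arr[i];
 *		lock_buffer(bh);
 *		mark_buffer_async_read(bh);
 *	}
 *	for (i = 0; i < nr; i++)
 *		submit_bh(READ, arr[i]);
 *
 * The page is unlocked by end_buffer_async_read() once the last buffer
 * completes.
 */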
461
462
463
464/*
465 * The buffer's backing address_space's private_lock must be held
466 */
467static void __remove_assoc_queue(struct buffer_head *bh)
468{
469	list_del_init(&bh->b_assoc_buffers);
470	WARN_ON(!bh->b_assoc_map);
471	if (buffer_write_io_error(bh))
472		set_bit(AS_EIO, &bh->b_assoc_map->flags);
473	bh->b_assoc_map = NULL;
474}
475
476int inode_has_buffers(struct inode *inode)
477{
478	return !list_empty(&inode->i_data.private_list);
479}
480
481/*
482 * osync is designed to support O_SYNC io.  It waits synchronously for
483 * all already-submitted IO to complete, but does not queue any new
484 * writes to the disk.
485 *
486 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
487 * you dirty the buffers, and then use osync_inode_buffers to wait for
488 * completion.  Any other dirty buffers which are not yet queued for
489 * write will not be flushed to disk by the osync.
490 */
491static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
492{
493	struct buffer_head *bh;
494	struct list_head *p;
495	int err = 0;
496
497	spin_lock(lock);
498repeat:
499	list_for_each_prev(p, list) {
500		bh = BH_ENTRY(p);
501		if (buffer_locked(bh)) {
502			get_bh(bh);
503			spin_unlock(lock);
504			wait_on_buffer(bh);
505			if (!buffer_uptodate(bh))
506				err = -EIO;
507			brelse(bh);
508			spin_lock(lock);
509			goto repeat;
510		}
511	}
512	spin_unlock(lock);
513	return err;
514}
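/*
 * Illustrative sketch of the O_SYNC pattern described above (hypothetical
 * caller-side code): queue the write when the buffer is dirtied, then later
 * wait only on the I/O that has already been submitted.
 *
 *	ll_rw_block(WRITE, 1, &bh);
 *	...
 *	wait_on_buffer(bh);
 *	if (!buffer_uptodate(bh))
 *		err = -EIO;
 */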
515
516static void do_thaw_one(struct super_block *sb, void *unused)
517{
518	char b[BDEVNAME_SIZE];
519	while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
520		printk(KERN_WARNING "Emergency Thaw on %s\n",
521		       bdevname(sb->s_bdev, b));
522}
523
524static void do_thaw_all(struct work_struct *work)
525{
526	iterate_supers(do_thaw_one, NULL);
527	kfree(work);
528	printk(KERN_WARNING "Emergency Thaw complete\n");
529}
530
531/**
532 * emergency_thaw_all -- forcibly thaw every frozen filesystem
533 *
534 * Used for emergency unfreeze of all filesystems via SysRq
535 */
536void emergency_thaw_all(void)
537{
538	struct work_struct *work;
539
540	work = kmalloc(sizeof(*work), GFP_ATOMIC);
541	if (work) {
542		INIT_WORK(work, do_thaw_all);
543		schedule_work(work);
544	}
545}
546
547/**
548 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
549 * @mapping: the mapping which wants those buffers written
550 *
551 * Starts I/O against the buffers at mapping->private_list, and waits upon
552 * that I/O.
553 *
554 * Basically, this is a convenience function for fsync().
555 * @mapping is a file or directory which needs those buffers to be written for
556 * a successful fsync().
557 */
558int sync_mapping_buffers(struct address_space *mapping)
559{
560	struct address_space *buffer_mapping = mapping->assoc_mapping;
561
562	if (buffer_mapping == NULL || list_empty(&mapping->private_list))
563		return 0;
564
565	return fsync_buffers_list(&buffer_mapping->private_lock,
566					&mapping->private_list);
567}
568EXPORT_SYMBOL(sync_mapping_buffers);
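/*
 * Illustrative sketch (hypothetical filesystem code): an fsync()
 * implementation typically writes the data pages itself and uses
 * sync_mapping_buffers() for the associated metadata buffers that were
 * queued with mark_buffer_dirty_inode().
 *
 *	static int example_fsync_metadata(struct inode *inode)
 *	{
 *		int err = sync_mapping_buffers(inode->i_mapping);
 *
 *		... also write and wait on the inode itself as needed ...
 *		return err;
 *	}
 */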
569
570/*
571 * Called when we've recently written block `bblock', and it is known that
572 * `bblock' was for a buffer_boundary() buffer.  This means that the block at
573 * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
574 * dirty, schedule it for IO.  So that indirects merge nicely with their data.
575 */
576void write_boundary_block(struct block_device *bdev,
577			sector_t bblock, unsigned blocksize)
578{
579	struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
580	if (bh) {
581		if (buffer_dirty(bh))
582			ll_rw_block(WRITE, 1, &bh);
583		put_bh(bh);
584	}
585}
586
587void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
588{
589	struct address_space *mapping = inode->i_mapping;
590	struct address_space *buffer_mapping = bh->b_page->mapping;
591
592	mark_buffer_dirty(bh);
593	if (!mapping->assoc_mapping) {
594		mapping->assoc_mapping = buffer_mapping;
595	} else {
596		BUG_ON(mapping->assoc_mapping != buffer_mapping);
597	}
598	if (!bh->b_assoc_map) {
599		spin_lock(&buffer_mapping->private_lock);
600		list_move_tail(&bh->b_assoc_buffers,
601				&mapping->private_list);
602		bh->b_assoc_map = mapping;
603		spin_unlock(&buffer_mapping->private_lock);
604	}
605}
606EXPORT_SYMBOL(mark_buffer_dirty_inode);
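/*
 * Illustrative sketch (hypothetical): after modifying a metadata block that
 * logically belongs to an inode (an indirect block, say), file it on the
 * inode's associated-buffer list so a later sync_mapping_buffers() call
 * finds it:
 *
 *	bh = sb_bread(inode->i_sb, blocknr);
 *	if (bh) {
 *		... update bh->b_data ...
 *		mark_buffer_dirty_inode(bh, inode);
 *		brelse(bh);
 *	}
 */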
607
608/*
609 * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
610 * dirty.
611 *
612 * If warn is true, then emit a warning if the page is not uptodate and has
613 * not been truncated.
614 */
615static void __set_page_dirty(struct page *page,
616		struct address_space *mapping, int warn)
617{
618	spin_lock_irq(&mapping->tree_lock);
619	if (page->mapping) {	/* Race with truncate? */
620		WARN_ON_ONCE(warn && !PageUptodate(page));
621		account_page_dirtied(page, mapping);
622		radix_tree_tag_set(&mapping->page_tree,
623				page_index(page), PAGECACHE_TAG_DIRTY);
624	}
625	spin_unlock_irq(&mapping->tree_lock);
626	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
627}
628
629int __set_page_dirty_buffers(struct page *page)
630{
631	int newly_dirty;
632	struct address_space *mapping = page_mapping(page);
633
634	if (unlikely(!mapping))
635		return !TestSetPageDirty(page);
636
637	spin_lock(&mapping->private_lock);
638	if (page_has_buffers(page)) {
639		struct buffer_head *head = page_buffers(page);
640		struct buffer_head *bh = head;
641
642		do {
643			set_buffer_dirty(bh);
644			bh = bh->b_this_page;
645		} while (bh != head);
646	}
647	newly_dirty = !TestSetPageDirty(page);
648	spin_unlock(&mapping->private_lock);
649
650	if (newly_dirty)
651		__set_page_dirty(page, mapping, 1);
652	return newly_dirty;
653}
654EXPORT_SYMBOL(__set_page_dirty_buffers);
655
656/*
657 * Write out and wait upon a list of buffers.
658 *
659 * We have conflicting pressures: we want to make sure that all
660 * initially dirty buffers get waited on, but that any subsequently
661 * dirtied buffers don't.  After all, we don't want fsync to last
662 * forever if somebody is actively writing to the file.
663 *
664 * Do this in two main stages: first we copy dirty buffers to a
665 * temporary inode list, queueing the writes as we go.  Then we clean
666 * up, waiting for those writes to complete.
667 *
668 * During this second stage, any subsequent updates to the file may end
669 * up refiling the buffer on the original inode's dirty list again, so
670 * there is a chance we will end up with a buffer queued for write but
671 * not yet completed on that list.  So, as a final cleanup we go through
672 * the osync code to catch these locked, dirty buffers without requeuing
673 * any newly dirty buffers for write.
674 */
675static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
676{
677	struct buffer_head *bh;
678	struct list_head tmp;
679	struct address_space *mapping, *prev_mapping = NULL;
680	int err = 0, err2;
681
682	INIT_LIST_HEAD(&tmp);
683
684	spin_lock(lock);
685	while (!list_empty(list)) {
686		bh = BH_ENTRY(list->next);
687		mapping = bh->b_assoc_map;
688		__remove_assoc_queue(bh);
689		/* Avoid race with mark_buffer_dirty_inode() which does
690		 * a lockless check and we rely on seeing the dirty bit */
691		smp_mb();
692		if (buffer_dirty(bh) || buffer_locked(bh)) {
693			list_add(&bh->b_assoc_buffers, &tmp);
694			bh->b_assoc_map = mapping;
695			if (buffer_dirty(bh)) {
696				get_bh(bh);
697				spin_unlock(lock);
698				/*
699				 * Ensure any pending I/O completes so that
700				 * write_dirty_buffer() actually writes the
701				 * current contents - it is a noop if I/O is
702				 * still in flight on potentially older
703				 * contents.
704				 */
705				write_dirty_buffer(bh, WRITE_SYNC_PLUG);
706
707				/*
708				 * Kick off IO for the previous mapping. Note
709				 * that we will not run the very last mapping,
710				 * wait_on_buffer() will do that for us
711				 * through sync_buffer().
712				 */
713				if (prev_mapping && prev_mapping != mapping)
714					blk_run_address_space(prev_mapping);
715				prev_mapping = mapping;
716
717				brelse(bh);
718				spin_lock(lock);
719			}
720		}
721	}
722
723	while (!list_empty(&tmp)) {
724		bh = BH_ENTRY(tmp.prev);
725		get_bh(bh);
726		mapping = bh->b_assoc_map;
727		__remove_assoc_queue(bh);
728		/* Avoid race with mark_buffer_dirty_inode() which does
729		 * a lockless check and we rely on seeing the dirty bit */
730		smp_mb();
731		if (buffer_dirty(bh)) {
732			list_add(&bh->b_assoc_buffers,
733				 &mapping->private_list);
734			bh->b_assoc_map = mapping;
735		}
736		spin_unlock(lock);
737		wait_on_buffer(bh);
738		if (!buffer_uptodate(bh))
739			err = -EIO;
740		brelse(bh);
741		spin_lock(lock);
742	}
743
744	spin_unlock(lock);
745	err2 = osync_buffers_list(lock, list);
746	if (err)
747		return err;
748	else
749		return err2;
750}
751
752/*
753 * Invalidate any and all dirty buffers on a given inode.  We are
754 * probably unmounting the fs, but that doesn't mean we have already
755 * done a sync().  Just drop the buffers from the inode list.
756 *
757 * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
758 * assumes that all the buffers are against the blockdev.  Not true
759 * for reiserfs.
760 */
761void invalidate_inode_buffers(struct inode *inode)
762{
763	if (inode_has_buffers(inode)) {
764		struct address_space *mapping = &inode->i_data;
765		struct list_head *list = &mapping->private_list;
766		struct address_space *buffer_mapping = mapping->assoc_mapping;
767
768		spin_lock(&buffer_mapping->private_lock);
769		while (!list_empty(list))
770			__remove_assoc_queue(BH_ENTRY(list->next));
771		spin_unlock(&buffer_mapping->private_lock);
772	}
773}
774EXPORT_SYMBOL(invalidate_inode_buffers);
775
776/*
777 * Remove any clean buffers from the inode's buffer list.  This is called
778 * when we're trying to free the inode itself.  Those buffers can pin it.
779 *
780 * Returns true if all buffers were removed.
781 */
782int remove_inode_buffers(struct inode *inode)
783{
784	int ret = 1;
785
786	if (inode_has_buffers(inode)) {
787		struct address_space *mapping = &inode->i_data;
788		struct list_head *list = &mapping->private_list;
789		struct address_space *buffer_mapping = mapping->assoc_mapping;
790
791		spin_lock(&buffer_mapping->private_lock);
792		while (!list_empty(list)) {
793			struct buffer_head *bh = BH_ENTRY(list->next);
794			if (buffer_dirty(bh)) {
795				ret = 0;
796				break;
797			}
798			__remove_assoc_queue(bh);
799		}
800		spin_unlock(&buffer_mapping->private_lock);
801	}
802	return ret;
803}
804
/*
 * Create the appropriate buffers when given a page for the data area and
 * the size of each buffer.  Use the bh->b_this_page linked list to
 * follow the buffers created.  Return NULL if unable to create more
 * buffers.
 *
 * The retry flag is used to differentiate async IO (paging, swapping),
 * which may not fail, from ordinary buffer allocations.
 */
814struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
815		int retry)
816{
817	struct buffer_head *bh, *head;
818	long offset;
819
820try_again:
821	head = NULL;
822	offset = PAGE_SIZE;
823	while ((offset -= size) >= 0) {
824		bh = alloc_buffer_head(GFP_NOFS);
825		if (!bh)
826			goto no_grow;
827
828		bh->b_bdev = NULL;
829		bh->b_this_page = head;
830		bh->b_blocknr = -1;
831		head = bh;
832
833		bh->b_state = 0;
834		atomic_set(&bh->b_count, 0);
835		bh->b_private = NULL;
836		bh->b_size = size;
837
838		/* Link the buffer to its page */
839		set_bh_page(bh, page, offset);
840
841		init_buffer(bh, NULL, NULL);
842	}
843	return head;
844/*
845 * In case anything failed, we just free everything we got.
846 */
847no_grow:
848	if (head) {
849		do {
850			bh = head;
851			head = head->b_this_page;
852			free_buffer_head(bh);
853		} while (head);
854	}
855
856	/*
857	 * Return failure for non-async IO requests.  Async IO requests
858	 * are not allowed to fail, so we have to wait until buffer heads
859	 * become available.  But we don't want tasks sleeping with
860	 * partially complete buffers, so all were released above.
861	 */
862	if (!retry)
863		return NULL;
864
865	/* We're _really_ low on memory. Now we just
866	 * wait for old buffer heads to become free due to
867	 * finishing IO.  Since this is an async request and
868	 * the reserve list is empty, we're sure there are
869	 * async buffer heads in use.
870	 */
871	free_more_memory();
872	goto try_again;
873}
874EXPORT_SYMBOL_GPL(alloc_page_buffers);
875
876static inline void
877link_dev_buffers(struct page *page, struct buffer_head *head)
878{
879	struct buffer_head *bh, *tail;
880
881	bh = head;
882	do {
883		tail = bh;
884		bh = bh->b_this_page;
885	} while (bh);
886	tail->b_this_page = head;
887	attach_page_buffers(page, head);
888}
889
890/*
891 * Initialise the state of a blockdev page's buffers.
892 */
893static void
894init_page_buffers(struct page *page, struct block_device *bdev,
895			sector_t block, int size)
896{
897	struct buffer_head *head = page_buffers(page);
898	struct buffer_head *bh = head;
899	int uptodate = PageUptodate(page);
900
901	do {
902		if (!buffer_mapped(bh)) {
903			init_buffer(bh, NULL, NULL);
904			bh->b_bdev = bdev;
905			bh->b_blocknr = block;
906			if (uptodate)
907				set_buffer_uptodate(bh);
908			set_buffer_mapped(bh);
909		}
910		block++;
911		bh = bh->b_this_page;
912	} while (bh != head);
913}
914
915/*
916 * Create the page-cache page that contains the requested block.
917 *
 * This is used purely for blockdev mappings.
919 */
920static struct page *
921grow_dev_page(struct block_device *bdev, sector_t block,
922		pgoff_t index, int size)
923{
924	struct inode *inode = bdev->bd_inode;
925	struct page *page;
926	struct buffer_head *bh;
927
928	page = find_or_create_page(inode->i_mapping, index,
929		(mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
930	if (!page)
931		return NULL;
932
933	BUG_ON(!PageLocked(page));
934
935	if (page_has_buffers(page)) {
936		bh = page_buffers(page);
937		if (bh->b_size == size) {
938			init_page_buffers(page, bdev, block, size);
939			return page;
940		}
941		if (!try_to_free_buffers(page))
942			goto failed;
943	}
944
945	/*
946	 * Allocate some buffers for this page
947	 */
948	bh = alloc_page_buffers(page, size, 0);
949	if (!bh)
950		goto failed;
951
952	/*
953	 * Link the page to the buffers and initialise them.  Take the
954	 * lock to be atomic wrt __find_get_block(), which does not
955	 * run under the page lock.
956	 */
957	spin_lock(&inode->i_mapping->private_lock);
958	link_dev_buffers(page, bh);
959	init_page_buffers(page, bdev, block, size);
960	spin_unlock(&inode->i_mapping->private_lock);
961	return page;
962
963failed:
964	BUG();
965	unlock_page(page);
966	page_cache_release(page);
967	return NULL;
968}
969
970/*
971 * Create buffers for the specified block device block's page.  If
972 * that page was dirty, the buffers are set dirty also.
973 */
974static int
975grow_buffers(struct block_device *bdev, sector_t block, int size)
976{
977	struct page *page;
978	pgoff_t index;
979	int sizebits;
980
981	sizebits = -1;
982	do {
983		sizebits++;
984	} while ((size << sizebits) < PAGE_SIZE);
985
986	index = block >> sizebits;
987
988	/*
989	 * Check for a block which wants to lie outside our maximum possible
990	 * pagecache index.  (this comparison is done using sector_t types).
991	 */
992	if (unlikely(index != block >> sizebits)) {
993		char b[BDEVNAME_SIZE];
994
995		printk(KERN_ERR "%s: requested out-of-range block %llu for "
996			"device %s\n",
997			__func__, (unsigned long long)block,
998			bdevname(bdev, b));
999		return -EIO;
1000	}
1001	block = index << sizebits;
	/* Create a page with the proper size buffers. */
1003	page = grow_dev_page(bdev, block, index, size);
1004	if (!page)
1005		return 0;
1006	unlock_page(page);
1007	page_cache_release(page);
1008	return 1;
1009}
1010
1011static struct buffer_head *
1012__getblk_slow(struct block_device *bdev, sector_t block, int size)
1013{
	/* Size must be a multiple of the device's logical block size */
1015	if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
1016			(size < 512 || size > PAGE_SIZE))) {
1017		printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1018					size);
1019		printk(KERN_ERR "logical block size: %d\n",
1020					bdev_logical_block_size(bdev));
1021
1022		dump_stack();
1023		return NULL;
1024	}
1025
1026	for (;;) {
1027		struct buffer_head * bh;
1028		int ret;
1029
1030		bh = __find_get_block(bdev, block, size);
1031		if (bh)
1032			return bh;
1033
1034		ret = grow_buffers(bdev, block, size);
1035		if (ret < 0)
1036			return NULL;
1037		if (ret == 0)
1038			free_more_memory();
1039	}
1040}
1041
1042/*
1043 * The relationship between dirty buffers and dirty pages:
1044 *
1045 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1046 * the page is tagged dirty in its radix tree.
1047 *
1048 * At all times, the dirtiness of the buffers represents the dirtiness of
1049 * subsections of the page.  If the page has buffers, the page dirty bit is
1050 * merely a hint about the true dirty state.
1051 *
1052 * When a page is set dirty in its entirety, all its buffers are marked dirty
1053 * (if the page has buffers).
1054 *
1055 * When a buffer is marked dirty, its page is dirtied, but the page's other
1056 * buffers are not.
1057 *
1058 * Also.  When blockdev buffers are explicitly read with bread(), they
1059 * individually become uptodate.  But their backing page remains not
1060 * uptodate - even if all of its buffers are uptodate.  A subsequent
1061 * block_read_full_page() against that page will discover all the uptodate
1062 * buffers, will set the page uptodate and will perform no I/O.
1063 */
1064
1065/**
1066 * mark_buffer_dirty - mark a buffer_head as needing writeout
1067 * @bh: the buffer_head to mark dirty
1068 *
1069 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1070 * backing page dirty, then tag the page as dirty in its address_space's radix
1071 * tree and then attach the address_space's inode to its superblock's dirty
1072 * inode list.
1073 *
1074 * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
1075 * mapping->tree_lock and the global inode_lock.
1076 */
1077void mark_buffer_dirty(struct buffer_head *bh)
1078{
1079	WARN_ON_ONCE(!buffer_uptodate(bh));
1080
1081	/*
1082	 * Very *carefully* optimize the it-is-already-dirty case.
1083	 *
1084	 * Don't let the final "is it dirty" escape to before we
1085	 * perhaps modified the buffer.
1086	 */
1087	if (buffer_dirty(bh)) {
1088		smp_mb();
1089		if (buffer_dirty(bh))
1090			return;
1091	}
1092
1093	if (!test_set_buffer_dirty(bh)) {
1094		struct page *page = bh->b_page;
1095		if (!TestSetPageDirty(page)) {
1096			struct address_space *mapping = page_mapping(page);
1097			if (mapping)
1098				__set_page_dirty(page, mapping, 0);
1099		}
1100	}
1101}
1102EXPORT_SYMBOL(mark_buffer_dirty);
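/*
 * Illustrative sketch (hypothetical): the common way a buffer becomes dirty
 * is read-modify-write of an up-to-date buffer; mark_buffer_dirty() then
 * propagates the dirtiness to the page and the inode as described above.
 *
 *	bh = __bread(bdev, block, blocksize);
 *	if (bh) {
 *		... modify bh->b_data ...
 *		mark_buffer_dirty(bh);
 *		brelse(bh);
 *	}
 */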
1103
1104/*
1105 * Decrement a buffer_head's reference count.  If all buffers against a page
1106 * have zero reference count, are clean and unlocked, and if the page is clean
1107 * and unlocked then try_to_free_buffers() may strip the buffers from the page
1108 * in preparation for freeing it (sometimes, rarely, buffers are removed from
1109 * a page but it ends up not being freed, and buffers may later be reattached).
1110 */
1111void __brelse(struct buffer_head * buf)
1112{
1113	if (atomic_read(&buf->b_count)) {
1114		put_bh(buf);
1115		return;
1116	}
1117	WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1118}
1119EXPORT_SYMBOL(__brelse);
1120
1121/*
1122 * bforget() is like brelse(), except it discards any
1123 * potentially dirty data.
1124 */
1125void __bforget(struct buffer_head *bh)
1126{
1127	clear_buffer_dirty(bh);
1128	if (bh->b_assoc_map) {
1129		struct address_space *buffer_mapping = bh->b_page->mapping;
1130
1131		spin_lock(&buffer_mapping->private_lock);
1132		list_del_init(&bh->b_assoc_buffers);
1133		bh->b_assoc_map = NULL;
1134		spin_unlock(&buffer_mapping->private_lock);
1135	}
1136	__brelse(bh);
1137}
1138EXPORT_SYMBOL(__bforget);
1139
1140static struct buffer_head *__bread_slow(struct buffer_head *bh)
1141{
1142	lock_buffer(bh);
1143	if (buffer_uptodate(bh)) {
1144		unlock_buffer(bh);
1145		return bh;
1146	} else {
1147		get_bh(bh);
1148		bh->b_end_io = end_buffer_read_sync;
1149		submit_bh(READ, bh);
1150		wait_on_buffer(bh);
1151		if (buffer_uptodate(bh))
1152			return bh;
1153	}
1154	brelse(bh);
1155	return NULL;
1156}
1157
1158/*
 * Per-cpu buffer LRU implementation, used to reduce the cost of __find_get_block().
1160 * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
1161 * refcount elevated by one when they're in an LRU.  A buffer can only appear
1162 * once in a particular CPU's LRU.  A single buffer can be present in multiple
1163 * CPU's LRUs at the same time.
1164 *
1165 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1166 * sb_find_get_block().
1167 *
1168 * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
1169 * a local interrupt disable for that.
1170 */
1171
1172#define BH_LRU_SIZE	8
1173
1174struct bh_lru {
1175	struct buffer_head *bhs[BH_LRU_SIZE];
1176};
1177
1178static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1179
1180#ifdef CONFIG_SMP
1181#define bh_lru_lock()	local_irq_disable()
1182#define bh_lru_unlock()	local_irq_enable()
1183#else
1184#define bh_lru_lock()	preempt_disable()
1185#define bh_lru_unlock()	preempt_enable()
1186#endif
1187
1188static inline void check_irqs_on(void)
1189{
1190#ifdef irqs_disabled
1191	BUG_ON(irqs_disabled());
1192#endif
1193}
1194
1195/*
1196 * The LRU management algorithm is dopey-but-simple.  Sorry.
1197 */
1198static void bh_lru_install(struct buffer_head *bh)
1199{
1200	struct buffer_head *evictee = NULL;
1201	struct bh_lru *lru;
1202
1203	check_irqs_on();
1204	bh_lru_lock();
1205	lru = &__get_cpu_var(bh_lrus);
1206	if (lru->bhs[0] != bh) {
1207		struct buffer_head *bhs[BH_LRU_SIZE];
1208		int in;
1209		int out = 0;
1210
1211		get_bh(bh);
1212		bhs[out++] = bh;
1213		for (in = 0; in < BH_LRU_SIZE; in++) {
1214			struct buffer_head *bh2 = lru->bhs[in];
1215
1216			if (bh2 == bh) {
1217				__brelse(bh2);
1218			} else {
1219				if (out >= BH_LRU_SIZE) {
1220					BUG_ON(evictee != NULL);
1221					evictee = bh2;
1222				} else {
1223					bhs[out++] = bh2;
1224				}
1225			}
1226		}
1227		while (out < BH_LRU_SIZE)
1228			bhs[out++] = NULL;
1229		memcpy(lru->bhs, bhs, sizeof(bhs));
1230	}
1231	bh_lru_unlock();
1232
1233	if (evictee)
1234		__brelse(evictee);
1235}
1236
1237/*
1238 * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
1239 */
1240static struct buffer_head *
1241lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1242{
1243	struct buffer_head *ret = NULL;
1244	struct bh_lru *lru;
1245	unsigned int i;
1246
1247	check_irqs_on();
1248	bh_lru_lock();
1249	lru = &__get_cpu_var(bh_lrus);
1250	for (i = 0; i < BH_LRU_SIZE; i++) {
1251		struct buffer_head *bh = lru->bhs[i];
1252
1253		if (bh && bh->b_bdev == bdev &&
1254				bh->b_blocknr == block && bh->b_size == size) {
1255			if (i) {
1256				while (i) {
1257					lru->bhs[i] = lru->bhs[i - 1];
1258					i--;
1259				}
1260				lru->bhs[0] = bh;
1261			}
1262			get_bh(bh);
1263			ret = bh;
1264			break;
1265		}
1266	}
1267	bh_lru_unlock();
1268	return ret;
1269}
1270
1271/*
1272 * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
1273 * it in the LRU and mark it as accessed.  If it is not present then return
 * NULL.
1275 */
1276struct buffer_head *
1277__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1278{
1279	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1280
1281	if (bh == NULL) {
1282		bh = __find_get_block_slow(bdev, block);
1283		if (bh)
1284			bh_lru_install(bh);
1285	}
1286	if (bh)
1287		touch_buffer(bh);
1288	return bh;
1289}
1290EXPORT_SYMBOL(__find_get_block);
1291
1292struct buffer_head *
1293__getblk(struct block_device *bdev, sector_t block, unsigned size)
1294{
1295	struct buffer_head *bh = __find_get_block(bdev, block, size);
1296
1297	might_sleep();
1298	if (bh == NULL)
1299		bh = __getblk_slow(bdev, block, size);
1300	return bh;
1301}
1302EXPORT_SYMBOL(__getblk);
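/*
 * Illustrative sketch (hypothetical): __getblk() does no I/O, so it suits
 * blocks that will be completely overwritten; the caller supplies the whole
 * contents and marks the buffer uptodate itself.
 *
 *	bh = __getblk(bdev, block, size);
 *	if (bh) {
 *		lock_buffer(bh);
 *		memset(bh->b_data, 0, bh->b_size);
 *		set_buffer_uptodate(bh);
 *		unlock_buffer(bh);
 *		mark_buffer_dirty(bh);
 *		brelse(bh);
 *	}
 */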
1303
1304/*
 * Do async read-ahead on a buffer.
1306 */
1307void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1308{
1309	struct buffer_head *bh = __getblk(bdev, block, size);
1310	if (likely(bh)) {
1311		ll_rw_block(READA, 1, &bh);
1312		brelse(bh);
1313	}
1314}
1315EXPORT_SYMBOL(__breadahead);
1316
1317/**
1318 *  __bread() - reads a specified block and returns the bh
1319 *  @bdev: the block_device to read from
1320 *  @block: number of block
1321 *  @size: size (in bytes) to read
1322 *
1323 *  Reads a specified block, and returns buffer head that contains it.
1324 *  It returns NULL if the block was unreadable.
1325 */
1326struct buffer_head *
1327__bread(struct block_device *bdev, sector_t block, unsigned size)
1328{
1329	struct buffer_head *bh = __getblk(bdev, block, size);
1330
1331	if (likely(bh) && !buffer_uptodate(bh))
1332		bh = __bread_slow(bh);
1333	return bh;
1334}
1335EXPORT_SYMBOL(__bread);
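/*
 * Illustrative sketch (hypothetical caller): reading one block synchronously
 * and releasing it when done.
 *
 *	struct buffer_head *bh = __bread(bdev, block, blocksize);
 *
 *	if (!bh)
 *		return -EIO;
 *	... examine bh->b_data ...
 *	brelse(bh);
 */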
1336
1337/*
1338 * invalidate_bh_lrus() is called rarely - but not only at unmount.
1339 * This doesn't race because it runs in each cpu either in irq
1340 * or with preempt disabled.
1341 */
1342static void invalidate_bh_lru(void *arg)
1343{
1344	struct bh_lru *b = &get_cpu_var(bh_lrus);
1345	int i;
1346
1347	for (i = 0; i < BH_LRU_SIZE; i++) {
1348		brelse(b->bhs[i]);
1349		b->bhs[i] = NULL;
1350	}
1351	put_cpu_var(bh_lrus);
1352}
1353
1354void invalidate_bh_lrus(void)
1355{
1356	on_each_cpu(invalidate_bh_lru, NULL, 1);
1357}
1358EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1359
1360void set_bh_page(struct buffer_head *bh,
1361		struct page *page, unsigned long offset)
1362{
1363	bh->b_page = page;
1364	BUG_ON(offset >= PAGE_SIZE);
1365	if (PageHighMem(page))
1366		/*
1367		 * This catches illegal uses and preserves the offset:
1368		 */
1369		bh->b_data = (char *)(0 + offset);
1370	else
1371		bh->b_data = page_address(page) + offset;
1372}
1373EXPORT_SYMBOL(set_bh_page);
1374
1375/*
1376 * Called when truncating a buffer on a page completely.
1377 */
1378static void discard_buffer(struct buffer_head * bh)
1379{
1380	lock_buffer(bh);
1381	clear_buffer_dirty(bh);
1382	bh->b_bdev = NULL;
1383	clear_buffer_mapped(bh);
1384	clear_buffer_req(bh);
1385	clear_buffer_new(bh);
1386	clear_buffer_delay(bh);
1387	clear_buffer_unwritten(bh);
1388	unlock_buffer(bh);
1389}
1390
1391/**
 * block_invalidatepage - invalidate part or all of a buffer-backed page
1393 *
1394 * @page: the page which is affected
1395 * @offset: the index of the truncation point
1396 *
1397 * block_invalidatepage() is called when all or part of the page has become
 * invalidated by a truncate operation.
1399 *
1400 * block_invalidatepage() does not have to release all buffers, but it must
1401 * ensure that no dirty buffer is left outside @offset and that no I/O
1402 * is underway against any of the blocks which are outside the truncation
1403 * point.  Because the caller is about to free (and possibly reuse) those
1404 * blocks on-disk.
1405 */
1406void block_invalidatepage(struct page *page, unsigned long offset)
1407{
1408	struct buffer_head *head, *bh, *next;
1409	unsigned int curr_off = 0;
1410
1411	BUG_ON(!PageLocked(page));
1412	if (!page_has_buffers(page))
1413		goto out;
1414
1415	head = page_buffers(page);
1416	bh = head;
1417	do {
1418		unsigned int next_off = curr_off + bh->b_size;
1419		next = bh->b_this_page;
1420
1421		/*
1422		 * is this block fully invalidated?
1423		 */
1424		if (offset <= curr_off)
1425			discard_buffer(bh);
1426		curr_off = next_off;
1427		bh = next;
1428	} while (bh != head);
1429
1430	/*
1431	 * We release buffers only if the entire page is being invalidated.
1432	 * The get_block cached value has been unconditionally invalidated,
1433	 * so real IO is not possible anymore.
1434	 */
1435	if (offset == 0)
1436		try_to_release_page(page, 0);
1437out:
1438	return;
1439}
1440EXPORT_SYMBOL(block_invalidatepage);
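/*
 * Illustrative sketch (hypothetical ops table): a buffer-backed filesystem
 * with no special invalidation needs can point its address_space_operations
 * at this helper (or leave ->invalidatepage NULL, in which case the generic
 * truncate code falls back to it).  The other methods named here are
 * assumed to exist elsewhere in such a filesystem:
 *
 *	static const struct address_space_operations example_aops = {
 *		.readpage	= example_readpage,
 *		.writepage	= example_writepage,
 *		.invalidatepage	= block_invalidatepage,
 *	};
 */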
1441
1442/*
1443 * We attach and possibly dirty the buffers atomically wrt
1444 * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
1445 * is already excluded via the page lock.
1446 */
1447void create_empty_buffers(struct page *page,
1448			unsigned long blocksize, unsigned long b_state)
1449{
1450	struct buffer_head *bh, *head, *tail;
1451
1452	head = alloc_page_buffers(page, blocksize, 1);
1453	bh = head;
1454	do {
1455		bh->b_state |= b_state;
1456		tail = bh;
1457		bh = bh->b_this_page;
1458	} while (bh);
1459	tail->b_this_page = head;
1460
1461	spin_lock(&page->mapping->private_lock);
1462	if (PageUptodate(page) || PageDirty(page)) {
1463		bh = head;
1464		do {
1465			if (PageDirty(page))
1466				set_buffer_dirty(bh);
1467			if (PageUptodate(page))
1468				set_buffer_uptodate(bh);
1469			bh = bh->b_this_page;
1470		} while (bh != head);
1471	}
1472	attach_page_buffers(page, head);
1473	spin_unlock(&page->mapping->private_lock);
1474}
1475EXPORT_SYMBOL(create_empty_buffers);
1476
1477/*
1478 * We are taking a block for data and we don't want any output from any
 * buffer-cache aliases from the moment this function returns
 * until the moment when something explicitly marks the buffer
 * dirty (hopefully that will not happen until we free that block ;-)
 * We don't even need to mark it not-uptodate - nobody can expect
 * anything from a newly allocated buffer anyway. We used to use
 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1485 * don't want to mark the alias unmapped, for example - it would confuse
1486 * anyone who might pick it with bread() afterwards...
1487 *
1488 * Also..  Note that bforget() doesn't lock the buffer.  So there can
1489 * be writeout I/O going on against recently-freed buffers.  We don't
1490 * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1491 * only if we really need to.  That happens here.
1492 */
1493void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1494{
1495	struct buffer_head *old_bh;
1496
1497	might_sleep();
1498
1499	old_bh = __find_get_block_slow(bdev, block);
1500	if (old_bh) {
1501		clear_buffer_dirty(old_bh);
1502		wait_on_buffer(old_bh);
1503		clear_buffer_req(old_bh);
1504		__brelse(old_bh);
1505	}
1506}
1507EXPORT_SYMBOL(unmap_underlying_metadata);
1508
1509/*
1510 * NOTE! All mapped/uptodate combinations are valid:
1511 *
1512 *	Mapped	Uptodate	Meaning
1513 *
1514 *	No	No		"unknown" - must do get_block()
1515 *	No	Yes		"hole" - zero-filled
1516 *	Yes	No		"allocated" - allocated on disk, not read in
1517 *	Yes	Yes		"valid" - allocated and up-to-date in memory.
1518 *
1519 * "Dirty" is valid only with the last case (mapped+uptodate).
1520 */
1521
1522/*
1523 * While block_write_full_page is writing back the dirty buffers under
1524 * the page lock, whoever dirtied the buffers may decide to clean them
1525 * again at any time.  We handle that by only looking at the buffer
1526 * state inside lock_buffer().
1527 *
1528 * If block_write_full_page() is called for regular writeback
1529 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
 * locked buffer.  This can only happen if someone has written the buffer
1531 * directly, with submit_bh().  At the address_space level PageWriteback
1532 * prevents this contention from occurring.
1533 *
1534 * If block_write_full_page() is called with wbc->sync_mode ==
1535 * WB_SYNC_ALL, the writes are posted using WRITE_SYNC_PLUG; this
1536 * causes the writes to be flagged as synchronous writes, but the
1537 * block device queue will NOT be unplugged, since usually many pages
 * will be pushed out before the higher-level caller actually
1539 * waits for the writes to be completed.  The various wait functions,
1540 * such as wait_on_writeback_range() will ultimately call sync_page()
1541 * which will ultimately call blk_run_backing_dev(), which will end up
1542 * unplugging the device queue.
1543 */
1544static int __block_write_full_page(struct inode *inode, struct page *page,
1545			get_block_t *get_block, struct writeback_control *wbc,
1546			bh_end_io_t *handler)
1547{
1548	int err;
1549	sector_t block;
1550	sector_t last_block;
1551	struct buffer_head *bh, *head;
1552	const unsigned blocksize = 1 << inode->i_blkbits;
1553	int nr_underway = 0;
1554	int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
1555			WRITE_SYNC_PLUG : WRITE);
1556
1557	BUG_ON(!PageLocked(page));
1558
1559	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1560
1561	if (!page_has_buffers(page)) {
1562		create_empty_buffers(page, blocksize,
1563					(1 << BH_Dirty)|(1 << BH_Uptodate));
1564	}
1565
1566	/*
1567	 * Be very careful.  We have no exclusion from __set_page_dirty_buffers
1568	 * here, and the (potentially unmapped) buffers may become dirty at
1569	 * any time.  If a buffer becomes dirty here after we've inspected it
1570	 * then we just miss that fact, and the page stays dirty.
1571	 *
1572	 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1573	 * handle that here by just cleaning them.
1574	 */
1575
1576	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1577	head = page_buffers(page);
1578	bh = head;
1579
1580	/*
1581	 * Get all the dirty buffers mapped to disk addresses and
1582	 * handle any aliases from the underlying blockdev's mapping.
1583	 */
1584	do {
1585		if (block > last_block) {
1586			/*
1587			 * mapped buffers outside i_size will occur, because
1588			 * this page can be outside i_size when there is a
1589			 * truncate in progress.
1590			 */
1591			/*
1592			 * The buffer was zeroed by block_write_full_page()
1593			 */
1594			clear_buffer_dirty(bh);
1595			set_buffer_uptodate(bh);
1596		} else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
1597			   buffer_dirty(bh)) {
1598			WARN_ON(bh->b_size != blocksize);
1599			err = get_block(inode, block, bh, 1);
1600			if (err)
1601				goto recover;
1602			clear_buffer_delay(bh);
1603			if (buffer_new(bh)) {
1604				/* blockdev mappings never come here */
1605				clear_buffer_new(bh);
1606				unmap_underlying_metadata(bh->b_bdev,
1607							bh->b_blocknr);
1608			}
1609		}
1610		bh = bh->b_this_page;
1611		block++;
1612	} while (bh != head);
1613
1614	do {
1615		if (!buffer_mapped(bh))
1616			continue;
1617		/*
1618		 * If it's a fully non-blocking write attempt and we cannot
1619		 * lock the buffer then redirty the page.  Note that this can
1620		 * potentially cause a busy-wait loop from writeback threads
1621		 * and kswapd activity, but those code paths have their own
1622		 * higher-level throttling.
1623		 */
1624		if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
1625			lock_buffer(bh);
1626		} else if (!trylock_buffer(bh)) {
1627			redirty_page_for_writepage(wbc, page);
1628			continue;
1629		}
1630		if (test_clear_buffer_dirty(bh)) {
1631			mark_buffer_async_write_endio(bh, handler);
1632		} else {
1633			unlock_buffer(bh);
1634		}
1635	} while ((bh = bh->b_this_page) != head);
1636
1637	/*
1638	 * The page and its buffers are protected by PageWriteback(), so we can
1639	 * drop the bh refcounts early.
1640	 */
1641	BUG_ON(PageWriteback(page));
1642	set_page_writeback(page);
1643
1644	do {
1645		struct buffer_head *next = bh->b_this_page;
1646		if (buffer_async_write(bh)) {
1647			submit_bh(write_op, bh);
1648			nr_underway++;
1649		}
1650		bh = next;
1651	} while (bh != head);
1652	unlock_page(page);
1653
1654	err = 0;
1655done:
1656	if (nr_underway == 0) {
1657		/*
1658		 * The page was marked dirty, but the buffers were
1659		 * clean.  Someone wrote them back by hand with
1660		 * ll_rw_block/submit_bh.  A rare case.
1661		 */
1662		end_page_writeback(page);
1663
1664		/*
1665		 * The page and buffer_heads can be released at any time from
1666		 * here on.
1667		 */
1668	}
1669	return err;
1670
1671recover:
1672	/*
1673	 * ENOSPC, or some other error.  We may already have added some
1674	 * blocks to the file, so we need to write these out to avoid
1675	 * exposing stale data.
1676	 * The page is currently locked and not marked for writeback
1677	 */
1678	bh = head;
1679	/* Recovery: lock and submit the mapped buffers */
1680	do {
1681		if (buffer_mapped(bh) && buffer_dirty(bh) &&
1682		    !buffer_delay(bh)) {
1683			lock_buffer(bh);
1684			mark_buffer_async_write_endio(bh, handler);
1685		} else {
1686			/*
1687			 * The buffer may have been set dirty during
1688			 * attachment to a dirty page.
1689			 */
1690			clear_buffer_dirty(bh);
1691		}
1692	} while ((bh = bh->b_this_page) != head);
1693	SetPageError(page);
1694	BUG_ON(PageWriteback(page));
1695	mapping_set_error(page->mapping, err);
1696	set_page_writeback(page);
1697	do {
1698		struct buffer_head *next = bh->b_this_page;
1699		if (buffer_async_write(bh)) {
1700			clear_buffer_dirty(bh);
1701			submit_bh(write_op, bh);
1702			nr_underway++;
1703		}
1704		bh = next;
1705	} while (bh != head);
1706	unlock_page(page);
1707	goto done;
1708}
1709
1710/*
1711 * If a page has any new buffers, zero them out here, and mark them uptodate
1712 * and dirty so they'll be written out (in order to prevent uninitialised
1713 * block data from leaking). And clear the new bit.
1714 */
1715void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1716{
1717	unsigned int block_start, block_end;
1718	struct buffer_head *head, *bh;
1719
1720	BUG_ON(!PageLocked(page));
1721	if (!page_has_buffers(page))
1722		return;
1723
1724	bh = head = page_buffers(page);
1725	block_start = 0;
1726	do {
1727		block_end = block_start + bh->b_size;
1728
1729		if (buffer_new(bh)) {
1730			if (block_end > from && block_start < to) {
1731				if (!PageUptodate(page)) {
1732					unsigned start, size;
1733
1734					start = max(from, block_start);
1735					size = min(to, block_end) - start;
1736
1737					zero_user(page, start, size);
1738					set_buffer_uptodate(bh);
1739				}
1740
1741				clear_buffer_new(bh);
1742				mark_buffer_dirty(bh);
1743			}
1744		}
1745
1746		block_start = block_end;
1747		bh = bh->b_this_page;
1748	} while (bh != head);
1749}
1750EXPORT_SYMBOL(page_zero_new_buffers);
1751
1752int block_prepare_write(struct page *page, unsigned from, unsigned to,
1753		get_block_t *get_block)
1754{
1755	struct inode *inode = page->mapping->host;
1756	unsigned block_start, block_end;
1757	sector_t block;
1758	int err = 0;
1759	unsigned blocksize, bbits;
1760	struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1761
1762	BUG_ON(!PageLocked(page));
1763	BUG_ON(from > PAGE_CACHE_SIZE);
1764	BUG_ON(to > PAGE_CACHE_SIZE);
1765	BUG_ON(from > to);
1766
1767	blocksize = 1 << inode->i_blkbits;
1768	if (!page_has_buffers(page))
1769		create_empty_buffers(page, blocksize, 0);
1770	head = page_buffers(page);
1771
1772	bbits = inode->i_blkbits;
1773	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1774
1775	for(bh = head, block_start = 0; bh != head || !block_start;
1776	    block++, block_start=block_end, bh = bh->b_this_page) {
1777		block_end = block_start + blocksize;
1778		if (block_end <= from || block_start >= to) {
1779			if (PageUptodate(page)) {
1780				if (!buffer_uptodate(bh))
1781					set_buffer_uptodate(bh);
1782			}
1783			continue;
1784		}
1785		if (buffer_new(bh))
1786			clear_buffer_new(bh);
1787		if (!buffer_mapped(bh)) {
1788			WARN_ON(bh->b_size != blocksize);
1789			err = get_block(inode, block, bh, 1);
1790			if (err)
1791				break;
1792			if (buffer_new(bh)) {
1793				unmap_underlying_metadata(bh->b_bdev,
1794							bh->b_blocknr);
1795				if (PageUptodate(page)) {
1796					clear_buffer_new(bh);
1797					set_buffer_uptodate(bh);
1798					mark_buffer_dirty(bh);
1799					continue;
1800				}
1801				if (block_end > to || block_start < from)
1802					zero_user_segments(page,
1803						to, block_end,
1804						block_start, from);
1805				continue;
1806			}
1807		}
1808		if (PageUptodate(page)) {
1809			if (!buffer_uptodate(bh))
1810				set_buffer_uptodate(bh);
1811			continue;
1812		}
1813		if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1814		    !buffer_unwritten(bh) &&
1815		     (block_start < from || block_end > to)) {
1816			ll_rw_block(READ, 1, &bh);
1817			*wait_bh++=bh;
1818		}
1819	}
1820	/*
1821	 * If we issued read requests - let them complete.
1822	 */
1823	while(wait_bh > wait) {
1824		wait_on_buffer(*--wait_bh);
1825		if (!buffer_uptodate(*wait_bh))
1826			err = -EIO;
1827	}
1828	if (unlikely(err)) {
1829		page_zero_new_buffers(page, from, to);
1830		ClearPageUptodate(page);
1831	}
1832	return err;
1833}
1834EXPORT_SYMBOL(block_prepare_write);
1835
1836static int __block_commit_write(struct inode *inode, struct page *page,
1837		unsigned from, unsigned to)
1838{
1839	unsigned block_start, block_end;
1840	int partial = 0;
1841	unsigned blocksize;
1842	struct buffer_head *bh, *head;
1843
1844	blocksize = 1 << inode->i_blkbits;
1845
1846	for(bh = head = page_buffers(page), block_start = 0;
1847	    bh != head || !block_start;
1848	    block_start=block_end, bh = bh->b_this_page) {
1849		block_end = block_start + blocksize;
1850		if (block_end <= from || block_start >= to) {
1851			if (!buffer_uptodate(bh))
1852				partial = 1;
1853		} else {
1854			set_buffer_uptodate(bh);
1855			mark_buffer_dirty(bh);
1856		}
1857		clear_buffer_new(bh);
1858	}
1859
1860	/*
1861	 * If this is a partial write which happened to make all buffers
1862	 * uptodate then we can optimize away a bogus readpage() for
1863	 * the next read(). Here we 'discover' whether the page went
1864	 * uptodate as a result of this (potentially partial) write.
1865	 */
1866	if (!partial)
1867		SetPageUptodate(page);
1868	return 0;
1869}
1870
1871int __block_write_begin(struct page *page, loff_t pos, unsigned len,
1872		get_block_t *get_block)
1873{
1874	unsigned start = pos & (PAGE_CACHE_SIZE - 1);
1875
1876	return block_prepare_write(page, start, start + len, get_block);
1877}
1878EXPORT_SYMBOL(__block_write_begin);
1879
1880/*
1881 * block_write_begin takes care of the basic task of block allocation and
1882 * bringing partial write blocks uptodate first.
1883 *
1884 * The filesystem needs to handle block truncation upon failure.
1885 */
1886int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
1887		unsigned flags, struct page **pagep, get_block_t *get_block)
1888{
1889	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1890	struct page *page;
1891	int status;
1892
1893	page = grab_cache_page_write_begin(mapping, index, flags);
1894	if (!page)
1895		return -ENOMEM;
1896
1897	status = __block_write_begin(page, pos, len, get_block);
1898	if (unlikely(status)) {
1899		unlock_page(page);
1900		page_cache_release(page);
1901		page = NULL;
1902	}
1903
1904	*pagep = page;
1905	return status;
1906}
1907EXPORT_SYMBOL(block_write_begin);
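
/*
 * Illustrative sketch (not part of the original file): a minimal ->write_begin
 * built on block_write_begin(), assuming a hypothetical filesystem "myfs" that
 * supplies its own get_block_t, myfs_get_block().  As noted above, the
 * filesystem remains responsible for trimming blocks instantiated beyond
 * i_size when the write cannot proceed; myfs_truncate_failed_write() stands
 * in for that hypothetical helper.
 */
static int myfs_write_begin(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
{
	int ret;

	ret = block_write_begin(mapping, pos, len, flags, pagep,
				myfs_get_block);
	if (ret < 0) {
		/* hypothetical: undo any block allocation past i_size */
		myfs_truncate_failed_write(mapping->host);
	}
	return ret;
}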
1908
1909int block_write_end(struct file *file, struct address_space *mapping,
1910			loff_t pos, unsigned len, unsigned copied,
1911			struct page *page, void *fsdata)
1912{
1913	struct inode *inode = mapping->host;
1914	unsigned start;
1915
1916	start = pos & (PAGE_CACHE_SIZE - 1);
1917
1918	if (unlikely(copied < len)) {
1919		/*
1920		 * The buffers that were written will now be uptodate, so we
1921		 * don't have to worry about a readpage reading them and
1922		 * overwriting a partial write. However if we have encountered
1923		 * a short write and only partially written into a buffer, it
1924		 * will not be marked uptodate, so a readpage might come in and
1925		 * destroy our partial write.
1926		 *
1927		 * Do the simplest thing, and just treat any short write to a
1928		 * non uptodate page as a zero-length write, and force the
1929		 * caller to redo the whole thing.
1930		 */
1931		if (!PageUptodate(page))
1932			copied = 0;
1933
1934		page_zero_new_buffers(page, start+copied, start+len);
1935	}
1936	flush_dcache_page(page);
1937
1938	/* This could be a short (even 0-length) commit */
1939	__block_commit_write(inode, page, start, start+copied);
1940
1941	return copied;
1942}
1943EXPORT_SYMBOL(block_write_end);
1944
1945int generic_write_end(struct file *file, struct address_space *mapping,
1946			loff_t pos, unsigned len, unsigned copied,
1947			struct page *page, void *fsdata)
1948{
1949	struct inode *inode = mapping->host;
1950	int i_size_changed = 0;
1951
1952	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
1953
1954	 * No need to use i_size_read() here: the i_size
1955	 * No need to use i_size_read() here, the i_size
1956	 * cannot change under us because we hold i_mutex.
1957	 *
1958	 * But it's important to update i_size while still holding page lock:
1959	 * page writeout could otherwise come in and zero beyond i_size.
1960	 */
1961	if (pos+copied > inode->i_size) {
1962		i_size_write(inode, pos+copied);
1963		i_size_changed = 1;
1964	}
1965
1966	unlock_page(page);
1967	page_cache_release(page);
1968
1969	/*
1970	 * Don't mark the inode dirty under page lock. First, it unnecessarily
1971	 * makes the holding time of page lock longer. Second, it forces lock
1972	 * ordering of page lock and transaction start for journaling
1973	 * filesystems.
1974	 */
1975	if (i_size_changed)
1976		mark_inode_dirty(inode);
1977
1978	return copied;
1979}
1980EXPORT_SYMBOL(generic_write_end);
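
/*
 * Illustrative sketch (not part of the original file): most block-based
 * filesystems can wire generic_write_end() straight into their
 * address_space_operations, next to a write_begin like the one sketched
 * above.  All myfs_* names are hypothetical; the readpage and writepage
 * wrappers are sketched further down alongside the helpers they use.
 */
static const struct address_space_operations myfs_aops = {
	.readpage	= myfs_readpage,	/* block_read_full_page() wrapper */
	.writepage	= myfs_writepage,	/* block_write_full_page() wrapper */
	.write_begin	= myfs_write_begin,
	.write_end	= generic_write_end,
};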
1981
1982/*
1983 * block_is_partially_uptodate checks whether buffers within a page are
1984 * uptodate or not.
1985 *
1986 * Returns true if all the buffers which correspond to the portion of the
1987 * file we want to read are uptodate.
1988 */
1989int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
1990					unsigned long from)
1991{
1992	struct inode *inode = page->mapping->host;
1993	unsigned block_start, block_end, blocksize;
1994	unsigned to;
1995	struct buffer_head *bh, *head;
1996	int ret = 1;
1997
1998	if (!page_has_buffers(page))
1999		return 0;
2000
2001	blocksize = 1 << inode->i_blkbits;
2002	to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
2003	to = from + to;
2004	if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
2005		return 0;
2006
2007	head = page_buffers(page);
2008	bh = head;
2009	block_start = 0;
2010	do {
2011		block_end = block_start + blocksize;
2012		if (block_end > from && block_start < to) {
2013			if (!buffer_uptodate(bh)) {
2014				ret = 0;
2015				break;
2016			}
2017			if (block_end >= to)
2018				break;
2019		}
2020		block_start = block_end;
2021		bh = bh->b_this_page;
2022	} while (bh != head);
2023
2024	return ret;
2025}
2026EXPORT_SYMBOL(block_is_partially_uptodate);
2027
2028/*
2029 * Generic "read page" function for block devices that have the normal
2030 * get_block functionality. This covers most block-device-backed filesystems.
2031 * Reads the page asynchronously --- the unlock_buffer() and
2032 * set/clear_buffer_uptodate() functions propagate buffer state into the
2033 * page struct once IO has completed.
2034 */
2035int block_read_full_page(struct page *page, get_block_t *get_block)
2036{
2037	struct inode *inode = page->mapping->host;
2038	sector_t iblock, lblock;
2039	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2040	unsigned int blocksize;
2041	int nr, i;
2042	int fully_mapped = 1;
2043
2044	BUG_ON(!PageLocked(page));
2045	blocksize = 1 << inode->i_blkbits;
2046	if (!page_has_buffers(page))
2047		create_empty_buffers(page, blocksize, 0);
2048	head = page_buffers(page);
2049
2050	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2051	lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
2052	bh = head;
2053	nr = 0;
2054	i = 0;
2055
2056	do {
2057		if (buffer_uptodate(bh))
2058			continue;
2059
2060		if (!buffer_mapped(bh)) {
2061			int err = 0;
2062
2063			fully_mapped = 0;
2064			if (iblock < lblock) {
2065				WARN_ON(bh->b_size != blocksize);
2066				err = get_block(inode, iblock, bh, 0);
2067				if (err)
2068					SetPageError(page);
2069			}
2070			if (!buffer_mapped(bh)) {
2071				zero_user(page, i * blocksize, blocksize);
2072				if (!err)
2073					set_buffer_uptodate(bh);
2074				continue;
2075			}
2076			/*
2077			 * get_block() might have updated the buffer
2078			 * synchronously
2079			 */
2080			if (buffer_uptodate(bh))
2081				continue;
2082		}
2083		arr[nr++] = bh;
2084	} while (i++, iblock++, (bh = bh->b_this_page) != head);
2085
2086	if (fully_mapped)
2087		SetPageMappedToDisk(page);
2088
2089	if (!nr) {
2090		/*
2091		 * All buffers are uptodate - we can set the page uptodate
2092		 * as well. But not if get_block() returned an error.
2093		 */
2094		if (!PageError(page))
2095			SetPageUptodate(page);
2096		unlock_page(page);
2097		return 0;
2098	}
2099
2100	/* Stage two: lock the buffers */
2101	for (i = 0; i < nr; i++) {
2102		bh = arr[i];
2103		lock_buffer(bh);
2104		mark_buffer_async_read(bh);
2105	}
2106
2107	/*
2108	 * Stage 3: start the IO.  Check for uptodateness
2109	 * inside the buffer lock in case another process reading
2110	 * the underlying blockdev brought it uptodate (the sct fix).
2111	 */
2112	for (i = 0; i < nr; i++) {
2113		bh = arr[i];
2114		if (buffer_uptodate(bh))
2115			end_buffer_async_read(bh, 1);
2116		else
2117			submit_bh(READ, bh);
2118	}
2119	return 0;
2120}
2121EXPORT_SYMBOL(block_read_full_page);
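
/*
 * Illustrative sketch (not part of the original file): a typical ->readpage
 * is a thin wrapper that passes the filesystem's block mapper down to
 * block_read_full_page().  myfs_get_block() is hypothetical.
 */
static int myfs_readpage(struct file *file, struct page *page)
{
	return block_read_full_page(page, myfs_get_block);
}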
2122
2123/* utility function for filesystems that need to do work on expanding
2124 * truncates.  Uses filesystem pagecache writes to allow the filesystem to
2125 * deal with the hole.
2126 */
2127int generic_cont_expand_simple(struct inode *inode, loff_t size)
2128{
2129	struct address_space *mapping = inode->i_mapping;
2130	struct page *page;
2131	void *fsdata;
2132	int err;
2133
2134	err = inode_newsize_ok(inode, size);
2135	if (err)
2136		goto out;
2137
2138	err = pagecache_write_begin(NULL, mapping, size, 0,
2139				AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND,
2140				&page, &fsdata);
2141	if (err)
2142		goto out;
2143
2144	err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
2145	BUG_ON(err > 0);
2146
2147out:
2148	return err;
2149}
2150EXPORT_SYMBOL(generic_cont_expand_simple);
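
/*
 * Illustrative sketch (not part of the original file): how an expanding
 * truncate might call generic_cont_expand_simple() from a hypothetical
 * ->setattr path before committing the new size.
 */
static int myfs_expand(struct inode *inode, loff_t newsize)
{
	int err = 0;

	if (newsize > i_size_read(inode))
		err = generic_cont_expand_simple(inode, newsize);
	return err;
}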
2151
2152static int cont_expand_zero(struct file *file, struct address_space *mapping,
2153			    loff_t pos, loff_t *bytes)
2154{
2155	struct inode *inode = mapping->host;
2156	unsigned blocksize = 1 << inode->i_blkbits;
2157	struct page *page;
2158	void *fsdata;
2159	pgoff_t index, curidx;
2160	loff_t curpos;
2161	unsigned zerofrom, offset, len;
2162	int err = 0;
2163
2164	index = pos >> PAGE_CACHE_SHIFT;
2165	offset = pos & ~PAGE_CACHE_MASK;
2166
2167	while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) {
2168		zerofrom = curpos & ~PAGE_CACHE_MASK;
2169		if (zerofrom & (blocksize-1)) {
2170			*bytes |= (blocksize-1);
2171			(*bytes)++;
2172		}
2173		len = PAGE_CACHE_SIZE - zerofrom;
2174
2175		err = pagecache_write_begin(file, mapping, curpos, len,
2176						AOP_FLAG_UNINTERRUPTIBLE,
2177						&page, &fsdata);
2178		if (err)
2179			goto out;
2180		zero_user(page, zerofrom, len);
2181		err = pagecache_write_end(file, mapping, curpos, len, len,
2182						page, fsdata);
2183		if (err < 0)
2184			goto out;
2185		BUG_ON(err != len);
2186		err = 0;
2187
2188		balance_dirty_pages_ratelimited(mapping);
2189	}
2190
2191	/* page covers the boundary, find the boundary offset */
2192	if (index == curidx) {
2193		zerofrom = curpos & ~PAGE_CACHE_MASK;
2194		/* if we will expand the file, the last block will be filled */
2195		if (offset <= zerofrom) {
2196			goto out;
2197		}
2198		if (zerofrom & (blocksize-1)) {
2199			*bytes |= (blocksize-1);
2200			(*bytes)++;
2201		}
2202		len = offset - zerofrom;
2203
2204		err = pagecache_write_begin(file, mapping, curpos, len,
2205						AOP_FLAG_UNINTERRUPTIBLE,
2206						&page, &fsdata);
2207		if (err)
2208			goto out;
2209		zero_user(page, zerofrom, len);
2210		err = pagecache_write_end(file, mapping, curpos, len, len,
2211						page, fsdata);
2212		if (err < 0)
2213			goto out;
2214		BUG_ON(err != len);
2215		err = 0;
2216	}
2217out:
2218	return err;
2219}
2220
2221/*
2222 * For moronic filesystems that do not allow holes in files.
2223 * We may have to extend the file.
2224 */
2225int cont_write_begin(struct file *file, struct address_space *mapping,
2226			loff_t pos, unsigned len, unsigned flags,
2227			struct page **pagep, void **fsdata,
2228			get_block_t *get_block, loff_t *bytes)
2229{
2230	struct inode *inode = mapping->host;
2231	unsigned blocksize = 1 << inode->i_blkbits;
2232	unsigned zerofrom;
2233	int err;
2234
2235	err = cont_expand_zero(file, mapping, pos, bytes);
2236	if (err)
2237		return err;
2238
2239	zerofrom = *bytes & ~PAGE_CACHE_MASK;
2240	if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2241		*bytes |= (blocksize-1);
2242		(*bytes)++;
2243	}
2244
2245	return block_write_begin(mapping, pos, len, flags, pagep, get_block);
2246}
2247EXPORT_SYMBOL(cont_write_begin);
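
/*
 * Illustrative sketch (not part of the original file): a write_begin for a
 * filesystem that cannot represent holes.  MYFS_I(...)->mmu_private stands in
 * for the filesystem's own record of how far the file has been zero-filled on
 * disk (FAT keeps such a field); all myfs_* names are hypothetical.
 */
static int myfs_cont_write_begin(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
{
	return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
				myfs_get_block,
				&MYFS_I(mapping->host)->mmu_private);
}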
2248
2249int block_commit_write(struct page *page, unsigned from, unsigned to)
2250{
2251	struct inode *inode = page->mapping->host;
2252	__block_commit_write(inode, page, from, to);
2253	return 0;
2254}
2255EXPORT_SYMBOL(block_commit_write);
2256
2257/*
2258 * block_page_mkwrite() is not allowed to change the file size as it gets
2259 * called from a page fault handler when a page is first dirtied. Hence we must
2260 * be careful to check for EOF conditions here. We set the page up correctly
2261 * for a written page which means we get ENOSPC checking when writing into
2262 * holes and correct delalloc and unwritten extent mapping on filesystems that
2263 * support these features.
2264 *
2265 * We are not allowed to take the i_mutex here so we have to play games to
2266 * protect against truncate races as the page could now be beyond EOF.  Because
2267 * truncate writes the inode size before removing pages, once we have the
2268 * page lock we can determine safely if the page is beyond EOF. If it is not
2269 * beyond EOF, then the page is guaranteed safe against truncation until we
2270 * unlock the page.
2271 */
2272int
2273block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2274		   get_block_t get_block)
2275{
2276	struct page *page = vmf->page;
2277	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
2278	unsigned long end;
2279	loff_t size;
2280	int ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
2281
2282	lock_page(page);
2283	size = i_size_read(inode);
2284	if ((page->mapping != inode->i_mapping) ||
2285	    (page_offset(page) > size)) {
2286		/* page got truncated out from underneath us */
2287		unlock_page(page);
2288		goto out;
2289	}
2290
2291	/* page is wholly or partially inside EOF */
2292	if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
2293		end = size & ~PAGE_CACHE_MASK;
2294	else
2295		end = PAGE_CACHE_SIZE;
2296
2297	ret = block_prepare_write(page, 0, end, get_block);
2298	if (!ret)
2299		ret = block_commit_write(page, 0, end);
2300
2301	if (unlikely(ret)) {
2302		unlock_page(page);
2303		if (ret == -ENOMEM)
2304			ret = VM_FAULT_OOM;
2305		else /* -ENOSPC, -EIO, etc */
2306			ret = VM_FAULT_SIGBUS;
2307	} else
2308		ret = VM_FAULT_LOCKED;
2309
2310out:
2311	return ret;
2312}
2313EXPORT_SYMBOL(block_page_mkwrite);
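
/*
 * Illustrative sketch (not part of the original file): hooking
 * block_page_mkwrite() up as the ->page_mkwrite handler of a file mapping,
 * with filemap_fault() as the usual ->fault.  myfs_get_block() and the
 * myfs_* names are hypothetical.
 */
static int myfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	return block_page_mkwrite(vma, vmf, myfs_get_block);
}

static const struct vm_operations_struct myfs_file_vm_ops = {
	.fault		= filemap_fault,
	.page_mkwrite	= myfs_page_mkwrite,
};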
2314
2315/*
2316 * nobh_write_begin()'s prereads are special: the buffer_heads are freed
2317 * immediately, while under the page lock.  So it needs a special end_io
2318 * handler which does not touch the bh after unlocking it.
2319 */
2320static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
2321{
2322	__end_buffer_read_notouch(bh, uptodate);
2323}
2324
2325/*
2326 * Attach the singly-linked list of buffers created by nobh_write_begin to
2327 * the page (converting it to a circular linked list and taking care of page
2328 * dirty races).
2329 */
2330static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2331{
2332	struct buffer_head *bh;
2333
2334	BUG_ON(!PageLocked(page));
2335
2336	spin_lock(&page->mapping->private_lock);
2337	bh = head;
2338	do {
2339		if (PageDirty(page))
2340			set_buffer_dirty(bh);
2341		if (!bh->b_this_page)
2342			bh->b_this_page = head;
2343		bh = bh->b_this_page;
2344	} while (bh != head);
2345	attach_page_buffers(page, head);
2346	spin_unlock(&page->mapping->private_lock);
2347}
2348
2349/*
2350 * On entry, the page is fully not uptodate.
2351 * On exit, the page is fully uptodate in the areas outside (from,to).
2352 * The filesystem needs to handle block truncation upon failure.
2353 */
2354int nobh_write_begin(struct address_space *mapping,
2355			loff_t pos, unsigned len, unsigned flags,
2356			struct page **pagep, void **fsdata,
2357			get_block_t *get_block)
2358{
2359	struct inode *inode = mapping->host;
2360	const unsigned blkbits = inode->i_blkbits;
2361	const unsigned blocksize = 1 << blkbits;
2362	struct buffer_head *head, *bh;
2363	struct page *page;
2364	pgoff_t index;
2365	unsigned from, to;
2366	unsigned block_in_page;
2367	unsigned block_start, block_end;
2368	sector_t block_in_file;
2369	int nr_reads = 0;
2370	int ret = 0;
2371	int is_mapped_to_disk = 1;
2372
2373	index = pos >> PAGE_CACHE_SHIFT;
2374	from = pos & (PAGE_CACHE_SIZE - 1);
2375	to = from + len;
2376
2377	page = grab_cache_page_write_begin(mapping, index, flags);
2378	if (!page)
2379		return -ENOMEM;
2380	*pagep = page;
2381	*fsdata = NULL;
2382
2383	if (page_has_buffers(page)) {
2384		unlock_page(page);
2385		page_cache_release(page);
2386		*pagep = NULL;
2387		return block_write_begin(mapping, pos, len, flags, pagep,
2388					 get_block);
2389	}
2390
2391	if (PageMappedToDisk(page))
2392		return 0;
2393
2394	/*
2395	 * Allocate buffers so that we can keep track of state, and potentially
2396	 * attach them to the page if an error occurs. In the common case of
2397	 * no error, they will just be freed again without ever being attached
2398	 * to the page (which is all OK, because we're under the page lock).
2399	 *
2400	 * Be careful: the buffer linked list is a NULL terminated one, rather
2401	 * than the circular one we're used to.
2402	 */
2403	head = alloc_page_buffers(page, blocksize, 0);
2404	if (!head) {
2405		ret = -ENOMEM;
2406		goto out_release;
2407	}
2408
2409	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2410
2411	/*
2412	 * We loop across all blocks in the page, whether or not they are
2413	 * part of the affected region.  This is so we can discover if the
2414	 * page is fully mapped-to-disk.
2415	 */
2416	for (block_start = 0, block_in_page = 0, bh = head;
2417		  block_start < PAGE_CACHE_SIZE;
2418		  block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
2419		int create;
2420
2421		block_end = block_start + blocksize;
2422		bh->b_state = 0;
2423		create = 1;
2424		if (block_start >= to)
2425			create = 0;
2426		ret = get_block(inode, block_in_file + block_in_page,
2427					bh, create);
2428		if (ret)
2429			goto failed;
2430		if (!buffer_mapped(bh))
2431			is_mapped_to_disk = 0;
2432		if (buffer_new(bh))
2433			unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
2434		if (PageUptodate(page)) {
2435			set_buffer_uptodate(bh);
2436			continue;
2437		}
2438		if (buffer_new(bh) || !buffer_mapped(bh)) {
2439			zero_user_segments(page, block_start, from,
2440							to, block_end);
2441			continue;
2442		}
2443		if (buffer_uptodate(bh))
2444			continue;	/* reiserfs does this */
2445		if (block_start < from || block_end > to) {
2446			lock_buffer(bh);
2447			bh->b_end_io = end_buffer_read_nobh;
2448			submit_bh(READ, bh);
2449			nr_reads++;
2450		}
2451	}
2452
2453	if (nr_reads) {
2454		/*
2455		 * The page is locked, so these buffers are protected from
2456		 * any VM or truncate activity.  Hence we don't need to care
2457		 * about the buffer_head refcounts.
2458		 */
2459		for (bh = head; bh; bh = bh->b_this_page) {
2460			wait_on_buffer(bh);
2461			if (!buffer_uptodate(bh))
2462				ret = -EIO;
2463		}
2464		if (ret)
2465			goto failed;
2466	}
2467
2468	if (is_mapped_to_disk)
2469		SetPageMappedToDisk(page);
2470
2471	*fsdata = head; /* to be released by nobh_write_end */
2472
2473	return 0;
2474
2475failed:
2476	BUG_ON(!ret);
2477	/*
2478	 * Error recovery is a bit difficult. We need to zero out blocks that
2479	 * were newly allocated, and dirty them to ensure they get written out.
2480	 * Buffers need to be attached to the page at this point, otherwise
2481	 * the handling of potential IO errors during writeout would be hard
2482	 * (could try doing synchronous writeout, but what if that fails too?)
2483	 */
2484	attach_nobh_buffers(page, head);
2485	page_zero_new_buffers(page, from, to);
2486
2487out_release:
2488	unlock_page(page);
2489	page_cache_release(page);
2490	*pagep = NULL;
2491
2492	return ret;
2493}
2494EXPORT_SYMBOL(nobh_write_begin);
2495
2496int nobh_write_end(struct file *file, struct address_space *mapping,
2497			loff_t pos, unsigned len, unsigned copied,
2498			struct page *page, void *fsdata)
2499{
2500	struct inode *inode = page->mapping->host;
2501	struct buffer_head *head = fsdata;
2502	struct buffer_head *bh;
2503	BUG_ON(fsdata != NULL && page_has_buffers(page));
2504
2505	if (unlikely(copied < len) && head)
2506		attach_nobh_buffers(page, head);
2507	if (page_has_buffers(page))
2508		return generic_write_end(file, mapping, pos, len,
2509					copied, page, fsdata);
2510
2511	SetPageUptodate(page);
2512	set_page_dirty(page);
2513	if (pos+copied > inode->i_size) {
2514		i_size_write(inode, pos+copied);
2515		mark_inode_dirty(inode);
2516	}
2517
2518	unlock_page(page);
2519	page_cache_release(page);
2520
2521	while (head) {
2522		bh = head;
2523		head = head->b_this_page;
2524		free_buffer_head(bh);
2525	}
2526
2527	return copied;
2528}
2529EXPORT_SYMBOL(nobh_write_end);
2530
2531/*
2532 * nobh_writepage() - based on block_write_full_page() except
2533 * that it tries to operate without attaching bufferheads to
2534 * the page.
2535 */
2536int nobh_writepage(struct page *page, get_block_t *get_block,
2537			struct writeback_control *wbc)
2538{
2539	struct inode * const inode = page->mapping->host;
2540	loff_t i_size = i_size_read(inode);
2541	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2542	unsigned offset;
2543	int ret;
2544
2545	/* Is the page fully inside i_size? */
2546	if (page->index < end_index)
2547		goto out;
2548
2549	/* Is the page fully outside i_size? (truncate in progress) */
2550	offset = i_size & (PAGE_CACHE_SIZE-1);
2551	if (page->index >= end_index+1 || !offset) {
2552		/*
2553		 * The page may have dirty, unmapped buffers.  For example,
2554		 * they may have been added in ext3_writepage().  Make them
2555		 * freeable here, so the page does not leak.
2556		 */
2557		unlock_page(page);
2558		return 0; /* don't care */
2559	}
2560
2561	/*
2562	 * The page straddles i_size.  It must be zeroed out on each and every
2563	 * writepage invocation because it may be mmapped.  "A file is mapped
2564	 * in multiples of the page size.  For a file that is not a multiple of
2565	 * the  page size, the remaining memory is zeroed when mapped, and
2566	 * writes to that region are not written out to the file."
2567	 */
2568	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2569out:
2570	ret = mpage_writepage(page, get_block, wbc);
2571	if (ret == -EAGAIN)
2572		ret = __block_write_full_page(inode, page, get_block, wbc,
2573					      end_buffer_async_write);
2574	return ret;
2575}
2576EXPORT_SYMBOL(nobh_writepage);
2577
2578int nobh_truncate_page(struct address_space *mapping,
2579			loff_t from, get_block_t *get_block)
2580{
2581	pgoff_t index = from >> PAGE_CACHE_SHIFT;
2582	unsigned offset = from & (PAGE_CACHE_SIZE-1);
2583	unsigned blocksize;
2584	sector_t iblock;
2585	unsigned length, pos;
2586	struct inode *inode = mapping->host;
2587	struct page *page;
2588	struct buffer_head map_bh;
2589	int err;
2590
2591	blocksize = 1 << inode->i_blkbits;
2592	length = offset & (blocksize - 1);
2593
2594	/* Block boundary? Nothing to do */
2595	if (!length)
2596		return 0;
2597
2598	length = blocksize - length;
2599	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2600
2601	page = grab_cache_page(mapping, index);
2602	err = -ENOMEM;
2603	if (!page)
2604		goto out;
2605
2606	if (page_has_buffers(page)) {
2607has_buffers:
2608		unlock_page(page);
2609		page_cache_release(page);
2610		return block_truncate_page(mapping, from, get_block);
2611	}
2612
2613	/* Find the buffer that contains "offset" */
2614	pos = blocksize;
2615	while (offset >= pos) {
2616		iblock++;
2617		pos += blocksize;
2618	}
2619
2620	map_bh.b_size = blocksize;
2621	map_bh.b_state = 0;
2622	err = get_block(inode, iblock, &map_bh, 0);
2623	if (err)
2624		goto unlock;
2625	/* unmapped? It's a hole - nothing to do */
2626	if (!buffer_mapped(&map_bh))
2627		goto unlock;
2628
2629	/* Ok, it's mapped. Make sure it's up-to-date */
2630	if (!PageUptodate(page)) {
2631		err = mapping->a_ops->readpage(NULL, page);
2632		if (err) {
2633			page_cache_release(page);
2634			goto out;
2635		}
2636		lock_page(page);
2637		if (!PageUptodate(page)) {
2638			err = -EIO;
2639			goto unlock;
2640		}
2641		if (page_has_buffers(page))
2642			goto has_buffers;
2643	}
2644	zero_user(page, offset, length);
2645	set_page_dirty(page);
2646	err = 0;
2647
2648unlock:
2649	unlock_page(page);
2650	page_cache_release(page);
2651out:
2652	return err;
2653}
2654EXPORT_SYMBOL(nobh_truncate_page);
2655
2656int block_truncate_page(struct address_space *mapping,
2657			loff_t from, get_block_t *get_block)
2658{
2659	pgoff_t index = from >> PAGE_CACHE_SHIFT;
2660	unsigned offset = from & (PAGE_CACHE_SIZE-1);
2661	unsigned blocksize;
2662	sector_t iblock;
2663	unsigned length, pos;
2664	struct inode *inode = mapping->host;
2665	struct page *page;
2666	struct buffer_head *bh;
2667	int err;
2668
2669	blocksize = 1 << inode->i_blkbits;
2670	length = offset & (blocksize - 1);
2671
2672	/* Block boundary? Nothing to do */
2673	if (!length)
2674		return 0;
2675
2676	length = blocksize - length;
2677	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2678
2679	page = grab_cache_page(mapping, index);
2680	err = -ENOMEM;
2681	if (!page)
2682		goto out;
2683
2684	if (!page_has_buffers(page))
2685		create_empty_buffers(page, blocksize, 0);
2686
2687	/* Find the buffer that contains "offset" */
2688	bh = page_buffers(page);
2689	pos = blocksize;
2690	while (offset >= pos) {
2691		bh = bh->b_this_page;
2692		iblock++;
2693		pos += blocksize;
2694	}
2695
2696	err = 0;
2697	if (!buffer_mapped(bh)) {
2698		WARN_ON(bh->b_size != blocksize);
2699		err = get_block(inode, iblock, bh, 0);
2700		if (err)
2701			goto unlock;
2702		/* unmapped? It's a hole - nothing to do */
2703		if (!buffer_mapped(bh))
2704			goto unlock;
2705	}
2706
2707	/* Ok, it's mapped. Make sure it's up-to-date */
2708	if (PageUptodate(page))
2709		set_buffer_uptodate(bh);
2710
2711	if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2712		err = -EIO;
2713		ll_rw_block(READ, 1, &bh);
2714		wait_on_buffer(bh);
2715		/* Uhhuh. Read error. Complain and punt. */
2716		if (!buffer_uptodate(bh))
2717			goto unlock;
2718	}
2719
2720	zero_user(page, offset, length);
2721	mark_buffer_dirty(bh);
2722	err = 0;
2723
2724unlock:
2725	unlock_page(page);
2726	page_cache_release(page);
2727out:
2728	return err;
2729}
2730EXPORT_SYMBOL(block_truncate_page);
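
/*
 * Illustrative sketch (not part of the original file): on a shrinking
 * truncate, the partial block at the new EOF is zeroed with
 * block_truncate_page() before the filesystem frees the blocks beyond it.
 * myfs_get_block() and myfs_free_blocks_beyond() are hypothetical.
 */
static int myfs_shrink(struct inode *inode, loff_t newsize)
{
	int err;

	err = block_truncate_page(inode->i_mapping, newsize, myfs_get_block);
	if (err)
		return err;
	/* hypothetical: update i_size and release the now-unused blocks */
	myfs_free_blocks_beyond(inode, newsize);
	return 0;
}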
2731
2732/*
2733 * The generic ->writepage function for buffer-backed address_spaces;
2734 * this form passes in the end_io handler used to finish the IO.
2735 */
2736int block_write_full_page_endio(struct page *page, get_block_t *get_block,
2737			struct writeback_control *wbc, bh_end_io_t *handler)
2738{
2739	struct inode * const inode = page->mapping->host;
2740	loff_t i_size = i_size_read(inode);
2741	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2742	unsigned offset;
2743
2744	/* Is the page fully inside i_size? */
2745	if (page->index < end_index)
2746		return __block_write_full_page(inode, page, get_block, wbc,
2747					       handler);
2748
2749	/* Is the page fully outside i_size? (truncate in progress) */
2750	offset = i_size & (PAGE_CACHE_SIZE-1);
2751	if (page->index >= end_index+1 || !offset) {
2752		/*
2753		 * The page may have dirty, unmapped buffers.  For example,
2754		 * they may have been added in ext3_writepage().  Make them
2755		 * freeable here, so the page does not leak.
2756		 */
2757		do_invalidatepage(page, 0);
2758		unlock_page(page);
2759		return 0; /* don't care */
2760	}
2761
2762	/*
2763	 * The page straddles i_size.  It must be zeroed out on each and every
2764	 * writepage invocation because it may be mmapped.  "A file is mapped
2765	 * in multiples of the page size.  For a file that is not a multiple of
2766	 * the  page size, the remaining memory is zeroed when mapped, and
2767	 * writes to that region are not written out to the file."
2768	 */
2769	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2770	return __block_write_full_page(inode, page, get_block, wbc, handler);
2771}
2772EXPORT_SYMBOL(block_write_full_page_endio);
2773
2774/*
2775 * The generic ->writepage function for buffer-backed address_spaces
2776 */
2777int block_write_full_page(struct page *page, get_block_t *get_block,
2778			struct writeback_control *wbc)
2779{
2780	return block_write_full_page_endio(page, get_block, wbc,
2781					   end_buffer_async_write);
2782}
2783EXPORT_SYMBOL(block_write_full_page);
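
/*
 * Illustrative sketch (not part of the original file): the matching
 * ->writepage wrapper referenced in the aops sketch earlier.
 * myfs_get_block() is hypothetical.
 */
static int myfs_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, myfs_get_block, wbc);
}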
2784
2785sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2786			    get_block_t *get_block)
2787{
2788	struct buffer_head tmp;
2789	struct inode *inode = mapping->host;
2790	tmp.b_state = 0;
2791	tmp.b_blocknr = 0;
2792	tmp.b_size = 1 << inode->i_blkbits;
2793	get_block(inode, block, &tmp, 0);
2794	return tmp.b_blocknr;
2795}
2796EXPORT_SYMBOL(generic_block_bmap);
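
/*
 * Illustrative sketch (not part of the original file): exposing the block
 * mapping through ->bmap (used by the FIBMAP ioctl and swap files) is a
 * one-liner on top of generic_block_bmap().  myfs_get_block() is hypothetical.
 */
static sector_t myfs_bmap(struct address_space *mapping, sector_t block)
{
	return generic_block_bmap(mapping, block, myfs_get_block);
}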
2797
2798static void end_bio_bh_io_sync(struct bio *bio, int err)
2799{
2800	struct buffer_head *bh = bio->bi_private;
2801
2802	if (err == -EOPNOTSUPP) {
2803		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2804		set_bit(BH_Eopnotsupp, &bh->b_state);
2805	}
2806
2807	if (unlikely(test_bit(BIO_QUIET, &bio->bi_flags)))
2808		set_bit(BH_Quiet, &bh->b_state);
2809
2810	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2811	bio_put(bio);
2812}
2813
2814int submit_bh(int rw, struct buffer_head * bh)
2815{
2816	struct bio *bio;
2817	int ret = 0;
2818
2819	BUG_ON(!buffer_locked(bh));
2820	BUG_ON(!buffer_mapped(bh));
2821	BUG_ON(!bh->b_end_io);
2822	BUG_ON(buffer_delay(bh));
2823	BUG_ON(buffer_unwritten(bh));
2824
2825	/*
2826	 * Only clear out a write error when rewriting
2827	 */
2828	if (test_set_buffer_req(bh) && (rw & WRITE))
2829		clear_buffer_write_io_error(bh);
2830
2831	/*
2832	 * from here on down, it's all bio -- do the initial mapping,
2833	 * submit_bio -> generic_make_request may further map this bio around
2834	 */
2835	bio = bio_alloc(GFP_NOIO, 1);
2836
2837	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2838	bio->bi_bdev = bh->b_bdev;
2839	bio->bi_io_vec[0].bv_page = bh->b_page;
2840	bio->bi_io_vec[0].bv_len = bh->b_size;
2841	bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2842
2843	bio->bi_vcnt = 1;
2844	bio->bi_idx = 0;
2845	bio->bi_size = bh->b_size;
2846
2847	bio->bi_end_io = end_bio_bh_io_sync;
2848	bio->bi_private = bh;
2849
2850	bio_get(bio);
2851	submit_bio(rw, bio);
2852
2853	if (bio_flagged(bio, BIO_EOPNOTSUPP))
2854		ret = -EOPNOTSUPP;
2855
2856	bio_put(bio);
2857	return ret;
2858}
2859EXPORT_SYMBOL(submit_bh);
2860
2861/**
2862 * ll_rw_block: low-level access to block devices (DEPRECATED)
2863 * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
2864 * @nr: number of &struct buffer_heads in the array
2865 * @bhs: array of pointers to &struct buffer_head
2866 *
2867 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
2868 * requests an I/O operation on them, either a %READ or a %WRITE.  The third
2869 * %READA option is described in the documentation for generic_make_request()
2870 * which ll_rw_block() calls.
2871 *
2872 * This function drops any buffer that it cannot get a lock on (with the
2873 * BH_Lock state bit), any buffer that appears to be clean when doing a write
2874 * request, and any buffer that appears to be up-to-date when doing a read
2875 * request.  Further it marks as clean buffers that are processed for
2876 * writing (the buffer cache won't assume that they are actually clean
2877 * until the buffer gets unlocked).
2878 *
2879 * ll_rw_block sets b_end_io to a simple completion handler that marks
2880 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
2881 * any waiters.
2882 *
2883 * All of the buffers must be for the same device, and their size must be a
2884 * multiple of the current approved block size for the device.
2885 */
2886void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2887{
2888	int i;
2889
2890	for (i = 0; i < nr; i++) {
2891		struct buffer_head *bh = bhs[i];
2892
2893		if (!trylock_buffer(bh))
2894			continue;
2895		if (rw == WRITE) {
2896			if (test_clear_buffer_dirty(bh)) {
2897				bh->b_end_io = end_buffer_write_sync;
2898				get_bh(bh);
2899				submit_bh(WRITE, bh);
2900				continue;
2901			}
2902		} else {
2903			if (!buffer_uptodate(bh)) {
2904				bh->b_end_io = end_buffer_read_sync;
2905				get_bh(bh);
2906				submit_bh(rw, bh);
2907				continue;
2908			}
2909		}
2910		unlock_buffer(bh);
2911	}
2912}
2913EXPORT_SYMBOL(ll_rw_block);
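
/*
 * Illustrative sketch (not part of the original file): the classic (now
 * discouraged) ll_rw_block() pattern - kick off readahead on a batch of
 * metadata buffers and drop the local references; whoever later needs one of
 * the blocks waits on that specific buffer.  The myfs_* name is hypothetical.
 */
static void myfs_readahead_blocks(struct super_block *sb, sector_t first, int nr)
{
	struct buffer_head *bhs[16];
	int i, got = 0;

	if (nr > 16)
		nr = 16;
	for (i = 0; i < nr; i++) {
		struct buffer_head *bh = sb_getblk(sb, first + i);

		if (bh && !buffer_uptodate(bh))
			bhs[got++] = bh;
		else if (bh)
			brelse(bh);
	}
	ll_rw_block(READA, got, bhs);
	for (i = 0; i < got; i++)
		brelse(bhs[i]);
}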
2914
2915void write_dirty_buffer(struct buffer_head *bh, int rw)
2916{
2917	lock_buffer(bh);
2918	if (!test_clear_buffer_dirty(bh)) {
2919		unlock_buffer(bh);
2920		return;
2921	}
2922	bh->b_end_io = end_buffer_write_sync;
2923	get_bh(bh);
2924	submit_bh(rw, bh);
2925}
2926EXPORT_SYMBOL(write_dirty_buffer);
2927
2928/*
2929 * For a data-integrity writeout, we need to wait upon any in-progress I/O
2930 * and then start new I/O and then wait upon it.  The caller must have a ref on
2931 * the buffer_head.
2932 */
2933int __sync_dirty_buffer(struct buffer_head *bh, int rw)
2934{
2935	int ret = 0;
2936
2937	WARN_ON(atomic_read(&bh->b_count) < 1);
2938	lock_buffer(bh);
2939	if (test_clear_buffer_dirty(bh)) {
2940		get_bh(bh);
2941		bh->b_end_io = end_buffer_write_sync;
2942		ret = submit_bh(rw, bh);
2943		wait_on_buffer(bh);
2944		if (buffer_eopnotsupp(bh)) {
2945			clear_buffer_eopnotsupp(bh);
2946			ret = -EOPNOTSUPP;
2947		}
2948		if (!ret && !buffer_uptodate(bh))
2949			ret = -EIO;
2950	} else {
2951		unlock_buffer(bh);
2952	}
2953	return ret;
2954}
2955EXPORT_SYMBOL(__sync_dirty_buffer);
2956
2957int sync_dirty_buffer(struct buffer_head *bh)
2958{
2959	return __sync_dirty_buffer(bh, WRITE_SYNC);
2960}
2961EXPORT_SYMBOL(sync_dirty_buffer);
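
/*
 * Illustrative sketch (not part of the original file): synchronously writing
 * a modified on-disk superblock buffer, the typical sync_dirty_buffer()
 * caller.  "sbh" is assumed to be the buffer_head holding the superblock.
 */
static int myfs_commit_super(struct buffer_head *sbh)
{
	mark_buffer_dirty(sbh);
	return sync_dirty_buffer(sbh);	/* waits; returns -EIO on write failure */
}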
2962
2963/*
2964 * try_to_free_buffers() checks if all the buffers on this particular page
2965 * are unused, and releases them if so.
2966 *
2967 * Exclusion against try_to_free_buffers may be obtained by either
2968 * locking the page or by holding its mapping's private_lock.
2969 *
2970 * If the page is dirty but all the buffers are clean then we need to
2971 * be sure to mark the page clean as well.  This is because the page
2972 * may be against a block device, and a later reattachment of buffers
2973 * to a dirty page will set *all* buffers dirty.  Which would corrupt
2974 * filesystem data on the same device.
2975 *
2976 * The same applies to regular filesystem pages: if all the buffers are
2977 * clean then we set the page clean and proceed.  To do that, we require
2978 * total exclusion from __set_page_dirty_buffers().  That is obtained with
2979 * private_lock.
2980 *
2981 * try_to_free_buffers() is non-blocking.
2982 */
2983static inline int buffer_busy(struct buffer_head *bh)
2984{
2985	return atomic_read(&bh->b_count) |
2986		(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
2987}
2988
2989static int
2990drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
2991{
2992	struct buffer_head *head = page_buffers(page);
2993	struct buffer_head *bh;
2994
2995	bh = head;
2996	do {
2997		if (buffer_write_io_error(bh) && page->mapping)
2998			set_bit(AS_EIO, &page->mapping->flags);
2999		if (buffer_busy(bh))
3000			goto failed;
3001		bh = bh->b_this_page;
3002	} while (bh != head);
3003
3004	do {
3005		struct buffer_head *next = bh->b_this_page;
3006
3007		if (bh->b_assoc_map)
3008			__remove_assoc_queue(bh);
3009		bh = next;
3010	} while (bh != head);
3011	*buffers_to_free = head;
3012	__clear_page_buffers(page);
3013	return 1;
3014failed:
3015	return 0;
3016}
3017
3018int try_to_free_buffers(struct page *page)
3019{
3020	struct address_space * const mapping = page->mapping;
3021	struct buffer_head *buffers_to_free = NULL;
3022	int ret = 0;
3023
3024	BUG_ON(!PageLocked(page));
3025	if (PageWriteback(page))
3026		return 0;
3027
3028	if (mapping == NULL) {		/* can this still happen? */
3029		ret = drop_buffers(page, &buffers_to_free);
3030		goto out;
3031	}
3032
3033	spin_lock(&mapping->private_lock);
3034	ret = drop_buffers(page, &buffers_to_free);
3035
3036	/*
3037	 * If the filesystem writes its buffers by hand (eg ext3)
3038	 * then we can have clean buffers against a dirty page.  We
3039	 * clean the page here; otherwise the VM will never notice
3040	 * that the filesystem did any IO at all.
3041	 *
3042	 * Also, during truncate, discard_buffer will have marked all
3043	 * the page's buffers clean.  We discover that here and clean
3044	 * the page also.
3045	 *
3046	 * private_lock must be held over this entire operation in order
3047	 * to synchronise against __set_page_dirty_buffers and prevent the
3048	 * dirty bit from being lost.
3049	 */
3050	if (ret)
3051		cancel_dirty_page(page, PAGE_CACHE_SIZE);
3052	spin_unlock(&mapping->private_lock);
3053out:
3054	if (buffers_to_free) {
3055		struct buffer_head *bh = buffers_to_free;
3056
3057		do {
3058			struct buffer_head *next = bh->b_this_page;
3059			free_buffer_head(bh);
3060			bh = next;
3061		} while (bh != buffers_to_free);
3062	}
3063	return ret;
3064}
3065EXPORT_SYMBOL(try_to_free_buffers);
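
/*
 * Illustrative sketch (not part of the original file): a simple ->releasepage
 * that defers to try_to_free_buffers() once any filesystem-private state has
 * been checked.  The myfs_* name is hypothetical.
 */
static int myfs_releasepage(struct page *page, gfp_t gfp_mask)
{
	if (!page_has_buffers(page))
		return 0;
	/* a real filesystem would refuse here if it still tracks this page */
	return try_to_free_buffers(page);
}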
3066
3067void block_sync_page(struct page *page)
3068{
3069	struct address_space *mapping;
3070
3071	smp_mb();
3072	mapping = page_mapping(page);
3073	if (mapping)
3074		blk_run_backing_dev(mapping->backing_dev_info, page);
3075}
3076EXPORT_SYMBOL(block_sync_page);
3077
3078/*
3079 * There are no bdflush tunables left.  But distributions are
3080 * still running obsolete flush daemons, so we terminate them here.
3081 *
3082 * Use of bdflush() is deprecated and will be removed in a future kernel.
3083 * The `flush-X' kernel threads fully replace bdflush daemons and this call.
3084 */
3085SYSCALL_DEFINE2(bdflush, int, func, long, data)
3086{
3087	static int msg_count;
3088
3089	if (!capable(CAP_SYS_ADMIN))
3090		return -EPERM;
3091
3092	if (msg_count < 5) {
3093		msg_count++;
3094		printk(KERN_INFO
3095			"warning: process `%s' used the obsolete bdflush"
3096			" system call\n", current->comm);
3097		printk(KERN_INFO "Fix your initscripts?\n");
3098	}
3099
3100	if (func == 1)
3101		do_exit(0);
3102	return 0;
3103}
3104
3105/*
3106 * Buffer-head allocation
3107 */
3108static struct kmem_cache *bh_cachep;
3109
3110/*
3111 * Once the number of bh's in the machine exceeds this level, we start
3112 * stripping them in writeback.
3113 */
3114static int max_buffer_heads;
3115
3116int buffer_heads_over_limit;
3117
3118struct bh_accounting {
3119	int nr;			/* Number of live bh's */
3120	int ratelimit;		/* Limit cacheline bouncing */
3121};
3122
3123static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3124
3125static void recalc_bh_state(void)
3126{
3127	int i;
3128	int tot = 0;
3129
3130	if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
3131		return;
3132	__get_cpu_var(bh_accounting).ratelimit = 0;
3133	for_each_online_cpu(i)
3134		tot += per_cpu(bh_accounting, i).nr;
3135	buffer_heads_over_limit = (tot > max_buffer_heads);
3136}
3137
3138struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3139{
3140	struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
3141	if (ret) {
3142		INIT_LIST_HEAD(&ret->b_assoc_buffers);
3143		get_cpu_var(bh_accounting).nr++;
3144		recalc_bh_state();
3145		put_cpu_var(bh_accounting);
3146	}
3147	return ret;
3148}
3149EXPORT_SYMBOL(alloc_buffer_head);
3150
3151void free_buffer_head(struct buffer_head *bh)
3152{
3153	BUG_ON(!list_empty(&bh->b_assoc_buffers));
3154	kmem_cache_free(bh_cachep, bh);
3155	get_cpu_var(bh_accounting).nr--;
3156	recalc_bh_state();
3157	put_cpu_var(bh_accounting);
3158}
3159EXPORT_SYMBOL(free_buffer_head);
3160
3161static void buffer_exit_cpu(int cpu)
3162{
3163	int i;
3164	struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3165
3166	for (i = 0; i < BH_LRU_SIZE; i++) {
3167		brelse(b->bhs[i]);
3168		b->bhs[i] = NULL;
3169	}
3170	get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr;
3171	per_cpu(bh_accounting, cpu).nr = 0;
3172	put_cpu_var(bh_accounting);
3173}
3174
3175static int buffer_cpu_notify(struct notifier_block *self,
3176			      unsigned long action, void *hcpu)
3177{
3178	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
3179		buffer_exit_cpu((unsigned long)hcpu);
3180	return NOTIFY_OK;
3181}
3182
3183/**
3184 * bh_uptodate_or_lock - Test whether the buffer is uptodate
3185 * @bh: struct buffer_head
3186 *
3187 * Return true if the buffer is up-to-date; otherwise return false
3188 * with the buffer locked.
3189 */
3190int bh_uptodate_or_lock(struct buffer_head *bh)
3191{
3192	if (!buffer_uptodate(bh)) {
3193		lock_buffer(bh);
3194		if (!buffer_uptodate(bh))
3195			return 0;
3196		unlock_buffer(bh);
3197	}
3198	return 1;
3199}
3200EXPORT_SYMBOL(bh_uptodate_or_lock);
3201
3202/**
3203 * bh_submit_read - Submit a locked buffer for reading
3204 * @bh: struct buffer_head
3205 *
3206 * Returns zero on success and -EIO on error.
3207 */
3208int bh_submit_read(struct buffer_head *bh)
3209{
3210	BUG_ON(!buffer_locked(bh));
3211
3212	if (buffer_uptodate(bh)) {
3213		unlock_buffer(bh);
3214		return 0;
3215	}
3216
3217	get_bh(bh);
3218	bh->b_end_io = end_buffer_read_sync;
3219	submit_bh(READ, bh);
3220	wait_on_buffer(bh);
3221	if (buffer_uptodate(bh))
3222		return 0;
3223	return -EIO;
3224}
3225EXPORT_SYMBOL(bh_submit_read);
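
/*
 * Illustrative sketch (not part of the original file): the intended pairing
 * of bh_uptodate_or_lock() and bh_submit_read() - read a mapped buffer only
 * if it is not already uptodate, without racing against other readers.
 */
static int myfs_read_buffer(struct buffer_head *bh)
{
	if (bh_uptodate_or_lock(bh))
		return 0;		/* already uptodate, nothing to do */
	return bh_submit_read(bh);	/* bh is locked; submits and waits */
}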
3226
3227void __init buffer_init(void)
3228{
3229	int nrpages;
3230
3231	bh_cachep = kmem_cache_create("buffer_head",
3232			sizeof(struct buffer_head), 0,
3233				(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3234				SLAB_MEM_SPREAD),
3235				NULL);
3236
3237	/*
3238	 * Limit the bh occupancy to 10% of ZONE_NORMAL
3239	 */
3240	nrpages = (nr_free_buffer_pages() * 10) / 100;
3241	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3242	hotcpu_notifier(buffer_cpu_notify, 0);
3243}
3244