/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/blkdev.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "locking.h"
#include "tree-log.h"

#define BTRFS_ROOT_TRANS_TAG 0

static noinline void put_transaction(struct btrfs_transaction *transaction)
{
	WARN_ON(transaction->use_count == 0);
	transaction->use_count--;
	if (transaction->use_count == 0) {
		list_del_init(&transaction->list);
		memset(transaction, 0, sizeof(*transaction));
		kmem_cache_free(btrfs_transaction_cachep, transaction);
	}
}

static noinline void switch_commit_root(struct btrfs_root *root)
{
	free_extent_buffer(root->commit_root);
	root->commit_root = btrfs_root_node(root);
}

/*
 * either allocate a new transaction or hop into the existing one
 */
static noinline int join_transaction(struct btrfs_root *root)
{
	struct btrfs_transaction *cur_trans;
	cur_trans = root->fs_info->running_transaction;
	if (!cur_trans) {
		cur_trans = kmem_cache_alloc(btrfs_transaction_cachep,
					     GFP_NOFS);
		BUG_ON(!cur_trans);
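		/*
		 * this is the first writer in a brand new transaction,
		 * so bump the filesystem generation; it becomes the new
		 * transaction's transid below
		 */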
		root->fs_info->generation++;
		cur_trans->num_writers = 1;
		cur_trans->num_joined = 0;
		cur_trans->transid = root->fs_info->generation;
		init_waitqueue_head(&cur_trans->writer_wait);
		init_waitqueue_head(&cur_trans->commit_wait);
		cur_trans->in_commit = 0;
		cur_trans->blocked = 0;
		cur_trans->use_count = 1;
		cur_trans->commit_done = 0;
		cur_trans->start_time = get_seconds();

		cur_trans->delayed_refs.root = RB_ROOT;
		cur_trans->delayed_refs.num_entries = 0;
		cur_trans->delayed_refs.num_heads_ready = 0;
		cur_trans->delayed_refs.num_heads = 0;
		cur_trans->delayed_refs.flushing = 0;
		cur_trans->delayed_refs.run_delayed_start = 0;
		spin_lock_init(&cur_trans->delayed_refs.lock);

		INIT_LIST_HEAD(&cur_trans->pending_snapshots);
		list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
		extent_io_tree_init(&cur_trans->dirty_pages,
				     root->fs_info->btree_inode->i_mapping,
				     GFP_NOFS);
		spin_lock(&root->fs_info->new_trans_lock);
		root->fs_info->running_transaction = cur_trans;
		spin_unlock(&root->fs_info->new_trans_lock);
	} else {
		cur_trans->num_writers++;
		cur_trans->num_joined++;
	}

	return 0;
}

/*
 * this does all the record keeping required to make sure that a reference
 * counted root is properly recorded in a given transaction.  This is required
 * to make sure the old root from before we joined the transaction is deleted
 * when the transaction commits
 */
static noinline int record_root_in_trans(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root)
{
	if (root->ref_cows && root->last_trans < trans->transid) {
		WARN_ON(root == root->fs_info->extent_root);
		WARN_ON(root->commit_root != root->node);

		radix_tree_tag_set(&root->fs_info->fs_roots_radix,
			   (unsigned long)root->root_key.objectid,
			   BTRFS_ROOT_TRANS_TAG);
		root->last_trans = trans->transid;
		btrfs_init_reloc_root(trans, root);
	}
	return 0;
}

int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	if (!root->ref_cows)
		return 0;

	mutex_lock(&root->fs_info->trans_mutex);
	if (root->last_trans == trans->transid) {
		mutex_unlock(&root->fs_info->trans_mutex);
		return 0;
	}

	record_root_in_trans(trans, root);
	mutex_unlock(&root->fs_info->trans_mutex);
	return 0;
}

/* wait for commit against the current transaction to become unblocked
 * when this is done, it is safe to start a new transaction, but the current
 * transaction might not be fully on disk.
 */
static void wait_current_trans(struct btrfs_root *root)
{
	struct btrfs_transaction *cur_trans;

	cur_trans = root->fs_info->running_transaction;
	if (cur_trans && cur_trans->blocked) {
		DEFINE_WAIT(wait);
		cur_trans->use_count++;
		while (1) {
			prepare_to_wait(&root->fs_info->transaction_wait, &wait,
					TASK_UNINTERRUPTIBLE);
			if (!cur_trans->blocked)
				break;
			mutex_unlock(&root->fs_info->trans_mutex);
			schedule();
			mutex_lock(&root->fs_info->trans_mutex);
		}
		finish_wait(&root->fs_info->transaction_wait, &wait);
		put_transaction(cur_trans);
	}
}

enum btrfs_trans_type {
	TRANS_START,
	TRANS_JOIN,
	TRANS_USERSPACE,
};

static int may_wait_transaction(struct btrfs_root *root, int type)
{
	if (!root->fs_info->log_root_recovering &&
	    ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
	     type == TRANS_USERSPACE))
		return 1;
	return 0;
}

static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
						    u64 num_items, int type)
{
	struct btrfs_trans_handle *h;
	struct btrfs_transaction *cur_trans;
	int retries = 0;
	int ret;
again:
	h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
	if (!h)
		return ERR_PTR(-ENOMEM);

	mutex_lock(&root->fs_info->trans_mutex);
	if (may_wait_transaction(root, type))
		wait_current_trans(root);

	ret = join_transaction(root);
	BUG_ON(ret);

	cur_trans = root->fs_info->running_transaction;
	cur_trans->use_count++;
	mutex_unlock(&root->fs_info->trans_mutex);

	h->transid = cur_trans->transid;
	h->transaction = cur_trans;
	h->blocks_used = 0;
	h->block_group = 0;
	h->bytes_reserved = 0;
	h->delayed_ref_updates = 0;
	h->block_rsv = NULL;
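	/*
	 * the barrier makes sure we see an up to date value of
	 * cur_trans->blocked, which another task may have set after we
	 * dropped trans_mutex above
	 */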
	smp_mb();
	if (cur_trans->blocked && may_wait_transaction(root, type)) {
		btrfs_commit_transaction(h, root);
		goto again;
	}

	if (num_items > 0) {
		ret = btrfs_trans_reserve_metadata(h, root, num_items,
						   &retries);
		if (ret == -EAGAIN) {
			btrfs_commit_transaction(h, root);
			goto again;
		}
		if (ret < 0) {
			btrfs_end_transaction(h, root);
			return ERR_PTR(ret);
		}
	}

	mutex_lock(&root->fs_info->trans_mutex);
	record_root_in_trans(h, root);
	mutex_unlock(&root->fs_info->trans_mutex);

	if (!current->journal_info && type != TRANS_USERSPACE)
		current->journal_info = h;
	return h;
}

struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
						   int num_items)
{
	return start_transaction(root, num_items, TRANS_START);
}

struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
						   int num_blocks)
{
	return start_transaction(root, 0, TRANS_JOIN);
}

struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
							 int num_blocks)
{
	return start_transaction(r, 0, TRANS_USERSPACE);
}
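
/*
 * Illustrative sketch of the expected calling convention (not part of
 * this file's API; error handling abbreviated): reserve space for the
 * items you will touch, do the tree work, then drop the handle.
 *
 *	struct btrfs_trans_handle *trans;
 *
 *	trans = btrfs_start_transaction(root, 1);
 *	if (IS_ERR(trans))
 *		return PTR_ERR(trans);
 *	ret = btrfs_update_inode(trans, root, inode);
 *	btrfs_end_transaction(trans, root);
 */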

/* wait for a transaction commit to be fully complete */
static noinline int wait_for_commit(struct btrfs_root *root,
				    struct btrfs_transaction *commit)
{
	DEFINE_WAIT(wait);
	mutex_lock(&root->fs_info->trans_mutex);
	while (!commit->commit_done) {
		prepare_to_wait(&commit->commit_wait, &wait,
				TASK_UNINTERRUPTIBLE);
		if (commit->commit_done)
			break;
		mutex_unlock(&root->fs_info->trans_mutex);
		schedule();
		mutex_lock(&root->fs_info->trans_mutex);
	}
	mutex_unlock(&root->fs_info->trans_mutex);
	finish_wait(&commit->commit_wait, &wait);
	return 0;
}

void btrfs_throttle(struct btrfs_root *root)
{
	mutex_lock(&root->fs_info->trans_mutex);
	if (!root->fs_info->open_ioctl_trans)
		wait_current_trans(root);
	mutex_unlock(&root->fs_info->trans_mutex);
}

static int should_end_transaction(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root)
{
	int ret;
	ret = btrfs_block_rsv_check(trans, root,
				    &root->fs_info->global_block_rsv, 0, 5);
	return ret ? 1 : 0;
}

int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root)
{
	struct btrfs_transaction *cur_trans = trans->transaction;
	int updates;

	if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
		return 1;

	updates = trans->delayed_ref_updates;
	trans->delayed_ref_updates = 0;
	if (updates)
		btrfs_run_delayed_refs(trans, root, updates);

	return should_end_transaction(trans, root);
}
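
/*
 * Illustrative sketch of how a long-running operation is expected to use
 * btrfs_should_end_transaction: check it between units of work and cycle
 * the handle when a commit is pending.  do_one_unit is hypothetical.
 *
 *	while (more_work) {
 *		do_one_unit(trans, root);
 *		if (btrfs_should_end_transaction(trans, root)) {
 *			btrfs_end_transaction(trans, root);
 *			trans = btrfs_start_transaction(root, 0);
 *			if (IS_ERR(trans))
 *				return PTR_ERR(trans);
 *		}
 *	}
 */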

static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, int throttle)
{
	struct btrfs_transaction *cur_trans = trans->transaction;
	struct btrfs_fs_info *info = root->fs_info;
	int count = 0;

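	/*
	 * push the delayed refs this handle has accumulated, but limit
	 * the number of passes so a single end_transaction call can't
	 * get stuck running refs forever
	 */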
	while (count < 4) {
		unsigned long cur = trans->delayed_ref_updates;
		trans->delayed_ref_updates = 0;
		if (cur &&
		    trans->transaction->delayed_refs.num_heads_ready > 64) {
			trans->delayed_ref_updates = 0;

			/*
			 * do a full flush if the transaction is trying
			 * to close
			 */
			if (trans->transaction->delayed_refs.flushing)
				cur = 0;
			btrfs_run_delayed_refs(trans, root, cur);
		} else {
			break;
		}
		count++;
	}

	btrfs_trans_release_metadata(trans, root);

	if (!root->fs_info->open_ioctl_trans &&
	    should_end_transaction(trans, root))
		trans->transaction->blocked = 1;

	if (cur_trans->blocked && !cur_trans->in_commit) {
		if (throttle)
			return btrfs_commit_transaction(trans, root);
		else
			wake_up_process(info->transaction_kthread);
	}

	mutex_lock(&info->trans_mutex);
	WARN_ON(cur_trans != info->running_transaction);
	WARN_ON(cur_trans->num_writers < 1);
	cur_trans->num_writers--;

	if (waitqueue_active(&cur_trans->writer_wait))
		wake_up(&cur_trans->writer_wait);
	put_transaction(cur_trans);
	mutex_unlock(&info->trans_mutex);

	if (current->journal_info == trans)
		current->journal_info = NULL;
	memset(trans, 0, sizeof(*trans));
	kmem_cache_free(btrfs_trans_handle_cachep, trans);

	if (throttle)
		btrfs_run_delayed_iputs(root);

	return 0;
}

int btrfs_end_transaction(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 0);
}

int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 1);
}

/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are sent to disk but does not wait on them
 */
int btrfs_write_marked_extents(struct btrfs_root *root,
			       struct extent_io_tree *dirty_pages, int mark)
{
	int ret;
	int err = 0;
	int werr = 0;
	struct page *page;
	struct inode *btree_inode = root->fs_info->btree_inode;
	u64 start = 0;
	u64 end;
	unsigned long index;

	while (1) {
		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
					    mark);
		if (ret)
			break;
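		/*
		 * walk the range one page at a time; a page that has
		 * already been written and reclaimed is simply gone from
		 * the page cache, so find_get_page failing is fine
		 */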
		while (start <= end) {
			cond_resched();

			index = start >> PAGE_CACHE_SHIFT;
			start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
			page = find_get_page(btree_inode->i_mapping, index);
			if (!page)
				continue;

			btree_lock_page_hook(page);
			if (!page->mapping) {
				unlock_page(page);
				page_cache_release(page);
				continue;
			}

			if (PageWriteback(page)) {
				if (PageDirty(page))
					wait_on_page_writeback(page);
				else {
					unlock_page(page);
					page_cache_release(page);
					continue;
				}
			}
			err = write_one_page(page, 0);
			if (err)
				werr = err;
			page_cache_release(page);
		}
	}
	if (err)
		werr = err;
	return werr;
}

/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are on disk for transaction or log commit.  We wait
 * on all the pages and clear them from the dirty pages state tree
 */
int btrfs_wait_marked_extents(struct btrfs_root *root,
			      struct extent_io_tree *dirty_pages, int mark)
{
	int ret;
	int err = 0;
	int werr = 0;
	struct page *page;
	struct inode *btree_inode = root->fs_info->btree_inode;
	u64 start = 0;
	u64 end;
	unsigned long index;

	while (1) {
		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
					    mark);
		if (ret)
			break;

		clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
		while (start <= end) {
			index = start >> PAGE_CACHE_SHIFT;
			start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
			page = find_get_page(btree_inode->i_mapping, index);
			if (!page)
				continue;
			if (PageDirty(page)) {
				btree_lock_page_hook(page);
				wait_on_page_writeback(page);
				err = write_one_page(page, 0);
				if (err)
					werr = err;
			}
			wait_on_page_writeback(page);
			page_cache_release(page);
			cond_resched();
		}
	}
	if (err)
		werr = err;
	return werr;
}

/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are on disk for transaction or log commit
 */
int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
				struct extent_io_tree *dirty_pages, int mark)
{
	int ret;
	int ret2;

	ret = btrfs_write_marked_extents(root, dirty_pages, mark);
	ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);

	if (ret)
		return ret;
	return ret2;
}

int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root)
{
	if (!trans || !trans->transaction) {
		struct inode *btree_inode;
		btree_inode = root->fs_info->btree_inode;
		return filemap_write_and_wait(btree_inode->i_mapping);
	}
	return btrfs_write_and_wait_marked_extents(root,
					   &trans->transaction->dirty_pages,
					   EXTENT_DIRTY);
}

/*
 * this is used to update the root pointer in the tree of tree roots.
 *
 * But, in the case of the extent allocation tree, updating the root
 * pointer may allocate blocks which may change the root of the extent
 * allocation tree.
 *
 * So, this loops and repeats and makes sure the cowonly root didn't
 * change while the root pointer was being updated in the metadata.
 */
static int update_cowonly_root(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	int ret;
	u64 old_root_bytenr;
	u64 old_root_used;
	struct btrfs_root *tree_root = root->fs_info->tree_root;

	old_root_used = btrfs_root_used(&root->root_item);
	btrfs_write_dirty_block_groups(trans, root);

	while (1) {
		old_root_bytenr = btrfs_root_bytenr(&root->root_item);
		if (old_root_bytenr == root->node->start &&
		    old_root_used == btrfs_root_used(&root->root_item))
			break;

		btrfs_set_root_node(&root->root_item, root->node);
		ret = btrfs_update_root(trans, tree_root,
					&root->root_key,
					&root->root_item);
		BUG_ON(ret);

		old_root_used = btrfs_root_used(&root->root_item);
		ret = btrfs_write_dirty_block_groups(trans, root);
		BUG_ON(ret);
	}

	if (root != root->fs_info->extent_root)
		switch_commit_root(root);

	return 0;
}

/*
 * update all the cowonly tree roots on disk
 */
static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct list_head *next;
	struct extent_buffer *eb;
	int ret;

	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
	BUG_ON(ret);

	eb = btrfs_lock_root_node(fs_info->tree_root);
	btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb);
	btrfs_tree_unlock(eb);
	free_extent_buffer(eb);

	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
	BUG_ON(ret);

	while (!list_empty(&fs_info->dirty_cowonly_roots)) {
		next = fs_info->dirty_cowonly_roots.next;
		list_del_init(next);
		root = list_entry(next, struct btrfs_root, dirty_list);

		update_cowonly_root(trans, root);
	}

	down_write(&fs_info->extent_commit_sem);
	switch_commit_root(fs_info->extent_root);
	up_write(&fs_info->extent_commit_sem);

	return 0;
}
/*
 * dead roots are old snapshots that need to be deleted.  This adds a
 * given root to the list of dead roots that need to be deleted
 */
int btrfs_add_dead_root(struct btrfs_root *root)
{
	mutex_lock(&root->fs_info->trans_mutex);
	list_add(&root->root_list, &root->fs_info->dead_roots);
	mutex_unlock(&root->fs_info->trans_mutex);
	return 0;
}

/*
 * update the root items of all the fs tree roots that were changed in
 * this transaction.  The cowonly roots are handled separately in
 * commit_cowonly_roots
 */
static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root)
{
	struct btrfs_root *gang[8];
	struct btrfs_fs_info *fs_info = root->fs_info;
	int i;
	int ret;
	int err = 0;

	while (1) {
		ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
						 (void **)gang, 0,
						 ARRAY_SIZE(gang),
						 BTRFS_ROOT_TRANS_TAG);
		if (ret == 0)
			break;
		for (i = 0; i < ret; i++) {
			root = gang[i];
			radix_tree_tag_clear(&fs_info->fs_roots_radix,
					(unsigned long)root->root_key.objectid,
					BTRFS_ROOT_TRANS_TAG);

			btrfs_free_log(trans, root);
			btrfs_update_reloc_root(trans, root);
			btrfs_orphan_commit_root(trans, root);

			if (root->commit_root != root->node) {
				switch_commit_root(root);
				btrfs_set_root_node(&root->root_item,
						    root->node);
			}

			err = btrfs_update_root(trans, fs_info->tree_root,
						&root->root_key,
						&root->root_item);
			if (err)
				break;
		}
	}
	return err;
}

/*
 * defrag a given btree.  If cacheonly == 1, this won't read from the disk,
 * otherwise every leaf in the btree is read and defragged.
 */
int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
{
	struct btrfs_fs_info *info = root->fs_info;
	struct btrfs_trans_handle *trans;
	int ret;
	unsigned long nr;

	if (xchg(&root->defrag_running, 1))
		return 0;

	while (1) {
		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans))
			return PTR_ERR(trans);

		ret = btrfs_defrag_leaves(trans, root, cacheonly);

		nr = trans->blocks_used;
		btrfs_end_transaction(trans, root);
		btrfs_btree_balance_dirty(info->tree_root, nr);
		cond_resched();

		if (root->fs_info->closing || ret != -EAGAIN)
			break;
	}
	root->defrag_running = 0;
	return ret;
}

/*
 * new snapshots need to be created at a very specific time in the
 * transaction commit.  This does the actual creation
 */
static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
				   struct btrfs_fs_info *fs_info,
				   struct btrfs_pending_snapshot *pending)
{
	struct btrfs_key key;
	struct btrfs_root_item *new_root_item;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *root = pending->root;
	struct btrfs_root *parent_root;
	struct inode *parent_inode;
	struct dentry *dentry;
	struct extent_buffer *tmp;
	struct extent_buffer *old;
	int ret;
	int retries = 0;
	u64 to_reserve = 0;
	u64 index = 0;
	u64 objectid;

	new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
	if (!new_root_item) {
		pending->error = -ENOMEM;
		goto fail;
	}

	ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
	if (ret) {
		pending->error = ret;
		goto fail;
	}

	btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
	btrfs_orphan_pre_snapshot(trans, pending, &to_reserve);

	if (to_reserve > 0) {
		ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv,
					  to_reserve, &retries);
		if (ret) {
			pending->error = ret;
			goto fail;
		}
	}

	key.objectid = objectid;
	key.offset = (u64)-1;
	key.type = BTRFS_ROOT_ITEM_KEY;

	trans->block_rsv = &pending->block_rsv;

	dentry = pending->dentry;
	parent_inode = dentry->d_parent->d_inode;
	parent_root = BTRFS_I(parent_inode)->root;
	record_root_in_trans(trans, parent_root);

	/*
	 * insert the directory item
	 */
	ret = btrfs_set_inode_index(parent_inode, &index);
	BUG_ON(ret);
	ret = btrfs_insert_dir_item(trans, parent_root,
				dentry->d_name.name, dentry->d_name.len,
				parent_inode->i_ino, &key,
				BTRFS_FT_DIR, index);
	BUG_ON(ret);

	btrfs_i_size_write(parent_inode, parent_inode->i_size +
					 dentry->d_name.len * 2);
	ret = btrfs_update_inode(trans, parent_root, parent_inode);
	BUG_ON(ret);

	record_root_in_trans(trans, root);
	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));

	old = btrfs_lock_root_node(root);
	btrfs_cow_block(trans, root, old, NULL, 0, &old);
	btrfs_set_lock_blocking(old);

	btrfs_copy_root(trans, root, old, &tmp, objectid);
	btrfs_tree_unlock(old);
	free_extent_buffer(old);

	btrfs_set_root_node(new_root_item, tmp);
	/* record when the snapshot was created in key.offset */
	key.offset = trans->transid;
	ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
	btrfs_tree_unlock(tmp);
	free_extent_buffer(tmp);
	BUG_ON(ret);

	/*
	 * insert root back/forward references
	 */
	ret = btrfs_add_root_ref(trans, tree_root, objectid,
				 parent_root->root_key.objectid,
				 parent_inode->i_ino, index,
				 dentry->d_name.name, dentry->d_name.len);
	BUG_ON(ret);

	key.offset = (u64)-1;
	pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
	BUG_ON(IS_ERR(pending->snap));

	btrfs_reloc_post_snapshot(trans, pending);
	btrfs_orphan_post_snapshot(trans, pending);
fail:
	kfree(new_root_item);
	btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
	return 0;
}

/*
 * create all the snapshots we've scheduled for creation
 */
static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
					     struct btrfs_fs_info *fs_info)
{
	struct btrfs_pending_snapshot *pending;
	struct list_head *head = &trans->transaction->pending_snapshots;
	int ret;

	list_for_each_entry(pending, head, list) {
		ret = create_pending_snapshot(trans, fs_info, pending);
		BUG_ON(ret);
	}
	return 0;
}

static void update_super_roots(struct btrfs_root *root)
{
	struct btrfs_root_item *root_item;
	struct btrfs_super_block *super;

	super = &root->fs_info->super_copy;

	root_item = &root->fs_info->chunk_root->root_item;
	super->chunk_root = root_item->bytenr;
	super->chunk_root_generation = root_item->generation;
	super->chunk_root_level = root_item->level;

	root_item = &root->fs_info->tree_root->root_item;
	super->root = root_item->bytenr;
	super->generation = root_item->generation;
	super->root_level = root_item->level;
}

int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
{
	int ret = 0;
	spin_lock(&info->new_trans_lock);
	if (info->running_transaction)
		ret = info->running_transaction->in_commit;
	spin_unlock(&info->new_trans_lock);
	return ret;
}

int btrfs_transaction_blocked(struct btrfs_fs_info *info)
{
	int ret = 0;
	spin_lock(&info->new_trans_lock);
	if (info->running_transaction)
		ret = info->running_transaction->blocked;
	spin_unlock(&info->new_trans_lock);
	return ret;
}

int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root)
{
	unsigned long joined = 0;
	unsigned long timeout = 1;
	struct btrfs_transaction *cur_trans;
	struct btrfs_transaction *prev_trans = NULL;
	DEFINE_WAIT(wait);
	int ret;
	int should_grow = 0;
	unsigned long now = get_seconds();
	int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);

	btrfs_run_ordered_operations(root, 0);

	/* make a pass through all the delayed refs we have so far;
	 * any running procs may add more while we are here
	 */
	ret = btrfs_run_delayed_refs(trans, root, 0);
	BUG_ON(ret);

	btrfs_trans_release_metadata(trans, root);

	cur_trans = trans->transaction;
	/*
	 * set the flushing flag so procs in this transaction have to
	 * start sending their work down.
	 */
	cur_trans->delayed_refs.flushing = 1;

	ret = btrfs_run_delayed_refs(trans, root, 0);
	BUG_ON(ret);

	mutex_lock(&root->fs_info->trans_mutex);
	if (cur_trans->in_commit) {
		cur_trans->use_count++;
		mutex_unlock(&root->fs_info->trans_mutex);
		btrfs_end_transaction(trans, root);

		ret = wait_for_commit(root, cur_trans);
		BUG_ON(ret);

		mutex_lock(&root->fs_info->trans_mutex);
		put_transaction(cur_trans);
		mutex_unlock(&root->fs_info->trans_mutex);

		return 0;
	}

	trans->transaction->in_commit = 1;
	trans->transaction->blocked = 1;
	if (cur_trans->list.prev != &root->fs_info->trans_list) {
		prev_trans = list_entry(cur_trans->list.prev,
					struct btrfs_transaction, list);
		if (!prev_trans->commit_done) {
			prev_trans->use_count++;
			mutex_unlock(&root->fs_info->trans_mutex);

			wait_for_commit(root, prev_trans);

			mutex_lock(&root->fs_info->trans_mutex);
			put_transaction(prev_trans);
		}
	}

	if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
		should_grow = 1;

	do {
		int snap_pending = 0;
		joined = cur_trans->num_joined;
		if (!list_empty(&trans->transaction->pending_snapshots))
			snap_pending = 1;

		WARN_ON(cur_trans != trans->transaction);
		if (cur_trans->num_writers > 1)
			timeout = MAX_SCHEDULE_TIMEOUT;
		else if (should_grow)
			timeout = 1;

		mutex_unlock(&root->fs_info->trans_mutex);

		if (flush_on_commit || snap_pending) {
			btrfs_start_delalloc_inodes(root, 1);
			ret = btrfs_wait_ordered_extents(root, 0, 1);
			BUG_ON(ret);
		}

		/*
		 * renames don't use btrfs_join_transaction, so once we
		 * set the transaction to blocked above, we aren't going
		 * to get any new ordered operations.  We can safely run
		 * it here and know for sure that nothing new will be
		 * added to the list
		 */
		btrfs_run_ordered_operations(root, 1);

		prepare_to_wait(&cur_trans->writer_wait, &wait,
				TASK_UNINTERRUPTIBLE);

		smp_mb();
		if (cur_trans->num_writers > 1 || should_grow)
			schedule_timeout(timeout);

		mutex_lock(&root->fs_info->trans_mutex);
		finish_wait(&cur_trans->writer_wait, &wait);
	} while (cur_trans->num_writers > 1 ||
		 (should_grow && cur_trans->num_joined != joined));

	ret = create_pending_snapshots(trans, root->fs_info);
	BUG_ON(ret);

	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
	BUG_ON(ret);

	WARN_ON(cur_trans != trans->transaction);

	/* commit_fs_roots and commit_cowonly_roots below are responsible
	 * for getting the various roots consistent with each other.  Every
	 * pointer in the tree of tree roots has to point to the most up to
	 * date root for every subvolume and other tree.  So, we have to
	 * keep the tree logging code from jumping in and changing any
	 * of the trees.
	 *
	 * At this point in the commit, there can't be any tree-log
	 * writers, but a little lower down we drop the trans mutex
	 * and let new people in.  By holding the tree_log_mutex
	 * from now until after the super is written, we avoid races
	 * with the tree-log code.
	 */
	mutex_lock(&root->fs_info->tree_log_mutex);

	ret = commit_fs_roots(trans, root);
	BUG_ON(ret);

	/* commit_fs_roots gets rid of all the tree log roots; it is now
	 * safe to free the root of tree log roots
	 */
	btrfs_free_log_root_tree(trans, root->fs_info);

	ret = commit_cowonly_roots(trans, root);
	BUG_ON(ret);

	btrfs_prepare_extent_commit(trans, root);

	cur_trans = root->fs_info->running_transaction;
	spin_lock(&root->fs_info->new_trans_lock);
	root->fs_info->running_transaction = NULL;
	spin_unlock(&root->fs_info->new_trans_lock);

	btrfs_set_root_node(&root->fs_info->tree_root->root_item,
			    root->fs_info->tree_root->node);
	switch_commit_root(root->fs_info->tree_root);

	btrfs_set_root_node(&root->fs_info->chunk_root->root_item,
			    root->fs_info->chunk_root->node);
	switch_commit_root(root->fs_info->chunk_root);

	update_super_roots(root);

	if (!root->fs_info->log_root_recovering) {
		btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
		btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0);
	}

	memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
	       sizeof(root->fs_info->super_copy));

	trans->transaction->blocked = 0;

	wake_up(&root->fs_info->transaction_wait);

	mutex_unlock(&root->fs_info->trans_mutex);
	ret = btrfs_write_and_wait_transaction(trans, root);
	BUG_ON(ret);
	write_ctree_super(trans, root, 0);

	/*
	 * the super is written; we can safely allow the tree-loggers
	 * to go about their business
	 */
	mutex_unlock(&root->fs_info->tree_log_mutex);

	btrfs_finish_extent_commit(trans, root);

	mutex_lock(&root->fs_info->trans_mutex);

	cur_trans->commit_done = 1;

	root->fs_info->last_trans_committed = cur_trans->transid;

	wake_up(&cur_trans->commit_wait);

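	/*
	 * drop two references: one for this handle, and one for the
	 * running_transaction pointer that was cleared earlier in the
	 * commit
	 */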
	put_transaction(cur_trans);
	put_transaction(cur_trans);

	mutex_unlock(&root->fs_info->trans_mutex);

	if (current->journal_info == trans)
		current->journal_info = NULL;

	kmem_cache_free(btrfs_trans_handle_cachep, trans);

	if (current != root->fs_info->transaction_kthread)
		btrfs_run_delayed_iputs(root);

	return ret;
}
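
/*
 * Illustrative sketch of a full commit from a caller's point of view
 * (not part of this file; error handling abbreviated):
 *
 *	struct btrfs_trans_handle *trans;
 *
 *	trans = btrfs_start_transaction(root, 0);
 *	if (IS_ERR(trans))
 *		return PTR_ERR(trans);
 *	... modify trees ...
 *	ret = btrfs_commit_transaction(trans, root);
 *
 * btrfs_commit_transaction consumes the handle whether or not this task
 * ends up doing the commit itself, so the caller must not end it again.
 */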

/*
 * interface function to delete all the snapshots we have scheduled for deletion
 */
int btrfs_clean_old_snapshots(struct btrfs_root *root)
{
	LIST_HEAD(list);
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->trans_mutex);
	list_splice_init(&fs_info->dead_roots, &list);
	mutex_unlock(&fs_info->trans_mutex);

	while (!list_empty(&list)) {
		root = list_entry(list.next, struct btrfs_root, root_list);
		list_del(&root->root_list);

		if (btrfs_header_backref_rev(root->node) <
		    BTRFS_MIXED_BACKREF_REV)
			btrfs_drop_snapshot(root, NULL, 0);
		else
			btrfs_drop_snapshot(root, NULL, 1);
	}
	return 0;
}