/*
 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
 */

#include <linux/time.h>
#include <linux/reiserfs_fs.h>
#include <linux/reiserfs_acl.h>
#include <linux/reiserfs_xattr.h>
#include <asm/uaccess.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/quotaops.h>

/*
** We pack the tails of files on file close, not at the time they are written.
** This implies an unnecessary copy of the tail and an unnecessary indirect item
** insertion/balancing, for files that are written in one write.
** It avoids unnecessary tail packings (balances) for files that are written in
** multiple writes and are small enough to have tails.
**
** file_release is called by the VFS layer when the file is closed.  If
** this is the last open file descriptor, and the file is
** small enough to have a tail, and the tail is currently in an
** unformatted node, the tail is converted back into a direct item.
**
** We use reiserfs_truncate_file to pack the tail, since it already has
** all the conditions coded.
*/
static int reiserfs_file_release(struct inode *inode, struct file *filp)
{
	struct reiserfs_transaction_handle th;
	int err;
	int jbegin_failure = 0;

	BUG_ON(!S_ISREG(inode->i_mode));

	/* fast out for when nothing needs to be done */
	if ((atomic_read(&inode->i_count) > 1 ||
	     !(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) ||
	     !tail_has_to_be_packed(inode)) &&
	    REISERFS_I(inode)->i_prealloc_count <= 0) {
		return 0;
	}

	mutex_lock(&inode->i_mutex);

	mutex_lock(&(REISERFS_I(inode)->i_mmap));
	if (REISERFS_I(inode)->i_flags & i_ever_mapped)
		REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;

	reiserfs_write_lock(inode->i_sb);
	/* freeing preallocation only involves relogging blocks that
	 * are already in the current transaction.  preallocation gets
	 * freed at the end of each transaction, so it is impossible for
	 * us to log any additional blocks (including quota blocks)
	 */
	err = journal_begin(&th, inode->i_sb, 1);
	if (err) {
		/* uh oh, we can't allow the inode to go away while there
		 * are still preallocation blocks pending.  Try to join the
		 * aborted transaction
		 */
		jbegin_failure = err;
		err = journal_join_abort(&th, inode->i_sb, 1);

		if (err) {
			/* hmpf, our choices here aren't good.  We can pin the inode
			 * which will disallow unmount from ever happening, we can
			 * do nothing, which will corrupt random memory on unmount,
			 * or we can forcibly remove the file from the preallocation
			 * list, which will leak blocks on disk.  Let's pin the inode
			 * and let the admin know what is going on.
			 */
			igrab(inode);
			reiserfs_warning(inode->i_sb,
					 "pinning inode %lu because the "
					 "preallocation can't be freed",
					 inode->i_ino);
			goto out;
		}
	}
	reiserfs_update_inode_transaction(inode);

#ifdef REISERFS_PREALLOCATE
	reiserfs_discard_prealloc(&th, inode);
#endif
	err = journal_end(&th, inode->i_sb, 1);

	/* copy back the error code from journal_begin */
	if (!err)
		err = jbegin_failure;

	if (!err && atomic_read(&inode->i_count) <= 1 &&
	    (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) &&
	    tail_has_to_be_packed(inode)) {
		/* if regular file is released by last holder and it has been
		   appended (we append by unformatted node only) or its direct
		   item(s) had to be converted, then it may have to be
		   indirect2direct converted */
		err = reiserfs_truncate_file(inode, 0);
	}
      out:
	mutex_unlock(&(REISERFS_I(inode)->i_mmap));
	mutex_unlock(&inode->i_mutex);
	reiserfs_write_unlock(inode->i_sb);
	return err;
}

static int reiserfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct inode *inode;

	inode = file->f_path.dentry->d_inode;
	mutex_lock(&(REISERFS_I(inode)->i_mmap));
	REISERFS_I(inode)->i_flags |= i_ever_mapped;
	mutex_unlock(&(REISERFS_I(inode)->i_mmap));

	return generic_file_mmap(file, vma);
}

static void reiserfs_vfs_truncate_file(struct inode *inode)
{
	reiserfs_truncate_file(inode, 1);
}

/* Sync a reiserfs file. */
static int reiserfs_sync_file(struct file *p_s_filp,
			      struct dentry *p_s_dentry, int datasync)
{
	struct inode *p_s_inode = p_s_dentry->d_inode;
	int n_err;
	int barrier_done;

	BUG_ON(!S_ISREG(p_s_inode->i_mode));
	n_err = sync_mapping_buffers(p_s_inode->i_mapping);
	reiserfs_write_lock(p_s_inode->i_sb);
	barrier_done = reiserfs_commit_for_inode(p_s_inode);
	reiserfs_write_unlock(p_s_inode->i_sb);
	if (barrier_done != 1 && reiserfs_barrier_flush(p_s_inode->i_sb))
		blkdev_issue_flush(p_s_inode->i_sb->s_bdev, NULL);
	if (barrier_done < 0)
		return barrier_done;
	return (n_err < 0) ? -EIO : 0;
}

/* I really do not want to play with memory shortage right now, so
   to simplify the code, we are not going to write more than this many pages at
   a time. This should still considerably improve performance compared to the
   4k-at-a-time case. This is 32 pages of 4k size. */
#define REISERFS_WRITE_PAGES_AT_A_TIME ((128 * 1024) / PAGE_CACHE_SIZE)
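
/*
 * A worked illustration of the macro above (hedged; it assumes the common
 * PAGE_CACHE_SIZE of 4096 bytes, matching the "32 pages of 4k size" note):
 * (128 * 1024) / 4096 == 32 pages per iteration.  On a hypothetical 64k-page
 * configuration the same expression would evaluate to 2 pages.
 */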

/* Allocates blocks for a file to fulfil a write request.
   Maps all unmapped but prepared pages from the list.
   Updates metadata with newly allocated block numbers as needed */
static int reiserfs_allocate_blocks_for_region(struct reiserfs_transaction_handle *th, struct inode *inode,	/* Inode we work with */
					       loff_t pos,	/* Writing position */
					       int num_pages,	/* number of pages the write
								   is going to touch */
					       int write_bytes,	/* number of bytes to write */
					       struct page **prepared_pages,	/* array of
										   prepared pages
										 */
					       int blocks_to_allocate	/* Number of blocks we
									   need to allocate to
									   fit the data into the
									   file */
    )
{
	struct cpu_key key;	// cpu key of item that we are going to deal with
	struct item_head *ih;	// pointer to item head that we are going to deal with
	struct buffer_head *bh;	// Buffer head that contains items that we are going to deal with
	__le32 *item;		// pointer to item we are going to deal with
	INITIALIZE_PATH(path);	// path to item that we are going to deal with.
	b_blocknr_t *allocated_blocks;	// Pointer to a place where allocated block numbers will be stored.
	reiserfs_blocknr_hint_t hint;	// hint structure for block allocator.
	int res;		// return value of various functions that we call.
	int curr_block;		// current block used to keep track of unmapped blocks.
	int i;			// loop counter
	int itempos;		// position in item
	unsigned int from = (pos & (PAGE_CACHE_SIZE - 1));	// writing position in
	// first page
	unsigned int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1;	/* last modified byte offset in last page */
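	/*
	 * Worked example (hypothetical numbers, 4k pages): pos == 10000 and
	 * write_bytes == 3000 give from == 10000 & 4095 == 1808 (offset of
	 * the first written byte within the first page) and
	 * to == (12999 & 4095) + 1 == 712 (one past the last written byte
	 * within the last page).
	 */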
	__u64 hole_size;	// amount of blocks for a file hole, if it needs to be created.
	int modifying_this_item = 0;	// Flag for items traversal code to keep track
	// of the fact that we already prepared
	// current block for journal
	int will_prealloc = 0;
	RFALSE(!blocks_to_allocate,
	       "green-9004: tried to allocate zero blocks?");

	/* only preallocate if this is a small write */
	if (REISERFS_I(inode)->i_prealloc_count ||
	    (!(write_bytes & (inode->i_sb->s_blocksize - 1)) &&
	     blocks_to_allocate <
	     REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize))
		will_prealloc =
		    REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize;

	allocated_blocks = kmalloc((blocks_to_allocate + will_prealloc) *
				   sizeof(b_blocknr_t), GFP_NOFS);
	if (!allocated_blocks)
		return -ENOMEM;

	/* First we compose a key to point at the writing position; we want to do
	   that outside of any locking region. */
	make_cpu_key(&key, inode, pos + 1, TYPE_ANY, 3 /*key length */ );
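	/*
	 * Note: reiserfs keys use 1-based byte offsets (the first byte of a
	 * file body has key offset 1), which is why we key on pos + 1 here;
	 * e.g. a write at file position 0 looks up key offset 1.
	 */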

	/* If we came here, it means we absolutely need to open a transaction,
	   since we need to allocate some blocks */
	reiserfs_write_lock(inode->i_sb);	// Journaling stuff, and we need that.
	res = journal_begin(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1 + 2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb));	// Wish I knew if this number is enough
	if (res)
		goto error_exit;
	reiserfs_update_inode_transaction(inode);

	/* Look for the in-tree position of our write; we need the path for the
	   block allocator */
	res = search_for_position_by_key(inode->i_sb, &key, &path);
	if (res == IO_ERROR) {
		res = -EIO;
		goto error_exit;
	}

	/* Allocate blocks */
	/* First fill in "hint" structure for block allocator */
	hint.th = th;		// transaction handle.
	hint.path = &path;	// Path, so that block allocator can determine packing locality or whatever it needs to determine.
	hint.inode = inode;	// Inode is needed by block allocator too.
	hint.search_start = 0;	// We have no hint on where to search free blocks for block allocator.
	hint.key = key.on_disk_key;	// on disk key of file.
	hint.block = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);	// Number of disk blocks this file occupies already.
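	/*
	 * i_blocks counts 512-byte sectors, so the shift above converts it
	 * to filesystem blocks.  For example, with a 4k blocksize
	 * (s_blocksize_bits == 12) a file with i_blocks == 80 occupies
	 * 80 >> 3 == 10 filesystem blocks.
	 */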
	hint.formatted_node = 0;	// We are allocating blocks for an unformatted node.
	hint.preallocate = will_prealloc;

	/* Call block allocator to allocate blocks */
	res =
	    reiserfs_allocate_blocknrs(&hint, allocated_blocks,
				       blocks_to_allocate, blocks_to_allocate);
	if (res != CARRY_ON) {
		if (res == NO_DISK_SPACE) {
			/* We flush the transaction in case of no space. This way some
			   blocks might become free */
			SB_JOURNAL(inode->i_sb)->j_must_wait = 1;
			res = restart_transaction(th, inode, &path);
			if (res)
				goto error_exit;

			/* We might have scheduled, so search again */
			res =
			    search_for_position_by_key(inode->i_sb, &key,
						       &path);
			if (res == IO_ERROR) {
				res = -EIO;
				goto error_exit;
			}

			/* update changed info for hint structure. */
			res =
			    reiserfs_allocate_blocknrs(&hint, allocated_blocks,
						       blocks_to_allocate,
						       blocks_to_allocate);
			if (res != CARRY_ON) {
				res = res == QUOTA_EXCEEDED ? -EDQUOT : -ENOSPC;
				pathrelse(&path);
				goto error_exit;
			}
		} else {
			res = res == QUOTA_EXCEEDED ? -EDQUOT : -ENOSPC;
			pathrelse(&path);
			goto error_exit;
		}
	}
#ifdef __BIG_ENDIAN
	// Too bad, I have not found any way to convert a given region from
	// cpu format to little endian format
	{
		int i;
		for (i = 0; i < blocks_to_allocate; i++)
			allocated_blocks[i] = cpu_to_le32(allocated_blocks[i]);
	}
#endif

	/* Block allocation might well have scheduled and the tree might have
	   changed, let's search the tree again */
	/* find where in the tree our write should go */
	res = search_for_position_by_key(inode->i_sb, &key, &path);
	if (res == IO_ERROR) {
		res = -EIO;
		goto error_exit_free_blocks;
	}

	bh = get_last_bh(&path);	// Get a bufferhead for last element in path.
	ih = get_ih(&path);	// Get a pointer to last item head in path.
	item = get_item(&path);	// Get a pointer to last item in path

	/* Let's see what we have found */
	if (res != POSITION_FOUND) {	/* position not found, this means that we
					   might need to append the file with
					   holes first */
		// Since we are writing past the file's end, we need to find out if
		// there is a hole that needs to be inserted before our writing
		// position, and how many blocks it is going to cover (we need to
		//  populate pointers to file blocks representing the hole with zeros)

		{
			int item_offset = 1;
			/*
			 * if ih is stat data, its offset is 0 and we don't want to
			 * add 1 to pos in the hole_size calculation
			 */
			if (is_statdata_le_ih(ih))
				item_offset = 0;
			hole_size = (pos + item_offset -
				     (le_key_k_offset
				      (get_inode_item_key_version(inode),
				       &(ih->ih_key)) + op_bytes_number(ih,
									inode->
									i_sb->
									s_blocksize)))
			    >> inode->i_sb->s_blocksize_bits;
		}
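		/*
		 * Worked example (hypothetical, 4k blocks): if the last item
		 * is an indirect item starting at key offset 1 and covering
		 * 8192 bytes (op_bytes_number() == 8192), and we write at
		 * pos == 20480, then hole_size ==
		 * (20480 + 1 - (1 + 8192)) >> 12 == 12288 >> 12 == 3 blocks
		 * of zero pointers to insert before our writing position.
		 */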

		if (hole_size > 0) {
			int to_paste = min_t(__u64, hole_size, MAX_ITEM_LEN(inode->i_sb->s_blocksize) / UNFM_P_SIZE);	// How much data to insert first time.
			/* area filled with zeroes, to supply as a list of zero block
			   numbers. We allocate it outside of the loop just in case the
			   loop would spin for several iterations. */
			char *zeros = kzalloc(to_paste * UNFM_P_SIZE, GFP_ATOMIC);	// We cannot insert more than MAX_ITEM_LEN bytes anyway.
			if (!zeros) {
				res = -ENOMEM;
				goto error_exit_free_blocks;
			}
			do {
				to_paste =
				    min_t(__u64, hole_size,
					  MAX_ITEM_LEN(inode->i_sb->
						       s_blocksize) /
					  UNFM_P_SIZE);
				if (is_indirect_le_ih(ih)) {
					/* Ok, there is an existing indirect item already;
					   we need to append to it */
					/* Calculate position past inserted item */
					make_cpu_key(&key, inode,
						     le_key_k_offset
						     (get_inode_item_key_version
						      (inode),
						      &(ih->ih_key)) +
						     op_bytes_number(ih,
								     inode->
								     i_sb->
								     s_blocksize),
						     TYPE_INDIRECT, 3);
					res =
					    reiserfs_paste_into_item(th, &path,
								     &key,
								     inode,
								     (char *)
								     zeros,
								     UNFM_P_SIZE
								     *
								     to_paste);
					if (res) {
						kfree(zeros);
						goto error_exit_free_blocks;
					}
				} else if (is_statdata_le_ih(ih)) {
					/* No existing item, create it */
					/* item head for new item */
					struct item_head ins_ih;

					/* create a key for our new item */
					make_cpu_key(&key, inode, 1,
						     TYPE_INDIRECT, 3);

					/* Create new item head for our new item */
					make_le_item_head(&ins_ih, &key,
							  key.version, 1,
							  TYPE_INDIRECT,
							  to_paste *
							  UNFM_P_SIZE,
							  0 /* free space */ );

					/* Find where such item should live in the tree */
					res =
					    search_item(inode->i_sb, &key,
							&path);
					if (res != ITEM_NOT_FOUND) {
						/* item should not exist; otherwise we have an error */
						if (res != -ENOSPC) {
							reiserfs_warning(inode->
									 i_sb,
									 "green-9008: search_by_key (%K) returned %d",
									 &key,
									 res);
						}
						res = -EIO;
						kfree(zeros);
						goto error_exit_free_blocks;
					}
					res =
					    reiserfs_insert_item(th, &path,
								 &key, &ins_ih,
								 inode,
								 (char *)zeros);
				} else {
					reiserfs_panic(inode->i_sb,
						       "green-9011: Unexpected key type %K\n",
						       &key);
				}
				if (res) {
					kfree(zeros);
					goto error_exit_free_blocks;
				}
				/* Now we want to check if the transaction is too full, and
				   if it is we restart it. This will also free the path. */
				if (journal_transaction_should_end
				    (th, th->t_blocks_allocated)) {
					inode->i_size = cpu_key_k_offset(&key) +
						(to_paste << inode->i_blkbits);
					res =
					    restart_transaction(th, inode,
								&path);
					if (res) {
						pathrelse(&path);
						kfree(zeros);
						goto error_exit;
					}
				}

				/* Well, need to recalculate path and stuff */
				set_cpu_key_k_offset(&key,
						     cpu_key_k_offset(&key) +
						     (to_paste << inode->
						      i_blkbits));
				res =
				    search_for_position_by_key(inode->i_sb,
							       &key, &path);
				if (res == IO_ERROR) {
					res = -EIO;
					kfree(zeros);
					goto error_exit_free_blocks;
				}
				bh = get_last_bh(&path);
				ih = get_ih(&path);
				item = get_item(&path);
				hole_size -= to_paste;
			} while (hole_size);
			kfree(zeros);
		}
	}
	// Go through existing indirect items first
	// replace all zeroes with block numbers from the list
	// Note that if no corresponding item was found by the previous search,
	// it means there is no existing in-tree representation for the file area
	// we are going to overwrite, so there is nothing to scan through for holes.
	for (curr_block = 0, itempos = path.pos_in_item;
	     curr_block < blocks_to_allocate && res == POSITION_FOUND;) {
	      retry:

		if (itempos >= ih_item_len(ih) / UNFM_P_SIZE) {
			/* We ran out of data in this indirect item; let's look for
			   another one. */
			/* First, if we are already modifying the current item, log it */
			if (modifying_this_item) {
				journal_mark_dirty(th, inode->i_sb, bh);
				modifying_this_item = 0;
			}
			/* Then set the key to look for a new indirect item (offset of the
			   old item is added to the old item length) */
			set_cpu_key_k_offset(&key,
					     le_key_k_offset
					     (get_inode_item_key_version(inode),
					      &(ih->ih_key)) +
					     op_bytes_number(ih,
							     inode->i_sb->
							     s_blocksize));
			/* Search for the position of the new key in the tree. */
			res =
			    search_for_position_by_key(inode->i_sb, &key,
						       &path);
			if (res == IO_ERROR) {
				res = -EIO;
				goto error_exit_free_blocks;
			}
			bh = get_last_bh(&path);
			ih = get_ih(&path);
			item = get_item(&path);
			itempos = path.pos_in_item;
			continue;	// loop to check all kinds of conditions and so on.
		}
		/* Ok, we have the correct position in the item now, so let's see if it
		   is representing a file hole (block number is zero) and fill it if needed */
		if (!item[itempos]) {
			/* Ok, a hole. Now we need to check if we already prepared this
			   block to be journaled */
			while (!modifying_this_item) {	// loop until we succeed
				/* Well, this item is not journaled yet, so we must prepare
				   it for journal first, before we can change it */
				struct item_head tmp_ih;	// We copy item head of found item,
				// here to detect if fs changed under
				// us while we were preparing for
				// journal.
				int fs_gen;	// We store fs generation here to find if someone
				// changes fs under our feet

				copy_item_head(&tmp_ih, ih);	// Remember itemhead
				fs_gen = get_generation(inode->i_sb);	// remember fs generation
				reiserfs_prepare_for_journal(inode->i_sb, bh, 1);	// Prepare a buffer within which indirect item is stored for changing.
				if (fs_changed(fs_gen, inode->i_sb)
				    && item_moved(&tmp_ih, &path)) {
					// Sigh, fs was changed under us, we need to look for new
					// location of item we are working with

					/* unmark the prepared area as journaled and search for
					   its new position */
					reiserfs_restore_prepared_buffer(inode->
									 i_sb,
									 bh);
					res =
					    search_for_position_by_key(inode->
								       i_sb,
								       &key,
								       &path);
					if (res == IO_ERROR) {
						res = -EIO;
						goto error_exit_free_blocks;
					}
					bh = get_last_bh(&path);
					ih = get_ih(&path);
					item = get_item(&path);
					itempos = path.pos_in_item;
					goto retry;
				}
				modifying_this_item = 1;
			}
			item[itempos] = allocated_blocks[curr_block];	// Assign new block
			curr_block++;
		}
		itempos++;
	}

	if (modifying_this_item) {	// We need to log the last-accessed block, if it
		// was modified, but not logged yet.
		journal_mark_dirty(th, inode->i_sb, bh);
	}

	if (curr_block < blocks_to_allocate) {
		// Oh well, we need to append to the indirect item, or to create an
		// indirect item if there wasn't any
		if (is_indirect_le_ih(ih)) {
			// Existing indirect item - append. First calculate key for append
			// position. We do not need to recalculate path as it should
			// already point to correct place.
			make_cpu_key(&key, inode,
				     le_key_k_offset(get_inode_item_key_version
						     (inode),
						     &(ih->ih_key)) +
				     op_bytes_number(ih,
						     inode->i_sb->s_blocksize),
				     TYPE_INDIRECT, 3);
			res =
			    reiserfs_paste_into_item(th, &path, &key, inode,
						     (char *)(allocated_blocks +
							      curr_block),
						     UNFM_P_SIZE *
						     (blocks_to_allocate -
						      curr_block));
			if (res) {
				goto error_exit_free_blocks;
			}
		} else if (is_statdata_le_ih(ih)) {
			// Last found item was statdata. That means we need to create an indirect item.
			struct item_head ins_ih;	/* itemhead for new item */

			/* create a key for our new item */
			make_cpu_key(&key, inode, 1, TYPE_INDIRECT, 3);	// Position one,
			// because that's
			// where the first
			// indirect item
			// begins
			/* Create new item head for our new item */
			make_le_item_head(&ins_ih, &key, key.version, 1,
					  TYPE_INDIRECT,
					  (blocks_to_allocate -
					   curr_block) * UNFM_P_SIZE,
					  0 /* free space */ );
			/* Find where such item should live in the tree */
			res = search_item(inode->i_sb, &key, &path);
			if (res != ITEM_NOT_FOUND) {
				/* Well, if we have found such an item already, or some error
				   occurred, we need to warn the user and return an error */
				if (res != -ENOSPC) {
					reiserfs_warning(inode->i_sb,
							 "green-9009: search_by_key (%K) "
							 "returned %d", &key,
							 res);
				}
				res = -EIO;
				goto error_exit_free_blocks;
			}
			/* Insert item into the tree with the data as its body */
			res =
			    reiserfs_insert_item(th, &path, &key, &ins_ih,
						 inode,
						 (char *)(allocated_blocks +
							  curr_block));
		} else {
			reiserfs_panic(inode->i_sb,
				       "green-9010: unexpected item type for key %K\n",
				       &key);
		}
	}
	// The caller is responsible for closing the transaction;
	// unless we return an error, they are also responsible for logging
	// the inode.
	//
	pathrelse(&path);
	/*
	 * cleanup preallocation from previous writes
	 * if this is a partial block write
	 */
	if (write_bytes & (inode->i_sb->s_blocksize - 1))
		reiserfs_discard_prealloc(th, inode);
	reiserfs_write_unlock(inode->i_sb);

	// go through all the pages/buffers and map the buffers to newly allocated
	// blocks (so that the system knows where to write these pages later).
	curr_block = 0;
	for (i = 0; i < num_pages; i++) {
		struct page *page = prepared_pages[i];	//current page
		struct buffer_head *head = page_buffers(page);	// first buffer for a page
		int block_start, block_end;	// in-page offsets for buffers.

		if (!page_buffers(page))
			reiserfs_panic(inode->i_sb,
				       "green-9005: No buffers for prepared page???");

		/* For each buffer in page */
		for (bh = head, block_start = 0; bh != head || !block_start;
		     block_start = block_end, bh = bh->b_this_page) {
			if (!bh)
				reiserfs_panic(inode->i_sb,
					       "green-9006: Allocated but absent buffer for a page?");
			block_end = block_start + inode->i_sb->s_blocksize;
			if (i == 0 && block_end <= from)
				/* if this buffer is before requested data to map, skip it */
				continue;
			if (i == num_pages - 1 && block_start >= to)
				/* If this buffer is after requested data to map, abort
				   processing of current page */
				break;

			if (!buffer_mapped(bh)) {	// Ok, unmapped buffer, need to map it
				map_bh(bh, inode->i_sb,
				       le32_to_cpu(allocated_blocks
						   [curr_block]));
				curr_block++;
				set_buffer_new(bh);
			}
		}
	}

	RFALSE(curr_block > blocks_to_allocate,
	       "green-9007: Used too many blocks? weird");

	kfree(allocated_blocks);
	return 0;

// Need to deal with transaction here.
      error_exit_free_blocks:
	pathrelse(&path);
	// free blocks
	for (i = 0; i < blocks_to_allocate; i++)
		reiserfs_free_block(th, inode, le32_to_cpu(allocated_blocks[i]),
				    1);

      error_exit:
	if (th->t_trans_id) {
		int err;
		// update any changes we made to blk count
		mark_inode_dirty(inode);
		err =
		    journal_end(th, inode->i_sb,
				JOURNAL_PER_BALANCE_CNT * 3 + 1 +
				2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb));
		if (err)
			res = err;
	}
	reiserfs_write_unlock(inode->i_sb);
	kfree(allocated_blocks);

	return res;
}

/* Unlock pages prepared by reiserfs_prepare_file_region_for_write */
static void reiserfs_unprepare_pages(struct page **prepared_pages,	/* list of locked pages */
				     size_t num_pages /* number of pages */ )
{
	int i;			// loop counter

	for (i = 0; i < num_pages; i++) {
		struct page *page = prepared_pages[i];

		try_to_free_buffers(page);
		unlock_page(page);
		page_cache_release(page);
	}
}

/* This function will copy data from userspace to the specified pages within
   the supplied byte range */
static int reiserfs_copy_from_user_to_file_region(loff_t pos,	/* In-file position */
						  int num_pages,	/* Number of pages affected */
						  int write_bytes,	/* Amount of bytes to write */
						  struct page **prepared_pages,	/* pointer to
										   array of
										   prepared pages
										 */
						  const char __user * buf	/* Pointer to user-supplied
										   data */
    )
{
	long page_fault = 0;	// status of copy_from_user.
	int i;			// loop counter.
	int offset;		// offset in page

	for (i = 0, offset = (pos & (PAGE_CACHE_SIZE - 1)); i < num_pages;
	     i++, offset = 0) {
		size_t count = min_t(size_t, PAGE_CACHE_SIZE - offset, write_bytes);	// How many bytes to write to this page
		struct page *page = prepared_pages[i];	// Current page we process.

		fault_in_pages_readable(buf, count);

		/* Copy data from userspace to the current page */
		kmap(page);
		page_fault = __copy_from_user(page_address(page) + offset, buf, count);	// Copy the data.
		/* Flush processor's dcache for this page */
		flush_dcache_page(page);
		kunmap(page);
		buf += count;
		write_bytes -= count;

		if (page_fault)
			break;	// Was there a fault? abort.
	}

	return page_fault ? -EFAULT : 0;
}

/* taken from fs/buffer.c:__block_commit_write */
int reiserfs_commit_page(struct inode *inode, struct page *page,
			 unsigned from, unsigned to)
{
	unsigned block_start, block_end;
	int partial = 0;
	unsigned blocksize;
	struct buffer_head *bh, *head;
	unsigned long i_size_index = inode->i_size >> PAGE_CACHE_SHIFT;
	int new;
	int logit = reiserfs_file_data_log(inode);
	struct super_block *s = inode->i_sb;
	int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
	struct reiserfs_transaction_handle th;
	int ret = 0;

	th.t_trans_id = 0;
	blocksize = 1 << inode->i_blkbits;

	if (logit) {
		reiserfs_write_lock(s);
		ret = journal_begin(&th, s, bh_per_page + 1);
		if (ret)
			goto drop_write_lock;
		reiserfs_update_inode_transaction(inode);
	}
	for (bh = head = page_buffers(page), block_start = 0;
	     bh != head || !block_start;
	     block_start = block_end, bh = bh->b_this_page) {

		new = buffer_new(bh);
		clear_buffer_new(bh);
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) {
			if (!buffer_uptodate(bh))
				partial = 1;
		} else {
			set_buffer_uptodate(bh);
			if (logit) {
				reiserfs_prepare_for_journal(s, bh, 1);
				journal_mark_dirty(&th, s, bh);
			} else if (!buffer_dirty(bh)) {
				mark_buffer_dirty(bh);
				/* do data=ordered on any page past the end
				 * of file and any buffer marked BH_New.
				 */
				if (reiserfs_data_ordered(inode->i_sb) &&
				    (new || page->index >= i_size_index)) {
					reiserfs_add_ordered_list(inode, bh);
				}
			}
		}
	}
	if (logit) {
		ret = journal_end(&th, s, bh_per_page + 1);
	      drop_write_lock:
		reiserfs_write_unlock(s);
	}
	/*
	 * If this is a partial write which happened to make all buffers
	 * uptodate then we can optimize away a bogus readpage() for
	 * the next read(). Here we 'discover' whether the page went
	 * uptodate as a result of this (potentially partial) write.
	 */
	if (!partial)
		SetPageUptodate(page);
	return ret;
}

/* Submit pages for write. This was separated from actual file copying
   because we might want to allocate block numbers in-between.
   This function assumes that the caller will adjust the file size to the
   correct value. */
static int reiserfs_submit_file_region_for_write(struct reiserfs_transaction_handle *th, struct inode *inode, loff_t pos,	/* Writing position offset */
						 size_t num_pages,	/* Number of pages to write */
						 size_t write_bytes,	/* number of bytes to write */
						 struct page **prepared_pages	/* list of pages */
    )
{
	int status;		// return status of block_commit_write.
	int retval = 0;		// Return value we are going to return.
	int i;			// loop counter
	int offset;		// Writing offset in page.
	int orig_write_bytes = write_bytes;
	int sd_update = 0;

	for (i = 0, offset = (pos & (PAGE_CACHE_SIZE - 1)); i < num_pages;
	     i++, offset = 0) {
		int count = min_t(int, PAGE_CACHE_SIZE - offset, write_bytes);	// How many bytes to write to this page
		struct page *page = prepared_pages[i];	// Current page we process.

		status =
		    reiserfs_commit_page(inode, page, offset, offset + count);
		if (status)
			retval = status;	// To not overcomplicate matters, we are going to
		// submit all the pages even if there was an error;
		// we only remember the error status to report it on
		// exit.
		write_bytes -= count;
	}
	/* now that we've gotten all the ordered buffers marked dirty,
	 * we can safely update i_size and close any running transaction
	 */
	if (pos + orig_write_bytes > inode->i_size) {
		inode->i_size = pos + orig_write_bytes;	// Set new size
		/* If the file has grown so much that tail packing is no
		 * longer possible, reset the "need to pack" flag */
		if ((have_large_tails(inode->i_sb) &&
		     inode->i_size > i_block_size(inode) * 4) ||
		    (have_small_tails(inode->i_sb) &&
		     inode->i_size > i_block_size(inode)))
			REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
		else if ((have_large_tails(inode->i_sb) &&
			  inode->i_size < i_block_size(inode) * 4) ||
			 (have_small_tails(inode->i_sb) &&
			  inode->i_size < i_block_size(inode)))
			REISERFS_I(inode)->i_flags |= i_pack_on_close_mask;

		if (th->t_trans_id) {
			reiserfs_write_lock(inode->i_sb);
			// this sets the proper flags for O_SYNC to trigger a commit
			mark_inode_dirty(inode);
			reiserfs_write_unlock(inode->i_sb);
		} else {
			reiserfs_write_lock(inode->i_sb);
			reiserfs_update_inode_transaction(inode);
			mark_inode_dirty(inode);
			reiserfs_write_unlock(inode->i_sb);
		}

		sd_update = 1;
	}
	if (th->t_trans_id) {
		reiserfs_write_lock(inode->i_sb);
		if (!sd_update)
			mark_inode_dirty(inode);
		status = journal_end(th, th->t_super, th->t_blocks_allocated);
		if (status)
			retval = status;
		reiserfs_write_unlock(inode->i_sb);
	}
	th->t_trans_id = 0;

	/*
	 * we have to unlock the pages after updating i_size, otherwise
	 * we race with writepage
	 */
	for (i = 0; i < num_pages; i++) {
		struct page *page = prepared_pages[i];
		unlock_page(page);
		mark_page_accessed(page);
		page_cache_release(page);
	}
	return retval;
}

/* Look whether the passed writing region is going to touch the file's tail
   (if it is present), and if it is, convert the tail to an unformatted node */
static int reiserfs_check_for_tail_and_convert(struct inode *inode,	/* inode to deal with */
					       loff_t pos,	/* Writing position */
					       int write_bytes	/* amount of bytes to write */
    )
{
	INITIALIZE_PATH(path);	// needed for search_for_position
	struct cpu_key key;	// Key that would represent last touched writing byte.
	struct item_head *ih;	// item header of found block;
	int res;		// Return value of various functions we call.
	int cont_expand_offset;	// We will put offset for generic_cont_expand here.
	// This can be int just because tails are created
	// only for small files.

/* this embodies a dependency on a particular tail policy */
	if (inode->i_size >= inode->i_sb->s_blocksize * 4) {
		/* such big files do not have tails, so we won't bother
		   to look for one; simply return */
		return 0;
	}

	reiserfs_write_lock(inode->i_sb);
	/* find the item containing the last byte to be written, or if
	 * writing past the end of the file then the last item of the
	 * file (and then we check its type). */
	make_cpu_key(&key, inode, pos + write_bytes + 1, TYPE_ANY,
		     3 /*key length */ );
	res = search_for_position_by_key(inode->i_sb, &key, &path);
	if (res == IO_ERROR) {
		reiserfs_write_unlock(inode->i_sb);
		return -EIO;
	}
	ih = get_ih(&path);
	res = 0;
	if (is_direct_le_ih(ih)) {
		/* Ok, closest item is file tail (tails are stored in "direct"
		 * items), so we need to unpack it. */
		/* To not overcomplicate matters, we just call generic_cont_expand,
		   which will in turn call other stuff and finally will boil down to
		   reiserfs_get_block() that would do the necessary conversion. */
		cont_expand_offset =
		    le_key_k_offset(get_inode_item_key_version(inode),
				    &(ih->ih_key));
		pathrelse(&path);
		res = generic_cont_expand(inode, cont_expand_offset);
	} else
		pathrelse(&path);

	reiserfs_write_unlock(inode->i_sb);
	return res;
}

/* This function locks pages starting from @pos for @inode.
   @num_pages pages are locked and stored in the
   @prepared_pages array. Buffers are also allocated for these pages.
   The first and last pages of the region are read in if they are only
   partially overwritten. If the last page did not exist before the write
   (file hole or file append), it is zeroed instead.
   Returns the number of unallocated blocks that should be allocated to cover
   the new file data. */
static int reiserfs_prepare_file_region_for_write(struct inode *inode
						  /* Inode of the file */ ,
						  loff_t pos,	/* position in the file */
						  size_t num_pages,	/* number of pages to
									   prepare */
						  size_t write_bytes,	/* Amount of bytes to be
									   overwritten from
									   @pos */
						  struct page **prepared_pages	/* pointer to array
										   where to store
										   prepared pages */
    )
{
	int res = 0;		// Return values of different functions we call.
	unsigned long index = pos >> PAGE_CACHE_SHIFT;	// Offset in file in pages.
	int from = (pos & (PAGE_CACHE_SIZE - 1));	// Writing offset in first page
	int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1;
	/* offset of last modified byte in last
	   page */
	struct address_space *mapping = inode->i_mapping;	// Pages are mapped here.
	int i;			// Simple counter
	int blocks = 0;		/* Return value (blocks that should be allocated) */
	struct buffer_head *bh, *head;	// Current bufferhead and first bufferhead
	// of a page.
	unsigned block_start, block_end;	// Starting and ending offsets of current
	// buffer in the page.
	struct buffer_head *wait[2], **wait_bh = wait;	// Buffers for the page, if
	// the page appeared to be not up
	// to date. Note how we have
	// at most 2 buffers; this is
	// because we may at most
	// partially overwrite two
	// buffers for one page: one at
	// the beginning of the write area
	// and one at the end.
	// Everything in the middle gets
	// overwritten totally.

	struct cpu_key key;	// cpu key of item that we are going to deal with
	struct item_head *ih = NULL;	// pointer to item head that we are going to deal with
	struct buffer_head *itembuf = NULL;	// Buffer head that contains items that we are going to deal with
	INITIALIZE_PATH(path);	// path to item, that we are going to deal with.
	__le32 *item = NULL;	// pointer to item we are going to deal with
	int item_pos = -1;	/* Position in indirect item */

	if (num_pages < 1) {
		reiserfs_warning(inode->i_sb,
				 "green-9001: reiserfs_prepare_file_region_for_write "
				 "called with zero number of pages to process");
		return -EFAULT;
	}

	/* We have 2 loops for pages. In the first loop we grab and lock the
	   pages, so that nobody would touch these until we release the pages.
	   Then we'd start to deal with mapping buffers to blocks. */
	for (i = 0; i < num_pages; i++) {
		prepared_pages[i] = grab_cache_page(mapping, index + i);	// locks the page
		if (!prepared_pages[i]) {
			res = -ENOMEM;
			goto failed_page_grabbing;
		}
		if (!page_has_buffers(prepared_pages[i]))
			create_empty_buffers(prepared_pages[i],
					     inode->i_sb->s_blocksize, 0);
	}

	/* Let's count the number of blocks for the case where all the blocks
	   overwritten are new (we will subtract already allocated blocks later) */
	if (num_pages > 2)
		/* These are fully overwritten pages, so all the blocks in
		   these pages are counted as needing to be allocated */
		blocks =
		    (num_pages - 2) << (PAGE_CACHE_SHIFT - inode->i_blkbits);

	/* count blocks needed for first page (possibly partially written) */
	blocks += ((PAGE_CACHE_SIZE - from) >> inode->i_blkbits) + !!(from & (inode->i_sb->s_blocksize - 1));	/* roundup */

	/* Now we account for the last page. If the last page == the first page
	   (we overwrite only one page), we subtract all the blocks past the
	   last writing position in a page out of the already calculated number
	   of blocks */
	blocks += ((num_pages > 1) << (PAGE_CACHE_SHIFT - inode->i_blkbits)) -
	    ((PAGE_CACHE_SIZE - to) >> inode->i_blkbits);
	/* Note how we do not round up here, since partial blocks still
	   should be allocated */
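
	/*
	 * Worked example (hypothetical, 4k pages and 4k blocks, i.e. one
	 * block per page): pos == 10000 and write_bytes == 3000 give
	 * num_pages == 3, from == 1808, to == 712.  The middle pages
	 * contribute (3 - 2) << 0 == 1 block, the first page adds
	 * ((4096 - 1808) >> 12) + 1 == 1, and the last-page term adds
	 * (1 << 0) - ((4096 - 712) >> 12) == 1, for 3 blocks in total,
	 * one per touched page.
	 */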

	/* Now if all the write area lies past the file end, there is no point in
	   mapping blocks, since there are none, so we just zero out the remaining
	   parts of the first and last pages in the write area (if needed) */
	if ((pos & ~((loff_t) PAGE_CACHE_SIZE - 1)) > inode->i_size) {
		if (from != 0)		/* First page needs to be partially zeroed */
			zero_user_page(prepared_pages[0], 0, from, KM_USER0);

		if (to != PAGE_CACHE_SIZE)	/* Last page needs to be partially zeroed */
			zero_user_page(prepared_pages[num_pages-1], to,
					PAGE_CACHE_SIZE - to, KM_USER0);

		/* Since all blocks are new - use already calculated value */
		return blocks;
	}

	/* Well, since we write somewhere into the middle of a file, there is a
	   possibility we are writing over some already allocated blocks, so
	   let's map these blocks and subtract the number of such blocks out of
	   the blocks we need to allocate (calculated above) */
	/* Mask the write position to start on a block boundary; we do it out of
	   the loop for performance reasons */
	pos &= ~((loff_t) inode->i_sb->s_blocksize - 1);
	/* Set cpu key to the starting position in a file (on left block boundary) */
	make_cpu_key(&key, inode,
		     1 + ((pos) & ~((loff_t) inode->i_sb->s_blocksize - 1)),
		     TYPE_ANY, 3 /*key length */ );
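	/*
	 * Example (hypothetical, 4k blocks): pos == 10000 is masked down to
	 * 8192, and the cpu key is built for offset 8193 -- the 1-based key
	 * offset of the first byte of that block.
	 */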

	reiserfs_write_lock(inode->i_sb);	// We need that for at least search_by_key()
	for (i = 0; i < num_pages; i++) {

		head = page_buffers(prepared_pages[i]);
		/* For each buffer in the page */
		for (bh = head, block_start = 0; bh != head || !block_start;
		     block_start = block_end, bh = bh->b_this_page) {
			if (!bh)
				reiserfs_panic(inode->i_sb,
					       "green-9002: Allocated but absent buffer for a page?");
			/* Find where this buffer ends */
			block_end = block_start + inode->i_sb->s_blocksize;
			if (i == 0 && block_end <= from)
				/* if this buffer is before requested data to map, skip it */
				continue;

			if (i == num_pages - 1 && block_start >= to) {
				/* If this buffer is after requested data to map, abort
				   processing of current page */
				break;
			}

			if (buffer_mapped(bh) && bh->b_blocknr != 0) {
				/* This is an optimisation for the case where the buffer is
				   mapped and has a block number assigned. In case a
				   significant number of such buffers are present, we may
				   avoid some amount of search_by_key calls.
				   It would probably be possible to move parts of this code
				   out of the BKL, but I'm afraid that would overcomplicate
				   the code without any noticeable benefit.
				 */
				item_pos++;
				/* Update the key */
				set_cpu_key_k_offset(&key,
						     cpu_key_k_offset(&key) +
						     inode->i_sb->s_blocksize);
				blocks--;	// Decrease the amount of blocks that need to be
				// allocated
				continue;	// Go to the next buffer
			}

			if (!itembuf ||	/* if first iteration */
			    item_pos >= ih_item_len(ih) / UNFM_P_SIZE) {	/* or if we progressed past the
									   current unformatted_item */
				/* Try to find next item */
				res =
				    search_for_position_by_key(inode->i_sb,
							       &key, &path);
				/* Abort if no more items */
				if (res != POSITION_FOUND) {
					/* make sure later loops don't use this item */
					itembuf = NULL;
					item = NULL;
					break;
				}

				/* Update information about current indirect item */
				itembuf = get_last_bh(&path);
				ih = get_ih(&path);
				item = get_item(&path);
				item_pos = path.pos_in_item;

				RFALSE(!is_indirect_le_ih(ih),
				       "green-9003: indirect item expected");
			}

			/* See if there is some block associated with the file
			   at that position, map the buffer to this block */
			if (get_block_num(item, item_pos)) {
				map_bh(bh, inode->i_sb,
				       get_block_num(item, item_pos));
				blocks--;	// Decrease the amount of blocks that need to be
				// allocated
			}
			item_pos++;
			/* Update the key */
			set_cpu_key_k_offset(&key,
					     cpu_key_k_offset(&key) +
					     inode->i_sb->s_blocksize);
		}
	}
	pathrelse(&path);	// Free the path
	reiserfs_write_unlock(inode->i_sb);

	/* Now zero out unmapped buffers for the first and last pages of the
	   write area, or issue read requests if the page is mapped. */
	/* First page, see if it is not uptodate */
	if (!PageUptodate(prepared_pages[0])) {
		head = page_buffers(prepared_pages[0]);

		/* For each buffer in page */
		for (bh = head, block_start = 0; bh != head || !block_start;
		     block_start = block_end, bh = bh->b_this_page) {

			if (!bh)
				reiserfs_panic(inode->i_sb,
					       "green-9002: Allocated but absent buffer for a page?");
			/* Find where this buffer ends */
			block_end = block_start + inode->i_sb->s_blocksize;
			if (block_end <= from)
				/* if this buffer is before requested data to map, skip it */
				continue;
			if (block_start < from) {	/* Aha, our partial buffer */
				if (buffer_mapped(bh)) {	/* If it is mapped, we need to
								   issue a READ request for it
								   to not lose data */
					ll_rw_block(READ, 1, &bh);
					*wait_bh++ = bh;
				} else {	/* Not mapped, zero it */
					zero_user_page(prepared_pages[0],
						       block_start,
						       from - block_start, KM_USER0);
					set_buffer_uptodate(bh);
				}
			}
		}
	}

	/* Last page, see if it is not uptodate, or if the last page is past the end of the file. */
	if (!PageUptodate(prepared_pages[num_pages - 1]) ||
	    ((pos + write_bytes) >> PAGE_CACHE_SHIFT) >
	    (inode->i_size >> PAGE_CACHE_SHIFT)) {
		head = page_buffers(prepared_pages[num_pages - 1]);

		/* for each buffer in page */
		for (bh = head, block_start = 0; bh != head || !block_start;
		     block_start = block_end, bh = bh->b_this_page) {

			if (!bh)
				reiserfs_panic(inode->i_sb,
					       "green-9002: Allocated but absent buffer for a page?");
			/* Find where this buffer ends */
			block_end = block_start + inode->i_sb->s_blocksize;
			if (block_start >= to)
				/* if this buffer is after requested data to map, skip it */
				break;
			if (block_end > to) {	/* Aha, our partial buffer */
				if (buffer_mapped(bh)) {	/* If it is mapped, we need to
								   issue a READ request for it
								   to not lose data */
					ll_rw_block(READ, 1, &bh);
					*wait_bh++ = bh;
				} else {	/* Not mapped, zero it */
					zero_user_page(prepared_pages[num_pages-1],
							to, block_end - to, KM_USER0);
					set_buffer_uptodate(bh);
				}
			}
		}
	}

	/* Wait for read requests we made to happen, if necessary */
	while (wait_bh > wait) {
		wait_on_buffer(*--wait_bh);
		if (!buffer_uptodate(*wait_bh)) {
			res = -EIO;
			goto failed_read;
		}
	}

	return blocks;
      failed_page_grabbing:
	num_pages = i;
      failed_read:
	reiserfs_unprepare_pages(prepared_pages, num_pages);
	return res;
}

/* Write @count bytes at position @ppos in a file indicated by @file
   from the buffer @buf.

   generic_file_write() is only appropriate for filesystems that are not seeking to optimize performance and want
   something simple that works.  It is not for serious use by general purpose filesystems, excepting the one that it was
   written for (ext2/3).  This is for several reasons:

   * It has no understanding of any filesystem specific optimizations.

   * It enters the filesystem repeatedly for each page that is written.

   * It depends on the reiserfs_get_block() function, which, as implemented by reiserfs, performs a costly
   * search_by_key operation for each page it is supplied with. By contrast, reiserfs_file_write() feeds as much as
   * possible at a time to reiserfs, which allows for fewer tree traversals.

   * Each indirect pointer insertion takes a lot of cpu, because it involves memory moves inside of blocks.

   * Asking the block allocation code for blocks one at a time is slightly less efficient.

   All of these reasons for not using only generic file write were understood back when reiserfs was first miscoded to
   use it, but we were in a hurry to make the code freeze, and so it couldn't be revised then.  This new code should make
   things right finally.

   Future Features: providing search_by_key with hints.

*/
static ssize_t reiserfs_file_write(struct file *file,	/* the file we are going to write into */
				   const char __user * buf,	/*  pointer to user supplied data
								   (in userspace) */
				   size_t count,	/* amount of bytes to write */
				   loff_t * ppos	/* pointer to position in file that we start writing at. Should be updated to
							 * new current position before returning. */
				   )
{
	size_t already_written = 0;	// Number of bytes already written to the file.
	loff_t pos;		// Current position in the file.
	ssize_t res;		// return value of various functions that we call.
	int err = 0;
	struct inode *inode = file->f_path.dentry->d_inode;	// Inode of the file that we are writing to.
	/* To simplify coding at this time, we store
	   locked pages in an array for now */
	struct page *prepared_pages[REISERFS_WRITE_PAGES_AT_A_TIME];
	struct reiserfs_transaction_handle th;
	th.t_trans_id = 0;

	/* If a filesystem is converted from 3.5 to 3.6, we'll have v3.5 items
	 * lying around (most of the disk, in fact). Despite the filesystem
	 * now being a v3.6 format, the old items still can't support large
	 * file sizes. Catch this case here, as the rest of the VFS layer is
	 * oblivious to the different limitations between old and new items.
	 * reiserfs_setattr catches this for truncates. This chunk is lifted
	 * from generic_write_checks. */
	if (get_inode_item_key_version (inode) == KEY_FORMAT_3_5 &&
	    *ppos + count > MAX_NON_LFS) {
		if (*ppos >= MAX_NON_LFS) {
			send_sig(SIGXFSZ, current, 0);
			return -EFBIG;
		}
		if (count > MAX_NON_LFS - (unsigned long)*ppos)
			count = MAX_NON_LFS - (unsigned long)*ppos;
	}
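
	/*
	 * Example: MAX_NON_LFS is 2^31 - 1, so a v3.5-format file positioned
	 * at MAX_NON_LFS - 100 can accept at most 100 more bytes here, while
	 * a write starting at or past the limit fails with -EFBIG after
	 * raising SIGXFSZ, mirroring generic_write_checks().
	 */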

	if (file->f_flags & O_DIRECT)
		return do_sync_write(file, buf, count, ppos);

	if (unlikely((ssize_t) count < 0))
		return -EINVAL;

	if (unlikely(!access_ok(VERIFY_READ, buf, count)))
		return -EFAULT;

	mutex_lock(&inode->i_mutex);	// locks the entire file for just us

	pos = *ppos;

	/* Check that we can write to the specified region of the file, that the
	   file is not overly big, and this kind of stuff. Adjust pos and
	   count, if needed */
	res = generic_write_checks(file, &pos, &count, 0);
	if (res)
		goto out;

	if (count == 0)
		goto out;

	res = remove_suid(file->f_path.dentry);
	if (res)
		goto out;

	file_update_time(file);

	// Ok, we are done with all the checks.

	// Now we should start real work

	/* If we are going to write past the file's packed tail or if we are going
	   to overwrite part of the tail, we need that tail to be converted into
	   an unformatted node */
	res = reiserfs_check_for_tail_and_convert(inode, pos, count);
	if (res)
		goto out;

	while (count > 0) {
		/* This is the main loop, in which we run until some error occurs
		   or until we write all of the data. */
		size_t num_pages;	/* number of pages we are going to write this iteration */
		size_t write_bytes;	/* number of bytes to write during this iteration */
		size_t blocks_to_allocate;	/* how many blocks we need to allocate for this iteration */

		/*  (pos & (PAGE_CACHE_SIZE-1)) is an idiom for the offset of pos within a page */
		num_pages = !!((pos + count) & (PAGE_CACHE_SIZE - 1)) +	/* round up partial
									   pages */
		    ((count +
		      (pos & (PAGE_CACHE_SIZE - 1))) >> PAGE_CACHE_SHIFT);
		/* convert size to number of
		   pages */
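
		/*
		 * Worked example (hypothetical, 4k pages): pos == 1000 and
		 * count == 10000.  (pos + count) & 4095 == 2808, so the
		 * partial-page term is 1, and
		 * (count + (pos & 4095)) >> 12 == 11000 >> 12 == 2, giving
		 * num_pages == 3: bytes 1000..10999 touch pages 0, 1 and 2.
		 */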
		reiserfs_write_lock(inode->i_sb);
		if (num_pages > REISERFS_WRITE_PAGES_AT_A_TIME
		    || num_pages > reiserfs_can_fit_pages(inode->i_sb)) {
			/* If we were asked to write more data than we want to, or if
			   there is not that much space, then we shorten the amount of
			   data to write for this iteration. */
			num_pages =
			    min_t(size_t, REISERFS_WRITE_PAGES_AT_A_TIME,
				  reiserfs_can_fit_pages(inode->i_sb));
			/* Also we should not forget to set size in bytes accordingly */
			write_bytes = (num_pages << PAGE_CACHE_SHIFT) -
			    (pos & (PAGE_CACHE_SIZE - 1));
			/* If position is not on the
			   start of the page, we need
			   to subtract the offset
			   within page */
		} else
			write_bytes = count;

		/* reserve the blocks to be allocated later, so that later on
		   we still have the space to write the blocks to */
		reiserfs_claim_blocks_to_be_allocated(inode->i_sb,
						      num_pages <<
						      (PAGE_CACHE_SHIFT -
						       inode->i_blkbits));
		reiserfs_write_unlock(inode->i_sb);

		if (!num_pages) {	/* If we do not have enough space even for a single page... */
			if (pos >
			    inode->i_size + inode->i_sb->s_blocksize -
			    (pos & (inode->i_sb->s_blocksize - 1))) {
				res = -ENOSPC;
				break;	// In case we are writing past the end of the last file block, break.
			}
			// Otherwise we are possibly overwriting the file, so
			// let's set write size to be equal or less than blocksize.
			// This way we get it correctly for file holes.
			// But overwriting files on absolutely full volumes would not
			// be very efficient. Well, people are not supposed to fill
			// 100% of disk space anyway.
			write_bytes =
			    min_t(size_t, count,
				  inode->i_sb->s_blocksize -
				  (pos & (inode->i_sb->s_blocksize - 1)));
			num_pages = 1;
			// No blocks were claimed before, so do it now.
			reiserfs_claim_blocks_to_be_allocated(inode->i_sb,
							      1 <<
							      (PAGE_CACHE_SHIFT
							       -
							       inode->
							       i_blkbits));
		}

		/* Prepare for writing into the region, read in all the
		   partially overwritten pages, if needed. And lock the pages,
		   so that nobody else can access these until we are done.
		   We get number of actual blocks needed as a result. */
		res = reiserfs_prepare_file_region_for_write(inode, pos,
							     num_pages,
							     write_bytes,
							     prepared_pages);
		if (res < 0) {
			reiserfs_release_claimed_blocks(inode->i_sb,
							num_pages <<
							(PAGE_CACHE_SHIFT -
							 inode->i_blkbits));
			break;
		}

		blocks_to_allocate = res;

		/* First we correct our estimate of how many blocks we need */
		reiserfs_release_claimed_blocks(inode->i_sb,
						(num_pages <<
						 (PAGE_CACHE_SHIFT -
						  inode->i_sb->
						  s_blocksize_bits)) -
						blocks_to_allocate);

		if (blocks_to_allocate > 0) {	/*We only allocate blocks if we need to */
			/* Fill in all the possible holes and append the file if needed */
			res =
			    reiserfs_allocate_blocks_for_region(&th, inode, pos,
								num_pages,
								write_bytes,
								prepared_pages,
								blocks_to_allocate);
		}

		/* well, we have allocated the blocks, so it is time to free
		   the reservation we made earlier. */
		reiserfs_release_claimed_blocks(inode->i_sb,
						blocks_to_allocate);
		if (res) {
			reiserfs_unprepare_pages(prepared_pages, num_pages);
			break;
		}

/* NOTE that allocating blocks and filling blocks can be done in reverse order
   and probably we would do that just to get rid of garbage in files after a
   crash */

		/* Copy data from user-supplied buffer to file's pages */
		res =
		    reiserfs_copy_from_user_to_file_region(pos, num_pages,
							   write_bytes,
							   prepared_pages, buf);
		if (res) {
			reiserfs_unprepare_pages(prepared_pages, num_pages);
			break;
		}

		/* Send the pages to disk and unlock them. */
		res =
		    reiserfs_submit_file_region_for_write(&th, inode, pos,
							  num_pages,
							  write_bytes,
							  prepared_pages);
		if (res)
			break;

		already_written += write_bytes;
		buf += write_bytes;
		*ppos = pos += write_bytes;
		count -= write_bytes;
		balance_dirty_pages_ratelimited_nr(inode->i_mapping, num_pages);
	}

	/* this is only true on error */
	if (th.t_trans_id) {
		reiserfs_write_lock(inode->i_sb);
		err = journal_end(&th, th.t_super, th.t_blocks_allocated);
		reiserfs_write_unlock(inode->i_sb);
		if (err) {
			res = err;
			goto out;
		}
	}

	if (likely(res >= 0) &&
	    (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))))
		res = generic_osync_inode(inode, file->f_mapping,
		                          OSYNC_METADATA | OSYNC_DATA);

	mutex_unlock(&inode->i_mutex);
	reiserfs_async_progress_wait(inode->i_sb);
	return (already_written != 0) ? already_written : res;

      out:
	mutex_unlock(&inode->i_mutex);	// unlock the file on exit.
	return res;
}

const struct file_operations reiserfs_file_operations = {
	.read = do_sync_read,
	.write = reiserfs_file_write,
	.ioctl = reiserfs_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = reiserfs_compat_ioctl,
#endif
	.mmap = reiserfs_file_mmap,
	.open = generic_file_open,
	.release = reiserfs_file_release,
	.fsync = reiserfs_sync_file,
	.sendfile = generic_file_sendfile,
	.aio_read = generic_file_aio_read,
	.aio_write = generic_file_aio_write,
	.splice_read = generic_file_splice_read,
	.splice_write = generic_file_splice_write,
};

const struct inode_operations reiserfs_file_inode_operations = {
	.truncate = reiserfs_vfs_truncate_file,
	.setattr = reiserfs_setattr,
	.setxattr = reiserfs_setxattr,
	.getxattr = reiserfs_getxattr,
	.listxattr = reiserfs_listxattr,
	.removexattr = reiserfs_removexattr,
	.permission = reiserfs_permission,
};