1/*
2 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
3 */
4
5#include <linux/time.h>
6#include <linux/fs.h>
7#include <linux/reiserfs_fs.h>
8#include <linux/reiserfs_acl.h>
9#include <linux/reiserfs_xattr.h>
10#include <linux/smp_lock.h>
11#include <linux/pagemap.h>
12#include <linux/highmem.h>
13#include <asm/uaccess.h>
14#include <asm/unaligned.h>
15#include <linux/buffer_head.h>
16#include <linux/mpage.h>
17#include <linux/writeback.h>
18#include <linux/quotaops.h>
19
20static int reiserfs_commit_write(struct file *f, struct page *page,
21				 unsigned from, unsigned to);
22static int reiserfs_prepare_write(struct file *f, struct page *page,
23				  unsigned from, unsigned to);
24
/* Final disposal of an inode whose link count reached zero: called by
 * the VFS after the last reference is dropped.  Removes the file body,
 * stat data and extended attributes from the tree, releases the quota
 * charge, and drops the "save" link that protected the half-deleted
 * file across a crash. */
void reiserfs_delete_inode(struct inode *inode)
{
	/* We need blocks for transaction + (user+group) quota update (possibly delete) */
	int jbegin_count =
	    JOURNAL_PER_BALANCE_CNT * 2 +
	    2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb);
	struct reiserfs_transaction_handle th;
	int err;

	/* drop all pagecache pages before we touch the tree */
	truncate_inode_pages(&inode->i_data, 0);

	reiserfs_write_lock(inode->i_sb);

	/* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */
	if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) {	/* also handles bad_inode case */
		reiserfs_delete_xattrs(inode);

		if (journal_begin(&th, inode->i_sb, jbegin_count))
			goto out;
		reiserfs_update_inode_transaction(inode);

		err = reiserfs_delete_object(&th, inode);

		/* Do quota update inside a transaction for journaled quotas. We must do that
		 * after delete_object so that quota updates go into the same transaction as
		 * stat data deletion */
		if (!err)
			DQUOT_FREE_INODE(inode);

		if (journal_end(&th, inode->i_sb, jbegin_count))
			goto out;

		/* check return value from reiserfs_delete_object after
		 * ending the transaction
		 */
		if (err)
		    goto out;

		/* all items of file are deleted, so we can remove "save" link */
		remove_save_link(inode, 0 /* not truncate */ );	/* we can't do anything
								 * about an error here */
	} else {
		/* no object items are in the tree */
		;
	}
      out:
	clear_inode(inode);	/* note this must go after the journal_end to prevent deadlock */
	inode->i_blocks = 0;
	reiserfs_write_unlock(inode->i_sb);
}
75
76static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid,
77			  __u32 objectid, loff_t offset, int type, int length)
78{
79	key->version = version;
80
81	key->on_disk_key.k_dir_id = dirid;
82	key->on_disk_key.k_objectid = objectid;
83	set_cpu_key_k_offset(key, offset);
84	set_cpu_key_k_type(key, type);
85	key->key_length = length;
86}
87
88/* take base of inode_key (it comes from inode always) (dirid, objectid) and version from an inode, set
89   offset and type of key */
90void make_cpu_key(struct cpu_key *key, struct inode *inode, loff_t offset,
91		  int type, int length)
92{
93	_make_cpu_key(key, get_inode_item_key_version(inode),
94		      le32_to_cpu(INODE_PKEY(inode)->k_dir_id),
95		      le32_to_cpu(INODE_PKEY(inode)->k_objectid), offset, type,
96		      length);
97}
98
99//
100// when key is 0, do not set version and short key
101//
102inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key,
103			      int version,
104			      loff_t offset, int type, int length,
105			      int entry_count /*or ih_free_space */ )
106{
107	if (key) {
108		ih->ih_key.k_dir_id = cpu_to_le32(key->on_disk_key.k_dir_id);
109		ih->ih_key.k_objectid =
110		    cpu_to_le32(key->on_disk_key.k_objectid);
111	}
112	put_ih_version(ih, version);
113	set_le_ih_k_offset(ih, offset);
114	set_le_ih_k_type(ih, type);
115	put_ih_item_len(ih, length);
116	/*    set_ih_free_space (ih, 0); */
117	// for directory items it is entry count, for directs and stat
118	// datas - 0xffff, for indirects - 0
119	put_ih_entry_count(ih, entry_count);
120}
121
122//
123
124// Ugh.  Not too eager for that....
125//  I cut the code until such time as I see a convincing argument (benchmark).
126// I don't want a bloated inode struct..., and I don't like code complexity....
127
128/* cutting the code is fine, since it really isn't in use yet and is easy
129** to add back in.  But, Vladimir has a really good idea here.  Think
130** about what happens for reading a file.  For each page,
131** The VFS layer calls reiserfs_readpage, who searches the tree to find
132** an indirect item.  This indirect item has X number of pointers, where
133** X is a big number if we've done the block allocation right.  But,
134** we only use one or two of these pointers during each call to readpage,
135** needlessly researching again later on.
136**
137** The size of the cache could be dynamic based on the size of the file.
138**
139** I'd also like to see us cache the location the stat data item, since
140** we are needlessly researching for that frequently.
141**
142** --chris
143*/
144
145/* If this page has a file tail in it, and
146** it was read in by get_block_create_0, the page data is valid,
147** but tail is still sitting in a direct item, and we can't write to
148** it.  So, look through this page, and check all the mapped buffers
149** to make sure they have valid block numbers.  Any that don't need
150** to be unmapped, so that block_prepare_write will correctly call
151** reiserfs_get_block to convert the tail into an unformatted node
152*/
153static inline void fix_tail_page_for_writing(struct page *page)
154{
155	struct buffer_head *head, *next, *bh;
156
157	if (page && page_has_buffers(page)) {
158		head = page_buffers(page);
159		bh = head;
160		do {
161			next = bh->b_this_page;
162			if (buffer_mapped(bh) && bh->b_blocknr == 0) {
163				reiserfs_unmap_buffer(bh);
164			}
165			bh = next;
166		} while (bh != head);
167	}
168}
169
170/* reiserfs_get_block does not need to allocate a block only if it has been
171   done already or non-hole position has been found in the indirect item */
172static inline int allocation_needed(int retval, b_blocknr_t allocated,
173				    struct item_head *ih,
174				    __le32 * item, int pos_in_item)
175{
176	if (allocated)
177		return 0;
178	if (retval == POSITION_FOUND && is_indirect_le_ih(ih) &&
179	    get_block_num(item, pos_in_item))
180		return 0;
181	return 1;
182}
183
184static inline int indirect_item_found(int retval, struct item_head *ih)
185{
186	return (retval == POSITION_FOUND) && is_indirect_le_ih(ih);
187}
188
/* Map bh to device block 'block' of the filesystem holding 'inode'.
 * Thin wrapper over map_bh so call sites read as "map this file block". */
static inline void set_block_dev_mapped(struct buffer_head *bh,
					b_blocknr_t block, struct inode *inode)
{
	map_bh(bh, inode->i_sb, block);
}
194
195//
196// files which were created in the earlier version can not be longer,
197// than 2 gb
198//
199static int file_capable(struct inode *inode, long block)
200{
201	if (get_inode_item_key_version(inode) != KEY_FORMAT_3_5 ||	// it is new file.
202	    block < (1 << (31 - inode->i_sb->s_blocksize_bits)))	// old file, but 'block' is inside of 2gb
203		return 1;
204
205	return 0;
206}
207
/* End the running transaction (pushing the inode's stat data into it
 * first) and immediately begin a fresh one, so the journal can commit
 * and free blocks.  'path' is released up front so that anybody waiting
 * on it can proceed while we are between transactions.  A no-op when
 * the handle is nested (refcount > 1), since only the outermost holder
 * may end it.  Returns 0 or a journal error code. */
/*static*/ int restart_transaction(struct reiserfs_transaction_handle *th,
				   struct inode *inode, struct treepath *path)
{
	struct super_block *s = th->t_super;
	int len = th->t_blocks_allocated;
	int err;

	BUG_ON(!th->t_trans_id);
	BUG_ON(!th->t_refcount);

	pathrelse(path);

	/* we cannot restart while nested */
	if (th->t_refcount > 1) {
		return 0;
	}
	reiserfs_update_sd(th, inode);
	err = journal_end(th, s, len);
	if (!err) {
		err = journal_begin(th, s, JOURNAL_PER_BALANCE_CNT * 6);
		if (!err)
			reiserfs_update_inode_transaction(inode);
	}
	return err;
}
233
// Called by reiserfs_get_block when create == 0.  It looks up the
// 'block'-th logical block of the file.  If the block is covered by an
// indirect item, bh_result is mapped to the corresponding device block.
// If it falls in a direct item (a file tail), the function either
// returns -ENOENT (bmap case, GET_BLOCK_READ_DIRECT not set) or copies
// the tail bytes into the relevant piece of bh_result's page and maps
// the buffer to block 0 to mark it as "tail held in page".
241
/* Read-only block lookup for reiserfs_get_block (create == 0).
 *
 * Returns 0 on success (bh_result mapped, or left unmapped for a hole),
 * -EIO on tree I/O error, and -ENOENT for holes when GET_BLOCK_NO_HOLE
 * is set and the page is not uptodate, or for tails when
 * GET_BLOCK_READ_DIRECT is not set.
 *
 * The tail-reading path kmaps bh_result->b_page and may loop back to
 * 'research' if the item moved while kmap scheduled; 'p' doubles as the
 * "page is kmapped" flag for the error exits. */
static int _get_block_create_0(struct inode *inode, long block,
			       struct buffer_head *bh_result, int args)
{
	INITIALIZE_PATH(path);
	struct cpu_key key;
	struct buffer_head *bh;
	struct item_head *ih, tmp_ih;
	int fs_gen;
	int blocknr;
	char *p = NULL;		/* non-NULL once bh_result->b_page is kmapped */
	int chars;
	int ret;
	int result;
	int done = 0;
	unsigned long offset;

	// prepare the key to look for the 'block'-th block of file
	make_cpu_key(&key, inode,
		     (loff_t) block * inode->i_sb->s_blocksize + 1, TYPE_ANY,
		     3);

      research:
	result = search_for_position_by_key(inode->i_sb, &key, &path);
	if (result != POSITION_FOUND) {
		pathrelse(&path);
		if (p)
			kunmap(bh_result->b_page);
		if (result == IO_ERROR)
			return -EIO;
		// We do not return -ENOENT if there is a hole but the page is
		// uptodate, because that means there is some mmapped data
		// associated with it that is yet to be written to disk.
		if ((args & GET_BLOCK_NO_HOLE)
		    && !PageUptodate(bh_result->b_page)) {
			return -ENOENT;
		}
		return 0;
	}
	//
	bh = get_last_bh(&path);
	ih = get_ih(&path);
	if (is_indirect_le_ih(ih)) {
		__le32 *ind_item = (__le32 *) B_I_PITEM(bh, ih);

		/* a zero pointer in an indirect item is a hole */
		blocknr = get_block_num(ind_item, path.pos_in_item);
		ret = 0;
		if (blocknr) {
			map_bh(bh_result, inode->i_sb, blocknr);
			if (path.pos_in_item ==
			    ((ih_item_len(ih) / UNFM_P_SIZE) - 1)) {
				set_buffer_boundary(bh_result);
			}
		} else
			// We do not return -ENOENT if there is a hole but the
			// page is uptodate, because that means there is some
			// mmapped data associated with it that is yet to be
			// written to disk.
		if ((args & GET_BLOCK_NO_HOLE)
			    && !PageUptodate(bh_result->b_page)) {
			ret = -ENOENT;
		}

		pathrelse(&path);
		if (p)
			kunmap(bh_result->b_page);
		return ret;
	}
	// requested data are in direct item(s)
	if (!(args & GET_BLOCK_READ_DIRECT)) {
		// when it is stored in direct item(s)
		pathrelse(&path);
		if (p)
			kunmap(bh_result->b_page);
		return -ENOENT;
	}

	/* if we've got a direct item, and the buffer or page was uptodate,
	 ** we don't want to pull data off disk again.  skip to the
	 ** end, where we map the buffer and return
	 */
	if (buffer_uptodate(bh_result)) {
		goto finished;
	} else
		/*
		 ** grab_tail_page can trigger calls to reiserfs_get_block on up to date
		 ** pages without any buffers.  If the page is up to date, we don't want
		 ** read old data off disk.  Set the up to date bit on the buffer instead
		 ** and jump to the end
		 */
	if (!bh_result->b_page || PageUptodate(bh_result->b_page)) {
		set_buffer_uptodate(bh_result);
		goto finished;
	}
	// read file tail into part of page
	offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1);
	fs_gen = get_generation(inode->i_sb);
	copy_item_head(&tmp_ih, ih);

	/* we only want to kmap if we are reading the tail into the page.
	 ** this is not the common case, so we don't kmap until we are
	 ** sure we need to.  But, this means the item might move if
	 ** kmap schedules
	 */
	if (!p) {
		p = (char *)kmap(bh_result->b_page);
		if (fs_changed(fs_gen, inode->i_sb)
		    && item_moved(&tmp_ih, &path)) {
			goto research;
		}
	}
	p += offset;
	memset(p, 0, inode->i_sb->s_blocksize);
	do {
		if (!is_direct_le_ih(ih)) {
			BUG();
		}
		/* make sure we don't read more bytes than actually exist in
		 ** the file.  This can happen in odd cases where i_size isn't
		 ** correct, and when direct item padding results in a few
		 ** extra bytes at the end of the direct item
		 */
		if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size)
			break;
		if ((le_ih_k_offset(ih) - 1 + ih_item_len(ih)) > inode->i_size) {
			chars =
			    inode->i_size - (le_ih_k_offset(ih) - 1) -
			    path.pos_in_item;
			done = 1;
		} else {
			chars = ih_item_len(ih) - path.pos_in_item;
		}
		memcpy(p, B_I_PITEM(bh, ih) + path.pos_in_item, chars);

		if (done)
			break;

		p += chars;

		if (PATH_LAST_POSITION(&path) != (B_NR_ITEMS(bh) - 1))
			// we are done: if the direct item just read is not the
			// last item of its node, the tail cannot continue in
			// the right neighbor, so rely on i_size having been
			// reached
			break;

		// update key to look for the next piece
		set_cpu_key_k_offset(&key, cpu_key_k_offset(&key) + chars);
		result = search_for_position_by_key(inode->i_sb, &key, &path);
		if (result != POSITION_FOUND)
			// i/o error most likely
			break;
		bh = get_last_bh(&path);
		ih = get_ih(&path);
	} while (1);

	flush_dcache_page(bh_result->b_page);
	kunmap(bh_result->b_page);

      finished:
	pathrelse(&path);

	if (result == IO_ERROR)
		return -EIO;

	/* this buffer has valid data, but isn't valid for io.  mapping it to
	 * block #0 tells the rest of reiserfs it just has a tail in it
	 */
	map_bh(bh_result, inode->i_sb, 0);
	set_buffer_uptodate(bh_result);
	return 0;
}
409
410// this is called to create file map. So, _get_block_create_0 will not
411// read direct item
412static int reiserfs_bmap(struct inode *inode, sector_t block,
413			 struct buffer_head *bh_result, int create)
414{
415	if (!file_capable(inode, block))
416		return -EFBIG;
417
418	reiserfs_write_lock(inode->i_sb);
419	/* do not read the direct item */
420	_get_block_create_0(inode, block, bh_result, 0);
421	reiserfs_write_unlock(inode->i_sb);
422	return 0;
423}
424
425/* special version of get_block that is only used by grab_tail_page right
426** now.  It is sent to block_prepare_write, and when you try to get a
427** block past the end of the file (or a block from a hole) it returns
428** -ENOENT instead of a valid buffer.  block_prepare_write expects to
429** be able to do i/o on the buffers returned, unless an error value
430** is also returned.
431**
432** So, this allows block_prepare_write to be used for reading a single block
433** in a page.  Where it does not produce a valid page for holes, or past the
434** end of the file.  This turns out to be exactly what we need for reading
435** tails for conversion.
436**
437** The point of the wrapper is forcing a certain value for create, even
438** though the VFS layer is calling this function with create==1.  If you
439** don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block,
440** don't use this function.
441*/
static int reiserfs_get_block_create_0(struct inode *inode, sector_t block,
				       struct buffer_head *bh_result,
				       int create)
{
	/* 'create' from the VFS is deliberately ignored: always do a
	 * read-only lookup that fails with -ENOENT on holes */
	return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE);
}
448
449/* This is special helper for reiserfs_get_block in case we are executing
450   direct_IO request. */
/* get_block callback used for direct I/O requests.  Besides the plain
 * block lookup it refuses tail blocks (direct I/O cannot touch data
 * that lives inside a direct item) and, if the file was flagged for
 * tail packing, commits it first so an unpacked tail reaches disk
 * before its pagecache pages disappear. */
static int reiserfs_get_blocks_direct_io(struct inode *inode,
					 sector_t iblock,
					 struct buffer_head *bh_result,
					 int create)
{
	int ret;

	/* no pagecache page is associated with a direct I/O buffer */
	bh_result->b_page = NULL;

	/* We set the b_size before reiserfs_get_block call since it is
	   referenced in convert_tail_for_hole() that may be called from
	   reiserfs_get_block() */
	bh_result->b_size = (1 << inode->i_blkbits);

	ret = reiserfs_get_block(inode, iblock, bh_result,
				 create | GET_BLOCK_NO_DANGLE);
	if (ret)
		goto out;

	/* don't allow direct io onto tail pages */
	if (buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
		/* make sure future calls to the direct io funcs for this offset
		 ** in the file fail by unmapping the buffer
		 */
		clear_buffer_mapped(bh_result);
		ret = -EINVAL;
	}
	/* Possible unpacked tail. Flush the data before pages have
	   disappeared */
	if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) {
		int err;
		lock_kernel();
		err = reiserfs_commit_for_inode(inode);
		REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
		unlock_kernel();
		if (err < 0)
			ret = err;
	}
      out:
	return ret;
}
492
493/*
494** helper function for when reiserfs_get_block is called for a hole
495** but the file tail is still in a direct item
496** bh_result is the buffer head for the hole
497** tail_offset is the offset of the start of the tail in the file
498**
499** This calls prepare_write, which will start a new transaction
500** you should not be in a transaction, or have any paths held when you
501** call this.
502*/
/* Helper for reiserfs_get_block when it hits a hole while the file tail
 * still sits in a direct item.  bh_result is the buffer head for the
 * hole; tail_offset is the file offset of the start of the tail.
 *
 * Works by running prepare_write/commit_write over the page holding the
 * tail, which forces the direct-to-indirect conversion.  prepare_write
 * starts its own transaction, so the caller must not be inside one and
 * must hold no tree paths.  Returns 0 or a negative errno. */
static int convert_tail_for_hole(struct inode *inode,
				 struct buffer_head *bh_result,
				 loff_t tail_offset)
{
	unsigned long index;
	unsigned long tail_end;
	unsigned long tail_start;
	struct page *tail_page;
	struct page *hole_page = bh_result->b_page;
	int retval = 0;

	/* tail offsets are 1-based and must be block aligned */
	if ((tail_offset & (bh_result->b_size - 1)) != 1)
		return -EIO;

	/* always try to read until the end of the block */
	tail_start = tail_offset & (PAGE_CACHE_SIZE - 1);
	tail_end = (tail_start | (bh_result->b_size - 1)) + 1;

	index = tail_offset >> PAGE_CACHE_SHIFT;
	/* hole_page can be zero in case of direct_io, we are sure
	   that we cannot get here if we write with O_DIRECT into
	   tail page */
	if (!hole_page || index != hole_page->index) {
		tail_page = grab_cache_page(inode->i_mapping, index);
		retval = -ENOMEM;
		if (!tail_page) {
			goto out;
		}
	} else {
		/* the hole is in the same page as the tail: reuse it,
		   it is already locked by our caller */
		tail_page = hole_page;
	}

	/* we don't have to make sure the conversion did not happen while
	 ** we were locking the page because anyone that could convert
	 ** must first take i_mutex.
	 **
	 ** We must fix the tail page for writing because it might have buffers
	 ** that are mapped, but have a block number of 0.  This indicates tail
	 ** data that has been read directly into the page, and block_prepare_write
	 ** won't trigger a get_block in this case.
	 */
	fix_tail_page_for_writing(tail_page);
	retval = reiserfs_prepare_write(NULL, tail_page, tail_start, tail_end);
	if (retval)
		goto unlock;

	/* tail conversion might change the data in the page */
	flush_dcache_page(tail_page);

	retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end);

      unlock:
	if (tail_page != hole_page) {
		unlock_page(tail_page);
		page_cache_release(tail_page);
	}
      out:
	return retval;
}
562
/* Allocate one unformatted-node block for the 'block'-th file block,
 * storing the result in *allocated_block_nr.  When preallocation is
 * compiled in and GET_BLOCK_NO_IMUX is not set (i.e. the caller holds
 * i_mutex), the preallocation-aware allocator is used.  Returns the
 * allocator's status code (e.g. NO_DISK_SPACE, QUOTA_EXCEEDED). */
static inline int _allocate_block(struct reiserfs_transaction_handle *th,
				  long block,
				  struct inode *inode,
				  b_blocknr_t * allocated_block_nr,
				  struct treepath *path, int flags)
{
	BUG_ON(!th->t_trans_id);

#ifdef REISERFS_PREALLOCATE
	if (!(flags & GET_BLOCK_NO_IMUX)) {
		return reiserfs_new_unf_blocknrs2(th, inode, allocated_block_nr,
						  path, block);
	}
#endif
	return reiserfs_new_unf_blocknrs(th, inode, allocated_block_nr, path,
					 block);
}
580
/* Map the 'block'-th logical block of a file, allocating it when
 * 'create' contains GET_BLOCK_CREATE.  Handles all the reiserfs cases:
 * non-create lookups are delegated to _get_block_create_0; otherwise
 * the function may plug a hole in an indirect item, insert a new
 * indirect item, convert a direct item (tail) into an unformatted node,
 * or append hole pointers up to the target block.  Transactions are
 * started lazily and, unless GET_BLOCK_NO_DANGLE or a running
 * transaction forbids it, left open ("dangling") for commit_write to
 * finish.  Returns 0 on success or a negative errno. */
int reiserfs_get_block(struct inode *inode, sector_t block,
		       struct buffer_head *bh_result, int create)
{
	int repeat, retval = 0;
	b_blocknr_t allocated_block_nr = 0;	// b_blocknr_t is (unsigned) 32 bit int
	INITIALIZE_PATH(path);
	int pos_in_item;
	struct cpu_key key;
	struct buffer_head *bh, *unbh = NULL;
	struct item_head *ih, tmp_ih;
	__le32 *item;
	int done;
	int fs_gen;
	struct reiserfs_transaction_handle *th = NULL;
	/* space for leaf node + balance + stat data update, plus journaled
	 * quota updates for both user and group quota */
	int jbegin_count =
	    JOURNAL_PER_BALANCE_CNT * 3 + 1 +
	    2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb);
	int version;
	int dangle = 1;
	loff_t new_offset =
	    (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1;

	/* bad.... */
	reiserfs_write_lock(inode->i_sb);
	version = get_inode_item_key_version(inode);

	if (!file_capable(inode, block)) {
		reiserfs_write_unlock(inode->i_sb);
		return -EFBIG;
	}

	/* if !create, we aren't changing the FS, so we don't need to
	 ** log anything, so we don't need to start a transaction
	 */
	if (!(create & GET_BLOCK_CREATE)) {
		int ret;
		/* find number of block-th logical block of the file */
		ret = _get_block_create_0(inode, block, bh_result,
					  create | GET_BLOCK_READ_DIRECT);
		reiserfs_write_unlock(inode->i_sb);
		return ret;
	}
	/*
	 * if we're already in a transaction, make sure to close
	 * any new transactions we start in this func
	 */
	if ((create & GET_BLOCK_NO_DANGLE) ||
	    reiserfs_transaction_running(inode->i_sb))
		dangle = 0;

	/* If file is of such a size, that it might have a tail and tails are enabled
	 ** we should mark it as possibly needing tail packing on close
	 */
	if ((have_large_tails(inode->i_sb)
	     && inode->i_size < i_block_size(inode) * 4)
	    || (have_small_tails(inode->i_sb)
		&& inode->i_size < i_block_size(inode)))
		REISERFS_I(inode)->i_flags |= i_pack_on_close_mask;

	/* set the key of the first byte in the 'block'-th block of file */
	make_cpu_key(&key, inode, new_offset, TYPE_ANY, 3 /*key length */ );
	if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) {
	      start_trans:
		th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count);
		if (!th) {
			retval = -ENOMEM;
			goto failure;
		}
		reiserfs_update_inode_transaction(inode);
	}
      research:

	retval = search_for_position_by_key(inode->i_sb, &key, &path);
	if (retval == IO_ERROR) {
		retval = -EIO;
		goto failure;
	}

	bh = get_last_bh(&path);
	ih = get_ih(&path);
	item = get_item(&path);
	pos_in_item = path.pos_in_item;

	/* remember the fs generation so we can tell whether the item moved
	 * while the allocator may have scheduled */
	fs_gen = get_generation(inode->i_sb);
	copy_item_head(&tmp_ih, ih);

	if (allocation_needed
	    (retval, allocated_block_nr, ih, item, pos_in_item)) {
		/* we have to allocate block for the unformatted node */
		if (!th) {
			pathrelse(&path);
			goto start_trans;
		}

		repeat =
		    _allocate_block(th, block, inode, &allocated_block_nr,
				    &path, create);

		if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) {
			/* restart the transaction to give the journal a chance to free
			 ** some blocks.  releases the path, so we have to go back to
			 ** research if we succeed on the second try
			 */
			SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1;
			retval = restart_transaction(th, inode, &path);
			if (retval)
				goto failure;
			repeat =
			    _allocate_block(th, block, inode,
					    &allocated_block_nr, NULL, create);

			if (repeat != NO_DISK_SPACE && repeat != QUOTA_EXCEEDED) {
				goto research;
			}
			if (repeat == QUOTA_EXCEEDED)
				retval = -EDQUOT;
			else
				retval = -ENOSPC;
			goto failure;
		}

		if (fs_changed(fs_gen, inode->i_sb)
		    && item_moved(&tmp_ih, &path)) {
			goto research;
		}
	}

	if (indirect_item_found(retval, ih)) {
		b_blocknr_t unfm_ptr;
		/* 'block'-th block is in the file already (there is
		   corresponding cell in some indirect item). But it may be
		   zero unformatted node pointer (hole) */
		unfm_ptr = get_block_num(item, pos_in_item);
		if (unfm_ptr == 0) {
			/* use allocated block to plug the hole */
			reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
			if (fs_changed(fs_gen, inode->i_sb)
			    && item_moved(&tmp_ih, &path)) {
				reiserfs_restore_prepared_buffer(inode->i_sb,
								 bh);
				goto research;
			}
			set_buffer_new(bh_result);
			if (buffer_dirty(bh_result)
			    && reiserfs_data_ordered(inode->i_sb))
				reiserfs_add_ordered_list(inode, bh_result);
			put_block_num(item, pos_in_item, allocated_block_nr);
			unfm_ptr = allocated_block_nr;
			journal_mark_dirty(th, inode->i_sb, bh);
			reiserfs_update_sd(th, inode);
		}
		set_block_dev_mapped(bh_result, unfm_ptr, inode);
		pathrelse(&path);
		retval = 0;
		if (!dangle && th)
			retval = reiserfs_end_persistent_transaction(th);

		reiserfs_write_unlock(inode->i_sb);

		/* the item was found, so new blocks were not added to the file
		 ** there is no need to make sure the inode is updated with this
		 ** transaction
		 */
		return retval;
	}

	if (!th) {
		pathrelse(&path);
		goto start_trans;
	}

	/* desired position is not found or is in the direct item. We have
	   to append file with holes up to 'block'-th block converting
	   direct items to indirect one if necessary */
	done = 0;
	do {
		if (is_statdata_le_ih(ih)) {
			__le32 unp = 0;
			struct cpu_key tmp_key;

			/* indirect item has to be inserted */
			make_le_item_head(&tmp_ih, &key, version, 1,
					  TYPE_INDIRECT, UNFM_P_SIZE,
					  0 /* free_space */ );

			if (cpu_key_k_offset(&key) == 1) {
				/* we are going to add 'block'-th block to the file. Use
				   allocated block for that */
				unp = cpu_to_le32(allocated_block_nr);
				set_block_dev_mapped(bh_result,
						     allocated_block_nr, inode);
				set_buffer_new(bh_result);
				done = 1;
			}
			tmp_key = key;	// ;)
			set_cpu_key_k_offset(&tmp_key, 1);
			PATH_LAST_POSITION(&path)++;

			retval =
			    reiserfs_insert_item(th, &path, &tmp_key, &tmp_ih,
						 inode, (char *)&unp);
			if (retval) {
				reiserfs_free_block(th, inode,
						    allocated_block_nr, 1);
				goto failure;	// retval == -ENOSPC, -EDQUOT or -EIO or -EEXIST
			}
			//mark_tail_converted (inode);
		} else if (is_direct_le_ih(ih)) {
			/* direct item has to be converted */
			loff_t tail_offset;

			tail_offset =
			    ((le_ih_k_offset(ih) -
			      1) & ~(inode->i_sb->s_blocksize - 1)) + 1;
			if (tail_offset == cpu_key_k_offset(&key)) {
				/* direct item we just found fits into block we have
				   to map. Convert it into unformatted node: use
				   bh_result for the conversion */
				set_block_dev_mapped(bh_result,
						     allocated_block_nr, inode);
				unbh = bh_result;
				done = 1;
			} else {

				pathrelse(&path);
				/*
				 * ugly, but we can only end the transaction if
				 * we aren't nested
				 */
				BUG_ON(!th->t_refcount);
				if (th->t_refcount == 1) {
					retval =
					    reiserfs_end_persistent_transaction
					    (th);
					th = NULL;
					if (retval)
						goto failure;
				}

				retval =
				    convert_tail_for_hole(inode, bh_result,
							  tail_offset);
				if (retval) {
					if (retval != -ENOSPC)
						reiserfs_warning(inode->i_sb,
								 "clm-6004: convert tail failed inode %lu, error %d",
								 inode->i_ino,
								 retval);
					if (allocated_block_nr) {
						/* the bitmap, the super, and the stat data == 3 */
						if (!th)
							th = reiserfs_persistent_transaction(inode->i_sb, 3);
						if (th)
							reiserfs_free_block(th,
									    inode,
									    allocated_block_nr,
									    1);
					}
					goto failure;
				}
				goto research;
			}
			retval =
			    direct2indirect(th, inode, &path, unbh,
					    tail_offset);
			if (retval) {
				reiserfs_unmap_buffer(unbh);
				reiserfs_free_block(th, inode,
						    allocated_block_nr, 1);
				goto failure;
			}
			/* it is important the set_buffer_uptodate is done after
			 ** the direct2indirect.  The buffer might contain valid
			 ** data newer than the data on disk (read by readpage, changed,
			 ** and then sent here by writepage).  direct2indirect needs
			 ** to know if unbh was already up to date, so it can decide
			 ** if the data in unbh needs to be replaced with data from
			 ** the disk
			 */
			set_buffer_uptodate(unbh);

			/* unbh->b_page == NULL in case of DIRECT_IO request, this means
			   buffer will disappear shortly, so it should not be added to
			 */
			if (unbh->b_page) {
				/* we've converted the tail, so we must
				 ** flush unbh before the transaction commits
				 */
				reiserfs_add_tail_list(inode, unbh);

				/* mark it dirty now to prevent commit_write from adding
				 ** this buffer to the inode's dirty buffer list
				 */
				/*
				 * AKPM: changed __mark_buffer_dirty to mark_buffer_dirty().
				 * It's still atomic, but it sets the page dirty too,
				 * which makes it eligible for writeback at any time by the
				 * VM (which was also the case with __mark_buffer_dirty())
				 */
				mark_buffer_dirty(unbh);
			}
		} else {
			/* append indirect item with holes if needed, when appending
			   pointer to 'block'-th block use block, which is already
			   allocated */
			struct cpu_key tmp_key;
			unp_t unf_single = 0;	// We use this in case we need to allocate only
			// one block which is a fastpath
			unp_t *un;
			__u64 max_to_insert =
			    MAX_ITEM_LEN(inode->i_sb->s_blocksize) /
			    UNFM_P_SIZE;
			__u64 blocks_needed;

			RFALSE(pos_in_item != ih_item_len(ih) / UNFM_P_SIZE,
			       "vs-804: invalid position for append");
			/* indirect item has to be appended, set up key of that position */
			make_cpu_key(&tmp_key, inode,
				     le_key_k_offset(version,
						     &(ih->ih_key)) +
				     op_bytes_number(ih,
						     inode->i_sb->s_blocksize),
				     //pos_in_item * inode->i_sb->s_blocksize,
				     TYPE_INDIRECT, 3);	// key type is unimportant

			RFALSE(cpu_key_k_offset(&tmp_key) > cpu_key_k_offset(&key),
			       "green-805: invalid offset");
			blocks_needed =
			    1 +
			    ((cpu_key_k_offset(&key) -
			      cpu_key_k_offset(&tmp_key)) >> inode->i_sb->
			     s_blocksize_bits);

			if (blocks_needed == 1) {
				un = &unf_single;
			} else {
				un = kzalloc(min(blocks_needed, max_to_insert) * UNFM_P_SIZE, GFP_ATOMIC);	// We need to avoid scheduling.
				if (!un) {
					un = &unf_single;
					blocks_needed = 1;
					max_to_insert = 0;
				}
			}
			if (blocks_needed <= max_to_insert) {
				/* we are going to add target block to the file. Use allocated
				   block for that */
				un[blocks_needed - 1] =
				    cpu_to_le32(allocated_block_nr);
				set_block_dev_mapped(bh_result,
						     allocated_block_nr, inode);
				set_buffer_new(bh_result);
				done = 1;
			} else {
				/* paste hole to the indirect item */
				/* If kmalloc failed, max_to_insert becomes zero and it means we
				   only have space for one block */
				blocks_needed =
				    max_to_insert ? max_to_insert : 1;
			}
			retval =
			    reiserfs_paste_into_item(th, &path, &tmp_key, inode,
						     (char *)un,
						     UNFM_P_SIZE *
						     blocks_needed);

			if (blocks_needed != 1)
				kfree(un);

			if (retval) {
				reiserfs_free_block(th, inode,
						    allocated_block_nr, 1);
				goto failure;
			}
			if (!done) {
				/* We need to mark new file size in case this function will be
				   interrupted/aborted later on. And we may do this only for
				   holes. */
				inode->i_size +=
				    inode->i_sb->s_blocksize * blocks_needed;
			}
		}

		if (done == 1)
			break;

		/* this loop could log more blocks than we had originally asked
		 ** for.  So, we have to allow the transaction to end if it is
		 ** too big or too full.  Update the inode so things are
		 ** consistent if we crash before the function returns
		 **
		 ** release the path so that anybody waiting on the path before
		 ** ending their transaction will be able to continue.
		 */
		if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
			retval = restart_transaction(th, inode, &path);
			if (retval)
				goto failure;
		}
		/* inserting indirect pointers for a hole can take a
		 ** long time.  reschedule if needed
		 */
		cond_resched();

		retval = search_for_position_by_key(inode->i_sb, &key, &path);
		if (retval == IO_ERROR) {
			retval = -EIO;
			goto failure;
		}
		if (retval == POSITION_FOUND) {
			reiserfs_warning(inode->i_sb,
					 "vs-825: reiserfs_get_block: "
					 "%K should not be found", &key);
			retval = -EEXIST;
			if (allocated_block_nr)
				reiserfs_free_block(th, inode,
						    allocated_block_nr, 1);
			pathrelse(&path);
			goto failure;
		}
		bh = get_last_bh(&path);
		ih = get_ih(&path);
		item = get_item(&path);
		pos_in_item = path.pos_in_item;
	} while (1);

	retval = 0;

      failure:
	/* close the transaction unless it should dangle for commit_write;
	 * also close it on error when it never got a trans_id */
	if (th && (!dangle || (retval && !th->t_trans_id))) {
		int err;
		if (th->t_trans_id)
			reiserfs_update_sd(th, inode);
		err = reiserfs_end_persistent_transaction(th);
		if (err)
			retval = err;
	}

	reiserfs_write_unlock(inode->i_sb);
	reiserfs_check_path(&path);
	return retval;
}
1022
static int
reiserfs_readpages(struct file *file, struct address_space *mapping,
		   struct list_head *pages, unsigned nr_pages)
{
	/* Thin wrapper: let the generic mpage code batch readahead pages
	 * into large bios, using reiserfs_get_block for block mapping. */
	return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block);
}
1029
1030/* Compute real number of used bytes by file
1031 * Following three functions can go away when we'll have enough space in stat item
1032 */
1033static int real_space_diff(struct inode *inode, int sd_size)
1034{
1035	int bytes;
1036	loff_t blocksize = inode->i_sb->s_blocksize;
1037
1038	if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode))
1039		return sd_size;
1040
1041	/* End of file is also in full block with indirect reference, so round
1042	 ** up to the next block.
1043	 **
1044	 ** there is just no way to know if the tail is actually packed
1045	 ** on the file, so we have to assume it isn't.  When we pack the
1046	 ** tail, we add 4 bytes to pretend there really is an unformatted
1047	 ** node pointer
1048	 */
1049	bytes =
1050	    ((inode->i_size +
1051	      (blocksize - 1)) >> inode->i_sb->s_blocksize_bits) * UNFM_P_SIZE +
1052	    sd_size;
1053	return bytes;
1054}
1055
1056static inline loff_t to_real_used_space(struct inode *inode, ulong blocks,
1057					int sd_size)
1058{
1059	if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
1060		return inode->i_size +
1061		    (loff_t) (real_space_diff(inode, sd_size));
1062	}
1063	return ((loff_t) real_space_diff(inode, sd_size)) +
1064	    (((loff_t) blocks) << 9);
1065}
1066
1067/* Compute number of blocks used by file in ReiserFS counting */
1068static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size)
1069{
1070	loff_t bytes = inode_get_bytes(inode);
1071	loff_t real_space = real_space_diff(inode, sd_size);
1072
1073	/* keeps fsck and non-quota versions of reiserfs happy */
1074	if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
1075		bytes += (loff_t) 511;
1076	}
1077
1078	/* files from before the quota patch might i_blocks such that
1079	 ** bytes < real_space.  Deal with that here to prevent it from
1080	 ** going negative.
1081	 */
1082	if (bytes < real_space)
1083		return 0;
1084	return (bytes - real_space) >> 9;
1085}
1086
1087//
1088// BAD: new directories have stat data of new type and all other items
1089// of old type. Version stored in the inode says about body items, so
1090// in update_stat_data we can not rely on inode, but have to check
1091// item version directly
1092//
1093
/* called by read_locked_inode: fill the in-core inode from the stat data
 * item that *path points at.  Handles both the old (v1/3.5) and new
 * (v2/3.6) stat data layouts, releases the path, and wires up the inode,
 * file and address-space operations for the object's type. */
static void init_inode(struct inode *inode, struct treepath *path)
{
	struct buffer_head *bh;
	struct item_head *ih;
	__u32 rdev;
	//int version = ITEM_VERSION_1;

	bh = PATH_PLAST_BUFFER(path);
	ih = PATH_PITEM_HEAD(path);

	/* the found item's key becomes the inode's key */
	copy_key(INODE_PKEY(inode), &(ih->ih_key));

	/* reset the reiserfs-private portion of the inode */
	INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list));
	REISERFS_I(inode)->i_flags = 0;
	REISERFS_I(inode)->i_prealloc_block = 0;
	REISERFS_I(inode)->i_prealloc_count = 0;
	REISERFS_I(inode)->i_trans_id = 0;
	REISERFS_I(inode)->i_jl = NULL;
	mutex_init(&(REISERFS_I(inode)->i_mmap));
	reiserfs_init_acl_access(inode);
	reiserfs_init_acl_default(inode);
	reiserfs_init_xattr_rwsem(inode);

	if (stat_data_v1(ih)) {
		/* old stat data (reiserfs 3.5 format) */
		struct stat_data_v1 *sd =
		    (struct stat_data_v1 *)B_I_PITEM(bh, ih);
		unsigned long blocks;

		set_inode_item_key_version(inode, KEY_FORMAT_3_5);
		set_inode_sd_version(inode, STAT_DATA_V1);
		inode->i_mode = sd_v1_mode(sd);
		inode->i_nlink = sd_v1_nlink(sd);
		inode->i_uid = sd_v1_uid(sd);
		inode->i_gid = sd_v1_gid(sd);
		inode->i_size = sd_v1_size(sd);
		inode->i_atime.tv_sec = sd_v1_atime(sd);
		inode->i_mtime.tv_sec = sd_v1_mtime(sd);
		inode->i_ctime.tv_sec = sd_v1_ctime(sd);
		/* v1 stat data has no sub-second timestamps */
		inode->i_atime.tv_nsec = 0;
		inode->i_ctime.tv_nsec = 0;
		inode->i_mtime.tv_nsec = 0;

		inode->i_blocks = sd_v1_blocks(sd);
		inode->i_generation = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
		/* expected block count based on file size, rounded up to
		 * whole fs blocks (in 512-byte units) */
		blocks = (inode->i_size + 511) >> 9;
		blocks = _ROUND_UP(blocks, inode->i_sb->s_blocksize >> 9);
		if (inode->i_blocks > blocks) {
			// there was a bug in <=3.5.23 when i_blocks could take negative
			// values. Starting from 3.5.17 this value could even be stored in
			// stat data. For such files we set i_blocks based on file
			// size. Just 2 notes: this can be wrong for sparse files. On-disk value will be
			// only updated if file's inode will ever change
			inode->i_blocks = blocks;
		}

		rdev = sd_v1_rdev(sd);
		REISERFS_I(inode)->i_first_direct_byte =
		    sd_v1_first_direct_byte(sd);
		/* an early bug in the quota code can give us an odd number for the
		 ** block count.  This is incorrect, fix it here.
		 */
		if (inode->i_blocks & 1) {
			inode->i_blocks++;
		}
		inode_set_bytes(inode,
				to_real_used_space(inode, inode->i_blocks,
						   SD_V1_SIZE));
		/* nopack is initially zero for v1 objects. For v2 objects,
		   nopack is initialised from sd_attrs */
		REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
	} else {
		// new stat data found, but object may have old items
		// (directories and symlinks)
		struct stat_data *sd = (struct stat_data *)B_I_PITEM(bh, ih);

		inode->i_mode = sd_v2_mode(sd);
		inode->i_nlink = sd_v2_nlink(sd);
		inode->i_uid = sd_v2_uid(sd);
		inode->i_size = sd_v2_size(sd);
		inode->i_gid = sd_v2_gid(sd);
		inode->i_mtime.tv_sec = sd_v2_mtime(sd);
		inode->i_atime.tv_sec = sd_v2_atime(sd);
		inode->i_ctime.tv_sec = sd_v2_ctime(sd);
		inode->i_ctime.tv_nsec = 0;
		inode->i_mtime.tv_nsec = 0;
		inode->i_atime.tv_nsec = 0;
		inode->i_blocks = sd_v2_blocks(sd);
		rdev = sd_v2_rdev(sd);
		/* device nodes store rdev where other objects store their
		 * generation number */
		if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
			inode->i_generation =
			    le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
		else
			inode->i_generation = sd_v2_generation(sd);

		/* directories and symlinks keep old-format item keys even
		 * with new stat data (see note above this function) */
		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
			set_inode_item_key_version(inode, KEY_FORMAT_3_5);
		else
			set_inode_item_key_version(inode, KEY_FORMAT_3_6);
		REISERFS_I(inode)->i_first_direct_byte = 0;
		set_inode_sd_version(inode, STAT_DATA_V2);
		inode_set_bytes(inode,
				to_real_used_space(inode, inode->i_blocks,
						   SD_V2_SIZE));
		/* read persistent inode attributes from sd and initalise
		   generic inode flags from them */
		REISERFS_I(inode)->i_attrs = sd_v2_attrs(sd);
		sd_attrs_to_i_attrs(sd_v2_attrs(sd), inode);
	}

	pathrelse(path);
	/* wire up the operation vectors for this object's type */
	if (S_ISREG(inode->i_mode)) {
		inode->i_op = &reiserfs_file_inode_operations;
		inode->i_fop = &reiserfs_file_operations;
		inode->i_mapping->a_ops = &reiserfs_address_space_operations;
	} else if (S_ISDIR(inode->i_mode)) {
		inode->i_op = &reiserfs_dir_inode_operations;
		inode->i_fop = &reiserfs_dir_operations;
	} else if (S_ISLNK(inode->i_mode)) {
		inode->i_op = &reiserfs_symlink_inode_operations;
		inode->i_mapping->a_ops = &reiserfs_address_space_operations;
	} else {
		inode->i_blocks = 0;
		inode->i_op = &reiserfs_special_inode_operations;
		init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
	}
}
1221
1222// update new stat data with inode fields
1223static void inode2sd(void *sd, struct inode *inode, loff_t size)
1224{
1225	struct stat_data *sd_v2 = (struct stat_data *)sd;
1226	__u16 flags;
1227
1228	set_sd_v2_mode(sd_v2, inode->i_mode);
1229	set_sd_v2_nlink(sd_v2, inode->i_nlink);
1230	set_sd_v2_uid(sd_v2, inode->i_uid);
1231	set_sd_v2_size(sd_v2, size);
1232	set_sd_v2_gid(sd_v2, inode->i_gid);
1233	set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec);
1234	set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec);
1235	set_sd_v2_ctime(sd_v2, inode->i_ctime.tv_sec);
1236	set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE));
1237	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
1238		set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev));
1239	else
1240		set_sd_v2_generation(sd_v2, inode->i_generation);
1241	flags = REISERFS_I(inode)->i_attrs;
1242	i_attrs_to_sd_attrs(inode, &flags);
1243	set_sd_v2_attrs(sd_v2, flags);
1244}
1245
1246// used to copy inode's fields to old stat data
1247static void inode2sd_v1(void *sd, struct inode *inode, loff_t size)
1248{
1249	struct stat_data_v1 *sd_v1 = (struct stat_data_v1 *)sd;
1250
1251	set_sd_v1_mode(sd_v1, inode->i_mode);
1252	set_sd_v1_uid(sd_v1, inode->i_uid);
1253	set_sd_v1_gid(sd_v1, inode->i_gid);
1254	set_sd_v1_nlink(sd_v1, inode->i_nlink);
1255	set_sd_v1_size(sd_v1, size);
1256	set_sd_v1_atime(sd_v1, inode->i_atime.tv_sec);
1257	set_sd_v1_ctime(sd_v1, inode->i_ctime.tv_sec);
1258	set_sd_v1_mtime(sd_v1, inode->i_mtime.tv_sec);
1259
1260	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
1261		set_sd_v1_rdev(sd_v1, new_encode_dev(inode->i_rdev));
1262	else
1263		set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE));
1264
1265	// Sigh. i_first_direct_byte is back
1266	set_sd_v1_first_direct_byte(sd_v1,
1267				    REISERFS_I(inode)->i_first_direct_byte);
1268}
1269
1270/* NOTE, you must prepare the buffer head before sending it here,
1271** and then log it after the call
1272*/
1273static void update_stat_data(struct treepath *path, struct inode *inode,
1274			     loff_t size)
1275{
1276	struct buffer_head *bh;
1277	struct item_head *ih;
1278
1279	bh = PATH_PLAST_BUFFER(path);
1280	ih = PATH_PITEM_HEAD(path);
1281
1282	if (!is_statdata_le_ih(ih))
1283		reiserfs_panic(inode->i_sb,
1284			       "vs-13065: update_stat_data: key %k, found item %h",
1285			       INODE_PKEY(inode), ih);
1286
1287	if (stat_data_v1(ih)) {
1288		// path points to old stat data
1289		inode2sd_v1(B_I_PITEM(bh, ih), inode, size);
1290	} else {
1291		inode2sd(B_I_PITEM(bh, ih), inode, size);
1292	}
1293
1294	return;
1295}
1296
/* Find the object's stat data in the tree and copy the in-core inode
 * fields (with the given size) into it, marking the buffer dirty in the
 * journal.  Retries the search if the item moved while preparing the
 * buffer for journaling.  Errors are reported via reiserfs_warning and
 * otherwise swallowed — the caller gets no return value. */
void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th,
			     struct inode *inode, loff_t size)
{
	struct cpu_key key;
	INITIALIZE_PATH(path);
	struct buffer_head *bh;
	int fs_gen;
	struct item_head *ih, tmp_ih;
	int retval;

	BUG_ON(!th->t_trans_id);

	make_cpu_key(&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3);	//key type is unimportant

	for (;;) {
		int pos;
		/* look for the object's stat data */
		retval = search_item(inode->i_sb, &key, &path);
		if (retval == IO_ERROR) {
			reiserfs_warning(inode->i_sb,
					 "vs-13050: reiserfs_update_sd: "
					 "i/o failure occurred trying to update %K stat data",
					 &key);
			return;
		}
		if (retval == ITEM_NOT_FOUND) {
			pos = PATH_LAST_POSITION(&path);
			pathrelse(&path);
			/* a missing stat data is expected for an unlinked
			 * inode; only warn when nlink says it should exist */
			if (inode->i_nlink == 0) {
				/*reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: i_nlink == 0, stat data not found"); */
				return;
			}
			reiserfs_warning(inode->i_sb,
					 "vs-13060: reiserfs_update_sd: "
					 "stat data of object %k (nlink == %d) not found (pos %d)",
					 INODE_PKEY(inode), inode->i_nlink,
					 pos);
			reiserfs_check_path(&path);
			return;
		}

		/* sigh, prepare_for_journal might schedule.  When it schedules the
		 ** FS might change.  We have to detect that, and loop back to the
		 ** search if the stat data item has moved
		 */
		bh = get_last_bh(&path);
		ih = get_ih(&path);
		copy_item_head(&tmp_ih, ih);
		fs_gen = get_generation(inode->i_sb);
		reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
		if (fs_changed(fs_gen, inode->i_sb)
		    && item_moved(&tmp_ih, &path)) {
			reiserfs_restore_prepared_buffer(inode->i_sb, bh);
			continue;	/* Stat_data item has been moved after scheduling. */
		}
		break;
	}
	update_stat_data(&path, inode, size);
	journal_mark_dirty(th, th->t_super, bh);
	pathrelse(&path);
	return;
}
1359
/* reiserfs_read_locked_inode is called to read the inode off disk, and it
** does a make_bad_inode when things go wrong.  But, we need to make sure
** and clear the key in the private portion of the inode, otherwise a
** corresponding iput might try to delete whatever object the inode last
** represented.
*/
static void reiserfs_make_bad_inode(struct inode *inode)
{
	/* zero the key first so iput cannot act on the stale object */
	memset(INODE_PKEY(inode), 0, KEY_SIZE);
	make_bad_inode(inode);
}
1371
1372//
1373// initially this function was derived from minix or ext2's analog and
1374// evolved as the prototype did
1375//
1376
1377int reiserfs_init_locked_inode(struct inode *inode, void *p)
1378{
1379	struct reiserfs_iget_args *args = (struct reiserfs_iget_args *)p;
1380	inode->i_ino = args->objectid;
1381	INODE_PKEY(inode)->k_dir_id = cpu_to_le32(args->dirid);
1382	return 0;
1383}
1384
/* looks for stat data in the tree, and fills up the fields of in-core
   inode stat data fields */
/* On any failure (I/O error, item not found, dead inode) the inode is
 * turned into a bad inode via reiserfs_make_bad_inode. */
void reiserfs_read_locked_inode(struct inode *inode,
				struct reiserfs_iget_args *args)
{
	INITIALIZE_PATH(path_to_sd);
	struct cpu_key key;
	unsigned long dirino;
	int retval;

	dirino = args->dirid;

	/* set version 1, version 2 could be used too, because stat data
	   key is the same in both versions */
	key.version = KEY_FORMAT_3_5;
	key.on_disk_key.k_dir_id = dirino;
	key.on_disk_key.k_objectid = inode->i_ino;
	key.on_disk_key.k_offset = 0;
	key.on_disk_key.k_type = 0;

	/* look for the object's stat data */
	retval = search_item(inode->i_sb, &key, &path_to_sd);
	if (retval == IO_ERROR) {
		reiserfs_warning(inode->i_sb,
				 "vs-13070: reiserfs_read_locked_inode: "
				 "i/o failure occurred trying to find stat data of %K",
				 &key);
		reiserfs_make_bad_inode(inode);
		return;
	}
	if (retval != ITEM_FOUND) {
		/* a stale NFS handle can trigger this without it being an error */
		pathrelse(&path_to_sd);
		reiserfs_make_bad_inode(inode);
		inode->i_nlink = 0;
		return;
	}

	/* stat data found: fill in the in-core inode (releases the path) */
	init_inode(inode, &path_to_sd);

	/* It is possible that knfsd is trying to access inode of a file
	   that is being removed from the disk by some other thread. As we
	   update sd on unlink all that is required is to check for nlink
	   here. This bug was first found by Sizif when debugging
	   SquidNG/Butterfly, forgotten, and found again after Philippe
	   Gramoulle <philippe.gramoulle@mmania.com> reproduced it.

	   More logical fix would require changes in fs/inode.c:iput() to
	   remove inode from hash-table _after_ fs cleaned disk stuff up and
	   in iget() to return NULL if I_FREEING inode is found in
	   hash-table. */
	/* Currently there is one place where it's ok to meet inode with
	   nlink==0: processing of open-unlinked and half-truncated files
	   during mount (fs/reiserfs/super.c:finish_unfinished()). */
	if ((inode->i_nlink == 0) &&
	    !REISERFS_SB(inode->i_sb)->s_is_unlinked_ok) {
		reiserfs_warning(inode->i_sb,
				 "vs-13075: reiserfs_read_locked_inode: "
				 "dead inode read from disk %K. "
				 "This is likely to be race with knfsd. Ignore",
				 &key);
		reiserfs_make_bad_inode(inode);
	}

	reiserfs_check_path(&path_to_sd);	/* init inode should be relsing */

}
1452
1453/**
1454 * reiserfs_find_actor() - "find actor" reiserfs supplies to iget5_locked().
1455 *
1456 * @inode:    inode from hash table to check
1457 * @opaque:   "cookie" passed to iget5_locked(). This is &reiserfs_iget_args.
1458 *
1459 * This function is called by iget5_locked() to distinguish reiserfs inodes
1460 * having the same inode numbers. Such inodes can only exist due to some
1461 * error condition. One of them should be bad. Inodes with identical
1462 * inode numbers (objectids) are distinguished by parent directory ids.
1463 *
1464 */
1465int reiserfs_find_actor(struct inode *inode, void *opaque)
1466{
1467	struct reiserfs_iget_args *args;
1468
1469	args = opaque;
1470	/* args is already in CPU order */
1471	return (inode->i_ino == args->objectid) &&
1472	    (le32_to_cpu(INODE_PKEY(inode)->k_dir_id) == args->dirid);
1473}
1474
/* Look up (or read in) the inode for the given key.  Returns NULL when
 * the inode could not be validated (stale NFS handle or I/O error made
 * it bad, or its key no longer matches), ERR_PTR(-ENOMEM) when no inode
 * could be allocated, or the valid inode. */
struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key)
{
	struct inode *inode;
	struct reiserfs_iget_args args;

	args.objectid = key->on_disk_key.k_objectid;
	args.dirid = key->on_disk_key.k_dir_id;
	/* find_actor distinguishes inodes sharing an objectid by dirid;
	 * init_locked_inode stamps a freshly allocated one */
	inode = iget5_locked(s, key->on_disk_key.k_objectid,
			     reiserfs_find_actor, reiserfs_init_locked_inode,
			     (void *)(&args));
	if (!inode)
		return ERR_PTR(-ENOMEM);

	if (inode->i_state & I_NEW) {
		/* freshly allocated: read the stat data off disk */
		reiserfs_read_locked_inode(inode, &args);
		unlock_new_inode(inode);
	}

	if (comp_short_keys(INODE_PKEY(inode), key) || is_bad_inode(inode)) {
		/* either due to i/o error or a stale NFS handle */
		iput(inode);
		inode = NULL;
	}
	return inode;
}
1500
/* Build a dentry from an exported NFS handle fragment.  Layout (as this
 * code reads it): data[0] = objectid, data[1] = dir id, data[2] =
 * generation (0 means "don't check").  Returns a dentry, or
 * ERR_PTR(-ESTALE) when the object no longer exists / the generation
 * mismatches, or ERR_PTR(-ENOMEM). */
struct dentry *reiserfs_get_dentry(struct super_block *sb, void *vobjp)
{
	__u32 *data = vobjp;
	struct cpu_key key;
	struct dentry *result;
	struct inode *inode;

	key.on_disk_key.k_objectid = data[0];
	key.on_disk_key.k_dir_id = data[1];
	reiserfs_write_lock(sb);
	inode = reiserfs_iget(sb, &key);
	/* a non-zero stored generation that disagrees with the inode's
	 * means the objectid was reused: treat the handle as stale */
	if (inode && !IS_ERR(inode) && data[2] != 0 &&
	    data[2] != inode->i_generation) {
		iput(inode);
		inode = NULL;
	}
	reiserfs_write_unlock(sb);
	if (!inode)
		inode = ERR_PTR(-ESTALE);
	if (IS_ERR(inode))
		return ERR_PTR(PTR_ERR(inode));
	result = d_alloc_anon(inode);
	if (!result) {
		iput(inode);
		return ERR_PTR(-ENOMEM);
	}
	return result;
}
1529
/* Decode an NFS file handle into object/parent key triples and hand them
 * to find_exported_dentry.  See the fhtype table below for the layouts. */
struct dentry *reiserfs_decode_fh(struct super_block *sb, __u32 * data,
				  int len, int fhtype,
				  int (*acceptable) (void *contect,
						     struct dentry * de),
				  void *context)
{
	__u32 obj[3], parent[3];

	/* fhtype happens to reflect the number of u32s encoded.
	 * due to a bug in earlier code, fhtype might indicate there
	 * are more u32s then actually fitted.
	 * so if fhtype seems to be more than len, reduce fhtype.
	 * Valid types are:
	 *   2 - objectid + dir_id - legacy support
	 *   3 - objectid + dir_id + generation
	 *   4 - objectid + dir_id + objectid and dirid of parent - legacy
	 *   5 - objectid + dir_id + generation + objectid and dirid of parent
	 *   6 - as above plus generation of directory
	 * 6 does not fit in NFSv2 handles
	 */
	if (fhtype > len) {
		/* the one known-harmless case is type 6 squeezed into 5 slots */
		if (fhtype != 6 || len != 5)
			reiserfs_warning(sb,
					 "nfsd/reiserfs, fhtype=%d, len=%d - odd",
					 fhtype, len);
		fhtype = 5;
	}

	obj[0] = data[0];
	obj[1] = data[1];
	if (fhtype == 3 || fhtype >= 5)
		obj[2] = data[2];
	else
		obj[2] = 0;	/* generation number */

	/* parent triple is present for types >= 4; its position depends on
	 * whether the object's generation was encoded */
	if (fhtype >= 4) {
		parent[0] = data[fhtype >= 5 ? 3 : 2];
		parent[1] = data[fhtype >= 5 ? 4 : 3];
		if (fhtype == 6)
			parent[2] = data[5];
		else
			parent[2] = 0;
	}
	return sb->s_export_op->find_exported_dentry(sb, obj,
						     fhtype < 4 ? NULL : parent,
						     acceptable, context);
}
1577
/* Encode an NFS file handle for the dentry: objectid, dir id and
 * generation of the object, optionally followed by the parent's
 * objectid/dir id (and generation if there is room).  Returns the
 * resulting fhtype (number of u32s stored) or 255 when even the minimal
 * 3-word handle does not fit in *lenp. */
int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp,
		       int need_parent)
{
	struct inode *inode = dentry->d_inode;
	int maxlen = *lenp;

	if (maxlen < 3)
		return 255;

	data[0] = inode->i_ino;
	data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
	data[2] = inode->i_generation;
	*lenp = 3;
	/* no room for directory info? return what we've stored so far */
	if (maxlen < 5 || !need_parent)
		return 3;

	/* d_lock keeps d_parent stable while we read the parent's keys */
	spin_lock(&dentry->d_lock);
	inode = dentry->d_parent->d_inode;
	data[3] = inode->i_ino;
	data[4] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
	*lenp = 5;
	if (maxlen >= 6) {
		data[5] = inode->i_generation;
		*lenp = 6;
	}
	spin_unlock(&dentry->d_lock);
	return *lenp;
}
1607
1608/* looks for stat data, then copies fields to it, marks the buffer
1609   containing stat data as dirty */
1610/* reiserfs inodes are never really dirty, since the dirty inode call
1611** always logs them.  This call allows the VFS inode marking routines
1612** to properly mark inodes for datasync and such, but only actually
1613** does something when called for a synchronous update.
1614*/
1615int reiserfs_write_inode(struct inode *inode, int do_sync)
1616{
1617	struct reiserfs_transaction_handle th;
1618	int jbegin_count = 1;
1619
1620	if (inode->i_sb->s_flags & MS_RDONLY)
1621		return -EROFS;
1622	/* memory pressure can sometimes initiate write_inode calls with sync == 1,
1623	 ** these cases are just when the system needs ram, not when the
1624	 ** inode needs to reach disk for safety, and they can safely be
1625	 ** ignored because the altered inode has already been logged.
1626	 */
1627	if (do_sync && !(current->flags & PF_MEMALLOC)) {
1628		reiserfs_write_lock(inode->i_sb);
1629		if (!journal_begin(&th, inode->i_sb, jbegin_count)) {
1630			reiserfs_update_sd(&th, inode);
1631			journal_end_sync(&th, inode->i_sb, jbegin_count);
1632		}
1633		reiserfs_write_unlock(inode->i_sb);
1634	}
1635	return 0;
1636}
1637
/* stat data of new object is inserted already, this inserts the item
   containing "." and ".." entries */
/* Returns 0 on success, -EIO on a tree search I/O failure, -EEXIST when
 * an item with this key already exists, or the result of
 * reiserfs_insert_item. */
static int reiserfs_new_directory(struct reiserfs_transaction_handle *th,
				  struct inode *inode,
				  struct item_head *ih, struct treepath *path,
				  struct inode *dir)
{
	struct super_block *sb = th->t_super;
	char empty_dir[EMPTY_DIR_SIZE];
	char *body = empty_dir;
	struct cpu_key key;
	int retval;

	BUG_ON(!th->t_trans_id);

	/* key of the first directory entry ("."), under the new object */
	_make_cpu_key(&key, KEY_FORMAT_3_5, le32_to_cpu(ih->ih_key.k_dir_id),
		      le32_to_cpu(ih->ih_key.k_objectid), DOT_OFFSET,
		      TYPE_DIRENTRY, 3 /*key length */ );

	/* compose item head for new item. Directories consist of items of
	   old type (ITEM_VERSION_1). Do not set key (second arg is 0), it
	   is done by reiserfs_new_inode */
	if (old_format_only(sb)) {
		make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET,
				  TYPE_DIRENTRY, EMPTY_DIR_SIZE_V1, 2);

		make_empty_dir_item_v1(body, ih->ih_key.k_dir_id,
				       ih->ih_key.k_objectid,
				       INODE_PKEY(dir)->k_dir_id,
				       INODE_PKEY(dir)->k_objectid);
	} else {
		make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET,
				  TYPE_DIRENTRY, EMPTY_DIR_SIZE, 2);

		make_empty_dir_item(body, ih->ih_key.k_dir_id,
				    ih->ih_key.k_objectid,
				    INODE_PKEY(dir)->k_dir_id,
				    INODE_PKEY(dir)->k_objectid);
	}

	/* look for place in the tree for new item */
	retval = search_item(sb, &key, path);
	if (retval == IO_ERROR) {
		reiserfs_warning(sb, "vs-13080: reiserfs_new_directory: "
				 "i/o failure occurred creating new directory");
		return -EIO;
	}
	if (retval == ITEM_FOUND) {
		pathrelse(path);
		reiserfs_warning(sb, "vs-13070: reiserfs_new_directory: "
				 "object with this key exists (%k)",
				 &(ih->ih_key));
		return -EEXIST;
	}

	/* insert item, that is empty directory item */
	return reiserfs_insert_item(th, path, &key, ih, inode, body);
}
1696
1697/* stat data of object has been inserted, this inserts the item
1698   containing the body of symlink */
1699static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, struct inode *inode,	/* Inode of symlink */
1700				struct item_head *ih,
1701				struct treepath *path, const char *symname,
1702				int item_len)
1703{
1704	struct super_block *sb = th->t_super;
1705	struct cpu_key key;
1706	int retval;
1707
1708	BUG_ON(!th->t_trans_id);
1709
1710	_make_cpu_key(&key, KEY_FORMAT_3_5,
1711		      le32_to_cpu(ih->ih_key.k_dir_id),
1712		      le32_to_cpu(ih->ih_key.k_objectid),
1713		      1, TYPE_DIRECT, 3 /*key length */ );
1714
1715	make_le_item_head(ih, NULL, KEY_FORMAT_3_5, 1, TYPE_DIRECT, item_len,
1716			  0 /*free_space */ );
1717
1718	/* look for place in the tree for new item */
1719	retval = search_item(sb, &key, path);
1720	if (retval == IO_ERROR) {
1721		reiserfs_warning(sb, "vs-13080: reiserfs_new_symlinik: "
1722				 "i/o failure occurred creating new symlink");
1723		return -EIO;
1724	}
1725	if (retval == ITEM_FOUND) {
1726		pathrelse(path);
1727		reiserfs_warning(sb, "vs-13080: reiserfs_new_symlink: "
1728				 "object with this key exists (%k)",
1729				 &(ih->ih_key));
1730		return -EEXIST;
1731	}
1732
1733	/* insert item, that is body of symlink */
1734	return reiserfs_insert_item(th, path, &key, ih, inode, symname);
1735}
1736
1737/* inserts the stat data into the tree, and then calls
1738   reiserfs_new_directory (to insert ".", ".." item if new object is
1739   directory) or reiserfs_new_symlink (to insert symlink body if new
1740   object is symlink) or nothing (if new object is regular file)
1741
1742   NOTE! uid and gid must already be set in the inode.  If we return
1743   non-zero due to an error, we have to drop the quota previously allocated
1744   for the fresh inode.  This can only be done outside a transaction, so
1745   if we return non-zero, we also end the transaction.  */
1746int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1747		       struct inode *dir, int mode, const char *symname,
1748		       /* 0 for regular, EMTRY_DIR_SIZE for dirs,
1749		          strlen (symname) for symlinks) */
1750		       loff_t i_size, struct dentry *dentry,
1751		       struct inode *inode)
1752{
1753	struct super_block *sb;
1754	INITIALIZE_PATH(path_to_key);
1755	struct cpu_key key;
1756	struct item_head ih;
1757	struct stat_data sd;
1758	int retval;
1759	int err;
1760
1761	BUG_ON(!th->t_trans_id);
1762
1763	if (DQUOT_ALLOC_INODE(inode)) {
1764		err = -EDQUOT;
1765		goto out_end_trans;
1766	}
1767	if (!dir->i_nlink) {
1768		err = -EPERM;
1769		goto out_bad_inode;
1770	}
1771
1772	sb = dir->i_sb;
1773
1774	/* item head of new item */
1775	ih.ih_key.k_dir_id = reiserfs_choose_packing(dir);
1776	ih.ih_key.k_objectid = cpu_to_le32(reiserfs_get_unused_objectid(th));
1777	if (!ih.ih_key.k_objectid) {
1778		err = -ENOMEM;
1779		goto out_bad_inode;
1780	}
1781	if (old_format_only(sb))
1782		/* not a perfect generation count, as object ids can be reused, but
1783		 ** this is as good as reiserfs can do right now.
1784		 ** note that the private part of inode isn't filled in yet, we have
1785		 ** to use the directory.
1786		 */
1787		inode->i_generation = le32_to_cpu(INODE_PKEY(dir)->k_objectid);
1788	else
1789#if defined(USE_INODE_GENERATION_COUNTER)
1790		inode->i_generation =
1791		    le32_to_cpu(REISERFS_SB(sb)->s_rs->s_inode_generation);
1792#else
1793		inode->i_generation = ++event;
1794#endif
1795
1796	/* fill stat data */
1797	inode->i_nlink = (S_ISDIR(mode) ? 2 : 1);
1798
1799	/* uid and gid must already be set by the caller for quota init */
1800
1801	/* symlink cannot be immutable or append only, right? */
1802	if (S_ISLNK(inode->i_mode))
1803		inode->i_flags &= ~(S_IMMUTABLE | S_APPEND);
1804
1805	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
1806	inode->i_size = i_size;
1807	inode->i_blocks = 0;
1808	inode->i_bytes = 0;
1809	REISERFS_I(inode)->i_first_direct_byte = S_ISLNK(mode) ? 1 :
1810	    U32_MAX /*NO_BYTES_IN_DIRECT_ITEM */ ;
1811
1812	INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list));
1813	REISERFS_I(inode)->i_flags = 0;
1814	REISERFS_I(inode)->i_prealloc_block = 0;
1815	REISERFS_I(inode)->i_prealloc_count = 0;
1816	REISERFS_I(inode)->i_trans_id = 0;
1817	REISERFS_I(inode)->i_jl = NULL;
1818	REISERFS_I(inode)->i_attrs =
1819	    REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK;
1820	sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode);
1821	mutex_init(&(REISERFS_I(inode)->i_mmap));
1822	reiserfs_init_acl_access(inode);
1823	reiserfs_init_acl_default(inode);
1824	reiserfs_init_xattr_rwsem(inode);
1825
1826	if (old_format_only(sb))
1827		make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET,
1828				  TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT);
1829	else
1830		make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET,
1831				  TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
1832
1833	/* key to search for correct place for new stat data */
1834	_make_cpu_key(&key, KEY_FORMAT_3_6, le32_to_cpu(ih.ih_key.k_dir_id),
1835		      le32_to_cpu(ih.ih_key.k_objectid), SD_OFFSET,
1836		      TYPE_STAT_DATA, 3 /*key length */ );
1837
1838	/* find proper place for inserting of stat data */
1839	retval = search_item(sb, &key, &path_to_key);
1840	if (retval == IO_ERROR) {
1841		err = -EIO;
1842		goto out_bad_inode;
1843	}
1844	if (retval == ITEM_FOUND) {
1845		pathrelse(&path_to_key);
1846		err = -EEXIST;
1847		goto out_bad_inode;
1848	}
1849	if (old_format_only(sb)) {
1850		if (inode->i_uid & ~0xffff || inode->i_gid & ~0xffff) {
1851			pathrelse(&path_to_key);
1852			/* i_uid or i_gid is too big to be stored in stat data v3.5 */
1853			err = -EINVAL;
1854			goto out_bad_inode;
1855		}
1856		inode2sd_v1(&sd, inode, inode->i_size);
1857	} else {
1858		inode2sd(&sd, inode, inode->i_size);
1859	}
1860	// these do not go to on-disk stat data
1861	inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid);
1862
1863	// store in in-core inode the key of stat data and version all
1864	// object items will have (directory items will have old offset
1865	// format, other new objects will consist of new items)
1866	memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE);
1867	if (old_format_only(sb) || S_ISDIR(mode) || S_ISLNK(mode))
1868		set_inode_item_key_version(inode, KEY_FORMAT_3_5);
1869	else
1870		set_inode_item_key_version(inode, KEY_FORMAT_3_6);
1871	if (old_format_only(sb))
1872		set_inode_sd_version(inode, STAT_DATA_V1);
1873	else
1874		set_inode_sd_version(inode, STAT_DATA_V2);
1875
1876	/* insert the stat data into the tree */
1877#ifdef DISPLACE_NEW_PACKING_LOCALITIES
1878	if (REISERFS_I(dir)->new_packing_locality)
1879		th->displace_new_blocks = 1;
1880#endif
1881	retval =
1882	    reiserfs_insert_item(th, &path_to_key, &key, &ih, inode,
1883				 (char *)(&sd));
1884	if (retval) {
1885		err = retval;
1886		reiserfs_check_path(&path_to_key);
1887		goto out_bad_inode;
1888	}
1889#ifdef DISPLACE_NEW_PACKING_LOCALITIES
1890	if (!th->displace_new_blocks)
1891		REISERFS_I(dir)->new_packing_locality = 0;
1892#endif
1893	if (S_ISDIR(mode)) {
1894		/* insert item with "." and ".." */
1895		retval =
1896		    reiserfs_new_directory(th, inode, &ih, &path_to_key, dir);
1897	}
1898
1899	if (S_ISLNK(mode)) {
1900		/* insert body of symlink */
1901		if (!old_format_only(sb))
1902			i_size = ROUND_UP(i_size);
1903		retval =
1904		    reiserfs_new_symlink(th, inode, &ih, &path_to_key, symname,
1905					 i_size);
1906	}
1907	if (retval) {
1908		err = retval;
1909		reiserfs_check_path(&path_to_key);
1910		journal_end(th, th->t_super, th->t_blocks_allocated);
1911		goto out_inserted_sd;
1912	}
1913
1914	if (reiserfs_posixacl(inode->i_sb)) {
1915		retval = reiserfs_inherit_default_acl(dir, dentry, inode);
1916		if (retval) {
1917			err = retval;
1918			reiserfs_check_path(&path_to_key);
1919			journal_end(th, th->t_super, th->t_blocks_allocated);
1920			goto out_inserted_sd;
1921		}
1922	} else if (inode->i_sb->s_flags & MS_POSIXACL) {
1923		reiserfs_warning(inode->i_sb, "ACLs aren't enabled in the fs, "
1924				 "but vfs thinks they are!");
1925	} else if (is_reiserfs_priv_object(dir)) {
1926		reiserfs_mark_inode_private(inode);
1927	}
1928
1929	insert_inode_hash(inode);
1930	reiserfs_update_sd(th, inode);
1931	reiserfs_check_path(&path_to_key);
1932
1933	return 0;
1934
1935/* it looks like you can easily compress these two goto targets into
1936 * one.  Keeping it like this doesn't actually hurt anything, and they
1937 * are place holders for what the quota code actually needs.
1938 */
1939      out_bad_inode:
1940	/* Invalidate the object, nothing was inserted yet */
1941	INODE_PKEY(inode)->k_objectid = 0;
1942
1943	/* Quota change must be inside a transaction for journaling */
1944	DQUOT_FREE_INODE(inode);
1945
1946      out_end_trans:
1947	journal_end(th, th->t_super, th->t_blocks_allocated);
1948	/* Drop can be outside and it needs more credits so it's better to have it outside */
1949	DQUOT_DROP(inode);
1950	inode->i_flags |= S_NOQUOTA;
1951	make_bad_inode(inode);
1952
1953      out_inserted_sd:
1954	inode->i_nlink = 0;
1955	th->t_trans_id = 0;	/* so the caller can't use this handle later */
1956
1957	/* If we were inheriting an ACL, we need to release the lock so that
1958	 * iput doesn't deadlock in reiserfs_delete_xattrs. The locking
1959	 * code really needs to be reworked, but this will take care of it
1960	 * for now. -jeffm */
1961#ifdef CONFIG_REISERFS_FS_POSIX_ACL
1962	if (REISERFS_I(dir)->i_acl_default && !IS_ERR(REISERFS_I(dir)->i_acl_default)) {
1963		reiserfs_write_unlock_xattrs(dir->i_sb);
1964		iput(inode);
1965		reiserfs_write_lock_xattrs(dir->i_sb);
1966	} else
1967#endif
1968		iput(inode);
1969	return err;
1970}
1971
1972/*
1973** finds the tail page in the page cache,
1974** reads the last block in.
1975**
1976** On success, page_result is set to a locked, pinned page, and bh_result
1977** is set to an up to date buffer for the last block in the file.  returns 0.
1978**
1979** tail conversion is not done, so bh_result might not be valid for writing
1980** check buffer_mapped(bh_result) and bh_result->b_blocknr != 0 before
1981** trying to write the block.
1982**
1983** on failure, nonzero is returned, page_result and bh_result are untouched.
1984*/
static int grab_tail_page(struct inode *p_s_inode,
			  struct page **page_result,
			  struct buffer_head **bh_result)
{

	/* we want the page with the last byte in the file,
	 ** not the page that will hold the next byte for appending
	 */
	unsigned long index = (p_s_inode->i_size - 1) >> PAGE_CACHE_SHIFT;
	unsigned long pos = 0;
	unsigned long start = 0;
	unsigned long blocksize = p_s_inode->i_sb->s_blocksize;
	unsigned long offset = (p_s_inode->i_size) & (PAGE_CACHE_SIZE - 1);
	struct buffer_head *bh;
	struct buffer_head *head;
	struct page *page;
	int error;

	/* we know that we are only called with inode->i_size > 0.
	 ** we also know that a file tail can never be as big as a block
	 ** If i_size % blocksize == 0, our file is currently block aligned
	 ** and it won't need converting or zeroing after a truncate.
	 */
	if ((offset & (blocksize - 1)) == 0) {
		return -ENOENT;
	}
	page = grab_cache_page(p_s_inode->i_mapping, index);
	error = -ENOMEM;
	if (!page) {
		goto out;
	}
	/* start within the page of the last block in the file */
	start = (offset / blocksize) * blocksize;

	/* attach buffers to the page and read the last block in; no tail
	 ** conversion is done here (see the header comment above) */
	error = block_prepare_write(page, start, offset,
				    reiserfs_get_block_create_0);
	if (error)
		goto unlock;

	/* walk the page's buffer ring until we reach the buffer covering
	 ** byte offset 'start', i.e. the last block in the file */
	head = page_buffers(page);
	bh = head;
	do {
		if (pos >= start) {
			break;
		}
		bh = bh->b_this_page;
		pos += blocksize;
	} while (bh != head);

	if (!buffer_uptodate(bh)) {
		/* note, this should never happen, prepare_write should
		 ** be taking care of this for us.  If the buffer isn't up to date,
		 ** I've screwed up the code to find the buffer, or the code to
		 ** call prepare_write
		 */
		reiserfs_warning(p_s_inode->i_sb,
				 "clm-6000: error reading block %lu on dev %s",
				 bh->b_blocknr,
				 reiserfs_bdevname(p_s_inode->i_sb));
		error = -EIO;
		goto unlock;
	}
	/* success: return the locked, pinned page and its tail buffer;
	 ** 'error' is 0 here (cleared by block_prepare_write) */
	*bh_result = bh;
	*page_result = page;

      out:
	return error;

      unlock:
	unlock_page(page);
	page_cache_release(page);
	return error;
}
2058
2059/*
2060** vfs version of truncate file.  Must NOT be called with
2061** a transaction already started.
2062**
2063** some code taken from block_truncate_page
2064*/
int reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps)
{
	struct reiserfs_transaction_handle th;
	/* we want the offset for the first byte after the end of the file */
	unsigned long offset = p_s_inode->i_size & (PAGE_CACHE_SIZE - 1);
	unsigned blocksize = p_s_inode->i_sb->s_blocksize;
	unsigned length;
	struct page *page = NULL;
	int error;
	struct buffer_head *bh = NULL;
	int err2;

	reiserfs_write_lock(p_s_inode->i_sb);

	if (p_s_inode->i_size > 0) {
		/* pin the page holding the last byte so we can zero the
		 ** partial block after the truncate below */
		if ((error = grab_tail_page(p_s_inode, &page, &bh))) {
			// -ENOENT means we truncated past the end of the file,
			// and get_block_create_0 could not find a block to read in,
			// which is ok.
			if (error != -ENOENT)
				reiserfs_warning(p_s_inode->i_sb,
						 "clm-6001: grab_tail_page failed %d",
						 error);
			page = NULL;
			bh = NULL;
		}
	}

	/* so, if page != NULL, we have a buffer head for the offset at
	 ** the end of the file. if the bh is mapped, and bh->b_blocknr != 0,
	 ** then we have an unformatted node.  Otherwise, we have a direct item,
	 ** and no zeroing is required on disk.  We zero after the truncate,
	 ** because the truncate might pack the item anyway
	 ** (it will unmap bh if it packs).
	 */
	/* it is enough to reserve space in transaction for 2 balancings:
	   one for "save" link adding and another for the first
	   cut_from_item. 1 is for update_sd */
	error = journal_begin(&th, p_s_inode->i_sb,
			      JOURNAL_PER_BALANCE_CNT * 2 + 1);
	if (error)
		goto out;
	reiserfs_update_inode_transaction(p_s_inode);
	if (update_timestamps)
		/* we are doing real truncate: if the system crashes before the last
		   transaction of truncating gets committed - on reboot the file
		   either appears truncated properly or not truncated at all */
		add_save_link(&th, p_s_inode, 1);
	err2 = reiserfs_do_truncate(&th, p_s_inode, page, update_timestamps);
	error =
	    journal_end(&th, p_s_inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1);
	if (error)
		goto out;

	/* check reiserfs_do_truncate after ending the transaction */
	if (err2) {
		error = err2;
  		goto out;
	}

	if (update_timestamps) {
		/* truncate is fully committed, the save link is no longer needed */
		error = remove_save_link(p_s_inode, 1 /* truncate */ );
		if (error)
			goto out;
	}

	if (page) {
		length = offset & (blocksize - 1);
		/* if we are not on a block boundary */
		if (length) {
			/* zero from the new EOF to the end of its block so stale
			 ** data can't leak back in if the file grows again */
			length = blocksize - length;
			zero_user_page(page, offset, length, KM_USER0);
			if (buffer_mapped(bh) && bh->b_blocknr != 0) {
				mark_buffer_dirty(bh);
			}
		}
		unlock_page(page);
		page_cache_release(page);
	}

	reiserfs_write_unlock(p_s_inode->i_sb);
	return 0;
      out:
	if (page) {
		unlock_page(page);
		page_cache_release(page);
	}
	reiserfs_write_unlock(p_s_inode->i_sb);
	return error;
}
2155
/* map the buffer for logical block @block of @inode for use by writepage.
** If the block lives in an indirect item (unformatted node), the buffer is
** mapped to that disk block.  If it lives in a direct item (file tail), the
** page's data is copied back into the item and logged, and the buffer is
** left unmapped (b_blocknr 0).  Holes are filled via reiserfs_get_block.
*/
static int map_block_for_writepage(struct inode *inode,
				   struct buffer_head *bh_result,
				   unsigned long block)
{
	struct reiserfs_transaction_handle th;
	int fs_gen;
	struct item_head tmp_ih;
	struct item_head *ih;
	struct buffer_head *bh;
	__le32 *item;
	struct cpu_key key;
	INITIALIZE_PATH(path);
	int pos_in_item;
	int jbegin_count = JOURNAL_PER_BALANCE_CNT;
	loff_t byte_offset = ((loff_t)block << inode->i_sb->s_blocksize_bits)+1;
	int retval;
	int use_get_block = 0;
	int bytes_copied = 0;
	int copy_size;
	int trans_running = 0;

	/* catch places below that try to log something without starting a trans */
	th.t_trans_id = 0;

	if (!buffer_uptodate(bh_result)) {
		return -EIO;
	}

	/* keep the page data addressable for the direct-item copy below */
	kmap(bh_result->b_page);
      start_over:
	reiserfs_write_lock(inode->i_sb);
	make_cpu_key(&key, inode, byte_offset, TYPE_ANY, 3);

      research:
	retval = search_for_position_by_key(inode->i_sb, &key, &path);
	if (retval != POSITION_FOUND) {
		/* not in the tree: it's a hole, let reiserfs_get_block fill it */
		use_get_block = 1;
		goto out;
	}

	bh = get_last_bh(&path);
	ih = get_ih(&path);
	item = get_item(&path);
	pos_in_item = path.pos_in_item;

	/* we've found an unformatted node */
	if (indirect_item_found(retval, ih)) {
		if (bytes_copied > 0) {
			reiserfs_warning(inode->i_sb,
					 "clm-6002: bytes_copied %d",
					 bytes_copied);
		}
		if (!get_block_num(item, pos_in_item)) {
			/* crap, we are writing to a hole */
			use_get_block = 1;
			goto out;
		}
		set_block_dev_mapped(bh_result,
				     get_block_num(item, pos_in_item), inode);
	} else if (is_direct_le_ih(ih)) {
		/* tail: copy the page data back into the direct item and log it */
		char *p;
		p = page_address(bh_result->b_page);
		p += (byte_offset - 1) & (PAGE_CACHE_SIZE - 1);
		copy_size = ih_item_len(ih) - pos_in_item;

		/* remember the tree generation so we can detect concurrent
		 * item movement after blocking in journal_begin */
		fs_gen = get_generation(inode->i_sb);
		copy_item_head(&tmp_ih, ih);

		if (!trans_running) {
			/* vs-3050 is gone, no need to drop the path */
			retval = journal_begin(&th, inode->i_sb, jbegin_count);
			if (retval)
				goto out;
			reiserfs_update_inode_transaction(inode);
			trans_running = 1;
			if (fs_changed(fs_gen, inode->i_sb)
			    && item_moved(&tmp_ih, &path)) {
				reiserfs_restore_prepared_buffer(inode->i_sb,
								 bh);
				goto research;
			}
		}

		reiserfs_prepare_for_journal(inode->i_sb, bh, 1);

		/* prepare_for_journal may have blocked; recheck the item */
		if (fs_changed(fs_gen, inode->i_sb)
		    && item_moved(&tmp_ih, &path)) {
			reiserfs_restore_prepared_buffer(inode->i_sb, bh);
			goto research;
		}

		memcpy(B_I_PITEM(bh, ih) + pos_in_item, p + bytes_copied,
		       copy_size);

		journal_mark_dirty(&th, inode->i_sb, bh);
		bytes_copied += copy_size;
		set_block_dev_mapped(bh_result, 0, inode);

		/* are there still bytes left? */
		if (bytes_copied < bh_result->b_size &&
		    (byte_offset + bytes_copied) < inode->i_size) {
			/* the tail may span several direct items; advance the
			 * key and continue copying from the next one */
			set_cpu_key_k_offset(&key,
					     cpu_key_k_offset(&key) +
					     copy_size);
			goto research;
		}
	} else {
		reiserfs_warning(inode->i_sb,
				 "clm-6003: bad item inode %lu, device %s",
				 inode->i_ino, reiserfs_bdevname(inode->i_sb));
		retval = -EIO;
		goto out;
	}
	retval = 0;

      out:
	pathrelse(&path);
	if (trans_running) {
		int err = journal_end(&th, inode->i_sb, jbegin_count);
		if (err)
			retval = err;
		trans_running = 0;
	}
	reiserfs_write_unlock(inode->i_sb);

	/* this is where we fill in holes in the file. */
	if (use_get_block) {
		retval = reiserfs_get_block(inode, block, bh_result,
					    GET_BLOCK_CREATE | GET_BLOCK_NO_IMUX
					    | GET_BLOCK_NO_DANGLE);
		if (!retval) {
			if (!buffer_mapped(bh_result)
			    || bh_result->b_blocknr == 0) {
				/* get_block failed to find a mapped unformatted node. */
				use_get_block = 0;
				goto start_over;
			}
		}
	}
	kunmap(bh_result->b_page);

	if (!retval && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
		/* we've copied data from the page into the direct item, so the
		 * buffer in the page is now clean, mark it to reflect that.
		 */
		lock_buffer(bh_result);
		clear_buffer_dirty(bh_result);
		unlock_buffer(bh_result);
	}
	return retval;
}
2307
2308/*
2309 * mason@suse.com: updated in 2.5.54 to follow the same general io
2310 * start/recovery path as __block_write_full_page, along with special
2311 * code to handle reiserfs tails.
2312 */
static int reiserfs_write_full_page(struct page *page,
				    struct writeback_control *wbc)
{
	struct inode *inode = page->mapping->host;
	unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
	int error = 0;
	unsigned long block;
	sector_t last_block;
	struct buffer_head *head, *bh;
	int partial = 0;
	int nr = 0;
	/* 'checked' is set by reiserfs_set_page_dirty when file data is
	 * being journaled; such pages must be logged, not just written */
	int checked = PageChecked(page);
	struct reiserfs_transaction_handle th;
	struct super_block *s = inode->i_sb;
	int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
	th.t_trans_id = 0;

	/* no logging allowed when nonblocking or from PF_MEMALLOC */
	if (checked && (current->flags & PF_MEMALLOC)) {
		redirty_page_for_writepage(wbc, page);
		unlock_page(page);
		return 0;
	}

	/* The page dirty bit is cleared before writepage is called, which
	 * means we have to tell create_empty_buffers to make dirty buffers
	 * The page really should be up to date at this point, so tossing
	 * in the BH_Uptodate is just a sanity check.
	 */
	if (!page_has_buffers(page)) {
		create_empty_buffers(page, s->s_blocksize,
				     (1 << BH_Dirty) | (1 << BH_Uptodate));
	}
	head = page_buffers(page);

	/* last page in the file, zero out any contents past the
	 ** last byte in the file
	 */
	if (page->index >= end_index) {
		unsigned last_offset;

		last_offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
		/* no file contents in this page */
		if (page->index >= end_index + 1 || !last_offset) {
			unlock_page(page);
			return 0;
		}
		zero_user_page(page, last_offset, PAGE_CACHE_SIZE - last_offset, KM_USER0);
	}
	bh = head;
	block = page->index << (PAGE_CACHE_SHIFT - s->s_blocksize_bits);
	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
	/* first map all the buffers, logging any direct items we find */
	do {
		if (block > last_block) {
			/*
			 * This can happen when the block size is less than
			 * the page size.  The corresponding bytes in the page
			 * were zero filled above
			 */
			clear_buffer_dirty(bh);
			set_buffer_uptodate(bh);
		} else if ((checked || buffer_dirty(bh)) &&
		           (!buffer_mapped(bh) || (buffer_mapped(bh)
						       && bh->b_blocknr ==
						       0))) {
			/* not mapped yet, or it points to a direct item, search
			 * the btree for the mapping info, and log any direct
			 * items found
			 */
			if ((error = map_block_for_writepage(inode, bh, block))) {
				goto fail;
			}
		}
		bh = bh->b_this_page;
		block++;
	} while (bh != head);

	/*
	 * we start the transaction after map_block_for_writepage,
	 * because it can create holes in the file (an unbounded operation).
	 * starting it here, we can make a reliable estimate for how many
	 * blocks we're going to log
	 */
	if (checked) {
		ClearPageChecked(page);
		reiserfs_write_lock(s);
		error = journal_begin(&th, s, bh_per_page + 1);
		if (error) {
			reiserfs_write_unlock(s);
			goto fail;
		}
		reiserfs_update_inode_transaction(inode);
	}
	/* now go through and lock any dirty buffers on the page */
	do {
		/* every buffer gets a reference here; the matching put_bh is
		 * in the submit loop below */
		get_bh(bh);
		if (!buffer_mapped(bh))
			continue;
		if (buffer_mapped(bh) && bh->b_blocknr == 0)
			continue;

		if (checked) {
			/* data logging: the buffer goes through the journal
			 * instead of being submitted directly */
			reiserfs_prepare_for_journal(s, bh, 1);
			journal_mark_dirty(&th, s, bh);
			continue;
		}
		/* from this point on, we know the buffer is mapped to a
		 * real block and not a direct item
		 */
		if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
			lock_buffer(bh);
		} else {
			/* nonblocking writeback: skip buffers we can't lock
			 * immediately and redirty the page for a later pass */
			if (test_set_buffer_locked(bh)) {
				redirty_page_for_writepage(wbc, page);
				continue;
			}
		}
		if (test_clear_buffer_dirty(bh)) {
			mark_buffer_async_write(bh);
		} else {
			unlock_buffer(bh);
		}
	} while ((bh = bh->b_this_page) != head);

	if (checked) {
		error = journal_end(&th, s, bh_per_page + 1);
		reiserfs_write_unlock(s);
		if (error)
			goto fail;
	}
	BUG_ON(PageWriteback(page));
	set_page_writeback(page);
	unlock_page(page);

	/*
	 * since any buffer might be the only dirty buffer on the page,
	 * the first submit_bh can bring the page out of writeback.
	 * be careful with the buffers.
	 */
	do {
		struct buffer_head *next = bh->b_this_page;
		if (buffer_async_write(bh)) {
			submit_bh(WRITE, bh);
			nr++;
		}
		put_bh(bh);
		bh = next;
	} while (bh != head);

	error = 0;
      done:
	if (nr == 0) {
		/*
		 * if this page only had a direct item, it is very possible for
		 * no io to be required without there being an error.  Or,
		 * someone else could have locked them and sent them down the
		 * pipe without locking the page
		 */
		bh = head;
		do {
			if (!buffer_uptodate(bh)) {
				partial = 1;
				break;
			}
			bh = bh->b_this_page;
		} while (bh != head);
		if (!partial)
			SetPageUptodate(page);
		end_page_writeback(page);
	}
	return error;

      fail:
	/* catches various errors, we need to make sure any valid dirty blocks
	 * get to the media.  The page is currently locked and not marked for
	 * writeback
	 */
	ClearPageUptodate(page);
	bh = head;
	do {
		get_bh(bh);
		if (buffer_mapped(bh) && buffer_dirty(bh) && bh->b_blocknr) {
			lock_buffer(bh);
			mark_buffer_async_write(bh);
		} else {
			/*
			 * clear any dirty bits that might have come from getting
			 * attached to a dirty page
			 */
			clear_buffer_dirty(bh);
		}
		bh = bh->b_this_page;
	} while (bh != head);
	SetPageError(page);
	BUG_ON(PageWriteback(page));
	set_page_writeback(page);
	unlock_page(page);
	do {
		struct buffer_head *next = bh->b_this_page;
		if (buffer_async_write(bh)) {
			clear_buffer_dirty(bh);
			submit_bh(WRITE, bh);
			nr++;
		}
		put_bh(bh);
		bh = next;
	} while (bh != head);
	goto done;
}
2523
2524static int reiserfs_readpage(struct file *f, struct page *page)
2525{
2526	return block_read_full_page(page, reiserfs_get_block);
2527}
2528
2529static int reiserfs_writepage(struct page *page, struct writeback_control *wbc)
2530{
2531	struct inode *inode = page->mapping->host;
2532	reiserfs_wait_on_write_block(inode->i_sb);
2533	return reiserfs_write_full_page(page, wbc);
2534}
2535
/* address_space prepare_write: set up buffers for a write of [from, to)
** within @page.  Takes care of nesting correctly into any transaction
** already running in this task, and cleans up a persistent transaction
** that reiserfs_get_block may leave running on error. */
static int reiserfs_prepare_write(struct file *f, struct page *page,
				  unsigned from, unsigned to)
{
	struct inode *inode = page->mapping->host;
	int ret;
	int old_ref = 0;

	reiserfs_wait_on_write_block(inode->i_sb);
	fix_tail_page_for_writing(page);
	if (reiserfs_transaction_running(inode->i_sb)) {
		/* bump the refcount so block_prepare_write can't close the
		 * caller's transaction out from under it */
		struct reiserfs_transaction_handle *th;
		th = (struct reiserfs_transaction_handle *)current->
		    journal_info;
		BUG_ON(!th->t_refcount);
		BUG_ON(!th->t_trans_id);
		old_ref = th->t_refcount;
		th->t_refcount++;
	}

	ret = block_prepare_write(page, from, to, reiserfs_get_block);
	if (ret && reiserfs_transaction_running(inode->i_sb)) {
		struct reiserfs_transaction_handle *th = current->journal_info;
		/* this gets a little ugly.  If reiserfs_get_block returned an
		 * error and left a transacstion running, we've got to close it,
		 * and we've got to free handle if it was a persistent transaction.
		 *
		 * But, if we had nested into an existing transaction, we need
		 * to just drop the ref count on the handle.
		 *
		 * If old_ref == 0, the transaction is from reiserfs_get_block,
		 * and it was a persistent trans.  Otherwise, it was nested above.
		 */
		if (th->t_refcount > old_ref) {
			if (old_ref)
				th->t_refcount--;
			else {
				int err;
				reiserfs_write_lock(inode->i_sb);
				err = reiserfs_end_persistent_transaction(th);
				reiserfs_write_unlock(inode->i_sb);
				if (err)
					ret = err;
			}
		}
	}
	return ret;

}
2584
2585static sector_t reiserfs_aop_bmap(struct address_space *as, sector_t block)
2586{
2587	return generic_block_bmap(as, block, reiserfs_bmap);
2588}
2589
/* address_space commit_write: commit the bytes [from, to) of @page, and
** keep i_size / stat-data updates inside a transaction so they are
** journaled together with the data that grew the file. */
static int reiserfs_commit_write(struct file *f, struct page *page,
				 unsigned from, unsigned to)
{
	struct inode *inode = page->mapping->host;
	loff_t pos = ((loff_t) page->index << PAGE_CACHE_SHIFT) + to;
	int ret = 0;
	int update_sd = 0;
	struct reiserfs_transaction_handle *th = NULL;

	reiserfs_wait_on_write_block(inode->i_sb);
	if (reiserfs_transaction_running(inode->i_sb)) {
		/* persistent transaction left running by prepare_write /
		 * reiserfs_get_block; we must end it below */
		th = current->journal_info;
	}
	reiserfs_commit_page(inode, page, from, to);

	/* generic_commit_write does this for us, but does not update the
	 ** transaction tracking stuff when the size changes.  So, we have
	 ** to do the i_size updates here.
	 */
	if (pos > inode->i_size) {
		struct reiserfs_transaction_handle myth;
		reiserfs_write_lock(inode->i_sb);
		/* If the file have grown beyond the border where it
		   can have a tail, unmark it as needing a tail
		   packing */
		if ((have_large_tails(inode->i_sb)
		     && inode->i_size > i_block_size(inode) * 4)
		    || (have_small_tails(inode->i_sb)
			&& inode->i_size > i_block_size(inode)))
			REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;

		ret = journal_begin(&myth, inode->i_sb, 1);
		if (ret) {
			reiserfs_write_unlock(inode->i_sb);
			goto journal_error;
		}
		reiserfs_update_inode_transaction(inode);
		inode->i_size = pos;
		/*
		 * this will just nest into our transaction.  It's important
		 * to use mark_inode_dirty so the inode gets pushed around on the
		 * dirty lists, and so that O_SYNC works as expected
		 */
		mark_inode_dirty(inode);
		reiserfs_update_sd(&myth, inode);
		update_sd = 1;
		ret = journal_end(&myth, inode->i_sb, 1);
		reiserfs_write_unlock(inode->i_sb);
		if (ret)
			goto journal_error;
	}
	if (th) {
		reiserfs_write_lock(inode->i_sb);
		/* avoid a second stat-data update if the size path above
		 * already did one */
		if (!update_sd)
			mark_inode_dirty(inode);
		ret = reiserfs_end_persistent_transaction(th);
		reiserfs_write_unlock(inode->i_sb);
		if (ret)
			goto out;
	}

      out:
	return ret;

      journal_error:
	/* still need to close the persistent transaction on failure */
	if (th) {
		reiserfs_write_lock(inode->i_sb);
		if (!update_sd)
			reiserfs_update_sd(th, inode);
		ret = reiserfs_end_persistent_transaction(th);
		reiserfs_write_unlock(inode->i_sb);
	}

	return ret;
}
2665
2666void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode)
2667{
2668	if (reiserfs_attrs(inode->i_sb)) {
2669		if (sd_attrs & REISERFS_SYNC_FL)
2670			inode->i_flags |= S_SYNC;
2671		else
2672			inode->i_flags &= ~S_SYNC;
2673		if (sd_attrs & REISERFS_IMMUTABLE_FL)
2674			inode->i_flags |= S_IMMUTABLE;
2675		else
2676			inode->i_flags &= ~S_IMMUTABLE;
2677		if (sd_attrs & REISERFS_APPEND_FL)
2678			inode->i_flags |= S_APPEND;
2679		else
2680			inode->i_flags &= ~S_APPEND;
2681		if (sd_attrs & REISERFS_NOATIME_FL)
2682			inode->i_flags |= S_NOATIME;
2683		else
2684			inode->i_flags &= ~S_NOATIME;
2685		if (sd_attrs & REISERFS_NOTAIL_FL)
2686			REISERFS_I(inode)->i_flags |= i_nopack_mask;
2687		else
2688			REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
2689	}
2690}
2691
2692void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs)
2693{
2694	if (reiserfs_attrs(inode->i_sb)) {
2695		if (inode->i_flags & S_IMMUTABLE)
2696			*sd_attrs |= REISERFS_IMMUTABLE_FL;
2697		else
2698			*sd_attrs &= ~REISERFS_IMMUTABLE_FL;
2699		if (inode->i_flags & S_SYNC)
2700			*sd_attrs |= REISERFS_SYNC_FL;
2701		else
2702			*sd_attrs &= ~REISERFS_SYNC_FL;
2703		if (inode->i_flags & S_NOATIME)
2704			*sd_attrs |= REISERFS_NOATIME_FL;
2705		else
2706			*sd_attrs &= ~REISERFS_NOATIME_FL;
2707		if (REISERFS_I(inode)->i_flags & i_nopack_mask)
2708			*sd_attrs |= REISERFS_NOTAIL_FL;
2709		else
2710			*sd_attrs &= ~REISERFS_NOTAIL_FL;
2711	}
2712}
2713
/* decide if this buffer needs to stay around for data logging or ordered
** write purposes.  Returns 1 if the buffer may be dropped (and frees its
** journal head), 0 if it must be kept pinned.
*/
static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
{
	int ret = 1;
	struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);

	/* lock order: buffer lock, then the journal's dirty-buffers lock */
	lock_buffer(bh);
	spin_lock(&j->j_dirty_buffers_lock);
	if (!buffer_mapped(bh)) {
		goto free_jh;
	}
	/* the page is locked, and the only places that log a data buffer
	 * also lock the page.
	 */
	if (reiserfs_file_data_log(inode)) {
		/*
		 * very conservative, leave the buffer pinned if
		 * anyone might need it.
		 */
		if (buffer_journaled(bh) || buffer_journal_dirty(bh)) {
			ret = 0;
		}
	} else  if (buffer_dirty(bh)) {
		struct reiserfs_journal_list *jl;
		struct reiserfs_jh *jh = bh->b_private;

		/* why is this safe?
		 * reiserfs_setattr updates i_size in the on disk
		 * stat data before allowing vmtruncate to be called.
		 *
		 * If buffer was put onto the ordered list for this
		 * transaction, we know for sure either this transaction
		 * or an older one already has updated i_size on disk,
		 * and this ordered data won't be referenced in the file
		 * if we crash.
		 *
		 * if the buffer was put onto the ordered list for an older
		 * transaction, we need to leave it around
		 */
		if (jh && (jl = jh->jl)
		    && jl != SB_JOURNAL(inode->i_sb)->j_current_jl)
			ret = 0;
	}
      free_jh:
	/* droppable buffers give up their journal head here */
	if (ret && bh->b_private) {
		reiserfs_free_jh(bh);
	}
	spin_unlock(&j->j_dirty_buffers_lock);
	unlock_buffer(bh);
	return ret;
}
2767
/* clm -- taken from fs/buffer.c:block_invalidate_page */
/* address_space invalidatepage: drop buffers for the part of @page at or
** past byte @offset.  Buffers pinned for data logging or ordered writes
** are left alone (see invalidatepage_can_drop). */
static void reiserfs_invalidatepage(struct page *page, unsigned long offset)
{
	struct buffer_head *head, *bh, *next;
	struct inode *inode = page->mapping->host;
	unsigned int curr_off = 0;
	/* ret stays 1 only if every invalidated buffer could be dropped */
	int ret = 1;

	BUG_ON(!PageLocked(page));

	/* full-page invalidation cancels any pending data-log tagging */
	if (offset == 0)
		ClearPageChecked(page);

	if (!page_has_buffers(page))
		goto out;

	head = page_buffers(page);
	bh = head;
	do {
		unsigned int next_off = curr_off + bh->b_size;
		next = bh->b_this_page;

		/*
		 * is this block fully invalidated?
		 */
		if (offset <= curr_off) {
			if (invalidatepage_can_drop(inode, bh))
				reiserfs_unmap_buffer(bh);
			else
				ret = 0;
		}
		curr_off = next_off;
		bh = next;
	} while (bh != head);

	/*
	 * We release buffers only if the entire page is being invalidated.
	 * The get_block cached value has been unconditionally invalidated,
	 * so real IO is not possible anymore.
	 */
	if (!offset && ret) {
		ret = try_to_release_page(page, 0);
		/* maybe should BUG_ON(!ret); - neilb */
	}
      out:
	return;
}
2815
2816static int reiserfs_set_page_dirty(struct page *page)
2817{
2818	struct inode *inode = page->mapping->host;
2819	if (reiserfs_file_data_log(inode)) {
2820		SetPageChecked(page);
2821		return __set_page_dirty_nobuffers(page);
2822	}
2823	return __set_page_dirty_buffers(page);
2824}
2825
2826/*
2827 * Returns 1 if the page's buffers were dropped.  The page is locked.
2828 *
2829 * Takes j_dirty_buffers_lock to protect the b_assoc_buffers list_heads
2830 * in the buffers at page_buffers(page).
2831 *
2832 * even in -o notail mode, we can't be sure an old mount without -o notail
2833 * didn't create files with tails.
2834 */
2835static int reiserfs_releasepage(struct page *page, gfp_t unused_gfp_flags)
2836{
2837	struct inode *inode = page->mapping->host;
2838	struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
2839	struct buffer_head *head;
2840	struct buffer_head *bh;
2841	int ret = 1;
2842
2843	WARN_ON(PageChecked(page));
2844	spin_lock(&j->j_dirty_buffers_lock);
2845	head = page_buffers(page);
2846	bh = head;
2847	do {
2848		if (bh->b_private) {
2849			if (!buffer_dirty(bh) && !buffer_locked(bh)) {
2850				reiserfs_free_jh(bh);
2851			} else {
2852				ret = 0;
2853				break;
2854			}
2855		}
2856		bh = bh->b_this_page;
2857	} while (bh != head);
2858	if (ret)
2859		ret = try_to_free_buffers(page);
2860	spin_unlock(&j->j_dirty_buffers_lock);
2861	return ret;
2862}
2863
2864/* We thank Mingming Cao for helping us understand in great detail what
2865   to do in this section of the code. */
2866static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,
2867				  const struct iovec *iov, loff_t offset,
2868				  unsigned long nr_segs)
2869{
2870	struct file *file = iocb->ki_filp;
2871	struct inode *inode = file->f_mapping->host;
2872
2873	return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
2874				  offset, nr_segs,
2875				  reiserfs_get_blocks_direct_io, NULL);
2876}
2877
/*
 * reiserfs ->setattr: apply attribute changes (size, uid/gid, mode,
 * times) to the inode behind @dentry.  The whole operation runs under
 * the per-superblock reiserfs write lock.  Quota transfers for chown are
 * wrapped in a journal transaction so that (journaled) quota updates and
 * the inode update land in the same transaction.
 * Returns 0 on success or a negative errno.
 */
int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error;
	/* snapshot ia_valid: attr->ia_valid is extended with
	 * ATTR_MTIME|ATTR_CTIME below in the expanding-truncate case, but
	 * the uid/gid-change tests later want the caller's original flags */
	unsigned int ia_valid = attr->ia_valid;
	reiserfs_write_lock(inode->i_sb);
	if (attr->ia_valid & ATTR_SIZE) {
		/* version 2 items will be caught by the s_maxbytes check
		 ** done for us in vmtruncate
		 */
		if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 &&
		    attr->ia_size > MAX_NON_LFS) {
			/* old 3.5 key format cannot address offsets past
			 * MAX_NON_LFS */
			error = -EFBIG;
			goto out;
		}
		/* fill in hole pointers in the expanding truncate case. */
		if (attr->ia_size > inode->i_size) {
			error = generic_cont_expand(inode, attr->ia_size);
			/* drop any blocks still preallocated to this inode;
			 * done even if generic_cont_expand failed, so the
			 * prealloc is not left dangling */
			if (REISERFS_I(inode)->i_prealloc_count > 0) {
				int err;
				struct reiserfs_transaction_handle th;
				/* we're changing at most 2 bitmaps, inode + super */
				err = journal_begin(&th, inode->i_sb, 4);
				if (!err) {
					reiserfs_discard_prealloc(&th, inode);
					err = journal_end(&th, inode->i_sb, 4);
				}
				if (err)
					error = err;
			}
			if (error)
				goto out;
			/*
			 * file size is changed, ctime and mtime are
			 * to be updated
			 */
			attr->ia_valid |= (ATTR_MTIME | ATTR_CTIME);
		}
	}

	if ((((attr->ia_valid & ATTR_UID) && (attr->ia_uid & ~0xffff)) ||
	     ((attr->ia_valid & ATTR_GID) && (attr->ia_gid & ~0xffff))) &&
	    (get_inode_sd_version(inode) == STAT_DATA_V1)) {
		/* stat data of format v3.5 has 16 bit uid and gid */
		error = -EINVAL;
		goto out;
	}

	error = inode_change_ok(inode, attr);
	if (!error) {
		/* ownership change: fix up xattr ownership first, then move
		 * quota charges and update the inode inside one transaction */
		if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
		    (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
			error = reiserfs_chown_xattrs(inode, attr);

			if (!error) {
				struct reiserfs_transaction_handle th;
				int jbegin_count =
				    2 *
				    (REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb) +
				     REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb)) +
				    2;

				/* (user+group)*(old+new) structure - we count quota info and , inode write (sb, inode) */
				error =
				    journal_begin(&th, inode->i_sb,
						  jbegin_count);
				if (error)
					goto out;
				error =
				    DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
				if (error) {
					journal_end(&th, inode->i_sb,
						    jbegin_count);
					goto out;
				}
				/* Update corresponding info in inode so that everything is in
				 * one transaction */
				if (attr->ia_valid & ATTR_UID)
					inode->i_uid = attr->ia_uid;
				if (attr->ia_valid & ATTR_GID)
					inode->i_gid = attr->ia_gid;
				mark_inode_dirty(inode);
				error =
				    journal_end(&th, inode->i_sb, jbegin_count);
			}
		}
		/* apply the remaining attribute changes (size, times, mode)
		 * through the generic helper */
		if (!error)
			error = inode_setattr(inode, attr);
	}

	/* mode change may require rewriting the ACL-derived permissions */
	if (!error && reiserfs_posixacl(inode->i_sb)) {
		if (attr->ia_valid & ATTR_MODE)
			error = reiserfs_acl_chmod(inode);
	}

      out:
	reiserfs_write_unlock(inode->i_sb);
	return error;
}
2977
/*
 * Address space operations for reiserfs.  Buffered writes go through the
 * prepare_write/commit_write pair defined in this file; direct I/O is
 * delegated to the generic blockdev implementation via reiserfs_direct_IO.
 */
const struct address_space_operations reiserfs_address_space_operations = {
	.writepage = reiserfs_writepage,
	.readpage = reiserfs_readpage,
	.readpages = reiserfs_readpages,
	.releasepage = reiserfs_releasepage,
	.invalidatepage = reiserfs_invalidatepage,
	.sync_page = block_sync_page,
	.prepare_write = reiserfs_prepare_write,
	.commit_write = reiserfs_commit_write,
	.bmap = reiserfs_aop_bmap,
	.direct_IO = reiserfs_direct_IO,
	.set_page_dirty = reiserfs_set_page_dirty,
};
2991