1/*
2 * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10/*
11 * Implements Extendible Hashing as described in:
12 *   "Extendible Hashing" by Fagin, et al in
13 *     __ACM Trans. on Database Systems__, Sept 1979.
14 *
15 *
16 * Here's the layout of dirents which is essentially the same as that of ext2
17 * within a single block. The field de_name_len is the number of bytes
18 * actually required for the name (no null terminator). The field de_rec_len
19 * is the number of bytes allocated to the dirent. The offset of the next
20 * dirent in the block is (dirent + dirent->de_rec_len). When a dirent is
21 * deleted, the preceding dirent inherits its allocated space, ie
22 * prev->de_rec_len += deleted->de_rec_len. Since the next dirent is obtained
23 * by adding de_rec_len to the current dirent, this essentially causes the
24 * deleted dirent to get jumped over when iterating through all the dirents.
25 *
26 * When deleting the first dirent in a block, there is no previous dirent so
27 * the field de_ino is set to zero to designate it as deleted. When allocating
28 * a dirent, gfs2_dirent_alloc iterates through the dirents in a block. If the
29 * first dirent has (de_ino == 0) and de_rec_len is large enough, this first
30 * dirent is allocated. Otherwise it must go through all the 'used' dirents
31 * searching for one in which the amount of total space minus the amount of
32 * used space will provide enough space for the new dirent.
33 *
34 * There are two types of blocks in which dirents reside. In a stuffed dinode,
35 * the dirents begin at offset sizeof(struct gfs2_dinode) from the beginning of
36 * the block.  In leaves, they begin at offset sizeof(struct gfs2_leaf) from the
37 * beginning of the leaf block. The dirents reside in leaves when
38 *
39 * dip->i_diskflags & GFS2_DIF_EXHASH is true
40 *
41 * Otherwise, the dirents are "linear", within a single stuffed dinode block.
42 *
43 * When the dirents are in leaves, the actual contents of the directory file are
44 * used as an array of 64-bit block pointers pointing to the leaf blocks. The
45 * dirents are NOT in the directory file itself. There can be more than one
46 * block pointer in the array that points to the same leaf. In fact, when a
47 * directory is first converted from linear to exhash, all of the pointers
48 * point to the same leaf.
49 *
50 * When a leaf is completely full, the size of the hash table can be
51 * doubled unless it is already at the maximum size which is hard coded into
52 * GFS2_DIR_MAX_DEPTH. After that, leaves are chained together in a linked list,
53 * but never before the maximum hash table size has been reached.
54 */
55
56#include <linux/slab.h>
57#include <linux/spinlock.h>
58#include <linux/buffer_head.h>
59#include <linux/sort.h>
60#include <linux/gfs2_ondisk.h>
61#include <linux/crc32.h>
62#include <linux/vmalloc.h>
63
64#include "gfs2.h"
65#include "incore.h"
66#include "dir.h"
67#include "glock.h"
68#include "inode.h"
69#include "meta_io.h"
70#include "quota.h"
71#include "rgrp.h"
72#include "trans.h"
73#include "bmap.h"
74#include "util.h"
75
76#define IS_LEAF     1 /* Hashed (leaf) directory */
77#define IS_DINODE   2 /* Linear (stuffed dinode block) directory */
78
79#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
80#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
81
82typedef int (*leaf_call_t) (struct gfs2_inode *dip, u32 index, u32 len,
83			    u64 leaf_no, void *data);
84typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent,
85			    const struct qstr *name, void *opaque);
86
87
88int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
89			    struct buffer_head **bhp)
90{
91	struct buffer_head *bh;
92
93	bh = gfs2_meta_new(ip->i_gl, block);
94	gfs2_trans_add_bh(ip->i_gl, bh, 1);
95	gfs2_metatype_set(bh, GFS2_METATYPE_JD, GFS2_FORMAT_JD);
96	gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
97	*bhp = bh;
98	return 0;
99}
100
101static int gfs2_dir_get_existing_buffer(struct gfs2_inode *ip, u64 block,
102					struct buffer_head **bhp)
103{
104	struct buffer_head *bh;
105	int error;
106
107	error = gfs2_meta_read(ip->i_gl, block, DIO_WAIT, &bh);
108	if (error)
109		return error;
110	if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_JD)) {
111		brelse(bh);
112		return -EIO;
113	}
114	*bhp = bh;
115	return 0;
116}
117
118static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
119				  unsigned int offset, unsigned int size)
120{
121	struct buffer_head *dibh;
122	int error;
123
124	error = gfs2_meta_inode_buffer(ip, &dibh);
125	if (error)
126		return error;
127
128	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
129	memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
130	if (ip->i_disksize < offset + size)
131		ip->i_disksize = offset + size;
132	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
133	gfs2_dinode_out(ip, dibh->b_data);
134
135	brelse(dibh);
136
137	return size;
138}
139
140
141
142/**
143 * gfs2_dir_write_data - Write directory information to the inode
144 * @ip: The GFS2 inode
145 * @buf: The buffer containing information to be written
146 * @offset: The file offset to start writing at
147 * @size: The amount of data to write
148 *
149 * Returns: The number of bytes correctly written or error code
150 */
151static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf,
152			       u64 offset, unsigned int size)
153{
154	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
155	struct buffer_head *dibh;
156	u64 lblock, dblock;
157	u32 extlen = 0;
158	unsigned int o;
159	int copied = 0;
160	int error = 0;
161	int new = 0;
162
163	if (!size)
164		return 0;
165
166	if (gfs2_is_stuffed(ip) &&
167	    offset + size <= sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
168		return gfs2_dir_write_stuffed(ip, buf, (unsigned int)offset,
169					      size);
170
171	if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip)))
172		return -EINVAL;
173
174	if (gfs2_is_stuffed(ip)) {
175		error = gfs2_unstuff_dinode(ip, NULL);
176		if (error)
177			return error;
178	}
179
180	lblock = offset;
181	o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header);
182
183	while (copied < size) {
184		unsigned int amount;
185		struct buffer_head *bh;
186
187		amount = size - copied;
188		if (amount > sdp->sd_sb.sb_bsize - o)
189			amount = sdp->sd_sb.sb_bsize - o;
190
191		if (!extlen) {
192			new = 1;
193			error = gfs2_extent_map(&ip->i_inode, lblock, &new,
194						&dblock, &extlen);
195			if (error)
196				goto fail;
197			error = -EIO;
198			if (gfs2_assert_withdraw(sdp, dblock))
199				goto fail;
200		}
201
202		if (amount == sdp->sd_jbsize || new)
203			error = gfs2_dir_get_new_buffer(ip, dblock, &bh);
204		else
205			error = gfs2_dir_get_existing_buffer(ip, dblock, &bh);
206
207		if (error)
208			goto fail;
209
210		gfs2_trans_add_bh(ip->i_gl, bh, 1);
211		memcpy(bh->b_data + o, buf, amount);
212		brelse(bh);
213
214		buf += amount;
215		copied += amount;
216		lblock++;
217		dblock++;
218		extlen--;
219
220		o = sizeof(struct gfs2_meta_header);
221	}
222
223out:
224	error = gfs2_meta_inode_buffer(ip, &dibh);
225	if (error)
226		return error;
227
228	if (ip->i_disksize < offset + copied)
229		ip->i_disksize = offset + copied;
230	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
231
232	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
233	gfs2_dinode_out(ip, dibh->b_data);
234	brelse(dibh);
235
236	return copied;
237fail:
238	if (copied)
239		goto out;
240	return error;
241}
242
243static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, char *buf,
244				 u64 offset, unsigned int size)
245{
246	struct buffer_head *dibh;
247	int error;
248
249	error = gfs2_meta_inode_buffer(ip, &dibh);
250	if (!error) {
251		offset += sizeof(struct gfs2_dinode);
252		memcpy(buf, dibh->b_data + offset, size);
253		brelse(dibh);
254	}
255
256	return (error) ? error : size;
257}
258
259
260/**
261 * gfs2_dir_read_data - Read a data from a directory inode
262 * @ip: The GFS2 Inode
263 * @buf: The buffer to place result into
264 * @offset: File offset to begin jdata_readng from
265 * @size: Amount of data to transfer
266 *
267 * Returns: The amount of data actually copied or the error
268 */
269static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
270			      unsigned int size, unsigned ra)
271{
272	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
273	u64 lblock, dblock;
274	u32 extlen = 0;
275	unsigned int o;
276	int copied = 0;
277	int error = 0;
278
279	if (offset >= ip->i_disksize)
280		return 0;
281
282	if (offset + size > ip->i_disksize)
283		size = ip->i_disksize - offset;
284
285	if (!size)
286		return 0;
287
288	if (gfs2_is_stuffed(ip))
289		return gfs2_dir_read_stuffed(ip, buf, offset, size);
290
291	if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip)))
292		return -EINVAL;
293
294	lblock = offset;
295	o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header);
296
297	while (copied < size) {
298		unsigned int amount;
299		struct buffer_head *bh;
300		int new;
301
302		amount = size - copied;
303		if (amount > sdp->sd_sb.sb_bsize - o)
304			amount = sdp->sd_sb.sb_bsize - o;
305
306		if (!extlen) {
307			new = 0;
308			error = gfs2_extent_map(&ip->i_inode, lblock, &new,
309						&dblock, &extlen);
310			if (error || !dblock)
311				goto fail;
312			BUG_ON(extlen < 1);
313			if (!ra)
314				extlen = 1;
315			bh = gfs2_meta_ra(ip->i_gl, dblock, extlen);
316		} else {
317			error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, &bh);
318			if (error)
319				goto fail;
320		}
321		error = gfs2_metatype_check(sdp, bh, GFS2_METATYPE_JD);
322		if (error) {
323			brelse(bh);
324			goto fail;
325		}
326		dblock++;
327		extlen--;
328		memcpy(buf, bh->b_data + o, amount);
329		brelse(bh);
330		buf += amount;
331		copied += amount;
332		lblock++;
333		o = sizeof(struct gfs2_meta_header);
334	}
335
336	return copied;
337fail:
338	return (copied) ? copied : error;
339}
340
341static inline int gfs2_dirent_sentinel(const struct gfs2_dirent *dent)
342{
343	return dent->de_inum.no_addr == 0 || dent->de_inum.no_formal_ino == 0;
344}
345
346static inline int __gfs2_dirent_find(const struct gfs2_dirent *dent,
347				     const struct qstr *name, int ret)
348{
349	if (!gfs2_dirent_sentinel(dent) &&
350	    be32_to_cpu(dent->de_hash) == name->hash &&
351	    be16_to_cpu(dent->de_name_len) == name->len &&
352	    memcmp(dent+1, name->name, name->len) == 0)
353		return ret;
354	return 0;
355}
356
357static int gfs2_dirent_find(const struct gfs2_dirent *dent,
358			    const struct qstr *name,
359			    void *opaque)
360{
361	return __gfs2_dirent_find(dent, name, 1);
362}
363
364static int gfs2_dirent_prev(const struct gfs2_dirent *dent,
365			    const struct qstr *name,
366			    void *opaque)
367{
368	return __gfs2_dirent_find(dent, name, 2);
369}
370
371/*
372 * name->name holds ptr to start of block.
373 * name->len holds size of block.
374 */
375static int gfs2_dirent_last(const struct gfs2_dirent *dent,
376			    const struct qstr *name,
377			    void *opaque)
378{
379	const char *start = name->name;
380	const char *end = (const char *)dent + be16_to_cpu(dent->de_rec_len);
381	if (name->len == (end - start))
382		return 1;
383	return 0;
384}
385
386static int gfs2_dirent_find_space(const struct gfs2_dirent *dent,
387				  const struct qstr *name,
388				  void *opaque)
389{
390	unsigned required = GFS2_DIRENT_SIZE(name->len);
391	unsigned actual = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
392	unsigned totlen = be16_to_cpu(dent->de_rec_len);
393
394	if (gfs2_dirent_sentinel(dent))
395		actual = 0;
396	if (totlen - actual >= required)
397		return 1;
398	return 0;
399}
400
401struct dirent_gather {
402	const struct gfs2_dirent **pdent;
403	unsigned offset;
404};
405
406static int gfs2_dirent_gather(const struct gfs2_dirent *dent,
407			      const struct qstr *name,
408			      void *opaque)
409{
410	struct dirent_gather *g = opaque;
411	if (!gfs2_dirent_sentinel(dent)) {
412		g->pdent[g->offset++] = dent;
413	}
414	return 0;
415}
416
417/*
418 * Other possible things to check:
419 * - Inode located within filesystem size (and on valid block)
420 * - Valid directory entry type
421 * Not sure how heavy-weight we want to make this... could also check
422 * hash is correct for example, but that would take a lot of extra time.
423 * For now the most important thing is to check that the various sizes
424 * are correct.
425 */
426static int gfs2_check_dirent(struct gfs2_dirent *dent, unsigned int offset,
427			     unsigned int size, unsigned int len, int first)
428{
429	const char *msg = "gfs2_dirent too small";
430	if (unlikely(size < sizeof(struct gfs2_dirent)))
431		goto error;
432	msg = "gfs2_dirent misaligned";
433	if (unlikely(offset & 0x7))
434		goto error;
435	msg = "gfs2_dirent points beyond end of block";
436	if (unlikely(offset + size > len))
437		goto error;
438	msg = "zero inode number";
439	if (unlikely(!first && gfs2_dirent_sentinel(dent)))
440		goto error;
441	msg = "name length is greater than space in dirent";
442	if (!gfs2_dirent_sentinel(dent) &&
443	    unlikely(sizeof(struct gfs2_dirent)+be16_to_cpu(dent->de_name_len) >
444		     size))
445		goto error;
446	return 0;
447error:
448	printk(KERN_WARNING "gfs2_check_dirent: %s (%s)\n", msg,
449	       first ? "first in block" : "not first in block");
450	return -EIO;
451}
452
453static int gfs2_dirent_offset(const void *buf)
454{
455	const struct gfs2_meta_header *h = buf;
456	int offset;
457
458	BUG_ON(buf == NULL);
459
460	switch(be32_to_cpu(h->mh_type)) {
461	case GFS2_METATYPE_LF:
462		offset = sizeof(struct gfs2_leaf);
463		break;
464	case GFS2_METATYPE_DI:
465		offset = sizeof(struct gfs2_dinode);
466		break;
467	default:
468		goto wrong_type;
469	}
470	return offset;
471wrong_type:
472	printk(KERN_WARNING "gfs2_scan_dirent: wrong block type %u\n",
473	       be32_to_cpu(h->mh_type));
474	return -1;
475}
476
477static struct gfs2_dirent *gfs2_dirent_scan(struct inode *inode, void *buf,
478					    unsigned int len, gfs2_dscan_t scan,
479					    const struct qstr *name,
480					    void *opaque)
481{
482	struct gfs2_dirent *dent, *prev;
483	unsigned offset;
484	unsigned size;
485	int ret = 0;
486
487	ret = gfs2_dirent_offset(buf);
488	if (ret < 0)
489		goto consist_inode;
490
491	offset = ret;
492	prev = NULL;
493	dent = buf + offset;
494	size = be16_to_cpu(dent->de_rec_len);
495	if (gfs2_check_dirent(dent, offset, size, len, 1))
496		goto consist_inode;
497	do {
498		ret = scan(dent, name, opaque);
499		if (ret)
500			break;
501		offset += size;
502		if (offset == len)
503			break;
504		prev = dent;
505		dent = buf + offset;
506		size = be16_to_cpu(dent->de_rec_len);
507		if (gfs2_check_dirent(dent, offset, size, len, 0))
508			goto consist_inode;
509	} while(1);
510
511	switch(ret) {
512	case 0:
513		return NULL;
514	case 1:
515		return dent;
516	case 2:
517		return prev ? prev : dent;
518	default:
519		BUG_ON(ret > 0);
520		return ERR_PTR(ret);
521	}
522
523consist_inode:
524	gfs2_consist_inode(GFS2_I(inode));
525	return ERR_PTR(-EIO);
526}
527
528static int dirent_check_reclen(struct gfs2_inode *dip,
529			       const struct gfs2_dirent *d, const void *end_p)
530{
531	const void *ptr = d;
532	u16 rec_len = be16_to_cpu(d->de_rec_len);
533
534	if (unlikely(rec_len < sizeof(struct gfs2_dirent)))
535		goto broken;
536	ptr += rec_len;
537	if (ptr < end_p)
538		return rec_len;
539	if (ptr == end_p)
540		return -ENOENT;
541broken:
542	gfs2_consist_inode(dip);
543	return -EIO;
544}
545
546/**
547 * dirent_next - Next dirent
548 * @dip: the directory
549 * @bh: The buffer
550 * @dent: Pointer to list of dirents
551 *
552 * Returns: 0 on success, error code otherwise
553 */
554
555static int dirent_next(struct gfs2_inode *dip, struct buffer_head *bh,
556		       struct gfs2_dirent **dent)
557{
558	struct gfs2_dirent *cur = *dent, *tmp;
559	char *bh_end = bh->b_data + bh->b_size;
560	int ret;
561
562	ret = dirent_check_reclen(dip, cur, bh_end);
563	if (ret < 0)
564		return ret;
565
566	tmp = (void *)cur + ret;
567	ret = dirent_check_reclen(dip, tmp, bh_end);
568	if (ret == -EIO)
569		return ret;
570
571        /* Only the first dent could ever have de_inum.no_addr == 0 */
572	if (gfs2_dirent_sentinel(tmp)) {
573		gfs2_consist_inode(dip);
574		return -EIO;
575	}
576
577	*dent = tmp;
578	return 0;
579}
580
581/**
582 * dirent_del - Delete a dirent
583 * @dip: The GFS2 inode
584 * @bh: The buffer
585 * @prev: The previous dirent
586 * @cur: The current dirent
587 *
588 */
589
590static void dirent_del(struct gfs2_inode *dip, struct buffer_head *bh,
591		       struct gfs2_dirent *prev, struct gfs2_dirent *cur)
592{
593	u16 cur_rec_len, prev_rec_len;
594
595	if (gfs2_dirent_sentinel(cur)) {
596		gfs2_consist_inode(dip);
597		return;
598	}
599
600	gfs2_trans_add_bh(dip->i_gl, bh, 1);
601
602	/* If there is no prev entry, this is the first entry in the block.
603	   The de_rec_len is already as big as it needs to be.  Just zero
604	   out the inode number and return.  */
605
606	if (!prev) {
607		cur->de_inum.no_addr = 0;
608		cur->de_inum.no_formal_ino = 0;
609		return;
610	}
611
612	/*  Combine this dentry with the previous one.  */
613
614	prev_rec_len = be16_to_cpu(prev->de_rec_len);
615	cur_rec_len = be16_to_cpu(cur->de_rec_len);
616
617	if ((char *)prev + prev_rec_len != (char *)cur)
618		gfs2_consist_inode(dip);
619	if ((char *)cur + cur_rec_len > bh->b_data + bh->b_size)
620		gfs2_consist_inode(dip);
621
622	prev_rec_len += cur_rec_len;
623	prev->de_rec_len = cpu_to_be16(prev_rec_len);
624}
625
626/*
627 * Takes a dent from which to grab space as an argument. Returns the
628 * newly created dent.
629 */
630static struct gfs2_dirent *gfs2_init_dirent(struct inode *inode,
631					    struct gfs2_dirent *dent,
632					    const struct qstr *name,
633					    struct buffer_head *bh)
634{
635	struct gfs2_inode *ip = GFS2_I(inode);
636	struct gfs2_dirent *ndent;
637	unsigned offset = 0, totlen;
638
639	if (!gfs2_dirent_sentinel(dent))
640		offset = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
641	totlen = be16_to_cpu(dent->de_rec_len);
642	BUG_ON(offset + name->len > totlen);
643	gfs2_trans_add_bh(ip->i_gl, bh, 1);
644	ndent = (struct gfs2_dirent *)((char *)dent + offset);
645	dent->de_rec_len = cpu_to_be16(offset);
646	gfs2_qstr2dirent(name, totlen - offset, ndent);
647	return ndent;
648}
649
650static struct gfs2_dirent *gfs2_dirent_alloc(struct inode *inode,
651					     struct buffer_head *bh,
652					     const struct qstr *name)
653{
654	struct gfs2_dirent *dent;
655	dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
656				gfs2_dirent_find_space, name, NULL);
657	if (!dent || IS_ERR(dent))
658		return dent;
659	return gfs2_init_dirent(inode, dent, name, bh);
660}
661
662static int get_leaf(struct gfs2_inode *dip, u64 leaf_no,
663		    struct buffer_head **bhp)
664{
665	int error;
666
667	error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, bhp);
668	if (!error && gfs2_metatype_check(GFS2_SB(&dip->i_inode), *bhp, GFS2_METATYPE_LF)) {
669		/* printk(KERN_INFO "block num=%llu\n", leaf_no); */
670		error = -EIO;
671	}
672
673	return error;
674}
675
676/**
677 * get_leaf_nr - Get a leaf number associated with the index
678 * @dip: The GFS2 inode
679 * @index:
680 * @leaf_out:
681 *
682 * Returns: 0 on success, error code otherwise
683 */
684
685static int get_leaf_nr(struct gfs2_inode *dip, u32 index,
686		       u64 *leaf_out)
687{
688	__be64 leaf_no;
689	int error;
690
691	error = gfs2_dir_read_data(dip, (char *)&leaf_no,
692				    index * sizeof(__be64),
693				    sizeof(__be64), 0);
694	if (error != sizeof(u64))
695		return (error < 0) ? error : -EIO;
696
697	*leaf_out = be64_to_cpu(leaf_no);
698
699	return 0;
700}
701
702static int get_first_leaf(struct gfs2_inode *dip, u32 index,
703			  struct buffer_head **bh_out)
704{
705	u64 leaf_no;
706	int error;
707
708	error = get_leaf_nr(dip, index, &leaf_no);
709	if (!error)
710		error = get_leaf(dip, leaf_no, bh_out);
711
712	return error;
713}
714
715static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
716					      const struct qstr *name,
717					      gfs2_dscan_t scan,
718					      struct buffer_head **pbh)
719{
720	struct buffer_head *bh;
721	struct gfs2_dirent *dent;
722	struct gfs2_inode *ip = GFS2_I(inode);
723	int error;
724
725	if (ip->i_diskflags & GFS2_DIF_EXHASH) {
726		struct gfs2_leaf *leaf;
727		unsigned hsize = 1 << ip->i_depth;
728		unsigned index;
729		u64 ln;
730		if (hsize * sizeof(u64) != ip->i_disksize) {
731			gfs2_consist_inode(ip);
732			return ERR_PTR(-EIO);
733		}
734
735		index = name->hash >> (32 - ip->i_depth);
736		error = get_first_leaf(ip, index, &bh);
737		if (error)
738			return ERR_PTR(error);
739		do {
740			dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
741						scan, name, NULL);
742			if (dent)
743				goto got_dent;
744			leaf = (struct gfs2_leaf *)bh->b_data;
745			ln = be64_to_cpu(leaf->lf_next);
746			brelse(bh);
747			if (!ln)
748				break;
749
750			error = get_leaf(ip, ln, &bh);
751		} while(!error);
752
753		return error ? ERR_PTR(error) : NULL;
754	}
755
756
757	error = gfs2_meta_inode_buffer(ip, &bh);
758	if (error)
759		return ERR_PTR(error);
760	dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, scan, name, NULL);
761got_dent:
762	if (unlikely(dent == NULL || IS_ERR(dent))) {
763		brelse(bh);
764		bh = NULL;
765	}
766	*pbh = bh;
767	return dent;
768}
769
770static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, u16 depth)
771{
772	struct gfs2_inode *ip = GFS2_I(inode);
773	unsigned int n = 1;
774	u64 bn;
775	int error;
776	struct buffer_head *bh;
777	struct gfs2_leaf *leaf;
778	struct gfs2_dirent *dent;
779	struct qstr name = { .name = "", .len = 0, .hash = 0 };
780
781	error = gfs2_alloc_block(ip, &bn, &n);
782	if (error)
783		return NULL;
784	bh = gfs2_meta_new(ip->i_gl, bn);
785	if (!bh)
786		return NULL;
787
788	gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, 1);
789	gfs2_trans_add_bh(ip->i_gl, bh, 1);
790	gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF);
791	leaf = (struct gfs2_leaf *)bh->b_data;
792	leaf->lf_depth = cpu_to_be16(depth);
793	leaf->lf_entries = 0;
794	leaf->lf_dirent_format = cpu_to_be32(GFS2_FORMAT_DE);
795	leaf->lf_next = 0;
796	memset(leaf->lf_reserved, 0, sizeof(leaf->lf_reserved));
797	dent = (struct gfs2_dirent *)(leaf+1);
798	gfs2_qstr2dirent(&name, bh->b_size - sizeof(struct gfs2_leaf), dent);
799	*pbh = bh;
800	return leaf;
801}
802
803/**
804 * dir_make_exhash - Convert a stuffed directory into an ExHash directory
805 * @dip: The GFS2 inode
806 *
807 * Returns: 0 on success, error code otherwise
808 */
809
810static int dir_make_exhash(struct inode *inode)
811{
812	struct gfs2_inode *dip = GFS2_I(inode);
813	struct gfs2_sbd *sdp = GFS2_SB(inode);
814	struct gfs2_dirent *dent;
815	struct qstr args;
816	struct buffer_head *bh, *dibh;
817	struct gfs2_leaf *leaf;
818	int y;
819	u32 x;
820	__be64 *lp;
821	u64 bn;
822	int error;
823
824	error = gfs2_meta_inode_buffer(dip, &dibh);
825	if (error)
826		return error;
827
828	/*  Turn over a new leaf  */
829
830	leaf = new_leaf(inode, &bh, 0);
831	if (!leaf)
832		return -ENOSPC;
833	bn = bh->b_blocknr;
834
835	gfs2_assert(sdp, dip->i_entries < (1 << 16));
836	leaf->lf_entries = cpu_to_be16(dip->i_entries);
837
838	/*  Copy dirents  */
839
840	gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_leaf), dibh,
841			     sizeof(struct gfs2_dinode));
842
843	/*  Find last entry  */
844
845	x = 0;
846	args.len = bh->b_size - sizeof(struct gfs2_dinode) +
847		   sizeof(struct gfs2_leaf);
848	args.name = bh->b_data;
849	dent = gfs2_dirent_scan(&dip->i_inode, bh->b_data, bh->b_size,
850				gfs2_dirent_last, &args, NULL);
851	if (!dent) {
852		brelse(bh);
853		brelse(dibh);
854		return -EIO;
855	}
856	if (IS_ERR(dent)) {
857		brelse(bh);
858		brelse(dibh);
859		return PTR_ERR(dent);
860	}
861
862	/*  Adjust the last dirent's record length
863	   (Remember that dent still points to the last entry.)  */
864
865	dent->de_rec_len = cpu_to_be16(be16_to_cpu(dent->de_rec_len) +
866		sizeof(struct gfs2_dinode) -
867		sizeof(struct gfs2_leaf));
868
869	brelse(bh);
870
871	/*  We're done with the new leaf block, now setup the new
872	    hash table.  */
873
874	gfs2_trans_add_bh(dip->i_gl, dibh, 1);
875	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
876
877	lp = (__be64 *)(dibh->b_data + sizeof(struct gfs2_dinode));
878
879	for (x = sdp->sd_hash_ptrs; x--; lp++)
880		*lp = cpu_to_be64(bn);
881
882	dip->i_disksize = sdp->sd_sb.sb_bsize / 2;
883	gfs2_add_inode_blocks(&dip->i_inode, 1);
884	dip->i_diskflags |= GFS2_DIF_EXHASH;
885
886	for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ;
887	dip->i_depth = y;
888
889	gfs2_dinode_out(dip, dibh->b_data);
890
891	brelse(dibh);
892
893	return 0;
894}
895
896/**
897 * dir_split_leaf - Split a leaf block into two
898 * @dip: The GFS2 inode
899 * @index:
900 * @leaf_no:
901 *
902 * Returns: 0 on success, error code on failure
903 */
904
905static int dir_split_leaf(struct inode *inode, const struct qstr *name)
906{
907	struct gfs2_inode *dip = GFS2_I(inode);
908	struct buffer_head *nbh, *obh, *dibh;
909	struct gfs2_leaf *nleaf, *oleaf;
910	struct gfs2_dirent *dent = NULL, *prev = NULL, *next = NULL, *new;
911	u32 start, len, half_len, divider;
912	u64 bn, leaf_no;
913	__be64 *lp;
914	u32 index;
915	int x, moved = 0;
916	int error;
917
918	index = name->hash >> (32 - dip->i_depth);
919	error = get_leaf_nr(dip, index, &leaf_no);
920	if (error)
921		return error;
922
923	/*  Get the old leaf block  */
924	error = get_leaf(dip, leaf_no, &obh);
925	if (error)
926		return error;
927
928	oleaf = (struct gfs2_leaf *)obh->b_data;
929	if (dip->i_depth == be16_to_cpu(oleaf->lf_depth)) {
930		brelse(obh);
931		return 1; /* can't split */
932	}
933
934	gfs2_trans_add_bh(dip->i_gl, obh, 1);
935
936	nleaf = new_leaf(inode, &nbh, be16_to_cpu(oleaf->lf_depth) + 1);
937	if (!nleaf) {
938		brelse(obh);
939		return -ENOSPC;
940	}
941	bn = nbh->b_blocknr;
942
943	/*  Compute the start and len of leaf pointers in the hash table.  */
944	len = 1 << (dip->i_depth - be16_to_cpu(oleaf->lf_depth));
945	half_len = len >> 1;
946	if (!half_len) {
947		printk(KERN_WARNING "i_depth %u lf_depth %u index %u\n", dip->i_depth, be16_to_cpu(oleaf->lf_depth), index);
948		gfs2_consist_inode(dip);
949		error = -EIO;
950		goto fail_brelse;
951	}
952
953	start = (index & ~(len - 1));
954
955	/* Change the pointers.
956	   Don't bother distinguishing stuffed from non-stuffed.
957	   This code is complicated enough already. */
958	lp = kmalloc(half_len * sizeof(__be64), GFP_NOFS);
959	if (!lp) {
960		error = -ENOMEM;
961		goto fail_brelse;
962	}
963
964	/*  Change the pointers  */
965	for (x = 0; x < half_len; x++)
966		lp[x] = cpu_to_be64(bn);
967
968	error = gfs2_dir_write_data(dip, (char *)lp, start * sizeof(u64),
969				    half_len * sizeof(u64));
970	if (error != half_len * sizeof(u64)) {
971		if (error >= 0)
972			error = -EIO;
973		goto fail_lpfree;
974	}
975
976	kfree(lp);
977
978	/*  Compute the divider  */
979	divider = (start + half_len) << (32 - dip->i_depth);
980
981	/*  Copy the entries  */
982	dent = (struct gfs2_dirent *)(obh->b_data + sizeof(struct gfs2_leaf));
983
984	do {
985		next = dent;
986		if (dirent_next(dip, obh, &next))
987			next = NULL;
988
989		if (!gfs2_dirent_sentinel(dent) &&
990		    be32_to_cpu(dent->de_hash) < divider) {
991			struct qstr str;
992			str.name = (char*)(dent+1);
993			str.len = be16_to_cpu(dent->de_name_len);
994			str.hash = be32_to_cpu(dent->de_hash);
995			new = gfs2_dirent_alloc(inode, nbh, &str);
996			if (IS_ERR(new)) {
997				error = PTR_ERR(new);
998				break;
999			}
1000
1001			new->de_inum = dent->de_inum; /* No endian worries */
1002			new->de_type = dent->de_type; /* No endian worries */
1003			be16_add_cpu(&nleaf->lf_entries, 1);
1004
1005			dirent_del(dip, obh, prev, dent);
1006
1007			if (!oleaf->lf_entries)
1008				gfs2_consist_inode(dip);
1009			be16_add_cpu(&oleaf->lf_entries, -1);
1010
1011			if (!prev)
1012				prev = dent;
1013
1014			moved = 1;
1015		} else {
1016			prev = dent;
1017		}
1018		dent = next;
1019	} while (dent);
1020
1021	oleaf->lf_depth = nleaf->lf_depth;
1022
1023	error = gfs2_meta_inode_buffer(dip, &dibh);
1024	if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) {
1025		gfs2_trans_add_bh(dip->i_gl, dibh, 1);
1026		gfs2_add_inode_blocks(&dip->i_inode, 1);
1027		gfs2_dinode_out(dip, dibh->b_data);
1028		brelse(dibh);
1029	}
1030
1031	brelse(obh);
1032	brelse(nbh);
1033
1034	return error;
1035
1036fail_lpfree:
1037	kfree(lp);
1038
1039fail_brelse:
1040	brelse(obh);
1041	brelse(nbh);
1042	return error;
1043}
1044
1045/**
1046 * dir_double_exhash - Double size of ExHash table
1047 * @dip: The GFS2 dinode
1048 *
1049 * Returns: 0 on success, error code on failure
1050 */
1051
1052static int dir_double_exhash(struct gfs2_inode *dip)
1053{
1054	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1055	struct buffer_head *dibh;
1056	u32 hsize;
1057	u64 *buf;
1058	u64 *from, *to;
1059	u64 block;
1060	int x;
1061	int error = 0;
1062
1063	hsize = 1 << dip->i_depth;
1064	if (hsize * sizeof(u64) != dip->i_disksize) {
1065		gfs2_consist_inode(dip);
1066		return -EIO;
1067	}
1068
1069	/*  Allocate both the "from" and "to" buffers in one big chunk  */
1070
1071	buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS);
1072	if (!buf)
1073		return -ENOMEM;
1074
1075	for (block = dip->i_disksize >> sdp->sd_hash_bsize_shift; block--;) {
1076		error = gfs2_dir_read_data(dip, (char *)buf,
1077					    block * sdp->sd_hash_bsize,
1078					    sdp->sd_hash_bsize, 1);
1079		if (error != sdp->sd_hash_bsize) {
1080			if (error >= 0)
1081				error = -EIO;
1082			goto fail;
1083		}
1084
1085		from = buf;
1086		to = (u64 *)((char *)buf + sdp->sd_hash_bsize);
1087
1088		for (x = sdp->sd_hash_ptrs; x--; from++) {
1089			*to++ = *from;	/*  No endianess worries  */
1090			*to++ = *from;
1091		}
1092
1093		error = gfs2_dir_write_data(dip,
1094					     (char *)buf + sdp->sd_hash_bsize,
1095					     block * sdp->sd_sb.sb_bsize,
1096					     sdp->sd_sb.sb_bsize);
1097		if (error != sdp->sd_sb.sb_bsize) {
1098			if (error >= 0)
1099				error = -EIO;
1100			goto fail;
1101		}
1102	}
1103
1104	kfree(buf);
1105
1106	error = gfs2_meta_inode_buffer(dip, &dibh);
1107	if (!gfs2_assert_withdraw(sdp, !error)) {
1108		dip->i_depth++;
1109		gfs2_dinode_out(dip, dibh->b_data);
1110		brelse(dibh);
1111	}
1112
1113	return error;
1114
1115fail:
1116	kfree(buf);
1117	return error;
1118}
1119
1120/**
1121 * compare_dents - compare directory entries by hash value
1122 * @a: first dent
1123 * @b: second dent
1124 *
1125 * When comparing the hash entries of @a to @b:
1126 *   gt: returns 1
1127 *   lt: returns -1
1128 *   eq: returns 0
1129 */
1130
1131static int compare_dents(const void *a, const void *b)
1132{
1133	const struct gfs2_dirent *dent_a, *dent_b;
1134	u32 hash_a, hash_b;
1135	int ret = 0;
1136
1137	dent_a = *(const struct gfs2_dirent **)a;
1138	hash_a = be32_to_cpu(dent_a->de_hash);
1139
1140	dent_b = *(const struct gfs2_dirent **)b;
1141	hash_b = be32_to_cpu(dent_b->de_hash);
1142
1143	if (hash_a > hash_b)
1144		ret = 1;
1145	else if (hash_a < hash_b)
1146		ret = -1;
1147	else {
1148		unsigned int len_a = be16_to_cpu(dent_a->de_name_len);
1149		unsigned int len_b = be16_to_cpu(dent_b->de_name_len);
1150
1151		if (len_a > len_b)
1152			ret = 1;
1153		else if (len_a < len_b)
1154			ret = -1;
1155		else
1156			ret = memcmp(dent_a + 1, dent_b + 1, len_a);
1157	}
1158
1159	return ret;
1160}
1161
1162/**
1163 * do_filldir_main - read out directory entries
1164 * @dip: The GFS2 inode
1165 * @offset: The offset in the file to read from
1166 * @opaque: opaque data to pass to filldir
1167 * @filldir: The function to pass entries to
1168 * @darr: an array of struct gfs2_dirent pointers to read
1169 * @entries: the number of entries in darr
1170 * @copied: pointer to int that's non-zero if a entry has been copied out
1171 *
1172 * Jump through some hoops to make sure that if there are hash collsions,
1173 * they are read out at the beginning of a buffer.  We want to minimize
1174 * the possibility that they will fall into different readdir buffers or
1175 * that someone will want to seek to that location.
1176 *
1177 * Returns: errno, >0 on exception from filldir
1178 */
1179
1180static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
1181			   void *opaque, filldir_t filldir,
1182			   const struct gfs2_dirent **darr, u32 entries,
1183			   int *copied)
1184{
1185	const struct gfs2_dirent *dent, *dent_next;
1186	u64 off, off_next;
1187	unsigned int x, y;
1188	int run = 0;
1189	int error = 0;
1190
1191	sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL);
1192
1193	dent_next = darr[0];
1194	off_next = be32_to_cpu(dent_next->de_hash);
1195	off_next = gfs2_disk_hash2offset(off_next);
1196
1197	for (x = 0, y = 1; x < entries; x++, y++) {
1198		dent = dent_next;
1199		off = off_next;
1200
1201		if (y < entries) {
1202			dent_next = darr[y];
1203			off_next = be32_to_cpu(dent_next->de_hash);
1204			off_next = gfs2_disk_hash2offset(off_next);
1205
1206			if (off < *offset)
1207				continue;
1208			*offset = off;
1209
1210			if (off_next == off) {
1211				if (*copied && !run)
1212					return 1;
1213				run = 1;
1214			} else
1215				run = 0;
1216		} else {
1217			if (off < *offset)
1218				continue;
1219			*offset = off;
1220		}
1221
1222		error = filldir(opaque, (const char *)(dent + 1),
1223				be16_to_cpu(dent->de_name_len),
1224				off, be64_to_cpu(dent->de_inum.no_addr),
1225				be16_to_cpu(dent->de_type));
1226		if (error)
1227			return 1;
1228
1229		*copied = 1;
1230	}
1231
1232	/* Increment the *offset by one, so the next time we come into the
1233	   do_filldir fxn, we get the next entry instead of the last one in the
1234	   current leaf */
1235
1236	(*offset)++;
1237
1238	return 0;
1239}
1240
1241static void *gfs2_alloc_sort_buffer(unsigned size)
1242{
1243	void *ptr = NULL;
1244
1245	if (size < KMALLOC_MAX_SIZE)
1246		ptr = kmalloc(size, GFP_NOFS | __GFP_NOWARN);
1247	if (!ptr)
1248		ptr = __vmalloc(size, GFP_NOFS, PAGE_KERNEL);
1249	return ptr;
1250}
1251
1252static void gfs2_free_sort_buffer(void *ptr)
1253{
1254	if (is_vmalloc_addr(ptr))
1255		vfree(ptr);
1256	else
1257		kfree(ptr);
1258}
1259
1260static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
1261			      filldir_t filldir, int *copied, unsigned *depth,
1262			      u64 leaf_no)
1263{
1264	struct gfs2_inode *ip = GFS2_I(inode);
1265	struct gfs2_sbd *sdp = GFS2_SB(inode);
1266	struct buffer_head *bh;
1267	struct gfs2_leaf *lf;
1268	unsigned entries = 0, entries2 = 0;
1269	unsigned leaves = 0;
1270	const struct gfs2_dirent **darr, *dent;
1271	struct dirent_gather g;
1272	struct buffer_head **larr;
1273	int leaf = 0;
1274	int error, i;
1275	u64 lfn = leaf_no;
1276
1277	do {
1278		error = get_leaf(ip, lfn, &bh);
1279		if (error)
1280			goto out;
1281		lf = (struct gfs2_leaf *)bh->b_data;
1282		if (leaves == 0)
1283			*depth = be16_to_cpu(lf->lf_depth);
1284		entries += be16_to_cpu(lf->lf_entries);
1285		leaves++;
1286		lfn = be64_to_cpu(lf->lf_next);
1287		brelse(bh);
1288	} while(lfn);
1289
1290	if (!entries)
1291		return 0;
1292
1293	error = -ENOMEM;
1294	/*
1295	 * The extra 99 entries are not normally used, but are a buffer
1296	 * zone in case the number of entries in the leaf is corrupt.
1297	 * 99 is the maximum number of entries that can fit in a single
1298	 * leaf block.
1299	 */
1300	larr = gfs2_alloc_sort_buffer((leaves + entries + 99) * sizeof(void *));
1301	if (!larr)
1302		goto out;
1303	darr = (const struct gfs2_dirent **)(larr + leaves);
1304	g.pdent = darr;
1305	g.offset = 0;
1306	lfn = leaf_no;
1307
1308	do {
1309		error = get_leaf(ip, lfn, &bh);
1310		if (error)
1311			goto out_free;
1312		lf = (struct gfs2_leaf *)bh->b_data;
1313		lfn = be64_to_cpu(lf->lf_next);
1314		if (lf->lf_entries) {
1315			entries2 += be16_to_cpu(lf->lf_entries);
1316			dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
1317						gfs2_dirent_gather, NULL, &g);
1318			error = PTR_ERR(dent);
1319			if (IS_ERR(dent))
1320				goto out_free;
1321			if (entries2 != g.offset) {
1322				fs_warn(sdp, "Number of entries corrupt in dir "
1323						"leaf %llu, entries2 (%u) != "
1324						"g.offset (%u)\n",
1325					(unsigned long long)bh->b_blocknr,
1326					entries2, g.offset);
1327
1328				error = -EIO;
1329				goto out_free;
1330			}
1331			error = 0;
1332			larr[leaf++] = bh;
1333		} else {
1334			brelse(bh);
1335		}
1336	} while(lfn);
1337
1338	BUG_ON(entries2 != entries);
1339	error = do_filldir_main(ip, offset, opaque, filldir, darr,
1340				entries, copied);
1341out_free:
1342	for(i = 0; i < leaf; i++)
1343		brelse(larr[i]);
1344	gfs2_free_sort_buffer(larr);
1345out:
1346	return error;
1347}
1348
1349/**
1350 * dir_e_read - Reads the entries from a directory into a filldir buffer
1351 * @dip: dinode pointer
1352 * @offset: the hash of the last entry read shifted to the right once
1353 * @opaque: buffer for the filldir function to fill
1354 * @filldir: points to the filldir function to use
1355 *
1356 * Returns: errno
1357 */
1358
1359static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
1360		      filldir_t filldir)
1361{
1362	struct gfs2_inode *dip = GFS2_I(inode);
1363	struct gfs2_sbd *sdp = GFS2_SB(inode);
1364	u32 hsize, len = 0;
1365	u32 ht_offset, lp_offset, ht_offset_cur = -1;
1366	u32 hash, index;
1367	__be64 *lp;
1368	int copied = 0;
1369	int error = 0;
1370	unsigned depth = 0;
1371
1372	hsize = 1 << dip->i_depth;
1373	if (hsize * sizeof(u64) != dip->i_disksize) {
1374		gfs2_consist_inode(dip);
1375		return -EIO;
1376	}
1377
1378	hash = gfs2_dir_offset2hash(*offset);
1379	index = hash >> (32 - dip->i_depth);
1380
1381	lp = kmalloc(sdp->sd_hash_bsize, GFP_NOFS);
1382	if (!lp)
1383		return -ENOMEM;
1384
1385	while (index < hsize) {
1386		lp_offset = index & (sdp->sd_hash_ptrs - 1);
1387		ht_offset = index - lp_offset;
1388
1389		if (ht_offset_cur != ht_offset) {
1390			error = gfs2_dir_read_data(dip, (char *)lp,
1391						ht_offset * sizeof(__be64),
1392						sdp->sd_hash_bsize, 1);
1393			if (error != sdp->sd_hash_bsize) {
1394				if (error >= 0)
1395					error = -EIO;
1396				goto out;
1397			}
1398			ht_offset_cur = ht_offset;
1399		}
1400
1401		error = gfs2_dir_read_leaf(inode, offset, opaque, filldir,
1402					   &copied, &depth,
1403					   be64_to_cpu(lp[lp_offset]));
1404		if (error)
1405			break;
1406
1407		len = 1 << (dip->i_depth - depth);
1408		index = (index & ~(len - 1)) + len;
1409	}
1410
1411out:
1412	kfree(lp);
1413	if (error > 0)
1414		error = 0;
1415	return error;
1416}
1417
1418int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
1419		  filldir_t filldir)
1420{
1421	struct gfs2_inode *dip = GFS2_I(inode);
1422	struct gfs2_sbd *sdp = GFS2_SB(inode);
1423	struct dirent_gather g;
1424	const struct gfs2_dirent **darr, *dent;
1425	struct buffer_head *dibh;
1426	int copied = 0;
1427	int error;
1428
1429	if (!dip->i_entries)
1430		return 0;
1431
1432	if (dip->i_diskflags & GFS2_DIF_EXHASH)
1433		return dir_e_read(inode, offset, opaque, filldir);
1434
1435	if (!gfs2_is_stuffed(dip)) {
1436		gfs2_consist_inode(dip);
1437		return -EIO;
1438	}
1439
1440	error = gfs2_meta_inode_buffer(dip, &dibh);
1441	if (error)
1442		return error;
1443
1444	error = -ENOMEM;
1445	/* 96 is max number of dirents which can be stuffed into an inode */
1446	darr = kmalloc(96 * sizeof(struct gfs2_dirent *), GFP_NOFS);
1447	if (darr) {
1448		g.pdent = darr;
1449		g.offset = 0;
1450		dent = gfs2_dirent_scan(inode, dibh->b_data, dibh->b_size,
1451					gfs2_dirent_gather, NULL, &g);
1452		if (IS_ERR(dent)) {
1453			error = PTR_ERR(dent);
1454			goto out;
1455		}
1456		if (dip->i_entries != g.offset) {
1457			fs_warn(sdp, "Number of entries corrupt in dir %llu, "
1458				"ip->i_entries (%u) != g.offset (%u)\n",
1459				(unsigned long long)dip->i_no_addr,
1460				dip->i_entries,
1461				g.offset);
1462			error = -EIO;
1463			goto out;
1464		}
1465		error = do_filldir_main(dip, offset, opaque, filldir, darr,
1466					dip->i_entries, &copied);
1467out:
1468		kfree(darr);
1469	}
1470
1471	if (error > 0)
1472		error = 0;
1473
1474	brelse(dibh);
1475
1476	return error;
1477}
1478
1479/**
1480 * gfs2_dir_search - Search a directory
1481 * @dip: The GFS2 inode
1482 * @filename:
1483 * @inode:
1484 *
1485 * This routine searches a directory for a file or another directory.
1486 * Assumes a glock is held on dip.
1487 *
1488 * Returns: errno
1489 */
1490
1491struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name)
1492{
1493	struct buffer_head *bh;
1494	struct gfs2_dirent *dent;
1495	struct inode *inode;
1496
1497	dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh);
1498	if (dent) {
1499		if (IS_ERR(dent))
1500			return ERR_CAST(dent);
1501		inode = gfs2_inode_lookup(dir->i_sb,
1502				be16_to_cpu(dent->de_type),
1503				be64_to_cpu(dent->de_inum.no_addr),
1504				be64_to_cpu(dent->de_inum.no_formal_ino));
1505		brelse(bh);
1506		return inode;
1507	}
1508	return ERR_PTR(-ENOENT);
1509}
1510
1511int gfs2_dir_check(struct inode *dir, const struct qstr *name,
1512		   const struct gfs2_inode *ip)
1513{
1514	struct buffer_head *bh;
1515	struct gfs2_dirent *dent;
1516	int ret = -ENOENT;
1517
1518	dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh);
1519	if (dent) {
1520		if (IS_ERR(dent))
1521			return PTR_ERR(dent);
1522		if (ip) {
1523			if (be64_to_cpu(dent->de_inum.no_addr) != ip->i_no_addr)
1524				goto out;
1525			if (be64_to_cpu(dent->de_inum.no_formal_ino) !=
1526			    ip->i_no_formal_ino)
1527				goto out;
1528			if (unlikely(IF2DT(ip->i_inode.i_mode) !=
1529			    be16_to_cpu(dent->de_type))) {
1530				gfs2_consist_inode(GFS2_I(dir));
1531				ret = -EIO;
1532				goto out;
1533			}
1534		}
1535		ret = 0;
1536out:
1537		brelse(bh);
1538	}
1539	return ret;
1540}
1541
1542static int dir_new_leaf(struct inode *inode, const struct qstr *name)
1543{
1544	struct buffer_head *bh, *obh;
1545	struct gfs2_inode *ip = GFS2_I(inode);
1546	struct gfs2_leaf *leaf, *oleaf;
1547	int error;
1548	u32 index;
1549	u64 bn;
1550
1551	index = name->hash >> (32 - ip->i_depth);
1552	error = get_first_leaf(ip, index, &obh);
1553	if (error)
1554		return error;
1555	do {
1556		oleaf = (struct gfs2_leaf *)obh->b_data;
1557		bn = be64_to_cpu(oleaf->lf_next);
1558		if (!bn)
1559			break;
1560		brelse(obh);
1561		error = get_leaf(ip, bn, &obh);
1562		if (error)
1563			return error;
1564	} while(1);
1565
1566	gfs2_trans_add_bh(ip->i_gl, obh, 1);
1567
1568	leaf = new_leaf(inode, &bh, be16_to_cpu(oleaf->lf_depth));
1569	if (!leaf) {
1570		brelse(obh);
1571		return -ENOSPC;
1572	}
1573	oleaf->lf_next = cpu_to_be64(bh->b_blocknr);
1574	brelse(bh);
1575	brelse(obh);
1576
1577	error = gfs2_meta_inode_buffer(ip, &bh);
1578	if (error)
1579		return error;
1580	gfs2_trans_add_bh(ip->i_gl, bh, 1);
1581	gfs2_add_inode_blocks(&ip->i_inode, 1);
1582	gfs2_dinode_out(ip, bh->b_data);
1583	brelse(bh);
1584	return 0;
1585}
1586
1587/**
1588 * gfs2_dir_add - Add new filename into directory
1589 * @dip: The GFS2 inode
1590 * @filename: The new name
1591 * @inode: The inode number of the entry
1592 * @type: The type of the entry
1593 *
1594 * Returns: 0 on success, error code on failure
1595 */
1596
1597int gfs2_dir_add(struct inode *inode, const struct qstr *name,
1598		 const struct gfs2_inode *nip, unsigned type)
1599{
1600	struct gfs2_inode *ip = GFS2_I(inode);
1601	struct buffer_head *bh;
1602	struct gfs2_dirent *dent;
1603	struct gfs2_leaf *leaf;
1604	int error;
1605
1606	while(1) {
1607		dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space,
1608					  &bh);
1609		if (dent) {
1610			if (IS_ERR(dent))
1611				return PTR_ERR(dent);
1612			dent = gfs2_init_dirent(inode, dent, name, bh);
1613			gfs2_inum_out(nip, dent);
1614			dent->de_type = cpu_to_be16(type);
1615			if (ip->i_diskflags & GFS2_DIF_EXHASH) {
1616				leaf = (struct gfs2_leaf *)bh->b_data;
1617				be16_add_cpu(&leaf->lf_entries, 1);
1618			}
1619			brelse(bh);
1620			error = gfs2_meta_inode_buffer(ip, &bh);
1621			if (error)
1622				break;
1623			gfs2_trans_add_bh(ip->i_gl, bh, 1);
1624			ip->i_entries++;
1625			ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1626			gfs2_dinode_out(ip, bh->b_data);
1627			brelse(bh);
1628			error = 0;
1629			break;
1630		}
1631		if (!(ip->i_diskflags & GFS2_DIF_EXHASH)) {
1632			error = dir_make_exhash(inode);
1633			if (error)
1634				break;
1635			continue;
1636		}
1637		error = dir_split_leaf(inode, name);
1638		if (error == 0)
1639			continue;
1640		if (error < 0)
1641			break;
1642		if (ip->i_depth < GFS2_DIR_MAX_DEPTH) {
1643			error = dir_double_exhash(ip);
1644			if (error)
1645				break;
1646			error = dir_split_leaf(inode, name);
1647			if (error < 0)
1648				break;
1649			if (error == 0)
1650				continue;
1651		}
1652		error = dir_new_leaf(inode, name);
1653		if (!error)
1654			continue;
1655		error = -ENOSPC;
1656		break;
1657	}
1658	return error;
1659}
1660
1661
1662/**
1663 * gfs2_dir_del - Delete a directory entry
1664 * @dip: The GFS2 inode
1665 * @filename: The filename
1666 *
1667 * Returns: 0 on success, error code on failure
1668 */
1669
1670int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name)
1671{
1672	struct gfs2_dirent *dent, *prev = NULL;
1673	struct buffer_head *bh;
1674	int error;
1675
1676	/* Returns _either_ the entry (if its first in block) or the
1677	   previous entry otherwise */
1678	dent = gfs2_dirent_search(&dip->i_inode, name, gfs2_dirent_prev, &bh);
1679	if (!dent) {
1680		gfs2_consist_inode(dip);
1681		return -EIO;
1682	}
1683	if (IS_ERR(dent)) {
1684		gfs2_consist_inode(dip);
1685		return PTR_ERR(dent);
1686	}
1687	/* If not first in block, adjust pointers accordingly */
1688	if (gfs2_dirent_find(dent, name, NULL) == 0) {
1689		prev = dent;
1690		dent = (struct gfs2_dirent *)((char *)dent + be16_to_cpu(prev->de_rec_len));
1691	}
1692
1693	dirent_del(dip, bh, prev, dent);
1694	if (dip->i_diskflags & GFS2_DIF_EXHASH) {
1695		struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data;
1696		u16 entries = be16_to_cpu(leaf->lf_entries);
1697		if (!entries)
1698			gfs2_consist_inode(dip);
1699		leaf->lf_entries = cpu_to_be16(--entries);
1700	}
1701	brelse(bh);
1702
1703	error = gfs2_meta_inode_buffer(dip, &bh);
1704	if (error)
1705		return error;
1706
1707	if (!dip->i_entries)
1708		gfs2_consist_inode(dip);
1709	gfs2_trans_add_bh(dip->i_gl, bh, 1);
1710	dip->i_entries--;
1711	dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME;
1712	gfs2_dinode_out(dip, bh->b_data);
1713	brelse(bh);
1714	mark_inode_dirty(&dip->i_inode);
1715
1716	return error;
1717}
1718
1719/**
1720 * gfs2_dir_mvino - Change inode number of directory entry
1721 * @dip: The GFS2 inode
1722 * @filename:
1723 * @new_inode:
1724 *
1725 * This routine changes the inode number of a directory entry.  It's used
1726 * by rename to change ".." when a directory is moved.
1727 * Assumes a glock is held on dvp.
1728 *
1729 * Returns: errno
1730 */
1731
1732int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
1733		   const struct gfs2_inode *nip, unsigned int new_type)
1734{
1735	struct buffer_head *bh;
1736	struct gfs2_dirent *dent;
1737	int error;
1738
1739	dent = gfs2_dirent_search(&dip->i_inode, filename, gfs2_dirent_find, &bh);
1740	if (!dent) {
1741		gfs2_consist_inode(dip);
1742		return -EIO;
1743	}
1744	if (IS_ERR(dent))
1745		return PTR_ERR(dent);
1746
1747	gfs2_trans_add_bh(dip->i_gl, bh, 1);
1748	gfs2_inum_out(nip, dent);
1749	dent->de_type = cpu_to_be16(new_type);
1750
1751	if (dip->i_diskflags & GFS2_DIF_EXHASH) {
1752		brelse(bh);
1753		error = gfs2_meta_inode_buffer(dip, &bh);
1754		if (error)
1755			return error;
1756		gfs2_trans_add_bh(dip->i_gl, bh, 1);
1757	}
1758
1759	dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME;
1760	gfs2_dinode_out(dip, bh->b_data);
1761	brelse(bh);
1762	return 0;
1763}
1764
1765/**
1766 * foreach_leaf - call a function for each leaf in a directory
1767 * @dip: the directory
1768 * @lc: the function to call for each each
1769 * @data: private data to pass to it
1770 *
1771 * Returns: errno
1772 */
1773
1774static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
1775{
1776	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1777	struct buffer_head *bh;
1778	struct gfs2_leaf *leaf;
1779	u32 hsize, len;
1780	u32 ht_offset, lp_offset, ht_offset_cur = -1;
1781	u32 index = 0;
1782	__be64 *lp;
1783	u64 leaf_no;
1784	int error = 0;
1785
1786	hsize = 1 << dip->i_depth;
1787	if (hsize * sizeof(u64) != dip->i_disksize) {
1788		gfs2_consist_inode(dip);
1789		return -EIO;
1790	}
1791
1792	lp = kmalloc(sdp->sd_hash_bsize, GFP_NOFS);
1793	if (!lp)
1794		return -ENOMEM;
1795
1796	while (index < hsize) {
1797		lp_offset = index & (sdp->sd_hash_ptrs - 1);
1798		ht_offset = index - lp_offset;
1799
1800		if (ht_offset_cur != ht_offset) {
1801			error = gfs2_dir_read_data(dip, (char *)lp,
1802						ht_offset * sizeof(__be64),
1803						sdp->sd_hash_bsize, 1);
1804			if (error != sdp->sd_hash_bsize) {
1805				if (error >= 0)
1806					error = -EIO;
1807				goto out;
1808			}
1809			ht_offset_cur = ht_offset;
1810		}
1811
1812		leaf_no = be64_to_cpu(lp[lp_offset]);
1813		if (leaf_no) {
1814			error = get_leaf(dip, leaf_no, &bh);
1815			if (error)
1816				goto out;
1817			leaf = (struct gfs2_leaf *)bh->b_data;
1818			len = 1 << (dip->i_depth - be16_to_cpu(leaf->lf_depth));
1819			brelse(bh);
1820
1821			error = lc(dip, index, len, leaf_no, data);
1822			if (error)
1823				goto out;
1824
1825			index = (index & ~(len - 1)) + len;
1826		} else
1827			index++;
1828	}
1829
1830	if (index != hsize) {
1831		gfs2_consist_inode(dip);
1832		error = -EIO;
1833	}
1834
1835out:
1836	kfree(lp);
1837
1838	return error;
1839}
1840
1841/**
1842 * leaf_dealloc - Deallocate a directory leaf
1843 * @dip: the directory
1844 * @index: the hash table offset in the directory
1845 * @len: the number of pointers to this leaf
1846 * @leaf_no: the leaf number
1847 * @data: not used
1848 *
1849 * Returns: errno
1850 */
1851
1852static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
1853			u64 leaf_no, void *data)
1854{
1855	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1856	struct gfs2_leaf *tmp_leaf;
1857	struct gfs2_rgrp_list rlist;
1858	struct buffer_head *bh, *dibh;
1859	u64 blk, nblk;
1860	unsigned int rg_blocks = 0, l_blocks = 0;
1861	char *ht;
1862	unsigned int x, size = len * sizeof(u64);
1863	int error;
1864
1865	memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
1866
1867	ht = kzalloc(size, GFP_NOFS);
1868	if (!ht)
1869		return -ENOMEM;
1870
1871	if (!gfs2_alloc_get(dip)) {
1872		error = -ENOMEM;
1873		goto out;
1874	}
1875
1876	error = gfs2_quota_hold(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
1877	if (error)
1878		goto out_put;
1879
1880	error = gfs2_rindex_hold(sdp, &dip->i_alloc->al_ri_gh);
1881	if (error)
1882		goto out_qs;
1883
1884	/*  Count the number of leaves  */
1885
1886	for (blk = leaf_no; blk; blk = nblk) {
1887		error = get_leaf(dip, blk, &bh);
1888		if (error)
1889			goto out_rlist;
1890		tmp_leaf = (struct gfs2_leaf *)bh->b_data;
1891		nblk = be64_to_cpu(tmp_leaf->lf_next);
1892		brelse(bh);
1893
1894		gfs2_rlist_add(sdp, &rlist, blk);
1895		l_blocks++;
1896	}
1897
1898	gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);
1899
1900	for (x = 0; x < rlist.rl_rgrps; x++) {
1901		struct gfs2_rgrpd *rgd;
1902		rgd = rlist.rl_ghs[x].gh_gl->gl_object;
1903		rg_blocks += rgd->rd_length;
1904	}
1905
1906	error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
1907	if (error)
1908		goto out_rlist;
1909
1910	error = gfs2_trans_begin(sdp,
1911			rg_blocks + (DIV_ROUND_UP(size, sdp->sd_jbsize) + 1) +
1912			RES_DINODE + RES_STATFS + RES_QUOTA, l_blocks);
1913	if (error)
1914		goto out_rg_gunlock;
1915
1916	for (blk = leaf_no; blk; blk = nblk) {
1917		error = get_leaf(dip, blk, &bh);
1918		if (error)
1919			goto out_end_trans;
1920		tmp_leaf = (struct gfs2_leaf *)bh->b_data;
1921		nblk = be64_to_cpu(tmp_leaf->lf_next);
1922		brelse(bh);
1923
1924		gfs2_free_meta(dip, blk, 1);
1925		gfs2_add_inode_blocks(&dip->i_inode, -1);
1926	}
1927
1928	error = gfs2_dir_write_data(dip, ht, index * sizeof(u64), size);
1929	if (error != size) {
1930		if (error >= 0)
1931			error = -EIO;
1932		goto out_end_trans;
1933	}
1934
1935	error = gfs2_meta_inode_buffer(dip, &dibh);
1936	if (error)
1937		goto out_end_trans;
1938
1939	gfs2_trans_add_bh(dip->i_gl, dibh, 1);
1940	gfs2_dinode_out(dip, dibh->b_data);
1941	brelse(dibh);
1942
1943out_end_trans:
1944	gfs2_trans_end(sdp);
1945out_rg_gunlock:
1946	gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
1947out_rlist:
1948	gfs2_rlist_free(&rlist);
1949	gfs2_glock_dq_uninit(&dip->i_alloc->al_ri_gh);
1950out_qs:
1951	gfs2_quota_unhold(dip);
1952out_put:
1953	gfs2_alloc_put(dip);
1954out:
1955	kfree(ht);
1956	return error;
1957}
1958
1959/**
1960 * gfs2_dir_exhash_dealloc - free all the leaf blocks in a directory
1961 * @dip: the directory
1962 *
1963 * Dealloc all on-disk directory leaves to FREEMETA state
1964 * Change on-disk inode type to "regular file"
1965 *
1966 * Returns: errno
1967 */
1968
1969int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip)
1970{
1971	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1972	struct buffer_head *bh;
1973	int error;
1974
1975	/* Dealloc on-disk leaves to FREEMETA state */
1976	error = foreach_leaf(dip, leaf_dealloc, NULL);
1977	if (error)
1978		return error;
1979
1980	/* Make this a regular file in case we crash.
1981	   (We don't want to free these blocks a second time.)  */
1982
1983	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1984	if (error)
1985		return error;
1986
1987	error = gfs2_meta_inode_buffer(dip, &bh);
1988	if (!error) {
1989		gfs2_trans_add_bh(dip->i_gl, bh, 1);
1990		((struct gfs2_dinode *)bh->b_data)->di_mode =
1991						cpu_to_be32(S_IFREG);
1992		brelse(bh);
1993	}
1994
1995	gfs2_trans_end(sdp);
1996
1997	return error;
1998}
1999
2000/**
2001 * gfs2_diradd_alloc_required - find if adding entry will require an allocation
2002 * @ip: the file being written to
2003 * @filname: the filename that's going to be added
2004 *
2005 * Returns: 1 if alloc required, 0 if not, -ve on error
2006 */
2007
2008int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name)
2009{
2010	struct gfs2_dirent *dent;
2011	struct buffer_head *bh;
2012
2013	dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh);
2014	if (!dent) {
2015		return 1;
2016	}
2017	if (IS_ERR(dent))
2018		return PTR_ERR(dent);
2019	brelse(bh);
2020	return 0;
2021}
2022