ufs_dirhash.c revision 184651
1/*-
2 * Copyright (c) 2001, 2002 Ian Dowse.  All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 *    notice, this list of conditions and the following disclaimer in the
11 *    documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
24 */
25
26/*
27 * This implements a hash-based lookup scheme for UFS directories.
28 */
29
30#include <sys/cdefs.h>
31__FBSDID("$FreeBSD: head/sys/ufs/ufs/ufs_dirhash.c 184651 2008-11-04 18:56:12Z jhb $");
32
33#include "opt_ufs.h"
34
35#ifdef UFS_DIRHASH
36
37#include <sys/param.h>
38#include <sys/systm.h>
39#include <sys/kernel.h>
40#include <sys/lock.h>
41#include <sys/mutex.h>
42#include <sys/malloc.h>
43#include <sys/fnv_hash.h>
44#include <sys/proc.h>
45#include <sys/bio.h>
46#include <sys/buf.h>
47#include <sys/vnode.h>
48#include <sys/mount.h>
49#include <sys/refcount.h>
50#include <sys/sysctl.h>
51#include <sys/sx.h>
52#include <vm/uma.h>
53
54#include <ufs/ufs/quota.h>
55#include <ufs/ufs/inode.h>
56#include <ufs/ufs/dir.h>
57#include <ufs/ufs/dirhash.h>
58#include <ufs/ufs/extattr.h>
59#include <ufs/ufs/ufsmount.h>
60#include <ufs/ufs/ufs_extern.h>
61
62#define WRAPINCR(val, limit)	(((val) + 1 == (limit)) ? 0 : ((val) + 1))
63#define WRAPDECR(val, limit)	(((val) == 0) ? ((limit) - 1) : ((val) - 1))
64#define OFSFMT(vp)		((vp)->v_mount->mnt_maxsymlinklen <= 0)
65#define BLKFREE2IDX(n)		((n) > DH_NFSTATS ? DH_NFSTATS : (n))
66
67static MALLOC_DEFINE(M_DIRHASH, "ufs_dirhash", "UFS directory hash tables");
68
69static SYSCTL_NODE(_vfs, OID_AUTO, ufs, CTLFLAG_RD, 0, "UFS filesystem");
70
71static int ufs_mindirhashsize = DIRBLKSIZ * 5;
72SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_minsize, CTLFLAG_RW,
73    &ufs_mindirhashsize,
74    0, "minimum directory size in bytes for which to use hashed lookup");
75static int ufs_dirhashmaxmem = 2 * 1024 * 1024;
76SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_maxmem, CTLFLAG_RW, &ufs_dirhashmaxmem,
77    0, "maximum allowed dirhash memory usage");
78static int ufs_dirhashmem;
79SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_mem, CTLFLAG_RD, &ufs_dirhashmem,
80    0, "current dirhash memory usage");
81static int ufs_dirhashcheck = 0;
82SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_docheck, CTLFLAG_RW, &ufs_dirhashcheck,
83    0, "enable extra sanity tests");
84
85
86static int ufsdirhash_hash(struct dirhash *dh, char *name, int namelen);
87static void ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff);
88static void ufsdirhash_delslot(struct dirhash *dh, int slot);
89static int ufsdirhash_findslot(struct dirhash *dh, char *name, int namelen,
90	   doff_t offset);
91static doff_t ufsdirhash_getprev(struct direct *dp, doff_t offset);
92static int ufsdirhash_recycle(int wanted);
93static void ufsdirhash_free_locked(struct inode *ip);
94
95static uma_zone_t	ufsdirhash_zone;
96
97#define DIRHASHLIST_LOCK() 		mtx_lock(&ufsdirhash_mtx)
98#define DIRHASHLIST_UNLOCK() 		mtx_unlock(&ufsdirhash_mtx)
99#define DIRHASH_BLKALLOC_WAITOK() 	uma_zalloc(ufsdirhash_zone, M_WAITOK)
100#define DIRHASH_BLKFREE(ptr) 		uma_zfree(ufsdirhash_zone, (ptr))
101#define	DIRHASH_ASSERT_LOCKED(dh)					\
102    sx_assert(&(dh)->dh_lock, SA_LOCKED)
103
104/* Dirhash list; recently-used entries are near the tail. */
105static TAILQ_HEAD(, dirhash) ufsdirhash_list;
106
107/* Protects: ufsdirhash_list, `dh_list' field, ufs_dirhashmem. */
108static struct mtx	ufsdirhash_mtx;
109
110/*
111 * Locking:
112 *
113 * The relationship between inode and dirhash is protected either by an
114 * exclusive vnode lock or the vnode interlock where a shared vnode lock
115 * may be used.  The dirhash_mtx is acquired after the dirhash lock.  To
116 * handle teardown races, code wishing to lock the dirhash for an inode
117 * when using a shared vnode lock must obtain a private reference on the
118 * dirhash while holding the vnode interlock.  They can drop it once they
119 * have obtained the dirhash lock and verified that the dirhash wasn't
120 * recycled while they waited for the dirhash lock.
121 *
122 * ufsdirhash_build() acquires a shared lock on the dirhash when it is
123 * successful.  This lock is released after a call to ufsdirhash_lookup().
124 *
125 * Functions requiring exclusive access use ufsdirhash_acquire() which may
126 * free a dirhash structure that was recycled by ufsdirhash_recycle().
127 *
128 * The dirhash lock may be held across io operations.
129 */
130
131static void
132ufsdirhash_hold(struct dirhash *dh)
133{
134
135	refcount_acquire(&dh->dh_refcount);
136}
137
138static void
139ufsdirhash_drop(struct dirhash *dh)
140{
141
142	if (refcount_release(&dh->dh_refcount)) {
143		sx_destroy(&dh->dh_lock);
144		free(dh, M_DIRHASH);
145	}
146}
147
148/*
149 * Release the lock on a dirhash.
150 */
151static void
152ufsdirhash_release(struct dirhash *dh)
153{
154
155	sx_unlock(&dh->dh_lock);
156}
157
158/*
159 * Either acquire an existing hash locked shared or create a new hash and
160 * return it exclusively locked.  May return NULL if the allocation fails.
161 *
162 * The vnode interlock is used to protect the i_dirhash pointer from
163 * simultaneous access while only a shared vnode lock is held.
164 */
165static struct dirhash *
166ufsdirhash_create(struct inode *ip)
167{
168	struct dirhash *ndh;
169	struct dirhash *dh;
170	struct vnode *vp;
171	int error;
172
173	error = 0;
174	ndh = dh = NULL;
175	vp = ip->i_vnode;
176	for (;;) {
177		/* Racy check for i_dirhash to prefetch an dirhash structure. */
178		if (ip->i_dirhash == NULL && ndh == NULL) {
179			ndh = malloc(sizeof *dh, M_DIRHASH,
180			    M_NOWAIT | M_ZERO);
181			if (ndh == NULL)
182				return (NULL);
183			refcount_init(&ndh->dh_refcount, 1);
184
185			/*
186			 * The DUPOK is to prevent warnings from the
187			 * sx_slock() a few lines down which is safe
188			 * since the duplicate lock in that case is
189			 * the one for this dirhash we are creating
190			 * now which has no external references until
191			 * after this function returns.
192			 */
193			sx_init_flags(&ndh->dh_lock, "dirhash", SX_DUPOK);
194			sx_xlock(&ndh->dh_lock);
195		}
196		/*
197		 * Check i_dirhash.  If it's NULL just try to use a
198		 * preallocated structure.  If none exists loop and try again.
199		 */
200		VI_LOCK(vp);
201		dh = ip->i_dirhash;
202		if (dh == NULL) {
203			ip->i_dirhash = ndh;
204			VI_UNLOCK(vp);
205			if (ndh == NULL)
206				continue;
207			return (ndh);
208		}
209		ufsdirhash_hold(dh);
210		VI_UNLOCK(vp);
211
212		/* Acquire a shared lock on existing hashes. */
213		sx_slock(&dh->dh_lock);
214
215		/* The hash could've been recycled while we were waiting. */
216		VI_LOCK(vp);
217		if (ip->i_dirhash != dh) {
218			VI_UNLOCK(vp);
219			ufsdirhash_release(dh);
220			ufsdirhash_drop(dh);
221			continue;
222		}
223		VI_UNLOCK(vp);
224		ufsdirhash_drop(dh);
225
226		/* If the hash is still valid we've succeeded. */
227		if (dh->dh_hash != NULL)
228			break;
229		/*
230		 * If the hash is NULL it has been recycled.  Try to upgrade
231		 * so we can recreate it.  If we fail the upgrade, drop our
232		 * lock and try again.
233		 */
234		if (sx_try_upgrade(&dh->dh_lock))
235			break;
236		sx_sunlock(&dh->dh_lock);
237	}
238	/* Free the preallocated structure if it was not necessary. */
239	if (ndh) {
240		ufsdirhash_release(ndh);
241		ufsdirhash_drop(ndh);
242	}
243	return (dh);
244}
245
246/*
247 * Acquire an exclusive lock on an existing hash.  Requires an exclusive
248 * vnode lock to protect the i_dirhash pointer.  hashes that have been
249 * recycled are reclaimed here and NULL is returned.
250 */
251static struct dirhash *
252ufsdirhash_acquire(struct inode *ip)
253{
254	struct dirhash *dh;
255	struct vnode *vp;
256
257	ASSERT_VOP_ELOCKED(ip->i_vnode, __FUNCTION__);
258
259	vp = ip->i_vnode;
260	dh = ip->i_dirhash;
261	if (dh == NULL)
262		return (NULL);
263	sx_xlock(&dh->dh_lock);
264	if (dh->dh_hash != NULL)
265		return (dh);
266	ufsdirhash_free_locked(ip);
267	return (NULL);
268}
269
270/*
271 * Acquire exclusively and free the hash pointed to by ip.  Works with a
272 * shared or exclusive vnode lock.
273 */
274void
275ufsdirhash_free(struct inode *ip)
276{
277	struct dirhash *dh;
278	struct vnode *vp;
279
280	vp = ip->i_vnode;
281	for (;;) {
282		/* Grab a reference on this inode's dirhash if it has one. */
283		VI_LOCK(vp);
284		dh = ip->i_dirhash;
285		if (dh == NULL) {
286			VI_UNLOCK(vp);
287			return;
288		}
289		ufsdirhash_hold(dh);
290		VI_UNLOCK(vp);
291
292		/* Exclusively lock the dirhash. */
293		sx_xlock(&dh->dh_lock);
294
295		/* If this dirhash still belongs to this inode, then free it. */
296		VI_LOCK(vp);
297		if (ip->i_dirhash == dh) {
298			VI_UNLOCK(vp);
299			ufsdirhash_drop(dh);
300			break;
301		}
302		VI_UNLOCK(vp);
303
304		/*
305		 * This inode's dirhash has changed while we were
306		 * waiting for the dirhash lock, so try again.
307		 */
308		ufsdirhash_release(dh);
309		ufsdirhash_drop(dh);
310	}
311	ufsdirhash_free_locked(ip);
312}
313
314/*
315 * Attempt to build up a hash table for the directory contents in
316 * inode 'ip'. Returns 0 on success, or -1 of the operation failed.
317 */
318int
319ufsdirhash_build(struct inode *ip)
320{
321	struct dirhash *dh;
322	struct buf *bp = NULL;
323	struct direct *ep;
324	struct vnode *vp;
325	doff_t bmask, pos;
326	int dirblocks, i, j, memreqd, nblocks, narrays, nslots, slot;
327
328	/* Take care of a decreased sysctl value. */
329	while (ufs_dirhashmem > ufs_dirhashmaxmem)
330		if (ufsdirhash_recycle(0) != 0)
331			return (-1);
332
333	/* Check if we can/should use dirhash. */
334	if (ip->i_size < ufs_mindirhashsize || OFSFMT(ip->i_vnode) ||
335	    ip->i_effnlink == 0) {
336		if (ip->i_dirhash)
337			ufsdirhash_free(ip);
338		return (-1);
339	}
340	dh = ufsdirhash_create(ip);
341	if (dh == NULL)
342		return (-1);
343	if (dh->dh_hash != NULL)
344		return (0);
345
346	vp = ip->i_vnode;
347	/* Allocate 50% more entries than this dir size could ever need. */
348	KASSERT(ip->i_size >= DIRBLKSIZ, ("ufsdirhash_build size"));
349	nslots = ip->i_size / DIRECTSIZ(1);
350	nslots = (nslots * 3 + 1) / 2;
351	narrays = howmany(nslots, DH_NBLKOFF);
352	nslots = narrays * DH_NBLKOFF;
353	dirblocks = howmany(ip->i_size, DIRBLKSIZ);
354	nblocks = (dirblocks * 3 + 1) / 2;
355	memreqd = sizeof(*dh) + narrays * sizeof(*dh->dh_hash) +
356	    narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
357	    nblocks * sizeof(*dh->dh_blkfree);
358	DIRHASHLIST_LOCK();
359	if (memreqd + ufs_dirhashmem > ufs_dirhashmaxmem) {
360		DIRHASHLIST_UNLOCK();
361		if (memreqd > ufs_dirhashmaxmem / 2)
362			goto fail;
363		/* Try to free some space. */
364		if (ufsdirhash_recycle(memreqd) != 0)
365			goto fail;
366		/* Enough was freed, and list has been locked. */
367	}
368	ufs_dirhashmem += memreqd;
369	DIRHASHLIST_UNLOCK();
370
371	/* Initialise the hash table and block statistics. */
372	dh->dh_memreq = memreqd;
373	dh->dh_narrays = narrays;
374	dh->dh_hlen = nslots;
375	dh->dh_nblk = nblocks;
376	dh->dh_dirblks = dirblocks;
377	for (i = 0; i < DH_NFSTATS; i++)
378		dh->dh_firstfree[i] = -1;
379	dh->dh_firstfree[DH_NFSTATS] = 0;
380	dh->dh_hused = 0;
381	dh->dh_seqopt = 0;
382	dh->dh_seqoff = 0;
383	dh->dh_score = DH_SCOREINIT;
384
385	/*
386	 * Use non-blocking mallocs so that we will revert to a linear
387	 * lookup on failure rather than potentially blocking forever.
388	 */
389	dh->dh_hash = malloc(narrays * sizeof(dh->dh_hash[0]),
390	    M_DIRHASH, M_NOWAIT | M_ZERO);
391	if (dh->dh_hash == NULL)
392		goto fail;
393	dh->dh_blkfree = malloc(nblocks * sizeof(dh->dh_blkfree[0]),
394	    M_DIRHASH, M_NOWAIT);
395	if (dh->dh_blkfree == NULL)
396		goto fail;
397	for (i = 0; i < narrays; i++) {
398		if ((dh->dh_hash[i] = DIRHASH_BLKALLOC_WAITOK()) == NULL)
399			goto fail;
400		for (j = 0; j < DH_NBLKOFF; j++)
401			dh->dh_hash[i][j] = DIRHASH_EMPTY;
402	}
403	for (i = 0; i < dirblocks; i++)
404		dh->dh_blkfree[i] = DIRBLKSIZ / DIRALIGN;
405	bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
406	pos = 0;
407	while (pos < ip->i_size) {
408		/* If necessary, get the next directory block. */
409		if ((pos & bmask) == 0) {
410			if (bp != NULL)
411				brelse(bp);
412			if (UFS_BLKATOFF(vp, (off_t)pos, NULL, &bp) != 0)
413				goto fail;
414		}
415
416		/* Add this entry to the hash. */
417		ep = (struct direct *)((char *)bp->b_data + (pos & bmask));
418		if (ep->d_reclen == 0 || ep->d_reclen >
419		    DIRBLKSIZ - (pos & (DIRBLKSIZ - 1))) {
420			/* Corrupted directory. */
421			brelse(bp);
422			goto fail;
423		}
424		if (ep->d_ino != 0) {
425			/* Add the entry (simplified ufsdirhash_add). */
426			slot = ufsdirhash_hash(dh, ep->d_name, ep->d_namlen);
427			while (DH_ENTRY(dh, slot) != DIRHASH_EMPTY)
428				slot = WRAPINCR(slot, dh->dh_hlen);
429			dh->dh_hused++;
430			DH_ENTRY(dh, slot) = pos;
431			ufsdirhash_adjfree(dh, pos, -DIRSIZ(0, ep));
432		}
433		pos += ep->d_reclen;
434	}
435
436	if (bp != NULL)
437		brelse(bp);
438	DIRHASHLIST_LOCK();
439	TAILQ_INSERT_TAIL(&ufsdirhash_list, dh, dh_list);
440	dh->dh_onlist = 1;
441	DIRHASHLIST_UNLOCK();
442	sx_downgrade(&dh->dh_lock);
443	return (0);
444
445fail:
446	ufsdirhash_free_locked(ip);
447	return (-1);
448}
449
450/*
451 * Free any hash table associated with inode 'ip'.
452 */
453static void
454ufsdirhash_free_locked(struct inode *ip)
455{
456	struct dirhash *dh;
457	struct vnode *vp;
458	int i;
459
460	DIRHASH_ASSERT_LOCKED(ip->i_dirhash);
461
462	/*
463	 * Clear the pointer in the inode to prevent new threads from
464	 * finding the dead structure.
465	 */
466	vp = ip->i_vnode;
467	VI_LOCK(vp);
468	dh = ip->i_dirhash;
469	ip->i_dirhash = NULL;
470	VI_UNLOCK(vp);
471
472	/*
473	 * Remove the hash from the list since we are going to free its
474	 * memory.
475	 */
476	DIRHASHLIST_LOCK();
477	if (dh->dh_onlist)
478		TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
479	ufs_dirhashmem -= dh->dh_memreq;
480	DIRHASHLIST_UNLOCK();
481
482	/*
483	 * At this point, any waiters for the lock should hold their
484	 * own reference on the dirhash structure.  They will drop
485	 * that reference once they grab the vnode interlock and see
486	 * that ip->i_dirhash is NULL.
487	 */
488	sx_xunlock(&dh->dh_lock);
489
490	/*
491	 * Handle partially recycled as well as fully constructed hashes.
492	 */
493	if (dh->dh_hash != NULL) {
494		for (i = 0; i < dh->dh_narrays; i++)
495			if (dh->dh_hash[i] != NULL)
496				DIRHASH_BLKFREE(dh->dh_hash[i]);
497		free(dh->dh_hash, M_DIRHASH);
498		if (dh->dh_blkfree != NULL)
499			free(dh->dh_blkfree, M_DIRHASH);
500	}
501
502	/*
503	 * Drop the inode's reference to the data structure.
504	 */
505	ufsdirhash_drop(dh);
506}
507
508/*
509 * Find the offset of the specified name within the given inode.
510 * Returns 0 on success, ENOENT if the entry does not exist, or
511 * EJUSTRETURN if the caller should revert to a linear search.
512 *
513 * If successful, the directory offset is stored in *offp, and a
514 * pointer to a struct buf containing the entry is stored in *bpp. If
515 * prevoffp is non-NULL, the offset of the previous entry within
516 * the DIRBLKSIZ-sized block is stored in *prevoffp (if the entry
517 * is the first in a block, the start of the block is used).
518 *
519 * Must be called with the hash locked.  Returns with the hash unlocked.
520 */
521int
522ufsdirhash_lookup(struct inode *ip, char *name, int namelen, doff_t *offp,
523    struct buf **bpp, doff_t *prevoffp)
524{
525	struct dirhash *dh, *dh_next;
526	struct direct *dp;
527	struct vnode *vp;
528	struct buf *bp;
529	doff_t blkoff, bmask, offset, prevoff;
530	int i, slot;
531	int error;
532
533	dh = ip->i_dirhash;
534	KASSERT(dh != NULL && dh->dh_hash != NULL,
535	    ("ufsdirhash_lookup: Invalid dirhash %p\n", dh));
536	DIRHASH_ASSERT_LOCKED(dh);
537	/*
538	 * Move this dirhash towards the end of the list if it has a
539	 * score higher than the next entry, and acquire the dh_lock.
540	 */
541	DIRHASHLIST_LOCK();
542	if (TAILQ_NEXT(dh, dh_list) != NULL) {
543		/*
544		 * If the new score will be greater than that of the next
545		 * entry, then move this entry past it. With both mutexes
546		 * held, dh_next won't go away, but its dh_score could
547		 * change; that's not important since it is just a hint.
548		 */
549		if ((dh_next = TAILQ_NEXT(dh, dh_list)) != NULL &&
550		    dh->dh_score >= dh_next->dh_score) {
551			KASSERT(dh->dh_onlist, ("dirhash: not on list"));
552			TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
553			TAILQ_INSERT_AFTER(&ufsdirhash_list, dh_next, dh,
554			    dh_list);
555		}
556	}
557	/* Update the score. */
558	if (dh->dh_score < DH_SCOREMAX)
559		dh->dh_score++;
560	DIRHASHLIST_UNLOCK();
561
562	vp = ip->i_vnode;
563	bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
564	blkoff = -1;
565	bp = NULL;
566restart:
567	slot = ufsdirhash_hash(dh, name, namelen);
568
569	if (dh->dh_seqopt) {
570		/*
571		 * Sequential access optimisation. dh_seqoff contains the
572		 * offset of the directory entry immediately following
573		 * the last entry that was looked up. Check if this offset
574		 * appears in the hash chain for the name we are looking for.
575		 */
576		for (i = slot; (offset = DH_ENTRY(dh, i)) != DIRHASH_EMPTY;
577		    i = WRAPINCR(i, dh->dh_hlen))
578			if (offset == dh->dh_seqoff)
579				break;
580		if (offset == dh->dh_seqoff) {
581			/*
582			 * We found an entry with the expected offset. This
583			 * is probably the entry we want, but if not, the
584			 * code below will turn off seqopt and retry.
585			 */
586			slot = i;
587		} else
588			dh->dh_seqopt = 0;
589	}
590
591	for (; (offset = DH_ENTRY(dh, slot)) != DIRHASH_EMPTY;
592	    slot = WRAPINCR(slot, dh->dh_hlen)) {
593		if (offset == DIRHASH_DEL)
594			continue;
595		if (offset < 0 || offset >= ip->i_size)
596			panic("ufsdirhash_lookup: bad offset in hash array");
597		if ((offset & ~bmask) != blkoff) {
598			if (bp != NULL)
599				brelse(bp);
600			blkoff = offset & ~bmask;
601			if (UFS_BLKATOFF(vp, (off_t)blkoff, NULL, &bp) != 0) {
602				error = EJUSTRETURN;
603				goto fail;
604			}
605		}
606		dp = (struct direct *)(bp->b_data + (offset & bmask));
607		if (dp->d_reclen == 0 || dp->d_reclen >
608		    DIRBLKSIZ - (offset & (DIRBLKSIZ - 1))) {
609			/* Corrupted directory. */
610			error = EJUSTRETURN;
611			goto fail;
612		}
613		if (dp->d_namlen == namelen &&
614		    bcmp(dp->d_name, name, namelen) == 0) {
615			/* Found. Get the prev offset if needed. */
616			if (prevoffp != NULL) {
617				if (offset & (DIRBLKSIZ - 1)) {
618					prevoff = ufsdirhash_getprev(dp,
619					    offset);
620					if (prevoff == -1) {
621						error = EJUSTRETURN;
622						goto fail;
623					}
624				} else
625					prevoff = offset;
626				*prevoffp = prevoff;
627			}
628
629			/* Check for sequential access, and update offset. */
630			if (dh->dh_seqopt == 0 && dh->dh_seqoff == offset)
631				dh->dh_seqopt = 1;
632			dh->dh_seqoff = offset + DIRSIZ(0, dp);
633			*bpp = bp;
634			*offp = offset;
635			ufsdirhash_release(dh);
636			return (0);
637		}
638
639		/*
640		 * When the name doesn't match in the seqopt case, go back
641		 * and search normally.
642		 */
643		if (dh->dh_seqopt) {
644			dh->dh_seqopt = 0;
645			goto restart;
646		}
647	}
648	error = ENOENT;
649fail:
650	ufsdirhash_release(dh);
651	if (bp != NULL)
652		brelse(bp);
653	return (error);
654}
655
656/*
657 * Find a directory block with room for 'slotneeded' bytes. Returns
658 * the offset of the directory entry that begins the free space.
659 * This will either be the offset of an existing entry that has free
660 * space at the end, or the offset of an entry with d_ino == 0 at
661 * the start of a DIRBLKSIZ block.
662 *
663 * To use the space, the caller may need to compact existing entries in
664 * the directory. The total number of bytes in all of the entries involved
665 * in the compaction is stored in *slotsize. In other words, all of
666 * the entries that must be compacted are exactly contained in the
667 * region beginning at the returned offset and spanning *slotsize bytes.
668 *
669 * Returns -1 if no space was found, indicating that the directory
670 * must be extended.
671 */
672doff_t
673ufsdirhash_findfree(struct inode *ip, int slotneeded, int *slotsize)
674{
675	struct direct *dp;
676	struct dirhash *dh;
677	struct buf *bp;
678	doff_t pos, slotstart;
679	int dirblock, error, freebytes, i;
680
681	dh = ip->i_dirhash;
682	KASSERT(dh != NULL && dh->dh_hash != NULL,
683	    ("ufsdirhash_findfree: Invalid dirhash %p\n", dh));
684	DIRHASH_ASSERT_LOCKED(dh);
685
686	/* Find a directory block with the desired free space. */
687	dirblock = -1;
688	for (i = howmany(slotneeded, DIRALIGN); i <= DH_NFSTATS; i++)
689		if ((dirblock = dh->dh_firstfree[i]) != -1)
690			break;
691	if (dirblock == -1)
692		return (-1);
693
694	KASSERT(dirblock < dh->dh_nblk &&
695	    dh->dh_blkfree[dirblock] >= howmany(slotneeded, DIRALIGN),
696	    ("ufsdirhash_findfree: bad stats"));
697	pos = dirblock * DIRBLKSIZ;
698	error = UFS_BLKATOFF(ip->i_vnode, (off_t)pos, (char **)&dp, &bp);
699	if (error)
700		return (-1);
701
702	/* Find the first entry with free space. */
703	for (i = 0; i < DIRBLKSIZ; ) {
704		if (dp->d_reclen == 0) {
705			brelse(bp);
706			return (-1);
707		}
708		if (dp->d_ino == 0 || dp->d_reclen > DIRSIZ(0, dp))
709			break;
710		i += dp->d_reclen;
711		dp = (struct direct *)((char *)dp + dp->d_reclen);
712	}
713	if (i > DIRBLKSIZ) {
714		brelse(bp);
715		return (-1);
716	}
717	slotstart = pos + i;
718
719	/* Find the range of entries needed to get enough space */
720	freebytes = 0;
721	while (i < DIRBLKSIZ && freebytes < slotneeded) {
722		freebytes += dp->d_reclen;
723		if (dp->d_ino != 0)
724			freebytes -= DIRSIZ(0, dp);
725		if (dp->d_reclen == 0) {
726			brelse(bp);
727			return (-1);
728		}
729		i += dp->d_reclen;
730		dp = (struct direct *)((char *)dp + dp->d_reclen);
731	}
732	if (i > DIRBLKSIZ) {
733		brelse(bp);
734		return (-1);
735	}
736	if (freebytes < slotneeded)
737		panic("ufsdirhash_findfree: free mismatch");
738	brelse(bp);
739	*slotsize = pos + i - slotstart;
740	return (slotstart);
741}
742
743/*
744 * Return the start of the unused space at the end of a directory, or
745 * -1 if there are no trailing unused blocks.
746 */
747doff_t
748ufsdirhash_enduseful(struct inode *ip)
749{
750
751	struct dirhash *dh;
752	int i;
753
754	dh = ip->i_dirhash;
755	DIRHASH_ASSERT_LOCKED(dh);
756	KASSERT(dh != NULL && dh->dh_hash != NULL,
757	    ("ufsdirhash_enduseful: Invalid dirhash %p\n", dh));
758
759	if (dh->dh_blkfree[dh->dh_dirblks - 1] != DIRBLKSIZ / DIRALIGN)
760		return (-1);
761
762	for (i = dh->dh_dirblks - 1; i >= 0; i--)
763		if (dh->dh_blkfree[i] != DIRBLKSIZ / DIRALIGN)
764			break;
765
766	return ((doff_t)(i + 1) * DIRBLKSIZ);
767}
768
769/*
770 * Insert information into the hash about a new directory entry. dirp
771 * points to a struct direct containing the entry, and offset specifies
772 * the offset of this entry.
773 */
774void
775ufsdirhash_add(struct inode *ip, struct direct *dirp, doff_t offset)
776{
777	struct dirhash *dh;
778	int slot;
779
780	if ((dh = ufsdirhash_acquire(ip)) == NULL)
781		return;
782
783	KASSERT(offset < dh->dh_dirblks * DIRBLKSIZ,
784	    ("ufsdirhash_add: bad offset"));
785	/*
786	 * Normal hash usage is < 66%. If the usage gets too high then
787	 * remove the hash entirely and let it be rebuilt later.
788	 */
789	if (dh->dh_hused >= (dh->dh_hlen * 3) / 4) {
790		ufsdirhash_free_locked(ip);
791		return;
792	}
793
794	/* Find a free hash slot (empty or deleted), and add the entry. */
795	slot = ufsdirhash_hash(dh, dirp->d_name, dirp->d_namlen);
796	while (DH_ENTRY(dh, slot) >= 0)
797		slot = WRAPINCR(slot, dh->dh_hlen);
798	if (DH_ENTRY(dh, slot) == DIRHASH_EMPTY)
799		dh->dh_hused++;
800	DH_ENTRY(dh, slot) = offset;
801
802	/* Update the per-block summary info. */
803	ufsdirhash_adjfree(dh, offset, -DIRSIZ(0, dirp));
804	ufsdirhash_release(dh);
805}
806
807/*
808 * Remove the specified directory entry from the hash. The entry to remove
809 * is defined by the name in `dirp', which must exist at the specified
810 * `offset' within the directory.
811 */
812void
813ufsdirhash_remove(struct inode *ip, struct direct *dirp, doff_t offset)
814{
815	struct dirhash *dh;
816	int slot;
817
818	if ((dh = ufsdirhash_acquire(ip)) == NULL)
819		return;
820
821	KASSERT(offset < dh->dh_dirblks * DIRBLKSIZ,
822	    ("ufsdirhash_remove: bad offset"));
823	/* Find the entry */
824	slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, offset);
825
826	/* Remove the hash entry. */
827	ufsdirhash_delslot(dh, slot);
828
829	/* Update the per-block summary info. */
830	ufsdirhash_adjfree(dh, offset, DIRSIZ(0, dirp));
831	ufsdirhash_release(dh);
832}
833
834/*
835 * Change the offset associated with a directory entry in the hash. Used
836 * when compacting directory blocks.
837 */
838void
839ufsdirhash_move(struct inode *ip, struct direct *dirp, doff_t oldoff,
840    doff_t newoff)
841{
842	struct dirhash *dh;
843	int slot;
844
845	if ((dh = ufsdirhash_acquire(ip)) == NULL)
846		return;
847
848	KASSERT(oldoff < dh->dh_dirblks * DIRBLKSIZ &&
849	    newoff < dh->dh_dirblks * DIRBLKSIZ,
850	    ("ufsdirhash_move: bad offset"));
851	/* Find the entry, and update the offset. */
852	slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, oldoff);
853	DH_ENTRY(dh, slot) = newoff;
854	ufsdirhash_release(dh);
855}
856
857/*
858 * Inform dirhash that the directory has grown by one block that
859 * begins at offset (i.e. the new length is offset + DIRBLKSIZ).
860 */
861void
862ufsdirhash_newblk(struct inode *ip, doff_t offset)
863{
864	struct dirhash *dh;
865	int block;
866
867	if ((dh = ufsdirhash_acquire(ip)) == NULL)
868		return;
869
870	KASSERT(offset == dh->dh_dirblks * DIRBLKSIZ,
871	    ("ufsdirhash_newblk: bad offset"));
872	block = offset / DIRBLKSIZ;
873	if (block >= dh->dh_nblk) {
874		/* Out of space; must rebuild. */
875		ufsdirhash_free_locked(ip);
876		return;
877	}
878	dh->dh_dirblks = block + 1;
879
880	/* Account for the new free block. */
881	dh->dh_blkfree[block] = DIRBLKSIZ / DIRALIGN;
882	if (dh->dh_firstfree[DH_NFSTATS] == -1)
883		dh->dh_firstfree[DH_NFSTATS] = block;
884	ufsdirhash_release(dh);
885}
886
887/*
888 * Inform dirhash that the directory is being truncated.
889 */
890void
891ufsdirhash_dirtrunc(struct inode *ip, doff_t offset)
892{
893	struct dirhash *dh;
894	int block, i;
895
896	if ((dh = ufsdirhash_acquire(ip)) == NULL)
897		return;
898
899	KASSERT(offset <= dh->dh_dirblks * DIRBLKSIZ,
900	    ("ufsdirhash_dirtrunc: bad offset"));
901	block = howmany(offset, DIRBLKSIZ);
902	/*
903	 * If the directory shrinks to less than 1/8 of dh_nblk blocks
904	 * (about 20% of its original size due to the 50% extra added in
905	 * ufsdirhash_build) then free it, and let the caller rebuild
906	 * if necessary.
907	 */
908	if (block < dh->dh_nblk / 8 && dh->dh_narrays > 1) {
909		ufsdirhash_free_locked(ip);
910		return;
911	}
912
913	/*
914	 * Remove any `first free' information pertaining to the
915	 * truncated blocks. All blocks we're removing should be
916	 * completely unused.
917	 */
918	if (dh->dh_firstfree[DH_NFSTATS] >= block)
919		dh->dh_firstfree[DH_NFSTATS] = -1;
920	for (i = block; i < dh->dh_dirblks; i++)
921		if (dh->dh_blkfree[i] != DIRBLKSIZ / DIRALIGN)
922			panic("ufsdirhash_dirtrunc: blocks in use");
923	for (i = 0; i < DH_NFSTATS; i++)
924		if (dh->dh_firstfree[i] >= block)
925			panic("ufsdirhash_dirtrunc: first free corrupt");
926	dh->dh_dirblks = block;
927	ufsdirhash_release(dh);
928}
929
930/*
931 * Debugging function to check that the dirhash information about
932 * a directory block matches its actual contents. Panics if a mismatch
933 * is detected.
934 *
935 * On entry, `buf' should point to the start of an in-core
936 * DIRBLKSIZ-sized directory block, and `offset' should contain the
937 * offset from the start of the directory of that block.
938 */
939void
940ufsdirhash_checkblock(struct inode *ip, char *buf, doff_t offset)
941{
942	struct dirhash *dh;
943	struct direct *dp;
944	int block, ffslot, i, nfree;
945
946	if (!ufs_dirhashcheck)
947		return;
948	if ((dh = ufsdirhash_acquire(ip)) == NULL)
949		return;
950
951	block = offset / DIRBLKSIZ;
952	if ((offset & (DIRBLKSIZ - 1)) != 0 || block >= dh->dh_dirblks)
953		panic("ufsdirhash_checkblock: bad offset");
954
955	nfree = 0;
956	for (i = 0; i < DIRBLKSIZ; i += dp->d_reclen) {
957		dp = (struct direct *)(buf + i);
958		if (dp->d_reclen == 0 || i + dp->d_reclen > DIRBLKSIZ)
959			panic("ufsdirhash_checkblock: bad dir");
960
961		if (dp->d_ino == 0) {
962#if 0
963			/*
964			 * XXX entries with d_ino == 0 should only occur
965			 * at the start of a DIRBLKSIZ block. However the
966			 * ufs code is tolerant of such entries at other
967			 * offsets, and fsck does not fix them.
968			 */
969			if (i != 0)
970				panic("ufsdirhash_checkblock: bad dir inode");
971#endif
972			nfree += dp->d_reclen;
973			continue;
974		}
975
976		/* Check that the entry	exists (will panic if it doesn't). */
977		ufsdirhash_findslot(dh, dp->d_name, dp->d_namlen, offset + i);
978
979		nfree += dp->d_reclen - DIRSIZ(0, dp);
980	}
981	if (i != DIRBLKSIZ)
982		panic("ufsdirhash_checkblock: bad dir end");
983
984	if (dh->dh_blkfree[block] * DIRALIGN != nfree)
985		panic("ufsdirhash_checkblock: bad free count");
986
987	ffslot = BLKFREE2IDX(nfree / DIRALIGN);
988	for (i = 0; i <= DH_NFSTATS; i++)
989		if (dh->dh_firstfree[i] == block && i != ffslot)
990			panic("ufsdirhash_checkblock: bad first-free");
991	if (dh->dh_firstfree[ffslot] == -1)
992		panic("ufsdirhash_checkblock: missing first-free entry");
993	ufsdirhash_release(dh);
994}
995
996/*
997 * Hash the specified filename into a dirhash slot.
998 */
999static int
1000ufsdirhash_hash(struct dirhash *dh, char *name, int namelen)
1001{
1002	u_int32_t hash;
1003
1004	/*
1005	 * We hash the name and then some other bit of data that is
1006	 * invariant over the dirhash's lifetime. Otherwise names
1007	 * differing only in the last byte are placed close to one
1008	 * another in the table, which is bad for linear probing.
1009	 */
1010	hash = fnv_32_buf(name, namelen, FNV1_32_INIT);
1011	hash = fnv_32_buf(&dh, sizeof(dh), hash);
1012	return (hash % dh->dh_hlen);
1013}
1014
1015/*
1016 * Adjust the number of free bytes in the block containing `offset'
1017 * by the value specified by `diff'.
1018 *
1019 * The caller must ensure we have exclusive access to `dh'; normally
1020 * that means that dh_lock should be held, but this is also called
1021 * from ufsdirhash_build() where exclusive access can be assumed.
1022 */
1023static void
1024ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff)
1025{
1026	int block, i, nfidx, ofidx;
1027
1028	/* Update the per-block summary info. */
1029	block = offset / DIRBLKSIZ;
1030	KASSERT(block < dh->dh_nblk && block < dh->dh_dirblks,
1031	     ("dirhash bad offset"));
1032	ofidx = BLKFREE2IDX(dh->dh_blkfree[block]);
1033	dh->dh_blkfree[block] = (int)dh->dh_blkfree[block] + (diff / DIRALIGN);
1034	nfidx = BLKFREE2IDX(dh->dh_blkfree[block]);
1035
1036	/* Update the `first free' list if necessary. */
1037	if (ofidx != nfidx) {
1038		/* If removing, scan forward for the next block. */
1039		if (dh->dh_firstfree[ofidx] == block) {
1040			for (i = block + 1; i < dh->dh_dirblks; i++)
1041				if (BLKFREE2IDX(dh->dh_blkfree[i]) == ofidx)
1042					break;
1043			dh->dh_firstfree[ofidx] = (i < dh->dh_dirblks) ? i : -1;
1044		}
1045
1046		/* Make this the new `first free' if necessary */
1047		if (dh->dh_firstfree[nfidx] > block ||
1048		    dh->dh_firstfree[nfidx] == -1)
1049			dh->dh_firstfree[nfidx] = block;
1050	}
1051}
1052
1053/*
1054 * Find the specified name which should have the specified offset.
1055 * Returns a slot number, and panics on failure.
1056 *
1057 * `dh' must be locked on entry and remains so on return.
1058 */
1059static int
1060ufsdirhash_findslot(struct dirhash *dh, char *name, int namelen, doff_t offset)
1061{
1062	int slot;
1063
1064	DIRHASH_ASSERT_LOCKED(dh);
1065
1066	/* Find the entry. */
1067	KASSERT(dh->dh_hused < dh->dh_hlen, ("dirhash find full"));
1068	slot = ufsdirhash_hash(dh, name, namelen);
1069	while (DH_ENTRY(dh, slot) != offset &&
1070	    DH_ENTRY(dh, slot) != DIRHASH_EMPTY)
1071		slot = WRAPINCR(slot, dh->dh_hlen);
1072	if (DH_ENTRY(dh, slot) != offset)
1073		panic("ufsdirhash_findslot: '%.*s' not found", namelen, name);
1074
1075	return (slot);
1076}
1077
1078/*
1079 * Remove the entry corresponding to the specified slot from the hash array.
1080 *
1081 * `dh' must be locked on entry and remains so on return.
1082 */
1083static void
1084ufsdirhash_delslot(struct dirhash *dh, int slot)
1085{
1086	int i;
1087
1088	DIRHASH_ASSERT_LOCKED(dh);
1089
1090	/* Mark the entry as deleted. */
1091	DH_ENTRY(dh, slot) = DIRHASH_DEL;
1092
1093	/* If this is the end of a chain of DIRHASH_DEL slots, remove them. */
1094	for (i = slot; DH_ENTRY(dh, i) == DIRHASH_DEL; )
1095		i = WRAPINCR(i, dh->dh_hlen);
1096	if (DH_ENTRY(dh, i) == DIRHASH_EMPTY) {
1097		i = WRAPDECR(i, dh->dh_hlen);
1098		while (DH_ENTRY(dh, i) == DIRHASH_DEL) {
1099			DH_ENTRY(dh, i) = DIRHASH_EMPTY;
1100			dh->dh_hused--;
1101			i = WRAPDECR(i, dh->dh_hlen);
1102		}
1103		KASSERT(dh->dh_hused >= 0, ("ufsdirhash_delslot neg hlen"));
1104	}
1105}
1106
1107/*
1108 * Given a directory entry and its offset, find the offset of the
1109 * previous entry in the same DIRBLKSIZ-sized block. Returns an
1110 * offset, or -1 if there is no previous entry in the block or some
1111 * other problem occurred.
1112 */
1113static doff_t
1114ufsdirhash_getprev(struct direct *dirp, doff_t offset)
1115{
1116	struct direct *dp;
1117	char *blkbuf;
1118	doff_t blkoff, prevoff;
1119	int entrypos, i;
1120
1121	blkoff = offset & ~(DIRBLKSIZ - 1);	/* offset of start of block */
1122	entrypos = offset & (DIRBLKSIZ - 1);	/* entry relative to block */
1123	blkbuf = (char *)dirp - entrypos;
1124	prevoff = blkoff;
1125
1126	/* If `offset' is the start of a block, there is no previous entry. */
1127	if (entrypos == 0)
1128		return (-1);
1129
1130	/* Scan from the start of the block until we get to the entry. */
1131	for (i = 0; i < entrypos; i += dp->d_reclen) {
1132		dp = (struct direct *)(blkbuf + i);
1133		if (dp->d_reclen == 0 || i + dp->d_reclen > entrypos)
1134			return (-1);	/* Corrupted directory. */
1135		prevoff = blkoff + i;
1136	}
1137	return (prevoff);
1138}
1139
1140/*
1141 * Try to free up `wanted' bytes by stealing memory from existing
1142 * dirhashes. Returns zero with list locked if successful.
1143 */
1144static int
1145ufsdirhash_recycle(int wanted)
1146{
1147	struct dirhash *dh;
1148	doff_t **hash;
1149	u_int8_t *blkfree;
1150	int i, mem, narrays;
1151
1152	DIRHASHLIST_LOCK();
1153	dh = TAILQ_FIRST(&ufsdirhash_list);
1154	while (wanted + ufs_dirhashmem > ufs_dirhashmaxmem) {
1155		/* Decrement the score; only recycle if it becomes zero. */
1156		if (dh == NULL || --dh->dh_score > 0) {
1157			DIRHASHLIST_UNLOCK();
1158			return (-1);
1159		}
1160		/*
1161		 * If we can't lock it it's in use and we don't want to
1162		 * recycle it anyway.
1163		 */
1164		if (!sx_try_xlock(&dh->dh_lock)) {
1165			dh = TAILQ_NEXT(dh, dh_list);
1166			continue;
1167		}
1168		KASSERT(dh->dh_hash != NULL, ("dirhash: NULL hash on list"));
1169
1170		/* Remove it from the list and detach its memory. */
1171		TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
1172		dh->dh_onlist = 0;
1173		hash = dh->dh_hash;
1174		dh->dh_hash = NULL;
1175		blkfree = dh->dh_blkfree;
1176		dh->dh_blkfree = NULL;
1177		narrays = dh->dh_narrays;
1178		mem = dh->dh_memreq;
1179		dh->dh_memreq = 0;
1180
1181		/* Unlock everything, free the detached memory. */
1182		ufsdirhash_release(dh);
1183		DIRHASHLIST_UNLOCK();
1184		for (i = 0; i < narrays; i++)
1185			DIRHASH_BLKFREE(hash[i]);
1186		free(hash, M_DIRHASH);
1187		free(blkfree, M_DIRHASH);
1188
1189		/* Account for the returned memory, and repeat if necessary. */
1190		DIRHASHLIST_LOCK();
1191		ufs_dirhashmem -= mem;
1192		dh = TAILQ_FIRST(&ufsdirhash_list);
1193	}
1194	/* Success; return with list locked. */
1195	return (0);
1196}
1197
1198
1199void
1200ufsdirhash_init()
1201{
1202	ufsdirhash_zone = uma_zcreate("DIRHASH", DH_NBLKOFF * sizeof(doff_t),
1203	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
1204	mtx_init(&ufsdirhash_mtx, "dirhash list", NULL, MTX_DEF);
1205	TAILQ_INIT(&ufsdirhash_list);
1206}
1207
1208void
1209ufsdirhash_uninit()
1210{
1211	KASSERT(TAILQ_EMPTY(&ufsdirhash_list), ("ufsdirhash_uninit"));
1212	uma_zdestroy(ufsdirhash_zone);
1213	mtx_destroy(&ufsdirhash_mtx);
1214}
1215
1216#endif /* UFS_DIRHASH */
1217