ufs_dirhash.c revision 178110
1/*-
2 * Copyright (c) 2001, 2002 Ian Dowse.  All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 *    notice, this list of conditions and the following disclaimer in the
11 *    documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
24 */
25
26/*
27 * This implements a hash-based lookup scheme for UFS directories.
28 */
29
30#include <sys/cdefs.h>
31__FBSDID("$FreeBSD: head/sys/ufs/ufs/ufs_dirhash.c 178110 2008-04-11 09:48:12Z jeff $");
32
33#include "opt_ufs.h"
34
35#ifdef UFS_DIRHASH
36
37#include <sys/param.h>
38#include <sys/systm.h>
39#include <sys/kernel.h>
40#include <sys/lock.h>
41#include <sys/lockmgr.h>
42#include <sys/mutex.h>
43#include <sys/malloc.h>
44#include <sys/fnv_hash.h>
45#include <sys/proc.h>
46#include <sys/bio.h>
47#include <sys/buf.h>
48#include <sys/vnode.h>
49#include <sys/mount.h>
50#include <sys/sysctl.h>
51#include <vm/uma.h>
52
53#include <ufs/ufs/quota.h>
54#include <ufs/ufs/inode.h>
55#include <ufs/ufs/dir.h>
56#include <ufs/ufs/dirhash.h>
57#include <ufs/ufs/extattr.h>
58#include <ufs/ufs/ufsmount.h>
59#include <ufs/ufs/ufs_extern.h>
60
61#define WRAPINCR(val, limit)	(((val) + 1 == (limit)) ? 0 : ((val) + 1))
62#define WRAPDECR(val, limit)	(((val) == 0) ? ((limit) - 1) : ((val) - 1))
63#define OFSFMT(vp)		((vp)->v_mount->mnt_maxsymlinklen <= 0)
64#define BLKFREE2IDX(n)		((n) > DH_NFSTATS ? DH_NFSTATS : (n))
65
66static MALLOC_DEFINE(M_DIRHASH, "ufs_dirhash", "UFS directory hash tables");
67
68static SYSCTL_NODE(_vfs, OID_AUTO, ufs, CTLFLAG_RD, 0, "UFS filesystem");
69
70static int ufs_mindirhashsize = DIRBLKSIZ * 5;
71SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_minsize, CTLFLAG_RW,
72    &ufs_mindirhashsize,
73    0, "minimum directory size in bytes for which to use hashed lookup");
74static int ufs_dirhashmaxmem = 2 * 1024 * 1024;
75SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_maxmem, CTLFLAG_RW, &ufs_dirhashmaxmem,
76    0, "maximum allowed dirhash memory usage");
77static int ufs_dirhashmem;
78SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_mem, CTLFLAG_RD, &ufs_dirhashmem,
79    0, "current dirhash memory usage");
80static int ufs_dirhashcheck = 0;
81SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_docheck, CTLFLAG_RW, &ufs_dirhashcheck,
82    0, "enable extra sanity tests");
83
84
85static int ufsdirhash_hash(struct dirhash *dh, char *name, int namelen);
86static void ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff);
87static void ufsdirhash_delslot(struct dirhash *dh, int slot);
88static int ufsdirhash_findslot(struct dirhash *dh, char *name, int namelen,
89	   doff_t offset);
90static doff_t ufsdirhash_getprev(struct direct *dp, doff_t offset);
91static int ufsdirhash_recycle(int wanted);
92static void ufsdirhash_free_locked(struct inode *ip);
93
94static uma_zone_t	ufsdirhash_zone;
95
96#define DIRHASHLIST_LOCK() 		mtx_lock(&ufsdirhash_mtx)
97#define DIRHASHLIST_UNLOCK() 		mtx_unlock(&ufsdirhash_mtx)
98#define DIRHASH_BLKALLOC_WAITOK() 	uma_zalloc(ufsdirhash_zone, M_WAITOK)
99#define DIRHASH_BLKFREE(ptr) 		uma_zfree(ufsdirhash_zone, (ptr))
100#define	DIRHASH_ASSERT_LOCKED(dh)					\
101    lockmgr_assert(&(dh)->dh_lock, KA_LOCKED)
102
103/* Dirhash list; recently-used entries are near the tail. */
104static TAILQ_HEAD(, dirhash) ufsdirhash_list;
105
106/* Protects: ufsdirhash_list, `dh_list' field, ufs_dirhashmem. */
107static struct mtx	ufsdirhash_mtx;
108
109/*
110 * Locking:
111 *
112 * The relationship between inode and dirhash is protected either by an
113 * exclusive vnode lock or the vnode interlock where a shared vnode lock
114 * may be used.  The dirhash_mtx is acquired after the dirhash lock.
115 *
116 * ufsdirhash_build() acquires a shared lock on the dirhash when it is
117 * successful.  This lock is released after a call to ufsdirhash_lookup().
118 *
119 * Functions requiring exclusive access use ufsdirhash_acquire() which may
120 * free a dirhash structure that was recycled by ufsdirhash_recycle().
121 *
122 * The dirhash lock may be held across io operations.
123 */
124
125/*
126 * Release the lock on a dirhash.
127 */
128static void
129ufsdirhash_release(struct dirhash *dh)
130{
131
132	lockmgr(&dh->dh_lock, LK_RELEASE, 0);
133}
134
135/*
136 * Either acquire an existing hash locked shared or create a new hash and
137 * return it exclusively locked.  May return NULL if the allocation fails.
138 *
139 * The vnode interlock is used to protect the i_dirhash pointer from
140 * simultaneous access while only a shared vnode lock is held.
141 */
142static struct dirhash *
143ufsdirhash_create(struct inode *ip)
144{
145	struct dirhash *ndh;
146	struct dirhash *dh;
147	struct vnode *vp;
148	int error;
149
150	error = 0;
151	ndh = dh = NULL;
152	vp = ip->i_vnode;
153	for (;;) {
154		/* Racy check for i_dirhash to prefetch an dirhash structure. */
155		if (ip->i_dirhash == NULL && ndh == NULL) {
156			MALLOC(ndh, struct dirhash *, sizeof *dh, M_DIRHASH,
157			    M_NOWAIT | M_ZERO);
158			if (ndh == NULL)
159				return (NULL);
160			lockinit(&ndh->dh_lock, PRIBIO, "dirhash", 0, 0);
161			lockmgr(&ndh->dh_lock, LK_EXCLUSIVE, NULL);
162		}
163		/*
164		 * Check i_dirhash.  If it's NULL just try to use a
165		 * preallocated structure.  If none exists loop and try again.
166		 */
167		VI_LOCK(vp);
168		dh = ip->i_dirhash;
169		if (dh == NULL) {
170			ip->i_dirhash = ndh;
171			VI_UNLOCK(vp);
172			if (ndh == NULL)
173				continue;
174			return (ndh);
175		}
176		/* Try to acquire shared on existing hashes. */
177		if (lockmgr(&dh->dh_lock, LK_SHARED | LK_INTERLOCK,
178		    VI_MTX(vp)))
179			continue;
180		/* The hash could've been recycled while we were waiting. */
181		if (ip->i_dirhash != dh) {
182			ufsdirhash_release(dh);
183			continue;
184		}
185		/* If the hash is still valid we've succeeded. */
186		if (dh->dh_hash != NULL)
187			break;
188		/*
189		 * If the hash is NULL it has been recycled.  Try to upgrade
190		 * so we can recreate it.  If we fail the upgrade another
191		 * thread must've already exclusively locked it.
192		 */
193		if (lockmgr(&dh->dh_lock, LK_UPGRADE | LK_SLEEPFAIL, NULL) == 0)
194			break;
195	}
196	/* Free the preallocated structure if it was not necessary. */
197	if (ndh) {
198		lockmgr(&ndh->dh_lock, LK_RELEASE, NULL);
199		lockdestroy(&ndh->dh_lock);
200		FREE(ndh, M_DIRHASH);
201	}
202	return (dh);
203}
204
205/*
206 * Acquire an exclusive lock on an existing hash.  Requires an exclusive
207 * vnode lock to protect the i_dirhash pointer.  hashes that have been
208 * recycled are reclaimed here and NULL is returned.
209 */
210static struct dirhash *
211ufsdirhash_acquire(struct inode *ip)
212{
213	struct dirhash *dh;
214	struct vnode *vp;
215
216	ASSERT_VOP_ELOCKED(ip->i_vnode, __FUNCTION__);
217
218	vp = ip->i_vnode;
219	dh = ip->i_dirhash;
220	if (dh == NULL)
221		return (NULL);
222	lockmgr(&dh->dh_lock, LK_EXCLUSIVE, 0);
223	if (dh->dh_hash != NULL)
224		return (dh);
225	ufsdirhash_free_locked(ip);
226	return (NULL);
227}
228
229/*
230 * Acquire exclusively and free the hash pointed to by ip.  Works with a
231 * shared or exclusive vnode lock.
232 */
233void
234ufsdirhash_free(struct inode *ip)
235{
236	struct dirhash *dh;
237	struct vnode *vp;
238
239	vp = ip->i_vnode;
240	for (;;) {
241		VI_LOCK(vp);
242		dh = ip->i_dirhash;
243		if (dh == NULL) {
244			VI_UNLOCK(vp);
245			return;
246		}
247		if (lockmgr(&dh->dh_lock, LK_EXCLUSIVE | LK_INTERLOCK,
248		    VI_MTX(vp)))
249			continue;
250		if (ip->i_dirhash == dh)
251			break;
252		ufsdirhash_release(dh);
253	}
254	ufsdirhash_free_locked(ip);
255}
256
257/*
258 * Attempt to build up a hash table for the directory contents in
259 * inode 'ip'. Returns 0 on success, or -1 of the operation failed.
260 */
261int
262ufsdirhash_build(struct inode *ip)
263{
264	struct dirhash *dh;
265	struct buf *bp = NULL;
266	struct direct *ep;
267	struct vnode *vp;
268	doff_t bmask, pos;
269	int dirblocks, i, j, memreqd, nblocks, narrays, nslots, slot;
270
271	/* Take care of a decreased sysctl value. */
272	while (ufs_dirhashmem > ufs_dirhashmaxmem)
273		if (ufsdirhash_recycle(0) != 0)
274			return (-1);
275
276	/* Check if we can/should use dirhash. */
277	if (ip->i_size < ufs_mindirhashsize || OFSFMT(ip->i_vnode) ||
278	    ip->i_effnlink == 0) {
279		if (ip->i_dirhash)
280			ufsdirhash_free(ip);
281		return (-1);
282	}
283	dh = ufsdirhash_create(ip);
284	if (dh == NULL)
285		return (-1);
286	if (dh->dh_hash != NULL)
287		return (0);
288
289	vp = ip->i_vnode;
290	/* Allocate 50% more entries than this dir size could ever need. */
291	KASSERT(ip->i_size >= DIRBLKSIZ, ("ufsdirhash_build size"));
292	nslots = ip->i_size / DIRECTSIZ(1);
293	nslots = (nslots * 3 + 1) / 2;
294	narrays = howmany(nslots, DH_NBLKOFF);
295	nslots = narrays * DH_NBLKOFF;
296	dirblocks = howmany(ip->i_size, DIRBLKSIZ);
297	nblocks = (dirblocks * 3 + 1) / 2;
298	memreqd = sizeof(*dh) + narrays * sizeof(*dh->dh_hash) +
299	    narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
300	    nblocks * sizeof(*dh->dh_blkfree);
301	DIRHASHLIST_LOCK();
302	if (memreqd + ufs_dirhashmem > ufs_dirhashmaxmem) {
303		DIRHASHLIST_UNLOCK();
304		if (memreqd > ufs_dirhashmaxmem / 2)
305			goto fail;
306		/* Try to free some space. */
307		if (ufsdirhash_recycle(memreqd) != 0)
308			goto fail;
309		/* Enough was freed, and list has been locked. */
310	}
311	ufs_dirhashmem += memreqd;
312	DIRHASHLIST_UNLOCK();
313
314	/* Initialise the hash table and block statistics. */
315	dh->dh_memreq = memreqd;
316	dh->dh_narrays = narrays;
317	dh->dh_hlen = nslots;
318	dh->dh_nblk = nblocks;
319	dh->dh_dirblks = dirblocks;
320	for (i = 0; i < DH_NFSTATS; i++)
321		dh->dh_firstfree[i] = -1;
322	dh->dh_firstfree[DH_NFSTATS] = 0;
323	dh->dh_hused = 0;
324	dh->dh_seqopt = 0;
325	dh->dh_seqoff = 0;
326	dh->dh_score = DH_SCOREINIT;
327
328	/*
329	 * Use non-blocking mallocs so that we will revert to a linear
330	 * lookup on failure rather than potentially blocking forever.
331	 */
332	MALLOC(dh->dh_hash, doff_t **, narrays * sizeof(dh->dh_hash[0]),
333	    M_DIRHASH, M_NOWAIT | M_ZERO);
334	if (dh->dh_hash == NULL)
335		goto fail;
336	MALLOC(dh->dh_blkfree, u_int8_t *, nblocks * sizeof(dh->dh_blkfree[0]),
337	    M_DIRHASH, M_NOWAIT);
338	if (dh->dh_blkfree == NULL)
339		goto fail;
340	for (i = 0; i < narrays; i++) {
341		if ((dh->dh_hash[i] = DIRHASH_BLKALLOC_WAITOK()) == NULL)
342			goto fail;
343		for (j = 0; j < DH_NBLKOFF; j++)
344			dh->dh_hash[i][j] = DIRHASH_EMPTY;
345	}
346	for (i = 0; i < dirblocks; i++)
347		dh->dh_blkfree[i] = DIRBLKSIZ / DIRALIGN;
348	bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
349	pos = 0;
350	while (pos < ip->i_size) {
351		/* If necessary, get the next directory block. */
352		if ((pos & bmask) == 0) {
353			if (bp != NULL)
354				brelse(bp);
355			if (UFS_BLKATOFF(vp, (off_t)pos, NULL, &bp) != 0)
356				goto fail;
357		}
358
359		/* Add this entry to the hash. */
360		ep = (struct direct *)((char *)bp->b_data + (pos & bmask));
361		if (ep->d_reclen == 0 || ep->d_reclen >
362		    DIRBLKSIZ - (pos & (DIRBLKSIZ - 1))) {
363			/* Corrupted directory. */
364			brelse(bp);
365			goto fail;
366		}
367		if (ep->d_ino != 0) {
368			/* Add the entry (simplified ufsdirhash_add). */
369			slot = ufsdirhash_hash(dh, ep->d_name, ep->d_namlen);
370			while (DH_ENTRY(dh, slot) != DIRHASH_EMPTY)
371				slot = WRAPINCR(slot, dh->dh_hlen);
372			dh->dh_hused++;
373			DH_ENTRY(dh, slot) = pos;
374			ufsdirhash_adjfree(dh, pos, -DIRSIZ(0, ep));
375		}
376		pos += ep->d_reclen;
377	}
378
379	if (bp != NULL)
380		brelse(bp);
381	DIRHASHLIST_LOCK();
382	TAILQ_INSERT_TAIL(&ufsdirhash_list, dh, dh_list);
383	dh->dh_onlist = 1;
384	DIRHASHLIST_UNLOCK();
385	lockmgr(&dh->dh_lock, LK_DOWNGRADE, 0);
386	return (0);
387
388fail:
389	ufsdirhash_free_locked(ip);
390	return (-1);
391}
392
393/*
394 * Free any hash table associated with inode 'ip'.
395 */
396static void
397ufsdirhash_free_locked(struct inode *ip)
398{
399	struct dirhash *dh;
400	struct vnode *vp;
401	int i;
402
403	DIRHASH_ASSERT_LOCKED(ip->i_dirhash);
404	/*
405	 * Clear the pointer in the inode to prevent new threads from
406	 * finding the dead structure.
407	 */
408	vp = ip->i_vnode;
409	VI_LOCK(vp);
410	dh = ip->i_dirhash;
411	ip->i_dirhash = NULL;
412	VI_UNLOCK(vp);
413	/*
414	 * Drain waiters.  They will abort when they see that ip->i_dirhash
415	 * is NULL after locking.
416	 */
417	lockmgr(&dh->dh_lock, LK_RELEASE, 0);
418	lockmgr(&dh->dh_lock, LK_DRAIN, 0);
419	/*
420	 * Handle partially recycled as well as fully constructed hashes.
421	 */
422	if (dh->dh_hash != NULL) {
423		for (i = 0; i < dh->dh_narrays; i++)
424			if (dh->dh_hash[i] != NULL)
425				DIRHASH_BLKFREE(dh->dh_hash[i]);
426		FREE(dh->dh_hash, M_DIRHASH);
427		if (dh->dh_blkfree != NULL)
428			FREE(dh->dh_blkfree, M_DIRHASH);
429	}
430	DIRHASHLIST_LOCK();
431	if (dh->dh_onlist)
432		TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
433	ufs_dirhashmem -= dh->dh_memreq;
434	DIRHASHLIST_UNLOCK();
435	/*
436	 * Release the lock and reclaim datastructure memory.
437	 */
438	lockmgr(&dh->dh_lock, LK_RELEASE, 0);
439	lockdestroy(&dh->dh_lock);
440	FREE(dh, M_DIRHASH);
441
442	return;
443}
444
445/*
446 * Find the offset of the specified name within the given inode.
447 * Returns 0 on success, ENOENT if the entry does not exist, or
448 * EJUSTRETURN if the caller should revert to a linear search.
449 *
450 * If successful, the directory offset is stored in *offp, and a
451 * pointer to a struct buf containing the entry is stored in *bpp. If
452 * prevoffp is non-NULL, the offset of the previous entry within
453 * the DIRBLKSIZ-sized block is stored in *prevoffp (if the entry
454 * is the first in a block, the start of the block is used).
455 *
456 * Must be called with the hash locked.  Returns with the hash unlocked.
457 */
458int
459ufsdirhash_lookup(struct inode *ip, char *name, int namelen, doff_t *offp,
460    struct buf **bpp, doff_t *prevoffp)
461{
462	struct dirhash *dh, *dh_next;
463	struct direct *dp;
464	struct vnode *vp;
465	struct buf *bp;
466	doff_t blkoff, bmask, offset, prevoff;
467	int i, slot;
468	int error;
469
470	dh = ip->i_dirhash;
471	KASSERT(dh != NULL && dh->dh_hash != NULL,
472	    ("ufsdirhash_lookup: Invalid dirhash %p\n", dh));
473	DIRHASH_ASSERT_LOCKED(dh);
474	/*
475	 * Move this dirhash towards the end of the list if it has a
476	 * score higher than the next entry, and acquire the dh_lock.
477	 */
478	DIRHASHLIST_LOCK();
479	if (TAILQ_NEXT(dh, dh_list) != NULL) {
480		/*
481		 * If the new score will be greater than that of the next
482		 * entry, then move this entry past it. With both mutexes
483		 * held, dh_next won't go away, but its dh_score could
484		 * change; that's not important since it is just a hint.
485		 */
486		if ((dh_next = TAILQ_NEXT(dh, dh_list)) != NULL &&
487		    dh->dh_score >= dh_next->dh_score) {
488			KASSERT(dh->dh_onlist, ("dirhash: not on list"));
489			TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
490			TAILQ_INSERT_AFTER(&ufsdirhash_list, dh_next, dh,
491			    dh_list);
492		}
493	}
494	/* Update the score. */
495	if (dh->dh_score < DH_SCOREMAX)
496		dh->dh_score++;
497	DIRHASHLIST_UNLOCK();
498
499	vp = ip->i_vnode;
500	bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
501	blkoff = -1;
502	bp = NULL;
503restart:
504	slot = ufsdirhash_hash(dh, name, namelen);
505
506	if (dh->dh_seqopt) {
507		/*
508		 * Sequential access optimisation. dh_seqoff contains the
509		 * offset of the directory entry immediately following
510		 * the last entry that was looked up. Check if this offset
511		 * appears in the hash chain for the name we are looking for.
512		 */
513		for (i = slot; (offset = DH_ENTRY(dh, i)) != DIRHASH_EMPTY;
514		    i = WRAPINCR(i, dh->dh_hlen))
515			if (offset == dh->dh_seqoff)
516				break;
517		if (offset == dh->dh_seqoff) {
518			/*
519			 * We found an entry with the expected offset. This
520			 * is probably the entry we want, but if not, the
521			 * code below will turn off seqopt and retry.
522			 */
523			slot = i;
524		} else
525			dh->dh_seqopt = 0;
526	}
527
528	for (; (offset = DH_ENTRY(dh, slot)) != DIRHASH_EMPTY;
529	    slot = WRAPINCR(slot, dh->dh_hlen)) {
530		if (offset == DIRHASH_DEL)
531			continue;
532		if (offset < 0 || offset >= ip->i_size)
533			panic("ufsdirhash_lookup: bad offset in hash array");
534		if ((offset & ~bmask) != blkoff) {
535			if (bp != NULL)
536				brelse(bp);
537			blkoff = offset & ~bmask;
538			if (UFS_BLKATOFF(vp, (off_t)blkoff, NULL, &bp) != 0) {
539				error = EJUSTRETURN;
540				goto fail;
541			}
542		}
543		dp = (struct direct *)(bp->b_data + (offset & bmask));
544		if (dp->d_reclen == 0 || dp->d_reclen >
545		    DIRBLKSIZ - (offset & (DIRBLKSIZ - 1))) {
546			/* Corrupted directory. */
547			error = EJUSTRETURN;
548			goto fail;
549		}
550		if (dp->d_namlen == namelen &&
551		    bcmp(dp->d_name, name, namelen) == 0) {
552			/* Found. Get the prev offset if needed. */
553			if (prevoffp != NULL) {
554				if (offset & (DIRBLKSIZ - 1)) {
555					prevoff = ufsdirhash_getprev(dp,
556					    offset);
557					if (prevoff == -1) {
558						error = EJUSTRETURN;
559						goto fail;
560					}
561				} else
562					prevoff = offset;
563				*prevoffp = prevoff;
564			}
565
566			/* Check for sequential access, and update offset. */
567			if (dh->dh_seqopt == 0 && dh->dh_seqoff == offset)
568				dh->dh_seqopt = 1;
569			dh->dh_seqoff = offset + DIRSIZ(0, dp);
570			*bpp = bp;
571			*offp = offset;
572			ufsdirhash_release(dh);
573			return (0);
574		}
575
576		/*
577		 * When the name doesn't match in the seqopt case, go back
578		 * and search normally.
579		 */
580		if (dh->dh_seqopt) {
581			dh->dh_seqopt = 0;
582			goto restart;
583		}
584	}
585	error = ENOENT;
586fail:
587	ufsdirhash_release(dh);
588	if (bp != NULL)
589		brelse(bp);
590	return (error);
591}
592
593/*
594 * Find a directory block with room for 'slotneeded' bytes. Returns
595 * the offset of the directory entry that begins the free space.
596 * This will either be the offset of an existing entry that has free
597 * space at the end, or the offset of an entry with d_ino == 0 at
598 * the start of a DIRBLKSIZ block.
599 *
600 * To use the space, the caller may need to compact existing entries in
601 * the directory. The total number of bytes in all of the entries involved
602 * in the compaction is stored in *slotsize. In other words, all of
603 * the entries that must be compacted are exactly contained in the
604 * region beginning at the returned offset and spanning *slotsize bytes.
605 *
606 * Returns -1 if no space was found, indicating that the directory
607 * must be extended.
608 */
609doff_t
610ufsdirhash_findfree(struct inode *ip, int slotneeded, int *slotsize)
611{
612	struct direct *dp;
613	struct dirhash *dh;
614	struct buf *bp;
615	doff_t pos, slotstart;
616	int dirblock, error, freebytes, i;
617
618	dh = ip->i_dirhash;
619	KASSERT(dh != NULL && dh->dh_hash != NULL,
620	    ("ufsdirhash_findfree: Invalid dirhash %p\n", dh));
621	DIRHASH_ASSERT_LOCKED(dh);
622
623	/* Find a directory block with the desired free space. */
624	dirblock = -1;
625	for (i = howmany(slotneeded, DIRALIGN); i <= DH_NFSTATS; i++)
626		if ((dirblock = dh->dh_firstfree[i]) != -1)
627			break;
628	if (dirblock == -1)
629		return (-1);
630
631	KASSERT(dirblock < dh->dh_nblk &&
632	    dh->dh_blkfree[dirblock] >= howmany(slotneeded, DIRALIGN),
633	    ("ufsdirhash_findfree: bad stats"));
634	pos = dirblock * DIRBLKSIZ;
635	error = UFS_BLKATOFF(ip->i_vnode, (off_t)pos, (char **)&dp, &bp);
636	if (error)
637		return (-1);
638
639	/* Find the first entry with free space. */
640	for (i = 0; i < DIRBLKSIZ; ) {
641		if (dp->d_reclen == 0) {
642			brelse(bp);
643			return (-1);
644		}
645		if (dp->d_ino == 0 || dp->d_reclen > DIRSIZ(0, dp))
646			break;
647		i += dp->d_reclen;
648		dp = (struct direct *)((char *)dp + dp->d_reclen);
649	}
650	if (i > DIRBLKSIZ) {
651		brelse(bp);
652		return (-1);
653	}
654	slotstart = pos + i;
655
656	/* Find the range of entries needed to get enough space */
657	freebytes = 0;
658	while (i < DIRBLKSIZ && freebytes < slotneeded) {
659		freebytes += dp->d_reclen;
660		if (dp->d_ino != 0)
661			freebytes -= DIRSIZ(0, dp);
662		if (dp->d_reclen == 0) {
663			brelse(bp);
664			return (-1);
665		}
666		i += dp->d_reclen;
667		dp = (struct direct *)((char *)dp + dp->d_reclen);
668	}
669	if (i > DIRBLKSIZ) {
670		brelse(bp);
671		return (-1);
672	}
673	if (freebytes < slotneeded)
674		panic("ufsdirhash_findfree: free mismatch");
675	brelse(bp);
676	*slotsize = pos + i - slotstart;
677	return (slotstart);
678}
679
680/*
681 * Return the start of the unused space at the end of a directory, or
682 * -1 if there are no trailing unused blocks.
683 */
684doff_t
685ufsdirhash_enduseful(struct inode *ip)
686{
687
688	struct dirhash *dh;
689	int i;
690
691	dh = ip->i_dirhash;
692	DIRHASH_ASSERT_LOCKED(dh);
693	KASSERT(dh != NULL && dh->dh_hash != NULL,
694	    ("ufsdirhash_enduseful: Invalid dirhash %p\n", dh));
695
696	if (dh->dh_blkfree[dh->dh_dirblks - 1] != DIRBLKSIZ / DIRALIGN)
697		return (-1);
698
699	for (i = dh->dh_dirblks - 1; i >= 0; i--)
700		if (dh->dh_blkfree[i] != DIRBLKSIZ / DIRALIGN)
701			break;
702
703	return ((doff_t)(i + 1) * DIRBLKSIZ);
704}
705
706/*
707 * Insert information into the hash about a new directory entry. dirp
708 * points to a struct direct containing the entry, and offset specifies
709 * the offset of this entry.
710 */
711void
712ufsdirhash_add(struct inode *ip, struct direct *dirp, doff_t offset)
713{
714	struct dirhash *dh;
715	int slot;
716
717	if ((dh = ufsdirhash_acquire(ip)) == NULL)
718		return;
719
720	KASSERT(offset < dh->dh_dirblks * DIRBLKSIZ,
721	    ("ufsdirhash_add: bad offset"));
722	/*
723	 * Normal hash usage is < 66%. If the usage gets too high then
724	 * remove the hash entirely and let it be rebuilt later.
725	 */
726	if (dh->dh_hused >= (dh->dh_hlen * 3) / 4) {
727		ufsdirhash_free_locked(ip);
728		return;
729	}
730
731	/* Find a free hash slot (empty or deleted), and add the entry. */
732	slot = ufsdirhash_hash(dh, dirp->d_name, dirp->d_namlen);
733	while (DH_ENTRY(dh, slot) >= 0)
734		slot = WRAPINCR(slot, dh->dh_hlen);
735	if (DH_ENTRY(dh, slot) == DIRHASH_EMPTY)
736		dh->dh_hused++;
737	DH_ENTRY(dh, slot) = offset;
738
739	/* Update the per-block summary info. */
740	ufsdirhash_adjfree(dh, offset, -DIRSIZ(0, dirp));
741	ufsdirhash_release(dh);
742}
743
744/*
745 * Remove the specified directory entry from the hash. The entry to remove
746 * is defined by the name in `dirp', which must exist at the specified
747 * `offset' within the directory.
748 */
749void
750ufsdirhash_remove(struct inode *ip, struct direct *dirp, doff_t offset)
751{
752	struct dirhash *dh;
753	int slot;
754
755	if ((dh = ufsdirhash_acquire(ip)) == NULL)
756		return;
757
758	KASSERT(offset < dh->dh_dirblks * DIRBLKSIZ,
759	    ("ufsdirhash_remove: bad offset"));
760	/* Find the entry */
761	slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, offset);
762
763	/* Remove the hash entry. */
764	ufsdirhash_delslot(dh, slot);
765
766	/* Update the per-block summary info. */
767	ufsdirhash_adjfree(dh, offset, DIRSIZ(0, dirp));
768	ufsdirhash_release(dh);
769}
770
771/*
772 * Change the offset associated with a directory entry in the hash. Used
773 * when compacting directory blocks.
774 */
775void
776ufsdirhash_move(struct inode *ip, struct direct *dirp, doff_t oldoff,
777    doff_t newoff)
778{
779	struct dirhash *dh;
780	int slot;
781
782	if ((dh = ufsdirhash_acquire(ip)) == NULL)
783		return;
784
785	KASSERT(oldoff < dh->dh_dirblks * DIRBLKSIZ &&
786	    newoff < dh->dh_dirblks * DIRBLKSIZ,
787	    ("ufsdirhash_move: bad offset"));
788	/* Find the entry, and update the offset. */
789	slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, oldoff);
790	DH_ENTRY(dh, slot) = newoff;
791	ufsdirhash_release(dh);
792}
793
794/*
795 * Inform dirhash that the directory has grown by one block that
796 * begins at offset (i.e. the new length is offset + DIRBLKSIZ).
797 */
798void
799ufsdirhash_newblk(struct inode *ip, doff_t offset)
800{
801	struct dirhash *dh;
802	int block;
803
804	if ((dh = ufsdirhash_acquire(ip)) == NULL)
805		return;
806
807	KASSERT(offset == dh->dh_dirblks * DIRBLKSIZ,
808	    ("ufsdirhash_newblk: bad offset"));
809	block = offset / DIRBLKSIZ;
810	if (block >= dh->dh_nblk) {
811		/* Out of space; must rebuild. */
812		ufsdirhash_free_locked(ip);
813		return;
814	}
815	dh->dh_dirblks = block + 1;
816
817	/* Account for the new free block. */
818	dh->dh_blkfree[block] = DIRBLKSIZ / DIRALIGN;
819	if (dh->dh_firstfree[DH_NFSTATS] == -1)
820		dh->dh_firstfree[DH_NFSTATS] = block;
821	ufsdirhash_release(dh);
822}
823
824/*
825 * Inform dirhash that the directory is being truncated.
826 */
827void
828ufsdirhash_dirtrunc(struct inode *ip, doff_t offset)
829{
830	struct dirhash *dh;
831	int block, i;
832
833	if ((dh = ufsdirhash_acquire(ip)) == NULL)
834		return;
835
836	KASSERT(offset <= dh->dh_dirblks * DIRBLKSIZ,
837	    ("ufsdirhash_dirtrunc: bad offset"));
838	block = howmany(offset, DIRBLKSIZ);
839	/*
840	 * If the directory shrinks to less than 1/8 of dh_nblk blocks
841	 * (about 20% of its original size due to the 50% extra added in
842	 * ufsdirhash_build) then free it, and let the caller rebuild
843	 * if necessary.
844	 */
845	if (block < dh->dh_nblk / 8 && dh->dh_narrays > 1) {
846		ufsdirhash_free_locked(ip);
847		return;
848	}
849
850	/*
851	 * Remove any `first free' information pertaining to the
852	 * truncated blocks. All blocks we're removing should be
853	 * completely unused.
854	 */
855	if (dh->dh_firstfree[DH_NFSTATS] >= block)
856		dh->dh_firstfree[DH_NFSTATS] = -1;
857	for (i = block; i < dh->dh_dirblks; i++)
858		if (dh->dh_blkfree[i] != DIRBLKSIZ / DIRALIGN)
859			panic("ufsdirhash_dirtrunc: blocks in use");
860	for (i = 0; i < DH_NFSTATS; i++)
861		if (dh->dh_firstfree[i] >= block)
862			panic("ufsdirhash_dirtrunc: first free corrupt");
863	dh->dh_dirblks = block;
864	ufsdirhash_release(dh);
865}
866
867/*
868 * Debugging function to check that the dirhash information about
869 * a directory block matches its actual contents. Panics if a mismatch
870 * is detected.
871 *
872 * On entry, `buf' should point to the start of an in-core
873 * DIRBLKSIZ-sized directory block, and `offset' should contain the
874 * offset from the start of the directory of that block.
875 */
876void
877ufsdirhash_checkblock(struct inode *ip, char *buf, doff_t offset)
878{
879	struct dirhash *dh;
880	struct direct *dp;
881	int block, ffslot, i, nfree;
882
883	if (!ufs_dirhashcheck)
884		return;
885	if ((dh = ufsdirhash_acquire(ip)) == NULL)
886		return;
887
888	block = offset / DIRBLKSIZ;
889	if ((offset & (DIRBLKSIZ - 1)) != 0 || block >= dh->dh_dirblks)
890		panic("ufsdirhash_checkblock: bad offset");
891
892	nfree = 0;
893	for (i = 0; i < DIRBLKSIZ; i += dp->d_reclen) {
894		dp = (struct direct *)(buf + i);
895		if (dp->d_reclen == 0 || i + dp->d_reclen > DIRBLKSIZ)
896			panic("ufsdirhash_checkblock: bad dir");
897
898		if (dp->d_ino == 0) {
899#if 0
900			/*
901			 * XXX entries with d_ino == 0 should only occur
902			 * at the start of a DIRBLKSIZ block. However the
903			 * ufs code is tolerant of such entries at other
904			 * offsets, and fsck does not fix them.
905			 */
906			if (i != 0)
907				panic("ufsdirhash_checkblock: bad dir inode");
908#endif
909			nfree += dp->d_reclen;
910			continue;
911		}
912
913		/* Check that the entry	exists (will panic if it doesn't). */
914		ufsdirhash_findslot(dh, dp->d_name, dp->d_namlen, offset + i);
915
916		nfree += dp->d_reclen - DIRSIZ(0, dp);
917	}
918	if (i != DIRBLKSIZ)
919		panic("ufsdirhash_checkblock: bad dir end");
920
921	if (dh->dh_blkfree[block] * DIRALIGN != nfree)
922		panic("ufsdirhash_checkblock: bad free count");
923
924	ffslot = BLKFREE2IDX(nfree / DIRALIGN);
925	for (i = 0; i <= DH_NFSTATS; i++)
926		if (dh->dh_firstfree[i] == block && i != ffslot)
927			panic("ufsdirhash_checkblock: bad first-free");
928	if (dh->dh_firstfree[ffslot] == -1)
929		panic("ufsdirhash_checkblock: missing first-free entry");
930	ufsdirhash_release(dh);
931}
932
933/*
934 * Hash the specified filename into a dirhash slot.
935 */
936static int
937ufsdirhash_hash(struct dirhash *dh, char *name, int namelen)
938{
939	u_int32_t hash;
940
941	/*
942	 * We hash the name and then some other bit of data that is
943	 * invariant over the dirhash's lifetime. Otherwise names
944	 * differing only in the last byte are placed close to one
945	 * another in the table, which is bad for linear probing.
946	 */
947	hash = fnv_32_buf(name, namelen, FNV1_32_INIT);
948	hash = fnv_32_buf(&dh, sizeof(dh), hash);
949	return (hash % dh->dh_hlen);
950}
951
952/*
953 * Adjust the number of free bytes in the block containing `offset'
954 * by the value specified by `diff'.
955 *
956 * The caller must ensure we have exclusive access to `dh'; normally
957 * that means that dh_lock should be held, but this is also called
958 * from ufsdirhash_build() where exclusive access can be assumed.
959 */
960static void
961ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff)
962{
963	int block, i, nfidx, ofidx;
964
965	/* Update the per-block summary info. */
966	block = offset / DIRBLKSIZ;
967	KASSERT(block < dh->dh_nblk && block < dh->dh_dirblks,
968	     ("dirhash bad offset"));
969	ofidx = BLKFREE2IDX(dh->dh_blkfree[block]);
970	dh->dh_blkfree[block] = (int)dh->dh_blkfree[block] + (diff / DIRALIGN);
971	nfidx = BLKFREE2IDX(dh->dh_blkfree[block]);
972
973	/* Update the `first free' list if necessary. */
974	if (ofidx != nfidx) {
975		/* If removing, scan forward for the next block. */
976		if (dh->dh_firstfree[ofidx] == block) {
977			for (i = block + 1; i < dh->dh_dirblks; i++)
978				if (BLKFREE2IDX(dh->dh_blkfree[i]) == ofidx)
979					break;
980			dh->dh_firstfree[ofidx] = (i < dh->dh_dirblks) ? i : -1;
981		}
982
983		/* Make this the new `first free' if necessary */
984		if (dh->dh_firstfree[nfidx] > block ||
985		    dh->dh_firstfree[nfidx] == -1)
986			dh->dh_firstfree[nfidx] = block;
987	}
988}
989
990/*
991 * Find the specified name which should have the specified offset.
992 * Returns a slot number, and panics on failure.
993 *
994 * `dh' must be locked on entry and remains so on return.
995 */
996static int
997ufsdirhash_findslot(struct dirhash *dh, char *name, int namelen, doff_t offset)
998{
999	int slot;
1000
1001	DIRHASH_ASSERT_LOCKED(dh);
1002
1003	/* Find the entry. */
1004	KASSERT(dh->dh_hused < dh->dh_hlen, ("dirhash find full"));
1005	slot = ufsdirhash_hash(dh, name, namelen);
1006	while (DH_ENTRY(dh, slot) != offset &&
1007	    DH_ENTRY(dh, slot) != DIRHASH_EMPTY)
1008		slot = WRAPINCR(slot, dh->dh_hlen);
1009	if (DH_ENTRY(dh, slot) != offset)
1010		panic("ufsdirhash_findslot: '%.*s' not found", namelen, name);
1011
1012	return (slot);
1013}
1014
1015/*
1016 * Remove the entry corresponding to the specified slot from the hash array.
1017 *
1018 * `dh' must be locked on entry and remains so on return.
1019 */
1020static void
1021ufsdirhash_delslot(struct dirhash *dh, int slot)
1022{
1023	int i;
1024
1025	DIRHASH_ASSERT_LOCKED(dh);
1026
1027	/* Mark the entry as deleted. */
1028	DH_ENTRY(dh, slot) = DIRHASH_DEL;
1029
1030	/* If this is the end of a chain of DIRHASH_DEL slots, remove them. */
1031	for (i = slot; DH_ENTRY(dh, i) == DIRHASH_DEL; )
1032		i = WRAPINCR(i, dh->dh_hlen);
1033	if (DH_ENTRY(dh, i) == DIRHASH_EMPTY) {
1034		i = WRAPDECR(i, dh->dh_hlen);
1035		while (DH_ENTRY(dh, i) == DIRHASH_DEL) {
1036			DH_ENTRY(dh, i) = DIRHASH_EMPTY;
1037			dh->dh_hused--;
1038			i = WRAPDECR(i, dh->dh_hlen);
1039		}
1040		KASSERT(dh->dh_hused >= 0, ("ufsdirhash_delslot neg hlen"));
1041	}
1042}
1043
1044/*
1045 * Given a directory entry and its offset, find the offset of the
1046 * previous entry in the same DIRBLKSIZ-sized block. Returns an
1047 * offset, or -1 if there is no previous entry in the block or some
1048 * other problem occurred.
1049 */
1050static doff_t
1051ufsdirhash_getprev(struct direct *dirp, doff_t offset)
1052{
1053	struct direct *dp;
1054	char *blkbuf;
1055	doff_t blkoff, prevoff;
1056	int entrypos, i;
1057
1058	blkoff = offset & ~(DIRBLKSIZ - 1);	/* offset of start of block */
1059	entrypos = offset & (DIRBLKSIZ - 1);	/* entry relative to block */
1060	blkbuf = (char *)dirp - entrypos;
1061	prevoff = blkoff;
1062
1063	/* If `offset' is the start of a block, there is no previous entry. */
1064	if (entrypos == 0)
1065		return (-1);
1066
1067	/* Scan from the start of the block until we get to the entry. */
1068	for (i = 0; i < entrypos; i += dp->d_reclen) {
1069		dp = (struct direct *)(blkbuf + i);
1070		if (dp->d_reclen == 0 || i + dp->d_reclen > entrypos)
1071			return (-1);	/* Corrupted directory. */
1072		prevoff = blkoff + i;
1073	}
1074	return (prevoff);
1075}
1076
1077/*
1078 * Try to free up `wanted' bytes by stealing memory from existing
1079 * dirhashes. Returns zero with list locked if successful.
1080 */
1081static int
1082ufsdirhash_recycle(int wanted)
1083{
1084	struct dirhash *dh;
1085	doff_t **hash;
1086	u_int8_t *blkfree;
1087	int i, mem, narrays;
1088
1089	DIRHASHLIST_LOCK();
1090	dh = TAILQ_FIRST(&ufsdirhash_list);
1091	while (wanted + ufs_dirhashmem > ufs_dirhashmaxmem) {
1092		/* Decrement the score; only recycle if it becomes zero. */
1093		if (dh == NULL || --dh->dh_score > 0) {
1094			DIRHASHLIST_UNLOCK();
1095			return (-1);
1096		}
1097		/*
1098		 * If we can't lock it it's in use and we don't want to
1099		 * recycle it anyway.
1100		 */
1101		if (lockmgr(&dh->dh_lock, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
1102			dh = TAILQ_NEXT(dh, dh_list);
1103			continue;
1104		}
1105		KASSERT(dh->dh_hash != NULL, ("dirhash: NULL hash on list"));
1106
1107		/* Remove it from the list and detach its memory. */
1108		TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
1109		dh->dh_onlist = 0;
1110		hash = dh->dh_hash;
1111		dh->dh_hash = NULL;
1112		blkfree = dh->dh_blkfree;
1113		dh->dh_blkfree = NULL;
1114		narrays = dh->dh_narrays;
1115		mem = dh->dh_memreq;
1116		dh->dh_memreq = 0;
1117
1118		/* Unlock everything, free the detached memory. */
1119		ufsdirhash_release(dh);
1120		DIRHASHLIST_UNLOCK();
1121		for (i = 0; i < narrays; i++)
1122			DIRHASH_BLKFREE(hash[i]);
1123		FREE(hash, M_DIRHASH);
1124		FREE(blkfree, M_DIRHASH);
1125
1126		/* Account for the returned memory, and repeat if necessary. */
1127		DIRHASHLIST_LOCK();
1128		ufs_dirhashmem -= mem;
1129		dh = TAILQ_FIRST(&ufsdirhash_list);
1130	}
1131	/* Success; return with list locked. */
1132	return (0);
1133}
1134
1135
1136void
1137ufsdirhash_init()
1138{
1139	ufsdirhash_zone = uma_zcreate("DIRHASH", DH_NBLKOFF * sizeof(doff_t),
1140	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
1141	mtx_init(&ufsdirhash_mtx, "dirhash list", NULL, MTX_DEF);
1142	TAILQ_INIT(&ufsdirhash_list);
1143}
1144
1145void
1146ufsdirhash_uninit()
1147{
1148	KASSERT(TAILQ_EMPTY(&ufsdirhash_list), ("ufsdirhash_uninit"));
1149	uma_zdestroy(ufsdirhash_zone);
1150	mtx_destroy(&ufsdirhash_mtx);
1151}
1152
1153#endif /* UFS_DIRHASH */
1154