ufs_dirhash.c revision 92098
1/*
2 * Copyright (c) 2001 Ian Dowse.  All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 *    notice, this list of conditions and the following disclaimer in the
11 *    documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
24 *
25 * $FreeBSD: head/sys/ufs/ufs/ufs_dirhash.c 92098 2002-03-11 19:13:22Z iedowse $
26 */
27/*
28 * This implements a hash-based lookup scheme for UFS directories.
29 */
30
31#include "opt_ufs.h"
32
33#ifdef UFS_DIRHASH
34
35#include <sys/param.h>
36#include <sys/systm.h>
37#include <sys/kernel.h>
38#include <sys/lock.h>
39#include <sys/mutex.h>
40#include <sys/malloc.h>
41#include <sys/fnv_hash.h>
42#include <sys/proc.h>
43#include <sys/bio.h>
44#include <sys/buf.h>
45#include <sys/vnode.h>
46#include <sys/mount.h>
47#include <sys/sysctl.h>
48#include <vm/vm_zone.h>
49
50#include <ufs/ufs/quota.h>
51#include <ufs/ufs/inode.h>
52#include <ufs/ufs/dir.h>
53#include <ufs/ufs/dirhash.h>
54#include <ufs/ufs/extattr.h>
55#include <ufs/ufs/ufsmount.h>
56#include <ufs/ufs/ufs_extern.h>
57
58#define WRAPINCR(val, limit)	(((val) + 1 == (limit)) ? 0 : ((val) + 1))
59#define OFSFMT(vp)		((vp)->v_mount->mnt_maxsymlinklen <= 0)
60#define BLKFREE2IDX(n)		((n) > DH_NFSTATS ? DH_NFSTATS : (n))
61
62static MALLOC_DEFINE(M_DIRHASH, "UFS dirhash", "UFS directory hash tables");
63
64SYSCTL_NODE(_vfs, OID_AUTO, ufs, CTLFLAG_RD, 0, "UFS filesystem");
65
66static int ufs_mindirhashsize = DIRBLKSIZ * 5;
67SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_minsize, CTLFLAG_RW,
68    &ufs_mindirhashsize,
69    0, "minimum directory size in bytes for which to use hashed lookup");
70static int ufs_dirhashmaxmem = 2 * 1024 * 1024;
71SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_maxmem, CTLFLAG_RW, &ufs_dirhashmaxmem,
72    0, "maximum allowed dirhash memory usage");
73static int ufs_dirhashmem;
74SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_mem, CTLFLAG_RD, &ufs_dirhashmem,
75    0, "current dirhash memory usage");
76static int ufs_dirhashcheck = 0;
77SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_docheck, CTLFLAG_RW, &ufs_dirhashcheck,
78    0, "enable extra sanity tests");
79
80
81static int ufsdirhash_hash(struct dirhash *dh, char *name, int namelen);
82static void ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff);
83static void ufsdirhash_delslot(struct dirhash *dh, int slot);
84static int ufsdirhash_findslot(struct dirhash *dh, char *name, int namelen,
85	   doff_t offset);
86static doff_t ufsdirhash_getprev(struct direct *dp, doff_t offset);
87static void ufsdirhash_init(void);
88static int ufsdirhash_recycle(int wanted);
89
90static vm_zone_t	ufsdirhash_zone;
91
92/* Dirhash list; recently-used entries are near the tail. */
93static TAILQ_HEAD(, dirhash) ufsdirhash_list;
94
95/* Protects: ufsdirhash_list, `dh_list' field, ufs_dirhashmem. */
96static struct mtx	ufsdirhash_mtx;
97
98/*
99 * Locking order:
100 *	ufsdirhash_mtx
101 *	dh_mtx
102 *
103 * The dh_mtx mutex should be aquired either via the inode lock, or via
104 * ufsdirhash_mtx. Only the owner of the inode may free the associated
105 * dirhash, but anything can steal its memory and set dh_hash to NULL.
106 */
107
108/*
109 * Attempt to build up a hash table for the directory contents in
110 * inode 'ip'. Returns 0 on success, or -1 of the operation failed.
111 */
112int
113ufsdirhash_build(struct inode *ip)
114{
115	struct dirhash *dh;
116	struct buf *bp = NULL;
117	struct direct *ep;
118	struct vnode *vp;
119	doff_t bmask, pos;
120	int dirblocks, i, j, memreqd, nblocks, narrays, nslots, slot;
121
122	/* Check if we can/should use dirhash. */
123	if (ip->i_dirhash == NULL) {
124		if (ip->i_size < ufs_mindirhashsize || OFSFMT(ip->i_vnode))
125			return (-1);
126	} else {
127		/* Hash exists, but sysctls could have changed. */
128		if (ip->i_size < ufs_mindirhashsize ||
129		    ufs_dirhashmem > ufs_dirhashmaxmem) {
130			ufsdirhash_free(ip);
131			return (-1);
132		}
133		/* Check if hash exists and is intact (note: unlocked read). */
134		if (ip->i_dirhash->dh_hash != NULL)
135			return (0);
136		/* Free the old, recycled hash and build a new one. */
137		ufsdirhash_free(ip);
138	}
139
140	/* Don't hash removed directories. */
141	if (ip->i_effnlink == 0)
142		return (-1);
143
144	vp = ip->i_vnode;
145	/* Allocate 50% more entries than this dir size could ever need. */
146	KASSERT(ip->i_size >= DIRBLKSIZ, ("ufsdirhash_build size"));
147	nslots = ip->i_size / DIRECTSIZ(1);
148	nslots = (nslots * 3 + 1) / 2;
149	narrays = howmany(nslots, DH_NBLKOFF);
150	nslots = narrays * DH_NBLKOFF;
151	dirblocks = howmany(ip->i_size, DIRBLKSIZ);
152	nblocks = (dirblocks * 3 + 1) / 2;
153
154	memreqd = sizeof(*dh) + narrays * sizeof(*dh->dh_hash) +
155	    narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
156	    nblocks * sizeof(*dh->dh_blkfree);
157	mtx_lock(&ufsdirhash_mtx);
158	if (memreqd + ufs_dirhashmem > ufs_dirhashmaxmem) {
159		mtx_unlock(&ufsdirhash_mtx);
160		if (memreqd > ufs_dirhashmaxmem / 2)
161			return (-1);
162
163		/* Try to free some space. */
164		if (ufsdirhash_recycle(memreqd) != 0)
165			return (-1);
166		/* Enough was freed, and ufsdirhash_mtx has been locked. */
167	}
168	ufs_dirhashmem += memreqd;
169	mtx_unlock(&ufsdirhash_mtx);
170
171	/*
172	 * Use non-blocking mallocs so that we will revert to a linear
173	 * lookup on failure rather than potentially blocking forever.
174	 */
175	MALLOC(dh, struct dirhash *, sizeof *dh, M_DIRHASH, M_NOWAIT | M_ZERO);
176	if (dh == NULL)
177		return (-1);
178	MALLOC(dh->dh_hash, doff_t **, narrays * sizeof(dh->dh_hash[0]),
179	    M_DIRHASH, M_NOWAIT | M_ZERO);
180	MALLOC(dh->dh_blkfree, u_int8_t *, nblocks * sizeof(dh->dh_blkfree[0]),
181	    M_DIRHASH, M_NOWAIT);
182	if (dh->dh_hash == NULL || dh->dh_blkfree == NULL)
183		goto fail;
184	for (i = 0; i < narrays; i++) {
185		if ((dh->dh_hash[i] = zalloc(ufsdirhash_zone)) == NULL)
186			goto fail;
187		for (j = 0; j < DH_NBLKOFF; j++)
188			dh->dh_hash[i][j] = DIRHASH_EMPTY;
189	}
190
191	/* Initialise the hash table and block statistics. */
192	mtx_init(&dh->dh_mtx, "dirhash", MTX_DEF);
193	dh->dh_narrays = narrays;
194	dh->dh_hlen = nslots;
195	dh->dh_nblk = nblocks;
196	dh->dh_dirblks = dirblocks;
197	for (i = 0; i < dirblocks; i++)
198		dh->dh_blkfree[i] = DIRBLKSIZ / DIRALIGN;
199	for (i = 0; i < DH_NFSTATS; i++)
200		dh->dh_firstfree[i] = -1;
201	dh->dh_firstfree[DH_NFSTATS] = 0;
202	dh->dh_seqopt = 0;
203	dh->dh_seqoff = 0;
204	dh->dh_score = DH_SCOREINIT;
205	ip->i_dirhash = dh;
206
207	bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
208	pos = 0;
209	while (pos < ip->i_size) {
210		/* If necessary, get the next directory block. */
211		if ((pos & bmask) == 0) {
212			if (bp != NULL)
213				brelse(bp);
214			if (UFS_BLKATOFF(vp, (off_t)pos, NULL, &bp) != 0)
215				goto fail;
216		}
217
218		/* Add this entry to the hash. */
219		ep = (struct direct *)((char *)bp->b_data + (pos & bmask));
220		if (ep->d_reclen == 0 || ep->d_reclen >
221		    DIRBLKSIZ - (pos & (DIRBLKSIZ - 1))) {
222			/* Corrupted directory. */
223			brelse(bp);
224			goto fail;
225		}
226		if (ep->d_ino != 0) {
227			/* Add the entry (simplified ufsdirhash_add). */
228			slot = ufsdirhash_hash(dh, ep->d_name, ep->d_namlen);
229			while (DH_ENTRY(dh, slot) != DIRHASH_EMPTY)
230				slot = WRAPINCR(slot, dh->dh_hlen);
231			dh->dh_hused++;
232			DH_ENTRY(dh, slot) = pos;
233			ufsdirhash_adjfree(dh, pos, -DIRSIZ(0, ep));
234		}
235		pos += ep->d_reclen;
236	}
237
238	if (bp != NULL)
239		brelse(bp);
240	mtx_lock(&ufsdirhash_mtx);
241	TAILQ_INSERT_TAIL(&ufsdirhash_list, dh, dh_list);
242	dh->dh_onlist = 1;
243	mtx_unlock(&ufsdirhash_mtx);
244	return (0);
245
246fail:
247	if (dh->dh_hash != NULL) {
248		for (i = 0; i < narrays; i++)
249			if (dh->dh_hash[i] != NULL)
250				zfree(ufsdirhash_zone, dh->dh_hash[i]);
251		FREE(dh->dh_hash, M_DIRHASH);
252	}
253	if (dh->dh_blkfree != NULL)
254		FREE(dh->dh_blkfree, M_DIRHASH);
255	FREE(dh, M_DIRHASH);
256	ip->i_dirhash = NULL;
257	mtx_lock(&ufsdirhash_mtx);
258	ufs_dirhashmem -= memreqd;
259	mtx_unlock(&ufsdirhash_mtx);
260	return (-1);
261}
262
263/*
264 * Free any hash table associated with inode 'ip'.
265 */
266void
267ufsdirhash_free(struct inode *ip)
268{
269	struct dirhash *dh;
270	int i, mem;
271
272	if ((dh = ip->i_dirhash) == NULL)
273		return;
274	mtx_lock(&ufsdirhash_mtx);
275	mtx_lock(&dh->dh_mtx);
276	if (dh->dh_onlist)
277		TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
278	mtx_unlock(&dh->dh_mtx);
279	mtx_unlock(&ufsdirhash_mtx);
280
281	/* The dirhash pointed to by 'dh' is exclusively ours now. */
282
283	mem = sizeof(*dh);
284	if (dh->dh_hash != NULL) {
285		for (i = 0; i < dh->dh_narrays; i++)
286			zfree(ufsdirhash_zone, dh->dh_hash[i]);
287		FREE(dh->dh_hash, M_DIRHASH);
288		FREE(dh->dh_blkfree, M_DIRHASH);
289		mem += dh->dh_narrays * sizeof(*dh->dh_hash) +
290		    dh->dh_narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
291		    dh->dh_nblk * sizeof(*dh->dh_blkfree);
292	}
293	mtx_destroy(&dh->dh_mtx);
294	FREE(dh, M_DIRHASH);
295	ip->i_dirhash = NULL;
296
297	mtx_lock(&ufsdirhash_mtx);
298	ufs_dirhashmem -= mem;
299	mtx_unlock(&ufsdirhash_mtx);
300}
301
302/*
303 * Find the offset of the specified name within the given inode.
304 * Returns 0 on success, ENOENT if the entry does not exist, or
305 * EJUSTRETURN if the caller should revert to a linear search.
306 *
307 * If successful, the directory offset is stored in *offp, and a
308 * pointer to a struct buf containing the entry is stored in *bpp. If
309 * prevoffp is non-NULL, the offset of the previous entry within
310 * the DIRBLKSIZ-sized block is stored in *prevoffp (if the entry
311 * is the first in a block, the start of the block is used).
312 */
313int
314ufsdirhash_lookup(struct inode *ip, char *name, int namelen, doff_t *offp,
315    struct buf **bpp, doff_t *prevoffp)
316{
317	struct dirhash *dh, *dh_next;
318	struct direct *dp;
319	struct vnode *vp;
320	struct buf *bp;
321	doff_t blkoff, bmask, offset, prevoff;
322	int i, slot;
323
324	if ((dh = ip->i_dirhash) == NULL)
325		return (EJUSTRETURN);
326	/*
327	 * Move this dirhash towards the end of the list if it has a
328	 * score higher than the next entry, and aquire the dh_mtx.
329	 * Optimise the case where it's already the last by performing
330	 * an unlocked read of the TAILQ_NEXT pointer.
331	 *
332	 * In both cases, end up holding just dh_mtx.
333	 */
334	if (TAILQ_NEXT(dh, dh_list) != NULL) {
335		mtx_lock(&ufsdirhash_mtx);
336		mtx_lock(&dh->dh_mtx);
337		/*
338		 * If the new score will be greater than that of the next
339		 * entry, then move this entry past it. With both mutexes
340		 * held, dh_next won't go away, but its dh_score could
341		 * change; that's not important since it is just a hint.
342		 */
343		if (dh->dh_hash != NULL &&
344		    (dh_next = TAILQ_NEXT(dh, dh_list)) != NULL &&
345		    dh->dh_score >= dh_next->dh_score) {
346			KASSERT(dh->dh_onlist, ("dirhash: not on list"));
347			TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
348			TAILQ_INSERT_AFTER(&ufsdirhash_list, dh_next, dh,
349			    dh_list);
350		}
351		mtx_unlock(&ufsdirhash_mtx);
352	} else {
353		/* Already the last, though that could change as we wait. */
354		mtx_lock(&dh->dh_mtx);
355	}
356	if (dh->dh_hash == NULL) {
357		mtx_unlock(&dh->dh_mtx);
358		ufsdirhash_free(ip);
359		return (EJUSTRETURN);
360	}
361
362	/* Update the score. */
363	if (dh->dh_score < DH_SCOREMAX)
364		dh->dh_score++;
365
366	vp = ip->i_vnode;
367	bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
368	blkoff = -1;
369	bp = NULL;
370restart:
371	slot = ufsdirhash_hash(dh, name, namelen);
372
373	if (dh->dh_seqopt) {
374		/*
375		 * Sequential access optimisation. dh_seqoff contains the
376		 * offset of the directory entry immediately following
377		 * the last entry that was looked up. Check if this offset
378		 * appears in the hash chain for the name we are looking for.
379		 */
380		for (i = slot; (offset = DH_ENTRY(dh, i)) != DIRHASH_EMPTY;
381		    i = WRAPINCR(i, dh->dh_hlen))
382			if (offset == dh->dh_seqoff)
383				break;
384		if (offset == dh->dh_seqoff) {
385			/*
386			 * We found an entry with the expected offset. This
387			 * is probably the entry we want, but if not, the
388			 * code below will turn off seqoff and retry.
389			 */
390			slot = i;
391		} else
392			dh->dh_seqopt = 0;
393	}
394
395	for (; (offset = DH_ENTRY(dh, slot)) != DIRHASH_EMPTY;
396	    slot = WRAPINCR(slot, dh->dh_hlen)) {
397		if (offset == DIRHASH_DEL)
398			continue;
399		mtx_unlock(&dh->dh_mtx);
400
401		if (offset < 0 || offset >= ip->i_size)
402			panic("ufsdirhash_lookup: bad offset in hash array");
403		if ((offset & ~bmask) != blkoff) {
404			if (bp != NULL)
405				brelse(bp);
406			blkoff = offset & ~bmask;
407			if (UFS_BLKATOFF(vp, (off_t)blkoff, NULL, &bp) != 0)
408				return (EJUSTRETURN);
409		}
410		dp = (struct direct *)(bp->b_data + (offset & bmask));
411		if (dp->d_reclen == 0 || dp->d_reclen >
412		    DIRBLKSIZ - (offset & (DIRBLKSIZ - 1))) {
413			/* Corrupted directory. */
414			brelse(bp);
415			return (EJUSTRETURN);
416		}
417		if (dp->d_namlen == namelen &&
418		    bcmp(dp->d_name, name, namelen) == 0) {
419			/* Found. Get the prev offset if needed. */
420			if (prevoffp != NULL) {
421				if (offset & (DIRBLKSIZ - 1)) {
422					prevoff = ufsdirhash_getprev(dp,
423					    offset);
424					if (prevoff == -1) {
425						brelse(bp);
426						return (EJUSTRETURN);
427					}
428				} else
429					prevoff = offset;
430				*prevoffp = prevoff;
431			}
432
433			/* Check for sequential access, and update offset. */
434			if (dh->dh_seqopt == 0 && dh->dh_seqoff == offset)
435				dh->dh_seqopt = 1;
436			dh->dh_seqoff = offset + DIRSIZ(0, dp);
437
438			*bpp = bp;
439			*offp = offset;
440			return (0);
441		}
442
443		mtx_lock(&dh->dh_mtx);
444		if (dh->dh_hash == NULL) {
445			mtx_unlock(&dh->dh_mtx);
446			if (bp != NULL)
447				brelse(bp);
448			ufsdirhash_free(ip);
449			return (EJUSTRETURN);
450		}
451		/*
452		 * When the name doesn't match in the seqopt case, go back
453		 * and search normally.
454		 */
455		if (dh->dh_seqopt) {
456			dh->dh_seqopt = 0;
457			goto restart;
458		}
459	}
460	mtx_unlock(&dh->dh_mtx);
461	if (bp != NULL)
462		brelse(bp);
463	return (ENOENT);
464}
465
466/*
467 * Find a directory block with room for 'slotneeded' bytes. Returns
468 * the offset of the directory entry that begins the free space.
469 * This will either be the offset of an existing entry that has free
470 * space at the end, or the offset of an entry with d_ino == 0 at
471 * the start of a DIRBLKSIZ block.
472 *
473 * To use the space, the caller may need to compact existing entries in
474 * the directory. The total number of bytes in all of the entries involved
475 * in the compaction is stored in *slotsize. In other words, all of
476 * the entries that must be compacted are exactly contained in the
477 * region beginning at the returned offset and spanning *slotsize bytes.
478 *
479 * Returns -1 if no space was found, indicating that the directory
480 * must be extended.
481 */
482doff_t
483ufsdirhash_findfree(struct inode *ip, int slotneeded, int *slotsize)
484{
485	struct direct *dp;
486	struct dirhash *dh;
487	struct buf *bp;
488	doff_t pos, slotstart;
489	int dirblock, error, freebytes, i;
490
491	if ((dh = ip->i_dirhash) == NULL)
492		return (-1);
493	mtx_lock(&dh->dh_mtx);
494	if (dh->dh_hash == NULL) {
495		mtx_unlock(&dh->dh_mtx);
496		ufsdirhash_free(ip);
497		return (-1);
498	}
499
500	/* Find a directory block with the desired free space. */
501	dirblock = -1;
502	for (i = howmany(slotneeded, DIRALIGN); i <= DH_NFSTATS; i++)
503		if ((dirblock = dh->dh_firstfree[i]) != -1)
504			break;
505	if (dirblock == -1) {
506		mtx_unlock(&dh->dh_mtx);
507		return (-1);
508	}
509
510	KASSERT(dirblock < dh->dh_nblk &&
511	    dh->dh_blkfree[dirblock] >= howmany(slotneeded, DIRALIGN),
512	    ("ufsdirhash_findfree: bad stats"));
513	mtx_unlock(&dh->dh_mtx);
514	pos = dirblock * DIRBLKSIZ;
515	error = UFS_BLKATOFF(ip->i_vnode, (off_t)pos, (char **)&dp, &bp);
516	if (error)
517		return (-1);
518
519	/* Find the first entry with free space. */
520	for (i = 0; i < DIRBLKSIZ; ) {
521		if (dp->d_reclen == 0) {
522			brelse(bp);
523			return (-1);
524		}
525		if (dp->d_ino == 0 || dp->d_reclen > DIRSIZ(0, dp))
526			break;
527		i += dp->d_reclen;
528		dp = (struct direct *)((char *)dp + dp->d_reclen);
529	}
530	if (i > DIRBLKSIZ) {
531		brelse(bp);
532		return (-1);
533	}
534	slotstart = pos + i;
535
536	/* Find the range of entries needed to get enough space */
537	freebytes = 0;
538	while (i < DIRBLKSIZ && freebytes < slotneeded) {
539		freebytes += dp->d_reclen;
540		if (dp->d_ino != 0)
541			freebytes -= DIRSIZ(0, dp);
542		if (dp->d_reclen == 0) {
543			brelse(bp);
544			return (-1);
545		}
546		i += dp->d_reclen;
547		dp = (struct direct *)((char *)dp + dp->d_reclen);
548	}
549	if (i > DIRBLKSIZ) {
550		brelse(bp);
551		return (-1);
552	}
553	if (freebytes < slotneeded)
554		panic("ufsdirhash_findfree: free mismatch");
555	brelse(bp);
556	*slotsize = pos + i - slotstart;
557	return (slotstart);
558}
559
560/*
561 * Return the start of the unused space at the end of a directory, or
562 * -1 if there are no trailing unused blocks.
563 */
564doff_t
565ufsdirhash_enduseful(struct inode *ip)
566{
567
568	struct dirhash *dh;
569	int i;
570
571	if ((dh = ip->i_dirhash) == NULL)
572		return (-1);
573	mtx_lock(&dh->dh_mtx);
574	if (dh->dh_hash == NULL) {
575		mtx_unlock(&dh->dh_mtx);
576		ufsdirhash_free(ip);
577		return (-1);
578	}
579
580	if (dh->dh_blkfree[dh->dh_dirblks - 1] != DIRBLKSIZ / DIRALIGN) {
581		mtx_unlock(&dh->dh_mtx);
582		return (-1);
583	}
584
585	for (i = dh->dh_dirblks - 1; i >= 0; i--)
586		if (dh->dh_blkfree[i] != DIRBLKSIZ / DIRALIGN)
587			break;
588	mtx_unlock(&dh->dh_mtx);
589	return ((doff_t)(i + 1) * DIRBLKSIZ);
590}
591
592/*
593 * Insert information into the hash about a new directory entry. dirp
594 * points to a struct direct containing the entry, and offset specifies
595 * the offset of this entry.
596 */
597void
598ufsdirhash_add(struct inode *ip, struct direct *dirp, doff_t offset)
599{
600	struct dirhash *dh;
601	int slot;
602
603	if ((dh = ip->i_dirhash) == NULL)
604		return;
605	mtx_lock(&dh->dh_mtx);
606	if (dh->dh_hash == NULL) {
607		mtx_unlock(&dh->dh_mtx);
608		ufsdirhash_free(ip);
609		return;
610	}
611
612	KASSERT(offset < dh->dh_dirblks * DIRBLKSIZ,
613	    ("ufsdirhash_add: bad offset"));
614	/*
615	 * Normal hash usage is < 66%. If the usage gets too high then
616	 * remove the hash entirely and let it be rebuilt later.
617	 */
618	if (dh->dh_hused >= (dh->dh_hlen * 3) / 4) {
619		mtx_unlock(&dh->dh_mtx);
620		ufsdirhash_free(ip);
621		return;
622	}
623
624	/* Find a free hash slot (empty or deleted), and add the entry. */
625	slot = ufsdirhash_hash(dh, dirp->d_name, dirp->d_namlen);
626	while (DH_ENTRY(dh, slot) >= 0)
627		slot = WRAPINCR(slot, dh->dh_hlen);
628	if (DH_ENTRY(dh, slot) == DIRHASH_EMPTY)
629		dh->dh_hused++;
630	DH_ENTRY(dh, slot) = offset;
631
632	/* Update the per-block summary info. */
633	ufsdirhash_adjfree(dh, offset, -DIRSIZ(0, dirp));
634	mtx_unlock(&dh->dh_mtx);
635}
636
637/*
638 * Remove the specified directory entry from the hash. The entry to remove
639 * is defined by the name in `dirp', which must exist at the specified
640 * `offset' within the directory.
641 */
642void
643ufsdirhash_remove(struct inode *ip, struct direct *dirp, doff_t offset)
644{
645	struct dirhash *dh;
646	int slot;
647
648	if ((dh = ip->i_dirhash) == NULL)
649		return;
650	mtx_lock(&dh->dh_mtx);
651	if (dh->dh_hash == NULL) {
652		mtx_unlock(&dh->dh_mtx);
653		ufsdirhash_free(ip);
654		return;
655	}
656
657	KASSERT(offset < dh->dh_dirblks * DIRBLKSIZ,
658	    ("ufsdirhash_remove: bad offset"));
659	/* Find the entry */
660	slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, offset);
661
662	/* Remove the hash entry. */
663	ufsdirhash_delslot(dh, slot);
664
665	/* Update the per-block summary info. */
666	ufsdirhash_adjfree(dh, offset, DIRSIZ(0, dirp));
667	mtx_unlock(&dh->dh_mtx);
668}
669
670/*
671 * Change the offset associated with a directory entry in the hash. Used
672 * when compacting directory blocks.
673 */
674void
675ufsdirhash_move(struct inode *ip, struct direct *dirp, doff_t oldoff,
676    doff_t newoff)
677{
678	struct dirhash *dh;
679	int slot;
680
681	if ((dh = ip->i_dirhash) == NULL)
682		return;
683	mtx_lock(&dh->dh_mtx);
684	if (dh->dh_hash == NULL) {
685		mtx_unlock(&dh->dh_mtx);
686		ufsdirhash_free(ip);
687		return;
688	}
689
690	KASSERT(oldoff < dh->dh_dirblks * DIRBLKSIZ &&
691	    newoff < dh->dh_dirblks * DIRBLKSIZ,
692	    ("ufsdirhash_move: bad offset"));
693	/* Find the entry, and update the offset. */
694	slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, oldoff);
695	DH_ENTRY(dh, slot) = newoff;
696	mtx_unlock(&dh->dh_mtx);
697}
698
699/*
700 * Inform dirhash that the directory has grown by one block that
701 * begins at offset (i.e. the new length is offset + DIRBLKSIZ).
702 */
703void
704ufsdirhash_newblk(struct inode *ip, doff_t offset)
705{
706	struct dirhash *dh;
707	int block;
708
709	if ((dh = ip->i_dirhash) == NULL)
710		return;
711	mtx_lock(&dh->dh_mtx);
712	if (dh->dh_hash == NULL) {
713		mtx_unlock(&dh->dh_mtx);
714		ufsdirhash_free(ip);
715		return;
716	}
717
718	KASSERT(offset == dh->dh_dirblks * DIRBLKSIZ,
719	    ("ufsdirhash_newblk: bad offset"));
720	block = offset / DIRBLKSIZ;
721	if (block >= dh->dh_nblk) {
722		/* Out of space; must rebuild. */
723		mtx_unlock(&dh->dh_mtx);
724		ufsdirhash_free(ip);
725		return;
726	}
727	dh->dh_dirblks = block + 1;
728
729	/* Account for the new free block. */
730	dh->dh_blkfree[block] = DIRBLKSIZ / DIRALIGN;
731	if (dh->dh_firstfree[DH_NFSTATS] == -1)
732		dh->dh_firstfree[DH_NFSTATS] = block;
733	mtx_unlock(&dh->dh_mtx);
734}
735
736/*
737 * Inform dirhash that the directory is being truncated.
738 */
739void
740ufsdirhash_dirtrunc(struct inode *ip, doff_t offset)
741{
742	struct dirhash *dh;
743	int block, i;
744
745	if ((dh = ip->i_dirhash) == NULL)
746		return;
747	mtx_lock(&dh->dh_mtx);
748	if (dh->dh_hash == NULL) {
749		mtx_unlock(&dh->dh_mtx);
750		ufsdirhash_free(ip);
751		return;
752	}
753
754	KASSERT(offset <= dh->dh_dirblks * DIRBLKSIZ,
755	    ("ufsdirhash_dirtrunc: bad offset"));
756	block = howmany(offset, DIRBLKSIZ);
757	/*
758	 * If the directory shrinks to less than 1/8 of dh_nblk blocks
759	 * (about 20% of its original size due to the 50% extra added in
760	 * ufsdirhash_build) then free it, and let the caller rebuild
761	 * if necessary.
762	 */
763	if (block < dh->dh_nblk / 8 && dh->dh_narrays > 1) {
764		mtx_unlock(&dh->dh_mtx);
765		ufsdirhash_free(ip);
766		return;
767	}
768
769	/*
770	 * Remove any `first free' information pertaining to the
771	 * truncated blocks. All blocks we're removing should be
772	 * completely unused.
773	 */
774	if (dh->dh_firstfree[DH_NFSTATS] >= block)
775		dh->dh_firstfree[DH_NFSTATS] = -1;
776	for (i = block; i < dh->dh_dirblks; i++)
777		if (dh->dh_blkfree[i] != DIRBLKSIZ / DIRALIGN)
778			panic("ufsdirhash_dirtrunc: blocks in use");
779	for (i = 0; i < DH_NFSTATS; i++)
780		if (dh->dh_firstfree[i] >= block)
781			panic("ufsdirhash_dirtrunc: first free corrupt");
782	dh->dh_dirblks = block;
783	mtx_unlock(&dh->dh_mtx);
784}
785
786/*
787 * Debugging function to check that the dirhash information about
788 * a directory block matches its actual contents. Panics if a mismatch
789 * is detected.
790 *
791 * On entry, `buf' should point to the start of an in-core
792 * DIRBLKSIZ-sized directory block, and `offset' should contain the
793 * offset from the start of the directory of that block.
794 */
795void
796ufsdirhash_checkblock(struct inode *ip, char *buf, doff_t offset)
797{
798	struct dirhash *dh;
799	struct direct *dp;
800	int block, ffslot, i, nfree;
801
802	if (!ufs_dirhashcheck)
803		return;
804	if ((dh = ip->i_dirhash) == NULL)
805		return;
806	mtx_lock(&dh->dh_mtx);
807	if (dh->dh_hash == NULL) {
808		mtx_unlock(&dh->dh_mtx);
809		ufsdirhash_free(ip);
810		return;
811	}
812
813	block = offset / DIRBLKSIZ;
814	if ((offset & (DIRBLKSIZ - 1)) != 0 || block >= dh->dh_dirblks)
815		panic("ufsdirhash_checkblock: bad offset");
816
817	nfree = 0;
818	for (i = 0; i < DIRBLKSIZ; i += dp->d_reclen) {
819		dp = (struct direct *)(buf + i);
820		if (dp->d_reclen == 0 || i + dp->d_reclen > DIRBLKSIZ)
821			panic("ufsdirhash_checkblock: bad dir");
822
823		if (dp->d_ino == 0) {
824#if 0
825			/*
826			 * XXX entries with d_ino == 0 should only occur
827			 * at the start of a DIRBLKSIZ block. However the
828			 * ufs code is tolerant of such entries at other
829			 * offsets, and fsck does not fix them.
830			 */
831			if (i != 0)
832				panic("ufsdirhash_checkblock: bad dir inode");
833#endif
834			nfree += dp->d_reclen;
835			continue;
836		}
837
838		/* Check that the entry	exists (will panic if it doesn't). */
839		ufsdirhash_findslot(dh, dp->d_name, dp->d_namlen, offset + i);
840
841		nfree += dp->d_reclen - DIRSIZ(0, dp);
842	}
843	if (i != DIRBLKSIZ)
844		panic("ufsdirhash_checkblock: bad dir end");
845
846	if (dh->dh_blkfree[block] * DIRALIGN != nfree)
847		panic("ufsdirhash_checkblock: bad free count");
848
849	ffslot = BLKFREE2IDX(nfree / DIRALIGN);
850	for (i = 0; i <= DH_NFSTATS; i++)
851		if (dh->dh_firstfree[i] == block && i != ffslot)
852			panic("ufsdirhash_checkblock: bad first-free");
853	if (dh->dh_firstfree[ffslot] == -1)
854		panic("ufsdirhash_checkblock: missing first-free entry");
855	mtx_unlock(&dh->dh_mtx);
856}
857
858/*
859 * Hash the specified filename into a dirhash slot.
860 */
861static int
862ufsdirhash_hash(struct dirhash *dh, char *name, int namelen)
863{
864	return (fnv_32_buf(name, namelen, FNV1_32_INIT) % dh->dh_hlen);
865}
866
867/*
868 * Adjust the number of free bytes in the block containing `offset'
869 * by the value specified by `diff'.
870 *
871 * The caller must ensure we have exclusive access to `dh'; normally
872 * that means that dh_mtx should be held, but this is also called
873 * from ufsdirhash_build() where exclusive access can be assumed.
874 */
875static void
876ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff)
877{
878	int block, i, nfidx, ofidx;
879
880	/* Update the per-block summary info. */
881	block = offset / DIRBLKSIZ;
882	KASSERT(block < dh->dh_nblk && block < dh->dh_dirblks,
883	     ("dirhash bad offset"));
884	ofidx = BLKFREE2IDX(dh->dh_blkfree[block]);
885	dh->dh_blkfree[block] = (int)dh->dh_blkfree[block] + (diff / DIRALIGN);
886	nfidx = BLKFREE2IDX(dh->dh_blkfree[block]);
887
888	/* Update the `first free' list if necessary. */
889	if (ofidx != nfidx) {
890		/* If removing, scan forward for the next block. */
891		if (dh->dh_firstfree[ofidx] == block) {
892			for (i = block + 1; i < dh->dh_dirblks; i++)
893				if (BLKFREE2IDX(dh->dh_blkfree[i]) == ofidx)
894					break;
895			dh->dh_firstfree[ofidx] = (i < dh->dh_dirblks) ? i : -1;
896		}
897
898		/* Make this the new `first free' if necessary */
899		if (dh->dh_firstfree[nfidx] > block ||
900		    dh->dh_firstfree[nfidx] == -1)
901			dh->dh_firstfree[nfidx] = block;
902	}
903}
904
905/*
906 * Find the specified name which should have the specified offset.
907 * Returns a slot number, and panics on failure.
908 *
909 * `dh' must be locked on entry and remains so on return.
910 */
911static int
912ufsdirhash_findslot(struct dirhash *dh, char *name, int namelen, doff_t offset)
913{
914	int slot;
915
916	mtx_assert(&dh->dh_mtx, MA_OWNED);
917
918	/* Find the entry. */
919	KASSERT(dh->dh_hused < dh->dh_hlen, ("dirhash find full"));
920	slot = ufsdirhash_hash(dh, name, namelen);
921	while (DH_ENTRY(dh, slot) != offset &&
922	    DH_ENTRY(dh, slot) != DIRHASH_EMPTY)
923		slot = WRAPINCR(slot, dh->dh_hlen);
924	if (DH_ENTRY(dh, slot) != offset)
925		panic("ufsdirhash_findslot: '%.*s' not found", namelen, name);
926
927	return (slot);
928}
929
930/*
931 * Remove the entry corresponding to the specified slot from the hash array.
932 *
933 * `dh' must be locked on entry and remains so on return.
934 */
935static void
936ufsdirhash_delslot(struct dirhash *dh, int slot)
937{
938	int i;
939
940	mtx_assert(&dh->dh_mtx, MA_OWNED);
941
942	/* Mark the entry as deleted. */
943	DH_ENTRY(dh, slot) = DIRHASH_DEL;
944
945	/* If this is the end of a chain of DIRHASH_DEL slots, remove them. */
946	for (i = slot; DH_ENTRY(dh, i) == DIRHASH_DEL; )
947		i = WRAPINCR(i, dh->dh_hlen);
948	if (DH_ENTRY(dh, i) == DIRHASH_EMPTY) {
949		for (i = slot; DH_ENTRY(dh, i) == DIRHASH_DEL; ) {
950			DH_ENTRY(dh, i) = DIRHASH_EMPTY;
951			dh->dh_hused--;
952			i = WRAPINCR(i, dh->dh_hlen);
953		}
954		KASSERT(dh->dh_hused >= 0, ("ufsdirhash_delslot neg hlen"));
955	}
956}
957
958/*
959 * Given a directory entry and its offset, find the offset of the
960 * previous entry in the same DIRBLKSIZ-sized block. Returns an
961 * offset, or -1 if there is no previous entry in the block or some
962 * other problem occurred.
963 */
964static doff_t
965ufsdirhash_getprev(struct direct *dirp, doff_t offset)
966{
967	struct direct *dp;
968	char *blkbuf;
969	doff_t blkoff, prevoff;
970	int entrypos, i;
971
972	blkoff = offset & ~(DIRBLKSIZ - 1);	/* offset of start of block */
973	entrypos = offset & (DIRBLKSIZ - 1);	/* entry relative to block */
974	blkbuf = (char *)dirp - entrypos;
975	prevoff = blkoff;
976
977	/* If `offset' is the start of a block, there is no previous entry. */
978	if (entrypos == 0)
979		return (-1);
980
981	/* Scan from the start of the block until we get to the entry. */
982	for (i = 0; i < entrypos; i += dp->d_reclen) {
983		dp = (struct direct *)(blkbuf + i);
984		if (dp->d_reclen == 0 || i + dp->d_reclen > entrypos)
985			return (-1);	/* Corrupted directory. */
986		prevoff = blkoff + i;
987	}
988	return (prevoff);
989}
990
991/*
992 * Try to free up `wanted' bytes by stealing memory from existing
993 * dirhashes. Returns zero with ufsdirhash_mtx locked if successful.
994 */
995static int
996ufsdirhash_recycle(int wanted)
997{
998	struct dirhash *dh;
999	doff_t **hash;
1000	u_int8_t *blkfree;
1001	int i, mem, narrays;
1002
1003	mtx_lock(&ufsdirhash_mtx);
1004	while (wanted + ufs_dirhashmem > ufs_dirhashmaxmem) {
1005		/* Find a dirhash, and lock it. */
1006		if ((dh = TAILQ_FIRST(&ufsdirhash_list)) == NULL) {
1007			mtx_unlock(&ufsdirhash_mtx);
1008			return (-1);
1009		}
1010		mtx_lock(&dh->dh_mtx);
1011		KASSERT(dh->dh_hash != NULL, ("dirhash: NULL hash on list"));
1012
1013		/* Decrement the score; only recycle if it becomes zero. */
1014		if (--dh->dh_score > 0) {
1015			mtx_unlock(&dh->dh_mtx);
1016			mtx_unlock(&ufsdirhash_mtx);
1017			return (-1);
1018		}
1019
1020		/* Remove it from the list and detach its memory. */
1021		TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
1022		dh->dh_onlist = 0;
1023		hash = dh->dh_hash;
1024		dh->dh_hash = NULL;
1025		blkfree = dh->dh_blkfree;
1026		dh->dh_blkfree = NULL;
1027		narrays = dh->dh_narrays;
1028		mem = narrays * sizeof(*dh->dh_hash) +
1029		    narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
1030		    dh->dh_nblk * sizeof(*dh->dh_blkfree);
1031
1032		/* Unlock everything, free the detached memory. */
1033		mtx_unlock(&dh->dh_mtx);
1034		mtx_unlock(&ufsdirhash_mtx);
1035		for (i = 0; i < narrays; i++)
1036			zfree(ufsdirhash_zone, hash[i]);
1037		FREE(hash, M_DIRHASH);
1038		FREE(blkfree, M_DIRHASH);
1039
1040		/* Account for the returned memory, and repeat if necessary. */
1041		mtx_lock(&ufsdirhash_mtx);
1042		ufs_dirhashmem -= mem;
1043	}
1044	/* Success; return with ufsdirhash_mtx locked. */
1045	return (0);
1046}
1047
1048
1049static void
1050ufsdirhash_init()
1051{
1052	ufsdirhash_zone = zinit("DIRHASH", DH_NBLKOFF * sizeof(daddr_t), 0,
1053	    0, 1);
1054	mtx_init(&ufsdirhash_mtx, "dirhash list", MTX_DEF);
1055	TAILQ_INIT(&ufsdirhash_list);
1056}
1057SYSINIT(ufsdirhash, SI_SUB_PSEUDO, SI_ORDER_ANY, ufsdirhash_init, NULL)
1058
1059
1060#endif /* UFS_DIRHASH */
1061