ufs_dirhash.c revision 111119
1/*
2 * Copyright (c) 2001, 2002 Ian Dowse.  All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 *    notice, this list of conditions and the following disclaimer in the
11 *    documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
24 *
25 * $FreeBSD: head/sys/ufs/ufs/ufs_dirhash.c 111119 2003-02-19 05:47:46Z imp $
26 */
27/*
28 * This implements a hash-based lookup scheme for UFS directories.
29 */
30
31#include "opt_ufs.h"
32
33#ifdef UFS_DIRHASH
34
35#include <sys/param.h>
36#include <sys/systm.h>
37#include <sys/kernel.h>
38#include <sys/lock.h>
39#include <sys/mutex.h>
40#include <sys/malloc.h>
41#include <sys/fnv_hash.h>
42#include <sys/proc.h>
43#include <sys/bio.h>
44#include <sys/buf.h>
45#include <sys/vnode.h>
46#include <sys/mount.h>
47#include <sys/sysctl.h>
48#include <vm/uma.h>
49
50#include <ufs/ufs/quota.h>
51#include <ufs/ufs/inode.h>
52#include <ufs/ufs/dir.h>
53#include <ufs/ufs/dirhash.h>
54#include <ufs/ufs/extattr.h>
55#include <ufs/ufs/ufsmount.h>
56#include <ufs/ufs/ufs_extern.h>
57
58#define WRAPINCR(val, limit)	(((val) + 1 == (limit)) ? 0 : ((val) + 1))
59#define WRAPDECR(val, limit)	(((val) == 0) ? ((limit) - 1) : ((val) - 1))
60#define OFSFMT(vp)		((vp)->v_mount->mnt_maxsymlinklen <= 0)
61#define BLKFREE2IDX(n)		((n) > DH_NFSTATS ? DH_NFSTATS : (n))
62
63static MALLOC_DEFINE(M_DIRHASH, "UFS dirhash", "UFS directory hash tables");
64
65SYSCTL_NODE(_vfs, OID_AUTO, ufs, CTLFLAG_RD, 0, "UFS filesystem");
66
67static int ufs_mindirhashsize = DIRBLKSIZ * 5;
68SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_minsize, CTLFLAG_RW,
69    &ufs_mindirhashsize,
70    0, "minimum directory size in bytes for which to use hashed lookup");
71static int ufs_dirhashmaxmem = 2 * 1024 * 1024;
72SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_maxmem, CTLFLAG_RW, &ufs_dirhashmaxmem,
73    0, "maximum allowed dirhash memory usage");
74static int ufs_dirhashmem;
75SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_mem, CTLFLAG_RD, &ufs_dirhashmem,
76    0, "current dirhash memory usage");
77static int ufs_dirhashcheck = 0;
78SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_docheck, CTLFLAG_RW, &ufs_dirhashcheck,
79    0, "enable extra sanity tests");
80
81
82static int ufsdirhash_hash(struct dirhash *dh, char *name, int namelen);
83static void ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff);
84static void ufsdirhash_delslot(struct dirhash *dh, int slot);
85static int ufsdirhash_findslot(struct dirhash *dh, char *name, int namelen,
86	   doff_t offset);
87static doff_t ufsdirhash_getprev(struct direct *dp, doff_t offset);
88static int ufsdirhash_recycle(int wanted);
89
90static uma_zone_t	ufsdirhash_zone;
91
92/* Dirhash list; recently-used entries are near the tail. */
93static TAILQ_HEAD(, dirhash) ufsdirhash_list;
94
95/* Protects: ufsdirhash_list, `dh_list' field, ufs_dirhashmem. */
96static struct mtx	ufsdirhash_mtx;
97
98/*
99 * Locking order:
100 *	ufsdirhash_mtx
101 *	dh_mtx
102 *
103 * The dh_mtx mutex should be acquired either via the inode lock, or via
104 * ufsdirhash_mtx. Only the owner of the inode may free the associated
105 * dirhash, but anything can steal its memory and set dh_hash to NULL.
106 */
107
108/*
109 * Attempt to build up a hash table for the directory contents in
110 * inode 'ip'. Returns 0 on success, or -1 of the operation failed.
111 */
112int
113ufsdirhash_build(struct inode *ip)
114{
115	struct dirhash *dh;
116	struct buf *bp = NULL;
117	struct direct *ep;
118	struct vnode *vp;
119	doff_t bmask, pos;
120	int dirblocks, i, j, memreqd, nblocks, narrays, nslots, slot;
121
122	/* Check if we can/should use dirhash. */
123	if (ip->i_dirhash == NULL) {
124		if (ip->i_size < ufs_mindirhashsize || OFSFMT(ip->i_vnode))
125			return (-1);
126	} else {
127		/* Hash exists, but sysctls could have changed. */
128		if (ip->i_size < ufs_mindirhashsize ||
129		    ufs_dirhashmem > ufs_dirhashmaxmem) {
130			ufsdirhash_free(ip);
131			return (-1);
132		}
133		/* Check if hash exists and is intact (note: unlocked read). */
134		if (ip->i_dirhash->dh_hash != NULL)
135			return (0);
136		/* Free the old, recycled hash and build a new one. */
137		ufsdirhash_free(ip);
138	}
139
140	/* Don't hash removed directories. */
141	if (ip->i_effnlink == 0)
142		return (-1);
143
144	vp = ip->i_vnode;
145	/* Allocate 50% more entries than this dir size could ever need. */
146	KASSERT(ip->i_size >= DIRBLKSIZ, ("ufsdirhash_build size"));
147	nslots = ip->i_size / DIRECTSIZ(1);
148	nslots = (nslots * 3 + 1) / 2;
149	narrays = howmany(nslots, DH_NBLKOFF);
150	nslots = narrays * DH_NBLKOFF;
151	dirblocks = howmany(ip->i_size, DIRBLKSIZ);
152	nblocks = (dirblocks * 3 + 1) / 2;
153
154	memreqd = sizeof(*dh) + narrays * sizeof(*dh->dh_hash) +
155	    narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
156	    nblocks * sizeof(*dh->dh_blkfree);
157	mtx_lock(&ufsdirhash_mtx);
158	if (memreqd + ufs_dirhashmem > ufs_dirhashmaxmem) {
159		mtx_unlock(&ufsdirhash_mtx);
160		if (memreqd > ufs_dirhashmaxmem / 2)
161			return (-1);
162
163		/* Try to free some space. */
164		if (ufsdirhash_recycle(memreqd) != 0)
165			return (-1);
166		/* Enough was freed, and ufsdirhash_mtx has been locked. */
167	}
168	ufs_dirhashmem += memreqd;
169	mtx_unlock(&ufsdirhash_mtx);
170
171	/*
172	 * Use non-blocking mallocs so that we will revert to a linear
173	 * lookup on failure rather than potentially blocking forever.
174	 */
175	MALLOC(dh, struct dirhash *, sizeof *dh, M_DIRHASH, M_NOWAIT | M_ZERO);
176	if (dh == NULL) {
177		mtx_lock(&ufsdirhash_mtx);
178		ufs_dirhashmem -= memreqd;
179		mtx_unlock(&ufsdirhash_mtx);
180		return (-1);
181	}
182	MALLOC(dh->dh_hash, doff_t **, narrays * sizeof(dh->dh_hash[0]),
183	    M_DIRHASH, M_NOWAIT | M_ZERO);
184	MALLOC(dh->dh_blkfree, u_int8_t *, nblocks * sizeof(dh->dh_blkfree[0]),
185	    M_DIRHASH, M_NOWAIT);
186	if (dh->dh_hash == NULL || dh->dh_blkfree == NULL)
187		goto fail;
188	for (i = 0; i < narrays; i++) {
189		if ((dh->dh_hash[i] = uma_zalloc(ufsdirhash_zone,
190		    M_WAITOK)) == NULL)
191			goto fail;
192		for (j = 0; j < DH_NBLKOFF; j++)
193			dh->dh_hash[i][j] = DIRHASH_EMPTY;
194	}
195
196	/* Initialise the hash table and block statistics. */
197	mtx_init(&dh->dh_mtx, "dirhash", NULL, MTX_DEF);
198	dh->dh_narrays = narrays;
199	dh->dh_hlen = nslots;
200	dh->dh_nblk = nblocks;
201	dh->dh_dirblks = dirblocks;
202	for (i = 0; i < dirblocks; i++)
203		dh->dh_blkfree[i] = DIRBLKSIZ / DIRALIGN;
204	for (i = 0; i < DH_NFSTATS; i++)
205		dh->dh_firstfree[i] = -1;
206	dh->dh_firstfree[DH_NFSTATS] = 0;
207	dh->dh_seqopt = 0;
208	dh->dh_seqoff = 0;
209	dh->dh_score = DH_SCOREINIT;
210	ip->i_dirhash = dh;
211
212	bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
213	pos = 0;
214	while (pos < ip->i_size) {
215		/* If necessary, get the next directory block. */
216		if ((pos & bmask) == 0) {
217			if (bp != NULL)
218				brelse(bp);
219			if (UFS_BLKATOFF(vp, (off_t)pos, NULL, &bp) != 0)
220				goto fail;
221		}
222
223		/* Add this entry to the hash. */
224		ep = (struct direct *)((char *)bp->b_data + (pos & bmask));
225		if (ep->d_reclen == 0 || ep->d_reclen >
226		    DIRBLKSIZ - (pos & (DIRBLKSIZ - 1))) {
227			/* Corrupted directory. */
228			brelse(bp);
229			goto fail;
230		}
231		if (ep->d_ino != 0) {
232			/* Add the entry (simplified ufsdirhash_add). */
233			slot = ufsdirhash_hash(dh, ep->d_name, ep->d_namlen);
234			while (DH_ENTRY(dh, slot) != DIRHASH_EMPTY)
235				slot = WRAPINCR(slot, dh->dh_hlen);
236			dh->dh_hused++;
237			DH_ENTRY(dh, slot) = pos;
238			ufsdirhash_adjfree(dh, pos, -DIRSIZ(0, ep));
239		}
240		pos += ep->d_reclen;
241	}
242
243	if (bp != NULL)
244		brelse(bp);
245	mtx_lock(&ufsdirhash_mtx);
246	TAILQ_INSERT_TAIL(&ufsdirhash_list, dh, dh_list);
247	dh->dh_onlist = 1;
248	mtx_unlock(&ufsdirhash_mtx);
249	return (0);
250
251fail:
252	if (dh->dh_hash != NULL) {
253		for (i = 0; i < narrays; i++)
254			if (dh->dh_hash[i] != NULL)
255				uma_zfree(ufsdirhash_zone, dh->dh_hash[i]);
256		FREE(dh->dh_hash, M_DIRHASH);
257	}
258	if (dh->dh_blkfree != NULL)
259		FREE(dh->dh_blkfree, M_DIRHASH);
260	FREE(dh, M_DIRHASH);
261	ip->i_dirhash = NULL;
262	mtx_lock(&ufsdirhash_mtx);
263	ufs_dirhashmem -= memreqd;
264	mtx_unlock(&ufsdirhash_mtx);
265	return (-1);
266}
267
268/*
269 * Free any hash table associated with inode 'ip'.
270 */
271void
272ufsdirhash_free(struct inode *ip)
273{
274	struct dirhash *dh;
275	int i, mem;
276
277	if ((dh = ip->i_dirhash) == NULL)
278		return;
279	mtx_lock(&ufsdirhash_mtx);
280	mtx_lock(&dh->dh_mtx);
281	if (dh->dh_onlist)
282		TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
283	mtx_unlock(&dh->dh_mtx);
284	mtx_unlock(&ufsdirhash_mtx);
285
286	/* The dirhash pointed to by 'dh' is exclusively ours now. */
287
288	mem = sizeof(*dh);
289	if (dh->dh_hash != NULL) {
290		for (i = 0; i < dh->dh_narrays; i++)
291			uma_zfree(ufsdirhash_zone, dh->dh_hash[i]);
292		FREE(dh->dh_hash, M_DIRHASH);
293		FREE(dh->dh_blkfree, M_DIRHASH);
294		mem += dh->dh_narrays * sizeof(*dh->dh_hash) +
295		    dh->dh_narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
296		    dh->dh_nblk * sizeof(*dh->dh_blkfree);
297	}
298	mtx_destroy(&dh->dh_mtx);
299	FREE(dh, M_DIRHASH);
300	ip->i_dirhash = NULL;
301
302	mtx_lock(&ufsdirhash_mtx);
303	ufs_dirhashmem -= mem;
304	mtx_unlock(&ufsdirhash_mtx);
305}
306
307/*
308 * Find the offset of the specified name within the given inode.
309 * Returns 0 on success, ENOENT if the entry does not exist, or
310 * EJUSTRETURN if the caller should revert to a linear search.
311 *
312 * If successful, the directory offset is stored in *offp, and a
313 * pointer to a struct buf containing the entry is stored in *bpp. If
314 * prevoffp is non-NULL, the offset of the previous entry within
315 * the DIRBLKSIZ-sized block is stored in *prevoffp (if the entry
316 * is the first in a block, the start of the block is used).
317 */
318int
319ufsdirhash_lookup(struct inode *ip, char *name, int namelen, doff_t *offp,
320    struct buf **bpp, doff_t *prevoffp)
321{
322	struct dirhash *dh, *dh_next;
323	struct direct *dp;
324	struct vnode *vp;
325	struct buf *bp;
326	doff_t blkoff, bmask, offset, prevoff;
327	int i, slot;
328
329	if ((dh = ip->i_dirhash) == NULL)
330		return (EJUSTRETURN);
331	/*
332	 * Move this dirhash towards the end of the list if it has a
333	 * score higher than the next entry, and acquire the dh_mtx.
334	 * Optimise the case where it's already the last by performing
335	 * an unlocked read of the TAILQ_NEXT pointer.
336	 *
337	 * In both cases, end up holding just dh_mtx.
338	 */
339	if (TAILQ_NEXT(dh, dh_list) != NULL) {
340		mtx_lock(&ufsdirhash_mtx);
341		mtx_lock(&dh->dh_mtx);
342		/*
343		 * If the new score will be greater than that of the next
344		 * entry, then move this entry past it. With both mutexes
345		 * held, dh_next won't go away, but its dh_score could
346		 * change; that's not important since it is just a hint.
347		 */
348		if (dh->dh_hash != NULL &&
349		    (dh_next = TAILQ_NEXT(dh, dh_list)) != NULL &&
350		    dh->dh_score >= dh_next->dh_score) {
351			KASSERT(dh->dh_onlist, ("dirhash: not on list"));
352			TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
353			TAILQ_INSERT_AFTER(&ufsdirhash_list, dh_next, dh,
354			    dh_list);
355		}
356		mtx_unlock(&ufsdirhash_mtx);
357	} else {
358		/* Already the last, though that could change as we wait. */
359		mtx_lock(&dh->dh_mtx);
360	}
361	if (dh->dh_hash == NULL) {
362		mtx_unlock(&dh->dh_mtx);
363		ufsdirhash_free(ip);
364		return (EJUSTRETURN);
365	}
366
367	/* Update the score. */
368	if (dh->dh_score < DH_SCOREMAX)
369		dh->dh_score++;
370
371	vp = ip->i_vnode;
372	bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
373	blkoff = -1;
374	bp = NULL;
375restart:
376	slot = ufsdirhash_hash(dh, name, namelen);
377
378	if (dh->dh_seqopt) {
379		/*
380		 * Sequential access optimisation. dh_seqoff contains the
381		 * offset of the directory entry immediately following
382		 * the last entry that was looked up. Check if this offset
383		 * appears in the hash chain for the name we are looking for.
384		 */
385		for (i = slot; (offset = DH_ENTRY(dh, i)) != DIRHASH_EMPTY;
386		    i = WRAPINCR(i, dh->dh_hlen))
387			if (offset == dh->dh_seqoff)
388				break;
389		if (offset == dh->dh_seqoff) {
390			/*
391			 * We found an entry with the expected offset. This
392			 * is probably the entry we want, but if not, the
393			 * code below will turn off seqoff and retry.
394			 */
395			slot = i;
396		} else
397			dh->dh_seqopt = 0;
398	}
399
400	for (; (offset = DH_ENTRY(dh, slot)) != DIRHASH_EMPTY;
401	    slot = WRAPINCR(slot, dh->dh_hlen)) {
402		if (offset == DIRHASH_DEL)
403			continue;
404		mtx_unlock(&dh->dh_mtx);
405
406		if (offset < 0 || offset >= ip->i_size)
407			panic("ufsdirhash_lookup: bad offset in hash array");
408		if ((offset & ~bmask) != blkoff) {
409			if (bp != NULL)
410				brelse(bp);
411			blkoff = offset & ~bmask;
412			if (UFS_BLKATOFF(vp, (off_t)blkoff, NULL, &bp) != 0)
413				return (EJUSTRETURN);
414		}
415		dp = (struct direct *)(bp->b_data + (offset & bmask));
416		if (dp->d_reclen == 0 || dp->d_reclen >
417		    DIRBLKSIZ - (offset & (DIRBLKSIZ - 1))) {
418			/* Corrupted directory. */
419			brelse(bp);
420			return (EJUSTRETURN);
421		}
422		if (dp->d_namlen == namelen &&
423		    bcmp(dp->d_name, name, namelen) == 0) {
424			/* Found. Get the prev offset if needed. */
425			if (prevoffp != NULL) {
426				if (offset & (DIRBLKSIZ - 1)) {
427					prevoff = ufsdirhash_getprev(dp,
428					    offset);
429					if (prevoff == -1) {
430						brelse(bp);
431						return (EJUSTRETURN);
432					}
433				} else
434					prevoff = offset;
435				*prevoffp = prevoff;
436			}
437
438			/* Check for sequential access, and update offset. */
439			if (dh->dh_seqopt == 0 && dh->dh_seqoff == offset)
440				dh->dh_seqopt = 1;
441			dh->dh_seqoff = offset + DIRSIZ(0, dp);
442
443			*bpp = bp;
444			*offp = offset;
445			return (0);
446		}
447
448		mtx_lock(&dh->dh_mtx);
449		if (dh->dh_hash == NULL) {
450			mtx_unlock(&dh->dh_mtx);
451			if (bp != NULL)
452				brelse(bp);
453			ufsdirhash_free(ip);
454			return (EJUSTRETURN);
455		}
456		/*
457		 * When the name doesn't match in the seqopt case, go back
458		 * and search normally.
459		 */
460		if (dh->dh_seqopt) {
461			dh->dh_seqopt = 0;
462			goto restart;
463		}
464	}
465	mtx_unlock(&dh->dh_mtx);
466	if (bp != NULL)
467		brelse(bp);
468	return (ENOENT);
469}
470
471/*
472 * Find a directory block with room for 'slotneeded' bytes. Returns
473 * the offset of the directory entry that begins the free space.
474 * This will either be the offset of an existing entry that has free
475 * space at the end, or the offset of an entry with d_ino == 0 at
476 * the start of a DIRBLKSIZ block.
477 *
478 * To use the space, the caller may need to compact existing entries in
479 * the directory. The total number of bytes in all of the entries involved
480 * in the compaction is stored in *slotsize. In other words, all of
481 * the entries that must be compacted are exactly contained in the
482 * region beginning at the returned offset and spanning *slotsize bytes.
483 *
484 * Returns -1 if no space was found, indicating that the directory
485 * must be extended.
486 */
487doff_t
488ufsdirhash_findfree(struct inode *ip, int slotneeded, int *slotsize)
489{
490	struct direct *dp;
491	struct dirhash *dh;
492	struct buf *bp;
493	doff_t pos, slotstart;
494	int dirblock, error, freebytes, i;
495
496	if ((dh = ip->i_dirhash) == NULL)
497		return (-1);
498	mtx_lock(&dh->dh_mtx);
499	if (dh->dh_hash == NULL) {
500		mtx_unlock(&dh->dh_mtx);
501		ufsdirhash_free(ip);
502		return (-1);
503	}
504
505	/* Find a directory block with the desired free space. */
506	dirblock = -1;
507	for (i = howmany(slotneeded, DIRALIGN); i <= DH_NFSTATS; i++)
508		if ((dirblock = dh->dh_firstfree[i]) != -1)
509			break;
510	if (dirblock == -1) {
511		mtx_unlock(&dh->dh_mtx);
512		return (-1);
513	}
514
515	KASSERT(dirblock < dh->dh_nblk &&
516	    dh->dh_blkfree[dirblock] >= howmany(slotneeded, DIRALIGN),
517	    ("ufsdirhash_findfree: bad stats"));
518	mtx_unlock(&dh->dh_mtx);
519	pos = dirblock * DIRBLKSIZ;
520	error = UFS_BLKATOFF(ip->i_vnode, (off_t)pos, (char **)&dp, &bp);
521	if (error)
522		return (-1);
523
524	/* Find the first entry with free space. */
525	for (i = 0; i < DIRBLKSIZ; ) {
526		if (dp->d_reclen == 0) {
527			brelse(bp);
528			return (-1);
529		}
530		if (dp->d_ino == 0 || dp->d_reclen > DIRSIZ(0, dp))
531			break;
532		i += dp->d_reclen;
533		dp = (struct direct *)((char *)dp + dp->d_reclen);
534	}
535	if (i > DIRBLKSIZ) {
536		brelse(bp);
537		return (-1);
538	}
539	slotstart = pos + i;
540
541	/* Find the range of entries needed to get enough space */
542	freebytes = 0;
543	while (i < DIRBLKSIZ && freebytes < slotneeded) {
544		freebytes += dp->d_reclen;
545		if (dp->d_ino != 0)
546			freebytes -= DIRSIZ(0, dp);
547		if (dp->d_reclen == 0) {
548			brelse(bp);
549			return (-1);
550		}
551		i += dp->d_reclen;
552		dp = (struct direct *)((char *)dp + dp->d_reclen);
553	}
554	if (i > DIRBLKSIZ) {
555		brelse(bp);
556		return (-1);
557	}
558	if (freebytes < slotneeded)
559		panic("ufsdirhash_findfree: free mismatch");
560	brelse(bp);
561	*slotsize = pos + i - slotstart;
562	return (slotstart);
563}
564
565/*
566 * Return the start of the unused space at the end of a directory, or
567 * -1 if there are no trailing unused blocks.
568 */
569doff_t
570ufsdirhash_enduseful(struct inode *ip)
571{
572
573	struct dirhash *dh;
574	int i;
575
576	if ((dh = ip->i_dirhash) == NULL)
577		return (-1);
578	mtx_lock(&dh->dh_mtx);
579	if (dh->dh_hash == NULL) {
580		mtx_unlock(&dh->dh_mtx);
581		ufsdirhash_free(ip);
582		return (-1);
583	}
584
585	if (dh->dh_blkfree[dh->dh_dirblks - 1] != DIRBLKSIZ / DIRALIGN) {
586		mtx_unlock(&dh->dh_mtx);
587		return (-1);
588	}
589
590	for (i = dh->dh_dirblks - 1; i >= 0; i--)
591		if (dh->dh_blkfree[i] != DIRBLKSIZ / DIRALIGN)
592			break;
593	mtx_unlock(&dh->dh_mtx);
594	return ((doff_t)(i + 1) * DIRBLKSIZ);
595}
596
597/*
598 * Insert information into the hash about a new directory entry. dirp
599 * points to a struct direct containing the entry, and offset specifies
600 * the offset of this entry.
601 */
602void
603ufsdirhash_add(struct inode *ip, struct direct *dirp, doff_t offset)
604{
605	struct dirhash *dh;
606	int slot;
607
608	if ((dh = ip->i_dirhash) == NULL)
609		return;
610	mtx_lock(&dh->dh_mtx);
611	if (dh->dh_hash == NULL) {
612		mtx_unlock(&dh->dh_mtx);
613		ufsdirhash_free(ip);
614		return;
615	}
616
617	KASSERT(offset < dh->dh_dirblks * DIRBLKSIZ,
618	    ("ufsdirhash_add: bad offset"));
619	/*
620	 * Normal hash usage is < 66%. If the usage gets too high then
621	 * remove the hash entirely and let it be rebuilt later.
622	 */
623	if (dh->dh_hused >= (dh->dh_hlen * 3) / 4) {
624		mtx_unlock(&dh->dh_mtx);
625		ufsdirhash_free(ip);
626		return;
627	}
628
629	/* Find a free hash slot (empty or deleted), and add the entry. */
630	slot = ufsdirhash_hash(dh, dirp->d_name, dirp->d_namlen);
631	while (DH_ENTRY(dh, slot) >= 0)
632		slot = WRAPINCR(slot, dh->dh_hlen);
633	if (DH_ENTRY(dh, slot) == DIRHASH_EMPTY)
634		dh->dh_hused++;
635	DH_ENTRY(dh, slot) = offset;
636
637	/* Update the per-block summary info. */
638	ufsdirhash_adjfree(dh, offset, -DIRSIZ(0, dirp));
639	mtx_unlock(&dh->dh_mtx);
640}
641
642/*
643 * Remove the specified directory entry from the hash. The entry to remove
644 * is defined by the name in `dirp', which must exist at the specified
645 * `offset' within the directory.
646 */
647void
648ufsdirhash_remove(struct inode *ip, struct direct *dirp, doff_t offset)
649{
650	struct dirhash *dh;
651	int slot;
652
653	if ((dh = ip->i_dirhash) == NULL)
654		return;
655	mtx_lock(&dh->dh_mtx);
656	if (dh->dh_hash == NULL) {
657		mtx_unlock(&dh->dh_mtx);
658		ufsdirhash_free(ip);
659		return;
660	}
661
662	KASSERT(offset < dh->dh_dirblks * DIRBLKSIZ,
663	    ("ufsdirhash_remove: bad offset"));
664	/* Find the entry */
665	slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, offset);
666
667	/* Remove the hash entry. */
668	ufsdirhash_delslot(dh, slot);
669
670	/* Update the per-block summary info. */
671	ufsdirhash_adjfree(dh, offset, DIRSIZ(0, dirp));
672	mtx_unlock(&dh->dh_mtx);
673}
674
675/*
676 * Change the offset associated with a directory entry in the hash. Used
677 * when compacting directory blocks.
678 */
679void
680ufsdirhash_move(struct inode *ip, struct direct *dirp, doff_t oldoff,
681    doff_t newoff)
682{
683	struct dirhash *dh;
684	int slot;
685
686	if ((dh = ip->i_dirhash) == NULL)
687		return;
688	mtx_lock(&dh->dh_mtx);
689	if (dh->dh_hash == NULL) {
690		mtx_unlock(&dh->dh_mtx);
691		ufsdirhash_free(ip);
692		return;
693	}
694
695	KASSERT(oldoff < dh->dh_dirblks * DIRBLKSIZ &&
696	    newoff < dh->dh_dirblks * DIRBLKSIZ,
697	    ("ufsdirhash_move: bad offset"));
698	/* Find the entry, and update the offset. */
699	slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, oldoff);
700	DH_ENTRY(dh, slot) = newoff;
701	mtx_unlock(&dh->dh_mtx);
702}
703
704/*
705 * Inform dirhash that the directory has grown by one block that
706 * begins at offset (i.e. the new length is offset + DIRBLKSIZ).
707 */
708void
709ufsdirhash_newblk(struct inode *ip, doff_t offset)
710{
711	struct dirhash *dh;
712	int block;
713
714	if ((dh = ip->i_dirhash) == NULL)
715		return;
716	mtx_lock(&dh->dh_mtx);
717	if (dh->dh_hash == NULL) {
718		mtx_unlock(&dh->dh_mtx);
719		ufsdirhash_free(ip);
720		return;
721	}
722
723	KASSERT(offset == dh->dh_dirblks * DIRBLKSIZ,
724	    ("ufsdirhash_newblk: bad offset"));
725	block = offset / DIRBLKSIZ;
726	if (block >= dh->dh_nblk) {
727		/* Out of space; must rebuild. */
728		mtx_unlock(&dh->dh_mtx);
729		ufsdirhash_free(ip);
730		return;
731	}
732	dh->dh_dirblks = block + 1;
733
734	/* Account for the new free block. */
735	dh->dh_blkfree[block] = DIRBLKSIZ / DIRALIGN;
736	if (dh->dh_firstfree[DH_NFSTATS] == -1)
737		dh->dh_firstfree[DH_NFSTATS] = block;
738	mtx_unlock(&dh->dh_mtx);
739}
740
741/*
742 * Inform dirhash that the directory is being truncated.
743 */
744void
745ufsdirhash_dirtrunc(struct inode *ip, doff_t offset)
746{
747	struct dirhash *dh;
748	int block, i;
749
750	if ((dh = ip->i_dirhash) == NULL)
751		return;
752	mtx_lock(&dh->dh_mtx);
753	if (dh->dh_hash == NULL) {
754		mtx_unlock(&dh->dh_mtx);
755		ufsdirhash_free(ip);
756		return;
757	}
758
759	KASSERT(offset <= dh->dh_dirblks * DIRBLKSIZ,
760	    ("ufsdirhash_dirtrunc: bad offset"));
761	block = howmany(offset, DIRBLKSIZ);
762	/*
763	 * If the directory shrinks to less than 1/8 of dh_nblk blocks
764	 * (about 20% of its original size due to the 50% extra added in
765	 * ufsdirhash_build) then free it, and let the caller rebuild
766	 * if necessary.
767	 */
768	if (block < dh->dh_nblk / 8 && dh->dh_narrays > 1) {
769		mtx_unlock(&dh->dh_mtx);
770		ufsdirhash_free(ip);
771		return;
772	}
773
774	/*
775	 * Remove any `first free' information pertaining to the
776	 * truncated blocks. All blocks we're removing should be
777	 * completely unused.
778	 */
779	if (dh->dh_firstfree[DH_NFSTATS] >= block)
780		dh->dh_firstfree[DH_NFSTATS] = -1;
781	for (i = block; i < dh->dh_dirblks; i++)
782		if (dh->dh_blkfree[i] != DIRBLKSIZ / DIRALIGN)
783			panic("ufsdirhash_dirtrunc: blocks in use");
784	for (i = 0; i < DH_NFSTATS; i++)
785		if (dh->dh_firstfree[i] >= block)
786			panic("ufsdirhash_dirtrunc: first free corrupt");
787	dh->dh_dirblks = block;
788	mtx_unlock(&dh->dh_mtx);
789}
790
791/*
792 * Debugging function to check that the dirhash information about
793 * a directory block matches its actual contents. Panics if a mismatch
794 * is detected.
795 *
796 * On entry, `buf' should point to the start of an in-core
797 * DIRBLKSIZ-sized directory block, and `offset' should contain the
798 * offset from the start of the directory of that block.
799 */
800void
801ufsdirhash_checkblock(struct inode *ip, char *buf, doff_t offset)
802{
803	struct dirhash *dh;
804	struct direct *dp;
805	int block, ffslot, i, nfree;
806
807	if (!ufs_dirhashcheck)
808		return;
809	if ((dh = ip->i_dirhash) == NULL)
810		return;
811	mtx_lock(&dh->dh_mtx);
812	if (dh->dh_hash == NULL) {
813		mtx_unlock(&dh->dh_mtx);
814		ufsdirhash_free(ip);
815		return;
816	}
817
818	block = offset / DIRBLKSIZ;
819	if ((offset & (DIRBLKSIZ - 1)) != 0 || block >= dh->dh_dirblks)
820		panic("ufsdirhash_checkblock: bad offset");
821
822	nfree = 0;
823	for (i = 0; i < DIRBLKSIZ; i += dp->d_reclen) {
824		dp = (struct direct *)(buf + i);
825		if (dp->d_reclen == 0 || i + dp->d_reclen > DIRBLKSIZ)
826			panic("ufsdirhash_checkblock: bad dir");
827
828		if (dp->d_ino == 0) {
829#if 0
830			/*
831			 * XXX entries with d_ino == 0 should only occur
832			 * at the start of a DIRBLKSIZ block. However the
833			 * ufs code is tolerant of such entries at other
834			 * offsets, and fsck does not fix them.
835			 */
836			if (i != 0)
837				panic("ufsdirhash_checkblock: bad dir inode");
838#endif
839			nfree += dp->d_reclen;
840			continue;
841		}
842
843		/* Check that the entry	exists (will panic if it doesn't). */
844		ufsdirhash_findslot(dh, dp->d_name, dp->d_namlen, offset + i);
845
846		nfree += dp->d_reclen - DIRSIZ(0, dp);
847	}
848	if (i != DIRBLKSIZ)
849		panic("ufsdirhash_checkblock: bad dir end");
850
851	if (dh->dh_blkfree[block] * DIRALIGN != nfree)
852		panic("ufsdirhash_checkblock: bad free count");
853
854	ffslot = BLKFREE2IDX(nfree / DIRALIGN);
855	for (i = 0; i <= DH_NFSTATS; i++)
856		if (dh->dh_firstfree[i] == block && i != ffslot)
857			panic("ufsdirhash_checkblock: bad first-free");
858	if (dh->dh_firstfree[ffslot] == -1)
859		panic("ufsdirhash_checkblock: missing first-free entry");
860	mtx_unlock(&dh->dh_mtx);
861}
862
863/*
864 * Hash the specified filename into a dirhash slot.
865 */
866static int
867ufsdirhash_hash(struct dirhash *dh, char *name, int namelen)
868{
869	u_int32_t hash;
870
871	/*
872	 * We hash the name and then some other bit of data that is
873	 * invariant over the dirhash's lifetime. Otherwise names
874	 * differing only in the last byte are placed close to one
875	 * another in the table, which is bad for linear probing.
876	 */
877	hash = fnv_32_buf(name, namelen, FNV1_32_INIT);
878	hash = fnv_32_buf(dh, sizeof(dh), hash);
879	return (hash % dh->dh_hlen);
880}
881
882/*
883 * Adjust the number of free bytes in the block containing `offset'
884 * by the value specified by `diff'.
885 *
886 * The caller must ensure we have exclusive access to `dh'; normally
887 * that means that dh_mtx should be held, but this is also called
888 * from ufsdirhash_build() where exclusive access can be assumed.
889 */
890static void
891ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff)
892{
893	int block, i, nfidx, ofidx;
894
895	/* Update the per-block summary info. */
896	block = offset / DIRBLKSIZ;
897	KASSERT(block < dh->dh_nblk && block < dh->dh_dirblks,
898	     ("dirhash bad offset"));
899	ofidx = BLKFREE2IDX(dh->dh_blkfree[block]);
900	dh->dh_blkfree[block] = (int)dh->dh_blkfree[block] + (diff / DIRALIGN);
901	nfidx = BLKFREE2IDX(dh->dh_blkfree[block]);
902
903	/* Update the `first free' list if necessary. */
904	if (ofidx != nfidx) {
905		/* If removing, scan forward for the next block. */
906		if (dh->dh_firstfree[ofidx] == block) {
907			for (i = block + 1; i < dh->dh_dirblks; i++)
908				if (BLKFREE2IDX(dh->dh_blkfree[i]) == ofidx)
909					break;
910			dh->dh_firstfree[ofidx] = (i < dh->dh_dirblks) ? i : -1;
911		}
912
913		/* Make this the new `first free' if necessary */
914		if (dh->dh_firstfree[nfidx] > block ||
915		    dh->dh_firstfree[nfidx] == -1)
916			dh->dh_firstfree[nfidx] = block;
917	}
918}
919
920/*
921 * Find the specified name which should have the specified offset.
922 * Returns a slot number, and panics on failure.
923 *
924 * `dh' must be locked on entry and remains so on return.
925 */
926static int
927ufsdirhash_findslot(struct dirhash *dh, char *name, int namelen, doff_t offset)
928{
929	int slot;
930
931	mtx_assert(&dh->dh_mtx, MA_OWNED);
932
933	/* Find the entry. */
934	KASSERT(dh->dh_hused < dh->dh_hlen, ("dirhash find full"));
935	slot = ufsdirhash_hash(dh, name, namelen);
936	while (DH_ENTRY(dh, slot) != offset &&
937	    DH_ENTRY(dh, slot) != DIRHASH_EMPTY)
938		slot = WRAPINCR(slot, dh->dh_hlen);
939	if (DH_ENTRY(dh, slot) != offset)
940		panic("ufsdirhash_findslot: '%.*s' not found", namelen, name);
941
942	return (slot);
943}
944
945/*
946 * Remove the entry corresponding to the specified slot from the hash array.
947 *
948 * `dh' must be locked on entry and remains so on return.
949 */
950static void
951ufsdirhash_delslot(struct dirhash *dh, int slot)
952{
953	int i;
954
955	mtx_assert(&dh->dh_mtx, MA_OWNED);
956
957	/* Mark the entry as deleted. */
958	DH_ENTRY(dh, slot) = DIRHASH_DEL;
959
960	/* If this is the end of a chain of DIRHASH_DEL slots, remove them. */
961	for (i = slot; DH_ENTRY(dh, i) == DIRHASH_DEL; )
962		i = WRAPINCR(i, dh->dh_hlen);
963	if (DH_ENTRY(dh, i) == DIRHASH_EMPTY) {
964		i = WRAPDECR(i, dh->dh_hlen);
965		while (DH_ENTRY(dh, i) == DIRHASH_DEL) {
966			DH_ENTRY(dh, i) = DIRHASH_EMPTY;
967			dh->dh_hused--;
968			i = WRAPDECR(i, dh->dh_hlen);
969		}
970		KASSERT(dh->dh_hused >= 0, ("ufsdirhash_delslot neg hlen"));
971	}
972}
973
974/*
975 * Given a directory entry and its offset, find the offset of the
976 * previous entry in the same DIRBLKSIZ-sized block. Returns an
977 * offset, or -1 if there is no previous entry in the block or some
978 * other problem occurred.
979 */
980static doff_t
981ufsdirhash_getprev(struct direct *dirp, doff_t offset)
982{
983	struct direct *dp;
984	char *blkbuf;
985	doff_t blkoff, prevoff;
986	int entrypos, i;
987
988	blkoff = offset & ~(DIRBLKSIZ - 1);	/* offset of start of block */
989	entrypos = offset & (DIRBLKSIZ - 1);	/* entry relative to block */
990	blkbuf = (char *)dirp - entrypos;
991	prevoff = blkoff;
992
993	/* If `offset' is the start of a block, there is no previous entry. */
994	if (entrypos == 0)
995		return (-1);
996
997	/* Scan from the start of the block until we get to the entry. */
998	for (i = 0; i < entrypos; i += dp->d_reclen) {
999		dp = (struct direct *)(blkbuf + i);
1000		if (dp->d_reclen == 0 || i + dp->d_reclen > entrypos)
1001			return (-1);	/* Corrupted directory. */
1002		prevoff = blkoff + i;
1003	}
1004	return (prevoff);
1005}
1006
1007/*
1008 * Try to free up `wanted' bytes by stealing memory from existing
1009 * dirhashes. Returns zero with ufsdirhash_mtx locked if successful.
1010 */
1011static int
1012ufsdirhash_recycle(int wanted)
1013{
1014	struct dirhash *dh;
1015	doff_t **hash;
1016	u_int8_t *blkfree;
1017	int i, mem, narrays;
1018
1019	mtx_lock(&ufsdirhash_mtx);
1020	while (wanted + ufs_dirhashmem > ufs_dirhashmaxmem) {
1021		/* Find a dirhash, and lock it. */
1022		if ((dh = TAILQ_FIRST(&ufsdirhash_list)) == NULL) {
1023			mtx_unlock(&ufsdirhash_mtx);
1024			return (-1);
1025		}
1026		mtx_lock(&dh->dh_mtx);
1027		KASSERT(dh->dh_hash != NULL, ("dirhash: NULL hash on list"));
1028
1029		/* Decrement the score; only recycle if it becomes zero. */
1030		if (--dh->dh_score > 0) {
1031			mtx_unlock(&dh->dh_mtx);
1032			mtx_unlock(&ufsdirhash_mtx);
1033			return (-1);
1034		}
1035
1036		/* Remove it from the list and detach its memory. */
1037		TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
1038		dh->dh_onlist = 0;
1039		hash = dh->dh_hash;
1040		dh->dh_hash = NULL;
1041		blkfree = dh->dh_blkfree;
1042		dh->dh_blkfree = NULL;
1043		narrays = dh->dh_narrays;
1044		mem = narrays * sizeof(*dh->dh_hash) +
1045		    narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
1046		    dh->dh_nblk * sizeof(*dh->dh_blkfree);
1047
1048		/* Unlock everything, free the detached memory. */
1049		mtx_unlock(&dh->dh_mtx);
1050		mtx_unlock(&ufsdirhash_mtx);
1051		for (i = 0; i < narrays; i++)
1052			uma_zfree(ufsdirhash_zone, hash[i]);
1053		FREE(hash, M_DIRHASH);
1054		FREE(blkfree, M_DIRHASH);
1055
1056		/* Account for the returned memory, and repeat if necessary. */
1057		mtx_lock(&ufsdirhash_mtx);
1058		ufs_dirhashmem -= mem;
1059	}
1060	/* Success; return with ufsdirhash_mtx locked. */
1061	return (0);
1062}
1063
1064
1065void
1066ufsdirhash_init()
1067{
1068	ufsdirhash_zone = uma_zcreate("DIRHASH", DH_NBLKOFF * sizeof(doff_t),
1069	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
1070	mtx_init(&ufsdirhash_mtx, "dirhash list", NULL, MTX_DEF);
1071	TAILQ_INIT(&ufsdirhash_list);
1072}
1073
1074void
1075ufsdirhash_uninit()
1076{
1077	KASSERT(TAILQ_EMPTY(&ufsdirhash_list), ("ufsdirhash_uninit"));
1078	uma_zdestroy(ufsdirhash_zone);
1079	mtx_destroy(&ufsdirhash_mtx);
1080}
1081
1082#endif /* UFS_DIRHASH */
1083