ufs_dirhash.c revision 84811
1/*
2 * Copyright (c) 2001 Ian Dowse.  All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 *    notice, this list of conditions and the following disclaimer in the
11 *    documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
24 *
25 * $FreeBSD: head/sys/ufs/ufs/ufs_dirhash.c 84811 2001-10-11 17:52:20Z jhb $
26 */
27/*
28 * This implements a hash-based lookup scheme for UFS directories.
29 */
30
31#include "opt_ufs.h"
32
33#ifdef UFS_DIRHASH
34
35#include <sys/param.h>
36#include <sys/systm.h>
37#include <sys/kernel.h>
38#include <sys/lock.h>
39#include <sys/mutex.h>
40#include <sys/malloc.h>
41#include <sys/fnv_hash.h>
42#include <sys/proc.h>
43#include <sys/bio.h>
44#include <sys/buf.h>
45#include <sys/vnode.h>
46#include <sys/mount.h>
47#include <sys/sysctl.h>
48#include <vm/vm_zone.h>
49
50#include <ufs/ufs/quota.h>
51#include <ufs/ufs/inode.h>
52#include <ufs/ufs/dir.h>
53#include <ufs/ufs/dirhash.h>
54#include <ufs/ufs/extattr.h>
55#include <ufs/ufs/ufsmount.h>
56#include <ufs/ufs/ufs_extern.h>
57
58#define WRAPINCR(val, limit)	(((val) + 1 == (limit)) ? 0 : ((val) + 1))
59#define OFSFMT(vp)		((vp)->v_mount->mnt_maxsymlinklen <= 0)
60
61static MALLOC_DEFINE(M_DIRHASH, "UFS dirhash", "UFS directory hash tables");
62
63SYSCTL_NODE(_vfs, OID_AUTO, ufs, CTLFLAG_RD, 0, "UFS filesystem");
64
65static int ufs_mindirhashsize = DIRBLKSIZ * 5;
66SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_minsize, CTLFLAG_RW,
67    &ufs_mindirhashsize,
68    0, "minimum directory size in bytes for which to use hashed lookup");
69static int ufs_dirhashmaxmem = 2 * 1024 * 1024;
70SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_maxmem, CTLFLAG_RW, &ufs_dirhashmaxmem,
71    0, "maximum allowed dirhash memory usage");
72static int ufs_dirhashmem;
73SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_mem, CTLFLAG_RD, &ufs_dirhashmem,
74    0, "current dirhash memory usage");
75static int ufs_dirhashcheck = 1;
76SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_docheck, CTLFLAG_RW, &ufs_dirhashcheck,
77    0, "enable extra sanity tests");
78
79
80static int ufsdirhash_hash(struct dirhash *dh, char *name, int namelen);
81static void ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff);
82static void ufsdirhash_delslot(struct dirhash *dh, int slot);
83static int ufsdirhash_findslot(struct dirhash *dh, char *name, int namelen,
84	   doff_t offset);
85static doff_t ufsdirhash_getprev(struct direct *dp, doff_t offset);
86static void ufsdirhash_init(void);
87static int ufsdirhash_recycle(int wanted);
88
89static vm_zone_t	ufsdirhash_zone;
90
91/* Dirhash list; recently-used entries are near the tail. */
92static TAILQ_HEAD(, dirhash) ufsdirhash_list;
93
94/* Protects: ufsdirhash_list, `dh_list' field, ufs_dirhashmem. */
95static struct mtx	ufsdirhash_mtx;
96
97/*
98 * Locking order:
99 *	ufsdirhash_mtx
100 *	dh_mtx
101 *
102 * The dh_mtx mutex should be aquired either via the inode lock, or via
103 * ufsdirhash_mtx. Only the owner of the inode may free the associated
104 * dirhash, but anything can steal its memory and set dh_hash to NULL.
105 */
106
107/*
108 * Attempt to build up a hash table for the directory contents in
109 * inode 'ip'. Returns 0 on success, or -1 of the operation failed.
110 */
111int
112ufsdirhash_build(struct inode *ip)
113{
114	struct dirhash *dh;
115	struct buf *bp = NULL;
116	struct direct *ep;
117	struct vnode *vp;
118	doff_t bmask, pos;
119	int dirblocks, i, j, memreqd, nblocks, narrays, nslots, slot;
120
121	/* Check if we can/should use dirhash. */
122	if (ip->i_dirhash == NULL) {
123		if (ip->i_size < ufs_mindirhashsize || OFSFMT(ip->i_vnode))
124			return (-1);
125	} else {
126		/* Hash exists, but sysctls could have changed. */
127		if (ip->i_size < ufs_mindirhashsize ||
128		    ufs_dirhashmem > ufs_dirhashmaxmem) {
129			ufsdirhash_free(ip);
130			return (-1);
131		}
132		/* Check if hash exists and is intact (note: unlocked read). */
133		if (ip->i_dirhash->dh_hash != NULL)
134			return (0);
135		/* Free the old, recycled hash and build a new one. */
136		ufsdirhash_free(ip);
137	}
138
139	/* Don't hash removed directories. */
140	if (ip->i_effnlink == 0)
141		return (-1);
142
143	vp = ip->i_vnode;
144	/* Allocate 50% more entries than this dir size could ever need. */
145	KASSERT(ip->i_size >= DIRBLKSIZ, ("ufsdirhash_build size"));
146	nslots = ip->i_size / DIRECTSIZ(1);
147	nslots = (nslots * 3 + 1) / 2;
148	narrays = howmany(nslots, DH_NBLKOFF);
149	nslots = narrays * DH_NBLKOFF;
150	dirblocks = howmany(ip->i_size, DIRBLKSIZ);
151	nblocks = (dirblocks * 3 + 1) / 2;
152
153	memreqd = sizeof(*dh) + narrays * sizeof(*dh->dh_hash) +
154	    narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
155	    nblocks * sizeof(*dh->dh_blkfree);
156	mtx_lock(&ufsdirhash_mtx);
157	if (memreqd + ufs_dirhashmem > ufs_dirhashmaxmem) {
158		mtx_unlock(&ufsdirhash_mtx);
159		if (memreqd > ufs_dirhashmaxmem / 2)
160			return (-1);
161
162		/* Try to free some space. */
163		if (ufsdirhash_recycle(memreqd) != 0)
164			return (-1);
165		/* Enough was freed, and ufsdirhash_mtx has been locked. */
166	}
167	ufs_dirhashmem += memreqd;
168	mtx_unlock(&ufsdirhash_mtx);
169
170	/*
171	 * Use non-blocking mallocs so that we will revert to a linear
172	 * lookup on failure rather than potentially blocking forever.
173	 */
174	MALLOC(dh, struct dirhash *, sizeof *dh, M_DIRHASH, M_NOWAIT | M_ZERO);
175	if (dh == NULL)
176		return (-1);
177	MALLOC(dh->dh_hash, doff_t **, narrays * sizeof(dh->dh_hash[0]),
178	    M_DIRHASH, M_NOWAIT | M_ZERO);
179	MALLOC(dh->dh_blkfree, u_int8_t *, nblocks * sizeof(dh->dh_blkfree[0]),
180	    M_DIRHASH, M_NOWAIT);
181	if (dh->dh_hash == NULL || dh->dh_blkfree == NULL)
182		goto fail;
183	for (i = 0; i < narrays; i++) {
184		if ((dh->dh_hash[i] = zalloc(ufsdirhash_zone)) == NULL)
185			goto fail;
186		for (j = 0; j < DH_NBLKOFF; j++)
187			dh->dh_hash[i][j] = DIRHASH_EMPTY;
188	}
189
190	/* Initialise the hash table and block statistics. */
191	mtx_init(&dh->dh_mtx, "dirhash", MTX_DEF);
192	dh->dh_narrays = narrays;
193	dh->dh_hlen = nslots;
194	dh->dh_nblk = nblocks;
195	dh->dh_dirblks = dirblocks;
196	for (i = 0; i < dirblocks; i++)
197		dh->dh_blkfree[i] = DIRBLKSIZ / DIRALIGN;
198	for (i = 0; i < DH_NFSTATS; i++)
199		dh->dh_firstfree[i] = -1;
200	dh->dh_firstfree[DH_NFSTATS] = 0;
201	dh->dh_seqopt = 0;
202	dh->dh_seqoff = 0;
203	dh->dh_score = DH_SCOREINIT;
204	ip->i_dirhash = dh;
205
206	bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
207	pos = 0;
208	while (pos < ip->i_size) {
209		/* If necessary, get the next directory block. */
210		if ((pos & bmask) == 0) {
211			if (bp != NULL)
212				brelse(bp);
213			if (UFS_BLKATOFF(vp, (off_t)pos, NULL, &bp) != 0)
214				goto fail;
215		}
216
217		/* Add this entry to the hash. */
218		ep = (struct direct *)((char *)bp->b_data + (pos & bmask));
219		if (ep->d_reclen == 0 || ep->d_reclen >
220		    DIRBLKSIZ - (pos & (DIRBLKSIZ - 1))) {
221			/* Corrupted directory. */
222			brelse(bp);
223			goto fail;
224		}
225		if (ep->d_ino != 0) {
226			/* Add the entry (simplified ufsdirhash_add). */
227			slot = ufsdirhash_hash(dh, ep->d_name, ep->d_namlen);
228			while (DH_ENTRY(dh, slot) != DIRHASH_EMPTY)
229				slot = WRAPINCR(slot, dh->dh_hlen);
230			dh->dh_hused++;
231			DH_ENTRY(dh, slot) = pos;
232			ufsdirhash_adjfree(dh, pos, -DIRSIZ(0, ep));
233		}
234		pos += ep->d_reclen;
235	}
236
237	if (bp != NULL)
238		brelse(bp);
239	mtx_lock(&ufsdirhash_mtx);
240	TAILQ_INSERT_TAIL(&ufsdirhash_list, dh, dh_list);
241	dh->dh_onlist = 1;
242	mtx_unlock(&ufsdirhash_mtx);
243	return (0);
244
245fail:
246	if (dh->dh_hash != NULL) {
247		for (i = 0; i < narrays; i++)
248			if (dh->dh_hash[i] != NULL)
249				zfree(ufsdirhash_zone, dh->dh_hash[i]);
250		FREE(dh->dh_hash, M_DIRHASH);
251	}
252	if (dh->dh_blkfree != NULL)
253		FREE(dh->dh_blkfree, M_DIRHASH);
254	FREE(dh, M_DIRHASH);
255	ip->i_dirhash = NULL;
256	mtx_lock(&ufsdirhash_mtx);
257	ufs_dirhashmem -= memreqd;
258	mtx_unlock(&ufsdirhash_mtx);
259	return (-1);
260}
261
262/*
263 * Free any hash table associated with inode 'ip'.
264 */
265void
266ufsdirhash_free(struct inode *ip)
267{
268	struct dirhash *dh;
269	int i, mem;
270
271	if ((dh = ip->i_dirhash) == NULL)
272		return;
273	mtx_lock(&ufsdirhash_mtx);
274	mtx_lock(&dh->dh_mtx);
275	if (dh->dh_onlist)
276		TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
277	mtx_unlock(&dh->dh_mtx);
278	mtx_unlock(&ufsdirhash_mtx);
279
280	/* The dirhash pointed to by 'dh' is exclusively ours now. */
281
282	mem = sizeof(*dh);
283	if (dh->dh_hash != NULL) {
284		for (i = 0; i < dh->dh_narrays; i++)
285			zfree(ufsdirhash_zone, dh->dh_hash[i]);
286		FREE(dh->dh_hash, M_DIRHASH);
287		FREE(dh->dh_blkfree, M_DIRHASH);
288		mem += dh->dh_narrays * sizeof(*dh->dh_hash) +
289		    dh->dh_narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
290		    dh->dh_nblk * sizeof(*dh->dh_blkfree);
291	}
292	mtx_destroy(&dh->dh_mtx);
293	FREE(dh, M_DIRHASH);
294	ip->i_dirhash = NULL;
295
296	mtx_lock(&ufsdirhash_mtx);
297	ufs_dirhashmem -= mem;
298	mtx_unlock(&ufsdirhash_mtx);
299}
300
301/*
302 * Find the offset of the specified name within the given inode.
303 * Returns 0 on success, ENOENT if the entry does not exist, or
304 * EJUSTRETURN if the caller should revert to a linear search.
305 *
306 * If successful, the directory offset is stored in *offp, and a
307 * pointer to a struct buf containing the entry is stored in *bpp. If
308 * prevoffp is non-NULL, the offset of the previous entry within
309 * the DIRBLKSIZ-sized block is stored in *prevoffp (if the entry
310 * is the first in a block, the start of the block is used).
311 */
312int
313ufsdirhash_lookup(struct inode *ip, char *name, int namelen, doff_t *offp,
314    struct buf **bpp, doff_t *prevoffp)
315{
316	struct dirhash *dh, *dh_next;
317	struct direct *dp;
318	struct vnode *vp;
319	struct buf *bp;
320	doff_t blkoff, bmask, offset, prevoff;
321	int i, slot;
322
323	if ((dh = ip->i_dirhash) == NULL)
324		return (EJUSTRETURN);
325	/*
326	 * Move this dirhash towards the end of the list if it has a
327	 * score higher than the next entry, and aquire the dh_mtx.
328	 * Optimise the case where it's already the last by performing
329	 * an unlocked read of the TAILQ_NEXT pointer.
330	 *
331	 * In both cases, end up holding just dh_mtx.
332	 */
333	if (TAILQ_NEXT(dh, dh_list) != NULL) {
334		mtx_lock(&ufsdirhash_mtx);
335		mtx_lock(&dh->dh_mtx);
336		/*
337		 * If the new score will be greater than that of the next
338		 * entry, then move this entry past it. With both mutexes
339		 * held, dh_next won't go away, but its dh_score could
340		 * change; that's not important since it is just a hint.
341		 */
342		if (dh->dh_hash != NULL &&
343		    (dh_next = TAILQ_NEXT(dh, dh_list)) != NULL &&
344		    dh->dh_score >= dh_next->dh_score) {
345			KASSERT(dh->dh_onlist, ("dirhash: not on list"));
346			TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
347			TAILQ_INSERT_AFTER(&ufsdirhash_list, dh_next, dh,
348			    dh_list);
349		}
350		mtx_unlock(&ufsdirhash_mtx);
351	} else {
352		/* Already the last, though that could change as we wait. */
353		mtx_lock(&dh->dh_mtx);
354	}
355	if (dh->dh_hash == NULL) {
356		mtx_unlock(&dh->dh_mtx);
357		ufsdirhash_free(ip);
358		return (EJUSTRETURN);
359	}
360
361	/* Update the score. */
362	if (dh->dh_score < DH_SCOREMAX)
363		dh->dh_score++;
364
365	vp = ip->i_vnode;
366	bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
367	blkoff = -1;
368	bp = NULL;
369restart:
370	slot = ufsdirhash_hash(dh, name, namelen);
371
372	if (dh->dh_seqopt) {
373		/*
374		 * Sequential access optimisation. dh_seqoff contains the
375		 * offset of the directory entry immediately following
376		 * the last entry that was looked up. Check if this offset
377		 * appears in the hash chain for the name we are looking for.
378		 */
379		for (i = slot; (offset = DH_ENTRY(dh, i)) != DIRHASH_EMPTY;
380		    i = WRAPINCR(i, dh->dh_hlen))
381			if (offset == dh->dh_seqopt)
382				break;
383		if (offset == dh->dh_seqoff) {
384			/*
385			 * We found an entry with the expected offset. This
386			 * is probably the entry we want, but if not, the
387			 * code below will turn off seqoff and retry.
388			 */
389			slot = i;
390		} else
391			dh->dh_seqopt = 0;
392	}
393
394	for (; (offset = DH_ENTRY(dh, slot)) != DIRHASH_EMPTY;
395	    slot = WRAPINCR(slot, dh->dh_hlen)) {
396		if (offset == DIRHASH_DEL)
397			continue;
398		mtx_unlock(&dh->dh_mtx);
399
400		if (offset < 0 || offset >= ip->i_size)
401			panic("ufsdirhash_lookup: bad offset in hash array");
402		if ((offset & ~bmask) != blkoff) {
403			if (bp != NULL)
404				brelse(bp);
405			blkoff = offset & ~bmask;
406			if (UFS_BLKATOFF(vp, (off_t)blkoff, NULL, &bp) != 0)
407				return (EJUSTRETURN);
408		}
409		dp = (struct direct *)(bp->b_data + (offset & bmask));
410		if (dp->d_reclen == 0 || dp->d_reclen >
411		    DIRBLKSIZ - (offset & (DIRBLKSIZ - 1))) {
412			/* Corrupted directory. */
413			brelse(bp);
414			return (EJUSTRETURN);
415		}
416		if (dp->d_namlen == namelen &&
417		    bcmp(dp->d_name, name, namelen) == 0) {
418			/* Found. Get the prev offset if needed. */
419			if (prevoffp != NULL) {
420				if (offset & (DIRBLKSIZ - 1)) {
421					prevoff = ufsdirhash_getprev(dp,
422					    offset);
423					if (prevoff == -1) {
424						brelse(bp);
425						return (EJUSTRETURN);
426					}
427				} else
428					prevoff = offset;
429				*prevoffp = prevoff;
430			}
431
432			/* Check for sequential access, and update offset. */
433			if (dh->dh_seqopt == 0 && dh->dh_seqoff == offset)
434				dh->dh_seqopt = 1;
435			dh->dh_seqoff = offset + DIRSIZ(0, dp);
436
437			*bpp = bp;
438			*offp = offset;
439			return (0);
440		}
441
442		mtx_lock(&dh->dh_mtx);
443		if (dh->dh_hash == NULL) {
444			mtx_unlock(&dh->dh_mtx);
445			if (bp != NULL)
446				brelse(bp);
447			ufsdirhash_free(ip);
448			return (EJUSTRETURN);
449		}
450		/*
451		 * When the name doesn't match in the seqopt case, go back
452		 * and search normally.
453		 */
454		if (dh->dh_seqopt) {
455			dh->dh_seqopt = 0;
456			goto restart;
457		}
458	}
459	mtx_unlock(&dh->dh_mtx);
460	if (bp != NULL)
461		brelse(bp);
462	return (ENOENT);
463}
464
465/*
466 * Find a directory block with room for 'slotneeded' bytes. Returns
467 * the offset of the directory entry that begins the free space.
468 * This will either be the offset of an existing entry that has free
469 * space at the end, or the offset of an entry with d_ino == 0 at
470 * the start of a DIRBLKSIZ block.
471 *
472 * To use the space, the caller may need to compact existing entries in
473 * the directory. The total number of bytes in all of the entries involved
474 * in the compaction is stored in *slotsize. In other words, all of
475 * the entries that must be compacted are exactly contained in the
476 * region beginning at the returned offset and spanning *slotsize bytes.
477 *
478 * Returns -1 if no space was found, indicating that the directory
479 * must be extended.
480 */
481doff_t
482ufsdirhash_findfree(struct inode *ip, int slotneeded, int *slotsize)
483{
484	struct direct *dp;
485	struct dirhash *dh;
486	struct buf *bp;
487	doff_t pos, slotstart;
488	int dirblock, error, freebytes, i;
489
490	if ((dh = ip->i_dirhash) == NULL)
491		return (-1);
492	mtx_lock(&dh->dh_mtx);
493	if (dh->dh_hash == NULL) {
494		mtx_unlock(&dh->dh_mtx);
495		ufsdirhash_free(ip);
496		return (-1);
497	}
498
499	/* Find a directory block with the desired free space. */
500	dirblock = -1;
501	for (i = howmany(slotneeded, DIRALIGN); i <= DH_NFSTATS; i++)
502		if ((dirblock = dh->dh_firstfree[i]) != -1)
503			break;
504	if (dirblock == -1) {
505		mtx_unlock(&dh->dh_mtx);
506		return (-1);
507	}
508
509	KASSERT(dirblock < dh->dh_nblk &&
510	    dh->dh_blkfree[dirblock] >= howmany(slotneeded, DIRALIGN),
511	    ("ufsdirhash_findfree: bad stats"));
512	mtx_unlock(&dh->dh_mtx);
513	pos = dirblock * DIRBLKSIZ;
514	error = UFS_BLKATOFF(ip->i_vnode, (off_t)pos, (char **)&dp, &bp);
515	if (error)
516		return (-1);
517
518	/* Find the first entry with free space. */
519	for (i = 0; i < DIRBLKSIZ; ) {
520		if (dp->d_reclen == 0) {
521			brelse(bp);
522			return (-1);
523		}
524		if (dp->d_ino == 0 || dp->d_reclen > DIRSIZ(0, dp))
525			break;
526		i += dp->d_reclen;
527		dp = (struct direct *)((char *)dp + dp->d_reclen);
528	}
529	if (i > DIRBLKSIZ) {
530		brelse(bp);
531		return (-1);
532	}
533	slotstart = pos + i;
534
535	/* Find the range of entries needed to get enough space */
536	freebytes = 0;
537	while (i < DIRBLKSIZ && freebytes < slotneeded) {
538		freebytes += dp->d_reclen;
539		if (dp->d_ino != 0)
540			freebytes -= DIRSIZ(0, dp);
541		if (dp->d_reclen == 0) {
542			brelse(bp);
543			return (-1);
544		}
545		i += dp->d_reclen;
546		dp = (struct direct *)((char *)dp + dp->d_reclen);
547	}
548	if (i > DIRBLKSIZ) {
549		brelse(bp);
550		return (-1);
551	}
552	if (freebytes < slotneeded)
553		panic("ufsdirhash_findfree: free mismatch");
554	brelse(bp);
555	*slotsize = pos + i - slotstart;
556	return (slotstart);
557}
558
559/*
560 * Return the start of the unused space at the end of a directory, or
561 * -1 if there are no trailing unused blocks.
562 */
563doff_t
564ufsdirhash_enduseful(struct inode *ip)
565{
566
567	struct dirhash *dh;
568	int i;
569
570	if ((dh = ip->i_dirhash) == NULL)
571		return (-1);
572	mtx_lock(&dh->dh_mtx);
573	if (dh->dh_hash == NULL) {
574		mtx_unlock(&dh->dh_mtx);
575		ufsdirhash_free(ip);
576		return (-1);
577	}
578
579	if (dh->dh_blkfree[dh->dh_dirblks - 1] != DIRBLKSIZ / DIRALIGN) {
580		mtx_unlock(&dh->dh_mtx);
581		return (-1);
582	}
583
584	for (i = dh->dh_dirblks - 1; i >= 0; i--)
585		if (dh->dh_blkfree[i] != DIRBLKSIZ / DIRALIGN)
586			break;
587	mtx_unlock(&dh->dh_mtx);
588	return ((doff_t)(i + 1) * DIRBLKSIZ);
589}
590
591/*
592 * Insert information into the hash about a new directory entry. dirp
593 * points to a struct direct containing the entry, and offset specifies
594 * the offset of this entry.
595 */
596void
597ufsdirhash_add(struct inode *ip, struct direct *dirp, doff_t offset)
598{
599	struct dirhash *dh;
600	int slot;
601
602	if ((dh = ip->i_dirhash) == NULL)
603		return;
604	mtx_lock(&dh->dh_mtx);
605	if (dh->dh_hash == NULL) {
606		mtx_unlock(&dh->dh_mtx);
607		ufsdirhash_free(ip);
608		return;
609	}
610
611	KASSERT(offset < dh->dh_dirblks * DIRBLKSIZ,
612	    ("ufsdirhash_add: bad offset"));
613	/*
614	 * Normal hash usage is < 66%. If the usage gets too high then
615	 * remove the hash entirely and let it be rebuilt later.
616	 */
617	if (dh->dh_hused >= (dh->dh_hlen * 3) / 4) {
618		mtx_unlock(&dh->dh_mtx);
619		ufsdirhash_free(ip);
620		return;
621	}
622
623	/* Find a free hash slot (empty or deleted), and add the entry. */
624	slot = ufsdirhash_hash(dh, dirp->d_name, dirp->d_namlen);
625	while (DH_ENTRY(dh, slot) >= 0)
626		slot = WRAPINCR(slot, dh->dh_hlen);
627	if (DH_ENTRY(dh, slot) == DIRHASH_EMPTY)
628		dh->dh_hused++;
629	DH_ENTRY(dh, slot) = offset;
630
631	/* Update the per-block summary info. */
632	ufsdirhash_adjfree(dh, offset, -DIRSIZ(0, dirp));
633	mtx_unlock(&dh->dh_mtx);
634}
635
636/*
637 * Remove the specified directory entry from the hash. The entry to remove
638 * is defined by the name in `dirp', which must exist at the specified
639 * `offset' within the directory.
640 */
641void
642ufsdirhash_remove(struct inode *ip, struct direct *dirp, doff_t offset)
643{
644	struct dirhash *dh;
645	int slot;
646
647	if ((dh = ip->i_dirhash) == NULL)
648		return;
649	mtx_lock(&dh->dh_mtx);
650	if (dh->dh_hash == NULL) {
651		mtx_unlock(&dh->dh_mtx);
652		ufsdirhash_free(ip);
653		return;
654	}
655
656	KASSERT(offset < dh->dh_dirblks * DIRBLKSIZ,
657	    ("ufsdirhash_remove: bad offset"));
658	/* Find the entry */
659	slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, offset);
660
661	/* Remove the hash entry. */
662	ufsdirhash_delslot(dh, slot);
663
664	/* Update the per-block summary info. */
665	ufsdirhash_adjfree(dh, offset, DIRSIZ(0, dirp));
666	mtx_unlock(&dh->dh_mtx);
667}
668
669/*
670 * Change the offset associated with a directory entry in the hash. Used
671 * when compacting directory blocks.
672 */
673void
674ufsdirhash_move(struct inode *ip, struct direct *dirp, doff_t oldoff,
675    doff_t newoff)
676{
677	struct dirhash *dh;
678	int slot;
679
680	if ((dh = ip->i_dirhash) == NULL)
681		return;
682	mtx_lock(&dh->dh_mtx);
683	if (dh->dh_hash == NULL) {
684		mtx_unlock(&dh->dh_mtx);
685		ufsdirhash_free(ip);
686		return;
687	}
688
689	KASSERT(oldoff < dh->dh_dirblks * DIRBLKSIZ &&
690	    newoff < dh->dh_dirblks * DIRBLKSIZ,
691	    ("ufsdirhash_move: bad offset"));
692	/* Find the entry, and update the offset. */
693	slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, oldoff);
694	DH_ENTRY(dh, slot) = newoff;
695	mtx_unlock(&dh->dh_mtx);
696}
697
698/*
699 * Inform dirhash that the directory has grown by one block that
700 * begins at offset (i.e. the new length is offset + DIRBLKSIZ).
701 */
702void
703ufsdirhash_newblk(struct inode *ip, doff_t offset)
704{
705	struct dirhash *dh;
706	int block;
707
708	if ((dh = ip->i_dirhash) == NULL)
709		return;
710	mtx_lock(&dh->dh_mtx);
711	if (dh->dh_hash == NULL) {
712		mtx_unlock(&dh->dh_mtx);
713		ufsdirhash_free(ip);
714		return;
715	}
716
717	KASSERT(offset == dh->dh_dirblks * DIRBLKSIZ,
718	    ("ufsdirhash_newblk: bad offset"));
719	block = offset / DIRBLKSIZ;
720	if (block >= dh->dh_nblk) {
721		/* Out of space; must rebuild. */
722		mtx_unlock(&dh->dh_mtx);
723		ufsdirhash_free(ip);
724		return;
725	}
726	dh->dh_dirblks = block + 1;
727
728	/* Account for the new free block. */
729	dh->dh_blkfree[block] = DIRBLKSIZ / DIRALIGN;
730	if (dh->dh_firstfree[DH_NFSTATS] == -1)
731		dh->dh_firstfree[DH_NFSTATS] = block;
732	mtx_unlock(&dh->dh_mtx);
733}
734
735/*
736 * Inform dirhash that the directory is being truncated.
737 */
738void
739ufsdirhash_dirtrunc(struct inode *ip, doff_t offset)
740{
741	struct dirhash *dh;
742	int block, i;
743
744	if ((dh = ip->i_dirhash) == NULL)
745		return;
746	mtx_lock(&dh->dh_mtx);
747	if (dh->dh_hash == NULL) {
748		mtx_unlock(&dh->dh_mtx);
749		ufsdirhash_free(ip);
750		return;
751	}
752
753	KASSERT(offset <= dh->dh_dirblks * DIRBLKSIZ,
754	    ("ufsdirhash_dirtrunc: bad offset"));
755	block = howmany(offset, DIRBLKSIZ);
756	/*
757	 * If the directory shrinks to less than 1/8 of dh_nblk blocks
758	 * (about 20% of its original size due to the 50% extra added in
759	 * ufsdirhash_build) then free it, and let the caller rebuild
760	 * if necessary.
761	 */
762	if (block < dh->dh_nblk / 8 && dh->dh_narrays > 1) {
763		mtx_unlock(&dh->dh_mtx);
764		ufsdirhash_free(ip);
765		return;
766	}
767
768	/*
769	 * Remove any `first free' information pertaining to the
770	 * truncated blocks. All blocks we're removing should be
771	 * completely unused.
772	 */
773	if (dh->dh_firstfree[DH_NFSTATS] >= block)
774		dh->dh_firstfree[DH_NFSTATS] = -1;
775	for (i = block; i < dh->dh_dirblks; i++)
776		if (dh->dh_blkfree[i] != DIRBLKSIZ / DIRALIGN)
777			panic("ufsdirhash_dirtrunc: blocks in use");
778	for (i = 0; i < DH_NFSTATS; i++)
779		if (dh->dh_firstfree[i] >= block)
780			panic("ufsdirhash_dirtrunc: first free corrupt");
781	dh->dh_dirblks = block;
782	mtx_unlock(&dh->dh_mtx);
783}
784
785/*
786 * Debugging function to check that the dirhash information about
787 * a directory block matches its actual contents. Panics if a mismatch
788 * is detected.
789 *
790 * On entry, `buf' should point to the start of an in-core
791 * DIRBLKSIZ-sized directory block, and `offset' should contain the
792 * offset from the start of the directory of that block.
793 */
794void
795ufsdirhash_checkblock(struct inode *ip, char *buf, doff_t offset)
796{
797	struct dirhash *dh;
798	struct direct *dp;
799	int block, ffslot, i, nfree;
800
801	if (!ufs_dirhashcheck)
802		return;
803	if ((dh = ip->i_dirhash) == NULL)
804		return;
805	mtx_lock(&dh->dh_mtx);
806	if (dh->dh_hash == NULL) {
807		mtx_unlock(&dh->dh_mtx);
808		ufsdirhash_free(ip);
809		return;
810	}
811
812	block = offset / DIRBLKSIZ;
813	if ((offset & (DIRBLKSIZ - 1)) != 0 || block >= dh->dh_dirblks)
814		panic("ufsdirhash_checkblock: bad offset");
815
816	nfree = 0;
817	for (i = 0; i < DIRBLKSIZ; i += dp->d_reclen) {
818		dp = (struct direct *)(buf + i);
819		if (dp->d_reclen == 0 || i + dp->d_reclen > DIRBLKSIZ)
820			panic("ufsdirhash_checkblock: bad dir");
821
822		if (dp->d_ino == 0) {
823#if 0
824			/*
825			 * XXX entries with d_ino == 0 should only occur
826			 * at the start of a DIRBLKSIZ block. However the
827			 * ufs code is tolerant of such entries at other
828			 * offsets, and fsck does not fix them.
829			 */
830			if (i != 0)
831				panic("ufsdirhash_checkblock: bad dir inode");
832#endif
833			nfree += dp->d_reclen;
834			continue;
835		}
836
837		/* Check that the entry	exists (will panic if it doesn't). */
838		ufsdirhash_findslot(dh, dp->d_name, dp->d_namlen, offset + i);
839
840		nfree += dp->d_reclen - DIRSIZ(0, dp);
841	}
842	if (i != DIRBLKSIZ)
843		panic("ufsdirhash_checkblock: bad dir end");
844
845	if (dh->dh_blkfree[block] * DIRALIGN != nfree)
846		panic("ufsdirhash_checkblock: bad free count");
847
848	ffslot = nfree / DIRALIGN;
849	if (ffslot > DH_NFSTATS)
850		ffslot = DH_NFSTATS;
851	for (i = 0; i <= DH_NFSTATS; i++)
852		if (dh->dh_firstfree[i] == block && i != ffslot)
853			panic("ufsdirhash_checkblock: bad first-free");
854	mtx_unlock(&dh->dh_mtx);
855}
856
857/*
858 * Hash the specified filename into a dirhash slot.
859 */
860static int
861ufsdirhash_hash(struct dirhash *dh, char *name, int namelen)
862{
863	return (fnv_32_buf(name, namelen, FNV1_32_INIT) % dh->dh_hlen);
864}
865
866/*
867 * Adjust the number of free bytes in the block containing `offset'
868 * by the value specified by `diff'.
869 *
870 * The caller must ensure we have exclusive access to `dh'; normally
871 * that means that dh_mtx should be held, but this is also called
872 * from ufsdirhash_build() where exclusive access can be assumed.
873 */
874static void
875ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff)
876{
877	int block, i, nfidx, ofidx;
878
879	/* Update the per-block summary info. */
880	block = offset / DIRBLKSIZ;
881	KASSERT(block < dh->dh_nblk && block < dh->dh_dirblks,
882	     ("dirhash bad offset"));
883	ofidx = dh->dh_blkfree[block];
884	if (ofidx > DH_NFSTATS)
885		ofidx = DH_NFSTATS;
886	dh->dh_blkfree[block] = (int)dh->dh_blkfree[block] + (diff / DIRALIGN);
887	nfidx = dh->dh_blkfree[block];
888	if (nfidx > DH_NFSTATS)
889		nfidx = DH_NFSTATS;
890
891	/* Update the `first free' list if necessary. */
892	if (ofidx != nfidx) {
893		/* If removing, scan forward for the next block. */
894		if (dh->dh_firstfree[ofidx] == block) {
895			for (i = block + 1; i < dh->dh_dirblks; i++)
896				if (dh->dh_blkfree[i] == ofidx)
897					break;
898			dh->dh_firstfree[ofidx] = (i < dh->dh_dirblks) ? i : -1;
899		}
900
901		/* Make this the new `first free' if necessary */
902		if (dh->dh_firstfree[nfidx] > block ||
903		    dh->dh_firstfree[nfidx] == -1)
904			dh->dh_firstfree[nfidx] = block;
905	}
906}
907
908/*
909 * Find the specified name which should have the specified offset.
910 * Returns a slot number, and panics on failure.
911 *
912 * `dh' must be locked on entry and remains so on return.
913 */
914static int
915ufsdirhash_findslot(struct dirhash *dh, char *name, int namelen, doff_t offset)
916{
917	int slot;
918
919	mtx_assert(&dh->dh_mtx, MA_OWNED);
920
921	/* Find the entry. */
922	KASSERT(dh->dh_hused < dh->dh_hlen, ("dirhash find full"));
923	slot = ufsdirhash_hash(dh, name, namelen);
924	while (DH_ENTRY(dh, slot) != offset &&
925	    DH_ENTRY(dh, slot) != DIRHASH_EMPTY)
926		slot = WRAPINCR(slot, dh->dh_hlen);
927	if (DH_ENTRY(dh, slot) != offset)
928		panic("ufsdirhash_findslot: '%.*s' not found", namelen, name);
929
930	return (slot);
931}
932
933/*
934 * Remove the entry corresponding to the specified slot from the hash array.
935 *
936 * `dh' must be locked on entry and remains so on return.
937 */
938static void
939ufsdirhash_delslot(struct dirhash *dh, int slot)
940{
941	int i;
942
943	mtx_assert(&dh->dh_mtx, MA_OWNED);
944
945	/* Mark the entry as deleted. */
946	DH_ENTRY(dh, slot) = DIRHASH_DEL;
947
948	/* If this is the end of a chain of DIRHASH_DEL slots, remove them. */
949	for (i = slot; DH_ENTRY(dh, i) == DIRHASH_DEL; )
950		i = WRAPINCR(i, dh->dh_hlen);
951	if (DH_ENTRY(dh, i) == DIRHASH_EMPTY) {
952		for (i = slot; DH_ENTRY(dh, i) == DIRHASH_DEL; ) {
953			DH_ENTRY(dh, i) = DIRHASH_EMPTY;
954			dh->dh_hused--;
955			i = WRAPINCR(i, dh->dh_hlen);
956		}
957		KASSERT(dh->dh_hused >= 0, ("ufsdirhash_delslot neg hlen"));
958	}
959}
960
961/*
962 * Given a directory entry and its offset, find the offset of the
963 * previous entry in the same DIRBLKSIZ-sized block. Returns an
964 * offset, or -1 if there is no previous entry in the block or some
965 * other problem occurred.
966 */
967static doff_t
968ufsdirhash_getprev(struct direct *dirp, doff_t offset)
969{
970	struct direct *dp;
971	char *blkbuf;
972	doff_t blkoff, prevoff;
973	int entrypos, i;
974
975	blkoff = offset & ~(DIRBLKSIZ - 1);	/* offset of start of block */
976	entrypos = offset & (DIRBLKSIZ - 1);	/* entry relative to block */
977	blkbuf = (char *)dirp - entrypos;
978	prevoff = blkoff;
979
980	/* If `offset' is the start of a block, there is no previous entry. */
981	if (entrypos == 0)
982		return (-1);
983
984	/* Scan from the start of the block until we get to the entry. */
985	for (i = 0; i < entrypos; i += dp->d_reclen) {
986		dp = (struct direct *)(blkbuf + i);
987		if (dp->d_reclen == 0 || i + dp->d_reclen > entrypos)
988			return (-1);	/* Corrupted directory. */
989		prevoff = blkoff + i;
990	}
991	return (prevoff);
992}
993
994/*
995 * Try to free up `wanted' bytes by stealing memory from existing
996 * dirhashes. Returns zero with ufsdirhash_mtx locked if successful.
997 */
998static int
999ufsdirhash_recycle(int wanted)
1000{
1001	struct dirhash *dh;
1002	doff_t **hash;
1003	u_int8_t *blkfree;
1004	int i, mem, narrays;
1005
1006	mtx_lock(&ufsdirhash_mtx);
1007	while (wanted + ufs_dirhashmem > ufs_dirhashmaxmem) {
1008		/* Find a dirhash, and lock it. */
1009		if ((dh = TAILQ_FIRST(&ufsdirhash_list)) == NULL) {
1010			mtx_unlock(&ufsdirhash_mtx);
1011			return (-1);
1012		}
1013		mtx_lock(&dh->dh_mtx);
1014		KASSERT(dh->dh_hash != NULL, ("dirhash: NULL hash on list"));
1015
1016		/* Decrement the score; only recycle if it becomes zero. */
1017		if (--dh->dh_score > 0) {
1018			mtx_unlock(&dh->dh_mtx);
1019			mtx_unlock(&ufsdirhash_mtx);
1020			return (-1);
1021		}
1022
1023		/* Remove it from the list and detach its memory. */
1024		TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
1025		dh->dh_onlist = 0;
1026		hash = dh->dh_hash;
1027		dh->dh_hash = NULL;
1028		blkfree = dh->dh_blkfree;
1029		dh->dh_blkfree = NULL;
1030		narrays = dh->dh_narrays;
1031		mem = narrays * sizeof(*dh->dh_hash) +
1032		    narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
1033		    dh->dh_nblk * sizeof(*dh->dh_blkfree);
1034
1035		/* Unlock everything, free the detached memory. */
1036		mtx_unlock(&dh->dh_mtx);
1037		mtx_unlock(&ufsdirhash_mtx);
1038		for (i = 0; i < narrays; i++)
1039			zfree(ufsdirhash_zone, hash[i]);
1040		FREE(hash, M_DIRHASH);
1041		FREE(blkfree, M_DIRHASH);
1042
1043		/* Account for the returned memory, and repeat if necessary. */
1044		mtx_lock(&ufsdirhash_mtx);
1045		ufs_dirhashmem -= mem;
1046	}
1047	/* Success; return with ufsdirhash_mtx locked. */
1048	return (0);
1049}
1050
1051
1052static void
1053ufsdirhash_init()
1054{
1055	ufsdirhash_zone = zinit("DIRHASH", DH_NBLKOFF * sizeof(daddr_t), 0,
1056	    0, 1);
1057	mtx_init(&ufsdirhash_mtx, "dirhash list", MTX_DEF);
1058	TAILQ_INIT(&ufsdirhash_list);
1059}
1060SYSINIT(ufsdirhash, SI_SUB_PSEUDO, SI_ORDER_ANY, ufsdirhash_init, NULL)
1061
1062
1063#endif /* UFS_DIRHASH */
1064