ufs_dirhash.c revision 79561
1/*
2 * Copyright (c) 2001 Ian Dowse.  All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 *    notice, this list of conditions and the following disclaimer in the
11 *    documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
24 *
25 * $FreeBSD: head/sys/ufs/ufs/ufs_dirhash.c 79561 2001-07-10 21:21:29Z iedowse $
26 */
27/*
28 * This implements a hash-based lookup scheme for UFS directories.
29 */
30
31#include "opt_ufs.h"
32
33#ifdef UFS_DIRHASH
34
35#include <sys/param.h>
36#include <sys/systm.h>
37#include <sys/kernel.h>
38#include <sys/mutex.h>
39#include <sys/malloc.h>
40#include <sys/fnv_hash.h>
41#include <sys/proc.h>
42#include <sys/bio.h>
43#include <sys/buf.h>
44#include <sys/vnode.h>
45#include <sys/mount.h>
46#include <sys/sysctl.h>
47#include <vm/vm_zone.h>
48
49#include <ufs/ufs/quota.h>
50#include <ufs/ufs/inode.h>
51#include <ufs/ufs/dir.h>
52#include <ufs/ufs/dirhash.h>
53#include <ufs/ufs/extattr.h>
54#include <ufs/ufs/ufsmount.h>
55#include <ufs/ufs/ufs_extern.h>
56
57#define WRAPINCR(val, limit)	(((val) + 1 == (limit)) ? 0 : ((val) + 1))
58#define OFSFMT(vp)		((vp)->v_mount->mnt_maxsymlinklen <= 0)
59
60static MALLOC_DEFINE(M_DIRHASH, "UFS dirhash", "UFS directory hash tables");
61
62SYSCTL_NODE(_vfs, OID_AUTO, ufs, CTLFLAG_RD, 0, "UFS filesystem");
63
64static int ufs_mindirhashsize = DIRBLKSIZ * 5;
65SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_minsize, CTLFLAG_RW,
66    &ufs_mindirhashsize,
67    0, "minimum directory size in bytes for which to use hashed lookup");
68static int ufs_dirhashmaxmem = 2 * 1024 * 1024;
69SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_maxmem, CTLFLAG_RW, &ufs_dirhashmaxmem,
70    0, "maximum allowed dirhash memory usage");
71static int ufs_dirhashmem;
72SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_mem, CTLFLAG_RD, &ufs_dirhashmem,
73    0, "current dirhash memory usage");
74static int ufs_dirhashcheck = 1;
75SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_docheck, CTLFLAG_RW, &ufs_dirhashcheck,
76    0, "enable extra sanity tests");
77
78
79static int ufsdirhash_hash(struct dirhash *dh, char *name, int namelen);
80static void ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff);
81static void ufsdirhash_delslot(struct dirhash *dh, int slot);
82static int ufsdirhash_findslot(struct dirhash *dh, char *name, int namelen,
83	   doff_t offset);
84static doff_t ufsdirhash_getprev(struct direct *dp, doff_t offset);
85static void ufsdirhash_init(void);
86static int ufsdirhash_recycle(int wanted);
87
88static vm_zone_t	ufsdirhash_zone;
89
90/* Dirhash list; recently-used entries are near the tail. */
91static TAILQ_HEAD(, dirhash) ufsdirhash_list;
92
93/* Protects: ufsdirhash_list, `dh_list' field, ufs_dirhashmem. */
94static struct mtx	ufsdirhash_mtx;
95
96/*
97 * Locking order:
98 *	ufsdirhash_mtx
99 *	dh_mtx
100 *
101 * The dh_mtx mutex should be aquired either via the inode lock, or via
102 * ufsdirhash_mtx. Only the owner of the inode may free the associated
103 * dirhash, but anything can steal its memory and set dh_hash to NULL.
104 */
105
106/*
107 * Attempt to build up a hash table for the directory contents in
108 * inode 'ip'. Returns 0 on success, or -1 of the operation failed.
109 */
110int
111ufsdirhash_build(struct inode *ip)
112{
113	struct dirhash *dh;
114	struct buf *bp = NULL;
115	struct direct *ep;
116	struct vnode *vp;
117	doff_t bmask, pos;
118	int dirblocks, i, j, memreqd, nblocks, narrays, nslots, slot;
119
120	/* Check if we can/should use dirhash. */
121	if (ip->i_dirhash == NULL) {
122		if (ip->i_size < ufs_mindirhashsize || OFSFMT(ip->i_vnode))
123			return (-1);
124	} else {
125		/* Hash exists, but sysctls could have changed. */
126		if (ip->i_size < ufs_mindirhashsize ||
127		    ufs_dirhashmem > ufs_dirhashmaxmem) {
128			ufsdirhash_free(ip);
129			return (-1);
130		}
131		/* Check if hash exists and is intact (note: unlocked read). */
132		if (ip->i_dirhash->dh_hash != NULL)
133			return (0);
134		/* Free the old, recycled hash and build a new one. */
135		ufsdirhash_free(ip);
136	}
137
138	vp = ip->i_vnode;
139	/* Allocate 50% more entries than this dir size could ever need. */
140	KASSERT(ip->i_size >= DIRBLKSIZ, ("ufsdirhash_build size"));
141	nslots = ip->i_size / DIRECTSIZ(1);
142	nslots = (nslots * 3 + 1) / 2;
143	narrays = howmany(nslots, DH_NBLKOFF);
144	nslots = narrays * DH_NBLKOFF;
145	dirblocks = howmany(ip->i_size, DIRBLKSIZ);
146	nblocks = (dirblocks * 3 + 1) / 2;
147
148	memreqd = sizeof(*dh) + narrays * sizeof(*dh->dh_hash) +
149	    narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
150	    nblocks * sizeof(*dh->dh_blkfree);
151	mtx_lock(&ufsdirhash_mtx);
152	if (memreqd + ufs_dirhashmem > ufs_dirhashmaxmem) {
153		mtx_unlock(&ufsdirhash_mtx);
154		if (memreqd > ufs_dirhashmaxmem / 2)
155			return (-1);
156
157		/* Try to free some space. */
158		if (ufsdirhash_recycle(memreqd) != 0)
159			return (-1);
160		/* Enough was freed, and ufsdirhash_mtx has been locked. */
161	}
162	ufs_dirhashmem += memreqd;
163	mtx_unlock(&ufsdirhash_mtx);
164
165	/*
166	 * Use non-blocking mallocs so that we will revert to a linear
167	 * lookup on failure rather than potentially blocking forever.
168	 */
169	MALLOC(dh, struct dirhash *, sizeof *dh, M_DIRHASH, M_NOWAIT | M_ZERO);
170	if (dh == NULL)
171		return (-1);
172	MALLOC(dh->dh_hash, doff_t **, narrays * sizeof(dh->dh_hash[0]),
173	    M_DIRHASH, M_NOWAIT | M_ZERO);
174	MALLOC(dh->dh_blkfree, u_int8_t *, nblocks * sizeof(dh->dh_blkfree[0]),
175	    M_DIRHASH, M_NOWAIT);
176	if (dh->dh_hash == NULL || dh->dh_blkfree == NULL)
177		goto fail;
178	for (i = 0; i < narrays; i++) {
179		if ((dh->dh_hash[i] = zalloc(ufsdirhash_zone)) == NULL)
180			goto fail;
181		for (j = 0; j < DH_NBLKOFF; j++)
182			dh->dh_hash[i][j] = DIRHASH_EMPTY;
183	}
184
185	/* Initialise the hash table and block statistics. */
186	mtx_init(&dh->dh_mtx, "dirhash", MTX_DEF);
187	dh->dh_narrays = narrays;
188	dh->dh_hlen = nslots;
189	dh->dh_nblk = nblocks;
190	dh->dh_dirblks = dirblocks;
191	for (i = 0; i < dirblocks; i++)
192		dh->dh_blkfree[i] = DIRBLKSIZ / DIRALIGN;
193	for (i = 0; i < DH_NFSTATS; i++)
194		dh->dh_firstfree[i] = -1;
195	dh->dh_firstfree[DH_NFSTATS] = 0;
196	dh->dh_seqopt = 0;
197	dh->dh_seqoff = 0;
198	dh->dh_score = DH_SCOREINIT;
199	ip->i_dirhash = dh;
200
201	bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
202	pos = 0;
203	while (pos < ip->i_size) {
204		/* If necessary, get the next directory block. */
205		if ((pos & bmask) == 0) {
206			if (bp != NULL)
207				brelse(bp);
208			if (UFS_BLKATOFF(vp, (off_t)pos, NULL, &bp) != 0)
209				goto fail;
210		}
211
212		/* Add this entry to the hash. */
213		ep = (struct direct *)((char *)bp->b_data + (pos & bmask));
214		if (ep->d_reclen == 0 || ep->d_reclen >
215		    DIRBLKSIZ - (pos & (DIRBLKSIZ - 1))) {
216			/* Corrupted directory. */
217			brelse(bp);
218			goto fail;
219		}
220		if (ep->d_ino != 0) {
221			/* Add the entry (simplified ufsdirhash_add). */
222			slot = ufsdirhash_hash(dh, ep->d_name, ep->d_namlen);
223			while (DH_ENTRY(dh, slot) != DIRHASH_EMPTY)
224				slot = WRAPINCR(slot, dh->dh_hlen);
225			dh->dh_hused++;
226			DH_ENTRY(dh, slot) = pos;
227			ufsdirhash_adjfree(dh, pos, -DIRSIZ(0, ep));
228		}
229		pos += ep->d_reclen;
230	}
231
232	if (bp != NULL)
233		brelse(bp);
234	mtx_lock(&ufsdirhash_mtx);
235	TAILQ_INSERT_TAIL(&ufsdirhash_list, dh, dh_list);
236	dh->dh_onlist = 1;
237	mtx_unlock(&ufsdirhash_mtx);
238	return (0);
239
240fail:
241	if (dh->dh_hash != NULL) {
242		for (i = 0; i < narrays; i++)
243			if (dh->dh_hash[i] != NULL)
244				zfree(ufsdirhash_zone, dh->dh_hash[i]);
245		FREE(dh->dh_hash, M_DIRHASH);
246	}
247	if (dh->dh_blkfree != NULL)
248		FREE(dh->dh_blkfree, M_DIRHASH);
249	FREE(dh, M_DIRHASH);
250	ip->i_dirhash = NULL;
251	mtx_lock(&ufsdirhash_mtx);
252	ufs_dirhashmem -= memreqd;
253	mtx_unlock(&ufsdirhash_mtx);
254	return (-1);
255}
256
257/*
258 * Free any hash table associated with inode 'ip'.
259 */
260void
261ufsdirhash_free(struct inode *ip)
262{
263	struct dirhash *dh;
264	int i, mem;
265
266	if ((dh = ip->i_dirhash) == NULL)
267		return;
268	mtx_lock(&ufsdirhash_mtx);
269	mtx_lock(&dh->dh_mtx);
270	if (dh->dh_onlist)
271		TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
272	mtx_unlock(&dh->dh_mtx);
273	mtx_unlock(&ufsdirhash_mtx);
274
275	/* The dirhash pointed to by 'dh' is exclusively ours now. */
276
277	mem = sizeof(*dh);
278	if (dh->dh_hash != NULL) {
279		for (i = 0; i < dh->dh_narrays; i++)
280			zfree(ufsdirhash_zone, dh->dh_hash[i]);
281		FREE(dh->dh_hash, M_DIRHASH);
282		FREE(dh->dh_blkfree, M_DIRHASH);
283		mem += dh->dh_narrays * sizeof(*dh->dh_hash) +
284		    dh->dh_narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
285		    dh->dh_nblk * sizeof(*dh->dh_blkfree);
286	}
287	mtx_destroy(&dh->dh_mtx);
288	FREE(dh, M_DIRHASH);
289	ip->i_dirhash = NULL;
290
291	mtx_lock(&ufsdirhash_mtx);
292	ufs_dirhashmem -= mem;
293	mtx_unlock(&ufsdirhash_mtx);
294}
295
296/*
297 * Find the offset of the specified name within the given inode.
298 * Returns 0 on success, ENOENT if the entry does not exist, or
299 * EJUSTRETURN if the caller should revert to a linear search.
300 *
301 * If successful, the directory offset is stored in *offp. If
302 * prevoffp is non-NULL, the offset of the previous entry within
303 * the DIRBLKSIZ-sized block is stored in *prevoffp (if the entry
304 * is the first in a block, the start of the block is used).
305 */
306int
307ufsdirhash_lookup(struct inode *ip, char *name, int namelen, doff_t *offp,
308    doff_t *prevoffp)
309{
310	struct dirhash *dh, *dh_next;
311	struct direct *dp;
312	struct vnode *vp;
313	struct buf *bp;
314	doff_t blkoff, bmask, offset, prevoff;
315	int i, slot;
316
317	if ((dh = ip->i_dirhash) == NULL)
318		return (EJUSTRETURN);
319	/*
320	 * Move this dirhash towards the end of the list if it has a
321	 * score higher than the next entry, and aquire the dh_mtx.
322	 * Optimise the case where it's already the last by performing
323	 * an unlocked read of the TAILQ_NEXT pointer.
324	 *
325	 * In both cases, end up holding just dh_mtx.
326	 */
327	if (TAILQ_NEXT(dh, dh_list) != NULL) {
328		mtx_lock(&ufsdirhash_mtx);
329		mtx_lock(&dh->dh_mtx);
330		/*
331		 * If the new score will be greater than that of the next
332		 * entry, then move this entry past it. With both mutexes
333		 * held, dh_next won't go away, but its dh_score could
334		 * change; that's not important since it is just a hint.
335		 */
336		if (dh->dh_hash != NULL &&
337		    (dh_next = TAILQ_NEXT(dh, dh_list)) != NULL &&
338		    dh->dh_score >= dh_next->dh_score) {
339			KASSERT(dh->dh_onlist, ("dirhash: not on list"));
340			TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
341			TAILQ_INSERT_AFTER(&ufsdirhash_list, dh_next, dh,
342			    dh_list);
343		}
344		mtx_unlock(&ufsdirhash_mtx);
345	} else {
346		/* Already the last, though that could change as we wait. */
347		mtx_lock(&dh->dh_mtx);
348	}
349	if (dh->dh_hash == NULL) {
350		mtx_unlock(&dh->dh_mtx);
351		ufsdirhash_free(ip);
352		return (EJUSTRETURN);
353	}
354
355	/* Update the score. */
356	if (dh->dh_score < DH_SCOREMAX)
357		dh->dh_score++;
358
359	vp = ip->i_vnode;
360	bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
361	blkoff = -1;
362	bp = NULL;
363restart:
364	slot = ufsdirhash_hash(dh, name, namelen);
365
366	if (dh->dh_seqopt) {
367		/*
368		 * Sequential access optimisation. dh_seqoff contains the
369		 * offset of the directory entry immediately following
370		 * the last entry that was looked up. Check if this offset
371		 * appears in the hash chain for the name we are looking for.
372		 */
373		for (i = slot; (offset = DH_ENTRY(dh, i)) != DIRHASH_EMPTY;
374		    i = WRAPINCR(i, dh->dh_hlen))
375			if (offset == dh->dh_seqopt)
376				break;
377		if (offset == dh->dh_seqoff) {
378			/*
379			 * We found an entry with the expected offset. This
380			 * is probably the entry we want, but if not, the
381			 * code below will turn off seqoff and retry.
382			 */
383			slot = i;
384		} else
385			dh->dh_seqopt = 0;
386	}
387
388	for (; (offset = DH_ENTRY(dh, slot)) != DIRHASH_EMPTY;
389	    slot = WRAPINCR(slot, dh->dh_hlen)) {
390		if (offset == DIRHASH_DEL)
391			continue;
392		mtx_unlock(&dh->dh_mtx);
393
394		if (offset < 0 || offset >= ip->i_size)
395			panic("ufsdirhash_lookup: bad offset in hash array");
396		if ((offset & ~bmask) != blkoff) {
397			if (bp != NULL)
398				brelse(bp);
399			blkoff = offset & ~bmask;
400			if (UFS_BLKATOFF(vp, (off_t)blkoff, NULL, &bp) != 0)
401				return (EJUSTRETURN);
402		}
403		dp = (struct direct *)(bp->b_data + (offset & bmask));
404		if (dp->d_reclen == 0 || dp->d_reclen >
405		    DIRBLKSIZ - (offset & (DIRBLKSIZ - 1))) {
406			/* Corrupted directory. */
407			brelse(bp);
408			return (EJUSTRETURN);
409		}
410		if (dp->d_namlen == namelen &&
411		    bcmp(dp->d_name, name, namelen) == 0) {
412			/* Found. Get the prev offset if needed. */
413			if (prevoffp != NULL) {
414				if (offset & (DIRBLKSIZ - 1)) {
415					prevoff = ufsdirhash_getprev(dp,
416					    offset);
417					if (prevoff == -1) {
418						brelse(bp);
419						return (EJUSTRETURN);
420					}
421				} else
422					prevoff = offset;
423				*prevoffp = prevoff;
424			}
425
426			/* Check for sequential access, and update offset. */
427			if (dh->dh_seqopt == 0 && dh->dh_seqoff == offset)
428				dh->dh_seqopt = 1;
429			dh->dh_seqoff = offset + DIRSIZ(0, dp);
430
431			brelse(bp);
432			*offp = offset;
433			return (0);
434		}
435
436		mtx_lock(&dh->dh_mtx);
437		if (dh->dh_hash == NULL) {
438			mtx_unlock(&dh->dh_mtx);
439			if (bp != NULL)
440				brelse(bp);
441			ufsdirhash_free(ip);
442			return (EJUSTRETURN);
443		}
444		/*
445		 * When the name doesn't match in the seqopt case, go back
446		 * and search normally.
447		 */
448		if (dh->dh_seqopt) {
449			dh->dh_seqopt = 0;
450			goto restart;
451		}
452	}
453	mtx_unlock(&dh->dh_mtx);
454	if (bp != NULL)
455		brelse(bp);
456	return (ENOENT);
457}
458
459/*
460 * Find a directory block with room for 'slotneeded' bytes. Returns
461 * the offset of the directory entry that begins the free space.
462 * This will either be the offset of an existing entry that has free
463 * space at the end, or the offset of an entry with d_ino == 0 at
464 * the start of a DIRBLKSIZ block.
465 *
466 * To use the space, the caller may need to compact existing entries in
467 * the directory. The total number of bytes in all of the entries involved
468 * in the compaction is stored in *slotsize. In other words, all of
469 * the entries that must be compacted are exactly contained in the
470 * region beginning at the returned offset and spanning *slotsize bytes.
471 *
472 * Returns -1 if no space was found, indicating that the directory
473 * must be extended.
474 */
475doff_t
476ufsdirhash_findfree(struct inode *ip, int slotneeded, int *slotsize)
477{
478	struct direct *dp;
479	struct dirhash *dh;
480	struct buf *bp;
481	doff_t pos, slotstart;
482	int dirblock, error, freebytes, i;
483
484	if ((dh = ip->i_dirhash) == NULL)
485		return (-1);
486	mtx_lock(&dh->dh_mtx);
487	if (dh->dh_hash == NULL) {
488		mtx_unlock(&dh->dh_mtx);
489		ufsdirhash_free(ip);
490		return (-1);
491	}
492
493	/* Find a directory block with the desired free space. */
494	dirblock = -1;
495	for (i = howmany(slotneeded, DIRALIGN); i <= DH_NFSTATS; i++)
496		if ((dirblock = dh->dh_firstfree[i]) != -1)
497			break;
498	if (dirblock == -1) {
499		mtx_unlock(&dh->dh_mtx);
500		return (-1);
501	}
502
503	KASSERT(dirblock < dh->dh_nblk &&
504	    dh->dh_blkfree[dirblock] >= howmany(slotneeded, DIRALIGN),
505	    ("ufsdirhash_findfree: bad stats"));
506	mtx_unlock(&dh->dh_mtx);
507	pos = dirblock * DIRBLKSIZ;
508	error = UFS_BLKATOFF(ip->i_vnode, (off_t)pos, (char **)&dp, &bp);
509	if (error)
510		return (-1);
511
512	/* Find the first entry with free space. */
513	for (i = 0; i < DIRBLKSIZ; ) {
514		if (dp->d_reclen == 0) {
515			brelse(bp);
516			return (-1);
517		}
518		if (dp->d_ino == 0 || dp->d_reclen > DIRSIZ(0, dp))
519			break;
520		i += dp->d_reclen;
521		dp = (struct direct *)((char *)dp + dp->d_reclen);
522	}
523	if (i > DIRBLKSIZ) {
524		brelse(bp);
525		return (-1);
526	}
527	slotstart = pos + i;
528
529	/* Find the range of entries needed to get enough space */
530	freebytes = 0;
531	while (i < DIRBLKSIZ && freebytes < slotneeded) {
532		freebytes += dp->d_reclen;
533		if (dp->d_ino != 0)
534			freebytes -= DIRSIZ(0, dp);
535		if (dp->d_reclen == 0) {
536			brelse(bp);
537			return (-1);
538		}
539		i += dp->d_reclen;
540		dp = (struct direct *)((char *)dp + dp->d_reclen);
541	}
542	if (i > DIRBLKSIZ) {
543		brelse(bp);
544		return (-1);
545	}
546	if (freebytes < slotneeded)
547		panic("ufsdirhash_findfree: free mismatch");
548	brelse(bp);
549	*slotsize = pos + i - slotstart;
550	return (slotstart);
551}
552
553/*
554 * Return the start of the unused space at the end of a directory, or
555 * -1 if there are no trailing unused blocks.
556 */
557doff_t
558ufsdirhash_enduseful(struct inode *ip)
559{
560
561	struct dirhash *dh;
562	int i;
563
564	if ((dh = ip->i_dirhash) == NULL)
565		return (-1);
566	mtx_lock(&dh->dh_mtx);
567	if (dh->dh_hash == NULL) {
568		mtx_unlock(&dh->dh_mtx);
569		ufsdirhash_free(ip);
570		return (-1);
571	}
572
573	if (dh->dh_blkfree[dh->dh_dirblks - 1] != DIRBLKSIZ / DIRALIGN) {
574		mtx_unlock(&dh->dh_mtx);
575		return (-1);
576	}
577
578	for (i = dh->dh_dirblks - 1; i >= 0; i--)
579		if (dh->dh_blkfree[i] != DIRBLKSIZ / DIRALIGN)
580			break;
581	mtx_unlock(&dh->dh_mtx);
582	return ((doff_t)(i + 1) * DIRBLKSIZ);
583}
584
585/*
586 * Insert information into the hash about a new directory entry. dirp
587 * points to a struct direct containing the entry, and offset specifies
588 * the offset of this entry.
589 */
590void
591ufsdirhash_add(struct inode *ip, struct direct *dirp, doff_t offset)
592{
593	struct dirhash *dh;
594	int slot;
595
596	if ((dh = ip->i_dirhash) == NULL)
597		return;
598	mtx_lock(&dh->dh_mtx);
599	if (dh->dh_hash == NULL) {
600		mtx_unlock(&dh->dh_mtx);
601		ufsdirhash_free(ip);
602		return;
603	}
604
605	KASSERT(offset < dh->dh_dirblks * DIRBLKSIZ,
606	    ("ufsdirhash_add: bad offset"));
607	/*
608	 * Normal hash usage is < 66%. If the usage gets too high then
609	 * remove the hash entirely and let it be rebuilt later.
610	 */
611	if (dh->dh_hused >= (dh->dh_hlen * 3) / 4) {
612		mtx_unlock(&dh->dh_mtx);
613		ufsdirhash_free(ip);
614		return;
615	}
616
617	/* Find a free hash slot (empty or deleted), and add the entry. */
618	slot = ufsdirhash_hash(dh, dirp->d_name, dirp->d_namlen);
619	while (DH_ENTRY(dh, slot) >= 0)
620		slot = WRAPINCR(slot, dh->dh_hlen);
621	if (DH_ENTRY(dh, slot) == DIRHASH_EMPTY)
622		dh->dh_hused++;
623	DH_ENTRY(dh, slot) = offset;
624
625	/* Update the per-block summary info. */
626	ufsdirhash_adjfree(dh, offset, -DIRSIZ(0, dirp));
627	mtx_unlock(&dh->dh_mtx);
628}
629
630/*
631 * Remove the specified directory entry from the hash. The entry to remove
632 * is defined by the name in `dirp', which must exist at the specified
633 * `offset' within the directory.
634 */
635void
636ufsdirhash_remove(struct inode *ip, struct direct *dirp, doff_t offset)
637{
638	struct dirhash *dh;
639	int slot;
640
641	if ((dh = ip->i_dirhash) == NULL)
642		return;
643	mtx_lock(&dh->dh_mtx);
644	if (dh->dh_hash == NULL) {
645		mtx_unlock(&dh->dh_mtx);
646		ufsdirhash_free(ip);
647		return;
648	}
649
650	KASSERT(offset < dh->dh_dirblks * DIRBLKSIZ,
651	    ("ufsdirhash_remove: bad offset"));
652	/* Find the entry */
653	slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, offset);
654
655	/* Remove the hash entry. */
656	ufsdirhash_delslot(dh, slot);
657
658	/* Update the per-block summary info. */
659	ufsdirhash_adjfree(dh, offset, DIRSIZ(0, dirp));
660	mtx_unlock(&dh->dh_mtx);
661}
662
663/*
664 * Change the offset associated with a directory entry in the hash. Used
665 * when compacting directory blocks.
666 */
667void
668ufsdirhash_move(struct inode *ip, struct direct *dirp, doff_t oldoff,
669    doff_t newoff)
670{
671	struct dirhash *dh;
672	int slot;
673
674	if ((dh = ip->i_dirhash) == NULL)
675		return;
676	mtx_lock(&dh->dh_mtx);
677	if (dh->dh_hash == NULL) {
678		mtx_unlock(&dh->dh_mtx);
679		ufsdirhash_free(ip);
680		return;
681	}
682
683	KASSERT(oldoff < dh->dh_dirblks * DIRBLKSIZ &&
684	    newoff < dh->dh_dirblks * DIRBLKSIZ,
685	    ("ufsdirhash_move: bad offset"));
686	/* Find the entry, and update the offset. */
687	slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, oldoff);
688	DH_ENTRY(dh, slot) = newoff;
689	mtx_unlock(&dh->dh_mtx);
690}
691
692/*
693 * Inform dirhash that the directory has grown by one block that
694 * begins at offset (i.e. the new length is offset + DIRBLKSIZ).
695 */
696void
697ufsdirhash_newblk(struct inode *ip, doff_t offset)
698{
699	struct dirhash *dh;
700	int block;
701
702	if ((dh = ip->i_dirhash) == NULL)
703		return;
704	mtx_lock(&dh->dh_mtx);
705	if (dh->dh_hash == NULL) {
706		mtx_unlock(&dh->dh_mtx);
707		ufsdirhash_free(ip);
708		return;
709	}
710
711	KASSERT(offset == dh->dh_dirblks * DIRBLKSIZ,
712	    ("ufsdirhash_newblk: bad offset"));
713	block = offset / DIRBLKSIZ;
714	if (block >= dh->dh_nblk) {
715		/* Out of space; must rebuild. */
716		mtx_unlock(&dh->dh_mtx);
717		ufsdirhash_free(ip);
718		return;
719	}
720	dh->dh_dirblks = block + 1;
721
722	/* Account for the new free block. */
723	dh->dh_blkfree[block] = DIRBLKSIZ / DIRALIGN;
724	if (dh->dh_firstfree[DH_NFSTATS] == -1)
725		dh->dh_firstfree[DH_NFSTATS] = block;
726	mtx_unlock(&dh->dh_mtx);
727}
728
729/*
730 * Inform dirhash that the directory is being truncated.
731 */
732void
733ufsdirhash_dirtrunc(struct inode *ip, doff_t offset)
734{
735	struct dirhash *dh;
736	int block, i;
737
738	if ((dh = ip->i_dirhash) == NULL)
739		return;
740	mtx_lock(&dh->dh_mtx);
741	if (dh->dh_hash == NULL) {
742		mtx_unlock(&dh->dh_mtx);
743		ufsdirhash_free(ip);
744		return;
745	}
746
747	KASSERT(offset <= dh->dh_dirblks * DIRBLKSIZ,
748	    ("ufsdirhash_dirtrunc: bad offset"));
749	block = howmany(offset, DIRBLKSIZ);
750	/*
751	 * If the directory shrinks to less than 1/8 of dh_nblk blocks
752	 * (about 20% of its original size due to the 50% extra added in
753	 * ufsdirhash_build) then free it, and let the caller rebuild
754	 * if necessary.
755	 */
756	if (block < dh->dh_nblk / 8 && dh->dh_narrays > 1) {
757		mtx_unlock(&dh->dh_mtx);
758		ufsdirhash_free(ip);
759		return;
760	}
761
762	/*
763	 * Remove any `first free' information pertaining to the
764	 * truncated blocks. All blocks we're removing should be
765	 * completely unused.
766	 */
767	if (dh->dh_firstfree[DH_NFSTATS] >= block)
768		dh->dh_firstfree[DH_NFSTATS] = -1;
769	for (i = block; i < dh->dh_dirblks; i++)
770		if (dh->dh_blkfree[i] != DIRBLKSIZ / DIRALIGN)
771			panic("ufsdirhash_dirtrunc: blocks in use");
772	for (i = 0; i < DH_NFSTATS; i++)
773		if (dh->dh_firstfree[i] >= block)
774			panic("ufsdirhash_dirtrunc: first free corrupt");
775	dh->dh_dirblks = block;
776	mtx_unlock(&dh->dh_mtx);
777}
778
779/*
780 * Debugging function to check that the dirhash information about
781 * a directory block matches its actual contents. Panics if a mismatch
782 * is detected.
783 *
784 * On entry, `buf' should point to the start of an in-core
785 * DIRBLKSIZ-sized directory block, and `offset' should contain the
786 * offset from the start of the directory of that block.
787 */
788void
789ufsdirhash_checkblock(struct inode *ip, char *buf, doff_t offset)
790{
791	struct dirhash *dh;
792	struct direct *dp;
793	int block, ffslot, i, nfree;
794
795	if (!ufs_dirhashcheck)
796		return;
797	if ((dh = ip->i_dirhash) == NULL)
798		return;
799	mtx_lock(&dh->dh_mtx);
800	if (dh->dh_hash == NULL) {
801		mtx_unlock(&dh->dh_mtx);
802		ufsdirhash_free(ip);
803		return;
804	}
805
806	block = offset / DIRBLKSIZ;
807	if ((offset & (DIRBLKSIZ - 1)) != 0 || block >= dh->dh_dirblks)
808		panic("ufsdirhash_checkblock: bad offset");
809
810	nfree = 0;
811	for (i = 0; i < DIRBLKSIZ; i += dp->d_reclen) {
812		dp = (struct direct *)(buf + i);
813		if (dp->d_reclen == 0 || i + dp->d_reclen > DIRBLKSIZ)
814			panic("ufsdirhash_checkblock: bad dir");
815
816		if (dp->d_ino == 0) {
817			if (i != 0)
818				panic("ufsdirhash_checkblock: bad dir inode");
819			nfree += dp->d_reclen;
820			continue;
821		}
822
823		/* Check that the entry	exists (will panic if it doesn't). */
824		ufsdirhash_findslot(dh, dp->d_name, dp->d_namlen, offset + i);
825
826		nfree += dp->d_reclen - DIRSIZ(0, dp);
827	}
828	if (i != DIRBLKSIZ)
829		panic("ufsdirhash_checkblock: bad dir end");
830
831	if (dh->dh_blkfree[block] * DIRALIGN != nfree)
832		panic("ufsdirhash_checkblock: bad free count");
833
834	ffslot = nfree / DIRALIGN;
835	if (ffslot > DH_NFSTATS)
836		ffslot = DH_NFSTATS;
837	for (i = 0; i <= DH_NFSTATS; i++)
838		if (dh->dh_firstfree[i] == block && i != ffslot)
839			panic("ufsdirhash_checkblock: bad first-free");
840	mtx_unlock(&dh->dh_mtx);
841}
842
843/*
844 * Hash the specified filename into a dirhash slot.
845 */
846static int
847ufsdirhash_hash(struct dirhash *dh, char *name, int namelen)
848{
849	return (fnv_32_buf(name, namelen, FNV1_32_INIT) % dh->dh_hlen);
850}
851
852/*
853 * Adjust the number of free bytes in the block containing `offset'
854 * by the value specified by `diff'.
855 *
856 * The caller must ensure we have exclusive access to `dh'; normally
857 * that means that dh_mtx should be held, but this is also called
858 * from ufsdirhash_build() where exclusive access can be assumed.
859 */
860static void
861ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff)
862{
863	int block, i, nfidx, ofidx;
864
865	/* Update the per-block summary info. */
866	block = offset / DIRBLKSIZ;
867	KASSERT(block < dh->dh_nblk && block < dh->dh_dirblks,
868	     ("dirhash bad offset"));
869	ofidx = dh->dh_blkfree[block];
870	if (ofidx > DH_NFSTATS)
871		ofidx = DH_NFSTATS;
872	dh->dh_blkfree[block] = (int)dh->dh_blkfree[block] + (diff / DIRALIGN);
873	nfidx = dh->dh_blkfree[block];
874	if (nfidx > DH_NFSTATS)
875		nfidx = DH_NFSTATS;
876
877	/* Update the `first free' list if necessary. */
878	if (ofidx != nfidx) {
879		/* If removing, scan forward for the next block. */
880		if (dh->dh_firstfree[ofidx] == block) {
881			for (i = block + 1; i < dh->dh_dirblks; i++)
882				if (dh->dh_blkfree[i] == ofidx)
883					break;
884			dh->dh_firstfree[ofidx] = (i < dh->dh_dirblks) ? i : -1;
885		}
886
887		/* Make this the new `first free' if necessary */
888		if (dh->dh_firstfree[nfidx] > block ||
889		    dh->dh_firstfree[nfidx] == -1)
890			dh->dh_firstfree[nfidx] = block;
891	}
892}
893
894/*
895 * Find the specified name which should have the specified offset.
896 * Returns a slot number, and panics on failure.
897 *
898 * `dh' must be locked on entry and remains so on return.
899 */
900static int
901ufsdirhash_findslot(struct dirhash *dh, char *name, int namelen, doff_t offset)
902{
903	int slot;
904
905	mtx_assert(&dh->dh_mtx, MA_OWNED);
906
907	/* Find the entry. */
908	KASSERT(dh->dh_hused < dh->dh_hlen, ("dirhash find full"));
909	slot = ufsdirhash_hash(dh, name, namelen);
910	while (DH_ENTRY(dh, slot) != offset &&
911	    DH_ENTRY(dh, slot) != DIRHASH_EMPTY)
912		slot = WRAPINCR(slot, dh->dh_hlen);
913	if (DH_ENTRY(dh, slot) != offset)
914		panic("ufsdirhash_findslot: '%.*s' not found", namelen, name);
915
916	return (slot);
917}
918
919/*
920 * Remove the entry corresponding to the specified slot from the hash array.
921 *
922 * `dh' must be locked on entry and remains so on return.
923 */
924static void
925ufsdirhash_delslot(struct dirhash *dh, int slot)
926{
927	int i;
928
929	mtx_assert(&dh->dh_mtx, MA_OWNED);
930
931	/* Mark the entry as deleted. */
932	DH_ENTRY(dh, slot) = DIRHASH_DEL;
933
934	/* If this is the end of a chain of DIRHASH_DEL slots, remove them. */
935	for (i = slot; DH_ENTRY(dh, i) == DIRHASH_DEL; )
936		i = WRAPINCR(i, dh->dh_hlen);
937	if (DH_ENTRY(dh, i) == DIRHASH_EMPTY) {
938		for (i = slot; DH_ENTRY(dh, i) == DIRHASH_DEL; ) {
939			DH_ENTRY(dh, i) = DIRHASH_EMPTY;
940			dh->dh_hused--;
941			i = WRAPINCR(i, dh->dh_hlen);
942		}
943		KASSERT(dh->dh_hused >= 0, ("ufsdirhash_delslot neg hlen"));
944	}
945}
946
947/*
948 * Given a directory entry and its offset, find the offset of the
949 * previous entry in the same DIRBLKSIZ-sized block. Returns an
950 * offset, or -1 if there is no previous entry in the block or some
951 * other problem occurred.
952 */
953static doff_t
954ufsdirhash_getprev(struct direct *dirp, doff_t offset)
955{
956	struct direct *dp;
957	char *blkbuf;
958	doff_t blkoff, prevoff;
959	int entrypos, i;
960
961	blkoff = offset & ~(DIRBLKSIZ - 1);	/* offset of start of block */
962	entrypos = offset & (DIRBLKSIZ - 1);	/* entry relative to block */
963	blkbuf = (char *)dirp - entrypos;
964	prevoff = blkoff;
965
966	/* If `offset' is the start of a block, there is no previous entry. */
967	if (entrypos == 0)
968		return (-1);
969
970	/* Scan from the start of the block until we get to the entry. */
971	for (i = 0; i < entrypos; i += dp->d_reclen) {
972		dp = (struct direct *)(blkbuf + i);
973		if (dp->d_reclen == 0 || i + dp->d_reclen > entrypos)
974			return (-1);	/* Corrupted directory. */
975		prevoff = blkoff + i;
976	}
977	return (prevoff);
978}
979
980/*
981 * Try to free up `wanted' bytes by stealing memory from existing
982 * dirhashes. Returns zero with ufsdirhash_mtx locked if successful.
983 */
984static int
985ufsdirhash_recycle(int wanted)
986{
987	struct dirhash *dh;
988	doff_t **hash;
989	u_int8_t *blkfree;
990	int i, mem, narrays;
991
992	mtx_lock(&ufsdirhash_mtx);
993	while (wanted + ufs_dirhashmem > ufs_dirhashmaxmem) {
994		/* Find a dirhash, and lock it. */
995		if ((dh = TAILQ_FIRST(&ufsdirhash_list)) == NULL) {
996			mtx_unlock(&ufsdirhash_mtx);
997			return (-1);
998		}
999		mtx_lock(&dh->dh_mtx);
1000		KASSERT(dh->dh_hash != NULL, ("dirhash: NULL hash on list"));
1001
1002		/* Decrement the score; only recycle if it becomes zero. */
1003		if (--dh->dh_score > 0) {
1004			mtx_unlock(&dh->dh_mtx);
1005			mtx_unlock(&ufsdirhash_mtx);
1006			return (-1);
1007		}
1008
1009		/* Remove it from the list and detach its memory. */
1010		TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
1011		dh->dh_onlist = 0;
1012		hash = dh->dh_hash;
1013		dh->dh_hash = NULL;
1014		blkfree = dh->dh_blkfree;
1015		dh->dh_blkfree = NULL;
1016		narrays = dh->dh_narrays;
1017		mem = narrays * sizeof(*dh->dh_hash) +
1018		    narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
1019		    dh->dh_nblk * sizeof(*dh->dh_blkfree);
1020
1021		/* Unlock everything, free the detached memory. */
1022		mtx_unlock(&dh->dh_mtx);
1023		mtx_unlock(&ufsdirhash_mtx);
1024		for (i = 0; i < narrays; i++)
1025			zfree(ufsdirhash_zone, hash[i]);
1026		FREE(hash, M_DIRHASH);
1027		FREE(blkfree, M_DIRHASH);
1028
1029		/* Account for the returned memory, and repeat if necessary. */
1030		mtx_lock(&ufsdirhash_mtx);
1031		ufs_dirhashmem -= mem;
1032	}
1033	/* Success; return with ufsdirhash_mtx locked. */
1034	return (0);
1035}
1036
1037
1038static void
1039ufsdirhash_init()
1040{
1041	ufsdirhash_zone = zinit("DIRHASH", DH_NBLKOFF * sizeof(daddr_t), 0,
1042	    0, 1);
1043	mtx_init(&ufsdirhash_mtx, "dirhash list", MTX_DEF);
1044	TAILQ_INIT(&ufsdirhash_list);
1045}
1046SYSINIT(ufsdirhash, SI_SUB_PSEUDO, SI_ORDER_ANY, ufsdirhash_init, NULL)
1047
1048
1049#endif /* UFS_DIRHASH */
1050