ufs_dirhash.c revision 82364
1/*
2 * Copyright (c) 2001 Ian Dowse.  All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 *    notice, this list of conditions and the following disclaimer in the
11 *    documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
24 *
25 * $FreeBSD: head/sys/ufs/ufs/ufs_dirhash.c 82364 2001-08-26 20:47:19Z iedowse $
26 */
27/*
28 * This implements a hash-based lookup scheme for UFS directories.
29 */
30
31#include "opt_ufs.h"
32
33#ifdef UFS_DIRHASH
34
35#include <sys/param.h>
36#include <sys/systm.h>
37#include <sys/kernel.h>
38#include <sys/mutex.h>
39#include <sys/malloc.h>
40#include <sys/fnv_hash.h>
41#include <sys/proc.h>
42#include <sys/bio.h>
43#include <sys/buf.h>
44#include <sys/vnode.h>
45#include <sys/mount.h>
46#include <sys/sysctl.h>
47#include <vm/vm_zone.h>
48
49#include <ufs/ufs/quota.h>
50#include <ufs/ufs/inode.h>
51#include <ufs/ufs/dir.h>
52#include <ufs/ufs/dirhash.h>
53#include <ufs/ufs/extattr.h>
54#include <ufs/ufs/ufsmount.h>
55#include <ufs/ufs/ufs_extern.h>
56
57#define WRAPINCR(val, limit)	(((val) + 1 == (limit)) ? 0 : ((val) + 1))
58#define OFSFMT(vp)		((vp)->v_mount->mnt_maxsymlinklen <= 0)
59
60static MALLOC_DEFINE(M_DIRHASH, "UFS dirhash", "UFS directory hash tables");
61
62SYSCTL_NODE(_vfs, OID_AUTO, ufs, CTLFLAG_RD, 0, "UFS filesystem");
63
64static int ufs_mindirhashsize = DIRBLKSIZ * 5;
65SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_minsize, CTLFLAG_RW,
66    &ufs_mindirhashsize,
67    0, "minimum directory size in bytes for which to use hashed lookup");
68static int ufs_dirhashmaxmem = 2 * 1024 * 1024;
69SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_maxmem, CTLFLAG_RW, &ufs_dirhashmaxmem,
70    0, "maximum allowed dirhash memory usage");
71static int ufs_dirhashmem;
72SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_mem, CTLFLAG_RD, &ufs_dirhashmem,
73    0, "current dirhash memory usage");
74static int ufs_dirhashcheck = 1;
75SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_docheck, CTLFLAG_RW, &ufs_dirhashcheck,
76    0, "enable extra sanity tests");
77
78
79static int ufsdirhash_hash(struct dirhash *dh, char *name, int namelen);
80static void ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff);
81static void ufsdirhash_delslot(struct dirhash *dh, int slot);
82static int ufsdirhash_findslot(struct dirhash *dh, char *name, int namelen,
83	   doff_t offset);
84static doff_t ufsdirhash_getprev(struct direct *dp, doff_t offset);
85static void ufsdirhash_init(void);
86static int ufsdirhash_recycle(int wanted);
87
88static vm_zone_t	ufsdirhash_zone;
89
90/* Dirhash list; recently-used entries are near the tail. */
91static TAILQ_HEAD(, dirhash) ufsdirhash_list;
92
93/* Protects: ufsdirhash_list, `dh_list' field, ufs_dirhashmem. */
94static struct mtx	ufsdirhash_mtx;
95
96/*
97 * Locking order:
98 *	ufsdirhash_mtx
99 *	dh_mtx
100 *
101 * The dh_mtx mutex should be aquired either via the inode lock, or via
102 * ufsdirhash_mtx. Only the owner of the inode may free the associated
103 * dirhash, but anything can steal its memory and set dh_hash to NULL.
104 */
105
106/*
107 * Attempt to build up a hash table for the directory contents in
108 * inode 'ip'. Returns 0 on success, or -1 of the operation failed.
109 */
110int
111ufsdirhash_build(struct inode *ip)
112{
113	struct dirhash *dh;
114	struct buf *bp = NULL;
115	struct direct *ep;
116	struct vnode *vp;
117	doff_t bmask, pos;
118	int dirblocks, i, j, memreqd, nblocks, narrays, nslots, slot;
119
120	/* Check if we can/should use dirhash. */
121	if (ip->i_dirhash == NULL) {
122		if (ip->i_size < ufs_mindirhashsize || OFSFMT(ip->i_vnode))
123			return (-1);
124	} else {
125		/* Hash exists, but sysctls could have changed. */
126		if (ip->i_size < ufs_mindirhashsize ||
127		    ufs_dirhashmem > ufs_dirhashmaxmem) {
128			ufsdirhash_free(ip);
129			return (-1);
130		}
131		/* Check if hash exists and is intact (note: unlocked read). */
132		if (ip->i_dirhash->dh_hash != NULL)
133			return (0);
134		/* Free the old, recycled hash and build a new one. */
135		ufsdirhash_free(ip);
136	}
137
138	/* Don't hash removed directories. */
139	if (ip->i_effnlink == 0)
140		return (-1);
141
142	vp = ip->i_vnode;
143	/* Allocate 50% more entries than this dir size could ever need. */
144	KASSERT(ip->i_size >= DIRBLKSIZ, ("ufsdirhash_build size"));
145	nslots = ip->i_size / DIRECTSIZ(1);
146	nslots = (nslots * 3 + 1) / 2;
147	narrays = howmany(nslots, DH_NBLKOFF);
148	nslots = narrays * DH_NBLKOFF;
149	dirblocks = howmany(ip->i_size, DIRBLKSIZ);
150	nblocks = (dirblocks * 3 + 1) / 2;
151
152	memreqd = sizeof(*dh) + narrays * sizeof(*dh->dh_hash) +
153	    narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
154	    nblocks * sizeof(*dh->dh_blkfree);
155	mtx_lock(&ufsdirhash_mtx);
156	if (memreqd + ufs_dirhashmem > ufs_dirhashmaxmem) {
157		mtx_unlock(&ufsdirhash_mtx);
158		if (memreqd > ufs_dirhashmaxmem / 2)
159			return (-1);
160
161		/* Try to free some space. */
162		if (ufsdirhash_recycle(memreqd) != 0)
163			return (-1);
164		/* Enough was freed, and ufsdirhash_mtx has been locked. */
165	}
166	ufs_dirhashmem += memreqd;
167	mtx_unlock(&ufsdirhash_mtx);
168
169	/*
170	 * Use non-blocking mallocs so that we will revert to a linear
171	 * lookup on failure rather than potentially blocking forever.
172	 */
173	MALLOC(dh, struct dirhash *, sizeof *dh, M_DIRHASH, M_NOWAIT | M_ZERO);
174	if (dh == NULL)
175		return (-1);
176	MALLOC(dh->dh_hash, doff_t **, narrays * sizeof(dh->dh_hash[0]),
177	    M_DIRHASH, M_NOWAIT | M_ZERO);
178	MALLOC(dh->dh_blkfree, u_int8_t *, nblocks * sizeof(dh->dh_blkfree[0]),
179	    M_DIRHASH, M_NOWAIT);
180	if (dh->dh_hash == NULL || dh->dh_blkfree == NULL)
181		goto fail;
182	for (i = 0; i < narrays; i++) {
183		if ((dh->dh_hash[i] = zalloc(ufsdirhash_zone)) == NULL)
184			goto fail;
185		for (j = 0; j < DH_NBLKOFF; j++)
186			dh->dh_hash[i][j] = DIRHASH_EMPTY;
187	}
188
189	/* Initialise the hash table and block statistics. */
190	mtx_init(&dh->dh_mtx, "dirhash", MTX_DEF);
191	dh->dh_narrays = narrays;
192	dh->dh_hlen = nslots;
193	dh->dh_nblk = nblocks;
194	dh->dh_dirblks = dirblocks;
195	for (i = 0; i < dirblocks; i++)
196		dh->dh_blkfree[i] = DIRBLKSIZ / DIRALIGN;
197	for (i = 0; i < DH_NFSTATS; i++)
198		dh->dh_firstfree[i] = -1;
199	dh->dh_firstfree[DH_NFSTATS] = 0;
200	dh->dh_seqopt = 0;
201	dh->dh_seqoff = 0;
202	dh->dh_score = DH_SCOREINIT;
203	ip->i_dirhash = dh;
204
205	bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
206	pos = 0;
207	while (pos < ip->i_size) {
208		/* If necessary, get the next directory block. */
209		if ((pos & bmask) == 0) {
210			if (bp != NULL)
211				brelse(bp);
212			if (UFS_BLKATOFF(vp, (off_t)pos, NULL, &bp) != 0)
213				goto fail;
214		}
215
216		/* Add this entry to the hash. */
217		ep = (struct direct *)((char *)bp->b_data + (pos & bmask));
218		if (ep->d_reclen == 0 || ep->d_reclen >
219		    DIRBLKSIZ - (pos & (DIRBLKSIZ - 1))) {
220			/* Corrupted directory. */
221			brelse(bp);
222			goto fail;
223		}
224		if (ep->d_ino != 0) {
225			/* Add the entry (simplified ufsdirhash_add). */
226			slot = ufsdirhash_hash(dh, ep->d_name, ep->d_namlen);
227			while (DH_ENTRY(dh, slot) != DIRHASH_EMPTY)
228				slot = WRAPINCR(slot, dh->dh_hlen);
229			dh->dh_hused++;
230			DH_ENTRY(dh, slot) = pos;
231			ufsdirhash_adjfree(dh, pos, -DIRSIZ(0, ep));
232		}
233		pos += ep->d_reclen;
234	}
235
236	if (bp != NULL)
237		brelse(bp);
238	mtx_lock(&ufsdirhash_mtx);
239	TAILQ_INSERT_TAIL(&ufsdirhash_list, dh, dh_list);
240	dh->dh_onlist = 1;
241	mtx_unlock(&ufsdirhash_mtx);
242	return (0);
243
244fail:
245	if (dh->dh_hash != NULL) {
246		for (i = 0; i < narrays; i++)
247			if (dh->dh_hash[i] != NULL)
248				zfree(ufsdirhash_zone, dh->dh_hash[i]);
249		FREE(dh->dh_hash, M_DIRHASH);
250	}
251	if (dh->dh_blkfree != NULL)
252		FREE(dh->dh_blkfree, M_DIRHASH);
253	FREE(dh, M_DIRHASH);
254	ip->i_dirhash = NULL;
255	mtx_lock(&ufsdirhash_mtx);
256	ufs_dirhashmem -= memreqd;
257	mtx_unlock(&ufsdirhash_mtx);
258	return (-1);
259}
260
261/*
262 * Free any hash table associated with inode 'ip'.
263 */
264void
265ufsdirhash_free(struct inode *ip)
266{
267	struct dirhash *dh;
268	int i, mem;
269
270	if ((dh = ip->i_dirhash) == NULL)
271		return;
272	mtx_lock(&ufsdirhash_mtx);
273	mtx_lock(&dh->dh_mtx);
274	if (dh->dh_onlist)
275		TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
276	mtx_unlock(&dh->dh_mtx);
277	mtx_unlock(&ufsdirhash_mtx);
278
279	/* The dirhash pointed to by 'dh' is exclusively ours now. */
280
281	mem = sizeof(*dh);
282	if (dh->dh_hash != NULL) {
283		for (i = 0; i < dh->dh_narrays; i++)
284			zfree(ufsdirhash_zone, dh->dh_hash[i]);
285		FREE(dh->dh_hash, M_DIRHASH);
286		FREE(dh->dh_blkfree, M_DIRHASH);
287		mem += dh->dh_narrays * sizeof(*dh->dh_hash) +
288		    dh->dh_narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
289		    dh->dh_nblk * sizeof(*dh->dh_blkfree);
290	}
291	mtx_destroy(&dh->dh_mtx);
292	FREE(dh, M_DIRHASH);
293	ip->i_dirhash = NULL;
294
295	mtx_lock(&ufsdirhash_mtx);
296	ufs_dirhashmem -= mem;
297	mtx_unlock(&ufsdirhash_mtx);
298}
299
300/*
301 * Find the offset of the specified name within the given inode.
302 * Returns 0 on success, ENOENT if the entry does not exist, or
303 * EJUSTRETURN if the caller should revert to a linear search.
304 *
305 * If successful, the directory offset is stored in *offp, and a
306 * pointer to a struct buf containing the entry is stored in *bpp. If
307 * prevoffp is non-NULL, the offset of the previous entry within
308 * the DIRBLKSIZ-sized block is stored in *prevoffp (if the entry
309 * is the first in a block, the start of the block is used).
310 */
311int
312ufsdirhash_lookup(struct inode *ip, char *name, int namelen, doff_t *offp,
313    struct buf **bpp, doff_t *prevoffp)
314{
315	struct dirhash *dh, *dh_next;
316	struct direct *dp;
317	struct vnode *vp;
318	struct buf *bp;
319	doff_t blkoff, bmask, offset, prevoff;
320	int i, slot;
321
322	if ((dh = ip->i_dirhash) == NULL)
323		return (EJUSTRETURN);
324	/*
325	 * Move this dirhash towards the end of the list if it has a
326	 * score higher than the next entry, and aquire the dh_mtx.
327	 * Optimise the case where it's already the last by performing
328	 * an unlocked read of the TAILQ_NEXT pointer.
329	 *
330	 * In both cases, end up holding just dh_mtx.
331	 */
332	if (TAILQ_NEXT(dh, dh_list) != NULL) {
333		mtx_lock(&ufsdirhash_mtx);
334		mtx_lock(&dh->dh_mtx);
335		/*
336		 * If the new score will be greater than that of the next
337		 * entry, then move this entry past it. With both mutexes
338		 * held, dh_next won't go away, but its dh_score could
339		 * change; that's not important since it is just a hint.
340		 */
341		if (dh->dh_hash != NULL &&
342		    (dh_next = TAILQ_NEXT(dh, dh_list)) != NULL &&
343		    dh->dh_score >= dh_next->dh_score) {
344			KASSERT(dh->dh_onlist, ("dirhash: not on list"));
345			TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
346			TAILQ_INSERT_AFTER(&ufsdirhash_list, dh_next, dh,
347			    dh_list);
348		}
349		mtx_unlock(&ufsdirhash_mtx);
350	} else {
351		/* Already the last, though that could change as we wait. */
352		mtx_lock(&dh->dh_mtx);
353	}
354	if (dh->dh_hash == NULL) {
355		mtx_unlock(&dh->dh_mtx);
356		ufsdirhash_free(ip);
357		return (EJUSTRETURN);
358	}
359
360	/* Update the score. */
361	if (dh->dh_score < DH_SCOREMAX)
362		dh->dh_score++;
363
364	vp = ip->i_vnode;
365	bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
366	blkoff = -1;
367	bp = NULL;
368restart:
369	slot = ufsdirhash_hash(dh, name, namelen);
370
371	if (dh->dh_seqopt) {
372		/*
373		 * Sequential access optimisation. dh_seqoff contains the
374		 * offset of the directory entry immediately following
375		 * the last entry that was looked up. Check if this offset
376		 * appears in the hash chain for the name we are looking for.
377		 */
378		for (i = slot; (offset = DH_ENTRY(dh, i)) != DIRHASH_EMPTY;
379		    i = WRAPINCR(i, dh->dh_hlen))
380			if (offset == dh->dh_seqopt)
381				break;
382		if (offset == dh->dh_seqoff) {
383			/*
384			 * We found an entry with the expected offset. This
385			 * is probably the entry we want, but if not, the
386			 * code below will turn off seqoff and retry.
387			 */
388			slot = i;
389		} else
390			dh->dh_seqopt = 0;
391	}
392
393	for (; (offset = DH_ENTRY(dh, slot)) != DIRHASH_EMPTY;
394	    slot = WRAPINCR(slot, dh->dh_hlen)) {
395		if (offset == DIRHASH_DEL)
396			continue;
397		mtx_unlock(&dh->dh_mtx);
398
399		if (offset < 0 || offset >= ip->i_size)
400			panic("ufsdirhash_lookup: bad offset in hash array");
401		if ((offset & ~bmask) != blkoff) {
402			if (bp != NULL)
403				brelse(bp);
404			blkoff = offset & ~bmask;
405			if (UFS_BLKATOFF(vp, (off_t)blkoff, NULL, &bp) != 0)
406				return (EJUSTRETURN);
407		}
408		dp = (struct direct *)(bp->b_data + (offset & bmask));
409		if (dp->d_reclen == 0 || dp->d_reclen >
410		    DIRBLKSIZ - (offset & (DIRBLKSIZ - 1))) {
411			/* Corrupted directory. */
412			brelse(bp);
413			return (EJUSTRETURN);
414		}
415		if (dp->d_namlen == namelen &&
416		    bcmp(dp->d_name, name, namelen) == 0) {
417			/* Found. Get the prev offset if needed. */
418			if (prevoffp != NULL) {
419				if (offset & (DIRBLKSIZ - 1)) {
420					prevoff = ufsdirhash_getprev(dp,
421					    offset);
422					if (prevoff == -1) {
423						brelse(bp);
424						return (EJUSTRETURN);
425					}
426				} else
427					prevoff = offset;
428				*prevoffp = prevoff;
429			}
430
431			/* Check for sequential access, and update offset. */
432			if (dh->dh_seqopt == 0 && dh->dh_seqoff == offset)
433				dh->dh_seqopt = 1;
434			dh->dh_seqoff = offset + DIRSIZ(0, dp);
435
436			*bpp = bp;
437			*offp = offset;
438			return (0);
439		}
440
441		mtx_lock(&dh->dh_mtx);
442		if (dh->dh_hash == NULL) {
443			mtx_unlock(&dh->dh_mtx);
444			if (bp != NULL)
445				brelse(bp);
446			ufsdirhash_free(ip);
447			return (EJUSTRETURN);
448		}
449		/*
450		 * When the name doesn't match in the seqopt case, go back
451		 * and search normally.
452		 */
453		if (dh->dh_seqopt) {
454			dh->dh_seqopt = 0;
455			goto restart;
456		}
457	}
458	mtx_unlock(&dh->dh_mtx);
459	if (bp != NULL)
460		brelse(bp);
461	return (ENOENT);
462}
463
464/*
465 * Find a directory block with room for 'slotneeded' bytes. Returns
466 * the offset of the directory entry that begins the free space.
467 * This will either be the offset of an existing entry that has free
468 * space at the end, or the offset of an entry with d_ino == 0 at
469 * the start of a DIRBLKSIZ block.
470 *
471 * To use the space, the caller may need to compact existing entries in
472 * the directory. The total number of bytes in all of the entries involved
473 * in the compaction is stored in *slotsize. In other words, all of
474 * the entries that must be compacted are exactly contained in the
475 * region beginning at the returned offset and spanning *slotsize bytes.
476 *
477 * Returns -1 if no space was found, indicating that the directory
478 * must be extended.
479 */
480doff_t
481ufsdirhash_findfree(struct inode *ip, int slotneeded, int *slotsize)
482{
483	struct direct *dp;
484	struct dirhash *dh;
485	struct buf *bp;
486	doff_t pos, slotstart;
487	int dirblock, error, freebytes, i;
488
489	if ((dh = ip->i_dirhash) == NULL)
490		return (-1);
491	mtx_lock(&dh->dh_mtx);
492	if (dh->dh_hash == NULL) {
493		mtx_unlock(&dh->dh_mtx);
494		ufsdirhash_free(ip);
495		return (-1);
496	}
497
498	/* Find a directory block with the desired free space. */
499	dirblock = -1;
500	for (i = howmany(slotneeded, DIRALIGN); i <= DH_NFSTATS; i++)
501		if ((dirblock = dh->dh_firstfree[i]) != -1)
502			break;
503	if (dirblock == -1) {
504		mtx_unlock(&dh->dh_mtx);
505		return (-1);
506	}
507
508	KASSERT(dirblock < dh->dh_nblk &&
509	    dh->dh_blkfree[dirblock] >= howmany(slotneeded, DIRALIGN),
510	    ("ufsdirhash_findfree: bad stats"));
511	mtx_unlock(&dh->dh_mtx);
512	pos = dirblock * DIRBLKSIZ;
513	error = UFS_BLKATOFF(ip->i_vnode, (off_t)pos, (char **)&dp, &bp);
514	if (error)
515		return (-1);
516
517	/* Find the first entry with free space. */
518	for (i = 0; i < DIRBLKSIZ; ) {
519		if (dp->d_reclen == 0) {
520			brelse(bp);
521			return (-1);
522		}
523		if (dp->d_ino == 0 || dp->d_reclen > DIRSIZ(0, dp))
524			break;
525		i += dp->d_reclen;
526		dp = (struct direct *)((char *)dp + dp->d_reclen);
527	}
528	if (i > DIRBLKSIZ) {
529		brelse(bp);
530		return (-1);
531	}
532	slotstart = pos + i;
533
534	/* Find the range of entries needed to get enough space */
535	freebytes = 0;
536	while (i < DIRBLKSIZ && freebytes < slotneeded) {
537		freebytes += dp->d_reclen;
538		if (dp->d_ino != 0)
539			freebytes -= DIRSIZ(0, dp);
540		if (dp->d_reclen == 0) {
541			brelse(bp);
542			return (-1);
543		}
544		i += dp->d_reclen;
545		dp = (struct direct *)((char *)dp + dp->d_reclen);
546	}
547	if (i > DIRBLKSIZ) {
548		brelse(bp);
549		return (-1);
550	}
551	if (freebytes < slotneeded)
552		panic("ufsdirhash_findfree: free mismatch");
553	brelse(bp);
554	*slotsize = pos + i - slotstart;
555	return (slotstart);
556}
557
558/*
559 * Return the start of the unused space at the end of a directory, or
560 * -1 if there are no trailing unused blocks.
561 */
562doff_t
563ufsdirhash_enduseful(struct inode *ip)
564{
565
566	struct dirhash *dh;
567	int i;
568
569	if ((dh = ip->i_dirhash) == NULL)
570		return (-1);
571	mtx_lock(&dh->dh_mtx);
572	if (dh->dh_hash == NULL) {
573		mtx_unlock(&dh->dh_mtx);
574		ufsdirhash_free(ip);
575		return (-1);
576	}
577
578	if (dh->dh_blkfree[dh->dh_dirblks - 1] != DIRBLKSIZ / DIRALIGN) {
579		mtx_unlock(&dh->dh_mtx);
580		return (-1);
581	}
582
583	for (i = dh->dh_dirblks - 1; i >= 0; i--)
584		if (dh->dh_blkfree[i] != DIRBLKSIZ / DIRALIGN)
585			break;
586	mtx_unlock(&dh->dh_mtx);
587	return ((doff_t)(i + 1) * DIRBLKSIZ);
588}
589
590/*
591 * Insert information into the hash about a new directory entry. dirp
592 * points to a struct direct containing the entry, and offset specifies
593 * the offset of this entry.
594 */
595void
596ufsdirhash_add(struct inode *ip, struct direct *dirp, doff_t offset)
597{
598	struct dirhash *dh;
599	int slot;
600
601	if ((dh = ip->i_dirhash) == NULL)
602		return;
603	mtx_lock(&dh->dh_mtx);
604	if (dh->dh_hash == NULL) {
605		mtx_unlock(&dh->dh_mtx);
606		ufsdirhash_free(ip);
607		return;
608	}
609
610	KASSERT(offset < dh->dh_dirblks * DIRBLKSIZ,
611	    ("ufsdirhash_add: bad offset"));
612	/*
613	 * Normal hash usage is < 66%. If the usage gets too high then
614	 * remove the hash entirely and let it be rebuilt later.
615	 */
616	if (dh->dh_hused >= (dh->dh_hlen * 3) / 4) {
617		mtx_unlock(&dh->dh_mtx);
618		ufsdirhash_free(ip);
619		return;
620	}
621
622	/* Find a free hash slot (empty or deleted), and add the entry. */
623	slot = ufsdirhash_hash(dh, dirp->d_name, dirp->d_namlen);
624	while (DH_ENTRY(dh, slot) >= 0)
625		slot = WRAPINCR(slot, dh->dh_hlen);
626	if (DH_ENTRY(dh, slot) == DIRHASH_EMPTY)
627		dh->dh_hused++;
628	DH_ENTRY(dh, slot) = offset;
629
630	/* Update the per-block summary info. */
631	ufsdirhash_adjfree(dh, offset, -DIRSIZ(0, dirp));
632	mtx_unlock(&dh->dh_mtx);
633}
634
635/*
636 * Remove the specified directory entry from the hash. The entry to remove
637 * is defined by the name in `dirp', which must exist at the specified
638 * `offset' within the directory.
639 */
640void
641ufsdirhash_remove(struct inode *ip, struct direct *dirp, doff_t offset)
642{
643	struct dirhash *dh;
644	int slot;
645
646	if ((dh = ip->i_dirhash) == NULL)
647		return;
648	mtx_lock(&dh->dh_mtx);
649	if (dh->dh_hash == NULL) {
650		mtx_unlock(&dh->dh_mtx);
651		ufsdirhash_free(ip);
652		return;
653	}
654
655	KASSERT(offset < dh->dh_dirblks * DIRBLKSIZ,
656	    ("ufsdirhash_remove: bad offset"));
657	/* Find the entry */
658	slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, offset);
659
660	/* Remove the hash entry. */
661	ufsdirhash_delslot(dh, slot);
662
663	/* Update the per-block summary info. */
664	ufsdirhash_adjfree(dh, offset, DIRSIZ(0, dirp));
665	mtx_unlock(&dh->dh_mtx);
666}
667
668/*
669 * Change the offset associated with a directory entry in the hash. Used
670 * when compacting directory blocks.
671 */
672void
673ufsdirhash_move(struct inode *ip, struct direct *dirp, doff_t oldoff,
674    doff_t newoff)
675{
676	struct dirhash *dh;
677	int slot;
678
679	if ((dh = ip->i_dirhash) == NULL)
680		return;
681	mtx_lock(&dh->dh_mtx);
682	if (dh->dh_hash == NULL) {
683		mtx_unlock(&dh->dh_mtx);
684		ufsdirhash_free(ip);
685		return;
686	}
687
688	KASSERT(oldoff < dh->dh_dirblks * DIRBLKSIZ &&
689	    newoff < dh->dh_dirblks * DIRBLKSIZ,
690	    ("ufsdirhash_move: bad offset"));
691	/* Find the entry, and update the offset. */
692	slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, oldoff);
693	DH_ENTRY(dh, slot) = newoff;
694	mtx_unlock(&dh->dh_mtx);
695}
696
697/*
698 * Inform dirhash that the directory has grown by one block that
699 * begins at offset (i.e. the new length is offset + DIRBLKSIZ).
700 */
701void
702ufsdirhash_newblk(struct inode *ip, doff_t offset)
703{
704	struct dirhash *dh;
705	int block;
706
707	if ((dh = ip->i_dirhash) == NULL)
708		return;
709	mtx_lock(&dh->dh_mtx);
710	if (dh->dh_hash == NULL) {
711		mtx_unlock(&dh->dh_mtx);
712		ufsdirhash_free(ip);
713		return;
714	}
715
716	KASSERT(offset == dh->dh_dirblks * DIRBLKSIZ,
717	    ("ufsdirhash_newblk: bad offset"));
718	block = offset / DIRBLKSIZ;
719	if (block >= dh->dh_nblk) {
720		/* Out of space; must rebuild. */
721		mtx_unlock(&dh->dh_mtx);
722		ufsdirhash_free(ip);
723		return;
724	}
725	dh->dh_dirblks = block + 1;
726
727	/* Account for the new free block. */
728	dh->dh_blkfree[block] = DIRBLKSIZ / DIRALIGN;
729	if (dh->dh_firstfree[DH_NFSTATS] == -1)
730		dh->dh_firstfree[DH_NFSTATS] = block;
731	mtx_unlock(&dh->dh_mtx);
732}
733
734/*
735 * Inform dirhash that the directory is being truncated.
736 */
737void
738ufsdirhash_dirtrunc(struct inode *ip, doff_t offset)
739{
740	struct dirhash *dh;
741	int block, i;
742
743	if ((dh = ip->i_dirhash) == NULL)
744		return;
745	mtx_lock(&dh->dh_mtx);
746	if (dh->dh_hash == NULL) {
747		mtx_unlock(&dh->dh_mtx);
748		ufsdirhash_free(ip);
749		return;
750	}
751
752	KASSERT(offset <= dh->dh_dirblks * DIRBLKSIZ,
753	    ("ufsdirhash_dirtrunc: bad offset"));
754	block = howmany(offset, DIRBLKSIZ);
755	/*
756	 * If the directory shrinks to less than 1/8 of dh_nblk blocks
757	 * (about 20% of its original size due to the 50% extra added in
758	 * ufsdirhash_build) then free it, and let the caller rebuild
759	 * if necessary.
760	 */
761	if (block < dh->dh_nblk / 8 && dh->dh_narrays > 1) {
762		mtx_unlock(&dh->dh_mtx);
763		ufsdirhash_free(ip);
764		return;
765	}
766
767	/*
768	 * Remove any `first free' information pertaining to the
769	 * truncated blocks. All blocks we're removing should be
770	 * completely unused.
771	 */
772	if (dh->dh_firstfree[DH_NFSTATS] >= block)
773		dh->dh_firstfree[DH_NFSTATS] = -1;
774	for (i = block; i < dh->dh_dirblks; i++)
775		if (dh->dh_blkfree[i] != DIRBLKSIZ / DIRALIGN)
776			panic("ufsdirhash_dirtrunc: blocks in use");
777	for (i = 0; i < DH_NFSTATS; i++)
778		if (dh->dh_firstfree[i] >= block)
779			panic("ufsdirhash_dirtrunc: first free corrupt");
780	dh->dh_dirblks = block;
781	mtx_unlock(&dh->dh_mtx);
782}
783
784/*
785 * Debugging function to check that the dirhash information about
786 * a directory block matches its actual contents. Panics if a mismatch
787 * is detected.
788 *
789 * On entry, `buf' should point to the start of an in-core
790 * DIRBLKSIZ-sized directory block, and `offset' should contain the
791 * offset from the start of the directory of that block.
792 */
793void
794ufsdirhash_checkblock(struct inode *ip, char *buf, doff_t offset)
795{
796	struct dirhash *dh;
797	struct direct *dp;
798	int block, ffslot, i, nfree;
799
800	if (!ufs_dirhashcheck)
801		return;
802	if ((dh = ip->i_dirhash) == NULL)
803		return;
804	mtx_lock(&dh->dh_mtx);
805	if (dh->dh_hash == NULL) {
806		mtx_unlock(&dh->dh_mtx);
807		ufsdirhash_free(ip);
808		return;
809	}
810
811	block = offset / DIRBLKSIZ;
812	if ((offset & (DIRBLKSIZ - 1)) != 0 || block >= dh->dh_dirblks)
813		panic("ufsdirhash_checkblock: bad offset");
814
815	nfree = 0;
816	for (i = 0; i < DIRBLKSIZ; i += dp->d_reclen) {
817		dp = (struct direct *)(buf + i);
818		if (dp->d_reclen == 0 || i + dp->d_reclen > DIRBLKSIZ)
819			panic("ufsdirhash_checkblock: bad dir");
820
821		if (dp->d_ino == 0) {
822#if 0
823			/*
824			 * XXX entries with d_ino == 0 should only occur
825			 * at the start of a DIRBLKSIZ block. However the
826			 * ufs code is tolerant of such entries at other
827			 * offsets, and fsck does not fix them.
828			 */
829			if (i != 0)
830				panic("ufsdirhash_checkblock: bad dir inode");
831#endif
832			nfree += dp->d_reclen;
833			continue;
834		}
835
836		/* Check that the entry	exists (will panic if it doesn't). */
837		ufsdirhash_findslot(dh, dp->d_name, dp->d_namlen, offset + i);
838
839		nfree += dp->d_reclen - DIRSIZ(0, dp);
840	}
841	if (i != DIRBLKSIZ)
842		panic("ufsdirhash_checkblock: bad dir end");
843
844	if (dh->dh_blkfree[block] * DIRALIGN != nfree)
845		panic("ufsdirhash_checkblock: bad free count");
846
847	ffslot = nfree / DIRALIGN;
848	if (ffslot > DH_NFSTATS)
849		ffslot = DH_NFSTATS;
850	for (i = 0; i <= DH_NFSTATS; i++)
851		if (dh->dh_firstfree[i] == block && i != ffslot)
852			panic("ufsdirhash_checkblock: bad first-free");
853	mtx_unlock(&dh->dh_mtx);
854}
855
856/*
857 * Hash the specified filename into a dirhash slot.
858 */
859static int
860ufsdirhash_hash(struct dirhash *dh, char *name, int namelen)
861{
862	return (fnv_32_buf(name, namelen, FNV1_32_INIT) % dh->dh_hlen);
863}
864
865/*
866 * Adjust the number of free bytes in the block containing `offset'
867 * by the value specified by `diff'.
868 *
869 * The caller must ensure we have exclusive access to `dh'; normally
870 * that means that dh_mtx should be held, but this is also called
871 * from ufsdirhash_build() where exclusive access can be assumed.
872 */
873static void
874ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff)
875{
876	int block, i, nfidx, ofidx;
877
878	/* Update the per-block summary info. */
879	block = offset / DIRBLKSIZ;
880	KASSERT(block < dh->dh_nblk && block < dh->dh_dirblks,
881	     ("dirhash bad offset"));
882	ofidx = dh->dh_blkfree[block];
883	if (ofidx > DH_NFSTATS)
884		ofidx = DH_NFSTATS;
885	dh->dh_blkfree[block] = (int)dh->dh_blkfree[block] + (diff / DIRALIGN);
886	nfidx = dh->dh_blkfree[block];
887	if (nfidx > DH_NFSTATS)
888		nfidx = DH_NFSTATS;
889
890	/* Update the `first free' list if necessary. */
891	if (ofidx != nfidx) {
892		/* If removing, scan forward for the next block. */
893		if (dh->dh_firstfree[ofidx] == block) {
894			for (i = block + 1; i < dh->dh_dirblks; i++)
895				if (dh->dh_blkfree[i] == ofidx)
896					break;
897			dh->dh_firstfree[ofidx] = (i < dh->dh_dirblks) ? i : -1;
898		}
899
900		/* Make this the new `first free' if necessary */
901		if (dh->dh_firstfree[nfidx] > block ||
902		    dh->dh_firstfree[nfidx] == -1)
903			dh->dh_firstfree[nfidx] = block;
904	}
905}
906
907/*
908 * Find the specified name which should have the specified offset.
909 * Returns a slot number, and panics on failure.
910 *
911 * `dh' must be locked on entry and remains so on return.
912 */
913static int
914ufsdirhash_findslot(struct dirhash *dh, char *name, int namelen, doff_t offset)
915{
916	int slot;
917
918	mtx_assert(&dh->dh_mtx, MA_OWNED);
919
920	/* Find the entry. */
921	KASSERT(dh->dh_hused < dh->dh_hlen, ("dirhash find full"));
922	slot = ufsdirhash_hash(dh, name, namelen);
923	while (DH_ENTRY(dh, slot) != offset &&
924	    DH_ENTRY(dh, slot) != DIRHASH_EMPTY)
925		slot = WRAPINCR(slot, dh->dh_hlen);
926	if (DH_ENTRY(dh, slot) != offset)
927		panic("ufsdirhash_findslot: '%.*s' not found", namelen, name);
928
929	return (slot);
930}
931
932/*
933 * Remove the entry corresponding to the specified slot from the hash array.
934 *
935 * `dh' must be locked on entry and remains so on return.
936 */
937static void
938ufsdirhash_delslot(struct dirhash *dh, int slot)
939{
940	int i;
941
942	mtx_assert(&dh->dh_mtx, MA_OWNED);
943
944	/* Mark the entry as deleted. */
945	DH_ENTRY(dh, slot) = DIRHASH_DEL;
946
947	/* If this is the end of a chain of DIRHASH_DEL slots, remove them. */
948	for (i = slot; DH_ENTRY(dh, i) == DIRHASH_DEL; )
949		i = WRAPINCR(i, dh->dh_hlen);
950	if (DH_ENTRY(dh, i) == DIRHASH_EMPTY) {
951		for (i = slot; DH_ENTRY(dh, i) == DIRHASH_DEL; ) {
952			DH_ENTRY(dh, i) = DIRHASH_EMPTY;
953			dh->dh_hused--;
954			i = WRAPINCR(i, dh->dh_hlen);
955		}
956		KASSERT(dh->dh_hused >= 0, ("ufsdirhash_delslot neg hlen"));
957	}
958}
959
960/*
961 * Given a directory entry and its offset, find the offset of the
962 * previous entry in the same DIRBLKSIZ-sized block. Returns an
963 * offset, or -1 if there is no previous entry in the block or some
964 * other problem occurred.
965 */
966static doff_t
967ufsdirhash_getprev(struct direct *dirp, doff_t offset)
968{
969	struct direct *dp;
970	char *blkbuf;
971	doff_t blkoff, prevoff;
972	int entrypos, i;
973
974	blkoff = offset & ~(DIRBLKSIZ - 1);	/* offset of start of block */
975	entrypos = offset & (DIRBLKSIZ - 1);	/* entry relative to block */
976	blkbuf = (char *)dirp - entrypos;
977	prevoff = blkoff;
978
979	/* If `offset' is the start of a block, there is no previous entry. */
980	if (entrypos == 0)
981		return (-1);
982
983	/* Scan from the start of the block until we get to the entry. */
984	for (i = 0; i < entrypos; i += dp->d_reclen) {
985		dp = (struct direct *)(blkbuf + i);
986		if (dp->d_reclen == 0 || i + dp->d_reclen > entrypos)
987			return (-1);	/* Corrupted directory. */
988		prevoff = blkoff + i;
989	}
990	return (prevoff);
991}
992
993/*
994 * Try to free up `wanted' bytes by stealing memory from existing
995 * dirhashes. Returns zero with ufsdirhash_mtx locked if successful.
996 */
997static int
998ufsdirhash_recycle(int wanted)
999{
1000	struct dirhash *dh;
1001	doff_t **hash;
1002	u_int8_t *blkfree;
1003	int i, mem, narrays;
1004
1005	mtx_lock(&ufsdirhash_mtx);
1006	while (wanted + ufs_dirhashmem > ufs_dirhashmaxmem) {
1007		/* Find a dirhash, and lock it. */
1008		if ((dh = TAILQ_FIRST(&ufsdirhash_list)) == NULL) {
1009			mtx_unlock(&ufsdirhash_mtx);
1010			return (-1);
1011		}
1012		mtx_lock(&dh->dh_mtx);
1013		KASSERT(dh->dh_hash != NULL, ("dirhash: NULL hash on list"));
1014
1015		/* Decrement the score; only recycle if it becomes zero. */
1016		if (--dh->dh_score > 0) {
1017			mtx_unlock(&dh->dh_mtx);
1018			mtx_unlock(&ufsdirhash_mtx);
1019			return (-1);
1020		}
1021
1022		/* Remove it from the list and detach its memory. */
1023		TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
1024		dh->dh_onlist = 0;
1025		hash = dh->dh_hash;
1026		dh->dh_hash = NULL;
1027		blkfree = dh->dh_blkfree;
1028		dh->dh_blkfree = NULL;
1029		narrays = dh->dh_narrays;
1030		mem = narrays * sizeof(*dh->dh_hash) +
1031		    narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
1032		    dh->dh_nblk * sizeof(*dh->dh_blkfree);
1033
1034		/* Unlock everything, free the detached memory. */
1035		mtx_unlock(&dh->dh_mtx);
1036		mtx_unlock(&ufsdirhash_mtx);
1037		for (i = 0; i < narrays; i++)
1038			zfree(ufsdirhash_zone, hash[i]);
1039		FREE(hash, M_DIRHASH);
1040		FREE(blkfree, M_DIRHASH);
1041
1042		/* Account for the returned memory, and repeat if necessary. */
1043		mtx_lock(&ufsdirhash_mtx);
1044		ufs_dirhashmem -= mem;
1045	}
1046	/* Success; return with ufsdirhash_mtx locked. */
1047	return (0);
1048}
1049
1050
1051static void
1052ufsdirhash_init()
1053{
1054	ufsdirhash_zone = zinit("DIRHASH", DH_NBLKOFF * sizeof(daddr_t), 0,
1055	    0, 1);
1056	mtx_init(&ufsdirhash_mtx, "dirhash list", MTX_DEF);
1057	TAILQ_INIT(&ufsdirhash_list);
1058}
1059SYSINIT(ufsdirhash, SI_SUB_PSEUDO, SI_ORDER_ANY, ufsdirhash_init, NULL)
1060
1061
1062#endif /* UFS_DIRHASH */
1063