ufs_dirhash.c revision 92807
183364Sdfr/* 283364Sdfr * Copyright (c) 2001 Ian Dowse. All rights reserved. 383364Sdfr * 483364Sdfr * Redistribution and use in source and binary forms, with or without 583364Sdfr * modification, are permitted provided that the following conditions 683364Sdfr * are met: 783364Sdfr * 1. Redistributions of source code must retain the above copyright 883364Sdfr * notice, this list of conditions and the following disclaimer. 983364Sdfr * 2. Redistributions in binary form must reproduce the above copyright 1083364Sdfr * notice, this list of conditions and the following disclaimer in the 1183364Sdfr * documentation and/or other materials provided with the distribution. 1283364Sdfr * 1383364Sdfr * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 1483364Sdfr * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 1583364Sdfr * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 1683364Sdfr * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 1783364Sdfr * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 1883364Sdfr * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 1983364Sdfr * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 2083364Sdfr * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 2183364Sdfr * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 2283364Sdfr * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 2383364Sdfr * SUCH DAMAGE. 2483364Sdfr * 2583364Sdfr * $FreeBSD: head/sys/ufs/ufs/ufs_dirhash.c 92807 2002-03-20 17:58:02Z dwmalone $ 2683364Sdfr */ 2783364Sdfr/* 2883364Sdfr * This implements a hash-based lookup scheme for UFS directories. 2983364Sdfr */ 3083364Sdfr 3183364Sdfr#include "opt_ufs.h" 3283364Sdfr 3383364Sdfr#ifdef UFS_DIRHASH 3483364Sdfr 3583364Sdfr#include <sys/param.h> 3683364Sdfr#include <sys/systm.h> 3783364Sdfr#include <sys/kernel.h> 3883364Sdfr#include <sys/lock.h> 3983364Sdfr#include <sys/mutex.h> 4083364Sdfr#include <sys/malloc.h> 4183364Sdfr#include <sys/fnv_hash.h> 4283364Sdfr#include <sys/proc.h> 43#include <sys/bio.h> 44#include <sys/buf.h> 45#include <sys/vnode.h> 46#include <sys/mount.h> 47#include <sys/sysctl.h> 48#include <vm/uma.h> 49 50#include <ufs/ufs/quota.h> 51#include <ufs/ufs/inode.h> 52#include <ufs/ufs/dir.h> 53#include <ufs/ufs/dirhash.h> 54#include <ufs/ufs/extattr.h> 55#include <ufs/ufs/ufsmount.h> 56#include <ufs/ufs/ufs_extern.h> 57 58#define WRAPINCR(val, limit) (((val) + 1 == (limit)) ? 0 : ((val) + 1)) 59#define WRAPDECR(val, limit) (((val) == 0) ? ((limit) - 1) : ((val) - 1)) 60#define OFSFMT(vp) ((vp)->v_mount->mnt_maxsymlinklen <= 0) 61#define BLKFREE2IDX(n) ((n) > DH_NFSTATS ? DH_NFSTATS : (n)) 62 63static MALLOC_DEFINE(M_DIRHASH, "UFS dirhash", "UFS directory hash tables"); 64 65SYSCTL_NODE(_vfs, OID_AUTO, ufs, CTLFLAG_RD, 0, "UFS filesystem"); 66 67static int ufs_mindirhashsize = DIRBLKSIZ * 5; 68SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_minsize, CTLFLAG_RW, 69 &ufs_mindirhashsize, 70 0, "minimum directory size in bytes for which to use hashed lookup"); 71static int ufs_dirhashmaxmem = 2 * 1024 * 1024; 72SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_maxmem, CTLFLAG_RW, &ufs_dirhashmaxmem, 73 0, "maximum allowed dirhash memory usage"); 74static int ufs_dirhashmem; 75SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_mem, CTLFLAG_RD, &ufs_dirhashmem, 76 0, "current dirhash memory usage"); 77static int ufs_dirhashcheck = 0; 78SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_docheck, CTLFLAG_RW, &ufs_dirhashcheck, 79 0, "enable extra sanity tests"); 80 81 82static int ufsdirhash_hash(struct dirhash *dh, char *name, int namelen); 83static void ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff); 84static void ufsdirhash_delslot(struct dirhash *dh, int slot); 85static int ufsdirhash_findslot(struct dirhash *dh, char *name, int namelen, 86 doff_t offset); 87static doff_t ufsdirhash_getprev(struct direct *dp, doff_t offset); 88static void ufsdirhash_init(void); 89static int ufsdirhash_recycle(int wanted); 90 91static uma_zone_t ufsdirhash_zone; 92 93/* Dirhash list; recently-used entries are near the tail. */ 94static TAILQ_HEAD(, dirhash) ufsdirhash_list; 95 96/* Protects: ufsdirhash_list, `dh_list' field, ufs_dirhashmem. */ 97static struct mtx ufsdirhash_mtx; 98 99/* 100 * Locking order: 101 * ufsdirhash_mtx 102 * dh_mtx 103 * 104 * The dh_mtx mutex should be aquired either via the inode lock, or via 105 * ufsdirhash_mtx. Only the owner of the inode may free the associated 106 * dirhash, but anything can steal its memory and set dh_hash to NULL. 107 */ 108 109/* 110 * Attempt to build up a hash table for the directory contents in 111 * inode 'ip'. Returns 0 on success, or -1 of the operation failed. 112 */ 113int 114ufsdirhash_build(struct inode *ip) 115{ 116 struct dirhash *dh; 117 struct buf *bp = NULL; 118 struct direct *ep; 119 struct vnode *vp; 120 doff_t bmask, pos; 121 int dirblocks, i, j, memreqd, nblocks, narrays, nslots, slot; 122 123 /* Check if we can/should use dirhash. */ 124 if (ip->i_dirhash == NULL) { 125 if (ip->i_size < ufs_mindirhashsize || OFSFMT(ip->i_vnode)) 126 return (-1); 127 } else { 128 /* Hash exists, but sysctls could have changed. */ 129 if (ip->i_size < ufs_mindirhashsize || 130 ufs_dirhashmem > ufs_dirhashmaxmem) { 131 ufsdirhash_free(ip); 132 return (-1); 133 } 134 /* Check if hash exists and is intact (note: unlocked read). */ 135 if (ip->i_dirhash->dh_hash != NULL) 136 return (0); 137 /* Free the old, recycled hash and build a new one. */ 138 ufsdirhash_free(ip); 139 } 140 141 /* Don't hash removed directories. */ 142 if (ip->i_effnlink == 0) 143 return (-1); 144 145 vp = ip->i_vnode; 146 /* Allocate 50% more entries than this dir size could ever need. */ 147 KASSERT(ip->i_size >= DIRBLKSIZ, ("ufsdirhash_build size")); 148 nslots = ip->i_size / DIRECTSIZ(1); 149 nslots = (nslots * 3 + 1) / 2; 150 narrays = howmany(nslots, DH_NBLKOFF); 151 nslots = narrays * DH_NBLKOFF; 152 dirblocks = howmany(ip->i_size, DIRBLKSIZ); 153 nblocks = (dirblocks * 3 + 1) / 2; 154 155 memreqd = sizeof(*dh) + narrays * sizeof(*dh->dh_hash) + 156 narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) + 157 nblocks * sizeof(*dh->dh_blkfree); 158 mtx_lock(&ufsdirhash_mtx); 159 if (memreqd + ufs_dirhashmem > ufs_dirhashmaxmem) { 160 mtx_unlock(&ufsdirhash_mtx); 161 if (memreqd > ufs_dirhashmaxmem / 2) 162 return (-1); 163 164 /* Try to free some space. */ 165 if (ufsdirhash_recycle(memreqd) != 0) 166 return (-1); 167 /* Enough was freed, and ufsdirhash_mtx has been locked. */ 168 } 169 ufs_dirhashmem += memreqd; 170 mtx_unlock(&ufsdirhash_mtx); 171 172 /* 173 * Use non-blocking mallocs so that we will revert to a linear 174 * lookup on failure rather than potentially blocking forever. 175 */ 176 MALLOC(dh, struct dirhash *, sizeof *dh, M_DIRHASH, M_NOWAIT | M_ZERO); 177 if (dh == NULL) 178 return (-1); 179 MALLOC(dh->dh_hash, doff_t **, narrays * sizeof(dh->dh_hash[0]), 180 M_DIRHASH, M_NOWAIT | M_ZERO); 181 MALLOC(dh->dh_blkfree, u_int8_t *, nblocks * sizeof(dh->dh_blkfree[0]), 182 M_DIRHASH, M_NOWAIT); 183 if (dh->dh_hash == NULL || dh->dh_blkfree == NULL) 184 goto fail; 185 for (i = 0; i < narrays; i++) { 186 if ((dh->dh_hash[i] = uma_zalloc(ufsdirhash_zone, 187 M_WAITOK)) == NULL) 188 goto fail; 189 for (j = 0; j < DH_NBLKOFF; j++) 190 dh->dh_hash[i][j] = DIRHASH_EMPTY; 191 } 192 193 /* Initialise the hash table and block statistics. */ 194 mtx_init(&dh->dh_mtx, "dirhash", MTX_DEF); 195 dh->dh_narrays = narrays; 196 dh->dh_hlen = nslots; 197 dh->dh_nblk = nblocks; 198 dh->dh_dirblks = dirblocks; 199 for (i = 0; i < dirblocks; i++) 200 dh->dh_blkfree[i] = DIRBLKSIZ / DIRALIGN; 201 for (i = 0; i < DH_NFSTATS; i++) 202 dh->dh_firstfree[i] = -1; 203 dh->dh_firstfree[DH_NFSTATS] = 0; 204 dh->dh_seqopt = 0; 205 dh->dh_seqoff = 0; 206 dh->dh_score = DH_SCOREINIT; 207 ip->i_dirhash = dh; 208 209 bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1; 210 pos = 0; 211 while (pos < ip->i_size) { 212 /* If necessary, get the next directory block. */ 213 if ((pos & bmask) == 0) { 214 if (bp != NULL) 215 brelse(bp); 216 if (UFS_BLKATOFF(vp, (off_t)pos, NULL, &bp) != 0) 217 goto fail; 218 } 219 220 /* Add this entry to the hash. */ 221 ep = (struct direct *)((char *)bp->b_data + (pos & bmask)); 222 if (ep->d_reclen == 0 || ep->d_reclen > 223 DIRBLKSIZ - (pos & (DIRBLKSIZ - 1))) { 224 /* Corrupted directory. */ 225 brelse(bp); 226 goto fail; 227 } 228 if (ep->d_ino != 0) { 229 /* Add the entry (simplified ufsdirhash_add). */ 230 slot = ufsdirhash_hash(dh, ep->d_name, ep->d_namlen); 231 while (DH_ENTRY(dh, slot) != DIRHASH_EMPTY) 232 slot = WRAPINCR(slot, dh->dh_hlen); 233 dh->dh_hused++; 234 DH_ENTRY(dh, slot) = pos; 235 ufsdirhash_adjfree(dh, pos, -DIRSIZ(0, ep)); 236 } 237 pos += ep->d_reclen; 238 } 239 240 if (bp != NULL) 241 brelse(bp); 242 mtx_lock(&ufsdirhash_mtx); 243 TAILQ_INSERT_TAIL(&ufsdirhash_list, dh, dh_list); 244 dh->dh_onlist = 1; 245 mtx_unlock(&ufsdirhash_mtx); 246 return (0); 247 248fail: 249 if (dh->dh_hash != NULL) { 250 for (i = 0; i < narrays; i++) 251 if (dh->dh_hash[i] != NULL) 252 uma_zfree(ufsdirhash_zone, dh->dh_hash[i]); 253 FREE(dh->dh_hash, M_DIRHASH); 254 } 255 if (dh->dh_blkfree != NULL) 256 FREE(dh->dh_blkfree, M_DIRHASH); 257 FREE(dh, M_DIRHASH); 258 ip->i_dirhash = NULL; 259 mtx_lock(&ufsdirhash_mtx); 260 ufs_dirhashmem -= memreqd; 261 mtx_unlock(&ufsdirhash_mtx); 262 return (-1); 263} 264 265/* 266 * Free any hash table associated with inode 'ip'. 267 */ 268void 269ufsdirhash_free(struct inode *ip) 270{ 271 struct dirhash *dh; 272 int i, mem; 273 274 if ((dh = ip->i_dirhash) == NULL) 275 return; 276 mtx_lock(&ufsdirhash_mtx); 277 mtx_lock(&dh->dh_mtx); 278 if (dh->dh_onlist) 279 TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list); 280 mtx_unlock(&dh->dh_mtx); 281 mtx_unlock(&ufsdirhash_mtx); 282 283 /* The dirhash pointed to by 'dh' is exclusively ours now. */ 284 285 mem = sizeof(*dh); 286 if (dh->dh_hash != NULL) { 287 for (i = 0; i < dh->dh_narrays; i++) 288 uma_zfree(ufsdirhash_zone, dh->dh_hash[i]); 289 FREE(dh->dh_hash, M_DIRHASH); 290 FREE(dh->dh_blkfree, M_DIRHASH); 291 mem += dh->dh_narrays * sizeof(*dh->dh_hash) + 292 dh->dh_narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) + 293 dh->dh_nblk * sizeof(*dh->dh_blkfree); 294 } 295 mtx_destroy(&dh->dh_mtx); 296 FREE(dh, M_DIRHASH); 297 ip->i_dirhash = NULL; 298 299 mtx_lock(&ufsdirhash_mtx); 300 ufs_dirhashmem -= mem; 301 mtx_unlock(&ufsdirhash_mtx); 302} 303 304/* 305 * Find the offset of the specified name within the given inode. 306 * Returns 0 on success, ENOENT if the entry does not exist, or 307 * EJUSTRETURN if the caller should revert to a linear search. 308 * 309 * If successful, the directory offset is stored in *offp, and a 310 * pointer to a struct buf containing the entry is stored in *bpp. If 311 * prevoffp is non-NULL, the offset of the previous entry within 312 * the DIRBLKSIZ-sized block is stored in *prevoffp (if the entry 313 * is the first in a block, the start of the block is used). 314 */ 315int 316ufsdirhash_lookup(struct inode *ip, char *name, int namelen, doff_t *offp, 317 struct buf **bpp, doff_t *prevoffp) 318{ 319 struct dirhash *dh, *dh_next; 320 struct direct *dp; 321 struct vnode *vp; 322 struct buf *bp; 323 doff_t blkoff, bmask, offset, prevoff; 324 int i, slot; 325 326 if ((dh = ip->i_dirhash) == NULL) 327 return (EJUSTRETURN); 328 /* 329 * Move this dirhash towards the end of the list if it has a 330 * score higher than the next entry, and aquire the dh_mtx. 331 * Optimise the case where it's already the last by performing 332 * an unlocked read of the TAILQ_NEXT pointer. 333 * 334 * In both cases, end up holding just dh_mtx. 335 */ 336 if (TAILQ_NEXT(dh, dh_list) != NULL) { 337 mtx_lock(&ufsdirhash_mtx); 338 mtx_lock(&dh->dh_mtx); 339 /* 340 * If the new score will be greater than that of the next 341 * entry, then move this entry past it. With both mutexes 342 * held, dh_next won't go away, but its dh_score could 343 * change; that's not important since it is just a hint. 344 */ 345 if (dh->dh_hash != NULL && 346 (dh_next = TAILQ_NEXT(dh, dh_list)) != NULL && 347 dh->dh_score >= dh_next->dh_score) { 348 KASSERT(dh->dh_onlist, ("dirhash: not on list")); 349 TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list); 350 TAILQ_INSERT_AFTER(&ufsdirhash_list, dh_next, dh, 351 dh_list); 352 } 353 mtx_unlock(&ufsdirhash_mtx); 354 } else { 355 /* Already the last, though that could change as we wait. */ 356 mtx_lock(&dh->dh_mtx); 357 } 358 if (dh->dh_hash == NULL) { 359 mtx_unlock(&dh->dh_mtx); 360 ufsdirhash_free(ip); 361 return (EJUSTRETURN); 362 } 363 364 /* Update the score. */ 365 if (dh->dh_score < DH_SCOREMAX) 366 dh->dh_score++; 367 368 vp = ip->i_vnode; 369 bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1; 370 blkoff = -1; 371 bp = NULL; 372restart: 373 slot = ufsdirhash_hash(dh, name, namelen); 374 375 if (dh->dh_seqopt) { 376 /* 377 * Sequential access optimisation. dh_seqoff contains the 378 * offset of the directory entry immediately following 379 * the last entry that was looked up. Check if this offset 380 * appears in the hash chain for the name we are looking for. 381 */ 382 for (i = slot; (offset = DH_ENTRY(dh, i)) != DIRHASH_EMPTY; 383 i = WRAPINCR(i, dh->dh_hlen)) 384 if (offset == dh->dh_seqoff) 385 break; 386 if (offset == dh->dh_seqoff) { 387 /* 388 * We found an entry with the expected offset. This 389 * is probably the entry we want, but if not, the 390 * code below will turn off seqoff and retry. 391 */ 392 slot = i; 393 } else 394 dh->dh_seqopt = 0; 395 } 396 397 for (; (offset = DH_ENTRY(dh, slot)) != DIRHASH_EMPTY; 398 slot = WRAPINCR(slot, dh->dh_hlen)) { 399 if (offset == DIRHASH_DEL) 400 continue; 401 mtx_unlock(&dh->dh_mtx); 402 403 if (offset < 0 || offset >= ip->i_size) 404 panic("ufsdirhash_lookup: bad offset in hash array"); 405 if ((offset & ~bmask) != blkoff) { 406 if (bp != NULL) 407 brelse(bp); 408 blkoff = offset & ~bmask; 409 if (UFS_BLKATOFF(vp, (off_t)blkoff, NULL, &bp) != 0) 410 return (EJUSTRETURN); 411 } 412 dp = (struct direct *)(bp->b_data + (offset & bmask)); 413 if (dp->d_reclen == 0 || dp->d_reclen > 414 DIRBLKSIZ - (offset & (DIRBLKSIZ - 1))) { 415 /* Corrupted directory. */ 416 brelse(bp); 417 return (EJUSTRETURN); 418 } 419 if (dp->d_namlen == namelen && 420 bcmp(dp->d_name, name, namelen) == 0) { 421 /* Found. Get the prev offset if needed. */ 422 if (prevoffp != NULL) { 423 if (offset & (DIRBLKSIZ - 1)) { 424 prevoff = ufsdirhash_getprev(dp, 425 offset); 426 if (prevoff == -1) { 427 brelse(bp); 428 return (EJUSTRETURN); 429 } 430 } else 431 prevoff = offset; 432 *prevoffp = prevoff; 433 } 434 435 /* Check for sequential access, and update offset. */ 436 if (dh->dh_seqopt == 0 && dh->dh_seqoff == offset) 437 dh->dh_seqopt = 1; 438 dh->dh_seqoff = offset + DIRSIZ(0, dp); 439 440 *bpp = bp; 441 *offp = offset; 442 return (0); 443 } 444 445 mtx_lock(&dh->dh_mtx); 446 if (dh->dh_hash == NULL) { 447 mtx_unlock(&dh->dh_mtx); 448 if (bp != NULL) 449 brelse(bp); 450 ufsdirhash_free(ip); 451 return (EJUSTRETURN); 452 } 453 /* 454 * When the name doesn't match in the seqopt case, go back 455 * and search normally. 456 */ 457 if (dh->dh_seqopt) { 458 dh->dh_seqopt = 0; 459 goto restart; 460 } 461 } 462 mtx_unlock(&dh->dh_mtx); 463 if (bp != NULL) 464 brelse(bp); 465 return (ENOENT); 466} 467 468/* 469 * Find a directory block with room for 'slotneeded' bytes. Returns 470 * the offset of the directory entry that begins the free space. 471 * This will either be the offset of an existing entry that has free 472 * space at the end, or the offset of an entry with d_ino == 0 at 473 * the start of a DIRBLKSIZ block. 474 * 475 * To use the space, the caller may need to compact existing entries in 476 * the directory. The total number of bytes in all of the entries involved 477 * in the compaction is stored in *slotsize. In other words, all of 478 * the entries that must be compacted are exactly contained in the 479 * region beginning at the returned offset and spanning *slotsize bytes. 480 * 481 * Returns -1 if no space was found, indicating that the directory 482 * must be extended. 483 */ 484doff_t 485ufsdirhash_findfree(struct inode *ip, int slotneeded, int *slotsize) 486{ 487 struct direct *dp; 488 struct dirhash *dh; 489 struct buf *bp; 490 doff_t pos, slotstart; 491 int dirblock, error, freebytes, i; 492 493 if ((dh = ip->i_dirhash) == NULL) 494 return (-1); 495 mtx_lock(&dh->dh_mtx); 496 if (dh->dh_hash == NULL) { 497 mtx_unlock(&dh->dh_mtx); 498 ufsdirhash_free(ip); 499 return (-1); 500 } 501 502 /* Find a directory block with the desired free space. */ 503 dirblock = -1; 504 for (i = howmany(slotneeded, DIRALIGN); i <= DH_NFSTATS; i++) 505 if ((dirblock = dh->dh_firstfree[i]) != -1) 506 break; 507 if (dirblock == -1) { 508 mtx_unlock(&dh->dh_mtx); 509 return (-1); 510 } 511 512 KASSERT(dirblock < dh->dh_nblk && 513 dh->dh_blkfree[dirblock] >= howmany(slotneeded, DIRALIGN), 514 ("ufsdirhash_findfree: bad stats")); 515 mtx_unlock(&dh->dh_mtx); 516 pos = dirblock * DIRBLKSIZ; 517 error = UFS_BLKATOFF(ip->i_vnode, (off_t)pos, (char **)&dp, &bp); 518 if (error) 519 return (-1); 520 521 /* Find the first entry with free space. */ 522 for (i = 0; i < DIRBLKSIZ; ) { 523 if (dp->d_reclen == 0) { 524 brelse(bp); 525 return (-1); 526 } 527 if (dp->d_ino == 0 || dp->d_reclen > DIRSIZ(0, dp)) 528 break; 529 i += dp->d_reclen; 530 dp = (struct direct *)((char *)dp + dp->d_reclen); 531 } 532 if (i > DIRBLKSIZ) { 533 brelse(bp); 534 return (-1); 535 } 536 slotstart = pos + i; 537 538 /* Find the range of entries needed to get enough space */ 539 freebytes = 0; 540 while (i < DIRBLKSIZ && freebytes < slotneeded) { 541 freebytes += dp->d_reclen; 542 if (dp->d_ino != 0) 543 freebytes -= DIRSIZ(0, dp); 544 if (dp->d_reclen == 0) { 545 brelse(bp); 546 return (-1); 547 } 548 i += dp->d_reclen; 549 dp = (struct direct *)((char *)dp + dp->d_reclen); 550 } 551 if (i > DIRBLKSIZ) { 552 brelse(bp); 553 return (-1); 554 } 555 if (freebytes < slotneeded) 556 panic("ufsdirhash_findfree: free mismatch"); 557 brelse(bp); 558 *slotsize = pos + i - slotstart; 559 return (slotstart); 560} 561 562/* 563 * Return the start of the unused space at the end of a directory, or 564 * -1 if there are no trailing unused blocks. 565 */ 566doff_t 567ufsdirhash_enduseful(struct inode *ip) 568{ 569 570 struct dirhash *dh; 571 int i; 572 573 if ((dh = ip->i_dirhash) == NULL) 574 return (-1); 575 mtx_lock(&dh->dh_mtx); 576 if (dh->dh_hash == NULL) { 577 mtx_unlock(&dh->dh_mtx); 578 ufsdirhash_free(ip); 579 return (-1); 580 } 581 582 if (dh->dh_blkfree[dh->dh_dirblks - 1] != DIRBLKSIZ / DIRALIGN) { 583 mtx_unlock(&dh->dh_mtx); 584 return (-1); 585 } 586 587 for (i = dh->dh_dirblks - 1; i >= 0; i--) 588 if (dh->dh_blkfree[i] != DIRBLKSIZ / DIRALIGN) 589 break; 590 mtx_unlock(&dh->dh_mtx); 591 return ((doff_t)(i + 1) * DIRBLKSIZ); 592} 593 594/* 595 * Insert information into the hash about a new directory entry. dirp 596 * points to a struct direct containing the entry, and offset specifies 597 * the offset of this entry. 598 */ 599void 600ufsdirhash_add(struct inode *ip, struct direct *dirp, doff_t offset) 601{ 602 struct dirhash *dh; 603 int slot; 604 605 if ((dh = ip->i_dirhash) == NULL) 606 return; 607 mtx_lock(&dh->dh_mtx); 608 if (dh->dh_hash == NULL) { 609 mtx_unlock(&dh->dh_mtx); 610 ufsdirhash_free(ip); 611 return; 612 } 613 614 KASSERT(offset < dh->dh_dirblks * DIRBLKSIZ, 615 ("ufsdirhash_add: bad offset")); 616 /* 617 * Normal hash usage is < 66%. If the usage gets too high then 618 * remove the hash entirely and let it be rebuilt later. 619 */ 620 if (dh->dh_hused >= (dh->dh_hlen * 3) / 4) { 621 mtx_unlock(&dh->dh_mtx); 622 ufsdirhash_free(ip); 623 return; 624 } 625 626 /* Find a free hash slot (empty or deleted), and add the entry. */ 627 slot = ufsdirhash_hash(dh, dirp->d_name, dirp->d_namlen); 628 while (DH_ENTRY(dh, slot) >= 0) 629 slot = WRAPINCR(slot, dh->dh_hlen); 630 if (DH_ENTRY(dh, slot) == DIRHASH_EMPTY) 631 dh->dh_hused++; 632 DH_ENTRY(dh, slot) = offset; 633 634 /* Update the per-block summary info. */ 635 ufsdirhash_adjfree(dh, offset, -DIRSIZ(0, dirp)); 636 mtx_unlock(&dh->dh_mtx); 637} 638 639/* 640 * Remove the specified directory entry from the hash. The entry to remove 641 * is defined by the name in `dirp', which must exist at the specified 642 * `offset' within the directory. 643 */ 644void 645ufsdirhash_remove(struct inode *ip, struct direct *dirp, doff_t offset) 646{ 647 struct dirhash *dh; 648 int slot; 649 650 if ((dh = ip->i_dirhash) == NULL) 651 return; 652 mtx_lock(&dh->dh_mtx); 653 if (dh->dh_hash == NULL) { 654 mtx_unlock(&dh->dh_mtx); 655 ufsdirhash_free(ip); 656 return; 657 } 658 659 KASSERT(offset < dh->dh_dirblks * DIRBLKSIZ, 660 ("ufsdirhash_remove: bad offset")); 661 /* Find the entry */ 662 slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, offset); 663 664 /* Remove the hash entry. */ 665 ufsdirhash_delslot(dh, slot); 666 667 /* Update the per-block summary info. */ 668 ufsdirhash_adjfree(dh, offset, DIRSIZ(0, dirp)); 669 mtx_unlock(&dh->dh_mtx); 670} 671 672/* 673 * Change the offset associated with a directory entry in the hash. Used 674 * when compacting directory blocks. 675 */ 676void 677ufsdirhash_move(struct inode *ip, struct direct *dirp, doff_t oldoff, 678 doff_t newoff) 679{ 680 struct dirhash *dh; 681 int slot; 682 683 if ((dh = ip->i_dirhash) == NULL) 684 return; 685 mtx_lock(&dh->dh_mtx); 686 if (dh->dh_hash == NULL) { 687 mtx_unlock(&dh->dh_mtx); 688 ufsdirhash_free(ip); 689 return; 690 } 691 692 KASSERT(oldoff < dh->dh_dirblks * DIRBLKSIZ && 693 newoff < dh->dh_dirblks * DIRBLKSIZ, 694 ("ufsdirhash_move: bad offset")); 695 /* Find the entry, and update the offset. */ 696 slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, oldoff); 697 DH_ENTRY(dh, slot) = newoff; 698 mtx_unlock(&dh->dh_mtx); 699} 700 701/* 702 * Inform dirhash that the directory has grown by one block that 703 * begins at offset (i.e. the new length is offset + DIRBLKSIZ). 704 */ 705void 706ufsdirhash_newblk(struct inode *ip, doff_t offset) 707{ 708 struct dirhash *dh; 709 int block; 710 711 if ((dh = ip->i_dirhash) == NULL) 712 return; 713 mtx_lock(&dh->dh_mtx); 714 if (dh->dh_hash == NULL) { 715 mtx_unlock(&dh->dh_mtx); 716 ufsdirhash_free(ip); 717 return; 718 } 719 720 KASSERT(offset == dh->dh_dirblks * DIRBLKSIZ, 721 ("ufsdirhash_newblk: bad offset")); 722 block = offset / DIRBLKSIZ; 723 if (block >= dh->dh_nblk) { 724 /* Out of space; must rebuild. */ 725 mtx_unlock(&dh->dh_mtx); 726 ufsdirhash_free(ip); 727 return; 728 } 729 dh->dh_dirblks = block + 1; 730 731 /* Account for the new free block. */ 732 dh->dh_blkfree[block] = DIRBLKSIZ / DIRALIGN; 733 if (dh->dh_firstfree[DH_NFSTATS] == -1) 734 dh->dh_firstfree[DH_NFSTATS] = block; 735 mtx_unlock(&dh->dh_mtx); 736} 737 738/* 739 * Inform dirhash that the directory is being truncated. 740 */ 741void 742ufsdirhash_dirtrunc(struct inode *ip, doff_t offset) 743{ 744 struct dirhash *dh; 745 int block, i; 746 747 if ((dh = ip->i_dirhash) == NULL) 748 return; 749 mtx_lock(&dh->dh_mtx); 750 if (dh->dh_hash == NULL) { 751 mtx_unlock(&dh->dh_mtx); 752 ufsdirhash_free(ip); 753 return; 754 } 755 756 KASSERT(offset <= dh->dh_dirblks * DIRBLKSIZ, 757 ("ufsdirhash_dirtrunc: bad offset")); 758 block = howmany(offset, DIRBLKSIZ); 759 /* 760 * If the directory shrinks to less than 1/8 of dh_nblk blocks 761 * (about 20% of its original size due to the 50% extra added in 762 * ufsdirhash_build) then free it, and let the caller rebuild 763 * if necessary. 764 */ 765 if (block < dh->dh_nblk / 8 && dh->dh_narrays > 1) { 766 mtx_unlock(&dh->dh_mtx); 767 ufsdirhash_free(ip); 768 return; 769 } 770 771 /* 772 * Remove any `first free' information pertaining to the 773 * truncated blocks. All blocks we're removing should be 774 * completely unused. 775 */ 776 if (dh->dh_firstfree[DH_NFSTATS] >= block) 777 dh->dh_firstfree[DH_NFSTATS] = -1; 778 for (i = block; i < dh->dh_dirblks; i++) 779 if (dh->dh_blkfree[i] != DIRBLKSIZ / DIRALIGN) 780 panic("ufsdirhash_dirtrunc: blocks in use"); 781 for (i = 0; i < DH_NFSTATS; i++) 782 if (dh->dh_firstfree[i] >= block) 783 panic("ufsdirhash_dirtrunc: first free corrupt"); 784 dh->dh_dirblks = block; 785 mtx_unlock(&dh->dh_mtx); 786} 787 788/* 789 * Debugging function to check that the dirhash information about 790 * a directory block matches its actual contents. Panics if a mismatch 791 * is detected. 792 * 793 * On entry, `buf' should point to the start of an in-core 794 * DIRBLKSIZ-sized directory block, and `offset' should contain the 795 * offset from the start of the directory of that block. 796 */ 797void 798ufsdirhash_checkblock(struct inode *ip, char *buf, doff_t offset) 799{ 800 struct dirhash *dh; 801 struct direct *dp; 802 int block, ffslot, i, nfree; 803 804 if (!ufs_dirhashcheck) 805 return; 806 if ((dh = ip->i_dirhash) == NULL) 807 return; 808 mtx_lock(&dh->dh_mtx); 809 if (dh->dh_hash == NULL) { 810 mtx_unlock(&dh->dh_mtx); 811 ufsdirhash_free(ip); 812 return; 813 } 814 815 block = offset / DIRBLKSIZ; 816 if ((offset & (DIRBLKSIZ - 1)) != 0 || block >= dh->dh_dirblks) 817 panic("ufsdirhash_checkblock: bad offset"); 818 819 nfree = 0; 820 for (i = 0; i < DIRBLKSIZ; i += dp->d_reclen) { 821 dp = (struct direct *)(buf + i); 822 if (dp->d_reclen == 0 || i + dp->d_reclen > DIRBLKSIZ) 823 panic("ufsdirhash_checkblock: bad dir"); 824 825 if (dp->d_ino == 0) { 826#if 0 827 /* 828 * XXX entries with d_ino == 0 should only occur 829 * at the start of a DIRBLKSIZ block. However the 830 * ufs code is tolerant of such entries at other 831 * offsets, and fsck does not fix them. 832 */ 833 if (i != 0) 834 panic("ufsdirhash_checkblock: bad dir inode"); 835#endif 836 nfree += dp->d_reclen; 837 continue; 838 } 839 840 /* Check that the entry exists (will panic if it doesn't). */ 841 ufsdirhash_findslot(dh, dp->d_name, dp->d_namlen, offset + i); 842 843 nfree += dp->d_reclen - DIRSIZ(0, dp); 844 } 845 if (i != DIRBLKSIZ) 846 panic("ufsdirhash_checkblock: bad dir end"); 847 848 if (dh->dh_blkfree[block] * DIRALIGN != nfree) 849 panic("ufsdirhash_checkblock: bad free count"); 850 851 ffslot = BLKFREE2IDX(nfree / DIRALIGN); 852 for (i = 0; i <= DH_NFSTATS; i++) 853 if (dh->dh_firstfree[i] == block && i != ffslot) 854 panic("ufsdirhash_checkblock: bad first-free"); 855 if (dh->dh_firstfree[ffslot] == -1) 856 panic("ufsdirhash_checkblock: missing first-free entry"); 857 mtx_unlock(&dh->dh_mtx); 858} 859 860/* 861 * Hash the specified filename into a dirhash slot. 862 */ 863static int 864ufsdirhash_hash(struct dirhash *dh, char *name, int namelen) 865{ 866 u_int32_t hash; 867 868 /* 869 * We hash the name and then some ofther bit of data which is 870 * invarient over the dirhash's lifetime. Otherwise names 871 * differing only in the last byte are placed close to one 872 * another in the table, which is bad for linear probing. 873 */ 874 hash = fnv_32_buf(name, namelen, FNV1_32_INIT); 875 hash = fnv_32_buf(dh, sizeof(dh), hash); 876 return (hash % dh->dh_hlen); 877} 878 879/* 880 * Adjust the number of free bytes in the block containing `offset' 881 * by the value specified by `diff'. 882 * 883 * The caller must ensure we have exclusive access to `dh'; normally 884 * that means that dh_mtx should be held, but this is also called 885 * from ufsdirhash_build() where exclusive access can be assumed. 886 */ 887static void 888ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff) 889{ 890 int block, i, nfidx, ofidx; 891 892 /* Update the per-block summary info. */ 893 block = offset / DIRBLKSIZ; 894 KASSERT(block < dh->dh_nblk && block < dh->dh_dirblks, 895 ("dirhash bad offset")); 896 ofidx = BLKFREE2IDX(dh->dh_blkfree[block]); 897 dh->dh_blkfree[block] = (int)dh->dh_blkfree[block] + (diff / DIRALIGN); 898 nfidx = BLKFREE2IDX(dh->dh_blkfree[block]); 899 900 /* Update the `first free' list if necessary. */ 901 if (ofidx != nfidx) { 902 /* If removing, scan forward for the next block. */ 903 if (dh->dh_firstfree[ofidx] == block) { 904 for (i = block + 1; i < dh->dh_dirblks; i++) 905 if (BLKFREE2IDX(dh->dh_blkfree[i]) == ofidx) 906 break; 907 dh->dh_firstfree[ofidx] = (i < dh->dh_dirblks) ? i : -1; 908 } 909 910 /* Make this the new `first free' if necessary */ 911 if (dh->dh_firstfree[nfidx] > block || 912 dh->dh_firstfree[nfidx] == -1) 913 dh->dh_firstfree[nfidx] = block; 914 } 915} 916 917/* 918 * Find the specified name which should have the specified offset. 919 * Returns a slot number, and panics on failure. 920 * 921 * `dh' must be locked on entry and remains so on return. 922 */ 923static int 924ufsdirhash_findslot(struct dirhash *dh, char *name, int namelen, doff_t offset) 925{ 926 int slot; 927 928 mtx_assert(&dh->dh_mtx, MA_OWNED); 929 930 /* Find the entry. */ 931 KASSERT(dh->dh_hused < dh->dh_hlen, ("dirhash find full")); 932 slot = ufsdirhash_hash(dh, name, namelen); 933 while (DH_ENTRY(dh, slot) != offset && 934 DH_ENTRY(dh, slot) != DIRHASH_EMPTY) 935 slot = WRAPINCR(slot, dh->dh_hlen); 936 if (DH_ENTRY(dh, slot) != offset) 937 panic("ufsdirhash_findslot: '%.*s' not found", namelen, name); 938 939 return (slot); 940} 941 942/* 943 * Remove the entry corresponding to the specified slot from the hash array. 944 * 945 * `dh' must be locked on entry and remains so on return. 946 */ 947static void 948ufsdirhash_delslot(struct dirhash *dh, int slot) 949{ 950 int i; 951 952 mtx_assert(&dh->dh_mtx, MA_OWNED); 953 954 /* Mark the entry as deleted. */ 955 DH_ENTRY(dh, slot) = DIRHASH_DEL; 956 957 /* If this is the end of a chain of DIRHASH_DEL slots, remove them. */ 958 for (i = slot; DH_ENTRY(dh, i) == DIRHASH_DEL; ) 959 i = WRAPINCR(i, dh->dh_hlen); 960 if (DH_ENTRY(dh, i) == DIRHASH_EMPTY) { 961 i = WRAPDECR(i, dh->dh_hlen); 962 while (DH_ENTRY(dh, i) == DIRHASH_DEL) { 963 DH_ENTRY(dh, i) = DIRHASH_EMPTY; 964 dh->dh_hused--; 965 i = WRAPDECR(i, dh->dh_hlen); 966 } 967 KASSERT(dh->dh_hused >= 0, ("ufsdirhash_delslot neg hlen")); 968 } 969} 970 971/* 972 * Given a directory entry and its offset, find the offset of the 973 * previous entry in the same DIRBLKSIZ-sized block. Returns an 974 * offset, or -1 if there is no previous entry in the block or some 975 * other problem occurred. 976 */ 977static doff_t 978ufsdirhash_getprev(struct direct *dirp, doff_t offset) 979{ 980 struct direct *dp; 981 char *blkbuf; 982 doff_t blkoff, prevoff; 983 int entrypos, i; 984 985 blkoff = offset & ~(DIRBLKSIZ - 1); /* offset of start of block */ 986 entrypos = offset & (DIRBLKSIZ - 1); /* entry relative to block */ 987 blkbuf = (char *)dirp - entrypos; 988 prevoff = blkoff; 989 990 /* If `offset' is the start of a block, there is no previous entry. */ 991 if (entrypos == 0) 992 return (-1); 993 994 /* Scan from the start of the block until we get to the entry. */ 995 for (i = 0; i < entrypos; i += dp->d_reclen) { 996 dp = (struct direct *)(blkbuf + i); 997 if (dp->d_reclen == 0 || i + dp->d_reclen > entrypos) 998 return (-1); /* Corrupted directory. */ 999 prevoff = blkoff + i; 1000 } 1001 return (prevoff); 1002} 1003 1004/* 1005 * Try to free up `wanted' bytes by stealing memory from existing 1006 * dirhashes. Returns zero with ufsdirhash_mtx locked if successful. 1007 */ 1008static int 1009ufsdirhash_recycle(int wanted) 1010{ 1011 struct dirhash *dh; 1012 doff_t **hash; 1013 u_int8_t *blkfree; 1014 int i, mem, narrays; 1015 1016 mtx_lock(&ufsdirhash_mtx); 1017 while (wanted + ufs_dirhashmem > ufs_dirhashmaxmem) { 1018 /* Find a dirhash, and lock it. */ 1019 if ((dh = TAILQ_FIRST(&ufsdirhash_list)) == NULL) { 1020 mtx_unlock(&ufsdirhash_mtx); 1021 return (-1); 1022 } 1023 mtx_lock(&dh->dh_mtx); 1024 KASSERT(dh->dh_hash != NULL, ("dirhash: NULL hash on list")); 1025 1026 /* Decrement the score; only recycle if it becomes zero. */ 1027 if (--dh->dh_score > 0) { 1028 mtx_unlock(&dh->dh_mtx); 1029 mtx_unlock(&ufsdirhash_mtx); 1030 return (-1); 1031 } 1032 1033 /* Remove it from the list and detach its memory. */ 1034 TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list); 1035 dh->dh_onlist = 0; 1036 hash = dh->dh_hash; 1037 dh->dh_hash = NULL; 1038 blkfree = dh->dh_blkfree; 1039 dh->dh_blkfree = NULL; 1040 narrays = dh->dh_narrays; 1041 mem = narrays * sizeof(*dh->dh_hash) + 1042 narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) + 1043 dh->dh_nblk * sizeof(*dh->dh_blkfree); 1044 1045 /* Unlock everything, free the detached memory. */ 1046 mtx_unlock(&dh->dh_mtx); 1047 mtx_unlock(&ufsdirhash_mtx); 1048 for (i = 0; i < narrays; i++) 1049 uma_zfree(ufsdirhash_zone, hash[i]); 1050 FREE(hash, M_DIRHASH); 1051 FREE(blkfree, M_DIRHASH); 1052 1053 /* Account for the returned memory, and repeat if necessary. */ 1054 mtx_lock(&ufsdirhash_mtx); 1055 ufs_dirhashmem -= mem; 1056 } 1057 /* Success; return with ufsdirhash_mtx locked. */ 1058 return (0); 1059} 1060 1061 1062static void 1063ufsdirhash_init() 1064{ 1065 ufsdirhash_zone = uma_zcreate("DIRHASH", DH_NBLKOFF * sizeof(daddr_t), 1066 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); 1067 mtx_init(&ufsdirhash_mtx, "dirhash list", MTX_DEF); 1068 TAILQ_INIT(&ufsdirhash_list); 1069} 1070SYSINIT(ufsdirhash, SI_SUB_PSEUDO, SI_ORDER_ANY, ufsdirhash_init, NULL) 1071 1072 1073#endif /* UFS_DIRHASH */ 1074