/*-
 * Copyright (c) 2002 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Marshall
 * Kirk McKusick and Network Associates Laboratories, the Security
 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
 * research program
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 511541Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 521541Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 531541Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 541541Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 551541Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 561541Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 571541Srgrimes * SUCH DAMAGE. 581541Srgrimes * 5922521Sdyson * @(#)ffs_alloc.c 8.18 (Berkeley) 5/26/95 601541Srgrimes */ 611541Srgrimes 62116192Sobrien#include <sys/cdefs.h> 63116192Sobrien__FBSDID("$FreeBSD$"); 64116192Sobrien 6513260Swollman#include "opt_quota.h" 6613260Swollman 671541Srgrimes#include <sys/param.h> 68224778Srwatson#include <sys/capability.h> 691541Srgrimes#include <sys/systm.h> 7060041Sphk#include <sys/bio.h> 711541Srgrimes#include <sys/buf.h> 7250253Sbde#include <sys/conf.h> 73202113Smckusick#include <sys/fcntl.h> 7474548Smckusick#include <sys/file.h> 75108524Salfred#include <sys/filedesc.h> 76164033Srwatson#include <sys/priv.h> 771541Srgrimes#include <sys/proc.h> 781541Srgrimes#include <sys/vnode.h> 791541Srgrimes#include <sys/mount.h> 8041124Sdg#include <sys/kernel.h> 81202113Smckusick#include <sys/syscallsubr.h> 8212911Sphk#include <sys/sysctl.h> 831541Srgrimes#include <sys/syslog.h> 84216796Skib#include <sys/taskqueue.h> 851541Srgrimes 86202113Smckusick#include <security/audit/audit.h> 87202113Smckusick 88216796Skib#include <geom/geom.h> 89216796Skib 90202113Smckusick#include <ufs/ufs/dir.h> 9159241Srwatson#include <ufs/ufs/extattr.h> 921541Srgrimes#include <ufs/ufs/quota.h> 931541Srgrimes#include <ufs/ufs/inode.h> 9441124Sdg#include <ufs/ufs/ufs_extern.h> 9530474Sphk#include <ufs/ufs/ufsmount.h> 961541Srgrimes 971541Srgrimes#include <ufs/ffs/fs.h> 
981541Srgrimes#include <ufs/ffs/ffs_extern.h> 99216796Skib#include <ufs/ffs/softdep.h> 1001541Srgrimes 101203763Smckusicktypedef ufs2_daddr_t allocfcn_t(struct inode *ip, u_int cg, ufs2_daddr_t bpref, 102207141Sjeff int size, int rsize); 10312590Sbde 104207141Sjeffstatic ufs2_daddr_t ffs_alloccg(struct inode *, u_int, ufs2_daddr_t, int, int); 10598542Smckusickstatic ufs2_daddr_t 106207141Sjeff ffs_alloccgblk(struct inode *, struct buf *, ufs2_daddr_t, int); 107216796Skibstatic void ffs_blkfree_cg(struct ufsmount *, struct fs *, 108216796Skib struct vnode *, ufs2_daddr_t, long, ino_t, 109216796Skib struct workhead *); 110216796Skibstatic void ffs_blkfree_trim_completed(struct bio *); 111216796Skibstatic void ffs_blkfree_trim_task(void *ctx, int pending __unused); 112173464Sobrien#ifdef INVARIANTS 11398542Smckusickstatic int ffs_checkblk(struct inode *, ufs2_daddr_t, long); 11431352Sbde#endif 115207141Sjeffstatic ufs2_daddr_t ffs_clusteralloc(struct inode *, u_int, ufs2_daddr_t, int, 116207141Sjeff int); 11792728Salfredstatic ino_t ffs_dirpref(struct inode *); 118203763Smckusickstatic ufs2_daddr_t ffs_fragextend(struct inode *, u_int, ufs2_daddr_t, 119203763Smckusick int, int); 12098542Smckusickstatic ufs2_daddr_t ffs_hashalloc 121207141Sjeff (struct inode *, u_int, ufs2_daddr_t, int, int, allocfcn_t *); 122207141Sjeffstatic ufs2_daddr_t ffs_nodealloccg(struct inode *, u_int, ufs2_daddr_t, int, 123207141Sjeff int); 12498542Smckusickstatic ufs1_daddr_t ffs_mapsearch(struct fs *, struct cg *, ufs2_daddr_t, int); 12598542Smckusickstatic int ffs_reallocblks_ufs1(struct vop_reallocblks_args *); 12698542Smckusickstatic int ffs_reallocblks_ufs2(struct vop_reallocblks_args *); 1271541Srgrimes 1281541Srgrimes/* 12996755Strhodes * Allocate a block in the filesystem. 1308876Srgrimes * 1311541Srgrimes * The size of the requested block is given, which must be some 1321541Srgrimes * multiple of fs_fsize and <= fs_bsize. 1331541Srgrimes * A preference may be optionally specified. 
If a preference is given 1341541Srgrimes * the following hierarchy is used to allocate a block: 1351541Srgrimes * 1) allocate the requested block. 1361541Srgrimes * 2) allocate a rotationally optimal block in the same cylinder. 1371541Srgrimes * 3) allocate a block in the same cylinder group. 1381541Srgrimes * 4) quadradically rehash into other cylinder groups, until an 1391541Srgrimes * available block is located. 140166051Smpp * If no block preference is given the following hierarchy is used 1411541Srgrimes * to allocate a block: 1421541Srgrimes * 1) allocate a block in the cylinder group that contains the 1431541Srgrimes * inode for the file. 1441541Srgrimes * 2) quadradically rehash into other cylinder groups, until an 1451541Srgrimes * available block is located. 1461541Srgrimes */ 1471549Srgrimesint 148187790Srwatsonffs_alloc(ip, lbn, bpref, size, flags, cred, bnp) 14996506Sphk struct inode *ip; 15098542Smckusick ufs2_daddr_t lbn, bpref; 151187790Srwatson int size, flags; 1521541Srgrimes struct ucred *cred; 15398542Smckusick ufs2_daddr_t *bnp; 1541541Srgrimes{ 15596506Sphk struct fs *fs; 156140704Sjeff struct ufsmount *ump; 15798542Smckusick ufs2_daddr_t bno; 158203763Smckusick u_int cg, reclaimed; 159151906Sps static struct timeval lastfail; 160151906Sps static int curfail; 161166924Sbrian int64_t delta; 1626357Sphk#ifdef QUOTA 1636357Sphk int error; 1646357Sphk#endif 1658876Srgrimes 1661541Srgrimes *bnp = 0; 1671541Srgrimes fs = ip->i_fs; 168140704Sjeff ump = ip->i_ump; 169140704Sjeff mtx_assert(UFS_MTX(ump), MA_OWNED); 170173464Sobrien#ifdef INVARIANTS 1711541Srgrimes if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) { 17250253Sbde printf("dev = %s, bsize = %ld, size = %d, fs = %s\n", 17350253Sbde devtoname(ip->i_dev), (long)fs->fs_bsize, size, 17450253Sbde fs->fs_fsmnt); 1751541Srgrimes panic("ffs_alloc: bad size"); 1761541Srgrimes } 1771541Srgrimes if (cred == NOCRED) 1787170Sdg panic("ffs_alloc: missing credential"); 179173464Sobrien#endif /* 
INVARIANTS */ 18089637Smckusick reclaimed = 0; 18189637Smckusickretry: 182140704Sjeff#ifdef QUOTA 183140704Sjeff UFS_UNLOCK(ump); 184140704Sjeff error = chkdq(ip, btodb(size), cred, 0); 185140704Sjeff if (error) 186140704Sjeff return (error); 187140704Sjeff UFS_LOCK(ump); 188140704Sjeff#endif 1891541Srgrimes if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0) 1901541Srgrimes goto nospace; 191170587Srwatson if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0) && 19229609Sphk freespace(fs, fs->fs_minfree) - numfrags(fs, size) < 0) 1931541Srgrimes goto nospace; 1941541Srgrimes if (bpref >= fs->fs_size) 1951541Srgrimes bpref = 0; 1961541Srgrimes if (bpref == 0) 1971541Srgrimes cg = ino_to_cg(fs, ip->i_number); 1981541Srgrimes else 1991541Srgrimes cg = dtog(fs, bpref); 200207141Sjeff bno = ffs_hashalloc(ip, cg, bpref, size, size, ffs_alloccg); 2011541Srgrimes if (bno > 0) { 202166924Sbrian delta = btodb(size); 203166924Sbrian DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); 204187790Srwatson if (flags & IO_EXT) 205187790Srwatson ip->i_flag |= IN_CHANGE; 206187790Srwatson else 207187790Srwatson ip->i_flag |= IN_CHANGE | IN_UPDATE; 2081541Srgrimes *bnp = bno; 2091541Srgrimes return (0); 2101541Srgrimes } 211166142Smppnospace: 2121541Srgrimes#ifdef QUOTA 213140704Sjeff UFS_UNLOCK(ump); 2141541Srgrimes /* 2151541Srgrimes * Restore user's disk quota because allocation failed. 
2161541Srgrimes */ 21798542Smckusick (void) chkdq(ip, -btodb(size), cred, FORCE); 218140704Sjeff UFS_LOCK(ump); 2191541Srgrimes#endif 220222958Sjeff if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) { 22189637Smckusick reclaimed = 1; 222220374Smckusick softdep_request_cleanup(fs, ITOV(ip), cred, FLUSH_BLOCKS_WAIT); 22389637Smckusick goto retry; 22489637Smckusick } 225140704Sjeff UFS_UNLOCK(ump); 226223114Smckusick if (reclaimed > 0 && ppsratecheck(&lastfail, &curfail, 1)) { 227151906Sps ffs_fserr(fs, ip->i_number, "filesystem full"); 228151906Sps uprintf("\n%s: write failed, filesystem is full\n", 229151906Sps fs->fs_fsmnt); 230151906Sps } 2311541Srgrimes return (ENOSPC); 2321541Srgrimes} 2331541Srgrimes 2341541Srgrimes/* 2351541Srgrimes * Reallocate a fragment to a bigger size 2361541Srgrimes * 2371541Srgrimes * The number and size of the old block is given, and a preference 2381541Srgrimes * and new size is also specified. The allocator attempts to extend 2391541Srgrimes * the original block. Failing that, the regular block allocator is 2401541Srgrimes * invoked to get an appropriate block. 
2411541Srgrimes */ 2421549Srgrimesint 243187790Srwatsonffs_realloccg(ip, lbprev, bprev, bpref, osize, nsize, flags, cred, bpp) 24496506Sphk struct inode *ip; 24598542Smckusick ufs2_daddr_t lbprev; 246100344Smckusick ufs2_daddr_t bprev; 24798542Smckusick ufs2_daddr_t bpref; 248187790Srwatson int osize, nsize, flags; 2491541Srgrimes struct ucred *cred; 2501541Srgrimes struct buf **bpp; 2511541Srgrimes{ 25289637Smckusick struct vnode *vp; 25389637Smckusick struct fs *fs; 2541541Srgrimes struct buf *bp; 255140704Sjeff struct ufsmount *ump; 256203763Smckusick u_int cg, request, reclaimed; 257251897Sscottl int error, gbflags; 258100344Smckusick ufs2_daddr_t bno; 259151906Sps static struct timeval lastfail; 260151906Sps static int curfail; 261166924Sbrian int64_t delta; 2628876Srgrimes 2631541Srgrimes *bpp = 0; 26489637Smckusick vp = ITOV(ip); 2651541Srgrimes fs = ip->i_fs; 266140704Sjeff bp = NULL; 267140704Sjeff ump = ip->i_ump; 268251897Sscottl gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0; 269251897Sscottl 270140704Sjeff mtx_assert(UFS_MTX(ump), MA_OWNED); 271173464Sobrien#ifdef INVARIANTS 27289637Smckusick if (vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) 27362976Smckusick panic("ffs_realloccg: allocation on suspended filesystem"); 2741541Srgrimes if ((u_int)osize > fs->fs_bsize || fragoff(fs, osize) != 0 || 2751541Srgrimes (u_int)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) { 2761541Srgrimes printf( 27750253Sbde "dev = %s, bsize = %ld, osize = %d, nsize = %d, fs = %s\n", 27850253Sbde devtoname(ip->i_dev), (long)fs->fs_bsize, osize, 2798456Srgrimes nsize, fs->fs_fsmnt); 2801541Srgrimes panic("ffs_realloccg: bad size"); 2811541Srgrimes } 2821541Srgrimes if (cred == NOCRED) 2837170Sdg panic("ffs_realloccg: missing credential"); 284173464Sobrien#endif /* INVARIANTS */ 28589637Smckusick reclaimed = 0; 28689637Smckusickretry: 287170587Srwatson if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0) && 288140704Sjeff freespace(fs, fs->fs_minfree) - numfrags(fs, 
nsize - osize) < 0) { 2891541Srgrimes goto nospace; 290140704Sjeff } 291100344Smckusick if (bprev == 0) { 29298687Smux printf("dev = %s, bsize = %ld, bprev = %jd, fs = %s\n", 29398542Smckusick devtoname(ip->i_dev), (long)fs->fs_bsize, (intmax_t)bprev, 29437555Sbde fs->fs_fsmnt); 2951541Srgrimes panic("ffs_realloccg: bad bprev"); 2961541Srgrimes } 297140704Sjeff UFS_UNLOCK(ump); 2981541Srgrimes /* 2991541Srgrimes * Allocate the extra space in the buffer. 3001541Srgrimes */ 301251897Sscottl error = bread_gb(vp, lbprev, osize, NOCRED, gbflags, &bp); 3023487Sphk if (error) { 3031541Srgrimes brelse(bp); 3041541Srgrimes return (error); 3051541Srgrimes } 3066864Sdg 30798542Smckusick if (bp->b_blkno == bp->b_lblkno) { 30898542Smckusick if (lbprev >= NDADDR) 3096864Sdg panic("ffs_realloccg: lbprev out of range"); 3106864Sdg bp->b_blkno = fsbtodb(fs, bprev); 3116864Sdg } 3128876Srgrimes 3131541Srgrimes#ifdef QUOTA 31498542Smckusick error = chkdq(ip, btodb(nsize - osize), cred, 0); 3153487Sphk if (error) { 3161541Srgrimes brelse(bp); 3171541Srgrimes return (error); 3181541Srgrimes } 3191541Srgrimes#endif 3201541Srgrimes /* 3211541Srgrimes * Check for extension in the existing location. 
3221541Srgrimes */ 3231541Srgrimes cg = dtog(fs, bprev); 324140704Sjeff UFS_LOCK(ump); 32598542Smckusick bno = ffs_fragextend(ip, cg, bprev, osize, nsize); 3263487Sphk if (bno) { 3271541Srgrimes if (bp->b_blkno != fsbtodb(fs, bno)) 32823560Smpp panic("ffs_realloccg: bad blockno"); 329166924Sbrian delta = btodb(nsize - osize); 330166924Sbrian DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); 331187790Srwatson if (flags & IO_EXT) 332187790Srwatson ip->i_flag |= IN_CHANGE; 333187790Srwatson else 334187790Srwatson ip->i_flag |= IN_CHANGE | IN_UPDATE; 3357399Sdg allocbuf(bp, nsize); 3361541Srgrimes bp->b_flags |= B_DONE; 337251897Sscottl vfs_bio_bzero_buf(bp, osize, nsize - osize); 338192260Salc if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO) 339192260Salc vfs_bio_set_valid(bp, osize, nsize - osize); 3401541Srgrimes *bpp = bp; 3411541Srgrimes return (0); 3421541Srgrimes } 3431541Srgrimes /* 3441541Srgrimes * Allocate a new disk location. 3451541Srgrimes */ 3461541Srgrimes if (bpref >= fs->fs_size) 3471541Srgrimes bpref = 0; 3481541Srgrimes switch ((int)fs->fs_optim) { 3491541Srgrimes case FS_OPTSPACE: 3501541Srgrimes /* 3518876Srgrimes * Allocate an exact sized fragment. Although this makes 3528876Srgrimes * best use of space, we will waste time relocating it if 3531541Srgrimes * the file continues to grow. If the fragmentation is 3541541Srgrimes * less than half of the minimum free reserve, we choose 3551541Srgrimes * to begin optimizing for time. 
3561541Srgrimes */ 3571541Srgrimes request = nsize; 3586993Sdg if (fs->fs_minfree <= 5 || 3591541Srgrimes fs->fs_cstotal.cs_nffree > 36058087Smckusick (off_t)fs->fs_dsize * fs->fs_minfree / (2 * 100)) 3611541Srgrimes break; 3621541Srgrimes log(LOG_NOTICE, "%s: optimization changed from SPACE to TIME\n", 3631541Srgrimes fs->fs_fsmnt); 3641541Srgrimes fs->fs_optim = FS_OPTTIME; 3651541Srgrimes break; 3661541Srgrimes case FS_OPTTIME: 3671541Srgrimes /* 3681541Srgrimes * At this point we have discovered a file that is trying to 3691541Srgrimes * grow a small fragment to a larger fragment. To save time, 3701541Srgrimes * we allocate a full sized block, then free the unused portion. 3711541Srgrimes * If the file continues to grow, the `ffs_fragextend' call 3721541Srgrimes * above will be able to grow it in place without further 3731541Srgrimes * copying. If aberrant programs cause disk fragmentation to 3741541Srgrimes * grow within 2% of the free reserve, we choose to begin 3751541Srgrimes * optimizing for space. 
3761541Srgrimes */ 3771541Srgrimes request = fs->fs_bsize; 3781541Srgrimes if (fs->fs_cstotal.cs_nffree < 37958087Smckusick (off_t)fs->fs_dsize * (fs->fs_minfree - 2) / 100) 3801541Srgrimes break; 3811541Srgrimes log(LOG_NOTICE, "%s: optimization changed from TIME to SPACE\n", 3821541Srgrimes fs->fs_fsmnt); 3831541Srgrimes fs->fs_optim = FS_OPTSPACE; 3841541Srgrimes break; 3851541Srgrimes default: 38650253Sbde printf("dev = %s, optim = %ld, fs = %s\n", 38750253Sbde devtoname(ip->i_dev), (long)fs->fs_optim, fs->fs_fsmnt); 3881541Srgrimes panic("ffs_realloccg: bad optim"); 3891541Srgrimes /* NOTREACHED */ 3901541Srgrimes } 391207141Sjeff bno = ffs_hashalloc(ip, cg, bpref, request, nsize, ffs_alloccg); 3921541Srgrimes if (bno > 0) { 3931541Srgrimes bp->b_blkno = fsbtodb(fs, bno); 39489637Smckusick if (!DOINGSOFTDEP(vp)) 395140704Sjeff ffs_blkfree(ump, fs, ip->i_devvp, bprev, (long)osize, 396223127Smckusick ip->i_number, vp->v_type, NULL); 397166924Sbrian delta = btodb(nsize - osize); 398166924Sbrian DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); 399187790Srwatson if (flags & IO_EXT) 400187790Srwatson ip->i_flag |= IN_CHANGE; 401187790Srwatson else 402187790Srwatson ip->i_flag |= IN_CHANGE | IN_UPDATE; 4037399Sdg allocbuf(bp, nsize); 4041541Srgrimes bp->b_flags |= B_DONE; 405251897Sscottl vfs_bio_bzero_buf(bp, osize, nsize - osize); 406192260Salc if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO) 407192260Salc vfs_bio_set_valid(bp, osize, nsize - osize); 4081541Srgrimes *bpp = bp; 4091541Srgrimes return (0); 4101541Srgrimes } 4111541Srgrimes#ifdef QUOTA 412140704Sjeff UFS_UNLOCK(ump); 4131541Srgrimes /* 4141541Srgrimes * Restore user's disk quota because allocation failed. 
4151541Srgrimes */ 41698542Smckusick (void) chkdq(ip, -btodb(nsize - osize), cred, FORCE); 417140704Sjeff UFS_LOCK(ump); 4181541Srgrimes#endif 4191541Srgrimesnospace: 4201541Srgrimes /* 4211541Srgrimes * no space available 4221541Srgrimes */ 423222958Sjeff if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) { 42489637Smckusick reclaimed = 1; 425140704Sjeff UFS_UNLOCK(ump); 426203818Skib if (bp) { 427140704Sjeff brelse(bp); 428203818Skib bp = NULL; 429203818Skib } 430140704Sjeff UFS_LOCK(ump); 431222724Smckusick softdep_request_cleanup(fs, vp, cred, FLUSH_BLOCKS_WAIT); 43289637Smckusick goto retry; 43389637Smckusick } 434140704Sjeff UFS_UNLOCK(ump); 435140704Sjeff if (bp) 436140704Sjeff brelse(bp); 437223114Smckusick if (reclaimed > 0 && ppsratecheck(&lastfail, &curfail, 1)) { 438151906Sps ffs_fserr(fs, ip->i_number, "filesystem full"); 439151906Sps uprintf("\n%s: write failed, filesystem is full\n", 440151906Sps fs->fs_fsmnt); 441151906Sps } 4421541Srgrimes return (ENOSPC); 4431541Srgrimes} 4441541Srgrimes 4451541Srgrimes/* 4461541Srgrimes * Reallocate a sequence of blocks into a contiguous sequence of blocks. 4471541Srgrimes * 4481541Srgrimes * The vnode and an array of buffer pointers for a range of sequential 4491541Srgrimes * logical blocks to be made contiguous is given. The allocator attempts 45098542Smckusick * to find a range of sequential blocks starting as close as possible 45198542Smckusick * from the end of the allocation for the logical block immediately 45298542Smckusick * preceding the current range. If successful, the physical block numbers 45398542Smckusick * in the buffer pointers and in the inode are changed to reflect the new 45498542Smckusick * allocation. If unsuccessful, the allocation is left unchanged. The 45598542Smckusick * success in doing the reallocation is returned. Note that the error 45698542Smckusick * return is not reflected back to the user. Rather the previous block 45798542Smckusick * allocation will be used. 
4581541Srgrimes */ 45974548Smckusick 46074548SmckusickSYSCTL_NODE(_vfs, OID_AUTO, ffs, CTLFLAG_RW, 0, "FFS filesystem"); 46174548Smckusick 46212911Sphkstatic int doasyncfree = 1; 46374548SmckusickSYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncfree, CTLFLAG_RW, &doasyncfree, 0, ""); 46422521Sdyson 46531352Sbdestatic int doreallocblks = 1; 46674548SmckusickSYSCTL_INT(_vfs_ffs, OID_AUTO, doreallocblks, CTLFLAG_RW, &doreallocblks, 0, ""); 46722521Sdyson 46842351Sbde#ifdef DEBUG 46942351Sbdestatic volatile int prtrealloc = 0; 47042351Sbde#endif 47131351Sbde 4721541Srgrimesint 4731541Srgrimesffs_reallocblks(ap) 4741541Srgrimes struct vop_reallocblks_args /* { 4751541Srgrimes struct vnode *a_vp; 4761541Srgrimes struct cluster_save *a_buflist; 4771541Srgrimes } */ *ap; 4781541Srgrimes{ 47998542Smckusick 48098542Smckusick if (doreallocblks == 0) 48198542Smckusick return (ENOSPC); 482207141Sjeff /* 483207141Sjeff * We can't wait in softdep prealloc as it may fsync and recurse 484207141Sjeff * here. Instead we simply fail to reallocate blocks if this 485207141Sjeff * rare condition arises. 
486207141Sjeff */ 487207141Sjeff if (DOINGSOFTDEP(ap->a_vp)) 488207141Sjeff if (softdep_prealloc(ap->a_vp, MNT_NOWAIT) != 0) 489207141Sjeff return (ENOSPC); 49098542Smckusick if (VTOI(ap->a_vp)->i_ump->um_fstype == UFS1) 49198542Smckusick return (ffs_reallocblks_ufs1(ap)); 49298542Smckusick return (ffs_reallocblks_ufs2(ap)); 49398542Smckusick} 49498542Smckusick 49598542Smckusickstatic int 49698542Smckusickffs_reallocblks_ufs1(ap) 49798542Smckusick struct vop_reallocblks_args /* { 49898542Smckusick struct vnode *a_vp; 49998542Smckusick struct cluster_save *a_buflist; 50098542Smckusick } */ *ap; 50198542Smckusick{ 5021541Srgrimes struct fs *fs; 5031541Srgrimes struct inode *ip; 5041541Srgrimes struct vnode *vp; 5051541Srgrimes struct buf *sbp, *ebp; 50698542Smckusick ufs1_daddr_t *bap, *sbap, *ebap = 0; 5071541Srgrimes struct cluster_save *buflist; 508140704Sjeff struct ufsmount *ump; 50998542Smckusick ufs_lbn_t start_lbn, end_lbn; 51098542Smckusick ufs1_daddr_t soff, newblk, blkno; 51198542Smckusick ufs2_daddr_t pref; 5121541Srgrimes struct indir start_ap[NIADDR + 1], end_ap[NIADDR + 1], *idp; 51398542Smckusick int i, len, start_lvl, end_lvl, ssize; 5141541Srgrimes 5151541Srgrimes vp = ap->a_vp; 5161541Srgrimes ip = VTOI(vp); 5171541Srgrimes fs = ip->i_fs; 518140704Sjeff ump = ip->i_ump; 519255494Smckusick /* 520260829Smckusick * If we are not tracking block clusters or if we have less than 4% 521255494Smckusick * free blocks left, then do not attempt to cluster. Running with 522255494Smckusick * less than 5% free block reserve is not recommended and those that 523255494Smckusick * choose to do so do not expect to have good file layout. 
524255494Smckusick */ 525260829Smckusick if (fs->fs_contigsumsize <= 0 || freespace(fs, 4) < 0) 5261541Srgrimes return (ENOSPC); 5271541Srgrimes buflist = ap->a_buflist; 5281541Srgrimes len = buflist->bs_nchildren; 5291541Srgrimes start_lbn = buflist->bs_children[0]->b_lblkno; 5301541Srgrimes end_lbn = start_lbn + len - 1; 531173464Sobrien#ifdef INVARIANTS 53222521Sdyson for (i = 0; i < len; i++) 53322521Sdyson if (!ffs_checkblk(ip, 53422521Sdyson dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) 53522521Sdyson panic("ffs_reallocblks: unallocated block 1"); 5361541Srgrimes for (i = 1; i < len; i++) 5371541Srgrimes if (buflist->bs_children[i]->b_lblkno != start_lbn + i) 53822521Sdyson panic("ffs_reallocblks: non-logical cluster"); 53922521Sdyson blkno = buflist->bs_children[0]->b_blkno; 54022521Sdyson ssize = fsbtodb(fs, fs->fs_frag); 54122521Sdyson for (i = 1; i < len - 1; i++) 54222521Sdyson if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize)) 54322521Sdyson panic("ffs_reallocblks: non-physical cluster %d", i); 5441541Srgrimes#endif 5451541Srgrimes /* 546249782Smckusick * If the cluster crosses the boundary for the first indirect 547249782Smckusick * block, leave space for the indirect block. Indirect blocks 548249782Smckusick * are initially laid out in a position after the last direct 549249782Smckusick * block. Block reallocation would usually destroy locality by 550249782Smckusick * moving the indirect block out of the way to make room for 551249782Smckusick * data blocks if we didn't compensate here. We should also do 552249782Smckusick * this for other indirect block boundaries, but it is only 553249782Smckusick * important for the first one. 
554249782Smckusick */ 555249782Smckusick if (start_lbn < NDADDR && end_lbn >= NDADDR) 556249782Smckusick return (ENOSPC); 557249782Smckusick /* 5581541Srgrimes * If the latest allocation is in a new cylinder group, assume that 5591541Srgrimes * the filesystem has decided to move and do not force it back to 5601541Srgrimes * the previous cylinder group. 5611541Srgrimes */ 5621541Srgrimes if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) != 5631541Srgrimes dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno))) 5641541Srgrimes return (ENOSPC); 5651541Srgrimes if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) || 5661541Srgrimes ufs_getlbns(vp, end_lbn, end_ap, &end_lvl)) 5671541Srgrimes return (ENOSPC); 5681541Srgrimes /* 5691541Srgrimes * Get the starting offset and block map for the first block. 5701541Srgrimes */ 5711541Srgrimes if (start_lvl == 0) { 57298542Smckusick sbap = &ip->i_din1->di_db[0]; 5731541Srgrimes soff = start_lbn; 5741541Srgrimes } else { 5751541Srgrimes idp = &start_ap[start_lvl - 1]; 5761541Srgrimes if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) { 5771541Srgrimes brelse(sbp); 5781541Srgrimes return (ENOSPC); 5791541Srgrimes } 58098542Smckusick sbap = (ufs1_daddr_t *)sbp->b_data; 5811541Srgrimes soff = idp->in_off; 5821541Srgrimes } 5831541Srgrimes /* 5841541Srgrimes * If the block range spans two block maps, get the second map. 
5851541Srgrimes */ 5861541Srgrimes if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) { 5871541Srgrimes ssize = len; 5881541Srgrimes } else { 589173464Sobrien#ifdef INVARIANTS 590174126Skensmith if (start_lvl > 0 && 591174126Skensmith start_ap[start_lvl - 1].in_lbn == idp->in_lbn) 5921541Srgrimes panic("ffs_reallocblk: start == end"); 5931541Srgrimes#endif 5941541Srgrimes ssize = len - (idp->in_off + 1); 5951541Srgrimes if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp)) 5961541Srgrimes goto fail; 59798542Smckusick ebap = (ufs1_daddr_t *)ebp->b_data; 5981541Srgrimes } 5991541Srgrimes /* 600140704Sjeff * Find the preferred location for the cluster. 601140704Sjeff */ 602140704Sjeff UFS_LOCK(ump); 603140704Sjeff pref = ffs_blkpref_ufs1(ip, start_lbn, soff, sbap); 604140704Sjeff /* 6051541Srgrimes * Search the block map looking for an allocation of the desired size. 6061541Srgrimes */ 60798542Smckusick if ((newblk = ffs_hashalloc(ip, dtog(fs, pref), pref, 608207141Sjeff len, len, ffs_clusteralloc)) == 0) { 609140704Sjeff UFS_UNLOCK(ump); 6101541Srgrimes goto fail; 611140704Sjeff } 6121541Srgrimes /* 6131541Srgrimes * We have found a new contiguous block. 6141541Srgrimes * 6151541Srgrimes * First we have to replace the old block pointers with the new 6161541Srgrimes * block pointers in the inode and indirect blocks associated 6171541Srgrimes * with the file. 
6181541Srgrimes */ 61922521Sdyson#ifdef DEBUG 62022521Sdyson if (prtrealloc) 621103594Sobrien printf("realloc: ino %d, lbns %jd-%jd\n\told:", ip->i_number, 62298542Smckusick (intmax_t)start_lbn, (intmax_t)end_lbn); 62322521Sdyson#endif 6241541Srgrimes blkno = newblk; 6251541Srgrimes for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) { 62634266Sjulian if (i == ssize) { 6271541Srgrimes bap = ebap; 62834266Sjulian soff = -i; 62934266Sjulian } 630173464Sobrien#ifdef INVARIANTS 63122521Sdyson if (!ffs_checkblk(ip, 63222521Sdyson dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) 63322521Sdyson panic("ffs_reallocblks: unallocated block 2"); 63422521Sdyson if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap) 6351541Srgrimes panic("ffs_reallocblks: alloc mismatch"); 6361541Srgrimes#endif 63722521Sdyson#ifdef DEBUG 63822521Sdyson if (prtrealloc) 63922521Sdyson printf(" %d,", *bap); 64022521Sdyson#endif 64134266Sjulian if (DOINGSOFTDEP(vp)) { 64298542Smckusick if (sbap == &ip->i_din1->di_db[0] && i < ssize) 64334266Sjulian softdep_setup_allocdirect(ip, start_lbn + i, 64434266Sjulian blkno, *bap, fs->fs_bsize, fs->fs_bsize, 64534266Sjulian buflist->bs_children[i]); 64634266Sjulian else 64734266Sjulian softdep_setup_allocindir_page(ip, start_lbn + i, 64834266Sjulian i < ssize ? sbp : ebp, soff + i, blkno, 64934266Sjulian *bap, buflist->bs_children[i]); 65034266Sjulian } 6511541Srgrimes *bap++ = blkno; 6521541Srgrimes } 6531541Srgrimes /* 6541541Srgrimes * Next we must write out the modified inode and indirect blocks. 6551541Srgrimes * For strict correctness, the writes should be synchronous since 6561541Srgrimes * the old block values may have been written to disk. In practise 6578876Srgrimes * they are almost never written, but if we are concerned about 6581541Srgrimes * strict correctness, the `doasyncfree' flag should be set to zero. 
6591541Srgrimes * 6601541Srgrimes * The test on `doasyncfree' should be changed to test a flag 6611541Srgrimes * that shows whether the associated buffers and inodes have 6621541Srgrimes * been written. The flag should be set when the cluster is 6631541Srgrimes * started and cleared whenever the buffer or inode is flushed. 6641541Srgrimes * We can then check below to see if it is set, and do the 6651541Srgrimes * synchronous write only when it has been cleared. 6661541Srgrimes */ 66798542Smckusick if (sbap != &ip->i_din1->di_db[0]) { 6681541Srgrimes if (doasyncfree) 6691541Srgrimes bdwrite(sbp); 6701541Srgrimes else 6711541Srgrimes bwrite(sbp); 6721541Srgrimes } else { 6731541Srgrimes ip->i_flag |= IN_CHANGE | IN_UPDATE; 67442374Sbde if (!doasyncfree) 675141526Sphk ffs_update(vp, 1); 6761541Srgrimes } 67746568Speter if (ssize < len) { 6781541Srgrimes if (doasyncfree) 6791541Srgrimes bdwrite(ebp); 6801541Srgrimes else 6811541Srgrimes bwrite(ebp); 68246568Speter } 6831541Srgrimes /* 6841541Srgrimes * Last, free the old blocks and assign the new blocks to the buffers. 
 */
#ifdef DEBUG
	if (prtrealloc)
		printf("\n\tnew:");
#endif
	for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) {
		/* With softdep, the old blocks are freed by the dependency code. */
		if (!DOINGSOFTDEP(vp))
			ffs_blkfree(ump, fs, ip->i_devvp,
			    dbtofsb(fs, buflist->bs_children[i]->b_blkno),
			    fs->fs_bsize, ip->i_number, vp->v_type, NULL);
		buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno);
#ifdef INVARIANTS
		if (!ffs_checkblk(ip,
		   dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
			panic("ffs_reallocblks: unallocated block 3");
#endif
#ifdef DEBUG
		if (prtrealloc)
			printf(" %d,", blkno);
#endif
	}
#ifdef DEBUG
	if (prtrealloc) {
		prtrealloc--;
		printf("\n");
	}
#endif
	return (0);

fail:
	if (ssize < len)
		brelse(ebp);
	if (sbap != &ip->i_din1->di_db[0])
		brelse(sbp);
	return (ENOSPC);
}

/*
 * UFS2 flavor of block-cluster reallocation: try to move the blocks
 * named in the buffer list into a single contiguous run so that the
 * cluster can later be read or written in one I/O.  Mirrors
 * ffs_reallocblks_ufs1 above, but operates on the 64-bit UFS2
 * dinode and block pointers.  Returns 0 on success, ENOSPC when no
 * suitable contiguous run is found or reallocation is inadvisable.
 */
static int
ffs_reallocblks_ufs2(ap)
	struct vop_reallocblks_args /* {
		struct vnode *a_vp;
		struct cluster_save *a_buflist;
	} */ *ap;
{
	struct fs *fs;
	struct inode *ip;
	struct vnode *vp;
	struct buf *sbp, *ebp;
	ufs2_daddr_t *bap, *sbap, *ebap = 0;
	struct cluster_save *buflist;
	struct ufsmount *ump;
	ufs_lbn_t start_lbn, end_lbn;
	ufs2_daddr_t soff, newblk, blkno, pref;
	struct indir start_ap[NIADDR + 1], end_ap[NIADDR + 1], *idp;
	int i, len, start_lvl, end_lvl, ssize;

	vp = ap->a_vp;
	ip = VTOI(vp);
	fs = ip->i_fs;
	ump = ip->i_ump;
	/*
	 * If we are not tracking block clusters or if we have less than 4%
	 * free blocks left, then do not attempt to cluster. Running with
	 * less than 5% free block reserve is not recommended and those that
	 * choose to do so do not expect to have good file layout.
	 */
	if (fs->fs_contigsumsize <= 0 || freespace(fs, 4) < 0)
		return (ENOSPC);
	buflist = ap->a_buflist;
	len = buflist->bs_nchildren;
	start_lbn = buflist->bs_children[0]->b_lblkno;
	end_lbn = start_lbn + len - 1;
#ifdef INVARIANTS
	for (i = 0; i < len; i++)
		if (!ffs_checkblk(ip,
		   dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
			panic("ffs_reallocblks: unallocated block 1");
	for (i = 1; i < len; i++)
		if (buflist->bs_children[i]->b_lblkno != start_lbn + i)
			panic("ffs_reallocblks: non-logical cluster");
	blkno = buflist->bs_children[0]->b_blkno;
	ssize = fsbtodb(fs, fs->fs_frag);
	for (i = 1; i < len - 1; i++)
		if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize))
			panic("ffs_reallocblks: non-physical cluster %d", i);
#endif
	/*
	 * If the cluster crosses the boundary for the first indirect
	 * block, do not move anything in it. Indirect blocks are
	 * usually initially laid out in a position between the data
	 * blocks. Block reallocation would usually destroy locality by
	 * moving the indirect block out of the way to make room for
	 * data blocks if we didn't compensate here. We should also do
	 * this for other indirect block boundaries, but it is only
	 * important for the first one.
	 */
	if (start_lbn < NDADDR && end_lbn >= NDADDR)
		return (ENOSPC);
	/*
	 * If the latest allocation is in a new cylinder group, assume that
	 * the filesystem has decided to move and do not force it back to
	 * the previous cylinder group.
	 */
	if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) !=
	    dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno)))
		return (ENOSPC);
	if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) ||
	    ufs_getlbns(vp, end_lbn, end_ap, &end_lvl))
		return (ENOSPC);
	/*
	 * Get the starting offset and block map for the first block.
	 * Level 0 means the block pointers live directly in the inode.
	 */
	if (start_lvl == 0) {
		sbap = &ip->i_din2->di_db[0];
		soff = start_lbn;
	} else {
		idp = &start_ap[start_lvl - 1];
		if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) {
			brelse(sbp);
			return (ENOSPC);
		}
		sbap = (ufs2_daddr_t *)sbp->b_data;
		soff = idp->in_off;
	}
	/*
	 * If the block range spans two block maps, get the second map.
	 */
	if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) {
		ssize = len;
	} else {
#ifdef INVARIANTS
		if (start_lvl > 0 &&
		    start_ap[start_lvl - 1].in_lbn == idp->in_lbn)
			panic("ffs_reallocblk: start == end");
#endif
		ssize = len - (idp->in_off + 1);
		if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp))
			goto fail;
		ebap = (ufs2_daddr_t *)ebp->b_data;
	}
	/*
	 * Find the preferred location for the cluster.
	 */
	UFS_LOCK(ump);
	pref = ffs_blkpref_ufs2(ip, start_lbn, soff, sbap);
	/*
	 * Search the block map looking for an allocation of the desired size.
	 * NOTE(review): ffs_hashalloc appears to release the UFS lock on
	 * success (see its header comment); we only unlock here on failure.
	 */
	if ((newblk = ffs_hashalloc(ip, dtog(fs, pref), pref,
	    len, len, ffs_clusteralloc)) == 0) {
		UFS_UNLOCK(ump);
		goto fail;
	}
	/*
	 * We have found a new contiguous block.
	 *
	 * First we have to replace the old block pointers with the new
	 * block pointers in the inode and indirect blocks associated
	 * with the file.
	 */
#ifdef DEBUG
	if (prtrealloc)
		printf("realloc: ino %d, lbns %jd-%jd\n\told:", ip->i_number,
		    (intmax_t)start_lbn, (intmax_t)end_lbn);
#endif
	blkno = newblk;
	for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) {
		/* Crossed into the second map: switch pointers and rebase. */
		if (i == ssize) {
			bap = ebap;
			soff = -i;
		}
#ifdef INVARIANTS
		if (!ffs_checkblk(ip,
		   dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
			panic("ffs_reallocblks: unallocated block 2");
		if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap)
			panic("ffs_reallocblks: alloc mismatch");
#endif
#ifdef DEBUG
		if (prtrealloc)
			printf(" %jd,", (intmax_t)*bap);
#endif
		if (DOINGSOFTDEP(vp)) {
			if (sbap == &ip->i_din2->di_db[0] && i < ssize)
				softdep_setup_allocdirect(ip, start_lbn + i,
				    blkno, *bap, fs->fs_bsize, fs->fs_bsize,
				    buflist->bs_children[i]);
			else
				softdep_setup_allocindir_page(ip, start_lbn + i,
				    i < ssize ? sbp : ebp, soff + i, blkno,
				    *bap, buflist->bs_children[i]);
		}
		*bap++ = blkno;
	}
	/*
	 * Next we must write out the modified inode and indirect blocks.
	 * For strict correctness, the writes should be synchronous since
	 * the old block values may have been written to disk. In practice
	 * they are almost never written, but if we are concerned about
	 * strict correctness, the `doasyncfree' flag should be set to zero.
	 *
	 * The test on `doasyncfree' should be changed to test a flag
	 * that shows whether the associated buffers and inodes have
	 * been written. The flag should be set when the cluster is
	 * started and cleared whenever the buffer or inode is flushed.
	 * We can then check below to see if it is set, and do the
	 * synchronous write only when it has been cleared.
	 */
	if (sbap != &ip->i_din2->di_db[0]) {
		if (doasyncfree)
			bdwrite(sbp);
		else
			bwrite(sbp);
	} else {
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
		if (!doasyncfree)
			ffs_update(vp, 1);
	}
	if (ssize < len) {
		if (doasyncfree)
			bdwrite(ebp);
		else
			bwrite(ebp);
	}
	/*
	 * Last, free the old blocks and assign the new blocks to the buffers.
	 */
#ifdef DEBUG
	if (prtrealloc)
		printf("\n\tnew:");
#endif
	for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) {
		/* With softdep, the old blocks are freed by the dependency code. */
		if (!DOINGSOFTDEP(vp))
			ffs_blkfree(ump, fs, ip->i_devvp,
			    dbtofsb(fs, buflist->bs_children[i]->b_blkno),
			    fs->fs_bsize, ip->i_number, vp->v_type, NULL);
		buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno);
#ifdef INVARIANTS
		if (!ffs_checkblk(ip,
		   dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
			panic("ffs_reallocblks: unallocated block 3");
#endif
#ifdef DEBUG
		if (prtrealloc)
			printf(" %jd,", (intmax_t)blkno);
#endif
	}
#ifdef DEBUG
	if (prtrealloc) {
		prtrealloc--;
		printf("\n");
	}
#endif
	return (0);

fail:
	if (ssize < len)
		brelse(ebp);
	if (sbap != &ip->i_din2->di_db[0])
		brelse(sbp);
	return (ENOSPC);
}

/*
 * Allocate an inode in the filesystem.
 *
 * If allocating a directory, use ffs_dirpref to select the inode.
 * If allocating in a directory, the following hierarchy is followed:
 *   1) allocate the preferred inode.
 *   2) allocate an inode in the same cylinder group.
 *   3) quadratically rehash into other cylinder groups, until an
 *      available inode is located.
 * If no inode preference is given the following hierarchy is used
 * to allocate an inode:
 *   1) allocate an inode in cylinder group 0.
 *   2) quadratically rehash into other cylinder groups, until an
 *      available inode is located.
 */
int
ffs_valloc(pvp, mode, cred, vpp)
	struct vnode *pvp;		/* parent directory vnode */
	int mode;			/* IFMT bits select dir vs. file policy */
	struct ucred *cred;
	struct vnode **vpp;		/* out: vnode of the new inode */
{
	struct inode *pip;
	struct fs *fs;
	struct inode *ip;
	struct timespec ts;
	struct ufsmount *ump;
	ino_t ino, ipref;
	u_int cg;
	int error, error1, reclaimed;
	static struct timeval lastfail;
	static int curfail;

	*vpp = NULL;
	pip = VTOI(pvp);
	fs = pip->i_fs;
	ump = pip->i_ump;

	UFS_LOCK(ump);
	reclaimed = 0;
retry:
	if (fs->fs_cstotal.cs_nifree == 0)
		goto noinodes;

	if ((mode & IFMT) == IFDIR)
		ipref = ffs_dirpref(pip);
	else
		ipref = pip->i_number;
	if (ipref >= fs->fs_ncg * fs->fs_ipg)
		ipref = 0;
	cg = ino_to_cg(fs, ipref);
	/*
	 * Track number of dirs created one after another
	 * in the same cg without intervening files.
	 */
	if ((mode & IFMT) == IFDIR) {
		if (fs->fs_contigdirs[cg] < 255)
			fs->fs_contigdirs[cg]++;
	} else {
		if (fs->fs_contigdirs[cg] > 0)
			fs->fs_contigdirs[cg]--;
	}
	/*
	 * NOTE(review): ffs_hashalloc releases the UFS lock on success
	 * (per its header comment), so no UFS_UNLOCK on the success path.
	 */
	ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0,
					(allocfcn_t *)ffs_nodealloccg);
	if (ino == 0)
		goto noinodes;
	error = ffs_vget(pvp->v_mount, ino, LK_EXCLUSIVE, vpp);
	if (error) {
		/*
		 * The on-disk inode could not be fetched normally; force
		 * it in so the allocation can be undone, then back out.
		 */
		error1 = ffs_vgetf(pvp->v_mount, ino, LK_EXCLUSIVE, vpp,
		    FFSV_FORCEINSMQ);
		ffs_vfree(pvp, ino, mode);
		if (error1 == 0) {
			ip = VTOI(*vpp);
			if (ip->i_mode)
				goto dup_alloc;
			ip->i_flag |= IN_MODIFIED;
			vput(*vpp);
		}
		return (error);
	}
	ip = VTOI(*vpp);
	if (ip->i_mode) {
dup_alloc:
		printf("mode = 0%o, inum = %lu, fs = %s\n",
		    ip->i_mode, (u_long)ip->i_number, fs->fs_fsmnt);
		panic("ffs_valloc: dup alloc");
	}
	if (DIP(ip, i_blocks) && (fs->fs_flags & FS_UNCLEAN) == 0) {  /* XXX */
		printf("free inode %s/%lu had %ld blocks\n",
		    fs->fs_fsmnt, (u_long)ino, (long)DIP(ip, i_blocks));
		DIP_SET(ip, i_blocks, 0);
	}
	ip->i_flags = 0;
	DIP_SET(ip, i_flags, 0);
	/*
	 * Set up a new generation number for this inode.
	 * Keep it non-zero; zero means "never used" above.
	 */
	if (ip->i_gen == 0 || ++ip->i_gen == 0)
		ip->i_gen = arc4random() / 2 + 1;
	DIP_SET(ip, i_gen, ip->i_gen);
	if (fs->fs_magic == FS_UFS2_MAGIC) {
		/* UFS2 records a creation (birth) time. */
		vfs_timestamp(&ts);
		ip->i_din2->di_birthtime = ts.tv_sec;
		ip->i_din2->di_birthnsec = ts.tv_nsec;
	}
	ufs_prepare_reclaim(*vpp);
	ip->i_flag = 0;
	(*vpp)->v_vflag = 0;
	(*vpp)->v_type = VNON;
	if (fs->fs_magic == FS_UFS2_MAGIC)
		(*vpp)->v_op = &ffs_vnodeops2;
	else
		(*vpp)->v_op = &ffs_vnodeops1;
	return (0);
noinodes:
	/* Once only: ask softdep to flush freed inodes, then retry. */
	if (reclaimed == 0) {
		reclaimed = 1;
		softdep_request_cleanup(fs, pvp, cred, FLUSH_INODES_WAIT);
		goto retry;
	}
	UFS_UNLOCK(ump);
	if (ppsratecheck(&lastfail, &curfail, 1)) {
		ffs_fserr(fs, pip->i_number, "out of inodes");
		uprintf("\n%s: create/symlink failed, no inodes free\n",
		    fs->fs_fsmnt);
	}
	return (ENOSPC);
}

/*
 * Find a cylinder group to place a directory.
 *
 * The policy implemented by this algorithm is to allocate a
 * directory inode in the same cylinder group as its parent
 * directory, but also to reserve space for its files inodes
 * and data. Restrict the number of directories which may be
 * allocated one after another in the same cylinder group
 * without intervening allocation of files.
 *
 * If we allocate a first level directory then force allocation
 * in another cylinder group.
 *
 * Returns the first inode number of the chosen cylinder group.
 * Called with the UFS mount mutex held (asserted below).
 */
static ino_t
ffs_dirpref(pip)
	struct inode *pip;		/* inode of the parent directory */
{
	struct fs *fs;
	int cg, prefcg, dirsize, cgsize;
	u_int avgifree, avgbfree, avgndir, curdirsize;
	u_int minifree, minbfree, maxndir;
	u_int mincg, minndir;
	u_int maxcontigdirs;

	mtx_assert(UFS_MTX(pip->i_ump), MA_OWNED);
	fs = pip->i_fs;

	/* Per-cylinder-group averages across the whole filesystem. */
	avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg;
	avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
	avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg;

	/*
	 * Force allocation in another cg if creating a first level dir.
	 * Pick the cg with the fewest directories among those with at
	 * least average free inodes and blocks, scanning forward from a
	 * random starting point and then wrapping.
	 */
	ASSERT_VOP_LOCKED(ITOV(pip), "ffs_dirpref");
	if (ITOV(pip)->v_vflag & VV_ROOT) {
		prefcg = arc4random() % fs->fs_ncg;
		mincg = prefcg;
		minndir = fs->fs_ipg;
		for (cg = prefcg; cg < fs->fs_ncg; cg++)
			if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
			    fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
			    fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
				mincg = cg;
				minndir = fs->fs_cs(fs, cg).cs_ndir;
			}
		for (cg = 0; cg < prefcg; cg++)
			if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
			    fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
			    fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
				mincg = cg;
				minndir = fs->fs_cs(fs, cg).cs_ndir;
			}
		return ((ino_t)(fs->fs_ipg * mincg));
	}

	/*
	 * Count various limits which are used for
	 * optimal allocation of a directory inode.
	 */
	maxndir = min(avgndir + fs->fs_ipg / 16, fs->fs_ipg);
	minifree = avgifree - avgifree / 4;
	if (minifree < 1)
		minifree = 1;
	minbfree = avgbfree - avgbfree / 4;
	if (minbfree < 1)
		minbfree = 1;
	cgsize = fs->fs_fsize * fs->fs_fpg;
	/* Expected size of a directory tree vs. observed per-cg usage. */
	dirsize = fs->fs_avgfilesize * fs->fs_avgfpdir;
	curdirsize = avgndir ? (cgsize - avgbfree * fs->fs_bsize) / avgndir : 0;
	if (dirsize < curdirsize)
		dirsize = curdirsize;
	if (dirsize <= 0)
		maxcontigdirs = 0;		/* dirsize overflowed */
	else
		maxcontigdirs = min((avgbfree * fs->fs_bsize) / dirsize, 255);
	if (fs->fs_avgfpdir > 0)
		maxcontigdirs = min(maxcontigdirs,
		    fs->fs_ipg / fs->fs_avgfpdir);
	if (maxcontigdirs == 0)
		maxcontigdirs = 1;

	/*
	 * Limit number of dirs in one cg and reserve space for
	 * regular files, but only if we have no deficit in
	 * inodes or space.
	 *
	 * We are trying to find a suitable cylinder group nearby
	 * our preferred cylinder group to place a new directory.
	 * We scan from our preferred cylinder group forward looking
	 * for a cylinder group that meets our criterion. If we get
	 * to the final cylinder group and do not find anything,
	 * we start scanning backwards from our preferred cylinder
	 * group. The ideal would be to alternate looking forward
	 * and backward, but that is just too complex to code for
	 * the gain it would get. The most likely place where the
	 * backward scan would take effect is when we start near
	 * the end of the filesystem and do not find anything from
	 * where we are to the end. In that case, scanning backward
	 * will likely find us a suitable cylinder group much closer
	 * to our desired location than if we were to start scanning
	 * forward from the beginning of the filesystem.
	 */
	prefcg = ino_to_cg(fs, pip->i_number);
	for (cg = prefcg; cg < fs->fs_ncg; cg++)
		if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
		    fs->fs_cs(fs, cg).cs_nifree >= minifree &&
		    fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
			if (fs->fs_contigdirs[cg] < maxcontigdirs)
				return ((ino_t)(fs->fs_ipg * cg));
		}
	for (cg = 0; cg < prefcg; cg++)
		if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
		    fs->fs_cs(fs, cg).cs_nifree >= minifree &&
		    fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
			if (fs->fs_contigdirs[cg] < maxcontigdirs)
				return ((ino_t)(fs->fs_ipg * cg));
		}
	/*
	 * This is a backstop when we have deficit in space.
	 */
	for (cg = prefcg; cg < fs->fs_ncg; cg++)
		if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
			return ((ino_t)(fs->fs_ipg * cg));
	for (cg = 0; cg < prefcg; cg++)
		if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
			break;
	return ((ino_t)(fs->fs_ipg * cg));
}

/*
 * Select the desired position for the next block in a file.  The file is
 * logically divided into sections. The first section is composed of the
 * direct blocks. Each additional section contains fs_maxbpg blocks.
 *
 * If no blocks have been allocated in the first section, the policy is to
 * request a block in the same cylinder group as the inode that describes
The first indirect is allocated immediately following the last 1215249782Smckusick * direct block and the data blocks for the first indirect immediately 1216249782Smckusick * follow it. 1217249782Smckusick * 1218249782Smckusick * If no blocks have been allocated in any other section, the indirect 1219249782Smckusick * block(s) are allocated in the same cylinder group as its inode in an 1220249782Smckusick * area reserved immediately following the inode blocks. The policy for 1221249782Smckusick * the data blocks is to place them in a cylinder group with a greater than 1222249782Smckusick * average number of free blocks. An appropriate cylinder group is found 12231541Srgrimes * by using a rotor that sweeps the cylinder groups. When a new group of 12241541Srgrimes * blocks is needed, the sweep begins in the cylinder group following the 12251541Srgrimes * cylinder group from which the previous allocation was made. The sweep 12261541Srgrimes * continues until a cylinder group with greater than the average number 12271541Srgrimes * of free blocks is found. If the allocation is for the first block in an 12281541Srgrimes * indirect block, the information on the previous allocation is unavailable; 12291541Srgrimes * here a best guess is made based upon the logical block number being 12301541Srgrimes * allocated. 12318876Srgrimes * 12321541Srgrimes * If a section is already partially allocated, the policy is to 123398542Smckusick * contiguously allocate fs_maxcontig blocks. The end of one of these 123498542Smckusick * contiguous blocks and the beginning of the next is laid out 123598542Smckusick * contiguously if possible. 
12361541Srgrimes */ 123798542Smckusickufs2_daddr_t 123898542Smckusickffs_blkpref_ufs1(ip, lbn, indx, bap) 12391541Srgrimes struct inode *ip; 124098542Smckusick ufs_lbn_t lbn; 12411541Srgrimes int indx; 124298542Smckusick ufs1_daddr_t *bap; 12431541Srgrimes{ 124496506Sphk struct fs *fs; 1245249782Smckusick u_int cg, inocg; 1246203763Smckusick u_int avgbfree, startcg; 1247249782Smckusick ufs2_daddr_t pref; 12481541Srgrimes 1249249782Smckusick KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap")); 1250140704Sjeff mtx_assert(UFS_MTX(ip->i_ump), MA_OWNED); 12511541Srgrimes fs = ip->i_fs; 1252249782Smckusick /* 1253249782Smckusick * Allocation of indirect blocks is indicated by passing negative 1254249782Smckusick * values in indx: -1 for single indirect, -2 for double indirect, 1255249782Smckusick * -3 for triple indirect. As noted below, we attempt to allocate 1256249782Smckusick * the first indirect inline with the file data. For all later 1257249782Smckusick * indirect blocks, the data is often allocated in other cylinder 1258249782Smckusick * groups. However to speed random file access and to speed up 1259249782Smckusick * fsck, the filesystem reserves the first fs_metaspace blocks 1260249782Smckusick * (typically half of fs_minfree) of the data area of each cylinder 1261249782Smckusick * group to hold these later indirect blocks. 1262249782Smckusick */ 1263249782Smckusick inocg = ino_to_cg(fs, ip->i_number); 1264249782Smckusick if (indx < 0) { 1265249782Smckusick /* 1266249782Smckusick * Our preference for indirect blocks is the zone at the 1267249782Smckusick * beginning of the inode's cylinder group data area that 1268249782Smckusick * we try to reserve for indirect blocks. 1269249782Smckusick */ 1270249782Smckusick pref = cgmeta(fs, inocg); 1271249782Smckusick /* 1272249782Smckusick * If we are allocating the first indirect block, try to 1273249782Smckusick * place it immediately following the last direct block. 
1274249782Smckusick */ 1275249782Smckusick if (indx == -1 && lbn < NDADDR + NINDIR(fs) && 1276249782Smckusick ip->i_din1->di_db[NDADDR - 1] != 0) 1277249782Smckusick pref = ip->i_din1->di_db[NDADDR - 1] + fs->fs_frag; 1278249782Smckusick return (pref); 1279249782Smckusick } 1280249782Smckusick /* 1281249782Smckusick * If we are allocating the first data block in the first indirect 1282249782Smckusick * block and the indirect has been allocated in the data block area, 1283249782Smckusick * try to place it immediately following the indirect block. 1284249782Smckusick */ 1285249782Smckusick if (lbn == NDADDR) { 1286249782Smckusick pref = ip->i_din1->di_ib[0]; 1287249782Smckusick if (pref != 0 && pref >= cgdata(fs, inocg) && 1288249782Smckusick pref < cgbase(fs, inocg + 1)) 1289249782Smckusick return (pref + fs->fs_frag); 1290249782Smckusick } 1291249782Smckusick /* 1292249782Smckusick * If we are at the beginning of a file, or we have already allocated 1293249782Smckusick * the maximum number of blocks per cylinder group, or we do not 1294249782Smckusick * have a block allocated immediately preceeding us, then we need 1295249782Smckusick * to decide where to start allocating new blocks. 1296249782Smckusick */ 12971541Srgrimes if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) { 12981541Srgrimes /* 1299249782Smckusick * If we are allocating a directory data block, we want 1300249782Smckusick * to place it in the metadata area. 1301249782Smckusick */ 1302249782Smckusick if ((ip->i_mode & IFMT) == IFDIR) 1303249782Smckusick return (cgmeta(fs, inocg)); 1304249782Smckusick /* 1305249782Smckusick * Until we fill all the direct and all the first indirect's 1306249782Smckusick * blocks, we try to allocate in the data area of the inode's 1307249782Smckusick * cylinder group. 
1308249782Smckusick */ 1309249782Smckusick if (lbn < NDADDR + NINDIR(fs)) 1310249782Smckusick return (cgdata(fs, inocg)); 1311249782Smckusick /* 13121541Srgrimes * Find a cylinder with greater than average number of 13131541Srgrimes * unused data blocks. 13141541Srgrimes */ 13151541Srgrimes if (indx == 0 || bap[indx - 1] == 0) 1316249782Smckusick startcg = inocg + lbn / fs->fs_maxbpg; 13171541Srgrimes else 13181541Srgrimes startcg = dtog(fs, bap[indx - 1]) + 1; 13191541Srgrimes startcg %= fs->fs_ncg; 13201541Srgrimes avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; 13211541Srgrimes for (cg = startcg; cg < fs->fs_ncg; cg++) 13221541Srgrimes if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 13231541Srgrimes fs->fs_cgrotor = cg; 1324249782Smckusick return (cgdata(fs, cg)); 13251541Srgrimes } 13261541Srgrimes for (cg = 0; cg <= startcg; cg++) 13271541Srgrimes if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 13281541Srgrimes fs->fs_cgrotor = cg; 1329249782Smckusick return (cgdata(fs, cg)); 13301541Srgrimes } 133117108Sbde return (0); 13321541Srgrimes } 13331541Srgrimes /* 1334249782Smckusick * Otherwise, we just always try to lay things out contiguously. 
13351541Srgrimes */ 133698542Smckusick return (bap[indx - 1] + fs->fs_frag); 133798542Smckusick} 133898542Smckusick 133998542Smckusick/* 134098542Smckusick * Same as above, but for UFS2 134198542Smckusick */ 134298542Smckusickufs2_daddr_t 134398542Smckusickffs_blkpref_ufs2(ip, lbn, indx, bap) 134498542Smckusick struct inode *ip; 134598542Smckusick ufs_lbn_t lbn; 134698542Smckusick int indx; 134798542Smckusick ufs2_daddr_t *bap; 134898542Smckusick{ 134998542Smckusick struct fs *fs; 1350249782Smckusick u_int cg, inocg; 1351203763Smckusick u_int avgbfree, startcg; 1352249782Smckusick ufs2_daddr_t pref; 135398542Smckusick 1354249782Smckusick KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap")); 1355140704Sjeff mtx_assert(UFS_MTX(ip->i_ump), MA_OWNED); 135698542Smckusick fs = ip->i_fs; 1357249782Smckusick /* 1358249782Smckusick * Allocation of indirect blocks is indicated by passing negative 1359249782Smckusick * values in indx: -1 for single indirect, -2 for double indirect, 1360249782Smckusick * -3 for triple indirect. As noted below, we attempt to allocate 1361249782Smckusick * the first indirect inline with the file data. For all later 1362249782Smckusick * indirect blocks, the data is often allocated in other cylinder 1363249782Smckusick * groups. However to speed random file access and to speed up 1364249782Smckusick * fsck, the filesystem reserves the first fs_metaspace blocks 1365249782Smckusick * (typically half of fs_minfree) of the data area of each cylinder 1366249782Smckusick * group to hold these later indirect blocks. 1367249782Smckusick */ 1368249782Smckusick inocg = ino_to_cg(fs, ip->i_number); 1369249782Smckusick if (indx < 0) { 1370249782Smckusick /* 1371249782Smckusick * Our preference for indirect blocks is the zone at the 1372249782Smckusick * beginning of the inode's cylinder group data area that 1373249782Smckusick * we try to reserve for indirect blocks. 
1374249782Smckusick */ 1375249782Smckusick pref = cgmeta(fs, inocg); 1376249782Smckusick /* 1377249782Smckusick * If we are allocating the first indirect block, try to 1378249782Smckusick * place it immediately following the last direct block. 1379249782Smckusick */ 1380249782Smckusick if (indx == -1 && lbn < NDADDR + NINDIR(fs) && 1381249782Smckusick ip->i_din2->di_db[NDADDR - 1] != 0) 1382249782Smckusick pref = ip->i_din2->di_db[NDADDR - 1] + fs->fs_frag; 1383249782Smckusick return (pref); 1384249782Smckusick } 1385249782Smckusick /* 1386249782Smckusick * If we are allocating the first data block in the first indirect 1387249782Smckusick * block and the indirect has been allocated in the data block area, 1388249782Smckusick * try to place it immediately following the indirect block. 1389249782Smckusick */ 1390249782Smckusick if (lbn == NDADDR) { 1391249782Smckusick pref = ip->i_din2->di_ib[0]; 1392249782Smckusick if (pref != 0 && pref >= cgdata(fs, inocg) && 1393249782Smckusick pref < cgbase(fs, inocg + 1)) 1394249782Smckusick return (pref + fs->fs_frag); 1395249782Smckusick } 1396249782Smckusick /* 1397249782Smckusick * If we are at the beginning of a file, or we have already allocated 1398249782Smckusick * the maximum number of blocks per cylinder group, or we do not 1399249782Smckusick * have a block allocated immediately preceeding us, then we need 1400249782Smckusick * to decide where to start allocating new blocks. 1401249782Smckusick */ 140298542Smckusick if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) { 140398542Smckusick /* 1404249782Smckusick * If we are allocating a directory data block, we want 1405249782Smckusick * to place it in the metadata area. 
1406249782Smckusick */ 1407249782Smckusick if ((ip->i_mode & IFMT) == IFDIR) 1408249782Smckusick return (cgmeta(fs, inocg)); 1409249782Smckusick /* 1410249782Smckusick * Until we fill all the direct and all the first indirect's 1411249782Smckusick * blocks, we try to allocate in the data area of the inode's 1412249782Smckusick * cylinder group. 1413249782Smckusick */ 1414249782Smckusick if (lbn < NDADDR + NINDIR(fs)) 1415249782Smckusick return (cgdata(fs, inocg)); 1416249782Smckusick /* 141798542Smckusick * Find a cylinder with greater than average number of 141898542Smckusick * unused data blocks. 141998542Smckusick */ 142098542Smckusick if (indx == 0 || bap[indx - 1] == 0) 1421249782Smckusick startcg = inocg + lbn / fs->fs_maxbpg; 142298542Smckusick else 142398542Smckusick startcg = dtog(fs, bap[indx - 1]) + 1; 142498542Smckusick startcg %= fs->fs_ncg; 142598542Smckusick avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; 142698542Smckusick for (cg = startcg; cg < fs->fs_ncg; cg++) 142798542Smckusick if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 142898542Smckusick fs->fs_cgrotor = cg; 1429249782Smckusick return (cgdata(fs, cg)); 143098542Smckusick } 143198542Smckusick for (cg = 0; cg <= startcg; cg++) 143298542Smckusick if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 143398542Smckusick fs->fs_cgrotor = cg; 1434249782Smckusick return (cgdata(fs, cg)); 143598542Smckusick } 143698542Smckusick return (0); 143798542Smckusick } 143810632Sdg /* 1439249782Smckusick * Otherwise, we just always try to lay things out contiguously. 144010632Sdg */ 144198542Smckusick return (bap[indx - 1] + fs->fs_frag); 14421541Srgrimes} 14431541Srgrimes 14441541Srgrimes/* 14451541Srgrimes * Implement the cylinder overflow algorithm. 14461541Srgrimes * 14471541Srgrimes * The policy implemented by this algorithm is: 14481541Srgrimes * 1) allocate the block in its requested cylinder group. 14491541Srgrimes * 2) quadradically rehash on the cylinder group number. 
 *   3) brute force search for a free block.
 *
 * Must be called with the UFS lock held.  Will release the lock on success
 * and return with it held on failure.
 */
/*VARARGS5*/
static ufs2_daddr_t
ffs_hashalloc(ip, cg, pref, size, rsize, allocator)
	struct inode *ip;
	u_int cg;		/* preferred cylinder group to try first */
	ufs2_daddr_t pref;	/* preferred location within that group */
	int size;	/* Search size for data blocks, mode for inodes */
	int rsize;	/* Real allocated size. */
	allocfcn_t *allocator;	/* per-object allocator (block/frag/inode) */
{
	struct fs *fs;
	ufs2_daddr_t result;
	u_int i, icg = cg;

	mtx_assert(UFS_MTX(ip->i_ump), MA_OWNED);
#ifdef INVARIANTS
	if (ITOV(ip)->v_mount->mnt_kern_flag & MNTK_SUSPENDED)
		panic("ffs_hashalloc: allocation on suspended filesystem");
#endif
	fs = ip->i_fs;
	/*
	 * 1: preferred cylinder group
	 */
	result = (*allocator)(ip, cg, pref, size, rsize);
	if (result)
		return (result);
	/*
	 * 2: quadratic rehash
	 */
	for (i = 1; i < fs->fs_ncg; i *= 2) {
		cg += i;
		if (cg >= fs->fs_ncg)
			cg -= fs->fs_ncg;
		result = (*allocator)(ip, cg, 0, size, rsize);
		if (result)
			return (result);
	}
	/*
	 * 3: brute force search
	 * Note that we start at i == 2, since 0 was checked initially,
	 * and 1 is always checked in the quadratic rehash.
	 */
	cg = (icg + 2) % fs->fs_ncg;
	for (i = 2; i < fs->fs_ncg; i++) {
		result = (*allocator)(ip, cg, 0, size, rsize);
		if (result)
			return (result);
		cg++;
		if (cg == fs->fs_ncg)
			cg = 0;
	}
	/* Nothing available anywhere; UFS lock is still held. */
	return (0);
}

/*
 * Determine whether a fragment can be extended.
 *
 * Check to see if the necessary fragments are available, and
 * if they are, allocate them.
 *
 * Entered with the UFS lock held; the lock is dropped around the cylinder
 * group I/O and, on success, the function returns with the lock released
 * (matching the ffs_hashalloc() contract).  On any failure it returns 0
 * with the lock re-acquired.
 */
static ufs2_daddr_t
ffs_fragextend(ip, cg, bprev, osize, nsize)
	struct inode *ip;
	u_int cg;		/* cylinder group containing bprev */
	ufs2_daddr_t bprev;	/* existing fragment to extend in place */
	int osize, nsize;	/* old and desired sizes, in bytes */
{
	struct fs *fs;
	struct cg *cgp;
	struct buf *bp;
	struct ufsmount *ump;
	int nffree;
	long bno;
	int frags, bbase;
	int i, error;
	u_int8_t *blksfree;

	ump = ip->i_ump;
	fs = ip->i_fs;
	/* Quick reject using the in-core summary before doing any I/O. */
	if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize))
		return (0);
	frags = numfrags(fs, nsize);
	bbase = fragnum(fs, bprev);
	if (bbase > fragnum(fs, (bprev + frags - 1))) {
		/* cannot extend across a block boundary */
		return (0);
	}
	UFS_UNLOCK(ump);
	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
	    (int)fs->fs_cgsize, NOCRED, &bp);
	if (error)
		goto fail;
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp))
		goto fail;
	bp->b_xflags |= BX_BKGRDWRITE;
	cgp->cg_old_time = cgp->cg_time = time_second;
	bno = dtogd(fs, bprev);
	blksfree = cg_blksfree(cgp);
	/* All of the frags that would be gained must currently be free. */
	for (i = numfrags(fs, osize); i < frags; i++)
		if (isclr(blksfree, bno + i))
			goto fail;
	/*
	 * the current fragment can be extended
	 * deduct the count on fragment being extended into
	 * increase the count on the remaining fragment (if any)
	 * allocate the extended piece
	 */
	for (i = frags; i < fs->fs_frag - bbase; i++)
		if (isclr(blksfree, bno + i))
			break;
	cgp->cg_frsum[i - numfrags(fs, osize)]--;
	if (i != frags)
		cgp->cg_frsum[i - frags]++;
	for (i = numfrags(fs, osize), nffree = 0; i < frags; i++) {
		clrbit(blksfree, bno + i);
		cgp->cg_cs.cs_nffree--;
		nffree++;
	}
	/* Fold the per-cg changes into the filesystem-wide summaries. */
	UFS_LOCK(ump);
	fs->fs_cstotal.cs_nffree -= nffree;
	fs->fs_cs(fs, cg).cs_nffree -= nffree;
	fs->fs_fmod = 1;
	ACTIVECLEAR(fs, cg);
	UFS_UNLOCK(ump);
	if (DOINGSOFTDEP(ITOV(ip)))
		softdep_setup_blkmapdep(bp, UFSTOVFS(ump), bprev,
		    frags, numfrags(fs, osize));
	bdwrite(bp);
	return (bprev);

fail:
	brelse(bp);
	UFS_LOCK(ump);
	return (0);
}

/*
 * Determine whether a block can be allocated.
 *
 * Check to see if a block of the appropriate size is available,
 * and if it is, allocate it.
 *
 * Entered with the UFS lock held; drops it around the cylinder group I/O.
 * On success returns the allocated filesystem block address with the lock
 * released; on failure returns 0 with the lock re-acquired (the
 * ffs_hashalloc() contract).
 */
static ufs2_daddr_t
ffs_alloccg(ip, cg, bpref, size, rsize)
	struct inode *ip;
	u_int cg;		/* cylinder group to allocate from */
	ufs2_daddr_t bpref;	/* preferred location, 0 for "don't care" */
	int size;		/* size to search for, in bytes */
	int rsize;		/* size actually to be allocated */
{
	struct fs *fs;
	struct cg *cgp;
	struct buf *bp;
	struct ufsmount *ump;
	ufs1_daddr_t bno;
	ufs2_daddr_t blkno;
	int i, allocsiz, error, frags;
	u_int8_t *blksfree;

	ump = ip->i_ump;
	fs = ip->i_fs;
	/* Full-block request with no free blocks in this cg: reject early. */
	if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize)
		return (0);
	UFS_UNLOCK(ump);
	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
	    (int)fs->fs_cgsize, NOCRED, &bp);
	if (error)
		goto fail;
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp) ||
	    (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize))
		goto fail;
	bp->b_xflags |= BX_BKGRDWRITE;
	cgp->cg_old_time = cgp->cg_time = time_second;
	if (size == fs->fs_bsize) {
		/* Whole-block request: hand off to the block allocator. */
		UFS_LOCK(ump);
		blkno = ffs_alloccgblk(ip, bp, bpref, rsize);
		ACTIVECLEAR(fs, cg);
		UFS_UNLOCK(ump);
		bdwrite(bp);
		return (blkno);
	}
	/*
	 * check to see if any fragments are already available
	 * allocsiz is the size which will be allocated, hacking
	 * it down to a smaller size if necessary
	 */
	blksfree = cg_blksfree(cgp);
	frags = numfrags(fs, size);
	for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++)
		if (cgp->cg_frsum[allocsiz] != 0)
			break;
	if (allocsiz == fs->fs_frag) {
		/*
		 * no fragments were available, so a block will be
		 * allocated, and hacked up
		 */
		if (cgp->cg_cs.cs_nbfree == 0)
			goto fail;
		UFS_LOCK(ump);
		blkno = ffs_alloccgblk(ip, bp, bpref, rsize);
		ACTIVECLEAR(fs, cg);
		UFS_UNLOCK(ump);
		bdwrite(bp);
		return (blkno);
	}
	KASSERT(size == rsize,
	    ("ffs_alloccg: size(%d) != rsize(%d)", size, rsize));
	bno = ffs_mapsearch(fs, cgp, bpref, allocsiz);
	if (bno < 0)
		goto fail;
	/* Claim the frags and update per-cg frag accounting. */
	for (i = 0; i < frags; i++)
		clrbit(blksfree, bno + i);
	cgp->cg_cs.cs_nffree -= frags;
	cgp->cg_frsum[allocsiz]--;
	if (frags != allocsiz)
		cgp->cg_frsum[allocsiz - frags]++;
	UFS_LOCK(ump);
	fs->fs_cstotal.cs_nffree -= frags;
	fs->fs_cs(fs, cg).cs_nffree -= frags;
	fs->fs_fmod = 1;
	blkno = cgbase(fs, cg) + bno;
	ACTIVECLEAR(fs, cg);
	UFS_UNLOCK(ump);
	if (DOINGSOFTDEP(ITOV(ip)))
		softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, frags, 0);
	bdwrite(bp);
	return (blkno);

fail:
	brelse(bp);
	UFS_LOCK(ump);
	return (0);
}

/*
 * Allocate a block in a cylinder group.
 *
 * This algorithm implements the following policy:
 *   1) allocate the requested block.
 *   2) allocate a rotationally optimal block in the same cylinder.
 *   3) allocate the next available block on the block rotor for the
 *      specified cylinder group.
 * Note that this routine only allocates fs_bsize blocks; these
 * blocks may be fragmented by the routine that allocates them.
 *
 * Called with the UFS lock held (asserted) and with the cylinder group
 * buffer "bp" already read in and locked by the caller; returns the
 * allocated filesystem block address or 0 if nothing is available.
 */
static ufs2_daddr_t
ffs_alloccgblk(ip, bp, bpref, size)
	struct inode *ip;
	struct buf *bp;		/* locked buffer holding the cg map */
	ufs2_daddr_t bpref;	/* preferred location, 0 to use the rotor */
	int size;		/* bytes wanted; frags beyond this are freed back */
{
	struct fs *fs;
	struct cg *cgp;
	struct ufsmount *ump;
	ufs1_daddr_t bno;
	ufs2_daddr_t blkno;
	u_int8_t *blksfree;
	int i, cgbpref;

	fs = ip->i_fs;
	ump = ip->i_ump;
	mtx_assert(UFS_MTX(ump), MA_OWNED);
	cgp = (struct cg *)bp->b_data;
	blksfree = cg_blksfree(cgp);
	if (bpref == 0) {
		/* No preference: continue after this cg's rotor. */
		bpref = cgbase(fs, cgp->cg_cgx) + cgp->cg_rotor + fs->fs_frag;
	} else if ((cgbpref = dtog(fs, bpref)) != cgp->cg_cgx) {
		/* map bpref to correct zone in this cg */
		if (bpref < cgdata(fs, cgbpref))
			bpref = cgmeta(fs, cgp->cg_cgx);
		else
			bpref = cgdata(fs, cgp->cg_cgx);
	}
	/*
	 * if the requested block is available, use it
	 */
	bno = dtogd(fs, blknum(fs, bpref));
	if (ffs_isblock(fs, blksfree, fragstoblks(fs, bno)))
		goto gotit;
	/*
	 * Take the next available block in this cylinder group.
	 */
	bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag);
	if (bno < 0)
		return (0);
	/* Update cg_rotor only if allocated from the data zone */
	if (bno >= dtogd(fs, cgdata(fs, cgp->cg_cgx)))
		cgp->cg_rotor = bno;
gotit:
	blkno = fragstoblks(fs, bno);
	ffs_clrblock(fs, blksfree, (long)blkno);
	ffs_clusteracct(fs, cgp, blkno, -1);
	cgp->cg_cs.cs_nbfree--;
	fs->fs_cstotal.cs_nbfree--;
	fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--;
	fs->fs_fmod = 1;
	blkno = cgbase(fs, cgp->cg_cgx) + bno;
	/*
	 * If the caller didn't want the whole block free the frags here.
	 */
	size = numfrags(fs, size);
	if (size != fs->fs_frag) {
		bno = dtogd(fs, blkno);
		for (i = size; i < fs->fs_frag; i++)
			setbit(blksfree, bno + i);
		i = fs->fs_frag - size;
		cgp->cg_cs.cs_nffree += i;
		fs->fs_cstotal.cs_nffree += i;
		fs->fs_cs(fs, cgp->cg_cgx).cs_nffree += i;
		fs->fs_fmod = 1;
		cgp->cg_frsum[i]++;
	}
	/* XXX Fixme. */
	UFS_UNLOCK(ump);
	if (DOINGSOFTDEP(ITOV(ip)))
		softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno,
		    size, 0);
	UFS_LOCK(ump);
	return (blkno);
}

/*
 * Determine whether a cluster can be allocated.
 *
 * We do not currently check for optimal rotational layout if there
 * are multiple choices in the same cylinder group.
 * Instead we just take the first one that we find following bpref.
 *
 * Entered with the UFS lock held; drops it for the cylinder group I/O.
 * On success returns the first block of the cluster with the lock
 * released; on failure returns 0 with the lock re-acquired.
 */
static ufs2_daddr_t
ffs_clusteralloc(ip, cg, bpref, len, unused)
	struct inode *ip;
	u_int cg;		/* cylinder group to search */
	ufs2_daddr_t bpref;	/* preferred starting location */
	int len;		/* cluster length in filesystem blocks */
	int unused;		/* unused; keeps allocfcn_t signature */
{
	struct fs *fs;
	struct cg *cgp;
	struct buf *bp;
	struct ufsmount *ump;
	int i, run, bit, map, got;
	ufs2_daddr_t bno;
	u_char *mapp;
	int32_t *lp;
	u_int8_t *blksfree;

	fs = ip->i_fs;
	ump = ip->i_ump;
	/* Cached per-cg maximum cluster size lets us skip the I/O. */
	if (fs->fs_maxcluster[cg] < len)
		return (0);
	UFS_UNLOCK(ump);
	if (bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize,
	    NOCRED, &bp))
		goto fail_lock;
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp))
		goto fail_lock;
	bp->b_xflags |= BX_BKGRDWRITE;
	/*
	 * Check to see if a cluster of the needed size (or bigger) is
	 * available in this cylinder group.
	 */
	lp = &cg_clustersum(cgp)[len];
	for (i = len; i <= fs->fs_contigsumsize; i++)
		if (*lp++ > 0)
			break;
	if (i > fs->fs_contigsumsize) {
		/*
		 * This is the first time looking for a cluster in this
		 * cylinder group. Update the cluster summary information
		 * to reflect the true maximum sized cluster so that
		 * future cluster allocation requests can avoid reading
		 * the cylinder group map only to find no clusters.
		 */
		lp = &cg_clustersum(cgp)[len - 1];
		for (i = len - 1; i > 0; i--)
			if (*lp-- > 0)
				break;
		UFS_LOCK(ump);
		fs->fs_maxcluster[cg] = i;
		goto fail;
	}
	/*
	 * Search the cluster map to find a big enough cluster.
	 * We take the first one that we find, even if it is larger
	 * than we need as we prefer to get one close to the previous
	 * block allocation. We do not search before the current
	 * preference point as we do not want to allocate a block
	 * that is allocated before the previous one (as we will
	 * then have to wait for another pass of the elevator
	 * algorithm before it will be read). We prefer to fail and
	 * be recalled to try an allocation in the next cylinder group.
	 */
	if (dtog(fs, bpref) != cg)
		bpref = cgdata(fs, cg);
	else
		bpref = blknum(fs, bpref);
	bpref = fragstoblks(fs, dtogd(fs, bpref));
	mapp = &cg_clustersfree(cgp)[bpref / NBBY];
	map = *mapp++;
	bit = 1 << (bpref % NBBY);
	/* Walk the free-cluster bitmap counting consecutive set bits. */
	for (run = 0, got = bpref; got < cgp->cg_nclusterblks; got++) {
		if ((map & bit) == 0) {
			run = 0;
		} else {
			run++;
			if (run == len)
				break;
		}
		if ((got & (NBBY - 1)) != (NBBY - 1)) {
			bit <<= 1;
		} else {
			map = *mapp++;
			bit = 1;
		}
	}
	if (got >= cgp->cg_nclusterblks)
		goto fail_lock;
	/*
	 * Allocate the cluster that we have found.
	 */
	blksfree = cg_blksfree(cgp);
	for (i = 1; i <= len; i++)
		if (!ffs_isblock(fs, blksfree, got - run + i))
			panic("ffs_clusteralloc: map mismatch");
	bno = cgbase(fs, cg) + blkstofrags(fs, got - run + 1);
	if (dtog(fs, bno) != cg)
		panic("ffs_clusteralloc: allocated out of group");
	len = blkstofrags(fs, len);
	UFS_LOCK(ump);
	/* Claim each block of the cluster via the block allocator. */
	for (i = 0; i < len; i += fs->fs_frag)
		if (ffs_alloccgblk(ip, bp, bno + i, fs->fs_bsize) != bno + i)
			panic("ffs_clusteralloc: lost block");
	ACTIVECLEAR(fs, cg);
	UFS_UNLOCK(ump);
	bdwrite(bp);
	return (bno);

fail_lock:
	UFS_LOCK(ump);
fail:
	brelse(bp);
	return (0);
}

/*
 * Get (without reading) the buffer holding the inode block that contains
 * inode block "cginoblk" of cylinder group "cg"; gbflags is passed
 * through to getblk() (e.g. GB_LOCK_NOWAIT).
 */
static inline struct buf *
getinobuf(struct inode *ip, u_int cg, u_int32_t cginoblk, int gbflags)
{
	struct fs *fs;

	fs = ip->i_fs;
	return (getblk(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs,
	    cg * fs->fs_ipg + cginoblk)), (int)fs->fs_bsize, 0, 0,
	    gbflags));
}

/*
 * Determine whether an inode can be allocated.
 *
 * Check to see if an inode is available, and if it is,
 * allocate it using the following policy:
 *   1) allocate the requested inode.
 *   2) allocate the next available inode after the requested
 *      inode in the specified cylinder group.
19231541Srgrimes */ 192498640Smckusickstatic ufs2_daddr_t 1925207141Sjeffffs_nodealloccg(ip, cg, ipref, mode, unused) 19261541Srgrimes struct inode *ip; 1927203763Smckusick u_int cg; 192898542Smckusick ufs2_daddr_t ipref; 19291541Srgrimes int mode; 1930207141Sjeff int unused; 19311541Srgrimes{ 193296506Sphk struct fs *fs; 193396506Sphk struct cg *cgp; 193498542Smckusick struct buf *bp, *ibp; 1935140704Sjeff struct ufsmount *ump; 193658087Smckusick u_int8_t *inosused; 193798542Smckusick struct ufs2_dinode *dp2; 19381541Srgrimes int error, start, len, loc, map, i; 1939248667Skib u_int32_t old_initediblk; 19401541Srgrimes 19411541Srgrimes fs = ip->i_fs; 1942140704Sjeff ump = ip->i_ump; 1943248667Skibcheck_nifree: 19441541Srgrimes if (fs->fs_cs(fs, cg).cs_nifree == 0) 194517108Sbde return (0); 1946140704Sjeff UFS_UNLOCK(ump); 19471541Srgrimes error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), 19481541Srgrimes (int)fs->fs_cgsize, NOCRED, &bp); 19491541Srgrimes if (error) { 19501541Srgrimes brelse(bp); 1951140704Sjeff UFS_LOCK(ump); 195217108Sbde return (0); 19531541Srgrimes } 19541541Srgrimes cgp = (struct cg *)bp->b_data; 1955248667Skibrestart: 19561541Srgrimes if (!cg_chkmagic(cgp) || cgp->cg_cs.cs_nifree == 0) { 19571541Srgrimes brelse(bp); 1958140704Sjeff UFS_LOCK(ump); 195917108Sbde return (0); 19601541Srgrimes } 196155697Smckusick bp->b_xflags |= BX_BKGRDWRITE; 196258087Smckusick inosused = cg_inosused(cgp); 19631541Srgrimes if (ipref) { 19641541Srgrimes ipref %= fs->fs_ipg; 196558087Smckusick if (isclr(inosused, ipref)) 19661541Srgrimes goto gotit; 19671541Srgrimes } 19681541Srgrimes start = cgp->cg_irotor / NBBY; 19691541Srgrimes len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY); 197058087Smckusick loc = skpc(0xff, len, &inosused[start]); 19711541Srgrimes if (loc == 0) { 19721541Srgrimes len = start + 1; 19731541Srgrimes start = 0; 197458087Smckusick loc = skpc(0xff, len, &inosused[0]); 19751541Srgrimes if (loc == 0) { 19766357Sphk printf("cg = %d, irotor = 
%ld, fs = %s\n", 197737555Sbde cg, (long)cgp->cg_irotor, fs->fs_fsmnt); 19781541Srgrimes panic("ffs_nodealloccg: map corrupted"); 19791541Srgrimes /* NOTREACHED */ 19801541Srgrimes } 19811541Srgrimes } 19821541Srgrimes i = start + len - loc; 1983219276Sjhb map = inosused[i] ^ 0xff; 1984219276Sjhb if (map == 0) { 1985219276Sjhb printf("fs = %s\n", fs->fs_fsmnt); 1986219276Sjhb panic("ffs_nodealloccg: block not in map"); 19871541Srgrimes } 1988219276Sjhb ipref = i * NBBY + ffs(map) - 1; 19891541Srgrimesgotit: 199098542Smckusick /* 199198542Smckusick * Check to see if we need to initialize more inodes. 199298542Smckusick */ 199398542Smckusick if (fs->fs_magic == FS_UFS2_MAGIC && 199498542Smckusick ipref + INOPB(fs) > cgp->cg_initediblk && 199598542Smckusick cgp->cg_initediblk < cgp->cg_niblk) { 1996248667Skib old_initediblk = cgp->cg_initediblk; 1997248667Skib 1998248667Skib /* 1999248667Skib * Free the cylinder group lock before writing the 2000248667Skib * initialized inode block. Entering the 2001248667Skib * babarrierwrite() with the cylinder group lock 2002248667Skib * causes lock order violation between the lock and 2003248667Skib * snaplk. 2004248667Skib * 2005248667Skib * Another thread can decide to initialize the same 2006248667Skib * inode block, but whichever thread first gets the 2007248667Skib * cylinder group lock after writing the newly 2008248667Skib * allocated inode block will update it and the other 2009248667Skib * will realize that it has lost and leave the 2010248667Skib * cylinder group unchanged. 2011248667Skib */ 2012248667Skib ibp = getinobuf(ip, cg, old_initediblk, GB_LOCK_NOWAIT); 2013248667Skib brelse(bp); 2014248667Skib if (ibp == NULL) { 2015248667Skib /* 2016248667Skib * The inode block buffer is already owned by 2017248667Skib * another thread, which must initialize it. 2018248667Skib * Wait on the buffer to allow another thread 2019248667Skib * to finish the updates, with dropped cg 2020248667Skib * buffer lock, then retry. 
2021248667Skib */ 2022248667Skib ibp = getinobuf(ip, cg, old_initediblk, 0); 2023248667Skib brelse(ibp); 2024248667Skib UFS_LOCK(ump); 2025248667Skib goto check_nifree; 2026248667Skib } 202798542Smckusick bzero(ibp->b_data, (int)fs->fs_bsize); 202898542Smckusick dp2 = (struct ufs2_dinode *)(ibp->b_data); 202998542Smckusick for (i = 0; i < INOPB(fs); i++) { 2030110885Smckusick dp2->di_gen = arc4random() / 2 + 1; 203198542Smckusick dp2++; 203298542Smckusick } 2033248665Smckusick /* 2034248665Smckusick * Rather than adding a soft updates dependency to ensure 2035248665Smckusick * that the new inode block is written before it is claimed 2036248665Smckusick * by the cylinder group map, we just do a barrier write 2037248665Smckusick * here. The barrier write will ensure that the inode block 2038248665Smckusick * gets written before the updated cylinder group map can be 2039248665Smckusick * written. The barrier write should only slow down bulk 2040248665Smckusick * loading of newly created filesystems. 2041248665Smckusick */ 2042248665Smckusick babarrierwrite(ibp); 2043248667Skib 2044248667Skib /* 2045248667Skib * After the inode block is written, try to update the 2046248667Skib * cg initediblk pointer. If another thread beat us 2047248667Skib * to it, then leave it unchanged as the other thread 2048248667Skib * has already set it correctly. 
2049248667Skib */ 2050248667Skib error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), 2051248667Skib (int)fs->fs_cgsize, NOCRED, &bp); 2052248667Skib UFS_LOCK(ump); 2053248667Skib ACTIVECLEAR(fs, cg); 2054248667Skib UFS_UNLOCK(ump); 2055248667Skib if (error != 0) { 2056248667Skib brelse(bp); 2057248667Skib return (error); 2058248667Skib } 2059248667Skib cgp = (struct cg *)bp->b_data; 2060248667Skib if (cgp->cg_initediblk == old_initediblk) 2061248667Skib cgp->cg_initediblk += INOPB(fs); 2062248667Skib goto restart; 206398542Smckusick } 2064248667Skib cgp->cg_old_time = cgp->cg_time = time_second; 2065248667Skib cgp->cg_irotor = ipref; 2066140704Sjeff UFS_LOCK(ump); 2067140704Sjeff ACTIVECLEAR(fs, cg); 2068140704Sjeff setbit(inosused, ipref); 2069140704Sjeff cgp->cg_cs.cs_nifree--; 2070140704Sjeff fs->fs_cstotal.cs_nifree--; 2071140704Sjeff fs->fs_cs(fs, cg).cs_nifree--; 2072140704Sjeff fs->fs_fmod = 1; 2073140704Sjeff if ((mode & IFMT) == IFDIR) { 2074140704Sjeff cgp->cg_cs.cs_ndir++; 2075140704Sjeff fs->fs_cstotal.cs_ndir++; 2076140704Sjeff fs->fs_cs(fs, cg).cs_ndir++; 2077140704Sjeff } 2078140704Sjeff UFS_UNLOCK(ump); 2079140704Sjeff if (DOINGSOFTDEP(ITOV(ip))) 2080223325Sjeff softdep_setup_inomapdep(bp, ip, cg * fs->fs_ipg + ipref, mode); 20811541Srgrimes bdwrite(bp); 2082203763Smckusick return ((ino_t)(cg * fs->fs_ipg + ipref)); 20831541Srgrimes} 20841541Srgrimes 20851541Srgrimes/* 20861541Srgrimes * Free a block or fragment. 20871541Srgrimes * 20881541Srgrimes * The specified block or fragment is placed back in the 20898876Srgrimes * free map. If a fragment is deallocated, a possible 20901541Srgrimes * block reassembly is checked. 
 *
 * "devvp" is either the disk device vnode (normal case) or a snapshot
 * file vnode (VREG) when freeing a block that a snapshot has copied.
 * Does not require the UFS lock on entry; it is taken internally while
 * updating the in-core summary counters.
 */
static void
ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd)
	struct ufsmount *ump;
	struct fs *fs;
	struct vnode *devvp;	/* disk device or snapshot vnode */
	ufs2_daddr_t bno;	/* filesystem block address being freed */
	long size;		/* size in bytes (block or fragment run) */
	ino_t inum;		/* inode the block belonged to (diagnostics) */
	struct workhead *dephd;	/* softdep work list, may be NULL */
{
	struct mount *mp;
	struct cg *cgp;
	struct buf *bp;
	ufs1_daddr_t fragno, cgbno;
	ufs2_daddr_t cgblkno;
	int i, blk, frags, bbase;
	u_int cg;
	u_int8_t *blksfree;
	struct cdev *dev;

	cg = dtog(fs, bno);
	if (devvp->v_type == VREG) {
		/* devvp is a snapshot */
		dev = VTOI(devvp)->i_devvp->v_rdev;
		cgblkno = fragstoblks(fs, cgtod(fs, cg));
	} else {
		/* devvp is a normal disk device */
		dev = devvp->v_rdev;
		cgblkno = fsbtodb(fs, cgtod(fs, cg));
		ASSERT_VOP_LOCKED(devvp, "ffs_blkfree_cg");
	}
#ifdef INVARIANTS
	if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0 ||
	    fragnum(fs, bno) + numfrags(fs, size) > fs->fs_frag) {
		printf("dev=%s, bno = %jd, bsize = %ld, size = %ld, fs = %s\n",
		    devtoname(dev), (intmax_t)bno, (long)fs->fs_bsize,
		    size, fs->fs_fsmnt);
		panic("ffs_blkfree_cg: bad size");
	}
#endif
	if ((u_int)bno >= fs->fs_size) {
		printf("bad block %jd, ino %lu\n", (intmax_t)bno,
		    (u_long)inum);
		ffs_fserr(fs, inum, "bad block");
		return;
	}
	if (bread(devvp, cgblkno, (int)fs->fs_cgsize, NOCRED, &bp)) {
		brelse(bp);
		return;
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp)) {
		brelse(bp);
		return;
	}
	bp->b_xflags |= BX_BKGRDWRITE;
	cgp->cg_old_time = cgp->cg_time = time_second;
	cgbno = dtogd(fs, bno);
	blksfree = cg_blksfree(cgp);
	UFS_LOCK(ump);
	if (size == fs->fs_bsize) {
		/* Freeing a full block. */
		fragno = fragstoblks(fs, cgbno);
		if (!ffs_isfreeblock(fs, blksfree, fragno)) {
			if (devvp->v_type == VREG) {
				UFS_UNLOCK(ump);
				/* devvp is a snapshot */
				brelse(bp);
				return;
			}
			printf("dev = %s, block = %jd, fs = %s\n",
			    devtoname(dev), (intmax_t)bno, fs->fs_fsmnt);
			panic("ffs_blkfree_cg: freeing free block");
		}
		ffs_setblock(fs, blksfree, fragno);
		ffs_clusteracct(fs, cgp, fragno, 1);
		cgp->cg_cs.cs_nbfree++;
		fs->fs_cstotal.cs_nbfree++;
		fs->fs_cs(fs, cg).cs_nbfree++;
	} else {
		bbase = cgbno - fragnum(fs, cgbno);
		/*
		 * decrement the counts associated with the old frags
		 */
		blk = blkmap(fs, blksfree, bbase);
		ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
		/*
		 * deallocate the fragment
		 */
		frags = numfrags(fs, size);
		for (i = 0; i < frags; i++) {
			if (isset(blksfree, cgbno + i)) {
				printf("dev = %s, block = %jd, fs = %s\n",
				    devtoname(dev), (intmax_t)(bno + i),
				    fs->fs_fsmnt);
				panic("ffs_blkfree_cg: freeing free frag");
			}
			setbit(blksfree, cgbno + i);
		}
		cgp->cg_cs.cs_nffree += i;
		fs->fs_cstotal.cs_nffree += i;
		fs->fs_cs(fs, cg).cs_nffree += i;
		/*
		 * add back in counts associated with the new frags
		 */
		blk = blkmap(fs, blksfree, bbase);
		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
		/*
		 * if a complete block has been reassembled, account for it
		 */
		fragno = fragstoblks(fs, bbase);
		if (ffs_isblock(fs, blksfree, fragno)) {
			cgp->cg_cs.cs_nffree -= fs->fs_frag;
			fs->fs_cstotal.cs_nffree -= fs->fs_frag;
			fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
			ffs_clusteracct(fs, cgp, fragno, 1);
			cgp->cg_cs.cs_nbfree++;
			fs->fs_cstotal.cs_nbfree++;
			fs->fs_cs(fs, cg).cs_nbfree++;
		}
	}
	fs->fs_fmod = 1;
	ACTIVECLEAR(fs, cg);
	UFS_UNLOCK(ump);
	mp = UFSTOVFS(ump);
	if (MOUNTEDSOFTDEP(mp) && devvp->v_type != VREG)
		softdep_setup_blkfree(UFSTOVFS(ump), bp, bno,
		    numfrags(fs, size), dephd);
	bdwrite(bp);
}

/* Dedicated thread/taskqueue used to finish frees after TRIM completes. */
TASKQUEUE_DEFINE_THREAD(ffs_trim);

/*
 * Context carried from the TRIM bio completion back to the taskqueue
 * thread that performs the deferred ffs_blkfree_cg().
 */
struct ffs_blkfree_trim_params {
	struct task task;	/* taskqueue linkage */
	struct ufsmount *ump;
	struct vnode *devvp;
	ufs2_daddr_t bno;	/* block being freed */
	long size;		/* size in bytes */
	ino_t inum;		/* owning inode (diagnostics) */
	struct workhead *pdephd; /* points at dephd below when deps exist */
	struct workhead dephd;	/* softdep items moved from the caller */
};

static void
2236216796Skibffs_blkfree_trim_task(ctx, pending) 2237216796Skib void *ctx; 2238216796Skib int pending; 2239216796Skib{ 2240216796Skib struct ffs_blkfree_trim_params *tp; 2241216796Skib 2242216796Skib tp = ctx; 2243216796Skib ffs_blkfree_cg(tp->ump, tp->ump->um_fs, tp->devvp, tp->bno, tp->size, 2244216796Skib tp->inum, tp->pdephd); 2245216796Skib vn_finished_secondary_write(UFSTOVFS(tp->ump)); 2246216796Skib free(tp, M_TEMP); 2247216796Skib} 2248216796Skib 2249216796Skibstatic void 2250216796Skibffs_blkfree_trim_completed(bip) 2251216796Skib struct bio *bip; 2252216796Skib{ 2253216796Skib struct ffs_blkfree_trim_params *tp; 2254216796Skib 2255216796Skib tp = bip->bio_caller2; 2256216796Skib g_destroy_bio(bip); 2257216796Skib TASK_INIT(&tp->task, 0, ffs_blkfree_trim_task, tp); 2258216796Skib taskqueue_enqueue(taskqueue_ffs_trim, &tp->task); 2259216796Skib} 2260216796Skib 2261216796Skibvoid 2262223127Smckusickffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd) 2263216796Skib struct ufsmount *ump; 2264216796Skib struct fs *fs; 2265216796Skib struct vnode *devvp; 2266216796Skib ufs2_daddr_t bno; 2267216796Skib long size; 2268216796Skib ino_t inum; 2269223127Smckusick enum vtype vtype; 2270216796Skib struct workhead *dephd; 2271216796Skib{ 2272216796Skib struct mount *mp; 2273216796Skib struct bio *bip; 2274216796Skib struct ffs_blkfree_trim_params *tp; 2275216796Skib 2276222334Smckusick /* 2277222334Smckusick * Check to see if a snapshot wants to claim the block. 2278222334Smckusick * Check that devvp is a normal disk device, not a snapshot, 2279222334Smckusick * it has a snapshot(s) associated with it, and one of the 2280222334Smckusick * snapshots wants to claim the block. 
2281222334Smckusick */ 2282222334Smckusick if (devvp->v_type != VREG && 2283222334Smckusick (devvp->v_vflag & VV_COPYONWRITE) && 2284223127Smckusick ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, dephd)) { 2285222334Smckusick return; 2286222334Smckusick } 2287223902Smckusick /* 2288223902Smckusick * Nothing to delay if TRIM is disabled, or the operation is 2289223902Smckusick * performed on the snapshot. 2290223902Smckusick */ 2291223902Smckusick if (!ump->um_candelete || devvp->v_type == VREG) { 2292216796Skib ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd); 2293216796Skib return; 2294216796Skib } 2295216796Skib 2296216796Skib /* 2297216796Skib * Postpone the set of the free bit in the cg bitmap until the 2298216796Skib * BIO_DELETE is completed. Otherwise, due to disk queue 2299216796Skib * reordering, TRIM might be issued after we reuse the block 2300216796Skib * and write some new data into it. 2301216796Skib */ 2302216796Skib tp = malloc(sizeof(struct ffs_blkfree_trim_params), M_TEMP, M_WAITOK); 2303216796Skib tp->ump = ump; 2304216796Skib tp->devvp = devvp; 2305216796Skib tp->bno = bno; 2306216796Skib tp->size = size; 2307216796Skib tp->inum = inum; 2308216796Skib if (dephd != NULL) { 2309216796Skib LIST_INIT(&tp->dephd); 2310216796Skib LIST_SWAP(dephd, &tp->dephd, worklist, wk_list); 2311216796Skib tp->pdephd = &tp->dephd; 2312216796Skib } else 2313216796Skib tp->pdephd = NULL; 2314216796Skib 2315216796Skib bip = g_alloc_bio(); 2316216796Skib bip->bio_cmd = BIO_DELETE; 2317216796Skib bip->bio_offset = dbtob(fsbtodb(fs, bno)); 2318216796Skib bip->bio_done = ffs_blkfree_trim_completed; 2319216796Skib bip->bio_length = size; 2320216796Skib bip->bio_caller2 = tp; 2321216796Skib 2322216796Skib mp = UFSTOVFS(ump); 2323216796Skib vn_start_secondary_write(NULL, &mp, 0); 2324216796Skib g_io_request(bip, (struct g_consumer *)devvp->v_bufobj.bo_private); 2325216796Skib} 2326216796Skib 2327173464Sobrien#ifdef INVARIANTS 23281541Srgrimes/* 232922521Sdyson * 
Verify allocation of a block or fragment. Returns true if block or 233022521Sdyson * fragment is allocated, false if it is free. 233122521Sdyson */ 233231352Sbdestatic int 233322521Sdysonffs_checkblk(ip, bno, size) 233422521Sdyson struct inode *ip; 233598542Smckusick ufs2_daddr_t bno; 233622521Sdyson long size; 233722521Sdyson{ 233822521Sdyson struct fs *fs; 233922521Sdyson struct cg *cgp; 234022521Sdyson struct buf *bp; 234198542Smckusick ufs1_daddr_t cgbno; 234222521Sdyson int i, error, frags, free; 234358087Smckusick u_int8_t *blksfree; 234422521Sdyson 234522521Sdyson fs = ip->i_fs; 234622521Sdyson if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) { 234737555Sbde printf("bsize = %ld, size = %ld, fs = %s\n", 234837555Sbde (long)fs->fs_bsize, size, fs->fs_fsmnt); 234922544Smpp panic("ffs_checkblk: bad size"); 235022521Sdyson } 235122521Sdyson if ((u_int)bno >= fs->fs_size) 2352103594Sobrien panic("ffs_checkblk: bad block %jd", (intmax_t)bno); 235322521Sdyson error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, dtog(fs, bno))), 235422521Sdyson (int)fs->fs_cgsize, NOCRED, &bp); 235522544Smpp if (error) 235622544Smpp panic("ffs_checkblk: cg bread failed"); 235722521Sdyson cgp = (struct cg *)bp->b_data; 235822544Smpp if (!cg_chkmagic(cgp)) 235922544Smpp panic("ffs_checkblk: cg magic mismatch"); 236055697Smckusick bp->b_xflags |= BX_BKGRDWRITE; 236158087Smckusick blksfree = cg_blksfree(cgp); 236298542Smckusick cgbno = dtogd(fs, bno); 236322521Sdyson if (size == fs->fs_bsize) { 236498542Smckusick free = ffs_isblock(fs, blksfree, fragstoblks(fs, cgbno)); 236522521Sdyson } else { 236622521Sdyson frags = numfrags(fs, size); 236722521Sdyson for (free = 0, i = 0; i < frags; i++) 236898542Smckusick if (isset(blksfree, cgbno + i)) 236922521Sdyson free++; 237022521Sdyson if (free != 0 && free != frags) 237122544Smpp panic("ffs_checkblk: partially free fragment"); 237222521Sdyson } 237322521Sdyson brelse(bp); 237422521Sdyson return (!free); 237522521Sdyson} 
#endif /* INVARIANTS */

/*
 * Free an inode.
 */
int
ffs_vfree(pvp, ino, mode)
	struct vnode *pvp;
	ino_t ino;
	int mode;
{
	struct inode *ip;

	/* With soft updates, the free is deferred to the softdep code. */
	if (DOINGSOFTDEP(pvp)) {
		softdep_freefile(pvp, ino, mode);
		return (0);
	}
	ip = VTOI(pvp);
	return (ffs_freefile(ip->i_ump, ip->i_fs, ip->i_devvp, ino, mode,
	    NULL));
}

/*
 * Do the actual free operation.
 * The specified inode is placed back in the free map.
 */
int
ffs_freefile(ump, fs, devvp, ino, mode, wkhd)
	struct ufsmount *ump;
	struct fs *fs;
	struct vnode *devvp;
	ino_t ino;
	int mode;
	struct workhead *wkhd;
{
	struct cg *cgp;
	struct buf *bp;
	ufs2_daddr_t cgbno;
	int error;
	u_int cg;
	u_int8_t *inosused;
	struct cdev *dev;

	cg = ino_to_cg(fs, ino);
	if (devvp->v_type == VREG) {
		/* devvp is a snapshot */
		dev = VTOI(devvp)->i_devvp->v_rdev;
		cgbno = fragstoblks(fs, cgtod(fs, cg));
	} else {
		/* devvp is a normal disk device */
		dev = devvp->v_rdev;
		cgbno = fsbtodb(fs, cgtod(fs, cg));
	}
	if (ino >= fs->fs_ipg * fs->fs_ncg)
		panic("ffs_freefile: range: dev = %s, ino = %lu, fs = %s",
		    devtoname(dev), (u_long)ino, fs->fs_fsmnt);
	if ((error = bread(devvp, cgbno, (int)fs->fs_cgsize, NOCRED, &bp))) {
		brelse(bp);
		return (error);
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp)) {
		brelse(bp);
		return (0);
	}
	bp->b_xflags |= BX_BKGRDWRITE;
	cgp->cg_old_time = cgp->cg_time = time_second;
	inosused = cg_inosused(cgp);
	/* From here on, ino is relative to this cylinder group. */
	ino %= fs->fs_ipg;
	if (isclr(inosused, ino)) {
		printf("dev = %s, ino = %u, fs = %s\n", devtoname(dev),
		    ino + cg * fs->fs_ipg, fs->fs_fsmnt);
		/* On a read-only fs this is reported but not fatal. */
		if (fs->fs_ronly == 0)
			panic("ffs_freefile: freeing free inode");
	}
	clrbit(inosused, ino);
	/* Remember the lowest free inode to speed up future allocations. */
	if (ino < cgp->cg_irotor)
		cgp->cg_irotor = ino;
	cgp->cg_cs.cs_nifree++;
	UFS_LOCK(ump);
	fs->fs_cstotal.cs_nifree++;
	fs->fs_cs(fs, cg).cs_nifree++;
	if ((mode & IFMT) == IFDIR) {
		cgp->cg_cs.cs_ndir--;
		fs->fs_cstotal.cs_ndir--;
		fs->fs_cs(fs, cg).cs_ndir--;
	}
	fs->fs_fmod = 1;
	ACTIVECLEAR(fs, cg);
	UFS_UNLOCK(ump);
	/* Softdep bookkeeping is skipped for frees done via a snapshot. */
	if (MOUNTEDSOFTDEP(UFSTOVFS(ump)) && devvp->v_type != VREG)
		softdep_setup_inofree(UFSTOVFS(ump), bp,
		    ino + cg * fs->fs_ipg, wkhd);
	bdwrite(bp);
	return (0);
}

/*
 * Check to see if a file is free.
2475111239Smckusick */ 2476111239Smckusickint 2477111239Smckusickffs_checkfreefile(fs, devvp, ino) 2478111239Smckusick struct fs *fs; 2479111239Smckusick struct vnode *devvp; 2480111239Smckusick ino_t ino; 2481111239Smckusick{ 2482111239Smckusick struct cg *cgp; 2483111239Smckusick struct buf *bp; 2484111239Smckusick ufs2_daddr_t cgbno; 2485203763Smckusick int ret; 2486203763Smckusick u_int cg; 2487111239Smckusick u_int8_t *inosused; 2488111239Smckusick 2489111239Smckusick cg = ino_to_cg(fs, ino); 2490188240Strasz if (devvp->v_type == VREG) { 2491111239Smckusick /* devvp is a snapshot */ 2492111239Smckusick cgbno = fragstoblks(fs, cgtod(fs, cg)); 2493111239Smckusick } else { 2494111239Smckusick /* devvp is a normal disk device */ 2495111239Smckusick cgbno = fsbtodb(fs, cgtod(fs, cg)); 2496111239Smckusick } 2497203763Smckusick if (ino >= fs->fs_ipg * fs->fs_ncg) 2498111239Smckusick return (1); 2499115474Sphk if (bread(devvp, cgbno, (int)fs->fs_cgsize, NOCRED, &bp)) { 2500111239Smckusick brelse(bp); 2501111239Smckusick return (1); 2502111239Smckusick } 2503111239Smckusick cgp = (struct cg *)bp->b_data; 2504111239Smckusick if (!cg_chkmagic(cgp)) { 2505111239Smckusick brelse(bp); 2506111239Smckusick return (1); 2507111239Smckusick } 2508111239Smckusick inosused = cg_inosused(cgp); 2509111239Smckusick ino %= fs->fs_ipg; 2510111239Smckusick ret = isclr(inosused, ino); 2511111239Smckusick brelse(bp); 2512111239Smckusick return (ret); 2513111239Smckusick} 2514111239Smckusick 2515111239Smckusick/* 25161541Srgrimes * Find a block of the specified size in the specified cylinder group. 25171541Srgrimes * 25181541Srgrimes * It is a panic if a request is made to find a block if none are 25191541Srgrimes * available. 
25201541Srgrimes */ 252198542Smckusickstatic ufs1_daddr_t 25221541Srgrimesffs_mapsearch(fs, cgp, bpref, allocsiz) 252396506Sphk struct fs *fs; 252496506Sphk struct cg *cgp; 252598542Smckusick ufs2_daddr_t bpref; 25261541Srgrimes int allocsiz; 25271541Srgrimes{ 252898542Smckusick ufs1_daddr_t bno; 25291541Srgrimes int start, len, loc, i; 25301541Srgrimes int blk, field, subfield, pos; 253158087Smckusick u_int8_t *blksfree; 25321541Srgrimes 25331541Srgrimes /* 25341541Srgrimes * find the fragment by searching through the free block 25351541Srgrimes * map for an appropriate bit pattern 25361541Srgrimes */ 25371541Srgrimes if (bpref) 25381541Srgrimes start = dtogd(fs, bpref) / NBBY; 25391541Srgrimes else 25401541Srgrimes start = cgp->cg_frotor / NBBY; 254158087Smckusick blksfree = cg_blksfree(cgp); 25421541Srgrimes len = howmany(fs->fs_fpg, NBBY) - start; 254358087Smckusick loc = scanc((u_int)len, (u_char *)&blksfree[start], 2544160462Sstefanf fragtbl[fs->fs_frag], 25451541Srgrimes (u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); 25461541Srgrimes if (loc == 0) { 25471541Srgrimes len = start + 1; 25481541Srgrimes start = 0; 254958087Smckusick loc = scanc((u_int)len, (u_char *)&blksfree[0], 2550160462Sstefanf fragtbl[fs->fs_frag], 25511541Srgrimes (u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); 25521541Srgrimes if (loc == 0) { 25531541Srgrimes printf("start = %d, len = %d, fs = %s\n", 25541541Srgrimes start, len, fs->fs_fsmnt); 25551541Srgrimes panic("ffs_alloccg: map corrupted"); 25561541Srgrimes /* NOTREACHED */ 25571541Srgrimes } 25581541Srgrimes } 25591541Srgrimes bno = (start + len - loc) * NBBY; 25601541Srgrimes cgp->cg_frotor = bno; 25611541Srgrimes /* 25621541Srgrimes * found the byte in the map 25631541Srgrimes * sift through the bits to find the selected frag 25641541Srgrimes */ 25651541Srgrimes for (i = bno + NBBY; bno < i; bno += fs->fs_frag) { 256658087Smckusick blk = blkmap(fs, blksfree, bno); 25671541Srgrimes blk <<= 1; 25681541Srgrimes field 
= around[allocsiz]; 25691541Srgrimes subfield = inside[allocsiz]; 25701541Srgrimes for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) { 25711541Srgrimes if ((blk & field) == subfield) 25721541Srgrimes return (bno + pos); 25731541Srgrimes field <<= 1; 25741541Srgrimes subfield <<= 1; 25751541Srgrimes } 25761541Srgrimes } 25773487Sphk printf("bno = %lu, fs = %s\n", (u_long)bno, fs->fs_fsmnt); 25781541Srgrimes panic("ffs_alloccg: block not in map"); 25791541Srgrimes return (-1); 25801541Srgrimes} 25811541Srgrimes 25821541Srgrimes/* 258396755Strhodes * Fserr prints the name of a filesystem with an error diagnostic. 25848876Srgrimes * 25851541Srgrimes * The form of the error message is: 25861541Srgrimes * fs: error message 25871541Srgrimes */ 2588223114Smckusickvoid 258990098Smckusickffs_fserr(fs, inum, cp) 25901541Srgrimes struct fs *fs; 259190098Smckusick ino_t inum; 25921541Srgrimes char *cp; 25931541Srgrimes{ 2594112450Sjhb struct thread *td = curthread; /* XXX */ 2595112450Sjhb struct proc *p = td->td_proc; 25961541Srgrimes 259790098Smckusick log(LOG_ERR, "pid %d (%s), uid %d inumber %d on %s: %s\n", 2598112450Sjhb p->p_pid, p->p_comm, td->td_ucred->cr_uid, inum, fs->fs_fsmnt, cp); 25991541Srgrimes} 260074548Smckusick 260174548Smckusick/* 260274548Smckusick * This function provides the capability for the fsck program to 2603202113Smckusick * update an active filesystem. Fourteen operations are provided: 260474548Smckusick * 260574548Smckusick * adjrefcnt(inode, amt) - adjusts the reference count on the 260674548Smckusick * specified inode by the specified amount. Under normal 260774548Smckusick * operation the count should always go down. Decrementing 260874548Smckusick * the count to zero will cause the inode to be freed. 2609222724Smckusick * adjblkcnt(inode, amt) - adjust the number of blocks used by the 2610222724Smckusick * inode by the specified amount. 
2611142123Sdelphij * adjndir, adjbfree, adjifree, adjffree, adjnumclusters(amt) - 2612142123Sdelphij * adjust the superblock summary. 261374548Smckusick * freedirs(inode, count) - directory inodes [inode..inode + count - 1] 261474548Smckusick * are marked as free. Inodes should never have to be marked 261574548Smckusick * as in use. 261674548Smckusick * freefiles(inode, count) - file inodes [inode..inode + count - 1] 261774548Smckusick * are marked as free. Inodes should never have to be marked 261874548Smckusick * as in use. 261974548Smckusick * freeblks(blockno, size) - blocks [blockno..blockno + size - 1] 262074548Smckusick * are marked as free. Blocks should never have to be marked 262174548Smckusick * as in use. 262274548Smckusick * setflags(flags, set/clear) - the fs_flags field has the specified 262374548Smckusick * flags set (second parameter +1) or cleared (second parameter -1). 2624202113Smckusick * setcwd(dirinode) - set the current directory to dirinode in the 2625202113Smckusick * filesystem associated with the snapshot. 2626202113Smckusick * setdotdot(oldvalue, newvalue) - Verify that the inode number for ".." 2627202113Smckusick * in the current directory is oldvalue then change it to newvalue. 2628202113Smckusick * unlink(nameptr, oldvalue) - Verify that the inode number associated 2629202113Smckusick * with nameptr in the current directory is oldvalue then unlink it. 2630224061Smckusick * 2631224061Smckusick * The following functions may only be used on a quiescent filesystem 2632224061Smckusick * by the soft updates journal. They are not safe to be run on an active 2633224061Smckusick * filesystem. 2634224061Smckusick * 2635224061Smckusick * setinode(inode, dip) - the specified disk inode is replaced with the 2636224061Smckusick * contents pointed to by dip. 
2637224061Smckusick * setbufoutput(fd, flags) - output associated with the specified file 2638224061Smckusick * descriptor (which must reference the character device supporting 2639224061Smckusick * the filesystem) switches from using physio to running through the 2640224061Smckusick * buffer cache when flags is set to 1. The descriptor reverts to 2641224061Smckusick * physio for output when flags is set to zero. 264274548Smckusick */ 264374548Smckusick 264492728Salfredstatic int sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS); 264574548Smckusick 264674548SmckusickSYSCTL_PROC(_vfs_ffs, FFS_ADJ_REFCNT, adjrefcnt, CTLFLAG_WR|CTLTYPE_STRUCT, 264774548Smckusick 0, 0, sysctl_ffs_fsck, "S,fsck", "Adjust Inode Reference Count"); 264874548Smckusick 2649141631Sphkstatic SYSCTL_NODE(_vfs_ffs, FFS_ADJ_BLKCNT, adjblkcnt, CTLFLAG_WR, 265074548Smckusick sysctl_ffs_fsck, "Adjust Inode Used Blocks Count"); 265174548Smckusick 2652142123Sdelphijstatic SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NDIR, adjndir, CTLFLAG_WR, 2653142123Sdelphij sysctl_ffs_fsck, "Adjust number of directories"); 2654142123Sdelphij 2655142123Sdelphijstatic SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NBFREE, adjnbfree, CTLFLAG_WR, 2656142123Sdelphij sysctl_ffs_fsck, "Adjust number of free blocks"); 2657142123Sdelphij 2658142123Sdelphijstatic SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NIFREE, adjnifree, CTLFLAG_WR, 2659142123Sdelphij sysctl_ffs_fsck, "Adjust number of free inodes"); 2660142123Sdelphij 2661142123Sdelphijstatic SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NFFREE, adjnffree, CTLFLAG_WR, 2662142123Sdelphij sysctl_ffs_fsck, "Adjust number of free frags"); 2663142123Sdelphij 2664142123Sdelphijstatic SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NUMCLUSTERS, adjnumclusters, CTLFLAG_WR, 2665142123Sdelphij sysctl_ffs_fsck, "Adjust number of free clusters"); 2666142123Sdelphij 2667141631Sphkstatic SYSCTL_NODE(_vfs_ffs, FFS_DIR_FREE, freedirs, CTLFLAG_WR, 266874548Smckusick sysctl_ffs_fsck, "Free Range of Directory Inodes"); 266974548Smckusick 2670141631Sphkstatic 
SYSCTL_NODE(_vfs_ffs, FFS_FILE_FREE, freefiles, CTLFLAG_WR, 267174548Smckusick sysctl_ffs_fsck, "Free Range of File Inodes"); 267274548Smckusick 2673141631Sphkstatic SYSCTL_NODE(_vfs_ffs, FFS_BLK_FREE, freeblks, CTLFLAG_WR, 267474548Smckusick sysctl_ffs_fsck, "Free Range of Blocks"); 267574548Smckusick 2676141631Sphkstatic SYSCTL_NODE(_vfs_ffs, FFS_SET_FLAGS, setflags, CTLFLAG_WR, 267774548Smckusick sysctl_ffs_fsck, "Change Filesystem Flags"); 267874548Smckusick 2679202113Smckusickstatic SYSCTL_NODE(_vfs_ffs, FFS_SET_CWD, setcwd, CTLFLAG_WR, 2680202113Smckusick sysctl_ffs_fsck, "Set Current Working Directory"); 2681202113Smckusick 2682202113Smckusickstatic SYSCTL_NODE(_vfs_ffs, FFS_SET_DOTDOT, setdotdot, CTLFLAG_WR, 2683202113Smckusick sysctl_ffs_fsck, "Change Value of .. Entry"); 2684202113Smckusick 2685202113Smckusickstatic SYSCTL_NODE(_vfs_ffs, FFS_UNLINK, unlink, CTLFLAG_WR, 2686202113Smckusick sysctl_ffs_fsck, "Unlink a Duplicate Name"); 2687202113Smckusick 2688224061Smckusickstatic SYSCTL_NODE(_vfs_ffs, FFS_SET_INODE, setinode, CTLFLAG_WR, 2689224061Smckusick sysctl_ffs_fsck, "Update an On-Disk Inode"); 2690224061Smckusick 2691224061Smckusickstatic SYSCTL_NODE(_vfs_ffs, FFS_SET_BUFOUTPUT, setbufoutput, CTLFLAG_WR, 2692224061Smckusick sysctl_ffs_fsck, "Set Buffered Writing for Descriptor"); 2693224061Smckusick 2694224061Smckusick#define DEBUG 1 269574548Smckusick#ifdef DEBUG 2696224272Smckusickstatic int fsckcmds = 0; 269774548SmckusickSYSCTL_INT(_debug, OID_AUTO, fsckcmds, CTLFLAG_RW, &fsckcmds, 0, ""); 269874548Smckusick#endif /* DEBUG */ 269974548Smckusick 2700224061Smckusickstatic int buffered_write(struct file *, struct uio *, struct ucred *, 2701224061Smckusick int, struct thread *); 2702224061Smckusick 270374548Smckusickstatic int 270474548Smckusicksysctl_ffs_fsck(SYSCTL_HANDLER_ARGS) 270574548Smckusick{ 2706202113Smckusick struct thread *td = curthread; 270774548Smckusick struct fsck_cmd cmd; 270874548Smckusick struct ufsmount *ump; 2709202113Smckusick 
struct vnode *vp, *vpold, *dvp, *fdvp; 2710202113Smckusick struct inode *ip, *dp; 271174548Smckusick struct mount *mp; 271274548Smckusick struct fs *fs; 271398542Smckusick ufs2_daddr_t blkno; 271474548Smckusick long blkcnt, blksize; 2715202113Smckusick struct filedesc *fdp; 2716224061Smckusick struct file *fp, *vfp; 2717202113Smckusick int vfslocked, filetype, error; 2718224061Smckusick static struct fileops *origops, bufferedops; 271974548Smckusick 272074548Smckusick if (req->newlen > sizeof cmd) 272174548Smckusick return (EBADRPC); 272274548Smckusick if ((error = SYSCTL_IN(req, &cmd, sizeof cmd)) != 0) 272374548Smckusick return (error); 272474548Smckusick if (cmd.version != FFS_CMD_VERSION) 272574548Smckusick return (ERPCMISMATCH); 2726224778Srwatson if ((error = getvnode(td->td_proc->p_fd, cmd.handle, CAP_FSCK, 2727224778Srwatson &fp)) != 0) 272874548Smckusick return (error); 2729202113Smckusick vp = fp->f_data; 2730202113Smckusick if (vp->v_type != VREG && vp->v_type != VDIR) { 2731202113Smckusick fdrop(fp, td); 2732202113Smckusick return (EINVAL); 2733202113Smckusick } 2734202113Smckusick vn_start_write(vp, &mp, V_WAIT); 273575572Smckusick if (mp == 0 || strncmp(mp->mnt_stat.f_fstypename, "ufs", MFSNAMELEN)) { 273675572Smckusick vn_finished_write(mp); 2737202113Smckusick fdrop(fp, td); 273874705Smckusick return (EINVAL); 273975572Smckusick } 2740224061Smckusick ump = VFSTOUFS(mp); 2741224061Smckusick if ((mp->mnt_flag & MNT_RDONLY) && 2742224061Smckusick ump->um_fsckpid != td->td_proc->p_pid) { 274375572Smckusick vn_finished_write(mp); 2744202113Smckusick fdrop(fp, td); 274574548Smckusick return (EROFS); 274675572Smckusick } 274774548Smckusick fs = ump->um_fs; 274874548Smckusick filetype = IFREG; 274974548Smckusick 275074548Smckusick switch (oidp->oid_number) { 275174548Smckusick 275274548Smckusick case FFS_SET_FLAGS: 275374548Smckusick#ifdef DEBUG 275474548Smckusick if (fsckcmds) 275574548Smckusick printf("%s: %s flags\n", mp->mnt_stat.f_mntonname, 
275674548Smckusick cmd.size > 0 ? "set" : "clear"); 275774548Smckusick#endif /* DEBUG */ 275874548Smckusick if (cmd.size > 0) 275974548Smckusick fs->fs_flags |= (long)cmd.value; 276074548Smckusick else 276174548Smckusick fs->fs_flags &= ~(long)cmd.value; 276274548Smckusick break; 276374548Smckusick 276474548Smckusick case FFS_ADJ_REFCNT: 276574548Smckusick#ifdef DEBUG 276674548Smckusick if (fsckcmds) { 2767224061Smckusick printf("%s: adjust inode %jd link count by %jd\n", 276899590Sbde mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, 276999590Sbde (intmax_t)cmd.size); 277074548Smckusick } 277174548Smckusick#endif /* DEBUG */ 2772141526Sphk if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) 277375572Smckusick break; 277474548Smckusick ip = VTOI(vp); 277574548Smckusick ip->i_nlink += cmd.size; 2776132775Skan DIP_SET(ip, i_nlink, ip->i_nlink); 277774548Smckusick ip->i_effnlink += cmd.size; 2778224061Smckusick ip->i_flag |= IN_CHANGE | IN_MODIFIED; 2779224061Smckusick error = ffs_update(vp, 1); 278074548Smckusick if (DOINGSOFTDEP(vp)) 278174548Smckusick softdep_change_linkcnt(ip); 278274548Smckusick vput(vp); 278374548Smckusick break; 278474548Smckusick 278574548Smckusick case FFS_ADJ_BLKCNT: 278674548Smckusick#ifdef DEBUG 278774548Smckusick if (fsckcmds) { 278899590Sbde printf("%s: adjust inode %jd block count by %jd\n", 278999590Sbde mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, 279099590Sbde (intmax_t)cmd.size); 279174548Smckusick } 279274548Smckusick#endif /* DEBUG */ 2793141526Sphk if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) 279475572Smckusick break; 279574548Smckusick ip = VTOI(vp); 2796132775Skan DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + cmd.size); 2797224061Smckusick ip->i_flag |= IN_CHANGE | IN_MODIFIED; 2798224061Smckusick error = ffs_update(vp, 1); 279974548Smckusick vput(vp); 280074548Smckusick break; 280174548Smckusick 280274548Smckusick case FFS_DIR_FREE: 280374548Smckusick filetype = IFDIR; 280474548Smckusick /* fall 
through */ 280574548Smckusick 280674548Smckusick case FFS_FILE_FREE: 280774548Smckusick#ifdef DEBUG 280874548Smckusick if (fsckcmds) { 280974548Smckusick if (cmd.size == 1) 281074548Smckusick printf("%s: free %s inode %d\n", 281174548Smckusick mp->mnt_stat.f_mntonname, 281274548Smckusick filetype == IFDIR ? "directory" : "file", 281374548Smckusick (ino_t)cmd.value); 281474548Smckusick else 281574548Smckusick printf("%s: free %s inodes %d-%d\n", 281674548Smckusick mp->mnt_stat.f_mntonname, 281774548Smckusick filetype == IFDIR ? "directory" : "file", 281874747Sasmodai (ino_t)cmd.value, 281978256Speter (ino_t)(cmd.value + cmd.size - 1)); 282074548Smckusick } 282174548Smckusick#endif /* DEBUG */ 282274548Smckusick while (cmd.size > 0) { 2823140704Sjeff if ((error = ffs_freefile(ump, fs, ump->um_devvp, 2824207141Sjeff cmd.value, filetype, NULL))) 282575572Smckusick break; 282674548Smckusick cmd.size -= 1; 282774548Smckusick cmd.value += 1; 282874548Smckusick } 282974548Smckusick break; 283074548Smckusick 283174548Smckusick case FFS_BLK_FREE: 283274548Smckusick#ifdef DEBUG 283374548Smckusick if (fsckcmds) { 283474548Smckusick if (cmd.size == 1) 2835103594Sobrien printf("%s: free block %jd\n", 283674548Smckusick mp->mnt_stat.f_mntonname, 283798542Smckusick (intmax_t)cmd.value); 283874548Smckusick else 2839103594Sobrien printf("%s: free blocks %jd-%jd\n", 284074548Smckusick mp->mnt_stat.f_mntonname, 284198542Smckusick (intmax_t)cmd.value, 284298542Smckusick (intmax_t)cmd.value + cmd.size - 1); 284374548Smckusick } 284474548Smckusick#endif /* DEBUG */ 284598542Smckusick blkno = cmd.value; 284674548Smckusick blkcnt = cmd.size; 284774548Smckusick blksize = fs->fs_frag - (blkno % fs->fs_frag); 284874548Smckusick while (blkcnt > 0) { 284974548Smckusick if (blksize > blkcnt) 285074548Smckusick blksize = blkcnt; 2851140704Sjeff ffs_blkfree(ump, fs, ump->um_devvp, blkno, 2852223127Smckusick blksize * fs->fs_fsize, ROOTINO, VDIR, NULL); 285374548Smckusick blkno += blksize; 
285474548Smckusick blkcnt -= blksize; 285574548Smckusick blksize = fs->fs_frag; 285674548Smckusick } 285774548Smckusick break; 285874548Smckusick 2859142123Sdelphij /* 2860142123Sdelphij * Adjust superblock summaries. fsck(8) is expected to 2861142123Sdelphij * submit deltas when necessary. 2862142123Sdelphij */ 2863142123Sdelphij case FFS_ADJ_NDIR: 2864142123Sdelphij#ifdef DEBUG 2865142123Sdelphij if (fsckcmds) { 2866142123Sdelphij printf("%s: adjust number of directories by %jd\n", 2867142123Sdelphij mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); 2868142123Sdelphij } 2869142123Sdelphij#endif /* DEBUG */ 2870142123Sdelphij fs->fs_cstotal.cs_ndir += cmd.value; 2871142123Sdelphij break; 2872202113Smckusick 2873142123Sdelphij case FFS_ADJ_NBFREE: 2874142123Sdelphij#ifdef DEBUG 2875142123Sdelphij if (fsckcmds) { 2876142123Sdelphij printf("%s: adjust number of free blocks by %+jd\n", 2877142123Sdelphij mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); 2878142123Sdelphij } 2879142123Sdelphij#endif /* DEBUG */ 2880142123Sdelphij fs->fs_cstotal.cs_nbfree += cmd.value; 2881142123Sdelphij break; 2882202113Smckusick 2883142123Sdelphij case FFS_ADJ_NIFREE: 2884142123Sdelphij#ifdef DEBUG 2885142123Sdelphij if (fsckcmds) { 2886142123Sdelphij printf("%s: adjust number of free inodes by %+jd\n", 2887142123Sdelphij mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); 2888142123Sdelphij } 2889142123Sdelphij#endif /* DEBUG */ 2890142123Sdelphij fs->fs_cstotal.cs_nifree += cmd.value; 2891142123Sdelphij break; 2892202113Smckusick 2893142123Sdelphij case FFS_ADJ_NFFREE: 2894142123Sdelphij#ifdef DEBUG 2895142123Sdelphij if (fsckcmds) { 2896142123Sdelphij printf("%s: adjust number of free frags by %+jd\n", 2897142123Sdelphij mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); 2898142123Sdelphij } 2899142123Sdelphij#endif /* DEBUG */ 2900142123Sdelphij fs->fs_cstotal.cs_nffree += cmd.value; 2901142123Sdelphij break; 2902202113Smckusick 2903142123Sdelphij case FFS_ADJ_NUMCLUSTERS: 
2904142123Sdelphij#ifdef DEBUG 2905142123Sdelphij if (fsckcmds) { 2906142123Sdelphij printf("%s: adjust number of free clusters by %+jd\n", 2907142123Sdelphij mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); 2908142123Sdelphij } 2909142123Sdelphij#endif /* DEBUG */ 2910142123Sdelphij fs->fs_cstotal.cs_numclusters += cmd.value; 2911142123Sdelphij break; 2912142123Sdelphij 2913202113Smckusick case FFS_SET_CWD: 2914202113Smckusick#ifdef DEBUG 2915202113Smckusick if (fsckcmds) { 2916202113Smckusick printf("%s: set current directory to inode %jd\n", 2917202113Smckusick mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); 2918202113Smckusick } 2919202113Smckusick#endif /* DEBUG */ 2920202113Smckusick if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_SHARED, &vp))) 2921202113Smckusick break; 2922202113Smckusick vfslocked = VFS_LOCK_GIANT(vp->v_mount); 2923202113Smckusick AUDIT_ARG_VNODE1(vp); 2924202113Smckusick if ((error = change_dir(vp, td)) != 0) { 2925202113Smckusick vput(vp); 2926202113Smckusick VFS_UNLOCK_GIANT(vfslocked); 2927202113Smckusick break; 2928202113Smckusick } 2929202113Smckusick VOP_UNLOCK(vp, 0); 2930202113Smckusick VFS_UNLOCK_GIANT(vfslocked); 2931202113Smckusick fdp = td->td_proc->p_fd; 2932202113Smckusick FILEDESC_XLOCK(fdp); 2933202113Smckusick vpold = fdp->fd_cdir; 2934202113Smckusick fdp->fd_cdir = vp; 2935202113Smckusick FILEDESC_XUNLOCK(fdp); 2936202113Smckusick vfslocked = VFS_LOCK_GIANT(vpold->v_mount); 2937202113Smckusick vrele(vpold); 2938202113Smckusick VFS_UNLOCK_GIANT(vfslocked); 2939202113Smckusick break; 2940202113Smckusick 2941202113Smckusick case FFS_SET_DOTDOT: 2942202113Smckusick#ifdef DEBUG 2943202113Smckusick if (fsckcmds) { 2944202113Smckusick printf("%s: change .. 
in cwd from %jd to %jd\n", 2945202113Smckusick mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, 2946202113Smckusick (intmax_t)cmd.size); 2947202113Smckusick } 2948202113Smckusick#endif /* DEBUG */ 2949202113Smckusick /* 2950202113Smckusick * First we have to get and lock the parent directory 2951202113Smckusick * to which ".." points. 2952202113Smckusick */ 2953202113Smckusick error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &fdvp); 2954202113Smckusick if (error) 2955202113Smckusick break; 2956202113Smckusick /* 2957202113Smckusick * Now we get and lock the child directory containing "..". 2958202113Smckusick */ 2959202113Smckusick FILEDESC_SLOCK(td->td_proc->p_fd); 2960202113Smckusick dvp = td->td_proc->p_fd->fd_cdir; 2961202113Smckusick FILEDESC_SUNLOCK(td->td_proc->p_fd); 2962202113Smckusick if ((error = vget(dvp, LK_EXCLUSIVE, td)) != 0) { 2963202113Smckusick vput(fdvp); 2964202113Smckusick break; 2965202113Smckusick } 2966202113Smckusick dp = VTOI(dvp); 2967202113Smckusick dp->i_offset = 12; /* XXX mastertemplate.dot_reclen */ 2968202113Smckusick error = ufs_dirrewrite(dp, VTOI(fdvp), (ino_t)cmd.size, 2969202113Smckusick DT_DIR, 0); 2970202113Smckusick cache_purge(fdvp); 2971202113Smckusick cache_purge(dvp); 2972202113Smckusick vput(dvp); 2973202113Smckusick vput(fdvp); 2974202113Smckusick break; 2975202113Smckusick 2976202113Smckusick case FFS_UNLINK: 2977202113Smckusick#ifdef DEBUG 2978202113Smckusick if (fsckcmds) { 2979202113Smckusick char buf[32]; 2980202113Smckusick 2981202125Smckusick if (copyinstr((char *)(intptr_t)cmd.value, buf,32,NULL)) 2982202113Smckusick strncpy(buf, "Name_too_long", 32); 2983202113Smckusick printf("%s: unlink %s (inode %jd)\n", 2984202113Smckusick mp->mnt_stat.f_mntonname, buf, (intmax_t)cmd.size); 2985202113Smckusick } 2986202113Smckusick#endif /* DEBUG */ 2987202113Smckusick /* 2988202113Smckusick * kern_unlinkat will do its own start/finish writes and 2989202113Smckusick * they do not nest, so drop ours here. 
Setting mp == NULL 2990202113Smckusick * indicates that vn_finished_write is not needed down below. 2991202113Smckusick */ 2992202113Smckusick vn_finished_write(mp); 2993202113Smckusick mp = NULL; 2994202125Smckusick error = kern_unlinkat(td, AT_FDCWD, (char *)(intptr_t)cmd.value, 2995202113Smckusick UIO_USERSPACE, (ino_t)cmd.size); 2996202113Smckusick break; 2997202113Smckusick 2998224061Smckusick case FFS_SET_INODE: 2999224061Smckusick if (ump->um_fsckpid != td->td_proc->p_pid) { 3000224061Smckusick error = EPERM; 3001224061Smckusick break; 3002224061Smckusick } 3003224061Smckusick#ifdef DEBUG 3004224272Smckusick if (fsckcmds) { 3005224061Smckusick printf("%s: update inode %jd\n", 3006224061Smckusick mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); 3007224061Smckusick } 3008224061Smckusick#endif /* DEBUG */ 3009224061Smckusick if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) 3010224061Smckusick break; 3011224061Smckusick vfslocked = VFS_LOCK_GIANT(vp->v_mount); 3012224061Smckusick AUDIT_ARG_VNODE1(vp); 3013224061Smckusick ip = VTOI(vp); 3014224061Smckusick if (ip->i_ump->um_fstype == UFS1) 3015224061Smckusick error = copyin((void *)(intptr_t)cmd.size, ip->i_din1, 3016224061Smckusick sizeof(struct ufs1_dinode)); 3017224061Smckusick else 3018224061Smckusick error = copyin((void *)(intptr_t)cmd.size, ip->i_din2, 3019224061Smckusick sizeof(struct ufs2_dinode)); 3020224061Smckusick if (error) { 3021224061Smckusick vput(vp); 3022224061Smckusick VFS_UNLOCK_GIANT(vfslocked); 3023224061Smckusick break; 3024224061Smckusick } 3025224061Smckusick ip->i_flag |= IN_CHANGE | IN_MODIFIED; 3026224061Smckusick error = ffs_update(vp, 1); 3027224061Smckusick vput(vp); 3028224061Smckusick VFS_UNLOCK_GIANT(vfslocked); 3029224061Smckusick break; 3030224061Smckusick 3031224061Smckusick case FFS_SET_BUFOUTPUT: 3032224061Smckusick if (ump->um_fsckpid != td->td_proc->p_pid) { 3033224061Smckusick error = EPERM; 3034224061Smckusick break; 3035224061Smckusick } 
3036224061Smckusick if (VTOI(vp)->i_ump != ump) { 3037224061Smckusick error = EINVAL; 3038224061Smckusick break; 3039224061Smckusick } 3040224061Smckusick#ifdef DEBUG 3041224061Smckusick if (fsckcmds) { 3042224061Smckusick printf("%s: %s buffered output for descriptor %jd\n", 3043224061Smckusick mp->mnt_stat.f_mntonname, 3044224061Smckusick cmd.size == 1 ? "enable" : "disable", 3045224061Smckusick (intmax_t)cmd.value); 3046224061Smckusick } 3047224061Smckusick#endif /* DEBUG */ 3048224778Srwatson if ((error = getvnode(td->td_proc->p_fd, cmd.value, 3049224778Srwatson CAP_FSCK, &vfp)) != 0) 3050224061Smckusick break; 3051224061Smckusick if (vfp->f_vnode->v_type != VCHR) { 3052224061Smckusick fdrop(vfp, td); 3053224061Smckusick error = EINVAL; 3054224061Smckusick break; 3055224061Smckusick } 3056224061Smckusick if (origops == NULL) { 3057224061Smckusick origops = vfp->f_ops; 3058224061Smckusick bcopy((void *)origops, (void *)&bufferedops, 3059224061Smckusick sizeof(bufferedops)); 3060224061Smckusick bufferedops.fo_write = buffered_write; 3061224061Smckusick } 3062224061Smckusick if (cmd.size == 1) 3063224061Smckusick atomic_store_rel_ptr((volatile uintptr_t *)&vfp->f_ops, 3064224061Smckusick (uintptr_t)&bufferedops); 3065224061Smckusick else 3066224061Smckusick atomic_store_rel_ptr((volatile uintptr_t *)&vfp->f_ops, 3067224061Smckusick (uintptr_t)origops); 3068224061Smckusick fdrop(vfp, td); 3069224061Smckusick break; 3070224061Smckusick 307174548Smckusick default: 307274548Smckusick#ifdef DEBUG 307374548Smckusick if (fsckcmds) { 307474548Smckusick printf("Invalid request %d from fsck\n", 307574548Smckusick oidp->oid_number); 307674548Smckusick } 307774548Smckusick#endif /* DEBUG */ 307875572Smckusick error = EINVAL; 307975572Smckusick break; 308074548Smckusick 308174548Smckusick } 3082202113Smckusick fdrop(fp, td); 308375572Smckusick vn_finished_write(mp); 308475572Smckusick return (error); 308574548Smckusick} 3086224061Smckusick 3087224061Smckusick/* 
 * Function to switch a descriptor to use the buffer cache to stage
 * its I/O. This is needed so that writes to the filesystem device
 * will give snapshots a chance to copy modified blocks for which it
 * needs to retain copies.
 *
 * Installed as the fo_write fileops method on a character-device
 * descriptor by the FFS_SET_BUFOUTPUT fsck sysctl command (see the
 * bufferedops setup in the sysctl handler above).  Writes are staged
 * through getblk()/bwrite() on the device vnode instead of going
 * straight to the driver.
 *
 * fp          - the device descriptor being written (must be a disk)
 * uio         - user I/O request; must be fragment-aligned and fit
 *               within a single filesystem block
 * active_cred - unused here; present to match the fo_write signature
 * flags       - FOF_* flags, forwarded to the file-offset lock helpers
 * td          - calling thread
 *
 * Returns 0 on success or an errno (EINVAL for a non-disk vnode, a
 * non-UFS cwd, a cwd on a different device, or a misaligned request).
 */
static int
buffered_write(fp, uio, active_cred, flags, td)
	struct file *fp;
	struct uio *uio;
	struct ucred *active_cred;
	int flags;
	struct thread *td;
{
	struct vnode *devvp, *vp;
	struct inode *ip;
	struct buf *bp;
	struct fs *fs;
	struct filedesc *fdp;
	int error, vfslocked;
	daddr_t lbn;

	/*
	 * The devvp is associated with the /dev filesystem. To discover
	 * the filesystem with which the device is associated, we depend
	 * on the application setting the current directory to a location
	 * within the filesystem being written. Yes, this is an ugly hack.
	 */
	devvp = fp->f_vnode;
	if (!vn_isdisk(devvp, NULL))
		return (EINVAL);
	/*
	 * Grab a reference to the cwd vnode while holding the filedesc
	 * lock so it cannot be recycled out from under us, then drop the
	 * lock before acquiring the vnode lock (lock-order requirement).
	 */
	fdp = td->td_proc->p_fd;
	FILEDESC_SLOCK(fdp);
	vp = fdp->fd_cdir;
	vref(vp);
	FILEDESC_SUNLOCK(fdp);
	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	vn_lock(vp, LK_SHARED | LK_RETRY);
	/*
	 * Check that the current directory vnode indeed belongs to
	 * UFS before trying to dereference UFS-specific v_data fields.
	 */
	if (vp->v_op != &ffs_vnodeops1 && vp->v_op != &ffs_vnodeops2) {
		vput(vp);
		VFS_UNLOCK_GIANT(vfslocked);
		return (EINVAL);
	}
	ip = VTOI(vp);
	/* The cwd must live on the very device being written. */
	if (ip->i_devvp != devvp) {
		vput(vp);
		VFS_UNLOCK_GIANT(vfslocked);
		return (EINVAL);
	}
	fs = ip->i_fs;
	vput(vp);
	VFS_UNLOCK_GIANT(vfslocked);
	/* Serialize on the file offset, then lock the device vnode. */
	foffset_lock_uio(fp, uio, flags);
	vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
#ifdef DEBUG
	if (fsckcmds) {
		printf("%s: buffered write for block %jd\n",
		    fs->fs_fsmnt, (intmax_t)btodb(uio->uio_offset));
	}
#endif /* DEBUG */
	/*
	 * All I/O must be contained within a filesystem block, start on
	 * a fragment boundary, and be a multiple of fragments in length.
	 */
	if (uio->uio_resid > fs->fs_bsize - (uio->uio_offset % fs->fs_bsize) ||
	    fragoff(fs, uio->uio_offset) != 0 ||
	    fragoff(fs, uio->uio_resid) != 0) {
		error = EINVAL;
		goto out;
	}
	/*
	 * NOTE(review): lbn is expressed in fragment units here; this
	 * presumably matches how snapshot copy-on-write indexes buffers
	 * on the device vnode — confirm against ffs_copyonwrite().
	 */
	lbn = numfrags(fs, uio->uio_offset);
	bp = getblk(devvp, lbn, uio->uio_resid, 0, 0, 0);
	/* B_RELBUF: release the buffer's pages once the write completes. */
	bp->b_flags |= B_RELBUF;
	if ((error = uiomove((char *)bp->b_data, uio->uio_resid, uio)) != 0) {
		brelse(bp);
		goto out;
	}
	/* Synchronous write through the buffer cache. */
	error = bwrite(bp);
out:
	VOP_UNLOCK(devvp, 0);
	/* FOF_NEXTOFF: advance the descriptor's offset past this write. */
	foffset_unlock_uio(fp, uio, flags | FOF_NEXTOFF);
	return (error);
}