/*-
 * Copyright (c) 2002 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Marshall
 * Kirk McKusick and Network Associates Laboratories, the Security
 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
 * research program
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ffs_alloc.c	8.18 (Berkeley) 5/26/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_quota.h"

#include <sys/param.h>
#include <sys/capability.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/taskqueue.h>

#include <security/audit/audit.h>

#include <geom/geom.h>

#include <ufs/ufs/dir.h>
#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufsmount.h>

#include <ufs/ffs/fs.h>
981541Srgrimes#include <ufs/ffs/ffs_extern.h> 99216796Skib#include <ufs/ffs/softdep.h> 1001541Srgrimes 101203763Smckusicktypedef ufs2_daddr_t allocfcn_t(struct inode *ip, u_int cg, ufs2_daddr_t bpref, 102207141Sjeff int size, int rsize); 10312590Sbde 104207141Sjeffstatic ufs2_daddr_t ffs_alloccg(struct inode *, u_int, ufs2_daddr_t, int, int); 10598542Smckusickstatic ufs2_daddr_t 106207141Sjeff ffs_alloccgblk(struct inode *, struct buf *, ufs2_daddr_t, int); 107216796Skibstatic void ffs_blkfree_cg(struct ufsmount *, struct fs *, 108216796Skib struct vnode *, ufs2_daddr_t, long, ino_t, 109216796Skib struct workhead *); 110216796Skibstatic void ffs_blkfree_trim_completed(struct bio *); 111216796Skibstatic void ffs_blkfree_trim_task(void *ctx, int pending __unused); 112173464Sobrien#ifdef INVARIANTS 11398542Smckusickstatic int ffs_checkblk(struct inode *, ufs2_daddr_t, long); 11431352Sbde#endif 115207141Sjeffstatic ufs2_daddr_t ffs_clusteralloc(struct inode *, u_int, ufs2_daddr_t, int, 116207141Sjeff int); 11792728Salfredstatic ino_t ffs_dirpref(struct inode *); 118203763Smckusickstatic ufs2_daddr_t ffs_fragextend(struct inode *, u_int, ufs2_daddr_t, 119203763Smckusick int, int); 12098542Smckusickstatic ufs2_daddr_t ffs_hashalloc 121207141Sjeff (struct inode *, u_int, ufs2_daddr_t, int, int, allocfcn_t *); 122207141Sjeffstatic ufs2_daddr_t ffs_nodealloccg(struct inode *, u_int, ufs2_daddr_t, int, 123207141Sjeff int); 12498542Smckusickstatic ufs1_daddr_t ffs_mapsearch(struct fs *, struct cg *, ufs2_daddr_t, int); 12598542Smckusickstatic int ffs_reallocblks_ufs1(struct vop_reallocblks_args *); 12698542Smckusickstatic int ffs_reallocblks_ufs2(struct vop_reallocblks_args *); 1271541Srgrimes 1281541Srgrimes/* 12996755Strhodes * Allocate a block in the filesystem. 1308876Srgrimes * 1311541Srgrimes * The size of the requested block is given, which must be some 1321541Srgrimes * multiple of fs_fsize and <= fs_bsize. 1331541Srgrimes * A preference may be optionally specified. 
If a preference is given 1341541Srgrimes * the following hierarchy is used to allocate a block: 1351541Srgrimes * 1) allocate the requested block. 1361541Srgrimes * 2) allocate a rotationally optimal block in the same cylinder. 1371541Srgrimes * 3) allocate a block in the same cylinder group. 1381541Srgrimes * 4) quadradically rehash into other cylinder groups, until an 1391541Srgrimes * available block is located. 140166051Smpp * If no block preference is given the following hierarchy is used 1411541Srgrimes * to allocate a block: 1421541Srgrimes * 1) allocate a block in the cylinder group that contains the 1431541Srgrimes * inode for the file. 1441541Srgrimes * 2) quadradically rehash into other cylinder groups, until an 1451541Srgrimes * available block is located. 1461541Srgrimes */ 1471549Srgrimesint 148187790Srwatsonffs_alloc(ip, lbn, bpref, size, flags, cred, bnp) 14996506Sphk struct inode *ip; 15098542Smckusick ufs2_daddr_t lbn, bpref; 151187790Srwatson int size, flags; 1521541Srgrimes struct ucred *cred; 15398542Smckusick ufs2_daddr_t *bnp; 1541541Srgrimes{ 15596506Sphk struct fs *fs; 156140704Sjeff struct ufsmount *ump; 15798542Smckusick ufs2_daddr_t bno; 158203763Smckusick u_int cg, reclaimed; 159151906Sps static struct timeval lastfail; 160151906Sps static int curfail; 161166924Sbrian int64_t delta; 1626357Sphk#ifdef QUOTA 1636357Sphk int error; 1646357Sphk#endif 1658876Srgrimes 1661541Srgrimes *bnp = 0; 1671541Srgrimes fs = ip->i_fs; 168140704Sjeff ump = ip->i_ump; 169140704Sjeff mtx_assert(UFS_MTX(ump), MA_OWNED); 170173464Sobrien#ifdef INVARIANTS 1711541Srgrimes if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) { 17250253Sbde printf("dev = %s, bsize = %ld, size = %d, fs = %s\n", 17350253Sbde devtoname(ip->i_dev), (long)fs->fs_bsize, size, 17450253Sbde fs->fs_fsmnt); 1751541Srgrimes panic("ffs_alloc: bad size"); 1761541Srgrimes } 1771541Srgrimes if (cred == NOCRED) 1787170Sdg panic("ffs_alloc: missing credential"); 179173464Sobrien#endif /* 
INVARIANTS */ 18089637Smckusick reclaimed = 0; 18189637Smckusickretry: 182140704Sjeff#ifdef QUOTA 183140704Sjeff UFS_UNLOCK(ump); 184140704Sjeff error = chkdq(ip, btodb(size), cred, 0); 185140704Sjeff if (error) 186140704Sjeff return (error); 187140704Sjeff UFS_LOCK(ump); 188140704Sjeff#endif 1891541Srgrimes if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0) 1901541Srgrimes goto nospace; 191170587Srwatson if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0) && 19229609Sphk freespace(fs, fs->fs_minfree) - numfrags(fs, size) < 0) 1931541Srgrimes goto nospace; 1941541Srgrimes if (bpref >= fs->fs_size) 1951541Srgrimes bpref = 0; 1961541Srgrimes if (bpref == 0) 1971541Srgrimes cg = ino_to_cg(fs, ip->i_number); 1981541Srgrimes else 1991541Srgrimes cg = dtog(fs, bpref); 200207141Sjeff bno = ffs_hashalloc(ip, cg, bpref, size, size, ffs_alloccg); 2011541Srgrimes if (bno > 0) { 202166924Sbrian delta = btodb(size); 203166924Sbrian DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); 204187790Srwatson if (flags & IO_EXT) 205187790Srwatson ip->i_flag |= IN_CHANGE; 206187790Srwatson else 207187790Srwatson ip->i_flag |= IN_CHANGE | IN_UPDATE; 2081541Srgrimes *bnp = bno; 2091541Srgrimes return (0); 2101541Srgrimes } 211166142Smppnospace: 2121541Srgrimes#ifdef QUOTA 213140704Sjeff UFS_UNLOCK(ump); 2141541Srgrimes /* 2151541Srgrimes * Restore user's disk quota because allocation failed. 
2161541Srgrimes */ 21798542Smckusick (void) chkdq(ip, -btodb(size), cred, FORCE); 218140704Sjeff UFS_LOCK(ump); 2191541Srgrimes#endif 220222958Sjeff if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) { 22189637Smckusick reclaimed = 1; 222220374Smckusick softdep_request_cleanup(fs, ITOV(ip), cred, FLUSH_BLOCKS_WAIT); 22389637Smckusick goto retry; 22489637Smckusick } 225140704Sjeff UFS_UNLOCK(ump); 226223114Smckusick if (reclaimed > 0 && ppsratecheck(&lastfail, &curfail, 1)) { 227151906Sps ffs_fserr(fs, ip->i_number, "filesystem full"); 228151906Sps uprintf("\n%s: write failed, filesystem is full\n", 229151906Sps fs->fs_fsmnt); 230151906Sps } 2311541Srgrimes return (ENOSPC); 2321541Srgrimes} 2331541Srgrimes 2341541Srgrimes/* 2351541Srgrimes * Reallocate a fragment to a bigger size 2361541Srgrimes * 2371541Srgrimes * The number and size of the old block is given, and a preference 2381541Srgrimes * and new size is also specified. The allocator attempts to extend 2391541Srgrimes * the original block. Failing that, the regular block allocator is 2401541Srgrimes * invoked to get an appropriate block. 
2411541Srgrimes */ 2421549Srgrimesint 243187790Srwatsonffs_realloccg(ip, lbprev, bprev, bpref, osize, nsize, flags, cred, bpp) 24496506Sphk struct inode *ip; 24598542Smckusick ufs2_daddr_t lbprev; 246100344Smckusick ufs2_daddr_t bprev; 24798542Smckusick ufs2_daddr_t bpref; 248187790Srwatson int osize, nsize, flags; 2491541Srgrimes struct ucred *cred; 2501541Srgrimes struct buf **bpp; 2511541Srgrimes{ 25289637Smckusick struct vnode *vp; 25389637Smckusick struct fs *fs; 2541541Srgrimes struct buf *bp; 255140704Sjeff struct ufsmount *ump; 256203763Smckusick u_int cg, request, reclaimed; 257248521Skib int error, gbflags; 258100344Smckusick ufs2_daddr_t bno; 259151906Sps static struct timeval lastfail; 260151906Sps static int curfail; 261166924Sbrian int64_t delta; 2628876Srgrimes 2631541Srgrimes *bpp = 0; 26489637Smckusick vp = ITOV(ip); 2651541Srgrimes fs = ip->i_fs; 266140704Sjeff bp = NULL; 267140704Sjeff ump = ip->i_ump; 268248521Skib gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0; 269248521Skib 270140704Sjeff mtx_assert(UFS_MTX(ump), MA_OWNED); 271173464Sobrien#ifdef INVARIANTS 27289637Smckusick if (vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) 27362976Smckusick panic("ffs_realloccg: allocation on suspended filesystem"); 2741541Srgrimes if ((u_int)osize > fs->fs_bsize || fragoff(fs, osize) != 0 || 2751541Srgrimes (u_int)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) { 2761541Srgrimes printf( 27750253Sbde "dev = %s, bsize = %ld, osize = %d, nsize = %d, fs = %s\n", 27850253Sbde devtoname(ip->i_dev), (long)fs->fs_bsize, osize, 2798456Srgrimes nsize, fs->fs_fsmnt); 2801541Srgrimes panic("ffs_realloccg: bad size"); 2811541Srgrimes } 2821541Srgrimes if (cred == NOCRED) 2837170Sdg panic("ffs_realloccg: missing credential"); 284173464Sobrien#endif /* INVARIANTS */ 28589637Smckusick reclaimed = 0; 28689637Smckusickretry: 287170587Srwatson if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0) && 288140704Sjeff freespace(fs, fs->fs_minfree) - numfrags(fs, nsize - 
osize) < 0) { 2891541Srgrimes goto nospace; 290140704Sjeff } 291100344Smckusick if (bprev == 0) { 29298687Smux printf("dev = %s, bsize = %ld, bprev = %jd, fs = %s\n", 29398542Smckusick devtoname(ip->i_dev), (long)fs->fs_bsize, (intmax_t)bprev, 29437555Sbde fs->fs_fsmnt); 2951541Srgrimes panic("ffs_realloccg: bad bprev"); 2961541Srgrimes } 297140704Sjeff UFS_UNLOCK(ump); 2981541Srgrimes /* 2991541Srgrimes * Allocate the extra space in the buffer. 3001541Srgrimes */ 301248521Skib error = bread_gb(vp, lbprev, osize, NOCRED, gbflags, &bp); 3023487Sphk if (error) { 3031541Srgrimes brelse(bp); 3041541Srgrimes return (error); 3051541Srgrimes } 3066864Sdg 30798542Smckusick if (bp->b_blkno == bp->b_lblkno) { 30898542Smckusick if (lbprev >= NDADDR) 3096864Sdg panic("ffs_realloccg: lbprev out of range"); 3106864Sdg bp->b_blkno = fsbtodb(fs, bprev); 3116864Sdg } 3128876Srgrimes 3131541Srgrimes#ifdef QUOTA 31498542Smckusick error = chkdq(ip, btodb(nsize - osize), cred, 0); 3153487Sphk if (error) { 3161541Srgrimes brelse(bp); 3171541Srgrimes return (error); 3181541Srgrimes } 3191541Srgrimes#endif 3201541Srgrimes /* 3211541Srgrimes * Check for extension in the existing location. 
3221541Srgrimes */ 3231541Srgrimes cg = dtog(fs, bprev); 324140704Sjeff UFS_LOCK(ump); 32598542Smckusick bno = ffs_fragextend(ip, cg, bprev, osize, nsize); 3263487Sphk if (bno) { 3271541Srgrimes if (bp->b_blkno != fsbtodb(fs, bno)) 32823560Smpp panic("ffs_realloccg: bad blockno"); 329166924Sbrian delta = btodb(nsize - osize); 330166924Sbrian DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); 331187790Srwatson if (flags & IO_EXT) 332187790Srwatson ip->i_flag |= IN_CHANGE; 333187790Srwatson else 334187790Srwatson ip->i_flag |= IN_CHANGE | IN_UPDATE; 3357399Sdg allocbuf(bp, nsize); 3361541Srgrimes bp->b_flags |= B_DONE; 337248521Skib vfs_bio_bzero_buf(bp, osize, nsize - osize); 338192260Salc if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO) 339192260Salc vfs_bio_set_valid(bp, osize, nsize - osize); 3401541Srgrimes *bpp = bp; 3411541Srgrimes return (0); 3421541Srgrimes } 3431541Srgrimes /* 3441541Srgrimes * Allocate a new disk location. 3451541Srgrimes */ 3461541Srgrimes if (bpref >= fs->fs_size) 3471541Srgrimes bpref = 0; 3481541Srgrimes switch ((int)fs->fs_optim) { 3491541Srgrimes case FS_OPTSPACE: 3501541Srgrimes /* 3518876Srgrimes * Allocate an exact sized fragment. Although this makes 3528876Srgrimes * best use of space, we will waste time relocating it if 3531541Srgrimes * the file continues to grow. If the fragmentation is 3541541Srgrimes * less than half of the minimum free reserve, we choose 3551541Srgrimes * to begin optimizing for time. 
3561541Srgrimes */ 3571541Srgrimes request = nsize; 3586993Sdg if (fs->fs_minfree <= 5 || 3591541Srgrimes fs->fs_cstotal.cs_nffree > 36058087Smckusick (off_t)fs->fs_dsize * fs->fs_minfree / (2 * 100)) 3611541Srgrimes break; 3621541Srgrimes log(LOG_NOTICE, "%s: optimization changed from SPACE to TIME\n", 3631541Srgrimes fs->fs_fsmnt); 3641541Srgrimes fs->fs_optim = FS_OPTTIME; 3651541Srgrimes break; 3661541Srgrimes case FS_OPTTIME: 3671541Srgrimes /* 3681541Srgrimes * At this point we have discovered a file that is trying to 3691541Srgrimes * grow a small fragment to a larger fragment. To save time, 3701541Srgrimes * we allocate a full sized block, then free the unused portion. 3711541Srgrimes * If the file continues to grow, the `ffs_fragextend' call 3721541Srgrimes * above will be able to grow it in place without further 3731541Srgrimes * copying. If aberrant programs cause disk fragmentation to 3741541Srgrimes * grow within 2% of the free reserve, we choose to begin 3751541Srgrimes * optimizing for space. 
3761541Srgrimes */ 3771541Srgrimes request = fs->fs_bsize; 3781541Srgrimes if (fs->fs_cstotal.cs_nffree < 37958087Smckusick (off_t)fs->fs_dsize * (fs->fs_minfree - 2) / 100) 3801541Srgrimes break; 3811541Srgrimes log(LOG_NOTICE, "%s: optimization changed from TIME to SPACE\n", 3821541Srgrimes fs->fs_fsmnt); 3831541Srgrimes fs->fs_optim = FS_OPTSPACE; 3841541Srgrimes break; 3851541Srgrimes default: 38650253Sbde printf("dev = %s, optim = %ld, fs = %s\n", 38750253Sbde devtoname(ip->i_dev), (long)fs->fs_optim, fs->fs_fsmnt); 3881541Srgrimes panic("ffs_realloccg: bad optim"); 3891541Srgrimes /* NOTREACHED */ 3901541Srgrimes } 391207141Sjeff bno = ffs_hashalloc(ip, cg, bpref, request, nsize, ffs_alloccg); 3921541Srgrimes if (bno > 0) { 3931541Srgrimes bp->b_blkno = fsbtodb(fs, bno); 39489637Smckusick if (!DOINGSOFTDEP(vp)) 395140704Sjeff ffs_blkfree(ump, fs, ip->i_devvp, bprev, (long)osize, 396223127Smckusick ip->i_number, vp->v_type, NULL); 397166924Sbrian delta = btodb(nsize - osize); 398166924Sbrian DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); 399187790Srwatson if (flags & IO_EXT) 400187790Srwatson ip->i_flag |= IN_CHANGE; 401187790Srwatson else 402187790Srwatson ip->i_flag |= IN_CHANGE | IN_UPDATE; 4037399Sdg allocbuf(bp, nsize); 4041541Srgrimes bp->b_flags |= B_DONE; 405248521Skib vfs_bio_bzero_buf(bp, osize, nsize - osize); 406192260Salc if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO) 407192260Salc vfs_bio_set_valid(bp, osize, nsize - osize); 4081541Srgrimes *bpp = bp; 4091541Srgrimes return (0); 4101541Srgrimes } 4111541Srgrimes#ifdef QUOTA 412140704Sjeff UFS_UNLOCK(ump); 4131541Srgrimes /* 4141541Srgrimes * Restore user's disk quota because allocation failed. 
4151541Srgrimes */ 41698542Smckusick (void) chkdq(ip, -btodb(nsize - osize), cred, FORCE); 417140704Sjeff UFS_LOCK(ump); 4181541Srgrimes#endif 4191541Srgrimesnospace: 4201541Srgrimes /* 4211541Srgrimes * no space available 4221541Srgrimes */ 423222958Sjeff if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) { 42489637Smckusick reclaimed = 1; 425140704Sjeff UFS_UNLOCK(ump); 426203818Skib if (bp) { 427140704Sjeff brelse(bp); 428203818Skib bp = NULL; 429203818Skib } 430140704Sjeff UFS_LOCK(ump); 431222724Smckusick softdep_request_cleanup(fs, vp, cred, FLUSH_BLOCKS_WAIT); 43289637Smckusick goto retry; 43389637Smckusick } 434140704Sjeff UFS_UNLOCK(ump); 435140704Sjeff if (bp) 436140704Sjeff brelse(bp); 437223114Smckusick if (reclaimed > 0 && ppsratecheck(&lastfail, &curfail, 1)) { 438151906Sps ffs_fserr(fs, ip->i_number, "filesystem full"); 439151906Sps uprintf("\n%s: write failed, filesystem is full\n", 440151906Sps fs->fs_fsmnt); 441151906Sps } 4421541Srgrimes return (ENOSPC); 4431541Srgrimes} 4441541Srgrimes 4451541Srgrimes/* 4461541Srgrimes * Reallocate a sequence of blocks into a contiguous sequence of blocks. 4471541Srgrimes * 4481541Srgrimes * The vnode and an array of buffer pointers for a range of sequential 4491541Srgrimes * logical blocks to be made contiguous is given. The allocator attempts 45098542Smckusick * to find a range of sequential blocks starting as close as possible 45198542Smckusick * from the end of the allocation for the logical block immediately 45298542Smckusick * preceding the current range. If successful, the physical block numbers 45398542Smckusick * in the buffer pointers and in the inode are changed to reflect the new 45498542Smckusick * allocation. If unsuccessful, the allocation is left unchanged. The 45598542Smckusick * success in doing the reallocation is returned. Note that the error 45698542Smckusick * return is not reflected back to the user. Rather the previous block 45798542Smckusick * allocation will be used. 
4581541Srgrimes */ 45974548Smckusick 46074548SmckusickSYSCTL_NODE(_vfs, OID_AUTO, ffs, CTLFLAG_RW, 0, "FFS filesystem"); 46174548Smckusick 46212911Sphkstatic int doasyncfree = 1; 46374548SmckusickSYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncfree, CTLFLAG_RW, &doasyncfree, 0, ""); 46422521Sdyson 46531352Sbdestatic int doreallocblks = 1; 46674548SmckusickSYSCTL_INT(_vfs_ffs, OID_AUTO, doreallocblks, CTLFLAG_RW, &doreallocblks, 0, ""); 46722521Sdyson 46842351Sbde#ifdef DEBUG 46942351Sbdestatic volatile int prtrealloc = 0; 47042351Sbde#endif 47131351Sbde 4721541Srgrimesint 4731541Srgrimesffs_reallocblks(ap) 4741541Srgrimes struct vop_reallocblks_args /* { 4751541Srgrimes struct vnode *a_vp; 4761541Srgrimes struct cluster_save *a_buflist; 4771541Srgrimes } */ *ap; 4781541Srgrimes{ 47998542Smckusick 48098542Smckusick if (doreallocblks == 0) 48198542Smckusick return (ENOSPC); 482207141Sjeff /* 483207141Sjeff * We can't wait in softdep prealloc as it may fsync and recurse 484207141Sjeff * here. Instead we simply fail to reallocate blocks if this 485207141Sjeff * rare condition arises. 
486207141Sjeff */ 487207141Sjeff if (DOINGSOFTDEP(ap->a_vp)) 488207141Sjeff if (softdep_prealloc(ap->a_vp, MNT_NOWAIT) != 0) 489207141Sjeff return (ENOSPC); 49098542Smckusick if (VTOI(ap->a_vp)->i_ump->um_fstype == UFS1) 49198542Smckusick return (ffs_reallocblks_ufs1(ap)); 49298542Smckusick return (ffs_reallocblks_ufs2(ap)); 49398542Smckusick} 49498542Smckusick 49598542Smckusickstatic int 49698542Smckusickffs_reallocblks_ufs1(ap) 49798542Smckusick struct vop_reallocblks_args /* { 49898542Smckusick struct vnode *a_vp; 49998542Smckusick struct cluster_save *a_buflist; 50098542Smckusick } */ *ap; 50198542Smckusick{ 5021541Srgrimes struct fs *fs; 5031541Srgrimes struct inode *ip; 5041541Srgrimes struct vnode *vp; 5051541Srgrimes struct buf *sbp, *ebp; 50698542Smckusick ufs1_daddr_t *bap, *sbap, *ebap = 0; 5071541Srgrimes struct cluster_save *buflist; 508140704Sjeff struct ufsmount *ump; 50998542Smckusick ufs_lbn_t start_lbn, end_lbn; 51098542Smckusick ufs1_daddr_t soff, newblk, blkno; 51198542Smckusick ufs2_daddr_t pref; 5121541Srgrimes struct indir start_ap[NIADDR + 1], end_ap[NIADDR + 1], *idp; 51398542Smckusick int i, len, start_lvl, end_lvl, ssize; 5141541Srgrimes 5151541Srgrimes vp = ap->a_vp; 5161541Srgrimes ip = VTOI(vp); 5171541Srgrimes fs = ip->i_fs; 518140704Sjeff ump = ip->i_ump; 519254995Smckusick /* 520260828Smckusick * If we are not tracking block clusters or if we have less than 4% 521254995Smckusick * free blocks left, then do not attempt to cluster. Running with 522254995Smckusick * less than 5% free block reserve is not recommended and those that 523254995Smckusick * choose to do so do not expect to have good file layout. 
524254995Smckusick */ 525260828Smckusick if (fs->fs_contigsumsize <= 0 || freespace(fs, 4) < 0) 5261541Srgrimes return (ENOSPC); 5271541Srgrimes buflist = ap->a_buflist; 5281541Srgrimes len = buflist->bs_nchildren; 5291541Srgrimes start_lbn = buflist->bs_children[0]->b_lblkno; 5301541Srgrimes end_lbn = start_lbn + len - 1; 531173464Sobrien#ifdef INVARIANTS 53222521Sdyson for (i = 0; i < len; i++) 53322521Sdyson if (!ffs_checkblk(ip, 53422521Sdyson dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) 53522521Sdyson panic("ffs_reallocblks: unallocated block 1"); 5361541Srgrimes for (i = 1; i < len; i++) 5371541Srgrimes if (buflist->bs_children[i]->b_lblkno != start_lbn + i) 53822521Sdyson panic("ffs_reallocblks: non-logical cluster"); 53922521Sdyson blkno = buflist->bs_children[0]->b_blkno; 54022521Sdyson ssize = fsbtodb(fs, fs->fs_frag); 54122521Sdyson for (i = 1; i < len - 1; i++) 54222521Sdyson if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize)) 54322521Sdyson panic("ffs_reallocblks: non-physical cluster %d", i); 5441541Srgrimes#endif 5451541Srgrimes /* 546242520Smckusick * If the cluster crosses the boundary for the first indirect 547242520Smckusick * block, leave space for the indirect block. Indirect blocks 548242520Smckusick * are initially laid out in a position after the last direct 549242520Smckusick * block. Block reallocation would usually destroy locality by 550242520Smckusick * moving the indirect block out of the way to make room for 551242520Smckusick * data blocks if we didn't compensate here. We should also do 552242520Smckusick * this for other indirect block boundaries, but it is only 553242520Smckusick * important for the first one. 
554242520Smckusick */ 555242520Smckusick if (start_lbn < NDADDR && end_lbn >= NDADDR) 556242520Smckusick return (ENOSPC); 557242520Smckusick /* 5581541Srgrimes * If the latest allocation is in a new cylinder group, assume that 5591541Srgrimes * the filesystem has decided to move and do not force it back to 5601541Srgrimes * the previous cylinder group. 5611541Srgrimes */ 5621541Srgrimes if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) != 5631541Srgrimes dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno))) 5641541Srgrimes return (ENOSPC); 5651541Srgrimes if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) || 5661541Srgrimes ufs_getlbns(vp, end_lbn, end_ap, &end_lvl)) 5671541Srgrimes return (ENOSPC); 5681541Srgrimes /* 5691541Srgrimes * Get the starting offset and block map for the first block. 5701541Srgrimes */ 5711541Srgrimes if (start_lvl == 0) { 57298542Smckusick sbap = &ip->i_din1->di_db[0]; 5731541Srgrimes soff = start_lbn; 5741541Srgrimes } else { 5751541Srgrimes idp = &start_ap[start_lvl - 1]; 5761541Srgrimes if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) { 5771541Srgrimes brelse(sbp); 5781541Srgrimes return (ENOSPC); 5791541Srgrimes } 58098542Smckusick sbap = (ufs1_daddr_t *)sbp->b_data; 5811541Srgrimes soff = idp->in_off; 5821541Srgrimes } 5831541Srgrimes /* 5841541Srgrimes * If the block range spans two block maps, get the second map. 
5851541Srgrimes */ 5861541Srgrimes if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) { 5871541Srgrimes ssize = len; 5881541Srgrimes } else { 589173464Sobrien#ifdef INVARIANTS 590174126Skensmith if (start_lvl > 0 && 591174126Skensmith start_ap[start_lvl - 1].in_lbn == idp->in_lbn) 5921541Srgrimes panic("ffs_reallocblk: start == end"); 5931541Srgrimes#endif 5941541Srgrimes ssize = len - (idp->in_off + 1); 5951541Srgrimes if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp)) 5961541Srgrimes goto fail; 59798542Smckusick ebap = (ufs1_daddr_t *)ebp->b_data; 5981541Srgrimes } 5991541Srgrimes /* 600140704Sjeff * Find the preferred location for the cluster. 601140704Sjeff */ 602140704Sjeff UFS_LOCK(ump); 603140704Sjeff pref = ffs_blkpref_ufs1(ip, start_lbn, soff, sbap); 604140704Sjeff /* 6051541Srgrimes * Search the block map looking for an allocation of the desired size. 6061541Srgrimes */ 60798542Smckusick if ((newblk = ffs_hashalloc(ip, dtog(fs, pref), pref, 608207141Sjeff len, len, ffs_clusteralloc)) == 0) { 609140704Sjeff UFS_UNLOCK(ump); 6101541Srgrimes goto fail; 611140704Sjeff } 6121541Srgrimes /* 6131541Srgrimes * We have found a new contiguous block. 6141541Srgrimes * 6151541Srgrimes * First we have to replace the old block pointers with the new 6161541Srgrimes * block pointers in the inode and indirect blocks associated 6171541Srgrimes * with the file. 
6181541Srgrimes */ 61922521Sdyson#ifdef DEBUG 62022521Sdyson if (prtrealloc) 621241011Smdf printf("realloc: ino %ju, lbns %jd-%jd\n\told:", 622241011Smdf (uintmax_t)ip->i_number, 62398542Smckusick (intmax_t)start_lbn, (intmax_t)end_lbn); 62422521Sdyson#endif 6251541Srgrimes blkno = newblk; 6261541Srgrimes for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) { 62734266Sjulian if (i == ssize) { 6281541Srgrimes bap = ebap; 62934266Sjulian soff = -i; 63034266Sjulian } 631173464Sobrien#ifdef INVARIANTS 63222521Sdyson if (!ffs_checkblk(ip, 63322521Sdyson dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) 63422521Sdyson panic("ffs_reallocblks: unallocated block 2"); 63522521Sdyson if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap) 6361541Srgrimes panic("ffs_reallocblks: alloc mismatch"); 6371541Srgrimes#endif 63822521Sdyson#ifdef DEBUG 63922521Sdyson if (prtrealloc) 64022521Sdyson printf(" %d,", *bap); 64122521Sdyson#endif 64234266Sjulian if (DOINGSOFTDEP(vp)) { 64398542Smckusick if (sbap == &ip->i_din1->di_db[0] && i < ssize) 64434266Sjulian softdep_setup_allocdirect(ip, start_lbn + i, 64534266Sjulian blkno, *bap, fs->fs_bsize, fs->fs_bsize, 64634266Sjulian buflist->bs_children[i]); 64734266Sjulian else 64834266Sjulian softdep_setup_allocindir_page(ip, start_lbn + i, 64934266Sjulian i < ssize ? sbp : ebp, soff + i, blkno, 65034266Sjulian *bap, buflist->bs_children[i]); 65134266Sjulian } 6521541Srgrimes *bap++ = blkno; 6531541Srgrimes } 6541541Srgrimes /* 6551541Srgrimes * Next we must write out the modified inode and indirect blocks. 6561541Srgrimes * For strict correctness, the writes should be synchronous since 6571541Srgrimes * the old block values may have been written to disk. In practise 6588876Srgrimes * they are almost never written, but if we are concerned about 6591541Srgrimes * strict correctness, the `doasyncfree' flag should be set to zero. 
6601541Srgrimes * 6611541Srgrimes * The test on `doasyncfree' should be changed to test a flag 6621541Srgrimes * that shows whether the associated buffers and inodes have 6631541Srgrimes * been written. The flag should be set when the cluster is 6641541Srgrimes * started and cleared whenever the buffer or inode is flushed. 6651541Srgrimes * We can then check below to see if it is set, and do the 6661541Srgrimes * synchronous write only when it has been cleared. 6671541Srgrimes */ 66898542Smckusick if (sbap != &ip->i_din1->di_db[0]) { 6691541Srgrimes if (doasyncfree) 6701541Srgrimes bdwrite(sbp); 6711541Srgrimes else 6721541Srgrimes bwrite(sbp); 6731541Srgrimes } else { 6741541Srgrimes ip->i_flag |= IN_CHANGE | IN_UPDATE; 67542374Sbde if (!doasyncfree) 676141526Sphk ffs_update(vp, 1); 6771541Srgrimes } 67846568Speter if (ssize < len) { 6791541Srgrimes if (doasyncfree) 6801541Srgrimes bdwrite(ebp); 6811541Srgrimes else 6821541Srgrimes bwrite(ebp); 68346568Speter } 6841541Srgrimes /* 6851541Srgrimes * Last, free the old blocks and assign the new blocks to the buffers. 
6861541Srgrimes */ 68722521Sdyson#ifdef DEBUG 68822521Sdyson if (prtrealloc) 68922521Sdyson printf("\n\tnew:"); 69022521Sdyson#endif 6911541Srgrimes for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) { 69234266Sjulian if (!DOINGSOFTDEP(vp)) 693140704Sjeff ffs_blkfree(ump, fs, ip->i_devvp, 69434266Sjulian dbtofsb(fs, buflist->bs_children[i]->b_blkno), 695223127Smckusick fs->fs_bsize, ip->i_number, vp->v_type, NULL); 6961541Srgrimes buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno); 697173464Sobrien#ifdef INVARIANTS 69822521Sdyson if (!ffs_checkblk(ip, 69922521Sdyson dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) 70022521Sdyson panic("ffs_reallocblks: unallocated block 3"); 70150305Ssheldonh#endif 70250305Ssheldonh#ifdef DEBUG 70322521Sdyson if (prtrealloc) 70422521Sdyson printf(" %d,", blkno); 70522521Sdyson#endif 7061541Srgrimes } 70722521Sdyson#ifdef DEBUG 70822521Sdyson if (prtrealloc) { 70922521Sdyson prtrealloc--; 71022521Sdyson printf("\n"); 71122521Sdyson } 71222521Sdyson#endif 7131541Srgrimes return (0); 7141541Srgrimes 7151541Srgrimesfail: 7161541Srgrimes if (ssize < len) 7171541Srgrimes brelse(ebp); 71898542Smckusick if (sbap != &ip->i_din1->di_db[0]) 7191541Srgrimes brelse(sbp); 7201541Srgrimes return (ENOSPC); 7211541Srgrimes} 7221541Srgrimes 72398542Smckusickstatic int 72498542Smckusickffs_reallocblks_ufs2(ap) 72598542Smckusick struct vop_reallocblks_args /* { 72698542Smckusick struct vnode *a_vp; 72798542Smckusick struct cluster_save *a_buflist; 72898542Smckusick } */ *ap; 72998542Smckusick{ 73098542Smckusick struct fs *fs; 73198542Smckusick struct inode *ip; 73298542Smckusick struct vnode *vp; 73398542Smckusick struct buf *sbp, *ebp; 73498542Smckusick ufs2_daddr_t *bap, *sbap, *ebap = 0; 73598542Smckusick struct cluster_save *buflist; 736140704Sjeff struct ufsmount *ump; 73798542Smckusick ufs_lbn_t start_lbn, end_lbn; 73898542Smckusick ufs2_daddr_t soff, newblk, blkno, pref; 73998542Smckusick struct indir 
start_ap[NIADDR + 1], end_ap[NIADDR + 1], *idp; 74098542Smckusick int i, len, start_lvl, end_lvl, ssize; 74198542Smckusick 74298542Smckusick vp = ap->a_vp; 74398542Smckusick ip = VTOI(vp); 74498542Smckusick fs = ip->i_fs; 745140704Sjeff ump = ip->i_ump; 746254995Smckusick /* 747260828Smckusick * If we are not tracking block clusters or if we have less than 4% 748254995Smckusick * free blocks left, then do not attempt to cluster. Running with 749254995Smckusick * less than 5% free block reserve is not recommended and those that 750254995Smckusick * choose to do so do not expect to have good file layout. 751254995Smckusick */ 752260828Smckusick if (fs->fs_contigsumsize <= 0 || freespace(fs, 4) < 0) 75398542Smckusick return (ENOSPC); 75498542Smckusick buflist = ap->a_buflist; 75598542Smckusick len = buflist->bs_nchildren; 75698542Smckusick start_lbn = buflist->bs_children[0]->b_lblkno; 75798542Smckusick end_lbn = start_lbn + len - 1; 758173464Sobrien#ifdef INVARIANTS 75998542Smckusick for (i = 0; i < len; i++) 76098542Smckusick if (!ffs_checkblk(ip, 76198542Smckusick dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) 76298542Smckusick panic("ffs_reallocblks: unallocated block 1"); 76398542Smckusick for (i = 1; i < len; i++) 76498542Smckusick if (buflist->bs_children[i]->b_lblkno != start_lbn + i) 76598542Smckusick panic("ffs_reallocblks: non-logical cluster"); 76698542Smckusick blkno = buflist->bs_children[0]->b_blkno; 76798542Smckusick ssize = fsbtodb(fs, fs->fs_frag); 76898542Smckusick for (i = 1; i < len - 1; i++) 76998542Smckusick if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize)) 77098542Smckusick panic("ffs_reallocblks: non-physical cluster %d", i); 77198542Smckusick#endif 77298542Smckusick /* 773242520Smckusick * If the cluster crosses the boundary for the first indirect 774242520Smckusick * block, do not move anything in it. 
Indirect blocks are 775242520Smckusick * usually initially laid out in a position between the data 776242520Smckusick * blocks. Block reallocation would usually destroy locality by 777242520Smckusick * moving the indirect block out of the way to make room for 778242520Smckusick * data blocks if we didn't compensate here. We should also do 779242520Smckusick * this for other indirect block boundaries, but it is only 780242520Smckusick * important for the first one. 781242520Smckusick */ 782242520Smckusick if (start_lbn < NDADDR && end_lbn >= NDADDR) 783242520Smckusick return (ENOSPC); 784242520Smckusick /* 78598542Smckusick * If the latest allocation is in a new cylinder group, assume that 78698542Smckusick * the filesystem has decided to move and do not force it back to 78798542Smckusick * the previous cylinder group. 78898542Smckusick */ 78998542Smckusick if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) != 79098542Smckusick dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno))) 79198542Smckusick return (ENOSPC); 79298542Smckusick if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) || 79398542Smckusick ufs_getlbns(vp, end_lbn, end_ap, &end_lvl)) 79498542Smckusick return (ENOSPC); 79598542Smckusick /* 79698542Smckusick * Get the starting offset and block map for the first block. 79798542Smckusick */ 79898542Smckusick if (start_lvl == 0) { 79998542Smckusick sbap = &ip->i_din2->di_db[0]; 80098542Smckusick soff = start_lbn; 80198542Smckusick } else { 80298542Smckusick idp = &start_ap[start_lvl - 1]; 80398542Smckusick if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) { 80498542Smckusick brelse(sbp); 80598542Smckusick return (ENOSPC); 80698542Smckusick } 80798542Smckusick sbap = (ufs2_daddr_t *)sbp->b_data; 80898542Smckusick soff = idp->in_off; 80998542Smckusick } 81098542Smckusick /* 81198542Smckusick * If the block range spans two block maps, get the second map. 
81298542Smckusick */ 81398542Smckusick if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) { 81498542Smckusick ssize = len; 81598542Smckusick } else { 816173464Sobrien#ifdef INVARIANTS 817174126Skensmith if (start_lvl > 0 && 818174126Skensmith start_ap[start_lvl - 1].in_lbn == idp->in_lbn) 81998542Smckusick panic("ffs_reallocblk: start == end"); 82098542Smckusick#endif 82198542Smckusick ssize = len - (idp->in_off + 1); 82298542Smckusick if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp)) 82398542Smckusick goto fail; 82498542Smckusick ebap = (ufs2_daddr_t *)ebp->b_data; 82598542Smckusick } 82698542Smckusick /* 827140704Sjeff * Find the preferred location for the cluster. 828140704Sjeff */ 829140704Sjeff UFS_LOCK(ump); 830140704Sjeff pref = ffs_blkpref_ufs2(ip, start_lbn, soff, sbap); 831140704Sjeff /* 83298542Smckusick * Search the block map looking for an allocation of the desired size. 83398542Smckusick */ 83498542Smckusick if ((newblk = ffs_hashalloc(ip, dtog(fs, pref), pref, 835207141Sjeff len, len, ffs_clusteralloc)) == 0) { 836140704Sjeff UFS_UNLOCK(ump); 83798542Smckusick goto fail; 838140704Sjeff } 83998542Smckusick /* 84098542Smckusick * We have found a new contiguous block. 84198542Smckusick * 84298542Smckusick * First we have to replace the old block pointers with the new 84398542Smckusick * block pointers in the inode and indirect blocks associated 84498542Smckusick * with the file. 
84598542Smckusick */ 84698542Smckusick#ifdef DEBUG 84798542Smckusick if (prtrealloc) 848103594Sobrien printf("realloc: ino %d, lbns %jd-%jd\n\told:", ip->i_number, 84998542Smckusick (intmax_t)start_lbn, (intmax_t)end_lbn); 85098542Smckusick#endif 85198542Smckusick blkno = newblk; 85298542Smckusick for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) { 85398542Smckusick if (i == ssize) { 85498542Smckusick bap = ebap; 85598542Smckusick soff = -i; 85698542Smckusick } 857173464Sobrien#ifdef INVARIANTS 85898542Smckusick if (!ffs_checkblk(ip, 85998542Smckusick dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) 86098542Smckusick panic("ffs_reallocblks: unallocated block 2"); 86198542Smckusick if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap) 86298542Smckusick panic("ffs_reallocblks: alloc mismatch"); 86398542Smckusick#endif 86498542Smckusick#ifdef DEBUG 86598542Smckusick if (prtrealloc) 866103594Sobrien printf(" %jd,", (intmax_t)*bap); 86798542Smckusick#endif 86898542Smckusick if (DOINGSOFTDEP(vp)) { 86998542Smckusick if (sbap == &ip->i_din2->di_db[0] && i < ssize) 87098542Smckusick softdep_setup_allocdirect(ip, start_lbn + i, 87198542Smckusick blkno, *bap, fs->fs_bsize, fs->fs_bsize, 87298542Smckusick buflist->bs_children[i]); 87398542Smckusick else 87498542Smckusick softdep_setup_allocindir_page(ip, start_lbn + i, 87598542Smckusick i < ssize ? sbp : ebp, soff + i, blkno, 87698542Smckusick *bap, buflist->bs_children[i]); 87798542Smckusick } 87898542Smckusick *bap++ = blkno; 87998542Smckusick } 88098542Smckusick /* 88198542Smckusick * Next we must write out the modified inode and indirect blocks. 88298542Smckusick * For strict correctness, the writes should be synchronous since 88398542Smckusick * the old block values may have been written to disk. In practise 88498542Smckusick * they are almost never written, but if we are concerned about 88598542Smckusick * strict correctness, the `doasyncfree' flag should be set to zero. 
88698542Smckusick * 88798542Smckusick * The test on `doasyncfree' should be changed to test a flag 88898542Smckusick * that shows whether the associated buffers and inodes have 88998542Smckusick * been written. The flag should be set when the cluster is 89098542Smckusick * started and cleared whenever the buffer or inode is flushed. 89198542Smckusick * We can then check below to see if it is set, and do the 89298542Smckusick * synchronous write only when it has been cleared. 89398542Smckusick */ 89498542Smckusick if (sbap != &ip->i_din2->di_db[0]) { 89598542Smckusick if (doasyncfree) 89698542Smckusick bdwrite(sbp); 89798542Smckusick else 89898542Smckusick bwrite(sbp); 89998542Smckusick } else { 90098542Smckusick ip->i_flag |= IN_CHANGE | IN_UPDATE; 90198542Smckusick if (!doasyncfree) 902141526Sphk ffs_update(vp, 1); 90398542Smckusick } 90498542Smckusick if (ssize < len) { 90598542Smckusick if (doasyncfree) 90698542Smckusick bdwrite(ebp); 90798542Smckusick else 90898542Smckusick bwrite(ebp); 90998542Smckusick } 91098542Smckusick /* 91198542Smckusick * Last, free the old blocks and assign the new blocks to the buffers. 
91298542Smckusick */ 91398542Smckusick#ifdef DEBUG 91498542Smckusick if (prtrealloc) 91598542Smckusick printf("\n\tnew:"); 91698542Smckusick#endif 91798542Smckusick for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) { 91898542Smckusick if (!DOINGSOFTDEP(vp)) 919140704Sjeff ffs_blkfree(ump, fs, ip->i_devvp, 92098542Smckusick dbtofsb(fs, buflist->bs_children[i]->b_blkno), 921223127Smckusick fs->fs_bsize, ip->i_number, vp->v_type, NULL); 92298542Smckusick buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno); 923173464Sobrien#ifdef INVARIANTS 92498542Smckusick if (!ffs_checkblk(ip, 92598542Smckusick dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) 92698542Smckusick panic("ffs_reallocblks: unallocated block 3"); 92798542Smckusick#endif 92898542Smckusick#ifdef DEBUG 92998542Smckusick if (prtrealloc) 93099590Sbde printf(" %jd,", (intmax_t)blkno); 93198542Smckusick#endif 93298542Smckusick } 93398542Smckusick#ifdef DEBUG 93498542Smckusick if (prtrealloc) { 93598542Smckusick prtrealloc--; 93698542Smckusick printf("\n"); 93798542Smckusick } 93898542Smckusick#endif 93998542Smckusick return (0); 94098542Smckusick 94198542Smckusickfail: 94298542Smckusick if (ssize < len) 94398542Smckusick brelse(ebp); 94498542Smckusick if (sbap != &ip->i_din2->di_db[0]) 94598542Smckusick brelse(sbp); 94698542Smckusick return (ENOSPC); 94798542Smckusick} 94898542Smckusick 9491541Srgrimes/* 95096755Strhodes * Allocate an inode in the filesystem. 9518876Srgrimes * 9521541Srgrimes * If allocating a directory, use ffs_dirpref to select the inode. 9531541Srgrimes * If allocating in a directory, the following hierarchy is followed: 9541541Srgrimes * 1) allocate the preferred inode. 9551541Srgrimes * 2) allocate an inode in the same cylinder group. 9561541Srgrimes * 3) quadradically rehash into other cylinder groups, until an 9571541Srgrimes * available inode is located. 
958166051Smpp * If no inode preference is given the following hierarchy is used 9591541Srgrimes * to allocate an inode: 9601541Srgrimes * 1) allocate an inode in cylinder group 0. 9611541Srgrimes * 2) quadradically rehash into other cylinder groups, until an 9621541Srgrimes * available inode is located. 9631541Srgrimes */ 9641549Srgrimesint 96530474Sphkffs_valloc(pvp, mode, cred, vpp) 96630474Sphk struct vnode *pvp; 96730474Sphk int mode; 96830474Sphk struct ucred *cred; 96930474Sphk struct vnode **vpp; 9701541Srgrimes{ 97196506Sphk struct inode *pip; 97296506Sphk struct fs *fs; 97396506Sphk struct inode *ip; 97498542Smckusick struct timespec ts; 975140704Sjeff struct ufsmount *ump; 9761541Srgrimes ino_t ino, ipref; 977203763Smckusick u_int cg; 978219895Smckusick int error, error1, reclaimed; 979151906Sps static struct timeval lastfail; 980151906Sps static int curfail; 9818876Srgrimes 98230474Sphk *vpp = NULL; 9831541Srgrimes pip = VTOI(pvp); 9841541Srgrimes fs = pip->i_fs; 985140704Sjeff ump = pip->i_ump; 986140704Sjeff 987140704Sjeff UFS_LOCK(ump); 988219895Smckusick reclaimed = 0; 989219895Smckusickretry: 9901541Srgrimes if (fs->fs_cstotal.cs_nifree == 0) 9911541Srgrimes goto noinodes; 9921541Srgrimes 9931541Srgrimes if ((mode & IFMT) == IFDIR) 99475377Smckusick ipref = ffs_dirpref(pip); 9951541Srgrimes else 9961541Srgrimes ipref = pip->i_number; 997108010Smckusick if (ipref >= fs->fs_ncg * fs->fs_ipg) 9981541Srgrimes ipref = 0; 9991541Srgrimes cg = ino_to_cg(fs, ipref); 100075377Smckusick /* 100175377Smckusick * Track number of dirs created one after another 100275377Smckusick * in a same cg without intervening by files. 
100375377Smckusick */ 100475377Smckusick if ((mode & IFMT) == IFDIR) { 100575377Smckusick if (fs->fs_contigdirs[cg] < 255) 100675377Smckusick fs->fs_contigdirs[cg]++; 100775377Smckusick } else { 100875377Smckusick if (fs->fs_contigdirs[cg] > 0) 100975377Smckusick fs->fs_contigdirs[cg]--; 101075377Smckusick } 1011207141Sjeff ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0, 101212861Speter (allocfcn_t *)ffs_nodealloccg); 10131541Srgrimes if (ino == 0) 10141541Srgrimes goto noinodes; 1015141526Sphk error = ffs_vget(pvp->v_mount, ino, LK_EXCLUSIVE, vpp); 10161541Srgrimes if (error) { 1017182366Skib error1 = ffs_vgetf(pvp->v_mount, ino, LK_EXCLUSIVE, vpp, 1018182366Skib FFSV_FORCEINSMQ); 1019141526Sphk ffs_vfree(pvp, ino, mode); 1020182366Skib if (error1 == 0) { 1021182366Skib ip = VTOI(*vpp); 1022182366Skib if (ip->i_mode) 1023182366Skib goto dup_alloc; 1024182366Skib ip->i_flag |= IN_MODIFIED; 1025182366Skib vput(*vpp); 1026182366Skib } 10271541Srgrimes return (error); 10281541Srgrimes } 102930474Sphk ip = VTOI(*vpp); 10301541Srgrimes if (ip->i_mode) { 1031182366Skibdup_alloc: 103237555Sbde printf("mode = 0%o, inum = %lu, fs = %s\n", 103337555Sbde ip->i_mode, (u_long)ip->i_number, fs->fs_fsmnt); 10341541Srgrimes panic("ffs_valloc: dup alloc"); 10351541Srgrimes } 103698542Smckusick if (DIP(ip, i_blocks) && (fs->fs_flags & FS_UNCLEAN) == 0) { /* XXX */ 103737555Sbde printf("free inode %s/%lu had %ld blocks\n", 103898542Smckusick fs->fs_fsmnt, (u_long)ino, (long)DIP(ip, i_blocks)); 1039132775Skan DIP_SET(ip, i_blocks, 0); 10401541Srgrimes } 10411541Srgrimes ip->i_flags = 0; 1042132775Skan DIP_SET(ip, i_flags, 0); 10431541Srgrimes /* 10441541Srgrimes * Set up a new generation number for this inode. 
10451541Srgrimes */ 104631484Sbde if (ip->i_gen == 0 || ++ip->i_gen == 0) 1047110885Smckusick ip->i_gen = arc4random() / 2 + 1; 1048132775Skan DIP_SET(ip, i_gen, ip->i_gen); 104998542Smckusick if (fs->fs_magic == FS_UFS2_MAGIC) { 105098542Smckusick vfs_timestamp(&ts); 1051100201Smckusick ip->i_din2->di_birthtime = ts.tv_sec; 1052100201Smckusick ip->i_din2->di_birthnsec = ts.tv_nsec; 105398542Smckusick } 1054220985Skib ufs_prepare_reclaim(*vpp); 1055150891Struckman ip->i_flag = 0; 1056220985Skib (*vpp)->v_vflag = 0; 1057151176Stegge (*vpp)->v_type = VNON; 1058151176Stegge if (fs->fs_magic == FS_UFS2_MAGIC) 1059151176Stegge (*vpp)->v_op = &ffs_vnodeops2; 1060151176Stegge else 1061151176Stegge (*vpp)->v_op = &ffs_vnodeops1; 10621541Srgrimes return (0); 10631541Srgrimesnoinodes: 1064222422Smckusick if (reclaimed == 0) { 1065219895Smckusick reclaimed = 1; 1066220374Smckusick softdep_request_cleanup(fs, pvp, cred, FLUSH_INODES_WAIT); 1067219895Smckusick goto retry; 1068219895Smckusick } 1069140704Sjeff UFS_UNLOCK(ump); 1070151906Sps if (ppsratecheck(&lastfail, &curfail, 1)) { 1071151906Sps ffs_fserr(fs, pip->i_number, "out of inodes"); 1072151906Sps uprintf("\n%s: create/symlink failed, no inodes free\n", 1073151906Sps fs->fs_fsmnt); 1074151906Sps } 10751541Srgrimes return (ENOSPC); 10761541Srgrimes} 10771541Srgrimes 10781541Srgrimes/* 107975377Smckusick * Find a cylinder group to place a directory. 10801541Srgrimes * 108175377Smckusick * The policy implemented by this algorithm is to allocate a 108275377Smckusick * directory inode in the same cylinder group as its parent 108375377Smckusick * directory, but also to reserve space for its files inodes 108475377Smckusick * and data. Restrict the number of directories which may be 108575377Smckusick * allocated one after another in the same cylinder group 108675377Smckusick * without intervening allocation of files. 
108775377Smckusick * 108875377Smckusick * If we allocate a first level directory then force allocation 108975377Smckusick * in another cylinder group. 10901541Srgrimes */ 10911541Srgrimesstatic ino_t 109275377Smckusickffs_dirpref(pip) 109375377Smckusick struct inode *pip; 109475377Smckusick{ 109596506Sphk struct fs *fs; 1096248623Smckusick int cg, prefcg, dirsize, cgsize; 1097203763Smckusick u_int avgifree, avgbfree, avgndir, curdirsize; 1098203763Smckusick u_int minifree, minbfree, maxndir; 1099203763Smckusick u_int mincg, minndir; 1100203763Smckusick u_int maxcontigdirs; 11011541Srgrimes 1102140704Sjeff mtx_assert(UFS_MTX(pip->i_ump), MA_OWNED); 110375377Smckusick fs = pip->i_fs; 110475377Smckusick 11051541Srgrimes avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg; 110675377Smckusick avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; 110775377Smckusick avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg; 110875377Smckusick 110975377Smckusick /* 111075377Smckusick * Force allocation in another cg if creating a first level dir. 
111175377Smckusick */ 1112101308Sjeff ASSERT_VOP_LOCKED(ITOV(pip), "ffs_dirpref"); 1113101308Sjeff if (ITOV(pip)->v_vflag & VV_ROOT) { 111475377Smckusick prefcg = arc4random() % fs->fs_ncg; 111575377Smckusick mincg = prefcg; 111675377Smckusick minndir = fs->fs_ipg; 111775377Smckusick for (cg = prefcg; cg < fs->fs_ncg; cg++) 111875377Smckusick if (fs->fs_cs(fs, cg).cs_ndir < minndir && 111975377Smckusick fs->fs_cs(fs, cg).cs_nifree >= avgifree && 112075377Smckusick fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 112175377Smckusick mincg = cg; 112275377Smckusick minndir = fs->fs_cs(fs, cg).cs_ndir; 112375377Smckusick } 112475377Smckusick for (cg = 0; cg < prefcg; cg++) 112575377Smckusick if (fs->fs_cs(fs, cg).cs_ndir < minndir && 112675377Smckusick fs->fs_cs(fs, cg).cs_nifree >= avgifree && 112775377Smckusick fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 112875377Smckusick mincg = cg; 112975377Smckusick minndir = fs->fs_cs(fs, cg).cs_ndir; 113075377Smckusick } 113175377Smckusick return ((ino_t)(fs->fs_ipg * mincg)); 113275377Smckusick } 113375377Smckusick 113475377Smckusick /* 113575377Smckusick * Count various limits which used for 113675377Smckusick * optimal allocation of a directory inode. 113775377Smckusick */ 113875377Smckusick maxndir = min(avgndir + fs->fs_ipg / 16, fs->fs_ipg); 1139121785Struckman minifree = avgifree - avgifree / 4; 1140121785Struckman if (minifree < 1) 1141121785Struckman minifree = 1; 1142121785Struckman minbfree = avgbfree - avgbfree / 4; 1143121785Struckman if (minbfree < 1) 1144121785Struckman minbfree = 1; 114575377Smckusick cgsize = fs->fs_fsize * fs->fs_fpg; 114675377Smckusick dirsize = fs->fs_avgfilesize * fs->fs_avgfpdir; 114775377Smckusick curdirsize = avgndir ? 
(cgsize - avgbfree * fs->fs_bsize) / avgndir : 0; 114875377Smckusick if (dirsize < curdirsize) 114975377Smckusick dirsize = curdirsize; 1150172113Sbz if (dirsize <= 0) 1151172113Sbz maxcontigdirs = 0; /* dirsize overflowed */ 1152172113Sbz else 1153172113Sbz maxcontigdirs = min((avgbfree * fs->fs_bsize) / dirsize, 255); 115475377Smckusick if (fs->fs_avgfpdir > 0) 115575377Smckusick maxcontigdirs = min(maxcontigdirs, 115675377Smckusick fs->fs_ipg / fs->fs_avgfpdir); 115775377Smckusick if (maxcontigdirs == 0) 115875377Smckusick maxcontigdirs = 1; 115975377Smckusick 116075377Smckusick /* 116175377Smckusick * Limit number of dirs in one cg and reserve space for 116275377Smckusick * regular files, but only if we have no deficit in 116375377Smckusick * inodes or space. 1164248623Smckusick * 1165248623Smckusick * We are trying to find a suitable cylinder group nearby 1166248623Smckusick * our preferred cylinder group to place a new directory. 1167248623Smckusick * We scan from our preferred cylinder group forward looking 1168248623Smckusick * for a cylinder group that meets our criterion. If we get 1169248623Smckusick * to the final cylinder group and do not find anything, 1170248623Smckusick * we start scanning backwards from our preferred cylinder 1171248623Smckusick * group. The ideal would be to alternate looking forward 1172248623Smckusick * and backward, but that is just too complex to code for 1173248623Smckusick * the gain it would get. The most likely place where the 1174248623Smckusick * backward scan would take effect is when we start near 1175248623Smckusick * the end of the filesystem and do not find anything from 1176248623Smckusick * where we are to the end. In that case, scanning backward 1177248623Smckusick * will likely find us a suitable cylinder group much closer 1178248623Smckusick * to our desired location than if we were to start scanning 1179248623Smckusick * forward from the beginning of the filesystem. 
118075377Smckusick */ 118175377Smckusick prefcg = ino_to_cg(fs, pip->i_number); 118275377Smckusick for (cg = prefcg; cg < fs->fs_ncg; cg++) 118375377Smckusick if (fs->fs_cs(fs, cg).cs_ndir < maxndir && 118475377Smckusick fs->fs_cs(fs, cg).cs_nifree >= minifree && 1185262779Spfg fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { 118675377Smckusick if (fs->fs_contigdirs[cg] < maxcontigdirs) 118775377Smckusick return ((ino_t)(fs->fs_ipg * cg)); 11881541Srgrimes } 1189254996Smckusick for (cg = 0; cg < prefcg; cg++) 119075377Smckusick if (fs->fs_cs(fs, cg).cs_ndir < maxndir && 119175377Smckusick fs->fs_cs(fs, cg).cs_nifree >= minifree && 1192262779Spfg fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { 119375377Smckusick if (fs->fs_contigdirs[cg] < maxcontigdirs) 119475377Smckusick return ((ino_t)(fs->fs_ipg * cg)); 119575377Smckusick } 119675377Smckusick /* 119775377Smckusick * This is a backstop when we have deficit in space. 119875377Smckusick */ 119975377Smckusick for (cg = prefcg; cg < fs->fs_ncg; cg++) 120075377Smckusick if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) 120175377Smckusick return ((ino_t)(fs->fs_ipg * cg)); 1202254996Smckusick for (cg = 0; cg < prefcg; cg++) 120375377Smckusick if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) 120475377Smckusick break; 120575377Smckusick return ((ino_t)(fs->fs_ipg * cg)); 12061541Srgrimes} 12071541Srgrimes 12081541Srgrimes/* 12091541Srgrimes * Select the desired position for the next block in a file. The file is 12101541Srgrimes * logically divided into sections. The first section is composed of the 1211253341Smckusick * direct blocks and the next fs_maxbpg blocks. Each additional section 1212253341Smckusick * contains fs_maxbpg blocks. 12138876Srgrimes * 12141541Srgrimes * If no blocks have been allocated in the first section, the policy is to 12151541Srgrimes * request a block in the same cylinder group as the inode that describes 1216248623Smckusick * the file. 
The first indirect is allocated immediately following the last 1217248623Smckusick * direct block and the data blocks for the first indirect immediately 1218248623Smckusick * follow it. 1219248623Smckusick * 1220248623Smckusick * If no blocks have been allocated in any other section, the indirect 1221248623Smckusick * block(s) are allocated in the same cylinder group as its inode in an 1222248623Smckusick * area reserved immediately following the inode blocks. The policy for 1223248623Smckusick * the data blocks is to place them in a cylinder group with a greater than 1224248623Smckusick * average number of free blocks. An appropriate cylinder group is found 12251541Srgrimes * by using a rotor that sweeps the cylinder groups. When a new group of 12261541Srgrimes * blocks is needed, the sweep begins in the cylinder group following the 12271541Srgrimes * cylinder group from which the previous allocation was made. The sweep 12281541Srgrimes * continues until a cylinder group with greater than the average number 12291541Srgrimes * of free blocks is found. If the allocation is for the first block in an 1230253341Smckusick * indirect block or the previous block is a hole, then the information on 1231253341Smckusick * the previous allocation is unavailable; here a best guess is made based 1232253341Smckusick * on the logical block number being allocated. 12338876Srgrimes * 12341541Srgrimes * If a section is already partially allocated, the policy is to 1235253341Smckusick * allocate blocks contiguously within the section if possible. 
12361541Srgrimes */ 123798542Smckusickufs2_daddr_t 123898542Smckusickffs_blkpref_ufs1(ip, lbn, indx, bap) 12391541Srgrimes struct inode *ip; 124098542Smckusick ufs_lbn_t lbn; 12411541Srgrimes int indx; 124298542Smckusick ufs1_daddr_t *bap; 12431541Srgrimes{ 124496506Sphk struct fs *fs; 1245248623Smckusick u_int cg, inocg; 1246203763Smckusick u_int avgbfree, startcg; 1247242520Smckusick ufs2_daddr_t pref; 12481541Srgrimes 1249248623Smckusick KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap")); 1250140704Sjeff mtx_assert(UFS_MTX(ip->i_ump), MA_OWNED); 12511541Srgrimes fs = ip->i_fs; 1252242520Smckusick /* 1253248623Smckusick * Allocation of indirect blocks is indicated by passing negative 1254248623Smckusick * values in indx: -1 for single indirect, -2 for double indirect, 1255248623Smckusick * -3 for triple indirect. As noted below, we attempt to allocate 1256248623Smckusick * the first indirect inline with the file data. For all later 1257248623Smckusick * indirect blocks, the data is often allocated in other cylinder 1258248623Smckusick * groups. However to speed random file access and to speed up 1259248623Smckusick * fsck, the filesystem reserves the first fs_metaspace blocks 1260248623Smckusick * (typically half of fs_minfree) of the data area of each cylinder 1261248623Smckusick * group to hold these later indirect blocks. 1262248623Smckusick */ 1263248623Smckusick inocg = ino_to_cg(fs, ip->i_number); 1264248623Smckusick if (indx < 0) { 1265248623Smckusick /* 1266248623Smckusick * Our preference for indirect blocks is the zone at the 1267248623Smckusick * beginning of the inode's cylinder group data area that 1268248623Smckusick * we try to reserve for indirect blocks. 1269248623Smckusick */ 1270248623Smckusick pref = cgmeta(fs, inocg); 1271248623Smckusick /* 1272248623Smckusick * If we are allocating the first indirect block, try to 1273248623Smckusick * place it immediately following the last direct block. 
1274248623Smckusick */ 1275248623Smckusick if (indx == -1 && lbn < NDADDR + NINDIR(fs) && 1276248623Smckusick ip->i_din1->di_db[NDADDR - 1] != 0) 1277248623Smckusick pref = ip->i_din1->di_db[NDADDR - 1] + fs->fs_frag; 1278248623Smckusick return (pref); 1279248623Smckusick } 1280248623Smckusick /* 1281242520Smckusick * If we are allocating the first data block in the first indirect 1282248623Smckusick * block and the indirect has been allocated in the data block area, 1283248623Smckusick * try to place it immediately following the indirect block. 1284242520Smckusick */ 1285242520Smckusick if (lbn == NDADDR) { 1286242520Smckusick pref = ip->i_din1->di_ib[0]; 1287248623Smckusick if (pref != 0 && pref >= cgdata(fs, inocg) && 1288248623Smckusick pref < cgbase(fs, inocg + 1)) 1289242520Smckusick return (pref + fs->fs_frag); 1290242520Smckusick } 1291248623Smckusick /* 1292248623Smckusick * If we are at the beginning of a file, or we have already allocated 1293248623Smckusick * the maximum number of blocks per cylinder group, or we do not 1294248623Smckusick * have a block allocated immediately preceeding us, then we need 1295248623Smckusick * to decide where to start allocating new blocks. 1296248623Smckusick */ 12971541Srgrimes if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) { 12981541Srgrimes /* 1299248623Smckusick * If we are allocating a directory data block, we want 1300248623Smckusick * to place it in the metadata area. 1301248623Smckusick */ 1302248623Smckusick if ((ip->i_mode & IFMT) == IFDIR) 1303248623Smckusick return (cgmeta(fs, inocg)); 1304248623Smckusick /* 1305248623Smckusick * Until we fill all the direct and all the first indirect's 1306248623Smckusick * blocks, we try to allocate in the data area of the inode's 1307248623Smckusick * cylinder group. 
1308248623Smckusick */ 1309248623Smckusick if (lbn < NDADDR + NINDIR(fs)) 1310248623Smckusick return (cgdata(fs, inocg)); 1311248623Smckusick /* 13121541Srgrimes * Find a cylinder with greater than average number of 13131541Srgrimes * unused data blocks. 13141541Srgrimes */ 13151541Srgrimes if (indx == 0 || bap[indx - 1] == 0) 1316248623Smckusick startcg = inocg + lbn / fs->fs_maxbpg; 13171541Srgrimes else 13181541Srgrimes startcg = dtog(fs, bap[indx - 1]) + 1; 13191541Srgrimes startcg %= fs->fs_ncg; 13201541Srgrimes avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; 13211541Srgrimes for (cg = startcg; cg < fs->fs_ncg; cg++) 13221541Srgrimes if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 13231541Srgrimes fs->fs_cgrotor = cg; 1324248623Smckusick return (cgdata(fs, cg)); 13251541Srgrimes } 13261541Srgrimes for (cg = 0; cg <= startcg; cg++) 13271541Srgrimes if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 13281541Srgrimes fs->fs_cgrotor = cg; 1329248623Smckusick return (cgdata(fs, cg)); 13301541Srgrimes } 133117108Sbde return (0); 13321541Srgrimes } 13331541Srgrimes /* 1334248623Smckusick * Otherwise, we just always try to lay things out contiguously. 
 */
	return (bap[indx - 1] + fs->fs_frag);
}

/*
 * Same as above, but for UFS2.
 *
 * Select a preferred location for the next block of an inode: given the
 * logical block number (lbn) being allocated and the index (indx) of that
 * block within the block map (bap), return the filesystem address that
 * the allocator should try first.  A return of 0 means "no preference".
 * Called with the UFS mutex held.
 */
ufs2_daddr_t
ffs_blkpref_ufs2(ip, lbn, indx, bap)
	struct inode *ip;	/* inode for which a block is being allocated */
	ufs_lbn_t lbn;		/* logical block number being allocated */
	int indx;		/* index into bap; negative for indirect blocks */
	ufs2_daddr_t *bap;	/* block map being filled in (NULL if indx <= 0) */
{
	struct fs *fs;
	u_int cg, inocg;
	u_int avgbfree, startcg;
	ufs2_daddr_t pref;

	KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap"));
	mtx_assert(UFS_MTX(ip->i_ump), MA_OWNED);
	fs = ip->i_fs;
	/*
	 * Allocation of indirect blocks is indicated by passing negative
	 * values in indx: -1 for single indirect, -2 for double indirect,
	 * -3 for triple indirect. As noted below, we attempt to allocate
	 * the first indirect inline with the file data. For all later
	 * indirect blocks, the data is often allocated in other cylinder
	 * groups. However to speed random file access and to speed up
	 * fsck, the filesystem reserves the first fs_metaspace blocks
	 * (typically half of fs_minfree) of the data area of each cylinder
	 * group to hold these later indirect blocks.
	 */
	inocg = ino_to_cg(fs, ip->i_number);
	if (indx < 0) {
		/*
		 * Our preference for indirect blocks is the zone at the
		 * beginning of the inode's cylinder group data area that
		 * we try to reserve for indirect blocks.
		 */
		pref = cgmeta(fs, inocg);
		/*
		 * If we are allocating the first indirect block, try to
		 * place it immediately following the last direct block.
		 */
		if (indx == -1 && lbn < NDADDR + NINDIR(fs) &&
		    ip->i_din2->di_db[NDADDR - 1] != 0)
			pref = ip->i_din2->di_db[NDADDR - 1] + fs->fs_frag;
		return (pref);
	}
	/*
	 * If we are allocating the first data block in the first indirect
	 * block and the indirect has been allocated in the data block area,
	 * try to place it immediately following the indirect block.
	 */
	if (lbn == NDADDR) {
		pref = ip->i_din2->di_ib[0];
		if (pref != 0 && pref >= cgdata(fs, inocg) &&
		    pref < cgbase(fs, inocg + 1))
			return (pref + fs->fs_frag);
	}
	/*
	 * If we are at the beginning of a file, or we have already allocated
	 * the maximum number of blocks per cylinder group, or we do not
	 * have a block allocated immediately preceding us, then we need
	 * to decide where to start allocating new blocks.
	 */
	if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
		/*
		 * If we are allocating a directory data block, we want
		 * to place it in the metadata area.
		 */
		if ((ip->i_mode & IFMT) == IFDIR)
			return (cgmeta(fs, inocg));
		/*
		 * Until we fill all the direct and all the first indirect's
		 * blocks, we try to allocate in the data area of the inode's
		 * cylinder group.
		 */
		if (lbn < NDADDR + NINDIR(fs))
			return (cgdata(fs, inocg));
		/*
		 * Find a cylinder with greater than average number of
		 * unused data blocks.
		 */
		if (indx == 0 || bap[indx - 1] == 0)
			startcg = inocg + lbn / fs->fs_maxbpg;
		else
			startcg = dtog(fs, bap[indx - 1]) + 1;
		startcg %= fs->fs_ncg;
		avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
		for (cg = startcg; cg < fs->fs_ncg; cg++)
			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
				fs->fs_cgrotor = cg;
				return (cgdata(fs, cg));
			}
		/* Wrap around and scan the groups before startcg. */
		for (cg = 0; cg <= startcg; cg++)
			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
				fs->fs_cgrotor = cg;
				return (cgdata(fs, cg));
			}
		return (0);
	}
	/*
	 * Otherwise, we just always try to lay things out contiguously.
	 */
	return (bap[indx - 1] + fs->fs_frag);
}

/*
 * Implement the cylinder overflow algorithm.
 *
 * The policy implemented by this algorithm is:
 *   1) allocate the block in its requested cylinder group.
 *   2) quadratically rehash on the cylinder group number.
 *   3) brute force search for a free block.
 *
 * Must be called with the UFS lock held.  Will release the lock on success
 * and return with it held on failure.
 */
/*VARARGS5*/
static ufs2_daddr_t
ffs_hashalloc(ip, cg, pref, size, rsize, allocator)
	struct inode *ip;
	u_int cg;		/* cylinder group to try first */
	ufs2_daddr_t pref;	/* preferred address, 0 for none */
	int size;		/* Search size for data blocks, mode for inodes */
	int rsize;		/* Real allocated size. */
	allocfcn_t *allocator;	/* per-cg allocation routine; 0 means failure */
{
	struct fs *fs;
	ufs2_daddr_t result;
	u_int i, icg = cg;

	mtx_assert(UFS_MTX(ip->i_ump), MA_OWNED);
#ifdef INVARIANTS
	if (ITOV(ip)->v_mount->mnt_kern_flag & MNTK_SUSPENDED)
		panic("ffs_hashalloc: allocation on suspended filesystem");
#endif
	fs = ip->i_fs;
	/*
	 * 1: preferred cylinder group
	 */
	result = (*allocator)(ip, cg, pref, size, rsize);
	if (result)
		return (result);
	/*
	 * 2: quadratic rehash
	 */
	for (i = 1; i < fs->fs_ncg; i *= 2) {
		cg += i;
		if (cg >= fs->fs_ncg)
			cg -= fs->fs_ncg;
		result = (*allocator)(ip, cg, 0, size, rsize);
		if (result)
			return (result);
	}
	/*
	 * 3: brute force search
	 * Note that we start at i == 2, since 0 was checked initially,
	 * and 1 is always checked in the quadratic rehash.
	 */
	cg = (icg + 2) % fs->fs_ncg;
	for (i = 2; i < fs->fs_ncg; i++) {
		result = (*allocator)(ip, cg, 0, size, rsize);
		if (result)
			return (result);
		cg++;
		if (cg == fs->fs_ncg)
			cg = 0;
	}
	return (0);
}

/*
 * Determine whether a fragment can be extended.
 *
 * Check to see if the necessary fragments are available, and
 * if they are, allocate them.
 *
 * Called with the UFS lock held; the lock is released on success and
 * re-held on failure (matching the ffs_hashalloc() allocator contract).
 */
static ufs2_daddr_t
ffs_fragextend(ip, cg, bprev, osize, nsize)
	struct inode *ip;
	u_int cg;		/* cylinder group containing bprev */
	ufs2_daddr_t bprev;	/* fragment to be extended in place */
	int osize, nsize;	/* old and new size, in bytes */
{
	struct fs *fs;
	struct cg *cgp;
	struct buf *bp;
	struct ufsmount *ump;
	int nffree;
	long bno;
	int frags, bbase;
	int i, error;
	u_int8_t *blksfree;

	ump = ip->i_ump;
	fs = ip->i_fs;
	/* Quick check against the in-core summary before doing any I/O. */
	if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize))
		return (0);
	frags = numfrags(fs, nsize);
	bbase = fragnum(fs, bprev);
	if (bbase > fragnum(fs, (bprev + frags - 1))) {
		/* cannot extend across a block boundary */
		return (0);
	}
	UFS_UNLOCK(ump);
	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
	    (int)fs->fs_cgsize, NOCRED, &bp);
	if (error)
		goto fail;
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp))
		goto fail;
	bp->b_xflags |= BX_BKGRDWRITE;
	cgp->cg_old_time = cgp->cg_time = time_second;
	bno = dtogd(fs, bprev);
	blksfree = cg_blksfree(cgp);
	/* All frags in the extension range must currently be free. */
	for (i = numfrags(fs, osize); i < frags; i++)
		if (isclr(blksfree, bno + i))
			goto fail;
	/*
	 * the current fragment can be extended
	 * deduct the count on fragment being extended into
	 * increase the count on the remaining fragment (if any)
	 * allocate the extended piece
	 */
	for (i = frags; i < fs->fs_frag - bbase; i++)
		if (isclr(blksfree, bno + i))
			break;
	cgp->cg_frsum[i - numfrags(fs, osize)]--;
	if (i != frags)
		cgp->cg_frsum[i - frags]++;
	for (i = numfrags(fs, osize), nffree = 0; i < frags; i++) {
		clrbit(blksfree, bno + i);
		cgp->cg_cs.cs_nffree--;
		nffree++;
	}
	UFS_LOCK(ump);
	fs->fs_cstotal.cs_nffree -= nffree;
	fs->fs_cs(fs, cg).cs_nffree -= nffree;
	fs->fs_fmod = 1;
	ACTIVECLEAR(fs, cg);
	UFS_UNLOCK(ump);
	if (DOINGSOFTDEP(ITOV(ip)))
		softdep_setup_blkmapdep(bp, UFSTOVFS(ump), bprev,
		    frags, numfrags(fs, osize));
	bdwrite(bp);
	return (bprev);

fail:
	brelse(bp);
	UFS_LOCK(ump);
	return (0);

}

/*
 * Determine whether a block can be allocated.
 *
 * Check to see if a block of the appropriate size is available,
 * and if it is, allocate it.
 *
 * Called with the UFS lock held; the lock is released on success and
 * re-held on failure (matching the ffs_hashalloc() allocator contract).
 */
static ufs2_daddr_t
ffs_alloccg(ip, cg, bpref, size, rsize)
	struct inode *ip;
	u_int cg;		/* cylinder group to search */
	ufs2_daddr_t bpref;	/* preferred block address, 0 for none */
	int size;		/* requested size, in bytes */
	int rsize;		/* size actually to be allocated */
{
	struct fs *fs;
	struct cg *cgp;
	struct buf *bp;
	struct ufsmount *ump;
	ufs1_daddr_t bno;
	ufs2_daddr_t blkno;
	int i, allocsiz, error, frags;
	u_int8_t *blksfree;

	ump = ip->i_ump;
	fs = ip->i_fs;
	/* In-core summary says no full block here: skip the I/O. */
	if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize)
		return (0);
	UFS_UNLOCK(ump);
	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
	    (int)fs->fs_cgsize, NOCRED, &bp);
	if (error)
		goto fail;
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp) ||
	    (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize))
		goto fail;
	bp->b_xflags |= BX_BKGRDWRITE;
	cgp->cg_old_time = cgp->cg_time = time_second;
	if (size == fs->fs_bsize) {
		UFS_LOCK(ump);
		blkno = ffs_alloccgblk(ip, bp, bpref, rsize);
		ACTIVECLEAR(fs, cg);
		UFS_UNLOCK(ump);
		bdwrite(bp);
		return (blkno);
	}
	/*
	 * check to see if any fragments are already available
	 * allocsiz is the size which will be allocated, hacking
	 * it down to a smaller size if necessary
	 */
	blksfree = cg_blksfree(cgp);
	frags = numfrags(fs, size);
	for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++)
		if (cgp->cg_frsum[allocsiz] != 0)
			break;
	if (allocsiz == fs->fs_frag) {
		/*
		 * no fragments were available, so a block will be
		 * allocated, and hacked up
		 */
		if (cgp->cg_cs.cs_nbfree == 0)
			goto fail;
		UFS_LOCK(ump);
		blkno = ffs_alloccgblk(ip, bp, bpref, rsize);
		ACTIVECLEAR(fs, cg);
		UFS_UNLOCK(ump);
		bdwrite(bp);
		return (blkno);
	}
	KASSERT(size == rsize,
	    ("ffs_alloccg: size(%d) != rsize(%d)", size, rsize));
	bno = ffs_mapsearch(fs, cgp, bpref, allocsiz);
	if (bno < 0)
		goto fail;
	/* Claim the frags in the free map and update the summaries. */
	for (i = 0; i < frags; i++)
		clrbit(blksfree, bno + i);
	cgp->cg_cs.cs_nffree -= frags;
	cgp->cg_frsum[allocsiz]--;
	if (frags != allocsiz)
		cgp->cg_frsum[allocsiz - frags]++;
	UFS_LOCK(ump);
	fs->fs_cstotal.cs_nffree -= frags;
	fs->fs_cs(fs, cg).cs_nffree -= frags;
	fs->fs_fmod = 1;
	blkno = cgbase(fs, cg) + bno;
	ACTIVECLEAR(fs, cg);
	UFS_UNLOCK(ump);
	if (DOINGSOFTDEP(ITOV(ip)))
		softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, frags, 0);
	bdwrite(bp);
	return (blkno);

fail:
	brelse(bp);
	UFS_LOCK(ump);
	return (0);
}

/*
 * Allocate a block in a cylinder group.
 *
 * This algorithm implements the following policy:
 *   1) allocate the requested block.
 *   2) allocate a rotationally optimal block in the same cylinder.
 *   3) allocate the next available block on the block rotor for the
 *      specified cylinder group.
 * Note that this routine only allocates fs_bsize blocks; these
 * blocks may be fragmented by the routine that allocates them.
 *
 * Called with the UFS lock held and the cylinder group buffer locked;
 * the lock is dropped and re-taken around the softdep hook below.
 */
static ufs2_daddr_t
ffs_alloccgblk(ip, bp, bpref, size)
	struct inode *ip;
	struct buf *bp;		/* buffer holding the cylinder group */
	ufs2_daddr_t bpref;	/* preferred address, 0 for none */
	int size;		/* bytes wanted; excess frags are freed back */
{
	struct fs *fs;
	struct cg *cgp;
	struct ufsmount *ump;
	ufs1_daddr_t bno;
	ufs2_daddr_t blkno;
	u_int8_t *blksfree;
	int i, cgbpref;

	fs = ip->i_fs;
	ump = ip->i_ump;
	mtx_assert(UFS_MTX(ump), MA_OWNED);
	cgp = (struct cg *)bp->b_data;
	blksfree = cg_blksfree(cgp);
	if (bpref == 0) {
		/* No preference: continue from this group's block rotor. */
		bpref = cgbase(fs, cgp->cg_cgx) + cgp->cg_rotor + fs->fs_frag;
	} else if ((cgbpref = dtog(fs, bpref)) != cgp->cg_cgx) {
		/* map bpref to correct zone in this cg */
		if (bpref < cgdata(fs, cgbpref))
			bpref = cgmeta(fs, cgp->cg_cgx);
		else
			bpref = cgdata(fs, cgp->cg_cgx);
	}
	/*
	 * if the requested block is available, use it
	 */
	bno = dtogd(fs, blknum(fs, bpref));
	if (ffs_isblock(fs, blksfree, fragstoblks(fs, bno)))
		goto gotit;
	/*
	 * Take the next available block in this cylinder group.
	 */
	bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag);
	if (bno < 0)
		return (0);
	/* Update cg_rotor only if allocated from the data zone */
	if (bno >= dtogd(fs, cgdata(fs, cgp->cg_cgx)))
		cgp->cg_rotor = bno;
gotit:
	blkno = fragstoblks(fs, bno);
	ffs_clrblock(fs, blksfree, (long)blkno);
	ffs_clusteracct(fs, cgp, blkno, -1);
	cgp->cg_cs.cs_nbfree--;
	fs->fs_cstotal.cs_nbfree--;
	fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--;
	fs->fs_fmod = 1;
	blkno = cgbase(fs, cgp->cg_cgx) + bno;
	/*
	 * If the caller didn't want the whole block free the frags here.
	 */
	size = numfrags(fs, size);
	if (size != fs->fs_frag) {
		bno = dtogd(fs, blkno);
		for (i = size; i < fs->fs_frag; i++)
			setbit(blksfree, bno + i);
		i = fs->fs_frag - size;
		cgp->cg_cs.cs_nffree += i;
		fs->fs_cstotal.cs_nffree += i;
		fs->fs_cs(fs, cgp->cg_cgx).cs_nffree += i;
		fs->fs_fmod = 1;
		cgp->cg_frsum[i]++;
	}
	/* XXX Fixme. */
	UFS_UNLOCK(ump);
	if (DOINGSOFTDEP(ITOV(ip)))
		softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno,
		    size, 0);
	UFS_LOCK(ump);
	return (blkno);
}

/*
 * Determine whether a cluster can be allocated.
 *
 * We do not currently check for optimal rotational layout if there
 * are multiple choices in the same cylinder group.  Instead we just
 * take the first one that we find following bpref.
 *
 * Called with the UFS lock held; released on success, re-held on failure.
 */
static ufs2_daddr_t
ffs_clusteralloc(ip, cg, bpref, len, unused)
	struct inode *ip;
	u_int cg;		/* cylinder group to search */
	ufs2_daddr_t bpref;	/* preferred starting address, 0 for none */
	int len;		/* cluster length, in blocks */
	int unused;
{
	struct fs *fs;
	struct cg *cgp;
	struct buf *bp;
	struct ufsmount *ump;
	int i, run, bit, map, got;
	ufs2_daddr_t bno;
	u_char *mapp;
	int32_t *lp;
	u_int8_t *blksfree;

	fs = ip->i_fs;
	ump = ip->i_ump;
	/* Cached per-cg maximum rules this group out without I/O. */
	if (fs->fs_maxcluster[cg] < len)
		return (0);
	UFS_UNLOCK(ump);
	if (bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize,
	    NOCRED, &bp))
		goto fail_lock;
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp))
		goto fail_lock;
	bp->b_xflags |= BX_BKGRDWRITE;
	/*
	 * Check to see if a cluster of the needed size (or bigger) is
	 * available in this cylinder group.
	 */
	lp = &cg_clustersum(cgp)[len];
	for (i = len; i <= fs->fs_contigsumsize; i++)
		if (*lp++ > 0)
			break;
	if (i > fs->fs_contigsumsize) {
		/*
		 * This is the first time looking for a cluster in this
		 * cylinder group. Update the cluster summary information
		 * to reflect the true maximum sized cluster so that
		 * future cluster allocation requests can avoid reading
		 * the cylinder group map only to find no clusters.
		 */
		lp = &cg_clustersum(cgp)[len - 1];
		for (i = len - 1; i > 0; i--)
			if (*lp-- > 0)
				break;
		UFS_LOCK(ump);
		fs->fs_maxcluster[cg] = i;
		goto fail;
	}
	/*
	 * Search the cluster map to find a big enough cluster.
	 * We take the first one that we find, even if it is larger
	 * than we need as we prefer to get one close to the previous
	 * block allocation. We do not search before the current
	 * preference point as we do not want to allocate a block
	 * that is allocated before the previous one (as we will
	 * then have to wait for another pass of the elevator
	 * algorithm before it will be read). We prefer to fail and
	 * be recalled to try an allocation in the next cylinder group.
	 */
	if (dtog(fs, bpref) != cg)
		bpref = cgdata(fs, cg);
	else
		bpref = blknum(fs, bpref);
	bpref = fragstoblks(fs, dtogd(fs, bpref));
	mapp = &cg_clustersfree(cgp)[bpref / NBBY];
	map = *mapp++;
	bit = 1 << (bpref % NBBY);
	for (run = 0, got = bpref; got < cgp->cg_nclusterblks; got++) {
		if ((map & bit) == 0) {
			run = 0;
		} else {
			run++;
			if (run == len)
				break;
		}
		if ((got & (NBBY - 1)) != (NBBY - 1)) {
			bit <<= 1;
		} else {
			map = *mapp++;
			bit = 1;
		}
	}
	if (got >= cgp->cg_nclusterblks)
		goto fail_lock;
	/*
	 * Allocate the cluster that we have found.
	 */
	blksfree = cg_blksfree(cgp);
	for (i = 1; i <= len; i++)
		if (!ffs_isblock(fs, blksfree, got - run + i))
			panic("ffs_clusteralloc: map mismatch");
	bno = cgbase(fs, cg) + blkstofrags(fs, got - run + 1);
	if (dtog(fs, bno) != cg)
		panic("ffs_clusteralloc: allocated out of group");
	len = blkstofrags(fs, len);
	UFS_LOCK(ump);
	/* Claim each block of the cluster through the normal allocator. */
	for (i = 0; i < len; i += fs->fs_frag)
		if (ffs_alloccgblk(ip, bp, bno + i, fs->fs_bsize) != bno + i)
			panic("ffs_clusteralloc: lost block");
	ACTIVECLEAR(fs, cg);
	UFS_UNLOCK(ump);
	bdwrite(bp);
	return (bno);

fail_lock:
	UFS_LOCK(ump);
fail:
	brelse(bp);
	return (0);
}

/*
 * Return a buffer for the inode block that holds inode index cginoblk
 * of cylinder group cg; gbflags is passed through to getblk().
 */
static inline struct buf *
getinobuf(struct inode *ip, u_int cg, u_int32_t cginoblk, int gbflags)
{
	struct fs *fs;

	fs = ip->i_fs;
	return (getblk(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs,
	    cg * fs->fs_ipg + cginoblk)), (int)fs->fs_bsize, 0, 0,
	    gbflags));
}

/*
 * Determine whether an inode can be allocated.
 *
 * Check to see if an inode is available, and if it is,
 * allocate it using the following policy:
 *   1) allocate the requested inode.
 *   2) allocate the next available inode after the requested
 *      inode in the specified cylinder group.
 *
 * Called with the UFS lock held; released on success, re-held on failure.
 */
static ufs2_daddr_t
ffs_nodealloccg(ip, cg, ipref, mode, unused)
	struct inode *ip;
	u_int cg;		/* cylinder group in which to allocate */
	ufs2_daddr_t ipref;	/* preferred inode number, 0 for none */
	int mode;		/* file mode; used to count new directories */
	int unused;
{
	struct fs *fs;
	struct cg *cgp;
	struct buf *bp, *ibp;
	struct ufsmount *ump;
	u_int8_t *inosused, *loc;
	struct ufs2_dinode *dp2;
	int error, start, len, i;
	u_int32_t old_initediblk;

	fs = ip->i_fs;
	ump = ip->i_ump;
check_nifree:
	/* Give up without I/O if the summary shows no free inodes here. */
	if (fs->fs_cs(fs, cg).cs_nifree == 0)
		return (0);
	UFS_UNLOCK(ump);
	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
	    (int)fs->fs_cgsize, NOCRED, &bp);
	if (error) {
		brelse(bp);
		UFS_LOCK(ump);
		return (0);
	}
	cgp = (struct cg *)bp->b_data;
restart:
	if (!cg_chkmagic(cgp) || cgp->cg_cs.cs_nifree == 0) {
		brelse(bp);
		UFS_LOCK(ump);
		return (0);
	}
	bp->b_xflags |= BX_BKGRDWRITE;
	inosused = cg_inosused(cgp);
	if (ipref) {
		ipref %= fs->fs_ipg;
		if (isclr(inosused, ipref))
			goto gotit;
	}
	/*
	 * Scan the used-inode map from the irotor for the first byte that
	 * is not all ones, wrapping to the start of the map if necessary.
	 */
	start = cgp->cg_irotor / NBBY;
	len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY);
	loc = memcchr(&inosused[start], 0xff, len);
	if (loc == NULL) {
		len = start + 1;
		start = 0;
		loc = memcchr(&inosused[start], 0xff, len);
		if (loc == NULL) {
			printf("cg = %d, irotor = %ld, fs = %s\n",
			    cg, (long)cgp->cg_irotor, fs->fs_fsmnt);
			panic("ffs_nodealloccg: map corrupted");
			/* NOTREACHED */
		}
	}
	/* First clear bit in the located byte is the free inode. */
	ipref = (loc - inosused) * NBBY + ffs(~*loc) - 1;
gotit:
	/*
	 * Check to see if we need to initialize more inodes.
	 */
	if (fs->fs_magic == FS_UFS2_MAGIC &&
	    ipref + INOPB(fs) > cgp->cg_initediblk &&
	    cgp->cg_initediblk < cgp->cg_niblk) {
		old_initediblk = cgp->cg_initediblk;

		/*
		 * Free the cylinder group lock before writing the
		 * initialized inode block.  Entering the
		 * babarrierwrite() with the cylinder group lock
		 * causes lock order violation between the lock and
		 * snaplk.
		 *
		 * Another thread can decide to initialize the same
		 * inode block, but whichever thread first gets the
		 * cylinder group lock after writing the newly
		 * allocated inode block will update it and the other
		 * will realize that it has lost and leave the
		 * cylinder group unchanged.
		 */
		ibp = getinobuf(ip, cg, old_initediblk, GB_LOCK_NOWAIT);
		brelse(bp);
		if (ibp == NULL) {
			/*
			 * The inode block buffer is already owned by
			 * another thread, which must initialize it.
			 * Wait on the buffer to allow another thread
			 * to finish the updates, with dropped cg
			 * buffer lock, then retry.
			 */
			ibp = getinobuf(ip, cg, old_initediblk, 0);
			brelse(ibp);
			UFS_LOCK(ump);
			goto check_nifree;
		}
		bzero(ibp->b_data, (int)fs->fs_bsize);
		dp2 = (struct ufs2_dinode *)(ibp->b_data);
		/* Seed every new on-disk inode with a nonzero generation. */
		for (i = 0; i < INOPB(fs); i++) {
			dp2->di_gen = arc4random() / 2 + 1;
			dp2++;
		}
		/*
		 * Rather than adding a soft updates dependency to ensure
		 * that the new inode block is written before it is claimed
		 * by the cylinder group map, we just do a barrier write
		 * here. The barrier write will ensure that the inode block
		 * gets written before the updated cylinder group map can be
		 * written. The barrier write should only slow down bulk
		 * loading of newly created filesystems.
		 */
		babarrierwrite(ibp);

		/*
		 * After the inode block is written, try to update the
		 * cg initediblk pointer.  If another thread beat us
		 * to it, then leave it unchanged as the other thread
		 * has already set it correctly.
		 */
		error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
		    (int)fs->fs_cgsize, NOCRED, &bp);
		UFS_LOCK(ump);
		ACTIVECLEAR(fs, cg);
		UFS_UNLOCK(ump);
		if (error != 0) {
			brelse(bp);
			/*
			 * NOTE(review): this returns the (positive) errno
			 * rather than 0, while ffs_hashalloc() treats any
			 * nonzero return as a successful allocation --
			 * verify against upstream.
			 */
			return (error);
		}
		cgp = (struct cg *)bp->b_data;
		if (cgp->cg_initediblk == old_initediblk)
			cgp->cg_initediblk += INOPB(fs);
		goto restart;
	}
	cgp->cg_old_time = cgp->cg_time = time_second;
	cgp->cg_irotor = ipref;
	UFS_LOCK(ump);
	ACTIVECLEAR(fs, cg);
	setbit(inosused, ipref);
	cgp->cg_cs.cs_nifree--;
	fs->fs_cstotal.cs_nifree--;
	fs->fs_cs(fs, cg).cs_nifree--;
	fs->fs_fmod = 1;
	if ((mode & IFMT) == IFDIR) {
		cgp->cg_cs.cs_ndir++;
		fs->fs_cstotal.cs_ndir++;
		fs->fs_cs(fs, cg).cs_ndir++;
	}
	UFS_UNLOCK(ump);
	if (DOINGSOFTDEP(ITOV(ip)))
		softdep_setup_inomapdep(bp, ip, cg * fs->fs_ipg + ipref, mode);
	bdwrite(bp);
	return ((ino_t)(cg * fs->fs_ipg + ipref));
}

/*
 * Free a block or fragment.
 *
 * The specified block or fragment is placed back in the
 * free map. If a fragment is deallocated, a possible
 * block reassembly is checked.
 */
static void
ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd)
	struct ufsmount *ump;
	struct fs *fs;
	struct vnode *devvp;	/* disk device, or snapshot file vnode */
	ufs2_daddr_t bno;	/* filesystem block address being freed */
	long size;		/* size in bytes */
	ino_t inum;		/* inode that owned the block (for messages) */
	struct workhead *dephd;	/* softdep dependencies, or NULL */
{
	struct mount *mp;
	struct cg *cgp;
	struct buf *bp;
	ufs1_daddr_t fragno, cgbno;
	ufs2_daddr_t cgblkno;
	int i, blk, frags, bbase;
	u_int cg;
	u_int8_t *blksfree;
	struct cdev *dev;

	cg = dtog(fs, bno);
	if (devvp->v_type == VREG) {
		/* devvp is a snapshot */
		dev = VTOI(devvp)->i_devvp->v_rdev;
		cgblkno = fragstoblks(fs, cgtod(fs, cg));
	} else {
		/* devvp is a normal disk device */
		dev = devvp->v_rdev;
		cgblkno = fsbtodb(fs, cgtod(fs, cg));
		ASSERT_VOP_LOCKED(devvp, "ffs_blkfree_cg");
	}
#ifdef INVARIANTS
	if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0 ||
	    fragnum(fs, bno) + numfrags(fs, size) > fs->fs_frag) {
		printf("dev=%s, bno = %jd, bsize = %ld, size = %ld, fs = %s\n",
		    devtoname(dev), (intmax_t)bno, (long)fs->fs_bsize,
		    size, fs->fs_fsmnt);
		panic("ffs_blkfree_cg: bad size");
	}
#endif
	if ((u_int)bno >= fs->fs_size) {
		printf("bad block %jd, ino %lu\n", (intmax_t)bno,
		    (u_long)inum);
		ffs_fserr(fs, inum, "bad block");
		return;
	}
	if (bread(devvp, cgblkno, (int)fs->fs_cgsize, NOCRED, &bp)) {
		brelse(bp);
		return;
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp)) {
		brelse(bp);
		return;
	}
	bp->b_xflags |= BX_BKGRDWRITE;
	cgp->cg_old_time = cgp->cg_time = time_second;
	cgbno = dtogd(fs, bno);
	blksfree = cg_blksfree(cgp);
	UFS_LOCK(ump);
	if (size == fs->fs_bsize) {
		fragno = fragstoblks(fs, cgbno);
		if (!ffs_isfreeblock(fs, blksfree, fragno)) {
			if (devvp->v_type == VREG) {
				UFS_UNLOCK(ump);
				/* devvp is a snapshot */
				brelse(bp);
				return;
			}
			printf("dev = %s, block = %jd, fs = %s\n",
			    devtoname(dev), (intmax_t)bno, fs->fs_fsmnt);
			panic("ffs_blkfree_cg: freeing free block");
		}
		ffs_setblock(fs, blksfree, fragno);
		ffs_clusteracct(fs, cgp, fragno, 1);
		cgp->cg_cs.cs_nbfree++;
		fs->fs_cstotal.cs_nbfree++;
		fs->fs_cs(fs, cg).cs_nbfree++;
	} else {
		bbase = cgbno - fragnum(fs, cgbno);
		/*
		 * decrement the counts associated with the old frags
		 */
		blk = blkmap(fs, blksfree, bbase);
		ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
		/*
		 * deallocate the fragment
		 */
		frags = numfrags(fs, size);
		for (i = 0; i < frags; i++) {
			if (isset(blksfree, cgbno + i)) {
				printf("dev = %s, block = %jd, fs = %s\n",
				    devtoname(dev), (intmax_t)(bno + i),
				    fs->fs_fsmnt);
				panic("ffs_blkfree_cg: freeing free frag");
			}
			setbit(blksfree, cgbno + i);
		}
		cgp->cg_cs.cs_nffree += i;
		fs->fs_cstotal.cs_nffree += i;
		fs->fs_cs(fs, cg).cs_nffree += i;
		/*
		 * add back in counts associated with the new frags
		 */
		blk = blkmap(fs, blksfree, bbase);
		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
		/*
		 * if a complete block has been reassembled, account for it
		 */
		fragno = fragstoblks(fs, bbase);
		if (ffs_isblock(fs, blksfree, fragno)) {
			cgp->cg_cs.cs_nffree -= fs->fs_frag;
			fs->fs_cstotal.cs_nffree -= fs->fs_frag;
			fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
			ffs_clusteracct(fs, cgp, fragno, 1);
			cgp->cg_cs.cs_nbfree++;
			fs->fs_cstotal.cs_nbfree++;
			fs->fs_cs(fs, cg).cs_nbfree++;
		}
	}
	fs->fs_fmod = 1;
	ACTIVECLEAR(fs, cg);
	UFS_UNLOCK(ump);
	mp = UFSTOVFS(ump);
	if (MOUNTEDSOFTDEP(mp) && devvp->v_type != VREG)
		softdep_setup_blkfree(UFSTOVFS(ump), bp, bno,
		    numfrags(fs, size), dephd);
	bdwrite(bp);
}

/* Dedicated taskqueue thread used to finish block frees after a TRIM. */
TASKQUEUE_DEFINE_THREAD(ffs_trim);

/*
 * Context carried from the TRIM bio completion handler to the taskqueue
 * thread that performs the deferred ffs_blkfree_cg().
 */
struct ffs_blkfree_trim_params {
	struct task task;
	struct ufsmount *ump;
	struct vnode *devvp;
	ufs2_daddr_t bno;
	long size;
	ino_t inum;
	struct workhead *pdephd;
	struct workhead dephd;
};

/*
 * Taskqueue handler: the TRIM has completed, so release the blocks to
 * the free map, drop the secondary-write hold, and free the context.
 */
static void
ffs_blkfree_trim_task(ctx, pending)
	void *ctx;
	int pending;
{
	struct ffs_blkfree_trim_params *tp;

	tp = ctx;
	ffs_blkfree_cg(tp->ump, tp->ump->um_fs, tp->devvp, tp->bno, tp->size,
	    tp->inum, tp->pdephd);
	vn_finished_secondary_write(UFSTOVFS(tp->ump));
	free(tp, M_TEMP);
}

/*
 * Bio done routine for the TRIM request: defer the real work to the
 * ffs_trim taskqueue, as this runs in bio completion context.
 */
static void
ffs_blkfree_trim_completed(bip)
	struct bio *bip;
{
	struct ffs_blkfree_trim_params *tp;

	tp = bip->bio_caller2;
	g_destroy_bio(bip);
	TASK_INIT(&tp->task, 0, ffs_blkfree_trim_task, tp);
	taskqueue_enqueue(taskqueue_ffs_trim, &tp->task);
}

void
ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd)
	struct ufsmount *ump;
	struct fs *fs;
	struct vnode *devvp;
	ufs2_daddr_t bno;
	long size;
	ino_t inum;
	enum vtype vtype;
	struct workhead *dephd;
{
	struct mount *mp;
	struct bio *bip;
	struct ffs_blkfree_trim_params *tp;

	/*
	 * Check to see if a snapshot wants to claim the block.
	 * Check that devvp is a normal disk device, not a snapshot,
	 * it has a snapshot(s) associated with it, and one of the
	 * snapshots wants to claim the block.
2275222334Smckusick */ 2276222334Smckusick if (devvp->v_type != VREG && 2277222334Smckusick (devvp->v_vflag & VV_COPYONWRITE) && 2278223127Smckusick ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, dephd)) { 2279222334Smckusick return; 2280222334Smckusick } 2281223902Smckusick /* 2282223902Smckusick * Nothing to delay if TRIM is disabled, or the operation is 2283223902Smckusick * performed on the snapshot. 2284223902Smckusick */ 2285223902Smckusick if (!ump->um_candelete || devvp->v_type == VREG) { 2286216796Skib ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd); 2287216796Skib return; 2288216796Skib } 2289216796Skib 2290216796Skib /* 2291216796Skib * Postpone the set of the free bit in the cg bitmap until the 2292216796Skib * BIO_DELETE is completed. Otherwise, due to disk queue 2293216796Skib * reordering, TRIM might be issued after we reuse the block 2294216796Skib * and write some new data into it. 2295216796Skib */ 2296216796Skib tp = malloc(sizeof(struct ffs_blkfree_trim_params), M_TEMP, M_WAITOK); 2297216796Skib tp->ump = ump; 2298216796Skib tp->devvp = devvp; 2299216796Skib tp->bno = bno; 2300216796Skib tp->size = size; 2301216796Skib tp->inum = inum; 2302216796Skib if (dephd != NULL) { 2303216796Skib LIST_INIT(&tp->dephd); 2304216796Skib LIST_SWAP(dephd, &tp->dephd, worklist, wk_list); 2305216796Skib tp->pdephd = &tp->dephd; 2306216796Skib } else 2307216796Skib tp->pdephd = NULL; 2308216796Skib 2309216796Skib bip = g_alloc_bio(); 2310216796Skib bip->bio_cmd = BIO_DELETE; 2311216796Skib bip->bio_offset = dbtob(fsbtodb(fs, bno)); 2312216796Skib bip->bio_done = ffs_blkfree_trim_completed; 2313216796Skib bip->bio_length = size; 2314216796Skib bip->bio_caller2 = tp; 2315216796Skib 2316216796Skib mp = UFSTOVFS(ump); 2317216796Skib vn_start_secondary_write(NULL, &mp, 0); 2318216796Skib g_io_request(bip, (struct g_consumer *)devvp->v_bufobj.bo_private); 2319216796Skib} 2320216796Skib 2321173464Sobrien#ifdef INVARIANTS 23221541Srgrimes/* 232322521Sdyson * 
 * Verify allocation of a block or fragment. Returns true if block or
 * fragment is allocated, false if it is free.
 *
 * Debug-only (INVARIANTS) consistency checker: any inconsistency it can
 * detect -- bad size, bad block number, failed cg read, bad cg magic,
 * or a partially-free fragment run -- is a panic, not an error return.
 */
static int
ffs_checkblk(ip, bno, size)
	struct inode *ip;	/* inode whose device carries the block */
	ufs2_daddr_t bno;	/* block/fragment number to check */
	long size;		/* extent being checked, in bytes */
{
	struct fs *fs;
	struct cg *cgp;
	struct buf *bp;
	ufs1_daddr_t cgbno;
	int i, error, frags, free;
	u_int8_t *blksfree;

	fs = ip->i_fs;
	if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) {
		printf("bsize = %ld, size = %ld, fs = %s\n",
		    (long)fs->fs_bsize, size, fs->fs_fsmnt);
		panic("ffs_checkblk: bad size");
	}
	if ((u_int)bno >= fs->fs_size)
		panic("ffs_checkblk: bad block %jd", (intmax_t)bno);
	/* Read the cylinder group holding bno. */
	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, dtog(fs, bno))),
	    (int)fs->fs_cgsize, NOCRED, &bp);
	if (error)
		panic("ffs_checkblk: cg bread failed");
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp))
		panic("ffs_checkblk: cg magic mismatch");
	bp->b_xflags |= BX_BKGRDWRITE;
	blksfree = cg_blksfree(cgp);
	cgbno = dtogd(fs, bno);
	if (size == fs->fs_bsize) {
		/* Whole block: free iff the entire block is marked free. */
		free = ffs_isblock(fs, blksfree, fragstoblks(fs, cgbno));
	} else {
		/* Fragments must be uniformly free or uniformly allocated. */
		frags = numfrags(fs, size);
		for (free = 0, i = 0; i < frags; i++)
			if (isset(blksfree, cgbno + i))
				free++;
		if (free != 0 && free != frags)
			panic("ffs_checkblk: partially free fragment");
	}
	brelse(bp);
	return (!free);
}
#endif /* INVARIANTS */

/*
 * Free an inode.
 *
 * With soft updates active the free is handed to softdep_freefile();
 * otherwise it goes straight to ffs_freefile().  Returns 0 or an
 * errno from ffs_freefile().
 */
int
ffs_vfree(pvp, ino, mode)
	struct vnode *pvp;	/* any vnode on the target filesystem */
	ino_t ino;		/* inode number to free */
	int mode;		/* mode of the inode (IFMT bits used) */
{
	struct inode *ip;

	if (DOINGSOFTDEP(pvp)) {
		softdep_freefile(pvp, ino, mode);
		return (0);
	}
	ip = VTOI(pvp);
	return (ffs_freefile(ip->i_ump, ip->i_fs, ip->i_devvp, ino, mode,
	    NULL));
}

/*
 * Do the actual free operation.
 * The specified inode is placed back in the free map.
 *
 * Clears the inode's bit in the cylinder-group inosused map and credits
 * the free-inode (and, for directories, directory) counters.  Returns 0
 * on success or the error from reading the cg block.
 */
int
ffs_freefile(ump, fs, devvp, ino, mode, wkhd)
	struct ufsmount *ump;	/* mount; its lock guards summary counts */
	struct fs *fs;
	struct vnode *devvp;	/* device vnode, or snapshot vnode (VREG) */
	ino_t ino;		/* inode number to free */
	int mode;		/* inode mode; IFDIR adjusts cs_ndir */
	struct workhead *wkhd;	/* softdep work list, may be NULL */
{
	struct cg *cgp;
	struct buf *bp;
	ufs2_daddr_t cgbno;
	int error;
	u_int cg;
	u_int8_t *inosused;
	struct cdev *dev;

	cg = ino_to_cg(fs, ino);
	if (devvp->v_type == VREG) {
		/* devvp is a snapshot */
		dev = VTOI(devvp)->i_devvp->v_rdev;
		cgbno = fragstoblks(fs, cgtod(fs, cg));
	} else {
		/* devvp is a normal disk device */
		dev = devvp->v_rdev;
		cgbno = fsbtodb(fs, cgtod(fs, cg));
	}
	if (ino >= fs->fs_ipg * fs->fs_ncg)
		panic("ffs_freefile: range: dev = %s, ino = %ju, fs = %s",
		    devtoname(dev), (uintmax_t)ino, fs->fs_fsmnt);
	if ((error = bread(devvp, cgbno, (int)fs->fs_cgsize, NOCRED, &bp))) {
		brelse(bp);
		return (error);
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp)) {
		/* Bad cg magic: quietly ignore, matching historic behavior. */
		brelse(bp);
		return (0);
	}
	bp->b_xflags |= BX_BKGRDWRITE;
	cgp->cg_old_time = cgp->cg_time = time_second;
	inosused = cg_inosused(cgp);
	/* From here on ino is relative to the cylinder group. */
	ino %= fs->fs_ipg;
	if (isclr(inosused, ino)) {
		printf("dev = %s, ino = %ju, fs = %s\n", devtoname(dev),
		    (uintmax_t)(ino + cg * fs->fs_ipg), fs->fs_fsmnt);
		/* Freeing a free inode only panics on a writable fs. */
		if (fs->fs_ronly == 0)
			panic("ffs_freefile: freeing free inode");
	}
	clrbit(inosused, ino);
	/* Let the next inode scan start no later than this slot. */
	if (ino < cgp->cg_irotor)
		cgp->cg_irotor = ino;
	cgp->cg_cs.cs_nifree++;
	UFS_LOCK(ump);
	fs->fs_cstotal.cs_nifree++;
	fs->fs_cs(fs, cg).cs_nifree++;
	if ((mode & IFMT) == IFDIR) {
		cgp->cg_cs.cs_ndir--;
		fs->fs_cstotal.cs_ndir--;
		fs->fs_cs(fs, cg).cs_ndir--;
	}
	fs->fs_fmod = 1;
	ACTIVECLEAR(fs, cg);
	UFS_UNLOCK(ump);
	/* Snapshots do not record soft-dependency work. */
	if (MOUNTEDSOFTDEP(UFSTOVFS(ump)) && devvp->v_type != VREG)
		softdep_setup_inofree(UFSTOVFS(ump), bp,
		    ino + cg * fs->fs_ipg, wkhd);
	bdwrite(bp);
	return (0);
}

/*
 * Check to see if a file is free.
 *
 * Returns non-zero if the inode's bit is clear in the cylinder-group
 * inosused map (i.e. the inode is free).  An out-of-range inode number
 * or any failure to read/validate the cg block is reported as "free"
 * (returns 1).
 */
int
ffs_checkfreefile(fs, devvp, ino)
	struct fs *fs;
	struct vnode *devvp;	/* device vnode, or snapshot vnode (VREG) */
	ino_t ino;		/* inode number to test */
{
	struct cg *cgp;
	struct buf *bp;
	ufs2_daddr_t cgbno;
	int ret;
	u_int cg;
	u_int8_t *inosused;

	cg = ino_to_cg(fs, ino);
	if (devvp->v_type == VREG) {
		/* devvp is a snapshot */
		cgbno = fragstoblks(fs, cgtod(fs, cg));
	} else {
		/* devvp is a normal disk device */
		cgbno = fsbtodb(fs, cgtod(fs, cg));
	}
	if (ino >= fs->fs_ipg * fs->fs_ncg)
		return (1);
	if (bread(devvp, cgbno, (int)fs->fs_cgsize, NOCRED, &bp)) {
		brelse(bp);
		return (1);
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp)) {
		brelse(bp);
		return (1);
	}
	inosused = cg_inosused(cgp);
	ino %= fs->fs_ipg;
	ret = isclr(inosused, ino);
	brelse(bp);
	return (ret);
}

/*
 * Find a block of the specified size in the specified cylinder group.
 *
 * It is a panic if a request is made to find a block if none are
 * available.
 *
 * Returns the cg-relative fragment number of a run of "allocsiz" free
 * fragments, searching forward from "bpref" (or the cg rotor when bpref
 * is zero) and wrapping around to the start of the map.
 */
static ufs1_daddr_t
ffs_mapsearch(fs, cgp, bpref, allocsiz)
	struct fs *fs;
	struct cg *cgp;		/* cylinder group to search */
	ufs2_daddr_t bpref;	/* preferred starting block, 0 for rotor */
	int allocsiz;		/* desired run length, in fragments */
{
	ufs1_daddr_t bno;
	int start, len, loc, i;
	int blk, field, subfield, pos;
	u_int8_t *blksfree;

	/*
	 * find the fragment by searching through the free block
	 * map for an appropriate bit pattern
	 */
	if (bpref)
		start = dtogd(fs, bpref) / NBBY;
	else
		start = cgp->cg_frotor / NBBY;
	blksfree = cg_blksfree(cgp);
	len = howmany(fs->fs_fpg, NBBY) - start;
	/* scanc() returns bytes remaining when a candidate byte is found,
	 * 0 if none; fragtbl[] encodes which byte values can hold a run
	 * of allocsiz free fragments. */
	loc = scanc((u_int)len, (u_char *)&blksfree[start],
	    fragtbl[fs->fs_frag],
	    (u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
	if (loc == 0) {
		/* Wrap: re-scan the map from the beginning up to start. */
		len = start + 1;
		start = 0;
		loc = scanc((u_int)len, (u_char *)&blksfree[0],
		    fragtbl[fs->fs_frag],
		    (u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
		if (loc == 0) {
			/* Caller guaranteed space exists; map is corrupt. */
			printf("start = %d, len = %d, fs = %s\n",
			    start, len, fs->fs_fsmnt);
			panic("ffs_alloccg: map corrupted");
			/* NOTREACHED */
		}
	}
	bno = (start + len - loc) * NBBY;
	cgp->cg_frotor = bno;
	/*
	 * found the byte in the map
	 * sift through the bits to find the selected frag
	 */
	for (i = bno + NBBY; bno < i; bno += fs->fs_frag) {
		blk = blkmap(fs, blksfree, bno);
		blk <<= 1;
		field = around[allocsiz];
		subfield = inside[allocsiz];
		for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) {
			if ((blk & field) == subfield)
				return (bno + pos);
			field <<= 1;
			subfield <<= 1;
		}
	}
	printf("bno = %lu, fs = %s\n", (u_long)bno, fs->fs_fsmnt);
	panic("ffs_alloccg: block not in map");
	return (-1);
}

/*
 * Fserr prints the name of a filesystem with an error diagnostic.
 *
 * The form of the error message is:
 *	fs: error message
 *
 * Logs (LOG_ERR) the current process's pid, command name, and uid along
 * with the inode number, mount point, and the caller-supplied message.
 */
void
ffs_fserr(fs, inum, cp)
	struct fs *fs;
	ino_t inum;	/* inode involved in the error */
	char *cp;	/* error message text */
{
	struct thread *td = curthread;	/* XXX */
	struct proc *p = td->td_proc;

	log(LOG_ERR, "pid %d (%s), uid %d inumber %ju on %s: %s\n",
	    p->p_pid, p->p_comm, td->td_ucred->cr_uid, (uintmax_t)inum,
	    fs->fs_fsmnt, cp);
}

/*
 * This function provides the capability for the fsck program to
 * update an active filesystem. Fourteen operations are provided:
 *
 * adjrefcnt(inode, amt) - adjusts the reference count on the
 *	specified inode by the specified amount. Under normal
 *	operation the count should always go down. Decrementing
 *	the count to zero will cause the inode to be freed.
 * adjblkcnt(inode, amt) - adjust the number of blocks used by the
 *	inode by the specified amount.
 * adjndir, adjbfree, adjifree, adjffree, adjnumclusters(amt) -
 *	adjust the superblock summary.
 * freedirs(inode, count) - directory inodes [inode..inode + count - 1]
 *	are marked as free. Inodes should never have to be marked
 *	as in use.
 * freefiles(inode, count) - file inodes [inode..inode + count - 1]
 *	are marked as free. Inodes should never have to be marked
 *	as in use.
 * freeblks(blockno, size) - blocks [blockno..blockno + size - 1]
 *	are marked as free. Blocks should never have to be marked
 *	as in use.
 * setflags(flags, set/clear) - the fs_flags field has the specified
 *	flags set (second parameter +1) or cleared (second parameter -1).
 * setcwd(dirinode) - set the current directory to dirinode in the
 *	filesystem associated with the snapshot.
 * setdotdot(oldvalue, newvalue) - Verify that the inode number for ".."
 *	in the current directory is oldvalue then change it to newvalue.
 * unlink(nameptr, oldvalue) - Verify that the inode number associated
 *	with nameptr in the current directory is oldvalue then unlink it.
 *
 * The following functions may only be used on a quiescent filesystem
 * by the soft updates journal. They are not safe to be run on an active
 * filesystem.
 *
 * setinode(inode, dip) - the specified disk inode is replaced with the
 *	contents pointed to by dip.
 * setbufoutput(fd, flags) - output associated with the specified file
 *	descriptor (which must reference the character device supporting
 *	the filesystem) switches from using physio to running through the
 *	buffer cache when flags is set to 1. The descriptor reverts to
 *	physio for output when flags is set to zero.
 */

/* Shared handler for all of the vfs.ffs fsck sysctl nodes below. */
static int sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS);

SYSCTL_PROC(_vfs_ffs, FFS_ADJ_REFCNT, adjrefcnt, CTLFLAG_WR|CTLTYPE_STRUCT,
	0, 0, sysctl_ffs_fsck, "S,fsck", "Adjust Inode Reference Count");

static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_BLKCNT, adjblkcnt, CTLFLAG_WR,
	sysctl_ffs_fsck, "Adjust Inode Used Blocks Count");

static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NDIR, adjndir, CTLFLAG_WR,
	sysctl_ffs_fsck, "Adjust number of directories");

static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NBFREE, adjnbfree, CTLFLAG_WR,
	sysctl_ffs_fsck, "Adjust number of free blocks");

static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NIFREE, adjnifree, CTLFLAG_WR,
	sysctl_ffs_fsck, "Adjust number of free inodes");

static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NFFREE, adjnffree, CTLFLAG_WR,
	sysctl_ffs_fsck, "Adjust number of free frags");

static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NUMCLUSTERS, adjnumclusters, CTLFLAG_WR,
	sysctl_ffs_fsck, "Adjust number of free clusters");

static SYSCTL_NODE(_vfs_ffs, FFS_DIR_FREE, freedirs, CTLFLAG_WR,
	sysctl_ffs_fsck, "Free Range of Directory Inodes");

static
SYSCTL_NODE(_vfs_ffs, FFS_FILE_FREE, freefiles, CTLFLAG_WR, 266674548Smckusick sysctl_ffs_fsck, "Free Range of File Inodes"); 266774548Smckusick 2668141631Sphkstatic SYSCTL_NODE(_vfs_ffs, FFS_BLK_FREE, freeblks, CTLFLAG_WR, 266974548Smckusick sysctl_ffs_fsck, "Free Range of Blocks"); 267074548Smckusick 2671141631Sphkstatic SYSCTL_NODE(_vfs_ffs, FFS_SET_FLAGS, setflags, CTLFLAG_WR, 267274548Smckusick sysctl_ffs_fsck, "Change Filesystem Flags"); 267374548Smckusick 2674202113Smckusickstatic SYSCTL_NODE(_vfs_ffs, FFS_SET_CWD, setcwd, CTLFLAG_WR, 2675202113Smckusick sysctl_ffs_fsck, "Set Current Working Directory"); 2676202113Smckusick 2677202113Smckusickstatic SYSCTL_NODE(_vfs_ffs, FFS_SET_DOTDOT, setdotdot, CTLFLAG_WR, 2678202113Smckusick sysctl_ffs_fsck, "Change Value of .. Entry"); 2679202113Smckusick 2680202113Smckusickstatic SYSCTL_NODE(_vfs_ffs, FFS_UNLINK, unlink, CTLFLAG_WR, 2681202113Smckusick sysctl_ffs_fsck, "Unlink a Duplicate Name"); 2682202113Smckusick 2683224061Smckusickstatic SYSCTL_NODE(_vfs_ffs, FFS_SET_INODE, setinode, CTLFLAG_WR, 2684224061Smckusick sysctl_ffs_fsck, "Update an On-Disk Inode"); 2685224061Smckusick 2686224061Smckusickstatic SYSCTL_NODE(_vfs_ffs, FFS_SET_BUFOUTPUT, setbufoutput, CTLFLAG_WR, 2687224061Smckusick sysctl_ffs_fsck, "Set Buffered Writing for Descriptor"); 2688224061Smckusick 2689224061Smckusick#define DEBUG 1 269074548Smckusick#ifdef DEBUG 2691224272Smckusickstatic int fsckcmds = 0; 269274548SmckusickSYSCTL_INT(_debug, OID_AUTO, fsckcmds, CTLFLAG_RW, &fsckcmds, 0, ""); 269374548Smckusick#endif /* DEBUG */ 269474548Smckusick 2695224061Smckusickstatic int buffered_write(struct file *, struct uio *, struct ucred *, 2696224061Smckusick int, struct thread *); 2697224061Smckusick 269874548Smckusickstatic int 269974548Smckusicksysctl_ffs_fsck(SYSCTL_HANDLER_ARGS) 270074548Smckusick{ 2701202113Smckusick struct thread *td = curthread; 270274548Smckusick struct fsck_cmd cmd; 270374548Smckusick struct ufsmount *ump; 2704202113Smckusick 
struct vnode *vp, *vpold, *dvp, *fdvp; 2705202113Smckusick struct inode *ip, *dp; 270674548Smckusick struct mount *mp; 270774548Smckusick struct fs *fs; 270898542Smckusick ufs2_daddr_t blkno; 270974548Smckusick long blkcnt, blksize; 2710202113Smckusick struct filedesc *fdp; 2711224061Smckusick struct file *fp, *vfp; 2712255219Spjd cap_rights_t rights; 2713241896Skib int filetype, error; 2714224061Smckusick static struct fileops *origops, bufferedops; 271574548Smckusick 271674548Smckusick if (req->newlen > sizeof cmd) 271774548Smckusick return (EBADRPC); 271874548Smckusick if ((error = SYSCTL_IN(req, &cmd, sizeof cmd)) != 0) 271974548Smckusick return (error); 272074548Smckusick if (cmd.version != FFS_CMD_VERSION) 272174548Smckusick return (ERPCMISMATCH); 2722255219Spjd if ((error = getvnode(td->td_proc->p_fd, cmd.handle, 2723255219Spjd cap_rights_init(&rights, CAP_FSCK), &fp)) != 0) 272474548Smckusick return (error); 2725202113Smckusick vp = fp->f_data; 2726202113Smckusick if (vp->v_type != VREG && vp->v_type != VDIR) { 2727202113Smckusick fdrop(fp, td); 2728202113Smckusick return (EINVAL); 2729202113Smckusick } 2730202113Smckusick vn_start_write(vp, &mp, V_WAIT); 273175572Smckusick if (mp == 0 || strncmp(mp->mnt_stat.f_fstypename, "ufs", MFSNAMELEN)) { 273275572Smckusick vn_finished_write(mp); 2733202113Smckusick fdrop(fp, td); 273474705Smckusick return (EINVAL); 273575572Smckusick } 2736224061Smckusick ump = VFSTOUFS(mp); 2737224061Smckusick if ((mp->mnt_flag & MNT_RDONLY) && 2738224061Smckusick ump->um_fsckpid != td->td_proc->p_pid) { 273975572Smckusick vn_finished_write(mp); 2740202113Smckusick fdrop(fp, td); 274174548Smckusick return (EROFS); 274275572Smckusick } 274374548Smckusick fs = ump->um_fs; 274474548Smckusick filetype = IFREG; 274574548Smckusick 274674548Smckusick switch (oidp->oid_number) { 274774548Smckusick 274874548Smckusick case FFS_SET_FLAGS: 274974548Smckusick#ifdef DEBUG 275074548Smckusick if (fsckcmds) 275174548Smckusick printf("%s: %s 
flags\n", mp->mnt_stat.f_mntonname, 275274548Smckusick cmd.size > 0 ? "set" : "clear"); 275374548Smckusick#endif /* DEBUG */ 275474548Smckusick if (cmd.size > 0) 275574548Smckusick fs->fs_flags |= (long)cmd.value; 275674548Smckusick else 275774548Smckusick fs->fs_flags &= ~(long)cmd.value; 275874548Smckusick break; 275974548Smckusick 276074548Smckusick case FFS_ADJ_REFCNT: 276174548Smckusick#ifdef DEBUG 276274548Smckusick if (fsckcmds) { 2763224061Smckusick printf("%s: adjust inode %jd link count by %jd\n", 276499590Sbde mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, 276599590Sbde (intmax_t)cmd.size); 276674548Smckusick } 276774548Smckusick#endif /* DEBUG */ 2768141526Sphk if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) 276975572Smckusick break; 277074548Smckusick ip = VTOI(vp); 277174548Smckusick ip->i_nlink += cmd.size; 2772132775Skan DIP_SET(ip, i_nlink, ip->i_nlink); 277374548Smckusick ip->i_effnlink += cmd.size; 2774224061Smckusick ip->i_flag |= IN_CHANGE | IN_MODIFIED; 2775224061Smckusick error = ffs_update(vp, 1); 277674548Smckusick if (DOINGSOFTDEP(vp)) 277774548Smckusick softdep_change_linkcnt(ip); 277874548Smckusick vput(vp); 277974548Smckusick break; 278074548Smckusick 278174548Smckusick case FFS_ADJ_BLKCNT: 278274548Smckusick#ifdef DEBUG 278374548Smckusick if (fsckcmds) { 278499590Sbde printf("%s: adjust inode %jd block count by %jd\n", 278599590Sbde mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, 278699590Sbde (intmax_t)cmd.size); 278774548Smckusick } 278874548Smckusick#endif /* DEBUG */ 2789141526Sphk if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) 279075572Smckusick break; 279174548Smckusick ip = VTOI(vp); 2792132775Skan DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + cmd.size); 2793224061Smckusick ip->i_flag |= IN_CHANGE | IN_MODIFIED; 2794224061Smckusick error = ffs_update(vp, 1); 279574548Smckusick vput(vp); 279674548Smckusick break; 279774548Smckusick 279874548Smckusick case FFS_DIR_FREE: 279974548Smckusick filetype = 
IFDIR; 280074548Smckusick /* fall through */ 280174548Smckusick 280274548Smckusick case FFS_FILE_FREE: 280374548Smckusick#ifdef DEBUG 280474548Smckusick if (fsckcmds) { 280574548Smckusick if (cmd.size == 1) 2806241011Smdf printf("%s: free %s inode %ju\n", 280774548Smckusick mp->mnt_stat.f_mntonname, 280874548Smckusick filetype == IFDIR ? "directory" : "file", 2809241011Smdf (uintmax_t)cmd.value); 281074548Smckusick else 2811241011Smdf printf("%s: free %s inodes %ju-%ju\n", 281274548Smckusick mp->mnt_stat.f_mntonname, 281374548Smckusick filetype == IFDIR ? "directory" : "file", 2814241011Smdf (uintmax_t)cmd.value, 2815241011Smdf (uintmax_t)(cmd.value + cmd.size - 1)); 281674548Smckusick } 281774548Smckusick#endif /* DEBUG */ 281874548Smckusick while (cmd.size > 0) { 2819140704Sjeff if ((error = ffs_freefile(ump, fs, ump->um_devvp, 2820207141Sjeff cmd.value, filetype, NULL))) 282175572Smckusick break; 282274548Smckusick cmd.size -= 1; 282374548Smckusick cmd.value += 1; 282474548Smckusick } 282574548Smckusick break; 282674548Smckusick 282774548Smckusick case FFS_BLK_FREE: 282874548Smckusick#ifdef DEBUG 282974548Smckusick if (fsckcmds) { 283074548Smckusick if (cmd.size == 1) 2831103594Sobrien printf("%s: free block %jd\n", 283274548Smckusick mp->mnt_stat.f_mntonname, 283398542Smckusick (intmax_t)cmd.value); 283474548Smckusick else 2835103594Sobrien printf("%s: free blocks %jd-%jd\n", 283674548Smckusick mp->mnt_stat.f_mntonname, 283798542Smckusick (intmax_t)cmd.value, 283898542Smckusick (intmax_t)cmd.value + cmd.size - 1); 283974548Smckusick } 284074548Smckusick#endif /* DEBUG */ 284198542Smckusick blkno = cmd.value; 284274548Smckusick blkcnt = cmd.size; 284374548Smckusick blksize = fs->fs_frag - (blkno % fs->fs_frag); 284474548Smckusick while (blkcnt > 0) { 284574548Smckusick if (blksize > blkcnt) 284674548Smckusick blksize = blkcnt; 2847140704Sjeff ffs_blkfree(ump, fs, ump->um_devvp, blkno, 2848223127Smckusick blksize * fs->fs_fsize, ROOTINO, VDIR, NULL); 
284974548Smckusick blkno += blksize; 285074548Smckusick blkcnt -= blksize; 285174548Smckusick blksize = fs->fs_frag; 285274548Smckusick } 285374548Smckusick break; 285474548Smckusick 2855142123Sdelphij /* 2856142123Sdelphij * Adjust superblock summaries. fsck(8) is expected to 2857142123Sdelphij * submit deltas when necessary. 2858142123Sdelphij */ 2859142123Sdelphij case FFS_ADJ_NDIR: 2860142123Sdelphij#ifdef DEBUG 2861142123Sdelphij if (fsckcmds) { 2862142123Sdelphij printf("%s: adjust number of directories by %jd\n", 2863142123Sdelphij mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); 2864142123Sdelphij } 2865142123Sdelphij#endif /* DEBUG */ 2866142123Sdelphij fs->fs_cstotal.cs_ndir += cmd.value; 2867142123Sdelphij break; 2868202113Smckusick 2869142123Sdelphij case FFS_ADJ_NBFREE: 2870142123Sdelphij#ifdef DEBUG 2871142123Sdelphij if (fsckcmds) { 2872142123Sdelphij printf("%s: adjust number of free blocks by %+jd\n", 2873142123Sdelphij mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); 2874142123Sdelphij } 2875142123Sdelphij#endif /* DEBUG */ 2876142123Sdelphij fs->fs_cstotal.cs_nbfree += cmd.value; 2877142123Sdelphij break; 2878202113Smckusick 2879142123Sdelphij case FFS_ADJ_NIFREE: 2880142123Sdelphij#ifdef DEBUG 2881142123Sdelphij if (fsckcmds) { 2882142123Sdelphij printf("%s: adjust number of free inodes by %+jd\n", 2883142123Sdelphij mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); 2884142123Sdelphij } 2885142123Sdelphij#endif /* DEBUG */ 2886142123Sdelphij fs->fs_cstotal.cs_nifree += cmd.value; 2887142123Sdelphij break; 2888202113Smckusick 2889142123Sdelphij case FFS_ADJ_NFFREE: 2890142123Sdelphij#ifdef DEBUG 2891142123Sdelphij if (fsckcmds) { 2892142123Sdelphij printf("%s: adjust number of free frags by %+jd\n", 2893142123Sdelphij mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); 2894142123Sdelphij } 2895142123Sdelphij#endif /* DEBUG */ 2896142123Sdelphij fs->fs_cstotal.cs_nffree += cmd.value; 2897142123Sdelphij break; 2898202113Smckusick 2899142123Sdelphij case 
FFS_ADJ_NUMCLUSTERS: 2900142123Sdelphij#ifdef DEBUG 2901142123Sdelphij if (fsckcmds) { 2902142123Sdelphij printf("%s: adjust number of free clusters by %+jd\n", 2903142123Sdelphij mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); 2904142123Sdelphij } 2905142123Sdelphij#endif /* DEBUG */ 2906142123Sdelphij fs->fs_cstotal.cs_numclusters += cmd.value; 2907142123Sdelphij break; 2908142123Sdelphij 2909202113Smckusick case FFS_SET_CWD: 2910202113Smckusick#ifdef DEBUG 2911202113Smckusick if (fsckcmds) { 2912202113Smckusick printf("%s: set current directory to inode %jd\n", 2913202113Smckusick mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); 2914202113Smckusick } 2915202113Smckusick#endif /* DEBUG */ 2916202113Smckusick if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_SHARED, &vp))) 2917202113Smckusick break; 2918202113Smckusick AUDIT_ARG_VNODE1(vp); 2919202113Smckusick if ((error = change_dir(vp, td)) != 0) { 2920202113Smckusick vput(vp); 2921202113Smckusick break; 2922202113Smckusick } 2923202113Smckusick VOP_UNLOCK(vp, 0); 2924202113Smckusick fdp = td->td_proc->p_fd; 2925202113Smckusick FILEDESC_XLOCK(fdp); 2926202113Smckusick vpold = fdp->fd_cdir; 2927202113Smckusick fdp->fd_cdir = vp; 2928202113Smckusick FILEDESC_XUNLOCK(fdp); 2929202113Smckusick vrele(vpold); 2930202113Smckusick break; 2931202113Smckusick 2932202113Smckusick case FFS_SET_DOTDOT: 2933202113Smckusick#ifdef DEBUG 2934202113Smckusick if (fsckcmds) { 2935202113Smckusick printf("%s: change .. in cwd from %jd to %jd\n", 2936202113Smckusick mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, 2937202113Smckusick (intmax_t)cmd.size); 2938202113Smckusick } 2939202113Smckusick#endif /* DEBUG */ 2940202113Smckusick /* 2941202113Smckusick * First we have to get and lock the parent directory 2942202113Smckusick * to which ".." points. 
2943202113Smckusick */ 2944202113Smckusick error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &fdvp); 2945202113Smckusick if (error) 2946202113Smckusick break; 2947202113Smckusick /* 2948202113Smckusick * Now we get and lock the child directory containing "..". 2949202113Smckusick */ 2950202113Smckusick FILEDESC_SLOCK(td->td_proc->p_fd); 2951202113Smckusick dvp = td->td_proc->p_fd->fd_cdir; 2952202113Smckusick FILEDESC_SUNLOCK(td->td_proc->p_fd); 2953202113Smckusick if ((error = vget(dvp, LK_EXCLUSIVE, td)) != 0) { 2954202113Smckusick vput(fdvp); 2955202113Smckusick break; 2956202113Smckusick } 2957202113Smckusick dp = VTOI(dvp); 2958202113Smckusick dp->i_offset = 12; /* XXX mastertemplate.dot_reclen */ 2959202113Smckusick error = ufs_dirrewrite(dp, VTOI(fdvp), (ino_t)cmd.size, 2960202113Smckusick DT_DIR, 0); 2961202113Smckusick cache_purge(fdvp); 2962202113Smckusick cache_purge(dvp); 2963202113Smckusick vput(dvp); 2964202113Smckusick vput(fdvp); 2965202113Smckusick break; 2966202113Smckusick 2967202113Smckusick case FFS_UNLINK: 2968202113Smckusick#ifdef DEBUG 2969202113Smckusick if (fsckcmds) { 2970202113Smckusick char buf[32]; 2971202113Smckusick 2972202125Smckusick if (copyinstr((char *)(intptr_t)cmd.value, buf,32,NULL)) 2973202113Smckusick strncpy(buf, "Name_too_long", 32); 2974202113Smckusick printf("%s: unlink %s (inode %jd)\n", 2975202113Smckusick mp->mnt_stat.f_mntonname, buf, (intmax_t)cmd.size); 2976202113Smckusick } 2977202113Smckusick#endif /* DEBUG */ 2978202113Smckusick /* 2979202113Smckusick * kern_unlinkat will do its own start/finish writes and 2980202113Smckusick * they do not nest, so drop ours here. Setting mp == NULL 2981202113Smckusick * indicates that vn_finished_write is not needed down below. 
2982202113Smckusick */ 2983202113Smckusick vn_finished_write(mp); 2984202113Smckusick mp = NULL; 2985202125Smckusick error = kern_unlinkat(td, AT_FDCWD, (char *)(intptr_t)cmd.value, 2986202113Smckusick UIO_USERSPACE, (ino_t)cmd.size); 2987202113Smckusick break; 2988202113Smckusick 2989224061Smckusick case FFS_SET_INODE: 2990224061Smckusick if (ump->um_fsckpid != td->td_proc->p_pid) { 2991224061Smckusick error = EPERM; 2992224061Smckusick break; 2993224061Smckusick } 2994224061Smckusick#ifdef DEBUG 2995224272Smckusick if (fsckcmds) { 2996224061Smckusick printf("%s: update inode %jd\n", 2997224061Smckusick mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); 2998224061Smckusick } 2999224061Smckusick#endif /* DEBUG */ 3000224061Smckusick if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) 3001224061Smckusick break; 3002224061Smckusick AUDIT_ARG_VNODE1(vp); 3003224061Smckusick ip = VTOI(vp); 3004224061Smckusick if (ip->i_ump->um_fstype == UFS1) 3005224061Smckusick error = copyin((void *)(intptr_t)cmd.size, ip->i_din1, 3006224061Smckusick sizeof(struct ufs1_dinode)); 3007224061Smckusick else 3008224061Smckusick error = copyin((void *)(intptr_t)cmd.size, ip->i_din2, 3009224061Smckusick sizeof(struct ufs2_dinode)); 3010224061Smckusick if (error) { 3011224061Smckusick vput(vp); 3012224061Smckusick break; 3013224061Smckusick } 3014224061Smckusick ip->i_flag |= IN_CHANGE | IN_MODIFIED; 3015224061Smckusick error = ffs_update(vp, 1); 3016224061Smckusick vput(vp); 3017224061Smckusick break; 3018224061Smckusick 3019224061Smckusick case FFS_SET_BUFOUTPUT: 3020224061Smckusick if (ump->um_fsckpid != td->td_proc->p_pid) { 3021224061Smckusick error = EPERM; 3022224061Smckusick break; 3023224061Smckusick } 3024224061Smckusick if (VTOI(vp)->i_ump != ump) { 3025224061Smckusick error = EINVAL; 3026224061Smckusick break; 3027224061Smckusick } 3028224061Smckusick#ifdef DEBUG 3029224061Smckusick if (fsckcmds) { 3030224061Smckusick printf("%s: %s buffered output for descriptor 
%jd\n", 3031224061Smckusick mp->mnt_stat.f_mntonname, 3032224061Smckusick cmd.size == 1 ? "enable" : "disable", 3033224061Smckusick (intmax_t)cmd.value); 3034224061Smckusick } 3035224061Smckusick#endif /* DEBUG */ 3036224778Srwatson if ((error = getvnode(td->td_proc->p_fd, cmd.value, 3037255219Spjd cap_rights_init(&rights, CAP_FSCK), &vfp)) != 0) 3038224061Smckusick break; 3039224061Smckusick if (vfp->f_vnode->v_type != VCHR) { 3040224061Smckusick fdrop(vfp, td); 3041224061Smckusick error = EINVAL; 3042224061Smckusick break; 3043224061Smckusick } 3044224061Smckusick if (origops == NULL) { 3045224061Smckusick origops = vfp->f_ops; 3046224061Smckusick bcopy((void *)origops, (void *)&bufferedops, 3047224061Smckusick sizeof(bufferedops)); 3048224061Smckusick bufferedops.fo_write = buffered_write; 3049224061Smckusick } 3050224061Smckusick if (cmd.size == 1) 3051224061Smckusick atomic_store_rel_ptr((volatile uintptr_t *)&vfp->f_ops, 3052224061Smckusick (uintptr_t)&bufferedops); 3053224061Smckusick else 3054224061Smckusick atomic_store_rel_ptr((volatile uintptr_t *)&vfp->f_ops, 3055224061Smckusick (uintptr_t)origops); 3056224061Smckusick fdrop(vfp, td); 3057224061Smckusick break; 3058224061Smckusick 305974548Smckusick default: 306074548Smckusick#ifdef DEBUG 306174548Smckusick if (fsckcmds) { 306274548Smckusick printf("Invalid request %d from fsck\n", 306374548Smckusick oidp->oid_number); 306474548Smckusick } 306574548Smckusick#endif /* DEBUG */ 306675572Smckusick error = EINVAL; 306775572Smckusick break; 306874548Smckusick 306974548Smckusick } 3070202113Smckusick fdrop(fp, td); 307175572Smckusick vn_finished_write(mp); 307275572Smckusick return (error); 307374548Smckusick} 3074224061Smckusick 3075224061Smckusick/* 3076224061Smckusick * Function to switch a descriptor to use the buffer cache to stage 3077224061Smckusick * its I/O. 
 * This is needed so that writes to the filesystem device
 * will give snapshots a chance to copy modified blocks for which it
 * needs to retain copies.
 *
 * This routine is installed as the descriptor's fo_write handler by the
 * FFS_SET_BUFOUTPUT fsck command above (via bufferedops), so its
 * arguments follow the file-operation write contract.  Returns 0 on
 * success or an errno: EINVAL if the descriptor is not a disk device,
 * the current directory is not on a UFS filesystem backed by that
 * device, or the I/O range is not fragment-aligned within a single
 * filesystem block.
 */
static int
buffered_write(fp, uio, active_cred, flags, td)
	struct file *fp;		/* device descriptor being written */
	struct uio *uio;		/* user I/O request */
	struct ucred *active_cred;	/* unused here */
	int flags;
	struct thread *td;
{
	struct vnode *devvp, *vp;
	struct inode *ip;
	struct buf *bp;
	struct fs *fs;
	struct filedesc *fdp;
	int error;
	daddr_t lbn;

	/*
	 * The devvp is associated with the /dev filesystem. To discover
	 * the filesystem with which the device is associated, we depend
	 * on the application setting the current directory to a location
	 * within the filesystem being written. Yes, this is an ugly hack.
	 */
	devvp = fp->f_vnode;
	if (!vn_isdisk(devvp, NULL))
		return (EINVAL);
	/* Take a reference on the current directory under the fd lock. */
	fdp = td->td_proc->p_fd;
	FILEDESC_SLOCK(fdp);
	vp = fdp->fd_cdir;
	vref(vp);
	FILEDESC_SUNLOCK(fdp);
	vn_lock(vp, LK_SHARED | LK_RETRY);
	/*
	 * Check that the current directory vnode indeed belongs to
	 * UFS before trying to dereference UFS-specific v_data fields.
	 */
	if (vp->v_op != &ffs_vnodeops1 && vp->v_op != &ffs_vnodeops2) {
		vput(vp);
		return (EINVAL);
	}
	/* The cwd must live on the filesystem backed by this device. */
	ip = VTOI(vp);
	if (ip->i_devvp != devvp) {
		vput(vp);
		return (EINVAL);
	}
	fs = ip->i_fs;
	/*
	 * Done with the cwd vnode; release it before taking the device
	 * vnode lock so the two locks are never held simultaneously.
	 */
	vput(vp);
	/* Paired with foffset_unlock_uio() on every exit path below. */
	foffset_lock_uio(fp, uio, flags);
	vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
#ifdef DEBUG
	if (fsckcmds) {
		printf("%s: buffered write for block %jd\n",
		    fs->fs_fsmnt, (intmax_t)btodb(uio->uio_offset));
	}
#endif /* DEBUG */
	/*
	 * All I/O must be contained within a filesystem block, start on
	 * a fragment boundary, and be a multiple of fragments in length.
	 */
	if (uio->uio_resid > fs->fs_bsize - (uio->uio_offset % fs->fs_bsize) ||
	    fragoff(fs, uio->uio_offset) != 0 ||
	    fragoff(fs, uio->uio_resid) != 0) {
		error = EINVAL;
		goto out;
	}
	/*
	 * Stage the write through the buffer cache on the device vnode
	 * (device buffers here are addressed in fragment units —
	 * NOTE(review): confirm numfrags() addressing against getblk()
	 * callers elsewhere in this file).  B_RELBUF presumably lets the
	 * buffer be released once the write completes.
	 */
	lbn = numfrags(fs, uio->uio_offset);
	bp = getblk(devvp, lbn, uio->uio_resid, 0, 0, 0);
	bp->b_flags |= B_RELBUF;
	if ((error = uiomove((char *)bp->b_data, uio->uio_resid, uio)) != 0) {
		brelse(bp);
		goto out;
	}
	error = bwrite(bp);
out:
	VOP_UNLOCK(devvp, 0);
	/* FOF_NEXTOFF: advance the seek offset past this write. */
	foffset_unlock_uio(fp, uio, flags | FOF_NEXTOFF);
	return (error);
}