ffs_vnops.c (112181) ffs_vnops.c (112694)
1/*
2 * Copyright (c) 2002 Networks Associates Technology, Inc.
3 * All rights reserved.
4 *
5 * This software was developed for the FreeBSD Project by Marshall
6 * Kirk McKusick and Network Associates Laboratories, the Security
7 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
8 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
9 * research program
10 *
11 * Copyright (c) 1982, 1986, 1989, 1993
12 * The Regents of the University of California. All rights reserved.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * 1. Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in the
21 * documentation and/or other materials provided with the distribution.
22 * 3. All advertising materials mentioning features or use of this software
23 * must display the following acknowledgement:
24 * This product includes software developed by the University of
25 * California, Berkeley and its contributors.
26 * 4. Neither the name of the University nor the names of its contributors
27 * may be used to endorse or promote products derived from this software
28 * without specific prior written permission.
29 *
30 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40 * SUCH DAMAGE.
41 *
42 * @(#)ffs_vnops.c 8.15 (Berkeley) 5/14/95
43 * $FreeBSD: head/sys/ufs/ffs/ffs_vnops.c 112181 2003-03-13 07:19:23Z jeff $
43 * $FreeBSD: head/sys/ufs/ffs/ffs_vnops.c 112694 2003-03-26 23:40:42Z tegge $
44 */
45
46#include <sys/param.h>
47#include <sys/bio.h>
48#include <sys/systm.h>
49#include <sys/buf.h>
50#include <sys/conf.h>
51#include <sys/extattr.h>
52#include <sys/kernel.h>
53#include <sys/malloc.h>
54#include <sys/mount.h>
55#include <sys/proc.h>
56#include <sys/resourcevar.h>
57#include <sys/signalvar.h>
58#include <sys/stat.h>
59#include <sys/vmmeter.h>
60#include <sys/vnode.h>
61
62#include <machine/limits.h>
63
64#include <vm/vm.h>
65#include <vm/vm_extern.h>
66#include <vm/vm_object.h>
67#include <vm/vm_page.h>
68#include <vm/vm_pager.h>
69#include <vm/vnode_pager.h>
70
71#include <ufs/ufs/extattr.h>
72#include <ufs/ufs/quota.h>
73#include <ufs/ufs/inode.h>
74#include <ufs/ufs/ufs_extern.h>
75#include <ufs/ufs/ufsmount.h>
76
77#include <ufs/ffs/fs.h>
78#include <ufs/ffs/ffs_extern.h>
79#include "opt_directio.h"
 80
81#ifdef DIRECTIO
82extern int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
83#endif
80static int ffs_fsync(struct vop_fsync_args *);
81static int ffs_getpages(struct vop_getpages_args *);
82static int ffs_read(struct vop_read_args *);
83static int ffs_write(struct vop_write_args *);
84static int ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
85static int ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
86 struct ucred *cred);
87static int ffsext_strategy(struct vop_strategy_args *);
88static int ffs_closeextattr(struct vop_closeextattr_args *);
89static int ffs_getextattr(struct vop_getextattr_args *);
90static int ffs_openextattr(struct vop_openextattr_args *);
91static int ffs_setextattr(struct vop_setextattr_args *);
92
93
94/* Global vfs data structures for ufs. */
95vop_t **ffs_vnodeop_p;
96static struct vnodeopv_entry_desc ffs_vnodeop_entries[] = {
97 { &vop_default_desc, (vop_t *) ufs_vnoperate },
98 { &vop_fsync_desc, (vop_t *) ffs_fsync },
99 { &vop_getpages_desc, (vop_t *) ffs_getpages },
100 { &vop_read_desc, (vop_t *) ffs_read },
101 { &vop_reallocblks_desc, (vop_t *) ffs_reallocblks },
102 { &vop_write_desc, (vop_t *) ffs_write },
103 { &vop_closeextattr_desc, (vop_t *) ffs_closeextattr },
104 { &vop_getextattr_desc, (vop_t *) ffs_getextattr },
105 { &vop_openextattr_desc, (vop_t *) ffs_openextattr },
106 { &vop_setextattr_desc, (vop_t *) ffs_setextattr },
107 { NULL, NULL }
108};
109static struct vnodeopv_desc ffs_vnodeop_opv_desc =
110 { &ffs_vnodeop_p, ffs_vnodeop_entries };
111
112vop_t **ffs_specop_p;
113static struct vnodeopv_entry_desc ffs_specop_entries[] = {
114 { &vop_default_desc, (vop_t *) ufs_vnoperatespec },
115 { &vop_fsync_desc, (vop_t *) ffs_fsync },
116 { &vop_reallocblks_desc, (vop_t *) ffs_reallocblks },
117 { &vop_strategy_desc, (vop_t *) ffsext_strategy },
118 { &vop_closeextattr_desc, (vop_t *) ffs_closeextattr },
119 { &vop_getextattr_desc, (vop_t *) ffs_getextattr },
120 { &vop_openextattr_desc, (vop_t *) ffs_openextattr },
121 { &vop_setextattr_desc, (vop_t *) ffs_setextattr },
122 { NULL, NULL }
123};
124static struct vnodeopv_desc ffs_specop_opv_desc =
125 { &ffs_specop_p, ffs_specop_entries };
126
127vop_t **ffs_fifoop_p;
128static struct vnodeopv_entry_desc ffs_fifoop_entries[] = {
129 { &vop_default_desc, (vop_t *) ufs_vnoperatefifo },
130 { &vop_fsync_desc, (vop_t *) ffs_fsync },
131 { &vop_reallocblks_desc, (vop_t *) ffs_reallocblks },
132 { &vop_strategy_desc, (vop_t *) ffsext_strategy },
133 { &vop_closeextattr_desc, (vop_t *) ffs_closeextattr },
134 { &vop_getextattr_desc, (vop_t *) ffs_getextattr },
135 { &vop_openextattr_desc, (vop_t *) ffs_openextattr },
136 { &vop_setextattr_desc, (vop_t *) ffs_setextattr },
137 { NULL, NULL }
138};
139static struct vnodeopv_desc ffs_fifoop_opv_desc =
140 { &ffs_fifoop_p, ffs_fifoop_entries };
141
142VNODEOP_SET(ffs_vnodeop_opv_desc);
143VNODEOP_SET(ffs_specop_opv_desc);
144VNODEOP_SET(ffs_fifoop_opv_desc);
145
146/*
147 * Synch an open file.
148 */
149/* ARGSUSED */
150static int
151ffs_fsync(ap)
152 struct vop_fsync_args /* {
153 struct vnode *a_vp;
154 struct ucred *a_cred;
155 int a_waitfor;
156 struct thread *a_td;
157 } */ *ap;
158{
159 struct vnode *vp = ap->a_vp;
160 struct inode *ip = VTOI(vp);
161 struct buf *bp;
162 struct buf *nbp;
163 int s, error, wait, passes, skipmeta;
164 ufs_lbn_t lbn;
165
166 wait = (ap->a_waitfor == MNT_WAIT);
167 if (vn_isdisk(vp, NULL)) {
168 lbn = INT_MAX;
169 if (vp->v_rdev->si_mountpoint != NULL &&
170 (vp->v_rdev->si_mountpoint->mnt_flag & MNT_SOFTDEP))
171 softdep_fsync_mountdev(vp);
172 } else {
173 lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1));
174 }
175
176 /*
177 * Flush all dirty buffers associated with a vnode.
178 */
179 passes = NIADDR + 1;
180 skipmeta = 0;
181 if (wait)
182 skipmeta = 1;
183 s = splbio();
184 VI_LOCK(vp);
185loop:
186 TAILQ_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs)
187 bp->b_vflags &= ~BV_SCANNED;
188 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
189 nbp = TAILQ_NEXT(bp, b_vnbufs);
190 /*
191 * Reasons to skip this buffer: it has already been considered
192 * on this pass, this pass is the first time through on a
193 * synchronous flush request and the buffer being considered
194 * is metadata, the buffer has dependencies that will cause
195 * it to be redirtied and it has not already been deferred,
196 * or it is already being written.
197 */
198 if ((bp->b_vflags & BV_SCANNED) != 0)
199 continue;
200 bp->b_vflags |= BV_SCANNED;
201 if ((skipmeta == 1 && bp->b_lblkno < 0))
202 continue;
203 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
204 continue;
205 if (!wait && LIST_FIRST(&bp->b_dep) != NULL &&
206 (bp->b_flags & B_DEFERRED) == 0 &&
207 buf_countdeps(bp, 0)) {
208 bp->b_flags |= B_DEFERRED;
209 BUF_UNLOCK(bp);
210 continue;
211 }
212 VI_UNLOCK(vp);
213 if ((bp->b_flags & B_DELWRI) == 0)
214 panic("ffs_fsync: not dirty");
215 if (vp != bp->b_vp)
216 panic("ffs_fsync: vp != vp->b_vp");
217 /*
218 * If this is a synchronous flush request, or it is not a
 219 * file or device, start the write on this buffer immediately.
220 */
221 if (wait || (vp->v_type != VREG && vp->v_type != VBLK)) {
222
223 /*
224 * On our final pass through, do all I/O synchronously
225 * so that we can find out if our flush is failing
226 * because of write errors.
227 */
228 if (passes > 0 || !wait) {
229 if ((bp->b_flags & B_CLUSTEROK) && !wait) {
230 (void) vfs_bio_awrite(bp);
231 } else {
232 bremfree(bp);
233 splx(s);
234 (void) bawrite(bp);
235 s = splbio();
236 }
237 } else {
238 bremfree(bp);
239 splx(s);
240 if ((error = bwrite(bp)) != 0)
241 return (error);
242 s = splbio();
243 }
244 } else if ((vp->v_type == VREG) && (bp->b_lblkno >= lbn)) {
245 /*
246 * If the buffer is for data that has been truncated
247 * off the file, then throw it away.
248 */
249 bremfree(bp);
250 bp->b_flags |= B_INVAL | B_NOCACHE;
251 splx(s);
252 brelse(bp);
253 s = splbio();
254 } else
255 vfs_bio_awrite(bp);
256
257 /*
258 * Since we may have slept during the I/O, we need
259 * to start from a known point.
260 */
261 VI_LOCK(vp);
262 nbp = TAILQ_FIRST(&vp->v_dirtyblkhd);
263 }
264 /*
265 * If we were asked to do this synchronously, then go back for
266 * another pass, this time doing the metadata.
267 */
268 if (skipmeta) {
269 skipmeta = 0;
270 goto loop;
271 }
272
273 if (wait) {
274 while (vp->v_numoutput) {
275 vp->v_iflag |= VI_BWAIT;
276 msleep((caddr_t)&vp->v_numoutput, VI_MTX(vp),
277 PRIBIO + 4, "ffsfsn", 0);
278 }
279 VI_UNLOCK(vp);
280
281 /*
 282 * Ensure that any filesystem metadata associated
283 * with the vnode has been written.
284 */
285 splx(s);
286 if ((error = softdep_sync_metadata(ap)) != 0)
287 return (error);
288 s = splbio();
289
290 VI_LOCK(vp);
291 if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
292 /*
293 * Block devices associated with filesystems may
294 * have new I/O requests posted for them even if
295 * the vnode is locked, so no amount of trying will
296 * get them clean. Thus we give block devices a
297 * good effort, then just give up. For all other file
298 * types, go around and try again until it is clean.
299 */
300 if (passes > 0) {
301 passes -= 1;
302 goto loop;
303 }
304#ifdef DIAGNOSTIC
305 if (!vn_isdisk(vp, NULL))
306 vprint("ffs_fsync: dirty", vp);
307#endif
308 }
309 }
310 VI_UNLOCK(vp);
311 splx(s);
312 return (UFS_UPDATE(vp, wait));
313}
314
315
316/*
317 * Vnode op for reading.
318 */
319/* ARGSUSED */
320static int
321ffs_read(ap)
322 struct vop_read_args /* {
323 struct vnode *a_vp;
324 struct uio *a_uio;
325 int a_ioflag;
326 struct ucred *a_cred;
327 } */ *ap;
328{
329 struct vnode *vp;
330 struct inode *ip;
331 struct uio *uio;
332 struct fs *fs;
333 struct buf *bp;
334 ufs_lbn_t lbn, nextlbn;
335 off_t bytesinfile;
336 long size, xfersize, blkoffset;
337 int error, orig_resid;
338 mode_t mode;
339 int seqcount;
340 int ioflag;
341 vm_object_t object;
342
343 vp = ap->a_vp;
344 uio = ap->a_uio;
345 ioflag = ap->a_ioflag;
346 if (ap->a_ioflag & IO_EXT)
347#ifdef notyet
348 return (ffs_extread(vp, uio, ioflag));
349#else
350 panic("ffs_read+IO_EXT");
351#endif
356#ifdef DIRECTIO
357 if ((ioflag & IO_DIRECT) != 0) {
358 int workdone;
 359
360 error = ffs_rawread(vp, uio, &workdone);
361 if (error != 0 || workdone != 0)
362 return error;
363 }
364#endif
365
353 GIANT_REQUIRED;
354
355 seqcount = ap->a_ioflag >> 16;
356 ip = VTOI(vp);
357 mode = ip->i_mode;
358
359#ifdef DIAGNOSTIC
360 if (uio->uio_rw != UIO_READ)
361 panic("ffs_read: mode");
362
363 if (vp->v_type == VLNK) {
364 if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
365 panic("ffs_read: short symlink");
366 } else if (vp->v_type != VREG && vp->v_type != VDIR)
367 panic("ffs_read: type %d", vp->v_type);
368#endif
369 fs = ip->i_fs;
370 if ((u_int64_t)uio->uio_offset > fs->fs_maxfilesize)
371 return (EFBIG);
372
373 orig_resid = uio->uio_resid;
374 if (orig_resid <= 0)
375 return (0);
376
377 object = vp->v_object;
378
379 bytesinfile = ip->i_size - uio->uio_offset;
380 if (bytesinfile <= 0) {
381 if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
382 ip->i_flag |= IN_ACCESS;
383 return 0;
384 }
385
386 if (object) {
387 vm_object_reference(object);
388 }
389
390 /*
391 * Ok so we couldn't do it all in one vm trick...
392 * so cycle around trying smaller bites..
393 */
394 for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
395 if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
396 break;
397
398 lbn = lblkno(fs, uio->uio_offset);
399 nextlbn = lbn + 1;
400
401 /*
402 * size of buffer. The buffer representing the
403 * end of the file is rounded up to the size of
404 * the block type ( fragment or full block,
405 * depending ).
406 */
407 size = blksize(fs, ip, lbn);
408 blkoffset = blkoff(fs, uio->uio_offset);
409
410 /*
411 * The amount we want to transfer in this iteration is
412 * one FS block less the amount of the data before
413 * our startpoint (duh!)
414 */
415 xfersize = fs->fs_bsize - blkoffset;
416
417 /*
418 * But if we actually want less than the block,
419 * or the file doesn't have a whole block more of data,
420 * then use the lesser number.
421 */
422 if (uio->uio_resid < xfersize)
423 xfersize = uio->uio_resid;
424 if (bytesinfile < xfersize)
425 xfersize = bytesinfile;
426
427 if (lblktosize(fs, nextlbn) >= ip->i_size) {
428 /*
429 * Don't do readahead if this is the end of the file.
430 */
431 error = bread(vp, lbn, size, NOCRED, &bp);
432 } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
433 /*
434 * Otherwise if we are allowed to cluster,
435 * grab as much as we can.
436 *
437 * XXX This may not be a win if we are not
438 * doing sequential access.
439 */
440 error = cluster_read(vp, ip->i_size, lbn,
441 size, NOCRED, uio->uio_resid, seqcount, &bp);
442 } else if (seqcount > 1) {
443 /*
444 * If we are NOT allowed to cluster, then
445 * if we appear to be acting sequentially,
446 * fire off a request for a readahead
447 * as well as a read. Note that the 4th and 5th
448 * arguments point to arrays of the size specified in
449 * the 6th argument.
450 */
451 int nextsize = blksize(fs, ip, nextlbn);
452 error = breadn(vp, lbn,
453 size, &nextlbn, &nextsize, 1, NOCRED, &bp);
454 } else {
455 /*
456 * Failing all of the above, just read what the
457 * user asked for. Interestingly, the same as
458 * the first option above.
459 */
460 error = bread(vp, lbn, size, NOCRED, &bp);
461 }
462 if (error) {
463 brelse(bp);
464 bp = NULL;
465 break;
466 }
467
468 /*
469 * If IO_DIRECT then set B_DIRECT for the buffer. This
470 * will cause us to attempt to release the buffer later on
471 * and will cause the buffer cache to attempt to free the
472 * underlying pages.
473 */
474 if (ioflag & IO_DIRECT)
475 bp->b_flags |= B_DIRECT;
476
477 /*
478 * We should only get non-zero b_resid when an I/O error
479 * has occurred, which should cause us to break above.
480 * However, if the short read did not cause an error,
481 * then we want to ensure that we do not uiomove bad
482 * or uninitialized data.
483 */
484 size -= bp->b_resid;
485 if (size < xfersize) {
486 if (size == 0)
487 break;
488 xfersize = size;
489 }
490
491 {
492 /*
493 * otherwise use the general form
494 */
495 error =
496 uiomove((char *)bp->b_data + blkoffset,
497 (int)xfersize, uio);
498 }
499
500 if (error)
501 break;
502
503 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
504 (LIST_FIRST(&bp->b_dep) == NULL)) {
505 /*
506 * If there are no dependencies, and it's VMIO,
507 * then we don't need the buf, mark it available
508 * for freeing. The VM has the data.
509 */
510 bp->b_flags |= B_RELBUF;
511 brelse(bp);
512 } else {
513 /*
514 * Otherwise let whoever
515 * made the request take care of
516 * freeing it. We just queue
517 * it onto another list.
518 */
519 bqrelse(bp);
520 }
521 }
522
523 /*
524 * This can only happen in the case of an error
525 * because the loop above resets bp to NULL on each iteration
526 * and on normal completion has not set a new value into it.
527 * so it must have come from a 'break' statement
528 */
529 if (bp != NULL) {
530 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
531 (LIST_FIRST(&bp->b_dep) == NULL)) {
532 bp->b_flags |= B_RELBUF;
533 brelse(bp);
534 } else {
535 bqrelse(bp);
536 }
537 }
538
539 if (object) {
540 vm_object_vndeallocate(object);
541 }
542 if ((error == 0 || uio->uio_resid != orig_resid) &&
543 (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
544 ip->i_flag |= IN_ACCESS;
545 return (error);
546}
547
548/*
549 * Vnode op for writing.
550 */
551static int
552ffs_write(ap)
553 struct vop_write_args /* {
554 struct vnode *a_vp;
555 struct uio *a_uio;
556 int a_ioflag;
557 struct ucred *a_cred;
558 } */ *ap;
559{
560 struct vnode *vp;
561 struct uio *uio;
562 struct inode *ip;
563 struct fs *fs;
564 struct buf *bp;
565 struct thread *td;
566 ufs_lbn_t lbn;
567 off_t osize;
568 int seqcount;
569 int blkoffset, error, extended, flags, ioflag, resid, size, xfersize;
570 vm_object_t object;
571
572 vp = ap->a_vp;
573 uio = ap->a_uio;
574 ioflag = ap->a_ioflag;
575 if (ap->a_ioflag & IO_EXT)
576#ifdef notyet
577 return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
578#else
579 panic("ffs_read+IO_EXT");
580#endif
581
582 GIANT_REQUIRED;
583
584 extended = 0;
585 seqcount = ap->a_ioflag >> 16;
586 ip = VTOI(vp);
587
588 object = vp->v_object;
589 if (object) {
590 vm_object_reference(object);
591 }
592
593#ifdef DIAGNOSTIC
594 if (uio->uio_rw != UIO_WRITE)
595 panic("ffswrite: mode");
596#endif
597
598 switch (vp->v_type) {
599 case VREG:
600 if (ioflag & IO_APPEND)
601 uio->uio_offset = ip->i_size;
602 if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) {
603 if (object) {
604 vm_object_vndeallocate(object);
605 }
606 return (EPERM);
607 }
608 /* FALLTHROUGH */
609 case VLNK:
610 break;
611 case VDIR:
612 panic("ffswrite: dir write");
613 break;
614 default:
615 panic("ffswrite: type %p %d (%d,%d)", vp, (int)vp->v_type,
616 (int)uio->uio_offset,
617 (int)uio->uio_resid
618 );
619 }
620
621 fs = ip->i_fs;
622 if (uio->uio_offset < 0 ||
623 (u_int64_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) {
624 if (object) {
625 vm_object_vndeallocate(object);
626 }
627 return (EFBIG);
628 }
629 /*
630 * Maybe this should be above the vnode op call, but so long as
631 * file servers have no limits, I don't think it matters.
632 */
633 td = uio->uio_td;
634 if (vp->v_type == VREG && td &&
635 uio->uio_offset + uio->uio_resid >
636 td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
637 PROC_LOCK(td->td_proc);
638 psignal(td->td_proc, SIGXFSZ);
639 PROC_UNLOCK(td->td_proc);
640 if (object) {
641 vm_object_vndeallocate(object);
642 }
643 return (EFBIG);
644 }
645
646 resid = uio->uio_resid;
647 osize = ip->i_size;
648 if (seqcount > BA_SEQMAX)
649 flags = BA_SEQMAX << BA_SEQSHIFT;
650 else
651 flags = seqcount << BA_SEQSHIFT;
652 if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
653 flags |= IO_SYNC;
654
655 for (error = 0; uio->uio_resid > 0;) {
656 lbn = lblkno(fs, uio->uio_offset);
657 blkoffset = blkoff(fs, uio->uio_offset);
658 xfersize = fs->fs_bsize - blkoffset;
659 if (uio->uio_resid < xfersize)
660 xfersize = uio->uio_resid;
661
662 if (uio->uio_offset + xfersize > ip->i_size)
663 vnode_pager_setsize(vp, uio->uio_offset + xfersize);
664
665 /*
666 * We must perform a read-before-write if the transfer size
667 * does not cover the entire buffer.
668 */
669 if (fs->fs_bsize > xfersize)
670 flags |= BA_CLRBUF;
671 else
672 flags &= ~BA_CLRBUF;
673/* XXX is uio->uio_offset the right thing here? */
674 error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
675 ap->a_cred, flags, &bp);
676 if (error != 0)
677 break;
678 /*
679 * If the buffer is not valid we have to clear out any
680 * garbage data from the pages instantiated for the buffer.
681 * If we do not, a failed uiomove() during a write can leave
682 * the prior contents of the pages exposed to a userland
683 * mmap(). XXX deal with uiomove() errors a better way.
684 */
685 if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
686 vfs_bio_clrbuf(bp);
687 if (ioflag & IO_DIRECT)
688 bp->b_flags |= B_DIRECT;
689 if (ioflag & IO_NOWDRAIN)
690 bp->b_flags |= B_NOWDRAIN;
691
692 if (uio->uio_offset + xfersize > ip->i_size) {
693 ip->i_size = uio->uio_offset + xfersize;
694 DIP(ip, i_size) = ip->i_size;
695 extended = 1;
696 }
697
698 size = blksize(fs, ip, lbn) - bp->b_resid;
699 if (size < xfersize)
700 xfersize = size;
701
702 error =
703 uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
704 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
705 (LIST_FIRST(&bp->b_dep) == NULL)) {
706 bp->b_flags |= B_RELBUF;
707 }
708
709 /*
710 * If IO_SYNC each buffer is written synchronously. Otherwise
711 * if we have a severe page deficiency write the buffer
712 * asynchronously. Otherwise try to cluster, and if that
713 * doesn't do it then either do an async write (if O_DIRECT),
714 * or a delayed write (if not).
715 */
716 if (ioflag & IO_SYNC) {
717 (void)bwrite(bp);
718 } else if (vm_page_count_severe() ||
719 buf_dirty_count_severe() ||
720 (ioflag & IO_ASYNC)) {
721 bp->b_flags |= B_CLUSTEROK;
722 bawrite(bp);
723 } else if (xfersize + blkoffset == fs->fs_bsize) {
724 if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
725 bp->b_flags |= B_CLUSTEROK;
726 cluster_write(bp, ip->i_size, seqcount);
727 } else {
728 bawrite(bp);
729 }
730 } else if (ioflag & IO_DIRECT) {
731 bp->b_flags |= B_CLUSTEROK;
732 bawrite(bp);
733 } else {
734 bp->b_flags |= B_CLUSTEROK;
735 bdwrite(bp);
736 }
737 if (error || xfersize == 0)
738 break;
739 ip->i_flag |= IN_CHANGE | IN_UPDATE;
740 }
741 /*
742 * If we successfully wrote any data, and we are not the superuser
743 * we clear the setuid and setgid bits as a precaution against
744 * tampering.
745 */
746 if (resid > uio->uio_resid && ap->a_cred &&
747 suser_cred(ap->a_cred, PRISON_ROOT)) {
748 ip->i_mode &= ~(ISUID | ISGID);
749 DIP(ip, i_mode) = ip->i_mode;
750 }
751 if (resid > uio->uio_resid)
752 VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0));
753 if (error) {
754 if (ioflag & IO_UNIT) {
755 (void)UFS_TRUNCATE(vp, osize,
756 IO_NORMAL | (ioflag & IO_SYNC),
757 ap->a_cred, uio->uio_td);
758 uio->uio_offset -= resid - uio->uio_resid;
759 uio->uio_resid = resid;
760 }
761 } else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
762 error = UFS_UPDATE(vp, 1);
763
764 if (object) {
765 vm_object_vndeallocate(object);
766 }
767
768 return (error);
769}
770
771/*
772 * get page routine
773 */
774static int
775ffs_getpages(ap)
776 struct vop_getpages_args *ap;
777{
778 off_t foff, physoffset;
779 int i, size, bsize;
780 struct vnode *dp, *vp;
781 vm_object_t obj;
782 vm_pindex_t pindex, firstindex;
783 vm_page_t mreq;
784 int bbackwards, bforwards;
785 int pbackwards, pforwards;
786 int firstpage;
787 ufs2_daddr_t reqblkno, reqlblkno;
788 int poff;
789 int pcount;
790 int rtval;
791 int pagesperblock;
792
793 GIANT_REQUIRED;
794
795 pcount = round_page(ap->a_count) / PAGE_SIZE;
796 mreq = ap->a_m[ap->a_reqpage];
797 firstindex = ap->a_m[0]->pindex;
798
799 /*
800 * if ANY DEV_BSIZE blocks are valid on a large filesystem block,
801 * then the entire page is valid. Since the page may be mapped,
802 * user programs might reference data beyond the actual end of file
 803 * occurring within the page. We have to zero that data.
804 */
805 if (mreq->valid) {
806 if (mreq->valid != VM_PAGE_BITS_ALL)
807 vm_page_zero_invalid(mreq, TRUE);
808 vm_page_lock_queues();
809 for (i = 0; i < pcount; i++) {
810 if (i != ap->a_reqpage) {
811 vm_page_free(ap->a_m[i]);
812 }
813 }
814 vm_page_unlock_queues();
815 return VM_PAGER_OK;
816 }
817
818 vp = ap->a_vp;
819 obj = vp->v_object;
820 bsize = vp->v_mount->mnt_stat.f_iosize;
821 pindex = mreq->pindex;
822 foff = IDX_TO_OFF(pindex) /* + ap->a_offset should be zero */;
823
824 if (bsize < PAGE_SIZE)
825 return vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
826 ap->a_count,
827 ap->a_reqpage);
828
829 /*
830 * foff is the file offset of the required page
831 * reqlblkno is the logical block that contains the page
832 * poff is the index of the page into the logical block
833 */
834 reqlblkno = foff / bsize;
835 poff = (foff % bsize) / PAGE_SIZE;
836
837 dp = VTOI(vp)->i_devvp;
838 if (ufs_bmaparray(vp, reqlblkno, &reqblkno, 0, &bforwards, &bbackwards)
839 || (reqblkno == -1)) {
840 vm_page_lock_queues();
841 for(i = 0; i < pcount; i++) {
842 if (i != ap->a_reqpage)
843 vm_page_free(ap->a_m[i]);
844 }
845 vm_page_unlock_queues();
846 if (reqblkno == -1) {
847 if ((mreq->flags & PG_ZERO) == 0)
848 pmap_zero_page(mreq);
849 vm_page_undirty(mreq);
850 mreq->valid = VM_PAGE_BITS_ALL;
851 return VM_PAGER_OK;
852 } else {
853 return VM_PAGER_ERROR;
854 }
855 }
856
857 physoffset = (off_t)reqblkno * DEV_BSIZE + poff * PAGE_SIZE;
858 pagesperblock = bsize / PAGE_SIZE;
859 /*
860 * find the first page that is contiguous...
861 * note that pbackwards is the number of pages that are contiguous
862 * backwards.
863 */
864 firstpage = 0;
865 if (ap->a_count) {
866 pbackwards = poff + bbackwards * pagesperblock;
867 if (ap->a_reqpage > pbackwards) {
868 firstpage = ap->a_reqpage - pbackwards;
869 vm_page_lock_queues();
870 for(i=0;i<firstpage;i++)
871 vm_page_free(ap->a_m[i]);
872 vm_page_unlock_queues();
873 }
874
875 /*
876 * pforwards is the number of pages that are contiguous
877 * after the current page.
878 */
879 pforwards = (pagesperblock - (poff + 1)) +
880 bforwards * pagesperblock;
881 if (pforwards < (pcount - (ap->a_reqpage + 1))) {
882 vm_page_lock_queues();
883 for( i = ap->a_reqpage + pforwards + 1; i < pcount; i++)
884 vm_page_free(ap->a_m[i]);
885 vm_page_unlock_queues();
886 pcount = ap->a_reqpage + pforwards + 1;
887 }
888
889 /*
890 * number of pages for I/O corrected for the non-contig pages at
891 * the beginning of the array.
892 */
893 pcount -= firstpage;
894 }
895
896 /*
897 * calculate the size of the transfer
898 */
899
900 size = pcount * PAGE_SIZE;
901
902 if ((IDX_TO_OFF(ap->a_m[firstpage]->pindex) + size) >
903 obj->un_pager.vnp.vnp_size)
904 size = obj->un_pager.vnp.vnp_size -
905 IDX_TO_OFF(ap->a_m[firstpage]->pindex);
906
907 physoffset -= foff;
908 rtval = VOP_GETPAGES(dp, &ap->a_m[firstpage], size,
909 (ap->a_reqpage - firstpage), physoffset);
910
911 return (rtval);
912}
913
914/*
915 * Extended attribute area reading.
916 */
917static int
918ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
919{
920 struct inode *ip;
921 struct ufs2_dinode *dp;
922 struct fs *fs;
923 struct buf *bp;
924 ufs_lbn_t lbn, nextlbn;
925 off_t bytesinfile;
926 long size, xfersize, blkoffset;
927 int error, orig_resid;
928 mode_t mode;
929
930 GIANT_REQUIRED;
931
932 ip = VTOI(vp);
933 fs = ip->i_fs;
934 dp = ip->i_din2;
935 mode = ip->i_mode;
936
937#ifdef DIAGNOSTIC
938 if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
939 panic("ffs_extread: mode");
940
941#endif
942 orig_resid = uio->uio_resid;
943 if (orig_resid <= 0)
944 return (0);
945
946 bytesinfile = dp->di_extsize - uio->uio_offset;
947 if (bytesinfile <= 0) {
948 if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
949 ip->i_flag |= IN_ACCESS;
950 return 0;
951 }
952
953 for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
954 if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
955 break;
956
957 lbn = lblkno(fs, uio->uio_offset);
958 nextlbn = lbn + 1;
959
960 /*
961 * size of buffer. The buffer representing the
962 * end of the file is rounded up to the size of
963 * the block type ( fragment or full block,
964 * depending ).
965 */
966 size = sblksize(fs, dp->di_extsize, lbn);
967 blkoffset = blkoff(fs, uio->uio_offset);
968
969 /*
970 * The amount we want to transfer in this iteration is
971 * one FS block less the amount of the data before
972 * our startpoint (duh!)
973 */
974 xfersize = fs->fs_bsize - blkoffset;
975
976 /*
977 * But if we actually want less than the block,
978 * or the file doesn't have a whole block more of data,
979 * then use the lesser number.
980 */
981 if (uio->uio_resid < xfersize)
982 xfersize = uio->uio_resid;
983 if (bytesinfile < xfersize)
984 xfersize = bytesinfile;
985
986 if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
987 /*
988 * Don't do readahead if this is the end of the info.
989 */
990 error = bread(vp, -1 - lbn, size, NOCRED, &bp);
991 } else {
992 /*
993 * If we have a second block, then
994 * fire off a request for a readahead
995 * as well as a read. Note that the 4th and 5th
996 * arguments point to arrays of the size specified in
997 * the 6th argument.
998 */
999 int nextsize = sblksize(fs, dp->di_extsize, nextlbn);
1000
1001 nextlbn = -1 - nextlbn;
1002 error = breadn(vp, -1 - lbn,
1003 size, &nextlbn, &nextsize, 1, NOCRED, &bp);
1004 }
1005 if (error) {
1006 brelse(bp);
1007 bp = NULL;
1008 break;
1009 }
1010
1011 /*
1012 * If IO_DIRECT then set B_DIRECT for the buffer. This
1013 * will cause us to attempt to release the buffer later on
1014 * and will cause the buffer cache to attempt to free the
1015 * underlying pages.
1016 */
1017 if (ioflag & IO_DIRECT)
1018 bp->b_flags |= B_DIRECT;
1019
1020 /*
1021 * We should only get non-zero b_resid when an I/O error
1022 * has occurred, which should cause us to break above.
1023 * However, if the short read did not cause an error,
1024 * then we want to ensure that we do not uiomove bad
1025 * or uninitialized data.
1026 */
1027 size -= bp->b_resid;
1028 if (size < xfersize) {
1029 if (size == 0)
1030 break;
1031 xfersize = size;
1032 }
1033
1034 error = uiomove((char *)bp->b_data + blkoffset,
1035 (int)xfersize, uio);
1036 if (error)
1037 break;
1038
1039 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
1040 (LIST_FIRST(&bp->b_dep) == NULL)) {
1041 /*
1042 * If there are no dependencies, and it's VMIO,
1043 * then we don't need the buf, mark it available
1044 * for freeing. The VM has the data.
1045 */
1046 bp->b_flags |= B_RELBUF;
1047 brelse(bp);
1048 } else {
1049 /*
1050 * Otherwise let whoever
1051 * made the request take care of
1052 * freeing it. We just queue
1053 * it onto another list.
1054 */
1055 bqrelse(bp);
1056 }
1057 }
1058
1059 /*
1060 * This can only happen in the case of an error
1061 * because the loop above resets bp to NULL on each iteration
1062 * and on normal completion has not set a new value into it.
1063 * so it must have come from a 'break' statement
1064 */
1065 if (bp != NULL) {
1066 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
1067 (LIST_FIRST(&bp->b_dep) == NULL)) {
1068 bp->b_flags |= B_RELBUF;
1069 brelse(bp);
1070 } else {
1071 bqrelse(bp);
1072 }
1073 }
1074
1075 if ((error == 0 || uio->uio_resid != orig_resid) &&
1076 (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
1077 ip->i_flag |= IN_ACCESS;
1078 return (error);
1079}
1080
1081/*
1082 * Extended attribute area writing.
1083 */
1084static int
1085ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
1086{
1087 struct inode *ip;
1088 struct ufs2_dinode *dp;
1089 struct fs *fs;
1090 struct buf *bp;
1091 ufs_lbn_t lbn;
1092 off_t osize;
1093 int blkoffset, error, flags, resid, size, xfersize;
1094
1095 GIANT_REQUIRED;
1096
1097 ip = VTOI(vp);
1098 fs = ip->i_fs;
1099 dp = ip->i_din2;
1100
1101#ifdef DIAGNOSTIC
1102 if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
1103 panic("ext_write: mode");
1104#endif
1105
1106 if (ioflag & IO_APPEND)
1107 uio->uio_offset = dp->di_extsize;
1108
1109 if (uio->uio_offset < 0 ||
1110 (u_int64_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize)
1111 return (EFBIG);
1112
1113 resid = uio->uio_resid;
1114 osize = dp->di_extsize;
1115 flags = IO_EXT;
1116 if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
1117 flags |= IO_SYNC;
1118
1119 for (error = 0; uio->uio_resid > 0;) {
1120 lbn = lblkno(fs, uio->uio_offset);
1121 blkoffset = blkoff(fs, uio->uio_offset);
1122 xfersize = fs->fs_bsize - blkoffset;
1123 if (uio->uio_resid < xfersize)
1124 xfersize = uio->uio_resid;
1125
1126 /*
1127 * We must perform a read-before-write if the transfer size
1128 * does not cover the entire buffer.
1129 */
1130 if (fs->fs_bsize > xfersize)
1131 flags |= BA_CLRBUF;
1132 else
1133 flags &= ~BA_CLRBUF;
1134 error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
1135 ucred, flags, &bp);
1136 if (error != 0)
1137 break;
1138 /*
1139 * If the buffer is not valid we have to clear out any
1140 * garbage data from the pages instantiated for the buffer.
1141 * If we do not, a failed uiomove() during a write can leave
1142 * the prior contents of the pages exposed to a userland
1143 * mmap(). XXX deal with uiomove() errors a better way.
1144 */
1145 if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
1146 vfs_bio_clrbuf(bp);
1147 if (ioflag & IO_DIRECT)
1148 bp->b_flags |= B_DIRECT;
1149 if (ioflag & IO_NOWDRAIN)
1150 bp->b_flags |= B_NOWDRAIN;
1151
1152 if (uio->uio_offset + xfersize > dp->di_extsize)
1153 dp->di_extsize = uio->uio_offset + xfersize;
1154
1155 size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
1156 if (size < xfersize)
1157 xfersize = size;
1158
1159 error =
1160 uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
1161 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
1162 (LIST_FIRST(&bp->b_dep) == NULL)) {
1163 bp->b_flags |= B_RELBUF;
1164 }
1165
1166 /*
1167 * If IO_SYNC each buffer is written synchronously. Otherwise
1168 * if we have a severe page deficiency write the buffer
1169 * asynchronously. Otherwise try to cluster, and if that
1170 * doesn't do it then either do an async write (if O_DIRECT),
1171 * or a delayed write (if not).
1172 */
1173 if (ioflag & IO_SYNC) {
1174 (void)bwrite(bp);
1175 } else if (vm_page_count_severe() ||
1176 buf_dirty_count_severe() ||
1177 xfersize + blkoffset == fs->fs_bsize ||
1178 (ioflag & (IO_ASYNC | IO_DIRECT)))
1179 bawrite(bp);
1180 else
1181 bdwrite(bp);
1182 if (error || xfersize == 0)
1183 break;
1184 ip->i_flag |= IN_CHANGE | IN_UPDATE;
1185 }
1186 /*
1187 * If we successfully wrote any data, and we are not the superuser
1188 * we clear the setuid and setgid bits as a precaution against
1189 * tampering.
1190 */
1191 if (resid > uio->uio_resid && ucred &&
1192 suser_cred(ucred, PRISON_ROOT)) {
1193 ip->i_mode &= ~(ISUID | ISGID);
1194 dp->di_mode = ip->i_mode;
1195 }
1196 if (error) {
1197 if (ioflag & IO_UNIT) {
1198 (void)UFS_TRUNCATE(vp, osize,
1199 IO_EXT | (ioflag&IO_SYNC), ucred, uio->uio_td);
1200 uio->uio_offset -= resid - uio->uio_resid;
1201 uio->uio_resid = resid;
1202 }
1203 } else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
1204 error = UFS_UPDATE(vp, 1);
1205 return (error);
1206}
1207
1208
1209/*
 1210 * Vnode operation to retrieve a named extended attribute.
1211 *
1212 * Locate a particular EA (nspace:name) in the area (ptr:length), and return
1213 * the length of the EA, and possibly the pointer to the entry and to the data.
1214 */
1215static int
1216ffs_findextattr(u_char *ptr, uint length, int nspace, const char *name, u_char **eap, u_char **eac)
1217{
1218 u_char *p, *pe, *pn, *p0;
1219 int eapad1, eapad2, ealength, ealen, nlen;
1220 uint32_t ul;
1221
1222 pe = ptr + length;
1223 nlen = strlen(name);
1224
1225 for (p = ptr; p < pe; p = pn) {
1226 p0 = p;
1227 bcopy(p, &ul, sizeof(ul));
1228 pn = p + ul;
1229 /* make sure this entry is complete */
1230 if (pn > pe)
1231 break;
1232 p += sizeof(uint32_t);
1233 if (*p != nspace)
1234 continue;
1235 p++;
1236 eapad2 = *p++;
1237 if (*p != nlen)
1238 continue;
1239 p++;
1240 if (bcmp(p, name, nlen))
1241 continue;
1242 ealength = sizeof(uint32_t) + 3 + nlen;
1243 eapad1 = 8 - (ealength % 8);
1244 if (eapad1 == 8)
1245 eapad1 = 0;
1246 ealength += eapad1;
1247 ealen = ul - ealength - eapad2;
1248 p += nlen + eapad1;
1249 if (eap != NULL)
1250 *eap = p0;
1251 if (eac != NULL)
1252 *eac = p;
1253 return (ealen);
1254 }
1255 return(-1);
1256}
1257
1258static int
1259ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra)
1260{
1261 struct inode *ip;
1262 struct fs *fs;
1263 struct ufs2_dinode *dp;
1264 struct uio luio;
1265 struct iovec liovec;
1266 int easize, error;
1267 u_char *eae;
1268
1269 ip = VTOI(vp);
1270 fs = ip->i_fs;
1271 dp = ip->i_din2;
1272 easize = dp->di_extsize;
1273
1274 eae = malloc(easize + extra, M_TEMP, M_WAITOK);
1275
1276 liovec.iov_base = eae;
1277 liovec.iov_len = easize;
1278 luio.uio_iov = &liovec;
1279 luio.uio_iovcnt = 1;
1280 luio.uio_offset = 0;
1281 luio.uio_resid = easize;
1282 luio.uio_segflg = UIO_SYSSPACE;
1283 luio.uio_rw = UIO_READ;
1284 luio.uio_td = td;
1285
1286 error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
1287 if (error) {
1288 free(eae, M_TEMP);
1289 return(error);
1290 }
1291 *p = eae;
1292 return (0);
1293}
1294
1295static int
1296ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
1297{
1298 struct inode *ip;
1299 struct fs *fs;
1300 struct ufs2_dinode *dp;
1301 int error;
1302
1303 ip = VTOI(vp);
1304 fs = ip->i_fs;
1305
1306 if (ip->i_ea_area != NULL)
1307 return (EBUSY);
1308 dp = ip->i_din2;
1309 error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0);
1310 if (error)
1311 return (error);
1312 ip->i_ea_len = dp->di_extsize;
1313 ip->i_ea_error = 0;
1314 return (0);
1315}
1316
1317/*
1318 * Vnode extattr transaction commit/abort
1319 */
1320static int
1321ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td)
1322{
1323 struct inode *ip;
1324 struct fs *fs;
1325 struct uio luio;
1326 struct iovec liovec;
1327 int error;
1328 struct ufs2_dinode *dp;
1329
1330 ip = VTOI(vp);
1331 fs = ip->i_fs;
1332 if (ip->i_ea_area == NULL)
1333 return (EINVAL);
1334 dp = ip->i_din2;
1335 error = ip->i_ea_error;
1336 if (commit && error == 0) {
1337 if (cred == NOCRED)
1338 cred = vp->v_mount->mnt_cred;
1339 liovec.iov_base = ip->i_ea_area;
1340 liovec.iov_len = ip->i_ea_len;
1341 luio.uio_iov = &liovec;
1342 luio.uio_iovcnt = 1;
1343 luio.uio_offset = 0;
1344 luio.uio_resid = ip->i_ea_len;
1345 luio.uio_segflg = UIO_SYSSPACE;
1346 luio.uio_rw = UIO_WRITE;
1347 luio.uio_td = td;
1348 /* XXX: I'm not happy about truncating to zero size */
1349 if (ip->i_ea_len < dp->di_extsize)
1350 error = ffs_truncate(vp, 0, IO_EXT, cred, td);
1351 error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
1352 }
1353 free(ip->i_ea_area, M_TEMP);
1354 ip->i_ea_area = NULL;
1355 ip->i_ea_len = 0;
1356 ip->i_ea_error = 0;
1357 return (error);
1358}
1359
1360/*
1361 * Vnode extattr strategy routine for special devices and fifos.
1362 *
1363 * We need to check for a read or write of the external attributes.
1364 * Otherwise we just fall through and do the usual thing.
1365 */
1366static int
1367ffsext_strategy(struct vop_strategy_args *ap)
1368/*
1369struct vop_strategy_args {
1370 struct vnodeop_desc *a_desc;
1371 struct vnode *a_vp;
1372 struct buf *a_bp;
1373};
1374*/
1375{
1376 struct vnode *vp;
1377 daddr_t lbn;
1378
1379 vp = ap->a_vp;
1380 lbn = ap->a_bp->b_lblkno;
1381 if (VTOI(vp)->i_fs->fs_magic == FS_UFS2_MAGIC &&
1382 lbn < 0 && lbn >= -NXADDR)
1383 return (ufs_vnoperate((struct vop_generic_args *)ap));
1384 if (vp->v_type == VFIFO)
1385 return (ufs_vnoperatefifo((struct vop_generic_args *)ap));
1386 return (ufs_vnoperatespec((struct vop_generic_args *)ap));
1387}
1388
1389/*
1390 * Vnode extattr transaction commit/abort
1391 */
1392static int
1393ffs_openextattr(struct vop_openextattr_args *ap)
1394/*
1395struct vop_openextattr_args {
1396 struct vnodeop_desc *a_desc;
1397 struct vnode *a_vp;
1398 IN struct ucred *a_cred;
1399 IN struct thread *a_td;
1400};
1401*/
1402{
1403 struct inode *ip;
1404 struct fs *fs;
1405
1406 ip = VTOI(ap->a_vp);
1407 fs = ip->i_fs;
1408 if (fs->fs_magic == FS_UFS1_MAGIC)
1409 return (ufs_vnoperate((struct vop_generic_args *)ap));
1410 return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
1411}
1412
1413
1414/*
1415 * Vnode extattr transaction commit/abort
1416 */
1417static int
1418ffs_closeextattr(struct vop_closeextattr_args *ap)
1419/*
1420struct vop_closeextattr_args {
1421 struct vnodeop_desc *a_desc;
1422 struct vnode *a_vp;
1423 int a_commit;
1424 IN struct ucred *a_cred;
1425 IN struct thread *a_td;
1426};
1427*/
1428{
1429 struct inode *ip;
1430 struct fs *fs;
1431
1432 ip = VTOI(ap->a_vp);
1433 fs = ip->i_fs;
1434 if (fs->fs_magic == FS_UFS1_MAGIC)
1435 return (ufs_vnoperate((struct vop_generic_args *)ap));
1436 return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
1437}
1438
1439
1440
1441/*
1442 * Vnode operation to retrieve a named extended attribute.
1443 */
1444static int
1445ffs_getextattr(struct vop_getextattr_args *ap)
1446/*
1447vop_getextattr {
1448 IN struct vnode *a_vp;
1449 IN int a_attrnamespace;
1450 IN const char *a_name;
1451 INOUT struct uio *a_uio;
1452 OUT size_t *a_size;
1453 IN struct ucred *a_cred;
1454 IN struct thread *a_td;
1455};
1456*/
1457{
1458 struct inode *ip;
1459 struct fs *fs;
1460 u_char *eae, *p, *pe, *pn;
1461 struct ufs2_dinode *dp;
1462 unsigned easize;
1463 uint32_t ul;
1464 int error, ealen, stand_alone;
1465
1466 ip = VTOI(ap->a_vp);
1467 fs = ip->i_fs;
1468
1469 if (fs->fs_magic == FS_UFS1_MAGIC)
1470 return (ufs_vnoperate((struct vop_generic_args *)ap));
1471
1472 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1473 ap->a_cred, ap->a_td, IREAD);
1474 if (error)
1475 return (error);
1476
1477 if (ip->i_ea_area == NULL) {
1478 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1479 if (error)
1480 return (error);
1481 stand_alone = 1;
1482 } else {
1483 stand_alone = 0;
1484 }
1485 dp = ip->i_din2;
1486 eae = ip->i_ea_area;
1487 easize = ip->i_ea_len;
1488 if (strlen(ap->a_name) > 0) {
1489 ealen = ffs_findextattr(eae, easize,
1490 ap->a_attrnamespace, ap->a_name, NULL, &p);
1491 if (ealen >= 0) {
1492 error = 0;
1493 if (ap->a_size != NULL)
1494 *ap->a_size = ealen;
1495 else if (ap->a_uio != NULL)
1496 error = uiomove(p, ealen, ap->a_uio);
1497 } else {
1498 error = ENOATTR;
1499 }
1500 } else {
1501 error = 0;
1502 if (ap->a_size != NULL)
1503 *ap->a_size = 0;
1504 pe = eae + easize;
1505 for(p = eae; error == 0 && p < pe; p = pn) {
1506 bcopy(p, &ul, sizeof(ul));
1507 pn = p + ul;
1508 if (pn > pe)
1509 break;
1510 p += sizeof(ul);
1511 if (*p++ != ap->a_attrnamespace)
1512 continue;
1513 p++; /* pad2 */
1514 ealen = *p;
1515 if (ap->a_size != NULL) {
1516 *ap->a_size += ealen + 1;
1517 } else if (ap->a_uio != NULL) {
1518 error = uiomove(p, ealen + 1, ap->a_uio);
1519 }
1520 }
1521 }
1522 if (stand_alone)
1523 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1524 return(error);
1525}
1526
1527/*
1528 * Vnode operation to set a named attribute.
1529 */
1530static int
1531ffs_setextattr(struct vop_setextattr_args *ap)
1532/*
1533vop_setextattr {
1534 IN struct vnode *a_vp;
1535 IN int a_attrnamespace;
1536 IN const char *a_name;
1537 INOUT struct uio *a_uio;
1538 IN struct ucred *a_cred;
1539 IN struct thread *a_td;
1540};
1541*/
1542{
1543 struct inode *ip;
1544 struct fs *fs;
1545 uint32_t ealength, ul;
1546 int ealen, olen, eacont, eapad1, eapad2, error, i, easize;
1547 u_char *eae, *p;
1548 struct ufs2_dinode *dp;
1549 struct ucred *cred;
1550 int stand_alone;
1551
1552 ip = VTOI(ap->a_vp);
1553 fs = ip->i_fs;
1554
1555 if (fs->fs_magic == FS_UFS1_MAGIC)
1556 return (ufs_vnoperate((struct vop_generic_args *)ap));
1557
1558 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1559 ap->a_cred, ap->a_td, IWRITE);
1560 if (error) {
1561 if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1562 ip->i_ea_error = error;
1563 return (error);
1564 }
1565
1566 if (ap->a_cred != NOCRED)
1567 cred = ap->a_cred;
1568 else
1569 cred = ap->a_vp->v_mount->mnt_cred;
1570
1571 dp = ip->i_din2;
1572
1573 if (ip->i_ea_area == NULL) {
1574 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1575 if (error)
1576 return (error);
1577 stand_alone = 1;
1578 } else {
1579 stand_alone = 0;
1580 }
1581
1582 /* Calculate the length of the EA entry */
1583 if (ap->a_uio == NULL) {
1584 /* delete */
1585 ealength = eapad1 = ealen = eapad2 = eacont = 0;
1586 } else {
1587 ealen = ap->a_uio->uio_resid;
1588 ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
1589 eapad1 = 8 - (ealength % 8);
1590 if (eapad1 == 8)
1591 eapad1 = 0;
1592 eacont = ealength + eapad1;
1593 eapad2 = 8 - (ealen % 8);
1594 if (eapad2 == 8)
1595 eapad2 = 0;
1596 ealength += eapad1 + ealen + eapad2;
1597 }
1598
1599 eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
1600 bcopy(ip->i_ea_area, eae, ip->i_ea_len);
1601 easize = ip->i_ea_len;
1602
1603 olen = ffs_findextattr(eae, easize,
1604 ap->a_attrnamespace, ap->a_name, &p, NULL);
1605 if (olen == -1 && ealength == 0) {
1606 /* delete but nonexistent */
1607 free(eae, M_TEMP);
1608 if (stand_alone)
1609 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1610 return(ENOATTR);
1611 }
1612 if (olen == -1) {
1613 /* new, append at end */
1614 p = eae + easize;
1615 easize += ealength;
1616 } else {
1617 bcopy(p, &ul, sizeof ul);
1618 i = p - eae + ul;
1619 if (ul != ealength) {
1620 bcopy(p + ul, p + ealength, easize - i);
1621 easize += (ealength - ul);
1622 }
1623 }
1624 if (easize > NXADDR * fs->fs_bsize) {
1625 free(eae, M_TEMP);
1626 if (stand_alone)
1627 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1628 else if (ip->i_ea_error == 0)
1629 ip->i_ea_error = ENOSPC;
1630 return(ENOSPC);
1631 }
1632 if (ealength != 0) {
1633 bcopy(&ealength, p, sizeof(ealength));
1634 p += sizeof(ealength);
1635 *p++ = ap->a_attrnamespace;
1636 *p++ = eapad2;
1637 *p++ = strlen(ap->a_name);
1638 strcpy(p, ap->a_name);
1639 p += strlen(ap->a_name);
1640 bzero(p, eapad1);
1641 p += eapad1;
1642 error = uiomove(p, ealen, ap->a_uio);
1643 if (error) {
1644 free(eae, M_TEMP);
1645 if (stand_alone)
1646 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1647 else if (ip->i_ea_error == 0)
1648 ip->i_ea_error = error;
1649 return(error);
1650 }
1651 p += ealen;
1652 bzero(p, eapad2);
1653 }
1654 p = ip->i_ea_area;
1655 ip->i_ea_area = eae;
1656 ip->i_ea_len = easize;
1657 free(p, M_TEMP);
1658 if (stand_alone)
1659 error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
1660 return(error);
1661}
366 GIANT_REQUIRED;
367
368 seqcount = ap->a_ioflag >> 16;
369 ip = VTOI(vp);
370 mode = ip->i_mode;
371
372#ifdef DIAGNOSTIC
373 if (uio->uio_rw != UIO_READ)
374 panic("ffs_read: mode");
375
376 if (vp->v_type == VLNK) {
377 if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
378 panic("ffs_read: short symlink");
379 } else if (vp->v_type != VREG && vp->v_type != VDIR)
380 panic("ffs_read: type %d", vp->v_type);
381#endif
382 fs = ip->i_fs;
383 if ((u_int64_t)uio->uio_offset > fs->fs_maxfilesize)
384 return (EFBIG);
385
386 orig_resid = uio->uio_resid;
387 if (orig_resid <= 0)
388 return (0);
389
390 object = vp->v_object;
391
392 bytesinfile = ip->i_size - uio->uio_offset;
393 if (bytesinfile <= 0) {
394 if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
395 ip->i_flag |= IN_ACCESS;
396 return 0;
397 }
398
399 if (object) {
400 vm_object_reference(object);
401 }
402
403 /*
404	 * Loop over the request, transferring at most one filesystem
405	 * block's worth of data per iteration.
406 */
407 for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
408 if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
409 break;
410
411 lbn = lblkno(fs, uio->uio_offset);
412 nextlbn = lbn + 1;
413
414 /*
415	 * Determine the size of the buffer. The buffer holding the
416	 * last block of the file is rounded up to the size of the
417	 * block type it occupies (a fragment or a full block, as
418	 * appropriate).
419 */
420 size = blksize(fs, ip, lbn);
421 blkoffset = blkoff(fs, uio->uio_offset);
422
423 /*
424 * The amount we want to transfer in this iteration is
425 * one FS block less the amount of the data before
426 * our startpoint (duh!)
427 */
428 xfersize = fs->fs_bsize - blkoffset;
429
430 /*
431 * But if we actually want less than the block,
432 * or the file doesn't have a whole block more of data,
433 * then use the lesser number.
434 */
435 if (uio->uio_resid < xfersize)
436 xfersize = uio->uio_resid;
437 if (bytesinfile < xfersize)
438 xfersize = bytesinfile;
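		/*
		 * Worked example of the sizing above (illustrative; assumes
		 * fs_bsize == 16384): a read of 8192 bytes at offset 20000
		 * into a 40000-byte file gives lbn = 1, blkoffset = 3616 and
		 * an initial xfersize of 16384 - 3616 = 12768, which is then
		 * clamped to the 8192 bytes the caller asked for
		 * (bytesinfile = 20000 does not constrain it further), so
		 * the whole request is satisfied in a single pass of the
		 * loop.
		 */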
439
440 if (lblktosize(fs, nextlbn) >= ip->i_size) {
441 /*
442 * Don't do readahead if this is the end of the file.
443 */
444 error = bread(vp, lbn, size, NOCRED, &bp);
445 } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
446 /*
447 * Otherwise if we are allowed to cluster,
448 * grab as much as we can.
449 *
450 * XXX This may not be a win if we are not
451 * doing sequential access.
452 */
453 error = cluster_read(vp, ip->i_size, lbn,
454 size, NOCRED, uio->uio_resid, seqcount, &bp);
455 } else if (seqcount > 1) {
456 /*
457 * If we are NOT allowed to cluster, then
458 * if we appear to be acting sequentially,
459 * fire off a request for a readahead
460 * as well as a read. Note that the 4th and 5th
461 * arguments point to arrays of the size specified in
462 * the 6th argument.
463 */
464 int nextsize = blksize(fs, ip, nextlbn);
465 error = breadn(vp, lbn,
466 size, &nextlbn, &nextsize, 1, NOCRED, &bp);
467 } else {
468 /*
469 * Failing all of the above, just read what the
470 * user asked for. Interestingly, the same as
471 * the first option above.
472 */
473 error = bread(vp, lbn, size, NOCRED, &bp);
474 }
475 if (error) {
476 brelse(bp);
477 bp = NULL;
478 break;
479 }
480
481 /*
482 * If IO_DIRECT then set B_DIRECT for the buffer. This
483 * will cause us to attempt to release the buffer later on
484 * and will cause the buffer cache to attempt to free the
485 * underlying pages.
486 */
487 if (ioflag & IO_DIRECT)
488 bp->b_flags |= B_DIRECT;
489
490 /*
491 * We should only get non-zero b_resid when an I/O error
492 * has occurred, which should cause us to break above.
493 * However, if the short read did not cause an error,
494 * then we want to ensure that we do not uiomove bad
495 * or uninitialized data.
496 */
497 size -= bp->b_resid;
498 if (size < xfersize) {
499 if (size == 0)
500 break;
501 xfersize = size;
502 }
503
504 {
505 /*
506		 * Copy the data out of the buffer and into the caller's uio.
507 */
508 error =
509 uiomove((char *)bp->b_data + blkoffset,
510 (int)xfersize, uio);
511 }
512
513 if (error)
514 break;
515
516 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
517 (LIST_FIRST(&bp->b_dep) == NULL)) {
518 /*
519 * If there are no dependencies, and it's VMIO,
520 * then we don't need the buf, mark it available
521 * for freeing. The VM has the data.
522 */
523 bp->b_flags |= B_RELBUF;
524 brelse(bp);
525 } else {
526 /*
527 * Otherwise let whoever
528 * made the request take care of
529 * freeing it. We just queue
530 * it onto another list.
531 */
532 bqrelse(bp);
533 }
534 }
535
536 /*
537 * This can only happen in the case of an error
538 * because the loop above resets bp to NULL on each iteration
539	 * and on normal completion has not set a new value into it,
540	 * so it must have come from a 'break' statement.
541 */
542 if (bp != NULL) {
543 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
544 (LIST_FIRST(&bp->b_dep) == NULL)) {
545 bp->b_flags |= B_RELBUF;
546 brelse(bp);
547 } else {
548 bqrelse(bp);
549 }
550 }
551
552 if (object) {
553 vm_object_vndeallocate(object);
554 }
555 if ((error == 0 || uio->uio_resid != orig_resid) &&
556 (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
557 ip->i_flag |= IN_ACCESS;
558 return (error);
559}
560
561/*
562 * Vnode op for writing.
563 */
564static int
565ffs_write(ap)
566 struct vop_write_args /* {
567 struct vnode *a_vp;
568 struct uio *a_uio;
569 int a_ioflag;
570 struct ucred *a_cred;
571 } */ *ap;
572{
573 struct vnode *vp;
574 struct uio *uio;
575 struct inode *ip;
576 struct fs *fs;
577 struct buf *bp;
578 struct thread *td;
579 ufs_lbn_t lbn;
580 off_t osize;
581 int seqcount;
582 int blkoffset, error, extended, flags, ioflag, resid, size, xfersize;
583 vm_object_t object;
584
585 vp = ap->a_vp;
586 uio = ap->a_uio;
587 ioflag = ap->a_ioflag;
588 if (ap->a_ioflag & IO_EXT)
589#ifdef notyet
590 return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
591#else
592 panic("ffs_read+IO_EXT");
593#endif
594
595 GIANT_REQUIRED;
596
597 extended = 0;
598 seqcount = ap->a_ioflag >> 16;
599 ip = VTOI(vp);
600
601 object = vp->v_object;
602 if (object) {
603 vm_object_reference(object);
604 }
605
606#ifdef DIAGNOSTIC
607 if (uio->uio_rw != UIO_WRITE)
608 panic("ffswrite: mode");
609#endif
610
611 switch (vp->v_type) {
612 case VREG:
613 if (ioflag & IO_APPEND)
614 uio->uio_offset = ip->i_size;
615 if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) {
616 if (object) {
617 vm_object_vndeallocate(object);
618 }
619 return (EPERM);
620 }
621 /* FALLTHROUGH */
622 case VLNK:
623 break;
624 case VDIR:
625 panic("ffswrite: dir write");
626 break;
627 default:
628 panic("ffswrite: type %p %d (%d,%d)", vp, (int)vp->v_type,
629 (int)uio->uio_offset,
630 (int)uio->uio_resid
631 );
632 }
633
634 fs = ip->i_fs;
635 if (uio->uio_offset < 0 ||
636 (u_int64_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) {
637 if (object) {
638 vm_object_vndeallocate(object);
639 }
640 return (EFBIG);
641 }
642 /*
643 * Maybe this should be above the vnode op call, but so long as
644 * file servers have no limits, I don't think it matters.
645 */
646 td = uio->uio_td;
647 if (vp->v_type == VREG && td &&
648 uio->uio_offset + uio->uio_resid >
649 td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
650 PROC_LOCK(td->td_proc);
651 psignal(td->td_proc, SIGXFSZ);
652 PROC_UNLOCK(td->td_proc);
653 if (object) {
654 vm_object_vndeallocate(object);
655 }
656 return (EFBIG);
657 }
658
659 resid = uio->uio_resid;
660 osize = ip->i_size;
661 if (seqcount > BA_SEQMAX)
662 flags = BA_SEQMAX << BA_SEQSHIFT;
663 else
664 flags = seqcount << BA_SEQSHIFT;
665 if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
666 flags |= IO_SYNC;
667
668 for (error = 0; uio->uio_resid > 0;) {
669 lbn = lblkno(fs, uio->uio_offset);
670 blkoffset = blkoff(fs, uio->uio_offset);
671 xfersize = fs->fs_bsize - blkoffset;
672 if (uio->uio_resid < xfersize)
673 xfersize = uio->uio_resid;
674
675 if (uio->uio_offset + xfersize > ip->i_size)
676 vnode_pager_setsize(vp, uio->uio_offset + xfersize);
677
678 /*
679 * We must perform a read-before-write if the transfer size
680 * does not cover the entire buffer.
681 */
682 if (fs->fs_bsize > xfersize)
683 flags |= BA_CLRBUF;
684 else
685 flags &= ~BA_CLRBUF;
686/* XXX is uio->uio_offset the right thing here? */
687 error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
688 ap->a_cred, flags, &bp);
689 if (error != 0)
690 break;
691 /*
692 * If the buffer is not valid we have to clear out any
693 * garbage data from the pages instantiated for the buffer.
694 * If we do not, a failed uiomove() during a write can leave
695 * the prior contents of the pages exposed to a userland
696 * mmap(). XXX deal with uiomove() errors a better way.
697 */
698 if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
699 vfs_bio_clrbuf(bp);
700 if (ioflag & IO_DIRECT)
701 bp->b_flags |= B_DIRECT;
702 if (ioflag & IO_NOWDRAIN)
703 bp->b_flags |= B_NOWDRAIN;
704
705 if (uio->uio_offset + xfersize > ip->i_size) {
706 ip->i_size = uio->uio_offset + xfersize;
707 DIP(ip, i_size) = ip->i_size;
708 extended = 1;
709 }
710
711 size = blksize(fs, ip, lbn) - bp->b_resid;
712 if (size < xfersize)
713 xfersize = size;
714
715 error =
716 uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
717 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
718 (LIST_FIRST(&bp->b_dep) == NULL)) {
719 bp->b_flags |= B_RELBUF;
720 }
721
722 /*
723 * If IO_SYNC each buffer is written synchronously. Otherwise
724 * if we have a severe page deficiency write the buffer
725 * asynchronously. Otherwise try to cluster, and if that
726 * doesn't do it then either do an async write (if O_DIRECT),
727 * or a delayed write (if not).
728 */
729 if (ioflag & IO_SYNC) {
730 (void)bwrite(bp);
731 } else if (vm_page_count_severe() ||
732 buf_dirty_count_severe() ||
733 (ioflag & IO_ASYNC)) {
734 bp->b_flags |= B_CLUSTEROK;
735 bawrite(bp);
736 } else if (xfersize + blkoffset == fs->fs_bsize) {
737 if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
738 bp->b_flags |= B_CLUSTEROK;
739 cluster_write(bp, ip->i_size, seqcount);
740 } else {
741 bawrite(bp);
742 }
743 } else if (ioflag & IO_DIRECT) {
744 bp->b_flags |= B_CLUSTEROK;
745 bawrite(bp);
746 } else {
747 bp->b_flags |= B_CLUSTEROK;
748 bdwrite(bp);
749 }
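		/*
		 * Example of the selection above (illustrative, 16384-byte
		 * blocks): an 8192-byte write that only partially fills its
		 * block, with neither IO_SYNC nor IO_DIRECT set and no
		 * memory shortage, falls through to bdwrite() and is flushed
		 * later.  Once a write fills its block exactly (xfersize +
		 * blkoffset == fs_bsize) and MNT_NOCLUSTERW is clear,
		 * cluster_write() may coalesce it with neighbouring blocks
		 * into one larger disk transfer.
		 */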
750 if (error || xfersize == 0)
751 break;
752 ip->i_flag |= IN_CHANGE | IN_UPDATE;
753 }
754 /*
755	 * If we successfully wrote any data and we are not the superuser,
756	 * we clear the setuid and setgid bits as a precaution against
757 * tampering.
758 */
759 if (resid > uio->uio_resid && ap->a_cred &&
760 suser_cred(ap->a_cred, PRISON_ROOT)) {
761 ip->i_mode &= ~(ISUID | ISGID);
762 DIP(ip, i_mode) = ip->i_mode;
763 }
764 if (resid > uio->uio_resid)
765 VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0));
766 if (error) {
767 if (ioflag & IO_UNIT) {
768 (void)UFS_TRUNCATE(vp, osize,
769 IO_NORMAL | (ioflag & IO_SYNC),
770 ap->a_cred, uio->uio_td);
771 uio->uio_offset -= resid - uio->uio_resid;
772 uio->uio_resid = resid;
773 }
774 } else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
775 error = UFS_UPDATE(vp, 1);
776
777 if (object) {
778 vm_object_vndeallocate(object);
779 }
780
781 return (error);
782}
783
784/*
785 * get page routine
786 */
787static int
788ffs_getpages(ap)
789 struct vop_getpages_args *ap;
790{
791 off_t foff, physoffset;
792 int i, size, bsize;
793 struct vnode *dp, *vp;
794 vm_object_t obj;
795 vm_pindex_t pindex, firstindex;
796 vm_page_t mreq;
797 int bbackwards, bforwards;
798 int pbackwards, pforwards;
799 int firstpage;
800 ufs2_daddr_t reqblkno, reqlblkno;
801 int poff;
802 int pcount;
803 int rtval;
804 int pagesperblock;
805
806 GIANT_REQUIRED;
807
808 pcount = round_page(ap->a_count) / PAGE_SIZE;
809 mreq = ap->a_m[ap->a_reqpage];
810 firstindex = ap->a_m[0]->pindex;
811
812 /*
813 * if ANY DEV_BSIZE blocks are valid on a large filesystem block,
814 * then the entire page is valid. Since the page may be mapped,
815 * user programs might reference data beyond the actual end of file
816	 * occurring within the page. We have to zero that data.
817 */
818 if (mreq->valid) {
819 if (mreq->valid != VM_PAGE_BITS_ALL)
820 vm_page_zero_invalid(mreq, TRUE);
821 vm_page_lock_queues();
822 for (i = 0; i < pcount; i++) {
823 if (i != ap->a_reqpage) {
824 vm_page_free(ap->a_m[i]);
825 }
826 }
827 vm_page_unlock_queues();
828 return VM_PAGER_OK;
829 }
830
831 vp = ap->a_vp;
832 obj = vp->v_object;
833 bsize = vp->v_mount->mnt_stat.f_iosize;
834 pindex = mreq->pindex;
835 foff = IDX_TO_OFF(pindex) /* + ap->a_offset should be zero */;
836
837 if (bsize < PAGE_SIZE)
838 return vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
839 ap->a_count,
840 ap->a_reqpage);
841
842 /*
843 * foff is the file offset of the required page
844 * reqlblkno is the logical block that contains the page
845 * poff is the index of the page into the logical block
846 */
847 reqlblkno = foff / bsize;
848 poff = (foff % bsize) / PAGE_SIZE;
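	/*
	 * Illustrative numbers (assuming PAGE_SIZE == 4096 and a 16384-byte
	 * block size): the page at file offset 36864 gives reqlblkno =
	 * 36864 / 16384 = 2 and poff = (36864 % 16384) / 4096 = 1, i.e. it
	 * is the second page of logical block 2, and pagesperblock below
	 * works out to 4.
	 */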
849
850 dp = VTOI(vp)->i_devvp;
851 if (ufs_bmaparray(vp, reqlblkno, &reqblkno, 0, &bforwards, &bbackwards)
852 || (reqblkno == -1)) {
853 vm_page_lock_queues();
854 for(i = 0; i < pcount; i++) {
855 if (i != ap->a_reqpage)
856 vm_page_free(ap->a_m[i]);
857 }
858 vm_page_unlock_queues();
859 if (reqblkno == -1) {
860 if ((mreq->flags & PG_ZERO) == 0)
861 pmap_zero_page(mreq);
862 vm_page_undirty(mreq);
863 mreq->valid = VM_PAGE_BITS_ALL;
864 return VM_PAGER_OK;
865 } else {
866 return VM_PAGER_ERROR;
867 }
868 }
869
870 physoffset = (off_t)reqblkno * DEV_BSIZE + poff * PAGE_SIZE;
871 pagesperblock = bsize / PAGE_SIZE;
872 /*
873 * find the first page that is contiguous...
874 * note that pbackwards is the number of pages that are contiguous
875 * backwards.
876 */
877 firstpage = 0;
878 if (ap->a_count) {
879 pbackwards = poff + bbackwards * pagesperblock;
880 if (ap->a_reqpage > pbackwards) {
881 firstpage = ap->a_reqpage - pbackwards;
882 vm_page_lock_queues();
883 for(i=0;i<firstpage;i++)
884 vm_page_free(ap->a_m[i]);
885 vm_page_unlock_queues();
886 }
887
888 /*
889 * pforwards is the number of pages that are contiguous
890 * after the current page.
891 */
892 pforwards = (pagesperblock - (poff + 1)) +
893 bforwards * pagesperblock;
894 if (pforwards < (pcount - (ap->a_reqpage + 1))) {
895 vm_page_lock_queues();
896 for( i = ap->a_reqpage + pforwards + 1; i < pcount; i++)
897 vm_page_free(ap->a_m[i]);
898 vm_page_unlock_queues();
899 pcount = ap->a_reqpage + pforwards + 1;
900 }
901
902 /*
903 * number of pages for I/O corrected for the non-contig pages at
904 * the beginning of the array.
905 */
906 pcount -= firstpage;
907 }
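	/*
	 * Illustrative run of the trimming above (assuming pagesperblock ==
	 * 4): with poff = 1, bbackwards = 0, bforwards = 1, a_reqpage = 3
	 * and pcount = 8, pbackwards = 1, so firstpage = 2 and the first
	 * two pages are freed; pforwards = (4 - 2) + 4 = 6 already covers
	 * the trailing pages, so nothing is trimmed at the end and the I/O
	 * is issued for the six pages starting at index 2.
	 */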
908
909 /*
910 * calculate the size of the transfer
911 */
912
913 size = pcount * PAGE_SIZE;
914
915 if ((IDX_TO_OFF(ap->a_m[firstpage]->pindex) + size) >
916 obj->un_pager.vnp.vnp_size)
917 size = obj->un_pager.vnp.vnp_size -
918 IDX_TO_OFF(ap->a_m[firstpage]->pindex);
919
920 physoffset -= foff;
921 rtval = VOP_GETPAGES(dp, &ap->a_m[firstpage], size,
922 (ap->a_reqpage - firstpage), physoffset);
923
924 return (rtval);
925}
926
927/*
928 * Extended attribute area reading.
929 */
930static int
931ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
932{
933 struct inode *ip;
934 struct ufs2_dinode *dp;
935 struct fs *fs;
936 struct buf *bp;
937 ufs_lbn_t lbn, nextlbn;
938 off_t bytesinfile;
939 long size, xfersize, blkoffset;
940 int error, orig_resid;
941 mode_t mode;
942
943 GIANT_REQUIRED;
944
945 ip = VTOI(vp);
946 fs = ip->i_fs;
947 dp = ip->i_din2;
948 mode = ip->i_mode;
949
950#ifdef DIAGNOSTIC
951 if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
952 panic("ffs_extread: mode");
953
954#endif
955 orig_resid = uio->uio_resid;
956 if (orig_resid <= 0)
957 return (0);
958
959 bytesinfile = dp->di_extsize - uio->uio_offset;
960 if (bytesinfile <= 0) {
961 if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
962 ip->i_flag |= IN_ACCESS;
963 return 0;
964 }
965
966 for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
967 if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
968 break;
969
970 lbn = lblkno(fs, uio->uio_offset);
971 nextlbn = lbn + 1;
972
973 /*
974	 * Determine the size of the buffer. The buffer holding the
975	 * last block of the file is rounded up to the size of the
976	 * block type it occupies (a fragment or a full block, as
977	 * appropriate).
978 */
979 size = sblksize(fs, dp->di_extsize, lbn);
980 blkoffset = blkoff(fs, uio->uio_offset);
981
982 /*
983 * The amount we want to transfer in this iteration is
984 * one FS block less the amount of the data before
985 * our startpoint (duh!)
986 */
987 xfersize = fs->fs_bsize - blkoffset;
988
989 /*
990 * But if we actually want less than the block,
991 * or the file doesn't have a whole block more of data,
992 * then use the lesser number.
993 */
994 if (uio->uio_resid < xfersize)
995 xfersize = uio->uio_resid;
996 if (bytesinfile < xfersize)
997 xfersize = bytesinfile;
998
999 if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
1000 /*
1001	 * Don't do readahead if this is the end of the extattr area.
1002 */
1003 error = bread(vp, -1 - lbn, size, NOCRED, &bp);
1004 } else {
1005 /*
1006 * If we have a second block, then
1007 * fire off a request for a readahead
1008 * as well as a read. Note that the 4th and 5th
1009 * arguments point to arrays of the size specified in
1010 * the 6th argument.
1011 */
1012 int nextsize = sblksize(fs, dp->di_extsize, nextlbn);
1013
1014 nextlbn = -1 - nextlbn;
1015 error = breadn(vp, -1 - lbn,
1016 size, &nextlbn, &nextsize, 1, NOCRED, &bp);
1017 }
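		/*
		 * Note on the block numbers used above: the extended
		 * attribute area is addressed with negative logical block
		 * numbers, so extattr block 0 is read as logical block -1,
		 * block 1 as -2, and so on.  ffsext_strategy() below
		 * recognizes lbn values in [-NXADDR, 0) on special files and
		 * fifos and hands them to the normal UFS strategy path so
		 * the extattr blocks are still read and written correctly.
		 */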
1018 if (error) {
1019 brelse(bp);
1020 bp = NULL;
1021 break;
1022 }
1023
1024 /*
1025 * If IO_DIRECT then set B_DIRECT for the buffer. This
1026 * will cause us to attempt to release the buffer later on
1027 * and will cause the buffer cache to attempt to free the
1028 * underlying pages.
1029 */
1030 if (ioflag & IO_DIRECT)
1031 bp->b_flags |= B_DIRECT;
1032
1033 /*
1034 * We should only get non-zero b_resid when an I/O error
1035 * has occurred, which should cause us to break above.
1036 * However, if the short read did not cause an error,
1037 * then we want to ensure that we do not uiomove bad
1038 * or uninitialized data.
1039 */
1040 size -= bp->b_resid;
1041 if (size < xfersize) {
1042 if (size == 0)
1043 break;
1044 xfersize = size;
1045 }
1046
1047 error = uiomove((char *)bp->b_data + blkoffset,
1048 (int)xfersize, uio);
1049 if (error)
1050 break;
1051
1052 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
1053 (LIST_FIRST(&bp->b_dep) == NULL)) {
1054 /*
1055 * If there are no dependencies, and it's VMIO,
1056 * then we don't need the buf, mark it available
1057 * for freeing. The VM has the data.
1058 */
1059 bp->b_flags |= B_RELBUF;
1060 brelse(bp);
1061 } else {
1062 /*
1063 * Otherwise let whoever
1064 * made the request take care of
1065 * freeing it. We just queue
1066 * it onto another list.
1067 */
1068 bqrelse(bp);
1069 }
1070 }
1071
1072 /*
1073 * This can only happen in the case of an error
1074 * because the loop above resets bp to NULL on each iteration
1075	 * and on normal completion has not set a new value into it,
1076	 * so it must have come from a 'break' statement.
1077 */
1078 if (bp != NULL) {
1079 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
1080 (LIST_FIRST(&bp->b_dep) == NULL)) {
1081 bp->b_flags |= B_RELBUF;
1082 brelse(bp);
1083 } else {
1084 bqrelse(bp);
1085 }
1086 }
1087
1088 if ((error == 0 || uio->uio_resid != orig_resid) &&
1089 (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
1090 ip->i_flag |= IN_ACCESS;
1091 return (error);
1092}
1093
1094/*
1095 * Extended attribute area writing.
1096 */
1097static int
1098ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
1099{
1100 struct inode *ip;
1101 struct ufs2_dinode *dp;
1102 struct fs *fs;
1103 struct buf *bp;
1104 ufs_lbn_t lbn;
1105 off_t osize;
1106 int blkoffset, error, flags, resid, size, xfersize;
1107
1108 GIANT_REQUIRED;
1109
1110 ip = VTOI(vp);
1111 fs = ip->i_fs;
1112 dp = ip->i_din2;
1113
1114#ifdef DIAGNOSTIC
1115 if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
1116 panic("ext_write: mode");
1117#endif
1118
1119 if (ioflag & IO_APPEND)
1120 uio->uio_offset = dp->di_extsize;
1121
1122 if (uio->uio_offset < 0 ||
1123 (u_int64_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize)
1124 return (EFBIG);
1125
1126 resid = uio->uio_resid;
1127 osize = dp->di_extsize;
1128 flags = IO_EXT;
1129 if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
1130 flags |= IO_SYNC;
1131
1132 for (error = 0; uio->uio_resid > 0;) {
1133 lbn = lblkno(fs, uio->uio_offset);
1134 blkoffset = blkoff(fs, uio->uio_offset);
1135 xfersize = fs->fs_bsize - blkoffset;
1136 if (uio->uio_resid < xfersize)
1137 xfersize = uio->uio_resid;
1138
1139 /*
1140 * We must perform a read-before-write if the transfer size
1141 * does not cover the entire buffer.
1142 */
1143 if (fs->fs_bsize > xfersize)
1144 flags |= BA_CLRBUF;
1145 else
1146 flags &= ~BA_CLRBUF;
1147 error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
1148 ucred, flags, &bp);
1149 if (error != 0)
1150 break;
1151 /*
1152 * If the buffer is not valid we have to clear out any
1153 * garbage data from the pages instantiated for the buffer.
1154 * If we do not, a failed uiomove() during a write can leave
1155 * the prior contents of the pages exposed to a userland
1156 * mmap(). XXX deal with uiomove() errors a better way.
1157 */
1158 if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
1159 vfs_bio_clrbuf(bp);
1160 if (ioflag & IO_DIRECT)
1161 bp->b_flags |= B_DIRECT;
1162 if (ioflag & IO_NOWDRAIN)
1163 bp->b_flags |= B_NOWDRAIN;
1164
1165 if (uio->uio_offset + xfersize > dp->di_extsize)
1166 dp->di_extsize = uio->uio_offset + xfersize;
1167
1168 size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
1169 if (size < xfersize)
1170 xfersize = size;
1171
1172 error =
1173 uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
1174 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
1175 (LIST_FIRST(&bp->b_dep) == NULL)) {
1176 bp->b_flags |= B_RELBUF;
1177 }
1178
1179 /*
1180 * If IO_SYNC each buffer is written synchronously. Otherwise
1181 * if we have a severe page deficiency write the buffer
1182 * asynchronously. Otherwise try to cluster, and if that
1183 * doesn't do it then either do an async write (if O_DIRECT),
1184 * or a delayed write (if not).
1185 */
1186 if (ioflag & IO_SYNC) {
1187 (void)bwrite(bp);
1188 } else if (vm_page_count_severe() ||
1189 buf_dirty_count_severe() ||
1190 xfersize + blkoffset == fs->fs_bsize ||
1191 (ioflag & (IO_ASYNC | IO_DIRECT)))
1192 bawrite(bp);
1193 else
1194 bdwrite(bp);
1195 if (error || xfersize == 0)
1196 break;
1197 ip->i_flag |= IN_CHANGE | IN_UPDATE;
1198 }
1199 /*
1200	 * If we successfully wrote any data and we are not the superuser,
1201	 * we clear the setuid and setgid bits as a precaution against
1202 * tampering.
1203 */
1204 if (resid > uio->uio_resid && ucred &&
1205 suser_cred(ucred, PRISON_ROOT)) {
1206 ip->i_mode &= ~(ISUID | ISGID);
1207 dp->di_mode = ip->i_mode;
1208 }
1209 if (error) {
1210 if (ioflag & IO_UNIT) {
1211 (void)UFS_TRUNCATE(vp, osize,
1212 IO_EXT | (ioflag&IO_SYNC), ucred, uio->uio_td);
1213 uio->uio_offset -= resid - uio->uio_resid;
1214 uio->uio_resid = resid;
1215 }
1216 } else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
1217 error = UFS_UPDATE(vp, 1);
1218 return (error);
1219}
1220
1221
1222/*
1223 * Extended attribute area lookup helper.
1224 *
1225 * Locate a particular EA (nspace:name) in the area (ptr:length), and return
1226 * the length of the EA, and possibly the pointer to the entry and to the data.
1227 */
1228static int
1229ffs_findextattr(u_char *ptr, uint length, int nspace, const char *name, u_char **eap, u_char **eac)
1230{
1231 u_char *p, *pe, *pn, *p0;
1232 int eapad1, eapad2, ealength, ealen, nlen;
1233 uint32_t ul;
1234
1235 pe = ptr + length;
1236 nlen = strlen(name);
1237
1238 for (p = ptr; p < pe; p = pn) {
1239 p0 = p;
1240 bcopy(p, &ul, sizeof(ul));
1241 pn = p + ul;
1242 /* make sure this entry is complete */
1243 if (pn > pe)
1244 break;
1245 p += sizeof(uint32_t);
1246 if (*p != nspace)
1247 continue;
1248 p++;
1249 eapad2 = *p++;
1250 if (*p != nlen)
1251 continue;
1252 p++;
1253 if (bcmp(p, name, nlen))
1254 continue;
1255 ealength = sizeof(uint32_t) + 3 + nlen;
1256 eapad1 = 8 - (ealength % 8);
1257 if (eapad1 == 8)
1258 eapad1 = 0;
1259 ealength += eapad1;
1260 ealen = ul - ealength - eapad2;
1261 p += nlen + eapad1;
1262 if (eap != NULL)
1263 *eap = p0;
1264 if (eac != NULL)
1265 *eac = p;
1266 return (ealen);
1267 }
1268 return(-1);
1269}
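/*
 * Layout of a single EA record as parsed above (each record is padded to an
 * 8-byte boundary):
 *
 *	uint32_t ul;		total record length, including all padding
 *	u_char	 namespace;	attribute namespace
 *	u_char	 eapad2;	bytes of padding that follow the content
 *	u_char	 nlen;		length of the attribute name
 *	char	 name[nlen];	attribute name
 *	u_char	 pad1[eapad1];	pads the header out to an 8-byte boundary
 *	u_char	 content[];	attribute data, followed by eapad2 pad bytes
 *
 * For example (illustrative), the name "md5" gives a 4 + 3 + 3 = 10 byte
 * header, so eapad1 = 6; with 16 bytes of content eapad2 = 0 and the total
 * record length ul is 10 + 6 + 16 + 0 = 32.
 */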
1270
1271static int
1272ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra)
1273{
1274 struct inode *ip;
1275 struct fs *fs;
1276 struct ufs2_dinode *dp;
1277 struct uio luio;
1278 struct iovec liovec;
1279 int easize, error;
1280 u_char *eae;
1281
1282 ip = VTOI(vp);
1283 fs = ip->i_fs;
1284 dp = ip->i_din2;
1285 easize = dp->di_extsize;
1286
1287 eae = malloc(easize + extra, M_TEMP, M_WAITOK);
1288
1289 liovec.iov_base = eae;
1290 liovec.iov_len = easize;
1291 luio.uio_iov = &liovec;
1292 luio.uio_iovcnt = 1;
1293 luio.uio_offset = 0;
1294 luio.uio_resid = easize;
1295 luio.uio_segflg = UIO_SYSSPACE;
1296 luio.uio_rw = UIO_READ;
1297 luio.uio_td = td;
1298
1299 error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
1300 if (error) {
1301 free(eae, M_TEMP);
1302 return(error);
1303 }
1304 *p = eae;
1305 return (0);
1306}
1307
1308static int
1309ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
1310{
1311 struct inode *ip;
1312 struct fs *fs;
1313 struct ufs2_dinode *dp;
1314 int error;
1315
1316 ip = VTOI(vp);
1317 fs = ip->i_fs;
1318
1319 if (ip->i_ea_area != NULL)
1320 return (EBUSY);
1321 dp = ip->i_din2;
1322 error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0);
1323 if (error)
1324 return (error);
1325 ip->i_ea_len = dp->di_extsize;
1326 ip->i_ea_error = 0;
1327 return (0);
1328}
1329
1330/*
1331 * Vnode extattr transaction commit/abort
1332 */
1333static int
1334ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td)
1335{
1336 struct inode *ip;
1337 struct fs *fs;
1338 struct uio luio;
1339 struct iovec liovec;
1340 int error;
1341 struct ufs2_dinode *dp;
1342
1343 ip = VTOI(vp);
1344 fs = ip->i_fs;
1345 if (ip->i_ea_area == NULL)
1346 return (EINVAL);
1347 dp = ip->i_din2;
1348 error = ip->i_ea_error;
1349 if (commit && error == 0) {
1350 if (cred == NOCRED)
1351 cred = vp->v_mount->mnt_cred;
1352 liovec.iov_base = ip->i_ea_area;
1353 liovec.iov_len = ip->i_ea_len;
1354 luio.uio_iov = &liovec;
1355 luio.uio_iovcnt = 1;
1356 luio.uio_offset = 0;
1357 luio.uio_resid = ip->i_ea_len;
1358 luio.uio_segflg = UIO_SYSSPACE;
1359 luio.uio_rw = UIO_WRITE;
1360 luio.uio_td = td;
1361 /* XXX: I'm not happy about truncating to zero size */
1362 if (ip->i_ea_len < dp->di_extsize)
1363 error = ffs_truncate(vp, 0, IO_EXT, cred, td);
1364 error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
1365 }
1366 free(ip->i_ea_area, M_TEMP);
1367 ip->i_ea_area = NULL;
1368 ip->i_ea_len = 0;
1369 ip->i_ea_error = 0;
1370 return (error);
1371}
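/*
 * How the extended attribute routines fit together: ffs_open_ea() snapshots
 * the whole extattr area of an inode into ip->i_ea_area / ip->i_ea_len;
 * ffs_getextattr() and ffs_setextattr() then operate on that in-memory copy;
 * ffs_close_ea() either writes the copy back with ffs_extwrite() (commit !=
 * 0) or simply discards it.  A caller that has not issued VOP_OPENEXTATTR
 * gets the same effect through the "stand_alone" paths in the get/set
 * routines, which open and close the area around the single operation.
 */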
1372
1373/*
1374 * Vnode extattr strategy routine for special devices and fifos.
1375 *
1376 * We need to check for a read or write of the external attributes.
1377 * Otherwise we just fall through and do the usual thing.
1378 */
1379static int
1380ffsext_strategy(struct vop_strategy_args *ap)
1381/*
1382struct vop_strategy_args {
1383 struct vnodeop_desc *a_desc;
1384 struct vnode *a_vp;
1385 struct buf *a_bp;
1386};
1387*/
1388{
1389 struct vnode *vp;
1390 daddr_t lbn;
1391
1392 vp = ap->a_vp;
1393 lbn = ap->a_bp->b_lblkno;
1394 if (VTOI(vp)->i_fs->fs_magic == FS_UFS2_MAGIC &&
1395 lbn < 0 && lbn >= -NXADDR)
1396 return (ufs_vnoperate((struct vop_generic_args *)ap));
1397 if (vp->v_type == VFIFO)
1398 return (ufs_vnoperatefifo((struct vop_generic_args *)ap));
1399 return (ufs_vnoperatespec((struct vop_generic_args *)ap));
1400}
1401
1402/*
1403 * Vnode extattr transaction start.
1404 */
1405static int
1406ffs_openextattr(struct vop_openextattr_args *ap)
1407/*
1408struct vop_openextattr_args {
1409 struct vnodeop_desc *a_desc;
1410 struct vnode *a_vp;
1411 IN struct ucred *a_cred;
1412 IN struct thread *a_td;
1413};
1414*/
1415{
1416 struct inode *ip;
1417 struct fs *fs;
1418
1419 ip = VTOI(ap->a_vp);
1420 fs = ip->i_fs;
1421 if (fs->fs_magic == FS_UFS1_MAGIC)
1422 return (ufs_vnoperate((struct vop_generic_args *)ap));
1423 return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
1424}
1425
1426
1427/*
1428 * Vnode extattr transaction commit/abort
1429 */
1430static int
1431ffs_closeextattr(struct vop_closeextattr_args *ap)
1432/*
1433struct vop_closeextattr_args {
1434 struct vnodeop_desc *a_desc;
1435 struct vnode *a_vp;
1436 int a_commit;
1437 IN struct ucred *a_cred;
1438 IN struct thread *a_td;
1439};
1440*/
1441{
1442 struct inode *ip;
1443 struct fs *fs;
1444
1445 ip = VTOI(ap->a_vp);
1446 fs = ip->i_fs;
1447 if (fs->fs_magic == FS_UFS1_MAGIC)
1448 return (ufs_vnoperate((struct vop_generic_args *)ap));
1449 return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
1450}
1451
1452
1453
1454/*
1455 * Vnode operation to retrieve a named extended attribute.
1456 */
1457static int
1458ffs_getextattr(struct vop_getextattr_args *ap)
1459/*
1460vop_getextattr {
1461 IN struct vnode *a_vp;
1462 IN int a_attrnamespace;
1463 IN const char *a_name;
1464 INOUT struct uio *a_uio;
1465 OUT size_t *a_size;
1466 IN struct ucred *a_cred;
1467 IN struct thread *a_td;
1468};
1469*/
1470{
1471 struct inode *ip;
1472 struct fs *fs;
1473 u_char *eae, *p, *pe, *pn;
1474 struct ufs2_dinode *dp;
1475 unsigned easize;
1476 uint32_t ul;
1477 int error, ealen, stand_alone;
1478
1479 ip = VTOI(ap->a_vp);
1480 fs = ip->i_fs;
1481
1482 if (fs->fs_magic == FS_UFS1_MAGIC)
1483 return (ufs_vnoperate((struct vop_generic_args *)ap));
1484
1485 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1486 ap->a_cred, ap->a_td, IREAD);
1487 if (error)
1488 return (error);
1489
1490 if (ip->i_ea_area == NULL) {
1491 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1492 if (error)
1493 return (error);
1494 stand_alone = 1;
1495 } else {
1496 stand_alone = 0;
1497 }
1498 dp = ip->i_din2;
1499 eae = ip->i_ea_area;
1500 easize = ip->i_ea_len;
1501 if (strlen(ap->a_name) > 0) {
1502 ealen = ffs_findextattr(eae, easize,
1503 ap->a_attrnamespace, ap->a_name, NULL, &p);
1504 if (ealen >= 0) {
1505 error = 0;
1506 if (ap->a_size != NULL)
1507 *ap->a_size = ealen;
1508 else if (ap->a_uio != NULL)
1509 error = uiomove(p, ealen, ap->a_uio);
1510 } else {
1511 error = ENOATTR;
1512 }
1513 } else {
1514 error = 0;
1515 if (ap->a_size != NULL)
1516 *ap->a_size = 0;
1517 pe = eae + easize;
1518 for(p = eae; error == 0 && p < pe; p = pn) {
1519 bcopy(p, &ul, sizeof(ul));
1520 pn = p + ul;
1521 if (pn > pe)
1522 break;
1523 p += sizeof(ul);
1524 if (*p++ != ap->a_attrnamespace)
1525 continue;
1526 p++; /* pad2 */
1527 ealen = *p;
1528 if (ap->a_size != NULL) {
1529 *ap->a_size += ealen + 1;
1530 } else if (ap->a_uio != NULL) {
1531 error = uiomove(p, ealen + 1, ap->a_uio);
1532 }
1533 }
1534 }
1535 if (stand_alone)
1536 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1537 return(error);
1538}
1539
1540/*
1541 * Vnode operation to set a named attribute.
1542 */
1543static int
1544ffs_setextattr(struct vop_setextattr_args *ap)
1545/*
1546vop_setextattr {
1547 IN struct vnode *a_vp;
1548 IN int a_attrnamespace;
1549 IN const char *a_name;
1550 INOUT struct uio *a_uio;
1551 IN struct ucred *a_cred;
1552 IN struct thread *a_td;
1553};
1554*/
1555{
1556 struct inode *ip;
1557 struct fs *fs;
1558 uint32_t ealength, ul;
1559 int ealen, olen, eacont, eapad1, eapad2, error, i, easize;
1560 u_char *eae, *p;
1561 struct ufs2_dinode *dp;
1562 struct ucred *cred;
1563 int stand_alone;
1564
1565 ip = VTOI(ap->a_vp);
1566 fs = ip->i_fs;
1567
1568 if (fs->fs_magic == FS_UFS1_MAGIC)
1569 return (ufs_vnoperate((struct vop_generic_args *)ap));
1570
1571 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1572 ap->a_cred, ap->a_td, IWRITE);
1573 if (error) {
1574 if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1575 ip->i_ea_error = error;
1576 return (error);
1577 }
1578
1579 if (ap->a_cred != NOCRED)
1580 cred = ap->a_cred;
1581 else
1582 cred = ap->a_vp->v_mount->mnt_cred;
1583
1584 dp = ip->i_din2;
1585
1586 if (ip->i_ea_area == NULL) {
1587 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1588 if (error)
1589 return (error);
1590 stand_alone = 1;
1591 } else {
1592 stand_alone = 0;
1593 }
1594
1595 /* Calculate the length of the EA entry */
1596 if (ap->a_uio == NULL) {
1597 /* delete */
1598 ealength = eapad1 = ealen = eapad2 = eacont = 0;
1599 } else {
1600 ealen = ap->a_uio->uio_resid;
1601 ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
1602 eapad1 = 8 - (ealength % 8);
1603 if (eapad1 == 8)
1604 eapad1 = 0;
1605 eacont = ealength + eapad1;
1606 eapad2 = 8 - (ealen % 8);
1607 if (eapad2 == 8)
1608 eapad2 = 0;
1609 ealength += eapad1 + ealen + eapad2;
1610 }
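	/*
	 * Worked example of the length calculation above (illustrative):
	 * setting an attribute named "sha1" (strlen 4) with 20 bytes of
	 * data gives a 4 + 3 + 4 = 11 byte header, eapad1 = 5, eapad2 = 4,
	 * and a total record of ealength = 11 + 5 + 20 + 4 = 40 bytes,
	 * keeping both the content and the following record 8-byte aligned.
	 */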
1611
1612 eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
1613 bcopy(ip->i_ea_area, eae, ip->i_ea_len);
1614 easize = ip->i_ea_len;
1615
1616 olen = ffs_findextattr(eae, easize,
1617 ap->a_attrnamespace, ap->a_name, &p, NULL);
1618 if (olen == -1 && ealength == 0) {
1619 /* delete but nonexistent */
1620 free(eae, M_TEMP);
1621 if (stand_alone)
1622 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1623 return(ENOATTR);
1624 }
1625 if (olen == -1) {
1626 /* new, append at end */
1627 p = eae + easize;
1628 easize += ealength;
1629 } else {
1630 bcopy(p, &ul, sizeof ul);
1631 i = p - eae + ul;
1632 if (ul != ealength) {
1633 bcopy(p + ul, p + ealength, easize - i);
1634 easize += (ealength - ul);
1635 }
1636 }
1637 if (easize > NXADDR * fs->fs_bsize) {
1638 free(eae, M_TEMP);
1639 if (stand_alone)
1640 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1641 else if (ip->i_ea_error == 0)
1642 ip->i_ea_error = ENOSPC;
1643 return(ENOSPC);
1644 }
1645 if (ealength != 0) {
1646 bcopy(&ealength, p, sizeof(ealength));
1647 p += sizeof(ealength);
1648 *p++ = ap->a_attrnamespace;
1649 *p++ = eapad2;
1650 *p++ = strlen(ap->a_name);
1651 strcpy(p, ap->a_name);
1652 p += strlen(ap->a_name);
1653 bzero(p, eapad1);
1654 p += eapad1;
1655 error = uiomove(p, ealen, ap->a_uio);
1656 if (error) {
1657 free(eae, M_TEMP);
1658 if (stand_alone)
1659 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1660 else if (ip->i_ea_error == 0)
1661 ip->i_ea_error = error;
1662 return(error);
1663 }
1664 p += ealen;
1665 bzero(p, eapad2);
1666 }
1667 p = ip->i_ea_area;
1668 ip->i_ea_area = eae;
1669 ip->i_ea_len = easize;
1670 free(p, M_TEMP);
1671 if (stand_alone)
1672 error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
1673 return(error);
1674}
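
/*
 * For reference, a minimal userland sketch of exercising these vnode
 * operations through the extattr(2) system call family (illustrative only;
 * it assumes a file on a UFS2 filesystem at the hypothetical path
 * "/mnt/file" and the EXTATTR_NAMESPACE_USER namespace, with error handling
 * abbreviated).  The set call reaches ffs_setextattr() and the get call
 * reaches ffs_getextattr(), both via the stand-alone open/close paths:
 *
 *	#include <sys/types.h>
 *	#include <sys/extattr.h>
 *
 *	#include <err.h>
 *	#include <string.h>
 *
 *	int
 *	main(void)
 *	{
 *		const char *val = "0123456789abcdef";
 *		char buf[64];
 *		ssize_t len;
 *
 *		if (extattr_set_file("/mnt/file", EXTATTR_NAMESPACE_USER,
 *		    "md5", val, strlen(val)) < 0)
 *			err(1, "extattr_set_file");
 *		len = extattr_get_file("/mnt/file", EXTATTR_NAMESPACE_USER,
 *		    "md5", buf, sizeof(buf));
 *		if (len < 0)
 *			err(1, "extattr_get_file");
 *		return (0);
 *	}
 */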