ffs_vnops.c (112181) ffs_vnops.c (112694)
1/*
2 * Copyright (c) 2002 Networks Associates Technology, Inc.
3 * All rights reserved.
4 *
5 * This software was developed for the FreeBSD Project by Marshall
6 * Kirk McKusick and Network Associates Laboratories, the Security
7 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
8 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
9 * research program
10 *
11 * Copyright (c) 1982, 1986, 1989, 1993
12 * The Regents of the University of California. All rights reserved.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * 1. Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in the
21 * documentation and/or other materials provided with the distribution.
22 * 3. All advertising materials mentioning features or use of this software
23 * must display the following acknowledgement:
24 * This product includes software developed by the University of
25 * California, Berkeley and its contributors.
26 * 4. Neither the name of the University nor the names of its contributors
27 * may be used to endorse or promote products derived from this software
28 * without specific prior written permission.
29 *
30 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40 * SUCH DAMAGE.
41 *
42 * @(#)ffs_vnops.c 8.15 (Berkeley) 5/14/95
43 * $FreeBSD: head/sys/ufs/ffs/ffs_vnops.c 112181 2003-03-13 07:19:23Z jeff $
43 * $FreeBSD: head/sys/ufs/ffs/ffs_vnops.c 112694 2003-03-26 23:40:42Z tegge $
44 */
45
46#include <sys/param.h>
47#include <sys/bio.h>
48#include <sys/systm.h>
49#include <sys/buf.h>
50#include <sys/conf.h>
51#include <sys/extattr.h>
52#include <sys/kernel.h>
53#include <sys/malloc.h>
54#include <sys/mount.h>
55#include <sys/proc.h>
56#include <sys/resourcevar.h>
57#include <sys/signalvar.h>
58#include <sys/stat.h>
59#include <sys/vmmeter.h>
60#include <sys/vnode.h>
61
62#include <machine/limits.h>
63
64#include <vm/vm.h>
65#include <vm/vm_extern.h>
66#include <vm/vm_object.h>
67#include <vm/vm_page.h>
68#include <vm/vm_pager.h>
69#include <vm/vnode_pager.h>
70
71#include <ufs/ufs/extattr.h>
72#include <ufs/ufs/quota.h>
73#include <ufs/ufs/inode.h>
74#include <ufs/ufs/ufs_extern.h>
75#include <ufs/ufs/ufsmount.h>
76
77#include <ufs/ffs/fs.h>
78#include <ufs/ffs/ffs_extern.h>
79#include "opt_directio.h"
 80
81#ifdef DIRECTIO
82extern int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
83#endif
80static int ffs_fsync(struct vop_fsync_args *);
81static int ffs_getpages(struct vop_getpages_args *);
82static int ffs_read(struct vop_read_args *);
83static int ffs_write(struct vop_write_args *);
84static int ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
85static int ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
86 struct ucred *cred);
87static int ffsext_strategy(struct vop_strategy_args *);
88static int ffs_closeextattr(struct vop_closeextattr_args *);
89static int ffs_getextattr(struct vop_getextattr_args *);
90static int ffs_openextattr(struct vop_openextattr_args *);
91static int ffs_setextattr(struct vop_setextattr_args *);
92
93
94/* Global vfs data structures for ufs. */
95vop_t **ffs_vnodeop_p;
96static struct vnodeopv_entry_desc ffs_vnodeop_entries[] = {
97 { &vop_default_desc, (vop_t *) ufs_vnoperate },
98 { &vop_fsync_desc, (vop_t *) ffs_fsync },
99 { &vop_getpages_desc, (vop_t *) ffs_getpages },
100 { &vop_read_desc, (vop_t *) ffs_read },
101 { &vop_reallocblks_desc, (vop_t *) ffs_reallocblks },
102 { &vop_write_desc, (vop_t *) ffs_write },
103 { &vop_closeextattr_desc, (vop_t *) ffs_closeextattr },
104 { &vop_getextattr_desc, (vop_t *) ffs_getextattr },
105 { &vop_openextattr_desc, (vop_t *) ffs_openextattr },
106 { &vop_setextattr_desc, (vop_t *) ffs_setextattr },
107 { NULL, NULL }
108};
109static struct vnodeopv_desc ffs_vnodeop_opv_desc =
110 { &ffs_vnodeop_p, ffs_vnodeop_entries };
111
112vop_t **ffs_specop_p;
113static struct vnodeopv_entry_desc ffs_specop_entries[] = {
114 { &vop_default_desc, (vop_t *) ufs_vnoperatespec },
115 { &vop_fsync_desc, (vop_t *) ffs_fsync },
116 { &vop_reallocblks_desc, (vop_t *) ffs_reallocblks },
117 { &vop_strategy_desc, (vop_t *) ffsext_strategy },
118 { &vop_closeextattr_desc, (vop_t *) ffs_closeextattr },
119 { &vop_getextattr_desc, (vop_t *) ffs_getextattr },
120 { &vop_openextattr_desc, (vop_t *) ffs_openextattr },
121 { &vop_setextattr_desc, (vop_t *) ffs_setextattr },
122 { NULL, NULL }
123};
124static struct vnodeopv_desc ffs_specop_opv_desc =
125 { &ffs_specop_p, ffs_specop_entries };
126
127vop_t **ffs_fifoop_p;
128static struct vnodeopv_entry_desc ffs_fifoop_entries[] = {
129 { &vop_default_desc, (vop_t *) ufs_vnoperatefifo },
130 { &vop_fsync_desc, (vop_t *) ffs_fsync },
131 { &vop_reallocblks_desc, (vop_t *) ffs_reallocblks },
132 { &vop_strategy_desc, (vop_t *) ffsext_strategy },
133 { &vop_closeextattr_desc, (vop_t *) ffs_closeextattr },
134 { &vop_getextattr_desc, (vop_t *) ffs_getextattr },
135 { &vop_openextattr_desc, (vop_t *) ffs_openextattr },
136 { &vop_setextattr_desc, (vop_t *) ffs_setextattr },
137 { NULL, NULL }
138};
139static struct vnodeopv_desc ffs_fifoop_opv_desc =
140 { &ffs_fifoop_p, ffs_fifoop_entries };
141
142VNODEOP_SET(ffs_vnodeop_opv_desc);
143VNODEOP_SET(ffs_specop_opv_desc);
144VNODEOP_SET(ffs_fifoop_opv_desc);
145
146/*
147 * Synch an open file.
148 */
149/* ARGSUSED */
150static int
151ffs_fsync(ap)
152 struct vop_fsync_args /* {
153 struct vnode *a_vp;
154 struct ucred *a_cred;
155 int a_waitfor;
156 struct thread *a_td;
157 } */ *ap;
158{
159 struct vnode *vp = ap->a_vp;
160 struct inode *ip = VTOI(vp);
161 struct buf *bp;
162 struct buf *nbp;
163 int s, error, wait, passes, skipmeta;
164 ufs_lbn_t lbn;
165
166 wait = (ap->a_waitfor == MNT_WAIT);
167 if (vn_isdisk(vp, NULL)) {
168 lbn = INT_MAX;
169 if (vp->v_rdev->si_mountpoint != NULL &&
170 (vp->v_rdev->si_mountpoint->mnt_flag & MNT_SOFTDEP))
171 softdep_fsync_mountdev(vp);
172 } else {
173 lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1));
174 }
175
176 /*
177 * Flush all dirty buffers associated with a vnode.
178 */
179 passes = NIADDR + 1;
180 skipmeta = 0;
181 if (wait)
182 skipmeta = 1;
183 s = splbio();
184 VI_LOCK(vp);
185loop:
186 TAILQ_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs)
187 bp->b_vflags &= ~BV_SCANNED;
188 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
189 nbp = TAILQ_NEXT(bp, b_vnbufs);
190 /*
191 * Reasons to skip this buffer: it has already been considered
192 * on this pass, this pass is the first time through on a
193 * synchronous flush request and the buffer being considered
194 * is metadata, the buffer has dependencies that will cause
195 * it to be redirtied and it has not already been deferred,
196 * or it is already being written.
197 */
198 if ((bp->b_vflags & BV_SCANNED) != 0)
199 continue;
200 bp->b_vflags |= BV_SCANNED;
201 if ((skipmeta == 1 && bp->b_lblkno < 0))
202 continue;
203 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
204 continue;
205 if (!wait && LIST_FIRST(&bp->b_dep) != NULL &&
206 (bp->b_flags & B_DEFERRED) == 0 &&
207 buf_countdeps(bp, 0)) {
208 bp->b_flags |= B_DEFERRED;
209 BUF_UNLOCK(bp);
210 continue;
211 }
212 VI_UNLOCK(vp);
213 if ((bp->b_flags & B_DELWRI) == 0)
214 panic("ffs_fsync: not dirty");
215 if (vp != bp->b_vp)
216 panic("ffs_fsync: vp != vp->b_vp");
217 /*
218 * If this is a synchronous flush request, or it is not a
 219 * file or device, start the write on this buffer immediately.
220 */
221 if (wait || (vp->v_type != VREG && vp->v_type != VBLK)) {
222
223 /*
224 * On our final pass through, do all I/O synchronously
225 * so that we can find out if our flush is failing
226 * because of write errors.
227 */
228 if (passes > 0 || !wait) {
229 if ((bp->b_flags & B_CLUSTEROK) && !wait) {
230 (void) vfs_bio_awrite(bp);
231 } else {
232 bremfree(bp);
233 splx(s);
234 (void) bawrite(bp);
235 s = splbio();
236 }
237 } else {
238 bremfree(bp);
239 splx(s);
240 if ((error = bwrite(bp)) != 0)
241 return (error);
242 s = splbio();
243 }
244 } else if ((vp->v_type == VREG) && (bp->b_lblkno >= lbn)) {
245 /*
246 * If the buffer is for data that has been truncated
247 * off the file, then throw it away.
248 */
249 bremfree(bp);
250 bp->b_flags |= B_INVAL | B_NOCACHE;
251 splx(s);
252 brelse(bp);
253 s = splbio();
254 } else
255 vfs_bio_awrite(bp);
256
257 /*
258 * Since we may have slept during the I/O, we need
259 * to start from a known point.
260 */
261 VI_LOCK(vp);
262 nbp = TAILQ_FIRST(&vp->v_dirtyblkhd);
263 }
264 /*
265 * If we were asked to do this synchronously, then go back for
266 * another pass, this time doing the metadata.
267 */
268 if (skipmeta) {
269 skipmeta = 0;
270 goto loop;
271 }
272
273 if (wait) {
274 while (vp->v_numoutput) {
275 vp->v_iflag |= VI_BWAIT;
276 msleep((caddr_t)&vp->v_numoutput, VI_MTX(vp),
277 PRIBIO + 4, "ffsfsn", 0);
278 }
279 VI_UNLOCK(vp);
280
281 /*
 282 * Ensure that any filesystem metadata associated
283 * with the vnode has been written.
284 */
285 splx(s);
286 if ((error = softdep_sync_metadata(ap)) != 0)
287 return (error);
288 s = splbio();
289
290 VI_LOCK(vp);
291 if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
292 /*
293 * Block devices associated with filesystems may
294 * have new I/O requests posted for them even if
295 * the vnode is locked, so no amount of trying will
296 * get them clean. Thus we give block devices a
297 * good effort, then just give up. For all other file
298 * types, go around and try again until it is clean.
299 */
300 if (passes > 0) {
301 passes -= 1;
302 goto loop;
303 }
304#ifdef DIAGNOSTIC
305 if (!vn_isdisk(vp, NULL))
306 vprint("ffs_fsync: dirty", vp);
307#endif
308 }
309 }
310 VI_UNLOCK(vp);
311 splx(s);
312 return (UFS_UPDATE(vp, wait));
313}
314
315
316/*
317 * Vnode op for reading.
318 */
319/* ARGSUSED */
320static int
321ffs_read(ap)
322 struct vop_read_args /* {
323 struct vnode *a_vp;
324 struct uio *a_uio;
325 int a_ioflag;
326 struct ucred *a_cred;
327 } */ *ap;
328{
329 struct vnode *vp;
330 struct inode *ip;
331 struct uio *uio;
332 struct fs *fs;
333 struct buf *bp;
334 ufs_lbn_t lbn, nextlbn;
335 off_t bytesinfile;
336 long size, xfersize, blkoffset;
337 int error, orig_resid;
338 mode_t mode;
339 int seqcount;
340 int ioflag;
341 vm_object_t object;
342
343 vp = ap->a_vp;
344 uio = ap->a_uio;
345 ioflag = ap->a_ioflag;
346 if (ap->a_ioflag & IO_EXT)
347#ifdef notyet
348 return (ffs_extread(vp, uio, ioflag));
349#else
350 panic("ffs_read+IO_EXT");
351#endif
356#ifdef DIRECTIO
357 if ((ioflag & IO_DIRECT) != 0) {
358 int workdone;
 359
360 error = ffs_rawread(vp, uio, &workdone);
361 if (error != 0 || workdone != 0)
362 return error;
363 }
364#endif
365
353 GIANT_REQUIRED;
354
355 seqcount = ap->a_ioflag >> 16;
356 ip = VTOI(vp);
357 mode = ip->i_mode;
358
359#ifdef DIAGNOSTIC
360 if (uio->uio_rw != UIO_READ)
361 panic("ffs_read: mode");
362
363 if (vp->v_type == VLNK) {
364 if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
365 panic("ffs_read: short symlink");
366 } else if (vp->v_type != VREG && vp->v_type != VDIR)
367 panic("ffs_read: type %d", vp->v_type);
368#endif
369 fs = ip->i_fs;
370 if ((u_int64_t)uio->uio_offset > fs->fs_maxfilesize)
371 return (EFBIG);
372
373 orig_resid = uio->uio_resid;
374 if (orig_resid <= 0)
375 return (0);
376
377 object = vp->v_object;
378
379 bytesinfile = ip->i_size - uio->uio_offset;
380 if (bytesinfile <= 0) {
381 if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
382 ip->i_flag |= IN_ACCESS;
383 return 0;
384 }
385
386 if (object) {
387 vm_object_reference(object);
388 }
389
390 /*
391 * Ok so we couldn't do it all in one vm trick...
392 * so cycle around trying smaller bites..
393 */
394 for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
395 if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
396 break;
397
398 lbn = lblkno(fs, uio->uio_offset);
399 nextlbn = lbn + 1;
400
401 /*
402 * size of buffer. The buffer representing the
403 * end of the file is rounded up to the size of
404 * the block type ( fragment or full block,
405 * depending ).
406 */
407 size = blksize(fs, ip, lbn);
408 blkoffset = blkoff(fs, uio->uio_offset);
409
410 /*
411 * The amount we want to transfer in this iteration is
412 * one FS block less the amount of the data before
413 * our startpoint (duh!)
414 */
415 xfersize = fs->fs_bsize - blkoffset;
416
417 /*
418 * But if we actually want less than the block,
419 * or the file doesn't have a whole block more of data,
420 * then use the lesser number.
421 */
422 if (uio->uio_resid < xfersize)
423 xfersize = uio->uio_resid;
424 if (bytesinfile < xfersize)
425 xfersize = bytesinfile;
426
427 if (lblktosize(fs, nextlbn) >= ip->i_size) {
428 /*
429 * Don't do readahead if this is the end of the file.
430 */
431 error = bread(vp, lbn, size, NOCRED, &bp);
432 } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
433 /*
434 * Otherwise if we are allowed to cluster,
435 * grab as much as we can.
436 *
437 * XXX This may not be a win if we are not
438 * doing sequential access.
439 */
440 error = cluster_read(vp, ip->i_size, lbn,
441 size, NOCRED, uio->uio_resid, seqcount, &bp);
442 } else if (seqcount > 1) {
443 /*
444 * If we are NOT allowed to cluster, then
445 * if we appear to be acting sequentially,
446 * fire off a request for a readahead
447 * as well as a read. Note that the 4th and 5th
448 * arguments point to arrays of the size specified in
449 * the 6th argument.
450 */
451 int nextsize = blksize(fs, ip, nextlbn);
452 error = breadn(vp, lbn,
453 size, &nextlbn, &nextsize, 1, NOCRED, &bp);
454 } else {
455 /*
456 * Failing all of the above, just read what the
457 * user asked for. Interestingly, the same as
458 * the first option above.
459 */
460 error = bread(vp, lbn, size, NOCRED, &bp);
461 }
462 if (error) {
463 brelse(bp);
464 bp = NULL;
465 break;
466 }
467
468 /*
469 * If IO_DIRECT then set B_DIRECT for the buffer. This
470 * will cause us to attempt to release the buffer later on
471 * and will cause the buffer cache to attempt to free the
472 * underlying pages.
473 */
474 if (ioflag & IO_DIRECT)
475 bp->b_flags |= B_DIRECT;
476
477 /*
478 * We should only get non-zero b_resid when an I/O error
479 * has occurred, which should cause us to break above.
480 * However, if the short read did not cause an error,
481 * then we want to ensure that we do not uiomove bad
482 * or uninitialized data.
483 */
484 size -= bp->b_resid;
485 if (size < xfersize) {
486 if (size == 0)
487 break;
488 xfersize = size;
489 }
490
491 {
492 /*
493 * otherwise use the general form
494 */
495 error =
496 uiomove((char *)bp->b_data + blkoffset,
497 (int)xfersize, uio);
498 }
499
500 if (error)
501 break;
502
503 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
504 (LIST_FIRST(&bp->b_dep) == NULL)) {
505 /*
506 * If there are no dependencies, and it's VMIO,
507 * then we don't need the buf, mark it available
508 * for freeing. The VM has the data.
509 */
510 bp->b_flags |= B_RELBUF;
511 brelse(bp);
512 } else {
513 /*
514 * Otherwise let whoever
515 * made the request take care of
516 * freeing it. We just queue
517 * it onto another list.
518 */
519 bqrelse(bp);
520 }
521 }
522
523 /*
524 * This can only happen in the case of an error
525 * because the loop above resets bp to NULL on each iteration
526 * and on normal completion has not set a new value into it.
527 * so it must have come from a 'break' statement
528 */
529 if (bp != NULL) {
530 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
531 (LIST_FIRST(&bp->b_dep) == NULL)) {
532 bp->b_flags |= B_RELBUF;
533 brelse(bp);
534 } else {
535 bqrelse(bp);
536 }
537 }
538
539 if (object) {
540 vm_object_vndeallocate(object);
541 }
542 if ((error == 0 || uio->uio_resid != orig_resid) &&
543 (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
544 ip->i_flag |= IN_ACCESS;
545 return (error);
546}
547
548/*
549 * Vnode op for writing.
550 */
551static int
552ffs_write(ap)
553 struct vop_write_args /* {
554 struct vnode *a_vp;
555 struct uio *a_uio;
556 int a_ioflag;
557 struct ucred *a_cred;
558 } */ *ap;
559{
560 struct vnode *vp;
561 struct uio *uio;
562 struct inode *ip;
563 struct fs *fs;
564 struct buf *bp;
565 struct thread *td;
566 ufs_lbn_t lbn;
567 off_t osize;
568 int seqcount;
569 int blkoffset, error, extended, flags, ioflag, resid, size, xfersize;
570 vm_object_t object;
571
572 vp = ap->a_vp;
573 uio = ap->a_uio;
574 ioflag = ap->a_ioflag;
575 if (ap->a_ioflag & IO_EXT)
576#ifdef notyet
577 return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
578#else
579 panic("ffs_read+IO_EXT");
580#endif
581
582 GIANT_REQUIRED;
583
584 extended = 0;
585 seqcount = ap->a_ioflag >> 16;
586 ip = VTOI(vp);
587
588 object = vp->v_object;
589 if (object) {
590 vm_object_reference(object);
591 }
592
593#ifdef DIAGNOSTIC
594 if (uio->uio_rw != UIO_WRITE)
595 panic("ffswrite: mode");
596#endif
597
598 switch (vp->v_type) {
599 case VREG:
600 if (ioflag & IO_APPEND)
601 uio->uio_offset = ip->i_size;
602 if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) {
603 if (object) {
604 vm_object_vndeallocate(object);
605 }
606 return (EPERM);
607 }
608 /* FALLTHROUGH */
609 case VLNK:
610 break;
611 case VDIR:
612 panic("ffswrite: dir write");
613 break;
614 default:
615 panic("ffswrite: type %p %d (%d,%d)", vp, (int)vp->v_type,
616 (int)uio->uio_offset,
617 (int)uio->uio_resid
618 );
619 }
620
621 fs = ip->i_fs;
622 if (uio->uio_offset < 0 ||
623 (u_int64_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) {
624 if (object) {
625 vm_object_vndeallocate(object);
626 }
627 return (EFBIG);
628 }
629 /*
630 * Maybe this should be above the vnode op call, but so long as
631 * file servers have no limits, I don't think it matters.
632 */
633 td = uio->uio_td;
634 if (vp->v_type == VREG && td &&
635 uio->uio_offset + uio->uio_resid >
636 td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
637 PROC_LOCK(td->td_proc);
638 psignal(td->td_proc, SIGXFSZ);
639 PROC_UNLOCK(td->td_proc);
640 if (object) {
641 vm_object_vndeallocate(object);
642 }
643 return (EFBIG);
644 }
645
646 resid = uio->uio_resid;
647 osize = ip->i_size;
648 if (seqcount > BA_SEQMAX)
649 flags = BA_SEQMAX << BA_SEQSHIFT;
650 else
651 flags = seqcount << BA_SEQSHIFT;
652 if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
653 flags |= IO_SYNC;
654
655 for (error = 0; uio->uio_resid > 0;) {
656 lbn = lblkno(fs, uio->uio_offset);
657 blkoffset = blkoff(fs, uio->uio_offset);
658 xfersize = fs->fs_bsize - blkoffset;
659 if (uio->uio_resid < xfersize)
660 xfersize = uio->uio_resid;
661
662 if (uio->uio_offset + xfersize > ip->i_size)
663 vnode_pager_setsize(vp, uio->uio_offset + xfersize);
664
665 /*
666 * We must perform a read-before-write if the transfer size
667 * does not cover the entire buffer.
668 */
669 if (fs->fs_bsize > xfersize)
670 flags |= BA_CLRBUF;
671 else
672 flags &= ~BA_CLRBUF;
673/* XXX is uio->uio_offset the right thing here? */
674 error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
675 ap->a_cred, flags, &bp);
676 if (error != 0)
677 break;
678 /*
679 * If the buffer is not valid we have to clear out any
680 * garbage data from the pages instantiated for the buffer.
681 * If we do not, a failed uiomove() during a write can leave
682 * the prior contents of the pages exposed to a userland
683 * mmap(). XXX deal with uiomove() errors a better way.
684 */
685 if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
686 vfs_bio_clrbuf(bp);
687 if (ioflag & IO_DIRECT)
688 bp->b_flags |= B_DIRECT;
689 if (ioflag & IO_NOWDRAIN)
690 bp->b_flags |= B_NOWDRAIN;
691
692 if (uio->uio_offset + xfersize > ip->i_size) {
693 ip->i_size = uio->uio_offset + xfersize;
694 DIP(ip, i_size) = ip->i_size;
695 extended = 1;
696 }
697
698 size = blksize(fs, ip, lbn) - bp->b_resid;
699 if (size < xfersize)
700 xfersize = size;
701
702 error =
703 uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
704 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
705 (LIST_FIRST(&bp->b_dep) == NULL)) {
706 bp->b_flags |= B_RELBUF;
707 }
708
709 /*
710 * If IO_SYNC each buffer is written synchronously. Otherwise
711 * if we have a severe page deficiency write the buffer
712 * asynchronously. Otherwise try to cluster, and if that
713 * doesn't do it then either do an async write (if O_DIRECT),
714 * or a delayed write (if not).
715 */
716 if (ioflag & IO_SYNC) {
717 (void)bwrite(bp);
718 } else if (vm_page_count_severe() ||
719 buf_dirty_count_severe() ||
720 (ioflag & IO_ASYNC)) {
721 bp->b_flags |= B_CLUSTEROK;
722 bawrite(bp);
723 } else if (xfersize + blkoffset == fs->fs_bsize) {
724 if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
725 bp->b_flags |= B_CLUSTEROK;
726 cluster_write(bp, ip->i_size, seqcount);
727 } else {
728 bawrite(bp);
729 }
730 } else if (ioflag & IO_DIRECT) {
731 bp->b_flags |= B_CLUSTEROK;
732 bawrite(bp);
733 } else {
734 bp->b_flags |= B_CLUSTEROK;
735 bdwrite(bp);
736 }
737 if (error || xfersize == 0)
738 break;
739 ip->i_flag |= IN_CHANGE | IN_UPDATE;
740 }
741 /*
742 * If we successfully wrote any data, and we are not the superuser
743 * we clear the setuid and setgid bits as a precaution against
744 * tampering.
745 */
746 if (resid > uio->uio_resid && ap->a_cred &&
747 suser_cred(ap->a_cred, PRISON_ROOT)) {
748 ip->i_mode &= ~(ISUID | ISGID);
749 DIP(ip, i_mode) = ip->i_mode;
750 }
751 if (resid > uio->uio_resid)
752 VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0));
753 if (error) {
754 if (ioflag & IO_UNIT) {
755 (void)UFS_TRUNCATE(vp, osize,
756 IO_NORMAL | (ioflag & IO_SYNC),
757 ap->a_cred, uio->uio_td);
758 uio->uio_offset -= resid - uio->uio_resid;
759 uio->uio_resid = resid;
760 }
761 } else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
762 error = UFS_UPDATE(vp, 1);
763
764 if (object) {
765 vm_object_vndeallocate(object);
766 }
767
768 return (error);
769}
770
771/*
772 * get page routine
773 */
774static int
775ffs_getpages(ap)
776 struct vop_getpages_args *ap;
777{
778 off_t foff, physoffset;
779 int i, size, bsize;
780 struct vnode *dp, *vp;
781 vm_object_t obj;
782 vm_pindex_t pindex, firstindex;
783 vm_page_t mreq;
784 int bbackwards, bforwards;
785 int pbackwards, pforwards;
786 int firstpage;
787 ufs2_daddr_t reqblkno, reqlblkno;
788 int poff;
789 int pcount;
790 int rtval;
791 int pagesperblock;
792
793 GIANT_REQUIRED;
794
795 pcount = round_page(ap->a_count) / PAGE_SIZE;
796 mreq = ap->a_m[ap->a_reqpage];
797 firstindex = ap->a_m[0]->pindex;
798
799 /*
800 * if ANY DEV_BSIZE blocks are valid on a large filesystem block,
801 * then the entire page is valid. Since the page may be mapped,
802 * user programs might reference data beyond the actual end of file
 803 * occurring within the page. We have to zero that data.
804 */
805 if (mreq->valid) {
806 if (mreq->valid != VM_PAGE_BITS_ALL)
807 vm_page_zero_invalid(mreq, TRUE);
808 vm_page_lock_queues();
809 for (i = 0; i < pcount; i++) {
810 if (i != ap->a_reqpage) {
811 vm_page_free(ap->a_m[i]);
812 }
813 }
814 vm_page_unlock_queues();
815 return VM_PAGER_OK;
816 }
817
818 vp = ap->a_vp;
819 obj = vp->v_object;
820 bsize = vp->v_mount->mnt_stat.f_iosize;
821 pindex = mreq->pindex;
822 foff = IDX_TO_OFF(pindex) /* + ap->a_offset should be zero */;
823
824 if (bsize < PAGE_SIZE)
825 return vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
826 ap->a_count,
827 ap->a_reqpage);
828
829 /*
830 * foff is the file offset of the required page
831 * reqlblkno is the logical block that contains the page
832 * poff is the index of the page into the logical block
833 */
834 reqlblkno = foff / bsize;
835 poff = (foff % bsize) / PAGE_SIZE;
836
837 dp = VTOI(vp)->i_devvp;
838 if (ufs_bmaparray(vp, reqlblkno, &reqblkno, 0, &bforwards, &bbackwards)
839 || (reqblkno == -1)) {
840 vm_page_lock_queues();
841 for(i = 0; i < pcount; i++) {
842 if (i != ap->a_reqpage)
843 vm_page_free(ap->a_m[i]);
844 }
845 vm_page_unlock_queues();
846 if (reqblkno == -1) {
847 if ((mreq->flags & PG_ZERO) == 0)
848 pmap_zero_page(mreq);
849 vm_page_undirty(mreq);
850 mreq->valid = VM_PAGE_BITS_ALL;
851 return VM_PAGER_OK;
852 } else {
853 return VM_PAGER_ERROR;
854 }
855 }
856
857 physoffset = (off_t)reqblkno * DEV_BSIZE + poff * PAGE_SIZE;
858 pagesperblock = bsize / PAGE_SIZE;
859 /*
860 * find the first page that is contiguous...
861 * note that pbackwards is the number of pages that are contiguous
862 * backwards.
863 */
864 firstpage = 0;
865 if (ap->a_count) {
866 pbackwards = poff + bbackwards * pagesperblock;
867 if (ap->a_reqpage > pbackwards) {
868 firstpage = ap->a_reqpage - pbackwards;
869 vm_page_lock_queues();
870 for(i=0;i<firstpage;i++)
871 vm_page_free(ap->a_m[i]);
872 vm_page_unlock_queues();
873 }
874
875 /*
876 * pforwards is the number of pages that are contiguous
877 * after the current page.
878 */
879 pforwards = (pagesperblock - (poff + 1)) +
880 bforwards * pagesperblock;
881 if (pforwards < (pcount - (ap->a_reqpage + 1))) {
882 vm_page_lock_queues();
883 for( i = ap->a_reqpage + pforwards + 1; i < pcount; i++)
884 vm_page_free(ap->a_m[i]);
885 vm_page_unlock_queues();
886 pcount = ap->a_reqpage + pforwards + 1;
887 }
888
889 /*
890 * number of pages for I/O corrected for the non-contig pages at
891 * the beginning of the array.
892 */
893 pcount -= firstpage;
894 }
895
896 /*
897 * calculate the size of the transfer
898 */
899
900 size = pcount * PAGE_SIZE;
901
902 if ((IDX_TO_OFF(ap->a_m[firstpage]->pindex) + size) >
903 obj->un_pager.vnp.vnp_size)
904 size = obj->un_pager.vnp.vnp_size -
905 IDX_TO_OFF(ap->a_m[firstpage]->pindex);
906
907 physoffset -= foff;
908 rtval = VOP_GETPAGES(dp, &ap->a_m[firstpage], size,
909 (ap->a_reqpage - firstpage), physoffset);
910
911 return (rtval);
912}
913
914/*
915 * Extended attribute area reading.
916 */
917static int
918ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
919{
920 struct inode *ip;
921 struct ufs2_dinode *dp;
922 struct fs *fs;
923 struct buf *bp;
924 ufs_lbn_t lbn, nextlbn;
925 off_t bytesinfile;
926 long size, xfersize, blkoffset;
927 int error, orig_resid;
928 mode_t mode;
929
930 GIANT_REQUIRED;
931
932 ip = VTOI(vp);
933 fs = ip->i_fs;
934 dp = ip->i_din2;
935 mode = ip->i_mode;
936
937#ifdef DIAGNOSTIC
938 if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
939 panic("ffs_extread: mode");
940
941#endif
942 orig_resid = uio->uio_resid;
943 if (orig_resid <= 0)
944 return (0);
945
946 bytesinfile = dp->di_extsize - uio->uio_offset;
947 if (bytesinfile <= 0) {
948 if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
949 ip->i_flag |= IN_ACCESS;
950 return 0;
951 }
952
953 for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
954 if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
955 break;
956
957 lbn = lblkno(fs, uio->uio_offset);
958 nextlbn = lbn + 1;
959
960 /*
961 * size of buffer. The buffer representing the
962 * end of the file is rounded up to the size of
963 * the block type ( fragment or full block,
964 * depending ).
965 */
966 size = sblksize(fs, dp->di_extsize, lbn);
967 blkoffset = blkoff(fs, uio->uio_offset);
968
969 /*
970 * The amount we want to transfer in this iteration is
971 * one FS block less the amount of the data before
972 * our startpoint (duh!)
973 */
974 xfersize = fs->fs_bsize - blkoffset;
975
976 /*
977 * But if we actually want less than the block,
978 * or the file doesn't have a whole block more of data,
979 * then use the lesser number.
980 */
981 if (uio->uio_resid < xfersize)
982 xfersize = uio->uio_resid;
983 if (bytesinfile < xfersize)
984 xfersize = bytesinfile;
985
986 if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
987 /*
988 * Don't do readahead if this is the end of the info.
989 */
990 error = bread(vp, -1 - lbn, size, NOCRED, &bp);
991 } else {
992 /*
993 * If we have a second block, then
994 * fire off a request for a readahead
995 * as well as a read. Note that the 4th and 5th
996 * arguments point to arrays of the size specified in
997 * the 6th argument.
998 */
999 int nextsize = sblksize(fs, dp->di_extsize, nextlbn);
1000
1001 nextlbn = -1 - nextlbn;
1002 error = breadn(vp, -1 - lbn,
1003 size, &nextlbn, &nextsize, 1, NOCRED, &bp);
1004 }
1005 if (error) {
1006 brelse(bp);
1007 bp = NULL;
1008 break;
1009 }
1010
1011 /*
1012 * If IO_DIRECT then set B_DIRECT for the buffer. This
1013 * will cause us to attempt to release the buffer later on
1014 * and will cause the buffer cache to attempt to free the
1015 * underlying pages.
1016 */
1017 if (ioflag & IO_DIRECT)
1018 bp->b_flags |= B_DIRECT;
1019
1020 /*
1021 * We should only get non-zero b_resid when an I/O error
1022 * has occurred, which should cause us to break above.
1023 * However, if the short read did not cause an error,
1024 * then we want to ensure that we do not uiomove bad
1025 * or uninitialized data.
1026 */
1027 size -= bp->b_resid;
1028 if (size < xfersize) {
1029 if (size == 0)
1030 break;
1031 xfersize = size;
1032 }
1033
1034 error = uiomove((char *)bp->b_data + blkoffset,
1035 (int)xfersize, uio);
1036 if (error)
1037 break;
1038
1039 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
1040 (LIST_FIRST(&bp->b_dep) == NULL)) {
1041 /*
1042 * If there are no dependencies, and it's VMIO,
1043 * then we don't need the buf, mark it available
1044 * for freeing. The VM has the data.
1045 */
1046 bp->b_flags |= B_RELBUF;
1047 brelse(bp);
1048 } else {
1049 /*
1050 * Otherwise let whoever
1051 * made the request take care of
1052 * freeing it. We just queue
1053 * it onto another list.
1054 */
1055 bqrelse(bp);
1056 }
1057 }
1058
1059 /*
1060 * This can only happen in the case of an error
1061 * because the loop above resets bp to NULL on each iteration
1062 * and on normal completion has not set a new value into it.
1063 * so it must have come from a 'break' statement
1064 */
1065 if (bp != NULL) {
1066 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
1067 (LIST_FIRST(&bp->b_dep) == NULL)) {
1068 bp->b_flags |= B_RELBUF;
1069 brelse(bp);
1070 } else {
1071 bqrelse(bp);
1072 }
1073 }
1074
1075 if ((error == 0 || uio->uio_resid != orig_resid) &&
1076 (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
1077 ip->i_flag |= IN_ACCESS;
1078 return (error);
1079}
1080
1081/*
1082 * Extended attribute area writing.
1083 */
1084static int
1085ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
1086{
1087 struct inode *ip;
1088 struct ufs2_dinode *dp;
1089 struct fs *fs;
1090 struct buf *bp;
1091 ufs_lbn_t lbn;
1092 off_t osize;
1093 int blkoffset, error, flags, resid, size, xfersize;
1094
1095 GIANT_REQUIRED;
1096
1097 ip = VTOI(vp);
1098 fs = ip->i_fs;
1099 dp = ip->i_din2;
1100
1101#ifdef DIAGNOSTIC
1102 if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
1103 panic("ext_write: mode");
1104#endif
1105
1106 if (ioflag & IO_APPEND)
1107 uio->uio_offset = dp->di_extsize;
1108
1109 if (uio->uio_offset < 0 ||
1110 (u_int64_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize)
1111 return (EFBIG);
1112
1113 resid = uio->uio_resid;
1114 osize = dp->di_extsize;
1115 flags = IO_EXT;
1116 if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
1117 flags |= IO_SYNC;
1118
1119 for (error = 0; uio->uio_resid > 0;) {
1120 lbn = lblkno(fs, uio->uio_offset);
1121 blkoffset = blkoff(fs, uio->uio_offset);
1122 xfersize = fs->fs_bsize - blkoffset;
1123 if (uio->uio_resid < xfersize)
1124 xfersize = uio->uio_resid;
1125
1126 /*
1127 * We must perform a read-before-write if the transfer size
1128 * does not cover the entire buffer.
1129 */
1130 if (fs->fs_bsize > xfersize)
1131 flags |= BA_CLRBUF;
1132 else
1133 flags &= ~BA_CLRBUF;
1134 error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
1135 ucred, flags, &bp);
1136 if (error != 0)
1137 break;
1138 /*
1139 * If the buffer is not valid we have to clear out any
1140 * garbage data from the pages instantiated for the buffer.
1141 * If we do not, a failed uiomove() during a write can leave
1142 * the prior contents of the pages exposed to a userland
1143 * mmap(). XXX deal with uiomove() errors a better way.
1144 */
1145 if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
1146 vfs_bio_clrbuf(bp);
1147 if (ioflag & IO_DIRECT)
1148 bp->b_flags |= B_DIRECT;
1149 if (ioflag & IO_NOWDRAIN)
1150 bp->b_flags |= B_NOWDRAIN;
1151
1152 if (uio->uio_offset + xfersize > dp->di_extsize)
1153 dp->di_extsize = uio->uio_offset + xfersize;
1154
1155 size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
1156 if (size < xfersize)
1157 xfersize = size;
1158
1159 error =
1160 uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
1161 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
1162 (LIST_FIRST(&bp->b_dep) == NULL)) {
1163 bp->b_flags |= B_RELBUF;
1164 }
1165
1166 /*
1167 * If IO_SYNC each buffer is written synchronously. Otherwise
1168 * if we have a severe page deficiency write the buffer
1169 * asynchronously. Otherwise try to cluster, and if that
1170 * doesn't do it then either do an async write (if O_DIRECT),
1171 * or a delayed write (if not).
1172 */
1173 if (ioflag & IO_SYNC) {
1174 (void)bwrite(bp);
1175 } else if (vm_page_count_severe() ||
1176 buf_dirty_count_severe() ||
1177 xfersize + blkoffset == fs->fs_bsize ||
1178 (ioflag & (IO_ASYNC | IO_DIRECT)))
1179 bawrite(bp);
1180 else
1181 bdwrite(bp);
1182 if (error || xfersize == 0)
1183 break;
1184 ip->i_flag |= IN_CHANGE | IN_UPDATE;
1185 }
1186 /*
1187 * If we successfully wrote any data, and we are not the superuser
1188 * we clear the setuid and setgid bits as a precaution against
1189 * tampering.
1190 */
1191 if (resid > uio->uio_resid && ucred &&
1192 suser_cred(ucred, PRISON_ROOT)) {
1193 ip->i_mode &= ~(ISUID | ISGID);
1194 dp->di_mode = ip->i_mode;
1195 }
1196 if (error) {
1197 if (ioflag & IO_UNIT) {
1198 (void)UFS_TRUNCATE(vp, osize,
1199 IO_EXT | (ioflag&IO_SYNC), ucred, uio->uio_td);
1200 uio->uio_offset -= resid - uio->uio_resid;
1201 uio->uio_resid = resid;
1202 }
1203 } else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
1204 error = UFS_UPDATE(vp, 1);
1205 return (error);
1206}
1207
1208
1209/*
 1210 * Vnode operation to retrieve a named extended attribute.
1211 *
1212 * Locate a particular EA (nspace:name) in the area (ptr:length), and return
1213 * the length of the EA, and possibly the pointer to the entry and to the data.
1214 */
1215static int
1216ffs_findextattr(u_char *ptr, uint length, int nspace, const char *name, u_char **eap, u_char **eac)
1217{
1218 u_char *p, *pe, *pn, *p0;
1219 int eapad1, eapad2, ealength, ealen, nlen;
1220 uint32_t ul;
1221
1222 pe = ptr + length;
1223 nlen = strlen(name);
1224
1225 for (p = ptr; p < pe; p = pn) {
1226 p0 = p;
1227 bcopy(p, &ul, sizeof(ul));
1228 pn = p + ul;
1229 /* make sure this entry is complete */
1230 if (pn > pe)
1231 break;
1232 p += sizeof(uint32_t);
1233 if (*p != nspace)
1234 continue;
1235 p++;
1236 eapad2 = *p++;
1237 if (*p != nlen)
1238 continue;
1239 p++;
1240 if (bcmp(p, name, nlen))
1241 continue;
1242 ealength = sizeof(uint32_t) + 3 + nlen;
1243 eapad1 = 8 - (ealength % 8);
1244 if (eapad1 == 8)
1245 eapad1 = 0;
1246 ealength += eapad1;
1247 ealen = ul - ealength - eapad2;
1248 p += nlen + eapad1;
1249 if (eap != NULL)
1250 *eap = p0;
1251 if (eac != NULL)
1252 *eac = p;
1253 return (ealen);
1254 }
1255 return(-1);
1256}
1257
1258static int
1259ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra)
1260{
1261 struct inode *ip;
1262 struct fs *fs;
1263 struct ufs2_dinode *dp;
1264 struct uio luio;
1265 struct iovec liovec;
1266 int easize, error;
1267 u_char *eae;
1268
1269 ip = VTOI(vp);
1270 fs = ip->i_fs;
1271 dp = ip->i_din2;
1272 easize = dp->di_extsize;
1273
1274 eae = malloc(easize + extra, M_TEMP, M_WAITOK);
1275
1276 liovec.iov_base = eae;
1277 liovec.iov_len = easize;
1278 luio.uio_iov = &liovec;
1279 luio.uio_iovcnt = 1;
1280 luio.uio_offset = 0;
1281 luio.uio_resid = easize;
1282 luio.uio_segflg = UIO_SYSSPACE;
1283 luio.uio_rw = UIO_READ;
1284 luio.uio_td = td;
1285
1286 error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
1287 if (error) {
1288 free(eae, M_TEMP);
1289 return(error);
1290 }
1291 *p = eae;
1292 return (0);
1293}
1294
1295static int
1296ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
1297{
1298 struct inode *ip;
1299 struct fs *fs;
1300 struct ufs2_dinode *dp;
1301 int error;
1302
1303 ip = VTOI(vp);
1304 fs = ip->i_fs;
1305
1306 if (ip->i_ea_area != NULL)
1307 return (EBUSY);
1308 dp = ip->i_din2;
1309 error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0);
1310 if (error)
1311 return (error);
1312 ip->i_ea_len = dp->di_extsize;
1313 ip->i_ea_error = 0;
1314 return (0);
1315}
1316
1317/*
1318 * Vnode extattr transaction commit/abort
1319 */
1320static int
1321ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td)
1322{
1323 struct inode *ip;
1324 struct fs *fs;
1325 struct uio luio;
1326 struct iovec liovec;
1327 int error;
1328 struct ufs2_dinode *dp;
1329
1330 ip = VTOI(vp);
1331 fs = ip->i_fs;
1332 if (ip->i_ea_area == NULL)
1333 return (EINVAL);
1334 dp = ip->i_din2;
1335 error = ip->i_ea_error;
1336 if (commit && error == 0) {
1337 if (cred == NOCRED)
1338 cred = vp->v_mount->mnt_cred;
1339 liovec.iov_base = ip->i_ea_area;
1340 liovec.iov_len = ip->i_ea_len;
1341 luio.uio_iov = &liovec;
1342 luio.uio_iovcnt = 1;
1343 luio.uio_offset = 0;
1344 luio.uio_resid = ip->i_ea_len;
1345 luio.uio_segflg = UIO_SYSSPACE;
1346 luio.uio_rw = UIO_WRITE;
1347 luio.uio_td = td;
1348 /* XXX: I'm not happy about truncating to zero size */
1349 if (ip->i_ea_len < dp->di_extsize)
1350 error = ffs_truncate(vp, 0, IO_EXT, cred, td);
1351 error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
1352 }
1353 free(ip->i_ea_area, M_TEMP);
1354 ip->i_ea_area = NULL;
1355 ip->i_ea_len = 0;
1356 ip->i_ea_error = 0;
1357 return (error);
1358}
1359
1360/*
1361 * Vnode extattr strategy routine for special devices and fifos.
1362 *
1363 * We need to check for a read or write of the external attributes.
1364 * Otherwise we just fall through and do the usual thing.
1365 */
1366static int
1367ffsext_strategy(struct vop_strategy_args *ap)
1368/*
1369struct vop_strategy_args {
1370 struct vnodeop_desc *a_desc;
1371 struct vnode *a_vp;
1372 struct buf *a_bp;
1373};
1374*/
1375{
1376 struct vnode *vp;
1377 daddr_t lbn;
1378
1379 vp = ap->a_vp;
1380 lbn = ap->a_bp->b_lblkno;
1381 if (VTOI(vp)->i_fs->fs_magic == FS_UFS2_MAGIC &&
1382 lbn < 0 && lbn >= -NXADDR)
1383 return (ufs_vnoperate((struct vop_generic_args *)ap));
1384 if (vp->v_type == VFIFO)
1385 return (ufs_vnoperatefifo((struct vop_generic_args *)ap));
1386 return (ufs_vnoperatespec((struct vop_generic_args *)ap));
1387}
1388
1389/*
1390 * Vnode extattr transaction commit/abort
1391 */
1392static int
1393ffs_openextattr(struct vop_openextattr_args *ap)
1394/*
1395struct vop_openextattr_args {
1396 struct vnodeop_desc *a_desc;
1397 struct vnode *a_vp;
1398 IN struct ucred *a_cred;
1399 IN struct thread *a_td;
1400};
1401*/
1402{
1403 struct inode *ip;
1404 struct fs *fs;
1405
1406 ip = VTOI(ap->a_vp);
1407 fs = ip->i_fs;
1408 if (fs->fs_magic == FS_UFS1_MAGIC)
1409 return (ufs_vnoperate((struct vop_generic_args *)ap));
1410 return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
1411}
1412
1413
1414/*
1415 * Vnode extattr transaction commit/abort
1416 */
1417static int
1418ffs_closeextattr(struct vop_closeextattr_args *ap)
1419/*
1420struct vop_closeextattr_args {
1421 struct vnodeop_desc *a_desc;
1422 struct vnode *a_vp;
1423 int a_commit;
1424 IN struct ucred *a_cred;
1425 IN struct thread *a_td;
1426};
1427*/
1428{
1429 struct inode *ip;
1430 struct fs *fs;
1431
1432 ip = VTOI(ap->a_vp);
1433 fs = ip->i_fs;
1434 if (fs->fs_magic == FS_UFS1_MAGIC)
1435 return (ufs_vnoperate((struct vop_generic_args *)ap));
1436 return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
1437}
1438
1439
1440
1441/*
1442 * Vnode operation to retrieve a named extended attribute.
1443 */
1444static int
1445ffs_getextattr(struct vop_getextattr_args *ap)
1446/*
1447vop_getextattr {
1448 IN struct vnode *a_vp;
1449 IN int a_attrnamespace;
1450 IN const char *a_name;
1451 INOUT struct uio *a_uio;
1452 OUT size_t *a_size;
1453 IN struct ucred *a_cred;
1454 IN struct thread *a_td;
1455};
1456*/
1457{
1458 struct inode *ip;
1459 struct fs *fs;
1460 u_char *eae, *p, *pe, *pn;
1461 struct ufs2_dinode *dp;
1462 unsigned easize;
1463 uint32_t ul;
1464 int error, ealen, stand_alone;
1465
1466 ip = VTOI(ap->a_vp);
1467 fs = ip->i_fs;
1468
1469 if (fs->fs_magic == FS_UFS1_MAGIC)
1470 return (ufs_vnoperate((struct vop_generic_args *)ap));
1471
1472 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1473 ap->a_cred, ap->a_td, IREAD);
1474 if (error)
1475 return (error);
1476
1477 if (ip->i_ea_area == NULL) {
1478 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1479 if (error)
1480 return (error);
1481 stand_alone = 1;
1482 } else {
1483 stand_alone = 0;
1484 }
1485 dp = ip->i_din2;
1486 eae = ip->i_ea_area;
1487 easize = ip->i_ea_len;
1488 if (strlen(ap->a_name) > 0) {
1489 ealen = ffs_findextattr(eae, easize,
1490 ap->a_attrnamespace, ap->a_name, NULL, &p);
1491 if (ealen >= 0) {
1492 error = 0;
1493 if (ap->a_size != NULL)
1494 *ap->a_size = ealen;
1495 else if (ap->a_uio != NULL)
1496 error = uiomove(p, ealen, ap->a_uio);
1497 } else {
1498 error = ENOATTR;
1499 }
1500 } else {
1501 error = 0;
1502 if (ap->a_size != NULL)
1503 *ap->a_size = 0;
1504 pe = eae + easize;
1505 for(p = eae; error == 0 && p < pe; p = pn) {
1506 bcopy(p, &ul, sizeof(ul));
1507 pn = p + ul;
1508 if (pn > pe)
1509 break;
1510 p += sizeof(ul);
1511 if (*p++ != ap->a_attrnamespace)
1512 continue;
1513 p++; /* pad2 */
1514 ealen = *p;
1515 if (ap->a_size != NULL) {
1516 *ap->a_size += ealen + 1;
1517 } else if (ap->a_uio != NULL) {
1518 error = uiomove(p, ealen + 1, ap->a_uio);
1519 }
1520 }
1521 }
1522 if (stand_alone)
1523 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1524 return(error);
1525}
1526
1527/*
1528 * Vnode operation to set a named attribute.
1529 */
1530static int
1531ffs_setextattr(struct vop_setextattr_args *ap)
1532/*
1533vop_setextattr {
1534 IN struct vnode *a_vp;
1535 IN int a_attrnamespace;
1536 IN const char *a_name;
1537 INOUT struct uio *a_uio;
1538 IN struct ucred *a_cred;
1539 IN struct thread *a_td;
1540};
1541*/
1542{
1543 struct inode *ip;
1544 struct fs *fs;
1545 uint32_t ealength, ul;
1546 int ealen, olen, eacont, eapad1, eapad2, error, i, easize;
1547 u_char *eae, *p;
1548 struct ufs2_dinode *dp;
1549 struct ucred *cred;
1550 int stand_alone;
1551
1552 ip = VTOI(ap->a_vp);
1553 fs = ip->i_fs;
1554
1555 if (fs->fs_magic == FS_UFS1_MAGIC)
1556 return (ufs_vnoperate((struct vop_generic_args *)ap));
1557
1558 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1559 ap->a_cred, ap->a_td, IWRITE);
1560 if (error) {
1561 if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1562 ip->i_ea_error = error;
1563 return (error);
1564 }
1565
1566 if (ap->a_cred != NOCRED)
1567 cred = ap->a_cred;
1568 else
1569 cred = ap->a_vp->v_mount->mnt_cred;
1570
1571 dp = ip->i_din2;
1572
1573 if (ip->i_ea_area == NULL) {
1574 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1575 if (error)
1576 return (error);
1577 stand_alone = 1;
1578 } else {
1579 stand_alone = 0;
1580 }
1581
1582 /* Calculate the length of the EA entry */
1583 if (ap->a_uio == NULL) {
1584 /* delete */
1585 ealength = eapad1 = ealen = eapad2 = eacont = 0;
1586 } else {
1587 ealen = ap->a_uio->uio_resid;
1588 ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
1589 eapad1 = 8 - (ealength % 8);
1590 if (eapad1 == 8)
1591 eapad1 = 0;
1592 eacont = ealength + eapad1;
1593 eapad2 = 8 - (ealen % 8);
1594 if (eapad2 == 8)
1595 eapad2 = 0;
1596 ealength += eapad1 + ealen + eapad2;
1597 }
1598
1599 eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
1600 bcopy(ip->i_ea_area, eae, ip->i_ea_len);
1601 easize = ip->i_ea_len;
1602
1603 olen = ffs_findextattr(eae, easize,
1604 ap->a_attrnamespace, ap->a_name, &p, NULL);
1605 if (olen == -1 && ealength == 0) {
1606 /* delete but nonexistent */
1607 free(eae, M_TEMP);
1608 if (stand_alone)
1609 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1610 return(ENOATTR);
1611 }
1612 if (olen == -1) {
1613 /* new, append at end */
1614 p = eae + easize;
1615 easize += ealength;
1616 } else {
1617 bcopy(p, &ul, sizeof ul);
1618 i = p - eae + ul;
1619 if (ul != ealength) {
1620 bcopy(p + ul, p + ealength, easize - i);
1621 easize += (ealength - ul);
1622 }
1623 }
1624 if (easize > NXADDR * fs->fs_bsize) {
1625 free(eae, M_TEMP);
1626 if (stand_alone)
1627 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1628 else if (ip->i_ea_error == 0)
1629 ip->i_ea_error = ENOSPC;
1630 return(ENOSPC);
1631 }
1632 if (ealength != 0) {
1633 bcopy(&ealength, p, sizeof(ealength));
1634 p += sizeof(ealength);
1635 *p++ = ap->a_attrnamespace;
1636 *p++ = eapad2;
1637 *p++ = strlen(ap->a_name);
1638 strcpy(p, ap->a_name);
1639 p += strlen(ap->a_name);
1640 bzero(p, eapad1);
1641 p += eapad1;
1642 error = uiomove(p, ealen, ap->a_uio);
1643 if (error) {
1644 free(eae, M_TEMP);
1645 if (stand_alone)
1646 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1647 else if (ip->i_ea_error == 0)
1648 ip->i_ea_error = error;
1649 return(error);
1650 }
1651 p += ealen;
1652 bzero(p, eapad2);
1653 }
1654 p = ip->i_ea_area;
1655 ip->i_ea_area = eae;
1656 ip->i_ea_len = easize;
1657 free(p, M_TEMP);
1658 if (stand_alone)
1659 error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
1660 return(error);
1661}
366 GIANT_REQUIRED;
367
368 seqcount = ap->a_ioflag >> 16;
369 ip = VTOI(vp);
370 mode = ip->i_mode;
371
372#ifdef DIAGNOSTIC
373 if (uio->uio_rw != UIO_READ)
374 panic("ffs_read: mode");
375
376 if (vp->v_type == VLNK) {
377 if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
378 panic("ffs_read: short symlink");
379 } else if (vp->v_type != VREG && vp->v_type != VDIR)
380 panic("ffs_read: type %d", vp->v_type);
381#endif
382 fs = ip->i_fs;
383 if ((u_int64_t)uio->uio_offset > fs->fs_maxfilesize)
384 return (EFBIG);
385
386 orig_resid = uio->uio_resid;
387 if (orig_resid <= 0)
388 return (0);
389
390 object = vp->v_object;
391
392 bytesinfile = ip->i_size - uio->uio_offset;
393 if (bytesinfile <= 0) {
394 if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
395 ip->i_flag |= IN_ACCESS;
396 return 0;
397 }
398
399 if (object) {
400 vm_object_reference(object);
401 }
402
403 /*
404	 * Loop over the request, transferring at most one filesystem
405	 * block's worth of data per iteration.
406 */
407 for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
408 if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
409 break;
410
411 lbn = lblkno(fs, uio->uio_offset);
412 nextlbn = lbn + 1;
413
414 /*
415	 * Determine the size of the buffer. The buffer holding the
416	 * last block of the file is rounded up to the size of the
417	 * block type it occupies (a fragment or a full block, as
418	 * appropriate).
419 */
420 size = blksize(fs, ip, lbn);
421 blkoffset = blkoff(fs, uio->uio_offset);
422
423 /*
424 * The amount we want to transfer in this iteration is
425 * one FS block less the amount of the data before
426 * our startpoint (duh!)
427 */
428 xfersize = fs->fs_bsize - blkoffset;
429
430 /*
431 * But if we actually want less than the block,
432 * or the file doesn't have a whole block more of data,
433 * then use the lesser number.
434 */
435 if (uio->uio_resid < xfersize)
436 xfersize = uio->uio_resid;
437 if (bytesinfile < xfersize)
438 xfersize = bytesinfile;
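		/*
		 * Worked example of the sizing above (illustrative; assumes
		 * fs_bsize == 16384): a read of 8192 bytes at offset 20000
		 * into a 40000-byte file gives lbn = 1, blkoffset = 3616 and
		 * an initial xfersize of 16384 - 3616 = 12768, which is then
		 * clamped to the 8192 bytes the caller asked for
		 * (bytesinfile = 20000 does not constrain it further), so
		 * the whole request is satisfied in a single pass of the
		 * loop.
		 */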
439
440 if (lblktosize(fs, nextlbn) >= ip->i_size) {
441 /*
442 * Don't do readahead if this is the end of the file.
443 */
444 error = bread(vp, lbn, size, NOCRED, &bp);
445 } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
446 /*
447 * Otherwise if we are allowed to cluster,
448 * grab as much as we can.
449 *
450 * XXX This may not be a win if we are not
451 * doing sequential access.
452 */
453 error = cluster_read(vp, ip->i_size, lbn,
454 size, NOCRED, uio->uio_resid, seqcount, &bp);
455 } else if (seqcount > 1) {
456 /*
457 * If we are NOT allowed to cluster, then
458 * if we appear to be acting sequentially,
459 * fire off a request for a readahead
460 * as well as a read. Note that the 4th and 5th
461 * arguments point to arrays of the size specified in
462 * the 6th argument.
463 */
464 int nextsize = blksize(fs, ip, nextlbn);
465 error = breadn(vp, lbn,
466 size, &nextlbn, &nextsize, 1, NOCRED, &bp);
467 } else {
468 /*
469 * Failing all of the above, just read what the
470 * user asked for. Interestingly, the same as
471 * the first option above.
472 */
473 error = bread(vp, lbn, size, NOCRED, &bp);
474 }
475 if (error) {
476 brelse(bp);
477 bp = NULL;
478 break;
479 }
480
481 /*
482 * If IO_DIRECT then set B_DIRECT for the buffer. This
483 * will cause us to attempt to release the buffer later on
484 * and will cause the buffer cache to attempt to free the
485 * underlying pages.
486 */
487 if (ioflag & IO_DIRECT)
488 bp->b_flags |= B_DIRECT;
489
490 /*
491 * We should only get non-zero b_resid when an I/O error
492 * has occurred, which should cause us to break above.
493 * However, if the short read did not cause an error,
494 * then we want to ensure that we do not uiomove bad
495 * or uninitialized data.
496 */
497 size -= bp->b_resid;
498 if (size < xfersize) {
499 if (size == 0)
500 break;
501 xfersize = size;
502 }
503
504 {
505 /*
506		 * Copy the data out of the buffer and into the caller's uio.
507 */
508 error =
509 uiomove((char *)bp->b_data + blkoffset,
510 (int)xfersize, uio);
511 }
512
513 if (error)
514 break;
515
516 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
517 (LIST_FIRST(&bp->b_dep) == NULL)) {
518 /*
519 * If there are no dependencies, and it's VMIO,
520 * then we don't need the buf, mark it available
521 * for freeing. The VM has the data.
522 */
523 bp->b_flags |= B_RELBUF;
524 brelse(bp);
525 } else {
526 /*
527 * Otherwise let whoever
528 * made the request take care of
529 * freeing it. We just queue
530 * it onto another list.
531 */
532 bqrelse(bp);
533 }
534 }
535
536 /*
537 * This can only happen in the case of an error
538 * because the loop above resets bp to NULL on each iteration
539	 * and on normal completion has not set a new value into it,
540	 * so it must have come from a 'break' statement.
541 */
542 if (bp != NULL) {
543 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
544 (LIST_FIRST(&bp->b_dep) == NULL)) {
545 bp->b_flags |= B_RELBUF;
546 brelse(bp);
547 } else {
548 bqrelse(bp);
549 }
550 }
551
552 if (object) {
553 vm_object_vndeallocate(object);
554 }
555 if ((error == 0 || uio->uio_resid != orig_resid) &&
556 (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
557 ip->i_flag |= IN_ACCESS;
558 return (error);
559}
560
561/*
562 * Vnode op for writing.
563 */
564static int
565ffs_write(ap)
566 struct vop_write_args /* {
567 struct vnode *a_vp;
568 struct uio *a_uio;
569 int a_ioflag;
570 struct ucred *a_cred;
571 } */ *ap;
572{
573 struct vnode *vp;
574 struct uio *uio;
575 struct inode *ip;
576 struct fs *fs;
577 struct buf *bp;
578 struct thread *td;
579 ufs_lbn_t lbn;
580 off_t osize;
581 int seqcount;
582 int blkoffset, error, extended, flags, ioflag, resid, size, xfersize;
583 vm_object_t object;
584
585 vp = ap->a_vp;
586 uio = ap->a_uio;
587 ioflag = ap->a_ioflag;
588 if (ap->a_ioflag & IO_EXT)
589#ifdef notyet
590 return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
591#else
592 panic("ffs_read+IO_EXT");
593#endif
594
595 GIANT_REQUIRED;
596
597 extended = 0;
598 seqcount = ap->a_ioflag >> 16;
599 ip = VTOI(vp);
600
601 object = vp->v_object;
602 if (object) {
603 vm_object_reference(object);
604 }
605
606#ifdef DIAGNOSTIC
607 if (uio->uio_rw != UIO_WRITE)
608 panic("ffswrite: mode");
609#endif
610
611 switch (vp->v_type) {
612 case VREG:
613 if (ioflag & IO_APPEND)
614 uio->uio_offset = ip->i_size;
615 if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) {
616 if (object) {
617 vm_object_vndeallocate(object);
618 }
619 return (EPERM);
620 }
621 /* FALLTHROUGH */
622 case VLNK:
623 break;
624 case VDIR:
625 panic("ffswrite: dir write");
626 break;
627 default:
628 panic("ffswrite: type %p %d (%d,%d)", vp, (int)vp->v_type,
629 (int)uio->uio_offset,
630 (int)uio->uio_resid
631 );
632 }
633
634 fs = ip->i_fs;
635 if (uio->uio_offset < 0 ||
636 (u_int64_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) {
637 if (object) {
638 vm_object_vndeallocate(object);
639 }
640 return (EFBIG);
641 }
642 /*
643 * Maybe this should be above the vnode op call, but so long as
644 * file servers have no limits, I don't think it matters.
645 */
646 td = uio->uio_td;
647 if (vp->v_type == VREG && td &&
648 uio->uio_offset + uio->uio_resid >
649 td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
650 PROC_LOCK(td->td_proc);
651 psignal(td->td_proc, SIGXFSZ);
652 PROC_UNLOCK(td->td_proc);
653 if (object) {
654 vm_object_vndeallocate(object);
655 }
656 return (EFBIG);
657 }
658
659 resid = uio->uio_resid;
660 osize = ip->i_size;
661 if (seqcount > BA_SEQMAX)
662 flags = BA_SEQMAX << BA_SEQSHIFT;
663 else
664 flags = seqcount << BA_SEQSHIFT;
665 if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
666 flags |= IO_SYNC;
667
668 for (error = 0; uio->uio_resid > 0;) {
669 lbn = lblkno(fs, uio->uio_offset);
670 blkoffset = blkoff(fs, uio->uio_offset);
671 xfersize = fs->fs_bsize - blkoffset;
672 if (uio->uio_resid < xfersize)
673 xfersize = uio->uio_resid;
674
675 if (uio->uio_offset + xfersize > ip->i_size)
676 vnode_pager_setsize(vp, uio->uio_offset + xfersize);
677
678 /*
679 * We must perform a read-before-write if the transfer size
680 * does not cover the entire buffer.
681 */
682 if (fs->fs_bsize > xfersize)
683 flags |= BA_CLRBUF;
684 else
685 flags &= ~BA_CLRBUF;
686/* XXX is uio->uio_offset the right thing here? */
687 error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
688 ap->a_cred, flags, &bp);
689 if (error != 0)
690 break;
691 /*
692 * If the buffer is not valid we have to clear out any
693 * garbage data from the pages instantiated for the buffer.
694 * If we do not, a failed uiomove() during a write can leave
695 * the prior contents of the pages exposed to a userland
696 * mmap(). XXX deal with uiomove() errors a better way.
697 */
698 if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
699 vfs_bio_clrbuf(bp);
700 if (ioflag & IO_DIRECT)
701 bp->b_flags |= B_DIRECT;
702 if (ioflag & IO_NOWDRAIN)
703 bp->b_flags |= B_NOWDRAIN;
704
705 if (uio->uio_offset + xfersize > ip->i_size) {
706 ip->i_size = uio->uio_offset + xfersize;
707 DIP(ip, i_size) = ip->i_size;
708 extended = 1;
709 }
710
711 size = blksize(fs, ip, lbn) - bp->b_resid;
712 if (size < xfersize)
713 xfersize = size;
714
715 error =
716 uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
717 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
718 (LIST_FIRST(&bp->b_dep) == NULL)) {
719 bp->b_flags |= B_RELBUF;
720 }
721
722 /*
723 * If IO_SYNC each buffer is written synchronously. Otherwise
724 * if we have a severe page deficiency write the buffer
725 * asynchronously. Otherwise try to cluster, and if that
726 * doesn't do it then either do an async write (if O_DIRECT),
727 * or a delayed write (if not).
728 */
729 if (ioflag & IO_SYNC) {
730 (void)bwrite(bp);
731 } else if (vm_page_count_severe() ||
732 buf_dirty_count_severe() ||
733 (ioflag & IO_ASYNC)) {
734 bp->b_flags |= B_CLUSTEROK;
735 bawrite(bp);
736 } else if (xfersize + blkoffset == fs->fs_bsize) {
737 if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
738 bp->b_flags |= B_CLUSTEROK;
739 cluster_write(bp, ip->i_size, seqcount);
740 } else {
741 bawrite(bp);
742 }
743 } else if (ioflag & IO_DIRECT) {
744 bp->b_flags |= B_CLUSTEROK;
745 bawrite(bp);
746 } else {
747 bp->b_flags |= B_CLUSTEROK;
748 bdwrite(bp);
749 }
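		/*
		 * Example of the selection above (illustrative, 16384-byte
		 * blocks): an 8192-byte write that only partially fills its
		 * block, with neither IO_SYNC nor IO_DIRECT set and no
		 * memory shortage, falls through to bdwrite() and is flushed
		 * later.  Once a write fills its block exactly (xfersize +
		 * blkoffset == fs_bsize) and MNT_NOCLUSTERW is clear,
		 * cluster_write() may coalesce it with neighbouring blocks
		 * into one larger disk transfer.
		 */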
750 if (error || xfersize == 0)
751 break;
752 ip->i_flag |= IN_CHANGE | IN_UPDATE;
753 }
754 /*
755	 * If we successfully wrote any data and we are not the superuser,
756	 * we clear the setuid and setgid bits as a precaution against
757 * tampering.
758 */
759 if (resid > uio->uio_resid && ap->a_cred &&
760 suser_cred(ap->a_cred, PRISON_ROOT)) {
761 ip->i_mode &= ~(ISUID | ISGID);
762 DIP(ip, i_mode) = ip->i_mode;
763 }
764 if (resid > uio->uio_resid)
765 VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0));
766 if (error) {
767 if (ioflag & IO_UNIT) {
768 (void)UFS_TRUNCATE(vp, osize,
769 IO_NORMAL | (ioflag & IO_SYNC),
770 ap->a_cred, uio->uio_td);
771 uio->uio_offset -= resid - uio->uio_resid;
772 uio->uio_resid = resid;
773 }
774 } else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
775 error = UFS_UPDATE(vp, 1);
776
777 if (object) {
778 vm_object_vndeallocate(object);
779 }
780
781 return (error);
782}
783
784/*
785 * get page routine
786 */
787static int
788ffs_getpages(ap)
789 struct vop_getpages_args *ap;
790{
791 off_t foff, physoffset;
792 int i, size, bsize;
793 struct vnode *dp, *vp;
794 vm_object_t obj;
795 vm_pindex_t pindex, firstindex;
796 vm_page_t mreq;
797 int bbackwards, bforwards;
798 int pbackwards, pforwards;
799 int firstpage;
800 ufs2_daddr_t reqblkno, reqlblkno;
801 int poff;
802 int pcount;
803 int rtval;
804 int pagesperblock;
805
806 GIANT_REQUIRED;
807
808 pcount = round_page(ap->a_count) / PAGE_SIZE;
809 mreq = ap->a_m[ap->a_reqpage];
810 firstindex = ap->a_m[0]->pindex;
811
812 /*
813 * if ANY DEV_BSIZE blocks are valid on a large filesystem block,
814 * then the entire page is valid. Since the page may be mapped,
815 * user programs might reference data beyond the actual end of file
816	 * occurring within the page. We have to zero that data.
817 */
818 if (mreq->valid) {
819 if (mreq->valid != VM_PAGE_BITS_ALL)
820 vm_page_zero_invalid(mreq, TRUE);
821 vm_page_lock_queues();
822 for (i = 0; i < pcount; i++) {
823 if (i != ap->a_reqpage) {
824 vm_page_free(ap->a_m[i]);
825 }
826 }
827 vm_page_unlock_queues();
828 return VM_PAGER_OK;
829 }
830
831 vp = ap->a_vp;
832 obj = vp->v_object;
833 bsize = vp->v_mount->mnt_stat.f_iosize;
834 pindex = mreq->pindex;
835 foff = IDX_TO_OFF(pindex) /* + ap->a_offset should be zero */;
836
837 if (bsize < PAGE_SIZE)
838 return vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
839 ap->a_count,
840 ap->a_reqpage);
841
842 /*
843 * foff is the file offset of the required page
844 * reqlblkno is the logical block that contains the page
845 * poff is the index of the page into the logical block
846 */
847 reqlblkno = foff / bsize;
848 poff = (foff % bsize) / PAGE_SIZE;
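	/*
	 * Illustrative numbers (assuming PAGE_SIZE == 4096 and a 16384-byte
	 * block size): the page at file offset 36864 gives reqlblkno =
	 * 36864 / 16384 = 2 and poff = (36864 % 16384) / 4096 = 1, i.e. it
	 * is the second page of logical block 2, and pagesperblock below
	 * works out to 4.
	 */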
849
850 dp = VTOI(vp)->i_devvp;
851 if (ufs_bmaparray(vp, reqlblkno, &reqblkno, 0, &bforwards, &bbackwards)
852 || (reqblkno == -1)) {
853 vm_page_lock_queues();
854 for(i = 0; i < pcount; i++) {
855 if (i != ap->a_reqpage)
856 vm_page_free(ap->a_m[i]);
857 }
858 vm_page_unlock_queues();
859 if (reqblkno == -1) {
860 if ((mreq->flags & PG_ZERO) == 0)
861 pmap_zero_page(mreq);
862 vm_page_undirty(mreq);
863 mreq->valid = VM_PAGE_BITS_ALL;
864 return VM_PAGER_OK;
865 } else {
866 return VM_PAGER_ERROR;
867 }
868 }
869
870 physoffset = (off_t)reqblkno * DEV_BSIZE + poff * PAGE_SIZE;
871 pagesperblock = bsize / PAGE_SIZE;
872 /*
873 * find the first page that is contiguous...
874 * note that pbackwards is the number of pages that are contiguous
875 * backwards.
876 */
877 firstpage = 0;
878 if (ap->a_count) {
879 pbackwards = poff + bbackwards * pagesperblock;
880 if (ap->a_reqpage > pbackwards) {
881 firstpage = ap->a_reqpage - pbackwards;
882 vm_page_lock_queues();
883 for(i=0;i<firstpage;i++)
884 vm_page_free(ap->a_m[i]);
885 vm_page_unlock_queues();
886 }
887
888 /*
889 * pforwards is the number of pages that are contiguous
890 * after the current page.
891 */
892 pforwards = (pagesperblock - (poff + 1)) +
893 bforwards * pagesperblock;
894 if (pforwards < (pcount - (ap->a_reqpage + 1))) {
895 vm_page_lock_queues();
896 for( i = ap->a_reqpage + pforwards + 1; i < pcount; i++)
897 vm_page_free(ap->a_m[i]);
898 vm_page_unlock_queues();
899 pcount = ap->a_reqpage + pforwards + 1;
900 }
901
902 /*
903 * number of pages for I/O corrected for the non-contig pages at
904 * the beginning of the array.
905 */
906 pcount -= firstpage;
907 }
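	/*
	 * Illustrative run of the trimming above (assuming pagesperblock ==
	 * 4): with poff = 1, bbackwards = 0, bforwards = 1, a_reqpage = 3
	 * and pcount = 8, pbackwards = 1, so firstpage = 2 and the first
	 * two pages are freed; pforwards = (4 - 2) + 4 = 6 already covers
	 * the trailing pages, so nothing is trimmed at the end and the I/O
	 * is issued for the six pages starting at index 2.
	 */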
908
909 /*
910 * calculate the size of the transfer
911 */
912
913 size = pcount * PAGE_SIZE;
914
915 if ((IDX_TO_OFF(ap->a_m[firstpage]->pindex) + size) >
916 obj->un_pager.vnp.vnp_size)
917 size = obj->un_pager.vnp.vnp_size -
918 IDX_TO_OFF(ap->a_m[firstpage]->pindex);
919
920 physoffset -= foff;
921 rtval = VOP_GETPAGES(dp, &ap->a_m[firstpage], size,
922 (ap->a_reqpage - firstpage), physoffset);
923
924 return (rtval);
925}
926
927/*
928 * Extended attribute area reading.
929 */
930static int
931ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
932{
933 struct inode *ip;
934 struct ufs2_dinode *dp;
935 struct fs *fs;
936 struct buf *bp;
937 ufs_lbn_t lbn, nextlbn;
938 off_t bytesinfile;
939 long size, xfersize, blkoffset;
940 int error, orig_resid;
941 mode_t mode;
942
943 GIANT_REQUIRED;
944
945 ip = VTOI(vp);
946 fs = ip->i_fs;
947 dp = ip->i_din2;
948 mode = ip->i_mode;
949
950#ifdef DIAGNOSTIC
951 if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
952 panic("ffs_extread: mode");
953
954#endif
955 orig_resid = uio->uio_resid;
956 if (orig_resid <= 0)
957 return (0);
958
959 bytesinfile = dp->di_extsize - uio->uio_offset;
960 if (bytesinfile <= 0) {
961 if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
962 ip->i_flag |= IN_ACCESS;
963 return 0;
964 }
965
966 for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
967 if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
968 break;
969
970 lbn = lblkno(fs, uio->uio_offset);
971 nextlbn = lbn + 1;
972
973 /*
974	 * Determine the size of the buffer. The buffer holding the
975	 * last block of the file is rounded up to the size of the
976	 * block type it occupies (a fragment or a full block, as
977	 * appropriate).
978 */
979 size = sblksize(fs, dp->di_extsize, lbn);
980 blkoffset = blkoff(fs, uio->uio_offset);
981
982 /*
983 * The amount we want to transfer in this iteration is
984 * one FS block less the amount of the data before
985 * our startpoint (duh!)
986 */
987 xfersize = fs->fs_bsize - blkoffset;
988
989 /*
990 * But if we actually want less than the block,
991 * or the file doesn't have a whole block more of data,
992 * then use the lesser number.
993 */
994 if (uio->uio_resid < xfersize)
995 xfersize = uio->uio_resid;
996 if (bytesinfile < xfersize)
997 xfersize = bytesinfile;
998
999 if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
1000 /*
1001	 * Don't do readahead if this is the end of the extattr area.
1002 */
1003 error = bread(vp, -1 - lbn, size, NOCRED, &bp);
1004 } else {
1005 /*
1006 * If we have a second block, then
1007 * fire off a request for a readahead
1008 * as well as a read. Note that the 4th and 5th
1009 * arguments point to arrays of the size specified in
1010 * the 6th argument.
1011 */
1012 int nextsize = sblksize(fs, dp->di_extsize, nextlbn);
1013
1014 nextlbn = -1 - nextlbn;
1015 error = breadn(vp, -1 - lbn,
1016 size, &nextlbn, &nextsize, 1, NOCRED, &bp);
1017 }
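		/*
		 * Note on the block numbers used above: the extended
		 * attribute area is addressed with negative logical block
		 * numbers, so extattr block 0 is read as logical block -1,
		 * block 1 as -2, and so on.  ffsext_strategy() below
		 * recognizes lbn values in [-NXADDR, 0) on special files and
		 * fifos and hands them to the normal UFS strategy path so
		 * the extattr blocks are still read and written correctly.
		 */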
1018 if (error) {
1019 brelse(bp);
1020 bp = NULL;
1021 break;
1022 }
1023
1024 /*
1025 * If IO_DIRECT then set B_DIRECT for the buffer. This
1026 * will cause us to attempt to release the buffer later on
1027 * and will cause the buffer cache to attempt to free the
1028 * underlying pages.
1029 */
1030 if (ioflag & IO_DIRECT)
1031 bp->b_flags |= B_DIRECT;
1032
1033 /*
1034 * We should only get non-zero b_resid when an I/O error
1035 * has occurred, which should cause us to break above.
1036 * However, if the short read did not cause an error,
1037 * then we want to ensure that we do not uiomove bad
1038 * or uninitialized data.
1039 */
1040 size -= bp->b_resid;
1041 if (size < xfersize) {
1042 if (size == 0)
1043 break;
1044 xfersize = size;
1045 }
1046
1047 error = uiomove((char *)bp->b_data + blkoffset,
1048 (int)xfersize, uio);
1049 if (error)
1050 break;
1051
1052 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
1053 (LIST_FIRST(&bp->b_dep) == NULL)) {
1054 /*
1055 * If there are no dependencies, and it's VMIO,
1056 * then we don't need the buf, mark it available
1057 * for freeing. The VM has the data.
1058 */
1059 bp->b_flags |= B_RELBUF;
1060 brelse(bp);
1061 } else {
1062 /*
1063 * Otherwise let whoever
1064 * made the request take care of
1065 * freeing it. We just queue
1066 * it onto another list.
1067 */
1068 bqrelse(bp);
1069 }
1070 }
1071
1072 /*
1073 * This can only happen in the case of an error
1074 * because the loop above resets bp to NULL on each iteration
1075	 * and on normal completion has not set a new value into it,
1076	 * so it must have come from a 'break' statement.
1077 */
1078 if (bp != NULL) {
1079 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
1080 (LIST_FIRST(&bp->b_dep) == NULL)) {
1081 bp->b_flags |= B_RELBUF;
1082 brelse(bp);
1083 } else {
1084 bqrelse(bp);
1085 }
1086 }
1087
1088 if ((error == 0 || uio->uio_resid != orig_resid) &&
1089 (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
1090 ip->i_flag |= IN_ACCESS;
1091 return (error);
1092}
1093
1094/*
1095 * Extended attribute area writing.
1096 */
1097static int
1098ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
1099{
1100 struct inode *ip;
1101 struct ufs2_dinode *dp;
1102 struct fs *fs;
1103 struct buf *bp;
1104 ufs_lbn_t lbn;
1105 off_t osize;
1106 int blkoffset, error, flags, resid, size, xfersize;
1107
1108 GIANT_REQUIRED;
1109
1110 ip = VTOI(vp);
1111 fs = ip->i_fs;
1112 dp = ip->i_din2;
1113
1114#ifdef DIAGNOSTIC
1115 if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
1116 panic("ext_write: mode");
1117#endif
1118
1119 if (ioflag & IO_APPEND)
1120 uio->uio_offset = dp->di_extsize;
1121
1122 if (uio->uio_offset < 0 ||
1123 (u_int64_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize)
1124 return (EFBIG);
1125
1126 resid = uio->uio_resid;
1127 osize = dp->di_extsize;
1128 flags = IO_EXT;
1129 if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
1130 flags |= IO_SYNC;
1131
1132 for (error = 0; uio->uio_resid > 0;) {
1133 lbn = lblkno(fs, uio->uio_offset);
1134 blkoffset = blkoff(fs, uio->uio_offset);
1135 xfersize = fs->fs_bsize - blkoffset;
1136 if (uio->uio_resid < xfersize)
1137 xfersize = uio->uio_resid;
1138
1139 /*
1140 * We must perform a read-before-write if the transfer size
1141 * does not cover the entire buffer.
1142 */
1143 if (fs->fs_bsize > xfersize)
1144 flags |= BA_CLRBUF;
1145 else
1146 flags &= ~BA_CLRBUF;
1147 error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
1148 ucred, flags, &bp);
1149 if (error != 0)
1150 break;
1151 /*
1152 * If the buffer is not valid we have to clear out any
1153 * garbage data from the pages instantiated for the buffer.
1154 * If we do not, a failed uiomove() during a write can leave
1155 * the prior contents of the pages exposed to a userland
1156 * mmap(). XXX deal with uiomove() errors a better way.
1157 */
1158 if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
1159 vfs_bio_clrbuf(bp);
1160 if (ioflag & IO_DIRECT)
1161 bp->b_flags |= B_DIRECT;
1162 if (ioflag & IO_NOWDRAIN)
1163 bp->b_flags |= B_NOWDRAIN;
1164
1165 if (uio->uio_offset + xfersize > dp->di_extsize)
1166 dp->di_extsize = uio->uio_offset + xfersize;
1167
1168 size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
1169 if (size < xfersize)
1170 xfersize = size;
1171
1172 error =
1173 uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
1174 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
1175 (LIST_FIRST(&bp->b_dep) == NULL)) {
1176 bp->b_flags |= B_RELBUF;
1177 }
1178
1179 /*
1180 * If IO_SYNC each buffer is written synchronously. Otherwise
1181 * if we have a severe page deficiency write the buffer
1182 * asynchronously. Otherwise try to cluster, and if that
1183 * doesn't do it then either do an async write (if O_DIRECT),
1184 * or a delayed write (if not).
1185 */
1186 if (ioflag & IO_SYNC) {
1187 (void)bwrite(bp);
1188 } else if (vm_page_count_severe() ||
1189 buf_dirty_count_severe() ||
1190 xfersize + blkoffset == fs->fs_bsize ||
1191 (ioflag & (IO_ASYNC | IO_DIRECT)))
1192 bawrite(bp);
1193 else
1194 bdwrite(bp);
1195 if (error || xfersize == 0)
1196 break;
1197 ip->i_flag |= IN_CHANGE | IN_UPDATE;
1198 }
1199 /*
1200	 * If we successfully wrote any data and we are not the superuser,
1201	 * we clear the setuid and setgid bits as a precaution against
1202 * tampering.
1203 */
1204 if (resid > uio->uio_resid && ucred &&
1205 suser_cred(ucred, PRISON_ROOT)) {
1206 ip->i_mode &= ~(ISUID | ISGID);
1207 dp->di_mode = ip->i_mode;
1208 }
1209 if (error) {
1210 if (ioflag & IO_UNIT) {
1211 (void)UFS_TRUNCATE(vp, osize,
1212 IO_EXT | (ioflag&IO_SYNC), ucred, uio->uio_td);
1213 uio->uio_offset -= resid - uio->uio_resid;
1214 uio->uio_resid = resid;
1215 }
1216 } else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
1217 error = UFS_UPDATE(vp, 1);
1218 return (error);
1219}
1220
1221
1222/*
1223 * Extended attribute area lookup helper.
1224 *
1225 * Locate a particular EA (nspace:name) in the area (ptr:length), and return
1226 * the length of the EA, and possibly the pointer to the entry and to the data.
1227 */
1228static int
1229ffs_findextattr(u_char *ptr, uint length, int nspace, const char *name, u_char **eap, u_char **eac)
1230{
1231 u_char *p, *pe, *pn, *p0;
1232 int eapad1, eapad2, ealength, ealen, nlen;
1233 uint32_t ul;
1234
1235 pe = ptr + length;
1236 nlen = strlen(name);
1237
1238 for (p = ptr; p < pe; p = pn) {
1239 p0 = p;
1240 bcopy(p, &ul, sizeof(ul));
1241 pn = p + ul;
1242 /* make sure this entry is complete */
1243 if (pn > pe)
1244 break;
1245 p += sizeof(uint32_t);
1246 if (*p != nspace)
1247 continue;
1248 p++;
1249 eapad2 = *p++;
1250 if (*p != nlen)
1251 continue;
1252 p++;
1253 if (bcmp(p, name, nlen))
1254 continue;
1255 ealength = sizeof(uint32_t) + 3 + nlen;
1256 eapad1 = 8 - (ealength % 8);
1257 if (eapad1 == 8)
1258 eapad1 = 0;
1259 ealength += eapad1;
1260 ealen = ul - ealength - eapad2;
1261 p += nlen + eapad1;
1262 if (eap != NULL)
1263 *eap = p0;
1264 if (eac != NULL)
1265 *eac = p;
1266 return (ealen);
1267 }
1268 return(-1);
1269}
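/*
 * Layout of a single EA record as parsed above (each record is padded to an
 * 8-byte boundary):
 *
 *	uint32_t ul;		total record length, including all padding
 *	u_char	 namespace;	attribute namespace
 *	u_char	 eapad2;	bytes of padding that follow the content
 *	u_char	 nlen;		length of the attribute name
 *	char	 name[nlen];	attribute name
 *	u_char	 pad1[eapad1];	pads the header out to an 8-byte boundary
 *	u_char	 content[];	attribute data, followed by eapad2 pad bytes
 *
 * For example (illustrative), the name "md5" gives a 4 + 3 + 3 = 10 byte
 * header, so eapad1 = 6; with 16 bytes of content eapad2 = 0 and the total
 * record length ul is 10 + 6 + 16 + 0 = 32.
 */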
1270
1271static int
1272ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra)
1273{
1274 struct inode *ip;
1275 struct fs *fs;
1276 struct ufs2_dinode *dp;
1277 struct uio luio;
1278 struct iovec liovec;
1279 int easize, error;
1280 u_char *eae;
1281
1282 ip = VTOI(vp);
1283 fs = ip->i_fs;
1284 dp = ip->i_din2;
1285 easize = dp->di_extsize;
1286
1287 eae = malloc(easize + extra, M_TEMP, M_WAITOK);
1288
1289 liovec.iov_base = eae;
1290 liovec.iov_len = easize;
1291 luio.uio_iov = &liovec;
1292 luio.uio_iovcnt = 1;
1293 luio.uio_offset = 0;
1294 luio.uio_resid = easize;
1295 luio.uio_segflg = UIO_SYSSPACE;
1296 luio.uio_rw = UIO_READ;
1297 luio.uio_td = td;
1298
1299 error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
1300 if (error) {
1301 free(eae, M_TEMP);
1302 return(error);
1303 }
1304 *p = eae;
1305 return (0);
1306}
1307
1308static int
1309ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
1310{
1311 struct inode *ip;
1312 struct fs *fs;
1313 struct ufs2_dinode *dp;
1314 int error;
1315
1316 ip = VTOI(vp);
1317 fs = ip->i_fs;
1318
1319 if (ip->i_ea_area != NULL)
1320 return (EBUSY);
1321 dp = ip->i_din2;
1322 error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0);
1323 if (error)
1324 return (error);
1325 ip->i_ea_len = dp->di_extsize;
1326 ip->i_ea_error = 0;
1327 return (0);
1328}
1329
1330/*
1331 * Vnode extattr transaction commit/abort
1332 */
1333static int
1334ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td)
1335{
1336 struct inode *ip;
1337 struct fs *fs;
1338 struct uio luio;
1339 struct iovec liovec;
1340 int error;
1341 struct ufs2_dinode *dp;
1342
1343 ip = VTOI(vp);
1344 fs = ip->i_fs;
1345 if (ip->i_ea_area == NULL)
1346 return (EINVAL);
1347 dp = ip->i_din2;
1348 error = ip->i_ea_error;
1349 if (commit && error == 0) {
1350 if (cred == NOCRED)
1351 cred = vp->v_mount->mnt_cred;
1352 liovec.iov_base = ip->i_ea_area;
1353 liovec.iov_len = ip->i_ea_len;
1354 luio.uio_iov = &liovec;
1355 luio.uio_iovcnt = 1;
1356 luio.uio_offset = 0;
1357 luio.uio_resid = ip->i_ea_len;
1358 luio.uio_segflg = UIO_SYSSPACE;
1359 luio.uio_rw = UIO_WRITE;
1360 luio.uio_td = td;
1361 /* XXX: I'm not happy about truncating to zero size */
1362 if (ip->i_ea_len < dp->di_extsize)
1363 error = ffs_truncate(vp, 0, IO_EXT, cred, td);
1364 error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
1365 }
1366 free(ip->i_ea_area, M_TEMP);
1367 ip->i_ea_area = NULL;
1368 ip->i_ea_len = 0;
1369 ip->i_ea_error = 0;
1370 return (error);
1371}
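/*
 * How the extended attribute routines fit together: ffs_open_ea() snapshots
 * the whole extattr area of an inode into ip->i_ea_area / ip->i_ea_len;
 * ffs_getextattr() and ffs_setextattr() then operate on that in-memory copy;
 * ffs_close_ea() either writes the copy back with ffs_extwrite() (commit !=
 * 0) or simply discards it.  A caller that has not issued VOP_OPENEXTATTR
 * gets the same effect through the "stand_alone" paths in the get/set
 * routines, which open and close the area around the single operation.
 */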
1372
1373/*
1374 * Vnode extattr strategy routine for special devices and fifos.
1375 *
1376 * We need to check for a read or write of the external attributes.
1377 * Otherwise we just fall through and do the usual thing.
1378 */
1379static int
1380ffsext_strategy(struct vop_strategy_args *ap)
1381/*
1382struct vop_strategy_args {
1383 struct vnodeop_desc *a_desc;
1384 struct vnode *a_vp;
1385 struct buf *a_bp;
1386};
1387*/
1388{
1389 struct vnode *vp;
1390 daddr_t lbn;
1391
1392 vp = ap->a_vp;
1393 lbn = ap->a_bp->b_lblkno;
1394 if (VTOI(vp)->i_fs->fs_magic == FS_UFS2_MAGIC &&
1395 lbn < 0 && lbn >= -NXADDR)
1396 return (ufs_vnoperate((struct vop_generic_args *)ap));
1397 if (vp->v_type == VFIFO)
1398 return (ufs_vnoperatefifo((struct vop_generic_args *)ap));
1399 return (ufs_vnoperatespec((struct vop_generic_args *)ap));
1400}
1401
1402/*
1403 * Vnode extattr transaction start.
1404 */
1405static int
1406ffs_openextattr(struct vop_openextattr_args *ap)
1407/*
1408struct vop_openextattr_args {
1409 struct vnodeop_desc *a_desc;
1410 struct vnode *a_vp;
1411 IN struct ucred *a_cred;
1412 IN struct thread *a_td;
1413};
1414*/
1415{
1416 struct inode *ip;
1417 struct fs *fs;
1418
1419 ip = VTOI(ap->a_vp);
1420 fs = ip->i_fs;
1421 if (fs->fs_magic == FS_UFS1_MAGIC)
1422 return (ufs_vnoperate((struct vop_generic_args *)ap));
1423 return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
1424}
1425
1426
1427/*
1428 * Vnode extattr transaction commit/abort
1429 */
1430static int
1431ffs_closeextattr(struct vop_closeextattr_args *ap)
1432/*
1433struct vop_closeextattr_args {
1434 struct vnodeop_desc *a_desc;
1435 struct vnode *a_vp;
1436 int a_commit;
1437 IN struct ucred *a_cred;
1438 IN struct thread *a_td;
1439};
1440*/
1441{
1442 struct inode *ip;
1443 struct fs *fs;
1444
1445 ip = VTOI(ap->a_vp);
1446 fs = ip->i_fs;
1447 if (fs->fs_magic == FS_UFS1_MAGIC)
1448 return (ufs_vnoperate((struct vop_generic_args *)ap));
1449 return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
1450}
1451
1452
1453
1454/*
1455 * Vnode operation to retrieve a named extended attribute.
1456 */
1457static int
1458ffs_getextattr(struct vop_getextattr_args *ap)
1459/*
1460vop_getextattr {
1461 IN struct vnode *a_vp;
1462 IN int a_attrnamespace;
1463 IN const char *a_name;
1464 INOUT struct uio *a_uio;
1465 OUT size_t *a_size;
1466 IN struct ucred *a_cred;
1467 IN struct thread *a_td;
1468};
1469*/
1470{
1471 struct inode *ip;
1472 struct fs *fs;
1473 u_char *eae, *p, *pe, *pn;
1474 struct ufs2_dinode *dp;
1475 unsigned easize;
1476 uint32_t ul;
1477 int error, ealen, stand_alone;
1478
1479 ip = VTOI(ap->a_vp);
1480 fs = ip->i_fs;
1481
1482 if (fs->fs_magic == FS_UFS1_MAGIC)
1483 return (ufs_vnoperate((struct vop_generic_args *)ap));
1484
1485 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1486 ap->a_cred, ap->a_td, IREAD);
1487 if (error)
1488 return (error);
1489
1490 if (ip->i_ea_area == NULL) {
1491 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1492 if (error)
1493 return (error);
1494 stand_alone = 1;
1495 } else {
1496 stand_alone = 0;
1497 }
1498 dp = ip->i_din2;
1499 eae = ip->i_ea_area;
1500 easize = ip->i_ea_len;
1501 if (strlen(ap->a_name) > 0) {
1502 ealen = ffs_findextattr(eae, easize,
1503 ap->a_attrnamespace, ap->a_name, NULL, &p);
1504 if (ealen >= 0) {
1505 error = 0;
1506 if (ap->a_size != NULL)
1507 *ap->a_size = ealen;
1508 else if (ap->a_uio != NULL)
1509 error = uiomove(p, ealen, ap->a_uio);
1510 } else {
1511 error = ENOATTR;
1512 }
1513 } else {
1514 error = 0;
1515 if (ap->a_size != NULL)
1516 *ap->a_size = 0;
1517 pe = eae + easize;
1518 for(p = eae; error == 0 && p < pe; p = pn) {
1519 bcopy(p, &ul, sizeof(ul));
1520 pn = p + ul;
1521 if (pn > pe)
1522 break;
1523 p += sizeof(ul);
1524 if (*p++ != ap->a_attrnamespace)
1525 continue;
1526 p++; /* pad2 */
1527 ealen = *p;
1528 if (ap->a_size != NULL) {
1529 *ap->a_size += ealen + 1;
1530 } else if (ap->a_uio != NULL) {
1531 error = uiomove(p, ealen + 1, ap->a_uio);
1532 }
1533 }
1534 }
1535 if (stand_alone)
1536 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1537 return(error);
1538}
1539
1540/*
1541 * Vnode operation to set a named attribute.
1542 */
1543static int
1544ffs_setextattr(struct vop_setextattr_args *ap)
1545/*
1546vop_setextattr {
1547 IN struct vnode *a_vp;
1548 IN int a_attrnamespace;
1549 IN const char *a_name;
1550 INOUT struct uio *a_uio;
1551 IN struct ucred *a_cred;
1552 IN struct thread *a_td;
1553};
1554*/
1555{
1556 struct inode *ip;
1557 struct fs *fs;
1558 uint32_t ealength, ul;
1559 int ealen, olen, eacont, eapad1, eapad2, error, i, easize;
1560 u_char *eae, *p;
1561 struct ufs2_dinode *dp;
1562 struct ucred *cred;
1563 int stand_alone;
1564
1565 ip = VTOI(ap->a_vp);
1566 fs = ip->i_fs;
1567
1568 if (fs->fs_magic == FS_UFS1_MAGIC)
1569 return (ufs_vnoperate((struct vop_generic_args *)ap));
1570
1571 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1572 ap->a_cred, ap->a_td, IWRITE);
1573 if (error) {
1574 if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1575 ip->i_ea_error = error;
1576 return (error);
1577 }
1578
1579 if (ap->a_cred != NOCRED)
1580 cred = ap->a_cred;
1581 else
1582 cred = ap->a_vp->v_mount->mnt_cred;
1583
1584 dp = ip->i_din2;
1585
1586 if (ip->i_ea_area == NULL) {
1587 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1588 if (error)
1589 return (error);
1590 stand_alone = 1;
1591 } else {
1592 stand_alone = 0;
1593 }
1594
1595 /* Calculate the length of the EA entry */
1596 if (ap->a_uio == NULL) {
1597 /* delete */
1598 ealength = eapad1 = ealen = eapad2 = eacont = 0;
1599 } else {
1600 ealen = ap->a_uio->uio_resid;
1601 ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
1602 eapad1 = 8 - (ealength % 8);
1603 if (eapad1 == 8)
1604 eapad1 = 0;
1605 eacont = ealength + eapad1;
1606 eapad2 = 8 - (ealen % 8);
1607 if (eapad2 == 8)
1608 eapad2 = 0;
1609 ealength += eapad1 + ealen + eapad2;
1610 }
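	/*
	 * Worked example of the length calculation above (illustrative):
	 * setting an attribute named "sha1" (strlen 4) with 20 bytes of
	 * data gives a 4 + 3 + 4 = 11 byte header, eapad1 = 5, eapad2 = 4,
	 * and a total record of ealength = 11 + 5 + 20 + 4 = 40 bytes,
	 * keeping both the content and the following record 8-byte aligned.
	 */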
1611
1612 eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
1613 bcopy(ip->i_ea_area, eae, ip->i_ea_len);
1614 easize = ip->i_ea_len;
1615
1616 olen = ffs_findextattr(eae, easize,
1617 ap->a_attrnamespace, ap->a_name, &p, NULL);
1618 if (olen == -1 && ealength == 0) {
1619 /* delete but nonexistent */
1620 free(eae, M_TEMP);
1621 if (stand_alone)
1622 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1623 return(ENOATTR);
1624 }
1625 if (olen == -1) {
1626 /* new, append at end */
1627 p = eae + easize;
1628 easize += ealength;
1629 } else {
1630 bcopy(p, &ul, sizeof ul);
1631 i = p - eae + ul;
1632 if (ul != ealength) {
1633 bcopy(p + ul, p + ealength, easize - i);
1634 easize += (ealength - ul);
1635 }
1636 }
1637 if (easize > NXADDR * fs->fs_bsize) {
1638 free(eae, M_TEMP);
1639 if (stand_alone)
1640 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1641 else if (ip->i_ea_error == 0)
1642 ip->i_ea_error = ENOSPC;
1643 return(ENOSPC);
1644 }
1645 if (ealength != 0) {
1646 bcopy(&ealength, p, sizeof(ealength));
1647 p += sizeof(ealength);
1648 *p++ = ap->a_attrnamespace;
1649 *p++ = eapad2;
1650 *p++ = strlen(ap->a_name);
1651 strcpy(p, ap->a_name);
1652 p += strlen(ap->a_name);
1653 bzero(p, eapad1);
1654 p += eapad1;
1655 error = uiomove(p, ealen, ap->a_uio);
1656 if (error) {
1657 free(eae, M_TEMP);
1658 if (stand_alone)
1659 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1660 else if (ip->i_ea_error == 0)
1661 ip->i_ea_error = error;
1662 return(error);
1663 }
1664 p += ealen;
1665 bzero(p, eapad2);
1666 }
1667 p = ip->i_ea_area;
1668 ip->i_ea_area = eae;
1669 ip->i_ea_len = easize;
1670 free(p, M_TEMP);
1671 if (stand_alone)
1672 error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
1673 return(error);
1674}
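
/*
 * For reference, a minimal userland sketch of exercising these vnode
 * operations through the extattr(2) system call family (illustrative only;
 * it assumes a file on a UFS2 filesystem at the hypothetical path
 * "/mnt/file" and the EXTATTR_NAMESPACE_USER namespace, with error handling
 * abbreviated).  The set call reaches ffs_setextattr() and the get call
 * reaches ffs_getextattr(), both via the stand-alone open/close paths:
 *
 *	#include <sys/types.h>
 *	#include <sys/extattr.h>
 *
 *	#include <err.h>
 *	#include <string.h>
 *
 *	int
 *	main(void)
 *	{
 *		const char *val = "0123456789abcdef";
 *		char buf[64];
 *		ssize_t len;
 *
 *		if (extattr_set_file("/mnt/file", EXTATTR_NAMESPACE_USER,
 *		    "md5", val, strlen(val)) < 0)
 *			err(1, "extattr_set_file");
 *		len = extattr_get_file("/mnt/file", EXTATTR_NAMESPACE_USER,
 *		    "md5", buf, sizeof(buf));
 *		if (len < 0)
 *			err(1, "extattr_get_file");
 *		return (0);
 *	}
 */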