ffs_vnops.c: revision 118131 vs. revision 118607
1/*
2 * Copyright (c) 2002, 2003 Networks Associates Technology, Inc.
3 * All rights reserved.
4 *
5 * This software was developed for the FreeBSD Project by Marshall
6 * Kirk McKusick and Network Associates Laboratories, the Security
7 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
8 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
9 * research program
10 *
11 * Copyright (c) 1982, 1986, 1989, 1993
12 * The Regents of the University of California. All rights reserved.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * 1. Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in the
21 * documentation and/or other materials provided with the distribution.
22 * 3. All advertising materials mentioning features or use of this software
23 * must display the following acknowledgement:
24 * This product includes software developed by the University of
25 * California, Berkeley and its contributors.
26 * 4. Neither the name of the University nor the names of its contributors
27 * may be used to endorse or promote products derived from this software
28 * without specific prior written permission.
29 *
30 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40 * SUCH DAMAGE.
41 *
42 * @(#)ffs_vnops.c 8.15 (Berkeley) 5/14/95
43 */
44
45#include <sys/cdefs.h>
46__FBSDID("$FreeBSD: head/sys/ufs/ffs/ffs_vnops.c 118131 2003-07-28 18:53:29Z rwatson $");
46__FBSDID("$FreeBSD: head/sys/ufs/ffs/ffs_vnops.c 118607 2003-08-07 15:04:27Z jhb $");
47
48#include <sys/param.h>
49#include <sys/bio.h>
50#include <sys/systm.h>
51#include <sys/buf.h>
52#include <sys/conf.h>
53#include <sys/extattr.h>
54#include <sys/kernel.h>
55#include <sys/limits.h>
56#include <sys/malloc.h>
57#include <sys/mount.h>
58#include <sys/proc.h>
59#include <sys/resourcevar.h>
60#include <sys/signalvar.h>
61#include <sys/stat.h>
62#include <sys/vmmeter.h>
63#include <sys/vnode.h>
64
65#include <vm/vm.h>
66#include <vm/vm_extern.h>
67#include <vm/vm_object.h>
68#include <vm/vm_page.h>
69#include <vm/vm_pager.h>
70#include <vm/vnode_pager.h>
71
72#include <ufs/ufs/extattr.h>
73#include <ufs/ufs/quota.h>
74#include <ufs/ufs/inode.h>
75#include <ufs/ufs/ufs_extern.h>
76#include <ufs/ufs/ufsmount.h>
77
78#include <ufs/ffs/fs.h>
79#include <ufs/ffs/ffs_extern.h>
80#include "opt_directio.h"
81
82#ifdef DIRECTIO
83extern int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
84#endif
85static int ffs_fsync(struct vop_fsync_args *);
86static int ffs_getpages(struct vop_getpages_args *);
87static int ffs_read(struct vop_read_args *);
88static int ffs_write(struct vop_write_args *);
89static int ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
90static int ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
91 struct ucred *cred);
92static int ffsext_strategy(struct vop_strategy_args *);
93static int ffs_closeextattr(struct vop_closeextattr_args *);
94static int ffs_deleteextattr(struct vop_deleteextattr_args *);
95static int ffs_getextattr(struct vop_getextattr_args *);
96static int ffs_listextattr(struct vop_listextattr_args *);
97static int ffs_openextattr(struct vop_openextattr_args *);
98static int ffs_setextattr(struct vop_setextattr_args *);
99
100
101/* Global vfs data structures for ufs. */
102vop_t **ffs_vnodeop_p;
103static struct vnodeopv_entry_desc ffs_vnodeop_entries[] = {
104 { &vop_default_desc, (vop_t *) ufs_vnoperate },
105 { &vop_fsync_desc, (vop_t *) ffs_fsync },
106 { &vop_getpages_desc, (vop_t *) ffs_getpages },
107 { &vop_read_desc, (vop_t *) ffs_read },
108 { &vop_reallocblks_desc, (vop_t *) ffs_reallocblks },
109 { &vop_write_desc, (vop_t *) ffs_write },
110 { &vop_closeextattr_desc, (vop_t *) ffs_closeextattr },
111 { &vop_deleteextattr_desc, (vop_t *) ffs_deleteextattr },
112 { &vop_getextattr_desc, (vop_t *) ffs_getextattr },
113 { &vop_listextattr_desc, (vop_t *) ffs_listextattr },
114 { &vop_openextattr_desc, (vop_t *) ffs_openextattr },
115 { &vop_setextattr_desc, (vop_t *) ffs_setextattr },
116 { NULL, NULL }
117};
118static struct vnodeopv_desc ffs_vnodeop_opv_desc =
119 { &ffs_vnodeop_p, ffs_vnodeop_entries };
120
121vop_t **ffs_specop_p;
122static struct vnodeopv_entry_desc ffs_specop_entries[] = {
123 { &vop_default_desc, (vop_t *) ufs_vnoperatespec },
124 { &vop_fsync_desc, (vop_t *) ffs_fsync },
125 { &vop_reallocblks_desc, (vop_t *) ffs_reallocblks },
126 { &vop_strategy_desc, (vop_t *) ffsext_strategy },
127 { &vop_closeextattr_desc, (vop_t *) ffs_closeextattr },
128 { &vop_deleteextattr_desc, (vop_t *) ffs_deleteextattr },
129 { &vop_getextattr_desc, (vop_t *) ffs_getextattr },
130 { &vop_listextattr_desc, (vop_t *) ffs_listextattr },
131 { &vop_openextattr_desc, (vop_t *) ffs_openextattr },
132 { &vop_setextattr_desc, (vop_t *) ffs_setextattr },
133 { NULL, NULL }
134};
135static struct vnodeopv_desc ffs_specop_opv_desc =
136 { &ffs_specop_p, ffs_specop_entries };
137
138vop_t **ffs_fifoop_p;
139static struct vnodeopv_entry_desc ffs_fifoop_entries[] = {
140 { &vop_default_desc, (vop_t *) ufs_vnoperatefifo },
141 { &vop_fsync_desc, (vop_t *) ffs_fsync },
142 { &vop_reallocblks_desc, (vop_t *) ffs_reallocblks },
143 { &vop_strategy_desc, (vop_t *) ffsext_strategy },
144 { &vop_closeextattr_desc, (vop_t *) ffs_closeextattr },
145 { &vop_deleteextattr_desc, (vop_t *) ffs_deleteextattr },
146 { &vop_getextattr_desc, (vop_t *) ffs_getextattr },
147 { &vop_listextattr_desc, (vop_t *) ffs_listextattr },
148 { &vop_openextattr_desc, (vop_t *) ffs_openextattr },
149 { &vop_setextattr_desc, (vop_t *) ffs_setextattr },
150 { NULL, NULL }
151};
152static struct vnodeopv_desc ffs_fifoop_opv_desc =
153 { &ffs_fifoop_p, ffs_fifoop_entries };
154
155VNODEOP_SET(ffs_vnodeop_opv_desc);
156VNODEOP_SET(ffs_specop_opv_desc);
157VNODEOP_SET(ffs_fifoop_opv_desc);
158
159/*
160 * Synch an open file.
161 */
162/* ARGSUSED */
163static int
164ffs_fsync(ap)
165 struct vop_fsync_args /* {
166 struct vnode *a_vp;
167 struct ucred *a_cred;
168 int a_waitfor;
169 struct thread *a_td;
170 } */ *ap;
171{
172 struct vnode *vp = ap->a_vp;
173 struct inode *ip = VTOI(vp);
174 struct buf *bp;
175 struct buf *nbp;
176 int s, error, wait, passes, skipmeta;
177 ufs_lbn_t lbn;
178
179 wait = (ap->a_waitfor == MNT_WAIT);
180 if (vn_isdisk(vp, NULL)) {
181 lbn = INT_MAX;
182 if (vp->v_rdev->si_mountpoint != NULL &&
183 (vp->v_rdev->si_mountpoint->mnt_flag & MNT_SOFTDEP))
184 softdep_fsync_mountdev(vp);
185 } else {
186 lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1));
187 }
188
189 /*
190 * Flush all dirty buffers associated with a vnode.
191 */
192 passes = NIADDR + 1;
193 skipmeta = 0;
194 if (wait)
195 skipmeta = 1;
196 s = splbio();
197 VI_LOCK(vp);
198loop:
199 TAILQ_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs)
200 bp->b_vflags &= ~BV_SCANNED;
201 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
202 nbp = TAILQ_NEXT(bp, b_vnbufs);
203 /*
204 * Reasons to skip this buffer: it has already been considered
205 * on this pass, this pass is the first time through on a
206 * synchronous flush request and the buffer being considered
207 * is metadata, the buffer has dependencies that will cause
208 * it to be redirtied and it has not already been deferred,
209 * or it is already being written.
210 */
211 if ((bp->b_vflags & BV_SCANNED) != 0)
212 continue;
213 bp->b_vflags |= BV_SCANNED;
214 if ((skipmeta == 1 && bp->b_lblkno < 0))
215 continue;
216 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
217 continue;
218 if (!wait && LIST_FIRST(&bp->b_dep) != NULL &&
219 (bp->b_flags & B_DEFERRED) == 0 &&
220 buf_countdeps(bp, 0)) {
221 bp->b_flags |= B_DEFERRED;
222 BUF_UNLOCK(bp);
223 continue;
224 }
225 VI_UNLOCK(vp);
226 if ((bp->b_flags & B_DELWRI) == 0)
227 panic("ffs_fsync: not dirty");
228 if (vp != bp->b_vp)
229 panic("ffs_fsync: vp != vp->b_vp");
230 /*
231 * If this is a synchronous flush request, or it is not a
 232	 * file or device, start the write on this buffer immediately.
233 */
234 if (wait || (vp->v_type != VREG && vp->v_type != VBLK)) {
235
236 /*
237 * On our final pass through, do all I/O synchronously
238 * so that we can find out if our flush is failing
239 * because of write errors.
240 */
241 if (passes > 0 || !wait) {
242 if ((bp->b_flags & B_CLUSTEROK) && !wait) {
243 (void) vfs_bio_awrite(bp);
244 } else {
245 bremfree(bp);
246 splx(s);
247 (void) bawrite(bp);
248 s = splbio();
249 }
250 } else {
251 bremfree(bp);
252 splx(s);
253 if ((error = bwrite(bp)) != 0)
254 return (error);
255 s = splbio();
256 }
257 } else if ((vp->v_type == VREG) && (bp->b_lblkno >= lbn)) {
258 /*
259 * If the buffer is for data that has been truncated
260 * off the file, then throw it away.
261 */
262 bremfree(bp);
263 bp->b_flags |= B_INVAL | B_NOCACHE;
264 splx(s);
265 brelse(bp);
266 s = splbio();
267 } else
268 vfs_bio_awrite(bp);
269
270 /*
271 * Since we may have slept during the I/O, we need
272 * to start from a known point.
273 */
274 VI_LOCK(vp);
275 nbp = TAILQ_FIRST(&vp->v_dirtyblkhd);
276 }
277 /*
278 * If we were asked to do this synchronously, then go back for
279 * another pass, this time doing the metadata.
280 */
281 if (skipmeta) {
282 skipmeta = 0;
283 goto loop;
284 }
285
286 if (wait) {
287 while (vp->v_numoutput) {
288 vp->v_iflag |= VI_BWAIT;
289 msleep((caddr_t)&vp->v_numoutput, VI_MTX(vp),
290 PRIBIO + 4, "ffsfsn", 0);
291 }
292 VI_UNLOCK(vp);
293
294 /*
 295		 * Ensure that any filesystem metadata associated
296 * with the vnode has been written.
297 */
298 splx(s);
299 if ((error = softdep_sync_metadata(ap)) != 0)
300 return (error);
301 s = splbio();
302
303 VI_LOCK(vp);
304 if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
305 /*
306 * Block devices associated with filesystems may
307 * have new I/O requests posted for them even if
308 * the vnode is locked, so no amount of trying will
309 * get them clean. Thus we give block devices a
310 * good effort, then just give up. For all other file
311 * types, go around and try again until it is clean.
312 */
313 if (passes > 0) {
314 passes -= 1;
315 goto loop;
316 }
317#ifdef DIAGNOSTIC
318 if (!vn_isdisk(vp, NULL))
319 vprint("ffs_fsync: dirty", vp);
320#endif
321 }
322 }
323 VI_UNLOCK(vp);
324 splx(s);
325 return (UFS_UPDATE(vp, wait));
326}
327
328
329/*
330 * Vnode op for reading.
331 */
332/* ARGSUSED */
333static int
334ffs_read(ap)
335 struct vop_read_args /* {
336 struct vnode *a_vp;
337 struct uio *a_uio;
338 int a_ioflag;
339 struct ucred *a_cred;
340 } */ *ap;
341{
342 struct vnode *vp;
343 struct inode *ip;
344 struct uio *uio;
345 struct fs *fs;
346 struct buf *bp;
347 ufs_lbn_t lbn, nextlbn;
348 off_t bytesinfile;
349 long size, xfersize, blkoffset;
350 int error, orig_resid;
351 int seqcount;
352 int ioflag;
353 vm_object_t object;
354
355 vp = ap->a_vp;
356 uio = ap->a_uio;
357 ioflag = ap->a_ioflag;
358 if (ap->a_ioflag & IO_EXT)
359#ifdef notyet
360 return (ffs_extread(vp, uio, ioflag));
361#else
362 panic("ffs_read+IO_EXT");
363#endif
364#ifdef DIRECTIO
365 if ((ioflag & IO_DIRECT) != 0) {
366 int workdone;
367
368 error = ffs_rawread(vp, uio, &workdone);
369 if (error != 0 || workdone != 0)
370 return error;
371 }
372#endif
373
374 GIANT_REQUIRED;
375
376 seqcount = ap->a_ioflag >> 16;
377 ip = VTOI(vp);
378
379#ifdef DIAGNOSTIC
380 if (uio->uio_rw != UIO_READ)
381 panic("ffs_read: mode");
382
383 if (vp->v_type == VLNK) {
384 if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
385 panic("ffs_read: short symlink");
386 } else if (vp->v_type != VREG && vp->v_type != VDIR)
387 panic("ffs_read: type %d", vp->v_type);
388#endif
389 fs = ip->i_fs;
390 if ((u_int64_t)uio->uio_offset > fs->fs_maxfilesize)
391 return (EFBIG);
392
393 orig_resid = uio->uio_resid;
394 if (orig_resid <= 0)
395 return (0);
396
397 object = vp->v_object;
398
399 bytesinfile = ip->i_size - uio->uio_offset;
400 if (bytesinfile <= 0) {
401 if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
402 ip->i_flag |= IN_ACCESS;
403 return 0;
404 }
405
406 if (object) {
407 vm_object_reference(object);
408 }
409
410 /*
411 * Ok so we couldn't do it all in one vm trick...
412 * so cycle around trying smaller bites..
413 */
414 for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
415 if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
416 break;
417
418 lbn = lblkno(fs, uio->uio_offset);
419 nextlbn = lbn + 1;
420
421 /*
422 * size of buffer. The buffer representing the
423 * end of the file is rounded up to the size of
424 * the block type ( fragment or full block,
425 * depending ).
426 */
427 size = blksize(fs, ip, lbn);
428 blkoffset = blkoff(fs, uio->uio_offset);
429
430 /*
431 * The amount we want to transfer in this iteration is
432 * one FS block less the amount of the data before
433 * our startpoint (duh!)
434 */
435 xfersize = fs->fs_bsize - blkoffset;
436
437 /*
438 * But if we actually want less than the block,
439 * or the file doesn't have a whole block more of data,
440 * then use the lesser number.
441 */
442 if (uio->uio_resid < xfersize)
443 xfersize = uio->uio_resid;
444 if (bytesinfile < xfersize)
445 xfersize = bytesinfile;
446
447 if (lblktosize(fs, nextlbn) >= ip->i_size) {
448 /*
449 * Don't do readahead if this is the end of the file.
450 */
451 error = bread(vp, lbn, size, NOCRED, &bp);
452 } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
453 /*
454 * Otherwise if we are allowed to cluster,
455 * grab as much as we can.
456 *
457 * XXX This may not be a win if we are not
458 * doing sequential access.
459 */
460 error = cluster_read(vp, ip->i_size, lbn,
461 size, NOCRED, uio->uio_resid, seqcount, &bp);
462 } else if (seqcount > 1) {
463 /*
464 * If we are NOT allowed to cluster, then
465 * if we appear to be acting sequentially,
466 * fire off a request for a readahead
467 * as well as a read. Note that the 4th and 5th
468 * arguments point to arrays of the size specified in
469 * the 6th argument.
470 */
471 int nextsize = blksize(fs, ip, nextlbn);
472 error = breadn(vp, lbn,
473 size, &nextlbn, &nextsize, 1, NOCRED, &bp);
474 } else {
475 /*
476 * Failing all of the above, just read what the
477 * user asked for. Interestingly, the same as
478 * the first option above.
479 */
480 error = bread(vp, lbn, size, NOCRED, &bp);
481 }
482 if (error) {
483 brelse(bp);
484 bp = NULL;
485 break;
486 }
487
488 /*
489 * If IO_DIRECT then set B_DIRECT for the buffer. This
490 * will cause us to attempt to release the buffer later on
491 * and will cause the buffer cache to attempt to free the
492 * underlying pages.
493 */
494 if (ioflag & IO_DIRECT)
495 bp->b_flags |= B_DIRECT;
496
497 /*
498 * We should only get non-zero b_resid when an I/O error
499 * has occurred, which should cause us to break above.
500 * However, if the short read did not cause an error,
501 * then we want to ensure that we do not uiomove bad
502 * or uninitialized data.
503 */
504 size -= bp->b_resid;
505 if (size < xfersize) {
506 if (size == 0)
507 break;
508 xfersize = size;
509 }
510
511 {
512 /*
513 * otherwise use the general form
514 */
515 error =
516 uiomove((char *)bp->b_data + blkoffset,
517 (int)xfersize, uio);
518 }
519
520 if (error)
521 break;
522
523 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
524 (LIST_FIRST(&bp->b_dep) == NULL)) {
525 /*
526 * If there are no dependencies, and it's VMIO,
527 * then we don't need the buf, mark it available
528 * for freeing. The VM has the data.
529 */
530 bp->b_flags |= B_RELBUF;
531 brelse(bp);
532 } else {
533 /*
534 * Otherwise let whoever
535 * made the request take care of
536 * freeing it. We just queue
537 * it onto another list.
538 */
539 bqrelse(bp);
540 }
541 }
542
543 /*
544 * This can only happen in the case of an error
545 * because the loop above resets bp to NULL on each iteration
546 * and on normal completion has not set a new value into it.
547 * so it must have come from a 'break' statement
548 */
549 if (bp != NULL) {
550 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
551 (LIST_FIRST(&bp->b_dep) == NULL)) {
552 bp->b_flags |= B_RELBUF;
553 brelse(bp);
554 } else {
555 bqrelse(bp);
556 }
557 }
558
559 if (object) {
560 VM_OBJECT_LOCK(object);
561 vm_object_vndeallocate(object);
562 }
563 if ((error == 0 || uio->uio_resid != orig_resid) &&
564 (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
565 ip->i_flag |= IN_ACCESS;
566 return (error);
567}
568
569/*
570 * Vnode op for writing.
571 */
572static int
573ffs_write(ap)
574 struct vop_write_args /* {
575 struct vnode *a_vp;
576 struct uio *a_uio;
577 int a_ioflag;
578 struct ucred *a_cred;
579 } */ *ap;
580{
581 struct vnode *vp;
582 struct uio *uio;
583 struct inode *ip;
584 struct fs *fs;
585 struct buf *bp;
586 struct thread *td;
587 ufs_lbn_t lbn;
588 off_t osize;
589 int seqcount;
590 int blkoffset, error, extended, flags, ioflag, resid, size, xfersize;
591 vm_object_t object;
592
593 vp = ap->a_vp;
594 uio = ap->a_uio;
595 ioflag = ap->a_ioflag;
596 if (ap->a_ioflag & IO_EXT)
597#ifdef notyet
598 return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
599#else
 600		panic("ffs_write+IO_EXT");
601#endif
602
603 GIANT_REQUIRED;
604
605 extended = 0;
606 seqcount = ap->a_ioflag >> 16;
607 ip = VTOI(vp);
608
609 object = vp->v_object;
610 if (object) {
611 vm_object_reference(object);
612 }
613
614#ifdef DIAGNOSTIC
615 if (uio->uio_rw != UIO_WRITE)
616 panic("ffswrite: mode");
617#endif
618
619 switch (vp->v_type) {
620 case VREG:
621 if (ioflag & IO_APPEND)
622 uio->uio_offset = ip->i_size;
623 if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) {
624 if (object) {
625 VM_OBJECT_LOCK(object);
626 vm_object_vndeallocate(object);
627 }
628 return (EPERM);
629 }
630 /* FALLTHROUGH */
631 case VLNK:
632 break;
633 case VDIR:
634 panic("ffswrite: dir write");
635 break;
636 default:
637 panic("ffswrite: type %p %d (%d,%d)", vp, (int)vp->v_type,
638 (int)uio->uio_offset,
639 (int)uio->uio_resid
640 );
641 }
642
643 fs = ip->i_fs;
644 if (uio->uio_offset < 0 ||
645 (u_int64_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) {
646 if (object) {
647 VM_OBJECT_LOCK(object);
648 vm_object_vndeallocate(object);
649 }
650 return (EFBIG);
651 }
652 /*
653 * Maybe this should be above the vnode op call, but so long as
654 * file servers have no limits, I don't think it matters.
655 */
656 td = uio->uio_td;
657 if (vp->v_type == VREG && td &&
658 uio->uio_offset + uio->uio_resid >
659 td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
660 PROC_LOCK(td->td_proc);
661 psignal(td->td_proc, SIGXFSZ);
662 PROC_UNLOCK(td->td_proc);
663 if (object) {
664 VM_OBJECT_LOCK(object);
665 vm_object_vndeallocate(object);
666 }
667 return (EFBIG);
668 }
669
670 resid = uio->uio_resid;
671 osize = ip->i_size;
672 if (seqcount > BA_SEQMAX)
673 flags = BA_SEQMAX << BA_SEQSHIFT;
674 else
675 flags = seqcount << BA_SEQSHIFT;
676 if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
677 flags |= IO_SYNC;
678
679 for (error = 0; uio->uio_resid > 0;) {
680 lbn = lblkno(fs, uio->uio_offset);
681 blkoffset = blkoff(fs, uio->uio_offset);
682 xfersize = fs->fs_bsize - blkoffset;
683 if (uio->uio_resid < xfersize)
684 xfersize = uio->uio_resid;
685
686 if (uio->uio_offset + xfersize > ip->i_size)
687 vnode_pager_setsize(vp, uio->uio_offset + xfersize);
688
689 /*
690 * We must perform a read-before-write if the transfer size
691 * does not cover the entire buffer.
692 */
693 if (fs->fs_bsize > xfersize)
694 flags |= BA_CLRBUF;
695 else
696 flags &= ~BA_CLRBUF;
697/* XXX is uio->uio_offset the right thing here? */
698 error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
699 ap->a_cred, flags, &bp);
700 if (error != 0)
701 break;
702 /*
703 * If the buffer is not valid we have to clear out any
704 * garbage data from the pages instantiated for the buffer.
705 * If we do not, a failed uiomove() during a write can leave
706 * the prior contents of the pages exposed to a userland
707 * mmap(). XXX deal with uiomove() errors a better way.
708 */
709 if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
710 vfs_bio_clrbuf(bp);
711 if (ioflag & IO_DIRECT)
712 bp->b_flags |= B_DIRECT;
713
714 if (uio->uio_offset + xfersize > ip->i_size) {
715 ip->i_size = uio->uio_offset + xfersize;
716 DIP(ip, i_size) = ip->i_size;
717 extended = 1;
718 }
719
720 size = blksize(fs, ip, lbn) - bp->b_resid;
721 if (size < xfersize)
722 xfersize = size;
723
724 error =
725 uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
726 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
727 (LIST_FIRST(&bp->b_dep) == NULL)) {
728 bp->b_flags |= B_RELBUF;
729 }
730
731 /*
732 * If IO_SYNC each buffer is written synchronously. Otherwise
733 * if we have a severe page deficiency write the buffer
734 * asynchronously. Otherwise try to cluster, and if that
735 * doesn't do it then either do an async write (if O_DIRECT),
736 * or a delayed write (if not).
737 */
738 if (ioflag & IO_SYNC) {
739 (void)bwrite(bp);
740 } else if (vm_page_count_severe() ||
741 buf_dirty_count_severe() ||
742 (ioflag & IO_ASYNC)) {
743 bp->b_flags |= B_CLUSTEROK;
744 bawrite(bp);
745 } else if (xfersize + blkoffset == fs->fs_bsize) {
746 if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
747 bp->b_flags |= B_CLUSTEROK;
748 cluster_write(bp, ip->i_size, seqcount);
749 } else {
750 bawrite(bp);
751 }
752 } else if (ioflag & IO_DIRECT) {
753 bp->b_flags |= B_CLUSTEROK;
754 bawrite(bp);
755 } else {
756 bp->b_flags |= B_CLUSTEROK;
757 bdwrite(bp);
758 }
759 if (error || xfersize == 0)
760 break;
761 ip->i_flag |= IN_CHANGE | IN_UPDATE;
762 }
763 /*
764 * If we successfully wrote any data, and we are not the superuser
765 * we clear the setuid and setgid bits as a precaution against
766 * tampering.
767 */
768 if (resid > uio->uio_resid && ap->a_cred &&
769 suser_cred(ap->a_cred, PRISON_ROOT)) {
770 ip->i_mode &= ~(ISUID | ISGID);
771 DIP(ip, i_mode) = ip->i_mode;
772 }
773 if (resid > uio->uio_resid)
774 VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0));
775 if (error) {
776 if (ioflag & IO_UNIT) {
777 (void)UFS_TRUNCATE(vp, osize,
778 IO_NORMAL | (ioflag & IO_SYNC),
779 ap->a_cred, uio->uio_td);
780 uio->uio_offset -= resid - uio->uio_resid;
781 uio->uio_resid = resid;
782 }
783 } else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
784 error = UFS_UPDATE(vp, 1);
785
786 if (object) {
787 VM_OBJECT_LOCK(object);
788 vm_object_vndeallocate(object);
789 }
790
791 return (error);
792}
793
794/*
795 * get page routine
796 */
797static int
798ffs_getpages(ap)
799 struct vop_getpages_args *ap;
800{
801 off_t foff, physoffset;
802 int i, size, bsize;
803 struct vnode *dp, *vp;
804 vm_object_t obj;
805 vm_pindex_t pindex;
806 vm_page_t mreq;
807 int bbackwards, bforwards;
808 int pbackwards, pforwards;
809 int firstpage;
810 ufs2_daddr_t reqblkno, reqlblkno;
811 int poff;
812 int pcount;
813 int rtval;
814 int pagesperblock;
815
816 GIANT_REQUIRED;
817
818 pcount = round_page(ap->a_count) / PAGE_SIZE;
819 mreq = ap->a_m[ap->a_reqpage];
820
821 /*
822 * if ANY DEV_BSIZE blocks are valid on a large filesystem block,
823 * then the entire page is valid. Since the page may be mapped,
824 * user programs might reference data beyond the actual end of file
 825	 * occurring within the page. We have to zero that data.
826 */
827 if (mreq->valid) {
828 if (mreq->valid != VM_PAGE_BITS_ALL)
829 vm_page_zero_invalid(mreq, TRUE);
830 VM_OBJECT_LOCK(mreq->object);
831 vm_page_lock_queues();
832 for (i = 0; i < pcount; i++) {
833 if (i != ap->a_reqpage) {
834 vm_page_free(ap->a_m[i]);
835 }
836 }
837 vm_page_unlock_queues();
838 VM_OBJECT_UNLOCK(mreq->object);
839 return VM_PAGER_OK;
840 }
841
842 vp = ap->a_vp;
843 obj = vp->v_object;
844 bsize = vp->v_mount->mnt_stat.f_iosize;
845 pindex = mreq->pindex;
846 foff = IDX_TO_OFF(pindex) /* + ap->a_offset should be zero */;
847
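	/*
	 * If the filesystem block size is smaller than a page, punt to
	 * the generic vnode pager, which knows how to assemble a page
	 * from multiple blocks.
	 */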
848 if (bsize < PAGE_SIZE)
849 return vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
850 ap->a_count,
851 ap->a_reqpage);
852
853 /*
854 * foff is the file offset of the required page
855 * reqlblkno is the logical block that contains the page
856 * poff is the index of the page into the logical block
857 */
858 reqlblkno = foff / bsize;
859 poff = (foff % bsize) / PAGE_SIZE;
860
861 dp = VTOI(vp)->i_devvp;
862 if (ufs_bmaparray(vp, reqlblkno, &reqblkno, 0, &bforwards, &bbackwards)
863 || (reqblkno == -1)) {
864 VM_OBJECT_LOCK(obj);
865 vm_page_lock_queues();
866 for(i = 0; i < pcount; i++) {
867 if (i != ap->a_reqpage)
868 vm_page_free(ap->a_m[i]);
869 }
870 vm_page_unlock_queues();
871 VM_OBJECT_UNLOCK(obj);
872 if (reqblkno == -1) {
873 if ((mreq->flags & PG_ZERO) == 0)
874 pmap_zero_page(mreq);
875 vm_page_undirty(mreq);
876 mreq->valid = VM_PAGE_BITS_ALL;
877 return VM_PAGER_OK;
878 } else {
879 return VM_PAGER_ERROR;
880 }
881 }
882
883 physoffset = (off_t)reqblkno * DEV_BSIZE + poff * PAGE_SIZE;
884 pagesperblock = bsize / PAGE_SIZE;
885 /*
886 * find the first page that is contiguous...
887 * note that pbackwards is the number of pages that are contiguous
888 * backwards.
889 */
890 firstpage = 0;
891 if (ap->a_count) {
892 pbackwards = poff + bbackwards * pagesperblock;
893 if (ap->a_reqpage > pbackwards) {
894 firstpage = ap->a_reqpage - pbackwards;
895 VM_OBJECT_LOCK(obj);
896 vm_page_lock_queues();
897 for(i=0;i<firstpage;i++)
898 vm_page_free(ap->a_m[i]);
899 vm_page_unlock_queues();
900 VM_OBJECT_UNLOCK(obj);
901 }
902
903 /*
904 * pforwards is the number of pages that are contiguous
905 * after the current page.
906 */
907 pforwards = (pagesperblock - (poff + 1)) +
908 bforwards * pagesperblock;
909 if (pforwards < (pcount - (ap->a_reqpage + 1))) {
910 VM_OBJECT_LOCK(obj);
911 vm_page_lock_queues();
912 for( i = ap->a_reqpage + pforwards + 1; i < pcount; i++)
913 vm_page_free(ap->a_m[i]);
914 vm_page_unlock_queues();
915 VM_OBJECT_UNLOCK(obj);
916 pcount = ap->a_reqpage + pforwards + 1;
917 }
918
919 /*
920 * number of pages for I/O corrected for the non-contig pages at
921 * the beginning of the array.
922 */
923 pcount -= firstpage;
924 }
925
926 /*
927 * calculate the size of the transfer
928 */
929
930 size = pcount * PAGE_SIZE;
931
932 if ((IDX_TO_OFF(ap->a_m[firstpage]->pindex) + size) >
933 obj->un_pager.vnp.vnp_size)
934 size = obj->un_pager.vnp.vnp_size -
935 IDX_TO_OFF(ap->a_m[firstpage]->pindex);
936
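	/*
	 * Let the underlying device vnode perform the actual I/O.  The
	 * offset passed down is the difference between the physical
	 * (device) address and the file offset of the requested page.
	 */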
937 physoffset -= foff;
938 rtval = VOP_GETPAGES(dp, &ap->a_m[firstpage], size,
939 (ap->a_reqpage - firstpage), physoffset);
940
941 return (rtval);
942}
943
944/*
945 * Extended attribute area reading.
946 */
947static int
948ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
949{
950 struct inode *ip;
951 struct ufs2_dinode *dp;
952 struct fs *fs;
953 struct buf *bp;
954 ufs_lbn_t lbn, nextlbn;
955 off_t bytesinfile;
956 long size, xfersize, blkoffset;
957 int error, orig_resid;
958
959 GIANT_REQUIRED;
960
961 ip = VTOI(vp);
962 fs = ip->i_fs;
963 dp = ip->i_din2;
964
965#ifdef DIAGNOSTIC
966 if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
967 panic("ffs_extread: mode");
968
969#endif
970 orig_resid = uio->uio_resid;
971 if (orig_resid <= 0)
972 return (0);
973
974 bytesinfile = dp->di_extsize - uio->uio_offset;
975 if (bytesinfile <= 0) {
976 if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
977 ip->i_flag |= IN_ACCESS;
978 return 0;
979 }
980
981 for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
982 if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
983 break;
984
985 lbn = lblkno(fs, uio->uio_offset);
986 nextlbn = lbn + 1;
987
988 /*
989 * size of buffer. The buffer representing the
990 * end of the file is rounded up to the size of
991 * the block type ( fragment or full block,
992 * depending ).
993 */
994 size = sblksize(fs, dp->di_extsize, lbn);
995 blkoffset = blkoff(fs, uio->uio_offset);
996
997 /*
998 * The amount we want to transfer in this iteration is
999 * one FS block less the amount of the data before
1000 * our startpoint (duh!)
1001 */
1002 xfersize = fs->fs_bsize - blkoffset;
1003
1004 /*
1005 * But if we actually want less than the block,
1006 * or the file doesn't have a whole block more of data,
1007 * then use the lesser number.
1008 */
1009 if (uio->uio_resid < xfersize)
1010 xfersize = uio->uio_resid;
1011 if (bytesinfile < xfersize)
1012 xfersize = bytesinfile;
1013
1014 if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
1015 /*
1016 * Don't do readahead if this is the end of the info.
1017 */
1018 error = bread(vp, -1 - lbn, size, NOCRED, &bp);
1019 } else {
1020 /*
1021 * If we have a second block, then
1022 * fire off a request for a readahead
1023 * as well as a read. Note that the 4th and 5th
1024 * arguments point to arrays of the size specified in
1025 * the 6th argument.
1026 */
1027 int nextsize = sblksize(fs, dp->di_extsize, nextlbn);
1028
1029 nextlbn = -1 - nextlbn;
1030 error = breadn(vp, -1 - lbn,
1031 size, &nextlbn, &nextsize, 1, NOCRED, &bp);
1032 }
1033 if (error) {
1034 brelse(bp);
1035 bp = NULL;
1036 break;
1037 }
1038
1039 /*
1040 * If IO_DIRECT then set B_DIRECT for the buffer. This
1041 * will cause us to attempt to release the buffer later on
1042 * and will cause the buffer cache to attempt to free the
1043 * underlying pages.
1044 */
1045 if (ioflag & IO_DIRECT)
1046 bp->b_flags |= B_DIRECT;
1047
1048 /*
1049 * We should only get non-zero b_resid when an I/O error
1050 * has occurred, which should cause us to break above.
1051 * However, if the short read did not cause an error,
1052 * then we want to ensure that we do not uiomove bad
1053 * or uninitialized data.
1054 */
1055 size -= bp->b_resid;
1056 if (size < xfersize) {
1057 if (size == 0)
1058 break;
1059 xfersize = size;
1060 }
1061
1062 error = uiomove((char *)bp->b_data + blkoffset,
1063 (int)xfersize, uio);
1064 if (error)
1065 break;
1066
1067 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
1068 (LIST_FIRST(&bp->b_dep) == NULL)) {
1069 /*
1070 * If there are no dependencies, and it's VMIO,
1071 * then we don't need the buf, mark it available
1072 * for freeing. The VM has the data.
1073 */
1074 bp->b_flags |= B_RELBUF;
1075 brelse(bp);
1076 } else {
1077 /*
1078 * Otherwise let whoever
1079 * made the request take care of
1080 * freeing it. We just queue
1081 * it onto another list.
1082 */
1083 bqrelse(bp);
1084 }
1085 }
1086
1087 /*
1088 * This can only happen in the case of an error
1089 * because the loop above resets bp to NULL on each iteration
1090 * and on normal completion has not set a new value into it.
1091 * so it must have come from a 'break' statement
1092 */
1093 if (bp != NULL) {
1094 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
1095 (LIST_FIRST(&bp->b_dep) == NULL)) {
1096 bp->b_flags |= B_RELBUF;
1097 brelse(bp);
1098 } else {
1099 bqrelse(bp);
1100 }
1101 }
1102
1103 if ((error == 0 || uio->uio_resid != orig_resid) &&
1104 (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
1105 ip->i_flag |= IN_ACCESS;
1106 return (error);
1107}
1108
1109/*
1110 * Extended attribute area writing.
1111 */
1112static int
1113ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
1114{
1115 struct inode *ip;
1116 struct ufs2_dinode *dp;
1117 struct fs *fs;
1118 struct buf *bp;
1119 ufs_lbn_t lbn;
1120 off_t osize;
1121 int blkoffset, error, flags, resid, size, xfersize;
1122
1123 GIANT_REQUIRED;
1124
1125 ip = VTOI(vp);
1126 fs = ip->i_fs;
1127 dp = ip->i_din2;
1128
1129#ifdef DIAGNOSTIC
1130 if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
1131 panic("ext_write: mode");
1132#endif
1133
1134 if (ioflag & IO_APPEND)
1135 uio->uio_offset = dp->di_extsize;
1136
1137 if (uio->uio_offset < 0 ||
1138 (u_int64_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize)
1139 return (EFBIG);
1140
1141 resid = uio->uio_resid;
1142 osize = dp->di_extsize;
1143 flags = IO_EXT;
1144 if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
1145 flags |= IO_SYNC;
1146
1147 for (error = 0; uio->uio_resid > 0;) {
1148 lbn = lblkno(fs, uio->uio_offset);
1149 blkoffset = blkoff(fs, uio->uio_offset);
1150 xfersize = fs->fs_bsize - blkoffset;
1151 if (uio->uio_resid < xfersize)
1152 xfersize = uio->uio_resid;
1153
1154 /*
1155 * We must perform a read-before-write if the transfer size
1156 * does not cover the entire buffer.
1157 */
1158 if (fs->fs_bsize > xfersize)
1159 flags |= BA_CLRBUF;
1160 else
1161 flags &= ~BA_CLRBUF;
1162 error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
1163 ucred, flags, &bp);
1164 if (error != 0)
1165 break;
1166 /*
1167 * If the buffer is not valid we have to clear out any
1168 * garbage data from the pages instantiated for the buffer.
1169 * If we do not, a failed uiomove() during a write can leave
1170 * the prior contents of the pages exposed to a userland
1171 * mmap(). XXX deal with uiomove() errors a better way.
1172 */
1173 if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
1174 vfs_bio_clrbuf(bp);
1175 if (ioflag & IO_DIRECT)
1176 bp->b_flags |= B_DIRECT;
1177
1178 if (uio->uio_offset + xfersize > dp->di_extsize)
1179 dp->di_extsize = uio->uio_offset + xfersize;
1180
1181 size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
1182 if (size < xfersize)
1183 xfersize = size;
1184
1185 error =
1186 uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
1187 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
1188 (LIST_FIRST(&bp->b_dep) == NULL)) {
1189 bp->b_flags |= B_RELBUF;
1190 }
1191
1192 /*
1193 * If IO_SYNC each buffer is written synchronously. Otherwise
1194 * if we have a severe page deficiency write the buffer
1195 * asynchronously. Otherwise try to cluster, and if that
1196 * doesn't do it then either do an async write (if O_DIRECT),
1197 * or a delayed write (if not).
1198 */
1199 if (ioflag & IO_SYNC) {
1200 (void)bwrite(bp);
1201 } else if (vm_page_count_severe() ||
1202 buf_dirty_count_severe() ||
1203 xfersize + blkoffset == fs->fs_bsize ||
1204 (ioflag & (IO_ASYNC | IO_DIRECT)))
1205 bawrite(bp);
1206 else
1207 bdwrite(bp);
1208 if (error || xfersize == 0)
1209 break;
1210 ip->i_flag |= IN_CHANGE | IN_UPDATE;
1211 }
1212 /*
1213 * If we successfully wrote any data, and we are not the superuser
1214 * we clear the setuid and setgid bits as a precaution against
1215 * tampering.
1216 */
1217 if (resid > uio->uio_resid && ucred &&
1218 suser_cred(ucred, PRISON_ROOT)) {
1219 ip->i_mode &= ~(ISUID | ISGID);
1220 dp->di_mode = ip->i_mode;
1221 }
1222 if (error) {
1223 if (ioflag & IO_UNIT) {
1224 (void)UFS_TRUNCATE(vp, osize,
1225 IO_EXT | (ioflag&IO_SYNC), ucred, uio->uio_td);
1226 uio->uio_offset -= resid - uio->uio_resid;
1227 uio->uio_resid = resid;
1228 }
1229 } else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
1230 error = UFS_UPDATE(vp, 1);
1231 return (error);
1232}
1233
1234
1235/*
1236 * Vnode operating to retrieve a named extended attribute.
1237 *
1238 * Locate a particular EA (nspace:name) in the area (ptr:length), and return
1239 * the length of the EA, and possibly the pointer to the entry and to the data.
1240 */
1241static int
47
48#include <sys/param.h>
49#include <sys/bio.h>
50#include <sys/systm.h>
51#include <sys/buf.h>
52#include <sys/conf.h>
53#include <sys/extattr.h>
54#include <sys/kernel.h>
55#include <sys/limits.h>
56#include <sys/malloc.h>
57#include <sys/mount.h>
58#include <sys/proc.h>
59#include <sys/resourcevar.h>
60#include <sys/signalvar.h>
61#include <sys/stat.h>
62#include <sys/vmmeter.h>
63#include <sys/vnode.h>
64
65#include <vm/vm.h>
66#include <vm/vm_extern.h>
67#include <vm/vm_object.h>
68#include <vm/vm_page.h>
69#include <vm/vm_pager.h>
70#include <vm/vnode_pager.h>
71
72#include <ufs/ufs/extattr.h>
73#include <ufs/ufs/quota.h>
74#include <ufs/ufs/inode.h>
75#include <ufs/ufs/ufs_extern.h>
76#include <ufs/ufs/ufsmount.h>
77
78#include <ufs/ffs/fs.h>
79#include <ufs/ffs/ffs_extern.h>
80#include "opt_directio.h"
81
82#ifdef DIRECTIO
83extern int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
84#endif
85static int ffs_fsync(struct vop_fsync_args *);
86static int ffs_getpages(struct vop_getpages_args *);
87static int ffs_read(struct vop_read_args *);
88static int ffs_write(struct vop_write_args *);
89static int ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
90static int ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
91 struct ucred *cred);
92static int ffsext_strategy(struct vop_strategy_args *);
93static int ffs_closeextattr(struct vop_closeextattr_args *);
94static int ffs_deleteextattr(struct vop_deleteextattr_args *);
95static int ffs_getextattr(struct vop_getextattr_args *);
96static int ffs_listextattr(struct vop_listextattr_args *);
97static int ffs_openextattr(struct vop_openextattr_args *);
98static int ffs_setextattr(struct vop_setextattr_args *);
99
100
101/* Global vfs data structures for ufs. */
102vop_t **ffs_vnodeop_p;
103static struct vnodeopv_entry_desc ffs_vnodeop_entries[] = {
104 { &vop_default_desc, (vop_t *) ufs_vnoperate },
105 { &vop_fsync_desc, (vop_t *) ffs_fsync },
106 { &vop_getpages_desc, (vop_t *) ffs_getpages },
107 { &vop_read_desc, (vop_t *) ffs_read },
108 { &vop_reallocblks_desc, (vop_t *) ffs_reallocblks },
109 { &vop_write_desc, (vop_t *) ffs_write },
110 { &vop_closeextattr_desc, (vop_t *) ffs_closeextattr },
111 { &vop_deleteextattr_desc, (vop_t *) ffs_deleteextattr },
112 { &vop_getextattr_desc, (vop_t *) ffs_getextattr },
113 { &vop_listextattr_desc, (vop_t *) ffs_listextattr },
114 { &vop_openextattr_desc, (vop_t *) ffs_openextattr },
115 { &vop_setextattr_desc, (vop_t *) ffs_setextattr },
116 { NULL, NULL }
117};
118static struct vnodeopv_desc ffs_vnodeop_opv_desc =
119 { &ffs_vnodeop_p, ffs_vnodeop_entries };
120
121vop_t **ffs_specop_p;
122static struct vnodeopv_entry_desc ffs_specop_entries[] = {
123 { &vop_default_desc, (vop_t *) ufs_vnoperatespec },
124 { &vop_fsync_desc, (vop_t *) ffs_fsync },
125 { &vop_reallocblks_desc, (vop_t *) ffs_reallocblks },
126 { &vop_strategy_desc, (vop_t *) ffsext_strategy },
127 { &vop_closeextattr_desc, (vop_t *) ffs_closeextattr },
128 { &vop_deleteextattr_desc, (vop_t *) ffs_deleteextattr },
129 { &vop_getextattr_desc, (vop_t *) ffs_getextattr },
130 { &vop_listextattr_desc, (vop_t *) ffs_listextattr },
131 { &vop_openextattr_desc, (vop_t *) ffs_openextattr },
132 { &vop_setextattr_desc, (vop_t *) ffs_setextattr },
133 { NULL, NULL }
134};
135static struct vnodeopv_desc ffs_specop_opv_desc =
136 { &ffs_specop_p, ffs_specop_entries };
137
138vop_t **ffs_fifoop_p;
139static struct vnodeopv_entry_desc ffs_fifoop_entries[] = {
140 { &vop_default_desc, (vop_t *) ufs_vnoperatefifo },
141 { &vop_fsync_desc, (vop_t *) ffs_fsync },
142 { &vop_reallocblks_desc, (vop_t *) ffs_reallocblks },
143 { &vop_strategy_desc, (vop_t *) ffsext_strategy },
144 { &vop_closeextattr_desc, (vop_t *) ffs_closeextattr },
145 { &vop_deleteextattr_desc, (vop_t *) ffs_deleteextattr },
146 { &vop_getextattr_desc, (vop_t *) ffs_getextattr },
147 { &vop_listextattr_desc, (vop_t *) ffs_listextattr },
148 { &vop_openextattr_desc, (vop_t *) ffs_openextattr },
149 { &vop_setextattr_desc, (vop_t *) ffs_setextattr },
150 { NULL, NULL }
151};
152static struct vnodeopv_desc ffs_fifoop_opv_desc =
153 { &ffs_fifoop_p, ffs_fifoop_entries };
154
155VNODEOP_SET(ffs_vnodeop_opv_desc);
156VNODEOP_SET(ffs_specop_opv_desc);
157VNODEOP_SET(ffs_fifoop_opv_desc);
158
159/*
160 * Synch an open file.
161 */
162/* ARGSUSED */
163static int
164ffs_fsync(ap)
165 struct vop_fsync_args /* {
166 struct vnode *a_vp;
167 struct ucred *a_cred;
168 int a_waitfor;
169 struct thread *a_td;
170 } */ *ap;
171{
172 struct vnode *vp = ap->a_vp;
173 struct inode *ip = VTOI(vp);
174 struct buf *bp;
175 struct buf *nbp;
176 int s, error, wait, passes, skipmeta;
177 ufs_lbn_t lbn;
178
179 wait = (ap->a_waitfor == MNT_WAIT);
180 if (vn_isdisk(vp, NULL)) {
181 lbn = INT_MAX;
182 if (vp->v_rdev->si_mountpoint != NULL &&
183 (vp->v_rdev->si_mountpoint->mnt_flag & MNT_SOFTDEP))
184 softdep_fsync_mountdev(vp);
185 } else {
186 lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1));
187 }
188
189 /*
190 * Flush all dirty buffers associated with a vnode.
191 */
192 passes = NIADDR + 1;
193 skipmeta = 0;
194 if (wait)
195 skipmeta = 1;
196 s = splbio();
197 VI_LOCK(vp);
198loop:
199 TAILQ_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs)
200 bp->b_vflags &= ~BV_SCANNED;
201 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
202 nbp = TAILQ_NEXT(bp, b_vnbufs);
203 /*
204 * Reasons to skip this buffer: it has already been considered
205 * on this pass, this pass is the first time through on a
206 * synchronous flush request and the buffer being considered
207 * is metadata, the buffer has dependencies that will cause
208 * it to be redirtied and it has not already been deferred,
209 * or it is already being written.
210 */
211 if ((bp->b_vflags & BV_SCANNED) != 0)
212 continue;
213 bp->b_vflags |= BV_SCANNED;
214 if ((skipmeta == 1 && bp->b_lblkno < 0))
215 continue;
216 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
217 continue;
218 if (!wait && LIST_FIRST(&bp->b_dep) != NULL &&
219 (bp->b_flags & B_DEFERRED) == 0 &&
220 buf_countdeps(bp, 0)) {
221 bp->b_flags |= B_DEFERRED;
222 BUF_UNLOCK(bp);
223 continue;
224 }
225 VI_UNLOCK(vp);
226 if ((bp->b_flags & B_DELWRI) == 0)
227 panic("ffs_fsync: not dirty");
228 if (vp != bp->b_vp)
229 panic("ffs_fsync: vp != vp->b_vp");
230 /*
231 * If this is a synchronous flush request, or it is not a
232 * file or device, start the write on this buffer immediatly.
233 */
234 if (wait || (vp->v_type != VREG && vp->v_type != VBLK)) {
235
236 /*
237 * On our final pass through, do all I/O synchronously
238 * so that we can find out if our flush is failing
239 * because of write errors.
240 */
241 if (passes > 0 || !wait) {
242 if ((bp->b_flags & B_CLUSTEROK) && !wait) {
243 (void) vfs_bio_awrite(bp);
244 } else {
245 bremfree(bp);
246 splx(s);
247 (void) bawrite(bp);
248 s = splbio();
249 }
250 } else {
251 bremfree(bp);
252 splx(s);
253 if ((error = bwrite(bp)) != 0)
254 return (error);
255 s = splbio();
256 }
257 } else if ((vp->v_type == VREG) && (bp->b_lblkno >= lbn)) {
258 /*
259 * If the buffer is for data that has been truncated
260 * off the file, then throw it away.
261 */
262 bremfree(bp);
263 bp->b_flags |= B_INVAL | B_NOCACHE;
264 splx(s);
265 brelse(bp);
266 s = splbio();
267 } else
268 vfs_bio_awrite(bp);
269
270 /*
271 * Since we may have slept during the I/O, we need
272 * to start from a known point.
273 */
274 VI_LOCK(vp);
275 nbp = TAILQ_FIRST(&vp->v_dirtyblkhd);
276 }
277 /*
278 * If we were asked to do this synchronously, then go back for
279 * another pass, this time doing the metadata.
280 */
281 if (skipmeta) {
282 skipmeta = 0;
283 goto loop;
284 }
285
286 if (wait) {
287 while (vp->v_numoutput) {
288 vp->v_iflag |= VI_BWAIT;
289 msleep((caddr_t)&vp->v_numoutput, VI_MTX(vp),
290 PRIBIO + 4, "ffsfsn", 0);
291 }
292 VI_UNLOCK(vp);
293
294 /*
295 * Ensure that any filesystem metatdata associated
296 * with the vnode has been written.
297 */
298 splx(s);
299 if ((error = softdep_sync_metadata(ap)) != 0)
300 return (error);
301 s = splbio();
302
303 VI_LOCK(vp);
304 if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
305 /*
306 * Block devices associated with filesystems may
307 * have new I/O requests posted for them even if
308 * the vnode is locked, so no amount of trying will
309 * get them clean. Thus we give block devices a
310 * good effort, then just give up. For all other file
311 * types, go around and try again until it is clean.
312 */
313 if (passes > 0) {
314 passes -= 1;
315 goto loop;
316 }
317#ifdef DIAGNOSTIC
318 if (!vn_isdisk(vp, NULL))
319 vprint("ffs_fsync: dirty", vp);
320#endif
321 }
322 }
323 VI_UNLOCK(vp);
324 splx(s);
325 return (UFS_UPDATE(vp, wait));
326}
327
328
329/*
330 * Vnode op for reading.
331 */
332/* ARGSUSED */
333static int
334ffs_read(ap)
335 struct vop_read_args /* {
336 struct vnode *a_vp;
337 struct uio *a_uio;
338 int a_ioflag;
339 struct ucred *a_cred;
340 } */ *ap;
341{
342 struct vnode *vp;
343 struct inode *ip;
344 struct uio *uio;
345 struct fs *fs;
346 struct buf *bp;
347 ufs_lbn_t lbn, nextlbn;
348 off_t bytesinfile;
349 long size, xfersize, blkoffset;
350 int error, orig_resid;
351 int seqcount;
352 int ioflag;
353 vm_object_t object;
354
355 vp = ap->a_vp;
356 uio = ap->a_uio;
357 ioflag = ap->a_ioflag;
358 if (ap->a_ioflag & IO_EXT)
359#ifdef notyet
360 return (ffs_extread(vp, uio, ioflag));
361#else
362 panic("ffs_read+IO_EXT");
363#endif
364#ifdef DIRECTIO
365 if ((ioflag & IO_DIRECT) != 0) {
366 int workdone;
367
368 error = ffs_rawread(vp, uio, &workdone);
369 if (error != 0 || workdone != 0)
370 return error;
371 }
372#endif
373
374 GIANT_REQUIRED;
375
376 seqcount = ap->a_ioflag >> 16;
377 ip = VTOI(vp);
378
379#ifdef DIAGNOSTIC
380 if (uio->uio_rw != UIO_READ)
381 panic("ffs_read: mode");
382
383 if (vp->v_type == VLNK) {
384 if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
385 panic("ffs_read: short symlink");
386 } else if (vp->v_type != VREG && vp->v_type != VDIR)
387 panic("ffs_read: type %d", vp->v_type);
388#endif
389 fs = ip->i_fs;
390 if ((u_int64_t)uio->uio_offset > fs->fs_maxfilesize)
391 return (EFBIG);
392
393 orig_resid = uio->uio_resid;
394 if (orig_resid <= 0)
395 return (0);
396
397 object = vp->v_object;
398
399 bytesinfile = ip->i_size - uio->uio_offset;
400 if (bytesinfile <= 0) {
401 if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
402 ip->i_flag |= IN_ACCESS;
403 return 0;
404 }
405
406 if (object) {
407 vm_object_reference(object);
408 }
409
410 /*
411 * Ok so we couldn't do it all in one vm trick...
412 * so cycle around trying smaller bites..
413 */
414 for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
415 if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
416 break;
417
418 lbn = lblkno(fs, uio->uio_offset);
419 nextlbn = lbn + 1;
420
421 /*
422 * size of buffer. The buffer representing the
423 * end of the file is rounded up to the size of
424 * the block type ( fragment or full block,
425 * depending ).
426 */
427 size = blksize(fs, ip, lbn);
428 blkoffset = blkoff(fs, uio->uio_offset);
429
430 /*
431 * The amount we want to transfer in this iteration is
432 * one FS block less the amount of the data before
433 * our startpoint (duh!)
434 */
435 xfersize = fs->fs_bsize - blkoffset;
436
437 /*
438 * But if we actually want less than the block,
439 * or the file doesn't have a whole block more of data,
440 * then use the lesser number.
441 */
442 if (uio->uio_resid < xfersize)
443 xfersize = uio->uio_resid;
444 if (bytesinfile < xfersize)
445 xfersize = bytesinfile;
446
447 if (lblktosize(fs, nextlbn) >= ip->i_size) {
448 /*
449 * Don't do readahead if this is the end of the file.
450 */
451 error = bread(vp, lbn, size, NOCRED, &bp);
452 } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
453 /*
454 * Otherwise if we are allowed to cluster,
455 * grab as much as we can.
456 *
457 * XXX This may not be a win if we are not
458 * doing sequential access.
459 */
460 error = cluster_read(vp, ip->i_size, lbn,
461 size, NOCRED, uio->uio_resid, seqcount, &bp);
462 } else if (seqcount > 1) {
463 /*
464 * If we are NOT allowed to cluster, then
465 * if we appear to be acting sequentially,
466 * fire off a request for a readahead
467 * as well as a read. Note that the 4th and 5th
468 * arguments point to arrays of the size specified in
469 * the 6th argument.
470 */
471 int nextsize = blksize(fs, ip, nextlbn);
472 error = breadn(vp, lbn,
473 size, &nextlbn, &nextsize, 1, NOCRED, &bp);
474 } else {
475 /*
476 * Failing all of the above, just read what the
477 * user asked for. Interestingly, the same as
478 * the first option above.
479 */
480 error = bread(vp, lbn, size, NOCRED, &bp);
481 }
482 if (error) {
483 brelse(bp);
484 bp = NULL;
485 break;
486 }
487
488 /*
489 * If IO_DIRECT then set B_DIRECT for the buffer. This
490 * will cause us to attempt to release the buffer later on
491 * and will cause the buffer cache to attempt to free the
492 * underlying pages.
493 */
494 if (ioflag & IO_DIRECT)
495 bp->b_flags |= B_DIRECT;
496
497 /*
498 * We should only get non-zero b_resid when an I/O error
499 * has occurred, which should cause us to break above.
500 * However, if the short read did not cause an error,
501 * then we want to ensure that we do not uiomove bad
502 * or uninitialized data.
503 */
504 size -= bp->b_resid;
505 if (size < xfersize) {
506 if (size == 0)
507 break;
508 xfersize = size;
509 }
510
511 {
512 /*
513 * otherwise use the general form
514 */
515 error =
516 uiomove((char *)bp->b_data + blkoffset,
517 (int)xfersize, uio);
518 }
519
520 if (error)
521 break;
522
523 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
524 (LIST_FIRST(&bp->b_dep) == NULL)) {
525 /*
526 * If there are no dependencies, and it's VMIO,
527 * then we don't need the buf, mark it available
528 * for freeing. The VM has the data.
529 */
530 bp->b_flags |= B_RELBUF;
531 brelse(bp);
532 } else {
533 /*
534 * Otherwise let whoever
535 * made the request take care of
536 * freeing it. We just queue
537 * it onto another list.
538 */
539 bqrelse(bp);
540 }
541 }
542
543 /*
544 * This can only happen in the case of an error
545 * because the loop above resets bp to NULL on each iteration
546 * and on normal completion has not set a new value into it.
547 * so it must have come from a 'break' statement
548 */
549 if (bp != NULL) {
550 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
551 (LIST_FIRST(&bp->b_dep) == NULL)) {
552 bp->b_flags |= B_RELBUF;
553 brelse(bp);
554 } else {
555 bqrelse(bp);
556 }
557 }
558
559 if (object) {
560 VM_OBJECT_LOCK(object);
561 vm_object_vndeallocate(object);
562 }
563 if ((error == 0 || uio->uio_resid != orig_resid) &&
564 (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
565 ip->i_flag |= IN_ACCESS;
566 return (error);
567}
568
569/*
570 * Vnode op for writing.
571 */
572static int
573ffs_write(ap)
574 struct vop_write_args /* {
575 struct vnode *a_vp;
576 struct uio *a_uio;
577 int a_ioflag;
578 struct ucred *a_cred;
579 } */ *ap;
580{
581 struct vnode *vp;
582 struct uio *uio;
583 struct inode *ip;
584 struct fs *fs;
585 struct buf *bp;
586 struct thread *td;
587 ufs_lbn_t lbn;
588 off_t osize;
589 int seqcount;
590 int blkoffset, error, extended, flags, ioflag, resid, size, xfersize;
591 vm_object_t object;
592
593 vp = ap->a_vp;
594 uio = ap->a_uio;
595 ioflag = ap->a_ioflag;
596 if (ap->a_ioflag & IO_EXT)
597#ifdef notyet
598 return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
599#else
600 panic("ffs_read+IO_EXT");
601#endif
602
603 GIANT_REQUIRED;
604
605 extended = 0;
606 seqcount = ap->a_ioflag >> 16;
607 ip = VTOI(vp);
608
609 object = vp->v_object;
610 if (object) {
611 vm_object_reference(object);
612 }
613
614#ifdef DIAGNOSTIC
615 if (uio->uio_rw != UIO_WRITE)
616 panic("ffswrite: mode");
617#endif
618
619 switch (vp->v_type) {
620 case VREG:
621 if (ioflag & IO_APPEND)
622 uio->uio_offset = ip->i_size;
623 if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) {
624 if (object) {
625 VM_OBJECT_LOCK(object);
626 vm_object_vndeallocate(object);
627 }
628 return (EPERM);
629 }
630 /* FALLTHROUGH */
631 case VLNK:
632 break;
633 case VDIR:
634 panic("ffswrite: dir write");
635 break;
636 default:
637 panic("ffswrite: type %p %d (%d,%d)", vp, (int)vp->v_type,
638 (int)uio->uio_offset,
639 (int)uio->uio_resid
640 );
641 }
642
643 fs = ip->i_fs;
644 if (uio->uio_offset < 0 ||
645 (u_int64_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) {
646 if (object) {
647 VM_OBJECT_LOCK(object);
648 vm_object_vndeallocate(object);
649 }
650 return (EFBIG);
651 }
652 /*
653 * Maybe this should be above the vnode op call, but so long as
654 * file servers have no limits, I don't think it matters.
655 */
656 td = uio->uio_td;
657 if (vp->v_type == VREG && td &&
658 uio->uio_offset + uio->uio_resid >
659 td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
660 PROC_LOCK(td->td_proc);
661 psignal(td->td_proc, SIGXFSZ);
662 PROC_UNLOCK(td->td_proc);
663 if (object) {
664 VM_OBJECT_LOCK(object);
665 vm_object_vndeallocate(object);
666 }
667 return (EFBIG);
668 }
669
670 resid = uio->uio_resid;
671 osize = ip->i_size;
672 if (seqcount > BA_SEQMAX)
673 flags = BA_SEQMAX << BA_SEQSHIFT;
674 else
675 flags = seqcount << BA_SEQSHIFT;
676 if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
677 flags |= IO_SYNC;
678
679 for (error = 0; uio->uio_resid > 0;) {
680 lbn = lblkno(fs, uio->uio_offset);
681 blkoffset = blkoff(fs, uio->uio_offset);
682 xfersize = fs->fs_bsize - blkoffset;
683 if (uio->uio_resid < xfersize)
684 xfersize = uio->uio_resid;
685
686 if (uio->uio_offset + xfersize > ip->i_size)
687 vnode_pager_setsize(vp, uio->uio_offset + xfersize);
688
689 /*
690 * We must perform a read-before-write if the transfer size
691 * does not cover the entire buffer.
692 */
693 if (fs->fs_bsize > xfersize)
694 flags |= BA_CLRBUF;
695 else
696 flags &= ~BA_CLRBUF;
697/* XXX is uio->uio_offset the right thing here? */
698 error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
699 ap->a_cred, flags, &bp);
700 if (error != 0)
701 break;
702 /*
703 * If the buffer is not valid we have to clear out any
704 * garbage data from the pages instantiated for the buffer.
705 * If we do not, a failed uiomove() during a write can leave
706 * the prior contents of the pages exposed to a userland
707 * mmap(). XXX deal with uiomove() errors a better way.
708 */
709 if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
710 vfs_bio_clrbuf(bp);
711 if (ioflag & IO_DIRECT)
712 bp->b_flags |= B_DIRECT;
713
714 if (uio->uio_offset + xfersize > ip->i_size) {
715 ip->i_size = uio->uio_offset + xfersize;
716 DIP(ip, i_size) = ip->i_size;
717 extended = 1;
718 }
719
720 size = blksize(fs, ip, lbn) - bp->b_resid;
721 if (size < xfersize)
722 xfersize = size;
723
724 error =
725 uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
726 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
727 (LIST_FIRST(&bp->b_dep) == NULL)) {
728 bp->b_flags |= B_RELBUF;
729 }
730
731 /*
732 * If IO_SYNC each buffer is written synchronously. Otherwise
733 * if we have a severe page deficiency write the buffer
734 * asynchronously. Otherwise try to cluster, and if that
735 * doesn't do it then either do an async write (if O_DIRECT),
736 * or a delayed write (if not).
737 */
738 if (ioflag & IO_SYNC) {
739 (void)bwrite(bp);
740 } else if (vm_page_count_severe() ||
741 buf_dirty_count_severe() ||
742 (ioflag & IO_ASYNC)) {
743 bp->b_flags |= B_CLUSTEROK;
744 bawrite(bp);
745 } else if (xfersize + blkoffset == fs->fs_bsize) {
746 if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
747 bp->b_flags |= B_CLUSTEROK;
748 cluster_write(bp, ip->i_size, seqcount);
749 } else {
750 bawrite(bp);
751 }
752 } else if (ioflag & IO_DIRECT) {
753 bp->b_flags |= B_CLUSTEROK;
754 bawrite(bp);
755 } else {
756 bp->b_flags |= B_CLUSTEROK;
757 bdwrite(bp);
758 }
759 if (error || xfersize == 0)
760 break;
761 ip->i_flag |= IN_CHANGE | IN_UPDATE;
762 }
763 /*
764 * If we successfully wrote any data, and we are not the superuser
765 * we clear the setuid and setgid bits as a precaution against
766 * tampering.
767 */
768 if (resid > uio->uio_resid && ap->a_cred &&
769 suser_cred(ap->a_cred, PRISON_ROOT)) {
770 ip->i_mode &= ~(ISUID | ISGID);
771 DIP(ip, i_mode) = ip->i_mode;
772 }
773 if (resid > uio->uio_resid)
774 VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0));
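	/*
	 * On error with IO_UNIT set, undo the partial write: truncate the
	 * file back to its original size and restore the uio so the caller
	 * sees either a complete write or none at all.
	 */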
775 if (error) {
776 if (ioflag & IO_UNIT) {
777 (void)UFS_TRUNCATE(vp, osize,
778 IO_NORMAL | (ioflag & IO_SYNC),
779 ap->a_cred, uio->uio_td);
780 uio->uio_offset -= resid - uio->uio_resid;
781 uio->uio_resid = resid;
782 }
783 } else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
784 error = UFS_UPDATE(vp, 1);
785
786 if (object) {
787 VM_OBJECT_LOCK(object);
788 vm_object_vndeallocate(object);
789 }
790
791 return (error);
792}
793
794/*
795 * get page routine
796 */
797static int
798ffs_getpages(ap)
799 struct vop_getpages_args *ap;
800{
801 off_t foff, physoffset;
802 int i, size, bsize;
803 struct vnode *dp, *vp;
804 vm_object_t obj;
805 vm_pindex_t pindex;
806 vm_page_t mreq;
807 int bbackwards, bforwards;
808 int pbackwards, pforwards;
809 int firstpage;
810 ufs2_daddr_t reqblkno, reqlblkno;
811 int poff;
812 int pcount;
813 int rtval;
814 int pagesperblock;
815
816 GIANT_REQUIRED;
817
818 pcount = round_page(ap->a_count) / PAGE_SIZE;
819 mreq = ap->a_m[ap->a_reqpage];
820
821 /*
822 * if ANY DEV_BSIZE blocks are valid on a large filesystem block,
823 * then the entire page is valid. Since the page may be mapped,
824 * user programs might reference data beyond the actual end of file
825	 * occurring within the page. We have to zero that data.
826 */
827 if (mreq->valid) {
828 if (mreq->valid != VM_PAGE_BITS_ALL)
829 vm_page_zero_invalid(mreq, TRUE);
830 VM_OBJECT_LOCK(mreq->object);
831 vm_page_lock_queues();
832 for (i = 0; i < pcount; i++) {
833 if (i != ap->a_reqpage) {
834 vm_page_free(ap->a_m[i]);
835 }
836 }
837 vm_page_unlock_queues();
838 VM_OBJECT_UNLOCK(mreq->object);
839 return VM_PAGER_OK;
840 }
841
842 vp = ap->a_vp;
843 obj = vp->v_object;
844 bsize = vp->v_mount->mnt_stat.f_iosize;
845 pindex = mreq->pindex;
846 foff = IDX_TO_OFF(pindex) /* + ap->a_offset should be zero */;
847
848 if (bsize < PAGE_SIZE)
849 return vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
850 ap->a_count,
851 ap->a_reqpage);
852
853 /*
854 * foff is the file offset of the required page
855 * reqlblkno is the logical block that contains the page
856 * poff is the index of the page into the logical block
857 */
858 reqlblkno = foff / bsize;
859 poff = (foff % bsize) / PAGE_SIZE;
860
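	/*
	 * Map the logical block to its device block.  ufs_bmaparray() also
	 * reports how many blocks before and after it are physically
	 * contiguous; a reqblkno of -1 means the block is a hole, in which
	 * case the page is simply zero-filled below.
	 */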
861 dp = VTOI(vp)->i_devvp;
862 if (ufs_bmaparray(vp, reqlblkno, &reqblkno, 0, &bforwards, &bbackwards)
863 || (reqblkno == -1)) {
864 VM_OBJECT_LOCK(obj);
865 vm_page_lock_queues();
866 for(i = 0; i < pcount; i++) {
867 if (i != ap->a_reqpage)
868 vm_page_free(ap->a_m[i]);
869 }
870 vm_page_unlock_queues();
871 VM_OBJECT_UNLOCK(obj);
872 if (reqblkno == -1) {
873 if ((mreq->flags & PG_ZERO) == 0)
874 pmap_zero_page(mreq);
875 vm_page_undirty(mreq);
876 mreq->valid = VM_PAGE_BITS_ALL;
877 return VM_PAGER_OK;
878 } else {
879 return VM_PAGER_ERROR;
880 }
881 }
882
883 physoffset = (off_t)reqblkno * DEV_BSIZE + poff * PAGE_SIZE;
884 pagesperblock = bsize / PAGE_SIZE;
885 /*
886 * find the first page that is contiguous...
887 * note that pbackwards is the number of pages that are contiguous
888 * backwards.
889 */
890 firstpage = 0;
891 if (ap->a_count) {
892 pbackwards = poff + bbackwards * pagesperblock;
893 if (ap->a_reqpage > pbackwards) {
894 firstpage = ap->a_reqpage - pbackwards;
895 VM_OBJECT_LOCK(obj);
896 vm_page_lock_queues();
897 for(i=0;i<firstpage;i++)
898 vm_page_free(ap->a_m[i]);
899 vm_page_unlock_queues();
900 VM_OBJECT_UNLOCK(obj);
901 }
902
903 /*
904 * pforwards is the number of pages that are contiguous
905 * after the current page.
906 */
907 pforwards = (pagesperblock - (poff + 1)) +
908 bforwards * pagesperblock;
909 if (pforwards < (pcount - (ap->a_reqpage + 1))) {
910 VM_OBJECT_LOCK(obj);
911 vm_page_lock_queues();
912 for( i = ap->a_reqpage + pforwards + 1; i < pcount; i++)
913 vm_page_free(ap->a_m[i]);
914 vm_page_unlock_queues();
915 VM_OBJECT_UNLOCK(obj);
916 pcount = ap->a_reqpage + pforwards + 1;
917 }
918
919 /*
920 * number of pages for I/O corrected for the non-contig pages at
921 * the beginning of the array.
922 */
923 pcount -= firstpage;
924 }
925
926 /*
927 * calculate the size of the transfer
928 */
929
930 size = pcount * PAGE_SIZE;
931
932 if ((IDX_TO_OFF(ap->a_m[firstpage]->pindex) + size) >
933 obj->un_pager.vnp.vnp_size)
934 size = obj->un_pager.vnp.vnp_size -
935 IDX_TO_OFF(ap->a_m[firstpage]->pindex);
936
937 physoffset -= foff;
938 rtval = VOP_GETPAGES(dp, &ap->a_m[firstpage], size,
939 (ap->a_reqpage - firstpage), physoffset);
940
941 return (rtval);
942}
943
944/*
945 * Extended attribute area reading.
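 *
 * The UFS2 extended attribute area is addressed with negative logical
 * block numbers: attribute block N is read as lbn (-1 - N), which keeps
 * it distinct from the file's regular data blocks.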
946 */
947static int
948ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
949{
950 struct inode *ip;
951 struct ufs2_dinode *dp;
952 struct fs *fs;
953 struct buf *bp;
954 ufs_lbn_t lbn, nextlbn;
955 off_t bytesinfile;
956 long size, xfersize, blkoffset;
957 int error, orig_resid;
958
959 GIANT_REQUIRED;
960
961 ip = VTOI(vp);
962 fs = ip->i_fs;
963 dp = ip->i_din2;
964
965#ifdef DIAGNOSTIC
966 if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
967 panic("ffs_extread: mode");
968
969#endif
970 orig_resid = uio->uio_resid;
971 if (orig_resid <= 0)
972 return (0);
973
974 bytesinfile = dp->di_extsize - uio->uio_offset;
975 if (bytesinfile <= 0) {
976 if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
977 ip->i_flag |= IN_ACCESS;
978 return 0;
979 }
980
981 for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
982 if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
983 break;
984
985 lbn = lblkno(fs, uio->uio_offset);
986 nextlbn = lbn + 1;
987
988 /*
989 * size of buffer. The buffer representing the
990 * end of the file is rounded up to the size of
991 * the block type ( fragment or full block,
992 * depending ).
993 */
994 size = sblksize(fs, dp->di_extsize, lbn);
995 blkoffset = blkoff(fs, uio->uio_offset);
996
997 /*
998 * The amount we want to transfer in this iteration is
999 * one FS block less the amount of the data before
1000 * our startpoint (duh!)
1001 */
1002 xfersize = fs->fs_bsize - blkoffset;
1003
1004 /*
1005 * But if we actually want less than the block,
1006 * or the file doesn't have a whole block more of data,
1007 * then use the lesser number.
1008 */
1009 if (uio->uio_resid < xfersize)
1010 xfersize = uio->uio_resid;
1011 if (bytesinfile < xfersize)
1012 xfersize = bytesinfile;
1013
1014 if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
1015 /*
1016 * Don't do readahead if this is the end of the info.
1017 */
1018 error = bread(vp, -1 - lbn, size, NOCRED, &bp);
1019 } else {
1020 /*
1021 * If we have a second block, then
1022 * fire off a request for a readahead
1023 * as well as a read. Note that the 4th and 5th
1024 * arguments point to arrays of the size specified in
1025 * the 6th argument.
1026 */
1027 int nextsize = sblksize(fs, dp->di_extsize, nextlbn);
1028
1029 nextlbn = -1 - nextlbn;
1030 error = breadn(vp, -1 - lbn,
1031 size, &nextlbn, &nextsize, 1, NOCRED, &bp);
1032 }
1033 if (error) {
1034 brelse(bp);
1035 bp = NULL;
1036 break;
1037 }
1038
1039 /*
1040 * If IO_DIRECT then set B_DIRECT for the buffer. This
1041 * will cause us to attempt to release the buffer later on
1042 * and will cause the buffer cache to attempt to free the
1043 * underlying pages.
1044 */
1045 if (ioflag & IO_DIRECT)
1046 bp->b_flags |= B_DIRECT;
1047
1048 /*
1049 * We should only get non-zero b_resid when an I/O error
1050 * has occurred, which should cause us to break above.
1051 * However, if the short read did not cause an error,
1052 * then we want to ensure that we do not uiomove bad
1053 * or uninitialized data.
1054 */
1055 size -= bp->b_resid;
1056 if (size < xfersize) {
1057 if (size == 0)
1058 break;
1059 xfersize = size;
1060 }
1061
1062 error = uiomove((char *)bp->b_data + blkoffset,
1063 (int)xfersize, uio);
1064 if (error)
1065 break;
1066
1067 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
1068 (LIST_FIRST(&bp->b_dep) == NULL)) {
1069 /*
1070 * If there are no dependencies, and it's VMIO,
1071 * then we don't need the buf, mark it available
1072 * for freeing. The VM has the data.
1073 */
1074 bp->b_flags |= B_RELBUF;
1075 brelse(bp);
1076 } else {
1077 /*
1078 * Otherwise let whoever
1079 * made the request take care of
1080 * freeing it. We just queue
1081 * it onto another list.
1082 */
1083 bqrelse(bp);
1084 }
1085 }
1086
1087 /*
1088	 * This can only happen in the case of an error,
1089	 * because the loop above resets bp to NULL on each iteration
1090	 * and on normal completion does not set a new value into it,
1091	 * so it must have come from a 'break' statement.
1092 */
1093 if (bp != NULL) {
1094 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
1095 (LIST_FIRST(&bp->b_dep) == NULL)) {
1096 bp->b_flags |= B_RELBUF;
1097 brelse(bp);
1098 } else {
1099 bqrelse(bp);
1100 }
1101 }
1102
1103 if ((error == 0 || uio->uio_resid != orig_resid) &&
1104 (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
1105 ip->i_flag |= IN_ACCESS;
1106 return (error);
1107}
1108
1109/*
1110 * Extended attribute area writing.
1111 */
1112static int
1113ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
1114{
1115 struct inode *ip;
1116 struct ufs2_dinode *dp;
1117 struct fs *fs;
1118 struct buf *bp;
1119 ufs_lbn_t lbn;
1120 off_t osize;
1121 int blkoffset, error, flags, resid, size, xfersize;
1122
1123 GIANT_REQUIRED;
1124
1125 ip = VTOI(vp);
1126 fs = ip->i_fs;
1127 dp = ip->i_din2;
1128
1129#ifdef DIAGNOSTIC
1130 if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
1131 panic("ext_write: mode");
1132#endif
1133
1134 if (ioflag & IO_APPEND)
1135 uio->uio_offset = dp->di_extsize;
1136
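	/*
	 * The extended attribute area occupies at most NXADDR direct
	 * blocks, so refuse writes that would extend it past
	 * NXADDR * fs_bsize bytes.
	 */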
1137 if (uio->uio_offset < 0 ||
1138 (u_int64_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize)
1139 return (EFBIG);
1140
1141 resid = uio->uio_resid;
1142 osize = dp->di_extsize;
1143 flags = IO_EXT;
1144 if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
1145 flags |= IO_SYNC;
1146
1147 for (error = 0; uio->uio_resid > 0;) {
1148 lbn = lblkno(fs, uio->uio_offset);
1149 blkoffset = blkoff(fs, uio->uio_offset);
1150 xfersize = fs->fs_bsize - blkoffset;
1151 if (uio->uio_resid < xfersize)
1152 xfersize = uio->uio_resid;
1153
1154 /*
1155 * We must perform a read-before-write if the transfer size
1156 * does not cover the entire buffer.
1157 */
1158 if (fs->fs_bsize > xfersize)
1159 flags |= BA_CLRBUF;
1160 else
1161 flags &= ~BA_CLRBUF;
1162 error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
1163 ucred, flags, &bp);
1164 if (error != 0)
1165 break;
1166 /*
1167 * If the buffer is not valid we have to clear out any
1168 * garbage data from the pages instantiated for the buffer.
1169 * If we do not, a failed uiomove() during a write can leave
1170 * the prior contents of the pages exposed to a userland
1171 * mmap(). XXX deal with uiomove() errors a better way.
1172 */
1173 if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
1174 vfs_bio_clrbuf(bp);
1175 if (ioflag & IO_DIRECT)
1176 bp->b_flags |= B_DIRECT;
1177
1178 if (uio->uio_offset + xfersize > dp->di_extsize)
1179 dp->di_extsize = uio->uio_offset + xfersize;
1180
1181 size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
1182 if (size < xfersize)
1183 xfersize = size;
1184
1185 error =
1186 uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
1187 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
1188 (LIST_FIRST(&bp->b_dep) == NULL)) {
1189 bp->b_flags |= B_RELBUF;
1190 }
1191
1192 /*
1193 * If IO_SYNC each buffer is written synchronously. Otherwise
1194 * if we have a severe page deficiency write the buffer
1195 * asynchronously. Otherwise try to cluster, and if that
1196 * doesn't do it then either do an async write (if O_DIRECT),
1197 * or a delayed write (if not).
1198 */
1199 if (ioflag & IO_SYNC) {
1200 (void)bwrite(bp);
1201 } else if (vm_page_count_severe() ||
1202 buf_dirty_count_severe() ||
1203 xfersize + blkoffset == fs->fs_bsize ||
1204 (ioflag & (IO_ASYNC | IO_DIRECT)))
1205 bawrite(bp);
1206 else
1207 bdwrite(bp);
1208 if (error || xfersize == 0)
1209 break;
1210 ip->i_flag |= IN_CHANGE | IN_UPDATE;
1211 }
1212 /*
1213 * If we successfully wrote any data, and we are not the superuser
1214 * we clear the setuid and setgid bits as a precaution against
1215 * tampering.
1216 */
1217 if (resid > uio->uio_resid && ucred &&
1218 suser_cred(ucred, PRISON_ROOT)) {
1219 ip->i_mode &= ~(ISUID | ISGID);
1220 dp->di_mode = ip->i_mode;
1221 }
1222 if (error) {
1223 if (ioflag & IO_UNIT) {
1224 (void)UFS_TRUNCATE(vp, osize,
1225 IO_EXT | (ioflag&IO_SYNC), ucred, uio->uio_td);
1226 uio->uio_offset -= resid - uio->uio_resid;
1227 uio->uio_resid = resid;
1228 }
1229 } else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
1230 error = UFS_UPDATE(vp, 1);
1231 return (error);
1232}
1233
1234
1235/*
1236 * Helper to locate a named extended attribute.
1237 *
1238 * Locate a particular EA (nspace:name) in the area (ptr:length), and return
1239 * the length of the EA, and possibly the pointer to the entry and to the data.
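 *
 * Each record in the area has the layout built by ffs_setextattr() below:
 *	a uint32_t record length (the whole record, padded to 8 bytes),
 *	one byte of attribute namespace, one byte giving the length of the
 *	padding that follows the value (eapad2), one byte of name length,
 *	the name itself padded to an 8-byte boundary (eapad1), and finally
 *	the value followed by eapad2 bytes of zero padding.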
1240 */
1241static int
1242ffs_findextattr(u_char *ptr, uint length, int nspace, const char *name, u_char **eap, u_char **eac)
1242ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name, u_char **eap, u_char **eac)
1243{
1244 u_char *p, *pe, *pn, *p0;
1245 int eapad1, eapad2, ealength, ealen, nlen;
1246 uint32_t ul;
1247
1248 pe = ptr + length;
1249 nlen = strlen(name);
1250
1251 for (p = ptr; p < pe; p = pn) {
1252 p0 = p;
1253 bcopy(p, &ul, sizeof(ul));
1254 pn = p + ul;
1255 /* make sure this entry is complete */
1256 if (pn > pe)
1257 break;
1258 p += sizeof(uint32_t);
1259 if (*p != nspace)
1260 continue;
1261 p++;
1262 eapad2 = *p++;
1263 if (*p != nlen)
1264 continue;
1265 p++;
1266 if (bcmp(p, name, nlen))
1267 continue;
1268 ealength = sizeof(uint32_t) + 3 + nlen;
1269 eapad1 = 8 - (ealength % 8);
1270 if (eapad1 == 8)
1271 eapad1 = 0;
1272 ealength += eapad1;
1273 ealen = ul - ealength - eapad2;
1274 p += nlen + eapad1;
1275 if (eap != NULL)
1276 *eap = p0;
1277 if (eac != NULL)
1278 *eac = p;
1279 return (ealen);
1280 }
1281 return(-1);
1282}
1283
1284static int
1285ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra)
1286{
1287 struct inode *ip;
1288 struct ufs2_dinode *dp;
1289 struct uio luio;
1290 struct iovec liovec;
1291 int easize, error;
1292 u_char *eae;
1293
1294 ip = VTOI(vp);
1295 dp = ip->i_din2;
1296 easize = dp->di_extsize;
1297
1298 eae = malloc(easize + extra, M_TEMP, M_WAITOK);
1299
1300 liovec.iov_base = eae;
1301 liovec.iov_len = easize;
1302 luio.uio_iov = &liovec;
1303 luio.uio_iovcnt = 1;
1304 luio.uio_offset = 0;
1305 luio.uio_resid = easize;
1306 luio.uio_segflg = UIO_SYSSPACE;
1307 luio.uio_rw = UIO_READ;
1308 luio.uio_td = td;
1309
1310 error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
1311 if (error) {
1312 free(eae, M_TEMP);
1313 return(error);
1314 }
1315 *p = eae;
1316 return (0);
1317}
1318
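/*
 * Start an extended attribute transaction: read the whole EA area into a
 * malloc'ed buffer hung off the inode (i_ea_area).  The get/set/delete
 * operations below work on this in-core copy until ffs_close_ea() commits
 * it back to disk or discards it.
 */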
1319static int
1320ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
1321{
1322 struct inode *ip;
1323 struct ufs2_dinode *dp;
1324 int error;
1325
1326 ip = VTOI(vp);
1327
1328 if (ip->i_ea_area != NULL)
1329 return (EBUSY);
1330 dp = ip->i_din2;
1331 error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0);
1332 if (error)
1333 return (error);
1334 ip->i_ea_len = dp->di_extsize;
1335 ip->i_ea_error = 0;
1336 return (0);
1337}
1338
1339/*
1340 * Vnode extattr transaction commit/abort
1341 */
1342static int
1343ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td)
1344{
1345 struct inode *ip;
1346 struct uio luio;
1347 struct iovec liovec;
1348 int error;
1349 struct ufs2_dinode *dp;
1350
1351 ip = VTOI(vp);
1352 if (ip->i_ea_area == NULL)
1353 return (EINVAL);
1354 dp = ip->i_din2;
1355 error = ip->i_ea_error;
1356 if (commit && error == 0) {
1357 if (cred == NOCRED)
1358 cred = vp->v_mount->mnt_cred;
1359 liovec.iov_base = ip->i_ea_area;
1360 liovec.iov_len = ip->i_ea_len;
1361 luio.uio_iov = &liovec;
1362 luio.uio_iovcnt = 1;
1363 luio.uio_offset = 0;
1364 luio.uio_resid = ip->i_ea_len;
1365 luio.uio_segflg = UIO_SYSSPACE;
1366 luio.uio_rw = UIO_WRITE;
1367 luio.uio_td = td;
1368 /* XXX: I'm not happy about truncating to zero size */
1369 if (ip->i_ea_len < dp->di_extsize)
1370 error = ffs_truncate(vp, 0, IO_EXT, cred, td);
1371 error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
1372 }
1373 free(ip->i_ea_area, M_TEMP);
1374 ip->i_ea_area = NULL;
1375 ip->i_ea_len = 0;
1376 ip->i_ea_error = 0;
1377 return (error);
1378}
1379
1380/*
1381 * Vnode extattr strategy routine for special devices and fifos.
1382 *
1383 * We need to check for a read or write of the external attributes.
1384 * Otherwise we just fall through and do the usual thing.
1385 */
1386static int
1387ffsext_strategy(struct vop_strategy_args *ap)
1388/*
1389struct vop_strategy_args {
1390 struct vnodeop_desc *a_desc;
1391 struct vnode *a_vp;
1392 struct buf *a_bp;
1393};
1394*/
1395{
1396 struct vnode *vp;
1397 daddr_t lbn;
1398
1399 KASSERT(ap->a_vp == ap->a_bp->b_vp, ("%s(%p != %p)",
1400 __func__, ap->a_vp, ap->a_bp->b_vp));
1401 vp = ap->a_vp;
1402 lbn = ap->a_bp->b_lblkno;
1403 if (VTOI(vp)->i_fs->fs_magic == FS_UFS2_MAGIC &&
1404 lbn < 0 && lbn >= -NXADDR)
1405 return (ufs_vnoperate((struct vop_generic_args *)ap));
1406 if (vp->v_type == VFIFO)
1407 return (ufs_vnoperatefifo((struct vop_generic_args *)ap));
1408 return (ufs_vnoperatespec((struct vop_generic_args *)ap));
1409}
1410
1411/*
1412 * Vnode extattr transaction commit/abort
1413 */
1414static int
1415ffs_openextattr(struct vop_openextattr_args *ap)
1416/*
1417struct vop_openextattr_args {
1418 struct vnodeop_desc *a_desc;
1419 struct vnode *a_vp;
1420 IN struct ucred *a_cred;
1421 IN struct thread *a_td;
1422};
1423*/
1424{
1425 struct inode *ip;
1426 struct fs *fs;
1427
1428 ip = VTOI(ap->a_vp);
1429 fs = ip->i_fs;
1430 if (fs->fs_magic == FS_UFS1_MAGIC)
1431 return (ufs_vnoperate((struct vop_generic_args *)ap));
1432
1433 if (ap->a_vp->v_type == VCHR)
1434 return (EOPNOTSUPP);
1435
1436 return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
1437}
1438
1439
1440/*
1441 * Vnode extattr transaction commit/abort
1442 */
1443static int
1444ffs_closeextattr(struct vop_closeextattr_args *ap)
1445/*
1446struct vop_closeextattr_args {
1447 struct vnodeop_desc *a_desc;
1448 struct vnode *a_vp;
1449 int a_commit;
1450 IN struct ucred *a_cred;
1451 IN struct thread *a_td;
1452};
1453*/
1454{
1455 struct inode *ip;
1456 struct fs *fs;
1457
1458 ip = VTOI(ap->a_vp);
1459 fs = ip->i_fs;
1460 if (fs->fs_magic == FS_UFS1_MAGIC)
1461 return (ufs_vnoperate((struct vop_generic_args *)ap));
1462
1463 if (ap->a_vp->v_type == VCHR)
1464 return (EOPNOTSUPP);
1465
1466 return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
1467}
1468
1469/*
1470 * Vnode operation to remove a named attribute.
1471 */
1472static int
1473ffs_deleteextattr(struct vop_deleteextattr_args *ap)
1474/*
1475vop_deleteextattr {
1476 IN struct vnode *a_vp;
1477 IN int a_attrnamespace;
1478 IN const char *a_name;
1479 IN struct ucred *a_cred;
1480 IN struct thread *a_td;
1481};
1482*/
1483{
1484 struct inode *ip;
1485 struct fs *fs;
1486 uint32_t ealength, ul;
1487 int ealen, olen, eapad1, eapad2, error, i, easize;
1488 u_char *eae, *p;
1489 int stand_alone;
1490
1491 ip = VTOI(ap->a_vp);
1492 fs = ip->i_fs;
1493
1494 if (fs->fs_magic == FS_UFS1_MAGIC)
1495 return (ufs_vnoperate((struct vop_generic_args *)ap));
1496
1497 if (ap->a_vp->v_type == VCHR)
1498 return (EOPNOTSUPP);
1499
1500 if (strlen(ap->a_name) == 0)
1501 return (EINVAL);
1502
1503 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1504 ap->a_cred, ap->a_td, IWRITE);
1505 if (error) {
1506 if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1507 ip->i_ea_error = error;
1508 return (error);
1509 }
1510
1511 if (ip->i_ea_area == NULL) {
1512 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1513 if (error)
1514 return (error);
1515 stand_alone = 1;
1516 } else {
1517 stand_alone = 0;
1518 }
1519
1520 ealength = eapad1 = ealen = eapad2 = 0;
1521
1522 eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK);
1523 bcopy(ip->i_ea_area, eae, ip->i_ea_len);
1524 easize = ip->i_ea_len;
1525
1526 olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
1527 &p, NULL);
1528 if (olen == -1) {
1529 /* delete but nonexistent */
1530 free(eae, M_TEMP);
1531 if (stand_alone)
1532 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1533 return(ENOATTR);
1534 }
1535 bcopy(p, &ul, sizeof ul);
1536 i = p - eae + ul;
1537 if (ul != ealength) {
1538 bcopy(p + ul, p + ealength, easize - i);
1539 easize += (ealength - ul);
1540 }
1541 if (easize > NXADDR * fs->fs_bsize) {
1542 free(eae, M_TEMP);
1543 if (stand_alone)
1544 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1545 else if (ip->i_ea_error == 0)
1546 ip->i_ea_error = ENOSPC;
1547 return(ENOSPC);
1548 }
1549 p = ip->i_ea_area;
1550 ip->i_ea_area = eae;
1551 ip->i_ea_len = easize;
1552 free(p, M_TEMP);
1553 if (stand_alone)
1554 error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
1555 return(error);
1556}
1557
1558/*
1559 * Vnode operation to retrieve a named extended attribute.
1560 */
1561static int
1562ffs_getextattr(struct vop_getextattr_args *ap)
1563/*
1564vop_getextattr {
1565 IN struct vnode *a_vp;
1566 IN int a_attrnamespace;
1567 IN const char *a_name;
1568 INOUT struct uio *a_uio;
1569 OUT size_t *a_size;
1570 IN struct ucred *a_cred;
1571 IN struct thread *a_td;
1572};
1573*/
1574{
1575 struct inode *ip;
1576 struct fs *fs;
1577 u_char *eae, *p;
1578 unsigned easize;
1579 int error, ealen, stand_alone;
1580
1581 ip = VTOI(ap->a_vp);
1582 fs = ip->i_fs;
1583
1584 if (fs->fs_magic == FS_UFS1_MAGIC)
1585 return (ufs_vnoperate((struct vop_generic_args *)ap));
1586
1587 if (ap->a_vp->v_type == VCHR)
1588 return (EOPNOTSUPP);
1589
1590 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1591 ap->a_cred, ap->a_td, IREAD);
1592 if (error)
1593 return (error);
1594
1595 if (ip->i_ea_area == NULL) {
1596 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1597 if (error)
1598 return (error);
1599 stand_alone = 1;
1600 } else {
1601 stand_alone = 0;
1602 }
1603 eae = ip->i_ea_area;
1604 easize = ip->i_ea_len;
1605
1606 ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
1607 NULL, &p);
1608 if (ealen >= 0) {
1609 error = 0;
1610 if (ap->a_size != NULL)
1611 *ap->a_size = ealen;
1612 else if (ap->a_uio != NULL)
1613 error = uiomove(p, ealen, ap->a_uio);
1614 } else
1615 error = ENOATTR;
1616 if (stand_alone)
1617 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1618 return(error);
1619}
1620
1621/*
1622 * Vnode operation to retrieve extended attributes on a vnode.
1623 */
1624static int
1625ffs_listextattr(struct vop_listextattr_args *ap)
1626/*
1627vop_listextattr {
1628 IN struct vnode *a_vp;
1629 IN int a_attrnamespace;
1630 INOUT struct uio *a_uio;
1631 OUT size_t *a_size;
1632 IN struct ucred *a_cred;
1633 IN struct thread *a_td;
1634};
1635*/
1636{
1637 struct inode *ip;
1638 struct fs *fs;
1639 u_char *eae, *p, *pe, *pn;
1640 unsigned easize;
1641 uint32_t ul;
1642 int error, ealen, stand_alone;
1643
1644 ip = VTOI(ap->a_vp);
1645 fs = ip->i_fs;
1646
1647 if (fs->fs_magic == FS_UFS1_MAGIC)
1648 return (ufs_vnoperate((struct vop_generic_args *)ap));
1649
1650 if (ap->a_vp->v_type == VCHR)
1651 return (EOPNOTSUPP);
1652
1653 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1654 ap->a_cred, ap->a_td, IREAD);
1655 if (error)
1656 return (error);
1657
1658 if (ip->i_ea_area == NULL) {
1659 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1660 if (error)
1661 return (error);
1662 stand_alone = 1;
1663 } else {
1664 stand_alone = 0;
1665 }
1666 eae = ip->i_ea_area;
1667 easize = ip->i_ea_len;
1668
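	/*
	 * Walk every record in the area; for each attribute in the
	 * requested namespace, report a one-byte name length followed by
	 * the name itself (or just add to the total size).
	 */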
1669 error = 0;
1670 if (ap->a_size != NULL)
1671 *ap->a_size = 0;
1672 pe = eae + easize;
1673 for(p = eae; error == 0 && p < pe; p = pn) {
1674 bcopy(p, &ul, sizeof(ul));
1675 pn = p + ul;
1676 if (pn > pe)
1677 break;
1678 p += sizeof(ul);
1679 if (*p++ != ap->a_attrnamespace)
1680 continue;
1681 p++; /* pad2 */
1682 ealen = *p;
1683 if (ap->a_size != NULL) {
1684 *ap->a_size += ealen + 1;
1685 } else if (ap->a_uio != NULL) {
1686 error = uiomove(p, ealen + 1, ap->a_uio);
1687 }
1688 }
1689 if (stand_alone)
1690 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1691 return(error);
1692}
1693
1694/*
1695 * Vnode operation to set a named attribute.
1696 */
1697static int
1698ffs_setextattr(struct vop_setextattr_args *ap)
1699/*
1700vop_setextattr {
1701 IN struct vnode *a_vp;
1702 IN int a_attrnamespace;
1703 IN const char *a_name;
1704 INOUT struct uio *a_uio;
1705 IN struct ucred *a_cred;
1706 IN struct thread *a_td;
1707};
1708*/
1709{
1710 struct inode *ip;
1711 struct fs *fs;
1712 uint32_t ealength, ul;
1713 int ealen, olen, eapad1, eapad2, error, i, easize;
1714 u_char *eae, *p;
1715 int stand_alone;
1716
1717 ip = VTOI(ap->a_vp);
1718 fs = ip->i_fs;
1719
1720 if (fs->fs_magic == FS_UFS1_MAGIC)
1721 return (ufs_vnoperate((struct vop_generic_args *)ap));
1722
1723 if (ap->a_vp->v_type == VCHR)
1724 return (EOPNOTSUPP);
1725
1726 if (strlen(ap->a_name) == 0)
1727 return (EINVAL);
1728
1729 /* XXX Now unsupported API to delete EAs using NULL uio. */
1730 if (ap->a_uio == NULL)
1731 return (EOPNOTSUPP);
1732
1733 error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1734 ap->a_cred, ap->a_td, IWRITE);
1735 if (error) {
1736 if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1737 ip->i_ea_error = error;
1738 return (error);
1739 }
1740
1741 if (ip->i_ea_area == NULL) {
1742 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1743 if (error)
1744 return (error);
1745 stand_alone = 1;
1746 } else {
1747 stand_alone = 0;
1748 }
1749
1750 ealen = ap->a_uio->uio_resid;
1751 ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
1752 eapad1 = 8 - (ealength % 8);
1753 if (eapad1 == 8)
1754 eapad1 = 0;
1755 eapad2 = 8 - (ealen % 8);
1756 if (eapad2 == 8)
1757 eapad2 = 0;
1758 ealength += eapad1 + ealen + eapad2;
1759
1760 eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
1761 bcopy(ip->i_ea_area, eae, ip->i_ea_len);
1762 easize = ip->i_ea_len;
1763
1764 olen = ffs_findextattr(eae, easize,
1765 ap->a_attrnamespace, ap->a_name, &p, NULL);
1766 if (olen == -1) {
1767 /* new, append at end */
1768 p = eae + easize;
1769 easize += ealength;
1770 } else {
1771 bcopy(p, &ul, sizeof ul);
1772 i = p - eae + ul;
1773 if (ul != ealength) {
1774 bcopy(p + ul, p + ealength, easize - i);
1775 easize += (ealength - ul);
1776 }
1777 }
1778 if (easize > NXADDR * fs->fs_bsize) {
1779 free(eae, M_TEMP);
1780 if (stand_alone)
1781 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1782 else if (ip->i_ea_error == 0)
1783 ip->i_ea_error = ENOSPC;
1784 return(ENOSPC);
1785 }
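	/* Build the new record in place: length, namespace, pads, name, value. */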
1786 bcopy(&ealength, p, sizeof(ealength));
1787 p += sizeof(ealength);
1788 *p++ = ap->a_attrnamespace;
1789 *p++ = eapad2;
1790 *p++ = strlen(ap->a_name);
1791 strcpy(p, ap->a_name);
1792 p += strlen(ap->a_name);
1793 bzero(p, eapad1);
1794 p += eapad1;
1795 error = uiomove(p, ealen, ap->a_uio);
1796 if (error) {
1797 free(eae, M_TEMP);
1798 if (stand_alone)
1799 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1800 else if (ip->i_ea_error == 0)
1801 ip->i_ea_error = error;
1802 return(error);
1803 }
1804 p += ealen;
1805 bzero(p, eapad2);
1806
1807 p = ip->i_ea_area;
1808 ip->i_ea_area = eae;
1809 ip->i_ea_len = easize;
1810 free(p, M_TEMP);
1811 if (stand_alone)
1812 error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
1813 return(error);
1814}