1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* Portions Copyright 2007 Shivakumar GN */ 22/* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27#pragma ident "%Z%%M% %I% %E% SMI" 28 29#include <sys/types.h> 30#include <sys/cmn_err.h> 31#include <sys/debug.h> 32#include <sys/dirent.h> 33#include <sys/kmem.h> 34#include <sys/mman.h> 35#include <sys/mutex.h> 36#include <sys/sysmacros.h> 37#include <sys/systm.h> 38#include <sys/sunddi.h> 39#include <sys/uio.h> 40#include <sys/vfs.h> 41#include <sys/vnode.h> 42#include <sys/cred.h> 43 44#include <sys/gfs.h> 45 46/* 47 * Generic pseudo-filesystem routines. 48 * 49 * There are significant similarities between the implementation of certain file 50 * system entry points across different filesystems. While one could attempt to 51 * "choke up on the bat" and incorporate common functionality into a VOP 52 * preamble or postamble, such an approach is limited in the benefit it can 53 * provide. In this file we instead define a toolkit of routines which can be 54 * called from a filesystem (with in-kernel pseudo-filesystems being the focus 55 * of the exercise) in a more component-like fashion. 56 * 57 * There are three basic classes of routines: 58 * 59 * 1) Lowlevel support routines 60 * 61 * These routines are designed to play a support role for existing 62 * pseudo-filesystems (such as procfs). They simplify common tasks, 63 * without forcing the filesystem to hand over management to GFS. The 64 * routines covered are: 65 * 66 * gfs_readdir_init() 67 * gfs_readdir_emit() 68 * gfs_readdir_emitn() 69 * gfs_readdir_pred() 70 * gfs_readdir_fini() 71 * gfs_lookup_dot() 72 * 73 * 2) Complete GFS management 74 * 75 * These routines take a more active role in management of the 76 * pseudo-filesystem. They handle the relationship between vnode private 77 * data and VFS data, as well as the relationship between vnodes in the 78 * directory hierarchy. 79 * 80 * In order to use these interfaces, the first member of every private 81 * v_data must be a gfs_file_t or a gfs_dir_t. This hands over all control 82 * to GFS. 83 * 84 * gfs_file_create() 85 * gfs_dir_create() 86 * gfs_root_create() 87 * 88 * gfs_file_inactive() 89 * gfs_dir_inactive() 90 * gfs_dir_lookup() 91 * gfs_dir_readdir() 92 * 93 * gfs_vop_reclaim() 94 * gfs_vop_lookup() 95 * gfs_vop_readdir() 96 * gfs_vop_map() 97 * 98 * 3) Single File pseudo-filesystems 99 * 100 * This routine creates a rooted file to be overlayed ontop of another 101 * file in the physical filespace. 102 * 103 * Note that the parent is NULL (actually the vfs), but there is nothing 104 * technically keeping such a file from utilizing the "Complete GFS 105 * management" set of routines. 106 * 107 * gfs_root_create_file() 108 */ 109 110#ifdef sun 111/* 112 * gfs_make_opsvec: take an array of vnode type definitions and create 113 * their vnodeops_t structures 114 * 115 * This routine takes an array of gfs_opsvec_t's. It could 116 * alternatively take an array of gfs_opsvec_t*'s, which would allow 117 * vnode types to be completely defined in files external to the caller 118 * of gfs_make_opsvec(). As it stands, much more sharing takes place -- 119 * both the caller and the vnode type provider need to access gfsv_ops 120 * and gfsv_template, and the caller also needs to know gfsv_name. 121 */ 122int 123gfs_make_opsvec(gfs_opsvec_t *vec) 124{ 125 int error, i; 126 127 for (i = 0; ; i++) { 128 if (vec[i].gfsv_name == NULL) 129 return (0); 130 error = vn_make_ops(vec[i].gfsv_name, vec[i].gfsv_template, 131 vec[i].gfsv_ops); 132 if (error) 133 break; 134 } 135 136 cmn_err(CE_WARN, "gfs_make_opsvec: bad vnode ops template for '%s'", 137 vec[i].gfsv_name); 138 for (i--; i >= 0; i--) { 139 vn_freevnodeops(*vec[i].gfsv_ops); 140 *vec[i].gfsv_ops = NULL; 141 } 142 return (error); 143} 144#endif /* sun */ 145 146/* 147 * Low level directory routines 148 * 149 * These routines provide some simple abstractions for reading directories. 150 * They are designed to be used by existing pseudo filesystems (namely procfs) 151 * that already have a complicated management infrastructure. 152 */ 153 154/* 155 * gfs_get_parent_ino: used to obtain a parent inode number and the 156 * inode number of the given vnode in preparation for calling gfs_readdir_init. 157 */ 158int 159gfs_get_parent_ino(vnode_t *dvp, cred_t *cr, caller_context_t *ct, 160 ino64_t *pino, ino64_t *ino) 161{ 162 vnode_t *parent; 163 gfs_dir_t *dp = dvp->v_data; 164 int error; 165 166 *ino = dp->gfsd_file.gfs_ino; 167 parent = dp->gfsd_file.gfs_parent; 168 169 if (parent == NULL) { 170 *pino = *ino; /* root of filesystem */ 171 } else if (dvp->v_flag & V_XATTRDIR) { 172#ifdef TODO 173 vattr_t va; 174 175 va.va_mask = AT_NODEID; 176 error = VOP_GETATTR(parent, &va, 0, cr, ct); 177 if (error) 178 return (error); 179 *pino = va.va_nodeid; 180#else 181 panic("%s:%u: not implemented", __func__, __LINE__); 182#endif 183 } else { 184 *pino = ((gfs_file_t *)(parent->v_data))->gfs_ino; 185 } 186 187 return (0); 188} 189 190/* 191 * gfs_readdir_init: initiate a generic readdir 192 * st - a pointer to an uninitialized gfs_readdir_state_t structure 193 * name_max - the directory's maximum file name length 194 * ureclen - the exported file-space record length (1 for non-legacy FSs) 195 * uiop - the uiop passed to readdir 196 * parent - the parent directory's inode 197 * self - this directory's inode 198 * flags - flags from VOP_READDIR 199 * 200 * Returns 0 or a non-zero errno. 201 * 202 * Typical VOP_READDIR usage of gfs_readdir_*: 203 * 204 * if ((error = gfs_readdir_init(...)) != 0) 205 * return (error); 206 * eof = 0; 207 * while ((error = gfs_readdir_pred(..., &voffset)) != 0) { 208 * if (!consumer_entry_at(voffset)) 209 * voffset = consumer_next_entry(voffset); 210 * if (consumer_eof(voffset)) { 211 * eof = 1 212 * break; 213 * } 214 * if ((error = gfs_readdir_emit(..., voffset, 215 * consumer_ino(voffset), consumer_name(voffset))) != 0) 216 * break; 217 * } 218 * return (gfs_readdir_fini(..., error, eofp, eof)); 219 * 220 * As you can see, a zero result from gfs_readdir_pred() or 221 * gfs_readdir_emit() indicates that processing should continue, 222 * whereas a non-zero result indicates that the loop should terminate. 223 * Most consumers need do nothing more than let gfs_readdir_fini() 224 * determine what the cause of failure was and return the appropriate 225 * value. 226 */ 227int 228gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen, 229 uio_t *uiop, ino64_t parent, ino64_t self, int flags) 230{ 231 size_t dirent_size; 232 233 if (uiop->uio_loffset < 0 || uiop->uio_resid <= 0 || 234 (uiop->uio_loffset % ureclen) != 0) 235 return (EINVAL); 236 237 st->grd_ureclen = ureclen; 238 st->grd_oresid = uiop->uio_resid; 239 st->grd_namlen = name_max; 240 if (flags & V_RDDIR_ENTFLAGS) 241 dirent_size = EDIRENT_RECLEN(st->grd_namlen); 242 else 243 dirent_size = DIRENT64_RECLEN(st->grd_namlen); 244 st->grd_dirent = kmem_zalloc(dirent_size, KM_SLEEP); 245 st->grd_parent = parent; 246 st->grd_self = self; 247 st->grd_flags = flags; 248 249 return (0); 250} 251 252/* 253 * gfs_readdir_emit_int: internal routine to emit directory entry 254 * 255 * st - the current readdir state, which must have d_ino/ed_ino 256 * and d_name/ed_name set 257 * uiop - caller-supplied uio pointer 258 * next - the offset of the next entry 259 */ 260static int 261gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next, 262 int *ncookies, u_long **cookies) 263{ 264 int reclen, namlen; 265 dirent64_t *dp; 266 edirent_t *edp; 267 268 if (st->grd_flags & V_RDDIR_ENTFLAGS) { 269 edp = st->grd_dirent; 270 namlen = strlen(edp->ed_name); 271 reclen = EDIRENT_RECLEN(namlen); 272 } else { 273 dp = st->grd_dirent; 274 namlen = strlen(dp->d_name); 275 reclen = DIRENT64_RECLEN(namlen); 276 } 277 278 if (reclen > uiop->uio_resid) { 279 /* 280 * Error if no entries were returned yet 281 */ 282 if (uiop->uio_resid == st->grd_oresid) 283 return (EINVAL); 284 return (-1); 285 } 286 287 if (st->grd_flags & V_RDDIR_ENTFLAGS) { 288 edp->ed_off = next; 289 edp->ed_reclen = (ushort_t)reclen; 290 } else { 291 /* XXX: This can change in the future. */ 292 dp->d_reclen = (ushort_t)reclen; 293 dp->d_type = DT_DIR; 294 dp->d_namlen = namlen; 295 } 296 297 if (uiomove((caddr_t)st->grd_dirent, reclen, UIO_READ, uiop)) 298 return (EFAULT); 299 300 uiop->uio_loffset = next; 301 if (*cookies != NULL) { 302 **cookies = next; 303 (*cookies)++; 304 (*ncookies)--; 305 KASSERT(*ncookies >= 0, ("ncookies=%d", *ncookies)); 306 } 307 308 return (0); 309} 310 311/* 312 * gfs_readdir_emit: emit a directory entry 313 * voff - the virtual offset (obtained from gfs_readdir_pred) 314 * ino - the entry's inode 315 * name - the entry's name 316 * eflags - value for ed_eflags (if processing edirent_t) 317 * 318 * Returns a 0 on success, a non-zero errno on failure, or -1 if the 319 * readdir loop should terminate. A non-zero result (either errno or 320 * -1) from this function is typically passed directly to 321 * gfs_readdir_fini(). 322 */ 323int 324gfs_readdir_emit(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff, 325 ino64_t ino, const char *name, int eflags, int *ncookies, u_long **cookies) 326{ 327 offset_t off = (voff + 2) * st->grd_ureclen; 328 329 if (st->grd_flags & V_RDDIR_ENTFLAGS) { 330 edirent_t *edp = st->grd_dirent; 331 332 edp->ed_ino = ino; 333 (void) strncpy(edp->ed_name, name, st->grd_namlen); 334 edp->ed_eflags = eflags; 335 } else { 336 dirent64_t *dp = st->grd_dirent; 337 338 dp->d_ino = ino; 339 (void) strncpy(dp->d_name, name, st->grd_namlen); 340 } 341 342 /* 343 * Inter-entry offsets are invalid, so we assume a record size of 344 * grd_ureclen and explicitly set the offset appropriately. 345 */ 346 return (gfs_readdir_emit_int(st, uiop, off + st->grd_ureclen, ncookies, 347 cookies)); 348} 349 350#ifdef sun 351/* 352 * gfs_readdir_emitn: like gfs_readdir_emit(), but takes an integer 353 * instead of a string for the entry's name. 354 */ 355int 356gfs_readdir_emitn(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff, 357 ino64_t ino, unsigned long num) 358{ 359 char buf[40]; 360 361 numtos(num, buf); 362 return (gfs_readdir_emit(st, uiop, voff, ino, buf, 0)); 363} 364#endif 365 366/* 367 * gfs_readdir_pred: readdir loop predicate 368 * voffp - a pointer in which the next virtual offset should be stored 369 * 370 * Returns a 0 on success, a non-zero errno on failure, or -1 if the 371 * readdir loop should terminate. A non-zero result (either errno or 372 * -1) from this function is typically passed directly to 373 * gfs_readdir_fini(). 374 */ 375int 376gfs_readdir_pred(gfs_readdir_state_t *st, uio_t *uiop, offset_t *voffp, 377 int *ncookies, u_long **cookies) 378{ 379 offset_t off, voff; 380 int error; 381 382top: 383 if (uiop->uio_resid <= 0) 384 return (-1); 385 386 off = uiop->uio_loffset / st->grd_ureclen; 387 voff = off - 2; 388 if (off == 0) { 389 if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_self, 390 ".", 0, ncookies, cookies)) == 0) 391 goto top; 392 } else if (off == 1) { 393 if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_parent, 394 "..", 0, ncookies, cookies)) == 0) 395 goto top; 396 } else { 397 *voffp = voff; 398 return (0); 399 } 400 401 return (error); 402} 403 404/* 405 * gfs_readdir_fini: generic readdir cleanup 406 * error - if positive, an error to return 407 * eofp - the eofp passed to readdir 408 * eof - the eof value 409 * 410 * Returns a 0 on success, a non-zero errno on failure. This result 411 * should be returned from readdir. 412 */ 413int 414gfs_readdir_fini(gfs_readdir_state_t *st, int error, int *eofp, int eof) 415{ 416 size_t dirent_size; 417 418 if (st->grd_flags & V_RDDIR_ENTFLAGS) 419 dirent_size = EDIRENT_RECLEN(st->grd_namlen); 420 else 421 dirent_size = DIRENT64_RECLEN(st->grd_namlen); 422 kmem_free(st->grd_dirent, dirent_size); 423 if (error > 0) 424 return (error); 425 if (eofp) 426 *eofp = eof; 427 return (0); 428} 429 430/* 431 * gfs_lookup_dot 432 * 433 * Performs a basic check for "." and ".." directory entries. 434 */ 435int 436gfs_lookup_dot(vnode_t **vpp, vnode_t *dvp, vnode_t *pvp, const char *nm) 437{ 438 int ltype; 439 440 if (*nm == '\0' || strcmp(nm, ".") == 0) { 441 VN_HOLD(dvp); 442 *vpp = dvp; 443 return (0); 444 } else if (strcmp(nm, "..") == 0) { 445 if (pvp == NULL) { 446 ASSERT(dvp->v_flag & VROOT); 447 VN_HOLD(dvp); 448 *vpp = dvp; 449 ASSERT_VOP_ELOCKED(dvp, "gfs_lookup_dot: non-locked dvp"); 450 } else { 451 ltype = VOP_ISLOCKED(dvp); 452 VOP_UNLOCK(dvp, 0); 453 VN_HOLD(pvp); 454 *vpp = pvp; 455 vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); 456 vn_lock(dvp, ltype | LK_RETRY); 457 } 458 return (0); 459 } 460 461 return (-1); 462} 463 464/* 465 * gfs_file_create(): create a new GFS file 466 * 467 * size - size of private data structure (v_data) 468 * pvp - parent vnode (GFS directory) 469 * ops - vnode operations vector 470 * 471 * In order to use this interface, the parent vnode must have been created by 472 * gfs_dir_create(), and the private data stored in v_data must have a 473 * 'gfs_file_t' as its first field. 474 * 475 * Given these constraints, this routine will automatically: 476 * 477 * - Allocate v_data for the vnode 478 * - Initialize necessary fields in the vnode 479 * - Hold the parent 480 */ 481vnode_t * 482gfs_file_create(size_t size, vnode_t *pvp, vfs_t *vfsp, vnodeops_t *ops) 483{ 484 gfs_file_t *fp; 485 vnode_t *vp; 486 int error; 487 488 /* 489 * Allocate vnode and internal data structure 490 */ 491 fp = kmem_zalloc(size, KM_SLEEP); 492 error = getnewvnode("zfs", vfsp, ops, &vp); 493 ASSERT(error == 0); 494 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 495 vp->v_data = (caddr_t)fp; 496 497 /* 498 * Set up various pointers 499 */ 500 fp->gfs_vnode = vp; 501 fp->gfs_parent = pvp; 502 fp->gfs_size = size; 503 fp->gfs_type = GFS_FILE; 504 505 vp->v_vflag |= VV_FORCEINSMQ; 506 error = insmntque(vp, vfsp); 507 vp->v_vflag &= ~VV_FORCEINSMQ; 508 KASSERT(error == 0, ("insmntque() failed: error %d", error)); 509 510 /* 511 * Initialize vnode and hold parent. 512 */ 513 if (pvp) 514 VN_HOLD(pvp); 515 516 return (vp); 517} 518 519/* 520 * gfs_dir_create: creates a new directory in the parent 521 * 522 * size - size of private data structure (v_data) 523 * pvp - parent vnode (GFS directory) 524 * ops - vnode operations vector 525 * entries - NULL-terminated list of static entries (if any) 526 * maxlen - maximum length of a directory entry 527 * readdir_cb - readdir callback (see gfs_dir_readdir) 528 * inode_cb - inode callback (see gfs_dir_readdir) 529 * lookup_cb - lookup callback (see gfs_dir_lookup) 530 * 531 * In order to use this function, the first member of the private vnode 532 * structure (v_data) must be a gfs_dir_t. For each directory, there are 533 * static entries, defined when the structure is initialized, and dynamic 534 * entries, retrieved through callbacks. 535 * 536 * If a directory has static entries, then it must supply a inode callback, 537 * which will compute the inode number based on the parent and the index. 538 * For a directory with dynamic entries, the caller must supply a readdir 539 * callback and a lookup callback. If a static lookup fails, we fall back to 540 * the supplied lookup callback, if any. 541 * 542 * This function also performs the same initialization as gfs_file_create(). 543 */ 544vnode_t * 545gfs_dir_create(size_t struct_size, vnode_t *pvp, vfs_t *vfsp, vnodeops_t *ops, 546 gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen, 547 gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb) 548{ 549 vnode_t *vp; 550 gfs_dir_t *dp; 551 gfs_dirent_t *de; 552 553 vp = gfs_file_create(struct_size, pvp, vfsp, ops); 554 vp->v_type = VDIR; 555 556 dp = vp->v_data; 557 dp->gfsd_file.gfs_type = GFS_DIR; 558 dp->gfsd_maxlen = maxlen; 559 560 if (entries != NULL) { 561 for (de = entries; de->gfse_name != NULL; de++) 562 dp->gfsd_nstatic++; 563 564 dp->gfsd_static = kmem_alloc( 565 dp->gfsd_nstatic * sizeof (gfs_dirent_t), KM_SLEEP); 566 bcopy(entries, dp->gfsd_static, 567 dp->gfsd_nstatic * sizeof (gfs_dirent_t)); 568 } 569 570 dp->gfsd_readdir = readdir_cb; 571 dp->gfsd_lookup = lookup_cb; 572 dp->gfsd_inode = inode_cb; 573 574 mutex_init(&dp->gfsd_lock, NULL, MUTEX_DEFAULT, NULL); 575 576 return (vp); 577} 578 579/* 580 * gfs_root_create(): create a root vnode for a GFS filesystem 581 * 582 * Similar to gfs_dir_create(), this creates a root vnode for a filesystem. The 583 * only difference is that it takes a vfs_t instead of a vnode_t as its parent. 584 */ 585vnode_t * 586gfs_root_create(size_t size, vfs_t *vfsp, vnodeops_t *ops, ino64_t ino, 587 gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen, 588 gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb) 589{ 590 vnode_t *vp; 591 592 VFS_HOLD(vfsp); 593 vp = gfs_dir_create(size, NULL, vfsp, ops, entries, inode_cb, 594 maxlen, readdir_cb, lookup_cb); 595 /* Manually set the inode */ 596 ((gfs_file_t *)vp->v_data)->gfs_ino = ino; 597 vp->v_flag |= VROOT; 598 599 return (vp); 600} 601 602#ifdef sun 603/* 604 * gfs_root_create_file(): create a root vnode for a GFS file as a filesystem 605 * 606 * Similar to gfs_root_create(), this creates a root vnode for a file to 607 * be the pseudo-filesystem. 608 */ 609vnode_t * 610gfs_root_create_file(size_t size, vfs_t *vfsp, vnodeops_t *ops, ino64_t ino) 611{ 612 vnode_t *vp = gfs_file_create(size, NULL, ops); 613 614 ((gfs_file_t *)vp->v_data)->gfs_ino = ino; 615 616 VFS_HOLD(vfsp); 617 VN_SET_VFS_TYPE_DEV(vp, vfsp, VREG, 0); 618 vp->v_flag |= VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT; 619 620 return (vp); 621} 622#endif /* sun */ 623 624/* 625 * gfs_file_inactive() 626 * 627 * Called from the VOP_RECLAIM() routine. If necessary, this routine will 628 * remove the given vnode from the parent directory and clean up any references 629 * in the VFS layer. 630 * 631 * If the vnode was not removed (due to a race with vget), then NULL is 632 * returned. Otherwise, a pointer to the private data is returned. 633 */ 634void * 635gfs_file_inactive(vnode_t *vp) 636{ 637 int i; 638 gfs_dirent_t *ge = NULL; 639 gfs_file_t *fp = vp->v_data; 640 gfs_dir_t *dp = NULL; 641 void *data; 642 643 if (fp->gfs_parent == NULL || (vp->v_flag & V_XATTRDIR)) 644 goto found; 645 646 /* 647 * XXX cope with a FreeBSD-specific race wherein the parent's 648 * snapshot data can be freed before the parent is 649 */ 650 if ((dp = fp->gfs_parent->v_data) == NULL) 651 return (NULL); 652 653 /* 654 * First, see if this vnode is cached in the parent. 655 */ 656 gfs_dir_lock(dp); 657 658 /* 659 * Find it in the set of static entries. 660 */ 661 for (i = 0; i < dp->gfsd_nstatic; i++) { 662 ge = &dp->gfsd_static[i]; 663 664 if (ge->gfse_vnode == vp) 665 goto found; 666 } 667 668 /* 669 * If 'ge' is NULL, then it is a dynamic entry. 670 */ 671 ge = NULL; 672 673found: 674#ifdef TODO 675 if (vp->v_flag & V_XATTRDIR) 676 VI_LOCK(fp->gfs_parent); 677#endif 678 VI_LOCK(vp); 679 /* 680 * Really remove this vnode 681 */ 682 data = vp->v_data; 683 if (ge != NULL) { 684 /* 685 * If this was a statically cached entry, simply set the 686 * cached vnode to NULL. 687 */ 688 ge->gfse_vnode = NULL; 689 } 690 VI_UNLOCK(vp); 691 692 /* 693 * Free vnode and release parent 694 */ 695 if (fp->gfs_parent) { 696 if (dp) 697 gfs_dir_unlock(dp); 698 VOP_UNLOCK(vp, 0); 699 VN_RELE(fp->gfs_parent); 700 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 701 } else { 702 ASSERT(vp->v_vfsp != NULL); 703 VFS_RELE(vp->v_vfsp); 704 } 705#ifdef TODO 706 if (vp->v_flag & V_XATTRDIR) 707 VI_UNLOCK(fp->gfs_parent); 708#endif 709 return (data); 710} 711 712/* 713 * gfs_dir_inactive() 714 * 715 * Same as above, but for directories. 716 */ 717void * 718gfs_dir_inactive(vnode_t *vp) 719{ 720 gfs_dir_t *dp; 721 722 ASSERT(vp->v_type == VDIR); 723 724 if ((dp = gfs_file_inactive(vp)) != NULL) { 725 mutex_destroy(&dp->gfsd_lock); 726 if (dp->gfsd_nstatic) 727 kmem_free(dp->gfsd_static, 728 dp->gfsd_nstatic * sizeof (gfs_dirent_t)); 729 } 730 731 return (dp); 732} 733 734/* 735 * gfs_dir_lookup_dynamic() 736 * 737 * This routine looks up the provided name amongst the dynamic entries 738 * in the gfs directory and returns the corresponding vnode, if found. 739 * 740 * The gfs directory is expected to be locked by the caller prior to 741 * calling this function. The directory will be unlocked during the 742 * execution of this function, but will be locked upon return from the 743 * function. This function returns 0 on success, non-zero on error. 744 * 745 * The dynamic lookups are performed by invoking the lookup 746 * callback, which is passed to this function as the first argument. 747 * The arguments to the callback are: 748 * 749 * int gfs_lookup_cb(vnode_t *pvp, const char *nm, vnode_t **vpp, cred_t *cr, 750 * int flags, int *deflgs, pathname_t *rpnp); 751 * 752 * pvp - parent vnode 753 * nm - name of entry 754 * vpp - pointer to resulting vnode 755 * cr - pointer to cred 756 * flags - flags value from lookup request 757 * ignored here; currently only used to request 758 * insensitive lookups 759 * direntflgs - output parameter, directory entry flags 760 * ignored here; currently only used to indicate a lookup 761 * has more than one possible match when case is not considered 762 * realpnp - output parameter, real pathname 763 * ignored here; when lookup was performed case-insensitively, 764 * this field contains the "real" name of the file. 765 * 766 * Returns 0 on success, non-zero on error. 767 */ 768static int 769gfs_dir_lookup_dynamic(gfs_lookup_cb callback, gfs_dir_t *dp, 770 const char *nm, vnode_t *dvp, vnode_t **vpp, cred_t *cr, int flags, 771 int *direntflags, pathname_t *realpnp) 772{ 773 gfs_file_t *fp; 774 ino64_t ino; 775 int ret; 776 777 ASSERT(GFS_DIR_LOCKED(dp)); 778 779 /* 780 * Drop the directory lock, as the lookup routine 781 * will need to allocate memory, or otherwise deadlock on this 782 * directory. 783 */ 784 gfs_dir_unlock(dp); 785 ret = callback(dvp, nm, vpp, &ino, cr, flags, direntflags, realpnp); 786 gfs_dir_lock(dp); 787 788 /* 789 * The callback for extended attributes returns a vnode 790 * with v_data from an underlying fs. 791 */ 792 if (ret == 0 && !IS_XATTRDIR(dvp)) { 793 fp = (gfs_file_t *)((*vpp)->v_data); 794 fp->gfs_index = -1; 795 fp->gfs_ino = ino; 796 } 797 798 return (ret); 799} 800 801/* 802 * gfs_dir_lookup_static() 803 * 804 * This routine looks up the provided name amongst the static entries 805 * in the gfs directory and returns the corresponding vnode, if found. 806 * The first argument to the function is a pointer to the comparison 807 * function this function should use to decide if names are a match. 808 * 809 * If a match is found, and GFS_CACHE_VNODE is set and the vnode 810 * exists, we simply return the existing vnode. Otherwise, we call 811 * the static entry's callback routine, caching the result if 812 * necessary. If the idx pointer argument is non-NULL, we use it to 813 * return the index of the matching static entry. 814 * 815 * The gfs directory is expected to be locked by the caller prior to calling 816 * this function. The directory may be unlocked during the execution of 817 * this function, but will be locked upon return from the function. 818 * 819 * This function returns 0 if a match is found, ENOENT if not. 820 */ 821static int 822gfs_dir_lookup_static(int (*compare)(const char *, const char *), 823 gfs_dir_t *dp, const char *nm, vnode_t *dvp, int *idx, 824 vnode_t **vpp, pathname_t *rpnp) 825{ 826 gfs_dirent_t *ge; 827 vnode_t *vp = NULL; 828 int i; 829 830 ASSERT(GFS_DIR_LOCKED(dp)); 831 832 /* 833 * Search static entries. 834 */ 835 for (i = 0; i < dp->gfsd_nstatic; i++) { 836 ge = &dp->gfsd_static[i]; 837 838 if (compare(ge->gfse_name, nm) == 0) { 839 if (rpnp) 840 (void) strlcpy(rpnp->pn_buf, ge->gfse_name, 841 rpnp->pn_bufsize); 842 843 if (ge->gfse_vnode) { 844 ASSERT(ge->gfse_flags & GFS_CACHE_VNODE); 845 vp = ge->gfse_vnode; 846 VN_HOLD(vp); 847 break; 848 } 849 850 /* 851 * We drop the directory lock, as the constructor will 852 * need to do KM_SLEEP allocations. If we return from 853 * the constructor only to find that a parallel 854 * operation has completed, and GFS_CACHE_VNODE is set 855 * for this entry, we discard the result in favor of 856 * the cached vnode. 857 */ 858 gfs_dir_unlock(dp); 859 vp = ge->gfse_ctor(dvp); 860 gfs_dir_lock(dp); 861 862 ((gfs_file_t *)vp->v_data)->gfs_index = i; 863 864 /* Set the inode according to the callback. */ 865 ((gfs_file_t *)vp->v_data)->gfs_ino = 866 dp->gfsd_inode(dvp, i); 867 868 if (ge->gfse_flags & GFS_CACHE_VNODE) { 869 if (ge->gfse_vnode == NULL) { 870 ge->gfse_vnode = vp; 871 } else { 872 /* 873 * A parallel constructor beat us to it; 874 * return existing vnode. We have to be 875 * careful because we can't release the 876 * current vnode while holding the 877 * directory lock; its inactive routine 878 * will try to lock this directory. 879 */ 880 vnode_t *oldvp = vp; 881 vp = ge->gfse_vnode; 882 VN_HOLD(vp); 883 884 gfs_dir_unlock(dp); 885 VN_RELE(oldvp); 886 gfs_dir_lock(dp); 887 } 888 } 889 break; 890 } 891 } 892 893 if (vp == NULL) 894 return (ENOENT); 895 else if (idx) 896 *idx = i; 897 *vpp = vp; 898 return (0); 899} 900 901/* 902 * gfs_dir_lookup() 903 * 904 * Looks up the given name in the directory and returns the corresponding 905 * vnode, if found. 906 * 907 * First, we search statically defined entries, if any, with a call to 908 * gfs_dir_lookup_static(). If no static entry is found, and we have 909 * a callback function we try a dynamic lookup via gfs_dir_lookup_dynamic(). 910 * 911 * This function returns 0 on success, non-zero on error. 912 */ 913int 914gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, cred_t *cr, 915 int flags, int *direntflags, pathname_t *realpnp) 916{ 917 gfs_dir_t *dp = dvp->v_data; 918 boolean_t casecheck; 919 vnode_t *dynvp = NULL; 920 vnode_t *vp = NULL; 921 int (*compare)(const char *, const char *); 922 int error, idx; 923 924 ASSERT(dvp->v_type == VDIR); 925 926 if (gfs_lookup_dot(vpp, dvp, dp->gfsd_file.gfs_parent, nm) == 0) 927 return (0); 928 929 casecheck = (flags & FIGNORECASE) != 0 && direntflags != NULL; 930 if (vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) || 931 (flags & FIGNORECASE)) 932 compare = strcasecmp; 933 else 934 compare = strcmp; 935 936 gfs_dir_lock(dp); 937 938 error = gfs_dir_lookup_static(compare, dp, nm, dvp, &idx, &vp, realpnp); 939 940 if (vp && casecheck) { 941 gfs_dirent_t *ge; 942 int i; 943 944 for (i = idx + 1; i < dp->gfsd_nstatic; i++) { 945 ge = &dp->gfsd_static[i]; 946 947 if (strcasecmp(ge->gfse_name, nm) == 0) { 948 *direntflags |= ED_CASE_CONFLICT; 949 goto out; 950 } 951 } 952 } 953 954 if ((error || casecheck) && dp->gfsd_lookup) 955 error = gfs_dir_lookup_dynamic(dp->gfsd_lookup, dp, nm, dvp, 956 &dynvp, cr, flags, direntflags, vp ? NULL : realpnp); 957 958 if (vp && dynvp) { 959 /* static and dynamic entries are case-insensitive conflict */ 960 ASSERT(casecheck); 961 *direntflags |= ED_CASE_CONFLICT; 962 VN_RELE(dynvp); 963 } else if (vp == NULL) { 964 vp = dynvp; 965 } else if (error == ENOENT) { 966 error = 0; 967 } else if (error) { 968 VN_RELE(vp); 969 vp = NULL; 970 } 971 972out: 973 gfs_dir_unlock(dp); 974 975 *vpp = vp; 976 return (error); 977} 978 979/* 980 * gfs_dir_readdir: does a readdir() on the given directory 981 * 982 * dvp - directory vnode 983 * uiop - uio structure 984 * eofp - eof pointer 985 * data - arbitrary data passed to readdir callback 986 * 987 * This routine does all the readdir() dirty work. Even so, the caller must 988 * supply two callbacks in order to get full compatibility. 989 * 990 * If the directory contains static entries, an inode callback must be 991 * specified. This avoids having to create every vnode and call VOP_GETATTR() 992 * when reading the directory. This function has the following arguments: 993 * 994 * ino_t gfs_inode_cb(vnode_t *vp, int index); 995 * 996 * vp - vnode for the directory 997 * index - index in original gfs_dirent_t array 998 * 999 * Returns the inode number for the given entry. 1000 * 1001 * For directories with dynamic entries, a readdir callback must be provided. 1002 * This is significantly more complex, thanks to the particulars of 1003 * VOP_READDIR(). 1004 * 1005 * int gfs_readdir_cb(vnode_t *vp, void *dp, int *eofp, 1006 * offset_t *off, offset_t *nextoff, void *data, int flags) 1007 * 1008 * vp - directory vnode 1009 * dp - directory entry, sized according to maxlen given to 1010 * gfs_dir_create(). callback must fill in d_name and 1011 * d_ino (if a dirent64_t), or ed_name, ed_ino, and ed_eflags 1012 * (if an edirent_t). edirent_t is used if V_RDDIR_ENTFLAGS 1013 * is set in 'flags'. 1014 * eofp - callback must set to 1 when EOF has been reached 1015 * off - on entry, the last offset read from the directory. Callback 1016 * must set to the offset of the current entry, typically left 1017 * untouched. 1018 * nextoff - callback must set to offset of next entry. Typically 1019 * (off + 1) 1020 * data - caller-supplied data 1021 * flags - VOP_READDIR flags 1022 * 1023 * Return 0 on success, or error on failure. 1024 */ 1025int 1026gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, int *ncookies, 1027 u_long **cookies, void *data, cred_t *cr, int flags) 1028{ 1029 gfs_readdir_state_t gstate; 1030 int error, eof = 0; 1031 ino64_t ino, pino; 1032 offset_t off, next; 1033 gfs_dir_t *dp = dvp->v_data; 1034 1035 error = gfs_get_parent_ino(dvp, cr, NULL, &pino, &ino); 1036 if (error) 1037 return (error); 1038 1039 if ((error = gfs_readdir_init(&gstate, dp->gfsd_maxlen, 1, uiop, 1040 pino, ino, flags)) != 0) 1041 return (error); 1042 1043 while ((error = gfs_readdir_pred(&gstate, uiop, &off, ncookies, 1044 cookies)) == 0 && !eof) { 1045 1046 if (off >= 0 && off < dp->gfsd_nstatic) { 1047 ino = dp->gfsd_inode(dvp, off); 1048 1049 if ((error = gfs_readdir_emit(&gstate, uiop, 1050 off, ino, dp->gfsd_static[off].gfse_name, 0, 1051 ncookies, cookies)) != 0) 1052 break; 1053 1054 } else if (dp->gfsd_readdir) { 1055 off -= dp->gfsd_nstatic; 1056 1057 if ((error = dp->gfsd_readdir(dvp, 1058 gstate.grd_dirent, &eof, &off, &next, 1059 data, flags)) != 0 || eof) 1060 break; 1061 1062 off += dp->gfsd_nstatic + 2; 1063 next += dp->gfsd_nstatic + 2; 1064 1065 if ((error = gfs_readdir_emit_int(&gstate, uiop, 1066 next, ncookies, cookies)) != 0) 1067 break; 1068 } else { 1069 /* 1070 * Offset is beyond the end of the static entries, and 1071 * we have no dynamic entries. Set EOF. 1072 */ 1073 eof = 1; 1074 } 1075 } 1076 1077 return (gfs_readdir_fini(&gstate, error, eofp, eof)); 1078} 1079 1080 1081/* 1082 * gfs_vop_lookup: VOP_LOOKUP() entry point 1083 * 1084 * For use directly in vnode ops table. Given a GFS directory, calls 1085 * gfs_dir_lookup() as necessary. 1086 */ 1087/* ARGSUSED */ 1088int 1089gfs_vop_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, 1090 int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, 1091 int *direntflags, pathname_t *realpnp) 1092{ 1093 return (gfs_dir_lookup(dvp, nm, vpp, cr, flags, direntflags, realpnp)); 1094} 1095 1096/* 1097 * gfs_vop_readdir: VOP_READDIR() entry point 1098 * 1099 * For use directly in vnode ops table. Given a GFS directory, calls 1100 * gfs_dir_readdir() as necessary. 1101 */ 1102/* ARGSUSED */ 1103int 1104gfs_vop_readdir(ap) 1105 struct vop_readdir_args /* { 1106 struct vnode *a_vp; 1107 struct uio *a_uio; 1108 struct ucred *a_cred; 1109 int *a_eofflag; 1110 int *ncookies; 1111 u_long **a_cookies; 1112 } */ *ap; 1113{ 1114 vnode_t *vp = ap->a_vp; 1115 uio_t *uiop = ap->a_uio; 1116 cred_t *cr = ap->a_cred; 1117 int *eofp = ap->a_eofflag; 1118 int ncookies = 0; 1119 u_long *cookies = NULL; 1120 int error; 1121 1122 if (ap->a_ncookies) { 1123 /* 1124 * Minimum entry size is dirent size and 1 byte for a file name. 1125 */ 1126 ncookies = uiop->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1); 1127 cookies = malloc(ncookies * sizeof(u_long), M_TEMP, M_WAITOK); 1128 *ap->a_cookies = cookies; 1129 *ap->a_ncookies = ncookies; 1130 } 1131 1132 error = gfs_dir_readdir(vp, uiop, eofp, &ncookies, &cookies, NULL, 1133 cr, 0); 1134 1135 if (error == 0) { 1136 /* Subtract unused cookies */ 1137 if (ap->a_ncookies) 1138 *ap->a_ncookies -= ncookies; 1139 } else if (ap->a_ncookies) { 1140 free(*ap->a_cookies, M_TEMP); 1141 *ap->a_cookies = NULL; 1142 *ap->a_ncookies = 0; 1143 } 1144 1145 return (error); 1146} 1147 1148 1149#ifdef sun 1150/* 1151 * gfs_vop_map: VOP_MAP() entry point 1152 * 1153 * Convenient routine for handling pseudo-files that wish to allow mmap() calls. 1154 * This function only works for readonly files, and uses the read function for 1155 * the vnode to fill in the data. The mapped data is immediately faulted in and 1156 * filled with the necessary data during this call; there are no getpage() or 1157 * putpage() routines. 1158 */ 1159/* ARGSUSED */ 1160int 1161gfs_vop_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, 1162 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cred, 1163 caller_context_t *ct) 1164{ 1165 int rv; 1166 ssize_t resid = len; 1167 1168 /* 1169 * Check for bad parameters 1170 */ 1171#ifdef _ILP32 1172 if (len > MAXOFF_T) 1173 return (ENOMEM); 1174#endif 1175 if (vp->v_flag & VNOMAP) 1176 return (ENOTSUP); 1177 if (off > MAXOFF_T) 1178 return (EFBIG); 1179 if ((long)off < 0 || (long)(off + len) < 0) 1180 return (EINVAL); 1181 if (vp->v_type != VREG) 1182 return (ENODEV); 1183 if ((prot & (PROT_EXEC | PROT_WRITE)) != 0) 1184 return (EACCES); 1185 1186 /* 1187 * Find appropriate address if needed, otherwise clear address range. 1188 */ 1189 as_rangelock(as); 1190 rv = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags); 1191 if (rv != 0) { 1192 as_rangeunlock(as); 1193 return (rv); 1194 } 1195 1196 /* 1197 * Create mapping 1198 */ 1199 rv = as_map(as, *addrp, len, segvn_create, zfod_argsp); 1200 as_rangeunlock(as); 1201 if (rv != 0) 1202 return (rv); 1203 1204 /* 1205 * Fill with data from read() 1206 */ 1207 rv = vn_rdwr(UIO_READ, vp, *addrp, len, off, UIO_USERSPACE, 1208 0, (rlim64_t)0, cred, &resid); 1209 1210 if (rv == 0 && resid != 0) 1211 rv = ENXIO; 1212 1213 if (rv != 0) { 1214 as_rangelock(as); 1215 (void) as_unmap(as, *addrp, len); 1216 as_rangeunlock(as); 1217 } 1218 1219 return (rv); 1220} 1221#endif /* sun */ 1222 1223/* 1224 * gfs_vop_reclaim: VOP_RECLAIM() entry point (solaris' VOP_INACTIVE()) 1225 * 1226 * Given a vnode that is a GFS file or directory, call gfs_file_inactive() or 1227 * gfs_dir_inactive() as necessary, and kmem_free()s associated private data. 1228 */ 1229/* ARGSUSED */ 1230int 1231gfs_vop_reclaim(ap) 1232 struct vop_reclaim_args /* { 1233 struct vnode *a_vp; 1234 struct thread *a_td; 1235 } */ *ap; 1236{ 1237 vnode_t *vp = ap->a_vp; 1238 gfs_file_t *fp = vp->v_data; 1239 1240 if (fp->gfs_type == GFS_DIR) 1241 gfs_dir_inactive(vp); 1242 else 1243 gfs_file_inactive(vp); 1244 1245 vnode_destroy_vobject(vp); 1246 VI_LOCK(vp); 1247 vp->v_data = NULL; 1248 VI_UNLOCK(vp); 1249 kmem_free(fp, fp->gfs_size); 1250 1251 return (0); 1252} 1253