/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * NOTE(review): the system header names are missing from the #include
 * directives below in this copy of the file; restore them from the
 * authoritative source before building.
 */
#include
#include
#include
#include
#ifndef DS_DDICT
#include
#endif
#include
#include
#include
#include
#include
#include
#include
#ifndef DS_DDICT
#include	/* for lookupname */
#endif
#include
#include
#include
#include

#ifdef DS_DDICT
#include "../contract.h"
#endif

#include "../nsctl.h"
#include "nskernd.h"

/*
 * Per-major-number cache of the underlying driver's entry points.
 * Entries are created by _raw_get_maj_info() from the driver's cb_ops
 * and kept on the singly linked list headed by _nsc_raw_majors.
 */
typedef struct raw_maj {
	struct raw_maj	*next;		/* next entry on the list */
	major_t		major;		/* driver major number */
	struct dev_ops	*devops;	/* driver's dev_ops */
	strategy_fn_t	strategy;	/* driver's cb_strategy */
	int		(*open)(dev_t *, int, int, cred_t *);
	int		(*close)(dev_t, int, int, cred_t *);
	int		(*ioctl)(dev_t, int, intptr_t, int, cred_t *, int *);
} raw_maj_t;

/*
 * One slot in the _nsc_raw_files table, describing a device opened
 * through _raw_open().
 */
typedef struct raw_dev {
	ldi_handle_t	lh;		/* Solaris layered driver handle */
	struct vnode	*vp;		/* vnode of device */
	uint64_t	size;		/* size of device in blocks */
	raw_maj_t	*major;		/* pointer to major structure */
	char		*path;		/* pathname -- kmem_alloc'd */
	int		plen;		/* length of kmem_alloc for pathname */
	dev_t		rdev;		/* device number */
	char		in_use;		/* flag: non-zero once opened */
	int		partition;	/* partition number, -1 if unknown */
} raw_dev_t;

static int fd_hwm = 0;	/* first never used entry in _nsc_raw_files */

/* table of open devices, allocated by _nsc_init_raw() */
static raw_dev_t *_nsc_raw_files;
static raw_maj_t *_nsc_raw_majors; kmutex_t _nsc_raw_lock; int _nsc_raw_flags = 0; /* required by nsctl */ static int _nsc_raw_maxdevs; /* local copy */ static int _raw_strategy(struct buf *); /* forward decl */ static dev_t ldi_get_dev_t_from_path(char *path) { vnode_t *vp; dev_t rdev; /* Validate parameters */ if (path == NULL) return (NULL); /* Lookup path */ vp = NULL; if (lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) return (NULL); /* Validate resulting vnode */ if ((vp) && (vp->v_type == VCHR)) rdev = vp->v_rdev; else rdev = (dev_t)NULL; /* Release vnode */ if (vp) VN_RELE(vp); return (rdev); } int _nsc_init_raw(int maxdevs) { _nsc_raw_files = kmem_zalloc(sizeof (*_nsc_raw_files) * maxdevs, KM_SLEEP); if (!_nsc_raw_files) return (ENOMEM); _nsc_raw_maxdevs = maxdevs; _nsc_raw_majors = NULL; mutex_init(&_nsc_raw_lock, NULL, MUTEX_DRIVER, NULL); return (0); } void _nsc_deinit_raw(void) { raw_maj_t *maj = _nsc_raw_majors; raw_maj_t *next; /* Free the memory allocated for strategy pointers */ while (maj != NULL) { next = maj->next; kmem_free(maj, sizeof (*maj)); maj = next; } mutex_destroy(&_nsc_raw_lock); kmem_free(_nsc_raw_files, sizeof (*_nsc_raw_files) * _nsc_raw_maxdevs); _nsc_raw_files = NULL; _nsc_raw_maxdevs = 0; } /* must be called with the _nsc_raw_lock held */ static raw_maj_t * _raw_get_maj_info(major_t umaj) { raw_maj_t *maj = _nsc_raw_majors; ASSERT(MUTEX_HELD(&_nsc_raw_lock)); /* Walk through the linked list */ while (maj != NULL) { if (maj->major == umaj) { /* Found major number */ break; } maj = maj->next; } if (maj == NULL) { struct dev_ops *ops = NULL; #ifdef DEBUG const int maxtry = 5; int try = maxtry; #endif /* * The earlier ldi_open call has locked the driver * for this major number into memory, so just index into * the devopsp array to get the dev_ops pointer which * must be valid. 
*/ ops = devopsp[umaj]; if (ops == NULL || ops->devo_cb_ops == NULL) { cmn_err(CE_WARN, "nskern: cannot find dev_ops for major %d", umaj); return (NULL); } #ifdef DEBUG cmn_err(CE_NOTE, "nsc_raw: held driver (%d) after %d attempts", umaj, (maxtry - try)); #endif /* DEBUG */ maj = kmem_zalloc(sizeof (raw_maj_t), KM_NOSLEEP); if (!maj) { return (NULL); } maj->strategy = ops->devo_cb_ops->cb_strategy; maj->ioctl = ops->devo_cb_ops->cb_ioctl; maj->close = ops->devo_cb_ops->cb_close; maj->open = ops->devo_cb_ops->cb_open; maj->major = umaj; maj->devops = ops; if (maj->strategy == NULL || maj->strategy == nodev || maj->strategy == nulldev) { cmn_err(CE_WARN, "nskern: no strategy function for " "disk driver (major %d)", umaj); kmem_free(maj, sizeof (*maj)); return (NULL); } maj->next = _nsc_raw_majors; _nsc_raw_majors = maj; } return (maj); } /* * nsc_get_strategy returns the strategy function associated with * the major number umaj. NULL is returned if no strategy is found. */ strategy_fn_t nsc_get_strategy(major_t umaj) { raw_maj_t *maj; strategy_fn_t strategy = NULL; mutex_enter(&_nsc_raw_lock); for (maj = _nsc_raw_majors; maj != NULL; maj = maj->next) { if (maj->major == umaj) { /* Found major number */ strategy = maj->strategy; break; } } mutex_exit(&_nsc_raw_lock); return (strategy); } void * nsc_get_devops(major_t umaj) { raw_maj_t *maj; void *devops = NULL; mutex_enter(&_nsc_raw_lock); for (maj = _nsc_raw_majors; maj != NULL; maj = maj->next) { if (maj->major == umaj) { devops = maj->devops; break; } } mutex_exit(&_nsc_raw_lock); return (devops); } /* * _raw_open * * Multiple opens, single close. 
*/ /* ARGSUSED */ static int _raw_open(char *path, int flag, blind_t *cdp, void *iodev) { struct cred *cred; raw_dev_t *cdi = NULL; char *spath; dev_t rdev; int rc, cd, the_cd; int plen; ldi_ident_t li; if (proc_nskernd == NULL) { cmn_err(CE_WARN, "nskern: no nskernd daemon running!"); return (ENXIO); } if (_nsc_raw_maxdevs == 0) { cmn_err(CE_WARN, "nskern: _raw_open() before _nsc_init_raw()!"); return (ENXIO); } plen = strlen(path) + 1; spath = kmem_alloc(plen, KM_SLEEP); if (spath == NULL) { cmn_err(CE_WARN, "nskern: unable to alloc memory in _raw_open()"); return (ENOMEM); } (void) strcpy(spath, path); /* * Lookup the vnode to extract the dev_t info, * then release the vnode. */ if ((rdev = ldi_get_dev_t_from_path(path)) == 0) { kmem_free(spath, plen); return (ENXIO); } /* * See if this device is already opened */ the_cd = -1; mutex_enter(&_nsc_raw_lock); for (cd = 0, cdi = _nsc_raw_files; cd < fd_hwm; cd++, cdi++) { if (rdev == cdi->rdev) { the_cd = cd; break; } else if (the_cd == -1 && !cdi->in_use) the_cd = cd; } if (the_cd == -1) { if (fd_hwm < _nsc_raw_maxdevs) the_cd = fd_hwm++; else { mutex_exit(&_nsc_raw_lock); cmn_err(CE_WARN, "_raw_open: too many open devices"); kmem_free(spath, plen); return (EIO); } } cdi = &_nsc_raw_files[the_cd]; if (cdi->in_use) { /* already set up - just return */ mutex_exit(&_nsc_raw_lock); *cdp = (blind_t)cdi->rdev; kmem_free(spath, plen); return (0); } cdi->partition = -1; cdi->size = (uint64_t)0; cdi->rdev = rdev; cdi->path = spath; cdi->plen = plen; cred = ddi_get_cred(); /* * Layered driver * * We use xxx_open_by_dev() since this guarantees that a * specfs vnode is created and used, not a standard filesystem * vnode. This is necessary since in a cluster PXFS will block * vnode operations during switchovers, so we have to use the * underlying specfs vnode not the PXFS vnode. 
* */ if ((rc = ldi_ident_from_dev(cdi->rdev, &li)) == 0) { rc = ldi_open_by_dev(&cdi->rdev, OTYP_BLK, FREAD|FWRITE, cred, &cdi->lh, li); } if (rc != 0) { cdi->lh = NULL; goto failed; } /* * grab the major_t related information */ cdi->major = _raw_get_maj_info(getmajor(rdev)); if (cdi->major == NULL) { /* Out of memory */ cmn_err(CE_WARN, "_raw_open: cannot alloc major number structure"); rc = ENOMEM; goto failed; } *cdp = (blind_t)cdi->rdev; cdi->in_use++; mutex_exit(&_nsc_raw_lock); return (rc); failed: if (cdi->lh) (void) ldi_close(cdi->lh, FWRITE|FREAD, cred); bzero(cdi, sizeof (*cdi)); mutex_exit(&_nsc_raw_lock); kmem_free(spath, plen); return (rc); } static int __raw_get_cd(dev_t fd) { int cd; if (_nsc_raw_maxdevs != 0) { for (cd = 0; cd < fd_hwm; cd++) { if (fd == _nsc_raw_files[cd].rdev) return (cd); } } return (-1); } /* * _raw_close * * Multiple opens, single close. */ static int _raw_close(dev_t fd) { struct cred *cred; raw_dev_t *cdi; int rc; int cd; mutex_enter(&_nsc_raw_lock); if ((cd = __raw_get_cd(fd)) == -1 || !_nsc_raw_files[cd].in_use) { mutex_exit(&_nsc_raw_lock); return (EIO); } cdi = &_nsc_raw_files[cd]; cred = ddi_get_cred(); rc = ldi_close(cdi->lh, FREAD|FWRITE, cred); if (rc != 0) { mutex_exit(&_nsc_raw_lock); return (rc); } kmem_free(cdi->path, cdi->plen); bzero(cdi, sizeof (*cdi)); mutex_exit(&_nsc_raw_lock); return (0); } /* ARGSUSED */ static int _raw_uread(dev_t fd, uio_t *uiop, cred_t *crp) { return (physio(_raw_strategy, 0, fd, B_READ, minphys, uiop)); } /* ARGSUSED */ static int _raw_uwrite(dev_t fd, uio_t *uiop, cred_t *crp) { return (physio(_raw_strategy, 0, fd, B_WRITE, minphys, uiop)); } static int _raw_strategy(struct buf *bp) { int cd = __raw_get_cd(bp->b_edev); if (cd == -1 || _nsc_raw_files[cd].major == NULL) { bioerror(bp, ENXIO); biodone(bp); return (NULL); } return ((*_nsc_raw_files[cd].major->strategy)(bp)); } static int _raw_partsize(dev_t fd, nsc_size_t *rvalp) { int cd; if ((cd = __raw_get_cd(fd)) == -1 || 
!_nsc_raw_files[cd].in_use) return (EIO); *rvalp = (nsc_size_t)_nsc_raw_files[cd].size; return (0); } /* * Return largest i/o size. */ static nsc_size_t nsc_rawmaxfbas = 0; /* ARGSUSED */ static int _raw_maxfbas(dev_t dev, int flag, nsc_size_t *ptr) { struct buf *bp; if (flag == NSC_CACHEBLK) *ptr = 1; else { if (nsc_rawmaxfbas == 0) { bp = getrbuf(KM_SLEEP); bp->b_bcount = 4096 * 512; minphys(bp); nsc_rawmaxfbas = FBA_NUM(bp->b_bcount); freerbuf(bp); } *ptr = nsc_rawmaxfbas; } return (0); } /* * Control device or system. */ /* ARGSUSED */ static int _raw_control(dev_t dev, int cmd, int *ptr) { #ifdef DEBUG cmn_err(CE_WARN, "unrecognised nsc_control: %x", cmd); #endif return (EINVAL); /* no control commands understood */ } static int _raw_get_bsize(dev_t dev, uint64_t *bsizep, int *partitionp) { #ifdef DKIOCPARTITION struct partition64 *p64 = NULL; #endif struct dk_cinfo *dki_info = NULL; struct dev_ops *ops; struct cred *cred; struct vtoc *vtoc = NULL; dev_info_t *dip; raw_dev_t *cdi; int rc, cd; int flags; int rval; *partitionp = -1; *bsizep = 0; if ((cd = __raw_get_cd(dev)) == -1 || !_nsc_raw_files[cd].in_use) return (-1); cdi = &_nsc_raw_files[cd]; ops = cdi->major->devops; if (ops == NULL) { return (-1); } rc = (*ops->devo_getinfo)(NULL, DDI_INFO_DEVT2DEVINFO, (void *)dev, (void **)&dip); if (rc != DDI_SUCCESS || dip == NULL) { return (-1); } if (!ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, DDI_KERNEL_IOCTL)) { return (-1); } cred = ddi_get_cred(); flags = FKIOCTL | FREAD | FWRITE | DATAMODEL_NATIVE; dki_info = kmem_alloc(sizeof (*dki_info), KM_SLEEP); /* DKIOCINFO */ rc = (*cdi->major->ioctl)(dev, DKIOCINFO, (intptr_t)dki_info, flags, cred, &rval); if (rc != 0) { goto out; } /* return partition number */ *partitionp = (int)dki_info->dki_partition; vtoc = kmem_alloc(sizeof (*vtoc), KM_SLEEP); /* DKIOCGVTOC */ rc = (*cdi->major->ioctl)(dev, DKIOCGVTOC, (intptr_t)vtoc, flags, cred, &rval); if (rc) { /* DKIOCGVTOC failed, but there 
might be an EFI label */ rc = -1; #ifdef DKIOCPARTITION /* do we have an EFI partition table? */ p64 = kmem_alloc(sizeof (*p64), KM_SLEEP); p64->p_partno = (uint_t)*partitionp; /* DKIOCPARTITION */ rc = (*cdi->major->ioctl)(dev, DKIOCPARTITION, (intptr_t)p64, flags, cred, &rval); if (rc == 0) { /* found EFI, return size */ *bsizep = (uint64_t)p64->p_size; } else { /* both DKIOCGVTOC and DKIOCPARTITION failed - error */ rc = -1; } #endif goto out; } if ((vtoc->v_sanity != VTOC_SANE) || (vtoc->v_version != V_VERSION && vtoc->v_version != 0) || (dki_info->dki_partition > V_NUMPAR)) { rc = -1; goto out; } *bsizep = (uint64_t)vtoc->v_part[(int)dki_info->dki_partition].p_size; rc = 0; out: if (dki_info) { kmem_free(dki_info, sizeof (*dki_info)); } if (vtoc) { kmem_free(vtoc, sizeof (*vtoc)); } #ifdef DKIOCPARTITION if (p64) { kmem_free(p64, sizeof (*p64)); } #endif return (rc); } /* * Ugly, ugly, ugly. * * Some volume managers (Veritas) don't support layered ioctls * (no FKIOCTL support, no DDI_KERNEL_IOCTL property defined) AND * do not support the properties for bdev_Size()/bdev_size(). * * If the underlying driver has specified DDI_KERNEL_IOCTL, then we use * the FKIOCTL technique. Otherwise ... * * The only reliable way to get the partition size, is to bounce the * command through user land (nskernd). * * Then, SunCluster PXFS blocks access at the vnode level to device * nodes during failover / switchover, so a read_vtoc() function call * from user land deadlocks. So, we end up coming back into the kernel * to go directly to the underlying device driver - that's what * nskern_bsize() is doing below. * * There has to be a better way ... 
*/ static int _raw_init_dev(dev_t fd, uint64_t *sizep, int *partitionp) { struct nskernd *nsk; int rc, cd; if ((cd = __raw_get_cd(fd)) == -1 || !_nsc_raw_files[cd].in_use) return (EIO); /* try the in-kernel way */ rc = _raw_get_bsize(fd, sizep, partitionp); if (rc == 0) { return (0); } /* fallback to the the slow way */ nsk = kmem_zalloc(sizeof (*nsk), KM_SLEEP); nsk->command = NSKERND_BSIZE; nsk->data1 = (uint64_t)0; nsk->data2 = (uint64_t)fd; (void) strncpy(nsk->char1, _nsc_raw_files[cd].path, NSC_MAXPATH); rc = nskernd_get(nsk); if (rc == 0) { *partitionp = (int)nsk->data2; *sizep = nsk->data1; } kmem_free(nsk, sizeof (*nsk)); return (rc < 0 ? EIO : 0); } static int _raw_attach_io(dev_t fd) { int cd; if ((cd = __raw_get_cd(fd)) == -1 || !_nsc_raw_files[cd].in_use) return (EIO); return (_raw_init_dev(fd, &_nsc_raw_files[cd].size, &_nsc_raw_files[cd].partition)); } /* * See the comment above _raw_init_dev(). */ int nskern_bsize(struct nscioc_bsize *bsize, int *rvp) { struct cred *cred; raw_dev_t *cdi; int errno = 0; int flag; int cd; *rvp = 0; if (bsize == NULL || rvp == NULL) return (EINVAL); cd = __raw_get_cd(bsize->raw_fd); if (cd == -1 || !_nsc_raw_files[cd].in_use) return (EIO); cdi = &_nsc_raw_files[cd]; cred = ddi_get_cred(); /* * ddi_mmap_get_model() returns the model for this user thread * which is what we want - get_udatamodel() is not public. */ flag = FREAD | FWRITE | ddi_mmap_get_model(); if (bsize->efi == 0) { /* DKIOCINFO */ errno = (*cdi->major->ioctl)(bsize->raw_fd, DKIOCINFO, (intptr_t)bsize->dki_info, flag, cred, rvp); if (errno) { return (errno); } /* DKIOCGVTOC */ errno = (*cdi->major->ioctl)(bsize->raw_fd, DKIOCGVTOC, (intptr_t)bsize->vtoc, flag, cred, rvp); if (errno) { return (errno); } } else { #ifdef DKIOCPARTITION /* do we have an EFI partition table? 
*/ errno = (*cdi->major->ioctl)(bsize->raw_fd, DKIOCPARTITION, (intptr_t)bsize->p64, flag, cred, rvp); if (errno) { return (errno); } #endif } return (0); } /* * Private function for sv to use. */ int nskern_partition(dev_t fd, int *partitionp) { uint64_t size; int cd, rc; if ((cd = __raw_get_cd(fd)) == -1 || !_nsc_raw_files[cd].in_use) return (EIO); if ((*partitionp = _nsc_raw_files[cd].partition) != -1) { return (0); } rc = _raw_init_dev(fd, &size, partitionp); if (rc != 0 || *partitionp < 0) { return (EIO); } return (0); } nsc_def_t _nsc_raw_def[] = { "Open", (uintptr_t)_raw_open, 0, "Close", (uintptr_t)_raw_close, 0, "Attach", (uintptr_t)_raw_attach_io, 0, "UserRead", (uintptr_t)_raw_uread, 0, "UserWrite", (uintptr_t)_raw_uwrite, 0, "PartSize", (uintptr_t)_raw_partsize, 0, "MaxFbas", (uintptr_t)_raw_maxfbas, 0, "Control", (uintptr_t)_raw_control, 0, "Provide", NSC_DEVICE, 0, 0, 0, 0 };