/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include mhd_mhiargs_t defmhiargs = { 1000, { 6000, 6000, 30000 } }; #define MDDB #include #include #include #include #include #include #include #include #include extern char svm_bootpath[]; int md_maxbootlist = MAXBOOTLIST; static ulong_t mddb_maxblocks = 0; /* tune for small records */ static int mddb_maxbufheaders = 50; static uint_t mddb_maxcopies = MDDB_NLB; /* * If this is set, more detailed messages about DB init will be given, instead * of just the MDE_DB_NODB. */ static int mddb_db_err_detail = 0; /* * This lock is used to single-thread load/unload of all sets */ static kmutex_t mddb_lock; /* * You really do NOT want to change this boolean. * It can be VERY dangerous to do so. Loss of * data may occur. USE AT YOUR OWN RISK!!!! */ static int mddb_allow_half = 0; /* * For mirrored root allow reboot with only half the replicas available * Flag inserted for Santa Fe project. 
 */
int mirrored_root_flag;

/* Character classification helpers: blank/tab/CR/LF and decimal digits. */
#define	ISWHITE(c)	(((c) == ' ') || ((c) == '\t') || \
			((c) == '\r') || ((c) == '\n'))
#define	ISNUM(c)	(((c) >= '0') && ((c) <= '9'))

/* Address of the s_dbmx mutex of entry setno in the md_set[] array. */
#define	SETMUTEX(setno)	(&md_set[setno].s_dbmx)

extern md_krwlock_t	md_unit_array_rw;	/* md.c */
extern set_t		md_nsets;		/* md.c */
extern int		md_nmedh;		/* md.c */
extern md_set_t		md_set[];		/* md.c */
/* optional hook consulted by mddb_rwdata before bdev_strategy is called */
extern int		(*mdv_strategy_tstpnt)(buf_t *, int, void*);
extern dev_info_t	*md_devinfo;
extern int		md_init_debug;
extern int		md_status;
extern md_ops_t		*md_opslist;
extern md_krwlock_t	nm_lock;

/* forward declaration; the definition is not part of this section */
static int update_locatorblock(mddb_set_t *s, md_dev64_t dev,
    ddi_devid_t didptr, ddi_devid_t old_didptr);

/*
 * Defines for crc calculation for records
 *	rec_crcgen generates a crc checksum for a record block
 *		(the value returned by rec_crcfunc is discarded)
 *	rec_crcchk checks the crc checksum for a record block
 *		(evaluates to the value returned by rec_crcfunc)
 */
#define	REC_CRCGEN	0
#define	REC_CRCCHK	1
#define	rec_crcgen(s, dep, rbp) \
	(void) rec_crcfunc(s, dep, rbp, REC_CRCGEN)
#define	rec_crcchk(s, dep, rbp) \
	rec_crcfunc(s, dep, rbp, REC_CRCCHK)

/*
 * During upgrade, SVM basically runs with the devt from the target
 * being upgraded.  Translations are made from the target devt to the
 * miniroot devt when writing data out to the disk.  This is done by
 * the following routines:
 *	wrtblklst
 *	writeblks
 *	readblklst
 *	readblks
 *	dt_read
 *
 * The following routines are used by the routines listed above and
 * expect a translated (aka miniroot) devt:
 *	getblks
 *	getmasters
 *
 * Also, when calling any system routines, such as ddi_lyr_get_devid,
 * the translated (aka miniroot) devt must be used.
 *
 * By the same token, the major number and major name conversion operations
 * need to use the name_to_major file from the target system instead
 * of the name_to_major file on the miniroot.  So, calls to
 * ddi_name_to_major must be replaced with calls to md_targ_name_to_major
 * when running on an upgrade.  Same is true with calls to
 * ddi_major_to_name.
*/ #ifndef MDDB_FAKE static int mddb_rwdata( mddb_set_t *s, /* incore db set structure */ int flag, /* B_ASYNC, B_FAILFAST or 0 passed in here */ buf_t *bp ) { int err = 0; bp->b_flags = (flag | B_BUSY) & (~B_ASYNC); mutex_exit(SETMUTEX(s->s_setno)); if (mdv_strategy_tstpnt == NULL || (*mdv_strategy_tstpnt)(bp, 0, NULL) == 0) (void) bdev_strategy(bp); if (flag & B_ASYNC) { mutex_enter(SETMUTEX(s->s_setno)); return (0); } err = biowait(bp); mutex_enter(SETMUTEX(s->s_setno)); return (err); } static void setidentifier( mddb_set_t *s, identifier_t *ident ) { if (s->s_setno == MD_LOCAL_SET) (void) strcpy(&ident->serial[0], s->s_ident.serial); else ident->createtime = s->s_ident.createtime; } static int cmpidentifier( mddb_set_t *s, identifier_t *ident ) { if (s->s_setno == MD_LOCAL_SET) return (strcmp(ident->serial, s->s_ident.serial)); else return (timercmp(&ident->createtime, /*CSTYLED*/ &s->s_ident.createtime, !=)); } static int mddb_devopen( md_dev64_t dev ) { dev_t ddi_dev = md_dev64_to_dev(dev); if (dev_lopen(&ddi_dev, FREAD|FWRITE, OTYP_LYR, kcred) == 0) return (0); return (1); } static void mddb_devclose( md_dev64_t dev ) { (void) dev_lclose(md_dev64_to_dev(dev), FREAD|FWRITE, OTYP_LYR, kcred); } /* * stripe_skip_ts * * Returns a list of fields to be skipped in the stripe record structure. * These fields are ms_timestamp in the component structure. * Used to skip these fields when calculating the checksum. 
*/ static crc_skip_t * stripe_skip_ts(void *un, uint_t revision) { struct ms_row32_od *small_mdr; struct ms_row *big_mdr; uint_t row, comp, ncomps, compoff; crc_skip_t *skip; crc_skip_t *skip_prev; crc_skip_t skip_start = {0, 0, 0}; ms_unit_t *big_un; ms_unit32_od_t *small_un; uint_t rb_off = offsetof(mddb_rb32_t, rb_data[0]); switch (revision) { case MDDB_REV_RB: case MDDB_REV_RBFN: small_un = (ms_unit32_od_t *)un; skip_prev = &skip_start; if (small_un->un_nrows == 0) return (NULL); /* * walk through all rows to find the total number * of components */ small_mdr = &small_un->un_row[0]; ncomps = 0; for (row = 0; (row < small_un->un_nrows); row++) { ncomps += small_mdr[row].un_ncomp; } /* Now walk through the components */ compoff = small_un->un_ocomp + rb_off; for (comp = 0; (comp < ncomps); ++comp) { uint_t mdcp = compoff + (comp * sizeof (ms_comp32_od_t)); skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP); skip->skip_offset = mdcp + offsetof(ms_comp32_od_t, un_mirror.ms_timestamp); skip->skip_size = sizeof (md_timeval32_t); skip_prev->skip_next = skip; skip_prev = skip; } break; case MDDB_REV_RB64: case MDDB_REV_RB64FN: big_un = (ms_unit_t *)un; skip_prev = &skip_start; if (big_un->un_nrows == 0) return (NULL); /* * walk through all rows to find the total number * of components */ big_mdr = &big_un->un_row[0]; ncomps = 0; for (row = 0; (row < big_un->un_nrows); row++) { ncomps += big_mdr[row].un_ncomp; } /* Now walk through the components */ compoff = big_un->un_ocomp + rb_off; for (comp = 0; (comp < ncomps); ++comp) { uint_t mdcp = compoff + (comp * sizeof (ms_comp_t)); skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP); skip->skip_offset = mdcp + offsetof(ms_comp_t, un_mirror.ms_timestamp); skip->skip_size = sizeof (md_timeval32_t); skip_prev->skip_next = skip; skip_prev = skip; } break; } /* Return the start of the list of fields to skip */ return (skip_start.skip_next); } /* * mirror_skip_ts * * Returns a list of fields to be 
skipped in the mirror record structure. * This includes un_last_read and sm_timestamp for each submirror * Used to skip these fields when calculating the checksum. */ static crc_skip_t * mirror_skip_ts(uint_t revision) { int i; crc_skip_t *skip; crc_skip_t *skip_prev; crc_skip_t skip_start = {0, 0, 0}; uint_t rb_off = offsetof(mddb_rb32_t, rb_data[0]); skip_prev = &skip_start; skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP); switch (revision) { case MDDB_REV_RB: case MDDB_REV_RBFN: skip->skip_offset = offsetof(mm_unit32_od_t, un_last_read) + rb_off; break; case MDDB_REV_RB64: case MDDB_REV_RB64FN: skip->skip_offset = offsetof(mm_unit_t, un_last_read) + rb_off; break; } skip->skip_size = sizeof (int); skip_prev->skip_next = skip; skip_prev = skip; for (i = 0; i < NMIRROR; i++) { skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP); switch (revision) { case MDDB_REV_RB: case MDDB_REV_RBFN: skip->skip_offset = offsetof(mm_unit32_od_t, un_sm[i].sm_timestamp) + rb_off; break; case MDDB_REV_RB64: case MDDB_REV_RB64FN: skip->skip_offset = offsetof(mm_unit_t, un_sm[i].sm_timestamp) + rb_off; break; } skip->skip_size = sizeof (md_timeval32_t); skip_prev->skip_next = skip; skip_prev = skip; } /* Return the start of the list of fields to skip */ return (skip_start.skip_next); } /* * hotspare_skip_ts * * Returns a list of the timestamp fields in the hotspare record structure. * Used to skip these fields when calculating the checksum. 
 */
/*
 * The returned list has exactly one kmem_zalloc'ed entry.  For a revision
 * that is not handled by the switch its skip_offset is left at zero.
 */
static crc_skip_t *
hotspare_skip_ts(uint_t revision)
{
	crc_skip_t	*skip;
	uint_t		rb_off = offsetof(mddb_rb32_t, rb_data[0]);

	skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP);
	switch (revision) {
	case MDDB_REV_RB:
	case MDDB_REV_RBFN:
		skip->skip_offset = offsetof(hot_spare32_od_t, hs_timestamp) +
		    rb_off;
		break;
	case MDDB_REV_RB64:
	case MDDB_REV_RB64FN:
		skip->skip_offset = offsetof(hot_spare_t, hs_timestamp) +
		    rb_off;
		break;
	}
	skip->skip_size = sizeof (md_timeval32_t);
	return (skip);
}

/*
 * rec_crcfunc
 *
 * Calculate or check the checksum for a record
 * Calculate the crc if check == 0, Check the crc if check == 1
 *
 * Record block may be written by different nodes in a multi-owner diskset
 * (in case of master change), the function rec_crcchk excludes timestamp
 * fields in crc computation of record data.
 * Otherwise, timestamp fields will cause each node to have a different
 * checksum for same record block causing the exclusive-or of all record block
 * checksums and data block record sums to be non-zero after new master writes
 * at least one record block.
 *
 * Returns the result of crcchk() when checking, or the checksum that
 * crcgen() stored in rbp->rb_checksum when generating.  The skip list built
 * here is freed before returning.
 */
static uint_t
rec_crcfunc(
	mddb_set_t	*s,
	mddb_de_ic_t	*dep,
	mddb_rb32_t	*rbp,
	int		check
)
{
	crc_skip_t	*skip;
	crc_skip_t	*skip_tail;
	mddb_type_t	type = dep->de_type1;
	uint_t		ret;

	/*
	 * Generate a list of the areas to be skipped when calculating
	 * the checksum.
	 * First skip rb_checksum, rb_private and rb_userdata.
	 * (The area starts at rb_checksum_fiddle and is three uint_t long;
	 * NOTE(review): confirm against the mddb_rb32_t layout that this
	 * covers the three fields named above.)
	 */
	skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP);
	skip->skip_offset = offsetof(mddb_rb32_t, rb_checksum_fiddle);
	skip->skip_size = 3 * sizeof (uint_t);
	skip_tail = skip;
	if (MD_MNSET_SETNO(s->s_setno)) {
		/* For a MN set, skip rb_timestamp */
		skip_tail = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t),
		    KM_SLEEP);
		skip_tail->skip_offset = offsetof(mddb_rb32_t, rb_timestamp);
		skip_tail->skip_size = sizeof (md_timeval32_t);
		skip->skip_next = skip_tail;

		/* Now add a list of timestamps to be skipped */
		if (type >= MDDB_FIRST_MODID) {
			switch (dep->de_flags) {
			case MDDB_F_STRIPE:
				skip_tail->skip_next =
				    stripe_skip_ts((void *)rbp->rb_data,
				    rbp->rb_revision);
				break;
			case MDDB_F_MIRROR:
				skip_tail->skip_next =
				    mirror_skip_ts(rbp->rb_revision);
				break;
			case MDDB_F_HOTSPARE:
				skip_tail->skip_next =
				    hotspare_skip_ts(rbp->rb_revision);
				break;
			default:
				break;
			}
		}
	}

	if (check) {
		ret = crcchk(rbp, &rbp->rb_checksum, dep->de_recsize, skip);
	} else {
		crcgen(rbp, &rbp->rb_checksum, dep->de_recsize, skip);
		ret = rbp->rb_checksum;
	}

	/* release every entry of the skip list, including the sub-lists */
	while (skip) {
		crc_skip_t	*skip_save = skip;

		skip = skip->skip_next;
		kmem_free(skip_save, sizeof (crc_skip_t));
	}
	return (ret);
}

/*
 * allocbuffer
 *
 * Take a buffer header off the set's free list (s_freebufhead), zero it and
 * initialize the embedded buf.  With MDDB_NOSLEEP an empty free list yields
 * NULL; otherwise the caller waits on s_buf_cv, using the set mutex, until
 * freebuffer() returns a header.
 */
static mddb_bf_t *
allocbuffer(
	mddb_set_t	*s,
	int		sleepflag
)
{
	mddb_bf_t	*bfp;

	while ((bfp = s->s_freebufhead) == NULL) {
		if (sleepflag == MDDB_NOSLEEP)
			return ((mddb_bf_t *)NULL);
		++s->s_bufmisses;
#ifdef	DEBUG
		if (s->s_bufmisses == 1)
			cmn_err(CE_NOTE,
			    "md: mddb: set %u sleeping for buffer", s->s_setno);
#endif
		/* ask freebuffer() to broadcast when a header comes back */
		s->s_bufwakeup = 1;
		cv_wait(&s->s_buf_cv, SETMUTEX(s->s_setno));
	}
	s->s_freebufhead = bfp->bf_next;
	bzero((caddr_t)bfp, sizeof (*bfp));
	bfp->bf_buf.b_back = bfp->bf_buf.b_forw = &bfp->bf_buf;
	bfp->bf_buf.b_flags = B_BUSY;	/* initialize flags */
	return (bfp);
}

/*
 * freebuffer
 *
 * Put a buffer header back at the front of the set's free list and wake
 * any allocbuffer() callers that flagged themselves as waiting.
 */
static void
freebuffer(
	mddb_set_t	*s,
	mddb_bf_t	*bfp
)
{
	bfp->bf_next = s->s_freebufhead;
	s->s_freebufhead = bfp;
	if (s->s_bufwakeup) {
		cv_broadcast(&s->s_buf_cv);
		s->s_bufwakeup = 0;
	}
}

/*
 * blkbusy
 *
 * Mark logical block blk as in use: set its bit in s_freebitmap (one bit
 * per block, eight blocks per byte) and decrement s_freeblkcnt.
 * The block must not already be marked.
 */
static void
blkbusy(
	mddb_set_t	*s,
	mddb_block_t	blk
)
{
	int		bit, byte;

	s->s_freeblkcnt--;
	byte = blk / 8;
	bit = 1 << (blk & 7);
	ASSERT(! (s->s_freebitmap[byte] & bit));
	s->s_freebitmap[byte] |= bit;
}

/*
 * blkfree
 *
 * Mark logical block blk as free: clear its bit in s_freebitmap and
 * increment s_freeblkcnt.  The block must currently be marked busy.
 */
static void
blkfree(
	mddb_set_t	*s,
	mddb_block_t	blk
)
{
	int		bit, byte;

	s->s_freeblkcnt++;
	byte = blk / 8;
	bit = 1 << (blk & 7);
	ASSERT(s->s_freebitmap[byte] & bit);
	s->s_freebitmap[byte] &= ~bit;
}

/*
 * blkcheck
 *
 * Returns non-zero when logical block blk is marked busy in s_freebitmap,
 * zero when it is free.
 */
static int
blkcheck(
	mddb_set_t	*s,
	mddb_block_t	blk
)
{
	int		bit, byte;

	byte = blk / 8;
	bit = 1 << (blk & 7);
	return (s->s_freebitmap[byte] & bit);
}

/*
 * not fast but simple
 *
 * Scan blocks 0 .. s_totalblkcnt - 1 for the first run of count consecutive
 * free blocks, mark the whole run busy and return its first block number.
 * Returns 0 when no such run is found.
 */
static mddb_block_t
getfreeblks(
	mddb_set_t	*s,
	size_t		count
)
{
	int		i;
	size_t		contig;

	contig = 0;
	for (i = 0; i < s->s_totalblkcnt; i++) {
		if (blkcheck(s, i)) {
			contig = 0;
		} else {
			contig++;
			if (contig == count) {
				/* contig is reused as the start of the run */
				contig = i - count + 1;
				for (i = (int)contig; i < contig + count; i++)
					blkbusy(s, i);
				return ((mddb_block_t)contig);
			}
		}
	}
	return (0);
}

/*
 * computefreeblks
 *
 * Rebuild the set's free-block accounting from the incore structures:
 *   - find the highest logical block referenced by any directory block or
 *     directory entry;
 *   - flag every locator whose master blocks provide no more blocks than
 *     that as MDDB_F_TOOSMALL and clear its MDDB_F_ACTIVE flag;
 *   - set s_freeblkcnt and s_totalblkcnt to the smallest replica size seen
 *     (100 if none was seen, capped at the per-set-type maximum);
 *   - allocate the bitmap on first use, clear it, and mark busy every block
 *     used by the locator block, locator names, device id areas, data tags
 *     and directory blocks/entries.
 */
static void
computefreeblks(
	mddb_set_t	*s
)
{
	mddb_db_t	*dbp;
	mddb_de_ic_t	*dep;
	int		i;
	int		minblks;
	int		freeblks;
	mddb_mb_ic_t	*mbip;
	mddb_lb_t	*lbp;
	mddb_block_t	maxblk;
	mddb_did_db_t	*did_dbp;
	int		nblks;

	minblks = 0;
	lbp = s->s_lbp;
	maxblk = 0;

	/*
	 * Determine the max number of blocks.
	 */
	nblks = (lbp->lb_flags & MDDB_MNSET) ? MDDB_MN_MAXBLKS : MDDB_MAXBLKS;

	/*
	 * go through and find highest logical block
	 */
	for (dbp = s->s_dbp; dbp != 0; dbp = dbp->db_next) {
		if (dbp->db_blknum > maxblk)
			maxblk = dbp->db_blknum;
		for (dep = dbp->db_firstentry; dep != 0; dep = dep->de_next)
			for (i = 0; i < dep->de_blkcount; i++)
				if (dep->de_blks[i] > maxblk)
					maxblk = dep->de_blks[i];
	}

	for (i = 0; i < lbp->lb_loccnt; i++) {
		mddb_locator_t	*lp = &lbp->lb_locators[i];

		if ((lp->l_flags & MDDB_F_DELETED) ||
		    (lp->l_flags & MDDB_F_EMASTER))
			continue;

		/* total number of blocks described by this replica's mbs */
		freeblks = 0;
		for (mbip = s->s_mbiarray[i]; mbip != NULL;
		    mbip = mbip->mbi_next) {
			freeblks += mbip->mbi_mddb_mb.mb_blkcnt;
		}
		if (freeblks == 0)	/* this happen when there is no */
			continue;	/* master blk */

		if (freeblks <= maxblk) {
			lp->l_flags |= MDDB_F_TOOSMALL;
			lp->l_flags &= ~MDDB_F_ACTIVE;
		}

		if (freeblks < minblks || minblks == 0)
			minblks = freeblks;
	}

	/*
	 * set up reasonable freespace if no
	 * data bases exist
	 */
	if (minblks == 0)
		minblks = 100;
	if (minblks > nblks)
		minblks = nblks;
	s->s_freeblkcnt = minblks;
	s->s_totalblkcnt = minblks;
	if (! s->s_freebitmapsize) {
		s->s_freebitmapsize = nblks / 8;
		s->s_freebitmap = (uchar_t *)kmem_zalloc(s->s_freebitmapsize,
		    KM_SLEEP);
	}
	bzero((caddr_t)s->s_freebitmap, s->s_freebitmapsize);

	/* locator block sectors */
	for (i = 0; i < s->s_lbp->lb_blkcnt; i++)
		blkbusy(s, i);

	/* locator name sectors */
	for (i = 0; i < s->s_lbp->lb_lnblkcnt; i++)
		blkbusy(s, (s->s_lbp->lb_lnfirstblk + i));

	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
		/* locator block device id information */
		for (i = 0; i < s->s_lbp->lb_didblkcnt; i++)
			blkbusy(s, (s->s_lbp->lb_didfirstblk + i));

		/* disk blocks containing actual device ids */
		did_dbp = s->s_did_icp->did_ic_dbp;
		while (did_dbp) {
			for (i = 0; i < did_dbp->db_blkcnt; i++) {
				blkbusy(s, did_dbp->db_firstblk + i);
			}
			did_dbp = did_dbp->db_next;
		}
	}

	/* Only use data tags if not a MN set */
	if (!(lbp->lb_flags & MDDB_MNSET)) {
		/* Found a bad tag, do NOT mark the data tag blks busy here */
		if (! (md_get_setstatus(s->s_setno) & MD_SET_BADTAG)) {
			for (i = 0; i < s->s_lbp->lb_dtblkcnt; i++)
				blkbusy(s, (s->s_lbp->lb_dtfirstblk + i));
		}
	}

	/* directory block/entry sectors */
	for (dbp = s->s_dbp; dbp != 0; dbp = dbp->db_next) {
		blkbusy(s, dbp->db_blknum);
		for (dep = dbp->db_firstentry; dep != 0; dep = dep->de_next)
			for (i = 0; i < dep->de_blkcount; i++)
				blkbusy(s, dep->de_blks[i]);
	}
}

/*
 * Add free space to the device id incore free list.
 * Called:
 *    - During startup when all devid blocks are temporarily placed on the
 *	free list
 *    - After a devid has been deleted via the metadb command.
 *    - When mddb_devid_free_get adds unused space from a disk block
 *	to free list
 *
 * The new entry is pushed on the front of did_ic_freep; no attempt is made
 * to merge it with neighbouring free areas.  Always returns 0 (nothing is
 * added when the replica is not in device id format).
 */
static int
mddb_devid_free_add(
	mddb_set_t	*s,
	uint_t		firstblk,
	uint_t		offset,
	uint_t		length
)
{
	mddb_did_free_t	*did_freep;

	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
		return (0);
	}

	did_freep = (mddb_did_free_t *)kmem_zalloc(sizeof (mddb_did_free_t),
	    KM_SLEEP);
	did_freep->free_blk = firstblk;
	did_freep->free_offset = offset;
	did_freep->free_length = length;
	did_freep->free_next = s->s_did_icp->did_ic_freep;
	s->s_did_icp->did_ic_freep = did_freep;

	return (0);
}

/*
 * Remove specific free space from the device id incore free list.
 * Called at startup (after all devid blocks have been placed on
 * free list) in order to remove the free space from the list that
 * contains actual devids.
 * Returns 0 if area successfully removed.
 * Returns 1 if no matching area is found - so nothing removed.
 */
/*
 * Also returns 1 when the replica is not in device id format.
 * The free entry that covers [offset, offset + length) is split into the
 * part before and the part after that range; a part of length zero is
 * freed instead of being linked back into the list.
 */
static int
mddb_devid_free_delete(
	mddb_set_t	*s,
	uint_t		firstblk,
	uint_t		offset,
	uint_t		length
)
{
	int		block_found = 0;
	mddb_did_free_t	*did_freep1;		/* next free block */
	mddb_did_free_t	*did_freep2 = 0;	/* previous free block */
	mddb_did_free_t	*did_freep_before;	/* area before offset, len */
	mddb_did_free_t	*did_freep_after;	/* area after offset, len */
	uint_t		old_length;

	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
		return (1);
	}

	/* find free block for this devid */
	did_freep1 = s->s_did_icp->did_ic_freep;
	while (did_freep1) {
		/*
		 * Look through the free list to find our entry.
		 * Our entry should exist since the entire devid block was
		 * placed into this free list at startup.  This code is just
		 * removing the non-free (in-use) portions of the devid block
		 * so that the remaining linked list does indeed just
		 * contain a free list.
		 *
		 * Our entry has been found if
		 *   - the blocks match,
		 *   - the offset (starting address) in the free list is
		 *	less than or equal to the offset of our entry and
		 *   - the length+offset (ending address) in the free list is
		 *	greater than or equal to the length+offset of our
		 *	entry.
		 */
		if ((did_freep1->free_blk == firstblk) &&
		    (did_freep1->free_offset <= offset) &&
		    ((did_freep1->free_length + did_freep1->free_offset) >=
		    (length + offset))) {
			/* Have found our entry - remove from list */
			block_found = 1;
			did_freep_before = did_freep1;
			old_length = did_freep1->free_length;
			/* did_freep1 - pts to next free block */
			did_freep1 = did_freep1->free_next;
			if (did_freep2) {
				did_freep2->free_next = did_freep1;
			} else {
				s->s_did_icp->did_ic_freep = did_freep1;
			}

			/*
			 * did_freep_before points to area in block before
			 * offset, length.  The matched entry itself is reused
			 * for it; only its length changes.
			 */
			did_freep_before->free_length = offset -
			    did_freep_before->free_offset;
			/*
			 * did_freep_after points to area in block after
			 * offset, length.
			 */
			did_freep_after = (mddb_did_free_t *)kmem_zalloc
			    (sizeof (mddb_did_free_t), KM_SLEEP);
			did_freep_after->free_blk = did_freep_before->free_blk;
			did_freep_after->free_offset = offset + length;
			did_freep_after->free_length = old_length - length -
			    did_freep_before->free_length;
			/*
			 * Add before and after areas to free list
			 * If area before or after offset, length has length
			 * of 0, that entry is not added.
			 * The after area is linked first so that the before
			 * area, linked second, ends up in front of it.
			 */
			if (did_freep_after->free_length) {
				did_freep_after->free_next = did_freep1;
				if (did_freep2) {
					did_freep2->free_next =
					    did_freep_after;
				} else {
					s->s_did_icp->did_ic_freep =
					    did_freep_after;
				}
				did_freep1 = did_freep_after;
			} else {
				kmem_free(did_freep_after,
				    sizeof (mddb_did_free_t));
			}

			if (did_freep_before->free_length) {
				did_freep_before->free_next = did_freep1;
				if (did_freep2) {
					did_freep2->free_next =
					    did_freep_before;
				} else {
					s->s_did_icp->did_ic_freep =
					    did_freep_before;
				}
			} else {
				kmem_free(did_freep_before,
				    sizeof (mddb_did_free_t));
			}
			break;
		} else {
			did_freep2 = did_freep1;
			did_freep1 = did_freep1->free_next;
		}
	}
	if (block_found == 0) {
		return (1);
	} else {
		return (0);
	}
}

/*
 * Find free space of devid length and remove free space from list.
 * Return a pointer to the previously free area.
 *
 * If there's not enough free space on the free list, get an empty
 * disk block, put the empty disk block on the did_ic_dbp linked list,
 * and add the disk block space not used for devid to the free list.
 *
 * Return pointer to address (inside disk block) of free area for devid.
 * Return 0 if error.
*/ static caddr_t mddb_devid_free_get( mddb_set_t *s, uint_t len, uint_t *blk, uint_t *cnt, uint_t *offset ) { mddb_did_free_t *freep, *freep2; mddb_did_db_t *dbp; uint_t blk_cnt, blk_num; ddi_devid_t devid_ptr = NULL; if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) { return (0); } freep = s->s_did_icp->did_ic_freep; freep2 = (mddb_did_free_t *)NULL; while (freep) { /* found a free area - remove from free list */ if (len <= freep->free_length) { *blk = freep->free_blk; *offset = freep->free_offset; /* find disk block pointer that contains free area */ dbp = s->s_did_icp->did_ic_dbp; while (dbp) { if (dbp->db_firstblk == *blk) break; else dbp = dbp->db_next; } /* * If a disk block pointer can't be found - something * is wrong, so don't use this free space. */ if (dbp == NULL) { freep2 = freep; freep = freep->free_next; continue; } devid_ptr = (ddi_devid_t)(dbp->db_ptr + *offset); *cnt = dbp->db_blkcnt; /* Update free list information */ freep->free_offset += len; freep->free_length -= len; if (freep->free_length == 0) { if (freep2) { freep2->free_next = freep->free_next; } else { s->s_did_icp->did_ic_freep = freep->free_next; } kmem_free(freep, sizeof (mddb_did_free_t)); } break; } freep2 = freep; freep = freep->free_next; } /* Didn't find a free spot */ if (freep == NULL) { /* get free logical disk blk in replica */ blk_cnt = btodb(len + (MDDB_BSIZE - 1)); blk_num = getfreeblks(s, blk_cnt); if (blk_num == 0) return (0); /* Add disk block to disk block linked list */ dbp = kmem_zalloc(sizeof (mddb_did_db_t), KM_SLEEP); dbp->db_firstblk = blk_num; dbp->db_blkcnt = blk_cnt; dbp->db_ptr = (caddr_t)kmem_zalloc(dbtob(blk_cnt), KM_SLEEP); dbp->db_next = s->s_did_icp->did_ic_dbp; s->s_did_icp->did_ic_dbp = dbp; devid_ptr = (ddi_devid_t)dbp->db_ptr; /* Update return values */ *blk = blk_num; *offset = 0; *cnt = blk_cnt; /* Add unused part of block to free list */ (void) mddb_devid_free_add(s, blk_num, len, (dbtob(blk_cnt) - len)); } return ((caddr_t)devid_ptr); } /* * Add 
device id information for locator index to device id area in set.
 * Get free area to store device id from free list.  Update checksum
 * for mddb_did_blk.
 *
 * This routine does not write any data out to disk.
 * After this routine has been called, the routine, writelocall, should
 * be called to write both the locator block and device id area out
 * to disk.
 *
 * Returns 0 if the device id was stored or one already exists for index.
 * Returns 1 if the replica is not in device id format, if minor_name does
 * not fit in MDDB_MINOR_NAME_MAX bytes (including the terminating null),
 * or if no space could be obtained for the device id.
 */
static int
mddb_devid_add(
	mddb_set_t	*s,
	uint_t		index,
	ddi_devid_t	devid,
	char		*minor_name
)
{
	uint_t		devid_len;
	uint_t		blk, offset;
	ddi_devid_t	devid_ptr;
	mddb_did_info_t	*did_info;
	uint_t		blkcnt;
	mddb_did_blk_t	*did_blk;

	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
		return (1);
	}
	/* the strcpy below relies on this length check */
	if (strlen(minor_name) > (MDDB_MINOR_NAME_MAX - 1))
		return (1);

	/* Check if device id has already been added */
	did_blk = s->s_did_icp->did_ic_blkp;
	did_info = &(did_blk->blk_info[index]);
	if (did_info->info_flags & MDDB_DID_EXISTS)
		return (0);

	devid_len = ddi_devid_sizeof(devid);
	devid_ptr = (ddi_devid_t)mddb_devid_free_get(s,
	    devid_len, &blk, &blkcnt, &offset);

	if (devid_ptr == NULL) {
		return (1);
	}

	/* Copy devid into devid free area */
	bcopy(devid, devid_ptr, devid_len);

	/* Update mddb_did_info area for new device id */
	did_info->info_flags = MDDB_DID_EXISTS | MDDB_DID_VALID;

	/*
	 * Only set UPDATED flag for non-replicated import cases.
	 * This allows the side locator driver name index to get
	 * updated in load_old_replicas.
	 */
	if (!(md_get_setstatus(s->s_setno) & MD_SET_REPLICATED_IMPORT))
		did_info->info_flags |= MDDB_DID_UPDATED;

	did_info->info_firstblk = blk;
	did_info->info_blkcnt = blkcnt;
	did_info->info_offset = offset;
	did_info->info_length = devid_len;
	(void) strcpy(did_info->info_minor_name, minor_name);
	/* per-devid checksum, computed over the stored copy */
	crcgen(devid_ptr, &did_info->info_checksum, devid_len, NULL);

	/* Add device id pointer to did_ic_devid array */
	s->s_did_icp->did_ic_devid[index] = devid_ptr;

	return (0);
}

/*
 * Delete device id information for locator index from device id area in set.
* Add device id space to free area. * * This routine does not write any data out to disk. * After this routine has been called, the routine, writelocall, should * be called to write both the locator block and device id area out * to disk. */ static int mddb_devid_delete(mddb_set_t *s, uint_t index) { mddb_did_info_t *did_info; mddb_did_blk_t *did_blk; if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) { return (1); } /* Get device id information from mddb_did_blk */ did_blk = s->s_did_icp->did_ic_blkp; did_info = &(did_blk->blk_info[index]); /* * Ensure that the underlying device supports device ids * before arbitrarily removing them. */ if (!(did_info->info_flags & MDDB_DID_EXISTS)) { return (1); } /* Remove device id information from mddb_did_blk */ did_info->info_flags = 0; /* Remove device id from incore area */ s->s_did_icp->did_ic_devid[index] = (ddi_devid_t)NULL; /* Add new free space in disk block to free list */ (void) mddb_devid_free_add(s, did_info->info_firstblk, did_info->info_offset, did_info->info_length); return (0); } /* * Check if there is a device id for a locator index. * * Caller of this routine should not free devid or minor_name since * these will point to internal data structures that should not * be freed. */ static int mddb_devid_get( mddb_set_t *s, uint_t index, ddi_devid_t *devid, char **minor_name ) { mddb_did_info_t *did_info; if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) { return (0); } did_info = &(s->s_did_icp->did_ic_blkp->blk_info[index]); if (did_info->info_flags & MDDB_DID_EXISTS) { *devid = s->s_did_icp->did_ic_devid[index]; *minor_name = s->s_did_icp->did_ic_blkp->blk_info[index].info_minor_name; return (1); } else return (0); } /* * Check if device id is valid on current system. * Needs devid, previously known dev_t and current minor_name. * * Success: * Returns 0 if valid device id is found and updates * dev_t if the dev_t associated with the device id is * different than dev_t. 
* Failure: * Returns 1 if device id not valid on current system. */ static int mddb_devid_validate(ddi_devid_t devid, md_dev64_t *dev, char *minor_name) { int retndevs; dev_t *ddi_devs; int devid_flag = 0; int cnt; if (dev == 0) return (1); /* * See if devid is valid in the current system. * If so, set dev to match the devid. */ if (ddi_lyr_devid_to_devlist(devid, minor_name, &retndevs, &ddi_devs) == DDI_SUCCESS) { if (retndevs > 0) { /* devid is valid to use */ devid_flag = 1; /* does dev_t in list match dev */ cnt = 0; while (cnt < retndevs) { if (*dev == md_expldev(ddi_devs[cnt])) break; cnt++; } /* * If a different dev_t, then setup * new dev and new major name */ if (cnt == retndevs) { *dev = md_expldev(ddi_devs[0]); } ddi_lyr_free_devlist(ddi_devs, retndevs); } } if (devid_flag) return (0); else return (1); } /* * Free the devid incore data areas */ static void mddb_devid_icp_free(mddb_did_ic_t **did_icp, mddb_lb_t *lbp) { mddb_did_free_t *did_freep1, *did_freep2; mddb_did_db_t *did_dbp1, *did_dbp2; mddb_did_ic_t *icp = *did_icp; if (icp) { if (icp->did_ic_blkp) { kmem_free((caddr_t)icp->did_ic_blkp, dbtob(lbp->lb_didblkcnt)); icp->did_ic_blkp = (mddb_did_blk_t *)NULL; } if (icp->did_ic_dbp) { did_dbp1 = icp->did_ic_dbp; while (did_dbp1) { did_dbp2 = did_dbp1->db_next; kmem_free((caddr_t)did_dbp1->db_ptr, dbtob(did_dbp1->db_blkcnt)); kmem_free((caddr_t)did_dbp1, sizeof (mddb_did_db_t)); did_dbp1 = did_dbp2; } } if (icp->did_ic_freep) { did_freep1 = icp->did_ic_freep; while (did_freep1) { did_freep2 = did_freep1->free_next; kmem_free((caddr_t)did_freep1, sizeof (mddb_did_free_t)); did_freep1 = did_freep2; } } kmem_free((caddr_t)icp, sizeof (mddb_did_ic_t)); *did_icp = (mddb_did_ic_t *)NULL; } } static daddr_t getphysblk( mddb_block_t blk, mddb_mb_ic_t *mbip ) { mddb_mb_t *mbp = &(mbip->mbi_mddb_mb); while (blk >= mbp->mb_blkcnt) { if (! 
mbip->mbi_next) return ((daddr_t)-1); /* no such block */ blk -= mbp->mb_blkcnt; mbip = mbip->mbi_next; mbp = &(mbip->mbi_mddb_mb); } if (blk >= mbp->mb_blkmap.m_consecutive) return ((daddr_t)-1); /* no such block */ return ((daddr_t)(mbp->mb_blkmap.m_firstblk + blk)); } /* * when a buf header is passed in the new buffer must be * put on the front of the chain. writerec counts on it */ static int putblks( mddb_set_t *s, /* incore db set structure */ caddr_t buffer, /* adr of buffer to be written */ daddr_t blk, /* block number for first block */ int cnt, /* number of blocks to be written */ md_dev64_t device, /* device to be written to */ mddb_bf_t **bufhead /* if non-zero then ASYNC I/O */ /* and put buf address here */ ) { buf_t *bp; mddb_bf_t *bfp; int err = 0; bfp = allocbuffer(s, MDDB_SLEEPOK); bp = &bfp->bf_buf; bp->b_bcount = MDDB_BSIZE * cnt; bp->b_un.b_addr = buffer; bp->b_blkno = blk; bp->b_edev = md_dev64_to_dev(device); /* * if a header for a buf chain is passed in this is async io. * currently only done for optimize records */ if (bufhead) { bfp->bf_next = *bufhead; *bufhead = bfp; (void) mddb_rwdata(s, B_WRITE|B_ASYNC, bp); return (0); } err = mddb_rwdata(s, B_WRITE, bp); freebuffer(s, bfp); if (err) { SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_REPLICA, s->s_setno, device); return (MDDB_F_EWRITE); } return (0); } /* * wrtblklst - takes an array of logical block numbers * and writes the buffer to those blocks (scatter). * If called during upgrade, this routine expects a * non-translated (aka target) dev. 
 */
/*
 * Logical blocks that map to physically consecutive blocks are coalesced
 * into a single putblks() call.
 *
 * Returns 0 on success (or when this node is not allowed to write),
 * MDDB_F_EWRITE when a MN set has no incore master block for the locator,
 * 1 when the device cannot be translated, or the putblks() error.
 */
static int
wrtblklst(
	mddb_set_t	*s,		/* incore set structure */
	caddr_t		buffer,		/* buffer to be written (record blk) */
	mddb_block_t	blka[],		/* list of logical blks for record */
	daddr_t		cnt,		/* number of logical blks */
	const int	li,		/* locator index */
	mddb_bf_t	**bufhead,	/* if non-zero then ASYNC I/O */
					/* and put buf address here */
	int		master_only	/* allow only master node to write */
)
{
	daddr_t		blk;
	daddr_t		blk1;
	int		err = 0;
	int		cons;
	mddb_lb_t	*lbp = s->s_lbp;
	mddb_locator_t	*lp = &lbp->lb_locators[li];
	md_dev64_t	dev;
	mddb_mb_ic_t	*mbip = s->s_mbiarray[li];

	/*
	 * If a MN diskset and only the master can write,
	 * then a non-master node will just return success.
	 */
	if (lbp->lb_flags & MDDB_MNSET) {
		if (master_only == MDDB_WR_ONLY_MASTER) {
			/* return successfully if we aren't the master */
			if (!(md_set[s->s_setno].s_am_i_master)) {
				return (0);
			}
		}
		if (mbip == NULL)
			return (MDDB_F_EWRITE);
	}

	dev = md_xlate_targ_2_mini(md_expldev(lp->l_dev));
	if (dev == NODEV64) {
		return (1);
	}

	/*
	 * blk is the physical start of the current run and cons the number
	 * of logical blocks collected into it so far.
	 */
	blk = getphysblk(blka[0], mbip);
	ASSERT(blk >= 0);

	cons = 1;
	while (cnt) {
		/* extend the run while the next block is physically adjacent */
		if (cons != cnt) {
			blk1 = getphysblk(blka[cons], mbip);
			ASSERT(blk1 >= 0);
			if ((blk + cons) == blk1) {
				cons++;
				continue;
			}
		}
		if (err = putblks(s, buffer, blk, cons, dev, bufhead)) {
			/*
			 * If an MN diskset and any_node_can_write
			 * then this request is coming from writeoptrecord
			 * and l_flags field should not be updated.
			 * l_flags will be updated as a result of sending
			 * a class1 message to the master.  Setting l_flags
			 * here will cause slave to be out of sync with
			 * master.
			 *
			 * Otherwise, set the error in l_flags
			 * (this occurs if this is not a MN diskset or
			 * only_master_can_write is set).
			 */
			if ((!(lbp->lb_flags & MDDB_MNSET)) ||
			    (master_only == MDDB_WR_ONLY_MASTER)) {
				lp->l_flags |= MDDB_F_EWRITE;
			}
			return (err);
		}
		/* putblks() left the new async header at the chain's front */
		if (bufhead)
			(*bufhead)->bf_locator = lp;

		/* advance past the run just written and start a new one */
		buffer += MDDB_BSIZE * cons;
		cnt -= cons;
		blka += cons;
		if (cnt) {
			blk = getphysblk(blka[0], mbip);
			ASSERT(blk >= 0);
		}
		cons = 1;
	}

	return (0);
}

/*
 * writeblks - takes a logical block number/block count pair
 *		and writes the buffer to those contiguous logical blocks.
 * If called during upgrade, this routine expects a non-translated
 * (aka target) dev.
 *
 * A request for more than one block is expanded into a block list and
 * handed to wrtblklst(); a single block is written directly.
 * Returns 0 on success (or when this node is not allowed to write),
 * 1 when the device cannot be translated, or the error of the write.
 */
static int
writeblks(
	mddb_set_t	*s,		/* incore set structure */
	caddr_t		buffer,		/* buffer to be written */
	mddb_block_t	blk,		/* starting logical block number */
	int		cnt,		/* number of log blocks to be written */
	const int	li,		/* locator index */
	int		master_only	/* allow only master node to write */
)
{
	daddr_t		physblk;
	int		err = 0;
	int		i;
	mddb_lb_t	*lbp = s->s_lbp;
	mddb_locator_t	*lp = &lbp->lb_locators[li];
	md_dev64_t	dev;
	mddb_block_t	*blkarray;
	int		size;
	int		ret;

	/*
	 * If a MN diskset and only the master can write,
	 * then a non-master node will just return success.
	 */
	if ((lbp->lb_flags & MDDB_MNSET) &&
	    (master_only == MDDB_WR_ONLY_MASTER)) {
		/* return successfully if we aren't the master */
		if (!(md_set[s->s_setno].s_am_i_master)) {
			return (0);
		}
	}

	dev = md_xlate_targ_2_mini(md_expldev(lp->l_dev));
	if (dev == NODEV64) {
		return (1);
	}

	if (cnt > 1) {
		size = sizeof (mddb_block_t) * cnt;
		blkarray = (mddb_block_t *)kmem_alloc(size, KM_SLEEP);
		for (i = 0; i < cnt; i++)
			blkarray[i] = blk + i;
		/*
		 * NOTE(review): MDDB_WR_ONLY_MASTER is passed here rather
		 * than the caller's master_only, so a multi-block write on a
		 * non-master node of a MN set is silently skipped even when
		 * the caller allowed any node to write - confirm intended.
		 */
		ret = wrtblklst(s, buffer, blkarray, cnt,
		    li, 0, MDDB_WR_ONLY_MASTER);
		kmem_free(blkarray, size);
		return (ret);
	}
	physblk = getphysblk(blk, s->s_mbiarray[li]);
	ASSERT(physblk > 0);
	if (err = putblks(s, buffer, physblk, 1, dev, (mddb_bf_t **)0)) {
		lp->l_flags |= MDDB_F_EWRITE;
		return (err);
	}
	return (0);
}

/*
 * writeall - will write the buffer to all ACTIVE/NON-ERRORED replicas.
*/ static int writeall( mddb_set_t *s, /* incore set structure */ caddr_t buffer, /* buffer to be written */ mddb_block_t block, /* starting logical block number */ int cnt, /* number of log blocks to be written */ int master_only /* allow only master node to write */ ) { int li; int err = 0; mddb_lb_t *lbp = s->s_lbp; for (li = 0; li < lbp->lb_loccnt; li++) { mddb_locator_t *lp = &lbp->lb_locators[li]; if ((! (lp->l_flags & MDDB_F_ACTIVE)) || (lp->l_flags & MDDB_F_EWRITE)) continue; err |= writeblks(s, buffer, block, cnt, li, master_only); } return (err); } /* * writelocall - write the locator block and device id information (if * replica is in device id format) to all ACTIVE/NON-ERRORER replicas. * * Increments the locator block's commitcnt. Updates the device id area's * commitcnt if the replica is in device id format. Regenerates the * checksums after updating the commitcnt(s). */ static int writelocall( mddb_set_t *s /* incore set structure */ ) { int li; int err = 0; mddb_lb_t *lbp = s->s_lbp; mddb_did_blk_t *did_blk; mddb_did_db_t *did_dbp; s->s_lbp->lb_commitcnt++; if (lbp->lb_flags & MDDB_DEVID_STYLE) { did_blk = s->s_did_icp->did_ic_blkp; did_blk->blk_commitcnt = s->s_lbp->lb_commitcnt; crcgen(did_blk, &did_blk->blk_checksum, dbtob(lbp->lb_didblkcnt), NULL); } crcgen(lbp, &lbp->lb_checksum, dbtob(lbp->lb_blkcnt), NULL); for (li = 0; li < lbp->lb_loccnt; li++) { mddb_locator_t *lp = &lbp->lb_locators[li]; if ((! 
(lp->l_flags & MDDB_F_ACTIVE)) || (lp->l_flags & MDDB_F_EWRITE)) continue; if (lbp->lb_flags & MDDB_DEVID_STYLE) { /* write out blocks containing actual device ids */ did_dbp = s->s_did_icp->did_ic_dbp; while (did_dbp) { err |= writeblks(s, (caddr_t)did_dbp->db_ptr, did_dbp->db_firstblk, did_dbp->db_blkcnt, li, MDDB_WR_ONLY_MASTER); did_dbp = did_dbp->db_next; } /* write out device id area block */ err |= writeblks(s, (caddr_t)did_blk, lbp->lb_didfirstblk, lbp->lb_didblkcnt, li, MDDB_WR_ONLY_MASTER); } /* write out locator block */ err |= writeblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, li, MDDB_WR_ONLY_MASTER); } /* * If a MN diskset and this is the master, set the PARSE_LOCBLK flag * in the mddb_set structure to show that the locator block has * been changed. */ if ((lbp->lb_flags & MDDB_MNSET) && (md_set[s->s_setno].s_am_i_master)) { s->s_mn_parseflags |= MDDB_PARSE_LOCBLK; } return (err); } /* * If called during upgrade, this routine expects a translated * (aka miniroot) dev. */ static int getblks( mddb_set_t *s, /* incore db set structure */ caddr_t buffer, /* buffer to read data into */ md_dev64_t device, /* device to read from */ daddr_t blk, /* physical block number to read */ int cnt, /* number of blocks to read */ int flag /* flags for I/O */ ) { buf_t *bp; mddb_bf_t *bfp; int err = 0; bfp = allocbuffer(s, MDDB_SLEEPOK); /* this will never sleep */ bp = &bfp->bf_buf; bp->b_bcount = MDDB_BSIZE * cnt; bp->b_un.b_addr = buffer; bp->b_blkno = blk; bp->b_edev = md_dev64_to_dev(device); err = mddb_rwdata(s, (B_READ | flag), bp); freebuffer(s, bfp); if (err) { SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_REPLICA, s->s_setno, device); return (MDDB_F_EREAD); } return (0); } /* * readblklst - takes an array of logical block numbers * and reads those blocks (gather) into the buffer. * If called during upgrade, this routine expects a non-translated * (aka target) dev. 
*/
static int
readblklst(
	mddb_set_t	*s,		/* incore set structure */
	caddr_t		buffer,		/* buffer to be read (record block) */
	mddb_block_t	blka[],		/* list of logical blocks to be read */
	daddr_t		cnt,		/* number of logical blocks */
	int		li,		/* locator index */
	int		flag		/* flags for I/O */
)
{
	mddb_mb_ic_t	*mbip = s->s_mbiarray[li];
	md_dev64_t	dev;
	daddr_t		first;		/* physical start of current run */
	daddr_t		next;		/* physical blk of next candidate */
	int		run;		/* logical blks in current run */
	int		err;

	/* The stored devt is the target devt; I/O needs the translated one */
	dev = md_xlate_targ_2_mini(
	    md_expldev(s->s_lbp->lb_locators[li].l_dev));
	if (dev == NODEV64)
		return (1);

	first = getphysblk(blka[0], mbip);
	ASSERT(first >= 0);

	while (cnt) {
		/*
		 * Grow the run for as long as the next logical block maps
		 * to the physical block directly following the run, so
		 * that the whole run is fetched with one getblks() call.
		 */
		for (run = 1; run != cnt; run++) {
			next = getphysblk(blka[run], mbip);
			ASSERT(next >= 0);
			if ((first + run) != next)
				break;
		}

		err = getblks(s, buffer, dev, first, run, flag);
		if (err)
			return (err);

		/* Step past the run that was just read */
		buffer += MDDB_BSIZE * run;
		cnt -= run;
		blka += run;
		if (cnt) {
			first = getphysblk(blka[0], mbip);
			ASSERT(first >= 0);
		}
	}
	return (0);
}

/*
 * readblks - takes a logical block number/block count pair
 * and reads those contiguous logical blocks into the buffer.
 * If called during upgrade, this routine expects a non-translated
 * (aka target) dev.
*/ static int readblks( mddb_set_t *s, /* incore set structure */ caddr_t buffer, /* buffer to be read into */ mddb_block_t blk, /* logical block number to be read */ int cnt, /* number of logical blocks to be read */ int li /* locator index */ ) { daddr_t physblk; md_dev64_t device; int i; mddb_block_t *blkarray; int size; int ret; if (cnt > 1) { size = sizeof (mddb_block_t) * cnt; blkarray = (mddb_block_t *)kmem_alloc(size, KM_SLEEP); for (i = 0; i < cnt; i++) blkarray[i] = blk + i; ret = readblklst(s, buffer, blkarray, cnt, li, 0); kmem_free(blkarray, size); return (ret); } physblk = getphysblk(blk, s->s_mbiarray[li]); ASSERT(physblk > 0); device = md_expldev(s->s_lbp->lb_locators[li].l_dev); device = md_xlate_targ_2_mini(device); if (device == NODEV64) { return (1); } return (getblks(s, buffer, device, physblk, 1, 0)); } static void single_thread_start( mddb_set_t *s ) { while (s->s_singlelockgotten) { s->s_singlelockwanted++; cv_wait(&s->s_single_thread_cv, SETMUTEX(s->s_setno)); } s->s_singlelockgotten++; } static void single_thread_end( mddb_set_t *s ) { ASSERT(s->s_singlelockgotten); s->s_singlelockgotten = 0; if (s->s_singlelockwanted) { s->s_singlelockwanted = 0; cv_broadcast(&s->s_single_thread_cv); } } static size_t sizeofde( mddb_de_ic_t *dep ) { size_t size; size = sizeof (mddb_de_ic_t) - sizeof (mddb_block_t) + sizeof (mddb_block_t) * dep->de_blkcount; return (size); } static size_t sizeofde32( mddb_de32_t *dep ) { size_t size; size = sizeof (*dep) - sizeof (dep->de32_blks) + sizeof (mddb_block_t) * dep->de32_blkcount; return (size); } static mddb_de32_t * nextentry( mddb_de32_t *dep ) { mddb_de32_t *ret; ret = (mddb_de32_t *)((void *)((caddr_t)dep + sizeofde32(dep))); return (ret); } static void create_db32rec( mddb_db32_t *db32p, mddb_db_t *dbp ) { mddb_de_ic_t *dep; mddb_de32_t *de32p; #if defined(_ILP32) && !defined(lint) ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t)); ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t)); #endif dbtodb32(dbp, 
db32p); if ((dbp->db_firstentry != NULL) && (db32p->db32_firstentry == 0)) db32p->db32_firstentry = 0x4; de32p = (mddb_de32_t *)((void *) ((caddr_t)(&db32p->db32_firstentry) + sizeof (db32p->db32_firstentry))); for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { detode32(dep, de32p); if ((dep->de_next != NULL) && (de32p->de32_next == 0)) de32p->de32_next = 0x4; de32p = nextentry(de32p); } ASSERT((uintptr_t)de32p <= (uintptr_t)de32p + MDDB_BSIZE); } /* * If called during upgrade, this routine expects a translated * (aka miniroot) dev. * If master blocks are found, set the mn_set parameter to 1 if the * the master block revision number is MDDB_REV_MNMB; otherwise, * set it to 0. * If master blocks are not found, do not change the mnset parameter. */ static mddb_mb_ic_t * getmasters( mddb_set_t *s, md_dev64_t dev, daddr_t blkno, uint_t *flag, int *mn_set ) { mddb_mb_ic_t *mbi = NULL; mddb_mb_t *mb; int error = 0; ddi_devid_t devid; if (mddb_devopen(dev)) { if (flag) *flag |= MDDB_F_EMASTER; return ((mddb_mb_ic_t *)NULL); } mbi = (mddb_mb_ic_t *)kmem_zalloc(MDDB_IC_BSIZE, KM_SLEEP); mb = &(mbi->mbi_mddb_mb); if (error = getblks(s, (caddr_t)mb, dev, blkno, btodb(MDDB_BSIZE), 0)) { error |= MDDB_F_EMASTER; } if (mb->mb_magic != MDDB_MAGIC_MB) { error = MDDB_F_EFMT | MDDB_F_EMASTER; } /* Check for MDDB_REV_MNMB and lower */ if (revchk(MDDB_REV_MNMB, mb->mb_revision)) { error = MDDB_F_EFMT | MDDB_F_EMASTER; } if (crcchk(mb, &mb->mb_checksum, MDDB_BSIZE, NULL)) { error = MDDB_F_EFMT | MDDB_F_EMASTER; } if (!(md_get_setstatus(s->s_setno) & (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT)) && (mb->mb_setno != s->s_setno)) { error = MDDB_F_EFMT | MDDB_F_EMASTER; } if (mb->mb_blkno != blkno) { error = MDDB_F_EFMT | MDDB_F_EMASTER; } mb->mb_next = NULL; mbi->mbi_next = NULL; if (error) goto out; /* * Check the md_devid_destroy and md_keep_repl_state flags * to see if we need to regen the devid or not. 
* * Don't care about devid in local set since it is not used * and this should not be part of set importing */ if ((s->s_setno != MD_LOCAL_SET) && !(md_get_setstatus(s->s_setno) & (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT))) { /* * Now check the destroy flag. We also need to handle * the case where the destroy flag is reset after the * destroy */ if (md_devid_destroy || (mb->mb_devid_len == 0)) { if (md_devid_destroy) { bzero(mb->mb_devid, mb->mb_devid_len); mb->mb_devid_len = 0; } /* * Try to regenerate it if the 'keep' flag is not set */ if (!md_keep_repl_state) { if (ddi_lyr_get_devid(md_dev64_to_dev(dev), &devid) == DDI_SUCCESS) { mb->mb_devid_len = ddi_devid_sizeof(devid); bcopy(devid, mb->mb_devid, mb->mb_devid_len); ddi_devid_free(devid); } else { error = MDDB_F_EFMT | MDDB_F_EMASTER; } } crcgen(mb, &mb->mb_checksum, MDDB_BSIZE, NULL); /* * Push */ if (putblks(s, (caddr_t)mb, blkno, 1, dev, 0) != 0) { error = MDDB_F_EFMT | MDDB_F_EMASTER; } } } if (! error) { /* Set mn_set parameter to 1 if a MN set */ if (mb->mb_revision == MDDB_REV_MNMB) *mn_set = 1; else *mn_set = 0; return (mbi); } out: /* Error Out */ if (flag) *flag |= error; kmem_free((caddr_t)mbi, MDDB_IC_BSIZE); mddb_devclose(dev); return ((mddb_mb_ic_t *)NULL); } static int getrecord( mddb_set_t *s, mddb_de_ic_t *dep, int li ) { int err = 0; mddb_rb32_t *rbp; #if defined(_ILP32) && !defined(lint) ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t)); #endif dep->de_rb = (mddb_rb32_t *)kmem_zalloc(dep->de_recsize, KM_SLEEP); rbp = dep->de_rb; err = readblklst(s, (caddr_t)rbp, dep->de_blks, dep->de_blkcount, li, 0); if (err) { return (MDDB_F_EDATA | err); } if (rbp->rb_magic != MDDB_MAGIC_RB) { return (MDDB_F_EFMT | MDDB_F_EDATA); } if ((revchk(MDDB_REV_RB, rbp->rb_revision) != 0) && (revchk(MDDB_REV_RB64, rbp->rb_revision) != 0) && (revchk(MDDB_REV_RBFN, rbp->rb_revision) != 0) && (revchk(MDDB_REV_RB64FN, rbp->rb_revision) != 0)) { return (MDDB_F_EFMT | MDDB_F_EDATA); } /* Check crc for this record */ 
if (rec_crcchk(s, dep, rbp)) { return (MDDB_F_EFMT | MDDB_F_EDATA); } return (0); } /* * Code to read in the locator name information */ static int readlocnames( mddb_set_t *s, int li ) { mddb_ln_t *lnp; int err = 0; mddb_block_t ln_blkcnt, ln_blkno; /* * read in the locator name blocks */ s->s_lnp = NULL; ln_blkno = s->s_lbp->lb_lnfirstblk; ln_blkcnt = s->s_lbp->lb_lnblkcnt; lnp = (mddb_ln_t *)kmem_zalloc(dbtob(ln_blkcnt), KM_SLEEP); err = readblks(s, (caddr_t)lnp, ln_blkno, ln_blkcnt, li); if (err) { err |= MDDB_F_EDATA; goto out; } if (lnp->ln_magic != MDDB_MAGIC_LN) { err = MDDB_F_EDATA | MDDB_F_EFMT; goto out; } if (s->s_lbp->lb_flags & MDDB_MNSET) { if (revchk(MDDB_REV_MNLN, lnp->ln_revision)) { err = MDDB_F_EDATA | MDDB_F_EFMT; goto out; } } else { if (revchk(MDDB_REV_LN, lnp->ln_revision)) { err = MDDB_F_EDATA | MDDB_F_EFMT; goto out; } } if (crcchk(lnp, &lnp->ln_checksum, dbtob(ln_blkcnt), NULL)) { err = MDDB_F_EDATA | MDDB_F_EFMT; goto out; } out: /* * if error occurred in locator name blocks free them * and return */ if (err) { kmem_free((caddr_t)lnp, dbtob(ln_blkcnt)); return (err); } s->s_lnp = lnp; return (0); } /* * code to read in a copy of the database. */ static int readcopy( mddb_set_t *s, int li ) { uint_t blk; mddb_db_t *dbp, *dbp1, *dbhp; mddb_db32_t *db32p; mddb_de_ic_t *dep, *dep2; mddb_de32_t *de32p, *de32p2; int err = 0; uint_t checksum; #if defined(_ILP32) && !defined(lint) ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t)); ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t)); #endif dbp = NULL; dbhp = NULL; /* * read in all the directory blocks */ blk = s->s_lbp->lb_dbfirstblk; db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP); for (; blk != 0; blk = dbp->db_nextblk) { dbp1 = (mddb_db_t *)kmem_zalloc(sizeof (mddb_db_t), KM_SLEEP); if (! 
dbhp) { dbhp = dbp1; } else { dbp->db_next = dbp1; } dbp = dbp1; err = readblks(s, (caddr_t)db32p, blk, 1, li); if (err) { err |= MDDB_F_EDATA; break; } db32todb(db32p, dbp); if (db32p->db32_magic != MDDB_MAGIC_DB) { err = MDDB_F_EDATA | MDDB_F_EFMT; break; } if (revchk(MDDB_REV_DB, db32p->db32_revision)) { err = MDDB_F_EDATA | MDDB_F_EFMT; break; } if (crcchk(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL)) { err = MDDB_F_EDATA | MDDB_F_EFMT; break; } /* * first go through and fix up all de_next pointers */ if (dbp->db_firstentry) { de32p = (mddb_de32_t *) ((void *) ((caddr_t)(&db32p->db32_firstentry) + sizeof (db32p->db32_firstentry))); dep = (mddb_de_ic_t *) kmem_zalloc(sizeof (mddb_de_ic_t) - sizeof (mddb_block_t) + sizeof (mddb_block_t) * de32p->de32_blkcount, KM_SLEEP); de32tode(de32p, dep); dbp->db_firstentry = dep; while (de32p && de32p->de32_next) { de32p2 = nextentry(de32p); dep2 = (mddb_de_ic_t *)kmem_zalloc( sizeof (mddb_de_ic_t) - sizeof (mddb_block_t) + sizeof (mddb_block_t) * de32p2->de32_blkcount, KM_SLEEP); de32tode(de32p2, dep2); dep->de_next = dep2; dep = dep2; de32p = de32p2; } } /* * go through and make all of the pointer to record blocks * are null; */ for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next) dep->de_rb = NULL; } kmem_free((caddr_t)db32p, MDDB_BSIZE); dbp->db_next = NULL; /* * if error occurred in directory blocks free them * and return */ if (err) { dbp = dbhp; while (dbp) { dep = dbp->db_firstentry; while (dep) { /* No mddb_rb32_t structures yet */ dep2 = dep->de_next; kmem_free((caddr_t)dep, sizeofde(dep)); dep = dep2; } dbp1 = dbp->db_next; kmem_free((caddr_t)dbp, sizeof (mddb_db_t)); dbp = dbp1; } s->s_dbp = NULL; return (err); } /* */ err = 0; checksum = MDDB_GLOBAL_XOR; for (dbp = dbhp; dbp != NULL; dbp = dbp->db_next) { checksum ^= dbp->db_recsum; for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { if (dep->de_flags & MDDB_F_OPT) continue; err = getrecord(s, dep, li); if (err) break; /* Don't include 
CHANGELOG in big XOR */ if (dep->de_flags & MDDB_F_CHANGELOG) continue; checksum ^= dep->de_rb->rb_checksum; checksum ^= dep->de_rb->rb_checksum_fiddle; } if (err) break; } if (checksum) { if (! err) err = MDDB_F_EDATA | MDDB_F_EFMT; } if (err) { dbp = dbhp; dbhp = NULL; while (dbp) { dep = dbp->db_firstentry; while (dep) { if (dep->de_rb) kmem_free((caddr_t)dep->de_rb, dep->de_recsize); dep2 = dep->de_next; kmem_free((caddr_t)dep, sizeofde(dep)); dep = dep2; } dbp1 = dbp->db_next; kmem_free((caddr_t)dbp, sizeof (mddb_db_t)); dbp = dbp1; } } s->s_dbp = dbhp; return (err); } static int getoptcnt( mddb_set_t *s, int li) { int result; mddb_de_ic_t *dep; mddb_db_t *dbp; #if defined(_ILP32) && !defined(lint) ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t)); ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t)); #endif result = 0; for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { dep = dbp->db_firstentry; for (; dep != NULL; dep = dep->de_next) { if (! (dep->de_flags & MDDB_F_OPT)) continue; if (((dep->de_optinfo[0].o_flags & MDDB_F_ACTIVE) && (li == dep->de_optinfo[0].o_li)) || ((dep->de_optinfo[1].o_flags & MDDB_F_ACTIVE) && (li == dep->de_optinfo[1].o_li))) result++; } } return (result); } static void getoptdev( mddb_set_t *s, mddb_de_ic_t *rdep, int opti ) { mddb_lb_t *lbp; mddb_locator_t *lp; mddb_optinfo_t *otherop; mddb_optinfo_t *resultop; int li; dev_t otherdev; int blkonly = 0; int mincnt; int thiscnt; lbp = s->s_lbp; resultop = &rdep->de_optinfo[opti]; otherop = &rdep->de_optinfo[1-opti]; resultop->o_flags = 0; /* * scan through and see if data bases have to vary by only device */ if (otherop->o_flags & MDDB_F_ACTIVE) { blkonly = 1; otherdev = expldev(lbp->lb_locators[otherop->o_li].l_dev); for (li = 0; li < lbp->lb_loccnt; li++) { lp = &lbp->lb_locators[li]; if (! 
(lp->l_flags & MDDB_F_ACTIVE)) continue; if (expldev(lp->l_dev) != otherdev) { blkonly = 0; break; } } } mincnt = 999999; for (li = 0; li < lbp->lb_loccnt; li++) { dev_info_t *devi; int removable = 0; lp = &lbp->lb_locators[li]; if (! (lp->l_flags & MDDB_F_ACTIVE)) continue; if (otherop->o_flags & MDDB_F_ACTIVE) { if (blkonly) { if (otherop->o_li == li) continue; } else { if (otherdev == expldev(lp->l_dev)) continue; } } /* * Check if this is a removable device. If it is we * assume it is something like a USB flash disk, a zip disk * or even a floppy that is being used to help maintain * mddb quorum. We don't want to put any optimized resync * records on these kinds of disks since they are usually * slower or don't have the same read/write lifetimes as * a regular fixed disk. */ if ((devi = e_ddi_hold_devi_by_dev(lp->l_dev, 0)) != NULL) { int error; struct cb_ops *cb; ddi_prop_op_t prop_op = PROP_LEN_AND_VAL_BUF; int propvalue = 0; int proplength = sizeof (int); if ((cb = devopsp[getmajor(lp->l_dev)]->devo_cb_ops) != NULL) { error = (*cb->cb_prop_op)(DDI_DEV_T_ANY, devi, prop_op, DDI_PROP_NOTPROM | DDI_PROP_DONTPASS, "removable-media", (caddr_t)&propvalue, &proplength); if (error == DDI_PROP_SUCCESS) removable = 1; } ddi_release_devi(devi); } if (removable) continue; thiscnt = getoptcnt(s, li); if (thiscnt < mincnt) { resultop->o_li = li; mincnt = thiscnt; resultop->o_flags = MDDB_F_ACTIVE; } } } static void allocuserdata( mddb_de_ic_t *dep ) { mddb_rb32_t *rbp; #if defined(_ILP32) && !defined(lint) ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t)); #endif rbp = dep->de_rb; rbp->rb_private = 0; dep->de_rb_userdata = kmem_zalloc(dep->de_reqsize, KM_SLEEP); rbp->rb_userdata = 0x4; /* Make sure this is non-zero */ bcopy((caddr_t)rbp->rb_data, dep->de_rb_userdata, dep->de_reqsize); } static void getuserdata( set_t setno, mddb_de_ic_t *dep ) { mddb_rb32_t *rbp; mddb_type_t type = dep->de_type1; caddr_t data, udata; #if defined(_ILP32) && !defined(lint) ASSERT(sizeof 
(mddb_rb_t) == sizeof (mddb_rb32_t)); #endif rbp = dep->de_rb; data = (caddr_t)rbp->rb_data; udata = (caddr_t)dep->de_rb_userdata; /* * If it's a driver record, and an old style record, and not a DRL * record, we must convert it because it was incore as a 64 bit * structure but its on disk layout has only 32 bit for block sizes */ if (!(md_get_setstatus(setno) & (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT)) && (type >= MDDB_FIRST_MODID) && ((rbp->rb_revision == MDDB_REV_RB) || (rbp->rb_revision == MDDB_REV_RBFN))) { switch (dep->de_flags) { case MDDB_F_STRIPE: stripe_convert(data, udata, BIG_2_SMALL); break; case MDDB_F_MIRROR: mirror_convert(data, udata, BIG_2_SMALL); break; case MDDB_F_RAID: raid_convert(data, udata, BIG_2_SMALL); break; case MDDB_F_SOFTPART: softpart_convert(data, udata, BIG_2_SMALL); break; case MDDB_F_TRANS_MASTER: trans_master_convert(data, udata, BIG_2_SMALL); break; case MDDB_F_TRANS_LOG: trans_log_convert(data, udata, BIG_2_SMALL); break; case MDDB_F_HOTSPARE: hs_convert(data, udata, BIG_2_SMALL); break; case MDDB_F_OPT: default: bcopy(udata, data, dep->de_reqsize); } } else { bcopy(udata, data, dep->de_reqsize); } } static void getoptrecord( mddb_set_t *s, mddb_de_ic_t *dep ) { mddb_lb_t *lbp; mddb_locator_t *lp; mddb_rb32_t *rbp, *crbp; int li; int i; int err = 0; size_t recsize; #if defined(_ILP32) && !defined(lint) ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t)); #endif lbp = s->s_lbp; recsize = dep->de_recsize; dep->de_rb = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP); rbp = dep->de_rb; crbp = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP); dep->de_optinfo[0].o_flags |= MDDB_F_EDATA; dep->de_optinfo[1].o_flags |= MDDB_F_EDATA; for (i = 0; i < 2; i++) { if (! (dep->de_optinfo[i].o_flags & MDDB_F_ACTIVE)) continue; li = dep->de_optinfo[i].o_li; lp = &lbp->lb_locators[li]; if (! 
(lp->l_flags & MDDB_F_ACTIVE) || (lp->l_flags & MDDB_F_EMASTER)) continue; err = readblklst(s, (caddr_t)rbp, dep->de_blks, dep->de_blkcount, li, 0); if (err) continue; if (rbp->rb_magic != MDDB_MAGIC_RB) continue; if (revchk(MDDB_REV_RB, rbp->rb_revision)) continue; /* Check the crc for this record */ if (rec_crcchk(s, dep, rbp)) { continue; } dep->de_optinfo[i].o_flags = MDDB_F_ACTIVE; if (rbp == crbp) { if (rbp->rb_checksum != crbp->rb_checksum) dep->de_optinfo[1].o_flags |= MDDB_F_EDATA; break; } rbp = crbp; } if (rbp == crbp) { rbp->rb_private = 0; kmem_free((caddr_t)crbp, recsize); return; } bzero((caddr_t)rbp, recsize); rbp->rb_magic = MDDB_MAGIC_RB; rbp->rb_revision = MDDB_REV_RB; uniqtime32(&rbp->rb_timestamp); /* Generate the crc for this record */ rec_crcgen(s, dep, rbp); kmem_free((caddr_t)crbp, recsize); } /* * writeoptrecord writes out an optimized record. */ static int writeoptrecord( mddb_set_t *s, mddb_de_ic_t *dep ) { mddb_rb32_t *rbp; int li; int err = 0, wrt_err = 0; mddb_bf_t *bufhead, *bfp; mddb_lb_t *lbp = s->s_lbp; mddb_locator_t *lp; int i; #if defined(_ILP32) && !defined(lint) ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t)); #endif bufhead = NULL; err = 0; while (s->s_opthavequeuinglck) { s->s_optwantqueuinglck++; cv_wait(&s->s_optqueuing_cv, SETMUTEX(s->s_setno)); } s->s_opthavequeuinglck++; rbp = dep->de_rb; for (i = 0; i < 2; i++) { /* * only possible error is xlate. This can * occur if a replica was off line and came * back. During the mean time the database grew * large than the now on line replica can store */ if (! (dep->de_optinfo[i].o_flags & MDDB_F_ACTIVE)) continue; li = dep->de_optinfo[i].o_li; /* * In a MN diskset, any node can write optimized record(s). */ wrt_err = wrtblklst(s, (caddr_t)rbp, dep->de_blks, dep->de_blkcount, li, &bufhead, MDDB_WR_ANY_NODE); /* * For MN diskset, set error in optinfo structure so * that mddb_commitrec knows which replica failed. 
*/ if ((MD_MNSET_SETNO(s->s_setno)) && (wrt_err & MDDB_F_EWRITE)) { dep->de_optinfo[i].o_flags |= MDDB_F_EWRITE; } err |= wrt_err; } s->s_opthavequeuinglck = 0; if (s->s_optwantqueuinglck) { s->s_optwantqueuinglck = 0; cv_broadcast(&s->s_optqueuing_cv); } for (bfp = bufhead; bfp; bfp = bufhead) { mutex_exit(SETMUTEX(s->s_setno)); (void) biowait(&bfp->bf_buf); mutex_enter(SETMUTEX(s->s_setno)); if (bfp->bf_buf.b_flags & B_ERROR) { /* * If an MN diskset, don't set replica * in error since this hasn't been set in master. * Setting replica in error before master could * leave the nodes with different views of the * world since a class 1 configuration change * could occur in mddb_commitrec as soon as * all locks are dropped. Must keep this * node the same as master and can't afford a * failure from the class 1 config change * if master succeeded. */ if (!(MD_MNSET_SETNO(s->s_setno))) { bfp->bf_locator->l_flags |= MDDB_F_EWRITE; } else { /* * Find which de_optinfo (which replica) * had a failure and set the failure in * the o_flags field. */ lp = &lbp->lb_locators[dep->de_optinfo[0].o_li]; if (lp == bfp->bf_locator) { dep->de_optinfo[0].o_flags |= MDDB_F_EWRITE; } else { dep->de_optinfo[1].o_flags |= MDDB_F_EWRITE; } } err |= MDDB_F_EWRITE; } bufhead = bfp->bf_next; freebuffer(s, bfp); } return (err); } /* * Fix up the optimized resync record. Used in the traditional and local * disksets to move an optimized record from a failed or deleted mddb * to an active one. * * In a MN diskset, the fixing of the optimized record is split between * the master and slave nodes. If the master node moves the optimized * resync record, then the master node will send a MDDB_PARSE_OPTRECS * message to the slave nodes causing the slave nodes to reget the * directory entry containing the location of the optimized resync record. * After the record is reread from disk, then writeoptrecord is called * if the location of the optimized resync record or flags have changed. 
* When writeoptrecord is called, the node that is the owner of this record
 * will write the optimized record to the location specified in the directory
 * entry.  Since the master node uses the highest class message (PARSE)
 * the record owner node is guaranteed to already have an updated
 * directory entry incore.
 *
 * The other difference between the traditional/local set and MN diskset
 * is that the directory entry can be written to disk before the optimized
 * record in a MN diskset if the record is owned by a slave node.  So,
 * the users of an optimized record must handle the failure case when no
 * data is available from an optimized record since the master node could
 * have failed during the relocation of the optimized record to another mddb.
 */
static int
fixoptrecord(
	mddb_set_t	*s,
	mddb_de_ic_t	*dep,
	mddb_db_t	*dbp
)
{
	int		changed;	/* a copy was assigned a new replica */
	int		writedata;	/* the record itself must be rewritten */
	int		err = 0;
	int		i;
	mddb_lb_t	*lbp;
	mddb_optinfo_t	*op;
	mddb_db32_t	*db32p;
	int		rec_owner;	/* Is node owner of record? */

#if defined(_ILP32) && !defined(lint)
	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
#endif

	lbp = s->s_lbp;
	changed = 0;
	writedata = 0;
	for (i = 0; i < 2; i++) {
		op = &dep->de_optinfo[i];

		/* A copy on a replica that is no longer active is unusable */
		if (! (lbp->lb_locators[op->o_li].l_flags & MDDB_F_ACTIVE))
			op->o_flags = 0;

		/*
		 * If optimized record has seen a replica failure,
		 * assign new replica to record and re-write data
		 * to new record.
		 */
		if (! (op->o_flags & MDDB_F_ACTIVE)) {
			getoptdev(s, dep, i);
			writedata++;
			changed++;
			/* Set flag for slaves to reread dep and write rec */
			if (lbp->lb_flags & MDDB_MNSET) {
				s->s_mn_parseflags |= MDDB_PARSE_OPTRECS;
			}
		}

		/*
		 * If just an error in the data was seen, set
		 * the optimized record's replica flag to active (ok)
		 * and try again.
		 *
		 * NOTE(review): the flags of slot 0 are reset here no
		 * matter which slot op refers to -- confirm this is
		 * intended.
		 */
		if (op->o_flags & MDDB_F_EDATA) {
			dep->de_optinfo[0].o_flags = MDDB_F_ACTIVE;
			writedata++;
		}
	}

	rec_owner = 0;
	if (lbp->lb_flags & MDDB_MNSET) {
		/*
		 * If a MN diskset then check the owner of optimized record.
		 * If the master node owns the record or if there is
		 * no owner of the record, then the master can write the
		 * optimized record to disk.
		 * Master node can write the optimized record now, but
		 * slave nodes write their records during handling of
		 * the MDDB_PARSE_OPTRECS message.
		 */
		if ((dep->de_owner_nodeid == MD_MN_INVALID_NID) ||
		    (dep->de_owner_nodeid == md_set[s->s_setno].s_nodeid)) {
			rec_owner = 1;
		}
	} else {
		/*
		 * In traditional diskset and local set, this node
		 * is always the record owner and always the master.
		 */
		rec_owner = 1;
	}

	/*
	 * If this node is the record owner, write out record.
	 */
	if ((writedata) && (rec_owner)) {
		if (err = writeoptrecord(s, dep)) {
			return (err);
		}
	}
	/* The directory block is rewritten only if a copy was relocated */
	if (! changed)
		return (0);
	uniqtime32(&dbp->db_timestamp);
	dbp->db_revision = MDDB_REV_DB;
	/* Build the 32 bit image of the directory block and checksum it */
	db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
	create_db32rec(db32p, dbp);
	crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
	err = writeall(s, (caddr_t)db32p, db32p->db32_blknum,
	    1, MDDB_WR_ONLY_MASTER);
	kmem_free((caddr_t)db32p, MDDB_BSIZE);
	return (err);
}

/*
 * fixoptrecords - run fixoptrecord() over every optimized (MDDB_F_OPT)
 * directory entry of the set.  Returns 0 on success or the first error
 * returned by fixoptrecord(); the walk stops at that error.
 */
static int
fixoptrecords(
	mddb_set_t	*s
)
{
	mddb_de_ic_t	*dep;
	mddb_db_t	*dbp;
	int		err = 0;
	set_t		setno;

	/*
	 * In a MN diskset, the master node is the only node that runs
	 * fixoptrecords.  If the master node changes anything, then the
	 * master node sends PARSE message to the slave nodes.  The slave
	 * nodes will then re-read in the locator block or re-read in the
	 * directory blocks and re-write the optimized resync records.
	 */
	setno = s->s_setno;
	if ((setno != MD_LOCAL_SET) && (s->s_lbp->lb_flags & MDDB_MNSET) &&
	    (md_set[setno].s_am_i_master == 0)) {
		return (0);
	}

	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
			/* Only optimized records need fixing up */
			if (! (dep->de_flags & MDDB_F_OPT))
				continue;
			err = fixoptrecord(s, dep, dbp);
			if (err != 0)
				return (err);
		}
	}
	return (0);
}

/*
 * Checks incore version of mddb data to mddb data ondisk.
 *
 * Returns:
 *	- 0 if the data was successfully read and is good.
* - MDDB_F_EREAD if a read error occurred. * - 1 if the data read is bad (checksum failed, etc) */ static int checkcopy ( mddb_set_t *s, int li ) { mddb_db_t *dbp; mddb_db32_t *cdb32p; mddb_de_ic_t *dep; mddb_de32_t *cde32p; mddb_rb32_t *rbp, *crbp; size_t size; int i; int retval = 1; #if defined(_ILP32) && !defined(lint) ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t)); ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t)); ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t)); #endif if (s->s_databuffer_size == 0) { size_t maxrecsize = MDDB_BSIZE; for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) for (dep = dbp->db_firstentry; dep; dep = dep->de_next) if (! (dep->de_flags & MDDB_F_OPT) && dep->de_recsize > maxrecsize) maxrecsize = dep->de_recsize; s->s_databuffer = (caddr_t)kmem_zalloc(maxrecsize, KM_SLEEP); s->s_databuffer_size = maxrecsize; } cdb32p = (mddb_db32_t *)s->s_databuffer; /* * first go through and make sure all directory stuff * is the same */ for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { if (readblks(s, (caddr_t)cdb32p, dbp->db_blknum, 1, li)) { retval = MDDB_F_EREAD; goto err; } if (cdb32p->db32_magic != MDDB_MAGIC_DB) goto err; if (revchk(MDDB_REV_DB, cdb32p->db32_revision)) goto err; if (crcchk(cdb32p, &cdb32p->db32_checksum, MDDB_BSIZE, NULL)) goto err; if (cdb32p->db32_nextblk != dbp->db_nextblk) goto err; if (cdb32p->db32_recsum != dbp->db_recsum) goto err; if (cdb32p->db32_firstentry) { cde32p = (mddb_de32_t *) ((void *)((caddr_t)(&cdb32p->db32_firstentry) + sizeof (cdb32p->db32_firstentry))); } else cde32p = NULL; dep = dbp->db_firstentry; /* * check if all directory entries are identical */ while (dep && cde32p) { if (dep->de_recid != cde32p->de32_recid) goto err; if (dep->de_type1 != cde32p->de32_type1) goto err; if (dep->de_type2 != cde32p->de32_type2) goto err; if (dep->de_reqsize != cde32p->de32_reqsize) goto err; if (dep->de_flags != cde32p->de32_flags) goto err; for (i = 0; i < 2; i++) { if (dep->de_optinfo[i].o_li != 
cde32p->de32_optinfo[i].o_li) break; } if (i != 2) goto err; size = sizeof (mddb_block_t) * dep->de_blkcount; if (bcmp((caddr_t)dep->de_blks, (caddr_t)cde32p->de32_blks, size)) goto err; dep = dep->de_next; if (cde32p->de32_next) cde32p = nextentry(cde32p); else cde32p = NULL; } if (dep || cde32p) goto err; } /* * If here, all directories are functionally identical * check to make sure all records are identical * the reason the records are not just bcmped is that the * lock flag does not want to be compared. */ crbp = (mddb_rb32_t *)cdb32p; for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { if ((dep->de_flags & MDDB_F_OPT) || (dep->de_flags & MDDB_F_CHANGELOG)) continue; rbp = (mddb_rb32_t *)dep->de_rb; if (readblklst(s, (caddr_t)crbp, dep->de_blks, dep->de_blkcount, li, 0)) { retval = MDDB_F_EREAD; goto err; } /* Check the crc for this record */ if (rec_crcchk(s, dep, crbp)) goto err; if (rbp->rb_checksum != crbp->rb_checksum || rbp->rb_checksum_fiddle != crbp->rb_checksum_fiddle) goto err; } } return (0); err: return (retval); } /* * Determine if the location information for two mddbs is the same. * The device slice and block offset should match. If both have devids then * use that for the comparison, otherwise we compare the dev_ts. * Comparing with the devid allows us to handle the case where a mddb was * relocated to a dead mddbs dev_t. The live mddb will have the dev_t of * the dead mddb but the devid comparison will catch this and not match. * * Return 1 if the location of the two mddbs match, 0 if not. */ static int match_mddb(mddb_ri_t *rip, ddi_devid_t devid, char *minor, md_dev64_t dev, daddr32_t blkno) { if (rip->ri_flags & MDDB_F_EMASTER) { /* * If this element is errored then we don't try to match on it. * If we try to match we could erroneously match on the dev_t * of a relocated disk. 
*/ return (0); } if (rip->ri_devid && devid && minor) { /* * If old devid exists, then this is a replicated diskset * and both old and new devids must be checked. */ if (rip->ri_old_devid) { if (((ddi_devid_compare(rip->ri_devid, devid) != 0) && (ddi_devid_compare(rip->ri_old_devid, devid) != 0)) || (strcmp(rip->ri_minor_name, minor) != 0)) return (0); } else { if (ddi_devid_compare(rip->ri_devid, devid) != 0 || strcmp(rip->ri_minor_name, minor) != 0) return (0); } } else { if (rip->ri_dev != dev) return (0); } if (rip->ri_blkno != blkno) return (0); return (1); } static int ridev( mddb_ri_t **rip, mddb_cfg_loc_t *clp, dev32_t *dev_2b_fixed, int flag) { mddb_ri_t *r, *r1; md_dev64_t ldev, ndev; major_t majordev; int sz; if (MD_UPGRADE) { ldev = md_makedevice(md_targ_name_to_major(clp->l_driver), clp->l_mnum); } else { if (ddi_name_to_major(clp->l_driver) == (major_t)-1) return (EINVAL); ldev = md_makedevice(ddi_name_to_major(clp->l_driver), clp->l_mnum); } if (clp->l_devid != 0) { /* * Get dev associated with device id and minor name. * Setup correct driver name if dev is now different. * Don't change driver name if during upgrade. 
*/ ndev = ldev; if (!mddb_devid_validate((ddi_devid_t)(uintptr_t)clp->l_devid, &ndev, clp->l_minor_name)) { if ((ndev != ldev) && (!(MD_UPGRADE))) { majordev = md_getmajor(ndev); (void) strcpy(clp->l_driver, ddi_major_to_name(majordev)); clp->l_mnum = md_getminor(ndev); clp->l_devid_flags |= MDDB_DEVID_VALID; ldev = ndev; } } else { /* Mark as invalid */ clp->l_devid_flags &= ~MDDB_DEVID_VALID; } } clp->l_dev = md_cmpldev(ldev); if (dev_2b_fixed) *dev_2b_fixed = clp->l_dev; r = *rip; while (r) { if (match_mddb(r, (ddi_devid_t)(uintptr_t)clp->l_devid, clp->l_minor_name, ldev, clp->l_blkno)) { if ((clp->l_devid != 0) && !(clp->l_devid_flags & MDDB_DEVID_VALID)) { r->ri_flags |= MDDB_F_EMASTER; } else { r->ri_flags |= flag; } return (0); /* already entered return success */ } r = r->ri_next; } /* * This replica not represented in the current rip list, * so add it to the list. */ r = (mddb_ri_t *)kmem_zalloc(sizeof (**rip), KM_SLEEP); r->ri_dev = ldev; r->ri_blkno = clp->l_blkno; (void) strncpy(r->ri_driver, clp->l_driver, MD_MAXDRVNM); if (strlen(clp->l_driver) >= MD_MAXDRVNM) { r->ri_driver[(MD_MAXDRVNM -1)] = '\0'; } if (clp->l_devname != NULL) { (void) strcpy(r->ri_devname, clp->l_devname); } r->ri_flags |= flag; if (clp->l_devid != 0) { sz = clp->l_devid_sz; r->ri_devid = (ddi_devid_t)kmem_zalloc(sz, KM_SLEEP); bcopy((void *)(uintptr_t)clp->l_devid, (char *)r->ri_devid, sz); if (clp->l_old_devid != NULL) { sz = clp->l_old_devid_sz; r->ri_old_devid = (ddi_devid_t)kmem_zalloc(sz, KM_SLEEP); bcopy((char *)(uintptr_t)clp->l_old_devid, (char *)r->ri_old_devid, sz); } else { r->ri_old_devid = 0; } if (strlen(clp->l_minor_name) < MDDB_MINOR_NAME_MAX) (void) strcpy(r->ri_minor_name, clp->l_minor_name); if (!(clp->l_devid_flags & MDDB_DEVID_VALID)) { /* * Devid is present, but not valid. This could * happen if device has been powered off or if * the device has been removed. Mark the device in * error. 
Don't allow any writes to this device * based on the dev_t since another device could * have been placed in its spot and be responding to * the dev_t accesses. */ r->ri_flags |= MDDB_F_EMASTER; } } else { r->ri_devid = 0; r->ri_old_devid = 0; } /* * If the rip list is empty then this entry * is the list. */ if (*rip == NULL) { *rip = r; return (0); } /* * Add this entry to the end of the rip list */ r1 = *rip; while (r1->ri_next) r1 = r1->ri_next; r1->ri_next = r; return (0); } /* * writecopy writes the incore data blocks out to all of the replicas. * This is called from writestart * - when a diskset is started or * - when an error has been enountered during the write to a mddb. * and from newdev when a new mddb is being added. * * flag can be 2 values: * MDDB_WRITECOPY_ALL - write all records to all mddbs. This is * always used for traditional and local disksets. * For MN diskset: * All nodes can call writecopy, but only the * master node actually writes data to the disk * except for optimized resync records. * An optimized resync record can only be written to * by the record owner. * MDDB_WRITECOPY_SYNC - special case for MN diskset. When a new * master has been chosen, the new master may need to * write its incore mddb to disk (this is the case where the * old master had executed a message but hadn't relayed it * to this slave yet). New master should not write the * change log records since new master would be overwriting * valuable data. Only used during a reconfig cycle. 
*/ static int writecopy( mddb_set_t *s, int li, int flag ) { mddb_db_t *dbp; mddb_db32_t *db32p; mddb_de_ic_t *dep; mddb_rb32_t *rbp; uint_t checksum; int err = 0; #if defined(_ILP32) && !defined(lint) ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t)); ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t)); #endif for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP); create_db32rec(db32p, dbp); crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL); err = writeblks(s, (caddr_t)db32p, dbp->db_blknum, 1, li, MDDB_WR_ONLY_MASTER); kmem_free((caddr_t)db32p, MDDB_BSIZE); if (err) return (err); for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { /* * In a multinode diskset, when a new master is * chosen the new master may need to write its * incore copy of the mddb to disk. In this case, * don't want to overwrite the change log records * so new master sets flag to MDDB_WRITECOPY_SYNC. */ if (flag == MDDB_WRITECOPY_SYNC) { if (dep->de_flags & MDDB_F_CHANGELOG) continue; } /* * In a multinode diskset, don't write out optimized * resync resyncs since only the mirror owner node * will have the correct data. If writecopy is * being called from writestart as a result of * an mddb failure, then writestart will handle * the optimized records when it calls fixoptrecords. 
*/ if ((MD_MNSET_SETNO(s->s_setno)) && (dep->de_flags & MDDB_F_OPT)) { continue; } rbp = dep->de_rb; checksum = rbp->rb_checksum_fiddle; checksum ^= rbp->rb_checksum; /* Generate the crc for this record */ rec_crcgen(s, dep, rbp); checksum ^= rbp->rb_checksum; rbp->rb_checksum_fiddle = checksum; if (err = wrtblklst(s, (caddr_t)rbp, dep->de_blks, dep->de_blkcount, li, (mddb_bf_t **)0, MDDB_WR_ONLY_MASTER)) return (err); } } return (0); } static int upd_med( mddb_set_t *s, char *tag ) { med_data_t meddb; int medok; mddb_lb_t *lbp = s->s_lbp; set_t setno = s->s_setno; int li; int alc; int lc; /* If no mediator hosts, nothing to do */ if (s->s_med.n_cnt == 0) return (0); /* * If this is a MN set and we are not the master, then don't * update mediator hosts or mark mediator as golden since * only master node should do that. */ if ((setno != MD_LOCAL_SET) && (s->s_lbp->lb_flags & MDDB_MNSET) && (md_set[setno].s_am_i_master == 0)) { return (0); } bzero((char *)&meddb, sizeof (med_data_t)); meddb.med_dat_mag = MED_DATA_MAGIC; meddb.med_dat_rev = MED_DATA_REV; meddb.med_dat_fl = 0; meddb.med_dat_sn = setno; meddb.med_dat_cc = lbp->lb_commitcnt; TIMEVAL32_TO_TIMEVAL(&meddb.med_dat_id, &lbp->lb_ident.createtime); crcgen(&meddb, &meddb.med_dat_cks, sizeof (med_data_t), NULL); /* count accessible mediators */ medok = upd_med_hosts(&s->s_med, s->s_setname, &meddb, tag); /* count accessible and existing replicas */ for (li = 0, alc = 0, lc = 0; li < lbp->lb_loccnt; li++) { mddb_locator_t *lp = &lbp->lb_locators[li]; if (lp->l_flags & MDDB_F_DELETED) continue; lc++; if (! (lp->l_flags & MDDB_F_ACTIVE) || (lp->l_flags & MDDB_F_EMASTER) || (lp->l_flags & MDDB_F_EWRITE)) continue; alc++; } /* * Mediator update quorum is >= 50%: check for less than * "mediator update" quorum. 
*/ if ((medok * 2) < s->s_med.n_cnt) { /* panic if <= 50% of all replicas are accessible */ if ((lc > 0) && ((alc * 2) <= lc)) { cmn_err(CE_PANIC, "md: Update of 50%% of the mediator hosts failed"); /* NOTREACHED */ } cmn_err(CE_WARN, "md: Update of 50%% of the mediator hosts failed"); } /* * If we have mediator update quorum and exactly 50% of the replicas * are accessible then mark the mediator as golden. */ if (((medok * 2) >= (s->s_med.n_cnt + 1)) && (lc > 0) && ((alc * 2) == lc)) { meddb.med_dat_fl = MED_DFL_GOLDEN; crcgen(&meddb, &meddb.med_dat_cks, sizeof (med_data_t), NULL); (void) upd_med_hosts(&s->s_med, s->s_setname, &meddb, tag); } return (0); } static int push_lb(mddb_set_t *s) { mddb_lb_t *lbp = s->s_lbp; /* push the change to all the replicas */ uniqtime32(&lbp->lb_timestamp); if (MD_MNSET_SETNO(s->s_setno)) { lbp->lb_revision = MDDB_REV_MNLB; } else { lbp->lb_revision = MDDB_REV_LB; } /* * The updates to the mediator hosts are done * by the callers of this function. */ return (writelocall(s)); } /* Should not call for MN diskset since data tags are not supported */ static int dtl_cmp(const mddb_dtag_t *odtp, const mddb_dtag_t *ndtp) { int diff = 0; diff = (int)(odtp->dt_setno - ndtp->dt_setno); if (diff) return (diff); diff = strncmp(odtp->dt_sn, ndtp->dt_sn, MDDB_SN_LEN); if (diff) return (diff); diff = strncmp(odtp->dt_hn, ndtp->dt_hn, MD_MAX_NODENAME_PLUS_1); if (diff) return (diff); /*CSTYLED*/ return (timercmp(&odtp->dt_tv, &ndtp->dt_tv, !=)); } /* Should not call for MN diskset since data tags are not supported */ static int dtl_addl(mddb_set_t *s, const mddb_dtag_t *ndtp) { int nextid = 0; mddb_dtag_lst_t **dtlpp = &s->s_dtlp; /* Run to the end of the list */ for (/* void */; (*dtlpp != NULL); dtlpp = &(*dtlpp)->dtl_nx) { if (dtl_cmp(&(*dtlpp)->dtl_dt, ndtp) == 0) return (0); nextid++; } /* Add the new member */ *dtlpp = kmem_zalloc(sizeof (**dtlpp), KM_SLEEP); /* Update the dtag portion of the list */ bcopy((caddr_t)ndtp, 
(caddr_t)&((*dtlpp)->dtl_dt), sizeof (mddb_dtag_t)); /* Fix up the id value */ (*dtlpp)->dtl_dt.dt_id = ++nextid; return (0); } /* * Even though data tags are not supported in MN disksets, dt_cntl may * be called for a MN diskset since this routine is called even before * it is known the kind of diskset being read in from disk. * For a MNdiskset, s_dtlp is 0 so a count of 0 is returned. */ static int dtl_cntl(mddb_set_t *s) { mddb_dtag_lst_t *dtlp = s->s_dtlp; int ndt = 0; while (dtlp != NULL) { ndt++; dtlp = dtlp->dtl_nx; } return (ndt); } /* * Even though data tags are not supported in MN disksets, dt_cntl may * be called for a MN diskset since this routine is called even before * it is known the kind of diskset being read in from disk. * For a MNdiskset, s_dtlp is 0 so a 0 is returned. */ static mddb_dtag_t * dtl_findl(mddb_set_t *s, int id) { mddb_dtag_lst_t *dtlp = s->s_dtlp; while (dtlp != NULL) { if (dtlp->dtl_dt.dt_id == id) return (&dtlp->dtl_dt); dtlp = dtlp->dtl_nx; } return ((mddb_dtag_t *)NULL); } /* Should not call for MN diskset since data tags are not supported */ static void dtl_freel(mddb_dtag_lst_t **dtlpp) { mddb_dtag_lst_t *dtlp; mddb_dtag_lst_t *tdtlp; for (tdtlp = *dtlpp; tdtlp != NULL; tdtlp = dtlp) { dtlp = tdtlp->dtl_nx; kmem_free(tdtlp, sizeof (mddb_dtag_lst_t)); } *dtlpp = (mddb_dtag_lst_t *)NULL; } /* * Even though data tags are not supported in MN disksets, dt_setup will * be called for a MN diskset since this routine is called even before * it is known the kind of diskset being read in from disk. * Once this set is known as a MN diskset, the dtp area will be freed. 
*/
static void
dt_setup(mddb_set_t *s, const mddb_dtag_t *dtagp)
{
	set_t		setno = s->s_setno;
	mddb_dt_t	*dtp;

	/*
	 * Make sure the set has a data tag record.  An existing record is
	 * only wiped when no tag is supplied to overwrite it with.
	 */
	if (md_set[setno].s_dtp == (mddb_dt_t *)NULL) {
		md_set[setno].s_dtp = kmem_zalloc(MDDB_DT_BYTES, KM_SLEEP);
	} else if (dtagp == (mddb_dtag_t *)NULL) {
		bzero((caddr_t)md_set[setno].s_dtp, MDDB_DT_BYTES);
	}

	/* shorthand */
	dtp = (mddb_dt_t *)md_set[setno].s_dtp;

	dtp->dt_mag = MDDB_MAGIC_DT;
	dtp->dt_rev = MDDB_REV_DT;

	if (dtagp != NULL) {
		/* structure assignment */
		dtp->dt_dtag = *dtagp;
	}

	/* Initialize the setno */
	dtp->dt_dtag.dt_setno = setno;

	/* Clear the id and flags, this is only used in user land */
	dtp->dt_dtag.dt_id = 0;

	/* Checksum it */
	crcgen(dtp, &dtp->dt_cks, MDDB_DT_BYTES, NULL);
}

/*
 * Build a fresh data tag (host id, node name, current time) for this host,
 * install it as the set's data tag record and make it the only member of
 * the set's tag list.
 *
 * Returns 0 on success, and also when no tag record is allocated in a MN
 * set.  Returns 1 (with ep set to MDE_DB_NOTAGREC) when no tag record is
 * allocated in any other kind of set.
 *
 * Should not call for MN diskset since data tags are not supported.
 */
static int
set_dtag(mddb_set_t *s, md_error_t *ep)
{
	mddb_lb_t	*lbp = s->s_lbp;
	mddb_dtag_t	tag;

	if (lbp->lb_dtblkcnt == 0) {
		/* Data tags not used in a MN set - so no failure returned */
		if (lbp->lb_flags & MDDB_MNSET)
			return (0);

		cmn_err(CE_WARN,
		    "No tag record allocated, unable to tag data");
		(void) mdmddberror(ep, MDE_DB_NOTAGREC, NODEV32, s->s_setno);
		return (1);
	}

	/* Clear the stack variable */
	bzero((caddr_t)&tag, sizeof (mddb_dtag_t));

	/* Get the HW serial number for this host */
	(void) snprintf(tag.dt_sn, MDDB_SN_LEN, "%u", zone_get_hostid(NULL));
	tag.dt_sn[MDDB_SN_LEN - 1] = '\0';

	/* Get the nodename that this host goes by */
	(void) strncpy(tag.dt_hn, utsname.nodename, MD_MAX_NODENAME);
	tag.dt_hn[MD_MAX_NODENAME] = '\0';

	/* Get a time stamp for NOW */
	uniqtime32(&tag.dt_tv);

	/* Setup the data tag record */
	dt_setup(s, &tag);

	/* Free any list of tags if they exist */
	dtl_freel(&s->s_dtlp);

	/* Put the new tag onto the tag list */
	(void) dtl_addl(s, &tag);

	return (0);
}

/*
 * If called during upgrade, this routine expects a non-translated
 * (aka target) dev.
 * Should not call for MN diskset since data tags are not supported.
*/
/*
 * Read the data tag record of one replica into a buffer hung off
 * rip->ri_dtp and validate it (magic, revision, crc, set number).
 *
 * Returns:
 *	0 - a valid, non-empty tag was read; rip->ri_dtp holds it and
 *	    MDDB_F_TAGDATA is set in rip->ri_flags.
 *	1 - no tag record is allocated, the dev could not be translated,
 *	    or the record could not be read or failed validation.  For
 *	    read and validation failures the set is marked MD_SET_BADTAG
 *	    and the replica MDDB_F_BADTAG.
 *	2 - the record is valid but holds an empty tag.
 * On any failure after the buffer was allocated, the buffer is freed and
 * rip->ri_dtp is reset to NULL.
 */
static int
dt_read(mddb_set_t *s, mddb_lb_t *lbp, mddb_ri_t *rip)
{
	int		err = 0;
	md_dev64_t	dev;
	caddr_t		tbuf;
	daddr_t		physblk;
	mddb_block_t	blk;
	mddb_dt_t	*dtp = (mddb_dt_t *)NULL;
	mddb_dtag_t	*dtagp;
	set_t		setno = s->s_setno;

	/* If have not allocated a data tag record, there is nothing to do */
	if (lbp->lb_dtblkcnt == 0)
		return (1);

	/*
	 * Translate the dev before allocating anything, so that a failed
	 * translation does not leave a zeroed, unvalidated buffer attached
	 * to rip->ri_dtp.
	 */
	dev = md_xlate_targ_2_mini(rip->ri_dev);
	if (dev == NODEV64)
		return (1);

	/*
	 * The loop below stores lb_dtblkcnt blocks of MDDB_BSIZE bytes into
	 * a buffer of MDDB_DT_BYTES.  A block count that does not fit can
	 * only come from a damaged locator block; treat it as a bad tag
	 * instead of writing past the end of the buffer.
	 */
	if ((size_t)lbp->lb_dtblkcnt * MDDB_BSIZE > MDDB_DT_BYTES) {
		err = 1;
		goto out;
	}

	/* KM_SLEEP allocations do not fail, so there is no NULL check */
	dtp = rip->ri_dtp = (mddb_dt_t *)kmem_zalloc(MDDB_DT_BYTES, KM_SLEEP);

	tbuf = (caddr_t)rip->ri_dtp;
	for (blk = 0; blk < lbp->lb_dtblkcnt; blk++) {
		physblk = getphysblk((blk + lbp->lb_dtfirstblk), rip->ri_mbip);
		err = getblks(s, tbuf, dev, physblk, btodb(MDDB_BSIZE), 0);
		/* error reading the tag */
		if (err) {
			err = 1;
			goto out;
		}
		tbuf += MDDB_BSIZE;
	}

	/* magic is valid? */
	if (dtp->dt_mag != MDDB_MAGIC_DT) {
		err = 1;
		goto out;
	}

	/* revision is valid? */
	if (revchk(MDDB_REV_DT, dtp->dt_rev)) {
		err = 1;
		goto out;
	}

	/* crc is valid? */
	if (crcchk(dtp, &dtp->dt_cks, MDDB_DT_BYTES, NULL)) {
		err = 1;
		goto out;
	}

	/* shorthand */
	dtagp = &dtp->dt_dtag;

	/* set number match? */
	if (dtagp->dt_setno != setno) {
		err = 1;
		goto out;
	}

	/* tag is not empty? */
	if (dtagp->dt_sn[0] == '\0' && dtagp->dt_hn[0] == '\0' &&
	    (dtagp->dt_tv.tv_sec == 0 && dtagp->dt_tv.tv_usec == 0) &&
	    dtagp->dt_id == 0) {
		err = 2;
		goto out;
	}

	/* Mark the locator as having tagged data */
	rip->ri_flags |= MDDB_F_TAGDATA;

out:
	if (err) {
		/* An empty tag (err == 2) is not a bad tag */
		if (err == 1) {
			md_set_setstatus(setno, MD_SET_BADTAG);
			rip->ri_flags |= MDDB_F_BADTAG;
		}
		if (dtp != NULL) {
			kmem_free(dtp, MDDB_DT_BYTES);
			rip->ri_dtp = (mddb_dt_t *)NULL;
		}
	}
	return (err);
}

/*
 * Write the set's data tag record to every active, non-deleted replica that
 * has no write error, and update the per-locator tag flags to match.  When
 * all writes succeed and no such replica carries tag data any more, the
 * MD_SET_CLRTAG and MD_SET_TAGDATA set status bits are cleared.
 *
 * Returns 0 when there was nothing to write (no tag record allocated, or
 * the set is marked MD_SET_BADTAG) or when all writes succeeded; otherwise
 * the OR of the writeblks errors.
 *
 * Should not call for MN diskset since data tags are not supported.
 */
static int
dt_write(mddb_set_t *s)
{
	int		li;
	int		err = 0;
	int		werr;
	int		empty_tag = 0;
	mddb_dtag_t	*dtagp;
	mddb_dt_t	*dtp;
	mddb_lb_t	*lbp = s->s_lbp;
	set_t		setno = s->s_setno;
	uint_t		set_status = md_get_setstatus(setno);

	ASSERT(md_set[setno].s_dtp != NULL);

	/* Nowhere to write to */
	if (lbp->lb_dtblkcnt == 0)
		return (err);

	if (set_status & MD_SET_BADTAG)
		return (err);

	/* shorthand */
	dtp = (mddb_dt_t *)md_set[setno].s_dtp;
	dtagp = &dtp->dt_dtag;

	/* See if the tag is empty. */
	if (dtagp->dt_sn[0] == '\0' && dtagp->dt_hn[0] == '\0' &&
	    (dtagp->dt_tv.tv_sec == 0 && dtagp->dt_tv.tv_usec == 0) &&
	    dtagp->dt_id == 0)
		empty_tag = 1;

	/* Write the tag to the locators and reset appropriate flags. */
	for (li = 0; li < lbp->lb_loccnt; li++) {
		mddb_locator_t	*lp = &lbp->lb_locators[li];

		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
		    (lp->l_flags & MDDB_F_DELETED) ||
		    (lp->l_flags & MDDB_F_EWRITE))
			continue;

		werr = writeblks(s, (caddr_t)dtp, lbp->lb_dtfirstblk,
		    MDDB_DT_BLOCKS, li, MDDB_WR_ONLY_MASTER);

		/* Remember the failure, but still try the other replicas */
		if (werr) {
			err |= werr;
			continue;
		}

		if (empty_tag) {
			lp->l_flags &= ~(MDDB_F_BADTAG | MDDB_F_TAGDATA);
		} else {
			lp->l_flags |= MDDB_F_TAGDATA;
			lp->l_flags &= ~MDDB_F_BADTAG;
		}
	}

	if (err)
		return (err);

	/* If the tags were written, check to see if any tags remain. */
	for (li = 0; li < lbp->lb_loccnt; li++) {
		mddb_locator_t	*lp = &lbp->lb_locators[li];

		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
		    (lp->l_flags & MDDB_F_DELETED) ||
		    (lp->l_flags & MDDB_F_EWRITE))
			continue;

		if (lp->l_flags & MDDB_F_TAGDATA)
			break;
	}

	/* If there are no tags, then clear CLRTAG and TAGDATA */
	if (li == lbp->lb_loccnt) {
		md_clr_setstatus(setno, MD_SET_CLRTAG);
		md_clr_setstatus(setno, MD_SET_TAGDATA);
	}

	return (err);
}

/*
 * Make sure the set has usable blocks for its data tag record.
 *
 * Nothing is done when a record is already allocated and no bad tag was
 * seen, or when the block bitmap is not set up yet.  Otherwise the record
 * is allocated (or moved, when a bad tag's blocks are already in use) and
 * the eligible locators are flagged MDDB_F_BADTAG, or the existing blocks
 * are marked busy.
 *
 * Returns 1 when the record was allocated, moved or re-claimed, and 0 when
 * nothing was done or no free blocks could be found.
 *
 * Should not call for MN diskset since data tags are not supported.
 */
static int
dt_alloc_if_needed(mddb_set_t *s)
{
	int		i;
	int		li;
	int		moveit = 0;
	mddb_lb_t	*lbp = s->s_lbp;
	mddb_block_t	blkcnt = lbp->lb_dtblkcnt;
	set_t		setno = s->s_setno;
	uint_t		set_status = md_get_setstatus(setno);

	/*
	 * If the data tag record is allocated (blkcnt != 0) and a bad tag was
	 * not detected, there is nothing to do.
	 */
	if (blkcnt != 0 && ! (set_status & MD_SET_BADTAG))
		return (0);

	/* Bitmap not setup, checks can't be done */
	if (s->s_totalblkcnt == 0)
		return (0);

	/* While reading the tag(s) an invalid tag data record was seen */
	if (set_status & MD_SET_BADTAG) {
		/* See if the invalid tag needs to be moved */
		for (i = 0; i < MDDB_DT_BLOCKS; i++) {
			if (blkcheck(s, (i + lbp->lb_dtfirstblk))) {
				moveit = 1;
				break;
			}
		}
	}

	/* Need to move or allocate the tag data record */
	if (moveit || blkcnt == 0) {
		lbp->lb_dtfirstblk = getfreeblks(s, MDDB_DT_BLOCKS);
		if (lbp->lb_dtfirstblk == 0) {
			cmn_err(CE_WARN,
			    "Unable to allocate data tag record");
			return (0);
		}
		lbp->lb_dtblkcnt = MDDB_DT_BLOCKS;

		/* Mark the locators so that they get written to disk. */
		for (li = 0; li < lbp->lb_loccnt; li++) {
			mddb_locator_t	*lp = &lbp->lb_locators[li];

			if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
			    (lp->l_flags & MDDB_F_DELETED) ||
			    (lp->l_flags & MDDB_F_EWRITE))
				continue;

			lp->l_flags |= MDDB_F_BADTAG;
		}
		return (1);
	}

	/*
	 * Make sure the blocks are owned, since the calculation in
	 * computefreeblks() is bypassed when MD_SET_BADTAG is set.
	 */
	for (i = 0; i < MDDB_DT_BLOCKS; i++)
		blkbusy(s, (i + lbp->lb_dtfirstblk));

	return (1);
}

/*
 * Writestart writes the incore mddb out to all of the replicas.
* This is called when a diskset is started and when an error has * been enountered during the write to a mddb. * * flag can be 2 values: * MDDB_WRITECOPY_ALL - write all records to all mddbs. This is * always used for traditional and local disksets. * This is the normal path for MN disksets since the slave * nodes aren't actually allowed to write to disk. * MDDB_WRITECOPY_SYNC - special case for MN diskset. When a new * master has been chosen, the new master may need to * write its incore mddb to disk (this is the case where the * old master had executed a message but hadn't relayed it * to this slave yet). New master should not write the * change log records since new master would be overwriting * valuable data. Only used during a reconfig cycle. */ static int writestart( mddb_set_t *s, int flag ) { int li; mddb_locator_t *lp; mddb_lb_t *lbp; mddb_ln_t *lnp; int err = 0; uint_t set_status; lbp = s->s_lbp; for (li = 0; li < lbp->lb_loccnt; li++) { lp = &lbp->lb_locators[li]; if (! (lp->l_flags & MDDB_F_ACTIVE)) continue; if (! (lp->l_flags & MDDB_F_SUSPECT)) continue; if (writecopy(s, li, flag)) return (1); lp->l_flags |= MDDB_F_UP2DATE; } for (li = 0; li < lbp->lb_loccnt; li++) { lp = &lbp->lb_locators[li]; if (! (lp->l_flags & MDDB_F_ACTIVE)) continue; if ((lp->l_flags & MDDB_F_UP2DATE)) continue; if (checkcopy(s, li)) if (err = writecopy(s, li, flag)) return (1); lp->l_flags |= MDDB_F_UP2DATE; } /* * Call fixoptrecord even during a reconfig cycle since a replica * failure may force the master to re-assign the optimized * resync record to another replica. 
*/ if (fixoptrecords(s)) return (1); set_status = md_get_setstatus(s->s_setno); /* See if any (ACTIVE and not OLDACT) or (not ACTIVE and OLDACT) */ for (li = 0; li < lbp->lb_loccnt; li++) { lp = &lbp->lb_locators[li]; if (lp->l_flags & MDDB_F_DELETED) continue; if (((lp->l_flags & MDDB_F_ACTIVE) != 0 && (lp->l_flags & MDDB_F_OLDACT) == 0) || ((lp->l_flags & MDDB_F_ACTIVE) == 0 && (lp->l_flags & MDDB_F_OLDACT) != 0)) break; if ((set_status & MD_SET_TAGDATA) || (set_status & MD_SET_CLRTAG)) if ((lp->l_flags & MDDB_F_TAGDATA) || (lp->l_flags & MDDB_F_BADTAG)) break; } /* * If we found (ACTIVE and not OLDACT) or (not ACTIVE and OLDACT) * the lbp identifier and the set identifier doesn't match. */ if (li != lbp->lb_loccnt || cmpidentifier(s, &lbp->lb_ident)) { /* Only call for traditional and local sets */ if (!(lbp->lb_flags & MDDB_MNSET)) (void) dt_write(s); setidentifier(s, &lbp->lb_ident); if (err = push_lb(s)) { (void) upd_med(s, "writestart(0)"); return (err); } (void) upd_med(s, "writestart(0)"); if (err = push_lb(s)) { (void) upd_med(s, "writestart(1)"); return (err); } (void) upd_med(s, "writestart(1)"); lnp = s->s_lnp; uniqtime32(&lnp->ln_timestamp); if (lbp->lb_flags & MDDB_MNSET) lnp->ln_revision = MDDB_REV_MNLN; else lnp->ln_revision = MDDB_REV_LN; crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL); err = writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk, lbp->lb_lnblkcnt, 0); /* * If a MN diskset and this is the master, set the PARSE_LOCNM * flag in the mddb_set structure to show that the locator * names have changed. * Don't set parseflags as a result of a new master sync * during reconfig cycle since slaves nodes are already * in-sync with the new master. 
*/ if ((lbp->lb_flags & MDDB_MNSET) && (md_set[s->s_setno].s_am_i_master) && (flag != MDDB_WRITECOPY_SYNC)) { s->s_mn_parseflags |= MDDB_PARSE_LOCNM; } if (err) return (err); } for (li = 0; li < lbp->lb_loccnt; li++) { lp = &lbp->lb_locators[li]; if (lp->l_flags & MDDB_F_DELETED) continue; if (lp->l_flags & MDDB_F_ACTIVE) { lp->l_flags |= MDDB_F_OLDACT; } else { lp->l_flags &= ~MDDB_F_OLDACT; } } md_clr_setstatus(s->s_setno, MD_SET_STALE); return (0); } /* * selectreplicas selects the working replicas and may write the incore * version of the mddb out to the replicas ondisk. * * flag can be 3 values: * MDDB_RETRYSCAN - quick scan to see if there is an error. * If no new error, returns without writing mddb * to disks. If a new error is seen, writes out * mddb to disks. * MDDB_SCANALL - lengthy scan to check out mddbs and always writes * out mddb to the replica ondisk. Calls writecopy * with MDDB_WRITECOPY_ALL flag which writes out * all records to the replicas ondisk. * MDDB_SCANALLSYNC - called during reconfig cycle to sync up incore * and ondisk mddbs by writing incore values to disk. * Calls writecopy with MDDB_WRITECOPY_SYNC flag so * that change log records are not written out. * Only used by MN disksets. * * Returns: * 0 - Successful * 1 - Unable to write incore mddb data to disk since < 50% replicas. */ int selectreplicas( mddb_set_t *s, int flag ) { int li; int alc; int lc; mddb_locator_t *lp; mddb_lb_t *lbp = s->s_lbp; set_t setno = s->s_setno; int wc_flag; /* * can never transition from stale to not stale */ if (md_get_setstatus(setno) & MD_SET_STALE) { for (li = 0; li < lbp->lb_loccnt; li++) { lp = &lbp->lb_locators[li]; if (lp->l_flags & MDDB_F_DELETED) continue; if (! 
(lp->l_flags & MDDB_F_EMASTER)) { lp->l_flags |= MDDB_F_ACTIVE; } else { lp->l_flags &= ~MDDB_F_ACTIVE; } } return (1); } if ((flag == MDDB_SCANALL) || (flag == MDDB_SCANALLSYNC)) { for (li = 0; li < lbp->lb_loccnt; li++) { lp = &lbp->lb_locators[li]; if (lp->l_flags & MDDB_F_DELETED) continue; if (lp->l_flags & MDDB_F_ACTIVE) { lp->l_flags |= MDDB_F_OLDACT; lp->l_flags &= ~MDDB_F_SUSPECT; } else { lp->l_flags |= MDDB_F_SUSPECT; lp->l_flags &= ~MDDB_F_OLDACT; } if (! (lp->l_flags & MDDB_F_EMASTER)) { lp->l_flags |= MDDB_F_ACTIVE; lp->l_flags &= ~MDDB_F_EWRITE; lp->l_flags &= ~MDDB_F_TOOSMALL; } else { lp->l_flags &= ~MDDB_F_ACTIVE; } } computefreeblks(s); /* set up free block bits */ } else { for (li = 0; li < lbp->lb_loccnt; li++) { lp = &lbp->lb_locators[li]; if (! (lp->l_flags & MDDB_F_ACTIVE)) continue; if (lp->l_flags & MDDB_F_EWRITE) break; } /* * if there are no errors this is error has already * been processed return current state */ if (li == lbp->lb_loccnt) return (md_get_setstatus(setno) & MD_SET_TOOFEW); lp->l_flags &= ~MDDB_F_ACTIVE; do { lp = &lbp->lb_locators[li]; lp->l_flags &= ~MDDB_F_UP2DATE; } while (++li < lbp->lb_loccnt); } alc = 0; lc = 0; for (li = 0; li < lbp->lb_loccnt; li++) { lp = &lbp->lb_locators[li]; if (lp->l_flags & MDDB_F_DELETED) continue; lc++; if (! (lp->l_flags & MDDB_F_ACTIVE)) continue; alc++; } if (alc < ((lc + 1) / 2)) { md_set_setstatus(setno, MD_SET_TOOFEW); return (1); } /* Set wc_flag based on flag passed in. */ if (flag == MDDB_SCANALLSYNC) wc_flag = MDDB_WRITECOPY_SYNC; else wc_flag = MDDB_WRITECOPY_ALL; do { if (! 
writestart(s, wc_flag)) { md_clr_setstatus(setno, MD_SET_TOOFEW); return (0); } alc = 0; for (li = 0; li < lbp->lb_loccnt; li++) { lp = &lbp->lb_locators[li]; if ((lp->l_flags & MDDB_F_DELETED) || (lp->l_flags & MDDB_F_EMASTER)) continue; if (lp->l_flags & MDDB_F_EWRITE) { lp->l_flags &= ~MDDB_F_ACTIVE; lp->l_flags &= ~MDDB_F_UP2DATE; continue; } alc++; } } while (alc >= ((lc + 1) / 2)); md_set_setstatus(setno, MD_SET_TOOFEW); return (1); } static int checkstate( mddb_set_t *s, int probe ) { int error; uint_t set_status = md_get_setstatus(s->s_setno); ASSERT(s != NULL); if (! (set_status & MD_SET_STALE) && ! (set_status & MD_SET_TOOFEW)) return (0); if (probe == MDDB_NOPROBE) return (1); single_thread_start(s); error = selectreplicas(s, MDDB_SCANALL); single_thread_end(s); if (error == 0 && s->s_zombie != 0) { mutex_exit(SETMUTEX(s->s_setno)); error = mddb_deleterec(s->s_zombie); mutex_enter(SETMUTEX(s->s_setno)); if (error == 0) s->s_zombie = 0; } return (error); } static int writeretry( mddb_set_t *s ) { if (selectreplicas(s, MDDB_RETRYSCAN)) if (selectreplicas(s, MDDB_SCANALL)) return (1); return (0); } static void free_mbipp(mddb_mb_ic_t **mbipp) { mddb_mb_ic_t *mbip1, *mbip2; for (mbip1 = *mbipp; mbip1 != NULL; mbip1 = mbip2) { mbip2 = mbip1->mbi_next; kmem_free((caddr_t)mbip1, MDDB_IC_BSIZE); } *mbipp = (mddb_mb_ic_t *)NULL; } static mddb_ri_t * save_rip(mddb_set_t *s) { mddb_ri_t *trip = s->s_rip; mddb_ri_t *nrip = NULL; mddb_ri_t **nripp = &nrip; mddb_ri_t *rip; while (trip) { /* Run to the end of the list */ for (/* void */; (*nripp != NULL); nripp = &(*nripp)->ri_next) /* void */; /* Add the new member */ *nripp = kmem_zalloc(sizeof (**nripp), KM_SLEEP); ASSERT(*nripp != NULL); /* shorthand */ rip = *nripp; *rip = *trip; /* structure assignment */ /* Clear the stuff that is not needed for hints */ rip->ri_flags = 0; rip->ri_commitcnt = 0; rip->ri_transplant = 0; rip->ri_mbip = (mddb_mb_ic_t *)NULL; rip->ri_dtp = (mddb_dt_t *)NULL; rip->ri_lbp = (mddb_lb_t 
*)NULL; rip->ri_did_icp = (mddb_did_ic_t *)NULL; rip->ri_devid = (ddi_devid_t)NULL; rip->ri_old_devid = (ddi_devid_t)NULL; rip->ri_next = (mddb_ri_t *)NULL; trip = trip->ri_next; } return (nrip); } static void free_rip(mddb_ri_t **ripp) { mddb_ri_t *rip; mddb_ri_t *arip; for (rip = *ripp; rip != (mddb_ri_t *)NULL; rip = arip) { arip = rip->ri_next; if (rip->ri_devid != (ddi_devid_t)NULL) { ddi_devid_free(rip->ri_devid); rip->ri_devid = (ddi_devid_t)NULL; } if (rip->ri_old_devid != (ddi_devid_t)NULL) { ddi_devid_free(rip->ri_old_devid); rip->ri_old_devid = (ddi_devid_t)NULL; } kmem_free((caddr_t)rip, sizeof (*rip)); } *ripp = (mddb_ri_t *)NULL; } /* * this routine selects the correct replica to use * the rules are as follows * 1. if all replica has same init time select highest commit count * 2. if some but not all replicas are from another hostid discard * them. * 3. find which init time is present is most replicas * 4. discard all replicas which do not match most init times * 5. select replica with highest commit count */ static mddb_lb_t * selectlocator( mddb_set_t *s ) { mddb_ri_t *rip = s->s_rip; mddb_ri_t *r, *r1; mddb_lb_t *lbp; struct timeval32 *tp = (struct timeval32 *)NULL; int different; int same; int count; int maxcount; set_t setno = s->s_setno; size_t sz; int mn_set = 0; /* Clear the ri_transplant flag on all the rip entries. */ /* Set ri_commitcnt to locator's commitcnt - if available */ for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) { r->ri_transplant = 0; if (r->ri_lbp != (mddb_lb_t *)NULL) { r->ri_commitcnt = r->ri_lbp->lb_commitcnt; /* If any locators have MN bit set, set flag */ if (r->ri_lbp->lb_flags & MDDB_MNSET) mn_set = 1; } } /* * A data tag is being used, so use it to limit the selection first. * Data tags not used in MN diskset. 
*/ if ((mn_set == 0) && (md_get_setstatus(setno) & MD_SET_USETAG)) { mddb_dt_t *dtp = (mddb_dt_t *)md_set[setno].s_dtp; /* * now toss any locators that have a different data tag */ for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) { if (r->ri_lbp == (mddb_lb_t *)NULL) continue; if (r->ri_dtp != (mddb_dt_t *)NULL) { /* If same tag, keep it */ if (dtl_cmp(&dtp->dt_dtag, &r->ri_dtp->dt_dtag) == 0) continue; } if (r->ri_dtp != (mddb_dt_t *)NULL) { kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES); r->ri_dtp = (mddb_dt_t *)NULL; } mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp); if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) { if (r->ri_old_devid != (ddi_devid_t)NULL) { sz = ddi_devid_sizeof(r->ri_old_devid); kmem_free((caddr_t)r->ri_old_devid, sz); r->ri_old_devid = (ddi_devid_t)NULL; } } kmem_free((caddr_t)r->ri_lbp, dbtob(r->ri_lbp->lb_blkcnt)); r->ri_lbp = (mddb_lb_t *)NULL; r->ri_transplant = 1; } /* Tag used, clear the bit */ md_clr_setstatus(s->s_setno, MD_SET_USETAG); if (md_get_setstatus(s->s_setno) & MD_SET_TAGDATA) { /* * Get rid of the list of tags. */ dtl_freel(&s->s_dtlp); /* * Re-create the list with the tag used. */ (void) dtl_addl(s, &dtp->dt_dtag); } } /* * scan to see if all replicas have same time */ for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) { if (r->ri_lbp == (mddb_lb_t *)NULL) continue; if (tp == NULL) { tp = &r->ri_lbp->lb_inittime; continue; } /* CSTYLED */ if (timercmp(tp, &r->ri_lbp->lb_inittime, !=)) break; } /* * if r == NULL then they were all them same. Choose highest * commit count */ if (r == (mddb_ri_t *)NULL) goto out; /* * If here, a bogus replica is present and at least 1 lb_inittime * did not match. 
*/ /* * look and see if any but not all are from different id */ different = 0; same = 0; for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) { if (r->ri_lbp == (mddb_lb_t *)NULL) continue; if (cmpidentifier(s, &r->ri_lbp->lb_ident)) different = 1; else same = 1; } /* * now go through and throw out different if there are some * that are the same */ if (different != 0 && same != 0) { for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) { if (r->ri_lbp == (mddb_lb_t *)NULL) continue; if (!cmpidentifier(s, &r->ri_lbp->lb_ident)) continue; if (r->ri_dtp != (mddb_dt_t *)NULL) { kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES); r->ri_dtp = (mddb_dt_t *)NULL; } mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp); if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) { if (r->ri_old_devid != (ddi_devid_t)NULL) { sz = ddi_devid_sizeof(r->ri_old_devid); kmem_free((caddr_t)r->ri_old_devid, sz); r->ri_old_devid = (ddi_devid_t)NULL; } } kmem_free((caddr_t)r->ri_lbp, dbtob(r->ri_lbp->lb_blkcnt)); r->ri_lbp = (mddb_lb_t *)NULL; r->ri_transplant = 1; } } /* * go through and pick highest. 
Use n square because it is * simple and 40 some is max possible */ maxcount = 0; lbp = (mddb_lb_t *)NULL; for (r1 = rip; r1 != (mddb_ri_t *)NULL; r1 = r1->ri_next) { if (r1->ri_lbp == (mddb_lb_t *)NULL) continue; count = 0; for (r = r1; r != (mddb_ri_t *)NULL; r = r->ri_next) { if (r->ri_lbp == (mddb_lb_t *)NULL) continue; if (timercmp(&r1->ri_lbp->lb_inittime, /* CSTYLED */ &r->ri_lbp->lb_inittime, ==)) count++; } if (count > maxcount) { maxcount = count; lbp = r1->ri_lbp; } } /* * now go though and toss any that are of a different time stamp */ for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) { if (r->ri_lbp == (mddb_lb_t *)NULL) continue; if (timercmp(&lbp->lb_inittime, /* CSTYLED */ &r->ri_lbp->lb_inittime, ==)) continue; if (r->ri_dtp != (mddb_dt_t *)NULL) { kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES); r->ri_dtp = (mddb_dt_t *)NULL; } mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp); if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) { if (r->ri_old_devid != (ddi_devid_t)NULL) { sz = ddi_devid_sizeof(r->ri_old_devid); kmem_free((caddr_t)r->ri_old_devid, sz); r->ri_old_devid = (ddi_devid_t)NULL; } } kmem_free((caddr_t)r->ri_lbp, dbtob(r->ri_lbp->lb_blkcnt)); r->ri_lbp = (mddb_lb_t *)NULL; r->ri_transplant = 1; } out: /* * Find the locator with the highest commit count, and make it the * "chosen" one. */ lbp = (mddb_lb_t *)NULL; for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) { if (r->ri_lbp == (mddb_lb_t *)NULL) continue; if (lbp == NULL) { lbp = r->ri_lbp; continue; } if (r->ri_lbp->lb_commitcnt > lbp->lb_commitcnt) lbp = r->ri_lbp; } /* Toss all locator blocks, except the "chosen" one. 
*/ for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) { if (r->ri_lbp == (mddb_lb_t *)NULL) continue; /* Get rid of all dtp's */ if (r->ri_dtp != (mddb_dt_t *)NULL) { kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES); r->ri_dtp = (mddb_dt_t *)NULL; } if (r->ri_lbp == lbp) continue; /* Get rid of extra locator devid block info */ mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp); if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) { if (r->ri_old_devid != (ddi_devid_t)NULL) { sz = ddi_devid_sizeof(r->ri_old_devid); kmem_free((caddr_t)r->ri_old_devid, sz); r->ri_old_devid = (ddi_devid_t)NULL; } } /* Get rid of extra locators */ kmem_free((caddr_t)r->ri_lbp, dbtob(r->ri_lbp->lb_blkcnt)); r->ri_lbp = (mddb_lb_t *)NULL; } return (lbp); } static void locator2cfgloc( mddb_lb_t *lbp, mddb_cfg_loc_t *clp, int li, side_t sideno, mddb_did_ic_t *did_icp ) { mddb_drvnm_t *dn; mddb_locator_t *lp = &lbp->lb_locators[li]; mddb_sidelocator_t *slp; mddb_mnsidelocator_t *mnslp; mddb_did_info_t *did_info; int i, sz, szalloc; int mn_set = 0; mddb_mnlb_t *mnlbp; if (lbp->lb_flags & MDDB_MNSET) { mn_set = 1; mnlbp = (mddb_mnlb_t *)lbp; for (i = 0; i < MD_MNMAXSIDES; i++) { mnslp = &mnlbp->lb_mnsidelocators[i][li]; if (mnslp->mnl_sideno == sideno) break; } if (i == MD_MNMAXSIDES) return; } else { slp = &lbp->lb_sidelocators[sideno][li]; } if (lbp->lb_flags & MDDB_DEVID_STYLE) { did_info = &(did_icp->did_ic_blkp->blk_info[li]); if (did_info->info_flags & MDDB_DID_EXISTS) { sz = (int)ddi_devid_sizeof(did_icp->did_ic_devid[li]); if (clp->l_devid_flags & MDDB_DEVID_SPACE) { /* * copy device id from mddb to * cfg_loc structure */ szalloc = clp->l_devid_sz; if (sz <= szalloc) { for (i = 0; i < sz; i++) { ((char *)(uintptr_t) clp->l_devid)[i] = ((char *)did_icp-> did_ic_devid[li])[i]; } clp->l_devid_flags |= MDDB_DEVID_VALID; (void) strcpy(clp->l_minor_name, did_info->info_minor_name); } else { clp->l_devid_flags |= MDDB_DEVID_NOSPACE; } } else if (clp->l_devid_flags & MDDB_DEVID_GETSZ) { 
clp->l_devid_flags = MDDB_DEVID_SZ; clp->l_devid_sz = sz; } } } /* * Even if a devid exists, use the dev, drvnm and mnum in the locators * and sidelocators. During startup, the dev, drvnm and mnum in * these structures may not match the devid (the locators and * sidelocators will be updated to match the devid by the routine * load_old_replicas). Using out-of-sync values won't cause any * problems since ridev will re-derive these from the devid and mnum. * After startup, the dev, drvnm and mnum in these structures have * been updated and can be used. */ clp->l_blkno = lp->l_blkno; clp->l_flags = lp->l_flags; clp->l_dev = lp->l_dev; if (mn_set) { dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index]; clp->l_mnum = mnslp->mnl_mnum; } else { dn = &lbp->lb_drvnm[slp->l_drvnm_index]; clp->l_mnum = slp->l_mnum; } (void) strncpy(clp->l_driver, dn->dn_data, MD_MAXDRVNM); } /* * Find the index into the mnsidelocator where entry will go. * Then index can be fed into both splitname2locatorblocks and * cfgloc2locator so that those entries can be kept in sync. * * Returns: * -1 if failed to find unused slot or if a traditional diskset * index, if successful (0 <= index <= MD_MNMAXSIDES) */ static int checklocator( mddb_lb_t *lbp, int li, side_t sideno ) { uchar_t i; mddb_mnsidelocator_t *mnslp; mddb_mnlb_t *mnlbp; int index = -1; if (lbp->lb_flags & MDDB_MNSET) { /* * Checking side locator structure. First, check if * there is already an entry for this side. If so, * then use that entry. Otherwise, find an entry * that has a sideno of 0. 
*/ mnlbp = (mddb_mnlb_t *)lbp; for (i = 0; i < MD_MNMAXSIDES; i++) { mnslp = &mnlbp->lb_mnsidelocators[i][li]; if (mnslp->mnl_sideno == sideno) { /* Found a match - stop looking */ index = i; break; } else if ((mnslp->mnl_sideno == 0) && (index == -1)) { /* Set first empty slot, but keep looking */ index = i; } } /* Didn't find empty slot or previously used slot */ if ((i == MD_MNMAXSIDES) && (index == -1)) { return (-1); } return (index); } else return (0); } /* * Takes locator information (driver name, minor number, sideno) and * stores it in the locator block. * For traditional diskset, the sideno is the index into the sidelocator * array in the locator block. * For the MN diskset, the sideno is the nodeid which can be any number, * so the index passed in is the index into the mnsidelocator array * in the locator block. */ static int cfgloc2locator( mddb_lb_t *lbp, mddb_cfg_loc_t *clp, int li, side_t sideno, int index /* Only useful in MNsets when > 1 */ ) { uchar_t i; mddb_sidelocator_t *slp; mddb_mnsidelocator_t *mnslp; mddb_set_t *s; int mn_set = 0; mddb_mnlb_t *mnlbp; if (lbp->lb_flags & MDDB_MNSET) { mnlbp = (mddb_mnlb_t *)lbp; mn_set = 1; /* * Index will be the slot that has the given sideno or * the first empty slot if no match is found. * This was pre-checked out in check locator. 
*/ mnslp = &mnlbp->lb_mnsidelocators[index][li]; } else { slp = &lbp->lb_sidelocators[sideno][li]; } /* * Look for the driver name */ for (i = 0; i < MDDB_DRVNMCNT; i++) { if (lbp->lb_drvnm[i].dn_len == 0) continue; if (strncmp(lbp->lb_drvnm[i].dn_data, clp->l_driver, MD_MAXDRVNM) == 0) break; } /* * Didn't find one, add a new one */ if (i == MDDB_DRVNMCNT) { for (i = 0; i < MDDB_DRVNMCNT; i++) { if (lbp->lb_drvnm[i].dn_len == 0) break; } if (i == MDDB_DRVNMCNT) return (1); (void) strncpy(lbp->lb_drvnm[i].dn_data, clp->l_driver, MD_MAXDRVNM); lbp->lb_drvnm[i].dn_len = (uchar_t)strlen(clp->l_driver); } /* Fill in the drvnm index */ if (mn_set) { mnslp->mnl_drvnm_index = i; mnslp->mnl_mnum = clp->l_mnum; mnslp->mnl_sideno = sideno; } else { slp->l_drvnm_index = i; slp->l_mnum = clp->l_mnum; } if (lbp->lb_flags & MDDB_DEVID_STYLE) { /* * This device id could already be associated with this index * if this is not the first side added to the set. * If device id is 0, there is no device id for this device. */ if ((ddi_devid_t)(uintptr_t)clp->l_devid == 0) return (0); s = (mddb_set_t *)md_set[lbp->lb_setno].s_db; if (mddb_devid_add(s, li, (ddi_devid_t)(uintptr_t)clp->l_devid, clp->l_minor_name)) { return (1); } } return (0); } /* * See if there are mediator hosts and try to use the data. 
*/
static int
mediate(
	mddb_set_t	*s
)
{
	mddb_lb_t	*lbp = s->s_lbp;
	set_t		setno = s->s_setno;
	med_data_lst_t	*head;
	med_data_lst_t	*node;
	med_data_lst_t	*next;
	med_data_t	*dat;
	uint_t		maxcc = 0;
	int		contacted = 0;	/* mediators that answered */
	int		valid = 0;	/* records that survived checks */
	int		golden = 0;	/* surviving "golden" records */
	int		err = 1;	/* assume stale until proven good */

	/* Without any mediator configured the state is stale */
	if (s->s_med.n_cnt == 0)
		return (err);

	/* Ask the mediator hosts for their data; none means stale */
	head = get_med_host_data(&s->s_med, s->s_setname, setno);
	if (head == NULL)
		return (err);

	/*
	 * First pass: count the mediators contacted, reject records that
	 * belong to another set or another creation time, and track the
	 * highest commit count among the records that remain.
	 */
	for (node = head; node != NULL; node = node->mdl_nx) {
		struct timeval32	created;

		dat = node->mdl_med;
		contacted++;

		/* Paranoid check */
		if (dat->med_dat_sn != setno)
			dat->med_dat_fl |= MED_DFL_ERROR;

		TIMEVAL_TO_TIMEVAL32(&created, &dat->med_dat_id);

		/*CSTYLED*/
		if (timercmp(&created, &lbp->lb_ident.createtime, !=))
			dat->med_dat_fl |= MED_DFL_ERROR;

		if (!(dat->med_dat_fl & MED_DFL_ERROR) &&
		    dat->med_dat_cc > maxcc)
			maxcc = dat->med_dat_cc;
	}

	/*
	 * Second pass: a record survives only if it carries the highest
	 * commit count and that count matches the locator block.  Count
	 * the survivors and how many of them are "golden".
	 */
	for (node = head; node != NULL; node = node->mdl_nx) {
		dat = node->mdl_med;

		if (dat->med_dat_fl & MED_DFL_ERROR)
			continue;

		if (dat->med_dat_cc != maxcc ||
		    dat->med_dat_cc != lbp->lb_commitcnt) {
			dat->med_dat_fl |= MED_DFL_ERROR;
			continue;
		}

		if (dat->med_dat_fl & MED_DFL_GOLDEN)
			golden++;
		valid++;
	}

	/* With no survivors the state stays stale */
	if (valid != 0) {
		if (golden || !(contacted < ((s->s_med.n_cnt / 2) + 1))) {
			/* We either have a quorum or a golden copy, or both */
			err = 0;
		} else if (!(s->s_med.n_cnt & 1) &&
		    contacted == (s->s_med.n_cnt / 2)) {
			/*
			 * No quorum and no golden copy, but exactly 50%
			 * (only possible with an even mediator count):
			 * allow an accept.
			 */
			md_set_setstatus(setno, MD_SET_ACCOK);
		}
	}

	/* Release the mediator data list */
	while (head != NULL) {
		next = head->mdl_nx;
		kmem_free(head->mdl_med, sizeof (med_data_t));
		kmem_free(head, sizeof (med_data_lst_t));
		head = next;
	}

	return (err);
}

/*
 * 1. read masterblks and locator blocks for all known database locations
 *	a. keep track of which have good master blks
 *	b. keep track of which have good locators
 *
 * Parameters:
 *	s	 - pointer to mddb_set structure; the replica list s->s_rip
 *		   is walked and each entry's ri_flags, ri_mbip, ri_lbp and
 *		   ri_did_icp are filled in
 *	write_lb - set to 1 when the device id style was stripped from a
 *		   locator block because md_devid_destroy is set
 *
 * Returns:
 *	0, or MDDB_E_TAGDATA when multiple data tags were found and no tag
 *	has been selected (MD_SET_USETAG not set).
 */
static int
get_mbs_n_lbs(
	mddb_set_t	*s,
	int		*write_lb
)
{
	mddb_lb_t		*lbp = NULL;	/* pointer to locator block */
						/* May be cast to mddb_mnlb_t */
						/* if accessing sidenames in */
						/* MN set */
	mddb_did_ic_t		*did_icp = NULL; /* ptr to Device ID incore */
	mddb_did_blk_t		*did_blkp = 0;
	int			did_blkp_sz = 0;
	mddb_did_db_t		*did_dbp;
	mddb_did_info_t		*did_info;
	caddr_t			did_block;
	mddb_ri_t		*rip;
	mddb_dtag_lst_t		*dtlp;
	mddb_locator_t		*lp;
	daddr_t			physblk;
	int			li;
	uint_t			blk;
	md_dev64_t		dev;
	caddr_t			buffer;
	uint_t			lb_blkcnt;
	int			retval = 0;
	int			err = 0;
	int			lb_ok = 0;
	int			lb_total = 0;
	int			lb_tagged = 0;
	int			lb_tags;
	set_t			setno = s->s_setno;
	int			cont_flag, i;
	mddb_did_db_t		*did_dbp1, *did_dbp2;
	int			mn_set = 0;
	mddb_cfg_loc_t		*cl;

	/*
	 * read in master blocks and locator block for all known locators.
	 * lb_blkcnt will be set correctly for MN set later once getmasters
	 * has determined that the set is a MN set.
	 */
	lb_blkcnt = ((setno == MD_LOCAL_SET) ?
MDDB_LOCAL_LBCNT : MDDB_LBCNT);

	/*
	 * Walk every known replica.  Any 'continue' below abandons the
	 * current replica (rip->ri_lbp stays NULL) while keeping lbp,
	 * did_icp and did_blkp allocated for reuse by the next replica.
	 */
	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
		/* Keep only the flags that must survive a re-read */
		rip->ri_flags &=
		    (MDDB_F_PTCHED | MDDB_F_IOCTL | MDDB_F_EMASTER);
		rip->ri_lbp = (mddb_lb_t *)NULL;
		rip->ri_did_icp = (mddb_did_ic_t *)NULL;

		/*
		 * Translated dev is only used in calls to getmasters and
		 * getblks which expect a translated (aka miniroot) dev.
		 */
		dev = md_xlate_targ_2_mini(rip->ri_dev);
		if (dev == NODEV64) {
			/* Set error flag that getmasters would have set */
			/* if getmasters had been allowed to fail */
			rip->ri_flags |= MDDB_F_EMASTER;
		}

		/*
		 * Invalid device id on system (due to failed or
		 * removed device) or invalid devt during upgrade
		 * (due to powered off device) will cause this
		 * replica to be marked in error and not used.
		 */
		if (rip->ri_flags & MDDB_F_EMASTER)
			continue;

		/* get all master blocks, does mddb_devopen() */
		rip->ri_mbip = getmasters(s, dev, rip->ri_blkno,
		    &rip->ri_flags, &mn_set);

		/* if invalid master block - try next replica */
		if (! rip->ri_mbip)
			continue;

		/*
		 * If lbp alloc'd to wrong size - reset it.
		 * If MN set, lb_blkcnt must be MDDB_MNLBCNT.
		 * If a traditional set, lb_blkcnt must NOT be MDDB_MNLBCNT.
		 *
		 * NOTE(review): lb_blkcnt is only ever changed to
		 * MDDB_MNLBCNT below, never back to the traditional size,
		 * so a traditional replica seen after a MN one would be
		 * re-allocated at the MN size.  Presumably one set never
		 * mixes MN and traditional replicas - confirm.
		 */
		if (lbp) {
			if (((mn_set) && (lb_blkcnt != MDDB_MNLBCNT)) ||
			    ((!mn_set) && (lb_blkcnt == MDDB_MNLBCNT))) {
				kmem_free((caddr_t)lbp, dbtob(lb_blkcnt));
				lbp = (mddb_lb_t *)NULL;
			}
		}

		if (lbp == (mddb_lb_t *)NULL) {
			/* If a MN set, set lb_blkcnt for MN loc blk size */
			if (mn_set)
				lb_blkcnt = MDDB_MNLBCNT;
			lbp = (mddb_lb_t *)kmem_zalloc(dbtob(lb_blkcnt),
			    KM_SLEEP);
		}

		/*
		 * Read in all the sectors for the locator block
		 * NOTE: Need to use getblks, rather than readblklst.
		 *	because it is too early and things are
		 *	NOT set up yet for read*()'s
		 */
		buffer = (caddr_t)lbp;
		for (blk = 0; blk < lb_blkcnt; blk++) {
			physblk = getphysblk(blk, rip->ri_mbip);
			err = getblks(s, buffer, dev, physblk,
			    btodb(MDDB_BSIZE), 0);
			if (err) {
				/* The getblks error is recorded as a flag */
				rip->ri_flags |= err;
				break;
			}
			buffer += MDDB_BSIZE;
		}
		if (err)
			continue;

		/* Verify the locator block */
		if (blk != lb_blkcnt)
			continue;
		if (lbp->lb_magic != MDDB_MAGIC_LB)
			continue;
		if (lbp->lb_blkcnt != lb_blkcnt)
			continue;
		if (mn_set) {
			/* If a MN set, check for MNLB revision in lb. */
			if (revchk(MDDB_REV_MNLB, lbp->lb_revision))
				continue;
		} else {
			/* If not a MN set, check for LB revision in lb. */
			if (revchk(MDDB_REV_LB, lbp->lb_revision))
				continue;
		}
		if (crcchk(lbp, &lbp->lb_checksum, dbtob(lb_blkcnt), NULL))
			continue;

		/*
		 * With the addition of MultiNode Disksets, we must make sure
		 * to verify that this is the correct set.  A node could
		 * have been out of the config for awhile and this disk could
		 * have been moved to a different diskset and we don't want
		 * to accidentally start the wrong set.
		 *
		 * We don't do this check if we're in the middle of
		 * importing a set.
		 */
		if (!(md_get_setstatus(s->s_setno) &
		    (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT)) &&
		    (lbp->lb_setno != s->s_setno))
			continue;

		/* The locator block was read and passed verification */
		rip->ri_flags |= MDDB_F_LOCACC;

		/*
		 * a commit count of zero means this locator has been deleted
		 */
		if (lbp->lb_commitcnt == 0)
			continue;

		/*
		 * If replica is in the device ID style and md_devid_destroy
		 * flag is set, turn off device id style.  This is only to be
		 * used in a catastrophic failure case.  Examples would be
		 * where the device id of all drives in the system
		 * (especially the mirror'd root drives) had been changed
		 * by firmware upgrade or by a patch to an existing disk
		 * driver.  Another example would be in the case of non-unique
		 * device ids due to a bug.  The device id would be valid on
		 * the system, but would return the wrong dev_t.
		 */
		if ((lbp->lb_flags & MDDB_DEVID_STYLE) && md_devid_destroy) {
			lbp->lb_flags &= ~MDDB_DEVID_STYLE;
			lbp->lb_didfirstblk = 0;
			lbp->lb_didblkcnt = 0;
			/* Tell the caller the locator block was modified */
			*write_lb = 1;
		}

		/*
		 * If replica is in device ID style, read in device ID
		 * block and verify device ID block information.
		 */
		if (lbp->lb_flags & MDDB_DEVID_STYLE) {

			/* Read in device ID block */
			if (did_icp == NULL) {
				did_icp = (mddb_did_ic_t *)
				    kmem_zalloc(sizeof (mddb_did_ic_t),
				    KM_SLEEP);
			} else {
				/* Reuse did_icp, but clear out data */
				if (did_icp->did_ic_blkp !=
				    (mddb_did_blk_t *)NULL) {
					kmem_free((caddr_t)did_icp->did_ic_blkp,
					    did_blkp_sz);
					/*
					 * did_blkp is cleared as well so the
					 * free further down does not release
					 * this block a second time.
					 */
					did_blkp = (mddb_did_blk_t *)NULL;
					did_icp->did_ic_blkp =
					    (mddb_did_blk_t *)NULL;
				}
				if (did_icp->did_ic_dbp !=
				    (mddb_did_db_t *)NULL) {
					/* Free the list of devid disk blocks */
					did_dbp1 = did_icp->did_ic_dbp;
					while (did_dbp1) {
						did_dbp2 = did_dbp1->db_next;
						kmem_free((caddr_t)
						    did_dbp1->db_ptr,
						    dbtob(did_dbp1->db_blkcnt));
						kmem_free((caddr_t)did_dbp1,
						    sizeof (mddb_did_db_t));
						did_dbp1 = did_dbp2;
					}
					did_icp->did_ic_dbp =
					    (mddb_did_db_t *)NULL;
				}
				for (i = 0; i < MDDB_NLB; i++) {
					did_icp->did_ic_devid[i] =
					    (ddi_devid_t)NULL;
				}
			}

			/* Can't reuse blkp since size could be different */
			if (did_blkp != (mddb_did_blk_t *)NULL) {
				kmem_free(did_blkp, did_blkp_sz);
			}

			/*
			 * NOTE(review): the buffer is sized from
			 * lb_didblkcnt before that count is compared with
			 * MDDB_DID_BLOCKS further down - confirm that the
			 * locator block checksum above is considered
			 * sufficient validation of the count.
			 */
			did_blkp_sz = (int)dbtob(lbp->lb_didblkcnt);
			did_blkp = (mddb_did_blk_t *)kmem_zalloc(did_blkp_sz,
			    KM_SLEEP);
			did_icp->did_ic_blkp = did_blkp;
			buffer = (caddr_t)did_blkp;
			for (blk = lbp->lb_didfirstblk;
			    blk < (lbp->lb_didblkcnt + lbp->lb_didfirstblk);
			    blk++) {
				physblk = getphysblk(blk, rip->ri_mbip);
				err = getblks(s, buffer, dev, physblk,
				    btodb(MDDB_BSIZE), 0);
				if (err) {
					rip->ri_flags |= err;
					break;
				}
				buffer += MDDB_BSIZE;
			}
			if (err)
				continue;

			/* Verify the Device ID block */
			if (blk != (lbp->lb_didblkcnt + lbp->lb_didfirstblk))
				continue;
			if (did_blkp->blk_magic != MDDB_MAGIC_DI)
				continue;
			if (lbp->lb_didblkcnt != MDDB_DID_BLOCKS)
				continue;
			if (revchk(MDDB_REV_DI, did_blkp->blk_revision))
				continue;
			if (crcchk(did_blkp, &did_blkp->blk_checksum,
			    dbtob(lbp->lb_didblkcnt), NULL))
				continue;

			/*
			 * Check if device ID block is out of sync with the
			 * Locator Block by checking if the locator block
			 * commitcnt does not match the device id block
			 * commitcnt.  If an 'out of sync' condition
			 * exists, discard this replica since it has
			 * inconsistent data and can't be used in
			 * determining the best replica.
			 *
			 * An 'out of sync' condition could happen if old
			 * SDS code was running with new devid style replicas
			 * or if a failure occurred between the writing of
			 * the locator block's commitcnt and the device
			 * id block's commitcnt.
			 *
			 * If old SDS code had been running, the upgrade
			 * process should detect this situation and
			 * have removed all of the device id information
			 * via the md_devid_destroy flag in md.conf.
			 */
			if (did_blkp->blk_commitcnt != lbp->lb_commitcnt) {
				continue;
			}
		}

		/*
		 * If replica is still in device ID style, read in all
		 * of the device IDs, verify the checksum of the device IDs.
		 */
		if (lbp->lb_flags & MDDB_DEVID_STYLE) {
			/*
			 * Reset valid bit in device id info block flags.  This
			 * flag is stored on disk, but the valid bit is reset
			 * when reading in the replica.  If the corresponding
			 * device id is valid (aka meaning that the system
			 * knows about this device id), the valid bit will
			 * be set at a later time.  The valid bit for this
			 * replica's device ID will be set in this routine.
			 * The valid bits for the rest of the device id's
			 * will be set after the 'best' replica has
			 * been selected in routine load_old_replicas.
			 * Reset updated bit in device id info block flags.
			 * This flag is also stored on disk, reset when read
			 * in and set when the locators and side locators
			 * have been updated to match this valid device
			 * id information.
			 */
			for (li = 0; li < lbp->lb_loccnt; li++) {
				did_info = &did_blkp->blk_info[li];
				if (did_info->info_flags & MDDB_DID_EXISTS)
					did_info->info_flags &=
					    ~(MDDB_DID_VALID |
					    MDDB_DID_UPDATED);
			}

			/*
			 * cont_flag carries a failure out of the inner loop
			 * so that the replica loop can 'continue'.
			 */
			cont_flag = 0;
			for (li = 0; li < lbp->lb_loccnt; li++) {
				did_info = &did_blkp->blk_info[li];
				did_block = (caddr_t)NULL;
				if (did_info->info_flags & MDDB_DID_EXISTS) {
					/*
					 * Check if block has
					 * already been read in
					 */
					did_dbp = did_icp->did_ic_dbp;
					while (did_dbp != 0) {
						if (did_dbp->db_firstblk ==
						    did_info->info_firstblk)
							break;
						else
							did_dbp =
							    did_dbp->db_next;
					}
					/* if block not found, read it in */
					if (did_dbp == NULL) {
						did_block = (caddr_t)
						    (kmem_zalloc(dbtob(
						    did_info->info_blkcnt),
						    KM_SLEEP));
						buffer = (caddr_t)did_block;
						for (blk =
						    did_info->info_firstblk;
						    blk < (did_info->
						    info_firstblk +
						    did_info->info_blkcnt);
						    blk++) {
							physblk =
							    getphysblk(blk,
							    rip->ri_mbip);
							err = getblks(s,
							    buffer, dev,
							    physblk, btodb(
							    MDDB_BSIZE), 0);
							if (err) {
								rip->ri_flags |=
								    err;
								break;
							}
							buffer += MDDB_BSIZE;
						}
						if (err) {
							kmem_free(did_block,
							    dbtob(did_info->
							    info_blkcnt));
							did_block =
							    (caddr_t)NULL;
							cont_flag = 1;
							break;
						}

						/*
						 * Block read in -
						 * alloc Disk Block area
						 */
						did_dbp = (mddb_did_db_t *)
						    kmem_zalloc(
						    sizeof (mddb_did_db_t),
						    KM_SLEEP);
						did_dbp->db_ptr = did_block;
						did_dbp->db_firstblk =
						    did_info->info_firstblk;
						did_dbp->db_blkcnt =
						    did_info->info_blkcnt;

						/* Add to front of dbp list */
						did_dbp->db_next =
						    did_icp->did_ic_dbp;
						did_icp->did_ic_dbp = did_dbp;
					}
					/* Check validity of devid in block */
					if (crcchk(((char *)did_dbp->db_ptr +
					    did_info->info_offset),
					    &did_info->info_checksum,
					    did_info->info_length, NULL)) {
						cont_flag = 1;
						break;
					}

					/* Block now pointed to by did_dbp */
					did_icp->did_ic_devid[li] =
					    (ddi_devid_t)((char *)
					    did_dbp->db_ptr +
					    did_info->info_offset);
				}
			}
			if (cont_flag)
				continue;
		}

		/*
		 * All blocks containing devids are now in core.
		 */

		/*
		 * If we're doing a replicated import (also known as
		 * remote copy import), the device id in the locator
		 * block is incorrect and we need to fix it up here
		 * along with the l_dev otherwise we run into lots of
		 * trouble later on.
		 *
		 * NOTE(review): did_blkp and did_icp are dereferenced here
		 * without re-checking MDDB_DEVID_STYLE; presumably a
		 * replicated import always involves devid style replicas -
		 * confirm.
		 */
		if ((md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
			mddb_ri_t	*trip;
			for (li = 0; li < lbp->lb_loccnt; li++) {
				did_info = &did_blkp->blk_info[li];
				lp = &lbp->lb_locators[li];

				if (lp->l_flags & MDDB_F_DELETED)
					continue;

				if (!(did_info->info_flags & MDDB_DID_EXISTS))
					continue;

				if (did_icp->did_ic_devid[li] == NULL)
					continue;

				/*
				 * Find the replica(s) whose old devid matches
				 * the devid recorded for this locator.
				 */
				for (trip = s->s_rip; trip != NULL;
				    trip = trip->ri_next) {
					if (trip->ri_old_devid == NULL)
						continue;
					if (ddi_devid_compare(
					    trip->ri_old_devid,
					    did_icp->did_ic_devid[li]) != 0) {
						continue;
					}

					/* update l_dev and side mnum */
					lp->l_dev = md_cmpldev(trip->ri_dev);
					lbp->lb_sidelocators[0][li].l_mnum =
					    md_getminor(trip->ri_dev);
				}
			}
		}

		/*
		 * If there is a valid devid, verify that this locator
		 * block has information about itself by checking the
		 * device ID, minor_name and block
		 * number from this replica's incore data structure
		 * against the locator block information that has just
		 * been read in from disk.
		 *
		 * If not a valid devid, verify that this locator block
		 * has information about itself by checking the minor
		 * number, block number and driver name from this
		 * replica's incore data structure against the locator
		 * block information that has just been read in from disk.
		 *
		 * In every variant below, leaving the search loop with
		 * li == lbp->lb_loccnt means no locator matched.
		 */
		if ((rip->ri_devid != NULL) &&
		    (lbp->lb_flags & MDDB_DEVID_STYLE)) {
			/*
			 * This locator block MUST have locator (replica)
			 * information about itself.  Check against devid,
			 * slice part of minor number, and block number.
			 */
			for (li = 0; li < lbp->lb_loccnt; li++) {
				did_info = &did_blkp->blk_info[li];
				lp = &lbp->lb_locators[li];
				if (lp->l_flags & MDDB_F_DELETED)
					continue;

				if (!(did_info->info_flags & MDDB_DID_EXISTS))
					continue;

				/*
				 * For a replicated import the on-disk devid
				 * is compared with the replica's old devid,
				 * when one is recorded.
				 */
				if (((md_get_setstatus(setno) &
				    MD_SET_REPLICATED_IMPORT)) &&
				    (rip->ri_old_devid != (ddi_devid_t)NULL)) {
					if (ddi_devid_compare(rip->ri_old_devid,
					    did_icp->did_ic_devid[li]) != 0)
						continue;
				} else {
					if (ddi_devid_compare(rip->ri_devid,
					    did_icp->did_ic_devid[li]) != 0)
						continue;
				}

				if (strcmp(rip->ri_minor_name,
				    did_info->info_minor_name) != 0)
					continue;

				if (lp->l_blkno == rip->ri_blkno)
					break;
			}
		} else {
			/*
			 * This locator block MUST have locator (replica)
			 * information about itself.
			 */
			if (!mn_set) {
				for (li = 0; li < lbp->lb_loccnt; li++) {
					mddb_drvnm_t		*dn;
					mddb_sidelocator_t	*slp;

					lp = &lbp->lb_locators[li];
					slp = &lbp->
					    lb_sidelocators[s->s_sideno][li];
					if (lp->l_flags & MDDB_F_DELETED)
						continue;
					if (slp->l_mnum != md_getminor(
					    rip->ri_dev))
						continue;
					if (lp->l_blkno != rip->ri_blkno)
						continue;
					dn = &lbp->lb_drvnm[slp->l_drvnm_index];
					if (strncmp(dn->dn_data,
					    rip->ri_driver, MD_MAXDRVNM) == 0)
						break;
				}
			} else {
				for (li = 0; li < lbp->lb_loccnt; li++) {
					mddb_drvnm_t		*dn;
					mddb_mnsidelocator_t	*mnslp;
					mddb_mnlb_t		*mnlbp;
					int			i;

					/*
					 * Check all possible locators looking
					 * for match to the currently read-in
					 * locator, must match on:
					 *	- blkno
					 *	- side locator for this
					 *	  node's side
					 *	- side locator minor number
					 *	- side locator driver name
					 */

					/*
					 * Looking at sidelocs:
					 * cast lbp -> mnlbp
					 */
					mnlbp = (mddb_mnlb_t *)lbp;
					lp = &mnlbp->lb_locators[li];
					if (lp->l_flags & MDDB_F_DELETED)
						continue;
					if (lp->l_blkno != rip->ri_blkno)
						continue;

					for (i = 0; i < MD_MNMAXSIDES; i++) {
						mnslp = &mnlbp->
						    lb_mnsidelocators[i][li];
						if (mnslp->mnl_sideno ==
						    s->s_sideno) {
							break;
						}
					}
					/* No matching side found */
					if (i == MD_MNMAXSIDES)
						continue;
					if (mnslp->mnl_mnum !=
					    md_getminor(rip->ri_dev))
						continue;
					dn = &lbp->
					    lb_drvnm[mnslp->mnl_drvnm_index];
					if (strncmp(dn->dn_data,
					    rip->ri_driver, MD_MAXDRVNM) == 0)
						break;
				}
			}
		}

		/*
		 * Didn't find ourself in this locator block it means
		 * the locator block is a stale transplant. Probably from
		 * a user doing a dd.
		 */
		if (li == lbp->lb_loccnt)
			continue;

		/*
		 * Keep track of the number of accessed and valid
		 * locator blocks.
		 */
		lb_ok++;

		/*
		 * Read the tag in, skips invalid or blank tags.
		 * Only valid tags allocate storage
		 * Data tags are not used in MN disksets.
		 */
		if ((!mn_set) && (! dt_read(s, lbp, rip))) {
			/*
			 * Keep track of the number of tagged
			 * locator blocks.
			 */
			lb_tagged++;

			/* Keep a list of unique tags. */
			(void) dtl_addl(s, &rip->ri_dtp->dt_dtag);
		}

		if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
			/*
			 * go through locator block and add any other
			 * locations of the data base.
			 * For the replicated import case, this was done earlier
			 * and we really don't need or want to do so again
			 */
			cl = kmem_zalloc(sizeof (mddb_cfg_loc_t), KM_SLEEP);
			for (li = 0; li < lbp->lb_loccnt; li++) {
				lp = &lbp->lb_locators[li];
				if (lp->l_flags & MDDB_F_DELETED)
					continue;

				/*
				 * locator2cfgloc is called twice: first with
				 * MDDB_DEVID_GETSZ to learn the devid size,
				 * then with MDDB_DEVID_SPACE and a buffer of
				 * that size to receive the devid itself.
				 */
				cl->l_devid_flags = MDDB_DEVID_GETSZ;
				cl->l_devid = (uint64_t)0;
				cl->l_devid_sz = 0;
				cl->l_old_devid = (uint64_t)0;
				cl->l_old_devid_sz = 0;
				cl->l_minor_name[0] = '\0';
				locator2cfgloc(lbp, cl, li, s->s_sideno,
				    did_icp);

				if (cl->l_devid_flags & MDDB_DEVID_SZ) {
					/*
					 * NOTE(review): with KM_SLEEP the
					 * allocation is not expected to
					 * return NULL, so the 'continue'
					 * branch looks unreachable - confirm.
					 */
					if ((cl->l_devid = (uintptr_t)kmem_alloc
					    (cl->l_devid_sz, KM_SLEEP))
					    == NULL) {
						continue;
					} else {
						cl->l_devid_flags =
						    MDDB_DEVID_SPACE;
					}
				}
				locator2cfgloc(lbp, cl, li, s->s_sideno,
				    did_icp);

				(void) ridev(&s->s_rip, cl, &lp->l_dev, 0);

				/* Release the temporary devid buffer */
				if (cl->l_devid_flags & MDDB_DEVID_SPACE)
					kmem_free((caddr_t)(uintptr_t)
					    cl->l_devid, cl->l_devid_sz);
			}
			kmem_free(cl, sizeof (mddb_cfg_loc_t));
		}

		/*
		 * Save LB for later.  Ownership of lbp (and, for devid
		 * style, did_icp with its blocks) passes to rip; the local
		 * pointers are cleared so that the next replica allocates
		 * fresh ones and the cleanup below does not free them.
		 */
		rip->ri_lbp = lbp;
		if (lbp->lb_flags & MDDB_DEVID_STYLE) {
			rip->ri_did_icp = did_icp;
			did_icp = (mddb_did_ic_t *)NULL;
			did_blkp = (mddb_did_blk_t *)NULL;
		} else
			rip->ri_did_icp = NULL;
		lbp = (mddb_lb_t *)NULL;
	}

	/* Free whatever was left over from a rejected replica */
	if (lbp != (mddb_lb_t *)NULL)
		kmem_free((caddr_t)lbp, dbtob(lb_blkcnt));

	if (did_icp != (mddb_did_ic_t *)NULL) {
		if (did_icp->did_ic_blkp != (mddb_did_blk_t *)NULL) {
			kmem_free((caddr_t)did_icp->did_ic_blkp, did_blkp_sz);
			/* Cleared so the free below does not repeat this */
			did_blkp = (mddb_did_blk_t *)NULL;
		}
		if (did_icp->did_ic_dbp != (mddb_did_db_t *)NULL) {
			mddb_did_db_t	*did_dbp1, *did_dbp2;

			did_dbp1 = did_icp->did_ic_dbp;
			while (did_dbp1) {
				did_dbp2 = did_dbp1->db_next;
				kmem_free((caddr_t)did_dbp1->db_ptr,
				    dbtob(did_dbp1->db_blkcnt));
				kmem_free((caddr_t)did_dbp1,
				    sizeof (mddb_did_db_t));
				did_dbp1 = did_dbp2;
			}
		}
		kmem_free((caddr_t)did_icp, sizeof (mddb_did_ic_t));
	}

	if (did_blkp != (mddb_did_blk_t *)NULL) {
		kmem_free((caddr_t)did_blkp, did_blkp_sz);
	}

	/* No locator blocks were ok */
	if (lb_ok == 0)
		goto out;

	/* No tagged data was found - will be 0 for MN diskset */
	if (lb_tagged == 0)
		goto out;

	/*
	 * Find the highest non-deleted replica count
	 *
	 * NOTE(review): lb_total is computed here but is not read again
	 * before this function returns - confirm whether it is still
	 * needed.
	 */
	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
		int		lb_tot = 0;

		if (rip->ri_mbip == (mddb_mb_ic_t *)NULL)
			continue;

		if (rip->ri_lbp == (mddb_lb_t *)NULL)
			continue;

		for (li = 0; li < rip->ri_lbp->lb_loccnt; li++) {
			lp = &rip->ri_lbp->lb_locators[li];
			if (lp->l_flags & MDDB_F_DELETED)
				continue;
			lb_tot++;
		}
		if (lb_tot > lb_total)
			lb_total = lb_tot;
	}

	/* Count the number of unique tags */
	for (lb_tags = 0, dtlp = s->s_dtlp; dtlp != NULL; dtlp = dtlp->dtl_nx)
		lb_tags++;

	/* Should have at least one tag at this point */
	ASSERT(lb_tags > 0);

	/*
	 * If the number of tagged locators is not the same as the number of
	 * OK locators OR more than one tag exists, then make sure the
	 * selected tag will be written out later.
	 */
	if ((lb_tagged - lb_ok) != 0 || lb_tags > 1)
		md_set_setstatus(setno, MD_SET_TAGDATA);

	/* Only a single tag, take the tagged data */
	if (lb_tags == 1) {
		dt_setup(s, &s->s_dtlp->dtl_dt);
		md_set_setstatus(setno, MD_SET_USETAG);
		goto out;
	}

	/* Multiple tags, not selecting a tag, tag mode is on */
	if (! (md_get_setstatus(setno) & MD_SET_USETAG))
		retval = MDDB_E_TAGDATA;

out:
	return (retval);
}

/*
 * 1. Select a locator.
 * 2.
check if enough locators now have current copies * 3. read in database from one of latest * 4. if known to have latest make all database the same * 5. if configuration has changed rewrite locators * * Parameters: * s - pointer to mddb_set structure * flag - used in MN disksets to tell if this node is being joined to * a diskset that is in the STALE state. If the flag is * MDDB_MN_STALE, then this node should be marked in the STALE * state even if > 50% mddbs are available. (The diskset can * only change from STALE->OK if all nodes withdraw from the * MN diskset and then rejoin). */ static int load_old_replicas( mddb_set_t *s, int flag ) { mddb_lb_t *lbp = NULL; mddb_mnlb_t *mnlbp = NULL; mddb_ri_t *rip; mddb_locator_t *lp; mddb_db_t *dbp; mddb_de_ic_t *dep; int li; int alc; int lc; int tlc; int retval = 0; caddr_t p; size_t maxrecsize; set_t setno = s->s_setno; mddb_did_db_t *did_dbp1; mddb_did_info_t *did_info; mddb_did_ic_t *did_icp = NULL; md_dev64_t *newdev; mddb_sidelocator_t *slp = 0; mddb_mnsidelocator_t *mnslp = 0; uchar_t i; char *name; ddi_devid_t ret_devid; md_dev64_t dev; uint_t len, sz; char *minor_name; int write_lb = 0; int rval; int stale_rtn = 0; /* The only error path out of get_mbs_n_lbs() is MDDB_E_TAGDATA */ if (retval = get_mbs_n_lbs(s, &write_lb)) goto errout; if ((lbp = s->s_lbp = selectlocator(s)) == NULL) { retval = MDDB_E_NOLOCBLK; goto errout; } /* If a multi-node set, then set md_set.s_status flag */ if (lbp->lb_flags & MDDB_MNSET) { md_set_setstatus(setno, MD_SET_MNSET); /* * If data tag area had been allocated before set type was * known - free it now. */ if (md_set[setno].s_dtp) { kmem_free((caddr_t)md_set[setno].s_dtp, MDDB_DT_BYTES); md_set[setno].s_dtp = NULL; } } /* * If the replica is in devid format, setup the devid incore ptr. 
*/ if (lbp->lb_flags & MDDB_DEVID_STYLE) { for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { if (rip->ri_lbp == s->s_lbp) { did_icp = s->s_did_icp = rip->ri_did_icp; break; } } /* * If no devid incore info found - something has gone * wrong so errout. */ if (rip == NULL) { retval = MDDB_E_NODEVID; goto errout; } /* * Add all blocks containing devids to free list. * Then remove addresses that actually contain devids. */ did_dbp1 = did_icp->did_ic_dbp; while (did_dbp1) { if (mddb_devid_free_add(s, did_dbp1->db_firstblk, 0, dbtob(did_dbp1->db_blkcnt))) { retval = MDDB_E_NOSPACE; goto errout; } did_dbp1 = did_dbp1->db_next; } for (li = 0; li < lbp->lb_loccnt; li++) { did_info = &(did_icp->did_ic_blkp->blk_info[li]); if (!(did_info->info_flags & MDDB_DID_EXISTS)) continue; if (mddb_devid_free_delete(s, did_info->info_firstblk, did_info->info_offset, did_info->info_length)) { /* unable to find disk block */ retval = MDDB_E_NODEVID; goto errout; } } } /* * create mddb_mbaray, count all locators and active locators. */ alc = 0; lc = 0; for (li = 0; li < lbp->lb_loccnt; li++) { ddi_devid_t li_devid; lp = &lbp->lb_locators[li]; if (lp->l_flags & MDDB_F_DELETED) continue; /* Count non-deleted replicas */ lc++; /* * Use the devid of this locator to compare with the rip * list. The scenario to watch out for here is that this * locator could be on a disk that is dead and there could * be a valid entry in the rip list for a different disk * that has been moved to the dead disks dev_t. We don't * want to match with the moved disk. */ li_devid = NULL; (void) mddb_devid_get(s, li, &li_devid, &minor_name); for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { if (match_mddb(rip, li_devid, minor_name, md_expldev(lp->l_dev), lp->l_blkno)) { break; } } if (rip == NULL) { /* * If rip not found, then mark error in master block * so that no writes are later attempted to this * replica. rip may not be setup if ridev * failed due to un-found driver name. 
*/ lp->l_flags |= MDDB_F_EMASTER; continue; } s->s_mbiarray[li] = rip->ri_mbip; lp->l_flags &= MDDB_F_ACTIVE; lp->l_flags |= (int)rip->ri_flags; if (rip->ri_transplant) lp->l_flags &= ~MDDB_F_ACTIVE; if (lp->l_flags & MDDB_F_LOCACC) alc++; } /* Save on a divide - calculate 50% + 1 up front */ tlc = ((lc + 1) / 2); if (alc > tlc) { /* alc > tlc - OK */ md_clr_setstatus(setno, MD_SET_STALE); } else if (alc < tlc) { /* alc < tlc - stale */ md_set_setstatus(setno, MD_SET_STALE); } else if (lc & 1) { /* alc == tlc && odd - OK */ md_clr_setstatus(setno, MD_SET_STALE); } else { /* alc == tlc && even - ? */ /* Can do an accept, and are */ if (md_get_setstatus(setno) & (MD_SET_ACCOK | MD_SET_ACCEPT)) { md_clr_setstatus(setno, MD_SET_STALE); } else { /* possibly has a mediator */ if (mediate(s)) { md_set_setstatus(setno, MD_SET_STALE); } else { md_clr_setstatus(setno, MD_SET_STALE); } } /* * The mirrored_root_flag allows the sysadmin to decide to * start the local set in a read/write (non-stale) mode * when there are only 50% available mddbs on the system and * when the root file system is on a mirror. This is useful * in a 2 disk system where 1 disk failure would cause an mddb * quorum failure and subsequent boot failures since the root * filesystem would be in a read-only state. */ if (mirrored_root_flag == 1 && setno == 0 && svm_bootpath[0] != 0) { md_clr_setstatus(setno, MD_SET_STALE); } else { if (md_get_setstatus(setno) & MD_SET_STALE) { /* Allow half mode - CAREFUL! */ if (mddb_allow_half) md_clr_setstatus(setno, MD_SET_STALE); } } /* * In a MN diskset, * - if 50% mddbs are unavailable and this * has been marked STALE above * - master node isn't in the STALE state * - this node isn't the master node (this node * isn't the first node to join the set) * then clear the STALE state and set TOOFEW. * * If this node is the master node and set was marked STALE, * then the set stays STALE. 
* * If this node is not the master and this node's state is * STALE and the master node is not marked STALE, * then master node must be in the TOOFEW state or the * master is panic'ing. A MN diskset can only be placed into * the STALE state by having the first node join the set * with <= 50% mddbs. There's no way for a MN diskset to * transition between STALE and not-STALE states unless all * nodes are withdrawn from the diskset or all nodes in the * diskset are rebooted at the same time. * * So, mark this node's state as TOOFEW instead of STALE. */ if (((md_get_setstatus(setno) & (MD_SET_MNSET | MD_SET_STALE)) == (MD_SET_MNSET | MD_SET_STALE)) && ((flag & MDDB_MN_STALE) == 0) && (!(md_set[setno].s_am_i_master))) { md_clr_setstatus(setno, MD_SET_STALE); md_set_setstatus(setno, MD_SET_TOOFEW); } } /* * If a MN set is marked STALE on the other nodes, * mark it stale here. Override all other considerations * such as a mediator or > 50% mddbs available. */ if (md_get_setstatus(setno) & MD_SET_MNSET) { if (flag & MDDB_MN_STALE) md_set_setstatus(setno, MD_SET_STALE); } /* * read a good copy of the locator names * if an error occurs reading what is suppose * to be a good copy continue looking for another * good copy */ s->s_lnp = NULL; for (li = 0; li < lbp->lb_loccnt; li++) { lp = &lbp->lb_locators[li]; if ((! (lp->l_flags & MDDB_F_ACTIVE)) || (lp->l_flags & MDDB_F_EMASTER)) continue; /* Find rip entry for this locator if one exists */ for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { if (match_mddb(rip, NULL, NULL, md_expldev(lp->l_dev), lp->l_blkno)) break; } if (rip == NULL) { continue; } /* * Use the rip commitcnt since the commitcnt in lbp could * been cleared by selectlocator. Looking for a replica with * the same commitcnt as the 'golden' copy in order to * get the same data. 
*/ if (rip->ri_commitcnt != lbp->lb_commitcnt) { continue; } /* * Now have a copy of the database that is equivalent * to the chosen locator block with respect to * inittime, identifier and commitcnt. Trying the * equivalent databases in the order that they were * written will provide the most up to date data. */ lp->l_flags |= readlocnames(s, li); if (s->s_lnp) break; } if (s->s_lnp == NULL) { retval = MDDB_E_NOLOCNMS; goto errout; } /* * read a good copy of the data base * if an error occurs reading what is suppose * to be a good copy continue looking for another * good copy */ s->s_dbp = NULL; for (li = 0; li < lbp->lb_loccnt; li++) { lp = &lbp->lb_locators[li]; if ((! (lp->l_flags & MDDB_F_ACTIVE)) || (lp->l_flags & MDDB_F_EMASTER)) continue; /* Find rip entry for this locator if one exists */ for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { if (match_mddb(rip, NULL, NULL, md_expldev(lp->l_dev), lp->l_blkno)) break; } if (rip == NULL) { continue; } /* * Use the rip commitcnt since the commitcnt in lbp could * been cleared by selectlocator. Looking for a replica with * the same commitcnt as the 'golden' copy in order to * get the same data. */ if (rip->ri_commitcnt != lbp->lb_commitcnt) { continue; } /* * Now have a copy of the database that is equivalent * to the chosen locator block with respect to * inittime, identifier and commitcnt. Trying the * equivalent databases in the order that they were * written will provide the most up to date data. 
*/ lp->l_flags |= readcopy(s, li); if (s->s_dbp) break; } if (s->s_dbp == NULL) { retval = MDDB_E_NODIRBLK; goto errout; } lp->l_flags |= MDDB_F_MASTER; lp->l_flags |= MDDB_F_UP2DATE; /* * go through and find largest record; * Also fixup the user data area's */ maxrecsize = MAX(MDDB_BSIZE, s->s_databuffer_size); for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next) if (dep->de_flags & MDDB_F_OPT) getoptrecord(s, dep); else { allocuserdata(dep); maxrecsize = MAX(dep->de_recsize, maxrecsize); } if (maxrecsize > s->s_databuffer_size) { p = (caddr_t)kmem_zalloc(maxrecsize, KM_SLEEP); if (s->s_databuffer_size) kmem_free(s->s_databuffer, s->s_databuffer_size); s->s_databuffer = p; s->s_databuffer_size = maxrecsize; } /* If we can clear the tag data record, do it now. */ /* Data tags not supported on MN sets */ if ((md_get_setstatus(setno) & MD_SET_CLRTAG) && (!(md_get_setstatus(setno) & MD_SET_MNSET))) dt_setup(s, NULL); /* This will return non-zero if STALE or TOOFEW */ /* This will write out chosen replica image to all replicas */ stale_rtn = selectreplicas(s, MDDB_SCANALL); if ((md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) { ddi_devid_t devidptr; /* * ignore the return value from selectreplicas because we * may have a STALE or TOOFEW set in the case of a partial * replicated diskset. We will fix that up later. 
*/ lbp = s->s_lbp; for (li = 0; li < lbp->lb_loccnt; li++) { did_info = &(did_icp->did_ic_blkp->blk_info[li]); if (did_info->info_flags & MDDB_DID_EXISTS) { devidptr = s->s_did_icp->did_ic_devid[li]; lp = &lbp->lb_locators[li]; for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { if (rip->ri_old_devid == 0) continue; if (ddi_devid_compare(rip->ri_old_devid, devidptr) != 0) { continue; } if (update_locatorblock(s, md_expldev(lp->l_dev), rip->ri_devid, rip->ri_old_devid)) { goto errout; } } } } } else { if (stale_rtn) goto errout; } /* * If the replica is in device id style - validate the device id's, * if present, in the locator block devid area. */ newdev = kmem_zalloc(sizeof (md_dev64_t) * MDDB_NLB, KM_SLEEP); if (lbp->lb_flags & MDDB_DEVID_STYLE) { for (li = 0; li < lbp->lb_loccnt; li++) { newdev[li] = 0; lp = &lbp->lb_locators[li]; if (lp->l_flags & MDDB_F_DELETED) continue; did_info = &(did_icp->did_ic_blkp->blk_info[li]); dev = md_expldev(lp->l_dev); if (did_info->info_flags & MDDB_DID_EXISTS) { /* Validate device id on current system */ newdev[li] = dev; if (mddb_devid_validate( did_icp->did_ic_devid[li], &(newdev[li]), did_info->info_minor_name) == 0) { /* Set valid flag */ did_info->info_flags |= MDDB_DID_VALID; } else { lp->l_flags |= MDDB_F_EMASTER; } } else if (!(MD_UPGRADE)) { /* * If a device doesn't have a device id, * check if there is now a device ID * associated with device. If one exists, * add it to the locator block devid area. * If there's not enough space to add it, * print a warning. * Don't do this during upgrade. 
*/ dev_t ddi_dev = md_dev64_to_dev(dev); if (ddi_lyr_get_devid(ddi_dev, &ret_devid) == DDI_SUCCESS) { if (ddi_lyr_get_minor_name(ddi_dev, S_IFBLK, &minor_name) == DDI_SUCCESS) { if (mddb_devid_add(s, li, ret_devid, minor_name)) { cmn_err(CE_WARN, "Not enough space" " in metadevice" " state" " database\n"); cmn_err(CE_WARN, "to add relocation" " information for" " device:\n"); cmn_err(CE_WARN, " major = %d, " " minor = %d\n", getmajor(ddi_dev), getminor(ddi_dev)); } else { write_lb = 1; } kmem_free(minor_name, strlen(minor_name) + 1); } ddi_devid_free(ret_devid); } } } /* * If a device has a valid device id and if the dev_t * associated with the device id has changed, update the * driver name, minor num and dev_t in the local and side * locators to match the dev_t that the system currently * associates with the device id. * * Don't do this during upgrade. */ if (!(MD_UPGRADE)) { for (li = 0; li < lbp->lb_loccnt; li++) { lp = &lbp->lb_locators[li]; if (lp->l_flags & MDDB_F_DELETED) continue; did_info = &(did_icp->did_ic_blkp->blk_info [li]); if ((did_info->info_flags & MDDB_DID_VALID) && !(did_info->info_flags & MDDB_DID_UPDATED)) { if (lbp->lb_flags & MDDB_MNSET) { int j; int index = -1; mnlbp = (mddb_mnlb_t *)lbp; for (j = 0; j < MD_MNMAXSIDES; j++) { mnslp = &mnlbp-> lb_mnsidelocators[j] [li]; if (mnslp->mnl_sideno == s->s_sideno) break; if (mnslp->mnl_sideno == 0) index = j; } if (j == MD_MNMAXSIDES) { /* * No match found; take * empty */ mnslp = &mnlbp-> lb_mnsidelocators [index][li]; write_lb = 1; mnslp->mnl_mnum = md_getminor(newdev [li]); } else if (mnslp->mnl_mnum != md_getminor(newdev[li])) { write_lb = 1; mnslp->mnl_mnum = md_getminor(newdev [li]); } } else { slp = &lbp-> lb_sidelocators[s->s_sideno] [li]; if (slp->l_mnum != md_getminor(newdev[li])) { write_lb = 1; slp->l_mnum = md_getminor(newdev [li]); } } name = ddi_major_to_name(md_getmajor( newdev[li])); if (lbp->lb_flags & MDDB_MNSET) i = mnslp->mnl_drvnm_index; else i = slp->l_drvnm_index; if 
(strncmp(lbp->lb_drvnm[i].dn_data, name, lbp->lb_drvnm[i].dn_len) != 0) { /* Driver name has changed */ len = strlen(name); /* Look for the driver name */ for (i = 0; i < MDDB_DRVNMCNT; i++) { if (lbp->lb_drvnm[i]. dn_len != len) continue; if (strncmp(lbp-> lb_drvnm[i].dn_data, name, len) == 0) break; } /* Didn't find one, add it */ if (i == MDDB_DRVNMCNT) { for (i = 0; i < MDDB_DRVNMCNT; i++) { if (lbp-> lb_drvnm[i]. dn_len == 0) break; } if (i == MDDB_DRVNMCNT) { cmn_err(CE_WARN, "Unable to " " update " "driver " " name for " "dev: " "major = %d" ", minor = " "%d\n", md_getmajor( newdev[li]), md_getminor( newdev [li])); continue; } (void) strncpy(lbp-> lb_drvnm[i].dn_data, name, MD_MAXDRVNM); lbp->lb_drvnm[i]. dn_len = (uchar_t) strlen(name); } /* Fill in the drvnm index */ if (lbp->lb_flags & MDDB_MNSET) mnslp->mnl_drvnm_index = i; else slp->l_drvnm_index = i; write_lb = 1; } did_info->info_flags |= MDDB_DID_UPDATED; } } } } kmem_free(newdev, sizeof (md_dev64_t) * MDDB_NLB); /* * If locator block has been changed by get_mbs_n_lbs, * by addition of new device id, by updated minor name or * by updated driver name - write out locator block. */ if (write_lb) { rval = push_lb(s); (void) upd_med(s, "load_old_replicas(0)"); if (rval) goto errout; } /* * If the tag was moved, allocated, or a BADTAG was seen for some other * reason, then make sure tags are written to all the replicas. * Data tags not supported on MN sets. */ if (!(md_get_setstatus(setno) & MD_SET_MNSET)) { if (! (lc = dt_alloc_if_needed(s))) { for (li = 0; li < lbp->lb_loccnt; li++) { lp = &lbp->lb_locators[li]; if ((! (lp->l_flags & MDDB_F_ACTIVE)) || (lp->l_flags & MDDB_F_EMASTER)) continue; if (lp->l_flags & MDDB_F_BADTAG) { lc = 1; break; } } } if (lc) { md_set_setstatus(setno, MD_SET_TAGDATA); md_clr_setstatus(setno, MD_SET_BADTAG); (void) selectreplicas(s, MDDB_SCANALL); } } errout: /* Free extraneous rip components. 
*/ for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { /* Get rid of lbp's and dtp's */ if (rip->ri_lbp != lbp) { if (rip->ri_dtp != (mddb_dt_t *)NULL) { kmem_free((caddr_t)rip->ri_dtp, MDDB_DT_BYTES); rip->ri_dtp = (mddb_dt_t *)NULL; } if (rip->ri_devid != (ddi_devid_t)NULL) { sz = (int)ddi_devid_sizeof(rip->ri_devid); kmem_free((caddr_t)rip->ri_devid, sz); rip->ri_devid = (ddi_devid_t)NULL; } if (rip->ri_old_devid != (ddi_devid_t)NULL) { sz = (int)ddi_devid_sizeof(rip->ri_old_devid); kmem_free((caddr_t)rip->ri_old_devid, sz); rip->ri_old_devid = (ddi_devid_t)NULL; } if (rip->ri_lbp != (mddb_lb_t *)NULL) { mddb_devid_icp_free(&rip->ri_did_icp, rip->ri_lbp); kmem_free((caddr_t)rip->ri_lbp, dbtob(rip->ri_lbp->lb_blkcnt)); rip->ri_lbp = (mddb_lb_t *)NULL; } } if (lbp != NULL) { for (li = 0; li < lbp->lb_loccnt; li++) { lp = &lbp->lb_locators[li]; if (lp->l_flags & MDDB_F_DELETED) continue; if (rip->ri_dev == md_expldev(lp->l_dev) && rip->ri_blkno == lp->l_blkno) break; } if (li < lbp->lb_loccnt) continue; } /* * Get rid of mbp's: * if lbp, those out of lb_loccnt bounds * if !lbp, all of them. */ if (rip->ri_mbip) { md_dev64_t dev64 = md_xlate_targ_2_mini(rip->ri_dev); if (dev64 != NODEV64) mddb_devclose(dev64); free_mbipp(&rip->ri_mbip); } /* * Turn off MDDB_F_EMASTER flag in a diskset since diskset * code always ends up calling ridev for all replicas * before calling load_old_replicas. ridev will reset * MDDB_F_EMASTER flag if flag was due to unresolved devid. */ if (setno != MD_LOCAL_SET) rip->ri_flags &= ~MDDB_F_EMASTER; } return (retval); } /* * Given the devt from the md.conf info, get the devid for the device. 
*/ static void lookup_db_devid(mddb_cfg_loc_t *cl) { dev_t ldev; ddi_devid_t devid; char *minor; if (ddi_name_to_major(cl->l_driver) == (major_t)-1) { cmn_err(CE_NOTE, "mddb: unknown major name '%s'", cl->l_driver); return; } ldev = makedevice(ddi_name_to_major(cl->l_driver), cl->l_mnum); if (ddi_lyr_get_devid(ldev, &devid) != DDI_SUCCESS) { cmn_err(CE_NOTE, "mddb: unable to get devid for '%s', 0x%x", cl->l_driver, cl->l_mnum); return; } if (ddi_lyr_get_minor_name(ldev, S_IFBLK, &minor) != DDI_SUCCESS) { cmn_err(CE_NOTE, "mddb: unable to get minor name 0x%x", cl->l_mnum); return; } cl->l_devid_flags = MDDB_DEVID_SPACE | MDDB_DEVID_VALID | MDDB_DEVID_SZ; cl->l_devid_sz = (int)ddi_devid_sizeof(devid); cl->l_devid = (uint64_t)(uintptr_t)devid; (void) strlcpy(cl->l_minor_name, minor, MDDB_MINOR_NAME_MAX); kmem_free(minor, strlen(minor) + 1); } /* * grab driver name, minor, block and devid out of * strings like "driver:minor:block:devid" */ static int parse_db_loc( char *str, mddb_cfg_loc_t *clp ) { char *p, *e; char *minor_name; ddi_devid_t ret_devid; clp->l_dev = 0; p = clp->l_driver; e = p + sizeof (clp->l_driver) - 1; while ((*str != ':') && (*str != '\0') && (p < e)) *p++ = *str++; *p = '\0'; if (*str++ != ':') return (-1); clp->l_mnum = 0; while (ISNUM(*str)) { clp->l_mnum *= 10; clp->l_mnum += *str++ - '0'; } if (*str++ != ':') return (-1); clp->l_blkno = 0; while (ISNUM(*str)) { clp->l_blkno *= 10; clp->l_blkno += *str++ - '0'; } if (*str++ != ':') return (-1); /* * If the md_devid_destroy flag is set, ignore the device ids. * This is only to used in a catastrophic failure case. Examples * would be where the device id of all drives in the system * (especially the mirror'd root drives) had been changed * by firmware upgrade or by a patch to an existing disk * driver. Another example would be in the case of non-unique * device ids due to a bug. The device id would be valid on * the system, but would return the wrong dev_t. 
*/ if (md_devid_destroy) { clp->l_devid_flags = 0; clp->l_devid = (uint64_t)NULL; clp->l_devid_sz = 0; clp->l_old_devid = (uint64_t)NULL; clp->l_old_devid_sz = 0; clp->l_minor_name[0] = '\0'; return (0); } if (ddi_devid_str_decode(str, (ddi_devid_t *)&ret_devid, &minor_name) == DDI_FAILURE) return (-1); clp->l_devid = (uint64_t)(uintptr_t)ret_devid; clp->l_devid_flags = 0; clp->l_old_devid = (uint64_t)NULL; clp->l_old_devid_sz = 0; /* If no device id associated with device, just return */ if ((ddi_devid_t)(uintptr_t)clp->l_devid == (ddi_devid_t)NULL) { clp->l_devid_sz = 0; clp->l_minor_name[0] = '\0'; if (strcmp(str, "id0") == 0 && md_devid_destroy == 0 && md_keep_repl_state == 0) { /* * No devid in md.conf; we're in recovery mode so * lookup the devid for the device as specified by * the devt in md.conf. */ lookup_db_devid(clp); } return (0); } clp->l_devid_flags = MDDB_DEVID_SPACE | MDDB_DEVID_VALID | MDDB_DEVID_SZ; clp->l_devid_sz = (int)ddi_devid_sizeof( (ddi_devid_t)(uintptr_t)clp->l_devid); (void) strcpy(clp->l_minor_name, minor_name); kmem_free(minor_name, strlen(minor_name) + 1); return (0); } /* * grab driver name, minor, and block out of * strings like "driver:minor:block:devid driver:minor:block:devid ..." */ static void parse_db_string( char *str ) { char *p, *e; mddb_cfg_loc_t *cl; char restore_space; /* CSTYLED */ cl = kmem_zalloc(sizeof (mddb_cfg_loc_t), KM_SLEEP); for (p = str; (*p != '\0'); ) { for (; ((*p != '\0') && (ISWHITE(*p))); ++p) ; if (*p == '\0') break; for (e = p; ((*e != '\0') && (! ISWHITE(*e))); ++e) ; /* * Only give parse_db_loc 1 entry, so stuff a null into * the string if we're not at the end. We need to save this * char and restore it after call. 
*/ restore_space = '\0'; if (*e != '\0') { restore_space = *e; *e = '\0'; } if (parse_db_loc(p, cl) != 0) { cmn_err(CE_NOTE, "mddb: parsing error on '%s'", p); } else { (void) ridev( &((mddb_set_t *)md_set[MD_LOCAL_SET].s_db)->s_rip, cl, NULL, MDDB_F_PTCHED); if (cl->l_devid_flags & MDDB_DEVID_SPACE) { kmem_free((caddr_t)(uintptr_t)cl->l_devid, cl->l_devid_sz); } } if (restore_space != '\0') { *e = restore_space; } p = e; } kmem_free(cl, sizeof (mddb_cfg_loc_t)); } /* * grab database locations supplied by md.conf as properties */ static void parse_db_strings(void) { int bootlist_id; int proplen; /* * size of _bootlist_name should match uses of line and entry in * libmeta meta_systemfile_append_mddb routine (meta_systemfile.c) */ char _bootlist_name[MDDB_BOOTLIST_MAX_LEN]; char *bootlist_name; caddr_t prop; /* * Step through the bootlist properties one at a time by forming the * correct name, fetching the property, parsing the property and * then freeing the memory. If a property does not exist or returns * some form of error just ignore it. There is no guarantee that * the properties will always exist in sequence, for example * mddb_bootlist1 may exist and mddb_bootlist2 may not exist with * mddb_bootlist3 existing. 
*/ bootlist_name = &_bootlist_name[0]; for (bootlist_id = 0; bootlist_id < md_maxbootlist; bootlist_id++) { proplen = 0; (void) sprintf(bootlist_name, "mddb_bootlist%d", bootlist_id); if (ddi_getlongprop(DDI_DEV_T_ANY, md_devinfo, DDI_PROP_CANSLEEP, bootlist_name, (caddr_t)&prop, &proplen) != DDI_PROP_SUCCESS) continue; if (proplen <= 0) continue; if (md_init_debug) cmn_err(CE_NOTE, "%s is %s", bootlist_name, prop); parse_db_string(prop); kmem_free(prop, proplen); } } static int initit( set_t setno, int flag ) { int i; mddb_set_t *s; mddb_lb_t *lbp; /* pointer to locator block */ mddb_ln_t *lnp; /* pointer to locator names */ mddb_db_t *dbp; /* pointer to directory block */ mddb_did_blk_t *did_blkp; /* pointer to Device ID block */ mddb_did_ic_t *did_icp; /* pointer to Device ID incore area */ mddb_bf_t *bfp; side_t sideno; side_t maxsides; mddb_block_t lb_blkcnt; int retval = 0; md_dev64_t dev; mddb_mnlb_t *mnlbp; int devid_flag; /* single thread's all loads/unloads of set's */ mutex_enter(&mddb_lock); mutex_enter(SETMUTEX(setno)); if (((mddb_set_t *)md_set[setno].s_db) == NULL) { mutex_exit(SETMUTEX(setno)); mutex_exit(&mddb_lock); return (MDDB_E_NOTNOW); } s = (mddb_set_t *)md_set[setno].s_db; single_thread_start(s); /* * init is already underway, block. Return success. */ if (s->s_lbp) { single_thread_end(s); mutex_exit(SETMUTEX(setno)); mutex_exit(&mddb_lock); return (0); } uniqtime32(&s->s_inittime); /* grab database locations patched by /etc/system */ if (setno == MD_LOCAL_SET) parse_db_strings(); s->s_mbiarray = (mddb_mb_ic_t **)kmem_zalloc( sizeof (mddb_mb_ic_t *) * mddb_maxcopies, KM_SLEEP); s->s_zombie = 0; s->s_staledeletes = 0; s->s_optcmtcnt = 0; s->s_opthavelck = 0; s->s_optwantlck = 0; s->s_optwaiterr = 0; s->s_opthungerr = 0; /* * KEEPTAG can never be set for a MN diskset since no tags are * allowed to be stored in a MN diskset. No way to check * if this is a MN diskset or not at this point since the mddb * hasn't been read in from disk yet. 
(flag will only have * MUTLINODE bit set if a new set is being created.) */ if (! (md_get_setstatus(s->s_setno) & MD_SET_KEEPTAG)) dt_setup(s, NULL); md_clr_setstatus(s->s_setno, MD_SET_TOOFEW); for (i = 0; i < mddb_maxbufheaders; i++) { bfp = (mddb_bf_t *)kmem_zalloc(sizeof (*bfp), KM_SLEEP); sema_init(&bfp->bf_buf.b_io, 0, NULL, SEMA_DEFAULT, NULL); sema_init(&bfp->bf_buf.b_sem, 0, NULL, SEMA_DEFAULT, NULL); bfp->bf_buf.b_offset = -1; freebuffer(s, bfp); } retval = load_old_replicas(s, flag); /* If 0 return value - success */ if (! retval) { single_thread_end(s); mutex_exit(SETMUTEX(setno)); mutex_exit(&mddb_lock); return (0); } /* * If here, then the load_old_replicas() failed */ /* If the database was supposed to exist. */ if (flag & MDDB_MUSTEXIST) { if (s->s_mbiarray != (mddb_mb_ic_t **)NULL) { for (i = 0; i < mddb_maxcopies; i++) { if (! s->s_mbiarray[i]) continue; dev = md_expldev( s->s_lbp->lb_locators[i].l_dev); dev = md_xlate_targ_2_mini(dev); if (dev != NODEV64) mddb_devclose(dev); free_mbipp(&s->s_mbiarray[i]); } kmem_free((caddr_t)s->s_mbiarray, sizeof (mddb_mb_ic_t *) * mddb_maxcopies); s->s_mbiarray = NULL; } if (s->s_lnp != (mddb_ln_t *)NULL) { kmem_free((caddr_t)s->s_lnp, dbtob(s->s_lbp->lb_lnblkcnt)); s->s_lnp = (mddb_ln_t *)NULL; } mddb_devid_icp_free(&s->s_did_icp, s->s_lbp); if (s->s_lbp != (mddb_lb_t *)NULL) { kmem_free((caddr_t)s->s_lbp, dbtob(s->s_lbp->lb_blkcnt)); s->s_lbp = (mddb_lb_t *)NULL; } while ((bfp = allocbuffer(s, MDDB_NOSLEEP)) != NULL) kmem_free((caddr_t)bfp, sizeof (*bfp)); single_thread_end(s); mutex_exit(SETMUTEX(setno)); mutex_exit(&mddb_lock); if (retval == MDDB_E_TAGDATA) return (retval); /* Want a bit more detailed error messages */ if (mddb_db_err_detail) return (retval); return (MDDB_E_NODB); } /* * MDDB_NOOLDOK set - Creating a new database, so do * more initialization. */ lb_blkcnt = (mddb_block_t)((setno == MD_LOCAL_SET) ? 
MDDB_LOCAL_LBCNT : MDDB_LBCNT); if (flag & MDDB_MULTINODE) { lb_blkcnt = MDDB_MNLBCNT; } if (s->s_lbp == NULL) s->s_lbp = (mddb_lb_t *)kmem_alloc(dbtob(lb_blkcnt), KM_SLEEP); lbp = s->s_lbp; bzero((caddr_t)lbp, dbtob(lb_blkcnt)); lbp->lb_setno = setno; lbp->lb_magic = MDDB_MAGIC_LB; if (flag & MDDB_MULTINODE) { lbp->lb_revision = MDDB_REV_MNLB; } else { lbp->lb_revision = MDDB_REV_LB; } lbp->lb_inittime = s->s_inittime; if (flag & MDDB_MULTINODE) { mnlbp = (mddb_mnlb_t *)lbp; for (i = 0; i < MDDB_NLB; i++) { for (sideno = 0; sideno < MD_MNMAXSIDES; sideno++) { mddb_mnsidelocator_t *mnslp; mnslp = &mnlbp->lb_mnsidelocators[sideno][i]; mnslp->mnl_mnum = NODEV32; mnslp->mnl_sideno = 0; mnslp->mnl_drvnm_index = 0; } } } else { maxsides = ((setno == MD_LOCAL_SET) ? 1 : MD_MAXSIDES); for (i = 0; i < MDDB_NLB; i++) { for (sideno = 0; sideno < maxsides; sideno++) { mddb_sidelocator_t *slp; slp = &lbp->lb_sidelocators[sideno][i]; slp->l_mnum = NODEV32; } } } lbp->lb_blkcnt = lb_blkcnt; /* lb starts on block 0 */ /* locator names starts after locator block */ lbp->lb_lnfirstblk = lb_blkcnt; if (flag & MDDB_MULTINODE) { lbp->lb_lnblkcnt = (mddb_block_t)MDDB_MNLNCNT; } else { lbp->lb_lnblkcnt = (mddb_block_t)((setno == MD_LOCAL_SET) ? 
MDDB_LOCAL_LNCNT : MDDB_LNCNT); } if (flag & MDDB_MULTINODE) { /* Creating a multinode diskset */ md_set_setstatus(setno, MD_SET_MNSET); lbp->lb_flags |= MDDB_MNSET; } /* Data portion of mddb located after locator names */ lbp->lb_dbfirstblk = lbp->lb_lnfirstblk + lbp->lb_lnblkcnt; /* the btodb that follows is converting the directory block size */ /* Data tag part of mddb located after first block of mddb data */ lbp->lb_dtfirstblk = (mddb_block_t)(lbp->lb_dbfirstblk + btodb(MDDB_BSIZE)); /* Data tags are not used in MN diskset - so set count to 0 */ if (flag & MDDB_MULTINODE) lbp->lb_dtblkcnt = (mddb_block_t)0; else lbp->lb_dtblkcnt = (mddb_block_t)MDDB_DT_BLOCKS; lnp = (mddb_ln_t *)kmem_zalloc(dbtob(lbp->lb_lnblkcnt), KM_SLEEP); lnp->ln_magic = MDDB_MAGIC_LN; if (flag & MDDB_MULTINODE) { lnp->ln_revision = MDDB_REV_MNLN; } else { lnp->ln_revision = MDDB_REV_LN; } s->s_lnp = lnp; /* * Set up Device ID portion of Locator Block. * Do not set locator to device id style if * md_devid_destroy is 1 and md_keep_repl_state is 1 * (destroy all device id data and keep replica in * non device id mode). * * This is logically equivalent to set locator to * device id style if md_devid_destroy is 0 or * md_keep_repl_state is 0. * * In SunCluster environment, device id mode is disabled * which means diskset will be run in non-devid mode. For * localset, the behavior will remain intact and run in * device id mode. * * In multinode diskset devids are turned off. 
*/ devid_flag = 1; if (cluster_bootflags & CLUSTER_CONFIGURED) if (setno != MD_LOCAL_SET) devid_flag = 0; if (flag & MDDB_MULTINODE) devid_flag = 0; if ((md_devid_destroy == 1) && (md_keep_repl_state == 1)) devid_flag = 0; /* * if we weren't devid style before and md_keep_repl_state=1 * we need to stay non-devid */ if (((lbp->lb_flags & MDDB_DEVID_STYLE) == 0) && (md_keep_repl_state == 1)) devid_flag = 0; if (devid_flag) { lbp->lb_didfirstblk = lbp->lb_dtfirstblk + lbp->lb_dtblkcnt; lbp->lb_didblkcnt = (mddb_block_t)MDDB_DID_BLOCKS; lbp->lb_flags |= MDDB_DEVID_STYLE; did_icp = (mddb_did_ic_t *)kmem_zalloc (sizeof (mddb_did_ic_t), KM_SLEEP); did_blkp = (mddb_did_blk_t *) kmem_zalloc(dbtob(lbp->lb_didblkcnt), KM_SLEEP); did_blkp->blk_magic = MDDB_MAGIC_DI; did_blkp->blk_revision = MDDB_REV_DI; did_icp->did_ic_blkp = did_blkp; s->s_did_icp = did_icp; } setidentifier(s, &lbp->lb_ident); uniqtime32(&lbp->lb_timestamp); dbp = (mddb_db_t *)kmem_zalloc(sizeof (mddb_db_t), KM_SLEEP); dbp->db_magic = MDDB_MAGIC_DB; dbp->db_revision = MDDB_REV_DB; uniqtime32(&dbp->db_timestamp); dbp->db_nextblk = 0; dbp->db_firstentry = NULL; dbp->db_blknum = lbp->lb_dbfirstblk; dbp->db_recsum = MDDB_GLOBAL_XOR; s->s_dbp = dbp; single_thread_end(s); mutex_exit(SETMUTEX(setno)); mutex_exit(&mddb_lock); return (0); } mddb_set_t * mddb_setenter( set_t setno, int flag, int *errorcodep ) { mddb_set_t *s; int err = 0; size_t sz = sizeof (void *) * MD_MAXUNITS; mutex_enter(SETMUTEX(setno)); if (! md_set[setno].s_db) { mutex_exit(SETMUTEX(setno)); if (errorcodep != NULL) *errorcodep = MDDB_E_NOTOWNER; return (NULL); } /* Allocate s_un and s_ui arrays if not already present. 
*/ if (md_set[setno].s_un == NULL) { md_set[setno].s_un = kmem_zalloc(sz, KM_NOSLEEP); if (md_set[setno].s_un == NULL) { mutex_exit(SETMUTEX(setno)); if (errorcodep != NULL) *errorcodep = MDDB_E_NOTOWNER; return (NULL); } } if (md_set[setno].s_ui == NULL) { md_set[setno].s_ui = kmem_zalloc(sz, KM_NOSLEEP); if (md_set[setno].s_ui == NULL) { mutex_exit(&md_set[setno].s_dbmx); kmem_free(md_set[setno].s_un, sz); md_set[setno].s_un = NULL; if (errorcodep != NULL) *errorcodep = MDDB_E_NOTOWNER; return (NULL); } } s = (mddb_set_t *)md_set[setno].s_db; if (s->s_lbp) return (s); if (flag & MDDB_NOINIT) return (s); /* * Release the set mutex - it will be acquired and released in * initit after acquiring the mddb_lock. This is done to assure * that mutexes are always acquired in the same order to prevent * possible deadlock */ mutex_exit(SETMUTEX(setno)); if ((err = initit(setno, flag)) != 0) { if (errorcodep != NULL) *errorcodep = err; return (NULL); } mutex_enter(SETMUTEX(setno)); return ((mddb_set_t *)md_set[setno].s_db); } /* * Release the set lock for a given set. * * In a MN diskset, this routine may send messages to the rpc.mdcommd * in order to have the slave nodes re-parse parts of the mddb. * Messages are only sent if the global ioctl lock is not held. * * With the introduction of multi-threaded ioctls, there is no way * to determine which thread(s) are holding the ioctl lock. So, if * the ioctl lock is held (by process X) process X will send the * messages to the slave nodes when process X releases the ioctl lock. */ void mddb_setexit( mddb_set_t *s ) { md_mn_msg_mddb_parse_t *mddb_parse_msg; md_mn_kresult_t *kresult; mddb_lb_t *lbp = s->s_lbp; int i; int rval = 1; /* * If not a MN diskset OR * a MN diskset but this node isn't master, * then release the mutex. 
*/ if (!(MD_MNSET_SETNO(s->s_setno)) || ((MD_MNSET_SETNO(s->s_setno)) && (!md_set[s->s_setno].s_am_i_master))) { mutex_exit(SETMUTEX(s->s_setno)); return; } /* * If global ioctl lock is held, then send no messages, * just release mutex and return. * */ if (md_status & MD_GBL_IOCTL_LOCK) { mutex_exit(SETMUTEX(s->s_setno)); return; } /* * This thread is not holding the ioctl lock, so drop the set * lock, send messages to slave nodes to reparse portions * of the mddb and return. * * If the block parse flag is set, do not send parse messages. * This flag is set when master is adding a new mddb that would * cause parse messages to be sent to the slaves, but the slaves * don't have knowledge of the new mddb yet since the mddb add * operation hasn't been run on the slave nodes yet. When the * master unblocks the parse flag, the parse messages will be * generated. * * If s_mn_parseflags_sending is non-zero, then another thread * is already currently sending a parse message, so just release * the mutex and return. If an mddb change occurred that results * in a parse message to be generated, the thread that is currently * sending a parse message would generate the additional parse message. * * If s_mn_parseflags_sending is zero and parsing is not blocked, * then loop until s_mn_parseflags is 0 (until there are no more * messages to send). 
* While s_mn_parseflags is non-zero, * put snapshot of parse_flags in s_mn_parseflags_sending * set s_mn_parseflags to zero * release mutex * send message * re-grab mutex * set s_mn_parseflags_sending to zero */ mddb_parse_msg = kmem_zalloc(sizeof (md_mn_msg_mddb_parse_t), KM_SLEEP); while (((s->s_mn_parseflags_sending & MDDB_PARSE_MASK) == 0) && (s->s_mn_parseflags & MDDB_PARSE_MASK) && (!(md_get_setstatus(s->s_setno) & MD_SET_MNPARSE_BLK))) { /* Grab snapshot of parse flags */ s->s_mn_parseflags_sending = s->s_mn_parseflags; s->s_mn_parseflags = 0; mutex_exit(SETMUTEX(s->s_setno)); /* * Send the message to the slaves to re-parse * the indicated portions of the mddb. Send the status * of the 50 mddbs in this set so that slaves know which * mddbs that the master node thinks are 'good'. * Otherwise, slave may reparse, but from wrong replica. */ mddb_parse_msg->msg_parse_flags = s->s_mn_parseflags_sending; for (i = 0; i < MDDB_NLB; i++) { mddb_parse_msg->msg_lb_flags[i] = lbp->lb_locators[i].l_flags; } kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); while (rval != 0) { rval = mdmn_ksend_message(s->s_setno, MD_MN_MSG_MDDB_PARSE, 0, 0, (char *)mddb_parse_msg, sizeof (md_mn_msg_mddb_parse_t), kresult); if (rval != 0) cmn_err(CE_WARN, "mddb_setexit: Unable to send " "mddb update message to other nodes in " "diskset %s\n", s->s_setname); } kmem_free(kresult, sizeof (md_mn_kresult_t)); /* * Re-grab mutex to clear sending field and to * see if another parse message needs to be generated. 
*/ mutex_enter(SETMUTEX(s->s_setno)); s->s_mn_parseflags_sending = 0; } kmem_free(mddb_parse_msg, sizeof (md_mn_msg_mddb_parse_t)); mutex_exit(SETMUTEX(s->s_setno)); } static void mddb_setexit_no_parse( mddb_set_t *s ) { mutex_exit(SETMUTEX(s->s_setno)); } uint_t mddb_lb_did_convert(mddb_set_t *s, uint_t doit, uint_t *blk_cnt) { uint_t li; mddb_lb_t *lbp = s->s_lbp; mddb_locator_t *lp; ddi_devid_t ret_devid; uint_t devid_len; dev_t ddi_dev; mddb_did_ic_t *did_icp; mddb_did_blk_t *did_blkp; char *minor_name; size_t sz; int retval; int err; md_dev64_t dev64; /* tmp var to make code look better */ /* Need disk block(s) to hold mddb_did_blk_t */ *blk_cnt = MDDB_DID_BLOCKS; if (doit) { /* * Alloc mddb_did_blk_t disk block and fill in header area. * Don't fill in did magic number until end of routine so * if machine panics in the middle of conversion, the * device id information will be thrown away at the * next snarfing of this set. * Need to set DEVID_STYLE so that mddb_devid_add will * function properly. */ /* grab the mutex */ if ((mddb_setenter(s->s_setno, MDDB_NOINIT, &err)) == NULL) { return (1); } single_thread_start(s); lbp->lb_didfirstblk = getfreeblks(s, MDDB_DID_BLOCKS); if (lbp->lb_didfirstblk == 0) { single_thread_end(s); mddb_setexit(s); return (1); } lbp->lb_didblkcnt = (mddb_block_t)MDDB_DID_BLOCKS; did_icp = (mddb_did_ic_t *)kmem_zalloc(sizeof (mddb_did_ic_t), KM_SLEEP); did_blkp = (mddb_did_blk_t *)kmem_zalloc(MDDB_DID_BYTES, KM_SLEEP); did_blkp->blk_revision = MDDB_REV_DI; did_icp->did_ic_blkp = did_blkp; s->s_did_icp = did_icp; lbp->lb_flags |= MDDB_DEVID_STYLE; } /* Fill in information in mddb_did_info_t array */ for (li = 0; li < lbp->lb_loccnt; li++) { lp = &lbp->lb_locators[li]; if (lp->l_flags & MDDB_F_DELETED) continue; dev64 = md_xlate_targ_2_mini(md_expldev(lp->l_dev)); ddi_dev = md_dev64_to_dev(dev64); if (ddi_dev == NODEV) { /* * No translation available for replica. 
			 * Could fail conversion to device id replica,
			 * but instead will just continue with next
			 * replica in list.
			 */
			continue;
		}
		if (ddi_lyr_get_devid(ddi_dev, &ret_devid) == DDI_SUCCESS) {
			/*
			 * Just count each devid as at least 1 block.  This
			 * is conservative since several device id's may fit
			 * into 1 disk block, but it's better to overestimate
			 * the number of blocks needed than to underestimate.
			 */
			devid_len = (int)ddi_devid_sizeof(ret_devid);
			*blk_cnt += btodb(devid_len + (MDDB_BSIZE - 1));
			if (doit) {
				if (ddi_lyr_get_minor_name(ddi_dev, S_IFBLK,
				    &minor_name) == DDI_SUCCESS) {
					if (mddb_devid_add(s, li, ret_devid,
					    minor_name)) {
						cmn_err(CE_WARN,
						    "Not enough space in metadb"
						    " to add device id for"
						    " dev: major = %d, "
						    "minor = %d\n",
						    getmajor(ddi_dev),
						    getminor(ddi_dev));
					}
					sz = strlen(minor_name) + 1;
					kmem_free(minor_name, sz);
				}
			}
			ddi_devid_free(ret_devid);
		}
	}

	if (doit) {
		did_blkp->blk_magic = MDDB_MAGIC_DI;
		retval = push_lb(s);
		(void) upd_med(s, "mddb_lb_did_convert(0)");
		single_thread_end(s);
		mddb_setexit(s);
		if (retval != 0)
			return (1);
	}

	return (0);
}

/*
 * init_set - make sure an in-core mddb_set_t exists for a set and enter it.
 *
 * cp	configuration request naming the set, or NULL for the local set
 *	(MD_LOCAL_SET, side 0).
 * flag	passed through unchanged to mddb_setenter().
 * errp	passed through unchanged to mddb_setenter().
 *
 * Returns NULL when the set number is >= MD_MAXSETS; otherwise returns
 * whatever mddb_setenter() returns.  If md_set[setno].s_db is already
 * populated nothing is allocated and the existing set is simply entered.
 */
static mddb_set_t *
init_set(
	mddb_config_t	*cp,
	int		flag,
	int		*errp
)
{
	mddb_set_t	*s;
	char		*setname = NULL;
	set_t		setno = MD_LOCAL_SET;
	side_t		sideno = 0;
	struct timeval32 *created = NULL;

	/* With a config struct, the set identity comes from the caller. */
	if (cp != NULL) {
		setname = cp->c_setname;
		setno = cp->c_setno;
		sideno = cp->c_sideno;
		created = &cp->c_timestamp;
	}

	if (setno >= MD_MAXSETS)
		return ((mddb_set_t *)NULL);

	/* Already initialized: just enter the existing set. */
	if (md_set[setno].s_db)
		return (mddb_setenter(setno, flag, errp));

	s = (mddb_set_t *)kmem_zalloc(sizeof (*s), KM_SLEEP);

	cv_init(&s->s_buf_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&s->s_single_thread_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&s->s_optqueuing_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&s->s_opthungerr_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&s->s_optwantlck_cv, NULL, CV_DEFAULT, NULL);

	s->s_setno = setno;
	s->s_sideno = sideno;
	if (setno == MD_LOCAL_SET) {
		/* The local set is identified by the host id string. */
		(void) snprintf(s->s_ident.serial, sizeof (s->s_ident.serial),
		    "%u", zone_get_hostid(NULL));
	} else {
		/*
		 * Any other set is identified by its creation time and
		 * carries a private copy of its name.  setno can only
		 * differ from MD_LOCAL_SET when cp was non-NULL, so
		 * created and setname are valid here.
		 */
		s->s_ident.createtime = *created;
		s->s_setname = (char *)kmem_alloc(strlen(setname) + 1,
		    KM_SLEEP);
		(void) strcpy(s->s_setname, setname);
	}

	/* have a config struct, copy mediator information */
	if (cp != NULL)
		s->s_med = cp->c_med;		/* structure assignment */

	/* Publish the new set before entering it. */
	md_set[setno].s_db = (void *) s;

	SE_NOTIFY(EC_SVM_STATE, ESC_SVM_TAKEOVER, SVM_TAG_SET, setno, NODEV64);

	return (mddb_setenter(setno, flag, errp));
}

/*
 * mddb_unload_set - release every in-core resource held for a set's
 * database and clear md_set[setno].s_db.
 *
 * Does nothing when mddb_setenter(setno, MDDB_NOINIT, NULL) returns NULL.
 * Otherwise frees the record directory and record buffers, the replica
 * info list, the master blocks (closing the translated device of each
 * replica that has master blocks), the locator names, the locator block
 * and its device id data, the free bitmap, the buffer headers, the data
 * buffer, the set name and the mddb_set_t itself, then clears the
 * database related set status bits.
 *
 * The function exits through mutex_exit(SETMUTEX(setno)) rather than
 * mddb_setexit() because the mddb_set_t has been freed by then.
 */
void
mddb_unload_set(
	set_t		setno
)
{

	mddb_set_t	*s;
	mddb_db_t	*dbp, *adbp = NULL;
	mddb_de_ic_t	*dep, *dep2;
	mddb_bf_t	*bfp;
	int		i;
	md_dev64_t	dev;

	if ((s = mddb_setenter(setno, MDDB_NOINIT, NULL)) == NULL)
		return;

	single_thread_start(s);

	s->s_opthavequeuinglck = 0;
	s->s_optwantqueuinglck = 0;

	/*
	 * Free every directory block and every directory entry hanging
	 * off it.  The next pointers are fetched before each free.
	 */
	for (dbp = s->s_dbp; dbp != 0; dbp = adbp) {
		for (dep = dbp->db_firstentry; dep != NULL; dep = dep2) {
			if (dep->de_rb_userdata != NULL) {
				/*
				 * A non-zero de_icreqsize selects the
				 * de_rb_userdata_ic buffer and its size.
				 */
				if (dep->de_icreqsize)
					kmem_free(dep->de_rb_userdata_ic,
					    dep->de_icreqsize);
				else
					kmem_free(dep->de_rb_userdata,
					    dep->de_reqsize);
			}
			kmem_free((caddr_t)dep->de_rb, dep->de_recsize);
			dep2 = dep->de_next;
			kmem_free((caddr_t)dep, sizeofde(dep));
		}
		adbp = dbp->db_next;
		kmem_free((caddr_t)dbp, sizeof (mddb_db_t));
	}
	s->s_dbp = (mddb_db_t *)NULL;

	free_rip(&s->s_rip);

	/* Close each replica device that has master blocks and free them. */
	for (i = 0; i < mddb_maxcopies; i++) {
		if (! s->s_mbiarray)
			break;

		if (! s->s_mbiarray[i])
			continue;

		/* System calls need the translated (miniroot) devt. */
		dev = md_expldev(s->s_lbp->lb_locators[i].l_dev);
		dev = md_xlate_targ_2_mini(dev);
		if (dev != NODEV64)
			mddb_devclose(dev);

		free_mbipp(&s->s_mbiarray[i]);
	}

	if (s->s_mbiarray) {
		kmem_free((caddr_t)s->s_mbiarray,
		    sizeof (mddb_mb_ic_t *) * mddb_maxcopies);
		s->s_mbiarray = (mddb_mb_ic_t **)NULL;
	}

	/* The locator names are sized from the locator block: free first. */
	if (s->s_lnp) {
		kmem_free((caddr_t)s->s_lnp, dbtob(s->s_lbp->lb_lnblkcnt));
		s->s_lnp = (mddb_ln_t *)NULL;
	}

	if (s->s_lbp) {
		mddb_devid_icp_free(&s->s_did_icp, s->s_lbp);
		kmem_free((caddr_t)s->s_lbp, dbtob(s->s_lbp->lb_blkcnt));
		s->s_lbp = (mddb_lb_t *)NULL;
	}

	if (s->s_freebitmap) {
		kmem_free((caddr_t)s->s_freebitmap, s->s_freebitmapsize);
		s->s_freebitmap = NULL;
		s->s_freebitmapsize = 0;
	}

	/* Drain and free every buffer header allocbuffer() will hand out. */
	while ((bfp = allocbuffer(s, MDDB_NOSLEEP)) != NULL)
		kmem_free((caddr_t)bfp, sizeof (*bfp));

	if (s->s_databuffer_size) {
		kmem_free(s->s_databuffer, s->s_databuffer_size);
		s->s_databuffer_size = 0;
	}

	if (s->s_setname != NULL)
		kmem_free((caddr_t)s->s_setname, strlen(s->s_setname)+1);

	/* Data tags not supported on MN sets. */
	if (!(md_get_setstatus(setno) & MD_SET_MNSET))
		dtl_freel(&s->s_dtlp);

	md_set[setno].s_db = NULL;
	ASSERT(s->s_singlelockwanted == 0);
	kmem_free(s, sizeof (mddb_set_t));

	/* Take care of things setup in the md_set array */
	if (! (md_get_setstatus(setno) & MD_SET_KEEPTAG)) {
		if (md_set[setno].s_dtp) {
			kmem_free((caddr_t)md_set[setno].s_dtp, MDDB_DT_BYTES);
			md_set[setno].s_dtp = NULL;
		}
	}

	md_clr_setstatus(setno, MD_SET_ACCOK | MD_SET_ACCEPT |
	    MD_SET_TAGDATA | MD_SET_USETAG | MD_SET_TOOFEW | MD_SET_STALE |
	    MD_SET_OWNERSHIP | MD_SET_BADTAG | MD_SET_CLRTAG | MD_SET_MNSET |
	    MD_SET_DIDCLUP | MD_SET_MNPARSE_BLK | MD_SET_MN_MIR_STATE_RC |
	    MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT);

	mutex_exit(SETMUTEX(setno));
}

/*
 * returns 0 if name can be put into locator block
 * returns 1 if locator block prefixes are all used
 *
 * Takes splitname (suffix, prefix, sideno) and
 * stores it in the locator name structure.
* For traditional diskset, the sideno is the index into the suffixes * array in the locator name structure. * For the MN diskset, the sideno is the nodeid which can be any number, * so the index passed in is the index into the mnsuffixes array * in the locator structure. This index was computed by the * routine checklocator which basically checked the locator block * mnside locator structure. */ static int splitname2locatorblock( md_splitname *spn, mddb_ln_t *lnp, int li, side_t sideno, int index ) { uchar_t i; md_name_suffix *sn; md_mnname_suffix_t *mnsn; mddb_mnln_t *mnlnp; for (i = 0; i < MDDB_PREFIXCNT; i++) { if (lnp->ln_prefixes[i].pre_len != SPN_PREFIX(spn).pre_len) continue; if (bcmp(lnp->ln_prefixes[i].pre_data, SPN_PREFIX(spn).pre_data, SPN_PREFIX(spn).pre_len) == 0) break; } if (i == MDDB_PREFIXCNT) { for (i = 0; i < MDDB_PREFIXCNT; i++) { if (lnp->ln_prefixes[i].pre_len == 0) break; } if (i == MDDB_PREFIXCNT) return (1); bcopy(SPN_PREFIX(spn).pre_data, lnp->ln_prefixes[i].pre_data, SPN_PREFIX(spn).pre_len); lnp->ln_prefixes[i].pre_len = SPN_PREFIX(spn).pre_len; } if (lnp->ln_revision == MDDB_REV_MNLN) { /* If a MN diskset, use index */ mnlnp = (mddb_mnln_t *)lnp; mnsn = &mnlnp->ln_mnsuffixes[index][li]; mnsn->mn_ln_sideno = sideno; mnsn->mn_ln_suffix.suf_len = SPN_SUFFIX(spn).suf_len; mnsn->mn_ln_suffix.suf_prefix = i; bcopy(SPN_SUFFIX(spn).suf_data, mnsn->mn_ln_suffix.suf_data, SPN_SUFFIX(spn).suf_len); } else { sn = &lnp->ln_suffixes[sideno][li]; sn->suf_len = SPN_SUFFIX(spn).suf_len; sn->suf_prefix = i; bcopy(SPN_SUFFIX(spn).suf_data, sn->suf_data, SPN_SUFFIX(spn).suf_len); } return (0); } /* * Find the locator name for the given sideno and convert the locator name * information into a splitname structure. 
*/ void mddb_locatorblock2splitname( mddb_ln_t *lnp, int li, side_t sideno, md_splitname *spn ) { int iprefix; md_name_suffix *sn; md_mnname_suffix_t *mnsn; int i; mddb_mnln_t *mnlnp; if (lnp->ln_revision == MDDB_REV_MNLN) { mnlnp = (mddb_mnln_t *)lnp; for (i = 0; i < MD_MNMAXSIDES; i++) { mnsn = &mnlnp->ln_mnsuffixes[i][li]; if (mnsn->mn_ln_sideno == sideno) break; } if (i == MD_MNMAXSIDES) return; SPN_SUFFIX(spn).suf_len = mnsn->mn_ln_suffix.suf_len; bcopy(mnsn->mn_ln_suffix.suf_data, SPN_SUFFIX(spn).suf_data, SPN_SUFFIX(spn).suf_len); iprefix = mnsn->mn_ln_suffix.suf_prefix; } else { sn = &lnp->ln_suffixes[sideno][li]; SPN_SUFFIX(spn).suf_len = sn->suf_len; bcopy(sn->suf_data, SPN_SUFFIX(spn).suf_data, SPN_SUFFIX(spn).suf_len); iprefix = sn->suf_prefix; } SPN_PREFIX(spn).pre_len = lnp->ln_prefixes[iprefix].pre_len; bcopy(lnp->ln_prefixes[iprefix].pre_data, SPN_PREFIX(spn).pre_data, SPN_PREFIX(spn).pre_len); } static int getdeldev( mddb_config_t *cp, int command, md_error_t *ep ) { mddb_set_t *s; mddb_lb_t *lbp; mddb_locator_t *locators; uint_t loccnt; mddb_mb_ic_t *mbip; mddb_block_t blk; int err = 0; int i, j; int li; uint_t commitcnt; set_t setno = cp->c_setno; uint_t set_status; md_dev64_t dev; int flags = MDDB_MUSTEXIST; mddb_ri_t *rip; cp->c_dbmax = MDDB_NLB; /* * Data checking */ if (setno >= md_nsets || cp->c_id < 0 || cp->c_id > cp->c_dbmax) { return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); } if (cp->c_flags & MDDB_C_STALE) flags |= MDDB_MN_STALE; if ((s = mddb_setenter(setno, flags, &err)) == NULL) return (mddbstatus2error(ep, err, NODEV32, setno)); cp->c_flags = 0; lbp = s->s_lbp; loccnt = lbp->lb_loccnt; locators = lbp->lb_locators; /* shorthand */ set_status = md_get_setstatus(setno); if (set_status & MD_SET_STALE) cp->c_flags |= MDDB_C_STALE; if (set_status & MD_SET_TOOFEW) cp->c_flags |= MDDB_C_TOOFEW; cp->c_sideno = s->s_sideno; cp->c_dbcnt = 0; /* * go through and count active entries */ for (i = 0; i < loccnt; i++) { if 
(locators[i].l_flags & MDDB_F_DELETED) continue; cp->c_dbcnt++; } /* * add the ability to accept a locator block index * which is not relative to previously deleted replicas. This * is for support of MD_DEBUG=STAT in metastat since it asks for * replica information specifically for each of the mirror resync * records. MDDB_CONFIG_SUBCMD uses one of the pad spares in * the mddb_config_t type. */ if (cp->c_subcmd == MDDB_CONFIG_ABS) { if (cp->c_id < 0 || cp->c_id > cp->c_dbmax) { mddb_setexit(s); return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, setno)); } li = cp->c_id; } else { if (cp->c_id >= cp->c_dbcnt) { mddb_setexit(s); return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, setno)); } /* CSTYLED */ for (li = 0, j = 0; /* void */; li++) { if (locators[li].l_flags & MDDB_F_DELETED) continue; j++; if (j > cp->c_id) break; } } if (command == MDDB_ENDDEV) { daddr_t ib = 0, jb; blk = 0; if ((s != NULL) && s->s_mbiarray[li]) { mbip = s->s_mbiarray[li]; while ((jb = getphysblk(blk++, mbip)) > 0) { if (jb > ib) ib = jb; } cp->c_dbend = (int)ib; } else { cp->c_dbend = 0; } } locator2cfgloc(lbp, &cp->c_locator, li, s->s_sideno, s->s_did_icp); mddb_locatorblock2splitname(s->s_lnp, li, s->s_sideno, &cp->c_devname); if (command != MDDB_DELDEV) { mddb_setexit(s); return (0); } /* Currently don't allow addition/deletion of sides during upgrade */ if (MD_UPGRADE) { cmn_err(CE_WARN, "Deletion of replica not allowed during upgrade.\n"); mddb_setexit(s); return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno)); } /* * If here, replica delete in progress. */ single_thread_start(s); if ((! (locators[li].l_flags & MDDB_F_EMASTER)) && (locators[li].l_flags & MDDB_F_ACTIVE)) { commitcnt = lbp->lb_commitcnt; lbp->lb_commitcnt = 0; setidentifier(s, &lbp->lb_ident); crcgen(lbp, &lbp->lb_checksum, dbtob(lbp->lb_blkcnt), NULL); /* * Don't need to write out device id area, since locator * block on this replica is being deleted by setting the * commitcnt to 0. 
*/ (void) writeblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, li, MDDB_WR_ONLY_MASTER); lbp->lb_commitcnt = commitcnt; } if (s->s_mbiarray[li]) { /* A freed mbi pointer still exists in the mddb_ri_t */ for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { if (rip->ri_mbip == s->s_mbiarray[li]) rip->ri_mbip = NULL; } free_mbipp(&s->s_mbiarray[li]); } if (! (locators[li].l_flags & MDDB_F_EMASTER)) { dev = md_expldev(locators[li].l_dev); dev = md_xlate_targ_2_mini(dev); if (dev != NODEV64) mddb_devclose(dev); } s->s_mbiarray[li] = 0; lbp->lb_locators[li].l_flags = MDDB_F_DELETED; /* Only support data tags for traditional and local sets */ if ((md_get_setstatus(setno) & MD_SET_STALE) && (!(lbp->lb_flags & MDDB_MNSET)) && setno != MD_LOCAL_SET) if (set_dtag(s, ep)) mdclrerror(ep); /* Write data tags to all accessible devices */ /* Only support data tags for traditional and local sets */ if (!(lbp->lb_flags & MDDB_MNSET)) { (void) dt_write(s); } /* Delete device id of deleted replica */ if (lbp->lb_flags & MDDB_DEVID_STYLE) { (void) mddb_devid_delete(s, li); } /* write new locator to all devices */ err = writelocall(s); (void) upd_med(s, "getdeldev(0)"); SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_REPLICA, setno, md_expldev(locators[li].l_dev)); computefreeblks(s); /* recompute always it may be larger */ cp->c_dbcnt--; err |= fixoptrecords(s); if (err) { if (writeretry(s)) { single_thread_end(s); mddb_setexit(s); return (mdmddberror(ep, MDDB_E_NOTNOW, NODEV32, setno)); } } single_thread_end(s); mddb_setexit(s); return (0); } static int getdriver( mddb_cfg_loc_t *clp ) { major_t majordev; /* * Data checking */ if (clp->l_dev <= 0) return (EINVAL); majordev = getmajor(expldev(clp->l_dev)); if (ddi_major_to_name(majordev) == (char *)NULL) return (EINVAL); if (MD_UPGRADE) (void) strcpy(clp->l_driver, md_targ_major_to_name(majordev)); else (void) strcpy(clp->l_driver, ddi_major_to_name(majordev)); return (0); } /* * update_valid_replica - updates the locator block namespace 
(prefix * and/or suffix) with new pathname and devname. * RETURN * 1 Error * 0 Success */ static int update_valid_replica( side_t side, mddb_locator_t *lp, mddb_set_t *s, int li, char *devname, char *pathname, md_dev64_t devt ) { uchar_t pre_len, suf_len; md_name_suffix *sn; mddb_ln_t *lnp; uchar_t pre_index; uchar_t i; if (md_expldev(lp->l_dev) != devt) { return (0); } if (pathname[strlen(pathname) - 1] == '/') pathname[strlen(pathname) - 1] = '\0'; pre_len = (uchar_t)strlen(pathname); suf_len = (uchar_t)strlen(devname); if ((pre_len > MD_MAXPREFIX) || (suf_len > MD_MAXSUFFIX)) return (1); lnp = s->s_lnp; /* * Future note: Need to do something here for the MN diskset case * when device ids are supported in disksets. * Can't add until merging devids_in_diskset code into code base * Currently only called with side of 0. */ sn = &lnp->ln_suffixes[side][li]; /* * Check if prefix (Ex: /dev/dsk) needs to be changed. * If new prefix is the same as the previous prefix - no change. * * If new prefix is not the same, check if new prefix * matches an existing one. If so, use that one. * * If new prefix doesn't exist, add a new prefix. If not enough * space, return failure. */ pre_index = sn->suf_prefix; /* Check if new prefix is the same as the old prefix. */ if ((lnp->ln_prefixes[pre_index].pre_len != pre_len) || (bcmp(lnp->ln_prefixes[pre_index].pre_data, pathname, pre_len) != 0)) { /* Check if new prefix is an already known prefix. 
*/ for (i = 0; i < MDDB_PREFIXCNT; i++) { if (lnp->ln_prefixes[i].pre_len != pre_len) { continue; } if (bcmp(lnp->ln_prefixes[i].pre_data, pathname, pre_len) == 0) { break; } } /* If no match found for new prefix - add the new prefix */ if (i == MDDB_PREFIXCNT) { for (i = 0; i < MDDB_PREFIXCNT; i++) { if (lnp->ln_prefixes[i].pre_len == 0) break; } /* No space to add new prefix - return failure */ if (i == MDDB_PREFIXCNT) { return (1); } bcopy(pathname, lnp->ln_prefixes[i].pre_data, pre_len); lnp->ln_prefixes[i].pre_len = pre_len; } sn->suf_prefix = i; } /* Now, update the suffix (Ex: c0t0d0s0) if needed */ if ((sn->suf_len != suf_len) || (bcmp(sn->suf_data, devname, suf_len) != 0)) { bcopy(devname, sn->suf_data, suf_len); sn->suf_len = suf_len; } return (0); } /* * md_update_locator_namespace - If in devid style and active and the devid's * exist and are valid update the locator namespace pathname * and devname. * RETURN * 1 Error * 0 Success */ int md_update_locator_namespace( set_t setno, /* which set to get name from */ side_t side, char *dname, char *pname, md_dev64_t devt ) { mddb_set_t *s; mddb_lb_t *lbp; int li; uint_t flg; int err = 0; mddb_ln_t *lnp; if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) return (1); single_thread_start(s); lbp = s->s_lbp; /* must be DEVID_STYLE */ if (lbp->lb_flags & MDDB_DEVID_STYLE) { for (li = 0; li < lbp->lb_loccnt; li++) { mddb_locator_t *lp = &lbp->lb_locators[li]; if (lp->l_flags & MDDB_F_DELETED) { continue; } /* replica also must be active */ if (lp->l_flags & MDDB_F_ACTIVE) { flg = s->s_did_icp->did_ic_blkp-> blk_info[li].info_flags; /* only update if did exists and is valid */ if ((flg & MDDB_DID_EXISTS) && (flg & MDDB_DID_VALID)) { if (update_valid_replica(side, lp, s, li, dname, pname, devt)) { err = 1; goto out; } } } } } lnp = s->s_lnp; uniqtime32(&lnp->ln_timestamp); if (lbp->lb_flags & MDDB_MNSET) lnp->ln_revision = MDDB_REV_MNLN; else lnp->ln_revision = MDDB_REV_LN; crcgen(lnp, &lnp->ln_checksum, 
dbtob(lbp->lb_lnblkcnt), NULL); err = writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk, lbp->lb_lnblkcnt, 0); /* * If a MN diskset and this is the master, set the PARSE_LOCNM * flag in the mddb_set structure to show that the locator * names have changed. */ if ((lbp->lb_flags & MDDB_MNSET) && (md_set[s->s_setno].s_am_i_master)) { s->s_mn_parseflags |= MDDB_PARSE_LOCNM; } out: single_thread_end(s); mddb_setexit(s); if (err) return (1); return (0); } /* * update_locatorblock - for active entries in the locator block, check * the devt to see if it matches the given devt. If so, and * there is an associated device id which is not the same * as the passed in devid, delete old devid and add a new one. * * During import of replicated disksets, old_didptr contains * the original disk's device id. Use this device id in * addition to the devt to determine if an entry is a match * and should be updated with the new device id of the * replicated disk. Specifically, this is the case being handled: * * Original_disk Replicated_disk Disk_Available_During_Import * c1t1d0 c1t3d0 no - so old name c1t1d0 shown * c1t2d0 c1t1d0 yes - name is c1t1d0 * c1t3d0 c1t2d0 yes - name is c1t2d0 * * Can't just match on devt since devt for the first and third * disks will be the same, but the original disk's device id * is known and can be used to distinguish which disk's * replicated device id should be updated. 
* RETURN * MDDB_E_NODEVID * MDDB_E_NOLOCBLK * 1 Error * 0 Success */ static int update_locatorblock( mddb_set_t *s, md_dev64_t dev, ddi_devid_t didptr, ddi_devid_t old_didptr ) { mddb_lb_t *lbp = NULL; mddb_locator_t *lp; int li; uint_t flg; ddi_devid_t devid_ptr; int retval = 0; char *minor_name; int repl_import_flag; /* Set replicated flag if this is a replicated import */ repl_import_flag = md_get_setstatus(s->s_setno) & MD_SET_REPLICATED_IMPORT; lbp = s->s_lbp; /* find replicas that haven't been deleted */ for (li = 0; li < lbp->lb_loccnt; li++) { lp = &lbp->lb_locators[li]; if ((lp->l_flags & MDDB_F_DELETED)) { continue; } /* * check to see if locator devt matches given dev * and if there is a device ID associated with it */ flg = s->s_did_icp->did_ic_blkp-> blk_info[li].info_flags; if ((md_expldev(lp->l_dev) == dev) && (flg & MDDB_DID_EXISTS)) { if (flg & MDDB_DID_VALID) { continue; /* cont to nxt active entry */ } devid_ptr = s->s_did_icp->did_ic_devid[li]; if (devid_ptr == NULL) { return (MDDB_E_NODEVID); } /* * During a replicated import the old_didptr * must match the current devid before the * devid can be updated. 
*/ if (repl_import_flag) { if (ddi_devid_compare(devid_ptr, old_didptr) != 0) continue; } if (ddi_devid_compare(devid_ptr, didptr) != 0) { /* * devid's not equal so * delete and add */ if (ddi_lyr_get_minor_name( md_dev64_to_dev(dev), S_IFBLK, &minor_name) == DDI_SUCCESS) { (void) mddb_devid_delete(s, li); (void) mddb_devid_add(s, li, didptr, minor_name); kmem_free(minor_name, strlen(minor_name)+1); break; } else { retval = 1; goto err_out; } } } } /* end for */ retval = push_lb(s); (void) upd_med(s, "update_locatorblock(0)"); err_out: return (retval); } static int update_mb_devid( mddb_set_t *s, mddb_ri_t *rip, ddi_devid_t devidptr ) { mddb_mb_ic_t *mbip; mddb_mb_t *mb = NULL; daddr_t blkno; md_dev64_t device; uint_t sz; int mb2free = 0; int err = 0; /* * There is case where a disk may not have mddb, * and only has dummy mddb which contains * a valid devid we like to update and in this * case, the rip_lbp will be NULL but we still * like to update the devid embedded in the * dummy mb block. 
* */ if (rip->ri_mbip != (mddb_mb_ic_t *)NULL) { mbip = rip->ri_mbip; mb = &mbip->mbi_mddb_mb; } else { /* * Done if it is non-replicated set */ if (devidptr != (ddi_devid_t)NULL) { mb = (mddb_mb_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP); mb->mb_magic = MDDB_MAGIC_DU; mb->mb_revision = MDDB_REV_MB; mb2free = 1; } else { goto out; } } blkno = rip->ri_blkno; device = rip->ri_dev; /* * Replace the mb_devid with the new/valid one */ if (devidptr != (ddi_devid_t)NULL) { /* * Zero out what we have previously */ if (mb->mb_devid_len) bzero(mb->mb_devid, mb->mb_devid_len); sz = ddi_devid_sizeof(devidptr); bcopy((char *)devidptr, (char *)mb->mb_devid, sz); mb->mb_devid_len = sz; } mb->mb_setno = s->s_setno; uniqtime32(&mb->mb_timestamp); crcgen(mb, &mb->mb_checksum, MDDB_BSIZE, NULL); /* * putblks will * * - drop the s_dbmx lock * - biowait * - regain the s_dbmx lock * * Need to update this if we wants to handle * mb_next != NULL which it is unlikely will happen */ err = putblks(s, (caddr_t)mb, blkno, 1, device, 0); if (mb2free) { kmem_free(mb, MDDB_BSIZE); } out: return (err); } static int setdid( mddb_config_t *cp ) { ddi_devid_t devidp; dev_t ddi_dev; mddb_set_t *s; int err = 0; mddb_ri_t *rip; /* * Data integrity check */ if (cp->c_setno >= md_nsets || cp->c_devt <= 0) return (EINVAL); if ((md_get_setstatus(cp->c_setno) & MD_SET_STALE)) return (0); ddi_dev = md_dev64_to_dev(cp->c_devt); if (ddi_lyr_get_devid(ddi_dev, &devidp) != DDI_SUCCESS) { return (-1); } if (devidp == NULL) { return (-1); } if ((s = mddb_setenter(cp->c_setno, MDDB_MUSTEXIST, &err)) == NULL) return (-1); single_thread_start(s); for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { if (rip->ri_lbp == (mddb_lb_t *)NULL) continue; /* * We only update what is asked */ if (rip->ri_dev == cp->c_devt) { if (update_mb_devid(s, rip, devidp) != 0) { err = -1; goto out; } } } if (update_locatorblock(s, cp->c_devt, devidp, NULL)) { err = -1; goto out; } out: single_thread_end(s); mddb_setexit(s); 
ddi_devid_free(devidp); return (err); } static int delnewside( mddb_config_t *cp, int command, md_error_t *ep ) { mddb_set_t *s; int li; mddb_lb_t *lbp; /* pointer to locator block */ mddb_ln_t *lnp; /* pointer to locator names */ mddb_mnln_t *mnlnp; /* pointer to locator names */ mddb_locator_t *lp; mddb_sidelocator_t *slp; mddb_cfg_loc_t *clp; int err = 0; set_t setno = cp->c_setno; ddi_devid_t devid; ddi_devid_t ret_devid = NULL; char *minor_name; uint_t use_devid = 0; dev_t ddi_dev; md_mnname_suffix_t *mnsn; mddb_mnlb_t *mnlbp; mddb_mnsidelocator_t *mnslp; /* Currently don't allow addition/deletion of sides during upgrade */ if (MD_UPGRADE) { cmn_err(CE_WARN, "Addition and deletion of sides not allowed" " during upgrade. \n"); return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno)); } /* * Data integrity check */ if (setno >= md_nsets || cp->c_locator.l_dev <= 0) return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) return (mddbstatus2error(ep, err, NODEV32, setno)); single_thread_start(s); clp = &cp->c_locator; lbp = s->s_lbp; if (lbp->lb_setno != setno) { single_thread_end(s); mddb_setexit(s); return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, setno)); } /* * Find this device/blkno pair */ if (lbp->lb_flags & MDDB_DEVID_STYLE) { ddi_dev = md_dev64_to_dev(clp->l_dev); if ((ddi_lyr_get_devid(ddi_dev, &ret_devid) == DDI_SUCCESS) && (ddi_lyr_get_minor_name(ddi_dev, S_IFBLK, &minor_name) == DDI_SUCCESS)) { if (strlen(minor_name) < MDDB_MINOR_NAME_MAX) { clp->l_devid = (uint64_t)(uintptr_t)ret_devid; use_devid = 1; (void) strcpy(clp->l_minor_name, minor_name); } kmem_free(minor_name, strlen(minor_name)+1); } if (use_devid != 1 && ret_devid != NULL) ddi_devid_free(ret_devid); } for (li = 0; li < lbp->lb_loccnt; li++) { lp = &lbp->lb_locators[li]; if (lp->l_flags & MDDB_F_DELETED) continue; if (use_devid) { if ((mddb_devid_get(s, li, &devid, &minor_name)) == 0) continue; if ((ddi_devid_compare(devid, 
(ddi_devid_t)(uintptr_t)clp->l_devid) == 0) && (strcmp(clp->l_minor_name, minor_name) == 0) && ((daddr_t)lp->l_blkno == clp->l_blkno)) { break; } } else { if (lp->l_dev == clp->l_dev && (daddr_t)lp->l_blkno == clp->l_blkno) { break; } } } if (li == lbp->lb_loccnt) { if (use_devid) ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid); single_thread_end(s); mddb_setexit(s); return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, setno)); } lnp = s->s_lnp; if (command == MDDB_NEWSIDE) { int index = 0; /* * If a MN diskset, need to find the index where the new * locator information is to be stored in the mnsidelocator * field of the locator block so that the locator name can * be stored at the same array index in the mnsuffixes * field of the locator names structure. */ if (lbp->lb_flags & MDDB_MNSET) { if ((index = checklocator(lbp, li, cp->c_sideno)) == -1) { if (use_devid) { ddi_devid_free((ddi_devid_t) (uintptr_t)clp->l_devid); } single_thread_end(s); mddb_setexit(s); return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, setno)); } } /* * Store the locator name before the sidelocator information * in case a panic occurs between these 2 steps. Must have * the locator name information in order to print reasonable * error information. 
*/ if (splitname2locatorblock(&cp->c_devname, lnp, li, cp->c_sideno, index)) { if (use_devid) ddi_devid_free( (ddi_devid_t)(uintptr_t)clp->l_devid); single_thread_end(s); mddb_setexit(s); return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, setno)); } if (cfgloc2locator(lbp, clp, li, cp->c_sideno, index)) { if (use_devid) ddi_devid_free( (ddi_devid_t)(uintptr_t)clp->l_devid); single_thread_end(s); mddb_setexit(s); return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, setno)); } } if (use_devid) ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid); if (command == MDDB_DELSIDE) { int i; for (i = 0; i < lbp->lb_loccnt; i++) { if (lbp->lb_flags & MDDB_MNSET) { int j; mnlbp = (mddb_mnlb_t *)lbp; for (j = 0; j < MD_MNMAXSIDES; j++) { mnslp = &mnlbp->lb_mnsidelocators[j][i]; if (mnslp->mnl_sideno == cp->c_sideno) break; } if (j < MD_MNMAXSIDES) { mnslp->mnl_mnum = NODEV32; mnslp->mnl_sideno = 0; mnlnp = (mddb_mnln_t *)lnp; mnsn = &(mnlnp->ln_mnsuffixes[j][i]); bzero((caddr_t)mnsn, sizeof (md_mnname_suffix_t)); } } else { slp = &lbp->lb_sidelocators[cp->c_sideno][i]; bzero((caddr_t)&lnp->ln_suffixes [cp->c_sideno][i], sizeof (md_name_suffix)); slp->l_mnum = NODEV32; } } } /* write new locator names to all devices */ uniqtime32(&lnp->ln_timestamp); if (lbp->lb_flags & MDDB_MNSET) lnp->ln_revision = MDDB_REV_MNLN; else lnp->ln_revision = MDDB_REV_LN; crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL); err |= writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk, lbp->lb_lnblkcnt, 0); /* * If a MN diskset and this is the master, set the PARSE_LOCNM * flag in the mddb_set structure to show that the locator * names have changed. 
*/ if ((lbp->lb_flags & MDDB_MNSET) && (md_set[s->s_setno].s_am_i_master)) { s->s_mn_parseflags |= MDDB_PARSE_LOCNM; } if (err) { if (writeretry(s)) { single_thread_end(s); mddb_setexit(s); return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno)); } } uniqtime32(&lbp->lb_timestamp); /* write new locator to all devices */ err = writelocall(s); (void) upd_med(s, "delnewside(0)"); computefreeblks(s); /* recompute always it may be larger */ if (err) { if (writeretry(s)) { single_thread_end(s); mddb_setexit(s); return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno)); } } single_thread_end(s); mddb_setexit(s); return (0); } static int newdev( mddb_config_t *cp, int command, md_error_t *ep ) { mddb_set_t *s; mddb_mb_ic_t *mbip, *mbip1; int i, j; int li; mddb_lb_t *lbp; /* pointer to locator block */ mddb_ln_t *lnp; /* pointer to locator names */ mddb_locator_t *lp; mddb_cfg_loc_t *clp; int err = 0; set_t setno = cp->c_setno; ddi_devid_t devid2; ddi_devid_t ret_devid = NULL; char *minor_name; uint_t use_devid = 0; dev_t ddi_dev; int old_flags; int flags; int mn_set = 0; int index; mddb_ri_t *rip; int locator_deleted = 0; dev32_t locator_deleted_dev; int sz = 0; /* Currently don't allow addition of new replica during upgrade */ if (MD_UPGRADE) { cmn_err(CE_WARN, "Addition of new replica not allowed during upgrade.\n"); return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno)); } /* * Data integrity check */ if (setno >= md_nsets || cp->c_locator.l_dev <= 0) return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); /* Determine the flag settings for multinode sets */ flags = MDDB_NOOLDOK; if (cp->c_multi_node) flags |= MDDB_MULTINODE; if ((s = mddb_setenter(setno, flags, &err)) == NULL) { if (err != MDDB_E_NOTOWNER) return (mddbstatus2error(ep, err, NODEV32, setno)); s = init_set(cp, flags, &err); if (s == NULL) return (mddbstatus2error(ep, err, NODEV32, setno)); } single_thread_start(s); /* shorthand */ clp = &cp->c_locator; /* shorthand */ lbp = s->s_lbp; if (lbp->lb_setno != 
setno) { single_thread_end(s); mddb_setexit(s); return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, setno)); } /* * See if this device/blkno pair is already a replica */ if (lbp->lb_flags & MDDB_DEVID_STYLE) { ddi_dev = expldev(clp->l_dev); if ((ddi_lyr_get_devid(ddi_dev, &ret_devid) == DDI_SUCCESS) && (ddi_lyr_get_minor_name(ddi_dev, S_IFBLK, &minor_name) == DDI_SUCCESS)) { if (strlen(minor_name) < MDDB_MINOR_NAME_MAX) { clp->l_devid = (uint64_t)(uintptr_t)ret_devid; use_devid = 1; (void) strcpy(clp->l_minor_name, minor_name); } kmem_free(minor_name, strlen(minor_name)+1); } if (use_devid != 1 && ret_devid != NULL) ddi_devid_free(ret_devid); } for (i = 0; i < lbp->lb_loccnt; i++) { lp = &lbp->lb_locators[i]; if (lp->l_flags & MDDB_F_DELETED) continue; if (use_devid) { if ((mddb_devid_get(s, i, &devid2, &minor_name)) == 0) continue; if ((ddi_devid_compare(devid2, (ddi_devid_t)(uintptr_t)clp->l_devid) == 0) && (strcmp(clp->l_minor_name, minor_name) == 0) && ((daddr_t)lp->l_blkno == clp->l_blkno)) { if (command == MDDB_NEWDEV) { ddi_devid_free((ddi_devid_t)(uintptr_t) clp->l_devid); single_thread_end(s); mddb_setexit(s); return (mdmddberror(ep, MDE_DB_EXISTS, NODEV32, setno)); } } } else { if (lp->l_dev == clp->l_dev && (daddr_t)lp->l_blkno == clp->l_blkno) { if (command == MDDB_NEWDEV) { single_thread_end(s); mddb_setexit(s); return (mdmddberror(ep, MDE_DB_EXISTS, NODEV32, setno)); } } } } /* * Really is a new replica, go get the master blocks */ mbip = getmasters(s, md_expldev(clp->l_dev), clp->l_blkno, (uint_t *)0, &mn_set); if (! mbip) { if (use_devid) ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid); single_thread_end(s); mddb_setexit(s); return (mdmddberror(ep, MDE_DB_MASTER, NODEV32, setno)); } /* * Compute free blocks in replica. 
*/ computefreeblks(s); /* * Check if this is large enough */ for (mbip1 = mbip, i = 0; mbip1 != NULL; mbip1 = mbip1->mbi_next) i += mbip1->mbi_mddb_mb.mb_blkcnt; for (j = i; j < s->s_totalblkcnt; j++) { if (blkcheck(s, j)) { while (mbip) { mbip1 = mbip->mbi_next; kmem_free((caddr_t)mbip, MDDB_IC_BSIZE); mbip = mbip1; } if (use_devid) ddi_devid_free( (ddi_devid_t)(uintptr_t)clp->l_devid); mddb_devclose(md_expldev(clp->l_dev)); single_thread_end(s); mddb_setexit(s); return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, setno)); } } /* Look for a deleted slot */ for (li = 0; li < lbp->lb_loccnt; li++) { lp = &lbp->lb_locators[li]; if (lp->l_flags & MDDB_F_DELETED) { locator_deleted = 1; locator_deleted_dev = lp->l_dev; break; } } /* If no deleted slots, add a new one */ if (li == lbp->lb_loccnt) { /* Already have the max replicas, bail */ if (lbp->lb_loccnt == MDDB_NLB) { if (use_devid) ddi_devid_free((ddi_devid_t)(uintptr_t) clp->l_devid); mddb_devclose(md_expldev(clp->l_dev)); single_thread_end(s); mddb_setexit(s); return (mdmddberror(ep, MDE_TOOMANY_REPLICAS, NODEV32, setno)); } lbp->lb_loccnt++; lp = &lbp->lb_locators[li]; } /* Initialize the new or deleted slot */ old_flags = lp->l_flags; lp->l_dev = clp->l_dev; lp->l_blkno = (daddr32_t)clp->l_blkno; lp->l_flags = clp->l_flags; /* shorthand */ lnp = s->s_lnp; index = 0; if ((lbp->lb_flags & MDDB_MNSET) || (flags & MDDB_MULTINODE)) { /* * If a MN diskset, need to find the index where the new * locator information is to be stored in the mnsidelocator * field of the locator block so that the locator name can * be stored at the same array index in the mnsuffixes * field of the locator names structure. 
*/ lbp->lb_flags |= MDDB_MNSET; if ((index = checklocator(lbp, li, s->s_sideno)) == -1) { if (use_devid) ddi_devid_free((ddi_devid_t)(uintptr_t)clp-> l_devid); lp->l_flags = old_flags; lbp->lb_loccnt--; mddb_devclose(md_expldev(clp->l_dev)); single_thread_end(s); mddb_setexit(s); return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, setno)); } } /* * Store the locator name before the sidelocator information * in case a panic occurs between these 2 steps. Must have * the locator name information in order to print reasonable * error information. */ if (splitname2locatorblock(&cp->c_devname, lnp, li, s->s_sideno, index)) { if (use_devid) ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid); lp->l_flags = old_flags; lbp->lb_loccnt--; mddb_devclose(md_expldev(clp->l_dev)); single_thread_end(s); mddb_setexit(s); return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, setno)); } /* * Compute free blocks in replica before calling cfgloc2locator * since cfgloc2locator may attempt to alloc an unused block * to store the device id. * mbiarray needs to be setup before calling computefreeblks. 
*/ s->s_mbiarray[li] = mbip; computefreeblks(s); if (cfgloc2locator(lbp, clp, li, s->s_sideno, index)) { if (use_devid) ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid); lp->l_flags = old_flags; lbp->lb_loccnt--; s->s_mbiarray[li] = 0; mddb_devclose(md_expldev(clp->l_dev)); single_thread_end(s); mddb_setexit(s); return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, setno)); } /* * Hijack a deleted rip master record and correct the contents */ if (locator_deleted) { for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { if (rip->ri_lbp != NULL && rip->ri_mbip == 0 && (rip->ri_dev == md_expldev(locator_deleted_dev))) { rip->ri_dev = md_expldev(clp->l_dev); rip->ri_mbip = mbip; if (use_devid && clp->l_devid != 0) { sz = (int)ddi_devid_sizeof( (ddi_devid_t)(uintptr_t) clp->l_devid); rip->ri_devid = (ddi_devid_t)kmem_zalloc(sz, KM_SLEEP); bcopy((void *)(uintptr_t)clp->l_devid, (char *)rip->ri_devid, sz); } break; } } } if (use_devid) ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid); uniqtime32(&lbp->lb_timestamp); lp->l_flags = MDDB_F_ACTIVE; /* write db copy to new device */ err = writecopy(s, li, MDDB_WRITECOPY_ALL); lp->l_flags |= MDDB_F_UP2DATE; /* write new locator names to all devices */ uniqtime32(&lnp->ln_timestamp); if (lbp->lb_flags & MDDB_MNSET) lnp->ln_revision = MDDB_REV_MNLN; else lnp->ln_revision = MDDB_REV_LN; crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL); err |= writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk, lbp->lb_lnblkcnt, 0); /* * If a MN diskset and this is the master, set the PARSE_LOCNM * flag in the mddb_set structure to show that the locator * names have changed. 
*/ if ((lbp->lb_flags & MDDB_MNSET) && (md_set[s->s_setno].s_am_i_master)) { s->s_mn_parseflags |= MDDB_PARSE_LOCNM; } if (err) { if (writeretry(s)) { single_thread_end(s); mddb_setexit(s); return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno)); } } /* Data tags not supported on MN sets */ if ((md_get_setstatus(setno) & MD_SET_STALE) && (!(lbp->lb_flags & MDDB_MNSET)) && setno != MD_LOCAL_SET) if (set_dtag(s, ep)) mdclrerror(ep); /* Write data tags to all accessible devices */ /* Data tags not supported on MN sets */ if (!(lbp->lb_flags & MDDB_MNSET)) { (void) dt_write(s); } /* write new locator to all devices */ err = writelocall(s); (void) upd_med(s, "newdev(0)"); SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_CREATE, SVM_TAG_REPLICA, setno, md_expldev(clp->l_dev)); computefreeblks(s); /* recompute always it may be smaller */ if (err) { if (writeretry(s)) { single_thread_end(s); mddb_setexit(s); return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno)); } } single_thread_end(s); mddb_setexit(s); return (0); } #ifdef DEBUG static void mddb_check_set( set_t setno ) { mddb_set_t *s; mddb_db_t *dbp; mddb_de_ic_t *dep; mddb_rb32_t *rbp; if (! md_set[setno].s_db) return; s = (mddb_set_t *)md_set[setno].s_db; for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next) { rbp = dep->de_rb; ASSERT(rbp->rb_magic == MDDB_MAGIC_RB); if (dep->de_rb_userdata) ASSERT((uintptr_t)dep->de_rb_userdata > 2000); } } } #endif /* DEBUG */ /* * Exported Entry Points */ #ifdef DEBUG void mddb_check(void) { int i; for (i = 0; i < md_nsets; i++) { if (! 
md_set[i].s_db) return; mddb_check_set(i); } } #endif /* DEBUG */ int mddb_configure( mddb_cfgcmd_t command, mddb_config_t *cp ) { mddb_set_t *s; md_error_t *ep = &cp->c_mde; int flag = 0; int err = 0; set_t setno = cp->c_setno; mdclrerror(ep); switch (command) { case MDDB_NEWDEV: err = newdev(cp, command, ep); break; case MDDB_NEWSIDE: case MDDB_DELSIDE: err = delnewside(cp, command, ep); break; case MDDB_GETDEV: case MDDB_DELDEV: case MDDB_ENDDEV: err = getdeldev(cp, command, ep); break; case MDDB_GETDRVRNAME: err = getdriver(&cp->c_locator); break; case MDDB_USEDEV: /* * Note: must allow USEDEV ioctl during upgrade to * support auto-take disksets. * * Also during the set import if the md_devid_destroy * flag is set then error out */ if ((cp->c_flags & MDDB_C_IMPORT) && md_devid_destroy) return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); if (setno >= md_nsets) return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL) { if ((s = init_set(cp, MDDB_NOINIT, &err)) == NULL) { err = mddbstatus2error(ep, err, NODEV32, setno); break; } } if (setno == MD_LOCAL_SET) flag = MDDB_F_IOCTL; if (cp->c_locator.l_old_devid) { md_set_setstatus(setno, MD_SET_REPLICATED_IMPORT); } err = ridev(&s->s_rip, &cp->c_locator, NULL, flag); mddb_setexit(s); break; case MDDB_RELEASESET: mutex_enter(&mddb_lock); mddb_unload_set(cp->c_setno); mutex_exit(&mddb_lock); break; case MDDB_SETDID: err = setdid(cp); break; default: err = mdmddberror(ep, MDE_DB_INVALID, NODEV32, cp->c_setno); } return (err); } int mddb_getoptloc( mddb_optloc_t *ol ) { mddb_set_t *s; mddb_db_t *dbp; mddb_de_ic_t *dep; mddb_recid_t id; set_t setno; ol->li[0] = -1; ol->li[1] = -1; id = ol->recid; setno = DBSET(id); if (setno >= md_nsets) return (EINVAL); if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, NULL)) == NULL) return (0); id = DBID(id); for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next) { if 
(dep->de_recid != id) continue; ol->li[0] = dep->de_optinfo[0].o_li; ol->li[1] = dep->de_optinfo[1].o_li; mddb_setexit(s); return (0); } } mddb_setexit(s); return (0); } void mddb_init(void) { mddb_set_t *s; mutex_init(&mddb_lock, NULL, MUTEX_DEFAULT, NULL); if ((s = init_set(NULL, MDDB_NOINIT, NULL)) != NULL) mddb_setexit(s); } void mddb_unload(void) { int i; mutex_enter(&mddb_lock); for (i = 0; i < md_nsets; i++) { md_clr_setstatus(i, MD_SET_KEEPTAG); mddb_unload_set(i); } crcfreetab(); mutex_exit(&mddb_lock); } mddb_recid_t mddb_createrec( size_t usersize, /* size of db record */ mddb_type_t type, /* type1 of db record */ uint_t type2, /* type2 of db record */ md_create_rec_option_t options, /* options for this creation */ set_t setno /* set number to create record in */ ) { mddb_set_t *s; mddb_db_t *dbp, *prevdbp, *newdbp; mddb_db32_t *db32p; mddb_de_ic_t *dep; /* LINTED variable unused - used for sizeof calculations */ mddb_de32_t *de32p; mddb_rb32_t *rbp; size_t recsize; ulong_t blkcnt; ulong_t maxblocks; size_t desize, desize_ic; size_t used; mddb_recid_t newid; caddr_t tmppnt; int i, err = 0; void *userdata; uint_t flag_type; #if defined(_ILP32) && !defined(lint) ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t)); ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t)); ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t)); #endif /* * everyone is supposed to sepcify if it's a * 32 bit or a 64 bit record */ if ((options &(MD_CRO_32BIT|MD_CRO_64BIT)) == 0) { return (MDDB_E_INVALID); } if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) return (err); if (checkstate(s, MDDB_PROBE)) { mddb_setexit(s); return (MDDB_E_NOTNOW); } recsize = roundup((sizeof (*rbp) - sizeof (rbp->rb_data)) + usersize, MDDB_BSIZE); blkcnt = btodb(recsize); if (mddb_maxblocks) maxblocks = mddb_maxblocks; else maxblocks = (MDDB_BSIZE - (sizeof (*db32p) + sizeof (*de32p) - sizeof (de32p->de32_blks))) / sizeof (mddb_block_t); if (blkcnt > maxblocks) { mddb_setexit(s); return 
(MDDB_E_INVALID); } /* * allocate record block * and new directory block so to avoid sleeping * after starting single_thread */ rbp = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP); if ((options & MD_CRO_OPTIMIZE) == 0) userdata = kmem_zalloc(usersize, KM_SLEEP); newdbp = (mddb_db_t *)kmem_zalloc(sizeof (*newdbp), KM_SLEEP); /* * if this is the largest record allocate new buffer for * checkcopy(); */ if (recsize > s->s_databuffer_size) { tmppnt = (caddr_t)kmem_zalloc(recsize, KM_SLEEP); /* * this test is incase when to sleep during kmem_alloc * and some other task bumped max record size */ if (recsize > s->s_databuffer_size) { if (s->s_databuffer_size) kmem_free(s->s_databuffer, s->s_databuffer_size); s->s_databuffer = tmppnt; s->s_databuffer_size = recsize; } else { kmem_free(tmppnt, recsize); } } single_thread_start(s); newid = 0; do { newid++; if (DBID(newid) == 0) { kmem_free((caddr_t)newdbp, sizeof (*newdbp)); kmem_free((caddr_t)rbp, ((size_t)recsize)); if ((options & MD_CRO_OPTIMIZE) == 0) kmem_free(userdata, usersize); single_thread_end(s); mddb_setexit(s); return (MDDB_E_NOTNOW); } for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { if (dep->de_recid == newid) break; } if (dep != NULL) break; } } while (dbp); desize = (sizeof (*de32p) - sizeof (de32p->de32_blks)) + (sizeof (mddb_block_t) * blkcnt); /* * see if a directory block exists which will hold this entry */ for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { used = sizeof (*db32p); for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next) { used += sizeof (*de32p) - sizeof (de32p->de32_blks); used += sizeof (mddb_block_t) * dep->de_blkcount; } if ((used + desize) < MDDB_BSIZE) break; } if (dbp) { kmem_free((caddr_t)newdbp, sizeof (*newdbp)); if (blkcnt > s->s_freeblkcnt) { kmem_free((caddr_t)rbp, ((size_t)recsize)); if ((options & MD_CRO_OPTIMIZE) == 0) kmem_free(userdata, usersize); single_thread_end(s); mddb_setexit(s); return 
(MDDB_E_NOSPACE); } prevdbp = NULL; } else { /* * need to add directory block */ if ((blkcnt + 1) > s->s_freeblkcnt) { kmem_free((caddr_t)newdbp, sizeof (*newdbp)); kmem_free((caddr_t)rbp, ((size_t)recsize)); if ((options & MD_CRO_OPTIMIZE) == 0) kmem_free(userdata, usersize); single_thread_end(s); mddb_setexit(s); return (MDDB_E_NOSPACE); } for (dbp = s->s_dbp; dbp->db_next; dbp = dbp->db_next) ; dbp->db_next = newdbp; bzero((caddr_t)dbp->db_next, sizeof (*newdbp)); dbp->db_nextblk = getfreeblks(s, 1); dbp->db_next->db_blknum = dbp->db_nextblk; prevdbp = dbp; dbp = dbp->db_next; dbp->db_nextblk = 0; dbp->db_firstentry = NULL; dbp->db_recsum = 0; dbp->db_magic = MDDB_MAGIC_DB; } /* * ready to add record */ desize_ic = (sizeof (*dep) - sizeof (dep->de_blks)) + (sizeof (mddb_block_t) * blkcnt); if (dbp->db_firstentry) { for (dep = dbp->db_firstentry; dep->de_next; dep = dep->de_next) ; dep->de_next = (mddb_de_ic_t *)kmem_zalloc(desize_ic, KM_SLEEP); dep = dep->de_next; } else { dep = (mddb_de_ic_t *)kmem_zalloc(desize_ic, KM_SLEEP); dbp->db_firstentry = dep; } bzero((caddr_t)dep, desize_ic); dep->de_recid = newid; /* * Optimized records have an owner node associated with them in * a MN diskset. The owner is only set on a node that is actively * writing to that record. The other nodes will show that record * as having an invalid owner. The owner for an optimized record * is used during fixoptrecord to determine which node should * write out the record when the replicas associated with that * optimized record have been changed. 
*/ if (MD_MNSET_SETNO(s->s_setno)) { dep->de_owner_nodeid = MD_MN_INVALID_NID; } dep->de_type1 = type; dep->de_type2 = type2; dep->de_reqsize = usersize; dep->de_recsize = recsize; dep->de_blkcount = blkcnt; flag_type = options & (MD_CRO_OPTIMIZE | MD_CRO_STRIPE | MD_CRO_MIRROR | MD_CRO_RAID | MD_CRO_SOFTPART | MD_CRO_TRANS_MASTER | MD_CRO_TRANS_LOG | MD_CRO_HOTSPARE | MD_CRO_HOTSPARE_POOL | MD_CRO_CHANGELOG); switch (flag_type) { case MD_CRO_OPTIMIZE: dep->de_flags = MDDB_F_OPT; getoptdev(s, dep, 0); getoptdev(s, dep, 1); break; case MD_CRO_STRIPE: dep->de_flags = MDDB_F_STRIPE; break; case MD_CRO_MIRROR: dep->de_flags = MDDB_F_MIRROR; break; case MD_CRO_RAID: dep->de_flags = MDDB_F_RAID; break; case MD_CRO_SOFTPART: dep->de_flags = MDDB_F_SOFTPART; break; case MD_CRO_TRANS_MASTER: dep->de_flags = MDDB_F_TRANS_MASTER; break; case MD_CRO_TRANS_LOG: dep->de_flags = MDDB_F_TRANS_LOG; break; case MD_CRO_HOTSPARE: dep->de_flags = MDDB_F_HOTSPARE; break; case MD_CRO_HOTSPARE_POOL: dep->de_flags = MDDB_F_HOTSPARE_POOL; break; case MD_CRO_CHANGELOG: dep->de_flags = MDDB_F_CHANGELOG; break; } /* * try to get all blocks consecutive. If not possible * just get them one at a time */ dep->de_blks[0] = getfreeblks(s, blkcnt); if (dep->de_blks[0]) { for (i = 1; i < blkcnt; i++) dep->de_blks[i] = dep->de_blks[0] + i; } else { for (i = 0; i < blkcnt; i++) dep->de_blks[i] = getfreeblks(s, 1); } dep->de_rb = rbp; bzero((caddr_t)rbp, recsize); rbp->rb_magic = MDDB_MAGIC_RB; /* Do we have to create an old style (32 bit) record? 
*/ if (options & MD_CRO_32BIT) { if (options & MD_CRO_FN) rbp->rb_revision = MDDB_REV_RBFN; else rbp->rb_revision = MDDB_REV_RB; } else { if (options & MD_CRO_FN) rbp->rb_revision = MDDB_REV_RB64FN; else rbp->rb_revision = MDDB_REV_RB64; } /* set de_rb_userdata for non optimization records */ if ((options & MD_CRO_OPTIMIZE) == 0) { dep->de_rb_userdata = userdata; } uniqtime32(&rbp->rb_timestamp); /* Generate the crc for this record */ rec_crcgen(s, dep, rbp); tmppnt = (caddr_t)rbp; /* * the following code writes new records to all instances of * the data base. Writing one block at a time to each instance * is safe because they are not yet in a directory entry which * has been written to the data base */ err = 0; if ((options & MD_CRO_OPTIMIZE) == 0) { for (i = 0; i < blkcnt; i++) { err |= writeall(s, (caddr_t)tmppnt, dep->de_blks[i], 1, 0); tmppnt += MDDB_BSIZE; } } else { if ((MD_MNSET_SETNO(s->s_setno)) && md_set[s->s_setno].s_am_i_master) { /* * If a MN diskset then only master writes out newly * created optimized record. 
*/ err |= writeoptrecord(s, dep); } } uniqtime32(&dbp->db_timestamp); dbp->db_revision = MDDB_REV_DB; /* Don't include opt resync and change log records in global XOR */ if (!(dep->de_flags & MDDB_F_OPT) && !(dep->de_flags & MDDB_F_CHANGELOG)) dbp->db_recsum ^= rbp->rb_checksum; db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP); create_db32rec(db32p, dbp); crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL); err |= writeall(s, (caddr_t)db32p, dbp->db_blknum, 1, 0); if (prevdbp) { dbp = prevdbp; uniqtime32(&dbp->db_timestamp); dbp->db_revision = MDDB_REV_DB; create_db32rec(db32p, dbp); crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL); err |= writeall(s, (caddr_t)db32p, dbp->db_blknum, 1, 0); } kmem_free((caddr_t)db32p, MDDB_BSIZE); if (err) { if (writeretry(s)) { s->s_zombie = newid; single_thread_end(s); mddb_setexit(s); return (MDDB_E_NOTNOW); } } single_thread_end(s); mddb_setexit(s); ASSERT((newid & MDDB_SETMASK) == 0); return (MAKERECID(setno, newid)); } int mddb_deleterec( mddb_recid_t id ) { mddb_set_t *s; mddb_db_t *dbp; mddb_db32_t *db32p; mddb_de_ic_t *dep, *dep1; int i; #if defined(_ILP32) && !defined(lint) ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t)); ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t)); #endif s = mddb_setenter(DBSET(id), MDDB_NOINIT, NULL); ASSERT(s != NULL); id = DBID(id); if (checkstate(s, MDDB_PROBE)) { mddb_setexit(s); return (MDDB_E_NOTNOW); } ASSERT(s->s_lbp != NULL); single_thread_start(s); for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { dep1 = NULL; for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { if (dep->de_recid == id) break; dep1 = dep; } if (dep != NULL) break; } /* * no such record */ if (dep == NULL) { single_thread_end(s); ASSERT(s->s_staledeletes != 0); s->s_staledeletes--; mddb_setexit(s); return (0); } if (!(dep->de_flags & MDDB_F_OPT) && !(dep->de_flags & MDDB_F_CHANGELOG)) { dbp->db_recsum ^= dep->de_rb->rb_checksum; dbp->db_recsum ^= dep->de_rb->rb_checksum_fiddle; } if 
(dep->de_rb_userdata != NULL) { if (dep->de_icreqsize) kmem_free(dep->de_rb_userdata_ic, dep->de_icreqsize); else kmem_free(dep->de_rb_userdata, dep->de_reqsize); } kmem_free((caddr_t)dep->de_rb, dep->de_recsize); for (i = 0; i < dep->de_blkcount; i++) blkfree(s, dep->de_blks[i]); if (dep1) dep1->de_next = dep->de_next; else dbp->db_firstentry = dep->de_next; kmem_free(dep, sizeofde(dep)); uniqtime32(&dbp->db_timestamp); dbp->db_revision = MDDB_REV_DB; db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP); create_db32rec(db32p, dbp); crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL); if (writeall(s, (caddr_t)db32p, dbp->db_blknum, 1, 0)) { if (writeretry(s)) { /* * staledelete is used to mark deletes which failed. * its only use is to not panic when the user retries * the delete once the database is active again */ single_thread_end(s); s->s_staledeletes++; kmem_free((caddr_t)db32p, MDDB_BSIZE); mddb_setexit(s); return (MDDB_E_NOTNOW); } } single_thread_end(s); kmem_free((caddr_t)db32p, MDDB_BSIZE); mddb_setexit(s); return (0); } mddb_recid_t mddb_getnextrec( mddb_recid_t id, mddb_type_t typ, uint_t type2 ) { mddb_set_t *s; mddb_db_t *dbp; mddb_de_ic_t *dep; int searching, err; set_t setno; setno = DBSET(id); id = DBID(id); searching = id; if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) return (err); for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next) { if (searching) { if (dep->de_recid == id) searching = 0; } else { if ((typ == MDDB_ALL || dep->de_type1 == typ) && (type2 == 0 || dep->de_type2 == type2)) { id = dep->de_recid; mddb_setexit(s); ASSERT((id & MDDB_SETMASK) == 0); return (MAKERECID(setno, id)); } } } } mddb_setexit(s); if (searching) return (MDDB_E_NORECORD); return (0); } void * mddb_getrecaddr( mddb_recid_t id ) { mddb_set_t *s; mddb_db_t *dbp; mddb_de_ic_t *dep; void *rval; if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL) return (NULL); id = 
DBID(id); for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next) { if (dep->de_recid != id) continue; if (dep->de_rb_userdata) rval = (void *)dep->de_rb_userdata; else rval = (void *)dep->de_rb->rb_data; mddb_setexit(s); return (rval); } } mddb_setexit(s); return (NULL); } mddb_de_ic_t * mddb_getrecdep( mddb_recid_t id ) { mddb_set_t *s; mddb_db_t *dbp; mddb_de_ic_t *dep; if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL) return (NULL); id = DBID(id); for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next) { if (dep->de_recid != id) continue; mddb_setexit(s); return (dep); } } mddb_setexit(s); return (NULL); } void * mddb_getrecaddr_resize( mddb_recid_t id, size_t icsize, off_t off ) { mddb_set_t *s; mddb_db_t *dbp; mddb_de_ic_t *dep; void *rval = NULL; if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL) return (NULL); id = DBID(id); for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next) { if (dep->de_recid != id) continue; if (dep->de_rb_userdata) rval = (void *)dep->de_rb_userdata; else rval = (void *)dep->de_rb->rb_data; break; } if (rval != NULL) break; } if (rval == NULL) { mddb_setexit(s); return (NULL); } if (dep->de_rb_userdata) { caddr_t nud; if (dep->de_icreqsize || (dep->de_reqsize >= icsize)) { mddb_setexit(s); return (rval); } ASSERT((dep->de_reqsize + off) <= icsize); nud = kmem_zalloc(icsize, KM_SLEEP); bcopy(dep->de_rb_userdata, nud + off, dep->de_reqsize); kmem_free(dep->de_rb_userdata, dep->de_reqsize); dep->de_rb_userdata = nud + off; dep->de_rb_userdata_ic = nud; dep->de_icreqsize = icsize; rval = nud; } else { size_t recsize; /* LINTED variable unused - used for sizeof calculations */ mddb_rb32_t *nrbp; recsize = roundup((sizeof (*nrbp) - sizeof (nrbp->rb_data)) + icsize, MDDB_BSIZE); if (dep->de_recsize < recsize) 
cmn_err(CE_PANIC, "mddb_getrecaddr_resize: only " "nonoptimized records can be resized\n"); } mddb_setexit(s); return (rval); } int mddb_getrecprivate( mddb_recid_t id ) { mddb_set_t *s; mddb_db_t *dbp; mddb_de_ic_t *dep; int err = 0; int private; if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL) return (err); id = DBID(id); for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next) { if (dep->de_recid != id) continue; private = (int)dep->de_rb->rb_private; mddb_setexit(s); return (private); } } mddb_setexit(s); return (MDDB_E_NORECORD); } void mddb_setrecprivate( mddb_recid_t id, uint_t private ) { mddb_set_t *s; mddb_db_t *dbp; mddb_de_ic_t *dep; if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL) { ASSERT(0); return; } id = DBID(id); for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next) { if (dep->de_recid != id) continue; dep->de_rb->rb_private = private; mddb_setexit(s); return; } } mddb_setexit(s); ASSERT(0); } mddb_type_t mddb_getrectype1( mddb_recid_t id ) { mddb_set_t *s; mddb_db_t *dbp; mddb_de_ic_t *dep; int err = 0; mddb_type_t rval; if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL) return (err); id = DBID(id); for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next) { if (dep->de_recid != id) continue; rval = dep->de_type1; mddb_setexit(s); return (rval); } } mddb_setexit(s); return (MDDB_E_NORECORD); } int mddb_getrectype2( mddb_recid_t id ) { mddb_set_t *s; mddb_db_t *dbp; mddb_de_ic_t *dep; int err = 0; int rval; if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL) return (err); id = DBID(id); for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next) { if (dep->de_recid != id) continue; rval = (int)dep->de_type2; mddb_setexit(s); return (rval); 
} } mddb_setexit(s); return (MDDB_E_NORECORD); } int mddb_getrecsize( mddb_recid_t id ) { mddb_set_t *s; mddb_db_t *dbp; mddb_de_ic_t *dep; int err = 0; int rval; if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL) return (err); id = DBID(id); for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next) { if (dep->de_recid != id) continue; rval = (int)dep->de_reqsize; mddb_setexit(s); return (rval); } } mddb_setexit(s); return (MDDB_E_NORECORD); } mddb_recstatus_t mddb_getrecstatus( mddb_recid_t id ) { mddb_set_t *s; mddb_db_t *dbp; mddb_de_ic_t *dep; int err = 0; mddb_recstatus_t e_err; if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL) return ((mddb_recstatus_t)err); id = DBID(id); for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next) { if (dep->de_recid == id) break; } if (dep) break; } e_err = MDDB_OK; if (! dep) e_err = MDDB_NORECORD; else if (! dep->de_rb->rb_commitcnt) e_err = MDDB_NODATA; else if (md_get_setstatus(s->s_setno) & MD_SET_STALE) e_err = MDDB_STALE; mddb_setexit(s); return (e_err); } static int mddb_commitrec_retries = 5; /* * Commit given record to disk. * If committing an optimized record, do not call * with md ioctl lock held. 
*/ int mddb_commitrec( mddb_recid_t id ) { mddb_set_t *s; mddb_db_t *dbp; mddb_de_ic_t *dep; mddb_recid_t ids[2]; mddb_rb32_t *rbp; static int err = 0; md_mn_msg_mddb_optrecerr_t *msg_recerr; md_mn_kresult_t *kres; mddb_lb_t *lbp; mddb_mnlb_t *mnlbp; mddb_locator_t *lp; mddb_mnsidelocator_t *mnslp; mddb_drvnm_t *dn; int li; md_replica_recerr_t *recerr; int i, j; int rval; int hit_err = 0; int retry = mddb_commitrec_retries; int gave_up = 0; s = mddb_setenter(DBSET(id), MDDB_NOINIT, NULL); ASSERT(s != NULL); if (checkstate(s, MDDB_PROBE)) { mddb_setexit(s); return (MDDB_E_NOTNOW); } if (DBID(id) == 0) { mddb_setexit(s); return (0); } for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { if (dep->de_recid == DBID(id)) break; } if (dep) break; } if (dep == NULL) { mddb_setexit(s); return (MDDB_E_NORECORD); } if (! (dep->de_flags & MDDB_F_OPT)) { ids[0] = id; ids[1] = 0; mddb_setexit(s); return (mddb_commitrecs(ids)); } /* * following code allows multiple processes to be doing * optimization commits in parallel. * NOTE: if lots of optimization commits then the lock * will not get released until it winds down */ if (s->s_optwaiterr) { while (s->s_optwaiterr) { s->s_opthungerr = 1; cv_wait(&s->s_opthungerr_cv, SETMUTEX(s->s_setno)); } if (checkstate(s, MDDB_PROBE)) { mddb_setexit(s); return (MDDB_E_NOTNOW); } } if (s->s_optcmtcnt++ == 0) { single_thread_start(s); s->s_opthavelck = 1; if (s->s_optwantlck) { cv_broadcast(&s->s_optwantlck_cv); s->s_optwantlck = 0; } } else { while (! s->s_opthavelck) { s->s_optwantlck = 1; cv_wait(&s->s_optwantlck_cv, SETMUTEX(s->s_setno)); } } for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { if (dep->de_recid == DBID(id)) break; } if (dep) break; } if (dep == NULL) { if (! 
(--s->s_optcmtcnt)) { single_thread_end(s); s->s_opthavelck = 0; } mddb_setexit(s); return (MDDB_E_NORECORD); } rbp = dep->de_rb; rbp->rb_commitcnt++; uniqtime32(&rbp->rb_timestamp); /* Generate the crc for this record */ rec_crcgen(s, dep, rbp); if (writeoptrecord(s, dep)) { if (MD_MNSET_SETNO(s->s_setno)) { hit_err = 1; } s->s_optwaiterr++; } if (MD_MNSET_SETNO(s->s_setno)) { /* If last thread out, release single_thread_start */ if (! (--s->s_optcmtcnt)) { single_thread_end(s); s->s_opthavelck = 0; } /* * If this thread had a writeoptrecords failure, then * need to send message to master. * But, multiple threads could all be running on the * same single_thread_start, so serialize the threads * by making each thread grab single_thread_start. * * After return from sending message to master message, * replicas associated with optimized record will havei * been changed (via a callback from the master to all * nodes), so retry call to writeoptrecord. * This code is replacing the call to writeretry that * occurs for the local and traditional disksets. */ if (hit_err) { single_thread_start(s); /* * If > 50% of replicas are alive then continue * to send message to master until writeoptrecord * succeeds. For now, assume that minor name, * major number on this node is the same as on * the master node. Once devids are turned on * for MN disksets, can send devid. 
*/ kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); msg_recerr = kmem_zalloc( sizeof (md_mn_msg_mddb_optrecerr_t), KM_SLEEP); while (!(md_get_setstatus(s->s_setno) & MD_SET_TOOFEW)) { bzero((caddr_t)msg_recerr, sizeof (md_mn_msg_mddb_optrecerr_t)); lbp = s->s_lbp; mnlbp = (mddb_mnlb_t *)lbp; for (i = 0; i < 2; i++) { li = dep->de_optinfo[i].o_li; lp = &lbp->lb_locators[li]; for (j = 0; j < MD_MNMAXSIDES; j++) { mnslp = &mnlbp-> lb_mnsidelocators[j][li]; if (mnslp->mnl_sideno == s->s_sideno) break; } if (j == MD_MNMAXSIDES) continue; dn = &lbp-> lb_drvnm[mnslp->mnl_drvnm_index]; recerr = &msg_recerr->msg_recerr[i]; recerr->r_li = li; recerr->r_flags = dep->de_optinfo[i].o_flags; recerr->r_blkno = lp->l_blkno; recerr->r_mnum = md_getminor(lp->l_dev); (void) strncpy(recerr->r_driver_name, dn->dn_data, MD_MAXDRVNM); } /* Release locks */ single_thread_end(s); mutex_exit(SETMUTEX(s->s_setno)); /* * Send message to master about optimized * record failure. After return, master * should have marked failed replicas * and sent parse message to slaves causing * slaves to have fixed up the optimized * record. * On return from ksend_message, retry * the write since this node should have fixed * the optimized resync records it owns. 
*/ rval = mdmn_ksend_message(s->s_setno, MD_MN_MSG_MDDB_OPTRECERR, MD_MSGF_NO_BCAST, 0, (char *)msg_recerr, sizeof (md_mn_msg_mddb_optrecerr_t), kres); if (!MDMN_KSEND_MSG_OK(rval, kres)) { cmn_err(CE_WARN, "mddb_commitrec: " "Unable to send optimized " "resync record failure " "message to other nodes in " "diskset %s\n", s->s_setname); mdmn_ksend_show_error(rval, kres, "MD_MN_MSG_MDDB_OPTRECERR"); } /* Regrab locks */ mutex_enter(SETMUTEX(s->s_setno)); single_thread_start(s); /* Start over in case mddb changed */ for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { if (dep->de_recid == DBID(id)) break; } if (dep) break; } if (dep) { rbp = dep->de_rb; rbp->rb_commitcnt++; uniqtime32(&rbp->rb_timestamp); /* Generate the crc for this record */ rec_crcgen(s, dep, rbp); /* * If writeoptrecord succeeds, then * break out. */ if (!(writeoptrecord(s, dep))) break; } if (--retry == 0) { cmn_err(CE_WARN, "mddb_commitrec: " "giving up writing optimized " "resync record for " "diskset %s, device %s,%d " "blkno 0x%x, flags 0x%x\n", s->s_setname, recerr->r_driver_name, recerr->r_mnum, recerr->r_blkno, recerr->r_flags); gave_up++; break; } } kmem_free(kres, sizeof (md_mn_kresult_t)); kmem_free(msg_recerr, sizeof (md_mn_msg_mddb_optrecerr_t)); /* Resync record should be fixed - if possible */ s->s_optwaiterr--; if (s->s_optwaiterr == 0) { /* All errors have been handled */ if (s->s_opthungerr) { s->s_opthungerr = 0; cv_broadcast(&s->s_opthungerr_cv); } } single_thread_end(s); mddb_setexit(s); if (md_get_setstatus(s->s_setno) & MD_SET_TOOFEW) { return (MDDB_E_NOTNOW); } else if (gave_up) { return (MDDB_E_STALE); } else { return (0); } } } else { /* If set is a traditional or local set */ if (! 
(--s->s_optcmtcnt)) { err = 0; if (s->s_optwaiterr) { err = writeretry(s); s->s_optwaiterr = 0; if (s->s_opthungerr) { s->s_opthungerr = 0; cv_broadcast(&s->s_opthungerr_cv); } } single_thread_end(s); s->s_opthavelck = 0; mddb_setexit(s); if (err) return (MDDB_E_NOTNOW); return (0); } if (s->s_optwaiterr) { while (s->s_optwaiterr) { s->s_opthungerr = 1; cv_wait(&s->s_opthungerr_cv, SETMUTEX(s->s_setno)); } if (checkstate(s, MDDB_NOPROBE)) { mddb_setexit(s); return (MDDB_E_NOTNOW); } } } mddb_setexit(s); return (0); } int mddb_commitrecs( mddb_recid_t ids[] ) { mddb_set_t *s; mddb_db_t *dbp; mddb_de_ic_t *dep; mddb_rb32_t *rbp; mddb_rb32_t *saverbp; mddb_lb_t *lbp; int li; uint_t checksum; mddb_recid_t *idp; int err = 0; set_t setno; if (panicstr) cmn_err(CE_PANIC, "md: mddb: commit not allowed"); /* * scan through and make sure ids are from the same set */ setno = DBSET(ids[0]); for (idp = ids; *idp != NULL; idp++) ASSERT(DBSET(*idp) == setno); s = mddb_setenter(setno, MDDB_MUSTEXIST, NULL); if (checkstate(s, MDDB_PROBE)) { mddb_setexit(s); return (MDDB_E_NOTNOW); } ASSERT(s->s_lbp != NULL); err = 0; if (! ids[0]) { mddb_setexit(s); return (0); } single_thread_start(s); /* * scan through and make sure ids all exist */ for (idp = ids; *idp != NULL; idp++) { for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { if (dep->de_recid == DBID(*idp)) break; } if (dep != NULL) break; } if (dep == NULL) { single_thread_end(s); mddb_setexit(s); return (MDDB_E_NORECORD); } } /* * scan through records fix commit counts and * zero fiddles and update time stamp and rechecksum record */ checksum = 0; idp = ids; saverbp = NULL; while (*idp) { for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { if (dep->de_recid == DBID(*idp)) break; } if (dep != NULL) break; } rbp = dep->de_rb; ASSERT(! 
(dep->de_flags & MDDB_F_OPT)); getuserdata(setno, dep); /* Don't do fiddles for CHANGE LOG records */ if (!(dep->de_flags & MDDB_F_CHANGELOG)) { checksum ^= rbp->rb_checksum_fiddle; rbp->rb_checksum_fiddle = 0; checksum ^= rbp->rb_checksum; saverbp = rbp; } rbp->rb_commitcnt++; uniqtime32(&rbp->rb_timestamp); /* Generate the crc for this record */ rec_crcgen(s, dep, rbp); /* Don't do fiddles for CHANGE LOG records */ if (!(dep->de_flags & MDDB_F_CHANGELOG)) { checksum ^= rbp->rb_checksum; } idp++; } if (saverbp) saverbp->rb_checksum_fiddle = checksum; /* * If this is a MN set but we are not the master, then we are not * supposed to update the mddb on disk. So we finish at this point. */ if ((setno != MD_LOCAL_SET) && (s->s_lbp->lb_flags & MDDB_MNSET) && (md_set[setno].s_am_i_master == 0)) { single_thread_end(s); mddb_setexit(s); return (0); } lbp = s->s_lbp; for (li = 0; li < lbp->lb_loccnt; li++) { if (! (lbp->lb_locators[li].l_flags & MDDB_F_ACTIVE)) continue; idp = ids; while (*idp) { for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { dep = dbp->db_firstentry; while (dep && (dep->de_recid != DBID(*idp))) dep = dep->de_next; if (dep != NULL) break; } rbp = dep->de_rb; err = wrtblklst(s, (caddr_t)rbp, dep->de_blks, dep->de_blkcount, li, (mddb_bf_t **)0, MDDB_WR_ONLY_MASTER); if (err) break; idp++; } if (err) break; } if (err) { if (writeretry(s)) { single_thread_end(s); mddb_setexit(s); return (MDDB_E_NOTNOW); } } single_thread_end(s); mddb_setexit(s); return (0); } mddb_recid_t mddb_makerecid( set_t setno, mddb_recid_t id ) { return (MAKERECID(setno, id)); } set_t mddb_getsetnum( mddb_recid_t id ) { return (DBSET(id)); } char * mddb_getsetname( set_t setno ) { return (((mddb_set_t *)md_set[setno].s_db)->s_setname); } side_t mddb_getsidenum( set_t setno ) { if (md_set[setno].s_db) return (((mddb_set_t *)md_set[setno].s_db)->s_sideno); return (0); } int mddb_ownset( set_t setno ) { if ((md_get_setstatus(setno) & MD_SET_TAGDATA) && md_set[setno].s_db) return 
(1); if (md_set[setno].s_db && ((mddb_set_t *)md_set[setno].s_db)->s_lbp) return (1); return (0); } /*ARGSUSED*/ int getmed_ioctl(mddb_med_parm_t *medpp, int mode) { mddb_set_t *s; int err = 0; set_t setno = medpp->med_setno; md_error_t *ep = &medpp->med_mde; mdclrerror(ep); if (setno >= md_nsets) return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0) return (0); if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0) return (mdmddberror(ep, MDE_DB_NOTOWNER, NODEV32, setno)); if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) return (mddbstatus2error(ep, err, NODEV32, setno)); medpp->med = s->s_med; /* structure assignment */ mddb_setexit(s); return (0); } int setmed_ioctl(mddb_med_parm_t *medpp, int mode) { mddb_set_t *s; int err = 0; set_t setno = medpp->med_setno; md_error_t *ep = &medpp->med_mde; mdclrerror(ep); if ((mode & FWRITE) == 0) return (mdsyserror(ep, EACCES)); /* * This should be the only thing that prevents LOCAL sets from having * mediators, at least in the kernel, userland needs to have some code * written. 
*/ if (setno == MD_LOCAL_SET) return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); if (setno >= md_nsets) return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0) return (0); if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0) return (mdmddberror(ep, MDE_DB_NOTOWNER, NODEV32, setno)); if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) return (mddbstatus2error(ep, err, NODEV32, setno)); s->s_med = medpp->med; /* structure assignment */ mddb_setexit(s); return (0); } int updmed_ioctl(mddb_med_upd_parm_t *medpp, int mode) { mddb_set_t *s; int err = 0; set_t setno = medpp->med_setno; md_error_t *ep = &medpp->med_mde; mdclrerror(ep); if ((mode & FWRITE) == 0) return (mdsyserror(ep, EACCES)); if (setno >= md_nsets) return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0) return (0); if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0) return (mdmddberror(ep, MDE_DB_NOTOWNER, NODEV32, setno)); if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) return (mddbstatus2error(ep, err, NODEV32, setno)); single_thread_start(s); (void) upd_med(s, "updmed_ioctl()"); single_thread_end(s); mddb_setexit(s); return (0); } int take_set(mddb_config_t *cp, int mode) { int err = 0; mddb_med_upd_parm_t medup; set_t setno = cp->c_setno; md_error_t *ep = &cp->c_mde; int snarf_ok = 0; if (md_get_setstatus(setno) & MD_SET_SNARFED) return (0); err = mddb_configure(MDDB_GETDEV, cp); if (! err && mdisok(ep)) { if (md_snarf_db_set(setno, ep) != 0) goto out; snarf_ok = 1; } /* * Clear replicated import flag since this is * used during the take of a diskset with * previously unresolved replicated disks. */ if (md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT) { md_clr_setstatus(setno, MD_SET_REPLICATED_IMPORT); } if (! err && mdisok(ep)) { if (! cp->c_flags) { medup.med_setno = setno; mdclrerror(&medup.med_mde); err = updmed_ioctl(&medup, mode); if (! 
mdisok(&medup.med_mde)) (void) mdstealerror(ep, &medup.med_mde); } } out: /* * In the case that the snarf failed, the diskset is * left with s_db set, but s_lbp not set. The node is not * an owner of the set and won't be allowed to release the * diskset in order to cleanup. With s_db set, any call to the * GETDEV or ENDDEV ioctl (done by libmeta routine metareplicalist) * will cause the diskset to be loaded. So, cleanup the diskset so * that an inadvertent start of the diskset doesn't happen later. */ if ((snarf_ok == 0) && md_set[setno].s_db && (((mddb_set_t *)md_set[setno].s_db)->s_lbp == 0)) { mutex_enter(&mddb_lock); mddb_unload_set(setno); mutex_exit(&mddb_lock); } return (err); } /*ARGSUSED*/ int release_set(mddb_config_t *cp, int mode) { int err = 0; set_t setno = cp->c_setno; md_error_t *ep = &cp->c_mde; /* * Data integrity check */ if (setno >= md_nsets) return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); rw_enter(&md_unit_array_rw.lock, RW_WRITER); md_haltsnarf_enter(setno); /* * Attempt to mark set as HOLD. If it is marked as HOLD, this means * that the mirror code is currently searching all mirrors for a * errored component that needs a hotspare. While this search is in * progress, we cannot release the set and thgerefore we return EBUSY. * Once we have set HOLD, the mirror function (check_4_hotspares) will * block before the search until the set is released. */ if (md_holdset_testandenter(setno) != 0) { md_haltsnarf_exit(setno); rw_exit(&md_unit_array_rw.lock); return (EBUSY); } if ((err = md_halt_set(setno, MD_HALT_ALL)) == 0) err = mddb_configure(MDDB_RELEASESET, cp); md_holdset_exit(setno); md_haltsnarf_exit(setno); rw_exit(&md_unit_array_rw.lock); if (! 
err && mdisok(ep)) {
		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RELEASE, SVM_TAG_SET, setno,
		    NODEV64);
	}

	return (err);
}

/*
 * gettag_ioctl - look up a data tag of a set and copy it to the caller.
 *
 * A requested dt_id of 0 matches the first tag on the set's list;
 * otherwise the tag with the matching id is returned.  Requires FREAD in
 * mode.  Multi-node sets are rejected with MDE_INVAL_MNOP; a missing tag
 * is reported as MDE_DB_NOTAG.
 */
int
gettag_ioctl(mddb_dtag_get_parm_t *dtgpp, int mode)
{
	mddb_set_t	*s;
	int		err = 0;
	mddb_dtag_lst_t	*dtlp;
	set_t		setno = dtgpp->dtgp_setno;
	md_error_t	*ep = &dtgpp->dtgp_mde;

	mdclrerror(ep);

	if ((mode & FREAD) == 0)
		return (mdsyserror(ep, EACCES));

	if (setno >= md_nsets)
		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));

	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
		return (0);

	if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL)
		return (mddbstatus2error(ep, err, NODEV32, setno));

	/*
	 * Data tags not supported on MN sets so return invalid operation.
	 * This ioctl could be called before the mddb has been read in so
	 * the set status may not yet be set to MNSET, so code following
	 * this check must handle a MN diskset properly.
	 */
	if (md_get_setstatus(setno) & MD_SET_MNSET) {
		mddb_setexit(s);
		return (mderror(ep, MDE_INVAL_MNOP));
	}

	/* s_dtlp is NULL for MN diskset */
	dtlp = s->s_dtlp;
	while (dtlp != NULL) {
		/* id 0 is a wildcard: take the first tag on the list */
		if (dtgpp->dtgp_dt.dt_id == 0 ||
		    dtgpp->dtgp_dt.dt_id == dtlp->dtl_dt.dt_id) {
			bcopy((caddr_t)&dtlp->dtl_dt, (caddr_t)&dtgpp->dtgp_dt,
			    sizeof (mddb_dtag_t));
			break;
		}
		dtlp = dtlp->dtl_nx;
	}

	/* Walked the whole list and id not found, return error */
	if (dtlp == (mddb_dtag_lst_t *)NULL) {
		mddb_setexit(s);
		return (mdmddberror(ep, MDE_DB_NOTAG, NODEV32, setno));
	}

	mddb_setexit(s);

	return (0);
}

/*
 * usetag_ioctl - select one of several data tags of a set and re-take the
 * set using it.
 *
 * Sequence: validate the request, install the chosen tag with dt_setup(),
 * save the set's identity and hint (rip) data, release the set with
 * KEEPTAG set, re-create the set, hand the saved rip back, mark USETAG and
 * take the set again.  USETAG and KEEPTAG are always cleared on the way
 * out.  Requires FWRITE in mode and a positive dtup_id.
 */
int
usetag_ioctl(mddb_dtag_use_parm_t *dtupp, int mode)
{
	mddb_set_t	*s;
	int		err = 0;
	mddb_config_t	*cp;
	mddb_ri_t	*trip = NULL;
	mddb_dtag_t	*dtagp = NULL;
	set_t		setno = dtupp->dtup_setno;
	md_error_t	*ep = &dtupp->dtup_mde;

	mdclrerror(ep);

	if ((mode & FWRITE) == 0)
		return (mdsyserror(ep, EACCES));

	if (setno >= md_nsets)
		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));

	if (dtupp->dtup_id < 0)
		return (mdsyserror(ep, EINVAL));
	else if (dtupp->dtup_id == 0)
		return (mdmddberror(ep, MDE_DB_NOTAG, NODEV32, setno));

	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
		return (0);

	if ((md_get_setstatus(setno) & MD_SET_TAGDATA) == 0)
		return (mdmddberror(ep, MDE_DB_NTAGDATA, NODEV32, setno));

	if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL)
		return (mddbstatus2error(ep, err, NODEV32, setno));

	/*
	 * Data tags not supported on MN sets so return invalid operation.
	 * This ioctl could be called before the mddb has been read in so
	 * the set status may not yet be set to MNSET, so code following
	 * this check must handle a MN diskset properly.
	 */
	if (md_get_setstatus(setno) & MD_SET_MNSET) {
		mddb_setexit(s);
		return (mderror(ep, MDE_INVAL_MNOP));
	}

	/* Validate and find the id requested - nothing found if MN diskset */
	if ((dtagp = dtl_findl(s, dtupp->dtup_id)) == NULL) {
		mddb_setexit(s);
		return (mdmddberror(ep, MDE_DB_NOTAG, NODEV32, setno));
	}

	/* Usetag is only valid when more than one tag exists */
	if (dtl_cntl(s) < 2) {
		mddb_setexit(s);
		return (mdmddberror(ep, MDE_DB_NTAGDATA, NODEV32, setno));
	}

	/* Put the selected tag in place */
	dt_setup(s, dtagp);

	cp = kmem_zalloc(sizeof (mddb_config_t), KM_SLEEP);

	/* Save the hint information */
	trip = save_rip(s);

	/* Capture the set identity needed to re-create it after release */
	cp->c_timestamp = s->s_ident.createtime;	/* struct assignment */
	cp->c_setno = setno;
	cp->c_sideno = s->s_sideno;
	(void) strncpy(cp->c_setname, s->s_setname, MD_MAX_SETNAME);
	cp->c_setname[MD_MAX_SETNAME] = '\0';
	cp->c_med = s->s_med;				/* struct assignment */

	mddb_setexit(s);

	/* s is cleared so the common exit path does not setexit twice */
	s = NULL;

	/* shorthand */
	setno = cp->c_setno;

	/* Let unload know not to free the tag */
	md_set_setstatus(setno, MD_SET_KEEPTAG);

	/* Release the set */
	if (err = release_set(cp, mode))
		goto out;

	if (! mdisok(&cp->c_mde)) {
		(void) mdstealerror(ep, &cp->c_mde);
		err = 1;
		goto out;
	}

	/* Re-init set using the saved mddb_config_t structure */
	if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL) {
		if ((s = init_set(cp, MDDB_NOINIT, &err)) == NULL) {
			err = mddbstatus2error(ep, err, NODEV32, setno);
			goto out;
		}
	}

	ASSERT(s->s_rip == (mddb_ri_t *)NULL);

	/* use the saved rip structure; ownership moves to the set */
	s->s_rip = trip;
	trip = (mddb_ri_t *)NULL;

	/* Let the take code know a tag is being used */
	md_set_setstatus(setno, MD_SET_USETAG);

	mddb_setexit(s);

	s = NULL;

	/* Take the set */
	if (err = take_set(cp, mode))
		goto out;

	if (! mdisok(&cp->c_mde))
		(void) mdstealerror(ep, &cp->c_mde);

out:
	md_clr_setstatus(setno, (MD_SET_USETAG | MD_SET_KEEPTAG));

	kmem_free(cp, sizeof (mddb_config_t));

	/* trip is still ours only if it was never handed to the set */
	if (trip)
		free_rip(&trip);

	if (s)
		mddb_setexit(s);

	return (err);
}

int
accept_ioctl(mddb_accept_parm_t *accpp, int mode)
{
	mddb_set_t	*s;
	int		err = 0;
	mddb_config_t	*cp;
	mddb_ri_t	*trip = NULL;
	set_t		setno = accpp->accp_setno;
	md_error_t	*ep = &accpp->accp_mde;

	mdclrerror(ep);

	if ((mode & FWRITE) == 0)
		return (mdsyserror(ep, EACCES));

	if (setno >= md_nsets)
		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));

	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
		return (0);

	if ((md_get_setstatus(setno) & MD_SET_ACCOK) == 0)
		return (mdmddberror(ep, MDE_DB_ACCNOTOK, NODEV32, setno));

	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
		return (mddbstatus2error(ep, err, NODEV32, setno));

	/*
	 * Data tags not supported on MN sets so return invalid operation.
	 * mddb is guaranteed to be incore at this point, so this
	 * check will catch all MN disksets.
*/ if (md_get_setstatus(setno) & MD_SET_MNSET) { mddb_setexit(s); return (mderror(ep, MDE_INVAL_MNOP)); } cp = kmem_zalloc(sizeof (mddb_config_t), KM_SLEEP); trip = save_rip(s); cp->c_timestamp = s->s_ident.createtime; /* struct assignment */ cp->c_setno = setno; cp->c_sideno = s->s_sideno; (void) strncpy(cp->c_setname, s->s_setname, MD_MAX_SETNAME); cp->c_setname[MD_MAX_SETNAME] = '\0'; cp->c_med = s->s_med; /* struct assignment */ /* Tag the data */ if (err = set_dtag(s, ep)) { err = mdsyserror(ep, err); goto out; } /* If we had a BADTAG, it will be re-written, so clear the bit. */ if (md_get_setstatus(setno) & MD_SET_BADTAG) md_clr_setstatus(setno, MD_SET_BADTAG); if (err = dt_write(s)) { err = mdsyserror(ep, err); goto out; } mddb_setexit(s); s = NULL; /* shorthand */ setno = cp->c_setno; /* Clear the keeptag */ md_clr_setstatus(setno, MD_SET_KEEPTAG); /* Release the set */ if (err = release_set(cp, mode)) goto out; if (! mdisok(&cp->c_mde)) { (void) mdstealerror(ep, &cp->c_mde); goto out; } /* Re-init set using the saved mddb_config_t structure */ if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL) { if ((s = init_set(cp, MDDB_NOINIT, &err)) == NULL) { err = mddbstatus2error(ep, err, NODEV32, setno); goto out; } } ASSERT(s->s_rip == (mddb_ri_t *)NULL); /* Free the allocated rip structure */ if (s->s_rip != (mddb_ri_t *)NULL) free_rip(&s->s_rip); /* use the saved rip structure */ s->s_rip = trip; trip = (mddb_ri_t *)NULL; /* Let the set init code know an accept is in progress */ md_set_setstatus(setno, MD_SET_ACCEPT); mddb_setexit(s); s = NULL; /* Take the set */ if (err = take_set(cp, mode)) goto out; if (! 
mdisok(&cp->c_mde)) (void) mdstealerror(ep, &cp->c_mde); out: md_clr_setstatus(setno, (MD_SET_ACCOK | MD_SET_ACCEPT)); kmem_free(cp, sizeof (mddb_config_t)); if (trip) free_rip(&trip); if (s) mddb_setexit(s); return (err); } /* * mddb_getinvlb_devid - cycles through the locator block and determines * if the device id's for any of the replica disks are invalid. * If so, it returns the diskname in the ctdptr. * RETURN * -1 Error * cnt number of invalid device id's */ int mddb_getinvlb_devid( set_t setno, int count, int size, char **ctdptr ) { mddb_set_t *s; int err = 0; mddb_lb_t *lbp; int li; mddb_did_blk_t *did_blk; mddb_did_info_t *did_info; int len; int cnt = 0; char *cptr; md_name_suffix *sn; int i, dont_add_it; char *tmpctd, *diskname; char *tmpname; cptr = *ctdptr; if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) { return (-1); } single_thread_start(s); lbp = s->s_lbp; if (lbp->lb_setno != setno) { single_thread_end(s); mddb_setexit(s); return (-1); } /* check for lb being devid style */ if (lbp->lb_flags & MDDB_DEVID_STYLE) { did_blk = s->s_did_icp->did_ic_blkp; for (li = 0; li < lbp->lb_loccnt; li++) { did_info = &(did_blk->blk_info[li]); /* Only if devid exists and isn't valid */ if ((did_info->info_flags & MDDB_DID_EXISTS) && !(did_info->info_flags & MDDB_DID_VALID)) { /* * if we count more invalid did's than * was passed in there's an error somewhere */ if (cnt++ > count) { single_thread_end(s); mddb_setexit(s); return (-1); } /* * Future note: Need to do something here * for the MN diskset case when device ids * are supported in disksets. * Can't add until merging devids_in_diskset * code into code base. 
*/ sn = &s->s_lnp->ln_suffixes[0][li]; /* * check to make sure length of device name is * not greater than computed first time through */ len = sn->suf_len; if (len > size) { single_thread_end(s); mddb_setexit(s); return (-1); } tmpctd = *ctdptr; /* strip off slice part */ diskname = md_strdup(sn->suf_data); tmpname = strrchr(diskname, 's'); *tmpname = '\0'; dont_add_it = 0; /* look to see if diskname is already in list */ for (i = 0; i < (cnt-1); i++) { if (strcmp(diskname, tmpctd) == 0) { /* already there, don't add */ dont_add_it = 1; break; } /* point to next diskname in list */ tmpctd += size; } if (dont_add_it == 0) { /* add diskname to list */ (void) strcpy(cptr, diskname); cptr += size; } kmem_free(diskname, strlen(sn->suf_data) + 1); } } } /* null terminate the list */ *cptr = '\0'; /* * need to save the new pointer so that calling routine can continue * to add information onto the end. */ *ctdptr = cptr; single_thread_end(s); mddb_setexit(s); return (cnt); } /* * mddb_validate_lb - count the number of lb's with invalid device id's. Keep * track of length of longest devicename. 
* RETURN * -1 error * cnt number of lb's with invalid devid's */ int mddb_validate_lb( set_t setno, int *rmaxsz ) { mddb_set_t *s; int err = 0; mddb_lb_t *lbp; int li; mddb_did_blk_t *did_blk; mddb_did_info_t *did_info; int len; int cnt = 0; if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) return (-1); single_thread_start(s); lbp = s->s_lbp; if (lbp->lb_setno != setno) { single_thread_end(s); mddb_setexit(s); return (-1); } /* lb must be in devid style */ if ((lbp->lb_flags & MDDB_DEVID_STYLE) == 0) goto mvl_out; did_blk = s->s_did_icp->did_ic_blkp; for (li = 0; li < lbp->lb_loccnt; li++) { char *minor_name; mddb_locator_t *lp; dev_t ddi_dev; ddi_devid_t devid; ddi_devid_t rtn_devid = NULL; int get_rval; did_info = &(did_blk->blk_info[li]); if (((did_info->info_flags & MDDB_DID_EXISTS) == 0) || (did_info->info_flags & MDDB_DID_VALID)) continue; /* Here we know, did exists but isn't valid */ lp = &lbp->lb_locators[li]; ddi_dev = expldev(lp->l_dev); get_rval = mddb_devid_get(s, li, &devid, &minor_name); ASSERT(get_rval == 1); if ((ddi_lyr_get_devid(ddi_dev, &rtn_devid) == DDI_SUCCESS) && (ddi_devid_compare(rtn_devid, devid) == 0)) { did_info->info_flags = MDDB_DID_VALID | MDDB_DID_EXISTS | MDDB_DID_UPDATED; } else { cnt++; /* * Future note: Need to do something here * for the MN diskset case when device ids * are supported in disksets. * Can't add until merging devids_in_diskset * code into code base. 
*/ len = (&s->s_lnp->ln_suffixes[0][li])-> suf_len; if (*rmaxsz < len) *rmaxsz = len; } if (rtn_devid != NULL) ddi_devid_free(rtn_devid); } mvl_out: if (push_lb(s) != 0) cnt = -1; (void) upd_med(s, "mddb_validate_lb(0)"); single_thread_end(s); mddb_setexit(s); return (cnt); } int check_active_locators() { mddb_set_t *s; mddb_lb_t *lbp; int li; int active = 0; mutex_enter(&mddb_lock); /* there is nothing here..so we can unload */ if ((mddb_set_t *)md_set[MD_LOCAL_SET].s_db == NULL) { mutex_exit(&mddb_lock); return (0); } s = (mddb_set_t *)md_set[MD_LOCAL_SET].s_db; lbp = s->s_lbp; if (lbp == NULL) { mutex_exit(&mddb_lock); return (0); } for (li = 0; li < lbp->lb_loccnt; li++) { mddb_locator_t *lp = &lbp->lb_locators[li]; if (lp->l_flags & MDDB_F_ACTIVE) { active = 1; break; } } mutex_exit(&mddb_lock); return (active); } /* * regetoptrecord: * -------------- * Update the in-core optimized resync record contents by re-reading the * record from the on-disk metadb. * The contents of the resync record will be overwritten by calling this * routine. This means that callers that require the previous contents to * be preserved must save the data before calling this routine. * Return values: * 0 - successfully read in resync record from a mddb * 1 - failure. Unable to read resync record from either mddb. */ static int regetoptrecord( mddb_set_t *s, mddb_de_ic_t *dep ) { mddb_lb_t *lbp; mddb_locator_t *lp; mddb_rb32_t *rbp, *crbp; int li; int i; int err = 0; size_t recsize; #if defined(_ILP32) && !defined(lint) ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t)); #endif recsize = dep->de_recsize; crbp = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP); single_thread_start(s); rbp = dep->de_rb; dep->de_optinfo[0].o_flags |= MDDB_F_EDATA; dep->de_optinfo[1].o_flags |= MDDB_F_EDATA; lbp = s->s_lbp; for (i = 0; i < 2; i++) { if (! (dep->de_optinfo[i].o_flags & MDDB_F_ACTIVE)) continue; li = dep->de_optinfo[i].o_li; lp = &lbp->lb_locators[li]; if (! 
(lp->l_flags & MDDB_F_ACTIVE) || (lp->l_flags & MDDB_F_EMASTER)) continue; /* * re-read the optimized resync record with failfast set * since a failed disk could lead to a very long wait. */ err = readblklst(s, (caddr_t)rbp, dep->de_blks, dep->de_blkcount, li, B_FAILFAST); if (err) continue; if (rbp->rb_magic != MDDB_MAGIC_RB) continue; if (revchk(MDDB_REV_RB, rbp->rb_revision)) continue; /* Check the crc for this record */ if (rec_crcchk(s, dep, rbp)) { continue; } dep->de_optinfo[i].o_flags = MDDB_F_ACTIVE; if (rbp == crbp) { if (rbp->rb_checksum != crbp->rb_checksum) dep->de_optinfo[1].o_flags |= MDDB_F_EDATA; break; } rbp = crbp; } single_thread_end(s); if (rbp == crbp) { rbp->rb_private = 0; kmem_free((caddr_t)crbp, recsize); return (0); } uniqtime32(&rbp->rb_timestamp); /* Generate the crc for this record */ rec_crcgen(s, dep, rbp); kmem_free((caddr_t)crbp, recsize); return (1); } /* * mddb_reread_rr: * Re-read the resync record from the on-disk copy. This is required for * multi-node support so that a new mirror-owner can determine if a resync * operation is required to guarantee data integrity. * * Arguments: * setno Associated set * id Resync record ID * * Return Value: * 0 successful reread * -1 invalid set (not multi-node or non-existant) * >0 metadb state invalid, failed to reread */ int mddb_reread_rr( set_t setno, mddb_recid_t id ) { mddb_set_t *s; int err = 0; mddb_db_t *dbp; mddb_de_ic_t *dep; if (setno >= md_nsets) return (-1); if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) return (-1); if ((setno == MD_LOCAL_SET) || !(s->s_lbp->lb_flags & MDDB_MNSET)) { mddb_setexit(s); return (-1); } for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { dep = dbp->db_firstentry; while (dep && (dep->de_recid != DBID(id))) dep = dep->de_next; if (dep != NULL) break; } if (dep != NULL) { err = regetoptrecord(s, dep); } else { err = -1; } mddb_setexit(s); return (err); } /* * Set owner associated with MN optimized resync record. 
*
 * Optimized records have an owner node associated with them in
 * a MN diskset.  The owner is only set on a node that is actively
 * writing to that record.  The other nodes will show that record
 * as having an invalid owner.  The owner for an optimized record
 * is used during fixoptrecord to determine which node should
 * write out the record when the replicas associated with that
 * optimized record have been changed.
 *
 * Called directly from mirror driver and not from an ioctl.
 *
 * Returns
 *	0 if successful.
 *	MDDB_E_NORECORD if the set number encoded in id is out of range,
 *	the set cannot be entered, or the record is not found.
 */
int
mddb_setowner(
	mddb_recid_t		id,
	md_mn_nodeid_t		owner
)
{
	mddb_set_t		*s;
	mddb_db_t		*dbp;
	mddb_de_ic_t		*dep;
	int			found = 0;
	int			err = 0;

	if (DBSET(id) >= md_nsets)
		return (MDDB_E_NORECORD);

	/*
	 * Hand mddb_setenter a real status word, as every other caller in
	 * this file does; the status itself is not needed here.
	 */
	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL)
		return (MDDB_E_NORECORD);

	/* From here on only the record part of the id is compared */
	id = DBID(id);

	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
		for (dep = dbp->db_firstentry; dep != NULL;
		    dep = dep->de_next) {
			if (dep->de_recid != id)
				continue;
			dep->de_owner_nodeid = owner;
			found = 1;
			break;
		}
		if (found)
			break;
	}

	mddb_setexit(s);

	if (!found) {
		return (MDDB_E_NORECORD);
	}

	/* int return: use 0, not the pointer constant NULL */
	return (0);
}

/*
 * mddb_parse re-reads portions of the mddb from disk given a list
 * of good replicas to read from and flags describing
 * which portion of the mddb to read in.
 *
 * Used in a MN diskset when the master has made a change to some part
 * of the mddb and wants to relay this information to the slaves.
*/ int mddb_parse(mddb_parse_parm_t *mpp) { mddb_set_t *s; int err = 0; mddb_locator_t *lp, *old_lp; mddb_lb_t *lbp, *old_lbp; int rval = 0; int i, li; int found_good_one = 0; mddb_ln_t *lnp; mddb_block_t ln_blkcnt; md_error_t *ep = &mpp->c_mde; if (mpp->c_setno >= md_nsets) return (EINVAL); if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0) return (0); if ((s = mddb_setenter(mpp->c_setno, MDDB_MUSTEXIST, &err)) == NULL) { return (mddbstatus2error(ep, err, NODEV32, mpp->c_setno)); } if (!(MD_MNSET_SETNO(mpp->c_setno))) { mddb_setexit_no_parse(s); return (EINVAL); } /* * Master node initiated this request, so there's no work for * the master node to do. */ if (md_set[mpp->c_setno].s_am_i_master) { mddb_setexit_no_parse(s); return (rval); } single_thread_start(s); if (mpp->c_parse_flags & MDDB_PARSE_LOCBLK) { lbp = 0; for (i = 0; i < MDDB_NLB; i++) { /* Walk through master's active list */ if (!(mpp->c_lb_flags[i] & MDDB_F_ACTIVE)) continue; if (s->s_mbiarray[i] == NULL) continue; /* Assumes master blocks are already setup */ if (lbp == (mddb_lb_t *)NULL) { lbp = (mddb_lb_t *)kmem_zalloc( dbtob(MDDB_MNLBCNT), KM_SLEEP); } err |= readblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, i); if (err) continue; if (lbp->lb_magic != MDDB_MAGIC_LB) continue; if (lbp->lb_blkcnt != MDDB_MNLBCNT) continue; if (revchk(MDDB_REV_MNLB, lbp->lb_revision)) continue; if (crcchk(lbp, &lbp->lb_checksum, dbtob(MDDB_MNLBCNT), NULL)) continue; if (lbp->lb_setno != s->s_setno) continue; /* * a commit count of zero means this locator has * been deleted */ if (lbp->lb_commitcnt == 0) { continue; } /* Found a good locator - keep it */ found_good_one = 1; break; } /* * If found a good copy of the mddb, then read it into * this node's locator block. Fix up the set's s_mbiarray * pointer (master block incore array pointer) to be * in sync with the newly read in locator block. If a * new mddb was added, read in the master blocks associated * with the new mddb. 
If an mddb was deleted, free the * master blocks associated with deleted mddb. */ if (found_good_one) { /* Compare old and new view of mddb locator blocks */ old_lbp = s->s_lbp; for (li = 0; li < lbp->lb_loccnt; li++) { int mn_set; lp = &lbp->lb_locators[li]; old_lp = &old_lbp->lb_locators[li]; /* If old and new views match, continue */ if ((lp->l_flags & MDDB_F_ACTIVE) == (old_lp->l_flags & MDDB_F_ACTIVE)) continue; if (lp->l_flags & MDDB_F_ACTIVE) { /* * If new mddb has been added - delete * old mbiarray and get new one. * * When devids are supported, will * need to get dev from devid. */ if (s->s_mbiarray[li]) { free_mbipp(&s->s_mbiarray[li]); } /* * If getmasters fails, getmasters * will set appropriate error flags. */ s->s_mbiarray[li] = getmasters(s, md_expldev(lp->l_dev), lp->l_blkno, (uint_t *)&(lp->l_flags), &mn_set); } else if (lp->l_flags & MDDB_F_DELETED) { /* * If old one has been deleted - * delete old mbiarray. */ if (s->s_mbiarray[li]) { free_mbipp(&s->s_mbiarray[li]); } } } /* Free this node's old view of mddb locator blocks */ kmem_free((caddr_t)s->s_lbp, dbtob(s->s_lbp->lb_blkcnt)); s->s_lbp = lbp; } else { if (lbp) kmem_free(lbp, dbtob(MDDB_MNLBCNT)); } } if (mpp->c_parse_flags & MDDB_PARSE_LOCNM) { lnp = s->s_lnp; lbp = s->s_lbp; ln_blkcnt = lbp->lb_lnblkcnt; s->s_lnp = NULL; /* readlocnames does this anyway */ for (li = 0; li < lbp->lb_loccnt; li++) { lp = &lbp->lb_locators[li]; if ((! 
(lp->l_flags & MDDB_F_ACTIVE)) || (lp->l_flags & MDDB_F_EMASTER)) continue; /* Successfully read the locator names */ if (readlocnames(s, li) == 0) break; } if (li == lbp->lb_loccnt) { /* Did not successfully read locnames; restore lnp */ s->s_lnp = lnp; } else { /* readlocnames successful, free old struct */ kmem_free((caddr_t)lnp, dbtob(ln_blkcnt)); } } if (mpp->c_parse_flags & MDDB_PARSE_OPTRECS) { mddb_de_ic_t *dep, *tdep, *first_dep, *dep2; mddb_db_t *dbp; mddb_db32_t *db32p; mddb_de32_t *de32p, *de32p2; int writeout; lbp = s->s_lbp; /* * Walk through directory block and directory entry incore * linked list looking for optimized resync records. * For each opt record found, re-read in directory block. * The directoy block consists of a number of directory * entries. The directory entry for this opt record will * describe which 2 mddbs actually contain the resync record * since it could have been relocated by the master node * due to mddb failure or mddb deletion. If this node * is the record owner for this opt record, then write out * the record to the 2 mddbs listed in the directory entry * if the mddbs locations are different than previously known. */ for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { /* Found an opt record */ if (dep->de_flags & MDDB_F_OPT) break; } /* If no opt records found, go to next dbp */ if (dep == NULL) continue; /* * Reread directory block from disk since * master could have rewritten in during fixoptrecord. */ db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP); create_db32rec(db32p, dbp); for (li = 0; li < lbp->lb_loccnt; li++) { lp = &lbp->lb_locators[li]; if ((! 
(lp->l_flags & MDDB_F_ACTIVE)) || (lp->l_flags & MDDB_F_EMASTER)) continue; err = readblks(s, (caddr_t)db32p, db32p->db32_blknum, 1, li); if (err) continue; /* Reverify db; go to next mddb if bad */ if ((db32p->db32_magic != MDDB_MAGIC_DB) || (revchk(MDDB_REV_DB, db32p->db32_revision)) || (crcchk(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL))) { continue; } else { break; } } /* * If all mddbs are unavailable then panic since * this slave cannot be allowed to continue out-of-sync * with the master node. Since the optimized resync * records are written by all nodes, all nodes must * stay in sync with the master. * * This also handles the case when all storage * connectivity to a slave node has failed. The * slave node will send an MDDB_OPTRECERR message to * the master node when the slave node has been unable * to write an optimized resync record to both * designated mddbs. After the master has fixed the * optimized records to be on available mddbs, the * MDDB_PARSE message (with the flag MDDB_PARSE_OPTRECS) * is sent to all slave nodes. If a slave node is * unable to access any mddb in order to read in the * relocated optimized resync record, then the slave * node must panic. */ if (li == lbp->lb_loccnt) { kmem_free((caddr_t)db32p, MDDB_BSIZE); cmn_err(CE_PANIC, "md: mddb: Node unable to " "access any SVM state database " "replicas for diskset %s\n", s->s_setname); } /* * Setup temp copy of linked list of de's. * Already have an incore copy, but need to walk * the directory entry list contained in the * new directory block that was just read in above. * After finding the directory entry of an opt record * by walking the incore list, find the corresponding * entry in the temporary list and then update * the incore directory entry record with * the (possibly changed) mddb location stored * for the optimized resync records. 
*/ de32p = (mddb_de32_t *) ((void *) ((caddr_t) (&db32p->db32_firstentry) + sizeof (db32p->db32_firstentry))); tdep = (mddb_de_ic_t *) kmem_zalloc(sizeof (mddb_de_ic_t) - sizeof (mddb_block_t) + sizeof (mddb_block_t) * de32p->de32_blkcount, KM_SLEEP); de32tode(de32p, tdep); first_dep = tdep; while (de32p && de32p->de32_next) { de32p2 = nextentry(de32p); dep2 = (mddb_de_ic_t *)kmem_zalloc( sizeof (mddb_de_ic_t) - sizeof (mddb_block_t) + sizeof (mddb_block_t) * de32p2->de32_blkcount, KM_SLEEP); de32tode(de32p2, dep2); tdep->de_next = dep2; tdep = dep2; de32p = de32p2; } /* Now, walk the incore directory entry list */ for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { if (! (dep->de_flags & MDDB_F_OPT)) continue; /* * Found an opt record in the incore copy. * Find the corresponding entry in the temp * list. If anything has changed in the * opt record info between the incore copy * and the temp copy, update the incore copy * and set a flag to writeout the opt record * to the new mddb locations. */ for (tdep = first_dep; tdep; tdep = tdep->de_next) { if (dep->de_recid == tdep->de_recid) { writeout = 0; /* Check first mddb location */ if ((dep->de_optinfo[0].o_li != tdep->de_optinfo[0].o_li) || (dep->de_optinfo[0]. o_flags != tdep->de_optinfo [0].o_flags)) { dep->de_optinfo[0] = tdep->de_optinfo[0]; writeout = 1; } /* Check second mddb location */ if ((dep->de_optinfo[1].o_li != tdep->de_optinfo[1].o_li) || (dep->de_optinfo[1]. o_flags != tdep->de_optinfo [1].o_flags)) { dep->de_optinfo[1] = tdep->de_optinfo[1]; writeout = 1; } /* * Record owner should rewrite * it */ if ((writeout) && (dep->de_owner_nodeid == md_set[mpp->c_setno]. s_nodeid)) (void) writeoptrecord(s, dep); break; } } } /* * Update the incore checksum information for this * directory block to match the newly read in checksum. * This should have only changed if the incore and * temp directory entries differed, but it takes * more code to do the check than to just update * the information everytime. 
*/ dbp->db_checksum = db32p->db32_checksum; /* Now free everything */ tdep = first_dep; while (tdep) { dep2 = tdep->de_next; kmem_free((caddr_t)tdep, sizeofde(tdep)); tdep = dep2; } kmem_free((caddr_t)db32p, MDDB_BSIZE); } rval = 0; } out: single_thread_end(s); mddb_setexit_no_parse(s); return (rval); } int mddb_block(mddb_block_parm_t *mbp) { mddb_set_t *s; int err = 0; md_error_t *ep = &mbp->c_mde; if (mbp->c_setno >= md_nsets) return (EINVAL); /* * If the new_master flag is set for this setno we are in the middle * of a reconfig cycle, and blocking or unblocking is not needed. * Hence we can return success immediately */ if (md_get_setstatus(mbp->c_setno) & MD_SET_MN_NEWMAS_RC) { return (0); } if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0) return (0); if ((s = mddb_setenter(mbp->c_setno, MDDB_MUSTEXIST, &err)) == NULL) { return (mddbstatus2error(ep, err, NODEV32, mbp->c_setno)); } if (!(MD_MNSET_SETNO(mbp->c_setno))) { mddb_setexit_no_parse(s); return (EINVAL); } single_thread_start(s); if (mbp->c_blk_flags & MDDB_BLOCK_PARSE) md_set_setstatus(mbp->c_setno, MD_SET_MNPARSE_BLK); if (mbp->c_blk_flags & MDDB_UNBLOCK_PARSE) md_clr_setstatus(mbp->c_setno, MD_SET_MNPARSE_BLK); single_thread_end(s); mddb_setexit_no_parse(s); return (err); } /* * mddb_optrecfix marks up to 2 mddbs as failed and calls fixoptrecords * to relocate any optimized resync records to available mddbs. * This routine is only called on the master node. * * Used in a MN diskset when a slave node has failed to write an optimized * resync record. The failed mddb information is sent to the master node * so the master can relocate the optimized records, if possible. If the * failed mddb information has a mddb marked as failed that was previously * marked active on the master, the master sets its incore mddb state to * EWRITE and sets the PARSE_LOCBLK flag. The master node then attempts * to relocate any optimized records on the newly failed mddbs by calling * fixoptrecords. 
(fixoptrecords will set the PARSE_OPTRECS flag if any
 * optimized records are relocated.)
 *
 * When mddb_optrecfix is finished, the ioctl exit code will notice the PARSE
 * flags and will send a PARSE message to the slave nodes.  The PARSE_LOCBLK
 * flag causes the slave node to re-read in the locator block from disk.
 * The PARSE_OPTRECS flag causes the slave node to re-read in the directory
 * blocks and write out any optimized resync records that have been
 * relocated to a different mddb.
 *
 * Returns EINVAL for an out-of-range or non-MN set, otherwise 0.
 */
int
mddb_optrecfix(mddb_optrec_parm_t *mop)
{
	mddb_set_t		*s;
	int			err = 0;
	mddb_lb_t		*lbp;
	mddb_mnlb_t		*mnlbp;
	mddb_locator_t		*lp;
	int			li;
	mddb_mnsidelocator_t	*mnslp;
	mddb_drvnm_t		*dn;
	int			i, j;
	md_replica_recerr_t	*recerr;
	md_error_t		*ep = &mop->c_mde;
	int			something_changed = 0;
	int			alc, lc;
	int			setno;
	int			quick_match;

	setno = mop->c_setno;
	if (mop->c_setno >= md_nsets)
		return (EINVAL);

	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
		return (0);

	if ((s = mddb_setenter(mop->c_setno, MDDB_MUSTEXIST, &err)) == NULL) {
		return (mddbstatus2error(ep, err, NODEV32, mop->c_setno));
	}

	if (!(MD_MNSET_SETNO(mop->c_setno))) {
		mddb_setexit(s);
		return (EINVAL);
	}

	single_thread_start(s);
	lbp = s->s_lbp;
	mnlbp = (mddb_mnlb_t *)lbp;

	/*
	 * If slave node has seen an mddb failure, but the master node
	 * hasn't encountered this failure, mark the mddb as failed on
	 * the master node and set the something_changed flag to 1.
	 */
	for (i = 0; i < 2; i++) {
		recerr = &mop->c_recerr[i];
		if (recerr->r_flags & MDDB_F_EWRITE) {
			/*
			 * Do quick check using li.  The index comes from
			 * another node, so only use it to index the locator
			 * arrays when it lies inside the locator block;
			 * otherwise fall through to the full search below.
			 */
			quick_match = 0;
			li = recerr->r_li;
			if ((li >= 0) && (li < lbp->lb_loccnt)) {
				lp = &lbp->lb_locators[li];
				for (j = 0; j < MD_MNMAXSIDES; j++) {
					mnslp = &mnlbp->
					    lb_mnsidelocators[j][li];
					if (mnslp->mnl_sideno == s->s_sideno)
						break;
				}
				if (j != MD_MNMAXSIDES) {
					dn = &lbp->
					    lb_drvnm[mnslp->mnl_drvnm_index];
					if ((strncmp(dn->dn_data,
					    recerr->r_driver_name,
					    MD_MAXDRVNM) == 0) &&
					    (recerr->r_blkno == lp->l_blkno) &&
					    (recerr->r_mnum ==
					    mnslp->mnl_mnum))
						quick_match = 1;
				}
			}

			if (quick_match) {
				if ((lp->l_flags & MDDB_F_ACTIVE) ||
				    ((lp->l_flags & MDDB_F_EWRITE) == 0)) {
					something_changed = 1;
					lp->l_flags |= MDDB_F_EWRITE;
					lp->l_flags &= ~MDDB_F_ACTIVE;
				}
			} else {
				/*
				 * Passed in li from slave does not match
				 * the replica in the master's structures.
				 * This could have occurred if a delete
				 * mddb command was running when the
				 * optimized resync record had a failure.
				 * Search all replicas for this entry.
				 * If no match, just ignore.
				 * If a match, set replica in error.
				 */
				for (li = 0; li < lbp->lb_loccnt; li++) {
					lp = &lbp->lb_locators[li];
					if (lp->l_flags & MDDB_F_DELETED)
						continue;

					for (j = 0; j < MD_MNMAXSIDES; j++) {
						mnslp = &mnlbp->
						    lb_mnsidelocators[j][li];
						if (mnslp->mnl_sideno ==
						    s->s_sideno)
							break;
					}
					if (j == MD_MNMAXSIDES)
						continue;

					dn = &lbp->
					    lb_drvnm[mnslp->mnl_drvnm_index];
					if ((strncmp(dn->dn_data,
					    recerr->r_driver_name,
					    MD_MAXDRVNM) == 0) &&
					    (recerr->r_blkno == lp->l_blkno) &&
					    (recerr->r_mnum ==
					    mnslp->mnl_mnum)) {
						if ((lp->l_flags &
						    MDDB_F_ACTIVE) ||
						    ((lp->l_flags &
						    MDDB_F_EWRITE) == 0)) {
							something_changed = 1;
							lp->l_flags |=
							    MDDB_F_EWRITE;
							lp->l_flags &=
							    ~MDDB_F_ACTIVE;
						}
						break;
					}
				}
			}
		}
	}

	/*
	 * If this message changed nothing, then we're done since this
	 * failure has already been handled.
	 * If some mddb state has been changed, send a parse message to
	 * the slave nodes so that the slaves will re-read the locator
	 * block from disk.
	 */
	if (something_changed == 0) {
		single_thread_end(s);
		mddb_setexit(s);
		return (0);
	} else {
		s->s_mn_parseflags |= MDDB_PARSE_LOCBLK;
	}

	/*
	 * Scan replicas setting MD_SET_TOOFEW if
	 * 50% or more of the mddbs have seen errors.
	 * Note: Don't call selectreplicas or writeretry
	 * since these routines may end up setting the ACTIVE flag
	 * on a failed mddb if the master is able to access the mddb
	 * but the slave node couldn't.  Need to have the ACTIVE flag
	 * turned off in order to relocate the optimized records to
	 * mddbs that are (hopefully) available on all nodes.
	 */
	alc = 0;
	lc = 0;
	for (li = 0; li < lbp->lb_loccnt; li++) {
		lp = &lbp->lb_locators[li];
		if (lp->l_flags & MDDB_F_DELETED)
			continue;
		lc++;
		if (! (lp->l_flags & MDDB_F_ACTIVE))
			continue;
		alc++;
	}

	/*
	 * If more than 50% mddbs have failed, then don't relocate opt recs.
	 * The node sending the mddb failure information will detect TOOFEW
	 * and will panic when it attempts to re-write the optimized record.
	 */
	if (alc < ((lc + 1) / 2)) {
		md_set_setstatus(setno, MD_SET_TOOFEW);
		(void) push_lb(s);
		(void) upd_med(s, "mddb_optrecfix(0)");
		single_thread_end(s);
		mddb_setexit(s);
		return (0);
	}

	/* Attempt to relocate optimized records that are on failed mddbs */
	(void) fixoptrecords(s);

	/* Push changed locator block out to disk */
	(void) push_lb(s);
	(void) upd_med(s, "mddb_optrecfix(1)");

	/* Recheck for TOOFEW after writing out locator blocks */
	alc = 0;
	lc = 0;
	for (li = 0; li < lbp->lb_loccnt; li++) {
		lp = &lbp->lb_locators[li];
		if (lp->l_flags & MDDB_F_DELETED)
			continue;
		lc++;
		if (! (lp->l_flags & MDDB_F_ACTIVE))
			continue;
		alc++;
	}

	/* If more than 50% mddbs have failed, then don't relocate opt recs */
	if (alc < ((lc + 1) / 2)) {
		md_set_setstatus(setno, MD_SET_TOOFEW);
		single_thread_end(s);
		mddb_setexit(s);
		return (0);
	}

	single_thread_end(s);
	mddb_setexit(s);
	return (0);
}

/*
 * Check if incore mddb on master node matches ondisk mddb.
 * If not, master writes out incore view to all mddbs.
* Have previously verified that master is an owner of the * diskset (master has snarfed diskset) and that diskset is * not stale. * * Meant to be called during reconfig cycle during change of master. * Previous master in diskset may have changed the mddb and * panic'd before relaying information to slave nodes. New * master node just writes out its incore view of the mddb and * the replay of the change log will resync all the nodes. * * Only supported for MN disksets. * * Return values: * 0 - success * non-zero - failure */ int mddb_check_write_ioctl(mddb_config_t *info) { int err = 0; set_t setno = info->c_setno; mddb_set_t *s; int li; mddb_locator_t *lp; mddb_lb_t *lbp; mddb_mnlb_t *mnlbp_od; mddb_ln_t *lnp; mddb_mnln_t *mnlnp_od; mddb_db_t *dbp; mddb_de_ic_t *dep; int write_out_mddb; md_error_t *ep = &info->c_mde; int mddb_err = 0; int prev_li = 0; int rval = 0; int alc, lc; int mddbs_present = 0; /* Verify that setno is in valid range */ if (setno >= md_nsets) return (EINVAL); if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0) return (0); if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) { return (mddbstatus2error(ep, err, NODEV32, setno)); } /* Calling diskset must be a MN diskset */ if (!(MD_MNSET_SETNO(setno))) { mddb_setexit(s); return (EINVAL); } /* Re-verify that set is not stale */ if (md_get_setstatus(setno) & MD_SET_STALE) { mddb_setexit(s); return (mdmddberror(ep, MDE_DB_STALE, NODEV32, setno)); } lbp = s->s_lbp; lnp = s->s_lnp; /* * Previous master could have died during the write of data to * the mddbs so that the ondisk mddbs may not be consistent. * So, need to check the contents of the first and last active mddb * to see if the mddbs need to be rewritten. */ for (li = 0; li < lbp->lb_loccnt; li++) { int checkcopy_err; lp = &lbp->lb_locators[li]; /* Find replica that is active */ if (lp->l_flags & MDDB_F_DELETED) continue; mddbs_present = 1; if (! 
(lp->l_flags & MDDB_F_ACTIVE)) continue; if (s->s_mbiarray[li] == NULL) continue; /* Check locator block */ mnlbp_od = (mddb_mnlb_t *)kmem_zalloc(dbtob(MDDB_MNLBCNT), KM_SLEEP); /* read in on-disk locator block */ err = readblks(s, (caddr_t)mnlbp_od, 0, lbp->lb_blkcnt, li); /* If err, try next mddb */ if (err) { kmem_free(mnlbp_od, dbtob(MDDB_MNLBCNT)); continue; } /* * We resnarf all changelog entries for this set. * They may have been altered by the previous master */ for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { if ((dep->de_flags & MDDB_F_CHANGELOG) == 0) { continue; } /* * This has been alloc'ed while * joining the set */ if (dep->de_rb) { kmem_free(dep->de_rb, dep->de_recsize); dep->de_rb = (mddb_rb32_t *)NULL; } if (dep->de_rb_userdata) { kmem_free(dep->de_rb_userdata, dep->de_reqsize); dep->de_rb_userdata = (caddr_t)NULL; } err = getrecord(s, dep, li); if (err) { /* * When we see on error while reading * the changelog entries, we move on * to the next mddb */ err = 1; break; /* out of inner for-loop */ } allocuserdata(dep); } if (err) break; /* out of outer for-loop */ } /* If err, try next mddb */ if (err) { kmem_free(mnlbp_od, dbtob(MDDB_MNLBCNT)); continue; } /* Is incore locator block same as ondisk? */ if (bcmp((mddb_mnlb_t *)lbp, mnlbp_od, dbtob(MDDB_MNLBCNT)) == 1) { write_out_mddb = 1; kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT)); break; } kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT)); /* If lb ok, check locator names */ mnlnp_od = (mddb_mnln_t *)kmem_zalloc(dbtob(MDDB_MNLNCNT), KM_SLEEP); /* read in on-disk locator names */ err = readblks(s, (caddr_t)mnlnp_od, lbp->lb_lnfirstblk, lbp->lb_lnblkcnt, li); /* If err, try next mddb */ if (err) { kmem_free(mnlnp_od, dbtob(MDDB_MNLNCNT)); continue; } /* Are incore locator names same as ondisk? 
*/ if (bcmp((mddb_mnln_t *)lnp, mnlnp_od, dbtob(MDDB_MNLNCNT)) == 1) { kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT)); write_out_mddb = 1; break; } kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT)); /* * Check records in mddb. * If a read error is encountered, set the error flag and * continue to the next mddb. Otherwise, if incore data is * different from ondisk, then set the flag to write out * the mddb and break out. */ checkcopy_err = checkcopy(s, li); if (checkcopy_err == MDDB_F_EREAD) { lp->l_flags |= MDDB_F_EREAD; mddb_err = 1; continue; } else if (checkcopy_err == 1) { write_out_mddb = 1; break; } /* * Have found first active mddb and the data is the same as * incore - break out of loop */ write_out_mddb = 0; break; } /* * Skip checking for last active mddb if: * - already found a mismatch in the first active mddb * (write_out_mddb is 1) OR * - didn't find a readable mddb when looking for first * active mddb (there are mddbs present but all failed * when read was attempted). * * In either case, go to write_out_mddb label in order to attempt * to write out the data. If < 50% mddbs are available, panic. */ if ((write_out_mddb == 1) || ((li == lbp->lb_loccnt) && mddbs_present)) { write_out_mddb = 1; goto write_out_mddb; } /* * Save which index was checked for the first active mddb. If only 1 * active mddb, don't want to recheck the same mddb when looking for * last active mddb. */ prev_li = li; /* * Now, checking for last active mddb. If found same index as before * (only 1 active mddb), then skip. */ for (li = (lbp->lb_loccnt - 1); li >= 0; li--) { int checkcopy_err; lp = &lbp->lb_locators[li]; /* Find replica that is active */ if (! 
(lp->l_flags & MDDB_F_ACTIVE)) continue; if (lp->l_flags & MDDB_F_DELETED) continue; if (s->s_mbiarray[li] == NULL) continue; /* If already checked mddb, bail out */ if (li == prev_li) break; /* Check locator block */ mnlbp_od = (mddb_mnlb_t *)kmem_zalloc(dbtob(MDDB_MNLBCNT), KM_SLEEP); /* read in on-disk locator block */ err = readblks(s, (caddr_t)mnlbp_od, 0, lbp->lb_blkcnt, li); /* If err, try next mddb */ if (err) { kmem_free(mnlbp_od, dbtob(MDDB_MNLBCNT)); continue; } /* Is incore locator block same as ondisk? */ if (bcmp((mddb_mnlb_t *)lbp, mnlbp_od, dbtob(MDDB_MNLBCNT)) == 1) { kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT)); write_out_mddb = 1; break; } kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT)); /* If lb ok, check locator names */ mnlnp_od = (mddb_mnln_t *) kmem_zalloc(dbtob(MDDB_MNLNCNT), KM_SLEEP); /* read in on-disk locator names */ err = readblks(s, (caddr_t)mnlnp_od, lbp->lb_lnfirstblk, lbp->lb_lnblkcnt, li); /* If err, try next mddb */ if (err) { kmem_free(mnlnp_od, dbtob(MDDB_MNLNCNT)); continue; } /* Are incore locator names same as ondisk? */ if (bcmp((mddb_mnln_t *)lnp, mnlnp_od, dbtob(MDDB_MNLNCNT)) == 1) { kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT)); write_out_mddb = 1; break; } kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT)); /* * Check records in mddb. * If a read error is encountered, set the error flag and * continue to the next mddb. Otherwise, if incore data is * different from ondisk, then set the flag to write out * the mddb and break out. */ checkcopy_err = checkcopy(s, li); if (checkcopy_err == MDDB_F_EREAD) { lp->l_flags |= MDDB_F_EREAD; mddb_err = 1; continue; } else if (checkcopy_err == 1) { write_out_mddb = 1; break; } /* * Have found last active mddb and the data is the same as * incore - break out of loop */ write_out_mddb = 0; break; } /* * If ondisk and incore versions of the mddb don't match, then * write out this node's incore version to disk. 
* Or, if unable to read a copy of the mddb, attempt to write * out a new one. */ write_out_mddb: if (write_out_mddb) { /* Recompute free blocks based on incore information */ computefreeblks(s); /* set up free block bits */ /* * Write directory entries and record blocks. * Use flag MDDB_WRITECOPY_SYNC so that writecopy * routine won't write out change log records. */ for (li = 0; li < lbp->lb_loccnt; li++) { lp = &lbp->lb_locators[li]; /* Don't write to inactive or deleted mddbs */ if (! (lp->l_flags & MDDB_F_ACTIVE)) continue; if (lp->l_flags & MDDB_F_DELETED) continue; if (s->s_mbiarray[li] == NULL) continue; /* If encounter a write error, save it for later */ if (writecopy(s, li, MDDB_WRITECOPY_SYNC)) { lp->l_flags |= MDDB_F_EWRITE; mddb_err = 1; } } /* * Write out locator blocks to all replicas. * push_lb will set MDDB_F_EWRITE on replicas that fail. */ if (push_lb(s)) mddb_err = 1; (void) upd_med(s, "mddb_check_write_ioctl(0)"); /* Write out locator names to all replicas */ lnp = s->s_lnp; uniqtime32(&lnp->ln_timestamp); lnp->ln_revision = MDDB_REV_MNLN; crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL); /* writeall sets MDDB_F_EWRITE if writes fails to replica */ if (writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk, lbp->lb_lnblkcnt, 0)) mddb_err = 1; /* * The writes to the replicas above would have set * the MDDB_F_EWRITE flags if any write error was * encountered. * If < 50% of the mddbs are available, panic. */ lc = alc = 0; for (li = 0; li < lbp->lb_loccnt; li++) { lp = &lbp->lb_locators[li]; if (lp->l_flags & MDDB_F_DELETED) continue; lc++; /* * If mddb: * - is not active (previously had an error) * - had an error reading the master blocks or * - had an error in writing to the mddb * then don't count this mddb in the active count. */ if (! 
(lp->l_flags & MDDB_F_ACTIVE) || (lp->l_flags & MDDB_F_EMASTER) || (lp->l_flags & MDDB_F_EWRITE)) continue; alc++; } if (alc < ((lc + 1) / 2)) { cmn_err(CE_PANIC, "md: Panic due to lack of DiskSuite state\n" " database replicas. Fewer than 50%% of " "the total were available,\n so panic to " "ensure data integrity."); } } /* * If encountered an error during checking or writing of * mddbs, call selectreplicas so that replica error can * be properly handled. This will involve another attempt * to write the mddb out to any mddb marked MDDB_F_EWRITE. * If mddb still fails, it will have the MDDB_F_ACTIVE bit * turned off. Set the MDDB_SCANALLSYNC flag so that * selectreplicas doesn't overwrite the change log entries. * * Set the PARSE_LOCBLK flag in the mddb_set structure to show * that the locator block has been changed. */ if (mddb_err) { (void) selectreplicas(s, MDDB_SCANALLSYNC); s->s_mn_parseflags |= MDDB_PARSE_LOCBLK; } write_out_end: mddb_setexit(s); return (rval); } /* * Set/reset/get set flags in set structure. * Used during reconfig cycle * Only supported for MN disksets. * * Return values: * 0 - success * non-zero - failure */ int mddb_setflags_ioctl(mddb_setflags_config_t *info) { set_t setno = info->sf_setno; /* Verify that setno is in valid range */ if (setno >= md_nsets) return (EINVAL); /* * When setting the flags, the set may not * be snarfed yet. So, don't check for SNARFED or MNset * and don't call mddb_setenter. * In order to discourage bad ioctl calls, * verify that magic field in structure is set correctly. 
*/ if (info->sf_magic != MDDB_SETFLAGS_MAGIC) return (EINVAL); switch (info->sf_flags) { case MDDB_NM_SET: if (info->sf_setflags & MD_SET_MN_NEWMAS_RC) md_set_setstatus(setno, MD_SET_MN_NEWMAS_RC); if (info->sf_setflags & MD_SET_MN_START_RC) md_set_setstatus(setno, MD_SET_MN_START_RC); if (info->sf_setflags & MD_SET_MN_MIR_STATE_RC) md_set_setstatus(setno, MD_SET_MN_MIR_STATE_RC); break; case MDDB_NM_RESET: if (info->sf_setflags & MD_SET_MN_NEWMAS_RC) md_clr_setstatus(setno, MD_SET_MN_NEWMAS_RC); if (info->sf_setflags & MD_SET_MN_START_RC) md_clr_setstatus(setno, MD_SET_MN_START_RC); if (info->sf_setflags & MD_SET_MN_MIR_STATE_RC) md_clr_setstatus(setno, MD_SET_MN_MIR_STATE_RC); break; case MDDB_NM_GET: info->sf_setflags = md_get_setstatus(setno) & (MD_SET_MN_NEWMAS_RC|MD_SET_MN_START_RC| MD_SET_MN_MIR_STATE_RC); break; } return (0); } /* * md_update_minor * * This function updates the minor in the namespace entry for an * underlying metadevice. The function is called in mod_imp_set * where mod is sp, stripe, mirror and raid. * */ int md_update_minor( set_t setno, side_t side, mdkey_t key ) { struct nm_next_hdr *nh; struct nm_name *n; char *shn; int retval = 1; side_t s; /* * Load the devid name space if it exists */ (void) md_load_namespace(setno, NULL, NM_DEVID); if (! 
md_load_namespace(setno, NULL, 0L)) { /* * Unload the devid namespace */ (void) md_unload_namespace(setno, NM_DEVID); return (0); } rw_enter(&nm_lock.lock, RW_READER); if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) == NULL) { retval = 0; goto out; } /* * Look up the key */ for (s = 0; s < MD_MAXSIDES; s++) { /* * For side other than the import 'side', cleanup its entry */ if ((n = lookup_entry(nh, setno, s, key, NODEV64, 0L)) != NULL) { if (n->n_side == side) { /* * Update its n_minor if metadevice */ if (((shn = (char *)getshared_name(setno, n->n_drv_key, 0L)) != NULL) && (strcmp(shn, "md") == 0)) { n->n_minor = MD_MKMIN(setno, MD_MIN2UNIT(n->n_minor)); } } else { /* We are not the import side, cleanup */ (void) remove_entry(nh, n->n_side, key, 0L); } } } out: rw_exit(&nm_lock.lock); return (retval); } /* * md_update_top_device_minor * * This function updates the minor in the namespace entry for a top * level metadevice. The function is called in mod_imp_set where * mod is sp, stripe, mirror and raid. * */ int md_update_top_device_minor( set_t setno, side_t side, md_dev64_t dev ) { struct nm_next_hdr *nh; struct nm_name *n; char *shn; int retval = 1; /* * Load the devid name space if it exists */ (void) md_load_namespace(setno, NULL, NM_DEVID); if (! 
md_load_namespace(setno, NULL, 0L)) { /* * Unload the devid namespace */ (void) md_unload_namespace(setno, NM_DEVID); return (0); } rw_enter(&nm_lock.lock, RW_READER); if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) == NULL) { retval = 0; goto out; } /* * Look up the key */ if ((n = lookup_entry(nh, setno, side, MD_KEYWILD, dev, 0L)) != NULL) { /* * Find the entry, update its n_minor if metadevice */ if ((shn = (char *)getshared_name(setno, n->n_drv_key, 0L)) == NULL) { retval = 0; goto out; } if (strcmp(shn, "md") == 0) { n->n_minor = MD_MKMIN(setno, MD_MIN2UNIT(n->n_minor)); } } out: rw_exit(&nm_lock.lock); return (retval); } static void md_imp_nm( mddb_set_t *s ) { mddb_db_t *dbp; mddb_de_ic_t *dep; struct nm_rec_hdr *hdr; struct nm_header *hhdr; set_t setno = s->s_setno; for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next) { switch (dep->de_type1) { case MDDB_NM_HDR: case MDDB_DID_NM_HDR: hhdr = (struct nm_header *) dep->de_rb_userdata; hdr = &hhdr->h_names; if (hdr->r_next_recid > 0) { hdr->r_next_recid = MAKERECID(setno, DBID(hdr->r_next_recid)); } hdr = &hhdr->h_shared; if (hdr->r_next_recid > 0) { hdr->r_next_recid = MAKERECID(setno, DBID(hdr->r_next_recid)); } break; case MDDB_NM: case MDDB_DID_NM: case MDDB_SHR_NM: case MDDB_DID_SHR_NM: hdr = (struct nm_rec_hdr *) dep->de_rb_userdata; if (hdr->r_next_recid > 0) { hdr->r_next_recid = MAKERECID (setno, DBID(hdr->r_next_recid)); } break; default: break; } } } } static int update_db_rec( mddb_set_t *s ) { mddb_db_t *dbp; mddb_de_ic_t *dep; mddb_recid_t ids[2]; for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next) { if (! 
(dep->de_flags & MDDB_F_OPT)) { ids[0] = MAKERECID(s->s_setno, dep->de_recid); ids[1] = 0; if (mddb_commitrecs(ids)) { return (MDDB_E_NORECORD); } } } } return (0); } static int update_mb( mddb_set_t *s ) { mddb_ri_t *rip; int err = 0; for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { if (rip->ri_flags & MDDB_F_EMASTER) /* disk is powered off or not there */ continue; if (md_get_setstatus(s->s_setno) & MD_SET_REPLICATED_IMPORT) { /* * It is a replicated set */ if (rip->ri_devid == (ddi_devid_t)NULL) { return (-1); } err = update_mb_devid(s, rip, rip->ri_devid); } else { /* * It is a non-replicated set * and there is no need to update * devid */ err = update_mb_devid(s, rip, NULL); } if (err) return (err); } return (0); } static int update_setname( set_t setno ) { struct nm_next_hdr *nh; struct nm_shared_name *shn, *new_shn; char *prefix = "/dev/md/"; char *shrname; int len; mdkey_t o_key; uint32_t o_count, o_data; mddb_recid_t recid, ids[3]; int err = 0; mddb_set_t *dbp; /* Import setname */ dbp = (mddb_set_t *)md_set[setno].s_db; len = strlen(prefix) + strlen(dbp->s_setname) + strlen("/dsk/") + 1; shrname = kmem_zalloc(len, KM_SLEEP); (void) sprintf(shrname, "%s%s%s", prefix, dbp->s_setname, "/dsk/"); rw_enter(&nm_lock.lock, RW_WRITER); if ((nh = get_first_record(setno, 0, NM_SHARED)) == NULL) { /* * No namespace is okay */ err = 0; goto out; } if ((shn = (struct nm_shared_name *)lookup_shared_entry(nh, 0, prefix, NULL, NM_SHARED | NM_IMP_SHARED)) == NULL) { /* * No metadevice is okay */ err = 0; goto out; } /* * We have it, go ahead and update the namespace. 
*/ o_key = shn->sn_key; o_count = shn->sn_count; o_data = shn->sn_data; if (remove_shared_entry(nh, o_key, NULL, 0L | NM_IMP_SHARED | NM_NOCOMMIT | NM_KEY_RECYCLE)) { err = MDDB_E_NORECORD; goto out; } if ((new_shn = (struct nm_shared_name *)alloc_entry( nh, md_set[setno].s_nmid, len, NM_SHARED | NM_NOCOMMIT, &recid)) == NULL) { err = MDDB_E_NORECORD; goto out; } new_shn->sn_key = o_key; new_shn->sn_count = o_count; new_shn->sn_data = o_data; new_shn->sn_namlen = (ushort_t)len; (void) strcpy(new_shn->sn_name, shrname); ids[0] = recid; ids[1] = md_set[setno].s_nmid; ids[2] = 0; err = mddb_commitrecs(ids); out: if (shrname) kmem_free(shrname, len); rw_exit(&nm_lock.lock); return (err); } /* * Returns 0 on success. * Returns -1 on failure with ep filled in. */ static int md_imp_db( set_t setno, int stale_flag, md_error_t *ep ) { mddb_set_t *s; int err = 0; mddb_dt_t *dtp; mddb_lb_t *lbp; int i; int loccnt; if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) { return (mddbstatus2error(ep, err, NODEV32, setno)); } /* Update dt */ if ((dtp = (mddb_dt_t *)md_set[setno].s_dtp) != NULL) { crcgen(dtp, &dtp->dt_cks, MDDB_DT_BYTES, NULL); } if ((err = dt_write(s)) != 0) { err = mdsyserror(ep, err); mddb_setexit(s); return (err); } /* * Update lb, no need to update the mediator because * the diskset will only exist on the importing node * and as such a mediator adds no value. */ /* Update lb */ if (stale_flag & MD_IMP_STALE_SET) { lbp = s->s_lbp; loccnt = lbp->lb_loccnt; for (i = 0; i < loccnt; i++) { mddb_locator_t *lp = &lbp->lb_locators[i]; md_dev64_t ndev = md_expldev(lp->l_dev); ddi_devid_t devid_ptr; devid_ptr = s->s_did_icp->did_ic_devid[i]; if (devid_ptr == NULL) { /* * Already deleted, go to next one. 
*/ continue; } if (mddb_devid_validate((ddi_devid_t)devid_ptr, &ndev, NULL)) { /* disk unavailable, mark deleted */ lp->l_flags = MDDB_F_DELETED; /* then remove the device id from the list */ free_mbipp(&s->s_mbiarray[i]); (void) mddb_devid_delete(s, i); } } md_clr_setstatus(setno, MD_SET_STALE); } if ((err = writelocall(s)) != 0) { err = mdmddberror(ep, MDDB_E_NOTNOW, NODEV32, setno); mddb_setexit(s); return (err); } mddb_setexit(s); /* Update db records */ if ((err = update_db_rec(s)) != 0) { return (mddbstatus2error(ep, err, NODEV32, setno)); } /* Update setname embedded in the namespace */ if ((err = update_setname(setno)) != 0) return (mddbstatus2error(ep, err, NODEV32, setno)); return (err); } static void md_dr_add( md_set_record *sr, md_drive_record *dr ) { md_drive_record *drv; if (sr->sr_driverec == 0) { sr->sr_driverec = dr->dr_selfid; return; } for (drv = (md_drive_record *)mddb_getrecaddr(sr->sr_driverec); drv->dr_nextrec != 0; drv = (md_drive_record *)mddb_getrecaddr(drv->dr_nextrec)) ; drv->dr_nextrec = dr->dr_selfid; } static void md_setup_recids( md_set_record *sr, mddb_recid_t **ids, size_t size ) { md_drive_record *drv; int cnt; mddb_recid_t *recids; recids = (mddb_recid_t *)kmem_zalloc(sizeof (mddb_recid_t) * size, KM_SLEEP); recids[0] = sr->sr_selfid; cnt = 1; for (drv = (md_drive_record *)mddb_getrecaddr(sr->sr_driverec); /* CSTYLED */ drv != NULL;) { recids[cnt++] = drv->dr_selfid; if (drv->dr_nextrec != 0) drv = (md_drive_record *)mddb_getrecaddr (drv->dr_nextrec); else drv = NULL; } recids[cnt] = 0; *ids = &recids[0]; } /* * The purpose of this function is to replace the old_devid with the * new_devid in the given namespace. This is used for importing * remotely replicated drives. 
*/ int md_update_namespace_rr_did( mddb_config_t *cp ) { set_t setno = cp->c_setno; struct nm_next_hdr *nh; mdkey_t key = MD_KEYWILD; side_t side = MD_SIDEWILD; mddb_recid_t recids[3]; struct did_min_name *n; struct nm_next_hdr *did_shr_nh; struct did_shr_name *shr_n; mdkey_t ent_did_key; uint32_t ent_did_count; uint32_t ent_did_data; ddi_devid_t devid = NULL; struct did_shr_name *shn; void *old_devid, *new_devid; if (!(md_get_setstatus(setno) & MD_SET_NM_LOADED)) return (EIO); old_devid = (void *)(uintptr_t)cp->c_locator.l_old_devid; new_devid = (void *)(uintptr_t)cp->c_locator.l_devid; /* * It is okay if we dont have any configuration */ if ((nh = get_first_record(setno, 0, NM_DEVID | NM_NOTSHARED)) == NULL) { return (0); } while ((key = md_getnextkey(setno, side, key, NULL)) != MD_KEYWILD) { /* check out every entry in the namespace */ if ((n = (struct did_min_name *)lookup_entry(nh, setno, side, key, NODEV64, NM_DEVID)) == NULL) { continue; } else { did_shr_nh = get_first_record(setno, 0, NM_DEVID | NM_SHARED); if (did_shr_nh == NULL) { return (ENOENT); } shr_n = (struct did_shr_name *)lookup_shared_entry( did_shr_nh, n->min_devid_key, (char *)0, &recids[0], NM_DEVID); if (shr_n == NULL) { return (ENOENT); } rw_enter(&nm_lock.lock, RW_WRITER); devid = (ddi_devid_t)shr_n->did_devid; /* find this devid in the incore replica */ if (ddi_devid_compare(devid, old_devid) == 0) { /* * found the corresponding entry * update with new devid */ /* first remove old devid info */ ent_did_key = shr_n ->did_key; ent_did_count = shr_n->did_count; ent_did_data = shr_n->did_data; (void) remove_shared_entry(did_shr_nh, shr_n->did_key, NULL, NM_DEVID | NM_IMP_SHARED | NM_KEY_RECYCLE); /* add in new devid info */ if ((shn = (struct did_shr_name *) alloc_entry(did_shr_nh, md_set[setno].s_did_nmid, cp->c_locator.l_devid_sz, NM_DEVID | NM_SHARED | NM_NOCOMMIT, &recids[0])) == NULL) { rw_exit(&nm_lock.lock); return (ENOMEM); } shn->did_key = ent_did_key; shn->did_count = ent_did_count; 
ent_did_data |= NM_DEVID_VALID; shn->did_data = ent_did_data; shn->did_size = ddi_devid_sizeof( new_devid); bcopy((void *)new_devid, (void *) shn->did_devid, shn->did_size); recids[1] = md_set[setno].s_nmid; recids[2] = 0; mddb_commitrecs_wrapper(recids); } rw_exit(&nm_lock.lock); } } return (0); } /* * namespace is loaded before this is called. * This function is a wrapper for md_update_namespace_rr_did. * * md_update_namespace_rr_did may be called twice if attempting to * resolve a replicated device id during the take of a diskset - once * for the diskset namespace and a second time for the local namespace. * The local namespace would need to be updated when a drive has been * found during a take of the diskset that hadn't been resolved during * the import (aka partial replicated import). * * If being called during the import of the diskset (IMPORT flag set) * md_update_namespace_rr_did will only be called once with the disket * namespace. */ int md_update_nm_rr_did_ioctl( mddb_config_t *cp ) { int rval = 0; /* If update of diskset namespace fails, stop and return failure */ if ((rval = md_update_namespace_rr_did(cp)) != 0) return (rval); if (cp->c_flags & MDDB_C_IMPORT) return (0); /* If update of local namespace fails, return failure */ cp->c_setno = MD_LOCAL_SET; rval = md_update_namespace_rr_did(cp); return (rval); } /*ARGSUSED*/ int md_imp_snarf_set( mddb_config_t *cp ) { set_t setno; int stale_flag; mddb_set_t *s; int i, err = 0; md_ops_t *ops; md_error_t *ep = &cp->c_mde; setno = cp->c_setno; stale_flag = cp->c_flags; mdclrerror(ep); if (setno >= md_nsets) { return (mdsyserror(ep, EINVAL)); } md_haltsnarf_enter(setno); if (md_get_setstatus(setno) & MD_SET_IMPORT) { goto out; } /* Set the bit first otherwise load_old_replicas can fail */ md_set_setstatus(setno, MD_SET_IMPORT); if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) { err = mddbstatus2error(ep, err, NODEV32, setno); goto out; } /* * Upon completion of load_old_replicas, the old setno 
is * restored from the disk so we need to reset */ s->s_lbp->lb_setno = setno; /* * Fixup the NM records before loading namespace */ (void) md_imp_nm(s); mddb_setexit(s); /* * Load the devid name space if it exists * and ask each module to fixup unit records */ if (!md_load_namespace(setno, NULL, NM_DEVID)) { err = mdsyserror(ep, ENOENT); goto cleanup; } if (!md_load_namespace(setno, NULL, 0L)) { (void) md_unload_namespace(setno, NM_DEVID); err = mdsyserror(ep, ENOENT); goto cleanup; } do { i = 0; for (ops = md_opslist; ops != NULL; ops = ops->md_next) if (ops->md_imp_set != NULL) i += ops->md_imp_set(setno); } while (i); /* * Fixup * (1) locator block * (2) locator name block if necessary * (3) master block * (4) directory block * calls appropriate writes to push changes out */ if ((err = md_imp_db(setno, stale_flag, ep)) != 0) { goto cleanup; } /* * Don't unload namespace if importing a replicated diskset. * Namespace will be unloaded with an explicit RELEASE_SET ioctl. */ if (md_get_setstatus(s->s_setno) & MD_SET_REPLICATED_IMPORT) { md_haltsnarf_exit(setno); return (err); } cleanup: /* * Halt the set */ rw_enter(&md_unit_array_rw.lock, RW_WRITER); (void) md_halt_set(setno, MD_HALT_ALL); rw_exit(&md_unit_array_rw.lock); /* * Unload the namespace for the imported set */ mutex_enter(&mddb_lock); mddb_unload_set(setno); mutex_exit(&mddb_lock); out: md_haltsnarf_exit(setno); md_clr_setstatus(setno, MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT); return (err); } #endif /* MDDB_FAKE */