/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * NAME:	raid_ioctl.c
 *
 * DESCRIPTION: RAID driver source file containing IOCTL operations.
 *
 * ROUTINES PROVIDED FOR EXTERNAL USE:
 *	raid_commit() - commits MD database updates for a RAID metadevice
 *	md_raid_ioctl() - RAID metadevice IOCTL operations entry point.
 *
 * ROUTINES PROVIDED FOR INTERNAL USE:
 *	raid_getun() - Performs unit checking on a RAID metadevice
 *	init_col_nextio() - normal backend when zeroing column of RAID metadevice.
 *	init_col_int() - I/O interrupt while zeroing column of RAID metadevice.
 *	raid_init_columns() - Zero one or more columns of a RAID metadevice.
 *	raid_set() - used to create a RAID metadevice
 *	raid_get() - used to get the unit structure of a RAID metadevice
 *	raid_replace() - used to replace a component of a RAID metadevice
 *	raid_grow() - Concatenate to a RAID metadevice
 *	raid_change() - change dynamic values of a RAID metadevice
 *	raid_reset() - used to reset (clear / remove) a RAID metadevice
 *	raid_get_geom() - used to get the geometry of a RAID metadevice
 *	raid_get_vtoc() - used to get the VTOC on a RAID metadevice
 *	raid_set_vtoc() - used to set the VTOC on a RAID metadevice
 *	raid_get_extvtoc() - used to get the extended VTOC on a RAID metadevice
 *	raid_set_extvtoc() - used to set the extended VTOC on a RAID metadevice
 *	raid_getdevs() - return all devices within a RAID metadevice
 *	raid_admin_ioctl() - IOCTL operations unique to metadevices and RAID
 */


#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/user.h>
#include <sys/uio.h>
#include <sys/t_lock.h>
#include <sys/buf.h>
#include <sys/dkio.h>
#include <sys/vtoc.h>
#include <sys/kmem.h>
#include <vm/page.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/mkdev.h>
#include <sys/stat.h>
#include <sys/open.h>
#include <sys/disp.h>
#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/cred.h>
#include <sys/lvm/mdvar.h>
#include <sys/lvm/md_names.h>
#include <sys/lvm/md_mddb.h>
#include <sys/lvm/md_raid.h>
#include <sys/lvm/md_convert.h>

#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/svm.h>

extern int		md_status;
extern unit_t		md_nunits;
extern set_t		md_nsets;
extern md_set_t		md_set[];
extern md_ops_t		raid_md_ops;
extern major_t		md_major;
extern md_krwlock_t	md_unit_array_rw;
extern mdq_anchor_t	md_done_daemon;
extern mdq_anchor_t	md_ff_daemonq;
extern int		mdopen();
extern int		mdclose();
extern void		md_probe_one();
extern int		md_init_probereq(md_probedev_impl_t *,
			    daemon_queue_t **);
extern md_resync_t	md_cpr_resync;


extern void		dump_mr_unit(mr_unit_t *);

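/*
 * Per-column context used by the background column initialization
 * (zeroing) I/O chains.  One raid_ci_t is allocated for each column of a
 * RAID metadevice that is in the init state; init_col_int() and
 * init_col_nextio() advance the I/O described by this structure.
 */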
typedef struct raid_ci {
	DAEMON_QUEUE
	struct raid_ci	*ci_next;
	mr_unit_t	*ci_un;
	int		ci_col;
	int		ci_err;
	int		ci_flag;
	size_t		ci_zerosize;
	diskaddr_t	ci_blkno;
	diskaddr_t	ci_lastblk;
	buf_t		ci_buf;
} raid_ci_t;
/* values for the ci_flag */
#define	COL_INITING	(0x0001)
#define	COL_INIT_DONE	(0x0002)
#define	COL_READY	(0x0004)

/*
 * NAME:	raid_getun
 * DESCRIPTION: performs unit checking on a RAID metadevice
 * PARAMETERS:	minor_t	    mnum  - minor device number for RAID unit
 *		md_error_t *mde   - pointer to error reporting structure
 *		int	    flags - unit checking and locking options:
 *				STALE_OK - allow stale MD memory
 *				  NO_OLD - unit must not exist
 *				 NO_LOCK - no IOCTL lock needed
 *				 WR_LOCK - write IOCTL lock needed
 *				 RD_LOCK - read IOCTL lock needed
 *		IOLOCK	   *lock  - pointer to IOCTL lock
 *
 * LOCKS:	obtains unit reader or writer lock via IOLOCK
 *
 */
static mr_unit_t *
raid_getun(minor_t mnum, md_error_t *mde, int flags, IOLOCK *lock)
{
	mr_unit_t	*un;
	mdi_unit_t	*ui;
	set_t		setno = MD_MIN2SET(mnum);

	if ((setno >= md_nsets) || (MD_MIN2UNIT(mnum) >= md_nunits)) {
		(void) mdmderror(mde, MDE_INVAL_UNIT, mnum);
		return (NULL);
	}

	if (!(flags & STALE_OK)) {
		if (md_get_setstatus(setno) & MD_SET_STALE) {
			(void) mdmddberror(mde, MDE_DB_STALE, mnum, setno);
			return (NULL);
		}
	}

	ui = MDI_UNIT(mnum);
	if (flags & NO_OLD) {
		if (ui != NULL) {
			(void) mdmderror(mde, MDE_UNIT_ALREADY_SETUP, mnum);
			return (NULL);
		}
		return ((mr_unit_t *)1);
	}

	if (ui == NULL) {
		(void) mdmderror(mde, MDE_UNIT_NOT_SETUP, mnum);
		return (NULL);
	}
	if (flags & ARRAY_WRITER)
		md_array_writer(lock);
	else if (flags & ARRAY_READER)
		md_array_reader(lock);

	if (!(flags & NO_LOCK)) {
		if (flags & WR_LOCK) {
			(void) md_ioctl_io_lock(lock, ui);
			(void) md_ioctl_writerlock(lock, ui);
		} else /* RD_LOCK */
			(void) md_ioctl_readerlock(lock, ui);
	}
	un = (mr_unit_t *)MD_UNIT(mnum);

	if (un->c.un_type != MD_METARAID) {
		(void) mdmderror(mde, MDE_NOT_RAID, mnum);
		return (NULL);
	}

	return (un);
}


/*
 * NAME:	raid_commit
 * DESCRIPTION: commits MD database updates for a RAID metadevice
 * PARAMETERS:	mr_unit_t	 *un - RAID unit to update in the MD database
 *		mddb_recid_t *extras - array of other record IDs to update
 *
 * LOCKS:	assumes caller holds unit writer lock
 *
 */
void
raid_commit(mr_unit_t *un, mddb_recid_t *extras)
{
	mddb_recid_t	*recids;
	int		ri = 0;
	int		nrecids = 0;

	if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)
		return;

	/* Count the extra recids */
	if (extras != NULL) {
		while (extras[nrecids] != 0) {
			nrecids++;
		}
	}

	/*
	 * Allocate space for two recids in addition to the extras:
	 * one for the unit structure, one for the null terminator.
	 */
	nrecids += 2;
	recids = (mddb_recid_t *)
	    kmem_zalloc(nrecids * sizeof (mddb_recid_t), KM_SLEEP);

	if (un != NULL) {
		ASSERT(MDI_UNIT(MD_SID(un)) ?
		    UNIT_WRITER_HELD(un) : 1);
		recids[ri++] = un->c.un_record_id;
	}

	if (extras != NULL) {
		while (*extras != 0) {
			recids[ri++] = *extras;
			extras++;
		}
	}

	if (ri > 0) {
		mddb_commitrecs_wrapper(recids);
	}

	kmem_free(recids, nrecids * sizeof (mddb_recid_t));
}

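/*
 * NAME:	raid_check_pw
 * DESCRIPTION: reads the first block of each column's pre-write area and
 *		verifies that the column and unit numbers recorded in the
 *		pre-write header match this unit; returns nonzero if any
 *		column fails the check or cannot be read.
 * PARAMETERS:	mr_unit_t *un - RAID unit whose pre-write headers are checked
 *
 * LOCKS:	none
 *
 */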
static int
raid_check_pw(mr_unit_t *un)
{
	buf_t		bp;
	char		*buf;
	mr_column_t	*colptr;
	minor_t		mnum = MD_SID(un);
	int		i;
	int		err = 0;
	minor_t		unit;

	buf = kmem_zalloc((uint_t)DEV_BSIZE, KM_SLEEP);

	for (i = 0; i < un->un_totalcolumncnt; i++) {
		md_dev64_t tmpdev;

		colptr = &un->un_column[i];

		tmpdev = colptr->un_dev;
		/*
		 * Open by device id
		 * If this device is hotspared
		 * use the hotspare key
		 */
		tmpdev = md_resolve_bydevid(mnum, tmpdev, HOTSPARED(un, i) ?
		    colptr->un_hs_key : colptr->un_orig_key);
		if (md_layered_open(mnum, &tmpdev, MD_OFLG_NULL)) {
			colptr->un_dev = tmpdev;
			return (1);
		}
		colptr->un_dev = tmpdev;

		bzero((caddr_t)&bp, sizeof (buf_t));
		bp.b_back = &bp;
		bp.b_forw = &bp;
		bp.b_flags = B_READ | B_BUSY;
		sema_init(&bp.b_io, 0, NULL,
		    SEMA_DEFAULT, NULL);
		sema_init(&bp.b_sem, 0, NULL,
		    SEMA_DEFAULT, NULL);
		bp.b_edev = md_dev64_to_dev(colptr->un_dev);
		bp.b_lblkno = colptr->un_pwstart;
		bp.b_bcount = DEV_BSIZE;
		bp.b_bufsize = DEV_BSIZE;
		bp.b_un.b_addr = (caddr_t)buf;
		bp.b_offset = -1;
		(void) md_call_strategy(&bp, 0, NULL);
		if (biowait(&bp))
			err = 1;
		if (i == 0) {
			if (un->c.un_revision & MD_64BIT_META_DEV) {
				unit = ((raid_pwhdr_t *)buf)->rpw_unit;
			} else {
				unit = ((raid_pwhdr32_od_t *)buf)->rpw_unit;
			}
		}
		/*
		 * Depending upon whether this is a 64-bit or 32-bit RAID,
		 * the pre-write headers have a different layout.
		 */
		if (un->c.un_revision & MD_64BIT_META_DEV) {
			if ((((raid_pwhdr_t *)buf)->rpw_column != i) ||
			    (((raid_pwhdr_t *)buf)->rpw_unit != unit))
				err = 1;
		} else {
			if ((((raid_pwhdr32_od_t *)buf)->rpw_column != i) ||
			    (((raid_pwhdr32_od_t *)buf)->rpw_unit != unit))
				err = 1;
		}
		md_layered_close(colptr->un_dev, MD_OFLG_NULL);
		if (err)
			break;
	}
	kmem_free(buf, DEV_BSIZE);
	return (err);
}

/*
 * NAME:	init_col_nextio
 * DESCRIPTION: normal backend process when zeroing a column of a RAID
 *		metadevice.
 * PARAMETERS:	raid_ci_t *cur - struct for column being zeroed
 *
 * LOCKS:	assumes caller holds unit reader lock,
 *		periodically releases and reacquires unit reader lock,
 *		broadcasts on unit conditional variable (un_cv)
 *
 */
#define	INIT_RLS_CNT	10
static void
init_col_nextio(raid_ci_t *cur)
{
	mr_unit_t	*un;

	un = cur->ci_un;

	cur->ci_blkno += cur->ci_zerosize;

	mutex_enter(&un->un_mx);
	/* ===> update un_percent_done */
	un->un_init_iocnt += btodb(cur->ci_buf.b_bcount);
	mutex_exit(&un->un_mx);

	/*
	 * When growing a device, normal I/O is still going on.
	 * The init thread still holds the unit reader lock which
	 * prevents I/O from doing state changes.
	 * So every INIT_RLS_CNT init I/Os, we will release the
	 * unit reader lock.
	 *
	 * CAVEAT:
	 * We know we are in the middle of a grow operation and the
	 * unit cannot be grown or removed (through reset or halt)
	 * so the mr_unit_t structure will not move or disappear.
	 * In addition, we know that only one of the init I/Os
	 * can be in col_init_nextio at a time because they are
	 * placed on the md_done_daemon queue and md only processes
	 * one element of this queue at a time.  In addition, any
	 * code that needs to acquire the unit writer lock to change
	 * state is supposed to be on the md_mstr_daemon queue so
	 * it can be processing while we sit here waiting to get the
	 * unit reader lock back.
	 */

	if (cur->ci_blkno < cur->ci_lastblk) {
		/* truncate last chunk to end_addr if needed */
		if (cur->ci_blkno + cur->ci_zerosize > cur->ci_lastblk) {
			cur->ci_zerosize = (size_t)
			    (cur->ci_lastblk - cur->ci_blkno);
		}

		/* set address and length for I/O bufs */
		cur->ci_buf.b_bufsize = dbtob(cur->ci_zerosize);
		cur->ci_buf.b_bcount = dbtob(cur->ci_zerosize);
		cur->ci_buf.b_lblkno = cur->ci_blkno;

		(void) md_call_strategy(&cur->ci_buf, MD_STR_NOTTOP, NULL);
		return;
	}
	/* finished initializing this column */
	mutex_enter(&un->un_mx);
	cur->ci_flag = COL_INIT_DONE;
	uniqtime32(&un->un_column[cur->ci_col].un_devtimestamp);
	mutex_exit(&un->un_mx);
	cv_broadcast(&un->un_cv);
}

/*
 * NAME:	init_col_int
 * DESCRIPTION: I/O interrupt while zeroing a column of a RAID metadevice.
 * PARAMETERS:	buf_t *cb - I/O buffer for which interrupt occurred
 *
 * LOCKS:	assumes caller holds unit reader or writer lock
 *
 */
static int
init_col_int(buf_t *cb)
{
	raid_ci_t	*cur;

	cur = (raid_ci_t *)cb->b_chain;
	if (cb->b_flags & B_ERROR) {
		mutex_enter(&cur->ci_un->un_mx);
		cur->ci_err = EIO;
		mutex_exit(&cur->ci_un->un_mx);
		cv_broadcast(&cur->ci_un->un_cv);
		return (1);
	}
	daemon_request(&md_done_daemon, init_col_nextio,
	    (daemon_queue_t *)cur, REQ_OLD);
	return (1);
}

/*
 * NAME:	raid_init_columns
 * DESCRIPTION: Zero one or more columns of a RAID metadevice.
 * PARAMETERS:	minor_t mnum - RAID unit minor identifier
 *
 * LOCKS:	obtains and releases unit reader lock,
 *		obtains and releases unit writer lock,
 *		obtains and releases md_unit_array_rw write lock,
 *		obtains and releases unit mutex (un_mx) lock,
 *		waits on unit conditional variable (un_cv)
 *
 */
static void
raid_init_columns(minor_t mnum)
{
	mr_unit_t	*un;
	mdi_unit_t	*ui;
	raid_ci_t	*ci_chain = NULL, *cur;
	rus_state_t	state;
	caddr_t		zero_addr;
	diskaddr_t	end_off;
	size_t		zerosize;
	int		err = 0;
	int		ix;
	int		colcnt = 0;
	int		col;
	set_t		setno = MD_MIN2SET(mnum);

	/*
	 * Increment the raid resync count for cpr
	 */
	mutex_enter(&md_cpr_resync.md_resync_mutex);
	md_cpr_resync.md_raid_resync++;
	mutex_exit(&md_cpr_resync.md_resync_mutex);

	/*
	 * Initialization is a multiple step process.  The first step
	 * is to go through the unit structure and start each device
	 * in the init state, writing zeros over the component.
	 * Next, initialize the prewrite areas so the device can be
	 * used if a metainit -k is done.  Then close the components.
	 *
	 * Once this is complete, set the state of each component being
	 * zeroed and set the correct state for the unit.
	 *
	 * Lastly, commit the records.
	 */

	ui = MDI_UNIT(mnum);
	un = md_unit_readerlock(ui);

	/* check for active init on this column */
	/* exiting is cpr safe */
	if ((un->un_init_colcnt > 0) && (un->un_resync_index != -1)) {
		md_unit_readerexit(ui);
		(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
		/*
		 * Decrement the raid resync count for cpr
		 */
		mutex_enter(&md_cpr_resync.md_resync_mutex);
		md_cpr_resync.md_raid_resync--;
		mutex_exit(&md_cpr_resync.md_resync_mutex);
		thread_exit();
	}

	SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_START, SVM_TAG_METADEVICE, setno,
	    MD_SID(un));
	un->un_init_colcnt = 0;
	un->un_init_iocnt = 0;
	end_off = un->un_pwsize + (un->un_segsize * un->un_segsincolumn);
	zerosize = (size_t)MIN((diskaddr_t)un->un_maxio, end_off);

	/* allocate zero-filled buffer */
	zero_addr = kmem_zalloc(dbtob(zerosize), KM_SLEEP);

	for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
		if (un->un_column[ix].un_devstate != RCS_INIT)
			continue;
		/* allocate new column init structure */
		cur = (raid_ci_t *)kmem_zalloc((sizeof (raid_ci_t)), KM_SLEEP);
		ASSERT(cur != NULL);
		un->un_init_colcnt++;
		cur->ci_next = ci_chain;
		ci_chain = cur;
		cur->ci_un = un;
		cur->ci_col = ix;
		cur->ci_err = 0;
		cur->ci_flag = COL_INITING;
		cur->ci_zerosize = zerosize;
		cur->ci_blkno = un->un_column[ix].un_pwstart;
		cur->ci_lastblk = cur->ci_blkno + un->un_pwsize
		    + (un->un_segsize * un->un_segsincolumn);
		/* initialize static buf fields */
		cur->ci_buf.b_un.b_addr = zero_addr;
		cur->ci_buf.b_chain = (buf_t *)cur;
		cur->ci_buf.b_back = &cur->ci_buf;
		cur->ci_buf.b_forw = &cur->ci_buf;
		cur->ci_buf.b_iodone = init_col_int;
		cur->ci_buf.b_flags = B_BUSY | B_WRITE;
		cur->ci_buf.b_edev = md_dev64_to_dev(un->un_column[ix].un_dev);
		sema_init(&cur->ci_buf.b_io, 0, NULL, SEMA_DEFAULT, NULL);
		sema_init(&cur->ci_buf.b_sem, 0, NULL, SEMA_DEFAULT, NULL);
		/* set address and length for I/O bufs */
		cur->ci_buf.b_bufsize = dbtob(zerosize);
		cur->ci_buf.b_bcount = dbtob(zerosize);
		cur->ci_buf.b_lblkno = un->un_column[ix].un_pwstart;
		cur->ci_buf.b_offset = -1;

		if (! (un->un_column[ix].un_devflags & MD_RAID_DEV_ISOPEN)) {
			md_dev64_t tmpdev = un->un_column[ix].un_dev;
			/*
			 * Open by device id
			 * If this column is hotspared then
			 * use the hotspare key
			 */
			tmpdev = md_resolve_bydevid(mnum, tmpdev,
			    HOTSPARED(un, ix) ?
			    un->un_column[ix].un_hs_key :
			    un->un_column[ix].un_orig_key);
			if ((cur->ci_err = md_layered_open(mnum, &tmpdev,
			    MD_OFLG_NULL)) == 0)
				un->un_column[ix].un_devflags |=
				    MD_RAID_DEV_ISOPEN;
			un->un_column[ix].un_dev = tmpdev;
		}
		if (cur->ci_err == 0)
			md_call_strategy(&cur->ci_buf, MD_STR_NOTTOP, NULL);
	}

	md_unit_readerexit(ui);
	state = un->un_state;
	colcnt = un->un_init_colcnt;
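	/*
	 * Wait for the per-column init I/O chains to complete.  Each chain
	 * broadcasts on un_cv when it finishes or hits an error; columns
	 * that finish cleanly get their pre-write areas initialized here.
	 */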
	mutex_enter(&un->un_mx);
	while (colcnt) {
		cv_wait(&un->un_cv, &un->un_mx);

		colcnt = 0;
		for (cur = ci_chain; cur != NULL; cur = cur->ci_next) {
			col = cur->ci_col;
			if ((cur->ci_flag != COL_INITING) || (cur->ci_err)) {
				if (cur->ci_err)
					err = cur->ci_err;
				else if (cur->ci_flag == COL_INIT_DONE) {
					(void) init_pw_area(un,
					    un->un_column[col].un_dev,
					    un->un_column[col].un_pwstart,
					    col);
					cur->ci_flag = COL_READY;
				}
			} else {
				colcnt++;
			}
		}
	}
	mutex_exit(&un->un_mx);

	/* This prevents new opens */
	rw_enter(&md_unit_array_rw.lock, RW_WRITER);
	(void) md_io_writerlock(ui);
	un = (mr_unit_t *)md_unit_writerlock(ui);
	while (ci_chain) {
		cur = ci_chain;

		/* take this element out of the chain */
		ci_chain = cur->ci_next;
		/* free this element */
		sema_destroy(&cur->ci_buf.b_io);
		sema_destroy(&cur->ci_buf.b_sem);
		if (cur->ci_err)
			raid_set_state(cur->ci_un, cur->ci_col,
			    RCS_INIT_ERRED, 0);
		else
			raid_set_state(cur->ci_un, cur->ci_col,
			    RCS_OKAY, 0);
		kmem_free(cur, sizeof (raid_ci_t));
	}

	/* free the zeroed buffer */
	kmem_free(zero_addr, dbtob(zerosize));

	/* determine new unit state */
	if (err == 0) {
		if (state == RUS_INIT)
			un->un_state = RUS_OKAY;
		else {
			un->c.un_total_blocks = un->un_grow_tb;
			md_nblocks_set(mnum, un->c.un_total_blocks);
			un->un_grow_tb = 0;
			if (raid_state_cnt(un, RCS_OKAY) ==
			    un->un_totalcolumncnt)
				un->un_state = RUS_OKAY;
		}
	} else {	/* error occurred */
		if (state & RUS_INIT)
			un->un_state = RUS_DOI;
	}
	uniqtime32(&un->un_timestamp);
	MD_STATUS(un) &= ~MD_UN_GROW_PENDING;
	un->un_init_colcnt = 0;
	un->un_init_iocnt = 0;
	raid_commit(un, NULL);
	md_unit_writerexit(ui);
	(void) md_io_writerexit(ui);
	rw_exit(&md_unit_array_rw.lock);
	if (err) {
		if (un->un_state & RUS_DOI) {
			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_FATAL,
			    SVM_TAG_METADEVICE, setno, MD_SID(un));
		} else {
			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_FAILED,
			    SVM_TAG_METADEVICE, setno, MD_SID(un));
		}
	} else {
		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_SUCCESS,
		    SVM_TAG_METADEVICE, setno, MD_SID(un));
	}
	(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
	/*
	 * Decrement the raid resync count for cpr
	 */
	mutex_enter(&md_cpr_resync.md_resync_mutex);
	md_cpr_resync.md_raid_resync--;
	mutex_exit(&md_cpr_resync.md_resync_mutex);
	thread_exit();
	/*NOTREACHED*/
}

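/*
 * NAME:	raid_init_unit
 * DESCRIPTION: checks that a column initialization may start and then
 *		creates the background thread (raid_init_columns) that
 *		zeros the columns which are in the init state.
 * PARAMETERS:	minor_t	   mnum - RAID unit minor identifier
 *		md_error_t  *ep - pointer to error reporting structure
 *
 * LOCKS:	obtains and releases unit reader lock; obtains and releases
 *		unit writer lock on the error path
 *
 */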
static int
raid_init_unit(minor_t mnum, md_error_t *ep)
{
	mdi_unit_t	*ui;
	mr_unit_t	*un;
	int		rval, i;
	set_t		setno = MD_MIN2SET(mnum);

	ui = MDI_UNIT(mnum);
	if (md_get_setstatus(setno) & MD_SET_STALE)
		return (mdmddberror(ep, MDE_DB_STALE, mnum, setno));

	/* Don't start an init if the device is not available */
	if ((ui == NULL) || (ui->ui_tstate & MD_DEV_ERRORED)) {
		return (mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum));
	}

	if (raid_internal_open(mnum, (FREAD | FWRITE),
	    OTYP_LYR, MD_OFLG_ISINIT)) {
		rval = mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum);
		goto out;
	}

	un = md_unit_readerlock(ui);
	un->un_percent_done = 0;
	md_unit_readerexit(ui);
	/* start resync_unit thread */
	(void) thread_create(NULL, 0, raid_init_columns,
	    (void *)(uintptr_t)mnum, 0, &p0, TS_RUN, minclsyspri);

	return (0);

out:
	un = md_unit_writerlock(ui);
	MD_STATUS(un) &= ~MD_UN_GROW_PENDING;
	/* recover state */
	for (i = 0; i < un->un_totalcolumncnt; i++)
		if (COLUMN_STATE(un, i) == RCS_INIT)
			raid_set_state(un, i, RCS_ERRED, 0);
	if (un->un_state & RUS_INIT)
		un->un_state = RUS_DOI;
	raid_commit(un, NULL);
	md_unit_writerexit(ui);
	if (un->un_state & RUS_DOI) {
		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_FATAL,
		    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
	} else {
		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_FAILED,
		    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
	}
	return (rval);
}

/*
 * NAME:	regen_unit
 *
 * DESCRIPTION: regenerate all the parity on the raid device.  This is
 *		the thread started by raid_regen_unit; it reads every
 *		line of the device and, when done, updates and commits
 *		the unit state.  If an I/O error occurs, the regeneration
 *		is aborted and a failure event is generated.
 *
 * PARAMETERS:	minor_t mnum - RAID unit minor identifier
 */
static void
regen_unit(minor_t mnum)
{
	mdi_unit_t	*ui = MDI_UNIT(mnum);
	mr_unit_t	*un = MD_UNIT(mnum);
	buf_t		buf, *bp;
	caddr_t		buffer;
	int		err = 0;
	diskaddr_t	total_segments;
	diskaddr_t	line;
	size_t		iosize;

	/*
	 * Increment raid resync count for cpr
	 */
	mutex_enter(&md_cpr_resync.md_resync_mutex);
	md_cpr_resync.md_raid_resync++;
	mutex_exit(&md_cpr_resync.md_resync_mutex);

	iosize = dbtob(un->un_segsize);
	buffer = kmem_alloc(iosize, KM_SLEEP);
	bp = &buf;
	total_segments = un->un_segsincolumn;
	SE_NOTIFY(EC_SVM_STATE, ESC_SVM_REGEN_START, SVM_TAG_METADEVICE,
	    MD_UN2SET(un), MD_SID(un));
	un->un_percent_done = 0;
	init_buf(bp, B_READ | B_BUSY, iosize);

	for (line = 0; line < total_segments; line++) {
		bp->b_lblkno = line *
		    ((un->un_origcolumncnt - 1) * un->un_segsize);
		bp->b_un.b_addr = buffer;
		bp->b_bcount = iosize;
		bp->b_iodone = NULL;
		/*
		 * The following assignment is only correct because
		 * md_raid_strategy is fine when it's only a minor number
		 * and not a real dev_t.  Yuck.
		 */
		bp->b_edev = mnum;
		md_raid_strategy(bp, MD_STR_NOTTOP, NULL);
		if (biowait(bp)) {
			err = 1;
			break;
		}
		un->un_percent_done = (uint_t)((line * 1000) /
		    un->un_segsincolumn);
		/* just to avoid rounding errors */
		if (un->un_percent_done > 1000)
			un->un_percent_done = 1000;
		reset_buf(bp, B_READ | B_BUSY, iosize);
	}
	destroy_buf(bp);
	kmem_free(buffer, iosize);

	(void) md_io_writerlock(ui);
	(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
	(void) md_io_writerexit(ui);
	un = md_unit_writerlock(ui);
	if (!err &&
	    (raid_state_cnt(un, RCS_OKAY) == un->un_totalcolumncnt))
		un->un_state = RUS_OKAY;
	raid_commit(un, NULL);
	md_unit_writerexit(ui);
	if (err ||
	    raid_state_cnt(un, RCS_OKAY) != un->un_totalcolumncnt) {
		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_REGEN_FAILED,
		    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
	} else {
		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_REGEN_DONE, SVM_TAG_METADEVICE,
		    MD_UN2SET(un), MD_SID(un));
	}

	/*
	 * Decrement the raid resync count for cpr
	 */
	mutex_enter(&md_cpr_resync.md_resync_mutex);
	md_cpr_resync.md_raid_resync--;
	mutex_exit(&md_cpr_resync.md_resync_mutex);
	thread_exit();
}

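/*
 * NAME:	raid_regen_unit
 * DESCRIPTION: opens a RAID metadevice and starts the thread (regen_unit)
 *		that rewrites the parity for every line of the device; if
 *		the open fails, all columns are placed in the errored state.
 * PARAMETERS:	minor_t	   mnum - RAID unit minor identifier
 *		md_error_t  *ep - pointer to error reporting structure
 *
 * LOCKS:	obtains and releases unit writer lock on the error path
 *
 */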
static int
raid_regen_unit(minor_t mnum, md_error_t *ep)
{
	mdi_unit_t	*ui;
	mr_unit_t	*un;
	int		i;
	set_t		setno = MD_MIN2SET(mnum);

	ui = MDI_UNIT(mnum);
	un = (mr_unit_t *)MD_UNIT(mnum);

	if (md_get_setstatus(setno) & MD_SET_STALE)
		return (mdmddberror(ep, MDE_DB_STALE, mnum, setno));

	/* Don't start a regen if the device is not available */
	if ((ui == NULL) || (ui->ui_tstate & MD_DEV_ERRORED)) {
		return (mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum));
	}

	if (raid_internal_open(mnum, (FREAD | FWRITE), OTYP_LYR, 0)) {
		(void) md_unit_writerlock(ui);
		for (i = 0; i < un->un_totalcolumncnt; i++)
			raid_set_state(un, i, RCS_ERRED, 0);
		md_unit_writerexit(ui);
		return (mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum));
	}

	/* start resync_unit thread */
	(void) thread_create(NULL, 0, regen_unit,
	    (void *)(uintptr_t)mnum, 0, &p0, TS_RUN, minclsyspri);

	return (0);
}

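/*
 * NAME:	raid_regen
 * DESCRIPTION: ioctl handler that verifies the unit is not growing,
 *		initializing, or resyncing and that all columns are okay,
 *		then sets the regen state, commits the unit, and starts the
 *		parity regeneration via raid_regen_unit.
 * PARAMETERS:	md_regen_param_t *mrp - pointer to regen data structure
 *		IOLOCK	       *lock - pointer to IOCTL lock
 *
 * LOCKS:	obtains unit writer lock via IOLOCK (through raid_getun)
 *
 */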
static int
raid_regen(md_regen_param_t *mrp, IOLOCK *lock)
{
	minor_t		mnum = mrp->mnum;
	mr_unit_t	*un;

	mdclrerror(&mrp->mde);

	un = md_unit_readerlock(MDI_UNIT(mnum));

	if (MD_STATUS(un) & MD_UN_GROW_PENDING) {
		md_unit_readerexit(MDI_UNIT(mnum));
		return (mdmderror(&mrp->mde, MDE_IN_USE, mnum));
	}

	if ((MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ||
	    (raid_state_cnt(un, RCS_RESYNC))) {
		md_unit_readerexit(MDI_UNIT(mnum));
		return (mdmderror(&mrp->mde, MDE_RESYNC_ACTIVE, mnum));
	}

	if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT)) {
		md_unit_readerexit(MDI_UNIT(mnum));
		return (mdmderror(&mrp->mde, MDE_IN_USE, mnum));
	}

	if ((raid_state_cnt(un, RCS_OKAY) != un->un_totalcolumncnt) ||
	    (! (un->un_state & RUS_OKAY))) {
		md_unit_readerexit(MDI_UNIT(mnum));
		return (mdmderror(&mrp->mde, MDE_RAID_NOT_OKAY, mnum));
	}

	md_unit_readerexit(MDI_UNIT(mnum));

	/* get locks and recheck to be sure something did not change */
	if ((un = raid_getun(mnum, &mrp->mde, WRITERS, lock)) == NULL)
		return (0);

	if ((raid_state_cnt(un, RCS_OKAY) != un->un_totalcolumncnt) ||
	    (! (un->un_state & RUS_OKAY))) {
		return (mdmderror(&mrp->mde, MDE_RAID_NOT_OKAY, mnum));
	}

	raid_set_state(un, 0, RCS_REGEN, 0);
	raid_commit(un, NULL);
	md_ioctl_droplocks(lock);
	return (raid_regen_unit(mnum, &mrp->mde));
}

/*
 * NAME:	raid_set
 * DESCRIPTION: used to create a RAID metadevice
 * PARAMETERS:	md_set_params_t *d - pointer to set data structure
 *		int	      mode - must be FWRITE
 *
 * LOCKS:	none
 *
 */
static int
raid_set(void *d, int mode)
{
	minor_t		mnum;
	mr_unit_t	*un;
	mddb_recid_t	mr_recid;
	mddb_recid_t	*recids;
	mddb_type_t	typ1;
	int		err;
	set_t		setno;
	int		num_recs;
	int		rid;
	int		col;
	md_set_params_t	*msp = d;


	mnum = msp->mnum;
	setno = MD_MIN2SET(mnum);

	mdclrerror(&msp->mde);

	if (raid_getun(mnum, &msp->mde, NO_OLD, NULL) == NULL)
		return (0);

	typ1 = (mddb_type_t)md_getshared_key(setno,
	    raid_md_ops.md_driver.md_drivername);

	/* create the db record for this mdstruct */

	if (msp->options & MD_CRO_64BIT) {
#if defined(_ILP32)
		return (mdmderror(&msp->mde, MDE_UNIT_TOO_LARGE, mnum));
#else
		mr_recid = mddb_createrec(msp->size, typ1, 0,
		    MD_CRO_64BIT | MD_CRO_RAID | MD_CRO_FN, setno);
#endif
	} else {
		mr_recid = mddb_createrec(msp->size, typ1, 0,
		    MD_CRO_32BIT | MD_CRO_RAID | MD_CRO_FN, setno);
	}

	if (mr_recid < 0)
		return (mddbstatus2error(&msp->mde,
		    (int)mr_recid, mnum, setno));

	/* get the address of the mdstruct */
	un = (mr_unit_t *)mddb_getrecaddr(mr_recid);
	/*
	 * It is okay that we muck with the mdstruct here,
	 * since no one else will know about the mdstruct
	 * until we commit it.  If we crash, the record will
	 * be automatically purged, since we haven't
	 * committed it yet.
	 */

	/* copy in the user's mdstruct */
	if (err = ddi_copyin((caddr_t)(uintptr_t)msp->mdp, un,
	    msp->size, mode)) {
		mddb_deleterec_wrapper(mr_recid);
		return (EFAULT);
	}
	/* All 64 bit metadevices only support EFI labels. */
	if (msp->options & MD_CRO_64BIT) {
		un->c.un_flag |= MD_EFILABEL;
	}

	/*
	 * Allocate the real recids array.  Since we may have to commit
	 * underlying metadevice records, we need an array of size:
	 * total number of components in raid + 3 (1 for the raid itself,
	 * one for the hotspare, one for the end marker).
	 */
	num_recs = un->un_totalcolumncnt + 3;
	rid = 0;
	recids = kmem_alloc(num_recs * sizeof (mddb_recid_t), KM_SLEEP);
	recids[rid++] = mr_recid;

	MD_SID(un) = mnum;
	MD_RECID(un) = recids[0];
	MD_CAPAB(un) = MD_CAN_PARENT | MD_CAN_SP;
	MD_PARENT(un) = MD_NO_PARENT;
	un->un_resync_copysize = 0;
	un->c.un_revision |= MD_FN_META_DEV;

	if (UNIT_STATE(un) == RUS_INIT)
		MD_STATUS(un) |= MD_UN_GROW_PENDING;

	if ((UNIT_STATE(un) != RUS_INIT) && raid_check_pw(un)) {
		mddb_deleterec_wrapper(mr_recid);
		err = mderror(&msp->mde, MDE_RAID_INVALID);
		goto out;
	}

	if (err = raid_build_incore(un, 0)) {
		if (un->mr_ic) {
			kmem_free(un->un_column_ic, sizeof (mr_column_ic_t) *
			    un->un_totalcolumncnt);
			kmem_free(un->mr_ic, sizeof (*un->mr_ic));
		}

		md_nblocks_set(mnum, -1ULL);
		MD_UNIT(mnum) = NULL;

		mddb_deleterec_wrapper(mr_recid);
		goto out;
	}

	/*
	 * Update unit availability
	 */
	md_set[setno].s_un_avail--;

	recids[rid] = 0;
	if (un->un_hsp_id != -1) {
		/* increment the reference count of the hot spare pool */
		err = md_hot_spare_ifc(HSP_INCREF, un->un_hsp_id, 0, 0,
		    &recids[rid], NULL, NULL, NULL);
		if (err) {
			md_nblocks_set(mnum, -1ULL);
			MD_UNIT(mnum) = NULL;

			mddb_deleterec_wrapper(mr_recid);
			goto out;
		}
		rid++;
	}

	/*
	 * set the parent on any metadevice components.
	 * NOTE: currently soft partitions are the only metadevices
	 * which can appear within a RAID metadevice.
	 */
	for (col = 0; col < un->un_totalcolumncnt; col++) {
		mr_column_t	*mr_col = &un->un_column[col];
		md_unit_t	*comp_un;

		if (md_getmajor(mr_col->un_dev) == md_major) {
			comp_un = MD_UNIT(md_getminor(mr_col->un_dev));
			recids[rid++] = MD_RECID(comp_un);
			md_set_parent(mr_col->un_dev, MD_SID(un));
		}
	}

	/* set the end marker */
	recids[rid] = 0;

	mddb_commitrecs_wrapper(recids);
	md_create_unit_incore(mnum, &raid_md_ops, 1);

	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_CREATE, SVM_TAG_METADEVICE, setno,
	    MD_SID(un));

out:
	kmem_free(recids, (num_recs * sizeof (mddb_recid_t)));
	if (err)
		return (err);

	/* only attempt to init a device that is in the init state */
	if (UNIT_STATE(un) != RUS_INIT)
		return (0);

	return (raid_init_unit(mnum, &msp->mde));
}

/*
 * NAME:	raid_get
 * DESCRIPTION: used to get the unit structure of a RAID metadevice
 * PARAMETERS:	md_i_get_t *migp - pointer to get data structure
 *		int	    mode - must be FREAD
 *		IOLOCK	   *lock - pointer to IOCTL lock
 *
 * LOCKS:	obtains unit reader lock via IOLOCK
 *
 */
static int
raid_get(
	void		*migp,
	int		mode,
	IOLOCK		*lock
)
{
	minor_t		mnum;
	mr_unit_t	*un;
	md_i_get_t	*migph = migp;


	mnum = migph->id;

	mdclrerror(&migph->mde);

	if ((un = raid_getun(mnum, &migph->mde,
	    RD_LOCK, lock)) == NULL)
		return (0);

	if (migph->size == 0) {
		migph->size = un->c.un_size;
		return (0);
	}

	if (migph->size < un->c.un_size) {
		return (EFAULT);
	}
	if (ddi_copyout(un, (void *)(uintptr_t)migph->mdp,
	    un->c.un_size, mode))
		return (EFAULT);

	return (0);
}


/*
 * NAME:	raid_replace
 * DESCRIPTION: used to replace a component of a RAID metadevice
 * PARAMETERS:	replace_params_t *mrp - pointer to replace data structure
 *		IOLOCK		*lock - pointer to IOCTL lock
 *
 * LOCKS:	obtains unit writer lock via IOLOCK (through raid_getun),
 *		obtains and releases md_unit_array_rw write lock
 *
 */
static int
raid_replace(
	replace_params_t	*mrp,
	IOLOCK			*lock
)
{
	minor_t		mnum = mrp->mnum;
	md_dev64_t	odev = mrp->old_dev;
	md_error_t	*ep = &mrp->mde;
	mr_unit_t	*un;
	rcs_state_t	state;
	int		ix, col = -1;
	int		force = 0;
	int		err = 0;
	replace_cmd_t	cmd;
	set_t		setno;
	side_t		side;
	mdkey_t		devkey;
	int		nkeys;
	mddb_recid_t	extra_recids[3] = { 0, 0, 0 };
	int		extra_rids = 0;
	md_error_t	mde = mdnullerror;
	sv_dev_t	sv = {MD_SET_BAD, MD_SIDEWILD, MD_KEYWILD};

	mdclrerror(ep);
	setno = MD_MIN2SET(mnum);
	side = mddb_getsidenum(setno);

	un = md_unit_readerlock(MDI_UNIT(mnum));

	if ((MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ||
	    (raid_state_cnt(un, RCS_RESYNC) != 0)) {
		md_unit_readerexit(MDI_UNIT(mnum));
		return (mdmderror(ep, MDE_RESYNC_ACTIVE, mnum));
	}

	if (un->un_state & RUS_DOI) {
		md_unit_readerexit(MDI_UNIT(mnum));
		return (mdmderror(ep, MDE_RAID_DOI, mnum));
	}

	if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT) ||
	    (MD_STATUS(un) & MD_UN_GROW_PENDING)) {
		md_unit_readerexit(MDI_UNIT(mnum));
		return (mdmderror(ep, MDE_IN_USE, mnum));
	}

	md_unit_readerexit(MDI_UNIT(mnum));

	/* get locks and recheck to be sure something did not change */
	if ((un = raid_getun(mnum, ep, WRITERS, lock)) == NULL)
		return (0);

	if (md_getkeyfromdev(setno, side, odev, &devkey, &nkeys) != 0) {
		return (mddeverror(ep, MDE_NAME_SPACE, odev));
	}

	for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
		md_dev64_t tmpdevt = un->un_column[ix].un_orig_dev;
		/*
		 * Try to resolve devt again if NODEV64
		 */
		if (tmpdevt == NODEV64) {
			tmpdevt = md_resolve_bydevid(mnum, tmpdevt,
			    un->un_column[ix].un_orig_key);
			un->un_column[ix].un_orig_dev = tmpdevt;
		}

		if (un->un_column[ix].un_orig_dev == odev) {
			col = ix;
			break;
		} else {
			if (un->un_column[ix].un_orig_dev == NODEV64) {
				/*
				 * Now we use the keys to match.
				 * If no key found, continue.
				 */
				if (nkeys == 0) {
					continue;
				}
				if (un->un_column[ix].un_orig_key == devkey) {
					if (nkeys > 1)
						return (mddeverror(ep,
						    MDE_MULTNM, odev));
					col = ix;
					break;
				}
			}
		}
	}

	if (col == -1)
		return (mdcomperror(ep, MDE_CANT_FIND_COMP,
		    mnum, odev));

	if ((MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ||
	    (raid_state_cnt(un, RCS_RESYNC) != 0))
		return (mdmderror(ep, MDE_RESYNC_ACTIVE, mnum));

	if (un->un_state & RUS_DOI)
		return (mdcomperror(ep, MDE_REPL_INVAL_STATE, mnum,
		    un->un_column[col].un_dev));

	if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT) ||
	    (MD_STATUS(un) & MD_UN_GROW_PENDING))
		return (mdmderror(ep, MDE_IN_USE, mnum));

	if ((mrp->cmd == FORCE_ENABLE_COMP) || (mrp->cmd == FORCE_REPLACE_COMP))
		force = 1;
	if ((mrp->cmd == FORCE_ENABLE_COMP) || (mrp->cmd == ENABLE_COMP))
		cmd = ENABLE_COMP;
	if ((mrp->cmd == FORCE_REPLACE_COMP) || (mrp->cmd == REPLACE_COMP))
		cmd = REPLACE_COMP;

	if (un->un_state == RUS_LAST_ERRED) {
		/* Must use -f force flag for unit in LAST_ERRED state */
		if (!force)
			return (mdmderror(ep, MDE_RAID_NEED_FORCE, mnum));

		/* Must use -f force flag on ERRED column first */
		if (un->un_column[col].un_devstate != RCS_ERRED) {
			for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
				if (un->un_column[ix].un_devstate & RCS_ERRED)
					return (mdcomperror(ep,
					    MDE_RAID_COMP_ERRED, mnum,
					    un->un_column[ix].un_dev));
			}
		}

		/* must use -f force flag on LAST_ERRED columns next */
		if ((un->un_column[col].un_devstate != RCS_LAST_ERRED) &&
		    (un->un_column[col].un_devstate != RCS_ERRED))
			return (mdcomperror(ep, MDE_RAID_COMP_ERRED,
			    mnum, un->un_column[col].un_dev));
	}

	if (un->un_state == RUS_ERRED) {
		if (!(un->un_column[col].un_devstate &
		    (RCS_ERRED | RCS_INIT_ERRED)))
			return (mdcomperror(ep, MDE_RAID_COMP_ERRED,
			    mnum, un->un_column[ix].un_dev));
	}

	ASSERT(!(un->un_column[col].un_devflags & MD_RAID_ALT_ISOPEN));
	ASSERT(!(un->un_column[col].un_devflags & MD_RAID_WRITE_ALT));

	state = un->un_column[col].un_devstate;
	if (state & RCS_INIT_ERRED) {
		MD_STATUS(un) |= MD_UN_GROW_PENDING;
		un->un_percent_done = 0;
		raid_set_state(un, col, RCS_INIT, 0);
	} else if (((mrp->options & MDIOCTL_NO_RESYNC_RAID) == 0) &&
	    resync_request(mnum, col, 0, ep))
		return (mdmderror(ep, MDE_RESYNC_ACTIVE, mnum));


	if (cmd == REPLACE_COMP) {
		md_dev64_t tmpdev = mrp->new_dev;

		/*
		 * open the device by device id
		 */
		tmpdev = md_resolve_bydevid(mnum, tmpdev, mrp->new_key);
		if (md_layered_open(mnum, &tmpdev, MD_OFLG_NULL)) {
			return (mdcomperror(ep, MDE_COMP_OPEN_ERR, mnum,
			    tmpdev));
		}

		/*
		 * If it's a metadevice, make sure it gets reparented
		 */
		if (md_getmajor(tmpdev) == md_major) {
			minor_t		new_mnum = md_getminor(tmpdev);
			md_unit_t	*new_un = MD_UNIT(new_mnum);

			md_set_parent(tmpdev, MD_SID(un));
			extra_recids[extra_rids++] = MD_RECID(new_un);
		}

		mrp->new_dev = tmpdev;
		un->un_column[col].un_orig_dev = tmpdev;
		un->un_column[col].un_orig_key = mrp->new_key;
		un->un_column[col].un_orig_pwstart = mrp->start_blk;
		un->un_column[col].un_orig_devstart =
		    mrp->start_blk + un->un_pwsize;

		/*
		 * If the old device was a metadevice, make sure to
		 * reset its parent.
		 */
		if (md_getmajor(odev) == md_major) {
			minor_t		old_mnum = md_getminor(odev);
			md_unit_t	*old_un = MD_UNIT(old_mnum);

			md_reset_parent(odev);
			extra_recids[extra_rids++] =
			    MD_RECID(old_un);
		}

		if (HOTSPARED(un, col)) {
			md_layered_close(mrp->new_dev, MD_OFLG_NULL);
			un->un_column[col].un_alt_dev = mrp->new_dev;
			un->un_column[col].un_alt_pwstart = mrp->start_blk;
			un->un_column[col].un_alt_devstart =
			    mrp->start_blk + un->un_pwsize;
			un->un_column[col].un_devflags |= MD_RAID_COPY_RESYNC;
		} else {
			/*
			 * not hot spared.  Close the old device and
			 * move the new device in.
			 */
			if (un->un_column[col].un_devflags & MD_RAID_DEV_ISOPEN)
				md_layered_close(odev, MD_OFLG_NULL);
			un->un_column[col].un_devflags |= MD_RAID_DEV_ISOPEN;
			un->un_column[col].un_dev = mrp->new_dev;
			un->un_column[col].un_pwstart = mrp->start_blk;
			un->un_column[col].un_devstart =
			    mrp->start_blk + un->un_pwsize;
			if ((mrp->options & MDIOCTL_NO_RESYNC_RAID) == 0) {
				un->un_column[col].un_devflags |=
				    MD_RAID_REGEN_RESYNC;
			}
		}
		/*
		 * If the old device is not a metadevice then
		 * save off the set number and key so that it
		 * can be removed from the namespace later.
		 */
		if (md_getmajor(odev) != md_major) {
			sv.setno = setno;
			sv.key = devkey;
		}
	}

	if (cmd == ENABLE_COMP) {
		md_dev64_t	tmpdev = un->un_column[col].un_orig_dev;
		mdkey_t		raidkey = un->un_column[col].un_orig_key;

		/*
		 * We trust the dev_t because we cannot determine the
		 * dev_t from the device id, since a new disk is in the
		 * same location.  Since this is a call from metareplace -e dx
		 * AND it is SCSI, a new dev_t is not generated.  So the
		 * dev_t from the mddb is used.
		 * Before enabling the device,
		 * we check to make sure that multiple entries for the same
		 * device do not exist in the namespace.  If they do, we
		 * fail the ioctl.
		 * One of the many ways multiple entries in the namespace
		 * can occur is if one removed the failed component of a
		 * RAID metadevice and put in another disk that was part of
		 * another metadevice.  After reboot, metadevadm would
		 * correctly update the device name for the metadevice whose
		 * component has moved.  However, now in the metadb there are
		 * two entries for the same name (ctds) that belong to
		 * different metadevices.  One is valid, the other is a ghost
		 * or "last known as" ctds.
		 */
		tmpdev = md_resolve_bydevid(mnum, tmpdev, raidkey);
		if (tmpdev == NODEV64)
			tmpdev = md_getdevnum(setno, side, raidkey,
			    MD_TRUST_DEVT);
		/*
		 * check for multiple entries in namespace for the
		 * same dev
		 */

		if (md_getkeyfromdev(setno, side, tmpdev, &devkey,
		    &nkeys) != 0)
			return (mddeverror(ep, MDE_NAME_SPACE, tmpdev));
		/*
		 * If the number of keys is greater than
		 * 1, then we have an invalid
		 * namespace.  STOP and return.
		 */
		if (nkeys > 1)
			return (mddeverror(ep, MDE_MULTNM, tmpdev));
		if (devkey != raidkey)
			return (mdcomperror(ep, MDE_CANT_FIND_COMP,
			    mnum, tmpdev));

		if (un->un_column[col].un_orig_dev == NODEV64)
			un->un_column[col].un_orig_dev = tmpdev;

		if (HOTSPARED(un, col)) {
			un->un_column[col].un_alt_dev =
			    un->un_column[col].un_orig_dev;
			un->un_column[col].un_alt_pwstart =
			    un->un_column[col].un_orig_pwstart;
			un->un_column[col].un_alt_devstart =
			    un->un_column[col].un_orig_devstart;
			un->un_column[col].un_devflags |= MD_RAID_COPY_RESYNC;
		} else {
			if (!(un->un_column[col].un_devflags &
			    MD_RAID_DEV_ISOPEN)) {
				if (md_layered_open(mnum, &tmpdev,
				    MD_OFLG_NULL)) {
					un->un_column[col].un_dev = tmpdev;
					return (mdcomperror(ep,
					    MDE_COMP_OPEN_ERR, mnum, tmpdev));
				}
				ASSERT(tmpdev != NODEV64 &&
				    tmpdev != 0);

				if ((md_getmajor(tmpdev) != md_major) &&
				    (md_devid_found(setno, side, raidkey)
				    == 1)) {
					if (md_update_namespace_did(setno, side,
					    raidkey, &mde) != 0) {
						cmn_err(CE_WARN,
						    "md: could not"
						    " update namespace\n");
					}
				}
				un->un_column[col].un_dev =
				    un->un_column[col].un_orig_dev;
			}
			un->un_column[col].un_devflags |= MD_RAID_DEV_ISOPEN;
			un->un_column[col].un_devflags |= MD_RAID_REGEN_RESYNC;
		}
	}
	if (mrp->has_label) {
		un->un_column[col].un_devflags |= MD_RAID_HAS_LABEL;
	} else {
		un->un_column[col].un_devflags &= ~MD_RAID_HAS_LABEL;
	}

	raid_commit(un, extra_recids);

	/* If the component has been replaced - clean up the name space */
	if (sv.setno != MD_SET_BAD) {
		md_rem_names(&sv, 1);
	}

	md_ioctl_droplocks(lock);

	if ((cmd == ENABLE_COMP) || (cmd == FORCE_ENABLE_COMP)) {
		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ENABLE, SVM_TAG_METADEVICE,
		    setno, MD_SID(un));
	} else {
		SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REPLACE, SVM_TAG_METADEVICE,
		    setno, MD_SID(un));
	}

	if (un->un_column[col].un_devstate & RCS_INIT)
		err = raid_init_unit(mnum, ep);
	else if ((mrp->options & MDIOCTL_NO_RESYNC_RAID) == 0)
		err = raid_resync_unit(mnum, ep);

	mdclrerror(ep);
	if (!err)
		return (0);

	/* be sure state
	 * is already set by this time */
	/* fix state and commit record */
	un = md_unit_writerlock(MDI_UNIT(mnum));
	if (state & RCS_INIT_ERRED)
		raid_set_state(un, col, state, 1);
	else if (state & RCS_OKAY)
		raid_set_state(un, col, RCS_ERRED, 0);
	else
		raid_set_state(un, col, state, 1);
	raid_commit(un, NULL);
	md_unit_writerexit(MDI_UNIT(mnum));
	mdclrerror(ep);
	return (0);
}


/*
 * NAME:	raid_set_sync
 * DESCRIPTION: used to sync a component of a RAID metadevice
 * PARAMETERS:	md_resync_ioctl_t *rip - pointer to resync data structure
 *		IOLOCK		 *lock - pointer to IOCTL lock
 *
 * LOCKS:	obtains unit writer lock via IOLOCK (through raid_getun),
 *		obtains and releases md_unit_array_rw write lock
 *
 */
static int
raid_set_sync(
	md_resync_ioctl_t	*rip,
	IOLOCK			*lock
)
{
	minor_t			mnum = rip->ri_mnum;
	mr_unit_t		*un;
	int			init = 0;
	int			resync = 0;
	int			regen = 0;
	int			ix;
	int			err;

	mdclrerror(&rip->mde);

	if ((un = raid_getun(mnum, &rip->mde, WRITERS, lock)) == NULL)
		return (0);

	if (un->un_state & RUS_DOI)
		return (mdmderror(&rip->mde, MDE_RAID_DOI, mnum));

	if (un->c.un_status & MD_UN_RESYNC_ACTIVE)
		return (mdmderror(&rip->mde, MDE_RESYNC_ACTIVE, mnum));

	/* This prevents new opens */

	rip->ri_flags = 0;
	if (un->un_state & RUS_REGEN)
		regen++;

	if (raid_state_cnt(un, RCS_RESYNC))
		resync++;

	if (raid_state_cnt(un, RCS_INIT) || (un->un_state & RUS_INIT))
		init++;

	ASSERT(!(resync && init && regen));
	md_ioctl_droplocks(lock);
	rip->ri_percent_done = 0;

	if (init) {
		MD_STATUS(un) |= MD_UN_GROW_PENDING;
		return (raid_init_unit(mnum, &rip->mde));
	}

	/*
	 * If resync is needed, it will call raid_internal_open forcing
	 * replay before the open completes.
	 * Otherwise, call raid_internal_open directly to force
	 * replay to complete during boot (metasync -r).
	 * NOTE: the unit writer lock must remain held while setting
	 * MD_UN_RESYNC_ACTIVE but must be released before
	 * calling raid_resync_unit or raid_internal_open.
	 */
	if (resync) {
		ASSERT(resync < 2);
		un = md_unit_writerlock(MDI_UNIT(mnum));
		MD_STATUS(un) |= MD_UN_RESYNC_ACTIVE;
		/* Must release unit writer lock for resync */
		/*
		 * correctly setup the devices before trying to start the
		 * resync operation.
		 */
		for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
			if (un->un_column[ix].un_devstate & RCS_RESYNC) {
				if ((un->un_column[ix].un_devflags &
				    MD_RAID_COPY_RESYNC) &&
				    HOTSPARED(un, ix)) {
					un->un_column[ix].un_alt_dev =
					    un->un_column[ix].un_orig_dev;
					un->un_column[ix].un_alt_devstart =
					    un->un_column[ix].un_orig_devstart;
					un->un_column[ix].un_alt_pwstart =
					    un->un_column[ix].un_orig_pwstart;
				}
				break;
			}
		}
		ASSERT(un->un_column[ix].un_devflags &
		    (MD_RAID_COPY_RESYNC | MD_RAID_REGEN_RESYNC));
		rip->ri_percent_done = 0;
		un->un_column[ix].un_devflags |= MD_RAID_RESYNC;
		(void) resync_request(mnum, ix, 0, NULL);
		md_unit_writerexit(MDI_UNIT(mnum));
		err = raid_resync_unit(mnum, &rip->mde);
		return (err);
	}

	if (regen) {
		err = raid_regen_unit(mnum, &rip->mde);
		return (err);
	}

	/* The unit requires no work, so just force replay of the device */
	if (raid_internal_open(mnum, (FREAD | FWRITE), OTYP_LYR, 0))
		return (mdmderror(&rip->mde,
		    MDE_RAID_OPEN_FAILURE, mnum));
	(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);

	return (0);
}

/*
 * NAME:	raid_get_resync
 * DESCRIPTION: used to check resync status on a component of a RAID
 *		metadevice
 * PARAMETERS:	md_resync_ioctl_t *rip - pointer to resync data structure
 *		IOLOCK		 *lock - pointer to IOCTL lock
 *
 * LOCKS:	obtains unit reader lock via IOLOCK (through raid_getun)
 *
 */
static int
raid_get_resync(
	md_resync_ioctl_t	*rip,
	IOLOCK			*lock
)
{
	minor_t			mnum = rip->ri_mnum;
	mr_unit_t		*un;
	u_longlong_t		percent;
	int			cnt;
	int			ix;
	uint64_t		d;

	mdclrerror(&rip->mde);

	if ((un = raid_getun(mnum, &rip->mde, RD_LOCK, lock)) == NULL)
		return (0);

	rip->ri_flags = 0;
	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) {
		d = un->un_segsincolumn;
		percent = d ? ((1000 * un->un_resync_line_index) / d) : 0;
		if (percent > 1000)
			percent = 1000;	/* can't go over 100% */
		rip->ri_percent_done = (int)percent;
		rip->ri_flags |= MD_RI_INPROGRESS;
	}

	if (UNIT_STATE(un) & RUS_INIT) {
		d = un->un_segsize * un->un_segsincolumn *
		    un->un_totalcolumncnt;
		percent =
		    d ? ((1000 * (u_longlong_t)un->un_init_iocnt) / d) : 0;
		if (percent > 1000)
			percent = 1000;	/* can't go over 100% */
		rip->ri_percent_done = (int)percent;
		rip->ri_flags |= MD_GROW_INPROGRESS;
	} else if (MD_STATUS(un) & MD_UN_GROW_PENDING) {
		d = un->un_segsize * un->un_segsincolumn * un->un_init_colcnt;
		percent =
		    d ? (((u_longlong_t)un->un_init_iocnt * 1000) / d) : 0;
		if (percent > 1000)
			percent = 1000;
		rip->ri_percent_done = (int)percent;
		rip->ri_flags |= MD_GROW_INPROGRESS;
	}

	if (un->un_state & RUS_REGEN)
		rip->ri_percent_done = un->un_percent_done;

	cnt = 0;
	for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
		switch (un->un_column[ix].un_devstate) {
		case RCS_INIT:
		case RCS_ERRED:
		case RCS_LAST_ERRED:
			cnt++;
			break;
		default:
			break;
		}
	}
	d = un->un_totalcolumncnt;
	rip->ri_percent_dirty = d ?
	    (((u_longlong_t)cnt * 100) / d) : 0;
	return (0);
}

/*
 * NAME:	raid_grow
 * DESCRIPTION: Concatenate to a RAID metadevice
 * PARAMETERS:	md_grow_params_t *mgp
 *			- pointer to IOCGROW data structure
 *		int	mode - must be FWRITE
 *		IOLOCK	*lockp - IOCTL read/write and unit_array_rw lock
 *
 * LOCKS:	obtains unit writer lock via IOLOCK (through raid_getun),
 *		obtains and releases md_unit_array_rw write lock
 *
 */
static int
raid_grow(void *mgp, int mode, IOLOCK *lock)
{
	minor_t			mnum;
	mr_unit_t		*un, *new_un;
	mdi_unit_t		*ui;
	mddb_type_t		typ1;
	mddb_recid_t		mr_recid;
	mddb_recid_t		old_vtoc = 0;
	mddb_recid_t		*recids;
	md_create_rec_option_t	options;
	int			err;
	int			col, i;
	int64_t			tb, atb;
	u_longlong_t		unrev;
	int			tc;
	int			rval = 0;
	set_t			setno;
	mr_column_ic_t		*mrc;
	int			num_recs, rid;
	md_grow_params_t	*mgph = mgp;


	mnum = mgph->mnum;

	mdclrerror(&mgph->mde);

	ui = MDI_UNIT(mnum);
	un = md_unit_readerlock(ui);

	if (MD_STATUS(un) & MD_UN_GROW_PENDING) {
		md_unit_readerexit(ui);
		return (mdmderror(&mgph->mde, MDE_IN_USE, mnum));
	}

	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) {
		md_unit_readerexit(ui);
		return (mdmderror(&mgph->mde, MDE_RESYNC_ACTIVE, mnum));
	}

	if (UNIT_STATE(un) & RUS_LAST_ERRED) {
		md_unit_readerexit(ui);
		return (mdmderror(&mgph->mde, MDE_RAID_LAST_ERRED, mnum));
	}

	if (UNIT_STATE(un) & RUS_DOI) {
		md_unit_readerexit(ui);
		return (mdmderror(&mgph->mde, MDE_RAID_DOI, mnum));
	}

	if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT)) {
		md_unit_readerexit(ui);
		return (mdmderror(&mgph->mde, MDE_IN_USE, mnum));
	}

	md_unit_readerexit(ui);

	if ((un = raid_getun(mnum, &mgph->mde, WRITERS, lock)) ==
	    NULL)
		return (0);

	if (MD_STATUS(un) & MD_UN_GROW_PENDING)
		return (mdmderror(&mgph->mde, MDE_IN_USE, mnum));

	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
		return (mdmderror(&mgph->mde, MDE_RESYNC_ACTIVE, mnum));

	if (un->c.un_size >= mgph->size)
		return (EINVAL);

	if (UNIT_STATE(un) & RUS_LAST_ERRED)
		return (mdmderror(&mgph->mde, MDE_RAID_LAST_ERRED, mnum));

	if (UNIT_STATE(un) & RUS_DOI)
		return (mdmderror(&mgph->mde, MDE_RAID_DOI, mnum));

	if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT))
		return (mdmderror(&mgph->mde, MDE_IN_USE, mnum));

	setno = MD_MIN2SET(mnum);

	typ1 = (mddb_type_t)md_getshared_key(setno,
	    raid_md_ops.md_driver.md_drivername);

	/*
	 * Preserve the friendly name nature of the device that is
	 * growing.
	 */
	options = MD_CRO_RAID;
	if (un->c.un_revision & MD_FN_META_DEV)
		options |= MD_CRO_FN;
	if (mgph->options & MD_CRO_64BIT) {
#if defined(_ILP32)
		return (mdmderror(&mgph->mde, MDE_UNIT_TOO_LARGE, mnum));
#else
		mr_recid = mddb_createrec(mgph->size, typ1, 0,
		    MD_CRO_64BIT | options, setno);
#endif
	} else {
		mr_recid = mddb_createrec(mgph->size, typ1, 0,
		    MD_CRO_32BIT | options, setno);
	}
	if (mr_recid < 0) {
		rval = mddbstatus2error(&mgph->mde, (int)mr_recid,
		    mnum, setno);
		return (rval);
	}

	/* get the address of the new unit */
	new_un = (mr_unit_t *)mddb_getrecaddr(mr_recid);

	/*
	 * It is okay that we muck with the new unit here,
	 * since no one else will know about the unit struct
	 * until we commit it.  If we crash, the record will
	 * be automatically purged, since we haven't
	 * committed it yet and the old unit struct will be found.
	 */

	/* copy in the user's unit struct */
	err = ddi_copyin((void *)(uintptr_t)mgph->mdp, new_un,
	    mgph->size, mode);
	if (err) {
		mddb_deleterec_wrapper(mr_recid);
		return (EFAULT);
	}

	/* make sure columns are being added */
	if (un->un_totalcolumncnt >= new_un->un_totalcolumncnt) {
		mddb_deleterec_wrapper(mr_recid);
		return (EINVAL);
	}

	/*
	 * Save a few of the new unit struct's fields
	 * before they get clobbered.
	 */
	tc = new_un->un_totalcolumncnt;
	tb = new_un->c.un_total_blocks;
	atb = new_un->c.un_actual_tb;
	unrev = new_un->c.un_revision;

	/*
	 * Copy the old unit struct (static stuff)
	 * into new unit struct
	 */
	bcopy((caddr_t)un, (caddr_t)new_un, un->c.un_size);

	/*
	 * Restore a few of the new unit struct values.
	 */
	new_un->un_totalcolumncnt = tc;
	new_un->c.un_actual_tb = atb;
	new_un->un_grow_tb = tb;
	new_un->c.un_revision = unrev;
	new_un->c.un_record_id = mr_recid;
	new_un->c.un_size = mgph->size;

	ASSERT(new_un->mr_ic == un->mr_ic);

	/*
	 * Save old column slots
	 */
	mrc = un->un_column_ic;

	/*
	 * Allocate new column slot
	 */
	new_un->un_column_ic = (mr_column_ic_t *)
	    kmem_zalloc(sizeof (mr_column_ic_t) * new_un->un_totalcolumncnt,
	    KM_SLEEP);

	/*
	 * Restore old column slots
	 * Free the old column slots
	 */
	bcopy(mrc, new_un->un_column_ic,
	    sizeof (mr_column_ic_t) * un->un_totalcolumncnt);
	kmem_free(mrc, sizeof (mr_column_ic_t) * un->un_totalcolumncnt);

	/* All 64 bit metadevices only support EFI labels. */
	if (mgph->options & MD_CRO_64BIT) {
		new_un->c.un_flag |= MD_EFILABEL;
		/*
		 * If the device was previously smaller than a terabyte,
		 * and had a vtoc record attached to it, we remove the
		 * vtoc record, because the layout has changed completely.
		 */
		if (((un->c.un_revision & MD_64BIT_META_DEV) == 0) &&
		    (un->c.un_vtoc_id != 0)) {
			old_vtoc = un->c.un_vtoc_id;
			new_un->c.un_vtoc_id =
			    md_vtoc_to_efi_record(old_vtoc, setno);
		}
	}


	/*
	 * Allocate the real recids array.  Since we may have to commit
	 * underlying metadevice records, we need an array of size:
	 * total number of new components being attached + 2 (one for the
	 * raid itself, one for the end marker).
	 */
	num_recs = new_un->un_totalcolumncnt + 2;
	rid = 0;
	recids = kmem_alloc(num_recs * sizeof (mddb_recid_t), KM_SLEEP);
	recids[rid++] = mr_recid;

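	/*
	 * For each column being attached by the grow, build its pre-write
	 * reservation and remember any underlying metadevice record so it
	 * can be committed together with the new unit structure.
	 */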
	for (col = un->un_totalcolumncnt;
	    (col < new_un->un_totalcolumncnt); col++) {
		mr_column_t	*mr_col = &new_un->un_column[col];
		md_unit_t	*comp_un;

		if (raid_build_pw_reservation(new_un, col) != 0) {
			/* release pwslots already allocated by grow */
			for (i = un->un_totalcolumncnt; i < col; i++) {
				raid_free_pw_reservation(new_un, i);
			}
			kmem_free(new_un->un_column_ic,
			    sizeof (mr_column_ic_t) *
			    new_un->un_totalcolumncnt);
			kmem_free(new_un->mr_ic, sizeof (*un->mr_ic));
			kmem_free(recids, num_recs * sizeof (mddb_recid_t));
			mddb_deleterec_wrapper(mr_recid);
			return (EINVAL);
		}
		/*
		 * set parent on metadevices being added.
		 * NOTE: currently soft partitions are the only metadevices
		 * which can appear within a RAID metadevice.
		 */
		if (md_getmajor(mr_col->un_dev) == md_major) {
			comp_un = MD_UNIT(md_getminor(mr_col->un_dev));
			recids[rid++] = MD_RECID(comp_un);
			md_set_parent(mr_col->un_dev, MD_SID(new_un));
		}
		new_un->un_column[col].un_devflags = 0;
	}

	/* set end marker */
	recids[rid] = 0;

	/* commit new unit struct */
	mddb_commitrecs_wrapper(recids);

	/* delete old unit struct */
	mddb_deleterec_wrapper(un->c.un_record_id);

	/* place new unit in in-core array */
	md_nblocks_set(mnum, new_un->c.un_total_blocks);
	MD_UNIT(mnum) = new_un;

	/*
	 * If old_vtoc has a non zero value, we know:
	 * - This unit crossed the border from smaller to larger than one TB
	 * - There was a vtoc record for the unit,
	 * - This vtoc record is no longer needed, because
	 *   a new efi record has been created for this un.
	 */
	if (old_vtoc != 0) {
		mddb_deleterec_wrapper(old_vtoc);
	}

	/* free recids */
	kmem_free(recids, num_recs * sizeof (mddb_recid_t));

	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_GROW, SVM_TAG_METADEVICE,
	    MD_UN2SET(new_un), MD_SID(new_un));
	MD_STATUS(new_un) |= MD_UN_GROW_PENDING;

	/*
	 * Since md_ioctl_writerlock acquires the unit write lock
	 * and open/close acquires the unit reader lock, it is necessary
	 * to drop the unit write lock and then reacquire it as needed
	 * later.
	if (raid_internal_open(mnum, (FREAD | FWRITE), OTYP_LYR, 0)) {
		rval = mdmderror(&mgph->mde, MDE_RAID_OPEN_FAILURE, mnum);
		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE,
		    MD_UN2SET(new_un), MD_SID(new_un));
		return (rval);
	}
	(void) md_unit_writerlock(ui);
	for (i = 0; i < new_un->un_totalcolumncnt; i++) {
		if (new_un->un_column[i].un_devstate & RCS_OKAY)
			(void) init_pw_area(new_un, new_un->un_column[i].un_dev,
			    new_un->un_column[i].un_pwstart, i);
	}
	md_unit_writerexit(ui);
	(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
	(void) md_unit_writerlock(ui);
	/* create a background thread to initialize the columns */
	md_ioctl_droplocks(lock);

	return (raid_init_unit(mnum, &mgph->mde));
}

/*
 * NAME:	raid_reset
 * DESCRIPTION: used to reset (clear / remove) a RAID metadevice
 * PARAMETERS:	md_i_reset_t *mirp - pointer to reset data structure
 *
 * LOCKS:	obtains and releases md_unit_array_rw write lock
 *
 */
static int
raid_reset(md_i_reset_t *mirp)
{
	minor_t		mnum = mirp->mnum;
	mr_unit_t	*un;
	mdi_unit_t	*ui;
	set_t		setno = MD_MIN2SET(mnum);

	mdclrerror(&mirp->mde);

	rw_enter(&md_unit_array_rw.lock, RW_WRITER);
	/*
	 * NOTE: need to get md_unit_writerlock to avoid conflict
	 * with raid_init thread.
	 */
	if ((un = raid_getun(mnum, &mirp->mde, NO_LOCK, NULL)) ==
	    NULL) {
		rw_exit(&md_unit_array_rw.lock);
		return (0);
	}
	ui = MDI_UNIT(mnum);

	if (MD_HAS_PARENT(MD_PARENT(un))) {
		rw_exit(&md_unit_array_rw.lock);
		return (mdmderror(&mirp->mde, MDE_IN_USE, mnum));
	}

	un = (mr_unit_t *)md_unit_openclose_enter(ui);
	if (md_unit_isopen(MDI_UNIT(mnum))) {
		md_unit_openclose_exit(ui);
		rw_exit(&md_unit_array_rw.lock);
		return (mdmderror(&mirp->mde, MDE_IS_OPEN, mnum));
	}
	md_unit_openclose_exit(ui);
	if (UNIT_STATE(un) != RUS_OKAY && !mirp->force) {
		rw_exit(&md_unit_array_rw.lock);
		return (mdmderror(&mirp->mde, MDE_RAID_NEED_FORCE, mnum));
	}

	reset_raid(un, mnum, 1);

	/*
	 * Update unit availability.
	 */
	md_set[setno].s_un_avail++;

	/*
	 * If MN set, reset s_un_next so all nodes can have
	 * the same view of the next available slot when
	 * nodes are -w and -j.
	 */
	if (MD_MNSET_SETNO(setno)) {
		(void) md_upd_set_unnext(setno, MD_MIN2UNIT(mnum));
	}

	rw_exit(&md_unit_array_rw.lock);

	return (0);
}

/*
 * NAME:	raid_get_geom
 * DESCRIPTION: used to get the geometry of a RAID metadevice
 * PARAMETERS:	mr_unit_t *un - RAID unit to get the geometry for
 *		struct dk_geom *geomp - pointer to geometry data structure
 *
 * LOCKS:	none
 *
 */
static int
raid_get_geom(
	mr_unit_t	*un,
	struct dk_geom	*geomp
)
{
	md_get_geom((md_unit_t *)un, geomp);

	return (0);
}

/*
 * NAME:	raid_get_vtoc
 * DESCRIPTION: used to get the VTOC on a RAID metadevice
 * PARAMETERS:	mr_unit_t *un - RAID unit to get the VTOC from
 *		struct vtoc *vtocp - pointer to VTOC data structure
 *
 * LOCKS:	none
 *
 */
static int
raid_get_vtoc(
	mr_unit_t	*un,
	struct vtoc	*vtocp
)
{
	md_get_vtoc((md_unit_t *)un, vtocp);

	return (0);
}

/*
 * NAME:	raid_set_vtoc
 * DESCRIPTION: used to set the VTOC on a RAID metadevice
 * PARAMETERS:	mr_unit_t *un - RAID unit to set the VTOC on
 *		struct vtoc *vtocp - pointer to VTOC data structure
 *
 * LOCKS:	none
 *
 */
static int
raid_set_vtoc(
	mr_unit_t	*un,
	struct vtoc	*vtocp
)
{
	return (md_set_vtoc((md_unit_t *)un, vtocp));
}


/*
 * NAME:	raid_get_extvtoc
 * DESCRIPTION: used to get the extended VTOC on a RAID metadevice
 * PARAMETERS:	mr_unit_t *un - RAID unit to get the VTOC from
 *		struct extvtoc *vtocp - pointer to extended VTOC data structure
 *
 * LOCKS:	none
 *
 */
static int
raid_get_extvtoc(
	mr_unit_t	*un,
	struct extvtoc	*vtocp
)
{
	md_get_extvtoc((md_unit_t *)un, vtocp);

	return (0);
}

/*
 * NAME:	raid_set_extvtoc
 * DESCRIPTION: used to set the extended VTOC on a RAID metadevice
 * PARAMETERS:	mr_unit_t *un - RAID unit to set the VTOC on
 *		struct extvtoc *vtocp - pointer to extended VTOC data structure
 *
 * LOCKS:	none
 *
 */
static int
raid_set_extvtoc(
	mr_unit_t	*un,
	struct extvtoc	*vtocp
)
{
	return (md_set_extvtoc((md_unit_t *)un, vtocp));
}



/*
 * NAME:	raid_get_cgapart
 * DESCRIPTION: used to get the dk_map on a RAID metadevice
 * PARAMETERS:	mr_unit_t *un - RAID unit to get the dk_map from
 *		struct dk_map *dkmapp - pointer to dk_map data structure
 *
 * LOCKS:	none
 *
 */

static int
raid_get_cgapart(
	mr_unit_t	*un,
	struct dk_map	*dkmapp
)
{
	md_get_cgapart((md_unit_t *)un, dkmapp);
	return (0);
}

/*
 * NAME:	raid_getdevs
 * DESCRIPTION: return all devices within a RAID metadevice
 * PARAMETERS:	md_getdevs_params_t *mgdp
 *			- pointer to getdevs IOCTL data structure
 *		int mode - should be FREAD
 *		IOLOCK *lockp - IOCTL read/write lock
 *
 * LOCKS:	obtains unit reader lock via IOLOCK
 *
 */
static int
raid_getdevs(
	void		*mgdp,
	int		mode,
	IOLOCK		*lock
)
{
	minor_t			mnum;
	mr_unit_t		*un;
	md_dev64_t		*udevs;
	int			i, cnt;
	md_dev64_t		unit_dev;
	md_getdevs_params_t	*mgdph = mgdp;


	mnum = mgdph->mnum;

	/* check out unit */
	mdclrerror(&mgdph->mde);

	if ((un = raid_getun(mnum, &mgdph->mde, RD_LOCK, lock)) == NULL)
		return (0);

	udevs = (md_dev64_t *)(uintptr_t)mgdph->devs;

	for (cnt = 0, i = 0; i < un->un_totalcolumncnt; i++, cnt++) {
		if (cnt < mgdph->cnt) {
			unit_dev = un->un_column[i].un_orig_dev;
			if (md_getmajor(unit_dev) != md_major) {
				if ((unit_dev = md_xlate_mini_2_targ
				    (unit_dev)) == NODEV64)
					return (ENODEV);
			}

			if (ddi_copyout((caddr_t)&unit_dev,
			    (caddr_t)&udevs[cnt], sizeof (*udevs), mode) != 0)
				return (EFAULT);
		}
		if (HOTSPARED(un, i)) {
			cnt++;
			if (cnt >= mgdph->cnt)
				continue;

			unit_dev = un->un_column[i].un_dev;
			if (md_getmajor(unit_dev) != md_major) {
				if ((unit_dev = md_xlate_mini_2_targ
				    (unit_dev)) == NODEV64)
					return (ENODEV);
			}

			if (ddi_copyout((caddr_t)&unit_dev,
			    (caddr_t)&udevs[cnt], sizeof (*udevs), mode) != 0)
				return (EFAULT);
		}
	}
	mgdph->cnt = cnt;
	return (0);
}

/*
 * NAME:	raid_change
 * DESCRIPTION: used to change the following dynamic values:
 *			the hot spare pool
 *		in the unit structure of a RAID metadevice
 * PARAMETERS:	md_raid_params_t *mrp - pointer to change data structure
 *		IOLOCK *lock - pointer to IOCTL lock
 *
 * LOCKS:	obtains unit writer lock via IOLOCK (through raid_getun)
 *
 */
static int
raid_change(
	md_raid_params_t	*mrp,
	IOLOCK			*lock
)
{
	minor_t		mnum = mrp->mnum;
	mr_unit_t	*un;
	int		ix;
	mddb_recid_t	recids[3] = {0, 0, 0};
	int		err;
	int		irecid;
	int		inc_new_hsp = 0;

	mdclrerror(&mrp->mde);

	if ((un = raid_getun(mnum, &mrp->mde, WR_LOCK, lock)) == NULL)
		return (0);

	if (!mrp->params.change_hsp_id)
		return (0);

	/* verify that no hotspare is in use */
	for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
		if (HOTSPARED(un, ix)) {
			return (mdmderror(&mrp->mde, MDE_HS_IN_USE, mnum));
		}
	}

	/* replace the hot spare pool */

	irecid = 0;
	if (mrp->params.hsp_id != -1) {
		/* increment the reference count of the new hsp */
		err = md_hot_spare_ifc(HSP_INCREF, mrp->params.hsp_id, 0, 0,
		    &recids[0], NULL, NULL, NULL);
		if (err) {
			return (mdhsperror(&mrp->mde, MDE_INVAL_HSP,
			    mrp->params.hsp_id));
		}
		inc_new_hsp = 1;
		irecid++;
	}

	if (un->un_hsp_id != -1) {
		/* decrement the reference count of the old hsp */
		err = md_hot_spare_ifc(HSP_DECREF, un->un_hsp_id, 0, 0,
		    &recids[irecid], NULL, NULL, NULL);
		if (err) {
			err = mdhsperror(&mrp->mde, MDE_INVAL_HSP,
			    mrp->params.hsp_id);
			if (inc_new_hsp) {
				(void) md_hot_spare_ifc(HSP_DECREF,
				    mrp->params.hsp_id, 0, 0,
				    &recids[0], NULL, NULL, NULL);
				/*
				 * Don't need to commit the record,
				 * because it wasn't committed before.
				 */
			}
			return (err);
		}
	}

	un->un_hsp_id = mrp->params.hsp_id;

	raid_commit(un, recids);
	SE_NOTIFY(EC_SVM_STATE, ESC_SVM_CHANGE, SVM_TAG_METADEVICE,
	    MD_UN2SET(un), MD_SID(un));

	/* Now trigger hot spare processing in case one is needed. */
	if ((un->un_hsp_id != -1) && (un->un_state == RUS_ERRED))
		(void) raid_hotspares();

	return (0);
}

/*
 * NAME:	raid_admin_ioctl
 * DESCRIPTION: IOCTL operations unique to metadevices and RAID
 * PARAMETERS:	int cmd - IOCTL command to be executed
 *		void *data - pointer to IOCTL data structure
 *		int mode - either FREAD or FWRITE
 *		IOLOCK *lockp - IOCTL read/write lock
 *
 * LOCKS:	none
 *
 */
static int
raid_admin_ioctl(
	int	cmd,
	void	*data,
	int	mode,
	IOLOCK	*lockp
)
{
	size_t	sz = 0;
	void	*d = NULL;
	int	err = 0;

	/* We can only handle 32-bit clients for internal commands */
	if ((mode & DATAMODEL_MASK) != DATAMODEL_ILP32) {
		return (EINVAL);
	}


	/* dispatch ioctl */
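	/*
	 * Each case below follows the same pattern: verify the access
	 * mode, allocate a kernel buffer of the appropriate size, copy
	 * the user's arguments in, and dispatch to the matching handler.
	 * The common tail of this function copies the (possibly updated)
	 * arguments back out and frees the buffer.
	 */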
	switch (cmd) {

	case MD_IOCSET:
	{
		if (! (mode & FWRITE))
			return (EACCES);

		sz = sizeof (md_set_params_t);
		d = kmem_alloc(sz, KM_SLEEP);

		if (ddi_copyin(data, d, sz, mode)) {
			err = EFAULT;
			break;
		}

		err = raid_set(d, mode);
		break;
	}

	case MD_IOCGET:
	{
		if (! (mode & FREAD))
			return (EACCES);

		sz = sizeof (md_i_get_t);
		d = kmem_alloc(sz, KM_SLEEP);

		if (ddi_copyin(data, d, sz, mode)) {
			err = EFAULT;
			break;
		}

		err = raid_get(d, mode, lockp);
		break;
	}

	case MD_IOCREPLACE:
	{
		if (! (mode & FWRITE))
			return (EACCES);

		sz = sizeof (replace_params_t);
		d = kmem_alloc(sz, KM_SLEEP);

		if (ddi_copyin(data, d, sz, mode)) {
			err = EFAULT;
			break;
		}

		err = raid_replace((replace_params_t *)d, lockp);
		break;
	}

	case MD_IOCSETSYNC:
	{
		if (! (mode & FWRITE))
			return (EACCES);

		sz = sizeof (md_resync_ioctl_t);
		d = kmem_alloc(sz, KM_SLEEP);

		if (ddi_copyin(data, d, sz, mode)) {
			err = EFAULT;
			break;
		}

		err = raid_set_sync((md_resync_ioctl_t *)d, lockp);
		break;
	}

	case MD_IOCGETSYNC:
	{
		if (! (mode & FREAD))
			return (EACCES);

		sz = sizeof (md_resync_ioctl_t);
		d = kmem_alloc(sz, KM_SLEEP);

		if (ddi_copyin(data, d, sz, mode)) {
			err = EFAULT;
			break;
		}
		err = raid_get_resync((md_resync_ioctl_t *)d, lockp);

		break;
	}

	case MD_IOCGROW:
	{
		if (! (mode & FWRITE))
			return (EACCES);

		sz = sizeof (md_grow_params_t);
		d = kmem_alloc(sz, KM_SLEEP);

		if (ddi_copyin(data, d, sz, mode)) {
			err = EFAULT;
			break;
		}

		err = raid_grow(d, mode, lockp);
		break;
	}

	case MD_IOCCHANGE:
	{
		if (! (mode & FWRITE))
			return (EACCES);

		sz = sizeof (md_raid_params_t);
		d = kmem_alloc(sz, KM_SLEEP);

		if (ddi_copyin(data, d, sz, mode)) {
			err = EFAULT;
			break;
		}

		err = raid_change((md_raid_params_t *)d, lockp);
		break;
	}

	case MD_IOCRESET:
	{
		if (! (mode & FWRITE))
			return (EACCES);

		sz = sizeof (md_i_reset_t);
		d = kmem_alloc(sz, KM_SLEEP);

		if (ddi_copyin(data, d, sz, mode)) {
			err = EFAULT;
			break;
		}

		err = raid_reset((md_i_reset_t *)d);
		break;
	}

	case MD_IOCGET_DEVS:
	{
		if (! (mode & FREAD))
			return (EACCES);

		sz = sizeof (md_getdevs_params_t);
		d = kmem_alloc(sz, KM_SLEEP);

		if (ddi_copyin(data, d, sz, mode)) {
			err = EFAULT;
			break;
		}

		err = raid_getdevs(d, mode, lockp);
		break;
	}

	case MD_IOCSETREGEN:
	{
		if (! (mode & FWRITE))
			return (EACCES);

		sz = sizeof (md_regen_param_t);
		d = kmem_alloc(sz, KM_SLEEP);

		if (ddi_copyin(data, d, sz, mode)) {
			err = EFAULT;
			break;
		}

		err = raid_regen((md_regen_param_t *)d, lockp);
		break;
	}

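	/*
	 * MD_IOCPROBE_DEV differs from the cases above: it copies in a
	 * list of unit minor numbers, queues probe requests to the
	 * md_ff_daemonq daemon queue, drops the ioctl lock while waiting
	 * on the probe semaphore (once per unit), and then reacquires
	 * the lock before cleaning up.
	 */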
	case MD_IOCPROBE_DEV:
	{
		md_probedev_impl_t	*p = NULL;
		md_probedev_t		*ph = NULL;
		daemon_queue_t		*hdr = NULL;
		int			i;
		size_t			sz1 = 0;


		if (! (mode & FREAD))
			return (EACCES);

		sz = sizeof (md_probedev_t);

		d = kmem_alloc(sz, KM_SLEEP);

		/* now copy in the data */
		if (ddi_copyin(data, d, sz, mode)) {
			err = EFAULT;
			goto free_mem;
		}

		/*
		 * Sanity test the args.  The test name should contain
		 * the keyword "probe".
		 */
		p = kmem_alloc(sizeof (md_probedev_impl_t), KM_SLEEP);
		p->probe_sema = NULL;
		p->probe_mx = NULL;
		p->probe.mnum_list = (uint64_t)NULL;

		ph = (md_probedev_t *)d;
		p->probe.nmdevs = ph->nmdevs;
		(void) strcpy(p->probe.test_name, ph->test_name);
		bcopy(&ph->md_driver, &(p->probe.md_driver),
		    sizeof (md_driver_t));

		if ((p->probe.nmdevs < 1) ||
		    (strstr(p->probe.test_name, "probe") == NULL)) {
			err = EINVAL;
			goto free_mem;
		}

		sz1 = sizeof (minor_t) * p->probe.nmdevs;

		p->probe.mnum_list = (uint64_t)(uintptr_t)kmem_alloc(sz1,
		    KM_SLEEP);

		if (ddi_copyin((caddr_t)(uintptr_t)ph->mnum_list,
		    (caddr_t)(uintptr_t)p->probe.mnum_list, sz1, mode)) {
			err = EFAULT;
			goto free_mem;
		}

		if (err = md_init_probereq(p, &hdr))
			goto free_mem;

		/*
		 * Put the request on the queue and wait.
		 */

		daemon_request_new(&md_ff_daemonq, md_probe_one, hdr, REQ_NEW);

		(void) IOLOCK_RETURN(0, lockp);
		/* wait for the events to occur */
		for (i = 0; i < p->probe.nmdevs; i++) {
			sema_p(PROBE_SEMA(p));
		}
		while (md_ioctl_lock_enter() == EINTR)
			;

		/*
		 * Clean up.  The hdr list is freed in the probe routines,
		 * since the list is NULL by the time we get here.
		 */
free_mem:
		if (p) {
			if (p->probe_sema != NULL) {
				sema_destroy(PROBE_SEMA(p));
				kmem_free(p->probe_sema, sizeof (ksema_t));
			}
			if (p->probe_mx != NULL) {
				mutex_destroy(PROBE_MX(p));
				kmem_free(p->probe_mx, sizeof (kmutex_t));
			}
			if (p->probe.mnum_list)
				kmem_free((caddr_t)(uintptr_t)
				    p->probe.mnum_list, sz1);

			kmem_free(p, sizeof (md_probedev_impl_t));
		}
		break;
	}

	default:
		return (ENOTTY);
	}

	/*
	 * copyout and free any args
	 */
	if (sz != 0) {
		if (err == 0) {
			if (ddi_copyout(d, data, sz, mode) != 0) {
				err = EFAULT;
			}
		}
		kmem_free(d, sz);
	}
	return (err);
}

/*
 * NAME:	md_raid_ioctl
 * DESCRIPTION: RAID metadevice IOCTL operations entry point.
 * PARAMETERS:	dev_t dev - RAID device identifier
 *		int cmd - IOCTL command to be executed
 *		void *data - pointer to IOCTL data structure
 *		int mode - either FREAD or FWRITE
 *		IOLOCK *lockp - IOCTL read/write lock
 *
 * LOCKS:	none
 *
 */
int
md_raid_ioctl(
	dev_t		dev,
	int		cmd,
	void		*data,
	int		mode,
	IOLOCK		*lockp
)
{
	minor_t		mnum = getminor(dev);
	mr_unit_t	*un;
	int		err = 0;

	/* handle admin ioctls */
	if (mnum == MD_ADM_MINOR)
		return (raid_admin_ioctl(cmd, data, mode, lockp));

	/* check unit */
	if ((MD_MIN2SET(mnum) >= md_nsets) ||
	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
	    ((un = MD_UNIT(mnum)) == NULL))
		return (ENXIO);

	/* is this a supported ioctl? */
	err = md_check_ioctl_against_unit(cmd, un->c);
	if (err != 0) {
		return (err);
	}

	/* dispatch ioctl */
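	/*
	 * The remaining commands are standard disk ioctls (DKIO*):
	 * driver info, geometry, VTOC and extended VTOC, partition map
	 * and EFI label access, all serviced directly against this
	 * RAID unit.
	 */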
	switch (cmd) {

	case DKIOCINFO:
	{
		struct dk_cinfo	*p;

		if (! (mode & FREAD))
			return (EACCES);

		p = kmem_alloc(sizeof (*p), KM_SLEEP);

		get_info(p, mnum);
		if (ddi_copyout((caddr_t)p, data, sizeof (*p), mode) != 0)
			err = EFAULT;

		kmem_free(p, sizeof (*p));
		return (err);
	}

	case DKIOCGMEDIAINFO:
	{
		struct dk_minfo	p;

		if (! (mode & FREAD))
			return (EACCES);

		get_minfo(&p, mnum);
		if (ddi_copyout(&p, data, sizeof (struct dk_minfo), mode) != 0)
			err = EFAULT;

		return (err);
	}

	case DKIOCGGEOM:
	{
		struct dk_geom	*p;

		if (! (mode & FREAD))
			return (EACCES);

		p = kmem_alloc(sizeof (*p), KM_SLEEP);

		if ((err = raid_get_geom(un, p)) == 0) {
			if (ddi_copyout((caddr_t)p, data, sizeof (*p),
			    mode) != 0)
				err = EFAULT;
		}

		kmem_free(p, sizeof (*p));
		return (err);
	}

	case DKIOCGVTOC:
	{
		struct vtoc	*vtoc;

		if (! (mode & FREAD))
			return (EACCES);

		vtoc = kmem_zalloc(sizeof (*vtoc), KM_SLEEP);
		if ((err = raid_get_vtoc(un, vtoc)) != 0) {
			kmem_free(vtoc, sizeof (*vtoc));
			return (err);
		}

		if ((mode & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
			if (ddi_copyout(vtoc, data, sizeof (*vtoc), mode))
				err = EFAULT;
		}
#ifdef _SYSCALL32
		else {
			struct vtoc32	*vtoc32;

			vtoc32 = kmem_zalloc(sizeof (*vtoc32), KM_SLEEP);

			vtoctovtoc32((*vtoc), (*vtoc32));
			if (ddi_copyout(vtoc32, data, sizeof (*vtoc32), mode))
				err = EFAULT;
			kmem_free(vtoc32, sizeof (*vtoc32));
		}
#endif /* _SYSCALL32 */

		kmem_free(vtoc, sizeof (*vtoc));
		return (err);
	}

	case DKIOCSVTOC:
	{
		struct vtoc	*vtoc;

		if (! (mode & FWRITE))
			return (EACCES);

		vtoc = kmem_zalloc(sizeof (*vtoc), KM_SLEEP);
		if ((mode & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
			if (ddi_copyin(data, vtoc, sizeof (*vtoc), mode)) {
				err = EFAULT;
			}
		}
#ifdef _SYSCALL32
		else {
			struct vtoc32	*vtoc32;

			vtoc32 = kmem_zalloc(sizeof (*vtoc32), KM_SLEEP);

			if (ddi_copyin(data, vtoc32, sizeof (*vtoc32), mode)) {
				err = EFAULT;
			} else {
				vtoc32tovtoc((*vtoc32), (*vtoc));
			}
			kmem_free(vtoc32, sizeof (*vtoc32));
		}
#endif /* _SYSCALL32 */

		if (err == 0)
			err = raid_set_vtoc(un, vtoc);

		kmem_free(vtoc, sizeof (*vtoc));
		return (err);
	}

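	/*
	 * Unlike DKIOCGVTOC/DKIOCSVTOC above, the extended-VTOC ioctls
	 * copy the structure directly with no ILP32 conversion step.
	 */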
	case DKIOCGEXTVTOC:
	{
		struct extvtoc	*extvtoc;

		if (! (mode & FREAD))
			return (EACCES);

		extvtoc = kmem_zalloc(sizeof (*extvtoc), KM_SLEEP);
		if ((err = raid_get_extvtoc(un, extvtoc)) != 0) {
			kmem_free(extvtoc, sizeof (*extvtoc));
			return (err);
		}

		if (ddi_copyout(extvtoc, data, sizeof (*extvtoc), mode))
			err = EFAULT;

		kmem_free(extvtoc, sizeof (*extvtoc));
		return (err);
	}

	case DKIOCSEXTVTOC:
	{
		struct extvtoc	*extvtoc;

		if (! (mode & FWRITE))
			return (EACCES);

		extvtoc = kmem_zalloc(sizeof (*extvtoc), KM_SLEEP);
		if (ddi_copyin(data, extvtoc, sizeof (*extvtoc), mode)) {
			err = EFAULT;
		}

		if (err == 0)
			err = raid_set_extvtoc(un, extvtoc);

		kmem_free(extvtoc, sizeof (*extvtoc));
		return (err);
	}

	case DKIOCGAPART:
	{
		struct dk_map	dmp;

		if ((err = raid_get_cgapart(un, &dmp)) != 0) {
			return (err);
		}

		if ((mode & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
			if (ddi_copyout((caddr_t)&dmp, data, sizeof (dmp),
			    mode) != 0)
				err = EFAULT;
		}
#ifdef _SYSCALL32
		else {
			struct dk_map32	dmp32;

			dmp32.dkl_cylno = dmp.dkl_cylno;
			dmp32.dkl_nblk = dmp.dkl_nblk;

			if (ddi_copyout((caddr_t)&dmp32, data, sizeof (dmp32),
			    mode) != 0)
				err = EFAULT;
		}
#endif /* _SYSCALL32 */

		return (err);
	}
	case DKIOCGETEFI:
	{
		/*
		 * This one can be done centralized,
		 * no need to put in the same code for all types of metadevices
		 */
		return (md_dkiocgetefi(mnum, data, mode));
	}

	case DKIOCSETEFI:
	{
		/*
		 * This one can be done centralized,
		 * no need to put in the same code for all types of metadevices
		 */
		return (md_dkiocsetefi(mnum, data, mode));
	}

	case DKIOCPARTITION:
	{
		return (md_dkiocpartition(mnum, data, mode));
	}

	default:
		return (ENOTTY);
	}
}

/*
 * rename/exchange named service entry points and support functions follow.
 * Most functions are handled generically, except for raid-specific locking
 * and checking.
 */

/*
 * NAME:	raid_may_renexch_self
 * DESCRIPTION: support routine for rename check ("MDRNM_CHECK") named service
 * PARAMETERS:	mr_unit_t *un - unit struct of raid unit to be renamed
 *		mdi_unit_t *ui - in-core unit struct of same raid unit
 *		md_rentxn_t *rtxnp - rename transaction state
 *
 * LOCKS:	none
 *
 */
static int
raid_may_renexch_self(
	mr_unit_t	*un,
	mdi_unit_t	*ui,
	md_rentxn_t	*rtxnp)
{
	minor_t	from_min;
	minor_t	to_min;
	bool_t	toplevel;
	bool_t	related;

	from_min = rtxnp->from.mnum;
	to_min = rtxnp->to.mnum;

	if (!un || !ui) {
		(void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR,
		    from_min);
		return (EINVAL);
	}

	ASSERT(!(MD_CAPAB(un) & MD_CAN_META_CHILD));
	if (MD_CAPAB(un) & MD_CAN_META_CHILD) {
		(void) mdmderror(&rtxnp->mde, MDE_RENAME_SOURCE_BAD, from_min);
		return (EINVAL);
	}

	if (MD_PARENT(un) == MD_MULTI_PARENT) {
		(void) mdmderror(&rtxnp->mde, MDE_RENAME_SOURCE_BAD, from_min);
		return (EINVAL);
	}

	toplevel = !MD_HAS_PARENT(MD_PARENT(un));

	/* we're related if trying to swap with our parent */
	related = (!toplevel) && (MD_PARENT(un) == to_min);

	switch (rtxnp->op) {
	case MDRNOP_EXCHANGE:

		if (!related) {
			(void) mdmderror(&rtxnp->mde,
			    MDE_RENAME_TARGET_UNRELATED, to_min);
			return (EINVAL);
		}

		break;

	case MDRNOP_RENAME:
		/*
		 * If "from" is top-level and is open, then the kernel is
		 * using the md_dev64_t.
		 */

		if (toplevel && md_unit_isopen(ui)) {
			(void) mdmderror(&rtxnp->mde, MDE_RENAME_BUSY,
			    from_min);
			return (EBUSY);
		}
		break;

	default:
		(void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR,
		    from_min);
		return (EINVAL);
	}

	return (0);	/* ok */
}

/*
 * NAME:	raid_rename_check
 * DESCRIPTION: ("MDRNM_CHECK") rename/exchange named service entry point
 * PARAMETERS:	md_rendelta_t *delta - describes changes to be made to this
 *				raid device for rename transaction
 *		md_rentxn_t *rtxnp - rename transaction state
 *
 * LOCKS:	none
 *
 */
intptr_t
raid_rename_check(
	md_rendelta_t	*delta,
	md_rentxn_t	*rtxnp)
{
	int		err = 0;
	int		column;
	mr_unit_t	*un;

	ASSERT(delta);
	ASSERT(rtxnp);
	ASSERT(delta->unp);
	ASSERT(delta->uip);

	if (!delta || !rtxnp || !delta->unp || !delta->uip) {
		(void) mdsyserror(&rtxnp->mde, EINVAL);
		return (EINVAL);
	}

	un = (mr_unit_t *)delta->unp;

	for (column = 0; column < un->un_totalcolumncnt; column++) {
		rcs_state_t	colstate;

		colstate = un->un_column[column].un_devstate;

		if (colstate & RCS_LAST_ERRED) {
			(void) mdmderror(&rtxnp->mde, MDE_RAID_LAST_ERRED,
			    md_getminor(delta->dev));
			return (EINVAL);
		}

		if (colstate & RCS_INIT_ERRED) {
			(void) mdmderror(&rtxnp->mde, MDE_RAID_DOI,
			    md_getminor(delta->dev));
			return (EINVAL);
		}

		/* How did we get this far before detecting this? */
		if (colstate & RCS_RESYNC) {
			(void) mdmderror(&rtxnp->mde, MDE_RENAME_BUSY,
			    md_getminor(delta->dev));
			return (EBUSY);
		}

		if (colstate & RCS_ERRED) {
			(void) mdmderror(&rtxnp->mde, MDE_RAID_NOT_OKAY,
			    md_getminor(delta->dev));
			return (EINVAL);
		}

		if (!(colstate & RCS_OKAY)) {
			(void) mdmderror(&rtxnp->mde, MDE_RAID_NOT_OKAY,
			    md_getminor(delta->dev));
			return (EINVAL);
		}

		if (HOTSPARED(un, column)) {
			(void) mdmderror(&rtxnp->mde, MDE_RAID_NOT_OKAY,
			    md_getminor(delta->dev));
			return (EINVAL);
		}
	}

	/* self does additional checks */
	if (delta->old_role == MDRR_SELF) {
		err = raid_may_renexch_self((mr_unit_t *)delta->unp,
		    delta->uip, rtxnp);
	}
	return (err);
}

/*
 * NAME:	raid_rename_lock
 * DESCRIPTION: ("MDRNM_LOCK") rename/exchange named service entry point
 * PARAMETERS:	md_rendelta_t *delta - describes changes to be made to this
 *				raid device for rename transaction
 *		md_rentxn_t *rtxnp - rename transaction state
 *
 * LOCKS:	io and unit locks (taken explicitly *not* via ioctl wrappers)
 *
 */
intptr_t
raid_rename_lock(
	md_rendelta_t	*delta,
	md_rentxn_t	*rtxnp)
{
	minor_t	mnum;

	ASSERT(delta);
	ASSERT(rtxnp);

	mnum = md_getminor(delta->dev);
	if (mnum == rtxnp->to.mnum && rtxnp->op == MDRNOP_RENAME) {
		return (0);
	}

	ASSERT(delta->uip);
	if (!delta->uip) {
		(void) mdmderror(&rtxnp->mde, MDE_UNIT_NOT_SETUP, mnum);
		return (ENODEV);
	}

	ASSERT(delta->unp);
	if (!delta->unp) {

		return (ENODEV);
	}

	ASSERT(!IO_WRITER_HELD(delta->unp));
	(void) md_io_writerlock(delta->uip);
	ASSERT(IO_WRITER_HELD(delta->unp));


	ASSERT(!UNIT_WRITER_HELD(delta->unp));
	(void) md_unit_writerlock(delta->uip);
	ASSERT(UNIT_WRITER_HELD(delta->unp));

	return (0);
}

/*
 * NAME:	raid_rename_unlock
 * DESCRIPTION: ("MDRNM_UNLOCK") rename/exchange named service entry point
 * PARAMETERS:	md_rendelta_t *delta - describes changes to be made to this
 *				raid device for rename transaction
 *		md_rentxn_t *rtxnp - rename transaction state
 *
 * LOCKS:	drops io and unit locks
 *
 */
/* ARGSUSED */
void
raid_rename_unlock(
	md_rendelta_t	*delta,
	md_rentxn_t	*rtxnp)
{
	mr_unit_t	*un = (mr_unit_t *)delta->unp;
	minor_t		mnum = MD_SID(un);
	int		col;

	ASSERT(delta);
	ASSERT(delta->unp);
	ASSERT(delta->uip);

	ASSERT(UNIT_WRITER_HELD(delta->unp));
	md_unit_writerexit(delta->uip);
	ASSERT(!UNIT_WRITER_HELD(delta->unp));

	if (! (delta->txn_stat.role_swapped) || ! (delta->txn_stat.is_open)) {
		goto out;
	}
	if (raid_internal_open(mnum, (FREAD | FWRITE),
	    OTYP_LYR, MD_OFLG_ISINIT) == 0) {
		for (col = 0; col < un->un_totalcolumncnt; col++) {
			if (un->un_column[col].un_devstate & RCS_OKAY)
				(void) init_pw_area(un,
				    un->un_column[col].un_dev,
				    un->un_column[col].un_pwstart, col);
		}
		(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
	}

out:
	ASSERT(IO_WRITER_HELD(delta->unp));
	md_io_writerexit(delta->uip);
	ASSERT(!IO_WRITER_HELD(delta->unp));
}
/* end of rename/exchange named service and support functions */