1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26#pragma ident "%Z%%M% %I% %E% SMI" 27 28/* 29 * Metadevice diskset interfaces 30 */ 31 32#include <meta.h> 33#include <mdmn_changelog.h> 34#include "meta_set_prv.h" 35#include "meta_repartition.h" 36 37static int 38check_setnodes_againstdrivelist( 39 mdsetname_t *sp, 40 mddrivenamelist_t *dnlp, 41 md_error_t *ep 42) 43{ 44 md_set_desc *sd; 45 mddrivenamelist_t *p; 46 int i; 47 md_mnnode_desc *nd; 48 49 if ((sd = metaget_setdesc(sp, ep)) == NULL) 50 return (-1); 51 52 if (MD_MNSET_DESC(sd)) { 53 nd = sd->sd_nodelist; 54 while (nd) { 55 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 56 nd = nd->nd_next; 57 continue; 58 } 59 for (p = dnlp; p != NULL; p = p->next) 60 if (checkdrive_onnode(sp, p->drivenamep, 61 nd->nd_nodename, ep)) 62 return (-1); 63 nd = nd->nd_next; 64 } 65 } else { 66 for (i = 0; i < MD_MAXSIDES; i++) { 67 /* Skip empty slots */ 68 if (sd->sd_nodes[i][0] == '\0') 69 continue; 70 71 for (p = dnlp; p != NULL; p = p->next) 72 if (checkdrive_onnode(sp, p->drivenamep, 73 sd->sd_nodes[i], ep)) 74 return (-1); 
75 } 76 } 77 return (0); 78} 79 80static int 81drvsuniq(mdsetname_t *sp, mddrivenamelist_t *dnlp, md_error_t *ep) 82{ 83 mddrivenamelist_t *dl1, *dl2; 84 mddrivename_t *dn1, *dn2; 85 86 for (dl1 = dnlp; dl1 != NULL; dl1 = dl1->next) { 87 dn1 = dl1->drivenamep; 88 89 for (dl2 = dl1->next; dl2 != NULL; dl2 = dl2->next) { 90 dn2 = dl2->drivenamep; 91 if (strcmp(dn1->cname, dn2->cname) != 0) 92 continue; 93 94 return (mddserror(ep, MDE_DS_DUPDRIVE, sp->setno, 95 NULL, dn1->cname, sp->setname)); 96 } 97 } 98 return (0); 99} 100 101static md_drive_desc * 102metaget_drivedesc_fromdrivelist( 103 mdsetname_t *sp, 104 mddrivenamelist_t *dnlp, 105 uint_t flags, 106 md_error_t *ep 107) 108{ 109 mddrivenamelist_t *p; 110 md_drive_desc *dd = NULL; 111 md_set_desc *sd; 112 113 if ((sd = metaget_setdesc(sp, ep)) == NULL) 114 return (NULL); 115 116 for (p = dnlp; p != NULL; p = p->next) { 117 (void) metadrivedesc_append(&dd, p->drivenamep, 0, 0, 118 sd->sd_ctime, sd->sd_genid, flags); 119 } 120 121 return (dd); 122} 123 124/* 125 * Exported Entry Points 126 */ 127 128int 129meta_make_sidenmlist( 130 mdsetname_t *sp, 131 mddrivename_t *dnp, 132 int import_flag, /* flags partial import */ 133 md_im_drive_info_t *midp, /* import drive information */ 134 md_error_t *ep 135) 136{ 137 mdsidenames_t *sn, **sn_next; 138 mdname_t *np; 139 int done; 140 side_t sideno = MD_SIDEWILD; 141 uint_t rep_slice; 142 char *bname; 143 144 if (!import_flag) { 145 /* 146 * Normal (aka NOT partial import) code path. 147 */ 148 if (meta_replicaslice(dnp, &rep_slice, ep) != 0) { 149 return (-1); 150 } 151 152 dnp->side_names_key = MD_KEYWILD; 153 154 if ((np = metaslicename(dnp, rep_slice, ep)) == NULL) 155 return (-1); 156 bname = Strdup(np->bname); 157 } else { 158 /* 159 * When doing a partial import, we'll get the needed 160 * information from somewhere other than the system. 
161 */ 162 dnp->side_names_key = MD_KEYWILD; 163 bname = Strdup(midp->mid_devname); 164 } 165 metaflushsidenames(dnp); 166 sn_next = &dnp->side_names; 167 /*CONSTCOND*/ 168 while (1) { 169 sn = Zalloc(sizeof (*sn)); 170 171 if ((done = meta_getnextside_devinfo(sp, bname, &sideno, 172 &sn->cname, &sn->dname, &sn->mnum, ep)) == -1) { 173 if (import_flag) { 174 mdclrerror(ep); 175 sn->dname = Strdup(midp->mid_driver_name); 176 sn->mnum = midp->mid_mnum; 177 } else { 178 Free(sn); 179 Free(bname); 180 return (-1); 181 } 182 } 183 184 if (done == 0) { 185 Free(sn); 186 Free(bname); 187 return (0); 188 } 189 190 sn->sideno = sideno; 191 192 /* Add to the end of the linked list */ 193 assert(*sn_next == NULL); 194 *sn_next = sn; 195 sn_next = &sn->next; 196 } 197 /*NOTREACHED*/ 198} 199 200int 201meta_set_adddrives( 202 mdsetname_t *sp, 203 mddrivenamelist_t *dnlp, 204 daddr_t dbsize, 205 int force_label, 206 md_error_t *ep 207) 208{ 209 md_set_desc *sd; 210 md_drive_desc *dd = NULL, *curdd = NULL, *ddp; 211 int i; 212 mddrivenamelist_t *p; 213 mhd_mhiargs_t mhiargs; 214 int rval = 0; 215 md_timeval32_t now; 216 sigset_t oldsigs; 217 ulong_t genid; 218 ulong_t max_genid = 0; 219 md_setkey_t *cl_sk; 220 int rb_level = 0; 221 md_error_t xep = mdnullerror; 222 md_mnnode_desc *nd; 223 int suspendall_flag = 0; 224 int suspend1_flag = 0; 225 int lock_flag = 0; 226 int flush_set_onerr = 0; 227 md_replicalist_t *rlp = NULL, *rl; 228 229 if ((sd = metaget_setdesc(sp, ep)) == NULL) 230 return (-1); 231 232 /* Make sure we own the set */ 233 if (meta_check_ownership(sp, ep) != 0) 234 return (-1); 235 236 /* 237 * The drive and node records are stored in the local mddbs of each 238 * node in the diskset. Each node's rpc.metad daemon reads in the set, 239 * drive and node records from that node's local mddb and caches them 240 * internally. Any process needing diskset information contacts its 241 * local rpc.metad to get this information. 
Since each node in the
	 * diskset is independently reading the set information from its local
	 * mddb, the set, drive and node records in the local mddbs must stay
	 * in-sync, so that all nodes have a consistent view of the diskset.
	 *
	 * For a multinode diskset, explicitly verify that all nodes in the
	 * diskset are ALIVE (i.e. are in the API membership list).  Otherwise,
	 * fail this operation since all nodes must be ALIVE in order to add
	 * the new drive record to their local mddb.  If a panic of this node
	 * leaves the local mddbs set, node and drive records out-of-sync, the
	 * reconfig cycle will fix the local mddbs and force them back into
	 * synchronization.
	 */
	if (MD_MNSET_DESC(sd)) {
		nd = sd->sd_nodelist;
		while (nd) {
			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
				(void) mddserror(ep, MDE_DS_NOTINMEMBERLIST,
				    sp->setno,
				    nd->nd_nodename, NULL, sp->setname);
				return (-1);
			}
			nd = nd->nd_next;
		}
	}

	/* Reject a drive list that names the same drive twice */
	if (drvsuniq(sp, dnlp, ep) == -1)
		return (-1);

	/*
	 * Lock the set on current set members.
	 * Set locking done much earlier for MN diskset than for traditional
	 * diskset since lock_set and SUSPEND are used to protect against
	 * other meta* commands running on the other nodes.
	 */
	if (MD_MNSET_DESC(sd)) {
		/* Make sure we are blocking all signals */
		if (procsigs(TRUE, &oldsigs, &xep) < 0)
			mdclrerror(&xep);

		nd = sd->sd_nodelist;
		/* All nodes are guaranteed to be ALIVE */
		while (nd) {
			if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
				rval = -1;
				goto out;
			}
			/* At least one node is locked; "out" must unlock */
			lock_flag = 1;
			nd = nd->nd_next;
		}
		/*
		 * Lock out other meta* commands by suspending
		 * class 1 messages across the diskset.
		 */
		nd = sd->sd_nodelist;
		/* All nodes are guaranteed to be ALIVE */
		while (nd) {
			if (clnt_mdcommdctl(nd->nd_nodename,
			    COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
			    MD_MSCF_NO_FLAGS, ep)) {
				rval = -1;
				goto out;
			}
			/* At least one node is suspended; "out" must resume */
			suspend1_flag = 1;
			nd = nd->nd_next;
		}
	}

	if (check_setnodes_againstdrivelist(sp, dnlp, ep)) {
		rval = -1;
		goto out;
	}

	/* None of the drives may already belong to a diskset */
	for (p = dnlp; p != NULL; p = p->next) {
		mdsetname_t	*tmp;

		if (meta_is_drive_in_anyset(p->drivenamep, &tmp, FALSE,
		    ep) == -1) {
			rval = -1;
			goto out;
		}

		if (tmp != NULL) {
			(void) mddserror(ep, MDE_DS_DRIVEINSET, sp->setno,
			    tmp->setname, p->drivenamep->cname, sp->setname);
			rval = -1;
			goto out;
		}
	}

	/* END CHECK CODE */

	/*
	 * This is a separate loop (from above) so that we validate all the
	 * drives handed to us before we repartition any one drive.
	 */
	for (p = dnlp; p != NULL; p = p->next) {
		if (meta_repartition_drive(sp,
		    p->drivenamep, force_label == TRUE ? MD_REPART_FORCE : 0,
		    NULL, /* Don't return the VTOC. */
		    ep) != 0) {
			rval = -1;
			goto out;
		}
		/*
		 * Create the names for the drives we are adding per side.
		 */
		if (meta_make_sidenmlist(sp, p->drivenamep, 0, NULL,
		    ep) == -1) {
			rval = -1;
			goto out;
		}
	}

	/*
	 * Get the list of drives descriptors that we are adding.
	 */
	dd = metaget_drivedesc_fromdrivelist(sp, dnlp, MD_DR_ADD, ep);

	if (! mdisok(ep)) {
		rval = -1;
		goto out;
	}

	/*
	 * Get the set timeout information.
	 */
	(void) memset(&mhiargs, '\0', sizeof (mhiargs));
	if (clnt_gtimeout(mynode(), sp, &mhiargs, ep) == -1) {
		rval = -1;
		goto out;
	}

	/*
	 * Get timestamp and generation id for new records
	 */
	now = sd->sd_ctime;
	genid = sd->sd_genid;


	/* At this point, in case of error, set should be flushed. */
	flush_set_onerr = 1;

	/* Lock the set on current set members */
	if (!(MD_MNSET_DESC(sd))) {
		md_rb_sig_handling_on();
		for (i = 0; i < MD_MAXSIDES; i++) {
			/* Skip empty slots */
			if (sd->sd_nodes[i][0] == '\0')
				continue;

			if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
				rval = -1;
				goto out;
			}
			lock_flag = 1;
		}
	}

	/*
	 * Get drive descriptors for the drives that are currently in the set.
	 * From here on failures go to "rollback" instead of "out".
	 */
	curdd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep);
	if (! mdisok(ep))
		goto rollback;

	/*
	 * If first drive being added to set, set the mastership
	 * of the multinode diskset to be this node.
	 * Only set it on this node.  If all goes well
	 * and there are no errors, the mastership of this node will be set
	 * on all nodes in user space and in the kernel.
	 */
	if ((MD_MNSET_DESC(sd)) && (curdd == NULL)) {
		if (clnt_mnsetmaster(mynode(), sp,
		    sd->sd_mn_mynode->nd_nodename,
		    sd->sd_mn_mynode->nd_nodeid, ep)) {
			goto rollback;
		}
		/*
		 * Set this up in my local cache of the set desc so that
		 * the set descriptor won't have to be gotten again from
		 * rpc.metad.  If it is flushed and gotten again, these
		 * values will be set in sr2setdesc.
		 */
		sd->sd_mn_master_nodeid = sd->sd_mn_mynode->nd_nodeid;
		(void) strcpy(sd->sd_mn_master_nodenm,
		    sd->sd_mn_mynode->nd_nodename);
		sd->sd_mn_am_i_master = 1;
	}

	RB_TEST(1, "adddrives", ep)

	RB_PREEMPT;
	rb_level = 1;	/* level 1 */

	RB_TEST(2, "adddrives", ep)

	/*
	 * Add the drive records for the drives that we are adding to
	 * each host in the set.  Marks the drive as MD_DR_ADD.
	 */
	if (MD_MNSET_DESC(sd)) {
		nd = sd->sd_nodelist;
		/* All nodes are guaranteed to be ALIVE */
		while (nd) {
			if (clnt_adddrvs(nd->nd_nodename, sp, dd, now, genid,
			    ep) == -1)
				goto rollback;

			RB_TEST(3, "adddrives", ep)
			nd = nd->nd_next;
		}
	} else {
		for (i = 0; i < MD_MAXSIDES; i++) {
			/* Skip empty slots */
			if (sd->sd_nodes[i][0] == '\0')
				continue;

			if (clnt_adddrvs(sd->sd_nodes[i], sp, dd, now, genid,
			    ep) == -1)
				goto rollback;

			RB_TEST(3, "adddrives", ep)
		}
	}

	RB_TEST(4, "adddrives", ep)

	RB_PREEMPT;
	rb_level = 2;	/* level 2 */

	RB_TEST(5, "adddrives", ep)

	/*
	 * Take ownership of the added drives.
	 */
	if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
		if (tk_own_bydd(sp, dd, &mhiargs, TRUE, ep))
			goto rollback;
	}

	/*
	 * If this is not a MN set and the state flags do not indicate the
	 * presence of devids, update the set records on all nodes.
	 */
	if (!(sd->sd_flags & MD_SR_MB_DEVID) && !(MD_MNSET_DESC(sd))) {
		if (meta_update_mb(sp, dd, ep) == 0) {
			mdclrerror(ep);

			/* update the sr_flags on all hosts */
			for (i = 0; i < MD_MAXSIDES; i++) {
				if (sd->sd_nodes[i][0] == '\0')
					continue;

				if (clnt_upd_sr_flags(sd->sd_nodes[i],
				    sp, (sd->sd_flags | MD_SR_MB_DEVID), ep))
					goto rollback;
			}
		}
	}

	RB_TEST(6, "adddrives", ep)

	RB_PREEMPT;
	rb_level = 3;	/* level 3 */

	RB_TEST(7, "adddrives", ep)

	/*
	 * Balance the DB's according to the list of existing drives and the
	 * list of added drives.
	 */
	if ((rval = meta_db_balance(sp, dd, curdd, dbsize, ep)) == -1)
		goto rollback;

	/*
	 * Slam a dummy master block on all the disks that we are adding
	 * that don't have replicas on them.
	 * Used by diskset import if the disksets are remotely replicated
	 *
	 * NOTE(review): rlp is not released anywhere in this function, and
	 * if metareplicalist() fails the error it may have left in ep is
	 * not cleared before execution continues -- confirm both are
	 * intended.
	 */
	if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) >= 0) {
		for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
			uint_t		rep_slice;
			int		fd = -1;
			mdname_t	*np = NULL;
			char		*drive_name;

			drive_name = ddp->dd_dnp->cname;

			/* Look for a replica that lives on this drive */
			for (rl = rlp; rl != NULL; rl = rl->rl_next) {
				char	*rep_name;

				rep_name =
				    rl->rl_repp->r_namep->drivenamep->cname;

				if (strcmp(drive_name, rep_name) == 0) {
					/*
					 * Disk has a replica on it so don't
					 * add dummy master block.
					 */
					break;
				}
			}
			if (rl == NULL) {
				/*
				 * Drive doesn't have a replica on it so
				 * we need a dummy master block.  Add it.
				 * Failures here are ignored (best effort).
				 */
				if (meta_replicaslice(ddp->dd_dnp, &rep_slice,
				    &xep) != 0) {
					mdclrerror(&xep);
					continue;
				}

				if ((np = metaslicename(ddp->dd_dnp, rep_slice,
				    &xep)) == NULL) {
					mdclrerror(&xep);
					continue;
				}

				if ((fd = open(np->rname, O_RDWR)) >= 0) {
					meta_mkdummymaster(sp, fd, 16);
					(void) close(fd);
				}
			}
		}
	}

	if ((curdd == NULL) && (MD_MNSET_DESC(sd))) {
		/*
		 * Notify rpc.mdcommd on all nodes of a nodelist change.
		 * Start by suspending rpc.mdcommd (which drains it of all
		 * messages), then change the nodelist followed by a reinit
		 * and resume.
		 *
		 * NOTE(review): a failure here leaves through "out", not
		 * "rollback", although rb_level is already 3; the drive
		 * records and replicas added above are not undone on this
		 * path -- confirm this is intended.
		 */
		nd = sd->sd_nodelist;
		/* All nodes are guaranteed to be ALIVE */
		while (nd) {
			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
				rval = -1;
				goto out;
			}
			suspendall_flag = 1;
			nd = nd->nd_next;
		}
	}

	/*
	 * If a MN diskset and this is the first disk(s) being added
	 * to set, then pre-allocate change log records here.
	 * When the other nodes are joined into the MN diskset, the
	 * USER records will just be snarfed in.
	 */
	if ((MD_MNSET_DESC(sd)) && (curdd == NULL)) {
		if (mdmn_allocate_changelog(sp, ep) != 0)
			goto rollback;
	}

	/*
	 * Mark the drives MD_DR_OK.
	 * If first drive being added to MN diskset, then set
	 * master on all nodes to be this node and then join
	 * all alive nodes (nodes in membership list) to set.
	 */
	if (MD_MNSET_DESC(sd)) {
		nd = sd->sd_nodelist;
		/* All nodes are guaranteed to be ALIVE */
		while (nd) {
			/* don't set master on this node - done earlier */
			if ((curdd == NULL) && (nd->nd_nodeid !=
			    sd->sd_mn_mynode->nd_nodeid)) {
				/*
				 * Set master on all alive nodes since
				 * all alive nodes will become joined nodes.
				 */
				if (clnt_mnsetmaster(nd->nd_nodename, sp,
				    sd->sd_mn_mynode->nd_nodename,
				    sd->sd_mn_mynode->nd_nodeid, ep)) {
					goto rollback;
				}
			}

			if (curdd == NULL) {
				/*
				 * No special flags for join set.  Since
				 * all nodes are joining if 1st drive is being
				 * added to set then all nodes will be either
				 * STALE or non-STALE and each node can
				 * determine this on its own.
				 */
				if (clnt_joinset(nd->nd_nodename, sp,
				    NULL, ep)) {
					goto rollback;
				}
				/* Sets join node flag on all nodes in list */
				if (clnt_upd_nr_flags(nd->nd_nodename, sp,
				    sd->sd_nodelist, MD_NR_JOIN, NULL, ep)) {
					goto rollback;
				}
			}

			/*
			 * Set MD_DR_OK as last thing before unlock.
			 * In case of panic on this node, recovery
			 * code can check for MD_DR_OK to determine
			 * status of diskset.
			 */
			if (clnt_upd_dr_flags(nd->nd_nodename, sp, dd,
			    MD_DR_OK, ep) == -1)
				goto rollback;


			RB_TEST(8, "adddrives", ep)
			nd = nd->nd_next;
		}
	} else {
		for (i = 0; i < MD_MAXSIDES; i++) {
			/* Skip empty slots */
			if (sd->sd_nodes[i][0] == '\0')
				continue;

			if (clnt_upd_dr_flags(sd->sd_nodes[i], sp, dd, MD_DR_OK,
			    ep) == -1)
				goto rollback;

			RB_TEST(8, "adddrives", ep)
		}
	}

	RB_TEST(9, "adddrives", ep)

	/*
	 * Common exit: reached on success and on failures that need no
	 * rollback.  Undoes only what the flags say was done (suspend, lock,
	 * signal blocking) and frees dd.
	 */
out:
	/*
	 * Notify rpc.mdcommd on all nodes of a nodelist change.
	 * Send reinit command to mdcommd which forces it to get
	 * fresh set description.
	 */
	if (suspendall_flag) {
		/* Send reinit */
		nd = sd->sd_nodelist;
		/* All nodes are guaranteed to be ALIVE */
		while (nd) {
			/* Class is ignored for REINIT */
			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
			    sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
				/* Report only the first error to the caller */
				if (rval == 0)
					(void) mdstealerror(ep, &xep);
				rval = -1;
				mde_perror(ep, dgettext(TEXT_DOMAIN,
				    "Unable to reinit rpc.mdcommd.\n"));
			}
			nd = nd->nd_next;
		}
	}
	/*
	 * Unlock diskset by resuming messages across the diskset.
	 * Just resume all classes so that resume is the same whether
	 * just one class was locked or all classes were locked.
	 */
	if ((suspend1_flag) || (suspendall_flag)) {
		nd = sd->sd_nodelist;
		/* All nodes are guaranteed to be ALIVE */
		while (nd) {
			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
				if (rval == 0)
					(void) mdstealerror(ep, &xep);
				rval = -1;
				mde_perror(ep, dgettext(TEXT_DOMAIN,
				    "Unable to resume rpc.mdcommd.\n"));
			}
			nd = nd->nd_next;
		}
		meta_ping_mnset(sp->setno);
	}

	if (lock_flag) {
		cl_sk = cl_get_setkey(sp->setno, sp->setname);
		if (MD_MNSET_DESC(sd)) {
			nd = sd->sd_nodelist;
			/* All nodes are guaranteed to be ALIVE */
			while (nd) {
				if (clnt_unlock_set(nd->nd_nodename,
				    cl_sk, &xep)) {
					if (rval == 0)
						(void) mdstealerror(ep, &xep);
					rval = -1;
				}
				nd = nd->nd_next;
			}
		} else {
			for (i = 0; i < MD_MAXSIDES; i++) {
				/* Skip empty slots */
				if (sd->sd_nodes[i][0] == '\0')
					continue;

				if (clnt_unlock_set(sd->sd_nodes[i],
				    cl_sk, &xep)) {
					if (rval == 0)
						(void) mdstealerror(ep, &xep);
					rval = -1;
				}
			}
		}
		cl_set_setkey(NULL);
	}

	metafreedrivedesc(&dd);

	/*
	 * NOTE(review): sd is still dereferenced below after
	 * metaflushsetname(sp); presumably the descriptor remains valid
	 * at that point -- confirm against metaflushsetname().
	 */
	if (flush_set_onerr) {
		metaflushsetname(sp);
		if (!(MD_MNSET_DESC(sd))) {
			md_rb_sig_handling_off(md_got_sig(), md_which_sig());
		}
	}

	if (MD_MNSET_DESC(sd)) {
		/* release signals back to what they were on entry */
		if (procsigs(FALSE, &oldsigs, &xep) < 0)
			mdclrerror(&xep);
	}

	return (rval);

	/*
	 * Failure exit once changes may have been made: undo them in reverse
	 * order according to rb_level.  Errors from the undo steps are
	 * collected in xep and discarded so that ep keeps the original error.
	 */
rollback:
	/* all signals already blocked for MN diskset */
	if (!(MD_MNSET_DESC(sd))) {
		/* Make sure we are blocking all signals */
		if (procsigs(TRUE, &oldsigs, &xep) < 0)
			mdclrerror(&xep);
	}

	rval = -1;

	max_genid = sd->sd_genid;

	/* level 3 */
	if (rb_level > 2) {
		/*
		 * Since the add drive operation is failing, need
		 * to reset config back to the way it was
		 * before the add drive operation.
		 * If a MN diskset and this is the first drive being added,
		 * then reset master on all ALIVE nodes (which is all nodes)
		 * since the master would have not been set previously.
		 * Don't reset master on this node, since this
		 * is done later.
		 * This is ok to fail since next node to add first
		 * disk to diskset will also set the master on all nodes.
		 *
		 * Also, if this is the first drive being added,
		 * need to have each node withdraw itself from the set.
		 */
		if ((MD_MNSET_DESC(sd)) && (curdd == NULL)) {
			nd = sd->sd_nodelist;
			/* All nodes are guaranteed to be ALIVE */
			while (nd) {
				/*
				 * Be careful with ordering in case of
				 * panic between the steps and the
				 * effect on recovery during reconfig.
				 */
				if (clnt_withdrawset(nd->nd_nodename, sp, &xep))
					mdclrerror(&xep);

				/* Sets withdraw flag on all nodes in list */
				if (clnt_upd_nr_flags(nd->nd_nodename, sp,
				    sd->sd_nodelist, MD_NR_WITHDRAW,
				    NULL, &xep)) {
					mdclrerror(&xep);
				}

				/* Skip this node */
				if (nd->nd_nodeid ==
				    sd->sd_mn_mynode->nd_nodeid) {
					nd = nd->nd_next;
					continue;
				}
				/* Reset master on all of the other nodes. */
				if (clnt_mnsetmaster(nd->nd_nodename, sp,
				    "", MD_MN_INVALID_NID, &xep))
					mdclrerror(&xep);
				nd = nd->nd_next;
			}
		}
	}

	/*
	 * Send resume command to mdcommd.  Don't send reinit command
	 * since nodelist should not have changed.
	 * If suspendall_flag is set, then user would have been adding
	 * first drives to set.  Since this failed, there is certainly
	 * no reinit message to send to rpc.mdcommd since no nodes will
	 * be joined to set at the end of this metaset command.
	 */
	if (suspendall_flag) {
		/* Send resume */
		nd = sd->sd_nodelist;
		/* All nodes are guaranteed to be ALIVE */
		while (nd) {
			/*
			 * Resume all classes but class 1 so that lock is held
			 * against meta* commands.
			 * To later resume class1, must issue a class0 resume.
			 */
			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
			    sp, MD_MSG_CLASS0,
			    MD_MSCF_DONT_RESUME_CLASS1, &xep)) {
				mde_perror(&xep, dgettext(TEXT_DOMAIN,
				    "Unable to resume rpc.mdcommd.\n"));
				mdclrerror(&xep);
			}
			nd = nd->nd_next;
		}
		meta_ping_mnset(sp->setno);
	}

	/* level 3 */
	if (rb_level > 2) {
		mdnamelist_t	*nlp;
		mdname_t	*np;

		/* Detach any replica on the replica slice of each new drive */
		for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
			uint_t	rep_slice;

			if ((meta_replicaslice(ddp->dd_dnp,
			    &rep_slice, &xep) != 0) ||
			    ((np = metaslicename(ddp->dd_dnp, rep_slice,
			    &xep)) == NULL)) {
				mdclrerror(&xep);
				continue;
			}
			nlp = NULL;
			(void) metanamelist_append(&nlp, np);

			if (meta_db_detach(sp, nlp,
			    (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL, &xep))
				mdclrerror(&xep);

			metafreenamelist(nlp);
		}

		/* Re-balance */
		if (meta_db_balance(sp, NULL, curdd, 0, &xep) == -1)
			mdclrerror(&xep);

		/* Only if we are adding the first drive */
		/* Handled MN diskset above. */
		if ((curdd == NULL) && !(MD_MNSET_DESC(sd))) {
			if (clnt_stimeout(mynode(), sp, &defmhiargs,
			    &xep) == -1)
				mdclrerror(&xep);

			/* This is needed because of a corner case */
			if (halt_set(sp, &xep))
				mdclrerror(&xep);
		}
		max_genid++;
	}

	/* level 2 */
	if (rb_level > 1) {
		if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
			if (rel_own_bydd(sp, dd, TRUE, &xep))
				mdclrerror(&xep);
		}
	}

	/* level 1 */
	if (rb_level > 0) {
		/* Remove the drive records added on every node */
		if (MD_MNSET_DESC(sd)) {
			nd = sd->sd_nodelist;
			/* All nodes are guaranteed to be ALIVE */
			while (nd) {
				if (clnt_deldrvs(nd->nd_nodename, sp, dd,
				    &xep) == -1)
					mdclrerror(&xep);
				nd = nd->nd_next;
			}
		} else {
			for (i = 0; i < MD_MAXSIDES; i++) {
				/* Skip empty slots */
				if (sd->sd_nodes[i][0] == '\0')
					continue;

				if (clnt_deldrvs(sd->sd_nodes[i], sp, dd,
				    &xep) == -1)
					mdclrerror(&xep);
			}
		}
		max_genid += 2;
		resync_genid(sp, sd, max_genid, 0, NULL);
	}

	if ((suspend1_flag) || (suspendall_flag)) {
		/* Send resume */
		nd = sd->sd_nodelist;
		/* All nodes are guaranteed to be ALIVE */
		while (nd) {
			/*
			 * Just resume all classes so that resume is the
			 * same whether just one class was locked or all
			 * classes were locked.
			 */
			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
				mdclrerror(&xep);
			}
			nd = nd->nd_next;
		}
		meta_ping_mnset(sp->setno);
	}

	/* level 0 */
	cl_sk = cl_get_setkey(sp->setno, sp->setname);
	/* Don't test lock flag since guaranteed to be set if in rollback */
	if (MD_MNSET_DESC(sd)) {
		/*
		 * Since the add drive operation is failing, need
		 * to reset config back to the way it was
		 * before the add drive operation.
		 * If a MN diskset and this is the first drive being
		 * added, then reset master on this node since
		 * the master would have not been set previously.
		 * This is ok to fail since next node to add first
		 * disk to diskset will also set the master on all nodes.
		 */
		if (curdd == NULL) {
			/* Reset master on mynode */
			if (clnt_mnsetmaster(mynode(), sp, "",
			    MD_MN_INVALID_NID, &xep))
				mdclrerror(&xep);
		}
		nd = sd->sd_nodelist;
		/* All nodes are guaranteed to be ALIVE */
		while (nd) {
			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep))
				mdclrerror(&xep);
			nd = nd->nd_next;
		}
	} else {
		for (i = 0; i < MD_MAXSIDES; i++) {
			/* Skip empty slots */
			if (sd->sd_nodes[i][0] == '\0')
				continue;

			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep))
				mdclrerror(&xep);
		}
	}
	cl_set_setkey(NULL);

	/* release signals back to what they were on entry */
	if (procsigs(FALSE, &oldsigs, &xep) < 0)
		mdclrerror(&xep);

	metafreedrivedesc(&dd);

	/*
	 * NOTE(review): as in the "out" path, sd is dereferenced after
	 * metaflushsetname(sp) -- confirm the descriptor is still valid.
	 */
	if (flush_set_onerr) {
		metaflushsetname(sp);
		if (!(MD_MNSET_DESC(sd))) {
			md_rb_sig_handling_off(md_got_sig(), md_which_sig());
		}
	}

	return (rval);
}

/*
 * Add drives routine used during import of a diskset.
1013 */ 1014int 1015meta_imp_set_adddrives( 1016 mdsetname_t *sp, 1017 mddrivenamelist_t *dnlp, 1018 md_im_set_desc_t *misp, 1019 md_error_t *ep 1020) 1021{ 1022 md_set_desc *sd; 1023 mddrivenamelist_t *p; 1024 md_drive_desc *dd = NULL, *ddp; 1025 int flush_set_onerr = 0; 1026 md_timeval32_t now; 1027 ulong_t genid; 1028 mhd_mhiargs_t mhiargs; 1029 md_im_replica_info_t *mirp; 1030 md_im_drive_info_t *midp; 1031 int rval = 0; 1032 sigset_t oldsigs; 1033 ulong_t max_genid = 0; 1034 int rb_level = 0; 1035 md_error_t xep = mdnullerror; 1036 1037 if ((sd = metaget_setdesc(sp, ep)) == NULL) 1038 return (-1); 1039 1040 for (p = dnlp; p != NULL; p = p->next) { 1041 int imp_flag = 0; 1042 1043 /* 1044 * If we have a partial diskset, meta_make_sidenmlist will 1045 * need information from midp to complete making the 1046 * side name structure. 1047 */ 1048 if (misp->mis_partial) { 1049 imp_flag = MDDB_C_IMPORT; 1050 for (midp = misp->mis_drives; midp != NULL; 1051 midp = midp->mid_next) { 1052 if (midp->mid_dnp == p->drivenamep) 1053 break; 1054 } 1055 if (midp == NULL) { 1056 (void) mddserror(ep, MDE_DS_SETNOTIMP, 1057 MD_SET_BAD, mynode(), NULL, sp->setname); 1058 rval = -1; 1059 goto out; 1060 } 1061 } 1062 /* 1063 * Create the names for the drives we are adding per side. 1064 */ 1065 if (meta_make_sidenmlist(sp, p->drivenamep, imp_flag, 1066 midp, ep) == -1) { 1067 rval = -1; 1068 goto out; 1069 } 1070 } 1071 1072 /* 1073 * Get the list of drives descriptors that we are adding. 1074 */ 1075 dd = metaget_drivedesc_fromdrivelist(sp, dnlp, MD_DR_ADD, ep); 1076 1077 if (! mdisok(ep)) { 1078 rval = -1; 1079 goto out; 1080 } 1081 1082 /* 1083 * Get the set timeout information. 
1084 */ 1085 (void) memset(&mhiargs, '\0', sizeof (mhiargs)); 1086 if (clnt_gtimeout(mynode(), sp, &mhiargs, ep) == -1) { 1087 rval = -1; 1088 goto out; 1089 } 1090 1091 /* 1092 * Get timestamp and generation id for new records 1093 */ 1094 now = sd->sd_ctime; 1095 genid = sd->sd_genid; 1096 1097 /* At this point, in case of error, set should be flushed. */ 1098 flush_set_onerr = 1; 1099 1100 rb_level = 1; /* level 1 */ 1101 1102 for (midp = misp->mis_drives; midp != NULL; midp = midp->mid_next) { 1103 for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) { 1104 if (ddp->dd_dnp == midp->mid_dnp) { 1105 /* same disk */ 1106 ddp->dd_dnp->devid = 1107 devid_str_encode(midp->mid_devid, 1108 midp->mid_minor_name); 1109 1110 ddp->dd_dbcnt = 0; 1111 mirp = midp->mid_replicas; 1112 if (mirp) { 1113 ddp->dd_dbsize = mirp->mir_length; 1114 for (; mirp != NULL; 1115 mirp = mirp->mir_next) { 1116 ddp->dd_dbcnt++; 1117 } 1118 } 1119 if ((midp->mid_available & 1120 MD_IM_DISK_NOT_AVAILABLE) && 1121 (misp->mis_flags & MD_IM_SET_REPLICATED)) { 1122 ddp->dd_flags = MD_DR_UNRSLV_REPLICATED; 1123 } 1124 } 1125 } 1126 } 1127 1128 /* 1129 * Add the drive records for the drives that we are adding to 1130 * each host in the set. Marks the drive records as MD_DR_ADD. 1131 * May also mark a drive record as MD_DR_UNRSLV_REPLICATED if 1132 * this flag was set in the dd_flags for that drive. 1133 */ 1134 if (clnt_imp_adddrvs(mynode(), sp, dd, now, genid, ep) == -1) 1135 goto rollback; 1136 1137 rb_level = 2; /* level 2 */ 1138 1139 /* 1140 * Take ownership of the added drives. 
1141 */ 1142 if (tk_own_bydd(sp, dd, &mhiargs, TRUE, ep)) 1143 goto rollback; 1144 1145out: 1146 metafreedrivedesc(&dd); 1147 1148 if (flush_set_onerr) { 1149 metaflushsetname(sp); 1150 } 1151 1152 return (rval); 1153 1154rollback: 1155 /* Make sure we are blocking all signals */ 1156 if (procsigs(TRUE, &oldsigs, &xep) < 0) 1157 mdclrerror(&xep); 1158 1159 rval = -1; 1160 1161 max_genid = sd->sd_genid; 1162 1163 /* level 2 */ 1164 if (rb_level > 1) { 1165 if (!MD_ATSET_DESC(sd)) { 1166 if (rel_own_bydd(sp, dd, TRUE, &xep)) { 1167 mdclrerror(&xep); 1168 } 1169 } 1170 } 1171 1172 /* level 1 */ 1173 if (rb_level > 0) { 1174 if (clnt_deldrvs(mynode(), sp, dd, &xep) == -1) { 1175 mdclrerror(&xep); 1176 } 1177 max_genid += 2; 1178 resync_genid(sp, sd, max_genid, 0, NULL); 1179 } 1180 1181 /* level 0 */ 1182 1183 /* release signals back to what they were on entry */ 1184 if (procsigs(FALSE, &oldsigs, &xep) < 0) 1185 mdclrerror(&xep); 1186 1187 metafreedrivedesc(&dd); 1188 1189 if (flush_set_onerr) { 1190 metaflushsetname(sp); 1191 md_rb_sig_handling_off(md_got_sig(), md_which_sig()); 1192 } 1193 1194 return (rval); 1195} 1196 1197int 1198meta_set_deletedrives( 1199 mdsetname_t *sp, 1200 mddrivenamelist_t *dnlp, 1201 int forceflg, 1202 md_error_t *ep 1203) 1204{ 1205 md_set_desc *sd; 1206 md_drive_desc *ddp, *dd = NULL, *curdd = NULL; 1207 md_replicalist_t *rlp = NULL, *rl; 1208 mddrivenamelist_t *p; 1209 int deldrvcnt = 0; 1210 int rval = 0; 1211 mhd_mhiargs_t mhiargs; 1212 int i; 1213 sigset_t oldsigs; 1214 md_setkey_t *cl_sk; 1215 ulong_t max_genid = 0; 1216 int rb_level = 0; 1217 md_error_t xep = mdnullerror; 1218 md_mnnode_desc *nd; 1219 int has_set; 1220 int current_drv_cnt = 0; 1221 int suspendall_flag = 0, suspendall_flag_rb = 0; 1222 int suspend1_flag = 0; 1223 int lock_flag = 0; 1224 bool_t stale_bool = FALSE; 1225 int flush_set_onerr = 0; 1226 mdnamelist_t *nlp; 1227 mdname_t *np; 1228 1229 if ((sd = metaget_setdesc(sp, ep)) == NULL) 1230 return (-1); 1231 1232 
/* Make sure we own the set */ 1233 if (meta_check_ownership(sp, ep) != 0) 1234 return (-1); 1235 1236 if (drvsuniq(sp, dnlp, ep) == -1) 1237 return (-1); 1238 1239 /* 1240 * Check and see if all the nodes have the set. 1241 * 1242 * The drive and node records are stored in the local mddbs of each 1243 * node in the diskset. Each node's rpc.metad daemon reads in the set, 1244 * drive and node records from that node's local mddb and caches them 1245 * internally. Any process needing diskset information contacts its 1246 * local rpc.metad to get this information. Since each node in the 1247 * diskset is independently reading the set information from its local 1248 * mddb, the set, drive and node records in the local mddbs must stay 1249 * in-sync, so that all nodes have a consistent view of the diskset. 1250 * 1251 * For a multinode diskset, explicitly verify that all nodes in the 1252 * diskset are ALIVE (i.e. are in the API membership list). Otherwise, 1253 * fail this operation since all nodes must be ALIVE in order to delete 1254 * a drive record from their local mddb. If a panic of this node 1255 * leaves the local mddbs set, node and drive records out-of-sync, the 1256 * reconfig cycle will fix the local mddbs and force them back into 1257 * synchronization. 1258 */ 1259 if (MD_MNSET_DESC(sd)) { 1260 nd = sd->sd_nodelist; 1261 while (nd) { 1262 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1263 (void) mddserror(ep, MDE_DS_NOTINMEMBERLIST, 1264 sp->setno, 1265 nd->nd_nodename, NULL, sp->setname); 1266 return (-1); 1267 } 1268 nd = nd->nd_next; 1269 } 1270 1271 /* Make sure we are blocking all signals */ 1272 if (procsigs(TRUE, &oldsigs, &xep) < 0) 1273 mdclrerror(&xep); 1274 1275 /* 1276 * Lock the set on current set members. 1277 * Set locking done much earlier for MN diskset than for 1278 * traditional diskset since lock_set and SUSPEND are used 1279 * to protect against other meta* commands running on the 1280 * other nodes. 
1281 */ 1282 nd = sd->sd_nodelist; 1283 /* All nodes are guaranteed to be ALIVE */ 1284 while (nd) { 1285 if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 1286 rval = -1; 1287 goto out; 1288 } 1289 lock_flag = 1; 1290 nd = nd->nd_next; 1291 } 1292 /* 1293 * Lock out other meta* commands by suspending 1294 * class 1 messages across the diskset. 1295 */ 1296 nd = sd->sd_nodelist; 1297 /* All nodes are guaranteed to be ALIVE */ 1298 while (nd) { 1299 if (clnt_mdcommdctl(nd->nd_nodename, 1300 COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1, 1301 MD_MSCF_NO_FLAGS, ep)) { 1302 rval = -1; 1303 goto out; 1304 } 1305 suspend1_flag = 1; 1306 nd = nd->nd_next; 1307 } 1308 1309 nd = sd->sd_nodelist; 1310 /* All nodes are guaranteed to be ALIVE */ 1311 while (nd) { 1312 if (strcmp(nd->nd_nodename, mynode()) == 0) { 1313 nd = nd->nd_next; 1314 continue; 1315 } 1316 1317 has_set = nodehasset(sp, nd->nd_nodename, 1318 NHS_NSTG_EQ, ep); 1319 if (has_set < 0) { 1320 rval = -1; 1321 goto out; 1322 } 1323 1324 if (! has_set) { 1325 (void) mddserror(ep, MDE_DS_NODENOSET, 1326 sp->setno, nd->nd_nodename, 1327 NULL, sp->setname); 1328 rval = -1; 1329 goto out; 1330 } 1331 nd = nd->nd_next; 1332 } 1333 } else { 1334 for (i = 0; i < MD_MAXSIDES; i++) { 1335 /* Skip empty slots */ 1336 if (sd->sd_nodes[i][0] == '\0') 1337 continue; 1338 1339 if (strcmp(sd->sd_nodes[i], mynode()) == 0) 1340 continue; 1341 1342 has_set = nodehasset(sp, sd->sd_nodes[i], NHS_NSTG_EQ, 1343 ep); 1344 if (has_set < 0) { 1345 /* 1346 * Can directly return since !MN diskset; 1347 * nothing to unlock. 1348 */ 1349 return (-1); 1350 } 1351 1352 if (! has_set) { 1353 /* 1354 * Can directly return since !MN diskset; 1355 * nothing to unlock. 
1356 */ 1357 return (mddserror(ep, MDE_DS_NODENOSET, 1358 sp->setno, sd->sd_nodes[i], NULL, 1359 sp->setname)); 1360 } 1361 } 1362 } 1363 1364 for (p = dnlp; p != NULL; p = p->next) { 1365 int is_it; 1366 mddrivename_t *dnp; 1367 1368 dnp = p->drivenamep; 1369 1370 if ((is_it = meta_is_drive_in_thisset(sp, dnp, FALSE, ep)) 1371 == -1) { 1372 rval = -1; 1373 goto out; 1374 } 1375 1376 if (! is_it) { 1377 (void) mddserror(ep, MDE_DS_DRIVENOTINSET, sp->setno, 1378 NULL, dnp->cname, sp->setname); 1379 rval = -1; 1380 goto out; 1381 } 1382 1383 if ((meta_check_drive_inuse(sp, dnp, FALSE, ep)) == -1) { 1384 rval = -1; 1385 goto out; 1386 } 1387 1388 deldrvcnt++; 1389 } 1390 current_drv_cnt = deldrvcnt; 1391 1392 /* 1393 * Get drive descriptors for the drives that are currently in the set. 1394 */ 1395 curdd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep); 1396 if (! mdisok(ep)) { 1397 rval = -1; 1398 goto out; 1399 } 1400 1401 /* 1402 * Decrement the the delete drive count for each drive currently in the 1403 * set. 1404 */ 1405 for (ddp = curdd; ddp != NULL; ddp = ddp->dd_next) 1406 deldrvcnt--; 1407 1408 /* 1409 * If the count of drives we are deleting is equal to the drives in the 1410 * set, and we haven't specified forceflg, return an error 1411 */ 1412 if (deldrvcnt == 0 && forceflg == FALSE) { 1413 (void) mderror(ep, MDE_FORCE_DEL_ALL_DRV, NULL); 1414 rval = -1; 1415 goto out; 1416 } 1417 1418 /* 1419 * Get the list of drive descriptors that we are deleting. 1420 */ 1421 dd = metaget_drivedesc_fromdrivelist(sp, dnlp, MD_DR_DEL, ep); 1422 if (! mdisok(ep)) { 1423 rval = -1; 1424 goto out; 1425 } 1426 1427 /* 1428 * Get the set timeout information in case we have to roll back. 1429 */ 1430 (void) memset(&mhiargs, '\0', sizeof (mhiargs)); 1431 if (clnt_gtimeout(mynode(), sp, &mhiargs, ep) == -1) { 1432 rval = -1; 1433 goto out; 1434 } 1435 1436 /* At this point, in case of error, set should be flushed. 
*/ 1437 flush_set_onerr = 1; 1438 1439 /* END CHECK CODE */ 1440 1441 /* Lock the set on current set members */ 1442 if (!(MD_MNSET_DESC(sd))) { 1443 md_rb_sig_handling_on(); 1444 for (i = 0; i < MD_MAXSIDES; i++) { 1445 /* Skip empty slots */ 1446 if (sd->sd_nodes[i][0] == '\0') 1447 continue; 1448 1449 if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) { 1450 rval = -1; 1451 goto out; 1452 } 1453 lock_flag = 1; 1454 } 1455 } 1456 1457 if ((deldrvcnt == 0) && (MD_MNSET_DESC(sd))) { 1458 mddb_config_t c; 1459 /* 1460 * Is current set STALE? 1461 */ 1462 (void) memset(&c, 0, sizeof (c)); 1463 c.c_id = 0; 1464 c.c_setno = sp->setno; 1465 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) { 1466 (void) mdstealerror(ep, &c.c_mde); 1467 rval = -1; 1468 goto out; 1469 } 1470 if (c.c_flags & MDDB_C_STALE) { 1471 stale_bool = TRUE; 1472 } 1473 } 1474 1475 RB_TEST(1, "deletedrives", ep) 1476 1477 RB_PREEMPT; 1478 rb_level = 1; /* level 1 */ 1479 1480 RB_TEST(2, "deletedrives", ep) 1481 1482 /* 1483 * Mark the drives MD_DR_DEL 1484 */ 1485 if (MD_MNSET_DESC(sd)) { 1486 nd = sd->sd_nodelist; 1487 /* All nodes are guaranteed to be ALIVE */ 1488 while (nd) { 1489 if (clnt_upd_dr_flags(nd->nd_nodename, sp, dd, 1490 MD_DR_DEL, ep) == -1) 1491 goto rollback; 1492 1493 RB_TEST(3, "deletedrives", ep) 1494 nd = nd->nd_next; 1495 } 1496 } else { 1497 for (i = 0; i < MD_MAXSIDES; i++) { 1498 /* Skip empty slots */ 1499 if (sd->sd_nodes[i][0] == '\0') 1500 continue; 1501 1502 if (clnt_upd_dr_flags(sd->sd_nodes[i], sp, dd, 1503 MD_DR_DEL, ep) == -1) 1504 goto rollback; 1505 1506 RB_TEST(3, "deletedrives", ep) 1507 } 1508 } 1509 1510 RB_TEST(4, "deletedrives", ep) 1511 1512 RB_PREEMPT; 1513 rb_level = 2; /* level 2 */ 1514 1515 RB_TEST(5, "deletedrives", ep) 1516 1517 /* 1518 * Balance the DB's according to the list of existing drives and the 1519 * list of deleted drives. 
1520 */ 1521 if (meta_db_balance(sp, dd, curdd, 0, ep) == -1) 1522 goto rollback; 1523 1524 /* 1525 * If the drive(s) to be deleted cannot be accessed, 1526 * they haven't really been deleted yet. Check and delete now 1527 * if need be. 1528 */ 1529 if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) >= 0) { 1530 nlp = NULL; 1531 for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) { 1532 char *delete_name; 1533 1534 delete_name = ddp->dd_dnp->cname; 1535 1536 for (rl = rlp; rl != NULL; rl = rl->rl_next) { 1537 char *cur_name; 1538 1539 cur_name = 1540 rl->rl_repp->r_namep->drivenamep->cname; 1541 1542 if (strcmp(delete_name, cur_name) == 0) { 1543 /* put it on the delete list */ 1544 np = rl->rl_repp->r_namep; 1545 (void) metanamelist_append(&nlp, np); 1546 1547 } 1548 } 1549 } 1550 1551 if (nlp != NULL) { 1552 if (meta_db_detach(sp, nlp, 1553 (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL, 1554 ep) == -1) { 1555 metafreenamelist(nlp); 1556 goto rollback; 1557 } 1558 metafreenamelist(nlp); 1559 } 1560 } 1561 1562 RB_TEST(6, "deletedrives", ep) 1563 1564 RB_PREEMPT; 1565 rb_level = 3; /* level 3 */ 1566 1567 RB_TEST(7, "deletedrives", ep) 1568 1569 /* 1570 * Cannot suspend set until after meta_db_balance since 1571 * meta_db_balance uses META_DB_ATTACH/DETACH messages. 1572 */ 1573 if ((deldrvcnt == 0) && (MD_MNSET_DESC(sd))) { 1574 /* 1575 * Notify rpc.mdcommd on all nodes of a nodelist change. 1576 * Start by suspending rpc.mdcommd (which drains it of all 1577 * messages), then change the nodelist followed by a reinit 1578 * and resume. 1579 */ 1580 nd = sd->sd_nodelist; 1581 /* All nodes are guaranteed to be ALIVE */ 1582 while (nd) { 1583 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, 1584 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) { 1585 rval = -1; 1586 goto out; 1587 } 1588 suspendall_flag = 1; 1589 nd = nd->nd_next; 1590 } 1591 } 1592 1593 /* 1594 * Remove the drive records for the drives that were deleted from 1595 * each host in the set. 
This removes the record and dr_flags. 1596 */ 1597 if (MD_MNSET_DESC(sd)) { 1598 nd = sd->sd_nodelist; 1599 /* All nodes are guaranteed to be ALIVE */ 1600 while (nd) { 1601 if (clnt_deldrvs(nd->nd_nodename, sp, dd, ep) == -1) 1602 goto rollback; 1603 1604 RB_TEST(8, "deletedrives", ep) 1605 nd = nd->nd_next; 1606 } 1607 } else { 1608 for (i = 0; i < MD_MAXSIDES; i++) { 1609 /* Skip empty slots */ 1610 if (sd->sd_nodes[i][0] == '\0') 1611 continue; 1612 1613 if (clnt_deldrvs(sd->sd_nodes[i], sp, dd, ep) == -1) 1614 goto rollback; 1615 1616 RB_TEST(8, "deletedrives", ep) 1617 } 1618 } 1619 1620 RB_TEST(9, "deletedrives", ep) 1621 1622 RB_PREEMPT; 1623 rb_level = 4; /* level 4 */ 1624 1625 RB_TEST(10, "deletedrives", ep) 1626 1627 if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) { 1628 if (rel_own_bydd(sp, dd, TRUE, ep)) 1629 goto rollback; 1630 } 1631 1632 /* If we deleted all the drives, then we need to halt the set. */ 1633 if (deldrvcnt == 0) { 1634 RB_TEST(11, "deletedrives", ep) 1635 1636 RB_PREEMPT; 1637 rb_level = 5; /* level 5 */ 1638 1639 RB_TEST(12, "deletedrives", ep) 1640 1641 if (clnt_stimeout(mynode(), sp, &defmhiargs, ep) == -1) 1642 goto rollback; 1643 1644 RB_TEST(13, "deletedrives", ep) 1645 1646 RB_PREEMPT; 1647 rb_level = 6; /* level 6 */ 1648 1649 RB_TEST(14, "deletedrives", ep) 1650 1651 /* Halt MN diskset on all nodes by having node withdraw */ 1652 if (MD_MNSET_DESC(sd)) { 1653 nd = sd->sd_nodelist; 1654 /* All nodes are guaranteed to be ALIVE */ 1655 while (nd) { 1656 /* Only withdraw nodes that are joined */ 1657 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 1658 nd = nd->nd_next; 1659 continue; 1660 } 1661 /* 1662 * Going to set locally cached node flags to 1663 * rollback join so in case of error, the 1664 * rollback code knows which nodes to re-join. 1665 */ 1666 nd->nd_flags |= MD_MN_NODE_RB_JOIN; 1667 1668 /* 1669 * Be careful in ordering of following steps 1670 * so that recovery from a panic between 1671 * the steps is viable. 
1672 * Only reset master info in rpc.metad - 1673 * don't reset local cached information 1674 * which will be used to set master information 1675 * back in case of failure (rollback). 1676 */ 1677 if (clnt_withdrawset(nd->nd_nodename, sp, ep)) 1678 goto rollback; 1679 /* Sets withdraw flag on all nodes in list */ 1680 if (clnt_upd_nr_flags(nd->nd_nodename, sp, 1681 sd->sd_nodelist, MD_NR_WITHDRAW, 1682 NULL, ep)) { 1683 goto rollback; 1684 } 1685 if (clnt_mnsetmaster(nd->nd_nodename, sp, 1686 "", MD_MN_INVALID_NID, ep)) { 1687 goto rollback; 1688 } 1689 nd = nd->nd_next; 1690 } 1691 } else { 1692 if (halt_set(sp, ep)) 1693 goto rollback; 1694 } 1695 1696 RB_TEST(15, "deletedrives", ep) 1697 } 1698 1699 RB_TEST(16, "deletedrives", ep) 1700 1701out: 1702 /* 1703 * Notify rpc.mdcommd on all nodes of a nodelist change. 1704 * Send reinit command to mdcommd which forces it to get 1705 * fresh set description. 1706 */ 1707 if (suspendall_flag) { 1708 /* Send reinit */ 1709 nd = sd->sd_nodelist; 1710 /* All nodes are guaranteed to be ALIVE */ 1711 while (nd) { 1712 /* Class is ignored for REINIT */ 1713 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT, 1714 sp, NULL, MD_MSCF_NO_FLAGS, &xep)) { 1715 if (rval == 0) 1716 (void) mdstealerror(ep, &xep); 1717 rval = -1; 1718 mde_perror(ep, dgettext(TEXT_DOMAIN, 1719 "Unable to reinit rpc.mdcommd.\n")); 1720 } 1721 nd = nd->nd_next; 1722 } 1723 } 1724 1725 /* 1726 * Just resume all classes so that resume is the same whether 1727 * just one class was locked or all classes were locked. 
1728 */ 1729 if ((suspend1_flag) || (suspendall_flag)) { 1730 /* Send resume */ 1731 nd = sd->sd_nodelist; 1732 /* All nodes are guaranteed to be ALIVE */ 1733 while (nd) { 1734 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, 1735 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { 1736 if (rval == 0) 1737 (void) mdstealerror(ep, &xep); 1738 rval = -1; 1739 mde_perror(ep, dgettext(TEXT_DOMAIN, 1740 "Unable to resume rpc.mdcommd.\n")); 1741 } 1742 nd = nd->nd_next; 1743 } 1744 meta_ping_mnset(sp->setno); 1745 } 1746 if (lock_flag) { 1747 cl_sk = cl_get_setkey(sp->setno, sp->setname); 1748 if (MD_MNSET_DESC(sd)) { 1749 nd = sd->sd_nodelist; 1750 /* All nodes are guaranteed to be ALIVE */ 1751 while (nd) { 1752 if (clnt_unlock_set(nd->nd_nodename, 1753 cl_sk, &xep)) { 1754 if (rval == 0) 1755 (void) mdstealerror(ep, &xep); 1756 rval = -1; 1757 } 1758 nd = nd->nd_next; 1759 } 1760 } else { 1761 for (i = 0; i < MD_MAXSIDES; i++) { 1762 /* Skip empty slots */ 1763 if (sd->sd_nodes[i][0] == '\0') 1764 continue; 1765 1766 if (clnt_unlock_set(sd->sd_nodes[i], 1767 cl_sk, &xep)) { 1768 if (rval == 0) 1769 (void) mdstealerror(ep, &xep); 1770 rval = -1; 1771 } 1772 } 1773 } 1774 cl_set_setkey(NULL); 1775 } 1776 1777 metafreedrivedesc(&dd); 1778 1779 if (flush_set_onerr) { 1780 metaflushsetname(sp); 1781 if (!(MD_MNSET_DESC(sd))) { 1782 md_rb_sig_handling_off(md_got_sig(), md_which_sig()); 1783 } 1784 } 1785 1786 if (MD_MNSET_DESC(sd)) { 1787 /* release signals back to what they were on entry */ 1788 if (procsigs(FALSE, &oldsigs, &xep) < 0) 1789 mdclrerror(&xep); 1790 } 1791 1792 return (rval); 1793 1794rollback: 1795 /* all signals already blocked for MN disket */ 1796 if (!(MD_MNSET_DESC(sd))) { 1797 /* Make sure we are blocking all signals */ 1798 if (procsigs(TRUE, &oldsigs, &xep) < 0) 1799 mdclrerror(&xep); 1800 } 1801 1802 rval = -1; 1803 1804 max_genid = sd->sd_genid; 1805 1806 /* Set the master on all nodes first thing */ 1807 if (rb_level > 5) { 1808 if 
(MD_MNSET_DESC(sd)) { 1809 nd = sd->sd_nodelist; 1810 /* All nodes are guaranteed to be ALIVE */ 1811 while (nd) { 1812 if (!(nd->nd_flags & MD_MN_NODE_RB_JOIN)) { 1813 continue; 1814 } 1815 /* 1816 * Set master on all re-joining nodes to be 1817 * my cached view of master. 1818 */ 1819 if (clnt_mnsetmaster(nd->nd_nodename, sp, 1820 sd->sd_mn_master_nodenm, 1821 sd->sd_mn_master_nodeid, &xep)) { 1822 mdclrerror(&xep); 1823 } 1824 } 1825 } 1826 } 1827 1828 /* level 3 */ 1829 if (rb_level > 2) { 1830 md_set_record *sr; 1831 md_mnset_record *mnsr; 1832 md_drive_record *dr; 1833 int sr_drive_cnt; 1834 1835 /* 1836 * See if we have to re-add the drives specified. 1837 */ 1838 if (MD_MNSET_DESC(sd)) { 1839 nd = sd->sd_nodelist; 1840 /* All nodes are guaranteed to be ALIVE */ 1841 while (nd) { 1842 /* 1843 * Must get current set record from each 1844 * node to see what else must be done 1845 * to recover. 1846 * Record should be for a multi-node diskset. 1847 */ 1848 if (clnt_mngetset(nd->nd_nodename, sp->setname, 1849 MD_SET_BAD, &mnsr, &xep) == -1) { 1850 mdclrerror(&xep); 1851 nd = nd->nd_next; 1852 continue; 1853 } 1854 1855 /* 1856 * If all drives are already there, skip 1857 * to next node. 
1858 */ 1859 sr_drive_cnt = 0; 1860 dr = mnsr->sr_drivechain; 1861 while (dr) { 1862 sr_drive_cnt++; 1863 dr = dr->dr_next; 1864 } 1865 if (sr_drive_cnt == current_drv_cnt) { 1866 free_sr((md_set_record *)mnsr); 1867 nd = nd->nd_next; 1868 continue; 1869 } 1870 1871 /* Readd all drives */ 1872 if (clnt_adddrvs(nd->nd_nodename, sp, dd, 1873 mnsr->sr_ctime, mnsr->sr_genid, &xep) == -1) 1874 mdclrerror(&xep); 1875 1876 free_sr((struct md_set_record *)mnsr); 1877 nd = nd->nd_next; 1878 } 1879 } else { 1880 for (i = 0; i < MD_MAXSIDES; i++) { 1881 /* Skip empty slots */ 1882 if (sd->sd_nodes[i][0] == '\0') 1883 continue; 1884 1885 /* Record should be for a non-multi-node set */ 1886 if (clnt_getset(sd->sd_nodes[i], sp->setname, 1887 MD_SET_BAD, &sr, &xep) == -1) { 1888 mdclrerror(&xep); 1889 continue; 1890 } 1891 1892 /* 1893 * Set record structure was allocated from RPC 1894 * routine getset so this structure is only of 1895 * size md_set_record even if the MN flag is 1896 * set. So, clear the flag so that the free 1897 * code doesn't attempt to free a structure 1898 * the size of md_mnset_record. 1899 */ 1900 if (MD_MNSET_REC(sr)) { 1901 sr->sr_flags &= ~MD_SR_MN; 1902 free_sr(sr); 1903 continue; 1904 } 1905 1906 /* Drive already added, skip to next node */ 1907 if (sr->sr_drivechain != NULL) { 1908 free_sr(sr); 1909 continue; 1910 } 1911 1912 if (clnt_adddrvs(sd->sd_nodes[i], sp, dd, 1913 sr->sr_ctime, sr->sr_genid, &xep) == -1) 1914 mdclrerror(&xep); 1915 1916 free_sr(sr); 1917 } 1918 } 1919 max_genid += 2; 1920 } 1921 1922 /* 1923 * Notify rpc.mdcommd on all nodes of a nodelist change. 1924 * At this point in time, don't know which nodes are joined 1925 * to the set. So, send a reinit command to mdcommd 1926 * which forces it to get fresh set description. Then send resume. 
1927 * 1928 * Later, this code will use rpc.mdcommd messages to reattach disks 1929 * and then rpc.mdcommd may be suspended again, rest of the nodes 1930 * joined, rpc.mdcommd reinited and then resumed. 1931 */ 1932 if (suspendall_flag) { 1933 /* Send reinit */ 1934 nd = sd->sd_nodelist; 1935 /* All nodes are guaranteed to be ALIVE */ 1936 while (nd) { 1937 /* Class is ignored for REINIT */ 1938 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT, 1939 sp, NULL, MD_MSCF_NO_FLAGS, &xep)) { 1940 mde_perror(&xep, dgettext(TEXT_DOMAIN, 1941 "Unable to reinit rpc.mdcommd.\n")); 1942 mdclrerror(&xep); 1943 } 1944 nd = nd->nd_next; 1945 } 1946 1947 /* Send resume */ 1948 nd = sd->sd_nodelist; 1949 /* All nodes are guaranteed to be ALIVE */ 1950 while (nd) { 1951 /* 1952 * Resume all classes but class 1 so that lock is held 1953 * against meta* commands. 1954 * To later resume class1, must issue a class0 resume. 1955 */ 1956 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, 1957 sp, MD_MSG_CLASS0, 1958 MD_MSCF_DONT_RESUME_CLASS1, &xep)) { 1959 mde_perror(&xep, dgettext(TEXT_DOMAIN, 1960 "Unable to resume rpc.mdcommd.\n")); 1961 mdclrerror(&xep); 1962 } 1963 nd = nd->nd_next; 1964 } 1965 meta_ping_mnset(sp->setno); 1966 } 1967 1968 /* level 2 */ 1969 if (rb_level > 1) { 1970 mdnamelist_t *nlp; 1971 mdname_t *np; 1972 1973 for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) { 1974 uint_t rep_slice; 1975 1976 if ((meta_replicaslice(ddp->dd_dnp, 1977 &rep_slice, &xep) != 0) || 1978 ((np = metaslicename(ddp->dd_dnp, rep_slice, 1979 &xep)) == NULL)) { 1980 mdclrerror(&xep); 1981 continue; 1982 } 1983 nlp = NULL; 1984 (void) metanamelist_append(&nlp, np); 1985 1986 if (meta_db_attach(sp, nlp, 1987 (MDCHK_DRVINSET | MDCHK_SET_LOCKED), 1988 &sd->sd_ctime, ddp->dd_dbcnt, ddp->dd_dbsize, 1989 NULL, &xep) == -1) 1990 mdclrerror(&xep); 1991 1992 metafreenamelist(nlp); 1993 } 1994 /* Re-balance */ 1995 if (meta_db_balance(sp, NULL, curdd, 0, &xep) == -1) 1996 mdclrerror(&xep); 1997 } 
1998 1999 /* level 4 */ 2000 if (rb_level > 3) { 2001 if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) { 2002 if (tk_own_bydd(sp, dd, &mhiargs, TRUE, &xep)) 2003 mdclrerror(&xep); 2004 } 2005 } 2006 2007 /* level 5 */ 2008 if (rb_level > 4) { 2009 if (clnt_stimeout(mynode(), sp, &mhiargs, &xep) == -1) 2010 mdclrerror(&xep); 2011 } 2012 2013 /* 2014 * If at least one node needs to be rejoined to MN diskset, 2015 * then suspend commd again. 2016 */ 2017 if (MD_MNSET_DESC(sd)) { 2018 nd = sd->sd_nodelist; 2019 /* All nodes are guaranteed to be ALIVE */ 2020 while (nd) { 2021 if (!(nd->nd_flags & MD_MN_NODE_RB_JOIN)) { 2022 nd = nd->nd_next; 2023 continue; 2024 } 2025 break; 2026 } 2027 if (nd) { 2028 /* 2029 * Found node that will be rejoined so 2030 * notify rpc.mdcommd on all nodes of a nodelist change. 2031 * Start by suspending rpc.mdcommd (which drains it of 2032 * all messages), then change the nodelist followed by 2033 * a reinit and resume. 2034 */ 2035 nd = sd->sd_nodelist; 2036 /* All nodes are guaranteed to be ALIVE */ 2037 while (nd) { 2038 if (clnt_mdcommdctl(nd->nd_nodename, 2039 COMMDCTL_SUSPEND, sp, MD_MSG_CLASS0, 2040 MD_MSCF_NO_FLAGS, &xep)) { 2041 mdclrerror(&xep); 2042 } 2043 suspendall_flag_rb = 1; 2044 nd = nd->nd_next; 2045 } 2046 } 2047 } 2048 2049 2050 2051 /* level 6 */ 2052 if (rb_level > 5) { 2053 if (MD_MNSET_DESC(sd)) { 2054 int join_flags = 0; 2055 2056 nd = sd->sd_nodelist; 2057 /* All nodes are guaranteed to be ALIVE */ 2058 while (nd) { 2059 /* Only rejoin nodes that were joined before */ 2060 if (!(nd->nd_flags & MD_MN_NODE_RB_JOIN)) { 2061 nd = nd->nd_next; 2062 continue; 2063 } 2064 /* 2065 * Rejoin nodes to same state as before - 2066 * either STALE or non-STALE. 
2067 */ 2068 if (stale_bool == TRUE) 2069 join_flags = MNSET_IS_STALE; 2070 if (clnt_joinset(nd->nd_nodename, sp, 2071 join_flags, &xep)) 2072 mdclrerror(&xep); 2073 /* Sets OWN flag on all nodes in list */ 2074 if (clnt_upd_nr_flags(nd->nd_nodename, sp, 2075 sd->sd_nodelist, MD_NR_JOIN, NULL, &xep)) { 2076 mdclrerror(&xep); 2077 } 2078 nd = nd->nd_next; 2079 } 2080 } else { 2081 if (setup_db_bydd(sp, dd, TRUE, &xep) == -1) 2082 mdclrerror(&xep); 2083 2084 /* No special flag for traditional diskset */ 2085 if (snarf_set(sp, NULL, &xep)) 2086 mdclrerror(&xep); 2087 } 2088 } 2089 2090 /* level 1 */ 2091 if (rb_level > 0) { 2092 /* 2093 * Mark the drives as OK. 2094 */ 2095 if (MD_MNSET_DESC(sd)) { 2096 nd = sd->sd_nodelist; 2097 /* All nodes are guaranteed to be ALIVE */ 2098 while (nd) { 2099 /* 2100 * Must be last action before unlock. 2101 * In case of panic, recovery code checks 2102 * for MD_DR_OK to know that drive 2103 * and possible master are fully added back. 2104 */ 2105 if (clnt_upd_dr_flags(nd->nd_nodename, sp, dd, 2106 MD_DR_OK, &xep) == -1) 2107 mdclrerror(&xep); 2108 nd = nd->nd_next; 2109 } 2110 } else { 2111 for (i = 0; i < MD_MAXSIDES; i++) { 2112 /* Skip empty slots */ 2113 if (sd->sd_nodes[i][0] == '\0') 2114 continue; 2115 2116 if (clnt_upd_dr_flags(sd->sd_nodes[i], sp, dd, 2117 MD_DR_OK, &xep) == -1) 2118 mdclrerror(&xep); 2119 2120 } 2121 } 2122 max_genid += 2; 2123 resync_genid(sp, sd, max_genid, 0, NULL); 2124 } 2125 /* 2126 * Notify rpc.mdcommd on all nodes of a nodelist change. 2127 * Send a reinit command to mdcommd which forces it to get 2128 * fresh set description. 
2129 */ 2130 if (suspendall_flag_rb) { 2131 /* Send reinit */ 2132 nd = sd->sd_nodelist; 2133 /* All nodes are guaranteed to be ALIVE */ 2134 while (nd) { 2135 /* Class is ignored for REINIT */ 2136 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT, 2137 sp, NULL, MD_MSCF_NO_FLAGS, &xep)) { 2138 mde_perror(&xep, dgettext(TEXT_DOMAIN, 2139 "Unable to reinit rpc.mdcommd.\n")); 2140 mdclrerror(&xep); 2141 } 2142 nd = nd->nd_next; 2143 } 2144 } 2145 2146 /* 2147 * Just resume all classes so that resume is the same whether 2148 * just one class was locked or all classes were locked. 2149 */ 2150 if ((suspend1_flag) || (suspendall_flag_rb) || (suspendall_flag)) { 2151 /* Send resume */ 2152 nd = sd->sd_nodelist; 2153 /* All nodes are guaranteed to be ALIVE */ 2154 while (nd) { 2155 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, 2156 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { 2157 mde_perror(&xep, dgettext(TEXT_DOMAIN, 2158 "Unable to resume rpc.mdcommd.\n")); 2159 mdclrerror(&xep); 2160 } 2161 nd = nd->nd_next; 2162 } 2163 meta_ping_mnset(sp->setno); 2164 } 2165 2166 2167 /* level 0 */ 2168 cl_sk = cl_get_setkey(sp->setno, sp->setname); 2169 /* Don't test lock flag since guaranteed to be set if in rollback */ 2170 if (MD_MNSET_DESC(sd)) { 2171 nd = sd->sd_nodelist; 2172 /* All nodes are guaranteed to be ALIVE */ 2173 while (nd) { 2174 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) 2175 mdclrerror(&xep); 2176 nd = nd->nd_next; 2177 } 2178 } else { 2179 for (i = 0; i < MD_MAXSIDES; i++) { 2180 /* Skip empty slots */ 2181 if (sd->sd_nodes[i][0] == '\0') 2182 continue; 2183 2184 if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) 2185 mdclrerror(&xep); 2186 } 2187 } 2188 cl_set_setkey(NULL); 2189 2190 /* release signals back to what they were on entry */ 2191 if (procsigs(FALSE, &oldsigs, &xep) < 0) 2192 mdclrerror(&xep); 2193 2194 metafreedrivedesc(&dd); 2195 2196 if (flush_set_onerr) { 2197 metaflushsetname(sp); 2198 if (!(MD_MNSET_DESC(sd))) { 2199 
md_rb_sig_handling_off(md_got_sig(), md_which_sig()); 2200 } 2201 } 2202 2203 return (rval); 2204} 2205