/*
   md.c : Multiple Devices driver for Linux
	  Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   You should have received a copy of the GNU General Public License
   (for example /usr/src/linux/COPYING); if not, write to the Free
   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

#include <linux/module.h>
#include <linux/config.h>
#include <linux/raid/md.h>
#include <linux/sysctl.h>
#include <linux/raid/xor.h>
#include <linux/devfs_fs_kernel.h>

#include <linux/init.h>

#ifdef CONFIG_KMOD
#include <linux/kmod.h>
#endif

#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>

#include <asm/unaligned.h>

#define MAJOR_NR MD_MAJOR
#define MD_DRIVER

#include <linux/blk.h>

#define DEBUG 0
#if DEBUG
# define dprintk(x...) printk(x)
#else
# define dprintk(x...) do { } while(0)
#endif

#ifndef MODULE
static void autostart_arrays (void);
#endif

static mdk_personality_t *pers[MAX_PERSONALITY];

/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 100 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
 */
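
/*
 * For example, from a shell (assuming procfs is mounted; values are
 * in KB/sec):
 *
 *	echo  1000 > /proc/sys/dev/raid/speed_limit_min
 *	echo 50000 > /proc/sys/dev/raid/speed_limit_max
 */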

static int sysctl_speed_limit_min = 100;
static int sysctl_speed_limit_max = 100000;

static struct ctl_table_header *raid_table_header;

static ctl_table raid_table[] = {
	{DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min",
	 &sysctl_speed_limit_min, sizeof(int), 0644, NULL, &proc_dointvec},
	{DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max",
	 &sysctl_speed_limit_max, sizeof(int), 0644, NULL, &proc_dointvec},
	{0}
};

static ctl_table raid_dir_table[] = {
	{DEV_RAID, "raid", NULL, 0, 0555, raid_table},
	{0}
};

static ctl_table raid_root_table[] = {
	{CTL_DEV, "dev", NULL, 0, 0555, raid_dir_table},
	{0}
};

/*
 * these have to be allocated separately because external
 * subsystems want to have a pre-defined structure
 */
struct hd_struct md_hd_struct[MAX_MD_DEVS];
static int md_blocksizes[MAX_MD_DEVS];
static int md_hardsect_sizes[MAX_MD_DEVS];
static int md_maxreadahead[MAX_MD_DEVS];
static mdk_thread_t *md_recovery_thread;

int md_size[MAX_MD_DEVS];

static struct block_device_operations md_fops;
static devfs_handle_t devfs_handle;

static struct gendisk md_gendisk =
{
	major: MD_MAJOR,
	major_name: "md",
	minor_shift: 0,
	max_p: 1,
	part: md_hd_struct,
	sizes: md_size,
	nr_real: MAX_MD_DEVS,
	real_devices: NULL,
	next: NULL,
	fops: &md_fops,
};

/*
 * Enables iteration over all existing md arrays
 */
static MD_LIST_HEAD(all_mddevs);

/*
 * The mapping between kdev and mddev is not necessarily a simple
 * one! Eg. HSM uses several sub-devices to implement Logical
 * Volumes. All these sub-devices map to the same mddev.
 */
dev_mapping_t mddev_map[MAX_MD_DEVS];

void add_mddev_mapping(mddev_t * mddev, kdev_t dev, void *data)
{
	unsigned int minor = MINOR(dev);

	if (MAJOR(dev) != MD_MAJOR) {
		MD_BUG();
		return;
	}
	if (mddev_map[minor].mddev) {
		MD_BUG();
		return;
	}
	mddev_map[minor].mddev = mddev;
	mddev_map[minor].data = data;
}

void del_mddev_mapping(mddev_t * mddev, kdev_t dev)
{
	unsigned int minor = MINOR(dev);

	if (MAJOR(dev) != MD_MAJOR) {
		MD_BUG();
		return;
	}
	if (mddev_map[minor].mddev != mddev) {
		MD_BUG();
		return;
	}
	mddev_map[minor].mddev = NULL;
	mddev_map[minor].data = NULL;
}

static int md_make_request(request_queue_t *q, int rw, struct buffer_head * bh)
{
	mddev_t *mddev = kdev_to_mddev(bh->b_rdev);

	if (mddev && mddev->pers)
		return mddev->pers->make_request(mddev, rw, bh);
	else {
		buffer_IO_error(bh);
		return 0;
	}
}

static mddev_t * alloc_mddev(kdev_t dev)
{
	mddev_t *mddev;

	if (MAJOR(dev) != MD_MAJOR) {
		MD_BUG();
		return 0;
	}
	mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL);
	if (!mddev)
		return NULL;

	memset(mddev, 0, sizeof(*mddev));

	mddev->__minor = MINOR(dev);
	init_MUTEX(&mddev->reconfig_sem);
	init_MUTEX(&mddev->recovery_sem);
	init_MUTEX(&mddev->resync_sem);
	MD_INIT_LIST_HEAD(&mddev->disks);
	MD_INIT_LIST_HEAD(&mddev->all_mddevs);
	atomic_set(&mddev->active, 0);

	/*
	 * The 'base' mddev is the one with data NULL.
	 * personalities can create additional mddevs
	 * if necessary.
	 */
	add_mddev_mapping(mddev, dev, 0);
	md_list_add(&mddev->all_mddevs, &all_mddevs);

	MOD_INC_USE_COUNT;

	return mddev;
}
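
/*
 * rdev lookup helpers: both walk the mddev's disk list via
 * ITERATE_RDEV, matching either by descriptor number or by device.
 */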
mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
{
	mdk_rdev_t * rdev;
	struct md_list_head *tmp;

	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->desc_nr == nr)
			return rdev;
	}
	return NULL;
}

mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev)
{
	struct md_list_head *tmp;
	mdk_rdev_t *rdev;

	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->dev == dev)
			return rdev;
	}
	return NULL;
}

static MD_LIST_HEAD(device_names);

char * partition_name(kdev_t dev)
{
	struct gendisk *hd;
	static char nomem [] = "<nomem>";
	dev_name_t *dname;
	struct md_list_head *tmp = device_names.next;

	while (tmp != &device_names) {
		dname = md_list_entry(tmp, dev_name_t, list);
		if (dname->dev == dev)
			return dname->name;
		tmp = tmp->next;
	}

	dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL);

	if (!dname)
		return nomem;
	/*
	 * ok, add this new device name to the list
	 */
	hd = get_gendisk (dev);
	dname->name = NULL;
	if (hd)
		dname->name = disk_name (hd, MINOR(dev), dname->namebuf);
	if (!dname->name) {
		sprintf (dname->namebuf, "[dev %s]", kdevname(dev));
		dname->name = dname->namebuf;
	}

	dname->dev = dev;
	MD_INIT_LIST_HEAD(&dname->list);
	md_list_add(&dname->list, &device_names);

	return dname->name;
}

static unsigned int calc_dev_sboffset(kdev_t dev, mddev_t *mddev,
					int persistent)
{
	unsigned int size = 0;

	if (blk_size[MAJOR(dev)])
		size = blk_size[MAJOR(dev)][MINOR(dev)];
	if (persistent)
		size = MD_NEW_SIZE_BLOCKS(size);
	return size;
}

static unsigned int calc_dev_size(kdev_t dev, mddev_t *mddev, int persistent)
{
	unsigned int size;

	size = calc_dev_sboffset(dev, mddev, persistent);
	if (!mddev->sb) {
		MD_BUG();
		return size;
	}
	if (mddev->sb->chunk_size)
		size &= ~(mddev->sb->chunk_size/1024 - 1);
	return size;
}

static unsigned int zoned_raid_size(mddev_t *mddev)
{
	unsigned int mask;
	mdk_rdev_t * rdev;
	struct md_list_head *tmp;

	if (!mddev->sb) {
		MD_BUG();
		return -EINVAL;
	}
	/*
	 * do size and offset calculations.
	 */
	mask = ~(mddev->sb->chunk_size/1024 - 1);

	ITERATE_RDEV(mddev,rdev,tmp) {
		rdev->size &= mask;
		md_size[mdidx(mddev)] += rdev->size;
	}
	return 0;
}

/*
 * We check whether all devices are numbered from 0 to nb_dev-1. The
 * order is guaranteed even after device name changes.
 *
 * Some personalities (raid0, linear) use this. Personalities that
 * provide data have to be able to deal with loss of individual
 * disks, so they do their checking themselves.
 */
int md_check_ordering(mddev_t *mddev)
{
	int i, c;
	mdk_rdev_t *rdev;
	struct md_list_head *tmp;

	/*
	 * First, all devices must be fully functional
	 */
	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->faulty) {
			printk(KERN_ERR "md: md%d's device %s faulty, aborting.\n",
				mdidx(mddev), partition_name(rdev->dev));
			goto abort;
		}
	}

	c = 0;
	ITERATE_RDEV(mddev,rdev,tmp) {
		c++;
	}
	if (c != mddev->nb_dev) {
		MD_BUG();
		goto abort;
	}
	if (mddev->nb_dev != mddev->sb->raid_disks) {
		printk(KERN_ERR "md: md%d, array needs %d disks, has %d, aborting.\n",
			mdidx(mddev), mddev->sb->raid_disks, mddev->nb_dev);
		goto abort;
	}
	/*
	 * Now the numbering check
	 */
	for (i = 0; i < mddev->nb_dev; i++) {
		c = 0;
		ITERATE_RDEV(mddev,rdev,tmp) {
			if (rdev->desc_nr == i)
				c++;
		}
		if (!c) {
			printk(KERN_ERR "md: md%d, missing disk #%d, aborting.\n",
				mdidx(mddev), i);
			goto abort;
		}
		if (c > 1) {
			printk(KERN_ERR "md: md%d, too many disks #%d, aborting.\n",
				mdidx(mddev), i);
			goto abort;
		}
	}
	return 0;
abort:
	return 1;
}
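
/*
 * Drop a disk descriptor from the superblock: fix up the working/
 * spare/failed counters according to the descriptor's current state,
 * then mark the slot as removed.
 */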
static void remove_descriptor(mdp_disk_t *disk, mdp_super_t *sb)
{
	if (disk_active(disk)) {
		sb->working_disks--;
	} else {
		if (disk_spare(disk)) {
			sb->spare_disks--;
			sb->working_disks--;
		} else {
			sb->failed_disks--;
		}
	}
	sb->nr_disks--;
	disk->major = 0;
	disk->minor = 0;
	mark_disk_removed(disk);
}

#define BAD_MAGIC KERN_ERR \
"md: invalid raid superblock magic on %s\n"

#define BAD_MINOR KERN_ERR \
"md: %s: invalid raid minor (%x)\n"

#define OUT_OF_MEM KERN_ALERT \
"md: out of memory.\n"

#define NO_SB KERN_ERR \
"md: disabled device %s, could not read superblock.\n"

#define BAD_CSUM KERN_WARNING \
"md: invalid superblock checksum on %s\n"

static int alloc_array_sb(mddev_t * mddev)
{
	if (mddev->sb) {
		MD_BUG();
		return 0;
	}

	mddev->sb = (mdp_super_t *) __get_free_page (GFP_KERNEL);
	if (!mddev->sb)
		return -ENOMEM;
	md_clear_page(mddev->sb);
	return 0;
}

static int alloc_disk_sb(mdk_rdev_t * rdev)
{
	if (rdev->sb)
		MD_BUG();

	rdev->sb = (mdp_super_t *) __get_free_page(GFP_KERNEL);
	if (!rdev->sb) {
		printk(OUT_OF_MEM);
		return -EINVAL;
	}
	md_clear_page(rdev->sb);

	return 0;
}

static void free_disk_sb(mdk_rdev_t * rdev)
{
	if (rdev->sb) {
		free_page((unsigned long) rdev->sb);
		rdev->sb = NULL;
		rdev->sb_offset = 0;
		rdev->size = 0;
	} else {
		if (!rdev->faulty)
			MD_BUG();
	}
}

static int read_disk_sb(mdk_rdev_t * rdev)
{
	int ret = -EINVAL;
	struct buffer_head *bh = NULL;
	kdev_t dev = rdev->dev;
	mdp_super_t *sb;
	unsigned long sb_offset;

	if (!rdev->sb) {
		MD_BUG();
		goto abort;
	}

	/*
	 * Calculate the position of the superblock,
	 * it's at the end of the disk
	 */
	sb_offset = calc_dev_sboffset(rdev->dev, rdev->mddev, 1);
	rdev->sb_offset = sb_offset;
	fsync_dev(dev);
	set_blocksize (dev, MD_SB_BYTES);
	bh = bread (dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);

	if (bh) {
		sb = (mdp_super_t *) bh->b_data;
		memcpy (rdev->sb, sb, MD_SB_BYTES);
	} else {
		printk(NO_SB,partition_name(rdev->dev));
		goto abort;
	}
	printk(KERN_INFO " [events: %08lx]\n", (unsigned long)rdev->sb->events_lo);
	ret = 0;
abort:
	if (bh)
		brelse (bh);
	return ret;
}
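
/*
 * The checksum covers the full MD_SB_BYTES superblock with sb_csum
 * itself temporarily zeroed, so the stored value can be recomputed
 * and verified at any time.
 */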
static unsigned int calc_sb_csum(mdp_super_t * sb)
{
	unsigned int disk_csum, csum;

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;
	csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
	sb->sb_csum = disk_csum;
	return csum;
}

/*
 * Check one RAID superblock for generic plausibility
 */

static int check_disk_sb(mdk_rdev_t * rdev)
{
	mdp_super_t *sb;
	int ret = -EINVAL;

	sb = rdev->sb;
	if (!sb) {
		MD_BUG();
		goto abort;
	}

	if (sb->md_magic != MD_SB_MAGIC) {
		printk(BAD_MAGIC, partition_name(rdev->dev));
		goto abort;
	}

	if (sb->md_minor >= MAX_MD_DEVS) {
		printk(BAD_MINOR, partition_name(rdev->dev), sb->md_minor);
		goto abort;
	}

	if (calc_sb_csum(sb) != sb->sb_csum) {
		printk(BAD_CSUM, partition_name(rdev->dev));
		goto abort;
	}
	ret = 0;
abort:
	return ret;
}

static kdev_t dev_unit(kdev_t dev)
{
	unsigned int mask;
	struct gendisk *hd = get_gendisk(dev);

	if (!hd)
		return 0;
	mask = ~((1 << hd->minor_shift) - 1);

	return MKDEV(MAJOR(dev), MINOR(dev) & mask);
}

static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev)
{
	struct md_list_head *tmp;
	mdk_rdev_t *rdev;

	ITERATE_RDEV(mddev,rdev,tmp)
		if (dev_unit(rdev->dev) == dev_unit(dev))
			return rdev;

	return NULL;
}

static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
{
	struct md_list_head *tmp;
	mdk_rdev_t *rdev;

	ITERATE_RDEV(mddev1,rdev,tmp)
		if (match_dev_unit(mddev2, rdev->dev))
			return 1;

	return 0;
}

static MD_LIST_HEAD(all_raid_disks);
static MD_LIST_HEAD(pending_raid_disks);

static void bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
{
	mdk_rdev_t *same_pdev;

	if (rdev->mddev) {
		MD_BUG();
		return;
	}
	same_pdev = match_dev_unit(mddev, rdev->dev);
	if (same_pdev)
		printk( KERN_WARNING
"md%d: WARNING: %s appears to be on the same physical disk as %s. True\n"
" protection against single-disk failure might be compromised.\n",
			mdidx(mddev), partition_name(rdev->dev),
			partition_name(same_pdev->dev));

	md_list_add(&rdev->same_set, &mddev->disks);
	rdev->mddev = mddev;
	mddev->nb_dev++;
	printk(KERN_INFO "md: bind<%s,%d>\n", partition_name(rdev->dev), mddev->nb_dev);
}

static void unbind_rdev_from_array(mdk_rdev_t * rdev)
{
	if (!rdev->mddev) {
		MD_BUG();
		return;
	}
	md_list_del(&rdev->same_set);
	MD_INIT_LIST_HEAD(&rdev->same_set);
	rdev->mddev->nb_dev--;
	printk(KERN_INFO "md: unbind<%s,%d>\n", partition_name(rdev->dev),
		rdev->mddev->nb_dev);
	rdev->mddev = NULL;
}

/*
 * prevent the device from being mounted, repartitioned or
 * otherwise reused by a RAID array (or any other kernel
 * subsystem), by opening the device. [simply getting an
 * inode is not enough, the SCSI module usage code needs
 * an explicit open() on the device]
 */
static int lock_rdev(mdk_rdev_t *rdev)
{
	int err = 0;
	struct block_device *bdev;

	bdev = bdget(rdev->dev);
	if (!bdev)
		return -ENOMEM;
	err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_RAW);
	if (!err)
		rdev->bdev = bdev;
	return err;
}

static void unlock_rdev(mdk_rdev_t *rdev)
{
	struct block_device *bdev = rdev->bdev;
	rdev->bdev = NULL;
	if (!bdev)
		MD_BUG();
	blkdev_put(bdev, BDEV_RAW);
}

void md_autodetect_dev(kdev_t dev);

static void export_rdev(mdk_rdev_t * rdev)
{
	printk(KERN_INFO "md: export_rdev(%s)\n",partition_name(rdev->dev));
	if (rdev->mddev)
		MD_BUG();
	unlock_rdev(rdev);
	free_disk_sb(rdev);
	md_list_del(&rdev->all);
	MD_INIT_LIST_HEAD(&rdev->all);
	if (rdev->pending.next != &rdev->pending) {
		printk(KERN_INFO "md: (%s was pending)\n",
			partition_name(rdev->dev));
		md_list_del(&rdev->pending);
		MD_INIT_LIST_HEAD(&rdev->pending);
	}
#ifndef MODULE
	md_autodetect_dev(rdev->dev);
#endif
	rdev->dev = 0;
	rdev->faulty = 0;
	kfree(rdev);
}

static void kick_rdev_from_array(mdk_rdev_t * rdev)
{
	unbind_rdev_from_array(rdev);
	export_rdev(rdev);
}

static void export_array(mddev_t *mddev)
{
	struct md_list_head *tmp;
	mdk_rdev_t *rdev;
	mdp_super_t *sb = mddev->sb;

	if (mddev->sb) {
		mddev->sb = NULL;
		free_page((unsigned long) sb);
	}

	ITERATE_RDEV(mddev,rdev,tmp) {
		if (!rdev->mddev) {
			MD_BUG();
			continue;
		}
		kick_rdev_from_array(rdev);
	}
	if (mddev->nb_dev)
		MD_BUG();
}

static void free_mddev(mddev_t *mddev)
{
	if (!mddev) {
		MD_BUG();
		return;
	}

	export_array(mddev);
	md_size[mdidx(mddev)] = 0;
	md_hd_struct[mdidx(mddev)].nr_sects = 0;

	/*
	 * Make sure nobody else is using this mddev
	 * (careful, we rely on the global kernel lock here)
	 */
	while (sem_getcount(&mddev->resync_sem) != 1)
		schedule();
	while (sem_getcount(&mddev->recovery_sem) != 1)
		schedule();

	del_mddev_mapping(mddev, MKDEV(MD_MAJOR, mdidx(mddev)));
	md_list_del(&mddev->all_mddevs);
	MD_INIT_LIST_HEAD(&mddev->all_mddevs);
	kfree(mddev);
	MOD_DEC_USE_COUNT;
}

#undef BAD_CSUM
#undef BAD_MAGIC
#undef OUT_OF_MEM
#undef NO_SB

static void print_desc(mdp_disk_t *desc)
{
	printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number,
		partition_name(MKDEV(desc->major,desc->minor)),
		desc->major,desc->minor,desc->raid_disk,desc->state);
}
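
/*
 * Key for the state dump below: L=level, S=size, ND=nr_disks,
 * RD=raid_disks, LO=layout, CS=chunk_size, UT=utime, ST=state,
 * AD=active, WD=working, FD=failed, SD=spare, E=event counter.
 */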
static void print_sb(mdp_super_t *sb)
{
	int i;

	printk(KERN_INFO "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
		sb->major_version, sb->minor_version, sb->patch_version,
		sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
		sb->ctime);
	printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level,
		sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor,
		sb->layout, sb->chunk_size);
	printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%08lx\n",
		sb->utime, sb->state, sb->active_disks, sb->working_disks,
		sb->failed_disks, sb->spare_disks,
		sb->sb_csum, (unsigned long)sb->events_lo);

	printk(KERN_INFO);
	for (i = 0; i < MD_SB_DISKS; i++) {
		mdp_disk_t *desc;

		desc = sb->disks + i;
		if (desc->number || desc->major || desc->minor ||
		    desc->raid_disk || (desc->state && (desc->state != 4))) {
			printk(" D %2d: ", i);
			print_desc(desc);
		}
	}
	printk(KERN_INFO "md: THIS: ");
	print_desc(&sb->this_disk);

}

static void print_rdev(mdk_rdev_t *rdev)
{
	printk(KERN_INFO "md: rdev %s: O:%s, SZ:%08ld F:%d DN:%d ",
		partition_name(rdev->dev), partition_name(rdev->old_dev),
		rdev->size, rdev->faulty, rdev->desc_nr);
	if (rdev->sb) {
		printk(KERN_INFO "md: rdev superblock:\n");
		print_sb(rdev->sb);
	} else
		printk(KERN_INFO "md: no rdev superblock!\n");
}

void md_print_devices(void)
{
	struct md_list_head *tmp, *tmp2;
	mdk_rdev_t *rdev;
	mddev_t *mddev;

	printk("\n");
	printk("md: **********************************\n");
	printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n");
	printk("md: **********************************\n");
	ITERATE_MDDEV(mddev,tmp) {
		printk("md%d: ", mdidx(mddev));

		ITERATE_RDEV(mddev,rdev,tmp2)
			printk("<%s>", partition_name(rdev->dev));

		if (mddev->sb) {
			printk(" array superblock:\n");
			print_sb(mddev->sb);
		} else
			printk(" no array superblock.\n");

		ITERATE_RDEV(mddev,rdev,tmp2)
			print_rdev(rdev);
	}
	printk("md: **********************************\n");
	printk("\n");
}

static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
	int ret;
	mdp_super_t *tmp1, *tmp2;

	tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
	tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);

	if (!tmp1 || !tmp2) {
		ret = 0;
		printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
		goto abort;
	}

	*tmp1 = *sb1;
	*tmp2 = *sb2;

	/*
	 * nr_disks is not constant
	 */
	tmp1->nr_disks = 0;
	tmp2->nr_disks = 0;

	if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
		ret = 0;
	else
		ret = 1;

abort:
	if (tmp1)
		kfree(tmp1);
	if (tmp2)
		kfree(tmp2);

	return ret;
}

static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2)
{
	if ((rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) &&
	    (rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) &&
	    (rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) &&
	    (rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3))
		return 1;

	return 0;
}

static mdk_rdev_t * find_rdev_all(kdev_t dev)
{
	struct md_list_head *tmp;
	mdk_rdev_t *rdev;

	tmp = all_raid_disks.next;
	while (tmp != &all_raid_disks) {
		rdev = md_list_entry(tmp, mdk_rdev_t, all);
		if (rdev->dev == dev)
			return rdev;
		tmp = tmp->next;
	}
	return NULL;
}

#define GETBLK_FAILED KERN_ERR \
"md: getblk failed for device %s\n"
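
/*
 * Write the in-core superblock of one component device back to disk.
 * If the device's superblock offset or size has changed since import
 * (e.g. a spare silently went offline), the write is skipped instead
 * of scribbling over an unrelated location.
 */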
static int write_disk_sb(mdk_rdev_t * rdev)
{
	struct buffer_head *bh;
	kdev_t dev;
	unsigned long sb_offset, size;
	mdp_super_t *sb;

	if (!rdev->sb) {
		MD_BUG();
		return 1;
	}
	if (rdev->faulty) {
		MD_BUG();
		return 1;
	}
	if (rdev->sb->md_magic != MD_SB_MAGIC) {
		MD_BUG();
		return 1;
	}

	dev = rdev->dev;
	sb_offset = calc_dev_sboffset(dev, rdev->mddev, 1);
	if (rdev->sb_offset != sb_offset) {
		printk(KERN_INFO "%s's sb offset has changed from %ld to %ld, skipping\n",
			partition_name(dev), rdev->sb_offset, sb_offset);
		goto skip;
	}
	/*
	 * If the disk went offline meanwhile and it's just a spare, then
	 * its size has changed to zero silently, and the MD code does
	 * not yet know that it's faulty.
	 */
	size = calc_dev_size(dev, rdev->mddev, 1);
	if (size != rdev->size) {
		printk(KERN_INFO "%s's size has changed from %ld to %ld since import, skipping\n",
			partition_name(dev), rdev->size, size);
		goto skip;
	}

	printk(KERN_INFO "(write) %s's sb offset: %ld\n", partition_name(dev), sb_offset);
	fsync_dev(dev);
	set_blocksize(dev, MD_SB_BYTES);
	bh = getblk(dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES);
	if (!bh) {
		printk(GETBLK_FAILED, partition_name(dev));
		return 1;
	}
	memset(bh->b_data,0,bh->b_size);
	sb = (mdp_super_t *) bh->b_data;
	memcpy(sb, rdev->sb, MD_SB_BYTES);

	mark_buffer_uptodate(bh, 1);
	mark_buffer_dirty(bh);
	ll_rw_block(WRITE, 1, &bh);
	wait_on_buffer(bh);
	brelse(bh);
	fsync_dev(dev);
skip:
	return 0;
}
#undef GETBLK_FAILED

static void set_this_disk(mddev_t *mddev, mdk_rdev_t *rdev)
{
	int i, ok = 0;
	mdp_disk_t *desc;

	for (i = 0; i < MD_SB_DISKS; i++) {
		desc = mddev->sb->disks + i;
		if (MKDEV(desc->major,desc->minor) == rdev->dev) {
			rdev->sb->this_disk = *desc;
			rdev->desc_nr = desc->number;
			ok = 1;
			break;
		}
	}

	if (!ok) {
		MD_BUG();
	}
}

static int sync_sbs(mddev_t * mddev)
{
	mdk_rdev_t *rdev;
	mdp_super_t *sb;
	struct md_list_head *tmp;

	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->faulty || rdev->alias_device)
			continue;
		sb = rdev->sb;
		*sb = *mddev->sb;
		set_this_disk(mddev, rdev);
		sb->sb_csum = calc_sb_csum(sb);
	}
	return 0;
}

int md_update_sb(mddev_t * mddev)
{
	int err, count = 100;
	struct md_list_head *tmp;
	mdk_rdev_t *rdev;

	if (!mddev->sb_dirty) {
		printk("hm, md_update_sb() called without ->sb_dirty == 1, from %p.\n", __builtin_return_address(0));
		return 0;
	}
	mddev->sb_dirty = 0;
repeat:
	mddev->sb->utime = CURRENT_TIME;
	if ((++mddev->sb->events_lo)==0)
		++mddev->sb->events_hi;

	if ((mddev->sb->events_lo|mddev->sb->events_hi)==0) {
		/*
		 * oops, this 64-bit counter should never wrap.
		 * Either we are in around ~1 trillion A.C., assuming
		 * 1 reboot per second, or we have a bug:
		 */
		MD_BUG();
		mddev->sb->events_lo = mddev->sb->events_hi = 0xffffffff;
	}
	sync_sbs(mddev);

	/*
	 * do not write anything to disk if using
	 * nonpersistent superblocks
	 */
	if (mddev->sb->not_persistent)
		return 0;

	printk(KERN_INFO "md: updating md%d RAID superblock on device\n",
		mdidx(mddev));

	err = 0;
	ITERATE_RDEV(mddev,rdev,tmp) {
		printk(KERN_INFO "md: ");
		if (rdev->faulty)
			printk("(skipping faulty ");
		if (rdev->alias_device)
			printk("(skipping alias ");

		printk("%s ", partition_name(rdev->dev));
		if (!rdev->faulty && !rdev->alias_device) {
			printk("[events: %08lx]",
				(unsigned long)rdev->sb->events_lo);
			err += write_disk_sb(rdev);
		} else
			printk(")\n");
	}
	if (err) {
		if (--count) {
			printk(KERN_ERR "md: errors occurred during superblock update, repeating\n");
			goto repeat;
		}
		printk(KERN_ERR "md: excessive errors occurred during superblock update, exiting\n");
	}
	return 0;
}
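
/*
 * events_lo/events_hi together form the 64-bit event counter;
 * md_event() (from the raid headers) presumably folds them back into
 * a single __u64 for the freshness comparisons in analyze_sbs() below.
 */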

/*
 * Import a device. If 'on_disk', then sanity check the superblock
 *
 * mark the device faulty if:
 *
 *   - the device is nonexistent (zero size)
 *   - the device has no valid superblock
 *
 * a faulty rdev _never_ has rdev->sb set.
 */
static int md_import_device(kdev_t newdev, int on_disk)
{
	int err;
	mdk_rdev_t *rdev;
	unsigned int size;

	if (find_rdev_all(newdev))
		return -EEXIST;

	rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
	if (!rdev) {
		printk(KERN_ERR "md: could not alloc mem for %s!\n", partition_name(newdev));
		return -ENOMEM;
	}
	memset(rdev, 0, sizeof(*rdev));

	if (is_mounted(newdev)) {
		printk(KERN_WARNING "md: can not import %s, has active inodes!\n",
			partition_name(newdev));
		err = -EBUSY;
		goto abort_free;
	}

	if ((err = alloc_disk_sb(rdev)))
		goto abort_free;

	rdev->dev = newdev;
	if (lock_rdev(rdev)) {
		printk(KERN_ERR "md: could not lock %s, zero-size? Marking faulty.\n",
			partition_name(newdev));
		err = -EINVAL;
		goto abort_free;
	}
	rdev->desc_nr = -1;
	rdev->faulty = 0;

	size = 0;
	if (blk_size[MAJOR(newdev)])
		size = blk_size[MAJOR(newdev)][MINOR(newdev)];
	if (!size) {
		printk(KERN_WARNING "md: %s has zero size, marking faulty!\n",
			partition_name(newdev));
		err = -EINVAL;
		goto abort_free;
	}

	if (on_disk) {
		if ((err = read_disk_sb(rdev))) {
			printk(KERN_WARNING "md: could not read %s's sb, not importing!\n",
				partition_name(newdev));
			goto abort_free;
		}
		if ((err = check_disk_sb(rdev))) {
			printk(KERN_WARNING "md: %s has invalid sb, not importing!\n",
				partition_name(newdev));
			goto abort_free;
		}

		if (rdev->sb->level != -4) {
			rdev->old_dev = MKDEV(rdev->sb->this_disk.major,
						rdev->sb->this_disk.minor);
			rdev->desc_nr = rdev->sb->this_disk.number;
		} else {
			rdev->old_dev = MKDEV(0, 0);
			rdev->desc_nr = -1;
		}
	}
	md_list_add(&rdev->all, &all_raid_disks);
	MD_INIT_LIST_HEAD(&rdev->pending);

	if (rdev->faulty && rdev->sb)
		free_disk_sb(rdev);
	return 0;

abort_free:
	if (rdev->sb) {
		if (rdev->bdev)
			unlock_rdev(rdev);
		free_disk_sb(rdev);
	}
	kfree(rdev);
	return err;
}

/*
 * Check a full RAID array for plausibility
 */

#define INCONSISTENT KERN_ERR \
"md: fatal superblock inconsistency in %s -- removing from array\n"

#define OUT_OF_DATE KERN_ERR \
"md: superblock update time inconsistency -- using the most recent one\n"

#define OLD_VERSION KERN_ALERT \
"md: md%d: unsupported raid array version %d.%d.%d\n"

#define NOT_CLEAN_IGNORE KERN_ERR \
"md: md%d: raid array is not clean -- starting background reconstruction\n"

#define UNKNOWN_LEVEL KERN_ERR \
"md: md%d: unsupported raid level %d\n"

static int analyze_sbs(mddev_t * mddev)
{
	int out_of_date = 0, i, first;
	struct md_list_head *tmp, *tmp2;
	mdk_rdev_t *rdev, *rdev2, *freshest;
	mdp_super_t *sb;

	/*
	 * Verify the RAID superblock on each real device
	 */
	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->faulty) {
			MD_BUG();
			goto abort;
		}
		if (!rdev->sb) {
			MD_BUG();
			goto abort;
		}
		if (check_disk_sb(rdev))
			goto abort;
	}

	/*
	 * The superblock constant part has to be the same
	 * for all disks in the array.
	 */
	sb = NULL;

	ITERATE_RDEV(mddev,rdev,tmp) {
		if (!sb) {
			sb = rdev->sb;
			continue;
		}
		if (!sb_equal(sb, rdev->sb)) {
			printk(INCONSISTENT, partition_name(rdev->dev));
			kick_rdev_from_array(rdev);
			continue;
		}
	}

	/*
	 * OK, we have all disks and the array is ready to run. Let's
	 * find the freshest superblock, that one will be the superblock
	 * that represents the whole array.
	 */
	if (!mddev->sb)
		if (alloc_array_sb(mddev))
			goto abort;
	sb = mddev->sb;
	freshest = NULL;

	ITERATE_RDEV(mddev,rdev,tmp) {
		__u64 ev1, ev2;
		/*
		 * if the checksum is invalid, use the superblock
		 * only as a last resort. (decrease its age by
		 * one event)
		 */
		if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) {
			if (rdev->sb->events_lo || rdev->sb->events_hi)
				if ((rdev->sb->events_lo--)==0)
					rdev->sb->events_hi--;
		}

		printk(KERN_INFO "md: %s's event counter: %08lx\n",
			partition_name(rdev->dev),
			(unsigned long)rdev->sb->events_lo);
		if (!freshest) {
			freshest = rdev;
			continue;
		}
		/*
		 * Find the newest superblock version
		 */
		ev1 = md_event(rdev->sb);
		ev2 = md_event(freshest->sb);
		if (ev1 != ev2) {
			out_of_date = 1;
			if (ev1 > ev2)
				freshest = rdev;
		}
	}
	if (out_of_date) {
		printk(OUT_OF_DATE);
		printk(KERN_INFO "md: freshest: %s\n", partition_name(freshest->dev));
	}
	memcpy (sb, freshest->sb, sizeof(*sb));

	/*
	 * at this point we have picked the 'best' superblock
	 * from all available superblocks.
	 * now we validate this superblock and kick out possibly
	 * failed disks.
	 */
	ITERATE_RDEV(mddev,rdev,tmp) {
		/*
		 * Kick all non-fresh devices
		 */
		__u64 ev1, ev2;
		ev1 = md_event(rdev->sb);
		ev2 = md_event(sb);
		++ev1;
		if (ev1 < ev2) {
			printk(KERN_WARNING "md: kicking non-fresh %s from array!\n",
				partition_name(rdev->dev));
			kick_rdev_from_array(rdev);
			continue;
		}
	}

	/*
	 * Fix up changed device names ... but only if this disk has a
	 * recent update time. Use faulty checksum ones too.
	 */
	if (mddev->sb->level != -4)
	ITERATE_RDEV(mddev,rdev,tmp) {
		__u64 ev1, ev2, ev3;
		if (rdev->faulty || rdev->alias_device) {
			MD_BUG();
			goto abort;
		}
		ev1 = md_event(rdev->sb);
		ev2 = md_event(sb);
		ev3 = ev2;
		--ev3;
		if ((rdev->dev != rdev->old_dev) &&
			((ev1 == ev2) || (ev1 == ev3))) {
			mdp_disk_t *desc;

			printk(KERN_WARNING "md: device name has changed from %s to %s since last import!\n",
				partition_name(rdev->old_dev), partition_name(rdev->dev));
			if (rdev->desc_nr == -1) {
				MD_BUG();
				goto abort;
			}
			desc = &sb->disks[rdev->desc_nr];
			if (rdev->old_dev != MKDEV(desc->major, desc->minor)) {
				MD_BUG();
				goto abort;
			}
			desc->major = MAJOR(rdev->dev);
			desc->minor = MINOR(rdev->dev);
			desc = &rdev->sb->this_disk;
			desc->major = MAJOR(rdev->dev);
			desc->minor = MINOR(rdev->dev);
		}
	}

	/*
	 * Remove unavailable and faulty devices ...
	 *
	 * note that if an array becomes completely unrunnable due to
	 * missing devices, we do not write the superblock back, so the
	 * administrator has a chance to fix things up. The removal thus
	 * only happens if it's nonfatal to the contents of the array.
	 */
	for (i = 0; i < MD_SB_DISKS; i++) {
		int found;
		mdp_disk_t *desc;
		kdev_t dev;

		desc = sb->disks + i;
		dev = MKDEV(desc->major, desc->minor);

		/*
		 * We kick faulty devices/descriptors immediately.
		 *
		 * Note: multipath devices are a special case. Since we
		 * were able to read the superblock on the path, we don't
		 * care if it was previously marked as faulty, it's up now
		 * so enable it.
		 */
		if (disk_faulty(desc) && mddev->sb->level != -4) {
			found = 0;
			ITERATE_RDEV(mddev,rdev,tmp) {
				if (rdev->desc_nr != desc->number)
					continue;
				printk(KERN_WARNING "md%d: kicking faulty %s!\n",
					mdidx(mddev),partition_name(rdev->dev));
				kick_rdev_from_array(rdev);
				found = 1;
				break;
			}
			if (!found) {
				if (dev == MKDEV(0,0))
					continue;
				printk(KERN_WARNING "md%d: removing former faulty %s!\n",
					mdidx(mddev), partition_name(dev));
			}
			remove_descriptor(desc, sb);
			continue;
		} else if (disk_faulty(desc)) {
			/*
			 * multipath entry marked as faulty, unfaulty it
			 */
			rdev = find_rdev(mddev, dev);
			if (rdev)
				mark_disk_spare(desc);
			else
				remove_descriptor(desc, sb);
		}

		if (dev == MKDEV(0,0))
			continue;
		/*
		 * Is this device present in the rdev ring?
		 */
		found = 0;
		ITERATE_RDEV(mddev,rdev,tmp) {
			/*
			 * Multi-path IO special-case: since we have no
			 * this_disk descriptor at auto-detect time,
			 * we cannot check rdev->number.
			 * We can check the device though.
			 */
			if ((sb->level == -4) && (rdev->dev ==
					MKDEV(desc->major,desc->minor))) {
				found = 1;
				break;
			}
			if (rdev->desc_nr == desc->number) {
				found = 1;
				break;
			}
		}
		if (found)
			continue;

		printk(KERN_WARNING "md%d: former device %s is unavailable, removing from array!\n",
			mdidx(mddev), partition_name(dev));
		remove_descriptor(desc, sb);
	}

	/*
	 * Double check whether all devices mentioned in the
	 * superblock are in the rdev ring.
	 */
	first = 1;
	for (i = 0; i < MD_SB_DISKS; i++) {
		mdp_disk_t *desc;
		kdev_t dev;

		desc = sb->disks + i;
		dev = MKDEV(desc->major, desc->minor);

		if (dev == MKDEV(0,0))
			continue;

		if (disk_faulty(desc)) {
			MD_BUG();
			goto abort;
		}

		rdev = find_rdev(mddev, dev);
		if (!rdev) {
			MD_BUG();
			goto abort;
		}
		/*
		 * In the case of Multipath-IO, we have no
		 * other information source to find out which
		 * disk is which, only the position of the device
		 * in the superblock:
		 */
		if (mddev->sb->level == -4) {
			if ((rdev->desc_nr != -1) && (rdev->desc_nr != i)) {
				MD_BUG();
				goto abort;
			}
			rdev->desc_nr = i;
			if (!first)
				rdev->alias_device = 1;
			else
				first = 0;
		}
	}

	/*
	 * Kick all rdevs that are not in the
	 * descriptor array:
	 */
	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->desc_nr == -1)
			kick_rdev_from_array(rdev);
	}

	/*
	 * Do a final reality check.
	 */
	if (mddev->sb->level != -4) {
		ITERATE_RDEV(mddev,rdev,tmp) {
			if (rdev->desc_nr == -1) {
				MD_BUG();
				goto abort;
			}
			/*
			 * is the desc_nr unique?
			 */
			ITERATE_RDEV(mddev,rdev2,tmp2) {
				if ((rdev2 != rdev) &&
					(rdev2->desc_nr == rdev->desc_nr)) {
					MD_BUG();
					goto abort;
				}
			}
			/*
			 * is the device unique?
			 */
			ITERATE_RDEV(mddev,rdev2,tmp2) {
				if ((rdev2 != rdev) &&
					(rdev2->dev == rdev->dev)) {
					MD_BUG();
					goto abort;
				}
			}
		}
	}

	/*
	 * Check if we can support this RAID array
	 */
	if (sb->major_version != MD_MAJOR_VERSION ||
			sb->minor_version > MD_MINOR_VERSION) {

		printk(OLD_VERSION, mdidx(mddev), sb->major_version,
			sb->minor_version, sb->patch_version);
		goto abort;
	}

	if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) ||
			(sb->level == 4) || (sb->level == 5)))
		printk(NOT_CLEAN_IGNORE, mdidx(mddev));

	return 0;
abort:
	return 1;
}

#undef INCONSISTENT
#undef OUT_OF_DATE
#undef OLD_VERSION
#undef OLD_LEVEL

static int device_size_calculation(mddev_t * mddev)
{
	int data_disks = 0, persistent;
	unsigned int readahead;
	mdp_super_t *sb = mddev->sb;
	struct md_list_head *tmp;
	mdk_rdev_t *rdev;

	/*
	 * Do device size calculation. Bail out if too small.
	 * (we have to do this after having validated chunk_size,
	 * because device size has to be modulo chunk_size)
	 */
	persistent = !mddev->sb->not_persistent;
	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->faulty)
			continue;
		if (rdev->size) {
			MD_BUG();
			continue;
		}
		rdev->size = calc_dev_size(rdev->dev, mddev, persistent);
		if (rdev->size < sb->chunk_size / 1024) {
			printk(KERN_WARNING
				"md: Dev %s smaller than chunk_size: %ldk < %dk\n",
				partition_name(rdev->dev),
				rdev->size, sb->chunk_size / 1024);
			return -EINVAL;
		}
	}

	switch (sb->level) {
		case -4:
			data_disks = 1;
			break;
		case -3:
			data_disks = 1;
			break;
		case -2:
			data_disks = 1;
			break;
		case -1:
			zoned_raid_size(mddev);
			data_disks = 1;
			break;
		case 0:
			zoned_raid_size(mddev);
			data_disks = sb->raid_disks;
			break;
		case 1:
			data_disks = 1;
			break;
		case 4:
		case 5:
			data_disks = sb->raid_disks-1;
			break;
		default:
			printk(UNKNOWN_LEVEL, mdidx(mddev), sb->level);
			goto abort;
	}
	if (!md_size[mdidx(mddev)])
		md_size[mdidx(mddev)] = sb->size * data_disks;

	readahead = MD_READAHEAD;
	if ((sb->level == 0) || (sb->level == 4) || (sb->level == 5)) {
		readahead = (mddev->sb->chunk_size>>PAGE_SHIFT) * 4 * data_disks;
		if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2)
			readahead = data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2;
	} else {
		// (no multipath branch - it uses the default setting)
		if (sb->level == -3)
			readahead = 0;
	}
	md_maxreadahead[mdidx(mddev)] = readahead;

	printk(KERN_INFO "md%d: max total readahead window set to %ldk\n",
		mdidx(mddev), readahead*(PAGE_SIZE/1024));

	printk(KERN_INFO
		"md%d: %d data-disks, max readahead per data-disk: %ldk\n",
		mdidx(mddev), data_disks, readahead/data_disks*(PAGE_SIZE/1024));
	return 0;
abort:
	return 1;
}
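
/*
 * The chunk-size sanity checks in do_md_run() below enforce a power
 * of two between PAGE_SIZE and MAX_CHUNK_SIZE for the striped levels;
 * RAID1 and multipath ignore the chunk size.
 */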

#define TOO_BIG_CHUNKSIZE KERN_ERR \
"too big chunk_size: %d > %d\n"

#define TOO_SMALL_CHUNKSIZE KERN_ERR \
"too small chunk_size: %d < %ld\n"

#define BAD_CHUNKSIZE KERN_ERR \
"no chunksize specified, see 'man raidtab'\n"

static int do_md_run(mddev_t * mddev)
{
	int pnum, err;
	int chunk_size;
	struct md_list_head *tmp;
	mdk_rdev_t *rdev;


	if (!mddev->nb_dev) {
		MD_BUG();
		return -EINVAL;
	}

	if (mddev->pers)
		return -EBUSY;

	/*
	 * Resize disks to align partitions size on a given
	 * chunk size.
	 */
	md_size[mdidx(mddev)] = 0;

	/*
	 * Analyze all RAID superblock(s)
	 */
	if (analyze_sbs(mddev)) {
		MD_BUG();
		return -EINVAL;
	}

	chunk_size = mddev->sb->chunk_size;
	pnum = level_to_pers(mddev->sb->level);

	mddev->param.chunk_size = chunk_size;
	mddev->param.personality = pnum;

	if ((pnum != MULTIPATH) && (pnum != RAID1)) {
		if (!chunk_size) {
			/*
			 * 'default chunksize' in the old md code used to
			 * be PAGE_SIZE, baaad.
			 * we abort here to be on the safe side. We don't
			 * want to continue the bad practice.
			 */
			printk(BAD_CHUNKSIZE);
			return -EINVAL;
		}
		if (chunk_size > MAX_CHUNK_SIZE) {
			printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE);
			return -EINVAL;
		}
		/*
		 * chunk-size has to be a power of 2 and multiples of PAGE_SIZE
		 */
		if ( (1 << ffz(~chunk_size)) != chunk_size) {
			MD_BUG();
			return -EINVAL;
		}
		if (chunk_size < PAGE_SIZE) {
			printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE);
			return -EINVAL;
		}
	} else
		if (chunk_size)
			printk(KERN_INFO "md: RAID level %d does not need chunksize! Continuing anyway.\n",
				mddev->sb->level);

	if (pnum >= MAX_PERSONALITY) {
		MD_BUG();
		return -EINVAL;
	}

	if (!pers[pnum])
	{
#ifdef CONFIG_KMOD
		char module_name[80];
		sprintf (module_name, "md-personality-%d", pnum);
		request_module (module_name);
		if (!pers[pnum])
#endif
		{
			printk(KERN_ERR "md: personality %d is not loaded!\n",
				pnum);
			return -EINVAL;
		}
	}

	if (device_size_calculation(mddev))
		return -EINVAL;

	/*
	 * Drop all container device buffers, from now on
	 * the only valid external interface is through the md
	 * device.
	 * Also find largest hardsector size
	 */
	md_hardsect_sizes[mdidx(mddev)] = 512;
	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->faulty)
			continue;
		invalidate_device(rdev->dev, 1);
		if (get_hardsect_size(rdev->dev)
			> md_hardsect_sizes[mdidx(mddev)])
			md_hardsect_sizes[mdidx(mddev)] =
				get_hardsect_size(rdev->dev);
	}
	md_blocksizes[mdidx(mddev)] = 1024;
	if (md_blocksizes[mdidx(mddev)] < md_hardsect_sizes[mdidx(mddev)])
		md_blocksizes[mdidx(mddev)] = md_hardsect_sizes[mdidx(mddev)];
	mddev->pers = pers[pnum];

	err = mddev->pers->run(mddev);
	if (err) {
		printk(KERN_ERR "md: pers->run() failed ...\n");
		mddev->pers = NULL;
		return -EINVAL;
	}

	mddev->sb->state &= ~(1 << MD_SB_CLEAN);
	mddev->sb_dirty = 1;
	md_update_sb(mddev);

	/*
	 * md_size has units of 1K blocks, which are
	 * twice as large as sectors.
	 */
	md_hd_struct[mdidx(mddev)].start_sect = 0;
	register_disk(&md_gendisk, MKDEV(MAJOR_NR,mdidx(mddev)),
			1, &md_fops, md_size[mdidx(mddev)]<<1);

	read_ahead[MD_MAJOR] = 1024;
	return (0);
}

#undef TOO_BIG_CHUNKSIZE
#undef BAD_CHUNKSIZE

#define OUT(x) do { err = (x); goto out; } while (0)

static int restart_array(mddev_t *mddev)
{
	int err = 0;

	/*
	 * Complain if it has no devices
	 */
	if (!mddev->nb_dev)
		OUT(-ENXIO);

	if (mddev->pers) {
		if (!mddev->ro)
			OUT(-EBUSY);

		mddev->ro = 0;
		set_device_ro(mddev_to_kdev(mddev), 0);

		printk(KERN_INFO
			"md: md%d switched to read-write mode.\n", mdidx(mddev));
		/*
		 * Kick recovery or resync if necessary
		 */
		md_recover_arrays();
		if (mddev->pers->restart_resync)
			mddev->pers->restart_resync(mddev);
	} else {
		printk(KERN_ERR "md: md%d has no personality assigned.\n",
			mdidx(mddev));
		err = -EINVAL;
	}

out:
	return err;
}

#define STILL_MOUNTED KERN_WARNING \
"md: md%d still mounted.\n"
#define STILL_IN_USE \
"md: md%d still in use.\n"

static int do_md_stop(mddev_t * mddev, int ro)
{
	int err = 0, resync_interrupted = 0;
	kdev_t dev = mddev_to_kdev(mddev);

	if (atomic_read(&mddev->active)>1) {
		printk(STILL_IN_USE, mdidx(mddev));
		OUT(-EBUSY);
	}

	if (mddev->pers) {
		/*
		 * It is safe to call stop here, it only frees private
		 * data. Also, it tells us if a device is unstoppable
		 * (eg. resyncing is in progress)
		 */
		if (mddev->pers->stop_resync)
			if (mddev->pers->stop_resync(mddev))
				resync_interrupted = 1;

		if (mddev->recovery_running)
			md_interrupt_thread(md_recovery_thread);

		/*
		 * This synchronizes with signal delivery to the
		 * resync or reconstruction thread. It also nicely
		 * hangs the process if some reconstruction has not
		 * finished.
		 */
		down(&mddev->recovery_sem);
		up(&mddev->recovery_sem);

		invalidate_device(dev, 1);

		if (ro) {
			if (mddev->ro)
				OUT(-ENXIO);
			mddev->ro = 1;
		} else {
			if (mddev->ro)
				set_device_ro(dev, 0);
			if (mddev->pers->stop(mddev)) {
				if (mddev->ro)
					set_device_ro(dev, 1);
				OUT(-EBUSY);
			}
			if (mddev->ro)
				mddev->ro = 0;
		}
		if (mddev->sb) {
			/*
			 * mark it clean only if there was no resync
			 * interrupted.
			 */
			if (!mddev->recovery_running && !resync_interrupted) {
				printk(KERN_INFO "md: marking sb clean...\n");
				mddev->sb->state |= 1 << MD_SB_CLEAN;
			}
			mddev->sb_dirty = 1;
			md_update_sb(mddev);
		}
		if (ro)
			set_device_ro(dev, 1);
	}

	/*
	 * Free resources if final stop
	 */
	if (!ro) {
		printk(KERN_INFO "md: md%d stopped.\n", mdidx(mddev));
		free_mddev(mddev);

	} else
		printk(KERN_INFO "md: md%d switched to read-only mode.\n", mdidx(mddev));
out:
	return err;
}

#undef OUT
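
/*
 * Autostart only trusts v0.90+ superblocks; detect_old_array() below
 * rejects anything older so that the array can be upgraded first.
 */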

/*
 * We have to safely support old arrays too.
 */
int detect_old_array(mdp_super_t *sb)
{
	if (sb->major_version > 0)
		return 0;
	if (sb->minor_version >= 90)
		return 0;

	return -EINVAL;
}


static void autorun_array(mddev_t *mddev)
{
	mdk_rdev_t *rdev;
	struct md_list_head *tmp;
	int err;

	if (mddev->disks.prev == &mddev->disks) {
		MD_BUG();
		return;
	}

	printk(KERN_INFO "md: running: ");

	ITERATE_RDEV(mddev,rdev,tmp) {
		printk("<%s>", partition_name(rdev->dev));
	}
	printk("\n");

	err = do_md_run (mddev);
	if (err) {
		printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
		/*
		 * prevent the writeback of an unrunnable array
		 */
		mddev->sb_dirty = 0;
		do_md_stop (mddev, 0);
	}
}

/*
 * let's try to run arrays based on all disks that have arrived
 * until now. (those are in the ->pending list)
 *
 * the method: pick the first pending disk, collect all disks with
 * the same UUID, remove all from the pending list and put them into
 * the 'same_array' list. Then order this list based on superblock
 * update time (freshest comes first), kick out 'old' disks and
 * compare superblocks. If everything's fine then run it.
 *
 * If "unit" is allocated, then bump its reference count
 */
static void autorun_devices(kdev_t countdev)
{
	struct md_list_head candidates;
	struct md_list_head *tmp;
	mdk_rdev_t *rdev0, *rdev;
	mddev_t *mddev;
	kdev_t md_kdev;


	printk(KERN_INFO "md: autorun ...\n");
	while (pending_raid_disks.next != &pending_raid_disks) {
		rdev0 = md_list_entry(pending_raid_disks.next,
					mdk_rdev_t, pending);

		printk(KERN_INFO "md: considering %s ...\n", partition_name(rdev0->dev));
		MD_INIT_LIST_HEAD(&candidates);
		ITERATE_RDEV_PENDING(rdev,tmp) {
			if (uuid_equal(rdev0, rdev)) {
				if (!sb_equal(rdev0->sb, rdev->sb)) {
					printk(KERN_WARNING
						"md: %s has same UUID as %s, but superblocks differ ...\n",
						partition_name(rdev->dev), partition_name(rdev0->dev));
					continue;
				}
				printk(KERN_INFO "md: adding %s ...\n", partition_name(rdev->dev));
				md_list_del(&rdev->pending);
				md_list_add(&rdev->pending, &candidates);
			}
		}
		/*
		 * now we have a set of devices, with all of them having
		 * mostly sane superblocks. It's time to allocate the
		 * mddev.
		 */
		md_kdev = MKDEV(MD_MAJOR, rdev0->sb->md_minor);
		mddev = kdev_to_mddev(md_kdev);
		if (mddev) {
			printk(KERN_WARNING "md: md%d already running, cannot run %s\n",
				mdidx(mddev), partition_name(rdev0->dev));
			ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp)
				export_rdev(rdev);
			continue;
		}
		mddev = alloc_mddev(md_kdev);
		if (!mddev) {
			printk(KERN_ERR "md: cannot allocate memory for md drive.\n");
			break;
		}
		if (md_kdev == countdev)
			atomic_inc(&mddev->active);
		printk(KERN_INFO "md: created md%d\n", mdidx(mddev));
		ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) {
			bind_rdev_to_array(rdev, mddev);
			md_list_del(&rdev->pending);
			MD_INIT_LIST_HEAD(&rdev->pending);
		}
		autorun_array(mddev);
	}
	printk(KERN_INFO "md: ... autorun DONE.\n");
}

/*
 * import RAID devices based on one partition
 * if possible, the array gets run as well.
 */
1984 */ 1985 1986#define BAD_VERSION KERN_ERR \ 1987"md: %s has RAID superblock version 0.%d, autodetect needs v0.90 or higher\n" 1988 1989#define OUT_OF_MEM KERN_ALERT \ 1990"md: out of memory.\n" 1991 1992#define NO_DEVICE KERN_ERR \ 1993"md: disabled device %s\n" 1994 1995#define AUTOADD_FAILED KERN_ERR \ 1996"md: auto-adding devices to md%d FAILED (error %d).\n" 1997 1998#define AUTOADD_FAILED_USED KERN_ERR \ 1999"md: cannot auto-add device %s to md%d, already used.\n" 2000 2001#define AUTORUN_FAILED KERN_ERR \ 2002"md: auto-running md%d FAILED (error %d).\n" 2003 2004#define MDDEV_BUSY KERN_ERR \ 2005"md: cannot auto-add to md%d, already running.\n" 2006 2007#define AUTOADDING KERN_INFO \ 2008"md: auto-adding devices to md%d, based on %s's superblock.\n" 2009 2010#define AUTORUNNING KERN_INFO \ 2011"md: auto-running md%d.\n" 2012 2013static int autostart_array(kdev_t startdev, kdev_t countdev) 2014{ 2015 int err = -EINVAL, i; 2016 mdp_super_t *sb = NULL; 2017 mdk_rdev_t *start_rdev = NULL, *rdev; 2018 2019 if (md_import_device(startdev, 1)) { 2020 printk(KERN_WARNING "md: could not import %s!\n", partition_name(startdev)); 2021 goto abort; 2022 } 2023 2024 start_rdev = find_rdev_all(startdev); 2025 if (!start_rdev) { 2026 MD_BUG(); 2027 goto abort; 2028 } 2029 if (start_rdev->faulty) { 2030 printk(KERN_WARNING "md: can not autostart based on faulty %s!\n", 2031 partition_name(startdev)); 2032 goto abort; 2033 } 2034 md_list_add(&start_rdev->pending, &pending_raid_disks); 2035 2036 sb = start_rdev->sb; 2037 2038 err = detect_old_array(sb); 2039 if (err) { 2040 printk(KERN_WARNING "md: array version is too old to be autostarted ," 2041 "use raidtools 0.90 mkraid --upgrade to upgrade the array " 2042 "without data loss!\n"); 2043 goto abort; 2044 } 2045 2046 for (i = 0; i < MD_SB_DISKS; i++) { 2047 mdp_disk_t *desc; 2048 kdev_t dev; 2049 2050 desc = sb->disks + i; 2051 dev = MKDEV(desc->major, desc->minor); 2052 2053 if (dev == MKDEV(0,0)) 2054 continue; 2055 if (dev == startdev) 2056 continue; 2057 if (md_import_device(dev, 1)) { 2058 printk(KERN_WARNING "md: could not import %s, trying to run array nevertheless.\n", 2059 partition_name(dev)); 2060 continue; 2061 } 2062 rdev = find_rdev_all(dev); 2063 if (!rdev) { 2064 MD_BUG(); 2065 goto abort; 2066 } 2067 md_list_add(&rdev->pending, &pending_raid_disks); 2068 } 2069 2070 /* 2071 * possibly return codes 2072 */ 2073 autorun_devices(countdev); 2074 return 0; 2075 2076abort: 2077 if (start_rdev) 2078 export_rdev(start_rdev); 2079 return err; 2080} 2081 2082#undef BAD_VERSION 2083#undef OUT_OF_MEM 2084#undef NO_DEVICE 2085#undef AUTOADD_FAILED_USED 2086#undef AUTOADD_FAILED 2087#undef AUTORUN_FAILED 2088#undef AUTOADDING 2089#undef AUTORUNNING 2090 2091 2092static int get_version(void * arg) 2093{ 2094 mdu_version_t ver; 2095 2096 ver.major = MD_MAJOR_VERSION; 2097 ver.minor = MD_MINOR_VERSION; 2098 ver.patchlevel = MD_PATCHLEVEL_VERSION; 2099 2100 if (md_copy_to_user(arg, &ver, sizeof(ver))) 2101 return -EFAULT; 2102 2103 return 0; 2104} 2105 2106#define SET_FROM_SB(x) info.x = mddev->sb->x 2107static int get_array_info(mddev_t * mddev, void * arg) 2108{ 2109 mdu_array_info_t info; 2110 2111 if (!mddev->sb) { 2112 MD_BUG(); 2113 return -EINVAL; 2114 } 2115 2116 SET_FROM_SB(major_version); 2117 SET_FROM_SB(minor_version); 2118 SET_FROM_SB(patch_version); 2119 SET_FROM_SB(ctime); 2120 SET_FROM_SB(level); 2121 SET_FROM_SB(size); 2122 SET_FROM_SB(nr_disks); 2123 SET_FROM_SB(raid_disks); 2124 SET_FROM_SB(md_minor); 2125 
#define SET_FROM_SB(x) info.x = mddev->sb->disks[nr].x
static int get_disk_info(mddev_t * mddev, void * arg)
{
	mdu_disk_info_t info;
	unsigned int nr;

	if (!mddev->sb)
		return -EINVAL;

	if (md_copy_from_user(&info, arg, sizeof(info)))
		return -EFAULT;

	nr = info.number;
	if (nr >= MD_SB_DISKS)
		return -EINVAL;

	SET_FROM_SB(major);
	SET_FROM_SB(minor);
	SET_FROM_SB(raid_disk);
	SET_FROM_SB(state);

	if (md_copy_to_user(arg, &info, sizeof(info)))
		return -EFAULT;

	return 0;
}
#undef SET_FROM_SB

#define SET_SB(x) mddev->sb->disks[nr].x = info->x

static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
{
	int err, size, persistent;
	mdk_rdev_t *rdev;
	unsigned int nr;
	kdev_t dev;
	dev = MKDEV(info->major,info->minor);

	if (find_rdev_all(dev)) {
		printk(KERN_WARNING "md: device %s already used in a RAID array!\n",
			partition_name(dev));
		return -EBUSY;
	}
	if (!mddev->sb) {
		/* expecting a device which has a superblock */
		err = md_import_device(dev, 1);
		if (err) {
			printk(KERN_WARNING "md: md_import_device returned %d\n", err);
			return -EINVAL;
		}
		rdev = find_rdev_all(dev);
		if (!rdev) {
			MD_BUG();
			return -EINVAL;
		}
		if (mddev->nb_dev) {
			mdk_rdev_t *rdev0 = md_list_entry(mddev->disks.next,
							mdk_rdev_t, same_set);
			if (!uuid_equal(rdev0, rdev)) {
				printk(KERN_WARNING "md: %s has different UUID to %s\n",
					partition_name(rdev->dev), partition_name(rdev0->dev));
				export_rdev(rdev);
				return -EINVAL;
			}
			if (!sb_equal(rdev0->sb, rdev->sb)) {
				printk(KERN_WARNING "md: %s has same UUID but different superblock to %s\n",
					partition_name(rdev->dev), partition_name(rdev0->dev));
				export_rdev(rdev);
				return -EINVAL;
			}
		}
		bind_rdev_to_array(rdev, mddev);
		return 0;
	}

	nr = info->number;
	if (nr >= mddev->sb->nr_disks) {
		MD_BUG();
		return -EINVAL;
	}


	SET_SB(number);
	SET_SB(major);
	SET_SB(minor);
	SET_SB(raid_disk);
	SET_SB(state);

	if ((info->state & (1<<MD_DISK_FAULTY))==0) {
		err = md_import_device (dev, 0);
		if (err) {
			printk(KERN_WARNING "md: error, md_import_device() returned %d\n", err);
			return -EINVAL;
		}
		rdev = find_rdev_all(dev);
		if (!rdev) {
			MD_BUG();
			return -EINVAL;
		}

		rdev->old_dev = dev;
		rdev->desc_nr = info->number;

		bind_rdev_to_array(rdev, mddev);

		persistent = !mddev->sb->not_persistent;
		if (!persistent)
			printk(KERN_INFO "md: nonpersistent superblock ...\n");

		size = calc_dev_size(dev, mddev, persistent);
		rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);

		if (!mddev->sb->size || (mddev->sb->size > size))
			mddev->sb->size = size;
	}

	/*
	 * sync all other superblocks with the main superblock
	 */
	sync_sbs(mddev);

	return 0;
}
#undef SET_SB
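
/*
 * Debug hook: inject a one-shot I/O error on an active component
 * device. The actual injection (q->oneshot_error) is compiled out
 * below, so currently only the sanity checks are performed.
 */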
static int hot_generate_error(mddev_t * mddev, kdev_t dev)
{
	struct request_queue *q;
	mdk_rdev_t *rdev;
	mdp_disk_t *disk;

	if (!mddev->pers)
		return -ENODEV;

	printk(KERN_INFO "md: trying to generate %s error in md%d ... \n",
		partition_name(dev), mdidx(mddev));

	rdev = find_rdev(mddev, dev);
	if (!rdev) {
		MD_BUG();
		return -ENXIO;
	}

	if (rdev->desc_nr == -1) {
		MD_BUG();
		return -EINVAL;
	}
	disk = &mddev->sb->disks[rdev->desc_nr];
	if (!disk_active(disk))
		return -ENODEV;

	q = blk_get_queue(rdev->dev);
	if (!q) {
		MD_BUG();
		return -ENODEV;
	}
	printk(KERN_INFO "md: okay, generating error!\n");
//	q->oneshot_error = 1; // disabled for now

	return 0;
}

static int hot_remove_disk(mddev_t * mddev, kdev_t dev)
{
	int err;
	mdk_rdev_t *rdev;
	mdp_disk_t *disk;

	if (!mddev->pers)
		return -ENODEV;

	printk(KERN_INFO "md: trying to remove %s from md%d ... \n",
		partition_name(dev), mdidx(mddev));

	if (!mddev->pers->diskop) {
		printk(KERN_WARNING "md%d: personality does not support diskops!\n",
			mdidx(mddev));
		return -EINVAL;
	}

	rdev = find_rdev(mddev, dev);
	if (!rdev)
		return -ENXIO;

	if (rdev->desc_nr == -1) {
		MD_BUG();
		return -EINVAL;
	}
	disk = &mddev->sb->disks[rdev->desc_nr];
	if (disk_active(disk)) {
		MD_BUG();
		goto busy;
	}
	if (disk_removed(disk)) {
		MD_BUG();
		return -EINVAL;
	}

	err = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK);
	if (err == -EBUSY) {
		MD_BUG();
		goto busy;
	}
	if (err) {
		MD_BUG();
		return -EINVAL;
	}

	remove_descriptor(disk, mddev->sb);
	kick_rdev_from_array(rdev);
	mddev->sb_dirty = 1;
	md_update_sb(mddev);

	return 0;
busy:
	printk(KERN_WARNING "md: cannot remove active disk %s from md%d ... \n",
		partition_name(dev), mdidx(mddev));
	return -EBUSY;
}
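
/*
 * Hot-add path: import the device without reading a superblock, find
 * a free (or removed) descriptor slot, let the personality perform
 * DISKOP_HOT_ADD_DISK, then account for the new disk as a spare and
 * write out the updated superblocks.
 */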
\n", 2375 partition_name(dev), mdidx(mddev)); 2376 2377 if (!mddev->pers->diskop) { 2378 printk(KERN_WARNING "md%d: personality does not support diskops!\n", 2379 mdidx(mddev)); 2380 return -EINVAL; 2381 } 2382 2383 persistent = !mddev->sb->not_persistent; 2384 size = calc_dev_size(dev, mddev, persistent); 2385 2386 if (size < mddev->sb->size) { 2387 printk(KERN_WARNING "md%d: disk size %d blocks < array size %d\n", 2388 mdidx(mddev), size, mddev->sb->size); 2389 return -ENOSPC; 2390 } 2391 2392 rdev = find_rdev(mddev, dev); 2393 if (rdev) 2394 return -EBUSY; 2395 2396 err = md_import_device (dev, 0); 2397 if (err) { 2398 printk(KERN_WARNING "md: error, md_import_device() returned %d\n", err); 2399 return -EINVAL; 2400 } 2401 rdev = find_rdev_all(dev); 2402 if (!rdev) { 2403 MD_BUG(); 2404 return -EINVAL; 2405 } 2406 if (rdev->faulty) { 2407 printk(KERN_WARNING "md: can not hot-add faulty %s disk to md%d!\n", 2408 partition_name(dev), mdidx(mddev)); 2409 err = -EINVAL; 2410 goto abort_export; 2411 } 2412 bind_rdev_to_array(rdev, mddev); 2413 2414 /* 2415 * The rest should better be atomic, we can have disk failures 2416 * noticed in interrupt contexts ... 2417 */ 2418 rdev->old_dev = dev; 2419 rdev->size = size; 2420 rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent); 2421 2422 disk = mddev->sb->disks + mddev->sb->raid_disks; 2423 for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) { 2424 disk = mddev->sb->disks + i; 2425 2426 if (!disk->major && !disk->minor) 2427 break; 2428 if (disk_removed(disk)) 2429 break; 2430 } 2431 if (i == MD_SB_DISKS) { 2432 printk(KERN_WARNING "md%d: can not hot-add to full array!\n", 2433 mdidx(mddev)); 2434 err = -EBUSY; 2435 goto abort_unbind_export; 2436 } 2437 2438 if (disk_removed(disk)) { 2439 /* 2440 * reuse slot 2441 */ 2442 if (disk->number != i) { 2443 MD_BUG(); 2444 err = -EINVAL; 2445 goto abort_unbind_export; 2446 } 2447 } else { 2448 disk->number = i; 2449 } 2450 2451 disk->raid_disk = disk->number; 2452 disk->major = MAJOR(dev); 2453 disk->minor = MINOR(dev); 2454 2455 if (mddev->pers->diskop(mddev, &disk, DISKOP_HOT_ADD_DISK)) { 2456 MD_BUG(); 2457 err = -EINVAL; 2458 goto abort_unbind_export; 2459 } 2460 2461 mark_disk_spare(disk); 2462 mddev->sb->nr_disks++; 2463 mddev->sb->spare_disks++; 2464 mddev->sb->working_disks++; 2465 2466 mddev->sb_dirty = 1; 2467 md_update_sb(mddev); 2468 2469 /* 2470 * Kick recovery, maybe this spare has to be added to the 2471 * array immediately. 
2472 */ 2473 md_recover_arrays(); 2474 2475 return 0; 2476 2477abort_unbind_export: 2478 unbind_rdev_from_array(rdev); 2479 2480abort_export: 2481 export_rdev(rdev); 2482 return err; 2483} 2484 2485#define SET_SB(x) mddev->sb->x = info->x 2486static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) 2487{ 2488 2489 if (alloc_array_sb(mddev)) 2490 return -ENOMEM; 2491 2492 mddev->sb->major_version = MD_MAJOR_VERSION; 2493 mddev->sb->minor_version = MD_MINOR_VERSION; 2494 mddev->sb->patch_version = MD_PATCHLEVEL_VERSION; 2495 mddev->sb->ctime = CURRENT_TIME; 2496 2497 SET_SB(level); 2498 SET_SB(size); 2499 SET_SB(nr_disks); 2500 SET_SB(raid_disks); 2501 SET_SB(md_minor); 2502 SET_SB(not_persistent); 2503 2504 SET_SB(state); 2505 SET_SB(active_disks); 2506 SET_SB(working_disks); 2507 SET_SB(failed_disks); 2508 SET_SB(spare_disks); 2509 2510 SET_SB(layout); 2511 SET_SB(chunk_size); 2512 2513 mddev->sb->md_magic = MD_SB_MAGIC; 2514 2515 /* 2516 * Generate a 128 bit UUID 2517 */ 2518 get_random_bytes(&mddev->sb->set_uuid0, 4); 2519 get_random_bytes(&mddev->sb->set_uuid1, 4); 2520 get_random_bytes(&mddev->sb->set_uuid2, 4); 2521 get_random_bytes(&mddev->sb->set_uuid3, 4); 2522 2523 return 0; 2524} 2525#undef SET_SB 2526 2527static int set_disk_info(mddev_t * mddev, void * arg) 2528{ 2529 printk(KERN_INFO "md: not yet"); 2530 return -EINVAL; 2531} 2532 2533static int clear_array(mddev_t * mddev) 2534{ 2535 printk(KERN_INFO "md: not yet"); 2536 return -EINVAL; 2537} 2538 2539static int write_raid_info(mddev_t * mddev) 2540{ 2541 printk(KERN_INFO "md: not yet"); 2542 return -EINVAL; 2543} 2544 2545static int protect_array(mddev_t * mddev) 2546{ 2547 printk(KERN_INFO "md: not yet"); 2548 return -EINVAL; 2549} 2550 2551static int unprotect_array(mddev_t * mddev) 2552{ 2553 printk(KERN_INFO "md: not yet"); 2554 return -EINVAL; 2555} 2556 2557static int set_disk_faulty(mddev_t *mddev, kdev_t dev) 2558{ 2559 int ret; 2560 2561 ret = md_error(mddev, dev); 2562 return ret; 2563} 2564 2565static int md_ioctl(struct inode *inode, struct file *file, 2566 unsigned int cmd, unsigned long arg) 2567{ 2568 unsigned int minor; 2569 int err = 0; 2570 struct hd_geometry *loc = (struct hd_geometry *) arg; 2571 mddev_t *mddev = NULL; 2572 kdev_t dev; 2573 2574 if (!md_capable_admin()) 2575 return -EACCES; 2576 2577 dev = inode->i_rdev; 2578 minor = MINOR(dev); 2579 if (minor >= MAX_MD_DEVS) { 2580 MD_BUG(); 2581 return -EINVAL; 2582 } 2583 2584 /* 2585 * Commands dealing with the RAID driver but not any 2586 * particular array: 2587 */ 2588 switch (cmd) 2589 { 2590 case RAID_VERSION: 2591 err = get_version((void *)arg); 2592 goto done; 2593 2594 case PRINT_RAID_DEBUG: 2595 err = 0; 2596 md_print_devices(); 2597 goto done_unlock; 2598 2599#ifndef MODULE 2600 case RAID_AUTORUN: 2601 err = 0; 2602 autostart_arrays(); 2603 goto done; 2604#endif 2605 2606 case BLKGETSIZE: /* Return device size */ 2607 if (!arg) { 2608 err = -EINVAL; 2609 MD_BUG(); 2610 goto abort; 2611 } 2612 err = md_put_user(md_hd_struct[minor].nr_sects, 2613 (unsigned long *) arg); 2614 goto done; 2615 2616 case BLKGETSIZE64: /* Return device size */ 2617 err = md_put_user((u64)md_hd_struct[minor].nr_sects << 9, 2618 (u64 *) arg); 2619 goto done; 2620 2621 case BLKRAGET: 2622 case BLKRASET: 2623 case BLKFLSBUF: 2624 case BLKBSZGET: 2625 case BLKBSZSET: 2626 err = blk_ioctl (dev, cmd, arg); 2627 goto abort; 2628 2629 default:; 2630 } 2631 2632 /* 2633 * Commands creating/starting a new array: 2634 */ 2635 2636 mddev = kdev_to_mddev(dev); 2637 
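	/*
	 * (Overview: md_ioctl dispatches in three tiers -- the
	 * driver-global commands handled above, the array-creation
	 * commands in the two switches below, which must *not* find
	 * an existing mddev, and finally the commands further down
	 * that query or reconfigure an existing, locked array.)
	 */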
 2638 switch (cmd) 2639 { 2640 case SET_ARRAY_INFO: 2641 case START_ARRAY: 2642 if (mddev) { 2643 printk(KERN_WARNING "md: array md%d already exists!\n", 2644 mdidx(mddev)); 2645 err = -EEXIST; 2646 goto abort; 2647 } 2648 default:; 2649 } 2650 switch (cmd) 2651 { 2652 case SET_ARRAY_INFO: 2653 mddev = alloc_mddev(dev); 2654 if (!mddev) { 2655 err = -ENOMEM; 2656 goto abort; 2657 } 2658 atomic_inc(&mddev->active); 2659 2660 /* 2661 * alloc_mddev() should possibly self-lock. 2662 */ 2663 err = lock_mddev(mddev); 2664 if (err) { 2665 printk(KERN_WARNING "md: ioctl, reason %d, cmd %d\n", 2666 err, cmd); 2667 goto abort; 2668 } 2669 2670 if (mddev->sb) { 2671 printk(KERN_WARNING "md: array md%d already has a superblock!\n", 2672 mdidx(mddev)); 2673 err = -EBUSY; 2674 goto abort_unlock; 2675 } 2676 if (arg) { 2677 mdu_array_info_t info; 2678 if (md_copy_from_user(&info, (void*)arg, sizeof(info))) { 2679 err = -EFAULT; 2680 goto abort_unlock; 2681 } 2682 err = set_array_info(mddev, &info); 2683 if (err) { 2684 printk(KERN_WARNING "md: couldn't set array info. %d\n", err); 2685 goto abort_unlock; 2686 } 2687 } 2688 goto done_unlock; 2689 2690 case START_ARRAY: 2691 /* 2692 * possibly make it lock the array ... 2693 */ 2694 err = autostart_array((kdev_t)arg, dev); 2695 if (err) { 2696 printk(KERN_WARNING "md: autostart %s failed!\n", 2697 partition_name((kdev_t)arg)); 2698 goto abort; 2699 } 2700 goto done; 2701 2702 default:; 2703 } 2704 2705 /* 2706 * Commands querying/configuring an existing array: 2707 */ 2708 2709 if (!mddev) { 2710 err = -ENODEV; 2711 goto abort; 2712 } 2713 err = lock_mddev(mddev); 2714 if (err) { 2715 printk(KERN_INFO "md: ioctl lock interrupted, reason %d, cmd %d\n",err, cmd); 2716 goto abort; 2717 } 2718 /* if we don't have a superblock yet, only ADD_NEW_DISK, STOP_ARRAY or RUN_ARRAY is allowed */ 2719 if (!mddev->sb && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) { 2720 err = -ENODEV; 2721 goto abort_unlock; 2722 } 2723 2724 /* 2725 * Commands even a read-only array can execute: 2726 */ 2727 switch (cmd) 2728 { 2729 case GET_ARRAY_INFO: 2730 err = get_array_info(mddev, (void *)arg); 2731 goto done_unlock; 2732 2733 case GET_DISK_INFO: 2734 err = get_disk_info(mddev, (void *)arg); 2735 goto done_unlock; 2736 2737 case RESTART_ARRAY_RW: 2738 err = restart_array(mddev); 2739 goto done_unlock; 2740 2741 case STOP_ARRAY: 2742 if (!(err = do_md_stop (mddev, 0))) 2743 mddev = NULL; 2744 goto done_unlock; 2745 2746 case STOP_ARRAY_RO: 2747 err = do_md_stop (mddev, 1); 2748 goto done_unlock; 2749 2750 /* 2751 * We have a problem here: there is no easy way to give a CHS 2752 * virtual geometry. We currently pretend that we have 2 heads 2753 * and 4 sectors (with a BIG number of cylinders...). This drives 2754 * dosfs just mad...
;-) 2755 */ 2756 case HDIO_GETGEO: 2757 if (!loc) { 2758 err = -EINVAL; 2759 goto abort_unlock; 2760 } 2761 err = md_put_user (2, (char *) &loc->heads); 2762 if (err) 2763 goto abort_unlock; 2764 err = md_put_user (4, (char *) &loc->sectors); 2765 if (err) 2766 goto abort_unlock; 2767 err = md_put_user (md_hd_struct[mdidx(mddev)].nr_sects/8, 2768 (short *) &loc->cylinders); 2769 if (err) 2770 goto abort_unlock; 2771 err = md_put_user (md_hd_struct[minor].start_sect, 2772 (long *) &loc->start); 2773 goto done_unlock; 2774 } 2775 2776 /* 2777 * The remaining ioctls are changing the state of the 2778 * superblock, so we do not allow read-only arrays 2779 * here: 2780 */ 2781 if (mddev->ro) { 2782 err = -EROFS; 2783 goto abort_unlock; 2784 } 2785 2786 switch (cmd) 2787 { 2788 case CLEAR_ARRAY: 2789 err = clear_array(mddev); 2790 goto done_unlock; 2791 2792 case ADD_NEW_DISK: 2793 { 2794 mdu_disk_info_t info; 2795 if (md_copy_from_user(&info, (void*)arg, sizeof(info))) 2796 err = -EFAULT; 2797 else 2798 err = add_new_disk(mddev, &info); 2799 goto done_unlock; 2800 } 2801 case HOT_GENERATE_ERROR: 2802 err = hot_generate_error(mddev, (kdev_t)arg); 2803 goto done_unlock; 2804 case HOT_REMOVE_DISK: 2805 err = hot_remove_disk(mddev, (kdev_t)arg); 2806 goto done_unlock; 2807 2808 case HOT_ADD_DISK: 2809 err = hot_add_disk(mddev, (kdev_t)arg); 2810 goto done_unlock; 2811 2812 case SET_DISK_INFO: 2813 err = set_disk_info(mddev, (void *)arg); 2814 goto done_unlock; 2815 2816 case WRITE_RAID_INFO: 2817 err = write_raid_info(mddev); 2818 goto done_unlock; 2819 2820 case UNPROTECT_ARRAY: 2821 err = unprotect_array(mddev); 2822 goto done_unlock; 2823 2824 case PROTECT_ARRAY: 2825 err = protect_array(mddev); 2826 goto done_unlock; 2827 2828 case SET_DISK_FAULTY: 2829 err = set_disk_faulty(mddev, (kdev_t)arg); 2830 goto done_unlock; 2831 2832 case RUN_ARRAY: 2833 { 2834/* The data is never used.... 2835 mdu_param_t param; 2836 err = md_copy_from_user(¶m, (mdu_param_t *)arg, 2837 sizeof(param)); 2838 if (err) 2839 goto abort_unlock; 2840*/ 2841 err = do_md_run (mddev); 2842 /* 2843 * we have to clean up the mess if 2844 * the array cannot be run for some 2845 * reason ... 
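 * (do_md_stop() below tears the half-started array down again;
 * sb_dirty is cleared first so that the failed attempt does not
 * rewrite the on-disk superblocks on the way out.)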
 2846 */ 2847 if (err) { 2848 mddev->sb_dirty = 0; 2849 if (!do_md_stop (mddev, 0)) 2850 mddev = NULL; 2851 } 2852 goto done_unlock; 2853 } 2854 2855 default: 2856 printk(KERN_WARNING "md: %s(pid %d) used obsolete MD ioctl, " 2857 "upgrade your software to use the new ioctls.\n", 2858 current->comm, current->pid); 2859 err = -EINVAL; 2860 goto abort_unlock; 2861 } 2862 2863done_unlock: 2864abort_unlock: 2865 if (mddev) 2866 unlock_mddev(mddev); 2867 2868 return err; 2869done: 2870 if (err) 2871 MD_BUG(); 2872abort: 2873 return err; 2874} 2875 2876static int md_open(struct inode *inode, struct file *file) 2877{ 2878 /* 2879 * Always succeed, but increment the usage count 2880 */ 2881 mddev_t *mddev = kdev_to_mddev(inode->i_rdev); 2882 if (mddev) 2883 atomic_inc(&mddev->active); 2884 return (0); 2885} 2886 2887static int md_release(struct inode *inode, struct file * file) 2888{ 2889 mddev_t *mddev = kdev_to_mddev(inode->i_rdev); 2890 if (mddev) 2891 atomic_dec(&mddev->active); 2892 return 0; 2893} 2894 2895static struct block_device_operations md_fops= 2896{ 2897 owner: THIS_MODULE, 2898 open: md_open, 2899 release: md_release, 2900 ioctl: md_ioctl, 2901}; 2902 2903 2904int md_thread(void * arg) 2905{ 2906 mdk_thread_t *thread = arg; 2907 2908 md_lock_kernel(); 2909 2910 /* 2911 * Detach thread 2912 */ 2913 2914 daemonize(); 2915 2916 sprintf(current->comm, "%s", thread->name); /* don't treat the name as a format string */ 2917 md_init_signals(); 2918 md_flush_signals(); 2919 thread->tsk = current; 2920 2921 /* 2922 * md_thread is a 'system-thread', its priority should be very 2923 * high. We avoid resource deadlocks individually in each 2924 * raid personality. (RAID5 does preallocation) We run the thread 2925 * SCHED_OTHER at nice -20 (set below), so reconstruction is not 2926 * starved by ordinary process load. 2927 * 2928 * we definitely have to have equal or higher priority than 2929 * bdflush, otherwise bdflush will deadlock if there are too 2930 * many dirty RAID5 blocks.
 2931 */ 2932 current->policy = SCHED_OTHER; 2933 current->nice = -20; 2934 md_unlock_kernel(); 2935 2936 complete(thread->event); 2937 while (thread->run) { 2938 void (*run)(void *data); 2939 DECLARE_WAITQUEUE(wait, current); 2940 2941 add_wait_queue(&thread->wqueue, &wait); 2942 set_task_state(current, TASK_INTERRUPTIBLE); 2943 if (!test_bit(THREAD_WAKEUP, &thread->flags)) { 2944 dprintk("md: thread %p went to sleep.\n", thread); 2945 schedule(); 2946 dprintk("md: thread %p woke up.\n", thread); 2947 } 2948 current->state = TASK_RUNNING; 2949 remove_wait_queue(&thread->wqueue, &wait); 2950 clear_bit(THREAD_WAKEUP, &thread->flags); 2951 2952 run = thread->run; 2953 if (run) { 2954 run(thread->data); 2955 run_task_queue(&tq_disk); 2956 } 2957 if (md_signal_pending(current)) 2958 md_flush_signals(); 2959 } 2960 complete(thread->event); 2961 return 0; 2962} 2963 2964void md_wakeup_thread(mdk_thread_t *thread) 2965{ 2966 dprintk("md: waking up MD thread %p.\n", thread); 2967 set_bit(THREAD_WAKEUP, &thread->flags); 2968 wake_up(&thread->wqueue); 2969} 2970 2971mdk_thread_t *md_register_thread(void (*run) (void *), 2972 void *data, const char *name) 2973{ 2974 mdk_thread_t *thread; 2975 int ret; 2976 struct completion event; 2977 2978 thread = (mdk_thread_t *) kmalloc 2979 (sizeof(mdk_thread_t), GFP_KERNEL); 2980 if (!thread) 2981 return NULL; 2982 2983 memset(thread, 0, sizeof(mdk_thread_t)); 2984 md_init_waitqueue_head(&thread->wqueue); 2985 2986 init_completion(&event); 2987 thread->event = &event; 2988 thread->run = run; 2989 thread->data = data; 2990 thread->name = name; 2991 ret = kernel_thread(md_thread, thread, 0); 2992 if (ret < 0) { 2993 kfree(thread); 2994 return NULL; 2995 } 2996 wait_for_completion(&event); 2997 return thread; 2998} 2999 3000void md_interrupt_thread(mdk_thread_t *thread) 3001{ 3002 if (!thread->tsk) { 3003 MD_BUG(); 3004 return; 3005 } 3006 dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid); 3007 send_sig(SIGKILL, thread->tsk, 1); 3008} 3009 3010void md_unregister_thread(mdk_thread_t *thread) 3011{ 3012 struct completion event; 3013 3014 init_completion(&event); 3015 3016 thread->event = &event; 3017 thread->run = NULL; 3018 thread->name = NULL; 3019 md_interrupt_thread(thread); 3020 wait_for_completion(&event); 3021 kfree(thread); 3022} 3023 3024void md_recover_arrays(void) 3025{ 3026 if (!md_recovery_thread) { 3027 MD_BUG(); 3028 return; 3029 } 3030 md_wakeup_thread(md_recovery_thread); 3031} 3032 3033 3034int md_error(mddev_t *mddev, kdev_t rdev) 3035{ 3036 mdk_rdev_t * rrdev; 3037 3038 dprintk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", 3039 MD_MAJOR, mdidx(mddev), MAJOR(rdev), MINOR(rdev), 3040 __builtin_return_address(0),__builtin_return_address(1), 3041 __builtin_return_address(2),__builtin_return_address(3)); 3042 3043 if (!mddev) { 3044 MD_BUG(); 3045 return 0; 3046 } 3047 rrdev = find_rdev(mddev, rdev); 3048 if (!rrdev || rrdev->faulty) 3049 return 0; 3050 if (!mddev->pers->error_handler 3051 || mddev->pers->error_handler(mddev,rdev) <= 0) { 3052 free_disk_sb(rrdev); 3053 rrdev->faulty = 1; 3054 } else 3055 return 1; 3056 /* 3057 * if recovery was running, stop it now.
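 * (md_recover_arrays() wakes the mdrecoveryd thread, which
 * rescans all arrays in md_do_recovery() and, if a spare is
 * available, starts reconstruction onto it.)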
3058 */ 3059 if (mddev->pers->stop_resync) 3060 mddev->pers->stop_resync(mddev); 3061 if (mddev->recovery_running) 3062 md_interrupt_thread(md_recovery_thread); 3063 md_recover_arrays(); 3064 3065 return 0; 3066} 3067 3068static int status_unused(char * page) 3069{ 3070 int sz = 0, i = 0; 3071 mdk_rdev_t *rdev; 3072 struct md_list_head *tmp; 3073 3074 sz += sprintf(page + sz, "unused devices: "); 3075 3076 ITERATE_RDEV_ALL(rdev,tmp) { 3077 if (!rdev->same_set.next && !rdev->same_set.prev) { 3078 /* 3079 * The device is not yet used by any array. 3080 */ 3081 i++; 3082 sz += sprintf(page + sz, "%s ", 3083 partition_name(rdev->dev)); 3084 } 3085 } 3086 if (!i) 3087 sz += sprintf(page + sz, "<none>"); 3088 3089 sz += sprintf(page + sz, "\n"); 3090 return sz; 3091} 3092 3093 3094static int status_resync(char * page, mddev_t * mddev) 3095{ 3096 int sz = 0; 3097 unsigned long max_blocks, resync, res, dt, db, rt; 3098 3099 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; 3100 max_blocks = mddev->sb->size; 3101 3102 /* 3103 * Should not happen. 3104 */ 3105 if (!max_blocks) { 3106 MD_BUG(); 3107 return 0; 3108 } 3109 res = (resync/1024)*1000/(max_blocks/1024 + 1); 3110 { 3111 int i, x = res/50, y = 20-x; 3112 sz += sprintf(page + sz, "["); 3113 for (i = 0; i < x; i++) 3114 sz += sprintf(page + sz, "="); 3115 sz += sprintf(page + sz, ">"); 3116 for (i = 0; i < y; i++) 3117 sz += sprintf(page + sz, "."); 3118 sz += sprintf(page + sz, "] "); 3119 } 3120 if (!mddev->recovery_running) 3121 /* 3122 * true resync 3123 */ 3124 sz += sprintf(page + sz, " resync =%3lu.%lu%% (%lu/%lu)", 3125 res/10, res % 10, resync, max_blocks); 3126 else 3127 /* 3128 * recovery ... 3129 */ 3130 sz += sprintf(page + sz, " recovery =%3lu.%lu%% (%lu/%lu)", 3131 res/10, res % 10, resync, max_blocks); 3132 3133 /* 3134 * We do not want to overflow, so the order of operands and 3135 * the * 100 / 100 trick are important. We do a +1 to be 3136 * safe against division by zero. We only estimate anyway. 3137 * 3138 * dt: time from mark until now 3139 * db: blocks written from mark until now 3140 * rt: remaining time 3141 */ 3142 dt = ((jiffies - mddev->resync_mark) / HZ); 3143 if (!dt) dt++; 3144 db = resync - (mddev->resync_mark_cnt/2); 3145 rt = (dt * ((max_blocks-resync) / (db/100+1)))/100; 3146 3147 sz += sprintf(page + sz, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); 3148 3149 sz += sprintf(page + sz, " speed=%ldK/sec", db/dt); 3150 3151 return sz; 3152} 3153 3154static int md_status_read_proc(char *page, char **start, off_t off, 3155 int count, int *eof, void *data) 3156{ 3157 int sz = 0, j, size; 3158 struct md_list_head *tmp, *tmp2; 3159 mdk_rdev_t *rdev; 3160 mddev_t *mddev; 3161 3162 sz += sprintf(page + sz, "Personalities : "); 3163 for (j = 0; j < MAX_PERSONALITY; j++) 3164 if (pers[j]) 3165 sz += sprintf(page+sz, "[%s] ", pers[j]->name); 3166 3167 sz += sprintf(page+sz, "\n"); 3168 3169 3170 sz += sprintf(page+sz, "read_ahead "); 3171 if (read_ahead[MD_MAJOR] == INT_MAX) 3172 sz += sprintf(page+sz, "not set\n"); 3173 else 3174 sz += sprintf(page+sz, "%d sectors\n", read_ahead[MD_MAJOR]); 3175 3176 ITERATE_MDDEV(mddev,tmp) { 3177 sz += sprintf(page + sz, "md%d : %sactive", mdidx(mddev), 3178 mddev->pers ? 
"" : "in"); 3179 if (mddev->pers) { 3180 if (mddev->ro) 3181 sz += sprintf(page + sz, " (read-only)"); 3182 sz += sprintf(page + sz, " %s", mddev->pers->name); 3183 } 3184 3185 size = 0; 3186 ITERATE_RDEV(mddev,rdev,tmp2) { 3187 sz += sprintf(page + sz, " %s[%d]", 3188 partition_name(rdev->dev), rdev->desc_nr); 3189 if (rdev->faulty) { 3190 sz += sprintf(page + sz, "(F)"); 3191 continue; 3192 } 3193 size += rdev->size; 3194 } 3195 3196 if (mddev->nb_dev) { 3197 if (mddev->pers) 3198 sz += sprintf(page + sz, "\n %d blocks", 3199 md_size[mdidx(mddev)]); 3200 else 3201 sz += sprintf(page + sz, "\n %d blocks", size); 3202 } 3203 3204 if (!mddev->pers) { 3205 sz += sprintf(page+sz, "\n"); 3206 continue; 3207 } 3208 3209 sz += mddev->pers->status (page+sz, mddev); 3210 3211 sz += sprintf(page+sz, "\n "); 3212 if (mddev->curr_resync) { 3213 sz += status_resync (page+sz, mddev); 3214 } else { 3215 if (sem_getcount(&mddev->resync_sem) != 1) 3216 sz += sprintf(page + sz, " resync=DELAYED"); 3217 } 3218 sz += sprintf(page + sz, "\n"); 3219 } 3220 sz += status_unused(page + sz); 3221 3222 return sz; 3223} 3224 3225int register_md_personality(int pnum, mdk_personality_t *p) 3226{ 3227 if (pnum >= MAX_PERSONALITY) { 3228 MD_BUG(); 3229 return -EINVAL; 3230 } 3231 3232 if (pers[pnum]) { 3233 MD_BUG(); 3234 return -EBUSY; 3235 } 3236 3237 pers[pnum] = p; 3238 printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum); 3239 return 0; 3240} 3241 3242int unregister_md_personality(int pnum) 3243{ 3244 if (pnum >= MAX_PERSONALITY) { 3245 MD_BUG(); 3246 return -EINVAL; 3247 } 3248 3249 printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name); 3250 pers[pnum] = NULL; 3251 return 0; 3252} 3253 3254mdp_disk_t *get_spare(mddev_t *mddev) 3255{ 3256 mdp_super_t *sb = mddev->sb; 3257 mdp_disk_t *disk; 3258 mdk_rdev_t *rdev; 3259 struct md_list_head *tmp; 3260 3261 ITERATE_RDEV(mddev,rdev,tmp) { 3262 if (rdev->faulty) 3263 continue; 3264 if (!rdev->sb) { 3265 MD_BUG(); 3266 continue; 3267 } 3268 disk = &sb->disks[rdev->desc_nr]; 3269 if (disk_faulty(disk)) { 3270 MD_BUG(); 3271 continue; 3272 } 3273 if (disk_active(disk)) 3274 continue; 3275 return disk; 3276 } 3277 return NULL; 3278} 3279 3280static unsigned int sync_io[DK_MAX_MAJOR][DK_MAX_DISK]; 3281void md_sync_acct(kdev_t dev, unsigned long nr_sectors) 3282{ 3283 unsigned int major = MAJOR(dev); 3284 unsigned int index; 3285 3286 index = disk_index(dev); 3287 if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR)) 3288 return; 3289 3290 sync_io[major][index] += nr_sectors; 3291} 3292 3293static int is_mddev_idle(mddev_t *mddev) 3294{ 3295 mdk_rdev_t * rdev; 3296 struct md_list_head *tmp; 3297 int idle; 3298 unsigned long curr_events; 3299 3300 idle = 1; 3301 ITERATE_RDEV(mddev,rdev,tmp) { 3302 int major = MAJOR(rdev->dev); 3303 int idx = disk_index(rdev->dev); 3304 3305 if ((idx >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR)) 3306 continue; 3307 3308 curr_events = kstat.dk_drive_rblk[major][idx] + 3309 kstat.dk_drive_wblk[major][idx] ; 3310 curr_events -= sync_io[major][idx]; 3311 if ((curr_events - rdev->last_events) > 32) { 3312 rdev->last_events = curr_events; 3313 idle = 0; 3314 } 3315 } 3316 return idle; 3317} 3318 3319MD_DECLARE_WAIT_QUEUE_HEAD(resync_wait); 3320 3321void md_done_sync(mddev_t *mddev, int blocks, int ok) 3322{ 3323 /* another "blocks" (512byte) blocks have been synced */ 3324 atomic_sub(blocks, &mddev->recovery_active); 3325 wake_up(&mddev->recovery_wait); 3326 if (!ok) { 3327 // stop recovery, signal do_sync .... 
 3328 if (mddev->pers->stop_resync) 3329 mddev->pers->stop_resync(mddev); 3330 if (mddev->recovery_running) 3331 md_interrupt_thread(md_recovery_thread); 3332 } 3333} 3334 3335#define SYNC_MARKS 10 3336#define SYNC_MARK_STEP (3*HZ) 3337int md_do_sync(mddev_t *mddev, mdp_disk_t *spare) 3338{ 3339 mddev_t *mddev2; 3340 unsigned int max_sectors, currspeed, 3341 j, window, err, serialize; 3342 unsigned long mark[SYNC_MARKS]; 3343 unsigned long mark_cnt[SYNC_MARKS]; 3344 int last_mark,m; 3345 struct md_list_head *tmp; 3346 unsigned long last_check; 3347 3348 3349 err = down_interruptible(&mddev->resync_sem); 3350 if (err) 3351 goto out_nolock; 3352 3353recheck: 3354 serialize = 0; 3355 ITERATE_MDDEV(mddev2,tmp) { 3356 if (mddev2 == mddev) 3357 continue; 3358 if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) { 3359 printk(KERN_INFO "md: delaying resync of md%d until md%d " 3360 "has finished resync (they share one or more physical units)\n", 3361 mdidx(mddev), mdidx(mddev2)); 3362 serialize = 1; 3363 break; 3364 } 3365 } 3366 if (serialize) { 3367 interruptible_sleep_on(&resync_wait); 3368 if (md_signal_pending(current)) { 3369 md_flush_signals(); 3370 err = -EINTR; 3371 goto out; 3372 } 3373 goto recheck; 3374 } 3375 3376 mddev->curr_resync = 1; 3377 3378 max_sectors = mddev->sb->size<<1; 3379 3380 printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev)); 3381 printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed: %d KB/sec/disc.\n", 3382 sysctl_speed_limit_min); 3383 printk(KERN_INFO "md: using maximum available idle IO bandwidth " 3384 "(but not more than %d KB/sec) for reconstruction.\n", 3385 sysctl_speed_limit_max); 3386 3387 /* 3388 * Resync has low priority. 3389 */ 3390 current->nice = 19; 3391 3392 is_mddev_idle(mddev); /* this also initializes IO event counters */ 3393 for (m = 0; m < SYNC_MARKS; m++) { 3394 mark[m] = jiffies; 3395 mark_cnt[m] = 0; 3396 } 3397 last_mark = 0; 3398 mddev->resync_mark = mark[last_mark]; 3399 mddev->resync_mark_cnt = mark_cnt[last_mark]; 3400 3401 /* 3402 * Tune reconstruction: 3403 */ 3404 window = vm_max_readahead*(PAGE_SIZE/512); 3405 printk(KERN_INFO "md: using %dk window, over a total of %d blocks.\n", 3406 window/2,max_sectors/2); 3407 3408 atomic_set(&mddev->recovery_active, 0); 3409 init_waitqueue_head(&mddev->recovery_wait); 3410 last_check = 0; 3411 for (j = 0; j < max_sectors;) { 3412 int sectors; 3413 3414 sectors = mddev->pers->sync_request(mddev, j); 3415 3416 if (sectors < 0) { 3417 err = sectors; 3418 goto out; 3419 } 3420 atomic_add(sectors, &mddev->recovery_active); 3421 j += sectors; 3422 mddev->curr_resync = j; 3423 3424 if (last_check + window > j) 3425 continue; 3426 3427 last_check = j; 3428 3429 run_task_queue(&tq_disk); 3430 3431 repeat: 3432 if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) { 3433 /* step marks */ 3434 int next = (last_mark+1) % SYNC_MARKS; 3435 3436 mddev->resync_mark = mark[next]; 3437 mddev->resync_mark_cnt = mark_cnt[next]; 3438 mark[next] = jiffies; 3439 mark_cnt[next] = j - atomic_read(&mddev->recovery_active); 3440 last_mark = next; 3441 } 3442 3443 3444 if (md_signal_pending(current)) { 3445 /* 3446 * got a signal, exit. 3447 */ 3448 mddev->curr_resync = 0; 3449 printk(KERN_INFO "md: md_do_sync() got signal ... exiting\n"); 3450 md_flush_signals(); 3451 err = -EINTR; 3452 goto out; 3453 } 3454 3455 /* 3456 * this loop exits only when we are slower than the 3457 * 'hard' speed limit, or when the system has been IO-idle for 3458 * a jiffy.
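 * (currspeed below is the recent rate in KB/sec: the sectors
 * completed since the current mark, halved to KB, divided by
 * the seconds elapsed since that mark; the +1 terms merely
 * guard against division by zero.)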
3459 * the system might be non-idle CPU-wise, but we only care 3460 * about not overloading the IO subsystem. (things like an 3461 * e2fsck being done on the RAID array should execute fast) 3462 */ 3463 if (md_need_resched(current)) 3464 schedule(); 3465 3466 currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1; 3467 3468 if (currspeed > sysctl_speed_limit_min) { 3469 current->nice = 19; 3470 3471 if ((currspeed > sysctl_speed_limit_max) || 3472 !is_mddev_idle(mddev)) { 3473 current->state = TASK_INTERRUPTIBLE; 3474 md_schedule_timeout(HZ/4); 3475 goto repeat; 3476 } 3477 } else 3478 current->nice = -20; 3479 } 3480 printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev)); 3481 err = 0; 3482 /* 3483 * this also signals 'finished resyncing' to md_stop 3484 */ 3485out: 3486 wait_disk_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active)==0); 3487 up(&mddev->resync_sem); 3488out_nolock: 3489 mddev->curr_resync = 0; 3490 wake_up(&resync_wait); 3491 return err; 3492} 3493 3494 3495/* 3496 * This is a kernel thread which syncs a spare disk with the active array 3497 * 3498 * the amount of foolproofing might seem to be a tad excessive, but an 3499 * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs 3500 * of my root partition with the first 0.5 gigs of my /home partition ... so 3501 * i'm a bit nervous ;) 3502 */ 3503void md_do_recovery(void *data) 3504{ 3505 int err; 3506 mddev_t *mddev; 3507 mdp_super_t *sb; 3508 mdp_disk_t *spare; 3509 struct md_list_head *tmp; 3510 3511 printk(KERN_INFO "md: recovery thread got woken up ...\n"); 3512restart: 3513 ITERATE_MDDEV(mddev,tmp) { 3514 sb = mddev->sb; 3515 if (!sb) 3516 continue; 3517 if (mddev->recovery_running) 3518 continue; 3519 if (sb->active_disks == sb->raid_disks) 3520 continue; 3521 if (mddev->sb_dirty) 3522 md_update_sb(mddev); 3523 if (!sb->spare_disks) { 3524 printk(KERN_ERR "md%d: no spare disk to reconstruct array! " 3525 "-- continuing in degraded mode\n", mdidx(mddev)); 3526 continue; 3527 } 3528 /* 3529 * now here we get the spare and resync it. 3530 */ 3531 spare = get_spare(mddev); 3532 if (!spare) 3533 continue; 3534 printk(KERN_INFO "md%d: resyncing spare disk %s to replace failed disk\n", 3535 mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor))); 3536 if (!mddev->pers->diskop) 3537 continue; 3538 if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE)) 3539 continue; 3540 down(&mddev->recovery_sem); 3541 mddev->recovery_running = 1; 3542 err = md_do_sync(mddev, spare); 3543 if (err == -EIO) { 3544 printk(KERN_INFO "md%d: spare disk %s failed, skipping to next spare.\n", 3545 mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor))); 3546 if (!disk_faulty(spare)) { 3547 mddev->pers->diskop(mddev,&spare,DISKOP_SPARE_INACTIVE); 3548 mark_disk_faulty(spare); 3549 mark_disk_nonsync(spare); 3550 mark_disk_inactive(spare); 3551 sb->spare_disks--; 3552 sb->working_disks--; 3553 sb->failed_disks++; 3554 } 3555 } else 3556 if (disk_faulty(spare)) 3557 mddev->pers->diskop(mddev, &spare, 3558 DISKOP_SPARE_INACTIVE); 3559 if (err == -EINTR || err == -ENOMEM) { 3560 /* 3561 * Recovery got interrupted, or ran out of mem ... 3562 * signal back that we have finished using the array. 
3563 */ 3564 mddev->pers->diskop(mddev, &spare, 3565 DISKOP_SPARE_INACTIVE); 3566 up(&mddev->recovery_sem); 3567 mddev->recovery_running = 0; 3568 continue; 3569 } else { 3570 mddev->recovery_running = 0; 3571 up(&mddev->recovery_sem); 3572 } 3573 if (!disk_faulty(spare)) { 3574 /* 3575 * the SPARE_ACTIVE diskop possibly changes the 3576 * pointer too 3577 */ 3578 mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE); 3579 mark_disk_sync(spare); 3580 mark_disk_active(spare); 3581 sb->active_disks++; 3582 sb->spare_disks--; 3583 } 3584 mddev->sb_dirty = 1; 3585 md_update_sb(mddev); 3586 goto restart; 3587 } 3588 printk(KERN_INFO "md: recovery thread finished ...\n"); 3589 3590} 3591 3592int md_notify_reboot(struct notifier_block *this, 3593 unsigned long code, void *x) 3594{ 3595 struct md_list_head *tmp; 3596 mddev_t *mddev; 3597 3598 if ((code == MD_SYS_DOWN) || (code == MD_SYS_HALT) 3599 || (code == MD_SYS_POWER_OFF)) { 3600 3601 printk(KERN_INFO "md: stopping all md devices.\n"); 3602 3603 ITERATE_MDDEV(mddev,tmp) 3604 do_md_stop (mddev, 1); 3605 /* 3606 * certain more exotic SCSI devices are known to be 3607 * volatile wrt too early system reboots. While the 3608 * right place to handle this issue is the given 3609 * driver, we do want to have a safe RAID driver ... 3610 */ 3611 md_mdelay(1000*1); 3612 } 3613 return NOTIFY_DONE; 3614} 3615 3616struct notifier_block md_notifier = { 3617 notifier_call: md_notify_reboot, 3618 next: NULL, 3619 priority: INT_MAX, /* before any real devices */ 3620}; 3621 3622static void md_geninit(void) 3623{ 3624 int i; 3625 3626 for(i = 0; i < MAX_MD_DEVS; i++) { 3627 md_blocksizes[i] = 1024; 3628 md_size[i] = 0; 3629 md_hardsect_sizes[i] = 512; 3630 md_maxreadahead[i] = MD_READAHEAD; 3631 } 3632 blksize_size[MAJOR_NR] = md_blocksizes; 3633 blk_size[MAJOR_NR] = md_size; 3634 max_readahead[MAJOR_NR] = md_maxreadahead; 3635 hardsect_size[MAJOR_NR] = md_hardsect_sizes; 3636 3637 dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 3638 3639#ifdef CONFIG_PROC_FS 3640 create_proc_read_entry("mdstat", 0, NULL, md_status_read_proc, NULL); 3641#endif 3642} 3643 3644int md__init md_init(void) 3645{ 3646 static char * name = "mdrecoveryd"; 3647 int minor; 3648 3649 printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d, MD_SB_DISKS=%d\n", 3650 MD_MAJOR_VERSION, MD_MINOR_VERSION, 3651 MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS); 3652 3653 if (devfs_register_blkdev (MAJOR_NR, "md", &md_fops)) 3654 { 3655 printk(KERN_ALERT "md: Unable to get major %d for md\n", MAJOR_NR); 3656 return (-1); 3657 } 3658 devfs_handle = devfs_mk_dir (NULL, "md", NULL); 3659 /* we don't use devfs_register_series because we want to fill md_hd_struct */ 3660 for (minor=0; minor < MAX_MD_DEVS; ++minor) { 3661 char devname[128]; 3662 sprintf (devname, "%u", minor); 3663 md_hd_struct[minor].de = devfs_register (devfs_handle, 3664 devname, DEVFS_FL_DEFAULT, MAJOR_NR, minor, 3665 S_IFBLK | S_IRUSR | S_IWUSR, &md_fops, NULL); 3666 } 3667 3668 /* forward all md request to md_make_request */ 3669 blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR), md_make_request); 3670 3671 3672 read_ahead[MAJOR_NR] = INT_MAX; 3673 3674 add_gendisk(&md_gendisk); 3675 3676 md_recovery_thread = md_register_thread(md_do_recovery, NULL, name); 3677 if (!md_recovery_thread) 3678 printk(KERN_ALERT "md: bug: couldn't allocate md_recovery_thread\n"); 3679 3680 md_register_reboot_notifier(&md_notifier); 3681 raid_table_header = register_sysctl_table(raid_root_table, 1); 3682 3683 md_geninit(); 3684 
return (0); 3685} 3686 3687 3688#ifndef MODULE 3689 3690/* 3691 * When md (and any required personalities) are compiled into the kernel 3692 * (not a module), arrays can be assembled at boot time: with AUTODETECT, 3693 * where specially marked partitions are registered with md_autodetect_dev(), 3694 * or with MD_BOOT, where the devices to be collected are given on the boot line 3695 * with md=..... 3696 * The code for that is here. 3697 */ 3698 3699struct { 3700 int set; 3701 int noautodetect; 3702} raid_setup_args md__initdata; 3703 3704/* 3705 * Searches all registered partitions for autorun RAID arrays 3706 * at boot time. 3707 */ 3708static kdev_t detected_devices[128]; 3709static int dev_cnt; 3710 3711void md_autodetect_dev(kdev_t dev) 3712{ 3713 if (dev_cnt >= 0 && dev_cnt < 127) 3714 detected_devices[dev_cnt++] = dev; 3715} 3716 3717 3718static void autostart_arrays(void) 3719{ 3720 mdk_rdev_t *rdev; 3721 int i; 3722 3723 printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); 3724 3725 for (i = 0; i < dev_cnt; i++) { 3726 kdev_t dev = detected_devices[i]; 3727 3728 if (md_import_device(dev,1)) { 3729 printk(KERN_ALERT "md: could not import %s!\n", 3730 partition_name(dev)); 3731 continue; 3732 } 3733 /* 3734 * Sanity checks: 3735 */ 3736 rdev = find_rdev_all(dev); 3737 if (!rdev) { 3738 MD_BUG(); 3739 continue; 3740 } 3741 if (rdev->faulty) { 3742 MD_BUG(); 3743 continue; 3744 } 3745 md_list_add(&rdev->pending, &pending_raid_disks); 3746 } 3747 dev_cnt = 0; 3748 3749 autorun_devices(-1); 3750} 3751 3752static struct { 3753 char device_set [MAX_MD_DEVS]; 3754 int pers[MAX_MD_DEVS]; 3755 int chunk[MAX_MD_DEVS]; 3756 char *device_names[MAX_MD_DEVS]; 3757} md_setup_args md__initdata; 3758 3759/* 3760 * Parse the command-line parameters given to our kernel, but do not 3761 * actually try to invoke the MD device now; that is handled by 3762 * md_setup_drive after the low-level disk drivers have initialised. 3763 * 3764 * 27/11/1999: Fixed to work correctly with the 2.3 kernel (which 3765 * assigns the task of parsing integer arguments to the 3766 * invoked program now). Added ability to initialise all 3767 * the MD devices (by specifying multiple "md=" lines) 3768 * instead of just one. -- KTK 3769 * 18May2000: Added support for persistent-superblock arrays: 3770 * md=n,0,factor,fault,device-list uses RAID0 for device n 3771 * md=n,-1,factor,fault,device-list uses LINEAR for device n 3772 * md=n,device-list reads a RAID superblock from the devices 3773 * elements in device-list are read by name_to_kdev_t so they can be 3774 * a hex number or something like /dev/hda1 /dev/sdb 3775 * 2001-06-03: Dave Cinege <dcinege@psychosis.com> 3776 * Shifted name_to_kdev_t() and related operations to md_set_drive() 3777 * for later execution. Rewrote section to make devfs compatible. 3778 */ 3779static int md__init md_setup(char *str) 3780{ 3781 int minor, level, factor, fault; 3782 char *pername = ""; 3783 char *str1 = str; 3784 3785 if (get_option(&str, &minor) != 2) { /* MD Number */ 3786 printk(KERN_WARNING "md: Too few arguments supplied to md=.\n"); 3787 return 0; 3788 } 3789 if (minor >= MAX_MD_DEVS) { 3790 printk(KERN_WARNING "md: md=%d, Minor device number too high.\n", minor); 3791 return 0; 3792 } else if (md_setup_args.device_names[minor]) { 3793 printk(KERN_WARNING "md: md=%d, Specified more than once. " 3794 "Replacing previous definition.\n", minor); 3795 } 3796 switch (get_option(&str, &level)) { /* RAID Personality */ 3797 case 2: /* could be 0 or -1..
*/ 3798 if (level == 0 || level == -1) { 3799 if (get_option(&str, &factor) != 2 || /* Chunk Size */ 3800 get_option(&str, &fault) != 2) { 3801 printk(KERN_WARNING "md: Too few arguments supplied to md=.\n"); 3802 return 0; 3803 } 3804 md_setup_args.pers[minor] = level; 3805 md_setup_args.chunk[minor] = 1 << (factor+12); 3806 switch(level) { 3807 case -1: 3808 level = LINEAR; 3809 pername = "linear"; 3810 break; 3811 case 0: 3812 level = RAID0; 3813 pername = "raid0"; 3814 break; 3815 default: 3816 printk(KERN_WARNING 3817 "md: The kernel has not been configured for raid%d support!\n", 3818 level); 3819 return 0; 3820 } 3821 md_setup_args.pers[minor] = level; 3822 break; 3823 } 3824 /* FALL THROUGH */ 3825 case 1: /* the first device is numeric */ 3826 str = str1; 3827 /* FALL THROUGH */ 3828 case 0: 3829 md_setup_args.pers[minor] = 0; 3830 pername="super-block"; 3831 } 3832 3833 printk(KERN_INFO "md: Will configure md%d (%s) from %s, below.\n", 3834 minor, pername, str); 3835 md_setup_args.device_names[minor] = str; 3836 3837 return 1; 3838} 3839 3840extern kdev_t name_to_kdev_t(char *line) md__init; 3841void md__init md_setup_drive(void) 3842{ 3843 int minor, i; 3844 kdev_t dev; 3845 mddev_t*mddev; 3846 kdev_t devices[MD_SB_DISKS+1]; 3847 3848 for (minor = 0; minor < MAX_MD_DEVS; minor++) { 3849 int err = 0; 3850 char *devname; 3851 mdu_disk_info_t dinfo; 3852 3853 if ((devname = md_setup_args.device_names[minor]) == 0) continue; 3854 3855 for (i = 0; i < MD_SB_DISKS && devname != 0; i++) { 3856 3857 char *p; 3858 void *handle; 3859 3860 p = strchr(devname, ','); 3861 if (p) 3862 *p++ = 0; 3863 3864 dev = name_to_kdev_t(devname); 3865 handle = devfs_find_handle(NULL, devname, MAJOR (dev), MINOR (dev), 3866 DEVFS_SPECIAL_BLK, 1); 3867 if (handle != 0) { 3868 unsigned major, minor; 3869 devfs_get_maj_min(handle, &major, &minor); 3870 dev = MKDEV(major, minor); 3871 } 3872 if (dev == 0) { 3873 printk(KERN_WARNING "md: Unknown device name: %s\n", devname); 3874 break; 3875 } 3876 3877 devices[i] = dev; 3878 md_setup_args.device_set[minor] = 1; 3879 3880 devname = p; 3881 } 3882 devices[i] = 0; 3883 3884 if (md_setup_args.device_set[minor] == 0) 3885 continue; 3886 3887 if (mddev_map[minor].mddev) { 3888 printk(KERN_WARNING 3889 "md: Ignoring md=%d, already autodetected. 
(Use raid=noautodetect)\n", 3890 minor); 3891 continue; 3892 } 3893 printk(KERN_INFO "md: Loading md%d: %s\n", minor, md_setup_args.device_names[minor]); 3894 3895 mddev = alloc_mddev(MKDEV(MD_MAJOR,minor)); 3896 if (!mddev) { 3897 printk(KERN_ERR "md: kmalloc failed - cannot start array %d\n", minor); 3898 continue; 3899 } 3900 if (md_setup_args.pers[minor]) { 3901 /* non-persistent */ 3902 mdu_array_info_t ainfo; 3903 ainfo.level = pers_to_level(md_setup_args.pers[minor]); 3904 ainfo.size = 0; 3905 ainfo.nr_disks =0; 3906 ainfo.raid_disks =0; 3907 ainfo.md_minor =minor; 3908 ainfo.not_persistent = 1; 3909 3910 ainfo.state = (1 << MD_SB_CLEAN); 3911 ainfo.active_disks = 0; 3912 ainfo.working_disks = 0; 3913 ainfo.failed_disks = 0; 3914 ainfo.spare_disks = 0; 3915 ainfo.layout = 0; 3916 ainfo.chunk_size = md_setup_args.chunk[minor]; 3917 err = set_array_info(mddev, &ainfo); 3918 for (i = 0; !err && (dev = devices[i]); i++) { 3919 dinfo.number = i; 3920 dinfo.raid_disk = i; 3921 dinfo.state = (1<<MD_DISK_ACTIVE)|(1<<MD_DISK_SYNC); 3922 dinfo.major = MAJOR(dev); 3923 dinfo.minor = MINOR(dev); 3924 mddev->sb->nr_disks++; 3925 mddev->sb->raid_disks++; 3926 mddev->sb->active_disks++; 3927 mddev->sb->working_disks++; 3928 err = add_new_disk (mddev, &dinfo); 3929 } 3930 } else { 3931 /* persistent */ 3932 for (i = 0; (dev = devices[i]); i++) { 3933 dinfo.major = MAJOR(dev); 3934 dinfo.minor = MINOR(dev); 3935 add_new_disk (mddev, &dinfo); 3936 } 3937 } 3938 if (!err) 3939 err = do_md_run(mddev); 3940 if (err) { 3941 mddev->sb_dirty = 0; 3942 do_md_stop(mddev, 0); 3943 printk(KERN_WARNING "md: starting md%d failed\n", minor); 3944 } 3945 } 3946} 3947 3948static int md__init raid_setup(char *str) 3949{ 3950 int len, pos; 3951 3952 len = strlen(str) + 1; 3953 pos = 0; 3954 3955 while (pos < len) { 3956 char *comma = strchr(str+pos, ','); 3957 int wlen; 3958 if (comma) 3959 wlen = (comma-str)-pos; 3960 else wlen = (len-1)-pos; 3961 3962 if (strncmp(str, "noautodetect", wlen) == 0) 3963 raid_setup_args.noautodetect = 1; 3964 pos += wlen+1; 3965 } 3966 raid_setup_args.set = 1; 3967 return 1; 3968} 3969 3970int md__init md_run_setup(void) 3971{ 3972 if (raid_setup_args.noautodetect) 3973 printk(KERN_INFO "md: Skipping autodetection of RAID arrays. 
(raid=noautodetect)\n"); 3974 else 3975 autostart_arrays(); 3976 md_setup_drive(); 3977 return 0; 3978} 3979 3980__setup("raid=", raid_setup); 3981__setup("md=", md_setup); 3982 3983__initcall(md_init); 3984__initcall(md_run_setup); 3985 3986#else /* It is a MODULE */ 3987 3988int init_module(void) 3989{ 3990 return md_init(); 3991} 3992 3993static void free_device_names(void) 3994{ 3995 while (device_names.next != &device_names) { 3996 struct list_head *tmp = device_names.next; 3997 list_del(tmp); 3998 kfree(tmp); 3999 } 4000} 4001 4002 4003void cleanup_module(void) 4004{ 4005 md_unregister_thread(md_recovery_thread); 4006 devfs_unregister(devfs_handle); 4007 4008 devfs_unregister_blkdev(MAJOR_NR,"md"); 4009 unregister_reboot_notifier(&md_notifier); 4010 unregister_sysctl_table(raid_table_header); 4011#ifdef CONFIG_PROC_FS 4012 remove_proc_entry("mdstat", NULL); 4013#endif 4014 4015 del_gendisk(&md_gendisk); 4016 4017 blk_dev[MAJOR_NR].queue = NULL; 4018 blksize_size[MAJOR_NR] = NULL; 4019 blk_size[MAJOR_NR] = NULL; 4020 max_readahead[MAJOR_NR] = NULL; 4021 hardsect_size[MAJOR_NR] = NULL; 4022 4023 free_device_names(); 4024 4025} 4026#endif 4027 4028MD_EXPORT_SYMBOL(md_size); 4029MD_EXPORT_SYMBOL(register_md_personality); 4030MD_EXPORT_SYMBOL(unregister_md_personality); 4031MD_EXPORT_SYMBOL(partition_name); 4032MD_EXPORT_SYMBOL(md_error); 4033MD_EXPORT_SYMBOL(md_do_sync); 4034MD_EXPORT_SYMBOL(md_sync_acct); 4035MD_EXPORT_SYMBOL(md_done_sync); 4036MD_EXPORT_SYMBOL(md_recover_arrays); 4037MD_EXPORT_SYMBOL(md_register_thread); 4038MD_EXPORT_SYMBOL(md_unregister_thread); 4039MD_EXPORT_SYMBOL(md_update_sb); 4040MD_EXPORT_SYMBOL(md_wakeup_thread); 4041MD_EXPORT_SYMBOL(md_print_devices); 4042MD_EXPORT_SYMBOL(find_rdev_nr); 4043MD_EXPORT_SYMBOL(md_interrupt_thread); 4044MD_EXPORT_SYMBOL(mddev_map); 4045MD_EXPORT_SYMBOL(md_check_ordering); 4046MD_EXPORT_SYMBOL(get_spare); 4047MODULE_LICENSE("GPL"); 4048
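/*
 * Illustrative sketch only -- not part of this driver.  A user-space
 * view of the ioctl sequence md_setup_drive() performs in-kernel for a
 * non-persistent array: SET_ARRAY_INFO, one ADD_NEW_DISK per component,
 * then RUN_ARRAY.  Real tools (raidtools/mdadm) do rather more
 * bookkeeping and error handling; the function name below is made up,
 * and the MD_DISK_* / MD_SB_CLEAN bits come from <linux/raid/md_p.h>.
 */
#if 0	/* example, not compiled into the driver */
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/sysmacros.h>
#include <linux/raid/md_u.h>
#include <linux/raid/md_p.h>

static int build_raid0_pair(const char *mdname, dev_t d0, dev_t d1)
{
	mdu_array_info_t ainfo;
	mdu_disk_info_t dinfo;
	dev_t part[2] = { d0, d1 };
	int i, fd = open(mdname, O_RDWR);

	if (fd < 0)
		return -1;

	/* describe the array itself, cf. set_array_info() */
	memset(&ainfo, 0, sizeof(ainfo));
	ainfo.level = 0;			/* RAID0 */
	ainfo.raid_disks = ainfo.nr_disks = 2;
	ainfo.active_disks = ainfo.working_disks = 2;
	ainfo.chunk_size = 64 * 1024;
	ainfo.not_persistent = 1;		/* no on-disk superblock */
	ainfo.state = (1 << MD_SB_CLEAN);
	if (ioctl(fd, SET_ARRAY_INFO, &ainfo) < 0)
		goto fail;

	/* describe each component, cf. add_new_disk() */
	for (i = 0; i < 2; i++) {
		memset(&dinfo, 0, sizeof(dinfo));
		dinfo.number = dinfo.raid_disk = i;
		dinfo.state = (1 << MD_DISK_ACTIVE) | (1 << MD_DISK_SYNC);
		dinfo.major = major(part[i]);
		dinfo.minor = minor(part[i]);
		if (ioctl(fd, ADD_NEW_DISK, &dinfo) < 0)
			goto fail;
	}

	if (ioctl(fd, RUN_ARRAY, 0) < 0)	/* the argument is unused */
		goto fail;
	close(fd);
	return 0;
fail:
	close(fd);
	return -1;
}
#endif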