1/* 2 md.c : Multiple Devices driver for Linux 3 Copyright (C) 1998, 1999, 2000 Ingo Molnar 4 5 completely rewritten, based on the MD driver code from Marc Zyngier 6 7 Changes: 8 9 - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar 10 - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com> 11 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> 12 - kerneld support by Boris Tobotras <boris@xtalk.msk.su> 13 - kmod support by: Cyrus Durgin 14 - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> 15 - Devfs support by Richard Gooch <rgooch@atnf.csiro.au> 16 17 - lots of fixes and improvements to the RAID1/RAID5 and generic 18 RAID code (such as request based resynchronization): 19 20 Neil Brown <neilb@cse.unsw.edu.au>. 21 22 - persistent bitmap code 23 Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. 24 25 This program is free software; you can redistribute it and/or modify 26 it under the terms of the GNU General Public License as published by 27 the Free Software Foundation; either version 2, or (at your option) 28 any later version. 29 30 You should have received a copy of the GNU General Public License 31 (for example /usr/src/linux/COPYING); if not, write to the Free 32 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
33*/ 34 35#include <linux/module.h> 36#include <linux/kernel.h> 37#include <linux/kthread.h> 38#include <linux/linkage.h> 39#include <linux/raid/md.h> 40#include <linux/raid/bitmap.h> 41#include <linux/sysctl.h> 42#include <linux/buffer_head.h> /* for invalidate_bdev */ 43#include <linux/poll.h> 44#include <linux/mutex.h> 45#include <linux/ctype.h> 46#include <linux/freezer.h> 47 48#include <linux/init.h> 49 50#include <linux/file.h> 51 52#ifdef CONFIG_KMOD 53#include <linux/kmod.h> 54#endif 55 56#include <asm/unaligned.h> 57 58#define MAJOR_NR MD_MAJOR 59#define MD_DRIVER 60 61/* 63 partitions with the alternate major number (mdp) */ 62#define MdpMinorShift 6 63 64#define DEBUG 0 65#define dprintk(x...) ((void)(DEBUG && printk(x))) 66 67 68#ifndef MODULE 69static void autostart_arrays (int part); 70#endif 71 72static LIST_HEAD(pers_list); 73static DEFINE_SPINLOCK(pers_lock); 74 75static void md_print_devices(void); 76 77#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } 78 79/* 80 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' 81 * is 1000 KB/sec, so the extra system load does not show up that much. 82 * Increase it if you want to have more _guaranteed_ speed. Note that 83 * the RAID driver will use the maximum available bandwidth if the IO 84 * subsystem is idle. There is also an 'absolute maximum' reconstruction 85 * speed limit - in case reconstruction slows down your system despite 86 * idle IO detection. 87 * 88 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. 89 * or /sys/block/mdX/md/sync_speed_{min,max} 90 */ 91 92static int sysctl_speed_limit_min = 1000; 93static int sysctl_speed_limit_max = 200000; 94static inline int speed_min(mddev_t *mddev) 95{ 96 return mddev->sync_speed_min ? 97 mddev->sync_speed_min : sysctl_speed_limit_min; 98} 99 100static inline int speed_max(mddev_t *mddev) 101{ 102 return mddev->sync_speed_max ? 
103 mddev->sync_speed_max : sysctl_speed_limit_max; 104} 105 106static struct ctl_table_header *raid_table_header; 107 108static ctl_table raid_table[] = { 109 { 110 .ctl_name = DEV_RAID_SPEED_LIMIT_MIN, 111 .procname = "speed_limit_min", 112 .data = &sysctl_speed_limit_min, 113 .maxlen = sizeof(int), 114 .mode = S_IRUGO|S_IWUSR, 115 .proc_handler = &proc_dointvec, 116 }, 117 { 118 .ctl_name = DEV_RAID_SPEED_LIMIT_MAX, 119 .procname = "speed_limit_max", 120 .data = &sysctl_speed_limit_max, 121 .maxlen = sizeof(int), 122 .mode = S_IRUGO|S_IWUSR, 123 .proc_handler = &proc_dointvec, 124 }, 125 { .ctl_name = 0 } 126}; 127 128static ctl_table raid_dir_table[] = { 129 { 130 .ctl_name = DEV_RAID, 131 .procname = "raid", 132 .maxlen = 0, 133 .mode = S_IRUGO|S_IXUGO, 134 .child = raid_table, 135 }, 136 { .ctl_name = 0 } 137}; 138 139static ctl_table raid_root_table[] = { 140 { 141 .ctl_name = CTL_DEV, 142 .procname = "dev", 143 .maxlen = 0, 144 .mode = 0555, 145 .child = raid_dir_table, 146 }, 147 { .ctl_name = 0 } 148}; 149 150static struct block_device_operations md_fops; 151 152static int start_readonly; 153 154/* 155 * We have a system wide 'event count' that is incremented 156 * on any 'interesting' event, and readers of /proc/mdstat 157 * can use 'poll' or 'select' to find out when the event 158 * count increases. 159 * 160 * Events are: 161 * start array, stop array, error, add device, remove device, 162 * start build, activate spare 163 */ 164static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); 165static atomic_t md_event_count; 166void md_new_event(mddev_t *mddev) 167{ 168 atomic_inc(&md_event_count); 169 wake_up(&md_event_waiters); 170 sysfs_notify(&mddev->kobj, NULL, "sync_action"); 171} 172EXPORT_SYMBOL_GPL(md_new_event); 173 174/* Alternate version that can be called from interrupts 175 * when calling sysfs_notify isn't needed. 
176 */ 177static void md_new_event_inintr(mddev_t *mddev) 178{ 179 atomic_inc(&md_event_count); 180 wake_up(&md_event_waiters); 181} 182 183/* 184 * Enables to iterate over all existing md arrays 185 * all_mddevs_lock protects this list. 186 */ 187static LIST_HEAD(all_mddevs); 188static DEFINE_SPINLOCK(all_mddevs_lock); 189 190 191/* 192 * iterates through all used mddevs in the system. 193 * We take care to grab the all_mddevs_lock whenever navigating 194 * the list, and to always hold a refcount when unlocked. 195 * Any code which breaks out of this loop while own 196 * a reference to the current mddev and must mddev_put it. 197 */ 198#define ITERATE_MDDEV(mddev,tmp) \ 199 \ 200 for (({ spin_lock(&all_mddevs_lock); \ 201 tmp = all_mddevs.next; \ 202 mddev = NULL;}); \ 203 ({ if (tmp != &all_mddevs) \ 204 mddev_get(list_entry(tmp, mddev_t, all_mddevs));\ 205 spin_unlock(&all_mddevs_lock); \ 206 if (mddev) mddev_put(mddev); \ 207 mddev = list_entry(tmp, mddev_t, all_mddevs); \ 208 tmp != &all_mddevs;}); \ 209 ({ spin_lock(&all_mddevs_lock); \ 210 tmp = tmp->next;}) \ 211 ) 212 213 214static int md_fail_request (request_queue_t *q, struct bio *bio) 215{ 216 bio_io_error(bio, bio->bi_size); 217 return 0; 218} 219 220static inline mddev_t *mddev_get(mddev_t *mddev) 221{ 222 atomic_inc(&mddev->active); 223 return mddev; 224} 225 226static void mddev_put(mddev_t *mddev) 227{ 228 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) 229 return; 230 if (!mddev->raid_disks && list_empty(&mddev->disks)) { 231 list_del(&mddev->all_mddevs); 232 spin_unlock(&all_mddevs_lock); 233 blk_cleanup_queue(mddev->queue); 234 kobject_unregister(&mddev->kobj); 235 } else 236 spin_unlock(&all_mddevs_lock); 237} 238 239static mddev_t * mddev_find(dev_t unit) 240{ 241 mddev_t *mddev, *new = NULL; 242 243 retry: 244 spin_lock(&all_mddevs_lock); 245 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 246 if (mddev->unit == unit) { 247 mddev_get(mddev); 248 spin_unlock(&all_mddevs_lock); 
249 kfree(new); 250 return mddev; 251 } 252 253 if (new) { 254 list_add(&new->all_mddevs, &all_mddevs); 255 spin_unlock(&all_mddevs_lock); 256 return new; 257 } 258 spin_unlock(&all_mddevs_lock); 259 260 new = kzalloc(sizeof(*new), GFP_KERNEL); 261 if (!new) 262 return NULL; 263 264 new->unit = unit; 265 if (MAJOR(unit) == MD_MAJOR) 266 new->md_minor = MINOR(unit); 267 else 268 new->md_minor = MINOR(unit) >> MdpMinorShift; 269 270 mutex_init(&new->reconfig_mutex); 271 INIT_LIST_HEAD(&new->disks); 272 INIT_LIST_HEAD(&new->all_mddevs); 273 init_timer(&new->safemode_timer); 274 atomic_set(&new->active, 1); 275 spin_lock_init(&new->write_lock); 276 init_waitqueue_head(&new->sb_wait); 277 new->reshape_position = MaxSector; 278 279 new->queue = blk_alloc_queue(GFP_KERNEL); 280 if (!new->queue) { 281 kfree(new); 282 return NULL; 283 } 284 set_bit(QUEUE_FLAG_CLUSTER, &new->queue->queue_flags); 285 286 blk_queue_make_request(new->queue, md_fail_request); 287 288 goto retry; 289} 290 291static inline int mddev_lock(mddev_t * mddev) 292{ 293 return mutex_lock_interruptible(&mddev->reconfig_mutex); 294} 295 296static inline int mddev_trylock(mddev_t * mddev) 297{ 298 return mutex_trylock(&mddev->reconfig_mutex); 299} 300 301static inline void mddev_unlock(mddev_t * mddev) 302{ 303 mutex_unlock(&mddev->reconfig_mutex); 304 305 md_wakeup_thread(mddev->thread); 306} 307 308static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) 309{ 310 mdk_rdev_t * rdev; 311 struct list_head *tmp; 312 313 ITERATE_RDEV(mddev,rdev,tmp) { 314 if (rdev->desc_nr == nr) 315 return rdev; 316 } 317 return NULL; 318} 319 320static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) 321{ 322 struct list_head *tmp; 323 mdk_rdev_t *rdev; 324 325 ITERATE_RDEV(mddev,rdev,tmp) { 326 if (rdev->bdev->bd_dev == dev) 327 return rdev; 328 } 329 return NULL; 330} 331 332static struct mdk_personality *find_pers(int level, char *clevel) 333{ 334 struct mdk_personality *pers; 335 list_for_each_entry(pers, &pers_list, 
list) { 336 if (level != LEVEL_NONE && pers->level == level) 337 return pers; 338 if (strcmp(pers->name, clevel)==0) 339 return pers; 340 } 341 return NULL; 342} 343 344static inline sector_t calc_dev_sboffset(struct block_device *bdev) 345{ 346 sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 347 return MD_NEW_SIZE_BLOCKS(size); 348} 349 350static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size) 351{ 352 sector_t size; 353 354 size = rdev->sb_offset; 355 356 if (chunk_size) 357 size &= ~((sector_t)chunk_size/1024 - 1); 358 return size; 359} 360 361static int alloc_disk_sb(mdk_rdev_t * rdev) 362{ 363 if (rdev->sb_page) 364 MD_BUG(); 365 366 rdev->sb_page = alloc_page(GFP_KERNEL); 367 if (!rdev->sb_page) { 368 printk(KERN_ALERT "md: out of memory.\n"); 369 return -EINVAL; 370 } 371 372 return 0; 373} 374 375static void free_disk_sb(mdk_rdev_t * rdev) 376{ 377 if (rdev->sb_page) { 378 put_page(rdev->sb_page); 379 rdev->sb_loaded = 0; 380 rdev->sb_page = NULL; 381 rdev->sb_offset = 0; 382 rdev->size = 0; 383 } 384} 385 386 387static int super_written(struct bio *bio, unsigned int bytes_done, int error) 388{ 389 mdk_rdev_t *rdev = bio->bi_private; 390 mddev_t *mddev = rdev->mddev; 391 if (bio->bi_size) 392 return 1; 393 394 if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) { 395 printk("md: super_written gets error=%d, uptodate=%d\n", 396 error, test_bit(BIO_UPTODATE, &bio->bi_flags)); 397 WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags)); 398 md_error(mddev, rdev); 399 } 400 401 if (atomic_dec_and_test(&mddev->pending_writes)) 402 wake_up(&mddev->sb_wait); 403 bio_put(bio); 404 return 0; 405} 406 407static int super_written_barrier(struct bio *bio, unsigned int bytes_done, int error) 408{ 409 struct bio *bio2 = bio->bi_private; 410 mdk_rdev_t *rdev = bio2->bi_private; 411 mddev_t *mddev = rdev->mddev; 412 if (bio->bi_size) 413 return 1; 414 415 if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && 416 error == -EOPNOTSUPP) { 417 unsigned long flags; 
418 /* barriers don't appear to be supported :-( */ 419 set_bit(BarriersNotsupp, &rdev->flags); 420 mddev->barriers_work = 0; 421 spin_lock_irqsave(&mddev->write_lock, flags); 422 bio2->bi_next = mddev->biolist; 423 mddev->biolist = bio2; 424 spin_unlock_irqrestore(&mddev->write_lock, flags); 425 wake_up(&mddev->sb_wait); 426 bio_put(bio); 427 return 0; 428 } 429 bio_put(bio2); 430 bio->bi_private = rdev; 431 return super_written(bio, bytes_done, error); 432} 433 434void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, 435 sector_t sector, int size, struct page *page) 436{ 437 /* write first size bytes of page to sector of rdev 438 * Increment mddev->pending_writes before returning 439 * and decrement it on completion, waking up sb_wait 440 * if zero is reached. 441 * If an error occurred, call md_error 442 * 443 * As we might need to resubmit the request if BIO_RW_BARRIER 444 * causes ENOTSUPP, we allocate a spare bio... 445 */ 446 struct bio *bio = bio_alloc(GFP_NOIO, 1); 447 int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC); 448 449 bio->bi_bdev = rdev->bdev; 450 bio->bi_sector = sector; 451 bio_add_page(bio, page, size, 0); 452 bio->bi_private = rdev; 453 bio->bi_end_io = super_written; 454 bio->bi_rw = rw; 455 456 atomic_inc(&mddev->pending_writes); 457 if (!test_bit(BarriersNotsupp, &rdev->flags)) { 458 struct bio *rbio; 459 rw |= (1<<BIO_RW_BARRIER); 460 rbio = bio_clone(bio, GFP_NOIO); 461 rbio->bi_private = bio; 462 rbio->bi_end_io = super_written_barrier; 463 submit_bio(rw, rbio); 464 } else 465 submit_bio(rw, bio); 466} 467 468void md_super_wait(mddev_t *mddev) 469{ 470 /* wait for all superblock writes that were scheduled to complete. 
471 * if any had to be retried (due to BARRIER problems), retry them 472 */ 473 DEFINE_WAIT(wq); 474 for(;;) { 475 prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE); 476 if (atomic_read(&mddev->pending_writes)==0) 477 break; 478 while (mddev->biolist) { 479 struct bio *bio; 480 spin_lock_irq(&mddev->write_lock); 481 bio = mddev->biolist; 482 mddev->biolist = bio->bi_next ; 483 bio->bi_next = NULL; 484 spin_unlock_irq(&mddev->write_lock); 485 submit_bio(bio->bi_rw, bio); 486 } 487 schedule(); 488 } 489 finish_wait(&mddev->sb_wait, &wq); 490} 491 492static int bi_complete(struct bio *bio, unsigned int bytes_done, int error) 493{ 494 if (bio->bi_size) 495 return 1; 496 497 complete((struct completion*)bio->bi_private); 498 return 0; 499} 500 501int sync_page_io(struct block_device *bdev, sector_t sector, int size, 502 struct page *page, int rw) 503{ 504 struct bio *bio = bio_alloc(GFP_NOIO, 1); 505 struct completion event; 506 int ret; 507 508 rw |= (1 << BIO_RW_SYNC); 509 510 bio->bi_bdev = bdev; 511 bio->bi_sector = sector; 512 bio_add_page(bio, page, size, 0); 513 init_completion(&event); 514 bio->bi_private = &event; 515 bio->bi_end_io = bi_complete; 516 submit_bio(rw, bio); 517 wait_for_completion(&event); 518 519 ret = test_bit(BIO_UPTODATE, &bio->bi_flags); 520 bio_put(bio); 521 return ret; 522} 523EXPORT_SYMBOL_GPL(sync_page_io); 524 525static int read_disk_sb(mdk_rdev_t * rdev, int size) 526{ 527 char b[BDEVNAME_SIZE]; 528 if (!rdev->sb_page) { 529 MD_BUG(); 530 return -EINVAL; 531 } 532 if (rdev->sb_loaded) 533 return 0; 534 535 536 if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ)) 537 goto fail; 538 rdev->sb_loaded = 1; 539 return 0; 540 541fail: 542 printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n", 543 bdevname(rdev->bdev,b)); 544 return -EINVAL; 545} 546 547static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) 548{ 549 if ( (sb1->set_uuid0 == sb2->set_uuid0) && 550 (sb1->set_uuid1 == 
sb2->set_uuid1) && 551 (sb1->set_uuid2 == sb2->set_uuid2) && 552 (sb1->set_uuid3 == sb2->set_uuid3)) 553 554 return 1; 555 556 return 0; 557} 558 559 560static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) 561{ 562 int ret; 563 mdp_super_t *tmp1, *tmp2; 564 565 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); 566 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); 567 568 if (!tmp1 || !tmp2) { 569 ret = 0; 570 printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n"); 571 goto abort; 572 } 573 574 *tmp1 = *sb1; 575 *tmp2 = *sb2; 576 577 /* 578 * nr_disks is not constant 579 */ 580 tmp1->nr_disks = 0; 581 tmp2->nr_disks = 0; 582 583 if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4)) 584 ret = 0; 585 else 586 ret = 1; 587 588abort: 589 kfree(tmp1); 590 kfree(tmp2); 591 return ret; 592} 593 594 595static u32 md_csum_fold(u32 csum) 596{ 597 csum = (csum & 0xffff) + (csum >> 16); 598 return (csum & 0xffff) + (csum >> 16); 599} 600 601static unsigned int calc_sb_csum(mdp_super_t * sb) 602{ 603 u64 newcsum = 0; 604 u32 *sb32 = (u32*)sb; 605 int i; 606 unsigned int disk_csum, csum; 607 608 disk_csum = sb->sb_csum; 609 sb->sb_csum = 0; 610 611 for (i = 0; i < MD_SB_BYTES/4 ; i++) 612 newcsum += sb32[i]; 613 csum = (newcsum & 0xffffffff) + (newcsum>>32); 614 615 616#ifdef CONFIG_ALPHA 617 /* This used to use csum_partial, which was wrong for several 618 * reasons including that different results are returned on 619 * different architectures. It isn't critical that we get exactly 620 * the same return value as before (we always csum_fold before 621 * testing, and that removes any differences). However as we 622 * know that csum_partial always returned a 16bit value on 623 * alphas, do a fold to maximise conformity to previous behaviour. 624 */ 625 sb->sb_csum = md_csum_fold(disk_csum); 626#else 627 sb->sb_csum = disk_csum; 628#endif 629 return csum; 630} 631 632 633/* 634 * Handle superblock details. 
635 * We want to be able to handle multiple superblock formats 636 * so we have a common interface to them all, and an array of 637 * different handlers. 638 * We rely on user-space to write the initial superblock, and support 639 * reading and updating of superblocks. 640 * Interface methods are: 641 * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version) 642 * loads and validates a superblock on dev. 643 * if refdev != NULL, compare superblocks on both devices 644 * Return: 645 * 0 - dev has a superblock that is compatible with refdev 646 * 1 - dev has a superblock that is compatible and newer than refdev 647 * so dev should be used as the refdev in future 648 * -EINVAL superblock incompatible or invalid 649 * -othererror e.g. -EIO 650 * 651 * int validate_super(mddev_t *mddev, mdk_rdev_t *dev) 652 * Verify that dev is acceptable into mddev. 653 * The first time, mddev->raid_disks will be 0, and data from 654 * dev should be merged in. Subsequent calls check that dev 655 * is new enough. Return 0 or -EINVAL 656 * 657 * void sync_super(mddev_t *mddev, mdk_rdev_t *dev) 658 * Update the superblock for rdev with data in mddev 659 * This does not write to disc. 660 * 661 */ 662 663struct super_type { 664 char *name; 665 struct module *owner; 666 int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version); 667 int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); 668 void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); 669}; 670 671/* 672 * load_super for 0.90.0 673 */ 674static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 675{ 676 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 677 mdp_super_t *sb; 678 int ret; 679 sector_t sb_offset; 680 681 /* 682 * Calculate the position of the superblock, 683 * it's at the end of the disk. 684 * 685 * It also happens to be a multiple of 4Kb. 
686 */ 687 sb_offset = calc_dev_sboffset(rdev->bdev); 688 rdev->sb_offset = sb_offset; 689 690 ret = read_disk_sb(rdev, MD_SB_BYTES); 691 if (ret) return ret; 692 693 ret = -EINVAL; 694 695 bdevname(rdev->bdev, b); 696 sb = (mdp_super_t*)page_address(rdev->sb_page); 697 698 if (sb->md_magic != MD_SB_MAGIC) { 699 printk(KERN_ERR "md: invalid raid superblock magic on %s\n", 700 b); 701 goto abort; 702 } 703 704 if (sb->major_version != 0 || 705 sb->minor_version < 90 || 706 sb->minor_version > 91) { 707 printk(KERN_WARNING "Bad version number %d.%d on %s\n", 708 sb->major_version, sb->minor_version, 709 b); 710 goto abort; 711 } 712 713 if (sb->raid_disks <= 0) 714 goto abort; 715 716 if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) { 717 printk(KERN_WARNING "md: invalid superblock checksum on %s\n", 718 b); 719 goto abort; 720 } 721 722 rdev->preferred_minor = sb->md_minor; 723 rdev->data_offset = 0; 724 rdev->sb_size = MD_SB_BYTES; 725 726 if (sb->state & (1<<MD_SB_BITMAP_PRESENT)) { 727 if (sb->level != 1 && sb->level != 4 728 && sb->level != 5 && sb->level != 6 729 && sb->level != 10) { 730 printk(KERN_WARNING 731 "md: bitmaps not supported for this level.\n"); 732 goto abort; 733 } 734 } 735 736 if (sb->level == LEVEL_MULTIPATH) 737 rdev->desc_nr = -1; 738 else 739 rdev->desc_nr = sb->this_disk.number; 740 741 if (refdev == 0) 742 ret = 1; 743 else { 744 __u64 ev1, ev2; 745 mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); 746 if (!uuid_equal(refsb, sb)) { 747 printk(KERN_WARNING "md: %s has different UUID to %s\n", 748 b, bdevname(refdev->bdev,b2)); 749 goto abort; 750 } 751 if (!sb_equal(refsb, sb)) { 752 printk(KERN_WARNING "md: %s has same UUID" 753 " but different superblock to %s\n", 754 b, bdevname(refdev->bdev, b2)); 755 goto abort; 756 } 757 ev1 = md_event(sb); 758 ev2 = md_event(refsb); 759 if (ev1 > ev2) 760 ret = 1; 761 else 762 ret = 0; 763 } 764 rdev->size = calc_dev_size(rdev, sb->chunk_size); 765 766 if (rdev->size 
< sb->size && sb->level > 1) 767 /* "this cannot possibly happen" ... */ 768 ret = -EINVAL; 769 770 abort: 771 return ret; 772} 773 774/* 775 * validate_super for 0.90.0 776 */ 777static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) 778{ 779 mdp_disk_t *desc; 780 mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); 781 __u64 ev1 = md_event(sb); 782 783 rdev->raid_disk = -1; 784 rdev->flags = 0; 785 if (mddev->raid_disks == 0) { 786 mddev->major_version = 0; 787 mddev->minor_version = sb->minor_version; 788 mddev->patch_version = sb->patch_version; 789 mddev->persistent = ! sb->not_persistent; 790 mddev->chunk_size = sb->chunk_size; 791 mddev->ctime = sb->ctime; 792 mddev->utime = sb->utime; 793 mddev->level = sb->level; 794 mddev->clevel[0] = 0; 795 mddev->layout = sb->layout; 796 mddev->raid_disks = sb->raid_disks; 797 mddev->size = sb->size; 798 mddev->events = ev1; 799 mddev->bitmap_offset = 0; 800 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 801 802 if (mddev->minor_version >= 91) { 803 mddev->reshape_position = sb->reshape_position; 804 mddev->delta_disks = sb->delta_disks; 805 mddev->new_level = sb->new_level; 806 mddev->new_layout = sb->new_layout; 807 mddev->new_chunk = sb->new_chunk; 808 } else { 809 mddev->reshape_position = MaxSector; 810 mddev->delta_disks = 0; 811 mddev->new_level = mddev->level; 812 mddev->new_layout = mddev->layout; 813 mddev->new_chunk = mddev->chunk_size; 814 } 815 816 if (sb->state & (1<<MD_SB_CLEAN)) 817 mddev->recovery_cp = MaxSector; 818 else { 819 if (sb->events_hi == sb->cp_events_hi && 820 sb->events_lo == sb->cp_events_lo) { 821 mddev->recovery_cp = sb->recovery_cp; 822 } else 823 mddev->recovery_cp = 0; 824 } 825 826 memcpy(mddev->uuid+0, &sb->set_uuid0, 4); 827 memcpy(mddev->uuid+4, &sb->set_uuid1, 4); 828 memcpy(mddev->uuid+8, &sb->set_uuid2, 4); 829 memcpy(mddev->uuid+12,&sb->set_uuid3, 4); 830 831 mddev->max_disks = MD_SB_DISKS; 832 833 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 834 
mddev->bitmap_file == NULL) 835 mddev->bitmap_offset = mddev->default_bitmap_offset; 836 837 } else if (mddev->pers == NULL) { 838 /* Insist on good event counter while assembling */ 839 ++ev1; 840 if (ev1 < mddev->events) 841 return -EINVAL; 842 } else if (mddev->bitmap) { 843 /* if adding to array with a bitmap, then we can accept an 844 * older device ... but not too old. 845 */ 846 if (ev1 < mddev->bitmap->events_cleared) 847 return 0; 848 } else { 849 if (ev1 < mddev->events) 850 /* just a hot-add of a new device, leave raid_disk at -1 */ 851 return 0; 852 } 853 854 if (mddev->level != LEVEL_MULTIPATH) { 855 desc = sb->disks + rdev->desc_nr; 856 857 if (desc->state & (1<<MD_DISK_FAULTY)) 858 set_bit(Faulty, &rdev->flags); 859 else if (desc->state & (1<<MD_DISK_SYNC) /* && 860 desc->raid_disk < mddev->raid_disks */) { 861 set_bit(In_sync, &rdev->flags); 862 rdev->raid_disk = desc->raid_disk; 863 } 864 if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) 865 set_bit(WriteMostly, &rdev->flags); 866 } else /* MULTIPATH are always insync */ 867 set_bit(In_sync, &rdev->flags); 868 return 0; 869} 870 871/* 872 * sync_super for 0.90.0 873 */ 874static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) 875{ 876 mdp_super_t *sb; 877 struct list_head *tmp; 878 mdk_rdev_t *rdev2; 879 int next_spare = mddev->raid_disks; 880 881 882 /* make rdev->sb match mddev data.. 883 * 884 * 1/ zero out disks 885 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); 886 * 3/ any empty disks < next_spare become removed 887 * 888 * disks[0] gets initialised to REMOVED because 889 * we cannot be sure from other fields if it has 890 * been initialised or not. 
891 */ 892 int i; 893 int active=0, working=0,failed=0,spare=0,nr_disks=0; 894 895 rdev->sb_size = MD_SB_BYTES; 896 897 sb = (mdp_super_t*)page_address(rdev->sb_page); 898 899 memset(sb, 0, sizeof(*sb)); 900 901 sb->md_magic = MD_SB_MAGIC; 902 sb->major_version = mddev->major_version; 903 sb->patch_version = mddev->patch_version; 904 sb->gvalid_words = 0; /* ignored */ 905 memcpy(&sb->set_uuid0, mddev->uuid+0, 4); 906 memcpy(&sb->set_uuid1, mddev->uuid+4, 4); 907 memcpy(&sb->set_uuid2, mddev->uuid+8, 4); 908 memcpy(&sb->set_uuid3, mddev->uuid+12,4); 909 910 sb->ctime = mddev->ctime; 911 sb->level = mddev->level; 912 sb->size = mddev->size; 913 sb->raid_disks = mddev->raid_disks; 914 sb->md_minor = mddev->md_minor; 915 sb->not_persistent = !mddev->persistent; 916 sb->utime = mddev->utime; 917 sb->state = 0; 918 sb->events_hi = (mddev->events>>32); 919 sb->events_lo = (u32)mddev->events; 920 921 if (mddev->reshape_position == MaxSector) 922 sb->minor_version = 90; 923 else { 924 sb->minor_version = 91; 925 sb->reshape_position = mddev->reshape_position; 926 sb->new_level = mddev->new_level; 927 sb->delta_disks = mddev->delta_disks; 928 sb->new_layout = mddev->new_layout; 929 sb->new_chunk = mddev->new_chunk; 930 } 931 mddev->minor_version = sb->minor_version; 932 if (mddev->in_sync) 933 { 934 sb->recovery_cp = mddev->recovery_cp; 935 sb->cp_events_hi = (mddev->events>>32); 936 sb->cp_events_lo = (u32)mddev->events; 937 if (mddev->recovery_cp == MaxSector) 938 sb->state = (1<< MD_SB_CLEAN); 939 } else 940 sb->recovery_cp = 0; 941 942 sb->layout = mddev->layout; 943 sb->chunk_size = mddev->chunk_size; 944 945 if (mddev->bitmap && mddev->bitmap_file == NULL) 946 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 947 948 sb->disks[0].state = (1<<MD_DISK_REMOVED); 949 ITERATE_RDEV(mddev,rdev2,tmp) { 950 mdp_disk_t *d; 951 int desc_nr; 952 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) 953 && !test_bit(Faulty, &rdev2->flags)) 954 desc_nr = rdev2->raid_disk; 955 else 
956 desc_nr = next_spare++; 957 rdev2->desc_nr = desc_nr; 958 d = &sb->disks[rdev2->desc_nr]; 959 nr_disks++; 960 d->number = rdev2->desc_nr; 961 d->major = MAJOR(rdev2->bdev->bd_dev); 962 d->minor = MINOR(rdev2->bdev->bd_dev); 963 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) 964 && !test_bit(Faulty, &rdev2->flags)) 965 d->raid_disk = rdev2->raid_disk; 966 else 967 d->raid_disk = rdev2->desc_nr; /* compatibility */ 968 if (test_bit(Faulty, &rdev2->flags)) 969 d->state = (1<<MD_DISK_FAULTY); 970 else if (test_bit(In_sync, &rdev2->flags)) { 971 d->state = (1<<MD_DISK_ACTIVE); 972 d->state |= (1<<MD_DISK_SYNC); 973 active++; 974 working++; 975 } else { 976 d->state = 0; 977 spare++; 978 working++; 979 } 980 if (test_bit(WriteMostly, &rdev2->flags)) 981 d->state |= (1<<MD_DISK_WRITEMOSTLY); 982 } 983 /* now set the "removed" and "faulty" bits on any missing devices */ 984 for (i=0 ; i < mddev->raid_disks ; i++) { 985 mdp_disk_t *d = &sb->disks[i]; 986 if (d->state == 0 && d->number == 0) { 987 d->number = i; 988 d->raid_disk = i; 989 d->state = (1<<MD_DISK_REMOVED); 990 d->state |= (1<<MD_DISK_FAULTY); 991 failed++; 992 } 993 } 994 sb->nr_disks = nr_disks; 995 sb->active_disks = active; 996 sb->working_disks = working; 997 sb->failed_disks = failed; 998 sb->spare_disks = spare; 999 1000 sb->this_disk = sb->disks[rdev->desc_nr]; 1001 sb->sb_csum = calc_sb_csum(sb); 1002} 1003 1004/* 1005 * version 1 superblock 1006 */ 1007 1008static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb) 1009{ 1010 __le32 disk_csum; 1011 u32 csum; 1012 unsigned long long newcsum; 1013 int size = 256 + le32_to_cpu(sb->max_dev)*2; 1014 __le32 *isuper = (__le32*)sb; 1015 int i; 1016 1017 disk_csum = sb->sb_csum; 1018 sb->sb_csum = 0; 1019 newcsum = 0; 1020 for (i=0; size>=4; size -= 4 ) 1021 newcsum += le32_to_cpu(*isuper++); 1022 1023 if (size == 2) 1024 newcsum += le16_to_cpu(*(__le16*) isuper); 1025 1026 csum = (newcsum & 0xffffffff) + (newcsum >> 32); 1027 sb->sb_csum = 
disk_csum; 1028 return cpu_to_le32(csum); 1029} 1030 1031static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 1032{ 1033 struct mdp_superblock_1 *sb; 1034 int ret; 1035 sector_t sb_offset; 1036 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 1037 int bmask; 1038 1039 /* 1040 * Calculate the position of the superblock. 1041 * It is always aligned to a 4K boundary and 1042 * depeding on minor_version, it can be: 1043 * 0: At least 8K, but less than 12K, from end of device 1044 * 1: At start of device 1045 * 2: 4K from start of device. 1046 */ 1047 switch(minor_version) { 1048 case 0: 1049 sb_offset = rdev->bdev->bd_inode->i_size >> 9; 1050 sb_offset -= 8*2; 1051 sb_offset &= ~(sector_t)(4*2-1); 1052 /* convert from sectors to K */ 1053 sb_offset /= 2; 1054 break; 1055 case 1: 1056 sb_offset = 0; 1057 break; 1058 case 2: 1059 sb_offset = 4; 1060 break; 1061 default: 1062 return -EINVAL; 1063 } 1064 rdev->sb_offset = sb_offset; 1065 1066 /* superblock is rarely larger than 1K, but it can be larger, 1067 * and it is safe to read 4k, so we do that 1068 */ 1069 ret = read_disk_sb(rdev, 4096); 1070 if (ret) return ret; 1071 1072 1073 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1074 1075 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 1076 sb->major_version != cpu_to_le32(1) || 1077 le32_to_cpu(sb->max_dev) > (4096-256)/2 || 1078 le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) || 1079 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) 1080 return -EINVAL; 1081 1082 if (calc_sb_1_csum(sb) != sb->sb_csum) { 1083 printk("md: invalid superblock checksum on %s\n", 1084 bdevname(rdev->bdev,b)); 1085 return -EINVAL; 1086 } 1087 if (le64_to_cpu(sb->data_size) < 10) { 1088 printk("md: data_size too small on %s\n", 1089 bdevname(rdev->bdev,b)); 1090 return -EINVAL; 1091 } 1092 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET)) { 1093 if (sb->level != cpu_to_le32(1) && 1094 sb->level != cpu_to_le32(4) && 1095 sb->level != 
cpu_to_le32(5) && 1096 sb->level != cpu_to_le32(6) && 1097 sb->level != cpu_to_le32(10)) { 1098 printk(KERN_WARNING 1099 "md: bitmaps not supported for this level.\n"); 1100 return -EINVAL; 1101 } 1102 } 1103 1104 rdev->preferred_minor = 0xffff; 1105 rdev->data_offset = le64_to_cpu(sb->data_offset); 1106 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); 1107 1108 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; 1109 bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1; 1110 if (rdev->sb_size & bmask) 1111 rdev-> sb_size = (rdev->sb_size | bmask)+1; 1112 1113 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) 1114 rdev->desc_nr = -1; 1115 else 1116 rdev->desc_nr = le32_to_cpu(sb->dev_number); 1117 1118 if (refdev == 0) 1119 ret = 1; 1120 else { 1121 __u64 ev1, ev2; 1122 struct mdp_superblock_1 *refsb = 1123 (struct mdp_superblock_1*)page_address(refdev->sb_page); 1124 1125 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || 1126 sb->level != refsb->level || 1127 sb->layout != refsb->layout || 1128 sb->chunksize != refsb->chunksize) { 1129 printk(KERN_WARNING "md: %s has strangely different" 1130 " superblock to %s\n", 1131 bdevname(rdev->bdev,b), 1132 bdevname(refdev->bdev,b2)); 1133 return -EINVAL; 1134 } 1135 ev1 = le64_to_cpu(sb->events); 1136 ev2 = le64_to_cpu(refsb->events); 1137 1138 if (ev1 > ev2) 1139 ret = 1; 1140 else 1141 ret = 0; 1142 } 1143 if (minor_version) 1144 rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2; 1145 else 1146 rdev->size = rdev->sb_offset; 1147 if (rdev->size < le64_to_cpu(sb->data_size)/2) 1148 return -EINVAL; 1149 rdev->size = le64_to_cpu(sb->data_size)/2; 1150 if (le32_to_cpu(sb->chunksize)) 1151 rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1); 1152 1153 if (le64_to_cpu(sb->size) > rdev->size*2) 1154 return -EINVAL; 1155 return ret; 1156} 1157 1158static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) 1159{ 1160 struct mdp_superblock_1 *sb = (struct 
mdp_superblock_1*)page_address(rdev->sb_page); 1161 __u64 ev1 = le64_to_cpu(sb->events); 1162 1163 rdev->raid_disk = -1; 1164 rdev->flags = 0; 1165 if (mddev->raid_disks == 0) { 1166 mddev->major_version = 1; 1167 mddev->patch_version = 0; 1168 mddev->persistent = 1; 1169 mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9; 1170 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); 1171 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); 1172 mddev->level = le32_to_cpu(sb->level); 1173 mddev->clevel[0] = 0; 1174 mddev->layout = le32_to_cpu(sb->layout); 1175 mddev->raid_disks = le32_to_cpu(sb->raid_disks); 1176 mddev->size = le64_to_cpu(sb->size)/2; 1177 mddev->events = ev1; 1178 mddev->bitmap_offset = 0; 1179 mddev->default_bitmap_offset = 1024 >> 9; 1180 1181 mddev->recovery_cp = le64_to_cpu(sb->resync_offset); 1182 memcpy(mddev->uuid, sb->set_uuid, 16); 1183 1184 mddev->max_disks = (4096-256)/2; 1185 1186 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && 1187 mddev->bitmap_file == NULL ) 1188 mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset); 1189 1190 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { 1191 mddev->reshape_position = le64_to_cpu(sb->reshape_position); 1192 mddev->delta_disks = le32_to_cpu(sb->delta_disks); 1193 mddev->new_level = le32_to_cpu(sb->new_level); 1194 mddev->new_layout = le32_to_cpu(sb->new_layout); 1195 mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9; 1196 } else { 1197 mddev->reshape_position = MaxSector; 1198 mddev->delta_disks = 0; 1199 mddev->new_level = mddev->level; 1200 mddev->new_layout = mddev->layout; 1201 mddev->new_chunk = mddev->chunk_size; 1202 } 1203 1204 } else if (mddev->pers == NULL) { 1205 /* Insist of good event counter while assembling */ 1206 ++ev1; 1207 if (ev1 < mddev->events) 1208 return -EINVAL; 1209 } else if (mddev->bitmap) { 1210 /* If adding to array with a bitmap, then we can accept an 1211 * older device, but not too old. 
1212 */ 1213 if (ev1 < mddev->bitmap->events_cleared) 1214 return 0; 1215 } else { 1216 if (ev1 < mddev->events) 1217 /* just a hot-add of a new device, leave raid_disk at -1 */ 1218 return 0; 1219 } 1220 if (mddev->level != LEVEL_MULTIPATH) { 1221 int role; 1222 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 1223 switch(role) { 1224 case 0xffff: /* spare */ 1225 break; 1226 case 0xfffe: /* faulty */ 1227 set_bit(Faulty, &rdev->flags); 1228 break; 1229 default: 1230 if ((le32_to_cpu(sb->feature_map) & 1231 MD_FEATURE_RECOVERY_OFFSET)) 1232 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 1233 else 1234 set_bit(In_sync, &rdev->flags); 1235 rdev->raid_disk = role; 1236 break; 1237 } 1238 if (sb->devflags & WriteMostly1) 1239 set_bit(WriteMostly, &rdev->flags); 1240 } else /* MULTIPATH are always insync */ 1241 set_bit(In_sync, &rdev->flags); 1242 1243 return 0; 1244} 1245 1246static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) 1247{ 1248 struct mdp_superblock_1 *sb; 1249 struct list_head *tmp; 1250 mdk_rdev_t *rdev2; 1251 int max_dev, i; 1252 /* make rdev->sb match mddev and rdev data. 
*/ 1253 1254 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1255 1256 sb->feature_map = 0; 1257 sb->pad0 = 0; 1258 sb->recovery_offset = cpu_to_le64(0); 1259 memset(sb->pad1, 0, sizeof(sb->pad1)); 1260 memset(sb->pad2, 0, sizeof(sb->pad2)); 1261 memset(sb->pad3, 0, sizeof(sb->pad3)); 1262 1263 sb->utime = cpu_to_le64((__u64)mddev->utime); 1264 sb->events = cpu_to_le64(mddev->events); 1265 if (mddev->in_sync) 1266 sb->resync_offset = cpu_to_le64(mddev->recovery_cp); 1267 else 1268 sb->resync_offset = cpu_to_le64(0); 1269 1270 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); 1271 1272 sb->raid_disks = cpu_to_le32(mddev->raid_disks); 1273 sb->size = cpu_to_le64(mddev->size<<1); 1274 1275 if (mddev->bitmap && mddev->bitmap_file == NULL) { 1276 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); 1277 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 1278 } 1279 1280 if (rdev->raid_disk >= 0 && 1281 !test_bit(In_sync, &rdev->flags) && 1282 rdev->recovery_offset > 0) { 1283 sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); 1284 sb->recovery_offset = cpu_to_le64(rdev->recovery_offset); 1285 } 1286 1287 if (mddev->reshape_position != MaxSector) { 1288 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); 1289 sb->reshape_position = cpu_to_le64(mddev->reshape_position); 1290 sb->new_layout = cpu_to_le32(mddev->new_layout); 1291 sb->delta_disks = cpu_to_le32(mddev->delta_disks); 1292 sb->new_level = cpu_to_le32(mddev->new_level); 1293 sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9); 1294 } 1295 1296 max_dev = 0; 1297 ITERATE_RDEV(mddev,rdev2,tmp) 1298 if (rdev2->desc_nr+1 > max_dev) 1299 max_dev = rdev2->desc_nr+1; 1300 1301 if (max_dev > le32_to_cpu(sb->max_dev)) 1302 sb->max_dev = cpu_to_le32(max_dev); 1303 for (i=0; i<max_dev;i++) 1304 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1305 1306 ITERATE_RDEV(mddev,rdev2,tmp) { 1307 i = rdev2->desc_nr; 1308 if (test_bit(Faulty, &rdev2->flags)) 1309 
sb->dev_roles[i] = cpu_to_le16(0xfffe); 1310 else if (test_bit(In_sync, &rdev2->flags)) 1311 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1312 else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0) 1313 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1314 else 1315 sb->dev_roles[i] = cpu_to_le16(0xffff); 1316 } 1317 1318 sb->sb_csum = calc_sb_1_csum(sb); 1319} 1320 1321 1322static struct super_type super_types[] = { 1323 [0] = { 1324 .name = "0.90.0", 1325 .owner = THIS_MODULE, 1326 .load_super = super_90_load, 1327 .validate_super = super_90_validate, 1328 .sync_super = super_90_sync, 1329 }, 1330 [1] = { 1331 .name = "md-1", 1332 .owner = THIS_MODULE, 1333 .load_super = super_1_load, 1334 .validate_super = super_1_validate, 1335 .sync_super = super_1_sync, 1336 }, 1337}; 1338 1339static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) 1340{ 1341 struct list_head *tmp, *tmp2; 1342 mdk_rdev_t *rdev, *rdev2; 1343 1344 ITERATE_RDEV(mddev1,rdev,tmp) 1345 ITERATE_RDEV(mddev2, rdev2, tmp2) 1346 if (rdev->bdev->bd_contains == 1347 rdev2->bdev->bd_contains) 1348 return 1; 1349 1350 return 0; 1351} 1352 1353static LIST_HEAD(pending_raid_disks); 1354 1355static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) 1356{ 1357 char b[BDEVNAME_SIZE]; 1358 struct kobject *ko; 1359 char *s; 1360 int err; 1361 1362 if (rdev->mddev) { 1363 MD_BUG(); 1364 return -EINVAL; 1365 } 1366 /* make sure rdev->size exceeds mddev->size */ 1367 if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) { 1368 if (mddev->pers) { 1369 /* Cannot change size, so fail 1370 * If mddev->level <= 0, then we don't care 1371 * about aligning sizes (e.g. linear) 1372 */ 1373 if (mddev->level > 0) 1374 return -ENOSPC; 1375 } else 1376 mddev->size = rdev->size; 1377 } 1378 1379 /* Verify rdev->desc_nr is unique. 
1380 * If it is -1, assign a free number, else 1381 * check number is not in use 1382 */ 1383 if (rdev->desc_nr < 0) { 1384 int choice = 0; 1385 if (mddev->pers) choice = mddev->raid_disks; 1386 while (find_rdev_nr(mddev, choice)) 1387 choice++; 1388 rdev->desc_nr = choice; 1389 } else { 1390 if (find_rdev_nr(mddev, rdev->desc_nr)) 1391 return -EBUSY; 1392 } 1393 bdevname(rdev->bdev,b); 1394 if (kobject_set_name(&rdev->kobj, "dev-%s", b) < 0) 1395 return -ENOMEM; 1396 while ( (s=strchr(rdev->kobj.k_name, '/')) != NULL) 1397 *s = '!'; 1398 1399 rdev->mddev = mddev; 1400 printk(KERN_INFO "md: bind<%s>\n", b); 1401 1402 rdev->kobj.parent = &mddev->kobj; 1403 if ((err = kobject_add(&rdev->kobj))) 1404 goto fail; 1405 1406 if (rdev->bdev->bd_part) 1407 ko = &rdev->bdev->bd_part->kobj; 1408 else 1409 ko = &rdev->bdev->bd_disk->kobj; 1410 if ((err = sysfs_create_link(&rdev->kobj, ko, "block"))) { 1411 kobject_del(&rdev->kobj); 1412 goto fail; 1413 } 1414 list_add(&rdev->same_set, &mddev->disks); 1415 bd_claim_by_disk(rdev->bdev, rdev, mddev->gendisk); 1416 return 0; 1417 1418 fail: 1419 printk(KERN_WARNING "md: failed to register dev-%s for %s\n", 1420 b, mdname(mddev)); 1421 return err; 1422} 1423 1424static void delayed_delete(struct work_struct *ws) 1425{ 1426 mdk_rdev_t *rdev = container_of(ws, mdk_rdev_t, del_work); 1427 kobject_del(&rdev->kobj); 1428} 1429 1430static void unbind_rdev_from_array(mdk_rdev_t * rdev) 1431{ 1432 char b[BDEVNAME_SIZE]; 1433 if (!rdev->mddev) { 1434 MD_BUG(); 1435 return; 1436 } 1437 bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk); 1438 list_del_init(&rdev->same_set); 1439 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); 1440 rdev->mddev = NULL; 1441 sysfs_remove_link(&rdev->kobj, "block"); 1442 1443 /* We need to delay this, otherwise we can deadlock when 1444 * writing to 'remove' to "dev/state" 1445 */ 1446 INIT_WORK(&rdev->del_work, delayed_delete); 1447 schedule_work(&rdev->del_work); 1448} 1449 1450/* 1451 * 
prevent the device from being mounted, repartitioned or 1452 * otherwise reused by a RAID array (or any other kernel 1453 * subsystem), by bd_claiming the device. 1454 */ 1455static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) 1456{ 1457 int err = 0; 1458 struct block_device *bdev; 1459 char b[BDEVNAME_SIZE]; 1460 1461 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); 1462 if (IS_ERR(bdev)) { 1463 printk(KERN_ERR "md: could not open %s.\n", 1464 __bdevname(dev, b)); 1465 return PTR_ERR(bdev); 1466 } 1467 err = bd_claim(bdev, rdev); 1468 if (err) { 1469 printk(KERN_ERR "md: could not bd_claim %s.\n", 1470 bdevname(bdev, b)); 1471 blkdev_put(bdev); 1472 return err; 1473 } 1474 rdev->bdev = bdev; 1475 return err; 1476} 1477 1478static void unlock_rdev(mdk_rdev_t *rdev) 1479{ 1480 struct block_device *bdev = rdev->bdev; 1481 rdev->bdev = NULL; 1482 if (!bdev) 1483 MD_BUG(); 1484 bd_release(bdev); 1485 blkdev_put(bdev); 1486} 1487 1488void md_autodetect_dev(dev_t dev); 1489 1490static void export_rdev(mdk_rdev_t * rdev) 1491{ 1492 char b[BDEVNAME_SIZE]; 1493 printk(KERN_INFO "md: export_rdev(%s)\n", 1494 bdevname(rdev->bdev,b)); 1495 if (rdev->mddev) 1496 MD_BUG(); 1497 free_disk_sb(rdev); 1498 list_del_init(&rdev->same_set); 1499#ifndef MODULE 1500 md_autodetect_dev(rdev->bdev->bd_dev); 1501#endif 1502 unlock_rdev(rdev); 1503 kobject_put(&rdev->kobj); 1504} 1505 1506static void kick_rdev_from_array(mdk_rdev_t * rdev) 1507{ 1508 unbind_rdev_from_array(rdev); 1509 export_rdev(rdev); 1510} 1511 1512static void export_array(mddev_t *mddev) 1513{ 1514 struct list_head *tmp; 1515 mdk_rdev_t *rdev; 1516 1517 ITERATE_RDEV(mddev,rdev,tmp) { 1518 if (!rdev->mddev) { 1519 MD_BUG(); 1520 continue; 1521 } 1522 kick_rdev_from_array(rdev); 1523 } 1524 if (!list_empty(&mddev->disks)) 1525 MD_BUG(); 1526 mddev->raid_disks = 0; 1527 mddev->major_version = 0; 1528} 1529 1530static void print_desc(mdp_disk_t *desc) 1531{ 1532 printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number, 1533 
desc->major,desc->minor,desc->raid_disk,desc->state); 1534} 1535 1536static void print_sb(mdp_super_t *sb) 1537{ 1538 int i; 1539 1540 printk(KERN_INFO 1541 "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", 1542 sb->major_version, sb->minor_version, sb->patch_version, 1543 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, 1544 sb->ctime); 1545 printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", 1546 sb->level, sb->size, sb->nr_disks, sb->raid_disks, 1547 sb->md_minor, sb->layout, sb->chunk_size); 1548 printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d" 1549 " FD:%d SD:%d CSUM:%08x E:%08lx\n", 1550 sb->utime, sb->state, sb->active_disks, sb->working_disks, 1551 sb->failed_disks, sb->spare_disks, 1552 sb->sb_csum, (unsigned long)sb->events_lo); 1553 1554 printk(KERN_INFO); 1555 for (i = 0; i < MD_SB_DISKS; i++) { 1556 mdp_disk_t *desc; 1557 1558 desc = sb->disks + i; 1559 if (desc->number || desc->major || desc->minor || 1560 desc->raid_disk || (desc->state && (desc->state != 4))) { 1561 printk(" D %2d: ", i); 1562 print_desc(desc); 1563 } 1564 } 1565 printk(KERN_INFO "md: THIS: "); 1566 print_desc(&sb->this_disk); 1567 1568} 1569 1570static void print_rdev(mdk_rdev_t *rdev) 1571{ 1572 char b[BDEVNAME_SIZE]; 1573 printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n", 1574 bdevname(rdev->bdev,b), (unsigned long long)rdev->size, 1575 test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags), 1576 rdev->desc_nr); 1577 if (rdev->sb_loaded) { 1578 printk(KERN_INFO "md: rdev superblock:\n"); 1579 print_sb((mdp_super_t*)page_address(rdev->sb_page)); 1580 } else 1581 printk(KERN_INFO "md: no rdev superblock!\n"); 1582} 1583 1584static void md_print_devices(void) 1585{ 1586 struct list_head *tmp, *tmp2; 1587 mdk_rdev_t *rdev; 1588 mddev_t *mddev; 1589 char b[BDEVNAME_SIZE]; 1590 1591 printk("\n"); 1592 printk("md: **********************************\n"); 1593 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); 1594 printk("md: 
**********************************\n"); 1595 ITERATE_MDDEV(mddev,tmp) { 1596 1597 if (mddev->bitmap) 1598 bitmap_print_sb(mddev->bitmap); 1599 else 1600 printk("%s: ", mdname(mddev)); 1601 ITERATE_RDEV(mddev,rdev,tmp2) 1602 printk("<%s>", bdevname(rdev->bdev,b)); 1603 printk("\n"); 1604 1605 ITERATE_RDEV(mddev,rdev,tmp2) 1606 print_rdev(rdev); 1607 } 1608 printk("md: **********************************\n"); 1609 printk("\n"); 1610} 1611 1612 1613static void sync_sbs(mddev_t * mddev, int nospares) 1614{ 1615 /* Update each superblock (in-memory image), but 1616 * if we are allowed to, skip spares which already 1617 * have the right event counter, or have one earlier 1618 * (which would mean they aren't being marked as dirty 1619 * with the rest of the array) 1620 */ 1621 mdk_rdev_t *rdev; 1622 struct list_head *tmp; 1623 1624 ITERATE_RDEV(mddev,rdev,tmp) { 1625 if (rdev->sb_events == mddev->events || 1626 (nospares && 1627 rdev->raid_disk < 0 && 1628 (rdev->sb_events&1)==0 && 1629 rdev->sb_events+1 == mddev->events)) { 1630 /* Don't update this superblock */ 1631 rdev->sb_loaded = 2; 1632 } else { 1633 super_types[mddev->major_version]. 
1634 sync_super(mddev, rdev); 1635 rdev->sb_loaded = 1; 1636 } 1637 } 1638} 1639 1640static void md_update_sb(mddev_t * mddev, int force_change) 1641{ 1642 int err; 1643 struct list_head *tmp; 1644 mdk_rdev_t *rdev; 1645 int sync_req; 1646 int nospares = 0; 1647 1648repeat: 1649 spin_lock_irq(&mddev->write_lock); 1650 1651 set_bit(MD_CHANGE_PENDING, &mddev->flags); 1652 if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) 1653 force_change = 1; 1654 if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags)) 1655 /* just a clean<-> dirty transition, possibly leave spares alone, 1656 * though if events isn't the right even/odd, we will have to do 1657 * spares after all 1658 */ 1659 nospares = 1; 1660 if (force_change) 1661 nospares = 0; 1662 if (mddev->degraded) 1663 /* If the array is degraded, then skipping spares is both 1664 * dangerous and fairly pointless. 1665 * Dangerous because a device that was removed from the array 1666 * might have a event_count that still looks up-to-date, 1667 * so it can be re-added without a resync. 1668 * Pointless because if there are any spares to skip, 1669 * then a recovery will happen and soon that array won't 1670 * be degraded any more and the spare can go back to sleep then. 1671 */ 1672 nospares = 0; 1673 1674 sync_req = mddev->in_sync; 1675 mddev->utime = get_seconds(); 1676 1677 /* If this is just a dirty<->clean transition, and the array is clean 1678 * and 'events' is odd, we can roll back to the previous clean state */ 1679 if (nospares 1680 && (mddev->in_sync && mddev->recovery_cp == MaxSector) 1681 && (mddev->events & 1) 1682 && mddev->events != 1) 1683 mddev->events--; 1684 else { 1685 /* otherwise we have to go forward and ... */ 1686 mddev->events ++; 1687 if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */ 1688 /* .. 
if the array isn't clean, insist on an odd 'events' */ 1689 if ((mddev->events&1)==0) { 1690 mddev->events++; 1691 nospares = 0; 1692 } 1693 } else { 1694 /* otherwise insist on an even 'events' (for clean states) */ 1695 if ((mddev->events&1)) { 1696 mddev->events++; 1697 nospares = 0; 1698 } 1699 } 1700 } 1701 1702 if (!mddev->events) { 1703 /* 1704 * oops, this 64-bit counter should never wrap. 1705 * Either we are in around ~1 trillion A.C., assuming 1706 * 1 reboot per second, or we have a bug: 1707 */ 1708 MD_BUG(); 1709 mddev->events --; 1710 } 1711 sync_sbs(mddev, nospares); 1712 1713 /* 1714 * do not write anything to disk if using 1715 * nonpersistent superblocks 1716 */ 1717 if (!mddev->persistent) { 1718 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 1719 spin_unlock_irq(&mddev->write_lock); 1720 wake_up(&mddev->sb_wait); 1721 return; 1722 } 1723 spin_unlock_irq(&mddev->write_lock); 1724 1725 dprintk(KERN_INFO 1726 "md: updating %s RAID superblock on device (in sync %d)\n", 1727 mdname(mddev),mddev->in_sync); 1728 1729 err = bitmap_update_sb(mddev->bitmap); 1730 ITERATE_RDEV(mddev,rdev,tmp) { 1731 char b[BDEVNAME_SIZE]; 1732 dprintk(KERN_INFO "md: "); 1733 if (rdev->sb_loaded != 1) 1734 continue; /* no noise on spare devices */ 1735 if (test_bit(Faulty, &rdev->flags)) 1736 dprintk("(skipping faulty "); 1737 1738 dprintk("%s ", bdevname(rdev->bdev,b)); 1739 if (!test_bit(Faulty, &rdev->flags)) { 1740 md_super_write(mddev,rdev, 1741 rdev->sb_offset<<1, rdev->sb_size, 1742 rdev->sb_page); 1743 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", 1744 bdevname(rdev->bdev,b), 1745 (unsigned long long)rdev->sb_offset); 1746 rdev->sb_events = mddev->events; 1747 1748 } else 1749 dprintk(")\n"); 1750 if (mddev->level == LEVEL_MULTIPATH) 1751 /* only need to write one superblock... 
*/ 1752 break; 1753 } 1754 md_super_wait(mddev); 1755 /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */ 1756 1757 spin_lock_irq(&mddev->write_lock); 1758 if (mddev->in_sync != sync_req || 1759 test_bit(MD_CHANGE_DEVS, &mddev->flags)) { 1760 /* have to write it out again */ 1761 spin_unlock_irq(&mddev->write_lock); 1762 goto repeat; 1763 } 1764 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 1765 spin_unlock_irq(&mddev->write_lock); 1766 wake_up(&mddev->sb_wait); 1767 1768} 1769 1770/* words written to sysfs files may, or my not, be \n terminated. 1771 * We want to accept with case. For this we use cmd_match. 1772 */ 1773static int cmd_match(const char *cmd, const char *str) 1774{ 1775 /* See if cmd, written into a sysfs file, matches 1776 * str. They must either be the same, or cmd can 1777 * have a trailing newline 1778 */ 1779 while (*cmd && *str && *cmd == *str) { 1780 cmd++; 1781 str++; 1782 } 1783 if (*cmd == '\n') 1784 cmd++; 1785 if (*str || *cmd) 1786 return 0; 1787 return 1; 1788} 1789 1790struct rdev_sysfs_entry { 1791 struct attribute attr; 1792 ssize_t (*show)(mdk_rdev_t *, char *); 1793 ssize_t (*store)(mdk_rdev_t *, const char *, size_t); 1794}; 1795 1796static ssize_t 1797state_show(mdk_rdev_t *rdev, char *page) 1798{ 1799 char *sep = ""; 1800 int len=0; 1801 1802 if (test_bit(Faulty, &rdev->flags)) { 1803 len+= sprintf(page+len, "%sfaulty",sep); 1804 sep = ","; 1805 } 1806 if (test_bit(In_sync, &rdev->flags)) { 1807 len += sprintf(page+len, "%sin_sync",sep); 1808 sep = ","; 1809 } 1810 if (test_bit(WriteMostly, &rdev->flags)) { 1811 len += sprintf(page+len, "%swrite_mostly",sep); 1812 sep = ","; 1813 } 1814 if (!test_bit(Faulty, &rdev->flags) && 1815 !test_bit(In_sync, &rdev->flags)) { 1816 len += sprintf(page+len, "%sspare", sep); 1817 sep = ","; 1818 } 1819 return len+sprintf(page+len, "\n"); 1820} 1821 1822static ssize_t 1823state_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1824{ 1825 /* can write 1826 * faulty - 
simulates and error 1827 * remove - disconnects the device 1828 * writemostly - sets write_mostly 1829 * -writemostly - clears write_mostly 1830 */ 1831 int err = -EINVAL; 1832 if (cmd_match(buf, "faulty") && rdev->mddev->pers) { 1833 md_error(rdev->mddev, rdev); 1834 err = 0; 1835 } else if (cmd_match(buf, "remove")) { 1836 if (rdev->raid_disk >= 0) 1837 err = -EBUSY; 1838 else { 1839 mddev_t *mddev = rdev->mddev; 1840 kick_rdev_from_array(rdev); 1841 if (mddev->pers) 1842 md_update_sb(mddev, 1); 1843 md_new_event(mddev); 1844 err = 0; 1845 } 1846 } else if (cmd_match(buf, "writemostly")) { 1847 set_bit(WriteMostly, &rdev->flags); 1848 err = 0; 1849 } else if (cmd_match(buf, "-writemostly")) { 1850 clear_bit(WriteMostly, &rdev->flags); 1851 err = 0; 1852 } 1853 return err ? err : len; 1854} 1855static struct rdev_sysfs_entry rdev_state = 1856__ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store); 1857 1858static ssize_t 1859super_show(mdk_rdev_t *rdev, char *page) 1860{ 1861 if (rdev->sb_loaded && rdev->sb_size) { 1862 memcpy(page, page_address(rdev->sb_page), rdev->sb_size); 1863 return rdev->sb_size; 1864 } else 1865 return 0; 1866} 1867static struct rdev_sysfs_entry rdev_super = __ATTR_RO(super); 1868 1869static ssize_t 1870errors_show(mdk_rdev_t *rdev, char *page) 1871{ 1872 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 1873} 1874 1875static ssize_t 1876errors_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1877{ 1878 char *e; 1879 unsigned long n = simple_strtoul(buf, &e, 10); 1880 if (*buf && (*e == 0 || *e == '\n')) { 1881 atomic_set(&rdev->corrected_errors, n); 1882 return len; 1883 } 1884 return -EINVAL; 1885} 1886static struct rdev_sysfs_entry rdev_errors = 1887__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); 1888 1889static ssize_t 1890slot_show(mdk_rdev_t *rdev, char *page) 1891{ 1892 if (rdev->raid_disk < 0) 1893 return sprintf(page, "none\n"); 1894 else 1895 return sprintf(page, "%d\n", rdev->raid_disk); 1896} 
1897 1898static ssize_t 1899slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1900{ 1901 char *e; 1902 int slot = simple_strtoul(buf, &e, 10); 1903 if (strncmp(buf, "none", 4)==0) 1904 slot = -1; 1905 else if (e==buf || (*e && *e!= '\n')) 1906 return -EINVAL; 1907 if (rdev->mddev->pers) 1908 /* Cannot set slot in active array (yet) */ 1909 return -EBUSY; 1910 if (slot >= rdev->mddev->raid_disks) 1911 return -ENOSPC; 1912 rdev->raid_disk = slot; 1913 /* assume it is working */ 1914 rdev->flags = 0; 1915 set_bit(In_sync, &rdev->flags); 1916 return len; 1917} 1918 1919 1920static struct rdev_sysfs_entry rdev_slot = 1921__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); 1922 1923static ssize_t 1924offset_show(mdk_rdev_t *rdev, char *page) 1925{ 1926 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); 1927} 1928 1929static ssize_t 1930offset_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1931{ 1932 char *e; 1933 unsigned long long offset = simple_strtoull(buf, &e, 10); 1934 if (e==buf || (*e && *e != '\n')) 1935 return -EINVAL; 1936 if (rdev->mddev->pers) 1937 return -EBUSY; 1938 rdev->data_offset = offset; 1939 return len; 1940} 1941 1942static struct rdev_sysfs_entry rdev_offset = 1943__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); 1944 1945static ssize_t 1946rdev_size_show(mdk_rdev_t *rdev, char *page) 1947{ 1948 return sprintf(page, "%llu\n", (unsigned long long)rdev->size); 1949} 1950 1951static ssize_t 1952rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1953{ 1954 char *e; 1955 unsigned long long size = simple_strtoull(buf, &e, 10); 1956 if (e==buf || (*e && *e != '\n')) 1957 return -EINVAL; 1958 if (rdev->mddev->pers) 1959 return -EBUSY; 1960 rdev->size = size; 1961 if (size < rdev->mddev->size || rdev->mddev->size == 0) 1962 rdev->mddev->size = size; 1963 return len; 1964} 1965 1966static struct rdev_sysfs_entry rdev_size = 1967__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 1968 
1969static struct attribute *rdev_default_attrs[] = { 1970 &rdev_state.attr, 1971 &rdev_super.attr, 1972 &rdev_errors.attr, 1973 &rdev_slot.attr, 1974 &rdev_offset.attr, 1975 &rdev_size.attr, 1976 NULL, 1977}; 1978static ssize_t 1979rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 1980{ 1981 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 1982 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); 1983 1984 if (!entry->show) 1985 return -EIO; 1986 return entry->show(rdev, page); 1987} 1988 1989static ssize_t 1990rdev_attr_store(struct kobject *kobj, struct attribute *attr, 1991 const char *page, size_t length) 1992{ 1993 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 1994 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); 1995 1996 if (!entry->store) 1997 return -EIO; 1998 if (!capable(CAP_SYS_ADMIN)) 1999 return -EACCES; 2000 return entry->store(rdev, page, length); 2001} 2002 2003static void rdev_free(struct kobject *ko) 2004{ 2005 mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj); 2006 kfree(rdev); 2007} 2008static struct sysfs_ops rdev_sysfs_ops = { 2009 .show = rdev_attr_show, 2010 .store = rdev_attr_store, 2011}; 2012static struct kobj_type rdev_ktype = { 2013 .release = rdev_free, 2014 .sysfs_ops = &rdev_sysfs_ops, 2015 .default_attrs = rdev_default_attrs, 2016}; 2017 2018/* 2019 * Import a device. If 'super_format' >= 0, then sanity check the superblock 2020 * 2021 * mark the device faulty if: 2022 * 2023 * - the device is nonexistent (zero size) 2024 * - the device has no valid superblock 2025 * 2026 * a faulty rdev _never_ has rdev->sb set. 
2027 */ 2028static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor) 2029{ 2030 char b[BDEVNAME_SIZE]; 2031 int err; 2032 mdk_rdev_t *rdev; 2033 sector_t size; 2034 2035 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 2036 if (!rdev) { 2037 printk(KERN_ERR "md: could not alloc mem for new device!\n"); 2038 return ERR_PTR(-ENOMEM); 2039 } 2040 2041 if ((err = alloc_disk_sb(rdev))) 2042 goto abort_free; 2043 2044 err = lock_rdev(rdev, newdev); 2045 if (err) 2046 goto abort_free; 2047 2048 rdev->kobj.parent = NULL; 2049 rdev->kobj.ktype = &rdev_ktype; 2050 kobject_init(&rdev->kobj); 2051 2052 rdev->desc_nr = -1; 2053 rdev->saved_raid_disk = -1; 2054 rdev->raid_disk = -1; 2055 rdev->flags = 0; 2056 rdev->data_offset = 0; 2057 rdev->sb_events = 0; 2058 atomic_set(&rdev->nr_pending, 0); 2059 atomic_set(&rdev->read_errors, 0); 2060 atomic_set(&rdev->corrected_errors, 0); 2061 2062 size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 2063 if (!size) { 2064 printk(KERN_WARNING 2065 "md: %s has zero or unknown size, marking faulty!\n", 2066 bdevname(rdev->bdev,b)); 2067 err = -EINVAL; 2068 goto abort_free; 2069 } 2070 2071 if (super_format >= 0) { 2072 err = super_types[super_format]. 
2073 load_super(rdev, NULL, super_minor); 2074 if (err == -EINVAL) { 2075 printk(KERN_WARNING 2076 "md: %s has invalid sb, not importing!\n", 2077 bdevname(rdev->bdev,b)); 2078 goto abort_free; 2079 } 2080 if (err < 0) { 2081 printk(KERN_WARNING 2082 "md: could not read %s's sb, not importing!\n", 2083 bdevname(rdev->bdev,b)); 2084 goto abort_free; 2085 } 2086 } 2087 INIT_LIST_HEAD(&rdev->same_set); 2088 2089 return rdev; 2090 2091abort_free: 2092 if (rdev->sb_page) { 2093 if (rdev->bdev) 2094 unlock_rdev(rdev); 2095 free_disk_sb(rdev); 2096 } 2097 kfree(rdev); 2098 return ERR_PTR(err); 2099} 2100 2101/* 2102 * Check a full RAID array for plausibility 2103 */ 2104 2105 2106static void analyze_sbs(mddev_t * mddev) 2107{ 2108 int i; 2109 struct list_head *tmp; 2110 mdk_rdev_t *rdev, *freshest; 2111 char b[BDEVNAME_SIZE]; 2112 2113 freshest = NULL; 2114 ITERATE_RDEV(mddev,rdev,tmp) 2115 switch (super_types[mddev->major_version]. 2116 load_super(rdev, freshest, mddev->minor_version)) { 2117 case 1: 2118 freshest = rdev; 2119 break; 2120 case 0: 2121 break; 2122 default: 2123 printk( KERN_ERR \ 2124 "md: fatal superblock inconsistency in %s" 2125 " -- removing from array\n", 2126 bdevname(rdev->bdev,b)); 2127 kick_rdev_from_array(rdev); 2128 } 2129 2130 2131 super_types[mddev->major_version]. 2132 validate_super(mddev, freshest); 2133 2134 i = 0; 2135 ITERATE_RDEV(mddev,rdev,tmp) { 2136 if (rdev != freshest) 2137 if (super_types[mddev->major_version]. 
2138 validate_super(mddev, rdev)) { 2139 printk(KERN_WARNING "md: kicking non-fresh %s" 2140 " from array!\n", 2141 bdevname(rdev->bdev,b)); 2142 kick_rdev_from_array(rdev); 2143 continue; 2144 } 2145 if (mddev->level == LEVEL_MULTIPATH) { 2146 rdev->desc_nr = i++; 2147 rdev->raid_disk = rdev->desc_nr; 2148 set_bit(In_sync, &rdev->flags); 2149 } else if (rdev->raid_disk >= mddev->raid_disks) { 2150 rdev->raid_disk = -1; 2151 clear_bit(In_sync, &rdev->flags); 2152 } 2153 } 2154 2155 2156 2157 if (mddev->recovery_cp != MaxSector && 2158 mddev->level >= 1) 2159 printk(KERN_ERR "md: %s: raid array is not clean" 2160 " -- starting background reconstruction\n", 2161 mdname(mddev)); 2162 2163} 2164 2165static ssize_t 2166safe_delay_show(mddev_t *mddev, char *page) 2167{ 2168 int msec = (mddev->safemode_delay*1000)/HZ; 2169 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000); 2170} 2171static ssize_t 2172safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len) 2173{ 2174 int scale=1; 2175 int dot=0; 2176 int i; 2177 unsigned long msec; 2178 char buf[30]; 2179 char *e; 2180 /* remove a period, and count digits after it */ 2181 if (len >= sizeof(buf)) 2182 return -EINVAL; 2183 strlcpy(buf, cbuf, len); 2184 buf[len] = 0; 2185 for (i=0; i<len; i++) { 2186 if (dot) { 2187 if (isdigit(buf[i])) { 2188 buf[i-1] = buf[i]; 2189 scale *= 10; 2190 } 2191 buf[i] = 0; 2192 } else if (buf[i] == '.') { 2193 dot=1; 2194 buf[i] = 0; 2195 } 2196 } 2197 msec = simple_strtoul(buf, &e, 10); 2198 if (e == buf || (*e && *e != '\n')) 2199 return -EINVAL; 2200 msec = (msec * 1000) / scale; 2201 if (msec == 0) 2202 mddev->safemode_delay = 0; 2203 else { 2204 mddev->safemode_delay = (msec*HZ)/1000; 2205 if (mddev->safemode_delay == 0) 2206 mddev->safemode_delay = 1; 2207 } 2208 return len; 2209} 2210static struct md_sysfs_entry md_safe_delay = 2211__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); 2212 2213static ssize_t 2214level_show(mddev_t *mddev, char *page) 
2215{ 2216 struct mdk_personality *p = mddev->pers; 2217 if (p) 2218 return sprintf(page, "%s\n", p->name); 2219 else if (mddev->clevel[0]) 2220 return sprintf(page, "%s\n", mddev->clevel); 2221 else if (mddev->level != LEVEL_NONE) 2222 return sprintf(page, "%d\n", mddev->level); 2223 else 2224 return 0; 2225} 2226 2227static ssize_t 2228level_store(mddev_t *mddev, const char *buf, size_t len) 2229{ 2230 int rv = len; 2231 if (mddev->pers) 2232 return -EBUSY; 2233 if (len == 0) 2234 return 0; 2235 if (len >= sizeof(mddev->clevel)) 2236 return -ENOSPC; 2237 strncpy(mddev->clevel, buf, len); 2238 if (mddev->clevel[len-1] == '\n') 2239 len--; 2240 mddev->clevel[len] = 0; 2241 mddev->level = LEVEL_NONE; 2242 return rv; 2243} 2244 2245static struct md_sysfs_entry md_level = 2246__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); 2247 2248 2249static ssize_t 2250layout_show(mddev_t *mddev, char *page) 2251{ 2252 /* just a number, not meaningful for all levels */ 2253 if (mddev->reshape_position != MaxSector && 2254 mddev->layout != mddev->new_layout) 2255 return sprintf(page, "%d (%d)\n", 2256 mddev->new_layout, mddev->layout); 2257 return sprintf(page, "%d\n", mddev->layout); 2258} 2259 2260static ssize_t 2261layout_store(mddev_t *mddev, const char *buf, size_t len) 2262{ 2263 char *e; 2264 unsigned long n = simple_strtoul(buf, &e, 10); 2265 2266 if (!*buf || (*e && *e != '\n')) 2267 return -EINVAL; 2268 2269 if (mddev->pers) 2270 return -EBUSY; 2271 if (mddev->reshape_position != MaxSector) 2272 mddev->new_layout = n; 2273 else 2274 mddev->layout = n; 2275 return len; 2276} 2277static struct md_sysfs_entry md_layout = 2278__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); 2279 2280 2281static ssize_t 2282raid_disks_show(mddev_t *mddev, char *page) 2283{ 2284 if (mddev->raid_disks == 0) 2285 return 0; 2286 if (mddev->reshape_position != MaxSector && 2287 mddev->delta_disks != 0) 2288 return sprintf(page, "%d (%d)\n", mddev->raid_disks, 2289 
mddev->raid_disks - mddev->delta_disks); 2290 return sprintf(page, "%d\n", mddev->raid_disks); 2291} 2292 2293static int update_raid_disks(mddev_t *mddev, int raid_disks); 2294 2295static ssize_t 2296raid_disks_store(mddev_t *mddev, const char *buf, size_t len) 2297{ 2298 char *e; 2299 int rv = 0; 2300 unsigned long n = simple_strtoul(buf, &e, 10); 2301 2302 if (!*buf || (*e && *e != '\n')) 2303 return -EINVAL; 2304 2305 if (mddev->pers) 2306 rv = update_raid_disks(mddev, n); 2307 else if (mddev->reshape_position != MaxSector) { 2308 int olddisks = mddev->raid_disks - mddev->delta_disks; 2309 mddev->delta_disks = n - olddisks; 2310 mddev->raid_disks = n; 2311 } else 2312 mddev->raid_disks = n; 2313 return rv ? rv : len; 2314} 2315static struct md_sysfs_entry md_raid_disks = 2316__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); 2317 2318static ssize_t 2319chunk_size_show(mddev_t *mddev, char *page) 2320{ 2321 if (mddev->reshape_position != MaxSector && 2322 mddev->chunk_size != mddev->new_chunk) 2323 return sprintf(page, "%d (%d)\n", mddev->new_chunk, 2324 mddev->chunk_size); 2325 return sprintf(page, "%d\n", mddev->chunk_size); 2326} 2327 2328static ssize_t 2329chunk_size_store(mddev_t *mddev, const char *buf, size_t len) 2330{ 2331 /* can only set chunk_size if array is not yet active */ 2332 char *e; 2333 unsigned long n = simple_strtoul(buf, &e, 10); 2334 2335 if (!*buf || (*e && *e != '\n')) 2336 return -EINVAL; 2337 2338 if (mddev->pers) 2339 return -EBUSY; 2340 else if (mddev->reshape_position != MaxSector) 2341 mddev->new_chunk = n; 2342 else 2343 mddev->chunk_size = n; 2344 return len; 2345} 2346static struct md_sysfs_entry md_chunk_size = 2347__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); 2348 2349static ssize_t 2350resync_start_show(mddev_t *mddev, char *page) 2351{ 2352 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); 2353} 2354 2355static ssize_t 2356resync_start_store(mddev_t *mddev, 
const char *buf, size_t len) 2357{ 2358 /* can only set chunk_size if array is not yet active */ 2359 char *e; 2360 unsigned long long n = simple_strtoull(buf, &e, 10); 2361 2362 if (mddev->pers) 2363 return -EBUSY; 2364 if (!*buf || (*e && *e != '\n')) 2365 return -EINVAL; 2366 2367 mddev->recovery_cp = n; 2368 return len; 2369} 2370static struct md_sysfs_entry md_resync_start = 2371__ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store); 2372 2373/* 2374 * The array state can be: 2375 * 2376 * clear 2377 * No devices, no size, no level 2378 * Equivalent to STOP_ARRAY ioctl 2379 * inactive 2380 * May have some settings, but array is not active 2381 * all IO results in error 2382 * When written, doesn't tear down array, but just stops it 2383 * suspended (not supported yet) 2384 * All IO requests will block. The array can be reconfigured. 2385 * Writing this, if accepted, will block until array is quiessent 2386 * readonly 2387 * no resync can happen. no superblocks get written. 2388 * write requests fail 2389 * read-auto 2390 * like readonly, but behaves like 'clean' on a write request. 2391 * 2392 * clean - no pending writes, but otherwise active. 2393 * When written to inactive array, starts without resync 2394 * If a write request arrives then 2395 * if metadata is known, mark 'dirty' and switch to 'active'. 2396 * if not known, block and switch to write-pending 2397 * If written to an active array that has pending writes, then fails. 2398 * active 2399 * fully active: IO and resync can be happening. 2400 * When written to inactive array, starts with resync 2401 * 2402 * write-pending 2403 * clean, but writes are blocked waiting for 'active' to be written. 2404 * 2405 * active-idle 2406 * like active, but no writes have been seen for a while (100msec). 
2407 * 2408 */ 2409enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, 2410 write_pending, active_idle, bad_word}; 2411static char *array_states[] = { 2412 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", 2413 "write-pending", "active-idle", NULL }; 2414 2415static int match_word(const char *word, char **list) 2416{ 2417 int n; 2418 for (n=0; list[n]; n++) 2419 if (cmd_match(word, list[n])) 2420 break; 2421 return n; 2422} 2423 2424static ssize_t 2425array_state_show(mddev_t *mddev, char *page) 2426{ 2427 enum array_state st = inactive; 2428 2429 if (mddev->pers) 2430 switch(mddev->ro) { 2431 case 1: 2432 st = readonly; 2433 break; 2434 case 2: 2435 st = read_auto; 2436 break; 2437 case 0: 2438 if (mddev->in_sync) 2439 st = clean; 2440 else if (mddev->safemode) 2441 st = active_idle; 2442 else 2443 st = active; 2444 } 2445 else { 2446 if (list_empty(&mddev->disks) && 2447 mddev->raid_disks == 0 && 2448 mddev->size == 0) 2449 st = clear; 2450 else 2451 st = inactive; 2452 } 2453 return sprintf(page, "%s\n", array_states[st]); 2454} 2455 2456static int do_md_stop(mddev_t * mddev, int ro); 2457static int do_md_run(mddev_t * mddev); 2458static int restart_array(mddev_t *mddev); 2459 2460static ssize_t 2461array_state_store(mddev_t *mddev, const char *buf, size_t len) 2462{ 2463 int err = -EINVAL; 2464 enum array_state st = match_word(buf, array_states); 2465 switch(st) { 2466 case bad_word: 2467 break; 2468 case clear: 2469 /* stopping an active array */ 2470 if (mddev->pers) { 2471 if (atomic_read(&mddev->active) > 1) 2472 return -EBUSY; 2473 err = do_md_stop(mddev, 0); 2474 } 2475 break; 2476 case inactive: 2477 /* stopping an active array */ 2478 if (mddev->pers) { 2479 if (atomic_read(&mddev->active) > 1) 2480 return -EBUSY; 2481 err = do_md_stop(mddev, 2); 2482 } 2483 break; 2484 case suspended: 2485 break; /* not supported yet */ 2486 case readonly: 2487 if (mddev->pers) 2488 err = do_md_stop(mddev, 1); 
2489 else { 2490 mddev->ro = 1; 2491 err = do_md_run(mddev); 2492 } 2493 break; 2494 case read_auto: 2495 /* stopping an active array */ 2496 if (mddev->pers) { 2497 err = do_md_stop(mddev, 1); 2498 if (err == 0) 2499 mddev->ro = 2; 2500 } else { 2501 mddev->ro = 2; 2502 err = do_md_run(mddev); 2503 } 2504 break; 2505 case clean: 2506 if (mddev->pers) { 2507 restart_array(mddev); 2508 spin_lock_irq(&mddev->write_lock); 2509 if (atomic_read(&mddev->writes_pending) == 0) { 2510 mddev->in_sync = 1; 2511 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 2512 } 2513 spin_unlock_irq(&mddev->write_lock); 2514 } else { 2515 mddev->ro = 0; 2516 mddev->recovery_cp = MaxSector; 2517 err = do_md_run(mddev); 2518 } 2519 break; 2520 case active: 2521 if (mddev->pers) { 2522 restart_array(mddev); 2523 clear_bit(MD_CHANGE_CLEAN, &mddev->flags); 2524 wake_up(&mddev->sb_wait); 2525 err = 0; 2526 } else { 2527 mddev->ro = 0; 2528 err = do_md_run(mddev); 2529 } 2530 break; 2531 case write_pending: 2532 case active_idle: 2533 /* these cannot be set */ 2534 break; 2535 } 2536 if (err) 2537 return err; 2538 else 2539 return len; 2540} 2541static struct md_sysfs_entry md_array_state = 2542__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 2543 2544static ssize_t 2545null_show(mddev_t *mddev, char *page) 2546{ 2547 return -EINVAL; 2548} 2549 2550static ssize_t 2551new_dev_store(mddev_t *mddev, const char *buf, size_t len) 2552{ 2553 /* buf must be %d:%d\n? giving major and minor numbers */ 2554 /* The new device is added to the array. 2555 * If the array has a persistent superblock, we read the 2556 * superblock to initialise info and check validity. 2557 * Otherwise, only checking done is that in bind_rdev_to_array, 2558 * which mainly checks size. 
2559 */ 2560 char *e; 2561 int major = simple_strtoul(buf, &e, 10); 2562 int minor; 2563 dev_t dev; 2564 mdk_rdev_t *rdev; 2565 int err; 2566 2567 if (!*buf || *e != ':' || !e[1] || e[1] == '\n') 2568 return -EINVAL; 2569 minor = simple_strtoul(e+1, &e, 10); 2570 if (*e && *e != '\n') 2571 return -EINVAL; 2572 dev = MKDEV(major, minor); 2573 if (major != MAJOR(dev) || 2574 minor != MINOR(dev)) 2575 return -EOVERFLOW; 2576 2577 2578 if (mddev->persistent) { 2579 rdev = md_import_device(dev, mddev->major_version, 2580 mddev->minor_version); 2581 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { 2582 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, 2583 mdk_rdev_t, same_set); 2584 err = super_types[mddev->major_version] 2585 .load_super(rdev, rdev0, mddev->minor_version); 2586 if (err < 0) 2587 goto out; 2588 } 2589 } else 2590 rdev = md_import_device(dev, -1, -1); 2591 2592 if (IS_ERR(rdev)) 2593 return PTR_ERR(rdev); 2594 err = bind_rdev_to_array(rdev, mddev); 2595 out: 2596 if (err) 2597 export_rdev(rdev); 2598 return err ? err : len; 2599} 2600 2601static struct md_sysfs_entry md_new_device = 2602__ATTR(new_dev, S_IWUSR, null_show, new_dev_store); 2603 2604static ssize_t 2605bitmap_store(mddev_t *mddev, const char *buf, size_t len) 2606{ 2607 char *end; 2608 unsigned long chunk, end_chunk; 2609 2610 if (!mddev->bitmap) 2611 goto out; 2612 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... 
(range) */ 2613 while (*buf) { 2614 chunk = end_chunk = simple_strtoul(buf, &end, 0); 2615 if (buf == end) break; 2616 if (*end == '-') { /* range */ 2617 buf = end + 1; 2618 end_chunk = simple_strtoul(buf, &end, 0); 2619 if (buf == end) break; 2620 } 2621 if (*end && !isspace(*end)) break; 2622 bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); 2623 buf = end; 2624 while (isspace(*buf)) buf++; 2625 } 2626 bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ 2627out: 2628 return len; 2629} 2630 2631static struct md_sysfs_entry md_bitmap = 2632__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); 2633 2634static ssize_t 2635size_show(mddev_t *mddev, char *page) 2636{ 2637 return sprintf(page, "%llu\n", (unsigned long long)mddev->size); 2638} 2639 2640static int update_size(mddev_t *mddev, unsigned long size); 2641 2642static ssize_t 2643size_store(mddev_t *mddev, const char *buf, size_t len) 2644{ 2645 /* If array is inactive, we can reduce the component size, but 2646 * not increase it (except from 0). 2647 * If array is active, we can try an on-line resize 2648 */ 2649 char *e; 2650 int err = 0; 2651 unsigned long long size = simple_strtoull(buf, &e, 10); 2652 if (!*buf || *buf == '\n' || 2653 (*e && *e != '\n')) 2654 return -EINVAL; 2655 2656 if (mddev->pers) { 2657 err = update_size(mddev, size); 2658 md_update_sb(mddev, 1); 2659 } else { 2660 if (mddev->size == 0 || 2661 mddev->size > size) 2662 mddev->size = size; 2663 else 2664 err = -ENOSPC; 2665 } 2666 return err ? err : len; 2667} 2668 2669static struct md_sysfs_entry md_size = 2670__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); 2671 2672 2673/* Metdata version. 
2674 * This is either 'none' for arrays with externally managed metadata, 2675 * or N.M for internally known formats 2676 */ 2677static ssize_t 2678metadata_show(mddev_t *mddev, char *page) 2679{ 2680 if (mddev->persistent) 2681 return sprintf(page, "%d.%d\n", 2682 mddev->major_version, mddev->minor_version); 2683 else 2684 return sprintf(page, "none\n"); 2685} 2686 2687static ssize_t 2688metadata_store(mddev_t *mddev, const char *buf, size_t len) 2689{ 2690 int major, minor; 2691 char *e; 2692 if (!list_empty(&mddev->disks)) 2693 return -EBUSY; 2694 2695 if (cmd_match(buf, "none")) { 2696 mddev->persistent = 0; 2697 mddev->major_version = 0; 2698 mddev->minor_version = 90; 2699 return len; 2700 } 2701 major = simple_strtoul(buf, &e, 10); 2702 if (e==buf || *e != '.') 2703 return -EINVAL; 2704 buf = e+1; 2705 minor = simple_strtoul(buf, &e, 10); 2706 if (e==buf || (*e && *e != '\n') ) 2707 return -EINVAL; 2708 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) 2709 return -ENOENT; 2710 mddev->major_version = major; 2711 mddev->minor_version = minor; 2712 mddev->persistent = 1; 2713 return len; 2714} 2715 2716static struct md_sysfs_entry md_metadata = 2717__ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); 2718 2719static ssize_t 2720action_show(mddev_t *mddev, char *page) 2721{ 2722 char *type = "idle"; 2723 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 2724 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) { 2725 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 2726 type = "reshape"; 2727 else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 2728 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 2729 type = "resync"; 2730 else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 2731 type = "check"; 2732 else 2733 type = "repair"; 2734 } else 2735 type = "recover"; 2736 } 2737 return sprintf(page, "%s\n", type); 2738} 2739 2740static ssize_t 2741action_store(mddev_t *mddev, const char *page, size_t len) 
2742{ 2743 if (!mddev->pers || !mddev->pers->sync_request) 2744 return -EINVAL; 2745 2746 if (cmd_match(page, "idle")) { 2747 if (mddev->sync_thread) { 2748 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 2749 md_unregister_thread(mddev->sync_thread); 2750 mddev->sync_thread = NULL; 2751 mddev->recovery = 0; 2752 } 2753 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 2754 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 2755 return -EBUSY; 2756 else if (cmd_match(page, "resync") || cmd_match(page, "recover")) 2757 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2758 else if (cmd_match(page, "reshape")) { 2759 int err; 2760 if (mddev->pers->start_reshape == NULL) 2761 return -EINVAL; 2762 err = mddev->pers->start_reshape(mddev); 2763 if (err) 2764 return err; 2765 } else { 2766 if (cmd_match(page, "check")) 2767 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 2768 else if (!cmd_match(page, "repair")) 2769 return -EINVAL; 2770 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 2771 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 2772 } 2773 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2774 md_wakeup_thread(mddev->thread); 2775 return len; 2776} 2777 2778static ssize_t 2779mismatch_cnt_show(mddev_t *mddev, char *page) 2780{ 2781 return sprintf(page, "%llu\n", 2782 (unsigned long long) mddev->resync_mismatches); 2783} 2784 2785static struct md_sysfs_entry md_scan_mode = 2786__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 2787 2788 2789static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); 2790 2791static ssize_t 2792sync_min_show(mddev_t *mddev, char *page) 2793{ 2794 return sprintf(page, "%d (%s)\n", speed_min(mddev), 2795 mddev->sync_speed_min ? 
"local": "system"); 2796} 2797 2798static ssize_t 2799sync_min_store(mddev_t *mddev, const char *buf, size_t len) 2800{ 2801 int min; 2802 char *e; 2803 if (strncmp(buf, "system", 6)==0) { 2804 mddev->sync_speed_min = 0; 2805 return len; 2806 } 2807 min = simple_strtoul(buf, &e, 10); 2808 if (buf == e || (*e && *e != '\n') || min <= 0) 2809 return -EINVAL; 2810 mddev->sync_speed_min = min; 2811 return len; 2812} 2813 2814static struct md_sysfs_entry md_sync_min = 2815__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); 2816 2817static ssize_t 2818sync_max_show(mddev_t *mddev, char *page) 2819{ 2820 return sprintf(page, "%d (%s)\n", speed_max(mddev), 2821 mddev->sync_speed_max ? "local": "system"); 2822} 2823 2824static ssize_t 2825sync_max_store(mddev_t *mddev, const char *buf, size_t len) 2826{ 2827 int max; 2828 char *e; 2829 if (strncmp(buf, "system", 6)==0) { 2830 mddev->sync_speed_max = 0; 2831 return len; 2832 } 2833 max = simple_strtoul(buf, &e, 10); 2834 if (buf == e || (*e && *e != '\n') || max <= 0) 2835 return -EINVAL; 2836 mddev->sync_speed_max = max; 2837 return len; 2838} 2839 2840static struct md_sysfs_entry md_sync_max = 2841__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); 2842 2843 2844static ssize_t 2845sync_speed_show(mddev_t *mddev, char *page) 2846{ 2847 unsigned long resync, dt, db; 2848 resync = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)); 2849 dt = ((jiffies - mddev->resync_mark) / HZ); 2850 if (!dt) dt++; 2851 db = resync - (mddev->resync_mark_cnt); 2852 return sprintf(page, "%ld\n", db/dt/2); /* K/sec */ 2853} 2854 2855static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 2856 2857static ssize_t 2858sync_completed_show(mddev_t *mddev, char *page) 2859{ 2860 unsigned long max_blocks, resync; 2861 2862 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 2863 max_blocks = mddev->resync_max_sectors; 2864 else 2865 max_blocks = mddev->size << 1; 2866 2867 resync = 
(mddev->curr_resync - atomic_read(&mddev->recovery_active)); 2868 return sprintf(page, "%lu / %lu\n", resync, max_blocks); 2869} 2870 2871static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); 2872 2873static ssize_t 2874suspend_lo_show(mddev_t *mddev, char *page) 2875{ 2876 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); 2877} 2878 2879static ssize_t 2880suspend_lo_store(mddev_t *mddev, const char *buf, size_t len) 2881{ 2882 char *e; 2883 unsigned long long new = simple_strtoull(buf, &e, 10); 2884 2885 if (mddev->pers->quiesce == NULL) 2886 return -EINVAL; 2887 if (buf == e || (*e && *e != '\n')) 2888 return -EINVAL; 2889 if (new >= mddev->suspend_hi || 2890 (new > mddev->suspend_lo && new < mddev->suspend_hi)) { 2891 mddev->suspend_lo = new; 2892 mddev->pers->quiesce(mddev, 2); 2893 return len; 2894 } else 2895 return -EINVAL; 2896} 2897static struct md_sysfs_entry md_suspend_lo = 2898__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 2899 2900 2901static ssize_t 2902suspend_hi_show(mddev_t *mddev, char *page) 2903{ 2904 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi); 2905} 2906 2907static ssize_t 2908suspend_hi_store(mddev_t *mddev, const char *buf, size_t len) 2909{ 2910 char *e; 2911 unsigned long long new = simple_strtoull(buf, &e, 10); 2912 2913 if (mddev->pers->quiesce == NULL) 2914 return -EINVAL; 2915 if (buf == e || (*e && *e != '\n')) 2916 return -EINVAL; 2917 if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) || 2918 (new > mddev->suspend_lo && new > mddev->suspend_hi)) { 2919 mddev->suspend_hi = new; 2920 mddev->pers->quiesce(mddev, 1); 2921 mddev->pers->quiesce(mddev, 0); 2922 return len; 2923 } else 2924 return -EINVAL; 2925} 2926static struct md_sysfs_entry md_suspend_hi = 2927__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 2928 2929static ssize_t 2930reshape_position_show(mddev_t *mddev, char *page) 2931{ 2932 if 
(mddev->reshape_position != MaxSector) 2933 return sprintf(page, "%llu\n", 2934 (unsigned long long)mddev->reshape_position); 2935 strcpy(page, "none\n"); 2936 return 5; 2937} 2938 2939static ssize_t 2940reshape_position_store(mddev_t *mddev, const char *buf, size_t len) 2941{ 2942 char *e; 2943 unsigned long long new = simple_strtoull(buf, &e, 10); 2944 if (mddev->pers) 2945 return -EBUSY; 2946 if (buf == e || (*e && *e != '\n')) 2947 return -EINVAL; 2948 mddev->reshape_position = new; 2949 mddev->delta_disks = 0; 2950 mddev->new_level = mddev->level; 2951 mddev->new_layout = mddev->layout; 2952 mddev->new_chunk = mddev->chunk_size; 2953 return len; 2954} 2955 2956static struct md_sysfs_entry md_reshape_position = 2957__ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, 2958 reshape_position_store); 2959 2960 2961static struct attribute *md_default_attrs[] = { 2962 &md_level.attr, 2963 &md_layout.attr, 2964 &md_raid_disks.attr, 2965 &md_chunk_size.attr, 2966 &md_size.attr, 2967 &md_resync_start.attr, 2968 &md_metadata.attr, 2969 &md_new_device.attr, 2970 &md_safe_delay.attr, 2971 &md_array_state.attr, 2972 &md_reshape_position.attr, 2973 NULL, 2974}; 2975 2976static struct attribute *md_redundancy_attrs[] = { 2977 &md_scan_mode.attr, 2978 &md_mismatches.attr, 2979 &md_sync_min.attr, 2980 &md_sync_max.attr, 2981 &md_sync_speed.attr, 2982 &md_sync_completed.attr, 2983 &md_suspend_lo.attr, 2984 &md_suspend_hi.attr, 2985 &md_bitmap.attr, 2986 NULL, 2987}; 2988static struct attribute_group md_redundancy_group = { 2989 .name = NULL, 2990 .attrs = md_redundancy_attrs, 2991}; 2992 2993 2994static ssize_t 2995md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 2996{ 2997 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 2998 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); 2999 ssize_t rv; 3000 3001 if (!entry->show) 3002 return -EIO; 3003 rv = mddev_lock(mddev); 3004 if (!rv) { 3005 rv = 
entry->show(mddev, page); 3006 mddev_unlock(mddev); 3007 } 3008 return rv; 3009} 3010 3011static ssize_t 3012md_attr_store(struct kobject *kobj, struct attribute *attr, 3013 const char *page, size_t length) 3014{ 3015 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 3016 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); 3017 ssize_t rv; 3018 3019 if (!entry->store) 3020 return -EIO; 3021 if (!capable(CAP_SYS_ADMIN)) 3022 return -EACCES; 3023 rv = mddev_lock(mddev); 3024 if (!rv) { 3025 rv = entry->store(mddev, page, length); 3026 mddev_unlock(mddev); 3027 } 3028 return rv; 3029} 3030 3031static void md_free(struct kobject *ko) 3032{ 3033 mddev_t *mddev = container_of(ko, mddev_t, kobj); 3034 kfree(mddev); 3035} 3036 3037static struct sysfs_ops md_sysfs_ops = { 3038 .show = md_attr_show, 3039 .store = md_attr_store, 3040}; 3041static struct kobj_type md_ktype = { 3042 .release = md_free, 3043 .sysfs_ops = &md_sysfs_ops, 3044 .default_attrs = md_default_attrs, 3045}; 3046 3047int mdp_major = 0; 3048 3049static struct kobject *md_probe(dev_t dev, int *part, void *data) 3050{ 3051 static DEFINE_MUTEX(disks_mutex); 3052 mddev_t *mddev = mddev_find(dev); 3053 struct gendisk *disk; 3054 int partitioned = (MAJOR(dev) != MD_MAJOR); 3055 int shift = partitioned ? 
MdpMinorShift : 0; 3056 int unit = MINOR(dev) >> shift; 3057 3058 if (!mddev) 3059 return NULL; 3060 3061 mutex_lock(&disks_mutex); 3062 if (mddev->gendisk) { 3063 mutex_unlock(&disks_mutex); 3064 mddev_put(mddev); 3065 return NULL; 3066 } 3067 disk = alloc_disk(1 << shift); 3068 if (!disk) { 3069 mutex_unlock(&disks_mutex); 3070 mddev_put(mddev); 3071 return NULL; 3072 } 3073 disk->major = MAJOR(dev); 3074 disk->first_minor = unit << shift; 3075 if (partitioned) 3076 sprintf(disk->disk_name, "md_d%d", unit); 3077 else 3078 sprintf(disk->disk_name, "md%d", unit); 3079 disk->fops = &md_fops; 3080 disk->private_data = mddev; 3081 disk->queue = mddev->queue; 3082 add_disk(disk); 3083 mddev->gendisk = disk; 3084 mutex_unlock(&disks_mutex); 3085 mddev->kobj.parent = &disk->kobj; 3086 mddev->kobj.k_name = NULL; 3087 snprintf(mddev->kobj.name, KOBJ_NAME_LEN, "%s", "md"); 3088 mddev->kobj.ktype = &md_ktype; 3089 if (kobject_register(&mddev->kobj)) 3090 printk(KERN_WARNING "md: cannot register %s/md - name in use\n", 3091 disk->disk_name); 3092 return NULL; 3093} 3094 3095static void md_safemode_timeout(unsigned long data) 3096{ 3097 mddev_t *mddev = (mddev_t *) data; 3098 3099 mddev->safemode = 1; 3100 md_wakeup_thread(mddev->thread); 3101} 3102 3103static int start_dirty_degraded; 3104 3105static int do_md_run(mddev_t * mddev) 3106{ 3107 int err; 3108 int chunk_size; 3109 struct list_head *tmp; 3110 mdk_rdev_t *rdev; 3111 struct gendisk *disk; 3112 struct mdk_personality *pers; 3113 char b[BDEVNAME_SIZE]; 3114 3115 if (list_empty(&mddev->disks)) 3116 /* cannot run an array with no devices.. 
*/ 3117 return -EINVAL; 3118 3119 if (mddev->pers) 3120 return -EBUSY; 3121 3122 /* 3123 * Analyze all RAID superblock(s) 3124 */ 3125 if (!mddev->raid_disks) 3126 analyze_sbs(mddev); 3127 3128 chunk_size = mddev->chunk_size; 3129 3130 if (chunk_size) { 3131 if (chunk_size > MAX_CHUNK_SIZE) { 3132 printk(KERN_ERR "too big chunk_size: %d > %d\n", 3133 chunk_size, MAX_CHUNK_SIZE); 3134 return -EINVAL; 3135 } 3136 /* 3137 * chunk-size has to be a power of 2 and multiples of PAGE_SIZE 3138 */ 3139 if ( (1 << ffz(~chunk_size)) != chunk_size) { 3140 printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size); 3141 return -EINVAL; 3142 } 3143 if (chunk_size < PAGE_SIZE) { 3144 printk(KERN_ERR "too small chunk_size: %d < %ld\n", 3145 chunk_size, PAGE_SIZE); 3146 return -EINVAL; 3147 } 3148 3149 /* devices must have minimum size of one chunk */ 3150 ITERATE_RDEV(mddev,rdev,tmp) { 3151 if (test_bit(Faulty, &rdev->flags)) 3152 continue; 3153 if (rdev->size < chunk_size / 1024) { 3154 printk(KERN_WARNING 3155 "md: Dev %s smaller than chunk_size:" 3156 " %lluk < %dk\n", 3157 bdevname(rdev->bdev,b), 3158 (unsigned long long)rdev->size, 3159 chunk_size / 1024); 3160 return -EINVAL; 3161 } 3162 } 3163 } 3164 3165#ifdef CONFIG_KMOD 3166 if (mddev->level != LEVEL_NONE) 3167 request_module("md-level-%d", mddev->level); 3168 else if (mddev->clevel[0]) 3169 request_module("md-%s", mddev->clevel); 3170#endif 3171 3172 /* 3173 * Drop all container device buffers, from now on 3174 * the only valid external interface is through the md 3175 * device. 
3176 * Also find largest hardsector size 3177 */ 3178 ITERATE_RDEV(mddev,rdev,tmp) { 3179 if (test_bit(Faulty, &rdev->flags)) 3180 continue; 3181 sync_blockdev(rdev->bdev); 3182 invalidate_bdev(rdev->bdev); 3183 } 3184 3185 md_probe(mddev->unit, NULL, NULL); 3186 disk = mddev->gendisk; 3187 if (!disk) 3188 return -ENOMEM; 3189 3190 spin_lock(&pers_lock); 3191 pers = find_pers(mddev->level, mddev->clevel); 3192 if (!pers || !try_module_get(pers->owner)) { 3193 spin_unlock(&pers_lock); 3194 if (mddev->level != LEVEL_NONE) 3195 printk(KERN_WARNING "md: personality for level %d is not loaded!\n", 3196 mddev->level); 3197 else 3198 printk(KERN_WARNING "md: personality for level %s is not loaded!\n", 3199 mddev->clevel); 3200 return -EINVAL; 3201 } 3202 mddev->pers = pers; 3203 spin_unlock(&pers_lock); 3204 mddev->level = pers->level; 3205 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 3206 3207 if (mddev->reshape_position != MaxSector && 3208 pers->start_reshape == NULL) { 3209 /* This personality cannot handle reshaping... */ 3210 mddev->pers = NULL; 3211 module_put(pers->owner); 3212 return -EINVAL; 3213 } 3214 3215 if (pers->sync_request) { 3216 /* Warn if this is a potentially silly 3217 * configuration. 
3218 */ 3219 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 3220 mdk_rdev_t *rdev2; 3221 struct list_head *tmp2; 3222 int warned = 0; 3223 ITERATE_RDEV(mddev, rdev, tmp) { 3224 ITERATE_RDEV(mddev, rdev2, tmp2) { 3225 if (rdev < rdev2 && 3226 rdev->bdev->bd_contains == 3227 rdev2->bdev->bd_contains) { 3228 printk(KERN_WARNING 3229 "%s: WARNING: %s appears to be" 3230 " on the same physical disk as" 3231 " %s.\n", 3232 mdname(mddev), 3233 bdevname(rdev->bdev,b), 3234 bdevname(rdev2->bdev,b2)); 3235 warned = 1; 3236 } 3237 } 3238 } 3239 if (warned) 3240 printk(KERN_WARNING 3241 "True protection against single-disk" 3242 " failure might be compromised.\n"); 3243 } 3244 3245 mddev->recovery = 0; 3246 mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ 3247 mddev->barriers_work = 1; 3248 mddev->ok_start_degraded = start_dirty_degraded; 3249 3250 if (start_readonly) 3251 mddev->ro = 2; /* read-only, but switch on first write */ 3252 3253 err = mddev->pers->run(mddev); 3254 if (!err && mddev->pers->sync_request) { 3255 err = bitmap_create(mddev); 3256 if (err) { 3257 printk(KERN_ERR "%s: failed to create bitmap (%d)\n", 3258 mdname(mddev), err); 3259 mddev->pers->stop(mddev); 3260 } 3261 } 3262 if (err) { 3263 printk(KERN_ERR "md: pers->run() failed ...\n"); 3264 module_put(mddev->pers->owner); 3265 mddev->pers = NULL; 3266 bitmap_destroy(mddev); 3267 return err; 3268 } 3269 if (mddev->pers->sync_request) { 3270 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 3271 printk(KERN_WARNING 3272 "md: cannot register extra attributes for %s\n", 3273 mdname(mddev)); 3274 } else if (mddev->ro == 2) /* auto-readonly not meaningful */ 3275 mddev->ro = 0; 3276 3277 atomic_set(&mddev->writes_pending,0); 3278 mddev->safemode = 0; 3279 mddev->safemode_timer.function = md_safemode_timeout; 3280 mddev->safemode_timer.data = (unsigned long) mddev; 3281 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ 3282 mddev->in_sync = 1; 3283 3284 
ITERATE_RDEV(mddev,rdev,tmp) 3285 if (rdev->raid_disk >= 0) { 3286 char nm[20]; 3287 sprintf(nm, "rd%d", rdev->raid_disk); 3288 if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) 3289 printk("md: cannot register %s for %s\n", 3290 nm, mdname(mddev)); 3291 } 3292 3293 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3294 3295 if (mddev->flags) 3296 md_update_sb(mddev, 0); 3297 3298 set_capacity(disk, mddev->array_size<<1); 3299 3300 /* If we call blk_queue_make_request here, it will 3301 * re-initialise max_sectors etc which may have been 3302 * refined inside -> run. So just set the bits we need to set. 3303 * Most initialisation happended when we called 3304 * blk_queue_make_request(..., md_fail_request) 3305 * earlier. 3306 */ 3307 mddev->queue->queuedata = mddev; 3308 mddev->queue->make_request_fn = mddev->pers->make_request; 3309 3310 /* If there is a partially-recovered drive we need to 3311 * start recovery here. If we leave it to md_check_recovery, 3312 * it will remove the drives and not do the right thing 3313 */ 3314 if (mddev->degraded && !mddev->sync_thread) { 3315 struct list_head *rtmp; 3316 int spares = 0; 3317 ITERATE_RDEV(mddev,rdev,rtmp) 3318 if (rdev->raid_disk >= 0 && 3319 !test_bit(In_sync, &rdev->flags) && 3320 !test_bit(Faulty, &rdev->flags)) 3321 /* complete an interrupted recovery */ 3322 spares++; 3323 if (spares && mddev->pers->sync_request) { 3324 mddev->recovery = 0; 3325 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 3326 mddev->sync_thread = md_register_thread(md_do_sync, 3327 mddev, 3328 "%s_resync"); 3329 if (!mddev->sync_thread) { 3330 printk(KERN_ERR "%s: could not start resync" 3331 " thread...\n", 3332 mdname(mddev)); 3333 /* leave the spares where they are, it shouldn't hurt */ 3334 mddev->recovery = 0; 3335 } 3336 } 3337 } 3338 md_wakeup_thread(mddev->thread); 3339 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 3340 3341 mddev->changed = 1; 3342 md_new_event(mddev); 3343 
kobject_uevent(&mddev->gendisk->kobj, KOBJ_CHANGE); 3344 return 0; 3345} 3346 3347static int restart_array(mddev_t *mddev) 3348{ 3349 struct gendisk *disk = mddev->gendisk; 3350 int err; 3351 3352 /* 3353 * Complain if it has no devices 3354 */ 3355 err = -ENXIO; 3356 if (list_empty(&mddev->disks)) 3357 goto out; 3358 3359 if (mddev->pers) { 3360 err = -EBUSY; 3361 if (!mddev->ro) 3362 goto out; 3363 3364 mddev->safemode = 0; 3365 mddev->ro = 0; 3366 set_disk_ro(disk, 0); 3367 3368 printk(KERN_INFO "md: %s switched to read-write mode.\n", 3369 mdname(mddev)); 3370 /* 3371 * Kick recovery or resync if necessary 3372 */ 3373 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3374 md_wakeup_thread(mddev->thread); 3375 md_wakeup_thread(mddev->sync_thread); 3376 err = 0; 3377 } else 3378 err = -EINVAL; 3379 3380out: 3381 return err; 3382} 3383 3384/* similar to deny_write_access, but accounts for our holding a reference 3385 * to the file ourselves */ 3386static int deny_bitmap_write_access(struct file * file) 3387{ 3388 struct inode *inode = file->f_mapping->host; 3389 3390 spin_lock(&inode->i_lock); 3391 if (atomic_read(&inode->i_writecount) > 1) { 3392 spin_unlock(&inode->i_lock); 3393 return -ETXTBSY; 3394 } 3395 atomic_set(&inode->i_writecount, -1); 3396 spin_unlock(&inode->i_lock); 3397 3398 return 0; 3399} 3400 3401static void restore_bitmap_write_access(struct file *file) 3402{ 3403 struct inode *inode = file->f_mapping->host; 3404 3405 spin_lock(&inode->i_lock); 3406 atomic_set(&inode->i_writecount, 1); 3407 spin_unlock(&inode->i_lock); 3408} 3409 3410/* mode: 3411 * 0 - completely stop and dis-assemble array 3412 * 1 - switch to readonly 3413 * 2 - stop but do not disassemble array 3414 */ 3415static int do_md_stop(mddev_t * mddev, int mode) 3416{ 3417 int err = 0; 3418 struct gendisk *disk = mddev->gendisk; 3419 3420 if (mddev->pers) { 3421 if (atomic_read(&mddev->active)>2) { 3422 printk("md: %s still in use.\n",mdname(mddev)); 3423 return -EBUSY; 3424 } 3425 
3426 if (mddev->sync_thread) { 3427 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 3428 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 3429 md_unregister_thread(mddev->sync_thread); 3430 mddev->sync_thread = NULL; 3431 } 3432 3433 del_timer_sync(&mddev->safemode_timer); 3434 3435 invalidate_partition(disk, 0); 3436 3437 switch(mode) { 3438 case 1: /* readonly */ 3439 err = -ENXIO; 3440 if (mddev->ro==1) 3441 goto out; 3442 mddev->ro = 1; 3443 break; 3444 case 0: /* disassemble */ 3445 case 2: /* stop */ 3446 bitmap_flush(mddev); 3447 md_super_wait(mddev); 3448 if (mddev->ro) 3449 set_disk_ro(disk, 0); 3450 blk_queue_make_request(mddev->queue, md_fail_request); 3451 mddev->pers->stop(mddev); 3452 mddev->queue->merge_bvec_fn = NULL; 3453 mddev->queue->unplug_fn = NULL; 3454 mddev->queue->issue_flush_fn = NULL; 3455 mddev->queue->backing_dev_info.congested_fn = NULL; 3456 if (mddev->pers->sync_request) 3457 sysfs_remove_group(&mddev->kobj, &md_redundancy_group); 3458 3459 module_put(mddev->pers->owner); 3460 mddev->pers = NULL; 3461 3462 set_capacity(disk, 0); 3463 mddev->changed = 1; 3464 3465 if (mddev->ro) 3466 mddev->ro = 0; 3467 } 3468 if (!mddev->in_sync || mddev->flags) { 3469 /* mark array as shutdown cleanly */ 3470 mddev->in_sync = 1; 3471 md_update_sb(mddev, 1); 3472 } 3473 if (mode == 1) 3474 set_disk_ro(disk, 1); 3475 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 3476 } 3477 3478 /* 3479 * Free resources if final stop 3480 */ 3481 if (mode == 0) { 3482 mdk_rdev_t *rdev; 3483 struct list_head *tmp; 3484 3485 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); 3486 3487 bitmap_destroy(mddev); 3488 if (mddev->bitmap_file) { 3489 restore_bitmap_write_access(mddev->bitmap_file); 3490 fput(mddev->bitmap_file); 3491 mddev->bitmap_file = NULL; 3492 } 3493 mddev->bitmap_offset = 0; 3494 3495 ITERATE_RDEV(mddev,rdev,tmp) 3496 if (rdev->raid_disk >= 0) { 3497 char nm[20]; 3498 sprintf(nm, "rd%d", rdev->raid_disk); 3499 sysfs_remove_link(&mddev->kobj, nm); 3500 } 
3501 3502 /* make sure all delayed_delete calls have finished */ 3503 flush_scheduled_work(); 3504 3505 export_array(mddev); 3506 3507 mddev->array_size = 0; 3508 mddev->size = 0; 3509 mddev->raid_disks = 0; 3510 mddev->recovery_cp = 0; 3511 mddev->reshape_position = MaxSector; 3512 3513 } else if (mddev->pers) 3514 printk(KERN_INFO "md: %s switched to read-only mode.\n", 3515 mdname(mddev)); 3516 err = 0; 3517 md_new_event(mddev); 3518out: 3519 return err; 3520} 3521 3522#ifndef MODULE 3523static void autorun_array(mddev_t *mddev) 3524{ 3525 mdk_rdev_t *rdev; 3526 struct list_head *tmp; 3527 int err; 3528 3529 if (list_empty(&mddev->disks)) 3530 return; 3531 3532 printk(KERN_INFO "md: running: "); 3533 3534 ITERATE_RDEV(mddev,rdev,tmp) { 3535 char b[BDEVNAME_SIZE]; 3536 printk("<%s>", bdevname(rdev->bdev,b)); 3537 } 3538 printk("\n"); 3539 3540 err = do_md_run (mddev); 3541 if (err) { 3542 printk(KERN_WARNING "md: do_md_run() returned %d\n", err); 3543 do_md_stop (mddev, 0); 3544 } 3545} 3546 3547/* 3548 * lets try to run arrays based on all disks that have arrived 3549 * until now. (those are in pending_raid_disks) 3550 * 3551 * the method: pick the first pending disk, collect all disks with 3552 * the same UUID, remove all from the pending list and put them into 3553 * the 'same_array' list. Then order this list based on superblock 3554 * update time (freshest comes first), kick out 'old' disks and 3555 * compare superblocks. If everything's fine then run it. 
3556 * 3557 * If "unit" is allocated, then bump its reference count 3558 */ 3559static void autorun_devices(int part) 3560{ 3561 struct list_head *tmp; 3562 mdk_rdev_t *rdev0, *rdev; 3563 mddev_t *mddev; 3564 char b[BDEVNAME_SIZE]; 3565 3566 printk(KERN_INFO "md: autorun ...\n"); 3567 while (!list_empty(&pending_raid_disks)) { 3568 int unit; 3569 dev_t dev; 3570 LIST_HEAD(candidates); 3571 rdev0 = list_entry(pending_raid_disks.next, 3572 mdk_rdev_t, same_set); 3573 3574 printk(KERN_INFO "md: considering %s ...\n", 3575 bdevname(rdev0->bdev,b)); 3576 INIT_LIST_HEAD(&candidates); 3577 ITERATE_RDEV_PENDING(rdev,tmp) 3578 if (super_90_load(rdev, rdev0, 0) >= 0) { 3579 printk(KERN_INFO "md: adding %s ...\n", 3580 bdevname(rdev->bdev,b)); 3581 list_move(&rdev->same_set, &candidates); 3582 } 3583 /* 3584 * now we have a set of devices, with all of them having 3585 * mostly sane superblocks. It's time to allocate the 3586 * mddev. 3587 */ 3588 if (part) { 3589 dev = MKDEV(mdp_major, 3590 rdev0->preferred_minor << MdpMinorShift); 3591 unit = MINOR(dev) >> MdpMinorShift; 3592 } else { 3593 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 3594 unit = MINOR(dev); 3595 } 3596 if (rdev0->preferred_minor != unit) { 3597 printk(KERN_INFO "md: unit number in %s is bad: %d\n", 3598 bdevname(rdev0->bdev, b), rdev0->preferred_minor); 3599 break; 3600 } 3601 3602 md_probe(dev, NULL, NULL); 3603 mddev = mddev_find(dev); 3604 if (!mddev) { 3605 printk(KERN_ERR 3606 "md: cannot allocate memory for md drive.\n"); 3607 break; 3608 } 3609 if (mddev_lock(mddev)) 3610 printk(KERN_WARNING "md: %s locked, cannot run\n", 3611 mdname(mddev)); 3612 else if (mddev->raid_disks || mddev->major_version 3613 || !list_empty(&mddev->disks)) { 3614 printk(KERN_WARNING 3615 "md: %s already running, cannot run %s\n", 3616 mdname(mddev), bdevname(rdev0->bdev,b)); 3617 mddev_unlock(mddev); 3618 } else { 3619 printk(KERN_INFO "md: created %s\n", mdname(mddev)); 3620 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) { 
3621 list_del_init(&rdev->same_set); 3622 if (bind_rdev_to_array(rdev, mddev)) 3623 export_rdev(rdev); 3624 } 3625 autorun_array(mddev); 3626 mddev_unlock(mddev); 3627 } 3628 /* on success, candidates will be empty, on error 3629 * it won't... 3630 */ 3631 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) 3632 export_rdev(rdev); 3633 mddev_put(mddev); 3634 } 3635 printk(KERN_INFO "md: ... autorun DONE.\n"); 3636} 3637#endif /* !MODULE */ 3638 3639static int get_version(void __user * arg) 3640{ 3641 mdu_version_t ver; 3642 3643 ver.major = MD_MAJOR_VERSION; 3644 ver.minor = MD_MINOR_VERSION; 3645 ver.patchlevel = MD_PATCHLEVEL_VERSION; 3646 3647 if (copy_to_user(arg, &ver, sizeof(ver))) 3648 return -EFAULT; 3649 3650 return 0; 3651} 3652 3653static int get_array_info(mddev_t * mddev, void __user * arg) 3654{ 3655 mdu_array_info_t info; 3656 int nr,working,active,failed,spare; 3657 mdk_rdev_t *rdev; 3658 struct list_head *tmp; 3659 3660 nr=working=active=failed=spare=0; 3661 ITERATE_RDEV(mddev,rdev,tmp) { 3662 nr++; 3663 if (test_bit(Faulty, &rdev->flags)) 3664 failed++; 3665 else { 3666 working++; 3667 if (test_bit(In_sync, &rdev->flags)) 3668 active++; 3669 else 3670 spare++; 3671 } 3672 } 3673 3674 info.major_version = mddev->major_version; 3675 info.minor_version = mddev->minor_version; 3676 info.patch_version = MD_PATCHLEVEL_VERSION; 3677 info.ctime = mddev->ctime; 3678 info.level = mddev->level; 3679 info.size = mddev->size; 3680 if (info.size != mddev->size) /* overflow */ 3681 info.size = -1; 3682 info.nr_disks = nr; 3683 info.raid_disks = mddev->raid_disks; 3684 info.md_minor = mddev->md_minor; 3685 info.not_persistent= !mddev->persistent; 3686 3687 info.utime = mddev->utime; 3688 info.state = 0; 3689 if (mddev->in_sync) 3690 info.state = (1<<MD_SB_CLEAN); 3691 if (mddev->bitmap && mddev->bitmap_offset) 3692 info.state = (1<<MD_SB_BITMAP_PRESENT); 3693 info.active_disks = active; 3694 info.working_disks = working; 3695 info.failed_disks = failed; 3696 
info.spare_disks = spare; 3697 3698 info.layout = mddev->layout; 3699 info.chunk_size = mddev->chunk_size; 3700 3701 if (copy_to_user(arg, &info, sizeof(info))) 3702 return -EFAULT; 3703 3704 return 0; 3705} 3706 3707static int get_bitmap_file(mddev_t * mddev, void __user * arg) 3708{ 3709 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 3710 char *ptr, *buf = NULL; 3711 int err = -ENOMEM; 3712 3713 md_allow_write(mddev); 3714 3715 file = kmalloc(sizeof(*file), GFP_KERNEL); 3716 if (!file) 3717 goto out; 3718 3719 /* bitmap disabled, zero the first byte and copy out */ 3720 if (!mddev->bitmap || !mddev->bitmap->file) { 3721 file->pathname[0] = '\0'; 3722 goto copy_out; 3723 } 3724 3725 buf = kmalloc(sizeof(file->pathname), GFP_KERNEL); 3726 if (!buf) 3727 goto out; 3728 3729 ptr = file_path(mddev->bitmap->file, buf, sizeof(file->pathname)); 3730 if (!ptr) 3731 goto out; 3732 3733 strcpy(file->pathname, ptr); 3734 3735copy_out: 3736 err = 0; 3737 if (copy_to_user(arg, file, sizeof(*file))) 3738 err = -EFAULT; 3739out: 3740 kfree(buf); 3741 kfree(file); 3742 return err; 3743} 3744 3745static int get_disk_info(mddev_t * mddev, void __user * arg) 3746{ 3747 mdu_disk_info_t info; 3748 unsigned int nr; 3749 mdk_rdev_t *rdev; 3750 3751 if (copy_from_user(&info, arg, sizeof(info))) 3752 return -EFAULT; 3753 3754 nr = info.number; 3755 3756 rdev = find_rdev_nr(mddev, nr); 3757 if (rdev) { 3758 info.major = MAJOR(rdev->bdev->bd_dev); 3759 info.minor = MINOR(rdev->bdev->bd_dev); 3760 info.raid_disk = rdev->raid_disk; 3761 info.state = 0; 3762 if (test_bit(Faulty, &rdev->flags)) 3763 info.state |= (1<<MD_DISK_FAULTY); 3764 else if (test_bit(In_sync, &rdev->flags)) { 3765 info.state |= (1<<MD_DISK_ACTIVE); 3766 info.state |= (1<<MD_DISK_SYNC); 3767 } 3768 if (test_bit(WriteMostly, &rdev->flags)) 3769 info.state |= (1<<MD_DISK_WRITEMOSTLY); 3770 } else { 3771 info.major = info.minor = 0; 3772 info.raid_disk = -1; 3773 info.state = (1<<MD_DISK_REMOVED); 3774 } 
3775 3776 if (copy_to_user(arg, &info, sizeof(info))) 3777 return -EFAULT; 3778 3779 return 0; 3780} 3781 3782static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) 3783{ 3784 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 3785 mdk_rdev_t *rdev; 3786 dev_t dev = MKDEV(info->major,info->minor); 3787 3788 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 3789 return -EOVERFLOW; 3790 3791 if (!mddev->raid_disks) { 3792 int err; 3793 /* expecting a device which has a superblock */ 3794 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 3795 if (IS_ERR(rdev)) { 3796 printk(KERN_WARNING 3797 "md: md_import_device returned %ld\n", 3798 PTR_ERR(rdev)); 3799 return PTR_ERR(rdev); 3800 } 3801 if (!list_empty(&mddev->disks)) { 3802 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, 3803 mdk_rdev_t, same_set); 3804 int err = super_types[mddev->major_version] 3805 .load_super(rdev, rdev0, mddev->minor_version); 3806 if (err < 0) { 3807 printk(KERN_WARNING 3808 "md: %s has different UUID to %s\n", 3809 bdevname(rdev->bdev,b), 3810 bdevname(rdev0->bdev,b2)); 3811 export_rdev(rdev); 3812 return -EINVAL; 3813 } 3814 } 3815 err = bind_rdev_to_array(rdev, mddev); 3816 if (err) 3817 export_rdev(rdev); 3818 return err; 3819 } 3820 3821 /* 3822 * add_new_disk can be used once the array is assembled 3823 * to add "hot spares". 
They must already have a superblock 3824 * written 3825 */ 3826 if (mddev->pers) { 3827 int err; 3828 if (!mddev->pers->hot_add_disk) { 3829 printk(KERN_WARNING 3830 "%s: personality does not support diskops!\n", 3831 mdname(mddev)); 3832 return -EINVAL; 3833 } 3834 if (mddev->persistent) 3835 rdev = md_import_device(dev, mddev->major_version, 3836 mddev->minor_version); 3837 else 3838 rdev = md_import_device(dev, -1, -1); 3839 if (IS_ERR(rdev)) { 3840 printk(KERN_WARNING 3841 "md: md_import_device returned %ld\n", 3842 PTR_ERR(rdev)); 3843 return PTR_ERR(rdev); 3844 } 3845 /* set save_raid_disk if appropriate */ 3846 if (!mddev->persistent) { 3847 if (info->state & (1<<MD_DISK_SYNC) && 3848 info->raid_disk < mddev->raid_disks) 3849 rdev->raid_disk = info->raid_disk; 3850 else 3851 rdev->raid_disk = -1; 3852 } else 3853 super_types[mddev->major_version]. 3854 validate_super(mddev, rdev); 3855 rdev->saved_raid_disk = rdev->raid_disk; 3856 3857 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 3858 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 3859 set_bit(WriteMostly, &rdev->flags); 3860 3861 rdev->raid_disk = -1; 3862 err = bind_rdev_to_array(rdev, mddev); 3863 if (!err && !mddev->pers->hot_remove_disk) { 3864 /* If there is hot_add_disk but no hot_remove_disk 3865 * then added disks for geometry changes, 3866 * and should be added immediately. 3867 */ 3868 super_types[mddev->major_version]. 
3869 validate_super(mddev, rdev); 3870 err = mddev->pers->hot_add_disk(mddev, rdev); 3871 if (err) 3872 unbind_rdev_from_array(rdev); 3873 } 3874 if (err) 3875 export_rdev(rdev); 3876 3877 md_update_sb(mddev, 1); 3878 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3879 md_wakeup_thread(mddev->thread); 3880 return err; 3881 } 3882 3883 /* otherwise, add_new_disk is only allowed 3884 * for major_version==0 superblocks 3885 */ 3886 if (mddev->major_version != 0) { 3887 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n", 3888 mdname(mddev)); 3889 return -EINVAL; 3890 } 3891 3892 if (!(info->state & (1<<MD_DISK_FAULTY))) { 3893 int err; 3894 rdev = md_import_device (dev, -1, 0); 3895 if (IS_ERR(rdev)) { 3896 printk(KERN_WARNING 3897 "md: error, md_import_device() returned %ld\n", 3898 PTR_ERR(rdev)); 3899 return PTR_ERR(rdev); 3900 } 3901 rdev->desc_nr = info->number; 3902 if (info->raid_disk < mddev->raid_disks) 3903 rdev->raid_disk = info->raid_disk; 3904 else 3905 rdev->raid_disk = -1; 3906 3907 rdev->flags = 0; 3908 3909 if (rdev->raid_disk < mddev->raid_disks) 3910 if (info->state & (1<<MD_DISK_SYNC)) 3911 set_bit(In_sync, &rdev->flags); 3912 3913 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 3914 set_bit(WriteMostly, &rdev->flags); 3915 3916 if (!mddev->persistent) { 3917 printk(KERN_INFO "md: nonpersistent superblock ...\n"); 3918 rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 3919 } else 3920 rdev->sb_offset = calc_dev_sboffset(rdev->bdev); 3921 rdev->size = calc_dev_size(rdev, mddev->chunk_size); 3922 3923 err = bind_rdev_to_array(rdev, mddev); 3924 if (err) { 3925 export_rdev(rdev); 3926 return err; 3927 } 3928 } 3929 3930 return 0; 3931} 3932 3933static int hot_remove_disk(mddev_t * mddev, dev_t dev) 3934{ 3935 char b[BDEVNAME_SIZE]; 3936 mdk_rdev_t *rdev; 3937 3938 if (!mddev->pers) 3939 return -ENODEV; 3940 3941 rdev = find_rdev(mddev, dev); 3942 if (!rdev) 3943 return -ENXIO; 3944 3945 if (rdev->raid_disk >= 0) 3946 goto busy; 3947 
3948 kick_rdev_from_array(rdev); 3949 md_update_sb(mddev, 1); 3950 md_new_event(mddev); 3951 3952 return 0; 3953busy: 3954 printk(KERN_WARNING "md: cannot remove active disk %s from %s ... \n", 3955 bdevname(rdev->bdev,b), mdname(mddev)); 3956 return -EBUSY; 3957} 3958 3959static int hot_add_disk(mddev_t * mddev, dev_t dev) 3960{ 3961 char b[BDEVNAME_SIZE]; 3962 int err; 3963 unsigned int size; 3964 mdk_rdev_t *rdev; 3965 3966 if (!mddev->pers) 3967 return -ENODEV; 3968 3969 if (mddev->major_version != 0) { 3970 printk(KERN_WARNING "%s: HOT_ADD may only be used with" 3971 " version-0 superblocks.\n", 3972 mdname(mddev)); 3973 return -EINVAL; 3974 } 3975 if (!mddev->pers->hot_add_disk) { 3976 printk(KERN_WARNING 3977 "%s: personality does not support diskops!\n", 3978 mdname(mddev)); 3979 return -EINVAL; 3980 } 3981 3982 rdev = md_import_device (dev, -1, 0); 3983 if (IS_ERR(rdev)) { 3984 printk(KERN_WARNING 3985 "md: error, md_import_device() returned %ld\n", 3986 PTR_ERR(rdev)); 3987 return -EINVAL; 3988 } 3989 3990 if (mddev->persistent) 3991 rdev->sb_offset = calc_dev_sboffset(rdev->bdev); 3992 else 3993 rdev->sb_offset = 3994 rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 3995 3996 size = calc_dev_size(rdev, mddev->chunk_size); 3997 rdev->size = size; 3998 3999 if (test_bit(Faulty, &rdev->flags)) { 4000 printk(KERN_WARNING 4001 "md: can not hot-add faulty %s disk to %s!\n", 4002 bdevname(rdev->bdev,b), mdname(mddev)); 4003 err = -EINVAL; 4004 goto abort_export; 4005 } 4006 clear_bit(In_sync, &rdev->flags); 4007 rdev->desc_nr = -1; 4008 rdev->saved_raid_disk = -1; 4009 err = bind_rdev_to_array(rdev, mddev); 4010 if (err) 4011 goto abort_export; 4012 4013 /* 4014 * The rest should better be atomic, we can have disk failures 4015 * noticed in interrupt contexts ... 
4016 */ 4017 4018 if (rdev->desc_nr == mddev->max_disks) { 4019 printk(KERN_WARNING "%s: can not hot-add to full array!\n", 4020 mdname(mddev)); 4021 err = -EBUSY; 4022 goto abort_unbind_export; 4023 } 4024 4025 rdev->raid_disk = -1; 4026 4027 md_update_sb(mddev, 1); 4028 4029 /* 4030 * Kick recovery, maybe this spare has to be added to the 4031 * array immediately. 4032 */ 4033 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4034 md_wakeup_thread(mddev->thread); 4035 md_new_event(mddev); 4036 return 0; 4037 4038abort_unbind_export: 4039 unbind_rdev_from_array(rdev); 4040 4041abort_export: 4042 export_rdev(rdev); 4043 return err; 4044} 4045 4046static int set_bitmap_file(mddev_t *mddev, int fd) 4047{ 4048 int err; 4049 4050 if (mddev->pers) { 4051 if (!mddev->pers->quiesce) 4052 return -EBUSY; 4053 if (mddev->recovery || mddev->sync_thread) 4054 return -EBUSY; 4055 /* we should be able to change the bitmap.. */ 4056 } 4057 4058 4059 if (fd >= 0) { 4060 if (mddev->bitmap) 4061 return -EEXIST; /* cannot add when bitmap is present */ 4062 mddev->bitmap_file = fget(fd); 4063 4064 if (mddev->bitmap_file == NULL) { 4065 printk(KERN_ERR "%s: error: failed to get bitmap file\n", 4066 mdname(mddev)); 4067 return -EBADF; 4068 } 4069 4070 err = deny_bitmap_write_access(mddev->bitmap_file); 4071 if (err) { 4072 printk(KERN_ERR "%s: error: bitmap file is already in use\n", 4073 mdname(mddev)); 4074 fput(mddev->bitmap_file); 4075 mddev->bitmap_file = NULL; 4076 return err; 4077 } 4078 mddev->bitmap_offset = 0; /* file overrides offset */ 4079 } else if (mddev->bitmap == NULL) 4080 return -ENOENT; /* cannot remove what isn't there */ 4081 err = 0; 4082 if (mddev->pers) { 4083 mddev->pers->quiesce(mddev, 1); 4084 if (fd >= 0) 4085 err = bitmap_create(mddev); 4086 if (fd < 0 || err) { 4087 bitmap_destroy(mddev); 4088 fd = -1; /* make sure to put the file */ 4089 } 4090 mddev->pers->quiesce(mddev, 0); 4091 } 4092 if (fd < 0) { 4093 if (mddev->bitmap_file) { 4094 
restore_bitmap_write_access(mddev->bitmap_file); 4095 fput(mddev->bitmap_file); 4096 } 4097 mddev->bitmap_file = NULL; 4098 } 4099 4100 return err; 4101} 4102 4103/* 4104 * set_array_info is used two different ways 4105 * The original usage is when creating a new array. 4106 * In this usage, raid_disks is > 0 and it together with 4107 * level, size, not_persistent,layout,chunksize determine the 4108 * shape of the array. 4109 * This will always create an array with a type-0.90.0 superblock. 4110 * The newer usage is when assembling an array. 4111 * In this case raid_disks will be 0, and the major_version field is 4112 * use to determine which style super-blocks are to be found on the devices. 4113 * The minor and patch _version numbers are also kept incase the 4114 * super_block handler wishes to interpret them. 4115 */ 4116static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) 4117{ 4118 4119 if (info->raid_disks == 0) { 4120 /* just setting version number for superblock loading */ 4121 if (info->major_version < 0 || 4122 info->major_version >= ARRAY_SIZE(super_types) || 4123 super_types[info->major_version].name == NULL) { 4124 /* maybe try to auto-load a module? 
*/ 4125 printk(KERN_INFO 4126 "md: superblock version %d not known\n", 4127 info->major_version); 4128 return -EINVAL; 4129 } 4130 mddev->major_version = info->major_version; 4131 mddev->minor_version = info->minor_version; 4132 mddev->patch_version = info->patch_version; 4133 mddev->persistent = !info->not_persistent; 4134 return 0; 4135 } 4136 mddev->major_version = MD_MAJOR_VERSION; 4137 mddev->minor_version = MD_MINOR_VERSION; 4138 mddev->patch_version = MD_PATCHLEVEL_VERSION; 4139 mddev->ctime = get_seconds(); 4140 4141 mddev->level = info->level; 4142 mddev->clevel[0] = 0; 4143 mddev->size = info->size; 4144 mddev->raid_disks = info->raid_disks; 4145 /* don't set md_minor, it is determined by which /dev/md* was 4146 * openned 4147 */ 4148 if (info->state & (1<<MD_SB_CLEAN)) 4149 mddev->recovery_cp = MaxSector; 4150 else 4151 mddev->recovery_cp = 0; 4152 mddev->persistent = ! info->not_persistent; 4153 4154 mddev->layout = info->layout; 4155 mddev->chunk_size = info->chunk_size; 4156 4157 mddev->max_disks = MD_SB_DISKS; 4158 4159 mddev->flags = 0; 4160 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4161 4162 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 4163 mddev->bitmap_offset = 0; 4164 4165 mddev->reshape_position = MaxSector; 4166 4167 /* 4168 * Generate a 128 bit UUID 4169 */ 4170 get_random_bytes(mddev->uuid, 16); 4171 4172 mddev->new_level = mddev->level; 4173 mddev->new_chunk = mddev->chunk_size; 4174 mddev->new_layout = mddev->layout; 4175 mddev->delta_disks = 0; 4176 4177 return 0; 4178} 4179 4180static int update_size(mddev_t *mddev, unsigned long size) 4181{ 4182 mdk_rdev_t * rdev; 4183 int rv; 4184 struct list_head *tmp; 4185 int fit = (size == 0); 4186 4187 if (mddev->pers->resize == NULL) 4188 return -EINVAL; 4189 /* The "size" is the amount of each device that is used. 4190 * This can only make sense for arrays with redundancy. 
4191 * linear and raid0 always use whatever space is available 4192 * We can only consider changing the size if no resync 4193 * or reconstruction is happening, and if the new size 4194 * is acceptable. It must fit before the sb_offset or, 4195 * if that is <data_offset, it must fit before the 4196 * size of each device. 4197 * If size is zero, we find the largest size that fits. 4198 */ 4199 if (mddev->sync_thread) 4200 return -EBUSY; 4201 ITERATE_RDEV(mddev,rdev,tmp) { 4202 sector_t avail; 4203 avail = rdev->size * 2; 4204 4205 if (fit && (size == 0 || size > avail/2)) 4206 size = avail/2; 4207 if (avail < ((sector_t)size << 1)) 4208 return -ENOSPC; 4209 } 4210 rv = mddev->pers->resize(mddev, (sector_t)size *2); 4211 if (!rv) { 4212 struct block_device *bdev; 4213 4214 bdev = bdget_disk(mddev->gendisk, 0); 4215 if (bdev) { 4216 mutex_lock(&bdev->bd_inode->i_mutex); 4217 i_size_write(bdev->bd_inode, (loff_t)mddev->array_size << 10); 4218 mutex_unlock(&bdev->bd_inode->i_mutex); 4219 bdput(bdev); 4220 } 4221 } 4222 return rv; 4223} 4224 4225static int update_raid_disks(mddev_t *mddev, int raid_disks) 4226{ 4227 int rv; 4228 /* change the number of raid disks */ 4229 if (mddev->pers->check_reshape == NULL) 4230 return -EINVAL; 4231 if (raid_disks <= 0 || 4232 raid_disks >= mddev->max_disks) 4233 return -EINVAL; 4234 if (mddev->sync_thread || mddev->reshape_position != MaxSector) 4235 return -EBUSY; 4236 mddev->delta_disks = raid_disks - mddev->raid_disks; 4237 4238 rv = mddev->pers->check_reshape(mddev); 4239 return rv; 4240} 4241 4242 4243/* 4244 * update_array_info is used to change the configuration of an 4245 * on-line array. 4246 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 4247 * fields in the info are checked against the array. 4248 * Any differences that cannot be handled will cause an error. 4249 * Normally, only one change can be managed at a time. 
4250 */ 4251static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) 4252{ 4253 int rv = 0; 4254 int cnt = 0; 4255 int state = 0; 4256 4257 /* calculate expected state,ignoring low bits */ 4258 if (mddev->bitmap && mddev->bitmap_offset) 4259 state |= (1 << MD_SB_BITMAP_PRESENT); 4260 4261 if (mddev->major_version != info->major_version || 4262 mddev->minor_version != info->minor_version || 4263/* mddev->patch_version != info->patch_version || */ 4264 mddev->ctime != info->ctime || 4265 mddev->level != info->level || 4266/* mddev->layout != info->layout || */ 4267 !mddev->persistent != info->not_persistent|| 4268 mddev->chunk_size != info->chunk_size || 4269 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 4270 ((state^info->state) & 0xfffffe00) 4271 ) 4272 return -EINVAL; 4273 /* Check there is only one change */ 4274 if (info->size >= 0 && mddev->size != info->size) cnt++; 4275 if (mddev->raid_disks != info->raid_disks) cnt++; 4276 if (mddev->layout != info->layout) cnt++; 4277 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++; 4278 if (cnt == 0) return 0; 4279 if (cnt > 1) return -EINVAL; 4280 4281 if (mddev->layout != info->layout) { 4282 /* Change layout 4283 * we don't need to do anything at the md level, the 4284 * personality will take care of it all. 
4285 */ 4286 if (mddev->pers->reconfig == NULL) 4287 return -EINVAL; 4288 else 4289 return mddev->pers->reconfig(mddev, info->layout, -1); 4290 } 4291 if (info->size >= 0 && mddev->size != info->size) 4292 rv = update_size(mddev, info->size); 4293 4294 if (mddev->raid_disks != info->raid_disks) 4295 rv = update_raid_disks(mddev, info->raid_disks); 4296 4297 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 4298 if (mddev->pers->quiesce == NULL) 4299 return -EINVAL; 4300 if (mddev->recovery || mddev->sync_thread) 4301 return -EBUSY; 4302 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 4303 /* add the bitmap */ 4304 if (mddev->bitmap) 4305 return -EEXIST; 4306 if (mddev->default_bitmap_offset == 0) 4307 return -EINVAL; 4308 mddev->bitmap_offset = mddev->default_bitmap_offset; 4309 mddev->pers->quiesce(mddev, 1); 4310 rv = bitmap_create(mddev); 4311 if (rv) 4312 bitmap_destroy(mddev); 4313 mddev->pers->quiesce(mddev, 0); 4314 } else { 4315 /* remove the bitmap */ 4316 if (!mddev->bitmap) 4317 return -ENOENT; 4318 if (mddev->bitmap->file) 4319 return -EINVAL; 4320 mddev->pers->quiesce(mddev, 1); 4321 bitmap_destroy(mddev); 4322 mddev->pers->quiesce(mddev, 0); 4323 mddev->bitmap_offset = 0; 4324 } 4325 } 4326 md_update_sb(mddev, 1); 4327 return rv; 4328} 4329 4330static int set_disk_faulty(mddev_t *mddev, dev_t dev) 4331{ 4332 mdk_rdev_t *rdev; 4333 4334 if (mddev->pers == NULL) 4335 return -ENODEV; 4336 4337 rdev = find_rdev(mddev, dev); 4338 if (!rdev) 4339 return -ENODEV; 4340 4341 md_error(mddev, rdev); 4342 return 0; 4343} 4344 4345static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) 4346{ 4347 mddev_t *mddev = bdev->bd_disk->private_data; 4348 4349 geo->heads = 2; 4350 geo->sectors = 4; 4351 geo->cylinders = get_capacity(mddev->gendisk) / 8; 4352 return 0; 4353} 4354 4355static int md_ioctl(struct inode *inode, struct file *file, 4356 unsigned int cmd, unsigned long arg) 4357{ 4358 int err = 0; 4359 void __user *argp = (void __user *)arg; 
4360 mddev_t *mddev = NULL; 4361 4362 if (!capable(CAP_SYS_ADMIN)) 4363 return -EACCES; 4364 4365 /* 4366 * Commands dealing with the RAID driver but not any 4367 * particular array: 4368 */ 4369 switch (cmd) 4370 { 4371 case RAID_VERSION: 4372 err = get_version(argp); 4373 goto done; 4374 4375 case PRINT_RAID_DEBUG: 4376 err = 0; 4377 md_print_devices(); 4378 goto done; 4379 4380#ifndef MODULE 4381 case RAID_AUTORUN: 4382 err = 0; 4383 autostart_arrays(arg); 4384 goto done; 4385#endif 4386 default:; 4387 } 4388 4389 /* 4390 * Commands creating/starting a new array: 4391 */ 4392 4393 mddev = inode->i_bdev->bd_disk->private_data; 4394 4395 if (!mddev) { 4396 BUG(); 4397 goto abort; 4398 } 4399 4400 err = mddev_lock(mddev); 4401 if (err) { 4402 printk(KERN_INFO 4403 "md: ioctl lock interrupted, reason %d, cmd %d\n", 4404 err, cmd); 4405 goto abort; 4406 } 4407 4408 switch (cmd) 4409 { 4410 case SET_ARRAY_INFO: 4411 { 4412 mdu_array_info_t info; 4413 if (!arg) 4414 memset(&info, 0, sizeof(info)); 4415 else if (copy_from_user(&info, argp, sizeof(info))) { 4416 err = -EFAULT; 4417 goto abort_unlock; 4418 } 4419 if (mddev->pers) { 4420 err = update_array_info(mddev, &info); 4421 if (err) { 4422 printk(KERN_WARNING "md: couldn't update" 4423 " array info. %d\n", err); 4424 goto abort_unlock; 4425 } 4426 goto done_unlock; 4427 } 4428 if (!list_empty(&mddev->disks)) { 4429 printk(KERN_WARNING 4430 "md: array %s already has disks!\n", 4431 mdname(mddev)); 4432 err = -EBUSY; 4433 goto abort_unlock; 4434 } 4435 if (mddev->raid_disks) { 4436 printk(KERN_WARNING 4437 "md: array %s already initialised!\n", 4438 mdname(mddev)); 4439 err = -EBUSY; 4440 goto abort_unlock; 4441 } 4442 err = set_array_info(mddev, &info); 4443 if (err) { 4444 printk(KERN_WARNING "md: couldn't set" 4445 " array info. 
%d\n", err); 4446 goto abort_unlock; 4447 } 4448 } 4449 goto done_unlock; 4450 4451 default:; 4452 } 4453 4454 /* 4455 * Commands querying/configuring an existing array: 4456 */ 4457 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 4458 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ 4459 if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 4460 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE 4461 && cmd != GET_BITMAP_FILE) { 4462 err = -ENODEV; 4463 goto abort_unlock; 4464 } 4465 4466 /* 4467 * Commands even a read-only array can execute: 4468 */ 4469 switch (cmd) 4470 { 4471 case GET_ARRAY_INFO: 4472 err = get_array_info(mddev, argp); 4473 goto done_unlock; 4474 4475 case GET_BITMAP_FILE: 4476 err = get_bitmap_file(mddev, argp); 4477 goto done_unlock; 4478 4479 case GET_DISK_INFO: 4480 err = get_disk_info(mddev, argp); 4481 goto done_unlock; 4482 4483 case RESTART_ARRAY_RW: 4484 err = restart_array(mddev); 4485 goto done_unlock; 4486 4487 case STOP_ARRAY: 4488 err = do_md_stop (mddev, 0); 4489 goto done_unlock; 4490 4491 case STOP_ARRAY_RO: 4492 err = do_md_stop (mddev, 1); 4493 goto done_unlock; 4494 4495 /* 4496 * We have a problem here : there is no easy way to give a CHS 4497 * virtual geometry. We currently pretend that we have a 2 heads 4498 * 4 sectors (with a BIG number of cylinders...). This drives 4499 * dosfs just mad... ;-) 4500 */ 4501 } 4502 4503 /* 4504 * The remaining ioctls are changing the state of the 4505 * superblock, so we do not allow them on read-only arrays. 4506 * However non-MD ioctls (e.g. get-size) will still come through 4507 * here and hit the 'default' below, so only disallow 4508 * 'md' ioctls, and switch to rw mode if started auto-readonly. 
4509 */ 4510 if (_IOC_TYPE(cmd) == MD_MAJOR && 4511 mddev->ro && mddev->pers) { 4512 if (mddev->ro == 2) { 4513 mddev->ro = 0; 4514 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4515 md_wakeup_thread(mddev->thread); 4516 4517 } else { 4518 err = -EROFS; 4519 goto abort_unlock; 4520 } 4521 } 4522 4523 switch (cmd) 4524 { 4525 case ADD_NEW_DISK: 4526 { 4527 mdu_disk_info_t info; 4528 if (copy_from_user(&info, argp, sizeof(info))) 4529 err = -EFAULT; 4530 else 4531 err = add_new_disk(mddev, &info); 4532 goto done_unlock; 4533 } 4534 4535 case HOT_REMOVE_DISK: 4536 err = hot_remove_disk(mddev, new_decode_dev(arg)); 4537 goto done_unlock; 4538 4539 case HOT_ADD_DISK: 4540 err = hot_add_disk(mddev, new_decode_dev(arg)); 4541 goto done_unlock; 4542 4543 case SET_DISK_FAULTY: 4544 err = set_disk_faulty(mddev, new_decode_dev(arg)); 4545 goto done_unlock; 4546 4547 case RUN_ARRAY: 4548 err = do_md_run (mddev); 4549 goto done_unlock; 4550 4551 case SET_BITMAP_FILE: 4552 err = set_bitmap_file(mddev, (int)arg); 4553 goto done_unlock; 4554 4555 default: 4556 err = -EINVAL; 4557 goto abort_unlock; 4558 } 4559 4560done_unlock: 4561abort_unlock: 4562 mddev_unlock(mddev); 4563 4564 return err; 4565done: 4566 if (err) 4567 MD_BUG(); 4568abort: 4569 return err; 4570} 4571 4572static int md_open(struct inode *inode, struct file *file) 4573{ 4574 /* 4575 * Succeed if we can lock the mddev, which confirms that 4576 * it isn't being stopped right now. 
4577 */ 4578 mddev_t *mddev = inode->i_bdev->bd_disk->private_data; 4579 int err; 4580 4581 if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1))) 4582 goto out; 4583 4584 err = 0; 4585 mddev_get(mddev); 4586 mddev_unlock(mddev); 4587 4588 check_disk_change(inode->i_bdev); 4589 out: 4590 return err; 4591} 4592 4593static int md_release(struct inode *inode, struct file * file) 4594{ 4595 mddev_t *mddev = inode->i_bdev->bd_disk->private_data; 4596 4597 BUG_ON(!mddev); 4598 mddev_put(mddev); 4599 4600 return 0; 4601} 4602 4603static int md_media_changed(struct gendisk *disk) 4604{ 4605 mddev_t *mddev = disk->private_data; 4606 4607 return mddev->changed; 4608} 4609 4610static int md_revalidate(struct gendisk *disk) 4611{ 4612 mddev_t *mddev = disk->private_data; 4613 4614 mddev->changed = 0; 4615 return 0; 4616} 4617static struct block_device_operations md_fops = 4618{ 4619 .owner = THIS_MODULE, 4620 .open = md_open, 4621 .release = md_release, 4622 .ioctl = md_ioctl, 4623 .getgeo = md_getgeo, 4624 .media_changed = md_media_changed, 4625 .revalidate_disk= md_revalidate, 4626}; 4627 4628static int md_thread(void * arg) 4629{ 4630 mdk_thread_t *thread = arg; 4631 4632 /* 4633 * md_thread is a 'system-thread', it's priority should be very 4634 * high. We avoid resource deadlocks individually in each 4635 * raid personality. (RAID5 does preallocation) We also use RR and 4636 * the very same RT priority as kswapd, thus we will never get 4637 * into a priority inversion deadlock. 4638 * 4639 * we definitely have to have equal or higher priority than 4640 * bdflush, otherwise bdflush will deadlock if there are too 4641 * many dirty RAID5 blocks. 4642 */ 4643 4644 current->flags |= PF_NOFREEZE; 4645 allow_signal(SIGKILL); 4646 while (!kthread_should_stop()) { 4647 4648 /* We need to wait INTERRUPTIBLE so that 4649 * we don't add to the load-average. 
4650 * That means we need to be sure no signals are 4651 * pending 4652 */ 4653 if (signal_pending(current)) 4654 flush_signals(current); 4655 4656 wait_event_interruptible_timeout 4657 (thread->wqueue, 4658 test_bit(THREAD_WAKEUP, &thread->flags) 4659 || kthread_should_stop(), 4660 thread->timeout); 4661 4662 clear_bit(THREAD_WAKEUP, &thread->flags); 4663 4664 thread->run(thread->mddev); 4665 } 4666 4667 return 0; 4668} 4669 4670void md_wakeup_thread(mdk_thread_t *thread) 4671{ 4672 if (thread) { 4673 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm); 4674 set_bit(THREAD_WAKEUP, &thread->flags); 4675 wake_up(&thread->wqueue); 4676 } 4677} 4678 4679mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, 4680 const char *name) 4681{ 4682 mdk_thread_t *thread; 4683 4684 thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL); 4685 if (!thread) 4686 return NULL; 4687 4688 init_waitqueue_head(&thread->wqueue); 4689 4690 thread->run = run; 4691 thread->mddev = mddev; 4692 thread->timeout = MAX_SCHEDULE_TIMEOUT; 4693 thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev)); 4694 if (IS_ERR(thread->tsk)) { 4695 kfree(thread); 4696 return NULL; 4697 } 4698 return thread; 4699} 4700 4701void md_unregister_thread(mdk_thread_t *thread) 4702{ 4703 dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid); 4704 4705 kthread_stop(thread->tsk); 4706 kfree(thread); 4707} 4708 4709void md_error(mddev_t *mddev, mdk_rdev_t *rdev) 4710{ 4711 if (!mddev) { 4712 MD_BUG(); 4713 return; 4714 } 4715 4716 if (!rdev || test_bit(Faulty, &rdev->flags)) 4717 return; 4718/* 4719 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", 4720 mdname(mddev), 4721 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), 4722 __builtin_return_address(0),__builtin_return_address(1), 4723 __builtin_return_address(2),__builtin_return_address(3)); 4724*/ 4725 if (!mddev->pers) 4726 return; 4727 if (!mddev->pers->error_handler) 4728 return; 4729 
mddev->pers->error_handler(mddev,rdev); 4730 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4731 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4732 md_wakeup_thread(mddev->thread); 4733 md_new_event_inintr(mddev); 4734} 4735 4736/* seq_file implementation /proc/mdstat */ 4737 4738static void status_unused(struct seq_file *seq) 4739{ 4740 int i = 0; 4741 mdk_rdev_t *rdev; 4742 struct list_head *tmp; 4743 4744 seq_printf(seq, "unused devices: "); 4745 4746 ITERATE_RDEV_PENDING(rdev,tmp) { 4747 char b[BDEVNAME_SIZE]; 4748 i++; 4749 seq_printf(seq, "%s ", 4750 bdevname(rdev->bdev,b)); 4751 } 4752 if (!i) 4753 seq_printf(seq, "<none>"); 4754 4755 seq_printf(seq, "\n"); 4756} 4757 4758 4759static void status_resync(struct seq_file *seq, mddev_t * mddev) 4760{ 4761 sector_t max_blocks, resync, res; 4762 unsigned long dt, db, rt; 4763 int scale; 4764 unsigned int per_milli; 4765 4766 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; 4767 4768 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 4769 max_blocks = mddev->resync_max_sectors >> 1; 4770 else 4771 max_blocks = mddev->size; 4772 4773 /* 4774 * Should not happen. 4775 */ 4776 if (!max_blocks) { 4777 MD_BUG(); 4778 return; 4779 } 4780 /* Pick 'scale' such that (resync>>scale)*1000 will fit 4781 * in a sector_t, and (max_blocks>>scale) will fit in a 4782 * u32, as those are the requirements for sector_div. 
4783 * Thus 'scale' must be at least 10 4784 */ 4785 scale = 10; 4786 if (sizeof(sector_t) > sizeof(unsigned long)) { 4787 while ( max_blocks/2 > (1ULL<<(scale+32))) 4788 scale++; 4789 } 4790 res = (resync>>scale)*1000; 4791 sector_div(res, (u32)((max_blocks>>scale)+1)); 4792 4793 per_milli = res; 4794 { 4795 int i, x = per_milli/50, y = 20-x; 4796 seq_printf(seq, "["); 4797 for (i = 0; i < x; i++) 4798 seq_printf(seq, "="); 4799 seq_printf(seq, ">"); 4800 for (i = 0; i < y; i++) 4801 seq_printf(seq, "."); 4802 seq_printf(seq, "] "); 4803 } 4804 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", 4805 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? 4806 "reshape" : 4807 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? 4808 "check" : 4809 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 4810 "resync" : "recovery"))), 4811 per_milli/10, per_milli % 10, 4812 (unsigned long long) resync, 4813 (unsigned long long) max_blocks); 4814 4815 /* 4816 * We do not want to overflow, so the order of operands and 4817 * the * 100 / 100 trick are important. We do a +1 to be 4818 * safe against division by zero. We only estimate anyway. 
4819 * 4820 * dt: time from mark until now 4821 * db: blocks written from mark until now 4822 * rt: remaining time 4823 */ 4824 dt = ((jiffies - mddev->resync_mark) / HZ); 4825 if (!dt) dt++; 4826 db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)) 4827 - mddev->resync_mark_cnt; 4828 rt = (dt * ((unsigned long)(max_blocks-resync) / (db/2/100+1)))/100; 4829 4830 seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); 4831 4832 seq_printf(seq, " speed=%ldK/sec", db/2/dt); 4833} 4834 4835static void *md_seq_start(struct seq_file *seq, loff_t *pos) 4836{ 4837 struct list_head *tmp; 4838 loff_t l = *pos; 4839 mddev_t *mddev; 4840 4841 if (l >= 0x10000) 4842 return NULL; 4843 if (!l--) 4844 /* header */ 4845 return (void*)1; 4846 4847 spin_lock(&all_mddevs_lock); 4848 list_for_each(tmp,&all_mddevs) 4849 if (!l--) { 4850 mddev = list_entry(tmp, mddev_t, all_mddevs); 4851 mddev_get(mddev); 4852 spin_unlock(&all_mddevs_lock); 4853 return mddev; 4854 } 4855 spin_unlock(&all_mddevs_lock); 4856 if (!l--) 4857 return (void*)2;/* tail */ 4858 return NULL; 4859} 4860 4861static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 4862{ 4863 struct list_head *tmp; 4864 mddev_t *next_mddev, *mddev = v; 4865 4866 ++*pos; 4867 if (v == (void*)2) 4868 return NULL; 4869 4870 spin_lock(&all_mddevs_lock); 4871 if (v == (void*)1) 4872 tmp = all_mddevs.next; 4873 else 4874 tmp = mddev->all_mddevs.next; 4875 if (tmp != &all_mddevs) 4876 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs)); 4877 else { 4878 next_mddev = (void*)2; 4879 *pos = 0x10000; 4880 } 4881 spin_unlock(&all_mddevs_lock); 4882 4883 if (v != (void*)1) 4884 mddev_put(mddev); 4885 return next_mddev; 4886 4887} 4888 4889static void md_seq_stop(struct seq_file *seq, void *v) 4890{ 4891 mddev_t *mddev = v; 4892 4893 if (mddev && v != (void*)1 && v != (void*)2) 4894 mddev_put(mddev); 4895} 4896 4897struct mdstat_info { 4898 int event; 4899}; 4900 4901static int md_seq_show(struct seq_file 
*seq, void *v) 4902{ 4903 mddev_t *mddev = v; 4904 sector_t size; 4905 struct list_head *tmp2; 4906 mdk_rdev_t *rdev; 4907 struct mdstat_info *mi = seq->private; 4908 struct bitmap *bitmap; 4909 4910 if (v == (void*)1) { 4911 struct mdk_personality *pers; 4912 seq_printf(seq, "Personalities : "); 4913 spin_lock(&pers_lock); 4914 list_for_each_entry(pers, &pers_list, list) 4915 seq_printf(seq, "[%s] ", pers->name); 4916 4917 spin_unlock(&pers_lock); 4918 seq_printf(seq, "\n"); 4919 mi->event = atomic_read(&md_event_count); 4920 return 0; 4921 } 4922 if (v == (void*)2) { 4923 status_unused(seq); 4924 return 0; 4925 } 4926 4927 if (mddev_lock(mddev) < 0) 4928 return -EINTR; 4929 4930 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 4931 seq_printf(seq, "%s : %sactive", mdname(mddev), 4932 mddev->pers ? "" : "in"); 4933 if (mddev->pers) { 4934 if (mddev->ro==1) 4935 seq_printf(seq, " (read-only)"); 4936 if (mddev->ro==2) 4937 seq_printf(seq, "(auto-read-only)"); 4938 seq_printf(seq, " %s", mddev->pers->name); 4939 } 4940 4941 size = 0; 4942 ITERATE_RDEV(mddev,rdev,tmp2) { 4943 char b[BDEVNAME_SIZE]; 4944 seq_printf(seq, " %s[%d]", 4945 bdevname(rdev->bdev,b), rdev->desc_nr); 4946 if (test_bit(WriteMostly, &rdev->flags)) 4947 seq_printf(seq, "(W)"); 4948 if (test_bit(Faulty, &rdev->flags)) { 4949 seq_printf(seq, "(F)"); 4950 continue; 4951 } else if (rdev->raid_disk < 0) 4952 seq_printf(seq, "(S)"); /* spare */ 4953 size += rdev->size; 4954 } 4955 4956 if (!list_empty(&mddev->disks)) { 4957 if (mddev->pers) 4958 seq_printf(seq, "\n %llu blocks", 4959 (unsigned long long)mddev->array_size); 4960 else 4961 seq_printf(seq, "\n %llu blocks", 4962 (unsigned long long)size); 4963 } 4964 if (mddev->persistent) { 4965 if (mddev->major_version != 0 || 4966 mddev->minor_version != 90) { 4967 seq_printf(seq," super %d.%d", 4968 mddev->major_version, 4969 mddev->minor_version); 4970 } 4971 } else 4972 seq_printf(seq, " super non-persistent"); 4973 4974 if 
(mddev->pers) { 4975 mddev->pers->status (seq, mddev); 4976 seq_printf(seq, "\n "); 4977 if (mddev->pers->sync_request) { 4978 if (mddev->curr_resync > 2) { 4979 status_resync (seq, mddev); 4980 seq_printf(seq, "\n "); 4981 } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) 4982 seq_printf(seq, "\tresync=DELAYED\n "); 4983 else if (mddev->recovery_cp < MaxSector) 4984 seq_printf(seq, "\tresync=PENDING\n "); 4985 } 4986 } else 4987 seq_printf(seq, "\n "); 4988 4989 if ((bitmap = mddev->bitmap)) { 4990 unsigned long chunk_kb; 4991 unsigned long flags; 4992 spin_lock_irqsave(&bitmap->lock, flags); 4993 chunk_kb = bitmap->chunksize >> 10; 4994 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " 4995 "%lu%s chunk", 4996 bitmap->pages - bitmap->missing_pages, 4997 bitmap->pages, 4998 (bitmap->pages - bitmap->missing_pages) 4999 << (PAGE_SHIFT - 10), 5000 chunk_kb ? chunk_kb : bitmap->chunksize, 5001 chunk_kb ? "KB" : "B"); 5002 if (bitmap->file) { 5003 seq_printf(seq, ", file: "); 5004 seq_path(seq, bitmap->file->f_path.mnt, 5005 bitmap->file->f_path.dentry," \t\n"); 5006 } 5007 5008 seq_printf(seq, "\n"); 5009 spin_unlock_irqrestore(&bitmap->lock, flags); 5010 } 5011 5012 seq_printf(seq, "\n"); 5013 } 5014 mddev_unlock(mddev); 5015 5016 return 0; 5017} 5018 5019static struct seq_operations md_seq_ops = { 5020 .start = md_seq_start, 5021 .next = md_seq_next, 5022 .stop = md_seq_stop, 5023 .show = md_seq_show, 5024}; 5025 5026static int md_seq_open(struct inode *inode, struct file *file) 5027{ 5028 int error; 5029 struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL); 5030 if (mi == NULL) 5031 return -ENOMEM; 5032 5033 error = seq_open(file, &md_seq_ops); 5034 if (error) 5035 kfree(mi); 5036 else { 5037 struct seq_file *p = file->private_data; 5038 p->private = mi; 5039 mi->event = atomic_read(&md_event_count); 5040 } 5041 return error; 5042} 5043 5044static unsigned int mdstat_poll(struct file *filp, poll_table *wait) 5045{ 5046 struct seq_file *m = 
filp->private_data; 5047 struct mdstat_info *mi = m->private; 5048 int mask; 5049 5050 poll_wait(filp, &md_event_waiters, wait); 5051 5052 /* always allow read */ 5053 mask = POLLIN | POLLRDNORM; 5054 5055 if (mi->event != atomic_read(&md_event_count)) 5056 mask |= POLLERR | POLLPRI; 5057 return mask; 5058} 5059 5060static const struct file_operations md_seq_fops = { 5061 .owner = THIS_MODULE, 5062 .open = md_seq_open, 5063 .read = seq_read, 5064 .llseek = seq_lseek, 5065 .release = seq_release_private, 5066 .poll = mdstat_poll, 5067}; 5068 5069int register_md_personality(struct mdk_personality *p) 5070{ 5071 spin_lock(&pers_lock); 5072 list_add_tail(&p->list, &pers_list); 5073 printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level); 5074 spin_unlock(&pers_lock); 5075 return 0; 5076} 5077 5078int unregister_md_personality(struct mdk_personality *p) 5079{ 5080 printk(KERN_INFO "md: %s personality unregistered\n", p->name); 5081 spin_lock(&pers_lock); 5082 list_del_init(&p->list); 5083 spin_unlock(&pers_lock); 5084 return 0; 5085} 5086 5087static int is_mddev_idle(mddev_t *mddev) 5088{ 5089 mdk_rdev_t * rdev; 5090 struct list_head *tmp; 5091 int idle; 5092 unsigned long curr_events; 5093 5094 idle = 1; 5095 ITERATE_RDEV(mddev,rdev,tmp) { 5096 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; 5097 curr_events = disk_stat_read(disk, sectors[0]) + 5098 disk_stat_read(disk, sectors[1]) - 5099 atomic_read(&disk->sync_io); 5100 /* The difference between curr_events and last_events 5101 * will be affected by any new non-sync IO (making 5102 * curr_events bigger) and any difference in the amount of 5103 * in-flight syncio (making current_events bigger or smaller) 5104 * The amount in-flight is currently limited to 5105 * 32*64K in raid1/10 and 256*PAGE_SIZE in raid5/6 5106 * which is at most 4096 sectors. 
5107 * These numbers are fairly fragile and should be made 5108 * more robust, probably by enforcing the 5109 * 'window size' that md_do_sync sort-of uses. 5110 * 5111 * Note: the following is an unsigned comparison. 5112 */ 5113 if ((long)curr_events - (long)rdev->last_events > 4096) { 5114 rdev->last_events = curr_events; 5115 idle = 0; 5116 } 5117 } 5118 return idle; 5119} 5120 5121void md_done_sync(mddev_t *mddev, int blocks, int ok) 5122{ 5123 /* another "blocks" (512byte) blocks have been synced */ 5124 atomic_sub(blocks, &mddev->recovery_active); 5125 wake_up(&mddev->recovery_wait); 5126 if (!ok) { 5127 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 5128 md_wakeup_thread(mddev->thread); 5129 // stop recovery, signal do_sync .... 5130 } 5131} 5132 5133 5134/* md_write_start(mddev, bi) 5135 * If we need to update some array metadata (e.g. 'active' flag 5136 * in superblock) before writing, schedule a superblock update 5137 * and wait for it to complete. 5138 */ 5139void md_write_start(mddev_t *mddev, struct bio *bi) 5140{ 5141 if (bio_data_dir(bi) != WRITE) 5142 return; 5143 5144 BUG_ON(mddev->ro == 1); 5145 if (mddev->ro == 2) { 5146 /* need to switch to read/write */ 5147 mddev->ro = 0; 5148 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5149 md_wakeup_thread(mddev->thread); 5150 } 5151 atomic_inc(&mddev->writes_pending); 5152 if (mddev->in_sync) { 5153 spin_lock_irq(&mddev->write_lock); 5154 if (mddev->in_sync) { 5155 mddev->in_sync = 0; 5156 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 5157 md_wakeup_thread(mddev->thread); 5158 } 5159 spin_unlock_irq(&mddev->write_lock); 5160 } 5161 wait_event(mddev->sb_wait, mddev->flags==0); 5162} 5163 5164void md_write_end(mddev_t *mddev) 5165{ 5166 if (atomic_dec_and_test(&mddev->writes_pending)) { 5167 if (mddev->safemode == 2) 5168 md_wakeup_thread(mddev->thread); 5169 else if (mddev->safemode_delay) 5170 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); 5171 } 5172} 5173 5174/* md_allow_write(mddev) 5175 
* Calling this ensures that the array is marked 'active' so that writes 5176 * may proceed without blocking. It is important to call this before 5177 * attempting a GFP_KERNEL allocation while holding the mddev lock. 5178 * Must be called with mddev_lock held. 5179 */ 5180void md_allow_write(mddev_t *mddev) 5181{ 5182 if (!mddev->pers) 5183 return; 5184 if (mddev->ro) 5185 return; 5186 5187 spin_lock_irq(&mddev->write_lock); 5188 if (mddev->in_sync) { 5189 mddev->in_sync = 0; 5190 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 5191 if (mddev->safemode_delay && 5192 mddev->safemode == 0) 5193 mddev->safemode = 1; 5194 spin_unlock_irq(&mddev->write_lock); 5195 md_update_sb(mddev, 0); 5196 } else 5197 spin_unlock_irq(&mddev->write_lock); 5198} 5199EXPORT_SYMBOL_GPL(md_allow_write); 5200 5201static DECLARE_WAIT_QUEUE_HEAD(resync_wait); 5202 5203#define SYNC_MARKS 10 5204#define SYNC_MARK_STEP (3*HZ) 5205void md_do_sync(mddev_t *mddev) 5206{ 5207 mddev_t *mddev2; 5208 unsigned int currspeed = 0, 5209 window; 5210 sector_t max_sectors,j, io_sectors; 5211 unsigned long mark[SYNC_MARKS]; 5212 sector_t mark_cnt[SYNC_MARKS]; 5213 int last_mark,m; 5214 struct list_head *tmp; 5215 sector_t last_check; 5216 int skipped = 0; 5217 struct list_head *rtmp; 5218 mdk_rdev_t *rdev; 5219 char *desc; 5220 5221 /* just incase thread restarts... */ 5222 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 5223 return; 5224 if (mddev->ro) /* never try to sync a read-only array */ 5225 return; 5226 5227 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 5228 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 5229 desc = "data-check"; 5230 else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 5231 desc = "requested-resync"; 5232 else 5233 desc = "resync"; 5234 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 5235 desc = "reshape"; 5236 else 5237 desc = "recovery"; 5238 5239 /* we overload curr_resync somewhat here. 
5240 * 0 == not engaged in resync at all 5241 * 2 == checking that there is no conflict with another sync 5242 * 1 == like 2, but have yielded to allow conflicting resync to 5243 * commense 5244 * other == active in resync - this many blocks 5245 * 5246 * Before starting a resync we must have set curr_resync to 5247 * 2, and then checked that every "conflicting" array has curr_resync 5248 * less than ours. When we find one that is the same or higher 5249 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync 5250 * to 1 if we choose to yield (based arbitrarily on address of mddev structure). 5251 * This will mean we have to start checking from the beginning again. 5252 * 5253 */ 5254 5255 do { 5256 mddev->curr_resync = 2; 5257 5258 try_again: 5259 if (kthread_should_stop()) { 5260 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5261 goto skip; 5262 } 5263 ITERATE_MDDEV(mddev2,tmp) { 5264 if (mddev2 == mddev) 5265 continue; 5266 if (mddev2->curr_resync && 5267 match_mddev_units(mddev,mddev2)) { 5268 DEFINE_WAIT(wq); 5269 if (mddev < mddev2 && mddev->curr_resync == 2) { 5270 /* arbitrarily yield */ 5271 mddev->curr_resync = 1; 5272 wake_up(&resync_wait); 5273 } 5274 if (mddev > mddev2 && mddev->curr_resync == 1) 5275 /* no need to wait here, we can wait the next 5276 * time 'round when curr_resync == 2 5277 */ 5278 continue; 5279 prepare_to_wait(&resync_wait, &wq, TASK_UNINTERRUPTIBLE); 5280 if (!kthread_should_stop() && 5281 mddev2->curr_resync >= mddev->curr_resync) { 5282 printk(KERN_INFO "md: delaying %s of %s" 5283 " until %s has finished (they" 5284 " share one or more physical units)\n", 5285 desc, mdname(mddev), mdname(mddev2)); 5286 mddev_put(mddev2); 5287 schedule(); 5288 finish_wait(&resync_wait, &wq); 5289 goto try_again; 5290 } 5291 finish_wait(&resync_wait, &wq); 5292 } 5293 } 5294 } while (mddev->curr_resync < 2); 5295 5296 j = 0; 5297 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 5298 /* resync follows the size requested by the 
personality, 5299 * which defaults to physical size, but can be virtual size 5300 */ 5301 max_sectors = mddev->resync_max_sectors; 5302 mddev->resync_mismatches = 0; 5303 /* we don't use the checkpoint if there's a bitmap */ 5304 if (!mddev->bitmap && 5305 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 5306 j = mddev->recovery_cp; 5307 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 5308 max_sectors = mddev->size << 1; 5309 else { 5310 /* recovery follows the physical size of devices */ 5311 max_sectors = mddev->size << 1; 5312 j = MaxSector; 5313 ITERATE_RDEV(mddev,rdev,rtmp) 5314 if (rdev->raid_disk >= 0 && 5315 !test_bit(Faulty, &rdev->flags) && 5316 !test_bit(In_sync, &rdev->flags) && 5317 rdev->recovery_offset < j) 5318 j = rdev->recovery_offset; 5319 } 5320 5321 printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); 5322 printk(KERN_INFO "md: minimum _guaranteed_ speed:" 5323 " %d KB/sec/disk.\n", speed_min(mddev)); 5324 printk(KERN_INFO "md: using maximum available idle IO bandwidth " 5325 "(but not more than %d KB/sec) for %s.\n", 5326 speed_max(mddev), desc); 5327 5328 is_mddev_idle(mddev); /* this also initializes IO event counters */ 5329 5330 io_sectors = 0; 5331 for (m = 0; m < SYNC_MARKS; m++) { 5332 mark[m] = jiffies; 5333 mark_cnt[m] = io_sectors; 5334 } 5335 last_mark = 0; 5336 mddev->resync_mark = mark[last_mark]; 5337 mddev->resync_mark_cnt = mark_cnt[last_mark]; 5338 5339 /* 5340 * Tune reconstruction: 5341 */ 5342 window = 32*(PAGE_SIZE/512); 5343 printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n", 5344 window/2,(unsigned long long) max_sectors/2); 5345 5346 atomic_set(&mddev->recovery_active, 0); 5347 init_waitqueue_head(&mddev->recovery_wait); 5348 last_check = 0; 5349 5350 if (j>2) { 5351 printk(KERN_INFO 5352 "md: resuming %s of %s from checkpoint.\n", 5353 desc, mdname(mddev)); 5354 mddev->curr_resync = j; 5355 } 5356 5357 while (j < max_sectors) { 5358 sector_t sectors; 5359 5360 skipped 
= 0; 5361 sectors = mddev->pers->sync_request(mddev, j, &skipped, 5362 currspeed < speed_min(mddev)); 5363 if (sectors == 0) { 5364 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 5365 goto out; 5366 } 5367 5368 if (!skipped) { /* actual IO requested */ 5369 io_sectors += sectors; 5370 atomic_add(sectors, &mddev->recovery_active); 5371 } 5372 5373 j += sectors; 5374 if (j>1) mddev->curr_resync = j; 5375 mddev->curr_mark_cnt = io_sectors; 5376 if (last_check == 0) 5377 /* this is the earliers that rebuilt will be 5378 * visible in /proc/mdstat 5379 */ 5380 md_new_event(mddev); 5381 5382 if (last_check + window > io_sectors || j == max_sectors) 5383 continue; 5384 5385 last_check = io_sectors; 5386 5387 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) || 5388 test_bit(MD_RECOVERY_ERR, &mddev->recovery)) 5389 break; 5390 5391 repeat: 5392 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 5393 /* step marks */ 5394 int next = (last_mark+1) % SYNC_MARKS; 5395 5396 mddev->resync_mark = mark[next]; 5397 mddev->resync_mark_cnt = mark_cnt[next]; 5398 mark[next] = jiffies; 5399 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); 5400 last_mark = next; 5401 } 5402 5403 5404 if (kthread_should_stop()) { 5405 /* 5406 * got a signal, exit. 5407 */ 5408 printk(KERN_INFO 5409 "md: md_do_sync() got signal ... exiting\n"); 5410 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5411 goto out; 5412 } 5413 5414 /* 5415 * this loop exits only if either when we are slower than 5416 * the 'hard' speed limit, or the system was IO-idle for 5417 * a jiffy. 5418 * the system might be non-idle CPU-wise, but we only care 5419 * about not overloading the IO subsystem. 
(things like an 5420 * e2fsck being done on the RAID array should execute fast) 5421 */ 5422 mddev->queue->unplug_fn(mddev->queue); 5423 cond_resched(); 5424 5425 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 5426 /((jiffies-mddev->resync_mark)/HZ +1) +1; 5427 5428 if (currspeed > speed_min(mddev)) { 5429 if ((currspeed > speed_max(mddev)) || 5430 !is_mddev_idle(mddev)) { 5431 msleep(500); 5432 goto repeat; 5433 } 5434 } 5435 } 5436 printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc); 5437 /* 5438 * this also signals 'finished resyncing' to md_stop 5439 */ 5440 out: 5441 mddev->queue->unplug_fn(mddev->queue); 5442 5443 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 5444 5445 /* tell personality that we are finished */ 5446 mddev->pers->sync_request(mddev, max_sectors, &skipped, 1); 5447 5448 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && 5449 !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && 5450 mddev->curr_resync > 2) { 5451 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 5452 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 5453 if (mddev->curr_resync >= mddev->recovery_cp) { 5454 printk(KERN_INFO 5455 "md: checkpointing %s of %s.\n", 5456 desc, mdname(mddev)); 5457 mddev->recovery_cp = mddev->curr_resync; 5458 } 5459 } else 5460 mddev->recovery_cp = MaxSector; 5461 } else { 5462 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 5463 mddev->curr_resync = MaxSector; 5464 ITERATE_RDEV(mddev,rdev,rtmp) 5465 if (rdev->raid_disk >= 0 && 5466 !test_bit(Faulty, &rdev->flags) && 5467 !test_bit(In_sync, &rdev->flags) && 5468 rdev->recovery_offset < mddev->curr_resync) 5469 rdev->recovery_offset = mddev->curr_resync; 5470 } 5471 } 5472 set_bit(MD_CHANGE_DEVS, &mddev->flags); 5473 5474 skip: 5475 mddev->curr_resync = 0; 5476 wake_up(&resync_wait); 5477 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 5478 md_wakeup_thread(mddev->thread); 5479} 5480EXPORT_SYMBOL_GPL(md_do_sync); 5481 5482 5483static int 
remove_and_add_spares(mddev_t *mddev) 5484{ 5485 mdk_rdev_t *rdev; 5486 struct list_head *rtmp; 5487 int spares = 0; 5488 5489 ITERATE_RDEV(mddev,rdev,rtmp) 5490 if (rdev->raid_disk >= 0 && 5491 (test_bit(Faulty, &rdev->flags) || 5492 ! test_bit(In_sync, &rdev->flags)) && 5493 atomic_read(&rdev->nr_pending)==0) { 5494 if (mddev->pers->hot_remove_disk( 5495 mddev, rdev->raid_disk)==0) { 5496 char nm[20]; 5497 sprintf(nm,"rd%d", rdev->raid_disk); 5498 sysfs_remove_link(&mddev->kobj, nm); 5499 rdev->raid_disk = -1; 5500 } 5501 } 5502 5503 if (mddev->degraded) { 5504 ITERATE_RDEV(mddev,rdev,rtmp) 5505 if (rdev->raid_disk < 0 5506 && !test_bit(Faulty, &rdev->flags)) { 5507 rdev->recovery_offset = 0; 5508 if (mddev->pers->hot_add_disk(mddev,rdev)) { 5509 char nm[20]; 5510 sprintf(nm, "rd%d", rdev->raid_disk); 5511 if (sysfs_create_link(&mddev->kobj, 5512 &rdev->kobj, nm)) 5513 printk(KERN_WARNING 5514 "md: cannot register " 5515 "%s for %s\n", 5516 nm, mdname(mddev)); 5517 spares++; 5518 md_new_event(mddev); 5519 } else 5520 break; 5521 } 5522 } 5523 return spares; 5524} 5525/* 5526 * This routine is regularly called by all per-raid-array threads to 5527 * deal with generic issues like resync and super-block update. 5528 * Raid personalities that don't have a thread (linear/raid0) do not 5529 * need this as they never do any recovery or update the superblock. 5530 * 5531 * It does not do any resync itself, but rather "forks" off other threads 5532 * to do that as needed. 5533 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in 5534 * "->recovery" and create a thread at ->sync_thread. 5535 * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR) 5536 * and wakeups up this thread which will reap the thread and finish up. 5537 * This thread also removes any faulty devices (with nr_pending == 0). 5538 * 5539 * The overall approach is: 5540 * 1/ if the superblock needs updating, update it. 
5541 * 2/ If a recovery thread is running, don't do anything else. 5542 * 3/ If recovery has finished, clean up, possibly marking spares active. 5543 * 4/ If there are any faulty devices, remove them. 5544 * 5/ If array is degraded, try to add spares devices 5545 * 6/ If array has spares or is not in-sync, start a resync thread. 5546 */ 5547void md_check_recovery(mddev_t *mddev) 5548{ 5549 mdk_rdev_t *rdev; 5550 struct list_head *rtmp; 5551 5552 5553 if (mddev->bitmap) 5554 bitmap_daemon_work(mddev->bitmap); 5555 5556 if (mddev->ro) 5557 return; 5558 5559 if (signal_pending(current)) { 5560 if (mddev->pers->sync_request) { 5561 printk(KERN_INFO "md: %s in immediate safe mode\n", 5562 mdname(mddev)); 5563 mddev->safemode = 2; 5564 } 5565 flush_signals(current); 5566 } 5567 5568 if ( ! ( 5569 mddev->flags || 5570 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 5571 test_bit(MD_RECOVERY_DONE, &mddev->recovery) || 5572 (mddev->safemode == 1) || 5573 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending) 5574 && !mddev->in_sync && mddev->recovery_cp == MaxSector) 5575 )) 5576 return; 5577 5578 if (mddev_trylock(mddev)) { 5579 int spares = 0; 5580 5581 spin_lock_irq(&mddev->write_lock); 5582 if (mddev->safemode && !atomic_read(&mddev->writes_pending) && 5583 !mddev->in_sync && mddev->recovery_cp == MaxSector) { 5584 mddev->in_sync = 1; 5585 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 5586 } 5587 if (mddev->safemode == 1) 5588 mddev->safemode = 0; 5589 spin_unlock_irq(&mddev->write_lock); 5590 5591 if (mddev->flags) 5592 md_update_sb(mddev, 0); 5593 5594 5595 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 5596 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 5597 /* resync/recovery still happening */ 5598 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5599 goto unlock; 5600 } 5601 if (mddev->sync_thread) { 5602 /* resync has finished, collect result */ 5603 md_unregister_thread(mddev->sync_thread); 5604 mddev->sync_thread = NULL; 5605 if 
(!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && 5606 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 5607 /* success...*/ 5608 /* activate any spares */ 5609 mddev->pers->spare_active(mddev); 5610 } 5611 md_update_sb(mddev, 1); 5612 5613 /* if array is no-longer degraded, then any saved_raid_disk 5614 * information must be scrapped 5615 */ 5616 if (!mddev->degraded) 5617 ITERATE_RDEV(mddev,rdev,rtmp) 5618 rdev->saved_raid_disk = -1; 5619 5620 mddev->recovery = 0; 5621 /* flag recovery needed just to double check */ 5622 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5623 md_new_event(mddev); 5624 goto unlock; 5625 } 5626 /* Clear some bits that don't mean anything, but 5627 * might be left set 5628 */ 5629 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5630 clear_bit(MD_RECOVERY_ERR, &mddev->recovery); 5631 clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 5632 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 5633 5634 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 5635 goto unlock; 5636 /* no recovery is running. 5637 * remove any failed drives, then 5638 * add spares if possible. 5639 * Spare are also removed and re-added, to allow 5640 * the personality to fail the re-add. 5641 */ 5642 5643 if (mddev->reshape_position != MaxSector) { 5644 if (mddev->pers->check_reshape(mddev) != 0) 5645 /* Cannot proceed */ 5646 goto unlock; 5647 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 5648 } else if ((spares = remove_and_add_spares(mddev))) { 5649 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5650 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 5651 } else if (mddev->recovery_cp < MaxSector) { 5652 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5653 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 5654 /* nothing to be done ... */ 5655 goto unlock; 5656 5657 if (mddev->pers->sync_request) { 5658 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 5659 if (spares && mddev->bitmap && ! 
mddev->bitmap->file) { 5660 /* We are adding a device or devices to an array 5661 * which has the bitmap stored on all devices. 5662 * So make sure all bitmap pages get written 5663 */ 5664 bitmap_write_all(mddev->bitmap); 5665 } 5666 mddev->sync_thread = md_register_thread(md_do_sync, 5667 mddev, 5668 "%s_resync"); 5669 if (!mddev->sync_thread) { 5670 printk(KERN_ERR "%s: could not start resync" 5671 " thread...\n", 5672 mdname(mddev)); 5673 /* leave the spares where they are, it shouldn't hurt */ 5674 mddev->recovery = 0; 5675 } else 5676 md_wakeup_thread(mddev->sync_thread); 5677 md_new_event(mddev); 5678 } 5679 unlock: 5680 mddev_unlock(mddev); 5681 } 5682} 5683 5684static int md_notify_reboot(struct notifier_block *this, 5685 unsigned long code, void *x) 5686{ 5687 struct list_head *tmp; 5688 mddev_t *mddev; 5689 5690 if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) { 5691 5692 printk(KERN_INFO "md: stopping all md devices.\n"); 5693 5694 ITERATE_MDDEV(mddev,tmp) 5695 if (mddev_trylock(mddev)) { 5696 do_md_stop (mddev, 1); 5697 mddev_unlock(mddev); 5698 } 5699 /* 5700 * certain more exotic SCSI devices are known to be 5701 * volatile wrt too early system reboots. While the 5702 * right place to handle this issue is the given 5703 * driver, we do want to have a safe RAID driver ... 
5704 */ 5705 mdelay(1000*1); 5706 } 5707 return NOTIFY_DONE; 5708} 5709 5710static struct notifier_block md_notifier = { 5711 .notifier_call = md_notify_reboot, 5712 .next = NULL, 5713 .priority = INT_MAX, /* before any real devices */ 5714}; 5715 5716static void md_geninit(void) 5717{ 5718 struct proc_dir_entry *p; 5719 5720 dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 5721 5722 p = create_proc_entry("mdstat", S_IRUGO, NULL); 5723 if (p) 5724 p->proc_fops = &md_seq_fops; 5725} 5726 5727static int __init md_init(void) 5728{ 5729 if (register_blkdev(MAJOR_NR, "md")) 5730 return -1; 5731 if ((mdp_major=register_blkdev(0, "mdp"))<=0) { 5732 unregister_blkdev(MAJOR_NR, "md"); 5733 return -1; 5734 } 5735 blk_register_region(MKDEV(MAJOR_NR, 0), 1UL<<MINORBITS, THIS_MODULE, 5736 md_probe, NULL, NULL); 5737 blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, 5738 md_probe, NULL, NULL); 5739 5740 register_reboot_notifier(&md_notifier); 5741 raid_table_header = register_sysctl_table(raid_root_table); 5742 5743 md_geninit(); 5744 return (0); 5745} 5746 5747 5748#ifndef MODULE 5749 5750/* 5751 * Searches all registered partitions for autorun RAID arrays 5752 * at boot time. 
5753 */ 5754static dev_t detected_devices[128]; 5755static int dev_cnt; 5756 5757void md_autodetect_dev(dev_t dev) 5758{ 5759 if (dev_cnt >= 0 && dev_cnt < 127) 5760 detected_devices[dev_cnt++] = dev; 5761} 5762 5763 5764static void autostart_arrays(int part) 5765{ 5766 mdk_rdev_t *rdev; 5767 int i; 5768 5769 printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); 5770 5771 for (i = 0; i < dev_cnt; i++) { 5772 dev_t dev = detected_devices[i]; 5773 5774 rdev = md_import_device(dev,0, 0); 5775 if (IS_ERR(rdev)) 5776 continue; 5777 5778 if (test_bit(Faulty, &rdev->flags)) { 5779 MD_BUG(); 5780 continue; 5781 } 5782 list_add(&rdev->same_set, &pending_raid_disks); 5783 } 5784 dev_cnt = 0; 5785 5786 autorun_devices(part); 5787} 5788 5789#endif /* !MODULE */ 5790 5791static __exit void md_exit(void) 5792{ 5793 mddev_t *mddev; 5794 struct list_head *tmp; 5795 5796 blk_unregister_region(MKDEV(MAJOR_NR,0), 1U << MINORBITS); 5797 blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS); 5798 5799 unregister_blkdev(MAJOR_NR,"md"); 5800 unregister_blkdev(mdp_major, "mdp"); 5801 unregister_reboot_notifier(&md_notifier); 5802 unregister_sysctl_table(raid_table_header); 5803 remove_proc_entry("mdstat", NULL); 5804 ITERATE_MDDEV(mddev,tmp) { 5805 struct gendisk *disk = mddev->gendisk; 5806 if (!disk) 5807 continue; 5808 export_array(mddev); 5809 del_gendisk(disk); 5810 put_disk(disk); 5811 mddev->gendisk = NULL; 5812 mddev_put(mddev); 5813 } 5814} 5815 5816module_init(md_init) 5817module_exit(md_exit) 5818 5819static int get_ro(char *buffer, struct kernel_param *kp) 5820{ 5821 return sprintf(buffer, "%d", start_readonly); 5822} 5823static int set_ro(const char *val, struct kernel_param *kp) 5824{ 5825 char *e; 5826 int num = simple_strtoul(val, &e, 10); 5827 if (*val && (*e == '\0' || *e == '\n')) { 5828 start_readonly = num; 5829 return 0; 5830 } 5831 return -EINVAL; 5832} 5833 5834module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR); 
/* integer module parameter: readable by everyone, writable by the owner */
module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);


/* entry points exported to other modules */
EXPORT_SYMBOL(register_md_personality);
EXPORT_SYMBOL(unregister_md_personality);
EXPORT_SYMBOL(md_error);
EXPORT_SYMBOL(md_done_sync);
EXPORT_SYMBOL(md_write_start);
EXPORT_SYMBOL(md_write_end);
EXPORT_SYMBOL(md_register_thread);
EXPORT_SYMBOL(md_unregister_thread);
EXPORT_SYMBOL(md_wakeup_thread);
EXPORT_SYMBOL(md_check_recovery);
/* module metadata: licence, name alias and block-major alias */
MODULE_LICENSE("GPL");
MODULE_ALIAS("md");
MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);