1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3   md.c : Multiple Devices driver for Linux
4     Copyright (C) 1998, 1999, 2000 Ingo Molnar
5
6     completely rewritten, based on the MD driver code from Marc Zyngier
7
8   Changes:
9
10   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
11   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
12   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
13   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
14   - kmod support by: Cyrus Durgin
15   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
16   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
17
18   - lots of fixes and improvements to the RAID1/RAID5 and generic
19     RAID code (such as request based resynchronization):
20
21     Neil Brown <neilb@cse.unsw.edu.au>.
22
23   - persistent bitmap code
24     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
25
26
27   Errors, Warnings, etc.
28   Please use:
29     pr_crit() for error conditions that risk data loss
30     pr_err() for error conditions that are unexpected, like an IO error
31         or internal inconsistency
32     pr_warn() for error conditions that could have been predicted, like
33         adding a device to an array when it has incompatible metadata
34     pr_info() for interesting, very rare events, like an array starting
35         or stopping, or a resync starting or stopping
36     pr_debug() for everything else.
37
38*/
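/*
 * Illustrative example of the convention above (a sketch only, not messages
 * the driver actually emits): a predictable problem such as incompatible
 * metadata would be reported as
 *	pr_warn("md: %s: device has incompatible metadata\n", mdname(mddev));
 * while a rare, interesting event such as an array starting would use
 *	pr_info("md: %s: array started\n", mdname(mddev));
 * where mdname() is the existing helper that formats the array name.
 */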
39
40#include <linux/sched/mm.h>
41#include <linux/sched/signal.h>
42#include <linux/kthread.h>
43#include <linux/blkdev.h>
44#include <linux/blk-integrity.h>
45#include <linux/badblocks.h>
46#include <linux/sysctl.h>
47#include <linux/seq_file.h>
48#include <linux/fs.h>
49#include <linux/poll.h>
50#include <linux/ctype.h>
51#include <linux/string.h>
52#include <linux/hdreg.h>
53#include <linux/proc_fs.h>
54#include <linux/random.h>
55#include <linux/major.h>
56#include <linux/module.h>
57#include <linux/reboot.h>
58#include <linux/file.h>
59#include <linux/compat.h>
60#include <linux/delay.h>
61#include <linux/raid/md_p.h>
62#include <linux/raid/md_u.h>
63#include <linux/raid/detect.h>
64#include <linux/slab.h>
65#include <linux/percpu-refcount.h>
66#include <linux/part_stat.h>
67
68#include "md.h"
69#include "md-bitmap.h"
70#include "md-cluster.h"
71
72/* pers_list is a list of registered personalities protected by pers_lock. */
73static LIST_HEAD(pers_list);
74static DEFINE_SPINLOCK(pers_lock);
75
76static const struct kobj_type md_ktype;
77
78struct md_cluster_operations *md_cluster_ops;
79EXPORT_SYMBOL(md_cluster_ops);
80static struct module *md_cluster_mod;
81
82static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
83static struct workqueue_struct *md_wq;
84
85/*
86 * This workqueue is used by sync_work to register a new sync_thread, by
87 * del_work to remove an rdev, and by event_work, which is only set by dm-raid.
88 *
89 * Note that sync_work grabs reconfig_mutex, hence never flush this
90 * workqueue with reconfig_mutex held.
91 */
92static struct workqueue_struct *md_misc_wq;
93struct workqueue_struct *md_bitmap_wq;
94
95static int remove_and_add_spares(struct mddev *mddev,
96				 struct md_rdev *this);
97static void mddev_detach(struct mddev *mddev);
98static void export_rdev(struct md_rdev *rdev, struct mddev *mddev);
99static void md_wakeup_thread_directly(struct md_thread __rcu *thread);
100
101/*
102 * Default number of read corrections we'll attempt on an rdev
103 * before ejecting it from the array. We divide the read error
104 * count by 2 for every hour elapsed between read errors.
105 */
106#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
107/* Default safemode delay: 200 msec */
108#define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1)
109/*
110 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
111 * is 1000 KB/sec, so the extra system load does not show up that much.
112 * Increase it if you want to have more _guaranteed_ speed. Note that
113 * the RAID driver will use the maximum available bandwidth if the IO
114 * subsystem is idle. There is also an 'absolute maximum' reconstruction
115 * speed limit - in case reconstruction slows down your system despite
116 * idle IO detection.
117 *
118 * You can change it via /proc/sys/dev/raid/speed_limit_min and _max,
119 * or via /sys/block/mdX/md/sync_speed_{min,max}.
120 */
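/*
 * Example (illustrative, the value is made up): with sync_speed_min left at 0
 * for an array, speed_min() below falls back to the system-wide setting, so
 *	echo 5000 > /proc/sys/dev/raid/speed_limit_min
 * raises the guaranteed resync floor to 5000 KB/sec for every array that has
 * not set a per-array minimum in sysfs.
 */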
121
122static int sysctl_speed_limit_min = 1000;
123static int sysctl_speed_limit_max = 200000;
124static inline int speed_min(struct mddev *mddev)
125{
126	return mddev->sync_speed_min ?
127		mddev->sync_speed_min : sysctl_speed_limit_min;
128}
129
130static inline int speed_max(struct mddev *mddev)
131{
132	return mddev->sync_speed_max ?
133		mddev->sync_speed_max : sysctl_speed_limit_max;
134}
135
136static void rdev_uninit_serial(struct md_rdev *rdev)
137{
138	if (!test_and_clear_bit(CollisionCheck, &rdev->flags))
139		return;
140
141	kvfree(rdev->serial);
142	rdev->serial = NULL;
143}
144
145static void rdevs_uninit_serial(struct mddev *mddev)
146{
147	struct md_rdev *rdev;
148
149	rdev_for_each(rdev, mddev)
150		rdev_uninit_serial(rdev);
151}
152
153static int rdev_init_serial(struct md_rdev *rdev)
154{
155	/* serial_nums equals BARRIER_BUCKETS_NR */
156	int i, serial_nums = 1 << ((PAGE_SHIFT - ilog2(sizeof(atomic_t))));
157	struct serial_in_rdev *serial = NULL;
158
159	if (test_bit(CollisionCheck, &rdev->flags))
160		return 0;
161
162	serial = kvmalloc(sizeof(struct serial_in_rdev) * serial_nums,
163			  GFP_KERNEL);
164	if (!serial)
165		return -ENOMEM;
166
167	for (i = 0; i < serial_nums; i++) {
168		struct serial_in_rdev *serial_tmp = &serial[i];
169
170		spin_lock_init(&serial_tmp->serial_lock);
171		serial_tmp->serial_rb = RB_ROOT_CACHED;
172		init_waitqueue_head(&serial_tmp->serial_io_wait);
173	}
174
175	rdev->serial = serial;
176	set_bit(CollisionCheck, &rdev->flags);
177
178	return 0;
179}
180
181static int rdevs_init_serial(struct mddev *mddev)
182{
183	struct md_rdev *rdev;
184	int ret = 0;
185
186	rdev_for_each(rdev, mddev) {
187		ret = rdev_init_serial(rdev);
188		if (ret)
189			break;
190	}
191
192	/* Free all resources if the pool does not exist */
193	if (ret && !mddev->serial_info_pool)
194		rdevs_uninit_serial(mddev);
195
196	return ret;
197}
198
199/*
200 * rdev needs to enable serialization if it meets both conditions:
201 * 1. it is a multi-queue device flagged WriteMostly.
202 * 2. write-behind mode is enabled.
203 */
204static int rdev_need_serial(struct md_rdev *rdev)
205{
206	return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 &&
207		rdev->bdev->bd_disk->queue->nr_hw_queues != 1 &&
208		test_bit(WriteMostly, &rdev->flags));
209}
210
211/*
212 * Init resources for rdev(s), then create serial_info_pool if:
213 * 1. rdev is the first device that returns true from rdev_need_serial.
214 * 2. rdev is NULL, meaning we want to enable serialization for all rdevs.
215 */
216void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev)
217{
218	int ret = 0;
219
220	if (rdev && !rdev_need_serial(rdev) &&
221	    !test_bit(CollisionCheck, &rdev->flags))
222		return;
223
224	if (!rdev)
225		ret = rdevs_init_serial(mddev);
226	else
227		ret = rdev_init_serial(rdev);
228	if (ret)
229		return;
230
231	if (mddev->serial_info_pool == NULL) {
232		/*
233		 * already in memalloc noio context by
234		 * mddev_suspend()
235		 */
236		mddev->serial_info_pool =
237			mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
238						sizeof(struct serial_info));
239		if (!mddev->serial_info_pool) {
240			rdevs_uninit_serial(mddev);
241			pr_err("can't alloc memory pool for serialization\n");
242		}
243	}
244}
245
246/*
247 * Free resources from rdev(s), and destroy serial_info_pool under these conditions:
248 * 1. rdev is the last device flagged with CollisionCheck.
249 * 2. the bitmap is destroyed while the policy is not enabled.
250 * 3. for the disable policy, the pool is destroyed only when no rdev needs it.
251 */
252void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev)
253{
254	if (rdev && !test_bit(CollisionCheck, &rdev->flags))
255		return;
256
257	if (mddev->serial_info_pool) {
258		struct md_rdev *temp;
259		int num = 0; /* used to track if other rdevs need the pool */
260
261		rdev_for_each(temp, mddev) {
262			if (!rdev) {
263				if (!mddev->serialize_policy ||
264				    !rdev_need_serial(temp))
265					rdev_uninit_serial(temp);
266				else
267					num++;
268			} else if (temp != rdev &&
269				   test_bit(CollisionCheck, &temp->flags))
270				num++;
271		}
272
273		if (rdev)
274			rdev_uninit_serial(rdev);
275
276		if (num)
277			pr_info("The mempool could be used by other devices\n");
278		else {
279			mempool_destroy(mddev->serial_info_pool);
280			mddev->serial_info_pool = NULL;
281		}
282	}
283}
284
285static struct ctl_table_header *raid_table_header;
286
287static struct ctl_table raid_table[] = {
288	{
289		.procname	= "speed_limit_min",
290		.data		= &sysctl_speed_limit_min,
291		.maxlen		= sizeof(int),
292		.mode		= S_IRUGO|S_IWUSR,
293		.proc_handler	= proc_dointvec,
294	},
295	{
296		.procname	= "speed_limit_max",
297		.data		= &sysctl_speed_limit_max,
298		.maxlen		= sizeof(int),
299		.mode		= S_IRUGO|S_IWUSR,
300		.proc_handler	= proc_dointvec,
301	},
302};
303
304static int start_readonly;
305
306/*
307 * The original mechanism for creating an md device is to create
308 * a device node in /dev and to open it.  This causes races with device-close.
309 * The preferred method is to write to the "new_array" module parameter,
310 * which avoids those races.
311 * Setting create_on_open to false disables the original mechanism
312 * so all the races disappear.
313 */
314static bool create_on_open = true;
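/*
 * Illustrative usage of the preferred mechanism described above (the array
 * name is only an example):
 *	echo md127 > /sys/module/md_mod/parameters/new_array
 * creates the array node without the open()-based race; clearing the
 * create_on_open module parameter disables the legacy mechanism entirely.
 */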
315
316/*
317 * We have a system wide 'event count' that is incremented
318 * on any 'interesting' event, and readers of /proc/mdstat
319 * can use 'poll' or 'select' to find out when the event
320 * count increases.
321 *
322 * Events are:
323 *  start array, stop array, error, add device, remove device,
324 *  start build, activate spare
325 */
326static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
327static atomic_t md_event_count;
328void md_new_event(void)
329{
330	atomic_inc(&md_event_count);
331	wake_up(&md_event_waiters);
332}
333EXPORT_SYMBOL_GPL(md_new_event);
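/*
 * Userspace sketch (not part of the driver): a monitor typically opens
 * /proc/mdstat and waits for exceptional readiness, re-reading the file each
 * time md_new_event() bumps md_event_count, roughly:
 *	struct pollfd pfd = { .fd = open("/proc/mdstat", O_RDONLY),
 *			      .events = POLLPRI };
 *	poll(&pfd, 1, -1);	// returns when the event count increases
 */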
334
335/*
336 * Allows iterating over all existing md arrays.
337 * all_mddevs_lock protects this list.
338 */
339static LIST_HEAD(all_mddevs);
340static DEFINE_SPINLOCK(all_mddevs_lock);
341
342static bool is_md_suspended(struct mddev *mddev)
343{
344	return percpu_ref_is_dying(&mddev->active_io);
345}
346/* Rather than calling directly into the personality make_request function,
347 * IO requests come here first so that we can check if the device is
348 * being suspended pending a reconfiguration.
349 * We hold a refcount over the call to ->make_request.  By the time that
350 * call has finished, the bio has been linked into some internal structure
351 * and so is visible to ->quiesce(), so we don't need the refcount any more.
352 */
353static bool is_suspended(struct mddev *mddev, struct bio *bio)
354{
355	if (is_md_suspended(mddev))
356		return true;
357	if (bio_data_dir(bio) != WRITE)
358		return false;
359	if (READ_ONCE(mddev->suspend_lo) >= READ_ONCE(mddev->suspend_hi))
360		return false;
361	if (bio->bi_iter.bi_sector >= READ_ONCE(mddev->suspend_hi))
362		return false;
363	if (bio_end_sector(bio) < READ_ONCE(mddev->suspend_lo))
364		return false;
365	return true;
366}
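/*
 * For example (illustrative numbers): with suspend_lo == 1000 and
 * suspend_hi == 2000, a WRITE whose end sector is below 1000 or whose first
 * sector is at or beyond 2000 passes straight through, while a write that
 * overlaps the suspended window is reported as suspended and must wait.
 */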
367
368bool md_handle_request(struct mddev *mddev, struct bio *bio)
369{
370check_suspended:
371	if (is_suspended(mddev, bio)) {
372		DEFINE_WAIT(__wait);
373		/* Bail out if REQ_NOWAIT is set for the bio */
374		if (bio->bi_opf & REQ_NOWAIT) {
375			bio_wouldblock_error(bio);
376			return true;
377		}
378		for (;;) {
379			prepare_to_wait(&mddev->sb_wait, &__wait,
380					TASK_UNINTERRUPTIBLE);
381			if (!is_suspended(mddev, bio))
382				break;
383			schedule();
384		}
385		finish_wait(&mddev->sb_wait, &__wait);
386	}
387	if (!percpu_ref_tryget_live(&mddev->active_io))
388		goto check_suspended;
389
390	if (!mddev->pers->make_request(mddev, bio)) {
391		percpu_ref_put(&mddev->active_io);
392		if (!mddev->gendisk && mddev->pers->prepare_suspend)
393			return false;
394		goto check_suspended;
395	}
396
397	percpu_ref_put(&mddev->active_io);
398	return true;
399}
400EXPORT_SYMBOL(md_handle_request);
401
402static void md_submit_bio(struct bio *bio)
403{
404	const int rw = bio_data_dir(bio);
405	struct mddev *mddev = bio->bi_bdev->bd_disk->private_data;
406
407	if (mddev == NULL || mddev->pers == NULL) {
408		bio_io_error(bio);
409		return;
410	}
411
412	if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) {
413		bio_io_error(bio);
414		return;
415	}
416
417	bio = bio_split_to_limits(bio);
418	if (!bio)
419		return;
420
421	if (mddev->ro == MD_RDONLY && unlikely(rw == WRITE)) {
422		if (bio_sectors(bio) != 0)
423			bio->bi_status = BLK_STS_IOERR;
424		bio_endio(bio);
425		return;
426	}
427
428	/* bio could be mergeable after passing to underlayer */
429	bio->bi_opf &= ~REQ_NOMERGE;
430
431	md_handle_request(mddev, bio);
432}
433
434/*
435 * Make sure no new requests are submitted to the device, and any requests that
436 * have been submitted are completely handled.
437 */
438int mddev_suspend(struct mddev *mddev, bool interruptible)
439{
440	int err = 0;
441
442	/*
443	 * Holding reconfig_mutex while waiting for normal io would deadlock,
444	 * because other contexts then can't update the super_block, and normal
445	 * io can rely on the super_block being updated.
446	 */
447	lockdep_assert_not_held(&mddev->reconfig_mutex);
448
449	if (interruptible)
450		err = mutex_lock_interruptible(&mddev->suspend_mutex);
451	else
452		mutex_lock(&mddev->suspend_mutex);
453	if (err)
454		return err;
455
456	if (mddev->suspended) {
457		WRITE_ONCE(mddev->suspended, mddev->suspended + 1);
458		mutex_unlock(&mddev->suspend_mutex);
459		return 0;
460	}
461
462	percpu_ref_kill(&mddev->active_io);
463	if (interruptible)
464		err = wait_event_interruptible(mddev->sb_wait,
465				percpu_ref_is_zero(&mddev->active_io));
466	else
467		wait_event(mddev->sb_wait,
468				percpu_ref_is_zero(&mddev->active_io));
469	if (err) {
470		percpu_ref_resurrect(&mddev->active_io);
471		mutex_unlock(&mddev->suspend_mutex);
472		return err;
473	}
474
475	/*
476	 * For raid456, io might be waiting for reshape to make progress,
477	 * allow new reshape to start while waiting for io to be done to
478	 * prevent deadlock.
479	 */
480	WRITE_ONCE(mddev->suspended, mddev->suspended + 1);
481
482	del_timer_sync(&mddev->safemode_timer);
483	/* restrict memory reclaim I/O while the raid array is suspended */
484	mddev->noio_flag = memalloc_noio_save();
485
486	mutex_unlock(&mddev->suspend_mutex);
487	return 0;
488}
489EXPORT_SYMBOL_GPL(mddev_suspend);
490
491static void __mddev_resume(struct mddev *mddev, bool recovery_needed)
492{
493	lockdep_assert_not_held(&mddev->reconfig_mutex);
494
495	mutex_lock(&mddev->suspend_mutex);
496	WRITE_ONCE(mddev->suspended, mddev->suspended - 1);
497	if (mddev->suspended) {
498		mutex_unlock(&mddev->suspend_mutex);
499		return;
500	}
501
502	/* entered the memalloc scope from mddev_suspend() */
503	memalloc_noio_restore(mddev->noio_flag);
504
505	percpu_ref_resurrect(&mddev->active_io);
506	wake_up(&mddev->sb_wait);
507
508	if (recovery_needed)
509		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
510	md_wakeup_thread(mddev->thread);
511	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
512
513	mutex_unlock(&mddev->suspend_mutex);
514}
515
516void mddev_resume(struct mddev *mddev)
517{
518	return __mddev_resume(mddev, true);
519}
520EXPORT_SYMBOL_GPL(mddev_resume);
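/*
 * Typical pairing (a sketch, error handling trimmed): callers bracket a
 * reconfiguration that must not race with I/O, without holding
 * reconfig_mutex on entry:
 *	err = mddev_suspend(mddev, true);
 *	if (err)
 *		return err;
 *	// ... reconfigure ...
 *	mddev_resume(mddev);
 * Calls may nest; mddev->suspended counts the depth.
 */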
521
522/* sync bdev before setting the device to read-only or stopping the array */
523static int mddev_set_closing_and_sync_blockdev(struct mddev *mddev, int opener_num)
524{
525	mutex_lock(&mddev->open_mutex);
526	if (mddev->pers && atomic_read(&mddev->openers) > opener_num) {
527		mutex_unlock(&mddev->open_mutex);
528		return -EBUSY;
529	}
530	if (test_and_set_bit(MD_CLOSING, &mddev->flags)) {
531		mutex_unlock(&mddev->open_mutex);
532		return -EBUSY;
533	}
534	mutex_unlock(&mddev->open_mutex);
535
536	sync_blockdev(mddev->gendisk->part0);
537	return 0;
538}
539
540/*
541 * Generic flush handling for md
542 */
543
544static void md_end_flush(struct bio *bio)
545{
546	struct md_rdev *rdev = bio->bi_private;
547	struct mddev *mddev = rdev->mddev;
548
549	bio_put(bio);
550
551	rdev_dec_pending(rdev, mddev);
552
553	if (atomic_dec_and_test(&mddev->flush_pending)) {
554		/* This pairs with percpu_ref_get() in md_flush_request() */
555		percpu_ref_put(&mddev->active_io);
556
557		/* The pre-request flush has finished */
558		queue_work(md_wq, &mddev->flush_work);
559	}
560}
561
562static void md_submit_flush_data(struct work_struct *ws);
563
564static void submit_flushes(struct work_struct *ws)
565{
566	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
567	struct md_rdev *rdev;
568
569	mddev->start_flush = ktime_get_boottime();
570	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
571	atomic_set(&mddev->flush_pending, 1);
572	rcu_read_lock();
573	rdev_for_each_rcu(rdev, mddev)
574		if (rdev->raid_disk >= 0 &&
575		    !test_bit(Faulty, &rdev->flags)) {
576			struct bio *bi;
577
578			atomic_inc(&rdev->nr_pending);
579			rcu_read_unlock();
580			bi = bio_alloc_bioset(rdev->bdev, 0,
581					      REQ_OP_WRITE | REQ_PREFLUSH,
582					      GFP_NOIO, &mddev->bio_set);
583			bi->bi_end_io = md_end_flush;
584			bi->bi_private = rdev;
585			atomic_inc(&mddev->flush_pending);
586			submit_bio(bi);
587			rcu_read_lock();
588		}
589	rcu_read_unlock();
590	if (atomic_dec_and_test(&mddev->flush_pending)) {
591		/* This pairs with percpu_ref_get() in md_flush_request() */
592		percpu_ref_put(&mddev->active_io);
593
594		queue_work(md_wq, &mddev->flush_work);
595	}
596}
597
598static void md_submit_flush_data(struct work_struct *ws)
599{
600	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
601	struct bio *bio = mddev->flush_bio;
602
603	/*
604	 * flush_bio must be reset before calling into md_handle_request to avoid
605	 * a deadlock: other bios that already passed the md_handle_request suspend
606	 * check could wait for this flush, while the md_handle_request call below
607	 * could wait for those bios because of that same suspend check.
608	 */
609	spin_lock_irq(&mddev->lock);
610	mddev->prev_flush_start = mddev->start_flush;
611	mddev->flush_bio = NULL;
612	spin_unlock_irq(&mddev->lock);
613	wake_up(&mddev->sb_wait);
614
615	if (bio->bi_iter.bi_size == 0) {
616		/* an empty barrier - all done */
617		bio_endio(bio);
618	} else {
619		bio->bi_opf &= ~REQ_PREFLUSH;
620		md_handle_request(mddev, bio);
621	}
622}
623
624/*
625 * Manages consolidation of flushes and submitting any flushes needed for
626 * a bio with REQ_PREFLUSH.  Returns true if the bio is finished or is
627 * being finished in another context.  Returns false if the flushing is
628 * complete but still needs the I/O portion of the bio to be processed.
629 */
630bool md_flush_request(struct mddev *mddev, struct bio *bio)
631{
632	ktime_t req_start = ktime_get_boottime();
633	spin_lock_irq(&mddev->lock);
634	/* flush requests wait until ongoing flush completes,
635	 * hence coalescing all the pending requests.
636	 */
637	wait_event_lock_irq(mddev->sb_wait,
638			    !mddev->flush_bio ||
639			    ktime_before(req_start, mddev->prev_flush_start),
640			    mddev->lock);
641	/* new request after previous flush is completed */
642	if (ktime_after(req_start, mddev->prev_flush_start)) {
643		WARN_ON(mddev->flush_bio);
644		/*
645		 * Grab a reference to make sure mddev_suspend() will wait for
646		 * this flush to be done.
647		 *
648		 * md_flush_request() is called under md_handle_request() and
649		 * 'active_io' is already grabbed, hence percpu_ref_is_zero()
650		 * won't pass, percpu_ref_tryget_live() can't be used because
651		 * percpu_ref_kill() can be called by mddev_suspend()
652		 * concurrently.
653		 */
654		WARN_ON(percpu_ref_is_zero(&mddev->active_io));
655		percpu_ref_get(&mddev->active_io);
656		mddev->flush_bio = bio;
657		bio = NULL;
658	}
659	spin_unlock_irq(&mddev->lock);
660
661	if (!bio) {
662		INIT_WORK(&mddev->flush_work, submit_flushes);
663		queue_work(md_wq, &mddev->flush_work);
664	} else {
665		/* flush was performed for some other bio while we waited. */
666		if (bio->bi_iter.bi_size == 0)
667			/* an empty barrier - all done */
668			bio_endio(bio);
669		else {
670			bio->bi_opf &= ~REQ_PREFLUSH;
671			return false;
672		}
673	}
674	return true;
675}
676EXPORT_SYMBOL(md_flush_request);
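/*
 * Caller pattern (a sketch, not lifted verbatim from any personality): a
 * personality's ->make_request typically begins with
 *	if (unlikely(bio->bi_opf & REQ_PREFLUSH)
 *	    && md_flush_request(mddev, bio))
 *		return true;	// bio finished or handed to flush_work
 * and only continues to its data path when md_flush_request() returns false,
 * i.e. the flush itself is done but the data portion still needs handling.
 */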
677
678static inline struct mddev *mddev_get(struct mddev *mddev)
679{
680	lockdep_assert_held(&all_mddevs_lock);
681
682	if (test_bit(MD_DELETED, &mddev->flags))
683		return NULL;
684	atomic_inc(&mddev->active);
685	return mddev;
686}
687
688static void mddev_delayed_delete(struct work_struct *ws);
689
690static void __mddev_put(struct mddev *mddev)
691{
692	if (mddev->raid_disks || !list_empty(&mddev->disks) ||
693	    mddev->ctime || mddev->hold_active)
694		return;
695
696	/* Array is not configured at all, and not held active, so destroy it */
697	set_bit(MD_DELETED, &mddev->flags);
698
699	/*
700	 * Call queue_work inside the spinlock so that flush_workqueue() after
701	 * mddev_find will succeed in waiting for the work to be done.
702	 */
703	queue_work(md_misc_wq, &mddev->del_work);
704}
705
706void mddev_put(struct mddev *mddev)
707{
708	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
709		return;
710
711	__mddev_put(mddev);
712	spin_unlock(&all_mddevs_lock);
713}
714
715static void md_safemode_timeout(struct timer_list *t);
716static void md_start_sync(struct work_struct *ws);
717
718static void active_io_release(struct percpu_ref *ref)
719{
720	struct mddev *mddev = container_of(ref, struct mddev, active_io);
721
722	wake_up(&mddev->sb_wait);
723}
724
725static void no_op(struct percpu_ref *r) {}
726
727int mddev_init(struct mddev *mddev)
728{
729
730	if (percpu_ref_init(&mddev->active_io, active_io_release,
731			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
732		return -ENOMEM;
733
734	if (percpu_ref_init(&mddev->writes_pending, no_op,
735			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
736		percpu_ref_exit(&mddev->active_io);
737		return -ENOMEM;
738	}
739
740	/* We want to start with the refcount at zero */
741	percpu_ref_put(&mddev->writes_pending);
742
743	mutex_init(&mddev->open_mutex);
744	mutex_init(&mddev->reconfig_mutex);
745	mutex_init(&mddev->sync_mutex);
746	mutex_init(&mddev->suspend_mutex);
747	mutex_init(&mddev->bitmap_info.mutex);
748	INIT_LIST_HEAD(&mddev->disks);
749	INIT_LIST_HEAD(&mddev->all_mddevs);
750	INIT_LIST_HEAD(&mddev->deleting);
751	timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0);
752	atomic_set(&mddev->active, 1);
753	atomic_set(&mddev->openers, 0);
754	atomic_set(&mddev->sync_seq, 0);
755	spin_lock_init(&mddev->lock);
756	atomic_set(&mddev->flush_pending, 0);
757	init_waitqueue_head(&mddev->sb_wait);
758	init_waitqueue_head(&mddev->recovery_wait);
759	mddev->reshape_position = MaxSector;
760	mddev->reshape_backwards = 0;
761	mddev->last_sync_action = "none";
762	mddev->resync_min = 0;
763	mddev->resync_max = MaxSector;
764	mddev->level = LEVEL_NONE;
765
766	INIT_WORK(&mddev->sync_work, md_start_sync);
767	INIT_WORK(&mddev->del_work, mddev_delayed_delete);
768
769	return 0;
770}
771EXPORT_SYMBOL_GPL(mddev_init);
772
773void mddev_destroy(struct mddev *mddev)
774{
775	percpu_ref_exit(&mddev->active_io);
776	percpu_ref_exit(&mddev->writes_pending);
777}
778EXPORT_SYMBOL_GPL(mddev_destroy);
779
780static struct mddev *mddev_find_locked(dev_t unit)
781{
782	struct mddev *mddev;
783
784	list_for_each_entry(mddev, &all_mddevs, all_mddevs)
785		if (mddev->unit == unit)
786			return mddev;
787
788	return NULL;
789}
790
791/* find an unused unit number */
792static dev_t mddev_alloc_unit(void)
793{
794	static int next_minor = 512;
795	int start = next_minor;
796	bool is_free = false;
797	dev_t dev = 0;
798
799	while (!is_free) {
800		dev = MKDEV(MD_MAJOR, next_minor);
801		next_minor++;
802		if (next_minor > MINORMASK)
803			next_minor = 0;
804		if (next_minor == start)
805			return 0;		/* Oh dear, all in use. */
806		is_free = !mddev_find_locked(dev);
807	}
808
809	return dev;
810}
811
812static struct mddev *mddev_alloc(dev_t unit)
813{
814	struct mddev *new;
815	int error;
816
817	if (unit && MAJOR(unit) != MD_MAJOR)
818		unit &= ~((1 << MdpMinorShift) - 1);
819
820	new = kzalloc(sizeof(*new), GFP_KERNEL);
821	if (!new)
822		return ERR_PTR(-ENOMEM);
823
824	error = mddev_init(new);
825	if (error)
826		goto out_free_new;
827
828	spin_lock(&all_mddevs_lock);
829	if (unit) {
830		error = -EEXIST;
831		if (mddev_find_locked(unit))
832			goto out_destroy_new;
833		new->unit = unit;
834		if (MAJOR(unit) == MD_MAJOR)
835			new->md_minor = MINOR(unit);
836		else
837			new->md_minor = MINOR(unit) >> MdpMinorShift;
838		new->hold_active = UNTIL_IOCTL;
839	} else {
840		error = -ENODEV;
841		new->unit = mddev_alloc_unit();
842		if (!new->unit)
843			goto out_destroy_new;
844		new->md_minor = MINOR(new->unit);
845		new->hold_active = UNTIL_STOP;
846	}
847
848	list_add(&new->all_mddevs, &all_mddevs);
849	spin_unlock(&all_mddevs_lock);
850	return new;
851
852out_destroy_new:
853	spin_unlock(&all_mddevs_lock);
854	mddev_destroy(new);
855out_free_new:
856	kfree(new);
857	return ERR_PTR(error);
858}
859
860static void mddev_free(struct mddev *mddev)
861{
862	spin_lock(&all_mddevs_lock);
863	list_del(&mddev->all_mddevs);
864	spin_unlock(&all_mddevs_lock);
865
866	mddev_destroy(mddev);
867	kfree(mddev);
868}
869
870static const struct attribute_group md_redundancy_group;
871
872void mddev_unlock(struct mddev *mddev)
873{
874	struct md_rdev *rdev;
875	struct md_rdev *tmp;
876	LIST_HEAD(delete);
877
878	if (!list_empty(&mddev->deleting))
879		list_splice_init(&mddev->deleting, &delete);
880
881	if (mddev->to_remove) {
882		/* These cannot be removed under reconfig_mutex as
883		 * an access to the files will try to take reconfig_mutex
884		 * while holding the file unremovable, which leads to
885		 * a deadlock.
886		 * So set sysfs_active while the removal is happening,
887		 * and anything else which might set ->to_remove or may
888		 * otherwise change the sysfs namespace will fail with
889		 * -EBUSY if sysfs_active is still set.
890		 * We set sysfs_active under reconfig_mutex and elsewhere
891		 * test it under the same mutex to ensure its correct value
892		 * is seen.
893		 */
894		const struct attribute_group *to_remove = mddev->to_remove;
895		mddev->to_remove = NULL;
896		mddev->sysfs_active = 1;
897		mutex_unlock(&mddev->reconfig_mutex);
898
899		if (mddev->kobj.sd) {
900			if (to_remove != &md_redundancy_group)
901				sysfs_remove_group(&mddev->kobj, to_remove);
902			if (mddev->pers == NULL ||
903			    mddev->pers->sync_request == NULL) {
904				sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
905				if (mddev->sysfs_action)
906					sysfs_put(mddev->sysfs_action);
907				if (mddev->sysfs_completed)
908					sysfs_put(mddev->sysfs_completed);
909				if (mddev->sysfs_degraded)
910					sysfs_put(mddev->sysfs_degraded);
911				mddev->sysfs_action = NULL;
912				mddev->sysfs_completed = NULL;
913				mddev->sysfs_degraded = NULL;
914			}
915		}
916		mddev->sysfs_active = 0;
917	} else
918		mutex_unlock(&mddev->reconfig_mutex);
919
920	md_wakeup_thread(mddev->thread);
921	wake_up(&mddev->sb_wait);
922
923	list_for_each_entry_safe(rdev, tmp, &delete, same_set) {
924		list_del_init(&rdev->same_set);
925		kobject_del(&rdev->kobj);
926		export_rdev(rdev, mddev);
927	}
928}
929EXPORT_SYMBOL_GPL(mddev_unlock);
930
931struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr)
932{
933	struct md_rdev *rdev;
934
935	rdev_for_each_rcu(rdev, mddev)
936		if (rdev->desc_nr == nr)
937			return rdev;
938
939	return NULL;
940}
941EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu);
942
943static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
944{
945	struct md_rdev *rdev;
946
947	rdev_for_each(rdev, mddev)
948		if (rdev->bdev->bd_dev == dev)
949			return rdev;
950
951	return NULL;
952}
953
954struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev)
955{
956	struct md_rdev *rdev;
957
958	rdev_for_each_rcu(rdev, mddev)
959		if (rdev->bdev->bd_dev == dev)
960			return rdev;
961
962	return NULL;
963}
964EXPORT_SYMBOL_GPL(md_find_rdev_rcu);
965
966static struct md_personality *find_pers(int level, char *clevel)
967{
968	struct md_personality *pers;
969	list_for_each_entry(pers, &pers_list, list) {
970		if (level != LEVEL_NONE && pers->level == level)
971			return pers;
972		if (strcmp(pers->name, clevel)==0)
973			return pers;
974	}
975	return NULL;
976}
977
978/* return the offset of the super block in 512-byte sectors */
979static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
980{
981	return MD_NEW_SIZE_SECTORS(bdev_nr_sectors(rdev->bdev));
982}
983
984static int alloc_disk_sb(struct md_rdev *rdev)
985{
986	rdev->sb_page = alloc_page(GFP_KERNEL);
987	if (!rdev->sb_page)
988		return -ENOMEM;
989	return 0;
990}
991
992void md_rdev_clear(struct md_rdev *rdev)
993{
994	if (rdev->sb_page) {
995		put_page(rdev->sb_page);
996		rdev->sb_loaded = 0;
997		rdev->sb_page = NULL;
998		rdev->sb_start = 0;
999		rdev->sectors = 0;
1000	}
1001	if (rdev->bb_page) {
1002		put_page(rdev->bb_page);
1003		rdev->bb_page = NULL;
1004	}
1005	badblocks_exit(&rdev->badblocks);
1006}
1007EXPORT_SYMBOL_GPL(md_rdev_clear);
1008
1009static void super_written(struct bio *bio)
1010{
1011	struct md_rdev *rdev = bio->bi_private;
1012	struct mddev *mddev = rdev->mddev;
1013
1014	if (bio->bi_status) {
1015		pr_err("md: %s gets error=%d\n", __func__,
1016		       blk_status_to_errno(bio->bi_status));
1017		md_error(mddev, rdev);
1018		if (!test_bit(Faulty, &rdev->flags)
1019		    && (bio->bi_opf & MD_FAILFAST)) {
1020			set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags);
1021			set_bit(LastDev, &rdev->flags);
1022		}
1023	} else
1024		clear_bit(LastDev, &rdev->flags);
1025
1026	bio_put(bio);
1027
1028	rdev_dec_pending(rdev, mddev);
1029
1030	if (atomic_dec_and_test(&mddev->pending_writes))
1031		wake_up(&mddev->sb_wait);
1032}
1033
1034void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
1035		   sector_t sector, int size, struct page *page)
1036{
1037	/* write first size bytes of page to sector of rdev
1038	 * Increment mddev->pending_writes before returning
1039	 * and decrement it on completion, waking up sb_wait
1040	 * if zero is reached.
1041	 * If an error occurs, call md_error.
1042	 */
1043	struct bio *bio;
1044
1045	if (!page)
1046		return;
1047
1048	if (test_bit(Faulty, &rdev->flags))
1049		return;
1050
1051	bio = bio_alloc_bioset(rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev,
1052			      1,
1053			      REQ_OP_WRITE | REQ_SYNC | REQ_IDLE | REQ_META
1054				  | REQ_PREFLUSH | REQ_FUA,
1055			      GFP_NOIO, &mddev->sync_set);
1056
1057	atomic_inc(&rdev->nr_pending);
1058
1059	bio->bi_iter.bi_sector = sector;
1060	__bio_add_page(bio, page, size, 0);
1061	bio->bi_private = rdev;
1062	bio->bi_end_io = super_written;
1063
1064	if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) &&
1065	    test_bit(FailFast, &rdev->flags) &&
1066	    !test_bit(LastDev, &rdev->flags))
1067		bio->bi_opf |= MD_FAILFAST;
1068
1069	atomic_inc(&mddev->pending_writes);
1070	submit_bio(bio);
1071}
1072
1073int md_super_wait(struct mddev *mddev)
1074{
1075	/* wait for all superblock writes that were scheduled to complete */
1076	wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
1077	if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags))
1078		return -EAGAIN;
1079	return 0;
1080}
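/*
 * Usage sketch: superblock updates pair md_super_write() with md_super_wait(),
 * retrying while a failfast write needs to be redone:
 *	do {
 *		md_super_write(mddev, rdev, rdev->sb_start,
 *			       rdev->sb_size, rdev->sb_page);
 *	} while (md_super_wait(mddev) < 0);
 * which is the pattern super_90_rdev_size_change() uses further down.
 */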
1081
1082int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
1083		 struct page *page, blk_opf_t opf, bool metadata_op)
1084{
1085	struct bio bio;
1086	struct bio_vec bvec;
1087
1088	if (metadata_op && rdev->meta_bdev)
1089		bio_init(&bio, rdev->meta_bdev, &bvec, 1, opf);
1090	else
1091		bio_init(&bio, rdev->bdev, &bvec, 1, opf);
1092
1093	if (metadata_op)
1094		bio.bi_iter.bi_sector = sector + rdev->sb_start;
1095	else if (rdev->mddev->reshape_position != MaxSector &&
1096		 (rdev->mddev->reshape_backwards ==
1097		  (sector >= rdev->mddev->reshape_position)))
1098		bio.bi_iter.bi_sector = sector + rdev->new_data_offset;
1099	else
1100		bio.bi_iter.bi_sector = sector + rdev->data_offset;
1101	__bio_add_page(&bio, page, size, 0);
1102
1103	submit_bio_wait(&bio);
1104
1105	return !bio.bi_status;
1106}
1107EXPORT_SYMBOL_GPL(sync_page_io);
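/*
 * Usage sketch: metadata reads go through this helper with metadata_op set,
 * e.g. read_disk_sb() below issues
 *	sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, true);
 * so the sector is interpreted relative to rdev->sb_start.
 */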
1108
1109static int read_disk_sb(struct md_rdev *rdev, int size)
1110{
1111	if (rdev->sb_loaded)
1112		return 0;
1113
1114	if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, true))
1115		goto fail;
1116	rdev->sb_loaded = 1;
1117	return 0;
1118
1119fail:
1120	pr_err("md: disabled device %pg, could not read superblock.\n",
1121	       rdev->bdev);
1122	return -EINVAL;
1123}
1124
1125static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
1126{
1127	return	sb1->set_uuid0 == sb2->set_uuid0 &&
1128		sb1->set_uuid1 == sb2->set_uuid1 &&
1129		sb1->set_uuid2 == sb2->set_uuid2 &&
1130		sb1->set_uuid3 == sb2->set_uuid3;
1131}
1132
1133static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
1134{
1135	int ret;
1136	mdp_super_t *tmp1, *tmp2;
1137
1138	tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
1139	tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
1140
1141	if (!tmp1 || !tmp2) {
1142		ret = 0;
1143		goto abort;
1144	}
1145
1146	*tmp1 = *sb1;
1147	*tmp2 = *sb2;
1148
1149	/*
1150	 * nr_disks is not constant
1151	 */
1152	tmp1->nr_disks = 0;
1153	tmp2->nr_disks = 0;
1154
1155	ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
1156abort:
1157	kfree(tmp1);
1158	kfree(tmp2);
1159	return ret;
1160}
1161
1162static u32 md_csum_fold(u32 csum)
1163{
1164	csum = (csum & 0xffff) + (csum >> 16);
1165	return (csum & 0xffff) + (csum >> 16);
1166}
1167
1168static unsigned int calc_sb_csum(mdp_super_t *sb)
1169{
1170	u64 newcsum = 0;
1171	u32 *sb32 = (u32*)sb;
1172	int i;
1173	unsigned int disk_csum, csum;
1174
1175	disk_csum = sb->sb_csum;
1176	sb->sb_csum = 0;
1177
1178	for (i = 0; i < MD_SB_BYTES/4 ; i++)
1179		newcsum += sb32[i];
1180	csum = (newcsum & 0xffffffff) + (newcsum>>32);
1181
1182#ifdef CONFIG_ALPHA
1183	/* This used to use csum_partial, which was wrong for several
1184	 * reasons including that different results are returned on
1185	 * different architectures.  It isn't critical that we get exactly
1186	 * the same return value as before (we always csum_fold before
1187	 * testing, and that removes any differences).  However as we
1188	 * know that csum_partial always returned a 16bit value on
1189	 * alphas, do a fold to maximise conformity to previous behaviour.
1190	 */
1191	sb->sb_csum = md_csum_fold(disk_csum);
1192#else
1193	sb->sb_csum = disk_csum;
1194#endif
1195	return csum;
1196}
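/*
 * Verification sketch: because of the ALPHA quirk above, consumers compare
 * folded checksums rather than raw ones:
 *	if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum))
 *		goto abort;	// reject the superblock
 * which is exactly what super_90_load() does below.
 */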
1197
1198/*
1199 * Handle superblock details.
1200 * We want to be able to handle multiple superblock formats
1201 * so we have a common interface to them all, and an array of
1202 * different handlers.
1203 * We rely on user-space to write the initial superblock, and support
1204 * reading and updating of superblocks.
1205 * Interface methods are:
1206 *   int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version)
1207 *      loads and validates a superblock on dev.
1208 *      if refdev != NULL, compare superblocks on both devices
1209 *    Return:
1210 *      0 - dev has a superblock that is compatible with refdev
1211 *      1 - dev has a superblock that is compatible and newer than refdev
1212 *          so dev should be used as the refdev in future
1213 *     -EINVAL superblock incompatible or invalid
1214 *     -othererror e.g. -EIO
1215 *
1216 *   int validate_super(struct mddev *mddev, struct md_rdev *dev)
1217 *      Verify that dev is acceptable into mddev.
1218 *       The first time, mddev->raid_disks will be 0, and data from
1219 *       dev should be merged in.  Subsequent calls check that dev
1220 *       is new enough.  Return 0 or -EINVAL
1221 *
1222 *   void sync_super(struct mddev *mddev, struct md_rdev *dev)
1223 *     Update the superblock for rdev with data in mddev
1224 *     This does not write to disc.
1225 *
1226 */
1227
1228struct super_type  {
1229	char		    *name;
1230	struct module	    *owner;
1231	int		    (*load_super)(struct md_rdev *rdev,
1232					  struct md_rdev *refdev,
1233					  int minor_version);
1234	int		    (*validate_super)(struct mddev *mddev,
1235					      struct md_rdev *freshest,
1236					      struct md_rdev *rdev);
1237	void		    (*sync_super)(struct mddev *mddev,
1238					  struct md_rdev *rdev);
1239	unsigned long long  (*rdev_size_change)(struct md_rdev *rdev,
1240						sector_t num_sectors);
1241	int		    (*allow_new_offset)(struct md_rdev *rdev,
1242						unsigned long long new_offset);
1243};
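/*
 * The handlers below (super_90_* for 0.90 metadata, super_1_* for the v1.x
 * formats) each provide one instance of this interface; a super_types[]
 * array, defined later in md.c and outside this excerpt, wires them up.
 */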
1244
1245/*
1246 * Check that the given mddev has no bitmap.
1247 *
1248 * This function is called from the run method of all personalities that do not
1249 * support bitmaps. It prints an error message and returns non-zero if mddev
1250 * has a bitmap. Otherwise, it returns 0.
1251 *
1252 */
1253int md_check_no_bitmap(struct mddev *mddev)
1254{
1255	if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
1256		return 0;
1257	pr_warn("%s: bitmaps are not supported for %s\n",
1258		mdname(mddev), mddev->pers->name);
1259	return 1;
1260}
1261EXPORT_SYMBOL(md_check_no_bitmap);
1262
1263/*
1264 * load_super for 0.90.0
1265 */
1266static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
1267{
1268	mdp_super_t *sb;
1269	int ret;
1270	bool spare_disk = true;
1271
1272	/*
1273	 * Calculate the position of the superblock (in 512-byte sectors);
1274	 * it's at the end of the disk.
1275	 *
1276	 * It also happens to be a multiple of 4KB.
1277	 */
1278	rdev->sb_start = calc_dev_sboffset(rdev);
1279
1280	ret = read_disk_sb(rdev, MD_SB_BYTES);
1281	if (ret)
1282		return ret;
1283
1284	ret = -EINVAL;
1285
1286	sb = page_address(rdev->sb_page);
1287
1288	if (sb->md_magic != MD_SB_MAGIC) {
1289		pr_warn("md: invalid raid superblock magic on %pg\n",
1290			rdev->bdev);
1291		goto abort;
1292	}
1293
1294	if (sb->major_version != 0 ||
1295	    sb->minor_version < 90 ||
1296	    sb->minor_version > 91) {
1297		pr_warn("Bad version number %d.%d on %pg\n",
1298			sb->major_version, sb->minor_version, rdev->bdev);
1299		goto abort;
1300	}
1301
1302	if (sb->raid_disks <= 0)
1303		goto abort;
1304
1305	if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
1306		pr_warn("md: invalid superblock checksum on %pg\n", rdev->bdev);
1307		goto abort;
1308	}
1309
1310	rdev->preferred_minor = sb->md_minor;
1311	rdev->data_offset = 0;
1312	rdev->new_data_offset = 0;
1313	rdev->sb_size = MD_SB_BYTES;
1314	rdev->badblocks.shift = -1;
1315
1316	rdev->desc_nr = sb->this_disk.number;
1317
1318	/* not spare disk */
1319	if (rdev->desc_nr >= 0 && rdev->desc_nr < MD_SB_DISKS &&
1320	    sb->disks[rdev->desc_nr].state & ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
1321		spare_disk = false;
1322
1323	if (!refdev) {
1324		if (!spare_disk)
1325			ret = 1;
1326		else
1327			ret = 0;
1328	} else {
1329		__u64 ev1, ev2;
1330		mdp_super_t *refsb = page_address(refdev->sb_page);
1331		if (!md_uuid_equal(refsb, sb)) {
1332			pr_warn("md: %pg has different UUID to %pg\n",
1333				rdev->bdev, refdev->bdev);
1334			goto abort;
1335		}
1336		if (!md_sb_equal(refsb, sb)) {
1337			pr_warn("md: %pg has same UUID but different superblock to %pg\n",
1338				rdev->bdev, refdev->bdev);
1339			goto abort;
1340		}
1341		ev1 = md_event(sb);
1342		ev2 = md_event(refsb);
1343
1344		if (!spare_disk && ev1 > ev2)
1345			ret = 1;
1346		else
1347			ret = 0;
1348	}
1349	rdev->sectors = rdev->sb_start;
1350	/* Limit to 4TB as metadata cannot record more than that.
1351	 * (not needed for Linear and RAID0 as metadata doesn't
1352	 * record this size)
1353	 */
1354	if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1)
1355		rdev->sectors = (sector_t)(2ULL << 32) - 2;
1356
1357	if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
1358		/* "this cannot possibly happen" ... */
1359		ret = -EINVAL;
1360
1361 abort:
1362	return ret;
1363}
1364
1365/*
1366 * validate_super for 0.90.0
1367 * note: we are not using "freshest" for 0.9 superblock
1368 */
1369static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev)
1370{
1371	mdp_disk_t *desc;
1372	mdp_super_t *sb = page_address(rdev->sb_page);
1373	__u64 ev1 = md_event(sb);
1374
1375	rdev->raid_disk = -1;
1376	clear_bit(Faulty, &rdev->flags);
1377	clear_bit(In_sync, &rdev->flags);
1378	clear_bit(Bitmap_sync, &rdev->flags);
1379	clear_bit(WriteMostly, &rdev->flags);
1380
1381	if (mddev->raid_disks == 0) {
1382		mddev->major_version = 0;
1383		mddev->minor_version = sb->minor_version;
1384		mddev->patch_version = sb->patch_version;
1385		mddev->external = 0;
1386		mddev->chunk_sectors = sb->chunk_size >> 9;
1387		mddev->ctime = sb->ctime;
1388		mddev->utime = sb->utime;
1389		mddev->level = sb->level;
1390		mddev->clevel[0] = 0;
1391		mddev->layout = sb->layout;
1392		mddev->raid_disks = sb->raid_disks;
1393		mddev->dev_sectors = ((sector_t)sb->size) * 2;
1394		mddev->events = ev1;
1395		mddev->bitmap_info.offset = 0;
1396		mddev->bitmap_info.space = 0;
1397		/* bitmap can use 60K after the 4K superblocks */
1398		mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
1399		mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
1400		mddev->reshape_backwards = 0;
1401
1402		if (mddev->minor_version >= 91) {
1403			mddev->reshape_position = sb->reshape_position;
1404			mddev->delta_disks = sb->delta_disks;
1405			mddev->new_level = sb->new_level;
1406			mddev->new_layout = sb->new_layout;
1407			mddev->new_chunk_sectors = sb->new_chunk >> 9;
1408			if (mddev->delta_disks < 0)
1409				mddev->reshape_backwards = 1;
1410		} else {
1411			mddev->reshape_position = MaxSector;
1412			mddev->delta_disks = 0;
1413			mddev->new_level = mddev->level;
1414			mddev->new_layout = mddev->layout;
1415			mddev->new_chunk_sectors = mddev->chunk_sectors;
1416		}
1417		if (mddev->level == 0)
1418			mddev->layout = -1;
1419
1420		if (sb->state & (1<<MD_SB_CLEAN))
1421			mddev->recovery_cp = MaxSector;
1422		else {
1423			if (sb->events_hi == sb->cp_events_hi &&
1424				sb->events_lo == sb->cp_events_lo) {
1425				mddev->recovery_cp = sb->recovery_cp;
1426			} else
1427				mddev->recovery_cp = 0;
1428		}
1429
1430		memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
1431		memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
1432		memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
1433		memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
1434
1435		mddev->max_disks = MD_SB_DISKS;
1436
1437		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
1438		    mddev->bitmap_info.file == NULL) {
1439			mddev->bitmap_info.offset =
1440				mddev->bitmap_info.default_offset;
1441			mddev->bitmap_info.space =
1442				mddev->bitmap_info.default_space;
1443		}
1444
1445	} else if (mddev->pers == NULL) {
1446		/* Insist on good event counter while assembling, except
1447		 * for spares (which don't need an event count) */
1448		++ev1;
1449		if (sb->disks[rdev->desc_nr].state & (
1450			    (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
1451			if (ev1 < mddev->events)
1452				return -EINVAL;
1453	} else if (mddev->bitmap) {
1454		/* if adding to array with a bitmap, then we can accept an
1455		 * older device ... but not too old.
1456		 */
1457		if (ev1 < mddev->bitmap->events_cleared)
1458			return 0;
1459		if (ev1 < mddev->events)
1460			set_bit(Bitmap_sync, &rdev->flags);
1461	} else {
1462		if (ev1 < mddev->events)
1463			/* just a hot-add of a new device, leave raid_disk at -1 */
1464			return 0;
1465	}
1466
1467	desc = sb->disks + rdev->desc_nr;
1468
1469	if (desc->state & (1<<MD_DISK_FAULTY))
1470		set_bit(Faulty, &rdev->flags);
1471	else if (desc->state & (1<<MD_DISK_SYNC)) {
1472		set_bit(In_sync, &rdev->flags);
1473		rdev->raid_disk = desc->raid_disk;
1474		rdev->saved_raid_disk = desc->raid_disk;
1475	} else if (desc->state & (1<<MD_DISK_ACTIVE)) {
1476		/* active but not in sync implies recovery up to
1477		 * reshape position.  We don't know exactly where
1478		 * that is, so set to zero for now
1479		 */
1480		if (mddev->minor_version >= 91) {
1481			rdev->recovery_offset = 0;
1482			rdev->raid_disk = desc->raid_disk;
1483		}
1484	}
1485	if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
1486		set_bit(WriteMostly, &rdev->flags);
1487	if (desc->state & (1<<MD_DISK_FAILFAST))
1488		set_bit(FailFast, &rdev->flags);
1489	return 0;
1490}
1491
1492/*
1493 * sync_super for 0.90.0
1494 */
1495static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
1496{
1497	mdp_super_t *sb;
1498	struct md_rdev *rdev2;
1499	int next_spare = mddev->raid_disks;
1500
1501	/* make rdev->sb match mddev data..
1502	 *
1503	 * 1/ zero out disks
1504	 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
1505	 * 3/ any empty disks < next_spare become removed
1506	 *
1507	 * disks[0] gets initialised to REMOVED because
1508	 * we cannot be sure from other fields if it has
1509	 * been initialised or not.
1510	 */
1511	int i;
1512	int active = 0, working = 0, failed = 0, spare = 0, nr_disks = 0;
1513
1514	rdev->sb_size = MD_SB_BYTES;
1515
1516	sb = page_address(rdev->sb_page);
1517
1518	memset(sb, 0, sizeof(*sb));
1519
1520	sb->md_magic = MD_SB_MAGIC;
1521	sb->major_version = mddev->major_version;
1522	sb->patch_version = mddev->patch_version;
1523	sb->gvalid_words  = 0; /* ignored */
1524	memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
1525	memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
1526	memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
1527	memcpy(&sb->set_uuid3, mddev->uuid+12,4);
1528
1529	sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
1530	sb->level = mddev->level;
1531	sb->size = mddev->dev_sectors / 2;
1532	sb->raid_disks = mddev->raid_disks;
1533	sb->md_minor = mddev->md_minor;
1534	sb->not_persistent = 0;
1535	sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
1536	sb->state = 0;
1537	sb->events_hi = (mddev->events>>32);
1538	sb->events_lo = (u32)mddev->events;
1539
1540	if (mddev->reshape_position == MaxSector)
1541		sb->minor_version = 90;
1542	else {
1543		sb->minor_version = 91;
1544		sb->reshape_position = mddev->reshape_position;
1545		sb->new_level = mddev->new_level;
1546		sb->delta_disks = mddev->delta_disks;
1547		sb->new_layout = mddev->new_layout;
1548		sb->new_chunk = mddev->new_chunk_sectors << 9;
1549	}
1550	mddev->minor_version = sb->minor_version;
1551	if (mddev->in_sync)
1552	{
1553		sb->recovery_cp = mddev->recovery_cp;
1554		sb->cp_events_hi = (mddev->events>>32);
1555		sb->cp_events_lo = (u32)mddev->events;
1556		if (mddev->recovery_cp == MaxSector)
1557			sb->state = (1<< MD_SB_CLEAN);
1558	} else
1559		sb->recovery_cp = 0;
1560
1561	sb->layout = mddev->layout;
1562	sb->chunk_size = mddev->chunk_sectors << 9;
1563
1564	if (mddev->bitmap && mddev->bitmap_info.file == NULL)
1565		sb->state |= (1<<MD_SB_BITMAP_PRESENT);
1566
1567	sb->disks[0].state = (1<<MD_DISK_REMOVED);
1568	rdev_for_each(rdev2, mddev) {
1569		mdp_disk_t *d;
1570		int desc_nr;
1571		int is_active = test_bit(In_sync, &rdev2->flags);
1572
1573		if (rdev2->raid_disk >= 0 &&
1574		    sb->minor_version >= 91)
1575			/* we have nowhere to store the recovery_offset,
1576			 * but if it is not below the reshape_position,
1577			 * we can piggy-back on that.
1578			 */
1579			is_active = 1;
1580		if (rdev2->raid_disk < 0 ||
1581		    test_bit(Faulty, &rdev2->flags))
1582			is_active = 0;
1583		if (is_active)
1584			desc_nr = rdev2->raid_disk;
1585		else
1586			desc_nr = next_spare++;
1587		rdev2->desc_nr = desc_nr;
1588		d = &sb->disks[rdev2->desc_nr];
1589		nr_disks++;
1590		d->number = rdev2->desc_nr;
1591		d->major = MAJOR(rdev2->bdev->bd_dev);
1592		d->minor = MINOR(rdev2->bdev->bd_dev);
1593		if (is_active)
1594			d->raid_disk = rdev2->raid_disk;
1595		else
1596			d->raid_disk = rdev2->desc_nr; /* compatibility */
1597		if (test_bit(Faulty, &rdev2->flags))
1598			d->state = (1<<MD_DISK_FAULTY);
1599		else if (is_active) {
1600			d->state = (1<<MD_DISK_ACTIVE);
1601			if (test_bit(In_sync, &rdev2->flags))
1602				d->state |= (1<<MD_DISK_SYNC);
1603			active++;
1604			working++;
1605		} else {
1606			d->state = 0;
1607			spare++;
1608			working++;
1609		}
1610		if (test_bit(WriteMostly, &rdev2->flags))
1611			d->state |= (1<<MD_DISK_WRITEMOSTLY);
1612		if (test_bit(FailFast, &rdev2->flags))
1613			d->state |= (1<<MD_DISK_FAILFAST);
1614	}
1615	/* now set the "removed" and "faulty" bits on any missing devices */
1616	for (i=0 ; i < mddev->raid_disks ; i++) {
1617		mdp_disk_t *d = &sb->disks[i];
1618		if (d->state == 0 && d->number == 0) {
1619			d->number = i;
1620			d->raid_disk = i;
1621			d->state = (1<<MD_DISK_REMOVED);
1622			d->state |= (1<<MD_DISK_FAULTY);
1623			failed++;
1624		}
1625	}
1626	sb->nr_disks = nr_disks;
1627	sb->active_disks = active;
1628	sb->working_disks = working;
1629	sb->failed_disks = failed;
1630	sb->spare_disks = spare;
1631
1632	sb->this_disk = sb->disks[rdev->desc_nr];
1633	sb->sb_csum = calc_sb_csum(sb);
1634}
1635
1636/*
1637 * rdev_size_change for 0.90.0
1638 */
1639static unsigned long long
1640super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1641{
1642	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1643		return 0; /* component must fit device */
1644	if (rdev->mddev->bitmap_info.offset)
1645		return 0; /* can't move bitmap */
1646	rdev->sb_start = calc_dev_sboffset(rdev);
1647	if (!num_sectors || num_sectors > rdev->sb_start)
1648		num_sectors = rdev->sb_start;
1649	/* Limit to 4TB as metadata cannot record more than that.
1650	 * 4TB == 2^32 KB, or 2*2^32 sectors.
1651	 */
1652	if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
1653		num_sectors = (sector_t)(2ULL << 32) - 2;
1654	do {
1655		md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1656		       rdev->sb_page);
1657	} while (md_super_wait(rdev->mddev) < 0);
1658	return num_sectors;
1659}
1660
1661static int
1662super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
1663{
1664	/* non-zero offset changes not possible with v0.90 */
1665	return new_offset == 0;
1666}
1667
1668/*
1669 * version 1 superblock
1670 */
1671
1672static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb)
1673{
1674	__le32 disk_csum;
1675	u32 csum;
1676	unsigned long long newcsum;
1677	int size = 256 + le32_to_cpu(sb->max_dev)*2;
1678	__le32 *isuper = (__le32*)sb;
1679
1680	disk_csum = sb->sb_csum;
1681	sb->sb_csum = 0;
1682	newcsum = 0;
1683	for (; size >= 4; size -= 4)
1684		newcsum += le32_to_cpu(*isuper++);
1685
1686	if (size == 2)
1687		newcsum += le16_to_cpu(*(__le16*) isuper);
1688
1689	csum = (newcsum & 0xffffffff) + (newcsum >> 32);
1690	sb->sb_csum = disk_csum;
1691	return cpu_to_le32(csum);
1692}
1693
1694static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
1695{
1696	struct mdp_superblock_1 *sb;
1697	int ret;
1698	sector_t sb_start;
1699	sector_t sectors;
1700	int bmask;
1701	bool spare_disk = true;
1702
1703	/*
1704	 * Calculate the position of the superblock in 512-byte sectors.
1705	 * It is always aligned to a 4K boundary and,
1706	 * depending on minor_version, it can be:
1707	 * 0: At least 8K, but less than 12K, from end of device
1708	 * 1: At start of device
1709	 * 2: 4K from start of device.
1710	 */
1711	switch(minor_version) {
1712	case 0:
1713		sb_start = bdev_nr_sectors(rdev->bdev) - 8 * 2;
1714		sb_start &= ~(sector_t)(4*2-1);
1715		break;
1716	case 1:
1717		sb_start = 0;
1718		break;
1719	case 2:
1720		sb_start = 8;
1721		break;
1722	default:
1723		return -EINVAL;
1724	}
1725	rdev->sb_start = sb_start;
1726
1727	/* The superblock is rarely larger than 1K, but it can be larger,
1728	 * and it is safe to read 4K, so we do that.
1729	 */
1730	ret = read_disk_sb(rdev, 4096);
1731	if (ret) return ret;
1732
1733	sb = page_address(rdev->sb_page);
1734
1735	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1736	    sb->major_version != cpu_to_le32(1) ||
1737	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1738	    le64_to_cpu(sb->super_offset) != rdev->sb_start ||
1739	    (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1740		return -EINVAL;
1741
1742	if (calc_sb_1_csum(sb) != sb->sb_csum) {
1743		pr_warn("md: invalid superblock checksum on %pg\n",
1744			rdev->bdev);
1745		return -EINVAL;
1746	}
1747	if (le64_to_cpu(sb->data_size) < 10) {
1748		pr_warn("md: data_size too small on %pg\n",
1749			rdev->bdev);
1750		return -EINVAL;
1751	}
1752	if (sb->pad0 ||
1753	    sb->pad3[0] ||
1754	    memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
1755		/* Some padding is non-zero, might be a new feature */
1756		return -EINVAL;
1757
1758	rdev->preferred_minor = 0xffff;
1759	rdev->data_offset = le64_to_cpu(sb->data_offset);
1760	rdev->new_data_offset = rdev->data_offset;
1761	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
1762	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
1763		rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
1764	atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1765
1766	rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1767	bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1768	if (rdev->sb_size & bmask)
1769		rdev->sb_size = (rdev->sb_size | bmask) + 1;
1770
1771	if (minor_version
1772	    && rdev->data_offset < sb_start + (rdev->sb_size/512))
1773		return -EINVAL;
1774	if (minor_version
1775	    && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
1776		return -EINVAL;
1777
1778	rdev->desc_nr = le32_to_cpu(sb->dev_number);
1779
1780	if (!rdev->bb_page) {
1781		rdev->bb_page = alloc_page(GFP_KERNEL);
1782		if (!rdev->bb_page)
1783			return -ENOMEM;
1784	}
1785	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
1786	    rdev->badblocks.count == 0) {
1787		/* need to load the bad block list.
1788		 * Currently we limit it to one page.
1789		 */
1790		s32 offset;
1791		sector_t bb_sector;
1792		__le64 *bbp;
1793		int i;
1794		int sectors = le16_to_cpu(sb->bblog_size);
1795		if (sectors > (PAGE_SIZE / 512))
1796			return -EINVAL;
1797		offset = le32_to_cpu(sb->bblog_offset);
1798		if (offset == 0)
1799			return -EINVAL;
1800		bb_sector = (long long)offset;
1801		if (!sync_page_io(rdev, bb_sector, sectors << 9,
1802				  rdev->bb_page, REQ_OP_READ, true))
1803			return -EIO;
1804		bbp = (__le64 *)page_address(rdev->bb_page);
1805		rdev->badblocks.shift = sb->bblog_shift;
1806		for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
1807			u64 bb = le64_to_cpu(*bbp);
1808			int count = bb & (0x3ff);
1809			u64 sector = bb >> 10;
1810			sector <<= sb->bblog_shift;
1811			count <<= sb->bblog_shift;
1812			if (bb + 1 == 0)
1813				break;
1814			if (badblocks_set(&rdev->badblocks, sector, count, 1))
1815				return -EINVAL;
1816		}
1817	} else if (sb->bblog_offset != 0)
1818		rdev->badblocks.shift = 0;
1819
1820	if ((le32_to_cpu(sb->feature_map) &
1821	    (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) {
1822		rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset);
1823		rdev->ppl.size = le16_to_cpu(sb->ppl.size);
1824		rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset;
1825	}
1826
1827	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) &&
1828	    sb->level != 0)
1829		return -EINVAL;
1830
1831	/* not spare disk */
1832	if (rdev->desc_nr >= 0 && rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
1833	    (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
1834	     le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
1835		spare_disk = false;
1836
1837	if (!refdev) {
1838		if (!spare_disk)
1839			ret = 1;
1840		else
1841			ret = 0;
1842	} else {
1843		__u64 ev1, ev2;
1844		struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);
1845
1846		if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1847		    sb->level != refsb->level ||
1848		    sb->layout != refsb->layout ||
1849		    sb->chunksize != refsb->chunksize) {
1850			pr_warn("md: %pg has strangely different superblock to %pg\n",
1851				rdev->bdev,
1852				refdev->bdev);
1853			return -EINVAL;
1854		}
1855		ev1 = le64_to_cpu(sb->events);
1856		ev2 = le64_to_cpu(refsb->events);
1857
1858		if (!spare_disk && ev1 > ev2)
1859			ret = 1;
1860		else
1861			ret = 0;
1862	}
1863	if (minor_version)
1864		sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset;
1865	else
1866		sectors = rdev->sb_start;
1867	if (sectors < le64_to_cpu(sb->data_size))
1868		return -EINVAL;
1869	rdev->sectors = le64_to_cpu(sb->data_size);
1870	return ret;
1871}
1872
1873static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev)
1874{
1875	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
1876	__u64 ev1 = le64_to_cpu(sb->events);
1877	int role;
1878
1879	rdev->raid_disk = -1;
1880	clear_bit(Faulty, &rdev->flags);
1881	clear_bit(In_sync, &rdev->flags);
1882	clear_bit(Bitmap_sync, &rdev->flags);
1883	clear_bit(WriteMostly, &rdev->flags);
1884
1885	if (mddev->raid_disks == 0) {
1886		mddev->major_version = 1;
1887		mddev->patch_version = 0;
1888		mddev->external = 0;
1889		mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
1890		mddev->ctime = le64_to_cpu(sb->ctime);
1891		mddev->utime = le64_to_cpu(sb->utime);
1892		mddev->level = le32_to_cpu(sb->level);
1893		mddev->clevel[0] = 0;
1894		mddev->layout = le32_to_cpu(sb->layout);
1895		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1896		mddev->dev_sectors = le64_to_cpu(sb->size);
1897		mddev->events = ev1;
1898		mddev->bitmap_info.offset = 0;
1899		mddev->bitmap_info.space = 0;
		/* The default location for the bitmap is 1K after the
		 * superblock, using up to 3K - 4K in total.
		 */
1903		mddev->bitmap_info.default_offset = 1024 >> 9;
1904		mddev->bitmap_info.default_space = (4096-1024) >> 9;
1905		mddev->reshape_backwards = 0;
1906
1907		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1908		memcpy(mddev->uuid, sb->set_uuid, 16);
1909
1910		mddev->max_disks =  (4096-256)/2;
1911
1912		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1913		    mddev->bitmap_info.file == NULL) {
1914			mddev->bitmap_info.offset =
1915				(__s32)le32_to_cpu(sb->bitmap_offset);
1916			/* Metadata doesn't record how much space is available.
			 * For 1.0, we assume we can use up to the superblock
			 * if the bitmap comes before it, else up to 4K beyond
			 * the superblock.
			 * For others, assume no change is possible.
1920			 */
1921			if (mddev->minor_version > 0)
1922				mddev->bitmap_info.space = 0;
1923			else if (mddev->bitmap_info.offset > 0)
1924				mddev->bitmap_info.space =
1925					8 - mddev->bitmap_info.offset;
1926			else
1927				mddev->bitmap_info.space =
1928					-mddev->bitmap_info.offset;
1929		}
1930
1931		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1932			mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1933			mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1934			mddev->new_level = le32_to_cpu(sb->new_level);
1935			mddev->new_layout = le32_to_cpu(sb->new_layout);
1936			mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
1937			if (mddev->delta_disks < 0 ||
1938			    (mddev->delta_disks == 0 &&
1939			     (le32_to_cpu(sb->feature_map)
1940			      & MD_FEATURE_RESHAPE_BACKWARDS)))
1941				mddev->reshape_backwards = 1;
1942		} else {
1943			mddev->reshape_position = MaxSector;
1944			mddev->delta_disks = 0;
1945			mddev->new_level = mddev->level;
1946			mddev->new_layout = mddev->layout;
1947			mddev->new_chunk_sectors = mddev->chunk_sectors;
1948		}
1949
1950		if (mddev->level == 0 &&
1951		    !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT))
1952			mddev->layout = -1;
1953
1954		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
1955			set_bit(MD_HAS_JOURNAL, &mddev->flags);
1956
1957		if (le32_to_cpu(sb->feature_map) &
1958		    (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) {
1959			if (le32_to_cpu(sb->feature_map) &
1960			    (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL))
1961				return -EINVAL;
1962			if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) &&
1963			    (le32_to_cpu(sb->feature_map) &
1964					    MD_FEATURE_MULTIPLE_PPLS))
1965				return -EINVAL;
1966			set_bit(MD_HAS_PPL, &mddev->flags);
1967		}
1968	} else if (mddev->pers == NULL) {
		/* Insist on a good event counter while assembling, except for
		 * spares (which don't need an event count).
		 * Like mdadm, we allow an event counter difference of 1
		 * from the freshest device.
1973		 */
1974		if (rdev->desc_nr >= 0 &&
1975		    rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
1976		    (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
1977		     le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
1978			if (ev1 + 1 < mddev->events)
1979				return -EINVAL;
1980	} else if (mddev->bitmap) {
1981		/* If adding to array with a bitmap, then we can accept an
1982		 * older device, but not too old.
1983		 */
1984		if (ev1 < mddev->bitmap->events_cleared)
1985			return 0;
1986		if (ev1 < mddev->events)
1987			set_bit(Bitmap_sync, &rdev->flags);
1988	} else {
1989		if (ev1 < mddev->events)
1990			/* just a hot-add of a new device, leave raid_disk at -1 */
1991			return 0;
1992	}
1993
1994	if (rdev->desc_nr < 0 ||
1995	    rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
1996		role = MD_DISK_ROLE_SPARE;
1997		rdev->desc_nr = -1;
1998	} else if (mddev->pers == NULL && freshest && ev1 < mddev->events) {
1999		/*
2000		 * If we are assembling, and our event counter is smaller than the
2001		 * highest event counter, we cannot trust our superblock about the role.
2002		 * It could happen that our rdev was marked as Faulty, and all other
2003		 * superblocks were updated with +1 event counter.
2004		 * Then, before the next superblock update, which typically happens when
2005		 * remove_and_add_spares() removes the device from the array, there was
2006		 * a crash or reboot.
2007		 * If we allow current rdev without consulting the freshest superblock,
2008		 * we could cause data corruption.
2009		 * Note that in this case our event counter is smaller by 1 than the
2010		 * highest, otherwise, this rdev would not be allowed into array;
2011		 * both kernel and mdadm allow event counter difference of 1.
2012		 */
2013		struct mdp_superblock_1 *freshest_sb = page_address(freshest->sb_page);
2014		u32 freshest_max_dev = le32_to_cpu(freshest_sb->max_dev);
2015
2016		if (rdev->desc_nr >= freshest_max_dev) {
2017			/* this is unexpected, better not proceed */
2018			pr_warn("md: %s: rdev[%pg]: desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n",
2019				mdname(mddev), rdev->bdev, rdev->desc_nr,
2020				freshest->bdev, freshest_max_dev);
2021			return -EUCLEAN;
2022		}
2023
2024		role = le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]);
2025		pr_debug("md: %s: rdev[%pg]: role=%d(0x%x) according to freshest %pg\n",
2026			 mdname(mddev), rdev->bdev, role, role, freshest->bdev);
2027	} else {
2028		role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
2029	}
2030	switch (role) {
2031	case MD_DISK_ROLE_SPARE: /* spare */
2032		break;
2033	case MD_DISK_ROLE_FAULTY: /* faulty */
2034		set_bit(Faulty, &rdev->flags);
2035		break;
2036	case MD_DISK_ROLE_JOURNAL: /* journal device */
2037		if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
2038			/* journal device without journal feature */
2039			pr_warn("md: journal device provided without journal feature, ignoring the device\n");
2040			return -EINVAL;
2041		}
2042		set_bit(Journal, &rdev->flags);
2043		rdev->journal_tail = le64_to_cpu(sb->journal_tail);
2044		rdev->raid_disk = 0;
2045		break;
2046	default:
2047		rdev->saved_raid_disk = role;
2048		if ((le32_to_cpu(sb->feature_map) &
2049		     MD_FEATURE_RECOVERY_OFFSET)) {
2050			rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
2051			if (!(le32_to_cpu(sb->feature_map) &
2052			      MD_FEATURE_RECOVERY_BITMAP))
2053				rdev->saved_raid_disk = -1;
2054		} else {
2055			/*
2056			 * If the array is FROZEN, then the device can't
2057			 * be in_sync with rest of array.
2058			 */
2059			if (!test_bit(MD_RECOVERY_FROZEN,
2060				      &mddev->recovery))
2061				set_bit(In_sync, &rdev->flags);
2062		}
2063		rdev->raid_disk = role;
2064		break;
2065	}
2066	if (sb->devflags & WriteMostly1)
2067		set_bit(WriteMostly, &rdev->flags);
2068	if (sb->devflags & FailFast1)
2069		set_bit(FailFast, &rdev->flags);
2070	if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
2071		set_bit(Replacement, &rdev->flags);
2072
2073	return 0;
2074}
2075
2076static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
2077{
2078	struct mdp_superblock_1 *sb;
2079	struct md_rdev *rdev2;
2080	int max_dev, i;
2081	/* make rdev->sb match mddev and rdev data. */
2082
2083	sb = page_address(rdev->sb_page);
2084
2085	sb->feature_map = 0;
2086	sb->pad0 = 0;
2087	sb->recovery_offset = cpu_to_le64(0);
2088	memset(sb->pad3, 0, sizeof(sb->pad3));
2089
2090	sb->utime = cpu_to_le64((__u64)mddev->utime);
2091	sb->events = cpu_to_le64(mddev->events);
2092	if (mddev->in_sync)
2093		sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
2094	else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags))
2095		sb->resync_offset = cpu_to_le64(MaxSector);
2096	else
2097		sb->resync_offset = cpu_to_le64(0);
2098
2099	sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
2100
2101	sb->raid_disks = cpu_to_le32(mddev->raid_disks);
2102	sb->size = cpu_to_le64(mddev->dev_sectors);
2103	sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
2104	sb->level = cpu_to_le32(mddev->level);
2105	sb->layout = cpu_to_le32(mddev->layout);
2106	if (test_bit(FailFast, &rdev->flags))
2107		sb->devflags |= FailFast1;
2108	else
2109		sb->devflags &= ~FailFast1;
2110
2111	if (test_bit(WriteMostly, &rdev->flags))
2112		sb->devflags |= WriteMostly1;
2113	else
2114		sb->devflags &= ~WriteMostly1;
2115	sb->data_offset = cpu_to_le64(rdev->data_offset);
2116	sb->data_size = cpu_to_le64(rdev->sectors);
2117
2118	if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
2119		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
2120		sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
2121	}
2122
2123	if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) &&
2124	    !test_bit(In_sync, &rdev->flags)) {
2125		sb->feature_map |=
2126			cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
2127		sb->recovery_offset =
2128			cpu_to_le64(rdev->recovery_offset);
2129		if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
2130			sb->feature_map |=
2131				cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
2132	}
2133	/* Note: recovery_offset and journal_tail share space  */
2134	if (test_bit(Journal, &rdev->flags))
2135		sb->journal_tail = cpu_to_le64(rdev->journal_tail);
2136	if (test_bit(Replacement, &rdev->flags))
2137		sb->feature_map |=
2138			cpu_to_le32(MD_FEATURE_REPLACEMENT);
2139
2140	if (mddev->reshape_position != MaxSector) {
2141		sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
2142		sb->reshape_position = cpu_to_le64(mddev->reshape_position);
2143		sb->new_layout = cpu_to_le32(mddev->new_layout);
2144		sb->delta_disks = cpu_to_le32(mddev->delta_disks);
2145		sb->new_level = cpu_to_le32(mddev->new_level);
2146		sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
2147		if (mddev->delta_disks == 0 &&
2148		    mddev->reshape_backwards)
2149			sb->feature_map
2150				|= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
2151		if (rdev->new_data_offset != rdev->data_offset) {
2152			sb->feature_map
2153				|= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
2154			sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
2155							     - rdev->data_offset));
2156		}
2157	}
2158
2159	if (mddev_is_clustered(mddev))
2160		sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED);
2161
2162	if (rdev->badblocks.count == 0)
2163		/* Nothing to do for bad blocks*/ ;
2164	else if (sb->bblog_offset == 0)
2165		/* Cannot record bad blocks on this device */
2166		md_error(mddev, rdev);
2167	else {
2168		struct badblocks *bb = &rdev->badblocks;
2169		__le64 *bbp = (__le64 *)page_address(rdev->bb_page);
2170		u64 *p = bb->page;
2171		sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
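		/*
		 * Copy the in-memory bad block list into the on-disk format
		 * under a seqlock read section; if a writer changed the list
		 * while we were copying, read_seqretry() triggers a retry.
		 */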
2172		if (bb->changed) {
2173			unsigned seq;
2174
2175retry:
2176			seq = read_seqbegin(&bb->lock);
2177
2178			memset(bbp, 0xff, PAGE_SIZE);
2179
2180			for (i = 0 ; i < bb->count ; i++) {
2181				u64 internal_bb = p[i];
2182				u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
2183						| BB_LEN(internal_bb));
2184				bbp[i] = cpu_to_le64(store_bb);
2185			}
2186			bb->changed = 0;
2187			if (read_seqretry(&bb->lock, seq))
2188				goto retry;
2189
2190			bb->sector = (rdev->sb_start +
2191				      (int)le32_to_cpu(sb->bblog_offset));
2192			bb->size = le16_to_cpu(sb->bblog_size);
2193		}
2194	}
2195
2196	max_dev = 0;
2197	rdev_for_each(rdev2, mddev)
2198		if (rdev2->desc_nr+1 > max_dev)
2199			max_dev = rdev2->desc_nr+1;
2200
2201	if (max_dev > le32_to_cpu(sb->max_dev)) {
2202		int bmask;
2203		sb->max_dev = cpu_to_le32(max_dev);
2204		rdev->sb_size = max_dev * 2 + 256;
2205		bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
2206		if (rdev->sb_size & bmask)
2207			rdev->sb_size = (rdev->sb_size | bmask) + 1;
2208	} else
2209		max_dev = le32_to_cpu(sb->max_dev);
2210
	for (i = 0; i < max_dev; i++)
2212		sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
2213
2214	if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
2215		sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);
2216
2217	if (test_bit(MD_HAS_PPL, &mddev->flags)) {
2218		if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags))
2219			sb->feature_map |=
2220			    cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS);
2221		else
2222			sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL);
2223		sb->ppl.offset = cpu_to_le16(rdev->ppl.offset);
2224		sb->ppl.size = cpu_to_le16(rdev->ppl.size);
2225	}
2226
2227	rdev_for_each(rdev2, mddev) {
2228		i = rdev2->desc_nr;
2229		if (test_bit(Faulty, &rdev2->flags))
2230			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
2231		else if (test_bit(In_sync, &rdev2->flags))
2232			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
2233		else if (test_bit(Journal, &rdev2->flags))
2234			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL);
2235		else if (rdev2->raid_disk >= 0)
2236			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
2237		else
2238			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
2239	}
2240
2241	sb->sb_csum = calc_sb_1_csum(sb);
2242}
2243
2244static sector_t super_1_choose_bm_space(sector_t dev_size)
2245{
2246	sector_t bm_space;
2247
	/* If the device is bigger than 8GiB, reserve 64KiB for bitmap
	 * usage; if bigger than 200GiB, reserve 128KiB.
	 */
2251	if (dev_size < 64*2)
2252		bm_space = 0;
2253	else if (dev_size - 64*2 >= 200*1024*1024*2)
2254		bm_space = 128*2;
2255	else if (dev_size - 4*2 > 8*1024*1024*2)
2256		bm_space = 64*2;
2257	else
2258		bm_space = 4*2;
2259	return bm_space;
2260}
2261
2262static unsigned long long
2263super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
2264{
2265	struct mdp_superblock_1 *sb;
2266	sector_t max_sectors;
2267	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
2268		return 0; /* component must fit device */
2269	if (rdev->data_offset != rdev->new_data_offset)
2270		return 0; /* too confusing */
2271	if (rdev->sb_start < rdev->data_offset) {
2272		/* minor versions 1 and 2; superblock before data */
2273		max_sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset;
2274		if (!num_sectors || num_sectors > max_sectors)
2275			num_sectors = max_sectors;
2276	} else if (rdev->mddev->bitmap_info.offset) {
2277		/* minor version 0 with bitmap we can't move */
2278		return 0;
2279	} else {
2280		/* minor version 0; superblock after data */
2281		sector_t sb_start, bm_space;
2282		sector_t dev_size = bdev_nr_sectors(rdev->bdev);
2283
2284		/* 8K is for superblock */
2285		sb_start = dev_size - 8*2;
2286		sb_start &= ~(sector_t)(4*2 - 1);
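		/* i.e. at least 8K, but less than 12K, from the end of the
		 * device, aligned down to a 4K boundary, which is the 1.0
		 * metadata location.
		 */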
2287
2288		bm_space = super_1_choose_bm_space(dev_size);
2289
		/* The space that can be used to store data must exclude the
		 * superblock, the bitmap space and the bad block space (4K).
		 */
2293		max_sectors = sb_start - bm_space - 4*2;
2294
2295		if (!num_sectors || num_sectors > max_sectors)
2296			num_sectors = max_sectors;
2297		rdev->sb_start = sb_start;
2298	}
2299	sb = page_address(rdev->sb_page);
2300	sb->data_size = cpu_to_le64(num_sectors);
2301	sb->super_offset = cpu_to_le64(rdev->sb_start);
2302	sb->sb_csum = calc_sb_1_csum(sb);
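	/*
	 * Keep re-issuing the superblock write until md_super_wait() reports
	 * that it completed without needing a rewrite.
	 */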
2303	do {
2304		md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
2305			       rdev->sb_page);
2306	} while (md_super_wait(rdev->mddev) < 0);
2307	return num_sectors;
2308
2309}
2310
2311static int
2312super_1_allow_new_offset(struct md_rdev *rdev,
2313			 unsigned long long new_offset)
2314{
2315	/* All necessary checks on new >= old have been done */
2316	struct bitmap *bitmap;
2317	if (new_offset >= rdev->data_offset)
2318		return 1;
2319
2320	/* with 1.0 metadata, there is no metadata to tread on
2321	 * so we can always move back */
2322	if (rdev->mddev->minor_version == 0)
2323		return 1;
2324
2325	/* otherwise we must be sure not to step on
2326	 * any metadata, so stay:
2327	 * 36K beyond start of superblock
2328	 * beyond end of badblocks
2329	 * beyond write-intent bitmap
2330	 */
2331	if (rdev->sb_start + (32+4)*2 > new_offset)
2332		return 0;
2333	bitmap = rdev->mddev->bitmap;
2334	if (bitmap && !rdev->mddev->bitmap_info.file &&
2335	    rdev->sb_start + rdev->mddev->bitmap_info.offset +
2336	    bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset)
2337		return 0;
2338	if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
2339		return 0;
2340
2341	return 1;
2342}
2343
2344static struct super_type super_types[] = {
2345	[0] = {
2346		.name	= "0.90.0",
2347		.owner	= THIS_MODULE,
2348		.load_super	    = super_90_load,
2349		.validate_super	    = super_90_validate,
2350		.sync_super	    = super_90_sync,
2351		.rdev_size_change   = super_90_rdev_size_change,
2352		.allow_new_offset   = super_90_allow_new_offset,
2353	},
2354	[1] = {
2355		.name	= "md-1",
2356		.owner	= THIS_MODULE,
2357		.load_super	    = super_1_load,
2358		.validate_super	    = super_1_validate,
2359		.sync_super	    = super_1_sync,
2360		.rdev_size_change   = super_1_rdev_size_change,
2361		.allow_new_offset   = super_1_allow_new_offset,
2362	},
2363};
2364
2365static void sync_super(struct mddev *mddev, struct md_rdev *rdev)
2366{
2367	if (mddev->sync_super) {
2368		mddev->sync_super(mddev, rdev);
2369		return;
2370	}
2371
2372	BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types));
2373
2374	super_types[mddev->major_version].sync_super(mddev, rdev);
2375}
2376
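/*
 * Return 1 if the two arrays share a physical disk, ignoring faulty,
 * journal and not-yet-assigned devices; 0 otherwise.
 */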
2377static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2)
2378{
2379	struct md_rdev *rdev, *rdev2;
2380
2381	rcu_read_lock();
2382	rdev_for_each_rcu(rdev, mddev1) {
2383		if (test_bit(Faulty, &rdev->flags) ||
2384		    test_bit(Journal, &rdev->flags) ||
2385		    rdev->raid_disk == -1)
2386			continue;
2387		rdev_for_each_rcu(rdev2, mddev2) {
2388			if (test_bit(Faulty, &rdev2->flags) ||
2389			    test_bit(Journal, &rdev2->flags) ||
2390			    rdev2->raid_disk == -1)
2391				continue;
2392			if (rdev->bdev->bd_disk == rdev2->bdev->bd_disk) {
2393				rcu_read_unlock();
2394				return 1;
2395			}
2396		}
2397	}
2398	rcu_read_unlock();
2399	return 0;
2400}
2401
2402static LIST_HEAD(pending_raid_disks);
2403
2404/*
2405 * Try to register data integrity profile for an mddev
2406 *
2407 * This is called when an array is started and after a disk has been kicked
2408 * from the array. It only succeeds if all working and active component devices
2409 * are integrity capable with matching profiles.
2410 */
2411int md_integrity_register(struct mddev *mddev)
2412{
2413	struct md_rdev *rdev, *reference = NULL;
2414
2415	if (list_empty(&mddev->disks))
2416		return 0; /* nothing to do */
2417	if (mddev_is_dm(mddev) || blk_get_integrity(mddev->gendisk))
2418		return 0; /* shouldn't register, or already is */
2419	rdev_for_each(rdev, mddev) {
2420		/* skip spares and non-functional disks */
2421		if (test_bit(Faulty, &rdev->flags))
2422			continue;
2423		if (rdev->raid_disk < 0)
2424			continue;
2425		if (!reference) {
2426			/* Use the first rdev as the reference */
2427			reference = rdev;
2428			continue;
2429		}
2430		/* does this rdev's profile match the reference profile? */
2431		if (blk_integrity_compare(reference->bdev->bd_disk,
2432				rdev->bdev->bd_disk) < 0)
2433			return -EINVAL;
2434	}
2435	if (!reference || !bdev_get_integrity(reference->bdev))
2436		return 0;
2437	/*
2438	 * All component devices are integrity capable and have matching
2439	 * profiles, register the common profile for the md device.
2440	 */
2441	blk_integrity_register(mddev->gendisk,
2442			       bdev_get_integrity(reference->bdev));
2443
2444	pr_debug("md: data integrity enabled on %s\n", mdname(mddev));
2445	if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) ||
2446	    (mddev->level != 1 && mddev->level != 10 &&
2447	     bioset_integrity_create(&mddev->io_clone_set, BIO_POOL_SIZE))) {
2448		/*
		 * No need to handle a failure of bioset_integrity_create()
		 * here: this function is called from md_run() -> pers->run(),
		 * and in the failure case md_run() calls bioset_exit(), which
		 * in turn calls bioset_integrity_free().
2453		 */
2454		pr_err("md: failed to create integrity pool for %s\n",
2455		       mdname(mddev));
2456		return -EINVAL;
2457	}
2458	return 0;
2459}
2460EXPORT_SYMBOL(md_integrity_register);
2461
2462/*
2463 * Attempt to add an rdev, but only if it is consistent with the current
2464 * integrity profile
2465 */
2466int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
2467{
2468	struct blk_integrity *bi_mddev;
2469
2470	if (mddev_is_dm(mddev))
2471		return 0;
2472
2473	bi_mddev = blk_get_integrity(mddev->gendisk);
2474
2475	if (!bi_mddev) /* nothing to do */
2476		return 0;
2477
2478	if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) {
2479		pr_err("%s: incompatible integrity profile for %pg\n",
2480		       mdname(mddev), rdev->bdev);
2481		return -ENXIO;
2482	}
2483
2484	return 0;
2485}
2486EXPORT_SYMBOL(md_integrity_add_rdev);
2487
2488static bool rdev_read_only(struct md_rdev *rdev)
2489{
2490	return bdev_read_only(rdev->bdev) ||
2491		(rdev->meta_bdev && bdev_read_only(rdev->meta_bdev));
2492}
2493
2494static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
2495{
2496	char b[BDEVNAME_SIZE];
2497	int err;
2498
2499	/* prevent duplicates */
2500	if (find_rdev(mddev, rdev->bdev->bd_dev))
2501		return -EEXIST;
2502
2503	if (rdev_read_only(rdev) && mddev->pers)
2504		return -EROFS;
2505
2506	/* make sure rdev->sectors exceeds mddev->dev_sectors */
2507	if (!test_bit(Journal, &rdev->flags) &&
2508	    rdev->sectors &&
2509	    (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) {
2510		if (mddev->pers) {
2511			/* Cannot change size, so fail
2512			 * If mddev->level <= 0, then we don't care
2513			 * about aligning sizes (e.g. linear)
2514			 */
2515			if (mddev->level > 0)
2516				return -ENOSPC;
2517		} else
2518			mddev->dev_sectors = rdev->sectors;
2519	}
2520
2521	/* Verify rdev->desc_nr is unique.
2522	 * If it is -1, assign a free number, else
2523	 * check number is not in use
2524	 */
2525	rcu_read_lock();
2526	if (rdev->desc_nr < 0) {
2527		int choice = 0;
2528		if (mddev->pers)
2529			choice = mddev->raid_disks;
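		/* on an active array, start searching above the configured slots */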
2530		while (md_find_rdev_nr_rcu(mddev, choice))
2531			choice++;
2532		rdev->desc_nr = choice;
2533	} else {
2534		if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) {
2535			rcu_read_unlock();
2536			return -EBUSY;
2537		}
2538	}
2539	rcu_read_unlock();
2540	if (!test_bit(Journal, &rdev->flags) &&
2541	    mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
2542		pr_warn("md: %s: array is limited to %d devices\n",
2543			mdname(mddev), mddev->max_disks);
2544		return -EBUSY;
2545	}
2546	snprintf(b, sizeof(b), "%pg", rdev->bdev);
2547	strreplace(b, '/', '!');
2548
2549	rdev->mddev = mddev;
2550	pr_debug("md: bind<%s>\n", b);
2551
2552	if (mddev->raid_disks)
2553		mddev_create_serial_pool(mddev, rdev);
2554
2555	if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
2556		goto fail;
2557
2558	/* failure here is OK */
2559	err = sysfs_create_link(&rdev->kobj, bdev_kobj(rdev->bdev), "block");
2560	rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
2561	rdev->sysfs_unack_badblocks =
2562		sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks");
2563	rdev->sysfs_badblocks =
2564		sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks");
2565
2566	list_add_rcu(&rdev->same_set, &mddev->disks);
2567	bd_link_disk_holder(rdev->bdev, mddev->gendisk);
2568
2569	/* May as well allow recovery to be retried once */
2570	mddev->recovery_disabled++;
2571
2572	return 0;
2573
2574 fail:
2575	pr_warn("md: failed to register dev-%s for %s\n",
2576		b, mdname(mddev));
2577	mddev_destroy_serial_pool(mddev, rdev);
2578	return err;
2579}
2580
2581void md_autodetect_dev(dev_t dev);
2582
2583/* just for claiming the bdev */
2584static struct md_rdev claim_rdev;
2585
2586static void export_rdev(struct md_rdev *rdev, struct mddev *mddev)
2587{
2588	pr_debug("md: export_rdev(%pg)\n", rdev->bdev);
2589	md_rdev_clear(rdev);
2590#ifndef MODULE
2591	if (test_bit(AutoDetected, &rdev->flags))
2592		md_autodetect_dev(rdev->bdev->bd_dev);
2593#endif
2594	fput(rdev->bdev_file);
2595	rdev->bdev = NULL;
2596	kobject_put(&rdev->kobj);
2597}
2598
2599static void md_kick_rdev_from_array(struct md_rdev *rdev)
2600{
2601	struct mddev *mddev = rdev->mddev;
2602
2603	bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
2604	list_del_rcu(&rdev->same_set);
2605	pr_debug("md: unbind<%pg>\n", rdev->bdev);
2606	mddev_destroy_serial_pool(rdev->mddev, rdev);
2607	WRITE_ONCE(rdev->mddev, NULL);
2608	sysfs_remove_link(&rdev->kobj, "block");
2609	sysfs_put(rdev->sysfs_state);
2610	sysfs_put(rdev->sysfs_unack_badblocks);
2611	sysfs_put(rdev->sysfs_badblocks);
2612	rdev->sysfs_state = NULL;
2613	rdev->sysfs_unack_badblocks = NULL;
2614	rdev->sysfs_badblocks = NULL;
2615	rdev->badblocks.count = 0;
2616
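	/*
	 * Wait for any RCU readers still walking ->same_set or dereferencing
	 * ->mddev before the rdev is queued for deletion.
	 */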
2617	synchronize_rcu();
2618
2619	/*
	 * kobject_del() will wait for all in-progress sysfs writers to finish,
	 * and those writers may take reconfig_mutex; hence kobject_del() can't
	 * be called under reconfig_mutex and is delayed until mddev_unlock().
2623	 */
2624	list_add(&rdev->same_set, &mddev->deleting);
2625}
2626
2627static void export_array(struct mddev *mddev)
2628{
2629	struct md_rdev *rdev;
2630
2631	while (!list_empty(&mddev->disks)) {
2632		rdev = list_first_entry(&mddev->disks, struct md_rdev,
2633					same_set);
2634		md_kick_rdev_from_array(rdev);
2635	}
2636	mddev->raid_disks = 0;
2637	mddev->major_version = 0;
2638}
2639
2640static bool set_in_sync(struct mddev *mddev)
2641{
2642	lockdep_assert_held(&mddev->lock);
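	/*
	 * Switch writes_pending to atomic mode so that percpu_ref_is_zero()
	 * gives an exact answer; sync_checkers counts concurrent callers so
	 * that only the last one switches the ref back to percpu mode.
	 */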
2643	if (!mddev->in_sync) {
2644		mddev->sync_checkers++;
2645		spin_unlock(&mddev->lock);
2646		percpu_ref_switch_to_atomic_sync(&mddev->writes_pending);
2647		spin_lock(&mddev->lock);
2648		if (!mddev->in_sync &&
2649		    percpu_ref_is_zero(&mddev->writes_pending)) {
2650			mddev->in_sync = 1;
2651			/*
2652			 * Ensure ->in_sync is visible before we clear
2653			 * ->sync_checkers.
2654			 */
2655			smp_mb();
2656			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
2657			sysfs_notify_dirent_safe(mddev->sysfs_state);
2658		}
2659		if (--mddev->sync_checkers == 0)
2660			percpu_ref_switch_to_percpu(&mddev->writes_pending);
2661	}
2662	if (mddev->safemode == 1)
2663		mddev->safemode = 0;
2664	return mddev->in_sync;
2665}
2666
2667static void sync_sbs(struct mddev *mddev, int nospares)
2668{
2669	/* Update each superblock (in-memory image), but
2670	 * if we are allowed to, skip spares which already
2671	 * have the right event counter, or have one earlier
2672	 * (which would mean they aren't being marked as dirty
2673	 * with the rest of the array)
2674	 */
2675	struct md_rdev *rdev;
2676	rdev_for_each(rdev, mddev) {
2677		if (rdev->sb_events == mddev->events ||
2678		    (nospares &&
2679		     rdev->raid_disk < 0 &&
2680		     rdev->sb_events+1 == mddev->events)) {
2681			/* Don't update this superblock */
2682			rdev->sb_loaded = 2;
2683		} else {
2684			sync_super(mddev, rdev);
2685			rdev->sb_loaded = 1;
2686		}
2687	}
2688}
2689
2690static bool does_sb_need_changing(struct mddev *mddev)
2691{
2692	struct md_rdev *rdev = NULL, *iter;
2693	struct mdp_superblock_1 *sb;
2694	int role;
2695
2696	/* Find a good rdev */
2697	rdev_for_each(iter, mddev)
2698		if ((iter->raid_disk >= 0) && !test_bit(Faulty, &iter->flags)) {
2699			rdev = iter;
2700			break;
2701		}
2702
2703	/* No good device found. */
2704	if (!rdev)
2705		return false;
2706
2707	sb = page_address(rdev->sb_page);
2708	/* Check if a device has become faulty or a spare become active */
2709	rdev_for_each(rdev, mddev) {
2710		role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
2711		/* Device activated? */
2712		if (role == MD_DISK_ROLE_SPARE && rdev->raid_disk >= 0 &&
2713		    !test_bit(Faulty, &rdev->flags))
2714			return true;
2715		/* Device turned faulty? */
2716		if (test_bit(Faulty, &rdev->flags) && (role < MD_DISK_ROLE_MAX))
2717			return true;
2718	}
2719
2720	/* Check if any mddev parameters have changed */
2721	if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
2722	    (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
2723	    (mddev->layout != le32_to_cpu(sb->layout)) ||
2724	    (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
2725	    (mddev->chunk_sectors != le32_to_cpu(sb->chunksize)))
2726		return true;
2727
2728	return false;
2729}
2730
2731void md_update_sb(struct mddev *mddev, int force_change)
2732{
2733	struct md_rdev *rdev;
2734	int sync_req;
2735	int nospares = 0;
2736	int any_badblocks_changed = 0;
2737	int ret = -1;
2738
2739	if (!md_is_rdwr(mddev)) {
2740		if (force_change)
2741			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2742		return;
2743	}
2744
2745repeat:
2746	if (mddev_is_clustered(mddev)) {
2747		if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
2748			force_change = 1;
2749		if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
2750			nospares = 1;
2751		ret = md_cluster_ops->metadata_update_start(mddev);
		/* Has someone else updated the sb? */
2753		if (!does_sb_need_changing(mddev)) {
2754			if (ret == 0)
2755				md_cluster_ops->metadata_update_cancel(mddev);
2756			bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
2757							 BIT(MD_SB_CHANGE_DEVS) |
2758							 BIT(MD_SB_CHANGE_CLEAN));
2759			return;
2760		}
2761	}
2762
2763	/*
	 * First make sure individual recovery_offsets are correct.
	 * curr_resync_completed can only be used during recovery;
	 * during reshape/resync it might use array addresses rather
	 * than device addresses.
2768	 */
2769	rdev_for_each(rdev, mddev) {
2770		if (rdev->raid_disk >= 0 &&
2771		    mddev->delta_disks >= 0 &&
2772		    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
2773		    test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
2774		    !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
2775		    !test_bit(Journal, &rdev->flags) &&
2776		    !test_bit(In_sync, &rdev->flags) &&
2777		    mddev->curr_resync_completed > rdev->recovery_offset)
2778				rdev->recovery_offset = mddev->curr_resync_completed;
2779
2780	}
2781	if (!mddev->persistent) {
2782		clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
2783		clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2784		if (!mddev->external) {
2785			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
2786			rdev_for_each(rdev, mddev) {
2787				if (rdev->badblocks.changed) {
2788					rdev->badblocks.changed = 0;
2789					ack_all_badblocks(&rdev->badblocks);
2790					md_error(mddev, rdev);
2791				}
2792				clear_bit(Blocked, &rdev->flags);
2793				clear_bit(BlockedBadBlocks, &rdev->flags);
2794				wake_up(&rdev->blocked_wait);
2795			}
2796		}
2797		wake_up(&mddev->sb_wait);
2798		return;
2799	}
2800
2801	spin_lock(&mddev->lock);
2802
2803	mddev->utime = ktime_get_real_seconds();
2804
2805	if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
2806		force_change = 1;
2807	if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
		/* just a clean <-> dirty transition, so possibly leave spares
		 * alone, though if 'events' isn't the right even/odd we will
		 * have to update the spares after all
		 */
2812		nospares = 1;
2813	if (force_change)
2814		nospares = 0;
2815	if (mddev->degraded)
		/* If the array is degraded, then skipping spares is both
		 * dangerous and fairly pointless.
		 * Dangerous because a device that was removed from the array
		 * might have an event_count that still looks up-to-date,
		 * so it could be re-added without a resync.
		 * Pointless because if there are any spares to skip, then a
		 * recovery will happen, the array will soon no longer be
		 * degraded, and the spares can go back to sleep.
2824		 */
2825		nospares = 0;
2826
2827	sync_req = mddev->in_sync;
2828
2829	/* If this is just a dirty<->clean transition, and the array is clean
2830	 * and 'events' is odd, we can roll back to the previous clean state */
2831	if (nospares
2832	    && (mddev->in_sync && mddev->recovery_cp == MaxSector)
2833	    && mddev->can_decrease_events
2834	    && mddev->events != 1) {
2835		mddev->events--;
2836		mddev->can_decrease_events = 0;
2837	} else {
2838		/* otherwise we have to go forward and ... */
		mddev->events++;
2840		mddev->can_decrease_events = nospares;
2841	}
2842
2843	/*
2844	 * This 64-bit counter should never wrap.
2845	 * Either we are in around ~1 trillion A.C., assuming
2846	 * 1 reboot per second, or we have a bug...
2847	 */
2848	WARN_ON(mddev->events == 0);
2849
2850	rdev_for_each(rdev, mddev) {
2851		if (rdev->badblocks.changed)
2852			any_badblocks_changed++;
2853		if (test_bit(Faulty, &rdev->flags))
2854			set_bit(FaultRecorded, &rdev->flags);
2855	}
2856
2857	sync_sbs(mddev, nospares);
2858	spin_unlock(&mddev->lock);
2859
2860	pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
2861		 mdname(mddev), mddev->in_sync);
2862
2863	mddev_add_trace_msg(mddev, "md md_update_sb");
2864rewrite:
2865	md_bitmap_update_sb(mddev->bitmap);
2866	rdev_for_each(rdev, mddev) {
2867		if (rdev->sb_loaded != 1)
2868			continue; /* no noise on spare devices */
2869
2870		if (!test_bit(Faulty, &rdev->flags)) {
			md_super_write(mddev, rdev,
2872				       rdev->sb_start, rdev->sb_size,
2873				       rdev->sb_page);
2874			pr_debug("md: (write) %pg's sb offset: %llu\n",
2875				 rdev->bdev,
2876				 (unsigned long long)rdev->sb_start);
2877			rdev->sb_events = mddev->events;
2878			if (rdev->badblocks.size) {
2879				md_super_write(mddev, rdev,
2880					       rdev->badblocks.sector,
2881					       rdev->badblocks.size << 9,
2882					       rdev->bb_page);
2883				rdev->badblocks.size = 0;
2884			}
2885
2886		} else
2887			pr_debug("md: %pg (skipping faulty)\n",
2888				 rdev->bdev);
2889	}
2890	if (md_super_wait(mddev) < 0)
2891		goto rewrite;
2892	/* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */
2893
2894	if (mddev_is_clustered(mddev) && ret == 0)
2895		md_cluster_ops->metadata_update_finish(mddev);
2896
2897	if (mddev->in_sync != sync_req ||
2898	    !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
2899			       BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN)))
2900		/* have to write it out again */
2901		goto repeat;
2902	wake_up(&mddev->sb_wait);
2903	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2904		sysfs_notify_dirent_safe(mddev->sysfs_completed);
2905
2906	rdev_for_each(rdev, mddev) {
2907		if (test_and_clear_bit(FaultRecorded, &rdev->flags))
2908			clear_bit(Blocked, &rdev->flags);
2909
2910		if (any_badblocks_changed)
2911			ack_all_badblocks(&rdev->badblocks);
2912		clear_bit(BlockedBadBlocks, &rdev->flags);
2913		wake_up(&rdev->blocked_wait);
2914	}
2915}
2916EXPORT_SYMBOL(md_update_sb);
2917
2918static int add_bound_rdev(struct md_rdev *rdev)
2919{
2920	struct mddev *mddev = rdev->mddev;
2921	int err = 0;
2922	bool add_journal = test_bit(Journal, &rdev->flags);
2923
2924	if (!mddev->pers->hot_remove_disk || add_journal) {
		/* If there is hot_add_disk but no hot_remove_disk,
		 * then any added disks are for geometry changes
		 * and should be added immediately.
2928		 */
2929		super_types[mddev->major_version].
2930			validate_super(mddev, NULL/*freshest*/, rdev);
2931		err = mddev->pers->hot_add_disk(mddev, rdev);
2932		if (err) {
2933			md_kick_rdev_from_array(rdev);
2934			return err;
2935		}
2936	}
2937	sysfs_notify_dirent_safe(rdev->sysfs_state);
2938
2939	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2940	if (mddev->degraded)
2941		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
2942	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2943	md_new_event();
2944	return 0;
2945}
2946
2947/* words written to sysfs files may, or may not, be \n terminated.
 * We want to accept either case. For this we use cmd_match.
2949 */
2950static int cmd_match(const char *cmd, const char *str)
2951{
2952	/* See if cmd, written into a sysfs file, matches
2953	 * str.  They must either be the same, or cmd can
2954	 * have a trailing newline
2955	 */
2956	while (*cmd && *str && *cmd == *str) {
2957		cmd++;
2958		str++;
2959	}
2960	if (*cmd == '\n')
2961		cmd++;
2962	if (*str || *cmd)
2963		return 0;
2964	return 1;
2965}
2966
2967struct rdev_sysfs_entry {
2968	struct attribute attr;
2969	ssize_t (*show)(struct md_rdev *, char *);
2970	ssize_t (*store)(struct md_rdev *, const char *, size_t);
2971};
2972
2973static ssize_t
2974state_show(struct md_rdev *rdev, char *page)
2975{
2976	char *sep = ",";
2977	size_t len = 0;
2978	unsigned long flags = READ_ONCE(rdev->flags);
2979
2980	if (test_bit(Faulty, &flags) ||
2981	    (!test_bit(ExternalBbl, &flags) &&
2982	    rdev->badblocks.unacked_exist))
2983		len += sprintf(page+len, "faulty%s", sep);
2984	if (test_bit(In_sync, &flags))
2985		len += sprintf(page+len, "in_sync%s", sep);
2986	if (test_bit(Journal, &flags))
2987		len += sprintf(page+len, "journal%s", sep);
2988	if (test_bit(WriteMostly, &flags))
2989		len += sprintf(page+len, "write_mostly%s", sep);
2990	if (test_bit(Blocked, &flags) ||
2991	    (rdev->badblocks.unacked_exist
2992	     && !test_bit(Faulty, &flags)))
2993		len += sprintf(page+len, "blocked%s", sep);
2994	if (!test_bit(Faulty, &flags) &&
2995	    !test_bit(Journal, &flags) &&
2996	    !test_bit(In_sync, &flags))
2997		len += sprintf(page+len, "spare%s", sep);
2998	if (test_bit(WriteErrorSeen, &flags))
2999		len += sprintf(page+len, "write_error%s", sep);
3000	if (test_bit(WantReplacement, &flags))
3001		len += sprintf(page+len, "want_replacement%s", sep);
3002	if (test_bit(Replacement, &flags))
3003		len += sprintf(page+len, "replacement%s", sep);
3004	if (test_bit(ExternalBbl, &flags))
3005		len += sprintf(page+len, "external_bbl%s", sep);
3006	if (test_bit(FailFast, &flags))
3007		len += sprintf(page+len, "failfast%s", sep);
3008
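	/* drop the trailing separator left after the last flag */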
3009	if (len)
3010		len -= strlen(sep);
3011
3012	return len+sprintf(page+len, "\n");
3013}
3014
3015static ssize_t
3016state_store(struct md_rdev *rdev, const char *buf, size_t len)
3017{
3018	/* can write
3019	 *  faulty  - simulates an error
3020	 *  remove  - disconnects the device
3021	 *  writemostly - sets write_mostly
3022	 *  -writemostly - clears write_mostly
3023	 *  blocked - sets the Blocked flags
3024	 *  -blocked - clears the Blocked and possibly simulates an error
	 *  insync - sets In_sync provided the device isn't active
	 *  -insync - clears In_sync for a device with a slot assigned,
	 *            so that it gets rebuilt based on the bitmap
3028	 *  write_error - sets WriteErrorSeen
3029	 *  -write_error - clears WriteErrorSeen
3030	 *  {,-}failfast - set/clear FailFast
3031	 */
3032
3033	struct mddev *mddev = rdev->mddev;
3034	int err = -EINVAL;
3035	bool need_update_sb = false;
3036
3037	if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
3038		md_error(rdev->mddev, rdev);
3039
3040		if (test_bit(MD_BROKEN, &rdev->mddev->flags))
3041			err = -EBUSY;
3042		else
3043			err = 0;
3044	} else if (cmd_match(buf, "remove")) {
3045		if (rdev->mddev->pers) {
3046			clear_bit(Blocked, &rdev->flags);
3047			remove_and_add_spares(rdev->mddev, rdev);
3048		}
3049		if (rdev->raid_disk >= 0)
3050			err = -EBUSY;
3051		else {
3052			err = 0;
3053			if (mddev_is_clustered(mddev))
3054				err = md_cluster_ops->remove_disk(mddev, rdev);
3055
3056			if (err == 0) {
3057				md_kick_rdev_from_array(rdev);
3058				if (mddev->pers)
3059					set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
3060				md_new_event();
3061			}
3062		}
3063	} else if (cmd_match(buf, "writemostly")) {
3064		set_bit(WriteMostly, &rdev->flags);
3065		mddev_create_serial_pool(rdev->mddev, rdev);
3066		need_update_sb = true;
3067		err = 0;
3068	} else if (cmd_match(buf, "-writemostly")) {
3069		mddev_destroy_serial_pool(rdev->mddev, rdev);
3070		clear_bit(WriteMostly, &rdev->flags);
3071		need_update_sb = true;
3072		err = 0;
3073	} else if (cmd_match(buf, "blocked")) {
3074		set_bit(Blocked, &rdev->flags);
3075		err = 0;
3076	} else if (cmd_match(buf, "-blocked")) {
3077		if (!test_bit(Faulty, &rdev->flags) &&
3078		    !test_bit(ExternalBbl, &rdev->flags) &&
3079		    rdev->badblocks.unacked_exist) {
3080			/* metadata handler doesn't understand badblocks,
3081			 * so we need to fail the device
3082			 */
3083			md_error(rdev->mddev, rdev);
3084		}
3085		clear_bit(Blocked, &rdev->flags);
3086		clear_bit(BlockedBadBlocks, &rdev->flags);
3087		wake_up(&rdev->blocked_wait);
3088		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
3089
3090		err = 0;
3091	} else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
3092		set_bit(In_sync, &rdev->flags);
3093		err = 0;
3094	} else if (cmd_match(buf, "failfast")) {
3095		set_bit(FailFast, &rdev->flags);
3096		need_update_sb = true;
3097		err = 0;
3098	} else if (cmd_match(buf, "-failfast")) {
3099		clear_bit(FailFast, &rdev->flags);
3100		need_update_sb = true;
3101		err = 0;
3102	} else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 &&
3103		   !test_bit(Journal, &rdev->flags)) {
3104		if (rdev->mddev->pers == NULL) {
3105			clear_bit(In_sync, &rdev->flags);
3106			rdev->saved_raid_disk = rdev->raid_disk;
3107			rdev->raid_disk = -1;
3108			err = 0;
3109		}
3110	} else if (cmd_match(buf, "write_error")) {
3111		set_bit(WriteErrorSeen, &rdev->flags);
3112		err = 0;
3113	} else if (cmd_match(buf, "-write_error")) {
3114		clear_bit(WriteErrorSeen, &rdev->flags);
3115		err = 0;
3116	} else if (cmd_match(buf, "want_replacement")) {
3117		/* Any non-spare device that is not a replacement can
3118		 * become want_replacement at any time, but we then need to
3119		 * check if recovery is needed.
3120		 */
3121		if (rdev->raid_disk >= 0 &&
3122		    !test_bit(Journal, &rdev->flags) &&
3123		    !test_bit(Replacement, &rdev->flags))
3124			set_bit(WantReplacement, &rdev->flags);
3125		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
3126		err = 0;
3127	} else if (cmd_match(buf, "-want_replacement")) {
3128		/* Clearing 'want_replacement' is always allowed.
		 * Once replacement has started it is too late, though.
3130		 */
3131		err = 0;
3132		clear_bit(WantReplacement, &rdev->flags);
3133	} else if (cmd_match(buf, "replacement")) {
3134		/* Can only set a device as a replacement when array has not
3135		 * yet been started.  Once running, replacement is automatic
3136		 * from spares, or by assigning 'slot'.
3137		 */
3138		if (rdev->mddev->pers)
3139			err = -EBUSY;
3140		else {
3141			set_bit(Replacement, &rdev->flags);
3142			err = 0;
3143		}
3144	} else if (cmd_match(buf, "-replacement")) {
3145		/* Similarly, can only clear Replacement before start */
3146		if (rdev->mddev->pers)
3147			err = -EBUSY;
3148		else {
3149			clear_bit(Replacement, &rdev->flags);
3150			err = 0;
3151		}
3152	} else if (cmd_match(buf, "re-add")) {
3153		if (!rdev->mddev->pers)
3154			err = -EINVAL;
3155		else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) &&
3156				rdev->saved_raid_disk >= 0) {
3157			/* clear_bit is performed _after_ all the devices
3158			 * have their local Faulty bit cleared. If any writes
3159			 * happen in the meantime in the local node, they
3160			 * will land in the local bitmap, which will be synced
3161			 * by this node eventually
3162			 */
3163			if (!mddev_is_clustered(rdev->mddev) ||
3164			    (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) {
3165				clear_bit(Faulty, &rdev->flags);
3166				err = add_bound_rdev(rdev);
3167			}
3168		} else
3169			err = -EBUSY;
3170	} else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) {
3171		set_bit(ExternalBbl, &rdev->flags);
3172		rdev->badblocks.shift = 0;
3173		err = 0;
3174	} else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) {
3175		clear_bit(ExternalBbl, &rdev->flags);
3176		err = 0;
3177	}
3178	if (need_update_sb)
3179		md_update_sb(mddev, 1);
3180	if (!err)
3181		sysfs_notify_dirent_safe(rdev->sysfs_state);
3182	return err ? err : len;
3183}
3184static struct rdev_sysfs_entry rdev_state =
3185__ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store);
3186
3187static ssize_t
3188errors_show(struct md_rdev *rdev, char *page)
3189{
3190	return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
3191}
3192
3193static ssize_t
3194errors_store(struct md_rdev *rdev, const char *buf, size_t len)
3195{
3196	unsigned int n;
3197	int rv;
3198
3199	rv = kstrtouint(buf, 10, &n);
3200	if (rv < 0)
3201		return rv;
3202	atomic_set(&rdev->corrected_errors, n);
3203	return len;
3204}
3205static struct rdev_sysfs_entry rdev_errors =
3206__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
3207
3208static ssize_t
3209slot_show(struct md_rdev *rdev, char *page)
3210{
3211	if (test_bit(Journal, &rdev->flags))
3212		return sprintf(page, "journal\n");
3213	else if (rdev->raid_disk < 0)
3214		return sprintf(page, "none\n");
3215	else
3216		return sprintf(page, "%d\n", rdev->raid_disk);
3217}
3218
3219static ssize_t
3220slot_store(struct md_rdev *rdev, const char *buf, size_t len)
3221{
3222	int slot;
3223	int err;
3224
3225	if (test_bit(Journal, &rdev->flags))
3226		return -EBUSY;
3227	if (strncmp(buf, "none", 4)==0)
3228		slot = -1;
3229	else {
3230		err = kstrtouint(buf, 10, (unsigned int *)&slot);
3231		if (err < 0)
3232			return err;
3233		if (slot < 0)
3234			/* overflow */
3235			return -ENOSPC;
3236	}
3237	if (rdev->mddev->pers && slot == -1) {
3238		/* Setting 'slot' on an active array requires also
3239		 * updating the 'rd%d' link, and communicating
3240		 * with the personality with ->hot_*_disk.
3241		 * For now we only support removing
3242		 * failed/spare devices.  This normally happens automatically,
3243		 * but not when the metadata is externally managed.
3244		 */
3245		if (rdev->raid_disk == -1)
3246			return -EEXIST;
3247		/* personality does all needed checks */
3248		if (rdev->mddev->pers->hot_remove_disk == NULL)
3249			return -EINVAL;
3250		clear_bit(Blocked, &rdev->flags);
3251		remove_and_add_spares(rdev->mddev, rdev);
3252		if (rdev->raid_disk >= 0)
3253			return -EBUSY;
3254		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
3255	} else if (rdev->mddev->pers) {
3256		/* Activating a spare .. or possibly reactivating
3257		 * if we ever get bitmaps working here.
3258		 */
3259		int err;
3260
3261		if (rdev->raid_disk != -1)
3262			return -EBUSY;
3263
3264		if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery))
3265			return -EBUSY;
3266
3267		if (rdev->mddev->pers->hot_add_disk == NULL)
3268			return -EINVAL;
3269
3270		if (slot >= rdev->mddev->raid_disks &&
3271		    slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
3272			return -ENOSPC;
3273
3274		rdev->raid_disk = slot;
3275		if (test_bit(In_sync, &rdev->flags))
3276			rdev->saved_raid_disk = slot;
3277		else
3278			rdev->saved_raid_disk = -1;
3279		clear_bit(In_sync, &rdev->flags);
3280		clear_bit(Bitmap_sync, &rdev->flags);
3281		err = rdev->mddev->pers->hot_add_disk(rdev->mddev, rdev);
3282		if (err) {
3283			rdev->raid_disk = -1;
3284			return err;
3285		} else
3286			sysfs_notify_dirent_safe(rdev->sysfs_state);
		/* failure here is OK */
		sysfs_link_rdev(rdev->mddev, rdev);
3289		/* don't wakeup anyone, leave that to userspace. */
3290	} else {
3291		if (slot >= rdev->mddev->raid_disks &&
3292		    slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
3293			return -ENOSPC;
3294		rdev->raid_disk = slot;
3295		/* assume it is working */
3296		clear_bit(Faulty, &rdev->flags);
3297		clear_bit(WriteMostly, &rdev->flags);
3298		set_bit(In_sync, &rdev->flags);
3299		sysfs_notify_dirent_safe(rdev->sysfs_state);
3300	}
3301	return len;
3302}
3303
3304static struct rdev_sysfs_entry rdev_slot =
3305__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
3306
3307static ssize_t
3308offset_show(struct md_rdev *rdev, char *page)
3309{
3310	return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
3311}
3312
3313static ssize_t
3314offset_store(struct md_rdev *rdev, const char *buf, size_t len)
3315{
3316	unsigned long long offset;
3317	if (kstrtoull(buf, 10, &offset) < 0)
3318		return -EINVAL;
3319	if (rdev->mddev->pers && rdev->raid_disk >= 0)
3320		return -EBUSY;
3321	if (rdev->sectors && rdev->mddev->external)
3322		/* Must set offset before size, so overlap checks
3323		 * can be sane */
3324		return -EBUSY;
3325	rdev->data_offset = offset;
3326	rdev->new_data_offset = offset;
3327	return len;
3328}
3329
3330static struct rdev_sysfs_entry rdev_offset =
3331__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
3332
3333static ssize_t new_offset_show(struct md_rdev *rdev, char *page)
3334{
3335	return sprintf(page, "%llu\n",
3336		       (unsigned long long)rdev->new_data_offset);
3337}
3338
3339static ssize_t new_offset_store(struct md_rdev *rdev,
3340				const char *buf, size_t len)
3341{
3342	unsigned long long new_offset;
3343	struct mddev *mddev = rdev->mddev;
3344
3345	if (kstrtoull(buf, 10, &new_offset) < 0)
3346		return -EINVAL;
3347
3348	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3349		return -EBUSY;
3350	if (new_offset == rdev->data_offset)
3351		/* reset is always permitted */
3352		;
3353	else if (new_offset > rdev->data_offset) {
3354		/* must not push array size beyond rdev_sectors */
3355		if (new_offset - rdev->data_offset
3356		    + mddev->dev_sectors > rdev->sectors)
3357				return -E2BIG;
3358	}
3359	/* Metadata worries about other space details. */
3360
3361	/* decreasing the offset is inconsistent with a backwards
3362	 * reshape.
3363	 */
3364	if (new_offset < rdev->data_offset &&
3365	    mddev->reshape_backwards)
3366		return -EINVAL;
3367	/* Increasing offset is inconsistent with forwards
3368	 * reshape.  reshape_direction should be set to
3369	 * 'backwards' first.
3370	 */
3371	if (new_offset > rdev->data_offset &&
3372	    !mddev->reshape_backwards)
3373		return -EINVAL;
3374
3375	if (mddev->pers && mddev->persistent &&
3376	    !super_types[mddev->major_version]
3377	    .allow_new_offset(rdev, new_offset))
3378		return -E2BIG;
3379	rdev->new_data_offset = new_offset;
3380	if (new_offset > rdev->data_offset)
3381		mddev->reshape_backwards = 1;
3382	else if (new_offset < rdev->data_offset)
3383		mddev->reshape_backwards = 0;
3384
3385	return len;
3386}
3387static struct rdev_sysfs_entry rdev_new_offset =
3388__ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store);
3389
3390static ssize_t
3391rdev_size_show(struct md_rdev *rdev, char *page)
3392{
3393	return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
3394}
3395
3396static int md_rdevs_overlap(struct md_rdev *a, struct md_rdev *b)
3397{
3398	/* check if two start/length pairs overlap */
3399	if (a->data_offset + a->sectors <= b->data_offset)
3400		return false;
3401	if (b->data_offset + b->sectors <= a->data_offset)
3402		return false;
3403	return true;
3404}
3405
3406static bool md_rdev_overlaps(struct md_rdev *rdev)
3407{
3408	struct mddev *mddev;
3409	struct md_rdev *rdev2;
3410
3411	spin_lock(&all_mddevs_lock);
3412	list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
3413		if (test_bit(MD_DELETED, &mddev->flags))
3414			continue;
3415		rdev_for_each(rdev2, mddev) {
3416			if (rdev != rdev2 && rdev->bdev == rdev2->bdev &&
3417			    md_rdevs_overlap(rdev, rdev2)) {
3418				spin_unlock(&all_mddevs_lock);
3419				return true;
3420			}
3421		}
3422	}
3423	spin_unlock(&all_mddevs_lock);
3424	return false;
3425}
3426
3427static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
3428{
3429	unsigned long long blocks;
3430	sector_t new;
3431
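	/*
	 * 'blocks' is in 1K units; doubling converts it to 512-byte sectors,
	 * so reject values whose top bit is set (the doubling would wrap) and
	 * values that do not survive the cast to sector_t.
	 */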
3432	if (kstrtoull(buf, 10, &blocks) < 0)
3433		return -EINVAL;
3434
3435	if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
3436		return -EINVAL; /* sector conversion overflow */
3437
3438	new = blocks * 2;
3439	if (new != blocks * 2)
3440		return -EINVAL; /* unsigned long long to sector_t overflow */
3441
3442	*sectors = new;
3443	return 0;
3444}
3445
3446static ssize_t
3447rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
3448{
3449	struct mddev *my_mddev = rdev->mddev;
3450	sector_t oldsectors = rdev->sectors;
3451	sector_t sectors;
3452
3453	if (test_bit(Journal, &rdev->flags))
3454		return -EBUSY;
3455	if (strict_blocks_to_sectors(buf, &sectors) < 0)
3456		return -EINVAL;
3457	if (rdev->data_offset != rdev->new_data_offset)
3458		return -EINVAL; /* too confusing */
3459	if (my_mddev->pers && rdev->raid_disk >= 0) {
3460		if (my_mddev->persistent) {
3461			sectors = super_types[my_mddev->major_version].
3462				rdev_size_change(rdev, sectors);
3463			if (!sectors)
3464				return -EBUSY;
3465		} else if (!sectors)
3466			sectors = bdev_nr_sectors(rdev->bdev) -
3467				rdev->data_offset;
3468		if (!my_mddev->pers->resize)
3469			/* Cannot change size for RAID0 or Linear etc */
3470			return -EINVAL;
3471	}
3472	if (sectors < my_mddev->dev_sectors)
3473		return -EINVAL; /* component must fit device */
3474
3475	rdev->sectors = sectors;
3476
3477	/*
3478	 * Check that all other rdevs with the same bdev do not overlap.  This
3479	 * check does not provide a hard guarantee, it just helps avoid
3480	 * dangerous mistakes.
3481	 */
3482	if (sectors > oldsectors && my_mddev->external &&
3483	    md_rdev_overlaps(rdev)) {
3484		/*
3485		 * Someone else could have slipped in a size change here, but
3486		 * doing so is just silly.  We put oldsectors back because we
3487		 * know it is safe, and trust userspace not to race with itself.
3488		 */
3489		rdev->sectors = oldsectors;
3490		return -EBUSY;
3491	}
3492	return len;
3493}
3494
3495static struct rdev_sysfs_entry rdev_size =
3496__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
3497
3498static ssize_t recovery_start_show(struct md_rdev *rdev, char *page)
3499{
3500	unsigned long long recovery_start = rdev->recovery_offset;
3501
3502	if (test_bit(In_sync, &rdev->flags) ||
3503	    recovery_start == MaxSector)
3504		return sprintf(page, "none\n");
3505
3506	return sprintf(page, "%llu\n", recovery_start);
3507}
3508
3509static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len)
3510{
3511	unsigned long long recovery_start;
3512
3513	if (cmd_match(buf, "none"))
3514		recovery_start = MaxSector;
3515	else if (kstrtoull(buf, 10, &recovery_start))
3516		return -EINVAL;
3517
3518	if (rdev->mddev->pers &&
3519	    rdev->raid_disk >= 0)
3520		return -EBUSY;
3521
3522	rdev->recovery_offset = recovery_start;
3523	if (recovery_start == MaxSector)
3524		set_bit(In_sync, &rdev->flags);
3525	else
3526		clear_bit(In_sync, &rdev->flags);
3527	return len;
3528}
3529
3530static struct rdev_sysfs_entry rdev_recovery_start =
3531__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
3532
3533/* sysfs access to bad-blocks list.
3534 * We present two files.
 * 'bad_blocks' lists sector numbers and lengths of ranges that
 *    are recorded as bad.  The list is truncated to fit within
 *    the one-page limit of sysfs.
 *    Writing "sector length" to this file adds an acknowledged
 *    bad block to the list.
 * 'unacknowledged_bad_blocks' lists bad blocks that have not yet
 *    been acknowledged.  Writing to this file adds bad blocks
 *    without acknowledging them.  This is largely for testing.
3543 */
3544static ssize_t bb_show(struct md_rdev *rdev, char *page)
3545{
3546	return badblocks_show(&rdev->badblocks, page, 0);
3547}
3548static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len)
3549{
3550	int rv = badblocks_store(&rdev->badblocks, page, len, 0);
3551	/* Maybe that ack was all we needed */
3552	if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
3553		wake_up(&rdev->blocked_wait);
3554	return rv;
3555}
3556static struct rdev_sysfs_entry rdev_bad_blocks =
3557__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
3558
3559static ssize_t ubb_show(struct md_rdev *rdev, char *page)
3560{
3561	return badblocks_show(&rdev->badblocks, page, 1);
3562}
3563static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len)
3564{
3565	return badblocks_store(&rdev->badblocks, page, len, 1);
3566}
3567static struct rdev_sysfs_entry rdev_unack_bad_blocks =
3568__ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
3569
3570static ssize_t
3571ppl_sector_show(struct md_rdev *rdev, char *page)
3572{
3573	return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector);
3574}
3575
3576static ssize_t
3577ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len)
3578{
3579	unsigned long long sector;
3580
3581	if (kstrtoull(buf, 10, &sector) < 0)
3582		return -EINVAL;
3583	if (sector != (sector_t)sector)
3584		return -EINVAL;
3585
3586	if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
3587	    rdev->raid_disk >= 0)
3588		return -EBUSY;
3589
3590	if (rdev->mddev->persistent) {
3591		if (rdev->mddev->major_version == 0)
3592			return -EINVAL;
3593		if ((sector > rdev->sb_start &&
3594		     sector - rdev->sb_start > S16_MAX) ||
3595		    (sector < rdev->sb_start &&
3596		     rdev->sb_start - sector > -S16_MIN))
3597			return -EINVAL;
3598		rdev->ppl.offset = sector - rdev->sb_start;
3599	} else if (!rdev->mddev->external) {
3600		return -EBUSY;
3601	}
3602	rdev->ppl.sector = sector;
3603	return len;
3604}
3605
3606static struct rdev_sysfs_entry rdev_ppl_sector =
3607__ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store);
3608
3609static ssize_t
3610ppl_size_show(struct md_rdev *rdev, char *page)
3611{
3612	return sprintf(page, "%u\n", rdev->ppl.size);
3613}
3614
3615static ssize_t
3616ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len)
3617{
3618	unsigned int size;
3619
3620	if (kstrtouint(buf, 10, &size) < 0)
3621		return -EINVAL;
3622
3623	if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
3624	    rdev->raid_disk >= 0)
3625		return -EBUSY;
3626
3627	if (rdev->mddev->persistent) {
3628		if (rdev->mddev->major_version == 0)
3629			return -EINVAL;
3630		if (size > U16_MAX)
3631			return -EINVAL;
3632	} else if (!rdev->mddev->external) {
3633		return -EBUSY;
3634	}
3635	rdev->ppl.size = size;
3636	return len;
3637}
3638
3639static struct rdev_sysfs_entry rdev_ppl_size =
3640__ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store);
3641
3642static struct attribute *rdev_default_attrs[] = {
3643	&rdev_state.attr,
3644	&rdev_errors.attr,
3645	&rdev_slot.attr,
3646	&rdev_offset.attr,
3647	&rdev_new_offset.attr,
3648	&rdev_size.attr,
3649	&rdev_recovery_start.attr,
3650	&rdev_bad_blocks.attr,
3651	&rdev_unack_bad_blocks.attr,
3652	&rdev_ppl_sector.attr,
3653	&rdev_ppl_size.attr,
3654	NULL,
3655};
3656ATTRIBUTE_GROUPS(rdev_default);
3657static ssize_t
3658rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
3659{
3660	struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3661	struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3662
3663	if (!entry->show)
3664		return -EIO;
3665	if (!rdev->mddev)
3666		return -ENODEV;
3667	return entry->show(rdev, page);
3668}
3669
3670static ssize_t
3671rdev_attr_store(struct kobject *kobj, struct attribute *attr,
3672	      const char *page, size_t length)
3673{
3674	struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3675	struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3676	struct kernfs_node *kn = NULL;
3677	bool suspend = false;
3678	ssize_t rv;
3679	struct mddev *mddev = READ_ONCE(rdev->mddev);
3680
3681	if (!entry->store)
3682		return -EIO;
3683	if (!capable(CAP_SYS_ADMIN))
3684		return -EACCES;
3685	if (!mddev)
3686		return -ENODEV;
3687
3688	if (entry->store == state_store) {
3689		if (cmd_match(page, "remove"))
3690			kn = sysfs_break_active_protection(kobj, attr);
3691		if (cmd_match(page, "remove") || cmd_match(page, "re-add") ||
3692		    cmd_match(page, "writemostly") ||
3693		    cmd_match(page, "-writemostly"))
3694			suspend = true;
3695	}
3696
3697	rv = suspend ? mddev_suspend_and_lock(mddev) : mddev_lock(mddev);
3698	if (!rv) {
3699		if (rdev->mddev == NULL)
3700			rv = -ENODEV;
3701		else
3702			rv = entry->store(rdev, page, length);
3703		suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev);
3704	}
3705
3706	if (kn)
3707		sysfs_unbreak_active_protection(kn);
3708
3709	return rv;
3710}
3711
3712static void rdev_free(struct kobject *ko)
3713{
3714	struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj);
3715	kfree(rdev);
3716}
3717static const struct sysfs_ops rdev_sysfs_ops = {
3718	.show		= rdev_attr_show,
3719	.store		= rdev_attr_store,
3720};
3721static const struct kobj_type rdev_ktype = {
3722	.release	= rdev_free,
3723	.sysfs_ops	= &rdev_sysfs_ops,
3724	.default_groups	= rdev_default_groups,
3725};
3726
3727int md_rdev_init(struct md_rdev *rdev)
3728{
3729	rdev->desc_nr = -1;
3730	rdev->saved_raid_disk = -1;
3731	rdev->raid_disk = -1;
3732	rdev->flags = 0;
3733	rdev->data_offset = 0;
3734	rdev->new_data_offset = 0;
3735	rdev->sb_events = 0;
3736	rdev->last_read_error = 0;
3737	rdev->sb_loaded = 0;
3738	rdev->bb_page = NULL;
3739	atomic_set(&rdev->nr_pending, 0);
3740	atomic_set(&rdev->read_errors, 0);
3741	atomic_set(&rdev->corrected_errors, 0);
3742
3743	INIT_LIST_HEAD(&rdev->same_set);
3744	init_waitqueue_head(&rdev->blocked_wait);
3745
3746	/* Add space to store bad block list.
3747	 * This reserves the space even on arrays where it cannot
3748	 * be used - I wonder if that matters
3749	 */
3750	return badblocks_init(&rdev->badblocks, 0);
3751}
3752EXPORT_SYMBOL_GPL(md_rdev_init);
3753
3754/*
3755 * Import a device. If 'super_format' >= 0, then sanity check the superblock
3756 *
3757 * mark the device faulty if:
3758 *
3759 *   - the device is nonexistent (zero size)
3760 *   - the device has no valid superblock
3761 *
3762 * a faulty rdev _never_ has rdev->sb set.
3763 */
3764static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor)
3765{
3766	struct md_rdev *rdev;
3767	sector_t size;
3768	int err;
3769
3770	rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
3771	if (!rdev)
3772		return ERR_PTR(-ENOMEM);
3773
3774	err = md_rdev_init(rdev);
3775	if (err)
3776		goto out_free_rdev;
3777	err = alloc_disk_sb(rdev);
3778	if (err)
3779		goto out_clear_rdev;
3780
3781	rdev->bdev_file = bdev_file_open_by_dev(newdev,
3782			BLK_OPEN_READ | BLK_OPEN_WRITE,
3783			super_format == -2 ? &claim_rdev : rdev, NULL);
3784	if (IS_ERR(rdev->bdev_file)) {
3785		pr_warn("md: could not open device unknown-block(%u,%u).\n",
3786			MAJOR(newdev), MINOR(newdev));
3787		err = PTR_ERR(rdev->bdev_file);
3788		goto out_clear_rdev;
3789	}
3790	rdev->bdev = file_bdev(rdev->bdev_file);
3791
3792	kobject_init(&rdev->kobj, &rdev_ktype);
3793
3794	size = bdev_nr_bytes(rdev->bdev) >> BLOCK_SIZE_BITS;
3795	if (!size) {
3796		pr_warn("md: %pg has zero or unknown size, marking faulty!\n",
3797			rdev->bdev);
3798		err = -EINVAL;
3799		goto out_blkdev_put;
3800	}
3801
3802	if (super_format >= 0) {
3803		err = super_types[super_format].
3804			load_super(rdev, NULL, super_minor);
3805		if (err == -EINVAL) {
3806			pr_warn("md: %pg does not have a valid v%d.%d superblock, not importing!\n",
3807				rdev->bdev,
3808				super_format, super_minor);
3809			goto out_blkdev_put;
3810		}
3811		if (err < 0) {
3812			pr_warn("md: could not read %pg's sb, not importing!\n",
3813				rdev->bdev);
3814			goto out_blkdev_put;
3815		}
3816	}
3817
3818	return rdev;
3819
3820out_blkdev_put:
3821	fput(rdev->bdev_file);
3822out_clear_rdev:
3823	md_rdev_clear(rdev);
3824out_free_rdev:
3825	kfree(rdev);
3826	return ERR_PTR(err);
3827}
3828
3829/*
3830 * Check a full RAID array for plausibility
3831 */
3832
3833static int analyze_sbs(struct mddev *mddev)
3834{
3835	int i;
3836	struct md_rdev *rdev, *freshest, *tmp;
3837
3838	freshest = NULL;
3839	rdev_for_each_safe(rdev, tmp, mddev)
3840		switch (super_types[mddev->major_version].
3841			load_super(rdev, freshest, mddev->minor_version)) {
3842		case 1:
3843			freshest = rdev;
3844			break;
3845		case 0:
3846			break;
3847		default:
3848			pr_warn("md: fatal superblock inconsistency in %pg -- removing from array\n",
3849				rdev->bdev);
3850			md_kick_rdev_from_array(rdev);
3851		}
3852
3853	/* Cannot find a valid fresh disk */
3854	if (!freshest) {
3855		pr_warn("md: cannot find a valid disk\n");
3856		return -EINVAL;
3857	}
3858
3859	super_types[mddev->major_version].
3860		validate_super(mddev, NULL/*freshest*/, freshest);
3861
3862	i = 0;
3863	rdev_for_each_safe(rdev, tmp, mddev) {
3864		if (mddev->max_disks &&
3865		    (rdev->desc_nr >= mddev->max_disks ||
3866		     i > mddev->max_disks)) {
3867			pr_warn("md: %s: %pg: only %d devices permitted\n",
3868				mdname(mddev), rdev->bdev,
3869				mddev->max_disks);
3870			md_kick_rdev_from_array(rdev);
3871			continue;
3872		}
3873		if (rdev != freshest) {
3874			if (super_types[mddev->major_version].
3875			    validate_super(mddev, freshest, rdev)) {
3876				pr_warn("md: kicking non-fresh %pg from array!\n",
3877					rdev->bdev);
3878				md_kick_rdev_from_array(rdev);
3879				continue;
3880			}
3881		}
3882		if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks)) &&
3883		    !test_bit(Journal, &rdev->flags)) {
3884			rdev->raid_disk = -1;
3885			clear_bit(In_sync, &rdev->flags);
3886		}
3887	}
3888
3889	return 0;
3890}
3891
3892/* Read a fixed-point number.
3893 * Numbers in sysfs attributes should be in "standard" units where
3894 * possible, so time should be in seconds.
3895 * However, we internally use a much smaller unit such as
3896 * milliseconds or jiffies.
3897 * This function takes a decimal number with a possible fractional
3898 * component, and produces an integer which is the result of
3899 * multiplying that number by 10^'scale',
3900 * all without any floating-point arithmetic.
3901 */
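/*
 * For example (illustrative values): strict_strtoul_scaled("5.3", &res, 3)
 * stores 5300 in *res, i.e. 5.3 seconds expressed in milliseconds.
 */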
3902int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
3903{
3904	unsigned long result = 0;
3905	long decimals = -1;
3906	while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
3907		if (*cp == '.')
3908			decimals = 0;
3909		else if (decimals < scale) {
3910			unsigned int value;
3911			value = *cp - '0';
3912			result = result * 10 + value;
3913			if (decimals >= 0)
3914				decimals++;
3915		}
3916		cp++;
3917	}
3918	if (*cp == '\n')
3919		cp++;
3920	if (*cp)
3921		return -EINVAL;
3922	if (decimals < 0)
3923		decimals = 0;
3924	*res = result * int_pow(10, scale - decimals);
3925	return 0;
3926}
3927
3928static ssize_t
3929safe_delay_show(struct mddev *mddev, char *page)
3930{
3931	unsigned int msec = ((unsigned long)mddev->safemode_delay*1000)/HZ;
3932
3933	return sprintf(page, "%u.%03u\n", msec/1000, msec%1000);
3934}
3935static ssize_t
3936safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
3937{
3938	unsigned long msec;
3939
3940	if (mddev_is_clustered(mddev)) {
3941		pr_warn("md: Safemode is disabled for clustered mode\n");
3942		return -EINVAL;
3943	}
3944
3945	if (strict_strtoul_scaled(cbuf, &msec, 3) < 0 || msec > UINT_MAX / HZ)
3946		return -EINVAL;
3947	if (msec == 0)
3948		mddev->safemode_delay = 0;
3949	else {
3950		unsigned long old_delay = mddev->safemode_delay;
3951		unsigned long new_delay = (msec*HZ)/1000;
3952
3953		if (new_delay == 0)
3954			new_delay = 1;
3955		mddev->safemode_delay = new_delay;
3956		if (new_delay < old_delay || old_delay == 0)
3957			mod_timer(&mddev->safemode_timer, jiffies+1);
3958	}
3959	return len;
3960}
3961static struct md_sysfs_entry md_safe_delay =
3962__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR, safe_delay_show, safe_delay_store);
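/*
 * Illustrative usage (path assumes an array named md0): the value is a
 * delay in seconds with an optional fractional part, e.g.
 *
 *	echo 0.250 > /sys/block/md0/md/safe_mode_delay
 *
 * stores roughly 250ms worth of jiffies in ->safemode_delay.
 */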
3963
3964static ssize_t
3965level_show(struct mddev *mddev, char *page)
3966{
3967	struct md_personality *p;
3968	int ret;
3969	spin_lock(&mddev->lock);
3970	p = mddev->pers;
3971	if (p)
3972		ret = sprintf(page, "%s\n", p->name);
3973	else if (mddev->clevel[0])
3974		ret = sprintf(page, "%s\n", mddev->clevel);
3975	else if (mddev->level != LEVEL_NONE)
3976		ret = sprintf(page, "%d\n", mddev->level);
3977	else
3978		ret = 0;
3979	spin_unlock(&mddev->lock);
3980	return ret;
3981}
3982
3983static ssize_t
3984level_store(struct mddev *mddev, const char *buf, size_t len)
3985{
3986	char clevel[16];
3987	ssize_t rv;
3988	size_t slen = len;
3989	struct md_personality *pers, *oldpers;
3990	long level;
3991	void *priv, *oldpriv;
3992	struct md_rdev *rdev;
3993
3994	if (slen == 0 || slen >= sizeof(clevel))
3995		return -EINVAL;
3996
3997	rv = mddev_suspend_and_lock(mddev);
3998	if (rv)
3999		return rv;
4000
4001	if (mddev->pers == NULL) {
4002		memcpy(mddev->clevel, buf, slen);
4003		if (mddev->clevel[slen-1] == '\n')
4004			slen--;
4005		mddev->clevel[slen] = 0;
4006		mddev->level = LEVEL_NONE;
4007		rv = len;
4008		goto out_unlock;
4009	}
4010	rv = -EROFS;
4011	if (!md_is_rdwr(mddev))
4012		goto out_unlock;
4013
4014	/* request to change the personality.  Need to ensure:
4015	 *  - array is not engaged in resync/recovery/reshape
4016	 *  - old personality can be suspended
4017	 *  - new personality can take over the array (->takeover succeeds).
4018	 */
4019
4020	rv = -EBUSY;
4021	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
4022	    mddev->reshape_position != MaxSector ||
4023	    mddev->sysfs_active)
4024		goto out_unlock;
4025
4026	rv = -EINVAL;
4027	if (!mddev->pers->quiesce) {
4028		pr_warn("md: %s: %s does not support online personality change\n",
4029			mdname(mddev), mddev->pers->name);
4030		goto out_unlock;
4031	}
4032
4033	/* Now find the new personality */
4034	memcpy(clevel, buf, slen);
4035	if (clevel[slen-1] == '\n')
4036		slen--;
4037	clevel[slen] = 0;
4038	if (kstrtol(clevel, 10, &level))
4039		level = LEVEL_NONE;
4040
4041	if (request_module("md-%s", clevel) != 0)
4042		request_module("md-level-%s", clevel);
4043	spin_lock(&pers_lock);
4044	pers = find_pers(level, clevel);
4045	if (!pers || !try_module_get(pers->owner)) {
4046		spin_unlock(&pers_lock);
4047		pr_warn("md: personality %s not loaded\n", clevel);
4048		rv = -EINVAL;
4049		goto out_unlock;
4050	}
4051	spin_unlock(&pers_lock);
4052
4053	if (pers == mddev->pers) {
4054		/* Nothing to do! */
4055		module_put(pers->owner);
4056		rv = len;
4057		goto out_unlock;
4058	}
4059	if (!pers->takeover) {
4060		module_put(pers->owner);
4061		pr_warn("md: %s: %s does not support personality takeover\n",
4062			mdname(mddev), clevel);
4063		rv = -EINVAL;
4064		goto out_unlock;
4065	}
4066
4067	rdev_for_each(rdev, mddev)
4068		rdev->new_raid_disk = rdev->raid_disk;
4069
4070	/* ->takeover must set new_* and/or delta_disks
4071	 * if it succeeds, and may set them when it fails.
4072	 */
4073	priv = pers->takeover(mddev);
4074	if (IS_ERR(priv)) {
4075		mddev->new_level = mddev->level;
4076		mddev->new_layout = mddev->layout;
4077		mddev->new_chunk_sectors = mddev->chunk_sectors;
4078		mddev->raid_disks -= mddev->delta_disks;
4079		mddev->delta_disks = 0;
4080		mddev->reshape_backwards = 0;
4081		module_put(pers->owner);
4082		pr_warn("md: %s: %s would not accept array\n",
4083			mdname(mddev), clevel);
4084		rv = PTR_ERR(priv);
4085		goto out_unlock;
4086	}
4087
4088	/* Looks like we have a winner */
4089	mddev_detach(mddev);
4090
4091	spin_lock(&mddev->lock);
4092	oldpers = mddev->pers;
4093	oldpriv = mddev->private;
4094	mddev->pers = pers;
4095	mddev->private = priv;
4096	strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
4097	mddev->level = mddev->new_level;
4098	mddev->layout = mddev->new_layout;
4099	mddev->chunk_sectors = mddev->new_chunk_sectors;
4100	mddev->delta_disks = 0;
4101	mddev->reshape_backwards = 0;
4102	mddev->degraded = 0;
4103	spin_unlock(&mddev->lock);
4104
4105	if (oldpers->sync_request == NULL &&
4106	    mddev->external) {
4107		/* We are converting from a no-redundancy array
4108		 * to a redundancy array and metadata is managed
4109		 * externally so we need to be sure that writes
4110		 * won't block due to a need to transition
4111		 *      clean->dirty
4112		 * until external management is started.
4113		 */
4114		mddev->in_sync = 0;
4115		mddev->safemode_delay = 0;
4116		mddev->safemode = 0;
4117	}
4118
4119	oldpers->free(mddev, oldpriv);
4120
4121	if (oldpers->sync_request == NULL &&
4122	    pers->sync_request != NULL) {
4123		/* need to add the md_redundancy_group */
4124		if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
4125			pr_warn("md: cannot register extra attributes for %s\n",
4126				mdname(mddev));
4127		mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
4128		mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed");
4129		mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded");
4130	}
4131	if (oldpers->sync_request != NULL &&
4132	    pers->sync_request == NULL) {
4133		/* need to remove the md_redundancy_group */
4134		if (mddev->to_remove == NULL)
4135			mddev->to_remove = &md_redundancy_group;
4136	}
4137
4138	module_put(oldpers->owner);
4139
4140	rdev_for_each(rdev, mddev) {
4141		if (rdev->raid_disk < 0)
4142			continue;
4143		if (rdev->new_raid_disk >= mddev->raid_disks)
4144			rdev->new_raid_disk = -1;
4145		if (rdev->new_raid_disk == rdev->raid_disk)
4146			continue;
4147		sysfs_unlink_rdev(mddev, rdev);
4148	}
4149	rdev_for_each(rdev, mddev) {
4150		if (rdev->raid_disk < 0)
4151			continue;
4152		if (rdev->new_raid_disk == rdev->raid_disk)
4153			continue;
4154		rdev->raid_disk = rdev->new_raid_disk;
4155		if (rdev->raid_disk < 0)
4156			clear_bit(In_sync, &rdev->flags);
4157		else {
4158			if (sysfs_link_rdev(mddev, rdev))
4159				pr_warn("md: cannot register rd%d for %s after level change\n",
4160					rdev->raid_disk, mdname(mddev));
4161		}
4162	}
4163
4164	if (pers->sync_request == NULL) {
4165		/* this is now an array without redundancy, so
4166		 * it must always be in_sync
4167		 */
4168		mddev->in_sync = 1;
4169		del_timer_sync(&mddev->safemode_timer);
4170	}
4171	pers->run(mddev);
4172	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
4173	if (!mddev->thread)
4174		md_update_sb(mddev, 1);
4175	sysfs_notify_dirent_safe(mddev->sysfs_level);
4176	md_new_event();
4177	rv = len;
4178out_unlock:
4179	mddev_unlock_and_resume(mddev);
4180	return rv;
4181}
4182
4183static struct md_sysfs_entry md_level =
4184__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
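/*
 * Illustrative usage (assumes md0 currently runs a personality that the
 * raid5 module can take over, e.g. raid0):
 *
 *	echo raid5 > /sys/block/md0/md/level
 *
 * This loads the md-raid5 module if necessary, asks the new personality
 * to ->takeover() the array and, on success, swaps personalities as done
 * in level_store() above.
 */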
4185
4186static ssize_t
4187layout_show(struct mddev *mddev, char *page)
4188{
4189	/* just a number, not meaningful for all levels */
4190	if (mddev->reshape_position != MaxSector &&
4191	    mddev->layout != mddev->new_layout)
4192		return sprintf(page, "%d (%d)\n",
4193			       mddev->new_layout, mddev->layout);
4194	return sprintf(page, "%d\n", mddev->layout);
4195}
4196
4197static ssize_t
4198layout_store(struct mddev *mddev, const char *buf, size_t len)
4199{
4200	unsigned int n;
4201	int err;
4202
4203	err = kstrtouint(buf, 10, &n);
4204	if (err < 0)
4205		return err;
4206	err = mddev_lock(mddev);
4207	if (err)
4208		return err;
4209
4210	if (mddev->pers) {
4211		if (mddev->pers->check_reshape == NULL)
4212			err = -EBUSY;
4213		else if (!md_is_rdwr(mddev))
4214			err = -EROFS;
4215		else {
4216			mddev->new_layout = n;
4217			err = mddev->pers->check_reshape(mddev);
4218			if (err)
4219				mddev->new_layout = mddev->layout;
4220		}
4221	} else {
4222		mddev->new_layout = n;
4223		if (mddev->reshape_position == MaxSector)
4224			mddev->layout = n;
4225	}
4226	mddev_unlock(mddev);
4227	return err ?: len;
4228}
4229static struct md_sysfs_entry md_layout =
4230__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
4231
4232static ssize_t
4233raid_disks_show(struct mddev *mddev, char *page)
4234{
4235	if (mddev->raid_disks == 0)
4236		return 0;
4237	if (mddev->reshape_position != MaxSector &&
4238	    mddev->delta_disks != 0)
4239		return sprintf(page, "%d (%d)\n", mddev->raid_disks,
4240			       mddev->raid_disks - mddev->delta_disks);
4241	return sprintf(page, "%d\n", mddev->raid_disks);
4242}
4243
4244static int update_raid_disks(struct mddev *mddev, int raid_disks);
4245
4246static ssize_t
4247raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
4248{
4249	unsigned int n;
4250	int err;
4251
4252	err = kstrtouint(buf, 10, &n);
4253	if (err < 0)
4254		return err;
4255
4256	err = mddev_lock(mddev);
4257	if (err)
4258		return err;
4259	if (mddev->pers)
4260		err = update_raid_disks(mddev, n);
4261	else if (mddev->reshape_position != MaxSector) {
4262		struct md_rdev *rdev;
4263		int olddisks = mddev->raid_disks - mddev->delta_disks;
4264
4265		err = -EINVAL;
4266		rdev_for_each(rdev, mddev) {
4267			if (olddisks < n &&
4268			    rdev->data_offset < rdev->new_data_offset)
4269				goto out_unlock;
4270			if (olddisks > n &&
4271			    rdev->data_offset > rdev->new_data_offset)
4272				goto out_unlock;
4273		}
4274		err = 0;
4275		mddev->delta_disks = n - olddisks;
4276		mddev->raid_disks = n;
4277		mddev->reshape_backwards = (mddev->delta_disks < 0);
4278	} else
4279		mddev->raid_disks = n;
4280out_unlock:
4281	mddev_unlock(mddev);
4282	return err ? err : len;
4283}
4284static struct md_sysfs_entry md_raid_disks =
4285__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
4286
4287static ssize_t
4288uuid_show(struct mddev *mddev, char *page)
4289{
4290	return sprintf(page, "%pU\n", mddev->uuid);
4291}
4292static struct md_sysfs_entry md_uuid =
4293__ATTR(uuid, S_IRUGO, uuid_show, NULL);
4294
4295static ssize_t
4296chunk_size_show(struct mddev *mddev, char *page)
4297{
4298	if (mddev->reshape_position != MaxSector &&
4299	    mddev->chunk_sectors != mddev->new_chunk_sectors)
4300		return sprintf(page, "%d (%d)\n",
4301			       mddev->new_chunk_sectors << 9,
4302			       mddev->chunk_sectors << 9);
4303	return sprintf(page, "%d\n", mddev->chunk_sectors << 9);
4304}
4305
4306static ssize_t
4307chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
4308{
4309	unsigned long n;
4310	int err;
4311
4312	err = kstrtoul(buf, 10, &n);
4313	if (err < 0)
4314		return err;
4315
4316	err = mddev_lock(mddev);
4317	if (err)
4318		return err;
4319	if (mddev->pers) {
4320		if (mddev->pers->check_reshape == NULL)
4321			err = -EBUSY;
4322		else if (!md_is_rdwr(mddev))
4323			err = -EROFS;
4324		else {
4325			mddev->new_chunk_sectors = n >> 9;
4326			err = mddev->pers->check_reshape(mddev);
4327			if (err)
4328				mddev->new_chunk_sectors = mddev->chunk_sectors;
4329		}
4330	} else {
4331		mddev->new_chunk_sectors = n >> 9;
4332		if (mddev->reshape_position == MaxSector)
4333			mddev->chunk_sectors = n >> 9;
4334	}
4335	mddev_unlock(mddev);
4336	return err ?: len;
4337}
4338static struct md_sysfs_entry md_chunk_size =
4339__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
4340
4341static ssize_t
4342resync_start_show(struct mddev *mddev, char *page)
4343{
4344	if (mddev->recovery_cp == MaxSector)
4345		return sprintf(page, "none\n");
4346	return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
4347}
4348
4349static ssize_t
4350resync_start_store(struct mddev *mddev, const char *buf, size_t len)
4351{
4352	unsigned long long n;
4353	int err;
4354
4355	if (cmd_match(buf, "none"))
4356		n = MaxSector;
4357	else {
4358		err = kstrtoull(buf, 10, &n);
4359		if (err < 0)
4360			return err;
4361		if (n != (sector_t)n)
4362			return -EINVAL;
4363	}
4364
4365	err = mddev_lock(mddev);
4366	if (err)
4367		return err;
4368	if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
4369		err = -EBUSY;
4370
4371	if (!err) {
4372		mddev->recovery_cp = n;
4373		if (mddev->pers)
4374			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
4375	}
4376	mddev_unlock(mddev);
4377	return err ?: len;
4378}
4379static struct md_sysfs_entry md_resync_start =
4380__ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR,
4381		resync_start_show, resync_start_store);
4382
4383/*
4384 * The array state can be:
4385 *
4386 * clear
4387 *     No devices, no size, no level
4388 *     Equivalent to STOP_ARRAY ioctl
4389 * inactive
4390 *     May have some settings, but array is not active
4391 *        all IO results in error
4392 *     When written, doesn't tear down array, but just stops it
4393 * suspended (not supported yet)
4394 *     All IO requests will block. The array can be reconfigured.
4395 *     Writing this, if accepted, will block until array is quiescent
4396 * readonly
4397 *     no resync can happen.  no superblocks get written.
4398 *     write requests fail
4399 * read-auto
4400 *     like readonly, but behaves like 'clean' on a write request.
4401 *
4402 * clean - no pending writes, but otherwise active.
4403 *     When written to inactive array, starts without resync
4404 *     If a write request arrives then
4405 *       if metadata is known, mark 'dirty' and switch to 'active'.
4406 *       if not known, block and switch to write-pending
4407 *     If written to an active array that has pending writes, then fails.
4408 * active
4409 *     fully active: IO and resync can be happening.
4410 *     When written to inactive array, starts with resync
4411 *
4412 * write-pending
4413 *     clean, but writes are blocked waiting for 'active' to be written.
4414 *
4415 * active-idle
4416 *     like active, but no writes have been seen for a while (100msec).
4417 *
4418 * broken
4419 *     Array is failed. It's useful because mounted arrays aren't stopped
4420 *     when array is failed, so this state will at least alert the user that
4421 *     something is wrong.
4422 */
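/*
 * Illustrative usage (path assumes an array named md0):
 *
 *	cat /sys/block/md0/md/array_state	# e.g. "clean" or "active"
 *	echo readonly > /sys/block/md0/md/array_state
 *
 * See array_state_store() below for which transitions are accepted.
 */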
4423enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
4424		   write_pending, active_idle, broken, bad_word};
4425static char *array_states[] = {
4426	"clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
4427	"write-pending", "active-idle", "broken", NULL };
4428
4429static int match_word(const char *word, char **list)
4430{
4431	int n;
4432	for (n=0; list[n]; n++)
4433		if (cmd_match(word, list[n]))
4434			break;
4435	return n;
4436}
4437
4438static ssize_t
4439array_state_show(struct mddev *mddev, char *page)
4440{
4441	enum array_state st = inactive;
4442
4443	if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) {
4444		switch(mddev->ro) {
4445		case MD_RDONLY:
4446			st = readonly;
4447			break;
4448		case MD_AUTO_READ:
4449			st = read_auto;
4450			break;
4451		case MD_RDWR:
4452			spin_lock(&mddev->lock);
4453			if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
4454				st = write_pending;
4455			else if (mddev->in_sync)
4456				st = clean;
4457			else if (mddev->safemode)
4458				st = active_idle;
4459			else
4460				st = active;
4461			spin_unlock(&mddev->lock);
4462		}
4463
4464		if (test_bit(MD_BROKEN, &mddev->flags) && st == clean)
4465			st = broken;
4466	} else {
4467		if (list_empty(&mddev->disks) &&
4468		    mddev->raid_disks == 0 &&
4469		    mddev->dev_sectors == 0)
4470			st = clear;
4471		else
4472			st = inactive;
4473	}
4474	return sprintf(page, "%s\n", array_states[st]);
4475}
4476
4477static int do_md_stop(struct mddev *mddev, int ro);
4478static int md_set_readonly(struct mddev *mddev);
4479static int restart_array(struct mddev *mddev);
4480
4481static ssize_t
4482array_state_store(struct mddev *mddev, const char *buf, size_t len)
4483{
4484	int err = 0;
4485	enum array_state st = match_word(buf, array_states);
4486
4487	/* No lock dependent actions */
4488	switch (st) {
4489	case suspended:		/* not supported yet */
4490	case write_pending:	/* cannot be set */
4491	case active_idle:	/* cannot be set */
4492	case broken:		/* cannot be set */
4493	case bad_word:
4494		return -EINVAL;
4495	case clear:
4496	case readonly:
4497	case inactive:
4498	case read_auto:
4499		if (!mddev->pers || !md_is_rdwr(mddev))
4500			break;
4501		/* sysfs writes do not open the mddev, so opener count should be 0 */
4502		err = mddev_set_closing_and_sync_blockdev(mddev, 0);
4503		if (err)
4504			return err;
4505		break;
4506	default:
4507		break;
4508	}
4509
4510	if (mddev->pers && (st == active || st == clean) &&
4511	    mddev->ro != MD_RDONLY) {
4512		/* don't take reconfig_mutex when toggling between
4513		 * clean and active
4514		 */
4515		spin_lock(&mddev->lock);
4516		if (st == active) {
4517			restart_array(mddev);
4518			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
4519			md_wakeup_thread(mddev->thread);
4520			wake_up(&mddev->sb_wait);
4521		} else /* st == clean */ {
4522			restart_array(mddev);
4523			if (!set_in_sync(mddev))
4524				err = -EBUSY;
4525		}
4526		if (!err)
4527			sysfs_notify_dirent_safe(mddev->sysfs_state);
4528		spin_unlock(&mddev->lock);
4529		return err ?: len;
4530	}
4531	err = mddev_lock(mddev);
4532	if (err)
4533		return err;
4534
4535	switch (st) {
4536	case inactive:
4537		/* stop an active array, return 0 otherwise */
4538		if (mddev->pers)
4539			err = do_md_stop(mddev, 2);
4540		break;
4541	case clear:
4542		err = do_md_stop(mddev, 0);
4543		break;
4544	case readonly:
4545		if (mddev->pers)
4546			err = md_set_readonly(mddev);
4547		else {
4548			mddev->ro = MD_RDONLY;
4549			set_disk_ro(mddev->gendisk, 1);
4550			err = do_md_run(mddev);
4551		}
4552		break;
4553	case read_auto:
4554		if (mddev->pers) {
4555			if (md_is_rdwr(mddev))
4556				err = md_set_readonly(mddev);
4557			else if (mddev->ro == MD_RDONLY)
4558				err = restart_array(mddev);
4559			if (err == 0) {
4560				mddev->ro = MD_AUTO_READ;
4561				set_disk_ro(mddev->gendisk, 0);
4562			}
4563		} else {
4564			mddev->ro = MD_AUTO_READ;
4565			err = do_md_run(mddev);
4566		}
4567		break;
4568	case clean:
4569		if (mddev->pers) {
4570			err = restart_array(mddev);
4571			if (err)
4572				break;
4573			spin_lock(&mddev->lock);
4574			if (!set_in_sync(mddev))
4575				err = -EBUSY;
4576			spin_unlock(&mddev->lock);
4577		} else
4578			err = -EINVAL;
4579		break;
4580	case active:
4581		if (mddev->pers) {
4582			err = restart_array(mddev);
4583			if (err)
4584				break;
4585			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
4586			wake_up(&mddev->sb_wait);
4587			err = 0;
4588		} else {
4589			mddev->ro = MD_RDWR;
4590			set_disk_ro(mddev->gendisk, 0);
4591			err = do_md_run(mddev);
4592		}
4593		break;
4594	default:
4595		err = -EINVAL;
4596		break;
4597	}
4598
4599	if (!err) {
4600		if (mddev->hold_active == UNTIL_IOCTL)
4601			mddev->hold_active = 0;
4602		sysfs_notify_dirent_safe(mddev->sysfs_state);
4603	}
4604	mddev_unlock(mddev);
4605
4606	if (st == readonly || st == read_auto || st == inactive ||
4607	    (err && st == clear))
4608		clear_bit(MD_CLOSING, &mddev->flags);
4609
4610	return err ?: len;
4611}
4612static struct md_sysfs_entry md_array_state =
4613__ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
4614
4615static ssize_t
4616max_corrected_read_errors_show(struct mddev *mddev, char *page) {
4617	return sprintf(page, "%d\n",
4618		       atomic_read(&mddev->max_corr_read_errors));
4619}
4620
4621static ssize_t
4622max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len)
4623{
4624	unsigned int n;
4625	int rv;
4626
4627	rv = kstrtouint(buf, 10, &n);
4628	if (rv < 0)
4629		return rv;
4630	if (n > INT_MAX)
4631		return -EINVAL;
4632	atomic_set(&mddev->max_corr_read_errors, n);
4633	return len;
4634}
4635
4636static struct md_sysfs_entry max_corr_read_errors =
4637__ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
4638	max_corrected_read_errors_store);
4639
4640static ssize_t
4641null_show(struct mddev *mddev, char *page)
4642{
4643	return -EINVAL;
4644}
4645
4646static ssize_t
4647new_dev_store(struct mddev *mddev, const char *buf, size_t len)
4648{
4649	/* buf must be "%d:%d" (optionally followed by a newline),
4650	 * giving the major and minor numbers.
4651	 * The new device is added to the array.  If the array has a
4652	 * persistent superblock, we read the superblock to initialise
4653	 * info and check validity.  Otherwise, the only checking done
4654	 * is that in bind_rdev_to_array, which mainly checks size.
4655	 */
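	/* For example (illustrative device numbers), writing "8:16" to the
	 * new_dev attribute attempts to add the block device with major 8,
	 * minor 16 (typically /dev/sdb) to this array.
	 */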
4656	char *e;
4657	int major = simple_strtoul(buf, &e, 10);
4658	int minor;
4659	dev_t dev;
4660	struct md_rdev *rdev;
4661	int err;
4662
4663	if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
4664		return -EINVAL;
4665	minor = simple_strtoul(e+1, &e, 10);
4666	if (*e && *e != '\n')
4667		return -EINVAL;
4668	dev = MKDEV(major, minor);
4669	if (major != MAJOR(dev) ||
4670	    minor != MINOR(dev))
4671		return -EOVERFLOW;
4672
4673	err = mddev_suspend_and_lock(mddev);
4674	if (err)
4675		return err;
4676	if (mddev->persistent) {
4677		rdev = md_import_device(dev, mddev->major_version,
4678					mddev->minor_version);
4679		if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
4680			struct md_rdev *rdev0
4681				= list_entry(mddev->disks.next,
4682					     struct md_rdev, same_set);
4683			err = super_types[mddev->major_version]
4684				.load_super(rdev, rdev0, mddev->minor_version);
4685			if (err < 0)
4686				goto out;
4687		}
4688	} else if (mddev->external)
4689		rdev = md_import_device(dev, -2, -1);
4690	else
4691		rdev = md_import_device(dev, -1, -1);
4692
4693	if (IS_ERR(rdev)) {
4694		mddev_unlock_and_resume(mddev);
4695		return PTR_ERR(rdev);
4696	}
4697	err = bind_rdev_to_array(rdev, mddev);
4698 out:
4699	if (err)
4700		export_rdev(rdev, mddev);
4701	mddev_unlock_and_resume(mddev);
4702	if (!err)
4703		md_new_event();
4704	return err ? err : len;
4705}
4706
4707static struct md_sysfs_entry md_new_device =
4708__ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
4709
4710static ssize_t
4711bitmap_store(struct mddev *mddev, const char *buf, size_t len)
4712{
4713	char *end;
4714	unsigned long chunk, end_chunk;
4715	int err;
4716
4717	err = mddev_lock(mddev);
4718	if (err)
4719		return err;
4720	if (!mddev->bitmap)
4721		goto out;
4722	/* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
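	/* For example (illustrative values), writing "100 200-205" marks
	 * chunk 100 and chunks 200 through 205 dirty in the in-memory
	 * bitmap.
	 */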
4723	while (*buf) {
4724		chunk = end_chunk = simple_strtoul(buf, &end, 0);
4725		if (buf == end) break;
4726		if (*end == '-') { /* range */
4727			buf = end + 1;
4728			end_chunk = simple_strtoul(buf, &end, 0);
4729			if (buf == end) break;
4730		}
4731		if (*end && !isspace(*end)) break;
4732		md_bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
4733		buf = skip_spaces(end);
4734	}
4735	md_bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
4736out:
4737	mddev_unlock(mddev);
4738	return len;
4739}
4740
4741static struct md_sysfs_entry md_bitmap =
4742__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
4743
4744static ssize_t
4745size_show(struct mddev *mddev, char *page)
4746{
4747	return sprintf(page, "%llu\n",
4748		(unsigned long long)mddev->dev_sectors / 2);
4749}
4750
4751static int update_size(struct mddev *mddev, sector_t num_sectors);
4752
4753static ssize_t
4754size_store(struct mddev *mddev, const char *buf, size_t len)
4755{
4756	/* If array is inactive, we can reduce the component size, but
4757	 * not increase it (except from 0).
4758	 * If array is active, we can try an on-line resize
4759	 */
4760	sector_t sectors;
4761	int err = strict_blocks_to_sectors(buf, &sectors);
4762
4763	if (err < 0)
4764		return err;
4765	err = mddev_lock(mddev);
4766	if (err)
4767		return err;
4768	if (mddev->pers) {
4769		err = update_size(mddev, sectors);
4770		if (err == 0)
4771			md_update_sb(mddev, 1);
4772	} else {
4773		if (mddev->dev_sectors == 0 ||
4774		    mddev->dev_sectors > sectors)
4775			mddev->dev_sectors = sectors;
4776		else
4777			err = -ENOSPC;
4778	}
4779	mddev_unlock(mddev);
4780	return err ? err : len;
4781}
4782
4783static struct md_sysfs_entry md_size =
4784__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
4785
4786/* Metadata version.
4787 * This is one of
4788 *   'none' for arrays with no metadata (good luck...)
4789 *   'external' for arrays with externally managed metadata,
4790 * or N.M for internally known formats
4791 */
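/*
 * Illustrative usage (path assumes an inactive array named md0):
 *
 *	echo 1.2 > /sys/block/md0/md/metadata_version
 *	echo external:imsm > /sys/block/md0/md/metadata_version
 *
 * The first selects in-kernel v1.2 superblocks; the second marks the
 * metadata as externally managed, as parsed by metadata_store() below.
 */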
4792static ssize_t
4793metadata_show(struct mddev *mddev, char *page)
4794{
4795	if (mddev->persistent)
4796		return sprintf(page, "%d.%d\n",
4797			       mddev->major_version, mddev->minor_version);
4798	else if (mddev->external)
4799		return sprintf(page, "external:%s\n", mddev->metadata_type);
4800	else
4801		return sprintf(page, "none\n");
4802}
4803
4804static ssize_t
4805metadata_store(struct mddev *mddev, const char *buf, size_t len)
4806{
4807	int major, minor;
4808	char *e;
4809	int err;
4810	/* Changing the details of 'external' metadata is
4811	 * always permitted.  Otherwise there must be
4812	 * no devices attached to the array.
4813	 */
4814
4815	err = mddev_lock(mddev);
4816	if (err)
4817		return err;
4818	err = -EBUSY;
4819	if (mddev->external && strncmp(buf, "external:", 9) == 0)
4820		;
4821	else if (!list_empty(&mddev->disks))
4822		goto out_unlock;
4823
4824	err = 0;
4825	if (cmd_match(buf, "none")) {
4826		mddev->persistent = 0;
4827		mddev->external = 0;
4828		mddev->major_version = 0;
4829		mddev->minor_version = 90;
4830		goto out_unlock;
4831	}
4832	if (strncmp(buf, "external:", 9) == 0) {
4833		size_t namelen = len-9;
4834		if (namelen >= sizeof(mddev->metadata_type))
4835			namelen = sizeof(mddev->metadata_type)-1;
4836		memcpy(mddev->metadata_type, buf+9, namelen);
4837		mddev->metadata_type[namelen] = 0;
4838		if (namelen && mddev->metadata_type[namelen-1] == '\n')
4839			mddev->metadata_type[--namelen] = 0;
4840		mddev->persistent = 0;
4841		mddev->external = 1;
4842		mddev->major_version = 0;
4843		mddev->minor_version = 90;
4844		goto out_unlock;
4845	}
4846	major = simple_strtoul(buf, &e, 10);
4847	err = -EINVAL;
4848	if (e==buf || *e != '.')
4849		goto out_unlock;
4850	buf = e+1;
4851	minor = simple_strtoul(buf, &e, 10);
4852	if (e==buf || (*e && *e != '\n') )
4853		goto out_unlock;
4854	err = -ENOENT;
4855	if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
4856		goto out_unlock;
4857	mddev->major_version = major;
4858	mddev->minor_version = minor;
4859	mddev->persistent = 1;
4860	mddev->external = 0;
4861	err = 0;
4862out_unlock:
4863	mddev_unlock(mddev);
4864	return err ?: len;
4865}
4866
4867static struct md_sysfs_entry md_metadata =
4868__ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
4869
4870static ssize_t
4871action_show(struct mddev *mddev, char *page)
4872{
4873	char *type = "idle";
4874	unsigned long recovery = mddev->recovery;
4875	if (test_bit(MD_RECOVERY_FROZEN, &recovery))
4876		type = "frozen";
4877	else if (test_bit(MD_RECOVERY_RUNNING, &recovery) ||
4878	    (md_is_rdwr(mddev) && test_bit(MD_RECOVERY_NEEDED, &recovery))) {
4879		if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
4880			type = "reshape";
4881		else if (test_bit(MD_RECOVERY_SYNC, &recovery)) {
4882			if (!test_bit(MD_RECOVERY_REQUESTED, &recovery))
4883				type = "resync";
4884			else if (test_bit(MD_RECOVERY_CHECK, &recovery))
4885				type = "check";
4886			else
4887				type = "repair";
4888		} else if (test_bit(MD_RECOVERY_RECOVER, &recovery))
4889			type = "recover";
4890		else if (mddev->reshape_position != MaxSector)
4891			type = "reshape";
4892	}
4893	return sprintf(page, "%s\n", type);
4894}
4895
4896/**
4897 * stop_sync_thread() - wait for sync_thread to stop if it's running.
4898 * @mddev:	the array.
4899 * @locked:	if set, reconfig_mutex will still be held after this function
4900 *		returns; if not set, reconfig_mutex will be released after this
4901 *		function returns.
4902 * @check_seq:	if set, only wait for the currently running sync_thread to
4903 *		stop; note that a new sync_thread can still start.
4904 */
4905static void stop_sync_thread(struct mddev *mddev, bool locked, bool check_seq)
4906{
4907	int sync_seq;
4908
4909	if (check_seq)
4910		sync_seq = atomic_read(&mddev->sync_seq);
4911
4912	if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
4913		if (!locked)
4914			mddev_unlock(mddev);
4915		return;
4916	}
4917
4918	mddev_unlock(mddev);
4919
4920	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4921	/*
4922	 * Thread might be blocked waiting for metadata update which will now
4923	 * never happen
4924	 */
4925	md_wakeup_thread_directly(mddev->sync_thread);
4926	if (work_pending(&mddev->sync_work))
4927		flush_work(&mddev->sync_work);
4928
4929	wait_event(resync_wait,
4930		   !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
4931		   (check_seq && sync_seq != atomic_read(&mddev->sync_seq)));
4932
4933	if (locked)
4934		mddev_lock_nointr(mddev);
4935}
4936
4937void md_idle_sync_thread(struct mddev *mddev)
4938{
4939	lockdep_assert_held(&mddev->reconfig_mutex);
4940
4941	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4942	stop_sync_thread(mddev, true, true);
4943}
4944EXPORT_SYMBOL_GPL(md_idle_sync_thread);
4945
4946void md_frozen_sync_thread(struct mddev *mddev)
4947{
4948	lockdep_assert_held(&mddev->reconfig_mutex);
4949
4950	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4951	stop_sync_thread(mddev, true, false);
4952}
4953EXPORT_SYMBOL_GPL(md_frozen_sync_thread);
4954
4955void md_unfrozen_sync_thread(struct mddev *mddev)
4956{
4957	lockdep_assert_held(&mddev->reconfig_mutex);
4958
4959	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4960	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4961	md_wakeup_thread(mddev->thread);
4962	sysfs_notify_dirent_safe(mddev->sysfs_action);
4963}
4964EXPORT_SYMBOL_GPL(md_unfrozen_sync_thread);
4965
4966static void idle_sync_thread(struct mddev *mddev)
4967{
4968	mutex_lock(&mddev->sync_mutex);
4969	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4970
4971	if (mddev_lock(mddev)) {
4972		mutex_unlock(&mddev->sync_mutex);
4973		return;
4974	}
4975
4976	stop_sync_thread(mddev, false, true);
4977	mutex_unlock(&mddev->sync_mutex);
4978}
4979
4980static void frozen_sync_thread(struct mddev *mddev)
4981{
4982	mutex_lock(&mddev->sync_mutex);
4983	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4984
4985	if (mddev_lock(mddev)) {
4986		mutex_unlock(&mddev->sync_mutex);
4987		return;
4988	}
4989
4990	stop_sync_thread(mddev, false, false);
4991	mutex_unlock(&mddev->sync_mutex);
4992}
4993
4994static ssize_t
4995action_store(struct mddev *mddev, const char *page, size_t len)
4996{
4997	if (!mddev->pers || !mddev->pers->sync_request)
4998		return -EINVAL;
4999
5000
5001	if (cmd_match(page, "idle"))
5002		idle_sync_thread(mddev);
5003	else if (cmd_match(page, "frozen"))
5004		frozen_sync_thread(mddev);
5005	else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5006		return -EBUSY;
5007	else if (cmd_match(page, "resync"))
5008		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5009	else if (cmd_match(page, "recover")) {
5010		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5011		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5012	} else if (cmd_match(page, "reshape")) {
5013		int err;
5014		if (mddev->pers->start_reshape == NULL)
5015			return -EINVAL;
5016		err = mddev_lock(mddev);
5017		if (!err) {
5018			if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
5019				err =  -EBUSY;
5020			} else if (mddev->reshape_position == MaxSector ||
5021				   mddev->pers->check_reshape == NULL ||
5022				   mddev->pers->check_reshape(mddev)) {
5023				clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5024				err = mddev->pers->start_reshape(mddev);
5025			} else {
5026				/*
5027				 * If reshape is still in progress, and
5028				 * md_check_recovery() can continue to reshape,
5029				 * don't restart reshape because data can be
5030				 * corrupted for raid456.
5031				 */
5032				clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5033			}
5034			mddev_unlock(mddev);
5035		}
5036		if (err)
5037			return err;
5038		sysfs_notify_dirent_safe(mddev->sysfs_degraded);
5039	} else {
5040		if (cmd_match(page, "check"))
5041			set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
5042		else if (!cmd_match(page, "repair"))
5043			return -EINVAL;
5044		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5045		set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
5046		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
5047	}
5048	if (mddev->ro == MD_AUTO_READ) {
5049		/* A write to sync_action is enough to justify
5050		 * canceling read-auto mode
5051		 */
5052		flush_work(&mddev->sync_work);
5053		mddev->ro = MD_RDWR;
5054		md_wakeup_thread(mddev->sync_thread);
5055	}
5056	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5057	md_wakeup_thread(mddev->thread);
5058	sysfs_notify_dirent_safe(mddev->sysfs_action);
5059	return len;
5060}
5061
5062static struct md_sysfs_entry md_scan_mode =
5063__ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
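/*
 * Illustrative usage (path assumes an array named md0):
 *
 *	echo check > /sys/block/md0/md/sync_action	# start a scrub
 *	cat /sys/block/md0/md/sync_action		# e.g. "check" or "idle"
 *	echo idle > /sys/block/md0/md/sync_action	# stop the current sync
 *
 * See action_store() above for the full set of accepted keywords.
 */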
5064
5065static ssize_t
5066last_sync_action_show(struct mddev *mddev, char *page)
5067{
5068	return sprintf(page, "%s\n", mddev->last_sync_action);
5069}
5070
5071static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action);
5072
5073static ssize_t
5074mismatch_cnt_show(struct mddev *mddev, char *page)
5075{
5076	return sprintf(page, "%llu\n",
5077		       (unsigned long long)
5078		       atomic64_read(&mddev->resync_mismatches));
5079}
5080
5081static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
5082
5083static ssize_t
5084sync_min_show(struct mddev *mddev, char *page)
5085{
5086	return sprintf(page, "%d (%s)\n", speed_min(mddev),
5087		       mddev->sync_speed_min ? "local": "system");
5088}
5089
5090static ssize_t
5091sync_min_store(struct mddev *mddev, const char *buf, size_t len)
5092{
5093	unsigned int min;
5094	int rv;
5095
5096	if (strncmp(buf, "system", 6)==0) {
5097		min = 0;
5098	} else {
5099		rv = kstrtouint(buf, 10, &min);
5100		if (rv < 0)
5101			return rv;
5102		if (min == 0)
5103			return -EINVAL;
5104	}
5105	mddev->sync_speed_min = min;
5106	return len;
5107}
5108
5109static struct md_sysfs_entry md_sync_min =
5110__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
5111
5112static ssize_t
5113sync_max_show(struct mddev *mddev, char *page)
5114{
5115	return sprintf(page, "%d (%s)\n", speed_max(mddev),
5116		       mddev->sync_speed_max ? "local": "system");
5117}
5118
5119static ssize_t
5120sync_max_store(struct mddev *mddev, const char *buf, size_t len)
5121{
5122	unsigned int max;
5123	int rv;
5124
5125	if (strncmp(buf, "system", 6)==0) {
5126		max = 0;
5127	} else {
5128		rv = kstrtouint(buf, 10, &max);
5129		if (rv < 0)
5130			return rv;
5131		if (max == 0)
5132			return -EINVAL;
5133	}
5134	mddev->sync_speed_max = max;
5135	return len;
5136}
5137
5138static struct md_sysfs_entry md_sync_max =
5139__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
5140
5141static ssize_t
5142degraded_show(struct mddev *mddev, char *page)
5143{
5144	return sprintf(page, "%d\n", mddev->degraded);
5145}
5146static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
5147
5148static ssize_t
5149sync_force_parallel_show(struct mddev *mddev, char *page)
5150{
5151	return sprintf(page, "%d\n", mddev->parallel_resync);
5152}
5153
5154static ssize_t
5155sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len)
5156{
5157	long n;
5158
5159	if (kstrtol(buf, 10, &n))
5160		return -EINVAL;
5161
5162	if (n != 0 && n != 1)
5163		return -EINVAL;
5164
5165	mddev->parallel_resync = n;
5166
5167	if (mddev->sync_thread)
5168		wake_up(&resync_wait);
5169
5170	return len;
5171}
5172
5173/* force parallel resync, even with shared block devices */
5174static struct md_sysfs_entry md_sync_force_parallel =
5175__ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
5176       sync_force_parallel_show, sync_force_parallel_store);
5177
5178static ssize_t
5179sync_speed_show(struct mddev *mddev, char *page)
5180{
5181	unsigned long resync, dt, db;
5182	if (mddev->curr_resync == MD_RESYNC_NONE)
5183		return sprintf(page, "none\n");
5184	resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
5185	dt = (jiffies - mddev->resync_mark) / HZ;
5186	if (!dt) dt++;
5187	db = resync - mddev->resync_mark_cnt;
5188	return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
5189}
5190
5191static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
5192
5193static ssize_t
5194sync_completed_show(struct mddev *mddev, char *page)
5195{
5196	unsigned long long max_sectors, resync;
5197
5198	if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5199		return sprintf(page, "none\n");
5200
5201	if (mddev->curr_resync == MD_RESYNC_YIELDED ||
5202	    mddev->curr_resync == MD_RESYNC_DELAYED)
5203		return sprintf(page, "delayed\n");
5204
5205	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
5206	    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
5207		max_sectors = mddev->resync_max_sectors;
5208	else
5209		max_sectors = mddev->dev_sectors;
5210
5211	resync = mddev->curr_resync_completed;
5212	return sprintf(page, "%llu / %llu\n", resync, max_sectors);
5213}
5214
5215static struct md_sysfs_entry md_sync_completed =
5216	__ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL);
5217
5218static ssize_t
5219min_sync_show(struct mddev *mddev, char *page)
5220{
5221	return sprintf(page, "%llu\n",
5222		       (unsigned long long)mddev->resync_min);
5223}
5224static ssize_t
5225min_sync_store(struct mddev *mddev, const char *buf, size_t len)
5226{
5227	unsigned long long min;
5228	int err;
5229
5230	if (kstrtoull(buf, 10, &min))
5231		return -EINVAL;
5232
5233	spin_lock(&mddev->lock);
5234	err = -EINVAL;
5235	if (min > mddev->resync_max)
5236		goto out_unlock;
5237
5238	err = -EBUSY;
5239	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5240		goto out_unlock;
5241
5242	/* Round down to multiple of 4K for safety */
5243	mddev->resync_min = round_down(min, 8);
5244	err = 0;
5245
5246out_unlock:
5247	spin_unlock(&mddev->lock);
5248	return err ?: len;
5249}
5250
5251static struct md_sysfs_entry md_min_sync =
5252__ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
5253
5254static ssize_t
5255max_sync_show(struct mddev *mddev, char *page)
5256{
5257	if (mddev->resync_max == MaxSector)
5258		return sprintf(page, "max\n");
5259	else
5260		return sprintf(page, "%llu\n",
5261			       (unsigned long long)mddev->resync_max);
5262}
5263static ssize_t
5264max_sync_store(struct mddev *mddev, const char *buf, size_t len)
5265{
5266	int err;
5267	spin_lock(&mddev->lock);
5268	if (strncmp(buf, "max", 3) == 0)
5269		mddev->resync_max = MaxSector;
5270	else {
5271		unsigned long long max;
5272		int chunk;
5273
5274		err = -EINVAL;
5275		if (kstrtoull(buf, 10, &max))
5276			goto out_unlock;
5277		if (max < mddev->resync_min)
5278			goto out_unlock;
5279
5280		err = -EBUSY;
5281		if (max < mddev->resync_max && md_is_rdwr(mddev) &&
5282		    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5283			goto out_unlock;
5284
5285		/* Must be a multiple of chunk_size */
5286		chunk = mddev->chunk_sectors;
5287		if (chunk) {
5288			sector_t temp = max;
5289
5290			err = -EINVAL;
5291			if (sector_div(temp, chunk))
5292				goto out_unlock;
5293		}
5294		mddev->resync_max = max;
5295	}
5296	wake_up(&mddev->recovery_wait);
5297	err = 0;
5298out_unlock:
5299	spin_unlock(&mddev->lock);
5300	return err ?: len;
5301}
5302
5303static struct md_sysfs_entry md_max_sync =
5304__ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
5305
5306static ssize_t
5307suspend_lo_show(struct mddev *mddev, char *page)
5308{
5309	return sprintf(page, "%llu\n",
5310		       (unsigned long long)READ_ONCE(mddev->suspend_lo));
5311}
5312
5313static ssize_t
5314suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
5315{
5316	unsigned long long new;
5317	int err;
5318
5319	err = kstrtoull(buf, 10, &new);
5320	if (err < 0)
5321		return err;
5322	if (new != (sector_t)new)
5323		return -EINVAL;
5324
5325	err = mddev_suspend(mddev, true);
5326	if (err)
5327		return err;
5328
5329	WRITE_ONCE(mddev->suspend_lo, new);
5330	mddev_resume(mddev);
5331
5332	return len;
5333}
5334static struct md_sysfs_entry md_suspend_lo =
5335__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
5336
5337static ssize_t
5338suspend_hi_show(struct mddev *mddev, char *page)
5339{
5340	return sprintf(page, "%llu\n",
5341		       (unsigned long long)READ_ONCE(mddev->suspend_hi));
5342}
5343
5344static ssize_t
5345suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
5346{
5347	unsigned long long new;
5348	int err;
5349
5350	err = kstrtoull(buf, 10, &new);
5351	if (err < 0)
5352		return err;
5353	if (new != (sector_t)new)
5354		return -EINVAL;
5355
5356	err = mddev_suspend(mddev, true);
5357	if (err)
5358		return err;
5359
5360	WRITE_ONCE(mddev->suspend_hi, new);
5361	mddev_resume(mddev);
5362
5363	return len;
5364}
5365static struct md_sysfs_entry md_suspend_hi =
5366__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
5367
5368static ssize_t
5369reshape_position_show(struct mddev *mddev, char *page)
5370{
5371	if (mddev->reshape_position != MaxSector)
5372		return sprintf(page, "%llu\n",
5373			       (unsigned long long)mddev->reshape_position);
5374	strcpy(page, "none\n");
5375	return 5;
5376}
5377
5378static ssize_t
5379reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
5380{
5381	struct md_rdev *rdev;
5382	unsigned long long new;
5383	int err;
5384
5385	err = kstrtoull(buf, 10, &new);
5386	if (err < 0)
5387		return err;
5388	if (new != (sector_t)new)
5389		return -EINVAL;
5390	err = mddev_lock(mddev);
5391	if (err)
5392		return err;
5393	err = -EBUSY;
5394	if (mddev->pers)
5395		goto unlock;
5396	mddev->reshape_position = new;
5397	mddev->delta_disks = 0;
5398	mddev->reshape_backwards = 0;
5399	mddev->new_level = mddev->level;
5400	mddev->new_layout = mddev->layout;
5401	mddev->new_chunk_sectors = mddev->chunk_sectors;
5402	rdev_for_each(rdev, mddev)
5403		rdev->new_data_offset = rdev->data_offset;
5404	err = 0;
5405unlock:
5406	mddev_unlock(mddev);
5407	return err ?: len;
5408}
5409
5410static struct md_sysfs_entry md_reshape_position =
5411__ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
5412       reshape_position_store);
5413
5414static ssize_t
5415reshape_direction_show(struct mddev *mddev, char *page)
5416{
5417	return sprintf(page, "%s\n",
5418		       mddev->reshape_backwards ? "backwards" : "forwards");
5419}
5420
5421static ssize_t
5422reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
5423{
5424	int backwards = 0;
5425	int err;
5426
5427	if (cmd_match(buf, "forwards"))
5428		backwards = 0;
5429	else if (cmd_match(buf, "backwards"))
5430		backwards = 1;
5431	else
5432		return -EINVAL;
5433	if (mddev->reshape_backwards == backwards)
5434		return len;
5435
5436	err = mddev_lock(mddev);
5437	if (err)
5438		return err;
5439	/* check if we are allowed to change */
5440	if (mddev->delta_disks)
5441		err = -EBUSY;
5442	else if (mddev->persistent &&
5443	    mddev->major_version == 0)
5444		err =  -EINVAL;
5445	else
5446		mddev->reshape_backwards = backwards;
5447	mddev_unlock(mddev);
5448	return err ?: len;
5449}
5450
5451static struct md_sysfs_entry md_reshape_direction =
5452__ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show,
5453       reshape_direction_store);
5454
5455static ssize_t
5456array_size_show(struct mddev *mddev, char *page)
5457{
5458	if (mddev->external_size)
5459		return sprintf(page, "%llu\n",
5460			       (unsigned long long)mddev->array_sectors/2);
5461	else
5462		return sprintf(page, "default\n");
5463}
5464
5465static ssize_t
5466array_size_store(struct mddev *mddev, const char *buf, size_t len)
5467{
5468	sector_t sectors;
5469	int err;
5470
5471	err = mddev_lock(mddev);
5472	if (err)
5473		return err;
5474
5475	/* Clustered raid doesn't support changing array_sectors */
5476	if (mddev_is_clustered(mddev)) {
5477		mddev_unlock(mddev);
5478		return -EINVAL;
5479	}
5480
5481	if (strncmp(buf, "default", 7) == 0) {
5482		if (mddev->pers)
5483			sectors = mddev->pers->size(mddev, 0, 0);
5484		else
5485			sectors = mddev->array_sectors;
5486
5487		mddev->external_size = 0;
5488	} else {
5489		if (strict_blocks_to_sectors(buf, &sectors) < 0)
5490			err = -EINVAL;
5491		else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
5492			err = -E2BIG;
5493		else
5494			mddev->external_size = 1;
5495	}
5496
5497	if (!err) {
5498		mddev->array_sectors = sectors;
5499		if (mddev->pers)
5500			set_capacity_and_notify(mddev->gendisk,
5501						mddev->array_sectors);
5502	}
5503	mddev_unlock(mddev);
5504	return err ?: len;
5505}
5506
5507static struct md_sysfs_entry md_array_size =
5508__ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
5509       array_size_store);
5510
5511static ssize_t
5512consistency_policy_show(struct mddev *mddev, char *page)
5513{
5514	int ret;
5515
5516	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
5517		ret = sprintf(page, "journal\n");
5518	} else if (test_bit(MD_HAS_PPL, &mddev->flags)) {
5519		ret = sprintf(page, "ppl\n");
5520	} else if (mddev->bitmap) {
5521		ret = sprintf(page, "bitmap\n");
5522	} else if (mddev->pers) {
5523		if (mddev->pers->sync_request)
5524			ret = sprintf(page, "resync\n");
5525		else
5526			ret = sprintf(page, "none\n");
5527	} else {
5528		ret = sprintf(page, "unknown\n");
5529	}
5530
5531	return ret;
5532}
5533
5534static ssize_t
5535consistency_policy_store(struct mddev *mddev, const char *buf, size_t len)
5536{
5537	int err = 0;
5538
5539	if (mddev->pers) {
5540		if (mddev->pers->change_consistency_policy)
5541			err = mddev->pers->change_consistency_policy(mddev, buf);
5542		else
5543			err = -EBUSY;
5544	} else if (mddev->external && strncmp(buf, "ppl", 3) == 0) {
5545		set_bit(MD_HAS_PPL, &mddev->flags);
5546	} else {
5547		err = -EINVAL;
5548	}
5549
5550	return err ? err : len;
5551}
5552
5553static struct md_sysfs_entry md_consistency_policy =
5554__ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show,
5555       consistency_policy_store);
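/*
 * Reading consistency_policy reports what protects the array across an
 * unclean shutdown, e.g. (hypothetical md0):
 *
 *   # cat /sys/block/md0/md/consistency_policy
 *   bitmap
 *
 * Writes are forwarded to the personality's ->change_consistency_policy()
 * on a running array; for not-yet-started external-metadata arrays only
 * "ppl" may be requested.
 */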
5556
5557static ssize_t fail_last_dev_show(struct mddev *mddev, char *page)
5558{
5559	return sprintf(page, "%d\n", mddev->fail_last_dev);
5560}
5561
5562/*
5563 * Setting fail_last_dev to true allows the last device to be forcibly removed
5564 * from RAID1/RAID10.
5565 */
5566static ssize_t
5567fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len)
5568{
5569	int ret;
5570	bool value;
5571
5572	ret = kstrtobool(buf, &value);
5573	if (ret)
5574		return ret;
5575
5576	if (value != mddev->fail_last_dev)
5577		mddev->fail_last_dev = value;
5578
5579	return len;
5580}
5581static struct md_sysfs_entry md_fail_last_dev =
5582__ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show,
5583       fail_last_dev_store);
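/*
 * Sketch (hypothetical md0): allow even the last working device of a
 * RAID1/RAID10 array to be failed:
 *
 *   # echo 1 > /sys/block/md0/md/fail_last_dev
 *
 * Any value understood by kstrtobool() is accepted.
 */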
5584
5585static ssize_t serialize_policy_show(struct mddev *mddev, char *page)
5586{
5587	if (mddev->pers == NULL || (mddev->pers->level != 1))
5588		return sprintf(page, "n/a\n");
5589	else
5590		return sprintf(page, "%d\n", mddev->serialize_policy);
5591}
5592
5593/*
5594 * Setting serialize_policy to true enforces that write IO is not reordered
5595 * for raid1.
5596 */
5597static ssize_t
5598serialize_policy_store(struct mddev *mddev, const char *buf, size_t len)
5599{
5600	int err;
5601	bool value;
5602
5603	err = kstrtobool(buf, &value);
5604	if (err)
5605		return err;
5606
5607	if (value == mddev->serialize_policy)
5608		return len;
5609
5610	err = mddev_suspend_and_lock(mddev);
5611	if (err)
5612		return err;
5613	if (mddev->pers == NULL || (mddev->pers->level != 1)) {
5614		pr_err("md: serialize_policy is only effective for raid1\n");
5615		err = -EINVAL;
5616		goto unlock;
5617	}
5618
5619	if (value)
5620		mddev_create_serial_pool(mddev, NULL);
5621	else
5622		mddev_destroy_serial_pool(mddev, NULL);
5623	mddev->serialize_policy = value;
5624unlock:
5625	mddev_unlock_and_resume(mddev);
5626	return err ?: len;
5627}
5628
5629static struct md_sysfs_entry md_serialize_policy =
5630__ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show,
5631       serialize_policy_store);
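/*
 * Sketch (hypothetical md0, raid1 only): enabling the policy allocates the
 * per-rdev serial pool before the flag is set, so overlapping writes are
 * serialized from then on:
 *
 *   # echo 1 > /sys/block/md0/md/serialize_policy
 */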
5632
5633
5634static struct attribute *md_default_attrs[] = {
5635	&md_level.attr,
5636	&md_layout.attr,
5637	&md_raid_disks.attr,
5638	&md_uuid.attr,
5639	&md_chunk_size.attr,
5640	&md_size.attr,
5641	&md_resync_start.attr,
5642	&md_metadata.attr,
5643	&md_new_device.attr,
5644	&md_safe_delay.attr,
5645	&md_array_state.attr,
5646	&md_reshape_position.attr,
5647	&md_reshape_direction.attr,
5648	&md_array_size.attr,
5649	&max_corr_read_errors.attr,
5650	&md_consistency_policy.attr,
5651	&md_fail_last_dev.attr,
5652	&md_serialize_policy.attr,
5653	NULL,
5654};
5655
5656static const struct attribute_group md_default_group = {
5657	.attrs = md_default_attrs,
5658};
5659
5660static struct attribute *md_redundancy_attrs[] = {
5661	&md_scan_mode.attr,
5662	&md_last_scan_mode.attr,
5663	&md_mismatches.attr,
5664	&md_sync_min.attr,
5665	&md_sync_max.attr,
5666	&md_sync_speed.attr,
5667	&md_sync_force_parallel.attr,
5668	&md_sync_completed.attr,
5669	&md_min_sync.attr,
5670	&md_max_sync.attr,
5671	&md_suspend_lo.attr,
5672	&md_suspend_hi.attr,
5673	&md_bitmap.attr,
5674	&md_degraded.attr,
5675	NULL,
5676};
5677static const struct attribute_group md_redundancy_group = {
5678	.name = NULL,
5679	.attrs = md_redundancy_attrs,
5680};
5681
5682static const struct attribute_group *md_attr_groups[] = {
5683	&md_default_group,
5684	&md_bitmap_group,
5685	NULL,
5686};
5687
5688static ssize_t
5689md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
5690{
5691	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
5692	struct mddev *mddev = container_of(kobj, struct mddev, kobj);
5693	ssize_t rv;
5694
5695	if (!entry->show)
5696		return -EIO;
5697	spin_lock(&all_mddevs_lock);
5698	if (!mddev_get(mddev)) {
5699		spin_unlock(&all_mddevs_lock);
5700		return -EBUSY;
5701	}
5702	spin_unlock(&all_mddevs_lock);
5703
5704	rv = entry->show(mddev, page);
5705	mddev_put(mddev);
5706	return rv;
5707}
5708
5709static ssize_t
5710md_attr_store(struct kobject *kobj, struct attribute *attr,
5711	      const char *page, size_t length)
5712{
5713	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
5714	struct mddev *mddev = container_of(kobj, struct mddev, kobj);
5715	ssize_t rv;
5716
5717	if (!entry->store)
5718		return -EIO;
5719	if (!capable(CAP_SYS_ADMIN))
5720		return -EACCES;
5721	spin_lock(&all_mddevs_lock);
5722	if (!mddev_get(mddev)) {
5723		spin_unlock(&all_mddevs_lock);
5724		return -EBUSY;
5725	}
5726	spin_unlock(&all_mddevs_lock);
5727	rv = entry->store(mddev, page, length);
5728	mddev_put(mddev);
5729	return rv;
5730}
5731
5732static void md_kobj_release(struct kobject *ko)
5733{
5734	struct mddev *mddev = container_of(ko, struct mddev, kobj);
5735
5736	if (mddev->sysfs_state)
5737		sysfs_put(mddev->sysfs_state);
5738	if (mddev->sysfs_level)
5739		sysfs_put(mddev->sysfs_level);
5740
5741	del_gendisk(mddev->gendisk);
5742	put_disk(mddev->gendisk);
5743}
5744
5745static const struct sysfs_ops md_sysfs_ops = {
5746	.show	= md_attr_show,
5747	.store	= md_attr_store,
5748};
5749static const struct kobj_type md_ktype = {
5750	.release	= md_kobj_release,
5751	.sysfs_ops	= &md_sysfs_ops,
5752	.default_groups	= md_attr_groups,
5753};
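/*
 * Note: mddev->kobj is added below as "md" under the gendisk's device, so
 * the attributes above appear as /sys/block/<disk>/md/<attribute>.  Both
 * md_attr_show() and md_attr_store() take a temporary reference with
 * mddev_get() so the mddev cannot go away while an attribute is accessed.
 */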
5754
5755int mdp_major = 0;
5756
5757/* stack the limit for all rdevs into lim */
5758void mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim)
5759{
5760	struct md_rdev *rdev;
5761
5762	rdev_for_each(rdev, mddev) {
5763		queue_limits_stack_bdev(lim, rdev->bdev, rdev->data_offset,
5764					mddev->gendisk->disk_name);
5765	}
5766}
5767EXPORT_SYMBOL_GPL(mddev_stack_rdev_limits);
5768
5769/* apply the extra stacking limits from a new rdev into mddev */
5770int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev)
5771{
5772	struct queue_limits lim;
5773
5774	if (mddev_is_dm(mddev))
5775		return 0;
5776
5777	lim = queue_limits_start_update(mddev->gendisk->queue);
5778	queue_limits_stack_bdev(&lim, rdev->bdev, rdev->data_offset,
5779				mddev->gendisk->disk_name);
5780	return queue_limits_commit_update(mddev->gendisk->queue, &lim);
5781}
5782EXPORT_SYMBOL_GPL(mddev_stack_new_rdev);
5783
5784/* update the optimal I/O size after a reshape */
5785void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes)
5786{
5787	struct queue_limits lim;
5788
5789	if (mddev_is_dm(mddev))
5790		return;
5791
5792	/* don't bother updating io_opt if we can't suspend the array */
5793	if (mddev_suspend(mddev, false) < 0)
5794		return;
5795	lim = queue_limits_start_update(mddev->gendisk->queue);
5796	lim.io_opt = lim.io_min * nr_stripes;
5797	queue_limits_commit_update(mddev->gendisk->queue, &lim);
5798	mddev_resume(mddev);
5799}
5800EXPORT_SYMBOL_GPL(mddev_update_io_opt);
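/*
 * Worked example (hypothetical raid5 whose personality set io_min to a
 * 512 KiB chunk): after a reshape to nr_stripes == 4 data stripes the
 * advertised optimal I/O size becomes 4 * 512 KiB = 2 MiB.
 */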
5801
5802static void mddev_delayed_delete(struct work_struct *ws)
5803{
5804	struct mddev *mddev = container_of(ws, struct mddev, del_work);
5805
5806	kobject_put(&mddev->kobj);
5807}
5808
5809struct mddev *md_alloc(dev_t dev, char *name)
5810{
5811	/*
5812	 * If dev is zero, name is the name of a device to allocate with
5813	 * an arbitrary minor number.  It will be "md_???"
5814	 * If dev is non-zero it must be a device number with a MAJOR of
5815	 * MD_MAJOR or mdp_major.  In this case, if "name" is NULL, then
5816	 * the device is being created by opening a node in /dev.
5817	 * If "name" is not NULL, the device is being created by
5818	 * writing to /sys/module/md_mod/parameters/new_array.
5819	 */
5820	static DEFINE_MUTEX(disks_mutex);
5821	struct mddev *mddev;
5822	struct gendisk *disk;
5823	int partitioned;
5824	int shift;
5825	int unit;
5826	int error;
5827
5828	/*
5829	 * Wait for any previous instance of this device to be completely
5830	 * removed (mddev_delayed_delete).
5831	 */
5832	flush_workqueue(md_misc_wq);
5833
5834	mutex_lock(&disks_mutex);
5835	mddev = mddev_alloc(dev);
5836	if (IS_ERR(mddev)) {
5837		error = PTR_ERR(mddev);
5838		goto out_unlock;
5839	}
5840
5841	partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
5842	shift = partitioned ? MdpMinorShift : 0;
5843	unit = MINOR(mddev->unit) >> shift;
5844
5845	if (name && !dev) {
5846		/* Need to ensure that 'name' is not a duplicate.
5847		 */
5848		struct mddev *mddev2;
5849		spin_lock(&all_mddevs_lock);
5850
5851		list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
5852			if (mddev2->gendisk &&
5853			    strcmp(mddev2->gendisk->disk_name, name) == 0) {
5854				spin_unlock(&all_mddevs_lock);
5855				error = -EEXIST;
5856				goto out_free_mddev;
5857			}
5858		spin_unlock(&all_mddevs_lock);
5859	}
5860	if (name && dev)
5861		/*
5862		 * Creating /dev/mdNNN via "new_array", so adjust hold_active.
5863		 */
5864		mddev->hold_active = UNTIL_STOP;
5865
5866	disk = blk_alloc_disk(NULL, NUMA_NO_NODE);
5867	if (IS_ERR(disk)) {
5868		error = PTR_ERR(disk);
5869		goto out_free_mddev;
5870	}
5871
5872	disk->major = MAJOR(mddev->unit);
5873	disk->first_minor = unit << shift;
5874	disk->minors = 1 << shift;
5875	if (name)
5876		strcpy(disk->disk_name, name);
5877	else if (partitioned)
5878		sprintf(disk->disk_name, "md_d%d", unit);
5879	else
5880		sprintf(disk->disk_name, "md%d", unit);
5881	disk->fops = &md_fops;
5882	disk->private_data = mddev;
5883
5884	blk_queue_write_cache(disk->queue, true, true);
5885	disk->events |= DISK_EVENT_MEDIA_CHANGE;
5886	mddev->gendisk = disk;
5887	error = add_disk(disk);
5888	if (error)
5889		goto out_put_disk;
5890
5891	kobject_init(&mddev->kobj, &md_ktype);
5892	error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md");
5893	if (error) {
5894		/*
5895		 * The disk is already live at this point.  Clear the hold flag
5896		 * and let mddev_put take care of the deletion, as it isn't any
5897		 * different from a normal close on last release now.
5898		 */
5899		mddev->hold_active = 0;
5900		mutex_unlock(&disks_mutex);
5901		mddev_put(mddev);
5902		return ERR_PTR(error);
5903	}
5904
5905	kobject_uevent(&mddev->kobj, KOBJ_ADD);
5906	mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
5907	mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level");
5908	mutex_unlock(&disks_mutex);
5909	return mddev;
5910
5911out_put_disk:
5912	put_disk(disk);
5913out_free_mddev:
5914	mddev_free(mddev);
5915out_unlock:
5916	mutex_unlock(&disks_mutex);
5917	return ERR_PTR(error);
5918}
5919
5920static int md_alloc_and_put(dev_t dev, char *name)
5921{
5922	struct mddev *mddev = md_alloc(dev, name);
5923
5924	if (IS_ERR(mddev))
5925		return PTR_ERR(mddev);
5926	mddev_put(mddev);
5927	return 0;
5928}
5929
5930static void md_probe(dev_t dev)
5931{
5932	if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512)
5933		return;
5934	if (create_on_open)
5935		md_alloc_and_put(dev, NULL);
5936}
5937
5938static int add_named_array(const char *val, const struct kernel_param *kp)
5939{
5940	/*
5941	 * val must be "md_*" or "mdNNN".
5942	 * For "md_*" we allocate an array with a large free minor number, and
5943	 * set the name to val.  val must not already be an active name.
5944	 * For "mdNNN" we allocate an array with the minor number NNN
5945	 * which must not already be in use.
5946	 */
5947	int len = strlen(val);
5948	char buf[DISK_NAME_LEN];
5949	unsigned long devnum;
5950
5951	while (len && val[len-1] == '\n')
5952		len--;
5953	if (len >= DISK_NAME_LEN)
5954		return -E2BIG;
5955	strscpy(buf, val, len+1);
5956	if (strncmp(buf, "md_", 3) == 0)
5957		return md_alloc_and_put(0, buf);
5958	if (strncmp(buf, "md", 2) == 0 &&
5959	    isdigit(buf[2]) &&
5960	    kstrtoul(buf+2, 10, &devnum) == 0 &&
5961	    devnum <= MINORMASK)
5962		return md_alloc_and_put(MKDEV(MD_MAJOR, devnum), NULL);
5963
5964	return -EINVAL;
5965}
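/*
 * Example: either of the following creates an array node through this
 * handler (assuming md_mod exposes the new_array parameter):
 *
 *   # echo md_home > /sys/module/md_mod/parameters/new_array
 *   # echo md127 > /sys/module/md_mod/parameters/new_array
 *
 * The first picks an unused minor and names the disk "md_home"; the
 * second insists on minor 127 and fails if that unit already exists.
 */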
5966
5967static void md_safemode_timeout(struct timer_list *t)
5968{
5969	struct mddev *mddev = from_timer(mddev, t, safemode_timer);
5970
5971	mddev->safemode = 1;
5972	if (mddev->external)
5973		sysfs_notify_dirent_safe(mddev->sysfs_state);
5974
5975	md_wakeup_thread(mddev->thread);
5976}
5977
5978static int start_dirty_degraded;
5979
5980int md_run(struct mddev *mddev)
5981{
5982	int err;
5983	struct md_rdev *rdev;
5984	struct md_personality *pers;
5985	bool nowait = true;
5986
5987	if (list_empty(&mddev->disks))
5988		/* cannot run an array with no devices. */
5989		return -EINVAL;
5990
5991	if (mddev->pers)
5992		return -EBUSY;
5993	/* Cannot run until previous stop completes properly */
5994	if (mddev->sysfs_active)
5995		return -EBUSY;
5996
5997	/*
5998	 * Analyze all RAID superblock(s)
5999	 */
6000	if (!mddev->raid_disks) {
6001		if (!mddev->persistent)
6002			return -EINVAL;
6003		err = analyze_sbs(mddev);
6004		if (err)
6005			return -EINVAL;
6006	}
6007
6008	if (mddev->level != LEVEL_NONE)
6009		request_module("md-level-%d", mddev->level);
6010	else if (mddev->clevel[0])
6011		request_module("md-%s", mddev->clevel);
6012
6013	/*
6014	 * Drop all container device buffers; from now on
6015	 * the only valid external interface is through the md
6016	 * device.
6017	 */
6018	mddev->has_superblocks = false;
6019	rdev_for_each(rdev, mddev) {
6020		if (test_bit(Faulty, &rdev->flags))
6021			continue;
6022		sync_blockdev(rdev->bdev);
6023		invalidate_bdev(rdev->bdev);
6024		if (mddev->ro != MD_RDONLY && rdev_read_only(rdev)) {
6025			mddev->ro = MD_RDONLY;
6026			if (!mddev_is_dm(mddev))
6027				set_disk_ro(mddev->gendisk, 1);
6028		}
6029
6030		if (rdev->sb_page)
6031			mddev->has_superblocks = true;
6032
6033		/* Perform some consistency tests on the device.
6034		 * We don't want the data to overlap the metadata;
6035		 * internal bitmap issues have been handled elsewhere.
6036		 */
6037		if (rdev->meta_bdev) {
6038			/* Nothing to check */;
6039		} else if (rdev->data_offset < rdev->sb_start) {
6040			if (mddev->dev_sectors &&
6041			    rdev->data_offset + mddev->dev_sectors
6042			    > rdev->sb_start) {
6043				pr_warn("md: %s: data overlaps metadata\n",
6044					mdname(mddev));
6045				return -EINVAL;
6046			}
6047		} else {
6048			if (rdev->sb_start + rdev->sb_size/512
6049			    > rdev->data_offset) {
6050				pr_warn("md: %s: metadata overlaps data\n",
6051					mdname(mddev));
6052				return -EINVAL;
6053			}
6054		}
6055		sysfs_notify_dirent_safe(rdev->sysfs_state);
6056		nowait = nowait && bdev_nowait(rdev->bdev);
6057	}
6058
6059	if (!bioset_initialized(&mddev->bio_set)) {
6060		err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
6061		if (err)
6062			return err;
6063	}
6064	if (!bioset_initialized(&mddev->sync_set)) {
6065		err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
6066		if (err)
6067			goto exit_bio_set;
6068	}
6069
6070	if (!bioset_initialized(&mddev->io_clone_set)) {
6071		err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE,
6072				  offsetof(struct md_io_clone, bio_clone), 0);
6073		if (err)
6074			goto exit_sync_set;
6075	}
6076
6077	spin_lock(&pers_lock);
6078	pers = find_pers(mddev->level, mddev->clevel);
6079	if (!pers || !try_module_get(pers->owner)) {
6080		spin_unlock(&pers_lock);
6081		if (mddev->level != LEVEL_NONE)
6082			pr_warn("md: personality for level %d is not loaded!\n",
6083				mddev->level);
6084		else
6085			pr_warn("md: personality for level %s is not loaded!\n",
6086				mddev->clevel);
6087		err = -EINVAL;
6088		goto abort;
6089	}
6090	spin_unlock(&pers_lock);
6091	if (mddev->level != pers->level) {
6092		mddev->level = pers->level;
6093		mddev->new_level = pers->level;
6094	}
6095	strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
6096
6097	if (mddev->reshape_position != MaxSector &&
6098	    pers->start_reshape == NULL) {
6099		/* This personality cannot handle reshaping... */
6100		module_put(pers->owner);
6101		err = -EINVAL;
6102		goto abort;
6103	}
6104
6105	if (pers->sync_request) {
6106		/* Warn if this is a potentially silly
6107		 * configuration.
6108		 */
6109		struct md_rdev *rdev2;
6110		int warned = 0;
6111
6112		rdev_for_each(rdev, mddev)
6113			rdev_for_each(rdev2, mddev) {
6114				if (rdev < rdev2 &&
6115				    rdev->bdev->bd_disk ==
6116				    rdev2->bdev->bd_disk) {
6117					pr_warn("%s: WARNING: %pg appears to be on the same physical disk as %pg.\n",
6118						mdname(mddev),
6119						rdev->bdev,
6120						rdev2->bdev);
6121					warned = 1;
6122				}
6123			}
6124
6125		if (warned)
6126			pr_warn("True protection against single-disk failure might be compromised.\n");
6127	}
6128
6129	/* dm-raid expects the sync_thread to be frozen until resume */
6130	if (mddev->gendisk)
6131		mddev->recovery = 0;
6132
6133	/* may be overridden by the personality */
6134	mddev->resync_max_sectors = mddev->dev_sectors;
6135
6136	mddev->ok_start_degraded = start_dirty_degraded;
6137
6138	if (start_readonly && md_is_rdwr(mddev))
6139		mddev->ro = MD_AUTO_READ; /* read-only, but switch on first write */
6140
6141	err = pers->run(mddev);
6142	if (err)
6143		pr_warn("md: pers->run() failed ...\n");
6144	else if (pers->size(mddev, 0, 0) < mddev->array_sectors) {
6145		WARN_ONCE(!mddev->external_size,
6146			  "%s: default size too small, but 'external_size' not in effect?\n",
6147			  __func__);
6148		pr_warn("md: invalid array_size %llu > default size %llu\n",
6149			(unsigned long long)mddev->array_sectors / 2,
6150			(unsigned long long)pers->size(mddev, 0, 0) / 2);
6151		err = -EINVAL;
6152	}
6153	if (err == 0 && pers->sync_request &&
6154	    (mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
6155		struct bitmap *bitmap;
6156
6157		bitmap = md_bitmap_create(mddev, -1);
6158		if (IS_ERR(bitmap)) {
6159			err = PTR_ERR(bitmap);
6160			pr_warn("%s: failed to create bitmap (%d)\n",
6161				mdname(mddev), err);
6162		} else
6163			mddev->bitmap = bitmap;
6164
6165	}
6166	if (err)
6167		goto bitmap_abort;
6168
6169	if (mddev->bitmap_info.max_write_behind > 0) {
6170		bool create_pool = false;
6171
6172		rdev_for_each(rdev, mddev) {
6173			if (test_bit(WriteMostly, &rdev->flags) &&
6174			    rdev_init_serial(rdev))
6175				create_pool = true;
6176		}
6177		if (create_pool && mddev->serial_info_pool == NULL) {
6178			mddev->serial_info_pool =
6179				mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
6180						    sizeof(struct serial_info));
6181			if (!mddev->serial_info_pool) {
6182				err = -ENOMEM;
6183				goto bitmap_abort;
6184			}
6185		}
6186	}
6187
6188	if (!mddev_is_dm(mddev)) {
6189		struct request_queue *q = mddev->gendisk->queue;
6190		bool nonrot = true;
6191
6192		rdev_for_each(rdev, mddev) {
6193			if (rdev->raid_disk >= 0 && !bdev_nonrot(rdev->bdev)) {
6194				nonrot = false;
6195				break;
6196			}
6197		}
6198		if (mddev->degraded)
6199			nonrot = false;
6200		if (nonrot)
6201			blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
6202		else
6203			blk_queue_flag_clear(QUEUE_FLAG_NONROT, q);
6204		blk_queue_flag_set(QUEUE_FLAG_IO_STAT, q);
6205
6206		/* Set the NOWAIT flag if all underlying devices support it */
6207		if (nowait)
6208			blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q);
6209	}
6210	if (pers->sync_request) {
6211		if (mddev->kobj.sd &&
6212		    sysfs_create_group(&mddev->kobj, &md_redundancy_group))
6213			pr_warn("md: cannot register extra attributes for %s\n",
6214				mdname(mddev));
6215		mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action");
6216		mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed");
6217		mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded");
6218	} else if (mddev->ro == MD_AUTO_READ)
6219		mddev->ro = MD_RDWR;
6220
6221	atomic_set(&mddev->max_corr_read_errors,
6222		   MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
6223	mddev->safemode = 0;
6224	if (mddev_is_clustered(mddev))
6225		mddev->safemode_delay = 0;
6226	else
6227		mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
6228	mddev->in_sync = 1;
6229	smp_wmb();
6230	spin_lock(&mddev->lock);
6231	mddev->pers = pers;
6232	spin_unlock(&mddev->lock);
6233	rdev_for_each(rdev, mddev)
6234		if (rdev->raid_disk >= 0)
6235			sysfs_link_rdev(mddev, rdev); /* failure here is OK */
6236
6237	if (mddev->degraded && md_is_rdwr(mddev))
6238		/* This ensures that recovering status is reported immediately
6239		 * via sysfs - until a lack of spares is confirmed.
6240		 */
6241		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6242	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6243
6244	if (mddev->sb_flags)
6245		md_update_sb(mddev, 0);
6246
6247	md_new_event();
6248	return 0;
6249
6250bitmap_abort:
6251	mddev_detach(mddev);
6252	if (mddev->private)
6253		pers->free(mddev, mddev->private);
6254	mddev->private = NULL;
6255	module_put(pers->owner);
6256	md_bitmap_destroy(mddev);
6257abort:
6258	bioset_exit(&mddev->io_clone_set);
6259exit_sync_set:
6260	bioset_exit(&mddev->sync_set);
6261exit_bio_set:
6262	bioset_exit(&mddev->bio_set);
6263	return err;
6264}
6265EXPORT_SYMBOL_GPL(md_run);
6266
6267int do_md_run(struct mddev *mddev)
6268{
6269	int err;
6270
6271	set_bit(MD_NOT_READY, &mddev->flags);
6272	err = md_run(mddev);
6273	if (err)
6274		goto out;
6275	err = md_bitmap_load(mddev);
6276	if (err) {
6277		md_bitmap_destroy(mddev);
6278		goto out;
6279	}
6280
6281	if (mddev_is_clustered(mddev))
6282		md_allow_write(mddev);
6283
6284	/* run start up tasks that require md_thread */
6285	md_start(mddev);
6286
6287	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
6288
6289	set_capacity_and_notify(mddev->gendisk, mddev->array_sectors);
6290	clear_bit(MD_NOT_READY, &mddev->flags);
6291	mddev->changed = 1;
6292	kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
6293	sysfs_notify_dirent_safe(mddev->sysfs_state);
6294	sysfs_notify_dirent_safe(mddev->sysfs_action);
6295	sysfs_notify_dirent_safe(mddev->sysfs_degraded);
6296out:
6297	clear_bit(MD_NOT_READY, &mddev->flags);
6298	return err;
6299}
6300
6301int md_start(struct mddev *mddev)
6302{
6303	int ret = 0;
6304
6305	if (mddev->pers->start) {
6306		set_bit(MD_RECOVERY_WAIT, &mddev->recovery);
6307		ret = mddev->pers->start(mddev);
6308		clear_bit(MD_RECOVERY_WAIT, &mddev->recovery);
6309		md_wakeup_thread(mddev->sync_thread);
6310	}
6311	return ret;
6312}
6313EXPORT_SYMBOL_GPL(md_start);
6314
6315static int restart_array(struct mddev *mddev)
6316{
6317	struct gendisk *disk = mddev->gendisk;
6318	struct md_rdev *rdev;
6319	bool has_journal = false;
6320	bool has_readonly = false;
6321
6322	/* Complain if it has no devices */
6323	if (list_empty(&mddev->disks))
6324		return -ENXIO;
6325	if (!mddev->pers)
6326		return -EINVAL;
6327	if (md_is_rdwr(mddev))
6328		return -EBUSY;
6329
6330	rcu_read_lock();
6331	rdev_for_each_rcu(rdev, mddev) {
6332		if (test_bit(Journal, &rdev->flags) &&
6333		    !test_bit(Faulty, &rdev->flags))
6334			has_journal = true;
6335		if (rdev_read_only(rdev))
6336			has_readonly = true;
6337	}
6338	rcu_read_unlock();
6339	if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal)
6340		/* Don't restart rw with journal missing/faulty */
6341		return -EINVAL;
6342	if (has_readonly)
6343		return -EROFS;
6344
6345	mddev->safemode = 0;
6346	mddev->ro = MD_RDWR;
6347	set_disk_ro(disk, 0);
6348	pr_debug("md: %s switched to read-write mode.\n", mdname(mddev));
6349	/* Kick recovery or resync if necessary */
6350	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6351	md_wakeup_thread(mddev->sync_thread);
6352	sysfs_notify_dirent_safe(mddev->sysfs_state);
6353	return 0;
6354}
6355
6356static void md_clean(struct mddev *mddev)
6357{
6358	mddev->array_sectors = 0;
6359	mddev->external_size = 0;
6360	mddev->dev_sectors = 0;
6361	mddev->raid_disks = 0;
6362	mddev->recovery_cp = 0;
6363	mddev->resync_min = 0;
6364	mddev->resync_max = MaxSector;
6365	mddev->reshape_position = MaxSector;
6366	/* we still need mddev->external in export_rdev, do not clear it yet */
6367	mddev->persistent = 0;
6368	mddev->level = LEVEL_NONE;
6369	mddev->clevel[0] = 0;
6370	/*
6371	 * Don't clear MD_CLOSING, or mddev can be opened again.
6372	 * 'hold_active != 0' means mddev is still in the creation
6373	 * process and will be used later.
6374	 */
6375	if (mddev->hold_active)
6376		mddev->flags = 0;
6377	else
6378		mddev->flags &= BIT_ULL_MASK(MD_CLOSING);
6379	mddev->sb_flags = 0;
6380	mddev->ro = MD_RDWR;
6381	mddev->metadata_type[0] = 0;
6382	mddev->chunk_sectors = 0;
6383	mddev->ctime = mddev->utime = 0;
6384	mddev->layout = 0;
6385	mddev->max_disks = 0;
6386	mddev->events = 0;
6387	mddev->can_decrease_events = 0;
6388	mddev->delta_disks = 0;
6389	mddev->reshape_backwards = 0;
6390	mddev->new_level = LEVEL_NONE;
6391	mddev->new_layout = 0;
6392	mddev->new_chunk_sectors = 0;
6393	mddev->curr_resync = MD_RESYNC_NONE;
6394	atomic64_set(&mddev->resync_mismatches, 0);
6395	mddev->suspend_lo = mddev->suspend_hi = 0;
6396	mddev->sync_speed_min = mddev->sync_speed_max = 0;
6397	mddev->recovery = 0;
6398	mddev->in_sync = 0;
6399	mddev->changed = 0;
6400	mddev->degraded = 0;
6401	mddev->safemode = 0;
6402	mddev->private = NULL;
6403	mddev->cluster_info = NULL;
6404	mddev->bitmap_info.offset = 0;
6405	mddev->bitmap_info.default_offset = 0;
6406	mddev->bitmap_info.default_space = 0;
6407	mddev->bitmap_info.chunksize = 0;
6408	mddev->bitmap_info.daemon_sleep = 0;
6409	mddev->bitmap_info.max_write_behind = 0;
6410	mddev->bitmap_info.nodes = 0;
6411}
6412
6413static void __md_stop_writes(struct mddev *mddev)
6414{
6415	del_timer_sync(&mddev->safemode_timer);
6416
6417	if (mddev->pers && mddev->pers->quiesce) {
6418		mddev->pers->quiesce(mddev, 1);
6419		mddev->pers->quiesce(mddev, 0);
6420	}
6421	md_bitmap_flush(mddev);
6422
6423	if (md_is_rdwr(mddev) &&
6424	    ((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
6425	     mddev->sb_flags)) {
6426		/* mark array as shutdown cleanly */
6427		if (!mddev_is_clustered(mddev))
6428			mddev->in_sync = 1;
6429		md_update_sb(mddev, 1);
6430	}
6431	/* disable the policy to guarantee that rdevs free their serialization resources */
6432	mddev->serialize_policy = 0;
6433	mddev_destroy_serial_pool(mddev, NULL);
6434}
6435
6436void md_stop_writes(struct mddev *mddev)
6437{
6438	mddev_lock_nointr(mddev);
6439	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6440	stop_sync_thread(mddev, true, false);
6441	__md_stop_writes(mddev);
6442	mddev_unlock(mddev);
6443}
6444EXPORT_SYMBOL_GPL(md_stop_writes);
6445
6446static void mddev_detach(struct mddev *mddev)
6447{
6448	md_bitmap_wait_behind_writes(mddev);
6449	if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) {
6450		mddev->pers->quiesce(mddev, 1);
6451		mddev->pers->quiesce(mddev, 0);
6452	}
6453	md_unregister_thread(mddev, &mddev->thread);
6454
6455	/* the unplug fn references 'conf' */
6456	if (!mddev_is_dm(mddev))
6457		blk_sync_queue(mddev->gendisk->queue);
6458}
6459
6460static void __md_stop(struct mddev *mddev)
6461{
6462	struct md_personality *pers = mddev->pers;
6463	md_bitmap_destroy(mddev);
6464	mddev_detach(mddev);
6465	spin_lock(&mddev->lock);
6466	mddev->pers = NULL;
6467	spin_unlock(&mddev->lock);
6468	if (mddev->private)
6469		pers->free(mddev, mddev->private);
6470	mddev->private = NULL;
6471	if (pers->sync_request && mddev->to_remove == NULL)
6472		mddev->to_remove = &md_redundancy_group;
6473	module_put(pers->owner);
6474	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6475
6476	bioset_exit(&mddev->bio_set);
6477	bioset_exit(&mddev->sync_set);
6478	bioset_exit(&mddev->io_clone_set);
6479}
6480
6481void md_stop(struct mddev *mddev)
6482{
6483	lockdep_assert_held(&mddev->reconfig_mutex);
6484
6485	/* stop the array and free any attached data structures.
6486	 * This is called from dm-raid.
6487	 */
6488	__md_stop_writes(mddev);
6489	__md_stop(mddev);
6490}
6491
6492EXPORT_SYMBOL_GPL(md_stop);
6493
6494/* ensure 'mddev->pers' exists before calling md_set_readonly() */
6495static int md_set_readonly(struct mddev *mddev)
6496{
6497	int err = 0;
6498	int did_freeze = 0;
6499
6500	if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
6501		return -EBUSY;
6502
6503	if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
6504		did_freeze = 1;
6505		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6506	}
6507
6508	stop_sync_thread(mddev, false, false);
6509	wait_event(mddev->sb_wait,
6510		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
6511	mddev_lock_nointr(mddev);
6512
6513	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
6514		pr_warn("md: %s still in use.\n", mdname(mddev));
6515		err = -EBUSY;
6516		goto out;
6517	}
6518
6519	__md_stop_writes(mddev);
6520
6521	if (mddev->ro == MD_RDONLY) {
6522		err = -ENXIO;
6523		goto out;
6524	}
6525
6526	mddev->ro = MD_RDONLY;
6527	set_disk_ro(mddev->gendisk, 1);
6528
6529out:
6530	if (!err || did_freeze) {
6531		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6532		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6533		sysfs_notify_dirent_safe(mddev->sysfs_state);
6534	}
6535
6536	return err;
6537}
6538
6539/* mode:
6540 *   0 - completely stop and disassemble array
6541 *   2 - stop but do not disassemble array
6542 */
6543static int do_md_stop(struct mddev *mddev, int mode)
6544{
6545	struct gendisk *disk = mddev->gendisk;
6546	struct md_rdev *rdev;
6547	int did_freeze = 0;
6548
6549	if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
6550		did_freeze = 1;
6551		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6552	}
6553
6554	stop_sync_thread(mddev, true, false);
6555
6556	if (mddev->sysfs_active ||
6557	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
6558		pr_warn("md: %s still in use.\n", mdname(mddev));
6559		if (did_freeze) {
6560			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6561			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6562		}
6563		return -EBUSY;
6564	}
6565	if (mddev->pers) {
6566		if (!md_is_rdwr(mddev))
6567			set_disk_ro(disk, 0);
6568
6569		__md_stop_writes(mddev);
6570		__md_stop(mddev);
6571
6572		/* tell userspace to handle 'inactive' */
6573		sysfs_notify_dirent_safe(mddev->sysfs_state);
6574
6575		rdev_for_each(rdev, mddev)
6576			if (rdev->raid_disk >= 0)
6577				sysfs_unlink_rdev(mddev, rdev);
6578
6579		set_capacity_and_notify(disk, 0);
6580		mddev->changed = 1;
6581
6582		if (!md_is_rdwr(mddev))
6583			mddev->ro = MD_RDWR;
6584	}
6585	/*
6586	 * Free resources if final stop
6587	 */
6588	if (mode == 0) {
6589		pr_info("md: %s stopped.\n", mdname(mddev));
6590
6591		if (mddev->bitmap_info.file) {
6592			struct file *f = mddev->bitmap_info.file;
6593			spin_lock(&mddev->lock);
6594			mddev->bitmap_info.file = NULL;
6595			spin_unlock(&mddev->lock);
6596			fput(f);
6597		}
6598		mddev->bitmap_info.offset = 0;
6599
6600		export_array(mddev);
6601
6602		md_clean(mddev);
6603		if (mddev->hold_active == UNTIL_STOP)
6604			mddev->hold_active = 0;
6605	}
6606	md_new_event();
6607	sysfs_notify_dirent_safe(mddev->sysfs_state);
6608	return 0;
6609}
6610
6611#ifndef MODULE
6612static void autorun_array(struct mddev *mddev)
6613{
6614	struct md_rdev *rdev;
6615	int err;
6616
6617	if (list_empty(&mddev->disks))
6618		return;
6619
6620	pr_info("md: running: ");
6621
6622	rdev_for_each(rdev, mddev) {
6623		pr_cont("<%pg>", rdev->bdev);
6624	}
6625	pr_cont("\n");
6626
6627	err = do_md_run(mddev);
6628	if (err) {
6629		pr_warn("md: do_md_run() returned %d\n", err);
6630		do_md_stop(mddev, 0);
6631	}
6632}
6633
6634/*
6635 * let's try to run arrays based on all disks that have arrived
6636 * until now. (those are in pending_raid_disks)
6637 *
6638 * the method: pick the first pending disk, collect all disks with
6639 * the same UUID, remove all from the pending list and put them into
6640 * the 'same_array' list. Then order this list based on superblock
6641 * update time (freshest comes first), kick out 'old' disks and
6642 * compare superblocks. If everything's fine then run it.
6643 *
6644 * If "unit" is allocated, then bump its reference count
6645 */
6646static void autorun_devices(int part)
6647{
6648	struct md_rdev *rdev0, *rdev, *tmp;
6649	struct mddev *mddev;
6650
6651	pr_info("md: autorun ...\n");
6652	while (!list_empty(&pending_raid_disks)) {
6653		int unit;
6654		dev_t dev;
6655		LIST_HEAD(candidates);
6656		rdev0 = list_entry(pending_raid_disks.next,
6657					 struct md_rdev, same_set);
6658
6659		pr_debug("md: considering %pg ...\n", rdev0->bdev);
6660		INIT_LIST_HEAD(&candidates);
6661		rdev_for_each_list(rdev, tmp, &pending_raid_disks)
6662			if (super_90_load(rdev, rdev0, 0) >= 0) {
6663				pr_debug("md:  adding %pg ...\n",
6664					 rdev->bdev);
6665				list_move(&rdev->same_set, &candidates);
6666			}
6667		/*
6668		 * now we have a set of devices, with all of them having
6669		 * mostly sane superblocks. It's time to allocate the
6670		 * mddev.
6671		 */
6672		if (part) {
6673			dev = MKDEV(mdp_major,
6674				    rdev0->preferred_minor << MdpMinorShift);
6675			unit = MINOR(dev) >> MdpMinorShift;
6676		} else {
6677			dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
6678			unit = MINOR(dev);
6679		}
6680		if (rdev0->preferred_minor != unit) {
6681			pr_warn("md: unit number in %pg is bad: %d\n",
6682				rdev0->bdev, rdev0->preferred_minor);
6683			break;
6684		}
6685
6686		mddev = md_alloc(dev, NULL);
6687		if (IS_ERR(mddev))
6688			break;
6689
6690		if (mddev_suspend_and_lock(mddev))
6691			pr_warn("md: %s locked, cannot run\n", mdname(mddev));
6692		else if (mddev->raid_disks || mddev->major_version
6693			 || !list_empty(&mddev->disks)) {
6694			pr_warn("md: %s already running, cannot run %pg\n",
6695				mdname(mddev), rdev0->bdev);
6696			mddev_unlock_and_resume(mddev);
6697		} else {
6698			pr_debug("md: created %s\n", mdname(mddev));
6699			mddev->persistent = 1;
6700			rdev_for_each_list(rdev, tmp, &candidates) {
6701				list_del_init(&rdev->same_set);
6702				if (bind_rdev_to_array(rdev, mddev))
6703					export_rdev(rdev, mddev);
6704			}
6705			autorun_array(mddev);
6706			mddev_unlock_and_resume(mddev);
6707		}
6708		/* on success, candidates will be empty; on error
6709		 * it won't be...
6710		 */
6711		rdev_for_each_list(rdev, tmp, &candidates) {
6712			list_del_init(&rdev->same_set);
6713			export_rdev(rdev, mddev);
6714		}
6715		mddev_put(mddev);
6716	}
6717	pr_info("md: ... autorun DONE.\n");
6718}
6719#endif /* !MODULE */
6720
6721static int get_version(void __user *arg)
6722{
6723	mdu_version_t ver;
6724
6725	ver.major = MD_MAJOR_VERSION;
6726	ver.minor = MD_MINOR_VERSION;
6727	ver.patchlevel = MD_PATCHLEVEL_VERSION;
6728
6729	if (copy_to_user(arg, &ver, sizeof(ver)))
6730		return -EFAULT;
6731
6732	return 0;
6733}
6734
6735static int get_array_info(struct mddev *mddev, void __user *arg)
6736{
6737	mdu_array_info_t info;
6738	int nr, working, insync, failed, spare;
6739	struct md_rdev *rdev;
6740
6741	nr = working = insync = failed = spare = 0;
6742	rcu_read_lock();
6743	rdev_for_each_rcu(rdev, mddev) {
6744		nr++;
6745		if (test_bit(Faulty, &rdev->flags))
6746			failed++;
6747		else {
6748			working++;
6749			if (test_bit(In_sync, &rdev->flags))
6750				insync++;
6751			else if (test_bit(Journal, &rdev->flags))
6752				/* TODO: add journal count to md_u.h */
6753				;
6754			else
6755				spare++;
6756		}
6757	}
6758	rcu_read_unlock();
6759
6760	info.major_version = mddev->major_version;
6761	info.minor_version = mddev->minor_version;
6762	info.patch_version = MD_PATCHLEVEL_VERSION;
6763	info.ctime         = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
6764	info.level         = mddev->level;
6765	info.size          = mddev->dev_sectors / 2;
6766	if (info.size != mddev->dev_sectors / 2) /* overflow */
6767		info.size = -1;
6768	info.nr_disks      = nr;
6769	info.raid_disks    = mddev->raid_disks;
6770	info.md_minor      = mddev->md_minor;
6771	info.not_persistent = !mddev->persistent;
6772
6773	info.utime         = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
6774	info.state         = 0;
6775	if (mddev->in_sync)
6776		info.state = (1<<MD_SB_CLEAN);
6777	if (mddev->bitmap && mddev->bitmap_info.offset)
6778		info.state |= (1<<MD_SB_BITMAP_PRESENT);
6779	if (mddev_is_clustered(mddev))
6780		info.state |= (1<<MD_SB_CLUSTERED);
6781	info.active_disks  = insync;
6782	info.working_disks = working;
6783	info.failed_disks  = failed;
6784	info.spare_disks   = spare;
6785
6786	info.layout        = mddev->layout;
6787	info.chunk_size    = mddev->chunk_sectors << 9;
6788
6789	if (copy_to_user(arg, &info, sizeof(info)))
6790		return -EFAULT;
6791
6792	return 0;
6793}
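/*
 * Userspace sketch of the matching ioctl (hypothetical /dev/md0, error
 * handling trimmed; GET_ARRAY_INFO and mdu_array_info_t come from
 * <linux/raid/md_u.h>):
 *
 *	mdu_array_info_t info;
 *	int fd = open("/dev/md0", O_RDONLY);
 *
 *	if (ioctl(fd, GET_ARRAY_INFO, &info) == 0)
 *		printf("level %d, %d raid disks, %d failed\n",
 *		       info.level, info.raid_disks, info.failed_disks);
 *	close(fd);
 */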
6794
6795static int get_bitmap_file(struct mddev *mddev, void __user * arg)
6796{
6797	mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
6798	char *ptr;
6799	int err;
6800
6801	file = kzalloc(sizeof(*file), GFP_NOIO);
6802	if (!file)
6803		return -ENOMEM;
6804
6805	err = 0;
6806	spin_lock(&mddev->lock);
6807	/* bitmap enabled */
6808	if (mddev->bitmap_info.file) {
6809		ptr = file_path(mddev->bitmap_info.file, file->pathname,
6810				sizeof(file->pathname));
6811		if (IS_ERR(ptr))
6812			err = PTR_ERR(ptr);
6813		else
6814			memmove(file->pathname, ptr,
6815				sizeof(file->pathname)-(ptr-file->pathname));
6816	}
6817	spin_unlock(&mddev->lock);
6818
6819	if (err == 0 &&
6820	    copy_to_user(arg, file, sizeof(*file)))
6821		err = -EFAULT;
6822
6823	kfree(file);
6824	return err;
6825}
6826
6827static int get_disk_info(struct mddev *mddev, void __user * arg)
6828{
6829	mdu_disk_info_t info;
6830	struct md_rdev *rdev;
6831
6832	if (copy_from_user(&info, arg, sizeof(info)))
6833		return -EFAULT;
6834
6835	rcu_read_lock();
6836	rdev = md_find_rdev_nr_rcu(mddev, info.number);
6837	if (rdev) {
6838		info.major = MAJOR(rdev->bdev->bd_dev);
6839		info.minor = MINOR(rdev->bdev->bd_dev);
6840		info.raid_disk = rdev->raid_disk;
6841		info.state = 0;
6842		if (test_bit(Faulty, &rdev->flags))
6843			info.state |= (1<<MD_DISK_FAULTY);
6844		else if (test_bit(In_sync, &rdev->flags)) {
6845			info.state |= (1<<MD_DISK_ACTIVE);
6846			info.state |= (1<<MD_DISK_SYNC);
6847		}
6848		if (test_bit(Journal, &rdev->flags))
6849			info.state |= (1<<MD_DISK_JOURNAL);
6850		if (test_bit(WriteMostly, &rdev->flags))
6851			info.state |= (1<<MD_DISK_WRITEMOSTLY);
6852		if (test_bit(FailFast, &rdev->flags))
6853			info.state |= (1<<MD_DISK_FAILFAST);
6854	} else {
6855		info.major = info.minor = 0;
6856		info.raid_disk = -1;
6857		info.state = (1<<MD_DISK_REMOVED);
6858	}
6859	rcu_read_unlock();
6860
6861	if (copy_to_user(arg, &info, sizeof(info)))
6862		return -EFAULT;
6863
6864	return 0;
6865}
6866
6867int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
6868{
6869	struct md_rdev *rdev;
6870	dev_t dev = MKDEV(info->major, info->minor);
6871
6872	if (mddev_is_clustered(mddev) &&
6873		!(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) {
6874		pr_warn("%s: Cannot add to clustered mddev.\n",
6875			mdname(mddev));
6876		return -EINVAL;
6877	}
6878
6879	if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
6880		return -EOVERFLOW;
6881
6882	if (!mddev->raid_disks) {
6883		int err;
6884		/* expecting a device which has a superblock */
6885		rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
6886		if (IS_ERR(rdev)) {
6887			pr_warn("md: md_import_device returned %ld\n",
6888				PTR_ERR(rdev));
6889			return PTR_ERR(rdev);
6890		}
6891		if (!list_empty(&mddev->disks)) {
6892			struct md_rdev *rdev0
6893				= list_entry(mddev->disks.next,
6894					     struct md_rdev, same_set);
6895			err = super_types[mddev->major_version]
6896				.load_super(rdev, rdev0, mddev->minor_version);
6897			if (err < 0) {
6898				pr_warn("md: %pg has different UUID to %pg\n",
6899					rdev->bdev,
6900					rdev0->bdev);
6901				export_rdev(rdev, mddev);
6902				return -EINVAL;
6903			}
6904		}
6905		err = bind_rdev_to_array(rdev, mddev);
6906		if (err)
6907			export_rdev(rdev, mddev);
6908		return err;
6909	}
6910
6911	/*
6912	 * md_add_new_disk can be used once the array is assembled
6913	 * to add "hot spares".  They must already have a superblock
6914	 * written.
6915	 */
6916	if (mddev->pers) {
6917		int err;
6918		if (!mddev->pers->hot_add_disk) {
6919			pr_warn("%s: personality does not support diskops!\n",
6920				mdname(mddev));
6921			return -EINVAL;
6922		}
6923		if (mddev->persistent)
6924			rdev = md_import_device(dev, mddev->major_version,
6925						mddev->minor_version);
6926		else
6927			rdev = md_import_device(dev, -1, -1);
6928		if (IS_ERR(rdev)) {
6929			pr_warn("md: md_import_device returned %ld\n",
6930				PTR_ERR(rdev));
6931			return PTR_ERR(rdev);
6932		}
6933		/* set saved_raid_disk if appropriate */
6934		if (!mddev->persistent) {
6935			if (info->state & (1<<MD_DISK_SYNC)  &&
6936			    info->raid_disk < mddev->raid_disks) {
6937				rdev->raid_disk = info->raid_disk;
6938				clear_bit(Bitmap_sync, &rdev->flags);
6939			} else
6940				rdev->raid_disk = -1;
6941			rdev->saved_raid_disk = rdev->raid_disk;
6942		} else
6943			super_types[mddev->major_version].
6944				validate_super(mddev, NULL/*freshest*/, rdev);
6945		if ((info->state & (1<<MD_DISK_SYNC)) &&
6946		     rdev->raid_disk != info->raid_disk) {
6947			/* This was a hot-add request, but the events don't
6948			 * match, so reject it.
6949			 */
6950			export_rdev(rdev, mddev);
6951			return -EINVAL;
6952		}
6953
6954		clear_bit(In_sync, &rdev->flags); /* just to be sure */
6955		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
6956			set_bit(WriteMostly, &rdev->flags);
6957		else
6958			clear_bit(WriteMostly, &rdev->flags);
6959		if (info->state & (1<<MD_DISK_FAILFAST))
6960			set_bit(FailFast, &rdev->flags);
6961		else
6962			clear_bit(FailFast, &rdev->flags);
6963
6964		if (info->state & (1<<MD_DISK_JOURNAL)) {
6965			struct md_rdev *rdev2;
6966			bool has_journal = false;
6967
6968			/* make sure there is no existing journal disk */
6969			rdev_for_each(rdev2, mddev) {
6970				if (test_bit(Journal, &rdev2->flags)) {
6971					has_journal = true;
6972					break;
6973				}
6974			}
6975			if (has_journal || mddev->bitmap) {
6976				export_rdev(rdev, mddev);
6977				return -EBUSY;
6978			}
6979			set_bit(Journal, &rdev->flags);
6980		}
6981		/*
6982		 * check whether the device shows up in other nodes
6983		 */
6984		if (mddev_is_clustered(mddev)) {
6985			if (info->state & (1 << MD_DISK_CANDIDATE))
6986				set_bit(Candidate, &rdev->flags);
6987			else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
6988				/* --add initiated by this node */
6989				err = md_cluster_ops->add_new_disk(mddev, rdev);
6990				if (err) {
6991					export_rdev(rdev, mddev);
6992					return err;
6993				}
6994			}
6995		}
6996
6997		rdev->raid_disk = -1;
6998		err = bind_rdev_to_array(rdev, mddev);
6999
7000		if (err)
7001			export_rdev(rdev, mddev);
7002
7003		if (mddev_is_clustered(mddev)) {
7004			if (info->state & (1 << MD_DISK_CANDIDATE)) {
7005				if (!err) {
7006					err = md_cluster_ops->new_disk_ack(mddev,
7007						err == 0);
7008					if (err)
7009						md_kick_rdev_from_array(rdev);
7010				}
7011			} else {
7012				if (err)
7013					md_cluster_ops->add_new_disk_cancel(mddev);
7014				else
7015					err = add_bound_rdev(rdev);
7016			}
7017
7018		} else if (!err)
7019			err = add_bound_rdev(rdev);
7020
7021		return err;
7022	}
7023
7024	/* otherwise, md_add_new_disk is only allowed
7025	 * for major_version==0 superblocks
7026	 */
7027	if (mddev->major_version != 0) {
7028		pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev));
7029		return -EINVAL;
7030	}
7031
7032	if (!(info->state & (1<<MD_DISK_FAULTY))) {
7033		int err;
7034		rdev = md_import_device(dev, -1, 0);
7035		if (IS_ERR(rdev)) {
7036			pr_warn("md: error, md_import_device() returned %ld\n",
7037				PTR_ERR(rdev));
7038			return PTR_ERR(rdev);
7039		}
7040		rdev->desc_nr = info->number;
7041		if (info->raid_disk < mddev->raid_disks)
7042			rdev->raid_disk = info->raid_disk;
7043		else
7044			rdev->raid_disk = -1;
7045
7046		if (rdev->raid_disk < mddev->raid_disks)
7047			if (info->state & (1<<MD_DISK_SYNC))
7048				set_bit(In_sync, &rdev->flags);
7049
7050		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
7051			set_bit(WriteMostly, &rdev->flags);
7052		if (info->state & (1<<MD_DISK_FAILFAST))
7053			set_bit(FailFast, &rdev->flags);
7054
7055		if (!mddev->persistent) {
7056			pr_debug("md: nonpersistent superblock ...\n");
7057			rdev->sb_start = bdev_nr_sectors(rdev->bdev);
7058		} else
7059			rdev->sb_start = calc_dev_sboffset(rdev);
7060		rdev->sectors = rdev->sb_start;
7061
7062		err = bind_rdev_to_array(rdev, mddev);
7063		if (err) {
7064			export_rdev(rdev, mddev);
7065			return err;
7066		}
7067	}
7068
7069	return 0;
7070}
7071
7072static int hot_remove_disk(struct mddev *mddev, dev_t dev)
7073{
7074	struct md_rdev *rdev;
7075
7076	if (!mddev->pers)
7077		return -ENODEV;
7078
7079	rdev = find_rdev(mddev, dev);
7080	if (!rdev)
7081		return -ENXIO;
7082
7083	if (rdev->raid_disk < 0)
7084		goto kick_rdev;
7085
7086	clear_bit(Blocked, &rdev->flags);
7087	remove_and_add_spares(mddev, rdev);
7088
7089	if (rdev->raid_disk >= 0)
7090		goto busy;
7091
7092kick_rdev:
7093	if (mddev_is_clustered(mddev)) {
7094		if (md_cluster_ops->remove_disk(mddev, rdev))
7095			goto busy;
7096	}
7097
7098	md_kick_rdev_from_array(rdev);
7099	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
7100	if (!mddev->thread)
7101		md_update_sb(mddev, 1);
7102	md_new_event();
7103
7104	return 0;
7105busy:
7106	pr_debug("md: cannot remove active disk %pg from %s ...\n",
7107		 rdev->bdev, mdname(mddev));
7108	return -EBUSY;
7109}
7110
7111static int hot_add_disk(struct mddev *mddev, dev_t dev)
7112{
7113	int err;
7114	struct md_rdev *rdev;
7115
7116	if (!mddev->pers)
7117		return -ENODEV;
7118
7119	if (mddev->major_version != 0) {
7120		pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n",
7121			mdname(mddev));
7122		return -EINVAL;
7123	}
7124	if (!mddev->pers->hot_add_disk) {
7125		pr_warn("%s: personality does not support diskops!\n",
7126			mdname(mddev));
7127		return -EINVAL;
7128	}
7129
7130	rdev = md_import_device(dev, -1, 0);
7131	if (IS_ERR(rdev)) {
7132		pr_warn("md: error, md_import_device() returned %ld\n",
7133			PTR_ERR(rdev));
7134		return -EINVAL;
7135	}
7136
7137	if (mddev->persistent)
7138		rdev->sb_start = calc_dev_sboffset(rdev);
7139	else
7140		rdev->sb_start = bdev_nr_sectors(rdev->bdev);
7141
7142	rdev->sectors = rdev->sb_start;
7143
7144	if (test_bit(Faulty, &rdev->flags)) {
7145		pr_warn("md: can not hot-add faulty %pg disk to %s!\n",
7146			rdev->bdev, mdname(mddev));
7147		err = -EINVAL;
7148		goto abort_export;
7149	}
7150
7151	clear_bit(In_sync, &rdev->flags);
7152	rdev->desc_nr = -1;
7153	rdev->saved_raid_disk = -1;
7154	err = bind_rdev_to_array(rdev, mddev);
7155	if (err)
7156		goto abort_export;
7157
7158	/*
7159	 * The rest had better be atomic; disk failures can be
7160	 * noticed in interrupt context ...
7161	 */
7162
7163	rdev->raid_disk = -1;
7164
7165	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
7166	if (!mddev->thread)
7167		md_update_sb(mddev, 1);
7168	/*
7169	 * If the new disk does not support REQ_NOWAIT,
7170	 * disable it on the whole MD.
7171	 */
7172	if (!bdev_nowait(rdev->bdev)) {
7173		pr_info("%s: Disabling nowait because %pg does not support nowait\n",
7174			mdname(mddev), rdev->bdev);
7175		blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->gendisk->queue);
7176	}
7177	/*
7178	 * Kick recovery; maybe this spare has to be added to the
7179	 * array immediately.
7180	 */
7181	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7182	md_new_event();
7183	return 0;
7184
7185abort_export:
7186	export_rdev(rdev, mddev);
7187	return err;
7188}
7189
7190static int set_bitmap_file(struct mddev *mddev, int fd)
7191{
7192	int err = 0;
7193
7194	if (mddev->pers) {
7195		if (!mddev->pers->quiesce || !mddev->thread)
7196			return -EBUSY;
7197		if (mddev->recovery || mddev->sync_thread)
7198			return -EBUSY;
7199		/* we should be able to change the bitmap... */
7200	}
7201
7202	if (fd >= 0) {
7203		struct inode *inode;
7204		struct file *f;
7205
7206		if (mddev->bitmap || mddev->bitmap_info.file)
7207			return -EEXIST; /* cannot add when bitmap is present */
7208
7209		if (!IS_ENABLED(CONFIG_MD_BITMAP_FILE)) {
7210			pr_warn("%s: bitmap files not supported by this kernel\n",
7211				mdname(mddev));
7212			return -EINVAL;
7213		}
7214		pr_warn("%s: using deprecated bitmap file support\n",
7215			mdname(mddev));
7216
7217		f = fget(fd);
7218
7219		if (f == NULL) {
7220			pr_warn("%s: error: failed to get bitmap file\n",
7221				mdname(mddev));
7222			return -EBADF;
7223		}
7224
7225		inode = f->f_mapping->host;
7226		if (!S_ISREG(inode->i_mode)) {
7227			pr_warn("%s: error: bitmap file must be a regular file\n",
7228				mdname(mddev));
7229			err = -EBADF;
7230		} else if (!(f->f_mode & FMODE_WRITE)) {
7231			pr_warn("%s: error: bitmap file must be opened for write\n",
7232				mdname(mddev));
7233			err = -EBADF;
7234		} else if (atomic_read(&inode->i_writecount) != 1) {
7235			pr_warn("%s: error: bitmap file is already in use\n",
7236				mdname(mddev));
7237			err = -EBUSY;
7238		}
7239		if (err) {
7240			fput(f);
7241			return err;
7242		}
7243		mddev->bitmap_info.file = f;
7244		mddev->bitmap_info.offset = 0; /* file overrides offset */
7245	} else if (mddev->bitmap == NULL)
7246		return -ENOENT; /* cannot remove what isn't there */
7247	err = 0;
7248	if (mddev->pers) {
7249		if (fd >= 0) {
7250			struct bitmap *bitmap;
7251
7252			bitmap = md_bitmap_create(mddev, -1);
7253			if (!IS_ERR(bitmap)) {
7254				mddev->bitmap = bitmap;
7255				err = md_bitmap_load(mddev);
7256			} else
7257				err = PTR_ERR(bitmap);
7258			if (err) {
7259				md_bitmap_destroy(mddev);
7260				fd = -1;
7261			}
7262		} else if (fd < 0) {
7263			md_bitmap_destroy(mddev);
7264		}
7265	}
7266	if (fd < 0) {
7267		struct file *f = mddev->bitmap_info.file;
7268		if (f) {
7269			spin_lock(&mddev->lock);
7270			mddev->bitmap_info.file = NULL;
7271			spin_unlock(&mddev->lock);
7272			fput(f);
7273		}
7274	}
7275
7276	return err;
7277}
7278
7279/*
7280 * md_set_array_info is used in two different ways.
7281 * The original usage is when creating a new array.
7282 * In this usage, raid_disks is > 0 and, together with
7283 *  level, size, not_persistent, layout and chunk_size, it determines the
7284 *  shape of the array.
7285 *  This will always create an array with a type-0.90.0 superblock.
7286 * The newer usage is when assembling an array.
7287 *  In this case raid_disks will be 0, and the major_version field is
7288 *  used to determine which style of superblock is to be found on the devices.
7289 *  The minor and patch _version numbers are also kept in case the
7290 *  superblock handler wishes to interpret them.
7291 */
7292int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info)
7293{
7294	if (info->raid_disks == 0) {
7295		/* just setting version number for superblock loading */
7296		if (info->major_version < 0 ||
7297		    info->major_version >= ARRAY_SIZE(super_types) ||
7298		    super_types[info->major_version].name == NULL) {
7299			/* maybe try to auto-load a module? */
7300			pr_warn("md: superblock version %d not known\n",
7301				info->major_version);
7302			return -EINVAL;
7303		}
7304		mddev->major_version = info->major_version;
7305		mddev->minor_version = info->minor_version;
7306		mddev->patch_version = info->patch_version;
7307		mddev->persistent = !info->not_persistent;
7308		/* ensure mddev_put doesn't delete this now that there
7309		 * is some minimal configuration.
7310		 */
7311		mddev->ctime         = ktime_get_real_seconds();
7312		return 0;
7313	}
7314	mddev->major_version = MD_MAJOR_VERSION;
7315	mddev->minor_version = MD_MINOR_VERSION;
7316	mddev->patch_version = MD_PATCHLEVEL_VERSION;
7317	mddev->ctime         = ktime_get_real_seconds();
7318
7319	mddev->level         = info->level;
7320	mddev->clevel[0]     = 0;
7321	mddev->dev_sectors   = 2 * (sector_t)info->size;
7322	mddev->raid_disks    = info->raid_disks;
7323	/* don't set md_minor, it is determined by which /dev/md* was
7324	 * opened
7325	 */
7326	if (info->state & (1<<MD_SB_CLEAN))
7327		mddev->recovery_cp = MaxSector;
7328	else
7329		mddev->recovery_cp = 0;
7330	mddev->persistent    = !info->not_persistent;
7331	mddev->external	     = 0;
7332
7333	mddev->layout        = info->layout;
7334	if (mddev->level == 0)
7335		/* Cannot trust RAID0 layout info here */
7336		mddev->layout = -1;
7337	mddev->chunk_sectors = info->chunk_size >> 9;
7338
7339	if (mddev->persistent) {
7340		mddev->max_disks = MD_SB_DISKS;
7341		mddev->flags = 0;
7342		mddev->sb_flags = 0;
7343	}
7344	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
7345
7346	mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
7347	mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
7348	mddev->bitmap_info.offset = 0;
7349
7350	mddev->reshape_position = MaxSector;
7351
7352	/*
7353	 * Generate a 128 bit UUID
7354	 */
7355	get_random_bytes(mddev->uuid, 16);
7356
7357	mddev->new_level = mddev->level;
7358	mddev->new_chunk_sectors = mddev->chunk_sectors;
7359	mddev->new_layout = mddev->layout;
7360	mddev->delta_disks = 0;
7361	mddev->reshape_backwards = 0;
7362
7363	return 0;
7364}
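/*
 * Userspace sketch of the two usages described in the comment above, shown
 * as independent examples (hypothetical descriptor on /dev/md0, error
 * handling trimmed; SET_ARRAY_INFO comes from <linux/raid/md_u.h>):
 *
 *	mdu_array_info_t info = { 0 };
 *
 *	info.major_version = 1;		// assemble: raid_disks stays 0
 *	ioctl(fd, SET_ARRAY_INFO, &info);
 *
 *	info.raid_disks = 2;		// create: full shape, 0.90 superblock
 *	info.level = 1;
 *	info.size = 1048576;		// per-device size in KiB
 *	ioctl(fd, SET_ARRAY_INFO, &info);
 */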
7365
7366void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors)
7367{
7368	lockdep_assert_held(&mddev->reconfig_mutex);
7369
7370	if (mddev->external_size)
7371		return;
7372
7373	mddev->array_sectors = array_sectors;
7374}
7375EXPORT_SYMBOL(md_set_array_sectors);
7376
7377static int update_size(struct mddev *mddev, sector_t num_sectors)
7378{
7379	struct md_rdev *rdev;
7380	int rv;
7381	int fit = (num_sectors == 0);
7382	sector_t old_dev_sectors = mddev->dev_sectors;
7383
7384	if (mddev->pers->resize == NULL)
7385		return -EINVAL;
7386	/* The "num_sectors" is the number of sectors of each device that
7387	 * is used.  This can only make sense for arrays with redundancy.
7388	 * linear and raid0 always use whatever space is available. We can only
7389	 * consider changing this number if no resync or reconstruction is
7390	 * happening, and if the new size is acceptable. It must fit before the
7391	 * sb_start or, if that is <data_offset, it must fit before the size
7392	 * of each device.  If num_sectors is zero, we find the largest size
7393	 * that fits.
7394	 */
7395	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
7396		return -EBUSY;
7397	if (!md_is_rdwr(mddev))
7398		return -EROFS;
7399
7400	rdev_for_each(rdev, mddev) {
7401		sector_t avail = rdev->sectors;
7402
7403		if (fit && (num_sectors == 0 || num_sectors > avail))
7404			num_sectors = avail;
7405		if (avail < num_sectors)
7406			return -ENOSPC;
7407	}
7408	rv = mddev->pers->resize(mddev, num_sectors);
7409	if (!rv) {
7410		if (mddev_is_clustered(mddev))
7411			md_cluster_ops->update_size(mddev, old_dev_sectors);
7412		else if (!mddev_is_dm(mddev))
7413			set_capacity_and_notify(mddev->gendisk,
7414						mddev->array_sectors);
7415	}
7416	return rv;
7417}
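/*
 * Example of the "fit" case described above: calling update_size() with
 * num_sectors == 0 walks the rdevs and lowers num_sectors to the smallest
 * rdev->sectors, so the array uses the largest per-device size that every
 * member can provide.
 */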
7418
7419static int update_raid_disks(struct mddev *mddev, int raid_disks)
7420{
7421	int rv;
7422	struct md_rdev *rdev;
7423	/* change the number of raid disks */
7424	if (mddev->pers->check_reshape == NULL)
7425		return -EINVAL;
7426	if (!md_is_rdwr(mddev))
7427		return -EROFS;
7428	if (raid_disks <= 0 ||
7429	    (mddev->max_disks && raid_disks >= mddev->max_disks))
7430		return -EINVAL;
7431	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
7432	    test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) ||
7433	    mddev->reshape_position != MaxSector)
7434		return -EBUSY;
7435
7436	rdev_for_each(rdev, mddev) {
7437		if (mddev->raid_disks < raid_disks &&
7438		    rdev->data_offset < rdev->new_data_offset)
7439			return -EINVAL;
7440		if (mddev->raid_disks > raid_disks &&
7441		    rdev->data_offset > rdev->new_data_offset)
7442			return -EINVAL;
7443	}
7444
7445	mddev->delta_disks = raid_disks - mddev->raid_disks;
7446	if (mddev->delta_disks < 0)
7447		mddev->reshape_backwards = 1;
7448	else if (mddev->delta_disks > 0)
7449		mddev->reshape_backwards = 0;
7450
7451	rv = mddev->pers->check_reshape(mddev);
7452	if (rv < 0) {
7453		mddev->delta_disks = 0;
7454		mddev->reshape_backwards = 0;
7455	}
7456	return rv;
7457}
7458
7459/*
7460 * update_array_info is used to change the configuration of an
7461 * on-line array.
7462 * The version, ctime, level, size, raid_disks, not_persistent, layout and chunk_size
7463 * fields in the info are checked against the array.
7464 * Any differences that cannot be handled will cause an error.
7465 * Normally, only one change can be managed at a time.
7466 */
7467static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
7468{
7469	int rv = 0;
7470	int cnt = 0;
7471	int state = 0;
7472
7473	/* calculate expected state, ignoring low bits */
7474	if (mddev->bitmap && mddev->bitmap_info.offset)
7475		state |= (1 << MD_SB_BITMAP_PRESENT);
7476
7477	if (mddev->major_version != info->major_version ||
7478	    mddev->minor_version != info->minor_version ||
7479/*	    mddev->patch_version != info->patch_version || */
7480	    mddev->ctime         != info->ctime         ||
7481	    mddev->level         != info->level         ||
7482/*	    mddev->layout        != info->layout        || */
7483	    mddev->persistent	 != !info->not_persistent ||
7484	    mddev->chunk_sectors != info->chunk_size >> 9 ||
7485	    /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
7486	    ((state^info->state) & 0xfffffe00)
7487		)
7488		return -EINVAL;
7489	/* Check there is only one change */
7490	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
7491		cnt++;
7492	if (mddev->raid_disks != info->raid_disks)
7493		cnt++;
7494	if (mddev->layout != info->layout)
7495		cnt++;
7496	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
7497		cnt++;
7498	if (cnt == 0)
7499		return 0;
7500	if (cnt > 1)
7501		return -EINVAL;
7502
7503	if (mddev->layout != info->layout) {
7504		/* Change layout
7505		 * we don't need to do anything at the md level, the
7506		 * personality will take care of it all.
7507		 */
7508		if (mddev->pers->check_reshape == NULL)
7509			return -EINVAL;
7510		else {
7511			mddev->new_layout = info->layout;
7512			rv = mddev->pers->check_reshape(mddev);
7513			if (rv)
7514				mddev->new_layout = mddev->layout;
7515			return rv;
7516		}
7517	}
7518	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
7519		rv = update_size(mddev, (sector_t)info->size * 2);
7520
7521	if (mddev->raid_disks    != info->raid_disks)
7522		rv = update_raid_disks(mddev, info->raid_disks);
7523
7524	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
7525		if (mddev->pers->quiesce == NULL || mddev->thread == NULL) {
7526			rv = -EINVAL;
7527			goto err;
7528		}
7529		if (mddev->recovery || mddev->sync_thread) {
7530			rv = -EBUSY;
7531			goto err;
7532		}
7533		if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
7534			struct bitmap *bitmap;
7535			/* add the bitmap */
7536			if (mddev->bitmap) {
7537				rv = -EEXIST;
7538				goto err;
7539			}
7540			if (mddev->bitmap_info.default_offset == 0) {
7541				rv = -EINVAL;
7542				goto err;
7543			}
7544			mddev->bitmap_info.offset =
7545				mddev->bitmap_info.default_offset;
7546			mddev->bitmap_info.space =
7547				mddev->bitmap_info.default_space;
7548			bitmap = md_bitmap_create(mddev, -1);
7549			if (!IS_ERR(bitmap)) {
7550				mddev->bitmap = bitmap;
7551				rv = md_bitmap_load(mddev);
7552			} else
7553				rv = PTR_ERR(bitmap);
7554			if (rv)
7555				md_bitmap_destroy(mddev);
7556		} else {
7557			/* remove the bitmap */
7558			if (!mddev->bitmap) {
7559				rv = -ENOENT;
7560				goto err;
7561			}
7562			if (mddev->bitmap->storage.file) {
7563				rv = -EINVAL;
7564				goto err;
7565			}
7566			if (mddev->bitmap_info.nodes) {
				/* hold PW on all the bitmap locks */
7568				if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) {
7569					pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n");
7570					rv = -EPERM;
7571					md_cluster_ops->unlock_all_bitmaps(mddev);
7572					goto err;
7573				}
7574
7575				mddev->bitmap_info.nodes = 0;
7576				md_cluster_ops->leave(mddev);
7577				module_put(md_cluster_mod);
7578				mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
7579			}
7580			md_bitmap_destroy(mddev);
7581			mddev->bitmap_info.offset = 0;
7582		}
7583	}
7584	md_update_sb(mddev, 1);
7585	return rv;
7586err:
7587	return rv;
7588}
7589
7590static int set_disk_faulty(struct mddev *mddev, dev_t dev)
7591{
7592	struct md_rdev *rdev;
7593	int err = 0;
7594
7595	if (mddev->pers == NULL)
7596		return -ENODEV;
7597
7598	rcu_read_lock();
7599	rdev = md_find_rdev_rcu(mddev, dev);
7600	if (!rdev)
7601		err =  -ENODEV;
7602	else {
7603		md_error(mddev, rdev);
7604		if (test_bit(MD_BROKEN, &mddev->flags))
7605			err = -EBUSY;
7606	}
7607	rcu_read_unlock();
7608	return err;
7609}
7610
7611/*
 * We have a problem here: there is no easy way to give a CHS
 * virtual geometry.  We currently pretend that we have 2 heads and
 * 4 sectors (with a BIG number of cylinders...).  This drives
7615 * dosfs just mad... ;-)
7616 */
7617static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
7618{
7619	struct mddev *mddev = bdev->bd_disk->private_data;
7620
7621	geo->heads = 2;
7622	geo->sectors = 4;
7623	geo->cylinders = mddev->array_sectors / 8;
7624	return 0;
7625}
7626
7627static inline int md_ioctl_valid(unsigned int cmd)
7628{
7629	switch (cmd) {
7630	case GET_ARRAY_INFO:
7631	case GET_DISK_INFO:
7632	case RAID_VERSION:
7633		return 0;
7634	case ADD_NEW_DISK:
7635	case GET_BITMAP_FILE:
7636	case HOT_ADD_DISK:
7637	case HOT_REMOVE_DISK:
7638	case RESTART_ARRAY_RW:
7639	case RUN_ARRAY:
7640	case SET_ARRAY_INFO:
7641	case SET_BITMAP_FILE:
7642	case SET_DISK_FAULTY:
7643	case STOP_ARRAY:
7644	case STOP_ARRAY_RO:
7645	case CLUSTERED_DISK_NACK:
7646		if (!capable(CAP_SYS_ADMIN))
7647			return -EACCES;
7648		return 0;
7649	default:
7650		return -ENOTTY;
7651	}
7652}
7653
7654static bool md_ioctl_need_suspend(unsigned int cmd)
7655{
7656	switch (cmd) {
7657	case ADD_NEW_DISK:
7658	case HOT_ADD_DISK:
7659	case HOT_REMOVE_DISK:
7660	case SET_BITMAP_FILE:
7661	case SET_ARRAY_INFO:
7662		return true;
7663	default:
7664		return false;
7665	}
7666}
7667
7668static int __md_set_array_info(struct mddev *mddev, void __user *argp)
7669{
7670	mdu_array_info_t info;
7671	int err;
7672
7673	if (!argp)
7674		memset(&info, 0, sizeof(info));
7675	else if (copy_from_user(&info, argp, sizeof(info)))
7676		return -EFAULT;
7677
7678	if (mddev->pers) {
7679		err = update_array_info(mddev, &info);
7680		if (err)
7681			pr_warn("md: couldn't update array info. %d\n", err);
7682		return err;
7683	}
7684
7685	if (!list_empty(&mddev->disks)) {
7686		pr_warn("md: array %s already has disks!\n", mdname(mddev));
7687		return -EBUSY;
7688	}
7689
7690	if (mddev->raid_disks) {
7691		pr_warn("md: array %s already initialised!\n", mdname(mddev));
7692		return -EBUSY;
7693	}
7694
7695	err = md_set_array_info(mddev, &info);
7696	if (err)
7697		pr_warn("md: couldn't set array info. %d\n", err);
7698
7699	return err;
7700}
7701
7702static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
7703			unsigned int cmd, unsigned long arg)
7704{
7705	int err = 0;
7706	void __user *argp = (void __user *)arg;
7707	struct mddev *mddev = NULL;
7708
7709	err = md_ioctl_valid(cmd);
7710	if (err)
7711		return err;
7712
7713	/*
7714	 * Commands dealing with the RAID driver but not any
7715	 * particular array:
7716	 */
7717	if (cmd == RAID_VERSION)
7718		return get_version(argp);
7719
7720	/*
7721	 * Commands creating/starting a new array:
7722	 */
7723
7724	mddev = bdev->bd_disk->private_data;
7725
	/* Some actions do not require the mutex */
7727	switch (cmd) {
7728	case GET_ARRAY_INFO:
7729		if (!mddev->raid_disks && !mddev->external)
7730			return -ENODEV;
7731		return get_array_info(mddev, argp);
7732
7733	case GET_DISK_INFO:
7734		if (!mddev->raid_disks && !mddev->external)
7735			return -ENODEV;
7736		return get_disk_info(mddev, argp);
7737
7738	case SET_DISK_FAULTY:
7739		return set_disk_faulty(mddev, new_decode_dev(arg));
7740
7741	case GET_BITMAP_FILE:
7742		return get_bitmap_file(mddev, argp);
7743	}
7744
7745	if (cmd == HOT_REMOVE_DISK)
7746		/* need to ensure recovery thread has run */
7747		wait_event_interruptible_timeout(mddev->sb_wait,
7748						 !test_bit(MD_RECOVERY_NEEDED,
7749							   &mddev->recovery),
7750						 msecs_to_jiffies(5000));
7751	if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) {
7752		/* Need to flush page cache, and ensure no-one else opens
7753		 * and writes
7754		 */
7755		err = mddev_set_closing_and_sync_blockdev(mddev, 1);
7756		if (err)
7757			return err;
7758	}
7759
7760	if (!md_is_rdwr(mddev))
7761		flush_work(&mddev->sync_work);
7762
7763	err = md_ioctl_need_suspend(cmd) ? mddev_suspend_and_lock(mddev) :
7764					   mddev_lock(mddev);
7765	if (err) {
7766		pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n",
7767			 err, cmd);
7768		goto out;
7769	}
7770
7771	if (cmd == SET_ARRAY_INFO) {
7772		err = __md_set_array_info(mddev, argp);
7773		goto unlock;
7774	}
7775
7776	/*
7777	 * Commands querying/configuring an existing array:
7778	 */
7779	/* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
7780	 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
7781	if ((!mddev->raid_disks && !mddev->external)
7782	    && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
7783	    && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
7784	    && cmd != GET_BITMAP_FILE) {
7785		err = -ENODEV;
7786		goto unlock;
7787	}
7788
7789	/*
7790	 * Commands even a read-only array can execute:
7791	 */
7792	switch (cmd) {
7793	case RESTART_ARRAY_RW:
7794		err = restart_array(mddev);
7795		goto unlock;
7796
7797	case STOP_ARRAY:
7798		err = do_md_stop(mddev, 0);
7799		goto unlock;
7800
7801	case STOP_ARRAY_RO:
7802		if (mddev->pers)
7803			err = md_set_readonly(mddev);
7804		goto unlock;
7805
7806	case HOT_REMOVE_DISK:
7807		err = hot_remove_disk(mddev, new_decode_dev(arg));
7808		goto unlock;
7809
7810	case ADD_NEW_DISK:
7811		/* We can support ADD_NEW_DISK on read-only arrays
7812		 * only if we are re-adding a preexisting device.
7813		 * So require mddev->pers and MD_DISK_SYNC.
7814		 */
7815		if (mddev->pers) {
7816			mdu_disk_info_t info;
7817			if (copy_from_user(&info, argp, sizeof(info)))
7818				err = -EFAULT;
7819			else if (!(info.state & (1<<MD_DISK_SYNC)))
7820				/* Need to clear read-only for this */
7821				break;
7822			else
7823				err = md_add_new_disk(mddev, &info);
7824			goto unlock;
7825		}
7826		break;
7827	}
7828
7829	/*
7830	 * The remaining ioctls are changing the state of the
7831	 * superblock, so we do not allow them on read-only arrays.
7832	 */
7833	if (!md_is_rdwr(mddev) && mddev->pers) {
7834		if (mddev->ro != MD_AUTO_READ) {
7835			err = -EROFS;
7836			goto unlock;
7837		}
7838		mddev->ro = MD_RDWR;
7839		sysfs_notify_dirent_safe(mddev->sysfs_state);
7840		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7841		/* mddev_unlock will wake thread */
7842		/* If a device failed while we were read-only, we
7843		 * need to make sure the metadata is updated now.
7844		 */
7845		if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) {
7846			mddev_unlock(mddev);
7847			wait_event(mddev->sb_wait,
7848				   !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) &&
7849				   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
7850			mddev_lock_nointr(mddev);
7851		}
7852	}
7853
7854	switch (cmd) {
7855	case ADD_NEW_DISK:
7856	{
7857		mdu_disk_info_t info;
7858		if (copy_from_user(&info, argp, sizeof(info)))
7859			err = -EFAULT;
7860		else
7861			err = md_add_new_disk(mddev, &info);
7862		goto unlock;
7863	}
7864
7865	case CLUSTERED_DISK_NACK:
7866		if (mddev_is_clustered(mddev))
7867			md_cluster_ops->new_disk_ack(mddev, false);
7868		else
7869			err = -EINVAL;
7870		goto unlock;
7871
7872	case HOT_ADD_DISK:
7873		err = hot_add_disk(mddev, new_decode_dev(arg));
7874		goto unlock;
7875
7876	case RUN_ARRAY:
7877		err = do_md_run(mddev);
7878		goto unlock;
7879
7880	case SET_BITMAP_FILE:
7881		err = set_bitmap_file(mddev, (int)arg);
7882		goto unlock;
7883
7884	default:
7885		err = -EINVAL;
7886		goto unlock;
7887	}
7888
7889unlock:
7890	if (mddev->hold_active == UNTIL_IOCTL &&
7891	    err != -EINVAL)
7892		mddev->hold_active = 0;
7893
7894	md_ioctl_need_suspend(cmd) ? mddev_unlock_and_resume(mddev) :
7895				     mddev_unlock(mddev);
7896
7897out:
7898	if (cmd == STOP_ARRAY_RO || (err && cmd == STOP_ARRAY))
7899		clear_bit(MD_CLOSING, &mddev->flags);
7900	return err;
7901}
7902#ifdef CONFIG_COMPAT
7903static int md_compat_ioctl(struct block_device *bdev, blk_mode_t mode,
7904		    unsigned int cmd, unsigned long arg)
7905{
7906	switch (cmd) {
7907	case HOT_REMOVE_DISK:
7908	case HOT_ADD_DISK:
7909	case SET_DISK_FAULTY:
7910	case SET_BITMAP_FILE:
7911		/* These take in integer arg, do not convert */
7912		break;
7913	default:
7914		arg = (unsigned long)compat_ptr(arg);
7915		break;
7916	}
7917
7918	return md_ioctl(bdev, mode, cmd, arg);
7919}
7920#endif /* CONFIG_COMPAT */
7921
7922static int md_set_read_only(struct block_device *bdev, bool ro)
7923{
7924	struct mddev *mddev = bdev->bd_disk->private_data;
7925	int err;
7926
7927	err = mddev_lock(mddev);
7928	if (err)
7929		return err;
7930
7931	if (!mddev->raid_disks && !mddev->external) {
7932		err = -ENODEV;
7933		goto out_unlock;
7934	}
7935
7936	/*
7937	 * Transitioning to read-auto need only happen for arrays that call
7938	 * md_write_start and which are not ready for writes yet.
7939	 */
7940	if (!ro && mddev->ro == MD_RDONLY && mddev->pers) {
7941		err = restart_array(mddev);
7942		if (err)
7943			goto out_unlock;
7944		mddev->ro = MD_AUTO_READ;
7945	}
7946
7947out_unlock:
7948	mddev_unlock(mddev);
7949	return err;
7950}
7951
7952static int md_open(struct gendisk *disk, blk_mode_t mode)
7953{
7954	struct mddev *mddev;
7955	int err;
7956
7957	spin_lock(&all_mddevs_lock);
7958	mddev = mddev_get(disk->private_data);
7959	spin_unlock(&all_mddevs_lock);
7960	if (!mddev)
7961		return -ENODEV;
7962
7963	err = mutex_lock_interruptible(&mddev->open_mutex);
7964	if (err)
7965		goto out;
7966
7967	err = -ENODEV;
7968	if (test_bit(MD_CLOSING, &mddev->flags))
7969		goto out_unlock;
7970
7971	atomic_inc(&mddev->openers);
7972	mutex_unlock(&mddev->open_mutex);
7973
7974	disk_check_media_change(disk);
7975	return 0;
7976
7977out_unlock:
7978	mutex_unlock(&mddev->open_mutex);
7979out:
7980	mddev_put(mddev);
7981	return err;
7982}
7983
7984static void md_release(struct gendisk *disk)
7985{
7986	struct mddev *mddev = disk->private_data;
7987
7988	BUG_ON(!mddev);
7989	atomic_dec(&mddev->openers);
7990	mddev_put(mddev);
7991}
7992
7993static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing)
7994{
7995	struct mddev *mddev = disk->private_data;
7996	unsigned int ret = 0;
7997
7998	if (mddev->changed)
7999		ret = DISK_EVENT_MEDIA_CHANGE;
8000	mddev->changed = 0;
8001	return ret;
8002}
8003
8004static void md_free_disk(struct gendisk *disk)
8005{
8006	struct mddev *mddev = disk->private_data;
8007
8008	mddev_free(mddev);
8009}
8010
8011const struct block_device_operations md_fops =
8012{
8013	.owner		= THIS_MODULE,
8014	.submit_bio	= md_submit_bio,
8015	.open		= md_open,
8016	.release	= md_release,
8017	.ioctl		= md_ioctl,
8018#ifdef CONFIG_COMPAT
8019	.compat_ioctl	= md_compat_ioctl,
8020#endif
8021	.getgeo		= md_getgeo,
8022	.check_events	= md_check_events,
8023	.set_read_only	= md_set_read_only,
8024	.free_disk	= md_free_disk,
8025};
8026
8027static int md_thread(void *arg)
8028{
8029	struct md_thread *thread = arg;
8030
8031	/*
	 * md_thread is a 'system-thread'; its priority should be very
8033	 * high. We avoid resource deadlocks individually in each
8034	 * raid personality. (RAID5 does preallocation) We also use RR and
8035	 * the very same RT priority as kswapd, thus we will never get
8036	 * into a priority inversion deadlock.
8037	 *
8038	 * we definitely have to have equal or higher priority than
8039	 * bdflush, otherwise bdflush will deadlock if there are too
8040	 * many dirty RAID5 blocks.
8041	 */
8042
8043	allow_signal(SIGKILL);
8044	while (!kthread_should_stop()) {
8045
8046		/* We need to wait INTERRUPTIBLE so that
8047		 * we don't add to the load-average.
8048		 * That means we need to be sure no signals are
8049		 * pending
8050		 */
8051		if (signal_pending(current))
8052			flush_signals(current);
8053
8054		wait_event_interruptible_timeout
8055			(thread->wqueue,
8056			 test_bit(THREAD_WAKEUP, &thread->flags)
8057			 || kthread_should_stop() || kthread_should_park(),
8058			 thread->timeout);
8059
8060		clear_bit(THREAD_WAKEUP, &thread->flags);
8061		if (kthread_should_park())
8062			kthread_parkme();
8063		if (!kthread_should_stop())
8064			thread->run(thread);
8065	}
8066
8067	return 0;
8068}
8069
8070static void md_wakeup_thread_directly(struct md_thread __rcu *thread)
8071{
8072	struct md_thread *t;
8073
8074	rcu_read_lock();
8075	t = rcu_dereference(thread);
8076	if (t)
8077		wake_up_process(t->tsk);
8078	rcu_read_unlock();
8079}
8080
8081void md_wakeup_thread(struct md_thread __rcu *thread)
8082{
8083	struct md_thread *t;
8084
8085	rcu_read_lock();
8086	t = rcu_dereference(thread);
8087	if (t) {
8088		pr_debug("md: waking up MD thread %s.\n", t->tsk->comm);
8089		set_bit(THREAD_WAKEUP, &t->flags);
8090		wake_up(&t->wqueue);
8091	}
8092	rcu_read_unlock();
8093}
8094EXPORT_SYMBOL(md_wakeup_thread);
8095
8096struct md_thread *md_register_thread(void (*run) (struct md_thread *),
8097		struct mddev *mddev, const char *name)
8098{
8099	struct md_thread *thread;
8100
8101	thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL);
8102	if (!thread)
8103		return NULL;
8104
8105	init_waitqueue_head(&thread->wqueue);
8106
8107	thread->run = run;
8108	thread->mddev = mddev;
8109	thread->timeout = MAX_SCHEDULE_TIMEOUT;
8110	thread->tsk = kthread_run(md_thread, thread,
8111				  "%s_%s",
8112				  mdname(thread->mddev),
8113				  name);
8114	if (IS_ERR(thread->tsk)) {
8115		kfree(thread);
8116		return NULL;
8117	}
8118	return thread;
8119}
8120EXPORT_SYMBOL(md_register_thread);
8121
8122void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp)
8123{
8124	struct md_thread *thread = rcu_dereference_protected(*threadp,
8125					lockdep_is_held(&mddev->reconfig_mutex));
8126
8127	if (!thread)
8128		return;
8129
8130	rcu_assign_pointer(*threadp, NULL);
8131	synchronize_rcu();
8132
8133	pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
8134	kthread_stop(thread->tsk);
8135	kfree(thread);
8136}
8137EXPORT_SYMBOL(md_unregister_thread);
8138
8139void md_error(struct mddev *mddev, struct md_rdev *rdev)
8140{
8141	if (!rdev || test_bit(Faulty, &rdev->flags))
8142		return;
8143
8144	if (!mddev->pers || !mddev->pers->error_handler)
8145		return;
8146	mddev->pers->error_handler(mddev, rdev);
8147
8148	if (mddev->pers->level == 0)
8149		return;
8150
8151	if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags))
8152		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
8153	sysfs_notify_dirent_safe(rdev->sysfs_state);
8154	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8155	if (!test_bit(MD_BROKEN, &mddev->flags)) {
8156		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8157		md_wakeup_thread(mddev->thread);
8158	}
8159	if (mddev->event_work.func)
8160		queue_work(md_misc_wq, &mddev->event_work);
8161	md_new_event();
8162}
8163EXPORT_SYMBOL(md_error);
8164
8165/* seq_file implementation /proc/mdstat */
8166
8167static void status_unused(struct seq_file *seq)
8168{
8169	int i = 0;
8170	struct md_rdev *rdev;
8171
8172	seq_printf(seq, "unused devices: ");
8173
8174	list_for_each_entry(rdev, &pending_raid_disks, same_set) {
8175		i++;
8176		seq_printf(seq, "%pg ", rdev->bdev);
8177	}
8178	if (!i)
8179		seq_printf(seq, "<none>");
8180
8181	seq_printf(seq, "\n");
8182}
8183
8184static void status_personalities(struct seq_file *seq)
8185{
8186	struct md_personality *pers;
8187
8188	seq_puts(seq, "Personalities : ");
8189	spin_lock(&pers_lock);
8190	list_for_each_entry(pers, &pers_list, list)
8191		seq_printf(seq, "[%s] ", pers->name);
8192
8193	spin_unlock(&pers_lock);
8194	seq_puts(seq, "\n");
8195}
8196
8197static int status_resync(struct seq_file *seq, struct mddev *mddev)
8198{
8199	sector_t max_sectors, resync, res;
8200	unsigned long dt, db = 0;
8201	sector_t rt, curr_mark_cnt, resync_mark_cnt;
8202	int scale, recovery_active;
8203	unsigned int per_milli;
8204
8205	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
8206	    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
8207		max_sectors = mddev->resync_max_sectors;
8208	else
8209		max_sectors = mddev->dev_sectors;
8210
8211	resync = mddev->curr_resync;
8212	if (resync < MD_RESYNC_ACTIVE) {
8213		if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
8214			/* Still cleaning up */
8215			resync = max_sectors;
8216	} else if (resync > max_sectors) {
8217		resync = max_sectors;
8218	} else {
8219		res = atomic_read(&mddev->recovery_active);
8220		/*
8221		 * Resync has started, but the subtraction has overflowed or
8222		 * yielded one of the special values. Force it to active to
8223		 * ensure the status reports an active resync.
8224		 */
8225		if (resync < res || resync - res < MD_RESYNC_ACTIVE)
8226			resync = MD_RESYNC_ACTIVE;
8227		else
8228			resync -= res;
8229	}
8230
8231	if (resync == MD_RESYNC_NONE) {
8232		if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) {
8233			struct md_rdev *rdev;
8234
8235			rdev_for_each(rdev, mddev)
8236				if (rdev->raid_disk >= 0 &&
8237				    !test_bit(Faulty, &rdev->flags) &&
8238				    rdev->recovery_offset != MaxSector &&
8239				    rdev->recovery_offset) {
8240					seq_printf(seq, "\trecover=REMOTE");
8241					return 1;
8242				}
8243			if (mddev->reshape_position != MaxSector)
8244				seq_printf(seq, "\treshape=REMOTE");
8245			else
8246				seq_printf(seq, "\tresync=REMOTE");
8247			return 1;
8248		}
8249		if (mddev->recovery_cp < MaxSector) {
8250			seq_printf(seq, "\tresync=PENDING");
8251			return 1;
8252		}
8253		return 0;
8254	}
8255	if (resync < MD_RESYNC_ACTIVE) {
8256		seq_printf(seq, "\tresync=DELAYED");
8257		return 1;
8258	}
8259
8260	WARN_ON(max_sectors == 0);
8261	/* Pick 'scale' such that (resync>>scale)*1000 will fit
8262	 * in a sector_t, and (max_sectors>>scale) will fit in a
8263	 * u32, as those are the requirements for sector_div.
8264	 * Thus 'scale' must be at least 10
8265	 */
8266	scale = 10;
8267	if (sizeof(sector_t) > sizeof(unsigned long)) {
8268		while ( max_sectors/2 > (1ULL<<(scale+32)))
8269			scale++;
8270	}
8271	res = (resync>>scale)*1000;
8272	sector_div(res, (u32)((max_sectors>>scale)+1));
8273
8274	per_milli = res;
8275	{
8276		int i, x = per_milli/50, y = 20-x;
8277		seq_printf(seq, "[");
8278		for (i = 0; i < x; i++)
8279			seq_printf(seq, "=");
8280		seq_printf(seq, ">");
8281		for (i = 0; i < y; i++)
8282			seq_printf(seq, ".");
8283		seq_printf(seq, "] ");
8284	}
8285	seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
8286		   (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
8287		    "reshape" :
8288		    (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
8289		     "check" :
8290		     (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
8291		      "resync" : "recovery"))),
8292		   per_milli/10, per_milli % 10,
8293		   (unsigned long long) resync/2,
8294		   (unsigned long long) max_sectors/2);
8295
8296	/*
8297	 * dt: time from mark until now
8298	 * db: blocks written from mark until now
8299	 * rt: remaining time
8300	 *
8301	 * rt is a sector_t, which is always 64bit now. We are keeping
8302	 * the original algorithm, but it is not really necessary.
8303	 *
8304	 * Original algorithm:
8305	 *   So we divide before multiply in case it is 32bit and close
8306	 *   to the limit.
8307	 *   We scale the divisor (db) by 32 to avoid losing precision
8308	 *   near the end of resync when the number of remaining sectors
8309	 *   is close to 'db'.
8310	 *   We then divide rt by 32 after multiplying by db to compensate.
8311	 *   The '+1' avoids division by zero if db is very small.
8312	 */
8313	dt = ((jiffies - mddev->resync_mark) / HZ);
8314	if (!dt) dt++;
8315
8316	curr_mark_cnt = mddev->curr_mark_cnt;
8317	recovery_active = atomic_read(&mddev->recovery_active);
8318	resync_mark_cnt = mddev->resync_mark_cnt;
8319
8320	if (curr_mark_cnt >= (recovery_active + resync_mark_cnt))
8321		db = curr_mark_cnt - (recovery_active + resync_mark_cnt);
8322
8323	rt = max_sectors - resync;    /* number of remaining sectors */
8324	rt = div64_u64(rt, db/32+1);
8325	rt *= dt;
8326	rt >>= 5;
8327
8328	seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60,
8329		   ((unsigned long)rt % 60)/6);
8330
8331	seq_printf(seq, " speed=%ldK/sec", db/2/dt);
8332	return 1;
8333}
8334
8335static void *md_seq_start(struct seq_file *seq, loff_t *pos)
8336	__acquires(&all_mddevs_lock)
8337{
8338	seq->poll_event = atomic_read(&md_event_count);
8339	spin_lock(&all_mddevs_lock);
8340
8341	return seq_list_start_head(&all_mddevs, *pos);
8342}
8343
8344static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
8345{
8346	return seq_list_next(v, &all_mddevs, pos);
8347}
8348
8349static void md_seq_stop(struct seq_file *seq, void *v)
8350	__releases(&all_mddevs_lock)
8351{
8352	spin_unlock(&all_mddevs_lock);
8353}
8354
8355static int md_seq_show(struct seq_file *seq, void *v)
8356{
8357	struct mddev *mddev;
8358	sector_t sectors;
8359	struct md_rdev *rdev;
8360
8361	if (v == &all_mddevs) {
8362		status_personalities(seq);
8363		if (list_empty(&all_mddevs))
8364			status_unused(seq);
8365		return 0;
8366	}
8367
8368	mddev = list_entry(v, struct mddev, all_mddevs);
8369	if (!mddev_get(mddev))
8370		return 0;
8371
8372	spin_unlock(&all_mddevs_lock);
8373	spin_lock(&mddev->lock);
8374	if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
8375		seq_printf(seq, "%s : %sactive", mdname(mddev),
8376						mddev->pers ? "" : "in");
8377		if (mddev->pers) {
8378			if (mddev->ro == MD_RDONLY)
8379				seq_printf(seq, " (read-only)");
8380			if (mddev->ro == MD_AUTO_READ)
8381				seq_printf(seq, " (auto-read-only)");
8382			seq_printf(seq, " %s", mddev->pers->name);
8383		}
8384
8385		sectors = 0;
8386		rcu_read_lock();
8387		rdev_for_each_rcu(rdev, mddev) {
8388			seq_printf(seq, " %pg[%d]", rdev->bdev, rdev->desc_nr);
8389
8390			if (test_bit(WriteMostly, &rdev->flags))
8391				seq_printf(seq, "(W)");
8392			if (test_bit(Journal, &rdev->flags))
8393				seq_printf(seq, "(J)");
8394			if (test_bit(Faulty, &rdev->flags)) {
8395				seq_printf(seq, "(F)");
8396				continue;
8397			}
8398			if (rdev->raid_disk < 0)
8399				seq_printf(seq, "(S)"); /* spare */
8400			if (test_bit(Replacement, &rdev->flags))
8401				seq_printf(seq, "(R)");
8402			sectors += rdev->sectors;
8403		}
8404		rcu_read_unlock();
8405
8406		if (!list_empty(&mddev->disks)) {
8407			if (mddev->pers)
8408				seq_printf(seq, "\n      %llu blocks",
8409					   (unsigned long long)
8410					   mddev->array_sectors / 2);
8411			else
8412				seq_printf(seq, "\n      %llu blocks",
8413					   (unsigned long long)sectors / 2);
8414		}
8415		if (mddev->persistent) {
8416			if (mddev->major_version != 0 ||
8417			    mddev->minor_version != 90) {
8418				seq_printf(seq," super %d.%d",
8419					   mddev->major_version,
8420					   mddev->minor_version);
8421			}
8422		} else if (mddev->external)
8423			seq_printf(seq, " super external:%s",
8424				   mddev->metadata_type);
8425		else
8426			seq_printf(seq, " super non-persistent");
8427
8428		if (mddev->pers) {
8429			mddev->pers->status(seq, mddev);
8430			seq_printf(seq, "\n      ");
8431			if (mddev->pers->sync_request) {
8432				if (status_resync(seq, mddev))
8433					seq_printf(seq, "\n      ");
8434			}
8435		} else
8436			seq_printf(seq, "\n       ");
8437
8438		md_bitmap_status(seq, mddev->bitmap);
8439
8440		seq_printf(seq, "\n");
8441	}
8442	spin_unlock(&mddev->lock);
8443	spin_lock(&all_mddevs_lock);
8444
8445	if (mddev == list_last_entry(&all_mddevs, struct mddev, all_mddevs))
8446		status_unused(seq);
8447
8448	if (atomic_dec_and_test(&mddev->active))
8449		__mddev_put(mddev);
8450
8451	return 0;
8452}
8453
8454static const struct seq_operations md_seq_ops = {
8455	.start  = md_seq_start,
8456	.next   = md_seq_next,
8457	.stop   = md_seq_stop,
8458	.show   = md_seq_show,
8459};
8460
8461static int md_seq_open(struct inode *inode, struct file *file)
8462{
8463	struct seq_file *seq;
8464	int error;
8465
8466	error = seq_open(file, &md_seq_ops);
8467	if (error)
8468		return error;
8469
8470	seq = file->private_data;
8471	seq->poll_event = atomic_read(&md_event_count);
8472	return error;
8473}
8474
8475static int md_unloading;
8476static __poll_t mdstat_poll(struct file *filp, poll_table *wait)
8477{
8478	struct seq_file *seq = filp->private_data;
8479	__poll_t mask;
8480
8481	if (md_unloading)
8482		return EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI;
8483	poll_wait(filp, &md_event_waiters, wait);
8484
8485	/* always allow read */
8486	mask = EPOLLIN | EPOLLRDNORM;
8487
8488	if (seq->poll_event != atomic_read(&md_event_count))
8489		mask |= EPOLLERR | EPOLLPRI;
8490	return mask;
8491}
8492
8493static const struct proc_ops mdstat_proc_ops = {
8494	.proc_open	= md_seq_open,
8495	.proc_read	= seq_read,
8496	.proc_lseek	= seq_lseek,
8497	.proc_release	= seq_release,
8498	.proc_poll	= mdstat_poll,
8499};
8500
8501int register_md_personality(struct md_personality *p)
8502{
8503	pr_debug("md: %s personality registered for level %d\n",
8504		 p->name, p->level);
8505	spin_lock(&pers_lock);
8506	list_add_tail(&p->list, &pers_list);
8507	spin_unlock(&pers_lock);
8508	return 0;
8509}
8510EXPORT_SYMBOL(register_md_personality);
8511
8512int unregister_md_personality(struct md_personality *p)
8513{
8514	pr_debug("md: %s personality unregistered\n", p->name);
8515	spin_lock(&pers_lock);
8516	list_del_init(&p->list);
8517	spin_unlock(&pers_lock);
8518	return 0;
8519}
8520EXPORT_SYMBOL(unregister_md_personality);
8521
8522int register_md_cluster_operations(struct md_cluster_operations *ops,
8523				   struct module *module)
8524{
8525	int ret = 0;
8526	spin_lock(&pers_lock);
8527	if (md_cluster_ops != NULL)
8528		ret = -EALREADY;
8529	else {
8530		md_cluster_ops = ops;
8531		md_cluster_mod = module;
8532	}
8533	spin_unlock(&pers_lock);
8534	return ret;
8535}
8536EXPORT_SYMBOL(register_md_cluster_operations);
8537
8538int unregister_md_cluster_operations(void)
8539{
8540	spin_lock(&pers_lock);
8541	md_cluster_ops = NULL;
8542	spin_unlock(&pers_lock);
8543	return 0;
8544}
8545EXPORT_SYMBOL(unregister_md_cluster_operations);
8546
8547int md_setup_cluster(struct mddev *mddev, int nodes)
8548{
8549	int ret;
8550	if (!md_cluster_ops)
8551		request_module("md-cluster");
8552	spin_lock(&pers_lock);
8553	/* ensure module won't be unloaded */
8554	if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
8555		pr_warn("can't find md-cluster module or get its reference.\n");
8556		spin_unlock(&pers_lock);
8557		return -ENOENT;
8558	}
8559	spin_unlock(&pers_lock);
8560
8561	ret = md_cluster_ops->join(mddev, nodes);
8562	if (!ret)
8563		mddev->safemode_delay = 0;
8564	return ret;
8565}
8566
8567void md_cluster_stop(struct mddev *mddev)
8568{
8569	if (!md_cluster_ops)
8570		return;
8571	md_cluster_ops->leave(mddev);
8572	module_put(md_cluster_mod);
8573}
8574
8575static int is_mddev_idle(struct mddev *mddev, int init)
8576{
8577	struct md_rdev *rdev;
8578	int idle;
8579	int curr_events;
8580
8581	idle = 1;
8582	rcu_read_lock();
8583	rdev_for_each_rcu(rdev, mddev) {
8584		struct gendisk *disk = rdev->bdev->bd_disk;
8585		curr_events = (int)part_stat_read_accum(disk->part0, sectors) -
8586			      atomic_read(&disk->sync_io);
8587		/* sync IO will cause sync_io to increase before the disk_stats
8588		 * as sync_io is counted when a request starts, and
8589		 * disk_stats is counted when it completes.
8590		 * So resync activity will cause curr_events to be smaller than
8591		 * when there was no such activity.
		 * non-sync IO will cause disk_stats to increase without
8593		 * increasing sync_io so curr_events will (eventually)
8594		 * be larger than it was before.  Once it becomes
8595		 * substantially larger, the test below will cause
8596		 * the array to appear non-idle, and resync will slow
8597		 * down.
8598		 * If there is a lot of outstanding resync activity when
		 * we set last_events to curr_events, then all that activity
8600		 * completing might cause the array to appear non-idle
8601		 * and resync will be slowed down even though there might
8602		 * not have been non-resync activity.  This will only
8603		 * happen once though.  'last_events' will soon reflect
8604		 * the state where there is little or no outstanding
8605		 * resync requests, and further resync activity will
8606		 * always make curr_events less than last_events.
8607		 *
8608		 */
8609		if (init || curr_events - rdev->last_events > 64) {
8610			rdev->last_events = curr_events;
8611			idle = 0;
8612		}
8613	}
8614	rcu_read_unlock();
8615	return idle;
8616}
8617
8618void md_done_sync(struct mddev *mddev, int blocks, int ok)
8619{
8620	/* another "blocks" (512byte) blocks have been synced */
8621	atomic_sub(blocks, &mddev->recovery_active);
8622	wake_up(&mddev->recovery_wait);
8623	if (!ok) {
8624		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8625		set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
8626		md_wakeup_thread(mddev->thread);
		/* stop recovery, signal do_sync */
8628	}
8629}
8630EXPORT_SYMBOL(md_done_sync);
8631
8632/* md_write_start(mddev, bi)
8633 * If we need to update some array metadata (e.g. 'active' flag
8634 * in superblock) before writing, schedule a superblock update
8635 * and wait for it to complete.
8636 * A return value of 'false' means that the write wasn't recorded
 * and cannot proceed because the array is being suspended.
8638 */
8639bool md_write_start(struct mddev *mddev, struct bio *bi)
8640{
8641	int did_change = 0;
8642
8643	if (bio_data_dir(bi) != WRITE)
8644		return true;
8645
8646	BUG_ON(mddev->ro == MD_RDONLY);
8647	if (mddev->ro == MD_AUTO_READ) {
8648		/* need to switch to read/write */
8649		flush_work(&mddev->sync_work);
8650		mddev->ro = MD_RDWR;
8651		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8652		md_wakeup_thread(mddev->thread);
8653		md_wakeup_thread(mddev->sync_thread);
8654		did_change = 1;
8655	}
8656	rcu_read_lock();
8657	percpu_ref_get(&mddev->writes_pending);
8658	smp_mb(); /* Match smp_mb in set_in_sync() */
8659	if (mddev->safemode == 1)
8660		mddev->safemode = 0;
8661	/* sync_checkers is always 0 when writes_pending is in per-cpu mode */
8662	if (mddev->in_sync || mddev->sync_checkers) {
8663		spin_lock(&mddev->lock);
8664		if (mddev->in_sync) {
8665			mddev->in_sync = 0;
8666			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
8667			set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
8668			md_wakeup_thread(mddev->thread);
8669			did_change = 1;
8670		}
8671		spin_unlock(&mddev->lock);
8672	}
8673	rcu_read_unlock();
8674	if (did_change)
8675		sysfs_notify_dirent_safe(mddev->sysfs_state);
8676	if (!mddev->has_superblocks)
8677		return true;
8678	wait_event(mddev->sb_wait,
8679		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) ||
8680		   is_md_suspended(mddev));
8681	if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
8682		percpu_ref_put(&mddev->writes_pending);
8683		return false;
8684	}
8685	return true;
8686}
8687EXPORT_SYMBOL(md_write_start);
8688
8689/* md_write_inc can only be called when md_write_start() has
 * already been called at least once for the current request.
8691 * It increments the counter and is useful when a single request
8692 * is split into several parts.  Each part causes an increment and
8693 * so needs a matching md_write_end().
8694 * Unlike md_write_start(), it is safe to call md_write_inc() inside
8695 * a spinlocked region.
8696 */
8697void md_write_inc(struct mddev *mddev, struct bio *bi)
8698{
8699	if (bio_data_dir(bi) != WRITE)
8700		return;
8701	WARN_ON_ONCE(mddev->in_sync || !md_is_rdwr(mddev));
8702	percpu_ref_get(&mddev->writes_pending);
8703}
8704EXPORT_SYMBOL(md_write_inc);
8705
8706void md_write_end(struct mddev *mddev)
8707{
8708	percpu_ref_put(&mddev->writes_pending);
8709
8710	if (mddev->safemode == 2)
8711		md_wakeup_thread(mddev->thread);
8712	else if (mddev->safemode_delay)
8713		/* The roundup() ensures this only performs locking once
8714		 * every ->safemode_delay jiffies
8715		 */
8716		mod_timer(&mddev->safemode_timer,
8717			  roundup(jiffies, mddev->safemode_delay) +
8718			  mddev->safemode_delay);
8719}
8720
8721EXPORT_SYMBOL(md_write_end);
8722
8723/* This is used by raid0 and raid10 */
8724void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
8725			struct bio *bio, sector_t start, sector_t size)
8726{
8727	struct bio *discard_bio = NULL;
8728
8729	if (__blkdev_issue_discard(rdev->bdev, start, size, GFP_NOIO,
8730			&discard_bio) || !discard_bio)
8731		return;
8732
8733	bio_chain(discard_bio, bio);
8734	bio_clone_blkg_association(discard_bio, bio);
8735	mddev_trace_remap(mddev, discard_bio, bio->bi_iter.bi_sector);
8736	submit_bio_noacct(discard_bio);
8737}
8738EXPORT_SYMBOL_GPL(md_submit_discard_bio);
8739
8740static void md_end_clone_io(struct bio *bio)
8741{
8742	struct md_io_clone *md_io_clone = bio->bi_private;
8743	struct bio *orig_bio = md_io_clone->orig_bio;
8744	struct mddev *mddev = md_io_clone->mddev;
8745
8746	if (bio->bi_status && !orig_bio->bi_status)
8747		orig_bio->bi_status = bio->bi_status;
8748
8749	if (md_io_clone->start_time)
8750		bio_end_io_acct(orig_bio, md_io_clone->start_time);
8751
8752	bio_put(bio);
8753	bio_endio(orig_bio);
8754	percpu_ref_put(&mddev->active_io);
8755}
8756
8757static void md_clone_bio(struct mddev *mddev, struct bio **bio)
8758{
8759	struct block_device *bdev = (*bio)->bi_bdev;
8760	struct md_io_clone *md_io_clone;
8761	struct bio *clone =
8762		bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set);
8763
8764	md_io_clone = container_of(clone, struct md_io_clone, bio_clone);
8765	md_io_clone->orig_bio = *bio;
8766	md_io_clone->mddev = mddev;
8767	if (blk_queue_io_stat(bdev->bd_disk->queue))
8768		md_io_clone->start_time = bio_start_io_acct(*bio);
8769
8770	clone->bi_end_io = md_end_clone_io;
8771	clone->bi_private = md_io_clone;
8772	*bio = clone;
8773}
8774
8775void md_account_bio(struct mddev *mddev, struct bio **bio)
8776{
8777	percpu_ref_get(&mddev->active_io);
8778	md_clone_bio(mddev, bio);
8779}
8780EXPORT_SYMBOL_GPL(md_account_bio);
8781
8782void md_free_cloned_bio(struct bio *bio)
8783{
8784	struct md_io_clone *md_io_clone = bio->bi_private;
8785	struct bio *orig_bio = md_io_clone->orig_bio;
8786	struct mddev *mddev = md_io_clone->mddev;
8787
8788	if (bio->bi_status && !orig_bio->bi_status)
8789		orig_bio->bi_status = bio->bi_status;
8790
8791	if (md_io_clone->start_time)
8792		bio_end_io_acct(orig_bio, md_io_clone->start_time);
8793
8794	bio_put(bio);
8795	percpu_ref_put(&mddev->active_io);
8796}
8797EXPORT_SYMBOL_GPL(md_free_cloned_bio);
8798
8799/* md_allow_write(mddev)
8800 * Calling this ensures that the array is marked 'active' so that writes
8801 * may proceed without blocking.  It is important to call this before
8802 * attempting a GFP_KERNEL allocation while holding the mddev lock.
8803 * Must be called with mddev_lock held.
8804 */
8805void md_allow_write(struct mddev *mddev)
8806{
8807	if (!mddev->pers)
8808		return;
8809	if (!md_is_rdwr(mddev))
8810		return;
8811	if (!mddev->pers->sync_request)
8812		return;
8813
8814	spin_lock(&mddev->lock);
8815	if (mddev->in_sync) {
8816		mddev->in_sync = 0;
8817		set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
8818		set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
8819		if (mddev->safemode_delay &&
8820		    mddev->safemode == 0)
8821			mddev->safemode = 1;
8822		spin_unlock(&mddev->lock);
8823		md_update_sb(mddev, 0);
8824		sysfs_notify_dirent_safe(mddev->sysfs_state);
8825		/* wait for the dirty state to be recorded in the metadata */
8826		wait_event(mddev->sb_wait,
8827			   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
8828	} else
8829		spin_unlock(&mddev->lock);
8830}
8831EXPORT_SYMBOL_GPL(md_allow_write);
8832
8833#define SYNC_MARKS	10
8834#define	SYNC_MARK_STEP	(3*HZ)
8835#define UPDATE_FREQUENCY (5*60*HZ)
8836void md_do_sync(struct md_thread *thread)
8837{
8838	struct mddev *mddev = thread->mddev;
8839	struct mddev *mddev2;
8840	unsigned int currspeed = 0, window;
8841	sector_t max_sectors,j, io_sectors, recovery_done;
8842	unsigned long mark[SYNC_MARKS];
8843	unsigned long update_time;
8844	sector_t mark_cnt[SYNC_MARKS];
8845	int last_mark,m;
8846	sector_t last_check;
8847	int skipped = 0;
8848	struct md_rdev *rdev;
8849	char *desc, *action = NULL;
8850	struct blk_plug plug;
8851	int ret;
8852
	/* just in case the thread restarts... */
8854	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
8855		return;
8856
8857	if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
8858		goto skip;
8859
8860	if (test_bit(MD_RECOVERY_WAIT, &mddev->recovery) ||
8861	    !md_is_rdwr(mddev)) {/* never try to sync a read-only array */
8862		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8863		goto skip;
8864	}
8865
8866	if (mddev_is_clustered(mddev)) {
8867		ret = md_cluster_ops->resync_start(mddev);
8868		if (ret)
8869			goto skip;
8870
8871		set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags);
8872		if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
8873			test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ||
8874			test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
8875		     && ((unsigned long long)mddev->curr_resync_completed
8876			 < (unsigned long long)mddev->resync_max_sectors))
8877			goto skip;
8878	}
8879
8880	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
8881		if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
8882			desc = "data-check";
8883			action = "check";
8884		} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
8885			desc = "requested-resync";
8886			action = "repair";
8887		} else
8888			desc = "resync";
8889	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
8890		desc = "reshape";
8891	else
8892		desc = "recovery";
8893
8894	mddev->last_sync_action = action ?: desc;
8895
8896	/*
	 * Before starting a resync we must have set curr_resync to
	 * MD_RESYNC_DELAYED, and then checked that every "conflicting" array
	 * has curr_resync less than ours.  When we find one that is the same
	 * or higher we wait on resync_wait.  To avoid deadlock, we reduce
	 * curr_resync to MD_RESYNC_YIELDED if we choose to yield (based
	 * arbitrarily on the address of the mddev structure).  This means we
	 * have to start checking from the beginning again.
8903	 *
8904	 */
8905
8906	do {
8907		int mddev2_minor = -1;
8908		mddev->curr_resync = MD_RESYNC_DELAYED;
8909
8910	try_again:
8911		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
8912			goto skip;
8913		spin_lock(&all_mddevs_lock);
8914		list_for_each_entry(mddev2, &all_mddevs, all_mddevs) {
8915			if (test_bit(MD_DELETED, &mddev2->flags))
8916				continue;
8917			if (mddev2 == mddev)
8918				continue;
8919			if (!mddev->parallel_resync
8920			&&  mddev2->curr_resync
8921			&&  match_mddev_units(mddev, mddev2)) {
8922				DEFINE_WAIT(wq);
8923				if (mddev < mddev2 &&
8924				    mddev->curr_resync == MD_RESYNC_DELAYED) {
8925					/* arbitrarily yield */
8926					mddev->curr_resync = MD_RESYNC_YIELDED;
8927					wake_up(&resync_wait);
8928				}
8929				if (mddev > mddev2 &&
8930				    mddev->curr_resync == MD_RESYNC_YIELDED)
8931					/* no need to wait here, we can wait the next
					 * time 'round when curr_resync == MD_RESYNC_DELAYED
8933					 */
8934					continue;
8935				/* We need to wait 'interruptible' so as not to
8936				 * contribute to the load average, and not to
8937				 * be caught by 'softlockup'
8938				 */
8939				prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
8940				if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
8941				    mddev2->curr_resync >= mddev->curr_resync) {
8942					if (mddev2_minor != mddev2->md_minor) {
8943						mddev2_minor = mddev2->md_minor;
8944						pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n",
8945							desc, mdname(mddev),
8946							mdname(mddev2));
8947					}
8948					spin_unlock(&all_mddevs_lock);
8949
8950					if (signal_pending(current))
8951						flush_signals(current);
8952					schedule();
8953					finish_wait(&resync_wait, &wq);
8954					goto try_again;
8955				}
8956				finish_wait(&resync_wait, &wq);
8957			}
8958		}
8959		spin_unlock(&all_mddevs_lock);
8960	} while (mddev->curr_resync < MD_RESYNC_DELAYED);
8961
8962	j = 0;
8963	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
8964		/* resync follows the size requested by the personality,
8965		 * which defaults to physical size, but can be virtual size
8966		 */
8967		max_sectors = mddev->resync_max_sectors;
8968		atomic64_set(&mddev->resync_mismatches, 0);
8969		/* we don't use the checkpoint if there's a bitmap */
8970		if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
8971			j = mddev->resync_min;
8972		else if (!mddev->bitmap)
8973			j = mddev->recovery_cp;
8974
8975	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
8976		max_sectors = mddev->resync_max_sectors;
8977		/*
8978		 * If the original node aborts reshaping then we continue the
		 * reshaping, so set j again to avoid restarting the reshape
		 * from the very beginning
8981		 */
8982		if (mddev_is_clustered(mddev) &&
8983		    mddev->reshape_position != MaxSector)
8984			j = mddev->reshape_position;
8985	} else {
8986		/* recovery follows the physical size of devices */
8987		max_sectors = mddev->dev_sectors;
8988		j = MaxSector;
8989		rcu_read_lock();
8990		rdev_for_each_rcu(rdev, mddev)
8991			if (rdev->raid_disk >= 0 &&
8992			    !test_bit(Journal, &rdev->flags) &&
8993			    !test_bit(Faulty, &rdev->flags) &&
8994			    !test_bit(In_sync, &rdev->flags) &&
8995			    rdev->recovery_offset < j)
8996				j = rdev->recovery_offset;
8997		rcu_read_unlock();
8998
8999		/* If there is a bitmap, we need to make sure all
9000		 * writes that started before we added a spare
9001		 * complete before we start doing a recovery.
9002		 * Otherwise the write might complete and (via
9003		 * bitmap_endwrite) set a bit in the bitmap after the
9004		 * recovery has checked that bit and skipped that
9005		 * region.
9006		 */
9007		if (mddev->bitmap) {
9008			mddev->pers->quiesce(mddev, 1);
9009			mddev->pers->quiesce(mddev, 0);
9010		}
9011	}
9012
9013	pr_info("md: %s of RAID array %s\n", desc, mdname(mddev));
9014	pr_debug("md: minimum _guaranteed_  speed: %d KB/sec/disk.\n", speed_min(mddev));
9015	pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n",
9016		 speed_max(mddev), desc);
9017
9018	is_mddev_idle(mddev, 1); /* this initializes IO event counters */
9019
9020	io_sectors = 0;
9021	for (m = 0; m < SYNC_MARKS; m++) {
9022		mark[m] = jiffies;
9023		mark_cnt[m] = io_sectors;
9024	}
9025	last_mark = 0;
9026	mddev->resync_mark = mark[last_mark];
9027	mddev->resync_mark_cnt = mark_cnt[last_mark];
9028
9029	/*
9030	 * Tune reconstruction:
9031	 */
9032	window = 32 * (PAGE_SIZE / 512);
9033	pr_debug("md: using %dk window, over a total of %lluk.\n",
9034		 window/2, (unsigned long long)max_sectors/2);
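	/*
	 * For example, with 4K pages the window is 32 * 8 = 256 sectors
	 * (128 KiB), reported by the debug message above as a 128k window.
	 */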
9035
9036	atomic_set(&mddev->recovery_active, 0);
9037	last_check = 0;
9038
9039	if (j >= MD_RESYNC_ACTIVE) {
9040		pr_debug("md: resuming %s of %s from checkpoint.\n",
9041			 desc, mdname(mddev));
9042		mddev->curr_resync = j;
9043	} else
9044		mddev->curr_resync = MD_RESYNC_ACTIVE; /* no longer delayed */
9045	mddev->curr_resync_completed = j;
9046	sysfs_notify_dirent_safe(mddev->sysfs_completed);
9047	md_new_event();
9048	update_time = jiffies;
9049
9050	blk_start_plug(&plug);
9051	while (j < max_sectors) {
9052		sector_t sectors;
9053
9054		skipped = 0;
9055
9056		if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
9057		    ((mddev->curr_resync > mddev->curr_resync_completed &&
9058		      (mddev->curr_resync - mddev->curr_resync_completed)
9059		      > (max_sectors >> 4)) ||
9060		     time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
9061		     (j - mddev->curr_resync_completed)*2
9062		     >= mddev->resync_max - mddev->curr_resync_completed ||
9063		     mddev->curr_resync_completed > mddev->resync_max
9064			    )) {
9065			/* time to update curr_resync_completed */
9066			wait_event(mddev->recovery_wait,
9067				   atomic_read(&mddev->recovery_active) == 0);
9068			mddev->curr_resync_completed = j;
9069			if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
9070			    j > mddev->recovery_cp)
9071				mddev->recovery_cp = j;
9072			update_time = jiffies;
9073			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
9074			sysfs_notify_dirent_safe(mddev->sysfs_completed);
9075		}
9076
9077		while (j >= mddev->resync_max &&
9078		       !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
9079			/* As this condition is controlled by user-space,
9080			 * we can block indefinitely, so use '_interruptible'
9081			 * to avoid triggering warnings.
9082			 */
9083			flush_signals(current); /* just in case */
9084			wait_event_interruptible(mddev->recovery_wait,
9085						 mddev->resync_max > j
9086						 || test_bit(MD_RECOVERY_INTR,
9087							     &mddev->recovery));
9088		}
9089
9090		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
9091			break;
9092
9093		sectors = mddev->pers->sync_request(mddev, j, &skipped);
9094		if (sectors == 0) {
9095			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
9096			break;
9097		}
9098
9099		if (!skipped) { /* actual IO requested */
9100			io_sectors += sectors;
9101			atomic_add(sectors, &mddev->recovery_active);
9102		}
9103
9104		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
9105			break;
9106
9107		j += sectors;
9108		if (j > max_sectors)
9109			/* when skipping, extra large numbers can be returned. */
9110			j = max_sectors;
9111		if (j >= MD_RESYNC_ACTIVE)
9112			mddev->curr_resync = j;
9113		mddev->curr_mark_cnt = io_sectors;
9114		if (last_check == 0)
9115			/* this is the earliest that rebuild will be
9116			 * visible in /proc/mdstat
9117			 */
9118			md_new_event();
9119
9120		if (last_check + window > io_sectors || j == max_sectors)
9121			continue;
9122
9123		last_check = io_sectors;
9124	repeat:
9125		if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
9126			/* step marks */
9127			int next = (last_mark+1) % SYNC_MARKS;
9128
9129			mddev->resync_mark = mark[next];
9130			mddev->resync_mark_cnt = mark_cnt[next];
9131			mark[next] = jiffies;
9132			mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
9133			last_mark = next;
9134		}
9135
9136		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
9137			break;
9138
9139		/*
		 * this loop exits only when we are slower than
9141		 * the 'hard' speed limit, or the system was IO-idle for
9142		 * a jiffy.
9143		 * the system might be non-idle CPU-wise, but we only care
9144		 * about not overloading the IO subsystem. (things like an
9145		 * e2fsck being done on the RAID array should execute fast)
9146		 */
9147		cond_resched();
9148
9149		recovery_done = io_sectors - atomic_read(&mddev->recovery_active);
9150		currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2
9151			/((jiffies-mddev->resync_mark)/HZ +1) +1;
9152
9153		if (currspeed > speed_min(mddev)) {
9154			if (currspeed > speed_max(mddev)) {
9155				msleep(500);
9156				goto repeat;
9157			}
9158			if (!is_mddev_idle(mddev, 0)) {
9159				/*
9160				 * Give other IO more of a chance.
9161				 * The faster the devices, the less we wait.
9162				 */
9163				wait_event(mddev->recovery_wait,
9164					   !atomic_read(&mddev->recovery_active));
9165			}
9166		}
9167	}
9168	pr_info("md: %s: %s %s.\n",mdname(mddev), desc,
9169		test_bit(MD_RECOVERY_INTR, &mddev->recovery)
9170		? "interrupted" : "done");
9171	/*
9172	 * this also signals 'finished resyncing' to md_stop
9173	 */
9174	blk_finish_plug(&plug);
9175	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
9176
9177	if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
9178	    !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
9179	    mddev->curr_resync >= MD_RESYNC_ACTIVE) {
9180		mddev->curr_resync_completed = mddev->curr_resync;
9181		sysfs_notify_dirent_safe(mddev->sysfs_completed);
9182	}
9183	mddev->pers->sync_request(mddev, max_sectors, &skipped);
9184
9185	if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
9186	    mddev->curr_resync > MD_RESYNC_ACTIVE) {
9187		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
9188			if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
9189				if (mddev->curr_resync >= mddev->recovery_cp) {
9190					pr_debug("md: checkpointing %s of %s.\n",
9191						 desc, mdname(mddev));
9192					if (test_bit(MD_RECOVERY_ERROR,
9193						&mddev->recovery))
9194						mddev->recovery_cp =
9195							mddev->curr_resync_completed;
9196					else
9197						mddev->recovery_cp =
9198							mddev->curr_resync;
9199				}
9200			} else
9201				mddev->recovery_cp = MaxSector;
9202		} else {
9203			if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
9204				mddev->curr_resync = MaxSector;
9205			if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
9206			    test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) {
9207				rcu_read_lock();
9208				rdev_for_each_rcu(rdev, mddev)
9209					if (rdev->raid_disk >= 0 &&
9210					    mddev->delta_disks >= 0 &&
9211					    !test_bit(Journal, &rdev->flags) &&
9212					    !test_bit(Faulty, &rdev->flags) &&
9213					    !test_bit(In_sync, &rdev->flags) &&
9214					    rdev->recovery_offset < mddev->curr_resync)
9215						rdev->recovery_offset = mddev->curr_resync;
9216				rcu_read_unlock();
9217			}
9218		}
9219	}
9220 skip:
	/* Set CHANGE_PENDING here since another update may still be needed,
	 * so that other nodes are informed.  It should be harmless for a
	 * normal (non-clustered) raid. */
9224	set_mask_bits(&mddev->sb_flags, 0,
9225		      BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS));
9226
9227	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
9228			!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
9229			mddev->delta_disks > 0 &&
9230			mddev->pers->finish_reshape &&
9231			mddev->pers->size &&
9232			!mddev_is_dm(mddev)) {
9233		mddev_lock_nointr(mddev);
9234		md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
9235		mddev_unlock(mddev);
9236		if (!mddev_is_clustered(mddev))
9237			set_capacity_and_notify(mddev->gendisk,
9238						mddev->array_sectors);
9239	}
9240
9241	spin_lock(&mddev->lock);
9242	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
9243		/* We completed so min/max setting can be forgotten if used. */
9244		if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
9245			mddev->resync_min = 0;
9246		mddev->resync_max = MaxSector;
9247	} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
9248		mddev->resync_min = mddev->curr_resync_completed;
9249	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
9250	mddev->curr_resync = MD_RESYNC_NONE;
9251	spin_unlock(&mddev->lock);
9252
9253	wake_up(&resync_wait);
9254	md_wakeup_thread(mddev->thread);
9255	return;
9256}
9257EXPORT_SYMBOL_GPL(md_do_sync);
9258
9259static bool rdev_removeable(struct md_rdev *rdev)
9260{
9261	/* rdev is not used. */
9262	if (rdev->raid_disk < 0)
9263		return false;
9264
	/* There is still inflight I/O, don't remove this rdev. */
9266	if (atomic_read(&rdev->nr_pending))
9267		return false;
9268
9269	/*
9270	 * An error occurred but has not yet been acknowledged by the metadata
9271	 * handler, don't remove this rdev.
9272	 */
9273	if (test_bit(Blocked, &rdev->flags))
9274		return false;
9275
	/* A Faulty rdev is not used, so it's safe to remove it. */
9277	if (test_bit(Faulty, &rdev->flags))
9278		return true;
9279
9280	/* Journal disk can only be removed if it's faulty. */
9281	if (test_bit(Journal, &rdev->flags))
9282		return false;
9283
9284	/*
	 * 'In_sync' is cleared while 'raid_disk' is valid, which means the
	 * replacement has just become active from pers->spare_active(), and
	 * pers->hot_remove_disk() will then replace this rdev with its replacement.
9288	 */
9289	if (!test_bit(In_sync, &rdev->flags))
9290		return true;
9291
9292	return false;
9293}
9294
9295static bool rdev_is_spare(struct md_rdev *rdev)
9296{
9297	return !test_bit(Candidate, &rdev->flags) && rdev->raid_disk >= 0 &&
9298	       !test_bit(In_sync, &rdev->flags) &&
9299	       !test_bit(Journal, &rdev->flags) &&
9300	       !test_bit(Faulty, &rdev->flags);
9301}
9302
9303static bool rdev_addable(struct md_rdev *rdev)
9304{
9305	/* rdev is already used, don't add it again. */
9306	if (test_bit(Candidate, &rdev->flags) || rdev->raid_disk >= 0 ||
9307	    test_bit(Faulty, &rdev->flags))
9308		return false;
9309
	/* Allow adding a journal disk. */
9311	if (test_bit(Journal, &rdev->flags))
9312		return true;
9313
	/* Allow adding if the array is read-write. */
9315	if (md_is_rdwr(rdev->mddev))
9316		return true;
9317
9318	/*
	 * For a read-only array, only allow re-adding an rdev.  And if a
	 * bitmap is used, don't allow re-adding an rdev that is too old.
9321	 */
9322	if (rdev->saved_raid_disk >= 0 && !test_bit(Bitmap_sync, &rdev->flags))
9323		return true;
9324
9325	return false;
9326}
9327
9328static bool md_spares_need_change(struct mddev *mddev)
9329{
9330	struct md_rdev *rdev;
9331
9332	rcu_read_lock();
9333	rdev_for_each_rcu(rdev, mddev) {
9334		if (rdev_removeable(rdev) || rdev_addable(rdev)) {
9335			rcu_read_unlock();
9336			return true;
9337		}
9338	}
9339	rcu_read_unlock();
9340	return false;
9341}
9342
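/*
 * Remove any removeable rdevs, then hot-add every addable one. If @this
 * is non-NULL, only that rdev is considered. Returns the number of
 * spares, i.e. devices that still need to be recovered.
 */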
9343static int remove_and_add_spares(struct mddev *mddev,
9344				 struct md_rdev *this)
9345{
9346	struct md_rdev *rdev;
9347	int spares = 0;
9348	int removed = 0;
9349
9350	if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
9351		/* Mustn't remove devices when resync thread is running */
9352		return 0;
9353
9354	rdev_for_each(rdev, mddev) {
9355		if ((this == NULL || rdev == this) && rdev_removeable(rdev) &&
9356		    !mddev->pers->hot_remove_disk(mddev, rdev)) {
9357			sysfs_unlink_rdev(mddev, rdev);
9358			rdev->saved_raid_disk = rdev->raid_disk;
9359			rdev->raid_disk = -1;
9360			removed++;
9361		}
9362	}
9363
9364	if (removed && mddev->kobj.sd)
9365		sysfs_notify_dirent_safe(mddev->sysfs_degraded);
9366
9367	if (this && removed)
9368		goto no_add;
9369
9370	rdev_for_each(rdev, mddev) {
9371		if (this && this != rdev)
9372			continue;
9373		if (rdev_is_spare(rdev))
9374			spares++;
9375		if (!rdev_addable(rdev))
9376			continue;
9377		if (!test_bit(Journal, &rdev->flags))
9378			rdev->recovery_offset = 0;
9379		if (mddev->pers->hot_add_disk(mddev, rdev) == 0) {
9380			/* failure here is OK */
9381			sysfs_link_rdev(mddev, rdev);
9382			if (!test_bit(Journal, &rdev->flags))
9383				spares++;
9384			md_new_event();
9385			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
9386		}
9387	}
9388no_add:
9389	if (removed)
9390		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
9391	return spares;
9392}
9393
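/*
 * Pick the next sync action (reshape, recovery or resync/check/repair)
 * and set the corresponding MD_RECOVERY_* bits. *@spares is set to the
 * number of spares found. Returns false when there is nothing to do.
 */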
9394static bool md_choose_sync_action(struct mddev *mddev, int *spares)
9395{
9396	/* Check if reshape is in progress first. */
9397	if (mddev->reshape_position != MaxSector) {
9398		if (mddev->pers->check_reshape == NULL ||
9399		    mddev->pers->check_reshape(mddev) != 0)
9400			return false;
9401
9402		set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
9403		clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
9404		return true;
9405	}
9406
9407	/*
9408	 * Remove any failed drives, then add spares if possible. Spares are
9409	 * also removed and re-added, to allow the personality to fail the
9410	 * re-add.
9411	 */
9412	*spares = remove_and_add_spares(mddev, NULL);
9413	if (*spares) {
9414		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
9415		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
9416		clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
9417
9418		/* Start new recovery. */
9419		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
9420		return true;
9421	}
9422
9423	/* Check if recovery is in progress. */
9424	if (mddev->recovery_cp < MaxSector) {
9425		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
9426		clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
9427		return true;
9428	}
9429
9430	/* Delay to choose resync/check/repair in md_do_sync(). */
9431	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
9432		return true;
9433
9434	/* Nothing to be done */
9435	return false;
9436}
9437
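/*
 * Worker for mddev->sync_work: it is queued from md_check_recovery() with
 * MD_RECOVERY_RUNNING already set, suspends the array if the set of
 * spares must change, and then registers the actual sync thread.
 */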
9438static void md_start_sync(struct work_struct *ws)
9439{
9440	struct mddev *mddev = container_of(ws, struct mddev, sync_work);
9441	int spares = 0;
9442	bool suspend = false;
9443	char *name;
9444
9445	/*
9446	 * If reshape is still in progress, spares won't be added or removed
9447	 * from conf until reshape is done.
9448	 */
9449	if (mddev->reshape_position == MaxSector &&
9450	    md_spares_need_change(mddev)) {
9451		suspend = true;
9452		mddev_suspend(mddev, false);
9453	}
9454
9455	mddev_lock_nointr(mddev);
9456	if (!md_is_rdwr(mddev)) {
9457		/*
9458		 * On a read-only array we can:
9459		 * - remove failed devices
9460		 * - add already-in_sync devices if the array itself is in-sync.
9461		 * As we only add devices that are already in-sync, we can
9462		 * activate the spares immediately.
9463		 */
9464		remove_and_add_spares(mddev, NULL);
9465		goto not_running;
9466	}
9467
9468	if (!md_choose_sync_action(mddev, &spares))
9469		goto not_running;
9470
9471	if (!mddev->pers->sync_request)
9472		goto not_running;
9473
9474	/*
9475	 * We are adding a device or devices to an array which has the bitmap
9476	 * stored on all devices. So make sure all bitmap pages get written.
9477	 */
9478	if (spares)
9479		md_bitmap_write_all(mddev->bitmap);
9480
9481	name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ?
9482			"reshape" : "resync";
9483	rcu_assign_pointer(mddev->sync_thread,
9484			   md_register_thread(md_do_sync, mddev, name));
9485	if (!mddev->sync_thread) {
9486		pr_warn("%s: could not start resync thread...\n",
9487			mdname(mddev));
9488		/* leave the spares where they are, it shouldn't hurt */
9489		goto not_running;
9490	}
9491
9492	mddev_unlock(mddev);
9493	/*
9494	 * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should
9495	 * not set it again. Otherwise, we may cause an issue like this one:
9496	 *     https://bugzilla.kernel.org/show_bug.cgi?id=218200
9497	 * Therefore, use __mddev_resume(mddev, false).
9498	 */
9499	if (suspend)
9500		__mddev_resume(mddev, false);
9501	md_wakeup_thread(mddev->sync_thread);
9502	sysfs_notify_dirent_safe(mddev->sysfs_action);
9503	md_new_event();
9504	return;
9505
9506not_running:
9507	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
9508	clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
9509	clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
9510	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
9511	clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
9512	mddev_unlock(mddev);
9513	/*
9514	 * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should
9515	 * not set it again. Otherwise, we may cause an issue like this one:
9516	 *     https://bugzilla.kernel.org/show_bug.cgi?id=218200
9517	 * Therefore, use __mddev_resume(mddev, false).
9518	 */
9519	if (suspend)
9520		__mddev_resume(mddev, false);
9521
9522	wake_up(&resync_wait);
9523	if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
9524	    mddev->sysfs_action)
9525		sysfs_notify_dirent_safe(mddev->sysfs_action);
9526}
9527
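/*
 * Called with MD_RECOVERY_RUNNING set: if the sync thread has not yet
 * signalled MD_RECOVERY_DONE it is left alone, otherwise it is reaped.
 */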
9528static void unregister_sync_thread(struct mddev *mddev)
9529{
9530	if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
9531		/* resync/recovery still happening */
9532		clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
9533		return;
9534	}
9535
9536	if (WARN_ON_ONCE(!mddev->sync_thread))
9537		return;
9538
9539	md_reap_sync_thread(mddev);
9540}
9541
9542/*
9543 * This routine is regularly called by all per-raid-array threads to
9544 * deal with generic issues like resync and super-block update.
9545 * Raid personalities that don't have a thread (linear/raid0) do not
9546 * need this as they never do any recovery or update the superblock.
9547 *
9548 * It does not do any resync itself, but rather "forks" off other threads
9549 * to do that as needed.
9550 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
9551 * "->recovery" and create a thread at ->sync_thread.
9552 * When the thread finishes it sets MD_RECOVERY_DONE
9553 * and wakes up this thread, which will reap the thread and finish up.
9554 * This thread also removes any faulty devices (with nr_pending == 0).
9555 *
9556 * The overall approach is:
9557 *  1/ if the superblock needs updating, update it.
9558 *  2/ If a recovery thread is running, don't do anything else.
9559 *  3/ If recovery has finished, clean up, possibly marking spares active.
9560 *  4/ If there are any faulty devices, remove them.
9561 *  5/ If the array is degraded, try to add spare devices.
9562 *  6/ If array has spares or is not in-sync, start a resync thread.
9563 */
9564void md_check_recovery(struct mddev *mddev)
9565{
9566	if (mddev->bitmap)
9567		md_bitmap_daemon_work(mddev);
9568
9569	if (signal_pending(current)) {
9570		if (mddev->pers->sync_request && !mddev->external) {
9571			pr_debug("md: %s in immediate safe mode\n",
9572				 mdname(mddev));
9573			mddev->safemode = 2;
9574		}
9575		flush_signals(current);
9576	}
9577
9578	if (!md_is_rdwr(mddev) &&
9579	    !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) &&
9580	    !test_bit(MD_RECOVERY_DONE, &mddev->recovery))
9581		return;
9582	if (!((mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) ||
9583	      test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
9584	      test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
9585	      (mddev->external == 0 && mddev->safemode == 1) ||
9586	      (mddev->safemode == 2 && !mddev->in_sync &&
9587	       mddev->recovery_cp == MaxSector)))
9588		return;
9591
9592	if (mddev_trylock(mddev)) {
9593		bool try_set_sync = mddev->safemode != 0;
9594
9595		if (!mddev->external && mddev->safemode == 1)
9596			mddev->safemode = 0;
9597
9598		if (!md_is_rdwr(mddev)) {
9599			struct md_rdev *rdev;
9600
9601			if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
9602				unregister_sync_thread(mddev);
9603				goto unlock;
9604			}
9605
9606			if (!mddev->external && mddev->in_sync)
9607				/*
9608				 * The 'Blocked' flag is not needed, as failed devices
9609				 * will be recorded if the array is switched to read/write.
9610				 * Leaving it set would prevent the device
9611				 * from being removed.
9612				 */
9613				rdev_for_each(rdev, mddev)
9614					clear_bit(Blocked, &rdev->flags);
9615
9616			/*
9617			 * There is no thread, but we need to call
9618			 * ->spare_active and clear saved_raid_disk
9619			 */
9620			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
9621			md_reap_sync_thread(mddev);
9622
9623			/*
9624			 * Let md_start_sync() remove and add rdevs to the
9625			 * array.
9626			 */
9627			if (md_spares_need_change(mddev)) {
9628				set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
9629				queue_work(md_misc_wq, &mddev->sync_work);
9630			}
9631
9632			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
9633			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
9634			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
9635
9636			goto unlock;
9637		}
9638
9639		if (mddev_is_clustered(mddev)) {
9640			struct md_rdev *rdev, *tmp;
9641			/* Kick the device if another node issued a
9642			 * remove-disk request.
9643			 */
9644			rdev_for_each_safe(rdev, tmp, mddev) {
9645				if (test_and_clear_bit(ClusterRemove, &rdev->flags) &&
9646						rdev->raid_disk < 0)
9647					md_kick_rdev_from_array(rdev);
9648			}
9649		}
9650
9651		if (try_set_sync && !mddev->external && !mddev->in_sync) {
9652			spin_lock(&mddev->lock);
9653			set_in_sync(mddev);
9654			spin_unlock(&mddev->lock);
9655		}
9656
9657		if (mddev->sb_flags)
9658			md_update_sb(mddev, 0);
9659
9660		/*
9661		 * Never start a new sync thread if MD_RECOVERY_RUNNING is
9662		 * still set.
9663		 */
9664		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
9665			unregister_sync_thread(mddev);
9666			goto unlock;
9667		}
9668
9669		/* Set RUNNING before clearing NEEDED to avoid
9670		 * any transients in the value of "sync_action".
9671		 */
9672		mddev->curr_resync_completed = 0;
9673		spin_lock(&mddev->lock);
9674		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
9675		spin_unlock(&mddev->lock);
9676		/* Clear some bits that don't mean anything, but
9677		 * might be left set
9678		 */
9679		clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
9680		clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
9681
9682		if (test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) &&
9683		    !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
9684			queue_work(md_misc_wq, &mddev->sync_work);
9685		} else {
9686			clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
9687			wake_up(&resync_wait);
9688		}
9689
9690	unlock:
9691		wake_up(&mddev->sb_wait);
9692		mddev_unlock(mddev);
9693	}
9694}
9695EXPORT_SYMBOL(md_check_recovery);
9696
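/*
 * Reap a finished sync thread: unregister it, activate spares on success,
 * finish any reshape, write the superblocks out and clear the recovery
 * state so md_check_recovery() can decide what to do next.
 */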
9697void md_reap_sync_thread(struct mddev *mddev)
9698{
9699	struct md_rdev *rdev;
9700	sector_t old_dev_sectors = mddev->dev_sectors;
9701	bool is_reshaped = false;
9702
9703	/* resync has finished, collect result */
9704	md_unregister_thread(mddev, &mddev->sync_thread);
9705	atomic_inc(&mddev->sync_seq);
9706
9707	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
9708	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
9709	    mddev->degraded != mddev->raid_disks) {
9710		/* success...*/
9711		/* activate any spares */
9712		if (mddev->pers->spare_active(mddev)) {
9713			sysfs_notify_dirent_safe(mddev->sysfs_degraded);
9714			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
9715		}
9716	}
9717	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
9718	    mddev->pers->finish_reshape) {
9719		mddev->pers->finish_reshape(mddev);
9720		if (mddev_is_clustered(mddev))
9721			is_reshaped = true;
9722	}
9723
9724	/* If the array is no longer degraded, then any saved_raid_disk
9725	 * information must be scrapped.
9726	 */
9727	if (!mddev->degraded)
9728		rdev_for_each(rdev, mddev)
9729			rdev->saved_raid_disk = -1;
9730
9731	md_update_sb(mddev, 1);
9732	/* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can
9733	 * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by
9734	 * clustered raid */
9735	if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags))
9736		md_cluster_ops->resync_finish(mddev);
9737	clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
9738	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
9739	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
9740	clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
9741	clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
9742	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
9743	/*
9744	 * We call md_cluster_ops->update_size here because sync_size could
9745	 * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared,
9746	 * so it is time to update the size across the cluster.
9747	 */
9748	if (mddev_is_clustered(mddev) && is_reshaped
9749				      && !test_bit(MD_CLOSING, &mddev->flags))
9750		md_cluster_ops->update_size(mddev, old_dev_sectors);
9751	/* flag recovery needed just to double check */
9752	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
9753	sysfs_notify_dirent_safe(mddev->sysfs_completed);
9754	sysfs_notify_dirent_safe(mddev->sysfs_action);
9755	md_new_event();
9756	if (mddev->event_work.func)
9757		queue_work(md_misc_wq, &mddev->event_work);
9758	wake_up(&resync_wait);
9759}
9760EXPORT_SYMBOL(md_reap_sync_thread);
9761
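/*
 * Wait up to five seconds for @rdev to become unblocked, then drop the
 * nr_pending reference the caller holds on it.
 */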
9762void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
9763{
9764	sysfs_notify_dirent_safe(rdev->sysfs_state);
9765	wait_event_timeout(rdev->blocked_wait,
9766			   !test_bit(Blocked, &rdev->flags) &&
9767			   !test_bit(BlockedBadBlocks, &rdev->flags),
9768			   msecs_to_jiffies(5000));
9769	rdev_dec_pending(rdev, mddev);
9770}
9771EXPORT_SYMBOL(md_wait_for_blocked_rdev);
9772
9773void md_finish_reshape(struct mddev *mddev)
9774{
9775	/* Called by the personality module when a reshape completes. */
9776	struct md_rdev *rdev;
9777
9778	rdev_for_each(rdev, mddev) {
9779		if (rdev->data_offset > rdev->new_data_offset)
9780			rdev->sectors += rdev->data_offset - rdev->new_data_offset;
9781		else
9782			rdev->sectors -= rdev->new_data_offset - rdev->data_offset;
9783		rdev->data_offset = rdev->new_data_offset;
9784	}
9785}
9786EXPORT_SYMBOL(md_finish_reshape);
9787
9788/* Bad block management */
9789
9790/* Returns 1 on success, 0 on failure */
9791int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
9792		       int is_new)
9793{
9794	struct mddev *mddev = rdev->mddev;
9795	int rv;
9796	if (is_new)
9797		s += rdev->new_data_offset;
9798	else
9799		s += rdev->data_offset;
9800	rv = badblocks_set(&rdev->badblocks, s, sectors, 0);
9801	if (rv == 0) {
9802		/* Make sure they get written out promptly */
9803		if (test_bit(ExternalBbl, &rdev->flags))
9804			sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks);
9805		sysfs_notify_dirent_safe(rdev->sysfs_state);
9806		set_mask_bits(&mddev->sb_flags, 0,
9807			      BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
9808		md_wakeup_thread(rdev->mddev->thread);
9809		return 1;
9810	}
9811	return 0;
9812}
9813EXPORT_SYMBOL_GPL(rdev_set_badblocks);
9814
9815int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
9816			 int is_new)
9817{
9818	int rv;
9819	if (is_new)
9820		s += rdev->new_data_offset;
9821	else
9822		s += rdev->data_offset;
9823	rv = badblocks_clear(&rdev->badblocks, s, sectors);
9824	if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags))
9825		sysfs_notify_dirent_safe(rdev->sysfs_badblocks);
9826	return rv;
9827}
9828EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
9829
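/*
 * Reboot notifier: stop writes on every array so the superblocks are
 * clean before the machine restarts, and delay briefly when arrays are
 * present so the devices can settle.
 */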
9830static int md_notify_reboot(struct notifier_block *this,
9831			    unsigned long code, void *x)
9832{
9833	struct mddev *mddev, *n;
9834	int need_delay = 0;
9835
9836	spin_lock(&all_mddevs_lock);
9837	list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) {
9838		if (!mddev_get(mddev))
9839			continue;
9840		spin_unlock(&all_mddevs_lock);
9841		if (mddev_trylock(mddev)) {
9842			if (mddev->pers)
9843				__md_stop_writes(mddev);
9844			if (mddev->persistent)
9845				mddev->safemode = 2;
9846			mddev_unlock(mddev);
9847		}
9848		need_delay = 1;
9849		mddev_put(mddev);
9850		spin_lock(&all_mddevs_lock);
9851	}
9852	spin_unlock(&all_mddevs_lock);
9853
9854	/*
9855	 * Certain more exotic SCSI devices are known to be volatile
9856	 * with respect to too-early system reboots. While the right
9857	 * place to handle this issue is the individual driver, we do
9858	 * want to have a safe RAID driver ...
9859	 */
9860	if (need_delay)
9861		msleep(1000);
9862
9863	return NOTIFY_DONE;
9864}
9865
9866static struct notifier_block md_notifier = {
9867	.notifier_call	= md_notify_reboot,
9868	.next		= NULL,
9869	.priority	= INT_MAX, /* before any real devices */
9870};
9871
9872static void md_geninit(void)
9873{
9874	pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
9875
9876	proc_create("mdstat", S_IRUGO, NULL, &mdstat_proc_ops);
9877}
9878
9879static int __init md_init(void)
9880{
9881	int ret = -ENOMEM;
9882
9883	md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
9884	if (!md_wq)
9885		goto err_wq;
9886
9887	md_misc_wq = alloc_workqueue("md_misc", 0, 0);
9888	if (!md_misc_wq)
9889		goto err_misc_wq;
9890
9891	md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND,
9892				       0);
9893	if (!md_bitmap_wq)
9894		goto err_bitmap_wq;
9895
9896	ret = __register_blkdev(MD_MAJOR, "md", md_probe);
9897	if (ret < 0)
9898		goto err_md;
9899
9900	ret = __register_blkdev(0, "mdp", md_probe);
9901	if (ret < 0)
9902		goto err_mdp;
9903	mdp_major = ret;
9904
9905	register_reboot_notifier(&md_notifier);
9906	raid_table_header = register_sysctl("dev/raid", raid_table);
9907
9908	md_geninit();
9909	return 0;
9910
9911err_mdp:
9912	unregister_blkdev(MD_MAJOR, "md");
9913err_md:
9914	destroy_workqueue(md_bitmap_wq);
9915err_bitmap_wq:
9916	destroy_workqueue(md_misc_wq);
9917err_misc_wq:
9918	destroy_workqueue(md_wq);
9919err_wq:
9920	return ret;
9921}
9922
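/*
 * Called (via md_reload_sb()) after another cluster node has updated the
 * superblock: apply any size, device-role or reshape changes recorded in
 * @rdev's freshly re-read superblock to the local mddev.
 */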
9923static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
9924{
9925	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
9926	struct md_rdev *rdev2, *tmp;
9927	int role, ret;
9928
9929	/*
9930	 * If the size was changed on another node, then we need to
9931	 * resize here as well.
9932	 */
9933	if (mddev->dev_sectors != le64_to_cpu(sb->size)) {
9934		ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size));
9935		if (ret)
9936			pr_info("md-cluster: resize failed\n");
9937		else
9938			md_bitmap_update_sb(mddev->bitmap);
9939	}
9940
9941	/* Check for change of roles in the active devices */
9942	rdev_for_each_safe(rdev2, tmp, mddev) {
9943		if (test_bit(Faulty, &rdev2->flags))
9944			continue;
9945
9946		/* Check if the roles changed */
9947		role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);
9948
9949		if (test_bit(Candidate, &rdev2->flags)) {
9950			if (role == MD_DISK_ROLE_FAULTY) {
9951				pr_info("md: Removing Candidate device %pg because add failed\n",
9952					rdev2->bdev);
9953				md_kick_rdev_from_array(rdev2);
9954				continue;
9955			} else
9956				clear_bit(Candidate, &rdev2->flags);
9958		}
9959
9960		if (role != rdev2->raid_disk) {
9961			/*
9962			 * The device just got activated, unless a reshape is happening.
9963			 */
9964			if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE &&
9965			    !(le32_to_cpu(sb->feature_map) &
9966			      MD_FEATURE_RESHAPE_ACTIVE)) {
9967				rdev2->saved_raid_disk = role;
9968				ret = remove_and_add_spares(mddev, rdev2);
9969				pr_info("Activated spare: %pg\n",
9970					rdev2->bdev);
9971				/* Wake up mddev->thread here, so the array can
9972				 * perform resync with the newly activated disk */
9973				set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
9974				md_wakeup_thread(mddev->thread);
9975			}
9976			/* The device is faulty:
9977			 * we just want to do the minimum to mark the disk
9978			 * as faulty. The recovery is performed by the
9979			 * node that initiated the error.
9980			 */
9981			if (role == MD_DISK_ROLE_FAULTY ||
9982			    role == MD_DISK_ROLE_JOURNAL) {
9983				md_error(mddev, rdev2);
9984				clear_bit(Blocked, &rdev2->flags);
9985			}
9986		}
9987	}
9988
9989	if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) {
9990		ret = update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
9991		if (ret)
9992			pr_warn("md: updating array disks failed. %d\n", ret);
9993	}
9994
9995	/*
9996	 * Since mddev->delta_disks has already been updated in
9997	 * update_raid_disks, it is time to check for a reshape.
9998	 */
9999	if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
10000	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
10001		/*
10002		 * A reshape is happening on the remote node; we need to
10003		 * update reshape_position and call start_reshape.
10004		 */
10005		mddev->reshape_position = le64_to_cpu(sb->reshape_position);
10006		if (mddev->pers->update_reshape_pos)
10007			mddev->pers->update_reshape_pos(mddev);
10008		if (mddev->pers->start_reshape)
10009			mddev->pers->start_reshape(mddev);
10010	} else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
10011		   mddev->reshape_position != MaxSector &&
10012		   !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
10013		/* The reshape has just finished on another node. */
10014		mddev->reshape_position = MaxSector;
10015		if (mddev->pers->update_reshape_pos)
10016			mddev->pers->update_reshape_pos(mddev);
10017	}
10018
10019	/* Finally set the event to be up to date */
10020	mddev->events = le64_to_cpu(sb->events);
10021}
10022
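/*
 * Re-read the superblock of @rdev from disk, restoring the previous
 * in-memory copy if loading fails.
 */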
10023static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
10024{
10025	int err;
10026	struct page *swapout = rdev->sb_page;
10027	struct mdp_superblock_1 *sb;
10028
10029	/* Store the rdev's sb page in the swapout temporary
10030	 * variable so it can be restored if loading fails below.
10031	 */
10032	rdev->sb_page = NULL;
10033	err = alloc_disk_sb(rdev);
10034	if (err == 0) {
10035		ClearPageUptodate(rdev->sb_page);
10036		rdev->sb_loaded = 0;
10037		err = super_types[mddev->major_version].
10038			load_super(rdev, NULL, mddev->minor_version);
10039	}
10040	if (err < 0) {
10041		pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
10042				__func__, __LINE__, rdev->desc_nr, err);
10043		if (rdev->sb_page)
10044			put_page(rdev->sb_page);
10045		rdev->sb_page = swapout;
10046		rdev->sb_loaded = 1;
10047		return err;
10048	}
10049
10050	sb = page_address(rdev->sb_page);
10051	/* Only pick up recovery_offset when MD_FEATURE_RECOVERY_OFFSET
10052	 * is set in the superblock.
10053	 */
10054
10055	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
10056		rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
10057
10058	/* The other node finished recovery; call spare_active to set the
10059	 * device In_sync and update mddev->degraded.
10060	 */
10061	if (rdev->recovery_offset == MaxSector &&
10062	    !test_bit(In_sync, &rdev->flags) &&
10063	    mddev->pers->spare_active(mddev))
10064		sysfs_notify_dirent_safe(mddev->sysfs_degraded);
10065
10066	put_page(swapout);
10067	return 0;
10068}
10069
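/*
 * Re-read the superblock of the rdev with descriptor number @nr, typically
 * because another cluster node changed it, apply the resulting changes and
 * refresh recovery_offset on all non-faulty rdevs.
 */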
10070void md_reload_sb(struct mddev *mddev, int nr)
10071{
10072	struct md_rdev *rdev = NULL, *iter;
10073	int err;
10074
10075	/* Find the rdev */
10076	rdev_for_each_rcu(iter, mddev) {
10077		if (iter->desc_nr == nr) {
10078			rdev = iter;
10079			break;
10080		}
10081	}
10082
10083	if (!rdev) {
10084		pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
10085		return;
10086	}
10087
10088	err = read_rdev(mddev, rdev);
10089	if (err < 0)
10090		return;
10091
10092	check_sb_changes(mddev, rdev);
10093
10094	/* Read all rdevs to update recovery_offset */
10095	rdev_for_each_rcu(rdev, mddev) {
10096		if (!test_bit(Faulty, &rdev->flags))
10097			read_rdev(mddev, rdev);
10098	}
10099}
10100EXPORT_SYMBOL(md_reload_sb);
10101
10102#ifndef MODULE
10103
10104/*
10105 * Searches all registered partitions for autorun RAID arrays
10106 * at boot time.
10107 */
10108
10109static DEFINE_MUTEX(detected_devices_mutex);
10110static LIST_HEAD(all_detected_devices);
10111struct detected_devices_node {
10112	struct list_head list;
10113	dev_t dev;
10114};
10115
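/*
 * Remember a device detected at boot so md_autostart_arrays() can try to
 * assemble it later.
 */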
10116void md_autodetect_dev(dev_t dev)
10117{
10118	struct detected_devices_node *node_detected_dev;
10119
10120	node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
10121	if (node_detected_dev) {
10122		node_detected_dev->dev = dev;
10123		mutex_lock(&detected_devices_mutex);
10124		list_add_tail(&node_detected_dev->list, &all_detected_devices);
10125		mutex_unlock(&detected_devices_mutex);
10126	}
10127}
10128
10129void md_autostart_arrays(int part)
10130{
10131	struct md_rdev *rdev;
10132	struct detected_devices_node *node_detected_dev;
10133	dev_t dev;
10134	int i_scanned, i_passed;
10135
10136	i_scanned = 0;
10137	i_passed = 0;
10138
10139	pr_info("md: Autodetecting RAID arrays.\n");
10140
10141	mutex_lock(&detected_devices_mutex);
10142	while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
10143		i_scanned++;
10144		node_detected_dev = list_entry(all_detected_devices.next,
10145					struct detected_devices_node, list);
10146		list_del(&node_detected_dev->list);
10147		dev = node_detected_dev->dev;
10148		kfree(node_detected_dev);
10149		mutex_unlock(&detected_devices_mutex);
10150		rdev = md_import_device(dev, 0, 90);
10151		mutex_lock(&detected_devices_mutex);
10152		if (IS_ERR(rdev))
10153			continue;
10154
10155		if (test_bit(Faulty, &rdev->flags))
10156			continue;
10157
10158		set_bit(AutoDetected, &rdev->flags);
10159		list_add(&rdev->same_set, &pending_raid_disks);
10160		i_passed++;
10161	}
10162	mutex_unlock(&detected_devices_mutex);
10163
10164	pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed);
10165
10166	autorun_devices(part);
10167}
10168
10169#endif /* !MODULE */
10170
10171static __exit void md_exit(void)
10172{
10173	struct mddev *mddev, *n;
10174	int delay = 1;
10175
10176	unregister_blkdev(MD_MAJOR, "md");
10177	unregister_blkdev(mdp_major, "mdp");
10178	unregister_reboot_notifier(&md_notifier);
10179	unregister_sysctl_table(raid_table_header);
10180
10181	/* We cannot unload the modules while some process is
10182	 * waiting for us in select() or poll(); wake them up.
10183	 */
10184	md_unloading = 1;
10185	while (waitqueue_active(&md_event_waiters)) {
10186		/* not safe to leave yet */
10187		wake_up(&md_event_waiters);
10188		msleep(delay);
10189		delay += delay;
10190	}
10191	remove_proc_entry("mdstat", NULL);
10192
10193	spin_lock(&all_mddevs_lock);
10194	list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) {
10195		if (!mddev_get(mddev))
10196			continue;
10197		spin_unlock(&all_mddevs_lock);
10198		export_array(mddev);
10199		mddev->ctime = 0;
10200		mddev->hold_active = 0;
10201		/*
10202		 * As the mddev is now fully clear, mddev_put will schedule
10203		 * the mddev for destruction by a workqueue, and the
10204		 * destroy_workqueue() below will wait for that to complete.
10205		 */
10206		mddev_put(mddev);
10207		spin_lock(&all_mddevs_lock);
10208	}
10209	spin_unlock(&all_mddevs_lock);
10210
10211	destroy_workqueue(md_misc_wq);
10212	destroy_workqueue(md_bitmap_wq);
10213	destroy_workqueue(md_wq);
10214}
10215
10216subsys_initcall(md_init);
10217module_exit(md_exit)
10218
10219static int get_ro(char *buffer, const struct kernel_param *kp)
10220{
10221	return sprintf(buffer, "%d\n", start_readonly);
10222}
10223static int set_ro(const char *val, const struct kernel_param *kp)
10224{
10225	return kstrtouint(val, 10, (unsigned int *)&start_readonly);
10226}
10227
10228module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
10229module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
10230module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
10231module_param(create_on_open, bool, S_IRUSR|S_IWUSR);
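
/*
 * The parameters above can also be changed at run time through sysfs,
 * e.g. (path assumed, with the module usually named md_mod):
 *   echo 1 > /sys/module/md_mod/parameters/start_ro
 */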
10232
10233MODULE_LICENSE("GPL");
10234MODULE_DESCRIPTION("MD RAID framework");
10235MODULE_ALIAS("md");
10236MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
10237