1/*
2 * raid1.c : Multiple Devices driver for Linux
3 *
4 * Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
5 *
6 * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
7 *
8 * RAID-1 management functions.
9 *
10 * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
11 *
 * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
13 * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
14 *
15 * This program is free software; you can redistribute it and/or modify
16 * it under the terms of the GNU General Public License as published by
17 * the Free Software Foundation; either version 2, or (at your option)
18 * any later version.
19 *
20 * You should have received a copy of the GNU General Public License
21 * (for example /usr/src/linux/COPYING); if not, write to the Free
22 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 */
24
25#include <linux/module.h>
26#include <linux/config.h>
27#include <linux/slab.h>
28#include <linux/raid/raid1.h>
29#include <asm/atomic.h>
30
31#define MAJOR_NR MD_MAJOR
32#define MD_DRIVER
33#define MD_PERSONALITY
34
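/*
 * Per-mirror limit (in sectors) used by the read balancer: once this
 * many sectors have been read without switching mirrors, another
 * operational mirror is given a turn (see raid1_read_balance()).
 */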
35#define MAX_WORK_PER_DISK 128
36
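/*
 * Number of raid1_bh structures (and, times raid_disks, buffer_heads)
 * pre-allocated per array in raid1_run(), so that at least this many
 * requests can make progress even when kmalloc() starts failing.
 */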
37#define	NR_RESERVED_BUFS	32
38
39
40/*
41 * The following can be used to debug the driver
42 */
43#define RAID1_DEBUG	0
44
45#if RAID1_DEBUG
46#define PRINTK(x...)   printk(x)
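/*
 * Defining 'inline' away here presumably keeps the small helpers
 * out of line while debugging, so they are easier to breakpoint.
 */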
47#define inline
48#define __inline__
49#else
50#define PRINTK(x...)  do { } while (0)
51#endif
52
53
54static mdk_personality_t raid1_personality;
55static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED;
56struct raid1_bh *raid1_retry_list = NULL, **raid1_retry_tail;
57
58static struct buffer_head *raid1_alloc_bh(raid1_conf_t *conf, int cnt)
59{
60	/* return a linked list of "cnt" struct buffer_heads.
61	 * don't take any off the free list unless we know we can
62	 * get all we need, otherwise we could deadlock
63	 */
64	struct buffer_head *bh=NULL;
65
66	while(cnt) {
67		struct buffer_head *t;
68		md_spin_lock_irq(&conf->device_lock);
69		if (!conf->freebh_blocked && conf->freebh_cnt >= cnt)
70			while (cnt) {
71				t = conf->freebh;
72				conf->freebh = t->b_next;
73				t->b_next = bh;
74				bh = t;
75				t->b_state = 0;
76				conf->freebh_cnt--;
77				cnt--;
78			}
79		md_spin_unlock_irq(&conf->device_lock);
80		if (cnt == 0)
81			break;
82		t = kmem_cache_alloc(bh_cachep, SLAB_NOIO);
83		if (t) {
84			t->b_next = bh;
85			bh = t;
86			cnt--;
87		} else {
88			PRINTK("raid1: waiting for %d bh\n", cnt);
89			conf->freebh_blocked = 1;
90			wait_disk_event(conf->wait_buffer,
91					!conf->freebh_blocked ||
92					conf->freebh_cnt > conf->raid_disks * NR_RESERVED_BUFS/2);
93			conf->freebh_blocked = 0;
94		}
95	}
96	return bh;
97}
98
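/*
 * Give back a chain of buffer_heads.  Members of the pre-allocated
 * pool are recognised by b_pprev, which raid1_grow_bh() points at
 * conf->freebh; they go back on the free list.  Anything else was
 * allocated on the side by raid1_alloc_bh() and is returned to the
 * slab cache.
 */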
99static inline void raid1_free_bh(raid1_conf_t *conf, struct buffer_head *bh)
100{
101	unsigned long flags;
102	spin_lock_irqsave(&conf->device_lock, flags);
103	while (bh) {
104		struct buffer_head *t = bh;
105		bh=bh->b_next;
106		if (t->b_pprev == NULL)
107			kmem_cache_free(bh_cachep, t);
108		else {
109			t->b_next= conf->freebh;
110			conf->freebh = t;
111			conf->freebh_cnt++;
112		}
113	}
114	spin_unlock_irqrestore(&conf->device_lock, flags);
115	wake_up(&conf->wait_buffer);
116}
117
118static int raid1_grow_bh(raid1_conf_t *conf, int cnt)
119{
120	/* allocate cnt buffer_heads, possibly less if kmalloc fails */
121	int i = 0;
122
123	while (i < cnt) {
124		struct buffer_head *bh;
125		bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL);
126		if (!bh) break;
127
128		md_spin_lock_irq(&conf->device_lock);
129		bh->b_pprev = &conf->freebh;
130		bh->b_next = conf->freebh;
131		conf->freebh = bh;
132		conf->freebh_cnt++;
133		md_spin_unlock_irq(&conf->device_lock);
134
135		i++;
136	}
137	return i;
138}
139
140static void raid1_shrink_bh(raid1_conf_t *conf)
141{
142	/* discard all buffer_heads */
143
144	md_spin_lock_irq(&conf->device_lock);
145	while (conf->freebh) {
146		struct buffer_head *bh = conf->freebh;
147		conf->freebh = bh->b_next;
148		kmem_cache_free(bh_cachep, bh);
149		conf->freebh_cnt--;
150	}
151	md_spin_unlock_irq(&conf->device_lock);
152}
153
154
155static struct raid1_bh *raid1_alloc_r1bh(raid1_conf_t *conf)
156{
157	struct raid1_bh *r1_bh = NULL;
158
159	do {
160		md_spin_lock_irq(&conf->device_lock);
161		if (!conf->freer1_blocked && conf->freer1) {
162			r1_bh = conf->freer1;
163			conf->freer1 = r1_bh->next_r1;
164			conf->freer1_cnt--;
165			r1_bh->next_r1 = NULL;
166			r1_bh->state = (1 << R1BH_PreAlloc);
167			r1_bh->bh_req.b_state = 0;
168		}
169		md_spin_unlock_irq(&conf->device_lock);
170		if (r1_bh)
171			return r1_bh;
172		r1_bh = (struct raid1_bh *) kmalloc(sizeof(struct raid1_bh), GFP_NOIO);
173		if (r1_bh) {
174			memset(r1_bh, 0, sizeof(*r1_bh));
175			return r1_bh;
176		}
177		conf->freer1_blocked = 1;
178		wait_disk_event(conf->wait_buffer,
179				!conf->freer1_blocked ||
180				conf->freer1_cnt > NR_RESERVED_BUFS/2
181			);
182		conf->freer1_blocked = 0;
183	} while (1);
184}
185
186static inline void raid1_free_r1bh(struct raid1_bh *r1_bh)
187{
188	struct buffer_head *bh = r1_bh->mirror_bh_list;
189	raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
190
191	r1_bh->mirror_bh_list = NULL;
192
193	if (test_bit(R1BH_PreAlloc, &r1_bh->state)) {
194		unsigned long flags;
195		spin_lock_irqsave(&conf->device_lock, flags);
196		r1_bh->next_r1 = conf->freer1;
197		conf->freer1 = r1_bh;
198		conf->freer1_cnt++;
199		spin_unlock_irqrestore(&conf->device_lock, flags);
200		/* don't need to wakeup wait_buffer because
201		 *  raid1_free_bh below will do that
202		 */
203	} else {
204		kfree(r1_bh);
205	}
206	raid1_free_bh(conf, bh);
207}
208
209static int raid1_grow_r1bh (raid1_conf_t *conf, int cnt)
210{
211	int i = 0;
212
213	while (i < cnt) {
214		struct raid1_bh *r1_bh;
215		r1_bh = (struct raid1_bh*)kmalloc(sizeof(*r1_bh), GFP_KERNEL);
216		if (!r1_bh)
217			break;
218		memset(r1_bh, 0, sizeof(*r1_bh));
219		set_bit(R1BH_PreAlloc, &r1_bh->state);
220		r1_bh->mddev = conf->mddev;
221
222		raid1_free_r1bh(r1_bh);
223		i++;
224	}
225	return i;
226}
227
228static void raid1_shrink_r1bh(raid1_conf_t *conf)
229{
230	md_spin_lock_irq(&conf->device_lock);
231	while (conf->freer1) {
232		struct raid1_bh *r1_bh = conf->freer1;
233		conf->freer1 = r1_bh->next_r1;
234		conf->freer1_cnt--;
235		kfree(r1_bh);
236	}
237	md_spin_unlock_irq(&conf->device_lock);
238}
239
240
241
242static inline void raid1_free_buf(struct raid1_bh *r1_bh)
243{
244	unsigned long flags;
245	struct buffer_head *bh = r1_bh->mirror_bh_list;
246	raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
247	r1_bh->mirror_bh_list = NULL;
248
249	spin_lock_irqsave(&conf->device_lock, flags);
250	r1_bh->next_r1 = conf->freebuf;
251	conf->freebuf = r1_bh;
252	spin_unlock_irqrestore(&conf->device_lock, flags);
253	raid1_free_bh(conf, bh);
254}
255
256static struct raid1_bh *raid1_alloc_buf(raid1_conf_t *conf)
257{
258	struct raid1_bh *r1_bh;
259
260	md_spin_lock_irq(&conf->device_lock);
261	wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->device_lock);
262	r1_bh = conf->freebuf;
263	conf->freebuf = r1_bh->next_r1;
264	r1_bh->next_r1= NULL;
265	md_spin_unlock_irq(&conf->device_lock);
266
267	return r1_bh;
268}
269
270static int raid1_grow_buffers (raid1_conf_t *conf, int cnt)
271{
272	int i = 0;
273	struct raid1_bh *head = NULL, **tail;
274	tail = &head;
275
276	while (i < cnt) {
277		struct raid1_bh *r1_bh;
278		struct page *page;
279
280		page = alloc_page(GFP_KERNEL);
281		if (!page)
282			break;
283
284		r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_KERNEL);
285		if (!r1_bh) {
286			__free_page(page);
287			break;
288		}
289		memset(r1_bh, 0, sizeof(*r1_bh));
290		r1_bh->bh_req.b_page = page;
291		r1_bh->bh_req.b_data = page_address(page);
292		*tail = r1_bh;
293		r1_bh->next_r1 = NULL;
294		tail = & r1_bh->next_r1;
295		i++;
296	}
297	/* this lock probably isn't needed, as at the time when
298	 * we are allocating buffers, nobody else will be touching the
299	 * freebuf list.  But it doesn't hurt....
300	 */
301	md_spin_lock_irq(&conf->device_lock);
302	*tail = conf->freebuf;
303	conf->freebuf = head;
304	md_spin_unlock_irq(&conf->device_lock);
305	return i;
306}
307
308static void raid1_shrink_buffers (raid1_conf_t *conf)
309{
310	struct raid1_bh *head;
311	md_spin_lock_irq(&conf->device_lock);
312	head = conf->freebuf;
313	conf->freebuf = NULL;
314	md_spin_unlock_irq(&conf->device_lock);
315
316	while (head) {
317		struct raid1_bh *r1_bh = head;
318		head = r1_bh->next_r1;
319		__free_page(r1_bh->bh_req.b_page);
320		kfree(r1_bh);
321	}
322}
323
324static int raid1_map (mddev_t *mddev, kdev_t *rdev)
325{
326	raid1_conf_t *conf = mddev_to_conf(mddev);
327	int i, disks = MD_SB_DISKS;
328
329	/*
	 * Later we do read balancing on the read side;
	 * for now we use the first available disk.
332	 */
333
334	for (i = 0; i < disks; i++) {
335		if (conf->mirrors[i].operational) {
336			*rdev = conf->mirrors[i].dev;
337			return (0);
338		}
339	}
340
341	printk (KERN_ERR "raid1_map(): huh, no more operational devices?\n");
342	return (-1);
343}
344
345static void raid1_reschedule_retry (struct raid1_bh *r1_bh)
346{
347	unsigned long flags;
348	mddev_t *mddev = r1_bh->mddev;
349	raid1_conf_t *conf = mddev_to_conf(mddev);
350
351	md_spin_lock_irqsave(&retry_list_lock, flags);
352	if (raid1_retry_list == NULL)
353		raid1_retry_tail = &raid1_retry_list;
354	*raid1_retry_tail = r1_bh;
355	raid1_retry_tail = &r1_bh->next_r1;
356	r1_bh->next_r1 = NULL;
357	md_spin_unlock_irqrestore(&retry_list_lock, flags);
358	md_wakeup_thread(conf->thread);
359}
360
361
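/*
 * A normal (non-sync) request has completed: drop its count from the
 * segment it now falls in -- DONE, FUTURE (same phase as when it was
 * issued), or otherwise PENDING -- and wake up wait_ready once the
 * last pending request has drained.
 */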
362static void inline io_request_done(unsigned long sector, raid1_conf_t *conf, int phase)
363{
364	unsigned long flags;
365	spin_lock_irqsave(&conf->segment_lock, flags);
366	if (sector < conf->start_active)
367		conf->cnt_done--;
368	else if (sector >= conf->start_future && conf->phase == phase)
369		conf->cnt_future--;
370	else if (!--conf->cnt_pending)
371		wake_up(&conf->wait_ready);
372
373	spin_unlock_irqrestore(&conf->segment_lock, flags);
374}
375
376static void inline sync_request_done (unsigned long sector, raid1_conf_t *conf)
377{
378	unsigned long flags;
379	spin_lock_irqsave(&conf->segment_lock, flags);
380	if (sector >= conf->start_ready)
381		--conf->cnt_ready;
382	else if (sector >= conf->start_active) {
383		if (!--conf->cnt_active) {
384			conf->start_active = conf->start_ready;
385			wake_up(&conf->wait_done);
386		}
387	}
388	spin_unlock_irqrestore(&conf->segment_lock, flags);
389}
390
391/*
392 * raid1_end_bh_io() is called when we have finished servicing a mirrored
393 * operation and are ready to return a success/failure code to the buffer
394 * cache layer.
395 */
396static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate)
397{
398	struct buffer_head *bh = r1_bh->master_bh;
399
400	io_request_done(bh->b_rsector, mddev_to_conf(r1_bh->mddev),
401			test_bit(R1BH_SyncPhase, &r1_bh->state));
402
403	bh->b_end_io(bh, uptodate);
404	raid1_free_r1bh(r1_bh);
}

void raid1_end_request (struct buffer_head *bh, int uptodate)
407{
408	struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
409
410	/*
411	 * this branch is our 'one mirror IO has finished' event handler:
412	 */
413	if (!uptodate)
414		md_error (r1_bh->mddev, bh->b_dev);
415	else
416		/*
417		 * Set R1BH_Uptodate in our master buffer_head, so that
		 * we will return a good error code to the higher
419		 * levels even if IO on some other mirrored buffer fails.
420		 *
421		 * The 'master' represents the complex operation to
422		 * user-side. So if something waits for IO, then it will
423		 * wait for the 'master' buffer_head.
424		 */
425		set_bit (R1BH_Uptodate, &r1_bh->state);
426
427	/*
428	 * We split up the read and write side, imho they are
429	 * conceptually different.
430	 */
431
432	if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) {
433		/*
434		 * we have only one buffer_head on the read side
435		 */
436
437		if (uptodate) {
438			raid1_end_bh_io(r1_bh, uptodate);
439			return;
440		}
441		/*
442		 * oops, read error:
443		 */
444		printk(KERN_ERR "raid1: %s: rescheduling block %lu\n",
445			 partition_name(bh->b_dev), bh->b_blocknr);
446		raid1_reschedule_retry(r1_bh);
447		return;
448	}
449
450	/*
451	 * WRITE:
452	 *
453	 * Let's see if all mirrored write operations have finished
454	 * already.
455	 */
456
457	if (atomic_dec_and_test(&r1_bh->remaining))
458		raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state));
459}
460
461/*
462 * This routine returns the disk from which the requested read should
 * be done. It keeps track of the last read position for every disk
 * in the array, and when a new read request comes in, the disk whose
 * last position is nearest to the request is chosen.
 *
 * TODO: if two mirrors share the same two devices, performance
 * degrades dramatically because the position is tracked per mirror,
 * not per device.  This should be changed to be device based.
 * Atomic sequential reads should also be balanced somehow.
471 */
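/*
 * Balancing example (illustrative only): with three operational
 * mirrors whose head_positions are 100, 5000 and 10000, last_used == 0
 * and sect_count still below sect_limit, an 8-sector read at sector
 * 5008 is not sequential for mirror 0, so the search loop below
 * compares the distances 4992 (mirror 2) and 8 (mirror 1) against
 * 4908 (mirror 0) and picks mirror 1; head_position[1] then becomes
 * 5016 and last_used becomes 1.
 */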
472
473static int raid1_read_balance (raid1_conf_t *conf, struct buffer_head *bh)
474{
475	int new_disk = conf->last_used;
476	const int sectors = bh->b_size >> 9;
477	const unsigned long this_sector = bh->b_rsector;
478	int disk = new_disk;
479	unsigned long new_distance;
480	unsigned long current_distance;
481
482	/*
483	 * Check if it is sane at all to balance
484	 */
485
486	if (conf->resync_mirrors)
487		goto rb_out;
488
489
490	/* make sure that disk is operational */
491	while( !conf->mirrors[new_disk].operational) {
492		if (new_disk <= 0) new_disk = conf->raid_disks;
493		new_disk--;
494		if (new_disk == disk) {
495			/*
			 * This means no working disk was found.
			 * Nothing much to do, let's not change anything
			 * and hope for the best...
499			 */
500
501			new_disk = conf->last_used;
502
503			goto rb_out;
504		}
505	}
506	disk = new_disk;
507	/* now disk == new_disk == starting point for search */
508
509	/*
510	 * Don't touch anything for sequential reads.
511	 */
512
513	if (this_sector == conf->mirrors[new_disk].head_position)
514		goto rb_out;
515
516	/*
	 * If reads have been done only on a single disk
	 * for a while, let's give another disk a chance.
	 * This kicks the idling disks so that they can
	 * find work near some hotspot.
521	 */
522
523	if (conf->sect_count >= conf->mirrors[new_disk].sect_limit) {
524		conf->sect_count = 0;
525
526#if defined(CONFIG_SPARC64) && (__GNUC__ == 2) && (__GNUC_MINOR__ == 92)
527		/* Work around a compiler bug in egcs-2.92.11 19980921 */
528		new_disk = *(volatile int *)&new_disk;
529#endif
530		do {
531			if (new_disk<=0)
532				new_disk = conf->raid_disks;
533			new_disk--;
534			if (new_disk == disk)
535				break;
536		} while ((conf->mirrors[new_disk].write_only) ||
537			 (!conf->mirrors[new_disk].operational));
538
539		goto rb_out;
540	}
541
542	current_distance = abs(this_sector -
543				conf->mirrors[disk].head_position);
544
545	/* Find the disk which is closest */
546
547	do {
548		if (disk <= 0)
549			disk = conf->raid_disks;
550		disk--;
551
552		if ((conf->mirrors[disk].write_only) ||
553				(!conf->mirrors[disk].operational))
554			continue;
555
556		new_distance = abs(this_sector -
557					conf->mirrors[disk].head_position);
558
559		if (new_distance < current_distance) {
560			conf->sect_count = 0;
561			current_distance = new_distance;
562			new_disk = disk;
563		}
564	} while (disk != conf->last_used);
565
566rb_out:
567	conf->mirrors[new_disk].head_position = this_sector + sectors;
568
569	conf->last_used = new_disk;
570	conf->sect_count += sectors;
571
572	return new_disk;
573}
574
575static int raid1_make_request (mddev_t *mddev, int rw,
576			       struct buffer_head * bh)
577{
578	raid1_conf_t *conf = mddev_to_conf(mddev);
579	struct buffer_head *bh_req, *bhl;
580	struct raid1_bh * r1_bh;
581	int disks = MD_SB_DISKS;
582	int i, sum_bhs = 0;
583	struct mirror_info *mirror;
584
585	if (!buffer_locked(bh))
586		BUG();
587
588/*
589 * make_request() can abort the operation when READA is being
590 * used and no empty request is available.
591 *
592 * Currently, just replace the command with READ/WRITE.
593 */
594	if (rw == READA)
595		rw = READ;
596
597	r1_bh = raid1_alloc_r1bh (conf);
598
599	spin_lock_irq(&conf->segment_lock);
600	wait_event_lock_irq(conf->wait_done,
601			bh->b_rsector < conf->start_active ||
602			bh->b_rsector >= conf->start_future,
603			conf->segment_lock);
604	if (bh->b_rsector < conf->start_active)
605		conf->cnt_done++;
606	else {
607		conf->cnt_future++;
608		if (conf->phase)
609			set_bit(R1BH_SyncPhase, &r1_bh->state);
610	}
611	spin_unlock_irq(&conf->segment_lock);
612
613	/*
	 * I think the read and write branches should be separated completely,
615	 * since we want to do read balancing on the read side for example.
616	 * Alternative implementations? :) --mingo
617	 */
618
619	r1_bh->master_bh = bh;
620	r1_bh->mddev = mddev;
621	r1_bh->cmd = rw;
622
623	if (rw == READ) {
624		/*
625		 * read balancing logic:
626		 */
627		mirror = conf->mirrors + raid1_read_balance(conf, bh);
628
629		bh_req = &r1_bh->bh_req;
630		memcpy(bh_req, bh, sizeof(*bh));
631		bh_req->b_blocknr = bh->b_rsector;
632		bh_req->b_dev = mirror->dev;
633		bh_req->b_rdev = mirror->dev;
634	/*	bh_req->b_rsector = bh->n_rsector; */
635		bh_req->b_end_io = raid1_end_request;
636		bh_req->b_private = r1_bh;
637		generic_make_request (rw, bh_req);
638		return 0;
639	}
640
641	/*
642	 * WRITE:
643	 */
644
645	bhl = raid1_alloc_bh(conf, conf->raid_disks);
646	for (i = 0; i < disks; i++) {
647		struct buffer_head *mbh;
648		if (!conf->mirrors[i].operational)
649			continue;
650
651	/*
	 * We should use a private pool (size depending on NR_REQUEST)
	 * to avoid writes filling up memory with bhs.
	 *
	 * Such pools are much faster than kmalloc anyway (so we waste
	 * almost nothing by not using the master bh when writing and
	 * win a lot of cleanness), but for now we are cool enough. --mingo
658 	 *
659	 * It's safe to sleep here, buffer heads cannot be used in a shared
660 	 * manner in the write branch. Look how we lock the buffer at the
661 	 * beginning of this function to grok the difference ;)
662	 */
663 		mbh = bhl;
664		if (mbh == NULL) {
665			MD_BUG();
666			break;
667		}
668		bhl = mbh->b_next;
669		mbh->b_next = NULL;
670		mbh->b_this_page = (struct buffer_head *)1;
671
672 	/*
673 	 * prepare mirrored mbh (fields ordered for max mem throughput):
674 	 */
675		mbh->b_blocknr    = bh->b_rsector;
676		mbh->b_dev        = conf->mirrors[i].dev;
677		mbh->b_rdev	  = conf->mirrors[i].dev;
678		mbh->b_rsector	  = bh->b_rsector;
679		mbh->b_state      = (1<<BH_Req) | (1<<BH_Dirty) |
680						(1<<BH_Mapped) | (1<<BH_Lock);
681
682		atomic_set(&mbh->b_count, 1);
683 		mbh->b_size       = bh->b_size;
684 		mbh->b_page	  = bh->b_page;
685 		mbh->b_data	  = bh->b_data;
686 		mbh->b_list       = BUF_LOCKED;
687 		mbh->b_end_io     = raid1_end_request;
688 		mbh->b_private    = r1_bh;
689
690		mbh->b_next = r1_bh->mirror_bh_list;
691		r1_bh->mirror_bh_list = mbh;
692		sum_bhs++;
693	}
694	if (bhl) raid1_free_bh(conf,bhl);
695	if (!sum_bhs) {
696		/* Gag - all mirrors non-operational.. */
697		raid1_end_bh_io(r1_bh, 0);
698		return 0;
699	}
700	md_atomic_set(&r1_bh->remaining, sum_bhs);
701
702	/*
	 * We have to be a bit careful about the semaphore above, that's
	 * why we start the requests separately. Since kmalloc() could
	 * fail and sleep, and make_request() can sleep too, this is the
	 * safer solution. Imagine end_request decreasing the semaphore
	 * before we could have set it up ... We could play tricks with
	 * the semaphore (presetting it and correcting at the end if
	 * sum_bhs is not 'n'), but we would have to do end_request by
	 * hand if all requests finished before we had a chance to set
	 * up the semaphore correctly ... lots of races.
712	 */
713	bh = r1_bh->mirror_bh_list;
714	while(bh) {
715		struct buffer_head *bh2 = bh;
716		bh = bh->b_next;
717		generic_make_request(rw, bh2);
718	}
719	return (0);
720}
721
722static int raid1_status (char *page, mddev_t *mddev)
723{
724	raid1_conf_t *conf = mddev_to_conf(mddev);
725	int sz = 0, i;
726
727	sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks,
728						 conf->working_disks);
729	for (i = 0; i < conf->raid_disks; i++)
730		sz += sprintf (page+sz, "%s",
731			conf->mirrors[i].operational ? "U" : "_");
732	sz += sprintf (page+sz, "]");
733	return sz;
734}
735
736#define LAST_DISK KERN_ALERT \
737"raid1: only one disk left and IO error.\n"
738
739#define NO_SPARE_DISK KERN_ALERT \
740"raid1: no spare disk left, degrading mirror level by one.\n"
741
742#define DISK_FAILED KERN_ALERT \
743"raid1: Disk failure on %s, disabling device. \n" \
744"	Operation continuing on %d devices\n"
745
746#define START_SYNCING KERN_ALERT \
747"raid1: start syncing spare disk.\n"
748
749#define ALREADY_SYNCING KERN_INFO \
750"raid1: syncing already in progress.\n"
751
752static void mark_disk_bad (mddev_t *mddev, int failed)
753{
754	raid1_conf_t *conf = mddev_to_conf(mddev);
755	struct mirror_info *mirror = conf->mirrors+failed;
756	mdp_super_t *sb = mddev->sb;
757
758	mirror->operational = 0;
759	mark_disk_faulty(sb->disks+mirror->number);
760	mark_disk_nonsync(sb->disks+mirror->number);
761	mark_disk_inactive(sb->disks+mirror->number);
762	if (!mirror->write_only)
763		sb->active_disks--;
764	sb->working_disks--;
765	sb->failed_disks++;
766	mddev->sb_dirty = 1;
767	md_wakeup_thread(conf->thread);
768	if (!mirror->write_only)
769		conf->working_disks--;
770	printk (DISK_FAILED, partition_name (mirror->dev),
771				 conf->working_disks);
772}
773
774static int raid1_error (mddev_t *mddev, kdev_t dev)
775{
776	raid1_conf_t *conf = mddev_to_conf(mddev);
777	struct mirror_info * mirrors = conf->mirrors;
778	int disks = MD_SB_DISKS;
779	int i;
780
781	/* Find the drive.
782	 * If it is not operational, then we have already marked it as dead
	 * else if it is the last working disk, ignore the error and let the
784	 * next level up know.
785	 * else mark the drive as failed
786	 */
787
788	for (i = 0; i < disks; i++)
789		if (mirrors[i].dev==dev && mirrors[i].operational)
790			break;
791	if (i == disks)
792		return 0;
793
794	if (i < conf->raid_disks && conf->working_disks == 1) {
795		/* Don't fail the drive, act as though we were just a
796		 * normal single drive
797		 */
798
799		return 1;
800	}
801	mark_disk_bad(mddev, i);
802	return 0;
803}
804
805#undef LAST_DISK
806#undef NO_SPARE_DISK
807#undef DISK_FAILED
808#undef START_SYNCING
809
810
811static void print_raid1_conf (raid1_conf_t *conf)
812{
813	int i;
814	struct mirror_info *tmp;
815
816	printk("RAID1 conf printout:\n");
817	if (!conf) {
818		printk("(conf==NULL)\n");
819		return;
820	}
821	printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks,
822			 conf->raid_disks, conf->nr_disks);
823
824	for (i = 0; i < MD_SB_DISKS; i++) {
825		tmp = conf->mirrors + i;
826		printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
827			i, tmp->spare,tmp->operational,
828			tmp->number,tmp->raid_disk,tmp->used_slot,
829			partition_name(tmp->dev));
830	}
831}
832
833static void close_sync(raid1_conf_t *conf)
834{
835	mddev_t *mddev = conf->mddev;
836	/* If reconstruction was interrupted, we need to close the "active" and "pending"
837	 * holes.
	 * We know that there are no active rebuild requests, so cnt_active == cnt_ready == 0.
839	 */
840	/* this is really needed when recovery stops too... */
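	/*
	 * Two passes: first drain the requests already counted as
	 * pending, then re-label the FUTURE requests as pending (with
	 * the phase flipped) and drain those too, before collapsing all
	 * the window boundaries back to zero.
	 */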
841	spin_lock_irq(&conf->segment_lock);
842	conf->start_active = conf->start_pending;
843	conf->start_ready = conf->start_pending;
844	wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
845	conf->start_active =conf->start_ready = conf->start_pending = conf->start_future;
846	conf->start_future = (mddev->sb->size<<1)+1;
847	conf->cnt_pending = conf->cnt_future;
848	conf->cnt_future = 0;
849	conf->phase = conf->phase ^1;
850	wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
851	conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0;
852	conf->phase = 0;
	conf->cnt_future = conf->cnt_done;
854	conf->cnt_done = 0;
855	spin_unlock_irq(&conf->segment_lock);
856	wake_up(&conf->wait_done);
857}
858
859static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
860{
861	int err = 0;
862	int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
863	raid1_conf_t *conf = mddev->private;
864	struct mirror_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
865	mdp_super_t *sb = mddev->sb;
866	mdp_disk_t *failed_desc, *spare_desc, *added_desc;
867	mdk_rdev_t *spare_rdev, *failed_rdev;
868
869	print_raid1_conf(conf);
870
871	switch (state) {
872	case DISKOP_SPARE_ACTIVE:
873	case DISKOP_SPARE_INACTIVE:
874		/* need to wait for pending sync io before locking device */
875		close_sync(conf);
876	}
877
878	md_spin_lock_irq(&conf->device_lock);
879	/*
880	 * find the disk ...
881	 */
882	switch (state) {
883
884	case DISKOP_SPARE_ACTIVE:
885
886		/*
887		 * Find the failed disk within the RAID1 configuration ...
888		 * (this can only be in the first conf->working_disks part)
889		 */
890		for (i = 0; i < conf->raid_disks; i++) {
891			tmp = conf->mirrors + i;
892			if ((!tmp->operational && !tmp->spare) ||
893					!tmp->used_slot) {
894				failed_disk = i;
895				break;
896			}
897		}
898		/*
899		 * When we activate a spare disk we _must_ have a disk in
900		 * the lower (active) part of the array to replace.
901		 */
902		if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
903			MD_BUG();
904			err = 1;
905			goto abort;
906		}
907		/* fall through */
908
909	case DISKOP_SPARE_WRITE:
910	case DISKOP_SPARE_INACTIVE:
911
912		/*
913		 * Find the spare disk ... (can only be in the 'high'
914		 * area of the array)
915		 */
916		for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
917			tmp = conf->mirrors + i;
918			if (tmp->spare && tmp->number == (*d)->number) {
919				spare_disk = i;
920				break;
921			}
922		}
923		if (spare_disk == -1) {
924			MD_BUG();
925			err = 1;
926			goto abort;
927		}
928		break;
929
930	case DISKOP_HOT_REMOVE_DISK:
931
932		for (i = 0; i < MD_SB_DISKS; i++) {
933			tmp = conf->mirrors + i;
934			if (tmp->used_slot && (tmp->number == (*d)->number)) {
935				if (tmp->operational) {
936					err = -EBUSY;
937					goto abort;
938				}
939				removed_disk = i;
940				break;
941			}
942		}
943		if (removed_disk == -1) {
944			MD_BUG();
945			err = 1;
946			goto abort;
947		}
948		break;
949
950	case DISKOP_HOT_ADD_DISK:
951
952		for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
953			tmp = conf->mirrors + i;
954			if (!tmp->used_slot) {
955				added_disk = i;
956				break;
957			}
958		}
959		if (added_disk == -1) {
960			MD_BUG();
961			err = 1;
962			goto abort;
963		}
964		break;
965	}
966
967	switch (state) {
968	/*
969	 * Switch the spare disk to write-only mode:
970	 */
971	case DISKOP_SPARE_WRITE:
972		sdisk = conf->mirrors + spare_disk;
973		sdisk->operational = 1;
974		sdisk->write_only = 1;
975		break;
976	/*
977	 * Deactivate a spare disk:
978	 */
979	case DISKOP_SPARE_INACTIVE:
980		if (conf->start_future > 0) {
981			MD_BUG();
982			err = -EBUSY;
983			break;
984		}
985		sdisk = conf->mirrors + spare_disk;
986		sdisk->operational = 0;
987		sdisk->write_only = 0;
988		break;
989	/*
990	 * Activate (mark read-write) the (now sync) spare disk,
	 * which means we switch its 'raid position' (->raid_disk)
992	 * with the failed disk. (only the first 'conf->nr_disks'
993	 * slots are used for 'real' disks and we must preserve this
994	 * property)
995	 */
996	case DISKOP_SPARE_ACTIVE:
997		if (conf->start_future > 0) {
998			MD_BUG();
999			err = -EBUSY;
1000			break;
1001		}
1002		sdisk = conf->mirrors + spare_disk;
1003		fdisk = conf->mirrors + failed_disk;
1004
1005		spare_desc = &sb->disks[sdisk->number];
1006		failed_desc = &sb->disks[fdisk->number];
1007
1008		if (spare_desc != *d) {
1009			MD_BUG();
1010			err = 1;
1011			goto abort;
1012		}
1013
1014		if (spare_desc->raid_disk != sdisk->raid_disk) {
1015			MD_BUG();
1016			err = 1;
1017			goto abort;
1018		}
1019
1020		if (sdisk->raid_disk != spare_disk) {
1021			MD_BUG();
1022			err = 1;
1023			goto abort;
1024		}
1025
1026		if (failed_desc->raid_disk != fdisk->raid_disk) {
1027			MD_BUG();
1028			err = 1;
1029			goto abort;
1030		}
1031
1032		if (fdisk->raid_disk != failed_disk) {
1033			MD_BUG();
1034			err = 1;
1035			goto abort;
1036		}
1037
1038		/*
1039		 * do the switch finally
1040		 */
1041		spare_rdev = find_rdev_nr(mddev, spare_desc->number);
1042		failed_rdev = find_rdev_nr(mddev, failed_desc->number);
1043
1044		/* There must be a spare_rdev, but there may not be a
1045		 * failed_rdev.  That slot might be empty...
1046		 */
1047		spare_rdev->desc_nr = failed_desc->number;
1048		if (failed_rdev)
1049			failed_rdev->desc_nr = spare_desc->number;
1050
1051		xchg_values(*spare_desc, *failed_desc);
1052		xchg_values(*fdisk, *sdisk);
1053
1054		/*
1055		 * (careful, 'failed' and 'spare' are switched from now on)
1056		 *
1057		 * we want to preserve linear numbering and we want to
1058		 * give the proper raid_disk number to the now activated
1059		 * disk. (this means we switch back these values)
1060		 */
1061
1062		xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
1063		xchg_values(sdisk->raid_disk, fdisk->raid_disk);
1064		xchg_values(spare_desc->number, failed_desc->number);
1065		xchg_values(sdisk->number, fdisk->number);
1066
1067		*d = failed_desc;
1068
1069		if (sdisk->dev == MKDEV(0,0))
1070			sdisk->used_slot = 0;
1071		/*
1072		 * this really activates the spare.
1073		 */
1074		fdisk->spare = 0;
1075		fdisk->write_only = 0;
1076
1077		/*
1078		 * if we activate a spare, we definitely replace a
1079		 * non-operational disk slot in the 'low' area of
1080		 * the disk array.
1081		 */
1082
1083		conf->working_disks++;
1084
1085		break;
1086
1087	case DISKOP_HOT_REMOVE_DISK:
1088		rdisk = conf->mirrors + removed_disk;
1089
1090		if (rdisk->spare && (removed_disk < conf->raid_disks)) {
1091			MD_BUG();
1092			err = 1;
1093			goto abort;
1094		}
1095		rdisk->dev = MKDEV(0,0);
1096		rdisk->used_slot = 0;
1097		conf->nr_disks--;
1098		break;
1099
1100	case DISKOP_HOT_ADD_DISK:
1101		adisk = conf->mirrors + added_disk;
1102		added_desc = *d;
1103
1104		if (added_disk != added_desc->number) {
1105			MD_BUG();
1106			err = 1;
1107			goto abort;
1108		}
1109
1110		adisk->number = added_desc->number;
1111		adisk->raid_disk = added_desc->raid_disk;
1112		adisk->dev = MKDEV(added_desc->major,added_desc->minor);
1113
1114		adisk->operational = 0;
1115		adisk->write_only = 0;
1116		adisk->spare = 1;
1117		adisk->used_slot = 1;
1118		adisk->head_position = 0;
1119		conf->nr_disks++;
1120
1121		break;
1122
1123	default:
1124		MD_BUG();
1125		err = 1;
1126		goto abort;
1127	}
1128abort:
1129	md_spin_unlock_irq(&conf->device_lock);
1130	if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE)
1131		/* should move to "END_REBUILD" when such exists */
1132		raid1_shrink_buffers(conf);
1133
1134	print_raid1_conf(conf);
1135	return err;
1136}
1137
1138
1139#define IO_ERROR KERN_ALERT \
1140"raid1: %s: unrecoverable I/O read error for block %lu\n"
1141
1142#define REDIRECT_SECTOR KERN_ERR \
1143"raid1: %s: redirecting sector %lu to another mirror\n"
1144
1145/*
1146 * This is a kernel thread which:
1147 *
1148 *	1.	Retries failed read operations on working mirrors.
 *	2.	Updates the raid superblock when problems are encountered.
 *	3.	Performs writes following reads for array synchronisation.
1151 */
1152static void end_sync_write(struct buffer_head *bh, int uptodate);
1153static void end_sync_read(struct buffer_head *bh, int uptodate);
1154
1155static void raid1d (void *data)
1156{
1157	struct raid1_bh *r1_bh;
1158	struct buffer_head *bh;
1159	unsigned long flags;
1160	raid1_conf_t *conf = data;
1161	mddev_t *mddev = conf->mddev;
1162	kdev_t dev;
1163
1164	if (mddev->sb_dirty)
1165		md_update_sb(mddev);
1166
1167	for (;;) {
1168		md_spin_lock_irqsave(&retry_list_lock, flags);
1169		r1_bh = raid1_retry_list;
1170		if (!r1_bh)
1171			break;
1172		raid1_retry_list = r1_bh->next_r1;
1173		md_spin_unlock_irqrestore(&retry_list_lock, flags);
1174
1175		mddev = r1_bh->mddev;
1176		bh = &r1_bh->bh_req;
1177		switch(r1_bh->cmd) {
1178		case SPECIAL:
1179			/* have to allocate lots of bh structures and
1180			 * schedule writes
1181			 */
1182			if (test_bit(R1BH_Uptodate, &r1_bh->state)) {
1183				int i, sum_bhs = 0;
1184				int disks = MD_SB_DISKS;
1185				struct buffer_head *bhl, *mbh;
1186
1187				conf = mddev_to_conf(mddev);
1188				bhl = raid1_alloc_bh(conf, conf->raid_disks); /* don't really need this many */
1189				for (i = 0; i < disks ; i++) {
1190					if (!conf->mirrors[i].operational)
1191						continue;
1192					if (i==conf->last_used)
1193						/* we read from here, no need to write */
1194						continue;
1195					if (i < conf->raid_disks
1196					    && !conf->resync_mirrors)
1197						/* don't need to write this,
1198						 * we are just rebuilding */
1199						continue;
1200					mbh = bhl;
1201					if (!mbh) {
1202						MD_BUG();
1203						break;
1204					}
1205					bhl = mbh->b_next;
1206					mbh->b_this_page = (struct buffer_head *)1;
1207
1208
1209				/*
1210				 * prepare mirrored bh (fields ordered for max mem throughput):
1211				 */
1212					mbh->b_blocknr    = bh->b_blocknr;
1213					mbh->b_dev        = conf->mirrors[i].dev;
1214					mbh->b_rdev	  = conf->mirrors[i].dev;
1215					mbh->b_rsector	  = bh->b_blocknr;
1216					mbh->b_state      = (1<<BH_Req) | (1<<BH_Dirty) |
1217						(1<<BH_Mapped) | (1<<BH_Lock);
1218					atomic_set(&mbh->b_count, 1);
1219					mbh->b_size       = bh->b_size;
1220					mbh->b_page	  = bh->b_page;
1221					mbh->b_data	  = bh->b_data;
1222					mbh->b_list       = BUF_LOCKED;
1223					mbh->b_end_io     = end_sync_write;
1224					mbh->b_private    = r1_bh;
1225
1226					mbh->b_next = r1_bh->mirror_bh_list;
1227					r1_bh->mirror_bh_list = mbh;
1228
1229					sum_bhs++;
1230				}
1231				md_atomic_set(&r1_bh->remaining, sum_bhs);
1232				if (bhl) raid1_free_bh(conf, bhl);
1233				mbh = r1_bh->mirror_bh_list;
1234
1235				if (!sum_bhs) {
					/* nowhere to write this to... I guess we
1237					 * must be done
1238					 */
1239					sync_request_done(bh->b_blocknr, conf);
1240					md_done_sync(mddev, bh->b_size>>9, 0);
1241					raid1_free_buf(r1_bh);
1242				} else
1243				while (mbh) {
1244					struct buffer_head *bh1 = mbh;
1245					mbh = mbh->b_next;
1246					generic_make_request(WRITE, bh1);
1247					md_sync_acct(bh1->b_dev, bh1->b_size/512);
1248				}
1249			} else {
1250				/* There is no point trying a read-for-reconstruct
1251				 * as reconstruct is about to be aborted
1252				 */
1253
1254				printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
1255				md_done_sync(mddev, bh->b_size>>9, 0);
1256			}
1257
1258			break;
1259		case READ:
1260		case READA:
1261			dev = bh->b_dev;
1262			raid1_map (mddev, &bh->b_dev);
1263			if (bh->b_dev == dev) {
1264				printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
1265				raid1_end_bh_io(r1_bh, 0);
1266			} else {
1267				printk (REDIRECT_SECTOR,
1268					partition_name(bh->b_dev), bh->b_blocknr);
1269				bh->b_rdev = bh->b_dev;
1270				bh->b_rsector = bh->b_blocknr;
1271				generic_make_request (r1_bh->cmd, bh);
1272			}
1273			break;
1274		}
1275	}
1276	md_spin_unlock_irqrestore(&retry_list_lock, flags);
1277}
1278#undef IO_ERROR
1279#undef REDIRECT_SECTOR
1280
1281/*
1282 * Private kernel thread to reconstruct mirrors after an unclean
1283 * shutdown.
1284 */
1285static void raid1syncd (void *data)
1286{
1287	raid1_conf_t *conf = data;
1288	mddev_t *mddev = conf->mddev;
1289
1290	if (!conf->resync_mirrors)
1291		return;
1292	if (conf->resync_mirrors == 2)
1293		return;
1294	down(&mddev->recovery_sem);
1295	if (!md_do_sync(mddev, NULL)) {
1296		/*
1297		 * Only if everything went Ok.
1298		 */
1299		conf->resync_mirrors = 0;
1300	}
1301
1302	close_sync(conf);
1303
1304	up(&mddev->recovery_sem);
1305	raid1_shrink_buffers(conf);
1306}
1307
1308/*
1309 * perform a "sync" on one "block"
1310 *
1311 * We need to make sure that no normal I/O request - particularly write
1312 * requests - conflict with active sync requests.
1313 * This is achieved by conceptually dividing the device space into a
1314 * number of sections:
1315 *  DONE: 0 .. a-1     These blocks are in-sync
1316 *  ACTIVE: a.. b-1    These blocks may have active sync requests, but
1317 *                     no normal IO requests
1318 *  READY: b .. c-1    These blocks have no normal IO requests - sync
1319 *                     request may be happening
1320 *  PENDING: c .. d-1  These blocks may have IO requests, but no new
1321 *                     ones will be added
1322 *  FUTURE:  d .. end  These blocks are not to be considered yet. IO may
1323 *                     be happening, but not sync
1324 *
1325 * We keep a
1326 *   phase    which flips (0 or 1) each time d moves and
1327 * a count of:
1328 *   z =  active io requests in FUTURE since d moved - marked with
1329 *        current phase
1330 *   y =  active io requests in FUTURE before d moved, or PENDING -
1331 *        marked with previous phase
1332 *   x =  active sync requests in READY
1333 *   w =  active sync requests in ACTIVE
1334 *   v =  active io requests in DONE
1335 *
1336 * Normally, a=b=c=d=0 and z= active io requests
1337 *   or a=b=c=d=END and v= active io requests
1338 * Allowed changes to a,b,c,d:
1339 * A:  c==d &&  y==0 -> d+=window, y=z, z=0, phase=!phase
1340 * B:  y==0 -> c=d
1341 * C:   b=c, w+=x, x=0
1342 * D:  w==0 -> a=b
1343 * E: a==b==c==d==end -> a=b=c=d=0, z=v, v=0
1344 *
1345 * At start of sync we apply A.
 * When y reaches 0, we apply B then A, then begin sync requests.
 * When the sync point reaches c-1, we wait for y==0 and w==0, and
 * then apply B then A then D then C.
 * Finally, we apply E.
1350 *
1351 * The sync request simply issues a "read" against a working drive
1352 * This is marked so that on completion the raid1d thread is woken to
1353 * issue suitable write requests
1354 */
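/*
 * Illustration (assuming window == 128 sectors): after the counters
 * are initialised, the first call to raid1_sync_request() for sector 0
 * slides the boundaries forward twice, leaving
 *   start_active = start_ready = 0, start_pending = 128,
 *   start_future = 256,
 * so sync reads are issued in the READY range [0,128) while normal
 * requests below start_future sleep in raid1_make_request() and
 * requests at or beyond sector 256 are merely counted in cnt_future.
 * Each time the sync point crosses start_pending the whole window
 * moves forward by 'window' sectors, and close_sync() collapses the
 * boundaries again when the resync finishes or is interrupted.
 */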
1355
1356static int raid1_sync_request (mddev_t *mddev, unsigned long sector_nr)
1357{
1358	raid1_conf_t *conf = mddev_to_conf(mddev);
1359	struct mirror_info *mirror;
1360	struct raid1_bh *r1_bh;
1361	struct buffer_head *bh;
1362	int bsize;
1363	int disk;
1364	int block_nr;
1365	int buffs;
1366
1367	if (!sector_nr) {
		/* we want enough buffers to hold twice the window of 128 */
1369		buffs = 128 *2 / (PAGE_SIZE>>9);
1370		buffs = raid1_grow_buffers(conf, buffs);
1371		if (buffs < 2)
1372			goto nomem;
1373		conf->window = buffs*(PAGE_SIZE>>9)/2;
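		/*
		 * e.g. with 4K pages: PAGE_SIZE>>9 == 8, so buffs == 32
		 * pages are requested and window == 32*8/2 == 128 sectors.
		 */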
1374	}
1375	spin_lock_irq(&conf->segment_lock);
1376	if (!sector_nr) {
1377		/* initialize ...*/
1378		conf->start_active = 0;
1379		conf->start_ready = 0;
1380		conf->start_pending = 0;
1381		conf->start_future = 0;
1382		conf->phase = 0;
1383
1384		conf->cnt_future += conf->cnt_done+conf->cnt_pending;
1385		conf->cnt_done = conf->cnt_pending = 0;
1386		if (conf->cnt_ready || conf->cnt_active)
1387			MD_BUG();
1388	}
1389	while (sector_nr >= conf->start_pending) {
		PRINTK("wait .. sect=%lu start_active=%lu ready=%lu pending=%lu future=%lu, cnt_done=%d active=%d ready=%d pending=%d future=%d\n",
1391			sector_nr, conf->start_active, conf->start_ready, conf->start_pending, conf->start_future,
1392			conf->cnt_done, conf->cnt_active, conf->cnt_ready, conf->cnt_pending, conf->cnt_future);
1393		wait_event_lock_irq(conf->wait_done,
1394					!conf->cnt_active,
1395					conf->segment_lock);
1396		wait_event_lock_irq(conf->wait_ready,
1397					!conf->cnt_pending,
1398					conf->segment_lock);
1399		conf->start_active = conf->start_ready;
1400		conf->start_ready = conf->start_pending;
1401		conf->start_pending = conf->start_future;
1402		conf->start_future = conf->start_future+conf->window;
		/* Note: falling off the end is not a problem */
1404		conf->phase = conf->phase ^1;
1405		conf->cnt_active = conf->cnt_ready;
1406		conf->cnt_ready = 0;
1407		conf->cnt_pending = conf->cnt_future;
1408		conf->cnt_future = 0;
1409		wake_up(&conf->wait_done);
1410	}
1411	conf->cnt_ready++;
1412	spin_unlock_irq(&conf->segment_lock);
1413
1414
	/* If reconstructing, and >1 working disk, we
	 * could dedicate one to the rebuild and the others
	 * to servicing read requests ..
1418	 */
1419	disk = conf->last_used;
1420	/* make sure disk is operational */
1421	while (!conf->mirrors[disk].operational) {
1422		if (disk <= 0) disk = conf->raid_disks;
1423		disk--;
1424		if (disk == conf->last_used)
1425			break;
1426	}
1427	conf->last_used = disk;
1428
1429	mirror = conf->mirrors+conf->last_used;
1430
1431	r1_bh = raid1_alloc_buf (conf);
1432	r1_bh->master_bh = NULL;
1433	r1_bh->mddev = mddev;
1434	r1_bh->cmd = SPECIAL;
1435	bh = &r1_bh->bh_req;
1436
1437	block_nr = sector_nr;
1438	bsize = 512;
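	/*
	 * Grow the transfer to the largest power-of-two size (at most
	 * one page) that sector_nr is naturally aligned to and that
	 * still fits comfortably inside the device.
	 */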
1439	while (!(block_nr & 1) && bsize < PAGE_SIZE
1440			&& (block_nr+2)*(bsize>>9) < (mddev->sb->size *2)) {
1441		block_nr >>= 1;
1442		bsize <<= 1;
1443	}
1444	bh->b_size = bsize;
1445	bh->b_list = BUF_LOCKED;
1446	bh->b_dev = mirror->dev;
1447	bh->b_rdev = mirror->dev;
1448	bh->b_state = (1<<BH_Req) | (1<<BH_Mapped) | (1<<BH_Lock);
1449	if (!bh->b_page)
1450		BUG();
1451	if (!bh->b_data)
1452		BUG();
1453	if (bh->b_data != page_address(bh->b_page))
1454		BUG();
1455	bh->b_end_io = end_sync_read;
1456	bh->b_private = r1_bh;
1457	bh->b_blocknr = sector_nr;
1458	bh->b_rsector = sector_nr;
1459	init_waitqueue_head(&bh->b_wait);
1460
1461	generic_make_request(READ, bh);
1462	md_sync_acct(bh->b_dev, bh->b_size/512);
1463
1464	return (bsize >> 9);
1465
1466nomem:
1467	raid1_shrink_buffers(conf);
1468	return -ENOMEM;
1469}
1470
1471static void end_sync_read(struct buffer_head *bh, int uptodate)
1472{
1473	struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
1474
1475	/* we have read a block, now it needs to be re-written,
1476	 * or re-read if the read failed.
1477	 * We don't do much here, just schedule handling by raid1d
1478	 */
1479	if (!uptodate)
1480		md_error (r1_bh->mddev, bh->b_dev);
1481	else
1482		set_bit(R1BH_Uptodate, &r1_bh->state);
1483	raid1_reschedule_retry(r1_bh);
1484}
1485
1486static void end_sync_write(struct buffer_head *bh, int uptodate)
1487{
1488 	struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
1489
1490	if (!uptodate)
1491 		md_error (r1_bh->mddev, bh->b_dev);
1492	if (atomic_dec_and_test(&r1_bh->remaining)) {
1493		mddev_t *mddev = r1_bh->mddev;
1494 		unsigned long sect = bh->b_blocknr;
1495		int size = bh->b_size;
1496		raid1_free_buf(r1_bh);
1497		sync_request_done(sect, mddev_to_conf(mddev));
1498		md_done_sync(mddev,size>>9, uptodate);
1499	}
1500}
1501
1502#define INVALID_LEVEL KERN_WARNING \
1503"raid1: md%d: raid level not set to mirroring (%d)\n"
1504
1505#define NO_SB KERN_ERR \
1506"raid1: disabled mirror %s (couldn't access raid superblock)\n"
1507
1508#define ERRORS KERN_ERR \
1509"raid1: disabled mirror %s (errors detected)\n"
1510
1511#define NOT_IN_SYNC KERN_ERR \
1512"raid1: disabled mirror %s (not in sync)\n"
1513
1514#define INCONSISTENT KERN_ERR \
1515"raid1: disabled mirror %s (inconsistent descriptor)\n"
1516
1517#define ALREADY_RUNNING KERN_ERR \
1518"raid1: disabled mirror %s (mirror %d already operational)\n"
1519
1520#define OPERATIONAL KERN_INFO \
1521"raid1: device %s operational as mirror %d\n"
1522
1523#define MEM_ERROR KERN_ERR \
1524"raid1: couldn't allocate memory for md%d\n"
1525
1526#define SPARE KERN_INFO \
1527"raid1: spare disk %s\n"
1528
1529#define NONE_OPERATIONAL KERN_ERR \
1530"raid1: no operational mirrors for md%d\n"
1531
1532#define ARRAY_IS_ACTIVE KERN_INFO \
1533"raid1: raid set md%d active with %d out of %d mirrors\n"
1534
1535#define THREAD_ERROR KERN_ERR \
1536"raid1: couldn't allocate thread for md%d\n"
1537
1538#define START_RESYNC KERN_WARNING \
1539"raid1: raid set md%d not clean; reconstructing mirrors\n"
1540
1541static int raid1_run (mddev_t *mddev)
1542{
1543	raid1_conf_t *conf;
1544	int i, j, disk_idx;
1545	struct mirror_info *disk;
1546	mdp_super_t *sb = mddev->sb;
1547	mdp_disk_t *descriptor;
1548	mdk_rdev_t *rdev;
1549	struct md_list_head *tmp;
1550	int start_recovery = 0;
1551
1552	MOD_INC_USE_COUNT;
1553
1554	if (sb->level != 1) {
1555		printk(INVALID_LEVEL, mdidx(mddev), sb->level);
1556		goto out;
1557	}
1558	/*
1559	 * copy the already verified devices into our private RAID1
1560	 * bookkeeping area. [whatever we allocate in raid1_run(),
1561	 * should be freed in raid1_stop()]
1562	 */
1563
1564	conf = kmalloc(sizeof(raid1_conf_t), GFP_KERNEL);
1565	mddev->private = conf;
1566	if (!conf) {
1567		printk(MEM_ERROR, mdidx(mddev));
1568		goto out;
1569	}
1570	memset(conf, 0, sizeof(*conf));
1571
1572	ITERATE_RDEV(mddev,rdev,tmp) {
1573		if (rdev->faulty) {
1574			printk(ERRORS, partition_name(rdev->dev));
1575		} else {
1576			if (!rdev->sb) {
1577				MD_BUG();
1578				continue;
1579			}
1580		}
1581		if (rdev->desc_nr == -1) {
1582			MD_BUG();
1583			continue;
1584		}
1585		descriptor = &sb->disks[rdev->desc_nr];
1586		disk_idx = descriptor->raid_disk;
1587		disk = conf->mirrors + disk_idx;
1588
1589		if (disk_faulty(descriptor)) {
1590			disk->number = descriptor->number;
1591			disk->raid_disk = disk_idx;
1592			disk->dev = rdev->dev;
1593			disk->sect_limit = MAX_WORK_PER_DISK;
1594			disk->operational = 0;
1595			disk->write_only = 0;
1596			disk->spare = 0;
1597			disk->used_slot = 1;
1598			disk->head_position = 0;
1599			continue;
1600		}
1601		if (disk_active(descriptor)) {
1602			if (!disk_sync(descriptor)) {
1603				printk(NOT_IN_SYNC,
1604					partition_name(rdev->dev));
1605				continue;
1606			}
1607			if ((descriptor->number > MD_SB_DISKS) ||
1608					 (disk_idx > sb->raid_disks)) {
1609
1610				printk(INCONSISTENT,
1611					partition_name(rdev->dev));
1612				continue;
1613			}
1614			if (disk->operational) {
1615				printk(ALREADY_RUNNING,
1616					partition_name(rdev->dev),
1617					disk_idx);
1618				continue;
1619			}
1620			printk(OPERATIONAL, partition_name(rdev->dev),
1621 					disk_idx);
1622			disk->number = descriptor->number;
1623			disk->raid_disk = disk_idx;
1624			disk->dev = rdev->dev;
1625			disk->sect_limit = MAX_WORK_PER_DISK;
1626			disk->operational = 1;
1627			disk->write_only = 0;
1628			disk->spare = 0;
1629			disk->used_slot = 1;
1630			disk->head_position = 0;
1631			conf->working_disks++;
1632		} else {
1633		/*
1634		 * Must be a spare disk ..
1635		 */
1636			printk(SPARE, partition_name(rdev->dev));
1637			disk->number = descriptor->number;
1638			disk->raid_disk = disk_idx;
1639			disk->dev = rdev->dev;
1640			disk->sect_limit = MAX_WORK_PER_DISK;
1641			disk->operational = 0;
1642			disk->write_only = 0;
1643			disk->spare = 1;
1644			disk->used_slot = 1;
1645			disk->head_position = 0;
1646		}
1647	}
1648	conf->raid_disks = sb->raid_disks;
1649	conf->nr_disks = sb->nr_disks;
1650	conf->mddev = mddev;
1651	conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
1652
1653	conf->segment_lock = MD_SPIN_LOCK_UNLOCKED;
1654	init_waitqueue_head(&conf->wait_buffer);
1655	init_waitqueue_head(&conf->wait_done);
1656	init_waitqueue_head(&conf->wait_ready);
1657
1658	if (!conf->working_disks) {
1659		printk(NONE_OPERATIONAL, mdidx(mddev));
1660		goto out_free_conf;
1661	}
1662
1663
1664	/* pre-allocate some buffer_head structures.
1665	 * As a minimum, 1 r1bh and raid_disks buffer_heads
1666	 * would probably get us by in tight memory situations,
1667	 * but a few more is probably a good idea.
1668	 * For now, try NR_RESERVED_BUFS r1bh and
1669	 * NR_RESERVED_BUFS*raid_disks bufferheads
1670	 * This will allow at least NR_RESERVED_BUFS concurrent
1671	 * reads or writes even if kmalloc starts failing
1672	 */
1673	if (raid1_grow_r1bh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS ||
1674	    raid1_grow_bh(conf, NR_RESERVED_BUFS*conf->raid_disks)
1675	                      < NR_RESERVED_BUFS*conf->raid_disks) {
1676		printk(MEM_ERROR, mdidx(mddev));
1677		goto out_free_conf;
1678	}
1679
1680	for (i = 0; i < MD_SB_DISKS; i++) {
1681
1682		descriptor = sb->disks+i;
1683		disk_idx = descriptor->raid_disk;
1684		disk = conf->mirrors + disk_idx;
1685
1686		if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) &&
1687				!disk->used_slot) {
1688
1689			disk->number = descriptor->number;
1690			disk->raid_disk = disk_idx;
1691			disk->dev = MKDEV(0,0);
1692
1693			disk->operational = 0;
1694			disk->write_only = 0;
1695			disk->spare = 0;
1696			disk->used_slot = 1;
1697			disk->head_position = 0;
1698		}
1699	}
1700
1701	/*
1702	 * find the first working one and use it as a starting point
	 * for read balancing.
1704	 */
	for (j = 0; j < MD_SB_DISKS && !conf->mirrors[j].operational; j++)
1706		/* nothing */;
1707	conf->last_used = j;
1708
1709
1710	if (conf->working_disks != sb->raid_disks) {
1711		printk(KERN_ALERT "raid1: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
1712		start_recovery = 1;
1713	}
1714
1715	{
1716		const char * name = "raid1d";
1717
1718		conf->thread = md_register_thread(raid1d, conf, name);
1719		if (!conf->thread) {
1720			printk(THREAD_ERROR, mdidx(mddev));
1721			goto out_free_conf;
1722		}
1723	}
1724
1725	if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN)) &&
1726	    (conf->working_disks > 1)) {
1727		const char * name = "raid1syncd";
1728
1729		conf->resync_thread = md_register_thread(raid1syncd, conf,name);
1730		if (!conf->resync_thread) {
1731			printk(THREAD_ERROR, mdidx(mddev));
1732			goto out_free_conf;
1733		}
1734
1735		printk(START_RESYNC, mdidx(mddev));
1736		conf->resync_mirrors = 1;
1737		md_wakeup_thread(conf->resync_thread);
1738	}
1739
1740	/*
1741	 * Regenerate the "device is in sync with the raid set" bit for
1742	 * each device.
1743	 */
1744	for (i = 0; i < MD_SB_DISKS; i++) {
1745		mark_disk_nonsync(sb->disks+i);
1746		for (j = 0; j < sb->raid_disks; j++) {
1747			if (!conf->mirrors[j].operational)
1748				continue;
1749			if (sb->disks[i].number == conf->mirrors[j].number)
1750				mark_disk_sync(sb->disks+i);
1751		}
1752	}
1753	sb->active_disks = conf->working_disks;
1754
1755	if (start_recovery)
1756		md_recover_arrays();
1757
1758
1759	printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks);
1760	/*
1761	 * Ok, everything is just fine now
1762	 */
1763	return 0;
1764
1765out_free_conf:
1766	raid1_shrink_r1bh(conf);
1767	raid1_shrink_bh(conf);
1768	raid1_shrink_buffers(conf);
1769	kfree(conf);
1770	mddev->private = NULL;
1771out:
1772	MOD_DEC_USE_COUNT;
1773	return -EIO;
1774}
1775
1776#undef INVALID_LEVEL
1777#undef NO_SB
1778#undef ERRORS
1779#undef NOT_IN_SYNC
1780#undef INCONSISTENT
1781#undef ALREADY_RUNNING
1782#undef OPERATIONAL
1783#undef SPARE
1784#undef NONE_OPERATIONAL
1785#undef ARRAY_IS_ACTIVE
1786
1787static int raid1_stop_resync (mddev_t *mddev)
1788{
1789	raid1_conf_t *conf = mddev_to_conf(mddev);
1790
1791	if (conf->resync_thread) {
1792		if (conf->resync_mirrors) {
1793			conf->resync_mirrors = 2;
1794			md_interrupt_thread(conf->resync_thread);
1795
1796			printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n");
1797			return 1;
1798		}
1799		return 0;
1800	}
1801	return 0;
1802}
1803
1804static int raid1_restart_resync (mddev_t *mddev)
1805{
1806	raid1_conf_t *conf = mddev_to_conf(mddev);
1807
1808	if (conf->resync_mirrors) {
1809		if (!conf->resync_thread) {
1810			MD_BUG();
1811			return 0;
1812		}
1813		conf->resync_mirrors = 1;
1814		md_wakeup_thread(conf->resync_thread);
1815		return 1;
1816	}
1817	return 0;
1818}
1819
1820static int raid1_stop (mddev_t *mddev)
1821{
1822	raid1_conf_t *conf = mddev_to_conf(mddev);
1823
1824	md_unregister_thread(conf->thread);
1825	if (conf->resync_thread)
1826		md_unregister_thread(conf->resync_thread);
1827	raid1_shrink_r1bh(conf);
1828	raid1_shrink_bh(conf);
1829	raid1_shrink_buffers(conf);
1830	kfree(conf);
1831	mddev->private = NULL;
1832	MOD_DEC_USE_COUNT;
1833	return 0;
1834}
1835
1836static mdk_personality_t raid1_personality=
1837{
1838	name:		"raid1",
1839	make_request:	raid1_make_request,
1840	run:		raid1_run,
1841	stop:		raid1_stop,
1842	status:		raid1_status,
1843	error_handler:	raid1_error,
1844	diskop:		raid1_diskop,
1845	stop_resync:	raid1_stop_resync,
1846	restart_resync:	raid1_restart_resync,
1847	sync_request:	raid1_sync_request
1848};
1849
1850static int md__init raid1_init (void)
1851{
1852	return register_md_personality (RAID1, &raid1_personality);
1853}
1854
1855static void raid1_exit (void)
1856{
1857	unregister_md_personality (RAID1);
1858}
1859
1860module_init(raid1_init);
1861module_exit(raid1_exit);
1862MODULE_LICENSE("GPL");
1863