• Home
  • History
  • Annotate
  • Line#
  • Navigate
  • Raw
  • Download
  • only in /asuswrt-rt-n18u-9.0.0.4.380.2695/release/src-rt-6.x.4708/linux/linux-2.6/drivers/md/
1/*
2 * raid10.c : Multiple Devices driver for Linux
3 *
4 * Copyright (C) 2000-2004 Neil Brown
5 *
6 * RAID-10 support for md.
7 *
 * Based on code in raid1.c.  See raid1.c for further copyright information.
9 *
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2, or (at your option)
14 * any later version.
15 *
16 * You should have received a copy of the GNU General Public License
17 * (for example /usr/src/linux/COPYING); if not, write to the Free
18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */
20
21#include <linux/slab.h>
22#include <linux/delay.h>
23#include <linux/blkdev.h>
24#include <linux/seq_file.h>
25#include "md.h"
26#include "raid10.h"
27#include "raid0.h"
28#include "bitmap.h"
29
30/*
31 * RAID10 provides a combination of RAID0 and RAID1 functionality.
32 * The layout of data is defined by
33 *    chunk_size
34 *    raid_disks
35 *    near_copies (stored in low byte of layout)
36 *    far_copies (stored in second byte of layout)
37 *    far_offset (stored in bit 16 of layout )
38 *
39 * The data to be stored is divided into chunks using chunksize.
40 * Each device is divided into far_copies sections.
41 * In each section, chunks are laid out in a style similar to raid0, but
42 * near_copies copies of each chunk is stored (each on a different drive).
43 * The starting device for each section is offset near_copies from the starting
44 * device of the previous section.
 * Thus there are (near_copies*far_copies) copies of each chunk, and each is on a different
46 * drive.
47 * near_copies and far_copies must be at least one, and their product is at most
48 * raid_disks.
49 *
50 * If far_offset is true, then the far_copies are handled a bit differently.
 * The copies are still in different stripes, but instead of being very far apart
 * on disk, they are in adjacent stripes.
53 */
54
55/*
56 * Number of guaranteed r10bios in case of extreme VM load:
57 */
#define	NR_RAID10_BIOS 256

/* Defined further down; declared early because the pool allocators call it. */
static void unplug_slaves(mddev_t *mddev);

/* Barrier helpers defined further down; free_r10bio() and put_buf() use them. */
static void allow_barrier(conf_t *conf);
static void lower_barrier(conf_t *conf);
64
65static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
66{
67	conf_t *conf = data;
68	r10bio_t *r10_bio;
69	int size = offsetof(struct r10bio_s, devs[conf->copies]);
70
71	/* allocate a r10bio with room for raid_disks entries in the bios array */
72	r10_bio = kzalloc(size, gfp_flags);
73	if (!r10_bio && conf->mddev)
74		unplug_slaves(conf->mddev);
75
76	return r10_bio;
77}
78
/*
 * Release an r10bio obtained from r10bio_pool_alloc().
 * @data is accepted to match the allocator's signature but is not used.
 */
static void r10bio_pool_free(void *r10_bio, void *data)
{
	kfree(r10_bio);
}
83
/* Maximum size of each resync request, in bytes */
#define RESYNC_BLOCK_SIZE (64*1024)
/* pages needed to hold one RESYNC_BLOCK_SIZE request, rounded up */
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
/* amount of memory to reserve for resync requests */
#define RESYNC_WINDOW (1024*1024)
/* maximum number of concurrent requests, memory permitting */
#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
91
92/*
93 * When performing a resync, we need to read and compare, so
94 * we need as many pages are there are copies.
95 * When performing a recovery, we need 2 bios, one for read,
96 * one for write (we recover only one drive per r10buf)
97 *
98 */
99static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
100{
101	conf_t *conf = data;
102	struct page *page;
103	r10bio_t *r10_bio;
104	struct bio *bio;
105	int i, j;
106	int nalloc;
107
108	r10_bio = r10bio_pool_alloc(gfp_flags, conf);
109	if (!r10_bio) {
110		unplug_slaves(conf->mddev);
111		return NULL;
112	}
113
114	if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
115		nalloc = conf->copies; /* resync */
116	else
117		nalloc = 2; /* recovery */
118
119	/*
120	 * Allocate bios.
121	 */
122	for (j = nalloc ; j-- ; ) {
123		bio = bio_alloc(gfp_flags, RESYNC_PAGES);
124		if (!bio)
125			goto out_free_bio;
126		r10_bio->devs[j].bio = bio;
127	}
128	/*
129	 * Allocate RESYNC_PAGES data pages and attach them
130	 * where needed.
131	 */
132	for (j = 0 ; j < nalloc; j++) {
133		bio = r10_bio->devs[j].bio;
134		for (i = 0; i < RESYNC_PAGES; i++) {
135			page = alloc_page(gfp_flags);
136			if (unlikely(!page))
137				goto out_free_pages;
138
139			bio->bi_io_vec[i].bv_page = page;
140		}
141	}
142
143	return r10_bio;
144
145out_free_pages:
146	for ( ; i > 0 ; i--)
147		safe_put_page(bio->bi_io_vec[i-1].bv_page);
148	while (j--)
149		for (i = 0; i < RESYNC_PAGES ; i++)
150			safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
151	j = -1;
152out_free_bio:
153	while ( ++j < nalloc )
154		bio_put(r10_bio->devs[j].bio);
155	r10bio_pool_free(r10_bio, conf);
156	return NULL;
157}
158
159static void r10buf_pool_free(void *__r10_bio, void *data)
160{
161	int i;
162	conf_t *conf = data;
163	r10bio_t *r10bio = __r10_bio;
164	int j;
165
166	for (j=0; j < conf->copies; j++) {
167		struct bio *bio = r10bio->devs[j].bio;
168		if (bio) {
169			for (i = 0; i < RESYNC_PAGES; i++) {
170				safe_put_page(bio->bi_io_vec[i].bv_page);
171				bio->bi_io_vec[i].bv_page = NULL;
172			}
173			bio_put(bio);
174		}
175	}
176	r10bio_pool_free(r10bio, conf);
177}
178
179static void put_all_bios(conf_t *conf, r10bio_t *r10_bio)
180{
181	int i;
182
183	for (i = 0; i < conf->copies; i++) {
184		struct bio **bio = & r10_bio->devs[i].bio;
185		if (*bio && *bio != IO_BLOCKED)
186			bio_put(*bio);
187		*bio = NULL;
188	}
189}
190
191static void free_r10bio(r10bio_t *r10_bio)
192{
193	conf_t *conf = r10_bio->mddev->private;
194
195	/*
196	 * Wake up any possible resync thread that waits for the device
197	 * to go idle.
198	 */
199	allow_barrier(conf);
200
201	put_all_bios(conf, r10_bio);
202	mempool_free(r10_bio, conf->r10bio_pool);
203}
204
205static void put_buf(r10bio_t *r10_bio)
206{
207	conf_t *conf = r10_bio->mddev->private;
208
209	mempool_free(r10_bio, conf->r10buf_pool);
210
211	lower_barrier(conf);
212}
213
214static void reschedule_retry(r10bio_t *r10_bio)
215{
216	unsigned long flags;
217	mddev_t *mddev = r10_bio->mddev;
218	conf_t *conf = mddev->private;
219
220	spin_lock_irqsave(&conf->device_lock, flags);
221	list_add(&r10_bio->retry_list, &conf->retry_list);
222	conf->nr_queued ++;
223	spin_unlock_irqrestore(&conf->device_lock, flags);
224
225	/* wake up frozen array... */
226	wake_up(&conf->wait_barrier);
227
228	md_wakeup_thread(mddev->thread);
229}
230
231/*
232 * raid_end_bio_io() is called when we have finished servicing a mirrored
233 * operation and are ready to return a success/failure code to the buffer
234 * cache layer.
235 */
236static void raid_end_bio_io(r10bio_t *r10_bio)
237{
238	struct bio *bio = r10_bio->master_bio;
239
240	bio_endio(bio,
241		test_bit(R10BIO_Uptodate, &r10_bio->state) ? 0 : -EIO);
242	free_r10bio(r10_bio);
243}
244
245/*
246 * Update disk head position estimator based on IRQ completion info.
247 */
248static inline void update_head_pos(int slot, r10bio_t *r10_bio)
249{
250	conf_t *conf = r10_bio->mddev->private;
251
252	conf->mirrors[r10_bio->devs[slot].devnum].head_position =
253		r10_bio->devs[slot].addr + (r10_bio->sectors);
254}
255
256static void raid10_end_read_request(struct bio *bio, int error)
257{
258	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
259	r10bio_t *r10_bio = bio->bi_private;
260	int slot, dev;
261	conf_t *conf = r10_bio->mddev->private;
262
263
264	slot = r10_bio->read_slot;
265	dev = r10_bio->devs[slot].devnum;
266	/*
267	 * this branch is our 'one mirror IO has finished' event handler:
268	 */
269	update_head_pos(slot, r10_bio);
270
271	if (uptodate) {
272		/*
273		 * Set R10BIO_Uptodate in our master bio, so that
274		 * we will return a good error code to the higher
275		 * levels even if IO on some other mirrored buffer fails.
276		 *
277		 * The 'master' represents the composite IO operation to
278		 * user-side. So if something waits for IO, then it will
279		 * wait for the 'master' bio.
280		 */
281		set_bit(R10BIO_Uptodate, &r10_bio->state);
282		raid_end_bio_io(r10_bio);
283	} else {
284		/*
285		 * oops, read error:
286		 */
287		char b[BDEVNAME_SIZE];
288		if (printk_ratelimit())
289			printk(KERN_ERR "md/raid10:%s: %s: rescheduling sector %llu\n",
290			       mdname(conf->mddev),
291			       bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector);
292		reschedule_retry(r10_bio);
293	}
294
295	rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
296}
297
298static void raid10_end_write_request(struct bio *bio, int error)
299{
300	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
301	r10bio_t *r10_bio = bio->bi_private;
302	int slot, dev;
303	conf_t *conf = r10_bio->mddev->private;
304
305	for (slot = 0; slot < conf->copies; slot++)
306		if (r10_bio->devs[slot].bio == bio)
307			break;
308	dev = r10_bio->devs[slot].devnum;
309
310	/*
311	 * this branch is our 'one mirror IO has finished' event handler:
312	 */
313	if (!uptodate) {
314		md_error(r10_bio->mddev, conf->mirrors[dev].rdev);
315		/* an I/O failed, we can't clear the bitmap */
316		set_bit(R10BIO_Degraded, &r10_bio->state);
317	} else
318		/*
319		 * Set R10BIO_Uptodate in our master bio, so that
320		 * we will return a good error code for to the higher
321		 * levels even if IO on some other mirrored buffer fails.
322		 *
323		 * The 'master' represents the composite IO operation to
324		 * user-side. So if something waits for IO, then it will
325		 * wait for the 'master' bio.
326		 */
327		set_bit(R10BIO_Uptodate, &r10_bio->state);
328
329	update_head_pos(slot, r10_bio);
330
331	/*
332	 *
333	 * Let's see if all mirrored write operations have finished
334	 * already.
335	 */
336	if (atomic_dec_and_test(&r10_bio->remaining)) {
337		/* clear the bitmap if all writes complete successfully */
338		bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
339				r10_bio->sectors,
340				!test_bit(R10BIO_Degraded, &r10_bio->state),
341				0);
342		md_write_end(r10_bio->mddev);
343		raid_end_bio_io(r10_bio);
344	}
345
346	rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
347}
348
349
350/*
351 * RAID10 layout manager
 * As well as the chunksize and raid_disks count, there are two
353 * parameters: near_copies and far_copies.
354 * near_copies * far_copies must be <= raid_disks.
355 * Normally one of these will be 1.
356 * If both are 1, we get raid0.
357 * If near_copies == raid_disks, we get raid1.
358 *
 * Chunks are laid out in raid0 style with near_copies copies of the
360 * first chunk, followed by near_copies copies of the next chunk and
361 * so on.
362 * If far_copies > 1, then after 1/far_copies of the array has been assigned
363 * as described above, we start again with a device offset of near_copies.
364 * So we effectively have another copy of the whole array further down all
365 * the drives, but with blocks on different drives.
 * With this layout, a block is never stored twice on the one device.
367 *
368 * raid10_find_phys finds the sector offset of a given virtual sector
369 * on each device that it is on.
370 *
371 * raid10_find_virt does the reverse mapping, from a device and a
372 * sector offset to a virtual address
373 */
374
375static void raid10_find_phys(conf_t *conf, r10bio_t *r10bio)
376{
377	int n,f;
378	sector_t sector;
379	sector_t chunk;
380	sector_t stripe;
381	int dev;
382
383	int slot = 0;
384
385	/* now calculate first sector/dev */
386	chunk = r10bio->sector >> conf->chunk_shift;
387	sector = r10bio->sector & conf->chunk_mask;
388
389	chunk *= conf->near_copies;
390	stripe = chunk;
391	dev = sector_div(stripe, conf->raid_disks);
392	if (conf->far_offset)
393		stripe *= conf->far_copies;
394
395	sector += stripe << conf->chunk_shift;
396
397	/* and calculate all the others */
398	for (n=0; n < conf->near_copies; n++) {
399		int d = dev;
400		sector_t s = sector;
401		r10bio->devs[slot].addr = sector;
402		r10bio->devs[slot].devnum = d;
403		slot++;
404
405		for (f = 1; f < conf->far_copies; f++) {
406			d += conf->near_copies;
407			if (d >= conf->raid_disks)
408				d -= conf->raid_disks;
409			s += conf->stride;
410			r10bio->devs[slot].devnum = d;
411			r10bio->devs[slot].addr = s;
412			slot++;
413		}
414		dev++;
415		if (dev >= conf->raid_disks) {
416			dev = 0;
417			sector += (conf->chunk_mask + 1);
418		}
419	}
420	BUG_ON(slot != conf->copies);
421}
422
423static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev)
424{
425	sector_t offset, chunk, vchunk;
426
427	offset = sector & conf->chunk_mask;
428	if (conf->far_offset) {
429		int fc;
430		chunk = sector >> conf->chunk_shift;
431		fc = sector_div(chunk, conf->far_copies);
432		dev -= fc * conf->near_copies;
433		if (dev < 0)
434			dev += conf->raid_disks;
435	} else {
436		while (sector >= conf->stride) {
437			sector -= conf->stride;
438			if (dev < conf->near_copies)
439				dev += conf->raid_disks - conf->near_copies;
440			else
441				dev -= conf->near_copies;
442		}
443		chunk = sector >> conf->chunk_shift;
444	}
445	vchunk = chunk * conf->raid_disks + dev;
446	sector_div(vchunk, conf->near_copies);
447	return (vchunk << conf->chunk_shift) + offset;
448}
449
450/**
451 *	raid10_mergeable_bvec -- tell bio layer if a two requests can be merged
452 *	@q: request queue
453 *	@bvm: properties of new bio
454 *	@biovec: the request that could be merged to it.
455 *
456 *	Return amount of bytes we can accept at this offset
457 *      If near_copies == raid_disk, there are no striping issues,
458 *      but in that case, the function isn't called at all.
459 */
460static int raid10_mergeable_bvec(struct request_queue *q,
461				 struct bvec_merge_data *bvm,
462				 struct bio_vec *biovec)
463{
464	mddev_t *mddev = q->queuedata;
465	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
466	int max;
467	unsigned int chunk_sectors = mddev->chunk_sectors;
468	unsigned int bio_sectors = bvm->bi_size >> 9;
469
470	max =  (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
471	if (max < 0) max = 0; /* bio_add cannot handle a negative return */
472	if (max <= biovec->bv_len && bio_sectors == 0)
473		return biovec->bv_len;
474	else
475		return max;
476}
477
478/*
479 * This routine returns the disk from which the requested read should
480 * be done. There is a per-array 'next expected sequential IO' sector
481 * number - if this matches on the next IO then we use the last disk.
482 * There is also a per-disk 'last know head position' sector that is
483 * maintained from IRQ contexts, both the normal and the resync IO
484 * completion handlers update this position correctly. If there is no
485 * perfect sequential match then we pick the disk whose head is closest.
486 *
487 * If there are 2 mirrors in the same 2 devices, performance degrades
488 * because position is mirror, not device based.
489 *
490 * The rdev for the device selected will have nr_pending incremented.
491 */
492
493static int read_balance(conf_t *conf, r10bio_t *r10_bio)
494{
495	const sector_t this_sector = r10_bio->sector;
496	int disk, slot, nslot;
497	const int sectors = r10_bio->sectors;
498	sector_t new_distance, current_distance;
499	mdk_rdev_t *rdev;
500
501	raid10_find_phys(conf, r10_bio);
502	rcu_read_lock();
503	/*
504	 * Check if we can balance. We can balance on the whole
505	 * device if no resync is going on (recovery is ok), or below
506	 * the resync window. We take the first readable disk when
507	 * above the resync window.
508	 */
509	if (conf->mddev->recovery_cp < MaxSector
510	    && (this_sector + sectors >= conf->next_resync)) {
511		/* make sure that disk is operational */
512		slot = 0;
513		disk = r10_bio->devs[slot].devnum;
514
515		while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
516		       r10_bio->devs[slot].bio == IO_BLOCKED ||
517		       !test_bit(In_sync, &rdev->flags)) {
518			slot++;
519			if (slot == conf->copies) {
520				slot = 0;
521				disk = -1;
522				break;
523			}
524			disk = r10_bio->devs[slot].devnum;
525		}
526		goto rb_out;
527	}
528
529
530	/* make sure the disk is operational */
531	slot = 0;
532	disk = r10_bio->devs[slot].devnum;
533	while ((rdev=rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
534	       r10_bio->devs[slot].bio == IO_BLOCKED ||
535	       !test_bit(In_sync, &rdev->flags)) {
536		slot ++;
537		if (slot == conf->copies) {
538			disk = -1;
539			goto rb_out;
540		}
541		disk = r10_bio->devs[slot].devnum;
542	}
543
544
545	current_distance = abs(r10_bio->devs[slot].addr -
546			       conf->mirrors[disk].head_position);
547
548	/* Find the disk whose head is closest,
549	 * or - for far > 1 - find the closest to partition beginning */
550
551	for (nslot = slot; nslot < conf->copies; nslot++) {
552		int ndisk = r10_bio->devs[nslot].devnum;
553
554
555		if ((rdev=rcu_dereference(conf->mirrors[ndisk].rdev)) == NULL ||
556		    r10_bio->devs[nslot].bio == IO_BLOCKED ||
557		    !test_bit(In_sync, &rdev->flags))
558			continue;
559
560		/* This optimisation is debatable, and completely destroys
561		 * sequential read speed for 'far copies' arrays.  So only
562		 * keep it for 'near' arrays, and review those later.
563		 */
564		if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) {
565			disk = ndisk;
566			slot = nslot;
567			break;
568		}
569
570		/* for far > 1 always use the lowest address */
571		if (conf->far_copies > 1)
572			new_distance = r10_bio->devs[nslot].addr;
573		else
574			new_distance = abs(r10_bio->devs[nslot].addr -
575					   conf->mirrors[ndisk].head_position);
576		if (new_distance < current_distance) {
577			current_distance = new_distance;
578			disk = ndisk;
579			slot = nslot;
580		}
581	}
582
583rb_out:
584	r10_bio->read_slot = slot;
585/*	conf->next_seq_sect = this_sector + sectors;*/
586
587	if (disk >= 0 && (rdev=rcu_dereference(conf->mirrors[disk].rdev))!= NULL)
588		atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
589	else
590		disk = -1;
591	rcu_read_unlock();
592
593	return disk;
594}
595
596static void unplug_slaves(mddev_t *mddev)
597{
598	conf_t *conf = mddev->private;
599	int i;
600
601	rcu_read_lock();
602	for (i=0; i < conf->raid_disks; i++) {
603		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
604		if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
605			struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
606
607			atomic_inc(&rdev->nr_pending);
608			rcu_read_unlock();
609
610			blk_unplug(r_queue);
611
612			rdev_dec_pending(rdev, mddev);
613			rcu_read_lock();
614		}
615	}
616	rcu_read_unlock();
617}
618
619static void raid10_unplug(struct request_queue *q)
620{
621	mddev_t *mddev = q->queuedata;
622
623	unplug_slaves(q->queuedata);
624	md_wakeup_thread(mddev->thread);
625}
626
627static int raid10_congested(void *data, int bits)
628{
629	mddev_t *mddev = data;
630	conf_t *conf = mddev->private;
631	int i, ret = 0;
632
633	if (mddev_congested(mddev, bits))
634		return 1;
635	rcu_read_lock();
636	for (i = 0; i < conf->raid_disks && ret == 0; i++) {
637		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
638		if (rdev && !test_bit(Faulty, &rdev->flags)) {
639			struct request_queue *q = bdev_get_queue(rdev->bdev);
640
641			ret |= bdi_congested(&q->backing_dev_info, bits);
642		}
643	}
644	rcu_read_unlock();
645	return ret;
646}
647
648static int flush_pending_writes(conf_t *conf)
649{
650	/* Any writes that have been queued but are awaiting
651	 * bitmap updates get flushed here.
652	 * We return 1 if any requests were actually submitted.
653	 */
654	int rv = 0;
655
656	spin_lock_irq(&conf->device_lock);
657
658	if (conf->pending_bio_list.head) {
659		struct bio *bio;
660		bio = bio_list_get(&conf->pending_bio_list);
661		blk_remove_plug(conf->mddev->queue);
662		spin_unlock_irq(&conf->device_lock);
663		/* flush any pending bitmap writes to disk
664		 * before proceeding w/ I/O */
665		bitmap_unplug(conf->mddev->bitmap);
666
667		while (bio) { /* submit pending writes */
668			struct bio *next = bio->bi_next;
669			bio->bi_next = NULL;
670			generic_make_request(bio);
671			bio = next;
672		}
673		rv = 1;
674	} else
675		spin_unlock_irq(&conf->device_lock);
676	return rv;
677}
678/* Barriers....
679 * Sometimes we need to suspend IO while we do something else,
680 * either some resync/recovery, or reconfigure the array.
681 * To do this we raise a 'barrier'.
682 * The 'barrier' is a counter that can be raised multiple times
683 * to count how many activities are happening which preclude
684 * normal IO.
685 * We can only raise the barrier if there is no pending IO.
686 * i.e. if nr_pending == 0.
687 * We choose only to raise the barrier if no-one is waiting for the
688 * barrier to go down.  This means that as soon as an IO request
689 * is ready, no other operations which require a barrier will start
690 * until the IO request has had a chance.
691 *
 * So: regular IO calls 'wait_barrier'.  When that returns there
 *    is no background IO happening.  It must arrange to call
 *    allow_barrier when it has finished its IO.
 * Background IO must call raise_barrier.  Once that returns
 *    there is no normal IO happening.  It must arrange to call
 *    lower_barrier when the particular background IO completes.
698 */
699
/*
 * Raise the barrier by one on behalf of background IO.
 * @force: skip the wait for nr_waiting to reach zero; only legal while
 *         the barrier is already raised (BUG otherwise).
 *
 * Returns once nr_pending is zero and the barrier count is below
 * RESYNC_DEPTH.  The queue is unplugged each time a wait has to sleep.
 */
static void raise_barrier(conf_t *conf, int force)
{
	BUG_ON(force && !conf->barrier);
	spin_lock_irq(&conf->resync_lock);

	/* Wait until no block IO is waiting (unless 'force') */
	wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
			    conf->resync_lock,
			    raid10_unplug(conf->mddev->queue));

	/* block any new IO from starting */
	conf->barrier++;

	/* Now wait for all pending IO to complete */
	wait_event_lock_irq(conf->wait_barrier,
			    !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
			    conf->resync_lock,
			    raid10_unplug(conf->mddev->queue));

	spin_unlock_irq(&conf->resync_lock);
}
721
/*
 * Lower the barrier by one and wake everything sleeping on
 * conf->wait_barrier.  Uses the irqsave lock variant, so the caller's
 * interrupt state is preserved.
 */
static void lower_barrier(conf_t *conf)
{
	unsigned long flags;
	spin_lock_irqsave(&conf->resync_lock, flags);
	conf->barrier--;
	spin_unlock_irqrestore(&conf->resync_lock, flags);
	wake_up(&conf->wait_barrier);
}
730
/*
 * Called by normal IO before it starts.  If a barrier is raised, the
 * caller is counted in nr_waiting and sleeps until the barrier drops to
 * zero.  On return nr_pending has been incremented; the caller must
 * balance that with allow_barrier().
 */
static void wait_barrier(conf_t *conf)
{
	spin_lock_irq(&conf->resync_lock);
	if (conf->barrier) {
		/* nr_waiting > 0 keeps a non-forced raise_barrier() from starting */
		conf->nr_waiting++;
		wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
				    conf->resync_lock,
				    raid10_unplug(conf->mddev->queue));
		conf->nr_waiting--;
	}
	conf->nr_pending++;
	spin_unlock_irq(&conf->resync_lock);
}
744
/*
 * Called when a normal IO request is finished: drops the nr_pending
 * count taken by wait_barrier() and wakes everything sleeping on
 * conf->wait_barrier so that raise_barrier()/freeze_array() can re-check.
 */
static void allow_barrier(conf_t *conf)
{
	unsigned long flags;
	spin_lock_irqsave(&conf->resync_lock, flags);
	conf->nr_pending--;
	spin_unlock_irqrestore(&conf->resync_lock, flags);
	wake_up(&conf->wait_barrier);
}
753
/*
 * Quiesce the array from within the handling of one failed request.
 * Undone by unfreeze_array().
 */
static void freeze_array(conf_t *conf)
{
	/* stop syncio and normal IO and wait for everything to
	 * go quiet.
	 * We increment barrier and nr_waiting, and then
	 * wait until nr_pending match nr_queued+1
	 * This is called in the context of one normal IO request
	 * that has failed. Thus any sync request that might be pending
	 * will be blocked by nr_pending, and we need to wait for
	 * pending IO requests to complete or be queued for re-try.
	 * Thus the number queued (nr_queued) plus this request (1)
	 * must match the number of pending IOs (nr_pending) before
	 * we continue.
	 */
	spin_lock_irq(&conf->resync_lock);
	conf->barrier++;
	conf->nr_waiting++;
	/* while waiting, push out queued writes and unplug the queue */
	wait_event_lock_irq(conf->wait_barrier,
			    conf->nr_pending == conf->nr_queued+1,
			    conf->resync_lock,
			    ({ flush_pending_writes(conf);
			       raid10_unplug(conf->mddev->queue); }));
	spin_unlock_irq(&conf->resync_lock);
}
778
/*
 * Undo freeze_array(): drop the barrier and nr_waiting counts it took
 * and wake everything sleeping on conf->wait_barrier.
 */
static void unfreeze_array(conf_t *conf)
{
	/* reverse the effect of the freeze */
	spin_lock_irq(&conf->resync_lock);
	conf->barrier--;
	conf->nr_waiting--;
	wake_up(&conf->wait_barrier);
	spin_unlock_irq(&conf->resync_lock);
}
788
/*
 * Entry point for a bio submitted to the array.  Every path visible here
 * returns 0.
 *
 *  - Hard-barrier requests are handed to md_barrier_request().
 *  - A bio that crosses a chunk boundary (and near_copies < raid_disks)
 *    must be a single-vector bio; it is split in two and each half is
 *    fed back through make_request().  Anything else is failed with
 *    bio_io_error().
 *  - READ: one copy is chosen by read_balance(), the bio is cloned and
 *    submitted to that device.  If no copy is readable the master bio
 *    is completed at once (as an error, since R10BIO_Uptodate is clear).
 *  - WRITE: the bio is cloned once per usable copy.  The clones are not
 *    submitted here; they are appended to conf->pending_bio_list and the
 *    queue is plugged, to be sent by flush_pending_writes().
 */
static int make_request(mddev_t *mddev, struct bio * bio)
{
	conf_t *conf = mddev->private;
	mirror_info_t *mirror;
	r10bio_t *r10_bio;
	struct bio *read_bio;
	int i;
	int chunk_sects = conf->chunk_mask + 1;
	const int rw = bio_data_dir(bio);
	const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
	struct bio_list bl;
	unsigned long flags;
	mdk_rdev_t *blocked_rdev;

	if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
		md_barrier_request(mddev, bio);
		return 0;
	}

	/* If this request crosses a chunk boundary, we need to
	 * split it.  This will only happen for 1 PAGE (or less) requests.
	 */
	if (unlikely( (bio->bi_sector & conf->chunk_mask) + (bio->bi_size >> 9)
		      > chunk_sects &&
		    conf->near_copies < conf->raid_disks)) {
		struct bio_pair *bp;
		/* Sanity check -- queue functions should prevent this happening */
		if (bio->bi_vcnt != 1 ||
		    bio->bi_idx != 0)
			goto bad_map;
		/* This is a one page bio that upper layers
		 * refuse to split for us, so we need to split it.
		 */
		bp = bio_split(bio,
			       chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );

		/* Each of these 'make_request' calls will call 'wait_barrier'.
		 * If the first succeeds but the second blocks due to the resync
		 * thread raising the barrier, we will deadlock because the
		 * IO to the underlying device will be queued in generic_make_request
		 * and will never complete, so will never reduce nr_pending.
		 * So increment nr_waiting here so no new raise_barriers will
		 * succeed, and so the second wait_barrier cannot block.
		 */
		spin_lock_irq(&conf->resync_lock);
		conf->nr_waiting++;
		spin_unlock_irq(&conf->resync_lock);

		if (make_request(mddev, &bp->bio1))
			generic_make_request(&bp->bio1);
		if (make_request(mddev, &bp->bio2))
			generic_make_request(&bp->bio2);

		spin_lock_irq(&conf->resync_lock);
		conf->nr_waiting--;
		wake_up(&conf->wait_barrier);
		spin_unlock_irq(&conf->resync_lock);

		bio_pair_release(bp);
		return 0;
	bad_map:
		printk("md/raid10:%s: make_request bug: can't convert block across chunks"
		       " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2,
		       (unsigned long long)bio->bi_sector, bio->bi_size >> 10);

		bio_io_error(bio);
		return 0;
	}

	md_write_start(mddev, bio);

	/*
	 * Register the new request and wait if the reconstruction
	 * thread has put up a bar for new requests.
	 * Continue immediately if no resync is active currently.
	 */
	wait_barrier(conf);

	r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);

	r10_bio->master_bio = bio;
	r10_bio->sectors = bio->bi_size >> 9;

	r10_bio->mddev = mddev;
	r10_bio->sector = bio->bi_sector;
	r10_bio->state = 0;

	if (rw == READ) {
		/*
		 * read balancing logic:
		 */
		int disk = read_balance(conf, r10_bio);
		int slot = r10_bio->read_slot;
		if (disk < 0) {
			/* no readable copy: R10BIO_Uptodate is clear, so this ends the bio with -EIO */
			raid_end_bio_io(r10_bio);
			return 0;
		}
		mirror = conf->mirrors + disk;

		read_bio = bio_clone(bio, GFP_NOIO);

		r10_bio->devs[slot].bio = read_bio;

		/* redirect the clone to the chosen device and its data area */
		read_bio->bi_sector = r10_bio->devs[slot].addr +
			mirror->rdev->data_offset;
		read_bio->bi_bdev = mirror->rdev->bdev;
		read_bio->bi_end_io = raid10_end_read_request;
		read_bio->bi_rw = READ | do_sync;
		read_bio->bi_private = r10_bio;

		generic_make_request(read_bio);
		return 0;
	}

	/*
	 * WRITE:
	 */
	/* first select target devices under rcu_lock and
	 * inc refcount on their rdev.  Record them by setting
	 * bios[x] to bio
	 */
	raid10_find_phys(conf, r10_bio);
 retry_write:
	blocked_rdev = NULL;
	rcu_read_lock();
	for (i = 0;  i < conf->copies; i++) {
		int d = r10_bio->devs[i].devnum;
		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev);
		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
			/* hold the blocked rdev so it can be waited on below */
			atomic_inc(&rdev->nr_pending);
			blocked_rdev = rdev;
			break;
		}
		if (rdev && !test_bit(Faulty, &rdev->flags)) {
			atomic_inc(&rdev->nr_pending);
			r10_bio->devs[i].bio = bio;
		} else {
			/* this copy cannot be written */
			r10_bio->devs[i].bio = NULL;
			set_bit(R10BIO_Degraded, &r10_bio->state);
		}
	}
	rcu_read_unlock();

	if (unlikely(blocked_rdev)) {
		/* Have to wait for this device to get unblocked, then retry */
		int j;
		int d;

		/* drop the references taken on the slots before the blocked one */
		for (j = 0; j < i; j++)
			if (r10_bio->devs[j].bio) {
				d = r10_bio->devs[j].devnum;
				rdev_dec_pending(conf->mirrors[d].rdev, mddev);
			}
		allow_barrier(conf);
		md_wait_for_blocked_rdev(blocked_rdev, mddev);
		wait_barrier(conf);
		goto retry_write;
	}

	atomic_set(&r10_bio->remaining, 0);

	/* build one clone per selected copy on a private list */
	bio_list_init(&bl);
	for (i = 0; i < conf->copies; i++) {
		struct bio *mbio;
		int d = r10_bio->devs[i].devnum;
		if (!r10_bio->devs[i].bio)
			continue;

		mbio = bio_clone(bio, GFP_NOIO);
		r10_bio->devs[i].bio = mbio;

		mbio->bi_sector	= r10_bio->devs[i].addr+
			conf->mirrors[d].rdev->data_offset;
		mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
		mbio->bi_end_io	= raid10_end_write_request;
		mbio->bi_rw = WRITE | do_sync;
		mbio->bi_private = r10_bio;

		atomic_inc(&r10_bio->remaining);
		bio_list_add(&bl, mbio);
	}

	if (unlikely(!atomic_read(&r10_bio->remaining))) {
		/* the array is dead */
		md_write_end(mddev);
		raid_end_bio_io(r10_bio);
		return 0;
	}

	bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0);
	/* hand the clones over; flush_pending_writes() submits them */
	spin_lock_irqsave(&conf->device_lock, flags);
	bio_list_merge(&conf->pending_bio_list, &bl);
	blk_plug_device(mddev->queue);
	spin_unlock_irqrestore(&conf->device_lock, flags);

	/* In case raid10d snuck in to freeze_array */
	wake_up(&conf->wait_barrier);

	if (do_sync)
		md_wakeup_thread(mddev->thread);

	return 0;
}
992
993static void status(struct seq_file *seq, mddev_t *mddev)
994{
995	conf_t *conf = mddev->private;
996	int i;
997
998	if (conf->near_copies < conf->raid_disks)
999		seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
1000	if (conf->near_copies > 1)
1001		seq_printf(seq, " %d near-copies", conf->near_copies);
1002	if (conf->far_copies > 1) {
1003		if (conf->far_offset)
1004			seq_printf(seq, " %d offset-copies", conf->far_copies);
1005		else
1006			seq_printf(seq, " %d far-copies", conf->far_copies);
1007	}
1008	seq_printf(seq, " [%d/%d] [", conf->raid_disks,
1009					conf->raid_disks - mddev->degraded);
1010	for (i = 0; i < conf->raid_disks; i++)
1011		seq_printf(seq, "%s",
1012			      conf->mirrors[i].rdev &&
1013			      test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_");
1014	seq_printf(seq, "]");
1015}
1016
1017static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1018{
1019	char b[BDEVNAME_SIZE];
1020	conf_t *conf = mddev->private;
1021
1022	/*
1023	 * If it is not operational, then we have already marked it as dead
1024	 * else if it is the last working disks, ignore the error, let the
1025	 * next level up know.
1026	 * else mark the drive as failed
1027	 */
1028	if (test_bit(In_sync, &rdev->flags)
1029	    && conf->raid_disks-mddev->degraded == 1)
1030		/*
1031		 * Don't fail the drive, just return an IO error.
1032		 * The test should really be more sophisticated than
1033		 * "working_disks == 1", but it isn't critical, and
1034		 * can wait until we do more sophisticated "is the drive
1035		 * really dead" tests...
1036		 */
1037		return;
1038	if (test_and_clear_bit(In_sync, &rdev->flags)) {
1039		unsigned long flags;
1040		spin_lock_irqsave(&conf->device_lock, flags);
1041		mddev->degraded++;
1042		spin_unlock_irqrestore(&conf->device_lock, flags);
1043		/*
1044		 * if recovery is running, make sure it aborts.
1045		 */
1046		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1047	}
1048	set_bit(Faulty, &rdev->flags);
1049	set_bit(MD_CHANGE_DEVS, &mddev->flags);
1050	printk(KERN_ALERT "md/raid10:%s: Disk failure on %s, disabling device.\n"
1051	       KERN_ALERT "md/raid10:%s: Operation continuing on %d devices.\n",
1052	       mdname(mddev), bdevname(rdev->bdev, b),
1053	       mdname(mddev), conf->raid_disks - mddev->degraded);
1054}
1055
1056static void print_conf(conf_t *conf)
1057{
1058	int i;
1059	mirror_info_t *tmp;
1060
1061	printk(KERN_DEBUG "RAID10 conf printout:\n");
1062	if (!conf) {
1063		printk(KERN_DEBUG "(!conf)\n");
1064		return;
1065	}
1066	printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
1067		conf->raid_disks);
1068
1069	for (i = 0; i < conf->raid_disks; i++) {
1070		char b[BDEVNAME_SIZE];
1071		tmp = conf->mirrors + i;
1072		if (tmp->rdev)
1073			printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",
1074				i, !test_bit(In_sync, &tmp->rdev->flags),
1075			        !test_bit(Faulty, &tmp->rdev->flags),
1076				bdevname(tmp->rdev->bdev,b));
1077	}
1078}
1079
1080static void close_sync(conf_t *conf)
1081{
1082	wait_barrier(conf);
1083	allow_barrier(conf);
1084
1085	mempool_destroy(conf->r10buf_pool);
1086	conf->r10buf_pool = NULL;
1087}
1088
1089/* check if there are enough drives for
1090 * every block to appear on atleast one
1091 */
1092static int enough(conf_t *conf)
1093{
1094	int first = 0;
1095
1096	do {
1097		int n = conf->copies;
1098		int cnt = 0;
1099		while (n--) {
1100			if (conf->mirrors[first].rdev)
1101				cnt++;
1102			first = (first+1) % conf->raid_disks;
1103		}
1104		if (cnt == 0)
1105			return 0;
1106	} while (first != 0);
1107	return 1;
1108}
1109
1110static int raid10_spare_active(mddev_t *mddev)
1111{
1112	int i;
1113	conf_t *conf = mddev->private;
1114	mirror_info_t *tmp;
1115	int count = 0;
1116	unsigned long flags;
1117
1118	/*
1119	 * Find all non-in_sync disks within the RAID10 configuration
1120	 * and mark them in_sync
1121	 */
1122	for (i = 0; i < conf->raid_disks; i++) {
1123		tmp = conf->mirrors + i;
1124		if (tmp->rdev
1125		    && !test_bit(Faulty, &tmp->rdev->flags)
1126		    && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
1127			count++;
1128			sysfs_notify_dirent(tmp->rdev->sysfs_state);
1129		}
1130	}
1131	spin_lock_irqsave(&conf->device_lock, flags);
1132	mddev->degraded -= count;
1133	spin_unlock_irqrestore(&conf->device_lock, flags);
1134
1135	print_conf(conf);
1136	return count;
1137}
1138
1139
1140static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1141{
1142	conf_t *conf = mddev->private;
1143	int err = -EEXIST;
1144	int mirror;
1145	mirror_info_t *p;
1146	int first = 0;
1147	int last = conf->raid_disks - 1;
1148
1149	if (mddev->recovery_cp < MaxSector)
1150		/* only hot-add to in-sync arrays, as recovery is
1151		 * very different from resync
1152		 */
1153		return -EBUSY;
1154	if (!enough(conf))
1155		return -EINVAL;
1156
1157	if (rdev->raid_disk >= 0)
1158		first = last = rdev->raid_disk;
1159
1160	if (rdev->saved_raid_disk >= 0 &&
1161	    rdev->saved_raid_disk >= first &&
1162	    conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1163		mirror = rdev->saved_raid_disk;
1164	else
1165		mirror = first;
1166	for ( ; mirror <= last ; mirror++)
1167		if ( !(p=conf->mirrors+mirror)->rdev) {
1168
1169			disk_stack_limits(mddev->gendisk, rdev->bdev,
1170					  rdev->data_offset << 9);
1171			/* as we don't honour merge_bvec_fn, we must
1172			 * never risk violating it, so limit
1173			 * ->max_segments to one lying with a single
1174			 * page, as a one page request is never in
1175			 * violation.
1176			 */
1177			if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
1178				blk_queue_max_segments(mddev->queue, 1);
1179				blk_queue_segment_boundary(mddev->queue,
1180							   PAGE_CACHE_SIZE - 1);
1181			}
1182
1183			p->head_position = 0;
1184			rdev->raid_disk = mirror;
1185			err = 0;
1186			if (rdev->saved_raid_disk != mirror)
1187				conf->fullsync = 1;
1188			rcu_assign_pointer(p->rdev, rdev);
1189			break;
1190		}
1191
1192	md_integrity_add_rdev(rdev, mddev);
1193	print_conf(conf);
1194	return err;
1195}
1196
1197static int raid10_remove_disk(mddev_t *mddev, int number)
1198{
1199	conf_t *conf = mddev->private;
1200	int err = 0;
1201	mdk_rdev_t *rdev;
1202	mirror_info_t *p = conf->mirrors+ number;
1203
1204	print_conf(conf);
1205	rdev = p->rdev;
1206	if (rdev) {
1207		if (test_bit(In_sync, &rdev->flags) ||
1208		    atomic_read(&rdev->nr_pending)) {
1209			err = -EBUSY;
1210			goto abort;
1211		}
1212		/* Only remove faulty devices in recovery
1213		 * is not possible.
1214		 */
1215		if (!test_bit(Faulty, &rdev->flags) &&
1216		    enough(conf)) {
1217			err = -EBUSY;
1218			goto abort;
1219		}
1220		p->rdev = NULL;
1221		synchronize_rcu();
1222		if (atomic_read(&rdev->nr_pending)) {
1223			/* lost the race, try later */
1224			err = -EBUSY;
1225			p->rdev = rdev;
1226			goto abort;
1227		}
1228		md_integrity_register(mddev);
1229	}
1230abort:
1231
1232	print_conf(conf);
1233	return err;
1234}
1235
1236
1237static void end_sync_read(struct bio *bio, int error)
1238{
1239	r10bio_t *r10_bio = bio->bi_private;
1240	conf_t *conf = r10_bio->mddev->private;
1241	int i,d;
1242
1243	for (i=0; i<conf->copies; i++)
1244		if (r10_bio->devs[i].bio == bio)
1245			break;
1246	BUG_ON(i == conf->copies);
1247	update_head_pos(i, r10_bio);
1248	d = r10_bio->devs[i].devnum;
1249
1250	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1251		set_bit(R10BIO_Uptodate, &r10_bio->state);
1252	else {
1253		atomic_add(r10_bio->sectors,
1254			   &conf->mirrors[d].rdev->corrected_errors);
1255		if (!test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
1256			md_error(r10_bio->mddev,
1257				 conf->mirrors[d].rdev);
1258	}
1259
1260	/* for reconstruct, we always reschedule after a read.
1261	 * for resync, only after all reads
1262	 */
1263	rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
1264	if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
1265	    atomic_dec_and_test(&r10_bio->remaining)) {
1266		/* we have read all the blocks,
1267		 * do the comparison in process context in raid10d
1268		 */
1269		reschedule_retry(r10_bio);
1270	}
1271}
1272
1273static void end_sync_write(struct bio *bio, int error)
1274{
1275	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1276	r10bio_t *r10_bio = bio->bi_private;
1277	mddev_t *mddev = r10_bio->mddev;
1278	conf_t *conf = mddev->private;
1279	int i,d;
1280
1281	for (i = 0; i < conf->copies; i++)
1282		if (r10_bio->devs[i].bio == bio)
1283			break;
1284	d = r10_bio->devs[i].devnum;
1285
1286	if (!uptodate)
1287		md_error(mddev, conf->mirrors[d].rdev);
1288
1289	update_head_pos(i, r10_bio);
1290
1291	rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1292	while (atomic_dec_and_test(&r10_bio->remaining)) {
1293		if (r10_bio->master_bio == NULL) {
1294			/* the primary of several recovery bios */
1295			sector_t s = r10_bio->sectors;
1296			put_buf(r10_bio);
1297			md_done_sync(mddev, s, 1);
1298			break;
1299		} else {
1300			r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio;
1301			put_buf(r10_bio);
1302			r10_bio = r10_bio2;
1303		}
1304	}
1305}
1306
1307/*
1308 * Note: sync and recover and handled very differently for raid10
1309 * This code is for resync.
1310 * For resync, we read through virtual addresses and read all blocks.
1311 * If there is any error, we schedule a write.  The lowest numbered
1312 * drive is authoritative.
1313 * However requests come for physical address, so we need to map.
1314 * For every physical address there are raid_disks/copies virtual addresses,
1315 * which is always are least one, but is not necessarly an integer.
1316 * This means that a physical address can span multiple chunks, so we may
1317 * have to submit multiple io requests for a single sync request.
1318 */
1319/*
1320 * We check if all blocks are in-sync and only write to blocks that
1321 * aren't in sync
1322 */
1323static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio)
1324{
1325	conf_t *conf = mddev->private;
1326	int i, first;
1327	struct bio *tbio, *fbio;
1328
1329	atomic_set(&r10_bio->remaining, 1);
1330
1331	/* find the first device with a block */
1332	for (i=0; i<conf->copies; i++)
1333		if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
1334			break;
1335
1336	if (i == conf->copies)
1337		goto done;
1338
1339	first = i;
1340	fbio = r10_bio->devs[i].bio;
1341
1342	/* now find blocks with errors */
1343	for (i=0 ; i < conf->copies ; i++) {
1344		int  j, d;
1345		int vcnt = r10_bio->sectors >> (PAGE_SHIFT-9);
1346
1347		tbio = r10_bio->devs[i].bio;
1348
1349		if (tbio->bi_end_io != end_sync_read)
1350			continue;
1351		if (i == first)
1352			continue;
1353		if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) {
1354			/* We know that the bi_io_vec layout is the same for
1355			 * both 'first' and 'i', so we just compare them.
1356			 * All vec entries are PAGE_SIZE;
1357			 */
1358			for (j = 0; j < vcnt; j++)
1359				if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
1360					   page_address(tbio->bi_io_vec[j].bv_page),
1361					   PAGE_SIZE))
1362					break;
1363			if (j == vcnt)
1364				continue;
1365			mddev->resync_mismatches += r10_bio->sectors;
1366		}
1367		if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
1368			/* Don't fix anything. */
1369			continue;
1370		/* Ok, we need to write this bio
1371		 * First we need to fixup bv_offset, bv_len and
1372		 * bi_vecs, as the read request might have corrupted these
1373		 */
1374		tbio->bi_vcnt = vcnt;
1375		tbio->bi_size = r10_bio->sectors << 9;
1376		tbio->bi_idx = 0;
1377		tbio->bi_phys_segments = 0;
1378		tbio->bi_flags &= ~(BIO_POOL_MASK - 1);
1379		tbio->bi_flags |= 1 << BIO_UPTODATE;
1380		tbio->bi_next = NULL;
1381		tbio->bi_rw = WRITE;
1382		tbio->bi_private = r10_bio;
1383		tbio->bi_sector = r10_bio->devs[i].addr;
1384
1385		for (j=0; j < vcnt ; j++) {
1386			tbio->bi_io_vec[j].bv_offset = 0;
1387			tbio->bi_io_vec[j].bv_len = PAGE_SIZE;
1388
1389			memcpy(page_address(tbio->bi_io_vec[j].bv_page),
1390			       page_address(fbio->bi_io_vec[j].bv_page),
1391			       PAGE_SIZE);
1392		}
1393		tbio->bi_end_io = end_sync_write;
1394
1395		d = r10_bio->devs[i].devnum;
1396		atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1397		atomic_inc(&r10_bio->remaining);
1398		md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9);
1399
1400		tbio->bi_sector += conf->mirrors[d].rdev->data_offset;
1401		tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
1402		generic_make_request(tbio);
1403	}
1404
1405done:
1406	if (atomic_dec_and_test(&r10_bio->remaining)) {
1407		md_done_sync(mddev, r10_bio->sectors, 1);
1408		put_buf(r10_bio);
1409	}
1410}
1411
1412/*
1413 * Now for the recovery code.
1414 * Recovery happens across physical sectors.
1415 * We recover all non-is_sync drives by finding the virtual address of
1416 * each, and then choose a working drive that also has that virt address.
1417 * There is a separate r10_bio for each non-in_sync drive.
1418 * Only the first two slots are in use. The first for reading,
1419 * The second for writing.
1420 *
1421 */
1422
1423static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
1424{
1425	conf_t *conf = mddev->private;
1426	int i, d;
1427	struct bio *bio, *wbio;
1428
1429
1430	/* move the pages across to the second bio
1431	 * and submit the write request
1432	 */
1433	bio = r10_bio->devs[0].bio;
1434	wbio = r10_bio->devs[1].bio;
1435	for (i=0; i < wbio->bi_vcnt; i++) {
1436		struct page *p = bio->bi_io_vec[i].bv_page;
1437		bio->bi_io_vec[i].bv_page = wbio->bi_io_vec[i].bv_page;
1438		wbio->bi_io_vec[i].bv_page = p;
1439	}
1440	d = r10_bio->devs[1].devnum;
1441
1442	atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1443	md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
1444	if (test_bit(R10BIO_Uptodate, &r10_bio->state))
1445		generic_make_request(wbio);
1446	else
1447		bio_endio(wbio, -EIO);
1448}
1449
1450
1451/*
1452 * Used by fix_read_error() to decay the per rdev read_errors.
1453 * We halve the read error count for every hour that has elapsed
1454 * since the last recorded read error.
1455 *
1456 */
1457static void check_decay_read_errors(mddev_t *mddev, mdk_rdev_t *rdev)
1458{
1459	struct timespec cur_time_mon;
1460	unsigned long hours_since_last;
1461	unsigned int read_errors = atomic_read(&rdev->read_errors);
1462
1463	ktime_get_ts(&cur_time_mon);
1464
1465	if (rdev->last_read_error.tv_sec == 0 &&
1466	    rdev->last_read_error.tv_nsec == 0) {
1467		/* first time we've seen a read error */
1468		rdev->last_read_error = cur_time_mon;
1469		return;
1470	}
1471
1472	hours_since_last = (cur_time_mon.tv_sec -
1473			    rdev->last_read_error.tv_sec) / 3600;
1474
1475	rdev->last_read_error = cur_time_mon;
1476
1477	/*
1478	 * if hours_since_last is > the number of bits in read_errors
1479	 * just set read errors to 0. We do this to avoid
1480	 * overflowing the shift of read_errors by hours_since_last.
1481	 */
1482	if (hours_since_last >= 8 * sizeof(read_errors))
1483		atomic_set(&rdev->read_errors, 0);
1484	else
1485		atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
1486}
1487
1488/*
1489 * This is a kernel thread which:
1490 *
1491 *	1.	Retries failed read operations on working mirrors.
1492 *	2.	Updates the raid superblock when problems are encountered.
1493 *	3.	Performs writes following reads for array synchronising.
1494 */
1495
/*
 * fix_read_error - try to repair a failed read from the other copies.
 *
 * @conf:    raid10 private data of @mddev
 * @mddev:   the array
 * @r10_bio: the request whose read from slot ->read_slot failed
 *
 * First the error is charged to the device the read was sent to: if
 * that device is already Faulty nothing more is done; otherwise its
 * read_errors count is decayed, incremented, and compared against
 * mddev->max_corr_read_errors - exceeding the limit fails the device
 * and returns.
 *
 * The failed range is then processed at most one page at a time.  For
 * each piece a copy that can be read is searched for, starting at the
 * failed slot and wrapping at conf->copies.  If no copy can be read,
 * the original device is failed and processing stops.  Otherwise the
 * good data (in conf->tmppage) is written to every In_sync copy
 * between the good slot and the failed slot, and each of those is then
 * read back; a device that fails either step is failed via md_error().
 *
 * sync_page_io() is treated as returning non-zero on success.
 */
1496static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1497{
1498	int sect = 0; /* Offset from r10_bio->sector */
1499	int sectors = r10_bio->sectors;
1500	mdk_rdev_t*rdev;
1501	int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
1502	int d = r10_bio->devs[r10_bio->read_slot].devnum;
1503
	/* Charge this error to the device the failed read was sent to */
1504	rcu_read_lock();
1505	rdev = rcu_dereference(conf->mirrors[d].rdev);
1506	if (rdev) { /* If rdev is not NULL */
1507		char b[BDEVNAME_SIZE];
1508		int cur_read_error_count = 0;
1509
1510		bdevname(rdev->bdev, b);
1511
1512		if (test_bit(Faulty, &rdev->flags)) {
1513			rcu_read_unlock();
1514			/* drive has already been failed, just ignore any
1515			   more fix_read_error() attempts */
1516			return;
1517		}
1518
1519		check_decay_read_errors(mddev, rdev);
1520		atomic_inc(&rdev->read_errors);
1521		cur_read_error_count = atomic_read(&rdev->read_errors);
1522		if (cur_read_error_count > max_read_errors) {
1523			rcu_read_unlock();
1524			printk(KERN_NOTICE
1525			       "md/raid10:%s: %s: Raid device exceeded "
1526			       "read_error threshold "
1527			       "[cur %d:max %d]\n",
1528			       mdname(mddev),
1529			       b, cur_read_error_count, max_read_errors);
1530			printk(KERN_NOTICE
1531			       "md/raid10:%s: %s: Failing raid "
1532			       "device\n", mdname(mddev), b);
1533			md_error(mddev, conf->mirrors[d].rdev);
1534			return;
1535		}
1536	}
1537	rcu_read_unlock();
1538
	/* Work through the failed range, at most one page per pass */
1539	while(sectors) {
1540		int s = sectors;
1541		int sl = r10_bio->read_slot;
1542		int success = 0;
1543		int start;
1544
1545		if (s > (PAGE_SIZE>>9))
1546			s = PAGE_SIZE >> 9;
1547
		/* Try each copy in turn, starting with the slot that
		 * failed, until one reads cleanly into conf->tmppage.
		 * nr_pending is raised before the RCU section is left for
		 * the I/O; raid10_remove_disk() refuses to detach a device
		 * while nr_pending is non-zero.
		 */
1548		rcu_read_lock();
1549		do {
1550			d = r10_bio->devs[sl].devnum;
1551			rdev = rcu_dereference(conf->mirrors[d].rdev);
1552			if (rdev &&
1553			    test_bit(In_sync, &rdev->flags)) {
1554				atomic_inc(&rdev->nr_pending);
1555				rcu_read_unlock();
1556				success = sync_page_io(rdev->bdev,
1557						       r10_bio->devs[sl].addr +
1558						       sect + rdev->data_offset,
1559						       s<<9,
1560						       conf->tmppage, READ);
1561				rdev_dec_pending(rdev, mddev);
1562				rcu_read_lock();
1563				if (success)
1564					break;
1565			}
1566			sl++;
1567			if (sl == conf->copies)
1568				sl = 0;
1569		} while (!success && sl != r10_bio->read_slot);
1570		rcu_read_unlock();
1571
1572		if (!success) {
1573			/* Cannot read from anywhere -- bye bye array */
1574			int dn = r10_bio->devs[r10_bio->read_slot].devnum;
1575			md_error(mddev, conf->mirrors[dn].rdev);
1576			break;
1577		}
1578
		/* sl is now the slot that supplied the good data */
1579		start = sl;
1580		/* write it back and re-read */
		/* Pass 1: step backwards from the good slot down to and
		 * including read_slot, writing the good data to every
		 * In_sync copy on the way.
		 */
1581		rcu_read_lock();
1582		while (sl != r10_bio->read_slot) {
1583			char b[BDEVNAME_SIZE];
1584
1585			if (sl==0)
1586				sl = conf->copies;
1587			sl--;
1588			d = r10_bio->devs[sl].devnum;
1589			rdev = rcu_dereference(conf->mirrors[d].rdev);
1590			if (rdev &&
1591			    test_bit(In_sync, &rdev->flags)) {
1592				atomic_inc(&rdev->nr_pending);
1593				rcu_read_unlock();
1594				atomic_add(s, &rdev->corrected_errors);
1595				if (sync_page_io(rdev->bdev,
1596						 r10_bio->devs[sl].addr +
1597						 sect + rdev->data_offset,
1598						 s<<9, conf->tmppage, WRITE)
1599				    == 0) {
1600					/* Well, this device is dead */
1601					printk(KERN_NOTICE
1602					       "md/raid10:%s: read correction "
1603					       "write failed"
1604					       " (%d sectors at %llu on %s)\n",
1605					       mdname(mddev), s,
1606					       (unsigned long long)(sect+
1607					       rdev->data_offset),
1608					       bdevname(rdev->bdev, b));
1609					printk(KERN_NOTICE "md/raid10:%s: %s: failing "
1610					       "drive\n",
1611					       mdname(mddev),
1612					       bdevname(rdev->bdev, b));
1613					md_error(mddev, rdev);
1614				}
1615				rdev_dec_pending(rdev, mddev);
1616				rcu_read_lock();
1617			}
1618		}
		/* Pass 2: the same walk again, this time reading each copy
		 * back to check that the rewrite can be read.
		 */
1619		sl = start;
1620		while (sl != r10_bio->read_slot) {
1621
1622			if (sl==0)
1623				sl = conf->copies;
1624			sl--;
1625			d = r10_bio->devs[sl].devnum;
1626			rdev = rcu_dereference(conf->mirrors[d].rdev);
1627			if (rdev &&
1628			    test_bit(In_sync, &rdev->flags)) {
1629				char b[BDEVNAME_SIZE];
1630				atomic_inc(&rdev->nr_pending);
1631				rcu_read_unlock();
1632				if (sync_page_io(rdev->bdev,
1633						 r10_bio->devs[sl].addr +
1634						 sect + rdev->data_offset,
1635						 s<<9, conf->tmppage,
1636						 READ) == 0) {
1637					/* Well, this device is dead */
1638					printk(KERN_NOTICE
1639					       "md/raid10:%s: unable to read back "
1640					       "corrected sectors"
1641					       " (%d sectors at %llu on %s)\n",
1642					       mdname(mddev), s,
1643					       (unsigned long long)(sect+
1644						    rdev->data_offset),
1645					       bdevname(rdev->bdev, b));
1646					printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n",
1647					       mdname(mddev),
1648					       bdevname(rdev->bdev, b));
1649
1650					md_error(mddev, rdev);
1651				} else {
1652					printk(KERN_INFO
1653					       "md/raid10:%s: read error corrected"
1654					       " (%d sectors at %llu on %s)\n",
1655					       mdname(mddev), s,
1656					       (unsigned long long)(sect+
1657					            rdev->data_offset),
1658					       bdevname(rdev->bdev, b));
1659				}
1660
1661				rdev_dec_pending(rdev, mddev);
1662				rcu_read_lock();
1663			}
1664		}
1665		rcu_read_unlock();
1666
		/* advance to the next piece of the failed range */
1667		sectors -= s;
1668		sect += s;
1669	}
1670}
1671
/*
 * raid10d - per-array worker routine.
 *
 * After calling md_check_recovery(), it repeatedly flushes pending
 * writes and takes one r10_bio at a time off conf->retry_list until
 * the list is empty.  Each r10_bio is dispatched on its state bits:
 *
 *   R10BIO_IsSync     -> sync_request_write()
 *   R10BIO_IsRecover  -> recovery_request_write()
 *   otherwise         -> a normal read failed: attempt a repair with
 *                        fix_read_error() (skipped when the array is
 *                        read-only), then either redirect the read to
 *                        the mirror chosen by read_balance() or fail
 *                        the request.
 *
 * If any pass asked for it, unplug_slaves() is called before returning.
 */
1672static void raid10d(mddev_t *mddev)
1673{
1674	r10bio_t *r10_bio;
1675	struct bio *bio;
1676	unsigned long flags;
1677	conf_t *conf = mddev->private;
1678	struct list_head *head = &conf->retry_list;
1679	int unplug=0;
1680	mdk_rdev_t *rdev;
1681
1682	md_check_recovery(mddev);
1683
1684	for (;;) {
1685		char b[BDEVNAME_SIZE];
1686
1687		unplug += flush_pending_writes(conf);
1688
		/* Take the entry at the tail of the retry list (head->prev)
		 * under device_lock; stop when the list is empty.
		 */
1689		spin_lock_irqsave(&conf->device_lock, flags);
1690		if (list_empty(head)) {
1691			spin_unlock_irqrestore(&conf->device_lock, flags);
1692			break;
1693		}
1694		r10_bio = list_entry(head->prev, r10bio_t, retry_list);
1695		list_del(head->prev);
1696		conf->nr_queued--;
1697		spin_unlock_irqrestore(&conf->device_lock, flags);
1698
1699		mddev = r10_bio->mddev;
1700		conf = mddev->private;
1701		if (test_bit(R10BIO_IsSync, &r10_bio->state)) {
1702			sync_request_write(mddev, r10_bio);
1703			unplug = 1;
1704		} else 	if (test_bit(R10BIO_IsRecover, &r10_bio->state)) {
1705			recovery_request_write(mddev, r10_bio);
1706			unplug = 1;
1707		} else {
1708			int mirror;
1709			/* we got a read error. Maybe the drive is bad.  Maybe just
1710			 * the block and we can fix it.
1711			 * We freeze all other IO, and try reading the block from
1712			 * other devices.  When we find one, we re-write
1713			 * and check it that fixes the read error.
1714			 * This is all done synchronously while the array is
1715			 * frozen.
1716			 */
1717			if (mddev->ro == 0) {
1718				freeze_array(conf);
1719				fix_read_error(conf, mddev, r10_bio);
1720				unfreeze_array(conf);
1721			}
1722
			/* Detach the failed bio from its slot.  On a read-only
			 * array the slot is marked IO_BLOCKED, otherwise it is
			 * cleared, before read_balance() picks another mirror.
			 */
1723			bio = r10_bio->devs[r10_bio->read_slot].bio;
1724			r10_bio->devs[r10_bio->read_slot].bio =
1725				mddev->ro ? IO_BLOCKED : NULL;
1726			mirror = read_balance(conf, r10_bio);
1727			if (mirror == -1) {
				/* no usable mirror left: complete the request */
1728				printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
1729				       " read error for block %llu\n",
1730				       mdname(mddev),
1731				       bdevname(bio->bi_bdev,b),
1732				       (unsigned long long)r10_bio->sector);
1733				raid_end_bio_io(r10_bio);
1734				bio_put(bio);
1735			} else {
				/* Re-issue the read to the chosen mirror using a
				 * fresh clone of the master bio.
				 * NOTE(review): ->read_slot is re-read below, so
				 * read_balance() presumably updated it - confirm.
				 */
1736				const unsigned long do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
1737				bio_put(bio);
1738				rdev = conf->mirrors[mirror].rdev;
1739				if (printk_ratelimit())
1740					printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to"
1741					       " another mirror\n",
1742					       mdname(mddev),
1743					       bdevname(rdev->bdev,b),
1744					       (unsigned long long)r10_bio->sector);
1745				bio = bio_clone(r10_bio->master_bio, GFP_NOIO);
1746				r10_bio->devs[r10_bio->read_slot].bio = bio;
1747				bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr
1748					+ rdev->data_offset;
1749				bio->bi_bdev = rdev->bdev;
1750				bio->bi_rw = READ | do_sync;
1751				bio->bi_private = r10_bio;
1752				bio->bi_end_io = raid10_end_read_request;
1753				unplug = 1;
1754				generic_make_request(bio);
1755			}
1756		}
1757		cond_resched();
1758	}
1759	if (unplug)
1760		unplug_slaves(mddev);
1761}
1762
1763
1764static int init_resync(conf_t *conf)
1765{
1766	int buffs;
1767
1768	buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
1769	BUG_ON(conf->r10buf_pool);
1770	conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
1771	if (!conf->r10buf_pool)
1772		return -ENOMEM;
1773	conf->next_resync = 0;
1774	return 0;
1775}
1776
1777/*
1778 * perform a "sync" on one "block"
1779 *
1780 * We need to make sure that no normal I/O request - particularly write
1781 * requests - conflict with active sync requests.
1782 *
1783 * This is achieved by tracking pending requests and a 'barrier' concept
1784 * that can be installed to exclude normal IO requests.
1785 *
1786 * Resync and recovery are handled very differently.
1787 * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery.
1788 *
1789 * For resync, we iterate over virtual addresses, read all copies,
1790 * and update if there are differences.  If only one copy is live,
1791 * skip it.
1792 * For recovery, we iterate over physical addresses, read a good
1793 * value for each non-in_sync drive, and over-write.
1794 *
1795 * So, for recovery we may have several outstanding complex requests for a
1796 * given address, one for each out-of-sync device.  We model this by allocating
1797 * a number of r10_bio structures, one for each out-of-sync device.
1798 * As we setup these structures, we collect all bio's together into a list
1799 * which we then process collectively to add pages, and then process again
1800 * to pass to generic_make_request.
1801 *
1802 * The r10_bio structures are linked using a borrowed master_bio pointer.
1803 * This link is counted in ->remaining.  When the r10_bio that points to NULL
1804 * has its remaining count decremented to 0, the whole complex operation
1805 * is complete.
1806 *
1807 */
1808
1809static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
1810{
1811	conf_t *conf = mddev->private;
1812	r10bio_t *r10_bio;
1813	struct bio *biolist = NULL, *bio;
1814	sector_t max_sector, nr_sectors;
1815	int disk;
1816	int i;
1817	int max_sync;
1818	int sync_blocks;
1819
1820	sector_t sectors_skipped = 0;
1821	int chunks_skipped = 0;
1822
1823	if (!conf->r10buf_pool)
1824		if (init_resync(conf))
1825			return 0;
1826
1827 skipped:
1828	max_sector = mddev->dev_sectors;
1829	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
1830		max_sector = mddev->resync_max_sectors;
1831	if (sector_nr >= max_sector) {
1832		/* If we aborted, we need to abort the
1833		 * sync on the 'current' bitmap chucks (there can
1834		 * be several when recovering multiple devices).
1835		 * as we may have started syncing it but not finished.
1836		 * We can find the current address in
1837		 * mddev->curr_resync, but for recovery,
1838		 * we need to convert that to several
1839		 * virtual addresses.
1840		 */
1841		if (mddev->curr_resync < max_sector) { /* aborted */
1842			if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
1843				bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
1844						&sync_blocks, 1);
1845			else for (i=0; i<conf->raid_disks; i++) {
1846				sector_t sect =
1847					raid10_find_virt(conf, mddev->curr_resync, i);
1848				bitmap_end_sync(mddev->bitmap, sect,
1849						&sync_blocks, 1);
1850			}
1851		} else /* completed sync */
1852			conf->fullsync = 0;
1853
1854		bitmap_close_sync(mddev->bitmap);
1855		close_sync(conf);
1856		*skipped = 1;
1857		return sectors_skipped;
1858	}
1859	if (chunks_skipped >= conf->raid_disks) {
1860		/* if there has been nothing to do on any drive,
1861		 * then there is nothing to do at all..
1862		 */
1863		*skipped = 1;
1864		return (max_sector - sector_nr) + sectors_skipped;
1865	}
1866
1867	if (max_sector > mddev->resync_max)
1868		max_sector = mddev->resync_max; /* Don't do IO beyond here */
1869
1870	/* make sure whole request will fit in a chunk - if chunks
1871	 * are meaningful
1872	 */
1873	if (conf->near_copies < conf->raid_disks &&
1874	    max_sector > (sector_nr | conf->chunk_mask))
1875		max_sector = (sector_nr | conf->chunk_mask) + 1;
1876	/*
1877	 * If there is non-resync activity waiting for us then
1878	 * put in a delay to throttle resync.
1879	 */
1880	if (!go_faster && conf->nr_waiting)
1881		msleep_interruptible(1000);
1882
1883	/* Again, very different code for resync and recovery.
1884	 * Both must result in an r10bio with a list of bios that
1885	 * have bi_end_io, bi_sector, bi_bdev set,
1886	 * and bi_private set to the r10bio.
1887	 * For recovery, we may actually create several r10bios
1888	 * with 2 bios in each, that correspond to the bios in the main one.
1889	 * In this case, the subordinate r10bios link back through a
1890	 * borrowed master_bio pointer, and the counter in the master
1891	 * includes a ref from each subordinate.
1892	 */
1893	/* First, we decide what to do and set ->bi_end_io
1894	 * To end_sync_read if we want to read, and
1895	 * end_sync_write if we will want to write.
1896	 */
1897
1898	max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
1899	if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
1900		/* recovery... the complicated one */
1901		int j, k;
1902		r10_bio = NULL;
1903
1904		for (i=0 ; i<conf->raid_disks; i++)
1905			if (conf->mirrors[i].rdev &&
1906			    !test_bit(In_sync, &conf->mirrors[i].rdev->flags)) {
1907				int still_degraded = 0;
1908				/* want to reconstruct this device */
1909				r10bio_t *rb2 = r10_bio;
1910				sector_t sect = raid10_find_virt(conf, sector_nr, i);
1911				int must_sync;
1912				/* Unless we are doing a full sync, we only need
1913				 * to recover the block if it is set in the bitmap
1914				 */
1915				must_sync = bitmap_start_sync(mddev->bitmap, sect,
1916							      &sync_blocks, 1);
1917				if (sync_blocks < max_sync)
1918					max_sync = sync_blocks;
1919				if (!must_sync &&
1920				    !conf->fullsync) {
1921					/* yep, skip the sync_blocks here, but don't assume
1922					 * that there will never be anything to do here
1923					 */
1924					chunks_skipped = -1;
1925					continue;
1926				}
1927
1928				r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
1929				raise_barrier(conf, rb2 != NULL);
1930				atomic_set(&r10_bio->remaining, 0);
1931
1932				r10_bio->master_bio = (struct bio*)rb2;
1933				if (rb2)
1934					atomic_inc(&rb2->remaining);
1935				r10_bio->mddev = mddev;
1936				set_bit(R10BIO_IsRecover, &r10_bio->state);
1937				r10_bio->sector = sect;
1938
1939				raid10_find_phys(conf, r10_bio);
1940
1941				/* Need to check if the array will still be
1942				 * degraded
1943				 */
1944				for (j=0; j<conf->raid_disks; j++)
1945					if (conf->mirrors[j].rdev == NULL ||
1946					    test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
1947						still_degraded = 1;
1948						break;
1949					}
1950
1951				must_sync = bitmap_start_sync(mddev->bitmap, sect,
1952							      &sync_blocks, still_degraded);
1953
1954				for (j=0; j<conf->copies;j++) {
1955					int d = r10_bio->devs[j].devnum;
1956					if (conf->mirrors[d].rdev &&
1957					    test_bit(In_sync, &conf->mirrors[d].rdev->flags)) {
1958						/* This is where we read from */
1959						bio = r10_bio->devs[0].bio;
1960						bio->bi_next = biolist;
1961						biolist = bio;
1962						bio->bi_private = r10_bio;
1963						bio->bi_end_io = end_sync_read;
1964						bio->bi_rw = READ;
1965						bio->bi_sector = r10_bio->devs[j].addr +
1966							conf->mirrors[d].rdev->data_offset;
1967						bio->bi_bdev = conf->mirrors[d].rdev->bdev;
1968						atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1969						atomic_inc(&r10_bio->remaining);
1970						/* and we write to 'i' */
1971
1972						for (k=0; k<conf->copies; k++)
1973							if (r10_bio->devs[k].devnum == i)
1974								break;
1975						BUG_ON(k == conf->copies);
1976						bio = r10_bio->devs[1].bio;
1977						bio->bi_next = biolist;
1978						biolist = bio;
1979						bio->bi_private = r10_bio;
1980						bio->bi_end_io = end_sync_write;
1981						bio->bi_rw = WRITE;
1982						bio->bi_sector = r10_bio->devs[k].addr +
1983							conf->mirrors[i].rdev->data_offset;
1984						bio->bi_bdev = conf->mirrors[i].rdev->bdev;
1985
1986						r10_bio->devs[0].devnum = d;
1987						r10_bio->devs[1].devnum = i;
1988
1989						break;
1990					}
1991				}
1992				if (j == conf->copies) {
1993					/* Cannot recover, so abort the recovery */
1994					put_buf(r10_bio);
1995					if (rb2)
1996						atomic_dec(&rb2->remaining);
1997					r10_bio = rb2;
1998					if (!test_and_set_bit(MD_RECOVERY_INTR,
1999							      &mddev->recovery))
2000						printk(KERN_INFO "md/raid10:%s: insufficient "
2001						       "working devices for recovery.\n",
2002						       mdname(mddev));
2003					break;
2004				}
2005			}
2006		if (biolist == NULL) {
2007			while (r10_bio) {
2008				r10bio_t *rb2 = r10_bio;
2009				r10_bio = (r10bio_t*) rb2->master_bio;
2010				rb2->master_bio = NULL;
2011				put_buf(rb2);
2012			}
2013			goto giveup;
2014		}
2015	} else {
2016		/* resync. Schedule a read for every block at this virt offset */
2017		int count = 0;
2018
2019		bitmap_cond_end_sync(mddev->bitmap, sector_nr);
2020
2021		if (!bitmap_start_sync(mddev->bitmap, sector_nr,
2022				       &sync_blocks, mddev->degraded) &&
2023		    !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
2024			/* We can skip this block */
2025			*skipped = 1;
2026			return sync_blocks + sectors_skipped;
2027		}
2028		if (sync_blocks < max_sync)
2029			max_sync = sync_blocks;
2030		r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
2031
2032		r10_bio->mddev = mddev;
2033		atomic_set(&r10_bio->remaining, 0);
2034		raise_barrier(conf, 0);
2035		conf->next_resync = sector_nr;
2036
2037		r10_bio->master_bio = NULL;
2038		r10_bio->sector = sector_nr;
2039		set_bit(R10BIO_IsSync, &r10_bio->state);
2040		raid10_find_phys(conf, r10_bio);
2041		r10_bio->sectors = (sector_nr | conf->chunk_mask) - sector_nr +1;
2042
2043		for (i=0; i<conf->copies; i++) {
2044			int d = r10_bio->devs[i].devnum;
2045			bio = r10_bio->devs[i].bio;
2046			bio->bi_end_io = NULL;
2047			clear_bit(BIO_UPTODATE, &bio->bi_flags);
2048			if (conf->mirrors[d].rdev == NULL ||
2049			    test_bit(Faulty, &conf->mirrors[d].rdev->flags))
2050				continue;
2051			atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2052			atomic_inc(&r10_bio->remaining);
2053			bio->bi_next = biolist;
2054			biolist = bio;
2055			bio->bi_private = r10_bio;
2056			bio->bi_end_io = end_sync_read;
2057			bio->bi_rw = READ;
2058			bio->bi_sector = r10_bio->devs[i].addr +
2059				conf->mirrors[d].rdev->data_offset;
2060			bio->bi_bdev = conf->mirrors[d].rdev->bdev;
2061			count++;
2062		}
2063
2064		if (count < 2) {
2065			for (i=0; i<conf->copies; i++) {
2066				int d = r10_bio->devs[i].devnum;
2067				if (r10_bio->devs[i].bio->bi_end_io)
2068					rdev_dec_pending(conf->mirrors[d].rdev, mddev);
2069			}
2070			put_buf(r10_bio);
2071			biolist = NULL;
2072			goto giveup;
2073		}
2074	}
2075
2076	for (bio = biolist; bio ; bio=bio->bi_next) {
2077
2078		bio->bi_flags &= ~(BIO_POOL_MASK - 1);
2079		if (bio->bi_end_io)
2080			bio->bi_flags |= 1 << BIO_UPTODATE;
2081		bio->bi_vcnt = 0;
2082		bio->bi_idx = 0;
2083		bio->bi_phys_segments = 0;
2084		bio->bi_size = 0;
2085	}
2086
2087	nr_sectors = 0;
2088	if (sector_nr + max_sync < max_sector)
2089		max_sector = sector_nr + max_sync;
2090	do {
2091		struct page *page;
2092		int len = PAGE_SIZE;
2093		disk = 0;
2094		if (sector_nr + (len>>9) > max_sector)
2095			len = (max_sector - sector_nr) << 9;
2096		if (len == 0)
2097			break;
2098		for (bio= biolist ; bio ; bio=bio->bi_next) {
2099			page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
2100			if (bio_add_page(bio, page, len, 0) == 0) {
2101				/* stop here */
2102				struct bio *bio2;
2103				bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
2104				for (bio2 = biolist; bio2 && bio2 != bio; bio2 = bio2->bi_next) {
2105					/* remove last page from this bio */
2106					bio2->bi_vcnt--;
2107					bio2->bi_size -= len;
2108					bio2->bi_flags &= ~(1<< BIO_SEG_VALID);
2109				}
2110				goto bio_full;
2111			}
2112			disk = i;
2113		}
2114		nr_sectors += len>>9;
2115		sector_nr += len>>9;
2116	} while (biolist->bi_vcnt < RESYNC_PAGES);
2117 bio_full:
2118	r10_bio->sectors = nr_sectors;
2119
2120	while (biolist) {
2121		bio = biolist;
2122		biolist = biolist->bi_next;
2123
2124		bio->bi_next = NULL;
2125		r10_bio = bio->bi_private;
2126		r10_bio->sectors = nr_sectors;
2127
2128		if (bio->bi_end_io == end_sync_read) {
2129			md_sync_acct(bio->bi_bdev, nr_sectors);
2130			generic_make_request(bio);
2131		}
2132	}
2133
2134	if (sectors_skipped)
2135		/* pretend they weren't skipped, it makes
2136		 * no important difference in this case
2137		 */
2138		md_done_sync(mddev, sectors_skipped, 1);
2139
2140	return sectors_skipped + nr_sectors;
2141 giveup:
2142	/* There is nowhere to write, so all non-sync
2143	 * drives must be failed, so try the next chunk...
2144	 */
2145	if (sector_nr + max_sync < max_sector)
2146		max_sector = sector_nr + max_sync;
2147
2148	sectors_skipped += (max_sector - sector_nr);
2149	chunks_skipped ++;
2150	sector_nr = max_sector;
2151	goto skipped;
2152}
2153
2154static sector_t
2155raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks)
2156{
2157	sector_t size;
2158	conf_t *conf = mddev->private;
2159
2160	if (!raid_disks)
2161		raid_disks = conf->raid_disks;
2162	if (!sectors)
2163		sectors = conf->dev_sectors;
2164
2165	size = sectors >> conf->chunk_shift;
2166	sector_div(size, conf->far_copies);
2167	size = size * raid_disks;
2168	sector_div(size, conf->near_copies);
2169
2170	return size << conf->chunk_shift;
2171}
2172
2173
2174static conf_t *setup_conf(mddev_t *mddev)
2175{
2176	conf_t *conf = NULL;
2177	int nc, fc, fo;
2178	sector_t stride, size;
2179	int err = -EINVAL;
2180
2181	if (mddev->new_chunk_sectors < (PAGE_SIZE >> 9) ||
2182	    !is_power_of_2(mddev->new_chunk_sectors)) {
2183		printk(KERN_ERR "md/raid10:%s: chunk size must be "
2184		       "at least PAGE_SIZE(%ld) and be a power of 2.\n",
2185		       mdname(mddev), PAGE_SIZE);
2186		goto out;
2187	}
2188
2189	nc = mddev->new_layout & 255;
2190	fc = (mddev->new_layout >> 8) & 255;
2191	fo = mddev->new_layout & (1<<16);
2192
2193	if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks ||
2194	    (mddev->new_layout >> 17)) {
2195		printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
2196		       mdname(mddev), mddev->new_layout);
2197		goto out;
2198	}
2199
2200	err = -ENOMEM;
2201	conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
2202	if (!conf)
2203		goto out;
2204
2205	conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
2206				GFP_KERNEL);
2207	if (!conf->mirrors)
2208		goto out;
2209
2210	conf->tmppage = alloc_page(GFP_KERNEL);
2211	if (!conf->tmppage)
2212		goto out;
2213
2214
2215	conf->raid_disks = mddev->raid_disks;
2216	conf->near_copies = nc;
2217	conf->far_copies = fc;
2218	conf->copies = nc*fc;
2219	conf->far_offset = fo;
2220	conf->chunk_mask = mddev->new_chunk_sectors - 1;
2221	conf->chunk_shift = ffz(~mddev->new_chunk_sectors);
2222
2223	conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
2224					   r10bio_pool_free, conf);
2225	if (!conf->r10bio_pool)
2226		goto out;
2227
2228	size = mddev->dev_sectors >> conf->chunk_shift;
2229	sector_div(size, fc);
2230	size = size * conf->raid_disks;
2231	sector_div(size, nc);
2232	/* 'size' is now the number of chunks in the array */
2233	/* calculate "used chunks per device" in 'stride' */
2234	stride = size * conf->copies;
2235
2236	/* We need to round up when dividing by raid_disks to
2237	 * get the stride size.
2238	 */
2239	stride += conf->raid_disks - 1;
2240	sector_div(stride, conf->raid_disks);
2241
2242	conf->dev_sectors = stride << conf->chunk_shift;
2243
2244	if (fo)
2245		stride = 1;
2246	else
2247		sector_div(stride, fc);
2248	conf->stride = stride << conf->chunk_shift;
2249
2250
2251	spin_lock_init(&conf->device_lock);
2252	INIT_LIST_HEAD(&conf->retry_list);
2253
2254	spin_lock_init(&conf->resync_lock);
2255	init_waitqueue_head(&conf->wait_barrier);
2256
2257	conf->thread = md_register_thread(raid10d, mddev, NULL);
2258	if (!conf->thread)
2259		goto out;
2260
2261	conf->mddev = mddev;
2262	return conf;
2263
2264 out:
2265	printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n",
2266	       mdname(mddev));
2267	if (conf) {
2268		if (conf->r10bio_pool)
2269			mempool_destroy(conf->r10bio_pool);
2270		kfree(conf->mirrors);
2271		safe_put_page(conf->tmppage);
2272		kfree(conf);
2273	}
2274	return ERR_PTR(err);
2275}
2276
2277static int run(mddev_t *mddev)
2278{
2279	conf_t *conf;
2280	int i, disk_idx, chunk_size;
2281	mirror_info_t *disk;
2282	mdk_rdev_t *rdev;
2283	sector_t size;
2284
2285	/*
2286	 * copy the already verified devices into our private RAID10
2287	 * bookkeeping area. [whatever we allocate in run(),
2288	 * should be freed in stop()]
2289	 */
2290
2291	if (mddev->private == NULL) {
2292		conf = setup_conf(mddev);
2293		if (IS_ERR(conf))
2294			return PTR_ERR(conf);
2295		mddev->private = conf;
2296	}
2297	conf = mddev->private;
2298	if (!conf)
2299		goto out;
2300
2301	mddev->queue->queue_lock = &conf->device_lock;
2302
2303	mddev->thread = conf->thread;
2304	conf->thread = NULL;
2305
2306	chunk_size = mddev->chunk_sectors << 9;
2307	blk_queue_io_min(mddev->queue, chunk_size);
2308	if (conf->raid_disks % conf->near_copies)
2309		blk_queue_io_opt(mddev->queue, chunk_size * conf->raid_disks);
2310	else
2311		blk_queue_io_opt(mddev->queue, chunk_size *
2312				 (conf->raid_disks / conf->near_copies));
2313
2314	list_for_each_entry(rdev, &mddev->disks, same_set) {
2315		disk_idx = rdev->raid_disk;
2316		if (disk_idx >= conf->raid_disks
2317		    || disk_idx < 0)
2318			continue;
2319		disk = conf->mirrors + disk_idx;
2320
2321		disk->rdev = rdev;
2322		disk_stack_limits(mddev->gendisk, rdev->bdev,
2323				  rdev->data_offset << 9);
2324		/* as we don't honour merge_bvec_fn, we must never risk
2325		 * violating it, so limit max_segments to 1 lying
2326		 * within a single page.
2327		 */
2328		if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
2329			blk_queue_max_segments(mddev->queue, 1);
2330			blk_queue_segment_boundary(mddev->queue,
2331						   PAGE_CACHE_SIZE - 1);
2332		}
2333
2334		disk->head_position = 0;
2335	}
2336	/* need to check that every block has at least one working mirror */
2337	if (!enough(conf)) {
2338		printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",
2339		       mdname(mddev));
2340		goto out_free_conf;
2341	}
2342
2343	mddev->degraded = 0;
2344	for (i = 0; i < conf->raid_disks; i++) {
2345
2346		disk = conf->mirrors + i;
2347
2348		if (!disk->rdev ||
2349		    !test_bit(In_sync, &disk->rdev->flags)) {
2350			disk->head_position = 0;
2351			mddev->degraded++;
2352			if (disk->rdev)
2353				conf->fullsync = 1;
2354		}
2355	}
2356
2357	if (mddev->recovery_cp != MaxSector)
2358		printk(KERN_NOTICE "md/raid10:%s: not clean"
2359		       " -- starting background reconstruction\n",
2360		       mdname(mddev));
2361	printk(KERN_INFO
2362		"md/raid10:%s: active with %d out of %d devices\n",
2363		mdname(mddev), conf->raid_disks - mddev->degraded,
2364		conf->raid_disks);
2365	/*
2366	 * Ok, everything is just fine now
2367	 */
2368	mddev->dev_sectors = conf->dev_sectors;
2369	size = raid10_size(mddev, 0, 0);
2370	md_set_array_sectors(mddev, size);
2371	mddev->resync_max_sectors = size;
2372
2373	mddev->queue->unplug_fn = raid10_unplug;
2374	mddev->queue->backing_dev_info.congested_fn = raid10_congested;
2375	mddev->queue->backing_dev_info.congested_data = mddev;
2376
2377	/* Calculate max read-ahead size.
2378	 * We need to readahead at least twice a whole stripe....
2379	 * maybe...
2380	 */
2381	{
2382		int stripe = conf->raid_disks *
2383			((mddev->chunk_sectors << 9) / PAGE_SIZE);
2384		stripe /= conf->near_copies;
2385		if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
2386			mddev->queue->backing_dev_info.ra_pages = 2* stripe;
2387	}
2388
2389	if (conf->near_copies < conf->raid_disks)
2390		blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
2391	md_integrity_register(mddev);
2392	return 0;
2393
2394out_free_conf:
2395	md_unregister_thread(mddev->thread);
2396	if (conf->r10bio_pool)
2397		mempool_destroy(conf->r10bio_pool);
2398	safe_put_page(conf->tmppage);
2399	kfree(conf->mirrors);
2400	kfree(conf);
2401	mddev->private = NULL;
2402out:
2403	return -EIO;
2404}
2405
2406static int stop(mddev_t *mddev)
2407{
2408	conf_t *conf = mddev->private;
2409
2410	raise_barrier(conf, 0);
2411	lower_barrier(conf);
2412
2413	md_unregister_thread(mddev->thread);
2414	mddev->thread = NULL;
2415	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
2416	if (conf->r10bio_pool)
2417		mempool_destroy(conf->r10bio_pool);
2418	kfree(conf->mirrors);
2419	kfree(conf);
2420	mddev->private = NULL;
2421	return 0;
2422}
2423
2424static void raid10_quiesce(mddev_t *mddev, int state)
2425{
2426	conf_t *conf = mddev->private;
2427
2428	switch(state) {
2429	case 1:
2430		raise_barrier(conf, 0);
2431		break;
2432	case 0:
2433		lower_barrier(conf);
2434		break;
2435	}
2436}
2437
2438static void *raid10_takeover_raid0(mddev_t *mddev)
2439{
2440	mdk_rdev_t *rdev;
2441	conf_t *conf;
2442
2443	if (mddev->degraded > 0) {
2444		printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n",
2445		       mdname(mddev));
2446		return ERR_PTR(-EINVAL);
2447	}
2448
2449	/* Set new parameters */
2450	mddev->new_level = 10;
2451	/* new layout: far_copies = 1, near_copies = 2 */
2452	mddev->new_layout = (1<<8) + 2;
2453	mddev->new_chunk_sectors = mddev->chunk_sectors;
2454	mddev->delta_disks = mddev->raid_disks;
2455	mddev->raid_disks *= 2;
2456	/* make sure it will be not marked as dirty */
2457	mddev->recovery_cp = MaxSector;
2458
2459	conf = setup_conf(mddev);
2460	if (!IS_ERR(conf))
2461		list_for_each_entry(rdev, &mddev->disks, same_set)
2462			if (rdev->raid_disk >= 0)
2463				rdev->new_raid_disk = rdev->raid_disk * 2;
2464
2465	return conf;
2466}
2467
2468static void *raid10_takeover(mddev_t *mddev)
2469{
2470	struct raid0_private_data *raid0_priv;
2471
2472	/* raid10 can take over:
2473	 *  raid0 - providing it has only two drives
2474	 */
2475	if (mddev->level == 0) {
2476		/* for raid0 takeover only one zone is supported */
2477		raid0_priv = mddev->private;
2478		if (raid0_priv->nr_strip_zones > 1) {
2479			printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0"
2480			       " with more than one zone.\n",
2481			       mdname(mddev));
2482			return ERR_PTR(-EINVAL);
2483		}
2484		return raid10_takeover_raid0(mddev);
2485	}
2486	return ERR_PTR(-EINVAL);
2487}
2488
2489static struct mdk_personality raid10_personality =
2490{
2491	.name		= "raid10",
2492	.level		= 10,
2493	.owner		= THIS_MODULE,
2494	.make_request	= make_request,
2495	.run		= run,
2496	.stop		= stop,
2497	.status		= status,
2498	.error_handler	= error,
2499	.hot_add_disk	= raid10_add_disk,
2500	.hot_remove_disk= raid10_remove_disk,
2501	.spare_active	= raid10_spare_active,
2502	.sync_request	= sync_request,
2503	.quiesce	= raid10_quiesce,
2504	.size		= raid10_size,
2505	.takeover	= raid10_takeover,
2506};
2507
2508static int __init raid_init(void)
2509{
2510	return register_md_personality(&raid10_personality);
2511}
2512
2513static void raid_exit(void)
2514{
2515	unregister_md_personality(&raid10_personality);
2516}
2517
2518module_init(raid_init);
2519module_exit(raid_exit);
2520MODULE_LICENSE("GPL");
2521MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
2522MODULE_ALIAS("md-personality-9"); /* RAID10 */
2523MODULE_ALIAS("md-raid10");
2524MODULE_ALIAS("md-level-10");
2525