1/*
2 * raid10.c : Multiple Devices driver for Linux
3 *
4 * Copyright (C) 2000-2004 Neil Brown
5 *
6 * RAID-10 support for md.
7 *
8 * Based on code in raid1.c.  See raid1.c for further copyright information.
9 *
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2, or (at your option)
14 * any later version.
15 *
16 * You should have received a copy of the GNU General Public License
17 * (for example /usr/src/linux/COPYING); if not, write to the Free
18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */
20
21#include "dm-bio-list.h"
22#include <linux/raid/raid10.h>
23#include <linux/raid/bitmap.h>
24
25/*
26 * RAID10 provides a combination of RAID0 and RAID1 functionality.
27 * The layout of data is defined by
28 *    chunk_size
29 *    raid_disks
30 *    near_copies (stored in low byte of layout)
31 *    far_copies (stored in second byte of layout)
32 *    far_offset (stored in bit 16 of layout)
33 *
34 * The data to be stored is divided into chunks using chunksize.
35 * Each device is divided into far_copies sections.
36 * In each section, chunks are laid out in a style similar to raid0, but
37 * near_copies copies of each chunk are stored (each on a different drive).
38 * The starting device for each section is offset near_copies from the starting
39 * device of the previous section.
40 * Thus there are (near_copies*far_copies) copies of each chunk, and each is on a
41 * different drive.
42 * near_copies and far_copies must be at least one, and their product is at most
43 * raid_disks.
44 *
45 * If far_offset is true, then the far_copies are handled a bit differently.
46 * The copies are still in different stripes, but instead of being very far
47 * apart on disk, they are in adjacent stripes.
48 */
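/*
 * Illustration of the above (geometry chosen purely as an example, not
 * taken from any particular array).  With raid_disks=4 and data chunks
 * A, B, C, ...:
 *
 * near_copies=2, far_copies=1 (the common "n2" layout):
 *	device:    0   1   2   3
 *	chunk 0:   A   A   B   B
 *	chunk 1:   C   C   D   D
 *
 * near_copies=1, far_copies=2, far_offset=0: each device is split into
 * far_copies sections of 'stride' sectors, and the second section holds
 * the same data shifted by near_copies devices:
 *	device:    0   1   2   3
 *	section 0: A   B   C   D
 *	           E   F   G   H
 *	section 1: D   A   B   C
 *	           H   E   F   G
 */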
49
50/*
51 * Number of guaranteed r10bios in case of extreme VM load:
52 */
53#define	NR_RAID10_BIOS 256
54
55static void unplug_slaves(mddev_t *mddev);
56
57static void allow_barrier(conf_t *conf);
58static void lower_barrier(conf_t *conf);
59
60static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
61{
62	conf_t *conf = data;
63	r10bio_t *r10_bio;
64	int size = offsetof(struct r10bio_s, devs[conf->copies]);
65
66	/* allocate a r10bio with room for conf->copies entries in the devs array */
67	r10_bio = kzalloc(size, gfp_flags);
68	if (!r10_bio)
69		unplug_slaves(conf->mddev);
70
71	return r10_bio;
72}
73
74static void r10bio_pool_free(void *r10_bio, void *data)
75{
76	kfree(r10_bio);
77}
78
79#define RESYNC_BLOCK_SIZE (64*1024)
80//#define RESYNC_BLOCK_SIZE PAGE_SIZE
81#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
82#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
83#define RESYNC_WINDOW (2048*1024)
84
85/*
86 * When performing a resync, we need to read and compare, so
87 * we need as many pages as there are copies.
88 * When performing a recovery, we need 2 bios, one for read,
89 * one for write (we recover only one drive per r10buf)
90 *
91 */
92static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
93{
94	conf_t *conf = data;
95	struct page *page;
96	r10bio_t *r10_bio;
97	struct bio *bio;
98	int i, j;
99	int nalloc;
100
101	r10_bio = r10bio_pool_alloc(gfp_flags, conf);
102	if (!r10_bio) {
103		unplug_slaves(conf->mddev);
104		return NULL;
105	}
106
107	if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
108		nalloc = conf->copies; /* resync */
109	else
110		nalloc = 2; /* recovery */
111
112	/*
113	 * Allocate bios.
114	 */
115	for (j = nalloc ; j-- ; ) {
116		bio = bio_alloc(gfp_flags, RESYNC_PAGES);
117		if (!bio)
118			goto out_free_bio;
119		r10_bio->devs[j].bio = bio;
120	}
121	/*
122	 * Allocate RESYNC_PAGES data pages and attach them
123	 * where needed.
124	 */
125	for (j = 0 ; j < nalloc; j++) {
126		bio = r10_bio->devs[j].bio;
127		for (i = 0; i < RESYNC_PAGES; i++) {
128			page = alloc_page(gfp_flags);
129			if (unlikely(!page))
130				goto out_free_pages;
131
132			bio->bi_io_vec[i].bv_page = page;
133		}
134	}
135
136	return r10_bio;
137
138out_free_pages:
139	for ( ; i > 0 ; i--)
140		safe_put_page(bio->bi_io_vec[i-1].bv_page);
141	while (j--)
142		for (i = 0; i < RESYNC_PAGES ; i++)
143			safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
144	j = -1;
145out_free_bio:
146	while ( ++j < nalloc )
147		bio_put(r10_bio->devs[j].bio);
148	r10bio_pool_free(r10_bio, conf);
149	return NULL;
150}
151
152static void r10buf_pool_free(void *__r10_bio, void *data)
153{
154	int i;
155	conf_t *conf = data;
156	r10bio_t *r10bio = __r10_bio;
157	int j;
158
159	for (j=0; j < conf->copies; j++) {
160		struct bio *bio = r10bio->devs[j].bio;
161		if (bio) {
162			for (i = 0; i < RESYNC_PAGES; i++) {
163				safe_put_page(bio->bi_io_vec[i].bv_page);
164				bio->bi_io_vec[i].bv_page = NULL;
165			}
166			bio_put(bio);
167		}
168	}
169	r10bio_pool_free(r10bio, conf);
170}
171
172static void put_all_bios(conf_t *conf, r10bio_t *r10_bio)
173{
174	int i;
175
176	for (i = 0; i < conf->copies; i++) {
177		struct bio **bio = & r10_bio->devs[i].bio;
178		if (*bio && *bio != IO_BLOCKED)
179			bio_put(*bio);
180		*bio = NULL;
181	}
182}
183
184static void free_r10bio(r10bio_t *r10_bio)
185{
186	conf_t *conf = mddev_to_conf(r10_bio->mddev);
187
188	/*
189	 * Wake up any possible resync thread that waits for the device
190	 * to go idle.
191	 */
192	allow_barrier(conf);
193
194	put_all_bios(conf, r10_bio);
195	mempool_free(r10_bio, conf->r10bio_pool);
196}
197
198static void put_buf(r10bio_t *r10_bio)
199{
200	conf_t *conf = mddev_to_conf(r10_bio->mddev);
201
202	mempool_free(r10_bio, conf->r10buf_pool);
203
204	lower_barrier(conf);
205}
206
207static void reschedule_retry(r10bio_t *r10_bio)
208{
209	unsigned long flags;
210	mddev_t *mddev = r10_bio->mddev;
211	conf_t *conf = mddev_to_conf(mddev);
212
213	spin_lock_irqsave(&conf->device_lock, flags);
214	list_add(&r10_bio->retry_list, &conf->retry_list);
215	conf->nr_queued ++;
216	spin_unlock_irqrestore(&conf->device_lock, flags);
217
218	md_wakeup_thread(mddev->thread);
219}
220
221/*
222 * raid_end_bio_io() is called when we have finished servicing a mirrored
223 * operation and are ready to return a success/failure code to the buffer
224 * cache layer.
225 */
226static void raid_end_bio_io(r10bio_t *r10_bio)
227{
228	struct bio *bio = r10_bio->master_bio;
229
230	bio_endio(bio, bio->bi_size,
231		test_bit(R10BIO_Uptodate, &r10_bio->state) ? 0 : -EIO);
232	free_r10bio(r10_bio);
233}
234
235/*
236 * Update disk head position estimator based on IRQ completion info.
237 */
238static inline void update_head_pos(int slot, r10bio_t *r10_bio)
239{
240	conf_t *conf = mddev_to_conf(r10_bio->mddev);
241
242	conf->mirrors[r10_bio->devs[slot].devnum].head_position =
243		r10_bio->devs[slot].addr + (r10_bio->sectors);
244}
245
246static int raid10_end_read_request(struct bio *bio, unsigned int bytes_done, int error)
247{
248	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
249	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
250	int slot, dev;
251	conf_t *conf = mddev_to_conf(r10_bio->mddev);
252
253	if (bio->bi_size)
254		return 1;
255
256	slot = r10_bio->read_slot;
257	dev = r10_bio->devs[slot].devnum;
258	/*
259	 * this branch is our 'one mirror IO has finished' event handler:
260	 */
261	update_head_pos(slot, r10_bio);
262
263	if (uptodate) {
264		/*
265		 * Set R10BIO_Uptodate in our master bio, so that
266		 * we will return a good error code to the higher
267		 * levels even if IO on some other mirrored buffer fails.
268		 *
269		 * The 'master' represents the composite IO operation to
270		 * user-side. So if something waits for IO, then it will
271		 * wait for the 'master' bio.
272		 */
273		set_bit(R10BIO_Uptodate, &r10_bio->state);
274		raid_end_bio_io(r10_bio);
275	} else {
276		/*
277		 * oops, read error:
278		 */
279		char b[BDEVNAME_SIZE];
280		if (printk_ratelimit())
281			printk(KERN_ERR "raid10: %s: rescheduling sector %llu\n",
282			       bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector);
283		reschedule_retry(r10_bio);
284	}
285
286	rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
287	return 0;
288}
289
290static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, int error)
291{
292	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
293	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
294	int slot, dev;
295	conf_t *conf = mddev_to_conf(r10_bio->mddev);
296
297	if (bio->bi_size)
298		return 1;
299
300	for (slot = 0; slot < conf->copies; slot++)
301		if (r10_bio->devs[slot].bio == bio)
302			break;
303	dev = r10_bio->devs[slot].devnum;
304
305	/*
306	 * this branch is our 'one mirror IO has finished' event handler:
307	 */
308	if (!uptodate) {
309		md_error(r10_bio->mddev, conf->mirrors[dev].rdev);
310		/* an I/O failed, we can't clear the bitmap */
311		set_bit(R10BIO_Degraded, &r10_bio->state);
312	} else
313		/*
314		 * Set R10BIO_Uptodate in our master bio, so that
315		 * we will return a good error code to the higher
316		 * levels even if IO on some other mirrored buffer fails.
317		 *
318		 * The 'master' represents the composite IO operation to
319		 * user-side. So if something waits for IO, then it will
320		 * wait for the 'master' bio.
321		 */
322		set_bit(R10BIO_Uptodate, &r10_bio->state);
323
324	update_head_pos(slot, r10_bio);
325
326	/*
327	 *
328	 * Let's see if all mirrored write operations have finished
329	 * already.
330	 */
331	if (atomic_dec_and_test(&r10_bio->remaining)) {
332		/* clear the bitmap if all writes complete successfully */
333		bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
334				r10_bio->sectors,
335				!test_bit(R10BIO_Degraded, &r10_bio->state),
336				0);
337		md_write_end(r10_bio->mddev);
338		raid_end_bio_io(r10_bio);
339	}
340
341	rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
342	return 0;
343}
344
345
346/*
347 * RAID10 layout manager
348 * As well as the chunk size and raid_disks count, there are two
349 * parameters: near_copies and far_copies.
350 * near_copies * far_copies must be <= raid_disks.
351 * Normally one of these will be 1.
352 * If both are 1, we get raid0.
353 * If near_copies == raid_disks, we get raid1.
354 *
355 * Chunks are laid out in raid0 style with near_copies copies of the
356 * first chunk, followed by near_copies copies of the next chunk and
357 * so on.
358 * If far_copies > 1, then after 1/far_copies of the array has been assigned
359 * as described above, we start again with a device offset of near_copies.
360 * So we effectively have another copy of the whole array further down all
361 * the drives, but with blocks on different drives.
362 * With this layout, a block is never stored twice on the same device.
363 *
364 * raid10_find_phys finds the sector offset of a given virtual sector
365 * on each device that it is on.
366 *
367 * raid10_find_virt does the reverse mapping, from a device and a
368 * sector offset to a virtual address
369 */
370
371static void raid10_find_phys(conf_t *conf, r10bio_t *r10bio)
372{
373	int n,f;
374	sector_t sector;
375	sector_t chunk;
376	sector_t stripe;
377	int dev;
378
379	int slot = 0;
380
381	/* now calculate first sector/dev */
382	chunk = r10bio->sector >> conf->chunk_shift;
383	sector = r10bio->sector & conf->chunk_mask;
384
385	chunk *= conf->near_copies;
386	stripe = chunk;
387	dev = sector_div(stripe, conf->raid_disks);
388	if (conf->far_offset)
389		stripe *= conf->far_copies;
390
391	sector += stripe << conf->chunk_shift;
392
393	/* and calculate all the others */
394	for (n=0; n < conf->near_copies; n++) {
395		int d = dev;
396		sector_t s = sector;
397		r10bio->devs[slot].addr = sector;
398		r10bio->devs[slot].devnum = d;
399		slot++;
400
401		for (f = 1; f < conf->far_copies; f++) {
402			d += conf->near_copies;
403			if (d >= conf->raid_disks)
404				d -= conf->raid_disks;
405			s += conf->stride;
406			r10bio->devs[slot].devnum = d;
407			r10bio->devs[slot].addr = s;
408			slot++;
409		}
410		dev++;
411		if (dev >= conf->raid_disks) {
412			dev = 0;
413			sector += (conf->chunk_mask + 1);
414		}
415	}
416	BUG_ON(slot != conf->copies);
417}
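
/*
 * Worked example of the mapping (numbers chosen only for illustration):
 * raid_disks=4, near_copies=2, far_copies=1, 64KiB chunks (so
 * chunk_shift=7, chunk_mask=127), r10bio->sector=384:
 *
 *	chunk  = 384 >> 7 = 3;	sector = 384 & 127 = 0
 *	chunk *= 2 -> 6;	dev = 6 % 4 = 2;  stripe = 6 / 4 = 1
 *	sector += 1 << 7 -> 128
 *
 * so the two copies live on devices 2 and 3, both at device sector 128.
 * raid10_find_virt(conf, 128, 2) below inverts this and returns 384.
 */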
418
419static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev)
420{
421	sector_t offset, chunk, vchunk;
422
423	offset = sector & conf->chunk_mask;
424	if (conf->far_offset) {
425		int fc;
426		chunk = sector >> conf->chunk_shift;
427		fc = sector_div(chunk, conf->far_copies);
428		dev -= fc * conf->near_copies;
429		if (dev < 0)
430			dev += conf->raid_disks;
431	} else {
432		while (sector >= conf->stride) {
433			sector -= conf->stride;
434			if (dev < conf->near_copies)
435				dev += conf->raid_disks - conf->near_copies;
436			else
437				dev -= conf->near_copies;
438		}
439		chunk = sector >> conf->chunk_shift;
440	}
441	vchunk = chunk * conf->raid_disks + dev;
442	sector_div(vchunk, conf->near_copies);
443	return (vchunk << conf->chunk_shift) + offset;
444}
445
446/**
447 *	raid10_mergeable_bvec -- tell the bio layer if two requests can be merged
448 *	@q: request queue
449 *	@bio: the buffer head that's been built up so far
450 *	@bio_vec: the request that could be merged to it.
451 *
452 *	Return the number of bytes we can accept at this offset.
453 *      If near_copies == raid_disks, there are no striping issues,
454 *      but in that case, the function isn't called at all.
455 */
456static int raid10_mergeable_bvec(request_queue_t *q, struct bio *bio,
457				struct bio_vec *bio_vec)
458{
459	mddev_t *mddev = q->queuedata;
460	sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
461	int max;
462	unsigned int chunk_sectors = mddev->chunk_size >> 9;
463	unsigned int bio_sectors = bio->bi_size >> 9;
464
465	max =  (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
466	if (max < 0) max = 0; /* bio_add cannot handle a negative return */
467	if (max <= bio_vec->bv_len && bio_sectors == 0)
468		return bio_vec->bv_len;
469	else
470		return max;
471}
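
/*
 * Example of the arithmetic above (illustrative values only): with 64KiB
 * chunks (chunk_sectors=128), a bio whose start offset within its chunk
 * plus its current size comes to 124 sectors gets
 *	max = (128 - 124) << 9 = 2048
 * i.e. at most 2048 further bytes may be added before the chunk boundary.
 */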
472
473/*
474 * This routine returns the disk from which the requested read should
475 * be done. There is a per-array 'next expected sequential IO' sector
476 * number - if this matches on the next IO then we use the last disk.
477 * There is also a per-disk 'last known head position' sector that is
478 * maintained from IRQ contexts, both the normal and the resync IO
479 * completion handlers update this position correctly. If there is no
480 * perfect sequential match then we pick the disk whose head is closest.
481 *
482 * If there are 2 mirrors in the same 2 devices, performance degrades
483 * because the head position is tracked per mirror, not per device.
484 *
485 * The rdev for the device selected will have nr_pending incremented.
486 */
487
488static int read_balance(conf_t *conf, r10bio_t *r10_bio)
489{
490	const unsigned long this_sector = r10_bio->sector;
491	int disk, slot, nslot;
492	const int sectors = r10_bio->sectors;
493	sector_t new_distance, current_distance;
494	mdk_rdev_t *rdev;
495
496	raid10_find_phys(conf, r10_bio);
497	rcu_read_lock();
498	/*
499	 * Check if we can balance. We can balance on the whole
500	 * device if no resync is going on (recovery is ok), or below
501	 * the resync window. We take the first readable disk when
502	 * above the resync window.
503	 */
504	if (conf->mddev->recovery_cp < MaxSector
505	    && (this_sector + sectors >= conf->next_resync)) {
506		/* make sure that disk is operational */
507		slot = 0;
508		disk = r10_bio->devs[slot].devnum;
509
510		while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
511		       r10_bio->devs[slot].bio == IO_BLOCKED ||
512		       !test_bit(In_sync, &rdev->flags)) {
513			slot++;
514			if (slot == conf->copies) {
515				slot = 0;
516				disk = -1;
517				break;
518			}
519			disk = r10_bio->devs[slot].devnum;
520		}
521		goto rb_out;
522	}
523
524
525	/* make sure the disk is operational */
526	slot = 0;
527	disk = r10_bio->devs[slot].devnum;
528	while ((rdev=rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
529	       r10_bio->devs[slot].bio == IO_BLOCKED ||
530	       !test_bit(In_sync, &rdev->flags)) {
531		slot ++;
532		if (slot == conf->copies) {
533			disk = -1;
534			goto rb_out;
535		}
536		disk = r10_bio->devs[slot].devnum;
537	}
538
539
540	current_distance = abs(r10_bio->devs[slot].addr -
541			       conf->mirrors[disk].head_position);
542
543	/* Find the disk whose head is closest */
544
545	for (nslot = slot; nslot < conf->copies; nslot++) {
546		int ndisk = r10_bio->devs[nslot].devnum;
547
548
549		if ((rdev=rcu_dereference(conf->mirrors[ndisk].rdev)) == NULL ||
550		    r10_bio->devs[nslot].bio == IO_BLOCKED ||
551		    !test_bit(In_sync, &rdev->flags))
552			continue;
553
554		/* This optimisation is debatable, and completely destroys
555		 * sequential read speed for 'far copies' arrays.  So only
556		 * keep it for 'near' arrays, and review those later.
557		 */
558		if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) {
559			disk = ndisk;
560			slot = nslot;
561			break;
562		}
563		new_distance = abs(r10_bio->devs[nslot].addr -
564				   conf->mirrors[ndisk].head_position);
565		if (new_distance < current_distance) {
566			current_distance = new_distance;
567			disk = ndisk;
568			slot = nslot;
569		}
570	}
571
572rb_out:
573	r10_bio->read_slot = slot;
574/*	conf->next_seq_sect = this_sector + sectors;*/
575
576	if (disk >= 0 && (rdev=rcu_dereference(conf->mirrors[disk].rdev))!= NULL)
577		atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
578	else
579		disk = -1;
580	rcu_read_unlock();
581
582	return disk;
583}
584
585static void unplug_slaves(mddev_t *mddev)
586{
587	conf_t *conf = mddev_to_conf(mddev);
588	int i;
589
590	rcu_read_lock();
591	for (i=0; i<mddev->raid_disks; i++) {
592		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
593		if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
594			request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
595
596			atomic_inc(&rdev->nr_pending);
597			rcu_read_unlock();
598
599			if (r_queue->unplug_fn)
600				r_queue->unplug_fn(r_queue);
601
602			rdev_dec_pending(rdev, mddev);
603			rcu_read_lock();
604		}
605	}
606	rcu_read_unlock();
607}
608
609static void raid10_unplug(request_queue_t *q)
610{
611	mddev_t *mddev = q->queuedata;
612
613	unplug_slaves(q->queuedata);
614	md_wakeup_thread(mddev->thread);
615}
616
617static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk,
618			     sector_t *error_sector)
619{
620	mddev_t *mddev = q->queuedata;
621	conf_t *conf = mddev_to_conf(mddev);
622	int i, ret = 0;
623
624	rcu_read_lock();
625	for (i=0; i<mddev->raid_disks && ret == 0; i++) {
626		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
627		if (rdev && !test_bit(Faulty, &rdev->flags)) {
628			struct block_device *bdev = rdev->bdev;
629			request_queue_t *r_queue = bdev_get_queue(bdev);
630
631			if (!r_queue->issue_flush_fn)
632				ret = -EOPNOTSUPP;
633			else {
634				atomic_inc(&rdev->nr_pending);
635				rcu_read_unlock();
636				ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk,
637							      error_sector);
638				rdev_dec_pending(rdev, mddev);
639				rcu_read_lock();
640			}
641		}
642	}
643	rcu_read_unlock();
644	return ret;
645}
646
647static int raid10_congested(void *data, int bits)
648{
649	mddev_t *mddev = data;
650	conf_t *conf = mddev_to_conf(mddev);
651	int i, ret = 0;
652
653	rcu_read_lock();
654	for (i = 0; i < mddev->raid_disks && ret == 0; i++) {
655		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
656		if (rdev && !test_bit(Faulty, &rdev->flags)) {
657			request_queue_t *q = bdev_get_queue(rdev->bdev);
658
659			ret |= bdi_congested(&q->backing_dev_info, bits);
660		}
661	}
662	rcu_read_unlock();
663	return ret;
664}
665
666
667/* Barriers....
668 * Sometimes we need to suspend IO while we do something else,
669 * either some resync/recovery, or reconfigure the array.
670 * To do this we raise a 'barrier'.
671 * The 'barrier' is a counter that can be raised multiple times
672 * to count how many activities are happening which preclude
673 * normal IO.
674 * We can only raise the barrier if there is no pending IO.
675 * i.e. if nr_pending == 0.
676 * We choose only to raise the barrier if no-one is waiting for the
677 * barrier to go down.  This means that as soon as an IO request
678 * is ready, no other operations which require a barrier will start
679 * until the IO request has had a chance.
680 *
681 * So: regular IO calls 'wait_barrier'.  When that returns there
682 *    is no background IO happening.  It must arrange to call
683 *    allow_barrier when it has finished its IO.
684 * background IO must call raise_barrier.  Once that returns
685 *    there is no normal IO happening.  It must arrange to call
686 *    lower_barrier when the particular background IO completes.
687 */
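/*
 * Sketch of the calling pattern described above (see make_request,
 * free_r10bio, sync_request and put_buf for the actual call sites):
 *
 *	regular IO:			background (resync/recovery) IO:
 *		wait_barrier(conf);		raise_barrier(conf, 0);
 *		... submit the IO ...		... do the background IO ...
 *		allow_barrier(conf);		lower_barrier(conf);
 */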
688#define RESYNC_DEPTH 32
689
690static void raise_barrier(conf_t *conf, int force)
691{
692	BUG_ON(force && !conf->barrier);
693	spin_lock_irq(&conf->resync_lock);
694
695	/* Wait until no block IO is waiting (unless 'force') */
696	wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
697			    conf->resync_lock,
698			    raid10_unplug(conf->mddev->queue));
699
700	/* block any new IO from starting */
701	conf->barrier++;
702
703	/* Now wait for all pending IO to complete */
704	wait_event_lock_irq(conf->wait_barrier,
705			    !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
706			    conf->resync_lock,
707			    raid10_unplug(conf->mddev->queue));
708
709	spin_unlock_irq(&conf->resync_lock);
710}
711
712static void lower_barrier(conf_t *conf)
713{
714	unsigned long flags;
715	spin_lock_irqsave(&conf->resync_lock, flags);
716	conf->barrier--;
717	spin_unlock_irqrestore(&conf->resync_lock, flags);
718	wake_up(&conf->wait_barrier);
719}
720
721static void wait_barrier(conf_t *conf)
722{
723	spin_lock_irq(&conf->resync_lock);
724	if (conf->barrier) {
725		conf->nr_waiting++;
726		wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
727				    conf->resync_lock,
728				    raid10_unplug(conf->mddev->queue));
729		conf->nr_waiting--;
730	}
731	conf->nr_pending++;
732	spin_unlock_irq(&conf->resync_lock);
733}
734
735static void allow_barrier(conf_t *conf)
736{
737	unsigned long flags;
738	spin_lock_irqsave(&conf->resync_lock, flags);
739	conf->nr_pending--;
740	spin_unlock_irqrestore(&conf->resync_lock, flags);
741	wake_up(&conf->wait_barrier);
742}
743
744static void freeze_array(conf_t *conf)
745{
746	/* stop syncio and normal IO and wait for everything to
747	 * go quiet.
748	 * We increment barrier and nr_waiting, and then
749	 * wait until barrier+nr_pending match nr_queued+2
750	 */
751	spin_lock_irq(&conf->resync_lock);
752	conf->barrier++;
753	conf->nr_waiting++;
754	wait_event_lock_irq(conf->wait_barrier,
755			    conf->barrier+conf->nr_pending == conf->nr_queued+2,
756			    conf->resync_lock,
757			    raid10_unplug(conf->mddev->queue));
758	spin_unlock_irq(&conf->resync_lock);
759}
760
761static void unfreeze_array(conf_t *conf)
762{
763	/* reverse the effect of the freeze */
764	spin_lock_irq(&conf->resync_lock);
765	conf->barrier--;
766	conf->nr_waiting--;
767	wake_up(&conf->wait_barrier);
768	spin_unlock_irq(&conf->resync_lock);
769}
770
771static int make_request(request_queue_t *q, struct bio * bio)
772{
773	mddev_t *mddev = q->queuedata;
774	conf_t *conf = mddev_to_conf(mddev);
775	mirror_info_t *mirror;
776	r10bio_t *r10_bio;
777	struct bio *read_bio;
778	int i;
779	int chunk_sects = conf->chunk_mask + 1;
780	const int rw = bio_data_dir(bio);
781	const int do_sync = bio_sync(bio);
782	struct bio_list bl;
783	unsigned long flags;
784
785	if (unlikely(bio_barrier(bio))) {
786		bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
787		return 0;
788	}
789
790	/* If this request crosses a chunk boundary, we need to
791	 * split it.  This will only happen for 1 PAGE (or less) requests.
792	 */
793	if (unlikely( (bio->bi_sector & conf->chunk_mask) + (bio->bi_size >> 9)
794		      > chunk_sects &&
795		    conf->near_copies < conf->raid_disks)) {
796		struct bio_pair *bp;
797		/* Sanity check -- queue functions should prevent this happening */
798		if (bio->bi_vcnt != 1 ||
799		    bio->bi_idx != 0)
800			goto bad_map;
801		/* This is a one page bio that upper layers
802		 * refuse to split for us, so we need to split it.
803		 */
804		bp = bio_split(bio, bio_split_pool,
805			       chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );
806		if (make_request(q, &bp->bio1))
807			generic_make_request(&bp->bio1);
808		if (make_request(q, &bp->bio2))
809			generic_make_request(&bp->bio2);
810
811		bio_pair_release(bp);
812		return 0;
813	bad_map:
814		printk("raid10_make_request bug: can't convert block across chunks"
815		       " or bigger than %dk %llu %d\n", chunk_sects/2,
816		       (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
817
818		bio_io_error(bio, bio->bi_size);
819		return 0;
820	}
821
822	md_write_start(mddev, bio);
823
824	/*
825	 * Register the new request and wait if the reconstruction
826	 * thread has put up a bar for new requests.
827	 * Continue immediately if no resync is active currently.
828	 */
829	wait_barrier(conf);
830
831	disk_stat_inc(mddev->gendisk, ios[rw]);
832	disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio));
833
834	r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
835
836	r10_bio->master_bio = bio;
837	r10_bio->sectors = bio->bi_size >> 9;
838
839	r10_bio->mddev = mddev;
840	r10_bio->sector = bio->bi_sector;
841	r10_bio->state = 0;
842
843	if (rw == READ) {
844		/*
845		 * read balancing logic:
846		 */
847		int disk = read_balance(conf, r10_bio);
848		int slot = r10_bio->read_slot;
849		if (disk < 0) {
850			raid_end_bio_io(r10_bio);
851			return 0;
852		}
853		mirror = conf->mirrors + disk;
854
855		read_bio = bio_clone(bio, GFP_NOIO);
856
857		r10_bio->devs[slot].bio = read_bio;
858
859		read_bio->bi_sector = r10_bio->devs[slot].addr +
860			mirror->rdev->data_offset;
861		read_bio->bi_bdev = mirror->rdev->bdev;
862		read_bio->bi_end_io = raid10_end_read_request;
863		read_bio->bi_rw = READ | do_sync;
864		read_bio->bi_private = r10_bio;
865
866		generic_make_request(read_bio);
867		return 0;
868	}
869
870	/*
871	 * WRITE:
872	 */
873	/* first select target devices under rcu_read_lock and
874	 * inc refcount on their rdev.  Record them by setting
875	 * bios[x] to bio
876	 */
877	raid10_find_phys(conf, r10_bio);
878	rcu_read_lock();
879	for (i = 0;  i < conf->copies; i++) {
880		int d = r10_bio->devs[i].devnum;
881		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev);
882		if (rdev &&
883		    !test_bit(Faulty, &rdev->flags)) {
884			atomic_inc(&rdev->nr_pending);
885			r10_bio->devs[i].bio = bio;
886		} else {
887			r10_bio->devs[i].bio = NULL;
888			set_bit(R10BIO_Degraded, &r10_bio->state);
889		}
890	}
891	rcu_read_unlock();
892
893	atomic_set(&r10_bio->remaining, 0);
894
895	bio_list_init(&bl);
896	for (i = 0; i < conf->copies; i++) {
897		struct bio *mbio;
898		int d = r10_bio->devs[i].devnum;
899		if (!r10_bio->devs[i].bio)
900			continue;
901
902		mbio = bio_clone(bio, GFP_NOIO);
903		r10_bio->devs[i].bio = mbio;
904
905		mbio->bi_sector	= r10_bio->devs[i].addr+
906			conf->mirrors[d].rdev->data_offset;
907		mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
908		mbio->bi_end_io	= raid10_end_write_request;
909		mbio->bi_rw = WRITE | do_sync;
910		mbio->bi_private = r10_bio;
911
912		atomic_inc(&r10_bio->remaining);
913		bio_list_add(&bl, mbio);
914	}
915
916	bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0);
917	spin_lock_irqsave(&conf->device_lock, flags);
918	bio_list_merge(&conf->pending_bio_list, &bl);
919	blk_plug_device(mddev->queue);
920	spin_unlock_irqrestore(&conf->device_lock, flags);
921
922	if (do_sync)
923		md_wakeup_thread(mddev->thread);
924
925	return 0;
926}
927
928static void status(struct seq_file *seq, mddev_t *mddev)
929{
930	conf_t *conf = mddev_to_conf(mddev);
931	int i;
932
933	if (conf->near_copies < conf->raid_disks)
934		seq_printf(seq, " %dK chunks", mddev->chunk_size/1024);
935	if (conf->near_copies > 1)
936		seq_printf(seq, " %d near-copies", conf->near_copies);
937	if (conf->far_copies > 1) {
938		if (conf->far_offset)
939			seq_printf(seq, " %d offset-copies", conf->far_copies);
940		else
941			seq_printf(seq, " %d far-copies", conf->far_copies);
942	}
943	seq_printf(seq, " [%d/%d] [", conf->raid_disks,
944					conf->raid_disks - mddev->degraded);
945	for (i = 0; i < conf->raid_disks; i++)
946		seq_printf(seq, "%s",
947			      conf->mirrors[i].rdev &&
948			      test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_");
949	seq_printf(seq, "]");
950}
951
952static void error(mddev_t *mddev, mdk_rdev_t *rdev)
953{
954	char b[BDEVNAME_SIZE];
955	conf_t *conf = mddev_to_conf(mddev);
956
957	/*
958	 * If it is not operational, then we have already marked it as dead
959	 * else if it is the last working disk, ignore the error and let the
960	 * next level up know.
961	 * else mark the drive as failed
962	 */
963	if (test_bit(In_sync, &rdev->flags)
964	    && conf->raid_disks-mddev->degraded == 1)
965		/*
966		 * Don't fail the drive, just return an IO error.
967		 * The test should really be more sophisticated than
968		 * "working_disks == 1", but it isn't critical, and
969		 * can wait until we do more sophisticated "is the drive
970		 * really dead" tests...
971		 */
972		return;
973	if (test_and_clear_bit(In_sync, &rdev->flags)) {
974		unsigned long flags;
975		spin_lock_irqsave(&conf->device_lock, flags);
976		mddev->degraded++;
977		spin_unlock_irqrestore(&conf->device_lock, flags);
978		/*
979		 * if recovery is running, make sure it aborts.
980		 */
981		set_bit(MD_RECOVERY_ERR, &mddev->recovery);
982	}
983	set_bit(Faulty, &rdev->flags);
984	set_bit(MD_CHANGE_DEVS, &mddev->flags);
985	printk(KERN_ALERT "raid10: Disk failure on %s, disabling device. \n"
986		"	Operation continuing on %d devices\n",
987		bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded);
988}
989
990static void print_conf(conf_t *conf)
991{
992	int i;
993	mirror_info_t *tmp;
994
995	printk("RAID10 conf printout:\n");
996	if (!conf) {
997		printk("(!conf)\n");
998		return;
999	}
1000	printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
1001		conf->raid_disks);
1002
1003	for (i = 0; i < conf->raid_disks; i++) {
1004		char b[BDEVNAME_SIZE];
1005		tmp = conf->mirrors + i;
1006		if (tmp->rdev)
1007			printk(" disk %d, wo:%d, o:%d, dev:%s\n",
1008				i, !test_bit(In_sync, &tmp->rdev->flags),
1009			        !test_bit(Faulty, &tmp->rdev->flags),
1010				bdevname(tmp->rdev->bdev,b));
1011	}
1012}
1013
1014static void close_sync(conf_t *conf)
1015{
1016	wait_barrier(conf);
1017	allow_barrier(conf);
1018
1019	mempool_destroy(conf->r10buf_pool);
1020	conf->r10buf_pool = NULL;
1021}
1022
1023/* check if there are enough drives for
1024 * every block to appear on at least one
1025 */
1026static int enough(conf_t *conf)
1027{
1028	int first = 0;
1029
1030	do {
1031		int n = conf->copies;
1032		int cnt = 0;
1033		while (n--) {
1034			if (conf->mirrors[first].rdev)
1035				cnt++;
1036			first = (first+1) % conf->raid_disks;
1037		}
1038		if (cnt == 0)
1039			return 0;
1040	} while (first != 0);
1041	return 1;
1042}
1043
1044static int raid10_spare_active(mddev_t *mddev)
1045{
1046	int i;
1047	conf_t *conf = mddev->private;
1048	mirror_info_t *tmp;
1049
1050	/*
1051	 * Find all non-in_sync disks within the RAID10 configuration
1052	 * and mark them in_sync
1053	 */
1054	for (i = 0; i < conf->raid_disks; i++) {
1055		tmp = conf->mirrors + i;
1056		if (tmp->rdev
1057		    && !test_bit(Faulty, &tmp->rdev->flags)
1058		    && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
1059			unsigned long flags;
1060			spin_lock_irqsave(&conf->device_lock, flags);
1061			mddev->degraded--;
1062			spin_unlock_irqrestore(&conf->device_lock, flags);
1063		}
1064	}
1065
1066	print_conf(conf);
1067	return 0;
1068}
1069
1070
1071static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1072{
1073	conf_t *conf = mddev->private;
1074	int found = 0;
1075	int mirror;
1076	mirror_info_t *p;
1077
1078	if (mddev->recovery_cp < MaxSector)
1079		/* only hot-add to in-sync arrays, as recovery is
1080		 * very different from resync
1081		 */
1082		return 0;
1083	if (!enough(conf))
1084		return 0;
1085
1086	if (rdev->saved_raid_disk >= 0 &&
1087	    conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1088		mirror = rdev->saved_raid_disk;
1089	else
1090		mirror = 0;
1091	for ( ; mirror < mddev->raid_disks; mirror++)
1092		if ( !(p=conf->mirrors+mirror)->rdev) {
1093
1094			blk_queue_stack_limits(mddev->queue,
1095					       rdev->bdev->bd_disk->queue);
1096			/* as we don't honour merge_bvec_fn, we must never risk
1097			 * violating it, so limit ->max_sectors to one PAGE, as
1098			 * a one page request is never in violation.
1099			 */
1100			if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
1101			    mddev->queue->max_sectors > (PAGE_SIZE>>9))
1102				mddev->queue->max_sectors = (PAGE_SIZE>>9);
1103
1104			p->head_position = 0;
1105			rdev->raid_disk = mirror;
1106			found = 1;
1107			if (rdev->saved_raid_disk != mirror)
1108				conf->fullsync = 1;
1109			rcu_assign_pointer(p->rdev, rdev);
1110			break;
1111		}
1112
1113	print_conf(conf);
1114	return found;
1115}
1116
1117static int raid10_remove_disk(mddev_t *mddev, int number)
1118{
1119	conf_t *conf = mddev->private;
1120	int err = 0;
1121	mdk_rdev_t *rdev;
1122	mirror_info_t *p = conf->mirrors+ number;
1123
1124	print_conf(conf);
1125	rdev = p->rdev;
1126	if (rdev) {
1127		if (test_bit(In_sync, &rdev->flags) ||
1128		    atomic_read(&rdev->nr_pending)) {
1129			err = -EBUSY;
1130			goto abort;
1131		}
1132		p->rdev = NULL;
1133		synchronize_rcu();
1134		if (atomic_read(&rdev->nr_pending)) {
1135			/* lost the race, try later */
1136			err = -EBUSY;
1137			p->rdev = rdev;
1138		}
1139	}
1140abort:
1141
1142	print_conf(conf);
1143	return err;
1144}
1145
1146
1147static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error)
1148{
1149	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
1150	conf_t *conf = mddev_to_conf(r10_bio->mddev);
1151	int i,d;
1152
1153	if (bio->bi_size)
1154		return 1;
1155
1156	for (i=0; i<conf->copies; i++)
1157		if (r10_bio->devs[i].bio == bio)
1158			break;
1159	BUG_ON(i == conf->copies);
1160	update_head_pos(i, r10_bio);
1161	d = r10_bio->devs[i].devnum;
1162
1163	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1164		set_bit(R10BIO_Uptodate, &r10_bio->state);
1165	else {
1166		atomic_add(r10_bio->sectors,
1167			   &conf->mirrors[d].rdev->corrected_errors);
1168		if (!test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
1169			md_error(r10_bio->mddev,
1170				 conf->mirrors[d].rdev);
1171	}
1172
1173	/* for reconstruct, we always reschedule after a read.
1174	 * for resync, only after all reads
1175	 */
1176	if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
1177	    atomic_dec_and_test(&r10_bio->remaining)) {
1178		/* we have read all the blocks,
1179		 * do the comparison in process context in raid10d
1180		 */
1181		reschedule_retry(r10_bio);
1182	}
1183	rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
1184	return 0;
1185}
1186
1187static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error)
1188{
1189	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1190	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
1191	mddev_t *mddev = r10_bio->mddev;
1192	conf_t *conf = mddev_to_conf(mddev);
1193	int i,d;
1194
1195	if (bio->bi_size)
1196		return 1;
1197
1198	for (i = 0; i < conf->copies; i++)
1199		if (r10_bio->devs[i].bio == bio)
1200			break;
1201	d = r10_bio->devs[i].devnum;
1202
1203	if (!uptodate)
1204		md_error(mddev, conf->mirrors[d].rdev);
1205	update_head_pos(i, r10_bio);
1206
1207	while (atomic_dec_and_test(&r10_bio->remaining)) {
1208		if (r10_bio->master_bio == NULL) {
1209			/* the primary of several recovery bios */
1210			md_done_sync(mddev, r10_bio->sectors, 1);
1211			put_buf(r10_bio);
1212			break;
1213		} else {
1214			r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio;
1215			put_buf(r10_bio);
1216			r10_bio = r10_bio2;
1217		}
1218	}
1219	rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1220	return 0;
1221}
1222
1223/*
1224 * Note: sync and recovery are handled very differently for raid10
1225 * This code is for resync.
1226 * For resync, we read through virtual addresses and read all blocks.
1227 * If there is any error, we schedule a write.  The lowest numbered
1228 * drive is authoritative.
1229 * However, requests come for physical addresses, so we need to map.
1230 * For every physical address there are raid_disks/copies virtual addresses,
1231 * which is always at least one, but is not necessarily an integer.
1232 * This means that a physical address can span multiple chunks, so we may
1233 * have to submit multiple io requests for a single sync request.
1234 */
1235/*
1236 * We check if all blocks are in-sync and only write to blocks that
1237 * aren't in sync
1238 */
1239static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio)
1240{
1241	conf_t *conf = mddev_to_conf(mddev);
1242	int i, first;
1243	struct bio *tbio, *fbio;
1244
1245	atomic_set(&r10_bio->remaining, 1);
1246
1247	/* find the first device with a block */
1248	for (i=0; i<conf->copies; i++)
1249		if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
1250			break;
1251
1252	if (i == conf->copies)
1253		goto done;
1254
1255	first = i;
1256	fbio = r10_bio->devs[i].bio;
1257
1258	/* now find blocks with errors */
1259	for (i=0 ; i < conf->copies ; i++) {
1260		int  j, d;
1261		int vcnt = r10_bio->sectors >> (PAGE_SHIFT-9);
1262
1263		tbio = r10_bio->devs[i].bio;
1264
1265		if (tbio->bi_end_io != end_sync_read)
1266			continue;
1267		if (i == first)
1268			continue;
1269		if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) {
1270			/* We know that the bi_io_vec layout is the same for
1271			 * both 'first' and 'i', so we just compare them.
1272			 * All vec entries are PAGE_SIZE;
1273			 */
1274			for (j = 0; j < vcnt; j++)
1275				if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
1276					   page_address(tbio->bi_io_vec[j].bv_page),
1277					   PAGE_SIZE))
1278					break;
1279			if (j == vcnt)
1280				continue;
1281			mddev->resync_mismatches += r10_bio->sectors;
1282		}
1283		if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
1284			/* Don't fix anything. */
1285			continue;
1286		/* Ok, we need to write this bio
1287		 * First we need to fixup bv_offset, bv_len and
1288		 * bi_vecs, as the read request might have corrupted these
1289		 */
1290		tbio->bi_vcnt = vcnt;
1291		tbio->bi_size = r10_bio->sectors << 9;
1292		tbio->bi_idx = 0;
1293		tbio->bi_phys_segments = 0;
1294		tbio->bi_hw_segments = 0;
1295		tbio->bi_hw_front_size = 0;
1296		tbio->bi_hw_back_size = 0;
1297		tbio->bi_flags &= ~(BIO_POOL_MASK - 1);
1298		tbio->bi_flags |= 1 << BIO_UPTODATE;
1299		tbio->bi_next = NULL;
1300		tbio->bi_rw = WRITE;
1301		tbio->bi_private = r10_bio;
1302		tbio->bi_sector = r10_bio->devs[i].addr;
1303
1304		for (j=0; j < vcnt ; j++) {
1305			tbio->bi_io_vec[j].bv_offset = 0;
1306			tbio->bi_io_vec[j].bv_len = PAGE_SIZE;
1307
1308			memcpy(page_address(tbio->bi_io_vec[j].bv_page),
1309			       page_address(fbio->bi_io_vec[j].bv_page),
1310			       PAGE_SIZE);
1311		}
1312		tbio->bi_end_io = end_sync_write;
1313
1314		d = r10_bio->devs[i].devnum;
1315		atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1316		atomic_inc(&r10_bio->remaining);
1317		md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9);
1318
1319		tbio->bi_sector += conf->mirrors[d].rdev->data_offset;
1320		tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
1321		generic_make_request(tbio);
1322	}
1323
1324done:
1325	if (atomic_dec_and_test(&r10_bio->remaining)) {
1326		md_done_sync(mddev, r10_bio->sectors, 1);
1327		put_buf(r10_bio);
1328	}
1329}
1330
1331/*
1332 * Now for the recovery code.
1333 * Recovery happens across physical sectors.
1334 * We recover all non-in_sync drives by finding the virtual address of
1335 * each, and then choose a working drive that also has that virt address.
1336 * There is a separate r10_bio for each non-in_sync drive.
1337 * Only the first two slots are in use: the first for reading,
1338 * the second for writing.
1339 *
1340 */
1341
1342static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
1343{
1344	conf_t *conf = mddev_to_conf(mddev);
1345	int i, d;
1346	struct bio *bio, *wbio;
1347
1348
1349	/* move the pages across to the second bio
1350	 * and submit the write request
1351	 */
1352	bio = r10_bio->devs[0].bio;
1353	wbio = r10_bio->devs[1].bio;
1354	for (i=0; i < wbio->bi_vcnt; i++) {
1355		struct page *p = bio->bi_io_vec[i].bv_page;
1356		bio->bi_io_vec[i].bv_page = wbio->bi_io_vec[i].bv_page;
1357		wbio->bi_io_vec[i].bv_page = p;
1358	}
1359	d = r10_bio->devs[1].devnum;
1360
1361	atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1362	md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
1363	if (test_bit(R10BIO_Uptodate, &r10_bio->state))
1364		generic_make_request(wbio);
1365	else
1366		bio_endio(wbio, wbio->bi_size, -EIO);
1367}
1368
1369
1370/*
1371 * This is a kernel thread which:
1372 *
1373 *	1.	Retries failed read operations on working mirrors.
1374 *	2.	Updates the raid superblock when problems are encountered.
1375 *	3.	Performs writes following reads for array synchronising.
1376 */
1377
1378static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1379{
1380	int sect = 0; /* Offset from r10_bio->sector */
1381	int sectors = r10_bio->sectors;
1382	mdk_rdev_t*rdev;
1383	while(sectors) {
1384		int s = sectors;
1385		int sl = r10_bio->read_slot;
1386		int success = 0;
1387		int start;
1388
1389		if (s > (PAGE_SIZE>>9))
1390			s = PAGE_SIZE >> 9;
1391
1392		rcu_read_lock();
1393		do {
1394			int d = r10_bio->devs[sl].devnum;
1395			rdev = rcu_dereference(conf->mirrors[d].rdev);
1396			if (rdev &&
1397			    test_bit(In_sync, &rdev->flags)) {
1398				atomic_inc(&rdev->nr_pending);
1399				rcu_read_unlock();
1400				success = sync_page_io(rdev->bdev,
1401						       r10_bio->devs[sl].addr +
1402						       sect + rdev->data_offset,
1403						       s<<9,
1404						       conf->tmppage, READ);
1405				rdev_dec_pending(rdev, mddev);
1406				rcu_read_lock();
1407				if (success)
1408					break;
1409			}
1410			sl++;
1411			if (sl == conf->copies)
1412				sl = 0;
1413		} while (!success && sl != r10_bio->read_slot);
1414		rcu_read_unlock();
1415
1416		if (!success) {
1417			/* Cannot read from anywhere -- bye bye array */
1418			int dn = r10_bio->devs[r10_bio->read_slot].devnum;
1419			md_error(mddev, conf->mirrors[dn].rdev);
1420			break;
1421		}
1422
1423		start = sl;
1424		/* write it back and re-read */
1425		rcu_read_lock();
1426		while (sl != r10_bio->read_slot) {
1427			int d;
1428			if (sl==0)
1429				sl = conf->copies;
1430			sl--;
1431			d = r10_bio->devs[sl].devnum;
1432			rdev = rcu_dereference(conf->mirrors[d].rdev);
1433			if (rdev &&
1434			    test_bit(In_sync, &rdev->flags)) {
1435				atomic_inc(&rdev->nr_pending);
1436				rcu_read_unlock();
1437				atomic_add(s, &rdev->corrected_errors);
1438				if (sync_page_io(rdev->bdev,
1439						 r10_bio->devs[sl].addr +
1440						 sect + rdev->data_offset,
1441						 s<<9, conf->tmppage, WRITE)
1442				    == 0)
1443					/* Well, this device is dead */
1444					md_error(mddev, rdev);
1445				rdev_dec_pending(rdev, mddev);
1446				rcu_read_lock();
1447			}
1448		}
1449		sl = start;
1450		while (sl != r10_bio->read_slot) {
1451			int d;
1452			if (sl==0)
1453				sl = conf->copies;
1454			sl--;
1455			d = r10_bio->devs[sl].devnum;
1456			rdev = rcu_dereference(conf->mirrors[d].rdev);
1457			if (rdev &&
1458			    test_bit(In_sync, &rdev->flags)) {
1459				char b[BDEVNAME_SIZE];
1460				atomic_inc(&rdev->nr_pending);
1461				rcu_read_unlock();
1462				if (sync_page_io(rdev->bdev,
1463						 r10_bio->devs[sl].addr +
1464						 sect + rdev->data_offset,
1465						 s<<9, conf->tmppage, READ) == 0)
1466					/* Well, this device is dead */
1467					md_error(mddev, rdev);
1468				else
1469					printk(KERN_INFO
1470					       "raid10:%s: read error corrected"
1471					       " (%d sectors at %llu on %s)\n",
1472					       mdname(mddev), s,
1473					       (unsigned long long)(sect+
1474					            rdev->data_offset),
1475					       bdevname(rdev->bdev, b));
1476
1477				rdev_dec_pending(rdev, mddev);
1478				rcu_read_lock();
1479			}
1480		}
1481		rcu_read_unlock();
1482
1483		sectors -= s;
1484		sect += s;
1485	}
1486}
1487
1488static void raid10d(mddev_t *mddev)
1489{
1490	r10bio_t *r10_bio;
1491	struct bio *bio;
1492	unsigned long flags;
1493	conf_t *conf = mddev_to_conf(mddev);
1494	struct list_head *head = &conf->retry_list;
1495	int unplug=0;
1496	mdk_rdev_t *rdev;
1497
1498	md_check_recovery(mddev);
1499
1500	for (;;) {
1501		char b[BDEVNAME_SIZE];
1502		spin_lock_irqsave(&conf->device_lock, flags);
1503
1504		if (conf->pending_bio_list.head) {
1505			bio = bio_list_get(&conf->pending_bio_list);
1506			blk_remove_plug(mddev->queue);
1507			spin_unlock_irqrestore(&conf->device_lock, flags);
1508			/* flush any pending bitmap writes to disk before proceeding w/ I/O */
1509			if (bitmap_unplug(mddev->bitmap) != 0)
1510				printk("%s: bitmap file write failed!\n", mdname(mddev));
1511
1512			while (bio) { /* submit pending writes */
1513				struct bio *next = bio->bi_next;
1514				bio->bi_next = NULL;
1515				generic_make_request(bio);
1516				bio = next;
1517			}
1518			unplug = 1;
1519
1520			continue;
1521		}
1522
1523		if (list_empty(head))
1524			break;
1525		r10_bio = list_entry(head->prev, r10bio_t, retry_list);
1526		list_del(head->prev);
1527		conf->nr_queued--;
1528		spin_unlock_irqrestore(&conf->device_lock, flags);
1529
1530		mddev = r10_bio->mddev;
1531		conf = mddev_to_conf(mddev);
1532		if (test_bit(R10BIO_IsSync, &r10_bio->state)) {
1533			sync_request_write(mddev, r10_bio);
1534			unplug = 1;
1535		} else 	if (test_bit(R10BIO_IsRecover, &r10_bio->state)) {
1536			recovery_request_write(mddev, r10_bio);
1537			unplug = 1;
1538		} else {
1539			int mirror;
1540			/* we got a read error. Maybe the drive is bad.  Maybe just
1541			 * this block is bad and we can fix it.
1542			 * We freeze all other IO, and try reading the block from
1543			 * other devices.  When we find one, we re-write it
1544			 * and check whether that fixes the read error.
1545			 * This is all done synchronously while the array is
1546			 * frozen.
1547			 */
1548			if (mddev->ro == 0) {
1549				freeze_array(conf);
1550				fix_read_error(conf, mddev, r10_bio);
1551				unfreeze_array(conf);
1552			}
1553
1554			bio = r10_bio->devs[r10_bio->read_slot].bio;
1555			r10_bio->devs[r10_bio->read_slot].bio =
1556				mddev->ro ? IO_BLOCKED : NULL;
1557			bio_put(bio);
1558			mirror = read_balance(conf, r10_bio);
1559			if (mirror == -1) {
1560				printk(KERN_ALERT "raid10: %s: unrecoverable I/O"
1561				       " read error for block %llu\n",
1562				       bdevname(bio->bi_bdev,b),
1563				       (unsigned long long)r10_bio->sector);
1564				raid_end_bio_io(r10_bio);
1565			} else {
1566				const int do_sync = bio_sync(r10_bio->master_bio);
1567				rdev = conf->mirrors[mirror].rdev;
1568				if (printk_ratelimit())
1569					printk(KERN_ERR "raid10: %s: redirecting sector %llu to"
1570					       " another mirror\n",
1571					       bdevname(rdev->bdev,b),
1572					       (unsigned long long)r10_bio->sector);
1573				bio = bio_clone(r10_bio->master_bio, GFP_NOIO);
1574				r10_bio->devs[r10_bio->read_slot].bio = bio;
1575				bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr
1576					+ rdev->data_offset;
1577				bio->bi_bdev = rdev->bdev;
1578				bio->bi_rw = READ | do_sync;
1579				bio->bi_private = r10_bio;
1580				bio->bi_end_io = raid10_end_read_request;
1581				unplug = 1;
1582				generic_make_request(bio);
1583			}
1584		}
1585	}
1586	spin_unlock_irqrestore(&conf->device_lock, flags);
1587	if (unplug)
1588		unplug_slaves(mddev);
1589}
1590
1591
1592static int init_resync(conf_t *conf)
1593{
1594	int buffs;
1595
1596	buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
1597	BUG_ON(conf->r10buf_pool);
1598	conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
1599	if (!conf->r10buf_pool)
1600		return -ENOMEM;
1601	conf->next_resync = 0;
1602	return 0;
1603}
1604
1605/*
1606 * perform a "sync" on one "block"
1607 *
1608 * We need to make sure that no normal I/O request - particularly write
1609 * requests - conflict with active sync requests.
1610 *
1611 * This is achieved by tracking pending requests and a 'barrier' concept
1612 * that can be installed to exclude normal IO requests.
1613 *
1614 * Resync and recovery are handled very differently.
1615 * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery.
1616 *
1617 * For resync, we iterate over virtual addresses, read all copies,
1618 * and update if there are differences.  If only one copy is live,
1619 * skip it.
1620 * For recovery, we iterate over physical addresses, read a good
1621 * value for each non-in_sync drive, and over-write.
1622 *
1623 * So, for recovery we may have several outstanding complex requests for a
1624 * given address, one for each out-of-sync device.  We model this by allocating
1625 * a number of r10_bio structures, one for each out-of-sync device.
1626 * As we set up these structures, we collect all the bios together into a list
1627 * which we then process collectively to add pages, and then process again
1628 * to pass to generic_make_request.
1629 *
1630 * The r10_bio structures are linked using a borrowed master_bio pointer.
1631 * This link is counted in ->remaining.  When the r10_bio that points to NULL
1632 * has its remaining count decremented to 0, the whole complex operation
1633 * is complete.
1634 *
1635 */
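/*
 * Sketch of the chain described above, for a recovery that needs two
 * r10_bios (e.g. two out-of-sync devices at one virtual address):
 *
 *	r10_bio B ->master_bio--> r10_bio A ->master_bio--> NULL
 *
 * Allocating B increments A->remaining, so A (the one that points to
 * NULL) can only reach zero once B has completed; end_sync_write then
 * calls md_done_sync for the whole operation.
 */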
1636
1637static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
1638{
1639	conf_t *conf = mddev_to_conf(mddev);
1640	r10bio_t *r10_bio;
1641	struct bio *biolist = NULL, *bio;
1642	sector_t max_sector, nr_sectors;
1643	int disk;
1644	int i;
1645	int max_sync;
1646	int sync_blocks;
1647
1648	sector_t sectors_skipped = 0;
1649	int chunks_skipped = 0;
1650
1651	if (!conf->r10buf_pool)
1652		if (init_resync(conf))
1653			return 0;
1654
1655 skipped:
1656	max_sector = mddev->size << 1;
1657	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
1658		max_sector = mddev->resync_max_sectors;
1659	if (sector_nr >= max_sector) {
1660		/* If we aborted, we need to abort the
1661		 * sync on the 'current' bitmap chunks (there can
1662		 * be several when recovering multiple devices).
1663		 * as we may have started syncing it but not finished.
1664		 * We can find the current address in
1665		 * mddev->curr_resync, but for recovery,
1666		 * we need to convert that to several
1667		 * virtual addresses.
1668		 */
1669		if (mddev->curr_resync < max_sector) { /* aborted */
1670			if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
1671				bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
1672						&sync_blocks, 1);
1673			else for (i=0; i<conf->raid_disks; i++) {
1674				sector_t sect =
1675					raid10_find_virt(conf, mddev->curr_resync, i);
1676				bitmap_end_sync(mddev->bitmap, sect,
1677						&sync_blocks, 1);
1678			}
1679		} else /* completed sync */
1680			conf->fullsync = 0;
1681
1682		bitmap_close_sync(mddev->bitmap);
1683		close_sync(conf);
1684		*skipped = 1;
1685		return sectors_skipped;
1686	}
1687	if (chunks_skipped >= conf->raid_disks) {
1688		/* if there has been nothing to do on any drive,
1689		 * then there is nothing to do at all.
1690		 */
1691		*skipped = 1;
1692		return (max_sector - sector_nr) + sectors_skipped;
1693	}
1694
1695	/* make sure whole request will fit in a chunk - if chunks
1696	 * are meaningful
1697	 */
1698	if (conf->near_copies < conf->raid_disks &&
1699	    max_sector > (sector_nr | conf->chunk_mask))
1700		max_sector = (sector_nr | conf->chunk_mask) + 1;
1701	/*
1702	 * If there is non-resync activity waiting for us then
1703	 * put in a delay to throttle resync.
1704	 */
1705	if (!go_faster && conf->nr_waiting)
1706		msleep_interruptible(1000);
1707
1708	/* Again, very different code for resync and recovery.
1709	 * Both must result in an r10bio with a list of bios that
1710	 * have bi_end_io, bi_sector, bi_bdev set,
1711	 * and bi_private set to the r10bio.
1712	 * For recovery, we may actually create several r10bios
1713	 * with 2 bios in each, that correspond to the bios in the main one.
1714	 * In this case, the subordinate r10bios link back through a
1715	 * borrowed master_bio pointer, and the counter in the master
1716	 * includes a ref from each subordinate.
1717	 */
1718	/* First, we decide what to do and set ->bi_end_io
1719	 * to end_sync_read if we want to read, and
1720	 * end_sync_write if we will want to write.
1721	 */
1722
1723	max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
1724	if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
1725		/* recovery... the complicated one */
1726		int i, j, k;
1727		r10_bio = NULL;
1728
1729		for (i=0 ; i<conf->raid_disks; i++)
1730			if (conf->mirrors[i].rdev &&
1731			    !test_bit(In_sync, &conf->mirrors[i].rdev->flags)) {
1732				int still_degraded = 0;
1733				/* want to reconstruct this device */
1734				r10bio_t *rb2 = r10_bio;
1735				sector_t sect = raid10_find_virt(conf, sector_nr, i);
1736				int must_sync;
1737				/* Unless we are doing a full sync, we only need
1738				 * to recover the block if it is set in the bitmap
1739				 */
1740				must_sync = bitmap_start_sync(mddev->bitmap, sect,
1741							      &sync_blocks, 1);
1742				if (sync_blocks < max_sync)
1743					max_sync = sync_blocks;
1744				if (!must_sync &&
1745				    !conf->fullsync) {
1746					/* yep, skip the sync_blocks here, but don't assume
1747					 * that there will never be anything to do here
1748					 */
1749					chunks_skipped = -1;
1750					continue;
1751				}
1752
1753				r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
1754				raise_barrier(conf, rb2 != NULL);
1755				atomic_set(&r10_bio->remaining, 0);
1756
1757				r10_bio->master_bio = (struct bio*)rb2;
1758				if (rb2)
1759					atomic_inc(&rb2->remaining);
1760				r10_bio->mddev = mddev;
1761				set_bit(R10BIO_IsRecover, &r10_bio->state);
1762				r10_bio->sector = sect;
1763
1764				raid10_find_phys(conf, r10_bio);
1765				/* Need to check if this section will still be
1766				 * degraded
1767				 */
1768				for (j=0; j<conf->copies;j++) {
1769					int d = r10_bio->devs[j].devnum;
1770					if (conf->mirrors[d].rdev == NULL ||
1771					    test_bit(Faulty, &conf->mirrors[d].rdev->flags)) {
1772						still_degraded = 1;
1773						break;
1774					}
1775				}
1776				must_sync = bitmap_start_sync(mddev->bitmap, sect,
1777							      &sync_blocks, still_degraded);
1778
1779				for (j=0; j<conf->copies;j++) {
1780					int d = r10_bio->devs[j].devnum;
1781					if (conf->mirrors[d].rdev &&
1782					    test_bit(In_sync, &conf->mirrors[d].rdev->flags)) {
1783						/* This is where we read from */
1784						bio = r10_bio->devs[0].bio;
1785						bio->bi_next = biolist;
1786						biolist = bio;
1787						bio->bi_private = r10_bio;
1788						bio->bi_end_io = end_sync_read;
1789						bio->bi_rw = READ;
1790						bio->bi_sector = r10_bio->devs[j].addr +
1791							conf->mirrors[d].rdev->data_offset;
1792						bio->bi_bdev = conf->mirrors[d].rdev->bdev;
1793						atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1794						atomic_inc(&r10_bio->remaining);
1795						/* and we write to 'i' */
1796
1797						for (k=0; k<conf->copies; k++)
1798							if (r10_bio->devs[k].devnum == i)
1799								break;
1800						BUG_ON(k == conf->copies);
1801						bio = r10_bio->devs[1].bio;
1802						bio->bi_next = biolist;
1803						biolist = bio;
1804						bio->bi_private = r10_bio;
1805						bio->bi_end_io = end_sync_write;
1806						bio->bi_rw = WRITE;
1807						bio->bi_sector = r10_bio->devs[k].addr +
1808							conf->mirrors[i].rdev->data_offset;
1809						bio->bi_bdev = conf->mirrors[i].rdev->bdev;
1810
1811						r10_bio->devs[0].devnum = d;
1812						r10_bio->devs[1].devnum = i;
1813
1814						break;
1815					}
1816				}
1817				if (j == conf->copies) {
1818					/* Cannot recover, so abort the recovery */
1819					put_buf(r10_bio);
1820					r10_bio = rb2;
1821					if (!test_and_set_bit(MD_RECOVERY_ERR, &mddev->recovery))
1822						printk(KERN_INFO "raid10: %s: insufficient working devices for recovery.\n",
1823						       mdname(mddev));
1824					break;
1825				}
1826			}
1827		if (biolist == NULL) {
1828			while (r10_bio) {
1829				r10bio_t *rb2 = r10_bio;
1830				r10_bio = (r10bio_t*) rb2->master_bio;
1831				rb2->master_bio = NULL;
1832				put_buf(rb2);
1833			}
1834			goto giveup;
1835		}
1836	} else {
		/* resync: schedule a read for every block at this virtual offset */
1838		int count = 0;
1839
1840		if (!bitmap_start_sync(mddev->bitmap, sector_nr,
1841				       &sync_blocks, mddev->degraded) &&
1842		    !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
1843			/* We can skip this block */
1844			*skipped = 1;
1845			return sync_blocks + sectors_skipped;
1846		}
1847		if (sync_blocks < max_sync)
1848			max_sync = sync_blocks;
1849		r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
1850
1851		r10_bio->mddev = mddev;
1852		atomic_set(&r10_bio->remaining, 0);
1853		raise_barrier(conf, 0);
1854		conf->next_resync = sector_nr;
1855
1856		r10_bio->master_bio = NULL;
1857		r10_bio->sector = sector_nr;
1858		set_bit(R10BIO_IsSync, &r10_bio->state);
1859		raid10_find_phys(conf, r10_bio);
		r10_bio->sectors = (sector_nr | conf->chunk_mask) - sector_nr + 1;
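		/* Example (hypothetical geometry, 64KiB chunks so
		 * chunk_mask == 127): for sector_nr == 400,
		 * (400 | 127) - 400 + 1 == 112, i.e. the sectors
		 * remaining up to the end of the current chunk.
		 */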
1861
1862		for (i=0; i<conf->copies; i++) {
1863			int d = r10_bio->devs[i].devnum;
1864			bio = r10_bio->devs[i].bio;
1865			bio->bi_end_io = NULL;
1866			clear_bit(BIO_UPTODATE, &bio->bi_flags);
1867			if (conf->mirrors[d].rdev == NULL ||
1868			    test_bit(Faulty, &conf->mirrors[d].rdev->flags))
1869				continue;
1870			atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1871			atomic_inc(&r10_bio->remaining);
1872			bio->bi_next = biolist;
1873			biolist = bio;
1874			bio->bi_private = r10_bio;
1875			bio->bi_end_io = end_sync_read;
1876			bio->bi_rw = READ;
1877			bio->bi_sector = r10_bio->devs[i].addr +
1878				conf->mirrors[d].rdev->data_offset;
1879			bio->bi_bdev = conf->mirrors[d].rdev->bdev;
1880			count++;
1881		}
1882
1883		if (count < 2) {
1884			for (i=0; i<conf->copies; i++) {
1885				int d = r10_bio->devs[i].devnum;
1886				if (r10_bio->devs[i].bio->bi_end_io)
1887					rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1888			}
1889			put_buf(r10_bio);
1890			biolist = NULL;
1891			goto giveup;
1892		}
1893	}
1894
1895	for (bio = biolist; bio ; bio=bio->bi_next) {
1896
1897		bio->bi_flags &= ~(BIO_POOL_MASK - 1);
1898		if (bio->bi_end_io)
1899			bio->bi_flags |= 1 << BIO_UPTODATE;
1900		bio->bi_vcnt = 0;
1901		bio->bi_idx = 0;
1902		bio->bi_phys_segments = 0;
1903		bio->bi_hw_segments = 0;
1904		bio->bi_size = 0;
1905	}
1906
1907	nr_sectors = 0;
1908	if (sector_nr + max_sync < max_sector)
1909		max_sector = sector_nr + max_sync;
1910	do {
1911		struct page *page;
1912		int len = PAGE_SIZE;
1913		disk = 0;
1914		if (sector_nr + (len>>9) > max_sector)
1915			len = (max_sector - sector_nr) << 9;
1916		if (len == 0)
1917			break;
		for (bio = biolist; bio; bio = bio->bi_next) {
1919			page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
1920			if (bio_add_page(bio, page, len, 0) == 0) {
1921				/* stop here */
1922				struct bio *bio2;
1923				bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
1924				for (bio2 = biolist; bio2 && bio2 != bio; bio2 = bio2->bi_next) {
1925					/* remove last page from this bio */
1926					bio2->bi_vcnt--;
1927					bio2->bi_size -= len;
1928					bio2->bi_flags &= ~(1<< BIO_SEG_VALID);
1929				}
1930				goto bio_full;
1931			}
1932			disk = i;
1933		}
1934		nr_sectors += len>>9;
1935		sector_nr += len>>9;
1936	} while (biolist->bi_vcnt < RESYNC_PAGES);
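	/* If bio_add_page() refuses a page for any bio in the list, the
	 * page just added to the earlier bios is backed out inside the
	 * loop, so every bio still describes the same number of sectors
	 * whichever way we reach bio_full.
	 */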
1937 bio_full:
1938	r10_bio->sectors = nr_sectors;
1939
1940	while (biolist) {
1941		bio = biolist;
1942		biolist = biolist->bi_next;
1943
1944		bio->bi_next = NULL;
1945		r10_bio = bio->bi_private;
1946		r10_bio->sectors = nr_sectors;
1947
1948		if (bio->bi_end_io == end_sync_read) {
1949			md_sync_acct(bio->bi_bdev, nr_sectors);
1950			generic_make_request(bio);
1951		}
1952	}
1953
1954	if (sectors_skipped)
		/* Pretend they weren't skipped; it makes no
		 * important difference in this case.
		 */
1958		md_done_sync(mddev, sectors_skipped, 1);
1959
1960	return sectors_skipped + nr_sectors;
1961 giveup:
	/* There is nowhere to write, so all non-sync
	 * drives must have failed.  Try the next chunk...
	 */
1965	{
1966	sector_t sec = max_sector - sector_nr;
1967	sectors_skipped += sec;
1968	chunks_skipped ++;
1969	sector_nr = max_sector;
1970	goto skipped;
1971	}
1972}
1973
1974static int run(mddev_t *mddev)
1975{
1976	conf_t *conf;
1977	int i, disk_idx;
1978	mirror_info_t *disk;
1979	mdk_rdev_t *rdev;
1980	struct list_head *tmp;
1981	int nc, fc, fo;
1982	sector_t stride, size;
1983
1984	if (mddev->chunk_size == 0) {
1985		printk(KERN_ERR "md/raid10: non-zero chunk size required.\n");
1986		return -EINVAL;
1987	}
1988
1989	nc = mddev->layout & 255;
1990	fc = (mddev->layout >> 8) & 255;
1991	fo = mddev->layout & (1<<16);
	if ((nc*fc) < 2 || (nc*fc) > mddev->raid_disks ||
	    (mddev->layout >> 17)) {
		printk(KERN_ERR "raid10: %s: unsupported raid10 layout: 0x%08x\n",
1995		       mdname(mddev), mddev->layout);
1996		goto out;
1997	}
1998	/*
1999	 * copy the already verified devices into our private RAID10
2000	 * bookkeeping area. [whatever we allocate in run(),
2001	 * should be freed in stop()]
2002	 */
2003	conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
2004	mddev->private = conf;
2005	if (!conf) {
2006		printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
2007			mdname(mddev));
2008		goto out;
2009	}
2010	conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
2011				 GFP_KERNEL);
2012	if (!conf->mirrors) {
2013		printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
2014		       mdname(mddev));
2015		goto out_free_conf;
2016	}
2017
2018	conf->tmppage = alloc_page(GFP_KERNEL);
2019	if (!conf->tmppage)
2020		goto out_free_conf;
2021
2022	conf->mddev = mddev;
2023	conf->raid_disks = mddev->raid_disks;
2024	conf->near_copies = nc;
2025	conf->far_copies = fc;
2026	conf->copies = nc*fc;
2027	conf->far_offset = fo;
2028	conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1;
2029	conf->chunk_shift = ffz(~mddev->chunk_size) - 9;
2030	size = mddev->size >> (conf->chunk_shift-1);
2031	sector_div(size, fc);
2032	size = size * conf->raid_disks;
2033	sector_div(size, nc);
2034	/* 'size' is now the number of chunks in the array */
2035	/* calculate "used chunks per device" in 'stride' */
2036	stride = size * conf->copies;
2037
2038	/* We need to round up when dividing by raid_disks to
2039	 * get the stride size.
2040	 */
2041	stride += conf->raid_disks - 1;
2042	sector_div(stride, conf->raid_disks);
2043	mddev->size = stride  << (conf->chunk_shift-1);
2044
2045	if (fo)
2046		stride = 1;
2047	else
2048		sector_div(stride, fc);
2049	conf->stride = stride << conf->chunk_shift;
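	/* Worked example with hypothetical numbers: raid_disks=4, nc=2,
	 * fc=1, 64KiB chunks (chunk_shift=7), 1000 chunks per device.
	 * 'size' becomes 1000/1*4/2 = 2000 data chunks in the array;
	 * 'stride' becomes (2000*2 + 3)/4 = 1000 chunks used per device,
	 * and conf->stride = 1000 << 7 sectors per far-copy section.
	 */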
2050
2051	conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
2052						r10bio_pool_free, conf);
2053	if (!conf->r10bio_pool) {
2054		printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
2055			mdname(mddev));
2056		goto out_free_conf;
2057	}
2058
2059	ITERATE_RDEV(mddev, rdev, tmp) {
2060		disk_idx = rdev->raid_disk;
2061		if (disk_idx >= mddev->raid_disks
2062		    || disk_idx < 0)
2063			continue;
2064		disk = conf->mirrors + disk_idx;
2065
2066		disk->rdev = rdev;
2067
2068		blk_queue_stack_limits(mddev->queue,
2069				       rdev->bdev->bd_disk->queue);
		/* As we don't honour merge_bvec_fn, we must never risk
		 * violating it, so limit ->max_sectors to one PAGE, as
		 * a one-page request can never be in violation.
		 */
2074		if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
2075		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
2076			mddev->queue->max_sectors = (PAGE_SIZE>>9);
2077
2078		disk->head_position = 0;
2079	}
2080	spin_lock_init(&conf->device_lock);
2081	INIT_LIST_HEAD(&conf->retry_list);
2082
2083	spin_lock_init(&conf->resync_lock);
2084	init_waitqueue_head(&conf->wait_barrier);
2085
2086	/* need to check that every block has at least one working mirror */
2087	if (!enough(conf)) {
2088		printk(KERN_ERR "raid10: not enough operational mirrors for %s\n",
2089		       mdname(mddev));
2090		goto out_free_conf;
2091	}
2092
2093	mddev->degraded = 0;
2094	for (i = 0; i < conf->raid_disks; i++) {
2095
2096		disk = conf->mirrors + i;
2097
2098		if (!disk->rdev ||
2099		    !test_bit(In_sync, &disk->rdev->flags)) {
2100			disk->head_position = 0;
2101			mddev->degraded++;
2102		}
2103	}
2104
2105
2106	mddev->thread = md_register_thread(raid10d, mddev, "%s_raid10");
2107	if (!mddev->thread) {
2108		printk(KERN_ERR
2109		       "raid10: couldn't allocate thread for %s\n",
2110		       mdname(mddev));
2111		goto out_free_conf;
2112	}
2113
2114	printk(KERN_INFO
2115		"raid10: raid set %s active with %d out of %d devices\n",
2116		mdname(mddev), mddev->raid_disks - mddev->degraded,
2117		mddev->raid_disks);
2118	/*
2119	 * Ok, everything is just fine now
2120	 */
2121	mddev->array_size = size << (conf->chunk_shift-1);
2122	mddev->resync_max_sectors = size << conf->chunk_shift;
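	/* array_size is in 1KiB units (hence chunk_shift-1) while
	 * resync_max_sectors is in 512-byte sectors (chunk_shift), so
	 * both describe the same virtual span of 'size' chunks.
	 */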
2123
2124	mddev->queue->unplug_fn = raid10_unplug;
2125	mddev->queue->issue_flush_fn = raid10_issue_flush;
2126	mddev->queue->backing_dev_info.congested_fn = raid10_congested;
2127	mddev->queue->backing_dev_info.congested_data = mddev;
2128
	/* Calculate the maximum read-ahead size.
	 * We need to read ahead by at least twice a whole stripe,
	 * possibly more.
	 */
2133	{
2134		int stripe = conf->raid_disks * (mddev->chunk_size / PAGE_SIZE);
2135		stripe /= conf->near_copies;
		if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
			mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
2138	}
2139
2140	if (conf->near_copies < mddev->raid_disks)
2141		blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
2142	return 0;
2143
2144out_free_conf:
2145	if (conf->r10bio_pool)
2146		mempool_destroy(conf->r10bio_pool);
2147	safe_put_page(conf->tmppage);
2148	kfree(conf->mirrors);
2149	kfree(conf);
2150	mddev->private = NULL;
2151out:
2152	return -EIO;
2153}
2154
2155static int stop(mddev_t *mddev)
2156{
2157	conf_t *conf = mddev_to_conf(mddev);
2158
2159	md_unregister_thread(mddev->thread);
2160	mddev->thread = NULL;
2161	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
2162	if (conf->r10bio_pool)
2163		mempool_destroy(conf->r10bio_pool);
2164	kfree(conf->mirrors);
2165	kfree(conf);
2166	mddev->private = NULL;
2167	return 0;
2168}
2169
2170static void raid10_quiesce(mddev_t *mddev, int state)
2171{
2172	conf_t *conf = mddev_to_conf(mddev);
2173
2174	switch(state) {
2175	case 1:
2176		raise_barrier(conf, 0);
2177		break;
2178	case 0:
2179		lower_barrier(conf);
2180		break;
2181	}
2182	if (mddev->thread) {
2183		if (mddev->bitmap)
2184			mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
2185		else
2186			mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
2187		md_wakeup_thread(mddev->thread);
2188	}
2189}
2190
2191static struct mdk_personality raid10_personality =
2192{
2193	.name		= "raid10",
2194	.level		= 10,
2195	.owner		= THIS_MODULE,
2196	.make_request	= make_request,
2197	.run		= run,
2198	.stop		= stop,
2199	.status		= status,
2200	.error_handler	= error,
2201	.hot_add_disk	= raid10_add_disk,
2202	.hot_remove_disk= raid10_remove_disk,
2203	.spare_active	= raid10_spare_active,
2204	.sync_request	= sync_request,
2205	.quiesce	= raid10_quiesce,
2206};
2207
2208static int __init raid_init(void)
2209{
2210	return register_md_personality(&raid10_personality);
2211}
2212
2213static void raid_exit(void)
2214{
2215	unregister_md_personality(&raid10_personality);
2216}
2217
2218module_init(raid_init);
2219module_exit(raid_exit);
2220MODULE_LICENSE("GPL");
2221MODULE_ALIAS("md-personality-9"); /* RAID10 */
2222MODULE_ALIAS("md-raid10");
2223MODULE_ALIAS("md-level-10");
2224