1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License").  You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22/*
23 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#pragma ident	"%Z%%M%	%I%	%E% SMI"
28
29/*
30 * NAME:	raid_resync.c
31 * DESCRIPTION: RAID driver source file containing routines related to resync
32 *		operation.
33 * ROUTINES PROVIDED FOR EXTERNAL USE:
34 *	   resync_request() - get resync lock if available
35 *	   release_resync_request() - relinquish resync lock
36 *	   erred_check_line() - provide write instruction for erred column
37 *	     init_pw_area() - initialize pre-write area
38 *	     copy_pw_area() - copy pre-write area from one device to another
39 */
40
41#include <sys/param.h>
42#include <sys/systm.h>
43#include <sys/conf.h>
44#include <sys/file.h>
45#include <sys/user.h>
46#include <sys/uio.h>
47#include <sys/t_lock.h>
48#include <sys/buf.h>
49#include <sys/dkio.h>
50#include <sys/vtoc.h>
51#include <sys/kmem.h>
52#include <vm/page.h>
53#include <sys/sysmacros.h>
54#include <sys/types.h>
55#include <sys/mkdev.h>
56#include <sys/stat.h>
57#include <sys/open.h>
58#include <sys/disp.h>
59#include <sys/modctl.h>
60#include <sys/ddi.h>
61#include <sys/sunddi.h>
62#include <sys/lvm/md_raid.h>
63
64#include <sys/sysevent/eventdefs.h>
65#include <sys/sysevent/svm.h>
66
67#define	NOCOLUMN	(-1)
68
69extern md_set_t		md_set[];
70extern kmem_cache_t	*raid_child_cache;
71extern kmem_cache_t	*raid_parent_cache;
72extern md_resync_t	md_cpr_resync;
73extern major_t		md_major;
74extern void		raid_parent_init(md_raidps_t *ps);
75extern void		raid_child_init(md_raidcs_t *ps);
76
77/*
78 * NAMES:	xor
79 * DESCRIPTION: Xor two chunks of data together.  The data referenced by
80 *		addr1 and addr2 are xor'd together for size and written into
81 *		addr1.
82 * PARAMETERS:	caddr_t addr1 - address of first chunk of data and destination
83 *		caddr_t addr2 - address of second chunk of data
84 *		u_int	 size - number to xor
85 */
86static void
87xor(caddr_t addr1, caddr_t addr2, size_t size)
88{
89	while (size--) {
90		*addr1++ ^= *addr2++;
91	}
92}
93
94/*
95 * NAME:	release_resync_request
96 *
97 * DESCRIPTION: Release resync active flag and reset unit values accordingly.
98 *
99 * PARAMETERS:	minor_t	    mnum - minor number identity of metadevice
100 *
101 * LOCKS:	Expects Unit Writer Lock to be held across call.
102 */
103void
104release_resync_request(
105	minor_t		mnum
106)
107{
108	mr_unit_t	*un;
109
110	un = MD_UNIT(mnum);
111	ASSERT(un != NULL);
112
113	un->c.un_status &= ~MD_UN_RESYNC_ACTIVE;
114
115	un->un_column[un->un_resync_index].un_devflags &= ~MD_RAID_RESYNC;
116	un->un_column[un->un_resync_index].un_devflags &= ~MD_RAID_RESYNC_ERRED;
117	un->un_column[un->un_resync_index].un_devflags &=
118	    ~(MD_RAID_COPY_RESYNC | MD_RAID_REGEN_RESYNC);
119
120	un->un_resync_line_index = 0;
121	un->un_resync_index = NOCOLUMN;
122}
123
124/*
125 * NAME:	resync_request
126 *
127 * DESCRIPTION: Request resync.	 If resync is available (no current active
128 *		resync), mark unit as resync active and initialize.
129 *
130 * PARAMETERS:	minor_t	    mnum - minor number identity of metadevice
131 *		int column_index - index of column to resync
132 *		int	copysize - copysize of ioctl request
133 *		md_error_t   *ep - error output parameter
134 *
135 * RETURN:	0 if resync is available, 1 otherwise.
136 *
137 * LOCKS:	Expects Unit Writer Lock to be held across call.
138 *
139 * NOTE:	Sets un_resync_copysize to the input value in copysize, the
140 *		existing value from an incomplete previous resync with an
141 *		input value in copysize, or the lesser of the unit segment
142 *		size or maxio.
143 */
/* ARGSUSED */
int
resync_request(
	minor_t		mnum,
	int		column_index,
	size_t		copysize,
	md_error_t	*mde
)
{
	mr_unit_t	*un;

	un = MD_UNIT(mnum);
	ASSERT(un != NULL);

	/*
	 * Refuse the request if this column is not already marked for
	 * resync and the unit is busy: another resync is active, a grow
	 * is pending, or the column itself is mid-resync.
	 */
	if (! (un->un_column[column_index].un_devflags & MD_RAID_RESYNC) &&
	    ((un->c.un_status & MD_UN_RESYNC_ACTIVE) ||
	    (un->c.un_status & MD_UN_GROW_PENDING) ||
	    (un->un_column[column_index].un_devstate & RCS_RESYNC))) {
		if (mde)
			return (mdmderror(mde, MDE_GROW_DELAYED, mnum));
		return (1);
	}

	/* remember whether the column was erred when resync was requested */
	if (un->un_column[column_index].un_devstate &
	    (RCS_ERRED | RCS_LAST_ERRED))
		un->un_column[column_index].un_devflags |= MD_RAID_DEV_ERRED;
	else
		un->un_column[column_index].un_devflags &= ~MD_RAID_DEV_ERRED;
	/* mark the unit resync-active and make this column the target */
	un->c.un_status |= MD_UN_RESYNC_ACTIVE;
	un->un_resync_index = column_index;
	un->un_resync_line_index = 0;
	raid_set_state(un, column_index, RCS_RESYNC, 0);

	return (0);
}
180
181/*
182 * Name:	alloc_bufs
183 *
184 * DESCRIPTION: Initialize resync_comp buffers.
185 *
186 * PARAMETERS:	size_t	   bsize - size of buffer
187 *		buf_t *read_buf1 - first read buf
188 *		buf_t *read_buf2 - second read buf
189 *		buf_t *write_buf - write buf
190 */
static void
alloc_bufs(md_raidcs_t *cs, size_t bsize)
{
	/*
	 * Allocate the data and parity buffers for this child; both are
	 * zero-filled and may block for memory (KM_SLEEP).
	 */
	cs->cs_dbuffer = kmem_zalloc(bsize, KM_SLEEP);
	cs->cs_pbuffer = kmem_zalloc(bsize, KM_SLEEP);
}
198
199void
200init_buf(buf_t *bp, int flags, size_t size)
201{
202	/* zero buf */
203	bzero((caddr_t)bp, sizeof (buf_t));
204
205	/* set b_back and b_forw to point back to buf */
206	bp->b_back = bp;
207	bp->b_forw = bp;
208
209	/* set flags size */
210	bp->b_flags = flags;
211	bp->b_bufsize = size;
212	bp->b_offset = -1;
213
214	/* setup semaphores */
215	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
216	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
217}
218
/*
 * Release the semaphores set up by init_buf().  Must be called before a
 * buf initialized with init_buf() goes out of scope or is reused.
 */
void
destroy_buf(buf_t *bp)
{
	sema_destroy(&bp->b_io);
	sema_destroy(&bp->b_sem);
}
225
/*
 * Tear down and re-initialize a buf so it can be reused for another I/O
 * with the given flags and buffer size.
 */
void
reset_buf(buf_t *bp, int flags, size_t size)
{
	destroy_buf(bp);
	init_buf(bp, flags, size);
}
232
233/*
234 * NAME:	free_bufs
235 *
236 * DESCRIPTION: Free up buffers.
237 *
238 * PARAMETERS:	size_t	   bsize - size of buffer
239 *		buf_t *read_buf1 - first read buf
240 *		buf_t *read_buf2 - second read buf
241 *		buf_t *write_buf - write buf
242 */
static void
free_bufs(size_t bsize, md_raidcs_t *cs)
{
	/* bsize must match the size originally passed to alloc_bufs() */
	kmem_free(cs->cs_dbuffer, bsize);
	kmem_free(cs->cs_pbuffer, bsize);
}
249
250/*
251 * NAME:	init_pw_area
252 *
253 * DESCRIPTION: Initialize pre-write area to all zeros.
254 *
255 * PARAMETERS:	minor_t	      mnum      - minor number identity of metadevice
256 *		md_dev64_t dev_to_write - index of column to resync
257 *		int   column_index      - index of column to resync
258 *
259 * RETURN:	1 if write error on resync device, otherwise 0
260 *
261 * LOCKS:	Expects Unit Reader Lock to be held across call.
262 */
int
init_pw_area(
	mr_unit_t *un,
	md_dev64_t dev_to_write,
	diskaddr_t pwstart,
	uint_t	col
)
{
	buf_t	buf;
	caddr_t	databuffer;
	size_t	copysize;
	size_t	bsize;
	int	error = 0;
	int	i;

	ASSERT(un != NULL);
	ASSERT(un->un_column[col].un_devflags & MD_RAID_DEV_ISOPEN);

	bsize = un->un_iosize;		/* I/O size in disk blocks */
	copysize = dbtob(bsize);	/* same size in bytes */
	databuffer = kmem_zalloc(copysize, KM_SLEEP);
	init_buf(&buf, (B_BUSY | B_WRITE), copysize);

	/* write one zero-filled header per pre-write slot */
	for (i = 0; i < un->un_pwcnt; i++) {
		/* magic field is 0 for 4.0 compatibility */
		RAID_FILLIN_RPW(databuffer, un, 0, 0,
				0, 0, 0,
				0, col, 0);
		buf.b_un.b_addr = (caddr_t)databuffer;
		buf.b_edev = md_dev64_to_dev(dev_to_write);
		buf.b_bcount = dbtob(bsize);
		/* each slot is un_iosize blocks past the previous one */
		buf.b_lblkno = pwstart + (i * un->un_iosize);

		/* write buf */
		(void) md_call_strategy(&buf, MD_STR_NOTTOP, NULL);

		if (biowait(&buf)) {
			/* abort on first write error */
			error = 1;
			break;
		}
		/* recycle the buf for the next slot */
		reset_buf(&buf, (B_BUSY | B_WRITE), copysize);
	} /* for */

	destroy_buf(&buf);
	kmem_free(databuffer, copysize);

	return (error);
}
311
312/*
313 * NAME:	raid_open_alt
314 *
315 * DESCRIPTION: opens the alt device used during resync.
316 *
317 * PARAMETERS:	un
318 *
319 * RETURN:	0 - successfull
320 *		1 - failed
321 *
322 * LOCKS:	requires unit writer lock
323 */
324
static int
raid_open_alt(mr_unit_t *un, int index)
{
	mr_column_t	*column = &un->un_column[index];
	set_t		setno = MD_MIN2SET(MD_SID(un));
	side_t		side = mddb_getsidenum(setno);
	md_dev64_t	tmpdev = column->un_alt_dev;

	/* correct locks */
	ASSERT(UNIT_WRITER_HELD(un));
	/* not already writing to */
	ASSERT(! (column->un_devflags & MD_RAID_WRITE_ALT));
	/* not already open */
	ASSERT(! (column->un_devflags & MD_RAID_ALT_ISOPEN));

	if (tmpdev != NODEV64) {
		/*
		 * Open by device id. We use orig_key since alt_dev
		 * has been set by the caller to be the same as orig_dev.
		 * Only resolve non-metadevices with a registered devid.
		 */
		if ((md_getmajor(tmpdev) != md_major) &&
			md_devid_found(setno, side, column->un_orig_key) == 1) {
			tmpdev = md_resolve_bydevid(MD_SID(un), tmpdev,
				column->un_orig_key);
		}
		if (md_layered_open(MD_SID(un), &tmpdev, MD_OFLG_NULL)) {
			/* failed open; store the (possibly resolved) dev */
			column->un_alt_dev = tmpdev;
			return (1);
		} else {
			/* open succeeded */
			column->un_alt_dev = tmpdev;
			column->un_devflags |= MD_RAID_ALT_ISOPEN;
			return (0);
		}
	} else
		/* no alt device to open */
		return (1);
}
364
365
366/*
367 * NAME:	raid_close_alt
368 *
369 * DESCRIPTION: closes the alt device used during resync.
370 *
371 * PARAMETERS:	un - raid unit structure
372 *		indes - raid column
373 *
374 * RETURN:	none
375 *
376 * LOCKS:	requires unit writer lock
377 */
378
379static void
380raid_close_alt(mr_unit_t *un, int index)
381{
382	mr_column_t	*column = &un->un_column[index];
383	md_dev64_t	tmpdev = column->un_alt_dev;
384
385	ASSERT(UNIT_WRITER_HELD(un));	/* correct locks */
386	ASSERT(! (column->un_devflags & MD_RAID_WRITE_ALT)); /* not writing */
387	ASSERT(column->un_devflags & MD_RAID_ALT_ISOPEN); /* already open */
388	ASSERT(tmpdev != NODEV64); /* is a device */
389
390	md_layered_close(column->un_alt_dev, MD_OFLG_NULL);
391	column->un_devflags &= ~MD_RAID_ALT_ISOPEN;
392	column->un_alt_dev = NODEV64;
393}
394
/*
 * Fill in the child save structure with the block range covered by
 * line_count lines starting at line, then acquire the line reader lock
 * (resync flavor) for that range.  Returns the first line past the
 * filled-in region.
 *
 * NOTE(review): resync_comp asserts un_resync_line_index equals the
 * returned value after this call; that update presumably happens inside
 * raid_line_reader_lock() — confirm against its implementation.
 */
static diskaddr_t
raid_resync_fillin_cs(diskaddr_t line, uint_t line_count, md_raidcs_t *cs)
{
	mr_unit_t	*un = cs->cs_un;

	ASSERT(line < un->un_segsincolumn);

	cs->cs_line = line;
	/* convert line/segment units into block addresses */
	cs->cs_blkno = line * un->un_segsize;
	cs->cs_blkcnt = un->un_segsize * line_count;
	cs->cs_lastblk = cs->cs_blkno + cs->cs_blkcnt - 1;
	raid_line_reader_lock(cs, 1);

	return (line + line_count);
}
410
411/* states returned by raid_resync_line */
412
413#define	RAID_RESYNC_OKAY	0
414#define	RAID_RESYNC_RDERROR	2
415#define	RAID_RESYNC_WRERROR	3
416#define	RAID_RESYNC_STATE	4
417
/*
 * Resync one region (line_count lines starting at line) of the unit.
 *
 * In single_read mode the data is copied straight from the hotspare on
 * the resync column; otherwise the data is regenerated by xor-ing
 * together every non-resync column.  Either way the result is written
 * to dev_to_write starting at write_dev_start.
 *
 * Returns one of the RAID_RESYNC_* codes defined above; *hs_state,
 * *err_col and *single_read are updated to reflect errors encountered.
 *
 * NOTE(review): readb2 is initialized and its semaphores destroyed at
 * the end, but all reads below actually go through readb1 — confirm
 * whether readb2 is intentionally unused here.
 */
int
raid_resync_region(
	md_raidcs_t	*cs,
	diskaddr_t	line,
	uint_t		line_count,
	int		*single_read,
	hs_cmds_t	*hs_state,
	int		*err_col,
	md_dev64_t	dev_to_write,
	diskaddr_t	write_dev_start)
{
	mr_unit_t 	*un = cs->cs_un;
	buf_t		*readb1 = &cs->cs_pbuf;
	buf_t		*readb2 = &cs->cs_dbuf;
	buf_t		*writeb = &cs->cs_hbuf;
	diskaddr_t	off;
	size_t		tcopysize;
	size_t		copysize;
	int 		resync;
	int		quit = 0;
	size_t		leftinseg;
	int		i;

	resync = un->un_resync_index;
	off = line * un->un_segsize;
	copysize = un->un_resync_copysize;

	/* find first column to read, skip resync column */

	leftinseg = un->un_segsize * line_count;
	while (leftinseg) {

		/* truncate last chunk to end if needed */
		if (copysize > leftinseg)
			tcopysize = leftinseg;
		else
			tcopysize = copysize;
		leftinseg -= tcopysize;

		/*
		 * One of two scenarios:
		 * 1) resync device with hotspare ok.  This implies that
		 *    we are copying from a good hotspare to a new good original
		 *    device.  In this case readb1 is used as the buf for
		 *    the read from the hotspare device.
		 * 2) For all other cases, including when in case 1) and an
		 *    error is detected on the (formerly good) hotspare device,
		 *    readb1 is used for the initial read.  readb2 is used for
		 *    all other reads.	Each readb2 buffer is xor'd into the
		 *    readb1 buffer.
		 *
		 * In both cases, writeb is used for the write, using readb1's
		 * buffer.
		 *
		 * For case 2, we could alternatively perform the read for all
		 * devices concurrently to improve performance.	 However,
		 * this could diminish performance for concurrent reads and
		 * writes if low on memory.
		 */

		/* read first buffer */

		/* switch to read from good columns if single_read */
		if (*single_read) {
			if (un->un_column[resync].un_dev == NODEV64)
				return (RAID_RESYNC_RDERROR);

			reset_buf(readb1, B_READ | B_BUSY,
			    dbtob(copysize));
			readb1->b_bcount = dbtob(tcopysize);
			readb1->b_un.b_addr = cs->cs_pbuffer;
			readb1->b_edev = md_dev64_to_dev(
						un->un_column[resync].un_dev);
			readb1->b_lblkno =
			    un->un_column[resync].un_devstart + off;
			(void) md_call_strategy(readb1, MD_STR_NOTTOP, NULL);
			if (biowait(readb1)) {
				/*
				 * at this point just start rebuilding the
				 * data and go on since the other column
				 * are ok.
				 */
				*single_read = 0;
				*hs_state = HS_BAD;
				un->un_column[resync].un_devflags &=
				    ~MD_RAID_COPY_RESYNC;
				un->un_column[resync].un_devflags |=
				    MD_RAID_REGEN_RESYNC;
			}
		}

		/* if reading from all non-resync columns */
		if (!*single_read) {
			/* for each column, read line and xor into write buf */
			bzero(cs->cs_pbuffer, dbtob(tcopysize));
			for (i = 0; i < un->un_totalcolumncnt; i++) {

				if (un->un_column[i].un_dev == NODEV64)
					return (RAID_RESYNC_RDERROR);

				/* skip column getting resync'ed */
				if (i == resync) {
					continue;
				}
				reset_buf(readb1, B_READ | B_BUSY,
				    dbtob(copysize));
				readb1->b_bcount = dbtob(tcopysize);
				readb1->b_un.b_addr = cs->cs_dbuffer;
				readb1->b_edev = md_dev64_to_dev(
						un->un_column[i].un_dev);
				readb1->b_lblkno =
				    un->un_column[i].un_devstart + off;

				(void) md_call_strategy(readb1, MD_STR_NOTTOP,
					NULL);
				if (biowait(readb1)) {
					*err_col = i;
					quit = RAID_RESYNC_RDERROR;
				}

				if (quit)
					return (quit);

				/* xor readb2 data into readb1 */
				xor(cs->cs_pbuffer, readb1->b_un.b_addr,
				    dbtob(tcopysize));
			} /* for */
		}

		/* write the reconstructed chunk to the target device */
		reset_buf(writeb, B_WRITE | B_BUSY,
		    dbtob(copysize));
		writeb->b_bcount = dbtob(tcopysize);
		writeb->b_un.b_addr = cs->cs_pbuffer;
		writeb->b_lblkno = off + write_dev_start;
		writeb->b_edev = md_dev64_to_dev(dev_to_write);

		/* set write block number and perform the write */
		(void) md_call_strategy(writeb, MD_STR_NOTTOP, NULL);
		if (biowait(writeb)) {
			if (*single_read == 0) {
				*hs_state = HS_BAD;
			}
			return (RAID_RESYNC_WRERROR);
		}
		writeb->b_blkno += tcopysize;
		off += tcopysize;
	} /* while */
	/* tear down the semaphores init'd by the reset_buf() calls above */
	sema_destroy(&readb1->b_io);
	sema_destroy(&readb1->b_sem);
	sema_destroy(&readb2->b_io);
	sema_destroy(&readb2->b_sem);
	sema_destroy(&writeb->b_io);
	sema_destroy(&writeb->b_sem);
	return (RAID_RESYNC_OKAY);
}
573
574/*
575 * NAME:	resync_comp
576 *
577 * DESCRIPTION: Resync the component.  Iterate through the raid unit a line at
578 *		a time, read from the good device(s) and write the resync
579 *		device.
580 *
581 * PARAMETERS:	minor_t	   mnum - minor number identity of metadevice
582 *		md_raidcs_t *cs - child save struct
583 *
584 * RETURN:	 0 - successfull
585 *		 1 - failed
586 *		-1 - aborted
587 *
588 * LOCKS:	Expects Unit Reader Lock to be held across call.  Acquires and
589 *		releases Line Reader Lock for per-line I/O.
590 */
static void
resync_comp(
	minor_t		mnum,
	md_raidcs_t	*cs
)
{
	mdi_unit_t	*ui;
	mr_unit_t	*un;
	mddb_recid_t	recids[2];
	rcs_state_t	state;
	md_dev64_t	dev_to_write;
	diskaddr_t	write_pwstart;
	diskaddr_t	write_devstart;
	md_dev64_t	dev;
	int		resync;
	int		i;
	int		single_read = 0;
	int		err;
	int		err_cnt;
	int		last_err;
	diskaddr_t	line;
	diskaddr_t	segsincolumn;
	size_t		bsize;
	uint_t		line_count;

	/*
	 * hs_state is the state of the hotspare on the column being resynced
	 * dev_state is the state of the resync target
	 */
	hs_cmds_t	hs_state;
	int		err_col = -1;
	diskaddr_t	resync_end_pos;

	ui = MDI_UNIT(mnum);
	ASSERT(ui != NULL);

	un = cs->cs_un;

	/* trade the caller's reader lock for io + unit writer locks */
	md_unit_readerexit(ui);
	un = (mr_unit_t *)md_io_writerlock(ui);
	un = (mr_unit_t *)md_unit_writerlock(ui);
	resync = un->un_resync_index;
	state = un->un_column[resync].un_devstate;
	line_count = un->un_maxio / un->un_segsize;
	if (line_count == 0) { /* handle the case of segsize > maxio */
		line_count = 1;
		bsize = un->un_maxio;
	} else
		bsize = line_count * un->un_segsize;

	un->un_resync_copysize = (uint_t)bsize;

	ASSERT(un->c.un_status & MD_UN_RESYNC_ACTIVE);
	ASSERT(un->un_column[resync].un_devflags &
	    (MD_RAID_COPY_RESYNC | MD_RAID_REGEN_RESYNC));

	/*
	 * if the column is not in resync then just bail out.
	 */
	if (! (un->un_column[resync].un_devstate & RCS_RESYNC)) {
		md_unit_writerexit(ui);
		md_io_writerexit(ui);
		un = (mr_unit_t *)md_unit_readerlock(ui);
		return;
	}
	SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_START, SVM_TAG_METADEVICE,
	    MD_UN2SET(un), MD_SID(un));

	/* identify device to write and its start block */

	if (un->un_column[resync].un_alt_dev != NODEV64) {
		/* copy-resync: write goes to the alt (replacement) device */
		if (raid_open_alt(un, resync)) {
			/* open failed: restore prior state and abort */
			raid_set_state(un, resync, state, 0);
			md_unit_writerexit(ui);
			md_io_writerexit(ui);
			un = (mr_unit_t *)md_unit_readerlock(ui);
			cmn_err(CE_WARN, "md: %s: %s open failed replace "
				"terminated", md_shortname(MD_SID(un)),
				md_devname(MD_UN2SET(un),
					un->un_column[resync].un_alt_dev,
					NULL, 0));
			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_FAILED,
			    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
			return;
		}
		ASSERT(un->un_column[resync].un_devflags & MD_RAID_COPY_RESYNC);
		dev_to_write = un->un_column[resync].un_alt_dev;
		write_devstart = un->un_column[resync].un_alt_devstart;
		write_pwstart = un->un_column[resync].un_alt_pwstart;
		if (un->un_column[resync].un_devflags & MD_RAID_DEV_ERRED) {
			single_read = 0;
			hs_state = HS_BAD;
		} else {
			hs_state = HS_FREE;
			single_read = 1;
		}
		un->un_column[resync].un_devflags |= MD_RAID_WRITE_ALT;
	} else {
		/* regen-resync: write back to the column's own device */
		dev_to_write = un->un_column[resync].un_dev;
		write_devstart = un->un_column[resync].un_devstart;
		write_pwstart = un->un_column[resync].un_pwstart;
		single_read = 0;
		hs_state = HS_FREE;
		ASSERT(un->un_column[resync].un_devflags &
		    MD_RAID_REGEN_RESYNC);
	}

	alloc_bufs(cs, dbtob(bsize));
	/* initialize pre-write area */
	if (init_pw_area(un, dev_to_write, write_pwstart, resync)) {
		un->un_column[resync].un_devflags &= ~MD_RAID_WRITE_ALT;
		if (un->un_column[resync].un_alt_dev != NODEV64) {
			raid_close_alt(un, resync);
		}
		md_unit_writerexit(ui);
		md_io_writerexit(ui);
		if (dev_to_write == un->un_column[resync].un_dev)
			hs_state = HS_BAD;
		err = RAID_RESYNC_WRERROR;
		goto resync_comp_error;
	}

	un->c.un_status &= ~MD_UN_RESYNC_CANCEL;
	segsincolumn = un->un_segsincolumn;
	/* baseline error count; any new error aborts the resync loop */
	err_cnt = raid_state_cnt(un, RCS_ERRED | RCS_LAST_ERRED);

	/* commit the record */

	md_unit_writerexit(ui);
	md_io_writerexit(ui);


	/* resync each line of the unit */
	for (line = 0; line <  segsincolumn; line += line_count) {
		/*
		 * Update address range in child struct and lock the line.
		 *
		 * The reader version of the line lock is used since only
		 * resync will use data beyond un_resync_line_index on the
		 * resync device.
		 */
		un = (mr_unit_t *)md_io_readerlock(ui);
		if (line + line_count > segsincolumn)
			line_count = segsincolumn - line;
		resync_end_pos = raid_resync_fillin_cs(line, line_count, cs);
		(void) md_unit_readerlock(ui);
		ASSERT(un->un_resync_line_index == resync_end_pos);
		err = raid_resync_region(cs, line, (int)line_count,
		    &single_read, &hs_state, &err_col, dev_to_write,
		    write_devstart);

		/*
		 * if the column failed to resync then stop writing directly
		 * to the column.
		 */
		if (err)
			un->un_resync_line_index = 0;

		md_unit_readerexit(ui);
		raid_line_exit(cs);
		md_io_readerexit(ui);

		if (err)
			break;

		un = (mr_unit_t *)md_unit_writerlock(ui);

		/* abort if some other I/O put a column into an erred state */
		if (raid_state_cnt(un, RCS_ERRED | RCS_LAST_ERRED) != err_cnt) {
			err = RAID_RESYNC_STATE;
			md_unit_writerexit(ui);
			break;
		}
		md_unit_writerexit(ui);
	} /* for */

resync_comp_error:
	un = (mr_unit_t *)md_io_writerlock(ui);
	(void) md_unit_writerlock(ui);
	un->un_column[resync].un_devflags &= ~MD_RAID_WRITE_ALT;

	recids[0] = 0;
	recids[1] = 0;
	switch (err) {
		/*
		 * successful resync
		 */
	    case RAID_RESYNC_OKAY:
		/* NOTE(review): stale comment? no pre-write init happens here */
		/* initialize pre-write area */
		if ((un->un_column[resync].un_orig_dev != NODEV64) &&
		    (un->un_column[resync].un_orig_dev ==
		    un->un_column[resync].un_alt_dev)) {
			/*
			 * replacing a hot spare
			 * release the hot spare, which will close the hotspare
			 * and mark it closed.
			 */
			raid_hs_release(hs_state, un, &recids[0], resync);
			/*
			 * make the resync target the main device and
			 * mark open
			 */
			un->un_column[resync].un_hs_id = 0;
			un->un_column[resync].un_dev =
			    un->un_column[resync].un_orig_dev;
			un->un_column[resync].un_devstart =
			    un->un_column[resync].un_orig_devstart;
			un->un_column[resync].un_pwstart =
			    un->un_column[resync].un_orig_pwstart;
			un->un_column[resync].un_devflags |= MD_RAID_DEV_ISOPEN;
			/* alt becomes the device so don't close it */
			un->un_column[resync].un_devflags &= ~MD_RAID_WRITE_ALT;
			un->un_column[resync].un_devflags &=
			    ~MD_RAID_ALT_ISOPEN;
			un->un_column[resync].un_alt_dev = NODEV64;
		}
		raid_set_state(un, resync, RCS_OKAY, 0);
		break;

	    case RAID_RESYNC_WRERROR:
		if (HOTSPARED(un, resync) && single_read &&
		    (un->un_column[resync].un_devflags & MD_RAID_COPY_RESYNC)) {
			/*
			 * this is the case where the resync target is
			 * bad but there is a good hotspare.  In this
			 * case keep the hotspare, and go back to okay.
			 */
			raid_set_state(un, resync, RCS_OKAY, 0);
			cmn_err(CE_WARN, "md: %s: %s write error, replace "
				"terminated", md_shortname(MD_SID(un)),
				md_devname(MD_UN2SET(un),
					un->un_column[resync].un_orig_dev,
					NULL, 0));
			break;
		}
		/* give back the hotspare and restore the original device */
		if (HOTSPARED(un, resync)) {
			raid_hs_release(hs_state, un, &recids[0], resync);
			un->un_column[resync].un_dev =
			    un->un_column[resync].un_orig_dev;
			un->un_column[resync].un_devstart =
			    un->un_column[resync].un_orig_devstart;
			un->un_column[resync].un_pwstart =
			    un->un_column[resync].un_orig_pwstart;
		}
		raid_set_state(un, resync, RCS_ERRED, 0);
		if (un->un_column[resync].un_devflags & MD_RAID_REGEN_RESYNC)
			dev = un->un_column[resync].un_dev;
		else
			dev = un->un_column[resync].un_alt_dev;
		cmn_err(CE_WARN, "md: %s: %s write error replace terminated",
		    md_shortname(MD_SID(un)), md_devname(MD_UN2SET(un), dev,
		    NULL, 0));
		break;

	    case RAID_RESYNC_STATE:
		if (HOTSPARED(un, resync) && single_read &&
		    (un->un_column[resync].un_devflags & MD_RAID_COPY_RESYNC)) {
			/*
			 * this is the case where the resync target is
			 * bad but there is a good hotspare.  In this
			 * case keep the hotspare, and go back to okay.
			 */
			raid_set_state(un, resync, RCS_OKAY, 0);
			cmn_err(CE_WARN, "md: %s: needs maintenance, replace "
			    "terminated", md_shortname(MD_SID(un)));
			break;
		}
		if (HOTSPARED(un, resync)) {
			raid_hs_release(hs_state, un, &recids[0], resync);
			un->un_column[resync].un_dev =
			    un->un_column[resync].un_orig_dev;
			un->un_column[resync].un_devstart =
			    un->un_column[resync].un_orig_devstart;
			un->un_column[resync].un_pwstart =
			    un->un_column[resync].un_orig_pwstart;
		}
		break;
	    case RAID_RESYNC_RDERROR:
		if (HOTSPARED(un, resync)) {
			raid_hs_release(hs_state, un, &recids[0], resync);
			un->un_column[resync].un_dev =
			    un->un_column[resync].un_orig_dev;
			un->un_column[resync].un_devstart =
			    un->un_column[resync].un_orig_devstart;
			un->un_column[resync].un_pwstart =
			    un->un_column[resync].un_orig_pwstart;
		}

		/* a different column failed during the read; mark it erred */
		if ((resync != err_col) && (err_col != NOCOLUMN))
			raid_set_state(un, err_col, RCS_ERRED, 0);
		break;

	    default:
		ASSERT(0);
	}
	if (un->un_column[resync].un_alt_dev != NODEV64) {
		raid_close_alt(un, resync);
	}

	/*
	 * an io operation may have gotten an error and placed a
	 * column in erred state.  This will abort the resync, which
	 * will end up in last erred.  This is ugly so go through
	 * the columns and do cleanup
	 */
	err_cnt = 0;
	last_err = 0;
	for (i = 0; i < un->un_totalcolumncnt; i++) {
		if (un->un_column[i].un_devstate & RCS_OKAY)
			continue;
		if (i == resync) {
			raid_set_state(un, i, RCS_ERRED, 1);
			err_cnt++;
		} else if (err == RAID_RESYNC_OKAY) {
			err_cnt++;
		} else {
			raid_set_state(un, i, RCS_LAST_ERRED, 1);
			last_err++;
		}
	}
	/* derive the unit state from the per-column error counts */
	if ((err_cnt == 0) && (last_err == 0))
		un->un_state = RUS_OKAY;
	else if (last_err == 0) {
		un->un_state = RUS_ERRED;
		ASSERT(err_cnt == 1);
	} else if (last_err > 0) {
		un->un_state = RUS_LAST_ERRED;
	}

	uniqtime32(&un->un_column[resync].un_devtimestamp);
	un->un_resync_copysize = 0;
	un->un_column[resync].un_devflags &=
	    ~(MD_RAID_REGEN_RESYNC | MD_RAID_COPY_RESYNC);
	raid_commit(un, recids);
	/* release unit writer lock and acquire unit reader lock */
	md_unit_writerexit(ui);
	md_io_writerexit(ui);
	(void) md_unit_readerlock(ui);
	if (err == RAID_RESYNC_OKAY) {
		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_DONE,
		    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
	} else {
		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_FAILED,
		    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
		if (raid_state_cnt(un, RCS_ERRED |
			RCS_LAST_ERRED) > 1) {
			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED,
			    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
		} else {
			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
			    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
		}
	}

	free_bufs(dbtob(bsize), cs);
}
946
947/*
948 * NAME:	resync_unit
949 *
950 * DESCRIPTION: Start of RAID resync thread.  Perform up front allocations,
951 *		initializations and consistency checking, then call
952 *		resync_comp to resync the component.
953 *
954 * PARAMETERS:	minor_t mnum - minor number identity of metadevice
955 *
956 * LOCKS:	Acquires and releases Unit Reader Lock to maintain unit
957 *		existence during resync.
958 *		Acquires and releases the resync count lock for cpr.
959 */
static void
resync_unit(
	minor_t mnum
)
{
	mdi_unit_t	*ui;
	mr_unit_t	*un;
	md_raidps_t	*ps = NULL;
	md_raidcs_t	*cs = NULL;
	int		resync;

	/*
	 * Increment the raid resync count for cpr
	 */
	mutex_enter(&md_cpr_resync.md_resync_mutex);
	md_cpr_resync.md_raid_resync++;
	mutex_exit(&md_cpr_resync.md_resync_mutex);

	ui = MDI_UNIT(mnum);
	ASSERT(ui != NULL);

	/* hold the unit for the duration of the resync */
	un = (mr_unit_t *)md_unit_readerlock(ui);

	/*
	 * Allocate parent and child memory pool structures.  These are
	 * only needed to lock raid lines, so only the minimal
	 * required fields for this purpose are initialized.
	 *
	 * Do not use the reserve pool for resync.
	 */
	ps = kmem_cache_alloc(raid_parent_cache, MD_ALLOCFLAGS);
	raid_parent_init(ps);
	cs = kmem_cache_alloc(raid_child_cache, MD_ALLOCFLAGS);
	raid_child_init(cs);
	resync = un->un_resync_index;
	ps->ps_un = un;
	ps->ps_ui = ui;
	ps->ps_flags = MD_RPS_INUSE;
	cs->cs_ps = ps;
	cs->cs_un = un;

	ASSERT(!(un->un_column[resync].un_devflags & MD_RAID_WRITE_ALT));

	/*
	 * NOTE(review): resync_comp returns with the unit reader lock
	 * held, yet release_resync_request's header says it expects the
	 * unit writer lock — confirm this ordering is intentional.
	 */
	resync_comp(mnum, cs);
	release_resync_request(mnum);

	kmem_cache_free(raid_child_cache, cs);
	kmem_cache_free(raid_parent_cache, ps);

	md_unit_readerexit(ui);

	/* close raid unit */
	(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);

	/* poke hot spare daemon */
	(void) raid_hotspares();

	/*
	 * Decrement the raid resync count for cpr
	 */
	mutex_enter(&md_cpr_resync.md_resync_mutex);
	md_cpr_resync.md_raid_resync--;
	mutex_exit(&md_cpr_resync.md_resync_mutex);

	/* thread entry point: never returns */
	thread_exit();
}
1026
1027/*
1028 * NAME:	raid_resync_unit
1029 *
1030 * DESCRIPTION: RAID metadevice specific resync routine.
1031 *		Open the unit and start resync_unit as a separate thread.
1032 *
1033 * PARAMETERS:	minor_t	  mnum - minor number identity of metadevice
1034 *		md_error_t *ep - output error parameter
1035 *
1036 * RETURN:	On error return 1 or set ep to nonzero, otherwise return 0.
1037 *
1038 * LOCKS:	Acquires and releases Unit Writer Lock.
1039 */
int
raid_resync_unit(
	minor_t			mnum,
	md_error_t		*ep
)
{
	mdi_unit_t	*ui;
	set_t		setno = MD_MIN2SET(mnum);
	mr_unit_t	*un;

	ui = MDI_UNIT(mnum);
	un = MD_UNIT(mnum);

	/* a stale metadb cannot record resync progress */
	if (md_get_setstatus(setno) & MD_SET_STALE)
		return (mdmddberror(ep, MDE_DB_STALE, mnum, setno));

	/*
	 * NOTE(review): un is dereferenced here before the availability
	 * check below (which tests ui, not un); callers presumably
	 * guarantee the unit exists — confirm.
	 */
	ASSERT(un->un_column[un->un_resync_index].un_devflags &
	    (MD_RAID_COPY_RESYNC | MD_RAID_REGEN_RESYNC));

	/* Don't start a resync if the device is not available */
	if ((ui == NULL) || (ui->ui_tstate & MD_DEV_ERRORED)) {
		return (mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum));
	}

	if (raid_internal_open(mnum, FREAD | FWRITE, OTYP_LYR, 0)) {
		/* open failed: give back the resync lock and notify */
		(void) md_unit_writerlock(ui);
		release_resync_request(mnum);
		md_unit_writerexit(ui);
		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE,
		    setno, MD_SID(un));
		return (mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum));
	}

	/* start resync_unit thread */
	(void) thread_create(NULL, 0, resync_unit, (void *)(uintptr_t)mnum,
	    0, &p0, TS_RUN, minclsyspri);

	return (0);
}
1079