1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#pragma ident	"%Z%%M%	%I%	%E% SMI"
27
28/*
29 * NAME:	raid_replay.c
30 *
31 * DESCRIPTION: RAID driver source file containing routines related to replay
32 *		operation.
33 *
34 * ROUTINES PROVIDED FOR EXTERNAL USE:
 *		raid_replay() - replay all the pre-write entries in the unit.
36 */
37
38#include <sys/param.h>
39#include <sys/systm.h>
40#include <sys/conf.h>
41#include <sys/file.h>
42#include <sys/user.h>
43#include <sys/uio.h>
44#include <sys/t_lock.h>
45#include <sys/buf.h>
46#include <sys/dkio.h>
47#include <sys/vtoc.h>
48#include <sys/kmem.h>
49#include <vm/page.h>
50#include <sys/sysmacros.h>
51#include <sys/types.h>
52#include <sys/mkdev.h>
53#include <sys/stat.h>
54#include <sys/open.h>
55#include <sys/modctl.h>
56#include <sys/ddi.h>
57#include <sys/sunddi.h>
58
59#include <sys/lvm/md_raid.h>
60
61#include <sys/sysevent/eventdefs.h>
62#include <sys/sysevent/svm.h>
63
/* forward declarations of functions defined later in this file */
static int	raid_replay_error(mr_unit_t *un, int column);

/*
 * Running count of replay list entries: incremented by enq_rplylst() each
 * time it allocates a new entry, decremented by rpl_delete(), and reset to
 * zero on the raid_replay() failure path.
 */
int		raid_total_rply_entries = 0;
68
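/*
 * NAMES:	raid_rply_dealloc, raid_rply_alloc
 * DESCRIPTION: RAID metadevice replay buffer allocation/deallocation routines
 * PARAMETERS:	mr_unit_t *un - pointer to the unit structure
 *		raid_rplybuf_t **bufs - address of the pointer to the
 *					per-column replay buffer array
 *		raid_rplybuf_t *rwbuf1 - first whole-entry replay buffer
 *		raid_rplybuf_t *rwbuf2 - second whole-entry replay buffer
 * RETURNS:	nothing
 */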
76static void
77raid_rply_dealloc(mr_unit_t *un,
78		raid_rplybuf_t **bufs,
79		raid_rplybuf_t *rwbuf1,
80		raid_rplybuf_t *rwbuf2)
81{
82	int	i;
83	raid_rplybuf_t *tmp;
84
85	for (i = 0, tmp = *bufs; i < un->un_totalcolumncnt; i++, tmp++) {
86		if (tmp->rpl_data) {
87			kmem_free(tmp->rpl_data, DEV_BSIZE);
88			tmp->rpl_data = NULL;
89		}
90		if (tmp->rpl_buf) {
91			kmem_free(tmp->rpl_buf, sizeof (buf_t));
92			tmp->rpl_buf = NULL;
93		}
94	}
95	kmem_free(*bufs, sizeof (raid_rplybuf_t) * un->un_totalcolumncnt);
96	*bufs = NULL;
97	if (rwbuf1->rpl_data) {
98		kmem_free(rwbuf1->rpl_data, dbtob(un->un_iosize));
99		rwbuf1->rpl_data = NULL;
100	}
101	if (rwbuf1->rpl_buf) {
102		kmem_free((caddr_t)rwbuf1->rpl_buf, sizeof (buf_t));
103		rwbuf1->rpl_buf = NULL;
104	}
105	if (rwbuf2->rpl_data) {
106		kmem_free(rwbuf2->rpl_data, dbtob(un->un_iosize));
107		rwbuf2->rpl_data = NULL;
108	}
109	if (rwbuf2->rpl_buf) {
110		kmem_free((caddr_t)rwbuf2->rpl_buf, sizeof (buf_t));
111		rwbuf2->rpl_buf = NULL;
112	}
113}
114
115static void
116raid_rply_alloc(mr_unit_t *un,
117		raid_rplybuf_t **bufs,
118		raid_rplybuf_t *rwbuf1,
119		raid_rplybuf_t *rwbuf2)
120{
121	int		i;
122	raid_rplybuf_t *tmp;
123	buf_t		*bp;
124
125	/* intialization */
126	*bufs = kmem_zalloc(sizeof (raid_rplybuf_t) * un->un_totalcolumncnt,
127	    KM_SLEEP);
128	ASSERT(*bufs != NULL);
129	bzero((caddr_t)rwbuf1, sizeof (raid_rplybuf_t));
130	bzero((caddr_t)rwbuf2, sizeof (raid_rplybuf_t));
131
132	/* allocate all the buffers required for the replay processing */
133	for (i = 0, tmp = *bufs; i < un->un_totalcolumncnt; i++, tmp++) {
134		tmp->rpl_data = kmem_zalloc(DEV_BSIZE, KM_SLEEP);
135		ASSERT(tmp->rpl_data != NULL);
136		tmp->rpl_buf = kmem_zalloc(sizeof (buf_t), KM_SLEEP);
137		ASSERT(tmp->rpl_buf != NULL);
138		bp = (buf_t *)tmp->rpl_buf;
139		bp->b_back = bp;
140		bp->b_forw = bp;
141		bp->b_flags = B_BUSY;
142		bp->b_offset = -1;
143		/* Initialize semaphores */
144		sema_init(&bp->b_io, 0, NULL,
145			SEMA_DEFAULT, NULL);
146		sema_init(&bp->b_sem, 0, NULL,
147			SEMA_DEFAULT, NULL);
148	}
149
150	rwbuf1->rpl_data = kmem_zalloc(dbtob(un->un_iosize), KM_SLEEP);
151	ASSERT(rwbuf1->rpl_data != NULL);
152	rwbuf1->rpl_buf = kmem_zalloc(sizeof (buf_t), KM_SLEEP);
153	ASSERT(rwbuf1->rpl_buf != NULL);
154	rwbuf2->rpl_data = kmem_zalloc(dbtob(un->un_iosize), KM_SLEEP);
155	ASSERT(rwbuf2->rpl_data != NULL);
156	rwbuf2->rpl_buf = kmem_zalloc(sizeof (buf_t), KM_SLEEP);
157	ASSERT(rwbuf2->rpl_buf != NULL);
158
159	bp = (buf_t *)rwbuf1->rpl_buf;
160	bp->b_back = bp;
161	bp->b_forw = bp;
162	bp->b_flags = B_BUSY;
163	bp->b_offset = -1;
164	/* Initialize semaphores */
165	sema_init(&bp->b_io, 0, NULL,
166		SEMA_DEFAULT, NULL);
167	sema_init(&bp->b_sem, 0, NULL,
168		SEMA_DEFAULT, NULL);
169	bp = (buf_t *)rwbuf2->rpl_buf;
170	bp->b_back = bp;
171	bp->b_forw = bp;
172	bp->b_flags = B_BUSY;
173	bp->b_offset = -1;
174	/* Initialize semaphores */
175	sema_init(&bp->b_io, 0, NULL,
176		SEMA_DEFAULT, NULL);
177	sema_init(&bp->b_sem, 0, NULL,
178		SEMA_DEFAULT, NULL);
179}
180
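/*
 * NAMES:	rpl_insert, rpl_delete, rpl_find
 * DESCRIPTION: RAID metadevice replay list processing routines.  The list
 *		is singly linked through rpl_next and rpl_insert keeps it
 *		in ascending rpl_id order.
 * PARAMETERS:	raid_rplylst_t **listp - address of the list head (rpl_insert)
 *		raid_rplylst_t *newp   - entry to insert (rpl_insert)
 *		raid_rplylst_t **prevp - link pointing at oldp (rpl_delete)
 *		raid_rplylst_t *oldp   - entry to unlink and free (rpl_delete)
 *		raid_rplylst_t *list   - first entry to search (rpl_find)
 *		long long pw_id        - pre-write id to look up (rpl_find)
 * RETURNS:	rpl_find returns the matching entry or NULL; rpl_insert and
 *		rpl_delete return nothing.
 */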
188static void
189rpl_insert(raid_rplylst_t **listp, raid_rplylst_t *newp)
190{
191	raid_rplylst_t *tmp, **prevp;
192
193	for (prevp = listp; ((tmp = *prevp) != NULL); prevp = &tmp->rpl_next) {
194		if (tmp->rpl_id > newp->rpl_id) {
195			break;
196		}
197	}
198	newp->rpl_next = tmp;
199	*prevp = newp;
200}
201
202static void
203rpl_delete(raid_rplylst_t **prevp, raid_rplylst_t *oldp)
204{
205
206	ASSERT((caddr_t)oldp);
207	raid_total_rply_entries --;
208	*prevp = oldp->rpl_next;
209	kmem_free((caddr_t)oldp, sizeof (raid_rplylst_t));
210}
211
212static raid_rplylst_t *
213rpl_find(raid_rplylst_t *list, long long pw_id)
214{
215	raid_rplylst_t *tmp;
216
217	for (tmp = list; tmp; tmp = tmp->rpl_next) {
218		if (pw_id == tmp->rpl_id) {
219			return (tmp);
220		}
221	}
222	return ((raid_rplylst_t *)NULL);
223}
224
225/*
226 * NAMES:	enq_rplylst
227 * DESCRIPTION: Enqueue a pre-write header into the replay list.
228 * PARAMETERS:	raid_rplylst_t *list - pointer to the replay list.
229 *		raid_pwhdr_t   *pwptr - pointer to a pre-write header.
230 * RETURNS:
231 */
232static void
233enq_rplylst(raid_rplylst_t **listp, raid_pwhdr_t *pwhp,
234		uint_t slot, int column)
235{
236	raid_rplylst_t *newp, *oldp;
237
238	/* check if the pre-write existed in the list */
239	if ((pwhp->rpw_colcount <= 2) &&
240	    (oldp = rpl_find(*listp, pwhp->rpw_id))) {
241		bcopy((caddr_t)pwhp, (caddr_t)&oldp->rpl_pwhdr2,
242			sizeof (raid_pwhdr_t));
243		oldp->rpl_slot2   = slot;
244		oldp->rpl_column2 = column;
245	} else {
246		raid_total_rply_entries ++;
247		newp = (raid_rplylst_t *)kmem_zalloc(sizeof (raid_rplylst_t),
248		    KM_SLEEP);
249		ASSERT(newp != NULL);
250		bcopy((caddr_t)pwhp, (caddr_t)&newp->rpl_pwhdr1,
251			sizeof (raid_pwhdr_t));
252		bzero((caddr_t)&newp->rpl_pwhdr2, sizeof (raid_pwhdr_t));
253
254		newp->rpl_id = pwhp->rpw_id;
255		newp->rpl_column1 = column;
256		newp->rpl_slot1 = slot;
257		newp->rpl_next = (raid_rplylst_t *)NULL;
258		newp->rpl_colcnt = pwhp->rpw_colcount;
259		rpl_insert(listp, newp);
260	}
261}
262
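/*
 * NAMES:	pw_read_done and pw_write_done
 * DESCRIPTION: b_iodone completion routines for the replay bufs.
 *		pw_read_done is installed by raid_pwhdr_read and
 *		raid_pw_read, pw_write_done by raid_pw_write.  Each marks
 *		the buf B_DONE and then posts b_sem for a B_ASYNC buf or
 *		b_io otherwise.
 * PARAMETERS:	buf_t *bp - the buf whose I/O has completed
 * RETURNS:	0
 */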
269static int
270pw_read_done(buf_t *bp)
271{
272	ASSERT(SEMA_HELD(&bp->b_sem));
273	ASSERT((bp->b_flags & B_DONE) == 0);
274
275	bp->b_flags |= B_DONE;
276
277	if (bp->b_flags & B_ASYNC)
278		sema_v(&bp->b_sem);
279	else
280		/* wakeup the thread waiting on this buf */
281		sema_v(&bp->b_io);
282	return (0);
283}
284
285static int
286pw_write_done(buf_t *bp)
287{
288	ASSERT(SEMA_HELD(&bp->b_sem));
289	ASSERT((bp->b_flags & B_DONE) == 0);
290
291	bp->b_flags |= B_DONE;
292
293	if (bp->b_flags & B_ASYNC)
294		sema_v(&bp->b_sem);
295	else
296		/* wakeup the thread waiting on this buf */
297		sema_v(&bp->b_io);
298
299	return (0);
300}
301
302/*
303 * NAMES:	raid_pwhdr_read
304 * DESCRIPTION: issue a syncronous read to read a pre-write header
305 * PARAMETERS:	mr_unit_t *un - pointer to the unit structure
306 *		int	pw_slot - pre-write entry slot number
307 *		int	column	- column number for the pre-write entry
308 *		raid_rplybuf_t *bufp - pointer to the replay buffer structure
309 * RETURNS:
310 */
311static void
312raid_pwhdr_read(mr_unit_t *un, int pw_slot, int column, raid_rplybuf_t *bufp)
313{
314	buf_t		*bp;
315
316	/* set up pointers from raid_rplybuf_t *bufp */
317	bp = (buf_t *)bufp->rpl_buf;
318
319	/* calculate the data address or block number */
320	bp->b_un.b_addr = bufp->rpl_data;
321	bp->b_lblkno = un->un_column[column].un_pwstart +
322		pw_slot * un->un_iosize;
323	bp->b_edev = md_dev64_to_dev(un->un_column[column].un_dev);
324	bp->b_bufsize = DEV_BSIZE;
325	bp->b_bcount = DEV_BSIZE;
326	bp->b_flags  = (B_READ | B_BUSY);
327	bp->b_iodone = pw_read_done;
328	(void) md_call_strategy(bp, 0, NULL);
329}
330
331/*
332 * NAMES:	raid_pw_read
333 * DESCRIPTION: issue a syncronous read to read a pre-write entry
334 * PARAMETERS:	mr_unit_t	*un    - pointer to the unit structure
335 *		int		column - column number for the pre-write entry
336 *		u_int		slot   - pre-write entry slot number
337 *		raid_rplybuf_t	*bufp  - pointer to the replay buffer structure
338 * RETURNS:
339 */
340static int
341raid_pw_read(mr_unit_t *un, int column, uint_t slot, raid_rplybuf_t *bufp)
342{
343	buf_t	*bp;
344	int	error;
345	uint_t	blkcnt  = un->un_iosize;
346	uint_t	bytecnt = blkcnt * DEV_BSIZE;
347
348	/* if this column is no longer accessible, return */
349	if (!COLUMN_ISUP(un, column))
350		return (RAID_RPLY_COMPREPLAY);
351
352	/* set up pointers from raid_rplybuf_t *bufp */
353	bp = (buf_t *)bufp->rpl_buf;
354
355	/* calculate the data address or block number */
356	bp->b_un.b_addr = bufp->rpl_data;
357	bp->b_bufsize = bytecnt;
358	bp->b_bcount = bytecnt;
359	bp->b_flags = (B_READ | B_BUSY);
360	bp->b_edev = md_dev64_to_dev(un->un_column[column].un_dev);
361	bp->b_lblkno = un->un_column[column].un_pwstart + (slot * blkcnt);
362	bp->b_iodone = pw_read_done;
363	(void) md_call_strategy(bp, 0, NULL);
364	if (biowait(bp)) {
365		error = raid_replay_error(un, column);
366		return (error);
367	}
368	return (0);
369}
370
371/*
372 * NAMES:	raid_pw_write
373 * DESCRIPTION: issue a syncronous write to write a pre-write entry
374 * PARAMETERS:	mr_unit_t *un - pointer to the unit structure
375 *		int	column	- column number for the pre-write entry
376 *		raid_pwhdr_t   *pwhp - needed for some infos about the pw header
377 *		raid_rplybuf_t *bufp - pointer to the replay buffer structure
378 * RETURNS:
379 */
380static int
381raid_pw_write(mr_unit_t *un, int column, raid_pwhdr_t *pwhp,
382    raid_rplybuf_t *bufp)
383{
384	buf_t	 *bp;
385	int	 error;
386
387	/* if this column is no longer accessible, return */
388	if (!COLUMN_ISUP(un, column))
389		return (RAID_RPLY_COMPREPLAY);
390
391	/* set up pointers from raid_rplybuf_t *bufp */
392	bp = (buf_t *)bufp->rpl_buf;
393
394	/* calculate the data address or block number */
395	bp->b_un.b_addr = bufp->rpl_data + DEV_BSIZE;
396	bp->b_bufsize = dbtob(pwhp->rpw_blkcnt);
397	bp->b_bcount = dbtob(pwhp->rpw_blkcnt);
398	bp->b_flags = (B_WRITE | B_BUSY);
399	bp->b_edev  = md_dev64_to_dev(un->un_column[column].un_dev);
400	bp->b_lblkno = un->un_column[column].un_devstart + pwhp->rpw_blkno;
401	bp->b_iodone = pw_write_done;
402	(void) md_call_strategy(bp, 0, NULL);
403	if (biowait(bp)) {
404		error = raid_replay_error(un, column);
405		return (error);
406	}
407	return (0);
408}
409
/*
 * NAMES:	genchecksum
 * DESCRIPTION: generate the checksum for a pre-write entry by XOR-ing
 *		together every whole uint_t in the buffer
 * PARAMETERS:	caddr_t addr - where the data bytes are
 *		size_t bcount - number of data bytes; bytes beyond the last
 *				whole uint_t are not included
 * RETURNS:	the XOR of the words, 0 when there are none
 */
static uint_t
genchecksum(caddr_t addr, size_t bcount)
{
	const uint_t	*words = (const uint_t *)(void *)addr;
	size_t		nwords = bcount / sizeof (uint_t);
	size_t		w;
	uint_t		sum = 0;

	for (w = 0; w < nwords; w++)
		sum ^= words[w];

	return (sum);
}
433
434/*
435 * NAMES:	raid_rply_verify
436 * DESCRIPTION: verify the pre-write entry for replay
437 * PARAMETERS:	mr_unit_t *un	- pointer to unit structure
438 *		int col1	- column number 1
439 *		int goodsum1	- flag to indicate good checksum
440 *		int *do_1	- flag to indicate whether we should replay
441 *				  the first pre-write
442 *		int col2	- column number 2
443 *		int goodsum2	- flag to indicate good checksum
444 *		int *do_2	- flag to indicate whether we should replay
445 *				  the first pre-write
446 * RETURNS:
447 */
448static void
449raid_rply_verify(mr_unit_t *un, int col1, int goodsum1, int *do_1,
450    int col2, int goodsum2, int *do_2)
451{
452	int	good_state1 = 0;
453	int	good_state2 = 0;
454
455	*do_1 = 0; *do_2 = 0;		/* prepare for the worst */
456	if (COLUMN_ISUP(un, col1)) {
457		good_state1 = 1;
458	}
459	if (COLUMN_ISUP(un, col2)) {
460		good_state2 = 1;
461	}
462	if ((good_state1 & good_state2) && (goodsum1 & goodsum2)) {
463		/* if both columns check out, do it */
464		*do_1 = 1; *do_2 = 1;
465	} else if ((good_state1 & goodsum1) && !good_state2) {
466		/* if one column is okay and the other is errored, do it */
467		*do_1 = 1; *do_2 = 0;
468	} else if ((good_state2 & goodsum2) && !good_state1) {
469		/* if one column is okay and the other is errored, do it */
470		*do_2 = 1; *do_1 = 0;
471	}
472}
473
474/*
475 * NAMES:	raid_rplyeach
476 * DESCRIPTION: issue a syncronous read to read a pre-write header
477 * PARAMETERS:	mr_unit_t *un - pointer to the unit structure
478 *		raid_rplylst_t *eachp - pointer to the replay list entry
479 *		raid_rplybuf_t *rwbuf1 - pointer to the replay buffer structure
480 *		raid_rplybuf_t *rwbuf2 - pointer to the replay buffer structure
481 * RETURNS:
482 */
483static int
484raid_rplyeach(
485	mr_unit_t	*un,
486	raid_rplylst_t	*eachp,
487	raid_rplybuf_t	*rwbuf1,
488	raid_rplybuf_t	*rwbuf2
489)
490{
491	raid_pwhdr_t	*pwhp1;
492	raid_pwhdr_t	*pwhp2;
493	uint_t		dsum1 = 0;
494	uint_t		dsum2 = 0;
495	int		good_pw1 = 0;
496	int		good_pw2 = 0;
497	int		do_1 = 0;
498	int		do_2 = 0;
499	int		error = 0;
500
501	/* First verify the normal case - two pre-write entries are all good */
502	if ((eachp->rpl_pwhdr1.rpw_magic == RAID_PWMAGIC &&
503	    eachp->rpl_pwhdr2.rpw_magic == RAID_PWMAGIC) &&
504	    (eachp->rpl_pwhdr1.rpw_blkcnt == eachp->rpl_pwhdr2.rpw_blkcnt)) {
505
506		ASSERT(eachp->rpl_pwhdr1.rpw_id == eachp->rpl_pwhdr2.rpw_id);
507
508		/* read the pre-write entries */
509		error = raid_pw_read(un, eachp->rpl_column1,
510		    eachp->rpl_slot1, rwbuf1);
511		pwhp1 = &eachp->rpl_pwhdr1;
512		if (error) {
513			if (error != RAID_RPLY_COMPREPLAY)
514				return (error);
515			good_pw1 = FALSE;
516		} else {
517			/* generate checksum for each pre-write entry */
518			dsum1 = genchecksum(rwbuf1->rpl_data + DEV_BSIZE,
519						dbtob(pwhp1->rpw_blkcnt));
520			good_pw1 = (dsum1 == pwhp1->rpw_sum);
521		}
522
523		error = raid_pw_read(un, eachp->rpl_column2, eachp->rpl_slot2,
524		    rwbuf2);
525		pwhp2 = &eachp->rpl_pwhdr2;
526		if (error) {
527			if (error != RAID_RPLY_COMPREPLAY)
528				return (error);
529			good_pw2 = FALSE;
530		} else {
531			/* generate checksum for pre-write entry */
532			dsum2 = genchecksum(rwbuf2->rpl_data + DEV_BSIZE,
533						dbtob(pwhp2->rpw_blkcnt));
534			good_pw2 = (dsum2 == pwhp2->rpw_sum);
535		}
536
537		/* verify the checksums and states */
538		raid_rply_verify(un, eachp->rpl_column1, good_pw1, &do_1,
539			eachp->rpl_column2, good_pw2, &do_2);
540
541		/* write (replay) the pre-write entries */
542		if (do_1) {
543			error = raid_pw_write(un, eachp->rpl_column1,
544			    &eachp->rpl_pwhdr1, rwbuf1);
545			if (error && (error != RAID_RPLY_COMPREPLAY)) {
546				return (error);
547			}
548		}
549		if (do_2) {
550			error = raid_pw_write(un, eachp->rpl_column2,
551			    &eachp->rpl_pwhdr2, rwbuf2);
552			if (error && (error != RAID_RPLY_COMPREPLAY)) {
553				return (error);
554			}
555		}
556		return (0);
557	}
558	if (eachp->rpl_pwhdr1.rpw_magic == RAID_PWMAGIC) {
559		/*
560		 * if partner was errored at time of write
561		 * or due to open or replay, replay this entry
562		 */
563		if ((eachp->rpl_pwhdr1.rpw_columnnum == -1) ||
564		    (! COLUMN_ISUP(un, eachp->rpl_pwhdr1.rpw_columnnum))) {
565			/* read the pre-write entry */
566			error = raid_pw_read(un, eachp->rpl_column1,
567			    eachp->rpl_slot1, rwbuf1);
568			if (error)
569				return (error);
570			/* generate checksum for the pre-write entry */
571			pwhp1 = &eachp->rpl_pwhdr1;
572			dsum1 = genchecksum(rwbuf1->rpl_data + DEV_BSIZE,
573						dbtob(pwhp1->rpw_blkcnt));
574			if (dsum1 == pwhp1->rpw_sum) {
575				error = raid_pw_write(un, eachp->rpl_column1,
576						&eachp->rpl_pwhdr1, rwbuf1);
577				if (error && (error != RAID_RPLY_COMPREPLAY)) {
578					return (error);
579				}
580			}
581		}
582		return (0);
583	}
584
585	return (0);
586}
587
588static int
589replay_line(mr_unit_t *un, raid_rplylst_t *eachp, raid_rplybuf_t *rplybuf)
590{
591	raid_pwhdr_t	*pwhdr1, *pwhdr2;
592	raid_rplylst_t	*eachpn;
593	int		i;
594	int		cnt;
595	diskaddr_t	blkno;
596	uint_t		blkcnt;
597	long long	id;
598	int		dsum;
599	int		error;
600	int		colcnt, col, col2;
601	int		down;
602
603	if (eachp->rpl_id == 0)
604		return (0);
605	/*
606	 * check: 1 - enough equal ids
607	 *	  2 - all have same columncnt
608	 *	  3 - all have same blkno
609	 *	  4 - all have same blkcnt
610	 *
611	 * read each and check the checksum
612	 * write each
613	 */
614
615	cnt = eachp->rpl_colcnt;
616	id = eachp->rpl_id;
617	pwhdr1 = &eachp->rpl_pwhdr1;
618	blkno = pwhdr1->rpw_blkno;
619	blkcnt = pwhdr1->rpw_blkcnt;
620
621	error = raid_pw_read(un, eachp->rpl_column1, eachp->rpl_slot1, rplybuf);
622	dsum = genchecksum(rplybuf->rpl_data + DEV_BSIZE,
623	    dbtob(pwhdr1->rpw_blkcnt));
624
625	if (dsum != pwhdr1->rpw_sum)
626		return (0);
627
628	if (error) {
629		if (error == RAID_RPLY_COMPREPLAY)
630			return (0);
631		else
632			return (1);
633	}
634
635	eachpn = eachp->rpl_next;
636	for (i = 1; i < cnt; i++) {
637		if (eachpn == NULL)
638			break;
639		col2 = eachpn->rpl_column1;
640		ASSERT(col2 < un->un_totalcolumncnt);
641		pwhdr2 = &eachpn->rpl_pwhdr1;
642		if ((pwhdr2->rpw_blkno != blkno) ||
643		    (pwhdr2->rpw_blkcnt != blkcnt) ||
644		    (eachpn->rpl_id != id) ||
645		    (pwhdr2->rpw_colcount != cnt)) {
646			return (0);
647		}
648
649		error = raid_pw_read(un, col2, eachpn->rpl_slot1, rplybuf);
650		dsum = genchecksum(rplybuf->rpl_data + DEV_BSIZE,
651		    dbtob(pwhdr2->rpw_blkcnt));
652		if (dsum != pwhdr2->rpw_sum)
653			return (0);
654		eachpn = eachpn->rpl_next;
655	}
656	colcnt = i;
657
658	if (error)
659		return (0);
660
661	down = raid_state_cnt(un, RCS_ERRED);
662	if ((i != un->un_totalcolumncnt) &&
663	    (i != (un->un_totalcolumncnt - down)))
664		return (0);
665
666	/* there ara enough columns to write correctly */
667	eachpn = eachp;
668	for (i = 0; i < colcnt; i++) {
669		col = eachpn->rpl_column1;
670		error = raid_pw_read(un, col, eachpn->rpl_slot1, rplybuf);
671		error = raid_pw_write(un, col, &eachpn->rpl_pwhdr1, rplybuf);
672		eachpn->rpl_id = 0;
673		if (error && (error != RAID_RPLY_COMPREPLAY))
674			return (1);
675		eachpn = eachpn->rpl_next;
676	}
677	return (0);
678}
679
680/*
681 * NAMES:	raid_replay_error
682 * DESCRIPTION: RAID metadevice replay error handling routine (TBD)
683 * PARAMETERS:
684 * RETURNS:
685 */
686static int
687raid_replay_error(mr_unit_t *un, int column)
688{
689	int	error = RAID_RPLY_COMPREPLAY;
690
691	raid_set_state(un, column, RCS_ERRED, 0);
692	raid_commit(un, NULL);
693
694	if (UNIT_STATE(un) == RUS_LAST_ERRED) {
695		error = RAID_RPLY_READONLY;
696		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED, SVM_TAG_METADEVICE,
697		    MD_UN2SET(un), MD_SID(un));
698	} else if (UNIT_STATE(un) == RUS_ERRED) {
699		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_METADEVICE,
700		    MD_UN2SET(un), MD_SID(un));
701	}
702
703	return (error);
704}
705
706/*
707 * NAMES:	raid_replay
708 * DESCRIPTION: RAID metadevice main replay processing routine
709 * PARAMETERS:	mr_unit_t *un - pointer to an unit structure
710 * RETURNS:
711 */
712
713int
714raid_replay(mr_unit_t *un)
715{
716	raid_rplylst_t	*rplylst = NULL;
717	raid_rplylst_t	**prevp, *eachp;
718	raid_rplybuf_t	*rplybuf;
719	raid_rplybuf_t	rwbuf1;
720	raid_rplybuf_t	rwbuf2;
721	mr_column_t	*colptr;
722	raid_pwhdr_t	pwhdr;
723	raid_pwhdr_t	*pwhdrp = &pwhdr;
724	int		error = 0;
725	int		i, j;
726	diskaddr_t	max_blkno = un->un_segsize * un->un_segsincolumn;
727	int		totalcolumns = un->un_totalcolumncnt;
728
729	raid_rply_alloc(un, &rplybuf, &rwbuf1, &rwbuf2);
730
731	/* build a replay list based on the order of pre-write id */
732	for (i = 0; i < un->un_pwcnt; i++) {
733		/* issue a synchronous read for each column */
734		for (j = 0; j < un->un_totalcolumncnt; j++) {
735			if (COLUMN_ISUP(un, j)) {
736				raid_pwhdr_read(un, i, j, &rplybuf[j]);
737				/* wait for I/O completion for each column */
738				if (biowait((buf_t *)rplybuf[j].rpl_buf)) {
739					/* potential state transition */
740					error = raid_replay_error(un, j);
741					if (error == RAID_RPLY_COMPREPLAY)
742						continue;
743					else
744						goto replay_failed;
745				}
746				if (un->c.un_revision & MD_64BIT_META_DEV) {
747					pwhdrp = (raid_pwhdr_t *)
748							rplybuf[j].rpl_data;
749				} else {
750					RAID_CONVERT_RPW((raid_pwhdr32_od_t *)
751							rplybuf[j].rpl_data,
752							pwhdrp);
753				}
754
755				/* first check pre-write magic number */
756				if (pwhdrp->rpw_magic != RAID_PWMAGIC) {
757					continue;
758				}
759				if (pwhdrp->rpw_column != j) {
760					continue;
761				}
762				if (pwhdrp->rpw_id == (long long) 0) {
763					continue;
764				}
765				if (pwhdrp->rpw_blkcnt > (un->un_iosize - 1)) {
766					continue;
767				}
768				if (pwhdrp->rpw_blkcnt == 0) {
769					continue;
770				}
771				if (pwhdrp->rpw_blkno > max_blkno) {
772					continue;
773				}
774				if ((pwhdrp->rpw_columnnum < 0) ||
775				    (pwhdrp->rpw_columnnum > totalcolumns)) {
776					continue;
777				}
778				if (((pwhdrp->rpw_colcount != 1) &&
779				    (pwhdrp->rpw_colcount != 2) &&
780				    (pwhdrp->rpw_colcount != totalcolumns))) {
781					continue;
782				}
783
784				enq_rplylst(&rplylst, pwhdrp, i, j);
785			}
786		}
787	}
788
789	/* replay each entry in the replay list */
790	prevp = &rplylst;
791	while ((eachp = *prevp) != NULL) {
792		/* zero out the pre-write headers in the buffer */
793		bzero((caddr_t)rwbuf1.rpl_data, sizeof (raid_pwhdr_t));
794		bzero((caddr_t)rwbuf2.rpl_data, sizeof (raid_pwhdr_t));
795
796		if (eachp->rpl_colcnt <= 2)
797			error = raid_rplyeach(un, eachp, &rwbuf1, &rwbuf2);
798		else
799			error = replay_line(un, eachp, &rwbuf1);
800
801		if (error && (error != RAID_RPLY_COMPREPLAY)) {
802			goto replay_failed;
803		}
804
805		/* free the processed replay list entry */
806		rpl_delete(prevp, eachp);
807		prevp = &rplylst;
808	}
809
810	/* zero out all pre-write entries in this unit */
811	for (j = 0; j < un->un_totalcolumncnt; j++) {
812		if (COLUMN_ISUP(un, j)) {
813			colptr = &un->un_column[j];
814			if (init_pw_area(un, colptr->un_dev,
815						colptr->un_pwstart, j))
816				break;
817		}
818	}
819
820	/* deallocate all the buffer resource allocated in this routine */
821	raid_rply_dealloc(un, &rplybuf, &rwbuf1, &rwbuf2);
822
823	return (RAID_RPLY_SUCCESS);
824
825replay_failed:
826
827	/* first release the list */
828	prevp = &rplylst;
829	while ((eachp = *prevp) != NULL) {
830		rpl_delete(prevp, eachp);
831		prevp = &rplylst;
832	}
833
834	/* then release buffers */
835	raid_rply_dealloc(un, &rplybuf, &rwbuf1, &rwbuf2);
836
837	/* also reset the pre-write id variable to one */
838	un->un_pwid = 1;
839	raid_total_rply_entries = 0;
840
841	return (error);
842}
843