rf_reconstruct.c revision 1.98
1/*	$NetBSD: rf_reconstruct.c,v 1.98 2007/07/18 19:04:58 ad Exp $	*/
2/*
3 * Copyright (c) 1995 Carnegie-Mellon University.
4 * All rights reserved.
5 *
6 * Author: Mark Holland
7 *
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
13 *
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 *
18 * Carnegie Mellon requests users of this software to return to
19 *
20 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
21 *  School of Computer Science
22 *  Carnegie Mellon University
23 *  Pittsburgh PA 15213-3890
24 *
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
27 */
28
29/************************************************************
30 *
31 * rf_reconstruct.c -- code to perform on-line reconstruction
32 *
33 ************************************************************/
34
35#include <sys/cdefs.h>
36__KERNEL_RCSID(0, "$NetBSD: rf_reconstruct.c,v 1.98 2007/07/18 19:04:58 ad Exp $");
37
38#include <sys/param.h>
39#include <sys/time.h>
40#include <sys/buf.h>
41#include <sys/errno.h>
42#include <sys/systm.h>
43#include <sys/proc.h>
44#include <sys/ioctl.h>
45#include <sys/fcntl.h>
46#include <sys/vnode.h>
47#include <dev/raidframe/raidframevar.h>
48
49#include "rf_raid.h"
50#include "rf_reconutil.h"
51#include "rf_revent.h"
52#include "rf_reconbuffer.h"
53#include "rf_acctrace.h"
54#include "rf_etimer.h"
55#include "rf_dag.h"
56#include "rf_desc.h"
57#include "rf_debugprint.h"
58#include "rf_general.h"
59#include "rf_driver.h"
60#include "rf_utils.h"
61#include "rf_shutdown.h"
62
63#include "rf_kintf.h"
64
65/* setting these to -1 causes them to be set to their default values if not set by debug options */
66
/*
 * Debug tracing helpers for the reconstruction code.
 *
 * With RF_DEBUG_RECON enabled, DprintfN(fmt, a1..aN) hands the format
 * string and its N arguments to rf_debug_printf() when rf_reconDebug is
 * non-zero; the unused argument slots are passed as NULL.  Without
 * RF_DEBUG_RECON every macro expands to an empty statement and its
 * arguments are never evaluated.
 *
 * Every macro is wrapped in do { ... } while (0) so that it behaves as
 * exactly one statement: "if (x) Dprintf(...); else ..." neither steals
 * the else branch (debug build) nor fails to compile (non-debug build).
 * Arguments are parenthesized before being cast so that an expression
 * such as "buf[0] & 0xff" is converted as a whole.
 */
#if RF_DEBUG_RECON
#define Dprintf(s) \
	do { \
		if (rf_reconDebug) \
			rf_debug_printf(s, NULL, NULL, NULL, NULL, \
			    NULL, NULL, NULL, NULL); \
	} while (0)
#define Dprintf1(s,a) \
	do { \
		if (rf_reconDebug) \
			rf_debug_printf(s, \
			    (void *)((unsigned long)(a)), \
			    NULL, NULL, NULL, NULL, NULL, NULL, NULL); \
	} while (0)
#define Dprintf2(s,a,b) \
	do { \
		if (rf_reconDebug) \
			rf_debug_printf(s, \
			    (void *)((unsigned long)(a)), \
			    (void *)((unsigned long)(b)), \
			    NULL, NULL, NULL, NULL, NULL, NULL); \
	} while (0)
#define Dprintf3(s,a,b,c) \
	do { \
		if (rf_reconDebug) \
			rf_debug_printf(s, \
			    (void *)((unsigned long)(a)), \
			    (void *)((unsigned long)(b)), \
			    (void *)((unsigned long)(c)), \
			    NULL, NULL, NULL, NULL, NULL); \
	} while (0)
#define Dprintf4(s,a,b,c,d) \
	do { \
		if (rf_reconDebug) \
			rf_debug_printf(s, \
			    (void *)((unsigned long)(a)), \
			    (void *)((unsigned long)(b)), \
			    (void *)((unsigned long)(c)), \
			    (void *)((unsigned long)(d)), \
			    NULL, NULL, NULL, NULL); \
	} while (0)
#define Dprintf5(s,a,b,c,d,e) \
	do { \
		if (rf_reconDebug) \
			rf_debug_printf(s, \
			    (void *)((unsigned long)(a)), \
			    (void *)((unsigned long)(b)), \
			    (void *)((unsigned long)(c)), \
			    (void *)((unsigned long)(d)), \
			    (void *)((unsigned long)(e)), \
			    NULL, NULL, NULL); \
	} while (0)
#define Dprintf6(s,a,b,c,d,e,f) \
	do { \
		if (rf_reconDebug) \
			rf_debug_printf(s, \
			    (void *)((unsigned long)(a)), \
			    (void *)((unsigned long)(b)), \
			    (void *)((unsigned long)(c)), \
			    (void *)((unsigned long)(d)), \
			    (void *)((unsigned long)(e)), \
			    (void *)((unsigned long)(f)), \
			    NULL, NULL); \
	} while (0)
#define Dprintf7(s,a,b,c,d,e,f,g) \
	do { \
		if (rf_reconDebug) \
			rf_debug_printf(s, \
			    (void *)((unsigned long)(a)), \
			    (void *)((unsigned long)(b)), \
			    (void *)((unsigned long)(c)), \
			    (void *)((unsigned long)(d)), \
			    (void *)((unsigned long)(e)), \
			    (void *)((unsigned long)(f)), \
			    (void *)((unsigned long)(g)), \
			    NULL); \
	} while (0)

#define DDprintf1(s,a) \
	do { \
		if (rf_reconDebug) \
			rf_debug_printf(s, \
			    (void *)((unsigned long)(a)), \
			    NULL, NULL, NULL, NULL, NULL, NULL, NULL); \
	} while (0)
#define DDprintf2(s,a,b) \
	do { \
		if (rf_reconDebug) \
			rf_debug_printf(s, \
			    (void *)((unsigned long)(a)), \
			    (void *)((unsigned long)(b)), \
			    NULL, NULL, NULL, NULL, NULL, NULL); \
	} while (0)

#else /* RF_DEBUG_RECON */

#define Dprintf(s)			do { } while (0)
#define Dprintf1(s,a)			do { } while (0)
#define Dprintf2(s,a,b)			do { } while (0)
#define Dprintf3(s,a,b,c)		do { } while (0)
#define Dprintf4(s,a,b,c,d)		do { } while (0)
#define Dprintf5(s,a,b,c,d,e)		do { } while (0)
#define Dprintf6(s,a,b,c,d,e,f)		do { } while (0)
#define Dprintf7(s,a,b,c,d,e,f,g)	do { } while (0)

#define DDprintf1(s,a)			do { } while (0)
#define DDprintf2(s,a,b)		do { } while (0)

#endif /* RF_DEBUG_RECON */
95
96#define RF_RECON_DONE_READS   1
97#define RF_RECON_READ_ERROR   2
98#define RF_RECON_WRITE_ERROR  3
99#define RF_RECON_READ_STOPPED 4
100
101#define RF_MAX_FREE_RECONBUFFER 32
102#define RF_MIN_FREE_RECONBUFFER 16
103
104static RF_RaidReconDesc_t *AllocRaidReconDesc(RF_Raid_t *, RF_RowCol_t,
105					      RF_RaidDisk_t *, int, RF_RowCol_t);
106static void FreeReconDesc(RF_RaidReconDesc_t *);
107static int ProcessReconEvent(RF_Raid_t *, RF_ReconEvent_t *);
108static int IssueNextReadRequest(RF_Raid_t *, RF_RowCol_t);
109static int TryToRead(RF_Raid_t *, RF_RowCol_t);
110static int ComputePSDiskOffsets(RF_Raid_t *, RF_StripeNum_t, RF_RowCol_t,
111				RF_SectorNum_t *, RF_SectorNum_t *, RF_RowCol_t *,
112				RF_SectorNum_t *);
113static int IssueNextWriteRequest(RF_Raid_t *);
114static int ReconReadDoneProc(void *, int);
115static int ReconWriteDoneProc(void *, int);
116static void CheckForNewMinHeadSep(RF_Raid_t *, RF_HeadSepLimit_t);
117static int CheckHeadSeparation(RF_Raid_t *, RF_PerDiskReconCtrl_t *,
118			       RF_RowCol_t, RF_HeadSepLimit_t,
119			       RF_ReconUnitNum_t);
120static int CheckForcedOrBlockedReconstruction(RF_Raid_t *,
121					      RF_ReconParityStripeStatus_t *,
122					      RF_PerDiskReconCtrl_t *,
123					      RF_RowCol_t, RF_StripeNum_t,
124					      RF_ReconUnitNum_t);
125static void ForceReconReadDoneProc(void *, int);
126static void rf_ShutdownReconstruction(void *);
127
128struct RF_ReconDoneProc_s {
129	void    (*proc) (RF_Raid_t *, void *);
130	void   *arg;
131	RF_ReconDoneProc_t *next;
132};
133
/**************************************************************************
 *
 * Sets up the parameters that will be used by the reconstruction process.
 * Currently there are none, except for those that the layout-specific
 * configuration routine (e.g. rf_ConfigureDeclustered) sets up.
 *
 * In the kernel, we fire off the recon thread.
 *
 **************************************************************************/
143static void
144rf_ShutdownReconstruction(void *ignored)
145{
146	pool_destroy(&rf_pools.reconbuffer);
147}
148
149int
150rf_ConfigureReconstruction(RF_ShutdownList_t **listp)
151{
152
153	rf_pool_init(&rf_pools.reconbuffer, sizeof(RF_ReconBuffer_t),
154		     "rf_reconbuffer_pl", RF_MIN_FREE_RECONBUFFER, RF_MAX_FREE_RECONBUFFER);
155	rf_ShutdownCreate(listp, rf_ShutdownReconstruction, NULL);
156
157	return (0);
158}
159
160static RF_RaidReconDesc_t *
161AllocRaidReconDesc(RF_Raid_t *raidPtr, RF_RowCol_t col,
162		   RF_RaidDisk_t *spareDiskPtr, int numDisksDone,
163		   RF_RowCol_t scol)
164{
165
166	RF_RaidReconDesc_t *reconDesc;
167
168	RF_Malloc(reconDesc, sizeof(RF_RaidReconDesc_t),
169		  (RF_RaidReconDesc_t *));
170	reconDesc->raidPtr = raidPtr;
171	reconDesc->col = col;
172	reconDesc->spareDiskPtr = spareDiskPtr;
173	reconDesc->numDisksDone = numDisksDone;
174	reconDesc->scol = scol;
175	reconDesc->next = NULL;
176
177	return (reconDesc);
178}
179
180static void
181FreeReconDesc(RF_RaidReconDesc_t *reconDesc)
182{
183#if RF_RECON_STATS > 0
184	printf("raid%d: %lu recon event waits, %lu recon delays\n",
185	       reconDesc->raidPtr->raidid,
186	       (long) reconDesc->numReconEventWaits,
187	       (long) reconDesc->numReconExecDelays);
188#endif				/* RF_RECON_STATS > 0 */
189	printf("raid%d: %lu max exec ticks\n",
190	       reconDesc->raidPtr->raidid,
191	       (long) reconDesc->maxReconExecTicks);
192#if (RF_RECON_STATS > 0) || defined(KERNEL)
193	printf("\n");
194#endif				/* (RF_RECON_STATS > 0) || KERNEL */
195	RF_Free(reconDesc, sizeof(RF_RaidReconDesc_t));
196}
197
198
199/*****************************************************************************
200 *
201 * primary routine to reconstruct a failed disk.  This should be called from
202 * within its own thread.  It won't return until reconstruction completes,
203 * fails, or is aborted.
204 *****************************************************************************/
205int
206rf_ReconstructFailedDisk(RF_Raid_t *raidPtr, RF_RowCol_t col)
207{
208	const RF_LayoutSW_t *lp;
209	int     rc;
210
211	lp = raidPtr->Layout.map;
212	if (lp->SubmitReconBuffer) {
213		/*
214	         * The current infrastructure only supports reconstructing one
215	         * disk at a time for each array.
216	         */
217		RF_LOCK_MUTEX(raidPtr->mutex);
218		while (raidPtr->reconInProgress) {
219			RF_WAIT_COND(raidPtr->waitForReconCond, raidPtr->mutex);
220		}
221		raidPtr->reconInProgress++;
222		RF_UNLOCK_MUTEX(raidPtr->mutex);
223		rc = rf_ReconstructFailedDiskBasic(raidPtr, col);
224		RF_LOCK_MUTEX(raidPtr->mutex);
225		raidPtr->reconInProgress--;
226		RF_UNLOCK_MUTEX(raidPtr->mutex);
227	} else {
228		RF_ERRORMSG1("RECON: no way to reconstruct failed disk for arch %c\n",
229		    lp->parityConfig);
230		rc = EIO;
231	}
232	RF_SIGNAL_COND(raidPtr->waitForReconCond);
233	return (rc);
234}
235
236int
237rf_ReconstructFailedDiskBasic(RF_Raid_t *raidPtr, RF_RowCol_t col)
238{
239	RF_ComponentLabel_t c_label;
240	RF_RaidDisk_t *spareDiskPtr = NULL;
241	RF_RaidReconDesc_t *reconDesc;
242	RF_RowCol_t scol;
243	int     numDisksDone = 0, rc;
244
245	/* first look for a spare drive onto which to reconstruct the data */
246	/* spare disk descriptors are stored in row 0.  This may have to
247	 * change eventually */
248
249	RF_LOCK_MUTEX(raidPtr->mutex);
250	RF_ASSERT(raidPtr->Disks[col].status == rf_ds_failed);
251#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
252	if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
253		if (raidPtr->status != rf_rs_degraded) {
254			RF_ERRORMSG1("Unable to reconstruct disk at col %d because status not degraded\n", col);
255			RF_UNLOCK_MUTEX(raidPtr->mutex);
256			return (EINVAL);
257		}
258		scol = (-1);
259	} else {
260#endif
261		for (scol = raidPtr->numCol; scol < raidPtr->numCol + raidPtr->numSpare; scol++) {
262			if (raidPtr->Disks[scol].status == rf_ds_spare) {
263				spareDiskPtr = &raidPtr->Disks[scol];
264				spareDiskPtr->status = rf_ds_used_spare;
265				break;
266			}
267		}
268		if (!spareDiskPtr) {
269			RF_ERRORMSG1("Unable to reconstruct disk at col %d because no spares are available\n", col);
270			RF_UNLOCK_MUTEX(raidPtr->mutex);
271			return (ENOSPC);
272		}
273		printf("RECON: initiating reconstruction on col %d -> spare at col %d\n", col, scol);
274#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
275	}
276#endif
277	RF_UNLOCK_MUTEX(raidPtr->mutex);
278
279	reconDesc = AllocRaidReconDesc((void *) raidPtr, col, spareDiskPtr, numDisksDone, scol);
280	raidPtr->reconDesc = (void *) reconDesc;
281#if RF_RECON_STATS > 0
282	reconDesc->hsStallCount = 0;
283	reconDesc->numReconExecDelays = 0;
284	reconDesc->numReconEventWaits = 0;
285#endif				/* RF_RECON_STATS > 0 */
286	reconDesc->reconExecTimerRunning = 0;
287	reconDesc->reconExecTicks = 0;
288	reconDesc->maxReconExecTicks = 0;
289	rc = rf_ContinueReconstructFailedDisk(reconDesc);
290
291	if (!rc) {
292		/* fix up the component label */
293		/* Don't actually need the read here.. */
294		raidread_component_label(
295                        raidPtr->raid_cinfo[scol].ci_dev,
296			raidPtr->raid_cinfo[scol].ci_vp,
297			&c_label);
298
299		raid_init_component_label( raidPtr, &c_label);
300		c_label.row = 0;
301		c_label.column = col;
302		c_label.clean = RF_RAID_DIRTY;
303		c_label.status = rf_ds_optimal;
304		c_label.partitionSize = raidPtr->Disks[scol].partitionSize;
305
306		/* We've just done a rebuild based on all the other
307		   disks, so at this point the parity is known to be
308		   clean, even if it wasn't before. */
309
310		/* XXX doesn't hold for RAID 6!!*/
311
312		RF_LOCK_MUTEX(raidPtr->mutex);
313		raidPtr->parity_good = RF_RAID_CLEAN;
314		RF_UNLOCK_MUTEX(raidPtr->mutex);
315
316		/* XXXX MORE NEEDED HERE */
317
318		raidwrite_component_label(
319                        raidPtr->raid_cinfo[scol].ci_dev,
320			raidPtr->raid_cinfo[scol].ci_vp,
321			&c_label);
322
323	} else {
324		/* Reconstruct failed. */
325
326		RF_LOCK_MUTEX(raidPtr->mutex);
327		/* Failed disk goes back to "failed" status */
328		raidPtr->Disks[col].status = rf_ds_failed;
329
330		/* Spare disk goes back to "spare" status. */
331		spareDiskPtr->status = rf_ds_spare;
332		RF_UNLOCK_MUTEX(raidPtr->mutex);
333
334	}
335	rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE);
336	return (rc);
337}
338
339/*
340
341   Allow reconstructing a disk in-place -- i.e. component /dev/sd2e goes AWOL,
342   and you don't get a spare until the next Monday.  With this function
343   (and hot-swappable drives) you can now put your new disk containing
344   /dev/sd2e on the bus, scsictl it alive, and then use raidctl(8) to
345   rebuild the data "on the spot".
346
347*/
348
349int
350rf_ReconstructInPlace(RF_Raid_t *raidPtr, RF_RowCol_t col)
351{
352	RF_RaidDisk_t *spareDiskPtr = NULL;
353	RF_RaidReconDesc_t *reconDesc;
354	const RF_LayoutSW_t *lp;
355	RF_ComponentLabel_t c_label;
356	int     numDisksDone = 0, rc;
357	struct partinfo dpart;
358	struct vnode *vp;
359	struct vattr va;
360	struct lwp *lwp;
361	int retcode;
362	int ac;
363
364	lp = raidPtr->Layout.map;
365	if (!lp->SubmitReconBuffer) {
366		RF_ERRORMSG1("RECON: no way to reconstruct failed disk for arch %c\n",
367			     lp->parityConfig);
368		/* wakeup anyone who might be waiting to do a reconstruct */
369		RF_SIGNAL_COND(raidPtr->waitForReconCond);
370		return(EIO);
371	}
372
373	/*
374	 * The current infrastructure only supports reconstructing one
375	 * disk at a time for each array.
376	 */
377	RF_LOCK_MUTEX(raidPtr->mutex);
378
379	if (raidPtr->Disks[col].status != rf_ds_failed) {
380		/* "It's gone..." */
381		raidPtr->numFailures++;
382		raidPtr->Disks[col].status = rf_ds_failed;
383		raidPtr->status = rf_rs_degraded;
384		RF_UNLOCK_MUTEX(raidPtr->mutex);
385		rf_update_component_labels(raidPtr,
386					   RF_NORMAL_COMPONENT_UPDATE);
387		RF_LOCK_MUTEX(raidPtr->mutex);
388	}
389
390	while (raidPtr->reconInProgress) {
391		RF_WAIT_COND(raidPtr->waitForReconCond, raidPtr->mutex);
392	}
393
394	raidPtr->reconInProgress++;
395
396	/* first look for a spare drive onto which to reconstruct the
397	   data.  spare disk descriptors are stored in row 0.  This
398	   may have to change eventually */
399
400	/* Actually, we don't care if it's failed or not...  On a RAID
401	   set with correct parity, this function should be callable
402	   on any component without ill affects. */
403	/* RF_ASSERT(raidPtr->Disks[col].status == rf_ds_failed); */
404
405#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
406	if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
407		RF_ERRORMSG1("Unable to reconstruct to disk at col %d: operation not supported for RF_DISTRIBUTE_SPARE\n", col);
408
409		raidPtr->reconInProgress--;
410		RF_UNLOCK_MUTEX(raidPtr->mutex);
411		RF_SIGNAL_COND(raidPtr->waitForReconCond);
412		return (EINVAL);
413	}
414#endif
415	lwp = raidPtr->engine_thread;
416
417	/* This device may have been opened successfully the
418	   first time. Close it before trying to open it again.. */
419
420	if (raidPtr->raid_cinfo[col].ci_vp != NULL) {
421#if 0
422		printf("Closed the open device: %s\n",
423		       raidPtr->Disks[col].devname);
424#endif
425		vp = raidPtr->raid_cinfo[col].ci_vp;
426		ac = raidPtr->Disks[col].auto_configured;
427		RF_UNLOCK_MUTEX(raidPtr->mutex);
428		rf_close_component(raidPtr, vp, ac);
429		RF_LOCK_MUTEX(raidPtr->mutex);
430		raidPtr->raid_cinfo[col].ci_vp = NULL;
431	}
432	/* note that this disk was *not* auto_configured (any longer)*/
433	raidPtr->Disks[col].auto_configured = 0;
434
435#if 0
436	printf("About to (re-)open the device for rebuilding: %s\n",
437	       raidPtr->Disks[col].devname);
438#endif
439	RF_UNLOCK_MUTEX(raidPtr->mutex);
440	retcode = dk_lookup(raidPtr->Disks[col].devname, lwp, &vp, UIO_SYSSPACE);
441
442	if (retcode) {
443		printf("raid%d: rebuilding: dk_lookup on device: %s failed: %d!\n",raidPtr->raidid,
444		       raidPtr->Disks[col].devname, retcode);
445
446		/* the component isn't responding properly...
447		   must be still dead :-( */
448		RF_LOCK_MUTEX(raidPtr->mutex);
449		raidPtr->reconInProgress--;
450		RF_UNLOCK_MUTEX(raidPtr->mutex);
451		RF_SIGNAL_COND(raidPtr->waitForReconCond);
452		return(retcode);
453	}
454
455	/* Ok, so we can at least do a lookup...
456	   How about actually getting a vp for it? */
457
458	if ((retcode = VOP_GETATTR(vp, &va, lwp->l_cred, lwp)) != 0) {
459		RF_LOCK_MUTEX(raidPtr->mutex);
460		raidPtr->reconInProgress--;
461		RF_UNLOCK_MUTEX(raidPtr->mutex);
462		RF_SIGNAL_COND(raidPtr->waitForReconCond);
463		return(retcode);
464	}
465
466	retcode = VOP_IOCTL(vp, DIOCGPART, &dpart, FREAD, lwp->l_cred, lwp);
467	if (retcode) {
468		RF_LOCK_MUTEX(raidPtr->mutex);
469		raidPtr->reconInProgress--;
470		RF_UNLOCK_MUTEX(raidPtr->mutex);
471		RF_SIGNAL_COND(raidPtr->waitForReconCond);
472		return(retcode);
473	}
474	RF_LOCK_MUTEX(raidPtr->mutex);
475	raidPtr->Disks[col].blockSize =	dpart.disklab->d_secsize;
476
477	raidPtr->Disks[col].numBlocks = dpart.part->p_size -
478		rf_protectedSectors;
479
480	raidPtr->raid_cinfo[col].ci_vp = vp;
481	raidPtr->raid_cinfo[col].ci_dev = va.va_rdev;
482
483	raidPtr->Disks[col].dev = va.va_rdev;
484
485	/* we allow the user to specify that only a fraction
486	   of the disks should be used this is just for debug:
487	   it speeds up * the parity scan */
488	raidPtr->Disks[col].numBlocks = raidPtr->Disks[col].numBlocks *
489		rf_sizePercentage / 100;
490	RF_UNLOCK_MUTEX(raidPtr->mutex);
491
492	spareDiskPtr = &raidPtr->Disks[col];
493	spareDiskPtr->status = rf_ds_used_spare;
494
495	printf("raid%d: initiating in-place reconstruction on column %d\n",
496	       raidPtr->raidid, col);
497
498	reconDesc = AllocRaidReconDesc((void *) raidPtr, col, spareDiskPtr,
499				       numDisksDone, col);
500	raidPtr->reconDesc = (void *) reconDesc;
501#if RF_RECON_STATS > 0
502	reconDesc->hsStallCount = 0;
503	reconDesc->numReconExecDelays = 0;
504	reconDesc->numReconEventWaits = 0;
505#endif				/* RF_RECON_STATS > 0 */
506	reconDesc->reconExecTimerRunning = 0;
507	reconDesc->reconExecTicks = 0;
508	reconDesc->maxReconExecTicks = 0;
509	rc = rf_ContinueReconstructFailedDisk(reconDesc);
510
511	if (!rc) {
512		RF_LOCK_MUTEX(raidPtr->mutex);
513		/* Need to set these here, as at this point it'll be claiming
514		   that the disk is in rf_ds_spared!  But we know better :-) */
515
516		raidPtr->Disks[col].status = rf_ds_optimal;
517		raidPtr->status = rf_rs_optimal;
518		RF_UNLOCK_MUTEX(raidPtr->mutex);
519
520		/* fix up the component label */
521		/* Don't actually need the read here.. */
522		raidread_component_label(raidPtr->raid_cinfo[col].ci_dev,
523					 raidPtr->raid_cinfo[col].ci_vp,
524					 &c_label);
525
526		RF_LOCK_MUTEX(raidPtr->mutex);
527		raid_init_component_label(raidPtr, &c_label);
528
529		c_label.row = 0;
530		c_label.column = col;
531
532		/* We've just done a rebuild based on all the other
533		   disks, so at this point the parity is known to be
534		   clean, even if it wasn't before. */
535
536		/* XXX doesn't hold for RAID 6!!*/
537
538		raidPtr->parity_good = RF_RAID_CLEAN;
539		RF_UNLOCK_MUTEX(raidPtr->mutex);
540
541		raidwrite_component_label(raidPtr->raid_cinfo[col].ci_dev,
542					  raidPtr->raid_cinfo[col].ci_vp,
543					  &c_label);
544
545	} else {
546		/* Reconstruct-in-place failed.  Disk goes back to
547		   "failed" status, regardless of what it was before.  */
548		RF_LOCK_MUTEX(raidPtr->mutex);
549		raidPtr->Disks[col].status = rf_ds_failed;
550		RF_UNLOCK_MUTEX(raidPtr->mutex);
551	}
552
553	rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE);
554
555	RF_LOCK_MUTEX(raidPtr->mutex);
556	raidPtr->reconInProgress--;
557	RF_UNLOCK_MUTEX(raidPtr->mutex);
558
559	RF_SIGNAL_COND(raidPtr->waitForReconCond);
560	return (rc);
561}
562
563
564int
565rf_ContinueReconstructFailedDisk(RF_RaidReconDesc_t *reconDesc)
566{
567	RF_Raid_t *raidPtr = reconDesc->raidPtr;
568	RF_RowCol_t col = reconDesc->col;
569	RF_RowCol_t scol = reconDesc->scol;
570	RF_ReconMap_t *mapPtr;
571	RF_ReconCtrl_t *tmp_reconctrl;
572	RF_ReconEvent_t *event;
573	RF_CallbackDesc_t *p;
574	struct timeval etime, elpsd;
575	unsigned long xor_s, xor_resid_us;
576	int     i, ds;
577	int status;
578	int recon_error, write_error;
579
580	raidPtr->accumXorTimeUs = 0;
581#if RF_ACC_TRACE > 0
582	/* create one trace record per physical disk */
583	RF_Malloc(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t), (RF_AccTraceEntry_t *));
584#endif
585
586	/* quiesce the array prior to starting recon.  this is needed
587	 * to assure no nasty interactions with pending user writes.
588	 * We need to do this before we change the disk or row status. */
589
590	Dprintf("RECON: begin request suspend\n");
591	rf_SuspendNewRequestsAndWait(raidPtr);
592	Dprintf("RECON: end request suspend\n");
593
594	/* allocate our RF_ReconCTRL_t before we protect raidPtr->reconControl[row] */
595	tmp_reconctrl = rf_MakeReconControl(reconDesc, col, scol);
596
597	RF_LOCK_MUTEX(raidPtr->mutex);
598
599	/* create the reconstruction control pointer and install it in
600	 * the right slot */
601	raidPtr->reconControl = tmp_reconctrl;
602	mapPtr = raidPtr->reconControl->reconMap;
603	raidPtr->reconControl->numRUsTotal = mapPtr->totalRUs;
604	raidPtr->reconControl->numRUsComplete =	0;
605	raidPtr->status = rf_rs_reconstructing;
606	raidPtr->Disks[col].status = rf_ds_reconstructing;
607	raidPtr->Disks[col].spareCol = scol;
608
609	RF_UNLOCK_MUTEX(raidPtr->mutex);
610
611	RF_GETTIME(raidPtr->reconControl->starttime);
612
613	/* now start up the actual reconstruction: issue a read for
614	 * each surviving disk */
615
616	reconDesc->numDisksDone = 0;
617	for (i = 0; i < raidPtr->numCol; i++) {
618		if (i != col) {
619			/* find and issue the next I/O on the
620			 * indicated disk */
621			if (IssueNextReadRequest(raidPtr, i)) {
622				Dprintf1("RECON: done issuing for c%d\n", i);
623				reconDesc->numDisksDone++;
624			}
625		}
626	}
627
628	Dprintf("RECON: resume requests\n");
629	rf_ResumeNewRequests(raidPtr);
630
631	/* process reconstruction events until all disks report that
632	 * they've completed all work */
633
634	mapPtr = raidPtr->reconControl->reconMap;
635	recon_error = 0;
636	write_error = 0;
637
638	while (reconDesc->numDisksDone < raidPtr->numCol - 1) {
639
640		event = rf_GetNextReconEvent(reconDesc);
641		status = ProcessReconEvent(raidPtr, event);
642
643		/* the normal case is that a read completes, and all is well. */
644		if (status == RF_RECON_DONE_READS) {
645			reconDesc->numDisksDone++;
646		} else if ((status == RF_RECON_READ_ERROR) ||
647			   (status == RF_RECON_WRITE_ERROR)) {
648			/* an error was encountered while reconstructing...
649			   Pretend we've finished this disk.
650			*/
651			recon_error = 1;
652			raidPtr->reconControl->error = 1;
653
654			/* bump the numDisksDone count for reads,
655			   but not for writes */
656			if (status == RF_RECON_READ_ERROR)
657				reconDesc->numDisksDone++;
658
659			/* write errors are special -- when we are
660			   done dealing with the reads that are
661			   finished, we don't want to wait for any
662			   writes */
663			if (status == RF_RECON_WRITE_ERROR)
664				write_error = 1;
665
666		} else if (status == RF_RECON_READ_STOPPED) {
667			/* count this component as being "done" */
668			reconDesc->numDisksDone++;
669		}
670
671		if (recon_error) {
672
673			/* make sure any stragglers are woken up so that
674			   their theads will complete, and we can get out
675			   of here with all IO processed */
676
677			while (raidPtr->reconControl->headSepCBList) {
678				p = raidPtr->reconControl->headSepCBList;
679				raidPtr->reconControl->headSepCBList = p->next;
680				p->next = NULL;
681				rf_CauseReconEvent(raidPtr, p->col, NULL, RF_REVENT_HEADSEPCLEAR);
682				rf_FreeCallbackDesc(p);
683			}
684		}
685
686		raidPtr->reconControl->numRUsTotal =
687			mapPtr->totalRUs;
688		raidPtr->reconControl->numRUsComplete =
689			mapPtr->totalRUs -
690			rf_UnitsLeftToReconstruct(mapPtr);
691
692#if RF_DEBUG_RECON
693		raidPtr->reconControl->percentComplete =
694			(raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
695		if (rf_prReconSched) {
696			rf_PrintReconSchedule(raidPtr->reconControl->reconMap, &(raidPtr->reconControl->starttime));
697		}
698#endif
699	}
700
701	mapPtr = raidPtr->reconControl->reconMap;
702	if (rf_reconDebug) {
703		printf("RECON: all reads completed\n");
704	}
705	/* at this point all the reads have completed.  We now wait
706	 * for any pending writes to complete, and then we're done */
707
708	while (!recon_error && rf_UnitsLeftToReconstruct(raidPtr->reconControl->reconMap) > 0) {
709
710		event = rf_GetNextReconEvent(reconDesc);
711		status = ProcessReconEvent(raidPtr, event);
712
713		if (status == RF_RECON_WRITE_ERROR) {
714			recon_error = 1;
715			raidPtr->reconControl->error = 1;
716			/* an error was encountered at the very end... bail */
717		} else {
718#if RF_DEBUG_RECON
719			raidPtr->reconControl->percentComplete = 100 - (rf_UnitsLeftToReconstruct(mapPtr) * 100 / mapPtr->totalRUs);
720			if (rf_prReconSched) {
721				rf_PrintReconSchedule(raidPtr->reconControl->reconMap, &(raidPtr->reconControl->starttime));
722			}
723#endif
724		}
725	}
726
727	if (recon_error) {
728		/* we've encountered an error in reconstructing. */
729		printf("raid%d: reconstruction failed.\n", raidPtr->raidid);
730
731		/* we start by blocking IO to the RAID set. */
732		rf_SuspendNewRequestsAndWait(raidPtr);
733
734		RF_LOCK_MUTEX(raidPtr->mutex);
735		/* mark set as being degraded, rather than
736		   rf_rs_reconstructing as we were before the problem.
737		   After this is done we can update status of the
738		   component disks without worrying about someone
739		   trying to read from a failed component.
740		*/
741		raidPtr->status = rf_rs_degraded;
742		RF_UNLOCK_MUTEX(raidPtr->mutex);
743
744		/* resume IO */
745		rf_ResumeNewRequests(raidPtr);
746
747		/* At this point there are two cases:
748		   1) If we've experienced a read error, then we've
749		   already waited for all the reads we're going to get,
750		   and we just need to wait for the writes.
751
752		   2) If we've experienced a write error, we've also
753		   already waited for all the reads to complete,
754		   but there is little point in waiting for the writes --
755		   when they do complete, they will just be ignored.
756
757		   So we just wait for writes to complete if we didn't have a
758		   write error.
759		*/
760
761		if (!write_error) {
762			/* wait for writes to complete */
763			while (raidPtr->reconControl->pending_writes > 0) {
764
765				event = rf_GetNextReconEvent(reconDesc);
766				status = ProcessReconEvent(raidPtr, event);
767
768				if (status == RF_RECON_WRITE_ERROR) {
769					raidPtr->reconControl->error = 1;
770					/* an error was encountered at the very end... bail.
771					   This will be very bad news for the user, since
772					   at this point there will have been a read error
773					   on one component, and a write error on another!
774					*/
775					break;
776				}
777			}
778		}
779
780
781		/* cleanup */
782
783		/* drain the event queue - after waiting for the writes above,
784		   there shouldn't be much (if anything!) left in the queue. */
785
786		rf_DrainReconEventQueue(reconDesc);
787
788		/* XXX  As much as we'd like to free the recon control structure
789		   and the reconDesc, we have no way of knowing if/when those will
790		   be touched by IO that has yet to occur.  It is rather poor to be
791		   basically causing a 'memory leak' here, but there doesn't seem to be
792		   a cleaner alternative at this time.  Perhaps when the reconstruct code
793		   gets a makeover this problem will go away.
794		*/
795#if 0
796		rf_FreeReconControl(raidPtr);
797#endif
798
799#if RF_ACC_TRACE > 0
800		RF_Free(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t));
801#endif
802		/* XXX see comment above */
803#if 0
804		FreeReconDesc(reconDesc);
805#endif
806
807		return (1);
808	}
809
810	/* Success:  mark the dead disk as reconstructed.  We quiesce
811	 * the array here to assure no nasty interactions with pending
812	 * user accesses when we free up the psstatus structure as
813	 * part of FreeReconControl() */
814
815	rf_SuspendNewRequestsAndWait(raidPtr);
816
817	RF_LOCK_MUTEX(raidPtr->mutex);
818	raidPtr->numFailures--;
819	ds = (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE);
820	raidPtr->Disks[col].status = (ds) ? rf_ds_dist_spared : rf_ds_spared;
821	raidPtr->status = (ds) ? rf_rs_reconfigured : rf_rs_optimal;
822	RF_UNLOCK_MUTEX(raidPtr->mutex);
823	RF_GETTIME(etime);
824	RF_TIMEVAL_DIFF(&(raidPtr->reconControl->starttime), &etime, &elpsd);
825
826	rf_ResumeNewRequests(raidPtr);
827
828	printf("raid%d: Reconstruction of disk at col %d completed\n",
829	       raidPtr->raidid, col);
830	xor_s = raidPtr->accumXorTimeUs / 1000000;
831	xor_resid_us = raidPtr->accumXorTimeUs % 1000000;
832	printf("raid%d: Recon time was %d.%06d seconds, accumulated XOR time was %ld us (%ld.%06ld)\n",
833	       raidPtr->raidid,
834	       (int) elpsd.tv_sec, (int) elpsd.tv_usec,
835	       raidPtr->accumXorTimeUs, xor_s, xor_resid_us);
836	printf("raid%d:  (start time %d sec %d usec, end time %d sec %d usec)\n",
837	       raidPtr->raidid,
838	       (int) raidPtr->reconControl->starttime.tv_sec,
839	       (int) raidPtr->reconControl->starttime.tv_usec,
840	       (int) etime.tv_sec, (int) etime.tv_usec);
841#if RF_RECON_STATS > 0
842	printf("raid%d: Total head-sep stall count was %d\n",
843	       raidPtr->raidid, (int) reconDesc->hsStallCount);
844#endif				/* RF_RECON_STATS > 0 */
845	rf_FreeReconControl(raidPtr);
846#if RF_ACC_TRACE > 0
847	RF_Free(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t));
848#endif
849	FreeReconDesc(reconDesc);
850
851	return (0);
852
853}
854/*****************************************************************************
855 * do the right thing upon each reconstruction event.
856 *****************************************************************************/
857static int
858ProcessReconEvent(RF_Raid_t *raidPtr, RF_ReconEvent_t *event)
859{
860	int     retcode = 0, submitblocked;
861	RF_ReconBuffer_t *rbuf;
862	RF_SectorCount_t sectorsPerRU;
863
864	retcode = RF_RECON_READ_STOPPED;
865
866	Dprintf1("RECON: ProcessReconEvent type %d\n", event->type);
867	switch (event->type) {
868
869		/* a read I/O has completed */
870	case RF_REVENT_READDONE:
871		rbuf = raidPtr->reconControl->perDiskInfo[event->col].rbuf;
872		Dprintf2("RECON: READDONE EVENT: col %d psid %ld\n",
873		    event->col, rbuf->parityStripeID);
874		Dprintf7("RECON: done read  psid %ld buf %lx  %02x %02x %02x %02x %02x\n",
875		    rbuf->parityStripeID, rbuf->buffer, rbuf->buffer[0] & 0xff, rbuf->buffer[1] & 0xff,
876		    rbuf->buffer[2] & 0xff, rbuf->buffer[3] & 0xff, rbuf->buffer[4] & 0xff);
877		rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
878		if (!raidPtr->reconControl->error) {
879			submitblocked = rf_SubmitReconBuffer(rbuf, 0, 0);
880			Dprintf1("RECON: submitblocked=%d\n", submitblocked);
881			if (!submitblocked)
882				retcode = IssueNextReadRequest(raidPtr, event->col);
883			else
884				retcode = 0;
885		}
886		break;
887
888		/* a write I/O has completed */
889	case RF_REVENT_WRITEDONE:
890#if RF_DEBUG_RECON
891		if (rf_floatingRbufDebug) {
892			rf_CheckFloatingRbufCount(raidPtr, 1);
893		}
894#endif
895		sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU;
896		rbuf = (RF_ReconBuffer_t *) event->arg;
897		rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
898		Dprintf3("RECON: WRITEDONE EVENT: psid %d ru %d (%d %% complete)\n",
899		    rbuf->parityStripeID, rbuf->which_ru, raidPtr->reconControl->percentComplete);
900		rf_ReconMapUpdate(raidPtr, raidPtr->reconControl->reconMap,
901		    rbuf->failedDiskSectorOffset, rbuf->failedDiskSectorOffset + sectorsPerRU - 1);
902		rf_RemoveFromActiveReconTable(raidPtr, rbuf->parityStripeID, rbuf->which_ru);
903
904		RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
905		raidPtr->reconControl->pending_writes--;
906		RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);
907
908		if (rbuf->type == RF_RBUF_TYPE_FLOATING) {
909			RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
910			while(raidPtr->reconControl->rb_lock) {
911				ltsleep(&raidPtr->reconControl->rb_lock, PRIBIO, "reconctrlpre1", 0,
912					&raidPtr->reconControl->rb_mutex);
913			}
914			raidPtr->reconControl->rb_lock = 1;
915			RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);
916
917			raidPtr->numFullReconBuffers--;
918			rf_ReleaseFloatingReconBuffer(raidPtr, rbuf);
919
920			RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
921			raidPtr->reconControl->rb_lock = 0;
922			wakeup(&raidPtr->reconControl->rb_lock);
923			RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);
924		} else
925			if (rbuf->type == RF_RBUF_TYPE_FORCED)
926				rf_FreeReconBuffer(rbuf);
927			else
928				RF_ASSERT(0);
929		retcode = 0;
930		break;
931
932	case RF_REVENT_BUFCLEAR:	/* A buffer-stall condition has been
933					 * cleared */
934		Dprintf1("RECON: BUFCLEAR EVENT: col %d\n", event->col);
935		if (!raidPtr->reconControl->error) {
936			submitblocked = rf_SubmitReconBuffer(raidPtr->reconControl->perDiskInfo[event->col].rbuf,
937							     0, (int) (long) event->arg);
938			RF_ASSERT(!submitblocked);	/* we wouldn't have gotten the
939							 * BUFCLEAR event if we
940							 * couldn't submit */
941			retcode = IssueNextReadRequest(raidPtr, event->col);
942		}
943		break;
944
945	case RF_REVENT_BLOCKCLEAR:	/* A user-write reconstruction
946					 * blockage has been cleared */
947		DDprintf1("RECON: BLOCKCLEAR EVENT: col %d\n", event->col);
948		if (!raidPtr->reconControl->error) {
949			retcode = TryToRead(raidPtr, event->col);
950		}
951		break;
952
953	case RF_REVENT_HEADSEPCLEAR:	/* A max-head-separation
954					 * reconstruction blockage has been
955					 * cleared */
956		Dprintf1("RECON: HEADSEPCLEAR EVENT: col %d\n", event->col);
957		if (!raidPtr->reconControl->error) {
958			retcode = TryToRead(raidPtr, event->col);
959		}
960		break;
961
962		/* a buffer has become ready to write */
963	case RF_REVENT_BUFREADY:
964		Dprintf1("RECON: BUFREADY EVENT: col %d\n", event->col);
965		if (!raidPtr->reconControl->error) {
966			retcode = IssueNextWriteRequest(raidPtr);
967#if RF_DEBUG_RECON
968			if (rf_floatingRbufDebug) {
969				rf_CheckFloatingRbufCount(raidPtr, 1);
970			}
971#endif
972		}
973		break;
974
975		/* we need to skip the current RU entirely because it got
976		 * recon'd while we were waiting for something else to happen */
977	case RF_REVENT_SKIP:
978		DDprintf1("RECON: SKIP EVENT: col %d\n", event->col);
979		if (!raidPtr->reconControl->error) {
980			retcode = IssueNextReadRequest(raidPtr, event->col);
981		}
982		break;
983
984		/* a forced-reconstruction read access has completed.  Just
985		 * submit the buffer */
986	case RF_REVENT_FORCEDREADDONE:
987		rbuf = (RF_ReconBuffer_t *) event->arg;
988		rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
989		DDprintf1("RECON: FORCEDREADDONE EVENT: col %d\n", event->col);
990		if (!raidPtr->reconControl->error) {
991			submitblocked = rf_SubmitReconBuffer(rbuf, 1, 0);
992			RF_ASSERT(!submitblocked);
993		}
994		break;
995
996		/* A read I/O failed to complete */
997	case RF_REVENT_READ_FAILED:
998		retcode = RF_RECON_READ_ERROR;
999		break;
1000
1001		/* A write I/O failed to complete */
1002	case RF_REVENT_WRITE_FAILED:
1003		retcode = RF_RECON_WRITE_ERROR;
1004
1005		rbuf = (RF_ReconBuffer_t *) event->arg;
1006
1007		/* cleanup the disk queue data */
1008		rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
1009
1010		/* At this point we're erroring out, badly, and floatingRbufs
1011		   may not even be valid.  Rather than putting this back onto
1012		   the floatingRbufs list, just arrange for its immediate
1013		   destruction.
1014		*/
1015		rf_FreeReconBuffer(rbuf);
1016		break;
1017
1018		/* a forced read I/O failed to complete */
1019	case RF_REVENT_FORCEDREAD_FAILED:
1020		retcode = RF_RECON_READ_ERROR;
1021		break;
1022
1023	default:
1024		RF_PANIC();
1025	}
1026	rf_FreeReconEventDesc(event);
1027	return (retcode);
1028}
1029/*****************************************************************************
1030 *
1031 * find the next thing that's needed on the indicated disk, and issue
1032 * a read request for it.  We assume that the reconstruction buffer
1033 * associated with this process is free to receive the data.  If
1034 * reconstruction is blocked on the indicated RU, we issue a
1035 * blockage-release request instead of a physical disk read request.
1036 * If the current disk gets too far ahead of the others, we issue a
1037 * head-separation wait request and return.
1038 *
1039 * ctrl->{ru_count, curPSID, diskOffset} and
1040 * rbuf->failedDiskSectorOffset are maintained to point to the unit
1041 * we're currently accessing.  Note that this deviates from the
1042 * standard C idiom of having counters point to the next thing to be
1043 * accessed.  This allows us to easily retry when we're blocked by
1044 * head separation or reconstruction-blockage events.
1045 *
1046 *****************************************************************************/
1047static int
1048IssueNextReadRequest(RF_Raid_t *raidPtr, RF_RowCol_t col)
1049{
1050	RF_PerDiskReconCtrl_t *ctrl = &raidPtr->reconControl->perDiskInfo[col];
1051	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
1052	RF_ReconBuffer_t *rbuf = ctrl->rbuf;
1053	RF_ReconUnitCount_t RUsPerPU = layoutPtr->SUsPerPU / layoutPtr->SUsPerRU;
1054	RF_SectorCount_t sectorsPerRU = layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU;
1055	int     do_new_check = 0, retcode = 0, status;
1056
1057	/* if we are currently the slowest disk, mark that we have to do a new
1058	 * check */
1059	if (ctrl->headSepCounter <= raidPtr->reconControl->minHeadSepCounter)
1060		do_new_check = 1;
1061
1062	while (1) {
1063
1064		ctrl->ru_count++;
1065		if (ctrl->ru_count < RUsPerPU) {
1066			ctrl->diskOffset += sectorsPerRU;
1067			rbuf->failedDiskSectorOffset += sectorsPerRU;
1068		} else {
1069			ctrl->curPSID++;
1070			ctrl->ru_count = 0;
1071			/* code left over from when head-sep was based on
1072			 * parity stripe id */
1073			if (ctrl->curPSID >= raidPtr->reconControl->lastPSID) {
1074				CheckForNewMinHeadSep(raidPtr, ++(ctrl->headSepCounter));
1075				return (RF_RECON_DONE_READS);	/* finito! */
1076			}
1077			/* find the disk offsets of the start of the parity
1078			 * stripe on both the current disk and the failed
1079			 * disk. skip this entire parity stripe if either disk
1080			 * does not appear in the indicated PS */
1081			status = ComputePSDiskOffsets(raidPtr, ctrl->curPSID, col, &ctrl->diskOffset, &rbuf->failedDiskSectorOffset,
1082			    &rbuf->spCol, &rbuf->spOffset);
1083			if (status) {
1084				ctrl->ru_count = RUsPerPU - 1;
1085				continue;
1086			}
1087		}
1088		rbuf->which_ru = ctrl->ru_count;
1089
1090		/* skip this RU if it's already been reconstructed */
1091		if (rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, rbuf->failedDiskSectorOffset)) {
1092			Dprintf2("Skipping psid %ld ru %d: already reconstructed\n", ctrl->curPSID, ctrl->ru_count);
1093			continue;
1094		}
1095		break;
1096	}
1097	ctrl->headSepCounter++;
1098	if (do_new_check)
1099		CheckForNewMinHeadSep(raidPtr, ctrl->headSepCounter);	/* update min if needed */
1100
1101
1102	/* at this point, we have definitely decided what to do, and we have
1103	 * only to see if we can actually do it now */
1104	rbuf->parityStripeID = ctrl->curPSID;
1105	rbuf->which_ru = ctrl->ru_count;
1106#if RF_ACC_TRACE > 0
1107	memset((char *) &raidPtr->recon_tracerecs[col], 0,
1108	    sizeof(raidPtr->recon_tracerecs[col]));
1109	raidPtr->recon_tracerecs[col].reconacc = 1;
1110	RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer);
1111#endif
1112	retcode = TryToRead(raidPtr, col);
1113	return (retcode);
1114}
1115
1116/*
1117 * tries to issue the next read on the indicated disk.  We may be
1118 * blocked by (a) the heads being too far apart, or (b) recon on the
1119 * indicated RU being blocked due to a write by a user thread.  In
1120 * this case, we issue a head-sep or blockage wait request, which will
1121 * cause this same routine to be invoked again later when the blockage
1122 * has cleared.
1123 */
1124
1125static int
1126TryToRead(RF_Raid_t *raidPtr, RF_RowCol_t col)
1127{
1128	RF_PerDiskReconCtrl_t *ctrl = &raidPtr->reconControl->perDiskInfo[col];
1129	RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU;
1130	RF_StripeNum_t psid = ctrl->curPSID;
1131	RF_ReconUnitNum_t which_ru = ctrl->ru_count;
1132	RF_DiskQueueData_t *req;
1133	int     status;
1134	RF_ReconParityStripeStatus_t *pssPtr, *newpssPtr;
1135
1136	/* if the current disk is too far ahead of the others, issue a
1137	 * head-separation wait and return */
1138	if (CheckHeadSeparation(raidPtr, ctrl, col, ctrl->headSepCounter, which_ru))
1139		return (0);
1140
1141	/* allocate a new PSS in case we need it */
1142	newpssPtr = rf_AllocPSStatus(raidPtr);
1143
1144	RF_LOCK_PSS_MUTEX(raidPtr, psid);
1145	pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_CREATE, newpssPtr);
1146
1147	if (pssPtr != newpssPtr) {
1148		rf_FreePSStatus(raidPtr, newpssPtr);
1149	}
1150
1151	/* if recon is blocked on the indicated parity stripe, issue a
1152	 * block-wait request and return. this also must mark the indicated RU
1153	 * in the stripe as under reconstruction if not blocked. */
1154	status = CheckForcedOrBlockedReconstruction(raidPtr, pssPtr, ctrl, col, psid, which_ru);
1155	if (status == RF_PSS_RECON_BLOCKED) {
1156		Dprintf2("RECON: Stalling psid %ld ru %d: recon blocked\n", psid, which_ru);
1157		goto out;
1158	} else
1159		if (status == RF_PSS_FORCED_ON_WRITE) {
1160			rf_CauseReconEvent(raidPtr, col, NULL, RF_REVENT_SKIP);
1161			goto out;
1162		}
1163	/* make one last check to be sure that the indicated RU didn't get
1164	 * reconstructed while we were waiting for something else to happen.
1165	 * This is unfortunate in that it causes us to make this check twice
1166	 * in the normal case.  Might want to make some attempt to re-work
1167	 * this so that we only do this check if we've definitely blocked on
1168	 * one of the above checks.  When this condition is detected, we may
1169	 * have just created a bogus status entry, which we need to delete. */
1170	if (rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, ctrl->rbuf->failedDiskSectorOffset)) {
1171		Dprintf2("RECON: Skipping psid %ld ru %d: prior recon after stall\n", psid, which_ru);
1172		if (pssPtr == newpssPtr)
1173			rf_PSStatusDelete(raidPtr, raidPtr->reconControl->pssTable, pssPtr);
1174		rf_CauseReconEvent(raidPtr, col, NULL, RF_REVENT_SKIP);
1175		goto out;
1176	}
1177	/* found something to read.  issue the I/O */
1178	Dprintf4("RECON: Read for psid %ld on col %d offset %ld buf %lx\n",
1179	    psid, col, ctrl->diskOffset, ctrl->rbuf->buffer);
1180#if RF_ACC_TRACE > 0
1181	RF_ETIMER_STOP(raidPtr->recon_tracerecs[col].recon_timer);
1182	RF_ETIMER_EVAL(raidPtr->recon_tracerecs[col].recon_timer);
1183	raidPtr->recon_tracerecs[col].specific.recon.recon_start_to_fetch_us =
1184	    RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[col].recon_timer);
1185	RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer);
1186#endif
1187	/* should be ok to use a NULL proc pointer here, all the bufs we use
1188	 * should be in kernel space */
1189	req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, ctrl->diskOffset, sectorsPerRU, ctrl->rbuf->buffer, psid, which_ru,
1190	    ReconReadDoneProc, (void *) ctrl,
1191#if RF_ACC_TRACE > 0
1192				     &raidPtr->recon_tracerecs[col],
1193#else
1194				     NULL,
1195#endif
1196				     (void *) raidPtr, 0, NULL, PR_WAITOK);
1197
1198	ctrl->rbuf->arg = (void *) req;
1199	rf_DiskIOEnqueue(&raidPtr->Queues[col], req, RF_IO_RECON_PRIORITY);
1200	pssPtr->issued[col] = 1;
1201
1202out:
1203	RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
1204	return (0);
1205}
1206
1207
1208/*
1209 * given a parity stripe ID, we want to find out whether both the
1210 * current disk and the failed disk exist in that parity stripe.  If
1211 * not, we want to skip this whole PS.  If so, we want to find the
1212 * disk offset of the start of the PS on both the current disk and the
1213 * failed disk.
1214 *
1215 * this works by getting a list of disks comprising the indicated
1216 * parity stripe, and searching the list for the current and failed
1217 * disks.  Once we've decided they both exist in the parity stripe, we
1218 * need to decide whether each is data or parity, so that we'll know
1219 * which mapping function to call to get the corresponding disk
1220 * offsets.
1221 *
1222 * this is kind of unpleasant, but doing it this way allows the
1223 * reconstruction code to use parity stripe IDs rather than physical
1224 * disks address to march through the failed disk, which greatly
1225 * simplifies a lot of code, as well as eliminating the need for a
1226 * reverse-mapping function.  I also think it will execute faster,
1227 * since the calls to the mapping module are kept to a minimum.
1228 *
1229 * ASSUMES THAT THE STRIPE IDENTIFIER IDENTIFIES THE DISKS COMPRISING
1230 * THE STRIPE IN THE CORRECT ORDER
1231 *
1232 * raidPtr          - raid descriptor
1233 * psid             - parity stripe identifier
1234 * col              - column of disk to find the offsets for
1235 * spCol            - out: col of spare unit for failed unit
1236 * spOffset         - out: offset into disk containing spare unit
1237 *
1238 */
1239
1240
1241static int
1242ComputePSDiskOffsets(RF_Raid_t *raidPtr, RF_StripeNum_t psid,
1243		     RF_RowCol_t col, RF_SectorNum_t *outDiskOffset,
1244		     RF_SectorNum_t *outFailedDiskSectorOffset,
1245		     RF_RowCol_t *spCol, RF_SectorNum_t *spOffset)
1246{
1247	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
1248	RF_RowCol_t fcol = raidPtr->reconControl->fcol;
1249	RF_RaidAddr_t sosRaidAddress;	/* start-of-stripe */
1250	RF_RowCol_t *diskids;
1251	u_int   i, j, k, i_offset, j_offset;
1252	RF_RowCol_t pcol;
1253	int     testcol;
1254	RF_SectorNum_t poffset;
1255	char    i_is_parity = 0, j_is_parity = 0;
1256	RF_RowCol_t stripeWidth = layoutPtr->numDataCol + layoutPtr->numParityCol;
1257
1258	/* get a listing of the disks comprising that stripe */
1259	sosRaidAddress = rf_ParityStripeIDToRaidAddress(layoutPtr, psid);
1260	(layoutPtr->map->IdentifyStripe) (raidPtr, sosRaidAddress, &diskids);
1261	RF_ASSERT(diskids);
1262
1263	/* reject this entire parity stripe if it does not contain the
1264	 * indicated disk or it does not contain the failed disk */
1265
1266	for (i = 0; i < stripeWidth; i++) {
1267		if (col == diskids[i])
1268			break;
1269	}
1270	if (i == stripeWidth)
1271		goto skipit;
1272	for (j = 0; j < stripeWidth; j++) {
1273		if (fcol == diskids[j])
1274			break;
1275	}
1276	if (j == stripeWidth) {
1277		goto skipit;
1278	}
1279	/* find out which disk the parity is on */
1280	(layoutPtr->map->MapParity) (raidPtr, sosRaidAddress, &pcol, &poffset, RF_DONT_REMAP);
1281
1282	/* find out if either the current RU or the failed RU is parity */
1283	/* also, if the parity occurs in this stripe prior to the data and/or
1284	 * failed col, we need to decrement i and/or j */
1285	for (k = 0; k < stripeWidth; k++)
1286		if (diskids[k] == pcol)
1287			break;
1288	RF_ASSERT(k < stripeWidth);
1289	i_offset = i;
1290	j_offset = j;
1291	if (k < i)
1292		i_offset--;
1293	else
1294		if (k == i) {
1295			i_is_parity = 1;
1296			i_offset = 0;
1297		}		/* set offsets to zero to disable multiply
1298				 * below */
1299	if (k < j)
1300		j_offset--;
1301	else
1302		if (k == j) {
1303			j_is_parity = 1;
1304			j_offset = 0;
1305		}
1306	/* at this point, [ij]_is_parity tells us whether the [current,failed]
1307	 * disk is parity at the start of this RU, and, if data, "[ij]_offset"
1308	 * tells us how far into the stripe the [current,failed] disk is. */
1309
1310	/* call the mapping routine to get the offset into the current disk,
1311	 * repeat for failed disk. */
1312	if (i_is_parity)
1313		layoutPtr->map->MapParity(raidPtr, sosRaidAddress + i_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outDiskOffset, RF_DONT_REMAP);
1314	else
1315		layoutPtr->map->MapSector(raidPtr, sosRaidAddress + i_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outDiskOffset, RF_DONT_REMAP);
1316
1317	RF_ASSERT(col == testcol);
1318
1319	if (j_is_parity)
1320		layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outFailedDiskSectorOffset, RF_DONT_REMAP);
1321	else
1322		layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outFailedDiskSectorOffset, RF_DONT_REMAP);
1323	RF_ASSERT(fcol == testcol);
1324
1325	/* now locate the spare unit for the failed unit */
1326#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
1327	if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) {
1328		if (j_is_parity)
1329			layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, spCol, spOffset, RF_REMAP);
1330		else
1331			layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, spCol, spOffset, RF_REMAP);
1332	} else {
1333#endif
1334		*spCol = raidPtr->reconControl->spareCol;
1335		*spOffset = *outFailedDiskSectorOffset;
1336#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
1337	}
1338#endif
1339	return (0);
1340
1341skipit:
1342	Dprintf2("RECON: Skipping psid %ld: nothing needed from r%d c%d\n",
1343	    psid, col);
1344	return (1);
1345}
1346/* this is called when a buffer has become ready to write to the replacement disk */
1347static int
1348IssueNextWriteRequest(RF_Raid_t *raidPtr)
1349{
1350	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
1351	RF_SectorCount_t sectorsPerRU = layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU;
1352#if RF_ACC_TRACE > 0
1353	RF_RowCol_t fcol = raidPtr->reconControl->fcol;
1354#endif
1355	RF_ReconBuffer_t *rbuf;
1356	RF_DiskQueueData_t *req;
1357
1358	rbuf = rf_GetFullReconBuffer(raidPtr->reconControl);
1359	RF_ASSERT(rbuf);	/* there must be one available, or we wouldn't
1360				 * have gotten the event that sent us here */
1361	RF_ASSERT(rbuf->pssPtr);
1362
1363	rbuf->pssPtr->writeRbuf = rbuf;
1364	rbuf->pssPtr = NULL;
1365
1366	Dprintf6("RECON: New write (c %d offs %d) for psid %ld ru %d (failed disk offset %ld) buf %lx\n",
1367	    rbuf->spCol, rbuf->spOffset, rbuf->parityStripeID,
1368	    rbuf->which_ru, rbuf->failedDiskSectorOffset, rbuf->buffer);
1369	Dprintf6("RECON: new write psid %ld   %02x %02x %02x %02x %02x\n",
1370	    rbuf->parityStripeID, rbuf->buffer[0] & 0xff, rbuf->buffer[1] & 0xff,
1371	    rbuf->buffer[2] & 0xff, rbuf->buffer[3] & 0xff, rbuf->buffer[4] & 0xff);
1372
1373	/* should be ok to use a NULL b_proc here b/c all addrs should be in
1374	 * kernel space */
1375	req = rf_CreateDiskQueueData(RF_IO_TYPE_WRITE, rbuf->spOffset,
1376	    sectorsPerRU, rbuf->buffer,
1377	    rbuf->parityStripeID, rbuf->which_ru,
1378	    ReconWriteDoneProc, (void *) rbuf,
1379#if RF_ACC_TRACE > 0
1380	    &raidPtr->recon_tracerecs[fcol],
1381#else
1382				     NULL,
1383#endif
1384	    (void *) raidPtr, 0, NULL, PR_WAITOK);
1385
1386	rbuf->arg = (void *) req;
1387	RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
1388	raidPtr->reconControl->pending_writes++;
1389	RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);
1390	rf_DiskIOEnqueue(&raidPtr->Queues[rbuf->spCol], req, RF_IO_RECON_PRIORITY);
1391
1392	return (0);
1393}
1394
1395/*
1396 * this gets called upon the completion of a reconstruction read
1397 * operation the arg is a pointer to the per-disk reconstruction
1398 * control structure for the process that just finished a read.
1399 *
1400 * called at interrupt context in the kernel, so don't do anything
1401 * illegal here.
1402 */
1403static int
1404ReconReadDoneProc(void *arg, int status)
1405{
1406	RF_PerDiskReconCtrl_t *ctrl = (RF_PerDiskReconCtrl_t *) arg;
1407	RF_Raid_t *raidPtr;
1408
1409	/* Detect that reconCtrl is no longer valid, and if that
1410	   is the case, bail without calling rf_CauseReconEvent().
1411	   There won't be anyone listening for this event anyway */
1412
1413	if (ctrl->reconCtrl == NULL)
1414		return(0);
1415
1416	raidPtr = ctrl->reconCtrl->reconDesc->raidPtr;
1417
1418	if (status) {
1419		printf("raid%d: Recon read failed!\n", raidPtr->raidid);
1420		rf_CauseReconEvent(raidPtr, ctrl->col, NULL, RF_REVENT_READ_FAILED);
1421		return(0);
1422	}
1423#if RF_ACC_TRACE > 0
1424	RF_ETIMER_STOP(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
1425	RF_ETIMER_EVAL(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
1426	raidPtr->recon_tracerecs[ctrl->col].specific.recon.recon_fetch_to_return_us =
1427	    RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
1428	RF_ETIMER_START(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
1429#endif
1430	rf_CauseReconEvent(raidPtr, ctrl->col, NULL, RF_REVENT_READDONE);
1431	return (0);
1432}
1433/* this gets called upon the completion of a reconstruction write operation.
1434 * the arg is a pointer to the rbuf that was just written
1435 *
1436 * called at interrupt context in the kernel, so don't do anything illegal here.
1437 */
1438static int
1439ReconWriteDoneProc(void *arg, int status)
1440{
1441	RF_ReconBuffer_t *rbuf = (RF_ReconBuffer_t *) arg;
1442
1443	/* Detect that reconControl is no longer valid, and if that
1444	   is the case, bail without calling rf_CauseReconEvent().
1445	   There won't be anyone listening for this event anyway */
1446
1447	if (rbuf->raidPtr->reconControl == NULL)
1448		return(0);
1449
1450	Dprintf2("Reconstruction completed on psid %ld ru %d\n", rbuf->parityStripeID, rbuf->which_ru);
1451	if (status) {
1452		printf("raid%d: Recon write failed!\n", rbuf->raidPtr->raidid);
1453		rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, arg, RF_REVENT_WRITE_FAILED);
1454		return(0);
1455	}
1456	rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, arg, RF_REVENT_WRITEDONE);
1457	return (0);
1458}
1459
1460
1461/*
1462 * computes a new minimum head sep, and wakes up anyone who needs to
1463 * be woken as a result
1464 */
1465static void
1466CheckForNewMinHeadSep(RF_Raid_t *raidPtr, RF_HeadSepLimit_t hsCtr)
1467{
1468	RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl;
1469	RF_HeadSepLimit_t new_min;
1470	RF_RowCol_t i;
1471	RF_CallbackDesc_t *p;
1472	RF_ASSERT(hsCtr >= reconCtrlPtr->minHeadSepCounter);	/* from the definition
1473								 * of a minimum */
1474
1475
1476	RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
1477	while(reconCtrlPtr->rb_lock) {
1478		ltsleep(&reconCtrlPtr->rb_lock, PRIBIO, "reconctlcnmhs", 0, &reconCtrlPtr->rb_mutex);
1479	}
1480	reconCtrlPtr->rb_lock = 1;
1481	RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
1482
1483	new_min = ~(1L << (8 * sizeof(long) - 1));	/* 0x7FFF....FFF */
1484	for (i = 0; i < raidPtr->numCol; i++)
1485		if (i != reconCtrlPtr->fcol) {
1486			if (reconCtrlPtr->perDiskInfo[i].headSepCounter < new_min)
1487				new_min = reconCtrlPtr->perDiskInfo[i].headSepCounter;
1488		}
1489	/* set the new minimum and wake up anyone who can now run again */
1490	if (new_min != reconCtrlPtr->minHeadSepCounter) {
1491		reconCtrlPtr->minHeadSepCounter = new_min;
1492		Dprintf1("RECON:  new min head pos counter val is %ld\n", new_min);
1493		while (reconCtrlPtr->headSepCBList) {
1494			if (reconCtrlPtr->headSepCBList->callbackArg.v > new_min)
1495				break;
1496			p = reconCtrlPtr->headSepCBList;
1497			reconCtrlPtr->headSepCBList = p->next;
1498			p->next = NULL;
1499			rf_CauseReconEvent(raidPtr, p->col, NULL, RF_REVENT_HEADSEPCLEAR);
1500			rf_FreeCallbackDesc(p);
1501		}
1502
1503	}
1504	RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
1505	reconCtrlPtr->rb_lock = 0;
1506	wakeup(&reconCtrlPtr->rb_lock);
1507	RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
1508}
1509
1510/*
1511 * checks to see that the maximum head separation will not be violated
1512 * if we initiate a reconstruction I/O on the indicated disk.
1513 * Limiting the maximum head separation between two disks eliminates
1514 * the nasty buffer-stall conditions that occur when one disk races
1515 * ahead of the others and consumes all of the floating recon buffers.
1516 * This code is complex and unpleasant but it's necessary to avoid
1517 * some very nasty, albeit fairly rare, reconstruction behavior.
1518 *
1519 * returns non-zero if and only if we have to stop working on the
1520 * indicated disk due to a head-separation delay.
1521 */
1522static int
1523CheckHeadSeparation(RF_Raid_t *raidPtr, RF_PerDiskReconCtrl_t *ctrl,
1524		    RF_RowCol_t col, RF_HeadSepLimit_t hsCtr,
1525		    RF_ReconUnitNum_t which_ru)
1526{
1527	RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl;
1528	RF_CallbackDesc_t *cb, *p, *pt;
1529	int     retval = 0;
1530
1531	/* if we're too far ahead of the slowest disk, stop working on this
1532	 * disk until the slower ones catch up.  We do this by scheduling a
1533	 * wakeup callback for the time when the slowest disk has caught up.
1534	 * We define "caught up" with 20% hysteresis, i.e. the head separation
1535	 * must have fallen to at most 80% of the max allowable head
1536	 * separation before we'll wake up.
1537	 *
1538	 */
1539	RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
1540	while(reconCtrlPtr->rb_lock) {
1541		ltsleep(&reconCtrlPtr->rb_lock, PRIBIO, "reconctlchs", 0, &reconCtrlPtr->rb_mutex);
1542	}
1543	reconCtrlPtr->rb_lock = 1;
1544	RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
1545	if ((raidPtr->headSepLimit >= 0) &&
1546	    ((ctrl->headSepCounter - reconCtrlPtr->minHeadSepCounter) > raidPtr->headSepLimit)) {
1547		Dprintf5("raid%d: RECON: head sep stall: col %d hsCtr %ld minHSCtr %ld limit %ld\n",
1548			 raidPtr->raidid, col, ctrl->headSepCounter,
1549			 reconCtrlPtr->minHeadSepCounter,
1550			 raidPtr->headSepLimit);
1551		cb = rf_AllocCallbackDesc();
1552		/* the minHeadSepCounter value we have to get to before we'll
1553		 * wake up.  build in 20% hysteresis. */
1554		cb->callbackArg.v = (ctrl->headSepCounter - raidPtr->headSepLimit + raidPtr->headSepLimit / 5);
1555		cb->col = col;
1556		cb->next = NULL;
1557
1558		/* insert this callback descriptor into the sorted list of
1559		 * pending head-sep callbacks */
1560		p = reconCtrlPtr->headSepCBList;
1561		if (!p)
1562			reconCtrlPtr->headSepCBList = cb;
1563		else
1564			if (cb->callbackArg.v < p->callbackArg.v) {
1565				cb->next = reconCtrlPtr->headSepCBList;
1566				reconCtrlPtr->headSepCBList = cb;
1567			} else {
1568				for (pt = p, p = p->next; p && (p->callbackArg.v < cb->callbackArg.v); pt = p, p = p->next);
1569				cb->next = p;
1570				pt->next = cb;
1571			}
1572		retval = 1;
1573#if RF_RECON_STATS > 0
1574		ctrl->reconCtrl->reconDesc->hsStallCount++;
1575#endif				/* RF_RECON_STATS > 0 */
1576	}
1577	RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
1578	reconCtrlPtr->rb_lock = 0;
1579	wakeup(&reconCtrlPtr->rb_lock);
1580	RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
1581
1582	return (retval);
1583}
1584/*
1585 * checks to see if reconstruction has been either forced or blocked
1586 * by a user operation.  if forced, we skip this RU entirely.  else if
1587 * blocked, put ourselves on the wait list.  else return 0.
1588 *
1589 * ASSUMES THE PSS MUTEX IS LOCKED UPON ENTRY
1590 */
1591static int
1592CheckForcedOrBlockedReconstruction(RF_Raid_t *raidPtr,
1593				   RF_ReconParityStripeStatus_t *pssPtr,
1594				   RF_PerDiskReconCtrl_t *ctrl,
1595				   RF_RowCol_t col,
1596				   RF_StripeNum_t psid,
1597				   RF_ReconUnitNum_t which_ru)
1598{
1599	RF_CallbackDesc_t *cb;
1600	int     retcode = 0;
1601
1602	if ((pssPtr->flags & RF_PSS_FORCED_ON_READ) || (pssPtr->flags & RF_PSS_FORCED_ON_WRITE))
1603		retcode = RF_PSS_FORCED_ON_WRITE;
1604	else
1605		if (pssPtr->flags & RF_PSS_RECON_BLOCKED) {
1606			Dprintf3("RECON: col %d blocked at psid %ld ru %d\n", col, psid, which_ru);
1607			cb = rf_AllocCallbackDesc();	/* append ourselves to
1608							 * the blockage-wait
1609							 * list */
1610			cb->col = col;
1611			cb->next = pssPtr->blockWaitList;
1612			pssPtr->blockWaitList = cb;
1613			retcode = RF_PSS_RECON_BLOCKED;
1614		}
1615	if (!retcode)
1616		pssPtr->flags |= RF_PSS_UNDER_RECON;	/* mark this RU as under
1617							 * reconstruction */
1618
1619	return (retcode);
1620}
1621/*
1622 * if reconstruction is currently ongoing for the indicated stripeID,
1623 * reconstruction is forced to completion and we return non-zero to
1624 * indicate that the caller must wait.  If not, then reconstruction is
1625 * blocked on the indicated stripe and the routine returns zero.  If
1626 * and only if we return non-zero, we'll cause the cbFunc to get
1627 * invoked with the cbArg when the reconstruction has completed.
1628 */
1629int
1630rf_ForceOrBlockRecon(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
1631		     void (*cbFunc)(RF_Raid_t *, void *), void *cbArg)
1632{
1633	RF_StripeNum_t stripeID = asmap->stripeID;	/* the stripe ID we're
1634							 * forcing recon on */
1635	RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU;	/* num sects in one RU */
1636	RF_ReconParityStripeStatus_t *pssPtr, *newpssPtr;	/* a pointer to the parity
1637						 * stripe status structure */
1638	RF_StripeNum_t psid;	/* parity stripe id */
1639	RF_SectorNum_t offset, fd_offset;	/* disk offset, failed-disk
1640						 * offset */
1641	RF_RowCol_t *diskids;
1642	RF_ReconUnitNum_t which_ru;	/* RU within parity stripe */
1643	RF_RowCol_t fcol, diskno, i;
1644	RF_ReconBuffer_t *new_rbuf;	/* ptr to newly allocated rbufs */
1645	RF_DiskQueueData_t *req;/* disk I/O req to be enqueued */
1646	RF_CallbackDesc_t *cb;
1647	int     nPromoted;
1648
1649	psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID, &which_ru);
1650
1651	/* allocate a new PSS in case we need it */
1652        newpssPtr = rf_AllocPSStatus(raidPtr);
1653
1654	RF_LOCK_PSS_MUTEX(raidPtr, psid);
1655
1656	pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_CREATE | RF_PSS_RECON_BLOCKED, newpssPtr);
1657
1658        if (pssPtr != newpssPtr) {
1659                rf_FreePSStatus(raidPtr, newpssPtr);
1660        }
1661
1662	/* if recon is not ongoing on this PS, just return */
1663	if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) {
1664		RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
1665		return (0);
1666	}
1667	/* otherwise, we have to wait for reconstruction to complete on this
1668	 * RU. */
1669	/* In order to avoid waiting for a potentially large number of
1670	 * low-priority accesses to complete, we force a normal-priority (i.e.
1671	 * not low-priority) reconstruction on this RU. */
1672	if (!(pssPtr->flags & RF_PSS_FORCED_ON_WRITE) && !(pssPtr->flags & RF_PSS_FORCED_ON_READ)) {
1673		DDprintf1("Forcing recon on psid %ld\n", psid);
1674		pssPtr->flags |= RF_PSS_FORCED_ON_WRITE;	/* mark this RU as under
1675								 * forced recon */
1676		pssPtr->flags &= ~RF_PSS_RECON_BLOCKED;	/* clear the blockage
1677							 * that we just set */
1678		fcol = raidPtr->reconControl->fcol;
1679
1680		/* get a listing of the disks comprising the indicated stripe */
1681		(raidPtr->Layout.map->IdentifyStripe) (raidPtr, asmap->raidAddress, &diskids);
1682
1683		/* For previously issued reads, elevate them to normal
1684		 * priority.  If the I/O has already completed, it won't be
1685		 * found in the queue, and hence this will be a no-op. For
1686		 * unissued reads, allocate buffers and issue new reads.  The
1687		 * fact that we've set the FORCED bit means that the regular
1688		 * recon procs will not re-issue these reqs */
1689		for (i = 0; i < raidPtr->Layout.numDataCol + raidPtr->Layout.numParityCol; i++)
1690			if ((diskno = diskids[i]) != fcol) {
1691				if (pssPtr->issued[diskno]) {
1692					nPromoted = rf_DiskIOPromote(&raidPtr->Queues[diskno], psid, which_ru);
1693					if (rf_reconDebug && nPromoted)
1694						printf("raid%d: promoted read from col %d\n", raidPtr->raidid, diskno);
1695				} else {
1696					new_rbuf = rf_MakeReconBuffer(raidPtr, diskno, RF_RBUF_TYPE_FORCED);	/* create new buf */
1697					ComputePSDiskOffsets(raidPtr, psid, diskno, &offset, &fd_offset,
1698					    &new_rbuf->spCol, &new_rbuf->spOffset);	/* find offsets & spare
1699													 * location */
1700					new_rbuf->parityStripeID = psid;	/* fill in the buffer */
1701					new_rbuf->which_ru = which_ru;
1702					new_rbuf->failedDiskSectorOffset = fd_offset;
1703					new_rbuf->priority = RF_IO_NORMAL_PRIORITY;
1704
1705					/* use NULL b_proc b/c all addrs
1706					 * should be in kernel space */
1707					req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, offset + which_ru * sectorsPerRU, sectorsPerRU, new_rbuf->buffer,
1708					    psid, which_ru, (int (*) (void *, int)) ForceReconReadDoneProc, (void *) new_rbuf,
1709					    NULL, (void *) raidPtr, 0, NULL, PR_WAITOK);
1710
1711					new_rbuf->arg = req;
1712					rf_DiskIOEnqueue(&raidPtr->Queues[diskno], req, RF_IO_NORMAL_PRIORITY);	/* enqueue the I/O */
1713					Dprintf2("raid%d: Issued new read req on col %d\n", raidPtr->raidid, diskno);
1714				}
1715			}
1716		/* if the write is sitting in the disk queue, elevate its
1717		 * priority */
1718		if (rf_DiskIOPromote(&raidPtr->Queues[fcol], psid, which_ru))
1719			printf("raid%d: promoted write to col %d\n",
1720			       raidPtr->raidid, fcol);
1721	}
1722	/* install a callback descriptor to be invoked when recon completes on
1723	 * this parity stripe. */
1724	cb = rf_AllocCallbackDesc();
1725	/* XXX the following is bogus.. These functions don't really match!!
1726	 * GO */
1727	cb->callbackFunc = (void (*) (RF_CBParam_t)) cbFunc;
1728	cb->callbackArg.p = (void *) cbArg;
1729	cb->next = pssPtr->procWaitList;
1730	pssPtr->procWaitList = cb;
1731	DDprintf2("raid%d: Waiting for forced recon on psid %ld\n",
1732		  raidPtr->raidid, psid);
1733
1734	RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
1735	return (1);
1736}
1737/* called upon the completion of a forced reconstruction read.
1738 * all we do is schedule the FORCEDREADONE event.
1739 * called at interrupt context in the kernel, so don't do anything illegal here.
1740 */
1741static void
1742ForceReconReadDoneProc(void *arg, int status)
1743{
1744	RF_ReconBuffer_t *rbuf = arg;
1745
1746	/* Detect that reconControl is no longer valid, and if that
1747	   is the case, bail without calling rf_CauseReconEvent().
1748	   There won't be anyone listening for this event anyway */
1749
1750	if (rbuf->raidPtr->reconControl == NULL)
1751		return;
1752
1753	if (status) {
1754		printf("raid%d: Forced recon read failed!\n", rbuf->raidPtr->raidid);
1755		rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, (void *) rbuf, RF_REVENT_FORCEDREAD_FAILED);
1756		return;
1757	}
1758	rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, (void *) rbuf, RF_REVENT_FORCEDREADDONE);
1759}
1760/* releases a block on the reconstruction of the indicated stripe */
1761int
1762rf_UnblockRecon(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap)
1763{
1764	RF_StripeNum_t stripeID = asmap->stripeID;
1765	RF_ReconParityStripeStatus_t *pssPtr;
1766	RF_ReconUnitNum_t which_ru;
1767	RF_StripeNum_t psid;
1768	RF_CallbackDesc_t *cb;
1769
1770	psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID, &which_ru);
1771	RF_LOCK_PSS_MUTEX(raidPtr, psid);
1772	pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_NONE, NULL);
1773
1774	/* When recon is forced, the pss desc can get deleted before we get
1775	 * back to unblock recon. But, this can _only_ happen when recon is
1776	 * forced. It would be good to put some kind of sanity check here, but
1777	 * how to decide if recon was just forced or not? */
1778	if (!pssPtr) {
1779		/* printf("Warning: no pss descriptor upon unblock on psid %ld
1780		 * RU %d\n",psid,which_ru); */
1781#if (RF_DEBUG_RECON > 0) || (RF_DEBUG_PSS > 0)
1782		if (rf_reconDebug || rf_pssDebug)
1783			printf("Warning: no pss descriptor upon unblock on psid %ld RU %d\n", (long) psid, which_ru);
1784#endif
1785		goto out;
1786	}
1787	pssPtr->blockCount--;
1788	Dprintf3("raid%d: unblocking recon on psid %ld: blockcount is %d\n",
1789		 raidPtr->raidid, psid, pssPtr->blockCount);
1790	if (pssPtr->blockCount == 0) {	/* if recon blockage has been released */
1791
1792		/* unblock recon before calling CauseReconEvent in case
1793		 * CauseReconEvent causes us to try to issue a new read before
1794		 * returning here. */
1795		pssPtr->flags &= ~RF_PSS_RECON_BLOCKED;
1796
1797
1798		while (pssPtr->blockWaitList) {
1799			/* spin through the block-wait list and
1800			   release all the waiters */
1801			cb = pssPtr->blockWaitList;
1802			pssPtr->blockWaitList = cb->next;
1803			cb->next = NULL;
1804			rf_CauseReconEvent(raidPtr, cb->col, NULL, RF_REVENT_BLOCKCLEAR);
1805			rf_FreeCallbackDesc(cb);
1806		}
1807		if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) {
1808			/* if no recon was requested while recon was blocked */
1809			rf_PSStatusDelete(raidPtr, raidPtr->reconControl->pssTable, pssPtr);
1810		}
1811	}
1812out:
1813	RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
1814	return (0);
1815}
1816