rf_reconstruct.c revision 1.105
1/*	$NetBSD: rf_reconstruct.c,v 1.105 2008/09/23 21:36:35 oster Exp $	*/
2/*
3 * Copyright (c) 1995 Carnegie-Mellon University.
4 * All rights reserved.
5 *
6 * Author: Mark Holland
7 *
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
13 *
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 *
18 * Carnegie Mellon requests users of this software to return to
19 *
20 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
21 *  School of Computer Science
22 *  Carnegie Mellon University
23 *  Pittsburgh PA 15213-3890
24 *
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
27 */
28
29/************************************************************
30 *
31 * rf_reconstruct.c -- code to perform on-line reconstruction
32 *
33 ************************************************************/
34
35#include <sys/cdefs.h>
36__KERNEL_RCSID(0, "$NetBSD: rf_reconstruct.c,v 1.105 2008/09/23 21:36:35 oster Exp $");
37
38#include <sys/param.h>
39#include <sys/time.h>
40#include <sys/buf.h>
41#include <sys/errno.h>
42#include <sys/systm.h>
43#include <sys/proc.h>
44#include <sys/ioctl.h>
45#include <sys/fcntl.h>
46#include <sys/vnode.h>
47#include <dev/raidframe/raidframevar.h>
48
49#include "rf_raid.h"
50#include "rf_reconutil.h"
51#include "rf_revent.h"
52#include "rf_reconbuffer.h"
53#include "rf_acctrace.h"
54#include "rf_etimer.h"
55#include "rf_dag.h"
56#include "rf_desc.h"
57#include "rf_debugprint.h"
58#include "rf_general.h"
59#include "rf_driver.h"
60#include "rf_utils.h"
61#include "rf_shutdown.h"
62
63#include "rf_kintf.h"
64
65/* setting these to -1 causes them to be set to their default values if not set by debug options */
66
/*
 * Debug printing helpers for the reconstruction code.
 *
 * DprintfN(s, a1..aN) forwards the format string and N arguments to
 * rf_debug_printf(), padding the unused argument slots with NULL, but only
 * when rf_reconDebug is set.  When RF_DEBUG_RECON is off every helper
 * expands to an empty statement and its arguments are not evaluated.
 *
 * Every helper is wrapped in do { } while (0) so that it expands to exactly
 * one statement: it can then be used as the body of an unbraced if/else
 * without capturing the else (debug build) or producing a syntax error
 * (non-debug build).  Callers must terminate each use with a semicolon.
 */
#if RF_DEBUG_RECON

/* Convert one argument to the pointer form rf_debug_printf() is handed;
 * the argument is parenthesized so expressions survive the cast intact. */
#define RF_RECON_DARG(a)	((void *)((unsigned long)(a)))

#define Dprintf(s) \
	do { if (rf_reconDebug) rf_debug_printf(s, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL); } while (0)
#define Dprintf1(s,a) \
	do { if (rf_reconDebug) rf_debug_printf(s, RF_RECON_DARG(a), NULL, NULL, NULL, NULL, NULL, NULL, NULL); } while (0)
#define Dprintf2(s,a,b) \
	do { if (rf_reconDebug) rf_debug_printf(s, RF_RECON_DARG(a), RF_RECON_DARG(b), NULL, NULL, NULL, NULL, NULL, NULL); } while (0)
#define Dprintf3(s,a,b,c) \
	do { if (rf_reconDebug) rf_debug_printf(s, RF_RECON_DARG(a), RF_RECON_DARG(b), RF_RECON_DARG(c), NULL, NULL, NULL, NULL, NULL); } while (0)
#define Dprintf4(s,a,b,c,d) \
	do { if (rf_reconDebug) rf_debug_printf(s, RF_RECON_DARG(a), RF_RECON_DARG(b), RF_RECON_DARG(c), RF_RECON_DARG(d), NULL, NULL, NULL, NULL); } while (0)
#define Dprintf5(s,a,b,c,d,e) \
	do { if (rf_reconDebug) rf_debug_printf(s, RF_RECON_DARG(a), RF_RECON_DARG(b), RF_RECON_DARG(c), RF_RECON_DARG(d), RF_RECON_DARG(e), NULL, NULL, NULL); } while (0)
#define Dprintf6(s,a,b,c,d,e,f) \
	do { if (rf_reconDebug) rf_debug_printf(s, RF_RECON_DARG(a), RF_RECON_DARG(b), RF_RECON_DARG(c), RF_RECON_DARG(d), RF_RECON_DARG(e), RF_RECON_DARG(f), NULL, NULL); } while (0)
#define Dprintf7(s,a,b,c,d,e,f,g) \
	do { if (rf_reconDebug) rf_debug_printf(s, RF_RECON_DARG(a), RF_RECON_DARG(b), RF_RECON_DARG(c), RF_RECON_DARG(d), RF_RECON_DARG(e), RF_RECON_DARG(f), RF_RECON_DARG(g), NULL); } while (0)

#define DDprintf1(s,a) \
	do { if (rf_reconDebug) rf_debug_printf(s, RF_RECON_DARG(a), NULL, NULL, NULL, NULL, NULL, NULL, NULL); } while (0)
#define DDprintf2(s,a,b) \
	do { if (rf_reconDebug) rf_debug_printf(s, RF_RECON_DARG(a), RF_RECON_DARG(b), NULL, NULL, NULL, NULL, NULL, NULL); } while (0)

#else /* RF_DEBUG_RECON */

#define Dprintf(s)			do { } while (0)
#define Dprintf1(s,a)			do { } while (0)
#define Dprintf2(s,a,b)			do { } while (0)
#define Dprintf3(s,a,b,c)		do { } while (0)
#define Dprintf4(s,a,b,c,d)		do { } while (0)
#define Dprintf5(s,a,b,c,d,e)		do { } while (0)
#define Dprintf6(s,a,b,c,d,e,f)		do { } while (0)
#define Dprintf7(s,a,b,c,d,e,f,g)	do { } while (0)

#define DDprintf1(s,a)			do { } while (0)
#define DDprintf2(s,a,b)		do { } while (0)

#endif /* RF_DEBUG_RECON */
95
/*
 * Status codes handed back by ProcessReconEvent() and interpreted by the
 * event loops in rf_ContinueReconstructFailedDisk().
 */
#define RF_RECON_DONE_READS	1	/* counted as one more disk finished */
#define RF_RECON_READ_ERROR	2	/* flags a recon error; disk counted as finished */
#define RF_RECON_WRITE_ERROR	3	/* flags a recon error; remaining writes not waited for */
#define RF_RECON_READ_STOPPED	4	/* counted as one more disk finished */
#define RF_RECON_WRITE_DONE	5	/* counted as one completed write */

/*
 * Sizing values passed to rf_pool_init() for the recon buffer pool
 * (see rf_ConfigureReconstruction()).
 */
#define RF_MAX_FREE_RECONBUFFER	32
#define RF_MIN_FREE_RECONBUFFER	16
104
105static RF_RaidReconDesc_t *AllocRaidReconDesc(RF_Raid_t *, RF_RowCol_t,
106					      RF_RaidDisk_t *, int, RF_RowCol_t);
107static void FreeReconDesc(RF_RaidReconDesc_t *);
108static int ProcessReconEvent(RF_Raid_t *, RF_ReconEvent_t *);
109static int IssueNextReadRequest(RF_Raid_t *, RF_RowCol_t);
110static int TryToRead(RF_Raid_t *, RF_RowCol_t);
111static int ComputePSDiskOffsets(RF_Raid_t *, RF_StripeNum_t, RF_RowCol_t,
112				RF_SectorNum_t *, RF_SectorNum_t *, RF_RowCol_t *,
113				RF_SectorNum_t *);
114static int IssueNextWriteRequest(RF_Raid_t *);
115static int ReconReadDoneProc(void *, int);
116static int ReconWriteDoneProc(void *, int);
117static void CheckForNewMinHeadSep(RF_Raid_t *, RF_HeadSepLimit_t);
118static int CheckHeadSeparation(RF_Raid_t *, RF_PerDiskReconCtrl_t *,
119			       RF_RowCol_t, RF_HeadSepLimit_t,
120			       RF_ReconUnitNum_t);
121static int CheckForcedOrBlockedReconstruction(RF_Raid_t *,
122					      RF_ReconParityStripeStatus_t *,
123					      RF_PerDiskReconCtrl_t *,
124					      RF_RowCol_t, RF_StripeNum_t,
125					      RF_ReconUnitNum_t);
126static void ForceReconReadDoneProc(void *, int);
127static void rf_ShutdownReconstruction(void *);
128
129struct RF_ReconDoneProc_s {
130	void    (*proc) (RF_Raid_t *, void *);
131	void   *arg;
132	RF_ReconDoneProc_t *next;
133};
134
135/**************************************************************************
136 *
137 * sets up the parameters that will be used by the reconstruction process
138 * currently there are none, except for those that the layout-specific
139 * configuration (e.g. rf_ConfigureDeclustered) routine sets up.
140 *
141 * in the kernel, we fire off the recon thread.
142 *
143 **************************************************************************/
144static void
145rf_ShutdownReconstruction(void *ignored)
146{
147	pool_destroy(&rf_pools.reconbuffer);
148}
149
150int
151rf_ConfigureReconstruction(RF_ShutdownList_t **listp)
152{
153
154	rf_pool_init(&rf_pools.reconbuffer, sizeof(RF_ReconBuffer_t),
155		     "rf_reconbuffer_pl", RF_MIN_FREE_RECONBUFFER, RF_MAX_FREE_RECONBUFFER);
156	rf_ShutdownCreate(listp, rf_ShutdownReconstruction, NULL);
157
158	return (0);
159}
160
161static RF_RaidReconDesc_t *
162AllocRaidReconDesc(RF_Raid_t *raidPtr, RF_RowCol_t col,
163		   RF_RaidDisk_t *spareDiskPtr, int numDisksDone,
164		   RF_RowCol_t scol)
165{
166
167	RF_RaidReconDesc_t *reconDesc;
168
169	RF_Malloc(reconDesc, sizeof(RF_RaidReconDesc_t),
170		  (RF_RaidReconDesc_t *));
171	reconDesc->raidPtr = raidPtr;
172	reconDesc->col = col;
173	reconDesc->spareDiskPtr = spareDiskPtr;
174	reconDesc->numDisksDone = numDisksDone;
175	reconDesc->scol = scol;
176	reconDesc->next = NULL;
177
178	return (reconDesc);
179}
180
181static void
182FreeReconDesc(RF_RaidReconDesc_t *reconDesc)
183{
184#if RF_RECON_STATS > 0
185	printf("raid%d: %lu recon event waits, %lu recon delays\n",
186	       reconDesc->raidPtr->raidid,
187	       (long) reconDesc->numReconEventWaits,
188	       (long) reconDesc->numReconExecDelays);
189#endif				/* RF_RECON_STATS > 0 */
190	printf("raid%d: %lu max exec ticks\n",
191	       reconDesc->raidPtr->raidid,
192	       (long) reconDesc->maxReconExecTicks);
193	RF_Free(reconDesc, sizeof(RF_RaidReconDesc_t));
194}
195
196
197/*****************************************************************************
198 *
199 * primary routine to reconstruct a failed disk.  This should be called from
200 * within its own thread.  It won't return until reconstruction completes,
201 * fails, or is aborted.
202 *****************************************************************************/
203int
204rf_ReconstructFailedDisk(RF_Raid_t *raidPtr, RF_RowCol_t col)
205{
206	const RF_LayoutSW_t *lp;
207	int     rc;
208
209	lp = raidPtr->Layout.map;
210	if (lp->SubmitReconBuffer) {
211		/*
212	         * The current infrastructure only supports reconstructing one
213	         * disk at a time for each array.
214	         */
215		RF_LOCK_MUTEX(raidPtr->mutex);
216		while (raidPtr->reconInProgress) {
217			RF_WAIT_COND(raidPtr->waitForReconCond, raidPtr->mutex);
218		}
219		raidPtr->reconInProgress++;
220		RF_UNLOCK_MUTEX(raidPtr->mutex);
221		rc = rf_ReconstructFailedDiskBasic(raidPtr, col);
222		RF_LOCK_MUTEX(raidPtr->mutex);
223		raidPtr->reconInProgress--;
224		RF_UNLOCK_MUTEX(raidPtr->mutex);
225	} else {
226		RF_ERRORMSG1("RECON: no way to reconstruct failed disk for arch %c\n",
227		    lp->parityConfig);
228		rc = EIO;
229	}
230	RF_SIGNAL_COND(raidPtr->waitForReconCond);
231	return (rc);
232}
233
234int
235rf_ReconstructFailedDiskBasic(RF_Raid_t *raidPtr, RF_RowCol_t col)
236{
237	RF_ComponentLabel_t c_label;
238	RF_RaidDisk_t *spareDiskPtr = NULL;
239	RF_RaidReconDesc_t *reconDesc;
240	RF_RowCol_t scol;
241	int     numDisksDone = 0, rc;
242
243	/* first look for a spare drive onto which to reconstruct the data */
244	/* spare disk descriptors are stored in row 0.  This may have to
245	 * change eventually */
246
247	RF_LOCK_MUTEX(raidPtr->mutex);
248	RF_ASSERT(raidPtr->Disks[col].status == rf_ds_failed);
249#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
250	if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
251		if (raidPtr->status != rf_rs_degraded) {
252			RF_ERRORMSG1("Unable to reconstruct disk at col %d because status not degraded\n", col);
253			RF_UNLOCK_MUTEX(raidPtr->mutex);
254			return (EINVAL);
255		}
256		scol = (-1);
257	} else {
258#endif
259		for (scol = raidPtr->numCol; scol < raidPtr->numCol + raidPtr->numSpare; scol++) {
260			if (raidPtr->Disks[scol].status == rf_ds_spare) {
261				spareDiskPtr = &raidPtr->Disks[scol];
262				spareDiskPtr->status = rf_ds_used_spare;
263				break;
264			}
265		}
266		if (!spareDiskPtr) {
267			RF_ERRORMSG1("Unable to reconstruct disk at col %d because no spares are available\n", col);
268			RF_UNLOCK_MUTEX(raidPtr->mutex);
269			return (ENOSPC);
270		}
271		printf("RECON: initiating reconstruction on col %d -> spare at col %d\n", col, scol);
272#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
273	}
274#endif
275	RF_UNLOCK_MUTEX(raidPtr->mutex);
276
277	reconDesc = AllocRaidReconDesc((void *) raidPtr, col, spareDiskPtr, numDisksDone, scol);
278	raidPtr->reconDesc = (void *) reconDesc;
279#if RF_RECON_STATS > 0
280	reconDesc->hsStallCount = 0;
281	reconDesc->numReconExecDelays = 0;
282	reconDesc->numReconEventWaits = 0;
283#endif				/* RF_RECON_STATS > 0 */
284	reconDesc->reconExecTimerRunning = 0;
285	reconDesc->reconExecTicks = 0;
286	reconDesc->maxReconExecTicks = 0;
287	rc = rf_ContinueReconstructFailedDisk(reconDesc);
288
289	if (!rc) {
290		/* fix up the component label */
291		/* Don't actually need the read here.. */
292		raidread_component_label(
293                        raidPtr->raid_cinfo[scol].ci_dev,
294			raidPtr->raid_cinfo[scol].ci_vp,
295			&c_label);
296
297		raid_init_component_label( raidPtr, &c_label);
298		c_label.row = 0;
299		c_label.column = col;
300		c_label.clean = RF_RAID_DIRTY;
301		c_label.status = rf_ds_optimal;
302		c_label.partitionSize = raidPtr->Disks[scol].partitionSize;
303
304		/* We've just done a rebuild based on all the other
305		   disks, so at this point the parity is known to be
306		   clean, even if it wasn't before. */
307
308		/* XXX doesn't hold for RAID 6!!*/
309
310		RF_LOCK_MUTEX(raidPtr->mutex);
311		raidPtr->parity_good = RF_RAID_CLEAN;
312		RF_UNLOCK_MUTEX(raidPtr->mutex);
313
314		/* XXXX MORE NEEDED HERE */
315
316		raidwrite_component_label(
317                        raidPtr->raid_cinfo[scol].ci_dev,
318			raidPtr->raid_cinfo[scol].ci_vp,
319			&c_label);
320
321	} else {
322		/* Reconstruct failed. */
323
324		RF_LOCK_MUTEX(raidPtr->mutex);
325		/* Failed disk goes back to "failed" status */
326		raidPtr->Disks[col].status = rf_ds_failed;
327
328		/* Spare disk goes back to "spare" status. */
329		spareDiskPtr->status = rf_ds_spare;
330		RF_UNLOCK_MUTEX(raidPtr->mutex);
331
332	}
333	rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE);
334	return (rc);
335}
336
337/*
338
339   Allow reconstructing a disk in-place -- i.e. component /dev/sd2e goes AWOL,
340   and you don't get a spare until the next Monday.  With this function
341   (and hot-swappable drives) you can now put your new disk containing
342   /dev/sd2e on the bus, scsictl it alive, and then use raidctl(8) to
343   rebuild the data "on the spot".
344
345*/
346
347int
348rf_ReconstructInPlace(RF_Raid_t *raidPtr, RF_RowCol_t col)
349{
350	RF_RaidDisk_t *spareDiskPtr = NULL;
351	RF_RaidReconDesc_t *reconDesc;
352	const RF_LayoutSW_t *lp;
353	RF_ComponentLabel_t c_label;
354	int     numDisksDone = 0, rc;
355	struct partinfo dpart;
356	struct vnode *vp;
357	struct vattr va;
358	int retcode;
359	int ac;
360
361	lp = raidPtr->Layout.map;
362	if (!lp->SubmitReconBuffer) {
363		RF_ERRORMSG1("RECON: no way to reconstruct failed disk for arch %c\n",
364			     lp->parityConfig);
365		/* wakeup anyone who might be waiting to do a reconstruct */
366		RF_SIGNAL_COND(raidPtr->waitForReconCond);
367		return(EIO);
368	}
369
370	/*
371	 * The current infrastructure only supports reconstructing one
372	 * disk at a time for each array.
373	 */
374	RF_LOCK_MUTEX(raidPtr->mutex);
375
376	if (raidPtr->Disks[col].status != rf_ds_failed) {
377		/* "It's gone..." */
378		raidPtr->numFailures++;
379		raidPtr->Disks[col].status = rf_ds_failed;
380		raidPtr->status = rf_rs_degraded;
381		RF_UNLOCK_MUTEX(raidPtr->mutex);
382		rf_update_component_labels(raidPtr,
383					   RF_NORMAL_COMPONENT_UPDATE);
384		RF_LOCK_MUTEX(raidPtr->mutex);
385	}
386
387	while (raidPtr->reconInProgress) {
388		RF_WAIT_COND(raidPtr->waitForReconCond, raidPtr->mutex);
389	}
390
391	raidPtr->reconInProgress++;
392
393	/* first look for a spare drive onto which to reconstruct the
394	   data.  spare disk descriptors are stored in row 0.  This
395	   may have to change eventually */
396
397	/* Actually, we don't care if it's failed or not...  On a RAID
398	   set with correct parity, this function should be callable
399	   on any component without ill effects. */
400	/* RF_ASSERT(raidPtr->Disks[col].status == rf_ds_failed); */
401
402#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
403	if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
404		RF_ERRORMSG1("Unable to reconstruct to disk at col %d: operation not supported for RF_DISTRIBUTE_SPARE\n", col);
405
406		raidPtr->reconInProgress--;
407		RF_UNLOCK_MUTEX(raidPtr->mutex);
408		RF_SIGNAL_COND(raidPtr->waitForReconCond);
409		return (EINVAL);
410	}
411#endif
412
413	/* This device may have been opened successfully the
414	   first time. Close it before trying to open it again.. */
415
416	if (raidPtr->raid_cinfo[col].ci_vp != NULL) {
417#if 0
418		printf("Closed the open device: %s\n",
419		       raidPtr->Disks[col].devname);
420#endif
421		vp = raidPtr->raid_cinfo[col].ci_vp;
422		ac = raidPtr->Disks[col].auto_configured;
423		RF_UNLOCK_MUTEX(raidPtr->mutex);
424		rf_close_component(raidPtr, vp, ac);
425		RF_LOCK_MUTEX(raidPtr->mutex);
426		raidPtr->raid_cinfo[col].ci_vp = NULL;
427	}
428	/* note that this disk was *not* auto_configured (any longer)*/
429	raidPtr->Disks[col].auto_configured = 0;
430
431#if 0
432	printf("About to (re-)open the device for rebuilding: %s\n",
433	       raidPtr->Disks[col].devname);
434#endif
435	RF_UNLOCK_MUTEX(raidPtr->mutex);
436	retcode = dk_lookup(raidPtr->Disks[col].devname, curlwp, &vp, UIO_SYSSPACE);
437
438	if (retcode) {
439		printf("raid%d: rebuilding: dk_lookup on device: %s failed: %d!\n",raidPtr->raidid,
440		       raidPtr->Disks[col].devname, retcode);
441
442		/* the component isn't responding properly...
443		   must be still dead :-( */
444		RF_LOCK_MUTEX(raidPtr->mutex);
445		raidPtr->reconInProgress--;
446		RF_UNLOCK_MUTEX(raidPtr->mutex);
447		RF_SIGNAL_COND(raidPtr->waitForReconCond);
448		return(retcode);
449	}
450
451	/* Ok, so we can at least do a lookup...
452	   How about actually getting a vp for it? */
453
454	if ((retcode = VOP_GETATTR(vp, &va, curlwp->l_cred)) != 0) {
455		RF_LOCK_MUTEX(raidPtr->mutex);
456		raidPtr->reconInProgress--;
457		RF_UNLOCK_MUTEX(raidPtr->mutex);
458		RF_SIGNAL_COND(raidPtr->waitForReconCond);
459		return(retcode);
460	}
461
462	retcode = VOP_IOCTL(vp, DIOCGPART, &dpart, FREAD, curlwp->l_cred);
463	if (retcode) {
464		RF_LOCK_MUTEX(raidPtr->mutex);
465		raidPtr->reconInProgress--;
466		RF_UNLOCK_MUTEX(raidPtr->mutex);
467		RF_SIGNAL_COND(raidPtr->waitForReconCond);
468		return(retcode);
469	}
470	RF_LOCK_MUTEX(raidPtr->mutex);
471	raidPtr->Disks[col].blockSize =	dpart.disklab->d_secsize;
472
473	raidPtr->Disks[col].numBlocks = dpart.part->p_size -
474		rf_protectedSectors;
475
476	raidPtr->raid_cinfo[col].ci_vp = vp;
477	raidPtr->raid_cinfo[col].ci_dev = va.va_rdev;
478
479	raidPtr->Disks[col].dev = va.va_rdev;
480
481	/* we allow the user to specify that only a fraction
482	   of the disks should be used this is just for debug:
483	   it speeds up * the parity scan */
484	raidPtr->Disks[col].numBlocks = raidPtr->Disks[col].numBlocks *
485		rf_sizePercentage / 100;
486	RF_UNLOCK_MUTEX(raidPtr->mutex);
487
488	spareDiskPtr = &raidPtr->Disks[col];
489	spareDiskPtr->status = rf_ds_used_spare;
490
491	printf("raid%d: initiating in-place reconstruction on column %d\n",
492	       raidPtr->raidid, col);
493
494	reconDesc = AllocRaidReconDesc((void *) raidPtr, col, spareDiskPtr,
495				       numDisksDone, col);
496	raidPtr->reconDesc = (void *) reconDesc;
497#if RF_RECON_STATS > 0
498	reconDesc->hsStallCount = 0;
499	reconDesc->numReconExecDelays = 0;
500	reconDesc->numReconEventWaits = 0;
501#endif				/* RF_RECON_STATS > 0 */
502	reconDesc->reconExecTimerRunning = 0;
503	reconDesc->reconExecTicks = 0;
504	reconDesc->maxReconExecTicks = 0;
505	rc = rf_ContinueReconstructFailedDisk(reconDesc);
506
507	if (!rc) {
508		RF_LOCK_MUTEX(raidPtr->mutex);
509		/* Need to set these here, as at this point it'll be claiming
510		   that the disk is in rf_ds_spared!  But we know better :-) */
511
512		raidPtr->Disks[col].status = rf_ds_optimal;
513		raidPtr->status = rf_rs_optimal;
514		RF_UNLOCK_MUTEX(raidPtr->mutex);
515
516		/* fix up the component label */
517		/* Don't actually need the read here.. */
518		raidread_component_label(raidPtr->raid_cinfo[col].ci_dev,
519					 raidPtr->raid_cinfo[col].ci_vp,
520					 &c_label);
521
522		RF_LOCK_MUTEX(raidPtr->mutex);
523		raid_init_component_label(raidPtr, &c_label);
524
525		c_label.row = 0;
526		c_label.column = col;
527
528		/* We've just done a rebuild based on all the other
529		   disks, so at this point the parity is known to be
530		   clean, even if it wasn't before. */
531
532		/* XXX doesn't hold for RAID 6!!*/
533
534		raidPtr->parity_good = RF_RAID_CLEAN;
535		RF_UNLOCK_MUTEX(raidPtr->mutex);
536
537		raidwrite_component_label(raidPtr->raid_cinfo[col].ci_dev,
538					  raidPtr->raid_cinfo[col].ci_vp,
539					  &c_label);
540
541	} else {
542		/* Reconstruct-in-place failed.  Disk goes back to
543		   "failed" status, regardless of what it was before.  */
544		RF_LOCK_MUTEX(raidPtr->mutex);
545		raidPtr->Disks[col].status = rf_ds_failed;
546		RF_UNLOCK_MUTEX(raidPtr->mutex);
547	}
548
549	rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE);
550
551	RF_LOCK_MUTEX(raidPtr->mutex);
552	raidPtr->reconInProgress--;
553	RF_UNLOCK_MUTEX(raidPtr->mutex);
554
555	RF_SIGNAL_COND(raidPtr->waitForReconCond);
556	return (rc);
557}
558
559
560int
561rf_ContinueReconstructFailedDisk(RF_RaidReconDesc_t *reconDesc)
562{
563	RF_Raid_t *raidPtr = reconDesc->raidPtr;
564	RF_RowCol_t col = reconDesc->col;
565	RF_RowCol_t scol = reconDesc->scol;
566	RF_ReconMap_t *mapPtr;
567	RF_ReconCtrl_t *tmp_reconctrl;
568	RF_ReconEvent_t *event;
569	RF_StripeCount_t incPSID,lastPSID,num_writes,pending_writes,prev;
570	RF_ReconUnitCount_t RUsPerPU;
571	struct timeval etime, elpsd;
572	unsigned long xor_s, xor_resid_us;
573	int     i, ds;
574	int status, done;
575	int recon_error, write_error;
576
577	raidPtr->accumXorTimeUs = 0;
578#if RF_ACC_TRACE > 0
579	/* create one trace record per physical disk */
580	RF_Malloc(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t), (RF_AccTraceEntry_t *));
581#endif
582
583	/* quiesce the array prior to starting recon.  this is needed
584	 * to assure no nasty interactions with pending user writes.
585	 * We need to do this before we change the disk or row status. */
586
587	Dprintf("RECON: begin request suspend\n");
588	rf_SuspendNewRequestsAndWait(raidPtr);
589	Dprintf("RECON: end request suspend\n");
590
591	/* allocate our RF_ReconCTRL_t before we protect raidPtr->reconControl[row] */
592	tmp_reconctrl = rf_MakeReconControl(reconDesc, col, scol);
593
594	RF_LOCK_MUTEX(raidPtr->mutex);
595
596	/* create the reconstruction control pointer and install it in
597	 * the right slot */
598	raidPtr->reconControl = tmp_reconctrl;
599	mapPtr = raidPtr->reconControl->reconMap;
600	raidPtr->reconControl->numRUsTotal = mapPtr->totalRUs;
601	raidPtr->reconControl->numRUsComplete =	0;
602	raidPtr->status = rf_rs_reconstructing;
603	raidPtr->Disks[col].status = rf_ds_reconstructing;
604	raidPtr->Disks[col].spareCol = scol;
605
606	RF_UNLOCK_MUTEX(raidPtr->mutex);
607
608	RF_GETTIME(raidPtr->reconControl->starttime);
609
610	Dprintf("RECON: resume requests\n");
611	rf_ResumeNewRequests(raidPtr);
612
613
614	mapPtr = raidPtr->reconControl->reconMap;
615
616	incPSID = RF_RECONMAP_SIZE;
617	lastPSID = raidPtr->Layout.numStripe / raidPtr->Layout.SUsPerPU;
618	RUsPerPU = raidPtr->Layout.SUsPerPU / raidPtr->Layout.SUsPerRU;
619	recon_error = 0;
620	write_error = 0;
621	pending_writes = incPSID;
622	raidPtr->reconControl->lastPSID = incPSID;
623
624	/* start the actual reconstruction */
625
626	done = 0;
627	while (!done) {
628
629		num_writes = 0;
630
631		/* issue a read for each surviving disk */
632
633		reconDesc->numDisksDone = 0;
634		for (i = 0; i < raidPtr->numCol; i++) {
635			if (i != col) {
636				/* find and issue the next I/O on the
637				 * indicated disk */
638				if (IssueNextReadRequest(raidPtr, i)) {
639					Dprintf1("RECON: done issuing for c%d\n", i);
640					reconDesc->numDisksDone++;
641				}
642			}
643		}
644
645		/* process reconstruction events until all disks report that
646		 * they've completed all work */
647
648		while (reconDesc->numDisksDone < raidPtr->numCol - 1) {
649
650			event = rf_GetNextReconEvent(reconDesc);
651			status = ProcessReconEvent(raidPtr, event);
652
653			/* the normal case is that a read completes, and all is well. */
654			if (status == RF_RECON_DONE_READS) {
655				reconDesc->numDisksDone++;
656			} else if ((status == RF_RECON_READ_ERROR) ||
657				   (status == RF_RECON_WRITE_ERROR)) {
658				/* an error was encountered while reconstructing...
659				   Pretend we've finished this disk.
660				*/
661				recon_error = 1;
662				raidPtr->reconControl->error = 1;
663
664				/* bump the numDisksDone count for reads,
665				   but not for writes */
666				if (status == RF_RECON_READ_ERROR)
667					reconDesc->numDisksDone++;
668
669				/* write errors are special -- when we are
670				   done dealing with the reads that are
671				   finished, we don't want to wait for any
672				   writes */
673				if (status == RF_RECON_WRITE_ERROR)
674					write_error = 1;
675
676			} else if (status == RF_RECON_READ_STOPPED) {
677				/* count this component as being "done" */
678				reconDesc->numDisksDone++;
679			} else if (status == RF_RECON_WRITE_DONE) {
680				num_writes++;
681			}
682
683			if (recon_error) {
684				/* make sure any stragglers are woken up so that
685				   their theads will complete, and we can get out
686				   of here with all IO processed */
687
688				rf_WakeupHeadSepCBWaiters(raidPtr);
689			}
690
691			raidPtr->reconControl->numRUsTotal =
692				mapPtr->totalRUs;
693			raidPtr->reconControl->numRUsComplete =
694				mapPtr->totalRUs -
695				rf_UnitsLeftToReconstruct(mapPtr);
696
697#if RF_DEBUG_RECON
698			raidPtr->reconControl->percentComplete =
699				(raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
700			if (rf_prReconSched) {
701				rf_PrintReconSchedule(raidPtr->reconControl->reconMap, &(raidPtr->reconControl->starttime));
702			}
703#endif
704		}
705
706		/* reads done, wakup any waiters, and then wait for writes */
707
708		rf_WakeupHeadSepCBWaiters(raidPtr);
709
710		while (!recon_error && (num_writes < pending_writes)) {
711			event = rf_GetNextReconEvent(reconDesc);
712			status = ProcessReconEvent(raidPtr, event);
713
714			if (status == RF_RECON_WRITE_ERROR) {
715				recon_error = 1;
716				raidPtr->reconControl->error = 1;
717				/* an error was encountered at the very end... bail */
718			} else if (status == RF_RECON_WRITE_DONE) {
719				num_writes++;
720			}
721		}
722		if (recon_error ||
723		    (raidPtr->reconControl->lastPSID == lastPSID)) {
724			done = 1;
725			break;
726		}
727
728		prev = raidPtr->reconControl->lastPSID;
729		raidPtr->reconControl->lastPSID += incPSID;
730
731		if (raidPtr->reconControl->lastPSID > lastPSID) {
732			pending_writes = lastPSID - prev;
733			raidPtr->reconControl->lastPSID = lastPSID;
734		}
735
736		/* back down curPSID to get ready for the next round... */
737		for (i = 0; i < raidPtr->numCol; i++) {
738			if (i != col) {
739				raidPtr->reconControl->perDiskInfo[i].curPSID--;
740				raidPtr->reconControl->perDiskInfo[i].ru_count = RUsPerPU - 1;
741			}
742		}
743	}
744
745	mapPtr = raidPtr->reconControl->reconMap;
746	if (rf_reconDebug) {
747		printf("RECON: all reads completed\n");
748	}
749	/* at this point all the reads have completed.  We now wait
750	 * for any pending writes to complete, and then we're done */
751
752	while (!recon_error && rf_UnitsLeftToReconstruct(raidPtr->reconControl->reconMap) > 0) {
753
754		event = rf_GetNextReconEvent(reconDesc);
755		status = ProcessReconEvent(raidPtr, event);
756
757		if (status == RF_RECON_WRITE_ERROR) {
758			recon_error = 1;
759			raidPtr->reconControl->error = 1;
760			/* an error was encountered at the very end... bail */
761		} else {
762#if RF_DEBUG_RECON
763			raidPtr->reconControl->percentComplete = 100 - (rf_UnitsLeftToReconstruct(mapPtr) * 100 / mapPtr->totalRUs);
764			if (rf_prReconSched) {
765				rf_PrintReconSchedule(raidPtr->reconControl->reconMap, &(raidPtr->reconControl->starttime));
766			}
767#endif
768		}
769	}
770
771	if (recon_error) {
772		/* we've encountered an error in reconstructing. */
773		printf("raid%d: reconstruction failed.\n", raidPtr->raidid);
774
775		/* we start by blocking IO to the RAID set. */
776		rf_SuspendNewRequestsAndWait(raidPtr);
777
778		RF_LOCK_MUTEX(raidPtr->mutex);
779		/* mark set as being degraded, rather than
780		   rf_rs_reconstructing as we were before the problem.
781		   After this is done we can update status of the
782		   component disks without worrying about someone
783		   trying to read from a failed component.
784		*/
785		raidPtr->status = rf_rs_degraded;
786		RF_UNLOCK_MUTEX(raidPtr->mutex);
787
788		/* resume IO */
789		rf_ResumeNewRequests(raidPtr);
790
791		/* At this point there are two cases:
792		   1) If we've experienced a read error, then we've
793		   already waited for all the reads we're going to get,
794		   and we just need to wait for the writes.
795
796		   2) If we've experienced a write error, we've also
797		   already waited for all the reads to complete,
798		   but there is little point in waiting for the writes --
799		   when they do complete, they will just be ignored.
800
801		   So we just wait for writes to complete if we didn't have a
802		   write error.
803		*/
804
805		if (!write_error) {
806			/* wait for writes to complete */
807			while (raidPtr->reconControl->pending_writes > 0) {
808
809				event = rf_GetNextReconEvent(reconDesc);
810				status = ProcessReconEvent(raidPtr, event);
811
812				if (status == RF_RECON_WRITE_ERROR) {
813					raidPtr->reconControl->error = 1;
814					/* an error was encountered at the very end... bail.
815					   This will be very bad news for the user, since
816					   at this point there will have been a read error
817					   on one component, and a write error on another!
818					*/
819					break;
820				}
821			}
822		}
823
824
825		/* cleanup */
826
827		/* drain the event queue - after waiting for the writes above,
828		   there shouldn't be much (if anything!) left in the queue. */
829
830		rf_DrainReconEventQueue(reconDesc);
831
832		/* XXX  As much as we'd like to free the recon control structure
833		   and the reconDesc, we have no way of knowing if/when those will
834		   be touched by IO that has yet to occur.  It is rather poor to be
835		   basically causing a 'memory leak' here, but there doesn't seem to be
836		   a cleaner alternative at this time.  Perhaps when the reconstruct code
837		   gets a makeover this problem will go away.
838		*/
839#if 0
840		rf_FreeReconControl(raidPtr);
841#endif
842
843#if RF_ACC_TRACE > 0
844		RF_Free(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t));
845#endif
846		/* XXX see comment above */
847#if 0
848		FreeReconDesc(reconDesc);
849#endif
850
851		return (1);
852	}
853
854	/* Success:  mark the dead disk as reconstructed.  We quiesce
855	 * the array here to assure no nasty interactions with pending
856	 * user accesses when we free up the psstatus structure as
857	 * part of FreeReconControl() */
858
859	rf_SuspendNewRequestsAndWait(raidPtr);
860
861	RF_LOCK_MUTEX(raidPtr->mutex);
862	raidPtr->numFailures--;
863	ds = (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE);
864	raidPtr->Disks[col].status = (ds) ? rf_ds_dist_spared : rf_ds_spared;
865	raidPtr->status = (ds) ? rf_rs_reconfigured : rf_rs_optimal;
866	RF_UNLOCK_MUTEX(raidPtr->mutex);
867	RF_GETTIME(etime);
868	RF_TIMEVAL_DIFF(&(raidPtr->reconControl->starttime), &etime, &elpsd);
869
870	rf_ResumeNewRequests(raidPtr);
871
872	printf("raid%d: Reconstruction of disk at col %d completed\n",
873	       raidPtr->raidid, col);
874	xor_s = raidPtr->accumXorTimeUs / 1000000;
875	xor_resid_us = raidPtr->accumXorTimeUs % 1000000;
876	printf("raid%d: Recon time was %d.%06d seconds, accumulated XOR time was %ld us (%ld.%06ld)\n",
877	       raidPtr->raidid,
878	       (int) elpsd.tv_sec, (int) elpsd.tv_usec,
879	       raidPtr->accumXorTimeUs, xor_s, xor_resid_us);
880	printf("raid%d:  (start time %d sec %d usec, end time %d sec %d usec)\n",
881	       raidPtr->raidid,
882	       (int) raidPtr->reconControl->starttime.tv_sec,
883	       (int) raidPtr->reconControl->starttime.tv_usec,
884	       (int) etime.tv_sec, (int) etime.tv_usec);
885#if RF_RECON_STATS > 0
886	printf("raid%d: Total head-sep stall count was %d\n",
887	       raidPtr->raidid, (int) reconDesc->hsStallCount);
888#endif				/* RF_RECON_STATS > 0 */
889	rf_FreeReconControl(raidPtr);
890#if RF_ACC_TRACE > 0
891	RF_Free(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t));
892#endif
893	FreeReconDesc(reconDesc);
894
895	return (0);
896
897}
898/*****************************************************************************
899 * do the right thing upon each reconstruction event.
900 *****************************************************************************/
901static int
902ProcessReconEvent(RF_Raid_t *raidPtr, RF_ReconEvent_t *event)
903{
904	int     retcode = 0, submitblocked;
905	RF_ReconBuffer_t *rbuf;
906	RF_SectorCount_t sectorsPerRU;
907
908	retcode = RF_RECON_READ_STOPPED;
909
910	Dprintf1("RECON: ProcessReconEvent type %d\n", event->type);
911
912	switch (event->type) {
913
914		/* a read I/O has completed */
915	case RF_REVENT_READDONE:
916		rbuf = raidPtr->reconControl->perDiskInfo[event->col].rbuf;
917		Dprintf2("RECON: READDONE EVENT: col %d psid %ld\n",
918		    event->col, rbuf->parityStripeID);
919		Dprintf7("RECON: done read  psid %ld buf %lx  %02x %02x %02x %02x %02x\n",
920		    rbuf->parityStripeID, rbuf->buffer, rbuf->buffer[0] & 0xff, rbuf->buffer[1] & 0xff,
921		    rbuf->buffer[2] & 0xff, rbuf->buffer[3] & 0xff, rbuf->buffer[4] & 0xff);
922		rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
923		if (!raidPtr->reconControl->error) {
924			submitblocked = rf_SubmitReconBuffer(rbuf, 0, 0);
925			Dprintf1("RECON: submitblocked=%d\n", submitblocked);
926			if (!submitblocked)
927				retcode = IssueNextReadRequest(raidPtr, event->col);
928			else
929				retcode = 0;
930		}
931		break;
932
933		/* a write I/O has completed */
934	case RF_REVENT_WRITEDONE:
935#if RF_DEBUG_RECON
936		if (rf_floatingRbufDebug) {
937			rf_CheckFloatingRbufCount(raidPtr, 1);
938		}
939#endif
940		sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU;
941		rbuf = (RF_ReconBuffer_t *) event->arg;
942		rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
943		Dprintf3("RECON: WRITEDONE EVENT: psid %d ru %d (%d %% complete)\n",
944		    rbuf->parityStripeID, rbuf->which_ru, raidPtr->reconControl->percentComplete);
945		rf_ReconMapUpdate(raidPtr, raidPtr->reconControl->reconMap,
946		    rbuf->failedDiskSectorOffset, rbuf->failedDiskSectorOffset + sectorsPerRU - 1);
947		rf_RemoveFromActiveReconTable(raidPtr, rbuf->parityStripeID, rbuf->which_ru);
948
949		RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
950		raidPtr->reconControl->pending_writes--;
951		RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);
952
953		if (rbuf->type == RF_RBUF_TYPE_FLOATING) {
954			RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
955			while(raidPtr->reconControl->rb_lock) {
956				ltsleep(&raidPtr->reconControl->rb_lock, PRIBIO, "reconctrlpre1", 0,
957					&raidPtr->reconControl->rb_mutex);
958			}
959			raidPtr->reconControl->rb_lock = 1;
960			RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);
961
962			raidPtr->numFullReconBuffers--;
963			rf_ReleaseFloatingReconBuffer(raidPtr, rbuf);
964
965			RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
966			raidPtr->reconControl->rb_lock = 0;
967			wakeup(&raidPtr->reconControl->rb_lock);
968			RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);
969		} else
970			if (rbuf->type == RF_RBUF_TYPE_FORCED)
971				rf_FreeReconBuffer(rbuf);
972			else
973				RF_ASSERT(0);
974		retcode = RF_RECON_WRITE_DONE;
975		break;
976
977	case RF_REVENT_BUFCLEAR:	/* A buffer-stall condition has been
978					 * cleared */
979		Dprintf1("RECON: BUFCLEAR EVENT: col %d\n", event->col);
980		if (!raidPtr->reconControl->error) {
981			submitblocked = rf_SubmitReconBuffer(raidPtr->reconControl->perDiskInfo[event->col].rbuf,
982							     0, (int) (long) event->arg);
983			RF_ASSERT(!submitblocked);	/* we wouldn't have gotten the
984							 * BUFCLEAR event if we
985							 * couldn't submit */
986			retcode = IssueNextReadRequest(raidPtr, event->col);
987		}
988		break;
989
990	case RF_REVENT_BLOCKCLEAR:	/* A user-write reconstruction
991					 * blockage has been cleared */
992		DDprintf1("RECON: BLOCKCLEAR EVENT: col %d\n", event->col);
993		if (!raidPtr->reconControl->error) {
994			retcode = TryToRead(raidPtr, event->col);
995		}
996		break;
997
998	case RF_REVENT_HEADSEPCLEAR:	/* A max-head-separation
999					 * reconstruction blockage has been
1000					 * cleared */
1001		Dprintf1("RECON: HEADSEPCLEAR EVENT: col %d\n", event->col);
1002		if (!raidPtr->reconControl->error) {
1003			retcode = TryToRead(raidPtr, event->col);
1004		}
1005		break;
1006
1007		/* a buffer has become ready to write */
1008	case RF_REVENT_BUFREADY:
1009		Dprintf1("RECON: BUFREADY EVENT: col %d\n", event->col);
1010		if (!raidPtr->reconControl->error) {
1011			retcode = IssueNextWriteRequest(raidPtr);
1012#if RF_DEBUG_RECON
1013			if (rf_floatingRbufDebug) {
1014				rf_CheckFloatingRbufCount(raidPtr, 1);
1015			}
1016#endif
1017		}
1018		break;
1019
1020		/* we need to skip the current RU entirely because it got
1021		 * recon'd while we were waiting for something else to happen */
1022	case RF_REVENT_SKIP:
1023		DDprintf1("RECON: SKIP EVENT: col %d\n", event->col);
1024		if (!raidPtr->reconControl->error) {
1025			retcode = IssueNextReadRequest(raidPtr, event->col);
1026		}
1027		break;
1028
1029		/* a forced-reconstruction read access has completed.  Just
1030		 * submit the buffer */
1031	case RF_REVENT_FORCEDREADDONE:
1032		rbuf = (RF_ReconBuffer_t *) event->arg;
1033		rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
1034		DDprintf1("RECON: FORCEDREADDONE EVENT: col %d\n", event->col);
1035		if (!raidPtr->reconControl->error) {
1036			submitblocked = rf_SubmitReconBuffer(rbuf, 1, 0);
1037			RF_ASSERT(!submitblocked);
1038			retcode = 0;
1039		}
1040		break;
1041
1042		/* A read I/O failed to complete */
1043	case RF_REVENT_READ_FAILED:
1044		retcode = RF_RECON_READ_ERROR;
1045		break;
1046
1047		/* A write I/O failed to complete */
1048	case RF_REVENT_WRITE_FAILED:
1049		retcode = RF_RECON_WRITE_ERROR;
1050
1051		rbuf = (RF_ReconBuffer_t *) event->arg;
1052
1053		/* cleanup the disk queue data */
1054		rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
1055
1056		/* At this point we're erroring out, badly, and floatingRbufs
1057		   may not even be valid.  Rather than putting this back onto
1058		   the floatingRbufs list, just arrange for its immediate
1059		   destruction.
1060		*/
1061		rf_FreeReconBuffer(rbuf);
1062		break;
1063
1064		/* a forced read I/O failed to complete */
1065	case RF_REVENT_FORCEDREAD_FAILED:
1066		retcode = RF_RECON_READ_ERROR;
1067		break;
1068
1069	default:
1070		RF_PANIC();
1071	}
1072	rf_FreeReconEventDesc(event);
1073	return (retcode);
1074}
1075/*****************************************************************************
1076 *
1077 * find the next thing that's needed on the indicated disk, and issue
1078 * a read request for it.  We assume that the reconstruction buffer
1079 * associated with this process is free to receive the data.  If
1080 * reconstruction is blocked on the indicated RU, we issue a
1081 * blockage-release request instead of a physical disk read request.
1082 * If the current disk gets too far ahead of the others, we issue a
1083 * head-separation wait request and return.
1084 *
1085 * ctrl->{ru_count, curPSID, diskOffset} and
1086 * rbuf->failedDiskSectorOffset are maintained to point to the unit
1087 * we're currently accessing.  Note that this deviates from the
1088 * standard C idiom of having counters point to the next thing to be
1089 * accessed.  This allows us to easily retry when we're blocked by
1090 * head separation or reconstruction-blockage events.
1091 *
1092 *****************************************************************************/
1093static int
1094IssueNextReadRequest(RF_Raid_t *raidPtr, RF_RowCol_t col)
1095{
1096	RF_PerDiskReconCtrl_t *ctrl = &raidPtr->reconControl->perDiskInfo[col];
1097	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
1098	RF_ReconBuffer_t *rbuf = ctrl->rbuf;
1099	RF_ReconUnitCount_t RUsPerPU = layoutPtr->SUsPerPU / layoutPtr->SUsPerRU;
1100	RF_SectorCount_t sectorsPerRU = layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU;
1101	int     do_new_check = 0, retcode = 0, status;
1102
1103	/* if we are currently the slowest disk, mark that we have to do a new
1104	 * check */
1105	if (ctrl->headSepCounter <= raidPtr->reconControl->minHeadSepCounter)
1106		do_new_check = 1;
1107
1108	while (1) {
1109
1110		ctrl->ru_count++;
1111		if (ctrl->ru_count < RUsPerPU) {
1112			ctrl->diskOffset += sectorsPerRU;
1113			rbuf->failedDiskSectorOffset += sectorsPerRU;
1114		} else {
1115			ctrl->curPSID++;
1116			ctrl->ru_count = 0;
1117			/* code left over from when head-sep was based on
1118			 * parity stripe id */
1119			if (ctrl->curPSID >= raidPtr->reconControl->lastPSID) {
1120				CheckForNewMinHeadSep(raidPtr, ++(ctrl->headSepCounter));
1121				return (RF_RECON_DONE_READS);	/* finito! */
1122			}
1123			/* find the disk offsets of the start of the parity
1124			 * stripe on both the current disk and the failed
1125			 * disk. skip this entire parity stripe if either disk
1126			 * does not appear in the indicated PS */
1127			status = ComputePSDiskOffsets(raidPtr, ctrl->curPSID, col, &ctrl->diskOffset, &rbuf->failedDiskSectorOffset,
1128			    &rbuf->spCol, &rbuf->spOffset);
1129			if (status) {
1130				ctrl->ru_count = RUsPerPU - 1;
1131				continue;
1132			}
1133		}
1134		rbuf->which_ru = ctrl->ru_count;
1135
1136		/* skip this RU if it's already been reconstructed */
1137		if (rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, rbuf->failedDiskSectorOffset)) {
1138			Dprintf2("Skipping psid %ld ru %d: already reconstructed\n", ctrl->curPSID, ctrl->ru_count);
1139			continue;
1140		}
1141		break;
1142	}
1143	ctrl->headSepCounter++;
1144	if (do_new_check)
1145		CheckForNewMinHeadSep(raidPtr, ctrl->headSepCounter);	/* update min if needed */
1146
1147
1148	/* at this point, we have definitely decided what to do, and we have
1149	 * only to see if we can actually do it now */
1150	rbuf->parityStripeID = ctrl->curPSID;
1151	rbuf->which_ru = ctrl->ru_count;
1152#if RF_ACC_TRACE > 0
1153	memset((char *) &raidPtr->recon_tracerecs[col], 0,
1154	    sizeof(raidPtr->recon_tracerecs[col]));
1155	raidPtr->recon_tracerecs[col].reconacc = 1;
1156	RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer);
1157#endif
1158	retcode = TryToRead(raidPtr, col);
1159	return (retcode);
1160}
1161
1162/*
1163 * tries to issue the next read on the indicated disk.  We may be
1164 * blocked by (a) the heads being too far apart, or (b) recon on the
1165 * indicated RU being blocked due to a write by a user thread.  In
1166 * this case, we issue a head-sep or blockage wait request, which will
1167 * cause this same routine to be invoked again later when the blockage
1168 * has cleared.
1169 */
1170
1171static int
1172TryToRead(RF_Raid_t *raidPtr, RF_RowCol_t col)
1173{
1174	RF_PerDiskReconCtrl_t *ctrl = &raidPtr->reconControl->perDiskInfo[col];
1175	RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU;
1176	RF_StripeNum_t psid = ctrl->curPSID;
1177	RF_ReconUnitNum_t which_ru = ctrl->ru_count;
1178	RF_DiskQueueData_t *req;
1179	int     status;
1180	RF_ReconParityStripeStatus_t *pssPtr, *newpssPtr;
1181
1182	/* if the current disk is too far ahead of the others, issue a
1183	 * head-separation wait and return */
1184	if (CheckHeadSeparation(raidPtr, ctrl, col, ctrl->headSepCounter, which_ru))
1185		return (0);
1186
1187	/* allocate a new PSS in case we need it */
1188	newpssPtr = rf_AllocPSStatus(raidPtr);
1189
1190	RF_LOCK_PSS_MUTEX(raidPtr, psid);
1191	pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_CREATE, newpssPtr);
1192
1193	if (pssPtr != newpssPtr) {
1194		rf_FreePSStatus(raidPtr, newpssPtr);
1195	}
1196
1197	/* if recon is blocked on the indicated parity stripe, issue a
1198	 * block-wait request and return. this also must mark the indicated RU
1199	 * in the stripe as under reconstruction if not blocked. */
1200	status = CheckForcedOrBlockedReconstruction(raidPtr, pssPtr, ctrl, col, psid, which_ru);
1201	if (status == RF_PSS_RECON_BLOCKED) {
1202		Dprintf2("RECON: Stalling psid %ld ru %d: recon blocked\n", psid, which_ru);
1203		goto out;
1204	} else
1205		if (status == RF_PSS_FORCED_ON_WRITE) {
1206			rf_CauseReconEvent(raidPtr, col, NULL, RF_REVENT_SKIP);
1207			goto out;
1208		}
1209	/* make one last check to be sure that the indicated RU didn't get
1210	 * reconstructed while we were waiting for something else to happen.
1211	 * This is unfortunate in that it causes us to make this check twice
1212	 * in the normal case.  Might want to make some attempt to re-work
1213	 * this so that we only do this check if we've definitely blocked on
1214	 * one of the above checks.  When this condition is detected, we may
1215	 * have just created a bogus status entry, which we need to delete. */
1216	if (rf_CheckRUReconstructed(raidPtr->reconControl->reconMap, ctrl->rbuf->failedDiskSectorOffset)) {
1217		Dprintf2("RECON: Skipping psid %ld ru %d: prior recon after stall\n", psid, which_ru);
1218		if (pssPtr == newpssPtr)
1219			rf_PSStatusDelete(raidPtr, raidPtr->reconControl->pssTable, pssPtr);
1220		rf_CauseReconEvent(raidPtr, col, NULL, RF_REVENT_SKIP);
1221		goto out;
1222	}
1223	/* found something to read.  issue the I/O */
1224	Dprintf4("RECON: Read for psid %ld on col %d offset %ld buf %lx\n",
1225	    psid, col, ctrl->diskOffset, ctrl->rbuf->buffer);
1226#if RF_ACC_TRACE > 0
1227	RF_ETIMER_STOP(raidPtr->recon_tracerecs[col].recon_timer);
1228	RF_ETIMER_EVAL(raidPtr->recon_tracerecs[col].recon_timer);
1229	raidPtr->recon_tracerecs[col].specific.recon.recon_start_to_fetch_us =
1230	    RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[col].recon_timer);
1231	RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer);
1232#endif
1233	/* should be ok to use a NULL proc pointer here, all the bufs we use
1234	 * should be in kernel space */
1235	req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, ctrl->diskOffset, sectorsPerRU, ctrl->rbuf->buffer, psid, which_ru,
1236	    ReconReadDoneProc, (void *) ctrl,
1237#if RF_ACC_TRACE > 0
1238				     &raidPtr->recon_tracerecs[col],
1239#else
1240				     NULL,
1241#endif
1242				     (void *) raidPtr, 0, NULL, PR_WAITOK);
1243
1244	ctrl->rbuf->arg = (void *) req;
1245	rf_DiskIOEnqueue(&raidPtr->Queues[col], req, RF_IO_RECON_PRIORITY);
1246	pssPtr->issued[col] = 1;
1247
1248out:
1249	RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
1250	return (0);
1251}
1252
1253
1254/*
1255 * given a parity stripe ID, we want to find out whether both the
1256 * current disk and the failed disk exist in that parity stripe.  If
1257 * not, we want to skip this whole PS.  If so, we want to find the
1258 * disk offset of the start of the PS on both the current disk and the
1259 * failed disk.
1260 *
1261 * this works by getting a list of disks comprising the indicated
1262 * parity stripe, and searching the list for the current and failed
1263 * disks.  Once we've decided they both exist in the parity stripe, we
1264 * need to decide whether each is data or parity, so that we'll know
1265 * which mapping function to call to get the corresponding disk
1266 * offsets.
1267 *
1268 * this is kind of unpleasant, but doing it this way allows the
1269 * reconstruction code to use parity stripe IDs rather than physical
1270 * disks address to march through the failed disk, which greatly
1271 * simplifies a lot of code, as well as eliminating the need for a
1272 * reverse-mapping function.  I also think it will execute faster,
1273 * since the calls to the mapping module are kept to a minimum.
1274 *
1275 * ASSUMES THAT THE STRIPE IDENTIFIER IDENTIFIES THE DISKS COMPRISING
1276 * THE STRIPE IN THE CORRECT ORDER
1277 *
1278 * raidPtr          - raid descriptor
1279 * psid             - parity stripe identifier
1280 * col              - column of disk to find the offsets for
1281 * spCol            - out: col of spare unit for failed unit
1282 * spOffset         - out: offset into disk containing spare unit
1283 *
1284 */
1285
1286
1287static int
1288ComputePSDiskOffsets(RF_Raid_t *raidPtr, RF_StripeNum_t psid,
1289		     RF_RowCol_t col, RF_SectorNum_t *outDiskOffset,
1290		     RF_SectorNum_t *outFailedDiskSectorOffset,
1291		     RF_RowCol_t *spCol, RF_SectorNum_t *spOffset)
1292{
1293	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
1294	RF_RowCol_t fcol = raidPtr->reconControl->fcol;
1295	RF_RaidAddr_t sosRaidAddress;	/* start-of-stripe */
1296	RF_RowCol_t *diskids;
1297	u_int   i, j, k, i_offset, j_offset;
1298	RF_RowCol_t pcol;
1299	int     testcol;
1300	RF_SectorNum_t poffset;
1301	char    i_is_parity = 0, j_is_parity = 0;
1302	RF_RowCol_t stripeWidth = layoutPtr->numDataCol + layoutPtr->numParityCol;
1303
1304	/* get a listing of the disks comprising that stripe */
1305	sosRaidAddress = rf_ParityStripeIDToRaidAddress(layoutPtr, psid);
1306	(layoutPtr->map->IdentifyStripe) (raidPtr, sosRaidAddress, &diskids);
1307	RF_ASSERT(diskids);
1308
1309	/* reject this entire parity stripe if it does not contain the
1310	 * indicated disk or it does not contain the failed disk */
1311
1312	for (i = 0; i < stripeWidth; i++) {
1313		if (col == diskids[i])
1314			break;
1315	}
1316	if (i == stripeWidth)
1317		goto skipit;
1318	for (j = 0; j < stripeWidth; j++) {
1319		if (fcol == diskids[j])
1320			break;
1321	}
1322	if (j == stripeWidth) {
1323		goto skipit;
1324	}
1325	/* find out which disk the parity is on */
1326	(layoutPtr->map->MapParity) (raidPtr, sosRaidAddress, &pcol, &poffset, RF_DONT_REMAP);
1327
1328	/* find out if either the current RU or the failed RU is parity */
1329	/* also, if the parity occurs in this stripe prior to the data and/or
1330	 * failed col, we need to decrement i and/or j */
1331	for (k = 0; k < stripeWidth; k++)
1332		if (diskids[k] == pcol)
1333			break;
1334	RF_ASSERT(k < stripeWidth);
1335	i_offset = i;
1336	j_offset = j;
1337	if (k < i)
1338		i_offset--;
1339	else
1340		if (k == i) {
1341			i_is_parity = 1;
1342			i_offset = 0;
1343		}		/* set offsets to zero to disable multiply
1344				 * below */
1345	if (k < j)
1346		j_offset--;
1347	else
1348		if (k == j) {
1349			j_is_parity = 1;
1350			j_offset = 0;
1351		}
1352	/* at this point, [ij]_is_parity tells us whether the [current,failed]
1353	 * disk is parity at the start of this RU, and, if data, "[ij]_offset"
1354	 * tells us how far into the stripe the [current,failed] disk is. */
1355
1356	/* call the mapping routine to get the offset into the current disk,
1357	 * repeat for failed disk. */
1358	if (i_is_parity)
1359		layoutPtr->map->MapParity(raidPtr, sosRaidAddress + i_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outDiskOffset, RF_DONT_REMAP);
1360	else
1361		layoutPtr->map->MapSector(raidPtr, sosRaidAddress + i_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outDiskOffset, RF_DONT_REMAP);
1362
1363	RF_ASSERT(col == testcol);
1364
1365	if (j_is_parity)
1366		layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outFailedDiskSectorOffset, RF_DONT_REMAP);
1367	else
1368		layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, &testcol, outFailedDiskSectorOffset, RF_DONT_REMAP);
1369	RF_ASSERT(fcol == testcol);
1370
1371	/* now locate the spare unit for the failed unit */
1372#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
1373	if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) {
1374		if (j_is_parity)
1375			layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, spCol, spOffset, RF_REMAP);
1376		else
1377			layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, spCol, spOffset, RF_REMAP);
1378	} else {
1379#endif
1380		*spCol = raidPtr->reconControl->spareCol;
1381		*spOffset = *outFailedDiskSectorOffset;
1382#if RF_INCLUDE_PARITY_DECLUSTERING_DS > 0
1383	}
1384#endif
1385	return (0);
1386
1387skipit:
1388	Dprintf2("RECON: Skipping psid %ld: nothing needed from c%d\n",
1389	    psid, col);
1390	return (1);
1391}
1392/* this is called when a buffer has become ready to write to the replacement disk */
1393static int
1394IssueNextWriteRequest(RF_Raid_t *raidPtr)
1395{
1396	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
1397	RF_SectorCount_t sectorsPerRU = layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU;
1398#if RF_ACC_TRACE > 0
1399	RF_RowCol_t fcol = raidPtr->reconControl->fcol;
1400#endif
1401	RF_ReconBuffer_t *rbuf;
1402	RF_DiskQueueData_t *req;
1403
1404	rbuf = rf_GetFullReconBuffer(raidPtr->reconControl);
1405	RF_ASSERT(rbuf);	/* there must be one available, or we wouldn't
1406				 * have gotten the event that sent us here */
1407	RF_ASSERT(rbuf->pssPtr);
1408
1409	rbuf->pssPtr->writeRbuf = rbuf;
1410	rbuf->pssPtr = NULL;
1411
1412	Dprintf6("RECON: New write (c %d offs %d) for psid %ld ru %d (failed disk offset %ld) buf %lx\n",
1413	    rbuf->spCol, rbuf->spOffset, rbuf->parityStripeID,
1414	    rbuf->which_ru, rbuf->failedDiskSectorOffset, rbuf->buffer);
1415	Dprintf6("RECON: new write psid %ld   %02x %02x %02x %02x %02x\n",
1416	    rbuf->parityStripeID, rbuf->buffer[0] & 0xff, rbuf->buffer[1] & 0xff,
1417	    rbuf->buffer[2] & 0xff, rbuf->buffer[3] & 0xff, rbuf->buffer[4] & 0xff);
1418
1419	/* should be ok to use a NULL b_proc here b/c all addrs should be in
1420	 * kernel space */
1421	req = rf_CreateDiskQueueData(RF_IO_TYPE_WRITE, rbuf->spOffset,
1422	    sectorsPerRU, rbuf->buffer,
1423	    rbuf->parityStripeID, rbuf->which_ru,
1424	    ReconWriteDoneProc, (void *) rbuf,
1425#if RF_ACC_TRACE > 0
1426	    &raidPtr->recon_tracerecs[fcol],
1427#else
1428				     NULL,
1429#endif
1430	    (void *) raidPtr, 0, NULL, PR_WAITOK);
1431
1432	rbuf->arg = (void *) req;
1433	RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
1434	raidPtr->reconControl->pending_writes++;
1435	RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);
1436	rf_DiskIOEnqueue(&raidPtr->Queues[rbuf->spCol], req, RF_IO_RECON_PRIORITY);
1437
1438	return (0);
1439}
1440
1441/*
1442 * this gets called upon the completion of a reconstruction read
1443 * operation the arg is a pointer to the per-disk reconstruction
1444 * control structure for the process that just finished a read.
1445 *
1446 * called at interrupt context in the kernel, so don't do anything
1447 * illegal here.
1448 */
1449static int
1450ReconReadDoneProc(void *arg, int status)
1451{
1452	RF_PerDiskReconCtrl_t *ctrl = (RF_PerDiskReconCtrl_t *) arg;
1453	RF_Raid_t *raidPtr;
1454
1455	/* Detect that reconCtrl is no longer valid, and if that
1456	   is the case, bail without calling rf_CauseReconEvent().
1457	   There won't be anyone listening for this event anyway */
1458
1459	if (ctrl->reconCtrl == NULL)
1460		return(0);
1461
1462	raidPtr = ctrl->reconCtrl->reconDesc->raidPtr;
1463
1464	if (status) {
1465		printf("raid%d: Recon read failed: %d\n", raidPtr->raidid, status);
1466		rf_CauseReconEvent(raidPtr, ctrl->col, NULL, RF_REVENT_READ_FAILED);
1467		return(0);
1468	}
1469#if RF_ACC_TRACE > 0
1470	RF_ETIMER_STOP(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
1471	RF_ETIMER_EVAL(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
1472	raidPtr->recon_tracerecs[ctrl->col].specific.recon.recon_fetch_to_return_us =
1473	    RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
1474	RF_ETIMER_START(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
1475#endif
1476	rf_CauseReconEvent(raidPtr, ctrl->col, NULL, RF_REVENT_READDONE);
1477	return (0);
1478}
1479/* this gets called upon the completion of a reconstruction write operation.
1480 * the arg is a pointer to the rbuf that was just written
1481 *
1482 * called at interrupt context in the kernel, so don't do anything illegal here.
1483 */
1484static int
1485ReconWriteDoneProc(void *arg, int status)
1486{
1487	RF_ReconBuffer_t *rbuf = (RF_ReconBuffer_t *) arg;
1488
1489	/* Detect that reconControl is no longer valid, and if that
1490	   is the case, bail without calling rf_CauseReconEvent().
1491	   There won't be anyone listening for this event anyway */
1492
1493	if (rbuf->raidPtr->reconControl == NULL)
1494		return(0);
1495
1496	Dprintf2("Reconstruction completed on psid %ld ru %d\n", rbuf->parityStripeID, rbuf->which_ru);
1497	if (status) {
1498		printf("raid%d: Recon write failed!\n", rbuf->raidPtr->raidid);
1499		rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, arg, RF_REVENT_WRITE_FAILED);
1500		return(0);
1501	}
1502	rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, arg, RF_REVENT_WRITEDONE);
1503	return (0);
1504}
1505
1506
1507/*
1508 * computes a new minimum head sep, and wakes up anyone who needs to
1509 * be woken as a result
1510 */
1511static void
1512CheckForNewMinHeadSep(RF_Raid_t *raidPtr, RF_HeadSepLimit_t hsCtr)
1513{
1514	RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl;
1515	RF_HeadSepLimit_t new_min;
1516	RF_RowCol_t i;
1517	RF_CallbackDesc_t *p;
1518	RF_ASSERT(hsCtr >= reconCtrlPtr->minHeadSepCounter);	/* from the definition
1519								 * of a minimum */
1520
1521
1522	RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
1523	while(reconCtrlPtr->rb_lock) {
1524		ltsleep(&reconCtrlPtr->rb_lock, PRIBIO, "reconctlcnmhs", 0, &reconCtrlPtr->rb_mutex);
1525	}
1526	reconCtrlPtr->rb_lock = 1;
1527	RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
1528
1529	new_min = ~(1L << (8 * sizeof(long) - 1));	/* 0x7FFF....FFF */
1530	for (i = 0; i < raidPtr->numCol; i++)
1531		if (i != reconCtrlPtr->fcol) {
1532			if (reconCtrlPtr->perDiskInfo[i].headSepCounter < new_min)
1533				new_min = reconCtrlPtr->perDiskInfo[i].headSepCounter;
1534		}
1535	/* set the new minimum and wake up anyone who can now run again */
1536	if (new_min != reconCtrlPtr->minHeadSepCounter) {
1537		reconCtrlPtr->minHeadSepCounter = new_min;
1538		Dprintf1("RECON:  new min head pos counter val is %ld\n", new_min);
1539		while (reconCtrlPtr->headSepCBList) {
1540			if (reconCtrlPtr->headSepCBList->callbackArg.v > new_min)
1541				break;
1542			p = reconCtrlPtr->headSepCBList;
1543			reconCtrlPtr->headSepCBList = p->next;
1544			p->next = NULL;
1545			rf_CauseReconEvent(raidPtr, p->col, NULL, RF_REVENT_HEADSEPCLEAR);
1546			rf_FreeCallbackDesc(p);
1547		}
1548
1549	}
1550	RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
1551	reconCtrlPtr->rb_lock = 0;
1552	wakeup(&reconCtrlPtr->rb_lock);
1553	RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
1554}
1555
1556/*
1557 * checks to see that the maximum head separation will not be violated
1558 * if we initiate a reconstruction I/O on the indicated disk.
1559 * Limiting the maximum head separation between two disks eliminates
1560 * the nasty buffer-stall conditions that occur when one disk races
1561 * ahead of the others and consumes all of the floating recon buffers.
1562 * This code is complex and unpleasant but it's necessary to avoid
1563 * some very nasty, albeit fairly rare, reconstruction behavior.
1564 *
1565 * returns non-zero if and only if we have to stop working on the
1566 * indicated disk due to a head-separation delay.
1567 */
1568static int
1569CheckHeadSeparation(RF_Raid_t *raidPtr, RF_PerDiskReconCtrl_t *ctrl,
1570		    RF_RowCol_t col, RF_HeadSepLimit_t hsCtr,
1571		    RF_ReconUnitNum_t which_ru)
1572{
1573	RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl;
1574	RF_CallbackDesc_t *cb, *p, *pt;
1575	int     retval = 0;
1576
1577	/* if we're too far ahead of the slowest disk, stop working on this
1578	 * disk until the slower ones catch up.  We do this by scheduling a
1579	 * wakeup callback for the time when the slowest disk has caught up.
1580	 * We define "caught up" with 20% hysteresis, i.e. the head separation
1581	 * must have fallen to at most 80% of the max allowable head
1582	 * separation before we'll wake up.
1583	 *
1584	 */
1585	RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
1586	while(reconCtrlPtr->rb_lock) {
1587		ltsleep(&reconCtrlPtr->rb_lock, PRIBIO, "reconctlchs", 0, &reconCtrlPtr->rb_mutex);
1588	}
1589	reconCtrlPtr->rb_lock = 1;
1590	RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
1591	if ((raidPtr->headSepLimit >= 0) &&
1592	    ((ctrl->headSepCounter - reconCtrlPtr->minHeadSepCounter) > raidPtr->headSepLimit)) {
1593		Dprintf5("raid%d: RECON: head sep stall: col %d hsCtr %ld minHSCtr %ld limit %ld\n",
1594			 raidPtr->raidid, col, ctrl->headSepCounter,
1595			 reconCtrlPtr->minHeadSepCounter,
1596			 raidPtr->headSepLimit);
1597		cb = rf_AllocCallbackDesc();
1598		/* the minHeadSepCounter value we have to get to before we'll
1599		 * wake up.  build in 20% hysteresis. */
1600		cb->callbackArg.v = (ctrl->headSepCounter - raidPtr->headSepLimit + raidPtr->headSepLimit / 5);
1601		cb->col = col;
1602		cb->next = NULL;
1603
1604		/* insert this callback descriptor into the sorted list of
1605		 * pending head-sep callbacks */
1606		p = reconCtrlPtr->headSepCBList;
1607		if (!p)
1608			reconCtrlPtr->headSepCBList = cb;
1609		else
1610			if (cb->callbackArg.v < p->callbackArg.v) {
1611				cb->next = reconCtrlPtr->headSepCBList;
1612				reconCtrlPtr->headSepCBList = cb;
1613			} else {
1614				for (pt = p, p = p->next; p && (p->callbackArg.v < cb->callbackArg.v); pt = p, p = p->next);
1615				cb->next = p;
1616				pt->next = cb;
1617			}
1618		retval = 1;
1619#if RF_RECON_STATS > 0
1620		ctrl->reconCtrl->reconDesc->hsStallCount++;
1621#endif				/* RF_RECON_STATS > 0 */
1622	}
1623	RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
1624	reconCtrlPtr->rb_lock = 0;
1625	wakeup(&reconCtrlPtr->rb_lock);
1626	RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
1627
1628	return (retval);
1629}
1630/*
1631 * checks to see if reconstruction has been either forced or blocked
1632 * by a user operation.  if forced, we skip this RU entirely.  else if
1633 * blocked, put ourselves on the wait list.  else return 0.
1634 *
1635 * ASSUMES THE PSS MUTEX IS LOCKED UPON ENTRY
1636 */
1637static int
1638CheckForcedOrBlockedReconstruction(RF_Raid_t *raidPtr,
1639				   RF_ReconParityStripeStatus_t *pssPtr,
1640				   RF_PerDiskReconCtrl_t *ctrl,
1641				   RF_RowCol_t col,
1642				   RF_StripeNum_t psid,
1643				   RF_ReconUnitNum_t which_ru)
1644{
1645	RF_CallbackDesc_t *cb;
1646	int     retcode = 0;
1647
1648	if ((pssPtr->flags & RF_PSS_FORCED_ON_READ) || (pssPtr->flags & RF_PSS_FORCED_ON_WRITE))
1649		retcode = RF_PSS_FORCED_ON_WRITE;
1650	else
1651		if (pssPtr->flags & RF_PSS_RECON_BLOCKED) {
1652			Dprintf3("RECON: col %d blocked at psid %ld ru %d\n", col, psid, which_ru);
1653			cb = rf_AllocCallbackDesc();	/* append ourselves to
1654							 * the blockage-wait
1655							 * list */
1656			cb->col = col;
1657			cb->next = pssPtr->blockWaitList;
1658			pssPtr->blockWaitList = cb;
1659			retcode = RF_PSS_RECON_BLOCKED;
1660		}
1661	if (!retcode)
1662		pssPtr->flags |= RF_PSS_UNDER_RECON;	/* mark this RU as under
1663							 * reconstruction */
1664
1665	return (retcode);
1666}
1667/*
1668 * if reconstruction is currently ongoing for the indicated stripeID,
1669 * reconstruction is forced to completion and we return non-zero to
1670 * indicate that the caller must wait.  If not, then reconstruction is
1671 * blocked on the indicated stripe and the routine returns zero.  If
1672 * and only if we return non-zero, we'll cause the cbFunc to get
1673 * invoked with the cbArg when the reconstruction has completed.
1674 */
1675int
1676rf_ForceOrBlockRecon(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
1677		     void (*cbFunc)(RF_Raid_t *, void *), void *cbArg)
1678{
1679	RF_StripeNum_t stripeID = asmap->stripeID;	/* the stripe ID we're
1680							 * forcing recon on */
1681	RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU;	/* num sects in one RU */
1682	RF_ReconParityStripeStatus_t *pssPtr, *newpssPtr;	/* a pointer to the parity
1683						 * stripe status structure */
1684	RF_StripeNum_t psid;	/* parity stripe id */
1685	RF_SectorNum_t offset, fd_offset;	/* disk offset, failed-disk
1686						 * offset */
1687	RF_RowCol_t *diskids;
1688	RF_ReconUnitNum_t which_ru;	/* RU within parity stripe */
1689	RF_RowCol_t fcol, diskno, i;
1690	RF_ReconBuffer_t *new_rbuf;	/* ptr to newly allocated rbufs */
1691	RF_DiskQueueData_t *req;/* disk I/O req to be enqueued */
1692	RF_CallbackDesc_t *cb;
1693	int     nPromoted;
1694
1695	psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID, &which_ru);
1696
1697	/* allocate a new PSS in case we need it */
1698        newpssPtr = rf_AllocPSStatus(raidPtr);
1699
1700	RF_LOCK_PSS_MUTEX(raidPtr, psid);
1701
1702	pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_CREATE | RF_PSS_RECON_BLOCKED, newpssPtr);
1703
1704        if (pssPtr != newpssPtr) {
1705                rf_FreePSStatus(raidPtr, newpssPtr);
1706        }
1707
1708	/* if recon is not ongoing on this PS, just return */
1709	if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) {
1710		RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
1711		return (0);
1712	}
1713	/* otherwise, we have to wait for reconstruction to complete on this
1714	 * RU. */
1715	/* In order to avoid waiting for a potentially large number of
1716	 * low-priority accesses to complete, we force a normal-priority (i.e.
1717	 * not low-priority) reconstruction on this RU. */
1718	if (!(pssPtr->flags & RF_PSS_FORCED_ON_WRITE) && !(pssPtr->flags & RF_PSS_FORCED_ON_READ)) {
1719		DDprintf1("Forcing recon on psid %ld\n", psid);
1720		pssPtr->flags |= RF_PSS_FORCED_ON_WRITE;	/* mark this RU as under
1721								 * forced recon */
1722		pssPtr->flags &= ~RF_PSS_RECON_BLOCKED;	/* clear the blockage
1723							 * that we just set */
1724		fcol = raidPtr->reconControl->fcol;
1725
1726		/* get a listing of the disks comprising the indicated stripe */
1727		(raidPtr->Layout.map->IdentifyStripe) (raidPtr, asmap->raidAddress, &diskids);
1728
1729		/* For previously issued reads, elevate them to normal
1730		 * priority.  If the I/O has already completed, it won't be
1731		 * found in the queue, and hence this will be a no-op. For
1732		 * unissued reads, allocate buffers and issue new reads.  The
1733		 * fact that we've set the FORCED bit means that the regular
1734		 * recon procs will not re-issue these reqs */
1735		for (i = 0; i < raidPtr->Layout.numDataCol + raidPtr->Layout.numParityCol; i++)
1736			if ((diskno = diskids[i]) != fcol) {
1737				if (pssPtr->issued[diskno]) {
1738					nPromoted = rf_DiskIOPromote(&raidPtr->Queues[diskno], psid, which_ru);
1739					if (rf_reconDebug && nPromoted)
1740						printf("raid%d: promoted read from col %d\n", raidPtr->raidid, diskno);
1741				} else {
1742					new_rbuf = rf_MakeReconBuffer(raidPtr, diskno, RF_RBUF_TYPE_FORCED);	/* create new buf */
1743					ComputePSDiskOffsets(raidPtr, psid, diskno, &offset, &fd_offset,
1744					    &new_rbuf->spCol, &new_rbuf->spOffset);	/* find offsets & spare
1745													 * location */
1746					new_rbuf->parityStripeID = psid;	/* fill in the buffer */
1747					new_rbuf->which_ru = which_ru;
1748					new_rbuf->failedDiskSectorOffset = fd_offset;
1749					new_rbuf->priority = RF_IO_NORMAL_PRIORITY;
1750
1751					/* use NULL b_proc b/c all addrs
1752					 * should be in kernel space */
1753					req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, offset + which_ru * sectorsPerRU, sectorsPerRU, new_rbuf->buffer,
1754					    psid, which_ru, (int (*) (void *, int)) ForceReconReadDoneProc, (void *) new_rbuf,
1755					    NULL, (void *) raidPtr, 0, NULL, PR_WAITOK);
1756
1757					new_rbuf->arg = req;
1758					rf_DiskIOEnqueue(&raidPtr->Queues[diskno], req, RF_IO_NORMAL_PRIORITY);	/* enqueue the I/O */
1759					Dprintf2("raid%d: Issued new read req on col %d\n", raidPtr->raidid, diskno);
1760				}
1761			}
1762		/* if the write is sitting in the disk queue, elevate its
1763		 * priority */
1764		if (rf_DiskIOPromote(&raidPtr->Queues[fcol], psid, which_ru))
1765			if (rf_reconDebug)
1766				printf("raid%d: promoted write to col %d\n",
1767				       raidPtr->raidid, fcol);
1768	}
1769	/* install a callback descriptor to be invoked when recon completes on
1770	 * this parity stripe. */
1771	cb = rf_AllocCallbackDesc();
1772	/* XXX the following is bogus.. These functions don't really match!!
1773	 * GO */
1774	cb->callbackFunc = (void (*) (RF_CBParam_t)) cbFunc;
1775	cb->callbackArg.p = (void *) cbArg;
1776	cb->next = pssPtr->procWaitList;
1777	pssPtr->procWaitList = cb;
1778	DDprintf2("raid%d: Waiting for forced recon on psid %ld\n",
1779		  raidPtr->raidid, psid);
1780
1781	RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
1782	return (1);
1783}
1784/* called upon the completion of a forced reconstruction read.
1785 * all we do is schedule the FORCEDREADONE event.
1786 * called at interrupt context in the kernel, so don't do anything illegal here.
1787 */
1788static void
1789ForceReconReadDoneProc(void *arg, int status)
1790{
1791	RF_ReconBuffer_t *rbuf = arg;
1792
1793	/* Detect that reconControl is no longer valid, and if that
1794	   is the case, bail without calling rf_CauseReconEvent().
1795	   There won't be anyone listening for this event anyway */
1796
1797	if (rbuf->raidPtr->reconControl == NULL)
1798		return;
1799
1800	if (status) {
1801		printf("raid%d: Forced recon read failed!\n", rbuf->raidPtr->raidid);
1802		rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, (void *) rbuf, RF_REVENT_FORCEDREAD_FAILED);
1803		return;
1804	}
1805	rf_CauseReconEvent(rbuf->raidPtr, rbuf->col, (void *) rbuf, RF_REVENT_FORCEDREADDONE);
1806}
1807/* releases a block on the reconstruction of the indicated stripe */
1808int
1809rf_UnblockRecon(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap)
1810{
1811	RF_StripeNum_t stripeID = asmap->stripeID;
1812	RF_ReconParityStripeStatus_t *pssPtr;
1813	RF_ReconUnitNum_t which_ru;
1814	RF_StripeNum_t psid;
1815	RF_CallbackDesc_t *cb;
1816
1817	psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID, &which_ru);
1818	RF_LOCK_PSS_MUTEX(raidPtr, psid);
1819	pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl->pssTable, psid, which_ru, RF_PSS_NONE, NULL);
1820
1821	/* When recon is forced, the pss desc can get deleted before we get
1822	 * back to unblock recon. But, this can _only_ happen when recon is
1823	 * forced. It would be good to put some kind of sanity check here, but
1824	 * how to decide if recon was just forced or not? */
1825	if (!pssPtr) {
1826		/* printf("Warning: no pss descriptor upon unblock on psid %ld
1827		 * RU %d\n",psid,which_ru); */
1828#if (RF_DEBUG_RECON > 0) || (RF_DEBUG_PSS > 0)
1829		if (rf_reconDebug || rf_pssDebug)
1830			printf("Warning: no pss descriptor upon unblock on psid %ld RU %d\n", (long) psid, which_ru);
1831#endif
1832		goto out;
1833	}
1834	pssPtr->blockCount--;
1835	Dprintf3("raid%d: unblocking recon on psid %ld: blockcount is %d\n",
1836		 raidPtr->raidid, psid, pssPtr->blockCount);
1837	if (pssPtr->blockCount == 0) {	/* if recon blockage has been released */
1838
1839		/* unblock recon before calling CauseReconEvent in case
1840		 * CauseReconEvent causes us to try to issue a new read before
1841		 * returning here. */
1842		pssPtr->flags &= ~RF_PSS_RECON_BLOCKED;
1843
1844
1845		while (pssPtr->blockWaitList) {
1846			/* spin through the block-wait list and
1847			   release all the waiters */
1848			cb = pssPtr->blockWaitList;
1849			pssPtr->blockWaitList = cb->next;
1850			cb->next = NULL;
1851			rf_CauseReconEvent(raidPtr, cb->col, NULL, RF_REVENT_BLOCKCLEAR);
1852			rf_FreeCallbackDesc(cb);
1853		}
1854		if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) {
1855			/* if no recon was requested while recon was blocked */
1856			rf_PSStatusDelete(raidPtr, raidPtr->reconControl->pssTable, pssPtr);
1857		}
1858	}
1859out:
1860	RF_UNLOCK_PSS_MUTEX(raidPtr, psid);
1861	return (0);
1862}
1863
1864void
1865rf_WakeupHeadSepCBWaiters(RF_Raid_t *raidPtr)
1866{
1867	RF_CallbackDesc_t *p;
1868
1869	RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
1870	while(raidPtr->reconControl->rb_lock) {
1871		ltsleep(&raidPtr->reconControl->rb_lock, PRIBIO,
1872			"rf_wakeuphscbw", 0, &raidPtr->reconControl->rb_mutex);
1873	}
1874
1875	raidPtr->reconControl->rb_lock = 1;
1876	RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);
1877
1878	while (raidPtr->reconControl->headSepCBList) {
1879		p = raidPtr->reconControl->headSepCBList;
1880		raidPtr->reconControl->headSepCBList = p->next;
1881		p->next = NULL;
1882		rf_CauseReconEvent(raidPtr, p->col, NULL, RF_REVENT_HEADSEPCLEAR);
1883		rf_FreeCallbackDesc(p);
1884	}
1885	RF_LOCK_MUTEX(raidPtr->reconControl->rb_mutex);
1886	raidPtr->reconControl->rb_lock = 0;
1887	wakeup(&raidPtr->reconControl->rb_lock);
1888	RF_UNLOCK_MUTEX(raidPtr->reconControl->rb_mutex);
1889
1890}
1891
1892