1/*	$NetBSD: rf_copyback.c,v 1.48 2011/08/03 14:44:38 oster Exp $	*/
2/*
3 * Copyright (c) 1995 Carnegie-Mellon University.
4 * All rights reserved.
5 *
6 * Author: Mark Holland
7 *
8 * Permission to use, copy, modify and distribute this software and
9 * its documentation is hereby granted, provided that both the copyright
10 * notice and this permission notice appear in all copies of the
11 * software, derivative works or modified versions, and any portions
12 * thereof, and that both notices appear in supporting documentation.
13 *
14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17 *
18 * Carnegie Mellon requests users of this software to return to
19 *
20 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
21 *  School of Computer Science
22 *  Carnegie Mellon University
23 *  Pittsburgh PA 15213-3890
24 *
25 * any improvements or extensions that they make and grant Carnegie the
26 * rights to redistribute these changes.
27 */
28
29/*****************************************************************************
30 *
31 * copyback.c -- code to copy reconstructed data back from spare space to
32 *               the replaced disk.
33 *
34 * the code operates using callbacks on the I/Os to continue with the
35 * next unit to be copied back.  We do this because a simple loop
36 * containing blocking I/Os will not work in the simulator.
37 *
38 ****************************************************************************/
39
40#include <sys/cdefs.h>
41__KERNEL_RCSID(0, "$NetBSD: rf_copyback.c,v 1.48 2011/08/03 14:44:38 oster Exp $");
42
43#include <dev/raidframe/raidframevar.h>
44
45#include <sys/time.h>
46#include <sys/buf.h>
47#include "rf_raid.h"
48#include "rf_mcpair.h"
49#include "rf_acctrace.h"
50#include "rf_etimer.h"
51#include "rf_general.h"
52#include "rf_utils.h"
53#include "rf_copyback.h"
54#include "rf_decluster.h"
55#include "rf_driver.h"
56#include "rf_shutdown.h"
57#include "rf_kintf.h"
58
59#define RF_COPYBACK_DATA   0
60#define RF_COPYBACK_PARITY 1
61
62int     rf_copyback_in_progress;
63
64static int rf_CopybackReadDoneProc(RF_CopybackDesc_t * desc, int status);
65static int rf_CopybackWriteDoneProc(RF_CopybackDesc_t * desc, int status);
66static void rf_CopybackOne(RF_CopybackDesc_t * desc, int typ,
67			   RF_RaidAddr_t addr, RF_RowCol_t testCol,
68			   RF_SectorNum_t testOffs);
69static void rf_CopybackComplete(RF_CopybackDesc_t * desc, int status);
70
71int
72rf_ConfigureCopyback(RF_ShutdownList_t **listp)
73{
74	rf_copyback_in_progress = 0;
75	return (0);
76}
77
78#include <sys/param.h>
79#include <sys/systm.h>
80#include <sys/proc.h>
81#include <sys/ioctl.h>
82#include <sys/fcntl.h>
83#include <sys/vnode.h>
84#include <sys/namei.h> /* for pathbuf */
85
86/* do a complete copyback */
87void
88rf_CopybackReconstructedData(RF_Raid_t *raidPtr)
89{
90	RF_ComponentLabel_t *c_label;
91	int     found, retcode;
92	RF_CopybackDesc_t *desc;
93	RF_RowCol_t fcol;
94	RF_RaidDisk_t *badDisk;
95	char   *databuf;
96
97	struct pathbuf *dev_pb;
98	struct vnode *vp;
99	struct vattr va;
100
101	int ac;
102
103	fcol = 0;
104	found = 0;
105	for (fcol = 0; fcol < raidPtr->numCol; fcol++) {
106		if (raidPtr->Disks[fcol].status == rf_ds_dist_spared
107		    || raidPtr->Disks[fcol].status == rf_ds_spared) {
108			found = 1;
109			break;
110		}
111	}
112
113	if (!found) {
114		printf("raid%d: no disks need copyback\n", raidPtr->raidid);
115		return;
116	}
117
118	badDisk = &raidPtr->Disks[fcol];
119
120	/* This device may have been opened successfully the first time. Close
121	 * it before trying to open it again.. */
122
123	if (raidPtr->raid_cinfo[fcol].ci_vp != NULL) {
124		printf("Closed the open device: %s\n",
125		    raidPtr->Disks[fcol].devname);
126		vp = raidPtr->raid_cinfo[fcol].ci_vp;
127		ac = raidPtr->Disks[fcol].auto_configured;
128		rf_close_component(raidPtr, vp, ac);
129		raidPtr->raid_cinfo[fcol].ci_vp = NULL;
130
131	}
132	/* note that this disk was *not* auto_configured (any longer) */
133	raidPtr->Disks[fcol].auto_configured = 0;
134
135	printf("About to (re-)open the device: %s\n",
136	    raidPtr->Disks[fcol].devname);
137
138	dev_pb = pathbuf_create(raidPtr->Disks[fcol].devname);
139	if (dev_pb == NULL) {
140		/* shouldn't happen unless maybe the system is OOMing */
141		printf("raid%d: copyback: pathbuf_create on device: %s failed: %d!\n",
142		       raidPtr->raidid, raidPtr->Disks[fcol].devname,
143		       ENOMEM);
144		return;
145	}
146	retcode = dk_lookup(dev_pb, curlwp, &vp);
147	pathbuf_destroy(dev_pb);
148
149	if (retcode) {
150		printf("raid%d: copyback: dk_lookup on device: %s failed: %d!\n",
151		       raidPtr->raidid, raidPtr->Disks[fcol].devname,
152		       retcode);
153
154		/* XXX the component isn't responding properly... must be
155		 * still dead :-( */
156		return;
157
158	} else {
159
160		/* Ok, so we can at least do a lookup... How about actually
161		 * getting a vp for it? */
162
163		vn_lock(vp, LK_SHARED | LK_RETRY);
164		retcode = VOP_GETATTR(vp, &va, curlwp->l_cred);
165		VOP_UNLOCK(vp);
166		if (retcode != 0)
167			return;
168		retcode = rf_getdisksize(vp, &raidPtr->Disks[fcol]);
169		if (retcode) {
170			return;
171		}
172
173		raidPtr->raid_cinfo[fcol].ci_vp = vp;
174		raidPtr->raid_cinfo[fcol].ci_dev = va.va_rdev;
175
176		raidPtr->Disks[fcol].dev = va.va_rdev;	/* XXX or the above? */
177
178		/* we allow the user to specify that only a fraction of the
179		 * disks should be used this is just for debug:  it speeds up
180		 * the parity scan */
181		raidPtr->Disks[fcol].numBlocks =
182		    raidPtr->Disks[fcol].numBlocks *
183		    rf_sizePercentage / 100;
184	}
185
186	if (retcode) {
187		printf("raid%d: copyback: target disk failed TUR\n",
188		       raidPtr->raidid);
189		return;
190	}
191	/* get a buffer to hold one SU  */
192	RF_Malloc(databuf, rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit), (char *));
193
194	/* create a descriptor */
195	RF_Malloc(desc, sizeof(*desc), (RF_CopybackDesc_t *));
196	desc->raidPtr = raidPtr;
197	desc->status = 0;
198	desc->fcol = fcol;
199	desc->spCol = badDisk->spareCol;
200	desc->stripeAddr = 0;
201	desc->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
202	desc->sectPerStripe = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.numDataCol;
203	desc->databuf = databuf;
204	desc->mcpair = rf_AllocMCPair();
205
206	/* quiesce the array, since we don't want to code support for user
207	 * accs here */
208	rf_SuspendNewRequestsAndWait(raidPtr);
209
210	/* adjust state of the array and of the disks */
211	rf_lock_mutex2(raidPtr->mutex);
212	raidPtr->Disks[desc->fcol].status = rf_ds_optimal;
213	raidPtr->status = rf_rs_optimal;
214	rf_copyback_in_progress = 1;	/* debug only */
215	rf_unlock_mutex2(raidPtr->mutex);
216
217	RF_GETTIME(desc->starttime);
218	rf_ContinueCopyback(desc);
219
220	/* Data has been restored.  Fix up the component label. */
221	/* Don't actually need the read here.. */
222
223	c_label = raidget_component_label(raidPtr, fcol);
224	raid_init_component_label(raidPtr, c_label);
225
226	c_label->row = 0;
227	c_label->column = fcol;
228	rf_component_label_set_partitionsize(c_label,
229	    raidPtr->Disks[fcol].partitionSize);
230
231	raidflush_component_label(raidPtr, fcol);
232
233	/* XXXjld why is this here? */
234	rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE);
235}
236
237
238/*
239 * invoked via callback after a copyback I/O has completed to
240 * continue on with the next one
241 */
242void
243rf_ContinueCopyback(RF_CopybackDesc_t *desc)
244{
245	RF_SectorNum_t testOffs, stripeAddr;
246	RF_Raid_t *raidPtr = desc->raidPtr;
247	RF_RaidAddr_t addr;
248	RF_RowCol_t testCol;
249#if RF_DEBUG_RECON
250	int     old_pctg, new_pctg;
251	struct timeval t, diff;
252#endif
253	int done;
254
255#if RF_DEBUG_RECON
256	old_pctg = (-1);
257#endif
258	while (1) {
259		stripeAddr = desc->stripeAddr;
260		desc->raidPtr->copyback_stripes_done = stripeAddr
261			/ desc->sectPerStripe;
262#if RF_DEBUG_RECON
263		if (rf_prReconSched) {
264			old_pctg = 100 * desc->stripeAddr / raidPtr->totalSectors;
265		}
266#endif
267		desc->stripeAddr += desc->sectPerStripe;
268#if RF_DEBUG_RECON
269		if (rf_prReconSched) {
270			new_pctg = 100 * desc->stripeAddr / raidPtr->totalSectors;
271			if (new_pctg != old_pctg) {
272				RF_GETTIME(t);
273				RF_TIMEVAL_DIFF(&desc->starttime, &t, &diff);
274				printf("%d %d.%06d\n", new_pctg, (int) diff.tv_sec, (int) diff.tv_usec);
275			}
276		}
277#endif
278		if (stripeAddr >= raidPtr->totalSectors) {
279			rf_CopybackComplete(desc, 0);
280			return;
281		}
282		/* walk through the current stripe, su-by-su */
283		for (done = 0, addr = stripeAddr; addr < stripeAddr + desc->sectPerStripe; addr += desc->sectPerSU) {
284
285			/* map the SU, disallowing remap to spare space */
286			(raidPtr->Layout.map->MapSector) (raidPtr, addr, &testCol, &testOffs, RF_DONT_REMAP);
287
288			if (testCol == desc->fcol) {
289				rf_CopybackOne(desc, RF_COPYBACK_DATA, addr, testCol, testOffs);
290				done = 1;
291				break;
292			}
293		}
294
295		if (!done) {
296			/* we didn't find the failed disk in the data part.
297			 * check parity. */
298
299			/* map the parity for this stripe, disallowing remap
300			 * to spare space */
301			(raidPtr->Layout.map->MapParity) (raidPtr, stripeAddr, &testCol, &testOffs, RF_DONT_REMAP);
302
303			if (testCol == desc->fcol) {
304				rf_CopybackOne(desc, RF_COPYBACK_PARITY, stripeAddr, testCol, testOffs);
305			}
306		}
307		/* check to see if the last read/write pair failed */
308		if (desc->status) {
309			rf_CopybackComplete(desc, 1);
310			return;
311		}
312		/* we didn't find any units to copy back in this stripe.
313		 * Continue with the next one */
314	}
315}
316
317
318/* copyback one unit */
319static void
320rf_CopybackOne(RF_CopybackDesc_t *desc, int typ, RF_RaidAddr_t addr,
321	       RF_RowCol_t testCol, RF_SectorNum_t testOffs)
322{
323	RF_SectorCount_t sectPerSU = desc->sectPerSU;
324	RF_Raid_t *raidPtr = desc->raidPtr;
325	RF_RowCol_t spCol = desc->spCol;
326	RF_SectorNum_t spOffs;
327
328	/* find the spare spare location for this SU */
329	if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
330		if (typ == RF_COPYBACK_DATA)
331			raidPtr->Layout.map->MapSector(raidPtr, addr, &spCol, &spOffs, RF_REMAP);
332		else
333			raidPtr->Layout.map->MapParity(raidPtr, addr, &spCol, &spOffs, RF_REMAP);
334	} else {
335		spOffs = testOffs;
336	}
337
338	/* create reqs to read the old location & write the new */
339	desc->readreq = rf_CreateDiskQueueData(RF_IO_TYPE_READ, spOffs,
340	    sectPerSU, desc->databuf, 0L, 0,
341	    (int (*) (void *, int)) rf_CopybackReadDoneProc, desc,
342	    NULL, (void *) raidPtr, RF_DISKQUEUE_DATA_FLAGS_NONE, NULL,
343	    PR_WAITOK);
344	desc->writereq = rf_CreateDiskQueueData(RF_IO_TYPE_WRITE, testOffs,
345	    sectPerSU, desc->databuf, 0L, 0,
346	    (int (*) (void *, int)) rf_CopybackWriteDoneProc, desc,
347	    NULL, (void *) raidPtr, RF_DISKQUEUE_DATA_FLAGS_NONE, NULL,
348	    PR_WAITOK);
349	desc->fcol = testCol;
350
351	/* enqueue the read.  the write will go out as part of the callback on
352	 * the read. at user-level & in the kernel, wait for the read-write
353	 * pair to complete. in the simulator, just return, since everything
354	 * will happen as callbacks */
355
356	RF_LOCK_MCPAIR(desc->mcpair);
357	desc->mcpair->flag = 0;
358	RF_UNLOCK_MCPAIR(desc->mcpair);
359
360	rf_DiskIOEnqueue(&raidPtr->Queues[spCol], desc->readreq, RF_IO_NORMAL_PRIORITY);
361
362	RF_LOCK_MCPAIR(desc->mcpair);
363	while (!desc->mcpair->flag) {
364		RF_WAIT_MCPAIR(desc->mcpair);
365	}
366	RF_UNLOCK_MCPAIR(desc->mcpair);
367	rf_FreeDiskQueueData(desc->readreq);
368	rf_FreeDiskQueueData(desc->writereq);
369
370}
371
372
373/* called at interrupt context when the read has completed.  just send out the write */
374static int
375rf_CopybackReadDoneProc(RF_CopybackDesc_t *desc, int status)
376{
377	if (status) {		/* invoke the callback with bad status */
378		printf("raid%d: copyback read failed.  Aborting.\n",
379		       desc->raidPtr->raidid);
380		(desc->writereq->CompleteFunc) (desc, -100);
381	} else {
382		rf_DiskIOEnqueue(&(desc->raidPtr->Queues[desc->fcol]), desc->writereq, RF_IO_NORMAL_PRIORITY);
383	}
384	return (0);
385}
386/* called at interrupt context when the write has completed.
387 * at user level & in the kernel, wake up the copyback thread.
388 * in the simulator, invoke the next copyback directly.
389 * can't free diskqueuedata structs in the kernel b/c we're at interrupt context.
390 */
391static int
392rf_CopybackWriteDoneProc(RF_CopybackDesc_t *desc, int status)
393{
394	if (status && status != -100) {
395		printf("raid%d: copyback write failed.  Aborting.\n",
396		       desc->raidPtr->raidid);
397	}
398	desc->status = status;
399	rf_MCPairWakeupFunc(desc->mcpair);
400	return (0);
401}
402/* invoked when the copyback has completed */
403static void
404rf_CopybackComplete(RF_CopybackDesc_t *desc, int status)
405{
406	RF_Raid_t *raidPtr = desc->raidPtr;
407	struct timeval t, diff;
408
409	if (!status) {
410		rf_lock_mutex2(raidPtr->mutex);
411		if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
412			RF_ASSERT(raidPtr->Layout.map->parityConfig == 'D');
413			rf_FreeSpareTable(raidPtr);
414		} else {
415			raidPtr->Disks[desc->spCol].status = rf_ds_spare;
416		}
417		rf_unlock_mutex2(raidPtr->mutex);
418
419		RF_GETTIME(t);
420		RF_TIMEVAL_DIFF(&desc->starttime, &t, &diff);
421#if 0
422		printf("Copyback time was %d.%06d seconds\n",
423		    (int) diff.tv_sec, (int) diff.tv_usec);
424#endif
425	} else
426		printf("raid%d: Copyback failure.  Status: %d\n",
427		       raidPtr->raidid, status);
428
429	RF_Free(desc->databuf, rf_RaidAddressToByte(raidPtr, desc->sectPerSU));
430	rf_FreeMCPair(desc->mcpair);
431	RF_Free(desc, sizeof(*desc));
432
433	rf_copyback_in_progress = 0;
434	rf_ResumeNewRequests(raidPtr);
435}
436