/*	$NetBSD: rf_diskqueue.c,v 1.36 2004/11/24 13:42:36 oster Exp $	*/
/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: Mark Holland
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/****************************************************************************
 *
 * rf_diskqueue.c -- higher-level disk queue code
 *
 * the routines here are a generic wrapper around the actual queueing
 * routines.  The code here implements thread scheduling, synchronization,
 * and locking ops (see below) on top of the lower-level queueing code.
 *
 * to support atomic RMW, we implement "locking operations".  When a
 * locking op is dispatched to the lower levels of the driver, the
 * queue is locked, and no further I/Os are dispatched until the queue
 * receives & completes a corresponding "unlocking operation".  This
 * code relies on the higher layers to guarantee that a locking op
 * will always be eventually followed by an unlocking op.  The model
 * is that the higher layers are structured so locking and unlocking
 * ops occur in pairs, i.e.  an unlocking op cannot be generated until
 * after a locking op reports completion.  There is no good way to
 * check to see that an unlocking op "corresponds" to the op that
 * currently has the queue locked, so we make no such attempt.  Since
 * by definition there can be only one locking op outstanding on a
 * disk, this should not be a problem.
 *
 * In the kernel, we allow multiple I/Os to be concurrently dispatched
 * to the disk driver.  In order to support locking ops in this
 * environment, when we decide to do a locking op, we stop dispatching
 * new I/Os and wait until all dispatched I/Os have completed before
 * dispatching the locking op.
 *
 * Unfortunately, the code is different in the 3 different operating
 * states (user level, kernel, simulator).  In the kernel, I/O is
 * non-blocking, and we have no disk threads to dispatch for us.
 * Therefore, we have to dispatch new I/Os to the scsi driver at the
 * time of enqueue, and also at the time of completion.  At user
 * level, I/O is blocking, and so only the disk threads may dispatch
 * I/Os.  Thus at user level, all we can do at enqueue time is enqueue
 * and wake up the disk thread to do the dispatch.
 *
 ****************************************************************************/
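
/*
 * Illustrative sketch, not part of the driver: the pairing discipline the
 * higher layers are expected to follow for an atomic read-modify-write.
 * The names oldDataReq and writeBackReq are hypothetical; what matters is
 * that a request created with the queue-locking flag is eventually
 * followed by one created with the queue-unlocking flag.
 *
 *	rf_DiskIOEnqueue(queue, oldDataReq, pri);   locking op: the queue locks
 *						    when it is dispatched
 *	... oldDataReq completes, the new data is computed ...
 *	rf_DiskIOEnqueue(queue, writeBackReq, pri); unlocking op: its completion
 *						    unlocks the queue again
 */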

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_diskqueue.c,v 1.36 2004/11/24 13:42:36 oster Exp $");

#include <dev/raidframe/raidframevar.h>

#include "rf_threadstuff.h"
#include "rf_raid.h"
#include "rf_diskqueue.h"
#include "rf_alloclist.h"
#include "rf_acctrace.h"
#include "rf_etimer.h"
#include "rf_general.h"
#include "rf_debugprint.h"
#include "rf_shutdown.h"
#include "rf_cvscan.h"
#include "rf_sstf.h"
#include "rf_fifo.h"
#include "rf_kintf.h"

static void rf_ShutdownDiskQueueSystem(void *);

#ifndef RF_DEBUG_DISKQUEUE
#define RF_DEBUG_DISKQUEUE 0
#endif

#if RF_DEBUG_DISKQUEUE
#define Dprintf1(s,a)         if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL)
#define Dprintf2(s,a,b)       if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL)
#define Dprintf3(s,a,b,c)     if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL)
#else
#define Dprintf1(s,a)
#define Dprintf2(s,a,b)
#define Dprintf3(s,a,b,c)
#endif

/*****************************************************************************
 *
 * the disk queue switch defines all the functions used in the
 * different queueing disciplines: queue ID, init routine, enqueue
 * routine, dequeue routine, peek routine, and promote routine
 *
 ****************************************************************************/

static const RF_DiskQueueSW_t diskqueuesw[] = {
	{"fifo",		/* FIFO */
		rf_FifoCreate,
		rf_FifoEnqueue,
		rf_FifoDequeue,
		rf_FifoPeek,
	rf_FifoPromote},

	{"cvscan",		/* cvscan */
		rf_CvscanCreate,
		rf_CvscanEnqueue,
		rf_CvscanDequeue,
		rf_CvscanPeek,
	rf_CvscanPromote},

	{"sstf",		/* shortest seek time first */
		rf_SstfCreate,
		rf_SstfEnqueue,
		rf_SstfDequeue,
		rf_SstfPeek,
	rf_SstfPromote},

	{"scan",		/* SCAN (two-way elevator) */
		rf_ScanCreate,
		rf_SstfEnqueue,
		rf_ScanDequeue,
		rf_ScanPeek,
	rf_SstfPromote},

	{"cscan",		/* CSCAN (one-way elevator) */
		rf_CscanCreate,
		rf_SstfEnqueue,
		rf_CscanDequeue,
		rf_CscanPeek,
	rf_SstfPromote},

};
#define NUM_DISK_QUEUE_TYPES (sizeof(diskqueuesw)/sizeof(RF_DiskQueueSW_t))
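
/*
 * Illustrative sketch, not part of the driver: a new queueing discipline
 * would be added by appending an entry to diskqueuesw above, after which
 * it becomes selectable by name via the diskQueueType field of the RAID
 * configuration (see rf_ConfigureDiskQueues below).  The "lifo" name and
 * the rf_Lifo* routines are hypothetical placeholders for the five
 * routines a discipline must supply.
 *
 *	{"lifo",		(hypothetical last-in first-out)
 *		rf_LifoCreate,
 *		rf_LifoEnqueue,
 *		rf_LifoDequeue,
 *		rf_LifoPeek,
 *	rf_LifoPromote},
 */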

#define RF_MAX_FREE_DQD 256
#define RF_MIN_FREE_DQD  64

#include <sys/buf.h>

/* configures a single disk queue */

int
rf_ConfigureDiskQueue(RF_Raid_t *raidPtr, RF_DiskQueue_t *diskqueue,
		      RF_RowCol_t c, const RF_DiskQueueSW_t *p,
		      RF_SectorCount_t sectPerDisk, dev_t dev,
		      int maxOutstanding, RF_ShutdownList_t **listp,
		      RF_AllocListElem_t *clList)
{
	diskqueue->col = c;
	diskqueue->qPtr = p;
	diskqueue->qHdr = (p->Create) (sectPerDisk, clList, listp);
	diskqueue->dev = dev;
	diskqueue->numOutstanding = 0;
	diskqueue->queueLength = 0;
	diskqueue->maxOutstanding = maxOutstanding;
	diskqueue->curPriority = RF_IO_NORMAL_PRIORITY;
	diskqueue->nextLockingOp = NULL;
	diskqueue->flags = 0;
	diskqueue->raidPtr = raidPtr;
	diskqueue->rf_cinfo = &raidPtr->raid_cinfo[c];
	rf_mutex_init(&diskqueue->mutex);
	diskqueue->cond = 0;
	return (0);
}

static void
rf_ShutdownDiskQueueSystem(void *ignored)
{
	pool_destroy(&rf_pools.dqd);
}

int
rf_ConfigureDiskQueueSystem(RF_ShutdownList_t **listp)
{

	rf_pool_init(&rf_pools.dqd, sizeof(RF_DiskQueueData_t),
		     "rf_dqd_pl", RF_MIN_FREE_DQD, RF_MAX_FREE_DQD);
	rf_ShutdownCreate(listp, rf_ShutdownDiskQueueSystem, NULL);

	return (0);
}

int
rf_ConfigureDiskQueues(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
		       RF_Config_t *cfgPtr)
{
	RF_DiskQueue_t *diskQueues, *spareQueues;
	const RF_DiskQueueSW_t *p;
	RF_RowCol_t r,c;
	int     rc, i;

	raidPtr->maxQueueDepth = cfgPtr->maxOutstandingDiskReqs;

	for (p = NULL, i = 0; i < NUM_DISK_QUEUE_TYPES; i++) {
		if (!strcmp(diskqueuesw[i].queueType, cfgPtr->diskQueueType)) {
			p = &diskqueuesw[i];
			break;
		}
	}
	if (p == NULL) {
		RF_ERRORMSG2("Unknown queue type \"%s\".  Using %s\n", cfgPtr->diskQueueType, diskqueuesw[0].queueType);
		p = &diskqueuesw[0];
	}
	raidPtr->qType = p;

	RF_MallocAndAdd(diskQueues,
			(raidPtr->numCol + RF_MAXSPARE) *
			sizeof(RF_DiskQueue_t), (RF_DiskQueue_t *),
			raidPtr->cleanupList);
	if (diskQueues == NULL)
		return (ENOMEM);
	raidPtr->Queues = diskQueues;

	for (c = 0; c < raidPtr->numCol; c++) {
		rc = rf_ConfigureDiskQueue(raidPtr, &diskQueues[c],
					   c, p,
					   raidPtr->sectorsPerDisk,
					   raidPtr->Disks[c].dev,
					   cfgPtr->maxOutstandingDiskReqs,
					   listp, raidPtr->cleanupList);
		if (rc)
			return (rc);
	}

	spareQueues = &raidPtr->Queues[raidPtr->numCol];
	for (r = 0; r < raidPtr->numSpare; r++) {
		rc = rf_ConfigureDiskQueue(raidPtr, &spareQueues[r],
					   raidPtr->numCol + r, p,
					   raidPtr->sectorsPerDisk,
					   raidPtr->Disks[raidPtr->numCol + r].dev,
					   cfgPtr->maxOutstandingDiskReqs, listp,
					   raidPtr->cleanupList);
		if (rc)
			return (rc);
	}
	return (0);
}
/* Enqueue a disk I/O
 *
 * Unfortunately, we have to do things differently in the different
 * environments (simulator, user-level, kernel).
 * At user level, all I/O is blocking, so we have 1 or more threads/disk
 * and the thread that enqueues is different from the thread that dequeues.
 * In the kernel, I/O is non-blocking and so we'd like to have multiple
 * I/Os outstanding on the physical disks when possible.
 *
 * when any request arrives at a queue, we have two choices:
 *    dispatch it to the lower levels
 *    queue it up
 *
 * kernel rules for when to do what:
 *    locking request:  queue empty => dispatch and lock queue,
 *                      else queue it
 *    unlocking req  :  always dispatch it
 *    normal req     :  queue empty => dispatch it & set priority
 *                      queue not full & priority is ok => dispatch it
 *                      else queue it
 *
 * user-level rules:
 *    always enqueue.  In the special case of an unlocking op, enqueue
 *    in a special way that will cause the unlocking op to be the next
 *    thing dequeued.
 *
 * simulator rules:
 *    Do the same as at user level, with the sleeps and wakeups suppressed.
 */
void
rf_DiskIOEnqueue(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req, int pri)
{
	RF_ETIMER_START(req->qtime);
	RF_ASSERT(req->type == RF_IO_TYPE_NOP || req->numSector);
	req->priority = pri;

#if RF_DEBUG_DISKQUEUE
	if (rf_queueDebug && (req->numSector == 0)) {
		printf("Warning: Enqueueing zero-sector access\n");
	}
#endif
	/*
         * kernel
         */
	RF_LOCK_QUEUE_MUTEX(queue, "DiskIOEnqueue");
	/* locking request */
	if (RF_LOCKING_REQ(req)) {
		if (RF_QUEUE_EMPTY(queue)) {
			Dprintf2("Dispatching pri %d locking op to c %d (queue empty)\n", pri, queue->col);
			RF_LOCK_QUEUE(queue);
			rf_DispatchKernelIO(queue, req);
		} else {
			queue->queueLength++;	/* increment count of number
						 * of requests waiting in this
						 * queue */
			Dprintf2("Enqueueing pri %d locking op to c %d (queue not empty)\n", pri, queue->col);
			req->queue = (void *) queue;
			(queue->qPtr->Enqueue) (queue->qHdr, req, pri);
		}
	}
	/* unlocking request */
	else
		if (RF_UNLOCKING_REQ(req)) {	/* we'll do the actual unlock
						 * when this I/O completes */
			Dprintf2("Dispatching pri %d unlocking op to c %d\n", pri, queue->col);
			RF_ASSERT(RF_QUEUE_LOCKED(queue));
			rf_DispatchKernelIO(queue, req);
		}
	/* normal request */
		else
			if (RF_OK_TO_DISPATCH(queue, req)) {
				Dprintf2("Dispatching pri %d regular op to c %d (ok to dispatch)\n", pri, queue->col);
				rf_DispatchKernelIO(queue, req);
			} else {
				queue->queueLength++;	/* increment count of
							 * number of requests
							 * waiting in this queue */
				Dprintf2("Enqueueing pri %d regular op to c %d (not ok to dispatch)\n", pri, queue->col);
				req->queue = (void *) queue;
				(queue->qPtr->Enqueue) (queue->qHdr, req, pri);
			}
	RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOEnqueue");
}
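
/*
 * Illustrative sketch, not part of the driver: the typical way a caller
 * hands an I/O to this layer.  A request is built with
 * rf_CreateDiskQueueData() and passed to rf_DiskIOEnqueue(), which either
 * dispatches it at once or queues it according to the rules above.  The
 * names rdReq, startSect, numSect, myBuf, psid, ru, myCallback, myArg,
 * tracerec, b_proc and col are hypothetical.
 *
 *	rdReq = rf_CreateDiskQueueData(RF_IO_TYPE_READ, startSect, numSect,
 *	    myBuf, psid, ru, myCallback, myArg, NULL, tracerec,
 *	    (void *) raidPtr, 0, b_proc);
 *	if (rdReq != NULL)
 *		rf_DiskIOEnqueue(&raidPtr->Queues[col], rdReq,
 *		    RF_IO_NORMAL_PRIORITY);
 */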


/* get the next set of I/Os started, kernel version only */
void
rf_DiskIOComplete(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req, int status)
{
	int     done = 0;

	RF_LOCK_QUEUE_MUTEX(queue, "DiskIOComplete");

	/* unlock the queue: (1) after an unlocking req completes (2) after a
	 * locking req fails */
	if (RF_UNLOCKING_REQ(req) || (RF_LOCKING_REQ(req) && status)) {
		Dprintf1("DiskIOComplete: unlocking queue at c %d\n", queue->col);
		RF_ASSERT(RF_QUEUE_LOCKED(queue));
		RF_UNLOCK_QUEUE(queue);
	}
	queue->numOutstanding--;
	RF_ASSERT(queue->numOutstanding >= 0);

	/* dispatch requests to the disk until we find one that we can't. */
	/* no reason to continue once we've filled up the queue */
	/* no reason to even start if the queue is locked */

	while (!done && !RF_QUEUE_FULL(queue) && !RF_QUEUE_LOCKED(queue)) {
		if (queue->nextLockingOp) {
			req = queue->nextLockingOp;
			queue->nextLockingOp = NULL;
			Dprintf2("DiskIOComplete: a pri %d locking req was pending at c %d\n", req->priority, queue->col);
		} else {
			req = (queue->qPtr->Dequeue) (queue->qHdr);
			if (req != NULL) {
				Dprintf2("DiskIOComplete: extracting pri %d req from queue at c %d\n", req->priority, queue->col);
			} else {
				Dprintf1("DiskIOComplete: no more requests to extract.\n", "");
			}
		}
		if (req) {
			queue->queueLength--;	/* decrement count of number
						 * of requests waiting in this
						 * queue */
			RF_ASSERT(queue->queueLength >= 0);
		}
		if (!req)
			done = 1;
		else
			if (RF_LOCKING_REQ(req)) {
				if (RF_QUEUE_EMPTY(queue)) {	/* dispatch it */
					Dprintf2("DiskIOComplete: dispatching pri %d locking req to c %d (queue empty)\n", req->priority, queue->col);
					RF_LOCK_QUEUE(queue);
					rf_DispatchKernelIO(queue, req);
					done = 1;
				} else {	/* put it aside to wait for
						 * the queue to drain */
					Dprintf2("DiskIOComplete: postponing pri %d locking req to c %d\n", req->priority, queue->col);
					RF_ASSERT(queue->nextLockingOp == NULL);
					queue->nextLockingOp = req;
					done = 1;
				}
			} else
				if (RF_UNLOCKING_REQ(req)) {	/* should not happen:
								 * unlocking ops should
								 * not get queued */
					RF_ASSERT(RF_QUEUE_LOCKED(queue));	/* support it anyway for
										 * the future */
					Dprintf2("DiskIOComplete: dispatching pri %d unl req to c %d (SHOULD NOT SEE THIS)\n", req->priority, queue->col);
					rf_DispatchKernelIO(queue, req);
					done = 1;
				} else
					if (RF_OK_TO_DISPATCH(queue, req)) {
						Dprintf2("DiskIOComplete: dispatching pri %d regular req to c %d (ok to dispatch)\n", req->priority, queue->col);
						rf_DispatchKernelIO(queue, req);
					} else {	/* we can't dispatch it,
							 * so just re-enqueue
							 * it.  */
						/* potential trouble here if
						 * disk queues batch reqs */
						Dprintf2("DiskIOComplete: re-enqueueing pri %d regular req to c %d\n", req->priority, queue->col);
						queue->queueLength++;
						(queue->qPtr->Enqueue) (queue->qHdr, req, req->priority);
						done = 1;
					}
	}

	RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOComplete");
}
/* promotes accesses tagged with the given parityStripeID from low priority
 * to normal priority.  This promotion is optional, meaning that a queue
 * need not implement it.  If there is no promotion routine associated with
 * a queue, this routine does nothing and returns -1.
 */
int
rf_DiskIOPromote(RF_DiskQueue_t *queue, RF_StripeNum_t parityStripeID,
		 RF_ReconUnitNum_t which_ru)
{
	int     retval;

	if (!queue->qPtr->Promote)
		return (-1);
	RF_LOCK_QUEUE_MUTEX(queue, "DiskIOPromote");
	retval = (queue->qPtr->Promote) (queue->qHdr, parityStripeID, which_ru);
	RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOPromote");
	return (retval);
}
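
/*
 * Illustrative sketch, not part of the driver: a caller that queued
 * low-priority accesses tagged with a parity stripe ID can later bump them
 * to normal priority.  The names psid, ru, col and nPromoted are
 * hypothetical; a return of -1 only means the active discipline supplies
 * no Promote routine, otherwise the value is whatever that routine reports.
 *
 *	nPromoted = rf_DiskIOPromote(&raidPtr->Queues[col], psid, ru);
 *	if (nPromoted > 0)
 *		... matching requests will now be dequeued at normal priority ...
 */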

RF_DiskQueueData_t *
rf_CreateDiskQueueData(RF_IoType_t typ, RF_SectorNum_t ssect,
		       RF_SectorCount_t nsect, caddr_t buf,
		       RF_StripeNum_t parityStripeID,
		       RF_ReconUnitNum_t which_ru,
		       int (*wakeF) (void *, int), void *arg,
		       RF_DiskQueueData_t *next,
		       RF_AccTraceEntry_t *tracerec, void *raidPtr,
		       RF_DiskQueueDataFlags_t flags, void *kb_proc)
{
	RF_DiskQueueData_t *p;
	int s;

	p = pool_get(&rf_pools.dqd, PR_WAITOK);
	memset(p, 0, sizeof(RF_DiskQueueData_t));
	/* Need to be at splbio to access bufpool! */
	s = splbio();
	p->bp = pool_get(&bufpool, PR_NOWAIT); /* XXX: make up our minds here.
						  WAITOK, or NOWAIT?? */
	splx(s);
	if (p->bp == NULL) {
		/* no memory for the buffer!?!? */
		pool_put(&rf_pools.dqd, p);
		return(NULL);
	}

	memset(p->bp, 0, sizeof(struct buf));
	p->sectorOffset = ssect + rf_protectedSectors;
	p->numSector = nsect;
	p->type = typ;
	p->buf = buf;
	p->parityStripeID = parityStripeID;
	p->which_ru = which_ru;
	p->CompleteFunc = wakeF;
	p->argument = arg;
	p->next = next;
	p->tracerec = tracerec;
	p->priority = RF_IO_NORMAL_PRIORITY;
	p->raidPtr = raidPtr;
	p->flags = flags;
	p->b_proc = kb_proc;
	return (p);
}

void
rf_FreeDiskQueueData(RF_DiskQueueData_t *p)
{
	int s;

	s = splbio();
	pool_put(&bufpool, p->bp);
	splx(s);
	pool_put(&rf_pools.dqd, p);
}
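
/*
 * Illustrative sketch, not part of the driver: the life cycle of a request
 * as seen from this layer.  Allocation and release are symmetric; the code
 * that fields the physical I/O completion is expected to call
 * rf_DiskIOComplete() on the owning queue so that waiting requests can be
 * dispatched, and whoever allocated the request eventually frees it.  The
 * names req, queue, pri and status are hypothetical.
 *
 *	req = rf_CreateDiskQueueData(...);	allocates req and req->bp
 *	rf_DiskIOEnqueue(queue, req, pri);	dispatch now, or queue for later
 *	... the physical I/O completes ...
 *	rf_DiskIOComplete(queue, req, status);	unlock and/or dispatch more I/Os
 *	rf_FreeDiskQueueData(req);		returns req->bp and req to their pools
 */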
495