/*	$NetBSD: rf_diskqueue.c,v 1.59 2021/07/23 00:26:19 oster Exp $	*/
/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: Mark Holland
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/****************************************************************************
 *
 * rf_diskqueue.c -- higher-level disk queue code
 *
 * the routines here are a generic wrapper around the actual queueing
 * routines.  The code here implements thread scheduling, synchronization,
 * and locking ops (see below) on top of the lower-level queueing code.
 *
 * to support atomic RMW, we implement "locking operations".  When a
 * locking op is dispatched to the lower levels of the driver, the
 * queue is locked, and no further I/Os are dispatched until the queue
 * receives & completes a corresponding "unlocking operation".  This
 * code relies on the higher layers to guarantee that a locking op
 * will always be eventually followed by an unlocking op.  The model
 * is that the higher layers are structured so locking and unlocking
 * ops occur in pairs, i.e.  an unlocking op cannot be generated until
 * after a locking op reports completion.  There is no good way to
 * check to see that an unlocking op "corresponds" to the op that
 * currently has the queue locked, so we make no such attempt.  Since
 * by definition there can be only one locking op outstanding on a
 * disk, this should not be a problem.
 *
 * In the kernel, we allow multiple I/Os to be concurrently dispatched
 * to the disk driver.  In order to support locking ops in this
 * environment, when we decide to do a locking op, we stop dispatching
 * new I/Os and wait until all dispatched I/Os have completed before
 * dispatching the locking op.
 *
 * Unfortunately, the code is different in the 3 different operating
 * states (user level, kernel, simulator).  In the kernel, I/O is
 * non-blocking, and we have no disk threads to dispatch for us.
 * Therefore, we have to dispatch new I/Os to the scsi driver at the
 * time of enqueue, and also at the time of completion.  At user
 * level, I/O is blocking, and so only the disk threads may dispatch
 * I/Os.  Thus at user level, all we can do at enqueue time is enqueue
 * and wake up the disk thread to do the dispatch.
 *
 ****************************************************************************/
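
/*
 * Illustrative sketch (not part of the driver): under the pairing model
 * described above, a higher layer performing an atomic read-modify-write
 * might, roughly,
 *
 *	issue the read as a locking op       (the queue locks on dispatch)
 *	wait for the read to report completion
 *	compute the new contents
 *	issue the write as an unlocking op   (the queue unlocks on completion)
 *
 * so that no other I/O to that disk can be dispatched between the read
 * and the write.
 */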

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_diskqueue.c,v 1.59 2021/07/23 00:26:19 oster Exp $");

#include <dev/raidframe/raidframevar.h>

#include "rf_threadstuff.h"
#include "rf_raid.h"
#include "rf_diskqueue.h"
#include "rf_alloclist.h"
#include "rf_acctrace.h"
#include "rf_etimer.h"
#include "rf_general.h"
#include "rf_debugprint.h"
#include "rf_shutdown.h"
#include "rf_cvscan.h"
#include "rf_sstf.h"
#include "rf_fifo.h"
#include "rf_kintf.h"

#include <sys/buf.h>

static void rf_ShutdownDiskQueueSystem(void *);

#ifndef RF_DEBUG_DISKQUEUE
#define RF_DEBUG_DISKQUEUE 0
#endif

#if RF_DEBUG_DISKQUEUE
#define Dprintf1(s,a)         if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL)
#define Dprintf2(s,a,b)       if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL)
#define Dprintf3(s,a,b,c)     if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL)
#else
#define Dprintf1(s,a)
#define Dprintf2(s,a,b)
#define Dprintf3(s,a,b,c)
#endif

/*****************************************************************************
 *
 * The disk queue switch defines, for each queueing discipline, the
 * functions used by this generic layer: the queue type name and the
 * create, enqueue, dequeue, peek, and promote routines.
 *
 ****************************************************************************/

static const RF_DiskQueueSW_t diskqueuesw[] = {
	{"fifo",		/* FIFO */
		rf_FifoCreate,
		rf_FifoEnqueue,
		rf_FifoDequeue,
		rf_FifoPeek,
	rf_FifoPromote},

	{"cvscan",		/* cvscan */
		rf_CvscanCreate,
		rf_CvscanEnqueue,
		rf_CvscanDequeue,
		rf_CvscanPeek,
	rf_CvscanPromote},

	{"sstf",		/* shortest seek time first */
		rf_SstfCreate,
		rf_SstfEnqueue,
		rf_SstfDequeue,
		rf_SstfPeek,
	rf_SstfPromote},

	{"scan",		/* SCAN (two-way elevator) */
		rf_ScanCreate,
		rf_SstfEnqueue,
		rf_ScanDequeue,
		rf_ScanPeek,
	rf_SstfPromote},

	{"cscan",		/* CSCAN (one-way elevator) */
		rf_CscanCreate,
		rf_SstfEnqueue,
		rf_CscanDequeue,
		rf_CscanPeek,
	rf_SstfPromote},

};
#define NUM_DISK_QUEUE_TYPES (sizeof(diskqueuesw)/sizeof(RF_DiskQueueSW_t))
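
/*
 * Illustrative note (not part of the driver): a new queueing discipline
 * would supply its own create/enqueue/dequeue/peek/promote routines and
 * add an entry to diskqueuesw[] above, e.g. (hypothetical names):
 *
 *	{"xyz",
 *		rf_XyzCreate,
 *		rf_XyzEnqueue,
 *		rf_XyzDequeue,
 *		rf_XyzPeek,
 *	rf_XyzPromote},
 */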


#define RF_MAX_FREE_DQD 256
#define RF_MIN_FREE_DQD  64

/* XXX: scale these... */
#define RF_MAX_FREE_BUFIO 256
#define RF_MIN_FREE_BUFIO  64


/* tears down the mutex protecting a single disk queue */

static void
rf_ShutdownDiskQueue(void *arg)
{
	RF_DiskQueue_t *diskqueue = arg;

	rf_destroy_mutex2(diskqueue->mutex);
}

/* configures a single disk queue */
int
rf_ConfigureDiskQueue(RF_Raid_t *raidPtr, RF_DiskQueue_t *diskqueue,
		      RF_RowCol_t c, const RF_DiskQueueSW_t *p,
		      RF_SectorCount_t sectPerDisk, dev_t dev,
		      int maxOutstanding, RF_ShutdownList_t **listp,
		      RF_AllocListElem_t *clList)
{
	diskqueue->col = c;
	diskqueue->qPtr = p;
	diskqueue->qHdr = (p->Create) (sectPerDisk, clList, listp);
	diskqueue->dev = dev;
	diskqueue->numOutstanding = 0;
	diskqueue->queueLength = 0;
	diskqueue->maxOutstanding = maxOutstanding;
	diskqueue->curPriority = RF_IO_NORMAL_PRIORITY;
	diskqueue->flags = 0;
	diskqueue->raidPtr = raidPtr;
	diskqueue->rf_cinfo = &raidPtr->raid_cinfo[c];
	rf_init_mutex2(diskqueue->mutex, IPL_VM);
	rf_ShutdownCreate(listp, rf_ShutdownDiskQueue, diskqueue);
	return (0);
}

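/* tears down the pools created by rf_ConfigureDiskQueueSystem */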
static void
rf_ShutdownDiskQueueSystem(void *ignored)
{
	pool_destroy(&rf_pools.dqd);
	pool_destroy(&rf_pools.bufio);
}

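/* sets up the pools used for disk queue request structures and their I/O buffers */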
int
rf_ConfigureDiskQueueSystem(RF_ShutdownList_t **listp)
{

	rf_pool_init(&rf_pools.dqd, sizeof(RF_DiskQueueData_t),
		     "rf_dqd_pl", RF_MIN_FREE_DQD, RF_MAX_FREE_DQD);
	rf_pool_init(&rf_pools.bufio, sizeof(buf_t),
		     "rf_bufio_pl", RF_MIN_FREE_BUFIO, RF_MAX_FREE_BUFIO);
	rf_ShutdownCreate(listp, rf_ShutdownDiskQueueSystem, NULL);

	return (0);
}

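/* picks the queueing discipline and configures one queue per data column plus the spare disks */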
int
rf_ConfigureDiskQueues(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
		       RF_Config_t *cfgPtr)
{
	RF_DiskQueue_t *diskQueues, *spareQueues;
	const RF_DiskQueueSW_t *p;
	RF_RowCol_t r,c;
	int     rc, i;

	raidPtr->maxQueueDepth = cfgPtr->maxOutstandingDiskReqs;

	for (p = NULL, i = 0; i < NUM_DISK_QUEUE_TYPES; i++) {
		if (!strcmp(diskqueuesw[i].queueType, cfgPtr->diskQueueType)) {
			p = &diskqueuesw[i];
			break;
		}
	}
	if (p == NULL) {
		RF_ERRORMSG2("Unknown queue type \"%s\".  Using %s\n", cfgPtr->diskQueueType, diskqueuesw[0].queueType);
		p = &diskqueuesw[0];
	}
	raidPtr->qType = p;

	diskQueues = RF_MallocAndAdd(
	    (raidPtr->numCol + RF_MAXSPARE) * sizeof(*diskQueues),
	    raidPtr->cleanupList);
	if (diskQueues == NULL)
		return (ENOMEM);
	raidPtr->Queues = diskQueues;

	for (c = 0; c < raidPtr->numCol; c++) {
		rc = rf_ConfigureDiskQueue(raidPtr, &diskQueues[c],
					   c, p,
					   raidPtr->sectorsPerDisk,
					   raidPtr->Disks[c].dev,
					   cfgPtr->maxOutstandingDiskReqs,
					   listp, raidPtr->cleanupList);
		if (rc)
			return (rc);
	}

	spareQueues = &raidPtr->Queues[raidPtr->numCol];
	for (r = 0; r < raidPtr->numSpare; r++) {
		rc = rf_ConfigureDiskQueue(raidPtr, &spareQueues[r],
					   raidPtr->numCol + r, p,
					   raidPtr->sectorsPerDisk,
					   raidPtr->Disks[raidPtr->numCol + r].dev,
					   cfgPtr->maxOutstandingDiskReqs, listp,
					   raidPtr->cleanupList);
		if (rc)
			return (rc);
	}
	return (0);
}
/* Enqueue a disk I/O
 *
 * In the kernel, I/O is non-blocking and so we'd like to have multiple
 * I/Os outstanding on the physical disks when possible.
 *
 * when any request arrives at a queue, we have two choices:
 *    dispatch it to the lower levels
 *    queue it up
 *
 * kernel rules for when to do what:
 *    unlocking req  :  always dispatch it
 *    normal req     :  queue empty => dispatch it & set priority
 *                      queue not full & priority is ok => dispatch it
 *                      else queue it
 */
void
rf_DiskIOEnqueue(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req, int pri)
{
	RF_ETIMER_START(req->qtime);
	RF_ASSERT(req->type == RF_IO_TYPE_NOP || req->numSector);
	req->priority = pri;

#if RF_DEBUG_DISKQUEUE
	if (rf_queueDebug && (req->numSector == 0)) {
		printf("Warning: Enqueueing zero-sector access\n");
	}
#endif
	RF_LOCK_QUEUE_MUTEX(queue, "DiskIOEnqueue");
	if (RF_OK_TO_DISPATCH(queue, req)) {
		Dprintf2("Dispatching pri %d regular op to c %d (ok to dispatch)\n", pri, queue->col);
		rf_DispatchKernelIO(queue, req);
	} else {
		queue->queueLength++;	/* increment count of number of requests waiting in this queue */
		Dprintf2("Enqueueing pri %d regular op to c %d (not ok to dispatch)\n", pri, queue->col);
		req->queue = (void *) queue;
		(queue->qPtr->Enqueue) (queue->qHdr, req, pri);
	}
	RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOEnqueue");
}
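
/*
 * Illustrative usage sketch (not part of the driver): a caller builds a
 * request with rf_CreateDiskQueueData() (defined below) and hands it to
 * the queue for column c.  The function names example_done and
 * example_submit_read, and the specific argument values, are hypothetical.
 */
#if 0
static void
example_done(void *arg, int status)
{
	/* hypothetical completion callback, invoked when the I/O finishes */
}

static void
example_submit_read(RF_Raid_t *raidPtr, RF_RowCol_t c,
    RF_SectorNum_t sector, RF_SectorCount_t nblk, void *buffer)
{
	RF_DiskQueueData_t *req;

	req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, sector, nblk, buffer,
	    0 /* parityStripeID */, 0 /* which_ru */,
	    example_done, NULL /* arg */, NULL /* tracerec */,
	    raidPtr, 0 /* flags */, NULL /* mbp */, 0 /* waitflag (unused) */);

	/* dispatch immediately if possible, otherwise queue for later */
	rf_DiskIOEnqueue(&raidPtr->Queues[c], req, RF_IO_NORMAL_PRIORITY);
}
#endif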


/* get the next set of I/Os started */
void
rf_DiskIOComplete(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req, int status)
{
	int     done = 0;

	RF_LOCK_QUEUE_MUTEX(queue, "DiskIOComplete");
	queue->numOutstanding--;
	RF_ASSERT(queue->numOutstanding >= 0);

	/* dispatch requests to the disk until we find one that we can't. */
	/* no reason to continue once we've filled up the queue */
	/* no reason to even start if the queue is locked */

	while (!done && !RF_QUEUE_FULL(queue)) {
		req = (queue->qPtr->Dequeue) (queue->qHdr);
		if (req) {
			Dprintf2("DiskIOComplete: extracting pri %d req from queue at c %d\n", req->priority, queue->col);
			queue->queueLength--;	/* decrement count of number of requests waiting in this queue */
			RF_ASSERT(queue->queueLength >= 0);
			if (RF_OK_TO_DISPATCH(queue, req)) {
				Dprintf2("DiskIOComplete: dispatching pri %d regular req to c %d (ok to dispatch)\n", req->priority, queue->col);
				rf_DispatchKernelIO(queue, req);
			} else {
				/* we can't dispatch it, so just re-enqueue it.
				   potential trouble here if disk queues batch reqs */
				Dprintf2("DiskIOComplete: re-enqueueing pri %d regular req to c %d\n", req->priority, queue->col);
				queue->queueLength++;
				(queue->qPtr->Enqueue) (queue->qHdr, req, req->priority);
				done = 1;
			}
		} else {
			Dprintf1("DiskIOComplete: no more requests to extract.\n", "");
			done = 1;
		}
	}

	RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOComplete");
}
/* promotes accesses tagged with the given parityStripeID from low priority
 * to normal priority.  This promotion is optional, meaning that a queue
 * need not implement it.  If there is no promotion routine associated with
 * a queue, this routine does nothing and returns -1.
 */
int
rf_DiskIOPromote(RF_DiskQueue_t *queue, RF_StripeNum_t parityStripeID,
		 RF_ReconUnitNum_t which_ru)
{
	int     retval;

	if (!queue->qPtr->Promote)
		return (-1);
	RF_LOCK_QUEUE_MUTEX(queue, "DiskIOPromote");
	retval = (queue->qPtr->Promote) (queue->qHdr, parityStripeID, which_ru);
	RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOPromote");
	return (retval);
}

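/* allocates and initializes a disk queue request (and its struct buf) from the pools above */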
RF_DiskQueueData_t *
rf_CreateDiskQueueData(RF_IoType_t typ, RF_SectorNum_t ssect,
		       RF_SectorCount_t nsect, void *bf,
		       RF_StripeNum_t parityStripeID,
		       RF_ReconUnitNum_t which_ru,
		       void (*wakeF) (void *, int), void *arg,
		       RF_AccTraceEntry_t *tracerec, RF_Raid_t *raidPtr,
		       RF_DiskQueueDataFlags_t flags, const struct buf *mbp,
		       int waitflag)
{
	RF_DiskQueueData_t *p;

	p = pool_get(&rf_pools.dqd, PR_WAITOK | PR_ZERO);
	KASSERT(p != NULL);

	/* Obtain a buffer from our own pool.  It is possible for the
	   regular getiobuf() to run out of memory and return NULL.
	   We need to guarantee that never happens, as RAIDframe
	   doesn't have a good way to recover if memory allocation
	   fails here.
	*/
	p->bp = pool_get(&rf_pools.bufio, PR_WAITOK | PR_ZERO);
	KASSERT(p->bp != NULL);

	buf_init(p->bp);

	SET(p->bp->b_cflags, BC_BUSY);	/* mark buffer busy */
	if (mbp) {
		SET(p->bp->b_flags, mbp->b_flags & rf_b_pass);
		p->bp->b_proc = mbp->b_proc;
	}

	p->sectorOffset = ssect + rf_protectedSectors;
	p->numSector = nsect;
	p->type = typ;
	p->buf = bf;
	p->parityStripeID = parityStripeID;
	p->which_ru = which_ru;
	p->CompleteFunc = wakeF;
	p->argument = arg;
	p->next = NULL;
	p->tracerec = tracerec;
	p->priority = RF_IO_NORMAL_PRIORITY;
	p->raidPtr = raidPtr;
	p->flags = flags;
	return (p);
}

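/* returns the request and its buffer to their respective pools */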
void
rf_FreeDiskQueueData(RF_DiskQueueData_t *p)
{
	pool_put(&rf_pools.bufio, p->bp);
	pool_put(&rf_pools.dqd, p);
}