/*	$NetBSD: rf_diskqueue.c,v 1.64 2023/09/17 20:07:39 oster Exp $	*/
/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: Mark Holland
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/****************************************************************************
 *
 * rf_diskqueue.c -- higher-level disk queue code
 *
 * The routines here are a generic wrapper around the actual queueing
 * routines.  The code here implements thread scheduling, synchronization,
 * and locking ops (see below) on top of the lower-level queueing code.
 *
 * To support atomic RMW, we implement "locking operations".  When a
 * locking op is dispatched to the lower levels of the driver, the
 * queue is locked, and no further I/Os are dispatched until the queue
 * receives & completes a corresponding "unlocking operation".  This
 * code relies on the higher layers to guarantee that a locking op
 * will always eventually be followed by an unlocking op.  The model
 * is that the higher layers are structured so that locking and unlocking
 * ops occur in pairs, i.e., an unlocking op cannot be generated until
 * after a locking op reports completion.  There is no good way to
 * check that an unlocking op "corresponds" to the op that
 * currently has the queue locked, so we make no such attempt.  Since
 * by definition there can be only one locking op outstanding on a
 * disk, this should not be a problem.
 *
 * In the kernel, we allow multiple I/Os to be concurrently dispatched
 * to the disk driver.  In order to support locking ops in this
 * environment, when we decide to do a locking op, we stop dispatching
 * new I/Os and wait until all dispatched I/Os have completed before
 * dispatching the locking op.
 *
 * Unfortunately, the code differs across the three operating
 * environments (user level, kernel, simulator).  In the kernel, I/O is
 * non-blocking, and we have no disk threads to dispatch for us.
 * Therefore, we have to dispatch new I/Os to the SCSI driver at the
 * time of enqueue, and also at the time of completion.  At user
 * level, I/O is blocking, and so only the disk threads may dispatch
 * I/Os.  Thus at user level, all we can do at enqueue time is enqueue
 * and wake up the disk thread to do the dispatch.
 *
 ****************************************************************************/
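/*
 * Illustrative sketch only: the pairing contract the higher layers are
 * assumed to follow for an atomic read-modify-write.  How a request is
 * marked as a locking or unlocking op is up to those layers; what
 * matters to this code is the ordering:
 *
 *	1. enqueue the locking op (typically the read of the old data)
 *	   via rf_DiskIOEnqueue(); once it is dispatched, the queue is
 *	   locked and no further I/Os are sent to the disk.
 *	2. wait for that op's completion callback (CompleteFunc).
 *	3. only then enqueue the unlocking op (typically the write of
 *	   the new data); when it completes, the queue is unlocked and
 *	   normal dispatching resumes.
 */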

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_diskqueue.c,v 1.64 2023/09/17 20:07:39 oster Exp $");

#include <dev/raidframe/raidframevar.h>

#include "rf_threadstuff.h"
#include "rf_raid.h"
#include "rf_diskqueue.h"
#include "rf_alloclist.h"
#include "rf_acctrace.h"
#include "rf_etimer.h"
#include "rf_general.h"
#include "rf_debugprint.h"
#include "rf_shutdown.h"
#include "rf_cvscan.h"
#include "rf_sstf.h"
#include "rf_fifo.h"
#include "rf_kintf.h"

#include <sys/buf.h>

static void rf_ShutdownDiskQueueSystem(void *);

#ifndef RF_DEBUG_DISKQUEUE
#define RF_DEBUG_DISKQUEUE 0
#endif

#if RF_DEBUG_DISKQUEUE
#define Dprintf1(s,a)         if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL)
#define Dprintf2(s,a,b)       if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL)
#define Dprintf3(s,a,b,c)     if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL)
#else
#define Dprintf1(s,a)
#define Dprintf2(s,a,b)
#define Dprintf3(s,a,b,c)
#endif

/*****************************************************************************
 *
 * The disk queue switch defines, for each queueing discipline, the
 * functions used to drive it: the queue type name, init (create)
 * routine, enqueue routine, dequeue routine, and promote routine.
 *
 ****************************************************************************/

static const RF_DiskQueueSW_t diskqueuesw[] = {
	{"fifo",		/* FIFO */
		rf_FifoCreate,
		rf_FifoEnqueue,
		rf_FifoDequeue,
		rf_FifoPromote},

	{"cvscan",		/* cvscan */
		rf_CvscanCreate,
		rf_CvscanEnqueue,
		rf_CvscanDequeue,
		rf_CvscanPromote},

	{"sstf",		/* shortest seek time first */
		rf_SstfCreate,
		rf_SstfEnqueue,
		rf_SstfDequeue,
		rf_SstfPromote},

	{"scan",		/* SCAN (two-way elevator) */
		rf_ScanCreate,
		rf_SstfEnqueue,
		rf_ScanDequeue,
		rf_SstfPromote},

	{"cscan",		/* CSCAN (one-way elevator) */
		rf_CscanCreate,
		rf_SstfEnqueue,
		rf_CscanDequeue,
		rf_SstfPromote},

};
#define NUM_DISK_QUEUE_TYPES (sizeof(diskqueuesw)/sizeof(RF_DiskQueueSW_t))


#define RF_MAX_FREE_DQD 256
#define RF_MIN_FREE_DQD  64

/* XXX: scale these... */
#define RF_MAX_FREE_BUFIO 256
#define RF_MIN_FREE_BUFIO  64


/* shutdown hook: destroy the mutex created for a single disk queue */
static void
rf_ShutdownDiskQueue(void *arg)
{
	RF_DiskQueue_t *diskqueue = arg;

	rf_destroy_mutex2(diskqueue->mutex);
}

/* configures a single disk queue */
int
rf_ConfigureDiskQueue(RF_Raid_t *raidPtr, RF_DiskQueue_t *diskqueue,
		      RF_RowCol_t c, const RF_DiskQueueSW_t *p,
		      RF_SectorCount_t sectPerDisk, dev_t dev,
		      int maxOutstanding, RF_ShutdownList_t **listp,
		      RF_AllocListElem_t *clList)
{
	diskqueue->col = c;
	diskqueue->qPtr = p;
	diskqueue->qHdr = (p->Create) (sectPerDisk, clList, listp);
	diskqueue->dev = dev;
	diskqueue->numOutstanding = 0;
	diskqueue->queueLength = 0;
	diskqueue->maxOutstanding = maxOutstanding;
	diskqueue->curPriority = RF_IO_NORMAL_PRIORITY;
	diskqueue->flags = 0;
	diskqueue->raidPtr = raidPtr;
	diskqueue->rf_cinfo = &raidPtr->raid_cinfo[c];
	rf_init_mutex2(diskqueue->mutex, IPL_VM);
	rf_ShutdownCreate(listp, rf_ShutdownDiskQueue, diskqueue);
	return (0);
}

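/* refresh the queue's cached dev_t from the component's current disk
   structure (e.g. after the underlying device has changed) */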
int
rf_UpdateDiskQueue(RF_DiskQueue_t *diskqueue, RF_RaidDisk_t *disk)
{
	diskqueue->dev = disk->dev;
	return(0);
}

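/* shutdown hook: tear down the per-set pools created by
   rf_ConfigureDiskQueueSystem() */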
static void
rf_ShutdownDiskQueueSystem(void *arg)
{
	RF_Raid_t *raidPtr;

	raidPtr = (RF_Raid_t *) arg;

	pool_destroy(&raidPtr->pools.dqd);
	pool_destroy(&raidPtr->pools.bufio);
}

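/* create the per-set pools that supply RF_DiskQueueData_t structures
   and struct bufs for component I/O, and register a shutdown hook to
   destroy them */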
int
rf_ConfigureDiskQueueSystem(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
			    RF_Config_t *cfgPtr)
{
	rf_pool_init(raidPtr, raidPtr->poolNames.dqd, &raidPtr->pools.dqd, sizeof(RF_DiskQueueData_t),
		     "dqd", RF_MIN_FREE_DQD, RF_MAX_FREE_DQD);
	rf_pool_init(raidPtr, raidPtr->poolNames.bufio, &raidPtr->pools.bufio, sizeof(buf_t),
		     "bufio", RF_MIN_FREE_BUFIO, RF_MAX_FREE_BUFIO);
	rf_ShutdownCreate(listp, rf_ShutdownDiskQueueSystem, raidPtr);

	return (0);
}

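/* set up all the disk queues for a RAID set: pick the queueing
   discipline named in the configuration (falling back to the first
   entry, fifo, if the name is unknown), allocate the queue array, and
   configure one queue per column plus the spare queues */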
int
rf_ConfigureDiskQueues(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
		       RF_Config_t *cfgPtr)
{
	RF_DiskQueue_t *diskQueues, *spareQueues;
	const RF_DiskQueueSW_t *p;
	RF_RowCol_t r,c;
	int     rc, i;

	raidPtr->maxQueueDepth = cfgPtr->maxOutstandingDiskReqs;

	for (p = NULL, i = 0; i < NUM_DISK_QUEUE_TYPES; i++) {
		if (!strcmp(diskqueuesw[i].queueType, cfgPtr->diskQueueType)) {
			p = &diskqueuesw[i];
			break;
		}
	}
	if (p == NULL) {
		RF_ERRORMSG2("Unknown queue type \"%s\".  Using %s\n", cfgPtr->diskQueueType, diskqueuesw[0].queueType);
		p = &diskqueuesw[0];
	}
	raidPtr->qType = p;

	diskQueues = RF_MallocAndAdd(
	    (raidPtr->numCol + RF_MAXSPARE) * sizeof(*diskQueues),
	    raidPtr->cleanupList);
	if (diskQueues == NULL)
		return (ENOMEM);
	raidPtr->Queues = diskQueues;

	for (c = 0; c < raidPtr->numCol; c++) {
		rc = rf_ConfigureDiskQueue(raidPtr, &diskQueues[c],
					   c, p,
					   raidPtr->sectorsPerDisk,
					   raidPtr->Disks[c].dev,
					   cfgPtr->maxOutstandingDiskReqs,
					   listp, raidPtr->cleanupList);
		if (rc)
			return (rc);
	}

	spareQueues = &raidPtr->Queues[raidPtr->numCol];
	for (r = 0; r < raidPtr->maxQueue; r++) {
		rc = rf_ConfigureDiskQueue(raidPtr, &spareQueues[r],
					   raidPtr->numCol + r, p,
					   raidPtr->sectorsPerDisk,
					   raidPtr->Disks[raidPtr->numCol + r].dev,
					   cfgPtr->maxOutstandingDiskReqs, listp,
					   raidPtr->cleanupList);
		if (rc)
			return (rc);
	}
	return (0);
}
/* Enqueue a disk I/O
 *
 * In the kernel, I/O is non-blocking and so we'd like to have multiple
 * I/Os outstanding on the physical disks when possible.
 *
 * When any request arrives at a queue, we have two choices:
 *    dispatch it to the lower levels
 *    queue it up
 *
 * Kernel rules for when to do what:
 *    unlocking req  :  always dispatch it
 *    normal req     :  queue empty => dispatch it & set priority
 *                      queue not full & priority is ok => dispatch it
 *                      else queue it
 */
void
rf_DiskIOEnqueue(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req, int pri)
{
	RF_ETIMER_START(req->qtime);
	RF_ASSERT(req->type == RF_IO_TYPE_NOP || req->numSector);
	req->priority = pri;

#if RF_DEBUG_DISKQUEUE
	if (rf_queueDebug && (req->numSector == 0)) {
		printf("Warning: Enqueueing zero-sector access\n");
	}
#endif
	RF_LOCK_QUEUE_MUTEX(queue, "DiskIOEnqueue");
	if (RF_OK_TO_DISPATCH(queue, req)) {
		Dprintf2("Dispatching pri %d regular op to c %d (ok to dispatch)\n", pri, queue->col);
		rf_DispatchKernelIO(queue, req);
	} else {
		queue->queueLength++;	/* increment count of number of requests waiting in this queue */
		Dprintf2("Enqueueing pri %d regular op to c %d (not ok to dispatch)\n", pri, queue->col);
		req->queue = (void *) queue;
		(queue->qPtr->Enqueue) (queue->qHdr, req, pri);
	}
	RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOEnqueue");
}
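/*
 * Sketch of a request's typical life cycle through this file (an
 * illustration, not a fixed contract):
 *
 *	req = rf_CreateDiskQueueData(type, sector, nsect, buf, psid,
 *	                             which_ru, wakeF, arg, tracerec,
 *	                             raidPtr, flags, mbp);
 *	rf_DiskIOEnqueue(queue, req, RF_IO_NORMAL_PRIORITY);
 *	...the request is either dispatched immediately via
 *	   rf_DispatchKernelIO() or held in the queue; when the physical
 *	   I/O finishes, wakeF(arg, status) is invoked and
 *	   rf_DiskIOComplete() starts any requests still waiting...
 *	rf_FreeDiskQueueData(req);
 */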


/* get the next set of I/Os started */
void
rf_DiskIOComplete(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req, int status)
{
	int     done = 0;

	RF_LOCK_QUEUE_MUTEX(queue, "DiskIOComplete");
	queue->numOutstanding--;
	RF_ASSERT(queue->numOutstanding >= 0);

	/* dispatch requests to the disk until we find one that we can't. */
	/* no reason to continue once we've filled up the queue */
	/* no reason to even start if the queue is locked */

	while (!done && !RF_QUEUE_FULL(queue)) {
		req = (queue->qPtr->Dequeue) (queue->qHdr);
		if (req) {
			Dprintf2("DiskIOComplete: extracting pri %d req from queue at c %d\n", req->priority, queue->col);
			queue->queueLength--;	/* decrement count of number of requests waiting in this queue */
			RF_ASSERT(queue->queueLength >= 0);
			if (RF_OK_TO_DISPATCH(queue, req)) {
				Dprintf2("DiskIOComplete: dispatching pri %d regular req to c %d (ok to dispatch)\n", req->priority, queue->col);
				rf_DispatchKernelIO(queue, req);
			} else {
				/* we can't dispatch it, so just re-enqueue it.
				   potential trouble here if disk queues batch reqs */
				Dprintf2("DiskIOComplete: re-enqueueing pri %d regular req to c %d\n", req->priority, queue->col);
				queue->queueLength++;
				(queue->qPtr->Enqueue) (queue->qHdr, req, req->priority);
				done = 1;
			}
		} else {
			Dprintf1("DiskIOComplete: no more requests to extract.\n", "");
			done = 1;
		}
	}

	RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOComplete");
}
/* promotes accesses tagged with the given parityStripeID from low priority
 * to normal priority.  This promotion is optional, meaning that a queue
 * need not implement it.  If there is no promotion routine associated with
 * a queue, this routine does nothing and returns -1.
 */
int
rf_DiskIOPromote(RF_DiskQueue_t *queue, RF_StripeNum_t parityStripeID,
		 RF_ReconUnitNum_t which_ru)
{
	int     retval;

	if (!queue->qPtr->Promote)
		return (-1);
	RF_LOCK_QUEUE_MUTEX(queue, "DiskIOPromote");
	retval = (queue->qPtr->Promote) (queue->qHdr, parityStripeID, which_ru);
	RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOPromote");
	return (retval);
}

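/* allocate and initialize a request, including its private struct buf,
   from the per-set pools.  Both allocations use PR_WAITOK, so this
   routine never returns NULL. */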
RF_DiskQueueData_t *
rf_CreateDiskQueueData(RF_IoType_t typ, RF_SectorNum_t ssect,
		       RF_SectorCount_t nsect, void *bf,
		       RF_StripeNum_t parityStripeID,
		       RF_ReconUnitNum_t which_ru,
		       void (*wakeF) (void *, int), void *arg,
		       RF_AccTraceEntry_t *tracerec, RF_Raid_t *raidPtr,
		       RF_DiskQueueDataFlags_t flags, const struct buf *mbp)
{
	RF_DiskQueueData_t *p;

	p = pool_get(&raidPtr->pools.dqd, PR_WAITOK | PR_ZERO);
	KASSERT(p != NULL);

	/* Obtain a buffer from our own pool.  It is possible for the
	   regular getiobuf() to run out of memory and return NULL.
	   We need to guarantee that never happens, as RAIDframe
	   doesn't have a good way to recover if memory allocation
	   fails here.
	*/
	p->bp = pool_get(&raidPtr->pools.bufio, PR_WAITOK | PR_ZERO);
	KASSERT(p->bp != NULL);

	buf_init(p->bp);

	SET(p->bp->b_cflags, BC_BUSY);	/* mark buffer busy */
	if (mbp) {
		SET(p->bp->b_flags, mbp->b_flags & rf_b_pass);
		p->bp->b_proc = mbp->b_proc;
	}

	p->sectorOffset = ssect + rf_protectedSectors;
	p->numSector = nsect;
	p->type = typ;
	p->buf = bf;
	p->parityStripeID = parityStripeID;
	p->which_ru = which_ru;
	p->CompleteFunc = wakeF;
	p->argument = arg;
	p->next = NULL;
	p->tracerec = tracerec;
	p->priority = RF_IO_NORMAL_PRIORITY;
	p->raidPtr = raidPtr;
	p->flags = flags;
	return (p);
}

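/* release a request created by rf_CreateDiskQueueData(): destroy the
   embedded buf and return both it and the request to their pools */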
void
rf_FreeDiskQueueData(RF_DiskQueueData_t *p)
{

	buf_destroy(p->bp);

	pool_put(&p->raidPtr->pools.bufio, p->bp);
	pool_put(&p->raidPtr->pools.dqd, p);
}