rf_diskqueue.c revision 1.36
1/* $NetBSD: rf_diskqueue.c,v 1.36 2004/11/24 13:42:36 oster Exp $ */ 2/* 3 * Copyright (c) 1995 Carnegie-Mellon University. 4 * All rights reserved. 5 * 6 * Author: Mark Holland 7 * 8 * Permission to use, copy, modify and distribute this software and 9 * its documentation is hereby granted, provided that both the copyright 10 * notice and this permission notice appear in all copies of the 11 * software, derivative works or modified versions, and any portions 12 * thereof, and that both notices appear in supporting documentation. 13 * 14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 17 * 18 * Carnegie Mellon requests users of this software to return to 19 * 20 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 21 * School of Computer Science 22 * Carnegie Mellon University 23 * Pittsburgh PA 15213-3890 24 * 25 * any improvements or extensions that they make and grant Carnegie the 26 * rights to redistribute these changes. 27 */ 28 29/**************************************************************************** 30 * 31 * rf_diskqueue.c -- higher-level disk queue code 32 * 33 * the routines here are a generic wrapper around the actual queueing 34 * routines. The code here implements thread scheduling, synchronization, 35 * and locking ops (see below) on top of the lower-level queueing code. 36 * 37 * to support atomic RMW, we implement "locking operations". When a 38 * locking op is dispatched to the lower levels of the driver, the 39 * queue is locked, and no further I/Os are dispatched until the queue 40 * receives & completes a corresponding "unlocking operation". This 41 * code relies on the higher layers to guarantee that a locking op 42 * will always be eventually followed by an unlocking op. The model 43 * is that the higher layers are structured so locking and unlocking 44 * ops occur in pairs, i.e. an unlocking op cannot be generated until 45 * after a locking op reports completion. There is no good way to 46 * check to see that an unlocking op "corresponds" to the op that 47 * currently has the queue locked, so we make no such attempt. Since 48 * by definition there can be only one locking op outstanding on a 49 * disk, this should not be a problem. 50 * 51 * In the kernel, we allow multiple I/Os to be concurrently dispatched 52 * to the disk driver. In order to support locking ops in this 53 * environment, when we decide to do a locking op, we stop dispatching 54 * new I/Os and wait until all dispatched I/Os have completed before 55 * dispatching the locking op. 56 * 57 * Unfortunately, the code is different in the 3 different operating 58 * states (user level, kernel, simulator). In the kernel, I/O is 59 * non-blocking, and we have no disk threads to dispatch for us. 60 * Therefore, we have to dispatch new I/Os to the scsi driver at the 61 * time of enqueue, and also at the time of completion. At user 62 * level, I/O is blocking, and so only the disk threads may dispatch 63 * I/Os. Thus at user level, all we can do at enqueue time is enqueue 64 * and wake up the disk thread to do the dispatch. 65 * 66 ****************************************************************************/ 67 68#include <sys/cdefs.h> 69__KERNEL_RCSID(0, "$NetBSD: rf_diskqueue.c,v 1.36 2004/11/24 13:42:36 oster Exp $"); 70 71#include <dev/raidframe/raidframevar.h> 72 73#include "rf_threadstuff.h" 74#include "rf_raid.h" 75#include "rf_diskqueue.h" 76#include "rf_alloclist.h" 77#include "rf_acctrace.h" 78#include "rf_etimer.h" 79#include "rf_general.h" 80#include "rf_debugprint.h" 81#include "rf_shutdown.h" 82#include "rf_cvscan.h" 83#include "rf_sstf.h" 84#include "rf_fifo.h" 85#include "rf_kintf.h" 86 87static void rf_ShutdownDiskQueueSystem(void *); 88 89#ifndef RF_DEBUG_DISKQUEUE 90#define RF_DEBUG_DISKQUEUE 0 91#endif 92 93#if RF_DEBUG_DISKQUEUE 94#define Dprintf1(s,a) if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL) 95#define Dprintf2(s,a,b) if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL) 96#define Dprintf3(s,a,b,c) if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL) 97#else 98#define Dprintf1(s,a) 99#define Dprintf2(s,a,b) 100#define Dprintf3(s,a,b,c) 101#endif 102 103/***************************************************************************** 104 * 105 * the disk queue switch defines all the functions used in the 106 * different queueing disciplines queue ID, init routine, enqueue 107 * routine, dequeue routine 108 * 109 ****************************************************************************/ 110 111static const RF_DiskQueueSW_t diskqueuesw[] = { 112 {"fifo", /* FIFO */ 113 rf_FifoCreate, 114 rf_FifoEnqueue, 115 rf_FifoDequeue, 116 rf_FifoPeek, 117 rf_FifoPromote}, 118 119 {"cvscan", /* cvscan */ 120 rf_CvscanCreate, 121 rf_CvscanEnqueue, 122 rf_CvscanDequeue, 123 rf_CvscanPeek, 124 rf_CvscanPromote}, 125 126 {"sstf", /* shortest seek time first */ 127 rf_SstfCreate, 128 rf_SstfEnqueue, 129 rf_SstfDequeue, 130 rf_SstfPeek, 131 rf_SstfPromote}, 132 133 {"scan", /* SCAN (two-way elevator) */ 134 rf_ScanCreate, 135 rf_SstfEnqueue, 136 rf_ScanDequeue, 137 rf_ScanPeek, 138 rf_SstfPromote}, 139 140 {"cscan", /* CSCAN (one-way elevator) */ 141 rf_CscanCreate, 142 rf_SstfEnqueue, 143 rf_CscanDequeue, 144 rf_CscanPeek, 145 rf_SstfPromote}, 146 147}; 148#define NUM_DISK_QUEUE_TYPES (sizeof(diskqueuesw)/sizeof(RF_DiskQueueSW_t)) 149 150#define RF_MAX_FREE_DQD 256 151#define RF_MIN_FREE_DQD 64 152 153#include <sys/buf.h> 154 155/* configures a single disk queue */ 156 157int 158rf_ConfigureDiskQueue(RF_Raid_t *raidPtr, RF_DiskQueue_t *diskqueue, 159 RF_RowCol_t c, const RF_DiskQueueSW_t *p, 160 RF_SectorCount_t sectPerDisk, dev_t dev, 161 int maxOutstanding, RF_ShutdownList_t **listp, 162 RF_AllocListElem_t *clList) 163{ 164 diskqueue->col = c; 165 diskqueue->qPtr = p; 166 diskqueue->qHdr = (p->Create) (sectPerDisk, clList, listp); 167 diskqueue->dev = dev; 168 diskqueue->numOutstanding = 0; 169 diskqueue->queueLength = 0; 170 diskqueue->maxOutstanding = maxOutstanding; 171 diskqueue->curPriority = RF_IO_NORMAL_PRIORITY; 172 diskqueue->nextLockingOp = NULL; 173 diskqueue->flags = 0; 174 diskqueue->raidPtr = raidPtr; 175 diskqueue->rf_cinfo = &raidPtr->raid_cinfo[c]; 176 rf_mutex_init(&diskqueue->mutex); 177 diskqueue->cond = 0; 178 return (0); 179} 180 181static void 182rf_ShutdownDiskQueueSystem(void *ignored) 183{ 184 pool_destroy(&rf_pools.dqd); 185} 186 187int 188rf_ConfigureDiskQueueSystem(RF_ShutdownList_t **listp) 189{ 190 191 rf_pool_init(&rf_pools.dqd, sizeof(RF_DiskQueueData_t), 192 "rf_dqd_pl", RF_MIN_FREE_DQD, RF_MAX_FREE_DQD); 193 rf_ShutdownCreate(listp, rf_ShutdownDiskQueueSystem, NULL); 194 195 return (0); 196} 197 198int 199rf_ConfigureDiskQueues(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr, 200 RF_Config_t *cfgPtr) 201{ 202 RF_DiskQueue_t *diskQueues, *spareQueues; 203 const RF_DiskQueueSW_t *p; 204 RF_RowCol_t r,c; 205 int rc, i; 206 207 raidPtr->maxQueueDepth = cfgPtr->maxOutstandingDiskReqs; 208 209 for (p = NULL, i = 0; i < NUM_DISK_QUEUE_TYPES; i++) { 210 if (!strcmp(diskqueuesw[i].queueType, cfgPtr->diskQueueType)) { 211 p = &diskqueuesw[i]; 212 break; 213 } 214 } 215 if (p == NULL) { 216 RF_ERRORMSG2("Unknown queue type \"%s\". Using %s\n", cfgPtr->diskQueueType, diskqueuesw[0].queueType); 217 p = &diskqueuesw[0]; 218 } 219 raidPtr->qType = p; 220 221 RF_MallocAndAdd(diskQueues, 222 (raidPtr->numCol + RF_MAXSPARE) * 223 sizeof(RF_DiskQueue_t), (RF_DiskQueue_t *), 224 raidPtr->cleanupList); 225 if (diskQueues == NULL) 226 return (ENOMEM); 227 raidPtr->Queues = diskQueues; 228 229 for (c = 0; c < raidPtr->numCol; c++) { 230 rc = rf_ConfigureDiskQueue(raidPtr, &diskQueues[c], 231 c, p, 232 raidPtr->sectorsPerDisk, 233 raidPtr->Disks[c].dev, 234 cfgPtr->maxOutstandingDiskReqs, 235 listp, raidPtr->cleanupList); 236 if (rc) 237 return (rc); 238 } 239 240 spareQueues = &raidPtr->Queues[raidPtr->numCol]; 241 for (r = 0; r < raidPtr->numSpare; r++) { 242 rc = rf_ConfigureDiskQueue(raidPtr, &spareQueues[r], 243 raidPtr->numCol + r, p, 244 raidPtr->sectorsPerDisk, 245 raidPtr->Disks[raidPtr->numCol + r].dev, 246 cfgPtr->maxOutstandingDiskReqs, listp, 247 raidPtr->cleanupList); 248 if (rc) 249 return (rc); 250 } 251 return (0); 252} 253/* Enqueue a disk I/O 254 * 255 * Unfortunately, we have to do things differently in the different 256 * environments (simulator, user-level, kernel). 257 * At user level, all I/O is blocking, so we have 1 or more threads/disk 258 * and the thread that enqueues is different from the thread that dequeues. 259 * In the kernel, I/O is non-blocking and so we'd like to have multiple 260 * I/Os outstanding on the physical disks when possible. 261 * 262 * when any request arrives at a queue, we have two choices: 263 * dispatch it to the lower levels 264 * queue it up 265 * 266 * kernel rules for when to do what: 267 * locking request: queue empty => dispatch and lock queue, 268 * else queue it 269 * unlocking req : always dispatch it 270 * normal req : queue empty => dispatch it & set priority 271 * queue not full & priority is ok => dispatch it 272 * else queue it 273 * 274 * user-level rules: 275 * always enqueue. In the special case of an unlocking op, enqueue 276 * in a special way that will cause the unlocking op to be the next 277 * thing dequeued. 278 * 279 * simulator rules: 280 * Do the same as at user level, with the sleeps and wakeups suppressed. 281 */ 282void 283rf_DiskIOEnqueue(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req, int pri) 284{ 285 RF_ETIMER_START(req->qtime); 286 RF_ASSERT(req->type == RF_IO_TYPE_NOP || req->numSector); 287 req->priority = pri; 288 289#if RF_DEBUG_DISKQUEUE 290 if (rf_queueDebug && (req->numSector == 0)) { 291 printf("Warning: Enqueueing zero-sector access\n"); 292 } 293#endif 294 /* 295 * kernel 296 */ 297 RF_LOCK_QUEUE_MUTEX(queue, "DiskIOEnqueue"); 298 /* locking request */ 299 if (RF_LOCKING_REQ(req)) { 300 if (RF_QUEUE_EMPTY(queue)) { 301 Dprintf2("Dispatching pri %d locking op to c %d (queue empty)\n", pri, queue->col); 302 RF_LOCK_QUEUE(queue); 303 rf_DispatchKernelIO(queue, req); 304 } else { 305 queue->queueLength++; /* increment count of number 306 * of requests waiting in this 307 * queue */ 308 Dprintf2("Enqueueing pri %d locking op to c %d (queue not empty)\n", pri, queue->col); 309 req->queue = (void *) queue; 310 (queue->qPtr->Enqueue) (queue->qHdr, req, pri); 311 } 312 } 313 /* unlocking request */ 314 else 315 if (RF_UNLOCKING_REQ(req)) { /* we'll do the actual unlock 316 * when this I/O completes */ 317 Dprintf2("Dispatching pri %d unlocking op to c %d\n", pri, queue->col); 318 RF_ASSERT(RF_QUEUE_LOCKED(queue)); 319 rf_DispatchKernelIO(queue, req); 320 } 321 /* normal request */ 322 else 323 if (RF_OK_TO_DISPATCH(queue, req)) { 324 Dprintf2("Dispatching pri %d regular op to c %d (ok to dispatch)\n", pri, queue->col); 325 rf_DispatchKernelIO(queue, req); 326 } else { 327 queue->queueLength++; /* increment count of 328 * number of requests 329 * waiting in this queue */ 330 Dprintf2("Enqueueing pri %d regular op to c %d (not ok to dispatch)\n", pri, queue->col); 331 req->queue = (void *) queue; 332 (queue->qPtr->Enqueue) (queue->qHdr, req, pri); 333 } 334 RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOEnqueue"); 335} 336 337 338/* get the next set of I/Os started, kernel version only */ 339void 340rf_DiskIOComplete(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req, int status) 341{ 342 int done = 0; 343 344 RF_LOCK_QUEUE_MUTEX(queue, "DiskIOComplete"); 345 346 /* unlock the queue: (1) after an unlocking req completes (2) after a 347 * locking req fails */ 348 if (RF_UNLOCKING_REQ(req) || (RF_LOCKING_REQ(req) && status)) { 349 Dprintf1("DiskIOComplete: unlocking queue at c %d\n", queue->col); 350 RF_ASSERT(RF_QUEUE_LOCKED(queue)); 351 RF_UNLOCK_QUEUE(queue); 352 } 353 queue->numOutstanding--; 354 RF_ASSERT(queue->numOutstanding >= 0); 355 356 /* dispatch requests to the disk until we find one that we can't. */ 357 /* no reason to continue once we've filled up the queue */ 358 /* no reason to even start if the queue is locked */ 359 360 while (!done && !RF_QUEUE_FULL(queue) && !RF_QUEUE_LOCKED(queue)) { 361 if (queue->nextLockingOp) { 362 req = queue->nextLockingOp; 363 queue->nextLockingOp = NULL; 364 Dprintf2("DiskIOComplete: a pri %d locking req was pending at c %d\n", req->priority, queue->col); 365 } else { 366 req = (queue->qPtr->Dequeue) (queue->qHdr); 367 if (req != NULL) { 368 Dprintf2("DiskIOComplete: extracting pri %d req from queue at c %d\n", req->priority, queue->col); 369 } else { 370 Dprintf1("DiskIOComplete: no more requests to extract.\n", ""); 371 } 372 } 373 if (req) { 374 queue->queueLength--; /* decrement count of number 375 * of requests waiting in this 376 * queue */ 377 RF_ASSERT(queue->queueLength >= 0); 378 } 379 if (!req) 380 done = 1; 381 else 382 if (RF_LOCKING_REQ(req)) { 383 if (RF_QUEUE_EMPTY(queue)) { /* dispatch it */ 384 Dprintf2("DiskIOComplete: dispatching pri %d locking req to c %d (queue empty)\n", req->priority, queue->col); 385 RF_LOCK_QUEUE(queue); 386 rf_DispatchKernelIO(queue, req); 387 done = 1; 388 } else { /* put it aside to wait for 389 * the queue to drain */ 390 Dprintf2("DiskIOComplete: postponing pri %d locking req to c %d\n", req->priority, queue->col); 391 RF_ASSERT(queue->nextLockingOp == NULL); 392 queue->nextLockingOp = req; 393 done = 1; 394 } 395 } else 396 if (RF_UNLOCKING_REQ(req)) { /* should not happen: 397 * unlocking ops should 398 * not get queued */ 399 RF_ASSERT(RF_QUEUE_LOCKED(queue)); /* support it anyway for 400 * the future */ 401 Dprintf2("DiskIOComplete: dispatching pri %d unl req to c %d (SHOULD NOT SEE THIS)\n", req->priority, queue->col); 402 rf_DispatchKernelIO(queue, req); 403 done = 1; 404 } else 405 if (RF_OK_TO_DISPATCH(queue, req)) { 406 Dprintf2("DiskIOComplete: dispatching pri %d regular req to c %d (ok to dispatch)\n", req->priority, queue->col); 407 rf_DispatchKernelIO(queue, req); 408 } else { /* we can't dispatch it, 409 * so just re-enqueue 410 * it. */ 411 /* potential trouble here if 412 * disk queues batch reqs */ 413 Dprintf2("DiskIOComplete: re-enqueueing pri %d regular req to c %d\n", req->priority, queue->col); 414 queue->queueLength++; 415 (queue->qPtr->Enqueue) (queue->qHdr, req, req->priority); 416 done = 1; 417 } 418 } 419 420 RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOComplete"); 421} 422/* promotes accesses tagged with the given parityStripeID from low priority 423 * to normal priority. This promotion is optional, meaning that a queue 424 * need not implement it. If there is no promotion routine associated with 425 * a queue, this routine does nothing and returns -1. 426 */ 427int 428rf_DiskIOPromote(RF_DiskQueue_t *queue, RF_StripeNum_t parityStripeID, 429 RF_ReconUnitNum_t which_ru) 430{ 431 int retval; 432 433 if (!queue->qPtr->Promote) 434 return (-1); 435 RF_LOCK_QUEUE_MUTEX(queue, "DiskIOPromote"); 436 retval = (queue->qPtr->Promote) (queue->qHdr, parityStripeID, which_ru); 437 RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOPromote"); 438 return (retval); 439} 440 441RF_DiskQueueData_t * 442rf_CreateDiskQueueData(RF_IoType_t typ, RF_SectorNum_t ssect, 443 RF_SectorCount_t nsect, caddr_t buf, 444 RF_StripeNum_t parityStripeID, 445 RF_ReconUnitNum_t which_ru, 446 int (*wakeF) (void *, int), void *arg, 447 RF_DiskQueueData_t *next, 448 RF_AccTraceEntry_t *tracerec, void *raidPtr, 449 RF_DiskQueueDataFlags_t flags, void *kb_proc) 450{ 451 RF_DiskQueueData_t *p; 452 int s; 453 454 p = pool_get(&rf_pools.dqd, PR_WAITOK); 455 memset(p, 0, sizeof(RF_DiskQueueData_t)); 456 /* Need to be at splbio to access bufpool! */ 457 s = splbio(); 458 p->bp = pool_get(&bufpool, PR_NOWAIT); /* XXX: make up our minds here. 459 WAITOK, or NOWAIT?? */ 460 splx(s); 461 if (p->bp == NULL) { 462 /* no memory for the buffer!?!? */ 463 pool_put(&rf_pools.dqd, p); 464 return(NULL); 465 } 466 467 memset(p->bp, 0, sizeof(struct buf)); 468 p->sectorOffset = ssect + rf_protectedSectors; 469 p->numSector = nsect; 470 p->type = typ; 471 p->buf = buf; 472 p->parityStripeID = parityStripeID; 473 p->which_ru = which_ru; 474 p->CompleteFunc = wakeF; 475 p->argument = arg; 476 p->next = next; 477 p->tracerec = tracerec; 478 p->priority = RF_IO_NORMAL_PRIORITY; 479 p->raidPtr = raidPtr; 480 p->flags = flags; 481 p->b_proc = kb_proc; 482 return (p); 483} 484 485void 486rf_FreeDiskQueueData(RF_DiskQueueData_t *p) 487{ 488 int s; 489 490 s = splbio(); 491 pool_put(&bufpool, p->bp); 492 splx(s); 493 pool_put(&rf_pools.dqd, p); 494} 495