rf_diskqueue.c revision 1.16
1/* $NetBSD: rf_diskqueue.c,v 1.16 2002/08/02 03:55:13 oster Exp $ */ 2/* 3 * Copyright (c) 1995 Carnegie-Mellon University. 4 * All rights reserved. 5 * 6 * Author: Mark Holland 7 * 8 * Permission to use, copy, modify and distribute this software and 9 * its documentation is hereby granted, provided that both the copyright 10 * notice and this permission notice appear in all copies of the 11 * software, derivative works or modified versions, and any portions 12 * thereof, and that both notices appear in supporting documentation. 13 * 14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 17 * 18 * Carnegie Mellon requests users of this software to return to 19 * 20 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 21 * School of Computer Science 22 * Carnegie Mellon University 23 * Pittsburgh PA 15213-3890 24 * 25 * any improvements or extensions that they make and grant Carnegie the 26 * rights to redistribute these changes. 27 */ 28 29/**************************************************************************** 30 * 31 * rf_diskqueue.c -- higher-level disk queue code 32 * 33 * the routines here are a generic wrapper around the actual queueing 34 * routines. The code here implements thread scheduling, synchronization, 35 * and locking ops (see below) on top of the lower-level queueing code. 36 * 37 * to support atomic RMW, we implement "locking operations". When a 38 * locking op is dispatched to the lower levels of the driver, the 39 * queue is locked, and no further I/Os are dispatched until the queue 40 * receives & completes a corresponding "unlocking operation". This 41 * code relies on the higher layers to guarantee that a locking op 42 * will always be eventually followed by an unlocking op. The model 43 * is that the higher layers are structured so locking and unlocking 44 * ops occur in pairs, i.e. an unlocking op cannot be generated until 45 * after a locking op reports completion. There is no good way to 46 * check to see that an unlocking op "corresponds" to the op that 47 * currently has the queue locked, so we make no such attempt. Since 48 * by definition there can be only one locking op outstanding on a 49 * disk, this should not be a problem. 50 * 51 * In the kernel, we allow multiple I/Os to be concurrently dispatched 52 * to the disk driver. In order to support locking ops in this 53 * environment, when we decide to do a locking op, we stop dispatching 54 * new I/Os and wait until all dispatched I/Os have completed before 55 * dispatching the locking op. 56 * 57 * Unfortunately, the code is different in the 3 different operating 58 * states (user level, kernel, simulator). In the kernel, I/O is 59 * non-blocking, and we have no disk threads to dispatch for us. 60 * Therefore, we have to dispatch new I/Os to the scsi driver at the 61 * time of enqueue, and also at the time of completion. At user 62 * level, I/O is blocking, and so only the disk threads may dispatch 63 * I/Os. Thus at user level, all we can do at enqueue time is enqueue 64 * and wake up the disk thread to do the dispatch. 65 * 66 ****************************************************************************/ 67 68#include <sys/cdefs.h> 69__KERNEL_RCSID(0, "$NetBSD: rf_diskqueue.c,v 1.16 2002/08/02 03:55:13 oster Exp $"); 70 71#include <dev/raidframe/raidframevar.h> 72 73#include "rf_threadstuff.h" 74#include "rf_raid.h" 75#include "rf_diskqueue.h" 76#include "rf_alloclist.h" 77#include "rf_acctrace.h" 78#include "rf_etimer.h" 79#include "rf_general.h" 80#include "rf_freelist.h" 81#include "rf_debugprint.h" 82#include "rf_shutdown.h" 83#include "rf_cvscan.h" 84#include "rf_sstf.h" 85#include "rf_fifo.h" 86#include "rf_kintf.h" 87 88static int init_dqd(RF_DiskQueueData_t *); 89static void clean_dqd(RF_DiskQueueData_t *); 90static void rf_ShutdownDiskQueueSystem(void *); 91 92#define Dprintf1(s,a) if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL) 93#define Dprintf2(s,a,b) if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL) 94#define Dprintf3(s,a,b,c) if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL) 95 96/***************************************************************************** 97 * 98 * the disk queue switch defines all the functions used in the 99 * different queueing disciplines queue ID, init routine, enqueue 100 * routine, dequeue routine 101 * 102 ****************************************************************************/ 103 104static RF_DiskQueueSW_t diskqueuesw[] = { 105 {"fifo", /* FIFO */ 106 rf_FifoCreate, 107 rf_FifoEnqueue, 108 rf_FifoDequeue, 109 rf_FifoPeek, 110 rf_FifoPromote}, 111 112 {"cvscan", /* cvscan */ 113 rf_CvscanCreate, 114 rf_CvscanEnqueue, 115 rf_CvscanDequeue, 116 rf_CvscanPeek, 117 rf_CvscanPromote}, 118 119 {"sstf", /* shortest seek time first */ 120 rf_SstfCreate, 121 rf_SstfEnqueue, 122 rf_SstfDequeue, 123 rf_SstfPeek, 124 rf_SstfPromote}, 125 126 {"scan", /* SCAN (two-way elevator) */ 127 rf_ScanCreate, 128 rf_SstfEnqueue, 129 rf_ScanDequeue, 130 rf_ScanPeek, 131 rf_SstfPromote}, 132 133 {"cscan", /* CSCAN (one-way elevator) */ 134 rf_CscanCreate, 135 rf_SstfEnqueue, 136 rf_CscanDequeue, 137 rf_CscanPeek, 138 rf_SstfPromote}, 139 140}; 141#define NUM_DISK_QUEUE_TYPES (sizeof(diskqueuesw)/sizeof(RF_DiskQueueSW_t)) 142 143static RF_FreeList_t *rf_dqd_freelist; 144 145#define RF_MAX_FREE_DQD 256 146#define RF_DQD_INC 16 147#define RF_DQD_INITIAL 64 148 149#include <sys/buf.h> 150 151static int 152init_dqd(dqd) 153 RF_DiskQueueData_t *dqd; 154{ 155 156 dqd->bp = (struct buf *) malloc(sizeof(struct buf), 157 M_RAIDFRAME, M_NOWAIT); 158 if (dqd->bp == NULL) { 159 return (ENOMEM); 160 } 161 memset(dqd->bp, 0, sizeof(struct buf)); /* if you don't do it, nobody 162 * else will.. */ 163 return (0); 164} 165 166static void 167clean_dqd(dqd) 168 RF_DiskQueueData_t *dqd; 169{ 170 free(dqd->bp, M_RAIDFRAME); 171} 172/* configures a single disk queue */ 173 174int 175rf_ConfigureDiskQueue( 176 RF_Raid_t * raidPtr, 177 RF_DiskQueue_t * diskqueue, 178 RF_RowCol_t r, /* row & col -- debug only. BZZT not any 179 * more... */ 180 RF_RowCol_t c, 181 RF_DiskQueueSW_t * p, 182 RF_SectorCount_t sectPerDisk, 183 dev_t dev, 184 int maxOutstanding, 185 RF_ShutdownList_t ** listp, 186 RF_AllocListElem_t * clList) 187{ 188 int rc; 189 190 diskqueue->row = r; 191 diskqueue->col = c; 192 diskqueue->qPtr = p; 193 diskqueue->qHdr = (p->Create) (sectPerDisk, clList, listp); 194 diskqueue->dev = dev; 195 diskqueue->numOutstanding = 0; 196 diskqueue->queueLength = 0; 197 diskqueue->maxOutstanding = maxOutstanding; 198 diskqueue->curPriority = RF_IO_NORMAL_PRIORITY; 199 diskqueue->nextLockingOp = NULL; 200 diskqueue->unlockingOp = NULL; 201 diskqueue->numWaiting = 0; 202 diskqueue->flags = 0; 203 diskqueue->raidPtr = raidPtr; 204 diskqueue->rf_cinfo = &raidPtr->raid_cinfo[r][c]; 205 rc = rf_create_managed_mutex(listp, &diskqueue->mutex); 206 if (rc) { 207 RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__, 208 __LINE__, rc); 209 return (rc); 210 } 211 rc = rf_create_managed_cond(listp, &diskqueue->cond); 212 if (rc) { 213 RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n", __FILE__, 214 __LINE__, rc); 215 return (rc); 216 } 217 return (0); 218} 219 220static void 221rf_ShutdownDiskQueueSystem(ignored) 222 void *ignored; 223{ 224 RF_FREELIST_DESTROY_CLEAN(rf_dqd_freelist, next, (RF_DiskQueueData_t *), clean_dqd); 225} 226 227int 228rf_ConfigureDiskQueueSystem(listp) 229 RF_ShutdownList_t **listp; 230{ 231 int rc; 232 233 RF_FREELIST_CREATE(rf_dqd_freelist, RF_MAX_FREE_DQD, 234 RF_DQD_INC, sizeof(RF_DiskQueueData_t)); 235 if (rf_dqd_freelist == NULL) 236 return (ENOMEM); 237 rc = rf_ShutdownCreate(listp, rf_ShutdownDiskQueueSystem, NULL); 238 if (rc) { 239 RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", 240 __FILE__, __LINE__, rc); 241 rf_ShutdownDiskQueueSystem(NULL); 242 return (rc); 243 } 244 RF_FREELIST_PRIME_INIT(rf_dqd_freelist, RF_DQD_INITIAL, next, 245 (RF_DiskQueueData_t *), init_dqd); 246 return (0); 247} 248 249int 250rf_ConfigureDiskQueues( 251 RF_ShutdownList_t ** listp, 252 RF_Raid_t * raidPtr, 253 RF_Config_t * cfgPtr) 254{ 255 RF_DiskQueue_t **diskQueues, *spareQueues; 256 RF_DiskQueueSW_t *p; 257 RF_RowCol_t r, c; 258 int rc, i; 259 260 raidPtr->maxQueueDepth = cfgPtr->maxOutstandingDiskReqs; 261 262 for (p = NULL, i = 0; i < NUM_DISK_QUEUE_TYPES; i++) { 263 if (!strcmp(diskqueuesw[i].queueType, cfgPtr->diskQueueType)) { 264 p = &diskqueuesw[i]; 265 break; 266 } 267 } 268 if (p == NULL) { 269 RF_ERRORMSG2("Unknown queue type \"%s\". Using %s\n", cfgPtr->diskQueueType, diskqueuesw[0].queueType); 270 p = &diskqueuesw[0]; 271 } 272 raidPtr->qType = p; 273 RF_CallocAndAdd(diskQueues, raidPtr->numRow, sizeof(RF_DiskQueue_t *), (RF_DiskQueue_t **), raidPtr->cleanupList); 274 if (diskQueues == NULL) { 275 return (ENOMEM); 276 } 277 raidPtr->Queues = diskQueues; 278 for (r = 0; r < raidPtr->numRow; r++) { 279 RF_CallocAndAdd(diskQueues[r], raidPtr->numCol + 280 ((r == 0) ? RF_MAXSPARE : 0), 281 sizeof(RF_DiskQueue_t), (RF_DiskQueue_t *), 282 raidPtr->cleanupList); 283 if (diskQueues[r] == NULL) 284 return (ENOMEM); 285 for (c = 0; c < raidPtr->numCol; c++) { 286 rc = rf_ConfigureDiskQueue(raidPtr, &diskQueues[r][c], 287 r, c, p, 288 raidPtr->sectorsPerDisk, 289 raidPtr->Disks[r][c].dev, 290 cfgPtr->maxOutstandingDiskReqs, 291 listp, raidPtr->cleanupList); 292 if (rc) 293 return (rc); 294 } 295 } 296 297 spareQueues = &raidPtr->Queues[0][raidPtr->numCol]; 298 for (r = 0; r < raidPtr->numSpare; r++) { 299 rc = rf_ConfigureDiskQueue(raidPtr, &spareQueues[r], 300 0, raidPtr->numCol + r, p, 301 raidPtr->sectorsPerDisk, 302 raidPtr->Disks[0][raidPtr->numCol + r].dev, 303 cfgPtr->maxOutstandingDiskReqs, listp, 304 raidPtr->cleanupList); 305 if (rc) 306 return (rc); 307 } 308 return (0); 309} 310/* Enqueue a disk I/O 311 * 312 * Unfortunately, we have to do things differently in the different 313 * environments (simulator, user-level, kernel). 314 * At user level, all I/O is blocking, so we have 1 or more threads/disk 315 * and the thread that enqueues is different from the thread that dequeues. 316 * In the kernel, I/O is non-blocking and so we'd like to have multiple 317 * I/Os outstanding on the physical disks when possible. 318 * 319 * when any request arrives at a queue, we have two choices: 320 * dispatch it to the lower levels 321 * queue it up 322 * 323 * kernel rules for when to do what: 324 * locking request: queue empty => dispatch and lock queue, 325 * else queue it 326 * unlocking req : always dispatch it 327 * normal req : queue empty => dispatch it & set priority 328 * queue not full & priority is ok => dispatch it 329 * else queue it 330 * 331 * user-level rules: 332 * always enqueue. In the special case of an unlocking op, enqueue 333 * in a special way that will cause the unlocking op to be the next 334 * thing dequeued. 335 * 336 * simulator rules: 337 * Do the same as at user level, with the sleeps and wakeups suppressed. 338 */ 339void 340rf_DiskIOEnqueue(queue, req, pri) 341 RF_DiskQueue_t *queue; 342 RF_DiskQueueData_t *req; 343 int pri; 344{ 345 RF_ETIMER_START(req->qtime); 346 RF_ASSERT(req->type == RF_IO_TYPE_NOP || req->numSector); 347 req->priority = pri; 348 349 if (rf_queueDebug && (req->numSector == 0)) { 350 printf("Warning: Enqueueing zero-sector access\n"); 351 } 352 /* 353 * kernel 354 */ 355 RF_LOCK_QUEUE_MUTEX(queue, "DiskIOEnqueue"); 356 /* locking request */ 357 if (RF_LOCKING_REQ(req)) { 358 if (RF_QUEUE_EMPTY(queue)) { 359 Dprintf3("Dispatching pri %d locking op to r %d c %d (queue empty)\n", pri, queue->row, queue->col); 360 RF_LOCK_QUEUE(queue); 361 rf_DispatchKernelIO(queue, req); 362 } else { 363 queue->queueLength++; /* increment count of number 364 * of requests waiting in this 365 * queue */ 366 Dprintf3("Enqueueing pri %d locking op to r %d c %d (queue not empty)\n", pri, queue->row, queue->col); 367 req->queue = (void *) queue; 368 (queue->qPtr->Enqueue) (queue->qHdr, req, pri); 369 } 370 } 371 /* unlocking request */ 372 else 373 if (RF_UNLOCKING_REQ(req)) { /* we'll do the actual unlock 374 * when this I/O completes */ 375 Dprintf3("Dispatching pri %d unlocking op to r %d c %d\n", pri, queue->row, queue->col); 376 RF_ASSERT(RF_QUEUE_LOCKED(queue)); 377 rf_DispatchKernelIO(queue, req); 378 } 379 /* normal request */ 380 else 381 if (RF_OK_TO_DISPATCH(queue, req)) { 382 Dprintf3("Dispatching pri %d regular op to r %d c %d (ok to dispatch)\n", pri, queue->row, queue->col); 383 rf_DispatchKernelIO(queue, req); 384 } else { 385 queue->queueLength++; /* increment count of 386 * number of requests 387 * waiting in this queue */ 388 Dprintf3("Enqueueing pri %d regular op to r %d c %d (not ok to dispatch)\n", pri, queue->row, queue->col); 389 req->queue = (void *) queue; 390 (queue->qPtr->Enqueue) (queue->qHdr, req, pri); 391 } 392 RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOEnqueue"); 393} 394 395 396/* get the next set of I/Os started, kernel version only */ 397void 398rf_DiskIOComplete(queue, req, status) 399 RF_DiskQueue_t *queue; 400 RF_DiskQueueData_t *req; 401 int status; 402{ 403 int done = 0; 404 405 RF_LOCK_QUEUE_MUTEX(queue, "DiskIOComplete"); 406 407 /* unlock the queue: (1) after an unlocking req completes (2) after a 408 * locking req fails */ 409 if (RF_UNLOCKING_REQ(req) || (RF_LOCKING_REQ(req) && status)) { 410 Dprintf2("DiskIOComplete: unlocking queue at r %d c %d\n", queue->row, queue->col); 411 RF_ASSERT(RF_QUEUE_LOCKED(queue) && (queue->unlockingOp == NULL)); 412 RF_UNLOCK_QUEUE(queue); 413 } 414 queue->numOutstanding--; 415 RF_ASSERT(queue->numOutstanding >= 0); 416 417 /* dispatch requests to the disk until we find one that we can't. */ 418 /* no reason to continue once we've filled up the queue */ 419 /* no reason to even start if the queue is locked */ 420 421 while (!done && !RF_QUEUE_FULL(queue) && !RF_QUEUE_LOCKED(queue)) { 422 if (queue->nextLockingOp) { 423 req = queue->nextLockingOp; 424 queue->nextLockingOp = NULL; 425 Dprintf3("DiskIOComplete: a pri %d locking req was pending at r %d c %d\n", req->priority, queue->row, queue->col); 426 } else { 427 req = (queue->qPtr->Dequeue) (queue->qHdr); 428 if (req != NULL) { 429 Dprintf3("DiskIOComplete: extracting pri %d req from queue at r %d c %d\n", req->priority, queue->row, queue->col); 430 } else { 431 Dprintf1("DiskIOComplete: no more requests to extract.\n", ""); 432 } 433 } 434 if (req) { 435 queue->queueLength--; /* decrement count of number 436 * of requests waiting in this 437 * queue */ 438 RF_ASSERT(queue->queueLength >= 0); 439 } 440 if (!req) 441 done = 1; 442 else 443 if (RF_LOCKING_REQ(req)) { 444 if (RF_QUEUE_EMPTY(queue)) { /* dispatch it */ 445 Dprintf3("DiskIOComplete: dispatching pri %d locking req to r %d c %d (queue empty)\n", req->priority, queue->row, queue->col); 446 RF_LOCK_QUEUE(queue); 447 rf_DispatchKernelIO(queue, req); 448 done = 1; 449 } else { /* put it aside to wait for 450 * the queue to drain */ 451 Dprintf3("DiskIOComplete: postponing pri %d locking req to r %d c %d\n", req->priority, queue->row, queue->col); 452 RF_ASSERT(queue->nextLockingOp == NULL); 453 queue->nextLockingOp = req; 454 done = 1; 455 } 456 } else 457 if (RF_UNLOCKING_REQ(req)) { /* should not happen: 458 * unlocking ops should 459 * not get queued */ 460 RF_ASSERT(RF_QUEUE_LOCKED(queue)); /* support it anyway for 461 * the future */ 462 Dprintf3("DiskIOComplete: dispatching pri %d unl req to r %d c %d (SHOULD NOT SEE THIS)\n", req->priority, queue->row, queue->col); 463 rf_DispatchKernelIO(queue, req); 464 done = 1; 465 } else 466 if (RF_OK_TO_DISPATCH(queue, req)) { 467 Dprintf3("DiskIOComplete: dispatching pri %d regular req to r %d c %d (ok to dispatch)\n", req->priority, queue->row, queue->col); 468 rf_DispatchKernelIO(queue, req); 469 } else { /* we can't dispatch it, 470 * so just re-enqueue 471 * it. */ 472 /* potential trouble here if 473 * disk queues batch reqs */ 474 Dprintf3("DiskIOComplete: re-enqueueing pri %d regular req to r %d c %d\n", req->priority, queue->row, queue->col); 475 queue->queueLength++; 476 (queue->qPtr->Enqueue) (queue->qHdr, req, req->priority); 477 done = 1; 478 } 479 } 480 481 RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOComplete"); 482} 483/* promotes accesses tagged with the given parityStripeID from low priority 484 * to normal priority. This promotion is optional, meaning that a queue 485 * need not implement it. If there is no promotion routine associated with 486 * a queue, this routine does nothing and returns -1. 487 */ 488int 489rf_DiskIOPromote(queue, parityStripeID, which_ru) 490 RF_DiskQueue_t *queue; 491 RF_StripeNum_t parityStripeID; 492 RF_ReconUnitNum_t which_ru; 493{ 494 int retval; 495 496 if (!queue->qPtr->Promote) 497 return (-1); 498 RF_LOCK_QUEUE_MUTEX(queue, "DiskIOPromote"); 499 retval = (queue->qPtr->Promote) (queue->qHdr, parityStripeID, which_ru); 500 RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOPromote"); 501 return (retval); 502} 503 504RF_DiskQueueData_t * 505rf_CreateDiskQueueData( 506 RF_IoType_t typ, 507 RF_SectorNum_t ssect, 508 RF_SectorCount_t nsect, 509 caddr_t buf, 510 RF_StripeNum_t parityStripeID, 511 RF_ReconUnitNum_t which_ru, 512 int (*wakeF) (void *, int), 513 void *arg, 514 RF_DiskQueueData_t * next, 515 RF_AccTraceEntry_t * tracerec, 516 void *raidPtr, 517 RF_DiskQueueDataFlags_t flags, 518 void *kb_proc) 519{ 520 RF_DiskQueueData_t *p; 521 522 RF_FREELIST_GET_INIT(rf_dqd_freelist, p, next, (RF_DiskQueueData_t *), init_dqd); 523 524 p->sectorOffset = ssect + rf_protectedSectors; 525 p->numSector = nsect; 526 p->type = typ; 527 p->buf = buf; 528 p->parityStripeID = parityStripeID; 529 p->which_ru = which_ru; 530 p->CompleteFunc = wakeF; 531 p->argument = arg; 532 p->next = next; 533 p->tracerec = tracerec; 534 p->priority = RF_IO_NORMAL_PRIORITY; 535 p->AuxFunc = NULL; 536 p->buf2 = NULL; 537 p->raidPtr = raidPtr; 538 p->flags = flags; 539 p->b_proc = kb_proc; 540 return (p); 541} 542 543void 544rf_FreeDiskQueueData(p) 545 RF_DiskQueueData_t *p; 546{ 547 RF_FREELIST_FREE_CLEAN(rf_dqd_freelist, p, next, clean_dqd); 548} 549