/*	$NetBSD: rf_diskqueue.c,v 1.13 2000/03/04 04:22:34 oster Exp $	*/
/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: Mark Holland
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/****************************************************************************
 *
 * rf_diskqueue.c -- higher-level disk queue code
 *
 * The routines here are a generic wrapper around the actual queueing
 * routines.  The code here implements thread scheduling, synchronization,
 * and locking ops (see below) on top of the lower-level queueing code.
 *
 * To support atomic RMW, we implement "locking operations".  When a
 * locking op is dispatched to the lower levels of the driver, the
 * queue is locked, and no further I/Os are dispatched until the queue
 * receives & completes a corresponding "unlocking operation".  This
 * code relies on the higher layers to guarantee that a locking op
 * will always eventually be followed by an unlocking op.  The model
 * is that the higher layers are structured so that locking and
 * unlocking ops occur in pairs, i.e. an unlocking op cannot be
 * generated until after a locking op reports completion.  There is no
 * good way to check that an unlocking op "corresponds" to the op that
 * currently has the queue locked, so we make no such attempt.  Since
 * by definition there can be only one locking op outstanding on a
 * disk, this should not be a problem.
 *
 * In the kernel, we allow multiple I/Os to be concurrently dispatched
 * to the disk driver.  In order to support locking ops in this
 * environment, when we decide to do a locking op, we stop dispatching
 * new I/Os and wait until all dispatched I/Os have completed before
 * dispatching the locking op.
 *
 * Unfortunately, the code differs across the three operating
 * environments (user level, kernel, simulator).  In the kernel, I/O
 * is non-blocking, and we have no disk threads to dispatch for us.
 * Therefore, we have to dispatch new I/Os to the SCSI driver at the
 * time of enqueue, and also at the time of completion.  At user
 * level, I/O is blocking, and so only the disk threads may dispatch
 * I/Os.  Thus at user level, all we can do at enqueue time is enqueue
 * and wake up the disk thread to do the dispatch.
 *
 ****************************************************************************/
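/*
 * Illustrative sketch (not part of the driver): how a higher layer is
 * expected to pair locking and unlocking ops around an atomic RMW.
 * lockReq and unlockReq are assumed to have been created with the
 * queue-locking/unlocking flags from rf_diskqueue.h; the function
 * below and its ordering narration are hypothetical.
 */
#if 0
static void
example_atomic_rmw(RF_DiskQueue_t *q, RF_DiskQueueData_t *lockReq,
    RF_DiskQueueData_t *unlockReq)
{
	/* the locking op locks the queue when it reaches the disk */
	rf_DiskIOEnqueue(q, lockReq, RF_IO_NORMAL_PRIORITY);

	/* ...wait for lockReq's completion callback, then perform the
	 * read-modify-write I/Os... */

	/* only after the locking op has reported completion may the
	 * corresponding unlocking op be generated; its completion
	 * unlocks the queue (see rf_DiskIOComplete below) */
	rf_DiskIOEnqueue(q, unlockReq, RF_IO_NORMAL_PRIORITY);
}
#endif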
#include "rf_types.h"
#include "rf_threadstuff.h"
#include "rf_raid.h"
#include "rf_diskqueue.h"
#include "rf_alloclist.h"
#include "rf_acctrace.h"
#include "rf_etimer.h"
#include "rf_configure.h"
#include "rf_general.h"
#include "rf_freelist.h"
#include "rf_debugprint.h"
#include "rf_shutdown.h"
#include "rf_cvscan.h"
#include "rf_sstf.h"
#include "rf_fifo.h"
#include "rf_kintf.h"

static int init_dqd(RF_DiskQueueData_t *);
static void clean_dqd(RF_DiskQueueData_t *);
static void rf_ShutdownDiskQueueSystem(void *);

#define Dprintf1(s,a)     if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL)
#define Dprintf2(s,a,b)   if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL)
#define Dprintf3(s,a,b,c) if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL)

/*****************************************************************************
 *
 * The disk queue switch defines all the functions used in the
 * different queueing disciplines: queue ID, create routine, enqueue
 * routine, dequeue routine, peek routine, and promote routine.
 *
 ****************************************************************************/

static RF_DiskQueueSW_t diskqueuesw[] = {
	{"fifo",		/* FIFO */
		rf_FifoCreate,
		rf_FifoEnqueue,
		rf_FifoDequeue,
		rf_FifoPeek,
		rf_FifoPromote},

	{"cvscan",		/* cvscan */
		rf_CvscanCreate,
		rf_CvscanEnqueue,
		rf_CvscanDequeue,
		rf_CvscanPeek,
		rf_CvscanPromote},

	{"sstf",		/* shortest seek time first */
		rf_SstfCreate,
		rf_SstfEnqueue,
		rf_SstfDequeue,
		rf_SstfPeek,
		rf_SstfPromote},

	{"scan",		/* SCAN (two-way elevator) */
		rf_ScanCreate,
		rf_SstfEnqueue,
		rf_ScanDequeue,
		rf_ScanPeek,
		rf_SstfPromote},

	{"cscan",		/* CSCAN (one-way elevator) */
		rf_CscanCreate,
		rf_SstfEnqueue,
		rf_CscanDequeue,
		rf_CscanPeek,
		rf_SstfPromote},

};
#define NUM_DISK_QUEUE_TYPES (sizeof(diskqueuesw)/sizeof(RF_DiskQueueSW_t))
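/*
 * Illustrative sketch: a new discipline would be added by supplying
 * the five entry points and appending a row to diskqueuesw above.
 * The signatures are inferred from the call sites in this file (the
 * opaque queue header is shown as void *); all rf_Lifo* names are
 * hypothetical.
 */
#if 0
void *rf_LifoCreate(RF_SectorCount_t, RF_AllocListElem_t *,
    RF_ShutdownList_t **);
void rf_LifoEnqueue(void *, RF_DiskQueueData_t *, int);
RF_DiskQueueData_t *rf_LifoDequeue(void *);
RF_DiskQueueData_t *rf_LifoPeek(void *);
int rf_LifoPromote(void *, RF_StripeNum_t, RF_ReconUnitNum_t);

	/* ...and the corresponding table entry: */
	{"lifo",		/* last-in first-out (hypothetical) */
		rf_LifoCreate,
		rf_LifoEnqueue,
		rf_LifoDequeue,
		rf_LifoPeek,
		rf_LifoPromote},
#endif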
static RF_FreeList_t *rf_dqd_freelist;

#define RF_MAX_FREE_DQD 256
#define RF_DQD_INC 16
#define RF_DQD_INITIAL 64

#include <sys/buf.h>

static int
init_dqd(dqd)
	RF_DiskQueueData_t *dqd;
{

	dqd->bp = (struct buf *) malloc(sizeof(struct buf),
	    M_RAIDFRAME, M_NOWAIT);
	if (dqd->bp == NULL) {
		return (ENOMEM);
	}
	memset(dqd->bp, 0, sizeof(struct buf));	/* if you don't do it, nobody
						 * else will.. */
	return (0);
}

static void
clean_dqd(dqd)
	RF_DiskQueueData_t *dqd;
{
	free(dqd->bp, M_RAIDFRAME);
}

/* configures a single disk queue */
int
rf_ConfigureDiskQueue(
	RF_Raid_t * raidPtr,
	RF_DiskQueue_t * diskqueue,
	RF_RowCol_t r,		/* row & col -- debug only.  BZZT not any
				 * more... */
	RF_RowCol_t c,
	RF_DiskQueueSW_t * p,
	RF_SectorCount_t sectPerDisk,
	dev_t dev,
	int maxOutstanding,
	RF_ShutdownList_t ** listp,
	RF_AllocListElem_t * clList)
{
	int rc;

	diskqueue->row = r;
	diskqueue->col = c;
	diskqueue->qPtr = p;
	diskqueue->qHdr = (p->Create) (sectPerDisk, clList, listp);
	diskqueue->dev = dev;
	diskqueue->numOutstanding = 0;
	diskqueue->queueLength = 0;
	diskqueue->maxOutstanding = maxOutstanding;
	diskqueue->curPriority = RF_IO_NORMAL_PRIORITY;
	diskqueue->nextLockingOp = NULL;
	diskqueue->unlockingOp = NULL;
	diskqueue->numWaiting = 0;
	diskqueue->flags = 0;
	diskqueue->raidPtr = raidPtr;
	diskqueue->rf_cinfo = &raidPtr->raid_cinfo[r][c];
	rc = rf_create_managed_mutex(listp, &diskqueue->mutex);
	if (rc) {
		RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n",
		    __FILE__, __LINE__, rc);
		return (rc);
	}
	rc = rf_create_managed_cond(listp, &diskqueue->cond);
	if (rc) {
		RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n",
		    __FILE__, __LINE__, rc);
		return (rc);
	}
	return (0);
}

static void
rf_ShutdownDiskQueueSystem(ignored)
	void *ignored;
{
	RF_FREELIST_DESTROY_CLEAN(rf_dqd_freelist, next,
	    (RF_DiskQueueData_t *), clean_dqd);
}

int
rf_ConfigureDiskQueueSystem(listp)
	RF_ShutdownList_t **listp;
{
	int rc;

	RF_FREELIST_CREATE(rf_dqd_freelist, RF_MAX_FREE_DQD,
	    RF_DQD_INC, sizeof(RF_DiskQueueData_t));
	if (rf_dqd_freelist == NULL)
		return (ENOMEM);
	rc = rf_ShutdownCreate(listp, rf_ShutdownDiskQueueSystem, NULL);
	if (rc) {
		RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n",
		    __FILE__, __LINE__, rc);
		rf_ShutdownDiskQueueSystem(NULL);
		return (rc);
	}
	RF_FREELIST_PRIME_INIT(rf_dqd_freelist, RF_DQD_INITIAL, next,
	    (RF_DiskQueueData_t *), init_dqd);
	return (0);
}
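/*
 * Illustrative recap of the freelist lifecycle wired up above:
 * elements are primed at configure time, recycled through GET/FREE
 * (see rf_CreateDiskQueueData() and rf_FreeDiskQueueData() below), and
 * torn down at shutdown.  This fragment only restates calls already
 * made elsewhere in this file.
 */
#if 0
	RF_DiskQueueData_t *p;

	/* configure: create the list and pre-initialize RF_DQD_INITIAL
	 * elements (init_dqd allocates each element's struct buf) */
	RF_FREELIST_CREATE(rf_dqd_freelist, RF_MAX_FREE_DQD, RF_DQD_INC,
	    sizeof(RF_DiskQueueData_t));
	RF_FREELIST_PRIME_INIT(rf_dqd_freelist, RF_DQD_INITIAL, next,
	    (RF_DiskQueueData_t *), init_dqd);

	/* per-I/O: get an element, use it, give it back */
	RF_FREELIST_GET_INIT(rf_dqd_freelist, p, next,
	    (RF_DiskQueueData_t *), init_dqd);
	RF_FREELIST_FREE_CLEAN(rf_dqd_freelist, p, next, clean_dqd);

	/* shutdown: destroy the list (clean_dqd frees each struct buf) */
	RF_FREELIST_DESTROY_CLEAN(rf_dqd_freelist, next,
	    (RF_DiskQueueData_t *), clean_dqd);
#endif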
int
rf_ConfigureDiskQueues(
	RF_ShutdownList_t ** listp,
	RF_Raid_t * raidPtr,
	RF_Config_t * cfgPtr)
{
	RF_DiskQueue_t **diskQueues, *spareQueues;
	RF_DiskQueueSW_t *p;
	RF_RowCol_t r, c;
	int rc, i;

	raidPtr->maxQueueDepth = cfgPtr->maxOutstandingDiskReqs;

	for (p = NULL, i = 0; i < NUM_DISK_QUEUE_TYPES; i++) {
		if (!strcmp(diskqueuesw[i].queueType, cfgPtr->diskQueueType)) {
			p = &diskqueuesw[i];
			break;
		}
	}
	if (p == NULL) {
		RF_ERRORMSG2("Unknown queue type \"%s\".  Using %s\n",
		    cfgPtr->diskQueueType, diskqueuesw[0].queueType);
		p = &diskqueuesw[0];
	}
	raidPtr->qType = p;
	RF_CallocAndAdd(diskQueues, raidPtr->numRow,
	    sizeof(RF_DiskQueue_t *), (RF_DiskQueue_t **),
	    raidPtr->cleanupList);
	if (diskQueues == NULL) {
		return (ENOMEM);
	}
	raidPtr->Queues = diskQueues;
	for (r = 0; r < raidPtr->numRow; r++) {
		RF_CallocAndAdd(diskQueues[r],
		    raidPtr->numCol + ((r == 0) ? RF_MAXSPARE : 0),
		    sizeof(RF_DiskQueue_t), (RF_DiskQueue_t *),
		    raidPtr->cleanupList);
		if (diskQueues[r] == NULL)
			return (ENOMEM);
		for (c = 0; c < raidPtr->numCol; c++) {
			rc = rf_ConfigureDiskQueue(raidPtr, &diskQueues[r][c],
			    r, c, p,
			    raidPtr->sectorsPerDisk,
			    raidPtr->Disks[r][c].dev,
			    cfgPtr->maxOutstandingDiskReqs,
			    listp, raidPtr->cleanupList);
			if (rc)
				return (rc);
		}
	}

	spareQueues = &raidPtr->Queues[0][raidPtr->numCol];
	for (r = 0; r < raidPtr->numSpare; r++) {
		rc = rf_ConfigureDiskQueue(raidPtr, &spareQueues[r],
		    0, raidPtr->numCol + r, p,
		    raidPtr->sectorsPerDisk,
		    raidPtr->Disks[0][raidPtr->numCol + r].dev,
		    cfgPtr->maxOutstandingDiskReqs, listp,
		    raidPtr->cleanupList);
		if (rc)
			return (rc);
	}
	return (0);
}
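/*
 * Illustrative sketch: the array configured above is indexed as
 * Queues[row][col] for the data/parity disks, with the spare queues
 * appended to row 0 past the last column (see spareQueues above).
 * The accessor below is hypothetical, for illustration only.
 */
#if 0
static RF_DiskQueue_t *
example_spare_queue(RF_Raid_t *raidPtr, int s)	/* s-th spare, 0-based */
{
	return (&raidPtr->Queues[0][raidPtr->numCol + s]);
}
#endif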
/* Enqueue a disk I/O
 *
 * Unfortunately, we have to do things differently in the different
 * environments (simulator, user-level, kernel).
 * At user level, all I/O is blocking, so we have 1 or more threads/disk
 * and the thread that enqueues is different from the thread that dequeues.
 * In the kernel, I/O is non-blocking and so we'd like to have multiple
 * I/Os outstanding on the physical disks when possible.
 *
 * when any request arrives at a queue, we have two choices:
 *    dispatch it to the lower levels
 *    queue it up
 *
 * kernel rules for when to do what:
 *    locking request:  queue empty => dispatch and lock queue,
 *                      else queue it
 *    unlocking req  :  always dispatch it
 *    normal req     :  queue empty => dispatch it & set priority
 *                      queue not full & priority is ok => dispatch it
 *                      else queue it
 *
 * user-level rules:
 *    always enqueue.  In the special case of an unlocking op, enqueue
 *    in a special way that will cause the unlocking op to be the next
 *    thing dequeued.
 *
 * simulator rules:
 *    Do the same as at user level, with the sleeps and wakeups suppressed.
 */
void
rf_DiskIOEnqueue(queue, req, pri)
	RF_DiskQueue_t *queue;
	RF_DiskQueueData_t *req;
	int pri;
{
	RF_ETIMER_START(req->qtime);
	RF_ASSERT(req->type == RF_IO_TYPE_NOP || req->numSector);
	req->priority = pri;

	if (rf_queueDebug && (req->numSector == 0)) {
		printf("Warning: Enqueueing zero-sector access\n");
	}
	/*
	 * kernel
	 */
	RF_LOCK_QUEUE_MUTEX(queue, "DiskIOEnqueue");
	/* locking request */
	if (RF_LOCKING_REQ(req)) {
		if (RF_QUEUE_EMPTY(queue)) {
			Dprintf3("Dispatching pri %d locking op to r %d c %d (queue empty)\n", pri, queue->row, queue->col);
			RF_LOCK_QUEUE(queue);
			rf_DispatchKernelIO(queue, req);
		} else {
			queue->queueLength++;	/* increment count of number
						 * of requests waiting in this
						 * queue */
			Dprintf3("Enqueueing pri %d locking op to r %d c %d (queue not empty)\n", pri, queue->row, queue->col);
			req->queue = (void *) queue;
			(queue->qPtr->Enqueue) (queue->qHdr, req, pri);
		}
	}
	/* unlocking request */
	else if (RF_UNLOCKING_REQ(req)) {
		/* we'll do the actual unlock when this I/O completes */
		Dprintf3("Dispatching pri %d unlocking op to r %d c %d\n", pri, queue->row, queue->col);
		RF_ASSERT(RF_QUEUE_LOCKED(queue));
		rf_DispatchKernelIO(queue, req);
	}
	/* normal request */
	else if (RF_OK_TO_DISPATCH(queue, req)) {
		Dprintf3("Dispatching pri %d regular op to r %d c %d (ok to dispatch)\n", pri, queue->row, queue->col);
		rf_DispatchKernelIO(queue, req);
	} else {
		queue->queueLength++;	/* increment count of number of
					 * requests waiting in this queue */
		Dprintf3("Enqueueing pri %d regular op to r %d c %d (not ok to dispatch)\n", pri, queue->row, queue->col);
		req->queue = (void *) queue;
		(queue->qPtr->Enqueue) (queue->qHdr, req, pri);
	}
	RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOEnqueue");
}
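/*
 * Illustrative sketch of a caller issuing a normal (non-locking) read
 * through rf_DiskIOEnqueue().  rf_CreateDiskQueueData() is defined
 * later in this file; example_callback, its argument, and the local
 * variables are hypothetical.
 */
#if 0
	RF_DiskQueueData_t *req;

	req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, startSector,
	    numSectors, databuf, parityStripeID, which_ru,
	    example_callback, callback_arg,	/* (*wakeF)(void *, int) */
	    NULL, tracerec, (void *) raidPtr, 0 /* flags */, kb_proc);
	rf_DiskIOEnqueue(&raidPtr->Queues[row][col], req,
	    RF_IO_NORMAL_PRIORITY);
#endif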
/* get the next set of I/Os started, kernel version only */
void
rf_DiskIOComplete(queue, req, status)
	RF_DiskQueue_t *queue;
	RF_DiskQueueData_t *req;
	int status;
{
	int done = 0;

	RF_LOCK_QUEUE_MUTEX(queue, "DiskIOComplete");

	/* unlock the queue: (1) after an unlocking req completes (2) after a
	 * locking req fails */
	if (RF_UNLOCKING_REQ(req) || (RF_LOCKING_REQ(req) && status)) {
		Dprintf2("DiskIOComplete: unlocking queue at r %d c %d\n", queue->row, queue->col);
		RF_ASSERT(RF_QUEUE_LOCKED(queue) && (queue->unlockingOp == NULL));
		RF_UNLOCK_QUEUE(queue);
	}
	queue->numOutstanding--;
	RF_ASSERT(queue->numOutstanding >= 0);

	/* dispatch requests to the disk until we find one that we can't. */
	/* no reason to continue once we've filled up the queue */
	/* no reason to even start if the queue is locked */

	while (!done && !RF_QUEUE_FULL(queue) && !RF_QUEUE_LOCKED(queue)) {
		if (queue->nextLockingOp) {
			req = queue->nextLockingOp;
			queue->nextLockingOp = NULL;
			Dprintf3("DiskIOComplete: a pri %d locking req was pending at r %d c %d\n", req->priority, queue->row, queue->col);
		} else {
			req = (queue->qPtr->Dequeue) (queue->qHdr);
			if (req != NULL) {
				Dprintf3("DiskIOComplete: extracting pri %d req from queue at r %d c %d\n", req->priority, queue->row, queue->col);
			} else {
				Dprintf1("DiskIOComplete: no more requests to extract.\n", "");
			}
		}
		if (req) {
			queue->queueLength--;	/* decrement count of number
						 * of requests waiting in this
						 * queue */
			RF_ASSERT(queue->queueLength >= 0);
		}
		if (!req)
			done = 1;
		else if (RF_LOCKING_REQ(req)) {
			if (RF_QUEUE_EMPTY(queue)) {	/* dispatch it */
				Dprintf3("DiskIOComplete: dispatching pri %d locking req to r %d c %d (queue empty)\n", req->priority, queue->row, queue->col);
				RF_LOCK_QUEUE(queue);
				rf_DispatchKernelIO(queue, req);
				done = 1;
			} else {	/* put it aside to wait for the queue
					 * to drain */
				Dprintf3("DiskIOComplete: postponing pri %d locking req to r %d c %d\n", req->priority, queue->row, queue->col);
				RF_ASSERT(queue->nextLockingOp == NULL);
				queue->nextLockingOp = req;
				done = 1;
			}
		} else if (RF_UNLOCKING_REQ(req)) {
			/* should not happen: unlocking ops should not get
			 * queued.  support it anyway for the future */
			RF_ASSERT(RF_QUEUE_LOCKED(queue));
			Dprintf3("DiskIOComplete: dispatching pri %d unl req to r %d c %d (SHOULD NOT SEE THIS)\n", req->priority, queue->row, queue->col);
			rf_DispatchKernelIO(queue, req);
			done = 1;
		} else if (RF_OK_TO_DISPATCH(queue, req)) {
			Dprintf3("DiskIOComplete: dispatching pri %d regular req to r %d c %d (ok to dispatch)\n", req->priority, queue->row, queue->col);
			rf_DispatchKernelIO(queue, req);
		} else {
			/* we can't dispatch it, so just re-enqueue it.
			 * potential trouble here if disk queues batch reqs */
			Dprintf3("DiskIOComplete: re-enqueueing pri %d regular req to r %d c %d\n", req->priority, queue->row, queue->col);
			queue->queueLength++;
			(queue->qPtr->Enqueue) (queue->qHdr, req, req->priority);
			done = 1;
		}
	}

	RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOComplete");
}

/* promotes accesses tagged with the given parityStripeID from low priority
 * to normal priority.  This promotion is optional, meaning that a queue
 * need not implement it.  If there is no promotion routine associated with
 * a queue, this routine does nothing and returns -1.
 */
int
rf_DiskIOPromote(queue, parityStripeID, which_ru)
	RF_DiskQueue_t *queue;
	RF_StripeNum_t parityStripeID;
	RF_ReconUnitNum_t which_ru;
{
	int retval;

	if (!queue->qPtr->Promote)
		return (-1);
	RF_LOCK_QUEUE_MUTEX(queue, "DiskIOPromote");
	retval = (queue->qPtr->Promote) (queue->qHdr, parityStripeID, which_ru);
	RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOPromote");
	return (retval);
}
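/*
 * Illustrative sketch of a promotion call, e.g. when low-priority
 * accesses for a parity stripe become urgent.  Only the -1 (no Promote
 * routine) case is specified by the comment above; the meaning of a
 * successful return value is left to the individual discipline.  The
 * variables below are hypothetical.
 */
#if 0
	if (rf_DiskIOPromote(&raidPtr->Queues[row][col], psid, which_ru) < 0) {
		/* this discipline has no Promote routine; the accesses
		 * simply retain their original priority */
	}
#endif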
RF_DiskQueueData_t *
rf_CreateDiskQueueData(
	RF_IoType_t typ,
	RF_SectorNum_t ssect,
	RF_SectorCount_t nsect,
	caddr_t buf,
	RF_StripeNum_t parityStripeID,
	RF_ReconUnitNum_t which_ru,
	int (*wakeF) (void *, int),
	void *arg,
	RF_DiskQueueData_t * next,
	RF_AccTraceEntry_t * tracerec,
	void *raidPtr,
	RF_DiskQueueDataFlags_t flags,
	void *kb_proc)
{
	RF_DiskQueueData_t *p;

	RF_FREELIST_GET_INIT(rf_dqd_freelist, p, next,
	    (RF_DiskQueueData_t *), init_dqd);

	p->sectorOffset = ssect + rf_protectedSectors;
	p->numSector = nsect;
	p->type = typ;
	p->buf = buf;
	p->parityStripeID = parityStripeID;
	p->which_ru = which_ru;
	p->CompleteFunc = wakeF;
	p->argument = arg;
	p->next = next;
	p->tracerec = tracerec;
	p->priority = RF_IO_NORMAL_PRIORITY;
	p->AuxFunc = NULL;
	p->buf2 = NULL;
	p->raidPtr = raidPtr;
	p->flags = flags;
	p->b_proc = kb_proc;
	return (p);
}

RF_DiskQueueData_t *
rf_CreateDiskQueueDataFull(
	RF_IoType_t typ,
	RF_SectorNum_t ssect,
	RF_SectorCount_t nsect,
	caddr_t buf,
	RF_StripeNum_t parityStripeID,
	RF_ReconUnitNum_t which_ru,
	int (*wakeF) (void *, int),
	void *arg,
	RF_DiskQueueData_t * next,
	RF_AccTraceEntry_t * tracerec,
	int priority,
	int (*AuxFunc) (void *,...),
	caddr_t buf2,
	void *raidPtr,
	RF_DiskQueueDataFlags_t flags,
	void *kb_proc)
{
	RF_DiskQueueData_t *p;

	RF_FREELIST_GET_INIT(rf_dqd_freelist, p, next,
	    (RF_DiskQueueData_t *), init_dqd);

	p->sectorOffset = ssect + rf_protectedSectors;
	p->numSector = nsect;
	p->type = typ;
	p->buf = buf;
	p->parityStripeID = parityStripeID;
	p->which_ru = which_ru;
	p->CompleteFunc = wakeF;
	p->argument = arg;
	p->next = next;
	p->tracerec = tracerec;
	p->priority = priority;
	p->AuxFunc = AuxFunc;
	p->buf2 = buf2;
	p->raidPtr = raidPtr;
	p->flags = flags;
	p->b_proc = kb_proc;
	return (p);
}

void
rf_FreeDiskQueueData(p)
	RF_DiskQueueData_t *p;
{
	RF_FREELIST_FREE_CLEAN(rf_dqd_freelist, p, next, clean_dqd);
}
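/*
 * Illustrative sketch of a completion function matching the
 * (*wakeF)(void *, int) signature accepted by the constructors above,
 * paired with rf_FreeDiskQueueData().  The context structure and all
 * example_* names are hypothetical.
 */
#if 0
struct example_ctx {
	RF_DiskQueueData_t *req;	/* filled in after creation */
	/* ...state needed to wake the waiting thread... */
};

static int
example_complete(void *arg, int status)
{
	struct example_ctx *ctx = arg;

	/* ...record status, wake the waiter... */

	rf_FreeDiskQueueData(ctx->req);	/* return the dqd to the freelist */
	return (0);
}
#endif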