/*	$NetBSD: rf_diskqueue.c,v 1.53 2011/05/05 06:04:09 mrg Exp $	*/
/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: Mark Holland
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/****************************************************************************
 *
 * rf_diskqueue.c -- higher-level disk queue code
 *
 * the routines here are a generic wrapper around the actual queueing
 * routines.  The code here implements thread scheduling, synchronization,
 * and locking ops (see below) on top of the lower-level queueing code.
 *
 * to support atomic RMW, we implement "locking operations".  When a
 * locking op is dispatched to the lower levels of the driver, the
 * queue is locked, and no further I/Os are dispatched until the queue
 * receives & completes a corresponding "unlocking operation".  This
 * code relies on the higher layers to guarantee that a locking op
 * will always be eventually followed by an unlocking op.  The model
 * is that the higher layers are structured so locking and unlocking
 * ops occur in pairs, i.e. an unlocking op cannot be generated until
 * after a locking op reports completion.  There is no good way to
 * check to see that an unlocking op "corresponds" to the op that
 * currently has the queue locked, so we make no such attempt.  Since
 * by definition there can be only one locking op outstanding on a
 * disk, this should not be a problem.
 *
 * In the kernel, we allow multiple I/Os to be concurrently dispatched
 * to the disk driver.  In order to support locking ops in this
 * environment, when we decide to do a locking op, we stop dispatching
 * new I/Os and wait until all dispatched I/Os have completed before
 * dispatching the locking op.
 *
 * Unfortunately, the code is different in the 3 different operating
 * states (user level, kernel, simulator).  In the kernel, I/O is
 * non-blocking, and we have no disk threads to dispatch for us.
 * Therefore, we have to dispatch new I/Os to the scsi driver at the
 * time of enqueue, and also at the time of completion.  At user
 * level, I/O is blocking, and so only the disk threads may dispatch
 * I/Os.  Thus at user level, all we can do at enqueue time is enqueue
 * and wake up the disk thread to do the dispatch.
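 *
 * As an illustration of the pairing assumption above (a rough sketch,
 * not an interface defined in this file), a higher layer performing an
 * atomic read-modify-write is expected to behave roughly like:
 *
 *	issue the locking op and wait for it to report completion
 *	issue the read and write I/Os of the read-modify-write
 *	issue the corresponding unlocking op
 *
 * so that at most one locking op is ever outstanding on a given disk.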
 *
 ****************************************************************************/

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_diskqueue.c,v 1.53 2011/05/05 06:04:09 mrg Exp $");

#include <dev/raidframe/raidframevar.h>

#include "rf_threadstuff.h"
#include "rf_raid.h"
#include "rf_diskqueue.h"
#include "rf_alloclist.h"
#include "rf_acctrace.h"
#include "rf_etimer.h"
#include "rf_general.h"
#include "rf_debugprint.h"
#include "rf_shutdown.h"
#include "rf_cvscan.h"
#include "rf_sstf.h"
#include "rf_fifo.h"
#include "rf_kintf.h"

static void rf_ShutdownDiskQueueSystem(void *);

#ifndef RF_DEBUG_DISKQUEUE
#define RF_DEBUG_DISKQUEUE 0
#endif

#if RF_DEBUG_DISKQUEUE
#define Dprintf1(s,a)     if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL)
#define Dprintf2(s,a,b)   if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL)
#define Dprintf3(s,a,b,c) if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL)
#else
#define Dprintf1(s,a)
#define Dprintf2(s,a,b)
#define Dprintf3(s,a,b,c)
#endif

/*****************************************************************************
 *
 * the disk queue switch defines all the functions used in the
 * different queueing disciplines: queue type name, create routine,
 * enqueue routine, dequeue routine, peek routine, and promote routine
 *
 ****************************************************************************/

static const RF_DiskQueueSW_t diskqueuesw[] = {
	{"fifo",		/* FIFO */
		rf_FifoCreate,
		rf_FifoEnqueue,
		rf_FifoDequeue,
		rf_FifoPeek,
		rf_FifoPromote},

	{"cvscan",		/* cvscan */
		rf_CvscanCreate,
		rf_CvscanEnqueue,
		rf_CvscanDequeue,
		rf_CvscanPeek,
		rf_CvscanPromote},

	{"sstf",		/* shortest seek time first */
		rf_SstfCreate,
		rf_SstfEnqueue,
		rf_SstfDequeue,
		rf_SstfPeek,
		rf_SstfPromote},

	{"scan",		/* SCAN (two-way elevator) */
		rf_ScanCreate,
		rf_SstfEnqueue,
		rf_ScanDequeue,
		rf_ScanPeek,
		rf_SstfPromote},

	{"cscan",		/* CSCAN (one-way elevator) */
		rf_CscanCreate,
		rf_SstfEnqueue,
		rf_CscanDequeue,
		rf_CscanPeek,
		rf_SstfPromote},

};
#define NUM_DISK_QUEUE_TYPES (sizeof(diskqueuesw)/sizeof(RF_DiskQueueSW_t))

#define RF_MAX_FREE_DQD 256
#define RF_MIN_FREE_DQD  64

#include <sys/buf.h>

static void
rf_ShutdownDiskQueue(void *arg)
{
	RF_DiskQueue_t *diskqueue = arg;

	rf_destroy_mutex2(diskqueue->mutex);
}

/* configures a single disk queue */
int
rf_ConfigureDiskQueue(RF_Raid_t *raidPtr, RF_DiskQueue_t *diskqueue,
		      RF_RowCol_t c, const RF_DiskQueueSW_t *p,
		      RF_SectorCount_t sectPerDisk, dev_t dev,
		      int maxOutstanding, RF_ShutdownList_t **listp,
		      RF_AllocListElem_t *clList)
{
	diskqueue->col = c;
	diskqueue->qPtr = p;
	diskqueue->qHdr = (p->Create) (sectPerDisk, clList, listp);
	diskqueue->dev = dev;
	diskqueue->numOutstanding = 0;
	diskqueue->queueLength = 0;
	diskqueue->maxOutstanding = maxOutstanding;
	diskqueue->curPriority = RF_IO_NORMAL_PRIORITY;
	diskqueue->flags = 0;
	diskqueue->raidPtr = raidPtr;
	diskqueue->rf_cinfo = &raidPtr->raid_cinfo[c];
	rf_init_mutex2(diskqueue->mutex, IPL_VM);
	rf_ShutdownCreate(listp,
	    rf_ShutdownDiskQueue, diskqueue);
	return (0);
}

static void
rf_ShutdownDiskQueueSystem(void *ignored)
{
	pool_destroy(&rf_pools.dqd);
}

int
rf_ConfigureDiskQueueSystem(RF_ShutdownList_t **listp)
{

	rf_pool_init(&rf_pools.dqd, sizeof(RF_DiskQueueData_t),
		     "rf_dqd_pl", RF_MIN_FREE_DQD, RF_MAX_FREE_DQD);
	rf_ShutdownCreate(listp, rf_ShutdownDiskQueueSystem, NULL);

	return (0);
}

int
rf_ConfigureDiskQueues(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
		       RF_Config_t *cfgPtr)
{
	RF_DiskQueue_t *diskQueues, *spareQueues;
	const RF_DiskQueueSW_t *p;
	RF_RowCol_t r, c;
	int rc, i;

	raidPtr->maxQueueDepth = cfgPtr->maxOutstandingDiskReqs;

	for (p = NULL, i = 0; i < NUM_DISK_QUEUE_TYPES; i++) {
		if (!strcmp(diskqueuesw[i].queueType, cfgPtr->diskQueueType)) {
			p = &diskqueuesw[i];
			break;
		}
	}
	if (p == NULL) {
		RF_ERRORMSG2("Unknown queue type \"%s\".  Using %s\n", cfgPtr->diskQueueType, diskqueuesw[0].queueType);
		p = &diskqueuesw[0];
	}
	raidPtr->qType = p;

	RF_MallocAndAdd(diskQueues,
			(raidPtr->numCol + RF_MAXSPARE) *
			sizeof(RF_DiskQueue_t), (RF_DiskQueue_t *),
			raidPtr->cleanupList);
	if (diskQueues == NULL)
		return (ENOMEM);
	raidPtr->Queues = diskQueues;

	for (c = 0; c < raidPtr->numCol; c++) {
		rc = rf_ConfigureDiskQueue(raidPtr, &diskQueues[c],
					   c, p,
					   raidPtr->sectorsPerDisk,
					   raidPtr->Disks[c].dev,
					   cfgPtr->maxOutstandingDiskReqs,
					   listp, raidPtr->cleanupList);
		if (rc)
			return (rc);
	}

	spareQueues = &raidPtr->Queues[raidPtr->numCol];
	for (r = 0; r < raidPtr->numSpare; r++) {
		rc = rf_ConfigureDiskQueue(raidPtr, &spareQueues[r],
					   raidPtr->numCol + r, p,
					   raidPtr->sectorsPerDisk,
					   raidPtr->Disks[raidPtr->numCol + r].dev,
					   cfgPtr->maxOutstandingDiskReqs, listp,
					   raidPtr->cleanupList);
		if (rc)
			return (rc);
	}
	return (0);
}
/* Enqueue a disk I/O
 *
 * In the kernel, I/O is non-blocking and so we'd like to have multiple
 * I/Os outstanding on the physical disks when possible.
 *
 * when any request arrives at a queue, we have two choices:
 *    dispatch it to the lower levels
 *    queue it up
 *
 * kernel rules for when to do what:
 *    unlocking req  :  always dispatch it
 *    normal req     :  queue empty  =>  dispatch it & set priority
 *                      queue not full & priority is ok  =>  dispatch it
 *                      else queue it
 */
void
rf_DiskIOEnqueue(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req, int pri)
{
	RF_ETIMER_START(req->qtime);
	RF_ASSERT(req->type == RF_IO_TYPE_NOP || req->numSector);
	req->priority = pri;

#if RF_DEBUG_DISKQUEUE
	if (rf_queueDebug && (req->numSector == 0)) {
		printf("Warning: Enqueueing zero-sector access\n");
	}
#endif
	RF_LOCK_QUEUE_MUTEX(queue, "DiskIOEnqueue");
	if (RF_OK_TO_DISPATCH(queue, req)) {
		Dprintf2("Dispatching pri %d regular op to c %d (ok to dispatch)\n", pri, queue->col);
		rf_DispatchKernelIO(queue, req);
	} else {
		queue->queueLength++;	/* increment count of number of requests waiting in this queue */
		Dprintf2("Enqueueing pri %d regular op to c %d (not ok to dispatch)\n", pri, queue->col);
		req->queue = (void *) queue;
		(queue->qPtr->Enqueue) (queue->qHdr, req, pri);
	}
	RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOEnqueue");
}


/* get the next set of I/Os started */
void
rf_DiskIOComplete(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req, int status)
{
	int done = 0;

	RF_LOCK_QUEUE_MUTEX(queue, "DiskIOComplete");
	queue->numOutstanding--;
	RF_ASSERT(queue->numOutstanding >= 0);

	/* dispatch requests to the disk until we find one that we can't. */
	/* no reason to continue once we've filled up the queue */
	/* no reason to even start if the queue is locked */

	while (!done && !RF_QUEUE_FULL(queue)) {
		req = (queue->qPtr->Dequeue) (queue->qHdr);
		if (req) {
			Dprintf2("DiskIOComplete: extracting pri %d req from queue at c %d\n", req->priority, queue->col);
			queue->queueLength--;	/* decrement count of number of requests waiting in this queue */
			RF_ASSERT(queue->queueLength >= 0);
			if (RF_OK_TO_DISPATCH(queue, req)) {
				Dprintf2("DiskIOComplete: dispatching pri %d regular req to c %d (ok to dispatch)\n", req->priority, queue->col);
				rf_DispatchKernelIO(queue, req);
			} else {
				/* we can't dispatch it, so just re-enqueue it.
				   potential trouble here if disk queues batch reqs */
				Dprintf2("DiskIOComplete: re-enqueueing pri %d regular req to c %d\n", req->priority, queue->col);
				queue->queueLength++;
				(queue->qPtr->Enqueue) (queue->qHdr, req, req->priority);
				done = 1;
			}
		} else {
			Dprintf1("DiskIOComplete: no more requests to extract.\n", "");
			done = 1;
		}
	}

	RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOComplete");
}
/* promotes accesses tagged with the given parityStripeID from low priority
 * to normal priority.  This promotion is optional, meaning that a queue
 * need not implement it.  If there is no promotion routine associated with
 * a queue, this routine does nothing and returns -1.
 */
int
rf_DiskIOPromote(RF_DiskQueue_t *queue, RF_StripeNum_t parityStripeID,
		 RF_ReconUnitNum_t which_ru)
{
	int retval;

	if (!queue->qPtr->Promote)
		return (-1);
	RF_LOCK_QUEUE_MUTEX(queue, "DiskIOPromote");
	retval = (queue->qPtr->Promote) (queue->qHdr, parityStripeID, which_ru);
	RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOPromote");
	return (retval);
}

RF_DiskQueueData_t *
rf_CreateDiskQueueData(RF_IoType_t typ, RF_SectorNum_t ssect,
		       RF_SectorCount_t nsect, void *bf,
		       RF_StripeNum_t parityStripeID,
		       RF_ReconUnitNum_t which_ru,
		       int (*wakeF) (void *, int), void *arg,
		       RF_AccTraceEntry_t *tracerec, RF_Raid_t *raidPtr,
		       RF_DiskQueueDataFlags_t flags, void *kb_proc,
		       int waitflag)
{
	RF_DiskQueueData_t *p;

	p = pool_get(&rf_pools.dqd, waitflag);
	if (p == NULL)
		return (NULL);

	memset(p, 0, sizeof(RF_DiskQueueData_t));
	if (waitflag == PR_WAITOK) {
		p->bp = getiobuf(NULL, true);
	} else {
		p->bp = getiobuf(NULL, false);
	}
	if (p->bp == NULL) {
		pool_put(&rf_pools.dqd, p);
		return (NULL);
	}
	SET(p->bp->b_cflags, BC_BUSY);	/* mark buffer busy */

	p->sectorOffset = ssect + rf_protectedSectors;
	p->numSector = nsect;
	p->type = typ;
	p->buf = bf;
	p->parityStripeID = parityStripeID;
	p->which_ru = which_ru;
	p->CompleteFunc = wakeF;
	p->argument = arg;
	p->next = NULL;
	p->tracerec = tracerec;
	p->priority = RF_IO_NORMAL_PRIORITY;
	p->raidPtr = raidPtr;
	p->flags = flags;
	p->b_proc = kb_proc;
	return (p);
}

void
rf_FreeDiskQueueData(RF_DiskQueueData_t *p)
{
	int s;
	s = splbio();		/* XXX protect only pool_put, or neither? */
	putiobuf(p->bp);
	pool_put(&rf_pools.dqd, p);
	splx(s);
}
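
/*
 * Illustrative sketch only (compiled out): one way a caller might drive
 * the request life cycle implemented above.  The function name, the
 * completion callback "example_done" and its arguments are hypothetical;
 * the rf_* routines and macros it calls are the ones defined in this
 * file or in raidframevar.h.
 */
#if 0
static int
rf_diskqueue_example(RF_Raid_t *raidPtr, RF_RowCol_t col,
		     RF_SectorNum_t sector, RF_SectorCount_t nsect,
		     void *buf, int (*example_done)(void *, int), void *arg)
{
	RF_DiskQueueData_t *req;

	/* allocate a request; PR_WAITOK allows sleeping for memory */
	req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, sector, nsect, buf,
	    0, 0, example_done, arg, NULL, raidPtr, 0, NULL, PR_WAITOK);
	if (req == NULL)
		return (ENOMEM);

	/* hand it to the per-disk queue; rf_DiskIOEnqueue() dispatches it
	   immediately if RF_OK_TO_DISPATCH() allows, otherwise queues it */
	rf_DiskIOEnqueue(&raidPtr->Queues[col], req, RF_IO_NORMAL_PRIORITY);

	/* when the I/O finishes, the kernel interface calls
	   rf_DiskIOComplete() to start the next queued request and then
	   invokes example_done(arg, status); in this sketch the callback
	   is assumed to release the request with rf_FreeDiskQueueData() */
	return (0);
}
#endif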