/*	$NetBSD: rf_diskqueue.c,v 1.63 2021/12/14 00:46:43 mrg Exp $	*/
/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: Mark Holland
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/****************************************************************************
 *
 * rf_diskqueue.c -- higher-level disk queue code
 *
 * the routines here are a generic wrapper around the actual queueing
 * routines.  The code here implements thread scheduling, synchronization,
 * and locking ops (see below) on top of the lower-level queueing code.
 *
 * to support atomic RMW, we implement "locking operations".  When a
 * locking op is dispatched to the lower levels of the driver, the
 * queue is locked, and no further I/Os are dispatched until the queue
 * receives & completes a corresponding "unlocking operation".  This
 * code relies on the higher layers to guarantee that a locking op
 * will always be eventually followed by an unlocking op.  The model
 * is that the higher layers are structured so locking and unlocking
 * ops occur in pairs, i.e. an unlocking op cannot be generated until
 * after a locking op reports completion.  There is no good way to
 * check to see that an unlocking op "corresponds" to the op that
 * currently has the queue locked, so we make no such attempt.  Since
 * by definition there can be only one locking op outstanding on a
 * disk, this should not be a problem.
 *
 * In the kernel, we allow multiple I/Os to be concurrently dispatched
 * to the disk driver.  In order to support locking ops in this
 * environment, when we decide to do a locking op, we stop dispatching
 * new I/Os and wait until all dispatched I/Os have completed before
 * dispatching the locking op.
 *
 * Unfortunately, the code is different in the 3 different operating
 * states (user level, kernel, simulator).  In the kernel, I/O is
 * non-blocking, and we have no disk threads to dispatch for us.
 * Therefore, we have to dispatch new I/Os to the scsi driver at the
 * time of enqueue, and also at the time of completion.  At user
 * level, I/O is blocking, and so only the disk threads may dispatch
 * I/Os.  Thus at user level, all we can do at enqueue time is enqueue
 * and wake up the disk thread to do the dispatch.
 *
 ****************************************************************************/

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_diskqueue.c,v 1.63 2021/12/14 00:46:43 mrg Exp $");

#include <dev/raidframe/raidframevar.h>

#include "rf_threadstuff.h"
#include "rf_raid.h"
#include "rf_diskqueue.h"
#include "rf_alloclist.h"
#include "rf_acctrace.h"
#include "rf_etimer.h"
#include "rf_general.h"
#include "rf_debugprint.h"
#include "rf_shutdown.h"
#include "rf_cvscan.h"
#include "rf_sstf.h"
#include "rf_fifo.h"
#include "rf_kintf.h"

#include <sys/buf.h>

static void rf_ShutdownDiskQueueSystem(void *);

#ifndef RF_DEBUG_DISKQUEUE
#define RF_DEBUG_DISKQUEUE 0
#endif

#if RF_DEBUG_DISKQUEUE
#define Dprintf1(s,a)         if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL)
#define Dprintf2(s,a,b)       if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL)
#define Dprintf3(s,a,b,c)     if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL)
#else
#define Dprintf1(s,a)
#define Dprintf2(s,a,b)
#define Dprintf3(s,a,b,c)
#endif

/*****************************************************************************
 *
 * the disk queue switch defines all the functions used in the
 * different queueing disciplines: queue type name, init routine,
 * enqueue routine, dequeue routine, and promote routine.
 *
 ****************************************************************************/

static const RF_DiskQueueSW_t diskqueuesw[] = {
	{"fifo",		/* FIFO */
		rf_FifoCreate,
		rf_FifoEnqueue,
		rf_FifoDequeue,
		rf_FifoPromote},

	{"cvscan",		/* cvscan */
		rf_CvscanCreate,
		rf_CvscanEnqueue,
		rf_CvscanDequeue,
		rf_CvscanPromote},

	{"sstf",		/* shortest seek time first */
		rf_SstfCreate,
		rf_SstfEnqueue,
		rf_SstfDequeue,
		rf_SstfPromote},

	{"scan",		/* SCAN (two-way elevator) */
		rf_ScanCreate,
		rf_SstfEnqueue,
		rf_ScanDequeue,
		rf_SstfPromote},

	{"cscan",		/* CSCAN (one-way elevator) */
		rf_CscanCreate,
		rf_SstfEnqueue,
		rf_CscanDequeue,
		rf_SstfPromote},

};
#define NUM_DISK_QUEUE_TYPES (sizeof(diskqueuesw)/sizeof(RF_DiskQueueSW_t))


#define RF_MAX_FREE_DQD 256
#define RF_MIN_FREE_DQD  64
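
/*
 * The RF_{MIN,MAX}_FREE_* constants above and below are handed to
 * rf_pool_init() in rf_ConfigureDiskQueueSystem() below as the minimum
 * and maximum number of free elements to keep in the per-RAID "dqd"
 * (disk queue data) and "bufio" (struct buf) pools.
 */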
/* XXX: scale these... */
#define RF_MAX_FREE_BUFIO 256
#define RF_MIN_FREE_BUFIO  64



/* configures a single disk queue */

static void
rf_ShutdownDiskQueue(void *arg)
{
	RF_DiskQueue_t *diskqueue = arg;

	rf_destroy_mutex2(diskqueue->mutex);
}

int
rf_ConfigureDiskQueue(RF_Raid_t *raidPtr, RF_DiskQueue_t *diskqueue,
		      RF_RowCol_t c, const RF_DiskQueueSW_t *p,
		      RF_SectorCount_t sectPerDisk, dev_t dev,
		      int maxOutstanding, RF_ShutdownList_t **listp,
		      RF_AllocListElem_t *clList)
{
	diskqueue->col = c;
	diskqueue->qPtr = p;
	diskqueue->qHdr = (p->Create) (sectPerDisk, clList, listp);
	diskqueue->dev = dev;
	diskqueue->numOutstanding = 0;
	diskqueue->queueLength = 0;
	diskqueue->maxOutstanding = maxOutstanding;
	diskqueue->curPriority = RF_IO_NORMAL_PRIORITY;
	diskqueue->flags = 0;
	diskqueue->raidPtr = raidPtr;
	diskqueue->rf_cinfo = &raidPtr->raid_cinfo[c];
	rf_init_mutex2(diskqueue->mutex, IPL_VM);
	rf_ShutdownCreate(listp, rf_ShutdownDiskQueue, diskqueue);
	return (0);
}

static void
rf_ShutdownDiskQueueSystem(void *arg)
{
	RF_Raid_t *raidPtr;

	raidPtr = (RF_Raid_t *) arg;

	pool_destroy(&raidPtr->pools.dqd);
	pool_destroy(&raidPtr->pools.bufio);
}

int
rf_ConfigureDiskQueueSystem(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
			    RF_Config_t *cfgPtr)

{

	rf_pool_init(raidPtr, raidPtr->poolNames.dqd, &raidPtr->pools.dqd, sizeof(RF_DiskQueueData_t),
		     "dqd", RF_MIN_FREE_DQD, RF_MAX_FREE_DQD);
	rf_pool_init(raidPtr, raidPtr->poolNames.bufio, &raidPtr->pools.bufio, sizeof(buf_t),
		     "bufio", RF_MIN_FREE_BUFIO, RF_MAX_FREE_BUFIO);
	rf_ShutdownCreate(listp, rf_ShutdownDiskQueueSystem, raidPtr);

	return (0);
}

int
rf_ConfigureDiskQueues(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
		       RF_Config_t *cfgPtr)
{
	RF_DiskQueue_t *diskQueues, *spareQueues;
	const RF_DiskQueueSW_t *p;
	RF_RowCol_t r,c;
	int rc, i;

	raidPtr->maxQueueDepth = cfgPtr->maxOutstandingDiskReqs;

	for (p = NULL, i = 0; i < NUM_DISK_QUEUE_TYPES; i++) {
		if (!strcmp(diskqueuesw[i].queueType, cfgPtr->diskQueueType)) {
			p = &diskqueuesw[i];
			break;
		}
	}
	if (p == NULL) {
		RF_ERRORMSG2("Unknown queue type \"%s\".  Using %s\n", cfgPtr->diskQueueType, diskqueuesw[0].queueType);
		p = &diskqueuesw[0];
	}
	raidPtr->qType = p;

	diskQueues = RF_MallocAndAdd(
	    (raidPtr->numCol + RF_MAXSPARE) * sizeof(*diskQueues),
	    raidPtr->cleanupList);
	if (diskQueues == NULL)
		return (ENOMEM);
	raidPtr->Queues = diskQueues;

	for (c = 0; c < raidPtr->numCol; c++) {
		rc = rf_ConfigureDiskQueue(raidPtr, &diskQueues[c],
					   c, p,
					   raidPtr->sectorsPerDisk,
					   raidPtr->Disks[c].dev,
					   cfgPtr->maxOutstandingDiskReqs,
					   listp, raidPtr->cleanupList);
		if (rc)
			return (rc);
	}

	spareQueues = &raidPtr->Queues[raidPtr->numCol];
	for (r = 0; r < raidPtr->numSpare; r++) {
		rc = rf_ConfigureDiskQueue(raidPtr, &spareQueues[r],
					   raidPtr->numCol + r, p,
					   raidPtr->sectorsPerDisk,
					   raidPtr->Disks[raidPtr->numCol + r].dev,
					   cfgPtr->maxOutstandingDiskReqs, listp,
					   raidPtr->cleanupList);
		if (rc)
			return (rc);
	}
	return (0);
}
/* Enqueue a disk I/O
 *
 * In the kernel, I/O is non-blocking and so we'd like to have multiple
 * I/Os outstanding on the physical disks when possible.
274 * 275 * when any request arrives at a queue, we have two choices: 276 * dispatch it to the lower levels 277 * queue it up 278 * 279 * kernel rules for when to do what: 280 * unlocking req : always dispatch it 281 * normal req : queue empty => dispatch it & set priority 282 * queue not full & priority is ok => dispatch it 283 * else queue it 284 */ 285 void 286 rf_DiskIOEnqueue(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req, int pri) 287 { 288 RF_ETIMER_START(req->qtime); 289 RF_ASSERT(req->type == RF_IO_TYPE_NOP || req->numSector); 290 req->priority = pri; 291 292 #if RF_DEBUG_DISKQUEUE 293 if (rf_queueDebug && (req->numSector == 0)) { 294 printf("Warning: Enqueueing zero-sector access\n"); 295 } 296 #endif 297 RF_LOCK_QUEUE_MUTEX(queue, "DiskIOEnqueue"); 298 if (RF_OK_TO_DISPATCH(queue, req)) { 299 Dprintf2("Dispatching pri %d regular op to c %d (ok to dispatch)\n", pri, queue->col); 300 rf_DispatchKernelIO(queue, req); 301 } else { 302 queue->queueLength++; /* increment count of number of requests waiting in this queue */ 303 Dprintf2("Enqueueing pri %d regular op to c %d (not ok to dispatch)\n", pri, queue->col); 304 req->queue = (void *) queue; 305 (queue->qPtr->Enqueue) (queue->qHdr, req, pri); 306 } 307 RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOEnqueue"); 308 } 309 310 311 /* get the next set of I/Os started */ 312 void 313 rf_DiskIOComplete(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req, int status) 314 { 315 int done = 0; 316 317 RF_LOCK_QUEUE_MUTEX(queue, "DiskIOComplete"); 318 queue->numOutstanding--; 319 RF_ASSERT(queue->numOutstanding >= 0); 320 321 /* dispatch requests to the disk until we find one that we can't. */ 322 /* no reason to continue once we've filled up the queue */ 323 /* no reason to even start if the queue is locked */ 324 325 while (!done && !RF_QUEUE_FULL(queue)) { 326 req = (queue->qPtr->Dequeue) (queue->qHdr); 327 if (req) { 328 Dprintf2("DiskIOComplete: extracting pri %d req from queue at c %d\n", req->priority, queue->col); 329 queue->queueLength--; /* decrement count of number of requests waiting in this queue */ 330 RF_ASSERT(queue->queueLength >= 0); 331 if (RF_OK_TO_DISPATCH(queue, req)) { 332 Dprintf2("DiskIOComplete: dispatching pri %d regular req to c %d (ok to dispatch)\n", req->priority, queue->col); 333 rf_DispatchKernelIO(queue, req); 334 } else { 335 /* we can't dispatch it, so just re-enqueue it. 336 potential trouble here if disk queues batch reqs */ 337 Dprintf2("DiskIOComplete: re-enqueueing pri %d regular req to c %d\n", req->priority, queue->col); 338 queue->queueLength++; 339 (queue->qPtr->Enqueue) (queue->qHdr, req, req->priority); 340 done = 1; 341 } 342 } else { 343 Dprintf1("DiskIOComplete: no more requests to extract.\n", ""); 344 done = 1; 345 } 346 } 347 348 RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOComplete"); 349 } 350 /* promotes accesses tagged with the given parityStripeID from low priority 351 * to normal priority. This promotion is optional, meaning that a queue 352 * need not implement it. If there is no promotion routine associated with 353 * a queue, this routine does nothing and returns -1. 
354 */ 355 int 356 rf_DiskIOPromote(RF_DiskQueue_t *queue, RF_StripeNum_t parityStripeID, 357 RF_ReconUnitNum_t which_ru) 358 { 359 int retval; 360 361 if (!queue->qPtr->Promote) 362 return (-1); 363 RF_LOCK_QUEUE_MUTEX(queue, "DiskIOPromote"); 364 retval = (queue->qPtr->Promote) (queue->qHdr, parityStripeID, which_ru); 365 RF_UNLOCK_QUEUE_MUTEX(queue, "DiskIOPromote"); 366 return (retval); 367 } 368 369 RF_DiskQueueData_t * 370 rf_CreateDiskQueueData(RF_IoType_t typ, RF_SectorNum_t ssect, 371 RF_SectorCount_t nsect, void *bf, 372 RF_StripeNum_t parityStripeID, 373 RF_ReconUnitNum_t which_ru, 374 void (*wakeF) (void *, int), void *arg, 375 RF_AccTraceEntry_t *tracerec, RF_Raid_t *raidPtr, 376 RF_DiskQueueDataFlags_t flags, const struct buf *mbp) 377 { 378 RF_DiskQueueData_t *p; 379 380 p = pool_get(&raidPtr->pools.dqd, PR_WAITOK | PR_ZERO); 381 KASSERT(p != NULL); 382 383 /* Obtain a buffer from our own pool. It is possible for the 384 regular getiobuf() to run out of memory and return NULL. 385 We need to guarantee that never happens, as RAIDframe 386 doesn't have a good way to recover if memory allocation 387 fails here. 388 */ 389 p->bp = pool_get(&raidPtr->pools.bufio, PR_WAITOK | PR_ZERO); 390 KASSERT(p->bp != NULL); 391 392 buf_init(p->bp); 393 394 SET(p->bp->b_cflags, BC_BUSY); /* mark buffer busy */ 395 if (mbp) { 396 SET(p->bp->b_flags, mbp->b_flags & rf_b_pass); 397 p->bp->b_proc = mbp->b_proc; 398 } 399 400 p->sectorOffset = ssect + rf_protectedSectors; 401 p->numSector = nsect; 402 p->type = typ; 403 p->buf = bf; 404 p->parityStripeID = parityStripeID; 405 p->which_ru = which_ru; 406 p->CompleteFunc = wakeF; 407 p->argument = arg; 408 p->next = NULL; 409 p->tracerec = tracerec; 410 p->priority = RF_IO_NORMAL_PRIORITY; 411 p->raidPtr = raidPtr; 412 p->flags = flags; 413 return (p); 414 } 415 416 void 417 rf_FreeDiskQueueData(RF_DiskQueueData_t *p) 418 { 419 420 buf_destroy(p->bp); 421 422 pool_put(&p->raidPtr->pools.bufio, p->bp); 423 pool_put(&p->raidPtr->pools.dqd, p); 424 } 425