1 /* $NetBSD: rf_raid.h,v 1.1 1998/11/13 04:20:32 oster Exp $ */ 2 /* 3 * Copyright (c) 1995 Carnegie-Mellon University. 4 * All rights reserved. 5 * 6 * Author: Mark Holland 7 * 8 * Permission to use, copy, modify and distribute this software and 9 * its documentation is hereby granted, provided that both the copyright 10 * notice and this permission notice appear in all copies of the 11 * software, derivative works or modified versions, and any portions 12 * thereof, and that both notices appear in supporting documentation. 13 * 14 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 15 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 16 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 17 * 18 * Carnegie Mellon requests users of this software to return to 19 * 20 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 21 * School of Computer Science 22 * Carnegie Mellon University 23 * Pittsburgh PA 15213-3890 24 * 25 * any improvements or extensions that they make and grant Carnegie the 26 * rights to redistribute these changes. 27 */ 28 29 /********************************************** 30 * rf_raid.h -- main header file for RAID driver 31 **********************************************/ 32 33 /* 34 * : 35 * Log: rf_raid.h,v 36 * Revision 1.48 1996/08/20 22:33:54 jimz 37 * make hist_diskreq a doubly-indexed array 38 * 39 * Revision 1.47 1996/07/15 05:40:41 jimz 40 * some recon datastructure cleanup 41 * better handling of multiple failures 42 * added undocumented double-recon test 43 * 44 * Revision 1.46 1996/07/10 22:28:51 jimz 45 * get rid of obsolete row statuses (dead,degraded2) 46 * 47 * Revision 1.45 1996/06/14 14:56:29 jimz 48 * make engine threading stuff ifndef SIMULATE 49 * 50 * Revision 1.44 1996/06/14 14:16:54 jimz 51 * move in engine node queue, atomicity control 52 * 53 * Revision 1.43 1996/06/12 04:41:26 jimz 54 * tweaks to make genplot work with user-level driver 55 * (mainly change stat collection) 56 * 57 * Revision 1.42 1996/06/11 10:57:17 jimz 58 * add recon_done_procs, recon_done_proc_mutex 59 * 60 * Revision 1.41 1996/06/11 01:26:48 jimz 61 * added mechanism for user-level to sync diskthread startup, 62 * shutdown 63 * 64 * Revision 1.40 1996/06/10 14:18:58 jimz 65 * move user, throughput stats into per-array structure 66 * 67 * Revision 1.39 1996/06/10 11:55:47 jimz 68 * Straightened out some per-array/not-per-array distinctions, fixed 69 * a couple bugs related to confusion. Added shutdown lists. Removed 70 * layout shutdown function (now subsumed by shutdown lists). 71 * 72 * Revision 1.38 1996/06/07 21:33:04 jimz 73 * begin using consistent types for sector numbers, 74 * stripe numbers, row+col numbers, recon unit numbers 75 * 76 * Revision 1.37 1996/06/05 19:38:32 jimz 77 * fixed up disk queueing types config 78 * added sstf disk queueing 79 * fixed exit bug on diskthreads (ref-ing bad mem) 80 * 81 * Revision 1.36 1996/06/05 18:06:02 jimz 82 * Major code cleanup. The Great Renaming is now done. 83 * Better modularity. Better typing. Fixed a bunch of 84 * synchronization bugs. Made a lot of global stuff 85 * per-desc or per-array. Removed dead code. 86 * 87 * Revision 1.35 1996/06/03 23:28:26 jimz 88 * more bugfixes 89 * check in tree to sync for IPDS runs with current bugfixes 90 * there still may be a problem with threads in the script test 91 * getting I/Os stuck- not trivially reproducible (runs ~50 times 92 * in a row without getting stuck) 93 * 94 * Revision 1.34 1996/06/02 17:31:48 jimz 95 * Moved a lot of global stuff into array structure, where it belongs. 96 * Fixed up paritylogging, pss modules in this manner. Some general 97 * code cleanup. Removed lots of dead code, some dead files. 98 * 99 * Revision 1.33 1996/05/30 23:22:16 jimz 100 * bugfixes of serialization, timing problems 101 * more cleanup 102 * 103 * Revision 1.32 1996/05/30 11:29:41 jimz 104 * Numerous bug fixes. Stripe lock release code disagreed with the taking code 105 * about when stripes should be locked (I made it consistent: no parity, no lock) 106 * There was a lot of extra serialization of I/Os which I've removed- a lot of 107 * it was to calculate values for the cache code, which is no longer with us. 108 * More types, function, macro cleanup. Added code to properly quiesce the array 109 * on shutdown. Made a lot of stuff array-specific which was (bogusly) general 110 * before. Fixed memory allocation, freeing bugs. 111 * 112 * Revision 1.31 1996/05/27 18:56:37 jimz 113 * more code cleanup 114 * better typing 115 * compiles in all 3 environments 116 * 117 * Revision 1.30 1996/05/24 22:17:04 jimz 118 * continue code + namespace cleanup 119 * typed a bunch of flags 120 * 121 * Revision 1.29 1996/05/23 21:46:35 jimz 122 * checkpoint in code cleanup (release prep) 123 * lots of types, function names have been fixed 124 * 125 * Revision 1.28 1996/05/23 00:33:23 jimz 126 * code cleanup: move all debug decls to rf_options.c, all extern 127 * debug decls to rf_options.h, all debug vars preceded by rf_ 128 * 129 * Revision 1.27 1996/05/18 19:51:34 jimz 130 * major code cleanup- fix syntax, make some types consistent, 131 * add prototypes, clean out dead code, et cetera 132 * 133 * Revision 1.26 1996/05/08 21:01:24 jimz 134 * fixed up enum type names that were conflicting with other 135 * enums and function names (ie, "panic") 136 * future naming trends will be towards RF_ and rf_ for 137 * everything raidframe-related 138 * 139 * Revision 1.25 1996/05/02 14:57:55 jimz 140 * add sectorMask 141 * 142 * Revision 1.24 1996/04/22 15:53:13 jimz 143 * MAX_RAIDS -> NRAIDFRAME 144 * 145 * Revision 1.23 1995/12/14 18:39:46 jimz 146 * convert to rf_types.h types 147 * 148 * Revision 1.22 1995/12/06 15:02:26 root 149 * added copyright info 150 * 151 * Revision 1.21 1995/10/09 17:39:24 jimz 152 * added info for tracking number of outstanding accesses 153 * at user-level 154 * 155 * Revision 1.20 1995/09/30 20:37:46 jimz 156 * added acc_totals to Raid for kernel 157 * 158 * Revision 1.19 1995/09/19 22:57:14 jimz 159 * add cache of raidid for kernel 160 * 161 * Revision 1.18 1995/09/18 16:50:04 jimz 162 * added RF_MAX_DISKS (for config ioctls) 163 * 164 * Revision 1.17 1995/09/07 19:02:31 jimz 165 * mods to get raidframe to compile and link 166 * in kernel environment 167 * 168 * Revision 1.16 1995/07/21 19:29:51 robby 169 * added some info for the idler to the Raid 170 * 171 * Revision 1.15 1995/07/16 03:19:14 cfb 172 * added cachePtr to *raidPtr 173 * 174 * Revision 1.14 1995/06/23 13:39:36 robby 175 * updeated to prototypes in rf_layout.h 176 * 177 */ 178 179 #ifndef _RF__RF_RAID_H_ 180 #define _RF__RF_RAID_H_ 181 182 #ifdef _KERNEL 183 #define KERNEL 184 #endif 185 186 #include "rf_archs.h" 187 #include "rf_types.h" 188 #include "rf_threadstuff.h" 189 190 #if defined(__NetBSD__) && defined(_KERNEL) 191 #include "rf_netbsd.h" 192 #endif 193 194 #ifdef KERNEL 195 /* XXX Needs to be added. GO 196 #include <raidframe.h> 197 */ 198 #include <sys/disklabel.h> 199 #else /* KERNEL */ 200 #include <stdio.h> 201 #include <assert.h> 202 #endif /* KERNEL */ 203 #include <sys/types.h> 204 205 #include "rf_alloclist.h" 206 #include "rf_stripelocks.h" 207 #include "rf_layout.h" 208 #include "rf_disks.h" 209 #include "rf_debugMem.h" 210 #include "rf_diskqueue.h" 211 #include "rf_reconstruct.h" 212 #include "rf_acctrace.h" 213 214 #if RF_INCLUDE_PARITYLOGGING > 0 215 #include "rf_paritylog.h" 216 #endif /* RF_INCLUDE_PARITYLOGGING > 0 */ 217 218 #define RF_MAX_DISKS 128 /* max disks per array */ 219 #ifdef __NetBSD__ 220 #define RF_DEV2RAIDID(_dev) (DISKUNIT(_dev)) 221 #else 222 #define RF_DEV2RAIDID(_dev) (minor(_dev)>>6) /* convert dev_t to raid id */ 223 #endif 224 225 /* 226 * Each row in the array is a distinct parity group, so 227 * each has it's own status, which is one of the following. 228 */ 229 typedef enum RF_RowStatus_e { 230 rf_rs_optimal, 231 rf_rs_degraded, 232 rf_rs_reconstructing, 233 rf_rs_reconfigured 234 } RF_RowStatus_t; 235 236 struct RF_CumulativeStats_s { 237 struct timeval start; /* the time when the stats were last started*/ 238 struct timeval stop; /* the time when the stats were last stopped */ 239 long sum_io_us; /* sum of all user response times (us) */ 240 long num_ios; /* total number of I/Os serviced */ 241 long num_sect_moved; /* total number of sectors read or written */ 242 }; 243 244 struct RF_ThroughputStats_s { 245 RF_DECLARE_MUTEX(mutex)/* a mutex used to lock the configuration stuff */ 246 struct timeval start; /* timer started when numOutstandingRequests moves from 0 to 1 */ 247 struct timeval stop; /* timer stopped when numOutstandingRequests moves from 1 to 0 */ 248 RF_uint64 sum_io_us; /* total time timer is enabled */ 249 RF_uint64 num_ios; /* total number of ios processed by RAIDframe */ 250 long num_out_ios; /* number of outstanding ios */ 251 }; 252 253 #ifdef SIMULATE 254 typedef struct RF_PendingRecon_s RF_PendingRecon_t; 255 struct RF_PendingRecon_s { 256 RF_RowCol_t row; 257 RF_RowCol_t col; 258 RF_PendingRecon_t *next; 259 }; 260 #endif /* SIMULATE */ 261 262 struct RF_Raid_s { 263 /* This portion never changes, and can be accessed without locking */ 264 /* an exception is Disks[][].status, which requires locking when it is changed */ 265 u_int numRow; /* number of rows of disks, typically == # of ranks */ 266 u_int numCol; /* number of columns of disks, typically == # of disks/rank */ 267 u_int numSpare; /* number of spare disks */ 268 int maxQueueDepth; /* max disk queue depth */ 269 RF_SectorCount_t totalSectors; /* total number of sectors in the array */ 270 RF_SectorCount_t sectorsPerDisk; /* number of sectors on each disk */ 271 u_int logBytesPerSector; /* base-2 log of the number of bytes in a sector */ 272 u_int bytesPerSector; /* bytes in a sector */ 273 RF_int32 sectorMask; /* mask of bytes-per-sector */ 274 275 RF_RaidLayout_t Layout; /* all information related to layout */ 276 RF_RaidDisk_t **Disks; /* all information related to physical disks */ 277 RF_DiskQueue_t **Queues; /* all information related to disk queues */ 278 /* NOTE: This is an anchor point via which the queues can be accessed, 279 * but the enqueue/dequeue routines in diskqueue.c use a local copy of 280 * this pointer for the actual accesses. 281 */ 282 /* The remainder of the structure can change, and therefore requires locking on reads and updates */ 283 RF_DECLARE_MUTEX(mutex) /* mutex used to serialize access to the fields below */ 284 RF_RowStatus_t *status; /* the status of each row in the array */ 285 int valid; /* indicates successful configuration */ 286 RF_LockTableEntry_t *lockTable; /* stripe-lock table */ 287 RF_LockTableEntry_t *quiesceLock; /* quiesnce table */ 288 int numFailures; /* total number of failures in the array */ 289 290 /* 291 * Cleanup stuff 292 */ 293 RF_ShutdownList_t *shutdownList; /* shutdown activities */ 294 RF_AllocListElem_t *cleanupList; /* memory to be freed at shutdown time */ 295 296 /* 297 * Recon stuff 298 */ 299 RF_HeadSepLimit_t headSepLimit; 300 int numFloatingReconBufs; 301 int reconInProgress; 302 #ifdef SIMULATE 303 RF_PendingRecon_t *pendingRecon; 304 #endif /* SIMULATE */ 305 RF_DECLARE_COND(waitForReconCond) 306 RF_RaidReconDesc_t *reconDesc; /* reconstruction descriptor */ 307 RF_ReconCtrl_t **reconControl; /* reconstruction control structure pointers for each row in the array */ 308 309 #if !defined(KERNEL) && !defined(SIMULATE) 310 /* 311 * Disk thread stuff 312 */ 313 int diskthreads_created; 314 int diskthreads_running; 315 int diskthreads_shutdown; 316 RF_DECLARE_MUTEX(diskthread_count_mutex) 317 RF_DECLARE_COND(diskthread_count_cond) 318 #endif /* !KERNEL && !SIMULATE */ 319 320 /* 321 * Array-quiescence stuff 322 */ 323 RF_DECLARE_MUTEX(access_suspend_mutex) 324 RF_DECLARE_COND(quiescent_cond) 325 RF_IoCount_t accesses_suspended; 326 RF_IoCount_t accs_in_flight; 327 int access_suspend_release; 328 int waiting_for_quiescence; 329 RF_CallbackDesc_t *quiesce_wait_list; 330 331 /* 332 * Statistics 333 */ 334 #if !defined(KERNEL) && !defined(SIMULATE) 335 RF_ThroughputStats_t throughputstats; 336 #endif /* !KERNEL && !SIMULATE */ 337 RF_CumulativeStats_t userstats; 338 339 /* 340 * Engine thread control 341 */ 342 RF_DECLARE_MUTEX(node_queue_mutex) 343 RF_DECLARE_COND(node_queue_cond) 344 RF_DagNode_t *node_queue; 345 #ifndef SIMULATE 346 RF_Thread_t engine_thread; 347 RF_ThreadGroup_t engine_tg; 348 #endif /* !SIMULATE */ 349 int shutdown_engine; 350 int dags_in_flight; /* debug */ 351 352 /* 353 * PSS (Parity Stripe Status) stuff 354 */ 355 RF_FreeList_t *pss_freelist; 356 long pssTableSize; 357 358 /* 359 * Reconstruction stuff 360 */ 361 int procsInBufWait; 362 int numFullReconBuffers; 363 RF_AccTraceEntry_t *recon_tracerecs; 364 unsigned long accumXorTimeUs; 365 RF_ReconDoneProc_t *recon_done_procs; 366 RF_DECLARE_MUTEX(recon_done_proc_mutex) 367 368 #if !defined(KERNEL) && !defined(SIMULATE) 369 RF_Thread_t **diskthreads, *sparediskthreads; /* thread descriptors for disk threads in user-level version */ 370 #endif /* !KERNEL && !SIMULATE */ 371 372 /* 373 * nAccOutstanding, waitShutdown protected by desc freelist lock 374 * (This may seem strange, since that's a central serialization point 375 * for a per-array piece of data, but otherwise, it'd be an extra 376 * per-array lock, and that'd only be less efficient...) 377 */ 378 RF_DECLARE_COND(outstandingCond) 379 int waitShutdown; 380 int nAccOutstanding; 381 382 RF_DiskId_t **diskids; 383 RF_DiskId_t *sparediskids; 384 385 #ifdef KERNEL 386 int raidid; 387 #endif /* KERNEL */ 388 RF_AccTotals_t acc_totals; 389 int keep_acc_totals; 390 391 #ifdef _KERNEL 392 struct raidcinfo **raid_cinfo; /* array of component info */ 393 struct proc *proc; /* XXX shouldn't be needed here.. :-p */ 394 #endif 395 396 int terminate_disk_queues; 397 398 /* 399 * XXX 400 * 401 * config-specific information should be moved 402 * somewhere else, or at least hung off this 403 * in some generic way 404 */ 405 406 /* used by rf_compute_workload_shift */ 407 RF_RowCol_t hist_diskreq[RF_MAXROW][RF_MAXCOL]; 408 409 /* used by declustering */ 410 int noRotate; 411 412 #if RF_INCLUDE_PARITYLOGGING > 0 413 /* used by parity logging */ 414 RF_SectorCount_t regionLogCapacity; 415 RF_ParityLogQueue_t parityLogPool; /* pool of unused parity logs */ 416 RF_RegionInfo_t *regionInfo; /* array of region state */ 417 int numParityLogs; 418 int numSectorsPerLog; 419 int regionParityRange; 420 int logsInUse; /* debugging */ 421 RF_ParityLogDiskQueue_t parityLogDiskQueue; /* state of parity logging disk work */ 422 RF_RegionBufferQueue_t regionBufferPool; /* buffers for holding region log */ 423 RF_RegionBufferQueue_t parityBufferPool; /* buffers for holding parity */ 424 caddr_t parityLogBufferHeap; /* pool of unused parity logs */ 425 #ifndef SIMULATE 426 RF_Thread_t pLogDiskThreadHandle; 427 #endif /* !SIMULATE */ 428 429 #endif /* RF_INCLUDE_PARITYLOGGING > 0 */ 430 }; 431 432 #endif /* !_RF__RF_RAID_H_ */ 433