/*	$NetBSD: rf_dagffwr.c,v 1.38 2023/10/15 18:15:20 oster Exp $	*/
/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 * rf_dagffwr.c
 *
 * code for creating fault-free write DAGs
 *
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_dagffwr.c,v 1.38 2023/10/15 18:15:20 oster Exp $");

#include <dev/raidframe/raidframevar.h>

#include "rf_raid.h"
#include "rf_dag.h"
#include "rf_dagutils.h"
#include "rf_dagfuncs.h"
#include "rf_debugMem.h"
#include "rf_dagffrd.h"
#include "rf_general.h"
#include "rf_dagffwr.h"
#include "rf_map.h"

/******************************************************************************
 *
 * General comments on DAG creation:
 *
 * All DAGs in this file use roll-away error recovery.  Each DAG has a single
 * commit node, usually called "Cmt".  If an error occurs before the Cmt node
 * is reached, the execution engine will halt forward execution and work
 * backward through the graph, executing the undo functions.  Assuming that
 * each node in the graph prior to the Cmt node is undoable and atomic, or
 * makes no changes to permanent state, the graph will fail atomically.
 * If an error occurs after the Cmt node executes, the engine will roll
 * forward through the graph, blindly executing nodes until it reaches the
 * end.  A graph that reaches the end is assumed to have completed
 * successfully.
 *
 * A graph has exactly one Cmt node.
 *
 */
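
/*
 * Worked example (editor's note, not part of the original comments): in
 * the large-write graph below, the Rod reads and the Xor all execute
 * before Cmt; if any of them fails, the engine undoes whatever completed
 * and the array is left untouched.  Once Cmt fires, the Wnd and Wnp
 * writes are driven to completion even if one of them reports an error,
 * so recovery past the commit point is strictly roll-forward.
 */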


/******************************************************************************
 *
 * The following wrappers map the standard DAG creation interface to the
 * DAG creation routines.  Additionally, these wrappers enable experimentation
 * with new DAG structures by providing an extra level of indirection, allowing
 * the DAG creation routines to be replaced at this single point.
 */

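/*
 * Illustrative sketch (editor's note; "rf_CreateExperimentalLargeWriteDAG"
 * is a hypothetical name, not part of RAIDframe): swapping in a new
 * large-write DAG structure changes only the wrapper body, never its
 * callers:
 *
 *	void
 *	rf_CreateLargeWriteDAG(...)
 *	{
 *		rf_CreateExperimentalLargeWriteDAG(...);
 *	}
 */
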
void
rf_CreateNonRedundantWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
    RF_DagHeader_t *dag_h, void *bp, RF_RaidAccessFlags_t flags,
    RF_AllocListElem_t *allocList, RF_IoType_t type)
{
	rf_CreateNonredundantDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
	    RF_IO_TYPE_WRITE);
}

void
rf_CreateRAID0WriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
    RF_DagHeader_t *dag_h, void *bp, RF_RaidAccessFlags_t flags,
    RF_AllocListElem_t *allocList, RF_IoType_t type)
{
	rf_CreateNonredundantDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
	    RF_IO_TYPE_WRITE);
}

void
rf_CreateSmallWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
    RF_DagHeader_t *dag_h, void *bp, RF_RaidAccessFlags_t flags,
    RF_AllocListElem_t *allocList)
{
	/* "normal" rollaway */
	rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags,
	    allocList, &rf_xorFuncs, NULL);
}

void
rf_CreateLargeWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
    RF_DagHeader_t *dag_h, void *bp, RF_RaidAccessFlags_t flags,
    RF_AllocListElem_t *allocList)
{
	/* "normal" rollaway */
	rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags,
	    allocList, 1, rf_RegularXorFunc, RF_TRUE);
}


/******************************************************************************
 *
 * DAG creation code begins here
 */
#define BUF_ALLOC(num) \
	RF_MallocAndAdd(rf_RaidAddressToByte(raidPtr, num), allocList)
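
/*
 * Example (editor's note): BUF_ALLOC(raidPtr->Layout.sectorsPerStripeUnit)
 * expands to
 *
 *	RF_MallocAndAdd(rf_RaidAddressToByte(raidPtr,
 *	    raidPtr->Layout.sectorsPerStripeUnit), allocList)
 *
 * i.e. it converts a sector count into a byte count and charges the
 * allocation to the DAG's allocList, so the buffer is released along with
 * the rest of the DAG's allocations.  The macro expects "raidPtr" and
 * "allocList" to be in scope at the call site.
 */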


/******************************************************************************
 *
 * creates a DAG to perform a large-write operation:
 *
 *           / Rod \           / Wnd \
 * H -- block- Rod - Xor - Cmt - Wnd --- T
 *           \ Rod /           \ Wnp /
 *                              \[Wnq]/
 *
 * The XOR node also does the Q calculation in the P+Q architecture.
 * All nodes before the commit node (Cmt) are assumed to be atomic and
 * undoable, or else to make no changes to permanent state.
 *
 * Rod = read old data
 * Cmt = commit node
 * Wnp = write new parity
 * Wnd = write new data
 * Wnq = write new "q"
 * [] denotes optional segments in the graph
 *
 * Parameters:  raidPtr   - description of the physical array
 *              asmap     - logical & physical addresses for this access
 *              bp        - buffer ptr (holds write data)
 *              flags     - general flags (e.g. disk locking)
 *              allocList - list of memory allocated in DAG creation
 *              nfaults   - number of faults array can tolerate
 *                          (equal to # redundancy units in stripe)
 *              redfuncs  - list of redundancy generating functions
 *
 *****************************************************************************/

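/*
 * Worked example (editor's note): for a single-fault array (nfaults == 1)
 * where the access covers 3 of 4 data units in the stripe, this routine
 * builds 3 Wnd nodes, Rod nodes covering the unaccessed portion of the
 * stripe (here typically one stripe unit, as mapped by
 * rf_MapUnaccessedPortionOfStripe() below), and one each of Nil, Xor,
 * Cmt, Wnp, and Trm.
 */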
void
rf_CommonCreateLargeWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
    RF_DagHeader_t *dag_h, void *bp, RF_RaidAccessFlags_t flags,
    RF_AllocListElem_t *allocList, int nfaults,
    void (*redFunc) (RF_DagNode_t *), int allowBufferRecycle)
{
	RF_DagNode_t *wndNodes, *rodNodes, *xorNode, *wnpNode, *tmpNode;
	RF_DagNode_t *blockNode, *commitNode, *termNode;
#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
	RF_DagNode_t *wnqNode;
#endif
	int nWndNodes, nRodNodes, i, nodeNum, asmNum;
	RF_AccessStripeMapHeader_t *new_asm_h[2];
	RF_StripeNum_t parityStripeID;
	char *sosBuffer, *eosBuffer;
	RF_ReconUnitNum_t which_ru;
	RF_RaidLayout_t *layoutPtr;
	RF_PhysDiskAddr_t *pda;

	layoutPtr = &(raidPtr->Layout);
	parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr,
	    asmap->raidAddress, &which_ru);

#if RF_DEBUG_DAG
	if (rf_dagDebug) {
		printf("[Creating large-write DAG]\n");
	}
#endif
	dag_h->creator = "LargeWriteDAG";

	dag_h->numCommitNodes = 1;
	dag_h->numCommits = 0;
	dag_h->numSuccedents = 1;

	/* alloc the nodes: Wnd, xor, commit, block, term, and Wnp */
	nWndNodes = asmap->numStripeUnitsAccessed;

	for (i = 0; i < nWndNodes; i++) {
		tmpNode = rf_AllocDAGNode(raidPtr);
		tmpNode->list_next = dag_h->nodes;
		dag_h->nodes = tmpNode;
	}
	wndNodes = dag_h->nodes;

	xorNode = rf_AllocDAGNode(raidPtr);
	xorNode->list_next = dag_h->nodes;
	dag_h->nodes = xorNode;

	wnpNode = rf_AllocDAGNode(raidPtr);
	wnpNode->list_next = dag_h->nodes;
	dag_h->nodes = wnpNode;

	blockNode = rf_AllocDAGNode(raidPtr);
	blockNode->list_next = dag_h->nodes;
	dag_h->nodes = blockNode;

	commitNode = rf_AllocDAGNode(raidPtr);
	commitNode->list_next = dag_h->nodes;
	dag_h->nodes = commitNode;

	termNode = rf_AllocDAGNode(raidPtr);
	termNode->list_next = dag_h->nodes;
	dag_h->nodes = termNode;

#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
	if (nfaults == 2) {
		wnqNode = rf_AllocDAGNode(raidPtr);
	} else {
		wnqNode = NULL;
	}
#endif
	rf_MapUnaccessedPortionOfStripe(raidPtr, layoutPtr, asmap, dag_h,
	    new_asm_h, &nRodNodes, &sosBuffer, &eosBuffer, allocList);
	if (nRodNodes > 0) {
		for (i = 0; i < nRodNodes; i++) {
			tmpNode = rf_AllocDAGNode(raidPtr);
			tmpNode->list_next = dag_h->nodes;
			dag_h->nodes = tmpNode;
		}
		rodNodes = dag_h->nodes;
	} else {
		rodNodes = NULL;
	}

	/* begin node initialization */
	if (nRodNodes > 0) {
		rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc,
		    rf_NullNodeUndoFunc, NULL, nRodNodes, 0, 0, 0,
		    dag_h, "Nil", allocList);
	} else {
		rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc,
		    rf_NullNodeUndoFunc, NULL, 1, 0, 0, 0,
		    dag_h, "Nil", allocList);
	}

	rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc,
	    rf_NullNodeUndoFunc, NULL, nWndNodes + nfaults, 1, 0, 0,
	    dag_h, "Cmt", allocList);
	rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc,
	    rf_TerminateUndoFunc, NULL, 0, nWndNodes + nfaults, 0, 0,
	    dag_h, "Trm", allocList);

	/* initialize the Rod nodes */
	tmpNode = rodNodes;
	for (nodeNum = asmNum = 0; asmNum < 2; asmNum++) {
		if (new_asm_h[asmNum]) {
			pda = new_asm_h[asmNum]->stripeMap->physInfo;
			while (pda) {
				rf_InitNode(tmpNode, rf_wait,
				    RF_FALSE, rf_DiskReadFunc,
				    rf_DiskReadUndoFunc,
				    rf_GenericWakeupFunc,
				    1, 1, 4, 0, dag_h,
				    "Rod", allocList);
				tmpNode->params[0].p = pda;
				tmpNode->params[1].p = pda->bufPtr;
				tmpNode->params[2].v = parityStripeID;
				tmpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
				    which_ru);
				nodeNum++;
				pda = pda->next;
				tmpNode = tmpNode->list_next;
			}
		}
	}
	RF_ASSERT(nodeNum == nRodNodes);

	/* initialize the wnd nodes */
	pda = asmap->physInfo;
	tmpNode = wndNodes;
	for (i = 0; i < nWndNodes; i++) {
		rf_InitNode(tmpNode, rf_wait, RF_FALSE,
		    rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
		    rf_GenericWakeupFunc, 1, 1, 4, 0,
		    dag_h, "Wnd", allocList);
		RF_ASSERT(pda != NULL);
		tmpNode->params[0].p = pda;
		tmpNode->params[1].p = pda->bufPtr;
		tmpNode->params[2].v = parityStripeID;
		tmpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
		pda = pda->next;
		tmpNode = tmpNode->list_next;
	}

	/* initialize the redundancy node */
	if (nRodNodes > 0) {
		rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc,
		    rf_NullNodeUndoFunc, NULL, 1,
		    nRodNodes, 2 * (nWndNodes + nRodNodes) + 1,
		    nfaults, dag_h, "Xr ", allocList);
	} else {
		rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc,
		    rf_NullNodeUndoFunc, NULL, 1,
		    1, 2 * (nWndNodes + nRodNodes) + 1,
		    nfaults, dag_h, "Xr ", allocList);
	}
	xorNode->flags |= RF_DAGNODE_FLAG_YIELD;
	tmpNode = wndNodes;
	for (i = 0; i < nWndNodes; i++) {
		/* pda */
		xorNode->params[2 * i + 0] = tmpNode->params[0];
		/* buf ptr */
		xorNode->params[2 * i + 1] = tmpNode->params[1];
		tmpNode = tmpNode->list_next;
	}
	tmpNode = rodNodes;
	for (i = 0; i < nRodNodes; i++) {
		/* pda */
		xorNode->params[2 * (nWndNodes + i) + 0] = tmpNode->params[0];
		/* buf ptr */
		xorNode->params[2 * (nWndNodes + i) + 1] = tmpNode->params[1];
		tmpNode = tmpNode->list_next;
	}
	/* xor node needs to get at RAID information */
	xorNode->params[2 * (nWndNodes + nRodNodes)].p = raidPtr;

	/*
	 * Look for an Rod node that reads a complete SU.  If none,
	 * alloc a buffer to receive the parity info.  Note that we
	 * can't use a new data buffer because it will not have gotten
	 * written when the xor occurs.
	 */
	if (allowBufferRecycle) {
		tmpNode = rodNodes;
		for (i = 0; i < nRodNodes; i++) {
			if (((RF_PhysDiskAddr_t *) tmpNode->params[0].p)->numSector == raidPtr->Layout.sectorsPerStripeUnit)
				break;
			tmpNode = tmpNode->list_next;
		}
	}
	if ((!allowBufferRecycle) || (i == nRodNodes)) {
		xorNode->results[0] = rf_AllocBuffer(raidPtr, dag_h,
		    rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit));
	} else {
		/*
		 * This works because the only way we get here is if
		 * allowBufferRecycle is true and we went through the
		 * above for loop, and exited via the break before
		 * i == nRodNodes was true.  That means tmpNode still
		 * points to a valid node -- the one we want here!
		 */
		xorNode->results[0] = tmpNode->params[1].p;
	}

	/* initialize the Wnp node */
	rf_InitNode(wnpNode, rf_wait, RF_FALSE, rf_DiskWriteFunc,
	    rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0,
	    dag_h, "Wnp", allocList);
	wnpNode->params[0].p = asmap->parityInfo;
	wnpNode->params[1].p = xorNode->results[0];
	wnpNode->params[2].v = parityStripeID;
	wnpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
	/* parityInfo must describe entire parity unit */
	RF_ASSERT(asmap->parityInfo->next == NULL);

#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
	if (nfaults == 2) {
		/*
		 * We never try to recycle a buffer for the Q calculation
		 * in addition to the parity.  This would cause two buffers
		 * to get smashed during the P and Q calculation, guaranteeing
		 * one would be wrong.
		 */
		xorNode->results[1] =
		    BUF_ALLOC(raidPtr->Layout.sectorsPerStripeUnit);
		rf_InitNode(wnqNode, rf_wait, RF_FALSE, rf_DiskWriteFunc,
		    rf_DiskWriteUndoFunc, rf_GenericWakeupFunc,
		    1, 1, 4, 0, dag_h, "Wnq", allocList);
		wnqNode->params[0].p = asmap->qInfo;
		wnqNode->params[1].p = xorNode->results[1];
		wnqNode->params[2].v = parityStripeID;
		wnqNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
		/* qInfo must describe entire q unit */
		RF_ASSERT(asmap->qInfo->next == NULL);
	}
#endif
	/*
	 * Connect nodes to form graph.
	 */

	/* connect dag header to block node */
	RF_ASSERT(blockNode->numAntecedents == 0);
	dag_h->succedents[0] = blockNode;

	if (nRodNodes > 0) {
		/* connect the block node to the Rod nodes */
		RF_ASSERT(blockNode->numSuccedents == nRodNodes);
		RF_ASSERT(xorNode->numAntecedents == nRodNodes);
		tmpNode = rodNodes;
		for (i = 0; i < nRodNodes; i++) {
			RF_ASSERT(tmpNode->numAntecedents == 1);
			blockNode->succedents[i] = tmpNode;
			tmpNode->antecedents[0] = blockNode;
			tmpNode->antType[0] = rf_control;

			/* connect the Rod nodes to the Xor node */
			RF_ASSERT(tmpNode->numSuccedents == 1);
			tmpNode->succedents[0] = xorNode;
			xorNode->antecedents[i] = tmpNode;
			xorNode->antType[i] = rf_trueData;
			tmpNode = tmpNode->list_next;
		}
	} else {
		/* connect the block node to the Xor node */
		RF_ASSERT(blockNode->numSuccedents == 1);
		RF_ASSERT(xorNode->numAntecedents == 1);
		blockNode->succedents[0] = xorNode;
		xorNode->antecedents[0] = blockNode;
		xorNode->antType[0] = rf_control;
	}

	/* connect the xor node to the commit node */
	RF_ASSERT(xorNode->numSuccedents == 1);
	RF_ASSERT(commitNode->numAntecedents == 1);
	xorNode->succedents[0] = commitNode;
	commitNode->antecedents[0] = xorNode;
	commitNode->antType[0] = rf_control;

	/* connect the commit node to the write nodes */
	RF_ASSERT(commitNode->numSuccedents == nWndNodes + nfaults);
	tmpNode = wndNodes;
	for (i = 0; i < nWndNodes; i++) {
		RF_ASSERT(tmpNode->numAntecedents == 1);
		commitNode->succedents[i] = tmpNode;
		tmpNode->antecedents[0] = commitNode;
		tmpNode->antType[0] = rf_control;
		tmpNode = tmpNode->list_next;
	}
	RF_ASSERT(wnpNode->numAntecedents == 1);
	commitNode->succedents[nWndNodes] = wnpNode;
	wnpNode->antecedents[0] = commitNode;
	wnpNode->antType[0] = rf_trueData;
#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
	if (nfaults == 2) {
		RF_ASSERT(wnqNode->numAntecedents == 1);
		commitNode->succedents[nWndNodes + 1] = wnqNode;
		wnqNode->antecedents[0] = commitNode;
		wnqNode->antType[0] = rf_trueData;
	}
#endif
	/* connect the write nodes to the term node */
	RF_ASSERT(termNode->numAntecedents == nWndNodes + nfaults);
	RF_ASSERT(termNode->numSuccedents == 0);
	tmpNode = wndNodes;
	for (i = 0; i < nWndNodes; i++) {
		RF_ASSERT(tmpNode->numSuccedents == 1);
		tmpNode->succedents[0] = termNode;
		termNode->antecedents[i] = tmpNode;
		termNode->antType[i] = rf_control;
		tmpNode = tmpNode->list_next;
	}
	RF_ASSERT(wnpNode->numSuccedents == 1);
	wnpNode->succedents[0] = termNode;
	termNode->antecedents[nWndNodes] = wnpNode;
	termNode->antType[nWndNodes] = rf_control;
#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
	if (nfaults == 2) {
		RF_ASSERT(wnqNode->numSuccedents == 1);
		wnqNode->succedents[0] = termNode;
		termNode->antecedents[nWndNodes + 1] = wnqNode;
		termNode->antType[nWndNodes + 1] = rf_control;
	}
#endif
}
/******************************************************************************
 *
 * creates a DAG to perform a small-write operation (either raid 5 or pq),
 * which is as follows:
 *
 * Hdr -> Nil -> Rop -> Xor -> Cmt ----> Wnp [Unp] --> Trm
 *            \- Rod X      /     \----> Wnd [Und]-/
 *           [\- Rod X     /       \---> Wnd [Und]-/]
 *           [\- Roq -> Q /         \--> Wnq [Unq]-/]
 *
 * Rop = read old parity
 * Rod = read old data
 * Roq = read old "q"
 * Cmt = commit node
 * Und = unlock data disk
 * Unp = unlock parity disk
 * Unq = unlock q disk
 * Wnp = write new parity
 * Wnd = write new data
 * Wnq = write new "q"
 * [ ] denotes optional segments in the graph
 *
 * Parameters:  raidPtr   - description of the physical array
 *              asmap     - logical & physical addresses for this access
 *              bp        - buffer ptr (holds write data)
 *              flags     - general flags (e.g. disk locking)
 *              allocList - list of memory allocated in DAG creation
 *              pfuncs    - list of parity generating functions
 *              qfuncs    - list of q generating functions
 *
 * A null qfuncs indicates a single-fault-tolerant array.
 *****************************************************************************/
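
/*
 * Example (editor's note): for RAID 5, the rf_CreateSmallWriteDAG()
 * wrapper above calls this routine with pfuncs = &rf_xorFuncs and
 * qfuncs = NULL, so nfaults == 1 and the Roq/Q/Wnq segments drop out of
 * the graph; a double-fault-tolerant caller passes a non-NULL qfuncs to
 * enable them.
 */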

void
rf_CommonCreateSmallWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
    RF_DagHeader_t *dag_h, void *bp, RF_RaidAccessFlags_t flags,
    RF_AllocListElem_t *allocList, const RF_RedFuncs_t *pfuncs,
    const RF_RedFuncs_t *qfuncs)
{
	RF_DagNode_t *readDataNodes, *readParityNodes, *termNode;
	RF_DagNode_t *tmpNode, *tmpreadDataNode, *tmpreadParityNode;
	RF_DagNode_t *xorNodes, *blockNode, *commitNode;
	RF_DagNode_t *writeDataNodes, *writeParityNodes;
	RF_DagNode_t *tmpxorNode, *tmpwriteDataNode;
	RF_DagNode_t *tmpwriteParityNode;
#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
	RF_DagNode_t *tmpwriteQNode, *tmpreadQNode, *tmpqNode, *readQNodes,
	    *writeQNodes, *qNodes;
#endif
	int i, j, nNodes;
	RF_ReconUnitNum_t which_ru;
	void (*func) (RF_DagNode_t *), (*undoFunc) (RF_DagNode_t *);
	void (*qfunc) (RF_DagNode_t *) __unused;
	int numDataNodes, numParityNodes;
	RF_StripeNum_t parityStripeID;
	RF_PhysDiskAddr_t *pda;
	const char *name, *qname __unused;
	long nfaults;

	nfaults = qfuncs ? 2 : 1;

	parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout),
	    asmap->raidAddress, &which_ru);
	pda = asmap->physInfo;
	numDataNodes = asmap->numStripeUnitsAccessed;
	numParityNodes = (asmap->parityInfo->next) ? 2 : 1;

#if RF_DEBUG_DAG
	if (rf_dagDebug) {
		printf("[Creating small-write DAG]\n");
	}
#endif
	RF_ASSERT(numDataNodes > 0);
	dag_h->creator = "SmallWriteDAG";

	dag_h->numCommitNodes = 1;
	dag_h->numCommits = 0;
	dag_h->numSuccedents = 1;

	/*
	 * DAG creation occurs in four steps:
	 * 1. count the number of nodes in the DAG
	 * 2. create the nodes
	 * 3. initialize the nodes
	 * 4. connect the nodes
	 */

	/*
	 * Step 1. compute number of nodes in the graph
	 */

	/*
	 * Number of nodes:
	 *   a read and a write for each data unit,
	 *   a redundancy computation node for each parity node
	 *	(nfaults * numParityNodes),
	 *   a read and a write for each parity unit,
	 *   a block node and a commit node (2),
	 *   a terminate node, and,
	 *   if atomic RMW, an unlock node for each data unit and each
	 *	redundancy unit.
	 *
	 * totalNumNodes = (2 * numDataNodes) + (nfaults * numParityNodes)
	 *	+ (nfaults * 2 * numParityNodes) + 3;
	 */
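
	/*
	 * Worked example (editor's note): a single-fault small write
	 * (nfaults == 1) touching two data units with a single parity pda
	 * gives numDataNodes = 2 and numParityNodes = 1, so
	 * totalNumNodes = (2 * 2) + (1 * 1) + (1 * 2 * 1) + 3 = 10:
	 * 2 Rod + 2 Wnd + 1 Xor + 1 Rop + 1 Wnp + Nil + Cmt + Trm.
	 */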

	/*
	 * Step 2. create the nodes
	 */

	blockNode = rf_AllocDAGNode(raidPtr);
	blockNode->list_next = dag_h->nodes;
	dag_h->nodes = blockNode;

	commitNode = rf_AllocDAGNode(raidPtr);
	commitNode->list_next = dag_h->nodes;
	dag_h->nodes = commitNode;

	for (i = 0; i < numDataNodes; i++) {
		tmpNode = rf_AllocDAGNode(raidPtr);
		tmpNode->list_next = dag_h->nodes;
		dag_h->nodes = tmpNode;
	}
	readDataNodes = dag_h->nodes;

	for (i = 0; i < numParityNodes; i++) {
		tmpNode = rf_AllocDAGNode(raidPtr);
		tmpNode->list_next = dag_h->nodes;
		dag_h->nodes = tmpNode;
	}
	readParityNodes = dag_h->nodes;

	for (i = 0; i < numDataNodes; i++) {
		tmpNode = rf_AllocDAGNode(raidPtr);
		tmpNode->list_next = dag_h->nodes;
		dag_h->nodes = tmpNode;
	}
	writeDataNodes = dag_h->nodes;

	for (i = 0; i < numParityNodes; i++) {
		tmpNode = rf_AllocDAGNode(raidPtr);
		tmpNode->list_next = dag_h->nodes;
		dag_h->nodes = tmpNode;
	}
	writeParityNodes = dag_h->nodes;

	for (i = 0; i < numParityNodes; i++) {
		tmpNode = rf_AllocDAGNode(raidPtr);
		tmpNode->list_next = dag_h->nodes;
		dag_h->nodes = tmpNode;
	}
	xorNodes = dag_h->nodes;

	termNode = rf_AllocDAGNode(raidPtr);
	termNode->list_next = dag_h->nodes;
	dag_h->nodes = termNode;

#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
	if (nfaults == 2) {
		for (i = 0; i < numParityNodes; i++) {
			tmpNode = rf_AllocDAGNode(raidPtr);
			tmpNode->list_next = dag_h->nodes;
			dag_h->nodes = tmpNode;
		}
		readQNodes = dag_h->nodes;

		for (i = 0; i < numParityNodes; i++) {
			tmpNode = rf_AllocDAGNode(raidPtr);
			tmpNode->list_next = dag_h->nodes;
			dag_h->nodes = tmpNode;
		}
		writeQNodes = dag_h->nodes;

		for (i = 0; i < numParityNodes; i++) {
			tmpNode = rf_AllocDAGNode(raidPtr);
			tmpNode->list_next = dag_h->nodes;
			dag_h->nodes = tmpNode;
		}
		qNodes = dag_h->nodes;
	} else {
		readQNodes = writeQNodes = qNodes = NULL;
	}
#endif

	/*
	 * Step 3. initialize the nodes
	 */
	/* initialize block node (Nil) */
	nNodes = numDataNodes + (nfaults * numParityNodes);
	rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc,
	    rf_NullNodeUndoFunc, NULL, nNodes, 0, 0, 0,
	    dag_h, "Nil", allocList);

	/* initialize commit node (Cmt) */
	rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc,
	    rf_NullNodeUndoFunc, NULL, nNodes,
	    (nfaults * numParityNodes), 0, 0, dag_h, "Cmt", allocList);

	/* initialize terminate node (Trm) */
	rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc,
	    rf_TerminateUndoFunc, NULL, 0, nNodes, 0, 0,
	    dag_h, "Trm", allocList);

	/* initialize nodes which read old data (Rod) */
	tmpreadDataNode = readDataNodes;
	for (i = 0; i < numDataNodes; i++) {
		rf_InitNode(tmpreadDataNode, rf_wait, RF_FALSE,
		    rf_DiskReadFunc, rf_DiskReadUndoFunc,
		    rf_GenericWakeupFunc, (nfaults * numParityNodes),
		    1, 4, 0, dag_h, "Rod", allocList);
		RF_ASSERT(pda != NULL);
		/* physical disk addr desc */
		tmpreadDataNode->params[0].p = pda;
		/* buffer to hold old data */
		tmpreadDataNode->params[1].p = rf_AllocBuffer(raidPtr, dag_h,
		    pda->numSector << raidPtr->logBytesPerSector);
		tmpreadDataNode->params[2].v = parityStripeID;
		tmpreadDataNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
		    which_ru);
		pda = pda->next;
		for (j = 0; j < tmpreadDataNode->numSuccedents; j++) {
			tmpreadDataNode->propList[j] = NULL;
		}
		tmpreadDataNode = tmpreadDataNode->list_next;
	}

	/* initialize nodes which read old parity (Rop) */
	pda = asmap->parityInfo;
	tmpreadParityNode = readParityNodes;
	for (i = 0; i < numParityNodes; i++) {
		RF_ASSERT(pda != NULL);
		rf_InitNode(tmpreadParityNode, rf_wait, RF_FALSE,
		    rf_DiskReadFunc, rf_DiskReadUndoFunc,
		    rf_GenericWakeupFunc, numParityNodes, 1, 4, 0,
		    dag_h, "Rop", allocList);
		tmpreadParityNode->params[0].p = pda;
		/* buffer to hold old parity */
		tmpreadParityNode->params[1].p = rf_AllocBuffer(raidPtr, dag_h,
		    pda->numSector << raidPtr->logBytesPerSector);
		tmpreadParityNode->params[2].v = parityStripeID;
		tmpreadParityNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
		    which_ru);
		pda = pda->next;
		for (j = 0; j < tmpreadParityNode->numSuccedents; j++) {
			tmpreadParityNode->propList[j] = NULL;
		}
		tmpreadParityNode = tmpreadParityNode->list_next;
	}

#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
	/* initialize nodes which read old Q (Roq) */
	if (nfaults == 2) {
		pda = asmap->qInfo;
		tmpreadQNode = readQNodes;
		for (i = 0; i < numParityNodes; i++) {
			RF_ASSERT(pda != NULL);
			rf_InitNode(tmpreadQNode, rf_wait, RF_FALSE,
			    rf_DiskReadFunc, rf_DiskReadUndoFunc,
			    rf_GenericWakeupFunc, numParityNodes,
			    1, 4, 0, dag_h, "Roq", allocList);
			tmpreadQNode->params[0].p = pda;
			/* buffer to hold old Q */
			tmpreadQNode->params[1].p = rf_AllocBuffer(raidPtr, dag_h,
			    pda->numSector << raidPtr->logBytesPerSector);
			tmpreadQNode->params[2].v = parityStripeID;
			tmpreadQNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
			    which_ru);
			pda = pda->next;
			for (j = 0; j < tmpreadQNode->numSuccedents; j++) {
				tmpreadQNode->propList[j] = NULL;
			}
			tmpreadQNode = tmpreadQNode->list_next;
		}
	}
#endif
	/* initialize nodes which write new data (Wnd) */
	pda = asmap->physInfo;
	tmpwriteDataNode = writeDataNodes;
	for (i = 0; i < numDataNodes; i++) {
		RF_ASSERT(pda != NULL);
		rf_InitNode(tmpwriteDataNode, rf_wait, RF_FALSE,
		    rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
		    rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
		    "Wnd", allocList);
		/* physical disk addr desc */
		tmpwriteDataNode->params[0].p = pda;
		/* buffer holding new data to be written */
		tmpwriteDataNode->params[1].p = pda->bufPtr;
		tmpwriteDataNode->params[2].v = parityStripeID;
		tmpwriteDataNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
		    which_ru);
		pda = pda->next;
		tmpwriteDataNode = tmpwriteDataNode->list_next;
	}

	/*
	 * Initialize nodes which compute new parity and Q.
	 */
	/*
	 * We use the simple XOR func in the double-XOR case, and when
	 * we're accessing only a portion of one stripe unit.  The
	 * distinction between the two is that the regular XOR func
	 * assumes that the targbuf is a full SU in size, and examines
	 * the pda associated with the buffer to decide where within
	 * the buffer to XOR the data, whereas the simple XOR func
	 * just XORs the data into the start of the buffer.
	 */
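	/*
	 * Example (editor's note): with, say, a 64-sector stripe unit, a
	 * write of 8 sectors within one SU takes the "simple" path: the
	 * target buffer is sized to the access and the XOR lands at
	 * offset 0.  A full-SU access takes the "regular" path: the
	 * target buffer is a full SU and the pda's start address selects
	 * the XOR offset within it.
	 */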
	if ((numParityNodes == 2) || ((numDataNodes == 1)
	    && (asmap->totalSectorsAccessed <
		raidPtr->Layout.sectorsPerStripeUnit))) {
		func = pfuncs->simple;
		undoFunc = rf_NullNodeUndoFunc;
		name = pfuncs->SimpleName;
		if (qfuncs) {
			qfunc = qfuncs->simple;
			qname = qfuncs->SimpleName;
		} else {
			qfunc = NULL;
			qname = NULL;
		}
	} else {
		func = pfuncs->regular;
		undoFunc = rf_NullNodeUndoFunc;
		name = pfuncs->RegularName;
		if (qfuncs) {
			qfunc = qfuncs->regular;
			qname = qfuncs->RegularName;
		} else {
			qfunc = NULL;
			qname = NULL;
		}
	}
	/*
	 * Initialize the xor nodes: params are {pda,buf}
	 * from {Rod,Wnd,Rop} nodes, and raidPtr
	 */
	if (numParityNodes == 2) {
		/* double-xor case */
		tmpxorNode = xorNodes;
		tmpreadDataNode = readDataNodes;
		tmpreadParityNode = readParityNodes;
		tmpwriteDataNode = writeDataNodes;
#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
		tmpqNode = qNodes;
		tmpreadQNode = readQNodes;
#endif
		for (i = 0; i < numParityNodes; i++) {
			/* note: no wakeup func for xor */
			rf_InitNode(tmpxorNode, rf_wait, RF_FALSE, func,
			    undoFunc, NULL, 1,
			    (numDataNodes + numParityNodes),
			    7, 1, dag_h, name, allocList);
			tmpxorNode->flags |= RF_DAGNODE_FLAG_YIELD;
			tmpxorNode->params[0] = tmpreadDataNode->params[0];
			tmpxorNode->params[1] = tmpreadDataNode->params[1];
			tmpxorNode->params[2] = tmpreadParityNode->params[0];
			tmpxorNode->params[3] = tmpreadParityNode->params[1];
			tmpxorNode->params[4] = tmpwriteDataNode->params[0];
			tmpxorNode->params[5] = tmpwriteDataNode->params[1];
			tmpxorNode->params[6].p = raidPtr;
			/* use old parity buf as target buf */
			tmpxorNode->results[0] = tmpreadParityNode->params[1].p;
#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
			if (nfaults == 2) {
				/* note: no wakeup func for qor */
				rf_InitNode(tmpqNode, rf_wait, RF_FALSE,
				    qfunc, undoFunc, NULL, 1,
				    (numDataNodes + numParityNodes),
				    7, 1, dag_h, qname, allocList);
				tmpqNode->params[0] = tmpreadDataNode->params[0];
				tmpqNode->params[1] = tmpreadDataNode->params[1];
				tmpqNode->params[2] = tmpreadQNode->params[0];
				tmpqNode->params[3] = tmpreadQNode->params[1];
				tmpqNode->params[4] = tmpwriteDataNode->params[0];
				tmpqNode->params[5] = tmpwriteDataNode->params[1];
				tmpqNode->params[6].p = raidPtr;
				/* use old Q buf as target buf */
				tmpqNode->results[0] = tmpreadQNode->params[1].p;
				tmpqNode = tmpqNode->list_next;
				tmpreadQNode = tmpreadQNode->list_next;
			}
#endif
			tmpxorNode = tmpxorNode->list_next;
			tmpreadDataNode = tmpreadDataNode->list_next;
			tmpreadParityNode = tmpreadParityNode->list_next;
			tmpwriteDataNode = tmpwriteDataNode->list_next;
		}
	} else {
		/* there is only one xor node in this case */
		rf_InitNode(xorNodes, rf_wait, RF_FALSE, func,
		    undoFunc, NULL, 1, (numDataNodes + numParityNodes),
		    (2 * (numDataNodes + numDataNodes + 1) + 1), 1,
		    dag_h, name, allocList);
		xorNodes->flags |= RF_DAGNODE_FLAG_YIELD;
		tmpreadDataNode = readDataNodes;
		for (i = 0; i < numDataNodes; i++) {
			/*
			 * (This used to be "numDataNodes + 1" until we
			 * factored the "+1" out into the "deal with Rop
			 * separately" code below.)
			 */
			/* set up params related to Rod nodes */
			xorNodes->params[2 * i + 0] = tmpreadDataNode->params[0];	/* pda */
			xorNodes->params[2 * i + 1] = tmpreadDataNode->params[1];	/* buffer ptr */
			tmpreadDataNode = tmpreadDataNode->list_next;
		}
		/* deal with Rop separately */
		xorNodes->params[2 * numDataNodes + 0] = readParityNodes->params[0];	/* pda */
		xorNodes->params[2 * numDataNodes + 1] = readParityNodes->params[1];	/* buffer ptr */

		tmpwriteDataNode = writeDataNodes;
		for (i = 0; i < numDataNodes; i++) {
			/* set up params related to Wnd and Wnp nodes */
			xorNodes->params[2 * (numDataNodes + 1 + i) + 0] =	/* pda */
			    tmpwriteDataNode->params[0];
			xorNodes->params[2 * (numDataNodes + 1 + i) + 1] =	/* buffer ptr */
			    tmpwriteDataNode->params[1];
			tmpwriteDataNode = tmpwriteDataNode->list_next;
		}
		/* xor node needs to get at RAID information */
		xorNodes->params[2 * (numDataNodes + numDataNodes + 1)].p = raidPtr;
		xorNodes->results[0] = readParityNodes->params[1].p;
#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
		if (nfaults == 2) {
			rf_InitNode(qNodes, rf_wait, RF_FALSE, qfunc,
			    undoFunc, NULL, 1,
			    (numDataNodes + numParityNodes),
			    (2 * (numDataNodes + numDataNodes + 1) + 1), 1,
			    dag_h, qname, allocList);
			tmpreadDataNode = readDataNodes;
			for (i = 0; i < numDataNodes; i++) {
				/* set up params related to Rod */
				qNodes->params[2 * i + 0] = tmpreadDataNode->params[0];	/* pda */
				qNodes->params[2 * i + 1] = tmpreadDataNode->params[1];	/* buffer ptr */
				tmpreadDataNode = tmpreadDataNode->list_next;
			}
			/* and read old q */
			qNodes->params[2 * numDataNodes + 0] =	/* pda */
			    readQNodes->params[0];
			qNodes->params[2 * numDataNodes + 1] =	/* buffer ptr */
			    readQNodes->params[1];
			tmpwriteDataNode = writeDataNodes;
			for (i = 0; i < numDataNodes; i++) {
				/* set up params related to Wnd nodes */
				qNodes->params[2 * (numDataNodes + 1 + i) + 0] =	/* pda */
				    tmpwriteDataNode->params[0];
				qNodes->params[2 * (numDataNodes + 1 + i) + 1] =	/* buffer ptr */
				    tmpwriteDataNode->params[1];
				tmpwriteDataNode = tmpwriteDataNode->list_next;
			}
			/* the q node needs to get at RAID information */
			qNodes->params[2 * (numDataNodes + numDataNodes + 1)].p = raidPtr;
			qNodes->results[0] = readQNodes->params[1].p;
		}
#endif
	}

	/* initialize nodes which write new parity (Wnp) */
	pda = asmap->parityInfo;
	tmpwriteParityNode = writeParityNodes;
	tmpxorNode = xorNodes;
	for (i = 0; i < numParityNodes; i++) {
		rf_InitNode(tmpwriteParityNode, rf_wait, RF_FALSE,
		    rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
		    rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
		    "Wnp", allocList);
		RF_ASSERT(pda != NULL);
		/* physical disk addr desc */
		tmpwriteParityNode->params[0].p = pda;
		/* buffer pointer for the parity write; the buffer itself
		 * is filled in by the xor node */
		tmpwriteParityNode->params[1].p = tmpxorNode->results[0];
		tmpwriteParityNode->params[2].v = parityStripeID;
		tmpwriteParityNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
		    which_ru);
		pda = pda->next;
		tmpwriteParityNode = tmpwriteParityNode->list_next;
		tmpxorNode = tmpxorNode->list_next;
	}

#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
	/* initialize nodes which write new Q (Wnq) */
	if (nfaults == 2) {
		pda = asmap->qInfo;
		tmpwriteQNode = writeQNodes;
		tmpqNode = qNodes;
		for (i = 0; i < numParityNodes; i++) {
			rf_InitNode(tmpwriteQNode, rf_wait, RF_FALSE,
			    rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
			    rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
			    "Wnq", allocList);
			RF_ASSERT(pda != NULL);
			/* physical disk addr desc */
			tmpwriteQNode->params[0].p = pda;
			/* buffer pointer for the Q write; the buffer itself
			 * is filled in by the q node */
			tmpwriteQNode->params[1].p = tmpqNode->results[0];
			tmpwriteQNode->params[2].v = parityStripeID;
			tmpwriteQNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
			    which_ru);
			pda = pda->next;
			tmpwriteQNode = tmpwriteQNode->list_next;
			tmpqNode = tmpqNode->list_next;
		}
	}
#endif
	/*
	 * Step 4. connect the nodes.
	 */

	/* connect header to block node */
	dag_h->succedents[0] = blockNode;

	/* connect block node to read old data nodes */
	RF_ASSERT(blockNode->numSuccedents == (numDataNodes + (numParityNodes * nfaults)));
	tmpreadDataNode = readDataNodes;
	for (i = 0; i < numDataNodes; i++) {
		blockNode->succedents[i] = tmpreadDataNode;
		RF_ASSERT(tmpreadDataNode->numAntecedents == 1);
		tmpreadDataNode->antecedents[0] = blockNode;
		tmpreadDataNode->antType[0] = rf_control;
		tmpreadDataNode = tmpreadDataNode->list_next;
	}

	/* connect block node to read old parity nodes */
	tmpreadParityNode = readParityNodes;
	for (i = 0; i < numParityNodes; i++) {
		blockNode->succedents[numDataNodes + i] = tmpreadParityNode;
		RF_ASSERT(tmpreadParityNode->numAntecedents == 1);
		tmpreadParityNode->antecedents[0] = blockNode;
		tmpreadParityNode->antType[0] = rf_control;
		tmpreadParityNode = tmpreadParityNode->list_next;
	}

#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
	/* connect block node to read old Q nodes */
	if (nfaults == 2) {
		tmpreadQNode = readQNodes;
		for (i = 0; i < numParityNodes; i++) {
			blockNode->succedents[numDataNodes + numParityNodes + i] = tmpreadQNode;
			RF_ASSERT(tmpreadQNode->numAntecedents == 1);
			tmpreadQNode->antecedents[0] = blockNode;
			tmpreadQNode->antType[0] = rf_control;
			tmpreadQNode = tmpreadQNode->list_next;
		}
	}
#endif
	/* connect read old data nodes to xor nodes */
	tmpreadDataNode = readDataNodes;
	for (i = 0; i < numDataNodes; i++) {
		RF_ASSERT(tmpreadDataNode->numSuccedents == (nfaults * numParityNodes));
		tmpxorNode = xorNodes;
		for (j = 0; j < numParityNodes; j++) {
			RF_ASSERT(tmpxorNode->numAntecedents == numDataNodes + numParityNodes);
			tmpreadDataNode->succedents[j] = tmpxorNode;
			tmpxorNode->antecedents[i] = tmpreadDataNode;
			tmpxorNode->antType[i] = rf_trueData;
			tmpxorNode = tmpxorNode->list_next;
		}
		tmpreadDataNode = tmpreadDataNode->list_next;
	}

#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
	/* connect read old data nodes to q nodes */
	if (nfaults == 2) {
		tmpreadDataNode = readDataNodes;
		for (i = 0; i < numDataNodes; i++) {
			tmpqNode = qNodes;
			for (j = 0; j < numParityNodes; j++) {
				RF_ASSERT(tmpqNode->numAntecedents == numDataNodes + numParityNodes);
				tmpreadDataNode->succedents[numParityNodes + j] = tmpqNode;
				tmpqNode->antecedents[i] = tmpreadDataNode;
				tmpqNode->antType[i] = rf_trueData;
				tmpqNode = tmpqNode->list_next;
			}
			tmpreadDataNode = tmpreadDataNode->list_next;
		}
	}
#endif
	/* connect read old parity nodes to xor nodes */
	tmpreadParityNode = readParityNodes;
	for (i = 0; i < numParityNodes; i++) {
		RF_ASSERT(tmpreadParityNode->numSuccedents == numParityNodes);
		tmpxorNode = xorNodes;
		for (j = 0; j < numParityNodes; j++) {
			tmpreadParityNode->succedents[j] = tmpxorNode;
			tmpxorNode->antecedents[numDataNodes + i] = tmpreadParityNode;
			tmpxorNode->antType[numDataNodes + i] = rf_trueData;
			tmpxorNode = tmpxorNode->list_next;
		}
		tmpreadParityNode = tmpreadParityNode->list_next;
	}

#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
	/* connect read old q nodes to q nodes */
	if (nfaults == 2) {
		tmpreadParityNode = readParityNodes;
		tmpreadQNode = readQNodes;
		for (i = 0; i < numParityNodes; i++) {
			RF_ASSERT(tmpreadParityNode->numSuccedents == numParityNodes);
			tmpqNode = qNodes;
			for (j = 0; j < numParityNodes; j++) {
				tmpreadQNode->succedents[j] = tmpqNode;
				tmpqNode->antecedents[numDataNodes + i] = tmpreadQNode;
				tmpqNode->antType[numDataNodes + i] = rf_trueData;
				tmpqNode = tmpqNode->list_next;
			}
			tmpreadParityNode = tmpreadParityNode->list_next;
			tmpreadQNode = tmpreadQNode->list_next;
		}
	}
#endif
	/* connect xor nodes to commit node */
	RF_ASSERT(commitNode->numAntecedents == (nfaults * numParityNodes));
	tmpxorNode = xorNodes;
	for (i = 0; i < numParityNodes; i++) {
		RF_ASSERT(tmpxorNode->numSuccedents == 1);
		tmpxorNode->succedents[0] = commitNode;
		commitNode->antecedents[i] = tmpxorNode;
		commitNode->antType[i] = rf_control;
		tmpxorNode = tmpxorNode->list_next;
	}

#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
	/* connect q nodes to commit node */
	if (nfaults == 2) {
		tmpqNode = qNodes;
		for (i = 0; i < numParityNodes; i++) {
			RF_ASSERT(tmpqNode->numSuccedents == 1);
			tmpqNode->succedents[0] = commitNode;
			commitNode->antecedents[i + numParityNodes] = tmpqNode;
			commitNode->antType[i + numParityNodes] = rf_control;
			tmpqNode = tmpqNode->list_next;
		}
	}
#endif
	/* connect commit node to write nodes */
	RF_ASSERT(commitNode->numSuccedents == (numDataNodes + (nfaults * numParityNodes)));
	tmpwriteDataNode = writeDataNodes;
	for (i = 0; i < numDataNodes; i++) {
		RF_ASSERT(tmpwriteDataNode->numAntecedents == 1);
		commitNode->succedents[i] = tmpwriteDataNode;
		tmpwriteDataNode->antecedents[0] = commitNode;
		tmpwriteDataNode->antType[0] = rf_trueData;
		tmpwriteDataNode = tmpwriteDataNode->list_next;
	}
	tmpwriteParityNode = writeParityNodes;
	for (i = 0; i < numParityNodes; i++) {
		RF_ASSERT(tmpwriteParityNode->numAntecedents == 1);
		commitNode->succedents[i + numDataNodes] = tmpwriteParityNode;
		tmpwriteParityNode->antecedents[0] = commitNode;
		tmpwriteParityNode->antType[0] = rf_trueData;
		tmpwriteParityNode = tmpwriteParityNode->list_next;
	}
#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
	if (nfaults == 2) {
		tmpwriteQNode = writeQNodes;
		for (i = 0; i < numParityNodes; i++) {
			RF_ASSERT(tmpwriteQNode->numAntecedents == 1);
			commitNode->succedents[i + numDataNodes + numParityNodes] = tmpwriteQNode;
			tmpwriteQNode->antecedents[0] = commitNode;
			tmpwriteQNode->antType[0] = rf_trueData;
			tmpwriteQNode = tmpwriteQNode->list_next;
		}
	}
#endif
	RF_ASSERT(termNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes)));
	RF_ASSERT(termNode->numSuccedents == 0);
	tmpwriteDataNode = writeDataNodes;
	for (i = 0; i < numDataNodes; i++) {
		/* connect write new data nodes to term node */
		RF_ASSERT(tmpwriteDataNode->numSuccedents == 1);
		tmpwriteDataNode->succedents[0] = termNode;
		termNode->antecedents[i] = tmpwriteDataNode;
		termNode->antType[i] = rf_control;
		tmpwriteDataNode = tmpwriteDataNode->list_next;
	}

	tmpwriteParityNode = writeParityNodes;
	for (i = 0; i < numParityNodes; i++) {
		RF_ASSERT(tmpwriteParityNode->numSuccedents == 1);
		tmpwriteParityNode->succedents[0] = termNode;
		termNode->antecedents[numDataNodes + i] = tmpwriteParityNode;
		termNode->antType[numDataNodes + i] = rf_control;
		tmpwriteParityNode = tmpwriteParityNode->list_next;
	}

#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
	if (nfaults == 2) {
		tmpwriteQNode = writeQNodes;
		for (i = 0; i < numParityNodes; i++) {
			RF_ASSERT(tmpwriteQNode->numSuccedents == 1);
			tmpwriteQNode->succedents[0] = termNode;
			termNode->antecedents[numDataNodes + numParityNodes + i] = tmpwriteQNode;
			termNode->antType[numDataNodes + numParityNodes + i] = rf_control;
			tmpwriteQNode = tmpwriteQNode->list_next;
		}
	}
#endif
}


/******************************************************************************
 * create a write graph (fault-free or degraded) for RAID level 1
 *
 * Hdr -> Commit -> Wpd -> Nil -> Trm
 *               -> Wsd ->
 *
 * The "Wpd" node writes data to the primary copy in the mirror pair
 * The "Wsd" node writes data to the secondary copy in the mirror pair
 *
 * Parameters:  raidPtr   - description of the physical array
 *              asmap     - logical & physical addresses for this access
 *              bp        - buffer ptr (holds write data)
 *              flags     - general flags (e.g. disk locking)
 *              allocList - list of memory allocated in DAG creation
 *****************************************************************************/

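/*
 * Example (editor's note): a write that is not stripe-unit aligned maps
 * to two pdas on each side of the mirror, so up to two Wpd and two Wsd
 * nodes are built below; in degraded mode a failed data or mirror copy
 * simply drops the corresponding write node (nWndNodes-- or
 * nWmirNodes--).
 */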
void
rf_CreateRaidOneWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
    RF_DagHeader_t *dag_h, void *bp, RF_RaidAccessFlags_t flags,
    RF_AllocListElem_t *allocList)
{
	RF_DagNode_t *unblockNode, *termNode, *commitNode;
	RF_DagNode_t *wndNode, *wmirNode;
	RF_DagNode_t *tmpNode, *tmpwndNode, *tmpwmirNode;
	int nWndNodes, nWmirNodes, i;
	RF_ReconUnitNum_t which_ru;
	RF_PhysDiskAddr_t *pda, *pdaP;
	RF_StripeNum_t parityStripeID;

	parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout),
	    asmap->raidAddress, &which_ru);
#if RF_DEBUG_DAG
	if (rf_dagDebug) {
		printf("[Creating RAID level 1 write DAG]\n");
	}
#endif
	dag_h->creator = "RaidOneWriteDAG";

	/* 2 implies access not SU aligned */
	nWmirNodes = (asmap->parityInfo->next) ? 2 : 1;
	nWndNodes = (asmap->physInfo->next) ? 2 : 1;

	/* alloc the Wnd nodes and the Wmir node */
	if (asmap->numDataFailed == 1)
		nWndNodes--;
	if (asmap->numParityFailed == 1)
		nWmirNodes--;

	/* total number of nodes = nWndNodes + nWmirNodes + (commit + unblock
	 * + terminator) */
	for (i = 0; i < nWndNodes; i++) {
		tmpNode = rf_AllocDAGNode(raidPtr);
		tmpNode->list_next = dag_h->nodes;
		dag_h->nodes = tmpNode;
	}
	wndNode = dag_h->nodes;

	for (i = 0; i < nWmirNodes; i++) {
		tmpNode = rf_AllocDAGNode(raidPtr);
		tmpNode->list_next = dag_h->nodes;
		dag_h->nodes = tmpNode;
	}
	wmirNode = dag_h->nodes;

	commitNode = rf_AllocDAGNode(raidPtr);
	commitNode->list_next = dag_h->nodes;
	dag_h->nodes = commitNode;

	unblockNode = rf_AllocDAGNode(raidPtr);
	unblockNode->list_next = dag_h->nodes;
	dag_h->nodes = unblockNode;

	termNode = rf_AllocDAGNode(raidPtr);
	termNode->list_next = dag_h->nodes;
	dag_h->nodes = termNode;

	/* this dag can commit immediately */
	dag_h->numCommitNodes = 1;
	dag_h->numCommits = 0;
	dag_h->numSuccedents = 1;

	/* initialize the commit, unblock, and term nodes */
	rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc,
	    rf_NullNodeUndoFunc, NULL, (nWndNodes + nWmirNodes),
	    0, 0, 0, dag_h, "Cmt", allocList);
	rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc,
	    rf_NullNodeUndoFunc, NULL, 1, (nWndNodes + nWmirNodes),
	    0, 0, dag_h, "Nil", allocList);
	rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc,
	    rf_TerminateUndoFunc, NULL, 0, 1, 0, 0,
	    dag_h, "Trm", allocList);

	/* initialize the wnd nodes */
	if (nWndNodes > 0) {
		pda = asmap->physInfo;
		tmpwndNode = wndNode;
		for (i = 0; i < nWndNodes; i++) {
			rf_InitNode(tmpwndNode, rf_wait, RF_FALSE,
			    rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
			    rf_GenericWakeupFunc, 1, 1, 4, 0,
			    dag_h, "Wpd", allocList);
			RF_ASSERT(pda != NULL);
			tmpwndNode->params[0].p = pda;
			tmpwndNode->params[1].p = pda->bufPtr;
			tmpwndNode->params[2].v = parityStripeID;
			tmpwndNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
			pda = pda->next;
			tmpwndNode = tmpwndNode->list_next;
		}
		RF_ASSERT(pda == NULL);
	}
	/* initialize the mirror nodes */
	if (nWmirNodes > 0) {
		pda = asmap->physInfo;
		pdaP = asmap->parityInfo;
		tmpwmirNode = wmirNode;
		for (i = 0; i < nWmirNodes; i++) {
			rf_InitNode(tmpwmirNode, rf_wait, RF_FALSE,
			    rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
			    rf_GenericWakeupFunc, 1, 1, 4, 0,
			    dag_h, "Wsd", allocList);
			RF_ASSERT(pda != NULL);
			tmpwmirNode->params[0].p = pdaP;
			tmpwmirNode->params[1].p = pda->bufPtr;
			tmpwmirNode->params[2].v = parityStripeID;
			tmpwmirNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
			pda = pda->next;
			pdaP = pdaP->next;
			tmpwmirNode = tmpwmirNode->list_next;
		}
		RF_ASSERT(pda == NULL);
		RF_ASSERT(pdaP == NULL);
	}
	/* link the header node to the commit node */
	RF_ASSERT(dag_h->numSuccedents == 1);
	RF_ASSERT(commitNode->numAntecedents == 0);
	dag_h->succedents[0] = commitNode;

	/* link the commit node to the write nodes */
	RF_ASSERT(commitNode->numSuccedents == (nWndNodes + nWmirNodes));
	tmpwndNode = wndNode;
	for (i = 0; i < nWndNodes; i++) {
		RF_ASSERT(tmpwndNode->numAntecedents == 1);
		commitNode->succedents[i] = tmpwndNode;
		tmpwndNode->antecedents[0] = commitNode;
		tmpwndNode->antType[0] = rf_control;
		tmpwndNode = tmpwndNode->list_next;
	}
	tmpwmirNode = wmirNode;
	for (i = 0; i < nWmirNodes; i++) {
		RF_ASSERT(tmpwmirNode->numAntecedents == 1);
		commitNode->succedents[i + nWndNodes] = tmpwmirNode;
		tmpwmirNode->antecedents[0] = commitNode;
		tmpwmirNode->antType[0] = rf_control;
		tmpwmirNode = tmpwmirNode->list_next;
	}

	/* link the write nodes to the unblock node */
	RF_ASSERT(unblockNode->numAntecedents == (nWndNodes + nWmirNodes));
	tmpwndNode = wndNode;
	for (i = 0; i < nWndNodes; i++) {
		RF_ASSERT(tmpwndNode->numSuccedents == 1);
		tmpwndNode->succedents[0] = unblockNode;
		unblockNode->antecedents[i] = tmpwndNode;
		unblockNode->antType[i] = rf_control;
		tmpwndNode = tmpwndNode->list_next;
	}
	tmpwmirNode = wmirNode;
	for (i = 0; i < nWmirNodes; i++) {
		RF_ASSERT(tmpwmirNode->numSuccedents == 1);
		tmpwmirNode->succedents[0] = unblockNode;
		unblockNode->antecedents[i + nWndNodes] = tmpwmirNode;
		unblockNode->antType[i + nWndNodes] = rf_control;
		tmpwmirNode = tmpwmirNode->list_next;
	}

	/* link the unblock node to the term node */
	RF_ASSERT(unblockNode->numSuccedents == 1);
	RF_ASSERT(termNode->numAntecedents == 1);
	RF_ASSERT(termNode->numSuccedents == 0);
	unblockNode->succedents[0] = termNode;
	termNode->antecedents[0] = unblockNode;
	termNode->antType[0] = rf_control;
}