xref: /netbsd-src/sys/dev/raidframe/rf_dagffwr.c (revision 5192e0e7e8c6ea9b0c3f63d942341865bc3ddbd8)
1 /*	$NetBSD: rf_dagffwr.c,v 1.38 2023/10/15 18:15:20 oster Exp $	*/
2 /*
3  * Copyright (c) 1995 Carnegie-Mellon University.
4  * All rights reserved.
5  *
6  * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
7  *
8  * Permission to use, copy, modify and distribute this software and
9  * its documentation is hereby granted, provided that both the copyright
10  * notice and this permission notice appear in all copies of the
11  * software, derivative works or modified versions, and any portions
12  * thereof, and that both notices appear in supporting documentation.
13  *
14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17  *
18  * Carnegie Mellon requests users of this software to return to
19  *
20  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
21  *  School of Computer Science
22  *  Carnegie Mellon University
23  *  Pittsburgh PA 15213-3890
24  *
25  * any improvements or extensions that they make and grant Carnegie the
26  * rights to redistribute these changes.
27  */
28 
29 /*
30  * rf_dagffwr.c
31  *
32  * code for creating fault-free DAGs
33  *
34  */
35 
36 #include <sys/cdefs.h>
37 __KERNEL_RCSID(0, "$NetBSD: rf_dagffwr.c,v 1.38 2023/10/15 18:15:20 oster Exp $");
38 
39 #include <dev/raidframe/raidframevar.h>
40 
41 #include "rf_raid.h"
42 #include "rf_dag.h"
43 #include "rf_dagutils.h"
44 #include "rf_dagfuncs.h"
45 #include "rf_debugMem.h"
46 #include "rf_dagffrd.h"
47 #include "rf_general.h"
48 #include "rf_dagffwr.h"
49 #include "rf_map.h"
50 
51 /******************************************************************************
52  *
53  * General comments on DAG creation:
54  *
55  * All DAGs in this file use roll-away error recovery.  Each DAG has a single
56  * commit node, usually called "Cmt."  If an error occurs before the Cmt node
57  * is reached, the execution engine will halt forward execution and work
58  * backward through the graph, executing the undo functions.  Assuming that
59  * each node in the graph prior to the Cmt node is undoable and atomic - or -
60  * makes no changes to permanent state, the graph will fail atomically.
61  * If an error occurs after the Cmt node executes, the engine will roll-forward
62  * through the graph, blindly executing nodes until it reaches the end.
63  * If a graph reaches the end, it is assumed to have completed successfully.
64  *
65  * A graph has only 1 Cmt node.
66  *
67  */
68 
69 
70 /******************************************************************************
71  *
72  * The following wrappers map the standard DAG creation interface to the
73  * DAG creation routines.  Additionally, these wrappers enable experimentation
74  * with new DAG structures by providing an extra level of indirection, allowing
75  * the DAG creation routines to be replaced at this single point.
76  */
77 
78 
/*
 * Standard-interface wrapper: build a write DAG for a non-redundant
 * (no parity) array by delegating to rf_CreateNonredundantDAG.
 * The caller-supplied "type" argument is ignored; the I/O type is
 * forced to RF_IO_TYPE_WRITE.
 */
79 void
rf_CreateNonRedundantWriteDAG(RF_Raid_t * raidPtr,RF_AccessStripeMap_t * asmap,RF_DagHeader_t * dag_h,void * bp,RF_RaidAccessFlags_t flags,RF_AllocListElem_t * allocList,RF_IoType_t type)80 rf_CreateNonRedundantWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
81 			      RF_DagHeader_t *dag_h, void *bp,
82 			      RF_RaidAccessFlags_t flags,
83 			      RF_AllocListElem_t *allocList,
84 			      RF_IoType_t type)
85 {
86 	rf_CreateNonredundantDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
87 				 RF_IO_TYPE_WRITE);
88 }
89 
/*
 * Standard-interface wrapper: build a RAID 0 write DAG.  RAID 0 has no
 * redundancy, so this uses the same non-redundant DAG creator as above;
 * the caller-supplied "type" argument is ignored and RF_IO_TYPE_WRITE
 * is forced.
 */
90 void
rf_CreateRAID0WriteDAG(RF_Raid_t * raidPtr,RF_AccessStripeMap_t * asmap,RF_DagHeader_t * dag_h,void * bp,RF_RaidAccessFlags_t flags,RF_AllocListElem_t * allocList,RF_IoType_t type)91 rf_CreateRAID0WriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
92 		       RF_DagHeader_t *dag_h, void *bp,
93 		       RF_RaidAccessFlags_t flags,
94 		       RF_AllocListElem_t *allocList,
95 		       RF_IoType_t type)
96 {
97 	rf_CreateNonredundantDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
98 				 RF_IO_TYPE_WRITE);
99 }
100 
/*
 * Standard-interface wrapper: build a small-write (read-modify-write)
 * DAG using the common creator with the standard XOR function table
 * for parity.  The NULL qfuncs argument selects the single-fault-
 * tolerant (no Q redundancy) form.
 */
101 void
rf_CreateSmallWriteDAG(RF_Raid_t * raidPtr,RF_AccessStripeMap_t * asmap,RF_DagHeader_t * dag_h,void * bp,RF_RaidAccessFlags_t flags,RF_AllocListElem_t * allocList)102 rf_CreateSmallWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
103 		       RF_DagHeader_t *dag_h, void *bp,
104 		       RF_RaidAccessFlags_t flags,
105 		       RF_AllocListElem_t *allocList)
106 {
107 	/* "normal" rollaway */
108 	rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags,
109 				     allocList, &rf_xorFuncs, NULL);
110 }
111 
/*
 * Standard-interface wrapper: build a large-write (reconstruct-write)
 * DAG using the common creator with nfaults = 1, the regular XOR
 * function, and buffer recycling enabled (RF_TRUE), i.e. a full-SU
 * old-data read buffer may be reused as the parity target buffer.
 */
112 void
rf_CreateLargeWriteDAG(RF_Raid_t * raidPtr,RF_AccessStripeMap_t * asmap,RF_DagHeader_t * dag_h,void * bp,RF_RaidAccessFlags_t flags,RF_AllocListElem_t * allocList)113 rf_CreateLargeWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
114 		       RF_DagHeader_t *dag_h, void *bp,
115 		       RF_RaidAccessFlags_t flags,
116 		       RF_AllocListElem_t *allocList)
117 {
118 	/* "normal" rollaway */
119 	rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags,
120 				     allocList, 1, rf_RegularXorFunc, RF_TRUE);
121 }
122 
123 
124 /******************************************************************************
125  *
126  * DAG creation code begins here
127  */
/*
 * Allocate a buffer of "num" sectors (converted to bytes) and record it
 * on the DAG's allocation list.  NOTE: this macro relies on "raidPtr"
 * and "allocList" being in scope at every expansion site.  The argument
 * is parenthesized per standard macro hygiene so that expression
 * arguments expand safely.
 */
128 #define BUF_ALLOC(num) \
129   RF_MallocAndAdd(rf_RaidAddressToByte(raidPtr, (num)), allocList)
130 
131 
132 /******************************************************************************
133  *
134  * creates a DAG to perform a large-write operation:
135  *
136  *           / Rod \           / Wnd \
137  * H -- block- Rod - Xor - Cmt - Wnd --- T
138  *           \ Rod /          \  Wnp /
139  *                             \[Wnq]/
140  *
141  * The XOR node also does the Q calculation in the P+Q architecture.
142  * All nodes are before the commit node (Cmt) are assumed to be atomic and
143  * undoable - or - they make no changes to permanent state.
144  *
145  * Rod = read old data
146  * Cmt = commit node
147  * Wnp = write new parity
148  * Wnd = write new data
149  * Wnq = write new "q"
150  * [] denotes optional segments in the graph
151  *
152  * Parameters:  raidPtr   - description of the physical array
153  *              asmap     - logical & physical addresses for this access
154  *              bp        - buffer ptr (holds write data)
155  *              flags     - general flags (e.g. disk locking)
156  *              allocList - list of memory allocated in DAG creation
157  *              nfaults   - number of faults array can tolerate
158  *                          (equal to # redundancy units in stripe)
159  *              redfuncs  - list of redundancy generating functions
160  *
161  *****************************************************************************/
162 
163 void
rf_CommonCreateLargeWriteDAG(RF_Raid_t * raidPtr,RF_AccessStripeMap_t * asmap,RF_DagHeader_t * dag_h,void * bp,RF_RaidAccessFlags_t flags,RF_AllocListElem_t * allocList,int nfaults,void (* redFunc)(RF_DagNode_t *),int allowBufferRecycle)164 rf_CommonCreateLargeWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
165 			     RF_DagHeader_t *dag_h, void *bp,
166 			     RF_RaidAccessFlags_t flags,
167 			     RF_AllocListElem_t *allocList,
168 			     int nfaults, void (*redFunc) (RF_DagNode_t *),
169 			     int allowBufferRecycle)
170 {
171 	RF_DagNode_t *wndNodes, *rodNodes, *xorNode, *wnpNode, *tmpNode;
172 	RF_DagNode_t *blockNode, *commitNode, *termNode;
173 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
174 	RF_DagNode_t *wnqNode;
175 #endif
176 	int     nWndNodes, nRodNodes, i, nodeNum, asmNum;
177 	RF_AccessStripeMapHeader_t *new_asm_h[2];
178 	RF_StripeNum_t parityStripeID;
179 	char   *sosBuffer, *eosBuffer;
180 	RF_ReconUnitNum_t which_ru;
181 	RF_RaidLayout_t *layoutPtr;
182 	RF_PhysDiskAddr_t *pda;
183 
184 	layoutPtr = &(raidPtr->Layout);
185 	parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr,
186 							asmap->raidAddress,
187 							&which_ru);
188 
189 #if RF_DEBUG_DAG
190 	if (rf_dagDebug) {
191 		printf("[Creating large-write DAG]\n");
192 	}
193 #endif
194 	dag_h->creator = "LargeWriteDAG";
195 
196 	dag_h->numCommitNodes = 1;
197 	dag_h->numCommits = 0;
198 	dag_h->numSuccedents = 1;
199 
200 	/* alloc the nodes: Wnd, xor, commit, block, term, and  Wnp */
201 	nWndNodes = asmap->numStripeUnitsAccessed;
202 
	/*
	 * Nodes are pushed onto the head of dag_h->nodes; after this loop
	 * "wndNodes" names the first of nWndNodes consecutive nodes linked
	 * via list_next.
	 */
203 	for (i = 0; i < nWndNodes; i++) {
204 		tmpNode = rf_AllocDAGNode(raidPtr);
205 		tmpNode->list_next = dag_h->nodes;
206 		dag_h->nodes = tmpNode;
207 	}
208 	wndNodes = dag_h->nodes;
209 
210 	xorNode = rf_AllocDAGNode(raidPtr);
211 	xorNode->list_next = dag_h->nodes;
212 	dag_h->nodes = xorNode;
213 
214 	wnpNode = rf_AllocDAGNode(raidPtr);
215 	wnpNode->list_next = dag_h->nodes;
216 	dag_h->nodes = wnpNode;
217 
218 	blockNode = rf_AllocDAGNode(raidPtr);
219 	blockNode->list_next = dag_h->nodes;
220 	dag_h->nodes = blockNode;
221 
222 	commitNode = rf_AllocDAGNode(raidPtr);
223 	commitNode->list_next = dag_h->nodes;
224 	dag_h->nodes = commitNode;
225 
226 	termNode = rf_AllocDAGNode(raidPtr);
227 	termNode->list_next = dag_h->nodes;
228 	dag_h->nodes = termNode;
229 
230 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
231 	if (nfaults == 2) {
232 		wnqNode = rf_AllocDAGNode(raidPtr);
233 	} else {
234 		wnqNode = NULL;
235 	}
236 #endif
	/*
	 * Map the portion of the stripe this access does not touch; the
	 * resulting physInfo chains become the Rod (read old data) sources
	 * and nRodNodes is set to the number of reads required.
	 */
237 	rf_MapUnaccessedPortionOfStripe(raidPtr, layoutPtr, asmap, dag_h,
238 					new_asm_h, &nRodNodes, &sosBuffer,
239 					&eosBuffer, allocList);
240 	if (nRodNodes > 0) {
241 		for (i = 0; i < nRodNodes; i++) {
242 			tmpNode = rf_AllocDAGNode(raidPtr);
243 			tmpNode->list_next = dag_h->nodes;
244 			dag_h->nodes = tmpNode;
245 		}
246 		rodNodes = dag_h->nodes;
247 	} else {
248 		rodNodes = NULL;
249 	}
250 
251 	/* begin node initialization */
252 	if (nRodNodes > 0) {
253 		rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc,
254 			    rf_NullNodeUndoFunc, NULL, nRodNodes, 0, 0, 0,
255 			    dag_h, "Nil", allocList);
256 	} else {
		/*
		 * With no Rod nodes the block node feeds the Xor node
		 * directly (see the connection code below), so it gets a
		 * single successor.
		 */
257 		rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc,
258 			    rf_NullNodeUndoFunc, NULL, 1, 0, 0, 0,
259 			    dag_h, "Nil", allocList);
260 	}
261 
262 	rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc,
263 		    rf_NullNodeUndoFunc, NULL, nWndNodes + nfaults, 1, 0, 0,
264 		    dag_h, "Cmt", allocList);
265 	rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc,
266 		    rf_TerminateUndoFunc, NULL, 0, nWndNodes + nfaults, 0, 0,
267 		    dag_h, "Trm", allocList);
268 
269 	/* initialize the Rod nodes */
270 	tmpNode = rodNodes;
	/* new_asm_h[0]/[1] cover the regions before and after the access */
271 	for (nodeNum = asmNum = 0; asmNum < 2; asmNum++) {
272 		if (new_asm_h[asmNum]) {
273 			pda = new_asm_h[asmNum]->stripeMap->physInfo;
274 			while (pda) {
275 				rf_InitNode(tmpNode, rf_wait,
276 					    RF_FALSE, rf_DiskReadFunc,
277 					    rf_DiskReadUndoFunc,
278 					    rf_GenericWakeupFunc,
279 					    1, 1, 4, 0, dag_h,
280 					    "Rod", allocList);
281 				tmpNode->params[0].p = pda;
282 				tmpNode->params[1].p = pda->bufPtr;
283 				tmpNode->params[2].v = parityStripeID;
284 				tmpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
285 				    which_ru);
286 				nodeNum++;
287 				pda = pda->next;
288 				tmpNode = tmpNode->list_next;
289 			}
290 		}
291 	}
	/* every allocated Rod node must have been consumed exactly once */
292 	RF_ASSERT(nodeNum == nRodNodes);
293 
294 	/* initialize the wnd nodes */
295 	pda = asmap->physInfo;
296 	tmpNode = wndNodes;
297 	for (i = 0; i < nWndNodes; i++) {
298 		rf_InitNode(tmpNode, rf_wait, RF_FALSE,
299 			    rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
300 			    rf_GenericWakeupFunc, 1, 1, 4, 0,
301 			    dag_h, "Wnd", allocList);
302 		RF_ASSERT(pda != NULL);
303 		tmpNode->params[0].p = pda;
304 		tmpNode->params[1].p = pda->bufPtr;
305 		tmpNode->params[2].v = parityStripeID;
306 		tmpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
307 		pda = pda->next;
308 		tmpNode = tmpNode->list_next;
309 	}
310 
311 	/* initialize the redundancy node */
	/*
	 * Xor param count is 2*(nWndNodes+nRodNodes)+1: a {pda,buf} pair
	 * per contributing node plus a trailing raidPtr (filled in below).
	 */
312 	if (nRodNodes > 0) {
313 		rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc,
314 			    rf_NullNodeUndoFunc, NULL, 1,
315 			    nRodNodes, 2 * (nWndNodes + nRodNodes) + 1,
316 			    nfaults, dag_h, "Xr ", allocList);
317 	} else {
318 		rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc,
319 			    rf_NullNodeUndoFunc, NULL, 1,
320 			    1, 2 * (nWndNodes + nRodNodes) + 1,
321 			    nfaults, dag_h, "Xr ", allocList);
322 	}
323 	xorNode->flags |= RF_DAGNODE_FLAG_YIELD;
324 	tmpNode = wndNodes;
325 	for (i = 0; i < nWndNodes; i++) {
326 		/* pda */
327 		xorNode->params[2 * i + 0] = tmpNode->params[0];
328 		/* buf ptr */
329 		xorNode->params[2 * i + 1] = tmpNode->params[1];
330 		tmpNode = tmpNode->list_next;
331 	}
332 	tmpNode = rodNodes;
333 	for (i = 0; i < nRodNodes; i++) {
334 		/* pda */
335 		xorNode->params[2 * (nWndNodes + i) + 0] = tmpNode->params[0];
336 		/* buf ptr */
337 		xorNode->params[2 * (nWndNodes + i) + 1] = tmpNode->params[1];
338 		tmpNode = tmpNode->list_next;
339 	}
340 	/* xor node needs to get at RAID information */
341 	xorNode->params[2 * (nWndNodes + nRodNodes)].p = raidPtr;
342 
343 	/*
344          * Look for an Rod node that reads a complete SU. If none,
345          * alloc a buffer to receive the parity info. Note that we
346          * can't use a new data buffer because it will not have gotten
347          * written when the xor occurs.  */
348 	if (allowBufferRecycle) {
349 		tmpNode = rodNodes;
350 		for (i = 0; i < nRodNodes; i++) {
351 			if (((RF_PhysDiskAddr_t *) tmpNode->params[0].p)->numSector == raidPtr->Layout.sectorsPerStripeUnit)
352 				break;
353 			tmpNode = tmpNode->list_next;
354 		}
355 	}
	/*
	 * NOTE(review): when allowBufferRecycle is false, "i" here still
	 * holds nRodNodes left over from the Rod-param loop above, so the
	 * test is well-defined on both paths -- but the correctness of
	 * (i == nRodNodes) silently depends on that earlier loop.
	 */
356 	if ((!allowBufferRecycle) || (i == nRodNodes)) {
357 		xorNode->results[0] = rf_AllocBuffer(raidPtr, dag_h, rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit));
358 	} else {
359 		/* this works because the only way we get here is if
360 		   allowBufferRecycle is true and we went through the
361 		   above for loop, and exited via the break before
362 		   i==nRodNodes was true.  That means tmpNode will
363 		   still point to a valid node -- the one we want for
364 		   here! */
365 		xorNode->results[0] = tmpNode->params[1].p;
366 	}
367 
368 	/* initialize the Wnp node */
369 	rf_InitNode(wnpNode, rf_wait, RF_FALSE, rf_DiskWriteFunc,
370 		    rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0,
371 		    dag_h, "Wnp", allocList);
372 	wnpNode->params[0].p = asmap->parityInfo;
373 	wnpNode->params[1].p = xorNode->results[0];
374 	wnpNode->params[2].v = parityStripeID;
375 	wnpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
376 	/* parityInfo must describe entire parity unit */
377 	RF_ASSERT(asmap->parityInfo->next == NULL);
378 
379 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
380 	if (nfaults == 2) {
381 		/*
382 	         * We never try to recycle a buffer for the Q calculation
383 	         * in addition to the parity. This would cause two buffers
384 	         * to get smashed during the P and Q calculation, guaranteeing
385 	         * one would be wrong.
386 	         */
387 		xorNode->results[1] =
388 		    BUF_ALLOC(raidPtr->Layout.sectorsPerStripeUnit);
389 		rf_InitNode(wnqNode, rf_wait, RF_FALSE, rf_DiskWriteFunc,
390 			    rf_DiskWriteUndoFunc, rf_GenericWakeupFunc,
391 			    1, 1, 4, 0, dag_h, "Wnq", allocList);
392 		wnqNode->params[0].p = asmap->qInfo;
393 		wnqNode->params[1].p = xorNode->results[1];
394 		wnqNode->params[2].v = parityStripeID;
395 		wnqNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
396 		/* parityInfo must describe entire parity unit */
397 		RF_ASSERT(asmap->parityInfo->next == NULL);
398 	}
399 #endif
400 	/*
401          * Connect nodes to form graph.
402          */
403 
404 	/* connect dag header to block node */
405 	RF_ASSERT(blockNode->numAntecedents == 0);
406 	dag_h->succedents[0] = blockNode;
407 
408 	if (nRodNodes > 0) {
409 		/* connect the block node to the Rod nodes */
410 		RF_ASSERT(blockNode->numSuccedents == nRodNodes);
411 		RF_ASSERT(xorNode->numAntecedents == nRodNodes);
412 		tmpNode = rodNodes;
413 		for (i = 0; i < nRodNodes; i++) {
414 			RF_ASSERT(tmpNode->numAntecedents == 1);
415 			blockNode->succedents[i] = tmpNode;
416 			tmpNode->antecedents[0] = blockNode;
417 			tmpNode->antType[0] = rf_control;
418 
419 			/* connect the Rod nodes to the Xor node */
420 			RF_ASSERT(tmpNode->numSuccedents == 1);
421 			tmpNode->succedents[0] = xorNode;
422 			xorNode->antecedents[i] = tmpNode;
423 			xorNode->antType[i] = rf_trueData;
424 			tmpNode = tmpNode->list_next;
425 		}
426 	} else {
427 		/* connect the block node to the Xor node */
428 		RF_ASSERT(blockNode->numSuccedents == 1);
429 		RF_ASSERT(xorNode->numAntecedents == 1);
430 		blockNode->succedents[0] = xorNode;
431 		xorNode->antecedents[0] = blockNode;
432 		xorNode->antType[0] = rf_control;
433 	}
434 
435 	/* connect the xor node to the commit node */
436 	RF_ASSERT(xorNode->numSuccedents == 1);
437 	RF_ASSERT(commitNode->numAntecedents == 1);
438 	xorNode->succedents[0] = commitNode;
439 	commitNode->antecedents[0] = xorNode;
440 	commitNode->antType[0] = rf_control;
441 
442 	/* connect the commit node to the write nodes */
443 	RF_ASSERT(commitNode->numSuccedents == nWndNodes + nfaults);
444 	tmpNode = wndNodes;
445 	for (i = 0; i < nWndNodes; i++) {
		/*
		 * NOTE(review): this asserts the list head (wndNodes) on
		 * every pass rather than the current tmpNode -- presumably
		 * tmpNode was intended; confirm before changing.
		 */
446 		RF_ASSERT(wndNodes->numAntecedents == 1);
447 		commitNode->succedents[i] = tmpNode;
448 		tmpNode->antecedents[0] = commitNode;
449 		tmpNode->antType[0] = rf_control;
450 		tmpNode = tmpNode->list_next;
451 	}
452 	RF_ASSERT(wnpNode->numAntecedents == 1);
453 	commitNode->succedents[nWndNodes] = wnpNode;
454 	wnpNode->antecedents[0] = commitNode;
455 	wnpNode->antType[0] = rf_trueData;
456 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
457 	if (nfaults == 2) {
458 		RF_ASSERT(wnqNode->numAntecedents == 1);
459 		commitNode->succedents[nWndNodes + 1] = wnqNode;
460 		wnqNode->antecedents[0] = commitNode;
461 		wnqNode->antType[0] = rf_trueData;
462 	}
463 #endif
464 	/* connect the write nodes to the term node */
465 	RF_ASSERT(termNode->numAntecedents == nWndNodes + nfaults);
466 	RF_ASSERT(termNode->numSuccedents == 0);
467 	tmpNode = wndNodes;
468 	for (i = 0; i < nWndNodes; i++) {
		/*
		 * NOTE(review): same list-head-vs-tmpNode pattern as the
		 * commit loop above; presumably tmpNode was intended.
		 */
469 		RF_ASSERT(wndNodes->numSuccedents == 1);
470 		tmpNode->succedents[0] = termNode;
471 		termNode->antecedents[i] = tmpNode;
472 		termNode->antType[i] = rf_control;
473 		tmpNode = tmpNode->list_next;
474 	}
475 	RF_ASSERT(wnpNode->numSuccedents == 1);
476 	wnpNode->succedents[0] = termNode;
477 	termNode->antecedents[nWndNodes] = wnpNode;
478 	termNode->antType[nWndNodes] = rf_control;
479 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
480 	if (nfaults == 2) {
481 		RF_ASSERT(wnqNode->numSuccedents == 1);
482 		wnqNode->succedents[0] = termNode;
483 		termNode->antecedents[nWndNodes + 1] = wnqNode;
484 		termNode->antType[nWndNodes + 1] = rf_control;
485 	}
486 #endif
487 }
488 /******************************************************************************
489  *
490  * creates a DAG to perform a small-write operation (either raid 5 or pq),
491  * which is as follows:
492  *
493  * Hdr -> Nil -> Rop -> Xor -> Cmt ----> Wnp [Unp] --> Trm
494  *            \- Rod X      /     \----> Wnd [Und]-/
495  *           [\- Rod X     /       \---> Wnd [Und]-/]
496  *           [\- Roq -> Q /         \--> Wnq [Unq]-/]
497  *
498  * Rop = read old parity
499  * Rod = read old data
500  * Roq = read old "q"
501  * Cmt = commit node
502  * Und = unlock data disk
503  * Unp = unlock parity disk
504  * Unq = unlock q disk
505  * Wnp = write new parity
506  * Wnd = write new data
507  * Wnq = write new "q"
508  * [ ] denotes optional segments in the graph
509  *
510  * Parameters:  raidPtr   - description of the physical array
511  *              asmap     - logical & physical addresses for this access
512  *              bp        - buffer ptr (holds write data)
513  *              flags     - general flags (e.g. disk locking)
514  *              allocList - list of memory allocated in DAG creation
515  *              pfuncs    - list of parity generating functions
516  *              qfuncs    - list of q generating functions
517  *
518  * A null qfuncs indicates single fault tolerant
519  *****************************************************************************/
520 
521 void
rf_CommonCreateSmallWriteDAG(RF_Raid_t * raidPtr,RF_AccessStripeMap_t * asmap,RF_DagHeader_t * dag_h,void * bp,RF_RaidAccessFlags_t flags,RF_AllocListElem_t * allocList,const RF_RedFuncs_t * pfuncs,const RF_RedFuncs_t * qfuncs)522 rf_CommonCreateSmallWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
523 			     RF_DagHeader_t *dag_h, void *bp,
524 			     RF_RaidAccessFlags_t flags,
525 			     RF_AllocListElem_t *allocList,
526 			     const RF_RedFuncs_t *pfuncs,
527 			     const RF_RedFuncs_t *qfuncs)
528 {
529 	RF_DagNode_t *readDataNodes, *readParityNodes, *termNode;
530 	RF_DagNode_t *tmpNode, *tmpreadDataNode, *tmpreadParityNode;
531 	RF_DagNode_t *xorNodes, *blockNode, *commitNode;
532 	RF_DagNode_t *writeDataNodes, *writeParityNodes;
533 	RF_DagNode_t *tmpxorNode, *tmpwriteDataNode;
534 	RF_DagNode_t *tmpwriteParityNode;
535 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
536 	RF_DagNode_t *tmpwriteQNode, *tmpreadQNode, *tmpqNode, *readQNodes,
537 	     *writeQNodes, *qNodes;
538 #endif
539 	int     i, j, nNodes;
540 	RF_ReconUnitNum_t which_ru;
541 	void    (*func) (RF_DagNode_t *), (*undoFunc) (RF_DagNode_t *);
542 	void    (*qfunc) (RF_DagNode_t *) __unused;
543 	int     numDataNodes, numParityNodes;
544 	RF_StripeNum_t parityStripeID;
545 	RF_PhysDiskAddr_t *pda;
546 	const char *name, *qname __unused;
547 	long    nfaults;
548 
549 	nfaults = qfuncs ? 2 : 1;
550 
551 	parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout),
552 	    asmap->raidAddress, &which_ru);
553 	pda = asmap->physInfo;
554 	numDataNodes = asmap->numStripeUnitsAccessed;
555 	numParityNodes = (asmap->parityInfo->next) ? 2 : 1;
556 
557 #if RF_DEBUG_DAG
558 	if (rf_dagDebug) {
559 		printf("[Creating small-write DAG]\n");
560 	}
561 #endif
562 	RF_ASSERT(numDataNodes > 0);
563 	dag_h->creator = "SmallWriteDAG";
564 
565 	dag_h->numCommitNodes = 1;
566 	dag_h->numCommits = 0;
567 	dag_h->numSuccedents = 1;
568 
569 	/*
570          * DAG creation occurs in four steps:
571          * 1. count the number of nodes in the DAG
572          * 2. create the nodes
573          * 3. initialize the nodes
574          * 4. connect the nodes
575          */
576 
577 	/*
578          * Step 1. compute number of nodes in the graph
579          */
580 
581 	/* number of nodes: a read and write for each data unit a
582 	 * redundancy computation node for each parity node (nfaults *
583 	 * nparity) a read and write for each parity unit a block and
584 	 * commit node (2) a terminate node if atomic RMW an unlock
585 	 * node for each data unit, redundancy unit
586 	 * totalNumNodes = (2 * numDataNodes) + (nfaults * numParityNodes)
587 	 *   + (nfaults * 2 * numParityNodes) + 3;
588 	 */
589 
590 	/*
591          * Step 2. create the nodes
592          */
593 
594 	blockNode = rf_AllocDAGNode(raidPtr);
595 	blockNode->list_next = dag_h->nodes;
596 	dag_h->nodes = blockNode;
597 
598 	commitNode = rf_AllocDAGNode(raidPtr);
599 	commitNode->list_next = dag_h->nodes;
600 	dag_h->nodes = commitNode;
601 
602 	for (i = 0; i < numDataNodes; i++) {
603 		tmpNode = rf_AllocDAGNode(raidPtr);
604 		tmpNode->list_next = dag_h->nodes;
605 		dag_h->nodes = tmpNode;
606 	}
607 	readDataNodes = dag_h->nodes;
608 
609 	for (i = 0; i < numParityNodes; i++) {
610 		tmpNode = rf_AllocDAGNode(raidPtr);
611 		tmpNode->list_next = dag_h->nodes;
612 		dag_h->nodes = tmpNode;
613 	}
614 	readParityNodes = dag_h->nodes;
615 
616 	for (i = 0; i < numDataNodes; i++) {
617 		tmpNode = rf_AllocDAGNode(raidPtr);
618 		tmpNode->list_next = dag_h->nodes;
619 		dag_h->nodes = tmpNode;
620 	}
621 	writeDataNodes = dag_h->nodes;
622 
623 	for (i = 0; i < numParityNodes; i++) {
624 		tmpNode = rf_AllocDAGNode(raidPtr);
625 		tmpNode->list_next = dag_h->nodes;
626 		dag_h->nodes = tmpNode;
627 	}
628 	writeParityNodes = dag_h->nodes;
629 
630 	for (i = 0; i < numParityNodes; i++) {
631 		tmpNode = rf_AllocDAGNode(raidPtr);
632 		tmpNode->list_next = dag_h->nodes;
633 		dag_h->nodes = tmpNode;
634 	}
635 	xorNodes = dag_h->nodes;
636 
637 	termNode = rf_AllocDAGNode(raidPtr);
638 	termNode->list_next = dag_h->nodes;
639 	dag_h->nodes = termNode;
640 
641 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
642 	if (nfaults == 2) {
643 		for (i = 0; i < numParityNodes; i++) {
644 			tmpNode = rf_AllocDAGNode(raidPtr);
645 			tmpNode->list_next = dag_h->nodes;
646 			dag_h->nodes = tmpNode;
647 		}
648 		readQNodes = dag_h->nodes;
649 
650 		for (i = 0; i < numParityNodes; i++) {
651 			tmpNode = rf_AllocDAGNode(raidPtr);
652 			tmpNode->list_next = dag_h->nodes;
653 			dag_h->nodes = tmpNode;
654 		}
655 		writeQNodes = dag_h->nodes;
656 
657 		for (i = 0; i < numParityNodes; i++) {
658 			tmpNode = rf_AllocDAGNode(raidPtr);
659 			tmpNode->list_next = dag_h->nodes;
660 			dag_h->nodes = tmpNode;
661 		}
662 		qNodes = dag_h->nodes;
663 	} else {
664 		readQNodes = writeQNodes = qNodes = NULL;
665 	}
666 #endif
667 
668 	/*
669          * Step 3. initialize the nodes
670          */
671 	/* initialize block node (Nil) */
672 	nNodes = numDataNodes + (nfaults * numParityNodes);
673 	rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc,
674 		    rf_NullNodeUndoFunc, NULL, nNodes, 0, 0, 0,
675 		    dag_h, "Nil", allocList);
676 
677 	/* initialize commit node (Cmt) */
678 	rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc,
679 		    rf_NullNodeUndoFunc, NULL, nNodes,
680 		    (nfaults * numParityNodes), 0, 0, dag_h, "Cmt", allocList);
681 
682 	/* initialize terminate node (Trm) */
683 	rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc,
684 		    rf_TerminateUndoFunc, NULL, 0, nNodes, 0, 0,
685 		    dag_h, "Trm", allocList);
686 
687 	/* initialize nodes which read old data (Rod) */
688 	tmpreadDataNode = readDataNodes;
689 	for (i = 0; i < numDataNodes; i++) {
690 		rf_InitNode(tmpreadDataNode, rf_wait, RF_FALSE,
691 			    rf_DiskReadFunc, rf_DiskReadUndoFunc,
692 			    rf_GenericWakeupFunc, (nfaults * numParityNodes),
693 			    1, 4, 0, dag_h, "Rod", allocList);
694 		RF_ASSERT(pda != NULL);
695 		/* physical disk addr desc */
696 		tmpreadDataNode->params[0].p = pda;
697 		/* buffer to hold old data */
698 		tmpreadDataNode->params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda->numSector << raidPtr->logBytesPerSector);
699 		tmpreadDataNode->params[2].v = parityStripeID;
700 		tmpreadDataNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
701 		    which_ru);
702 		pda = pda->next;
703 		for (j = 0; j < tmpreadDataNode->numSuccedents; j++) {
704 			tmpreadDataNode->propList[j] = NULL;
705 		}
706 		tmpreadDataNode = tmpreadDataNode->list_next;
707 	}
708 
709 	/* initialize nodes which read old parity (Rop) */
710 	pda = asmap->parityInfo;
711 	i = 0;
712 	tmpreadParityNode = readParityNodes;
713 	for (i = 0; i < numParityNodes; i++) {
714 		RF_ASSERT(pda != NULL);
715 		rf_InitNode(tmpreadParityNode, rf_wait, RF_FALSE,
716 			    rf_DiskReadFunc, rf_DiskReadUndoFunc,
717 			    rf_GenericWakeupFunc, numParityNodes, 1, 4, 0,
718 			    dag_h, "Rop", allocList);
719 		tmpreadParityNode->params[0].p = pda;
720 		/* buffer to hold old parity */
721 		tmpreadParityNode->params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda->numSector << raidPtr->logBytesPerSector);
722 		tmpreadParityNode->params[2].v = parityStripeID;
723 		tmpreadParityNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
724 		    which_ru);
725 		pda = pda->next;
726 		for (j = 0; j < tmpreadParityNode->numSuccedents; j++) {
727 			tmpreadParityNode->propList[0] = NULL;
728 		}
729 		tmpreadParityNode = tmpreadParityNode->list_next;
730 	}
731 
732 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
733 	/* initialize nodes which read old Q (Roq) */
734 	if (nfaults == 2) {
735 		pda = asmap->qInfo;
736 		tmpreadQNode = readQNodes;
737 		for (i = 0; i < numParityNodes; i++) {
738 			RF_ASSERT(pda != NULL);
739 			rf_InitNode(tmpreadQNode, rf_wait, RF_FALSE,
740 				    rf_DiskReadFunc, rf_DiskReadUndoFunc,
741 				    rf_GenericWakeupFunc, numParityNodes,
742 				    1, 4, 0, dag_h, "Roq", allocList);
743 			tmpreadQNode->params[0].p = pda;
744 			/* buffer to hold old Q */
745 			tmpreadQNode->params[1].p = rf_AllocBuffer(raidPtr, dag_h,
746 								   pda->numSector << raidPtr->logBytesPerSector);
747 			tmpreadQNode->params[2].v = parityStripeID;
748 			tmpreadQNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
749 			    which_ru);
750 			pda = pda->next;
751 			for (j = 0; j < tmpreadQNode->numSuccedents; j++) {
752 				tmpreadQNode->propList[0] = NULL;
753 			}
754 			tmpreadQNode = tmpreadQNode->list_next;
755 		}
756 	}
757 #endif
758 	/* initialize nodes which write new data (Wnd) */
759 	pda = asmap->physInfo;
760 	tmpwriteDataNode = writeDataNodes;
761 	for (i = 0; i < numDataNodes; i++) {
762 		RF_ASSERT(pda != NULL);
763 		rf_InitNode(tmpwriteDataNode, rf_wait, RF_FALSE,
764 			    rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
765 			    rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
766 			    "Wnd", allocList);
767 		/* physical disk addr desc */
768 		tmpwriteDataNode->params[0].p = pda;
769 		/* buffer holding new data to be written */
770 		tmpwriteDataNode->params[1].p = pda->bufPtr;
771 		tmpwriteDataNode->params[2].v = parityStripeID;
772 		tmpwriteDataNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
773 		    which_ru);
774 		pda = pda->next;
775 		tmpwriteDataNode = tmpwriteDataNode->list_next;
776 	}
777 
778 	/*
779          * Initialize nodes which compute new parity and Q.
780          */
781 	/*
782          * We use the simple XOR func in the double-XOR case, and when
783          * we're accessing only a portion of one stripe unit. The
784          * distinction between the two is that the regular XOR func
785          * assumes that the targbuf is a full SU in size, and examines
786          * the pda associated with the buffer to decide where within
787          * the buffer to XOR the data, whereas the simple XOR func
788          * just XORs the data into the start of the buffer.  */
789 	if ((numParityNodes == 2) || ((numDataNodes == 1)
790 		&& (asmap->totalSectorsAccessed <
791 		    raidPtr->Layout.sectorsPerStripeUnit))) {
792 		func = pfuncs->simple;
793 		undoFunc = rf_NullNodeUndoFunc;
794 		name = pfuncs->SimpleName;
795 		if (qfuncs) {
796 			qfunc = qfuncs->simple;
797 			qname = qfuncs->SimpleName;
798 		} else {
799 			qfunc = NULL;
800 			qname = NULL;
801 		}
802 	} else {
803 		func = pfuncs->regular;
804 		undoFunc = rf_NullNodeUndoFunc;
805 		name = pfuncs->RegularName;
806 		if (qfuncs) {
807 			qfunc = qfuncs->regular;
808 			qname = qfuncs->RegularName;
809 		} else {
810 			qfunc = NULL;
811 			qname = NULL;
812 		}
813 	}
814 	/*
815          * Initialize the xor nodes: params are {pda,buf}
816          * from {Rod,Wnd,Rop} nodes, and raidPtr
817          */
818 	if (numParityNodes == 2) {
819 		/* double-xor case */
820 		tmpxorNode = xorNodes;
821 		tmpreadDataNode = readDataNodes;
822 		tmpreadParityNode = readParityNodes;
823 		tmpwriteDataNode = writeDataNodes;
824 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
825 		tmpqNode = qNodes;
826 		tmpreadQNode = readQNodes;
827 #endif
828 		for (i = 0; i < numParityNodes; i++) {
829 			/* note: no wakeup func for xor */
830 			rf_InitNode(tmpxorNode, rf_wait, RF_FALSE, func,
831 				    undoFunc, NULL, 1,
832 				    (numDataNodes + numParityNodes),
833 				    7, 1, dag_h, name, allocList);
834 			tmpxorNode->flags |= RF_DAGNODE_FLAG_YIELD;
835 			tmpxorNode->params[0] = tmpreadDataNode->params[0];
836 			tmpxorNode->params[1] = tmpreadDataNode->params[1];
837 			tmpxorNode->params[2] = tmpreadParityNode->params[0];
838 			tmpxorNode->params[3] = tmpreadParityNode->params[1];
839 			tmpxorNode->params[4] = tmpwriteDataNode->params[0];
840 			tmpxorNode->params[5] = tmpwriteDataNode->params[1];
841 			tmpxorNode->params[6].p = raidPtr;
842 			/* use old parity buf as target buf */
843 			tmpxorNode->results[0] = tmpreadParityNode->params[1].p;
844 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
845 			if (nfaults == 2) {
846 				/* note: no wakeup func for qor */
847 				rf_InitNode(tmpqNode, rf_wait, RF_FALSE,
848 					    qfunc, undoFunc, NULL, 1,
849 					    (numDataNodes + numParityNodes),
850 					    7, 1, dag_h, qname, allocList);
851 				tmpqNode->params[0] = tmpreadDataNode->params[0];
852 				tmpqNode->params[1] = tmpreadDataNode->params[1];
853 				tmpqNode->params[2] = tmpreadQNode->params[0];
854 				tmpqNode->params[3] = tmpreadQNode->params[1];
855 				tmpqNode->params[4] = tmpwriteDataNode->params[0];
856 				tmpqNode->params[5] = tmpwriteDataNode->params[1];
857 				tmpqNode->params[6].p = raidPtr;
858 				/* use old Q buf as target buf */
859 				tmpqNode->results[0] = tmpreadQNode->params[1].p;
860 				tmpqNode = tmpqNode->list_next;
861 				tmpreadQNode = tmpreadQNode->list_next;
862 			}
863 #endif
864 			tmpxorNode = tmpxorNode->list_next;
865 			tmpreadDataNode = tmpreadDataNode->list_next;
866 			tmpreadParityNode = tmpreadParityNode->list_next;
867 			tmpwriteDataNode = tmpwriteDataNode->list_next;
868 		}
869 	} else {
870 		/* there is only one xor node in this case */
871 		rf_InitNode(xorNodes, rf_wait, RF_FALSE, func,
872 			    undoFunc, NULL, 1, (numDataNodes + numParityNodes),
873 			    (2 * (numDataNodes + numDataNodes + 1) + 1), 1,
874 			    dag_h, name, allocList);
875 		xorNodes->flags |= RF_DAGNODE_FLAG_YIELD;
876 		tmpreadDataNode = readDataNodes;
877 		for (i = 0; i < numDataNodes; i++) { /* used to be"numDataNodes + 1" until we factored
878 							out the "+1" into the "deal with Rop separately below */
879 			/* set up params related to Rod nodes */
880 			xorNodes->params[2 * i + 0] = tmpreadDataNode->params[0];	/* pda */
881 			xorNodes->params[2 * i + 1] = tmpreadDataNode->params[1];	/* buffer ptr */
882 			tmpreadDataNode = tmpreadDataNode->list_next;
883 		}
884 		/* deal with Rop separately */
885 		xorNodes->params[2 * numDataNodes + 0] = readParityNodes->params[0];    /* pda */
886 		xorNodes->params[2 * numDataNodes + 1] = readParityNodes->params[1];    /* buffer ptr */
887 
888 		tmpwriteDataNode = writeDataNodes;
889 		for (i = 0; i < numDataNodes; i++) {
890 			/* set up params related to Wnd and Wnp nodes */
891 			xorNodes->params[2 * (numDataNodes + 1 + i) + 0] =	/* pda */
892 			    tmpwriteDataNode->params[0];
893 			xorNodes->params[2 * (numDataNodes + 1 + i) + 1] =	/* buffer ptr */
894 			    tmpwriteDataNode->params[1];
895 			tmpwriteDataNode = tmpwriteDataNode->list_next;
896 		}
897 		/* xor node needs to get at RAID information */
898 		xorNodes->params[2 * (numDataNodes + numDataNodes + 1)].p = raidPtr;
899 		xorNodes->results[0] = readParityNodes->params[1].p;
900 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
901 		if (nfaults == 2) {
902 			rf_InitNode(qNodes, rf_wait, RF_FALSE, qfunc,
903 				    undoFunc, NULL, 1,
904 				    (numDataNodes + numParityNodes),
905 				    (2 * (numDataNodes + numDataNodes + 1) + 1), 1,
906 				    dag_h, qname, allocList);
907 			tmpreadDataNode = readDataNodes;
908 			for (i = 0; i < numDataNodes; i++) {
909 				/* set up params related to Rod */
910 				qNodes->params[2 * i + 0] = tmpreadDataNode->params[0];	/* pda */
911 				qNodes->params[2 * i + 1] = tmpreadDataNode->params[1];	/* buffer ptr */
912 				tmpreadDataNode = tmpreadDataNode->list_next;
913 			}
914 			/* and read old q */
915 			qNodes->params[2 * numDataNodes + 0] =	/* pda */
916 			    readQNodes->params[0];
917 			qNodes->params[2 * numDataNodes + 1] =	/* buffer ptr */
918 			    readQNodes->params[1];
919 			tmpwriteDataNode = writeDataNodes;
920 			for (i = 0; i < numDataNodes; i++) {
921 				/* set up params related to Wnd nodes */
922 				qNodes->params[2 * (numDataNodes + 1 + i) + 0] =	/* pda */
923 				    tmpwriteDataNode->params[0];
924 				qNodes->params[2 * (numDataNodes + 1 + i) + 1] =	/* buffer ptr */
925 				    tmpwriteDataNode->params[1];
926 				tmpwriteDataNode = tmpwriteDataNode->list_next;
927 			}
928 			/* xor node needs to get at RAID information */
929 			qNodes->params[2 * (numDataNodes + numDataNodes + 1)].p = raidPtr;
930 			qNodes->results[0] = readQNodes->params[1].p;
931 		}
932 #endif
933 	}
934 
935 	/* initialize nodes which write new parity (Wnp) */
936 	pda = asmap->parityInfo;
937 	tmpwriteParityNode = writeParityNodes;
938 	tmpxorNode = xorNodes;
939 	for (i = 0; i < numParityNodes; i++) {
940 		rf_InitNode(tmpwriteParityNode, rf_wait, RF_FALSE,
941 			    rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
942 			    rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
943 			    "Wnp", allocList);
944 		RF_ASSERT(pda != NULL);
945 		tmpwriteParityNode->params[0].p = pda;	/* param 1 (bufPtr)
946 				  			 * filled in by xor node */
947 		tmpwriteParityNode->params[1].p = tmpxorNode->results[0];	/* buffer pointer for
948 				  						 * parity write
949 				  						 * operation */
950 		tmpwriteParityNode->params[2].v = parityStripeID;
951 		tmpwriteParityNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
952 		    which_ru);
953 		pda = pda->next;
954 		tmpwriteParityNode = tmpwriteParityNode->list_next;
955 		tmpxorNode = tmpxorNode->list_next;
956 	}
957 
958 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
959 	/* initialize nodes which write new Q (Wnq) */
960 	if (nfaults == 2) {
961 		pda = asmap->qInfo;
962 		tmpwriteQNode = writeQNodes;
963 		tmpqNode = qNodes;
964 		for (i = 0; i < numParityNodes; i++) {
965 			rf_InitNode(tmpwriteQNode, rf_wait, RF_FALSE,
966 				    rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
967 				    rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
968 				    "Wnq", allocList);
969 			RF_ASSERT(pda != NULL);
970 			tmpwriteQNode->params[0].p = pda;	/* param 1 (bufPtr)
971 								 * filled in by xor node */
972 			tmpwriteQNode->params[1].p = tmpqNode->results[0];	/* buffer pointer for
973 										 * parity write
974 										 * operation */
975 			tmpwriteQNode->params[2].v = parityStripeID;
976 			tmpwriteQNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
977 			    which_ru);
978 			pda = pda->next;
979 			tmpwriteQNode = tmpwriteQNode->list_next;
980 			tmpqNode = tmpqNode->list_next;
981 		}
982 	}
983 #endif
984 	/*
985          * Step 4. connect the nodes.
986          */
987 
988 	/* connect header to block node */
989 	dag_h->succedents[0] = blockNode;
990 
991 	/* connect block node to read old data nodes */
992 	RF_ASSERT(blockNode->numSuccedents == (numDataNodes + (numParityNodes * nfaults)));
993 	tmpreadDataNode = readDataNodes;
994 	for (i = 0; i < numDataNodes; i++) {
995 		blockNode->succedents[i] = tmpreadDataNode;
996 		RF_ASSERT(tmpreadDataNode->numAntecedents == 1);
997 		tmpreadDataNode->antecedents[0] = blockNode;
998 		tmpreadDataNode->antType[0] = rf_control;
999 		tmpreadDataNode = tmpreadDataNode->list_next;
1000 	}
1001 
1002 	/* connect block node to read old parity nodes */
1003 	tmpreadParityNode = readParityNodes;
1004 	for (i = 0; i < numParityNodes; i++) {
1005 		blockNode->succedents[numDataNodes + i] = tmpreadParityNode;
1006 		RF_ASSERT(tmpreadParityNode->numAntecedents == 1);
1007 		tmpreadParityNode->antecedents[0] = blockNode;
1008 		tmpreadParityNode->antType[0] = rf_control;
1009 		tmpreadParityNode = tmpreadParityNode->list_next;
1010 	}
1011 
1012 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
1013 	/* connect block node to read old Q nodes */
1014 	if (nfaults == 2) {
1015 		tmpreadQNode = readQNodes;
1016 		for (i = 0; i < numParityNodes; i++) {
1017 			blockNode->succedents[numDataNodes + numParityNodes + i] = tmpreadQNode;
1018 			RF_ASSERT(tmpreadQNode->numAntecedents == 1);
1019 			tmpreadQNode->antecedents[0] = blockNode;
1020 			tmpreadQNode->antType[0] = rf_control;
1021 			tmpreadQNode = tmpreadQNode->list_next;
1022 		}
1023 	}
1024 #endif
1025 	/* connect read old data nodes to xor nodes */
1026 	tmpreadDataNode = readDataNodes;
1027 	for (i = 0; i < numDataNodes; i++) {
1028 		RF_ASSERT(tmpreadDataNode->numSuccedents == (nfaults * numParityNodes));
1029 		tmpxorNode = xorNodes;
1030 		for (j = 0; j < numParityNodes; j++) {
1031 			RF_ASSERT(tmpxorNode->numAntecedents == numDataNodes + numParityNodes);
1032 			tmpreadDataNode->succedents[j] = tmpxorNode;
1033 			tmpxorNode->antecedents[i] = tmpreadDataNode;
1034 			tmpxorNode->antType[i] = rf_trueData;
1035 			tmpxorNode = tmpxorNode->list_next;
1036 		}
1037 		tmpreadDataNode = tmpreadDataNode->list_next;
1038 	}
1039 
1040 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
1041 	/* connect read old data nodes to q nodes */
1042 	if (nfaults == 2) {
1043 		tmpreadDataNode = readDataNodes;
1044 		for (i = 0; i < numDataNodes; i++) {
1045 			tmpqNode = qNodes;
1046 			for (j = 0; j < numParityNodes; j++) {
1047 				RF_ASSERT(tmpqNode->numAntecedents == numDataNodes + numParityNodes);
1048 				tmpreadDataNode->succedents[numParityNodes + j] = tmpqNode;
1049 				tmpqNode->antecedents[i] = tmpreadDataNode;
1050 				tmpqNode->antType[i] = rf_trueData;
1051 				tmpqNode = tmpqNode->list_next;
1052 			}
1053 			tmpreadDataNode = tmpreadDataNode->list_next;
1054 		}
1055 	}
1056 #endif
1057 	/* connect read old parity nodes to xor nodes */
1058 	tmpreadParityNode = readParityNodes;
1059 	for (i = 0; i < numParityNodes; i++) {
1060 		RF_ASSERT(tmpreadParityNode->numSuccedents == numParityNodes);
1061 		tmpxorNode = xorNodes;
1062 		for (j = 0; j < numParityNodes; j++) {
1063 			tmpreadParityNode->succedents[j] = tmpxorNode;
1064 			tmpxorNode->antecedents[numDataNodes + i] = tmpreadParityNode;
1065 			tmpxorNode->antType[numDataNodes + i] = rf_trueData;
1066 			tmpxorNode = tmpxorNode->list_next;
1067 		}
1068 		tmpreadParityNode = tmpreadParityNode->list_next;
1069 	}
1070 
1071 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
1072 	/* connect read old q nodes to q nodes */
1073 	if (nfaults == 2) {
1074 		tmpreadParityNode = readParityNodes;
1075 		tmpreadQNode = readQNodes;
1076 		for (i = 0; i < numParityNodes; i++) {
1077 			RF_ASSERT(tmpreadParityNode->numSuccedents == numParityNodes);
1078 			tmpqNode = qNodes;
1079 			for (j = 0; j < numParityNodes; j++) {
1080 				tmpreadQNode->succedents[j] = tmpqNode;
1081 				tmpqNode->antecedents[numDataNodes + i] = tmpreadQNode;
1082 				tmpqNode->antType[numDataNodes + i] = rf_trueData;
1083 				tmpqNode = tmpqNode->list_next;
1084 			}
1085 			tmpreadParityNode = tmpreadParityNode->list_next;
1086 			tmpreadQNode = tmpreadQNode->list_next;
1087 		}
1088 	}
1089 #endif
1090 	/* connect xor nodes to commit node */
1091 	RF_ASSERT(commitNode->numAntecedents == (nfaults * numParityNodes));
1092 	tmpxorNode = xorNodes;
1093 	for (i = 0; i < numParityNodes; i++) {
1094 		RF_ASSERT(tmpxorNode->numSuccedents == 1);
1095 		tmpxorNode->succedents[0] = commitNode;
1096 		commitNode->antecedents[i] = tmpxorNode;
1097 		commitNode->antType[i] = rf_control;
1098 		tmpxorNode = tmpxorNode->list_next;
1099 	}
1100 
1101 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
1102 	/* connect q nodes to commit node */
1103 	if (nfaults == 2) {
1104 		tmpqNode = qNodes;
1105 		for (i = 0; i < numParityNodes; i++) {
1106 			RF_ASSERT(tmpqNode->numSuccedents == 1);
1107 			tmpqNode->succedents[0] = commitNode;
1108 			commitNode->antecedents[i + numParityNodes] = tmpqNode;
1109 			commitNode->antType[i + numParityNodes] = rf_control;
1110 			tmpqNode = tmpqNode->list_next;
1111 		}
1112 	}
1113 #endif
1114 	/* connect commit node to write nodes */
1115 	RF_ASSERT(commitNode->numSuccedents == (numDataNodes + (nfaults * numParityNodes)));
1116 	tmpwriteDataNode = writeDataNodes;
1117 	for (i = 0; i < numDataNodes; i++) {
1118 		RF_ASSERT(tmpwriteDataNode->numAntecedents == 1);
1119 		commitNode->succedents[i] = tmpwriteDataNode;
1120 		tmpwriteDataNode->antecedents[0] = commitNode;
1121 		tmpwriteDataNode->antType[0] = rf_trueData;
1122 		tmpwriteDataNode = tmpwriteDataNode->list_next;
1123 	}
1124 	tmpwriteParityNode = writeParityNodes;
1125 	for (i = 0; i < numParityNodes; i++) {
1126 		RF_ASSERT(tmpwriteParityNode->numAntecedents == 1);
1127 		commitNode->succedents[i + numDataNodes] = tmpwriteParityNode;
1128 		tmpwriteParityNode->antecedents[0] = commitNode;
1129 		tmpwriteParityNode->antType[0] = rf_trueData;
1130 		tmpwriteParityNode = tmpwriteParityNode->list_next;
1131 	}
1132 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
1133 	if (nfaults == 2) {
1134 		tmpwriteQNode = writeQNodes;
1135 		for (i = 0; i < numParityNodes; i++) {
1136 			RF_ASSERT(tmpwriteQNode->numAntecedents == 1);
1137 			commitNode->succedents[i + numDataNodes + numParityNodes] = tmpwriteQNode;
1138 			tmpwriteQNode->antecedents[0] = commitNode;
1139 			tmpwriteQNode->antType[0] = rf_trueData;
1140 			tmpwriteQNode = tmpwriteQNode->list_next;
1141 		}
1142 	}
1143 #endif
1144 	RF_ASSERT(termNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes)));
1145 	RF_ASSERT(termNode->numSuccedents == 0);
1146 	tmpwriteDataNode = writeDataNodes;
1147 	for (i = 0; i < numDataNodes; i++) {
1148 		/* connect write new data nodes to term node */
1149 		RF_ASSERT(tmpwriteDataNode->numSuccedents == 1);
1150 		RF_ASSERT(termNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes)));
1151 		tmpwriteDataNode->succedents[0] = termNode;
1152 		termNode->antecedents[i] = tmpwriteDataNode;
1153 		termNode->antType[i] = rf_control;
1154 		tmpwriteDataNode = tmpwriteDataNode->list_next;
1155 	}
1156 
1157 	tmpwriteParityNode = writeParityNodes;
1158 	for (i = 0; i < numParityNodes; i++) {
1159 		RF_ASSERT(tmpwriteParityNode->numSuccedents == 1);
1160 		tmpwriteParityNode->succedents[0] = termNode;
1161 		termNode->antecedents[numDataNodes + i] = tmpwriteParityNode;
1162 		termNode->antType[numDataNodes + i] = rf_control;
1163 		tmpwriteParityNode = tmpwriteParityNode->list_next;
1164 	}
1165 
1166 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
1167 	if (nfaults == 2) {
1168 		tmpwriteQNode = writeQNodes;
1169 		for (i = 0; i < numParityNodes; i++) {
1170 			RF_ASSERT(tmpwriteQNode->numSuccedents == 1);
1171 			tmpwriteQNode->succedents[0] = termNode;
1172 			termNode->antecedents[numDataNodes + numParityNodes + i] = tmpwriteQNode;
1173 			termNode->antType[numDataNodes + numParityNodes + i] = rf_control;
1174 			tmpwriteQNode = tmpwriteQNode->list_next;
1175 		}
1176 	}
1177 #endif
1178 }
1179 
1180 
1181 /******************************************************************************
1182  * create a write graph (fault-free or degraded) for RAID level 1
1183  *
1184  * Hdr -> Commit -> Wpd -> Nil -> Trm
1185  *               -> Wsd ->
1186  *
1187  * The "Wpd" node writes data to the primary copy in the mirror pair
1188  * The "Wsd" node writes data to the secondary copy in the mirror pair
1189  *
1190  * Parameters:  raidPtr   - description of the physical array
1191  *              asmap     - logical & physical addresses for this access
1192  *              bp        - buffer ptr (holds write data)
1193  *              flags     - general flags (e.g. disk locking)
1194  *              allocList - list of memory allocated in DAG creation
1195  *****************************************************************************/
1196 
1197 void
rf_CreateRaidOneWriteDAG(RF_Raid_t * raidPtr,RF_AccessStripeMap_t * asmap,RF_DagHeader_t * dag_h,void * bp,RF_RaidAccessFlags_t flags,RF_AllocListElem_t * allocList)1198 rf_CreateRaidOneWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
1199 			 RF_DagHeader_t *dag_h, void *bp,
1200 			 RF_RaidAccessFlags_t flags,
1201 			 RF_AllocListElem_t *allocList)
1202 {
1203 	RF_DagNode_t *unblockNode, *termNode, *commitNode;
1204 	RF_DagNode_t *wndNode, *wmirNode;
1205 	RF_DagNode_t *tmpNode, *tmpwndNode, *tmpwmirNode;
1206 	int     nWndNodes, nWmirNodes, i;
1207 	RF_ReconUnitNum_t which_ru;
1208 	RF_PhysDiskAddr_t *pda, *pdaP;
1209 	RF_StripeNum_t parityStripeID;
1210 
1211 	parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout),
1212 	    asmap->raidAddress, &which_ru);
1213 #if RF_DEBUG_DAG
1214 	if (rf_dagDebug) {
1215 		printf("[Creating RAID level 1 write DAG]\n");
1216 	}
1217 #endif
1218 	dag_h->creator = "RaidOneWriteDAG";
1219 
1220 	/* 2 implies access not SU aligned */
1221 	nWmirNodes = (asmap->parityInfo->next) ? 2 : 1;
1222 	nWndNodes = (asmap->physInfo->next) ? 2 : 1;
1223 
1224 	/* alloc the Wnd nodes and the Wmir node */
1225 	if (asmap->numDataFailed == 1)
1226 		nWndNodes--;
1227 	if (asmap->numParityFailed == 1)
1228 		nWmirNodes--;
1229 
1230 	/* total number of nodes = nWndNodes + nWmirNodes + (commit + unblock
1231 	 * + terminator) */
1232 	for (i = 0; i < nWndNodes; i++) {
1233 		tmpNode = rf_AllocDAGNode(raidPtr);
1234 		tmpNode->list_next = dag_h->nodes;
1235 		dag_h->nodes = tmpNode;
1236 	}
1237 	wndNode = dag_h->nodes;
1238 
1239 	for (i = 0; i < nWmirNodes; i++) {
1240 		tmpNode = rf_AllocDAGNode(raidPtr);
1241 		tmpNode->list_next = dag_h->nodes;
1242 		dag_h->nodes = tmpNode;
1243 	}
1244 	wmirNode = dag_h->nodes;
1245 
1246 	commitNode = rf_AllocDAGNode(raidPtr);
1247 	commitNode->list_next = dag_h->nodes;
1248 	dag_h->nodes = commitNode;
1249 
1250 	unblockNode = rf_AllocDAGNode(raidPtr);
1251 	unblockNode->list_next = dag_h->nodes;
1252 	dag_h->nodes = unblockNode;
1253 
1254 	termNode = rf_AllocDAGNode(raidPtr);
1255 	termNode->list_next = dag_h->nodes;
1256 	dag_h->nodes = termNode;
1257 
1258 	/* this dag can commit immediately */
1259 	dag_h->numCommitNodes = 1;
1260 	dag_h->numCommits = 0;
1261 	dag_h->numSuccedents = 1;
1262 
1263 	/* initialize the commit, unblock, and term nodes */
1264 	rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc,
1265 		    rf_NullNodeUndoFunc, NULL, (nWndNodes + nWmirNodes),
1266 		    0, 0, 0, dag_h, "Cmt", allocList);
1267 	rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc,
1268 		    rf_NullNodeUndoFunc, NULL, 1, (nWndNodes + nWmirNodes),
1269 		    0, 0, dag_h, "Nil", allocList);
1270 	rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc,
1271 		    rf_TerminateUndoFunc, NULL, 0, 1, 0, 0,
1272 		    dag_h, "Trm", allocList);
1273 
1274 	/* initialize the wnd nodes */
1275 	if (nWndNodes > 0) {
1276 		pda = asmap->physInfo;
1277 		tmpwndNode = wndNode;
1278 		for (i = 0; i < nWndNodes; i++) {
1279 			rf_InitNode(tmpwndNode, rf_wait, RF_FALSE,
1280 				    rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
1281 				    rf_GenericWakeupFunc, 1, 1, 4, 0,
1282 				    dag_h, "Wpd", allocList);
1283 			RF_ASSERT(pda != NULL);
1284 			tmpwndNode->params[0].p = pda;
1285 			tmpwndNode->params[1].p = pda->bufPtr;
1286 			tmpwndNode->params[2].v = parityStripeID;
1287 			tmpwndNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
1288 			pda = pda->next;
1289 			tmpwndNode = tmpwndNode->list_next;
1290 		}
1291 		RF_ASSERT(pda == NULL);
1292 	}
1293 	/* initialize the mirror nodes */
1294 	if (nWmirNodes > 0) {
1295 		pda = asmap->physInfo;
1296 		pdaP = asmap->parityInfo;
1297 		tmpwmirNode = wmirNode;
1298 		for (i = 0; i < nWmirNodes; i++) {
1299 			rf_InitNode(tmpwmirNode, rf_wait, RF_FALSE,
1300 				    rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
1301 				    rf_GenericWakeupFunc, 1, 1, 4, 0,
1302 				    dag_h, "Wsd", allocList);
1303 			RF_ASSERT(pda != NULL);
1304 			tmpwmirNode->params[0].p = pdaP;
1305 			tmpwmirNode->params[1].p = pda->bufPtr;
1306 			tmpwmirNode->params[2].v = parityStripeID;
1307 			tmpwmirNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, which_ru);
1308 			pda = pda->next;
1309 			pdaP = pdaP->next;
1310 			tmpwmirNode = tmpwmirNode->list_next;
1311 		}
1312 		RF_ASSERT(pda == NULL);
1313 		RF_ASSERT(pdaP == NULL);
1314 	}
1315 	/* link the header node to the commit node */
1316 	RF_ASSERT(dag_h->numSuccedents == 1);
1317 	RF_ASSERT(commitNode->numAntecedents == 0);
1318 	dag_h->succedents[0] = commitNode;
1319 
1320 	/* link the commit node to the write nodes */
1321 	RF_ASSERT(commitNode->numSuccedents == (nWndNodes + nWmirNodes));
1322 	tmpwndNode = wndNode;
1323 	for (i = 0; i < nWndNodes; i++) {
1324 		RF_ASSERT(tmpwndNode->numAntecedents == 1);
1325 		commitNode->succedents[i] = tmpwndNode;
1326 		tmpwndNode->antecedents[0] = commitNode;
1327 		tmpwndNode->antType[0] = rf_control;
1328 		tmpwndNode = tmpwndNode->list_next;
1329 	}
1330 	tmpwmirNode = wmirNode;
1331 	for (i = 0; i < nWmirNodes; i++) {
1332 		RF_ASSERT(tmpwmirNode->numAntecedents == 1);
1333 		commitNode->succedents[i + nWndNodes] = tmpwmirNode;
1334 		tmpwmirNode->antecedents[0] = commitNode;
1335 		tmpwmirNode->antType[0] = rf_control;
1336 		tmpwmirNode = tmpwmirNode->list_next;
1337 	}
1338 
1339 	/* link the write nodes to the unblock node */
1340 	RF_ASSERT(unblockNode->numAntecedents == (nWndNodes + nWmirNodes));
1341 	tmpwndNode = wndNode;
1342 	for (i = 0; i < nWndNodes; i++) {
1343 		RF_ASSERT(tmpwndNode->numSuccedents == 1);
1344 		tmpwndNode->succedents[0] = unblockNode;
1345 		unblockNode->antecedents[i] = tmpwndNode;
1346 		unblockNode->antType[i] = rf_control;
1347 		tmpwndNode = tmpwndNode->list_next;
1348 	}
1349 	tmpwmirNode = wmirNode;
1350 	for (i = 0; i < nWmirNodes; i++) {
1351 		RF_ASSERT(tmpwmirNode->numSuccedents == 1);
1352 		tmpwmirNode->succedents[0] = unblockNode;
1353 		unblockNode->antecedents[i + nWndNodes] = tmpwmirNode;
1354 		unblockNode->antType[i + nWndNodes] = rf_control;
1355 		tmpwmirNode = tmpwmirNode->list_next;
1356 	}
1357 
1358 	/* link the unblock node to the term node */
1359 	RF_ASSERT(unblockNode->numSuccedents == 1);
1360 	RF_ASSERT(termNode->numAntecedents == 1);
1361 	RF_ASSERT(termNode->numSuccedents == 0);
1362 	unblockNode->succedents[0] = termNode;
1363 	termNode->antecedents[0] = unblockNode;
1364 	termNode->antType[0] = rf_control;
1365 }
1366