xref: /netbsd-src/sys/dev/raidframe/rf_disks.c (revision 3b435a73967be44dfb4a27315acd72bfacde430c)
1 /*	$NetBSD: rf_disks.c,v 1.13 1999/08/14 03:10:03 oster Exp $	*/
2 /*-
3  * Copyright (c) 1999 The NetBSD Foundation, Inc.
4  * All rights reserved.
5  *
6  * This code is derived from software contributed to The NetBSD Foundation
7  * by Greg Oster
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  * 3. All advertising materials mentioning features or use of this software
18  *    must display the following acknowledgement:
19  *        This product includes software developed by the NetBSD
20  *        Foundation, Inc. and its contributors.
21  * 4. Neither the name of The NetBSD Foundation nor the names of its
22  *    contributors may be used to endorse or promote products derived
23  *    from this software without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
26  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
27  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
29  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35  * POSSIBILITY OF SUCH DAMAGE.
36  */
37 
38 /*
39  * Copyright (c) 1995 Carnegie-Mellon University.
40  * All rights reserved.
41  *
42  * Author: Mark Holland
43  *
44  * Permission to use, copy, modify and distribute this software and
45  * its documentation is hereby granted, provided that both the copyright
46  * notice and this permission notice appear in all copies of the
47  * software, derivative works or modified versions, and any portions
48  * thereof, and that both notices appear in supporting documentation.
49  *
50  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
51  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
52  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
53  *
54  * Carnegie Mellon requests users of this software to return to
55  *
56  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
57  *  School of Computer Science
58  *  Carnegie Mellon University
59  *  Pittsburgh PA 15213-3890
60  *
61  * any improvements or extensions that they make and grant Carnegie the
62  * rights to redistribute these changes.
63  */
64 
65 /***************************************************************
66  * rf_disks.c -- code to perform operations on the actual disks
67  ***************************************************************/
68 
69 #include "rf_types.h"
70 #include "rf_raid.h"
71 #include "rf_alloclist.h"
72 #include "rf_utils.h"
73 #include "rf_configure.h"
74 #include "rf_general.h"
75 #include "rf_options.h"
76 
77 #include <sys/types.h>
78 #include <sys/param.h>
79 #include <sys/systm.h>
80 #include <sys/proc.h>
81 #include <sys/ioctl.h>
82 #include <sys/fcntl.h>
83 #include <sys/vnode.h>
84 
85 /* XXX these should be in a header file somewhere */
86 int raidlookup __P((char *, struct proc * p, struct vnode **));
87 int raidwrite_component_label(dev_t, struct vnode *, RF_ComponentLabel_t *);
88 int raidread_component_label(dev_t, struct vnode *, RF_ComponentLabel_t *);
89 void rf_UnconfigureVnodes( RF_Raid_t * );
90 int rf_CheckLabels( RF_Raid_t *, RF_Config_t *);
91 
/*
 * Debug printf wrappers: print only when rf_diskDebug is non-zero.
 * Each expands to a single do { } while (0) statement so that it can be
 * used as the body of an if/else without capturing the following "else".
 */
#define DPRINTF6(a,b,c,d,e,f) \
	do { if (rf_diskDebug) printf(a,b,c,d,e,f); } while (0)
#define DPRINTF7(a,b,c,d,e,f,g) \
	do { if (rf_diskDebug) printf(a,b,c,d,e,f,g); } while (0)
94 
95 /**************************************************************************
96  *
97  * initialize the disks comprising the array
98  *
99  * We want the spare disks to have regular row,col numbers so that we can
100  * easily substitue a spare for a failed disk.  But, the driver code assumes
101  * throughout that the array contains numRow by numCol _non-spare_ disks, so
102  * it's not clear how to fit in the spares.  This is an unfortunate holdover
103  * from raidSim.  The quick and dirty fix is to make row zero bigger than the
104  * rest, and put all the spares in it.  This probably needs to get changed
105  * eventually.
106  *
107  **************************************************************************/
108 
109 int
110 rf_ConfigureDisks( listp, raidPtr, cfgPtr )
111 	RF_ShutdownList_t **listp;
112 	RF_Raid_t *raidPtr;
113 	RF_Config_t *cfgPtr;
114 {
115 	RF_RaidDisk_t **disks;
116 	RF_SectorCount_t min_numblks = (RF_SectorCount_t) 0x7FFFFFFFFFFFLL;
117 	RF_RowCol_t r, c;
118 	int bs, ret;
119 	unsigned i, count, foundone = 0, numFailuresThisRow;
120 	int num_rows_done, num_cols_done;
121 	int force;
122 
123 	num_rows_done = 0;
124 	num_cols_done = 0;
125 	force = cfgPtr->force;
126 
127 	RF_CallocAndAdd(disks, raidPtr->numRow, sizeof(RF_RaidDisk_t *),
128 			(RF_RaidDisk_t **), raidPtr->cleanupList);
129 	if (disks == NULL) {
130 		ret = ENOMEM;
131 		goto fail;
132 	}
133 	raidPtr->Disks = disks;
134 
135 	/* get space for the device-specific stuff... */
136 	RF_CallocAndAdd(raidPtr->raid_cinfo, raidPtr->numRow,
137 	    sizeof(struct raidcinfo *), (struct raidcinfo **),
138 	    raidPtr->cleanupList);
139 	if (raidPtr->raid_cinfo == NULL) {
140 		ret = ENOMEM;
141 		goto fail;
142 	}
143 	for (r = 0; r < raidPtr->numRow; r++) {
144 		numFailuresThisRow = 0;
145 		/* We allocate RF_MAXSPARE on the first row so that we
146 		   have room to do hot-swapping of spares */
147 		RF_CallocAndAdd(disks[r], raidPtr->numCol
148 				+ ((r == 0) ? RF_MAXSPARE : 0),
149 				sizeof(RF_RaidDisk_t), (RF_RaidDisk_t *),
150 				raidPtr->cleanupList);
151 		if (disks[r] == NULL) {
152 			ret = ENOMEM;
153 			goto fail;
154 		}
155 		/* get more space for device specific stuff.. */
156 		RF_CallocAndAdd(raidPtr->raid_cinfo[r],
157 		    raidPtr->numCol + ((r == 0) ? raidPtr->numSpare : 0),
158 		    sizeof(struct raidcinfo), (struct raidcinfo *),
159 		    raidPtr->cleanupList);
160 		if (raidPtr->raid_cinfo[r] == NULL) {
161 			ret = ENOMEM;
162 			goto fail;
163 		}
164 		for (c = 0; c < raidPtr->numCol; c++) {
165 			ret = rf_ConfigureDisk(raidPtr,
166 					       &cfgPtr->devnames[r][c][0],
167 					       &disks[r][c], r, c);
168 			if (ret)
169 				goto fail;
170 
171 			if (disks[r][c].status == rf_ds_optimal) {
172 				raidread_component_label(
173 					 raidPtr->raid_cinfo[r][c].ci_dev,
174 					 raidPtr->raid_cinfo[r][c].ci_vp,
175 					 &raidPtr->raid_cinfo[r][c].ci_label);
176 			}
177 
178 			if (disks[r][c].status != rf_ds_optimal) {
179 				numFailuresThisRow++;
180 			} else {
181 				if (disks[r][c].numBlocks < min_numblks)
182 					min_numblks = disks[r][c].numBlocks;
183 				DPRINTF7("Disk at row %d col %d: dev %s numBlocks %ld blockSize %d (%ld MB)\n",
184 				    r, c, disks[r][c].devname,
185 				    (long int) disks[r][c].numBlocks,
186 				    disks[r][c].blockSize,
187 				    (long int) disks[r][c].numBlocks *
188 					 disks[r][c].blockSize / 1024 / 1024);
189 			}
190 			num_cols_done++;
191 		}
192 		/* XXX fix for n-fault tolerant */
193 		/* XXX this should probably check to see how many failures
194 		   we can handle for this configuration! */
195 		if (numFailuresThisRow > 0)
196 			raidPtr->status[r] = rf_rs_degraded;
197 		num_rows_done++;
198 	}
199 
200 	/* all disks must be the same size & have the same block size, bs must
201 	 * be a power of 2 */
202 	bs = 0;
203 	for (foundone = r = 0; !foundone && r < raidPtr->numRow; r++) {
204 		for (c = 0; !foundone && c < raidPtr->numCol; c++) {
205 			if (disks[r][c].status == rf_ds_optimal) {
206 				bs = disks[r][c].blockSize;
207 				foundone = 1;
208 			}
209 		}
210 	}
211 	if (!foundone) {
212 		RF_ERRORMSG("RAIDFRAME: Did not find any live disks in the array.\n");
213 		ret = EINVAL;
214 		goto fail;
215 	}
216 	for (count = 0, i = 1; i; i <<= 1)
217 		if (bs & i)
218 			count++;
219 	if (count != 1) {
220 		RF_ERRORMSG1("Error: block size on disks (%d) must be a power of 2\n", bs);
221 		ret = EINVAL;
222 		goto fail;
223 	}
224 
225 	if (rf_CheckLabels( raidPtr, cfgPtr )) {
226 		printf("raid%d: There were fatal errors\n", raidPtr->raidid);
227 		if (force != 0) {
228 			printf("raid%d: Fatal errors being ignored.\n",
229 			       raidPtr->raidid);
230 		} else {
231 			ret = EINVAL;
232 			goto fail;
233 		}
234 	}
235 
236 	for (r = 0; r < raidPtr->numRow; r++) {
237 		for (c = 0; c < raidPtr->numCol; c++) {
238 			if (disks[r][c].status == rf_ds_optimal) {
239 				if (disks[r][c].blockSize != bs) {
240 					RF_ERRORMSG2("Error: block size of disk at r %d c %d different from disk at r 0 c 0\n", r, c);
241 					ret = EINVAL;
242 					goto fail;
243 				}
244 				if (disks[r][c].numBlocks != min_numblks) {
245 					RF_ERRORMSG3("WARNING: truncating disk at r %d c %d to %d blocks\n",
246 					    r, c, (int) min_numblks);
247 					disks[r][c].numBlocks = min_numblks;
248 				}
249 			}
250 		}
251 	}
252 
253 	raidPtr->sectorsPerDisk = min_numblks;
254 	raidPtr->logBytesPerSector = ffs(bs) - 1;
255 	raidPtr->bytesPerSector = bs;
256 	raidPtr->sectorMask = bs - 1;
257 	return (0);
258 
259 fail:
260 
261 	rf_UnconfigureVnodes( raidPtr );
262 
263 	return (ret);
264 }
265 
266 
267 /****************************************************************************
268  * set up the data structures describing the spare disks in the array
269  * recall from the above comment that the spare disk descriptors are stored
270  * in row zero, which is specially expanded to hold them.
271  ****************************************************************************/
272 int
273 rf_ConfigureSpareDisks( listp, raidPtr, cfgPtr )
274 	RF_ShutdownList_t ** listp;
275 	RF_Raid_t * raidPtr;
276 	RF_Config_t * cfgPtr;
277 {
278 	int     i, ret;
279 	unsigned int bs;
280 	RF_RaidDisk_t *disks;
281 	int     num_spares_done;
282 
283 	num_spares_done = 0;
284 
285 	/* The space for the spares should have already been allocated by
286 	 * ConfigureDisks() */
287 
288 	disks = &raidPtr->Disks[0][raidPtr->numCol];
289 	for (i = 0; i < raidPtr->numSpare; i++) {
290 		ret = rf_ConfigureDisk(raidPtr, &cfgPtr->spare_names[i][0],
291 				       &disks[i], 0, raidPtr->numCol + i);
292 		if (ret)
293 			goto fail;
294 		if (disks[i].status != rf_ds_optimal) {
295 			RF_ERRORMSG1("Warning: spare disk %s failed TUR\n",
296 				     &cfgPtr->spare_names[i][0]);
297 		} else {
298 			disks[i].status = rf_ds_spare;	/* change status to
299 							 * spare */
300 			DPRINTF6("Spare Disk %d: dev %s numBlocks %ld blockSize %d (%ld MB)\n", i,
301 			    disks[i].devname,
302 			    (long int) disks[i].numBlocks, disks[i].blockSize,
303 			    (long int) disks[i].numBlocks *
304 				 disks[i].blockSize / 1024 / 1024);
305 		}
306 		num_spares_done++;
307 	}
308 
309 	/* check sizes and block sizes on spare disks */
310 	bs = 1 << raidPtr->logBytesPerSector;
311 	for (i = 0; i < raidPtr->numSpare; i++) {
312 		if (disks[i].blockSize != bs) {
313 			RF_ERRORMSG3("Block size of %d on spare disk %s is not the same as on other disks (%d)\n", disks[i].blockSize, disks[i].devname, bs);
314 			ret = EINVAL;
315 			goto fail;
316 		}
317 		if (disks[i].numBlocks < raidPtr->sectorsPerDisk) {
318 			RF_ERRORMSG3("Spare disk %s (%d blocks) is too small to serve as a spare (need %ld blocks)\n",
319 				     disks[i].devname, disks[i].blockSize,
320 				     (long int) raidPtr->sectorsPerDisk);
321 			ret = EINVAL;
322 			goto fail;
323 		} else
324 			if (disks[i].numBlocks > raidPtr->sectorsPerDisk) {
325 				RF_ERRORMSG2("Warning: truncating spare disk %s to %ld blocks\n", disks[i].devname, (long int) raidPtr->sectorsPerDisk);
326 
327 				disks[i].numBlocks = raidPtr->sectorsPerDisk;
328 			}
329 	}
330 
331 	return (0);
332 
333 fail:
334 
335 	/* Release the hold on the main components.  We've failed to allocate
336 	 * a spare, and since we're failing, we need to free things..
337 
338 	 XXX failing to allocate a spare is *not* that big of a deal...
339 	 We *can* survive without it, if need be, esp. if we get hot
340 	 adding working.
341 
342 	 If we don't fail out here, then we need a way to remove this spare...
343 	 that should be easier to do here than if we are "live"...
344 
345 	 */
346 
347 	rf_UnconfigureVnodes( raidPtr );
348 
349 	return (ret);
350 }
351 
352 
353 
354 /* configure a single disk in the array */
355 int
356 rf_ConfigureDisk(raidPtr, buf, diskPtr, row, col)
357 	RF_Raid_t *raidPtr;
358 	char   *buf;
359 	RF_RaidDisk_t *diskPtr;
360 	RF_RowCol_t row;
361 	RF_RowCol_t col;
362 {
363 	char   *p;
364 	int     retcode;
365 
366 	struct partinfo dpart;
367 	struct vnode *vp;
368 	struct vattr va;
369 	struct proc *proc;
370 	int     error;
371 
372 	retcode = 0;
373 	p = rf_find_non_white(buf);
374 	if (p[strlen(p) - 1] == '\n') {
375 		/* strip off the newline */
376 		p[strlen(p) - 1] = '\0';
377 	}
378 	(void) strcpy(diskPtr->devname, p);
379 
380 	proc = raidPtr->engine_thread;
381 
382 	/* Let's start by claiming the component is fine and well... */
383 	diskPtr->status = rf_ds_optimal;
384 
385 	raidPtr->raid_cinfo[row][col].ci_vp = NULL;
386 	raidPtr->raid_cinfo[row][col].ci_dev = NULL;
387 
388 	error = raidlookup(diskPtr->devname, proc, &vp);
389 	if (error) {
390 		printf("raidlookup on device: %s failed!\n", diskPtr->devname);
391 		if (error == ENXIO) {
392 			/* the component isn't there... must be dead :-( */
393 			diskPtr->status = rf_ds_failed;
394 		} else {
395 			return (error);
396 		}
397 	}
398 	if (diskPtr->status == rf_ds_optimal) {
399 
400 		if ((error = VOP_GETATTR(vp, &va, proc->p_ucred, proc)) != 0) {
401 			return (error);
402 		}
403 		error = VOP_IOCTL(vp, DIOCGPART, (caddr_t) & dpart,
404 				  FREAD, proc->p_ucred, proc);
405 		if (error) {
406 			return (error);
407 		}
408 
409 		diskPtr->blockSize = dpart.disklab->d_secsize;
410 
411 		diskPtr->numBlocks = dpart.part->p_size - rf_protectedSectors;
412 
413 		raidPtr->raid_cinfo[row][col].ci_vp = vp;
414 		raidPtr->raid_cinfo[row][col].ci_dev = va.va_rdev;
415 
416 		diskPtr->dev = va.va_rdev;
417 
418 		/* we allow the user to specify that only a fraction of the
419 		 * disks should be used this is just for debug:  it speeds up
420 		 * the parity scan */
421 		diskPtr->numBlocks = diskPtr->numBlocks *
422 			rf_sizePercentage / 100;
423 	}
424 	return (0);
425 }
426 
427 static void rf_print_label_status( RF_Raid_t *, int, int, char *,
428 				  RF_ComponentLabel_t *);
429 
430 static void
431 rf_print_label_status( raidPtr, row, column, dev_name, ci_label )
432 	RF_Raid_t *raidPtr;
433 	int row;
434 	int column;
435 	char *dev_name;
436 	RF_ComponentLabel_t *ci_label;
437 {
438 
439 	printf("raid%d: Component %s being configured at row: %d col: %d\n",
440 	       raidPtr->raidid, dev_name, row, column );
441 	printf("         Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
442 	       ci_label->row, ci_label->column,
443 	       ci_label->num_rows, ci_label->num_columns);
444 	printf("         Version: %d Serial Number: %d Mod Counter: %d\n",
445 	       ci_label->version, ci_label->serial_number,
446 	       ci_label->mod_counter);
447 	printf("         Clean: %s Status: %d\n",
448 	       ci_label->clean ? "Yes" : "No", ci_label->status );
449 }
450 
451 static int rf_check_label_vitals( RF_Raid_t *, int, int, char *,
452 				  RF_ComponentLabel_t *, int, int );
453 static int rf_check_label_vitals( raidPtr, row, column, dev_name, ci_label,
454 				  serial_number, mod_counter )
455 	RF_Raid_t *raidPtr;
456 	int row;
457 	int column;
458 	char *dev_name;
459 	RF_ComponentLabel_t *ci_label;
460 	int serial_number;
461 	int mod_counter;
462 {
463 	int fatal_error = 0;
464 
465 	if (serial_number != ci_label->serial_number) {
466 		printf("%s has a different serial number: %d %d\n",
467 		       dev_name, serial_number, ci_label->serial_number);
468 		fatal_error = 1;
469 	}
470 	if (mod_counter != ci_label->mod_counter) {
471 		printf("%s has a different modfication count: %d %d\n",
472 		       dev_name, mod_counter, ci_label->mod_counter);
473 	}
474 
475 	if (row != ci_label->row) {
476 		printf("Row out of alignment for: %s\n", dev_name);
477 		fatal_error = 1;
478 	}
479 	if (column != ci_label->column) {
480 		printf("Column out of alignment for: %s\n", dev_name);
481 		fatal_error = 1;
482 	}
483 	if (raidPtr->numRow != ci_label->num_rows) {
484 		printf("Number of rows do not match for: %s\n", dev_name);
485 		fatal_error = 1;
486 	}
487 	if (raidPtr->numCol != ci_label->num_columns) {
488 		printf("Number of columns do not match for: %s\n", dev_name);
489 		fatal_error = 1;
490 	}
491 	if (ci_label->clean == 0) {
492 		/* it's not clean, but that's not fatal */
493 		printf("%s is not clean!\n", dev_name);
494 	}
495 	return(fatal_error);
496 }
497 
498 
499 /*
500 
501    rf_CheckLabels() - check all the component labels for consistency.
502    Return an error if there is anything major amiss.
503 
504  */
505 
506 int
507 rf_CheckLabels( raidPtr, cfgPtr )
508 	RF_Raid_t *raidPtr;
509 	RF_Config_t *cfgPtr;
510 {
511 	int r,c;
512 	char *dev_name;
513 	RF_ComponentLabel_t *ci_label;
514 	int serial_number = 0;
515 	int mod_number = 0;
516 	int fatal_error = 0;
517 	int mod_values[4];
518 	int mod_count[4];
519 	int ser_values[4];
520 	int ser_count[4];
521 	int num_ser;
522 	int num_mod;
523 	int i;
524 	int found;
525 	int hosed_row;
526 	int hosed_column;
527 	int too_fatal;
528 	int parity_good;
529 	int force;
530 
531 	hosed_row = -1;
532 	hosed_column = -1;
533 	too_fatal = 0;
534 	force = cfgPtr->force;
535 
536 	/*
537 	   We're going to try to be a little intelligent here.  If one
538 	   component's label is bogus, and we can identify that it's the
539 	   *only* one that's gone, we'll mark it as "failed" and allow
540 	   the configuration to proceed.  This will be the *only* case
541 	   that we'll proceed if there would be (otherwise) fatal errors.
542 
543 	   Basically we simply keep a count of how many components had
544 	   what serial number.  If all but one agree, we simply mark
545 	   the disagreeing component as being failed, and allow
546 	   things to come up "normally".
547 
548 	   We do this first for serial numbers, and then for "mod_counter".
549 
550 	 */
551 
552 	num_ser = 0;
553 	num_mod = 0;
554 	for (r = 0; r < raidPtr->numRow && !fatal_error ; r++) {
555 		for (c = 0; c < raidPtr->numCol; c++) {
556 			ci_label = &raidPtr->raid_cinfo[r][c].ci_label;
557 			found=0;
558 			for(i=0;i<num_ser;i++) {
559 				if (ser_values[i] == ci_label->serial_number) {
560 					ser_count[i]++;
561 					found=1;
562 					break;
563 				}
564 			}
565 			if (!found) {
566 				ser_values[num_ser] = ci_label->serial_number;
567 				ser_count[num_ser] = 1;
568 				num_ser++;
569 				if (num_ser>2) {
570 					fatal_error = 1;
571 					break;
572 				}
573 			}
574 			found=0;
575 			for(i=0;i<num_mod;i++) {
576 				if (mod_values[i] == ci_label->mod_counter) {
577 					mod_count[i]++;
578 					found=1;
579 					break;
580 				}
581 			}
582 			if (!found) {
583 			        mod_values[num_mod] = ci_label->mod_counter;
584 				mod_count[num_mod] = 1;
585 				num_mod++;
586 				if (num_mod>2) {
587 					fatal_error = 1;
588 					break;
589 				}
590 			}
591 		}
592 	}
593 #if DEBUG
594 	printf("raid%d: Summary of serial numbers:\n", raidPtr->raidid);
595 	for(i=0;i<num_ser;i++) {
596 		printf("%d %d\n", ser_values[i], ser_count[i]);
597 	}
598 	printf("raid%d: Summary of mod counters:\n", raidPtr->raidid);
599 	for(i=0;i<num_mod;i++) {
600 		printf("%d %d\n", mod_values[i], mod_count[i]);
601 	}
602 #endif
603 	serial_number = ser_values[0];
604 	if (num_ser == 2) {
605 		if ((ser_count[0] == 1) || (ser_count[1] == 1)) {
606 			/* Locate the maverick component */
607 			if (ser_count[1] > ser_count[0]) {
608 				serial_number = ser_values[1];
609 			}
610 			for (r = 0; r < raidPtr->numRow; r++) {
611 				for (c = 0; c < raidPtr->numCol; c++) {
612 				ci_label = &raidPtr->raid_cinfo[r][c].ci_label;
613 					if (serial_number !=
614 					    ci_label->serial_number) {
615 						hosed_row = r;
616 						hosed_column = c;
617 						break;
618 					}
619 				}
620 			}
621 			printf("Hosed component: %s\n",
622 			       &cfgPtr->devnames[hosed_row][hosed_column][0]);
623 			if (!force) {
624 				/* we'll fail this component, as if there are
625 				   other major errors, we arn't forcing things
626 				   and we'll abort the config anyways */
627 				raidPtr->Disks[hosed_row][hosed_column].status
628 					= rf_ds_failed;
629 				raidPtr->numFailures++;
630 				raidPtr->status[hosed_row] = rf_rs_degraded;
631 			}
632 		} else {
633 			too_fatal = 1;
634 		}
635 		if (cfgPtr->parityConfig == '0') {
636 			/* We've identified two different serial numbers.
637 			   RAID 0 can't cope with that, so we'll punt */
638 			too_fatal = 1;
639 		}
640 
641 	}
642 
643 	/* record the serial number for later.  If we bail later, setting
644 	   this doesn't matter, otherwise we've got the best guess at the
645 	   correct serial number */
646 	raidPtr->serial_number = serial_number;
647 
648 	mod_number = mod_values[0];
649 	if (num_mod == 2) {
650 		if ((mod_count[0] == 1) || (mod_count[1] == 1)) {
651 			/* Locate the maverick component */
652 			if (mod_count[1] > mod_count[0]) {
653 				mod_number = mod_values[1];
654 			} else if (mod_count[1] < mod_count[0]) {
655 				mod_number = mod_values[0];
656 			} else {
657 				/* counts of different modification values
658 				   are the same.   Assume greater value is
659 				   the correct one, all other things
660 				   considered */
661 				if (mod_values[0] > mod_values[1]) {
662 					mod_number = mod_values[0];
663 				} else {
664 					mod_number = mod_values[1];
665 				}
666 
667 			}
668 			for (r = 0; r < raidPtr->numRow && !too_fatal ; r++) {
669 				for (c = 0; c < raidPtr->numCol; c++) {
670 					ci_label = &raidPtr->raid_cinfo[r][c].ci_label;
671 					if (mod_number !=
672 					    ci_label->mod_counter) {
673 						if ( ( hosed_row == r ) &&
674 						     ( hosed_column == c )) {
675 							/* same one.  Can
676 							   deal with it.  */
677 						} else {
678 							hosed_row = r;
679 							hosed_column = c;
680 							if (num_ser != 1) {
681 								too_fatal = 1;
682 								break;
683 							}
684 						}
685 					}
686 				}
687 			}
688 			printf("Hosed component: %s\n",
689 			       &cfgPtr->devnames[hosed_row][hosed_column][0]);
690 			if (!force) {
691 				/* we'll fail this component, as if there are
692 				   other major errors, we arn't forcing things
693 				   and we'll abort the config anyways */
694 				if (raidPtr->Disks[hosed_row][hosed_column].status != rf_ds_failed) {
695 					raidPtr->Disks[hosed_row][hosed_column].status
696 						= rf_ds_failed;
697 					raidPtr->numFailures++;
698 					raidPtr->status[hosed_row] = rf_rs_degraded;
699 				}
700 			}
701 		} else {
702 			too_fatal = 1;
703 		}
704 		if (cfgPtr->parityConfig == '0') {
705 			/* We've identified two different mod counters.
706 			   RAID 0 can't cope with that, so we'll punt */
707 			too_fatal = 1;
708 		}
709 	}
710 
711 	raidPtr->mod_counter = mod_number;
712 
713 	if (too_fatal) {
714 		/* we've had both a serial number mismatch, and a mod_counter
715 		   mismatch -- and they involved two different components!!
716 		   Bail -- make things fail so that the user must force
717 		   the issue... */
718 		hosed_row = -1;
719 		hosed_column = -1;
720 	}
721 
722 	if (num_ser > 2) {
723 		printf("raid%d: Too many different serial numbers!\n",
724 		       raidPtr->raidid);
725 	}
726 
727 	if (num_mod > 2) {
728 		printf("raid%d: Too many different mod counters!\n",
729 		       raidPtr->raidid);
730 	}
731 
732 	/* we start by assuming the parity will be good, and flee from
733 	   that notion at the slightest sign of trouble */
734 
735 	parity_good = RF_RAID_CLEAN;
736 	for (r = 0; r < raidPtr->numRow; r++) {
737 		for (c = 0; c < raidPtr->numCol; c++) {
738 			dev_name = &cfgPtr->devnames[r][c][0];
739 			ci_label = &raidPtr->raid_cinfo[r][c].ci_label;
740 
741 			if ((r == hosed_row) && (c == hosed_column)) {
742 				printf("raid%d: Ignoring %s\n",
743 				       raidPtr->raidid, dev_name);
744 			} else {
745 				rf_print_label_status( raidPtr, r, c,
746 						       dev_name, ci_label );
747 				if (rf_check_label_vitals( raidPtr, r, c,
748 							   dev_name, ci_label,
749 							   serial_number,
750 							   mod_number )) {
751 					fatal_error = 1;
752 				}
753 				if (ci_label->clean != RF_RAID_CLEAN) {
754 					parity_good = RF_RAID_DIRTY;
755 				}
756 			}
757 		}
758 	}
759 	if (fatal_error) {
760 		parity_good = RF_RAID_DIRTY;
761 	}
762 
763 	/* we note the state of the parity */
764 	raidPtr->parity_good = parity_good;
765 
766 	return(fatal_error);
767 }
768 
769 int config_disk_queue(RF_Raid_t *, RF_DiskQueue_t *, RF_RowCol_t,
770 		      RF_RowCol_t, RF_DiskQueueSW_t *,
771 		      RF_SectorCount_t, dev_t, int,
772 		      RF_ShutdownList_t **,
773 		      RF_AllocListElem_t *);
774 int rf_add_hot_spare(RF_Raid_t *, RF_SingleComponent_t *);
775 int
776 rf_add_hot_spare(raidPtr, sparePtr)
777 	RF_Raid_t *raidPtr;
778 	RF_SingleComponent_t *sparePtr;
779 {
780 	RF_RaidDisk_t *disks;
781 	RF_DiskQueue_t *spareQueues;
782 	int ret;
783 	unsigned int bs;
784 	int spare_number;
785 
786 	printf("Just in rf_add_hot_spare: %d\n",raidPtr->numSpare);
787 	printf("Num col: %d\n",raidPtr->numCol);
788 	if (raidPtr->numSpare >= RF_MAXSPARE) {
789 		RF_ERRORMSG1("Too many spares: %d\n", raidPtr->numSpare);
790 		return(EINVAL);
791 	}
792 
793 	RF_LOCK_MUTEX(raidPtr->mutex);
794 
795 	/* the beginning of the spares... */
796 	disks = &raidPtr->Disks[0][raidPtr->numCol];
797 
798 	spare_number = raidPtr->numSpare;
799 
800 	ret = rf_ConfigureDisk(raidPtr, sparePtr->component_name,
801 			       &disks[spare_number], 0,
802 			       raidPtr->numCol + spare_number);
803 
804 	if (ret)
805 		goto fail;
806 	if (disks[spare_number].status != rf_ds_optimal) {
807 		RF_ERRORMSG1("Warning: spare disk %s failed TUR\n",
808 			     sparePtr->component_name);
809 		ret=EINVAL;
810 		goto fail;
811 	} else {
812 		disks[spare_number].status = rf_ds_spare;
813 		DPRINTF6("Spare Disk %d: dev %s numBlocks %ld blockSize %d (%ld MB)\n", spare_number,
814 			 disks[spare_number].devname,
815 			 (long int) disks[spare_number].numBlocks,
816 			 disks[spare_number].blockSize,
817 			 (long int) disks[spare_number].numBlocks *
818 			 disks[spare_number].blockSize / 1024 / 1024);
819 	}
820 
821 
822 	/* check sizes and block sizes on the spare disk */
823 	bs = 1 << raidPtr->logBytesPerSector;
824 	if (disks[spare_number].blockSize != bs) {
825 		RF_ERRORMSG3("Block size of %d on spare disk %s is not the same as on other disks (%d)\n", disks[spare_number].blockSize, disks[spare_number].devname, bs);
826 		ret = EINVAL;
827 		goto fail;
828 	}
829 	if (disks[spare_number].numBlocks < raidPtr->sectorsPerDisk) {
830 		RF_ERRORMSG3("Spare disk %s (%d blocks) is too small to serve as a spare (need %ld blocks)\n",
831 			     disks[spare_number].devname,
832 			     disks[spare_number].blockSize,
833 			     (long int) raidPtr->sectorsPerDisk);
834 		ret = EINVAL;
835 		goto fail;
836 	} else {
837 		if (disks[spare_number].numBlocks >
838 		    raidPtr->sectorsPerDisk) {
839 			RF_ERRORMSG2("Warning: truncating spare disk %s to %ld blocks\n", disks[spare_number].devname,
840 				     (long int) raidPtr->sectorsPerDisk);
841 
842 			disks[spare_number].numBlocks = raidPtr->sectorsPerDisk;
843 		}
844 	}
845 
846 	spareQueues = &raidPtr->Queues[0][raidPtr->numCol];
847 	ret = config_disk_queue( raidPtr, &spareQueues[spare_number],
848 				 0, raidPtr->numCol + spare_number,
849 				 raidPtr->Queues[0][0].qPtr, /* XXX */
850 				 raidPtr->sectorsPerDisk,
851 				 raidPtr->Disks[0][raidPtr->numCol + spare_number].dev,
852 				 raidPtr->Queues[0][0].maxOutstanding, /* XXX */
853 				 &raidPtr->shutdownList,
854 				 raidPtr->cleanupList);
855 
856 
857 	raidPtr->numSpare++;
858 	RF_UNLOCK_MUTEX(raidPtr->mutex);
859 	return (0);
860 
861 fail:
862 	RF_UNLOCK_MUTEX(raidPtr->mutex);
863 	return(ret);
864 }
865 
866 int
867 rf_remove_hot_spare(raidPtr,sparePtr)
868 	RF_Raid_t *raidPtr;
869 	RF_SingleComponent_t *sparePtr;
870 {
871 	int spare_number;
872 
873 
874 	if (raidPtr->numSpare==0) {
875 		printf("No spares to remove!\n");
876 		return(EINVAL);
877 	}
878 
879 	spare_number = sparePtr->column;
880 
881 	return(EINVAL); /* XXX not implemented yet */
882 #if 0
883 	if (spare_number < 0 || spare_number > raidPtr->numSpare) {
884 		return(EINVAL);
885 	}
886 
887 	/* verify that this spare isn't in use... */
888 
889 
890 
891 
892 	/* it's gone.. */
893 
894 	raidPtr->numSpare--;
895 
896 	return(0);
897 #endif
898 }
899 
900 
901