xref: /netbsd-src/sys/dev/raidframe/rf_netbsdkintf.c (revision 404fbe5fb94ca1e054339640cabb2801ce52dd30)
1 /*	$NetBSD: rf_netbsdkintf.c,v 1.251 2008/11/18 14:29:55 ad Exp $	*/
2 /*-
3  * Copyright (c) 1996, 1997, 1998, 2008 The NetBSD Foundation, Inc.
4  * All rights reserved.
5  *
6  * This code is derived from software contributed to The NetBSD Foundation
7  * by Greg Oster; Jason R. Thorpe.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
19  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
20  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
21  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
22  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28  * POSSIBILITY OF SUCH DAMAGE.
29  */
30 
31 /*
32  * Copyright (c) 1990, 1993
33  *      The Regents of the University of California.  All rights reserved.
34  *
35  * This code is derived from software contributed to Berkeley by
36  * the Systems Programming Group of the University of Utah Computer
37  * Science Department.
38  *
39  * Redistribution and use in source and binary forms, with or without
40  * modification, are permitted provided that the following conditions
41  * are met:
42  * 1. Redistributions of source code must retain the above copyright
43  *    notice, this list of conditions and the following disclaimer.
44  * 2. Redistributions in binary form must reproduce the above copyright
45  *    notice, this list of conditions and the following disclaimer in the
46  *    documentation and/or other materials provided with the distribution.
47  * 3. Neither the name of the University nor the names of its contributors
48  *    may be used to endorse or promote products derived from this software
49  *    without specific prior written permission.
50  *
51  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
52  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
53  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
54  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
55  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
56  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
57  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
58  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
59  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
60  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
61  * SUCH DAMAGE.
62  *
63  * from: Utah $Hdr: cd.c 1.6 90/11/28$
64  *
65  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
66  */
67 
68 /*
69  * Copyright (c) 1988 University of Utah.
70  *
71  * This code is derived from software contributed to Berkeley by
72  * the Systems Programming Group of the University of Utah Computer
73  * Science Department.
74  *
75  * Redistribution and use in source and binary forms, with or without
76  * modification, are permitted provided that the following conditions
77  * are met:
78  * 1. Redistributions of source code must retain the above copyright
79  *    notice, this list of conditions and the following disclaimer.
80  * 2. Redistributions in binary form must reproduce the above copyright
81  *    notice, this list of conditions and the following disclaimer in the
82  *    documentation and/or other materials provided with the distribution.
83  * 3. All advertising materials mentioning features or use of this software
84  *    must display the following acknowledgement:
85  *      This product includes software developed by the University of
86  *      California, Berkeley and its contributors.
87  * 4. Neither the name of the University nor the names of its contributors
88  *    may be used to endorse or promote products derived from this software
89  *    without specific prior written permission.
90  *
91  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
92  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
93  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
94  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
95  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
96  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
97  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
98  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
99  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
100  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
101  * SUCH DAMAGE.
102  *
103  * from: Utah $Hdr: cd.c 1.6 90/11/28$
104  *
105  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
106  */
107 
108 /*
109  * Copyright (c) 1995 Carnegie-Mellon University.
110  * All rights reserved.
111  *
112  * Authors: Mark Holland, Jim Zelenka
113  *
114  * Permission to use, copy, modify and distribute this software and
115  * its documentation is hereby granted, provided that both the copyright
116  * notice and this permission notice appear in all copies of the
117  * software, derivative works or modified versions, and any portions
118  * thereof, and that both notices appear in supporting documentation.
119  *
120  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
121  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
122  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
123  *
124  * Carnegie Mellon requests users of this software to return to
125  *
126  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
127  *  School of Computer Science
128  *  Carnegie Mellon University
129  *  Pittsburgh PA 15213-3890
130  *
131  * any improvements or extensions that they make and grant Carnegie the
132  * rights to redistribute these changes.
133  */
134 
135 /***********************************************************
136  *
137  * rf_kintf.c -- the kernel interface routines for RAIDframe
138  *
139  ***********************************************************/
140 
141 #include <sys/cdefs.h>
142 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.251 2008/11/18 14:29:55 ad Exp $");
143 
144 #ifdef _KERNEL_OPT
145 #include "opt_raid_autoconfig.h"
146 #include "raid.h"
147 #endif
148 
149 #include <sys/param.h>
150 #include <sys/errno.h>
151 #include <sys/pool.h>
152 #include <sys/proc.h>
153 #include <sys/queue.h>
154 #include <sys/disk.h>
155 #include <sys/device.h>
156 #include <sys/stat.h>
157 #include <sys/ioctl.h>
158 #include <sys/fcntl.h>
159 #include <sys/systm.h>
160 #include <sys/vnode.h>
161 #include <sys/disklabel.h>
162 #include <sys/conf.h>
163 #include <sys/buf.h>
164 #include <sys/bufq.h>
165 #include <sys/user.h>
166 #include <sys/reboot.h>
167 #include <sys/kauth.h>
168 
169 #include <prop/proplib.h>
170 
171 #include <dev/raidframe/raidframevar.h>
172 #include <dev/raidframe/raidframeio.h>
173 
174 #include "rf_raid.h"
175 #include "rf_copyback.h"
176 #include "rf_dag.h"
177 #include "rf_dagflags.h"
178 #include "rf_desc.h"
179 #include "rf_diskqueue.h"
180 #include "rf_etimer.h"
181 #include "rf_general.h"
182 #include "rf_kintf.h"
183 #include "rf_options.h"
184 #include "rf_driver.h"
185 #include "rf_parityscan.h"
186 #include "rf_threadstuff.h"
187 
188 #ifdef DEBUG
189 int     rf_kdebug_level = 0;
190 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
191 #else				/* DEBUG */
192 #define db1_printf(a) { }
193 #endif				/* DEBUG */
194 
195 static RF_Raid_t **raidPtrs;	/* global raid device descriptors */
196 
197 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
198 RF_DECLARE_STATIC_MUTEX(rf_sparet_wait_mutex)
199 
200 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
201 						 * spare table */
202 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
203 						 * installation process */
204 #endif
205 
206 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
207 
208 /* prototypes */
209 static void KernelWakeupFunc(struct buf *);
210 static void InitBP(struct buf *, struct vnode *, unsigned,
211     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
212     void *, int, struct proc *);
213 static void raidinit(RF_Raid_t *);
214 
215 void raidattach(int);
216 static int raid_match(struct device *, struct cfdata *, void *);
217 static void raid_attach(struct device *, struct device *, void *);
218 static int raid_detach(struct device *, int);
219 
220 dev_type_open(raidopen);
221 dev_type_close(raidclose);
222 dev_type_read(raidread);
223 dev_type_write(raidwrite);
224 dev_type_ioctl(raidioctl);
225 dev_type_strategy(raidstrategy);
226 dev_type_dump(raiddump);
227 dev_type_size(raidsize);
228 
229 const struct bdevsw raid_bdevsw = {
230 	raidopen, raidclose, raidstrategy, raidioctl,
231 	raiddump, raidsize, D_DISK
232 };
233 
234 const struct cdevsw raid_cdevsw = {
235 	raidopen, raidclose, raidread, raidwrite, raidioctl,
236 	nostop, notty, nopoll, nommap, nokqfilter, D_DISK
237 };
238 
239 static struct dkdriver rf_dkdriver = { raidstrategy, minphys };
240 
241 /* XXX Not sure if the following should be replacing the raidPtrs above,
242    or if it should be used in conjunction with that...
243 */
244 
/* Per-unit software state for a raid(4) pseudo-device. */
struct raid_softc {
	struct device *sc_dev;	/* autoconf device handle */
	int     sc_flags;	/* flags (RAIDF_* below) */
	int     sc_cflags;	/* configuration flags */
	uint64_t sc_size;	/* size of the raid device */
	char    sc_xname[20];	/* XXX external name */
	struct disk sc_dkdev;	/* generic disk device info */
	struct bufq_state *buf_queue;	/* used for the device queue */
};
254 /* sc_flags */
255 #define RAIDF_INITED	0x01	/* unit has been initialized */
256 #define RAIDF_WLABEL	0x02	/* label area is writable */
257 #define RAIDF_LABELLING	0x04	/* unit is currently being labelled */
258 #define RAIDF_WANTED	0x40	/* someone is waiting to obtain a lock */
259 #define RAIDF_LOCKED	0x80	/* unit is locked */
260 
261 #define	raidunit(x)	DISKUNIT(x)
262 int numraid = 0;
263 
264 extern struct cfdriver raid_cd;
265 CFATTACH_DECL_NEW(raid, sizeof(struct raid_softc),
266     raid_match, raid_attach, raid_detach, NULL);
267 
268 /*
269  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
270  * Be aware that large numbers can allow the driver to consume a lot of
271  * kernel memory, especially on writes, and in degraded mode reads.
272  *
273  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
274  * a single 64K write will typically require 64K for the old data,
275  * 64K for the old parity, and 64K for the new parity, for a total
276  * of 192K (if the parity buffer is not re-used immediately).
277  * Even it if is used immediately, that's still 128K, which when multiplied
278  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
279  *
280  * Now in degraded mode, for example, a 64K read on the above setup may
281  * require data reconstruction, which will require *all* of the 4 remaining
282  * disks to participate -- 4 * 32K/disk == 128K again.
283  */
284 
285 #ifndef RAIDOUTSTANDING
286 #define RAIDOUTSTANDING   6
287 #endif
288 
289 #define RAIDLABELDEV(dev)	\
290 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
291 
292 /* declared here, and made public, for the benefit of KVM stuff.. */
293 struct raid_softc *raid_softc;
294 
295 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
296 				     struct disklabel *);
297 static void raidgetdisklabel(dev_t);
298 static void raidmakedisklabel(struct raid_softc *);
299 
300 static int raidlock(struct raid_softc *);
301 static void raidunlock(struct raid_softc *);
302 
303 static void rf_markalldirty(RF_Raid_t *);
304 static void rf_set_properties(struct raid_softc *, RF_Raid_t *);
305 
306 void rf_ReconThread(struct rf_recon_req *);
307 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
308 void rf_CopybackThread(RF_Raid_t *raidPtr);
309 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
310 int rf_autoconfig(struct device *self);
311 void rf_buildroothack(RF_ConfigSet_t *);
312 
313 RF_AutoConfig_t *rf_find_raid_components(void);
314 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
315 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
316 static int rf_reasonable_label(RF_ComponentLabel_t *);
317 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
318 int rf_set_autoconfig(RF_Raid_t *, int);
319 int rf_set_rootpartition(RF_Raid_t *, int);
320 void rf_release_all_vps(RF_ConfigSet_t *);
321 void rf_cleanup_config_set(RF_ConfigSet_t *);
322 int rf_have_enough_components(RF_ConfigSet_t *);
323 int rf_auto_config_set(RF_ConfigSet_t *, int *);
324 
325 static int raidautoconfig = 0; /* Debugging, mostly.  Set to 0 to not
326 				  allow autoconfig to take place.
327 				  Note that this is overridden by having
328 				  RAID_AUTOCONFIG as an option in the
329 				  kernel config file.  */
330 
331 struct RF_Pools_s rf_pools;
332 
/*
 * raidattach(num):
 *
 *	Pseudo-device attach routine, called once at boot with the
 *	number of raid units to support.  Allocates the global
 *	raidPtrs[] and raid_softc[] arrays, boots the RAIDframe core,
 *	hooks up the autoconf cfattach, and registers a config
 *	finalizer (rf_autoconfig) that will auto-configure RAID sets
 *	once all real hardware devices have been found.
 */
void
raidattach(int num)
{
	int raidID;
	int i, rc;

#ifdef DEBUG
	printf("raidattach: Asked for %d units\n", num);
#endif

	if (num <= 0) {
#ifdef DIAGNOSTIC
		panic("raidattach: count <= 0");
#endif
		return;
	}
	/* This is where all the initialization stuff gets done. */

	numraid = num;

	/* Make some space for requested number of units... */

	RF_Malloc(raidPtrs, num * sizeof(RF_Raid_t *), (RF_Raid_t **));
	if (raidPtrs == NULL) {
		panic("raidPtrs is NULL!!");
	}

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_mutex_init(&rf_sparet_wait_mutex);

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	for (i = 0; i < num; i++)
		raidPtrs[i] = NULL;
	rc = rf_BootRaidframe();
	if (rc == 0)
		aprint_normal("Kernelized RAIDframe activated\n");
	else
		panic("Serious error booting RAID!!");

	/* put together some datastructures like the CCD device does.. This
	 * lets us lock the device and what-not when it gets opened. */

	raid_softc = (struct raid_softc *)
		malloc(num * sizeof(struct raid_softc),
		       M_RAIDFRAME, M_NOWAIT);
	if (raid_softc == NULL) {
		aprint_error("WARNING: no memory for RAIDframe driver\n");
		return;
	}

	memset(raid_softc, 0, num * sizeof(struct raid_softc));

	for (raidID = 0; raidID < num; raidID++) {
		bufq_alloc(&raid_softc[raidID].buf_queue, "fcfs", 0);

		RF_Malloc(raidPtrs[raidID], sizeof(RF_Raid_t),
			  (RF_Raid_t *));
		if (raidPtrs[raidID] == NULL) {
			aprint_error("WARNING: raidPtrs[%d] is NULL\n", raidID);
			/* Shrink the visible unit count so nothing ever
			   touches the unallocated tail of raidPtrs[]. */
			numraid = raidID;
			return;
		}
	}

	if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
		aprint_error("raidattach: config_cfattach_attach failed?\n");
	}

#ifdef RAID_AUTOCONFIG
	raidautoconfig = 1;
#endif

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	if (config_finalize_register(NULL, rf_autoconfig) != 0)
		aprint_error("WARNING: unable to register RAIDframe finalizer\n");
}
414 
/*
 * rf_autoconfig(self):
 *
 *	config_finalize(9) callback.  On its first invocation (guarded
 *	by the raidautoconfig flag) it scans all disks for RAIDframe
 *	component labels, groups them into configuration sets, and
 *	hands the sets to rf_buildroothack() for actual configuration.
 *	Returns 1 when work was done, 0 on subsequent calls.
 */
int
rf_autoconfig(struct device *self)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets;

	if (raidautoconfig == 0)
		return (0);

	/* XXX This code can only be run once. */
	raidautoconfig = 0;

	/* 1. locate all RAID components on the system */
#ifdef DEBUG
	printf("Searching for RAID components...\n");
#endif
	ac_list = rf_find_raid_components();

	/* 2. Sort them into their respective sets. */
	config_sets = rf_create_auto_sets(ac_list);

	/*
	 * 3. Evaluate each set and configure the valid ones.
	 * This gets done in rf_buildroothack().
	 */
	rf_buildroothack(config_sets);

	return 1;
}
444 
/*
 * rf_buildroothack(config_sets):
 *
 *	Walk the list of candidate configuration sets, auto-configure
 *	those that have enough components and are marked for
 *	autoconfiguration, and release the resources of the rest.
 *	Then try to decide whether one of the configured sets should
 *	become the root device (booted_device): a single rootable set
 *	wins outright; with several candidates we check which set
 *	contains the device the machine actually booted from, and if
 *	that is still ambiguous fall back to asking the user
 *	(RB_ASKNAME).  If the user hardwired a root (rootspec), we
 *	touch nothing.
 */
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int retcode;
	int raidID;
	int rootID;
	int col;
	int num_root;
	char *devname;

	rootID = 0;
	num_root = 0;
	cset = config_sets;
	while(cset != NULL ) {
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure==1) {
			retcode = rf_auto_config_set(cset,&raidID);
			if (!retcode) {
#ifdef DEBUG
				printf("raid%d: configured ok\n", raidID);
#endif
				if (cset->rootable) {
					rootID = raidID;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
#ifdef DEBUG
				printf("Autoconfig failed with code %d for raid%d\n", retcode, raidID);
#endif
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL)
		return;

	/* we found something bootable... */

	if (num_root == 1) {
		booted_device = raid_softc[rootID].sc_dev;
	} else if (num_root > 1) {

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */

		if (booted_device == NULL)
			cpu_rootconf();
		if (booted_device == NULL)
			return;

		/* Re-count, keeping only sets that contain the device
		   we actually booted from. */
		num_root = 0;
		for (raidID = 0; raidID < numraid; raidID++) {
			if (raidPtrs[raidID]->valid == 0)
				continue;

			if (raidPtrs[raidID]->root_partition == 0)
				continue;

			for (col = 0; col < raidPtrs[raidID]->numCol; col++) {
				devname = raidPtrs[raidID]->Disks[col].devname;
				/* skip over the leading "/dev/" so the name
				   can be compared against the device xname */
				devname += sizeof("/dev/") - 1;
				if (strncmp(devname, device_xname(booted_device),
					    strlen(device_xname(booted_device))) != 0)
					continue;
#ifdef DEBUG
				printf("raid%d includes boot device %s\n",
				       raidID, devname);
#endif
				num_root++;
				rootID = raidID;
			}
		}

		if (num_root == 1) {
			booted_device = raid_softc[rootID].sc_dev;
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
545 
546 
547 int
548 raidsize(dev_t dev)
549 {
550 	struct raid_softc *rs;
551 	struct disklabel *lp;
552 	int     part, unit, omask, size;
553 
554 	unit = raidunit(dev);
555 	if (unit >= numraid)
556 		return (-1);
557 	rs = &raid_softc[unit];
558 
559 	if ((rs->sc_flags & RAIDF_INITED) == 0)
560 		return (-1);
561 
562 	part = DISKPART(dev);
563 	omask = rs->sc_dkdev.dk_openmask & (1 << part);
564 	lp = rs->sc_dkdev.dk_label;
565 
566 	if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
567 		return (-1);
568 
569 	if (lp->d_partitions[part].p_fstype != FS_SWAP)
570 		size = -1;
571 	else
572 		size = lp->d_partitions[part].p_size *
573 		    (lp->d_secsize / DEV_BSIZE);
574 
575 	if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
576 		return (-1);
577 
578 	return (size);
579 
580 }
581 
/*
 * raiddump(dev, blkno, va, size):
 *
 *	Crash-dump entry point.  Only RAID 1 sets (one data column plus
 *	one parity column) are supported.  Picks the best live component
 *	to dump to (master, spared master, slave, spared slave, in that
 *	order) and writes through that component's block device, with
 *	blkno adjusted by the partition offset and the
 *	RF_PROTECTED_SECTORS reserved area.
 */
int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	const struct bdevsw *bdev;
	struct disklabel *lp;
	RF_Raid_t *raidPtr;
	daddr_t offset;
	int     part, c, sparecol, j, scol, dumpto;
	int     error = 0;

	if (unit >= numraid)
		return (ENXIO);

	rs = &raid_softc[unit];
	raidPtr = raidPtrs[unit];

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENXIO;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;


	if ((error = raidlock(rs)) != 0)
		return error;

	if (size % DEV_BSIZE != 0) {
		error = EINVAL;
		goto out;
	}

	if (blkno + size / DEV_BSIZE > rs->sc_size) {
		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
		    "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
		    size / DEV_BSIZE, rs->sc_size);
		error = EINVAL;
		goto out;
	}

	part = DISKPART(dev);
	lp = rs->sc_dkdev.dk_label;
	offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one?  Find which column (if
			   any) this spare is standing in for. */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we haven't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);

	/*
	   Note that blkno is relative to this particular partition.
	   By adding the offset of this partition in the RAID
	   set, and also adding RF_PROTECTED_SECTORS, we get a
	   value that is relative to the partition used for the
	   underlying component.
	*/

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno + offset, va, size);

out:
	raidunlock(rs);

	return error;
}
719 /* ARGSUSED */
/*
 * raidopen(dev, flags, fmt, l):
 *
 *	Open a partition of a raid unit.  Takes the per-unit lock,
 *	reads in the disklabel on the first open of a configured set,
 *	refuses non-RAW_PART opens while wedges exist, validates that
 *	the requested partition exists, records the open in the
 *	char/block open masks (which prevents unconfiguration while
 *	open), and marks all components dirty on first open.
 */
int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct disklabel *lp;
	int     part, pmask;
	int     error = 0;

	if (unit >= numraid)
		return (ENXIO);
	rs = &raid_softc[unit];

	if ((error = raidlock(rs)) != 0)
		return (error);
	lp = rs->sc_dkdev.dk_label;

	part = DISKPART(dev);

	/*
	 * If there are wedges, and this is not RAW_PART, then we
	 * need to fail.
	 */
	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
		error = EBUSY;
		goto bad;
	}
	pmask = (1 << part);

	/* First open of a configured set: (re)read the disklabel. */
	if ((rs->sc_flags & RAIDF_INITED) &&
	    (rs->sc_dkdev.dk_openmask == 0))
		raidgetdisklabel(dev);

	/* make sure that this partition exists */

	if (part != RAW_PART) {
		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
		    ((part >= lp->d_npartitions) ||
			(lp->d_partitions[part].p_fstype == FS_UNUSED))) {
			error = ENXIO;
			goto bad;
		}
	}
	/* Prevent this unit from being unconfigured while open. */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask |= pmask;
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask |= pmask;
		break;
	}

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		 have done a configure before this.  I DO NOT WANT TO BE
		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		 THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty( raidPtrs[unit] );
	}


	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

bad:
	raidunlock(rs);

	return (error);


}
799 /* ARGSUSED */
/*
 * raidclose(dev, flags, fmt, l):
 *
 *	Close a partition of a raid unit.  Clears the corresponding
 *	bit in the char/block open masks; on the last close of a
 *	configured set the component labels get their final update,
 *	and if the system is shutting down the set is also shut down
 *	and the autoconf device detached.
 *
 *	NOTE(review): errors from rf_Shutdown()/config_detach() are
 *	stored in "error" but the function always returns 0 -- confirm
 *	that silently discarding those errors is intentional.
 */
int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int     unit = raidunit(dev);
	struct cfdata *cf;
	struct raid_softc *rs;
	int     error = 0;
	int     part;

	if (unit >= numraid)
		return (ENXIO);
	rs = &raid_softc[unit];

	if ((error = raidlock(rs)) != 0)
		return (error);

	part = DISKPART(dev);

	/* ...that much closer to allowing unconfiguration... */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask &= ~(1 << part);
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
		break;
	}
	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* Last one... device is not unconfigured yet.
		   Device shutdown has taken care of setting the
		   clean bits if RAIDF_INITED is not set
		   mark things as clean... */

		rf_update_component_labels(raidPtrs[unit],
						 RF_FINAL_COMPONENT_UPDATE);
		if (doing_shutdown) {
			/* last one, and we're going down, so
			   lights out for this RAID set too. */
			error = rf_Shutdown(raidPtrs[unit]);

			/* It's no longer initialized... */
			rs->sc_flags &= ~RAIDF_INITED;

			/* detach the device */

			cf = device_cfdata(rs->sc_dev);
			error = config_detach(rs->sc_dev, DETACH_QUIET);
			free(cf, M_RAIDFRAME);

			/* Detach the disk. */
			disk_detach(&rs->sc_dkdev);
			disk_destroy(&rs->sc_dkdev);
		}
	}

	raidunlock(rs);
	return (0);

}
864 
865 void
866 raidstrategy(struct buf *bp)
867 {
868 	int s;
869 
870 	unsigned int raidID = raidunit(bp->b_dev);
871 	RF_Raid_t *raidPtr;
872 	struct raid_softc *rs = &raid_softc[raidID];
873 	int     wlabel;
874 
875 	if ((rs->sc_flags & RAIDF_INITED) ==0) {
876 		bp->b_error = ENXIO;
877 		goto done;
878 	}
879 	if (raidID >= numraid || !raidPtrs[raidID]) {
880 		bp->b_error = ENODEV;
881 		goto done;
882 	}
883 	raidPtr = raidPtrs[raidID];
884 	if (!raidPtr->valid) {
885 		bp->b_error = ENODEV;
886 		goto done;
887 	}
888 	if (bp->b_bcount == 0) {
889 		db1_printf(("b_bcount is zero..\n"));
890 		goto done;
891 	}
892 
893 	/*
894 	 * Do bounds checking and adjust transfer.  If there's an
895 	 * error, the bounds check will flag that for us.
896 	 */
897 
898 	wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
899 	if (DISKPART(bp->b_dev) == RAW_PART) {
900 		uint64_t size; /* device size in DEV_BSIZE unit */
901 
902 		if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
903 			size = raidPtr->totalSectors <<
904 			    (raidPtr->logBytesPerSector - DEV_BSHIFT);
905 		} else {
906 			size = raidPtr->totalSectors >>
907 			    (DEV_BSHIFT - raidPtr->logBytesPerSector);
908 		}
909 		if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
910 			goto done;
911 		}
912 	} else {
913 		if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
914 			db1_printf(("Bounds check failed!!:%d %d\n",
915 				(int) bp->b_blkno, (int) wlabel));
916 			goto done;
917 		}
918 	}
919 	s = splbio();
920 
921 	bp->b_resid = 0;
922 
923 	/* stuff it onto our queue */
924 	BUFQ_PUT(rs->buf_queue, bp);
925 
926 	/* scheduled the IO to happen at the next convenient time */
927 	wakeup(&(raidPtrs[raidID]->iodone));
928 
929 	splx(s);
930 	return;
931 
932 done:
933 	bp->b_resid = bp->b_bcount;
934 	biodone(bp);
935 }
936 /* ARGSUSED */
937 int
938 raidread(dev_t dev, struct uio *uio, int flags)
939 {
940 	int     unit = raidunit(dev);
941 	struct raid_softc *rs;
942 
943 	if (unit >= numraid)
944 		return (ENXIO);
945 	rs = &raid_softc[unit];
946 
947 	if ((rs->sc_flags & RAIDF_INITED) == 0)
948 		return (ENXIO);
949 
950 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
951 
952 }
953 /* ARGSUSED */
954 int
955 raidwrite(dev_t dev, struct uio *uio, int flags)
956 {
957 	int     unit = raidunit(dev);
958 	struct raid_softc *rs;
959 
960 	if (unit >= numraid)
961 		return (ENXIO);
962 	rs = &raid_softc[unit];
963 
964 	if ((rs->sc_flags & RAIDF_INITED) == 0)
965 		return (ENXIO);
966 
967 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
968 
969 }
970 
/*
 * raidioctl: ioctl entry point for the RAIDframe pseudo-device.
 *
 * Handles both the RAIDframe-specific RAIDFRAME_* commands (configure,
 * shutdown, component label manipulation, reconstruction/parity/copyback
 * control and status) and the standard disk ioctls (disklabel, wedges).
 * Returns 0 on success or an errno value.
 */
int
raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
	int     unit = raidunit(dev);
	int     error = 0;
	int     part, pmask;
	struct cfdata *cf;
	struct raid_softc *rs;
	RF_Config_t *k_cfg, *u_cfg;
	RF_Raid_t *raidPtr;
	RF_RaidDisk_t *diskPtr;
	RF_AccTotals_t *totals;
	RF_DeviceConfig_t *d_cfg, **ucfgp;
	u_char *specific_buf;
	int retcode = 0;
	int column;
	int raidid;
	struct rf_recon_req *rrcopy, *rr;
	RF_ComponentLabel_t *clabel;
	RF_ComponentLabel_t *ci_label;
	RF_ComponentLabel_t **clabel_ptr;
	RF_SingleComponent_t *sparePtr,*componentPtr;
	RF_SingleComponent_t component;
	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
	int i, j, d;
#ifdef __HAVE_OLD_DISKLABEL
	struct disklabel newlabel;
#endif
	struct dkwedge_info *dkw;

	if (unit >= numraid)
		return (ENXIO);
	rs = &raid_softc[unit];
	raidPtr = raidPtrs[unit];

	db1_printf(("raidioctl: %d %d %d %d\n", (int) dev,
		(int) DISKPART(dev), (int) unit, (int) cmd));

	/* Must be open for writes for these commands... */
	switch (cmd) {
#ifdef DIOCGSECTORSIZE
	case DIOCGSECTORSIZE:
		*(u_int *)data = raidPtr->bytesPerSector;
		return 0;
	case DIOCGMEDIASIZE:
		*(off_t *)data =
		    (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
		return 0;
#endif
	case DIOCSDINFO:
	case DIOCWDINFO:
#ifdef __HAVE_OLD_DISKLABEL
	case ODIOCWDINFO:
	case ODIOCSDINFO:
#endif
	case DIOCWLABEL:
	case DIOCAWEDGE:
	case DIOCDWEDGE:
		if ((flag & FWRITE) == 0)
			return (EBADF);
	}

	/* Must be initialized for these... */
	switch (cmd) {
	case DIOCGDINFO:
	case DIOCSDINFO:
	case DIOCWDINFO:
#ifdef __HAVE_OLD_DISKLABEL
	case ODIOCGDINFO:
	case ODIOCWDINFO:
	case ODIOCSDINFO:
	case ODIOCGDEFLABEL:
#endif
	case DIOCGPART:
	case DIOCWLABEL:
	case DIOCGDEFLABEL:
	case DIOCAWEDGE:
	case DIOCDWEDGE:
	case DIOCLWEDGES:
	case RAIDFRAME_SHUTDOWN:
	case RAIDFRAME_REWRITEPARITY:
	case RAIDFRAME_GET_INFO:
	case RAIDFRAME_RESET_ACCTOTALS:
	case RAIDFRAME_GET_ACCTOTALS:
	case RAIDFRAME_KEEP_ACCTOTALS:
	case RAIDFRAME_GET_SIZE:
	case RAIDFRAME_FAIL_DISK:
	case RAIDFRAME_COPYBACK:
	case RAIDFRAME_CHECK_RECON_STATUS:
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
	case RAIDFRAME_GET_COMPONENT_LABEL:
	case RAIDFRAME_SET_COMPONENT_LABEL:
	case RAIDFRAME_ADD_HOT_SPARE:
	case RAIDFRAME_REMOVE_HOT_SPARE:
	case RAIDFRAME_INIT_LABELS:
	case RAIDFRAME_REBUILD_IN_PLACE:
	case RAIDFRAME_CHECK_PARITY:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
	case RAIDFRAME_CHECK_COPYBACK_STATUS:
	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
	case RAIDFRAME_SET_AUTOCONFIG:
	case RAIDFRAME_SET_ROOT:
	case RAIDFRAME_DELETE_COMPONENT:
	case RAIDFRAME_INCORPORATE_HOT_SPARE:
		if ((rs->sc_flags & RAIDF_INITED) == 0)
			return (ENXIO);
	}

	switch (cmd) {

		/* configure the system */
	case RAIDFRAME_CONFIGURE:

		if (raidPtr->valid) {
			/* There is a valid RAID set running on this unit! */
			printf("raid%d: Device already configured!\n",unit);
			return(EINVAL);
		}

		/* copy-in the configuration information */
		/* data points to a pointer to the configuration structure */

		u_cfg = *((RF_Config_t **) data);
		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
		if (k_cfg == NULL) {
			return (ENOMEM);
		}
		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
		if (retcode) {
			RF_Free(k_cfg, sizeof(RF_Config_t));
			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
				retcode));
			return (retcode);
		}
		/* allocate a buffer for the layout-specific data, and copy it
		 * in */
		if (k_cfg->layoutSpecificSize) {
			if (k_cfg->layoutSpecificSize > 10000) {
				/* sanity check */
				RF_Free(k_cfg, sizeof(RF_Config_t));
				return (EINVAL);
			}
			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
			    (u_char *));
			if (specific_buf == NULL) {
				RF_Free(k_cfg, sizeof(RF_Config_t));
				return (ENOMEM);
			}
			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
			    k_cfg->layoutSpecificSize);
			if (retcode) {
				RF_Free(k_cfg, sizeof(RF_Config_t));
				RF_Free(specific_buf,
					k_cfg->layoutSpecificSize);
				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
					retcode));
				return (retcode);
			}
		} else
			specific_buf = NULL;
		k_cfg->layoutSpecific = specific_buf;

		/* should do some kind of sanity check on the configuration.
		 * Store the sum of all the bytes in the last byte? */

		/* configure the system */

		/*
		 * Clear the entire RAID descriptor, just to make sure
		 *  there is no stale data left in the case of a
		 *  reconfiguration
		 */
		memset((char *) raidPtr, 0, sizeof(RF_Raid_t));
		raidPtr->raidid = unit;

		retcode = rf_Configure(raidPtr, k_cfg, NULL);

		if (retcode == 0) {

			/* allow this many simultaneous IO's to
			   this RAID device */
			raidPtr->openings = RAIDOUTSTANDING;

			raidinit(raidPtr);
			rf_markalldirty(raidPtr);
		}
		/* free the buffers.  No return code here. */
		if (k_cfg->layoutSpecificSize) {
			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
		}
		RF_Free(k_cfg, sizeof(RF_Config_t));

		return (retcode);

		/* shutdown the system */
	case RAIDFRAME_SHUTDOWN:

		if ((error = raidlock(rs)) != 0)
			return (error);

		/*
		 * If somebody has a partition mounted, we shouldn't
		 * shutdown.
		 */

		part = DISKPART(dev);
		pmask = (1 << part);
		if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
		    ((rs->sc_dkdev.dk_bopenmask & pmask) &&
			(rs->sc_dkdev.dk_copenmask & pmask))) {
			raidunlock(rs);
			return (EBUSY);
		}

		retcode = rf_Shutdown(raidPtr);

		/* It's no longer initialized... */
		rs->sc_flags &= ~RAIDF_INITED;

		/* free the pseudo device attach bits */

		cf = device_cfdata(rs->sc_dev);
		/* XXX this causes us to not return any errors
		   from the above call to rf_Shutdown() */
		retcode = config_detach(rs->sc_dev, DETACH_QUIET);
		free(cf, M_RAIDFRAME);

		/* Detach the disk. */
		disk_detach(&rs->sc_dkdev);
		disk_destroy(&rs->sc_dkdev);

		raidunlock(rs);

		return (retcode);
	case RAIDFRAME_GET_COMPONENT_LABEL:
		clabel_ptr = (RF_ComponentLabel_t **) data;
		/* need to read the component label for the disk indicated
		   by row,column in clabel */

		/* For practice, let's get it directly from disk, rather
		   than from the in-core copy */
		RF_Malloc( clabel, sizeof( RF_ComponentLabel_t ),
			   (RF_ComponentLabel_t *));
		if (clabel == NULL)
			return (ENOMEM);

		retcode = copyin( *clabel_ptr, clabel,
				  sizeof(RF_ComponentLabel_t));

		if (retcode) {
			RF_Free( clabel, sizeof(RF_ComponentLabel_t));
			return(retcode);
		}

		clabel->row = 0; /* Don't allow looking at anything else.*/

		column = clabel->column;

		/* Spares live past numCol, so they are addressable too. */
		if ((column < 0) || (column >= raidPtr->numCol +
				     raidPtr->numSpare)) {
			RF_Free( clabel, sizeof(RF_ComponentLabel_t));
			return(EINVAL);
		}

		retcode = raidread_component_label(raidPtr->Disks[column].dev,
				raidPtr->raid_cinfo[column].ci_vp,
				clabel );

		if (retcode == 0) {
			retcode = copyout(clabel, *clabel_ptr,
					  sizeof(RF_ComponentLabel_t));
		}
		RF_Free(clabel, sizeof(RF_ComponentLabel_t));
		return (retcode);

	case RAIDFRAME_SET_COMPONENT_LABEL:
		clabel = (RF_ComponentLabel_t *) data;

		/* XXX check the label for valid stuff... */
		/* Note that some things *should not* get modified --
		   the user should be re-initing the labels instead of
		   trying to patch things.
		   */

		raidid = raidPtr->raidid;
#ifdef DEBUG
		printf("raid%d: Got component label:\n", raidid);
		printf("raid%d: Version: %d\n", raidid, clabel->version);
		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
		printf("raid%d: Column: %d\n", raidid, clabel->column);
		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
		printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif
		clabel->row = 0;
		column = clabel->column;

		if ((column < 0) || (column >= raidPtr->numCol)) {
			return(EINVAL);
		}

		/* XXX this isn't allowed to do anything for now :-) */

		/* XXX and before it is, we need to fill in the rest
		   of the fields!?!?!?! */
#if 0
		raidwrite_component_label(
		     raidPtr->Disks[column].dev,
			    raidPtr->raid_cinfo[column].ci_vp,
			    clabel );
#endif
		return (0);

	case RAIDFRAME_INIT_LABELS:
		clabel = (RF_ComponentLabel_t *) data;
		/*
		   we only want the serial number from
		   the above.  We get all the rest of the information
		   from the config that was used to create this RAID
		   set.
		   */

		raidPtr->serial_number = clabel->serial_number;

		RF_Malloc(ci_label, sizeof(RF_ComponentLabel_t),
			  (RF_ComponentLabel_t *));
		if (ci_label == NULL)
			return (ENOMEM);

		raid_init_component_label(raidPtr, ci_label);
		ci_label->serial_number = clabel->serial_number;
		ci_label->row = 0; /* we don't pretend to support more */

		/* Write a fresh label to every non-dead component. */
		for(column=0;column<raidPtr->numCol;column++) {
			diskPtr = &raidPtr->Disks[column];
			if (!RF_DEAD_DISK(diskPtr->status)) {
				ci_label->partitionSize = diskPtr->partitionSize;
				ci_label->column = column;
				raidwrite_component_label(
							  raidPtr->Disks[column].dev,
							  raidPtr->raid_cinfo[column].ci_vp,
							  ci_label );
			}
		}
		RF_Free(ci_label, sizeof(RF_ComponentLabel_t));

		return (retcode);
	case RAIDFRAME_SET_AUTOCONFIG:
		d = rf_set_autoconfig(raidPtr, *(int *) data);
		printf("raid%d: New autoconfig value is: %d\n",
		       raidPtr->raidid, d);
		*(int *) data = d;
		return (retcode);

	case RAIDFRAME_SET_ROOT:
		d = rf_set_rootpartition(raidPtr, *(int *) data);
		printf("raid%d: New rootpartition value is: %d\n",
		       raidPtr->raidid, d);
		*(int *) data = d;
		return (retcode);

		/* initialize all parity */
	case RAIDFRAME_REWRITEPARITY:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Parity for RAID 0 is trivially correct */
			raidPtr->parity_good = RF_RAID_CLEAN;
			return(0);
		}

		if (raidPtr->parity_rewrite_in_progress == 1) {
			/* Re-write is already in progress! */
			return(EINVAL);
		}

		/* Rewrite happens asynchronously in a kernel thread. */
		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
					   rf_RewriteParityThread,
					   raidPtr,"raid_parity");
		return (retcode);


	case RAIDFRAME_ADD_HOT_SPARE:
		sparePtr = (RF_SingleComponent_t *) data;
		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
		retcode = rf_add_hot_spare(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_REMOVE_HOT_SPARE:
		/* Not implemented; returns the initial retcode (0). */
		return(retcode);

	case RAIDFRAME_DELETE_COMPONENT:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		retcode = rf_delete_component(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_INCORPORATE_HOT_SPARE:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		retcode = rf_incorporate_hot_spare(raidPtr, &component);
		return(retcode);

	case RAIDFRAME_REBUILD_IN_PLACE:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Can't do this on a RAID 0!! */
			return(EINVAL);
		}

		if (raidPtr->recon_in_progress == 1) {
			/* a reconstruct is already in progress! */
			return(EINVAL);
		}

		componentPtr = (RF_SingleComponent_t *) data;
		memcpy( &component, componentPtr,
			sizeof(RF_SingleComponent_t));
		component.row = 0; /* we don't support any more */
		column = component.column;

		if ((column < 0) || (column >= raidPtr->numCol)) {
			return(EINVAL);
		}

		RF_LOCK_MUTEX(raidPtr->mutex);
		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
		    (raidPtr->numFailures > 0)) {
			/* XXX 0 above shouldn't be constant!!! */
			/* some component other than this has failed.
			   Let's not make things worse than they already
			   are... */
			printf("raid%d: Unable to reconstruct to disk at:\n",
			       raidPtr->raidid);
			printf("raid%d:     Col: %d   Too many failures.\n",
			       raidPtr->raidid, column);
			RF_UNLOCK_MUTEX(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[column].status ==
		    rf_ds_reconstructing) {
			printf("raid%d: Unable to reconstruct to disk at:\n",
			       raidPtr->raidid);
			printf("raid%d:    Col: %d   Reconstruction already occuring!\n", raidPtr->raidid, column);

			RF_UNLOCK_MUTEX(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[column].status == rf_ds_spared) {
			RF_UNLOCK_MUTEX(raidPtr->mutex);
			return (EINVAL);
		}
		RF_UNLOCK_MUTEX(raidPtr->mutex);

		/* rrcopy is handed off to (and freed by) the recon thread. */
		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
		if (rrcopy == NULL)
			return(ENOMEM);

		rrcopy->raidPtr = (void *) raidPtr;
		rrcopy->col = column;

		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
					   rf_ReconstructInPlaceThread,
					   rrcopy,"raid_reconip");
		return(retcode);

	case RAIDFRAME_GET_INFO:
		if (!raidPtr->valid)
			return (ENODEV);
		ucfgp = (RF_DeviceConfig_t **) data;
		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
			  (RF_DeviceConfig_t *));
		if (d_cfg == NULL)
			return (ENOMEM);
		d_cfg->rows = 1; /* there is only 1 row now */
		d_cfg->cols = raidPtr->numCol;
		d_cfg->ndevs = raidPtr->numCol;
		if (d_cfg->ndevs >= RF_MAX_DISKS) {
			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
			return (ENOMEM);
		}
		d_cfg->nspares = raidPtr->numSpare;
		if (d_cfg->nspares >= RF_MAX_DISKS) {
			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
			return (ENOMEM);
		}
		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
		d = 0;
		for (j = 0; j < d_cfg->cols; j++) {
			d_cfg->devs[d] = raidPtr->Disks[j];
			d++;
		}
		/* spares are stored after the data columns in Disks[] */
		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
			d_cfg->spares[i] = raidPtr->Disks[j];
		}
		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));

		return (retcode);

	case RAIDFRAME_CHECK_PARITY:
		*(int *) data = raidPtr->parity_good;
		return (0);

	case RAIDFRAME_RESET_ACCTOTALS:
		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
		return (0);

	case RAIDFRAME_GET_ACCTOTALS:
		totals = (RF_AccTotals_t *) data;
		*totals = raidPtr->acc_totals;
		return (0);

	case RAIDFRAME_KEEP_ACCTOTALS:
		raidPtr->keep_acc_totals = *(int *)data;
		return (0);

	case RAIDFRAME_GET_SIZE:
		*(int *) data = raidPtr->totalSectors;
		return (0);

		/* fail a disk & optionally start reconstruction */
	case RAIDFRAME_FAIL_DISK:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Can't do this on a RAID 0!! */
			return(EINVAL);
		}

		rr = (struct rf_recon_req *) data;
		rr->row = 0;
		if (rr->col < 0 || rr->col >= raidPtr->numCol)
			return (EINVAL);


		RF_LOCK_MUTEX(raidPtr->mutex);
		if (raidPtr->status == rf_rs_reconstructing) {
			/* you can't fail a disk while we're reconstructing! */
			/* XXX wrong for RAID6 */
			RF_UNLOCK_MUTEX(raidPtr->mutex);
			return (EINVAL);
		}
		if ((raidPtr->Disks[rr->col].status ==
		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
			/* some other component has failed.  Let's not make
			   things worse. XXX wrong for RAID6 */
			RF_UNLOCK_MUTEX(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
			/* Can't fail a spared disk! */
			RF_UNLOCK_MUTEX(raidPtr->mutex);
			return (EINVAL);
		}
		RF_UNLOCK_MUTEX(raidPtr->mutex);

		/* make a copy of the recon request so that we don't rely on
		 * the user's buffer */
		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
		if (rrcopy == NULL)
			return(ENOMEM);
		memcpy(rrcopy, rr, sizeof(*rr));
		rrcopy->raidPtr = (void *) raidPtr;

		/* NOTE(review): retcode from thread creation is dropped
		   here and 0 is returned unconditionally -- looks
		   intentional-by-neglect; confirm before changing. */
		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
					   rf_ReconThread,
					   rrcopy,"raid_recon");
		return (0);

		/* invoke a copyback operation after recon on whatever disk
		 * needs it, if any */
	case RAIDFRAME_COPYBACK:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0!! */
			return(EINVAL);
		}

		if (raidPtr->copyback_in_progress == 1) {
			/* Copyback is already in progress! */
			return(EINVAL);
		}

		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
					   rf_CopybackThread,
					   raidPtr,"raid_copyback");
		return (retcode);

		/* return the percentage completion of reconstruction */
	case RAIDFRAME_CHECK_RECON_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->status != rf_rs_reconstructing)
			*(int *) data = 100;
		else {
			if (raidPtr->reconControl->numRUsTotal > 0) {
				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
			} else {
				*(int *) data = 0;
			}
		}
		return (0);
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
		progressInfoPtr = (RF_ProgressInfo_t **) data;
		if (raidPtr->status != rf_rs_reconstructing) {
			progressInfo.remaining = 0;
			progressInfo.completed = 100;
			progressInfo.total = 100;
		} else {
			progressInfo.total =
				raidPtr->reconControl->numRUsTotal;
			progressInfo.completed =
				raidPtr->reconControl->numRUsComplete;
			progressInfo.remaining = progressInfo.total -
				progressInfo.completed;
		}
		retcode = copyout(&progressInfo, *progressInfoPtr,
				  sizeof(RF_ProgressInfo_t));
		return (retcode);

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->parity_rewrite_in_progress == 1) {
			*(int *) data = 100 *
				raidPtr->parity_rewrite_stripes_done /
				raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return (0);

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
		progressInfoPtr = (RF_ProgressInfo_t **) data;
		if (raidPtr->parity_rewrite_in_progress == 1) {
			progressInfo.total = raidPtr->Layout.numStripe;
			progressInfo.completed =
				raidPtr->parity_rewrite_stripes_done;
			progressInfo.remaining = progressInfo.total -
				progressInfo.completed;
		} else {
			progressInfo.remaining = 0;
			progressInfo.completed = 100;
			progressInfo.total = 100;
		}
		retcode = copyout(&progressInfo, *progressInfoPtr,
				  sizeof(RF_ProgressInfo_t));
		return (retcode);

	case RAIDFRAME_CHECK_COPYBACK_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0 */
			*(int *) data = 100;
			return(0);
		}
		if (raidPtr->copyback_in_progress == 1) {
			*(int *) data = 100 * raidPtr->copyback_stripes_done /
				raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return (0);

	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
		progressInfoPtr = (RF_ProgressInfo_t **) data;
		if (raidPtr->copyback_in_progress == 1) {
			progressInfo.total = raidPtr->Layout.numStripe;
			progressInfo.completed =
				raidPtr->copyback_stripes_done;
			progressInfo.remaining = progressInfo.total -
				progressInfo.completed;
		} else {
			progressInfo.remaining = 0;
			progressInfo.completed = 100;
			progressInfo.total = 100;
		}
		retcode = copyout(&progressInfo, *progressInfoPtr,
				  sizeof(RF_ProgressInfo_t));
		return (retcode);

		/* the sparetable daemon calls this to wait for the kernel to
		 * need a spare table. this ioctl does not return until a
		 * spare table is needed. XXX -- calling mpsleep here in the
		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
		 * -- I should either compute the spare table in the kernel,
		 * or have a different -- XXX XXX -- interface (a different
		 * character device) for delivering the table     -- XXX */
#if 0
	case RAIDFRAME_SPARET_WAIT:
		RF_LOCK_MUTEX(rf_sparet_wait_mutex);
		while (!rf_sparet_wait_queue)
			mpsleep(&rf_sparet_wait_queue, (PZERO + 1) | PCATCH, "sparet wait", 0, (void *) simple_lock_addr(rf_sparet_wait_mutex), MS_LOCK_SIMPLE);
		waitreq = rf_sparet_wait_queue;
		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
		RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);

		/* structure assignment */
		*((RF_SparetWait_t *) data) = *waitreq;

		RF_Free(waitreq, sizeof(*waitreq));
		return (0);

		/* wakes up a process waiting on SPARET_WAIT and puts an error
		 * code in it that will cause the daemon to exit */
	case RAIDFRAME_ABORT_SPARET_WAIT:
		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
		waitreq->fcol = -1;
		RF_LOCK_MUTEX(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_wait_queue;
		rf_sparet_wait_queue = waitreq;
		RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
		wakeup(&rf_sparet_wait_queue);
		return (0);

		/* used by the spare table daemon to deliver a spare table
		 * into the kernel */
	case RAIDFRAME_SEND_SPARET:

		/* install the spare table */
		retcode = rf_SetSpareTable(raidPtr, *(void **) data);

		/* respond to the requestor.  the return status of the spare
		 * table installation is passed in the "fcol" field */
		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
		waitreq->fcol = retcode;
		RF_LOCK_MUTEX(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_resp_queue;
		rf_sparet_resp_queue = waitreq;
		wakeup(&rf_sparet_resp_queue);
		RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);

		return (retcode);
#endif

	default:
		break; /* fall through to the os-specific code below */

	}

	if (!raidPtr->valid)
		return (EINVAL);

	/*
	 * Add support for "regular" device ioctls here.
	 */

	switch (cmd) {
	case DIOCGDINFO:
		*(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
		break;
#ifdef __HAVE_OLD_DISKLABEL
	case ODIOCGDINFO:
		newlabel = *(rs->sc_dkdev.dk_label);
		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
			return ENOTTY;
		memcpy(data, &newlabel, sizeof (struct olddisklabel));
		break;
#endif

	case DIOCGPART:
		((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
		((struct partinfo *) data)->part =
		    &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
		break;

	case DIOCWDINFO:
	case DIOCSDINFO:
#ifdef __HAVE_OLD_DISKLABEL
	case ODIOCWDINFO:
	case ODIOCSDINFO:
#endif
	{
		struct disklabel *lp;
#ifdef __HAVE_OLD_DISKLABEL
		if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
			memset(&newlabel, 0, sizeof newlabel);
			memcpy(&newlabel, data, sizeof (struct olddisklabel));
			lp = &newlabel;
		} else
#endif
		lp = (struct disklabel *)data;

		if ((error = raidlock(rs)) != 0)
			return (error);

		rs->sc_flags |= RAIDF_LABELLING;

		error = setdisklabel(rs->sc_dkdev.dk_label,
		    lp, 0, rs->sc_dkdev.dk_cpulabel);
		if (error == 0) {
			/* WDINFO additionally writes the label to disk */
			if (cmd == DIOCWDINFO
#ifdef __HAVE_OLD_DISKLABEL
			    || cmd == ODIOCWDINFO
#endif
			   )
				error = writedisklabel(RAIDLABELDEV(dev),
				    raidstrategy, rs->sc_dkdev.dk_label,
				    rs->sc_dkdev.dk_cpulabel);
		}
		rs->sc_flags &= ~RAIDF_LABELLING;

		raidunlock(rs);

		if (error)
			return (error);
		break;
	}

	case DIOCWLABEL:
		if (*(int *) data != 0)
			rs->sc_flags |= RAIDF_WLABEL;
		else
			rs->sc_flags &= ~RAIDF_WLABEL;
		break;

	case DIOCGDEFLABEL:
		raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
		break;

#ifdef __HAVE_OLD_DISKLABEL
	case ODIOCGDEFLABEL:
		raidgetdefaultlabel(raidPtr, rs, &newlabel);
		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
			return ENOTTY;
		memcpy(data, &newlabel, sizeof (struct olddisklabel));
		break;
#endif

	case DIOCAWEDGE:
	case DIOCDWEDGE:
	    	dkw = (void *)data;

		/* If the ioctl happens here, the parent is us. */
		(void)strcpy(dkw->dkw_parent, rs->sc_xname);
		return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw);

	case DIOCLWEDGES:
		return dkwedge_list(&rs->sc_dkdev,
		    (struct dkwedge_list *)data, l);

	default:
		retcode = ENOTTY;
	}
	return (retcode);

}
1828 
1829 
1830 /* raidinit -- complete the rest of the initialization for the
1831    RAIDframe device.  */
1832 
1833 
static void
raidinit(RF_Raid_t *raidPtr)
{
	struct cfdata *cf;
	struct raid_softc *rs;
	int     unit;

	unit = raidPtr->raidid;

	rs = &raid_softc[unit];

	/* XXX should check return code first... */
	rs->sc_flags |= RAIDF_INITED;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	rs->sc_dev = config_attach_pseudo(cf);

	/* NOTE(review): on failure we only log -- sc_dev stays NULL and
	   later users of it (e.g. RAIDFRAME_SHUTDOWN) would dereference
	   it; confirm whether this can happen in practice. */
	if (rs->sc_dev==NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		       raidPtr->raidid);
	}

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */

	disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
	disk_attach(&rs->sc_dkdev);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Look for wedges (GPT-style partitions) on the new device. */
	dkwedge_discover(&rs->sc_dkdev);

	rf_set_properties(rs, raidPtr);

}
1882 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
1883 /* wake up the daemon & tell it to get us a spare table
1884  * XXX
1885  * the entries in the queues should be tagged with the raidPtr
1886  * so that in the extremely rare case that two recons happen at once,
1887  * we know for which device were requesting a spare table
1888  * XXX
1889  *
1890  * XXX This code is not currently used. GO
1891  */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/* Queue our request and poke the user-level daemon. */
	RF_LOCK_MUTEX(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	wakeup(&rf_sparet_wait_queue);

	/* mpsleep unlocks the mutex */
	/* Block until the daemon posts a response on the resp queue. */
	while (!rf_sparet_resp_queue) {
		tsleep(&rf_sparet_resp_queue, PRIBIO,
		    "raidframe getsparetable", 0);
	}
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);

	/* fcol carries the daemon's status code back to the caller. */
	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
1916 #endif
1917 
1918 /* a wrapper around rf_DoAccess that extracts appropriate info from the
1919  * bp & passes it down.
1920  * any calls originating in the kernel must use non-blocking I/O
1921  * do some extra sanity checking to return "appropriate" error values for
1922  * certain conditions (to make some standard utilities work)
1923  *
1924  * Formerly known as: rf_DoAccessKernel
1925  */
void
raidstart(RF_Raid_t *raidPtr)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	struct partition *pp;
	daddr_t blocknum;
	int     unit;
	struct raid_softc *rs;
	int     do_async;
	struct buf *bp;
	int rc;

	unit = raidPtr->raidid;
	rs = &raid_softc[unit];

	/* quick check to see if anything has died recently */
	RF_LOCK_MUTEX(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* label update must run unlocked; reacquire afterwards */
		RF_UNLOCK_MUTEX(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		RF_LOCK_MUTEX(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}

	/* Check to see if we're at the limit... */
	/* Loop invariant: mutex is held at the top of each iteration. */
	while (raidPtr->openings > 0) {
		RF_UNLOCK_MUTEX(raidPtr->mutex);

		/* get the next item, if any, from the queue */
		if ((bp = BUFQ_GET(rs->buf_queue)) == NULL) {
			/* nothing more to do */
			return;
		}

		/* Ok, for the bp we have here, bp->b_blkno is relative to the
		 * partition.. Need to make it absolute to the underlying
		 * device.. */

		blocknum = bp->b_blkno;
		if (DISKPART(bp->b_dev) != RAW_PART) {
			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
			blocknum += pp->p_offset;
		}

		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
			    (int) blocknum));

		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

		/* *THIS* is where we adjust what block we're going to...
		 * but DO NOT TOUCH bp->b_blkno!!! */
		raid_addr = blocknum;

		/* pb is 1 iff the request ends on a partial sector */
		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
		sum = raid_addr + num_blocks + pb;
		if (1 || rf_debugKernelAccess) {
			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
				    (int) raid_addr, (int) sum, (int) num_blocks,
				    (int) pb, (int) bp->b_resid));
		}
		/* reject requests past the end of the set (the extra
		 * comparisons catch arithmetic wraparound of 'sum') */
		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
		    || (sum < num_blocks) || (sum < pb)) {
			bp->b_error = ENOSPC;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			RF_LOCK_MUTEX(raidPtr->mutex);
			continue;
		}
		/*
		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
		 */

		/* reject transfers that are not a multiple of the sector
		 * size */
		if (bp->b_bcount & raidPtr->sectorMask) {
			bp->b_error = EINVAL;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			RF_LOCK_MUTEX(raidPtr->mutex);
			continue;

		}
		db1_printf(("Calling DoAccess..\n"));


		/* consume one opening for this in-flight I/O */
		RF_LOCK_MUTEX(raidPtr->mutex);
		raidPtr->openings--;
		RF_UNLOCK_MUTEX(raidPtr->mutex);

		/*
		 * Everything is async.
		 */
		do_async = 1;

		disk_busy(&rs->sc_dkdev);

		/* XXX we're still at splbio() here... do we *really*
		   need to be? */

		/* don't ever condition on bp->b_flags & B_WRITE.
		 * always condition on B_READ instead */

		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
				 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
				 do_async, raid_addr, num_blocks,
				 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

		if (rc) {
			bp->b_error = rc;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			/* continue loop */
		}

		RF_LOCK_MUTEX(raidPtr->mutex);
	}
	RF_UNLOCK_MUTEX(raidPtr->mutex);
}
2046 
2047 
2048 
2049 
2050 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
2051 
/*
 * Dispatch one low-level I/O (or NOP) described by req to the underlying
 * component device of the given disk queue.  Called with the disk queue
 * locked (see comment above).  Bumps queue->numOutstanding for every
 * request handed out; completion is delivered via KernelWakeupFunc().
 * Always returns 0.
 */
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;

#if DIAGNOSTIC
	/* Sanity check: the raid unit must be one we actually allocated. */
	if (queue->raidPtr->raidid >= numraid) {
		printf("Invalid unit number: %d %d\n", queue->raidPtr->raidid,
		    numraid);
		panic("Invalid Unit number in rf_DispatchKernelIO");
	}
#endif

	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		/* No real I/O: fake an immediate completion through the
		 * normal callback path. */
		bp->b_flags = 0;
		bp->b_private = req;

		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* Fill in the buf for the component read/write; completion
		 * callback is KernelWakeupFunc with req as its argument. */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));
		/* Hand the buf to the component's block device driver. */
		bdev_strategy(bp);

		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return (0);
}
2126 /* this is the callback function associated with a I/O invoked from
2127    kernel code.
2128  */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;
	int s;

	/* Runs at biodone time; block bio interrupts while we update the
	 * queue/raid state. */
	s = splbio();
	db1_printf(("recovering the request queue:\n"));
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

#if RF_ACC_TRACE > 0
	/* Account the elapsed disk time to this access's trace record. */
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		RF_LOCK_MUTEX(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		RF_UNLOCK_MUTEX(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error.  Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures triggers a component-label update
			 * from the I/O start path (see raidstart). */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */

	req->error = bp->b_error;

	simple_lock(&queue->raidPtr->iodone_lock);

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	wakeup(&(queue->raidPtr->iodone));

	simple_unlock(&queue->raidPtr->iodone_lock);

	splx(s);
}
2198 
2199 
2200 
2201 /*
2202  * initialize a buf structure for doing an I/O in the kernel.
2203  */
/*
 * Initialize the buf bp for a kernel-internal component I/O of numSect
 * sectors at startSect on device dev, reading/writing buffer bf.  cbFunc
 * is installed as b_iodone and cbArg as b_private, so the callback can
 * recover its request.  Panics if the resulting byte count is zero.
 */
static void
InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
       RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
       void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
       struct proc *b_proc)
{
	/* bp->b_flags       = B_PHYS | rw_flag; */
	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
	bp->b_oflags = 0;
	bp->b_cflags = 0;
	/* sectors -> bytes */
	bp->b_bcount = numSect << logBytesPerSector;
	bp->b_bufsize = bp->b_bcount;
	bp->b_error = 0;
	bp->b_dev = dev;
	bp->b_data = bf;
	bp->b_blkno = startSect;
	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
	if (bp->b_bcount == 0) {
		panic("bp->b_bcount is zero in InitBP!!");
	}
	bp->b_proc = b_proc;
	bp->b_iodone = cbFunc;
	bp->b_private = cbArg;
}
2228 
2229 static void
2230 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
2231 		    struct disklabel *lp)
2232 {
2233 	memset(lp, 0, sizeof(*lp));
2234 
2235 	/* fabricate a label... */
2236 	lp->d_secperunit = raidPtr->totalSectors;
2237 	lp->d_secsize = raidPtr->bytesPerSector;
2238 	lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
2239 	lp->d_ntracks = 4 * raidPtr->numCol;
2240 	lp->d_ncylinders = raidPtr->totalSectors /
2241 		(lp->d_nsectors * lp->d_ntracks);
2242 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
2243 
2244 	strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
2245 	lp->d_type = DTYPE_RAID;
2246 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
2247 	lp->d_rpm = 3600;
2248 	lp->d_interleave = 1;
2249 	lp->d_flags = 0;
2250 
2251 	lp->d_partitions[RAW_PART].p_offset = 0;
2252 	lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
2253 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
2254 	lp->d_npartitions = RAW_PART + 1;
2255 
2256 	lp->d_magic = DISKMAGIC;
2257 	lp->d_magic2 = DISKMAGIC;
2258 	lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
2259 
2260 }
2261 /*
2262  * Read the disklabel from the raid device.  If one is not present, fake one
2263  * up.
2264  */
static void
raidgetdisklabel(dev_t dev)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs = &raid_softc[unit];
	const char   *errstring;
	struct disklabel *lp = rs->sc_dkdev.dk_label;
	struct cpu_disklabel *clp = rs->sc_dkdev.dk_cpulabel;
	RF_Raid_t *raidPtr;

	db1_printf(("Getting the disklabel...\n"));

	memset(clp, 0, sizeof(*clp));

	raidPtr = raidPtrs[unit];

	/* Start from a fabricated default in case no label is on disk. */
	raidgetdefaultlabel(raidPtr, rs, lp);

	/*
	 * Call the generic disklabel extraction routine.
	 */
	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
	if (errstring)
		raidmakedisklabel(rs);
	else {
		int     i;
		struct partition *pp;

		/*
		 * Sanity check whether the found disklabel is valid.
		 *
		 * This is necessary since total size of the raid device
		 * may vary when an interleave is changed even though exactly
		 * same components are used, and old disklabel may used
		 * if that is found.
		 */
		if (lp->d_secperunit != rs->sc_size)
			printf("raid%d: WARNING: %s: "
			    "total sector size in disklabel (%d) != "
			    "the size of raid (%ld)\n", unit, rs->sc_xname,
			    lp->d_secperunit, (long) rs->sc_size);
		/* Warn about any partition extending past the raid size. */
		for (i = 0; i < lp->d_npartitions; i++) {
			pp = &lp->d_partitions[i];
			if (pp->p_offset + pp->p_size > rs->sc_size)
				printf("raid%d: WARNING: %s: end of partition `%c' "
				       "exceeds the size of raid (%ld)\n",
				       unit, rs->sc_xname, 'a' + i, (long) rs->sc_size);
		}
	}

}
2317 /*
2318  * Take care of things one might want to take care of in the event
2319  * that a disklabel isn't present.
2320  */
static void
raidmakedisklabel(struct raid_softc *rs)
{
	struct disklabel *lp = rs->sc_dkdev.dk_label;
	db1_printf(("Making a label..\n"));

	/*
	 * For historical reasons, if there's no disklabel present
	 * the raw partition must be marked FS_BSDFFS.
	 */

	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;

	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));

	/* Re-checksum: the two fields above were changed after
	 * raidgetdefaultlabel() computed the original checksum. */
	lp->d_checksum = dkcksum(lp);
}
2338 /*
2339  * Wait interruptibly for an exclusive lock.
2340  *
2341  * XXX
2342  * Several drivers do this; it should be abstracted and made MP-safe.
2343  * (Hmm... where have we seen this warning before :->  GO )
2344  */
2345 static int
2346 raidlock(struct raid_softc *rs)
2347 {
2348 	int     error;
2349 
2350 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2351 		rs->sc_flags |= RAIDF_WANTED;
2352 		if ((error =
2353 			tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
2354 			return (error);
2355 	}
2356 	rs->sc_flags |= RAIDF_LOCKED;
2357 	return (0);
2358 }
2359 /*
2360  * Unlock and wake up any waiters.
2361  */
2362 static void
2363 raidunlock(struct raid_softc *rs)
2364 {
2365 
2366 	rs->sc_flags &= ~RAIDF_LOCKED;
2367 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2368 		rs->sc_flags &= ~RAIDF_WANTED;
2369 		wakeup(rs);
2370 	}
2371 }
2372 
2373 
2374 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
2375 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
2376 
2377 int
2378 raidmarkclean(dev_t dev, struct vnode *b_vp, int mod_counter)
2379 {
2380 	RF_ComponentLabel_t clabel;
2381 	raidread_component_label(dev, b_vp, &clabel);
2382 	clabel.mod_counter = mod_counter;
2383 	clabel.clean = RF_RAID_CLEAN;
2384 	raidwrite_component_label(dev, b_vp, &clabel);
2385 	return(0);
2386 }
2387 
2388 
2389 int
2390 raidmarkdirty(dev_t dev, struct vnode *b_vp, int mod_counter)
2391 {
2392 	RF_ComponentLabel_t clabel;
2393 	raidread_component_label(dev, b_vp, &clabel);
2394 	clabel.mod_counter = mod_counter;
2395 	clabel.clean = RF_RAID_DIRTY;
2396 	raidwrite_component_label(dev, b_vp, &clabel);
2397 	return(0);
2398 }
2399 
2400 /* ARGSUSED */
2401 int
2402 raidread_component_label(dev_t dev, struct vnode *b_vp,
2403 			 RF_ComponentLabel_t *clabel)
2404 {
2405 	struct buf *bp;
2406 	const struct bdevsw *bdev;
2407 	int error;
2408 
2409 	/* XXX should probably ensure that we don't try to do this if
2410 	   someone has changed rf_protected_sectors. */
2411 
2412 	if (b_vp == NULL) {
2413 		/* For whatever reason, this component is not valid.
2414 		   Don't try to read a component label from it. */
2415 		return(EINVAL);
2416 	}
2417 
2418 	/* get a block of the appropriate size... */
2419 	bp = geteblk((int)RF_COMPONENT_INFO_SIZE);
2420 	bp->b_dev = dev;
2421 
2422 	/* get our ducks in a row for the read */
2423 	bp->b_blkno = RF_COMPONENT_INFO_OFFSET / DEV_BSIZE;
2424 	bp->b_bcount = RF_COMPONENT_INFO_SIZE;
2425 	bp->b_flags |= B_READ;
2426  	bp->b_resid = RF_COMPONENT_INFO_SIZE / DEV_BSIZE;
2427 
2428 	bdev = bdevsw_lookup(bp->b_dev);
2429 	if (bdev == NULL)
2430 		return (ENXIO);
2431 	(*bdev->d_strategy)(bp);
2432 
2433 	error = biowait(bp);
2434 
2435 	if (!error) {
2436 		memcpy(clabel, bp->b_data,
2437 		       sizeof(RF_ComponentLabel_t));
2438 	}
2439 
2440 	brelse(bp, 0);
2441 	return(error);
2442 }
2443 /* ARGSUSED */
2444 int
2445 raidwrite_component_label(dev_t dev, struct vnode *b_vp,
2446 			  RF_ComponentLabel_t *clabel)
2447 {
2448 	struct buf *bp;
2449 	const struct bdevsw *bdev;
2450 	int error;
2451 
2452 	/* get a block of the appropriate size... */
2453 	bp = geteblk((int)RF_COMPONENT_INFO_SIZE);
2454 	bp->b_dev = dev;
2455 
2456 	/* get our ducks in a row for the write */
2457 	bp->b_blkno = RF_COMPONENT_INFO_OFFSET / DEV_BSIZE;
2458 	bp->b_bcount = RF_COMPONENT_INFO_SIZE;
2459 	bp->b_flags |= B_WRITE;
2460  	bp->b_resid = RF_COMPONENT_INFO_SIZE / DEV_BSIZE;
2461 
2462 	memset(bp->b_data, 0, RF_COMPONENT_INFO_SIZE );
2463 
2464 	memcpy(bp->b_data, clabel, sizeof(RF_ComponentLabel_t));
2465 
2466 	bdev = bdevsw_lookup(bp->b_dev);
2467 	if (bdev == NULL)
2468 		return (ENXIO);
2469 	(*bdev->d_strategy)(bp);
2470 	error = biowait(bp);
2471 	brelse(bp, 0);
2472 	if (error) {
2473 #if 1
2474 		printf("Failed to write RAID component info!\n");
2475 #endif
2476 	}
2477 
2478 	return(error);
2479 }
2480 
/*
 * Bump the set's mod_counter and mark every live component (and every
 * in-use spare) dirty on disk.  Called when the set goes into use.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			raidread_component_label(
						 raidPtr->Disks[c].dev,
						 raidPtr->raid_cinfo[c].ci_vp,
						 &clabel);
			if (clabel.status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(
					      raidPtr->Disks[c].dev,
					      raidPtr->raid_cinfo[c].ci_vp,
					      raidPtr->mod_counter);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare is standing in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			raidread_component_label(
				 raidPtr->Disks[sparecol].dev,
				 raidPtr->raid_cinfo[sparecol].ci_vp,
				 &clabel);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, &clabel);

			clabel.row = 0;
			/* NOTE(review): if no column maps to this spare the
			   loop above leaves scol at -1 (or a stale value from
			   a previous iteration) — verify this cannot happen
			   for an rf_ds_used_spare disk. */
			clabel.column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr->Disks[sparecol].dev,
				      raidPtr->raid_cinfo[sparecol].ci_vp,
				      raidPtr->mod_counter);
		}
	}
}
2551 
2552 
/*
 * Rewrite the component labels of all optimal components and in-use
 * spares with the current mod_counter and unit number.  When final is
 * RF_FINAL_COMPONENT_UPDATE and parity is known good, also mark each
 * component clean (shutdown path).
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t clabel;
	int sparecol;
	int c;
	int j;
	int scol;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			raidread_component_label(
						 raidPtr->Disks[c].dev,
						 raidPtr->raid_cinfo[c].ci_vp,
						 &clabel);
			/* make sure status is noted */
			clabel.status = rf_ds_optimal;

			/* bump the counter */
			clabel.mod_counter = raidPtr->mod_counter;

			/* note what unit we are configured as */
			clabel.last_unit = raidPtr->raidid;

			raidwrite_component_label(
						  raidPtr->Disks[c].dev,
						  raidPtr->raid_cinfo[c].ci_vp,
						  &clabel);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(
						      raidPtr->Disks[c].dev,
						      raidPtr->raid_cinfo[c].ci_vp,
						      raidPtr->mod_counter);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare is standing in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			raidread_component_label(
				      raidPtr->Disks[sparecol].dev,
				      raidPtr->raid_cinfo[sparecol].ci_vp,
				      &clabel);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, &clabel);

			clabel.mod_counter = raidPtr->mod_counter;
			/* NOTE(review): scol stays -1 (or stale) if no column
			   maps to this spare — verify that cannot happen for
			   an rf_ds_used_spare disk. */
			clabel.column = scol;
			clabel.status = rf_ds_optimal;
			clabel.last_unit = raidPtr->raidid;

			raidwrite_component_label(
				      raidPtr->Disks[sparecol].dev,
				      raidPtr->raid_cinfo[sparecol].ci_vp,
				      &clabel);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean( raidPtr->Disks[sparecol].dev,
						       raidPtr->raid_cinfo[sparecol].ci_vp,
						       raidPtr->mod_counter);
				}
			}
		}
	}
}
2648 
2649 void
2650 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2651 {
2652 
2653 	if (vp != NULL) {
2654 		if (auto_configured == 1) {
2655 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2656 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2657 			vput(vp);
2658 
2659 		} else {
2660 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2661 		}
2662 	}
2663 }
2664 
2665 
2666 void
2667 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2668 {
2669 	int r,c;
2670 	struct vnode *vp;
2671 	int acd;
2672 
2673 
2674 	/* We take this opportunity to close the vnodes like we should.. */
2675 
2676 	for (c = 0; c < raidPtr->numCol; c++) {
2677 		vp = raidPtr->raid_cinfo[c].ci_vp;
2678 		acd = raidPtr->Disks[c].auto_configured;
2679 		rf_close_component(raidPtr, vp, acd);
2680 		raidPtr->raid_cinfo[c].ci_vp = NULL;
2681 		raidPtr->Disks[c].auto_configured = 0;
2682 	}
2683 
2684 	for (r = 0; r < raidPtr->numSpare; r++) {
2685 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2686 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2687 		rf_close_component(raidPtr, vp, acd);
2688 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2689 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2690 	}
2691 }
2692 
2693 
/*
 * Kernel-thread body: fail the component named in req (optionally
 * starting reconstruction onto a spare per RF_FDFLAGS_RECON), then free
 * the request and exit.  recon_in_progress brackets the work so other
 * paths can tell a reconstruction is active.
 */
void
rf_ReconThread(struct rf_recon_req *req)
{
	int     s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	/* The thread owns req; release it once consumed. */
	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2715 
/*
 * Kernel-thread body: rewrite all parity for the set.  On success, flag
 * parity as clean; on failure, log it.  Wakes anyone waiting for the
 * rewrite to finish during shutdown, then exits.
 */
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity!\n",raidPtr->raidid);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		wakeup(&raidPtr->parity_rewrite_in_progress);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2745 
2746 
/*
 * Kernel-thread body: copy reconstructed data from the spare back onto a
 * replaced component, bracketed by copyback_in_progress, then exit.
 */
void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2761 
2762 
/*
 * Kernel-thread body: reconstruct the component in column req->col in
 * place (onto the same device), free the request, and exit.
 */
void
rf_ReconstructInPlaceThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	/* The thread owns req; release it once consumed. */
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2780 
/*
 * Try to read a component label from the device (dev, vp) named cname
 * whose size is `size` sectors.  If the label is reasonable, prepend a
 * new RF_AutoConfig_t to ac_list and keep vp open; otherwise close vp.
 * Returns the (possibly updated) list head, or NULL after freeing the
 * whole list on allocation failure.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		    /* Out of memory: tear down everything collected so far.
		       NOTE(review): vp is not closed on this path — verify
		       whether the caller can recover it. */
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(dev, vp, clabel)) {
		    /* Got the label.  Does it look reasonable? */
		    if (rf_reasonable_label(clabel) &&
			(clabel->partitionSize <= size)) {
#ifdef DEBUG
			    printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			    rf_print_component_label(clabel);
#endif
			    /* if it's reasonable, add it, else ignore it. */
			    ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			    if (ac == NULL) {
				    free(clabel, M_RAIDFRAME);
				    goto oomem;
			    }
			    strlcpy(ac->devname, cname, sizeof(ac->devname));
			    ac->dev = dev;
			    ac->vp = vp;
			    /* ac takes ownership of clabel. */
			    ac->clabel = clabel;
			    ac->next = ac_list;
			    ac_list = ac;
			    good_one = 1;
		    }
	}
	if (!good_one) {
		/* cleanup */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
2837 
/*
 * Scan every disk device in the system for RAIDframe components: for
 * wedges (dk), accept those of type DKW_PTYPE_RAIDFRAME; for regular
 * disks, accept partitions marked FS_RAID.  Each candidate is handed to
 * rf_get_component() to validate its label.  Returns the accumulated
 * RF_AutoConfig_t list (possibly NULL).
 */
RF_AutoConfig_t *
rf_find_raid_components()
{
	struct vnode *vp;
	struct disklabel label;
	struct device *dv;
	dev_t dev;
	int bmajor, bminor, wedge;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;


	/* initialize the AutoConfig list */
	ac_list = NULL;

	/* we begin by trolling through *all* the devices on the system */

	for (dv = alldevs.tqh_first; dv != NULL;
	     dv = dv->dv_list.tqe_next) {

		/* we are only interested in disks... */
		if (device_class(dv) != DV_DISK)
			continue;

		/* we don't care about floppies... */
		if (device_is_a(dv, "fd")) {
			continue;
		}

		/* we don't care about CD's... */
		if (device_is_a(dv, "cd")) {
			continue;
		}

		/* we don't care about md's... */
		if (device_is_a(dv, "md")) {
			continue;
		}

		/* hdfd is the Atari/Hades floppy driver */
		if (device_is_a(dv, "hdfd")) {
			continue;
		}

		/* fdisa is the Atari/Milan floppy driver */
		if (device_is_a(dv, "fdisa")) {
			continue;
		}

		/* need to find the device_name_to_block_device_major stuff */
		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

		/* get a vnode for the raw partition of this disk */

		wedge = device_is_a(dv, "dk");
		/* NOTE(review): minor() applied to a unit number looks odd —
		   confirm device_unit() really needs the minor() wrap here. */
		bminor = minor(device_unit(dv));
		dev = wedge ? makedev(bmajor, bminor) :
		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
		if (bdevvp(dev, &vp))
			panic("RAID can't alloc vnode");

		error = VOP_OPEN(vp, FREAD, NOCRED);

		if (error) {
			/* "Who cares."  Continue looking
			   for something that exists*/
			vput(vp);
			continue;
		}

		if (wedge) {
			/* Wedges carry their own type info; no disklabel. */
			struct dkwedge_info dkw;
			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
			    NOCRED);
			if (error) {
				printf("RAIDframe: can't get wedge info for "
				    "dev %s (%d)\n", device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			/* rf_get_component now owns (or closes) vp. */
			ac_list = rf_get_component(ac_list, dev, vp,
			    device_xname(dv), dkw.dkw_size);
			continue;
		}

		/* Ok, the disk exists.  Go get the disklabel. */
		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
		if (error) {
			/*
			 * XXX can't happen - open() would
			 * have errored out (or faked up one)
			 */
			if (error != ENOTTY)
				printf("RAIDframe: can't get label for dev "
				    "%s (%d)\n", device_xname(dv), error);
		}

		/* don't need this any more.  We'll allocate it again
		   a little later if we really do... */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);

		if (error)
			continue;

		/* Re-open each FS_RAID partition individually. */
		for (i = 0; i < label.d_npartitions; i++) {
			char cname[sizeof(ac_list->devname)];

			/* We only support partitions marked as RAID */
			if (label.d_partitions[i].p_fstype != FS_RAID)
				continue;

			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + i);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[i].p_size);
		}
	}
	return ac_list;
}
2980 
2981 
2982 static int
2983 rf_reasonable_label(RF_ComponentLabel_t *clabel)
2984 {
2985 
2986 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
2987 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
2988 	    ((clabel->clean == RF_RAID_CLEAN) ||
2989 	     (clabel->clean == RF_RAID_DIRTY)) &&
2990 	    clabel->row >=0 &&
2991 	    clabel->column >= 0 &&
2992 	    clabel->num_rows > 0 &&
2993 	    clabel->num_columns > 0 &&
2994 	    clabel->row < clabel->num_rows &&
2995 	    clabel->column < clabel->num_columns &&
2996 	    clabel->blockSize > 0 &&
2997 	    clabel->numBlocks > 0) {
2998 		/* label looks reasonable enough... */
2999 		return(1);
3000 	}
3001 	return(0);
3002 }
3003 
3004 
#ifdef DEBUG
/* Dump every field of a component label to the console (debug builds). */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status );
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %d\n",
	       (char) clabel->parityConfig, clabel->blockSize,
	       clabel->numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No" );
	printf("   Contains root partition: %s\n",
	       clabel->root_partition ? "Yes" : "No" );
	printf("   Last configured as: raid%d\n", clabel->last_unit );
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
3032 
3033 RF_ConfigSet_t *
3034 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3035 {
3036 	RF_AutoConfig_t *ac;
3037 	RF_ConfigSet_t *config_sets;
3038 	RF_ConfigSet_t *cset;
3039 	RF_AutoConfig_t *ac_next;
3040 
3041 
3042 	config_sets = NULL;
3043 
3044 	/* Go through the AutoConfig list, and figure out which components
3045 	   belong to what sets.  */
3046 	ac = ac_list;
3047 	while(ac!=NULL) {
3048 		/* we're going to putz with ac->next, so save it here
3049 		   for use at the end of the loop */
3050 		ac_next = ac->next;
3051 
3052 		if (config_sets == NULL) {
3053 			/* will need at least this one... */
3054 			config_sets = (RF_ConfigSet_t *)
3055 				malloc(sizeof(RF_ConfigSet_t),
3056 				       M_RAIDFRAME, M_NOWAIT);
3057 			if (config_sets == NULL) {
3058 				panic("rf_create_auto_sets: No memory!");
3059 			}
3060 			/* this one is easy :) */
3061 			config_sets->ac = ac;
3062 			config_sets->next = NULL;
3063 			config_sets->rootable = 0;
3064 			ac->next = NULL;
3065 		} else {
3066 			/* which set does this component fit into? */
3067 			cset = config_sets;
3068 			while(cset!=NULL) {
3069 				if (rf_does_it_fit(cset, ac)) {
3070 					/* looks like it matches... */
3071 					ac->next = cset->ac;
3072 					cset->ac = ac;
3073 					break;
3074 				}
3075 				cset = cset->next;
3076 			}
3077 			if (cset==NULL) {
3078 				/* didn't find a match above... new set..*/
3079 				cset = (RF_ConfigSet_t *)
3080 					malloc(sizeof(RF_ConfigSet_t),
3081 					       M_RAIDFRAME, M_NOWAIT);
3082 				if (cset == NULL) {
3083 					panic("rf_create_auto_sets: No memory!");
3084 				}
3085 				cset->ac = ac;
3086 				ac->next = NULL;
3087 				cset->next = config_sets;
3088 				cset->rootable = 0;
3089 				config_sets = cset;
3090 			}
3091 		}
3092 		ac = ac_next;
3093 	}
3094 
3095 
3096 	return(config_sets);
3097 }
3098 
3099 static int
3100 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3101 {
3102 	RF_ComponentLabel_t *clabel1, *clabel2;
3103 
3104 	/* If this one matches the *first* one in the set, that's good
3105 	   enough, since the other members of the set would have been
3106 	   through here too... */
3107 	/* note that we are not checking partitionSize here..
3108 
3109 	   Note that we are also not checking the mod_counters here.
3110 	   If everything else matches execpt the mod_counter, that's
3111 	   good enough for this test.  We will deal with the mod_counters
3112 	   a little later in the autoconfiguration process.
3113 
3114 	    (clabel1->mod_counter == clabel2->mod_counter) &&
3115 
3116 	   The reason we don't check for this is that failed disks
3117 	   will have lower modification counts.  If those disks are
3118 	   not added to the set they used to belong to, then they will
3119 	   form their own set, which may result in 2 different sets,
3120 	   for example, competing to be configured at raid0, and
3121 	   perhaps competing to be the root filesystem set.  If the
3122 	   wrong ones get configured, or both attempt to become /,
3123 	   weird behaviour and or serious lossage will occur.  Thus we
3124 	   need to bring them into the fold here, and kick them out at
3125 	   a later point.
3126 
3127 	*/
3128 
3129 	clabel1 = cset->ac->clabel;
3130 	clabel2 = ac->clabel;
3131 	if ((clabel1->version == clabel2->version) &&
3132 	    (clabel1->serial_number == clabel2->serial_number) &&
3133 	    (clabel1->num_rows == clabel2->num_rows) &&
3134 	    (clabel1->num_columns == clabel2->num_columns) &&
3135 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
3136 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3137 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3138 	    (clabel1->parityConfig == clabel2->parityConfig) &&
3139 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3140 	    (clabel1->blockSize == clabel2->blockSize) &&
3141 	    (clabel1->numBlocks == clabel2->numBlocks) &&
3142 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
3143 	    (clabel1->root_partition == clabel2->root_partition) &&
3144 	    (clabel1->last_unit == clabel2->last_unit) &&
3145 	    (clabel1->config_order == clabel2->config_order)) {
3146 		/* if it get's here, it almost *has* to be a match */
3147 	} else {
3148 		/* it's not consistent with somebody in the set..
3149 		   punt */
3150 		return(0);
3151 	}
3152 	/* all was fine.. it must fit... */
3153 	return(1);
3154 }
3155 
/*
 * Check whether configuration set 'cset' has enough live components
 * to be configured.  Returns 1 if the set can be brought up, 0 if too
 * many components are missing or stale.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set:
	   the highest mod_counter seen on any member.  Components whose
	   counter is lower are stale and treated as missing below. */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		/* look for a current (mod_counter matches) component
		   for column c */
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just finished the odd component of a
				   mirror pair without bailing out..
				   reset the even_pair_failed flag, and
				   go on to the next pair.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
3258 
3259 void
3260 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3261 			RF_Raid_t *raidPtr)
3262 {
3263 	RF_ComponentLabel_t *clabel;
3264 	int i;
3265 
3266 	clabel = ac->clabel;
3267 
3268 	/* 1. Fill in the common stuff */
3269 	config->numRow = clabel->num_rows = 1;
3270 	config->numCol = clabel->num_columns;
3271 	config->numSpare = 0; /* XXX should this be set here? */
3272 	config->sectPerSU = clabel->sectPerSU;
3273 	config->SUsPerPU = clabel->SUsPerPU;
3274 	config->SUsPerRU = clabel->SUsPerRU;
3275 	config->parityConfig = clabel->parityConfig;
3276 	/* XXX... */
3277 	strcpy(config->diskQueueType,"fifo");
3278 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3279 	config->layoutSpecificSize = 0; /* XXX ?? */
3280 
3281 	while(ac!=NULL) {
3282 		/* row/col values will be in range due to the checks
3283 		   in reasonable_label() */
3284 		strcpy(config->devnames[0][ac->clabel->column],
3285 		       ac->devname);
3286 		ac = ac->next;
3287 	}
3288 
3289 	for(i=0;i<RF_MAXDBGV;i++) {
3290 		config->debugVars[i][0] = 0;
3291 	}
3292 }
3293 
3294 int
3295 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3296 {
3297 	RF_ComponentLabel_t clabel;
3298 	struct vnode *vp;
3299 	dev_t dev;
3300 	int column;
3301 	int sparecol;
3302 
3303 	raidPtr->autoconfigure = new_value;
3304 
3305 	for(column=0; column<raidPtr->numCol; column++) {
3306 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
3307 			dev = raidPtr->Disks[column].dev;
3308 			vp = raidPtr->raid_cinfo[column].ci_vp;
3309 			raidread_component_label(dev, vp, &clabel);
3310 			clabel.autoconfigure = new_value;
3311 			raidwrite_component_label(dev, vp, &clabel);
3312 		}
3313 	}
3314 	for(column = 0; column < raidPtr->numSpare ; column++) {
3315 		sparecol = raidPtr->numCol + column;
3316 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3317 			dev = raidPtr->Disks[sparecol].dev;
3318 			vp = raidPtr->raid_cinfo[sparecol].ci_vp;
3319 			raidread_component_label(dev, vp, &clabel);
3320 			clabel.autoconfigure = new_value;
3321 			raidwrite_component_label(dev, vp, &clabel);
3322 		}
3323 	}
3324 	return(new_value);
3325 }
3326 
3327 int
3328 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3329 {
3330 	RF_ComponentLabel_t clabel;
3331 	struct vnode *vp;
3332 	dev_t dev;
3333 	int column;
3334 	int sparecol;
3335 
3336 	raidPtr->root_partition = new_value;
3337 	for(column=0; column<raidPtr->numCol; column++) {
3338 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
3339 			dev = raidPtr->Disks[column].dev;
3340 			vp = raidPtr->raid_cinfo[column].ci_vp;
3341 			raidread_component_label(dev, vp, &clabel);
3342 			clabel.root_partition = new_value;
3343 			raidwrite_component_label(dev, vp, &clabel);
3344 		}
3345 	}
3346 	for(column = 0; column < raidPtr->numSpare ; column++) {
3347 		sparecol = raidPtr->numCol + column;
3348 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3349 			dev = raidPtr->Disks[sparecol].dev;
3350 			vp = raidPtr->raid_cinfo[sparecol].ci_vp;
3351 			raidread_component_label(dev, vp, &clabel);
3352 			clabel.root_partition = new_value;
3353 			raidwrite_component_label(dev, vp, &clabel);
3354 		}
3355 	}
3356 	return(new_value);
3357 }
3358 
3359 void
3360 rf_release_all_vps(RF_ConfigSet_t *cset)
3361 {
3362 	RF_AutoConfig_t *ac;
3363 
3364 	ac = cset->ac;
3365 	while(ac!=NULL) {
3366 		/* Close the vp, and give it back */
3367 		if (ac->vp) {
3368 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3369 			VOP_CLOSE(ac->vp, FREAD, NOCRED);
3370 			vput(ac->vp);
3371 			ac->vp = NULL;
3372 		}
3373 		ac = ac->next;
3374 	}
3375 }
3376 
3377 
3378 void
3379 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3380 {
3381 	RF_AutoConfig_t *ac;
3382 	RF_AutoConfig_t *next_ac;
3383 
3384 	ac = cset->ac;
3385 	while(ac!=NULL) {
3386 		next_ac = ac->next;
3387 		/* nuke the label */
3388 		free(ac->clabel, M_RAIDFRAME);
3389 		/* cleanup the config structure */
3390 		free(ac, M_RAIDFRAME);
3391 		/* "next.." */
3392 		ac = next_ac;
3393 	}
3394 	/* and, finally, nuke the config set */
3395 	free(cset, M_RAIDFRAME);
3396 }
3397 
3398 
3399 void
3400 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
3401 {
3402 	/* current version number */
3403 	clabel->version = RF_COMPONENT_LABEL_VERSION;
3404 	clabel->serial_number = raidPtr->serial_number;
3405 	clabel->mod_counter = raidPtr->mod_counter;
3406 	clabel->num_rows = 1;
3407 	clabel->num_columns = raidPtr->numCol;
3408 	clabel->clean = RF_RAID_DIRTY; /* not clean */
3409 	clabel->status = rf_ds_optimal; /* "It's good!" */
3410 
3411 	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
3412 	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
3413 	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;
3414 
3415 	clabel->blockSize = raidPtr->bytesPerSector;
3416 	clabel->numBlocks = raidPtr->sectorsPerDisk;
3417 
3418 	/* XXX not portable */
3419 	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
3420 	clabel->maxOutstanding = raidPtr->maxOutstanding;
3421 	clabel->autoconfigure = raidPtr->autoconfigure;
3422 	clabel->root_partition = raidPtr->root_partition;
3423 	clabel->last_unit = raidPtr->raidid;
3424 	clabel->config_order = raidPtr->config_order;
3425 }
3426 
3427 int
3428 rf_auto_config_set(RF_ConfigSet_t *cset, int *unit)
3429 {
3430 	RF_Raid_t *raidPtr;
3431 	RF_Config_t *config;
3432 	int raidID;
3433 	int retcode;
3434 
3435 #ifdef DEBUG
3436 	printf("RAID autoconfigure\n");
3437 #endif
3438 
3439 	retcode = 0;
3440 	*unit = -1;
3441 
3442 	/* 1. Create a config structure */
3443 
3444 	config = (RF_Config_t *)malloc(sizeof(RF_Config_t),
3445 				       M_RAIDFRAME,
3446 				       M_NOWAIT);
3447 	if (config==NULL) {
3448 		printf("Out of mem!?!?\n");
3449 				/* XXX do something more intelligent here. */
3450 		return(1);
3451 	}
3452 
3453 	memset(config, 0, sizeof(RF_Config_t));
3454 
3455 	/*
3456 	   2. Figure out what RAID ID this one is supposed to live at
3457 	   See if we can get the same RAID dev that it was configured
3458 	   on last time..
3459 	*/
3460 
3461 	raidID = cset->ac->clabel->last_unit;
3462 	if ((raidID < 0) || (raidID >= numraid)) {
3463 		/* let's not wander off into lala land. */
3464 		raidID = numraid - 1;
3465 	}
3466 	if (raidPtrs[raidID]->valid != 0) {
3467 
3468 		/*
3469 		   Nope... Go looking for an alternative...
3470 		   Start high so we don't immediately use raid0 if that's
3471 		   not taken.
3472 		*/
3473 
3474 		for(raidID = numraid - 1; raidID >= 0; raidID--) {
3475 			if (raidPtrs[raidID]->valid == 0) {
3476 				/* can use this one! */
3477 				break;
3478 			}
3479 		}
3480 	}
3481 
3482 	if (raidID < 0) {
3483 		/* punt... */
3484 		printf("Unable to auto configure this set!\n");
3485 		printf("(Out of RAID devs!)\n");
3486 		free(config, M_RAIDFRAME);
3487 		return(1);
3488 	}
3489 
3490 #ifdef DEBUG
3491 	printf("Configuring raid%d:\n",raidID);
3492 #endif
3493 
3494 	raidPtr = raidPtrs[raidID];
3495 
3496 	/* XXX all this stuff should be done SOMEWHERE ELSE! */
3497 	raidPtr->raidid = raidID;
3498 	raidPtr->openings = RAIDOUTSTANDING;
3499 
3500 	/* 3. Build the configuration structure */
3501 	rf_create_configuration(cset->ac, config, raidPtr);
3502 
3503 	/* 4. Do the configuration */
3504 	retcode = rf_Configure(raidPtr, config, cset->ac);
3505 
3506 	if (retcode == 0) {
3507 
3508 		raidinit(raidPtrs[raidID]);
3509 
3510 		rf_markalldirty(raidPtrs[raidID]);
3511 		raidPtrs[raidID]->autoconfigure = 1; /* XXX do this here? */
3512 		if (cset->ac->clabel->root_partition==1) {
3513 			/* everything configured just fine.  Make a note
3514 			   that this set is eligible to be root. */
3515 			cset->rootable = 1;
3516 			/* XXX do this here? */
3517 			raidPtrs[raidID]->root_partition = 1;
3518 		}
3519 	}
3520 
3521 	/* 5. Cleanup */
3522 	free(config, M_RAIDFRAME);
3523 
3524 	*unit = raidID;
3525 	return(retcode);
3526 }
3527 
3528 void
3529 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
3530 {
3531 	struct buf *bp;
3532 
3533 	bp = (struct buf *)desc->bp;
3534 	disk_unbusy(&raid_softc[desc->raidPtr->raidid].sc_dkdev,
3535 	    (bp->b_bcount - bp->b_resid), (bp->b_flags & B_READ));
3536 }
3537 
/*
 * Initialize a RAIDframe memory pool: 'size' is the object size,
 * 'w_chan' the wait-channel name, 'xmin' the number of objects to
 * pre-allocate (also used as the low watermark), and 'xmax' the
 * high watermark.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
	pool_setlowat(p, xmin);
}
3547 
3548 /*
3549  * rf_buf_queue_check(int raidid) -- looks into the buf_queue to see
3550  * if there is IO pending and if that IO could possibly be done for a
3551  * given RAID set.  Returns 0 if IO is waiting and can be done, 1
3552  * otherwise.
3553  *
3554  */
3555 
3556 int
3557 rf_buf_queue_check(int raidid)
3558 {
3559 	if ((BUFQ_PEEK(raid_softc[raidid].buf_queue) != NULL) &&
3560 	    raidPtrs[raidid]->openings > 0) {
3561 		/* there is work to do */
3562 		return 0;
3563 	}
3564 	/* default is nothing to do */
3565 	return 1;
3566 }
3567 
3568 int
3569 rf_getdisksize(struct vnode *vp, struct lwp *l, RF_RaidDisk_t *diskPtr)
3570 {
3571 	struct partinfo dpart;
3572 	struct dkwedge_info dkw;
3573 	int error;
3574 
3575 	error = VOP_IOCTL(vp, DIOCGPART, &dpart, FREAD, l->l_cred);
3576 	if (error == 0) {
3577 		diskPtr->blockSize = dpart.disklab->d_secsize;
3578 		diskPtr->numBlocks = dpart.part->p_size - rf_protectedSectors;
3579 		diskPtr->partitionSize = dpart.part->p_size;
3580 		return 0;
3581 	}
3582 
3583 	error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD, l->l_cred);
3584 	if (error == 0) {
3585 		diskPtr->blockSize = 512;	/* XXX */
3586 		diskPtr->numBlocks = dkw.dkw_size - rf_protectedSectors;
3587 		diskPtr->partitionSize = dkw.dkw_size;
3588 		return 0;
3589 	}
3590 	return error;
3591 }
3592 
/*
 * Autoconf match routine: pseudo-device, so always match.
 */
static int
raid_match(struct device *self, struct cfdata *cfdata,
    void *aux)
{
	return 1;
}
3599 
/*
 * Autoconf attach routine: nothing to do at attach time; the unit
 * is set up later when it is actually configured.
 */
static void
raid_attach(struct device *parent, struct device *self,
    void *aux)
{

}
3606 
3607 
3608 static int
3609 raid_detach(struct device *self, int flags)
3610 {
3611 	struct raid_softc *rs = (struct raid_softc *)self;
3612 
3613 	if (rs->sc_flags & RAIDF_INITED)
3614 		return EBUSY;
3615 
3616 	return 0;
3617 }
3618 
3619 static void
3620 rf_set_properties(struct raid_softc *rs, RF_Raid_t *raidPtr)
3621 {
3622 	prop_dictionary_t disk_info, odisk_info, geom;
3623 	disk_info = prop_dictionary_create();
3624 	geom = prop_dictionary_create();
3625 	prop_dictionary_set_uint64(geom, "sectors-per-unit",
3626 				   raidPtr->totalSectors);
3627 	prop_dictionary_set_uint32(geom, "sector-size",
3628 				   raidPtr->bytesPerSector);
3629 
3630 	prop_dictionary_set_uint16(geom, "sectors-per-track",
3631 				   raidPtr->Layout.dataSectorsPerStripe);
3632 	prop_dictionary_set_uint16(geom, "tracks-per-cylinder",
3633 				   4 * raidPtr->numCol);
3634 
3635 	prop_dictionary_set_uint64(geom, "cylinders-per-unit",
3636 	   raidPtr->totalSectors / (raidPtr->Layout.dataSectorsPerStripe *
3637 	   (4 * raidPtr->numCol)));
3638 
3639 	prop_dictionary_set(disk_info, "geometry", geom);
3640 	prop_object_release(geom);
3641 	prop_dictionary_set(device_properties(rs->sc_dev),
3642 			    "disk-info", disk_info);
3643 	odisk_info = rs->sc_dkdev.dk_info;
3644 	rs->sc_dkdev.dk_info = disk_info;
3645 	if (odisk_info)
3646 		prop_object_release(odisk_info);
3647 }
3648