xref: /netbsd-src/sys/dev/raidframe/rf_netbsdkintf.c (revision b757af438b42b93f8c6571f026d8b8ef3eaf5fc9)
1 /*	$NetBSD: rf_netbsdkintf.c,v 1.296 2012/02/16 06:52:03 buhrow Exp $	*/
2 
3 /*-
4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Greg Oster; Jason R. Thorpe.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 1988 University of Utah.
34  * Copyright (c) 1990, 1993
35  *      The Regents of the University of California.  All rights reserved.
36  *
37  * This code is derived from software contributed to Berkeley by
38  * the Systems Programming Group of the University of Utah Computer
39  * Science Department.
40  *
41  * Redistribution and use in source and binary forms, with or without
42  * modification, are permitted provided that the following conditions
43  * are met:
44  * 1. Redistributions of source code must retain the above copyright
45  *    notice, this list of conditions and the following disclaimer.
46  * 2. Redistributions in binary form must reproduce the above copyright
47  *    notice, this list of conditions and the following disclaimer in the
48  *    documentation and/or other materials provided with the distribution.
49  * 3. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
66  *
67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
68  */
69 
70 /*
71  * Copyright (c) 1995 Carnegie-Mellon University.
72  * All rights reserved.
73  *
74  * Authors: Mark Holland, Jim Zelenka
75  *
76  * Permission to use, copy, modify and distribute this software and
77  * its documentation is hereby granted, provided that both the copyright
78  * notice and this permission notice appear in all copies of the
79  * software, derivative works or modified versions, and any portions
80  * thereof, and that both notices appear in supporting documentation.
81  *
82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
85  *
86  * Carnegie Mellon requests users of this software to return to
87  *
88  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
89  *  School of Computer Science
90  *  Carnegie Mellon University
91  *  Pittsburgh PA 15213-3890
92  *
93  * any improvements or extensions that they make and grant Carnegie the
94  * rights to redistribute these changes.
95  */
96 
97 /***********************************************************
98  *
 99  * rf_netbsdkintf.c -- the kernel interface routines for RAIDframe
100  *
101  ***********************************************************/
102 
103 #include <sys/cdefs.h>
104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.296 2012/02/16 06:52:03 buhrow Exp $");
105 
106 #ifdef _KERNEL_OPT
107 #include "opt_compat_netbsd.h"
108 #include "opt_raid_autoconfig.h"
109 #include "raid.h"
110 #endif
111 
112 #include <sys/param.h>
113 #include <sys/errno.h>
114 #include <sys/pool.h>
115 #include <sys/proc.h>
116 #include <sys/queue.h>
117 #include <sys/disk.h>
118 #include <sys/device.h>
119 #include <sys/stat.h>
120 #include <sys/ioctl.h>
121 #include <sys/fcntl.h>
122 #include <sys/systm.h>
123 #include <sys/vnode.h>
124 #include <sys/disklabel.h>
125 #include <sys/conf.h>
126 #include <sys/buf.h>
127 #include <sys/bufq.h>
128 #include <sys/reboot.h>
129 #include <sys/kauth.h>
130 
131 #include <prop/proplib.h>
132 
133 #include <dev/raidframe/raidframevar.h>
134 #include <dev/raidframe/raidframeio.h>
135 #include <dev/raidframe/rf_paritymap.h>
136 
137 #include "rf_raid.h"
138 #include "rf_copyback.h"
139 #include "rf_dag.h"
140 #include "rf_dagflags.h"
141 #include "rf_desc.h"
142 #include "rf_diskqueue.h"
143 #include "rf_etimer.h"
144 #include "rf_general.h"
145 #include "rf_kintf.h"
146 #include "rf_options.h"
147 #include "rf_driver.h"
148 #include "rf_parityscan.h"
149 #include "rf_threadstuff.h"
150 
151 #ifdef COMPAT_50
152 #include "rf_compat50.h"
153 #endif
154 
155 #ifdef DEBUG
156 int     rf_kdebug_level = 0;
157 #define db1_printf(a) if (rf_kdebug_level > 0) printf a
158 #else				/* DEBUG */
159 #define db1_printf(a) { }
160 #endif				/* DEBUG */
161 
162 static RF_Raid_t **raidPtrs;	/* global raid device descriptors */
163 
164 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
165 static rf_declare_mutex2(rf_sparet_wait_mutex);
166 static rf_declare_cond2(rf_sparet_wait_cv);
167 static rf_declare_cond2(rf_sparet_resp_cv);
168 
169 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
170 						 * spare table */
171 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
172 						 * installation process */
173 #endif
174 
175 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
176 
177 /* prototypes */
178 static void KernelWakeupFunc(struct buf *);
179 static void InitBP(struct buf *, struct vnode *, unsigned,
180     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
181     void *, int, struct proc *);
182 static void raidinit(RF_Raid_t *);
183 
184 void raidattach(int);
185 static int raid_match(device_t, cfdata_t, void *);
186 static void raid_attach(device_t, device_t, void *);
187 static int raid_detach(device_t, int);
188 
189 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
190     daddr_t, daddr_t);
191 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
192     daddr_t, daddr_t, int);
193 
194 static int raidwrite_component_label(unsigned,
195     dev_t, struct vnode *, RF_ComponentLabel_t *);
196 static int raidread_component_label(unsigned,
197     dev_t, struct vnode *, RF_ComponentLabel_t *);
198 
199 
200 dev_type_open(raidopen);
201 dev_type_close(raidclose);
202 dev_type_read(raidread);
203 dev_type_write(raidwrite);
204 dev_type_ioctl(raidioctl);
205 dev_type_strategy(raidstrategy);
206 dev_type_dump(raiddump);
207 dev_type_size(raidsize);
208 
209 const struct bdevsw raid_bdevsw = {
210 	raidopen, raidclose, raidstrategy, raidioctl,
211 	raiddump, raidsize, D_DISK
212 };
213 
214 const struct cdevsw raid_cdevsw = {
215 	raidopen, raidclose, raidread, raidwrite, raidioctl,
216 	nostop, notty, nopoll, nommap, nokqfilter, D_DISK
217 };
218 
219 static struct dkdriver rf_dkdriver = { raidstrategy, minphys };
220 
221 /* XXX Not sure if the following should be replacing the raidPtrs above,
222    or if it should be used in conjunction with that...
223 */
224 
225 struct raid_softc {
226 	device_t sc_dev;
227 	int     sc_flags;	/* flags */
228 	int     sc_cflags;	/* configuration flags */
229 	uint64_t sc_size;	/* size of the raid device */
230 	char    sc_xname[20];	/* XXX external name */
231 	struct disk sc_dkdev;	/* generic disk device info */
232 	struct bufq_state *buf_queue;	/* used for the device queue */
233 };
234 /* sc_flags */
235 #define RAIDF_INITED	0x01	/* unit has been initialized */
236 #define RAIDF_WLABEL	0x02	/* label area is writable */
237 #define RAIDF_LABELLING	0x04	/* unit is currently being labelled */
238 #define RAIDF_SHUTDOWN	0x08	/* unit is being shutdown */
239 #define RAIDF_WANTED	0x40	/* someone is waiting to obtain a lock */
240 #define RAIDF_LOCKED	0x80	/* unit is locked */
241 
242 #define	raidunit(x)	DISKUNIT(x)
243 int numraid = 0;
244 
245 extern struct cfdriver raid_cd;
246 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
247     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
248     DVF_DETACH_SHUTDOWN);
249 
250 /*
251  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
252  * Be aware that large numbers can allow the driver to consume a lot of
253  * kernel memory, especially on writes, and in degraded mode reads.
254  *
255  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
256  * a single 64K write will typically require 64K for the old data,
257  * 64K for the old parity, and 64K for the new parity, for a total
258  * of 192K (if the parity buffer is not re-used immediately).
259  * Even it if is used immediately, that's still 128K, which when multiplied
260  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
261  *
262  * Now in degraded mode, for example, a 64K read on the above setup may
263  * require data reconstruction, which will require *all* of the 4 remaining
264  * disks to participate -- 4 * 32K/disk == 128K again.
265  */
266 
267 #ifndef RAIDOUTSTANDING
268 #define RAIDOUTSTANDING   6
269 #endif
270 
271 #define RAIDLABELDEV(dev)	\
272 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
273 
274 /* declared here, and made public, for the benefit of KVM stuff.. */
275 struct raid_softc *raid_softc;
276 
277 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
278 				     struct disklabel *);
279 static void raidgetdisklabel(dev_t);
280 static void raidmakedisklabel(struct raid_softc *);
281 
282 static int raidlock(struct raid_softc *);
283 static void raidunlock(struct raid_softc *);
284 
285 static int raid_detach_unlocked(struct raid_softc *);
286 
287 static void rf_markalldirty(RF_Raid_t *);
288 static void rf_set_properties(struct raid_softc *, RF_Raid_t *);
289 
290 void rf_ReconThread(struct rf_recon_req *);
291 void rf_RewriteParityThread(RF_Raid_t *raidPtr);
292 void rf_CopybackThread(RF_Raid_t *raidPtr);
293 void rf_ReconstructInPlaceThread(struct rf_recon_req *);
294 int rf_autoconfig(device_t);
295 void rf_buildroothack(RF_ConfigSet_t *);
296 
297 RF_AutoConfig_t *rf_find_raid_components(void);
298 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
299 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
300 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
301 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
302 int rf_set_autoconfig(RF_Raid_t *, int);
303 int rf_set_rootpartition(RF_Raid_t *, int);
304 void rf_release_all_vps(RF_ConfigSet_t *);
305 void rf_cleanup_config_set(RF_ConfigSet_t *);
306 int rf_have_enough_components(RF_ConfigSet_t *);
307 int rf_auto_config_set(RF_ConfigSet_t *, int *);
308 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
309 
310 /*
311  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
312  * Note that this is overridden by having RAID_AUTOCONFIG as an option
313  * in the kernel config file.
314  */
315 #ifdef RAID_AUTOCONFIG
316 int raidautoconfig = 1;
317 #else
318 int raidautoconfig = 0;
319 #endif
320 static bool raidautoconfigdone = false;
321 
322 struct RF_Pools_s rf_pools;
323 
324 void
325 raidattach(int num)
326 {
327 	int raidID;
328 	int i, rc;
329 
330 	aprint_debug("raidattach: Asked for %d units\n", num);
331 
332 	if (num <= 0) {
333 #ifdef DIAGNOSTIC
334 		panic("raidattach: count <= 0");
335 #endif
336 		return;
337 	}
338 	/* This is where all the initialization stuff gets done. */
339 
340 	numraid = num;
341 
342 	/* Make some space for requested number of units... */
343 
344 	RF_Malloc(raidPtrs, num * sizeof(RF_Raid_t *), (RF_Raid_t **));
345 	if (raidPtrs == NULL) {
346 		panic("raidPtrs is NULL!!");
347 	}
348 
349 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
350 	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
351 	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
352 	rf_init_cond2(rf_sparet_resp_cv, "rfgst");
353 
354 	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
355 #endif
356 
357 	for (i = 0; i < num; i++)
358 		raidPtrs[i] = NULL;
359 	rc = rf_BootRaidframe();
360 	if (rc == 0)
361 		aprint_verbose("Kernelized RAIDframe activated\n");
362 	else
363 		panic("Serious error booting RAID!!");
364 
365 	/* put together some datastructures like the CCD device does.. This
366 	 * lets us lock the device and what-not when it gets opened. */
367 
368 	raid_softc = (struct raid_softc *)
369 		malloc(num * sizeof(struct raid_softc),
370 		       M_RAIDFRAME, M_NOWAIT);
371 	if (raid_softc == NULL) {
372 		aprint_error("WARNING: no memory for RAIDframe driver\n");
373 		return;
374 	}
375 
376 	memset(raid_softc, 0, num * sizeof(struct raid_softc));
377 
378 	for (raidID = 0; raidID < num; raidID++) {
379 		bufq_alloc(&raid_softc[raidID].buf_queue, "fcfs", 0);
380 
381 		RF_Malloc(raidPtrs[raidID], sizeof(RF_Raid_t),
382 			  (RF_Raid_t *));
383 		if (raidPtrs[raidID] == NULL) {
384 			aprint_error("WARNING: raidPtrs[%d] is NULL\n", raidID);
385 			numraid = raidID;
386 			return;
387 		}
388 	}
389 
390 	if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
391 		aprint_error("raidattach: config_cfattach_attach failed?\n");
392 	}
393 
394 	raidautoconfigdone = false;
395 
396 	/*
397 	 * Register a finalizer which will be used to auto-config RAID
398 	 * sets once all real hardware devices have been found.
399 	 */
400 	if (config_finalize_register(NULL, rf_autoconfig) != 0)
401 		aprint_error("WARNING: unable to register RAIDframe finalizer\n");
402 }
403 
404 int
405 rf_autoconfig(device_t self)
406 {
407 	RF_AutoConfig_t *ac_list;
408 	RF_ConfigSet_t *config_sets;
409 
410 	if (!raidautoconfig || raidautoconfigdone == true)
411 		return (0);
412 
413 	/* XXX This code can only be run once. */
414 	raidautoconfigdone = true;
415 
416 	/* 1. locate all RAID components on the system */
417 	aprint_debug("Searching for RAID components...\n");
418 	ac_list = rf_find_raid_components();
419 
420 	/* 2. Sort them into their respective sets. */
421 	config_sets = rf_create_auto_sets(ac_list);
422 
423 	/*
424 	 * 3. Evaluate each set andconfigure the valid ones.
425 	 * This gets done in rf_buildroothack().
426 	 */
427 	rf_buildroothack(config_sets);
428 
429 	return 1;
430 }
431 
/*
 * Configure every candidate set that has enough components and is
 * marked for auto-configuration, then — unless the administrator has
 * hard-wired a root device — try to determine whether one of the
 * configured RAID sets should become the boot (root) device.
 */
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int retcode;
	int raidID;
	int rootID;
	int col;
	int num_root;
	char *devname;

	rootID = 0;
	num_root = 0;
	cset = config_sets;
	while (cset != NULL) {
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure==1) {
			retcode = rf_auto_config_set(cset,&raidID);
			if (!retcode) {
				aprint_debug("raid%d: configured ok\n", raidID);
				if (cset->rootable) {
					rootID = raidID;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed with code %d for raid%d\n", retcode, raidID);
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL)
		return;

	/* we found something bootable... */

	if (num_root == 1) {
		/* Exactly one rootable set: it becomes the boot device. */
		booted_device = raid_softc[rootID].sc_dev;
	} else if (num_root > 1) {

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */

		if (booted_device == NULL)
			cpu_rootconf();
		if (booted_device == NULL)
			return;

		/*
		 * Several sets claim to be rootable; disambiguate by
		 * looking for the set that contains the device the
		 * kernel was actually booted from.
		 */
		num_root = 0;
		for (raidID = 0; raidID < numraid; raidID++) {
			if (raidPtrs[raidID]->valid == 0)
				continue;

			if (raidPtrs[raidID]->root_partition == 0)
				continue;

			for (col = 0; col < raidPtrs[raidID]->numCol; col++) {
				devname = raidPtrs[raidID]->Disks[col].devname;
				/* skip the "/dev/" prefix for the compare */
				devname += sizeof("/dev/") - 1;
				if (strncmp(devname, device_xname(booted_device),
					    strlen(device_xname(booted_device))) != 0)
					continue;
				aprint_debug("raid%d includes boot device %s\n",
				       raidID, devname);
				num_root++;
				rootID = raidID;
			}
		}

		if (num_root == 1) {
			booted_device = raid_softc[rootID].sc_dev;
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
526 
527 
528 int
529 raidsize(dev_t dev)
530 {
531 	struct raid_softc *rs;
532 	struct disklabel *lp;
533 	int     part, unit, omask, size;
534 
535 	unit = raidunit(dev);
536 	if (unit >= numraid)
537 		return (-1);
538 	rs = &raid_softc[unit];
539 
540 	if ((rs->sc_flags & RAIDF_INITED) == 0)
541 		return (-1);
542 
543 	part = DISKPART(dev);
544 	omask = rs->sc_dkdev.dk_openmask & (1 << part);
545 	lp = rs->sc_dkdev.dk_label;
546 
547 	if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
548 		return (-1);
549 
550 	if (lp->d_partitions[part].p_fstype != FS_SWAP)
551 		size = -1;
552 	else
553 		size = lp->d_partitions[part].p_size *
554 		    (lp->d_secsize / DEV_BSIZE);
555 
556 	if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
557 		return (-1);
558 
559 	return (size);
560 
561 }
562 
/*
 * Crash-dump entry point: write `size' bytes at `va' to block `blkno'
 * of the RAID set.  Only RAID 1 sets (one data column, one parity
 * column) are supported; the dump goes directly to a single live
 * component (or used spare) of the mirror.
 */
int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	const struct bdevsw *bdev;
	struct disklabel *lp;
	RF_Raid_t *raidPtr;
	daddr_t offset;
	int     part, c, sparecol, j, scol, dumpto;
	int     error = 0;

	if (unit >= numraid)
		return (ENXIO);

	rs = &raid_softc[unit];
	raidPtr = raidPtrs[unit];

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENXIO;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;


	if ((error = raidlock(rs)) != 0)
		return error;

	/* The dump must be a whole number of DEV_BSIZE blocks. */
	if (size % DEV_BSIZE != 0) {
		error = EINVAL;
		goto out;
	}

	/* Refuse to write past the end of the RAID device. */
	if (blkno + size / DEV_BSIZE > rs->sc_size) {
		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
		    "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
		    size / DEV_BSIZE, rs->sc_size);
		error = EINVAL;
		goto out;
	}

	part = DISKPART(dev);
	lp = rs->sc_dkdev.dk_label;
	offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			scol = -1;
			/* Find which column this spare is standing in for. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we haven't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);

	/*
	   Note that blkno is relative to this particular partition.
	   By adding the offset of this partition in the RAID
	   set, and also adding RF_PROTECTED_SECTORS, we get a
	   value that is relative to the partition used for the
	   underlying component.
	*/

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno + offset, va, size);

out:
	raidunlock(rs);

	return error;
}
/* ARGSUSED */
/*
 * Open a partition of the RAID device.  Holds the softc lock for the
 * duration, reads the disklabel on the first open of a configured
 * unit, and records the opener in the open masks so the unit cannot
 * be unconfigured while it is in use.
 */
int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct disklabel *lp;
	int     part, pmask;
	int     error = 0;

	if (unit >= numraid)
		return (ENXIO);
	rs = &raid_softc[unit];

	if ((error = raidlock(rs)) != 0)
		return (error);

	/* Refuse new opens while the unit is being shut down. */
	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	lp = rs->sc_dkdev.dk_label;

	part = DISKPART(dev);

	/*
	 * If there are wedges, and this is not RAW_PART, then we
	 * need to fail.
	 */
	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
		error = EBUSY;
		goto bad;
	}
	pmask = (1 << part);

	/* First open of a configured unit: (re)read the disklabel. */
	if ((rs->sc_flags & RAIDF_INITED) &&
	    (rs->sc_dkdev.dk_openmask == 0))
		raidgetdisklabel(dev);

	/* make sure that this partition exists */

	if (part != RAW_PART) {
		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
		    ((part >= lp->d_npartitions) ||
			(lp->d_partitions[part].p_fstype == FS_UNUSED))) {
			error = ENXIO;
			goto bad;
		}
	}
	/* Prevent this unit from being unconfigured while open. */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask |= pmask;
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask |= pmask;
		break;
	}

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		 have done a configure before this.  I DO NOT WANT TO BE
		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		 THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(raidPtrs[unit]);
	}


	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

bad:
	/* NOTE: reached on success as well; error is 0 in that case. */
	raidunlock(rs);

	return (error);


}
786 /* ARGSUSED */
787 int
788 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
789 {
790 	int     unit = raidunit(dev);
791 	struct raid_softc *rs;
792 	int     error = 0;
793 	int     part;
794 
795 	if (unit >= numraid)
796 		return (ENXIO);
797 	rs = &raid_softc[unit];
798 
799 	if ((error = raidlock(rs)) != 0)
800 		return (error);
801 
802 	part = DISKPART(dev);
803 
804 	/* ...that much closer to allowing unconfiguration... */
805 	switch (fmt) {
806 	case S_IFCHR:
807 		rs->sc_dkdev.dk_copenmask &= ~(1 << part);
808 		break;
809 
810 	case S_IFBLK:
811 		rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
812 		break;
813 	}
814 	rs->sc_dkdev.dk_openmask =
815 	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
816 
817 	if ((rs->sc_dkdev.dk_openmask == 0) &&
818 	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
819 		/* Last one... device is not unconfigured yet.
820 		   Device shutdown has taken care of setting the
821 		   clean bits if RAIDF_INITED is not set
822 		   mark things as clean... */
823 
824 		rf_update_component_labels(raidPtrs[unit],
825 						 RF_FINAL_COMPONENT_UPDATE);
826 
827 		/* If the kernel is shutting down, it will detach
828 		 * this RAID set soon enough.
829 		 */
830 	}
831 
832 	raidunlock(rs);
833 	return (0);
834 
835 }
836 
837 void
838 raidstrategy(struct buf *bp)
839 {
840 	unsigned int raidID = raidunit(bp->b_dev);
841 	RF_Raid_t *raidPtr;
842 	struct raid_softc *rs = &raid_softc[raidID];
843 	int     wlabel;
844 
845 	if ((rs->sc_flags & RAIDF_INITED) ==0) {
846 		bp->b_error = ENXIO;
847 		goto done;
848 	}
849 	if (raidID >= numraid || !raidPtrs[raidID]) {
850 		bp->b_error = ENODEV;
851 		goto done;
852 	}
853 	raidPtr = raidPtrs[raidID];
854 	if (!raidPtr->valid) {
855 		bp->b_error = ENODEV;
856 		goto done;
857 	}
858 	if (bp->b_bcount == 0) {
859 		db1_printf(("b_bcount is zero..\n"));
860 		goto done;
861 	}
862 
863 	/*
864 	 * Do bounds checking and adjust transfer.  If there's an
865 	 * error, the bounds check will flag that for us.
866 	 */
867 
868 	wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING);
869 	if (DISKPART(bp->b_dev) == RAW_PART) {
870 		uint64_t size; /* device size in DEV_BSIZE unit */
871 
872 		if (raidPtr->logBytesPerSector > DEV_BSHIFT) {
873 			size = raidPtr->totalSectors <<
874 			    (raidPtr->logBytesPerSector - DEV_BSHIFT);
875 		} else {
876 			size = raidPtr->totalSectors >>
877 			    (DEV_BSHIFT - raidPtr->logBytesPerSector);
878 		}
879 		if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) {
880 			goto done;
881 		}
882 	} else {
883 		if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) {
884 			db1_printf(("Bounds check failed!!:%d %d\n",
885 				(int) bp->b_blkno, (int) wlabel));
886 			goto done;
887 		}
888 	}
889 
890 	rf_lock_mutex2(raidPtr->iodone_lock);
891 
892 	bp->b_resid = 0;
893 
894 	/* stuff it onto our queue */
895 	bufq_put(rs->buf_queue, bp);
896 
897 	/* scheduled the IO to happen at the next convenient time */
898 	rf_signal_cond2(raidPtr->iodone_cv);
899 	rf_unlock_mutex2(raidPtr->iodone_lock);
900 
901 	return;
902 
903 done:
904 	bp->b_resid = bp->b_bcount;
905 	biodone(bp);
906 }
907 /* ARGSUSED */
908 int
909 raidread(dev_t dev, struct uio *uio, int flags)
910 {
911 	int     unit = raidunit(dev);
912 	struct raid_softc *rs;
913 
914 	if (unit >= numraid)
915 		return (ENXIO);
916 	rs = &raid_softc[unit];
917 
918 	if ((rs->sc_flags & RAIDF_INITED) == 0)
919 		return (ENXIO);
920 
921 	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
922 
923 }
924 /* ARGSUSED */
925 int
926 raidwrite(dev_t dev, struct uio *uio, int flags)
927 {
928 	int     unit = raidunit(dev);
929 	struct raid_softc *rs;
930 
931 	if (unit >= numraid)
932 		return (ENXIO);
933 	rs = &raid_softc[unit];
934 
935 	if ((rs->sc_flags & RAIDF_INITED) == 0)
936 		return (ENXIO);
937 
938 	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
939 
940 }
941 
942 static int
943 raid_detach_unlocked(struct raid_softc *rs)
944 {
945 	int error;
946 	RF_Raid_t *raidPtr;
947 
948 	raidPtr = raidPtrs[device_unit(rs->sc_dev)];
949 
950 	/*
951 	 * If somebody has a partition mounted, we shouldn't
952 	 * shutdown.
953 	 */
954 	if (rs->sc_dkdev.dk_openmask != 0)
955 		return EBUSY;
956 
957 	if ((rs->sc_flags & RAIDF_INITED) == 0)
958 		;	/* not initialized: nothing to do */
959 	else if ((error = rf_Shutdown(raidPtr)) != 0)
960 		return error;
961 	else
962 		rs->sc_flags &= ~(RAIDF_INITED|RAIDF_SHUTDOWN);
963 
964 	/* Detach the disk. */
965 	dkwedge_delall(&rs->sc_dkdev);
966 	disk_detach(&rs->sc_dkdev);
967 	disk_destroy(&rs->sc_dkdev);
968 
969 	aprint_normal_dev(rs->sc_dev, "detached\n");
970 
971 	return 0;
972 }
973 
974 int
975 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
976 {
977 	int     unit = raidunit(dev);
978 	int     error = 0;
979 	int     part, pmask;
980 	cfdata_t cf;
981 	struct raid_softc *rs;
982 	RF_Config_t *k_cfg, *u_cfg;
983 	RF_Raid_t *raidPtr;
984 	RF_RaidDisk_t *diskPtr;
985 	RF_AccTotals_t *totals;
986 	RF_DeviceConfig_t *d_cfg, **ucfgp;
987 	u_char *specific_buf;
988 	int retcode = 0;
989 	int column;
990 /*	int raidid; */
991 	struct rf_recon_req *rrcopy, *rr;
992 	RF_ComponentLabel_t *clabel;
993 	RF_ComponentLabel_t *ci_label;
994 	RF_ComponentLabel_t **clabel_ptr;
995 	RF_SingleComponent_t *sparePtr,*componentPtr;
996 	RF_SingleComponent_t component;
997 	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
998 	int i, j, d;
999 #ifdef __HAVE_OLD_DISKLABEL
1000 	struct disklabel newlabel;
1001 #endif
1002 	struct dkwedge_info *dkw;
1003 
1004 	if (unit >= numraid)
1005 		return (ENXIO);
1006 	rs = &raid_softc[unit];
1007 	raidPtr = raidPtrs[unit];
1008 
1009 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
1010 		(int) DISKPART(dev), (int) unit, cmd));
1011 
1012 	/* Must be open for writes for these commands... */
1013 	switch (cmd) {
1014 #ifdef DIOCGSECTORSIZE
1015 	case DIOCGSECTORSIZE:
1016 		*(u_int *)data = raidPtr->bytesPerSector;
1017 		return 0;
1018 	case DIOCGMEDIASIZE:
1019 		*(off_t *)data =
1020 		    (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector;
1021 		return 0;
1022 #endif
1023 	case DIOCSDINFO:
1024 	case DIOCWDINFO:
1025 #ifdef __HAVE_OLD_DISKLABEL
1026 	case ODIOCWDINFO:
1027 	case ODIOCSDINFO:
1028 #endif
1029 	case DIOCWLABEL:
1030 	case DIOCAWEDGE:
1031 	case DIOCDWEDGE:
1032 		if ((flag & FWRITE) == 0)
1033 			return (EBADF);
1034 	}
1035 
1036 	/* Must be initialized for these... */
1037 	switch (cmd) {
1038 	case DIOCGDINFO:
1039 	case DIOCSDINFO:
1040 	case DIOCWDINFO:
1041 #ifdef __HAVE_OLD_DISKLABEL
1042 	case ODIOCGDINFO:
1043 	case ODIOCWDINFO:
1044 	case ODIOCSDINFO:
1045 	case ODIOCGDEFLABEL:
1046 #endif
1047 	case DIOCGPART:
1048 	case DIOCWLABEL:
1049 	case DIOCGDEFLABEL:
1050 	case DIOCAWEDGE:
1051 	case DIOCDWEDGE:
1052 	case DIOCLWEDGES:
1053 	case DIOCCACHESYNC:
1054 	case RAIDFRAME_SHUTDOWN:
1055 	case RAIDFRAME_REWRITEPARITY:
1056 	case RAIDFRAME_GET_INFO:
1057 	case RAIDFRAME_RESET_ACCTOTALS:
1058 	case RAIDFRAME_GET_ACCTOTALS:
1059 	case RAIDFRAME_KEEP_ACCTOTALS:
1060 	case RAIDFRAME_GET_SIZE:
1061 	case RAIDFRAME_FAIL_DISK:
1062 	case RAIDFRAME_COPYBACK:
1063 	case RAIDFRAME_CHECK_RECON_STATUS:
1064 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1065 	case RAIDFRAME_GET_COMPONENT_LABEL:
1066 	case RAIDFRAME_SET_COMPONENT_LABEL:
1067 	case RAIDFRAME_ADD_HOT_SPARE:
1068 	case RAIDFRAME_REMOVE_HOT_SPARE:
1069 	case RAIDFRAME_INIT_LABELS:
1070 	case RAIDFRAME_REBUILD_IN_PLACE:
1071 	case RAIDFRAME_CHECK_PARITY:
1072 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1073 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1074 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
1075 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1076 	case RAIDFRAME_SET_AUTOCONFIG:
1077 	case RAIDFRAME_SET_ROOT:
1078 	case RAIDFRAME_DELETE_COMPONENT:
1079 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
1080 	case RAIDFRAME_PARITYMAP_STATUS:
1081 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
1082 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
1083 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
1084 		if ((rs->sc_flags & RAIDF_INITED) == 0)
1085 			return (ENXIO);
1086 	}
1087 
1088 	switch (cmd) {
1089 #ifdef COMPAT_50
1090 	case RAIDFRAME_GET_INFO50:
1091 		return rf_get_info50(raidPtr, data);
1092 
1093 	case RAIDFRAME_CONFIGURE50:
1094 		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
1095 			return retcode;
1096 		goto config;
1097 #endif
1098 		/* configure the system */
1099 	case RAIDFRAME_CONFIGURE:
1100 
1101 		if (raidPtr->valid) {
1102 			/* There is a valid RAID set running on this unit! */
1103 			printf("raid%d: Device already configured!\n",unit);
1104 			return(EINVAL);
1105 		}
1106 
1107 		/* copy-in the configuration information */
1108 		/* data points to a pointer to the configuration structure */
1109 
1110 		u_cfg = *((RF_Config_t **) data);
1111 		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
1112 		if (k_cfg == NULL) {
1113 			return (ENOMEM);
1114 		}
1115 		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
1116 		if (retcode) {
1117 			RF_Free(k_cfg, sizeof(RF_Config_t));
1118 			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
1119 				retcode));
1120 			return (retcode);
1121 		}
1122 		goto config;
1123 	config:
1124 		/* allocate a buffer for the layout-specific data, and copy it
1125 		 * in */
1126 		if (k_cfg->layoutSpecificSize) {
1127 			if (k_cfg->layoutSpecificSize > 10000) {
1128 				/* sanity check */
1129 				RF_Free(k_cfg, sizeof(RF_Config_t));
1130 				return (EINVAL);
1131 			}
1132 			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
1133 			    (u_char *));
1134 			if (specific_buf == NULL) {
1135 				RF_Free(k_cfg, sizeof(RF_Config_t));
1136 				return (ENOMEM);
1137 			}
1138 			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
1139 			    k_cfg->layoutSpecificSize);
1140 			if (retcode) {
1141 				RF_Free(k_cfg, sizeof(RF_Config_t));
1142 				RF_Free(specific_buf,
1143 					k_cfg->layoutSpecificSize);
1144 				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
1145 					retcode));
1146 				return (retcode);
1147 			}
1148 		} else
1149 			specific_buf = NULL;
1150 		k_cfg->layoutSpecific = specific_buf;
1151 
1152 		/* should do some kind of sanity check on the configuration.
1153 		 * Store the sum of all the bytes in the last byte? */
1154 
1155 		/* configure the system */
1156 
1157 		/*
1158 		 * Clear the entire RAID descriptor, just to make sure
1159 		 *  there is no stale data left in the case of a
1160 		 *  reconfiguration
1161 		 */
1162 		memset(raidPtr, 0, sizeof(*raidPtr));
1163 		raidPtr->raidid = unit;
1164 
1165 		retcode = rf_Configure(raidPtr, k_cfg, NULL);
1166 
1167 		if (retcode == 0) {
1168 
1169 			/* allow this many simultaneous IO's to
1170 			   this RAID device */
1171 			raidPtr->openings = RAIDOUTSTANDING;
1172 
1173 			raidinit(raidPtr);
1174 			rf_markalldirty(raidPtr);
1175 		}
1176 		/* free the buffers.  No return code here. */
1177 		if (k_cfg->layoutSpecificSize) {
1178 			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
1179 		}
1180 		RF_Free(k_cfg, sizeof(RF_Config_t));
1181 
1182 		return (retcode);
1183 
1184 		/* shutdown the system */
1185 	case RAIDFRAME_SHUTDOWN:
1186 
1187 		part = DISKPART(dev);
1188 		pmask = (1 << part);
1189 
1190 		if ((error = raidlock(rs)) != 0)
1191 			return (error);
1192 
1193 		if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
1194 		    ((rs->sc_dkdev.dk_bopenmask & pmask) &&
1195 			(rs->sc_dkdev.dk_copenmask & pmask)))
1196 			retcode = EBUSY;
1197 		else {
1198 			rs->sc_flags |= RAIDF_SHUTDOWN;
1199 			rs->sc_dkdev.dk_copenmask &= ~pmask;
1200 			rs->sc_dkdev.dk_bopenmask &= ~pmask;
1201 			rs->sc_dkdev.dk_openmask &= ~pmask;
1202 			retcode = 0;
1203 		}
1204 
1205 		raidunlock(rs);
1206 
1207 		if (retcode != 0)
1208 			return retcode;
1209 
1210 		/* free the pseudo device attach bits */
1211 
1212 		cf = device_cfdata(rs->sc_dev);
1213 		if ((retcode = config_detach(rs->sc_dev, DETACH_QUIET)) == 0)
1214 			free(cf, M_RAIDFRAME);
1215 
1216 		return (retcode);
1217 	case RAIDFRAME_GET_COMPONENT_LABEL:
1218 		clabel_ptr = (RF_ComponentLabel_t **) data;
1219 		/* need to read the component label for the disk indicated
1220 		   by row,column in clabel */
1221 
1222 		/*
1223 		 * Perhaps there should be an option to skip the in-core
1224 		 * copy and hit the disk, as with disklabel(8).
1225 		 */
1226 		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));
1227 
1228 		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));
1229 
1230 		if (retcode) {
1231 			RF_Free(clabel, sizeof(*clabel));
1232 			return retcode;
1233 		}
1234 
1235 		clabel->row = 0; /* Don't allow looking at anything else.*/
1236 
1237 		column = clabel->column;
1238 
1239 		if ((column < 0) || (column >= raidPtr->numCol +
1240 		    raidPtr->numSpare)) {
1241 			RF_Free(clabel, sizeof(*clabel));
1242 			return EINVAL;
1243 		}
1244 
1245 		RF_Free(clabel, sizeof(*clabel));
1246 
1247 		clabel = raidget_component_label(raidPtr, column);
1248 
1249 		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));
1250 
1251 #if 0
1252 	case RAIDFRAME_SET_COMPONENT_LABEL:
1253 		clabel = (RF_ComponentLabel_t *) data;
1254 
1255 		/* XXX check the label for valid stuff... */
1256 		/* Note that some things *should not* get modified --
1257 		   the user should be re-initing the labels instead of
1258 		   trying to patch things.
1259 		   */
1260 
1261 		raidid = raidPtr->raidid;
1262 #ifdef DEBUG
1263 		printf("raid%d: Got component label:\n", raidid);
1264 		printf("raid%d: Version: %d\n", raidid, clabel->version);
1265 		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
1266 		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
1267 		printf("raid%d: Column: %d\n", raidid, clabel->column);
1268 		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
1269 		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
1270 		printf("raid%d: Status: %d\n", raidid, clabel->status);
1271 #endif
1272 		clabel->row = 0;
1273 		column = clabel->column;
1274 
1275 		if ((column < 0) || (column >= raidPtr->numCol)) {
1276 			return(EINVAL);
1277 		}
1278 
1279 		/* XXX this isn't allowed to do anything for now :-) */
1280 
1281 		/* XXX and before it is, we need to fill in the rest
1282 		   of the fields!?!?!?! */
1283 		memcpy(raidget_component_label(raidPtr, column),
1284 		    clabel, sizeof(*clabel));
1285 		raidflush_component_label(raidPtr, column);
1286 		return (0);
1287 #endif
1288 
1289 	case RAIDFRAME_INIT_LABELS:
1290 		clabel = (RF_ComponentLabel_t *) data;
1291 		/*
1292 		   we only want the serial number from
1293 		   the above.  We get all the rest of the information
1294 		   from the config that was used to create this RAID
1295 		   set.
1296 		   */
1297 
1298 		raidPtr->serial_number = clabel->serial_number;
1299 
1300 		for(column=0;column<raidPtr->numCol;column++) {
1301 			diskPtr = &raidPtr->Disks[column];
1302 			if (!RF_DEAD_DISK(diskPtr->status)) {
1303 				ci_label = raidget_component_label(raidPtr,
1304 				    column);
1305 				/* Zeroing this is important. */
1306 				memset(ci_label, 0, sizeof(*ci_label));
1307 				raid_init_component_label(raidPtr, ci_label);
1308 				ci_label->serial_number =
1309 				    raidPtr->serial_number;
1310 				ci_label->row = 0; /* we dont' pretend to support more */
1311 				rf_component_label_set_partitionsize(ci_label,
1312 				    diskPtr->partitionSize);
1313 				ci_label->column = column;
1314 				raidflush_component_label(raidPtr, column);
1315 			}
1316 			/* XXXjld what about the spares? */
1317 		}
1318 
1319 		return (retcode);
1320 	case RAIDFRAME_SET_AUTOCONFIG:
1321 		d = rf_set_autoconfig(raidPtr, *(int *) data);
1322 		printf("raid%d: New autoconfig value is: %d\n",
1323 		       raidPtr->raidid, d);
1324 		*(int *) data = d;
1325 		return (retcode);
1326 
1327 	case RAIDFRAME_SET_ROOT:
1328 		d = rf_set_rootpartition(raidPtr, *(int *) data);
1329 		printf("raid%d: New rootpartition value is: %d\n",
1330 		       raidPtr->raidid, d);
1331 		*(int *) data = d;
1332 		return (retcode);
1333 
1334 		/* initialize all parity */
1335 	case RAIDFRAME_REWRITEPARITY:
1336 
1337 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1338 			/* Parity for RAID 0 is trivially correct */
1339 			raidPtr->parity_good = RF_RAID_CLEAN;
1340 			return(0);
1341 		}
1342 
1343 		if (raidPtr->parity_rewrite_in_progress == 1) {
1344 			/* Re-write is already in progress! */
1345 			return(EINVAL);
1346 		}
1347 
1348 		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
1349 					   rf_RewriteParityThread,
1350 					   raidPtr,"raid_parity");
1351 		return (retcode);
1352 
1353 
1354 	case RAIDFRAME_ADD_HOT_SPARE:
1355 		sparePtr = (RF_SingleComponent_t *) data;
1356 		memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t));
1357 		retcode = rf_add_hot_spare(raidPtr, &component);
1358 		return(retcode);
1359 
1360 	case RAIDFRAME_REMOVE_HOT_SPARE:
1361 		return(retcode);
1362 
1363 	case RAIDFRAME_DELETE_COMPONENT:
1364 		componentPtr = (RF_SingleComponent_t *)data;
1365 		memcpy( &component, componentPtr,
1366 			sizeof(RF_SingleComponent_t));
1367 		retcode = rf_delete_component(raidPtr, &component);
1368 		return(retcode);
1369 
1370 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
1371 		componentPtr = (RF_SingleComponent_t *)data;
1372 		memcpy( &component, componentPtr,
1373 			sizeof(RF_SingleComponent_t));
1374 		retcode = rf_incorporate_hot_spare(raidPtr, &component);
1375 		return(retcode);
1376 
1377 	case RAIDFRAME_REBUILD_IN_PLACE:
1378 
1379 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1380 			/* Can't do this on a RAID 0!! */
1381 			return(EINVAL);
1382 		}
1383 
1384 		if (raidPtr->recon_in_progress == 1) {
1385 			/* a reconstruct is already in progress! */
1386 			return(EINVAL);
1387 		}
1388 
1389 		componentPtr = (RF_SingleComponent_t *) data;
1390 		memcpy( &component, componentPtr,
1391 			sizeof(RF_SingleComponent_t));
1392 		component.row = 0; /* we don't support any more */
1393 		column = component.column;
1394 
1395 		if ((column < 0) || (column >= raidPtr->numCol)) {
1396 			return(EINVAL);
1397 		}
1398 
1399 		rf_lock_mutex2(raidPtr->mutex);
1400 		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
1401 		    (raidPtr->numFailures > 0)) {
1402 			/* XXX 0 above shouldn't be constant!!! */
1403 			/* some component other than this has failed.
1404 			   Let's not make things worse than they already
1405 			   are... */
1406 			printf("raid%d: Unable to reconstruct to disk at:\n",
1407 			       raidPtr->raidid);
1408 			printf("raid%d:     Col: %d   Too many failures.\n",
1409 			       raidPtr->raidid, column);
1410 			rf_unlock_mutex2(raidPtr->mutex);
1411 			return (EINVAL);
1412 		}
1413 		if (raidPtr->Disks[column].status ==
1414 		    rf_ds_reconstructing) {
1415 			printf("raid%d: Unable to reconstruct to disk at:\n",
1416 			       raidPtr->raidid);
1417 			printf("raid%d:    Col: %d   Reconstruction already occuring!\n", raidPtr->raidid, column);
1418 
1419 			rf_unlock_mutex2(raidPtr->mutex);
1420 			return (EINVAL);
1421 		}
1422 		if (raidPtr->Disks[column].status == rf_ds_spared) {
1423 			rf_unlock_mutex2(raidPtr->mutex);
1424 			return (EINVAL);
1425 		}
1426 		rf_unlock_mutex2(raidPtr->mutex);
1427 
1428 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1429 		if (rrcopy == NULL)
1430 			return(ENOMEM);
1431 
1432 		rrcopy->raidPtr = (void *) raidPtr;
1433 		rrcopy->col = column;
1434 
1435 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1436 					   rf_ReconstructInPlaceThread,
1437 					   rrcopy,"raid_reconip");
1438 		return(retcode);
1439 
1440 	case RAIDFRAME_GET_INFO:
1441 		if (!raidPtr->valid)
1442 			return (ENODEV);
1443 		ucfgp = (RF_DeviceConfig_t **) data;
1444 		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
1445 			  (RF_DeviceConfig_t *));
1446 		if (d_cfg == NULL)
1447 			return (ENOMEM);
1448 		d_cfg->rows = 1; /* there is only 1 row now */
1449 		d_cfg->cols = raidPtr->numCol;
1450 		d_cfg->ndevs = raidPtr->numCol;
1451 		if (d_cfg->ndevs >= RF_MAX_DISKS) {
1452 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1453 			return (ENOMEM);
1454 		}
1455 		d_cfg->nspares = raidPtr->numSpare;
1456 		if (d_cfg->nspares >= RF_MAX_DISKS) {
1457 			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1458 			return (ENOMEM);
1459 		}
1460 		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
1461 		d = 0;
1462 		for (j = 0; j < d_cfg->cols; j++) {
1463 			d_cfg->devs[d] = raidPtr->Disks[j];
1464 			d++;
1465 		}
1466 		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
1467 			d_cfg->spares[i] = raidPtr->Disks[j];
1468 		}
1469 		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
1470 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1471 
1472 		return (retcode);
1473 
1474 	case RAIDFRAME_CHECK_PARITY:
1475 		*(int *) data = raidPtr->parity_good;
1476 		return (0);
1477 
1478 	case RAIDFRAME_PARITYMAP_STATUS:
1479 		if (rf_paritymap_ineligible(raidPtr))
1480 			return EINVAL;
1481 		rf_paritymap_status(raidPtr->parity_map,
1482 		    (struct rf_pmstat *)data);
1483 		return 0;
1484 
1485 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
1486 		if (rf_paritymap_ineligible(raidPtr))
1487 			return EINVAL;
1488 		if (raidPtr->parity_map == NULL)
1489 			return ENOENT; /* ??? */
1490 		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
1491 			(struct rf_pmparams *)data, 1))
1492 			return EINVAL;
1493 		return 0;
1494 
1495 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
1496 		if (rf_paritymap_ineligible(raidPtr))
1497 			return EINVAL;
1498 		*(int *) data = rf_paritymap_get_disable(raidPtr);
1499 		return 0;
1500 
1501 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
1502 		if (rf_paritymap_ineligible(raidPtr))
1503 			return EINVAL;
1504 		rf_paritymap_set_disable(raidPtr, *(int *)data);
1505 		/* XXX should errors be passed up? */
1506 		return 0;
1507 
1508 	case RAIDFRAME_RESET_ACCTOTALS:
1509 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
1510 		return (0);
1511 
1512 	case RAIDFRAME_GET_ACCTOTALS:
1513 		totals = (RF_AccTotals_t *) data;
1514 		*totals = raidPtr->acc_totals;
1515 		return (0);
1516 
1517 	case RAIDFRAME_KEEP_ACCTOTALS:
1518 		raidPtr->keep_acc_totals = *(int *)data;
1519 		return (0);
1520 
1521 	case RAIDFRAME_GET_SIZE:
1522 		*(int *) data = raidPtr->totalSectors;
1523 		return (0);
1524 
1525 		/* fail a disk & optionally start reconstruction */
1526 	case RAIDFRAME_FAIL_DISK:
1527 
1528 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1529 			/* Can't do this on a RAID 0!! */
1530 			return(EINVAL);
1531 		}
1532 
1533 		rr = (struct rf_recon_req *) data;
1534 		rr->row = 0;
1535 		if (rr->col < 0 || rr->col >= raidPtr->numCol)
1536 			return (EINVAL);
1537 
1538 
1539 		rf_lock_mutex2(raidPtr->mutex);
1540 		if (raidPtr->status == rf_rs_reconstructing) {
1541 			/* you can't fail a disk while we're reconstructing! */
1542 			/* XXX wrong for RAID6 */
1543 			rf_unlock_mutex2(raidPtr->mutex);
1544 			return (EINVAL);
1545 		}
1546 		if ((raidPtr->Disks[rr->col].status ==
1547 		     rf_ds_optimal) && (raidPtr->numFailures > 0)) {
1548 			/* some other component has failed.  Let's not make
1549 			   things worse. XXX wrong for RAID6 */
1550 			rf_unlock_mutex2(raidPtr->mutex);
1551 			return (EINVAL);
1552 		}
1553 		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
1554 			/* Can't fail a spared disk! */
1555 			rf_unlock_mutex2(raidPtr->mutex);
1556 			return (EINVAL);
1557 		}
1558 		rf_unlock_mutex2(raidPtr->mutex);
1559 
1560 		/* make a copy of the recon request so that we don't rely on
1561 		 * the user's buffer */
1562 		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
1563 		if (rrcopy == NULL)
1564 			return(ENOMEM);
1565 		memcpy(rrcopy, rr, sizeof(*rr));
1566 		rrcopy->raidPtr = (void *) raidPtr;
1567 
1568 		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
1569 					   rf_ReconThread,
1570 					   rrcopy,"raid_recon");
1571 		return (0);
1572 
1573 		/* invoke a copyback operation after recon on whatever disk
1574 		 * needs it, if any */
1575 	case RAIDFRAME_COPYBACK:
1576 
1577 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1578 			/* This makes no sense on a RAID 0!! */
1579 			return(EINVAL);
1580 		}
1581 
1582 		if (raidPtr->copyback_in_progress == 1) {
1583 			/* Copyback is already in progress! */
1584 			return(EINVAL);
1585 		}
1586 
1587 		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
1588 					   rf_CopybackThread,
1589 					   raidPtr,"raid_copyback");
1590 		return (retcode);
1591 
1592 		/* return the percentage completion of reconstruction */
1593 	case RAIDFRAME_CHECK_RECON_STATUS:
1594 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1595 			/* This makes no sense on a RAID 0, so tell the
1596 			   user it's done. */
1597 			*(int *) data = 100;
1598 			return(0);
1599 		}
1600 		if (raidPtr->status != rf_rs_reconstructing)
1601 			*(int *) data = 100;
1602 		else {
1603 			if (raidPtr->reconControl->numRUsTotal > 0) {
1604 				*(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal);
1605 			} else {
1606 				*(int *) data = 0;
1607 			}
1608 		}
1609 		return (0);
1610 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1611 		progressInfoPtr = (RF_ProgressInfo_t **) data;
1612 		if (raidPtr->status != rf_rs_reconstructing) {
1613 			progressInfo.remaining = 0;
1614 			progressInfo.completed = 100;
1615 			progressInfo.total = 100;
1616 		} else {
1617 			progressInfo.total =
1618 				raidPtr->reconControl->numRUsTotal;
1619 			progressInfo.completed =
1620 				raidPtr->reconControl->numRUsComplete;
1621 			progressInfo.remaining = progressInfo.total -
1622 				progressInfo.completed;
1623 		}
1624 		retcode = copyout(&progressInfo, *progressInfoPtr,
1625 				  sizeof(RF_ProgressInfo_t));
1626 		return (retcode);
1627 
1628 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1629 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1630 			/* This makes no sense on a RAID 0, so tell the
1631 			   user it's done. */
1632 			*(int *) data = 100;
1633 			return(0);
1634 		}
1635 		if (raidPtr->parity_rewrite_in_progress == 1) {
1636 			*(int *) data = 100 *
1637 				raidPtr->parity_rewrite_stripes_done /
1638 				raidPtr->Layout.numStripe;
1639 		} else {
1640 			*(int *) data = 100;
1641 		}
1642 		return (0);
1643 
1644 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1645 		progressInfoPtr = (RF_ProgressInfo_t **) data;
1646 		if (raidPtr->parity_rewrite_in_progress == 1) {
1647 			progressInfo.total = raidPtr->Layout.numStripe;
1648 			progressInfo.completed =
1649 				raidPtr->parity_rewrite_stripes_done;
1650 			progressInfo.remaining = progressInfo.total -
1651 				progressInfo.completed;
1652 		} else {
1653 			progressInfo.remaining = 0;
1654 			progressInfo.completed = 100;
1655 			progressInfo.total = 100;
1656 		}
1657 		retcode = copyout(&progressInfo, *progressInfoPtr,
1658 				  sizeof(RF_ProgressInfo_t));
1659 		return (retcode);
1660 
1661 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
1662 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1663 			/* This makes no sense on a RAID 0 */
1664 			*(int *) data = 100;
1665 			return(0);
1666 		}
1667 		if (raidPtr->copyback_in_progress == 1) {
1668 			*(int *) data = 100 * raidPtr->copyback_stripes_done /
1669 				raidPtr->Layout.numStripe;
1670 		} else {
1671 			*(int *) data = 100;
1672 		}
1673 		return (0);
1674 
1675 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1676 		progressInfoPtr = (RF_ProgressInfo_t **) data;
1677 		if (raidPtr->copyback_in_progress == 1) {
1678 			progressInfo.total = raidPtr->Layout.numStripe;
1679 			progressInfo.completed =
1680 				raidPtr->copyback_stripes_done;
1681 			progressInfo.remaining = progressInfo.total -
1682 				progressInfo.completed;
1683 		} else {
1684 			progressInfo.remaining = 0;
1685 			progressInfo.completed = 100;
1686 			progressInfo.total = 100;
1687 		}
1688 		retcode = copyout(&progressInfo, *progressInfoPtr,
1689 				  sizeof(RF_ProgressInfo_t));
1690 		return (retcode);
1691 
1692 		/* the sparetable daemon calls this to wait for the kernel to
1693 		 * need a spare table. this ioctl does not return until a
1694 		 * spare table is needed. XXX -- calling mpsleep here in the
1695 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
1696 		 * -- I should either compute the spare table in the kernel,
1697 		 * or have a different -- XXX XXX -- interface (a different
1698 		 * character device) for delivering the table     -- XXX */
1699 #if 0
1700 	case RAIDFRAME_SPARET_WAIT:
1701 		rf_lock_mutex2(rf_sparet_wait_mutex);
1702 		while (!rf_sparet_wait_queue)
1703 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
1704 		waitreq = rf_sparet_wait_queue;
1705 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
1706 		rf_unlock_mutex2(rf_sparet_wait_mutex);
1707 
1708 		/* structure assignment */
1709 		*((RF_SparetWait_t *) data) = *waitreq;
1710 
1711 		RF_Free(waitreq, sizeof(*waitreq));
1712 		return (0);
1713 
1714 		/* wakes up a process waiting on SPARET_WAIT and puts an error
1715 		 * code in it that will cause the dameon to exit */
1716 	case RAIDFRAME_ABORT_SPARET_WAIT:
1717 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1718 		waitreq->fcol = -1;
1719 		rf_lock_mutex2(rf_sparet_wait_mutex);
1720 		waitreq->next = rf_sparet_wait_queue;
1721 		rf_sparet_wait_queue = waitreq;
1722 		rf_broadcast_conf2(rf_sparet_wait_cv);
1723 		rf_unlock_mutex2(rf_sparet_wait_mutex);
1724 		return (0);
1725 
1726 		/* used by the spare table daemon to deliver a spare table
1727 		 * into the kernel */
1728 	case RAIDFRAME_SEND_SPARET:
1729 
1730 		/* install the spare table */
1731 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
1732 
1733 		/* respond to the requestor.  the return status of the spare
1734 		 * table installation is passed in the "fcol" field */
1735 		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
1736 		waitreq->fcol = retcode;
1737 		rf_lock_mutex2(rf_sparet_wait_mutex);
1738 		waitreq->next = rf_sparet_resp_queue;
1739 		rf_sparet_resp_queue = waitreq;
1740 		rf_broadcast_cond2(rf_sparet_resp_cv);
1741 		rf_unlock_mutex2(rf_sparet_wait_mutex);
1742 
1743 		return (retcode);
1744 #endif
1745 
1746 	default:
1747 		break; /* fall through to the os-specific code below */
1748 
1749 	}
1750 
1751 	if (!raidPtr->valid)
1752 		return (EINVAL);
1753 
1754 	/*
1755 	 * Add support for "regular" device ioctls here.
1756 	 */
1757 
1758 	error = disk_ioctl(&rs->sc_dkdev, cmd, data, flag, l);
1759 	if (error != EPASSTHROUGH)
1760 		return (error);
1761 
1762 	switch (cmd) {
1763 	case DIOCGDINFO:
1764 		*(struct disklabel *) data = *(rs->sc_dkdev.dk_label);
1765 		break;
1766 #ifdef __HAVE_OLD_DISKLABEL
1767 	case ODIOCGDINFO:
1768 		newlabel = *(rs->sc_dkdev.dk_label);
1769 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1770 			return ENOTTY;
1771 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
1772 		break;
1773 #endif
1774 
1775 	case DIOCGPART:
1776 		((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label;
1777 		((struct partinfo *) data)->part =
1778 		    &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
1779 		break;
1780 
1781 	case DIOCWDINFO:
1782 	case DIOCSDINFO:
1783 #ifdef __HAVE_OLD_DISKLABEL
1784 	case ODIOCWDINFO:
1785 	case ODIOCSDINFO:
1786 #endif
1787 	{
1788 		struct disklabel *lp;
1789 #ifdef __HAVE_OLD_DISKLABEL
1790 		if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
1791 			memset(&newlabel, 0, sizeof newlabel);
1792 			memcpy(&newlabel, data, sizeof (struct olddisklabel));
1793 			lp = &newlabel;
1794 		} else
1795 #endif
1796 		lp = (struct disklabel *)data;
1797 
1798 		if ((error = raidlock(rs)) != 0)
1799 			return (error);
1800 
1801 		rs->sc_flags |= RAIDF_LABELLING;
1802 
1803 		error = setdisklabel(rs->sc_dkdev.dk_label,
1804 		    lp, 0, rs->sc_dkdev.dk_cpulabel);
1805 		if (error == 0) {
1806 			if (cmd == DIOCWDINFO
1807 #ifdef __HAVE_OLD_DISKLABEL
1808 			    || cmd == ODIOCWDINFO
1809 #endif
1810 			   )
1811 				error = writedisklabel(RAIDLABELDEV(dev),
1812 				    raidstrategy, rs->sc_dkdev.dk_label,
1813 				    rs->sc_dkdev.dk_cpulabel);
1814 		}
1815 		rs->sc_flags &= ~RAIDF_LABELLING;
1816 
1817 		raidunlock(rs);
1818 
1819 		if (error)
1820 			return (error);
1821 		break;
1822 	}
1823 
1824 	case DIOCWLABEL:
1825 		if (*(int *) data != 0)
1826 			rs->sc_flags |= RAIDF_WLABEL;
1827 		else
1828 			rs->sc_flags &= ~RAIDF_WLABEL;
1829 		break;
1830 
1831 	case DIOCGDEFLABEL:
1832 		raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
1833 		break;
1834 
1835 #ifdef __HAVE_OLD_DISKLABEL
1836 	case ODIOCGDEFLABEL:
1837 		raidgetdefaultlabel(raidPtr, rs, &newlabel);
1838 		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
1839 			return ENOTTY;
1840 		memcpy(data, &newlabel, sizeof (struct olddisklabel));
1841 		break;
1842 #endif
1843 
1844 	case DIOCAWEDGE:
1845 	case DIOCDWEDGE:
1846 	    	dkw = (void *)data;
1847 
1848 		/* If the ioctl happens here, the parent is us. */
1849 		(void)strcpy(dkw->dkw_parent, rs->sc_xname);
1850 		return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw);
1851 
1852 	case DIOCLWEDGES:
1853 		return dkwedge_list(&rs->sc_dkdev,
1854 		    (struct dkwedge_list *)data, l);
1855 	case DIOCCACHESYNC:
1856 		return rf_sync_component_caches(raidPtr);
1857 	default:
1858 		retcode = ENOTTY;
1859 	}
1860 	return (retcode);
1861 
1862 }
1863 
1864 
1865 /* raidinit -- complete the rest of the initialization for the
1866    RAIDframe device.  */
1867 
1868 
1869 static void
1870 raidinit(RF_Raid_t *raidPtr)
1871 {
1872 	cfdata_t cf;
1873 	struct raid_softc *rs;
1874 	int     unit;
1875 
1876 	unit = raidPtr->raidid;
1877 
1878 	rs = &raid_softc[unit];
1879 
1880 	/* XXX should check return code first... */
1881 	rs->sc_flags |= RAIDF_INITED;
1882 
1883 	/* XXX doesn't check bounds. */
1884 	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);
1885 
1886 	/* attach the pseudo device */
1887 	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
1888 	cf->cf_name = raid_cd.cd_name;
1889 	cf->cf_atname = raid_cd.cd_name;
1890 	cf->cf_unit = unit;
1891 	cf->cf_fstate = FSTATE_STAR;
1892 
1893 	rs->sc_dev = config_attach_pseudo(cf);
1894 
1895 	if (rs->sc_dev == NULL) {
1896 		printf("raid%d: config_attach_pseudo failed\n",
1897 		    raidPtr->raidid);
1898 		rs->sc_flags &= ~RAIDF_INITED;
1899 		free(cf, M_RAIDFRAME);
1900 		return;
1901 	}
1902 
1903 	/* disk_attach actually creates space for the CPU disklabel, among
1904 	 * other things, so it's critical to call this *BEFORE* we try putzing
1905 	 * with disklabels. */
1906 
1907 	disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
1908 	disk_attach(&rs->sc_dkdev);
1909 	disk_blocksize(&rs->sc_dkdev, raidPtr->bytesPerSector);
1910 
1911 	/* XXX There may be a weird interaction here between this, and
1912 	 * protectedSectors, as used in RAIDframe.  */
1913 
1914 	rs->sc_size = raidPtr->totalSectors;
1915 
1916 	dkwedge_discover(&rs->sc_dkdev);
1917 
1918 	rf_set_properties(rs, raidPtr);
1919 
1920 }
1921 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
1922 /* wake up the daemon & tell it to get us a spare table
1923  * XXX
1924  * the entries in the queues should be tagged with the raidPtr
1925  * so that in the extremely rare case that two recons happen at once,
1926  * we know for which device were requesting a spare table
1927  * XXX
1928  *
1929  * XXX This code is not currently used. GO
1930  */
/*
 * Hand a spare-table request to the userland daemon and block until a
 * response arrives.  The request is pushed onto rf_sparet_wait_queue
 * and the daemon is woken; we then sleep on rf_sparet_resp_cv until a
 * response is queued.  Returns the "fcol" status from the response.
 *
 * NOTE(review): the response popped from rf_sparet_resp_queue is not
 * matched against the request we submitted; as the header comment
 * says, concurrent recons would need tagging -- this code is unused.
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* pop the response; note 'req' is reused for the response entry */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
1954 #endif
1955 
1956 /* a wrapper around rf_DoAccess that extracts appropriate info from the
1957  * bp & passes it down.
1958  * any calls originating in the kernel must use non-blocking I/O
1959  * do some extra sanity checking to return "appropriate" error values for
1960  * certain conditions (to make some standard utilities work)
1961  *
1962  * Formerly known as: rf_DoAccessKernel
1963  */
/*
 * Drain the unit's buffer queue, dispatching each buf to RAIDframe via
 * rf_DoAccess() as non-blocking async I/O.  Dispatch is throttled by
 * raidPtr->openings; the loop stops when openings are exhausted or the
 * queue is empty.  Called with raidPtr->mutex NOT held; the mutex is
 * taken and dropped around each queue/dispatch step.
 */
void
raidstart(RF_Raid_t *raidPtr)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	struct partition *pp;
	daddr_t blocknum;
	int     unit;
	struct raid_softc *rs;
	int     do_async;
	struct buf *bp;
	int rc;

	unit = raidPtr->raidid;
	rs = &raid_softc[unit];

	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		/* label update must run unlocked; retake mutex after */
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
					   RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}

	/* Check to see if we're at the limit... */
	/* loop invariant: mutex held at the top of each iteration */
	while (raidPtr->openings > 0) {
		rf_unlock_mutex2(raidPtr->mutex);

		/* get the next item, if any, from the queue */
		if ((bp = bufq_get(rs->buf_queue)) == NULL) {
			/* nothing more to do */
			return;
		}

		/* Ok, for the bp we have here, bp->b_blkno is relative to the
		 * partition.. Need to make it absolute to the underlying
		 * device.. */

		/* convert from DEV_BSIZE units to RAID sector units */
		blocknum = bp->b_blkno << DEV_BSHIFT >> raidPtr->logBytesPerSector;
		if (DISKPART(bp->b_dev) != RAW_PART) {
			pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
			blocknum += pp->p_offset;
		}

		db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
			    (int) blocknum));

		db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
		db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

		/* *THIS* is where we adjust what block we're going to...
		 * but DO NOT TOUCH bp->b_blkno!!! */
		raid_addr = blocknum;

		num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
		pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
		sum = raid_addr + num_blocks + pb;
		if (1 || rf_debugKernelAccess) {
			db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
				    (int) raid_addr, (int) sum, (int) num_blocks,
				    (int) pb, (int) bp->b_resid));
		}
		/* bounds check; the (sum < ...) comparisons catch overflow */
		if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
		    || (sum < num_blocks) || (sum < pb)) {
			bp->b_error = ENOSPC;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			rf_lock_mutex2(raidPtr->mutex);
			continue;
		}
		/*
		 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
		 */

		/* reject transfers that are not sector-aligned in length */
		if (bp->b_bcount & raidPtr->sectorMask) {
			bp->b_error = EINVAL;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			rf_lock_mutex2(raidPtr->mutex);
			continue;

		}
		db1_printf(("Calling DoAccess..\n"));


		/* consume one opening for this dispatch */
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->openings--;
		rf_unlock_mutex2(raidPtr->mutex);

		/*
		 * Everything is async.
		 */
		do_async = 1;

		disk_busy(&rs->sc_dkdev);

		/* XXX we're still at splbio() here... do we *really*
		   need to be? */

		/* don't ever condition on bp->b_flags & B_WRITE.
		 * always condition on B_READ instead */

		rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
				 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
				 do_async, raid_addr, num_blocks,
				 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

		if (rc) {
			/* immediate failure: complete the buf with the error */
			bp->b_error = rc;
			bp->b_resid = bp->b_bcount;
			biodone(bp);
			/* continue loop */
		}

		rf_lock_mutex2(raidPtr->mutex);
	}
	rf_unlock_mutex2(raidPtr->mutex);
}
2084 
2085 
2086 
2087 
2088 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
2089 
/*
 * Dispatch one low-level component I/O described by `req' on `queue'.
 * Called with the queue mutex held; for real I/O the mutex is dropped
 * around bdev_strategy() and retaken.  Always returns 0.
 */
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* complete the NOP immediately via the normal I/O callback */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* set up the buf for the component device; KernelWakeupFunc
		   runs as the b_iodone callback when the I/O finishes */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return (0);
}
2163 /* this is the callback function associated with a I/O invoked from
2164    kernel code.
2165  */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* the request was stashed in b_private by the dispatch path */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	/* iodone_lock protects the iodone list and the failure counters
	   updated below */
	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error.  Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
2231 
2232 
2233 /*
2234  * initialize a buf structure for doing an I/O in the kernel.
2235  */
2236 static void
2237 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2238        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2239        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
2240        struct proc *b_proc)
2241 {
2242 	/* bp->b_flags       = B_PHYS | rw_flag; */
2243 	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
2244 	bp->b_oflags = 0;
2245 	bp->b_cflags = 0;
2246 	bp->b_bcount = numSect << logBytesPerSector;
2247 	bp->b_bufsize = bp->b_bcount;
2248 	bp->b_error = 0;
2249 	bp->b_dev = dev;
2250 	bp->b_data = bf;
2251 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
2252 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
2253 	if (bp->b_bcount == 0) {
2254 		panic("bp->b_bcount is zero in InitBP!!");
2255 	}
2256 	bp->b_proc = b_proc;
2257 	bp->b_iodone = cbFunc;
2258 	bp->b_private = cbArg;
2259 }
2260 
2261 static void
2262 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
2263 		    struct disklabel *lp)
2264 {
2265 	memset(lp, 0, sizeof(*lp));
2266 
2267 	/* fabricate a label... */
2268 	lp->d_secperunit = raidPtr->totalSectors;
2269 	lp->d_secsize = raidPtr->bytesPerSector;
2270 	lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
2271 	lp->d_ntracks = 4 * raidPtr->numCol;
2272 	lp->d_ncylinders = raidPtr->totalSectors /
2273 		(lp->d_nsectors * lp->d_ntracks);
2274 	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
2275 
2276 	strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
2277 	lp->d_type = DTYPE_RAID;
2278 	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
2279 	lp->d_rpm = 3600;
2280 	lp->d_interleave = 1;
2281 	lp->d_flags = 0;
2282 
2283 	lp->d_partitions[RAW_PART].p_offset = 0;
2284 	lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
2285 	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
2286 	lp->d_npartitions = RAW_PART + 1;
2287 
2288 	lp->d_magic = DISKMAGIC;
2289 	lp->d_magic2 = DISKMAGIC;
2290 	lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
2291 
2292 }
2293 /*
2294  * Read the disklabel from the raid device.  If one is not present, fake one
2295  * up.
2296  */
static void
raidgetdisklabel(dev_t dev)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs = &raid_softc[unit];
	const char   *errstring;
	struct disklabel *lp = rs->sc_dkdev.dk_label;
	struct cpu_disklabel *clp = rs->sc_dkdev.dk_cpulabel;
	RF_Raid_t *raidPtr;

	db1_printf(("Getting the disklabel...\n"));

	memset(clp, 0, sizeof(*clp));

	raidPtr = raidPtrs[unit];

	/* start from a fabricated default label... */
	raidgetdefaultlabel(raidPtr, rs, lp);

	/*
	 * Call the generic disklabel extraction routine.
	 */
	/* ...then try to overwrite it with one read from disk */
	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
	if (errstring)
		raidmakedisklabel(rs);
	else {
		int     i;
		struct partition *pp;

		/*
		 * Sanity check whether the found disklabel is valid.
		 *
		 * This is necessary since total size of the raid device
		 * may vary when an interleave is changed even though exactly
		 * same components are used, and old disklabel may used
		 * if that is found.
		 */
		if (lp->d_secperunit != rs->sc_size)
			printf("raid%d: WARNING: %s: "
			    "total sector size in disklabel (%" PRIu32 ") != "
			    "the size of raid (%" PRIu64 ")\n", unit, rs->sc_xname,
			    lp->d_secperunit, rs->sc_size);
		for (i = 0; i < lp->d_npartitions; i++) {
			pp = &lp->d_partitions[i];
			if (pp->p_offset + pp->p_size > rs->sc_size)
				printf("raid%d: WARNING: %s: end of partition `%c' "
				       "exceeds the size of raid (%" PRIu64 ")\n",
				       unit, rs->sc_xname, 'a' + i, rs->sc_size);
		}
	}

}
2349 /*
2350  * Take care of things one might want to take care of in the event
2351  * that a disklabel isn't present.
2352  */
2353 static void
2354 raidmakedisklabel(struct raid_softc *rs)
2355 {
2356 	struct disklabel *lp = rs->sc_dkdev.dk_label;
2357 	db1_printf(("Making a label..\n"));
2358 
2359 	/*
2360 	 * For historical reasons, if there's no disklabel present
2361 	 * the raw partition must be marked FS_BSDFFS.
2362 	 */
2363 
2364 	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
2365 
2366 	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
2367 
2368 	lp->d_checksum = dkcksum(lp);
2369 }
2370 /*
2371  * Wait interruptibly for an exclusive lock.
2372  *
2373  * XXX
2374  * Several drivers do this; it should be abstracted and made MP-safe.
2375  * (Hmm... where have we seen this warning before :->  GO )
2376  */
2377 static int
2378 raidlock(struct raid_softc *rs)
2379 {
2380 	int     error;
2381 
2382 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2383 		rs->sc_flags |= RAIDF_WANTED;
2384 		if ((error =
2385 			tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
2386 			return (error);
2387 	}
2388 	rs->sc_flags |= RAIDF_LOCKED;
2389 	return (0);
2390 }
2391 /*
2392  * Unlock and wake up any waiters.
2393  */
2394 static void
2395 raidunlock(struct raid_softc *rs)
2396 {
2397 
2398 	rs->sc_flags &= ~RAIDF_LOCKED;
2399 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2400 		rs->sc_flags &= ~RAIDF_WANTED;
2401 		wakeup(rs);
2402 	}
2403 }
2404 
2405 
2406 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
2407 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
2408 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
2409 
2410 static daddr_t
2411 rf_component_info_offset(void)
2412 {
2413 
2414 	return RF_COMPONENT_INFO_OFFSET;
2415 }
2416 
2417 static daddr_t
2418 rf_component_info_size(unsigned secsize)
2419 {
2420 	daddr_t info_size;
2421 
2422 	KASSERT(secsize);
2423 	if (secsize > RF_COMPONENT_INFO_SIZE)
2424 		info_size = secsize;
2425 	else
2426 		info_size = RF_COMPONENT_INFO_SIZE;
2427 
2428 	return info_size;
2429 }
2430 
2431 static daddr_t
2432 rf_parity_map_offset(RF_Raid_t *raidPtr)
2433 {
2434 	daddr_t map_offset;
2435 
2436 	KASSERT(raidPtr->bytesPerSector);
2437 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
2438 		map_offset = raidPtr->bytesPerSector;
2439 	else
2440 		map_offset = RF_COMPONENT_INFO_SIZE;
2441 	map_offset += rf_component_info_offset();
2442 
2443 	return map_offset;
2444 }
2445 
2446 static daddr_t
2447 rf_parity_map_size(RF_Raid_t *raidPtr)
2448 {
2449 	daddr_t map_size;
2450 
2451 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
2452 		map_size = raidPtr->bytesPerSector;
2453 	else
2454 		map_size = RF_PARITY_MAP_SIZE;
2455 
2456 	return map_size;
2457 }
2458 
/*
 * Mark the in-core component label for column `col' clean and write it
 * out to the component.  Always returns 0.
 */
int
raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *clabel;

	clabel = raidget_component_label(raidPtr, col);
	clabel->clean = RF_RAID_CLEAN;
	raidflush_component_label(raidPtr, col);
	return(0);
}
2469 
2470 
/*
 * Mark the in-core component label for column `col' dirty and write it
 * out to the component.  Always returns 0.
 */
int
raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *clabel;

	clabel = raidget_component_label(raidPtr, col);
	clabel->clean = RF_RAID_DIRTY;
	raidflush_component_label(raidPtr, col);
	return(0);
}
2481 
/*
 * Read the on-disk component label of column `col' into the in-core
 * copy kept in raid_cinfo[col].ci_label.  Returns the result of
 * raidread_component_label() (0 on success).
 */
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);
	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
2491 
/* Return a pointer to the in-core component label for column `col'. */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
2497 
/*
 * Write the in-core component label for column `col' back to the
 * component, first stamping it with the array's current mod counter
 * (and, when parity maps are compiled in, the matching parity map
 * mod count).  Returns the result of raidwrite_component_label().
 */
int
raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *label;

	label = &raidPtr->raid_cinfo[col].ci_label;
	label->mod_counter = raidPtr->mod_counter;
#ifndef RF_NO_PARITY_MAP
	label->parity_map_modcount = label->mod_counter;
#endif
	return raidwrite_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp, label);
}
2512 
2513 
/*
 * Read the component label from device `dev' (via vnode `b_vp') into
 * *clabel.  The info area read is at least one sector (`secsize')
 * long; see rf_component_info_size().
 */
static int
raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidread_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize));
}
2523 
2524 /* ARGSUSED */
2525 static int
2526 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
2527     size_t msize, daddr_t offset, daddr_t dsize)
2528 {
2529 	struct buf *bp;
2530 	const struct bdevsw *bdev;
2531 	int error;
2532 
2533 	/* XXX should probably ensure that we don't try to do this if
2534 	   someone has changed rf_protected_sectors. */
2535 
2536 	if (b_vp == NULL) {
2537 		/* For whatever reason, this component is not valid.
2538 		   Don't try to read a component label from it. */
2539 		return(EINVAL);
2540 	}
2541 
2542 	/* get a block of the appropriate size... */
2543 	bp = geteblk((int)dsize);
2544 	bp->b_dev = dev;
2545 
2546 	/* get our ducks in a row for the read */
2547 	bp->b_blkno = offset / DEV_BSIZE;
2548 	bp->b_bcount = dsize;
2549 	bp->b_flags |= B_READ;
2550  	bp->b_resid = dsize;
2551 
2552 	bdev = bdevsw_lookup(bp->b_dev);
2553 	if (bdev == NULL)
2554 		return (ENXIO);
2555 	(*bdev->d_strategy)(bp);
2556 
2557 	error = biowait(bp);
2558 
2559 	if (!error) {
2560 		memcpy(data, bp->b_data, msize);
2561 	}
2562 
2563 	brelse(bp, 0);
2564 	return(error);
2565 }
2566 
2567 
/*
 * Write *clabel to the component info area of device `dev'.  The
 * write is synchronous (asyncp == 0); the area written is at least
 * one sector (`secsize') long.
 */
static int
raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidwrite_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize), 0);
}
2577 
2578 /* ARGSUSED */
2579 static int
2580 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
2581     size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
2582 {
2583 	struct buf *bp;
2584 	const struct bdevsw *bdev;
2585 	int error;
2586 
2587 	/* get a block of the appropriate size... */
2588 	bp = geteblk((int)dsize);
2589 	bp->b_dev = dev;
2590 
2591 	/* get our ducks in a row for the write */
2592 	bp->b_blkno = offset / DEV_BSIZE;
2593 	bp->b_bcount = dsize;
2594 	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
2595  	bp->b_resid = dsize;
2596 
2597 	memset(bp->b_data, 0, dsize);
2598 	memcpy(bp->b_data, data, msize);
2599 
2600 	bdev = bdevsw_lookup(bp->b_dev);
2601 	if (bdev == NULL)
2602 		return (ENXIO);
2603 	(*bdev->d_strategy)(bp);
2604 	if (asyncp)
2605 		return 0;
2606 	error = biowait(bp);
2607 	brelse(bp, 0);
2608 	if (error) {
2609 #if 1
2610 		printf("Failed to write RAID component info!\n");
2611 #endif
2612 	}
2613 
2614 	return(error);
2615 }
2616 
/*
 * Write the on-disk parity map `map' to every live component of the
 * array (dead disks are skipped).  Writes are synchronous.
 */
void
rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
{
	int c;

	for (c = 0; c < raidPtr->numCol; c++) {
		/* Skip dead disks. */
		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
			continue;
		/* XXXjld: what if an error occurs here? */
		raidwrite_component_area(raidPtr->Disks[c].dev,
		    raidPtr->raid_cinfo[c].ci_vp, map,
		    RF_PARITYMAP_NBYTE,
		    rf_parity_map_offset(raidPtr),
		    rf_parity_map_size(raidPtr), 0);
	}
}
2634 
/*
 * Read the parity map from every live component; the first copy read
 * seeds *map, and each subsequent copy is folded in with
 * rf_paritymap_merge().
 */
void
rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
{
	struct rf_paritymap_ondisk tmp;
	int c,first;

	first=1;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* Skip dead disks. */
		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
			continue;
		raidread_component_area(raidPtr->Disks[c].dev,
		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
		    RF_PARITYMAP_NBYTE,
		    rf_parity_map_offset(raidPtr),
		    rf_parity_map_size(raidPtr));
		if (first) {
			memcpy(map, &tmp, sizeof(*map));
			first = 0;
		} else {
			rf_paritymap_merge(map, &tmp);
		}
	}
}
2659 
/*
 * Bump the array's mod counter and mark the component labels of all
 * non-failed components -- and all in-use spares -- dirty.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare is standing in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			/* NOTE(review): if no column lists this spare as its
			   spareCol, scol stays -1 and is written to
			   clabel->column -- confirm that is intended */
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
2719 
2720 
/*
 * Stamp a new mod counter and rewrite the component labels of all
 * optimal components and in-use spares.  When `final' is
 * RF_FINAL_COMPONENT_UPDATE and parity is known good, the clean bit
 * is also set on each label.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find the column this spare is standing in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			/* NOTE(review): if no column lists this spare as its
			   spareCol, scol keeps its previous value (or -1) --
			   confirm that is intended */
			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
2795 
2796 void
2797 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2798 {
2799 
2800 	if (vp != NULL) {
2801 		if (auto_configured == 1) {
2802 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2803 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2804 			vput(vp);
2805 
2806 		} else {
2807 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2808 		}
2809 	}
2810 }
2811 
2812 
2813 void
2814 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2815 {
2816 	int r,c;
2817 	struct vnode *vp;
2818 	int acd;
2819 
2820 
2821 	/* We take this opportunity to close the vnodes like we should.. */
2822 
2823 	for (c = 0; c < raidPtr->numCol; c++) {
2824 		vp = raidPtr->raid_cinfo[c].ci_vp;
2825 		acd = raidPtr->Disks[c].auto_configured;
2826 		rf_close_component(raidPtr, vp, acd);
2827 		raidPtr->raid_cinfo[c].ci_vp = NULL;
2828 		raidPtr->Disks[c].auto_configured = 0;
2829 	}
2830 
2831 	for (r = 0; r < raidPtr->numSpare; r++) {
2832 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2833 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2834 		rf_close_component(raidPtr, vp, acd);
2835 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2836 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2837 	}
2838 }
2839 
2840 
/*
 * Kernel thread body: fail the component named in req->col
 * (reconstructing to a spare when RF_FDFLAGS_RECON is set), free the
 * request, and exit.
 */
void
rf_ReconThread(struct rf_recon_req *req)
{
	int     s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	/* the request was allocated by our creator; we free it */
	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2862 
/*
 * Kernel thread body: rewrite all parity for the array.  On success
 * the array is marked parity-clean; waiters blocked on shutdown are
 * woken when we finish either way.
 */
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		wakeup(&raidPtr->parity_rewrite_in_progress);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2893 
2894 
/*
 * Kernel thread body: run rf_CopybackReconstructedData() with
 * copyback_in_progress set, then exit.
 */
void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2909 
2910 
/*
 * Kernel thread body: reconstruct component req->col in place, free
 * the request, and exit.
 */
void
rf_ReconstructInPlaceThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	/* the request was allocated by our creator; we free it */
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2928 
/*
 * Probe one candidate component (the already-opened vnode `vp' on
 * device `dev').  Read its component label; if the label is
 * reasonable and its claimed partition size fits in `size', prepend a
 * new RF_AutoConfig_t to `ac_list'.  Otherwise the vnode is closed
 * and released.  Returns the (possibly updated) list head, or NULL
 * after freeing the whole list if memory ran out.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		    /* allocation failed: tear down everything collected
		       so far and give up on autoconfiguration */
		    while(ac_list) {
			    ac = ac_list;
			    if (ac->clabel)
				    free(ac->clabel, M_RAIDFRAME);
			    ac_list = ac_list->next;
			    free(ac, M_RAIDFRAME);
		    }
		    printf("RAID auto config: out of memory!\n");
		    return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: free the label and drop the vnode we were
		   handed, since nothing will reference them */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
2986 
/*
 * Scan every disk-class device in the system for RAIDframe
 * components: wedges with the RAIDFRAME partition type, disklabel
 * partitions of type FS_RAID, and (failing both) the raw partition.
 * Returns a linked list of RF_AutoConfig_t candidates (possibly
 * NULL).
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/* we begin by trolling through *all* the devices on the system */

	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
	     dv = deviter_next(&di)) {

		/* we are only interested in disks... */
		if (device_class(dv) != DV_DISK)
			continue;

		/* we don't care about floppies... */
		if (device_is_a(dv, "fd")) {
			continue;
		}

		/* we don't care about CD's... */
		if (device_is_a(dv, "cd")) {
			continue;
		}

		/* we don't care about md's... */
		if (device_is_a(dv, "md")) {
			continue;
		}

		/* hdfd is the Atari/Hades floppy driver */
		if (device_is_a(dv, "hdfd")) {
			continue;
		}

		/* fdisa is the Atari/Milan floppy driver */
		if (device_is_a(dv, "fdisa")) {
			continue;
		}

		/* need to find the device_name_to_block_device_major stuff */
		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

		rf_part_found = 0; /*No raid partition as yet*/

		/* get a vnode for the raw partition of this disk */

		wedge = device_is_a(dv, "dk");
		/* NOTE(review): device_unit() yields a unit number which is
		   passed through minor() here -- confirm this matches the
		   minor-numbering convention for these devices */
		bminor = minor(device_unit(dv));
		dev = wedge ? makedev(bmajor, bminor) :
		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
		if (bdevvp(dev, &vp))
			panic("RAID can't alloc vnode");

		error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

		if (error) {
			/* "Who cares."  Continue looking
			   for something that exists*/
			vput(vp);
			continue;
		}

		error = getdisksize(vp, &numsecs, &secsize);
		if (error) {
			vput(vp);
			continue;
		}
		if (wedge) {
			/* wedges carry their type in dkw_ptype rather
			   than in a disklabel partition entry */
			struct dkwedge_info dkw;
			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
			    NOCRED);
			if (error) {
				printf("RAIDframe: can't get wedge info for "
				    "dev %s (%d)\n", device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			ac_list = rf_get_component(ac_list, dev, vp,
			    device_xname(dv), dkw.dkw_size, numsecs, secsize);
			rf_part_found = 1; /*There is a raid component on this disk*/
			continue;
		}

		/* Ok, the disk exists.  Go get the disklabel. */
		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
		if (error) {
			/*
			 * XXX can't happen - open() would
			 * have errored out (or faked up one)
			 */
			if (error != ENOTTY)
				printf("RAIDframe: can't get label for dev "
				    "%s (%d)\n", device_xname(dv), error);
		}

		/* don't need this any more.  We'll allocate it again
		   a little later if we really do... */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);

		if (error)
			continue;

		rf_part_found = 0; /*No raid partitions yet*/
		for (i = 0; i < label.d_npartitions; i++) {
			char cname[sizeof(ac_list->devname)];

			/* We only support partitions marked as RAID */
			if (label.d_partitions[i].p_fstype != FS_RAID)
				continue;

			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + i);
			/* note: the extra indentation of the following
			   statement is misleading; it is still inside this
			   for loop and runs for every FS_RAID partition */
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
		}

		/*
		 *If there is no raid component on this disk, either in a
		 *disklabel or inside a wedge, check the raw partition as well,
		 *as it is possible to configure raid components on raw disk
		 *devices.
		 */

		if (!rf_part_found) {
			char cname[sizeof(ac_list->devname)];

			dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + RAW_PART);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[RAW_PART].p_size, numsecs, secsize);
		}
	}
	deviter_release(&di);
	return ac_list;
}
3168 
3169 
3170 int
3171 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3172 {
3173 
3174 	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
3175 	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
3176 	    ((clabel->clean == RF_RAID_CLEAN) ||
3177 	     (clabel->clean == RF_RAID_DIRTY)) &&
3178 	    clabel->row >=0 &&
3179 	    clabel->column >= 0 &&
3180 	    clabel->num_rows > 0 &&
3181 	    clabel->num_columns > 0 &&
3182 	    clabel->row < clabel->num_rows &&
3183 	    clabel->column < clabel->num_columns &&
3184 	    clabel->blockSize > 0 &&
3185 	    /*
3186 	     * numBlocksHi may contain garbage, but it is ok since
3187 	     * the type is unsigned.  If it is really garbage,
3188 	     * rf_fix_old_label_size() will fix it.
3189 	     */
3190 	    rf_component_label_numblocks(clabel) > 0) {
3191 		/*
3192 		 * label looks reasonable enough...
3193 		 * let's make sure it has no old garbage.
3194 		 */
3195 		if (numsecs)
3196 			rf_fix_old_label_size(clabel, numsecs);
3197 		return(1);
3198 	}
3199 	return(0);
3200 }
3201 
3202 
3203 /*
3204  * For reasons yet unknown, some old component labels have garbage in
3205  * the newer numBlocksHi region, and this causes lossage.  Since those
3206  * disks will also have numsecs set to less than 32 bits of sectors,
3207  * we can determine when this corruption has occured, and fix it.
3208  *
3209  * The exact same problem, with the same unknown reason, happens to
3210  * the partitionSizeHi member as well.
3211  */
3212 static void
3213 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3214 {
3215 
3216 	if (numsecs < ((uint64_t)1 << 32)) {
3217 		if (clabel->numBlocksHi) {
3218 			printf("WARNING: total sectors < 32 bits, yet "
3219 			       "numBlocksHi set\n"
3220 			       "WARNING: resetting numBlocksHi to zero.\n");
3221 			clabel->numBlocksHi = 0;
3222 		}
3223 
3224 		if (clabel->partitionSizeHi) {
3225 			printf("WARNING: total sectors < 32 bits, yet "
3226 			       "partitionSizeHi set\n"
3227 			       "WARNING: resetting partitionSizeHi to zero.\n");
3228 			clabel->partitionSizeHi = 0;
3229 		}
3230 	}
3231 }
3232 
3233 
#ifdef DEBUG
/*
 * Dump the interesting fields of a component label to the console.
 * Debug aid only (compiled in with DEBUG); has no side effects.
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;

	/* Combine numBlocks/numBlocksHi into one 64-bit value. */
	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Contains root partition: %s\n",
	       clabel->root_partition ? "Yes" : "No");
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
3264 
3265 RF_ConfigSet_t *
3266 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3267 {
3268 	RF_AutoConfig_t *ac;
3269 	RF_ConfigSet_t *config_sets;
3270 	RF_ConfigSet_t *cset;
3271 	RF_AutoConfig_t *ac_next;
3272 
3273 
3274 	config_sets = NULL;
3275 
3276 	/* Go through the AutoConfig list, and figure out which components
3277 	   belong to what sets.  */
3278 	ac = ac_list;
3279 	while(ac!=NULL) {
3280 		/* we're going to putz with ac->next, so save it here
3281 		   for use at the end of the loop */
3282 		ac_next = ac->next;
3283 
3284 		if (config_sets == NULL) {
3285 			/* will need at least this one... */
3286 			config_sets = (RF_ConfigSet_t *)
3287 				malloc(sizeof(RF_ConfigSet_t),
3288 				       M_RAIDFRAME, M_NOWAIT);
3289 			if (config_sets == NULL) {
3290 				panic("rf_create_auto_sets: No memory!");
3291 			}
3292 			/* this one is easy :) */
3293 			config_sets->ac = ac;
3294 			config_sets->next = NULL;
3295 			config_sets->rootable = 0;
3296 			ac->next = NULL;
3297 		} else {
3298 			/* which set does this component fit into? */
3299 			cset = config_sets;
3300 			while(cset!=NULL) {
3301 				if (rf_does_it_fit(cset, ac)) {
3302 					/* looks like it matches... */
3303 					ac->next = cset->ac;
3304 					cset->ac = ac;
3305 					break;
3306 				}
3307 				cset = cset->next;
3308 			}
3309 			if (cset==NULL) {
3310 				/* didn't find a match above... new set..*/
3311 				cset = (RF_ConfigSet_t *)
3312 					malloc(sizeof(RF_ConfigSet_t),
3313 					       M_RAIDFRAME, M_NOWAIT);
3314 				if (cset == NULL) {
3315 					panic("rf_create_auto_sets: No memory!");
3316 				}
3317 				cset->ac = ac;
3318 				ac->next = NULL;
3319 				cset->next = config_sets;
3320 				cset->rootable = 0;
3321 				config_sets = cset;
3322 			}
3323 		}
3324 		ac = ac_next;
3325 	}
3326 
3327 
3328 	return(config_sets);
3329 }
3330 
3331 static int
3332 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3333 {
3334 	RF_ComponentLabel_t *clabel1, *clabel2;
3335 
3336 	/* If this one matches the *first* one in the set, that's good
3337 	   enough, since the other members of the set would have been
3338 	   through here too... */
3339 	/* note that we are not checking partitionSize here..
3340 
3341 	   Note that we are also not checking the mod_counters here.
3342 	   If everything else matches execpt the mod_counter, that's
3343 	   good enough for this test.  We will deal with the mod_counters
3344 	   a little later in the autoconfiguration process.
3345 
3346 	    (clabel1->mod_counter == clabel2->mod_counter) &&
3347 
3348 	   The reason we don't check for this is that failed disks
3349 	   will have lower modification counts.  If those disks are
3350 	   not added to the set they used to belong to, then they will
3351 	   form their own set, which may result in 2 different sets,
3352 	   for example, competing to be configured at raid0, and
3353 	   perhaps competing to be the root filesystem set.  If the
3354 	   wrong ones get configured, or both attempt to become /,
3355 	   weird behaviour and or serious lossage will occur.  Thus we
3356 	   need to bring them into the fold here, and kick them out at
3357 	   a later point.
3358 
3359 	*/
3360 
3361 	clabel1 = cset->ac->clabel;
3362 	clabel2 = ac->clabel;
3363 	if ((clabel1->version == clabel2->version) &&
3364 	    (clabel1->serial_number == clabel2->serial_number) &&
3365 	    (clabel1->num_rows == clabel2->num_rows) &&
3366 	    (clabel1->num_columns == clabel2->num_columns) &&
3367 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
3368 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3369 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3370 	    (clabel1->parityConfig == clabel2->parityConfig) &&
3371 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3372 	    (clabel1->blockSize == clabel2->blockSize) &&
3373 	    rf_component_label_numblocks(clabel1) ==
3374 	    rf_component_label_numblocks(clabel2) &&
3375 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
3376 	    (clabel1->root_partition == clabel2->root_partition) &&
3377 	    (clabel1->last_unit == clabel2->last_unit) &&
3378 	    (clabel1->config_order == clabel2->config_order)) {
3379 		/* if it get's here, it almost *has* to be a match */
3380 	} else {
3381 		/* it's not consistent with somebody in the set..
3382 		   punt */
3383 		return(0);
3384 	}
3385 	/* all was fine.. it must fit... */
3386 	return(1);
3387 }
3388 
/*
 * Decide whether the config set has enough live components (at the
 * winning mod_counter) to be configured.  Returns 1 if the set can be
 * configured, 0 if too many components are missing for its parity type.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	/* The highest mod_counter present wins; components carrying a
	   lower value are stale (e.g. they failed before the most
	   recent configuration change). */
	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	/* For each column, look for a component with the winning
	   mod_counter; anything else counts as missing. */
	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an even component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* RAID 0 tolerates no missing components; RAID 4/5 tolerate
	   exactly one. */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
3491 
3492 void
3493 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3494 			RF_Raid_t *raidPtr)
3495 {
3496 	RF_ComponentLabel_t *clabel;
3497 	int i;
3498 
3499 	clabel = ac->clabel;
3500 
3501 	/* 1. Fill in the common stuff */
3502 	config->numRow = clabel->num_rows = 1;
3503 	config->numCol = clabel->num_columns;
3504 	config->numSpare = 0; /* XXX should this be set here? */
3505 	config->sectPerSU = clabel->sectPerSU;
3506 	config->SUsPerPU = clabel->SUsPerPU;
3507 	config->SUsPerRU = clabel->SUsPerRU;
3508 	config->parityConfig = clabel->parityConfig;
3509 	/* XXX... */
3510 	strcpy(config->diskQueueType,"fifo");
3511 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3512 	config->layoutSpecificSize = 0; /* XXX ?? */
3513 
3514 	while(ac!=NULL) {
3515 		/* row/col values will be in range due to the checks
3516 		   in reasonable_label() */
3517 		strcpy(config->devnames[0][ac->clabel->column],
3518 		       ac->devname);
3519 		ac = ac->next;
3520 	}
3521 
3522 	for(i=0;i<RF_MAXDBGV;i++) {
3523 		config->debugVars[i][0] = 0;
3524 	}
3525 }
3526 
3527 int
3528 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3529 {
3530 	RF_ComponentLabel_t *clabel;
3531 	int column;
3532 	int sparecol;
3533 
3534 	raidPtr->autoconfigure = new_value;
3535 
3536 	for(column=0; column<raidPtr->numCol; column++) {
3537 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
3538 			clabel = raidget_component_label(raidPtr, column);
3539 			clabel->autoconfigure = new_value;
3540 			raidflush_component_label(raidPtr, column);
3541 		}
3542 	}
3543 	for(column = 0; column < raidPtr->numSpare ; column++) {
3544 		sparecol = raidPtr->numCol + column;
3545 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3546 			clabel = raidget_component_label(raidPtr, sparecol);
3547 			clabel->autoconfigure = new_value;
3548 			raidflush_component_label(raidPtr, sparecol);
3549 		}
3550 	}
3551 	return(new_value);
3552 }
3553 
3554 int
3555 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3556 {
3557 	RF_ComponentLabel_t *clabel;
3558 	int column;
3559 	int sparecol;
3560 
3561 	raidPtr->root_partition = new_value;
3562 	for(column=0; column<raidPtr->numCol; column++) {
3563 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
3564 			clabel = raidget_component_label(raidPtr, column);
3565 			clabel->root_partition = new_value;
3566 			raidflush_component_label(raidPtr, column);
3567 		}
3568 	}
3569 	for(column = 0; column < raidPtr->numSpare ; column++) {
3570 		sparecol = raidPtr->numCol + column;
3571 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3572 			clabel = raidget_component_label(raidPtr, sparecol);
3573 			clabel->root_partition = new_value;
3574 			raidflush_component_label(raidPtr, sparecol);
3575 		}
3576 	}
3577 	return(new_value);
3578 }
3579 
3580 void
3581 rf_release_all_vps(RF_ConfigSet_t *cset)
3582 {
3583 	RF_AutoConfig_t *ac;
3584 
3585 	ac = cset->ac;
3586 	while(ac!=NULL) {
3587 		/* Close the vp, and give it back */
3588 		if (ac->vp) {
3589 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3590 			VOP_CLOSE(ac->vp, FREAD, NOCRED);
3591 			vput(ac->vp);
3592 			ac->vp = NULL;
3593 		}
3594 		ac = ac->next;
3595 	}
3596 }
3597 
3598 
3599 void
3600 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3601 {
3602 	RF_AutoConfig_t *ac;
3603 	RF_AutoConfig_t *next_ac;
3604 
3605 	ac = cset->ac;
3606 	while(ac!=NULL) {
3607 		next_ac = ac->next;
3608 		/* nuke the label */
3609 		free(ac->clabel, M_RAIDFRAME);
3610 		/* cleanup the config structure */
3611 		free(ac, M_RAIDFRAME);
3612 		/* "next.." */
3613 		ac = next_ac;
3614 	}
3615 	/* and, finally, nuke the config set */
3616 	free(cset, M_RAIDFRAME);
3617 }
3618 
3619 
/*
 * Populate a component label from the current state of the RAID set:
 * identity (serial/mod counters), geometry, layout parameters, and
 * configuration preferences.  The caller is responsible for writing
 * the label out (e.g. via raidflush_component_label()).
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	/* Splits the 64-bit sector count into numBlocks/numBlocksHi. */
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
3652 
/*
 * Autoconfigure one config set: pick a raid unit number (preferring
 * the unit it was last configured on), build an RF_Config_t from the
 * component labels, and configure the set.  On success *unit holds the
 * chosen raid unit; returns 0 on success, nonzero on failure.
 */
int
rf_auto_config_set(RF_ConfigSet_t *cset, int *unit)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	int retcode;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	retcode = 0;
	*unit = -1;

	/* 1. Create a config structure */

	config = (RF_Config_t *)malloc(sizeof(RF_Config_t),
				       M_RAIDFRAME,
				       M_NOWAIT);
	if (config==NULL) {
		printf("Out of mem!?!?\n");
				/* XXX do something more intelligent here. */
		return(1);
	}

	memset(config, 0, sizeof(RF_Config_t));

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	raidID = cset->ac->clabel->last_unit;
	if ((raidID < 0) || (raidID >= numraid)) {
		/* let's not wander off into lala land. */
		raidID = numraid - 1;
	}
	if (raidPtrs[raidID]->valid != 0) {

		/*
		   Nope... Go looking for an alternative...
		   Start high so we don't immediately use raid0 if that's
		   not taken.
		*/

		for(raidID = numraid - 1; raidID >= 0; raidID--) {
			if (raidPtrs[raidID]->valid == 0) {
				/* can use this one! */
				break;
			}
		}
	}

	/* The loop above leaves raidID == -1 when every unit is taken. */
	if (raidID < 0) {
		/* punt... */
		printf("Unable to auto configure this set!\n");
		printf("(Out of RAID devs!)\n");
		free(config, M_RAIDFRAME);
		return(1);
	}

#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	raidPtr = raidPtrs[raidID];

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	retcode = rf_Configure(raidPtr, config, cset->ac);

	if (retcode == 0) {

		raidinit(raidPtrs[raidID]);

		rf_markalldirty(raidPtrs[raidID]);
		raidPtrs[raidID]->autoconfigure = 1; /* XXX do this here? */
		if (cset->ac->clabel->root_partition==1) {
			/* everything configured just fine.  Make a note
			   that this set is eligible to be root. */
			cset->rootable = 1;
			/* XXX do this here? */
			raidPtrs[raidID]->root_partition = 1;
		}
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);

	*unit = raidID;
	return(retcode);
}
3753 
3754 void
3755 rf_disk_unbusy(RF_RaidAccessDesc_t *desc)
3756 {
3757 	struct buf *bp;
3758 
3759 	bp = (struct buf *)desc->bp;
3760 	disk_unbusy(&raid_softc[desc->raidPtr->raidid].sc_dkdev,
3761 	    (bp->b_bcount - bp->b_resid), (bp->b_flags & B_READ));
3762 }
3763 
/*
 * Initialize a pool at IPL_BIO and pre-populate it: prime it with
 * xmin items, keep at least xmin around (low-water mark), and cap
 * the number of idle items at xmax (high-water mark).
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
	     size_t xmin, size_t xmax)
{
	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
	pool_setlowat(p, xmin);
}
3773 
3774 /*
3775  * rf_buf_queue_check(int raidid) -- looks into the buf_queue to see
3776  * if there is IO pending and if that IO could possibly be done for a
3777  * given RAID set.  Returns 0 if IO is waiting and can be done, 1
3778  * otherwise.
3779  *
3780  */
3781 
3782 int
3783 rf_buf_queue_check(int raidid)
3784 {
3785 	if ((bufq_peek(raid_softc[raidid].buf_queue) != NULL) &&
3786 	    raidPtrs[raidid]->openings > 0) {
3787 		/* there is work to do */
3788 		return 0;
3789 	}
3790 	/* default is nothing to do */
3791 	return 1;
3792 }
3793 
3794 int
3795 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
3796 {
3797 	uint64_t numsecs;
3798 	unsigned secsize;
3799 	int error;
3800 
3801 	error = getdisksize(vp, &numsecs, &secsize);
3802 	if (error == 0) {
3803 		diskPtr->blockSize = secsize;
3804 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
3805 		diskPtr->partitionSize = numsecs;
3806 		return 0;
3807 	}
3808 	return error;
3809 }
3810 
/*
 * Autoconf match function.  raid(4) is a pseudo-device, so any probe
 * matches unconditionally.
 */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
3816 
/*
 * Autoconf attach function.  Intentionally empty: per-set setup
 * appears to happen at configuration time via raidinit() (see
 * rf_auto_config_set()), not at device attach.
 */
static void
raid_attach(device_t parent, device_t self, void *aux)
{

}
3822 
3823 
3824 static int
3825 raid_detach(device_t self, int flags)
3826 {
3827 	int error;
3828 	struct raid_softc *rs = &raid_softc[device_unit(self)];
3829 
3830 	if ((error = raidlock(rs)) != 0)
3831 		return (error);
3832 
3833 	error = raid_detach_unlocked(rs);
3834 
3835 	raidunlock(rs);
3836 
3837 	return error;
3838 }
3839 
/*
 * Publish a "disk-info" property dictionary (with a "geometry"
 * sub-dictionary) for the RAID device, replacing any previous one.
 * NOTE(review): the cylinder/track figures are synthesized (e.g.
 * tracks-per-cylinder = 4 * numCol), presumably to present a
 * plausible fake geometry to disklabel-style consumers — confirm.
 */
static void
rf_set_properties(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	prop_dictionary_t disk_info, odisk_info, geom;
	disk_info = prop_dictionary_create();
	geom = prop_dictionary_create();
	prop_dictionary_set_uint64(geom, "sectors-per-unit",
				   raidPtr->totalSectors);
	prop_dictionary_set_uint32(geom, "sector-size",
				   raidPtr->bytesPerSector);

	prop_dictionary_set_uint16(geom, "sectors-per-track",
				   raidPtr->Layout.dataSectorsPerStripe);
	prop_dictionary_set_uint16(geom, "tracks-per-cylinder",
				   4 * raidPtr->numCol);

	prop_dictionary_set_uint64(geom, "cylinders-per-unit",
	   raidPtr->totalSectors / (raidPtr->Layout.dataSectorsPerStripe *
	   (4 * raidPtr->numCol)));

	/* prop_dictionary_set retains geom; drop our reference. */
	prop_dictionary_set(disk_info, "geometry", geom);
	prop_object_release(geom);
	prop_dictionary_set(device_properties(rs->sc_dev),
			    "disk-info", disk_info);
	/* Swap in the new dictionary before releasing the old one. */
	odisk_info = rs->sc_dkdev.dk_info;
	rs->sc_dkdev.dk_info = disk_info;
	if (odisk_info)
		prop_object_release(odisk_info);
}
3869 
3870 /*
3871  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3872  * We end up returning whatever error was returned by the first cache flush
3873  * that fails.
3874  */
3875 
3876 int
3877 rf_sync_component_caches(RF_Raid_t *raidPtr)
3878 {
3879 	int c, sparecol;
3880 	int e,error;
3881 	int force = 1;
3882 
3883 	error = 0;
3884 	for (c = 0; c < raidPtr->numCol; c++) {
3885 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
3886 			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3887 					  &force, FWRITE, NOCRED);
3888 			if (e) {
3889 				if (e != ENODEV)
3890 					printf("raid%d: cache flush to component %s failed.\n",
3891 					       raidPtr->raidid, raidPtr->Disks[c].devname);
3892 				if (error == 0) {
3893 					error = e;
3894 				}
3895 			}
3896 		}
3897 	}
3898 
3899 	for( c = 0; c < raidPtr->numSpare ; c++) {
3900 		sparecol = raidPtr->numCol + c;
3901 		/* Need to ensure that the reconstruct actually completed! */
3902 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3903 			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
3904 					  DIOCCACHESYNC, &force, FWRITE, NOCRED);
3905 			if (e) {
3906 				if (e != ENODEV)
3907 					printf("raid%d: cache flush to component %s failed.\n",
3908 					       raidPtr->raidid, raidPtr->Disks[sparecol].devname);
3909 				if (error == 0) {
3910 					error = e;
3911 				}
3912 			}
3913 		}
3914 	}
3915 	return error;
3916 }
3917