xref: /netbsd-src/sys/dev/raidframe/rf_netbsdkintf.c (revision 5d6f13c8e9d46ee5cec6bd5f77943f30931e0096)
1 /*	$NetBSD: rf_netbsdkintf.c,v 1.418 2025/01/08 08:25:36 andvar Exp $	*/
2 
3 /*-
4  * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Greg Oster; Jason R. Thorpe.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 1988 University of Utah.
34  * Copyright (c) 1990, 1993
35  *      The Regents of the University of California.  All rights reserved.
36  *
37  * This code is derived from software contributed to Berkeley by
38  * the Systems Programming Group of the University of Utah Computer
39  * Science Department.
40  *
41  * Redistribution and use in source and binary forms, with or without
42  * modification, are permitted provided that the following conditions
43  * are met:
44  * 1. Redistributions of source code must retain the above copyright
45  *    notice, this list of conditions and the following disclaimer.
46  * 2. Redistributions in binary form must reproduce the above copyright
47  *    notice, this list of conditions and the following disclaimer in the
48  *    documentation and/or other materials provided with the distribution.
49  * 3. Neither the name of the University nor the names of its contributors
50  *    may be used to endorse or promote products derived from this software
51  *    without specific prior written permission.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63  * SUCH DAMAGE.
64  *
65  * from: Utah $Hdr: cd.c 1.6 90/11/28$
66  *
67  *      @(#)cd.c        8.2 (Berkeley) 11/16/93
68  */
69 
70 /*
71  * Copyright (c) 1995 Carnegie-Mellon University.
72  * All rights reserved.
73  *
74  * Authors: Mark Holland, Jim Zelenka
75  *
76  * Permission to use, copy, modify and distribute this software and
77  * its documentation is hereby granted, provided that both the copyright
78  * notice and this permission notice appear in all copies of the
79  * software, derivative works or modified versions, and any portions
80  * thereof, and that both notices appear in supporting documentation.
81  *
82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
85  *
86  * Carnegie Mellon requests users of this software to return to
87  *
88  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
89  *  School of Computer Science
90  *  Carnegie Mellon University
91  *  Pittsburgh PA 15213-3890
92  *
93  * any improvements or extensions that they make and grant Carnegie the
94  * rights to redistribute these changes.
95  */
96 
97 /***********************************************************
98  *
99  * rf_kintf.c -- the kernel interface routines for RAIDframe
100  *
101  ***********************************************************/
102 
103 #include <sys/cdefs.h>
104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.418 2025/01/08 08:25:36 andvar Exp $");
105 
106 #ifdef _KERNEL_OPT
107 #include "opt_raid_autoconfig.h"
108 #include "opt_compat_netbsd32.h"
109 #endif
110 
111 #include <sys/param.h>
112 #include <sys/errno.h>
113 #include <sys/pool.h>
114 #include <sys/proc.h>
115 #include <sys/queue.h>
116 #include <sys/disk.h>
117 #include <sys/device.h>
118 #include <sys/stat.h>
119 #include <sys/ioctl.h>
120 #include <sys/fcntl.h>
121 #include <sys/systm.h>
122 #include <sys/vnode.h>
123 #include <sys/disklabel.h>
124 #include <sys/conf.h>
125 #include <sys/buf.h>
126 #include <sys/bufq.h>
127 #include <sys/reboot.h>
128 #include <sys/kauth.h>
129 #include <sys/module.h>
130 #include <sys/compat_stub.h>
131 
132 #include <prop/proplib.h>
133 
134 #include <dev/raidframe/raidframevar.h>
135 #include <dev/raidframe/raidframeio.h>
136 #include <dev/raidframe/rf_paritymap.h>
137 
138 #include "rf_raid.h"
139 #include "rf_dag.h"
140 #include "rf_dagflags.h"
141 #include "rf_desc.h"
142 #include "rf_diskqueue.h"
143 #include "rf_etimer.h"
144 #include "rf_general.h"
145 #include "rf_kintf.h"
146 #include "rf_options.h"
147 #include "rf_driver.h"
148 #include "rf_parityscan.h"
149 #include "rf_threadstuff.h"
150 
151 #include "ioconf.h"
152 
#ifdef DEBUG
int     rf_kdebug_level = 0;
/*
 * Debug printf: emitted only when rf_kdebug_level is raised at runtime.
 * Wrapped in do { } while (0) so the macro is a single statement and
 * cannot capture a caller's "else" (dangling-else hazard of a bare if).
 */
#define db1_printf(a) do { if (rf_kdebug_level > 0) printf a; } while (0)
#else				/* DEBUG */
#define db1_printf(a) do { } while (0)
#endif				/* DEBUG */
159 
160 #define DEVICE_XNAME(dev) dev ? device_xname(dev) : "null"
161 
162 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
163 static rf_declare_mutex2(rf_sparet_wait_mutex);
164 static rf_declare_cond2(rf_sparet_wait_cv);
165 static rf_declare_cond2(rf_sparet_resp_cv);
166 
167 static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
168 						 * spare table */
169 static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
170 						 * installation process */
171 #endif
172 
173 const int rf_b_pass = (B_PHYS|B_RAW|B_MEDIA_FLAGS);
174 
175 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");
176 
177 /* prototypes */
178 static void KernelWakeupFunc(struct buf *);
179 static void InitBP(struct buf *, struct vnode *, unsigned,
180     dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
181     void *, int);
182 static void raidinit(struct raid_softc *);
183 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
184 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);
185 
186 static int raid_match(device_t, cfdata_t, void *);
187 static void raid_attach(device_t, device_t, void *);
188 static int raid_detach(device_t, int);
189 
190 static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
191     daddr_t, daddr_t);
192 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
193     daddr_t, daddr_t);
194 
195 static int raidwrite_component_label(unsigned,
196     dev_t, struct vnode *, RF_ComponentLabel_t *);
197 static int raidread_component_label(unsigned,
198     dev_t, struct vnode *, RF_ComponentLabel_t *);
199 
200 static int raid_diskstart(device_t, struct buf *bp);
201 static int raid_dumpblocks(device_t, void *, daddr_t, int);
202 static int raid_lastclose(device_t);
203 
204 static dev_type_open(raidopen);
205 static dev_type_close(raidclose);
206 static dev_type_read(raidread);
207 static dev_type_write(raidwrite);
208 static dev_type_ioctl(raidioctl);
209 static dev_type_strategy(raidstrategy);
210 static dev_type_dump(raiddump);
211 static dev_type_size(raidsize);
212 
/*
 * Block device switch: entry points used when a raid(4) unit is
 * accessed through its block device nodes (open/close/strategy/
 * ioctl/dump/size).
 */
const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
223 
/*
 * Character device switch: raw-device entry points for raid(4).
 * Read/write go through raidread/raidwrite; unsupported operations
 * use the no* stubs.
 */
const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};
238 
/*
 * dk(9) driver glue: callbacks the generic disk framework uses to
 * drive this device (I/O start, dumping, last-close processing).
 */
static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};
248 
249 #define	raidunit(x)	DISKUNIT(x)
250 #define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)
251 
252 extern struct cfdriver raid_cd;
253 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
254     raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
255     DVF_DETACH_SHUTDOWN);
256 
/* Internal representation of a rf_recon_req (kernel-side copy of the
 * user's reconstruction request, handed to the recon threads). */
struct rf_recon_req_internal {
	RF_RowCol_t col;		/* component column the request refers to */
	RF_ReconReqFlags_t flags;	/* reconstruction request flags */
	void   *raidPtr;		/* RF_Raid_t this request applies to */
};
263 
264 /*
265  * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
266  * Be aware that large numbers can allow the driver to consume a lot of
267  * kernel memory, especially on writes, and in degraded mode reads.
268  *
269  * For example: with a stripe width of 64 blocks (32k) and 5 disks,
270  * a single 64K write will typically require 64K for the old data,
271  * 64K for the old parity, and 64K for the new parity, for a total
272  * of 192K (if the parity buffer is not re-used immediately).
 * Even if it is used immediately, that's still 128K, which when multiplied
274  * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
275  *
276  * Now in degraded mode, for example, a 64K read on the above setup may
277  * require data reconstruction, which will require *all* of the 4 remaining
278  * disks to participate -- 4 * 32K/disk == 128K again.
279  */
280 
281 #ifndef RAIDOUTSTANDING
282 #define RAIDOUTSTANDING   6
283 #endif
284 
285 #define RAIDLABELDEV(dev)	\
286 	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
287 
288 /* declared here, and made public, for the benefit of KVM stuff.. */
289 
290 static int raidlock(struct raid_softc *);
291 static void raidunlock(struct raid_softc *);
292 
293 static int raid_detach_unlocked(struct raid_softc *);
294 
295 static void rf_markalldirty(RF_Raid_t *);
296 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);
297 
298 static void rf_ReconThread(struct rf_recon_req_internal *);
299 static void rf_RewriteParityThread(RF_Raid_t *raidPtr);
300 static void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
301 static int rf_autoconfig(device_t);
302 static int rf_rescan(void);
303 static void rf_buildroothack(RF_ConfigSet_t *);
304 
305 static RF_AutoConfig_t *rf_find_raid_components(void);
306 static RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
307 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
308 static void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
309 static int rf_set_autoconfig(RF_Raid_t *, int);
310 static int rf_set_rootpartition(RF_Raid_t *, int);
311 static void rf_release_all_vps(RF_ConfigSet_t *);
312 static void rf_cleanup_config_set(RF_ConfigSet_t *);
313 static int rf_have_enough_components(RF_ConfigSet_t *);
314 static struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
315 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);
316 
317 /*
318  * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
319  * Note that this is overridden by having RAID_AUTOCONFIG as an option
320  * in the kernel config file.
321  */
322 #ifdef RAID_AUTOCONFIG
323 int raidautoconfig = 1;
324 #else
325 int raidautoconfig = 0;
326 #endif
327 static bool raidautoconfigdone = false;
328 
329 struct pool rf_alloclist_pool;   /* AllocList */
330 
331 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
332 static kmutex_t raid_lock;
333 
334 static struct raid_softc *
335 raidcreate(int unit) {
336 	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
337 	sc->sc_unit = unit;
338 	cv_init(&sc->sc_cv, "raidunit");
339 	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
340 	return sc;
341 }
342 
343 static void
344 raiddestroy(struct raid_softc *sc) {
345 	cv_destroy(&sc->sc_cv);
346 	mutex_destroy(&sc->sc_mutex);
347 	kmem_free(sc, sizeof(*sc));
348 }
349 
350 static struct raid_softc *
351 raidget(int unit, bool create) {
352 	struct raid_softc *sc;
353 	if (unit < 0) {
354 #ifdef DIAGNOSTIC
355 		panic("%s: unit %d!", __func__, unit);
356 #endif
357 		return NULL;
358 	}
359 	mutex_enter(&raid_lock);
360 	LIST_FOREACH(sc, &raids, sc_link) {
361 		if (sc->sc_unit == unit) {
362 			mutex_exit(&raid_lock);
363 			return sc;
364 		}
365 	}
366 	mutex_exit(&raid_lock);
367 	if (!create)
368 		return NULL;
369 	sc = raidcreate(unit);
370 	mutex_enter(&raid_lock);
371 	LIST_INSERT_HEAD(&raids, sc, sc_link);
372 	mutex_exit(&raid_lock);
373 	return sc;
374 }
375 
376 static void
377 raidput(struct raid_softc *sc) {
378 	mutex_enter(&raid_lock);
379 	LIST_REMOVE(sc, sc_link);
380 	mutex_exit(&raid_lock);
381 	raiddestroy(sc);
382 }
383 
void
raidattach(int num)
{

	/*
	 * Intentionally empty: device attachment and the associated
	 * initialization are handled during module initialization.
	 */
}
393 
394 static int
395 rf_autoconfig(device_t self)
396 {
397 	RF_AutoConfig_t *ac_list;
398 	RF_ConfigSet_t *config_sets;
399 
400 	if (!raidautoconfig || raidautoconfigdone == true)
401 		return 0;
402 
403 	/* XXX This code can only be run once. */
404 	raidautoconfigdone = true;
405 
406 #ifdef __HAVE_CPU_BOOTCONF
407 	/*
408 	 * 0. find the boot device if needed first so we can use it later
409 	 * this needs to be done before we autoconfigure any raid sets,
410 	 * because if we use wedges we are not going to be able to open
411 	 * the boot device later
412 	 */
413 	if (booted_device == NULL)
414 		cpu_bootconf();
415 #endif
416 	/* 1. locate all RAID components on the system */
417 	aprint_debug("Searching for RAID components...\n");
418 	ac_list = rf_find_raid_components();
419 
420 	/* 2. Sort them into their respective sets. */
421 	config_sets = rf_create_auto_sets(ac_list);
422 
423 	/*
424 	 * 3. Evaluate each set and configure the valid ones.
425 	 * This gets done in rf_buildroothack().
426 	 */
427 	rf_buildroothack(config_sets);
428 
429 	return 1;
430 }
431 
432 int
433 rf_inited(const struct raid_softc *rs) {
434 	return (rs->sc_flags & RAIDF_INITED) != 0;
435 }
436 
437 RF_Raid_t *
438 rf_get_raid(struct raid_softc *rs) {
439 	return &rs->sc_r;
440 }
441 
442 int
443 rf_get_unit(const struct raid_softc *rs) {
444 	return rs->sc_unit;
445 }
446 
447 static int
448 rf_containsboot(RF_Raid_t *r, device_t bdv) {
449 	const char *bootname;
450 	size_t len;
451 
452 	/* if bdv is NULL, the set can't contain it. exit early. */
453 	if (bdv == NULL)
454 		return 0;
455 
456 	bootname = device_xname(bdv);
457 	len = strlen(bootname);
458 
459 	for (int col = 0; col < r->numCol; col++) {
460 		const char *devname = r->Disks[col].devname;
461 		devname += sizeof("/dev/") - 1;
462 		if (strncmp(devname, "dk", 2) == 0) {
463 			const char *parent =
464 			    dkwedge_get_parent_name(r->Disks[col].dev);
465 			if (parent != NULL)
466 				devname = parent;
467 		}
468 		if (strncmp(devname, bootname, len) == 0) {
469 			struct raid_softc *sc = r->softc;
470 			aprint_debug("raid%d includes boot device %s\n",
471 			    sc->sc_unit, devname);
472 			return 1;
473 		}
474 	}
475 	return 0;
476 }
477 
/*
 * Re-scan the system for RAID components and autoconfigure every
 * complete, autoconfig-enabled set found.  The scan is repeated as
 * long as new sets come up, so RAID sets layered on top of other
 * RAID sets (recursive RAID) are also configured.  Always returns 0.
 */
static int
rf_rescan(void)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets, *cset, *next_cset;
	struct raid_softc *sc;
	int raid_added;

	/* Collect all visible RAID components and sort them into sets. */
	ac_list = rf_find_raid_components();
	config_sets = rf_create_auto_sets(ac_list);

	raid_added = 1;
	while (raid_added > 0) {
		raid_added = 0;
		cset = config_sets;
		while (cset != NULL) {
			/* cset is freed below; remember the successor now. */
			next_cset = cset->next;
			if (rf_have_enough_components(cset) &&
			    cset->ac->clabel->autoconfigure == 1) {
				sc = rf_auto_config_set(cset);
				if (sc != NULL) {
					aprint_debug("raid%d: configured ok, rootable %d\n",
						     sc->sc_unit, cset->rootable);
					/* We added one RAID set */
					raid_added++;
				} else {
					/* The autoconfig didn't work :( */
					aprint_debug("Autoconfig failed\n");
					rf_release_all_vps(cset);
				}
			} else {
				/* we're not autoconfiguring this set...
				   release the associated resources */
				rf_release_all_vps(cset);
			}
			/* cleanup */
			rf_cleanup_config_set(cset);
			cset = next_cset;
		}
		if (raid_added > 0) {
			/* We added at least one RAID set, so re-scan for recursive RAID */
			ac_list = rf_find_raid_components();
			config_sets = rf_create_auto_sets(ac_list);
		}
	}

	return 0;
}
526 
527 /*
528  * Example setup:
529  * dk1 at wd0: "raid@wd0", 171965 blocks at 32802, type: raidframe
 * dk3 at wd1: "raid@wd1", 171965 blocks at 32802, type: raidframe
531  * raid1: Components: /dev/dk1 /dev/dk3
532  * dk4 at raid1: "empty@raid1", 8192 blocks at 34, type: msdos
533  * dk5 at raid1: "root@raid1", 163517 blocks at 8226, type: ffs
534  *
535  * If booted from wd0, booted_device will be
536  * disk wd0, startblk = 41092, nblks = 163517
537  *
538  * That is, dk5 with startblk computed from the beginning of wd0
539  * instead of beginning of raid1:
540  * 32802 + 64 (RF_PROTECTED_SECTORS) + 8226 = 41092
541  *
542  * In order to find the boot wedge, we must iterate on each component,
543  * find its offset from disk beginning, and look for the boot wedge with
 * startblk adjusted.
545  */
546 static device_t
547 rf_find_bootwedge(struct raid_softc *rsc)
548 {
549 	RF_Raid_t *r = &rsc->sc_r;
550 	const char *bootname;
551 	size_t len;
552 	device_t rdev = NULL;
553 
554 	if (booted_device == NULL)
555 		goto out;
556 
557 	bootname = device_xname(booted_device);
558 	len = strlen(bootname);
559 
560 	aprint_debug("%s: booted_device %s, startblk = %"PRId64", "
561 		     "nblks = %"PRId64"\n", __func__,
562 		     bootname, booted_startblk, booted_nblks);
563 
564 	for (int col = 0; col < r->numCol; col++) {
565 		const char *devname = r->Disks[col].devname;
566 		const char *parent;
567 		struct disk *dk;
568 		u_int nwedges;
569 		struct dkwedge_info *dkwi;
570 		struct dkwedge_list dkwl;
571 		size_t dkwi_len;
572 		int i;
573 
574 		devname += sizeof("/dev/") - 1;
575 		if (strncmp(devname, "dk", 2) != 0)
576 			continue;
577 
578 		parent = dkwedge_get_parent_name(r->Disks[col].dev);
579 		if (parent == NULL) {
580 			aprint_debug("%s: cannot find parent for "
581 				     "component /dev/%s", __func__, devname);
582 			continue;
583 		}
584 
585 		if (strncmp(parent, bootname, len) != 0)
586 			continue;
587 
588 		aprint_debug("%s: looking up wedge %s in device %s\n",
589 			     __func__, devname, parent);
590 
591 		dk = disk_find(parent);
592 		nwedges = dk->dk_nwedges;
593 		dkwi_len = sizeof(*dkwi) * nwedges;
594 		dkwi = RF_Malloc(dkwi_len);
595 
596 		dkwl.dkwl_buf = dkwi;
597 		dkwl.dkwl_bufsize = dkwi_len;
598 		dkwl.dkwl_nwedges = 0;
599 		dkwl.dkwl_ncopied = 0;
600 
601 		if (dkwedge_list(dk, &dkwl, curlwp) == 0) {
602 			daddr_t startblk;
603 
604 			for (i = 0; i < dkwl.dkwl_ncopied; i++) {
605 				if (strcmp(dkwi[i].dkw_devname, devname) == 0)
606 					break;
607 			}
608 
609 			KASSERT(i < dkwl.dkwl_ncopied);
610 
611 			aprint_debug("%s: wedge %s, "
612 				     "startblk = %"PRId64", "
613 				     "nblks = %"PRId64"\n",
614 				     __func__,
615 				     dkwi[i].dkw_devname,
616 				     dkwi[i].dkw_offset,
617 				     dkwi[i].dkw_size);
618 
619 			startblk = booted_startblk
620 				 - dkwi[i].dkw_offset
621 				 - RF_PROTECTED_SECTORS;
622 
623 			aprint_debug("%s: looking for wedge in %s, "
624 				     "startblk = %"PRId64", "
625 				     "nblks = %"PRId64"\n",
626 				     __func__,
627 				     DEVICE_XNAME(rsc->sc_dksc.sc_dev),
628 				     startblk, booted_nblks);
629 
630 			rdev = dkwedge_find_partition(rsc->sc_dksc.sc_dev,
631 						      startblk,
632 						      booted_nblks);
633 			if (rdev) {
634 				aprint_debug("%s: root candidate wedge %s "
635 					     "shifted from %s\n", __func__,
636 					     device_xname(rdev),
637 					     dkwi[i].dkw_devname);
638 				goto done;
639 			} else {
640 				aprint_debug("%s: not found\n", __func__);
641 			}
642 		}
643 
644 		aprint_debug("%s: nothing found for col %d\n", __func__, col);
645 done:
646 		RF_Free(dkwi, dkwi_len);
647 	}
648 
649 out:
650 	if (!rdev)
651 		aprint_debug("%s: nothing found\n", __func__);
652 
653 	return rdev;
654 }
655 
/*
 * Configure all autoconfigurable RAID sets (repeating the scan so
 * stacked/recursive RAID also comes up), then try to determine
 * whether the root file system lives on one of them and, if so,
 * point booted_device at it for setroot().
 */
static void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;
	int raid_added;
	struct raid_softc *sc, *rsc;
	struct dk_softc *dksc = NULL;	/* XXX gcc -Os: may be used uninit. */

	sc = rsc = NULL;
	num_root = 0;

	raid_added = 1;
	while (raid_added > 0) {
		raid_added = 0;
		cset = config_sets;
		while (cset != NULL) {
			/* cset is freed below; remember the successor now. */
			next_cset = cset->next;
			if (rf_have_enough_components(cset) &&
			    cset->ac->clabel->autoconfigure == 1) {
				sc = rf_auto_config_set(cset);
				if (sc != NULL) {
					aprint_debug("raid%d: configured ok, rootable %d\n",
						     sc->sc_unit, cset->rootable);
					/* We added one RAID set */
					raid_added++;
					if (cset->rootable) {
						/* remember the last rootable set seen */
						rsc = sc;
						num_root++;
					}
				} else {
					/* The autoconfig didn't work :( */
					aprint_debug("Autoconfig failed\n");
					rf_release_all_vps(cset);
				}
			} else {
				/* we're not autoconfiguring this set...
				   release the associated resources */
				rf_release_all_vps(cset);
			}
			/* cleanup */
			rf_cleanup_config_set(cset);
			cset = next_cset;
		}
		if (raid_added > 0) {
			/* We added at least one RAID set, so re-scan for recursive RAID */
			ac_list = rf_find_raid_components();
			config_sets = rf_create_auto_sets(ac_list);
		}
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL) {
		aprint_debug("%s: rootspec %s\n", __func__, rootspec);
		return;
	}

	/* we found something bootable... */
	if (num_root == 1) {
		device_t candidate_root = NULL;
		dksc = &rsc->sc_dksc;

		if (dksc->sc_dkdev.dk_nwedges != 0) {

			/* Find the wedge we booted from */
			candidate_root = rf_find_bootwedge(rsc);

			/* Try first partition */
			if (candidate_root == NULL) {
				size_t i = 0;
				candidate_root = dkwedge_find_by_parent(
				    device_xname(dksc->sc_dev), &i);
			}
			aprint_debug("%s: candidate wedge root %s\n",
			    __func__, DEVICE_XNAME(candidate_root));
		} else {
			/* no wedges: the RAID device itself is the candidate */
			candidate_root = dksc->sc_dev;
		}

		aprint_debug("%s: candidate root = %s, booted_device = %s, "
			     "root_partition = %d, contains_boot=%d\n",
		    __func__, DEVICE_XNAME(candidate_root),
		    DEVICE_XNAME(booted_device), rsc->sc_r.root_partition,
		    rf_containsboot(&rsc->sc_r, booted_device));

		/* XXX the check for booted_device == NULL can probably be
		 * dropped, now that rf_containsboot handles that case.
		 */
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_method = "raidframe/single";
			booted_partition = 0;	/* XXX assume 'a' */
			aprint_debug("%s: set booted_device = %s\n", __func__,
			    DEVICE_XNAME(booted_device));
		}
	} else if (num_root > 1) {
		aprint_debug("%s: many roots=%d, %s\n", __func__, num_root,
		    DEVICE_XNAME(booted_device));

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		/* Count only sets that are valid, marked as root, and
		   actually contain the boot device. */
		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
				dksc = &rsc->sc_dksc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = dksc->sc_dev;
			booted_method = "raidframe/multi";
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}
798 
799 static int
800 raidsize(dev_t dev)
801 {
802 	struct raid_softc *rs;
803 	struct dk_softc *dksc;
804 	unsigned int unit;
805 
806 	unit = raidunit(dev);
807 	if ((rs = raidget(unit, false)) == NULL)
808 		return -1;
809 	dksc = &rs->sc_dksc;
810 
811 	if ((rs->sc_flags & RAIDF_INITED) == 0)
812 		return -1;
813 
814 	return dk_size(dksc, dev);
815 }
816 
817 static int
818 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
819 {
820 	unsigned int unit;
821 	struct raid_softc *rs;
822 	struct dk_softc *dksc;
823 
824 	unit = raidunit(dev);
825 	if ((rs = raidget(unit, false)) == NULL)
826 		return ENXIO;
827 	dksc = &rs->sc_dksc;
828 
829 	if ((rs->sc_flags & RAIDF_INITED) == 0)
830 		return ENODEV;
831 
832         /*
833            Note that blkno is relative to this particular partition.
834            By adding adding RF_PROTECTED_SECTORS, we get a value that
835 	   is relative to the partition used for the underlying component.
836         */
837 	blkno += RF_PROTECTED_SECTORS;
838 
839 	return dk_dump(dksc, dev, blkno, va, size, DK_DUMP_RECURSIVE);
840 }
841 
/*
 * dk(9) dumpblocks callback: write nblk blocks at va to block blkno
 * of one live component of the set.  Only RAID 1 sets (one data,
 * one parity column) are supported, since a single component then
 * holds a complete copy of the data.
 */
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int     c, sparecol, j, scol, dumpto;
	int     error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the first component
	   2) a used_spare of the first component
	   3) the second component
	   4) a used_spare of the second component
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live component.
	   If we didn't find a live component, we now check to see
	   if there is a relevant spared component.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;

		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one?  Find which column this
			   spare is standing in for. */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared first
				   component!  We'll take that over
				   anything else found so far.  (We
				   couldn't have found a real first
				   component before, since this is a
				   used spare, and it's saying that
				   it's replacing the first
				   component.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the first
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared second component.
				   We'll dump to that if we haven't found
				   anything else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	/* Dump directly through the component's block device. */
	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}
948 
/*
 * d_open entry point: open a partition of a RAID unit.  Creates the
 * softc on first reference; on the first open of a configured set,
 * marks all components dirty before handing off to dk_open().
 */
/* ARGSUSED */
static int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int     unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	int     error = 0;
	int     part, pmask;

	if ((rs = raidget(unit, true)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return error;

	/* refuse new opens while the unit is being shut down */
	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	dksc = &rs->sc_dksc;

	part = DISKPART(dev);
	pmask = (1 << part);

	if (!DK_BUSY(dksc, pmask) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		 have done a configure before this.  I DO NOT WANT TO BE
		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		 THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}

	/* hand the actual open to the disk framework */
	if ((rs->sc_flags & RAIDF_INITED) != 0)
		error = dk_open(dksc, dev, flags, fmt, l);

bad:
	raidunlock(rs);

	return error;


}
998 
999 static int
1000 raid_lastclose(device_t self)
1001 {
1002 	struct raid_softc *rs = raidsoftc(self);
1003 
1004 	/* Last one... device is not unconfigured yet.
1005 	   Device shutdown has taken care of setting the
1006 	   clean bits if RAIDF_INITED is not set
1007 	   mark things as clean... */
1008 
1009 	rf_update_component_labels(&rs->sc_r,
1010 	    RF_FINAL_COMPONENT_UPDATE);
1011 
1012 	/* pass to unlocked code */
1013 	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
1014 		rs->sc_flags |= RAIDF_DETACH;
1015 
1016 	return 0;
1017 }
1018 
1019 /* ARGSUSED */
1020 static int
1021 raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
1022 {
1023 	int     unit = raidunit(dev);
1024 	struct raid_softc *rs;
1025 	struct dk_softc *dksc;
1026 	cfdata_t cf;
1027 	int     error = 0, do_detach = 0, do_put = 0;
1028 
1029 	if ((rs = raidget(unit, false)) == NULL)
1030 		return ENXIO;
1031 	dksc = &rs->sc_dksc;
1032 
1033 	if ((error = raidlock(rs)) != 0)
1034 		return error;
1035 
1036 	if ((rs->sc_flags & RAIDF_INITED) != 0) {
1037 		error = dk_close(dksc, dev, flags, fmt, l);
1038 		if ((rs->sc_flags & RAIDF_DETACH) != 0)
1039 			do_detach = 1;
1040 	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
1041 		do_put = 1;
1042 
1043 	raidunlock(rs);
1044 
1045 	if (do_detach) {
1046 		/* free the pseudo device attach bits */
1047 		cf = device_cfdata(dksc->sc_dev);
1048 		error = config_detach(dksc->sc_dev, 0);
1049 		if (error == 0)
1050 			free(cf, M_RAIDFRAME);
1051 	} else if (do_put) {
1052 		raidput(rs);
1053 	}
1054 
1055 	return error;
1056 
1057 }
1058 
/*
 * Kick the RAIDframe worker: signal iodone_cv (under iodone_lock) so
 * that a thread sleeping on it re-examines its work queues.
 */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}
1066 
1067 static void
1068 raidstrategy(struct buf *bp)
1069 {
1070 	unsigned int unit;
1071 	struct raid_softc *rs;
1072 	struct dk_softc *dksc;
1073 	RF_Raid_t *raidPtr;
1074 
1075 	unit = raidunit(bp->b_dev);
1076 	if ((rs = raidget(unit, false)) == NULL) {
1077 		bp->b_error = ENXIO;
1078 		goto fail;
1079 	}
1080 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
1081 		bp->b_error = ENXIO;
1082 		goto fail;
1083 	}
1084 	dksc = &rs->sc_dksc;
1085 	raidPtr = &rs->sc_r;
1086 
1087 	/* Queue IO only */
1088 	if (dk_strategy_defer(dksc, bp))
1089 		goto done;
1090 
1091 	/* schedule the IO to happen at the next convenient time */
1092 	raid_wakeup(raidPtr);
1093 
1094 done:
1095 	return;
1096 
1097 fail:
1098 	bp->b_resid = bp->b_bcount;
1099 	biodone(bp);
1100 }
1101 
/*
 * dk(4) diskstart hook: dispatch one queued buffer to RAIDframe.
 * Returns ENODEV if the set is not configured, EAGAIN (via
 * raiddoaccess) when no openings are free, otherwise the
 * raiddoaccess() result.
 */
static int
raid_diskstart(device_t dev, struct buf *bp)
{
	struct raid_softc *rs = raidsoftc(dev);
	RF_Raid_t *raidPtr;

	raidPtr = &rs->sc_r;
	if (!raidPtr->valid) {
		db1_printf(("raid is not valid..\n"));
		return ENODEV;
	}

	/* XXX */
	bp->b_resid = 0;

	return raiddoaccess(raidPtr, bp);
}
1119 
1120 void
1121 raiddone(RF_Raid_t *raidPtr, struct buf *bp)
1122 {
1123 	struct raid_softc *rs;
1124 	struct dk_softc *dksc;
1125 
1126 	rs = raidPtr->softc;
1127 	dksc = &rs->sc_dksc;
1128 
1129 	dk_done(dksc, bp);
1130 
1131 	rf_lock_mutex2(raidPtr->mutex);
1132 	raidPtr->openings++;
1133 	rf_unlock_mutex2(raidPtr->mutex);
1134 
1135 	/* schedule more IO */
1136 	raid_wakeup(raidPtr);
1137 }
1138 
1139 /* ARGSUSED */
1140 static int
1141 raidread(dev_t dev, struct uio *uio, int flags)
1142 {
1143 	int     unit = raidunit(dev);
1144 	struct raid_softc *rs;
1145 
1146 	if ((rs = raidget(unit, false)) == NULL)
1147 		return ENXIO;
1148 
1149 	if ((rs->sc_flags & RAIDF_INITED) == 0)
1150 		return ENXIO;
1151 
1152 	return physio(raidstrategy, NULL, dev, B_READ, minphys, uio);
1153 
1154 }
1155 
1156 /* ARGSUSED */
1157 static int
1158 raidwrite(dev_t dev, struct uio *uio, int flags)
1159 {
1160 	int     unit = raidunit(dev);
1161 	struct raid_softc *rs;
1162 
1163 	if ((rs = raidget(unit, false)) == NULL)
1164 		return ENXIO;
1165 
1166 	if ((rs->sc_flags & RAIDF_INITED) == 0)
1167 		return ENXIO;
1168 
1169 	return physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio);
1170 
1171 }
1172 
/*
 * Tear down a configured RAID set and detach its dk/disk state.
 *
 * Returns EBUSY while any partition is open or a reconstruction or
 * parity rewrite is running, 0 if the set was never configured
 * (nothing to do), an rf_Shutdown() error, or 0 on success.
 *
 * NOTE(review): "unlocked" appears to mean the RAIDframe internals
 * are not locked here; presumably the caller holds the softc lock
 * (raidlock) — confirm at the call sites.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	/* Refuse while open or while background operations are running. */
	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0)
		return EBUSY;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk: wedges first, then disk, then dk state. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}
1209 
/*
 * RAIDFRAME_FAIL_DISK: mark the component in rr->col as failed and
 * start a reconstruction thread for it.
 *
 * Returns EINVAL on a RAID 0 (nothing to reconstruct from), for an
 * out-of-range column, when some other component has already failed,
 * or when a spared component's spare column is out of range; ENOMEM
 * if the request copy cannot be allocated; otherwise the result of
 * creating the reconstruction thread.
 */
int
rf_fail_disk(RF_Raid_t *raidPtr, struct rf_recon_req *rr)
{
	struct rf_recon_req_internal *rrint;

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (rr->col < 0 || rr->col >= raidPtr->numCol) {
		/* bad column */
		return EINVAL;
	}

	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->status == rf_rs_reconstructing) {
		/* Ask any in-flight reconstruction of this column to abort. */
		raidPtr->abortRecon[rr->col] = 1;
	}
	if ((raidPtr->Disks[rr->col].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* some other component has failed.  Let's not make
		   things worse. XXX wrong for RAID6 */
		goto out;
	}
	if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
		/* Spare columns live in [numCol, numCol + numSpare). */
		int spareCol = raidPtr->Disks[rr->col].spareCol;

		if (spareCol < raidPtr->numCol ||
		    spareCol >= raidPtr->numCol + raidPtr->numSpare)
			goto out;

		/*
		 * Fail the spare disk so that we can
		 * reconstruct on another one.
		 */
		raidPtr->Disks[spareCol].status = rf_ds_failed;

	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* make a copy of the recon request so that we don't rely on
	 * the user's buffer */
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return(ENOMEM);
	rrint->col = rr->col;
	rrint->flags = rr->flags;
	rrint->raidPtr = raidPtr;

	return RF_CREATE_THREAD(raidPtr->recon_thread, rf_ReconThread,
	    rrint, "raid_recon");
out:
	/* Error exits above arrive here with the mutex still held. */
	rf_unlock_mutex2(raidPtr->mutex);
	return EINVAL;
}
1266 
1267 static int
1268 rf_copyinspecificbuf(RF_Config_t *k_cfg)
1269 {
1270 	/* allocate a buffer for the layout-specific data, and copy it in */
1271 	if (k_cfg->layoutSpecificSize == 0)
1272 		return 0;
1273 
1274 	if (k_cfg->layoutSpecificSize > 10000) {
1275 	    /* sanity check */
1276 	    return EINVAL;
1277 	}
1278 
1279 	u_char *specific_buf;
1280 	specific_buf =  RF_Malloc(k_cfg->layoutSpecificSize);
1281 	if (specific_buf == NULL)
1282 		return ENOMEM;
1283 
1284 	int retcode = copyin(k_cfg->layoutSpecific, specific_buf,
1285 	    k_cfg->layoutSpecificSize);
1286 	if (retcode) {
1287 		RF_Free(specific_buf, k_cfg->layoutSpecificSize);
1288 		db1_printf(("%s: retcode=%d copyin.2\n", __func__, retcode));
1289 		return retcode;
1290 	}
1291 
1292 	k_cfg->layoutSpecific = specific_buf;
1293 	return 0;
1294 }
1295 
1296 static int
1297 rf_getConfiguration(struct raid_softc *rs, void *data, RF_Config_t **k_cfg)
1298 {
1299 	RF_Config_t *u_cfg = *((RF_Config_t **) data);
1300 
1301 	if (rs->sc_r.valid) {
1302 		/* There is a valid RAID set running on this unit! */
1303 		printf("raid%d: Device already configured!\n", rs->sc_unit);
1304 		return EINVAL;
1305 	}
1306 
1307 	/* copy-in the configuration information */
1308 	/* data points to a pointer to the configuration structure */
1309 	*k_cfg = RF_Malloc(sizeof(**k_cfg));
1310 	if (*k_cfg == NULL) {
1311 		return ENOMEM;
1312 	}
1313 	int retcode = copyin(u_cfg, *k_cfg, sizeof(RF_Config_t));
1314 	if (retcode == 0)
1315 		return 0;
1316 	RF_Free(*k_cfg, sizeof(RF_Config_t));
1317 	db1_printf(("%s: retcode=%d copyin.1\n", __func__, retcode));
1318 	rs->sc_flags |= RAIDF_SHUTDOWN;
1319 	return retcode;
1320 }
1321 
1322 int
1323 rf_construct(struct raid_softc *rs, RF_Config_t *k_cfg)
1324 {
1325 	int retcode, i;
1326 	RF_Raid_t *raidPtr = &rs->sc_r;
1327 
1328 	rs->sc_flags &= ~RAIDF_SHUTDOWN;
1329 
1330 	if ((retcode = rf_copyinspecificbuf(k_cfg)) != 0)
1331 		goto out;
1332 
1333 	/* should do some kind of sanity check on the configuration.
1334 	 * Store the sum of all the bytes in the last byte? */
1335 
1336 	/* Force nul-termination on all strings. */
1337 #define ZERO_FINAL(s)	do { s[sizeof(s) - 1] = '\0'; } while (0)
1338 	for (i = 0; i < RF_MAXCOL; i++) {
1339 		ZERO_FINAL(k_cfg->devnames[0][i]);
1340 	}
1341 	for (i = 0; i < RF_MAXSPARE; i++) {
1342 		ZERO_FINAL(k_cfg->spare_names[i]);
1343 	}
1344 	for (i = 0; i < RF_MAXDBGV; i++) {
1345 		ZERO_FINAL(k_cfg->debugVars[i]);
1346 	}
1347 #undef ZERO_FINAL
1348 
1349 	/* Check some basic limits. */
1350 	if (k_cfg->numCol >= RF_MAXCOL || k_cfg->numCol < 0) {
1351 		retcode = EINVAL;
1352 		goto out;
1353 	}
1354 	if (k_cfg->numSpare >= RF_MAXSPARE || k_cfg->numSpare < 0) {
1355 		retcode = EINVAL;
1356 		goto out;
1357 	}
1358 
1359 	/* configure the system */
1360 
1361 	/*
1362 	 * Clear the entire RAID descriptor, just to make sure
1363 	 *  there is no stale data left in the case of a
1364 	 *  reconfiguration
1365 	 */
1366 	memset(raidPtr, 0, sizeof(*raidPtr));
1367 	raidPtr->softc = rs;
1368 	raidPtr->raidid = rs->sc_unit;
1369 
1370 	retcode = rf_Configure(raidPtr, k_cfg, NULL);
1371 
1372 	if (retcode == 0) {
1373 		/* allow this many simultaneous IO's to
1374 		   this RAID device */
1375 		raidPtr->openings = RAIDOUTSTANDING;
1376 
1377 		raidinit(rs);
1378 		raid_wakeup(raidPtr);
1379 		rf_markalldirty(raidPtr);
1380 	}
1381 
1382 	/* free the buffers.  No return code here. */
1383 	if (k_cfg->layoutSpecificSize) {
1384 		RF_Free(k_cfg->layoutSpecific, k_cfg->layoutSpecificSize);
1385 	}
1386 out:
1387 	RF_Free(k_cfg, sizeof(RF_Config_t));
1388 	if (retcode) {
1389 		/*
1390 		 * If configuration failed, set sc_flags so that we
1391 		 * will detach the device when we close it.
1392 		 */
1393 		rs->sc_flags |= RAIDF_SHUTDOWN;
1394 	}
1395 	return retcode;
1396 }
1397 
#if RF_DISABLED
/*
 * RAIDFRAME_SET_COMPONENT_LABEL (currently compiled out): overwrite
 * one component's label with user-supplied contents.  Only the column
 * range is validated before the copy; per the XXX notes below, real
 * field validation would be required before enabling this.
 */
static int
rf_set_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{

	/* XXX check the label for valid stuff... */
	/* Note that some things *should not* get modified --
	   the user should be re-initing the labels instead of
	   trying to patch things.
	   */
#ifdef DEBUG
	int raidid = raidPtr->raidid;
	printf("raid%d: Got component label:\n", raidid);
	printf("raid%d: Version: %d\n", raidid, clabel->version);
	printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
	printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
	printf("raid%d: Column: %d\n", raidid, clabel->column);
	printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
	printf("raid%d: Clean: %d\n", raidid, clabel->clean);
	printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif	/* DEBUG */
	clabel->row = 0;
	int column = clabel->column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return(EINVAL);
	}

	/* XXX this isn't allowed to do anything for now :-) */

	/* XXX and before it is, we need to fill in the rest
	   of the fields!?!?!?! */
	memcpy(raidget_component_label(raidPtr, column),
	    clabel, sizeof(*clabel));
	raidflush_component_label(raidPtr, column);
	return 0;
}
#endif
1436 
/*
 * RAIDFRAME_INIT_LABELS: stamp fresh component labels on every live
 * component.  Only the serial number is taken from the user-supplied
 * label; every other field is derived from the configuration that
 * created this RAID set.  Always returns 0.
 */
static int
rf_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/*
	   we only want the serial number from
	   the above.  We get all the rest of the information
	   from the config that was used to create this RAID
	   set.
	   */

	raidPtr->serial_number = clabel->serial_number;

	for (int column = 0; column < raidPtr->numCol; column++) {
		RF_RaidDisk_t *diskPtr = &raidPtr->Disks[column];
		/* Skip dead (failed/missing) components. */
		if (RF_DEAD_DISK(diskPtr->status))
			continue;
		RF_ComponentLabel_t *ci_label = raidget_component_label(
		    raidPtr, column);
		/* Zeroing this is important. */
		memset(ci_label, 0, sizeof(*ci_label));
		raid_init_component_label(raidPtr, ci_label);
		ci_label->serial_number = raidPtr->serial_number;
		ci_label->row = 0; /* we don't pretend to support more */
		rf_component_label_set_partitionsize(ci_label,
		    diskPtr->partitionSize);
		ci_label->column = column;
		raidflush_component_label(raidPtr, column);
		/* XXXjld what about the spares? */
	}

	return 0;
}
1469 
/*
 * RAIDFRAME_REBUILD_IN_PLACE: reconstruct a component onto itself
 * (e.g. after the underlying disk was replaced at the same location).
 *
 * Returns EINVAL on a RAID 0, when a reconstruction is already
 * running, for an out-of-range column, when some other component has
 * failed, or when the component is spared; ENOMEM if the request
 * cannot be allocated; otherwise the result of creating the
 * reconstruction thread.
 */
static int
rf_rebuild_in_place(RF_Raid_t *raidPtr, RF_SingleComponent_t *componentPtr)
{

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (raidPtr->recon_in_progress == 1) {
		/* a reconstruct is already in progress! */
		return EINVAL;
	}

	/* Work on a local copy; the user's buffer is not trusted. */
	RF_SingleComponent_t component;
	memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
	component.row = 0; /* we don't support any more */
	int column = component.column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return EINVAL;
	}

	rf_lock_mutex2(raidPtr->mutex);
	if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* XXX 0 above shouldn't be constant!!! */
		/* some component other than this has failed.
		   Let's not make things worse than they already
		   are... */
		printf("raid%d: Unable to reconstruct to disk at:\n",
		       raidPtr->raidid);
		printf("raid%d:     Col: %d   Too many failures.\n",
		       raidPtr->raidid, column);
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_reconstructing) {
		printf("raid%d: Unable to reconstruct to disk at:\n",
		       raidPtr->raidid);
		printf("raid%d:    Col: %d   "
		    "Reconstruction already occurring!\n",
		    raidPtr->raidid, column);

		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_spared) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	rf_unlock_mutex2(raidPtr->mutex);

	/* Hand the request to a kernel thread; it outlives this ioctl. */
	struct rf_recon_req_internal *rrint;
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return ENOMEM;

	rrint->col = column;
	rrint->raidPtr = raidPtr;

	return RF_CREATE_THREAD(raidPtr->recon_thread,
	    rf_ReconstructInPlaceThread, rrint, "raid_reconip");
}
1537 
1538 static int
1539 rf_check_recon_status(RF_Raid_t *raidPtr, int *data)
1540 {
1541 	/*
1542 	 * This makes no sense on a RAID 0, or if we are not reconstructing
1543 	 * so tell the user it's done.
1544 	 */
1545 	if (raidPtr->Layout.map->faultsTolerated == 0 ||
1546 	    raidPtr->status != rf_rs_reconstructing) {
1547 		*data = 100;
1548 		return 0;
1549 	}
1550 	if (raidPtr->reconControl->numRUsTotal == 0) {
1551 		*data = 0;
1552 		return 0;
1553 	}
1554 	*data = (raidPtr->reconControl->numRUsComplete * 100
1555 	    / raidPtr->reconControl->numRUsTotal);
1556 	return 0;
1557 }
1558 
1559 /*
1560  * Copy a RF_SingleComponent_t from 'data', ensuring nul-termination
1561  * on the component_name[] array.
1562  */
1563 static void
1564 rf_copy_single_component(RF_SingleComponent_t *component, void *data)
1565 {
1566 
1567 	memcpy(component, data, sizeof *component);
1568 	component->component_name[sizeof(component->component_name) - 1] = '\0';
1569 }
1570 
1571 static int
1572 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
1573 {
1574 	int     unit = raidunit(dev);
1575 	int     part, pmask;
1576 	struct raid_softc *rs;
1577 	struct dk_softc *dksc;
1578 	RF_Config_t *k_cfg;
1579 	RF_Raid_t *raidPtr;
1580 	RF_AccTotals_t *totals;
1581 	RF_SingleComponent_t component;
1582 	RF_DeviceConfig_t *d_cfg, *ucfgp;
1583 	int retcode = 0;
1584 	int column;
1585 	RF_ComponentLabel_t *clabel;
1586 	int d;
1587 
1588 	if ((rs = raidget(unit, false)) == NULL)
1589 		return ENXIO;
1590 
1591 	dksc = &rs->sc_dksc;
1592 	raidPtr = &rs->sc_r;
1593 
1594 	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
1595 	    (int) DISKPART(dev), (int) unit, cmd));
1596 
1597 	/* Only CONFIGURE and RESCAN can be done without the RAID being initialized. */
1598 	switch (cmd) {
1599 	case RAIDFRAME_CONFIGURE:
1600 	case RAIDFRAME_RESCAN:
1601 		break;
1602 	default:
1603 		if (!rf_inited(rs))
1604 			return ENXIO;
1605 	}
1606 
1607 	switch (cmd) {
1608 		/* configure the system */
1609 	case RAIDFRAME_CONFIGURE:
1610 		if ((retcode = rf_getConfiguration(rs, data, &k_cfg)) != 0)
1611 			return retcode;
1612 		return rf_construct(rs, k_cfg);
1613 
1614 		/* shutdown the system */
1615 	case RAIDFRAME_SHUTDOWN:
1616 
1617 		part = DISKPART(dev);
1618 		pmask = (1 << part);
1619 
1620 		if ((retcode = raidlock(rs)) != 0)
1621 			return retcode;
1622 
1623 		if (DK_BUSY(dksc, pmask) ||
1624 		    raidPtr->recon_in_progress != 0 ||
1625 		    raidPtr->parity_rewrite_in_progress != 0)
1626 			retcode = EBUSY;
1627 		else {
1628 			/* detach and free on close */
1629 			rs->sc_flags |= RAIDF_SHUTDOWN;
1630 			retcode = 0;
1631 		}
1632 
1633 		raidunlock(rs);
1634 
1635 		return retcode;
1636 	case RAIDFRAME_GET_COMPONENT_LABEL:
1637 		return rf_get_component_label(raidPtr, data);
1638 
1639 #if RF_DISABLED
1640 	case RAIDFRAME_SET_COMPONENT_LABEL:
1641 		return rf_set_component_label(raidPtr, data);
1642 #endif
1643 
1644 	case RAIDFRAME_INIT_LABELS:
1645 		return rf_init_component_label(raidPtr, data);
1646 
1647 	case RAIDFRAME_SET_AUTOCONFIG:
1648 		d = rf_set_autoconfig(raidPtr, *(int *) data);
1649 		printf("raid%d: New autoconfig value is: %d\n",
1650 		       raidPtr->raidid, d);
1651 		*(int *) data = d;
1652 		return retcode;
1653 
1654 	case RAIDFRAME_SET_ROOT:
1655 		d = rf_set_rootpartition(raidPtr, *(int *) data);
1656 		printf("raid%d: New rootpartition value is: %d\n",
1657 		       raidPtr->raidid, d);
1658 		*(int *) data = d;
1659 		return retcode;
1660 
1661 		/* initialize all parity */
1662 	case RAIDFRAME_REWRITEPARITY:
1663 
1664 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1665 			/* Parity for RAID 0 is trivially correct */
1666 			raidPtr->parity_good = RF_RAID_CLEAN;
1667 			return 0;
1668 		}
1669 
1670 		if (raidPtr->parity_rewrite_in_progress == 1) {
1671 			/* Re-write is already in progress! */
1672 			return EINVAL;
1673 		}
1674 
1675 		return RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
1676 		    rf_RewriteParityThread, raidPtr,"raid_parity");
1677 
1678 	case RAIDFRAME_ADD_HOT_SPARE:
1679 		rf_copy_single_component(&component, data);
1680 		return rf_add_hot_spare(raidPtr, &component);
1681 
1682 	/* Remove a non hot-spare component, never implemented in userland */
1683 	case RAIDFRAME_DELETE_COMPONENT:
1684 		rf_copy_single_component(&component, data);
1685 		return rf_delete_component(raidPtr, &component);
1686 
1687 	case RAIDFRAME_REMOVE_COMPONENT:
1688 		rf_copy_single_component(&component, data);
1689 		return rf_remove_component(raidPtr, &component);
1690 
1691 	case RAIDFRAME_INCORPORATE_HOT_SPARE:
1692 		rf_copy_single_component(&component, data);
1693 		return rf_incorporate_hot_spare(raidPtr, &component);
1694 
1695 	case RAIDFRAME_REBUILD_IN_PLACE:
1696 		return rf_rebuild_in_place(raidPtr, data);
1697 
1698 	case RAIDFRAME_GET_INFO:
1699 		ucfgp = *(RF_DeviceConfig_t **)data;
1700 		d_cfg = RF_Malloc(sizeof(*d_cfg));
1701 		if (d_cfg == NULL)
1702 			return ENOMEM;
1703 		retcode = rf_get_info(raidPtr, d_cfg);
1704 		if (retcode == 0) {
1705 			retcode = copyout(d_cfg, ucfgp, sizeof(*d_cfg));
1706 		}
1707 		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
1708 		return retcode;
1709 
1710 	case RAIDFRAME_CHECK_PARITY:
1711 		*(int *) data = raidPtr->parity_good;
1712 		return 0;
1713 
1714 	case RAIDFRAME_PARITYMAP_STATUS:
1715 		if (rf_paritymap_ineligible(raidPtr))
1716 			return EINVAL;
1717 		rf_paritymap_status(raidPtr->parity_map, data);
1718 		return 0;
1719 
1720 	case RAIDFRAME_PARITYMAP_SET_PARAMS:
1721 		if (rf_paritymap_ineligible(raidPtr))
1722 			return EINVAL;
1723 		if (raidPtr->parity_map == NULL)
1724 			return ENOENT; /* ??? */
1725 		if (rf_paritymap_set_params(raidPtr->parity_map, data, 1) != 0)
1726 			return EINVAL;
1727 		return 0;
1728 
1729 	case RAIDFRAME_PARITYMAP_GET_DISABLE:
1730 		if (rf_paritymap_ineligible(raidPtr))
1731 			return EINVAL;
1732 		*(int *) data = rf_paritymap_get_disable(raidPtr);
1733 		return 0;
1734 
1735 	case RAIDFRAME_PARITYMAP_SET_DISABLE:
1736 		if (rf_paritymap_ineligible(raidPtr))
1737 			return EINVAL;
1738 		rf_paritymap_set_disable(raidPtr, *(int *)data);
1739 		/* XXX should errors be passed up? */
1740 		return 0;
1741 
1742 	case RAIDFRAME_RESCAN:
1743 		return rf_rescan();
1744 
1745 	case RAIDFRAME_RESET_ACCTOTALS:
1746 		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
1747 		return 0;
1748 
1749 	case RAIDFRAME_GET_ACCTOTALS:
1750 		totals = (RF_AccTotals_t *) data;
1751 		*totals = raidPtr->acc_totals;
1752 		return 0;
1753 
1754 	case RAIDFRAME_KEEP_ACCTOTALS:
1755 		raidPtr->keep_acc_totals = *(int *)data;
1756 		return 0;
1757 
1758 	case RAIDFRAME_GET_SIZE:
1759 		*(int *) data = raidPtr->totalSectors;
1760 		return 0;
1761 
1762 	case RAIDFRAME_FAIL_DISK:
1763 		return rf_fail_disk(raidPtr, data);
1764 
1765 		/* copyback is no longer supported */
1766 	case RAIDFRAME_COPYBACK:
1767 		return EINVAL;
1768 
1769 		/* return the percentage completion of reconstruction */
1770 	case RAIDFRAME_CHECK_RECON_STATUS:
1771 		return rf_check_recon_status(raidPtr, data);
1772 
1773 	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
1774 		rf_check_recon_status_ext(raidPtr, data);
1775 		return 0;
1776 
1777 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
1778 		if (raidPtr->Layout.map->faultsTolerated == 0) {
1779 			/* This makes no sense on a RAID 0, so tell the
1780 			   user it's done. */
1781 			*(int *) data = 100;
1782 			return 0;
1783 		}
1784 		if (raidPtr->parity_rewrite_in_progress == 1) {
1785 			*(int *) data = 100 *
1786 				raidPtr->parity_rewrite_stripes_done /
1787 				raidPtr->Layout.numStripe;
1788 		} else {
1789 			*(int *) data = 100;
1790 		}
1791 		return 0;
1792 
1793 	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
1794 		rf_check_parityrewrite_status_ext(raidPtr, data);
1795 		return 0;
1796 
1797 	case RAIDFRAME_CHECK_COPYBACK_STATUS:
1798 		*(int *) data = 100;
1799 		return 0;
1800 
1801 	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
1802 		rf_check_copyback_status_ext(raidPtr, data);
1803 		return 0;
1804 
1805 	case RAIDFRAME_SET_LAST_UNIT:
1806 		for (column = 0; column < raidPtr->numCol; column++)
1807 			if (raidPtr->Disks[column].status != rf_ds_optimal)
1808 				return EBUSY;
1809 
1810 		for (column = 0; column < raidPtr->numCol; column++) {
1811 			clabel = raidget_component_label(raidPtr, column);
1812 			clabel->last_unit = *(int *)data;
1813 			raidflush_component_label(raidPtr, column);
1814 		}
1815 		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
1816 		return 0;
1817 
1818 		/* the sparetable daemon calls this to wait for the kernel to
1819 		 * need a spare table. this ioctl does not return until a
1820 		 * spare table is needed. XXX -- calling mpsleep here in the
1821 		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
1822 		 * -- I should either compute the spare table in the kernel,
1823 		 * or have a different -- XXX XXX -- interface (a different
1824 		 * character device) for delivering the table     -- XXX */
1825 #if RF_DISABLED
1826 	case RAIDFRAME_SPARET_WAIT:
1827 		rf_lock_mutex2(rf_sparet_wait_mutex);
1828 		while (!rf_sparet_wait_queue)
1829 			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
1830 		RF_SparetWait_t *waitreq = rf_sparet_wait_queue;
1831 		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
1832 		rf_unlock_mutex2(rf_sparet_wait_mutex);
1833 
1834 		/* structure assignment */
1835 		*((RF_SparetWait_t *) data) = *waitreq;
1836 
1837 		RF_Free(waitreq, sizeof(*waitreq));
1838 		return 0;
1839 
1840 		/* wakes up a process waiting on SPARET_WAIT and puts an error
1841 		 * code in it that will cause the dameon to exit */
1842 	case RAIDFRAME_ABORT_SPARET_WAIT:
1843 		waitreq = RF_Malloc(sizeof(*waitreq));
1844 		waitreq->fcol = -1;
1845 		rf_lock_mutex2(rf_sparet_wait_mutex);
1846 		waitreq->next = rf_sparet_wait_queue;
1847 		rf_sparet_wait_queue = waitreq;
1848 		rf_broadcast_cond2(rf_sparet_wait_cv);
1849 		rf_unlock_mutex2(rf_sparet_wait_mutex);
1850 		return 0;
1851 
1852 		/* used by the spare table daemon to deliver a spare table
1853 		 * into the kernel */
1854 	case RAIDFRAME_SEND_SPARET:
1855 
1856 		/* install the spare table */
1857 		retcode = rf_SetSpareTable(raidPtr, *(void **) data);
1858 
1859 		/* respond to the requestor.  the return status of the spare
1860 		 * table installation is passed in the "fcol" field */
1861 		waitred = RF_Malloc(sizeof(*waitreq));
1862 		waitreq->fcol = retcode;
1863 		rf_lock_mutex2(rf_sparet_wait_mutex);
1864 		waitreq->next = rf_sparet_resp_queue;
1865 		rf_sparet_resp_queue = waitreq;
1866 		rf_broadcast_cond2(rf_sparet_resp_cv);
1867 		rf_unlock_mutex2(rf_sparet_wait_mutex);
1868 
1869 		return retcode;
1870 #endif
1871 	default:
1872 		/*
1873 		 * Don't bother trying to load compat modules
1874 		 * if it is not our ioctl. This is more efficient
1875 		 * and makes rump tests not depend on compat code
1876 		 */
1877 		if (IOCGROUP(cmd) != 'r')
1878 			break;
1879 #ifdef _LP64
1880 		if ((l->l_proc->p_flag & PK_32) != 0) {
1881 			module_autoload("compat_netbsd32_raid",
1882 			    MODULE_CLASS_EXEC);
1883 			MODULE_HOOK_CALL(raidframe_netbsd32_ioctl_hook,
1884 			    (rs, cmd, data), enosys(), retcode);
1885 			if (retcode != EPASSTHROUGH)
1886 				return retcode;
1887 		}
1888 #endif
1889 		module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
1890 		MODULE_HOOK_CALL(raidframe_ioctl_80_hook,
1891 		    (rs, cmd, data), enosys(), retcode);
1892 		if (retcode != EPASSTHROUGH)
1893 			return retcode;
1894 
1895 		module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
1896 		MODULE_HOOK_CALL(raidframe_ioctl_50_hook,
1897 		    (rs, cmd, data), enosys(), retcode);
1898 		if (retcode != EPASSTHROUGH)
1899 			return retcode;
1900 		break; /* fall through to the os-specific code below */
1901 
1902 	}
1903 
1904 	if (!raidPtr->valid)
1905 		return EINVAL;
1906 
1907 	/*
1908 	 * Add support for "regular" device ioctls here.
1909 	 */
1910 
1911 	switch (cmd) {
1912 	case DIOCGCACHE:
1913 		retcode = rf_get_component_caches(raidPtr, (int *)data);
1914 		break;
1915 
1916 	case DIOCCACHESYNC:
1917 		retcode = rf_sync_component_caches(raidPtr, *(int *)data);
1918 		break;
1919 
1920 	default:
1921 		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
1922 		break;
1923 	}
1924 
1925 	return retcode;
1926 
1927 }
1928 
1929 
1930 /* raidinit -- complete the rest of the initialization for the
1931    RAIDframe device.  */
1932 
1933 
1934 static void
1935 raidinit(struct raid_softc *rs)
1936 {
1937 	cfdata_t cf;
1938 	unsigned int unit;
1939 	struct dk_softc *dksc = &rs->sc_dksc;
1940 	RF_Raid_t *raidPtr = &rs->sc_r;
1941 	device_t dev;
1942 
1943 	unit = raidPtr->raidid;
1944 
1945 	/* XXX doesn't check bounds. */
1946 	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);
1947 
1948 	/* attach the pseudo device */
1949 	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
1950 	cf->cf_name = raid_cd.cd_name;
1951 	cf->cf_atname = raid_cd.cd_name;
1952 	cf->cf_unit = unit;
1953 	cf->cf_fstate = FSTATE_STAR;
1954 
1955 	dev = config_attach_pseudo(cf);
1956 	if (dev == NULL) {
1957 		printf("raid%d: config_attach_pseudo failed\n",
1958 		    raidPtr->raidid);
1959 		free(cf, M_RAIDFRAME);
1960 		return;
1961 	}
1962 
1963 	/* provide a backpointer to the real softc */
1964 	raidsoftc(dev) = rs;
1965 
1966 	/* disk_attach actually creates space for the CPU disklabel, among
1967 	 * other things, so it's critical to call this *BEFORE* we try putzing
1968 	 * with disklabels. */
1969 	dk_init(dksc, dev, DKTYPE_RAID);
1970 	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);
1971 
1972 	/* XXX There may be a weird interaction here between this, and
1973 	 * protectedSectors, as used in RAIDframe.  */
1974 
1975 	rs->sc_size = raidPtr->totalSectors;
1976 
1977 	/* Attach dk and disk subsystems */
1978 	dk_attach(dksc);
1979 	disk_attach(&dksc->sc_dkdev);
1980 	rf_set_geometry(rs, raidPtr);
1981 
1982 	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);
1983 
1984 	/* mark unit as usuable */
1985 	rs->sc_flags |= RAIDF_INITED;
1986 
1987 	dkwedge_discover(&dksc->sc_dkdev);
1988 }
1989 
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
/* wake up the daemon & tell it to get us a spare table
 * XXX
 * the entries in the queues should be tagged with the raidPtr
 * so that in the extremely rare case that two recons happen at once,
 * we know for which device were requesting a spare table
 * XXX
 *
 * XXX This code is not currently used. GO
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int     retcode;

	/* Post the request on the wait queue and wake the daemon. */
	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	/* Pop the daemon's response off the response queue. */
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	/* The daemon's status is returned in the "fcol" field. */
	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return retcode;
}
#endif
2024 
2025 /* a wrapper around rf_DoAccess that extracts appropriate info from the
2026  * bp & passes it down.
2027  * any calls originating in the kernel must use non-blocking I/O
2028  * do some extra sanity checking to return "appropriate" error values for
2029  * certain conditions (to make some standard utilities work)
2030  *
2031  * Formerly known as: rf_DoAccessKernel
2032  */
2033 void
2034 raidstart(RF_Raid_t *raidPtr)
2035 {
2036 	struct raid_softc *rs;
2037 	struct dk_softc *dksc;
2038 
2039 	rs = raidPtr->softc;
2040 	dksc = &rs->sc_dksc;
2041 	/* quick check to see if anything has died recently */
2042 	rf_lock_mutex2(raidPtr->mutex);
2043 	if (raidPtr->numNewFailures > 0) {
2044 		rf_unlock_mutex2(raidPtr->mutex);
2045 		rf_update_component_labels(raidPtr,
2046 					   RF_NORMAL_COMPONENT_UPDATE);
2047 		rf_lock_mutex2(raidPtr->mutex);
2048 		raidPtr->numNewFailures--;
2049 	}
2050 	rf_unlock_mutex2(raidPtr->mutex);
2051 
2052 	if ((rs->sc_flags & RAIDF_INITED) == 0) {
2053 		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
2054 		return;
2055 	}
2056 
2057 	dk_start(dksc, NULL);
2058 }
2059 
/*
 * Translate a struct buf into a RAIDframe access and dispatch it via
 * rf_DoAccess() with non-blocking I/O.  Returns EAGAIN when no
 * openings are available, ENOSPC when the request overruns the end of
 * the set (or overflows, or is not sector-aligned), otherwise the
 * rf_DoAccess() result.
 */
static int
raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	daddr_t blocknum;
	int rc;

	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->openings == 0) {
		/* No I/O slots free; caller will retry later. */
		rf_unlock_mutex2(raidPtr->mutex);
		return EAGAIN;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	blocknum = bp->b_rawblkno;

	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
		    (int) blocknum));

	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

	/* *THIS* is where we adjust what block we're going to...
	 * but DO NOT TOUCH bp->b_blkno!!! */
	raid_addr = blocknum;

	/* pb accounts for a trailing partial sector, if any. */
	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
	sum = raid_addr + num_blocks + pb;
	/* NOTE(review): "1 ||" forces this branch on; debug output is
	 * gated by db1_printf itself.  Confirm this is intentional. */
	if (1 || rf_debugKernelAccess) {
		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
			    (int) raid_addr, (int) sum, (int) num_blocks,
			    (int) pb, (int) bp->b_resid));
	}
	/* The (sum < ...) comparisons catch arithmetic wrap-around. */
	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
	    || (sum < num_blocks) || (sum < pb)) {
		rc = ENOSPC;
		goto done;
	}
	/*
	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
	 */

	/* Reject requests that are not a whole number of sectors. */
	if (bp->b_bcount & raidPtr->sectorMask) {
		rc = ENOSPC;
		goto done;
	}
	db1_printf(("Calling DoAccess..\n"));


	/* Consume one opening; raiddone() gives it back. */
	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings--;
	rf_unlock_mutex2(raidPtr->mutex);

	/* don't ever condition on bp->b_flags & B_WRITE.
	 * always condition on B_READ instead */

	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
			 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
			 raid_addr, num_blocks,
			 bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

done:
	return rc;
}
2126 
2127 /* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */
2128 
/*
 * Dispatch one queued request to the underlying component device.
 * Called with the disk queue locked; the lock is dropped around
 * bdev_strategy() because that call can block (at least with backing
 * SCSI devices) and retaken before returning.  Completion is reported
 * asynchronously through KernelWakeupFunc().  Always returns 0.
 */
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf("%s: WAKEUP CALLED\n", __func__);
		queue->numOutstanding++;

		/* NOP goes straight to the completion path */
		bp->b_flags = 0;
		bp->b_private = req;

		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* set up the buf for this transfer; KernelWakeupFunc
		 * fires from biodone when the component finishes */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return 0;
}
/*
 * Completion callback for component I/O issued by rf_DispatchKernelIO().
 * Records any error in the request; on error marks the component failed
 * -- but only if it is still optimal or an in-use spare, and only if the
 * set can tolerate another failure.  The request is then moved to the
 * raidPtr->iodone queue and the raidio thread is signalled to process it.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* req was stashed in b_private by InitBP()/the NOP path */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			/* numNewFailures triggers a label update in
			 * raidstart() */
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}
2271 
2272 
2273 /*
2274  * initialize a buf structure for doing an I/O in the kernel.
2275  */
2276 static void
2277 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
2278        RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
2279        void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector)
2280 {
2281 	bp->b_flags = rw_flag | (bp->b_flags & rf_b_pass);
2282 	bp->b_oflags = 0;
2283 	bp->b_cflags = 0;
2284 	bp->b_bcount = numSect << logBytesPerSector;
2285 	bp->b_bufsize = bp->b_bcount;
2286 	bp->b_error = 0;
2287 	bp->b_dev = dev;
2288 	bp->b_data = bf;
2289 	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
2290 	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
2291 	if (bp->b_bcount == 0) {
2292 		panic("bp->b_bcount is zero in InitBP!!");
2293 	}
2294 	bp->b_iodone = cbFunc;
2295 	bp->b_private = cbArg;
2296 }
2297 
2298 /*
2299  * Wait interruptibly for an exclusive lock.
2300  *
2301  * XXX
2302  * Several drivers do this; it should be abstracted and made MP-safe.
2303  * (Hmm... where have we seen this warning before :->  GO )
2304  */
2305 static int
2306 raidlock(struct raid_softc *rs)
2307 {
2308 	int     error;
2309 
2310 	error = 0;
2311 	mutex_enter(&rs->sc_mutex);
2312 	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
2313 		rs->sc_flags |= RAIDF_WANTED;
2314 		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
2315 		if (error != 0)
2316 			goto done;
2317 	}
2318 	rs->sc_flags |= RAIDF_LOCKED;
2319 done:
2320 	mutex_exit(&rs->sc_mutex);
2321 	return error;
2322 }
2323 /*
2324  * Unlock and wake up any waiters.
2325  */
2326 static void
2327 raidunlock(struct raid_softc *rs)
2328 {
2329 
2330 	mutex_enter(&rs->sc_mutex);
2331 	rs->sc_flags &= ~RAIDF_LOCKED;
2332 	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
2333 		rs->sc_flags &= ~RAIDF_WANTED;
2334 		cv_broadcast(&rs->sc_cv);
2335 	}
2336 	mutex_exit(&rs->sc_mutex);
2337 }
2338 
2339 
2340 #define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
2341 #define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
2342 #define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE
2343 
2344 static daddr_t
2345 rf_component_info_offset(void)
2346 {
2347 
2348 	return RF_COMPONENT_INFO_OFFSET;
2349 }
2350 
2351 static daddr_t
2352 rf_component_info_size(unsigned secsize)
2353 {
2354 	daddr_t info_size;
2355 
2356 	KASSERT(secsize);
2357 	if (secsize > RF_COMPONENT_INFO_SIZE)
2358 		info_size = secsize;
2359 	else
2360 		info_size = RF_COMPONENT_INFO_SIZE;
2361 
2362 	return info_size;
2363 }
2364 
2365 static daddr_t
2366 rf_parity_map_offset(RF_Raid_t *raidPtr)
2367 {
2368 	daddr_t map_offset;
2369 
2370 	KASSERT(raidPtr->bytesPerSector);
2371 	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
2372 		map_offset = raidPtr->bytesPerSector;
2373 	else
2374 		map_offset = RF_COMPONENT_INFO_SIZE;
2375 	map_offset += rf_component_info_offset();
2376 
2377 	return map_offset;
2378 }
2379 
2380 static daddr_t
2381 rf_parity_map_size(RF_Raid_t *raidPtr)
2382 {
2383 	daddr_t map_size;
2384 
2385 	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
2386 		map_size = raidPtr->bytesPerSector;
2387 	else
2388 		map_size = RF_PARITY_MAP_SIZE;
2389 
2390 	return map_size;
2391 }
2392 
2393 int
2394 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
2395 {
2396 	RF_ComponentLabel_t *clabel;
2397 
2398 	clabel = raidget_component_label(raidPtr, col);
2399 	clabel->clean = RF_RAID_CLEAN;
2400 	raidflush_component_label(raidPtr, col);
2401 	return(0);
2402 }
2403 
2404 
2405 int
2406 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
2407 {
2408 	RF_ComponentLabel_t *clabel;
2409 
2410 	clabel = raidget_component_label(raidPtr, col);
2411 	clabel->clean = RF_RAID_DIRTY;
2412 	raidflush_component_label(raidPtr, col);
2413 	return(0);
2414 }
2415 
/*
 * Re-read column col's component label from disk into the in-core
 * copy kept in raid_cinfo[col].ci_label.  Returns 0 or an error from
 * raidread_component_label().
 */
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);

	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}
2426 
/*
 * Return a pointer to the in-core component label for column col;
 * callers modify it in place and then write it out with
 * raidflush_component_label().
 */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}
2432 
/*
 * Stamp the in-core label for column col with the current mod counter
 * (and, unless parity maps are compiled out, the matching parity-map
 * mod count) and write it to the component.  Returns 0 or an error
 * from raidwrite_component_label().
 */
int
raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *label;

	label = &raidPtr->raid_cinfo[col].ci_label;
	label->mod_counter = raidPtr->mod_counter;
#ifndef RF_NO_PARITY_MAP
	label->parity_map_modcount = label->mod_counter;
#endif
	return raidwrite_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp, label);
}
2447 
2448 /*
2449  * Swap the label endianness.
2450  *
2451  * Everything in the component label is 4-byte-swapped except the version,
2452  * which is kept in the byte-swapped version at all times, and indicates
2453  * for the writer that a swap is necessary.
2454  *
2455  * For reads it is expected that out_label == clabel, but writes expect
2456  * separate labels so only the re-swapped label is written out to disk,
2457  * leaving the swapped-except-version internally.
2458  *
2459  * Only support swapping label version 2.
2460  */
static void
rf_swap_label(RF_ComponentLabel_t *clabel, RF_ComponentLabel_t *out_label)
{
	int	*in, *out, *in_last;

	/* Only version-2 labels (stored byte-swapped) are supported. */
	KASSERT(clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION));

	/* Don't swap the label, but do copy it. */
	out_label->version = clabel->version;

	/* Walk the label as raw 32-bit words from serial_number up to
	 * (but not including) &future_use2[42].
	 * NOTE(review): &future_use2[42] is assumed to be one past the
	 * last word of the v2 label layout -- confirm against the
	 * RF_ComponentLabel_t declaration if the struct changes. */
	in = &clabel->serial_number;
	in_last = &clabel->future_use2[42];
	out = &out_label->serial_number;

	for (; in < in_last; in++, out++)
		*out = bswap32(*in);
}
2478 
2479 static int
2480 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2481     RF_ComponentLabel_t *clabel)
2482 {
2483 	int error;
2484 
2485 	error = raidread_component_area(dev, b_vp, clabel,
2486 	    sizeof(RF_ComponentLabel_t),
2487 	    rf_component_info_offset(),
2488 	    rf_component_info_size(secsize));
2489 
2490 	if (error == 0 &&
2491 	    clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
2492 		rf_swap_label(clabel, clabel);
2493 	}
2494 
2495 	return error;
2496 }
2497 
/*
 * Read dsize bytes from the reserved component area at byte `offset'
 * on dev, copying the first msize bytes into data.  Uses a throwaway
 * buffer from geteblk().  Returns 0, EINVAL for a component with no
 * vnode, or an error from biowait().
 */
/* ARGSUSED */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize)
{
	struct buf *bp;
	int error;

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
 	bp->b_resid = dsize;

	bdev_strategy(bp);
	error = biowait(bp);

	/* only msize bytes of the dsize-byte area are meaningful */
	if (!error) {
		memcpy(data, bp->b_data, msize);
	}

	brelse(bp, 0);
	return(error);
}
2535 
2536 static int
2537 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
2538     RF_ComponentLabel_t *clabel)
2539 {
2540 	RF_ComponentLabel_t *clabel_write = clabel;
2541 	RF_ComponentLabel_t lclabel;
2542 	int error;
2543 
2544 	if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) {
2545 		clabel_write = &lclabel;
2546 		rf_swap_label(clabel, clabel_write);
2547 	}
2548 	error = raidwrite_component_area(dev, b_vp, clabel_write,
2549 	    sizeof(RF_ComponentLabel_t),
2550 	    rf_component_info_offset(),
2551 	    rf_component_info_size(secsize));
2552 
2553 	return error;
2554 }
2555 
2556 /* ARGSUSED */
2557 static int
2558 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
2559     size_t msize, daddr_t offset, daddr_t dsize)
2560 {
2561 	struct buf *bp;
2562 	int error;
2563 
2564 	/* get a block of the appropriate size... */
2565 	bp = geteblk((int)dsize);
2566 	bp->b_dev = dev;
2567 
2568 	/* get our ducks in a row for the write */
2569 	bp->b_blkno = offset / DEV_BSIZE;
2570 	bp->b_bcount = dsize;
2571 	bp->b_flags |= B_WRITE;
2572  	bp->b_resid = dsize;
2573 
2574 	memset(bp->b_data, 0, dsize);
2575 	memcpy(bp->b_data, data, msize);
2576 
2577 	bdev_strategy(bp);
2578 	error = biowait(bp);
2579 	brelse(bp, 0);
2580 	if (error) {
2581 #if 1
2582 		printf("Failed to write RAID component info!\n");
2583 #endif
2584 	}
2585 
2586 	return(error);
2587 }
2588 
2589 void
2590 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2591 {
2592 	int c;
2593 
2594 	for (c = 0; c < raidPtr->numCol; c++) {
2595 		/* Skip dead disks. */
2596 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2597 			continue;
2598 		/* XXXjld: what if an error occurs here? */
2599 		raidwrite_component_area(raidPtr->Disks[c].dev,
2600 		    raidPtr->raid_cinfo[c].ci_vp, map,
2601 		    RF_PARITYMAP_NBYTE,
2602 		    rf_parity_map_offset(raidPtr),
2603 		    rf_parity_map_size(raidPtr));
2604 	}
2605 }
2606 
2607 void
2608 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
2609 {
2610 	struct rf_paritymap_ondisk tmp;
2611 	int c,first;
2612 
2613 	first=1;
2614 	for (c = 0; c < raidPtr->numCol; c++) {
2615 		/* Skip dead disks. */
2616 		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
2617 			continue;
2618 		raidread_component_area(raidPtr->Disks[c].dev,
2619 		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
2620 		    RF_PARITYMAP_NBYTE,
2621 		    rf_parity_map_offset(raidPtr),
2622 		    rf_parity_map_size(raidPtr));
2623 		if (first) {
2624 			memcpy(map, &tmp, sizeof(*map));
2625 			first = 0;
2626 		} else {
2627 			rf_paritymap_merge(map, &tmp);
2628 		}
2629 	}
2630 }
2631 
/*
 * Bump the mod counter and mark the component label of every usable
 * component -- and of every in-use spare -- dirty, so a later unclean
 * shutdown can be detected.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for (c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;

		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which column this spare stands in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}
2692 
2693 
/*
 * Refresh the component labels of all optimal components and in-use
 * spares: bump the mod counter, record status and (unless the unit
 * number changed) the current unit, and write each label out.  When
 * final == RF_FINAL_COMPONENT_UPDATE and the parity is known clean,
 * each label's clean bit is set as well.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for (c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;

		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which column this spare stands in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}
2772 
2773 void
2774 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
2775 {
2776 
2777 	if (vp != NULL) {
2778 		if (auto_configured == 1) {
2779 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2780 			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
2781 			vput(vp);
2782 
2783 		} else {
2784 			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
2785 		}
2786 	}
2787 }
2788 
2789 
2790 void
2791 rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
2792 {
2793 	int r,c;
2794 	struct vnode *vp;
2795 	int acd;
2796 
2797 
2798 	/* We take this opportunity to close the vnodes like we should.. */
2799 
2800 	for (c = 0; c < raidPtr->numCol; c++) {
2801 		vp = raidPtr->raid_cinfo[c].ci_vp;
2802 		acd = raidPtr->Disks[c].auto_configured;
2803 		rf_close_component(raidPtr, vp, acd);
2804 		raidPtr->raid_cinfo[c].ci_vp = NULL;
2805 		raidPtr->Disks[c].auto_configured = 0;
2806 	}
2807 
2808 	for (r = 0; r < raidPtr->numSpare; r++) {
2809 		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
2810 		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
2811 		rf_close_component(raidPtr, vp, acd);
2812 		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
2813 		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
2814 	}
2815 }
2816 
2817 
/*
 * Kernel thread body: fail component req->col (optionally initiating
 * reconstruction to a spare, and optionally forcing it), then exit.
 * The request structure is freed here.
 */
static void
rf_ReconThread(struct rf_recon_req_internal *req)
{
	int     s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	/* Honor a forced-reconstruction request for the duration. */
	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
		raidPtr->forceRecon = 1;
	}

	/* second argument: start reconstruction iff RF_FDFLAGS_RECON */
	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
		raidPtr->forceRecon = 0;
	}

	/* ownership of req passed to this thread; release it */
	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2847 
/*
 * Kernel thread body: rewrite all parity on the set; on success mark
 * the in-core parity state clean.  Anyone waiting in shutdown for the
 * rewrite to finish is woken before the thread exits.
 */
static void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		rf_lock_mutex2(raidPtr->rad_lock);
		cv_broadcast(&raidPtr->parity_rewrite_cv);
		rf_unlock_mutex2(raidPtr->rad_lock);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2880 
/*
 * Kernel thread body: reconstruct component req->col in place
 * (optionally forcing the reconstruction), then exit.  The request
 * structure is freed here.
 */
static void
rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;

	/* Honor a forced-reconstruction request for the duration. */
	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
		raidPtr->forceRecon = 1;
	}

	rf_ReconstructInPlace(raidPtr, req->col);

	if (req->flags & RF_FDFLAGS_RECON_FORCE) {
		raidPtr->forceRecon = 0;
	}

	/* ownership of req passed to this thread; release it */
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
2908 
/*
 * Try to read a component label from (dev, vp).  If the label reads
 * back and looks plausible for a device of this size, allocate an
 * RF_AutoConfig_t, link it at the head of ac_list and retain both the
 * vnode and the label; otherwise free the label and close/release the
 * vnode.  Returns the (possibly extended) list head.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_WAITOK);

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
				cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
				M_WAITOK);
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			/* ac now owns clabel and vp */
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}
2950 
/*
 * Scan every disk-class device in the system for RAIDframe component
 * labels, building a list of RF_AutoConfig_t entries for the
 * auto-configuration code.  Two passes are made: wedges (dk) first,
 * then all other disks, so a wedge covering a disk is preferred over
 * that disk's raw partition.
 */
static RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		     dv = deviter_next(&di)) {

			/* we are only interested in disks */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CDs. */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md. */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* we don't care about spiflash */
			if (device_is_a(dv, "spiflash")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* only wedges typed as RAIDframe count */
				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* rf_get_component takes over vp on success */
				VOP_UNLOCK(vp);
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Not quite a 'whatever'.  In
					 * this situation we know
					 * there is a FS_RAID
					 * partition, but we can't
					 * open it.  The most likely
					 * reason is that the
					 * partition is already in
					 * use by another RAID set.
					 * So note that we've already
					 * found a partition on this
					 * disk so we don't attempt
					 * to use the raw disk later. */
					rf_part_found = 1;
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}
3174 
3175 int
3176 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3177 {
3178 
3179 	if ((clabel->version==RF_COMPONENT_LABEL_VERSION_1 ||
3180 	     clabel->version==RF_COMPONENT_LABEL_VERSION ||
3181 	     clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION)) &&
3182 	    (clabel->clean == RF_RAID_CLEAN ||
3183 	     clabel->clean == RF_RAID_DIRTY) &&
3184 	    clabel->row >=0 &&
3185 	    clabel->column >= 0 &&
3186 	    clabel->num_rows > 0 &&
3187 	    clabel->num_columns > 0 &&
3188 	    clabel->row < clabel->num_rows &&
3189 	    clabel->column < clabel->num_columns &&
3190 	    clabel->blockSize > 0 &&
3191 	    /*
3192 	     * numBlocksHi may contain garbage, but it is ok since
3193 	     * the type is unsigned.  If it is really garbage,
3194 	     * rf_fix_old_label_size() will fix it.
3195 	     */
3196 	    rf_component_label_numblocks(clabel) > 0) {
3197 		/*
3198 		 * label looks reasonable enough...
3199 		 * let's make sure it has no old garbage.
3200 		 */
3201 		if (numsecs)
3202 			rf_fix_old_label_size(clabel, numsecs);
3203 		return(1);
3204 	}
3205 	return(0);
3206 }
3207 
3208 
3209 /*
3210  * For reasons yet unknown, some old component labels have garbage in
3211  * the newer numBlocksHi region, and this causes lossage.  Since those
3212  * disks will also have numsecs set to less than 32 bits of sectors,
3213  * we can determine when this corruption has occurred, and fix it.
3214  *
3215  * The exact same problem, with the same unknown reason, happens to
3216  * the partitionSizeHi member as well.
3217  */
3218 static void
3219 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
3220 {
3221 
3222 	if (numsecs < ((uint64_t)1 << 32)) {
3223 		if (clabel->numBlocksHi) {
3224 			printf("WARNING: total sectors < 32 bits, yet "
3225 			       "numBlocksHi set\n"
3226 			       "WARNING: resetting numBlocksHi to zero.\n");
3227 			clabel->numBlocksHi = 0;
3228 		}
3229 
3230 		if (clabel->partitionSizeHi) {
3231 			printf("WARNING: total sectors < 32 bits, yet "
3232 			       "partitionSizeHi set\n"
3233 			       "WARNING: resetting partitionSizeHi to zero.\n");
3234 			clabel->partitionSizeHi = 0;
3235 		}
3236 	}
3237 }
3238 
3239 
#ifdef DEBUG
/* Dump a component label to the console, one field group per line
   (debugging aid; compiled only with DEBUG). */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	/* Printable names for root_partition values 0-2; index 3 is
	   out of range and flagged as invalid. */
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	   printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif
3273 
3274 static RF_ConfigSet_t *
3275 rf_create_auto_sets(RF_AutoConfig_t *ac_list)
3276 {
3277 	RF_AutoConfig_t *ac;
3278 	RF_ConfigSet_t *config_sets;
3279 	RF_ConfigSet_t *cset;
3280 	RF_AutoConfig_t *ac_next;
3281 
3282 
3283 	config_sets = NULL;
3284 
3285 	/* Go through the AutoConfig list, and figure out which components
3286 	   belong to what sets.  */
3287 	ac = ac_list;
3288 	while(ac!=NULL) {
3289 		/* we're going to putz with ac->next, so save it here
3290 		   for use at the end of the loop */
3291 		ac_next = ac->next;
3292 
3293 		if (config_sets == NULL) {
3294 			/* will need at least this one... */
3295 			config_sets = malloc(sizeof(RF_ConfigSet_t),
3296 				       M_RAIDFRAME, M_WAITOK);
3297 			/* this one is easy :) */
3298 			config_sets->ac = ac;
3299 			config_sets->next = NULL;
3300 			config_sets->rootable = 0;
3301 			ac->next = NULL;
3302 		} else {
3303 			/* which set does this component fit into? */
3304 			cset = config_sets;
3305 			while(cset!=NULL) {
3306 				if (rf_does_it_fit(cset, ac)) {
3307 					/* looks like it matches... */
3308 					ac->next = cset->ac;
3309 					cset->ac = ac;
3310 					break;
3311 				}
3312 				cset = cset->next;
3313 			}
3314 			if (cset==NULL) {
3315 				/* didn't find a match above... new set..*/
3316 				cset = malloc(sizeof(RF_ConfigSet_t),
3317 					       M_RAIDFRAME, M_WAITOK);
3318 				cset->ac = ac;
3319 				ac->next = NULL;
3320 				cset->next = config_sets;
3321 				cset->rootable = 0;
3322 				config_sets = cset;
3323 			}
3324 		}
3325 		ac = ac_next;
3326 	}
3327 
3328 
3329 	return(config_sets);
3330 }
3331 
3332 static int
3333 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3334 {
3335 	RF_ComponentLabel_t *clabel1, *clabel2;
3336 
3337 	/* If this one matches the *first* one in the set, that's good
3338 	   enough, since the other members of the set would have been
3339 	   through here too... */
3340 	/* note that we are not checking partitionSize here..
3341 
3342 	   Note that we are also not checking the mod_counters here.
3343 	   If everything else matches except the mod_counter, that's
3344 	   good enough for this test.  We will deal with the mod_counters
3345 	   a little later in the autoconfiguration process.
3346 
3347 	    (clabel1->mod_counter == clabel2->mod_counter) &&
3348 
3349 	   The reason we don't check for this is that failed disks
3350 	   will have lower modification counts.  If those disks are
3351 	   not added to the set they used to belong to, then they will
3352 	   form their own set, which may result in 2 different sets,
3353 	   for example, competing to be configured at raid0, and
3354 	   perhaps competing to be the root filesystem set.  If the
3355 	   wrong ones get configured, or both attempt to become /,
3356 	   weird behaviour and or serious lossage will occur.  Thus we
3357 	   need to bring them into the fold here, and kick them out at
3358 	   a later point.
3359 
3360 	*/
3361 
3362 	clabel1 = cset->ac->clabel;
3363 	clabel2 = ac->clabel;
3364 	if ((clabel1->version == clabel2->version) &&
3365 	    (clabel1->serial_number == clabel2->serial_number) &&
3366 	    (clabel1->num_rows == clabel2->num_rows) &&
3367 	    (clabel1->num_columns == clabel2->num_columns) &&
3368 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
3369 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3370 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3371 	    (clabel1->parityConfig == clabel2->parityConfig) &&
3372 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3373 	    (clabel1->blockSize == clabel2->blockSize) &&
3374 	    rf_component_label_numblocks(clabel1) ==
3375 	    rf_component_label_numblocks(clabel2) &&
3376 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
3377 	    (clabel1->root_partition == clabel2->root_partition) &&
3378 	    (clabel1->last_unit == clabel2->last_unit) &&
3379 	    (clabel1->config_order == clabel2->config_order)) {
3380 		/* if it get's here, it almost *has* to be a match */
3381 	} else {
3382 		/* it's not consistent with somebody in the set..
3383 		   punt */
3384 		return(0);
3385 	}
3386 	/* all was fine.. it must fit... */
3387 	return(1);
3388 }
3389 
/*
 * Decide whether a configuration set has enough live components to be
 * worth configuring.  Returns 1 if so, 0 if too many members are
 * missing or stale.
 */
static int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set:
	   the highest counter seen among the members.  Members with a
	   lower counter are stale (e.g. failed and left behind). */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		/* Look for a current (mod_counter matches) component
		   for column c. */
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just finished the odd half of a RAID 1
				   mirror pair without bailing out above..
				   reset the even_pair_failed flag for the
				   next pair, and go on to the next
				   component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	/* Per-level failure tolerance: RAID 0 tolerates none; RAID 4/5
	   tolerate a single missing component. */
	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}
3492 
3493 static void
3494 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
3495 			RF_Raid_t *raidPtr)
3496 {
3497 	RF_ComponentLabel_t *clabel;
3498 	int i;
3499 
3500 	clabel = ac->clabel;
3501 
3502 	/* 1. Fill in the common stuff */
3503 	config->numCol = clabel->num_columns;
3504 	config->numSpare = 0; /* XXX should this be set here? */
3505 	config->sectPerSU = clabel->sectPerSU;
3506 	config->SUsPerPU = clabel->SUsPerPU;
3507 	config->SUsPerRU = clabel->SUsPerRU;
3508 	config->parityConfig = clabel->parityConfig;
3509 	/* XXX... */
3510 	strcpy(config->diskQueueType,"fifo");
3511 	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
3512 	config->layoutSpecificSize = 0; /* XXX ?? */
3513 
3514 	while(ac!=NULL) {
3515 		/* row/col values will be in range due to the checks
3516 		   in reasonable_label() */
3517 		strcpy(config->devnames[0][ac->clabel->column],
3518 		       ac->devname);
3519 		ac = ac->next;
3520 	}
3521 
3522 	for(i=0;i<RF_MAXDBGV;i++) {
3523 		config->debugVars[i][0] = 0;
3524 	}
3525 }
3526 
3527 static int
3528 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
3529 {
3530 	RF_ComponentLabel_t *clabel;
3531 	int column;
3532 	int sparecol;
3533 
3534 	raidPtr->autoconfigure = new_value;
3535 
3536 	for(column=0; column<raidPtr->numCol; column++) {
3537 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
3538 			clabel = raidget_component_label(raidPtr, column);
3539 			clabel->autoconfigure = new_value;
3540 			raidflush_component_label(raidPtr, column);
3541 		}
3542 	}
3543 	for(column = 0; column < raidPtr->numSpare ; column++) {
3544 		sparecol = raidPtr->numCol + column;
3545 
3546 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3547 			clabel = raidget_component_label(raidPtr, sparecol);
3548 			clabel->autoconfigure = new_value;
3549 			raidflush_component_label(raidPtr, sparecol);
3550 		}
3551 	}
3552 	return(new_value);
3553 }
3554 
3555 static int
3556 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
3557 {
3558 	RF_ComponentLabel_t *clabel;
3559 	int column;
3560 	int sparecol;
3561 
3562 	raidPtr->root_partition = new_value;
3563 	for(column=0; column<raidPtr->numCol; column++) {
3564 		if (raidPtr->Disks[column].status == rf_ds_optimal) {
3565 			clabel = raidget_component_label(raidPtr, column);
3566 			clabel->root_partition = new_value;
3567 			raidflush_component_label(raidPtr, column);
3568 		}
3569 	}
3570 	for (column = 0; column < raidPtr->numSpare ; column++) {
3571 		sparecol = raidPtr->numCol + column;
3572 
3573 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3574 			clabel = raidget_component_label(raidPtr, sparecol);
3575 			clabel->root_partition = new_value;
3576 			raidflush_component_label(raidPtr, sparecol);
3577 		}
3578 	}
3579 	return(new_value);
3580 }
3581 
3582 static void
3583 rf_release_all_vps(RF_ConfigSet_t *cset)
3584 {
3585 	RF_AutoConfig_t *ac;
3586 
3587 	ac = cset->ac;
3588 	while(ac!=NULL) {
3589 		/* Close the vp, and give it back */
3590 		if (ac->vp) {
3591 			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
3592 			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
3593 			vput(ac->vp);
3594 			ac->vp = NULL;
3595 		}
3596 		ac = ac->next;
3597 	}
3598 }
3599 
3600 
3601 static void
3602 rf_cleanup_config_set(RF_ConfigSet_t *cset)
3603 {
3604 	RF_AutoConfig_t *ac;
3605 	RF_AutoConfig_t *next_ac;
3606 
3607 	ac = cset->ac;
3608 	while(ac!=NULL) {
3609 		next_ac = ac->next;
3610 		/* nuke the label */
3611 		free(ac->clabel, M_RAIDFRAME);
3612 		/* cleanup the config structure */
3613 		free(ac, M_RAIDFRAME);
3614 		/* "next.." */
3615 		ac = next_ac;
3616 	}
3617 	/* and, finally, nuke the config set */
3618 	free(cset, M_RAIDFRAME);
3619 }
3620 
3621 
/*
 * Fill in a component label from the live state of the RAID set.
 * Per-component fields (row/column, partition size) are set elsewhere.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* avoid over-writing byteswapped version. */
	if (clabel->version != bswap32(RF_COMPONENT_LABEL_VERSION))
		clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	/* Sets are single-row nowadays. */
	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}
3655 
/*
 * Configure a RAID set from an autoconfiguration set.  Returns the
 * raid_softc on success, or NULL if rf_Configure() failed (in which
 * case the claimed unit is released again).
 */
static struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_WAITOK|M_ZERO);

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	raidID = cset->ac->clabel->last_unit;
	/* Walk forward from the preferred unit until an unconfigured
	   one is found (or raidget() returns NULL). */
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	if (sc == NULL)
		sc = raidget(raidID, true);
	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* Configuration failed; give the unit back. */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}
3727 
/*
 * Initialize a per-set memory pool.  w_chan receives the generated
 * pool name ("raid<id>_<pool_name>") and must be at least
 * RF_MAX_POOLNAMELEN bytes.  NOTE(review): pool_init(9) appears to
 * keep the name pointer, so w_chan must stay valid for the pool's
 * lifetime — confirm against callers.  xmin items are preallocated;
 * xmax sets the pool's high-water mark.
 */
void
rf_pool_init(RF_Raid_t *raidPtr, char *w_chan, struct pool *p, size_t size, const char *pool_name,
	     size_t xmin, size_t xmax)
{

	/* Format: raid%d_foo */
	snprintf(w_chan, RF_MAX_POOLNAMELEN, "raid%d_%s", raidPtr->raidid, pool_name);

	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	pool_prime(p, xmin);
}
3740 
3741 
3742 /*
3743  * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
3744  * to see if there is IO pending and if that IO could possibly be done
3745  * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
3746  * otherwise.
3747  *
3748  */
3749 int
3750 rf_buf_queue_check(RF_Raid_t *raidPtr)
3751 {
3752 	struct raid_softc *rs;
3753 	struct dk_softc *dksc;
3754 
3755 	rs = raidPtr->softc;
3756 	dksc = &rs->sc_dksc;
3757 
3758 	if ((rs->sc_flags & RAIDF_INITED) == 0)
3759 		return 1;
3760 
3761 	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
3762 		/* there is work to do */
3763 		return 0;
3764 	}
3765 	/* default is nothing to do */
3766 	return 1;
3767 }
3768 
3769 int
3770 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
3771 {
3772 	uint64_t numsecs;
3773 	unsigned secsize;
3774 	int error;
3775 
3776 	error = getdisksize(vp, &numsecs, &secsize);
3777 	if (error == 0) {
3778 		diskPtr->blockSize = secsize;
3779 		diskPtr->numBlocks = numsecs - rf_protectedSectors;
3780 		diskPtr->partitionSize = numsecs;
3781 		return 0;
3782 	}
3783 	return error;
3784 }
3785 
/* Autoconf match: raid is a pseudo-device, so always match. */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}
3791 
/* Autoconf attach: nothing to do here; real setup happens when the
   set is configured (raidinit() and friends). */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}
3796 
3797 
3798 static int
3799 raid_detach(device_t self, int flags)
3800 {
3801 	int error;
3802 	struct raid_softc *rs = raidsoftc(self);
3803 
3804 	if (rs == NULL)
3805 		return ENXIO;
3806 
3807 	if ((error = raidlock(rs)) != 0)
3808 		return error;
3809 
3810 	error = raid_detach_unlocked(rs);
3811 
3812 	raidunlock(rs);
3813 
3814 	/* XXX raid can be referenced here */
3815 
3816 	if (error)
3817 		return error;
3818 
3819 	/* Free the softc */
3820 	raidput(rs);
3821 
3822 	return 0;
3823 }
3824 
/*
 * Publish a synthesized disk geometry for the RAID set and register it
 * with the disk subsystem.  The values are fabricated from the layout
 * (there is no physical geometry for a pseudo-device).
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	/* Arbitrary track count; only the products need to make sense. */
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
3840 
3841 /*
3842  * Get cache info for all the components (including spares).
3843  * Returns intersection of all the cache flags of all disks, or first
3844  * error if any encountered.
3845  * XXXfua feature flags can change as spares are added - lock down somehow
3846  */
3847 static int
3848 rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
3849 {
3850 	int c;
3851 	int error;
3852 	int dkwhole = 0, dkpart;
3853 
3854 	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
3855 		/*
3856 		 * Check any non-dead disk, even when currently being
3857 		 * reconstructed.
3858 		 */
3859 		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
3860 			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
3861 			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
3862 			if (error) {
3863 				if (error != ENODEV) {
3864 					printf("raid%d: get cache for component %s failed\n",
3865 					    raidPtr->raidid,
3866 					    raidPtr->Disks[c].devname);
3867 				}
3868 
3869 				return error;
3870 			}
3871 
3872 			if (c == 0)
3873 				dkwhole = dkpart;
3874 			else
3875 				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
3876 		}
3877 	}
3878 
3879 	*data = dkwhole;
3880 
3881 	return 0;
3882 }
3883 
3884 /*
3885  * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
3886  * We end up returning whatever error was returned by the first cache flush
3887  * that fails.
3888  */
3889 
3890 static int
3891 rf_sync_component_cache(RF_Raid_t *raidPtr, int c, int force)
3892 {
3893 	int e = 0;
3894 	for (int i = 0; i < 5; i++) {
3895 		e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
3896 		    &force, FWRITE, NOCRED);
3897 		if (!e || e == ENODEV)
3898 			return e;
3899 		printf("raid%d: cache flush[%d] to component %s failed (%d)\n",
3900 		    raidPtr->raidid, i, raidPtr->Disks[c].devname, e);
3901 	}
3902 	return e;
3903 }
3904 
3905 int
3906 rf_sync_component_caches(RF_Raid_t *raidPtr, int force)
3907 {
3908 	int c, error;
3909 
3910 	error = 0;
3911 	for (c = 0; c < raidPtr->numCol; c++) {
3912 		if (raidPtr->Disks[c].status == rf_ds_optimal) {
3913 			int e = rf_sync_component_cache(raidPtr, c, force);
3914 			if (e && !error)
3915 				error = e;
3916 		}
3917 	}
3918 
3919 	for (c = 0; c < raidPtr->numSpare ; c++) {
3920 		int sparecol = raidPtr->numCol + c;
3921 
3922 		/* Need to ensure that the reconstruct actually completed! */
3923 		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
3924 			int e = rf_sync_component_cache(raidPtr, sparecol,
3925 			    force);
3926 			if (e && !error)
3927 				error = e;
3928 		}
3929 	}
3930 	return error;
3931 }
3932 
3933 /* Fill in info with the current status */
3934 void
3935 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3936 {
3937 
3938 	memset(info, 0, sizeof(*info));
3939 
3940 	if (raidPtr->status != rf_rs_reconstructing) {
3941 		info->total = 100;
3942 		info->completed = 100;
3943 	} else {
3944 		info->total = raidPtr->reconControl->numRUsTotal;
3945 		info->completed = raidPtr->reconControl->numRUsComplete;
3946 	}
3947 	info->remaining = info->total - info->completed;
3948 }
3949 
3950 /* Fill in info with the current status */
3951 void
3952 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3953 {
3954 
3955 	memset(info, 0, sizeof(*info));
3956 
3957 	if (raidPtr->parity_rewrite_in_progress == 1) {
3958 		info->total = raidPtr->Layout.numStripe;
3959 		info->completed = raidPtr->parity_rewrite_stripes_done;
3960 	} else {
3961 		info->completed = 100;
3962 		info->total = 100;
3963 	}
3964 	info->remaining = info->total - info->completed;
3965 }
3966 
3967 /* Fill in info with the current status */
3968 void
3969 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
3970 {
3971 
3972 	memset(info, 0, sizeof(*info));
3973 	info->remaining = 0;
3974 	info->completed = 100;
3975 	info->total = 100;
3976 }
3977 
3978 /* Fill in config with the current info */
3979 int
3980 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
3981 {
3982 	int	d, i, j;
3983 
3984 	if (!raidPtr->valid)
3985 		return ENODEV;
3986 	config->cols = raidPtr->numCol;
3987 	config->ndevs = raidPtr->numCol;
3988 	if (config->ndevs >= RF_MAX_DISKS)
3989 		return ENOMEM;
3990 	config->nspares = raidPtr->numSpare;
3991 	if (config->nspares >= RF_MAX_DISKS)
3992 		return ENOMEM;
3993 	config->maxqdepth = raidPtr->maxQueueDepth;
3994 	d = 0;
3995 	for (j = 0; j < config->cols; j++) {
3996 		config->devs[d] = raidPtr->Disks[j];
3997 		d++;
3998 	}
3999 	for (i = 0; i < config->nspares; i++) {
4000 		config->spares[i] = raidPtr->Disks[raidPtr->numCol + i];
4001 		if (config->spares[i].status == rf_ds_rebuilding_spare) {
4002 			/* raidctl(8) expects to see this as a used spare */
4003 			config->spares[i].status = rf_ds_used_spare;
4004 		}
4005 	}
4006 	return 0;
4007 }
4008 
4009 int
4010 rf_get_component_label(RF_Raid_t *raidPtr, void *data)
4011 {
4012 	RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
4013 	RF_ComponentLabel_t *raid_clabel;
4014 	int column = clabel->column;
4015 
4016 	if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
4017 		return EINVAL;
4018 	raid_clabel = raidget_component_label(raidPtr, column);
4019 	memcpy(clabel, raid_clabel, sizeof *clabel);
4020 	/* Fix-up for userland. */
4021 	if (clabel->version == bswap32(RF_COMPONENT_LABEL_VERSION))
4022 		clabel->version = RF_COMPONENT_LABEL_VERSION;
4023 
4024 	return 0;
4025 }
4026 
4027 /*
4028  * Module interface
4029  */
4030 
4031 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");
4032 
4033 #ifdef _MODULE
4034 CFDRIVER_DECL(raid, DV_DISK, NULL);
4035 #endif
4036 
4037 static int raid_modcmd(modcmd_t, void *);
4038 static int raid_modcmd_init(void);
4039 static int raid_modcmd_fini(void);
4040 
4041 static int
4042 raid_modcmd(modcmd_t cmd, void *data)
4043 {
4044 	int error;
4045 
4046 	error = 0;
4047 	switch (cmd) {
4048 	case MODULE_CMD_INIT:
4049 		error = raid_modcmd_init();
4050 		break;
4051 	case MODULE_CMD_FINI:
4052 		error = raid_modcmd_fini();
4053 		break;
4054 	default:
4055 		error = ENOTTY;
4056 		break;
4057 	}
4058 	return error;
4059 }
4060 
/*
 * Module initialization: set up global locks, attach the device
 * switch entries and autoconf glue, boot RAIDframe, and register a
 * finalizer to autoconfigure sets once all hardware is found.  On any
 * attach failure the steps already taken are rolled back.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* EEXIST is tolerated: the devsw may already be present when
	   raid is compiled into the kernel. */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	/* error is 0 here; all failing paths above returned early. */
	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* Non-fatal: autoconfig just won't run automatically. */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}
4131 
/*
 * Module teardown: refuse to unload while any raid unit exists, then
 * detach the autoconf glue and devsw, shut down RAIDframe, and destroy
 * the global locks.  Failed detach steps are rolled back where
 * possible.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* Re-attach the cfattach we just removed to restore a
		   consistent state before failing. */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	devsw_detach(&raid_bdevsw, &raid_cdevsw);
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}
4172