/*	$NetBSD: rf_netbsdkintf.c,v 1.352 2017/11/14 14:27:54 christos Exp $	*/

/*-
 * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Greg Oster; Jason R. Thorpe.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: cd.c 1.6 90/11/28$
 *
 *	@(#)cd.c	8.2 (Berkeley) 11/16/93
 */

/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Mark Holland, Jim Zelenka
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/***********************************************************
 *
 * rf_kintf.c -- the kernel interface routines for RAIDframe
 *
 ***********************************************************/

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.352 2017/11/14 14:27:54 christos Exp $");

#ifdef _KERNEL_OPT
#include "opt_compat_netbsd.h"
#include "opt_raid_autoconfig.h"
#endif

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/disk.h>
#include <sys/device.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/disklabel.h>
#include <sys/conf.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/reboot.h>
#include <sys/kauth.h>
#include <sys/module.h>

#include <prop/proplib.h>

#include <dev/raidframe/raidframevar.h>
#include <dev/raidframe/raidframeio.h>
#include <dev/raidframe/rf_paritymap.h>

#include "rf_raid.h"
#include "rf_copyback.h"
#include "rf_dag.h"
#include "rf_dagflags.h"
#include "rf_desc.h"
#include "rf_diskqueue.h"
#include "rf_etimer.h"
#include "rf_general.h"
#include "rf_kintf.h"
#include "rf_options.h"
#include "rf_driver.h"
#include "rf_parityscan.h"
#include "rf_threadstuff.h"

#ifdef COMPAT_50
#include "rf_compat50.h"
#endif

#include "ioconf.h"

#ifdef DEBUG
int rf_kdebug_level = 0;
#define db1_printf(a) if (rf_kdebug_level > 0) printf a
#else				/* DEBUG */
#define db1_printf(a) { }
#endif				/* DEBUG */

#ifdef DEBUG_ROOT
#define DPRINTF(a, ...) printf(a, __VA_ARGS__)
#else
#define DPRINTF(a, ...)
#endif

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
static rf_declare_mutex2(rf_sparet_wait_mutex);
static rf_declare_cond2(rf_sparet_wait_cv);
static rf_declare_cond2(rf_sparet_resp_cv);

static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
						 * spare table */
static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
						 * installation process */
#endif

MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");

/* prototypes */
static void KernelWakeupFunc(struct buf *);
static void InitBP(struct buf *, struct vnode *, unsigned,
    dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    void *, int, struct proc *);
struct raid_softc;
static void raidinit(struct raid_softc *);
static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);

static int raid_match(device_t, cfdata_t, void *);
static void raid_attach(device_t, device_t, void *);
static int raid_detach(device_t, int);

static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    daddr_t, daddr_t);
static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    daddr_t, daddr_t, int);

static int raidwrite_component_label(unsigned,
    dev_t, struct vnode *, RF_ComponentLabel_t *);
static int raidread_component_label(unsigned,
    dev_t, struct vnode *, RF_ComponentLabel_t *);

static int raid_diskstart(device_t, struct buf *bp);
static int raid_dumpblocks(device_t, void *, daddr_t, int);
static int raid_lastclose(device_t);

static dev_type_open(raidopen);
static dev_type_close(raidclose);
static dev_type_read(raidread);
static dev_type_write(raidwrite);
static dev_type_ioctl(raidioctl);
static dev_type_strategy(raidstrategy);
static dev_type_dump(raiddump);
static dev_type_size(raidsize);

const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};

struct raid_softc {
	struct dk_softc sc_dksc;
	int	sc_unit;
	int	sc_flags;	/* flags */
	int	sc_cflags;	/* configuration flags */
	kmutex_t sc_mutex;	/* interlock mutex */
	kcondvar_t sc_cv;	/* and the condvar */
	uint64_t sc_size;	/* size of the raid device */
	char	sc_xname[20];	/* XXX external name */
	RF_Raid_t sc_r;
	LIST_ENTRY(raid_softc) sc_link;
};
/* sc_flags */
#define RAIDF_INITED		0x01	/* unit has been initialized */
#define RAIDF_SHUTDOWN		0x02	/* unit is being shut down */
#define RAIDF_DETACH		0x04	/* detach after final close */
#define RAIDF_WANTED		0x08	/* someone waiting to obtain a lock */
#define RAIDF_LOCKED		0x10	/* unit is locked */
#define RAIDF_UNIT_CHANGED	0x20	/* unit is being changed */

#define raidunit(x)	DISKUNIT(x)
#define raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)

extern struct cfdriver raid_cd;
CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    DVF_DETACH_SHUTDOWN);

/*
 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
 * Be aware that large numbers can allow the driver to consume a lot of
 * kernel memory, especially on writes, and in degraded mode reads.
 *
 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
 * a single 64K write will typically require 64K for the old data,
 * 64K for the old parity, and 64K for the new parity, for a total
 * of 192K (if the parity buffer is not re-used immediately).
 * Even if it is used immediately, that's still 128K, which when multiplied
 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
 *
 * Now in degraded mode, for example, a 64K read on the above setup may
 * require data reconstruction, which will require *all* of the 4 remaining
 * disks to participate -- 4 * 32K/disk == 128K again.
 */

#ifndef RAIDOUTSTANDING
#define RAIDOUTSTANDING   6
#endif

#define RAIDLABELDEV(dev)	\
	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))

/* declared here, and made public, for the benefit of KVM stuff.. */

static int raidlock(struct raid_softc *);
static void raidunlock(struct raid_softc *);

static int raid_detach_unlocked(struct raid_softc *);

static void rf_markalldirty(RF_Raid_t *);
static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);

void rf_ReconThread(struct rf_recon_req *);
void rf_RewriteParityThread(RF_Raid_t *raidPtr);
void rf_CopybackThread(RF_Raid_t *raidPtr);
void rf_ReconstructInPlaceThread(struct rf_recon_req *);
int rf_autoconfig(device_t);
void rf_buildroothack(RF_ConfigSet_t *);

RF_AutoConfig_t *rf_find_raid_components(void);
RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
static int rf_does_it_fit(RF_ConfigSet_t *, RF_AutoConfig_t *);
int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
void rf_create_configuration(RF_AutoConfig_t *, RF_Config_t *, RF_Raid_t *);
int rf_set_autoconfig(RF_Raid_t *, int);
int rf_set_rootpartition(RF_Raid_t *, int);
void rf_release_all_vps(RF_ConfigSet_t *);
void rf_cleanup_config_set(RF_ConfigSet_t *);
int rf_have_enough_components(RF_ConfigSet_t *);
struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);

/*
 * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
 * Note that this is overridden by having RAID_AUTOCONFIG as an option
 * in the kernel config file.
 */
#ifdef RAID_AUTOCONFIG
int raidautoconfig = 1;
#else
int raidautoconfig = 0;
#endif
static bool raidautoconfigdone = false;
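
/*
 * Illustrative note (added commentary, not from the original source):
 * with the defaults above, autoconfiguration is off unless the kernel
 * was built with
 *
 *	options RAID_AUTOCONFIG
 *
 * in its config file.  Individual sets are additionally gated by the
 * autoconfigure flag in their component labels, which raidctl(8)
 * toggles through the RAIDFRAME_SET_AUTOCONFIG ioctl handled below.
 */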

struct RF_Pools_s rf_pools;

static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
static kmutex_t raid_lock;

static struct raid_softc *
raidcreate(int unit) {
	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
	sc->sc_unit = unit;
	cv_init(&sc->sc_cv, "raidunit");
	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
	return sc;
}

static void
raiddestroy(struct raid_softc *sc) {
	cv_destroy(&sc->sc_cv);
	mutex_destroy(&sc->sc_mutex);
	kmem_free(sc, sizeof(*sc));
}

static struct raid_softc *
raidget(int unit, bool create) {
	struct raid_softc *sc;
	if (unit < 0) {
#ifdef DIAGNOSTIC
		panic("%s: unit %d!", __func__, unit);
#endif
		return NULL;
	}
	mutex_enter(&raid_lock);
	LIST_FOREACH(sc, &raids, sc_link) {
		if (sc->sc_unit == unit) {
			mutex_exit(&raid_lock);
			return sc;
		}
	}
	mutex_exit(&raid_lock);
	if (!create)
		return NULL;
	if ((sc = raidcreate(unit)) == NULL)
		return NULL;
	mutex_enter(&raid_lock);
	LIST_INSERT_HEAD(&raids, sc, sc_link);
	mutex_exit(&raid_lock);
	return sc;
}

static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}

void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}

int
rf_autoconfig(device_t self)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets;

	if (!raidautoconfig || raidautoconfigdone == true)
		return (0);

	/* XXX This code can only be run once. */
	raidautoconfigdone = true;

#ifdef __HAVE_CPU_BOOTCONF
	/*
	 * 0. find the boot device if needed first so we can use it later
	 * this needs to be done before we autoconfigure any raid sets,
	 * because if we use wedges we are not going to be able to open
	 * the boot device later
	 */
	if (booted_device == NULL)
		cpu_bootconf();
#endif
	/* 1. locate all RAID components on the system */
	aprint_debug("Searching for RAID components...\n");
	ac_list = rf_find_raid_components();

	/* 2. Sort them into their respective sets. */
	config_sets = rf_create_auto_sets(ac_list);

	/*
	 * 3. Evaluate each set and configure the valid ones.
	 * This gets done in rf_buildroothack().
	 */
	rf_buildroothack(config_sets);

	return 1;
}

static int
rf_containsboot(RF_Raid_t *r, device_t bdv) {
	const char *bootname = device_xname(bdv);
	size_t len = strlen(bootname);

	for (int col = 0; col < r->numCol; col++) {
		const char *devname = r->Disks[col].devname;
		devname += sizeof("/dev/") - 1;
		if (strncmp(devname, "dk", 2) == 0) {
			const char *parent =
			    dkwedge_get_parent_name(r->Disks[col].dev);
			if (parent != NULL)
				devname = parent;
		}
		if (strncmp(devname, bootname, len) == 0) {
			struct raid_softc *sc = r->softc;
			aprint_debug("raid%d includes boot device %s\n",
			    sc->sc_unit, devname);
			return 1;
		}
	}
	return 0;
}

void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;
	struct raid_softc *sc, *rsc;
	struct dk_softc *dksc;

	sc = rsc = NULL;
	num_root = 0;
	cset = config_sets;
	while (cset != NULL) {
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure == 1) {
			sc = rf_auto_config_set(cset);
			if (sc != NULL) {
				aprint_debug("raid%d: configured ok\n",
				    sc->sc_unit);
				if (cset->rootable) {
					rsc = sc;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed\n");
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}
	dksc = &rsc->sc_dksc;

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL)
		return;

	/* we found something bootable... */

	/*
	 * XXX: The following code assumes that the root raid
	 * is the first ('a') partition. This is about the best
	 * we can do with a BSD disklabel, but we might be able
	 * to do better with a GPT label, by setting a specified
	 * attribute to indicate the root partition. We can then
	 * stash the partition number in the r->root_partition
	 * high bits (the bottom 2 bits are already used). For
	 * now we just set booted_partition to 0 when we override
	 * root.
	 */
	if (num_root == 1) {
		device_t candidate_root;
		if (dksc->sc_dkdev.dk_nwedges != 0) {
			char cname[sizeof(cset->ac->devname)];
			/* XXX: assume partition 'a' first */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dksc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
			DPRINTF("%s: candidate wedge root=%s\n", __func__,
			    cname);
			if (candidate_root == NULL) {
				/*
				 * If that is not found, because we don't use
				 * disklabel, return the first dk child
				 * XXX: we can skip the 'a' check above
				 * and always do this...
				 */
				size_t i = 0;
				candidate_root = dkwedge_find_by_parent(
				    device_xname(dksc->sc_dev), &i);
			}
			DPRINTF("%s: candidate wedge root=%p\n", __func__,
			    candidate_root);
		} else
			candidate_root = dksc->sc_dev;
		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
		DPRINTF("%s: booted_device=%p root_partition=%d "
		    "contains_boot=%d\n", __func__, booted_device,
		    rsc->sc_r.root_partition,
		    rf_containsboot(&rsc->sc_r, booted_device));
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_method = "raidframe/single";
			booted_partition = 0;	/* XXX assume 'a' */
		}
	} else if (num_root > 1) {
		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
		    booted_device);

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
				dksc = &rsc->sc_dksc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = dksc->sc_dev;
			booted_method = "raidframe/multi";
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}

static int
raidsize(dev_t dev)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;
	unsigned int unit;

	unit = raidunit(dev);
	if ((rs = raidget(unit, false)) == NULL)
		return -1;
	dksc = &rs->sc_dksc;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return -1;

	return dk_size(dksc, dev);
}

static int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	unsigned int unit;
	struct raid_softc *rs;
	struct dk_softc *dksc;

	unit = raidunit(dev);
	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENODEV;

	/*
	   Note that blkno is relative to this particular partition.
	   By adding RF_PROTECTED_SECTORS, we get a value that
	   is relative to the partition used for the underlying component.
	*/
	blkno += RF_PROTECTED_SECTORS;

	return dk_dump(dksc, dev, blkno, va, size);
}
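
/*
 * Worked example (added for clarity): RAIDframe reserves the first
 * RF_PROTECTED_SECTORS of each component for its own metadata (the
 * component label and friends), so block 0 of the RAID device lives
 * RF_PROTECTED_SECTORS into the component partition.  With the
 * traditional value of 64, RAID block 0 maps to component sector 64,
 * which is why raiddump() biases blkno before calling dk_dump().
 */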

static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int c, sparecol, j, scol, dumpto;
	int error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/* How about this one? */
			scol = -1;
			for (j = 0; j < raidPtr->numCol; j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we haven't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!? */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
	    blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}

/* ARGSUSED */
static int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	int error = 0;
	int part, pmask;

	if ((rs = raidget(unit, true)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	dksc = &rs->sc_dksc;

	part = DISKPART(dev);
	pmask = (1 << part);

	if (!DK_BUSY(dksc, pmask) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		   have done a configure before this.  I DO NOT WANT TO BE
		   SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		   THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}

	if ((rs->sc_flags & RAIDF_INITED) != 0)
		error = dk_open(dksc, dev, flags, fmt, l);

bad:
	raidunlock(rs);

	return (error);


}

static int
raid_lastclose(device_t self)
{
	struct raid_softc *rs = raidsoftc(self);

	/* Last one... device is not unconfigured yet.
	   Device shutdown has taken care of setting the
	   clean bits if RAIDF_INITED is not set;
	   mark things as clean...
	*/

	rf_update_component_labels(&rs->sc_r,
	    RF_FINAL_COMPONENT_UPDATE);

	/* pass to unlocked code */
	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		rs->sc_flags |= RAIDF_DETACH;

	return 0;
}

/* ARGSUSED */
static int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	cfdata_t cf;
	int error = 0, do_detach = 0, do_put = 0;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_INITED) != 0) {
		error = dk_close(dksc, dev, flags, fmt, l);
		if ((rs->sc_flags & RAIDF_DETACH) != 0)
			do_detach = 1;
	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		do_put = 1;

	raidunlock(rs);

	if (do_detach) {
		/* free the pseudo device attach bits */
		cf = device_cfdata(dksc->sc_dev);
		error = config_detach(dksc->sc_dev, 0);
		if (error == 0)
			free(cf, M_RAIDFRAME);
	} else if (do_put) {
		raidput(rs);
	}

	return (error);

}

static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}

static void
raidstrategy(struct buf *bp)
{
	unsigned int unit;
	struct raid_softc *rs;
	struct dk_softc *dksc;
	RF_Raid_t *raidPtr;

	unit = raidunit(bp->b_dev);
	if ((rs = raidget(unit, false)) == NULL) {
		bp->b_error = ENXIO;
		goto fail;
	}
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		bp->b_error = ENXIO;
		goto fail;
	}
	dksc = &rs->sc_dksc;
	raidPtr = &rs->sc_r;

	/* Queue IO only */
	if (dk_strategy_defer(dksc, bp))
		goto done;

	/* schedule the IO to happen at the next convenient time */
	raid_wakeup(raidPtr);

done:
	return;

fail:
	bp->b_resid = bp->b_bcount;
	biodone(bp);
}

static int
raid_diskstart(device_t dev, struct buf *bp)
{
	struct raid_softc *rs = raidsoftc(dev);
	RF_Raid_t *raidPtr;

	raidPtr = &rs->sc_r;
	if (!raidPtr->valid) {
		db1_printf(("raid is not valid..\n"));
		return ENODEV;
	}

	/* XXX */
	bp->b_resid = 0;

	return raiddoaccess(raidPtr, bp);
}

void
raiddone(RF_Raid_t *raidPtr, struct buf *bp)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;

	dk_done(dksc, bp);

	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings++;
	rf_unlock_mutex2(raidPtr->mutex);

	/* schedule more IO */
	raid_wakeup(raidPtr);
}

/* ARGSUSED */
static int
raidread(dev_t dev, struct uio *uio, int flags)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return (ENXIO);

	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));

}

/* ARGSUSED */
static int
raidwrite(dev_t dev, struct uio *uio, int flags)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return (ENXIO);

	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));

}
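
/*
 * Overview (added commentary): the normal I/O path through this driver
 * is raidstrategy() -> dk_strategy_defer() (queue the buf) followed by
 * raid_wakeup(); the RAIDframe engine eventually calls raidstart(),
 * whose dk_start() pulls bufs off the queue and hands them to
 * raid_diskstart(), which feeds them through raiddoaccess() to
 * rf_DoAccess() as non-blocking requests.  Completion runs through
 * raiddone(), which calls dk_done(), returns an "opening", and wakes
 * the engine so the next queued buf can be scheduled.
 */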

static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}

static int
raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
	int unit = raidunit(dev);
	int error = 0;
	int part, pmask;
	struct raid_softc *rs;
	struct dk_softc *dksc;
	RF_Config_t *k_cfg, *u_cfg;
	RF_Raid_t *raidPtr;
	RF_RaidDisk_t *diskPtr;
	RF_AccTotals_t *totals;
	RF_DeviceConfig_t *d_cfg, **ucfgp;
	u_char *specific_buf;
	int retcode = 0;
	int column;
/*	int raidid; */
	struct rf_recon_req *rrcopy, *rr;
	RF_ComponentLabel_t *clabel;
	RF_ComponentLabel_t *ci_label;
	RF_ComponentLabel_t **clabel_ptr;
	RF_SingleComponent_t *sparePtr, *componentPtr;
	RF_SingleComponent_t component;
	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
	int i, j, d;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;
	raidPtr = &rs->sc_r;

	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
	    (int) DISKPART(dev), (int) unit, cmd));

	/* Must be initialized for these... */
	switch (cmd) {
	case RAIDFRAME_REWRITEPARITY:
	case RAIDFRAME_GET_INFO:
	case RAIDFRAME_RESET_ACCTOTALS:
	case RAIDFRAME_GET_ACCTOTALS:
	case RAIDFRAME_KEEP_ACCTOTALS:
	case RAIDFRAME_GET_SIZE:
	case RAIDFRAME_FAIL_DISK:
	case RAIDFRAME_COPYBACK:
	case RAIDFRAME_CHECK_RECON_STATUS:
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
	case RAIDFRAME_GET_COMPONENT_LABEL:
	case RAIDFRAME_SET_COMPONENT_LABEL:
	case RAIDFRAME_ADD_HOT_SPARE:
	case RAIDFRAME_REMOVE_HOT_SPARE:
	case RAIDFRAME_INIT_LABELS:
	case RAIDFRAME_REBUILD_IN_PLACE:
	case RAIDFRAME_CHECK_PARITY:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
	case RAIDFRAME_CHECK_COPYBACK_STATUS:
	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
	case RAIDFRAME_SET_AUTOCONFIG:
	case RAIDFRAME_SET_ROOT:
	case RAIDFRAME_DELETE_COMPONENT:
	case RAIDFRAME_INCORPORATE_HOT_SPARE:
	case RAIDFRAME_PARITYMAP_STATUS:
	case RAIDFRAME_PARITYMAP_GET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_PARAMS:
		if ((rs->sc_flags & RAIDF_INITED) == 0)
			return (ENXIO);
	}

	switch (cmd) {
#ifdef COMPAT_50
	case RAIDFRAME_GET_INFO50:
		return rf_get_info50(raidPtr, data);

	case RAIDFRAME_CONFIGURE50:
		if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0)
			return retcode;
		goto config;
#endif
	/* configure the system */
	case RAIDFRAME_CONFIGURE:

		if (raidPtr->valid) {
			/* There is a valid RAID set running on this unit! */
			printf("raid%d: Device already configured!\n", unit);
			return (EINVAL);
		}

		/* copy-in the configuration information */
		/* data points to a pointer to the configuration structure */

		u_cfg = *((RF_Config_t **) data);
		RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *));
		if (k_cfg == NULL) {
			return (ENOMEM);
		}
		retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t));
		if (retcode) {
			RF_Free(k_cfg, sizeof(RF_Config_t));
			db1_printf(("rf_ioctl: retcode=%d copyin.1\n",
			    retcode));
			goto no_config;
		}
		goto config;
	config:
		rs->sc_flags &= ~RAIDF_SHUTDOWN;

		/* allocate a buffer for the layout-specific data, and copy it
		 * in */
		if (k_cfg->layoutSpecificSize) {
			if (k_cfg->layoutSpecificSize > 10000) {
				/* sanity check */
				RF_Free(k_cfg, sizeof(RF_Config_t));
				retcode = EINVAL;
				goto no_config;
			}
			RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
			    (u_char *));
			if (specific_buf == NULL) {
				RF_Free(k_cfg, sizeof(RF_Config_t));
				retcode = ENOMEM;
				goto no_config;
			}
			retcode = copyin(k_cfg->layoutSpecific, specific_buf,
			    k_cfg->layoutSpecificSize);
			if (retcode) {
				RF_Free(k_cfg, sizeof(RF_Config_t));
				RF_Free(specific_buf,
				    k_cfg->layoutSpecificSize);
				db1_printf(("rf_ioctl: retcode=%d copyin.2\n",
				    retcode));
				goto no_config;
			}
		} else
			specific_buf = NULL;
		k_cfg->layoutSpecific = specific_buf;

		/* should do some kind of sanity check on the configuration.
		 * Store the sum of all the bytes in the last byte? */

		/* configure the system */

		/*
		 * Clear the entire RAID descriptor, just to make sure
		 * there is no stale data left in the case of a
		 * reconfiguration
		 */
		memset(raidPtr, 0, sizeof(*raidPtr));
		raidPtr->softc = rs;
		raidPtr->raidid = unit;

		retcode = rf_Configure(raidPtr, k_cfg, NULL);

		if (retcode == 0) {

			/* allow this many simultaneous IO's to
			   this RAID device */
			raidPtr->openings = RAIDOUTSTANDING;

			raidinit(rs);
			raid_wakeup(raidPtr);
			rf_markalldirty(raidPtr);
		}
		/* free the buffers.  No return code here. */
		if (k_cfg->layoutSpecificSize) {
			RF_Free(specific_buf, k_cfg->layoutSpecificSize);
		}
		RF_Free(k_cfg, sizeof(RF_Config_t));

	no_config:
		/*
		 * If configuration failed, set sc_flags so that we
		 * will detach the device when we close it.
		 */
		if (retcode != 0)
			rs->sc_flags |= RAIDF_SHUTDOWN;
		return (retcode);
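
	/*
	 * Userland sketch (illustrative only; raidctl(8) is the real
	 * consumer): note that the argument to RAIDFRAME_CONFIGURE is a
	 * pointer to a *pointer* to the RF_Config_t, which is why the
	 * code above dereferences data twice before the copyin():
	 *
	 *	RF_Config_t cfg, *cfgp = &cfg;
	 *
	 *	memset(&cfg, 0, sizeof(cfg));
	 *	// fill in cfg.numCol, cfg.devnames, cfg.parityConfig, ...
	 *	if (ioctl(fd, RAIDFRAME_CONFIGURE, &cfgp) == -1)
	 *		err(1, "RAIDFRAME_CONFIGURE");
	 */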

	/* shutdown the system */
	case RAIDFRAME_SHUTDOWN:

		part = DISKPART(dev);
		pmask = (1 << part);

		if ((error = raidlock(rs)) != 0)
			return (error);

		if (DK_BUSY(dksc, pmask) ||
		    raidPtr->recon_in_progress != 0 ||
		    raidPtr->parity_rewrite_in_progress != 0 ||
		    raidPtr->copyback_in_progress != 0)
			retcode = EBUSY;
		else {
			/* detach and free on close */
			rs->sc_flags |= RAIDF_SHUTDOWN;
			retcode = 0;
		}

		raidunlock(rs);

		return (retcode);
	case RAIDFRAME_GET_COMPONENT_LABEL:
		clabel_ptr = (RF_ComponentLabel_t **) data;
		/* need to read the component label for the disk indicated
		   by row,column in clabel */

		/*
		 * Perhaps there should be an option to skip the in-core
		 * copy and hit the disk, as with disklabel(8).
		 */
		RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *));

		retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel));

		if (retcode) {
			RF_Free(clabel, sizeof(*clabel));
			return retcode;
		}

		clabel->row = 0; /* Don't allow looking at anything else.*/

		column = clabel->column;

		if ((column < 0) || (column >= raidPtr->numCol +
		    raidPtr->numSpare)) {
			RF_Free(clabel, sizeof(*clabel));
			return EINVAL;
		}

		RF_Free(clabel, sizeof(*clabel));

		clabel = raidget_component_label(raidPtr, column);

		return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr));

#if 0
	case RAIDFRAME_SET_COMPONENT_LABEL:
		clabel = (RF_ComponentLabel_t *) data;

		/* XXX check the label for valid stuff... */
		/* Note that some things *should not* get modified --
		   the user should be re-initing the labels instead of
		   trying to patch things.
		   */

		raidid = raidPtr->raidid;
#ifdef DEBUG
		printf("raid%d: Got component label:\n", raidid);
		printf("raid%d: Version: %d\n", raidid, clabel->version);
		printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
		printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
		printf("raid%d: Column: %d\n", raidid, clabel->column);
		printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
		printf("raid%d: Clean: %d\n", raidid, clabel->clean);
		printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif
		clabel->row = 0;
		column = clabel->column;

		if ((column < 0) || (column >= raidPtr->numCol)) {
			return (EINVAL);
		}

		/* XXX this isn't allowed to do anything for now :-) */

		/* XXX and before it is, we need to fill in the rest
		   of the fields!?!?!?! */
		memcpy(raidget_component_label(raidPtr, column),
		    clabel, sizeof(*clabel));
		raidflush_component_label(raidPtr, column);
		return (0);
#endif

	case RAIDFRAME_INIT_LABELS:
		clabel = (RF_ComponentLabel_t *) data;
		/*
		   we only want the serial number from
		   the above.  We get all the rest of the information
		   from the config that was used to create this RAID
		   set.
		   */

		raidPtr->serial_number = clabel->serial_number;

		for (column = 0; column < raidPtr->numCol; column++) {
			diskPtr = &raidPtr->Disks[column];
			if (!RF_DEAD_DISK(diskPtr->status)) {
				ci_label = raidget_component_label(raidPtr,
				    column);
				/* Zeroing this is important. */
				memset(ci_label, 0, sizeof(*ci_label));
				raid_init_component_label(raidPtr, ci_label);
				ci_label->serial_number =
				    raidPtr->serial_number;
				ci_label->row = 0; /* we don't pretend to support more */
				rf_component_label_set_partitionsize(ci_label,
				    diskPtr->partitionSize);
				ci_label->column = column;
				raidflush_component_label(raidPtr, column);
			}
			/* XXXjld what about the spares? */
		}

		return (retcode);
	case RAIDFRAME_SET_AUTOCONFIG:
		d = rf_set_autoconfig(raidPtr, *(int *) data);
		printf("raid%d: New autoconfig value is: %d\n",
		    raidPtr->raidid, d);
		*(int *) data = d;
		return (retcode);

	case RAIDFRAME_SET_ROOT:
		d = rf_set_rootpartition(raidPtr, *(int *) data);
		printf("raid%d: New rootpartition value is: %d\n",
		    raidPtr->raidid, d);
		*(int *) data = d;
		return (retcode);

	/* initialize all parity */
	case RAIDFRAME_REWRITEPARITY:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Parity for RAID 0 is trivially correct */
			raidPtr->parity_good = RF_RAID_CLEAN;
			return (0);
		}

		if (raidPtr->parity_rewrite_in_progress == 1) {
			/* Re-write is already in progress! */
			return (EINVAL);
		}

		retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
		    rf_RewriteParityThread,
		    raidPtr, "raid_parity");
		return (retcode);


	case RAIDFRAME_ADD_HOT_SPARE:
		sparePtr = (RF_SingleComponent_t *) data;
		memcpy(&component, sparePtr, sizeof(RF_SingleComponent_t));
		retcode = rf_add_hot_spare(raidPtr, &component);
		return (retcode);

	case RAIDFRAME_REMOVE_HOT_SPARE:
		return (retcode);

	case RAIDFRAME_DELETE_COMPONENT:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy(&component, componentPtr,
		    sizeof(RF_SingleComponent_t));
		retcode = rf_delete_component(raidPtr, &component);
		return (retcode);

	case RAIDFRAME_INCORPORATE_HOT_SPARE:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy(&component, componentPtr,
		    sizeof(RF_SingleComponent_t));
		retcode = rf_incorporate_hot_spare(raidPtr, &component);
		return (retcode);

	case RAIDFRAME_REBUILD_IN_PLACE:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Can't do this on a RAID 0!! */
			return (EINVAL);
		}

		if (raidPtr->recon_in_progress == 1) {
			/* a reconstruct is already in progress! */
			return (EINVAL);
		}

		componentPtr = (RF_SingleComponent_t *) data;
		memcpy(&component, componentPtr,
		    sizeof(RF_SingleComponent_t));
		component.row = 0; /* we don't support any more */
		column = component.column;

		if ((column < 0) || (column >= raidPtr->numCol)) {
			return (EINVAL);
		}

		rf_lock_mutex2(raidPtr->mutex);
		if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
		    (raidPtr->numFailures > 0)) {
			/* XXX 0 above shouldn't be constant!!! */
			/* some component other than this has failed.
			   Let's not make things worse than they already
			   are... */
			printf("raid%d: Unable to reconstruct to disk at:\n",
			    raidPtr->raidid);
			printf("raid%d: Col: %d   Too many failures.\n",
			    raidPtr->raidid, column);
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[column].status ==
		    rf_ds_reconstructing) {
			printf("raid%d: Unable to reconstruct to disk at:\n",
			    raidPtr->raidid);
			printf("raid%d: Col: %d   Reconstruction already occurring!\n", raidPtr->raidid, column);

			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[column].status == rf_ds_spared) {
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		rf_unlock_mutex2(raidPtr->mutex);

		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
		if (rrcopy == NULL)
			return (ENOMEM);

		rrcopy->raidPtr = (void *) raidPtr;
		rrcopy->col = column;

		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
		    rf_ReconstructInPlaceThread,
		    rrcopy, "raid_reconip");
		return (retcode);

	case RAIDFRAME_GET_INFO:
		if (!raidPtr->valid)
			return (ENODEV);
		ucfgp = (RF_DeviceConfig_t **) data;
		RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t),
		    (RF_DeviceConfig_t *));
		if (d_cfg == NULL)
			return (ENOMEM);
		d_cfg->rows = 1; /* there is only 1 row now */
		d_cfg->cols = raidPtr->numCol;
		d_cfg->ndevs = raidPtr->numCol;
		if (d_cfg->ndevs >= RF_MAX_DISKS) {
			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
			return (ENOMEM);
		}
		d_cfg->nspares = raidPtr->numSpare;
		if (d_cfg->nspares >= RF_MAX_DISKS) {
			RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
			return (ENOMEM);
		}
		d_cfg->maxqdepth = raidPtr->maxQueueDepth;
		d = 0;
		for (j = 0; j < d_cfg->cols; j++) {
			d_cfg->devs[d] = raidPtr->Disks[j];
			d++;
		}
		for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) {
			d_cfg->spares[i] = raidPtr->Disks[j];
			if (d_cfg->spares[i].status == rf_ds_rebuilding_spare) {
				/* XXX: raidctl(8) expects to see this as a used spare */
				d_cfg->spares[i].status = rf_ds_used_spare;
			}
		}
		retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t));
		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));

		return (retcode);

	case RAIDFRAME_CHECK_PARITY:
		*(int *) data = raidPtr->parity_good;
		return (0);

	case RAIDFRAME_PARITYMAP_STATUS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_status(raidPtr->parity_map,
		    (struct rf_pmstat *)data);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_PARAMS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		if (raidPtr->parity_map == NULL)
			return ENOENT; /* ??? */
		if (0 != rf_paritymap_set_params(raidPtr->parity_map,
			(struct rf_pmparams *)data, 1))
			return EINVAL;
		return 0;

	case RAIDFRAME_PARITYMAP_GET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		*(int *) data = rf_paritymap_get_disable(raidPtr);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_set_disable(raidPtr, *(int *)data);
		/* XXX should errors be passed up? */
		return 0;

	case RAIDFRAME_RESET_ACCTOTALS:
		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
		return (0);

	case RAIDFRAME_GET_ACCTOTALS:
		totals = (RF_AccTotals_t *) data;
		*totals = raidPtr->acc_totals;
		return (0);

	case RAIDFRAME_KEEP_ACCTOTALS:
		raidPtr->keep_acc_totals = *(int *)data;
		return (0);

	case RAIDFRAME_GET_SIZE:
		*(int *) data = raidPtr->totalSectors;
		return (0);

	/* fail a disk & optionally start reconstruction */
	case RAIDFRAME_FAIL_DISK:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Can't do this on a RAID 0!! */
			return (EINVAL);
		}

		rr = (struct rf_recon_req *) data;
		rr->row = 0;
		if (rr->col < 0 || rr->col >= raidPtr->numCol)
			return (EINVAL);


		rf_lock_mutex2(raidPtr->mutex);
		if (raidPtr->status == rf_rs_reconstructing) {
			/* you can't fail a disk while we're reconstructing! */
			/* XXX wrong for RAID6 */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if ((raidPtr->Disks[rr->col].status ==
		    rf_ds_optimal) && (raidPtr->numFailures > 0)) {
			/* some other component has failed.  Let's not make
			   things worse. XXX wrong for RAID6 */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
			/* Can't fail a spared disk! */
			rf_unlock_mutex2(raidPtr->mutex);
			return (EINVAL);
		}
		rf_unlock_mutex2(raidPtr->mutex);

		/* make a copy of the recon request so that we don't rely on
		 * the user's buffer */
		RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
		if (rrcopy == NULL)
			return (ENOMEM);
		memcpy(rrcopy, rr, sizeof(*rr));
		rrcopy->raidPtr = (void *) raidPtr;

		retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
		    rf_ReconThread,
		    rrcopy, "raid_recon");
		return (0);

	/* invoke a copyback operation after recon on whatever disk
	 * needs it, if any */
	case RAIDFRAME_COPYBACK:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0!! */
			return (EINVAL);
		}

		if (raidPtr->copyback_in_progress == 1) {
			/* Copyback is already in progress! */
			return (EINVAL);
		}

		retcode = RF_CREATE_THREAD(raidPtr->copyback_thread,
		    rf_CopybackThread,
		    raidPtr, "raid_copyback");
		return (retcode);

	/* return the percentage completion of reconstruction */
	case RAIDFRAME_CHECK_RECON_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return (0);
		}
		if (raidPtr->status != rf_rs_reconstructing)
			*(int *) data = 100;
		else {
			if (raidPtr->reconControl->numRUsTotal > 0) {
				*(int *) data = (raidPtr->reconControl->numRUsComplete *
				    100 / raidPtr->reconControl->numRUsTotal);
			} else {
				*(int *) data = 0;
			}
		}
		return (0);
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
		progressInfoPtr = (RF_ProgressInfo_t **) data;
		if (raidPtr->status != rf_rs_reconstructing) {
			progressInfo.remaining = 0;
			progressInfo.completed = 100;
			progressInfo.total = 100;
		} else {
			progressInfo.total =
			    raidPtr->reconControl->numRUsTotal;
			progressInfo.completed =
			    raidPtr->reconControl->numRUsComplete;
			progressInfo.remaining = progressInfo.total -
			    progressInfo.completed;
		}
		retcode = copyout(&progressInfo, *progressInfoPtr,
		    sizeof(RF_ProgressInfo_t));
		return (retcode);

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return (0);
		}
		if (raidPtr->parity_rewrite_in_progress == 1) {
			*(int *) data = 100 *
			    raidPtr->parity_rewrite_stripes_done /
			    raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return (0);

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
		progressInfoPtr = (RF_ProgressInfo_t **) data;
		if (raidPtr->parity_rewrite_in_progress == 1) {
			progressInfo.total = raidPtr->Layout.numStripe;
			progressInfo.completed =
			    raidPtr->parity_rewrite_stripes_done;
			progressInfo.remaining = progressInfo.total -
			    progressInfo.completed;
		} else {
			progressInfo.remaining = 0;
			progressInfo.completed = 100;
			progressInfo.total = 100;
		}
		retcode = copyout(&progressInfo, *progressInfoPtr,
		    sizeof(RF_ProgressInfo_t));
		return (retcode);

	case RAIDFRAME_CHECK_COPYBACK_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0 */
			*(int *) data = 100;
			return (0);
		}
		if (raidPtr->copyback_in_progress == 1) {
			*(int *) data = 100 * raidPtr->copyback_stripes_done /
			    raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return (0);

	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
		progressInfoPtr = (RF_ProgressInfo_t **) data;
		if (raidPtr->copyback_in_progress == 1) {
			progressInfo.total = raidPtr->Layout.numStripe;
			progressInfo.completed =
			    raidPtr->copyback_stripes_done;
			progressInfo.remaining = progressInfo.total -
			    progressInfo.completed;
		} else {
			progressInfo.remaining = 0;
			progressInfo.completed = 100;
			progressInfo.total = 100;
		}
		retcode = copyout(&progressInfo, *progressInfoPtr,
		    sizeof(RF_ProgressInfo_t));
		return (retcode);

	case RAIDFRAME_SET_LAST_UNIT:
		for (column = 0; column < raidPtr->numCol; column++)
			if (raidPtr->Disks[column].status != rf_ds_optimal)
				return EBUSY;

		for (column = 0; column < raidPtr->numCol; column++) {
			clabel = raidget_component_label(raidPtr, column);
			clabel->last_unit = *(int *)data;
			raidflush_component_label(raidPtr, column);
		}
		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
		return 0;
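
	/*
	 * Consumer-side sketch (illustrative): the *_STATUS_EXT ioctls
	 * above also take a pointer to a pointer, and hand back an
	 * RF_ProgressInfo_t from which a percentage can be derived
	 * without the integer truncation of the plain *_STATUS
	 * variants:
	 *
	 *	RF_ProgressInfo_t pi, *pip = &pi;
	 *
	 *	if (ioctl(fd, RAIDFRAME_CHECK_RECON_STATUS_EXT, &pip) == 0 &&
	 *	    pi.total > 0)
	 *		percent = 100.0 * pi.completed / pi.total;
	 */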

	/* the sparetable daemon calls this to wait for the kernel to
	 * need a spare table.  this ioctl does not return until a
	 * spare table is needed.  XXX -- calling mpsleep here in the
	 * ioctl code is almost certainly wrong and evil. -- XXX XXX
	 * -- I should either compute the spare table in the kernel,
	 * or have a different -- XXX XXX -- interface (a different
	 * character device) for delivering the table -- XXX */
#if 0
	case RAIDFRAME_SPARET_WAIT:
		rf_lock_mutex2(rf_sparet_wait_mutex);
		while (!rf_sparet_wait_queue)
			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
		waitreq = rf_sparet_wait_queue;
		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		/* structure assignment */
		*((RF_SparetWait_t *) data) = *waitreq;

		RF_Free(waitreq, sizeof(*waitreq));
		return (0);

	/* wakes up a process waiting on SPARET_WAIT and puts an error
	 * code in it that will cause the daemon to exit */
	case RAIDFRAME_ABORT_SPARET_WAIT:
		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
		waitreq->fcol = -1;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_wait_queue;
		rf_sparet_wait_queue = waitreq;
		rf_broadcast_cond2(rf_sparet_wait_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);
		return (0);

	/* used by the spare table daemon to deliver a spare table
	 * into the kernel */
	case RAIDFRAME_SEND_SPARET:

		/* install the spare table */
		retcode = rf_SetSpareTable(raidPtr, *(void **) data);

		/* respond to the requestor.  the return status of the spare
		 * table installation is passed in the "fcol" field */
		RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
		waitreq->fcol = retcode;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_resp_queue;
		rf_sparet_resp_queue = waitreq;
		rf_broadcast_cond2(rf_sparet_resp_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		return (retcode);
#endif

	default:
		break; /* fall through to the os-specific code below */

	}

	if (!raidPtr->valid)
		return (EINVAL);

	/*
	 * Add support for "regular" device ioctls here.
	 */

	switch (cmd) {
	case DIOCGCACHE:
		retcode = rf_get_component_caches(raidPtr, (int *)data);
		break;

	case DIOCCACHESYNC:
		retcode = rf_sync_component_caches(raidPtr);
		break;

	default:
		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
		break;
	}

	return (retcode);

}


/* raidinit -- complete the rest of the initialization for the
   RAIDframe device.  */


static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	dkwedge_discover(&dksc->sc_dkdev);
}
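
/*
 * Ordering note (added commentary): raidinit() must attach things in
 * the sequence seen above -- config_attach_pseudo() to create the
 * device, dk_init()/disk_init() before anything touches disklabels,
 * then dk_attach()/disk_attach(), geometry, and the buffer queue --
 * and only then is RAIDF_INITED set and dkwedge_discover() run, so
 * no wedge can probe the unit before it is fully usable.
 */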
1902 * any calls originating in the kernel must use non-blocking I/O 1903 * do some extra sanity checking to return "appropriate" error values for 1904 * certain conditions (to make some standard utilities work) 1905 * 1906 * Formerly known as: rf_DoAccessKernel 1907 */ 1908 void 1909 raidstart(RF_Raid_t *raidPtr) 1910 { 1911 struct raid_softc *rs; 1912 struct dk_softc *dksc; 1913 1914 rs = raidPtr->softc; 1915 dksc = &rs->sc_dksc; 1916 /* quick check to see if anything has died recently */ 1917 rf_lock_mutex2(raidPtr->mutex); 1918 if (raidPtr->numNewFailures > 0) { 1919 rf_unlock_mutex2(raidPtr->mutex); 1920 rf_update_component_labels(raidPtr, 1921 RF_NORMAL_COMPONENT_UPDATE); 1922 rf_lock_mutex2(raidPtr->mutex); 1923 raidPtr->numNewFailures--; 1924 } 1925 rf_unlock_mutex2(raidPtr->mutex); 1926 1927 if ((rs->sc_flags & RAIDF_INITED) == 0) { 1928 printf("raid%d: raidstart not ready\n", raidPtr->raidid); 1929 return; 1930 } 1931 1932 dk_start(dksc, NULL); 1933 } 1934 1935 static int 1936 raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp) 1937 { 1938 RF_SectorCount_t num_blocks, pb, sum; 1939 RF_RaidAddr_t raid_addr; 1940 daddr_t blocknum; 1941 int do_async; 1942 int rc; 1943 1944 rf_lock_mutex2(raidPtr->mutex); 1945 if (raidPtr->openings == 0) { 1946 rf_unlock_mutex2(raidPtr->mutex); 1947 return EAGAIN; 1948 } 1949 rf_unlock_mutex2(raidPtr->mutex); 1950 1951 blocknum = bp->b_rawblkno; 1952 1953 db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno, 1954 (int) blocknum)); 1955 1956 db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount)); 1957 db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid)); 1958 1959 /* *THIS* is where we adjust what block we're going to... 1960 * but DO NOT TOUCH bp->b_blkno!!! */ 1961 raid_addr = blocknum; 1962 1963 num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector; 1964 pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0; 1965 sum = raid_addr + num_blocks + pb; 1966 if (1 || rf_debugKernelAccess) { 1967 db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n", 1968 (int) raid_addr, (int) sum, (int) num_blocks, 1969 (int) pb, (int) bp->b_resid)); 1970 } 1971 if ((sum > raidPtr->totalSectors) || (sum < raid_addr) 1972 || (sum < num_blocks) || (sum < pb)) { 1973 rc = ENOSPC; 1974 goto done; 1975 } 1976 /* 1977 * XXX rf_DoAccess() should do this, not just DoAccessKernel() 1978 */ 1979 1980 if (bp->b_bcount & raidPtr->sectorMask) { 1981 rc = ENOSPC; 1982 goto done; 1983 } 1984 db1_printf(("Calling DoAccess..\n")); 1985 1986 1987 rf_lock_mutex2(raidPtr->mutex); 1988 raidPtr->openings--; 1989 rf_unlock_mutex2(raidPtr->mutex); 1990 1991 /* 1992 * Everything is async. 1993 */ 1994 do_async = 1; 1995 1996 /* don't ever condition on bp->b_flags & B_WRITE. 1997 * always condition on B_READ instead */ 1998 1999 rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ? 2000 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE, 2001 do_async, raid_addr, num_blocks, 2002 bp->b_data, bp, RF_DAG_NONBLOCKING_IO); 2003 2004 done: 2005 return rc; 2006 } 2007 2008 /* invoke an I/O from kernel mode. Disk queue should be locked upon entry */ 2009 2010 int 2011 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req) 2012 { 2013 int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE; 2014 struct buf *bp; 2015 2016 req->queue = queue; 2017 bp = req->bp; 2018 2019 switch (req->type) { 2020 case RF_IO_TYPE_NOP: /* used primarily to unlock a locked queue */ 2021 /* XXX need to do something extra here.. 
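		 * (A NOP never touches the disk: the code below just bumps
		 * numOutstanding and completes the buffer immediately via
		 * KernelWakeupFunc().)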
		 */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf("WAKEUP CALLED\n");
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			req->type, queue->raidPtr->raidid,
			queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return (0);
}
/* this is the callback function associated with an I/O invoked from
   kernel code.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		    rf_ds_optimal) ||
		    (queue->raidPtr->Disks[queue->col].status ==
		    rf_ds_used_spare)) &&
		    (queue->raidPtr->numFailures <
		    queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d).
Marking %s as failed.\n", 2127 queue->raidPtr->raidid, 2128 bp->b_error, 2129 queue->raidPtr->Disks[queue->col].devname); 2130 queue->raidPtr->Disks[queue->col].status = 2131 rf_ds_failed; 2132 queue->raidPtr->status = rf_rs_degraded; 2133 queue->raidPtr->numFailures++; 2134 queue->raidPtr->numNewFailures++; 2135 } else { /* Disk is already dead... */ 2136 /* printf("Disk already marked as dead!\n"); */ 2137 } 2138 2139 } 2140 2141 /* Fill in the error value */ 2142 req->error = bp->b_error; 2143 2144 /* Drop this one on the "finished" queue... */ 2145 TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries); 2146 2147 /* Let the raidio thread know there is work to be done. */ 2148 rf_signal_cond2(queue->raidPtr->iodone_cv); 2149 2150 rf_unlock_mutex2(queue->raidPtr->iodone_lock); 2151 } 2152 2153 2154 /* 2155 * initialize a buf structure for doing an I/O in the kernel. 2156 */ 2157 static void 2158 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev, 2159 RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf, 2160 void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector, 2161 struct proc *b_proc) 2162 { 2163 /* bp->b_flags = B_PHYS | rw_flag; */ 2164 bp->b_flags = rw_flag; /* XXX need B_PHYS here too??? */ 2165 bp->b_oflags = 0; 2166 bp->b_cflags = 0; 2167 bp->b_bcount = numSect << logBytesPerSector; 2168 bp->b_bufsize = bp->b_bcount; 2169 bp->b_error = 0; 2170 bp->b_dev = dev; 2171 bp->b_data = bf; 2172 bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT; 2173 bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */ 2174 if (bp->b_bcount == 0) { 2175 panic("bp->b_bcount is zero in InitBP!!"); 2176 } 2177 bp->b_proc = b_proc; 2178 bp->b_iodone = cbFunc; 2179 bp->b_private = cbArg; 2180 } 2181 2182 /* 2183 * Wait interruptibly for an exclusive lock. 2184 * 2185 * XXX 2186 * Several drivers do this; it should be abstracted and made MP-safe. 2187 * (Hmm... where have we seen this warning before :-> GO ) 2188 */ 2189 static int 2190 raidlock(struct raid_softc *rs) 2191 { 2192 int error; 2193 2194 error = 0; 2195 mutex_enter(&rs->sc_mutex); 2196 while ((rs->sc_flags & RAIDF_LOCKED) != 0) { 2197 rs->sc_flags |= RAIDF_WANTED; 2198 error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex); 2199 if (error != 0) 2200 goto done; 2201 } 2202 rs->sc_flags |= RAIDF_LOCKED; 2203 done: 2204 mutex_exit(&rs->sc_mutex); 2205 return (error); 2206 } 2207 /* 2208 * Unlock and wake up any waiters. 
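 *
 * Typical caller pattern (a sketch; cf. raid_detach() below):
 *
 *	if ((error = raidlock(rs)) != 0)
 *		return error;
 *	... fiddle with the softc ...
 *	raidunlock(rs);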
2209 */ 2210 static void 2211 raidunlock(struct raid_softc *rs) 2212 { 2213 2214 mutex_enter(&rs->sc_mutex); 2215 rs->sc_flags &= ~RAIDF_LOCKED; 2216 if ((rs->sc_flags & RAIDF_WANTED) != 0) { 2217 rs->sc_flags &= ~RAIDF_WANTED; 2218 cv_broadcast(&rs->sc_cv); 2219 } 2220 mutex_exit(&rs->sc_mutex); 2221 } 2222 2223 2224 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */ 2225 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */ 2226 #define RF_PARITY_MAP_SIZE RF_PARITYMAP_NBYTE 2227 2228 static daddr_t 2229 rf_component_info_offset(void) 2230 { 2231 2232 return RF_COMPONENT_INFO_OFFSET; 2233 } 2234 2235 static daddr_t 2236 rf_component_info_size(unsigned secsize) 2237 { 2238 daddr_t info_size; 2239 2240 KASSERT(secsize); 2241 if (secsize > RF_COMPONENT_INFO_SIZE) 2242 info_size = secsize; 2243 else 2244 info_size = RF_COMPONENT_INFO_SIZE; 2245 2246 return info_size; 2247 } 2248 2249 static daddr_t 2250 rf_parity_map_offset(RF_Raid_t *raidPtr) 2251 { 2252 daddr_t map_offset; 2253 2254 KASSERT(raidPtr->bytesPerSector); 2255 if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE) 2256 map_offset = raidPtr->bytesPerSector; 2257 else 2258 map_offset = RF_COMPONENT_INFO_SIZE; 2259 map_offset += rf_component_info_offset(); 2260 2261 return map_offset; 2262 } 2263 2264 static daddr_t 2265 rf_parity_map_size(RF_Raid_t *raidPtr) 2266 { 2267 daddr_t map_size; 2268 2269 if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE) 2270 map_size = raidPtr->bytesPerSector; 2271 else 2272 map_size = RF_PARITY_MAP_SIZE; 2273 2274 return map_size; 2275 } 2276 2277 int 2278 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col) 2279 { 2280 RF_ComponentLabel_t *clabel; 2281 2282 clabel = raidget_component_label(raidPtr, col); 2283 clabel->clean = RF_RAID_CLEAN; 2284 raidflush_component_label(raidPtr, col); 2285 return(0); 2286 } 2287 2288 2289 int 2290 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col) 2291 { 2292 RF_ComponentLabel_t *clabel; 2293 2294 clabel = raidget_component_label(raidPtr, col); 2295 clabel->clean = RF_RAID_DIRTY; 2296 raidflush_component_label(raidPtr, col); 2297 return(0); 2298 } 2299 2300 int 2301 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col) 2302 { 2303 KASSERT(raidPtr->bytesPerSector); 2304 return raidread_component_label(raidPtr->bytesPerSector, 2305 raidPtr->Disks[col].dev, 2306 raidPtr->raid_cinfo[col].ci_vp, 2307 &raidPtr->raid_cinfo[col].ci_label); 2308 } 2309 2310 RF_ComponentLabel_t * 2311 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col) 2312 { 2313 return &raidPtr->raid_cinfo[col].ci_label; 2314 } 2315 2316 int 2317 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col) 2318 { 2319 RF_ComponentLabel_t *label; 2320 2321 label = &raidPtr->raid_cinfo[col].ci_label; 2322 label->mod_counter = raidPtr->mod_counter; 2323 #ifndef RF_NO_PARITY_MAP 2324 label->parity_map_modcount = label->mod_counter; 2325 #endif 2326 return raidwrite_component_label(raidPtr->bytesPerSector, 2327 raidPtr->Disks[col].dev, 2328 raidPtr->raid_cinfo[col].ci_vp, label); 2329 } 2330 2331 2332 static int 2333 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp, 2334 RF_ComponentLabel_t *clabel) 2335 { 2336 return raidread_component_area(dev, b_vp, clabel, 2337 sizeof(RF_ComponentLabel_t), 2338 rf_component_info_offset(), 2339 rf_component_info_size(secsize)); 2340 } 2341 2342 /* ARGSUSED */ 2343 static int 2344 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data, 2345 size_t msize, daddr_t offset, daddr_t dsize) 2346 { 2347 struct buf *bp; 2348 int error; 2349 
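	/*
	 * Component-label geometry, per the helpers above: the label
	 * lives RF_COMPONENT_INFO_OFFSET (16384) bytes into the
	 * component and occupies the larger of RF_COMPONENT_INFO_SIZE
	 * bytes or one sector; the parity map follows directly after.
	 */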
2350 /* XXX should probably ensure that we don't try to do this if 2351 someone has changed rf_protected_sectors. */ 2352 2353 if (b_vp == NULL) { 2354 /* For whatever reason, this component is not valid. 2355 Don't try to read a component label from it. */ 2356 return(EINVAL); 2357 } 2358 2359 /* get a block of the appropriate size... */ 2360 bp = geteblk((int)dsize); 2361 bp->b_dev = dev; 2362 2363 /* get our ducks in a row for the read */ 2364 bp->b_blkno = offset / DEV_BSIZE; 2365 bp->b_bcount = dsize; 2366 bp->b_flags |= B_READ; 2367 bp->b_resid = dsize; 2368 2369 bdev_strategy(bp); 2370 error = biowait(bp); 2371 2372 if (!error) { 2373 memcpy(data, bp->b_data, msize); 2374 } 2375 2376 brelse(bp, 0); 2377 return(error); 2378 } 2379 2380 2381 static int 2382 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp, 2383 RF_ComponentLabel_t *clabel) 2384 { 2385 return raidwrite_component_area(dev, b_vp, clabel, 2386 sizeof(RF_ComponentLabel_t), 2387 rf_component_info_offset(), 2388 rf_component_info_size(secsize), 0); 2389 } 2390 2391 /* ARGSUSED */ 2392 static int 2393 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data, 2394 size_t msize, daddr_t offset, daddr_t dsize, int asyncp) 2395 { 2396 struct buf *bp; 2397 int error; 2398 2399 /* get a block of the appropriate size... */ 2400 bp = geteblk((int)dsize); 2401 bp->b_dev = dev; 2402 2403 /* get our ducks in a row for the write */ 2404 bp->b_blkno = offset / DEV_BSIZE; 2405 bp->b_bcount = dsize; 2406 bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0); 2407 bp->b_resid = dsize; 2408 2409 memset(bp->b_data, 0, dsize); 2410 memcpy(bp->b_data, data, msize); 2411 2412 bdev_strategy(bp); 2413 if (asyncp) 2414 return 0; 2415 error = biowait(bp); 2416 brelse(bp, 0); 2417 if (error) { 2418 #if 1 2419 printf("Failed to write RAID component info!\n"); 2420 #endif 2421 } 2422 2423 return(error); 2424 } 2425 2426 void 2427 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map) 2428 { 2429 int c; 2430 2431 for (c = 0; c < raidPtr->numCol; c++) { 2432 /* Skip dead disks. */ 2433 if (RF_DEAD_DISK(raidPtr->Disks[c].status)) 2434 continue; 2435 /* XXXjld: what if an error occurs here? */ 2436 raidwrite_component_area(raidPtr->Disks[c].dev, 2437 raidPtr->raid_cinfo[c].ci_vp, map, 2438 RF_PARITYMAP_NBYTE, 2439 rf_parity_map_offset(raidPtr), 2440 rf_parity_map_size(raidPtr), 0); 2441 } 2442 } 2443 2444 void 2445 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map) 2446 { 2447 struct rf_paritymap_ondisk tmp; 2448 int c,first; 2449 2450 first=1; 2451 for (c = 0; c < raidPtr->numCol; c++) { 2452 /* Skip dead disks. 
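		 * (Each live component carries its own copy of the parity
		 * map; the copies are folded together below via
		 * rf_paritymap_merge().)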
*/ 2453 if (RF_DEAD_DISK(raidPtr->Disks[c].status)) 2454 continue; 2455 raidread_component_area(raidPtr->Disks[c].dev, 2456 raidPtr->raid_cinfo[c].ci_vp, &tmp, 2457 RF_PARITYMAP_NBYTE, 2458 rf_parity_map_offset(raidPtr), 2459 rf_parity_map_size(raidPtr)); 2460 if (first) { 2461 memcpy(map, &tmp, sizeof(*map)); 2462 first = 0; 2463 } else { 2464 rf_paritymap_merge(map, &tmp); 2465 } 2466 } 2467 } 2468 2469 void 2470 rf_markalldirty(RF_Raid_t *raidPtr) 2471 { 2472 RF_ComponentLabel_t *clabel; 2473 int sparecol; 2474 int c; 2475 int j; 2476 int scol = -1; 2477 2478 raidPtr->mod_counter++; 2479 for (c = 0; c < raidPtr->numCol; c++) { 2480 /* we don't want to touch (at all) a disk that has 2481 failed */ 2482 if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) { 2483 clabel = raidget_component_label(raidPtr, c); 2484 if (clabel->status == rf_ds_spared) { 2485 /* XXX do something special... 2486 but whatever you do, don't 2487 try to access it!! */ 2488 } else { 2489 raidmarkdirty(raidPtr, c); 2490 } 2491 } 2492 } 2493 2494 for( c = 0; c < raidPtr->numSpare ; c++) { 2495 sparecol = raidPtr->numCol + c; 2496 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) { 2497 /* 2498 2499 we claim this disk is "optimal" if it's 2500 rf_ds_used_spare, as that means it should be 2501 directly substitutable for the disk it replaced. 2502 We note that too... 2503 2504 */ 2505 2506 for(j=0;j<raidPtr->numCol;j++) { 2507 if (raidPtr->Disks[j].spareCol == sparecol) { 2508 scol = j; 2509 break; 2510 } 2511 } 2512 2513 clabel = raidget_component_label(raidPtr, sparecol); 2514 /* make sure status is noted */ 2515 2516 raid_init_component_label(raidPtr, clabel); 2517 2518 clabel->row = 0; 2519 clabel->column = scol; 2520 /* Note: we *don't* change status from rf_ds_used_spare 2521 to rf_ds_optimal */ 2522 /* clabel.status = rf_ds_optimal; */ 2523 2524 raidmarkdirty(raidPtr, sparecol); 2525 } 2526 } 2527 } 2528 2529 2530 void 2531 rf_update_component_labels(RF_Raid_t *raidPtr, int final) 2532 { 2533 RF_ComponentLabel_t *clabel; 2534 int sparecol; 2535 int c; 2536 int j; 2537 int scol; 2538 struct raid_softc *rs = raidPtr->softc; 2539 2540 scol = -1; 2541 2542 /* XXX should do extra checks to make sure things really are clean, 2543 rather than blindly setting the clean bit... */ 2544 2545 raidPtr->mod_counter++; 2546 2547 for (c = 0; c < raidPtr->numCol; c++) { 2548 if (raidPtr->Disks[c].status == rf_ds_optimal) { 2549 clabel = raidget_component_label(raidPtr, c); 2550 /* make sure status is noted */ 2551 clabel->status = rf_ds_optimal; 2552 2553 /* note what unit we are configured as */ 2554 if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0) 2555 clabel->last_unit = raidPtr->raidid; 2556 2557 raidflush_component_label(raidPtr, c); 2558 if (final == RF_FINAL_COMPONENT_UPDATE) { 2559 if (raidPtr->parity_good == RF_RAID_CLEAN) { 2560 raidmarkclean(raidPtr, c); 2561 } 2562 } 2563 } 2564 /* else we don't touch it.. */ 2565 } 2566 2567 for( c = 0; c < raidPtr->numSpare ; c++) { 2568 sparecol = raidPtr->numCol + c; 2569 /* Need to ensure that the reconstruct actually completed! */ 2570 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) { 2571 /* 2572 2573 we claim this disk is "optimal" if it's 2574 rf_ds_used_spare, as that means it should be 2575 directly substitutable for the disk it replaced. 2576 We note that too... 2577 2578 */ 2579 2580 for(j=0;j<raidPtr->numCol;j++) { 2581 if (raidPtr->Disks[j].spareCol == sparecol) { 2582 scol = j; 2583 break; 2584 } 2585 } 2586 2587 /* XXX shouldn't *really* need this... 
*/ 2588 clabel = raidget_component_label(raidPtr, sparecol); 2589 /* make sure status is noted */ 2590 2591 raid_init_component_label(raidPtr, clabel); 2592 2593 clabel->column = scol; 2594 clabel->status = rf_ds_optimal; 2595 if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0) 2596 clabel->last_unit = raidPtr->raidid; 2597 2598 raidflush_component_label(raidPtr, sparecol); 2599 if (final == RF_FINAL_COMPONENT_UPDATE) { 2600 if (raidPtr->parity_good == RF_RAID_CLEAN) { 2601 raidmarkclean(raidPtr, sparecol); 2602 } 2603 } 2604 } 2605 } 2606 } 2607 2608 void 2609 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured) 2610 { 2611 2612 if (vp != NULL) { 2613 if (auto_configured == 1) { 2614 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 2615 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED); 2616 vput(vp); 2617 2618 } else { 2619 (void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred); 2620 } 2621 } 2622 } 2623 2624 2625 void 2626 rf_UnconfigureVnodes(RF_Raid_t *raidPtr) 2627 { 2628 int r,c; 2629 struct vnode *vp; 2630 int acd; 2631 2632 2633 /* We take this opportunity to close the vnodes like we should.. */ 2634 2635 for (c = 0; c < raidPtr->numCol; c++) { 2636 vp = raidPtr->raid_cinfo[c].ci_vp; 2637 acd = raidPtr->Disks[c].auto_configured; 2638 rf_close_component(raidPtr, vp, acd); 2639 raidPtr->raid_cinfo[c].ci_vp = NULL; 2640 raidPtr->Disks[c].auto_configured = 0; 2641 } 2642 2643 for (r = 0; r < raidPtr->numSpare; r++) { 2644 vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp; 2645 acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured; 2646 rf_close_component(raidPtr, vp, acd); 2647 raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL; 2648 raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0; 2649 } 2650 } 2651 2652 2653 void 2654 rf_ReconThread(struct rf_recon_req *req) 2655 { 2656 int s; 2657 RF_Raid_t *raidPtr; 2658 2659 s = splbio(); 2660 raidPtr = (RF_Raid_t *) req->raidPtr; 2661 raidPtr->recon_in_progress = 1; 2662 2663 rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col, 2664 ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0)); 2665 2666 RF_Free(req, sizeof(*req)); 2667 2668 raidPtr->recon_in_progress = 0; 2669 splx(s); 2670 2671 /* That's all... */ 2672 kthread_exit(0); /* does not return */ 2673 } 2674 2675 void 2676 rf_RewriteParityThread(RF_Raid_t *raidPtr) 2677 { 2678 int retcode; 2679 int s; 2680 2681 raidPtr->parity_rewrite_stripes_done = 0; 2682 raidPtr->parity_rewrite_in_progress = 1; 2683 s = splbio(); 2684 retcode = rf_RewriteParity(raidPtr); 2685 splx(s); 2686 if (retcode) { 2687 printf("raid%d: Error re-writing parity (%d)!\n", 2688 raidPtr->raidid, retcode); 2689 } else { 2690 /* set the clean bit! If we shutdown correctly, 2691 the clean bit on each component label will get 2692 set */ 2693 raidPtr->parity_good = RF_RAID_CLEAN; 2694 } 2695 raidPtr->parity_rewrite_in_progress = 0; 2696 2697 /* Anyone waiting for us to stop? If so, inform them... */ 2698 if (raidPtr->waitShutdown) { 2699 wakeup(&raidPtr->parity_rewrite_in_progress); 2700 } 2701 2702 /* That's all... */ 2703 kthread_exit(0); /* does not return */ 2704 } 2705 2706 2707 void 2708 rf_CopybackThread(RF_Raid_t *raidPtr) 2709 { 2710 int s; 2711 2712 raidPtr->copyback_in_progress = 1; 2713 s = splbio(); 2714 rf_CopybackReconstructedData(raidPtr); 2715 splx(s); 2716 raidPtr->copyback_in_progress = 0; 2717 2718 /* That's all... 
*/ 2719 kthread_exit(0); /* does not return */ 2720 } 2721 2722 2723 void 2724 rf_ReconstructInPlaceThread(struct rf_recon_req *req) 2725 { 2726 int s; 2727 RF_Raid_t *raidPtr; 2728 2729 s = splbio(); 2730 raidPtr = req->raidPtr; 2731 raidPtr->recon_in_progress = 1; 2732 rf_ReconstructInPlace(raidPtr, req->col); 2733 RF_Free(req, sizeof(*req)); 2734 raidPtr->recon_in_progress = 0; 2735 splx(s); 2736 2737 /* That's all... */ 2738 kthread_exit(0); /* does not return */ 2739 } 2740 2741 static RF_AutoConfig_t * 2742 rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp, 2743 const char *cname, RF_SectorCount_t size, uint64_t numsecs, 2744 unsigned secsize) 2745 { 2746 int good_one = 0; 2747 RF_ComponentLabel_t *clabel; 2748 RF_AutoConfig_t *ac; 2749 2750 clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT); 2751 if (clabel == NULL) { 2752 oomem: 2753 while(ac_list) { 2754 ac = ac_list; 2755 if (ac->clabel) 2756 free(ac->clabel, M_RAIDFRAME); 2757 ac_list = ac_list->next; 2758 free(ac, M_RAIDFRAME); 2759 } 2760 printf("RAID auto config: out of memory!\n"); 2761 return NULL; /* XXX probably should panic? */ 2762 } 2763 2764 if (!raidread_component_label(secsize, dev, vp, clabel)) { 2765 /* Got the label. Does it look reasonable? */ 2766 if (rf_reasonable_label(clabel, numsecs) && 2767 (rf_component_label_partitionsize(clabel) <= size)) { 2768 #ifdef DEBUG 2769 printf("Component on: %s: %llu\n", 2770 cname, (unsigned long long)size); 2771 rf_print_component_label(clabel); 2772 #endif 2773 /* if it's reasonable, add it, else ignore it. */ 2774 ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME, 2775 M_NOWAIT); 2776 if (ac == NULL) { 2777 free(clabel, M_RAIDFRAME); 2778 goto oomem; 2779 } 2780 strlcpy(ac->devname, cname, sizeof(ac->devname)); 2781 ac->dev = dev; 2782 ac->vp = vp; 2783 ac->clabel = clabel; 2784 ac->next = ac_list; 2785 ac_list = ac; 2786 good_one = 1; 2787 } 2788 } 2789 if (!good_one) { 2790 /* cleanup */ 2791 free(clabel, M_RAIDFRAME); 2792 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 2793 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED); 2794 vput(vp); 2795 } 2796 return ac_list; 2797 } 2798 2799 RF_AutoConfig_t * 2800 rf_find_raid_components(void) 2801 { 2802 struct vnode *vp; 2803 struct disklabel label; 2804 device_t dv; 2805 deviter_t di; 2806 dev_t dev; 2807 int bmajor, bminor, wedge, rf_part_found; 2808 int error; 2809 int i; 2810 RF_AutoConfig_t *ac_list; 2811 uint64_t numsecs; 2812 unsigned secsize; 2813 int dowedges; 2814 2815 /* initialize the AutoConfig list */ 2816 ac_list = NULL; 2817 2818 /* 2819 * we begin by trolling through *all* the devices on the system *twice* 2820 * first we scan for wedges, second for other devices. This avoids 2821 * using a raw partition instead of a wedge that covers the whole disk 2822 */ 2823 2824 for (dowedges=1; dowedges>=0; --dowedges) { 2825 for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL; 2826 dv = deviter_next(&di)) { 2827 2828 /* we are only interested in disks... */ 2829 if (device_class(dv) != DV_DISK) 2830 continue; 2831 2832 /* we don't care about floppies... */ 2833 if (device_is_a(dv, "fd")) { 2834 continue; 2835 } 2836 2837 /* we don't care about CD's... */ 2838 if (device_is_a(dv, "cd")) { 2839 continue; 2840 } 2841 2842 /* we don't care about md's... 
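			 * (md(4) is a RAM-backed pseudo-disk, so it cannot
			 * hold a persistent RAID component)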
*/ 2843 if (device_is_a(dv, "md")) { 2844 continue; 2845 } 2846 2847 /* hdfd is the Atari/Hades floppy driver */ 2848 if (device_is_a(dv, "hdfd")) { 2849 continue; 2850 } 2851 2852 /* fdisa is the Atari/Milan floppy driver */ 2853 if (device_is_a(dv, "fdisa")) { 2854 continue; 2855 } 2856 2857 /* are we in the wedges pass ? */ 2858 wedge = device_is_a(dv, "dk"); 2859 if (wedge != dowedges) { 2860 continue; 2861 } 2862 2863 /* need to find the device_name_to_block_device_major stuff */ 2864 bmajor = devsw_name2blk(device_xname(dv), NULL, 0); 2865 2866 rf_part_found = 0; /*No raid partition as yet*/ 2867 2868 /* get a vnode for the raw partition of this disk */ 2869 bminor = minor(device_unit(dv)); 2870 dev = wedge ? makedev(bmajor, bminor) : 2871 MAKEDISKDEV(bmajor, bminor, RAW_PART); 2872 if (bdevvp(dev, &vp)) 2873 panic("RAID can't alloc vnode"); 2874 2875 error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED); 2876 2877 if (error) { 2878 /* "Who cares." Continue looking 2879 for something that exists*/ 2880 vput(vp); 2881 continue; 2882 } 2883 2884 error = getdisksize(vp, &numsecs, &secsize); 2885 if (error) { 2886 /* 2887 * Pseudo devices like vnd and cgd can be 2888 * opened but may still need some configuration. 2889 * Ignore these quietly. 2890 */ 2891 if (error != ENXIO) 2892 printf("RAIDframe: can't get disk size" 2893 " for dev %s (%d)\n", 2894 device_xname(dv), error); 2895 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 2896 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED); 2897 vput(vp); 2898 continue; 2899 } 2900 if (wedge) { 2901 struct dkwedge_info dkw; 2902 error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD, 2903 NOCRED); 2904 if (error) { 2905 printf("RAIDframe: can't get wedge info for " 2906 "dev %s (%d)\n", device_xname(dv), error); 2907 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 2908 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED); 2909 vput(vp); 2910 continue; 2911 } 2912 2913 if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) { 2914 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 2915 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED); 2916 vput(vp); 2917 continue; 2918 } 2919 2920 ac_list = rf_get_component(ac_list, dev, vp, 2921 device_xname(dv), dkw.dkw_size, numsecs, secsize); 2922 rf_part_found = 1; /*There is a raid component on this disk*/ 2923 continue; 2924 } 2925 2926 /* Ok, the disk exists. Go get the disklabel. */ 2927 error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED); 2928 if (error) { 2929 /* 2930 * XXX can't happen - open() would 2931 * have errored out (or faked up one) 2932 */ 2933 if (error != ENOTTY) 2934 printf("RAIDframe: can't get label for dev " 2935 "%s (%d)\n", device_xname(dv), error); 2936 } 2937 2938 /* don't need this any more. We'll allocate it again 2939 a little later if we really do... */ 2940 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 2941 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED); 2942 vput(vp); 2943 2944 if (error) 2945 continue; 2946 2947 rf_part_found = 0; /*No raid partitions yet*/ 2948 for (i = 0; i < label.d_npartitions; i++) { 2949 char cname[sizeof(ac_list->devname)]; 2950 2951 /* We only support partitions marked as RAID */ 2952 if (label.d_partitions[i].p_fstype != FS_RAID) 2953 continue; 2954 2955 dev = MAKEDISKDEV(bmajor, device_unit(dv), i); 2956 if (bdevvp(dev, &vp)) 2957 panic("RAID can't alloc vnode"); 2958 2959 error = VOP_OPEN(vp, FREAD, NOCRED); 2960 if (error) { 2961 /* Whatever... 
*/ 2962 vput(vp); 2963 continue; 2964 } 2965 snprintf(cname, sizeof(cname), "%s%c", 2966 device_xname(dv), 'a' + i); 2967 ac_list = rf_get_component(ac_list, dev, vp, cname, 2968 label.d_partitions[i].p_size, numsecs, secsize); 2969 rf_part_found = 1; /*There is at least one raid partition on this disk*/ 2970 } 2971 2972 /* 2973 *If there is no raid component on this disk, either in a 2974 *disklabel or inside a wedge, check the raw partition as well, 2975 *as it is possible to configure raid components on raw disk 2976 *devices. 2977 */ 2978 2979 if (!rf_part_found) { 2980 char cname[sizeof(ac_list->devname)]; 2981 2982 dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART); 2983 if (bdevvp(dev, &vp)) 2984 panic("RAID can't alloc vnode"); 2985 2986 error = VOP_OPEN(vp, FREAD, NOCRED); 2987 if (error) { 2988 /* Whatever... */ 2989 vput(vp); 2990 continue; 2991 } 2992 snprintf(cname, sizeof(cname), "%s%c", 2993 device_xname(dv), 'a' + RAW_PART); 2994 ac_list = rf_get_component(ac_list, dev, vp, cname, 2995 label.d_partitions[RAW_PART].p_size, numsecs, secsize); 2996 } 2997 } 2998 deviter_release(&di); 2999 } 3000 return ac_list; 3001 } 3002 3003 3004 int 3005 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs) 3006 { 3007 3008 if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) || 3009 (clabel->version==RF_COMPONENT_LABEL_VERSION)) && 3010 ((clabel->clean == RF_RAID_CLEAN) || 3011 (clabel->clean == RF_RAID_DIRTY)) && 3012 clabel->row >=0 && 3013 clabel->column >= 0 && 3014 clabel->num_rows > 0 && 3015 clabel->num_columns > 0 && 3016 clabel->row < clabel->num_rows && 3017 clabel->column < clabel->num_columns && 3018 clabel->blockSize > 0 && 3019 /* 3020 * numBlocksHi may contain garbage, but it is ok since 3021 * the type is unsigned. If it is really garbage, 3022 * rf_fix_old_label_size() will fix it. 3023 */ 3024 rf_component_label_numblocks(clabel) > 0) { 3025 /* 3026 * label looks reasonable enough... 3027 * let's make sure it has no old garbage. 3028 */ 3029 if (numsecs) 3030 rf_fix_old_label_size(clabel, numsecs); 3031 return(1); 3032 } 3033 return(0); 3034 } 3035 3036 3037 /* 3038 * For reasons yet unknown, some old component labels have garbage in 3039 * the newer numBlocksHi region, and this causes lossage. Since those 3040 * disks will also have numsecs set to less than 32 bits of sectors, 3041 * we can determine when this corruption has occurred, and fix it. 3042 * 3043 * The exact same problem, with the same unknown reason, happens to 3044 * the partitionSizeHi member as well. 
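 *
 * Since such a component has fewer than 2^32 sectors, a non-zero
 * *Hi field cannot be legitimate, and clearing it is safe.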
3045 */ 3046 static void 3047 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs) 3048 { 3049 3050 if (numsecs < ((uint64_t)1 << 32)) { 3051 if (clabel->numBlocksHi) { 3052 printf("WARNING: total sectors < 32 bits, yet " 3053 "numBlocksHi set\n" 3054 "WARNING: resetting numBlocksHi to zero.\n"); 3055 clabel->numBlocksHi = 0; 3056 } 3057 3058 if (clabel->partitionSizeHi) { 3059 printf("WARNING: total sectors < 32 bits, yet " 3060 "partitionSizeHi set\n" 3061 "WARNING: resetting partitionSizeHi to zero.\n"); 3062 clabel->partitionSizeHi = 0; 3063 } 3064 } 3065 } 3066 3067 3068 #ifdef DEBUG 3069 void 3070 rf_print_component_label(RF_ComponentLabel_t *clabel) 3071 { 3072 uint64_t numBlocks; 3073 static const char *rp[] = { 3074 "No", "Force", "Soft", "*invalid*" 3075 }; 3076 3077 3078 numBlocks = rf_component_label_numblocks(clabel); 3079 3080 printf(" Row: %d Column: %d Num Rows: %d Num Columns: %d\n", 3081 clabel->row, clabel->column, 3082 clabel->num_rows, clabel->num_columns); 3083 printf(" Version: %d Serial Number: %d Mod Counter: %d\n", 3084 clabel->version, clabel->serial_number, 3085 clabel->mod_counter); 3086 printf(" Clean: %s Status: %d\n", 3087 clabel->clean ? "Yes" : "No", clabel->status); 3088 printf(" sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n", 3089 clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU); 3090 printf(" RAID Level: %c blocksize: %d numBlocks: %"PRIu64"\n", 3091 (char) clabel->parityConfig, clabel->blockSize, numBlocks); 3092 printf(" Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No"); 3093 printf(" Root partition: %s\n", rp[clabel->root_partition & 3]); 3094 printf(" Last configured as: raid%d\n", clabel->last_unit); 3095 #if 0 3096 printf(" Config order: %d\n", clabel->config_order); 3097 #endif 3098 3099 } 3100 #endif 3101 3102 RF_ConfigSet_t * 3103 rf_create_auto_sets(RF_AutoConfig_t *ac_list) 3104 { 3105 RF_AutoConfig_t *ac; 3106 RF_ConfigSet_t *config_sets; 3107 RF_ConfigSet_t *cset; 3108 RF_AutoConfig_t *ac_next; 3109 3110 3111 config_sets = NULL; 3112 3113 /* Go through the AutoConfig list, and figure out which components 3114 belong to what sets. */ 3115 ac = ac_list; 3116 while(ac!=NULL) { 3117 /* we're going to putz with ac->next, so save it here 3118 for use at the end of the loop */ 3119 ac_next = ac->next; 3120 3121 if (config_sets == NULL) { 3122 /* will need at least this one... */ 3123 config_sets = (RF_ConfigSet_t *) 3124 malloc(sizeof(RF_ConfigSet_t), 3125 M_RAIDFRAME, M_NOWAIT); 3126 if (config_sets == NULL) { 3127 panic("rf_create_auto_sets: No memory!"); 3128 } 3129 /* this one is easy :) */ 3130 config_sets->ac = ac; 3131 config_sets->next = NULL; 3132 config_sets->rootable = 0; 3133 ac->next = NULL; 3134 } else { 3135 /* which set does this component fit into? */ 3136 cset = config_sets; 3137 while(cset!=NULL) { 3138 if (rf_does_it_fit(cset, ac)) { 3139 /* looks like it matches... */ 3140 ac->next = cset->ac; 3141 cset->ac = ac; 3142 break; 3143 } 3144 cset = cset->next; 3145 } 3146 if (cset==NULL) { 3147 /* didn't find a match above... 
new set..*/
				cset = (RF_ConfigSet_t *)
					malloc(sizeof(RF_ConfigSet_t),
					    M_RAIDFRAME, M_NOWAIT);
				if (cset == NULL) {
					panic("rf_create_auto_sets: No memory!");
				}
				cset->ac = ac;
				ac->next = NULL;
				cset->next = config_sets;
				cset->rootable = 0;
				config_sets = cset;
			}
		}
		ac = ac_next;
	}


	return(config_sets);
}

static int
rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
{
	RF_ComponentLabel_t *clabel1, *clabel2;

	/* If this one matches the *first* one in the set, that's good
	   enough, since the other members of the set would have been
	   through here too... */
	/* note that we are not checking partitionSize here..

	   Note that we are also not checking the mod_counters here.
	   If everything else matches except the mod_counter, that's
	   good enough for this test.  We will deal with the mod_counters
	   a little later in the autoconfiguration process.

	   (clabel1->mod_counter == clabel2->mod_counter) &&

	   The reason we don't check for this is that failed disks
	   will have lower modification counts.  If those disks are
	   not added to the set they used to belong to, then they will
	   form their own set, which may result in 2 different sets,
	   for example, competing to be configured at raid0, and
	   perhaps competing to be the root filesystem set.  If the
	   wrong ones get configured, or both attempt to become /,
	   weird behaviour and/or serious lossage will occur.  Thus we
	   need to bring them into the fold here, and kick them out at
	   a later point.

	*/

	clabel1 = cset->ac->clabel;
	clabel2 = ac->clabel;
	if ((clabel1->version == clabel2->version) &&
	    (clabel1->serial_number == clabel2->serial_number) &&
	    (clabel1->num_rows == clabel2->num_rows) &&
	    (clabel1->num_columns == clabel2->num_columns) &&
	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
	    (clabel1->parityConfig == clabel2->parityConfig) &&
	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
	    (clabel1->blockSize == clabel2->blockSize) &&
	    rf_component_label_numblocks(clabel1) ==
	    rf_component_label_numblocks(clabel2) &&
	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
	    (clabel1->root_partition == clabel2->root_partition) &&
	    (clabel1->last_unit == clabel2->last_unit) &&
	    (clabel1->config_order == clabel2->config_order)) {
		/* if it gets here, it almost *has* to be a match */
	} else {
		/* it's not consistent with somebody in the set..
		   punt */
		return(0);
	}
	/* all was fine.. it must fit... */
	return(1);
}

int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set.
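	 * (The largest mod_counter seen wins; components carrying a stale
	 * counter simply fail to match below and are counted as missing.)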
*/ 3250 3251 mod_counter_found = 0; 3252 mod_counter = 0; 3253 ac = cset->ac; 3254 while(ac!=NULL) { 3255 if (mod_counter_found==0) { 3256 mod_counter = ac->clabel->mod_counter; 3257 mod_counter_found = 1; 3258 } else { 3259 if (ac->clabel->mod_counter > mod_counter) { 3260 mod_counter = ac->clabel->mod_counter; 3261 } 3262 } 3263 ac = ac->next; 3264 } 3265 3266 num_missing = 0; 3267 auto_config = cset->ac; 3268 3269 even_pair_failed = 0; 3270 for(c=0; c<num_cols; c++) { 3271 ac = auto_config; 3272 while(ac!=NULL) { 3273 if ((ac->clabel->column == c) && 3274 (ac->clabel->mod_counter == mod_counter)) { 3275 /* it's this one... */ 3276 #ifdef DEBUG 3277 printf("Found: %s at %d\n", 3278 ac->devname,c); 3279 #endif 3280 break; 3281 } 3282 ac=ac->next; 3283 } 3284 if (ac==NULL) { 3285 /* Didn't find one here! */ 3286 /* special case for RAID 1, especially 3287 where there are more than 2 3288 components (where RAIDframe treats 3289 things a little differently :( ) */ 3290 if (parity_type == '1') { 3291 if (c%2 == 0) { /* even component */ 3292 even_pair_failed = 1; 3293 } else { /* odd component. If 3294 we're failed, and 3295 so is the even 3296 component, it's 3297 "Good Night, Charlie" */ 3298 if (even_pair_failed == 1) { 3299 return(0); 3300 } 3301 } 3302 } else { 3303 /* normal accounting */ 3304 num_missing++; 3305 } 3306 } 3307 if ((parity_type == '1') && (c%2 == 1)) { 3308 /* Just did an even component, and we didn't 3309 bail.. reset the even_pair_failed flag, 3310 and go on to the next component.... */ 3311 even_pair_failed = 0; 3312 } 3313 } 3314 3315 clabel = cset->ac->clabel; 3316 3317 if (((clabel->parityConfig == '0') && (num_missing > 0)) || 3318 ((clabel->parityConfig == '4') && (num_missing > 1)) || 3319 ((clabel->parityConfig == '5') && (num_missing > 1))) { 3320 /* XXX this needs to be made *much* more general */ 3321 /* Too many failures */ 3322 return(0); 3323 } 3324 /* otherwise, all is well, and we've got enough to take a kick 3325 at autoconfiguring this set */ 3326 return(1); 3327 } 3328 3329 void 3330 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config, 3331 RF_Raid_t *raidPtr) 3332 { 3333 RF_ComponentLabel_t *clabel; 3334 int i; 3335 3336 clabel = ac->clabel; 3337 3338 /* 1. Fill in the common stuff */ 3339 config->numRow = clabel->num_rows = 1; 3340 config->numCol = clabel->num_columns; 3341 config->numSpare = 0; /* XXX should this be set here? */ 3342 config->sectPerSU = clabel->sectPerSU; 3343 config->SUsPerPU = clabel->SUsPerPU; 3344 config->SUsPerRU = clabel->SUsPerRU; 3345 config->parityConfig = clabel->parityConfig; 3346 /* XXX... */ 3347 strcpy(config->diskQueueType,"fifo"); 3348 config->maxOutstandingDiskReqs = clabel->maxOutstanding; 3349 config->layoutSpecificSize = 0; /* XXX ?? 
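	 * (auto-configuration supplies no layout-specific block, so
	 * zero appears to be the right thing here)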
*/ 3350 3351 while(ac!=NULL) { 3352 /* row/col values will be in range due to the checks 3353 in reasonable_label() */ 3354 strcpy(config->devnames[0][ac->clabel->column], 3355 ac->devname); 3356 ac = ac->next; 3357 } 3358 3359 for(i=0;i<RF_MAXDBGV;i++) { 3360 config->debugVars[i][0] = 0; 3361 } 3362 } 3363 3364 int 3365 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value) 3366 { 3367 RF_ComponentLabel_t *clabel; 3368 int column; 3369 int sparecol; 3370 3371 raidPtr->autoconfigure = new_value; 3372 3373 for(column=0; column<raidPtr->numCol; column++) { 3374 if (raidPtr->Disks[column].status == rf_ds_optimal) { 3375 clabel = raidget_component_label(raidPtr, column); 3376 clabel->autoconfigure = new_value; 3377 raidflush_component_label(raidPtr, column); 3378 } 3379 } 3380 for(column = 0; column < raidPtr->numSpare ; column++) { 3381 sparecol = raidPtr->numCol + column; 3382 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) { 3383 clabel = raidget_component_label(raidPtr, sparecol); 3384 clabel->autoconfigure = new_value; 3385 raidflush_component_label(raidPtr, sparecol); 3386 } 3387 } 3388 return(new_value); 3389 } 3390 3391 int 3392 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value) 3393 { 3394 RF_ComponentLabel_t *clabel; 3395 int column; 3396 int sparecol; 3397 3398 raidPtr->root_partition = new_value; 3399 for(column=0; column<raidPtr->numCol; column++) { 3400 if (raidPtr->Disks[column].status == rf_ds_optimal) { 3401 clabel = raidget_component_label(raidPtr, column); 3402 clabel->root_partition = new_value; 3403 raidflush_component_label(raidPtr, column); 3404 } 3405 } 3406 for(column = 0; column < raidPtr->numSpare ; column++) { 3407 sparecol = raidPtr->numCol + column; 3408 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) { 3409 clabel = raidget_component_label(raidPtr, sparecol); 3410 clabel->root_partition = new_value; 3411 raidflush_component_label(raidPtr, sparecol); 3412 } 3413 } 3414 return(new_value); 3415 } 3416 3417 void 3418 rf_release_all_vps(RF_ConfigSet_t *cset) 3419 { 3420 RF_AutoConfig_t *ac; 3421 3422 ac = cset->ac; 3423 while(ac!=NULL) { 3424 /* Close the vp, and give it back */ 3425 if (ac->vp) { 3426 vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY); 3427 VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED); 3428 vput(ac->vp); 3429 ac->vp = NULL; 3430 } 3431 ac = ac->next; 3432 } 3433 } 3434 3435 3436 void 3437 rf_cleanup_config_set(RF_ConfigSet_t *cset) 3438 { 3439 RF_AutoConfig_t *ac; 3440 RF_AutoConfig_t *next_ac; 3441 3442 ac = cset->ac; 3443 while(ac!=NULL) { 3444 next_ac = ac->next; 3445 /* nuke the label */ 3446 free(ac->clabel, M_RAIDFRAME); 3447 /* cleanup the config structure */ 3448 free(ac, M_RAIDFRAME); 3449 /* "next.." */ 3450 ac = next_ac; 3451 } 3452 /* and, finally, nuke the config set */ 3453 free(cset, M_RAIDFRAME); 3454 } 3455 3456 3457 void 3458 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel) 3459 { 3460 /* current version number */ 3461 clabel->version = RF_COMPONENT_LABEL_VERSION; 3462 clabel->serial_number = raidPtr->serial_number; 3463 clabel->mod_counter = raidPtr->mod_counter; 3464 3465 clabel->num_rows = 1; 3466 clabel->num_columns = raidPtr->numCol; 3467 clabel->clean = RF_RAID_DIRTY; /* not clean */ 3468 clabel->status = rf_ds_optimal; /* "It's good!" 
*/ 3469 3470 clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit; 3471 clabel->SUsPerPU = raidPtr->Layout.SUsPerPU; 3472 clabel->SUsPerRU = raidPtr->Layout.SUsPerRU; 3473 3474 clabel->blockSize = raidPtr->bytesPerSector; 3475 rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk); 3476 3477 /* XXX not portable */ 3478 clabel->parityConfig = raidPtr->Layout.map->parityConfig; 3479 clabel->maxOutstanding = raidPtr->maxOutstanding; 3480 clabel->autoconfigure = raidPtr->autoconfigure; 3481 clabel->root_partition = raidPtr->root_partition; 3482 clabel->last_unit = raidPtr->raidid; 3483 clabel->config_order = raidPtr->config_order; 3484 3485 #ifndef RF_NO_PARITY_MAP 3486 rf_paritymap_init_label(raidPtr->parity_map, clabel); 3487 #endif 3488 } 3489 3490 struct raid_softc * 3491 rf_auto_config_set(RF_ConfigSet_t *cset) 3492 { 3493 RF_Raid_t *raidPtr; 3494 RF_Config_t *config; 3495 int raidID; 3496 struct raid_softc *sc; 3497 3498 #ifdef DEBUG 3499 printf("RAID autoconfigure\n"); 3500 #endif 3501 3502 /* 1. Create a config structure */ 3503 config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO); 3504 if (config == NULL) { 3505 printf("%s: Out of mem - config!?!?\n", __func__); 3506 /* XXX do something more intelligent here. */ 3507 return NULL; 3508 } 3509 3510 /* 3511 2. Figure out what RAID ID this one is supposed to live at 3512 See if we can get the same RAID dev that it was configured 3513 on last time.. 3514 */ 3515 3516 raidID = cset->ac->clabel->last_unit; 3517 for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0; 3518 sc = raidget(++raidID, false)) 3519 continue; 3520 #ifdef DEBUG 3521 printf("Configuring raid%d:\n",raidID); 3522 #endif 3523 3524 if (sc == NULL) 3525 sc = raidget(raidID, true); 3526 if (sc == NULL) { 3527 printf("%s: Out of mem - softc!?!?\n", __func__); 3528 /* XXX do something more intelligent here. */ 3529 free(config, M_RAIDFRAME); 3530 return NULL; 3531 } 3532 3533 raidPtr = &sc->sc_r; 3534 3535 /* XXX all this stuff should be done SOMEWHERE ELSE! */ 3536 raidPtr->softc = sc; 3537 raidPtr->raidid = raidID; 3538 raidPtr->openings = RAIDOUTSTANDING; 3539 3540 /* 3. Build the configuration structure */ 3541 rf_create_configuration(cset->ac, config, raidPtr); 3542 3543 /* 4. Do the configuration */ 3544 if (rf_Configure(raidPtr, config, cset->ac) == 0) { 3545 raidinit(sc); 3546 3547 rf_markalldirty(raidPtr); 3548 raidPtr->autoconfigure = 1; /* XXX do this here? */ 3549 switch (cset->ac->clabel->root_partition) { 3550 case 1: /* Force Root */ 3551 case 2: /* Soft Root: root when boot partition part of raid */ 3552 /* 3553 * everything configured just fine. Make a note 3554 * that this set is eligible to be root, 3555 * or forced to be root 3556 */ 3557 cset->rootable = cset->ac->clabel->root_partition; 3558 /* XXX do this here? */ 3559 raidPtr->root_partition = cset->rootable; 3560 break; 3561 default: 3562 break; 3563 } 3564 } else { 3565 raidput(sc); 3566 sc = NULL; 3567 } 3568 3569 /* 5. 
Cleanup */ 3570 free(config, M_RAIDFRAME); 3571 return sc; 3572 } 3573 3574 void 3575 rf_pool_init(struct pool *p, size_t size, const char *w_chan, 3576 size_t xmin, size_t xmax) 3577 { 3578 int error; 3579 3580 pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO); 3581 pool_sethiwat(p, xmax); 3582 if ((error = pool_prime(p, xmin)) != 0) 3583 panic("%s: failed to prime pool: %d", __func__, error); 3584 pool_setlowat(p, xmin); 3585 } 3586 3587 /* 3588 * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue 3589 * to see if there is IO pending and if that IO could possibly be done 3590 * for a given RAID set. Returns 0 if IO is waiting and can be done, 1 3591 * otherwise. 3592 * 3593 */ 3594 int 3595 rf_buf_queue_check(RF_Raid_t *raidPtr) 3596 { 3597 struct raid_softc *rs; 3598 struct dk_softc *dksc; 3599 3600 rs = raidPtr->softc; 3601 dksc = &rs->sc_dksc; 3602 3603 if ((rs->sc_flags & RAIDF_INITED) == 0) 3604 return 1; 3605 3606 if (dk_strategy_pending(dksc) && raidPtr->openings > 0) { 3607 /* there is work to do */ 3608 return 0; 3609 } 3610 /* default is nothing to do */ 3611 return 1; 3612 } 3613 3614 int 3615 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr) 3616 { 3617 uint64_t numsecs; 3618 unsigned secsize; 3619 int error; 3620 3621 error = getdisksize(vp, &numsecs, &secsize); 3622 if (error == 0) { 3623 diskPtr->blockSize = secsize; 3624 diskPtr->numBlocks = numsecs - rf_protectedSectors; 3625 diskPtr->partitionSize = numsecs; 3626 return 0; 3627 } 3628 return error; 3629 } 3630 3631 static int 3632 raid_match(device_t self, cfdata_t cfdata, void *aux) 3633 { 3634 return 1; 3635 } 3636 3637 static void 3638 raid_attach(device_t parent, device_t self, void *aux) 3639 { 3640 } 3641 3642 3643 static int 3644 raid_detach(device_t self, int flags) 3645 { 3646 int error; 3647 struct raid_softc *rs = raidsoftc(self); 3648 3649 if (rs == NULL) 3650 return ENXIO; 3651 3652 if ((error = raidlock(rs)) != 0) 3653 return (error); 3654 3655 error = raid_detach_unlocked(rs); 3656 3657 raidunlock(rs); 3658 3659 /* XXX raid can be referenced here */ 3660 3661 if (error) 3662 return error; 3663 3664 /* Free the softc */ 3665 raidput(rs); 3666 3667 return 0; 3668 } 3669 3670 static void 3671 rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr) 3672 { 3673 struct dk_softc *dksc = &rs->sc_dksc; 3674 struct disk_geom *dg = &dksc->sc_dkdev.dk_geom; 3675 3676 memset(dg, 0, sizeof(*dg)); 3677 3678 dg->dg_secperunit = raidPtr->totalSectors; 3679 dg->dg_secsize = raidPtr->bytesPerSector; 3680 dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe; 3681 dg->dg_ntracks = 4 * raidPtr->numCol; 3682 3683 disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL); 3684 } 3685 3686 /* 3687 * Get cache info for all the components (including spares). 3688 * Returns intersection of all the cache flags of all disks, or first 3689 * error if any encountered. 3690 * XXXfua feature flags can change as spares are added - lock down somehow 3691 */ 3692 static int 3693 rf_get_component_caches(RF_Raid_t *raidPtr, int *data) 3694 { 3695 int c; 3696 int error; 3697 int dkwhole = 0, dkpart; 3698 3699 for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) { 3700 /* 3701 * Check any non-dead disk, even when currently being 3702 * reconstructed. 
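		 * (DKCACHE_COMBINE() keeps only the flags common to every
		 * component, so the reported cache state is the weakest
		 * guarantee in the set.)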
3703 */ 3704 if (!RF_DEAD_DISK(raidPtr->Disks[c].status) 3705 || raidPtr->Disks[c].status == rf_ds_reconstructing) { 3706 error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, 3707 DIOCGCACHE, &dkpart, FREAD, NOCRED); 3708 if (error) { 3709 if (error != ENODEV) { 3710 printf("raid%d: get cache for component %s failed\n", 3711 raidPtr->raidid, 3712 raidPtr->Disks[c].devname); 3713 } 3714 3715 return error; 3716 } 3717 3718 if (c == 0) 3719 dkwhole = dkpart; 3720 else 3721 dkwhole = DKCACHE_COMBINE(dkwhole, dkpart); 3722 } 3723 } 3724 3725 *data = dkwhole; 3726 3727 return 0; 3728 } 3729 3730 /* 3731 * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components. 3732 * We end up returning whatever error was returned by the first cache flush 3733 * that fails. 3734 */ 3735 3736 int 3737 rf_sync_component_caches(RF_Raid_t *raidPtr) 3738 { 3739 int c, sparecol; 3740 int e,error; 3741 int force = 1; 3742 3743 error = 0; 3744 for (c = 0; c < raidPtr->numCol; c++) { 3745 if (raidPtr->Disks[c].status == rf_ds_optimal) { 3746 e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC, 3747 &force, FWRITE, NOCRED); 3748 if (e) { 3749 if (e != ENODEV) 3750 printf("raid%d: cache flush to component %s failed.\n", 3751 raidPtr->raidid, raidPtr->Disks[c].devname); 3752 if (error == 0) { 3753 error = e; 3754 } 3755 } 3756 } 3757 } 3758 3759 for( c = 0; c < raidPtr->numSpare ; c++) { 3760 sparecol = raidPtr->numCol + c; 3761 /* Need to ensure that the reconstruct actually completed! */ 3762 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) { 3763 e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp, 3764 DIOCCACHESYNC, &force, FWRITE, NOCRED); 3765 if (e) { 3766 if (e != ENODEV) 3767 printf("raid%d: cache flush to component %s failed.\n", 3768 raidPtr->raidid, raidPtr->Disks[sparecol].devname); 3769 if (error == 0) { 3770 error = e; 3771 } 3772 } 3773 } 3774 } 3775 return error; 3776 } 3777 3778 /* 3779 * Module interface 3780 */ 3781 3782 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr"); 3783 3784 #ifdef _MODULE 3785 CFDRIVER_DECL(raid, DV_DISK, NULL); 3786 #endif 3787 3788 static int raid_modcmd(modcmd_t, void *); 3789 static int raid_modcmd_init(void); 3790 static int raid_modcmd_fini(void); 3791 3792 static int 3793 raid_modcmd(modcmd_t cmd, void *data) 3794 { 3795 int error; 3796 3797 error = 0; 3798 switch (cmd) { 3799 case MODULE_CMD_INIT: 3800 error = raid_modcmd_init(); 3801 break; 3802 case MODULE_CMD_FINI: 3803 error = raid_modcmd_fini(); 3804 break; 3805 default: 3806 error = ENOTTY; 3807 break; 3808 } 3809 return error; 3810 } 3811 3812 static int 3813 raid_modcmd_init(void) 3814 { 3815 int error; 3816 int bmajor, cmajor; 3817 3818 mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE); 3819 mutex_enter(&raid_lock); 3820 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0) 3821 rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM); 3822 rf_init_cond2(rf_sparet_wait_cv, "sparetw"); 3823 rf_init_cond2(rf_sparet_resp_cv, "rfgst"); 3824 3825 rf_sparet_wait_queue = rf_sparet_resp_queue = NULL; 3826 #endif 3827 3828 bmajor = cmajor = -1; 3829 error = devsw_attach("raid", &raid_bdevsw, &bmajor, 3830 &raid_cdevsw, &cmajor); 3831 if (error != 0 && error != EEXIST) { 3832 aprint_error("%s: devsw_attach failed %d\n", __func__, error); 3833 mutex_exit(&raid_lock); 3834 return error; 3835 } 3836 #ifdef _MODULE 3837 error = config_cfdriver_attach(&raid_cd); 3838 if (error != 0) { 3839 aprint_error("%s: config_cfdriver_attach failed %d\n", 3840 __func__, error); 3841 devsw_detach(&raid_bdevsw, &raid_cdevsw); 3842 
mutex_exit(&raid_lock); 3843 return error; 3844 } 3845 #endif 3846 error = config_cfattach_attach(raid_cd.cd_name, &raid_ca); 3847 if (error != 0) { 3848 aprint_error("%s: config_cfattach_attach failed %d\n", 3849 __func__, error); 3850 #ifdef _MODULE 3851 config_cfdriver_detach(&raid_cd); 3852 #endif 3853 devsw_detach(&raid_bdevsw, &raid_cdevsw); 3854 mutex_exit(&raid_lock); 3855 return error; 3856 } 3857 3858 raidautoconfigdone = false; 3859 3860 mutex_exit(&raid_lock); 3861 3862 if (error == 0) { 3863 if (rf_BootRaidframe(true) == 0) 3864 aprint_verbose("Kernelized RAIDframe activated\n"); 3865 else 3866 panic("Serious error activating RAID!!"); 3867 } 3868 3869 /* 3870 * Register a finalizer which will be used to auto-config RAID 3871 * sets once all real hardware devices have been found. 3872 */ 3873 error = config_finalize_register(NULL, rf_autoconfig); 3874 if (error != 0) { 3875 aprint_error("WARNING: unable to register RAIDframe " 3876 "finalizer\n"); 3877 error = 0; 3878 } 3879 3880 return error; 3881 } 3882 3883 static int 3884 raid_modcmd_fini(void) 3885 { 3886 int error; 3887 3888 mutex_enter(&raid_lock); 3889 3890 /* Don't allow unload if raid device(s) exist. */ 3891 if (!LIST_EMPTY(&raids)) { 3892 mutex_exit(&raid_lock); 3893 return EBUSY; 3894 } 3895 3896 error = config_cfattach_detach(raid_cd.cd_name, &raid_ca); 3897 if (error != 0) { 3898 aprint_error("%s: cannot detach cfattach\n",__func__); 3899 mutex_exit(&raid_lock); 3900 return error; 3901 } 3902 #ifdef _MODULE 3903 error = config_cfdriver_detach(&raid_cd); 3904 if (error != 0) { 3905 aprint_error("%s: cannot detach cfdriver\n",__func__); 3906 config_cfattach_attach(raid_cd.cd_name, &raid_ca); 3907 mutex_exit(&raid_lock); 3908 return error; 3909 } 3910 #endif 3911 error = devsw_detach(&raid_bdevsw, &raid_cdevsw); 3912 if (error != 0) { 3913 aprint_error("%s: cannot detach devsw\n",__func__); 3914 #ifdef _MODULE 3915 config_cfdriver_attach(&raid_cd); 3916 #endif 3917 config_cfattach_attach(raid_cd.cd_name, &raid_ca); 3918 mutex_exit(&raid_lock); 3919 return error; 3920 } 3921 rf_BootRaidframe(false); 3922 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0) 3923 rf_destroy_mutex2(rf_sparet_wait_mutex); 3924 rf_destroy_cond2(rf_sparet_wait_cv); 3925 rf_destroy_cond2(rf_sparet_resp_cv); 3926 #endif 3927 mutex_exit(&raid_lock); 3928 mutex_destroy(&raid_lock); 3929 3930 return error; 3931 } 3932