1 /* $NetBSD: rf_netbsdkintf.c,v 1.347 2016/09/19 23:37:10 jdolecek Exp $ */ 2 3 /*- 4 * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Greg Oster; Jason R. Thorpe. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * Copyright (c) 1988 University of Utah. 34 * Copyright (c) 1990, 1993 35 * The Regents of the University of California. All rights reserved. 36 * 37 * This code is derived from software contributed to Berkeley by 38 * the Systems Programming Group of the University of Utah Computer 39 * Science Department. 
40 * 41 * Redistribution and use in source and binary forms, with or without 42 * modification, are permitted provided that the following conditions 43 * are met: 44 * 1. Redistributions of source code must retain the above copyright 45 * notice, this list of conditions and the following disclaimer. 46 * 2. Redistributions in binary form must reproduce the above copyright 47 * notice, this list of conditions and the following disclaimer in the 48 * documentation and/or other materials provided with the distribution. 49 * 3. Neither the name of the University nor the names of its contributors 50 * may be used to endorse or promote products derived from this software 51 * without specific prior written permission. 52 * 53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 63 * SUCH DAMAGE. 64 * 65 * from: Utah $Hdr: cd.c 1.6 90/11/28$ 66 * 67 * @(#)cd.c 8.2 (Berkeley) 11/16/93 68 */ 69 70 /* 71 * Copyright (c) 1995 Carnegie-Mellon University. 72 * All rights reserved. 
73 * 74 * Authors: Mark Holland, Jim Zelenka 75 * 76 * Permission to use, copy, modify and distribute this software and 77 * its documentation is hereby granted, provided that both the copyright 78 * notice and this permission notice appear in all copies of the 79 * software, derivative works or modified versions, and any portions 80 * thereof, and that both notices appear in supporting documentation. 81 * 82 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 83 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 84 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 85 * 86 * Carnegie Mellon requests users of this software to return to 87 * 88 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 89 * School of Computer Science 90 * Carnegie Mellon University 91 * Pittsburgh PA 15213-3890 92 * 93 * any improvements or extensions that they make and grant Carnegie the 94 * rights to redistribute these changes. 95 */ 96 97 /*********************************************************** 98 * 99 * rf_kintf.c -- the kernel interface routines for RAIDframe 100 * 101 ***********************************************************/ 102 103 #include <sys/cdefs.h> 104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.347 2016/09/19 23:37:10 jdolecek Exp $"); 105 106 #ifdef _KERNEL_OPT 107 #include "opt_compat_netbsd.h" 108 #include "opt_raid_autoconfig.h" 109 #endif 110 111 #include <sys/param.h> 112 #include <sys/errno.h> 113 #include <sys/pool.h> 114 #include <sys/proc.h> 115 #include <sys/queue.h> 116 #include <sys/disk.h> 117 #include <sys/device.h> 118 #include <sys/stat.h> 119 #include <sys/ioctl.h> 120 #include <sys/fcntl.h> 121 #include <sys/systm.h> 122 #include <sys/vnode.h> 123 #include <sys/disklabel.h> 124 #include <sys/conf.h> 125 #include <sys/buf.h> 126 #include <sys/bufq.h> 127 #include <sys/reboot.h> 128 #include <sys/kauth.h> 129 #include <sys/module.h> 130 131 #include <prop/proplib.h> 132 133 
#include <dev/raidframe/raidframevar.h> 134 #include <dev/raidframe/raidframeio.h> 135 #include <dev/raidframe/rf_paritymap.h> 136 137 #include "rf_raid.h" 138 #include "rf_copyback.h" 139 #include "rf_dag.h" 140 #include "rf_dagflags.h" 141 #include "rf_desc.h" 142 #include "rf_diskqueue.h" 143 #include "rf_etimer.h" 144 #include "rf_general.h" 145 #include "rf_kintf.h" 146 #include "rf_options.h" 147 #include "rf_driver.h" 148 #include "rf_parityscan.h" 149 #include "rf_threadstuff.h" 150 151 #ifdef COMPAT_50 152 #include "rf_compat50.h" 153 #endif 154 155 #include "ioconf.h" 156 157 #ifdef DEBUG 158 int rf_kdebug_level = 0; 159 #define db1_printf(a) if (rf_kdebug_level > 0) printf a 160 #else /* DEBUG */ 161 #define db1_printf(a) { } 162 #endif /* DEBUG */ 163 164 #ifdef DEBUG_ROOT 165 #define DPRINTF(a, ...) printf(a, __VA_ARGS__) 166 #else 167 #define DPRINTF(a, ...) 168 #endif 169 170 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0) 171 static rf_declare_mutex2(rf_sparet_wait_mutex); 172 static rf_declare_cond2(rf_sparet_wait_cv); 173 static rf_declare_cond2(rf_sparet_resp_cv); 174 175 static RF_SparetWait_t *rf_sparet_wait_queue; /* requests to install a 176 * spare table */ 177 static RF_SparetWait_t *rf_sparet_resp_queue; /* responses from 178 * installation process */ 179 #endif 180 181 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures"); 182 183 /* prototypes */ 184 static void KernelWakeupFunc(struct buf *); 185 static void InitBP(struct buf *, struct vnode *, unsigned, 186 dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *), 187 void *, int, struct proc *); 188 struct raid_softc; 189 static void raidinit(struct raid_softc *); 190 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp); 191 192 static int raid_match(device_t, cfdata_t, void *); 193 static void raid_attach(device_t, device_t, void *); 194 static int raid_detach(device_t, int); 195 196 static int raidread_component_area(dev_t, struct vnode *, void *, 
size_t, 197 daddr_t, daddr_t); 198 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t, 199 daddr_t, daddr_t, int); 200 201 static int raidwrite_component_label(unsigned, 202 dev_t, struct vnode *, RF_ComponentLabel_t *); 203 static int raidread_component_label(unsigned, 204 dev_t, struct vnode *, RF_ComponentLabel_t *); 205 206 static int raid_diskstart(device_t, struct buf *bp); 207 static int raid_dumpblocks(device_t, void *, daddr_t, int); 208 static int raid_lastclose(device_t); 209 210 static dev_type_open(raidopen); 211 static dev_type_close(raidclose); 212 static dev_type_read(raidread); 213 static dev_type_write(raidwrite); 214 static dev_type_ioctl(raidioctl); 215 static dev_type_strategy(raidstrategy); 216 static dev_type_dump(raiddump); 217 static dev_type_size(raidsize); 218 219 const struct bdevsw raid_bdevsw = { 220 .d_open = raidopen, 221 .d_close = raidclose, 222 .d_strategy = raidstrategy, 223 .d_ioctl = raidioctl, 224 .d_dump = raiddump, 225 .d_psize = raidsize, 226 .d_discard = nodiscard, 227 .d_flag = D_DISK 228 }; 229 230 const struct cdevsw raid_cdevsw = { 231 .d_open = raidopen, 232 .d_close = raidclose, 233 .d_read = raidread, 234 .d_write = raidwrite, 235 .d_ioctl = raidioctl, 236 .d_stop = nostop, 237 .d_tty = notty, 238 .d_poll = nopoll, 239 .d_mmap = nommap, 240 .d_kqfilter = nokqfilter, 241 .d_discard = nodiscard, 242 .d_flag = D_DISK 243 }; 244 245 static struct dkdriver rf_dkdriver = { 246 .d_open = raidopen, 247 .d_close = raidclose, 248 .d_strategy = raidstrategy, 249 .d_diskstart = raid_diskstart, 250 .d_dumpblocks = raid_dumpblocks, 251 .d_lastclose = raid_lastclose, 252 .d_minphys = minphys 253 }; 254 255 struct raid_softc { 256 struct dk_softc sc_dksc; 257 int sc_unit; 258 int sc_flags; /* flags */ 259 int sc_cflags; /* configuration flags */ 260 kmutex_t sc_mutex; /* interlock mutex */ 261 kcondvar_t sc_cv; /* and the condvar */ 262 uint64_t sc_size; /* size of the raid device */ 263 char sc_xname[20]; /* 
XXX external name */ 264 RF_Raid_t sc_r; 265 LIST_ENTRY(raid_softc) sc_link; 266 }; 267 /* sc_flags */ 268 #define RAIDF_INITED 0x01 /* unit has been initialized */ 269 #define RAIDF_SHUTDOWN 0x02 /* unit is being shutdown */ 270 #define RAIDF_DETACH 0x04 /* detach after final close */ 271 #define RAIDF_WANTED 0x08 /* someone waiting to obtain a lock */ 272 #define RAIDF_LOCKED 0x10 /* unit is locked */ 273 #define RAIDF_UNIT_CHANGED 0x20 /* unit is being changed */ 274 275 #define raidunit(x) DISKUNIT(x) 276 #define raidsoftc(dev) (((struct raid_softc *)device_private(dev))->sc_r.softc) 277 278 extern struct cfdriver raid_cd; 279 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc), 280 raid_match, raid_attach, raid_detach, NULL, NULL, NULL, 281 DVF_DETACH_SHUTDOWN); 282 283 /* 284 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device. 285 * Be aware that large numbers can allow the driver to consume a lot of 286 * kernel memory, especially on writes, and in degraded mode reads. 287 * 288 * For example: with a stripe width of 64 blocks (32k) and 5 disks, 289 * a single 64K write will typically require 64K for the old data, 290 * 64K for the old parity, and 64K for the new parity, for a total 291 * of 192K (if the parity buffer is not re-used immediately). 292 * Even it if is used immediately, that's still 128K, which when multiplied 293 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data. 294 * 295 * Now in degraded mode, for example, a 64K read on the above setup may 296 * require data reconstruction, which will require *all* of the 4 remaining 297 * disks to participate -- 4 * 32K/disk == 128K again. 298 */ 299 300 #ifndef RAIDOUTSTANDING 301 #define RAIDOUTSTANDING 6 302 #endif 303 304 #define RAIDLABELDEV(dev) \ 305 (MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART)) 306 307 /* declared here, and made public, for the benefit of KVM stuff.. 
*/ 308 309 static int raidlock(struct raid_softc *); 310 static void raidunlock(struct raid_softc *); 311 312 static int raid_detach_unlocked(struct raid_softc *); 313 314 static void rf_markalldirty(RF_Raid_t *); 315 static void rf_set_geometry(struct raid_softc *, RF_Raid_t *); 316 317 void rf_ReconThread(struct rf_recon_req *); 318 void rf_RewriteParityThread(RF_Raid_t *raidPtr); 319 void rf_CopybackThread(RF_Raid_t *raidPtr); 320 void rf_ReconstructInPlaceThread(struct rf_recon_req *); 321 int rf_autoconfig(device_t); 322 void rf_buildroothack(RF_ConfigSet_t *); 323 324 RF_AutoConfig_t *rf_find_raid_components(void); 325 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *); 326 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *); 327 int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t); 328 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *); 329 int rf_set_autoconfig(RF_Raid_t *, int); 330 int rf_set_rootpartition(RF_Raid_t *, int); 331 void rf_release_all_vps(RF_ConfigSet_t *); 332 void rf_cleanup_config_set(RF_ConfigSet_t *); 333 int rf_have_enough_components(RF_ConfigSet_t *); 334 struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *); 335 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t); 336 337 /* 338 * Debugging, mostly. Set to 0 to not allow autoconfig to take place. 339 * Note that this is overridden by having RAID_AUTOCONFIG as an option 340 * in the kernel config file. 
 */
#ifdef RAID_AUTOCONFIG
int raidautoconfig = 1;
#else
int raidautoconfig = 0;
#endif
/* Set once rf_autoconfig() has run; prevents a second autoconfig pass. */
static bool raidautoconfigdone = false;

struct RF_Pools_s rf_pools;

/* All configured raid units, linked via sc_link; protected by raid_lock. */
static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
static kmutex_t raid_lock;

/*
 * Allocate and minimally initialize a softc for the given unit number.
 * The caller is responsible for inserting it on the global list.
 * NOTE(review): kmem_zalloc with KM_SLEEP should not return NULL per
 * kmem(9), so the NULL check below looks unreachable — confirm.
 */
static struct raid_softc *
raidcreate(int unit) {
	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
	if (sc == NULL) {
#ifdef DIAGNOSTIC
		printf("%s: out of memory\n", __func__);
#endif
		return NULL;
	}
	sc->sc_unit = unit;
	cv_init(&sc->sc_cv, "raidunit");
	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
	return sc;
}

/* Release the synchronization primitives and the softc itself. */
static void
raiddestroy(struct raid_softc *sc) {
	cv_destroy(&sc->sc_cv);
	mutex_destroy(&sc->sc_mutex);
	kmem_free(sc, sizeof(*sc));
}

/*
 * Look up the softc for a unit.  If it does not exist and 'create' is
 * true, allocate one and insert it on the global list.  Returns NULL
 * when the unit is absent (and create is false) or on allocation
 * failure.  Note: a concurrent caller could in principle insert the
 * same unit between the unlock and the re-lock below.
 */
static struct raid_softc *
raidget(int unit, bool create) {
	struct raid_softc *sc;
	if (unit < 0) {
#ifdef DIAGNOSTIC
		panic("%s: unit %d!", __func__, unit);
#endif
		return NULL;
	}
	mutex_enter(&raid_lock);
	LIST_FOREACH(sc, &raids, sc_link) {
		if (sc->sc_unit == unit) {
			mutex_exit(&raid_lock);
			return sc;
		}
	}
	mutex_exit(&raid_lock);
	if (!create)
		return NULL;
	if ((sc = raidcreate(unit)) == NULL)
		return NULL;
	mutex_enter(&raid_lock);
	LIST_INSERT_HEAD(&raids, sc, sc_link);
	mutex_exit(&raid_lock);
	return sc;
}

/* Unlink a softc from the global list and free it. */
static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}

/* Legacy pseudo-device attach entry point; intentionally empty. */
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
 */
}

/*
 * Autoconfiguration driver: scan all disks for RAIDframe component
 * labels, group them into configuration sets, and configure the valid
 * sets.  Runs at most once (guarded by raidautoconfigdone).  Returns 1
 * if a scan was performed, 0 if autoconfig is disabled or already done.
 */
int
rf_autoconfig(device_t self)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets;

	if (!raidautoconfig || raidautoconfigdone == true)
		return (0);

	/* XXX This code can only be run once. */
	raidautoconfigdone = true;

#ifdef __HAVE_CPU_BOOTCONF
	/*
	 * 0. find the boot device if needed first so we can use it later
	 * this needs to be done before we autoconfigure any raid sets,
	 * because if we use wedges we are not going to be able to open
	 * the boot device later
	 */
	if (booted_device == NULL)
		cpu_bootconf();
#endif
	/* 1. locate all RAID components on the system */
	aprint_debug("Searching for RAID components...\n");
	ac_list = rf_find_raid_components();

	/* 2. Sort them into their respective sets. */
	config_sets = rf_create_auto_sets(ac_list);

	/*
	 * 3. Evaluate each set and configure the valid ones.
	 * This gets done in rf_buildroothack().
	 */
	rf_buildroothack(config_sets);

	return 1;
}

/*
 * Return 1 if any component of RAID set 'r' lives on the device from
 * which the system booted (bdv), else 0.  Component names are compared
 * by prefix after stripping "/dev/"; wedge ("dk") components are
 * resolved to their parent device name first.
 */
static int
rf_containsboot(RF_Raid_t *r, device_t bdv) {
	const char *bootname = device_xname(bdv);
	size_t len = strlen(bootname);

	for (int col = 0; col < r->numCol; col++) {
		const char *devname = r->Disks[col].devname;
		/* skip the leading "/dev/" in the stored component name */
		devname += sizeof("/dev/") - 1;
		if (strncmp(devname, "dk", 2) == 0) {
			const char *parent =
			    dkwedge_get_parent_name(r->Disks[col].dev);
			if (parent != NULL)
				devname = parent;
		}
		if (strncmp(devname, bootname, len) == 0) {
			struct raid_softc *sc = r->softc;
			aprint_debug("raid%d includes boot device %s\n",
			    sc->sc_unit, devname);
			return 1;
		}
	}
	return 0;
}

/*
 * Walk the list of configuration sets, configure the eligible ones,
 * and — if exactly one configured set is marked rootable — possibly
 * override booted_device so the root filesystem comes from the RAID.
 */
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;		/* count of rootable sets configured */
	struct raid_softc *sc, *rsc;	/* rsc: last rootable set seen */
	struct dk_softc *dksc;

	sc = rsc = NULL;
	num_root = 0;
	cset = config_sets;
while (cset != NULL) { 496 next_cset = cset->next; 497 if (rf_have_enough_components(cset) && 498 cset->ac->clabel->autoconfigure == 1) { 499 sc = rf_auto_config_set(cset); 500 if (sc != NULL) { 501 aprint_debug("raid%d: configured ok\n", 502 sc->sc_unit); 503 if (cset->rootable) { 504 rsc = sc; 505 num_root++; 506 } 507 } else { 508 /* The autoconfig didn't work :( */ 509 aprint_debug("Autoconfig failed\n"); 510 rf_release_all_vps(cset); 511 } 512 } else { 513 /* we're not autoconfiguring this set... 514 release the associated resources */ 515 rf_release_all_vps(cset); 516 } 517 /* cleanup */ 518 rf_cleanup_config_set(cset); 519 cset = next_cset; 520 } 521 dksc = &rsc->sc_dksc; 522 523 /* if the user has specified what the root device should be 524 then we don't touch booted_device or boothowto... */ 525 526 if (rootspec != NULL) 527 return; 528 529 /* we found something bootable... */ 530 531 /* 532 * XXX: The following code assumes that the root raid 533 * is the first ('a') partition. This is about the best 534 * we can do with a BSD disklabel, but we might be able 535 * to do better with a GPT label, by setting a specified 536 * attribute to indicate the root partition. We can then 537 * stash the partition number in the r->root_partition 538 * high bits (the bottom 2 bits are already used). For 539 * now we just set booted_partition to 0 when we override 540 * root. 541 */ 542 if (num_root == 1) { 543 device_t candidate_root; 544 if (dksc->sc_dkdev.dk_nwedges != 0) { 545 char cname[sizeof(cset->ac->devname)]; 546 /* XXX: assume partition 'a' first */ 547 snprintf(cname, sizeof(cname), "%s%c", 548 device_xname(dksc->sc_dev), 'a'); 549 candidate_root = dkwedge_find_by_wname(cname); 550 DPRINTF("%s: candidate wedge root=%s\n", __func__, 551 cname); 552 if (candidate_root == NULL) { 553 /* 554 * If that is not found, because we don't use 555 * disklabel, return the first dk child 556 * XXX: we can skip the 'a' check above 557 * and always do this... 
558 */ 559 size_t i = 0; 560 candidate_root = dkwedge_find_by_parent( 561 device_xname(dksc->sc_dev), &i); 562 } 563 DPRINTF("%s: candidate wedge root=%p\n", __func__, 564 candidate_root); 565 } else 566 candidate_root = dksc->sc_dev; 567 DPRINTF("%s: candidate root=%p\n", __func__, candidate_root); 568 DPRINTF("%s: booted_device=%p root_partition=%d " 569 "contains_boot=%d\n", __func__, booted_device, 570 rsc->sc_r.root_partition, 571 rf_containsboot(&rsc->sc_r, booted_device)); 572 if (booted_device == NULL || 573 rsc->sc_r.root_partition == 1 || 574 rf_containsboot(&rsc->sc_r, booted_device)) { 575 booted_device = candidate_root; 576 booted_partition = 0; /* XXX assume 'a' */ 577 } 578 } else if (num_root > 1) { 579 DPRINTF("%s: many roots=%d, %p\n", __func__, num_root, 580 booted_device); 581 582 /* 583 * Maybe the MD code can help. If it cannot, then 584 * setroot() will discover that we have no 585 * booted_device and will ask the user if nothing was 586 * hardwired in the kernel config file 587 */ 588 if (booted_device == NULL) 589 return; 590 591 num_root = 0; 592 mutex_enter(&raid_lock); 593 LIST_FOREACH(sc, &raids, sc_link) { 594 RF_Raid_t *r = &sc->sc_r; 595 if (r->valid == 0) 596 continue; 597 598 if (r->root_partition == 0) 599 continue; 600 601 if (rf_containsboot(r, booted_device)) { 602 num_root++; 603 rsc = sc; 604 dksc = &rsc->sc_dksc; 605 } 606 } 607 mutex_exit(&raid_lock); 608 609 if (num_root == 1) { 610 booted_device = dksc->sc_dev; 611 booted_partition = 0; /* XXX assume 'a' */ 612 } else { 613 /* we can't guess.. require the user to answer... 
*/ 614 boothowto |= RB_ASKNAME; 615 } 616 } 617 } 618 619 static int 620 raidsize(dev_t dev) 621 { 622 struct raid_softc *rs; 623 struct dk_softc *dksc; 624 unsigned int unit; 625 626 unit = raidunit(dev); 627 if ((rs = raidget(unit, false)) == NULL) 628 return -1; 629 dksc = &rs->sc_dksc; 630 631 if ((rs->sc_flags & RAIDF_INITED) == 0) 632 return -1; 633 634 return dk_size(dksc, dev); 635 } 636 637 static int 638 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size) 639 { 640 unsigned int unit; 641 struct raid_softc *rs; 642 struct dk_softc *dksc; 643 644 unit = raidunit(dev); 645 if ((rs = raidget(unit, false)) == NULL) 646 return ENXIO; 647 dksc = &rs->sc_dksc; 648 649 if ((rs->sc_flags & RAIDF_INITED) == 0) 650 return ENODEV; 651 652 /* 653 Note that blkno is relative to this particular partition. 654 By adding adding RF_PROTECTED_SECTORS, we get a value that 655 is relative to the partition used for the underlying component. 656 */ 657 blkno += RF_PROTECTED_SECTORS; 658 659 return dk_dump(dksc, dev, blkno, va, size); 660 } 661 662 static int 663 raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk) 664 { 665 struct raid_softc *rs = raidsoftc(dev); 666 const struct bdevsw *bdev; 667 RF_Raid_t *raidPtr; 668 int c, sparecol, j, scol, dumpto; 669 int error = 0; 670 671 raidPtr = &rs->sc_r; 672 673 /* we only support dumping to RAID 1 sets */ 674 if (raidPtr->Layout.numDataCol != 1 || 675 raidPtr->Layout.numParityCol != 1) 676 return EINVAL; 677 678 if ((error = raidlock(rs)) != 0) 679 return error; 680 681 /* figure out what device is alive.. */ 682 683 /* 684 Look for a component to dump to. 
The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	/* First pass: prefer any component that is still optimal. */
	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	/* Second pass: consider in-use spares; map each spare back to
	   the column it replaces (scol) to apply the preference order. */
	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we havn't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	/* Hand the dump straight to the selected component's driver. */
	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
	    blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}

/*
 * Open entry point (block and character).  Creates the softc on first
 * open of a unit; marks components dirty on the first open of a
 * configured, previously idle unit.
 */
/* ARGSUSED */
static int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	int error = 0;
	int part, pmask;

	if ((rs = raidget(unit, true)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return (error);

	/* refuse new opens while the unit is being shut down */
	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	dksc = &rs->sc_dksc;

	part = DISKPART(dev);
	pmask = (1 << part);

	if (!DK_BUSY(dksc, pmask) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		 have done a configure before this.  I DO NOT WANT TO BE
		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		 THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}

	if ((rs->sc_flags & RAIDF_INITED) != 0)
		error = dk_open(dksc, dev, flags, fmt, l);

bad:
	raidunlock(rs);

	return (error);


}

/*
 * dkdriver d_lastclose hook: called by dk(9) when the last partition
 * of the unit is closed.
 */
static int
raid_lastclose(device_t self)
{
	struct raid_softc *rs = raidsoftc(self);

	/* Last one... device is not unconfigured yet.
	   Device shutdown has taken care of setting the
	   clean bits if RAIDF_INITED is not set
	   mark things as clean... */

	rf_update_component_labels(&rs->sc_r,
	    RF_FINAL_COMPONENT_UPDATE);

	/* pass to unlocked code */
	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		rs->sc_flags |= RAIDF_DETACH;

	return 0;
}

/*
 * Close entry point.  On the final close of a unit that was marked for
 * shutdown, either detach the pseudo-device (configured case) or free
 * the softc (never-configured case).
 */
/* ARGSUSED */
static int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	cfdata_t cf;
	int error = 0, do_detach = 0, do_put = 0;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_INITED) != 0) {
		error = dk_close(dksc, dev, flags, fmt, l);
		if ((rs->sc_flags & RAIDF_DETACH) != 0)
			do_detach = 1;
	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		do_put = 1;

	raidunlock(rs);

	/* detach/free are done after dropping the unit lock */
	if (do_detach) {
		/* free the pseudo device attach bits */
		cf = device_cfdata(dksc->sc_dev);
		error = config_detach(dksc->sc_dev, 0);
		if (error == 0)
			free(cf, M_RAIDFRAME);
	} else if (do_put) {
		raidput(rs);
	}

	return (error);

}

/* Kick the per-array I/O thread waiting on iodone_cv. */
static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}

/*
 * Strategy entry point: queue the buffer via dk(9) and wake the array
 * thread; actual I/O is issued from raid_diskstart().
 */
static void
raidstrategy(struct buf *bp)
{
	unsigned int unit;
	struct raid_softc *rs;
	struct dk_softc *dksc;
	RF_Raid_t *raidPtr;

	unit = raidunit(bp->b_dev);
	if ((rs = raidget(unit, false)) == NULL) {
		bp->b_error = ENXIO;
		goto fail;
	}
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		bp->b_error = ENXIO;
		goto fail;
	}
	dksc = &rs->sc_dksc;
	raidPtr = &rs->sc_r;

	/* Queue IO only */
	if (dk_strategy_defer(dksc, bp))
		goto done;

	/* schedule the IO to happen at the next convenient time */
	raid_wakeup(raidPtr);

done:
	return;

fail:
	/* error path: nothing was transferred */
	bp->b_resid = bp->b_bcount;
	biodone(bp);
}

/*
 * dkdriver d_diskstart hook: dk(9) calls this to issue a queued buffer
 * to the array.  Returns ENODEV if the array is not valid.
 */
static int
raid_diskstart(device_t dev, struct buf *bp)
{
	struct raid_softc *rs = raidsoftc(dev);
	RF_Raid_t *raidPtr;

	raidPtr = &rs->sc_r;
	if (!raidPtr->valid) {
		db1_printf(("raid is not valid..\n"));
		return ENODEV;
	}

	/* XXX */
	bp->b_resid = 0;

	return raiddoaccess(raidPtr, bp);
}

/*
 * Completion callback for a buffer issued through raid_diskstart():
 * report completion to dk(9), return the I/O slot to 'openings', and
 * wake the array thread so more queued I/O can start.
 */
void
raiddone(RF_Raid_t *raidPtr, struct buf *bp)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;

	dk_done(dksc, bp);

	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings++;
	rf_unlock_mutex2(raidPtr->mutex);

	/* schedule more IO */
	raid_wakeup(raidPtr);
}

/* Character device read: raw I/O through the strategy routine. */
/* ARGSUSED */
static int
raidread(dev_t dev, struct uio *uio, int flags)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return (ENXIO);

	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));

}

/* Character device write: raw I/O through the strategy routine. */
/* ARGSUSED */
static int
raidwrite(dev_t dev, struct uio *uio, int flags)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return (ENXIO);

	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));

}

/*
 * Tear down a configured unit: shut the array down, drain and free the
 * buffer queue, and detach the disk.  Caller holds the unit lock.
 * Returns EBUSY if the unit is open or a background operation
 * (reconstruction, parity rewrite, copyback) is in progress.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}

/*
 * Ioctl entry point for the raid device: configuration, shutdown,
 * component label manipulation, reconstruction/parity control, and
 * status queries.
 */
static int
raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
	int unit = raidunit(dev);
	int error = 0;
	int part, pmask;
	struct raid_softc *rs;
	struct dk_softc *dksc;
	RF_Config_t *k_cfg, *u_cfg;
	RF_Raid_t *raidPtr;
	RF_RaidDisk_t *diskPtr;
	RF_AccTotals_t *totals;
	RF_DeviceConfig_t *d_cfg, **ucfgp;
	u_char *specific_buf;
	int retcode = 0;
	int column;
	/* int raidid; */
	struct rf_recon_req *rrcopy, *rr;
	RF_ComponentLabel_t *clabel;
	RF_ComponentLabel_t *ci_label;
	RF_ComponentLabel_t **clabel_ptr;
	RF_SingleComponent_t *sparePtr,*componentPtr;
	RF_SingleComponent_t component;
	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
	int i, j, d;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;
	raidPtr = &rs->sc_r;

	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
	    (int) DISKPART(dev), (int) unit, cmd));

	/* Must be initialized for these...
*/ 1065 switch (cmd) { 1066 case RAIDFRAME_REWRITEPARITY: 1067 case RAIDFRAME_GET_INFO: 1068 case RAIDFRAME_RESET_ACCTOTALS: 1069 case RAIDFRAME_GET_ACCTOTALS: 1070 case RAIDFRAME_KEEP_ACCTOTALS: 1071 case RAIDFRAME_GET_SIZE: 1072 case RAIDFRAME_FAIL_DISK: 1073 case RAIDFRAME_COPYBACK: 1074 case RAIDFRAME_CHECK_RECON_STATUS: 1075 case RAIDFRAME_CHECK_RECON_STATUS_EXT: 1076 case RAIDFRAME_GET_COMPONENT_LABEL: 1077 case RAIDFRAME_SET_COMPONENT_LABEL: 1078 case RAIDFRAME_ADD_HOT_SPARE: 1079 case RAIDFRAME_REMOVE_HOT_SPARE: 1080 case RAIDFRAME_INIT_LABELS: 1081 case RAIDFRAME_REBUILD_IN_PLACE: 1082 case RAIDFRAME_CHECK_PARITY: 1083 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS: 1084 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT: 1085 case RAIDFRAME_CHECK_COPYBACK_STATUS: 1086 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT: 1087 case RAIDFRAME_SET_AUTOCONFIG: 1088 case RAIDFRAME_SET_ROOT: 1089 case RAIDFRAME_DELETE_COMPONENT: 1090 case RAIDFRAME_INCORPORATE_HOT_SPARE: 1091 case RAIDFRAME_PARITYMAP_STATUS: 1092 case RAIDFRAME_PARITYMAP_GET_DISABLE: 1093 case RAIDFRAME_PARITYMAP_SET_DISABLE: 1094 case RAIDFRAME_PARITYMAP_SET_PARAMS: 1095 if ((rs->sc_flags & RAIDF_INITED) == 0) 1096 return (ENXIO); 1097 } 1098 1099 switch (cmd) { 1100 #ifdef COMPAT_50 1101 case RAIDFRAME_GET_INFO50: 1102 return rf_get_info50(raidPtr, data); 1103 1104 case RAIDFRAME_CONFIGURE50: 1105 if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0) 1106 return retcode; 1107 goto config; 1108 #endif 1109 /* configure the system */ 1110 case RAIDFRAME_CONFIGURE: 1111 1112 if (raidPtr->valid) { 1113 /* There is a valid RAID set running on this unit! 
*/ 1114 printf("raid%d: Device already configured!\n",unit); 1115 return(EINVAL); 1116 } 1117 1118 /* copy-in the configuration information */ 1119 /* data points to a pointer to the configuration structure */ 1120 1121 u_cfg = *((RF_Config_t **) data); 1122 RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *)); 1123 if (k_cfg == NULL) { 1124 return (ENOMEM); 1125 } 1126 retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t)); 1127 if (retcode) { 1128 RF_Free(k_cfg, sizeof(RF_Config_t)); 1129 db1_printf(("rf_ioctl: retcode=%d copyin.1\n", 1130 retcode)); 1131 goto no_config; 1132 } 1133 goto config; 1134 config: 1135 rs->sc_flags &= ~RAIDF_SHUTDOWN; 1136 1137 /* allocate a buffer for the layout-specific data, and copy it 1138 * in */ 1139 if (k_cfg->layoutSpecificSize) { 1140 if (k_cfg->layoutSpecificSize > 10000) { 1141 /* sanity check */ 1142 RF_Free(k_cfg, sizeof(RF_Config_t)); 1143 retcode = EINVAL; 1144 goto no_config; 1145 } 1146 RF_Malloc(specific_buf, k_cfg->layoutSpecificSize, 1147 (u_char *)); 1148 if (specific_buf == NULL) { 1149 RF_Free(k_cfg, sizeof(RF_Config_t)); 1150 retcode = ENOMEM; 1151 goto no_config; 1152 } 1153 retcode = copyin(k_cfg->layoutSpecific, specific_buf, 1154 k_cfg->layoutSpecificSize); 1155 if (retcode) { 1156 RF_Free(k_cfg, sizeof(RF_Config_t)); 1157 RF_Free(specific_buf, 1158 k_cfg->layoutSpecificSize); 1159 db1_printf(("rf_ioctl: retcode=%d copyin.2\n", 1160 retcode)); 1161 goto no_config; 1162 } 1163 } else 1164 specific_buf = NULL; 1165 k_cfg->layoutSpecific = specific_buf; 1166 1167 /* should do some kind of sanity check on the configuration. 1168 * Store the sum of all the bytes in the last byte? 
*/ 1169 1170 /* configure the system */ 1171 1172 /* 1173 * Clear the entire RAID descriptor, just to make sure 1174 * there is no stale data left in the case of a 1175 * reconfiguration 1176 */ 1177 memset(raidPtr, 0, sizeof(*raidPtr)); 1178 raidPtr->softc = rs; 1179 raidPtr->raidid = unit; 1180 1181 retcode = rf_Configure(raidPtr, k_cfg, NULL); 1182 1183 if (retcode == 0) { 1184 1185 /* allow this many simultaneous IO's to 1186 this RAID device */ 1187 raidPtr->openings = RAIDOUTSTANDING; 1188 1189 raidinit(rs); 1190 raid_wakeup(raidPtr); 1191 rf_markalldirty(raidPtr); 1192 } 1193 /* free the buffers. No return code here. */ 1194 if (k_cfg->layoutSpecificSize) { 1195 RF_Free(specific_buf, k_cfg->layoutSpecificSize); 1196 } 1197 RF_Free(k_cfg, sizeof(RF_Config_t)); 1198 1199 no_config: 1200 /* 1201 * If configuration failed, set sc_flags so that we 1202 * will detach the device when we close it. 1203 */ 1204 if (retcode != 0) 1205 rs->sc_flags |= RAIDF_SHUTDOWN; 1206 return (retcode); 1207 1208 /* shutdown the system */ 1209 case RAIDFRAME_SHUTDOWN: 1210 1211 part = DISKPART(dev); 1212 pmask = (1 << part); 1213 1214 if ((error = raidlock(rs)) != 0) 1215 return (error); 1216 1217 if (DK_BUSY(dksc, pmask) || 1218 raidPtr->recon_in_progress != 0 || 1219 raidPtr->parity_rewrite_in_progress != 0 || 1220 raidPtr->copyback_in_progress != 0) 1221 retcode = EBUSY; 1222 else { 1223 /* detach and free on close */ 1224 rs->sc_flags |= RAIDF_SHUTDOWN; 1225 retcode = 0; 1226 } 1227 1228 raidunlock(rs); 1229 1230 return (retcode); 1231 case RAIDFRAME_GET_COMPONENT_LABEL: 1232 clabel_ptr = (RF_ComponentLabel_t **) data; 1233 /* need to read the component label for the disk indicated 1234 by row,column in clabel */ 1235 1236 /* 1237 * Perhaps there should be an option to skip the in-core 1238 * copy and hit the disk, as with disklabel(8). 
1239 */ 1240 RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *)); 1241 1242 retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel)); 1243 1244 if (retcode) { 1245 RF_Free(clabel, sizeof(*clabel)); 1246 return retcode; 1247 } 1248 1249 clabel->row = 0; /* Don't allow looking at anything else.*/ 1250 1251 column = clabel->column; 1252 1253 if ((column < 0) || (column >= raidPtr->numCol + 1254 raidPtr->numSpare)) { 1255 RF_Free(clabel, sizeof(*clabel)); 1256 return EINVAL; 1257 } 1258 1259 RF_Free(clabel, sizeof(*clabel)); 1260 1261 clabel = raidget_component_label(raidPtr, column); 1262 1263 return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr)); 1264 1265 #if 0 1266 case RAIDFRAME_SET_COMPONENT_LABEL: 1267 clabel = (RF_ComponentLabel_t *) data; 1268 1269 /* XXX check the label for valid stuff... */ 1270 /* Note that some things *should not* get modified -- 1271 the user should be re-initing the labels instead of 1272 trying to patch things. 1273 */ 1274 1275 raidid = raidPtr->raidid; 1276 #ifdef DEBUG 1277 printf("raid%d: Got component label:\n", raidid); 1278 printf("raid%d: Version: %d\n", raidid, clabel->version); 1279 printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number); 1280 printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter); 1281 printf("raid%d: Column: %d\n", raidid, clabel->column); 1282 printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns); 1283 printf("raid%d: Clean: %d\n", raidid, clabel->clean); 1284 printf("raid%d: Status: %d\n", raidid, clabel->status); 1285 #endif 1286 clabel->row = 0; 1287 column = clabel->column; 1288 1289 if ((column < 0) || (column >= raidPtr->numCol)) { 1290 return(EINVAL); 1291 } 1292 1293 /* XXX this isn't allowed to do anything for now :-) */ 1294 1295 /* XXX and before it is, we need to fill in the rest 1296 of the fields!?!?!?! 
*/ 1297 memcpy(raidget_component_label(raidPtr, column), 1298 clabel, sizeof(*clabel)); 1299 raidflush_component_label(raidPtr, column); 1300 return (0); 1301 #endif 1302 1303 case RAIDFRAME_INIT_LABELS: 1304 clabel = (RF_ComponentLabel_t *) data; 1305 /* 1306 we only want the serial number from 1307 the above. We get all the rest of the information 1308 from the config that was used to create this RAID 1309 set. 1310 */ 1311 1312 raidPtr->serial_number = clabel->serial_number; 1313 1314 for(column=0;column<raidPtr->numCol;column++) { 1315 diskPtr = &raidPtr->Disks[column]; 1316 if (!RF_DEAD_DISK(diskPtr->status)) { 1317 ci_label = raidget_component_label(raidPtr, 1318 column); 1319 /* Zeroing this is important. */ 1320 memset(ci_label, 0, sizeof(*ci_label)); 1321 raid_init_component_label(raidPtr, ci_label); 1322 ci_label->serial_number = 1323 raidPtr->serial_number; 1324 ci_label->row = 0; /* we dont' pretend to support more */ 1325 rf_component_label_set_partitionsize(ci_label, 1326 diskPtr->partitionSize); 1327 ci_label->column = column; 1328 raidflush_component_label(raidPtr, column); 1329 } 1330 /* XXXjld what about the spares? */ 1331 } 1332 1333 return (retcode); 1334 case RAIDFRAME_SET_AUTOCONFIG: 1335 d = rf_set_autoconfig(raidPtr, *(int *) data); 1336 printf("raid%d: New autoconfig value is: %d\n", 1337 raidPtr->raidid, d); 1338 *(int *) data = d; 1339 return (retcode); 1340 1341 case RAIDFRAME_SET_ROOT: 1342 d = rf_set_rootpartition(raidPtr, *(int *) data); 1343 printf("raid%d: New rootpartition value is: %d\n", 1344 raidPtr->raidid, d); 1345 *(int *) data = d; 1346 return (retcode); 1347 1348 /* initialize all parity */ 1349 case RAIDFRAME_REWRITEPARITY: 1350 1351 if (raidPtr->Layout.map->faultsTolerated == 0) { 1352 /* Parity for RAID 0 is trivially correct */ 1353 raidPtr->parity_good = RF_RAID_CLEAN; 1354 return(0); 1355 } 1356 1357 if (raidPtr->parity_rewrite_in_progress == 1) { 1358 /* Re-write is already in progress! 
*/ 1359 return(EINVAL); 1360 } 1361 1362 retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread, 1363 rf_RewriteParityThread, 1364 raidPtr,"raid_parity"); 1365 return (retcode); 1366 1367 1368 case RAIDFRAME_ADD_HOT_SPARE: 1369 sparePtr = (RF_SingleComponent_t *) data; 1370 memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t)); 1371 retcode = rf_add_hot_spare(raidPtr, &component); 1372 return(retcode); 1373 1374 case RAIDFRAME_REMOVE_HOT_SPARE: 1375 return(retcode); 1376 1377 case RAIDFRAME_DELETE_COMPONENT: 1378 componentPtr = (RF_SingleComponent_t *)data; 1379 memcpy( &component, componentPtr, 1380 sizeof(RF_SingleComponent_t)); 1381 retcode = rf_delete_component(raidPtr, &component); 1382 return(retcode); 1383 1384 case RAIDFRAME_INCORPORATE_HOT_SPARE: 1385 componentPtr = (RF_SingleComponent_t *)data; 1386 memcpy( &component, componentPtr, 1387 sizeof(RF_SingleComponent_t)); 1388 retcode = rf_incorporate_hot_spare(raidPtr, &component); 1389 return(retcode); 1390 1391 case RAIDFRAME_REBUILD_IN_PLACE: 1392 1393 if (raidPtr->Layout.map->faultsTolerated == 0) { 1394 /* Can't do this on a RAID 0!! */ 1395 return(EINVAL); 1396 } 1397 1398 if (raidPtr->recon_in_progress == 1) { 1399 /* a reconstruct is already in progress! */ 1400 return(EINVAL); 1401 } 1402 1403 componentPtr = (RF_SingleComponent_t *) data; 1404 memcpy( &component, componentPtr, 1405 sizeof(RF_SingleComponent_t)); 1406 component.row = 0; /* we don't support any more */ 1407 column = component.column; 1408 1409 if ((column < 0) || (column >= raidPtr->numCol)) { 1410 return(EINVAL); 1411 } 1412 1413 rf_lock_mutex2(raidPtr->mutex); 1414 if ((raidPtr->Disks[column].status == rf_ds_optimal) && 1415 (raidPtr->numFailures > 0)) { 1416 /* XXX 0 above shouldn't be constant!!! */ 1417 /* some component other than this has failed. 1418 Let's not make things worse than they already 1419 are... 
*/ 1420 printf("raid%d: Unable to reconstruct to disk at:\n", 1421 raidPtr->raidid); 1422 printf("raid%d: Col: %d Too many failures.\n", 1423 raidPtr->raidid, column); 1424 rf_unlock_mutex2(raidPtr->mutex); 1425 return (EINVAL); 1426 } 1427 if (raidPtr->Disks[column].status == 1428 rf_ds_reconstructing) { 1429 printf("raid%d: Unable to reconstruct to disk at:\n", 1430 raidPtr->raidid); 1431 printf("raid%d: Col: %d Reconstruction already occurring!\n", raidPtr->raidid, column); 1432 1433 rf_unlock_mutex2(raidPtr->mutex); 1434 return (EINVAL); 1435 } 1436 if (raidPtr->Disks[column].status == rf_ds_spared) { 1437 rf_unlock_mutex2(raidPtr->mutex); 1438 return (EINVAL); 1439 } 1440 rf_unlock_mutex2(raidPtr->mutex); 1441 1442 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *)); 1443 if (rrcopy == NULL) 1444 return(ENOMEM); 1445 1446 rrcopy->raidPtr = (void *) raidPtr; 1447 rrcopy->col = column; 1448 1449 retcode = RF_CREATE_THREAD(raidPtr->recon_thread, 1450 rf_ReconstructInPlaceThread, 1451 rrcopy,"raid_reconip"); 1452 return(retcode); 1453 1454 case RAIDFRAME_GET_INFO: 1455 if (!raidPtr->valid) 1456 return (ENODEV); 1457 ucfgp = (RF_DeviceConfig_t **) data; 1458 RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t), 1459 (RF_DeviceConfig_t *)); 1460 if (d_cfg == NULL) 1461 return (ENOMEM); 1462 d_cfg->rows = 1; /* there is only 1 row now */ 1463 d_cfg->cols = raidPtr->numCol; 1464 d_cfg->ndevs = raidPtr->numCol; 1465 if (d_cfg->ndevs >= RF_MAX_DISKS) { 1466 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t)); 1467 return (ENOMEM); 1468 } 1469 d_cfg->nspares = raidPtr->numSpare; 1470 if (d_cfg->nspares >= RF_MAX_DISKS) { 1471 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t)); 1472 return (ENOMEM); 1473 } 1474 d_cfg->maxqdepth = raidPtr->maxQueueDepth; 1475 d = 0; 1476 for (j = 0; j < d_cfg->cols; j++) { 1477 d_cfg->devs[d] = raidPtr->Disks[j]; 1478 d++; 1479 } 1480 for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) { 1481 d_cfg->spares[i] = raidPtr->Disks[j]; 1482 if 
(d_cfg->spares[i].status == rf_ds_rebuilding_spare) { 1483 /* XXX: raidctl(8) expects to see this as a used spare */ 1484 d_cfg->spares[i].status = rf_ds_used_spare; 1485 } 1486 } 1487 retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t)); 1488 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t)); 1489 1490 return (retcode); 1491 1492 case RAIDFRAME_CHECK_PARITY: 1493 *(int *) data = raidPtr->parity_good; 1494 return (0); 1495 1496 case RAIDFRAME_PARITYMAP_STATUS: 1497 if (rf_paritymap_ineligible(raidPtr)) 1498 return EINVAL; 1499 rf_paritymap_status(raidPtr->parity_map, 1500 (struct rf_pmstat *)data); 1501 return 0; 1502 1503 case RAIDFRAME_PARITYMAP_SET_PARAMS: 1504 if (rf_paritymap_ineligible(raidPtr)) 1505 return EINVAL; 1506 if (raidPtr->parity_map == NULL) 1507 return ENOENT; /* ??? */ 1508 if (0 != rf_paritymap_set_params(raidPtr->parity_map, 1509 (struct rf_pmparams *)data, 1)) 1510 return EINVAL; 1511 return 0; 1512 1513 case RAIDFRAME_PARITYMAP_GET_DISABLE: 1514 if (rf_paritymap_ineligible(raidPtr)) 1515 return EINVAL; 1516 *(int *) data = rf_paritymap_get_disable(raidPtr); 1517 return 0; 1518 1519 case RAIDFRAME_PARITYMAP_SET_DISABLE: 1520 if (rf_paritymap_ineligible(raidPtr)) 1521 return EINVAL; 1522 rf_paritymap_set_disable(raidPtr, *(int *)data); 1523 /* XXX should errors be passed up? */ 1524 return 0; 1525 1526 case RAIDFRAME_RESET_ACCTOTALS: 1527 memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals)); 1528 return (0); 1529 1530 case RAIDFRAME_GET_ACCTOTALS: 1531 totals = (RF_AccTotals_t *) data; 1532 *totals = raidPtr->acc_totals; 1533 return (0); 1534 1535 case RAIDFRAME_KEEP_ACCTOTALS: 1536 raidPtr->keep_acc_totals = *(int *)data; 1537 return (0); 1538 1539 case RAIDFRAME_GET_SIZE: 1540 *(int *) data = raidPtr->totalSectors; 1541 return (0); 1542 1543 /* fail a disk & optionally start reconstruction */ 1544 case RAIDFRAME_FAIL_DISK: 1545 1546 if (raidPtr->Layout.map->faultsTolerated == 0) { 1547 /* Can't do this on a RAID 0!! 
*/ 1548 return(EINVAL); 1549 } 1550 1551 rr = (struct rf_recon_req *) data; 1552 rr->row = 0; 1553 if (rr->col < 0 || rr->col >= raidPtr->numCol) 1554 return (EINVAL); 1555 1556 1557 rf_lock_mutex2(raidPtr->mutex); 1558 if (raidPtr->status == rf_rs_reconstructing) { 1559 /* you can't fail a disk while we're reconstructing! */ 1560 /* XXX wrong for RAID6 */ 1561 rf_unlock_mutex2(raidPtr->mutex); 1562 return (EINVAL); 1563 } 1564 if ((raidPtr->Disks[rr->col].status == 1565 rf_ds_optimal) && (raidPtr->numFailures > 0)) { 1566 /* some other component has failed. Let's not make 1567 things worse. XXX wrong for RAID6 */ 1568 rf_unlock_mutex2(raidPtr->mutex); 1569 return (EINVAL); 1570 } 1571 if (raidPtr->Disks[rr->col].status == rf_ds_spared) { 1572 /* Can't fail a spared disk! */ 1573 rf_unlock_mutex2(raidPtr->mutex); 1574 return (EINVAL); 1575 } 1576 rf_unlock_mutex2(raidPtr->mutex); 1577 1578 /* make a copy of the recon request so that we don't rely on 1579 * the user's buffer */ 1580 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *)); 1581 if (rrcopy == NULL) 1582 return(ENOMEM); 1583 memcpy(rrcopy, rr, sizeof(*rr)); 1584 rrcopy->raidPtr = (void *) raidPtr; 1585 1586 retcode = RF_CREATE_THREAD(raidPtr->recon_thread, 1587 rf_ReconThread, 1588 rrcopy,"raid_recon"); 1589 return (0); 1590 1591 /* invoke a copyback operation after recon on whatever disk 1592 * needs it, if any */ 1593 case RAIDFRAME_COPYBACK: 1594 1595 if (raidPtr->Layout.map->faultsTolerated == 0) { 1596 /* This makes no sense on a RAID 0!! */ 1597 return(EINVAL); 1598 } 1599 1600 if (raidPtr->copyback_in_progress == 1) { 1601 /* Copyback is already in progress! 
*/ 1602 return(EINVAL); 1603 } 1604 1605 retcode = RF_CREATE_THREAD(raidPtr->copyback_thread, 1606 rf_CopybackThread, 1607 raidPtr,"raid_copyback"); 1608 return (retcode); 1609 1610 /* return the percentage completion of reconstruction */ 1611 case RAIDFRAME_CHECK_RECON_STATUS: 1612 if (raidPtr->Layout.map->faultsTolerated == 0) { 1613 /* This makes no sense on a RAID 0, so tell the 1614 user it's done. */ 1615 *(int *) data = 100; 1616 return(0); 1617 } 1618 if (raidPtr->status != rf_rs_reconstructing) 1619 *(int *) data = 100; 1620 else { 1621 if (raidPtr->reconControl->numRUsTotal > 0) { 1622 *(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal); 1623 } else { 1624 *(int *) data = 0; 1625 } 1626 } 1627 return (0); 1628 case RAIDFRAME_CHECK_RECON_STATUS_EXT: 1629 progressInfoPtr = (RF_ProgressInfo_t **) data; 1630 if (raidPtr->status != rf_rs_reconstructing) { 1631 progressInfo.remaining = 0; 1632 progressInfo.completed = 100; 1633 progressInfo.total = 100; 1634 } else { 1635 progressInfo.total = 1636 raidPtr->reconControl->numRUsTotal; 1637 progressInfo.completed = 1638 raidPtr->reconControl->numRUsComplete; 1639 progressInfo.remaining = progressInfo.total - 1640 progressInfo.completed; 1641 } 1642 retcode = copyout(&progressInfo, *progressInfoPtr, 1643 sizeof(RF_ProgressInfo_t)); 1644 return (retcode); 1645 1646 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS: 1647 if (raidPtr->Layout.map->faultsTolerated == 0) { 1648 /* This makes no sense on a RAID 0, so tell the 1649 user it's done. 
*/ 1650 *(int *) data = 100; 1651 return(0); 1652 } 1653 if (raidPtr->parity_rewrite_in_progress == 1) { 1654 *(int *) data = 100 * 1655 raidPtr->parity_rewrite_stripes_done / 1656 raidPtr->Layout.numStripe; 1657 } else { 1658 *(int *) data = 100; 1659 } 1660 return (0); 1661 1662 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT: 1663 progressInfoPtr = (RF_ProgressInfo_t **) data; 1664 if (raidPtr->parity_rewrite_in_progress == 1) { 1665 progressInfo.total = raidPtr->Layout.numStripe; 1666 progressInfo.completed = 1667 raidPtr->parity_rewrite_stripes_done; 1668 progressInfo.remaining = progressInfo.total - 1669 progressInfo.completed; 1670 } else { 1671 progressInfo.remaining = 0; 1672 progressInfo.completed = 100; 1673 progressInfo.total = 100; 1674 } 1675 retcode = copyout(&progressInfo, *progressInfoPtr, 1676 sizeof(RF_ProgressInfo_t)); 1677 return (retcode); 1678 1679 case RAIDFRAME_CHECK_COPYBACK_STATUS: 1680 if (raidPtr->Layout.map->faultsTolerated == 0) { 1681 /* This makes no sense on a RAID 0 */ 1682 *(int *) data = 100; 1683 return(0); 1684 } 1685 if (raidPtr->copyback_in_progress == 1) { 1686 *(int *) data = 100 * raidPtr->copyback_stripes_done / 1687 raidPtr->Layout.numStripe; 1688 } else { 1689 *(int *) data = 100; 1690 } 1691 return (0); 1692 1693 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT: 1694 progressInfoPtr = (RF_ProgressInfo_t **) data; 1695 if (raidPtr->copyback_in_progress == 1) { 1696 progressInfo.total = raidPtr->Layout.numStripe; 1697 progressInfo.completed = 1698 raidPtr->copyback_stripes_done; 1699 progressInfo.remaining = progressInfo.total - 1700 progressInfo.completed; 1701 } else { 1702 progressInfo.remaining = 0; 1703 progressInfo.completed = 100; 1704 progressInfo.total = 100; 1705 } 1706 retcode = copyout(&progressInfo, *progressInfoPtr, 1707 sizeof(RF_ProgressInfo_t)); 1708 return (retcode); 1709 1710 case RAIDFRAME_SET_LAST_UNIT: 1711 for (column = 0; column < raidPtr->numCol; column++) 1712 if (raidPtr->Disks[column].status != 
rf_ds_optimal) 1713 return EBUSY; 1714 1715 for (column = 0; column < raidPtr->numCol; column++) { 1716 clabel = raidget_component_label(raidPtr, column); 1717 clabel->last_unit = *(int *)data; 1718 raidflush_component_label(raidPtr, column); 1719 } 1720 rs->sc_cflags |= RAIDF_UNIT_CHANGED; 1721 return 0; 1722 1723 /* the sparetable daemon calls this to wait for the kernel to 1724 * need a spare table. this ioctl does not return until a 1725 * spare table is needed. XXX -- calling mpsleep here in the 1726 * ioctl code is almost certainly wrong and evil. -- XXX XXX 1727 * -- I should either compute the spare table in the kernel, 1728 * or have a different -- XXX XXX -- interface (a different 1729 * character device) for delivering the table -- XXX */ 1730 #if 0 1731 case RAIDFRAME_SPARET_WAIT: 1732 rf_lock_mutex2(rf_sparet_wait_mutex); 1733 while (!rf_sparet_wait_queue) 1734 rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex); 1735 waitreq = rf_sparet_wait_queue; 1736 rf_sparet_wait_queue = rf_sparet_wait_queue->next; 1737 rf_unlock_mutex2(rf_sparet_wait_mutex); 1738 1739 /* structure assignment */ 1740 *((RF_SparetWait_t *) data) = *waitreq; 1741 1742 RF_Free(waitreq, sizeof(*waitreq)); 1743 return (0); 1744 1745 /* wakes up a process waiting on SPARET_WAIT and puts an error 1746 * code in it that will cause the dameon to exit */ 1747 case RAIDFRAME_ABORT_SPARET_WAIT: 1748 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *)); 1749 waitreq->fcol = -1; 1750 rf_lock_mutex2(rf_sparet_wait_mutex); 1751 waitreq->next = rf_sparet_wait_queue; 1752 rf_sparet_wait_queue = waitreq; 1753 rf_broadcast_conf2(rf_sparet_wait_cv); 1754 rf_unlock_mutex2(rf_sparet_wait_mutex); 1755 return (0); 1756 1757 /* used by the spare table daemon to deliver a spare table 1758 * into the kernel */ 1759 case RAIDFRAME_SEND_SPARET: 1760 1761 /* install the spare table */ 1762 retcode = rf_SetSpareTable(raidPtr, *(void **) data); 1763 1764 /* respond to the requestor. 
the return status of the spare 1765 * table installation is passed in the "fcol" field */ 1766 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *)); 1767 waitreq->fcol = retcode; 1768 rf_lock_mutex2(rf_sparet_wait_mutex); 1769 waitreq->next = rf_sparet_resp_queue; 1770 rf_sparet_resp_queue = waitreq; 1771 rf_broadcast_cond2(rf_sparet_resp_cv); 1772 rf_unlock_mutex2(rf_sparet_wait_mutex); 1773 1774 return (retcode); 1775 #endif 1776 1777 default: 1778 break; /* fall through to the os-specific code below */ 1779 1780 } 1781 1782 if (!raidPtr->valid) 1783 return (EINVAL); 1784 1785 /* 1786 * Add support for "regular" device ioctls here. 1787 */ 1788 1789 switch (cmd) { 1790 case DIOCCACHESYNC: 1791 retcode = rf_sync_component_caches(raidPtr); 1792 break; 1793 1794 default: 1795 retcode = dk_ioctl(dksc, dev, cmd, data, flag, l); 1796 break; 1797 } 1798 1799 return (retcode); 1800 1801 } 1802 1803 1804 /* raidinit -- complete the rest of the initialization for the 1805 RAIDframe device. */ 1806 1807 1808 static void 1809 raidinit(struct raid_softc *rs) 1810 { 1811 cfdata_t cf; 1812 unsigned int unit; 1813 struct dk_softc *dksc = &rs->sc_dksc; 1814 RF_Raid_t *raidPtr = &rs->sc_r; 1815 device_t dev; 1816 1817 unit = raidPtr->raidid; 1818 1819 /* XXX doesn't check bounds. 
*/ 1820 snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit); 1821 1822 /* attach the pseudo device */ 1823 cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK); 1824 cf->cf_name = raid_cd.cd_name; 1825 cf->cf_atname = raid_cd.cd_name; 1826 cf->cf_unit = unit; 1827 cf->cf_fstate = FSTATE_STAR; 1828 1829 dev = config_attach_pseudo(cf); 1830 if (dev == NULL) { 1831 printf("raid%d: config_attach_pseudo failed\n", 1832 raidPtr->raidid); 1833 free(cf, M_RAIDFRAME); 1834 return; 1835 } 1836 1837 /* provide a backpointer to the real softc */ 1838 raidsoftc(dev) = rs; 1839 1840 /* disk_attach actually creates space for the CPU disklabel, among 1841 * other things, so it's critical to call this *BEFORE* we try putzing 1842 * with disklabels. */ 1843 dk_init(dksc, dev, DKTYPE_RAID); 1844 disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver); 1845 1846 /* XXX There may be a weird interaction here between this, and 1847 * protectedSectors, as used in RAIDframe. */ 1848 1849 rs->sc_size = raidPtr->totalSectors; 1850 1851 /* Attach dk and disk subsystems */ 1852 dk_attach(dksc); 1853 disk_attach(&dksc->sc_dkdev); 1854 rf_set_geometry(rs, raidPtr); 1855 1856 bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK); 1857 1858 /* mark unit as usuable */ 1859 rs->sc_flags |= RAIDF_INITED; 1860 1861 dkwedge_discover(&dksc->sc_dkdev); 1862 } 1863 1864 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0) 1865 /* wake up the daemon & tell it to get us a spare table 1866 * XXX 1867 * the entries in the queues should be tagged with the raidPtr 1868 * so that in the extremely rare case that two recons happen at once, 1869 * we know for which device were requesting a spare table 1870 * XXX 1871 * 1872 * XXX This code is not currently used. 
GO 1873 */ 1874 int 1875 rf_GetSpareTableFromDaemon(RF_SparetWait_t *req) 1876 { 1877 int retcode; 1878 1879 rf_lock_mutex2(rf_sparet_wait_mutex); 1880 req->next = rf_sparet_wait_queue; 1881 rf_sparet_wait_queue = req; 1882 rf_broadcast_cond2(rf_sparet_wait_cv); 1883 1884 /* mpsleep unlocks the mutex */ 1885 while (!rf_sparet_resp_queue) { 1886 rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex); 1887 } 1888 req = rf_sparet_resp_queue; 1889 rf_sparet_resp_queue = req->next; 1890 rf_unlock_mutex2(rf_sparet_wait_mutex); 1891 1892 retcode = req->fcol; 1893 RF_Free(req, sizeof(*req)); /* this is not the same req as we 1894 * alloc'd */ 1895 return (retcode); 1896 } 1897 #endif 1898 1899 /* a wrapper around rf_DoAccess that extracts appropriate info from the 1900 * bp & passes it down. 1901 * any calls originating in the kernel must use non-blocking I/O 1902 * do some extra sanity checking to return "appropriate" error values for 1903 * certain conditions (to make some standard utilities work) 1904 * 1905 * Formerly known as: rf_DoAccessKernel 1906 */ 1907 void 1908 raidstart(RF_Raid_t *raidPtr) 1909 { 1910 struct raid_softc *rs; 1911 struct dk_softc *dksc; 1912 1913 rs = raidPtr->softc; 1914 dksc = &rs->sc_dksc; 1915 /* quick check to see if anything has died recently */ 1916 rf_lock_mutex2(raidPtr->mutex); 1917 if (raidPtr->numNewFailures > 0) { 1918 rf_unlock_mutex2(raidPtr->mutex); 1919 rf_update_component_labels(raidPtr, 1920 RF_NORMAL_COMPONENT_UPDATE); 1921 rf_lock_mutex2(raidPtr->mutex); 1922 raidPtr->numNewFailures--; 1923 } 1924 rf_unlock_mutex2(raidPtr->mutex); 1925 1926 if ((rs->sc_flags & RAIDF_INITED) == 0) { 1927 printf("raid%d: raidstart not ready\n", raidPtr->raidid); 1928 return; 1929 } 1930 1931 dk_start(dksc, NULL); 1932 } 1933 1934 static int 1935 raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp) 1936 { 1937 RF_SectorCount_t num_blocks, pb, sum; 1938 RF_RaidAddr_t raid_addr; 1939 daddr_t blocknum; 1940 int do_async; 1941 int rc; 1942 1943 
rf_lock_mutex2(raidPtr->mutex); 1944 if (raidPtr->openings == 0) { 1945 rf_unlock_mutex2(raidPtr->mutex); 1946 return EAGAIN; 1947 } 1948 rf_unlock_mutex2(raidPtr->mutex); 1949 1950 blocknum = bp->b_rawblkno; 1951 1952 db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno, 1953 (int) blocknum)); 1954 1955 db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount)); 1956 db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid)); 1957 1958 /* *THIS* is where we adjust what block we're going to... 1959 * but DO NOT TOUCH bp->b_blkno!!! */ 1960 raid_addr = blocknum; 1961 1962 num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector; 1963 pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0; 1964 sum = raid_addr + num_blocks + pb; 1965 if (1 || rf_debugKernelAccess) { 1966 db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n", 1967 (int) raid_addr, (int) sum, (int) num_blocks, 1968 (int) pb, (int) bp->b_resid)); 1969 } 1970 if ((sum > raidPtr->totalSectors) || (sum < raid_addr) 1971 || (sum < num_blocks) || (sum < pb)) { 1972 rc = ENOSPC; 1973 goto done; 1974 } 1975 /* 1976 * XXX rf_DoAccess() should do this, not just DoAccessKernel() 1977 */ 1978 1979 if (bp->b_bcount & raidPtr->sectorMask) { 1980 rc = ENOSPC; 1981 goto done; 1982 } 1983 db1_printf(("Calling DoAccess..\n")); 1984 1985 1986 rf_lock_mutex2(raidPtr->mutex); 1987 raidPtr->openings--; 1988 rf_unlock_mutex2(raidPtr->mutex); 1989 1990 /* 1991 * Everything is async. 1992 */ 1993 do_async = 1; 1994 1995 /* don't ever condition on bp->b_flags & B_WRITE. 1996 * always condition on B_READ instead */ 1997 1998 rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ? 1999 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE, 2000 do_async, raid_addr, num_blocks, 2001 bp->b_data, bp, RF_DAG_NONBLOCKING_IO); 2002 2003 done: 2004 return rc; 2005 } 2006 2007 /* invoke an I/O from kernel mode. 
 Disk queue should be locked upon entry */

/*
 * Issue one low-level disk request from a RAIDframe disk queue to the
 * underlying component via bdev_strategy().  The queue mutex must be
 * held by the caller; it is dropped around the (potentially blocking)
 * strategy call and retaken before returning.
 */
int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int     op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* complete the NOP immediately through the normal
		 * completion path */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return (0);
}
/* this is the callback function associated with a I/O invoked from
   kernel code.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d).  Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       bp->b_error,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}


/*
 * initialize a buf structure for doing an I/O in the kernel.
 */
static void
InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
       RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
       void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
       struct proc *b_proc)
{
	/* bp->b_flags = B_PHYS | rw_flag; */
	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
	bp->b_oflags = 0;
	bp->b_cflags = 0;
	bp->b_bcount = numSect << logBytesPerSector;
	bp->b_bufsize = bp->b_bcount;
	bp->b_error = 0;
	bp->b_dev = dev;
	bp->b_data = bf;
	/* convert sector offset to DEV_BSIZE units for b_blkno */
	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
	if (bp->b_bcount == 0) {
		panic("bp->b_bcount is zero in InitBP!!");
	}
	bp->b_proc = b_proc;
	bp->b_iodone = cbFunc;
	bp->b_private = cbArg;
}

/*
 * Wait interruptibly for an exclusive lock.
 *
 * XXX
 * Several drivers do this; it should be abstracted and made MP-safe.
 * (Hmm...
where have we seen this warning before :-> GO ) 2187 */ 2188 static int 2189 raidlock(struct raid_softc *rs) 2190 { 2191 int error; 2192 2193 error = 0; 2194 mutex_enter(&rs->sc_mutex); 2195 while ((rs->sc_flags & RAIDF_LOCKED) != 0) { 2196 rs->sc_flags |= RAIDF_WANTED; 2197 error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex); 2198 if (error != 0) 2199 goto done; 2200 } 2201 rs->sc_flags |= RAIDF_LOCKED; 2202 done: 2203 mutex_exit(&rs->sc_mutex); 2204 return (error); 2205 } 2206 /* 2207 * Unlock and wake up any waiters. 2208 */ 2209 static void 2210 raidunlock(struct raid_softc *rs) 2211 { 2212 2213 mutex_enter(&rs->sc_mutex); 2214 rs->sc_flags &= ~RAIDF_LOCKED; 2215 if ((rs->sc_flags & RAIDF_WANTED) != 0) { 2216 rs->sc_flags &= ~RAIDF_WANTED; 2217 cv_broadcast(&rs->sc_cv); 2218 } 2219 mutex_exit(&rs->sc_mutex); 2220 } 2221 2222 2223 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */ 2224 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */ 2225 #define RF_PARITY_MAP_SIZE RF_PARITYMAP_NBYTE 2226 2227 static daddr_t 2228 rf_component_info_offset(void) 2229 { 2230 2231 return RF_COMPONENT_INFO_OFFSET; 2232 } 2233 2234 static daddr_t 2235 rf_component_info_size(unsigned secsize) 2236 { 2237 daddr_t info_size; 2238 2239 KASSERT(secsize); 2240 if (secsize > RF_COMPONENT_INFO_SIZE) 2241 info_size = secsize; 2242 else 2243 info_size = RF_COMPONENT_INFO_SIZE; 2244 2245 return info_size; 2246 } 2247 2248 static daddr_t 2249 rf_parity_map_offset(RF_Raid_t *raidPtr) 2250 { 2251 daddr_t map_offset; 2252 2253 KASSERT(raidPtr->bytesPerSector); 2254 if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE) 2255 map_offset = raidPtr->bytesPerSector; 2256 else 2257 map_offset = RF_COMPONENT_INFO_SIZE; 2258 map_offset += rf_component_info_offset(); 2259 2260 return map_offset; 2261 } 2262 2263 static daddr_t 2264 rf_parity_map_size(RF_Raid_t *raidPtr) 2265 { 2266 daddr_t map_size; 2267 2268 if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE) 2269 map_size = raidPtr->bytesPerSector; 2270 else 2271 
		map_size = RF_PARITY_MAP_SIZE;

	return map_size;
}

/* Mark component `col's label clean and flush it to disk. */
int
raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *clabel;

	clabel = raidget_component_label(raidPtr, col);
	clabel->clean = RF_RAID_CLEAN;
	raidflush_component_label(raidPtr, col);
	return(0);
}


/* Mark component `col's label dirty and flush it to disk. */
int
raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *clabel;

	clabel = raidget_component_label(raidPtr, col);
	clabel->clean = RF_RAID_DIRTY;
	raidflush_component_label(raidPtr, col);
	return(0);
}

/* Read component `col's on-disk label into the in-core copy. */
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);
	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}

/* Return a pointer to the in-core label of component `col'. */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}

/*
 * Write component `col's in-core label to disk, stamping it with the
 * current mod_counter first.
 */
int
raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *label;

	label = &raidPtr->raid_cinfo[col].ci_label;
	label->mod_counter = raidPtr->mod_counter;
#ifndef RF_NO_PARITY_MAP
	label->parity_map_modcount = label->mod_counter;
#endif
	return raidwrite_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp, label);
}


/* Read the component label from its fixed on-disk area. */
static int
raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidread_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize));
}

/*
 * Synchronously read `dsize' bytes at byte `offset' of `dev' and copy
 * the first `msize' bytes into `data'.  Returns 0 or an errno.
 */
/* ARGSUSED */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize)
{
	struct buf *bp;
	int error;

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
	bp->b_resid = dsize;

	bdev_strategy(bp);
	error = biowait(bp);

	if (!error) {
		memcpy(data, bp->b_data, msize);
	}

	brelse(bp, 0);
	return(error);
}


/* Write the component label to its fixed on-disk area (synchronous). */
static int
raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidwrite_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize), 0);
}

/*
 * Write `msize' bytes of `data' (zero-padded to `dsize') at byte
 * `offset' of `dev'.  If `asyncp' is set the write is fired and
 * forgotten.  NOTE(review): on the async path the buffer is not
 * brelse()'d here — presumably B_ASYNC completion releases it via
 * biodone; confirm before changing this path.
 */
/* ARGSUSED */
static int
raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
{
	struct buf *bp;
	int error;

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the write */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
	bp->b_resid = dsize;

	memset(bp->b_data, 0, dsize);
	memcpy(bp->b_data, data, msize);

	bdev_strategy(bp);
	if (asyncp)
		return 0;
	error = biowait(bp);
	brelse(bp, 0);
	if (error) {
#if 1
		printf("Failed to write RAID component info!\n");
#endif
	}

	return(error);
}

/* Write the parity map to every live component of the set. */
void
rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
{
	int c;

	for (c = 0; c < raidPtr->numCol; c++) {
		/* Skip dead disks. */
		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
			continue;
		/* XXXjld: what if an error occurs here? */
		raidwrite_component_area(raidPtr->Disks[c].dev,
		    raidPtr->raid_cinfo[c].ci_vp, map,
		    RF_PARITYMAP_NBYTE,
		    rf_parity_map_offset(raidPtr),
		    rf_parity_map_size(raidPtr), 0);
	}
}

/*
 * Read the parity map from every live component and merge them into
 * `map' (union of dirty regions), so no dirty region is missed.
 */
void
rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
{
	struct rf_paritymap_ondisk tmp;
	int c,first;

	first=1;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* Skip dead disks.
		 */
		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
			continue;
		raidread_component_area(raidPtr->Disks[c].dev,
		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
		    RF_PARITYMAP_NBYTE,
		    rf_parity_map_offset(raidPtr),
		    rf_parity_map_size(raidPtr));
		if (first) {
			memcpy(map, &tmp, sizeof(*map));
			first = 0;
		} else {
			rf_paritymap_merge(map, &tmp);
		}
	}
}

/*
 * Bump the mod counter and mark every live component label dirty,
 * including used spares.  Called when the set goes "in use".
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* find which column this spare stands in for.
			   NOTE(review): if no column references this
			   spare, scol keeps its previous value (-1 on
			   first iteration) — confirm that is intended. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}


/*
 * Rewrite the labels of all optimal components (and used spares) with
 * a fresh mod counter; on a final update with clean parity, also set
 * the clean bit.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}

/*
 * Close a component's vnode.  Auto-configured components were opened
 * by the kernel (VOP_OPEN under NOCRED), so they are closed the same
 * way; user-configured ones go through vn_close with the caller's
 * credentials.
 */
void
rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
{

	if (vp != NULL) {
		if (auto_configured == 1) {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

		} else {
			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
		}
	}
}


/* Close and forget the vnodes of all components and spares. */
void
rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
{
	int r,c;
	struct vnode *vp;
	int acd;


	/* We take this opportunity to close the vnodes like we should..
	 */

	for (c = 0; c < raidPtr->numCol; c++) {
		vp = raidPtr->raid_cinfo[c].ci_vp;
		acd = raidPtr->Disks[c].auto_configured;
		rf_close_component(raidPtr, vp, acd);
		raidPtr->raid_cinfo[c].ci_vp = NULL;
		raidPtr->Disks[c].auto_configured = 0;
	}

	for (r = 0; r < raidPtr->numSpare; r++) {
		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
		rf_close_component(raidPtr, vp, acd);
		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
	}
}


/*
 * Kernel-thread body: fail the requested component and (optionally)
 * reconstruct to a spare.  Frees `req' and never returns.
 */
void
rf_ReconThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}

/*
 * Kernel-thread body: rewrite all parity; on success mark the set's
 * parity good so the clean bit can be set at shutdown.  Never returns.
 */
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		wakeup(&raidPtr->parity_rewrite_in_progress);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}


/* Kernel-thread body: copy reconstructed data back.  Never returns. */
void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}


/*
 * Kernel-thread body: reconstruct a component in place (onto the same
 * device).  Frees `req' and never returns.
 */
void
rf_ReconstructInPlaceThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}

/*
 * Try to read a component label from (dev, vp); if it looks reasonable
 * prepend a new RF_AutoConfig_t for it to ac_list and return the new
 * list head.  On a bad/unreadable label the vnode is closed and the
 * list is returned unchanged.  On out-of-memory the whole list is
 * freed and NULL is returned.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		while(ac_list) {
			ac = ac_list;
			if (ac->clabel)
				free(ac->clabel, M_RAIDFRAME);
			ac_list = ac_list->next;
			free(ac, M_RAIDFRAME);
		}
		printf("RAID auto config: out of memory!\n");
		return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
			    cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it.
			 */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
			    M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}

/*
 * Scan every disk-class device in the system for RAIDframe components
 * and return a list of RF_AutoConfig_t entries (one per component
 * label found).  Wedges are scanned in a first pass, disklabel
 * partitions (and, failing that, the raw partition) in a second.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		    dv = deviter_next(&di)) {

			/* we are only interested in disks... */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies... */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CD's... */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md's... */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				/* NOTE(review): opened FREAD|FSILENT but
				   closed FREAD|FWRITE — verify the flag
				   mismatch is harmless for this bdev. */
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
					label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}


/*
 * Sanity-check a component label read from disk; returns 1 if it is
 * plausible, 0 otherwise.  Also repairs old-style garbage in the
 * *Hi fields via rf_fix_old_label_size() when numsecs is known.
 */
int
rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
{

	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
	    ((clabel->clean == RF_RAID_CLEAN) ||
	     (clabel->clean == RF_RAID_DIRTY)) &&
	    clabel->row >=0 &&
	    clabel->column >= 0 &&
	    clabel->num_rows > 0 &&
	    clabel->num_columns > 0 &&
	    clabel->row < clabel->num_rows &&
	    clabel->column < clabel->num_columns &&
	    clabel->blockSize > 0 &&
	    /*
	     * numBlocksHi may contain garbage, but it is ok since
	     * the type is unsigned. If it is really garbage,
	     * rf_fix_old_label_size() will fix it.
	     */
	    rf_component_label_numblocks(clabel) > 0) {
		/*
		 * label looks reasonable enough...
		 * let's make sure it has no old garbage.
		 */
		if (numsecs)
			rf_fix_old_label_size(clabel, numsecs);
		return(1);
	}
	return(0);
}


/*
 * For reasons yet unknown, some old component labels have garbage in
 * the newer numBlocksHi region, and this causes lossage. Since those
 * disks will also have numsecs set to less than 32 bits of sectors,
 * we can determine when this corruption has occurred, and fix it.
 *
 * The exact same problem, with the same unknown reason, happens to
 * the partitionSizeHi member as well.
 */
static void
rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
{

	if (numsecs < ((uint64_t)1 << 32)) {
		if (clabel->numBlocksHi) {
			printf("WARNING: total sectors < 32 bits, yet "
			       "numBlocksHi set\n"
			       "WARNING: resetting numBlocksHi to zero.\n");
			clabel->numBlocksHi = 0;
		}

		if (clabel->partitionSizeHi) {
			printf("WARNING: total sectors < 32 bits, yet "
			       "partitionSizeHi set\n"
			       "WARNING: resetting partitionSizeHi to zero.\n");
			clabel->partitionSizeHi = 0;
		}
	}
}


#ifdef DEBUG
/* Dump a component label to the console (debug builds only). */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	static const char *rp[] = {
	    "No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif

/*
 * Partition the flat AutoConfig list into config sets: each set holds
 * the components whose labels agree (per rf_does_it_fit()) and thus
 * belong to the same RAID set.  The ac entries are re-linked into the
 * per-set lists; a new list of RF_ConfigSet_t is returned.
 */
RF_ConfigSet_t *
rf_create_auto_sets(RF_AutoConfig_t *ac_list)
{
	RF_AutoConfig_t *ac;
	RF_ConfigSet_t *config_sets;
	RF_ConfigSet_t *cset;
	RF_AutoConfig_t *ac_next;


	config_sets = NULL;

	/* Go through the AutoConfig list, and figure out which components
	   belong to what sets. */
	ac = ac_list;
	while(ac!=NULL) {
		/* we're going to putz with ac->next, so save it here
		   for use at the end of the loop */
		ac_next = ac->next;

		if (config_sets == NULL) {
			/* will need at least this one... */
			config_sets = (RF_ConfigSet_t *)
				malloc(sizeof(RF_ConfigSet_t),
				       M_RAIDFRAME, M_NOWAIT);
			if (config_sets == NULL) {
				panic("rf_create_auto_sets: No memory!");
			}
			/* this one is easy :) */
			config_sets->ac = ac;
			config_sets->next = NULL;
			config_sets->rootable = 0;
			ac->next = NULL;
		} else {
			/* which set does this component fit into? */
			cset = config_sets;
			while(cset!=NULL) {
				if (rf_does_it_fit(cset, ac)) {
					/* looks like it matches... */
					ac->next = cset->ac;
					cset->ac = ac;
					break;
				}
				cset = cset->next;
			}
			if (cset==NULL) {
				/* didn't find a match above... new set..*/
				cset = (RF_ConfigSet_t *)
					malloc(sizeof(RF_ConfigSet_t),
					       M_RAIDFRAME, M_NOWAIT);
				if (cset == NULL) {
					panic("rf_create_auto_sets: No memory!");
				}
				cset->ac = ac;
				ac->next = NULL;
				cset->next = config_sets;
				cset->rootable = 0;
				config_sets = cset;
			}
		}
		ac = ac_next;
	}


	return(config_sets);
}

/*
 * Decide whether component `ac' belongs to config set `cset' by
 * comparing its label with the set's first member.  Returns 1 on
 * match, 0 otherwise.
 */
static int
rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
{
	RF_ComponentLabel_t *clabel1, *clabel2;

	/* If this one matches the *first* one in the set, that's good
	   enough, since the other members of the set would have been
	   through here too... */
	/* note that we are not checking partitionSize here..

	   Note that we are also not checking the mod_counters here.
	   If everything else matches except the mod_counter, that's
	   good enough for this test.  We will deal with the mod_counters
	   a little later in the autoconfiguration process.

	    (clabel1->mod_counter == clabel2->mod_counter) &&

	   The reason we don't check for this is that failed disks
	   will have lower modification counts.  If those disks are
	   not added to the set they used to belong to, then they will
	   form their own set, which may result in 2 different sets,
	   for example, competing to be configured at raid0, and
	   perhaps competing to be the root filesystem set.  If the
	   wrong ones get configured, or both attempt to become /,
	   weird behaviour and or serious lossage will occur.  Thus we
	   need to bring them into the fold here, and kick them out at
	   a later point.

	*/

	clabel1 = cset->ac->clabel;
	clabel2 = ac->clabel;
	if ((clabel1->version == clabel2->version) &&
	    (clabel1->serial_number == clabel2->serial_number) &&
	    (clabel1->num_rows == clabel2->num_rows) &&
	    (clabel1->num_columns == clabel2->num_columns) &&
	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
	    (clabel1->parityConfig == clabel2->parityConfig) &&
	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
	    (clabel1->blockSize == clabel2->blockSize) &&
	    rf_component_label_numblocks(clabel1) ==
	    rf_component_label_numblocks(clabel2) &&
	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
	    (clabel1->root_partition == clabel2->root_partition) &&
	    (clabel1->last_unit == clabel2->last_unit) &&
	    (clabel1->config_order == clabel2->config_order)) {
		/* if it get's here, it almost *has* to be a match */
	} else {
		/* it's not consistent with somebody in the set..
		   punt */
		return(0);
	}
	/* all was fine.. it must fit... */
	return(1);
}

/*
 * Decide whether config set `cset' has enough live, up-to-date
 * components (per the set's highest mod_counter) to be configured.
 * Returns 1 if so, 0 if too many components are missing for the
 * set's parity type.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
			/* Just did an even component, and we didn't
			   bail.. reset the even_pair_failed flag,
			   and go on to the next component....
			 */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}

/*
 * Build an RF_Config_t from the component labels of an autoconfig
 * set, suitable for handing to the normal configuration path.
 */
void
rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
			RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int i;

	clabel = ac->clabel;

	/* 1. Fill in the common stuff */
	config->numRow = clabel->num_rows = 1;
	config->numCol = clabel->num_columns;
	config->numSpare = 0; /* XXX should this be set here? */
	config->sectPerSU = clabel->sectPerSU;
	config->SUsPerPU = clabel->SUsPerPU;
	config->SUsPerRU = clabel->SUsPerRU;
	config->parityConfig = clabel->parityConfig;
	/* XXX... */
	strcpy(config->diskQueueType,"fifo");
	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
	config->layoutSpecificSize = 0; /* XXX ??
*/ 3349 3350 while(ac!=NULL) { 3351 /* row/col values will be in range due to the checks 3352 in reasonable_label() */ 3353 strcpy(config->devnames[0][ac->clabel->column], 3354 ac->devname); 3355 ac = ac->next; 3356 } 3357 3358 for(i=0;i<RF_MAXDBGV;i++) { 3359 config->debugVars[i][0] = 0; 3360 } 3361 } 3362 3363 int 3364 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value) 3365 { 3366 RF_ComponentLabel_t *clabel; 3367 int column; 3368 int sparecol; 3369 3370 raidPtr->autoconfigure = new_value; 3371 3372 for(column=0; column<raidPtr->numCol; column++) { 3373 if (raidPtr->Disks[column].status == rf_ds_optimal) { 3374 clabel = raidget_component_label(raidPtr, column); 3375 clabel->autoconfigure = new_value; 3376 raidflush_component_label(raidPtr, column); 3377 } 3378 } 3379 for(column = 0; column < raidPtr->numSpare ; column++) { 3380 sparecol = raidPtr->numCol + column; 3381 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) { 3382 clabel = raidget_component_label(raidPtr, sparecol); 3383 clabel->autoconfigure = new_value; 3384 raidflush_component_label(raidPtr, sparecol); 3385 } 3386 } 3387 return(new_value); 3388 } 3389 3390 int 3391 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value) 3392 { 3393 RF_ComponentLabel_t *clabel; 3394 int column; 3395 int sparecol; 3396 3397 raidPtr->root_partition = new_value; 3398 for(column=0; column<raidPtr->numCol; column++) { 3399 if (raidPtr->Disks[column].status == rf_ds_optimal) { 3400 clabel = raidget_component_label(raidPtr, column); 3401 clabel->root_partition = new_value; 3402 raidflush_component_label(raidPtr, column); 3403 } 3404 } 3405 for(column = 0; column < raidPtr->numSpare ; column++) { 3406 sparecol = raidPtr->numCol + column; 3407 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) { 3408 clabel = raidget_component_label(raidPtr, sparecol); 3409 clabel->root_partition = new_value; 3410 raidflush_component_label(raidPtr, sparecol); 3411 } 3412 } 3413 return(new_value); 3414 } 3415 3416 void 3417 
rf_release_all_vps(RF_ConfigSet_t *cset) 3418 { 3419 RF_AutoConfig_t *ac; 3420 3421 ac = cset->ac; 3422 while(ac!=NULL) { 3423 /* Close the vp, and give it back */ 3424 if (ac->vp) { 3425 vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY); 3426 VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED); 3427 vput(ac->vp); 3428 ac->vp = NULL; 3429 } 3430 ac = ac->next; 3431 } 3432 } 3433 3434 3435 void 3436 rf_cleanup_config_set(RF_ConfigSet_t *cset) 3437 { 3438 RF_AutoConfig_t *ac; 3439 RF_AutoConfig_t *next_ac; 3440 3441 ac = cset->ac; 3442 while(ac!=NULL) { 3443 next_ac = ac->next; 3444 /* nuke the label */ 3445 free(ac->clabel, M_RAIDFRAME); 3446 /* cleanup the config structure */ 3447 free(ac, M_RAIDFRAME); 3448 /* "next.." */ 3449 ac = next_ac; 3450 } 3451 /* and, finally, nuke the config set */ 3452 free(cset, M_RAIDFRAME); 3453 } 3454 3455 3456 void 3457 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel) 3458 { 3459 /* current version number */ 3460 clabel->version = RF_COMPONENT_LABEL_VERSION; 3461 clabel->serial_number = raidPtr->serial_number; 3462 clabel->mod_counter = raidPtr->mod_counter; 3463 3464 clabel->num_rows = 1; 3465 clabel->num_columns = raidPtr->numCol; 3466 clabel->clean = RF_RAID_DIRTY; /* not clean */ 3467 clabel->status = rf_ds_optimal; /* "It's good!" 
*/ 3468 3469 clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit; 3470 clabel->SUsPerPU = raidPtr->Layout.SUsPerPU; 3471 clabel->SUsPerRU = raidPtr->Layout.SUsPerRU; 3472 3473 clabel->blockSize = raidPtr->bytesPerSector; 3474 rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk); 3475 3476 /* XXX not portable */ 3477 clabel->parityConfig = raidPtr->Layout.map->parityConfig; 3478 clabel->maxOutstanding = raidPtr->maxOutstanding; 3479 clabel->autoconfigure = raidPtr->autoconfigure; 3480 clabel->root_partition = raidPtr->root_partition; 3481 clabel->last_unit = raidPtr->raidid; 3482 clabel->config_order = raidPtr->config_order; 3483 3484 #ifndef RF_NO_PARITY_MAP 3485 rf_paritymap_init_label(raidPtr->parity_map, clabel); 3486 #endif 3487 } 3488 3489 struct raid_softc * 3490 rf_auto_config_set(RF_ConfigSet_t *cset) 3491 { 3492 RF_Raid_t *raidPtr; 3493 RF_Config_t *config; 3494 int raidID; 3495 struct raid_softc *sc; 3496 3497 #ifdef DEBUG 3498 printf("RAID autoconfigure\n"); 3499 #endif 3500 3501 /* 1. Create a config structure */ 3502 config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO); 3503 if (config == NULL) { 3504 printf("%s: Out of mem - config!?!?\n", __func__); 3505 /* XXX do something more intelligent here. */ 3506 return NULL; 3507 } 3508 3509 /* 3510 2. Figure out what RAID ID this one is supposed to live at 3511 See if we can get the same RAID dev that it was configured 3512 on last time.. 3513 */ 3514 3515 raidID = cset->ac->clabel->last_unit; 3516 for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0; 3517 sc = raidget(++raidID, false)) 3518 continue; 3519 #ifdef DEBUG 3520 printf("Configuring raid%d:\n",raidID); 3521 #endif 3522 3523 if (sc == NULL) 3524 sc = raidget(raidID, true); 3525 if (sc == NULL) { 3526 printf("%s: Out of mem - softc!?!?\n", __func__); 3527 /* XXX do something more intelligent here. 
*/ 3528 free(config, M_RAIDFRAME); 3529 return NULL; 3530 } 3531 3532 raidPtr = &sc->sc_r; 3533 3534 /* XXX all this stuff should be done SOMEWHERE ELSE! */ 3535 raidPtr->softc = sc; 3536 raidPtr->raidid = raidID; 3537 raidPtr->openings = RAIDOUTSTANDING; 3538 3539 /* 3. Build the configuration structure */ 3540 rf_create_configuration(cset->ac, config, raidPtr); 3541 3542 /* 4. Do the configuration */ 3543 if (rf_Configure(raidPtr, config, cset->ac) == 0) { 3544 raidinit(sc); 3545 3546 rf_markalldirty(raidPtr); 3547 raidPtr->autoconfigure = 1; /* XXX do this here? */ 3548 switch (cset->ac->clabel->root_partition) { 3549 case 1: /* Force Root */ 3550 case 2: /* Soft Root: root when boot partition part of raid */ 3551 /* 3552 * everything configured just fine. Make a note 3553 * that this set is eligible to be root, 3554 * or forced to be root 3555 */ 3556 cset->rootable = cset->ac->clabel->root_partition; 3557 /* XXX do this here? */ 3558 raidPtr->root_partition = cset->rootable; 3559 break; 3560 default: 3561 break; 3562 } 3563 } else { 3564 raidput(sc); 3565 sc = NULL; 3566 } 3567 3568 /* 5. Cleanup */ 3569 free(config, M_RAIDFRAME); 3570 return sc; 3571 } 3572 3573 void 3574 rf_pool_init(struct pool *p, size_t size, const char *w_chan, 3575 size_t xmin, size_t xmax) 3576 { 3577 pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO); 3578 pool_sethiwat(p, xmax); 3579 pool_prime(p, xmin); 3580 pool_setlowat(p, xmin); 3581 } 3582 3583 /* 3584 * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue 3585 * to see if there is IO pending and if that IO could possibly be done 3586 * for a given RAID set. Returns 0 if IO is waiting and can be done, 1 3587 * otherwise. 
3588 * 3589 */ 3590 int 3591 rf_buf_queue_check(RF_Raid_t *raidPtr) 3592 { 3593 struct raid_softc *rs; 3594 struct dk_softc *dksc; 3595 3596 rs = raidPtr->softc; 3597 dksc = &rs->sc_dksc; 3598 3599 if ((rs->sc_flags & RAIDF_INITED) == 0) 3600 return 1; 3601 3602 if (dk_strategy_pending(dksc) && raidPtr->openings > 0) { 3603 /* there is work to do */ 3604 return 0; 3605 } 3606 /* default is nothing to do */ 3607 return 1; 3608 } 3609 3610 int 3611 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr) 3612 { 3613 uint64_t numsecs; 3614 unsigned secsize; 3615 int error; 3616 3617 error = getdisksize(vp, &numsecs, &secsize); 3618 if (error == 0) { 3619 diskPtr->blockSize = secsize; 3620 diskPtr->numBlocks = numsecs - rf_protectedSectors; 3621 diskPtr->partitionSize = numsecs; 3622 return 0; 3623 } 3624 return error; 3625 } 3626 3627 static int 3628 raid_match(device_t self, cfdata_t cfdata, void *aux) 3629 { 3630 return 1; 3631 } 3632 3633 static void 3634 raid_attach(device_t parent, device_t self, void *aux) 3635 { 3636 } 3637 3638 3639 static int 3640 raid_detach(device_t self, int flags) 3641 { 3642 int error; 3643 struct raid_softc *rs = raidsoftc(self); 3644 3645 if (rs == NULL) 3646 return ENXIO; 3647 3648 if ((error = raidlock(rs)) != 0) 3649 return (error); 3650 3651 error = raid_detach_unlocked(rs); 3652 3653 raidunlock(rs); 3654 3655 /* XXX raid can be referenced here */ 3656 3657 if (error) 3658 return error; 3659 3660 /* Free the softc */ 3661 raidput(rs); 3662 3663 return 0; 3664 } 3665 3666 static void 3667 rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr) 3668 { 3669 struct dk_softc *dksc = &rs->sc_dksc; 3670 struct disk_geom *dg = &dksc->sc_dkdev.dk_geom; 3671 3672 memset(dg, 0, sizeof(*dg)); 3673 3674 dg->dg_secperunit = raidPtr->totalSectors; 3675 dg->dg_secsize = raidPtr->bytesPerSector; 3676 dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe; 3677 dg->dg_ntracks = 4 * raidPtr->numCol; 3678 3679 disk_set_info(dksc->sc_dev, 
&dksc->sc_dkdev, NULL); 3680 } 3681 3682 /* 3683 * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components. 3684 * We end up returning whatever error was returned by the first cache flush 3685 * that fails. 3686 */ 3687 3688 int 3689 rf_sync_component_caches(RF_Raid_t *raidPtr) 3690 { 3691 int c, sparecol; 3692 int e,error; 3693 int force = 1; 3694 3695 error = 0; 3696 for (c = 0; c < raidPtr->numCol; c++) { 3697 if (raidPtr->Disks[c].status == rf_ds_optimal) { 3698 e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC, 3699 &force, FWRITE, NOCRED); 3700 if (e) { 3701 if (e != ENODEV) 3702 printf("raid%d: cache flush to component %s failed.\n", 3703 raidPtr->raidid, raidPtr->Disks[c].devname); 3704 if (error == 0) { 3705 error = e; 3706 } 3707 } 3708 } 3709 } 3710 3711 for( c = 0; c < raidPtr->numSpare ; c++) { 3712 sparecol = raidPtr->numCol + c; 3713 /* Need to ensure that the reconstruct actually completed! */ 3714 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) { 3715 e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp, 3716 DIOCCACHESYNC, &force, FWRITE, NOCRED); 3717 if (e) { 3718 if (e != ENODEV) 3719 printf("raid%d: cache flush to component %s failed.\n", 3720 raidPtr->raidid, raidPtr->Disks[sparecol].devname); 3721 if (error == 0) { 3722 error = e; 3723 } 3724 } 3725 } 3726 } 3727 return error; 3728 } 3729 3730 /* 3731 * Module interface 3732 */ 3733 3734 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr"); 3735 3736 #ifdef _MODULE 3737 CFDRIVER_DECL(raid, DV_DISK, NULL); 3738 #endif 3739 3740 static int raid_modcmd(modcmd_t, void *); 3741 static int raid_modcmd_init(void); 3742 static int raid_modcmd_fini(void); 3743 3744 static int 3745 raid_modcmd(modcmd_t cmd, void *data) 3746 { 3747 int error; 3748 3749 error = 0; 3750 switch (cmd) { 3751 case MODULE_CMD_INIT: 3752 error = raid_modcmd_init(); 3753 break; 3754 case MODULE_CMD_FINI: 3755 error = raid_modcmd_fini(); 3756 break; 3757 default: 3758 error = ENOTTY; 3759 break; 3760 } 
3761 return error; 3762 } 3763 3764 static int 3765 raid_modcmd_init(void) 3766 { 3767 int error; 3768 int bmajor, cmajor; 3769 3770 mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE); 3771 mutex_enter(&raid_lock); 3772 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0) 3773 rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM); 3774 rf_init_cond2(rf_sparet_wait_cv, "sparetw"); 3775 rf_init_cond2(rf_sparet_resp_cv, "rfgst"); 3776 3777 rf_sparet_wait_queue = rf_sparet_resp_queue = NULL; 3778 #endif 3779 3780 bmajor = cmajor = -1; 3781 error = devsw_attach("raid", &raid_bdevsw, &bmajor, 3782 &raid_cdevsw, &cmajor); 3783 if (error != 0 && error != EEXIST) { 3784 aprint_error("%s: devsw_attach failed %d\n", __func__, error); 3785 mutex_exit(&raid_lock); 3786 return error; 3787 } 3788 #ifdef _MODULE 3789 error = config_cfdriver_attach(&raid_cd); 3790 if (error != 0) { 3791 aprint_error("%s: config_cfdriver_attach failed %d\n", 3792 __func__, error); 3793 devsw_detach(&raid_bdevsw, &raid_cdevsw); 3794 mutex_exit(&raid_lock); 3795 return error; 3796 } 3797 #endif 3798 error = config_cfattach_attach(raid_cd.cd_name, &raid_ca); 3799 if (error != 0) { 3800 aprint_error("%s: config_cfattach_attach failed %d\n", 3801 __func__, error); 3802 #ifdef _MODULE 3803 config_cfdriver_detach(&raid_cd); 3804 #endif 3805 devsw_detach(&raid_bdevsw, &raid_cdevsw); 3806 mutex_exit(&raid_lock); 3807 return error; 3808 } 3809 3810 raidautoconfigdone = false; 3811 3812 mutex_exit(&raid_lock); 3813 3814 if (error == 0) { 3815 if (rf_BootRaidframe(true) == 0) 3816 aprint_verbose("Kernelized RAIDframe activated\n"); 3817 else 3818 panic("Serious error activating RAID!!"); 3819 } 3820 3821 /* 3822 * Register a finalizer which will be used to auto-config RAID 3823 * sets once all real hardware devices have been found. 
3824 */ 3825 error = config_finalize_register(NULL, rf_autoconfig); 3826 if (error != 0) { 3827 aprint_error("WARNING: unable to register RAIDframe " 3828 "finalizer\n"); 3829 error = 0; 3830 } 3831 3832 return error; 3833 } 3834 3835 static int 3836 raid_modcmd_fini(void) 3837 { 3838 int error; 3839 3840 mutex_enter(&raid_lock); 3841 3842 /* Don't allow unload if raid device(s) exist. */ 3843 if (!LIST_EMPTY(&raids)) { 3844 mutex_exit(&raid_lock); 3845 return EBUSY; 3846 } 3847 3848 error = config_cfattach_detach(raid_cd.cd_name, &raid_ca); 3849 if (error != 0) { 3850 aprint_error("%s: cannot detach cfattach\n",__func__); 3851 mutex_exit(&raid_lock); 3852 return error; 3853 } 3854 #ifdef _MODULE 3855 error = config_cfdriver_detach(&raid_cd); 3856 if (error != 0) { 3857 aprint_error("%s: cannot detach cfdriver\n",__func__); 3858 config_cfattach_attach(raid_cd.cd_name, &raid_ca); 3859 mutex_exit(&raid_lock); 3860 return error; 3861 } 3862 #endif 3863 error = devsw_detach(&raid_bdevsw, &raid_cdevsw); 3864 if (error != 0) { 3865 aprint_error("%s: cannot detach devsw\n",__func__); 3866 #ifdef _MODULE 3867 config_cfdriver_attach(&raid_cd); 3868 #endif 3869 config_cfattach_attach(raid_cd.cd_name, &raid_ca); 3870 mutex_exit(&raid_lock); 3871 return error; 3872 } 3873 rf_BootRaidframe(false); 3874 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0) 3875 rf_destroy_mutex2(rf_sparet_wait_mutex); 3876 rf_destroy_cond2(rf_sparet_wait_cv); 3877 rf_destroy_cond2(rf_sparet_resp_cv); 3878 #endif 3879 mutex_exit(&raid_lock); 3880 mutex_destroy(&raid_lock); 3881 3882 return error; 3883 } 3884