/*	$NetBSD: rf_netbsdkintf.c,v 1.381 2020/03/21 06:02:13 riastradh Exp $	*/

/*-
 * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Greg Oster; Jason R. Thorpe.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: cd.c 1.6 90/11/28$
 *
 *	@(#)cd.c	8.2 (Berkeley) 11/16/93
 */

/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Mark Holland, Jim Zelenka
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/***********************************************************
 *
 * rf_kintf.c -- the kernel interface routines for RAIDframe
 *
 ***********************************************************/

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.381 2020/03/21 06:02:13 riastradh Exp $");

#ifdef _KERNEL_OPT
#include "opt_raid_autoconfig.h"
#include "opt_compat_netbsd32.h"
#endif

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/disk.h>
#include <sys/device.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/disklabel.h>
#include <sys/conf.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/reboot.h>
#include <sys/kauth.h>
#include <sys/module.h>
#include <sys/compat_stub.h>

#include <prop/proplib.h>

#include <dev/raidframe/raidframevar.h>
#include <dev/raidframe/raidframeio.h>
#include <dev/raidframe/rf_paritymap.h>

#include "rf_raid.h"
#include "rf_copyback.h"
#include "rf_dag.h"
#include "rf_dagflags.h"
#include "rf_desc.h"
#include "rf_diskqueue.h"
#include "rf_etimer.h"
#include "rf_general.h"
#include "rf_kintf.h"
#include "rf_options.h"
#include "rf_driver.h"
#include "rf_parityscan.h"
#include "rf_threadstuff.h"

#include "ioconf.h"

#ifdef DEBUG
int     rf_kdebug_level = 0;
#define db1_printf(a) if (rf_kdebug_level > 0) printf a
#else				/* DEBUG */
#define db1_printf(a) { }
#endif				/* DEBUG */
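/*
 * Usage sketch (illustrative only, not from the original source):
 * db1_printf() takes its argument list wrapped in an extra set of
 * parentheses -- the classic pre-C99 workaround for a variadic macro:
 *
 *	db1_printf(("raid%d: unit %d opened\n", raidid, unit));
 *
 * With DEBUG defined this expands to
 *
 *	if (rf_kdebug_level > 0) printf("raid%d: unit %d opened\n", ...);
 *
 * and to an empty block otherwise.
 */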
#ifdef DEBUG_ROOT
#define DPRINTF(a, ...) printf(a, __VA_ARGS__)
#else
#define DPRINTF(a, ...)
#endif

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
static rf_declare_mutex2(rf_sparet_wait_mutex);
static rf_declare_cond2(rf_sparet_wait_cv);
static rf_declare_cond2(rf_sparet_resp_cv);

static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
						 * spare table */
static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
						 * installation process */
#endif

MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");

/* prototypes */
static void KernelWakeupFunc(struct buf *);
static void InitBP(struct buf *, struct vnode *, unsigned,
    dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    void *, int, struct proc *);
static void raidinit(struct raid_softc *);
static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);

static int raid_match(device_t, cfdata_t, void *);
static void raid_attach(device_t, device_t, void *);
static int raid_detach(device_t, int);

static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    daddr_t, daddr_t);
static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    daddr_t, daddr_t, int);

static int raidwrite_component_label(unsigned,
    dev_t, struct vnode *, RF_ComponentLabel_t *);
static int raidread_component_label(unsigned,
    dev_t, struct vnode *, RF_ComponentLabel_t *);

static int raid_diskstart(device_t, struct buf *bp);
static int raid_dumpblocks(device_t, void *, daddr_t, int);
static int raid_lastclose(device_t);

static dev_type_open(raidopen);
static dev_type_close(raidclose);
static dev_type_read(raidread);
static dev_type_write(raidwrite);
static dev_type_ioctl(raidioctl);
static dev_type_strategy(raidstrategy);
static dev_type_dump(raiddump);
static dev_type_size(raidsize);

const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};

#define	raidunit(x)	DISKUNIT(x)
#define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)

extern struct cfdriver raid_cd;
CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    DVF_DETACH_SHUTDOWN);

/* Internal representation of a rf_recon_req */
struct rf_recon_req_internal {
	RF_RowCol_t col;
	RF_ReconReqFlags_t flags;
	void   *raidPtr;
};

/*
 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
 * Be aware that large numbers can allow the driver to consume a lot of
 * kernel memory, especially on writes, and in degraded mode reads.
 *
 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
 * a single 64K write will typically require 64K for the old data,
 * 64K for the old parity, and 64K for the new parity, for a total
 * of 192K (if the parity buffer is not re-used immediately).
 * Even if it is used immediately, that's still 128K, which when multiplied
 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
 *
 * Now in degraded mode, for example, a 64K read on the above setup may
 * require data reconstruction, which will require *all* of the 4 remaining
 * disks to participate -- 4 * 32K/disk == 128K again.
 */

#ifndef RAIDOUTSTANDING
#define RAIDOUTSTANDING   6
#endif
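#if 0	/* Illustrative sketch only, not compiled: the worst-case arithmetic
	 * above expressed as code.  The names here are hypothetical, not
	 * driver fields. */
static size_t
rf_write_memory_estimate(size_t write_bytes, int noutstanding)
{
	/* old data + old parity + new parity, per outstanding write,
	 * assuming the parity buffer is not re-used immediately */
	return (size_t)noutstanding * 3 * write_bytes; /* 6 * 192K for 64K writes */
}
#endif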
#define RAIDLABELDEV(dev)	\
	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))

/* declared here, and made public, for the benefit of KVM stuff.. */

static int raidlock(struct raid_softc *);
static void raidunlock(struct raid_softc *);

static int raid_detach_unlocked(struct raid_softc *);

static void rf_markalldirty(RF_Raid_t *);
static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);

void rf_ReconThread(struct rf_recon_req_internal *);
void rf_RewriteParityThread(RF_Raid_t *raidPtr);
void rf_CopybackThread(RF_Raid_t *raidPtr);
void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
int rf_autoconfig(device_t);
void rf_buildroothack(RF_ConfigSet_t *);

RF_AutoConfig_t *rf_find_raid_components(void);
RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
static int rf_does_it_fit(RF_ConfigSet_t *, RF_AutoConfig_t *);
int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
void rf_create_configuration(RF_AutoConfig_t *, RF_Config_t *, RF_Raid_t *);
int rf_set_autoconfig(RF_Raid_t *, int);
int rf_set_rootpartition(RF_Raid_t *, int);
void rf_release_all_vps(RF_ConfigSet_t *);
void rf_cleanup_config_set(RF_ConfigSet_t *);
int rf_have_enough_components(RF_ConfigSet_t *);
struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);

/*
 * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
 * Note that this is overridden by having RAID_AUTOCONFIG as an option
 * in the kernel config file.
 */
#ifdef RAID_AUTOCONFIG
int raidautoconfig = 1;
#else
int raidautoconfig = 0;
#endif
static bool raidautoconfigdone = false;
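/*
 * Illustrative only: the override mentioned above comes from a kernel
 * configuration line such as
 *
 *	options 	RAID_AUTOCONFIG
 *
 * opt_raid_autoconfig.h (included above under _KERNEL_OPT) then defines
 * RAID_AUTOCONFIG, and raidautoconfig defaults to 1.
 */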
struct RF_Pools_s rf_pools;

static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
static kmutex_t raid_lock;

static struct raid_softc *
raidcreate(int unit) {
	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
	sc->sc_unit = unit;
	cv_init(&sc->sc_cv, "raidunit");
	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
	return sc;
}

static void
raiddestroy(struct raid_softc *sc) {
	cv_destroy(&sc->sc_cv);
	mutex_destroy(&sc->sc_mutex);
	kmem_free(sc, sizeof(*sc));
}

static struct raid_softc *
raidget(int unit, bool create) {
	struct raid_softc *sc;
	if (unit < 0) {
#ifdef DIAGNOSTIC
		panic("%s: unit %d!", __func__, unit);
#endif
		return NULL;
	}
	mutex_enter(&raid_lock);
	LIST_FOREACH(sc, &raids, sc_link) {
		if (sc->sc_unit == unit) {
			mutex_exit(&raid_lock);
			return sc;
		}
	}
	mutex_exit(&raid_lock);
	if (!create)
		return NULL;
	sc = raidcreate(unit);
	mutex_enter(&raid_lock);
	LIST_INSERT_HEAD(&raids, sc, sc_link);
	mutex_exit(&raid_lock);
	return sc;
}

static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}
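/*
 * Usage sketch (illustrative only): raidget()'s "create" flag decides
 * whether an unknown unit is instantiated on lookup.  The open path
 * passes true so a fresh unit can be configured; every other entry
 * point passes false and fails with ENXIO instead:
 *
 *	if ((rs = raidget(unit, false)) == NULL)
 *		return ENXIO;
 */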
void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}

int
rf_autoconfig(device_t self)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets;

	if (!raidautoconfig || raidautoconfigdone == true)
		return (0);

	/* XXX This code can only be run once. */
	raidautoconfigdone = true;

#ifdef __HAVE_CPU_BOOTCONF
	/*
	 * 0. find the boot device if needed first so we can use it later
	 * this needs to be done before we autoconfigure any raid sets,
	 * because if we use wedges we are not going to be able to open
	 * the boot device later
	 */
	if (booted_device == NULL)
		cpu_bootconf();
#endif
	/* 1. locate all RAID components on the system */
	aprint_debug("Searching for RAID components...\n");
	ac_list = rf_find_raid_components();

	/* 2. Sort them into their respective sets. */
	config_sets = rf_create_auto_sets(ac_list);

	/*
	 * 3. Evaluate each set and configure the valid ones.
	 * This gets done in rf_buildroothack().
	 */
	rf_buildroothack(config_sets);

	return 1;
}

int
rf_inited(const struct raid_softc *rs) {
	return (rs->sc_flags & RAIDF_INITED) != 0;
}

RF_Raid_t *
rf_get_raid(struct raid_softc *rs) {
	return &rs->sc_r;
}

int
rf_get_unit(const struct raid_softc *rs) {
	return rs->sc_unit;
}

static int
rf_containsboot(RF_Raid_t *r, device_t bdv) {
	const char *bootname;
	size_t len;

	/* if bdv is NULL, the set can't contain it. exit early. */
	if (bdv == NULL)
		return 0;

	bootname = device_xname(bdv);
	len = strlen(bootname);

	for (int col = 0; col < r->numCol; col++) {
		const char *devname = r->Disks[col].devname;
		devname += sizeof("/dev/") - 1;
		if (strncmp(devname, "dk", 2) == 0) {
			const char *parent =
			    dkwedge_get_parent_name(r->Disks[col].dev);
			if (parent != NULL)
				devname = parent;
		}
		if (strncmp(devname, bootname, len) == 0) {
			struct raid_softc *sc = r->softc;
			aprint_debug("raid%d includes boot device %s\n",
			    sc->sc_unit, devname);
			return 1;
		}
	}
	return 0;
}
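/*
 * Worked example (illustrative, hypothetical device names): a component
 * recorded as "/dev/wd0a" is compared as "wd0a", so a boot from wd0
 * matches via the strncmp() prefix test above.  For a wedge component
 * such as "/dev/dk3", the wedge's parent disk name (e.g. "wd0") is
 * substituted before the comparison, so booting from the parent disk
 * still identifies the set.
 */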
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;
	struct raid_softc *sc, *rsc;
	struct dk_softc *dksc = NULL;	/* XXX gcc -Os: may be used uninit. */

	sc = rsc = NULL;
	num_root = 0;
	cset = config_sets;
	while (cset != NULL) {
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure == 1) {
			sc = rf_auto_config_set(cset);
			if (sc != NULL) {
				aprint_debug("raid%d: configured ok, rootable %d\n",
				    sc->sc_unit, cset->rootable);
				if (cset->rootable) {
					rsc = sc;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed\n");
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL) {
		DPRINTF("%s: rootspec %s\n", __func__, rootspec);
		return;
	}

	/* we found something bootable... */

	/*
	 * XXX: The following code assumes that the root raid
	 * is the first ('a') partition. This is about the best
	 * we can do with a BSD disklabel, but we might be able
	 * to do better with a GPT label, by setting a specified
	 * attribute to indicate the root partition. We can then
	 * stash the partition number in the r->root_partition
	 * high bits (the bottom 2 bits are already used). For
	 * now we just set booted_partition to 0 when we override
	 * root.
	 */
	if (num_root == 1) {
		device_t candidate_root;
		dksc = &rsc->sc_dksc;
		if (dksc->sc_dkdev.dk_nwedges != 0) {
			char cname[sizeof(cset->ac->devname)];
			/* XXX: assume partition 'a' first */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dksc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
			DPRINTF("%s: candidate wedge root=%s\n", __func__,
			    cname);
			if (candidate_root == NULL) {
				/*
				 * If that is not found, because we don't use
				 * disklabel, return the first dk child
				 * XXX: we can skip the 'a' check above
				 * and always do this...
				 */
				size_t i = 0;
				candidate_root = dkwedge_find_by_parent(
				    device_xname(dksc->sc_dev), &i);
			}
			DPRINTF("%s: candidate wedge root=%p\n", __func__,
			    candidate_root);
		} else
			candidate_root = dksc->sc_dev;
		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
		DPRINTF("%s: booted_device=%p root_partition=%d "
		    "contains_boot=%d\n",
		    __func__, booted_device, rsc->sc_r.root_partition,
		    rf_containsboot(&rsc->sc_r, booted_device));
		/* XXX the check for booted_device == NULL can probably be
		 * dropped, now that rf_containsboot handles that case.
		 */
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_method = "raidframe/single";
			booted_partition = 0;	/* XXX assume 'a' */
		}
	} else if (num_root > 1) {
		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
		    booted_device);

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
				dksc = &rsc->sc_dksc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = dksc->sc_dev;
			booted_method = "raidframe/multi";
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}

static int
raidsize(dev_t dev)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;
	unsigned int unit;

	unit = raidunit(dev);
	if ((rs = raidget(unit, false)) == NULL)
		return -1;
	dksc = &rs->sc_dksc;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return -1;

	return dk_size(dksc, dev);
}

static int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	unsigned int unit;
	struct raid_softc *rs;
	struct dk_softc *dksc;

	unit = raidunit(dev);
	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENODEV;

	/*
	 * Note that blkno is relative to this particular partition.
	 * By adding RF_PROTECTED_SECTORS, we get a value that is
	 * relative to the partition used for the underlying component.
	 */
	blkno += RF_PROTECTED_SECTORS;

	return dk_dump(dksc, dev, blkno, va, size, DK_DUMP_RECURSIVE);
}
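/*
 * Worked example (illustrative): RAIDframe reserves the first
 * RF_PROTECTED_SECTORS of each component for its own metadata, so a dump
 * aimed at partition-relative block N must land on component block
 * N + RF_PROTECTED_SECTORS; the adjustment above is what keeps a crash
 * dump from overwriting the component label area.
 */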
static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int c, sparecol, j, scol, dumpto;
	int error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	 * Look for a component to dump to.  The preference for the
	 * component to dump to is as follows:
	 * 1) the master
	 * 2) a used_spare of the master
	 * 3) the slave
	 * 4) a used_spare of the slave
	 */

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	 * At this point we have possibly selected a live master or a
	 * live slave.  We now check to see if there is a spared
	 * master (or a spared slave), if we didn't find a live master
	 * or a live slave.
	 */

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/* How about this one? */
			scol = -1;
			for (j = 0; j < raidPtr->numCol; j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				 * We must have found a spared master!
				 * We'll take that over anything else
				 * found so far.  (We couldn't have
				 * found a real master before, since
				 * this is a used spare, and it's
				 * saying that it's replacing the
				 * master.)  On reboot (with
				 * autoconfiguration turned on)
				 * sparecol will become the 1st
				 * component (component0) of this set.
				 */
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				 * Must be a spared slave.  We'll dump
				 * to that if we haven't found anything
				 * else so far.
				 */
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!? */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
	    blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}

/* ARGSUSED */
static int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	int error = 0;
	int part, pmask;

	if ((rs = raidget(unit, true)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	dksc = &rs->sc_dksc;

	part = DISKPART(dev);
	pmask = (1 << part);

	if (!DK_BUSY(dksc, pmask) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		   have done a configure before this.  I DO NOT WANT TO BE
		   SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		   THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}

	if ((rs->sc_flags & RAIDF_INITED) != 0)
		error = dk_open(dksc, dev, flags, fmt, l);

bad:
	raidunlock(rs);

	return (error);
}

static int
raid_lastclose(device_t self)
{
	struct raid_softc *rs = raidsoftc(self);

	/* Last one... device is not unconfigured yet.
	   Device shutdown has taken care of setting the
	   clean bits if RAIDF_INITED is not set;
	   mark things as clean... */

	rf_update_component_labels(&rs->sc_r,
	    RF_FINAL_COMPONENT_UPDATE);

	/* pass to unlocked code */
	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		rs->sc_flags |= RAIDF_DETACH;

	return 0;
}

/* ARGSUSED */
static int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	cfdata_t cf;
	int error = 0, do_detach = 0, do_put = 0;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_INITED) != 0) {
		error = dk_close(dksc, dev, flags, fmt, l);
		if ((rs->sc_flags & RAIDF_DETACH) != 0)
			do_detach = 1;
	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		do_put = 1;

	raidunlock(rs);

	if (do_detach) {
		/* free the pseudo device attach bits */
		cf = device_cfdata(dksc->sc_dev);
		error = config_detach(dksc->sc_dev, 0);
		if (error == 0)
			free(cf, M_RAIDFRAME);
	} else if (do_put) {
		raidput(rs);
	}

	return (error);
}

static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}

static void
raidstrategy(struct buf *bp)
{
	unsigned int unit;
	struct raid_softc *rs;
	struct dk_softc *dksc;
	RF_Raid_t *raidPtr;

	unit = raidunit(bp->b_dev);
	if ((rs = raidget(unit, false)) == NULL) {
		bp->b_error = ENXIO;
		goto fail;
	}
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		bp->b_error = ENXIO;
		goto fail;
	}
	dksc = &rs->sc_dksc;
	raidPtr = &rs->sc_r;

	/* Queue IO only */
	if (dk_strategy_defer(dksc, bp))
		goto done;

	/* schedule the IO to happen at the next convenient time */
	raid_wakeup(raidPtr);

done:
	return;

fail:
	bp->b_resid = bp->b_bcount;
	biodone(bp);
}

static int
raid_diskstart(device_t dev, struct buf *bp)
{
	struct raid_softc *rs = raidsoftc(dev);
	RF_Raid_t *raidPtr;

	raidPtr = &rs->sc_r;
	if (!raidPtr->valid) {
		db1_printf(("raid is not valid..\n"));
		return ENODEV;
	}

	/* XXX */
	bp->b_resid = 0;

	return raiddoaccess(raidPtr, bp);
}

void
raiddone(RF_Raid_t *raidPtr, struct buf *bp)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;

	dk_done(dksc, bp);

	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings++;
	rf_unlock_mutex2(raidPtr->mutex);

	/* schedule more IO */
	raid_wakeup(raidPtr);
}

/* ARGSUSED */
static int
raidread(dev_t dev, struct uio *uio, int flags)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return (ENXIO);

	return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
}

/* ARGSUSED */
static int
raidwrite(dev_t dev, struct uio *uio, int flags)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return (ENXIO);

	return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
}

static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}

static bool
rf_must_be_initialized(const struct raid_softc *rs, u_long cmd)
{
	switch (cmd) {
	case RAIDFRAME_ADD_HOT_SPARE:
	case RAIDFRAME_CHECK_COPYBACK_STATUS:
	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
	case RAIDFRAME_CHECK_PARITY:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
	case RAIDFRAME_CHECK_RECON_STATUS:
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
	case RAIDFRAME_COPYBACK:
	case RAIDFRAME_DELETE_COMPONENT:
	case RAIDFRAME_FAIL_DISK:
	case RAIDFRAME_GET_ACCTOTALS:
	case RAIDFRAME_GET_COMPONENT_LABEL:
	case RAIDFRAME_GET_INFO:
	case RAIDFRAME_GET_SIZE:
	case RAIDFRAME_INCORPORATE_HOT_SPARE:
	case RAIDFRAME_INIT_LABELS:
	case RAIDFRAME_KEEP_ACCTOTALS:
	case RAIDFRAME_PARITYMAP_GET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_PARAMS:
	case RAIDFRAME_PARITYMAP_STATUS:
	case RAIDFRAME_REBUILD_IN_PLACE:
	case RAIDFRAME_REMOVE_HOT_SPARE:
	case RAIDFRAME_RESET_ACCTOTALS:
	case RAIDFRAME_REWRITEPARITY:
	case RAIDFRAME_SET_AUTOCONFIG:
	case RAIDFRAME_SET_COMPONENT_LABEL:
	case RAIDFRAME_SET_ROOT:
		return (rs->sc_flags & RAIDF_INITED) == 0;
	}
	return false;
}

int
rf_fail_disk(RF_Raid_t *raidPtr, struct rf_recon_req *rr)
{
	struct rf_recon_req_internal *rrint;

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (rr->col < 0 || rr->col >= raidPtr->numCol) {
		/* bad column */
		return EINVAL;
	}

	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->status == rf_rs_reconstructing) {
		/* you can't fail a disk while we're reconstructing! */
		/* XXX wrong for RAID6 */
		goto out;
	}
	if ((raidPtr->Disks[rr->col].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* some other component has failed.  Let's not make
		   things worse. XXX wrong for RAID6 */
		goto out;
	}
	if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
		/* Can't fail a spared disk! */
		goto out;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* make a copy of the recon request so that we don't rely on
	 * the user's buffer */
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return (ENOMEM);
	rrint->col = rr->col;
	rrint->flags = rr->flags;
	rrint->raidPtr = raidPtr;

	return RF_CREATE_THREAD(raidPtr->recon_thread, rf_ReconThread,
	    rrint, "raid_recon");
out:
	rf_unlock_mutex2(raidPtr->mutex);
	return EINVAL;
}

static int
rf_copyinspecificbuf(RF_Config_t *k_cfg)
{
	/* allocate a buffer for the layout-specific data, and copy it in */
	if (k_cfg->layoutSpecificSize == 0)
		return 0;

	if (k_cfg->layoutSpecificSize > 10000) {
		/* sanity check */
		return EINVAL;
	}

	u_char *specific_buf;
	specific_buf = RF_Malloc(k_cfg->layoutSpecificSize);
	if (specific_buf == NULL)
		return ENOMEM;

	int retcode = copyin(k_cfg->layoutSpecific, specific_buf,
	    k_cfg->layoutSpecificSize);
	if (retcode) {
		RF_Free(specific_buf, k_cfg->layoutSpecificSize);
		db1_printf(("%s: retcode=%d copyin.2\n", __func__, retcode));
		return retcode;
	}

	k_cfg->layoutSpecific = specific_buf;
	return 0;
}

static int
rf_getConfiguration(struct raid_softc *rs, void *data, RF_Config_t **k_cfg)
{
	RF_Config_t *u_cfg = *((RF_Config_t **) data);

	if (rs->sc_r.valid) {
		/* There is a valid RAID set running on this unit! */
		printf("raid%d: Device already configured!\n", rs->sc_unit);
		return EINVAL;
	}

	/* copy-in the configuration information */
	/* data points to a pointer to the configuration structure */
	*k_cfg = RF_Malloc(sizeof(**k_cfg));
	if (*k_cfg == NULL) {
		return ENOMEM;
	}
	int retcode = copyin(u_cfg, *k_cfg, sizeof(RF_Config_t));
	if (retcode == 0)
		return 0;
	RF_Free(*k_cfg, sizeof(RF_Config_t));
	db1_printf(("%s: retcode=%d copyin.1\n", __func__, retcode));
	rs->sc_flags |= RAIDF_SHUTDOWN;
	return retcode;
}
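#if 0	/* Userland usage sketch, illustrative only: error handling is
	 * minimal and "fd" is a hypothetical descriptor for the raid
	 * control device.  Note the double indirection -- the ioctl
	 * argument is a pointer to an RF_Config_t pointer, matching the
	 * "*((RF_Config_t **) data)" unpacking above. */
	RF_Config_t cfg, *cfgp = &cfg;

	memset(&cfg, 0, sizeof(cfg));
	/* fill in cfg.numCol, cfg.devnames[0][], cfg.parityConfig, ... */
	if (ioctl(fd, RAIDFRAME_CONFIGURE, &cfgp) == -1)
		err(1, "RAIDFRAME_CONFIGURE");
#endif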

int
rf_construct(struct raid_softc *rs, RF_Config_t *k_cfg)
{
	int retcode;
	RF_Raid_t *raidPtr = &rs->sc_r;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((retcode = rf_copyinspecificbuf(k_cfg)) != 0)
		goto out;

	/* should do some kind of sanity check on the configuration.
	 * Store the sum of all the bytes in the last byte? */

	/* configure the system */

	/*
	 * Clear the entire RAID descriptor, just to make sure
	 * there is no stale data left in the case of a
	 * reconfiguration
	 */
	memset(raidPtr, 0, sizeof(*raidPtr));
	raidPtr->softc = rs;
	raidPtr->raidid = rs->sc_unit;

	retcode = rf_Configure(raidPtr, k_cfg, NULL);

	if (retcode == 0) {
		/* allow this many simultaneous IO's to
		   this RAID device */
		raidPtr->openings = RAIDOUTSTANDING;

		raidinit(rs);
		raid_wakeup(raidPtr);
		rf_markalldirty(raidPtr);
	}

	/* free the buffers.  No return code here. */
	if (k_cfg->layoutSpecificSize) {
		RF_Free(k_cfg->layoutSpecific, k_cfg->layoutSpecificSize);
	}
out:
	RF_Free(k_cfg, sizeof(RF_Config_t));
	if (retcode) {
		/*
		 * If configuration failed, set sc_flags so that we
		 * will detach the device when we close it.
		 */
		rs->sc_flags |= RAIDF_SHUTDOWN;
	}
	return retcode;
}

#if RF_DISABLED
static int
rf_set_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{

	/* XXX check the label for valid stuff... */
	/* Note that some things *should not* get modified --
	   the user should be re-initing the labels instead of
	   trying to patch things.
	   */
#ifdef DEBUG
	int raidid = raidPtr->raidid;
	printf("raid%d: Got component label:\n", raidid);
	printf("raid%d: Version: %d\n", raidid, clabel->version);
	printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
	printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
	printf("raid%d: Column: %d\n", raidid, clabel->column);
	printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
	printf("raid%d: Clean: %d\n", raidid, clabel->clean);
	printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif	/* DEBUG */
	clabel->row = 0;
	int column = clabel->column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return (EINVAL);
	}

	/* XXX this isn't allowed to do anything for now :-) */

	/* XXX and before it is, we need to fill in the rest
	   of the fields!?!?!?! */
	memcpy(raidget_component_label(raidPtr, column),
	    clabel, sizeof(*clabel));
	raidflush_component_label(raidPtr, column);
	return 0;
}
#endif

static int
rf_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/*
	 * we only want the serial number from the above.
	 * We get all the rest of the information from the
	 * config that was used to create this RAID set.
	 */

	raidPtr->serial_number = clabel->serial_number;

	for (int column = 0; column < raidPtr->numCol; column++) {
		RF_RaidDisk_t *diskPtr = &raidPtr->Disks[column];
		if (RF_DEAD_DISK(diskPtr->status))
			continue;
		RF_ComponentLabel_t *ci_label = raidget_component_label(
		    raidPtr, column);
		/* Zeroing this is important. */
		memset(ci_label, 0, sizeof(*ci_label));
		raid_init_component_label(raidPtr, ci_label);
		ci_label->serial_number = raidPtr->serial_number;
		ci_label->row = 0;	/* we don't pretend to support more */
		rf_component_label_set_partitionsize(ci_label,
		    diskPtr->partitionSize);
		ci_label->column = column;
		raidflush_component_label(raidPtr, column);
		/* XXXjld what about the spares? */
	}

	return 0;
}

static int
rf_rebuild_in_place(RF_Raid_t *raidPtr, RF_SingleComponent_t *componentPtr)
{

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (raidPtr->recon_in_progress == 1) {
		/* a reconstruct is already in progress! */
		return EINVAL;
	}

	RF_SingleComponent_t component;
	memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
	component.row = 0;	/* we don't support any more */
	int column = component.column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return EINVAL;
	}

	rf_lock_mutex2(raidPtr->mutex);
	if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* XXX 0 above shouldn't be constant!!! */
		/* some component other than this has failed.
		   Let's not make things worse than they already
		   are... */
		printf("raid%d: Unable to reconstruct to disk at:\n",
		    raidPtr->raidid);
		printf("raid%d: Col: %d Too many failures.\n",
		    raidPtr->raidid, column);
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_reconstructing) {
		printf("raid%d: Unable to reconstruct to disk at:\n",
		    raidPtr->raidid);
		printf("raid%d: Col: %d "
		    "Reconstruction already occurring!\n",
		    raidPtr->raidid, column);

		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_spared) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	rf_unlock_mutex2(raidPtr->mutex);

	struct rf_recon_req_internal *rrint;
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return ENOMEM;

	rrint->col = column;
	rrint->raidPtr = raidPtr;

	return RF_CREATE_THREAD(raidPtr->recon_thread,
	    rf_ReconstructInPlaceThread, rrint, "raid_reconip");
}

static int
rf_check_recon_status(RF_Raid_t *raidPtr, int *data)
{
	/*
	 * This makes no sense on a RAID 0, or if we are not reconstructing
	 * so tell the user it's done.
	 */
	if (raidPtr->Layout.map->faultsTolerated == 0 ||
	    raidPtr->status != rf_rs_reconstructing) {
		*data = 100;
		return 0;
	}
	if (raidPtr->reconControl->numRUsTotal == 0) {
		*data = 0;
		return 0;
	}
	*data = (raidPtr->reconControl->numRUsComplete * 100
	    / raidPtr->reconControl->numRUsTotal);
	return 0;
}

static int
raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
	int unit = raidunit(dev);
	int part, pmask;
	struct raid_softc *rs;
	struct dk_softc *dksc;
	RF_Config_t *k_cfg;
	RF_Raid_t *raidPtr;
	RF_AccTotals_t *totals;
	RF_SingleComponent_t component;
	RF_DeviceConfig_t *d_cfg, *ucfgp;
	int retcode = 0;
	int column;
	RF_ComponentLabel_t *clabel;
	RF_SingleComponent_t *sparePtr, *componentPtr;
	int d;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;

	dksc = &rs->sc_dksc;
	raidPtr = &rs->sc_r;

	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
	    (int) DISKPART(dev), (int) unit, cmd));

	/* Must be initialized for these... */
	if (rf_must_be_initialized(rs, cmd))
		return ENXIO;

	switch (cmd) {
		/* configure the system */
	case RAIDFRAME_CONFIGURE:
		if ((retcode = rf_getConfiguration(rs, data, &k_cfg)) != 0)
			return retcode;
		return rf_construct(rs, k_cfg);

		/* shutdown the system */
	case RAIDFRAME_SHUTDOWN:

		part = DISKPART(dev);
		pmask = (1 << part);

		if ((retcode = raidlock(rs)) != 0)
			return retcode;

		if (DK_BUSY(dksc, pmask) ||
		    raidPtr->recon_in_progress != 0 ||
		    raidPtr->parity_rewrite_in_progress != 0 ||
		    raidPtr->copyback_in_progress != 0)
			retcode = EBUSY;
		else {
			/* detach and free on close */
			rs->sc_flags |= RAIDF_SHUTDOWN;
			retcode = 0;
		}

		raidunlock(rs);

		return retcode;
	case RAIDFRAME_GET_COMPONENT_LABEL:
		return rf_get_component_label(raidPtr, data);

#if RF_DISABLED
	case RAIDFRAME_SET_COMPONENT_LABEL:
		return rf_set_component_label(raidPtr, data);
#endif

	case RAIDFRAME_INIT_LABELS:
		return rf_init_component_label(raidPtr, data);

	case RAIDFRAME_SET_AUTOCONFIG:
		d = rf_set_autoconfig(raidPtr, *(int *) data);
		printf("raid%d: New autoconfig value is: %d\n",
		    raidPtr->raidid, d);
		*(int *) data = d;
		return retcode;

	case RAIDFRAME_SET_ROOT:
		d = rf_set_rootpartition(raidPtr, *(int *) data);
		printf("raid%d: New rootpartition value is: %d\n",
		    raidPtr->raidid, d);
		*(int *) data = d;
		return retcode;

		/* initialize all parity */
	case RAIDFRAME_REWRITEPARITY:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Parity for RAID 0 is trivially correct */
			raidPtr->parity_good = RF_RAID_CLEAN;
			return 0;
		}

		if (raidPtr->parity_rewrite_in_progress == 1) {
			/* Re-write is already in progress! */
			return EINVAL;
		}

		return RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
		    rf_RewriteParityThread, raidPtr, "raid_parity");

	case RAIDFRAME_ADD_HOT_SPARE:
		sparePtr = (RF_SingleComponent_t *) data;
		memcpy(&component, sparePtr, sizeof(RF_SingleComponent_t));
		return rf_add_hot_spare(raidPtr, &component);

	case RAIDFRAME_REMOVE_HOT_SPARE:
		return retcode;

	case RAIDFRAME_DELETE_COMPONENT:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
		return rf_delete_component(raidPtr, &component);

	case RAIDFRAME_INCORPORATE_HOT_SPARE:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
		return rf_incorporate_hot_spare(raidPtr, &component);

	case RAIDFRAME_REBUILD_IN_PLACE:
		return rf_rebuild_in_place(raidPtr, data);

	case RAIDFRAME_GET_INFO:
		ucfgp = *(RF_DeviceConfig_t **)data;
		d_cfg = RF_Malloc(sizeof(*d_cfg));
		if (d_cfg == NULL)
			return ENOMEM;
		retcode = rf_get_info(raidPtr, d_cfg);
		if (retcode == 0) {
			retcode = copyout(d_cfg, ucfgp, sizeof(*d_cfg));
		}
		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
		return retcode;

	case RAIDFRAME_CHECK_PARITY:
		*(int *) data = raidPtr->parity_good;
		return 0;

	case RAIDFRAME_PARITYMAP_STATUS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_status(raidPtr->parity_map, data);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_PARAMS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		if (raidPtr->parity_map == NULL)
			return ENOENT;	/* ??? */
		if (rf_paritymap_set_params(raidPtr->parity_map, data, 1) != 0)
			return EINVAL;
		return 0;

	case RAIDFRAME_PARITYMAP_GET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		*(int *) data = rf_paritymap_get_disable(raidPtr);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_set_disable(raidPtr, *(int *)data);
		/* XXX should errors be passed up? */
		return 0;

	case RAIDFRAME_RESET_ACCTOTALS:
		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
		return 0;

	case RAIDFRAME_GET_ACCTOTALS:
		totals = (RF_AccTotals_t *) data;
		*totals = raidPtr->acc_totals;
		return 0;

	case RAIDFRAME_KEEP_ACCTOTALS:
		raidPtr->keep_acc_totals = *(int *)data;
		return 0;

	case RAIDFRAME_GET_SIZE:
		*(int *) data = raidPtr->totalSectors;
		return 0;

	case RAIDFRAME_FAIL_DISK:
		return rf_fail_disk(raidPtr, data);

		/* invoke a copyback operation after recon on whatever disk
		 * needs it, if any */
	case RAIDFRAME_COPYBACK:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0!! */
			return EINVAL;
		}

		if (raidPtr->copyback_in_progress == 1) {
			/* Copyback is already in progress! */
			return EINVAL;
		}

		return RF_CREATE_THREAD(raidPtr->copyback_thread,
		    rf_CopybackThread, raidPtr, "raid_copyback");

		/* return the percentage completion of reconstruction */
	case RAIDFRAME_CHECK_RECON_STATUS:
		return rf_check_recon_status(raidPtr, data);

	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
		rf_check_recon_status_ext(raidPtr, data);
		return 0;

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return 0;
		}
		if (raidPtr->parity_rewrite_in_progress == 1) {
			*(int *) data = 100 *
			    raidPtr->parity_rewrite_stripes_done /
			    raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return 0;

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
		rf_check_parityrewrite_status_ext(raidPtr, data);
		return 0;

	case RAIDFRAME_CHECK_COPYBACK_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0 */
			*(int *) data = 100;
			return 0;
		}
		if (raidPtr->copyback_in_progress == 1) {
			*(int *) data = 100 * raidPtr->copyback_stripes_done /
			    raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return 0;

	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
		rf_check_copyback_status_ext(raidPtr, data);
		return 0;

	case RAIDFRAME_SET_LAST_UNIT:
		for (column = 0; column < raidPtr->numCol; column++)
			if (raidPtr->Disks[column].status != rf_ds_optimal)
				return EBUSY;

		for (column = 0; column < raidPtr->numCol; column++) {
			clabel = raidget_component_label(raidPtr, column);
			clabel->last_unit = *(int *)data;
			raidflush_component_label(raidPtr, column);
		}
		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
		return 0;

		/* the sparetable daemon calls this to wait for the kernel to
		 * need a spare table. this ioctl does not return until a
		 * spare table is needed. XXX -- calling mpsleep here in the
		 * ioctl code is almost certainly wrong and evil. -- XXX XXX
		 * -- I should either compute the spare table in the kernel,
		 * or have a different -- XXX XXX -- interface (a different
		 * character device) for delivering the table -- XXX */
#if RF_DISABLED
	case RAIDFRAME_SPARET_WAIT:
		rf_lock_mutex2(rf_sparet_wait_mutex);
		while (!rf_sparet_wait_queue)
			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
		RF_SparetWait_t *waitreq = rf_sparet_wait_queue;
		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		/* structure assignment */
		*((RF_SparetWait_t *) data) = *waitreq;

		RF_Free(waitreq, sizeof(*waitreq));
		return 0;

		/* wakes up a process waiting on SPARET_WAIT and puts an error
		 * code in it that will cause the daemon to exit */
	case RAIDFRAME_ABORT_SPARET_WAIT:
		waitreq = RF_Malloc(sizeof(*waitreq));
		waitreq->fcol = -1;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_wait_queue;
		rf_sparet_wait_queue = waitreq;
		rf_broadcast_cond2(rf_sparet_wait_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);
		return 0;

		/* used by the spare table daemon to deliver a spare table
		 * into the kernel */
	case RAIDFRAME_SEND_SPARET:

		/* install the spare table */
		retcode = rf_SetSpareTable(raidPtr, *(void **) data);

		/* respond to the requestor.  the return status of the spare
		 * table installation is passed in the "fcol" field */
		waitreq = RF_Malloc(sizeof(*waitreq));
		waitreq->fcol = retcode;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_resp_queue;
		rf_sparet_resp_queue = waitreq;
		rf_broadcast_cond2(rf_sparet_resp_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		return retcode;
#endif
	default:
		/*
		 * Don't bother trying to load compat modules
		 * if it is not our ioctl. This is more efficient
		 * and makes rump tests not depend on compat code
		 */
		if (IOCGROUP(cmd) != 'r')
			break;
#ifdef _LP64
		if ((l->l_proc->p_flag & PK_32) != 0) {
			module_autoload("compat_netbsd32_raid",
			    MODULE_CLASS_EXEC);
			MODULE_HOOK_CALL(raidframe_netbsd32_ioctl_hook,
			    (rs, cmd, data), enosys(), retcode);
			if (retcode != EPASSTHROUGH)
				return retcode;
		}
#endif
		module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
		MODULE_HOOK_CALL(raidframe_ioctl_80_hook,
		    (rs, cmd, data), enosys(), retcode);
		if (retcode != EPASSTHROUGH)
			return retcode;

		module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
		MODULE_HOOK_CALL(raidframe_ioctl_50_hook,
		    (rs, cmd, data), enosys(), retcode);
		if (retcode != EPASSTHROUGH)
			return retcode;
		break;	/* fall through to the os-specific code below */

	}

	if (!raidPtr->valid)
		return (EINVAL);

	/*
	 * Add support for "regular" device ioctls here.
	 */

	switch (cmd) {
	case DIOCGCACHE:
		retcode = rf_get_component_caches(raidPtr, (int *)data);
		break;

	case DIOCCACHESYNC:
		retcode = rf_sync_component_caches(raidPtr);
		break;

	default:
		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
		break;
	}

	return (retcode);
}
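#if 0	/* Userland usage sketch, illustrative only ("fd" is hypothetical):
	 * the status ioctls above exchange a plain int percentage, so a
	 * monitor can poll reconstruction progress like this. */
	int pct;

	if (ioctl(fd, RAIDFRAME_CHECK_RECON_STATUS, &pct) == 0)
		printf("reconstruction: %d%% complete\n", pct);
#endif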

/* raidinit -- complete the rest of the initialization for the
   RAIDframe device. */

static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	dkwedge_discover(&dksc->sc_dkdev);
}

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
/* wake up the daemon & tell it to get us a spare table
 * XXX
 * the entries in the queues should be tagged with the raidPtr
 * so that in the extremely rare case that two recons happen at once,
 * we know for which device we're requesting a spare table
 * XXX
 *
 * XXX This code is not currently used. GO
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int retcode;

	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
#endif
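#if 0	/* Illustrative only: the daemon side of the (currently disabled)
	 * spare-table handshake, sketched from the ioctl semantics above.
	 * "fd" and build_sparet() are hypothetical. */
	RF_SparetWait_t req;
	void *table;

	for (;;) {
		if (ioctl(fd, RAIDFRAME_SPARET_WAIT, &req) == -1)
			break;
		if (req.fcol == -1)	/* RAIDFRAME_ABORT_SPARET_WAIT */
			break;
		table = build_sparet(&req);
		ioctl(fd, RAIDFRAME_SEND_SPARET, &table);
	}
#endif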

/* a wrapper around rf_DoAccess that extracts appropriate info from the
 * bp & passes it down.
 * any calls originating in the kernel must use non-blocking I/O
 * do some extra sanity checking to return "appropriate" error values for
 * certain conditions (to make some standard utilities work)
 *
 * Formerly known as: rf_DoAccessKernel
 */
void
raidstart(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
		    RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
		return;
	}

	dk_start(dksc, NULL);
}

static int
raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	daddr_t blocknum;
	int do_async;
	int rc;

	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->openings == 0) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EAGAIN;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	blocknum = bp->b_rawblkno;

	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
	    (int) blocknum));

	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

	/* *THIS* is where we adjust what block we're going to...
	 * but DO NOT TOUCH bp->b_blkno!!! */
	raid_addr = blocknum;

	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
	sum = raid_addr + num_blocks + pb;
	if (1 || rf_debugKernelAccess) {
		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
		    (int) raid_addr, (int) sum, (int) num_blocks,
		    (int) pb, (int) bp->b_resid));
	}
	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
	    || (sum < num_blocks) || (sum < pb)) {
		rc = ENOSPC;
		goto done;
	}
	/*
	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
	 */

	if (bp->b_bcount & raidPtr->sectorMask) {
		rc = ENOSPC;
		goto done;
	}
	db1_printf(("Calling DoAccess..\n"));


	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings--;
	rf_unlock_mutex2(raidPtr->mutex);

	/*
	 * Everything is async.
	 */
	do_async = 1;

	/* don't ever condition on bp->b_flags & B_WRITE.
	 * always condition on B_READ instead */

	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
	    RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
	    do_async, raid_addr, num_blocks,
	    bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

done:
	return rc;
}
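/*
 * Worked example (illustrative): the ENOSPC tests above catch both
 * out-of-range requests and unsigned wraparound.  On a set with
 * totalSectors = 1000, a request at raid_addr = 990 for num_blocks = 20
 * yields sum = 1010 > totalSectors and is rejected; a sum that wrapped
 * around would come out smaller than one of its addends and fail the
 * (sum < raid_addr) style checks instead.
 */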
/* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */

int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf("WAKEUP CALLED\n");
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			req->type, queue->raidPtr->raidid,
			queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return (0);
}
/* this is the callback function associated with an I/O invoked from
   kernel code.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive...  If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		    rf_ds_optimal) ||
		    (queue->raidPtr->Disks[queue->col].status ==
		    rf_ds_used_spare)) &&
		    (queue->raidPtr->numFailures <
		    queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d).  Marking %s as failed.\n",
			    queue->raidPtr->raidid,
			    bp->b_error,
			    queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}


/*
 * initialize a buf structure for doing an I/O in the kernel.
 */
static void
InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
    RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
    void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
    struct proc *b_proc)
{
	/* bp->b_flags = B_PHYS | rw_flag; */
	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
	bp->b_oflags = 0;
	bp->b_cflags = 0;
	bp->b_bcount = numSect << logBytesPerSector;
	bp->b_bufsize = bp->b_bcount;
	bp->b_error = 0;
	bp->b_dev = dev;
	bp->b_data = bf;
	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
	if (bp->b_bcount == 0) {
		panic("bp->b_bcount is zero in InitBP!!");
	}
	bp->b_proc = b_proc;
	bp->b_iodone = cbFunc;
	bp->b_private = cbArg;
}
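/*
 * Example of the b_blkno conversion above, with made-up numbers: for a
 * component with 4096-byte sectors (logBytesPerSector == 12) and the
 * usual DEV_BSHIFT of 9, startSect == 10 becomes byte offset 40960,
 * which is DEV_BSIZE block 80.  For 512-byte sectors the two shifts
 * cancel and b_blkno equals startSect.
 */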
/*
 * Wait interruptibly for an exclusive lock.
 *
 * XXX
 * Several drivers do this; it should be abstracted and made MP-safe.
 * (Hmm... where have we seen this warning before :->  GO )
 */
static int
raidlock(struct raid_softc *rs)
{
	int error;

	error = 0;
	mutex_enter(&rs->sc_mutex);
	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
		rs->sc_flags |= RAIDF_WANTED;
		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
		if (error != 0)
			goto done;
	}
	rs->sc_flags |= RAIDF_LOCKED;
done:
	mutex_exit(&rs->sc_mutex);
	return (error);
}
/*
 * Unlock and wake up any waiters.
 */
static void
raidunlock(struct raid_softc *rs)
{

	mutex_enter(&rs->sc_mutex);
	rs->sc_flags &= ~RAIDF_LOCKED;
	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
		rs->sc_flags &= ~RAIDF_WANTED;
		cv_broadcast(&rs->sc_cv);
	}
	mutex_exit(&rs->sc_mutex);
}


#define RF_COMPONENT_INFO_OFFSET  16384 /* bytes */
#define RF_COMPONENT_INFO_SIZE     1024 /* bytes */
#define RF_PARITY_MAP_SIZE   RF_PARITYMAP_NBYTE

static daddr_t
rf_component_info_offset(void)
{

	return RF_COMPONENT_INFO_OFFSET;
}

static daddr_t
rf_component_info_size(unsigned secsize)
{
	daddr_t info_size;

	KASSERT(secsize);
	if (secsize > RF_COMPONENT_INFO_SIZE)
		info_size = secsize;
	else
		info_size = RF_COMPONENT_INFO_SIZE;

	return info_size;
}

static daddr_t
rf_parity_map_offset(RF_Raid_t *raidPtr)
{
	daddr_t map_offset;

	KASSERT(raidPtr->bytesPerSector);
	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
		map_offset = raidPtr->bytesPerSector;
	else
		map_offset = RF_COMPONENT_INFO_SIZE;
	map_offset += rf_component_info_offset();

	return map_offset;
}

static daddr_t
rf_parity_map_size(RF_Raid_t *raidPtr)
{
	daddr_t map_size;

	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
		map_size = raidPtr->bytesPerSector;
	else
		map_size = RF_PARITY_MAP_SIZE;

	return map_size;
}

int
raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *clabel;

	clabel = raidget_component_label(raidPtr, col);
	clabel->clean = RF_RAID_CLEAN;
	raidflush_component_label(raidPtr, col);
	return(0);
}


int
raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *clabel;

	clabel = raidget_component_label(raidPtr, col);
	clabel->clean = RF_RAID_DIRTY;
	raidflush_component_label(raidPtr, col);
	return(0);
}

int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);
	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}

RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}

int
raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *label;

	label = &raidPtr->raid_cinfo[col].ci_label;
	label->mod_counter = raidPtr->mod_counter;
#ifndef RF_NO_PARITY_MAP
	label->parity_map_modcount = label->mod_counter;
#endif
	return raidwrite_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp, label);
}


static int
raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidread_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize));
}
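/*
 * Worked example of the on-disk metadata layout computed by the
 * rf_component_info_*() and rf_parity_map_*() helpers above (sector
 * sizes are illustrative): with 512-byte sectors the component label
 * lives at byte offset 16384 in a region of max(512, 1024) == 1024
 * bytes, so the parity map follows at 16384 + 1024 == 17408.  With
 * 4096-byte sectors both regions round up to one full sector: the
 * label region becomes 4096 bytes and the parity map starts at 20480.
 */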
/* ARGSUSED */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize)
{
	struct buf *bp;
	int error;

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
	bp->b_resid = dsize;

	bdev_strategy(bp);
	error = biowait(bp);

	if (!error) {
		memcpy(data, bp->b_data, msize);
	}

	brelse(bp, 0);
	return(error);
}


static int
raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidwrite_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize), 0);
}

/* ARGSUSED */
static int
raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
{
	struct buf *bp;
	int error;

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the write */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
	bp->b_resid = dsize;

	memset(bp->b_data, 0, dsize);
	memcpy(bp->b_data, data, msize);

	bdev_strategy(bp);
	if (asyncp)
		return 0;
	error = biowait(bp);
	brelse(bp, 0);
	if (error) {
#if 1
		printf("Failed to write RAID component info!\n");
#endif
	}

	return(error);
}

void
rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
{
	int c;

	for (c = 0; c < raidPtr->numCol; c++) {
		/* Skip dead disks. */
		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
			continue;
		/* XXXjld: what if an error occurs here? */
		raidwrite_component_area(raidPtr->Disks[c].dev,
		    raidPtr->raid_cinfo[c].ci_vp, map,
		    RF_PARITYMAP_NBYTE,
		    rf_parity_map_offset(raidPtr),
		    rf_parity_map_size(raidPtr), 0);
	}
}
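/*
 * Note on the read side below: each surviving component holds its own
 * copy of the parity map, and rf_paritymap_merge() (rf_paritymap.c)
 * folds them together.  The merge has to be conservative -- a region
 * marked dirty in any copy must stay dirty in the result -- so it is
 * effectively a bitwise OR of the per-component maps.
 */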
void
rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
{
	struct rf_paritymap_ondisk tmp;
	int c, first;

	first = 1;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* Skip dead disks. */
		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
			continue;
		raidread_component_area(raidPtr->Disks[c].dev,
		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
		    RF_PARITYMAP_NBYTE,
		    rf_parity_map_offset(raidPtr),
		    rf_parity_map_size(raidPtr));
		if (first) {
			memcpy(map, &tmp, sizeof(*map));
			first = 0;
		} else {
			rf_paritymap_merge(map, &tmp);
		}
	}
}

void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*
			 * we claim this disk is "optimal" if it's
			 * rf_ds_used_spare, as that means it should be
			 * directly substitutable for the disk it replaced.
			 * We note that too...
			 */

			for (j = 0; j < raidPtr->numCol; j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}


void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*
			 * we claim this disk is "optimal" if it's
			 * rf_ds_used_spare, as that means it should be
			 * directly substitutable for the disk it replaced.
			 * We note that too...
			 */

			for (j = 0; j < raidPtr->numCol; j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}

void
rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
{

	if (vp != NULL) {
		if (auto_configured == 1) {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

		} else {
			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
		}
	}
}


void
rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
{
	int r, c;
	struct vnode *vp;
	int acd;


	/* We take this opportunity to close the vnodes like we should.. */

	for (c = 0; c < raidPtr->numCol; c++) {
		vp = raidPtr->raid_cinfo[c].ci_vp;
		acd = raidPtr->Disks[c].auto_configured;
		rf_close_component(raidPtr, vp, acd);
		raidPtr->raid_cinfo[c].ci_vp = NULL;
		raidPtr->Disks[c].auto_configured = 0;
	}

	for (r = 0; r < raidPtr->numSpare; r++) {
		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
		rf_close_component(raidPtr, vp, acd);
		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
	}
}


void
rf_ReconThread(struct rf_recon_req_internal *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
	    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}

void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		rf_lock_mutex2(raidPtr->rad_lock);
		cv_broadcast(&raidPtr->parity_rewrite_cv);
		rf_unlock_mutex2(raidPtr->rad_lock);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}
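/*
 * These thread bodies are started from the ioctl paths elsewhere in
 * this file via the RF_CREATE_THREAD() wrapper (rf_threadstuff.h); as
 * a rough sketch only (argument names are illustrative, not a quote of
 * the caller):
 *
 *	retcode = RF_CREATE_THREAD(raidPtr->recon_thread,
 *	    rf_ReconThread, req, "raid_recon");
 *
 * Each thread marks its "in progress" flag, does the work at splbio(),
 * clears the flag, and exits via kthread_exit().
 */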
void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}


void
rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}

static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_WAITOK);

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
			    cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
			    M_WAITOK);
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}

RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices.  This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges = 1; dowedges >= 0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		    dv = deviter_next(&di)) {

			/* we are only interested in disks... */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies... */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CD's... */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md's... */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* are we in the wedges pass? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /* No raid partition as yet */

			/* get a vnode for the raw partition of this disk */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists */
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				VOP_UNLOCK(vp);
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /* There is a raid component on this disk */
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /* No raid partitions yet */
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
				    label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /* There is at least one raid partition on this disk */
			}

			/*
			 * If there is no raid component on this disk, either in a
			 * disklabel or inside a wedge, check the raw partition as well,
			 * as it is possible to configure raid components on raw disk
			 * devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
				    label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}


int
rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
{

	if (((clabel->version == RF_COMPONENT_LABEL_VERSION_1) ||
	    (clabel->version == RF_COMPONENT_LABEL_VERSION)) &&
	    ((clabel->clean == RF_RAID_CLEAN) ||
	    (clabel->clean == RF_RAID_DIRTY)) &&
	    clabel->row >= 0 &&
	    clabel->column >= 0 &&
	    clabel->num_rows > 0 &&
	    clabel->num_columns > 0 &&
	    clabel->row < clabel->num_rows &&
	    clabel->column < clabel->num_columns &&
	    clabel->blockSize > 0 &&
	    /*
	     * numBlocksHi may contain garbage, but it is ok since
	     * the type is unsigned.  If it is really garbage,
	     * rf_fix_old_label_size() will fix it.
	     */
	    rf_component_label_numblocks(clabel) > 0) {
		/*
		 * label looks reasonable enough...
		 * let's make sure it has no old garbage.
		 */
		if (numsecs)
			rf_fix_old_label_size(clabel, numsecs);
		return(1);
	}
	return(0);
}


/*
 * For reasons yet unknown, some old component labels have garbage in
 * the newer numBlocksHi region, and this causes lossage.  Since those
 * disks will also have numsecs set to less than 32 bits of sectors,
 * we can determine when this corruption has occurred, and fix it.
 *
 * The exact same problem, with the same unknown reason, happens to
 * the partitionSizeHi member as well.
 */
static void
rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
{

	if (numsecs < ((uint64_t)1 << 32)) {
		if (clabel->numBlocksHi) {
			printf("WARNING: total sectors < 32 bits, yet "
			    "numBlocksHi set\n"
			    "WARNING: resetting numBlocksHi to zero.\n");
			clabel->numBlocksHi = 0;
		}

		if (clabel->partitionSizeHi) {
			printf("WARNING: total sectors < 32 bits, yet "
			    "partitionSizeHi set\n"
			    "WARNING: resetting partitionSizeHi to zero.\n");
			clabel->partitionSizeHi = 0;
		}
	}
}
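/*
 * Example of the repair above, with made-up numbers: a component whose
 * partition holds 0x40000000 sectors (well under 2^32) cannot honestly
 * have a nonzero numBlocksHi, since numBlocks cannot exceed the
 * partition size; any bits found there must be stale garbage from an
 * old label, so they are cleared rather than being believed as
 * 2^32-sector units.
 */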
"Yes" : "No", clabel->status); 3040 printf(" sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n", 3041 clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU); 3042 printf(" RAID Level: %c blocksize: %d numBlocks: %"PRIu64"\n", 3043 (char) clabel->parityConfig, clabel->blockSize, numBlocks); 3044 printf(" Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No"); 3045 printf(" Root partition: %s\n", rp[clabel->root_partition & 3]); 3046 printf(" Last configured as: raid%d\n", clabel->last_unit); 3047 #if 0 3048 printf(" Config order: %d\n", clabel->config_order); 3049 #endif 3050 3051 } 3052 #endif 3053 3054 RF_ConfigSet_t * 3055 rf_create_auto_sets(RF_AutoConfig_t *ac_list) 3056 { 3057 RF_AutoConfig_t *ac; 3058 RF_ConfigSet_t *config_sets; 3059 RF_ConfigSet_t *cset; 3060 RF_AutoConfig_t *ac_next; 3061 3062 3063 config_sets = NULL; 3064 3065 /* Go through the AutoConfig list, and figure out which components 3066 belong to what sets. */ 3067 ac = ac_list; 3068 while(ac!=NULL) { 3069 /* we're going to putz with ac->next, so save it here 3070 for use at the end of the loop */ 3071 ac_next = ac->next; 3072 3073 if (config_sets == NULL) { 3074 /* will need at least this one... */ 3075 config_sets = malloc(sizeof(RF_ConfigSet_t), 3076 M_RAIDFRAME, M_WAITOK); 3077 /* this one is easy :) */ 3078 config_sets->ac = ac; 3079 config_sets->next = NULL; 3080 config_sets->rootable = 0; 3081 ac->next = NULL; 3082 } else { 3083 /* which set does this component fit into? */ 3084 cset = config_sets; 3085 while(cset!=NULL) { 3086 if (rf_does_it_fit(cset, ac)) { 3087 /* looks like it matches... */ 3088 ac->next = cset->ac; 3089 cset->ac = ac; 3090 break; 3091 } 3092 cset = cset->next; 3093 } 3094 if (cset==NULL) { 3095 /* didn't find a match above... new set..*/ 3096 cset = malloc(sizeof(RF_ConfigSet_t), 3097 M_RAIDFRAME, M_WAITOK); 3098 cset->ac = ac; 3099 ac->next = NULL; 3100 cset->next = config_sets; 3101 cset->rootable = 0; 3102 config_sets = cset; 3103 } 3104 } 3105 ac = ac_next; 3106 } 3107 3108 3109 return(config_sets); 3110 } 3111 3112 static int 3113 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac) 3114 { 3115 RF_ComponentLabel_t *clabel1, *clabel2; 3116 3117 /* If this one matches the *first* one in the set, that's good 3118 enough, since the other members of the set would have been 3119 through here too... */ 3120 /* note that we are not checking partitionSize here.. 3121 3122 Note that we are also not checking the mod_counters here. 3123 If everything else matches except the mod_counter, that's 3124 good enough for this test. We will deal with the mod_counters 3125 a little later in the autoconfiguration process. 3126 3127 (clabel1->mod_counter == clabel2->mod_counter) && 3128 3129 The reason we don't check for this is that failed disks 3130 will have lower modification counts. If those disks are 3131 not added to the set they used to belong to, then they will 3132 form their own set, which may result in 2 different sets, 3133 for example, competing to be configured at raid0, and 3134 perhaps competing to be the root filesystem set. If the 3135 wrong ones get configured, or both attempt to become /, 3136 weird behaviour and or serious lossage will occur. Thus we 3137 need to bring them into the fold here, and kick them out at 3138 a later point. 
3139 3140 */ 3141 3142 clabel1 = cset->ac->clabel; 3143 clabel2 = ac->clabel; 3144 if ((clabel1->version == clabel2->version) && 3145 (clabel1->serial_number == clabel2->serial_number) && 3146 (clabel1->num_rows == clabel2->num_rows) && 3147 (clabel1->num_columns == clabel2->num_columns) && 3148 (clabel1->sectPerSU == clabel2->sectPerSU) && 3149 (clabel1->SUsPerPU == clabel2->SUsPerPU) && 3150 (clabel1->SUsPerRU == clabel2->SUsPerRU) && 3151 (clabel1->parityConfig == clabel2->parityConfig) && 3152 (clabel1->maxOutstanding == clabel2->maxOutstanding) && 3153 (clabel1->blockSize == clabel2->blockSize) && 3154 rf_component_label_numblocks(clabel1) == 3155 rf_component_label_numblocks(clabel2) && 3156 (clabel1->autoconfigure == clabel2->autoconfigure) && 3157 (clabel1->root_partition == clabel2->root_partition) && 3158 (clabel1->last_unit == clabel2->last_unit) && 3159 (clabel1->config_order == clabel2->config_order)) { 3160 /* if it get's here, it almost *has* to be a match */ 3161 } else { 3162 /* it's not consistent with somebody in the set.. 3163 punt */ 3164 return(0); 3165 } 3166 /* all was fine.. it must fit... */ 3167 return(1); 3168 } 3169 3170 int 3171 rf_have_enough_components(RF_ConfigSet_t *cset) 3172 { 3173 RF_AutoConfig_t *ac; 3174 RF_AutoConfig_t *auto_config; 3175 RF_ComponentLabel_t *clabel; 3176 int c; 3177 int num_cols; 3178 int num_missing; 3179 int mod_counter; 3180 int mod_counter_found; 3181 int even_pair_failed; 3182 char parity_type; 3183 3184 3185 /* check to see that we have enough 'live' components 3186 of this set. If so, we can configure it if necessary */ 3187 3188 num_cols = cset->ac->clabel->num_columns; 3189 parity_type = cset->ac->clabel->parityConfig; 3190 3191 /* XXX Check for duplicate components!?!?!? */ 3192 3193 /* Determine what the mod_counter is supposed to be for this set. */ 3194 3195 mod_counter_found = 0; 3196 mod_counter = 0; 3197 ac = cset->ac; 3198 while(ac!=NULL) { 3199 if (mod_counter_found==0) { 3200 mod_counter = ac->clabel->mod_counter; 3201 mod_counter_found = 1; 3202 } else { 3203 if (ac->clabel->mod_counter > mod_counter) { 3204 mod_counter = ac->clabel->mod_counter; 3205 } 3206 } 3207 ac = ac->next; 3208 } 3209 3210 num_missing = 0; 3211 auto_config = cset->ac; 3212 3213 even_pair_failed = 0; 3214 for(c=0; c<num_cols; c++) { 3215 ac = auto_config; 3216 while(ac!=NULL) { 3217 if ((ac->clabel->column == c) && 3218 (ac->clabel->mod_counter == mod_counter)) { 3219 /* it's this one... */ 3220 #ifdef DEBUG 3221 printf("Found: %s at %d\n", 3222 ac->devname,c); 3223 #endif 3224 break; 3225 } 3226 ac=ac->next; 3227 } 3228 if (ac==NULL) { 3229 /* Didn't find one here! */ 3230 /* special case for RAID 1, especially 3231 where there are more than 2 3232 components (where RAIDframe treats 3233 things a little differently :( ) */ 3234 if (parity_type == '1') { 3235 if (c%2 == 0) { /* even component */ 3236 even_pair_failed = 1; 3237 } else { /* odd component. If 3238 we're failed, and 3239 so is the even 3240 component, it's 3241 "Good Night, Charlie" */ 3242 if (even_pair_failed == 1) { 3243 return(0); 3244 } 3245 } 3246 } else { 3247 /* normal accounting */ 3248 num_missing++; 3249 } 3250 } 3251 if ((parity_type == '1') && (c%2 == 1)) { 3252 /* Just did an even component, and we didn't 3253 bail.. reset the even_pair_failed flag, 3254 and go on to the next component.... 
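/*
 * Example of the RAID 1 accounting above (hypothetical 4-component
 * mirror, pairs (0,1) and (2,3)): losing components 1 and 2 leaves one
 * live member in each pair, so the set can still be configured; losing
 * components 0 and 1 kills both halves of the first pair, and the set
 * is rejected.
 */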
void
rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
    RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int i;

	clabel = ac->clabel;

	/* 1. Fill in the common stuff */
	config->numCol = clabel->num_columns;
	config->numSpare = 0;	/* XXX should this be set here? */
	config->sectPerSU = clabel->sectPerSU;
	config->SUsPerPU = clabel->SUsPerPU;
	config->SUsPerRU = clabel->SUsPerRU;
	config->parityConfig = clabel->parityConfig;
	/* XXX... */
	strcpy(config->diskQueueType, "fifo");
	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
	config->layoutSpecificSize = 0;	/* XXX ?? */

	while (ac != NULL) {
		/* row/col values will be in range due to the checks
		   in reasonable_label() */
		strcpy(config->devnames[0][ac->clabel->column],
		    ac->devname);
		ac = ac->next;
	}

	for (i = 0; i < RF_MAXDBGV; i++) {
		config->debugVars[i][0] = 0;
	}
}

int
rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
{
	RF_ComponentLabel_t *clabel;
	int column;
	int sparecol;

	raidPtr->autoconfigure = new_value;

	for (column = 0; column < raidPtr->numCol; column++) {
		if (raidPtr->Disks[column].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, column);
			clabel->autoconfigure = new_value;
			raidflush_component_label(raidPtr, column);
		}
	}
	for (column = 0; column < raidPtr->numSpare; column++) {
		sparecol = raidPtr->numCol + column;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			clabel = raidget_component_label(raidPtr, sparecol);
			clabel->autoconfigure = new_value;
			raidflush_component_label(raidPtr, sparecol);
		}
	}
	return(new_value);
}

int
rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
{
	RF_ComponentLabel_t *clabel;
	int column;
	int sparecol;

	raidPtr->root_partition = new_value;
	for (column = 0; column < raidPtr->numCol; column++) {
		if (raidPtr->Disks[column].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, column);
			clabel->root_partition = new_value;
			raidflush_component_label(raidPtr, column);
		}
	}
	for (column = 0; column < raidPtr->numSpare; column++) {
		sparecol = raidPtr->numCol + column;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			clabel = raidget_component_label(raidPtr, sparecol);
			clabel->root_partition = new_value;
			raidflush_component_label(raidPtr, sparecol);
		}
	}
	return(new_value);
}

void
rf_release_all_vps(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;

	ac = cset->ac;
	while (ac != NULL) {
		/* Close the vp, and give it back */
		if (ac->vp) {
			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
			vput(ac->vp);
			ac->vp = NULL;
		}
		ac = ac->next;
	}
}


void
rf_cleanup_config_set(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *next_ac;

	ac = cset->ac;
	while (ac != NULL) {
		next_ac = ac->next;
		/* nuke the label */
		free(ac->clabel, M_RAIDFRAME);
		/* cleanup the config structure */
		free(ac, M_RAIDFRAME);
		/* "next.." */
		ac = next_ac;
	}
	/* and, finally, nuke the config set */
	free(cset, M_RAIDFRAME);
}


void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY;	/* not clean */
	clabel->status = rf_ds_optimal;	/* "It's good!" */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}

struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_WAITOK|M_ZERO);

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	 */

	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	    sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n", raidID);
#endif

	if (sc == NULL)
		sc = raidget(raidID, true);
	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1;	/* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}

void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
    size_t xmin, size_t xmax)
{
	int error;

	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	if ((error = pool_prime(p, xmin)) != 0)
		panic("%s: failed to prime pool: %d", __func__, error);
	pool_setlowat(p, xmin);
}

/*
 * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
 * to see if there is IO pending and if that IO could possibly be done
 * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
 * otherwise.
 *
 */
int
rf_buf_queue_check(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 1;

	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
		/* there is work to do */
		return 0;
	}
	/* default is nothing to do */
	return 1;
}

int
rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
{
	uint64_t numsecs;
	unsigned secsize;
	int error;

	error = getdisksize(vp, &numsecs, &secsize);
	if (error == 0) {
		diskPtr->blockSize = secsize;
		diskPtr->numBlocks = numsecs - rf_protectedSectors;
		diskPtr->partitionSize = numsecs;
		return 0;
	}
	return error;
}

static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}

static void
raid_attach(device_t parent, device_t self, void *aux)
{
}


static int
raid_detach(device_t self, int flags)
{
	int error;
	struct raid_softc *rs = raidsoftc(self);

	if (rs == NULL)
		return ENXIO;

	if ((error = raidlock(rs)) != 0)
		return (error);

	error = raid_detach_unlocked(rs);

	raidunlock(rs);

	/* XXX raid can be referenced here */

	if (error)
		return error;

	/* Free the softc */
	raidput(rs);

	return 0;
}

static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL);
}
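/*
 * The geometry above is synthetic; as a worked example with
 * hypothetical numbers, a 3-column RAID 5 set with 32 sectors per
 * stripe unit has dataSectorsPerStripe == (3 - 1) * 32 == 64, so the
 * fake disk advertises 64 "sectors per track" and 4 * 3 == 12
 * "tracks"; disk_set_info() then fills in the remaining geometry from
 * dg_secperunit.
 */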
3634 */ 3635 if (!RF_DEAD_DISK(raidPtr->Disks[c].status) 3636 || raidPtr->Disks[c].status == rf_ds_reconstructing) { 3637 error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, 3638 DIOCGCACHE, &dkpart, FREAD, NOCRED); 3639 if (error) { 3640 if (error != ENODEV) { 3641 printf("raid%d: get cache for component %s failed\n", 3642 raidPtr->raidid, 3643 raidPtr->Disks[c].devname); 3644 } 3645 3646 return error; 3647 } 3648 3649 if (c == 0) 3650 dkwhole = dkpart; 3651 else 3652 dkwhole = DKCACHE_COMBINE(dkwhole, dkpart); 3653 } 3654 } 3655 3656 *data = dkwhole; 3657 3658 return 0; 3659 } 3660 3661 /* 3662 * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components. 3663 * We end up returning whatever error was returned by the first cache flush 3664 * that fails. 3665 */ 3666 3667 int 3668 rf_sync_component_caches(RF_Raid_t *raidPtr) 3669 { 3670 int c, sparecol; 3671 int e,error; 3672 int force = 1; 3673 3674 error = 0; 3675 for (c = 0; c < raidPtr->numCol; c++) { 3676 if (raidPtr->Disks[c].status == rf_ds_optimal) { 3677 e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC, 3678 &force, FWRITE, NOCRED); 3679 if (e) { 3680 if (e != ENODEV) 3681 printf("raid%d: cache flush to component %s failed.\n", 3682 raidPtr->raidid, raidPtr->Disks[c].devname); 3683 if (error == 0) { 3684 error = e; 3685 } 3686 } 3687 } 3688 } 3689 3690 for( c = 0; c < raidPtr->numSpare ; c++) { 3691 sparecol = raidPtr->numCol + c; 3692 /* Need to ensure that the reconstruct actually completed! */ 3693 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) { 3694 e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp, 3695 DIOCCACHESYNC, &force, FWRITE, NOCRED); 3696 if (e) { 3697 if (e != ENODEV) 3698 printf("raid%d: cache flush to component %s failed.\n", 3699 raidPtr->raidid, raidPtr->Disks[sparecol].devname); 3700 if (error == 0) { 3701 error = e; 3702 } 3703 } 3704 } 3705 } 3706 return error; 3707 } 3708 3709 /* Fill in info with the current status */ 3710 void 3711 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info) 3712 { 3713 3714 if (raidPtr->status != rf_rs_reconstructing) { 3715 info->total = 100; 3716 info->completed = 100; 3717 } else { 3718 info->total = raidPtr->reconControl->numRUsTotal; 3719 info->completed = raidPtr->reconControl->numRUsComplete; 3720 } 3721 info->remaining = info->total - info->completed; 3722 } 3723 3724 /* Fill in info with the current status */ 3725 void 3726 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info) 3727 { 3728 3729 if (raidPtr->parity_rewrite_in_progress == 1) { 3730 info->total = raidPtr->Layout.numStripe; 3731 info->completed = raidPtr->parity_rewrite_stripes_done; 3732 } else { 3733 info->completed = 100; 3734 info->total = 100; 3735 } 3736 info->remaining = info->total - info->completed; 3737 } 3738 3739 /* Fill in info with the current status */ 3740 void 3741 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info) 3742 { 3743 3744 if (raidPtr->copyback_in_progress == 1) { 3745 info->total = raidPtr->Layout.numStripe; 3746 info->completed = raidPtr->copyback_stripes_done; 3747 info->remaining = info->total - info->completed; 3748 } else { 3749 info->remaining = 0; 3750 info->completed = 100; 3751 info->total = 100; 3752 } 3753 } 3754 3755 /* Fill in config with the current info */ 3756 int 3757 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config) 3758 { 3759 int d, i, j; 3760 3761 if (!raidPtr->valid) 3762 return (ENODEV); 3763 config->cols = raidPtr->numCol; 3764 config->ndevs = 
/* Fill in info with the current status */
void
rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
{

	if (raidPtr->status != rf_rs_reconstructing) {
		info->total = 100;
		info->completed = 100;
	} else {
		info->total = raidPtr->reconControl->numRUsTotal;
		info->completed = raidPtr->reconControl->numRUsComplete;
	}
	info->remaining = info->total - info->completed;
}

/* Fill in info with the current status */
void
rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
{

	if (raidPtr->parity_rewrite_in_progress == 1) {
		info->total = raidPtr->Layout.numStripe;
		info->completed = raidPtr->parity_rewrite_stripes_done;
	} else {
		info->completed = 100;
		info->total = 100;
	}
	info->remaining = info->total - info->completed;
}

/* Fill in info with the current status */
void
rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
{

	if (raidPtr->copyback_in_progress == 1) {
		info->total = raidPtr->Layout.numStripe;
		info->completed = raidPtr->copyback_stripes_done;
		info->remaining = info->total - info->completed;
	} else {
		info->remaining = 0;
		info->completed = 100;
		info->total = 100;
	}
}

/* Fill in config with the current info */
int
rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
{
	int d, i, j;

	if (!raidPtr->valid)
		return (ENODEV);
	config->cols = raidPtr->numCol;
	config->ndevs = raidPtr->numCol;
	if (config->ndevs >= RF_MAX_DISKS)
		return (ENOMEM);
	config->nspares = raidPtr->numSpare;
	if (config->nspares >= RF_MAX_DISKS)
		return (ENOMEM);
	config->maxqdepth = raidPtr->maxQueueDepth;
	d = 0;
	for (j = 0; j < config->cols; j++) {
		config->devs[d] = raidPtr->Disks[j];
		d++;
	}
	for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
		config->spares[i] = raidPtr->Disks[j];
		if (config->spares[i].status == rf_ds_rebuilding_spare) {
			/* XXX: raidctl(8) expects to see this as a used spare */
			config->spares[i].status = rf_ds_used_spare;
		}
	}
	return 0;
}

int
rf_get_component_label(RF_Raid_t *raidPtr, void *data)
{
	RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
	RF_ComponentLabel_t *raid_clabel;
	int column = clabel->column;

	if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
		return EINVAL;
	raid_clabel = raidget_component_label(raidPtr, column);
	memcpy(clabel, raid_clabel, sizeof *clabel);

	return 0;
}

/*
 * Module interface
 */

MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");

#ifdef _MODULE
CFDRIVER_DECL(raid, DV_DISK, NULL);
#endif

static int raid_modcmd(modcmd_t, void *);
static int raid_modcmd_init(void);
static int raid_modcmd_fini(void);

static int
raid_modcmd(modcmd_t cmd, void *data)
{
	int error;

	error = 0;
	switch (cmd) {
	case MODULE_CMD_INIT:
		error = raid_modcmd_init();
		break;
	case MODULE_CMD_FINI:
		error = raid_modcmd_fini();
		break;
	default:
		error = ENOTTY;
		break;
	}
	return error;
}

static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}

static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist.  */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n", __func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n", __func__);
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n", __func__);
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}