/*	$NetBSD: rf_netbsdkintf.c,v 1.389 2020/08/25 13:50:00 skrll Exp $	*/

/*-
 * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Greg Oster; Jason R. Thorpe.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: cd.c 1.6 90/11/28$
 *
 *	@(#)cd.c	8.2 (Berkeley) 11/16/93
 */

/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Mark Holland, Jim Zelenka
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/***********************************************************
 *
 * rf_kintf.c -- the kernel interface routines for RAIDframe
 *
 ***********************************************************/

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.389 2020/08/25 13:50:00 skrll Exp $");

#ifdef _KERNEL_OPT
#include "opt_raid_autoconfig.h"
#include "opt_compat_netbsd32.h"
#endif

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/disk.h>
#include <sys/device.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/disklabel.h>
#include <sys/conf.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/reboot.h>
#include <sys/kauth.h>
#include <sys/module.h>
#include <sys/compat_stub.h>

#include <prop/proplib.h>

#include <dev/raidframe/raidframevar.h>
#include <dev/raidframe/raidframeio.h>
#include <dev/raidframe/rf_paritymap.h>

#include "rf_raid.h"
#include "rf_copyback.h"
#include "rf_dag.h"
#include "rf_dagflags.h"
#include "rf_desc.h"
#include "rf_diskqueue.h"
#include "rf_etimer.h"
#include "rf_general.h"
#include "rf_kintf.h"
#include "rf_options.h"
#include "rf_driver.h"
#include "rf_parityscan.h"
#include "rf_threadstuff.h"

#include "ioconf.h"

#ifdef DEBUG
int	rf_kdebug_level = 0;
#define db1_printf(a) if (rf_kdebug_level > 0) printf a
#else				/* DEBUG */
#define db1_printf(a) { }
#endif				/* DEBUG */

#ifdef DEBUG_ROOT
#define DPRINTF(a, ...) printf(a, __VA_ARGS__)
#else
#define DPRINTF(a, ...)
#endif

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
static rf_declare_mutex2(rf_sparet_wait_mutex);
static rf_declare_cond2(rf_sparet_wait_cv);
static rf_declare_cond2(rf_sparet_resp_cv);

static RF_SparetWait_t *rf_sparet_wait_queue;	/* requests to install a
						 * spare table */
static RF_SparetWait_t *rf_sparet_resp_queue;	/* responses from
						 * installation process */
#endif

const int rf_b_pass = (B_PHYS|B_RAW|B_MEDIA_FLAGS);

MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures");

/* prototypes */
static void KernelWakeupFunc(struct buf *);
static void InitBP(struct buf *, struct vnode *, unsigned,
    dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *),
    void *, int);
static void raidinit(struct raid_softc *);
static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp);
static int rf_get_component_caches(RF_Raid_t *raidPtr, int *);

static int raid_match(device_t, cfdata_t, void *);
static void raid_attach(device_t, device_t, void *);
static int raid_detach(device_t, int);

static int raidread_component_area(dev_t, struct vnode *, void *, size_t,
    daddr_t, daddr_t);
static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t,
    daddr_t, daddr_t, int);

static int raidwrite_component_label(unsigned,
    dev_t, struct vnode *, RF_ComponentLabel_t *);
static int raidread_component_label(unsigned,
    dev_t, struct vnode *, RF_ComponentLabel_t *);

static int raid_diskstart(device_t, struct buf *bp);
static int raid_dumpblocks(device_t, void *, daddr_t, int);
static int raid_lastclose(device_t);

static dev_type_open(raidopen);
static dev_type_close(raidclose);
static dev_type_read(raidread);
static dev_type_write(raidwrite);
static dev_type_ioctl(raidioctl);
static dev_type_strategy(raidstrategy);
static dev_type_dump(raiddump);
static dev_type_size(raidsize);

const struct bdevsw raid_bdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_ioctl = raidioctl,
	.d_dump = raiddump,
	.d_psize = raidsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

const struct cdevsw raid_cdevsw = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_read = raidread,
	.d_write = raidwrite,
	.d_ioctl = raidioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

static struct dkdriver rf_dkdriver = {
	.d_open = raidopen,
	.d_close = raidclose,
	.d_strategy = raidstrategy,
	.d_diskstart = raid_diskstart,
	.d_dumpblocks = raid_dumpblocks,
	.d_lastclose = raid_lastclose,
	.d_minphys = minphys
};

#define	raidunit(x)	DISKUNIT(x)
#define	raidsoftc(dev)	(((struct raid_softc *)device_private(dev))->sc_r.softc)

extern struct cfdriver raid_cd;
CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    DVF_DETACH_SHUTDOWN);

/* Internal representation of a rf_recon_req */
struct rf_recon_req_internal {
	RF_RowCol_t col;
	RF_ReconReqFlags_t flags;
	void   *raidPtr;
};

/*
 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
 * Be aware that large numbers can allow the driver to consume a lot of
 * kernel memory, especially on writes, and in degraded mode reads.
 *
 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
 * a single 64K write will typically require 64K for the old data,
 * 64K for the old parity, and 64K for the new parity, for a total
 * of 192K (if the parity buffer is not re-used immediately).
 * Even if it is used immediately, that's still 128K, which when multiplied
 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
 *
 * Now in degraded mode, for example, a 64K read on the above setup may
 * require data reconstruction, which will require *all* of the 4 remaining
 * disks to participate -- 4 * 32K/disk == 128K again.
 */

#ifndef RAIDOUTSTANDING
#define RAIDOUTSTANDING   6
#endif
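
/*
 * A rough rule of thumb following from the example above (a sketch, not
 * a hard bound, assuming parity buffers are not reused):
 *
 *	peak kernel memory ~= RAIDOUTSTANDING * 3 * write_size
 *	                      + RAIDOUTSTANDING * write_size (incoming data)
 *
 * i.e. with the default of 6 outstanding 64K writes, roughly
 * 6 * 192K + 6 * 64K = 1536K.  Raising RAIDOUTSTANDING trades kernel
 * memory for concurrency.
 */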

#define RAIDLABELDEV(dev)	\
	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))

/* declared here, and made public, for the benefit of KVM stuff.. */

static int raidlock(struct raid_softc *);
static void raidunlock(struct raid_softc *);

static int raid_detach_unlocked(struct raid_softc *);

static void rf_markalldirty(RF_Raid_t *);
static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);

void rf_ReconThread(struct rf_recon_req_internal *);
void rf_RewriteParityThread(RF_Raid_t *raidPtr);
void rf_CopybackThread(RF_Raid_t *raidPtr);
void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
int rf_autoconfig(device_t);
void rf_buildroothack(RF_ConfigSet_t *);

RF_AutoConfig_t *rf_find_raid_components(void);
RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
static int rf_does_it_fit(RF_ConfigSet_t *, RF_AutoConfig_t *);
int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
void rf_create_configuration(RF_AutoConfig_t *, RF_Config_t *, RF_Raid_t *);
int rf_set_autoconfig(RF_Raid_t *, int);
int rf_set_rootpartition(RF_Raid_t *, int);
void rf_release_all_vps(RF_ConfigSet_t *);
void rf_cleanup_config_set(RF_ConfigSet_t *);
int rf_have_enough_components(RF_ConfigSet_t *);
struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);

/*
 * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
 * Note that this is overridden by having RAID_AUTOCONFIG as an option
 * in the kernel config file.
 */
#ifdef RAID_AUTOCONFIG
int raidautoconfig = 1;
#else
int raidautoconfig = 0;
#endif
static bool raidautoconfigdone = false;

struct RF_Pools_s rf_pools;

static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids);
static kmutex_t raid_lock;

static struct raid_softc *
raidcreate(int unit) {
	struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP);
	sc->sc_unit = unit;
	cv_init(&sc->sc_cv, "raidunit");
	mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE);
	return sc;
}

static void
raiddestroy(struct raid_softc *sc) {
	cv_destroy(&sc->sc_cv);
	mutex_destroy(&sc->sc_mutex);
	kmem_free(sc, sizeof(*sc));
}

static struct raid_softc *
raidget(int unit, bool create) {
	struct raid_softc *sc;
	if (unit < 0) {
#ifdef DIAGNOSTIC
		panic("%s: unit %d!", __func__, unit);
#endif
		return NULL;
	}
	mutex_enter(&raid_lock);
	LIST_FOREACH(sc, &raids, sc_link) {
		if (sc->sc_unit == unit) {
			mutex_exit(&raid_lock);
			return sc;
		}
	}
	mutex_exit(&raid_lock);
	if (!create)
		return NULL;
	sc = raidcreate(unit);
	mutex_enter(&raid_lock);
	LIST_INSERT_HEAD(&raids, sc, sc_link);
	mutex_exit(&raid_lock);
	return sc;
}

static void
raidput(struct raid_softc *sc) {
	mutex_enter(&raid_lock);
	LIST_REMOVE(sc, sc_link);
	mutex_exit(&raid_lock);
	raiddestroy(sc);
}

void
raidattach(int num)
{

	/*
	 * Device attachment and associated initialization now occurs
	 * as part of the module initialization.
	 */
}

int
rf_autoconfig(device_t self)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets;

	if (!raidautoconfig || raidautoconfigdone == true)
		return 0;

	/* XXX This code can only be run once. */
	raidautoconfigdone = true;

#ifdef __HAVE_CPU_BOOTCONF
	/*
	 * 0. find the boot device if needed first so we can use it later
	 * this needs to be done before we autoconfigure any raid sets,
	 * because if we use wedges we are not going to be able to open
	 * the boot device later
	 */
	if (booted_device == NULL)
		cpu_bootconf();
#endif
	/* 1. locate all RAID components on the system */
	aprint_debug("Searching for RAID components...\n");
	ac_list = rf_find_raid_components();

	/* 2. Sort them into their respective sets. */
	config_sets = rf_create_auto_sets(ac_list);

	/*
	 * 3. Evaluate each set and configure the valid ones.
	 * This gets done in rf_buildroothack().
	 */
	rf_buildroothack(config_sets);

	return 1;
}

int
rf_inited(const struct raid_softc *rs) {
	return (rs->sc_flags & RAIDF_INITED) != 0;
}

RF_Raid_t *
rf_get_raid(struct raid_softc *rs) {
	return &rs->sc_r;
}

int
rf_get_unit(const struct raid_softc *rs) {
	return rs->sc_unit;
}
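
/*
 * Typical lookup pattern for the helpers above (an illustrative sketch;
 * do_something() is a hypothetical caller, not part of this driver):
 *
 *	struct raid_softc *rs = raidget(unit, false);
 *	if (rs != NULL && rf_inited(rs))
 *		do_something(rf_get_raid(rs), rf_get_unit(rs));
 *
 * raidget(unit, true) creates the unit on demand under raid_lock;
 * raidput() unlinks the unit from the global list and frees it.
 */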

static int
rf_containsboot(RF_Raid_t *r, device_t bdv) {
	const char *bootname;
	size_t len;

	/* if bdv is NULL, the set can't contain it.  exit early. */
	if (bdv == NULL)
		return 0;

	bootname = device_xname(bdv);
	len = strlen(bootname);

	for (int col = 0; col < r->numCol; col++) {
		const char *devname = r->Disks[col].devname;
		devname += sizeof("/dev/") - 1;
		if (strncmp(devname, "dk", 2) == 0) {
			const char *parent =
			    dkwedge_get_parent_name(r->Disks[col].dev);
			if (parent != NULL)
				devname = parent;
		}
		if (strncmp(devname, bootname, len) == 0) {
			struct raid_softc *sc = r->softc;
			aprint_debug("raid%d includes boot device %s\n",
			    sc->sc_unit, devname);
			return 1;
		}
	}
	return 0;
}

void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int num_root;
	struct raid_softc *sc, *rsc;
	struct dk_softc *dksc = NULL;	/* XXX gcc -Os: may be used uninit. */

	sc = rsc = NULL;
	num_root = 0;
	cset = config_sets;
	while (cset != NULL) {
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure == 1) {
			sc = rf_auto_config_set(cset);
			if (sc != NULL) {
				aprint_debug("raid%d: configured ok, rootable %d\n",
				    sc->sc_unit, cset->rootable);
				if (cset->rootable) {
					rsc = sc;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed\n");
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL) {
		DPRINTF("%s: rootspec %s\n", __func__, rootspec);
		return;
	}

	/* we found something bootable... */

	/*
	 * XXX: The following code assumes that the root raid
	 * is the first ('a') partition.  This is about the best
	 * we can do with a BSD disklabel, but we might be able
	 * to do better with a GPT label, by setting a specified
	 * attribute to indicate the root partition.  We can then
	 * stash the partition number in the r->root_partition
	 * high bits (the bottom 2 bits are already used).  For
	 * now we just set booted_partition to 0 when we override
	 * root.
	 */
	if (num_root == 1) {
		device_t candidate_root;
		dksc = &rsc->sc_dksc;
		if (dksc->sc_dkdev.dk_nwedges != 0) {
			char cname[sizeof(cset->ac->devname)];
			/* XXX: assume partition 'a' first */
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dksc->sc_dev), 'a');
			candidate_root = dkwedge_find_by_wname(cname);
			DPRINTF("%s: candidate wedge root=%s\n", __func__,
			    cname);
			if (candidate_root == NULL) {
				/*
				 * If that is not found, because we don't use
				 * disklabel, return the first dk child
				 * XXX: we can skip the 'a' check above
				 * and always do this...
				 */
				size_t i = 0;
				candidate_root = dkwedge_find_by_parent(
				    device_xname(dksc->sc_dev), &i);
			}
			DPRINTF("%s: candidate wedge root=%p\n", __func__,
			    candidate_root);
		} else
			candidate_root = dksc->sc_dev;
		DPRINTF("%s: candidate root=%p\n", __func__, candidate_root);
		DPRINTF("%s: booted_device=%p root_partition=%d "
		    "contains_boot=%d",
		    __func__, booted_device, rsc->sc_r.root_partition,
		    rf_containsboot(&rsc->sc_r, booted_device));
		/* XXX the check for booted_device == NULL can probably be
		 * dropped, now that rf_containsboot handles that case.
		 */
		if (booted_device == NULL ||
		    rsc->sc_r.root_partition == 1 ||
		    rf_containsboot(&rsc->sc_r, booted_device)) {
			booted_device = candidate_root;
			booted_method = "raidframe/single";
			booted_partition = 0;	/* XXX assume 'a' */
		}
	} else if (num_root > 1) {
		DPRINTF("%s: many roots=%d, %p\n", __func__, num_root,
		    booted_device);

		/*
		 * Maybe the MD code can help.  If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */
		if (booted_device == NULL)
			return;

		num_root = 0;
		mutex_enter(&raid_lock);
		LIST_FOREACH(sc, &raids, sc_link) {
			RF_Raid_t *r = &sc->sc_r;
			if (r->valid == 0)
				continue;

			if (r->root_partition == 0)
				continue;

			if (rf_containsboot(r, booted_device)) {
				num_root++;
				rsc = sc;
				dksc = &rsc->sc_dksc;
			}
		}
		mutex_exit(&raid_lock);

		if (num_root == 1) {
			booted_device = dksc->sc_dev;
			booted_method = "raidframe/multi";
			booted_partition = 0;	/* XXX assume 'a' */
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}

static int
raidsize(dev_t dev)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;
	unsigned int unit;

	unit = raidunit(dev);
	if ((rs = raidget(unit, false)) == NULL)
		return -1;
	dksc = &rs->sc_dksc;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return -1;

	return dk_size(dksc, dev);
}

static int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	unsigned int unit;
	struct raid_softc *rs;
	struct dk_softc *dksc;

	unit = raidunit(dev);
	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENODEV;

	/*
	 * Note that blkno is relative to this particular partition.
	 * By adding RF_PROTECTED_SECTORS, we get a value that
	 * is relative to the partition used for the underlying component.
	 */
	blkno += RF_PROTECTED_SECTORS;

	return dk_dump(dksc, dev, blkno, va, size, DK_DUMP_RECURSIVE);
}
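
/*
 * Note: dk_dump() above resolves the dump partition and then calls back
 * into raid_dumpblocks() below via rf_dkdriver.d_dumpblocks (wired up
 * earlier in this file); by that point the block numbers are absolute
 * within the RAID set rather than partition-relative.
 */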

static int
raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk)
{
	struct raid_softc *rs = raidsoftc(dev);
	const struct bdevsw *bdev;
	RF_Raid_t *raidPtr;
	int c, sparecol, j, scol, dumpto;
	int error = 0;

	raidPtr = &rs->sc_r;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;

	if ((error = raidlock(rs)) != 0)
		return error;

	/* figure out what device is alive.. */

	/*
	 * Look for a component to dump to.  The preference for the
	 * component to dump to is as follows:
	 * 1) the first component
	 * 2) a used_spare of the first component
	 * 3) the second component
	 * 4) a used_spare of the second component
	 */

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	 * At this point we have possibly selected a live component.
	 * If we didn't find a live component, we now check to see
	 * if there is a relevant spared component.
	 */

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/* How about this one? */
			scol = -1;
			for (j = 0; j < raidPtr->numCol; j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				 * We must have found a spared first
				 * component!  We'll take that over
				 * anything else found so far.  (We
				 * couldn't have found a real first
				 * component before, since this is a
				 * used spare, and it's saying that
				 * it's replacing the first
				 * component.)  On reboot (with
				 * autoconfiguration turned on)
				 * sparecol will become the first
				 * component (component0) of this set.
				 */
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				 * Must be a spared second component.
				 * We'll dump to that if we haven't found
				 * anything else so far.
				 */
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!? */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);
	if (bdev == NULL) {
		error = ENXIO;
		goto out;
	}

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
	    blkno, va, nblk * raidPtr->bytesPerSector);

out:
	raidunlock(rs);

	return error;
}

/* ARGSUSED */
static int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	int error = 0;
	int part, pmask;

	if ((rs = raidget(unit, true)) == NULL)
		return ENXIO;
	if ((error = raidlock(rs)) != 0)
		return error;

	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	dksc = &rs->sc_dksc;

	part = DISKPART(dev);
	pmask = (1 << part);

	if (!DK_BUSY(dksc, pmask) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		   have done a configure before this.  I DO NOT WANT TO BE
		   SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		   THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(&rs->sc_r);
	}

	if ((rs->sc_flags & RAIDF_INITED) != 0)
		error = dk_open(dksc, dev, flags, fmt, l);

bad:
	raidunlock(rs);

	return error;
}

static int
raid_lastclose(device_t self)
{
	struct raid_softc *rs = raidsoftc(self);

	/*
	 * Last one... device is not unconfigured yet.
	 * Device shutdown has taken care of setting the
	 * clean bits if RAIDF_INITED is not set;
	 * mark things as clean...
	 */

	rf_update_component_labels(&rs->sc_r,
	    RF_FINAL_COMPONENT_UPDATE);

	/* pass to unlocked code */
	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		rs->sc_flags |= RAIDF_DETACH;

	return 0;
}

/* ARGSUSED */
static int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	struct dk_softc *dksc;
	cfdata_t cf;
	int error = 0, do_detach = 0, do_put = 0;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;
	dksc = &rs->sc_dksc;

	if ((error = raidlock(rs)) != 0)
		return error;

	if ((rs->sc_flags & RAIDF_INITED) != 0) {
		error = dk_close(dksc, dev, flags, fmt, l);
		if ((rs->sc_flags & RAIDF_DETACH) != 0)
			do_detach = 1;
	} else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0)
		do_put = 1;

	raidunlock(rs);

	if (do_detach) {
		/* free the pseudo device attach bits */
		cf = device_cfdata(dksc->sc_dev);
		error = config_detach(dksc->sc_dev, 0);
		if (error == 0)
			free(cf, M_RAIDFRAME);
	} else if (do_put) {
		raidput(rs);
	}

	return error;
}

static void
raid_wakeup(RF_Raid_t *raidPtr)
{
	rf_lock_mutex2(raidPtr->iodone_lock);
	rf_signal_cond2(raidPtr->iodone_cv);
	rf_unlock_mutex2(raidPtr->iodone_lock);
}

static void
raidstrategy(struct buf *bp)
{
	unsigned int unit;
	struct raid_softc *rs;
	struct dk_softc *dksc;
	RF_Raid_t *raidPtr;

	unit = raidunit(bp->b_dev);
	if ((rs = raidget(unit, false)) == NULL) {
		bp->b_error = ENXIO;
		goto fail;
	}
	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		bp->b_error = ENXIO;
		goto fail;
	}
	dksc = &rs->sc_dksc;
	raidPtr = &rs->sc_r;

	/* Queue IO only */
	if (dk_strategy_defer(dksc, bp))
		goto done;

	/* schedule the IO to happen at the next convenient time */
	raid_wakeup(raidPtr);

done:
	return;

fail:
	bp->b_resid = bp->b_bcount;
	biodone(bp);
}

static int
raid_diskstart(device_t dev, struct buf *bp)
{
	struct raid_softc *rs = raidsoftc(dev);
	RF_Raid_t *raidPtr;

	raidPtr = &rs->sc_r;
	if (!raidPtr->valid) {
		db1_printf(("raid is not valid..\n"));
		return ENODEV;
	}

	/* XXX */
	bp->b_resid = 0;

	return raiddoaccess(raidPtr, bp);
}

void
raiddone(RF_Raid_t *raidPtr, struct buf *bp)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;

	dk_done(dksc, bp);

	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings++;
	rf_unlock_mutex2(raidPtr->mutex);

	/* schedule more IO */
	raid_wakeup(raidPtr);
}

/* ARGSUSED */
static int
raidread(dev_t dev, struct uio *uio, int flags)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENXIO;

	return physio(raidstrategy, NULL, dev, B_READ, minphys, uio);
}

/* ARGSUSED */
static int
raidwrite(dev_t dev, struct uio *uio, int flags)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENXIO;

	return physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio);
}
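
/*
 * I/O path summary (descriptive only): raidread()/raidwrite() go through
 * physio() to raidstrategy(), which queues the buf via dk_strategy_defer().
 * dk_start() later calls raid_diskstart(), which hands the buf to
 * raiddoaccess()/rf_DoAccess().  On completion, raiddone() returns the
 * buf through dk_done(), releases one of the RAIDOUTSTANDING "openings",
 * and wakes the raidio thread to schedule more work.
 */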

static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}

static bool
rf_must_be_initialized(const struct raid_softc *rs, u_long cmd)
{
	switch (cmd) {
	case RAIDFRAME_ADD_HOT_SPARE:
	case RAIDFRAME_CHECK_COPYBACK_STATUS:
	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
	case RAIDFRAME_CHECK_PARITY:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
	case RAIDFRAME_CHECK_RECON_STATUS:
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
	case RAIDFRAME_COPYBACK:
	case RAIDFRAME_DELETE_COMPONENT:
	case RAIDFRAME_FAIL_DISK:
	case RAIDFRAME_GET_ACCTOTALS:
	case RAIDFRAME_GET_COMPONENT_LABEL:
	case RAIDFRAME_GET_INFO:
	case RAIDFRAME_GET_SIZE:
	case RAIDFRAME_INCORPORATE_HOT_SPARE:
	case RAIDFRAME_INIT_LABELS:
	case RAIDFRAME_KEEP_ACCTOTALS:
	case RAIDFRAME_PARITYMAP_GET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_PARAMS:
	case RAIDFRAME_PARITYMAP_STATUS:
	case RAIDFRAME_REBUILD_IN_PLACE:
	case RAIDFRAME_REMOVE_HOT_SPARE:
	case RAIDFRAME_RESET_ACCTOTALS:
	case RAIDFRAME_REWRITEPARITY:
	case RAIDFRAME_SET_AUTOCONFIG:
	case RAIDFRAME_SET_COMPONENT_LABEL:
	case RAIDFRAME_SET_ROOT:
		return (rs->sc_flags & RAIDF_INITED) == 0;
	}
	return false;
}

int
rf_fail_disk(RF_Raid_t *raidPtr, struct rf_recon_req *rr)
{
	struct rf_recon_req_internal *rrint;

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (rr->col < 0 || rr->col >= raidPtr->numCol) {
		/* bad column */
		return EINVAL;
	}

	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->status == rf_rs_reconstructing) {
		/* you can't fail a disk while we're reconstructing! */
		/* XXX wrong for RAID6 */
		goto out;
	}
	if ((raidPtr->Disks[rr->col].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* some other component has failed.  Let's not make
		   things worse. XXX wrong for RAID6 */
		goto out;
	}
	if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
		/* Can't fail a spared disk! */
		goto out;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* make a copy of the recon request so that we don't rely on
	 * the user's buffer */
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return ENOMEM;
	rrint->col = rr->col;
	rrint->flags = rr->flags;
	rrint->raidPtr = raidPtr;

	return RF_CREATE_THREAD(raidPtr->recon_thread, rf_ReconThread,
	    rrint, "raid_recon");
out:
	rf_unlock_mutex2(raidPtr->mutex);
	return EINVAL;
}

static int
rf_copyinspecificbuf(RF_Config_t *k_cfg)
{
	/* allocate a buffer for the layout-specific data, and copy it in */
	if (k_cfg->layoutSpecificSize == 0)
		return 0;

	if (k_cfg->layoutSpecificSize > 10000) {
		/* sanity check */
		return EINVAL;
	}

	u_char *specific_buf;
	specific_buf = RF_Malloc(k_cfg->layoutSpecificSize);
	if (specific_buf == NULL)
		return ENOMEM;

	int retcode = copyin(k_cfg->layoutSpecific, specific_buf,
	    k_cfg->layoutSpecificSize);
	if (retcode) {
		RF_Free(specific_buf, k_cfg->layoutSpecificSize);
		db1_printf(("%s: retcode=%d copyin.2\n", __func__, retcode));
		return retcode;
	}

	k_cfg->layoutSpecific = specific_buf;
	return 0;
}

static int
rf_getConfiguration(struct raid_softc *rs, void *data, RF_Config_t **k_cfg)
{
	RF_Config_t *u_cfg = *((RF_Config_t **) data);

	if (rs->sc_r.valid) {
		/* There is a valid RAID set running on this unit! */
		printf("raid%d: Device already configured!\n", rs->sc_unit);
		return EINVAL;
	}

	/* copy-in the configuration information */
	/* data points to a pointer to the configuration structure */
	*k_cfg = RF_Malloc(sizeof(**k_cfg));
	if (*k_cfg == NULL) {
		return ENOMEM;
	}
	int retcode = copyin(u_cfg, *k_cfg, sizeof(RF_Config_t));
	if (retcode == 0)
		return 0;
	RF_Free(*k_cfg, sizeof(RF_Config_t));
	db1_printf(("%s: retcode=%d copyin.1\n", __func__, retcode));
	rs->sc_flags |= RAIDF_SHUTDOWN;
	return retcode;
}
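
/*
 * The configure path is a two-stage copyin: rf_getConfiguration() above
 * fetches the RF_Config_t itself, then rf_copyinspecificbuf() pulls in
 * the variable-size layout-specific blob it points to.  A userland
 * caller might look like this (an illustrative sketch, error handling
 * elided):
 *
 *	RF_Config_t cfg, *cfgp = &cfg;
 *	memset(&cfg, 0, sizeof(cfg));
 *	... fill in numCol, devnames, layout, etc ...
 *	ioctl(fd, RAIDFRAME_CONFIGURE, &cfgp);
 *
 * Note the double indirection: the ioctl argument is a pointer to a
 * pointer to the configuration structure.
 */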

int
rf_construct(struct raid_softc *rs, RF_Config_t *k_cfg)
{
	int retcode;
	RF_Raid_t *raidPtr = &rs->sc_r;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((retcode = rf_copyinspecificbuf(k_cfg)) != 0)
		goto out;

	/* should do some kind of sanity check on the configuration.
	 * Store the sum of all the bytes in the last byte? */

	/* configure the system */

	/*
	 * Clear the entire RAID descriptor, just to make sure
	 * there is no stale data left in the case of a
	 * reconfiguration
	 */
	memset(raidPtr, 0, sizeof(*raidPtr));
	raidPtr->softc = rs;
	raidPtr->raidid = rs->sc_unit;

	retcode = rf_Configure(raidPtr, k_cfg, NULL);

	if (retcode == 0) {
		/* allow this many simultaneous IO's to
		   this RAID device */
		raidPtr->openings = RAIDOUTSTANDING;

		raidinit(rs);
		raid_wakeup(raidPtr);
		rf_markalldirty(raidPtr);
	}

	/* free the buffers.  No return code here. */
	if (k_cfg->layoutSpecificSize) {
		RF_Free(k_cfg->layoutSpecific, k_cfg->layoutSpecificSize);
	}
out:
	RF_Free(k_cfg, sizeof(RF_Config_t));
	if (retcode) {
		/*
		 * If configuration failed, set sc_flags so that we
		 * will detach the device when we close it.
		 */
		rs->sc_flags |= RAIDF_SHUTDOWN;
	}
	return retcode;
}

#if RF_DISABLED
static int
rf_set_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{

	/* XXX check the label for valid stuff... */
	/* Note that some things *should not* get modified --
	   the user should be re-initing the labels instead of
	   trying to patch things.
	   */
#ifdef DEBUG
	int raidid = raidPtr->raidid;
	printf("raid%d: Got component label:\n", raidid);
	printf("raid%d: Version: %d\n", raidid, clabel->version);
	printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
	printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
	printf("raid%d: Column: %d\n", raidid, clabel->column);
	printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
	printf("raid%d: Clean: %d\n", raidid, clabel->clean);
	printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif	/* DEBUG */
	clabel->row = 0;
	int column = clabel->column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return EINVAL;
	}

	/* XXX this isn't allowed to do anything for now :-) */

	/* XXX and before it is, we need to fill in the rest
	   of the fields!?!?!?! */
	memcpy(raidget_component_label(raidPtr, column),
	    clabel, sizeof(*clabel));
	raidflush_component_label(raidPtr, column);
	return 0;
}
#endif

static int
rf_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/*
	 * We only want the serial number from the above.  We get all
	 * the rest of the information from the config that was used
	 * to create this RAID set.
	 */

	raidPtr->serial_number = clabel->serial_number;

	for (int column = 0; column < raidPtr->numCol; column++) {
		RF_RaidDisk_t *diskPtr = &raidPtr->Disks[column];
		if (RF_DEAD_DISK(diskPtr->status))
			continue;
		RF_ComponentLabel_t *ci_label = raidget_component_label(
		    raidPtr, column);
		/* Zeroing this is important. */
		memset(ci_label, 0, sizeof(*ci_label));
		raid_init_component_label(raidPtr, ci_label);
		ci_label->serial_number = raidPtr->serial_number;
		ci_label->row = 0; /* we don't pretend to support more */
		rf_component_label_set_partitionsize(ci_label,
		    diskPtr->partitionSize);
		ci_label->column = column;
		raidflush_component_label(raidPtr, column);
		/* XXXjld what about the spares? */
	}

	return 0;
}
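
/*
 * Component label updates in this driver follow a read-modify-write
 * pattern: fetch the in-core label with raidget_component_label(),
 * modify it, then push it to disk with raidflush_component_label().
 * For example (a sketch, mirroring RAIDFRAME_SET_LAST_UNIT below):
 *
 *	clabel = raidget_component_label(raidPtr, column);
 *	clabel->last_unit = unit;
 *	raidflush_component_label(raidPtr, column);
 */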

static int
rf_rebuild_in_place(RF_Raid_t *raidPtr, RF_SingleComponent_t *componentPtr)
{

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (raidPtr->recon_in_progress == 1) {
		/* a reconstruct is already in progress! */
		return EINVAL;
	}

	RF_SingleComponent_t component;
	memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
	component.row = 0; /* we don't support any more */
	int column = component.column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return EINVAL;
	}

	rf_lock_mutex2(raidPtr->mutex);
	if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* XXX 0 above shouldn't be constant!!! */
		/* some component other than this has failed.
		   Let's not make things worse than they already
		   are... */
		printf("raid%d: Unable to reconstruct to disk at:\n",
		    raidPtr->raidid);
		printf("raid%d:     Col: %d   Too many failures.\n",
		    raidPtr->raidid, column);
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_reconstructing) {
		printf("raid%d: Unable to reconstruct to disk at:\n",
		    raidPtr->raidid);
		printf("raid%d:     Col: %d   "
		    "Reconstruction already occurring!\n",
		    raidPtr->raidid, column);

		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_spared) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	rf_unlock_mutex2(raidPtr->mutex);

	struct rf_recon_req_internal *rrint;
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return ENOMEM;

	rrint->col = column;
	rrint->raidPtr = raidPtr;

	return RF_CREATE_THREAD(raidPtr->recon_thread,
	    rf_ReconstructInPlaceThread, rrint, "raid_reconip");
}

static int
rf_check_recon_status(RF_Raid_t *raidPtr, int *data)
{
	/*
	 * This makes no sense on a RAID 0, or if we are not reconstructing,
	 * so tell the user it's done.
	 */
	if (raidPtr->Layout.map->faultsTolerated == 0 ||
	    raidPtr->status != rf_rs_reconstructing) {
		*data = 100;
		return 0;
	}
	if (raidPtr->reconControl->numRUsTotal == 0) {
		*data = 0;
		return 0;
	}
	*data = (raidPtr->reconControl->numRUsComplete * 100
	    / raidPtr->reconControl->numRUsTotal);
	return 0;
}
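
/*
 * All of the long-running operations (reconstruction, parity rewrite,
 * copyback, rebuild-in-place) follow the same shape: validate state
 * under raidPtr->mutex, then detach a kernel thread with
 * RF_CREATE_THREAD() and return immediately; userland polls progress
 * with the corresponding RAIDFRAME_CHECK_*_STATUS ioctls handled below.
 */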

static int
raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
	int unit = raidunit(dev);
	int part, pmask;
	struct raid_softc *rs;
	struct dk_softc *dksc;
	RF_Config_t *k_cfg;
	RF_Raid_t *raidPtr;
	RF_AccTotals_t *totals;
	RF_SingleComponent_t component;
	RF_DeviceConfig_t *d_cfg, *ucfgp;
	int retcode = 0;
	int column;
	RF_ComponentLabel_t *clabel;
	RF_SingleComponent_t *sparePtr, *componentPtr;
	int d;

	if ((rs = raidget(unit, false)) == NULL)
		return ENXIO;

	dksc = &rs->sc_dksc;
	raidPtr = &rs->sc_r;

	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
	    (int) DISKPART(dev), (int) unit, cmd));

	/* Must be initialized for these... */
	if (rf_must_be_initialized(rs, cmd))
		return ENXIO;

	switch (cmd) {
		/* configure the system */
	case RAIDFRAME_CONFIGURE:
		if ((retcode = rf_getConfiguration(rs, data, &k_cfg)) != 0)
			return retcode;
		return rf_construct(rs, k_cfg);

		/* shutdown the system */
	case RAIDFRAME_SHUTDOWN:

		part = DISKPART(dev);
		pmask = (1 << part);

		if ((retcode = raidlock(rs)) != 0)
			return retcode;

		if (DK_BUSY(dksc, pmask) ||
		    raidPtr->recon_in_progress != 0 ||
		    raidPtr->parity_rewrite_in_progress != 0 ||
		    raidPtr->copyback_in_progress != 0)
			retcode = EBUSY;
		else {
			/* detach and free on close */
			rs->sc_flags |= RAIDF_SHUTDOWN;
			retcode = 0;
		}

		raidunlock(rs);

		return retcode;
	case RAIDFRAME_GET_COMPONENT_LABEL:
		return rf_get_component_label(raidPtr, data);

#if RF_DISABLED
	case RAIDFRAME_SET_COMPONENT_LABEL:
		return rf_set_component_label(raidPtr, data);
#endif

	case RAIDFRAME_INIT_LABELS:
		return rf_init_component_label(raidPtr, data);

	case RAIDFRAME_SET_AUTOCONFIG:
		d = rf_set_autoconfig(raidPtr, *(int *) data);
		printf("raid%d: New autoconfig value is: %d\n",
		    raidPtr->raidid, d);
		*(int *) data = d;
		return retcode;

	case RAIDFRAME_SET_ROOT:
		d = rf_set_rootpartition(raidPtr, *(int *) data);
		printf("raid%d: New rootpartition value is: %d\n",
		    raidPtr->raidid, d);
		*(int *) data = d;
		return retcode;

		/* initialize all parity */
	case RAIDFRAME_REWRITEPARITY:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* Parity for RAID 0 is trivially correct */
			raidPtr->parity_good = RF_RAID_CLEAN;
			return 0;
		}

		if (raidPtr->parity_rewrite_in_progress == 1) {
			/* Re-write is already in progress! */
			return EINVAL;
		}

		return RF_CREATE_THREAD(raidPtr->parity_rewrite_thread,
		    rf_RewriteParityThread, raidPtr, "raid_parity");

	case RAIDFRAME_ADD_HOT_SPARE:
		sparePtr = (RF_SingleComponent_t *) data;
		memcpy(&component, sparePtr, sizeof(RF_SingleComponent_t));
		return rf_add_hot_spare(raidPtr, &component);

	case RAIDFRAME_REMOVE_HOT_SPARE:
		return retcode;

	case RAIDFRAME_DELETE_COMPONENT:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
		return rf_delete_component(raidPtr, &component);

	case RAIDFRAME_INCORPORATE_HOT_SPARE:
		componentPtr = (RF_SingleComponent_t *)data;
		memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
		return rf_incorporate_hot_spare(raidPtr, &component);

	case RAIDFRAME_REBUILD_IN_PLACE:
		return rf_rebuild_in_place(raidPtr, data);

	case RAIDFRAME_GET_INFO:
		ucfgp = *(RF_DeviceConfig_t **)data;
		d_cfg = RF_Malloc(sizeof(*d_cfg));
		if (d_cfg == NULL)
			return ENOMEM;
		retcode = rf_get_info(raidPtr, d_cfg);
		if (retcode == 0) {
			retcode = copyout(d_cfg, ucfgp, sizeof(*d_cfg));
		}
		RF_Free(d_cfg, sizeof(RF_DeviceConfig_t));
		return retcode;

	case RAIDFRAME_CHECK_PARITY:
		*(int *) data = raidPtr->parity_good;
		return 0;

	case RAIDFRAME_PARITYMAP_STATUS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_status(raidPtr->parity_map, data);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_PARAMS:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		if (raidPtr->parity_map == NULL)
			return ENOENT; /* ??? */
		if (rf_paritymap_set_params(raidPtr->parity_map, data, 1) != 0)
			return EINVAL;
		return 0;

	case RAIDFRAME_PARITYMAP_GET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		*(int *) data = rf_paritymap_get_disable(raidPtr);
		return 0;

	case RAIDFRAME_PARITYMAP_SET_DISABLE:
		if (rf_paritymap_ineligible(raidPtr))
			return EINVAL;
		rf_paritymap_set_disable(raidPtr, *(int *)data);
		/* XXX should errors be passed up? */
		return 0;

	case RAIDFRAME_RESET_ACCTOTALS:
		memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals));
		return 0;

	case RAIDFRAME_GET_ACCTOTALS:
		totals = (RF_AccTotals_t *) data;
		*totals = raidPtr->acc_totals;
		return 0;

	case RAIDFRAME_KEEP_ACCTOTALS:
		raidPtr->keep_acc_totals = *(int *)data;
		return 0;

	case RAIDFRAME_GET_SIZE:
		*(int *) data = raidPtr->totalSectors;
		return 0;

	case RAIDFRAME_FAIL_DISK:
		return rf_fail_disk(raidPtr, data);

		/* invoke a copyback operation after recon on whatever disk
		 * needs it, if any */
	case RAIDFRAME_COPYBACK:

		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0!! */
			return EINVAL;
		}

		if (raidPtr->copyback_in_progress == 1) {
			/* Copyback is already in progress! */
			return EINVAL;
		}

		return RF_CREATE_THREAD(raidPtr->copyback_thread,
		    rf_CopybackThread, raidPtr, "raid_copyback");

		/* return the percentage completion of reconstruction */
	case RAIDFRAME_CHECK_RECON_STATUS:
		return rf_check_recon_status(raidPtr, data);

	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
		rf_check_recon_status_ext(raidPtr, data);
		return 0;

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0, so tell the
			   user it's done. */
			*(int *) data = 100;
			return 0;
		}
		if (raidPtr->parity_rewrite_in_progress == 1) {
			*(int *) data = 100 *
			    raidPtr->parity_rewrite_stripes_done /
			    raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return 0;

	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
		rf_check_parityrewrite_status_ext(raidPtr, data);
		return 0;

	case RAIDFRAME_CHECK_COPYBACK_STATUS:
		if (raidPtr->Layout.map->faultsTolerated == 0) {
			/* This makes no sense on a RAID 0 */
			*(int *) data = 100;
			return 0;
		}
		if (raidPtr->copyback_in_progress == 1) {
			*(int *) data = 100 * raidPtr->copyback_stripes_done /
			    raidPtr->Layout.numStripe;
		} else {
			*(int *) data = 100;
		}
		return 0;

	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
		rf_check_copyback_status_ext(raidPtr, data);
		return 0;

	case RAIDFRAME_SET_LAST_UNIT:
		for (column = 0; column < raidPtr->numCol; column++)
			if (raidPtr->Disks[column].status != rf_ds_optimal)
				return EBUSY;

		for (column = 0; column < raidPtr->numCol; column++) {
			clabel = raidget_component_label(raidPtr, column);
			clabel->last_unit = *(int *)data;
			raidflush_component_label(raidPtr, column);
		}
		rs->sc_cflags |= RAIDF_UNIT_CHANGED;
		return 0;

		/* the sparetable daemon calls this to wait for the kernel to
		 * need a spare table.  this ioctl does not return until a
		 * spare table is needed.  XXX -- calling mpsleep here in the
		 * ioctl code is almost certainly wrong and evil.
		 * -- XXX XXX -- I should either compute the spare table in
		 * the kernel, or have a different -- XXX XXX -- interface
		 * (a different character device) for delivering the table
		 * -- XXX */
#if RF_DISABLED
	case RAIDFRAME_SPARET_WAIT:
		rf_lock_mutex2(rf_sparet_wait_mutex);
		while (!rf_sparet_wait_queue)
			rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex);
		RF_SparetWait_t *waitreq = rf_sparet_wait_queue;
		rf_sparet_wait_queue = rf_sparet_wait_queue->next;
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		/* structure assignment */
		*((RF_SparetWait_t *) data) = *waitreq;

		RF_Free(waitreq, sizeof(*waitreq));
		return 0;

		/* wakes up a process waiting on SPARET_WAIT and puts an error
		 * code in it that will cause the daemon to exit */
	case RAIDFRAME_ABORT_SPARET_WAIT:
		waitreq = RF_Malloc(sizeof(*waitreq));
		waitreq->fcol = -1;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_wait_queue;
		rf_sparet_wait_queue = waitreq;
		rf_broadcast_cond2(rf_sparet_wait_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);
		return 0;

		/* used by the spare table daemon to deliver a spare table
		 * into the kernel */
	case RAIDFRAME_SEND_SPARET:

		/* install the spare table */
		retcode = rf_SetSpareTable(raidPtr, *(void **) data);

		/* respond to the requestor.  the return status of the spare
		 * table installation is passed in the "fcol" field */
		waitreq = RF_Malloc(sizeof(*waitreq));
		waitreq->fcol = retcode;
		rf_lock_mutex2(rf_sparet_wait_mutex);
		waitreq->next = rf_sparet_resp_queue;
		rf_sparet_resp_queue = waitreq;
		rf_broadcast_cond2(rf_sparet_resp_cv);
		rf_unlock_mutex2(rf_sparet_wait_mutex);

		return retcode;
#endif
	default:
		/*
		 * Don't bother trying to load compat modules
		 * if it is not our ioctl.  This is more efficient
		 * and makes rump tests not depend on compat code.
		 */
		if (IOCGROUP(cmd) != 'r')
			break;
#ifdef _LP64
		if ((l->l_proc->p_flag & PK_32) != 0) {
			module_autoload("compat_netbsd32_raid",
			    MODULE_CLASS_EXEC);
			MODULE_HOOK_CALL(raidframe_netbsd32_ioctl_hook,
			    (rs, cmd, data), enosys(), retcode);
			if (retcode != EPASSTHROUGH)
				return retcode;
		}
#endif
		module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
		MODULE_HOOK_CALL(raidframe_ioctl_80_hook,
		    (rs, cmd, data), enosys(), retcode);
		if (retcode != EPASSTHROUGH)
			return retcode;

		module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
		MODULE_HOOK_CALL(raidframe_ioctl_50_hook,
		    (rs, cmd, data), enosys(), retcode);
		if (retcode != EPASSTHROUGH)
			return retcode;
		break; /* fall through to the os-specific code below */

	}

	if (!raidPtr->valid)
		return EINVAL;

	/*
	 * Add support for "regular" device ioctls here.
	 */

	switch (cmd) {
	case DIOCGCACHE:
		retcode = rf_get_component_caches(raidPtr, (int *)data);
		break;

	case DIOCCACHESYNC:
		retcode = rf_sync_component_caches(raidPtr);
		break;

	default:
		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
		break;
	}

	return retcode;
}
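
/*
 * Compat note: ioctls in the 'r' group that are not handled above are
 * offered to the netbsd32 and older-ABI compat modules; a return of
 * EPASSTHROUGH means "not mine", and the request finally falls through
 * to the generic dk_ioctl() handling.
 */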

/*
 * raidinit -- complete the rest of the initialization for the
 * RAIDframe device.
 */
static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe.  */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usable */
	rs->sc_flags |= RAIDF_INITED;

	dkwedge_discover(&dksc->sc_dkdev);
}

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
/* wake up the daemon & tell it to get us a spare table
 * XXX
 * the entries in the queues should be tagged with the raidPtr
 * so that in the extremely rare case that two recons happen at once,
 * we know for which device we're requesting a spare table
 * XXX
 *
 * XXX This code is not currently used. GO
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int retcode;

	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return retcode;
}
#endif

/* a wrapper around rf_DoAccess that extracts appropriate info from the
 * bp & passes it down.
 * any calls originating in the kernel must use non-blocking I/O;
 * do some extra sanity checking to return "appropriate" error values for
 * certain conditions (to make some standard utilities work)
 *
 * Formerly known as: rf_DoAccessKernel
 */
void
raidstart(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
		    RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
		return;
	}

	dk_start(dksc, NULL);
}

static int
raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	daddr_t blocknum;
	int do_async;
	int rc;

	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->openings == 0) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EAGAIN;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	blocknum = bp->b_rawblkno;

	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
	    (int) blocknum));

	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

	/* *THIS* is where we adjust what block we're going to...
	 * but DO NOT TOUCH bp->b_blkno!!! */
	raid_addr = blocknum;

	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
	sum = raid_addr + num_blocks + pb;
	if (1 || rf_debugKernelAccess) {
		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
		    (int) raid_addr, (int) sum, (int) num_blocks,
		    (int) pb, (int) bp->b_resid));
	}
	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
	    || (sum < num_blocks) || (sum < pb)) {
		rc = ENOSPC;
		goto done;
	}
	/*
	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
	 */

	if (bp->b_bcount & raidPtr->sectorMask) {
		rc = ENOSPC;
		goto done;
	}
	db1_printf(("Calling DoAccess..\n"));


	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings--;
	rf_unlock_mutex2(raidPtr->mutex);

	/*
	 * Everything is async.
	 */
	do_async = 1;

	/* don't ever condition on bp->b_flags & B_WRITE.
	 * always condition on B_READ instead */

	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
	    RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
	    do_async, raid_addr, num_blocks,
	    bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

done:
	return rc;
}

/* invoke an I/O from kernel mode.  Disk queue should be locked upon entry */

int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
*/
1987 		/* I'm leaving this in, as I've never actually seen it used,
1988 		 * and I'd like folks to report it... GO */
1989 		printf("WAKEUP CALLED\n");
1990 		queue->numOutstanding++;
1991
1992 		bp->b_flags = 0;
1993 		bp->b_private = req;
1994
1995 		KernelWakeupFunc(bp);
1996 		break;
1997
1998 	case RF_IO_TYPE_READ:
1999 	case RF_IO_TYPE_WRITE:
2000 #if RF_ACC_TRACE > 0
2001 		if (req->tracerec) {
2002 			RF_ETIMER_START(req->tracerec->timer);
2003 		}
2004 #endif
2005 		InitBP(bp, queue->rf_cinfo->ci_vp,
2006 		    op, queue->rf_cinfo->ci_dev,
2007 		    req->sectorOffset, req->numSector,
2008 		    req->buf, KernelWakeupFunc, (void *) req,
2009 		    queue->raidPtr->logBytesPerSector);
2010
2011 		if (rf_debugKernelAccess) {
2012 			db1_printf(("dispatch: bp->b_blkno = %ld\n",
2013 			    (long) bp->b_blkno));
2014 		}
2015 		queue->numOutstanding++;
2016 		queue->last_deq_sector = req->sectorOffset;
2017 		/* acc wouldn't have been let in if there were any pending
2018 		 * reqs at any other priority */
2019 		queue->curPriority = req->priority;
2020
2021 		db1_printf(("Going for %c to unit %d col %d\n",
2022 		    req->type, queue->raidPtr->raidid,
2023 		    queue->col));
2024 		db1_printf(("sector %d count %d (%d bytes) %d\n",
2025 		    (int) req->sectorOffset, (int) req->numSector,
2026 		    (int) (req->numSector <<
2027 		    queue->raidPtr->logBytesPerSector),
2028 		    (int) queue->raidPtr->logBytesPerSector));
2029
2030 		/*
2031 		 * XXX: drop lock here since this can block at
2032 		 * least with backing SCSI devices.  Retake it
2033 		 * to minimize fuss with calling interfaces.
2034 		 */
2035
2036 		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
2037 		bdev_strategy(bp);
2038 		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
2039 		break;
2040
2041 	default:
2042 		panic("bad req->type in rf_DispatchKernelIO");
2043 	}
2044 	db1_printf(("Exiting from DispatchKernelIO\n"));
2045
2046 	return 0;
2047 }
2048 /* this is the callback function associated with an I/O invoked from
2049    kernel code.
2050  */
2051 static void
2052 KernelWakeupFunc(struct buf *bp)
2053 {
2054 	RF_DiskQueueData_t *req = NULL;
2055 	RF_DiskQueue_t *queue;
2056
2057 	db1_printf(("recovering the request queue:\n"));
2058
2059 	req = bp->b_private;
2060
2061 	queue = (RF_DiskQueue_t *) req->queue;
2062
2063 	rf_lock_mutex2(queue->raidPtr->iodone_lock);
2064
2065 #if RF_ACC_TRACE > 0
2066 	if (req->tracerec) {
2067 		RF_ETIMER_STOP(req->tracerec->timer);
2068 		RF_ETIMER_EVAL(req->tracerec->timer);
2069 		rf_lock_mutex2(rf_tracing_mutex);
2070 		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
2071 		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
2072 		req->tracerec->num_phys_ios++;
2073 		rf_unlock_mutex2(rf_tracing_mutex);
2074 	}
2075 #endif
2076
2077 	/* XXX Ok, let's get aggressive... If b_error is set, let's go
2078 	 * ballistic, and mark the component as hosed... */
2079
2080 	if (bp->b_error != 0) {
2081 		/* Mark the disk as dead */
2082 		/* but only mark it once... */
2083 		/* and only if it wouldn't leave this RAID set
2084 		   completely broken */
2085 		if (((queue->raidPtr->Disks[queue->col].status ==
2086 		    rf_ds_optimal) ||
2087 		    (queue->raidPtr->Disks[queue->col].status ==
2088 		    rf_ds_used_spare)) &&
2089 		    (queue->raidPtr->numFailures <
2090 		    queue->raidPtr->Layout.map->faultsTolerated)) {
2091 			printf("raid%d: IO Error (%d).
Marking %s as failed.\n", 2092 queue->raidPtr->raidid, 2093 bp->b_error, 2094 queue->raidPtr->Disks[queue->col].devname); 2095 queue->raidPtr->Disks[queue->col].status = 2096 rf_ds_failed; 2097 queue->raidPtr->status = rf_rs_degraded; 2098 queue->raidPtr->numFailures++; 2099 queue->raidPtr->numNewFailures++; 2100 } else { /* Disk is already dead... */ 2101 /* printf("Disk already marked as dead!\n"); */ 2102 } 2103 2104 } 2105 2106 /* Fill in the error value */ 2107 req->error = bp->b_error; 2108 2109 /* Drop this one on the "finished" queue... */ 2110 TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries); 2111 2112 /* Let the raidio thread know there is work to be done. */ 2113 rf_signal_cond2(queue->raidPtr->iodone_cv); 2114 2115 rf_unlock_mutex2(queue->raidPtr->iodone_lock); 2116 } 2117 2118 2119 /* 2120 * initialize a buf structure for doing an I/O in the kernel. 2121 */ 2122 static void 2123 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev, 2124 RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf, 2125 void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector) 2126 { 2127 bp->b_flags = rw_flag | (bp->b_flags & rf_b_pass); 2128 bp->b_oflags = 0; 2129 bp->b_cflags = 0; 2130 bp->b_bcount = numSect << logBytesPerSector; 2131 bp->b_bufsize = bp->b_bcount; 2132 bp->b_error = 0; 2133 bp->b_dev = dev; 2134 bp->b_data = bf; 2135 bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT; 2136 bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */ 2137 if (bp->b_bcount == 0) { 2138 panic("bp->b_bcount is zero in InitBP!!"); 2139 } 2140 bp->b_iodone = cbFunc; 2141 bp->b_private = cbArg; 2142 } 2143 2144 /* 2145 * Wait interruptibly for an exclusive lock. 2146 * 2147 * XXX 2148 * Several drivers do this; it should be abstracted and made MP-safe. 2149 * (Hmm... where have we seen this warning before :-> GO ) 2150 */ 2151 static int 2152 raidlock(struct raid_softc *rs) 2153 { 2154 int error; 2155 2156 error = 0; 2157 mutex_enter(&rs->sc_mutex); 2158 while ((rs->sc_flags & RAIDF_LOCKED) != 0) { 2159 rs->sc_flags |= RAIDF_WANTED; 2160 error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex); 2161 if (error != 0) 2162 goto done; 2163 } 2164 rs->sc_flags |= RAIDF_LOCKED; 2165 done: 2166 mutex_exit(&rs->sc_mutex); 2167 return error; 2168 } 2169 /* 2170 * Unlock and wake up any waiters. 
2171 */ 2172 static void 2173 raidunlock(struct raid_softc *rs) 2174 { 2175 2176 mutex_enter(&rs->sc_mutex); 2177 rs->sc_flags &= ~RAIDF_LOCKED; 2178 if ((rs->sc_flags & RAIDF_WANTED) != 0) { 2179 rs->sc_flags &= ~RAIDF_WANTED; 2180 cv_broadcast(&rs->sc_cv); 2181 } 2182 mutex_exit(&rs->sc_mutex); 2183 } 2184 2185 2186 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */ 2187 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */ 2188 #define RF_PARITY_MAP_SIZE RF_PARITYMAP_NBYTE 2189 2190 static daddr_t 2191 rf_component_info_offset(void) 2192 { 2193 2194 return RF_COMPONENT_INFO_OFFSET; 2195 } 2196 2197 static daddr_t 2198 rf_component_info_size(unsigned secsize) 2199 { 2200 daddr_t info_size; 2201 2202 KASSERT(secsize); 2203 if (secsize > RF_COMPONENT_INFO_SIZE) 2204 info_size = secsize; 2205 else 2206 info_size = RF_COMPONENT_INFO_SIZE; 2207 2208 return info_size; 2209 } 2210 2211 static daddr_t 2212 rf_parity_map_offset(RF_Raid_t *raidPtr) 2213 { 2214 daddr_t map_offset; 2215 2216 KASSERT(raidPtr->bytesPerSector); 2217 if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE) 2218 map_offset = raidPtr->bytesPerSector; 2219 else 2220 map_offset = RF_COMPONENT_INFO_SIZE; 2221 map_offset += rf_component_info_offset(); 2222 2223 return map_offset; 2224 } 2225 2226 static daddr_t 2227 rf_parity_map_size(RF_Raid_t *raidPtr) 2228 { 2229 daddr_t map_size; 2230 2231 if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE) 2232 map_size = raidPtr->bytesPerSector; 2233 else 2234 map_size = RF_PARITY_MAP_SIZE; 2235 2236 return map_size; 2237 } 2238 2239 int 2240 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col) 2241 { 2242 RF_ComponentLabel_t *clabel; 2243 2244 clabel = raidget_component_label(raidPtr, col); 2245 clabel->clean = RF_RAID_CLEAN; 2246 raidflush_component_label(raidPtr, col); 2247 return(0); 2248 } 2249 2250 2251 int 2252 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col) 2253 { 2254 RF_ComponentLabel_t *clabel; 2255 2256 clabel = raidget_component_label(raidPtr, col); 2257 clabel->clean = RF_RAID_DIRTY; 2258 raidflush_component_label(raidPtr, col); 2259 return(0); 2260 } 2261 2262 int 2263 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col) 2264 { 2265 KASSERT(raidPtr->bytesPerSector); 2266 return raidread_component_label(raidPtr->bytesPerSector, 2267 raidPtr->Disks[col].dev, 2268 raidPtr->raid_cinfo[col].ci_vp, 2269 &raidPtr->raid_cinfo[col].ci_label); 2270 } 2271 2272 RF_ComponentLabel_t * 2273 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col) 2274 { 2275 return &raidPtr->raid_cinfo[col].ci_label; 2276 } 2277 2278 int 2279 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col) 2280 { 2281 RF_ComponentLabel_t *label; 2282 2283 label = &raidPtr->raid_cinfo[col].ci_label; 2284 label->mod_counter = raidPtr->mod_counter; 2285 #ifndef RF_NO_PARITY_MAP 2286 label->parity_map_modcount = label->mod_counter; 2287 #endif 2288 return raidwrite_component_label(raidPtr->bytesPerSector, 2289 raidPtr->Disks[col].dev, 2290 raidPtr->raid_cinfo[col].ci_vp, label); 2291 } 2292 2293 2294 static int 2295 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp, 2296 RF_ComponentLabel_t *clabel) 2297 { 2298 return raidread_component_area(dev, b_vp, clabel, 2299 sizeof(RF_ComponentLabel_t), 2300 rf_component_info_offset(), 2301 rf_component_info_size(secsize)); 2302 } 2303 2304 /* ARGSUSED */ 2305 static int 2306 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data, 2307 size_t msize, daddr_t offset, daddr_t dsize) 2308 { 2309 struct buf *bp; 2310 int error; 2311 
2312 /* XXX should probably ensure that we don't try to do this if 2313 someone has changed rf_protected_sectors. */ 2314 2315 if (b_vp == NULL) { 2316 /* For whatever reason, this component is not valid. 2317 Don't try to read a component label from it. */ 2318 return(EINVAL); 2319 } 2320 2321 /* get a block of the appropriate size... */ 2322 bp = geteblk((int)dsize); 2323 bp->b_dev = dev; 2324 2325 /* get our ducks in a row for the read */ 2326 bp->b_blkno = offset / DEV_BSIZE; 2327 bp->b_bcount = dsize; 2328 bp->b_flags |= B_READ; 2329 bp->b_resid = dsize; 2330 2331 bdev_strategy(bp); 2332 error = biowait(bp); 2333 2334 if (!error) { 2335 memcpy(data, bp->b_data, msize); 2336 } 2337 2338 brelse(bp, 0); 2339 return(error); 2340 } 2341 2342 2343 static int 2344 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp, 2345 RF_ComponentLabel_t *clabel) 2346 { 2347 return raidwrite_component_area(dev, b_vp, clabel, 2348 sizeof(RF_ComponentLabel_t), 2349 rf_component_info_offset(), 2350 rf_component_info_size(secsize), 0); 2351 } 2352 2353 /* ARGSUSED */ 2354 static int 2355 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data, 2356 size_t msize, daddr_t offset, daddr_t dsize, int asyncp) 2357 { 2358 struct buf *bp; 2359 int error; 2360 2361 /* get a block of the appropriate size... */ 2362 bp = geteblk((int)dsize); 2363 bp->b_dev = dev; 2364 2365 /* get our ducks in a row for the write */ 2366 bp->b_blkno = offset / DEV_BSIZE; 2367 bp->b_bcount = dsize; 2368 bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0); 2369 bp->b_resid = dsize; 2370 2371 memset(bp->b_data, 0, dsize); 2372 memcpy(bp->b_data, data, msize); 2373 2374 bdev_strategy(bp); 2375 if (asyncp) 2376 return 0; 2377 error = biowait(bp); 2378 brelse(bp, 0); 2379 if (error) { 2380 #if 1 2381 printf("Failed to write RAID component info!\n"); 2382 #endif 2383 } 2384 2385 return(error); 2386 } 2387 2388 void 2389 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map) 2390 { 2391 int c; 2392 2393 for (c = 0; c < raidPtr->numCol; c++) { 2394 /* Skip dead disks. */ 2395 if (RF_DEAD_DISK(raidPtr->Disks[c].status)) 2396 continue; 2397 /* XXXjld: what if an error occurs here? */ 2398 raidwrite_component_area(raidPtr->Disks[c].dev, 2399 raidPtr->raid_cinfo[c].ci_vp, map, 2400 RF_PARITYMAP_NBYTE, 2401 rf_parity_map_offset(raidPtr), 2402 rf_parity_map_size(raidPtr), 0); 2403 } 2404 } 2405 2406 void 2407 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map) 2408 { 2409 struct rf_paritymap_ondisk tmp; 2410 int c,first; 2411 2412 first=1; 2413 for (c = 0; c < raidPtr->numCol; c++) { 2414 /* Skip dead disks. 
*/ 2415 if (RF_DEAD_DISK(raidPtr->Disks[c].status)) 2416 continue; 2417 raidread_component_area(raidPtr->Disks[c].dev, 2418 raidPtr->raid_cinfo[c].ci_vp, &tmp, 2419 RF_PARITYMAP_NBYTE, 2420 rf_parity_map_offset(raidPtr), 2421 rf_parity_map_size(raidPtr)); 2422 if (first) { 2423 memcpy(map, &tmp, sizeof(*map)); 2424 first = 0; 2425 } else { 2426 rf_paritymap_merge(map, &tmp); 2427 } 2428 } 2429 } 2430 2431 void 2432 rf_markalldirty(RF_Raid_t *raidPtr) 2433 { 2434 RF_ComponentLabel_t *clabel; 2435 int sparecol; 2436 int c; 2437 int j; 2438 int scol = -1; 2439 2440 raidPtr->mod_counter++; 2441 for (c = 0; c < raidPtr->numCol; c++) { 2442 /* we don't want to touch (at all) a disk that has 2443 failed */ 2444 if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) { 2445 clabel = raidget_component_label(raidPtr, c); 2446 if (clabel->status == rf_ds_spared) { 2447 /* XXX do something special... 2448 but whatever you do, don't 2449 try to access it!! */ 2450 } else { 2451 raidmarkdirty(raidPtr, c); 2452 } 2453 } 2454 } 2455 2456 for( c = 0; c < raidPtr->numSpare ; c++) { 2457 sparecol = raidPtr->numCol + c; 2458 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) { 2459 /* 2460 2461 we claim this disk is "optimal" if it's 2462 rf_ds_used_spare, as that means it should be 2463 directly substitutable for the disk it replaced. 2464 We note that too... 2465 2466 */ 2467 2468 for(j=0;j<raidPtr->numCol;j++) { 2469 if (raidPtr->Disks[j].spareCol == sparecol) { 2470 scol = j; 2471 break; 2472 } 2473 } 2474 2475 clabel = raidget_component_label(raidPtr, sparecol); 2476 /* make sure status is noted */ 2477 2478 raid_init_component_label(raidPtr, clabel); 2479 2480 clabel->row = 0; 2481 clabel->column = scol; 2482 /* Note: we *don't* change status from rf_ds_used_spare 2483 to rf_ds_optimal */ 2484 /* clabel.status = rf_ds_optimal; */ 2485 2486 raidmarkdirty(raidPtr, sparecol); 2487 } 2488 } 2489 } 2490 2491 2492 void 2493 rf_update_component_labels(RF_Raid_t *raidPtr, int final) 2494 { 2495 RF_ComponentLabel_t *clabel; 2496 int sparecol; 2497 int c; 2498 int j; 2499 int scol; 2500 struct raid_softc *rs = raidPtr->softc; 2501 2502 scol = -1; 2503 2504 /* XXX should do extra checks to make sure things really are clean, 2505 rather than blindly setting the clean bit... */ 2506 2507 raidPtr->mod_counter++; 2508 2509 for (c = 0; c < raidPtr->numCol; c++) { 2510 if (raidPtr->Disks[c].status == rf_ds_optimal) { 2511 clabel = raidget_component_label(raidPtr, c); 2512 /* make sure status is noted */ 2513 clabel->status = rf_ds_optimal; 2514 2515 /* note what unit we are configured as */ 2516 if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0) 2517 clabel->last_unit = raidPtr->raidid; 2518 2519 raidflush_component_label(raidPtr, c); 2520 if (final == RF_FINAL_COMPONENT_UPDATE) { 2521 if (raidPtr->parity_good == RF_RAID_CLEAN) { 2522 raidmarkclean(raidPtr, c); 2523 } 2524 } 2525 } 2526 /* else we don't touch it.. */ 2527 } 2528 2529 for( c = 0; c < raidPtr->numSpare ; c++) { 2530 sparecol = raidPtr->numCol + c; 2531 /* Need to ensure that the reconstruct actually completed! */ 2532 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) { 2533 /* 2534 2535 we claim this disk is "optimal" if it's 2536 rf_ds_used_spare, as that means it should be 2537 directly substitutable for the disk it replaced. 2538 We note that too... 2539 2540 */ 2541 2542 for(j=0;j<raidPtr->numCol;j++) { 2543 if (raidPtr->Disks[j].spareCol == sparecol) { 2544 scol = j; 2545 break; 2546 } 2547 } 2548 2549 /* XXX shouldn't *really* need this... 
*/ 2550 clabel = raidget_component_label(raidPtr, sparecol); 2551 /* make sure status is noted */ 2552 2553 raid_init_component_label(raidPtr, clabel); 2554 2555 clabel->column = scol; 2556 clabel->status = rf_ds_optimal; 2557 if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0) 2558 clabel->last_unit = raidPtr->raidid; 2559 2560 raidflush_component_label(raidPtr, sparecol); 2561 if (final == RF_FINAL_COMPONENT_UPDATE) { 2562 if (raidPtr->parity_good == RF_RAID_CLEAN) { 2563 raidmarkclean(raidPtr, sparecol); 2564 } 2565 } 2566 } 2567 } 2568 } 2569 2570 void 2571 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured) 2572 { 2573 2574 if (vp != NULL) { 2575 if (auto_configured == 1) { 2576 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 2577 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED); 2578 vput(vp); 2579 2580 } else { 2581 (void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred); 2582 } 2583 } 2584 } 2585 2586 2587 void 2588 rf_UnconfigureVnodes(RF_Raid_t *raidPtr) 2589 { 2590 int r,c; 2591 struct vnode *vp; 2592 int acd; 2593 2594 2595 /* We take this opportunity to close the vnodes like we should.. */ 2596 2597 for (c = 0; c < raidPtr->numCol; c++) { 2598 vp = raidPtr->raid_cinfo[c].ci_vp; 2599 acd = raidPtr->Disks[c].auto_configured; 2600 rf_close_component(raidPtr, vp, acd); 2601 raidPtr->raid_cinfo[c].ci_vp = NULL; 2602 raidPtr->Disks[c].auto_configured = 0; 2603 } 2604 2605 for (r = 0; r < raidPtr->numSpare; r++) { 2606 vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp; 2607 acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured; 2608 rf_close_component(raidPtr, vp, acd); 2609 raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL; 2610 raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0; 2611 } 2612 } 2613 2614 2615 void 2616 rf_ReconThread(struct rf_recon_req_internal *req) 2617 { 2618 int s; 2619 RF_Raid_t *raidPtr; 2620 2621 s = splbio(); 2622 raidPtr = (RF_Raid_t *) req->raidPtr; 2623 raidPtr->recon_in_progress = 1; 2624 2625 rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col, 2626 ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0)); 2627 2628 RF_Free(req, sizeof(*req)); 2629 2630 raidPtr->recon_in_progress = 0; 2631 splx(s); 2632 2633 /* That's all... */ 2634 kthread_exit(0); /* does not return */ 2635 } 2636 2637 void 2638 rf_RewriteParityThread(RF_Raid_t *raidPtr) 2639 { 2640 int retcode; 2641 int s; 2642 2643 raidPtr->parity_rewrite_stripes_done = 0; 2644 raidPtr->parity_rewrite_in_progress = 1; 2645 s = splbio(); 2646 retcode = rf_RewriteParity(raidPtr); 2647 splx(s); 2648 if (retcode) { 2649 printf("raid%d: Error re-writing parity (%d)!\n", 2650 raidPtr->raidid, retcode); 2651 } else { 2652 /* set the clean bit! If we shutdown correctly, 2653 the clean bit on each component label will get 2654 set */ 2655 raidPtr->parity_good = RF_RAID_CLEAN; 2656 } 2657 raidPtr->parity_rewrite_in_progress = 0; 2658 2659 /* Anyone waiting for us to stop? If so, inform them... */ 2660 if (raidPtr->waitShutdown) { 2661 rf_lock_mutex2(raidPtr->rad_lock); 2662 cv_broadcast(&raidPtr->parity_rewrite_cv); 2663 rf_unlock_mutex2(raidPtr->rad_lock); 2664 } 2665 2666 /* That's all... */ 2667 kthread_exit(0); /* does not return */ 2668 } 2669 2670 2671 void 2672 rf_CopybackThread(RF_Raid_t *raidPtr) 2673 { 2674 int s; 2675 2676 raidPtr->copyback_in_progress = 1; 2677 s = splbio(); 2678 rf_CopybackReconstructedData(raidPtr); 2679 splx(s); 2680 raidPtr->copyback_in_progress = 0; 2681 2682 /* That's all... 
*/ 2683 kthread_exit(0); /* does not return */ 2684 } 2685 2686 2687 void 2688 rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req) 2689 { 2690 int s; 2691 RF_Raid_t *raidPtr; 2692 2693 s = splbio(); 2694 raidPtr = req->raidPtr; 2695 raidPtr->recon_in_progress = 1; 2696 rf_ReconstructInPlace(raidPtr, req->col); 2697 RF_Free(req, sizeof(*req)); 2698 raidPtr->recon_in_progress = 0; 2699 splx(s); 2700 2701 /* That's all... */ 2702 kthread_exit(0); /* does not return */ 2703 } 2704 2705 static RF_AutoConfig_t * 2706 rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp, 2707 const char *cname, RF_SectorCount_t size, uint64_t numsecs, 2708 unsigned secsize) 2709 { 2710 int good_one = 0; 2711 RF_ComponentLabel_t *clabel; 2712 RF_AutoConfig_t *ac; 2713 2714 clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_WAITOK); 2715 2716 if (!raidread_component_label(secsize, dev, vp, clabel)) { 2717 /* Got the label. Does it look reasonable? */ 2718 if (rf_reasonable_label(clabel, numsecs) && 2719 (rf_component_label_partitionsize(clabel) <= size)) { 2720 #ifdef DEBUG 2721 printf("Component on: %s: %llu\n", 2722 cname, (unsigned long long)size); 2723 rf_print_component_label(clabel); 2724 #endif 2725 /* if it's reasonable, add it, else ignore it. */ 2726 ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME, 2727 M_WAITOK); 2728 strlcpy(ac->devname, cname, sizeof(ac->devname)); 2729 ac->dev = dev; 2730 ac->vp = vp; 2731 ac->clabel = clabel; 2732 ac->next = ac_list; 2733 ac_list = ac; 2734 good_one = 1; 2735 } 2736 } 2737 if (!good_one) { 2738 /* cleanup */ 2739 free(clabel, M_RAIDFRAME); 2740 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 2741 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED); 2742 vput(vp); 2743 } 2744 return ac_list; 2745 } 2746 2747 RF_AutoConfig_t * 2748 rf_find_raid_components(void) 2749 { 2750 struct vnode *vp; 2751 struct disklabel label; 2752 device_t dv; 2753 deviter_t di; 2754 dev_t dev; 2755 int bmajor, bminor, wedge, rf_part_found; 2756 int error; 2757 int i; 2758 RF_AutoConfig_t *ac_list; 2759 uint64_t numsecs; 2760 unsigned secsize; 2761 int dowedges; 2762 2763 /* initialize the AutoConfig list */ 2764 ac_list = NULL; 2765 2766 /* 2767 * we begin by trolling through *all* the devices on the system *twice* 2768 * first we scan for wedges, second for other devices. This avoids 2769 * using a raw partition instead of a wedge that covers the whole disk 2770 */ 2771 2772 for (dowedges=1; dowedges>=0; --dowedges) { 2773 for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL; 2774 dv = deviter_next(&di)) { 2775 2776 /* we are only interested in disks... */ 2777 if (device_class(dv) != DV_DISK) 2778 continue; 2779 2780 /* we don't care about floppies... */ 2781 if (device_is_a(dv, "fd")) { 2782 continue; 2783 } 2784 2785 /* we don't care about CD's... */ 2786 if (device_is_a(dv, "cd")) { 2787 continue; 2788 } 2789 2790 /* we don't care about md's... */ 2791 if (device_is_a(dv, "md")) { 2792 continue; 2793 } 2794 2795 /* hdfd is the Atari/Hades floppy driver */ 2796 if (device_is_a(dv, "hdfd")) { 2797 continue; 2798 } 2799 2800 /* fdisa is the Atari/Milan floppy driver */ 2801 if (device_is_a(dv, "fdisa")) { 2802 continue; 2803 } 2804 2805 /* are we in the wedges pass ? 
*/ 2806 wedge = device_is_a(dv, "dk"); 2807 if (wedge != dowedges) { 2808 continue; 2809 } 2810 2811 /* need to find the device_name_to_block_device_major stuff */ 2812 bmajor = devsw_name2blk(device_xname(dv), NULL, 0); 2813 2814 rf_part_found = 0; /*No raid partition as yet*/ 2815 2816 /* get a vnode for the raw partition of this disk */ 2817 bminor = minor(device_unit(dv)); 2818 dev = wedge ? makedev(bmajor, bminor) : 2819 MAKEDISKDEV(bmajor, bminor, RAW_PART); 2820 if (bdevvp(dev, &vp)) 2821 panic("RAID can't alloc vnode"); 2822 2823 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 2824 error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED); 2825 2826 if (error) { 2827 /* "Who cares." Continue looking 2828 for something that exists*/ 2829 vput(vp); 2830 continue; 2831 } 2832 2833 error = getdisksize(vp, &numsecs, &secsize); 2834 if (error) { 2835 /* 2836 * Pseudo devices like vnd and cgd can be 2837 * opened but may still need some configuration. 2838 * Ignore these quietly. 2839 */ 2840 if (error != ENXIO) 2841 printf("RAIDframe: can't get disk size" 2842 " for dev %s (%d)\n", 2843 device_xname(dv), error); 2844 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED); 2845 vput(vp); 2846 continue; 2847 } 2848 if (wedge) { 2849 struct dkwedge_info dkw; 2850 error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD, 2851 NOCRED); 2852 if (error) { 2853 printf("RAIDframe: can't get wedge info for " 2854 "dev %s (%d)\n", device_xname(dv), error); 2855 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED); 2856 vput(vp); 2857 continue; 2858 } 2859 2860 if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) { 2861 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED); 2862 vput(vp); 2863 continue; 2864 } 2865 2866 VOP_UNLOCK(vp); 2867 ac_list = rf_get_component(ac_list, dev, vp, 2868 device_xname(dv), dkw.dkw_size, numsecs, secsize); 2869 rf_part_found = 1; /*There is a raid component on this disk*/ 2870 continue; 2871 } 2872 2873 /* Ok, the disk exists. Go get the disklabel. */ 2874 error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED); 2875 if (error) { 2876 /* 2877 * XXX can't happen - open() would 2878 * have errored out (or faked up one) 2879 */ 2880 if (error != ENOTTY) 2881 printf("RAIDframe: can't get label for dev " 2882 "%s (%d)\n", device_xname(dv), error); 2883 } 2884 2885 /* don't need this any more. We'll allocate it again 2886 a little later if we really do... */ 2887 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED); 2888 vput(vp); 2889 2890 if (error) 2891 continue; 2892 2893 rf_part_found = 0; /*No raid partitions yet*/ 2894 for (i = 0; i < label.d_npartitions; i++) { 2895 char cname[sizeof(ac_list->devname)]; 2896 2897 /* We only support partitions marked as RAID */ 2898 if (label.d_partitions[i].p_fstype != FS_RAID) 2899 continue; 2900 2901 dev = MAKEDISKDEV(bmajor, device_unit(dv), i); 2902 if (bdevvp(dev, &vp)) 2903 panic("RAID can't alloc vnode"); 2904 2905 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 2906 error = VOP_OPEN(vp, FREAD, NOCRED); 2907 if (error) { 2908 /* Whatever... */ 2909 vput(vp); 2910 continue; 2911 } 2912 VOP_UNLOCK(vp); 2913 snprintf(cname, sizeof(cname), "%s%c", 2914 device_xname(dv), 'a' + i); 2915 ac_list = rf_get_component(ac_list, dev, vp, cname, 2916 label.d_partitions[i].p_size, numsecs, secsize); 2917 rf_part_found = 1; /*There is at least one raid partition on this disk*/ 2918 } 2919 2920 /* 2921 *If there is no raid component on this disk, either in a 2922 *disklabel or inside a wedge, check the raw partition as well, 2923 *as it is possible to configure raid components on raw disk 2924 *devices. 
2925 */ 2926 2927 if (!rf_part_found) { 2928 char cname[sizeof(ac_list->devname)]; 2929 2930 dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART); 2931 if (bdevvp(dev, &vp)) 2932 panic("RAID can't alloc vnode"); 2933 2934 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 2935 2936 error = VOP_OPEN(vp, FREAD, NOCRED); 2937 if (error) { 2938 /* Whatever... */ 2939 vput(vp); 2940 continue; 2941 } 2942 VOP_UNLOCK(vp); 2943 snprintf(cname, sizeof(cname), "%s%c", 2944 device_xname(dv), 'a' + RAW_PART); 2945 ac_list = rf_get_component(ac_list, dev, vp, cname, 2946 label.d_partitions[RAW_PART].p_size, numsecs, secsize); 2947 } 2948 } 2949 deviter_release(&di); 2950 } 2951 return ac_list; 2952 } 2953 2954 2955 int 2956 rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs) 2957 { 2958 2959 if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) || 2960 (clabel->version==RF_COMPONENT_LABEL_VERSION)) && 2961 ((clabel->clean == RF_RAID_CLEAN) || 2962 (clabel->clean == RF_RAID_DIRTY)) && 2963 clabel->row >=0 && 2964 clabel->column >= 0 && 2965 clabel->num_rows > 0 && 2966 clabel->num_columns > 0 && 2967 clabel->row < clabel->num_rows && 2968 clabel->column < clabel->num_columns && 2969 clabel->blockSize > 0 && 2970 /* 2971 * numBlocksHi may contain garbage, but it is ok since 2972 * the type is unsigned. If it is really garbage, 2973 * rf_fix_old_label_size() will fix it. 2974 */ 2975 rf_component_label_numblocks(clabel) > 0) { 2976 /* 2977 * label looks reasonable enough... 2978 * let's make sure it has no old garbage. 2979 */ 2980 if (numsecs) 2981 rf_fix_old_label_size(clabel, numsecs); 2982 return(1); 2983 } 2984 return(0); 2985 } 2986 2987 2988 /* 2989 * For reasons yet unknown, some old component labels have garbage in 2990 * the newer numBlocksHi region, and this causes lossage. Since those 2991 * disks will also have numsecs set to less than 32 bits of sectors, 2992 * we can determine when this corruption has occurred, and fix it. 2993 * 2994 * The exact same problem, with the same unknown reason, happens to 2995 * the partitionSizeHi member as well. 2996 */ 2997 static void 2998 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs) 2999 { 3000 3001 if (numsecs < ((uint64_t)1 << 32)) { 3002 if (clabel->numBlocksHi) { 3003 printf("WARNING: total sectors < 32 bits, yet " 3004 "numBlocksHi set\n" 3005 "WARNING: resetting numBlocksHi to zero.\n"); 3006 clabel->numBlocksHi = 0; 3007 } 3008 3009 if (clabel->partitionSizeHi) { 3010 printf("WARNING: total sectors < 32 bits, yet " 3011 "partitionSizeHi set\n" 3012 "WARNING: resetting partitionSizeHi to zero.\n"); 3013 clabel->partitionSizeHi = 0; 3014 } 3015 } 3016 } 3017 3018 3019 #ifdef DEBUG 3020 void 3021 rf_print_component_label(RF_ComponentLabel_t *clabel) 3022 { 3023 uint64_t numBlocks; 3024 static const char *rp[] = { 3025 "No", "Force", "Soft", "*invalid*" 3026 }; 3027 3028 3029 numBlocks = rf_component_label_numblocks(clabel); 3030 3031 printf(" Row: %d Column: %d Num Rows: %d Num Columns: %d\n", 3032 clabel->row, clabel->column, 3033 clabel->num_rows, clabel->num_columns); 3034 printf(" Version: %d Serial Number: %d Mod Counter: %d\n", 3035 clabel->version, clabel->serial_number, 3036 clabel->mod_counter); 3037 printf(" Clean: %s Status: %d\n", 3038 clabel->clean ? 
"Yes" : "No", clabel->status); 3039 printf(" sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n", 3040 clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU); 3041 printf(" RAID Level: %c blocksize: %d numBlocks: %"PRIu64"\n", 3042 (char) clabel->parityConfig, clabel->blockSize, numBlocks); 3043 printf(" Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No"); 3044 printf(" Root partition: %s\n", rp[clabel->root_partition & 3]); 3045 printf(" Last configured as: raid%d\n", clabel->last_unit); 3046 #if 0 3047 printf(" Config order: %d\n", clabel->config_order); 3048 #endif 3049 3050 } 3051 #endif 3052 3053 RF_ConfigSet_t * 3054 rf_create_auto_sets(RF_AutoConfig_t *ac_list) 3055 { 3056 RF_AutoConfig_t *ac; 3057 RF_ConfigSet_t *config_sets; 3058 RF_ConfigSet_t *cset; 3059 RF_AutoConfig_t *ac_next; 3060 3061 3062 config_sets = NULL; 3063 3064 /* Go through the AutoConfig list, and figure out which components 3065 belong to what sets. */ 3066 ac = ac_list; 3067 while(ac!=NULL) { 3068 /* we're going to putz with ac->next, so save it here 3069 for use at the end of the loop */ 3070 ac_next = ac->next; 3071 3072 if (config_sets == NULL) { 3073 /* will need at least this one... */ 3074 config_sets = malloc(sizeof(RF_ConfigSet_t), 3075 M_RAIDFRAME, M_WAITOK); 3076 /* this one is easy :) */ 3077 config_sets->ac = ac; 3078 config_sets->next = NULL; 3079 config_sets->rootable = 0; 3080 ac->next = NULL; 3081 } else { 3082 /* which set does this component fit into? */ 3083 cset = config_sets; 3084 while(cset!=NULL) { 3085 if (rf_does_it_fit(cset, ac)) { 3086 /* looks like it matches... */ 3087 ac->next = cset->ac; 3088 cset->ac = ac; 3089 break; 3090 } 3091 cset = cset->next; 3092 } 3093 if (cset==NULL) { 3094 /* didn't find a match above... new set..*/ 3095 cset = malloc(sizeof(RF_ConfigSet_t), 3096 M_RAIDFRAME, M_WAITOK); 3097 cset->ac = ac; 3098 ac->next = NULL; 3099 cset->next = config_sets; 3100 cset->rootable = 0; 3101 config_sets = cset; 3102 } 3103 } 3104 ac = ac_next; 3105 } 3106 3107 3108 return(config_sets); 3109 } 3110 3111 static int 3112 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac) 3113 { 3114 RF_ComponentLabel_t *clabel1, *clabel2; 3115 3116 /* If this one matches the *first* one in the set, that's good 3117 enough, since the other members of the set would have been 3118 through here too... */ 3119 /* note that we are not checking partitionSize here.. 3120 3121 Note that we are also not checking the mod_counters here. 3122 If everything else matches except the mod_counter, that's 3123 good enough for this test. We will deal with the mod_counters 3124 a little later in the autoconfiguration process. 3125 3126 (clabel1->mod_counter == clabel2->mod_counter) && 3127 3128 The reason we don't check for this is that failed disks 3129 will have lower modification counts. If those disks are 3130 not added to the set they used to belong to, then they will 3131 form their own set, which may result in 2 different sets, 3132 for example, competing to be configured at raid0, and 3133 perhaps competing to be the root filesystem set. If the 3134 wrong ones get configured, or both attempt to become /, 3135 weird behaviour and or serious lossage will occur. Thus we 3136 need to bring them into the fold here, and kick them out at 3137 a later point. 
3111 static int
3112 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
3113 {
3114 	RF_ComponentLabel_t *clabel1, *clabel2;
3115
3116 	/* If this one matches the *first* one in the set, that's good
3117 	   enough, since the other members of the set would have been
3118 	   through here too... */
3119 	/* note that we are not checking partitionSize here..
3120
3121 	   Note that we are also not checking the mod_counters here.
3122 	   If everything else matches except the mod_counter, that's
3123 	   good enough for this test. We will deal with the mod_counters
3124 	   a little later in the autoconfiguration process.
3125
3126 	    (clabel1->mod_counter == clabel2->mod_counter) &&
3127
3128 	   The reason we don't check for this is that failed disks
3129 	   will have lower modification counts. If those disks are
3130 	   not added to the set they used to belong to, then they will
3131 	   form their own set, which may result in 2 different sets,
3132 	   for example, competing to be configured at raid0, and
3133 	   perhaps competing to be the root filesystem set. If the
3134 	   wrong ones get configured, or both attempt to become /,
3135 	   weird behaviour and/or serious lossage will occur. Thus we
3136 	   need to bring them into the fold here, and kick them out at
3137 	   a later point.
3138
3139 	*/
3140
3141 	clabel1 = cset->ac->clabel;
3142 	clabel2 = ac->clabel;
3143 	if ((clabel1->version == clabel2->version) &&
3144 	    (clabel1->serial_number == clabel2->serial_number) &&
3145 	    (clabel1->num_rows == clabel2->num_rows) &&
3146 	    (clabel1->num_columns == clabel2->num_columns) &&
3147 	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
3148 	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
3149 	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
3150 	    (clabel1->parityConfig == clabel2->parityConfig) &&
3151 	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
3152 	    (clabel1->blockSize == clabel2->blockSize) &&
3153 	    rf_component_label_numblocks(clabel1) ==
3154 	    rf_component_label_numblocks(clabel2) &&
3155 	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
3156 	    (clabel1->root_partition == clabel2->root_partition) &&
3157 	    (clabel1->last_unit == clabel2->last_unit) &&
3158 	    (clabel1->config_order == clabel2->config_order)) {
3159 		/* if it gets here, it almost *has* to be a match */
3160 	} else {
3161 		/* it's not consistent with somebody in the set..
3162 		   punt */
3163 		return(0);
3164 	}
3165 	/* all was fine.. it must fit... */
3166 	return(1);
3167 }
3168
3169 int
3170 rf_have_enough_components(RF_ConfigSet_t *cset)
3171 {
3172 	RF_AutoConfig_t *ac;
3173 	RF_AutoConfig_t *auto_config;
3174 	RF_ComponentLabel_t *clabel;
3175 	int c;
3176 	int num_cols;
3177 	int num_missing;
3178 	int mod_counter;
3179 	int mod_counter_found;
3180 	int even_pair_failed;
3181 	char parity_type;
3182
3183
3184 	/* check to see that we have enough 'live' components
3185 	   of this set. If so, we can configure it if necessary */
3186
3187 	num_cols = cset->ac->clabel->num_columns;
3188 	parity_type = cset->ac->clabel->parityConfig;
3189
3190 	/* XXX Check for duplicate components!?!?!? */
3191
3192 	/* Determine what the mod_counter is supposed to be for this set. */
3193
3194 	mod_counter_found = 0;
3195 	mod_counter = 0;
3196 	ac = cset->ac;
3197 	while (ac != NULL) {
3198 		if (mod_counter_found == 0) {
3199 			mod_counter = ac->clabel->mod_counter;
3200 			mod_counter_found = 1;
3201 		} else {
3202 			if (ac->clabel->mod_counter > mod_counter) {
3203 				mod_counter = ac->clabel->mod_counter;
3204 			}
3205 		}
3206 		ac = ac->next;
3207 	}
3208
3209 	num_missing = 0;
3210 	auto_config = cset->ac;
3211
3212 	even_pair_failed = 0;
3213 	for (c = 0; c < num_cols; c++) {
3214 		ac = auto_config;
3215 		while (ac != NULL) {
3216 			if ((ac->clabel->column == c) &&
3217 			    (ac->clabel->mod_counter == mod_counter)) {
3218 				/* it's this one... */
3219 #ifdef DEBUG
3220 				printf("Found: %s at %d\n",
3221 				    ac->devname, c);
3222 #endif
3223 				break;
3224 			}
3225 			ac = ac->next;
3226 		}
3227 		if (ac == NULL) {
3228 			/* Didn't find one here! */
3229 			/* special case for RAID 1, especially
3230 			   where there are more than 2
3231 			   components (where RAIDframe treats
3232 			   things a little differently :( ) */
3233 			if (parity_type == '1') {
3234 				if (c % 2 == 0) {	/* even component */
3235 					even_pair_failed = 1;
3236 				} else {	/* odd component. If
3237 						   we're failed, and
3238 						   so is the even
3239 						   component, it's
3240 						   "Good Night, Charlie" */
3241 					if (even_pair_failed == 1) {
3242 						return(0);
3243 					}
3244 				}
3245 			} else {
3246 				/* normal accounting */
3247 				num_missing++;
3248 			}
3249 		}
3250 		if ((parity_type == '1') && (c % 2 == 1)) {
3251 			/* Just did the odd half of a pair, and we didn't
3252 			   bail.. reset the even_pair_failed flag,
3253 			   and go on to the next pair....
*/ 3254 even_pair_failed = 0; 3255 } 3256 } 3257 3258 clabel = cset->ac->clabel; 3259 3260 if (((clabel->parityConfig == '0') && (num_missing > 0)) || 3261 ((clabel->parityConfig == '4') && (num_missing > 1)) || 3262 ((clabel->parityConfig == '5') && (num_missing > 1))) { 3263 /* XXX this needs to be made *much* more general */ 3264 /* Too many failures */ 3265 return(0); 3266 } 3267 /* otherwise, all is well, and we've got enough to take a kick 3268 at autoconfiguring this set */ 3269 return(1); 3270 } 3271 3272 void 3273 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config, 3274 RF_Raid_t *raidPtr) 3275 { 3276 RF_ComponentLabel_t *clabel; 3277 int i; 3278 3279 clabel = ac->clabel; 3280 3281 /* 1. Fill in the common stuff */ 3282 config->numCol = clabel->num_columns; 3283 config->numSpare = 0; /* XXX should this be set here? */ 3284 config->sectPerSU = clabel->sectPerSU; 3285 config->SUsPerPU = clabel->SUsPerPU; 3286 config->SUsPerRU = clabel->SUsPerRU; 3287 config->parityConfig = clabel->parityConfig; 3288 /* XXX... */ 3289 strcpy(config->diskQueueType,"fifo"); 3290 config->maxOutstandingDiskReqs = clabel->maxOutstanding; 3291 config->layoutSpecificSize = 0; /* XXX ?? */ 3292 3293 while(ac!=NULL) { 3294 /* row/col values will be in range due to the checks 3295 in reasonable_label() */ 3296 strcpy(config->devnames[0][ac->clabel->column], 3297 ac->devname); 3298 ac = ac->next; 3299 } 3300 3301 for(i=0;i<RF_MAXDBGV;i++) { 3302 config->debugVars[i][0] = 0; 3303 } 3304 } 3305 3306 int 3307 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value) 3308 { 3309 RF_ComponentLabel_t *clabel; 3310 int column; 3311 int sparecol; 3312 3313 raidPtr->autoconfigure = new_value; 3314 3315 for(column=0; column<raidPtr->numCol; column++) { 3316 if (raidPtr->Disks[column].status == rf_ds_optimal) { 3317 clabel = raidget_component_label(raidPtr, column); 3318 clabel->autoconfigure = new_value; 3319 raidflush_component_label(raidPtr, column); 3320 } 3321 } 3322 for(column = 0; column < raidPtr->numSpare ; column++) { 3323 sparecol = raidPtr->numCol + column; 3324 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) { 3325 clabel = raidget_component_label(raidPtr, sparecol); 3326 clabel->autoconfigure = new_value; 3327 raidflush_component_label(raidPtr, sparecol); 3328 } 3329 } 3330 return(new_value); 3331 } 3332 3333 int 3334 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value) 3335 { 3336 RF_ComponentLabel_t *clabel; 3337 int column; 3338 int sparecol; 3339 3340 raidPtr->root_partition = new_value; 3341 for(column=0; column<raidPtr->numCol; column++) { 3342 if (raidPtr->Disks[column].status == rf_ds_optimal) { 3343 clabel = raidget_component_label(raidPtr, column); 3344 clabel->root_partition = new_value; 3345 raidflush_component_label(raidPtr, column); 3346 } 3347 } 3348 for(column = 0; column < raidPtr->numSpare ; column++) { 3349 sparecol = raidPtr->numCol + column; 3350 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) { 3351 clabel = raidget_component_label(raidPtr, sparecol); 3352 clabel->root_partition = new_value; 3353 raidflush_component_label(raidPtr, sparecol); 3354 } 3355 } 3356 return(new_value); 3357 } 3358 3359 void 3360 rf_release_all_vps(RF_ConfigSet_t *cset) 3361 { 3362 RF_AutoConfig_t *ac; 3363 3364 ac = cset->ac; 3365 while(ac!=NULL) { 3366 /* Close the vp, and give it back */ 3367 if (ac->vp) { 3368 vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY); 3369 VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED); 3370 vput(ac->vp); 3371 ac->vp = NULL; 3372 } 3373 ac = ac->next; 3374 } 3375 } 
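/*
 * Taken together, the helpers above form the autoconfiguration pipeline.
 * A simplified sketch of how a caller might string them together
 * (illustrative only; the real boot-time path also handles root-set
 * selection and additional error cases):
 *
 *	RF_AutoConfig_t *ac_list = rf_find_raid_components();
 *	RF_ConfigSet_t *cset = rf_create_auto_sets(ac_list);
 *	RF_ConfigSet_t *next;
 *
 *	for (; cset != NULL; cset = next) {
 *		next = cset->next;
 *		if (cset->ac->clabel->autoconfigure &&
 *		    rf_have_enough_components(cset)) {
 *			if (rf_auto_config_set(cset) == NULL)
 *				rf_release_all_vps(cset);   configure failed
 *		} else {
 *			rf_release_all_vps(cset);	    not configuring
 *		}
 *		rf_cleanup_config_set(cset);
 *	}
 *
 * rf_auto_config_set() is defined below; on success it consumes the
 * component vnodes, so they are only released on the failure paths.
 */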
3376 3377 3378 void 3379 rf_cleanup_config_set(RF_ConfigSet_t *cset) 3380 { 3381 RF_AutoConfig_t *ac; 3382 RF_AutoConfig_t *next_ac; 3383 3384 ac = cset->ac; 3385 while(ac!=NULL) { 3386 next_ac = ac->next; 3387 /* nuke the label */ 3388 free(ac->clabel, M_RAIDFRAME); 3389 /* cleanup the config structure */ 3390 free(ac, M_RAIDFRAME); 3391 /* "next.." */ 3392 ac = next_ac; 3393 } 3394 /* and, finally, nuke the config set */ 3395 free(cset, M_RAIDFRAME); 3396 } 3397 3398 3399 void 3400 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel) 3401 { 3402 /* current version number */ 3403 clabel->version = RF_COMPONENT_LABEL_VERSION; 3404 clabel->serial_number = raidPtr->serial_number; 3405 clabel->mod_counter = raidPtr->mod_counter; 3406 3407 clabel->num_rows = 1; 3408 clabel->num_columns = raidPtr->numCol; 3409 clabel->clean = RF_RAID_DIRTY; /* not clean */ 3410 clabel->status = rf_ds_optimal; /* "It's good!" */ 3411 3412 clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit; 3413 clabel->SUsPerPU = raidPtr->Layout.SUsPerPU; 3414 clabel->SUsPerRU = raidPtr->Layout.SUsPerRU; 3415 3416 clabel->blockSize = raidPtr->bytesPerSector; 3417 rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk); 3418 3419 /* XXX not portable */ 3420 clabel->parityConfig = raidPtr->Layout.map->parityConfig; 3421 clabel->maxOutstanding = raidPtr->maxOutstanding; 3422 clabel->autoconfigure = raidPtr->autoconfigure; 3423 clabel->root_partition = raidPtr->root_partition; 3424 clabel->last_unit = raidPtr->raidid; 3425 clabel->config_order = raidPtr->config_order; 3426 3427 #ifndef RF_NO_PARITY_MAP 3428 rf_paritymap_init_label(raidPtr->parity_map, clabel); 3429 #endif 3430 } 3431 3432 struct raid_softc * 3433 rf_auto_config_set(RF_ConfigSet_t *cset) 3434 { 3435 RF_Raid_t *raidPtr; 3436 RF_Config_t *config; 3437 int raidID; 3438 struct raid_softc *sc; 3439 3440 #ifdef DEBUG 3441 printf("RAID autoconfigure\n"); 3442 #endif 3443 3444 /* 1. Create a config structure */ 3445 config = malloc(sizeof(*config), M_RAIDFRAME, M_WAITOK|M_ZERO); 3446 3447 /* 3448 2. Figure out what RAID ID this one is supposed to live at 3449 See if we can get the same RAID dev that it was configured 3450 on last time.. 3451 */ 3452 3453 raidID = cset->ac->clabel->last_unit; 3454 for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0; 3455 sc = raidget(++raidID, false)) 3456 continue; 3457 #ifdef DEBUG 3458 printf("Configuring raid%d:\n",raidID); 3459 #endif 3460 3461 if (sc == NULL) 3462 sc = raidget(raidID, true); 3463 raidPtr = &sc->sc_r; 3464 3465 /* XXX all this stuff should be done SOMEWHERE ELSE! */ 3466 raidPtr->softc = sc; 3467 raidPtr->raidid = raidID; 3468 raidPtr->openings = RAIDOUTSTANDING; 3469 3470 /* 3. Build the configuration structure */ 3471 rf_create_configuration(cset->ac, config, raidPtr); 3472 3473 /* 4. Do the configuration */ 3474 if (rf_Configure(raidPtr, config, cset->ac) == 0) { 3475 raidinit(sc); 3476 3477 rf_markalldirty(raidPtr); 3478 raidPtr->autoconfigure = 1; /* XXX do this here? */ 3479 switch (cset->ac->clabel->root_partition) { 3480 case 1: /* Force Root */ 3481 case 2: /* Soft Root: root when boot partition part of raid */ 3482 /* 3483 * everything configured just fine. Make a note 3484 * that this set is eligible to be root, 3485 * or forced to be root 3486 */ 3487 cset->rootable = cset->ac->clabel->root_partition; 3488 /* XXX do this here? 
*/ 3489 raidPtr->root_partition = cset->rootable; 3490 break; 3491 default: 3492 break; 3493 } 3494 } else { 3495 raidput(sc); 3496 sc = NULL; 3497 } 3498 3499 /* 5. Cleanup */ 3500 free(config, M_RAIDFRAME); 3501 return sc; 3502 } 3503 3504 void 3505 rf_pool_init(struct pool *p, size_t size, const char *w_chan, 3506 size_t xmin, size_t xmax) 3507 { 3508 3509 pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO); 3510 pool_sethiwat(p, xmax); 3511 pool_prime(p, xmin); 3512 } 3513 3514 /* 3515 * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue 3516 * to see if there is IO pending and if that IO could possibly be done 3517 * for a given RAID set. Returns 0 if IO is waiting and can be done, 1 3518 * otherwise. 3519 * 3520 */ 3521 int 3522 rf_buf_queue_check(RF_Raid_t *raidPtr) 3523 { 3524 struct raid_softc *rs; 3525 struct dk_softc *dksc; 3526 3527 rs = raidPtr->softc; 3528 dksc = &rs->sc_dksc; 3529 3530 if ((rs->sc_flags & RAIDF_INITED) == 0) 3531 return 1; 3532 3533 if (dk_strategy_pending(dksc) && raidPtr->openings > 0) { 3534 /* there is work to do */ 3535 return 0; 3536 } 3537 /* default is nothing to do */ 3538 return 1; 3539 } 3540 3541 int 3542 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr) 3543 { 3544 uint64_t numsecs; 3545 unsigned secsize; 3546 int error; 3547 3548 error = getdisksize(vp, &numsecs, &secsize); 3549 if (error == 0) { 3550 diskPtr->blockSize = secsize; 3551 diskPtr->numBlocks = numsecs - rf_protectedSectors; 3552 diskPtr->partitionSize = numsecs; 3553 return 0; 3554 } 3555 return error; 3556 } 3557 3558 static int 3559 raid_match(device_t self, cfdata_t cfdata, void *aux) 3560 { 3561 return 1; 3562 } 3563 3564 static void 3565 raid_attach(device_t parent, device_t self, void *aux) 3566 { 3567 } 3568 3569 3570 static int 3571 raid_detach(device_t self, int flags) 3572 { 3573 int error; 3574 struct raid_softc *rs = raidsoftc(self); 3575 3576 if (rs == NULL) 3577 return ENXIO; 3578 3579 if ((error = raidlock(rs)) != 0) 3580 return error; 3581 3582 error = raid_detach_unlocked(rs); 3583 3584 raidunlock(rs); 3585 3586 /* XXX raid can be referenced here */ 3587 3588 if (error) 3589 return error; 3590 3591 /* Free the softc */ 3592 raidput(rs); 3593 3594 return 0; 3595 } 3596 3597 static void 3598 rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr) 3599 { 3600 struct dk_softc *dksc = &rs->sc_dksc; 3601 struct disk_geom *dg = &dksc->sc_dkdev.dk_geom; 3602 3603 memset(dg, 0, sizeof(*dg)); 3604 3605 dg->dg_secperunit = raidPtr->totalSectors; 3606 dg->dg_secsize = raidPtr->bytesPerSector; 3607 dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe; 3608 dg->dg_ntracks = 4 * raidPtr->numCol; 3609 3610 disk_set_info(dksc->sc_dev, &dksc->sc_dkdev, NULL); 3611 } 3612 3613 /* 3614 * Get cache info for all the components (including spares). 3615 * Returns intersection of all the cache flags of all disks, or first 3616 * error if any encountered. 3617 * XXXfua feature flags can change as spares are added - lock down somehow 3618 */ 3619 static int 3620 rf_get_component_caches(RF_Raid_t *raidPtr, int *data) 3621 { 3622 int c; 3623 int error; 3624 int dkwhole = 0, dkpart; 3625 3626 for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) { 3627 /* 3628 * Check any non-dead disk, even when currently being 3629 * reconstructed. 
3630 */ 3631 if (!RF_DEAD_DISK(raidPtr->Disks[c].status) 3632 || raidPtr->Disks[c].status == rf_ds_reconstructing) { 3633 error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, 3634 DIOCGCACHE, &dkpart, FREAD, NOCRED); 3635 if (error) { 3636 if (error != ENODEV) { 3637 printf("raid%d: get cache for component %s failed\n", 3638 raidPtr->raidid, 3639 raidPtr->Disks[c].devname); 3640 } 3641 3642 return error; 3643 } 3644 3645 if (c == 0) 3646 dkwhole = dkpart; 3647 else 3648 dkwhole = DKCACHE_COMBINE(dkwhole, dkpart); 3649 } 3650 } 3651 3652 *data = dkwhole; 3653 3654 return 0; 3655 } 3656 3657 /* 3658 * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components. 3659 * We end up returning whatever error was returned by the first cache flush 3660 * that fails. 3661 */ 3662 3663 static int 3664 rf_sync_component_cache(RF_Raid_t *raidPtr, int c) 3665 { 3666 int force = 1; 3667 int e = 0; 3668 for (int i = 0; i < 5; i++) { 3669 e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC, 3670 &force, FWRITE, NOCRED); 3671 if (!e || e == ENODEV) 3672 return e; 3673 printf("raid%d: cache flush[%d] to component %s failed (%d)\n", 3674 raidPtr->raidid, i, raidPtr->Disks[c].devname, e); 3675 } 3676 return e; 3677 } 3678 3679 int 3680 rf_sync_component_caches(RF_Raid_t *raidPtr) 3681 { 3682 int c, error; 3683 3684 error = 0; 3685 for (c = 0; c < raidPtr->numCol; c++) { 3686 if (raidPtr->Disks[c].status == rf_ds_optimal) { 3687 int e = rf_sync_component_cache(raidPtr, c); 3688 if (e && !error) 3689 error = e; 3690 } 3691 } 3692 3693 for (c = 0; c < raidPtr->numSpare ; c++) { 3694 int sparecol = raidPtr->numCol + c; 3695 /* Need to ensure that the reconstruct actually completed! */ 3696 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) { 3697 int e = rf_sync_component_cache(raidPtr, sparecol); 3698 if (e && !error) 3699 error = e; 3700 } 3701 } 3702 return error; 3703 } 3704 3705 /* Fill in info with the current status */ 3706 void 3707 rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info) 3708 { 3709 3710 if (raidPtr->status != rf_rs_reconstructing) { 3711 info->total = 100; 3712 info->completed = 100; 3713 } else { 3714 info->total = raidPtr->reconControl->numRUsTotal; 3715 info->completed = raidPtr->reconControl->numRUsComplete; 3716 } 3717 info->remaining = info->total - info->completed; 3718 } 3719 3720 /* Fill in info with the current status */ 3721 void 3722 rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info) 3723 { 3724 3725 if (raidPtr->parity_rewrite_in_progress == 1) { 3726 info->total = raidPtr->Layout.numStripe; 3727 info->completed = raidPtr->parity_rewrite_stripes_done; 3728 } else { 3729 info->completed = 100; 3730 info->total = 100; 3731 } 3732 info->remaining = info->total - info->completed; 3733 } 3734 3735 /* Fill in info with the current status */ 3736 void 3737 rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info) 3738 { 3739 3740 if (raidPtr->copyback_in_progress == 1) { 3741 info->total = raidPtr->Layout.numStripe; 3742 info->completed = raidPtr->copyback_stripes_done; 3743 info->remaining = info->total - info->completed; 3744 } else { 3745 info->remaining = 0; 3746 info->completed = 100; 3747 info->total = 100; 3748 } 3749 } 3750 3751 /* Fill in config with the current info */ 3752 int 3753 rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config) 3754 { 3755 int d, i, j; 3756 3757 if (!raidPtr->valid) 3758 return ENODEV; 3759 config->cols = raidPtr->numCol; 3760 config->ndevs = raidPtr->numCol; 3761 
if (config->ndevs >= RF_MAX_DISKS) 3762 return ENOMEM; 3763 config->nspares = raidPtr->numSpare; 3764 if (config->nspares >= RF_MAX_DISKS) 3765 return ENOMEM; 3766 config->maxqdepth = raidPtr->maxQueueDepth; 3767 d = 0; 3768 for (j = 0; j < config->cols; j++) { 3769 config->devs[d] = raidPtr->Disks[j]; 3770 d++; 3771 } 3772 for (j = config->cols, i = 0; i < config->nspares; i++, j++) { 3773 config->spares[i] = raidPtr->Disks[j]; 3774 if (config->spares[i].status == rf_ds_rebuilding_spare) { 3775 /* XXX: raidctl(8) expects to see this as a used spare */ 3776 config->spares[i].status = rf_ds_used_spare; 3777 } 3778 } 3779 return 0; 3780 } 3781 3782 int 3783 rf_get_component_label(RF_Raid_t *raidPtr, void *data) 3784 { 3785 RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data; 3786 RF_ComponentLabel_t *raid_clabel; 3787 int column = clabel->column; 3788 3789 if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare)) 3790 return EINVAL; 3791 raid_clabel = raidget_component_label(raidPtr, column); 3792 memcpy(clabel, raid_clabel, sizeof *clabel); 3793 3794 return 0; 3795 } 3796 3797 /* 3798 * Module interface 3799 */ 3800 3801 MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs"); 3802 3803 #ifdef _MODULE 3804 CFDRIVER_DECL(raid, DV_DISK, NULL); 3805 #endif 3806 3807 static int raid_modcmd(modcmd_t, void *); 3808 static int raid_modcmd_init(void); 3809 static int raid_modcmd_fini(void); 3810 3811 static int 3812 raid_modcmd(modcmd_t cmd, void *data) 3813 { 3814 int error; 3815 3816 error = 0; 3817 switch (cmd) { 3818 case MODULE_CMD_INIT: 3819 error = raid_modcmd_init(); 3820 break; 3821 case MODULE_CMD_FINI: 3822 error = raid_modcmd_fini(); 3823 break; 3824 default: 3825 error = ENOTTY; 3826 break; 3827 } 3828 return error; 3829 } 3830 3831 static int 3832 raid_modcmd_init(void) 3833 { 3834 int error; 3835 int bmajor, cmajor; 3836 3837 mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE); 3838 mutex_enter(&raid_lock); 3839 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0) 3840 rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM); 3841 rf_init_cond2(rf_sparet_wait_cv, "sparetw"); 3842 rf_init_cond2(rf_sparet_resp_cv, "rfgst"); 3843 3844 rf_sparet_wait_queue = rf_sparet_resp_queue = NULL; 3845 #endif 3846 3847 bmajor = cmajor = -1; 3848 error = devsw_attach("raid", &raid_bdevsw, &bmajor, 3849 &raid_cdevsw, &cmajor); 3850 if (error != 0 && error != EEXIST) { 3851 aprint_error("%s: devsw_attach failed %d\n", __func__, error); 3852 mutex_exit(&raid_lock); 3853 return error; 3854 } 3855 #ifdef _MODULE 3856 error = config_cfdriver_attach(&raid_cd); 3857 if (error != 0) { 3858 aprint_error("%s: config_cfdriver_attach failed %d\n", 3859 __func__, error); 3860 devsw_detach(&raid_bdevsw, &raid_cdevsw); 3861 mutex_exit(&raid_lock); 3862 return error; 3863 } 3864 #endif 3865 error = config_cfattach_attach(raid_cd.cd_name, &raid_ca); 3866 if (error != 0) { 3867 aprint_error("%s: config_cfattach_attach failed %d\n", 3868 __func__, error); 3869 #ifdef _MODULE 3870 config_cfdriver_detach(&raid_cd); 3871 #endif 3872 devsw_detach(&raid_bdevsw, &raid_cdevsw); 3873 mutex_exit(&raid_lock); 3874 return error; 3875 } 3876 3877 raidautoconfigdone = false; 3878 3879 mutex_exit(&raid_lock); 3880 3881 if (error == 0) { 3882 if (rf_BootRaidframe(true) == 0) 3883 aprint_verbose("Kernelized RAIDframe activated\n"); 3884 else 3885 panic("Serious error activating RAID!!"); 3886 } 3887 3888 /* 3889 * Register a finalizer which will be used to auto-config RAID 3890 * sets once all real hardware devices have been found. 
3891 */ 3892 error = config_finalize_register(NULL, rf_autoconfig); 3893 if (error != 0) { 3894 aprint_error("WARNING: unable to register RAIDframe " 3895 "finalizer\n"); 3896 error = 0; 3897 } 3898 3899 return error; 3900 } 3901 3902 static int 3903 raid_modcmd_fini(void) 3904 { 3905 int error; 3906 3907 mutex_enter(&raid_lock); 3908 3909 /* Don't allow unload if raid device(s) exist. */ 3910 if (!LIST_EMPTY(&raids)) { 3911 mutex_exit(&raid_lock); 3912 return EBUSY; 3913 } 3914 3915 error = config_cfattach_detach(raid_cd.cd_name, &raid_ca); 3916 if (error != 0) { 3917 aprint_error("%s: cannot detach cfattach\n",__func__); 3918 mutex_exit(&raid_lock); 3919 return error; 3920 } 3921 #ifdef _MODULE 3922 error = config_cfdriver_detach(&raid_cd); 3923 if (error != 0) { 3924 aprint_error("%s: cannot detach cfdriver\n",__func__); 3925 config_cfattach_attach(raid_cd.cd_name, &raid_ca); 3926 mutex_exit(&raid_lock); 3927 return error; 3928 } 3929 #endif 3930 error = devsw_detach(&raid_bdevsw, &raid_cdevsw); 3931 if (error != 0) { 3932 aprint_error("%s: cannot detach devsw\n",__func__); 3933 #ifdef _MODULE 3934 config_cfdriver_attach(&raid_cd); 3935 #endif 3936 config_cfattach_attach(raid_cd.cd_name, &raid_ca); 3937 mutex_exit(&raid_lock); 3938 return error; 3939 } 3940 rf_BootRaidframe(false); 3941 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0) 3942 rf_destroy_mutex2(rf_sparet_wait_mutex); 3943 rf_destroy_cond2(rf_sparet_wait_cv); 3944 rf_destroy_cond2(rf_sparet_resp_cv); 3945 #endif 3946 mutex_exit(&raid_lock); 3947 mutex_destroy(&raid_lock); 3948 3949 return error; 3950 } 3951