1 /* $NetBSD: rf_netbsdkintf.c,v 1.376 2019/03/01 11:06:56 pgoyette Exp $ */ 2 3 /*- 4 * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Greg Oster; Jason R. Thorpe. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * Copyright (c) 1988 University of Utah. 34 * Copyright (c) 1990, 1993 35 * The Regents of the University of California. All rights reserved. 36 * 37 * This code is derived from software contributed to Berkeley by 38 * the Systems Programming Group of the University of Utah Computer 39 * Science Department. 
40 * 41 * Redistribution and use in source and binary forms, with or without 42 * modification, are permitted provided that the following conditions 43 * are met: 44 * 1. Redistributions of source code must retain the above copyright 45 * notice, this list of conditions and the following disclaimer. 46 * 2. Redistributions in binary form must reproduce the above copyright 47 * notice, this list of conditions and the following disclaimer in the 48 * documentation and/or other materials provided with the distribution. 49 * 3. Neither the name of the University nor the names of its contributors 50 * may be used to endorse or promote products derived from this software 51 * without specific prior written permission. 52 * 53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 63 * SUCH DAMAGE. 64 * 65 * from: Utah $Hdr: cd.c 1.6 90/11/28$ 66 * 67 * @(#)cd.c 8.2 (Berkeley) 11/16/93 68 */ 69 70 /* 71 * Copyright (c) 1995 Carnegie-Mellon University. 72 * All rights reserved. 
73 * 74 * Authors: Mark Holland, Jim Zelenka 75 * 76 * Permission to use, copy, modify and distribute this software and 77 * its documentation is hereby granted, provided that both the copyright 78 * notice and this permission notice appear in all copies of the 79 * software, derivative works or modified versions, and any portions 80 * thereof, and that both notices appear in supporting documentation. 81 * 82 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 83 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 84 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 85 * 86 * Carnegie Mellon requests users of this software to return to 87 * 88 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 89 * School of Computer Science 90 * Carnegie Mellon University 91 * Pittsburgh PA 15213-3890 92 * 93 * any improvements or extensions that they make and grant Carnegie the 94 * rights to redistribute these changes. 95 */ 96 97 /*********************************************************** 98 * 99 * rf_kintf.c -- the kernel interface routines for RAIDframe 100 * 101 ***********************************************************/ 102 103 #include <sys/cdefs.h> 104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.376 2019/03/01 11:06:56 pgoyette Exp $"); 105 106 #ifdef _KERNEL_OPT 107 #include "opt_raid_autoconfig.h" 108 #include "opt_compat_netbsd32.h" 109 #endif 110 111 #include <sys/param.h> 112 #include <sys/errno.h> 113 #include <sys/pool.h> 114 #include <sys/proc.h> 115 #include <sys/queue.h> 116 #include <sys/disk.h> 117 #include <sys/device.h> 118 #include <sys/stat.h> 119 #include <sys/ioctl.h> 120 #include <sys/fcntl.h> 121 #include <sys/systm.h> 122 #include <sys/vnode.h> 123 #include <sys/disklabel.h> 124 #include <sys/conf.h> 125 #include <sys/buf.h> 126 #include <sys/bufq.h> 127 #include <sys/reboot.h> 128 #include <sys/kauth.h> 129 #include <sys/module.h> 130 #include <sys/compat_stub.h> 131 132 
#include <prop/proplib.h> 133 134 #include <dev/raidframe/raidframevar.h> 135 #include <dev/raidframe/raidframeio.h> 136 #include <dev/raidframe/rf_paritymap.h> 137 138 #include "rf_raid.h" 139 #include "rf_copyback.h" 140 #include "rf_dag.h" 141 #include "rf_dagflags.h" 142 #include "rf_desc.h" 143 #include "rf_diskqueue.h" 144 #include "rf_etimer.h" 145 #include "rf_general.h" 146 #include "rf_kintf.h" 147 #include "rf_options.h" 148 #include "rf_driver.h" 149 #include "rf_parityscan.h" 150 #include "rf_threadstuff.h" 151 152 #include "ioconf.h" 153 154 #ifdef DEBUG 155 int rf_kdebug_level = 0; 156 #define db1_printf(a) if (rf_kdebug_level > 0) printf a 157 #else /* DEBUG */ 158 #define db1_printf(a) { } 159 #endif /* DEBUG */ 160 161 #ifdef DEBUG_ROOT 162 #define DPRINTF(a, ...) printf(a, __VA_ARGS__) 163 #else 164 #define DPRINTF(a, ...) 165 #endif 166 167 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0) 168 static rf_declare_mutex2(rf_sparet_wait_mutex); 169 static rf_declare_cond2(rf_sparet_wait_cv); 170 static rf_declare_cond2(rf_sparet_resp_cv); 171 172 static RF_SparetWait_t *rf_sparet_wait_queue; /* requests to install a 173 * spare table */ 174 static RF_SparetWait_t *rf_sparet_resp_queue; /* responses from 175 * installation process */ 176 #endif 177 178 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures"); 179 180 /* prototypes */ 181 static void KernelWakeupFunc(struct buf *); 182 static void InitBP(struct buf *, struct vnode *, unsigned, 183 dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *), 184 void *, int, struct proc *); 185 static void raidinit(struct raid_softc *); 186 static int raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp); 187 static int rf_get_component_caches(RF_Raid_t *raidPtr, int *); 188 189 static int raid_match(device_t, cfdata_t, void *); 190 static void raid_attach(device_t, device_t, void *); 191 static int raid_detach(device_t, int); 192 193 static int raidread_component_area(dev_t, struct vnode *, 
void *, size_t, 194 daddr_t, daddr_t); 195 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t, 196 daddr_t, daddr_t, int); 197 198 static int raidwrite_component_label(unsigned, 199 dev_t, struct vnode *, RF_ComponentLabel_t *); 200 static int raidread_component_label(unsigned, 201 dev_t, struct vnode *, RF_ComponentLabel_t *); 202 203 static int raid_diskstart(device_t, struct buf *bp); 204 static int raid_dumpblocks(device_t, void *, daddr_t, int); 205 static int raid_lastclose(device_t); 206 207 static dev_type_open(raidopen); 208 static dev_type_close(raidclose); 209 static dev_type_read(raidread); 210 static dev_type_write(raidwrite); 211 static dev_type_ioctl(raidioctl); 212 static dev_type_strategy(raidstrategy); 213 static dev_type_dump(raiddump); 214 static dev_type_size(raidsize); 215 216 const struct bdevsw raid_bdevsw = { 217 .d_open = raidopen, 218 .d_close = raidclose, 219 .d_strategy = raidstrategy, 220 .d_ioctl = raidioctl, 221 .d_dump = raiddump, 222 .d_psize = raidsize, 223 .d_discard = nodiscard, 224 .d_flag = D_DISK 225 }; 226 227 const struct cdevsw raid_cdevsw = { 228 .d_open = raidopen, 229 .d_close = raidclose, 230 .d_read = raidread, 231 .d_write = raidwrite, 232 .d_ioctl = raidioctl, 233 .d_stop = nostop, 234 .d_tty = notty, 235 .d_poll = nopoll, 236 .d_mmap = nommap, 237 .d_kqfilter = nokqfilter, 238 .d_discard = nodiscard, 239 .d_flag = D_DISK 240 }; 241 242 static struct dkdriver rf_dkdriver = { 243 .d_open = raidopen, 244 .d_close = raidclose, 245 .d_strategy = raidstrategy, 246 .d_diskstart = raid_diskstart, 247 .d_dumpblocks = raid_dumpblocks, 248 .d_lastclose = raid_lastclose, 249 .d_minphys = minphys 250 }; 251 252 #define raidunit(x) DISKUNIT(x) 253 #define raidsoftc(dev) (((struct raid_softc *)device_private(dev))->sc_r.softc) 254 255 extern struct cfdriver raid_cd; 256 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc), 257 raid_match, raid_attach, raid_detach, NULL, NULL, NULL, 258 DVF_DETACH_SHUTDOWN); 
/* Internal representation of a rf_recon_req.
 * rf_fail_disk() copies the user-supplied rf_recon_req into one of these
 * so the reconstruction thread never touches the user's buffer. */
struct rf_recon_req_internal {
	RF_RowCol_t col;		/* component column to fail/reconstruct */
	RF_ReconReqFlags_t flags;	/* copied from the user request */
	void   *raidPtr;		/* owning RF_Raid_t */
};

/*
 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
 * Be aware that large numbers can allow the driver to consume a lot of
 * kernel memory, especially on writes, and in degraded mode reads.
 *
 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
 * a single 64K write will typically require 64K for the old data,
 * 64K for the old parity, and 64K for the new parity, for a total
 * of 192K (if the parity buffer is not re-used immediately).
 * Even if it is used immediately, that's still 128K, which when multiplied
 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
 *
 * Now in degraded mode, for example, a 64K read on the above setup may
 * require data reconstruction, which will require *all* of the 4 remaining
 * disks to participate -- 4 * 32K/disk == 128K again.
 */

#ifndef RAIDOUTSTANDING
#define RAIDOUTSTANDING   6
#endif

/* The raw-partition block device used for reading/writing disklabels. */
#define RAIDLABELDEV(dev)	\
	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))

/* declared here, and made public, for the benefit of KVM stuff..
 */

static int raidlock(struct raid_softc *);
static void raidunlock(struct raid_softc *);

static int raid_detach_unlocked(struct raid_softc *);

static void rf_markalldirty(RF_Raid_t *);
static void rf_set_geometry(struct raid_softc *, RF_Raid_t *);

void rf_ReconThread(struct rf_recon_req_internal *);
void rf_RewriteParityThread(RF_Raid_t *raidPtr);
void rf_CopybackThread(RF_Raid_t *raidPtr);
void rf_ReconstructInPlaceThread(struct rf_recon_req_internal *);
int rf_autoconfig(device_t);
void rf_buildroothack(RF_ConfigSet_t *);

RF_AutoConfig_t *rf_find_raid_components(void);
RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
int rf_set_autoconfig(RF_Raid_t *, int);
int rf_set_rootpartition(RF_Raid_t *, int);
void rf_release_all_vps(RF_ConfigSet_t *);
void rf_cleanup_config_set(RF_ConfigSet_t *);
int rf_have_enough_components(RF_ConfigSet_t *);
struct raid_softc *rf_auto_config_set(RF_ConfigSet_t *);
static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);

/*
 * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
 * Note that this is overridden by having RAID_AUTOCONFIG as an option
 * in the kernel config file.
325 */ 326 #ifdef RAID_AUTOCONFIG 327 int raidautoconfig = 1; 328 #else 329 int raidautoconfig = 0; 330 #endif 331 static bool raidautoconfigdone = false; 332 333 struct RF_Pools_s rf_pools; 334 335 static LIST_HEAD(, raid_softc) raids = LIST_HEAD_INITIALIZER(raids); 336 static kmutex_t raid_lock; 337 338 static struct raid_softc * 339 raidcreate(int unit) { 340 struct raid_softc *sc = kmem_zalloc(sizeof(*sc), KM_SLEEP); 341 sc->sc_unit = unit; 342 cv_init(&sc->sc_cv, "raidunit"); 343 mutex_init(&sc->sc_mutex, MUTEX_DEFAULT, IPL_NONE); 344 return sc; 345 } 346 347 static void 348 raiddestroy(struct raid_softc *sc) { 349 cv_destroy(&sc->sc_cv); 350 mutex_destroy(&sc->sc_mutex); 351 kmem_free(sc, sizeof(*sc)); 352 } 353 354 static struct raid_softc * 355 raidget(int unit, bool create) { 356 struct raid_softc *sc; 357 if (unit < 0) { 358 #ifdef DIAGNOSTIC 359 panic("%s: unit %d!", __func__, unit); 360 #endif 361 return NULL; 362 } 363 mutex_enter(&raid_lock); 364 LIST_FOREACH(sc, &raids, sc_link) { 365 if (sc->sc_unit == unit) { 366 mutex_exit(&raid_lock); 367 return sc; 368 } 369 } 370 mutex_exit(&raid_lock); 371 if (!create) 372 return NULL; 373 if ((sc = raidcreate(unit)) == NULL) 374 return NULL; 375 mutex_enter(&raid_lock); 376 LIST_INSERT_HEAD(&raids, sc, sc_link); 377 mutex_exit(&raid_lock); 378 return sc; 379 } 380 381 static void 382 raidput(struct raid_softc *sc) { 383 mutex_enter(&raid_lock); 384 LIST_REMOVE(sc, sc_link); 385 mutex_exit(&raid_lock); 386 raiddestroy(sc); 387 } 388 389 void 390 raidattach(int num) 391 { 392 393 /* 394 * Device attachment and associated initialization now occurs 395 * as part of the module initialization. 396 */ 397 } 398 399 int 400 rf_autoconfig(device_t self) 401 { 402 RF_AutoConfig_t *ac_list; 403 RF_ConfigSet_t *config_sets; 404 405 if (!raidautoconfig || raidautoconfigdone == true) 406 return (0); 407 408 /* XXX This code can only be run once. 
*/ 409 raidautoconfigdone = true; 410 411 #ifdef __HAVE_CPU_BOOTCONF 412 /* 413 * 0. find the boot device if needed first so we can use it later 414 * this needs to be done before we autoconfigure any raid sets, 415 * because if we use wedges we are not going to be able to open 416 * the boot device later 417 */ 418 if (booted_device == NULL) 419 cpu_bootconf(); 420 #endif 421 /* 1. locate all RAID components on the system */ 422 aprint_debug("Searching for RAID components...\n"); 423 ac_list = rf_find_raid_components(); 424 425 /* 2. Sort them into their respective sets. */ 426 config_sets = rf_create_auto_sets(ac_list); 427 428 /* 429 * 3. Evaluate each set and configure the valid ones. 430 * This gets done in rf_buildroothack(). 431 */ 432 rf_buildroothack(config_sets); 433 434 return 1; 435 } 436 437 int 438 rf_inited(const struct raid_softc *rs) { 439 return (rs->sc_flags & RAIDF_INITED) != 0; 440 } 441 442 RF_Raid_t * 443 rf_get_raid(struct raid_softc *rs) { 444 return &rs->sc_r; 445 } 446 447 int 448 rf_get_unit(const struct raid_softc *rs) { 449 return rs->sc_unit; 450 } 451 452 static int 453 rf_containsboot(RF_Raid_t *r, device_t bdv) { 454 const char *bootname; 455 size_t len; 456 457 /* if bdv is NULL, the set can't contain it. exit early. 
*/ 458 if (bdv == NULL) 459 return 0; 460 461 bootname = device_xname(bdv); 462 len = strlen(bootname); 463 464 for (int col = 0; col < r->numCol; col++) { 465 const char *devname = r->Disks[col].devname; 466 devname += sizeof("/dev/") - 1; 467 if (strncmp(devname, "dk", 2) == 0) { 468 const char *parent = 469 dkwedge_get_parent_name(r->Disks[col].dev); 470 if (parent != NULL) 471 devname = parent; 472 } 473 if (strncmp(devname, bootname, len) == 0) { 474 struct raid_softc *sc = r->softc; 475 aprint_debug("raid%d includes boot device %s\n", 476 sc->sc_unit, devname); 477 return 1; 478 } 479 } 480 return 0; 481 } 482 483 void 484 rf_buildroothack(RF_ConfigSet_t *config_sets) 485 { 486 RF_ConfigSet_t *cset; 487 RF_ConfigSet_t *next_cset; 488 int num_root; 489 struct raid_softc *sc, *rsc; 490 struct dk_softc *dksc; 491 492 sc = rsc = NULL; 493 num_root = 0; 494 cset = config_sets; 495 while (cset != NULL) { 496 next_cset = cset->next; 497 if (rf_have_enough_components(cset) && 498 cset->ac->clabel->autoconfigure == 1) { 499 sc = rf_auto_config_set(cset); 500 if (sc != NULL) { 501 aprint_debug("raid%d: configured ok, rootable %d\n", 502 sc->sc_unit, cset->rootable); 503 if (cset->rootable) { 504 rsc = sc; 505 num_root++; 506 } 507 } else { 508 /* The autoconfig didn't work :( */ 509 aprint_debug("Autoconfig failed\n"); 510 rf_release_all_vps(cset); 511 } 512 } else { 513 /* we're not autoconfiguring this set... 514 release the associated resources */ 515 rf_release_all_vps(cset); 516 } 517 /* cleanup */ 518 rf_cleanup_config_set(cset); 519 cset = next_cset; 520 } 521 dksc = &rsc->sc_dksc; 522 523 /* if the user has specified what the root device should be 524 then we don't touch booted_device or boothowto... */ 525 526 if (rootspec != NULL) { 527 DPRINTF("%s: rootspec %s\n", __func__, rootspec); 528 return; 529 } 530 531 /* we found something bootable... */ 532 533 /* 534 * XXX: The following code assumes that the root raid 535 * is the first ('a') partition. 
This is about the best 536 * we can do with a BSD disklabel, but we might be able 537 * to do better with a GPT label, by setting a specified 538 * attribute to indicate the root partition. We can then 539 * stash the partition number in the r->root_partition 540 * high bits (the bottom 2 bits are already used). For 541 * now we just set booted_partition to 0 when we override 542 * root. 543 */ 544 if (num_root == 1) { 545 device_t candidate_root; 546 if (dksc->sc_dkdev.dk_nwedges != 0) { 547 char cname[sizeof(cset->ac->devname)]; 548 /* XXX: assume partition 'a' first */ 549 snprintf(cname, sizeof(cname), "%s%c", 550 device_xname(dksc->sc_dev), 'a'); 551 candidate_root = dkwedge_find_by_wname(cname); 552 DPRINTF("%s: candidate wedge root=%s\n", __func__, 553 cname); 554 if (candidate_root == NULL) { 555 /* 556 * If that is not found, because we don't use 557 * disklabel, return the first dk child 558 * XXX: we can skip the 'a' check above 559 * and always do this... 560 */ 561 size_t i = 0; 562 candidate_root = dkwedge_find_by_parent( 563 device_xname(dksc->sc_dev), &i); 564 } 565 DPRINTF("%s: candidate wedge root=%p\n", __func__, 566 candidate_root); 567 } else 568 candidate_root = dksc->sc_dev; 569 DPRINTF("%s: candidate root=%p\n", __func__, candidate_root); 570 DPRINTF("%s: booted_device=%p root_partition=%d " 571 "contains_boot=%d", 572 __func__, booted_device, rsc->sc_r.root_partition, 573 rf_containsboot(&rsc->sc_r, booted_device)); 574 /* XXX the check for booted_device == NULL can probably be 575 * dropped, now that rf_containsboot handles that case. 576 */ 577 if (booted_device == NULL || 578 rsc->sc_r.root_partition == 1 || 579 rf_containsboot(&rsc->sc_r, booted_device)) { 580 booted_device = candidate_root; 581 booted_method = "raidframe/single"; 582 booted_partition = 0; /* XXX assume 'a' */ 583 } 584 } else if (num_root > 1) { 585 DPRINTF("%s: many roots=%d, %p\n", __func__, num_root, 586 booted_device); 587 588 /* 589 * Maybe the MD code can help. 
If it cannot, then 590 * setroot() will discover that we have no 591 * booted_device and will ask the user if nothing was 592 * hardwired in the kernel config file 593 */ 594 if (booted_device == NULL) 595 return; 596 597 num_root = 0; 598 mutex_enter(&raid_lock); 599 LIST_FOREACH(sc, &raids, sc_link) { 600 RF_Raid_t *r = &sc->sc_r; 601 if (r->valid == 0) 602 continue; 603 604 if (r->root_partition == 0) 605 continue; 606 607 if (rf_containsboot(r, booted_device)) { 608 num_root++; 609 rsc = sc; 610 dksc = &rsc->sc_dksc; 611 } 612 } 613 mutex_exit(&raid_lock); 614 615 if (num_root == 1) { 616 booted_device = dksc->sc_dev; 617 booted_method = "raidframe/multi"; 618 booted_partition = 0; /* XXX assume 'a' */ 619 } else { 620 /* we can't guess.. require the user to answer... */ 621 boothowto |= RB_ASKNAME; 622 } 623 } 624 } 625 626 static int 627 raidsize(dev_t dev) 628 { 629 struct raid_softc *rs; 630 struct dk_softc *dksc; 631 unsigned int unit; 632 633 unit = raidunit(dev); 634 if ((rs = raidget(unit, false)) == NULL) 635 return -1; 636 dksc = &rs->sc_dksc; 637 638 if ((rs->sc_flags & RAIDF_INITED) == 0) 639 return -1; 640 641 return dk_size(dksc, dev); 642 } 643 644 static int 645 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size) 646 { 647 unsigned int unit; 648 struct raid_softc *rs; 649 struct dk_softc *dksc; 650 651 unit = raidunit(dev); 652 if ((rs = raidget(unit, false)) == NULL) 653 return ENXIO; 654 dksc = &rs->sc_dksc; 655 656 if ((rs->sc_flags & RAIDF_INITED) == 0) 657 return ENODEV; 658 659 /* 660 Note that blkno is relative to this particular partition. 661 By adding adding RF_PROTECTED_SECTORS, we get a value that 662 is relative to the partition used for the underlying component. 
663 */ 664 blkno += RF_PROTECTED_SECTORS; 665 666 return dk_dump(dksc, dev, blkno, va, size); 667 } 668 669 static int 670 raid_dumpblocks(device_t dev, void *va, daddr_t blkno, int nblk) 671 { 672 struct raid_softc *rs = raidsoftc(dev); 673 const struct bdevsw *bdev; 674 RF_Raid_t *raidPtr; 675 int c, sparecol, j, scol, dumpto; 676 int error = 0; 677 678 raidPtr = &rs->sc_r; 679 680 /* we only support dumping to RAID 1 sets */ 681 if (raidPtr->Layout.numDataCol != 1 || 682 raidPtr->Layout.numParityCol != 1) 683 return EINVAL; 684 685 if ((error = raidlock(rs)) != 0) 686 return error; 687 688 /* figure out what device is alive.. */ 689 690 /* 691 Look for a component to dump to. The preference for the 692 component to dump to is as follows: 693 1) the master 694 2) a used_spare of the master 695 3) the slave 696 4) a used_spare of the slave 697 */ 698 699 dumpto = -1; 700 for (c = 0; c < raidPtr->numCol; c++) { 701 if (raidPtr->Disks[c].status == rf_ds_optimal) { 702 /* this might be the one */ 703 dumpto = c; 704 break; 705 } 706 } 707 708 /* 709 At this point we have possibly selected a live master or a 710 live slave. We now check to see if there is a spared 711 master (or a spared slave), if we didn't find a live master 712 or a live slave. 713 */ 714 715 for (c = 0; c < raidPtr->numSpare; c++) { 716 sparecol = raidPtr->numCol + c; 717 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) { 718 /* How about this one? */ 719 scol = -1; 720 for(j=0;j<raidPtr->numCol;j++) { 721 if (raidPtr->Disks[j].spareCol == sparecol) { 722 scol = j; 723 break; 724 } 725 } 726 if (scol == 0) { 727 /* 728 We must have found a spared master! 729 We'll take that over anything else 730 found so far. (We couldn't have 731 found a real master before, since 732 this is a used spare, and it's 733 saying that it's replacing the 734 master.) On reboot (with 735 autoconfiguration turned on) 736 sparecol will become the 1st 737 component (component0) of this set. 
738 */ 739 dumpto = sparecol; 740 break; 741 } else if (scol != -1) { 742 /* 743 Must be a spared slave. We'll dump 744 to that if we havn't found anything 745 else so far. 746 */ 747 if (dumpto == -1) 748 dumpto = sparecol; 749 } 750 } 751 } 752 753 if (dumpto == -1) { 754 /* we couldn't find any live components to dump to!?!? 755 */ 756 error = EINVAL; 757 goto out; 758 } 759 760 bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev); 761 if (bdev == NULL) { 762 error = ENXIO; 763 goto out; 764 } 765 766 error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev, 767 blkno, va, nblk * raidPtr->bytesPerSector); 768 769 out: 770 raidunlock(rs); 771 772 return error; 773 } 774 775 /* ARGSUSED */ 776 static int 777 raidopen(dev_t dev, int flags, int fmt, 778 struct lwp *l) 779 { 780 int unit = raidunit(dev); 781 struct raid_softc *rs; 782 struct dk_softc *dksc; 783 int error = 0; 784 int part, pmask; 785 786 if ((rs = raidget(unit, true)) == NULL) 787 return ENXIO; 788 if ((error = raidlock(rs)) != 0) 789 return (error); 790 791 if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) { 792 error = EBUSY; 793 goto bad; 794 } 795 796 dksc = &rs->sc_dksc; 797 798 part = DISKPART(dev); 799 pmask = (1 << part); 800 801 if (!DK_BUSY(dksc, pmask) && 802 ((rs->sc_flags & RAIDF_INITED) != 0)) { 803 /* First one... mark things as dirty... Note that we *MUST* 804 have done a configure before this. I DO NOT WANT TO BE 805 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED 806 THAT THEY BELONG TOGETHER!!!!! */ 807 /* XXX should check to see if we're only open for reading 808 here... If so, we needn't do this, but then need some 809 other way of keeping track of what's happened.. 
*/ 810 811 rf_markalldirty(&rs->sc_r); 812 } 813 814 if ((rs->sc_flags & RAIDF_INITED) != 0) 815 error = dk_open(dksc, dev, flags, fmt, l); 816 817 bad: 818 raidunlock(rs); 819 820 return (error); 821 822 823 } 824 825 static int 826 raid_lastclose(device_t self) 827 { 828 struct raid_softc *rs = raidsoftc(self); 829 830 /* Last one... device is not unconfigured yet. 831 Device shutdown has taken care of setting the 832 clean bits if RAIDF_INITED is not set 833 mark things as clean... */ 834 835 rf_update_component_labels(&rs->sc_r, 836 RF_FINAL_COMPONENT_UPDATE); 837 838 /* pass to unlocked code */ 839 if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) 840 rs->sc_flags |= RAIDF_DETACH; 841 842 return 0; 843 } 844 845 /* ARGSUSED */ 846 static int 847 raidclose(dev_t dev, int flags, int fmt, struct lwp *l) 848 { 849 int unit = raidunit(dev); 850 struct raid_softc *rs; 851 struct dk_softc *dksc; 852 cfdata_t cf; 853 int error = 0, do_detach = 0, do_put = 0; 854 855 if ((rs = raidget(unit, false)) == NULL) 856 return ENXIO; 857 dksc = &rs->sc_dksc; 858 859 if ((error = raidlock(rs)) != 0) 860 return (error); 861 862 if ((rs->sc_flags & RAIDF_INITED) != 0) { 863 error = dk_close(dksc, dev, flags, fmt, l); 864 if ((rs->sc_flags & RAIDF_DETACH) != 0) 865 do_detach = 1; 866 } else if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) 867 do_put = 1; 868 869 raidunlock(rs); 870 871 if (do_detach) { 872 /* free the pseudo device attach bits */ 873 cf = device_cfdata(dksc->sc_dev); 874 error = config_detach(dksc->sc_dev, 0); 875 if (error == 0) 876 free(cf, M_RAIDFRAME); 877 } else if (do_put) { 878 raidput(rs); 879 } 880 881 return (error); 882 883 } 884 885 static void 886 raid_wakeup(RF_Raid_t *raidPtr) 887 { 888 rf_lock_mutex2(raidPtr->iodone_lock); 889 rf_signal_cond2(raidPtr->iodone_cv); 890 rf_unlock_mutex2(raidPtr->iodone_lock); 891 } 892 893 static void 894 raidstrategy(struct buf *bp) 895 { 896 unsigned int unit; 897 struct raid_softc *rs; 898 struct dk_softc *dksc; 899 RF_Raid_t 
*raidPtr; 900 901 unit = raidunit(bp->b_dev); 902 if ((rs = raidget(unit, false)) == NULL) { 903 bp->b_error = ENXIO; 904 goto fail; 905 } 906 if ((rs->sc_flags & RAIDF_INITED) == 0) { 907 bp->b_error = ENXIO; 908 goto fail; 909 } 910 dksc = &rs->sc_dksc; 911 raidPtr = &rs->sc_r; 912 913 /* Queue IO only */ 914 if (dk_strategy_defer(dksc, bp)) 915 goto done; 916 917 /* schedule the IO to happen at the next convenient time */ 918 raid_wakeup(raidPtr); 919 920 done: 921 return; 922 923 fail: 924 bp->b_resid = bp->b_bcount; 925 biodone(bp); 926 } 927 928 static int 929 raid_diskstart(device_t dev, struct buf *bp) 930 { 931 struct raid_softc *rs = raidsoftc(dev); 932 RF_Raid_t *raidPtr; 933 934 raidPtr = &rs->sc_r; 935 if (!raidPtr->valid) { 936 db1_printf(("raid is not valid..\n")); 937 return ENODEV; 938 } 939 940 /* XXX */ 941 bp->b_resid = 0; 942 943 return raiddoaccess(raidPtr, bp); 944 } 945 946 void 947 raiddone(RF_Raid_t *raidPtr, struct buf *bp) 948 { 949 struct raid_softc *rs; 950 struct dk_softc *dksc; 951 952 rs = raidPtr->softc; 953 dksc = &rs->sc_dksc; 954 955 dk_done(dksc, bp); 956 957 rf_lock_mutex2(raidPtr->mutex); 958 raidPtr->openings++; 959 rf_unlock_mutex2(raidPtr->mutex); 960 961 /* schedule more IO */ 962 raid_wakeup(raidPtr); 963 } 964 965 /* ARGSUSED */ 966 static int 967 raidread(dev_t dev, struct uio *uio, int flags) 968 { 969 int unit = raidunit(dev); 970 struct raid_softc *rs; 971 972 if ((rs = raidget(unit, false)) == NULL) 973 return ENXIO; 974 975 if ((rs->sc_flags & RAIDF_INITED) == 0) 976 return (ENXIO); 977 978 return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio)); 979 980 } 981 982 /* ARGSUSED */ 983 static int 984 raidwrite(dev_t dev, struct uio *uio, int flags) 985 { 986 int unit = raidunit(dev); 987 struct raid_softc *rs; 988 989 if ((rs = raidget(unit, false)) == NULL) 990 return ENXIO; 991 992 if ((rs->sc_flags & RAIDF_INITED) == 0) 993 return (ENXIO); 994 995 return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, 
	    uio));

}

/*
 * Tear down a configured RAID unit.  Caller must hold the unit lock
 * (see raidlock()).  Refuses while the unit is open or while any
 * reconstruction, parity-rewrite or copyback thread is running.
 */
static int
raid_detach_unlocked(struct raid_softc *rs)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr;
	int error;

	raidPtr = &rs->sc_r;

	/* Refuse to detach while busy or background threads are active. */
	if (DK_BUSY(dksc, 0) ||
	    raidPtr->recon_in_progress != 0 ||
	    raidPtr->parity_rewrite_in_progress != 0 ||
	    raidPtr->copyback_in_progress != 0)
		return EBUSY;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 0;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;

	rs->sc_flags &= ~RAIDF_INITED;

	/* Kill off any queued buffers */
	dk_drain(dksc);
	bufq_free(dksc->sc_bufq);

	/* Detach the disk. */
	dkwedge_delall(&dksc->sc_dkdev);
	disk_detach(&dksc->sc_dkdev);
	disk_destroy(&dksc->sc_dkdev);
	dk_detach(dksc);

	return 0;
}

/*
 * Return true when ioctl 'cmd' requires a configured (RAIDF_INITED)
 * unit but this unit is not configured; the caller then fails the
 * ioctl with ENXIO.  Commands not listed here are always permitted.
 */
static bool
rf_must_be_initialized(const struct raid_softc *rs, u_long cmd)
{
	switch (cmd) {
	case RAIDFRAME_ADD_HOT_SPARE:
	case RAIDFRAME_CHECK_COPYBACK_STATUS:
	case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT:
	case RAIDFRAME_CHECK_PARITY:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS:
	case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT:
	case RAIDFRAME_CHECK_RECON_STATUS:
	case RAIDFRAME_CHECK_RECON_STATUS_EXT:
	case RAIDFRAME_COPYBACK:
	case RAIDFRAME_DELETE_COMPONENT:
	case RAIDFRAME_FAIL_DISK:
	case RAIDFRAME_GET_ACCTOTALS:
	case RAIDFRAME_GET_COMPONENT_LABEL:
	case RAIDFRAME_GET_INFO:
	case RAIDFRAME_GET_SIZE:
	case RAIDFRAME_INCORPORATE_HOT_SPARE:
	case RAIDFRAME_INIT_LABELS:
	case RAIDFRAME_KEEP_ACCTOTALS:
	case RAIDFRAME_PARITYMAP_GET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_DISABLE:
	case RAIDFRAME_PARITYMAP_SET_PARAMS:
	case RAIDFRAME_PARITYMAP_STATUS:
	case RAIDFRAME_REBUILD_IN_PLACE:
	case RAIDFRAME_REMOVE_HOT_SPARE:
	case RAIDFRAME_RESET_ACCTOTALS:
	case RAIDFRAME_REWRITEPARITY:
	case RAIDFRAME_SET_AUTOCONFIG:
	case RAIDFRAME_SET_COMPONENT_LABEL:
	case RAIDFRAME_SET_ROOT:
		return (rs->sc_flags & RAIDF_INITED) == 0;
	}
	return false;
}

/*
 * Mark component rr->col as failed and start a reconstruction thread.
 * Returns EINVAL for RAID 0, a bad column, or when failing the disk
 * would make things worse (recon running, another disk already dead,
 * or the disk is spared).
 */
int
rf_fail_disk(RF_Raid_t *raidPtr, struct rf_recon_req *rr)
{
	struct rf_recon_req_internal *rrint;

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (rr->col < 0 || rr->col >= raidPtr->numCol) {
		/* bad column */
		return EINVAL;
	}

	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->status == rf_rs_reconstructing) {
		/* you can't fail a disk while we're reconstructing! */
		/* XXX wrong for RAID6 */
		goto out;
	}
	if ((raidPtr->Disks[rr->col].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* some other component has failed. Let's not make
		   things worse. XXX wrong for RAID6 */
		goto out;
	}
	if (raidPtr->Disks[rr->col].status == rf_ds_spared) {
		/* Can't fail a spared disk! */
		goto out;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	/* make a copy of the recon request so that we don't rely on
	 * the user's buffer */
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return(ENOMEM);
	rrint->col = rr->col;
	rrint->flags = rr->flags;
	rrint->raidPtr = raidPtr;

	return RF_CREATE_THREAD(raidPtr->recon_thread, rf_ReconThread,
	    rrint, "raid_recon");
out:
	rf_unlock_mutex2(raidPtr->mutex);
	return EINVAL;
}

/*
 * Copy the layout-specific configuration blob from user space into a
 * kernel buffer.  On success k_cfg->layoutSpecific is repointed at the
 * kernel copy (freed later using layoutSpecificSize).
 */
static int
rf_copyinspecificbuf(RF_Config_t *k_cfg)
{
	/* allocate a buffer for the layout-specific data, and copy it in */
	if (k_cfg->layoutSpecificSize == 0)
		return 0;

	if (k_cfg->layoutSpecificSize > 10000) {
		/* sanity check */
		return EINVAL;
	}

	u_char *specific_buf;
	specific_buf = RF_Malloc(k_cfg->layoutSpecificSize);
	if (specific_buf == NULL)
		return ENOMEM;

	int retcode = copyin(k_cfg->layoutSpecific, specific_buf,
	    k_cfg->layoutSpecificSize);
	if (retcode) {
		RF_Free(specific_buf, k_cfg->layoutSpecificSize);
		db1_printf(("%s: retcode=%d copyin.2\n", __func__, retcode));
		return retcode;
	}

	k_cfg->layoutSpecific = specific_buf;
	return 0;
}

/*
 * Copy an RF_Config_t from user space ('data' holds a user pointer to
 * it) into freshly allocated kernel memory returned via *k_cfg.
 */
static int
rf_getConfiguration(struct raid_softc *rs, void *data, RF_Config_t **k_cfg)
{
	RF_Config_t *u_cfg = *((RF_Config_t **) data);

	if (rs->sc_r.valid) {
		/* There is a valid RAID set running on this unit!
		 */
		printf("raid%d: Device already configured!\n", rs->sc_unit);
		return EINVAL;
	}

	/* copy-in the configuration information */
	/* data points to a pointer to the configuration structure */
	*k_cfg = RF_Malloc(sizeof(**k_cfg));
	if (*k_cfg == NULL) {
		return ENOMEM;
	}
	int retcode = copyin(u_cfg, *k_cfg, sizeof(RF_Config_t));
	if (retcode == 0)
		return 0;
	RF_Free(*k_cfg, sizeof(RF_Config_t));
	db1_printf(("%s: retcode=%d copyin.1\n", __func__, retcode));
	/* copyin failed: arrange for the unit to be detached on close */
	rs->sc_flags |= RAIDF_SHUTDOWN;
	return retcode;
}

/*
 * Configure the RAID set described by k_cfg on unit 'rs'.  Consumes
 * k_cfg (and its layout-specific buffer) in all cases.  On failure
 * the unit is flagged RAIDF_SHUTDOWN so it is detached on close.
 */
int
rf_construct(struct raid_softc *rs, RF_Config_t *k_cfg)
{
	int retcode;
	RF_Raid_t *raidPtr = &rs->sc_r;

	rs->sc_flags &= ~RAIDF_SHUTDOWN;

	if ((retcode = rf_copyinspecificbuf(k_cfg)) != 0)
		goto out;

	/* should do some kind of sanity check on the configuration.
	 * Store the sum of all the bytes in the last byte? */

	/* configure the system */

	/*
	 * Clear the entire RAID descriptor, just to make sure
	 * there is no stale data left in the case of a
	 * reconfiguration
	 */
	memset(raidPtr, 0, sizeof(*raidPtr));
	raidPtr->softc = rs;
	raidPtr->raidid = rs->sc_unit;

	retcode = rf_Configure(raidPtr, k_cfg, NULL);

	if (retcode == 0) {
		/* allow this many simultaneous IO's to
		   this RAID device */
		raidPtr->openings = RAIDOUTSTANDING;

		raidinit(rs);
		raid_wakeup(raidPtr);
		rf_markalldirty(raidPtr);
	}

	/* free the buffers. No return code here. */
	if (k_cfg->layoutSpecificSize) {
		RF_Free(k_cfg->layoutSpecific, k_cfg->layoutSpecificSize);
	}
out:
	RF_Free(k_cfg, sizeof(RF_Config_t));
	if (retcode) {
		/*
		 * If configuration failed, set sc_flags so that we
		 * will detach the device when we close it.
		 */
		rs->sc_flags |= RAIDF_SHUTDOWN;
	}
	return retcode;
}

#if RF_DISABLED
/*
 * Patch a component label in place from a user-supplied label.
 * Disabled: the label contents are not validated yet (see XXX below).
 */
static int
rf_set_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{

	/* XXX check the label for valid stuff... */
	/* Note that some things *should not* get modified --
	   the user should be re-initing the labels instead of
	   trying to patch things.
	   */
#ifdef DEBUG
	int raidid = raidPtr->raidid;
	printf("raid%d: Got component label:\n", raidid);
	printf("raid%d: Version: %d\n", raidid, clabel->version);
	printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number);
	printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter);
	printf("raid%d: Column: %d\n", raidid, clabel->column);
	printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns);
	printf("raid%d: Clean: %d\n", raidid, clabel->clean);
	printf("raid%d: Status: %d\n", raidid, clabel->status);
#endif /* DEBUG */
	clabel->row = 0;
	int column = clabel->column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return(EINVAL);
	}

	/* XXX this isn't allowed to do anything for now :-) */

	/* XXX and before it is, we need to fill in the rest
	   of the fields!?!?!?! */
	memcpy(raidget_component_label(raidPtr, column),
	    clabel, sizeof(*clabel));
	raidflush_component_label(raidPtr, column);
	return 0;
}
#endif

/*
 * (Re)write fresh component labels on every live component, keeping
 * only the serial number from the user-supplied label.
 */
static int
rf_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/*
	   we only want the serial number from
	   the above. We get all the rest of the information
	   from the config that was used to create this RAID
	   set.
	   */

	raidPtr->serial_number = clabel->serial_number;

	for (int column = 0; column < raidPtr->numCol; column++) {
		RF_RaidDisk_t *diskPtr = &raidPtr->Disks[column];
		if (RF_DEAD_DISK(diskPtr->status))
			continue;	/* dead components get no label */
		RF_ComponentLabel_t *ci_label = raidget_component_label(
		    raidPtr, column);
		/* Zeroing this is important. */
		memset(ci_label, 0, sizeof(*ci_label));
		raid_init_component_label(raidPtr, ci_label);
		ci_label->serial_number = raidPtr->serial_number;
		ci_label->row = 0; /* we dont' pretend to support more */
		rf_component_label_set_partitionsize(ci_label,
		    diskPtr->partitionSize);
		ci_label->column = column;
		raidflush_component_label(raidPtr, column);
		/* XXXjld what about the spares? */
	}

	return 0;
}

/*
 * Rebuild the component 'componentPtr->column' in place by starting a
 * reconstruction thread.  Fails for RAID 0, a bad column, or when the
 * set is already degraded/reconstructing/spared.
 */
static int
rf_rebuild_in_place(RF_Raid_t *raidPtr, RF_SingleComponent_t *componentPtr)
{

	if (raidPtr->Layout.map->faultsTolerated == 0) {
		/* Can't do this on a RAID 0!! */
		return EINVAL;
	}

	if (raidPtr->recon_in_progress == 1) {
		/* a reconstruct is already in progress! */
		return EINVAL;
	}

	RF_SingleComponent_t component;
	memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t));
	component.row = 0; /* we don't support any more */
	int column = component.column;

	if ((column < 0) || (column >= raidPtr->numCol)) {
		return EINVAL;
	}

	rf_lock_mutex2(raidPtr->mutex);
	if ((raidPtr->Disks[column].status == rf_ds_optimal) &&
	    (raidPtr->numFailures > 0)) {
		/* XXX 0 above shouldn't be constant!!! */
		/* some component other than this has failed.
		   Let's not make things worse than they already
		   are... */
		printf("raid%d: Unable to reconstruct to disk at:\n",
		    raidPtr->raidid);
		printf("raid%d: Col: %d Too many failures.\n",
		    raidPtr->raidid, column);
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_reconstructing) {
		printf("raid%d: Unable to reconstruct to disk at:\n",
		    raidPtr->raidid);
		printf("raid%d: Col: %d "
		    "Reconstruction already occurring!\n",
		    raidPtr->raidid, column);

		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	if (raidPtr->Disks[column].status == rf_ds_spared) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EINVAL;
	}

	rf_unlock_mutex2(raidPtr->mutex);

	struct rf_recon_req_internal *rrint;
	rrint = RF_Malloc(sizeof(*rrint));
	if (rrint == NULL)
		return ENOMEM;

	rrint->col = column;
	rrint->raidPtr = raidPtr;

	return RF_CREATE_THREAD(raidPtr->recon_thread,
	    rf_ReconstructInPlaceThread, rrint, "raid_reconip");
}

/* Report reconstruction progress as a percentage (0-100) in *data. */
static int
rf_check_recon_status(RF_Raid_t *raidPtr, int *data)
{
	/*
	 * This makes no sense on a RAID 0, or if we are not reconstructing
	 * so tell the user it's done.
1378 */ 1379 if (raidPtr->Layout.map->faultsTolerated == 0 || 1380 raidPtr->status != rf_rs_reconstructing) { 1381 *data = 100; 1382 return 0; 1383 } 1384 if (raidPtr->reconControl->numRUsTotal == 0) { 1385 *data = 0; 1386 return 0; 1387 } 1388 *data = (raidPtr->reconControl->numRUsComplete * 100 1389 / raidPtr->reconControl->numRUsTotal); 1390 return 0; 1391 } 1392 1393 static int 1394 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l) 1395 { 1396 int unit = raidunit(dev); 1397 int part, pmask; 1398 struct raid_softc *rs; 1399 struct dk_softc *dksc; 1400 RF_Config_t *k_cfg; 1401 RF_Raid_t *raidPtr; 1402 RF_AccTotals_t *totals; 1403 RF_SingleComponent_t component; 1404 RF_DeviceConfig_t *d_cfg, *ucfgp; 1405 int retcode = 0; 1406 int column; 1407 RF_ComponentLabel_t *clabel; 1408 RF_SingleComponent_t *sparePtr,*componentPtr; 1409 int d; 1410 1411 if ((rs = raidget(unit, false)) == NULL) 1412 return ENXIO; 1413 1414 dksc = &rs->sc_dksc; 1415 raidPtr = &rs->sc_r; 1416 1417 db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev, 1418 (int) DISKPART(dev), (int) unit, cmd)); 1419 1420 /* Must be initialized for these... 
*/ 1421 if (rf_must_be_initialized(rs, cmd)) 1422 return ENXIO; 1423 1424 switch (cmd) { 1425 /* configure the system */ 1426 case RAIDFRAME_CONFIGURE: 1427 if ((retcode = rf_getConfiguration(rs, data, &k_cfg)) != 0) 1428 return retcode; 1429 return rf_construct(rs, k_cfg); 1430 1431 /* shutdown the system */ 1432 case RAIDFRAME_SHUTDOWN: 1433 1434 part = DISKPART(dev); 1435 pmask = (1 << part); 1436 1437 if ((retcode = raidlock(rs)) != 0) 1438 return retcode; 1439 1440 if (DK_BUSY(dksc, pmask) || 1441 raidPtr->recon_in_progress != 0 || 1442 raidPtr->parity_rewrite_in_progress != 0 || 1443 raidPtr->copyback_in_progress != 0) 1444 retcode = EBUSY; 1445 else { 1446 /* detach and free on close */ 1447 rs->sc_flags |= RAIDF_SHUTDOWN; 1448 retcode = 0; 1449 } 1450 1451 raidunlock(rs); 1452 1453 return retcode; 1454 case RAIDFRAME_GET_COMPONENT_LABEL: 1455 return rf_get_component_label(raidPtr, data); 1456 1457 #if RF_DISABLED 1458 case RAIDFRAME_SET_COMPONENT_LABEL: 1459 return rf_set_component_label(raidPtr, data); 1460 #endif 1461 1462 case RAIDFRAME_INIT_LABELS: 1463 return rf_init_component_label(raidPtr, data); 1464 1465 case RAIDFRAME_SET_AUTOCONFIG: 1466 d = rf_set_autoconfig(raidPtr, *(int *) data); 1467 printf("raid%d: New autoconfig value is: %d\n", 1468 raidPtr->raidid, d); 1469 *(int *) data = d; 1470 return retcode; 1471 1472 case RAIDFRAME_SET_ROOT: 1473 d = rf_set_rootpartition(raidPtr, *(int *) data); 1474 printf("raid%d: New rootpartition value is: %d\n", 1475 raidPtr->raidid, d); 1476 *(int *) data = d; 1477 return retcode; 1478 1479 /* initialize all parity */ 1480 case RAIDFRAME_REWRITEPARITY: 1481 1482 if (raidPtr->Layout.map->faultsTolerated == 0) { 1483 /* Parity for RAID 0 is trivially correct */ 1484 raidPtr->parity_good = RF_RAID_CLEAN; 1485 return 0; 1486 } 1487 1488 if (raidPtr->parity_rewrite_in_progress == 1) { 1489 /* Re-write is already in progress! 
*/ 1490 return EINVAL; 1491 } 1492 1493 return RF_CREATE_THREAD(raidPtr->parity_rewrite_thread, 1494 rf_RewriteParityThread, raidPtr,"raid_parity"); 1495 1496 case RAIDFRAME_ADD_HOT_SPARE: 1497 sparePtr = (RF_SingleComponent_t *) data; 1498 memcpy(&component, sparePtr, sizeof(RF_SingleComponent_t)); 1499 return rf_add_hot_spare(raidPtr, &component); 1500 1501 case RAIDFRAME_REMOVE_HOT_SPARE: 1502 return retcode; 1503 1504 case RAIDFRAME_DELETE_COMPONENT: 1505 componentPtr = (RF_SingleComponent_t *)data; 1506 memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t)); 1507 return rf_delete_component(raidPtr, &component); 1508 1509 case RAIDFRAME_INCORPORATE_HOT_SPARE: 1510 componentPtr = (RF_SingleComponent_t *)data; 1511 memcpy(&component, componentPtr, sizeof(RF_SingleComponent_t)); 1512 return rf_incorporate_hot_spare(raidPtr, &component); 1513 1514 case RAIDFRAME_REBUILD_IN_PLACE: 1515 return rf_rebuild_in_place(raidPtr, data); 1516 1517 case RAIDFRAME_GET_INFO: 1518 ucfgp = *(RF_DeviceConfig_t **)data; 1519 d_cfg = RF_Malloc(sizeof(*d_cfg)); 1520 if (d_cfg == NULL) 1521 return ENOMEM; 1522 retcode = rf_get_info(raidPtr, d_cfg); 1523 if (retcode == 0) { 1524 retcode = copyout(d_cfg, ucfgp, sizeof(*d_cfg)); 1525 } 1526 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t)); 1527 return retcode; 1528 1529 case RAIDFRAME_CHECK_PARITY: 1530 *(int *) data = raidPtr->parity_good; 1531 return 0; 1532 1533 case RAIDFRAME_PARITYMAP_STATUS: 1534 if (rf_paritymap_ineligible(raidPtr)) 1535 return EINVAL; 1536 rf_paritymap_status(raidPtr->parity_map, data); 1537 return 0; 1538 1539 case RAIDFRAME_PARITYMAP_SET_PARAMS: 1540 if (rf_paritymap_ineligible(raidPtr)) 1541 return EINVAL; 1542 if (raidPtr->parity_map == NULL) 1543 return ENOENT; /* ??? 
*/ 1544 if (rf_paritymap_set_params(raidPtr->parity_map, data, 1) != 0) 1545 return EINVAL; 1546 return 0; 1547 1548 case RAIDFRAME_PARITYMAP_GET_DISABLE: 1549 if (rf_paritymap_ineligible(raidPtr)) 1550 return EINVAL; 1551 *(int *) data = rf_paritymap_get_disable(raidPtr); 1552 return 0; 1553 1554 case RAIDFRAME_PARITYMAP_SET_DISABLE: 1555 if (rf_paritymap_ineligible(raidPtr)) 1556 return EINVAL; 1557 rf_paritymap_set_disable(raidPtr, *(int *)data); 1558 /* XXX should errors be passed up? */ 1559 return 0; 1560 1561 case RAIDFRAME_RESET_ACCTOTALS: 1562 memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals)); 1563 return 0; 1564 1565 case RAIDFRAME_GET_ACCTOTALS: 1566 totals = (RF_AccTotals_t *) data; 1567 *totals = raidPtr->acc_totals; 1568 return 0; 1569 1570 case RAIDFRAME_KEEP_ACCTOTALS: 1571 raidPtr->keep_acc_totals = *(int *)data; 1572 return 0; 1573 1574 case RAIDFRAME_GET_SIZE: 1575 *(int *) data = raidPtr->totalSectors; 1576 return 0; 1577 1578 case RAIDFRAME_FAIL_DISK: 1579 return rf_fail_disk(raidPtr, data); 1580 1581 /* invoke a copyback operation after recon on whatever disk 1582 * needs it, if any */ 1583 case RAIDFRAME_COPYBACK: 1584 1585 if (raidPtr->Layout.map->faultsTolerated == 0) { 1586 /* This makes no sense on a RAID 0!! */ 1587 return EINVAL; 1588 } 1589 1590 if (raidPtr->copyback_in_progress == 1) { 1591 /* Copyback is already in progress! */ 1592 return EINVAL; 1593 } 1594 1595 return RF_CREATE_THREAD(raidPtr->copyback_thread, 1596 rf_CopybackThread, raidPtr, "raid_copyback"); 1597 1598 /* return the percentage completion of reconstruction */ 1599 case RAIDFRAME_CHECK_RECON_STATUS: 1600 return rf_check_recon_status(raidPtr, data); 1601 1602 case RAIDFRAME_CHECK_RECON_STATUS_EXT: 1603 rf_check_recon_status_ext(raidPtr, data); 1604 return 0; 1605 1606 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS: 1607 if (raidPtr->Layout.map->faultsTolerated == 0) { 1608 /* This makes no sense on a RAID 0, so tell the 1609 user it's done. 
*/ 1610 *(int *) data = 100; 1611 return 0; 1612 } 1613 if (raidPtr->parity_rewrite_in_progress == 1) { 1614 *(int *) data = 100 * 1615 raidPtr->parity_rewrite_stripes_done / 1616 raidPtr->Layout.numStripe; 1617 } else { 1618 *(int *) data = 100; 1619 } 1620 return 0; 1621 1622 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT: 1623 rf_check_parityrewrite_status_ext(raidPtr, data); 1624 return 0; 1625 1626 case RAIDFRAME_CHECK_COPYBACK_STATUS: 1627 if (raidPtr->Layout.map->faultsTolerated == 0) { 1628 /* This makes no sense on a RAID 0 */ 1629 *(int *) data = 100; 1630 return 0; 1631 } 1632 if (raidPtr->copyback_in_progress == 1) { 1633 *(int *) data = 100 * raidPtr->copyback_stripes_done / 1634 raidPtr->Layout.numStripe; 1635 } else { 1636 *(int *) data = 100; 1637 } 1638 return 0; 1639 1640 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT: 1641 rf_check_copyback_status_ext(raidPtr, data); 1642 return 0; 1643 1644 case RAIDFRAME_SET_LAST_UNIT: 1645 for (column = 0; column < raidPtr->numCol; column++) 1646 if (raidPtr->Disks[column].status != rf_ds_optimal) 1647 return EBUSY; 1648 1649 for (column = 0; column < raidPtr->numCol; column++) { 1650 clabel = raidget_component_label(raidPtr, column); 1651 clabel->last_unit = *(int *)data; 1652 raidflush_component_label(raidPtr, column); 1653 } 1654 rs->sc_cflags |= RAIDF_UNIT_CHANGED; 1655 return 0; 1656 1657 /* the sparetable daemon calls this to wait for the kernel to 1658 * need a spare table. this ioctl does not return until a 1659 * spare table is needed. XXX -- calling mpsleep here in the 1660 * ioctl code is almost certainly wrong and evil. 
-- XXX XXX 1661 * -- I should either compute the spare table in the kernel, 1662 * or have a different -- XXX XXX -- interface (a different 1663 * character device) for delivering the table -- XXX */ 1664 #if RF_DISABLED 1665 case RAIDFRAME_SPARET_WAIT: 1666 rf_lock_mutex2(rf_sparet_wait_mutex); 1667 while (!rf_sparet_wait_queue) 1668 rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex); 1669 RF_SparetWait_t *waitreq = rf_sparet_wait_queue; 1670 rf_sparet_wait_queue = rf_sparet_wait_queue->next; 1671 rf_unlock_mutex2(rf_sparet_wait_mutex); 1672 1673 /* structure assignment */ 1674 *((RF_SparetWait_t *) data) = *waitreq; 1675 1676 RF_Free(waitreq, sizeof(*waitreq)); 1677 return 0; 1678 1679 /* wakes up a process waiting on SPARET_WAIT and puts an error 1680 * code in it that will cause the dameon to exit */ 1681 case RAIDFRAME_ABORT_SPARET_WAIT: 1682 waitreq = RF_Malloc(sizeof(*waitreq)); 1683 waitreq->fcol = -1; 1684 rf_lock_mutex2(rf_sparet_wait_mutex); 1685 waitreq->next = rf_sparet_wait_queue; 1686 rf_sparet_wait_queue = waitreq; 1687 rf_broadcast_cond2(rf_sparet_wait_cv); 1688 rf_unlock_mutex2(rf_sparet_wait_mutex); 1689 return 0; 1690 1691 /* used by the spare table daemon to deliver a spare table 1692 * into the kernel */ 1693 case RAIDFRAME_SEND_SPARET: 1694 1695 /* install the spare table */ 1696 retcode = rf_SetSpareTable(raidPtr, *(void **) data); 1697 1698 /* respond to the requestor. the return status of the spare 1699 * table installation is passed in the "fcol" field */ 1700 waitred = RF_Malloc(sizeof(*waitreq)); 1701 waitreq->fcol = retcode; 1702 rf_lock_mutex2(rf_sparet_wait_mutex); 1703 waitreq->next = rf_sparet_resp_queue; 1704 rf_sparet_resp_queue = waitreq; 1705 rf_broadcast_cond2(rf_sparet_resp_cv); 1706 rf_unlock_mutex2(rf_sparet_wait_mutex); 1707 1708 return retcode; 1709 #endif 1710 default: 1711 /* 1712 * Don't bother trying to load compat modules 1713 * if it is not our ioctl. 
This is more efficient
		 * and makes rump tests not depend on compat code
		 */
		if (IOCGROUP(cmd) != 'r')
			break;
#ifdef _LP64
		if ((l->l_proc->p_flag & PK_32) != 0) {
			module_autoload("compat_netbsd32_raid",
			    MODULE_CLASS_EXEC);
			MODULE_HOOK_CALL(raidframe_netbsd32_ioctl_hook,
			    (rs, cmd, data), enosys(), retcode);
			if (retcode != EPASSTHROUGH)
				return retcode;
		}
#endif
		module_autoload("compat_raid_80", MODULE_CLASS_EXEC);
		MODULE_HOOK_CALL(raidframe_ioctl_80_hook,
		    (rs, cmd, data), enosys(), retcode);
		if (retcode != EPASSTHROUGH)
			return retcode;

		module_autoload("compat_raid_50", MODULE_CLASS_EXEC);
		MODULE_HOOK_CALL(raidframe_ioctl_50_hook,
		    (rs, cmd, data), enosys(), retcode);
		if (retcode != EPASSTHROUGH)
			return retcode;
		break; /* fall through to the os-specific code below */

	}

	if (!raidPtr->valid)
		return (EINVAL);

	/*
	 * Add support for "regular" device ioctls here.
	 */

	switch (cmd) {
	case DIOCGCACHE:
		retcode = rf_get_component_caches(raidPtr, (int *)data);
		break;

	case DIOCCACHESYNC:
		retcode = rf_sync_component_caches(raidPtr);
		break;

	default:
		retcode = dk_ioctl(dksc, dev, cmd, data, flag, l);
		break;
	}

	return (retcode);

}


/* raidinit -- complete the rest of the initialization for the
   RAIDframe device. */


static void
raidinit(struct raid_softc *rs)
{
	cfdata_t cf;
	unsigned int unit;
	struct dk_softc *dksc = &rs->sc_dksc;
	RF_Raid_t *raidPtr = &rs->sc_r;
	device_t dev;

	unit = raidPtr->raidid;

	/* XXX doesn't check bounds.
	 */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%u", unit);

	/* attach the pseudo device */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	dev = config_attach_pseudo(cf);
	if (dev == NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		free(cf, M_RAIDFRAME);
		return;
	}

	/* provide a backpointer to the real softc */
	raidsoftc(dev) = rs;

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */
	dk_init(dksc, dev, DKTYPE_RAID);
	disk_init(&dksc->sc_dkdev, rs->sc_xname, &rf_dkdriver);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe. */

	rs->sc_size = raidPtr->totalSectors;

	/* Attach dk and disk subsystems */
	dk_attach(dksc);
	disk_attach(&dksc->sc_dkdev);
	rf_set_geometry(rs, raidPtr);

	bufq_alloc(&dksc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

	/* mark unit as usuable */
	rs->sc_flags |= RAIDF_INITED;

	dkwedge_discover(&dksc->sc_dkdev);
}

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
/* wake up the daemon & tell it to get us a spare table
 * XXX
 * the entries in the queues should be tagged with the raidPtr
 * so that in the extremely rare case that two recons happen at once,
 * we know for which device were requesting a spare table
 * XXX
 *
 * XXX This code is not currently used.
 GO
 */
int
rf_GetSpareTableFromDaemon(RF_SparetWait_t *req)
{
	int retcode;

	rf_lock_mutex2(rf_sparet_wait_mutex);
	req->next = rf_sparet_wait_queue;
	rf_sparet_wait_queue = req;
	rf_broadcast_cond2(rf_sparet_wait_cv);

	/* mpsleep unlocks the mutex */
	while (!rf_sparet_resp_queue) {
		rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex);
	}
	req = rf_sparet_resp_queue;
	rf_sparet_resp_queue = req->next;
	rf_unlock_mutex2(rf_sparet_wait_mutex);

	retcode = req->fcol;
	RF_Free(req, sizeof(*req));	/* this is not the same req as we
					 * alloc'd */
	return (retcode);
}
#endif

/* a wrapper around rf_DoAccess that extracts appropriate info from the
 * bp & passes it down.
 * any calls originating in the kernel must use non-blocking I/O
 * do some extra sanity checking to return "appropriate" error values for
 * certain conditions (to make some standard utilities work)
 *
 * Formerly known as: rf_DoAccessKernel
 */
void
raidstart(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;
	/* quick check to see if anything has died recently */
	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->numNewFailures > 0) {
		rf_unlock_mutex2(raidPtr->mutex);
		rf_update_component_labels(raidPtr,
		    RF_NORMAL_COMPONENT_UPDATE);
		rf_lock_mutex2(raidPtr->mutex);
		raidPtr->numNewFailures--;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	if ((rs->sc_flags & RAIDF_INITED) == 0) {
		printf("raid%d: raidstart not ready\n", raidPtr->raidid);
		return;
	}

	/* push queued buffers into raiddoaccess() */
	dk_start(dksc, NULL);
}

/*
 * Dispatch one buffer to the RAID engine via rf_DoAccess().
 * Returns EAGAIN when no openings are available, ENOSPC for a request
 * beyond the end of the set or not sector-aligned.
 */
static int
raiddoaccess(RF_Raid_t *raidPtr, struct buf *bp)
{
	RF_SectorCount_t num_blocks, pb, sum;
	RF_RaidAddr_t raid_addr;
	daddr_t blocknum;
	int do_async;
	int rc;

	rf_lock_mutex2(raidPtr->mutex);
	if (raidPtr->openings == 0) {
		rf_unlock_mutex2(raidPtr->mutex);
		return EAGAIN;
	}
	rf_unlock_mutex2(raidPtr->mutex);

	blocknum = bp->b_rawblkno;

	db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno,
	    (int) blocknum));

	db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount));
	db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid));

	/* *THIS* is where we adjust what block we're going to...
	 * but DO NOT TOUCH bp->b_blkno!!! */
	raid_addr = blocknum;

	num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
	pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0;
	sum = raid_addr + num_blocks + pb;
	if (1 || rf_debugKernelAccess) {
		db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
		    (int) raid_addr, (int) sum, (int) num_blocks,
		    (int) pb, (int) bp->b_resid));
	}
	/* overflow-aware bounds check: any wrap makes sum smaller */
	if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
	    || (sum < num_blocks) || (sum < pb)) {
		rc = ENOSPC;
		goto done;
	}
	/*
	 * XXX rf_DoAccess() should do this, not just DoAccessKernel()
	 */

	if (bp->b_bcount & raidPtr->sectorMask) {
		rc = ENOSPC;
		goto done;
	}
	db1_printf(("Calling DoAccess..\n"));


	rf_lock_mutex2(raidPtr->mutex);
	raidPtr->openings--;
	rf_unlock_mutex2(raidPtr->mutex);

	/*
	 * Everything is async.
	 */
	do_async = 1;

	/* don't ever condition on bp->b_flags & B_WRITE.
	 * always condition on B_READ instead */

	rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
	    RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
	    do_async, raid_addr, num_blocks,
	    bp->b_data, bp, RF_DAG_NONBLOCKING_IO);

done:
	return rc;
}

/* invoke an I/O from kernel mode. Disk queue should be locked upon entry */

int
rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req)
{
	int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
			    (long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
		    req->type, queue->raidPtr->raidid,
		    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
		    (int) req->sectorOffset, (int) req->numSector,
		    (int) (req->numSector <<
		    queue->raidPtr->logBytesPerSector),
		    (int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return (0);
}
/* this is the callback function associated with a I/O invoked from
   kernel code.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		    rf_ds_optimal) ||
		    (queue->raidPtr->Disks[queue->col].status ==
		    rf_ds_used_spare)) &&
		    (queue->raidPtr->numFailures <
		    queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error (%d). Marking %s as failed.\n",
			    queue->raidPtr->raidid,
			    bp->b_error,
			    queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}


/*
 * initialize a buf structure for doing an I/O in the kernel.
 */
static void
InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
    RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
    void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
    struct proc *b_proc)
{
	/* bp->b_flags = B_PHYS | rw_flag; */
	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
	bp->b_oflags = 0;
	bp->b_cflags = 0;
	bp->b_bcount = numSect << logBytesPerSector;
	bp->b_bufsize = bp->b_bcount;
	bp->b_error = 0;
	bp->b_dev = dev;
	bp->b_data = bf;
	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
	if (bp->b_bcount == 0) {
		panic("bp->b_bcount is zero in InitBP!!");
	}
	bp->b_proc = b_proc;
	bp->b_iodone = cbFunc;
	bp->b_private = cbArg;
}

/*
 * Wait interruptibly for an exclusive lock.
 *
 * XXX
 * Several drivers do this; it should be abstracted and made MP-safe.
 * (Hmm...
 where have we seen this warning before :-> GO )
 */
static int
raidlock(struct raid_softc *rs)
{
	int error;

	error = 0;
	mutex_enter(&rs->sc_mutex);
	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
		rs->sc_flags |= RAIDF_WANTED;
		/* interruptible wait: a signal aborts the lock attempt */
		error = cv_wait_sig(&rs->sc_cv, &rs->sc_mutex);
		if (error != 0)
			goto done;
	}
	rs->sc_flags |= RAIDF_LOCKED;
done:
	mutex_exit(&rs->sc_mutex);
	return (error);
}
/*
 * Unlock and wake up any waiters.
 */
static void
raidunlock(struct raid_softc *rs)
{

	mutex_enter(&rs->sc_mutex);
	rs->sc_flags &= ~RAIDF_LOCKED;
	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
		rs->sc_flags &= ~RAIDF_WANTED;
		cv_broadcast(&rs->sc_cv);
	}
	mutex_exit(&rs->sc_mutex);
}


#define RF_COMPONENT_INFO_OFFSET	16384 /* bytes */
#define RF_COMPONENT_INFO_SIZE		1024 /* bytes */
#define RF_PARITY_MAP_SIZE		RF_PARITYMAP_NBYTE

/* Byte offset of the component-label area on each component. */
static daddr_t
rf_component_info_offset(void)
{

	return RF_COMPONENT_INFO_OFFSET;
}

/* Size of the component-label area: at least one sector. */
static daddr_t
rf_component_info_size(unsigned secsize)
{
	daddr_t info_size;

	KASSERT(secsize);
	if (secsize > RF_COMPONENT_INFO_SIZE)
		info_size = secsize;
	else
		info_size = RF_COMPONENT_INFO_SIZE;

	return info_size;
}

/* Byte offset of the parity-map area (just past the component info). */
static daddr_t
rf_parity_map_offset(RF_Raid_t *raidPtr)
{
	daddr_t map_offset;

	KASSERT(raidPtr->bytesPerSector);
	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
		map_offset = raidPtr->bytesPerSector;
	else
		map_offset = RF_COMPONENT_INFO_SIZE;
	map_offset += rf_component_info_offset();

	return map_offset;
}

/* Size of the parity-map area: at least one sector. */
static daddr_t
rf_parity_map_size(RF_Raid_t *raidPtr)
{
	daddr_t map_size;

	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
		map_size = raidPtr->bytesPerSector;
	else
		map_size = RF_PARITY_MAP_SIZE;

	return map_size;
}

/* Mark component 'col' clean in its on-disk label. */
int
raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *clabel;

	clabel = raidget_component_label(raidPtr, col);
	clabel->clean = RF_RAID_CLEAN;
	raidflush_component_label(raidPtr, col);
	return(0);
}


/* Mark component 'col' dirty in its on-disk label. */
int
raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *clabel;

	clabel = raidget_component_label(raidPtr, col);
	clabel->clean = RF_RAID_DIRTY;
	raidflush_component_label(raidPtr, col);
	return(0);
}

/* Read component 'col's label from disk into the in-core copy. */
int
raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	KASSERT(raidPtr->bytesPerSector);
	return raidread_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp,
	    &raidPtr->raid_cinfo[col].ci_label);
}

/* Return a pointer to the in-core label of component 'col'. */
RF_ComponentLabel_t *
raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	return &raidPtr->raid_cinfo[col].ci_label;
}

/* Write the in-core label of component 'col' back to disk, updating
 * the modification counter first. */
int
raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *label;

	label = &raidPtr->raid_cinfo[col].ci_label;
	label->mod_counter = raidPtr->mod_counter;
#ifndef RF_NO_PARITY_MAP
	label->parity_map_modcount = label->mod_counter;
#endif
	return raidwrite_component_label(raidPtr->bytesPerSector,
	    raidPtr->Disks[col].dev,
	    raidPtr->raid_cinfo[col].ci_vp, label);
}


static int
raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidread_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize));
}

/* ARGSUSED */
static int
raidread_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize,
    daddr_t offset, daddr_t dsize)
{
	struct buf *bp;
	int error;

	/* XXX should probably ensure that we don't try to do this if
	   someone has changed rf_protected_sectors. */

	if (b_vp == NULL) {
		/* For whatever reason, this component is not valid.
		   Don't try to read a component label from it. */
		return(EINVAL);
	}

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the read */
	/* NOTE(review): offset is assumed to be DEV_BSIZE-aligned — confirm. */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_READ;
	bp->b_resid = dsize;

	bdev_strategy(bp);
	error = biowait(bp);

	/* Only the first msize bytes of the raw read are meaningful. */
	if (!error) {
		memcpy(data, bp->b_data, msize);
	}

	brelse(bp, 0);
	return(error);
}


/*
 * Write a component label into the label area of the given component
 * (synchronously).
 */
static int
raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp,
    RF_ComponentLabel_t *clabel)
{
	return raidwrite_component_area(dev, b_vp, clabel,
	    sizeof(RF_ComponentLabel_t),
	    rf_component_info_offset(),
	    rf_component_info_size(secsize), 0);
}

/*
 * Write `msize' bytes of metadata at byte `offset' of the component,
 * zero-padding the raw `dsize'-byte write.  If `asyncp' is non-zero the
 * write is fired off B_ASYNC and 0 is returned without waiting.
 */
/* ARGSUSED */
static int
raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data,
    size_t msize, daddr_t offset, daddr_t dsize, int asyncp)
{
	struct buf *bp;
	int error;

	/* get a block of the appropriate size... */
	bp = geteblk((int)dsize);
	bp->b_dev = dev;

	/* get our ducks in a row for the write */
	bp->b_blkno = offset / DEV_BSIZE;
	bp->b_bcount = dsize;
	bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0);
	bp->b_resid = dsize;

	memset(bp->b_data, 0, dsize);
	memcpy(bp->b_data, data, msize);

	bdev_strategy(bp);
	if (asyncp)
		/* NOTE(review): in the async case bp is not brelse'd here —
		   assumed to be reclaimed on I/O completion; verify. */
		return 0;
	error = biowait(bp);
	brelse(bp, 0);
	if (error) {
#if 1
		printf("Failed to write RAID component info!\n");
#endif
	}

	return(error);
}

/*
 * Write the parity map to the parity map area of every live component.
 */
void
rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
{
	int c;

	for (c = 0; c < raidPtr->numCol; c++) {
		/* Skip dead disks. */
		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
			continue;
		/* XXXjld: what if an error occurs here? */
		raidwrite_component_area(raidPtr->Disks[c].dev,
		    raidPtr->raid_cinfo[c].ci_vp, map,
		    RF_PARITYMAP_NBYTE,
		    rf_parity_map_offset(raidPtr),
		    rf_parity_map_size(raidPtr), 0);
	}
}

/*
 * Read the parity map from every live component and merge the copies
 * into `map' (union of dirty regions).
 */
void
rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
{
	struct rf_paritymap_ondisk tmp;
	int c,first;

	first=1;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* Skip dead disks.
 */
		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
			continue;
		/* XXXjld: read errors are ignored here — the previous
		   map contents simply don't get merged in. */
		raidread_component_area(raidPtr->Disks[c].dev,
		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
		    RF_PARITYMAP_NBYTE,
		    rf_parity_map_offset(raidPtr),
		    rf_parity_map_size(raidPtr));
		if (first) {
			memcpy(map, &tmp, sizeof(*map));
			first = 0;
		} else {
			rf_paritymap_merge(map, &tmp);
		}
	}
}

/*
 * Bump the set's mod_counter and mark the component labels of all
 * non-failed components dirty.  Used spares get a freshly initialized
 * label noting which column they stand in for.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			*/

			/* find the column this spare is standing in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}


/*
 * Flush fresh component labels to all optimal components and used
 * spares, bumping mod_counter.  If `final' is RF_FINAL_COMPONENT_UPDATE
 * and parity is known clean, the labels are also marked clean.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;
	struct raid_softc *rs = raidPtr->softc;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...
			*/

			/* find the column this spare is standing in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			if ((rs->sc_cflags & RAIDF_UNIT_CHANGED) == 0)
				clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}

/*
 * Close a component's vnode.  Auto-configured components were opened
 * with bdevvp()+VOP_OPEN and are closed/vput directly; manually
 * configured ones were opened via vn_close-style paths.
 */
void
rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
{

	if (vp != NULL) {
		if (auto_configured == 1) {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

		} else {
			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
		}
	}
}


/*
 * Close and forget the vnodes of all components and spares of the set.
 */
void
rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
{
	int r,c;
	struct vnode *vp;
	int acd;


	/* We take this opportunity to close the vnodes like we should.. */
	for (c = 0; c < raidPtr->numCol; c++) {
		vp = raidPtr->raid_cinfo[c].ci_vp;
		acd = raidPtr->Disks[c].auto_configured;
		rf_close_component(raidPtr, vp, acd);
		raidPtr->raid_cinfo[c].ci_vp = NULL;
		raidPtr->Disks[c].auto_configured = 0;
	}

	/* spares live in the columns just past numCol */
	for (r = 0; r < raidPtr->numSpare; r++) {
		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
		rf_close_component(raidPtr, vp, acd);
		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
	}
}


/*
 * Kernel thread body: fail the requested component (optionally starting
 * reconstruction to a spare), then exit.  Frees `req'.
 */
void
rf_ReconThread(struct rf_recon_req_internal *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
	    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}

/*
 * Kernel thread body: rewrite all parity; on success mark the set's
 * parity good so the clean bit can be set at shutdown.
 */
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them...
 */
	if (raidPtr->waitShutdown) {
		rf_lock_mutex2(raidPtr->rad_lock);
		cv_broadcast(&raidPtr->parity_rewrite_cv);
		rf_unlock_mutex2(raidPtr->rad_lock);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}


/*
 * Kernel thread body: copy reconstructed data back to a replaced
 * component, then exit.
 */
void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}


/*
 * Kernel thread body: reconstruct a component in place, then exit.
 * Frees `req'.
 */
void
rf_ReconstructInPlaceThread(struct rf_recon_req_internal *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}

/*
 * Try to read a component label from (dev, vp).  If it looks sane, add
 * a new RF_AutoConfig_t for it to `ac_list' and keep the vnode open;
 * otherwise close/release the vnode.  Returns the (possibly new) list
 * head, or NULL if we ran out of memory (the whole list is freed).
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		/* tear down everything collected so far */
		while(ac_list) {
			ac = ac_list;
			if (ac->clabel)
				free(ac->clabel, M_RAIDFRAME);
			ac_list = ac_list->next;
			free(ac, M_RAIDFRAME);
		}
		printf("RAID auto config: out of memory!\n");
		return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
			    cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it. */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
			    M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}

/*
 * Scan all disk devices in the system for RAIDframe components and
 * return a list of RF_AutoConfig_t's for the candidates found.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;
	int dowedges;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/*
	 * we begin by trolling through *all* the devices on the system *twice*
	 * first we scan for wedges, second for other devices. This avoids
	 * using a raw partition instead of a wedge that covers the whole disk
	 */

	for (dowedges=1; dowedges>=0; --dowedges) {
		for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
		    dv = deviter_next(&di)) {

			/* we are only interested in disks... */
			if (device_class(dv) != DV_DISK)
				continue;

			/* we don't care about floppies... */
			if (device_is_a(dv, "fd")) {
				continue;
			}

			/* we don't care about CD's...
 */
			if (device_is_a(dv, "cd")) {
				continue;
			}

			/* we don't care about md's... */
			if (device_is_a(dv, "md")) {
				continue;
			}

			/* hdfd is the Atari/Hades floppy driver */
			if (device_is_a(dv, "hdfd")) {
				continue;
			}

			/* fdisa is the Atari/Milan floppy driver */
			if (device_is_a(dv, "fdisa")) {
				continue;
			}

			/* are we in the wedges pass ? */
			wedge = device_is_a(dv, "dk");
			if (wedge != dowedges) {
				continue;
			}

			/* need to find the device_name_to_block_device_major stuff */
			bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

			rf_part_found = 0; /*No raid partition as yet*/

			/* get a vnode for the raw partition of this disk */
			bminor = minor(device_unit(dv));
			dev = wedge ? makedev(bmajor, bminor) :
			    MAKEDISKDEV(bmajor, bminor, RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

			if (error) {
				/* "Who cares."  Continue looking
				   for something that exists*/
				vput(vp);
				continue;
			}

			error = getdisksize(vp, &numsecs, &secsize);
			if (error) {
				/*
				 * Pseudo devices like vnd and cgd can be
				 * opened but may still need some configuration.
				 * Ignore these quietly.
				 */
				if (error != ENXIO)
					printf("RAIDframe: can't get disk size"
					    " for dev %s (%d)\n",
					    device_xname(dv), error);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}
			if (wedge) {
				/* wedge pass: only wedges typed as RAIDframe
				   are candidates */
				struct dkwedge_info dkw;
				error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
				    NOCRED);
				if (error) {
					printf("RAIDframe: can't get wedge info for "
					    "dev %s (%d)\n", device_xname(dv), error);
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
					VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
					vput(vp);
					continue;
				}

				/* rf_get_component takes ownership of vp */
				VOP_UNLOCK(vp);
				ac_list = rf_get_component(ac_list, dev, vp,
				    device_xname(dv), dkw.dkw_size, numsecs, secsize);
				rf_part_found = 1; /*There is a raid component on this disk*/
				continue;
			}

			/* Ok, the disk exists.  Go get the disklabel. */
			error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
			if (error) {
				/*
				 * XXX can't happen - open() would
				 * have errored out (or faked up one)
				 */
				if (error != ENOTTY)
					printf("RAIDframe: can't get label for dev "
					    "%s (%d)\n", device_xname(dv), error);
			}

			/* don't need this any more.  We'll allocate it again
			   a little later if we really do... */
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

			if (error)
				continue;

			rf_part_found = 0; /*No raid partitions yet*/
			for (i = 0; i < label.d_npartitions; i++) {
				char cname[sizeof(ac_list->devname)];

				/* We only support partitions marked as RAID */
				if (label.d_partitions[i].p_fstype != FS_RAID)
					continue;

				dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + i);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
				    label.d_partitions[i].p_size, numsecs, secsize);
				rf_part_found = 1; /*There is at least one raid partition on this disk*/
			}

			/*
			 *If there is no raid component on this disk, either in a
			 *disklabel or inside a wedge, check the raw partition as well,
			 *as it is possible to configure raid components on raw disk
			 *devices.
			 */

			if (!rf_part_found) {
				char cname[sizeof(ac_list->devname)];

				dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
				if (bdevvp(dev, &vp))
					panic("RAID can't alloc vnode");

				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

				error = VOP_OPEN(vp, FREAD, NOCRED);
				if (error) {
					/* Whatever... */
					vput(vp);
					continue;
				}
				VOP_UNLOCK(vp);
				snprintf(cname, sizeof(cname), "%s%c",
				    device_xname(dv), 'a' + RAW_PART);
				ac_list = rf_get_component(ac_list, dev, vp, cname,
				    label.d_partitions[RAW_PART].p_size, numsecs, secsize);
			}
		}
		deviter_release(&di);
	}
	return ac_list;
}


/*
 * Sanity-check a component label.  Returns 1 if it looks plausible
 * (and possibly repairs stale Hi fields via rf_fix_old_label_size()),
 * 0 otherwise.
 */
int
rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
{

	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
	    ((clabel->clean == RF_RAID_CLEAN) ||
	     (clabel->clean == RF_RAID_DIRTY)) &&
	    clabel->row >=0 &&
	    clabel->column >= 0 &&
	    clabel->num_rows > 0 &&
	    clabel->num_columns > 0 &&
	    clabel->row < clabel->num_rows &&
	    clabel->column < clabel->num_columns &&
	    clabel->blockSize > 0 &&
	    /*
	     * numBlocksHi may contain garbage, but it is ok since
	     * the type is unsigned.  If it is really garbage,
	     * rf_fix_old_label_size() will fix it.
	     */
	    rf_component_label_numblocks(clabel) > 0) {
		/*
		 * label looks reasonable enough...
		 * let's make sure it has no old garbage.
		 */
		if (numsecs)
			rf_fix_old_label_size(clabel, numsecs);
		return(1);
	}
	return(0);
}


/*
 * For reasons yet unknown, some old component labels have garbage in
 * the newer numBlocksHi region, and this causes lossage.  Since those
 * disks will also have numsecs set to less than 32 bits of sectors,
 * we can determine when this corruption has occurred, and fix it.
 *
 * The exact same problem, with the same unknown reason, happens to
 * the partitionSizeHi member as well.
 */
static void
rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
{

	if (numsecs < ((uint64_t)1 << 32)) {
		if (clabel->numBlocksHi) {
			printf("WARNING: total sectors < 32 bits, yet "
			    "numBlocksHi set\n"
			    "WARNING: resetting numBlocksHi to zero.\n");
			clabel->numBlocksHi = 0;
		}

		if (clabel->partitionSizeHi) {
			printf("WARNING: total sectors < 32 bits, yet "
			    "partitionSizeHi set\n"
			    "WARNING: resetting partitionSizeHi to zero.\n");
			clabel->partitionSizeHi = 0;
		}
	}
}


#ifdef DEBUG
/*
 * Dump a component label to the console (debug builds only).
 */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;
	static const char *rp[] = {
		"No", "Force", "Soft", "*invalid*"
	};


	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	    clabel->row, clabel->column,
	    clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	    clabel->version, clabel->serial_number,
	    clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	    clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	    clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	    (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Root partition: %s\n", rp[clabel->root_partition & 3]);
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif

/*
 * Partition the discovered components into config sets: each set holds
 * the components whose labels agree (per rf_does_it_fit()) and thus
 * presumably belong to the same RAID set.
 */
RF_ConfigSet_t *
rf_create_auto_sets(RF_AutoConfig_t *ac_list)
{
	RF_AutoConfig_t *ac;
	RF_ConfigSet_t *config_sets;
	RF_ConfigSet_t *cset;
	RF_AutoConfig_t *ac_next;


	config_sets = NULL;

	/* Go through the AutoConfig list, and figure out which components
	   belong to what sets.  */
	ac = ac_list;
	while(ac!=NULL) {
		/* we're going to putz with ac->next, so save it here
		   for use at the end of the loop */
		ac_next = ac->next;

		if (config_sets == NULL) {
			/* will need at least this one... */
			config_sets = (RF_ConfigSet_t *)
				malloc(sizeof(RF_ConfigSet_t),
				    M_RAIDFRAME, M_NOWAIT);
			if (config_sets == NULL) {
				panic("rf_create_auto_sets: No memory!");
			}
			/* this one is easy :) */
			config_sets->ac = ac;
			config_sets->next = NULL;
			config_sets->rootable = 0;
			ac->next = NULL;
		} else {
			/* which set does this component fit into? */
			cset = config_sets;
			while(cset!=NULL) {
				if (rf_does_it_fit(cset, ac)) {
					/* looks like it matches... */
					ac->next = cset->ac;
					cset->ac = ac;
					break;
				}
				cset = cset->next;
			}
			if (cset==NULL) {
				/* didn't find a match above...
				   new set..*/
				cset = (RF_ConfigSet_t *)
					malloc(sizeof(RF_ConfigSet_t),
					    M_RAIDFRAME, M_NOWAIT);
				if (cset == NULL) {
					panic("rf_create_auto_sets: No memory!");
				}
				cset->ac = ac;
				ac->next = NULL;
				cset->next = config_sets;
				cset->rootable = 0;
				config_sets = cset;
			}
		}
		ac = ac_next;
	}


	return(config_sets);
}

/*
 * Return 1 if component `ac' belongs to config set `cset', i.e. its
 * label agrees with the set's first member on everything except
 * mod_counter and partitionSize (see comment below for why).
 */
static int
rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
{
	RF_ComponentLabel_t *clabel1, *clabel2;

	/* If this one matches the *first* one in the set, that's good
	   enough, since the other members of the set would have been
	   through here too... */
	/* note that we are not checking partitionSize here..

	   Note that we are also not checking the mod_counters here.
	   If everything else matches except the mod_counter, that's
	   good enough for this test.  We will deal with the mod_counters
	   a little later in the autoconfiguration process.

	    (clabel1->mod_counter == clabel2->mod_counter) &&

	   The reason we don't check for this is that failed disks
	   will have lower modification counts.  If those disks are
	   not added to the set they used to belong to, then they will
	   form their own set, which may result in 2 different sets,
	   for example, competing to be configured at raid0, and
	   perhaps competing to be the root filesystem set.  If the
	   wrong ones get configured, or both attempt to become /,
	   weird behaviour and or serious lossage will occur.  Thus we
	   need to bring them into the fold here, and kick them out at
	   a later point.

	*/

	clabel1 = cset->ac->clabel;
	clabel2 = ac->clabel;
	if ((clabel1->version == clabel2->version) &&
	    (clabel1->serial_number == clabel2->serial_number) &&
	    (clabel1->num_rows == clabel2->num_rows) &&
	    (clabel1->num_columns == clabel2->num_columns) &&
	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
	    (clabel1->parityConfig == clabel2->parityConfig) &&
	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
	    (clabel1->blockSize == clabel2->blockSize) &&
	    rf_component_label_numblocks(clabel1) ==
		rf_component_label_numblocks(clabel2) &&
	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
	    (clabel1->root_partition == clabel2->root_partition) &&
	    (clabel1->last_unit == clabel2->last_unit) &&
	    (clabel1->config_order == clabel2->config_order)) {
		/* if it get's here, it almost *has* to be a match */
	} else {
		/* it's not consistent with somebody in the set..
		   punt */
		return(0);
	}
	/* all was fine.. it must fit... */
	return(1);
}

/*
 * Decide whether config set `cset' has enough live, up-to-date
 * components to be configured.  Returns 1 if configurable, 0 if too
 * many components are missing for its RAID level.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set.
	 */

	/* the set's authoritative mod_counter is the maximum found */
	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		/* look for a current (max mod_counter) component for
		   column c */
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				    ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
			/* Didn't find one here! */
			/* special case for RAID 1, especially
			   where there are more than 2
			   components (where RAIDframe treats
			   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
					    we're failed, and
					    so is the even
					    component, it's
					    "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
			/* Just did an even component, and we didn't
			   bail.. reset the even_pair_failed flag,
			   and go on to the next component....
			 */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}

/*
 * Build an RF_Config_t for autoconfiguration from the component labels
 * of config set member `ac' (geometry, parity config, device names).
 */
void
rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
    RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int i;

	clabel = ac->clabel;

	/* 1. Fill in the common stuff */
	config->numCol = clabel->num_columns;
	config->numSpare = 0; /* XXX should this be set here? */
	config->sectPerSU = clabel->sectPerSU;
	config->SUsPerPU = clabel->SUsPerPU;
	config->SUsPerRU = clabel->SUsPerRU;
	config->parityConfig = clabel->parityConfig;
	/* XXX... */
	strcpy(config->diskQueueType,"fifo");
	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
	config->layoutSpecificSize = 0; /* XXX ??
*/ 3318 3319 while(ac!=NULL) { 3320 /* row/col values will be in range due to the checks 3321 in reasonable_label() */ 3322 strcpy(config->devnames[0][ac->clabel->column], 3323 ac->devname); 3324 ac = ac->next; 3325 } 3326 3327 for(i=0;i<RF_MAXDBGV;i++) { 3328 config->debugVars[i][0] = 0; 3329 } 3330 } 3331 3332 int 3333 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value) 3334 { 3335 RF_ComponentLabel_t *clabel; 3336 int column; 3337 int sparecol; 3338 3339 raidPtr->autoconfigure = new_value; 3340 3341 for(column=0; column<raidPtr->numCol; column++) { 3342 if (raidPtr->Disks[column].status == rf_ds_optimal) { 3343 clabel = raidget_component_label(raidPtr, column); 3344 clabel->autoconfigure = new_value; 3345 raidflush_component_label(raidPtr, column); 3346 } 3347 } 3348 for(column = 0; column < raidPtr->numSpare ; column++) { 3349 sparecol = raidPtr->numCol + column; 3350 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) { 3351 clabel = raidget_component_label(raidPtr, sparecol); 3352 clabel->autoconfigure = new_value; 3353 raidflush_component_label(raidPtr, sparecol); 3354 } 3355 } 3356 return(new_value); 3357 } 3358 3359 int 3360 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value) 3361 { 3362 RF_ComponentLabel_t *clabel; 3363 int column; 3364 int sparecol; 3365 3366 raidPtr->root_partition = new_value; 3367 for(column=0; column<raidPtr->numCol; column++) { 3368 if (raidPtr->Disks[column].status == rf_ds_optimal) { 3369 clabel = raidget_component_label(raidPtr, column); 3370 clabel->root_partition = new_value; 3371 raidflush_component_label(raidPtr, column); 3372 } 3373 } 3374 for(column = 0; column < raidPtr->numSpare ; column++) { 3375 sparecol = raidPtr->numCol + column; 3376 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) { 3377 clabel = raidget_component_label(raidPtr, sparecol); 3378 clabel->root_partition = new_value; 3379 raidflush_component_label(raidPtr, sparecol); 3380 } 3381 } 3382 return(new_value); 3383 } 3384 3385 void 3386 
/*
 * Close and release the vnode of every component in the config set.
 * Each vp is locked, VOP_CLOSE'd, then vput (which unlocks and drops
 * the reference); the pointer is NULLed to guard against reuse.
 */
rf_release_all_vps(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;

	ac = cset->ac;
	while(ac!=NULL) {
		/* Close the vp, and give it back */
		if (ac->vp) {
			vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(ac->vp, FREAD | FWRITE, NOCRED);
			vput(ac->vp);
			ac->vp = NULL;
		}
		ac = ac->next;
	}
}


/*
 * Free all memory associated with a config set: each auto-config
 * entry along with its component label, and finally the set itself.
 */
void
rf_cleanup_config_set(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *next_ac;

	ac = cset->ac;
	while(ac!=NULL) {
		/* save the link before the entry is freed */
		next_ac = ac->next;
		/* nuke the label */
		free(ac->clabel, M_RAIDFRAME);
		/* cleanup the config structure */
		free(ac, M_RAIDFRAME);
		/* "next.." */
		ac = next_ac;
	}
	/* and, finally, nuke the config set */
	free(cset, M_RAIDFRAME);
}


/*
 * Populate a component label from the current in-core state of the
 * RAID set (geometry, serial/mod counters, autoconfig flags, etc.).
 * The label is marked dirty and optimal; the caller is responsible
 * for writing it out.
 */
void
raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel)
{
	/* current version number */
	clabel->version = RF_COMPONENT_LABEL_VERSION;
	clabel->serial_number = raidPtr->serial_number;
	clabel->mod_counter = raidPtr->mod_counter;

	clabel->num_rows = 1;
	clabel->num_columns = raidPtr->numCol;
	clabel->clean = RF_RAID_DIRTY; /* not clean */
	clabel->status = rf_ds_optimal; /* "It's good!"
					 */

	clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	clabel->SUsPerPU = raidPtr->Layout.SUsPerPU;
	clabel->SUsPerRU = raidPtr->Layout.SUsPerRU;

	clabel->blockSize = raidPtr->bytesPerSector;
	rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk);

	/* XXX not portable */
	clabel->parityConfig = raidPtr->Layout.map->parityConfig;
	clabel->maxOutstanding = raidPtr->maxOutstanding;
	clabel->autoconfigure = raidPtr->autoconfigure;
	clabel->root_partition = raidPtr->root_partition;
	clabel->last_unit = raidPtr->raidid;
	clabel->config_order = raidPtr->config_order;

#ifndef RF_NO_PARITY_MAP
	rf_paritymap_init_label(raidPtr->parity_map, clabel);
#endif
}

/*
 * Auto-configure the RAID set described by cset: build a config from
 * the component labels, find (or allocate) a softc at the unit number
 * the set last lived at, and run the normal configuration path.
 * Returns the softc on success, NULL on failure.
 */
struct raid_softc *
rf_auto_config_set(RF_ConfigSet_t *cset)
{
	RF_Raid_t *raidPtr;
	RF_Config_t *config;
	int raidID;
	struct raid_softc *sc;

#ifdef DEBUG
	printf("RAID autoconfigure\n");
#endif

	/* 1. Create a config structure */
	config = malloc(sizeof(*config), M_RAIDFRAME, M_NOWAIT|M_ZERO);
	if (config == NULL) {
		printf("%s: Out of mem - config!?!?\n", __func__);
		/* XXX do something more intelligent here. */
		return NULL;
	}

	/*
	   2. Figure out what RAID ID this one is supposed to live at
	   See if we can get the same RAID dev that it was configured
	   on last time..
	*/

	/* Start at the unit recorded in the label; walk forward past
	   any unit that is already configured (sc_r.valid != 0). */
	raidID = cset->ac->clabel->last_unit;
	for (sc = raidget(raidID, false); sc && sc->sc_r.valid != 0;
	     sc = raidget(++raidID, false))
		continue;
#ifdef DEBUG
	printf("Configuring raid%d:\n",raidID);
#endif

	if (sc == NULL)
		sc = raidget(raidID, true);
	if (sc == NULL) {
		printf("%s: Out of mem - softc!?!?\n", __func__);
		/* XXX do something more intelligent here.
		 */
		free(config, M_RAIDFRAME);
		return NULL;
	}

	raidPtr = &sc->sc_r;

	/* XXX all this stuff should be done SOMEWHERE ELSE! */
	raidPtr->softc = sc;
	raidPtr->raidid = raidID;
	raidPtr->openings = RAIDOUTSTANDING;

	/* 3. Build the configuration structure */
	rf_create_configuration(cset->ac, config, raidPtr);

	/* 4. Do the configuration */
	if (rf_Configure(raidPtr, config, cset->ac) == 0) {
		raidinit(sc);

		rf_markalldirty(raidPtr);
		raidPtr->autoconfigure = 1; /* XXX do this here? */
		switch (cset->ac->clabel->root_partition) {
		case 1:	/* Force Root */
		case 2:	/* Soft Root: root when boot partition part of raid */
			/*
			 * everything configured just fine.  Make a note
			 * that this set is eligible to be root,
			 * or forced to be root
			 */
			cset->rootable = cset->ac->clabel->root_partition;
			/* XXX do this here? */
			raidPtr->root_partition = cset->rootable;
			break;
		default:
			break;
		}
	} else {
		/* configuration failed: give the softc back */
		raidput(sc);
		sc = NULL;
	}

	/* 5. Cleanup */
	free(config, M_RAIDFRAME);
	return sc;
}

/*
 * Initialize pool 'p' for objects of the given size, prime it with
 * xmin entries (panic on failure -- done at configuration time only),
 * and set the low/high watermarks to xmin/xmax.
 */
void
rf_pool_init(struct pool *p, size_t size, const char *w_chan,
    size_t xmin, size_t xmax)
{
	int error;

	pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO);
	pool_sethiwat(p, xmax);
	if ((error = pool_prime(p, xmin)) != 0)
		panic("%s: failed to prime pool: %d", __func__, error);
	pool_setlowat(p, xmin);
}

/*
 * rf_buf_queue_check(RF_Raid_t raidPtr) -- looks into the buffer queue
 * to see if there is IO pending and if that IO could possibly be done
 * for a given RAID set.  Returns 0 if IO is waiting and can be done, 1
 * otherwise.
 *
 */
int
rf_buf_queue_check(RF_Raid_t *raidPtr)
{
	struct raid_softc *rs;
	struct dk_softc *dksc;

	rs = raidPtr->softc;
	dksc = &rs->sc_dksc;

	/* nothing can be done on an unconfigured set */
	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return 1;

	if (dk_strategy_pending(dksc) && raidPtr->openings > 0) {
		/* there is work to do */
		return 0;
	}
	/* default is nothing to do */
	return 1;
}

/*
 * Fill in diskPtr's geometry from the disk behind vnode vp.
 * numBlocks excludes the label area (rf_protectedSectors);
 * partitionSize is the raw size.  Returns 0 or the getdisksize error.
 */
int
rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr)
{
	uint64_t numsecs;
	unsigned secsize;
	int error;

	error = getdisksize(vp, &numsecs, &secsize);
	if (error == 0) {
		diskPtr->blockSize = secsize;
		diskPtr->numBlocks = numsecs - rf_protectedSectors;
		diskPtr->partitionSize = numsecs;
		return 0;
	}
	return error;
}

/* autoconf match: raid pseudo-devices always match */
static int
raid_match(device_t self, cfdata_t cfdata, void *aux)
{
	return 1;
}

/* autoconf attach: nothing to do; real setup happens in raidinit() */
static void
raid_attach(device_t parent, device_t self, void *aux)
{
}


/*
 * autoconf detach: take the softc lock, run the real detach work,
 * and release the softc on success.
 */
static int
raid_detach(device_t self, int flags)
{
	int error;
	struct raid_softc *rs = raidsoftc(self);

	if (rs == NULL)
		return ENXIO;

	if ((error = raidlock(rs)) != 0)
		return (error);

	error = raid_detach_unlocked(rs);

	raidunlock(rs);

	/* XXX raid can be referenced here */

	if (error)
		return error;

	/* Free the softc */
	raidput(rs);

	return 0;
}

/*
 * Derive a synthetic disk geometry for the set and publish it via
 * disk_set_info().
 */
static void
rf_set_geometry(struct raid_softc *rs, RF_Raid_t *raidPtr)
{
	struct dk_softc *dksc = &rs->sc_dksc;
	struct disk_geom *dg = &dksc->sc_dkdev.dk_geom;

	memset(dg, 0, sizeof(*dg));

	dg->dg_secperunit = raidPtr->totalSectors;
	dg->dg_secsize = raidPtr->bytesPerSector;
	dg->dg_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	/* fabricated track count -- the geometry is synthetic anyway */
	dg->dg_ntracks = 4 * raidPtr->numCol;

	disk_set_info(dksc->sc_dev,
	    &dksc->sc_dkdev, NULL);
}

/*
 * Get cache info for all the components (including spares).
 * Returns intersection of all the cache flags of all disks, or first
 * error if any encountered.
 * XXXfua feature flags can change as spares are added - lock down somehow
 */
static int
rf_get_component_caches(RF_Raid_t *raidPtr, int *data)
{
	int c;
	int error;
	int dkwhole = 0, dkpart;

	for (c = 0; c < raidPtr->numCol + raidPtr->numSpare; c++) {
		/*
		 * Check any non-dead disk, even when currently being
		 * reconstructed.
		 */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)
		    || raidPtr->Disks[c].status == rf_ds_reconstructing) {
			error = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp,
			    DIOCGCACHE, &dkpart, FREAD, NOCRED);
			if (error) {
				/* ENODEV (no cache ioctl support) is
				   reported to the caller but not logged */
				if (error != ENODEV) {
					printf("raid%d: get cache for component %s failed\n",
					    raidPtr->raidid,
					    raidPtr->Disks[c].devname);
				}

				return error;
			}

			/* first disk seeds the result; later disks are
			   intersected into it */
			if (c == 0)
				dkwhole = dkpart;
			else
				dkwhole = DKCACHE_COMBINE(dkwhole, dkpart);
		}
	}

	*data = dkwhole;

	return 0;
}

/*
 * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components.
 * We end up returning whatever error was returned by the first cache flush
 * that fails.
 */

int
rf_sync_component_caches(RF_Raid_t *raidPtr)
{
	int c, sparecol;
	int e,error;
	int force = 1;

	error = 0;
	/* Flush every optimal data component; keep going on error so
	   all components get a chance to flush. */
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC,
			    &force, FWRITE, NOCRED);
			if (e) {
				if (e != ENODEV)
					printf("raid%d: cache flush to component %s failed.\n",
					    raidPtr->raidid, raidPtr->Disks[c].devname);
				if (error == 0) {
					error = e;
				}
			}
		}
	}

	/* Same treatment for in-use spares. */
	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp,
			    DIOCCACHESYNC, &force, FWRITE, NOCRED);
			if (e) {
				if (e != ENODEV)
					printf("raid%d: cache flush to component %s failed.\n",
					    raidPtr->raidid, raidPtr->Disks[sparecol].devname);
				if (error == 0) {
					error = e;
				}
			}
		}
	}
	return error;
}

/* Fill in info with the current status */
void
rf_check_recon_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
{

	/* 100/100 means "not reconstructing" (i.e. complete) */
	if (raidPtr->status != rf_rs_reconstructing) {
		info->total = 100;
		info->completed = 100;
	} else {
		info->total = raidPtr->reconControl->numRUsTotal;
		info->completed = raidPtr->reconControl->numRUsComplete;
	}
	info->remaining = info->total - info->completed;
}

/* Fill in info with the current status */
void
rf_check_parityrewrite_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
{

	if (raidPtr->parity_rewrite_in_progress == 1) {
		info->total = raidPtr->Layout.numStripe;
		info->completed = raidPtr->parity_rewrite_stripes_done;
	} else {
		/* no rewrite running: report as fully complete */
		info->completed = 100;
		info->total = 100;
	}
	info->remaining = info->total - info->completed;
}

/* Fill in info with the current status */
void
rf_check_copyback_status_ext(RF_Raid_t *raidPtr, RF_ProgressInfo_t *info)
{

	if (raidPtr->copyback_in_progress == 1) {
		info->total = raidPtr->Layout.numStripe;
		info->completed = raidPtr->copyback_stripes_done;
		info->remaining = info->total - info->completed;
	} else {
		/* no copyback running: report as fully complete */
		info->remaining = 0;
		info->completed = 100;
		info->total = 100;
	}
}

/* Fill in config with the current info */
int
rf_get_info(RF_Raid_t *raidPtr, RF_DeviceConfig_t *config)
{
	int d, i, j;

	if (!raidPtr->valid)
		return (ENODEV);
	config->cols = raidPtr->numCol;
	config->ndevs = raidPtr->numCol;
	if (config->ndevs >= RF_MAX_DISKS)
		return (ENOMEM);
	config->nspares = raidPtr->numSpare;
	if (config->nspares >= RF_MAX_DISKS)
		return (ENOMEM);
	config->maxqdepth = raidPtr->maxQueueDepth;
	d = 0;
	/* copy the data components... */
	for (j = 0; j < config->cols; j++) {
		config->devs[d] = raidPtr->Disks[j];
		d++;
	}
	/* ...then the spares, which follow the data columns in Disks[] */
	for (j = config->cols, i = 0; i < config->nspares; i++, j++) {
		config->spares[i] = raidPtr->Disks[j];
		if (config->spares[i].status == rf_ds_rebuilding_spare) {
			/* XXX: raidctl(8) expects to see this as a used spare */
			config->spares[i].status = rf_ds_used_spare;
		}
	}
	return 0;
}

/*
 * Copy the in-core component label for the column named in *data
 * (an RF_ComponentLabel_t with 'column' filled in) back out to the
 * caller.  Returns EINVAL for an out-of-range column.
 */
int
rf_get_component_label(RF_Raid_t *raidPtr, void *data)
{
	RF_ComponentLabel_t *clabel = (RF_ComponentLabel_t *)data;
	RF_ComponentLabel_t *raid_clabel;
	int column = clabel->column;

	if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare))
		return EINVAL;
	raid_clabel = raidget_component_label(raidPtr, column);
	memcpy(clabel, raid_clabel, sizeof *clabel);

	return 0;
}

/*
 * Module interface
 */

MODULE(MODULE_CLASS_DRIVER, raid, "dk_subr,bufq_fcfs");

#ifdef _MODULE
CFDRIVER_DECL(raid, DV_DISK, NULL);
#endif

static int raid_modcmd(modcmd_t, void *);
static int raid_modcmd_init(void);
static int raid_modcmd_fini(void);

/* module(9) entry point: dispatch init/fini commands */
static int
raid_modcmd(modcmd_t cmd, void *data)
{
	int error;

	error = 0;
	switch (cmd) {
	case MODULE_CMD_INIT:
		error = raid_modcmd_init();
		break;
	case MODULE_CMD_FINI:
		error = raid_modcmd_fini();
		break;
	default:
		error = ENOTTY;
		break;
	}
	return error;
}

/*
 * Module initialization: attach the devsw, cfdriver and cfattach
 * entries (rolling back earlier steps on failure), boot the RAIDframe
 * core, and register a finalizer so RAID sets are auto-configured
 * once all real hardware has been found.
 */
static int
raid_modcmd_init(void)
{
	int error;
	int bmajor, cmajor;

	mutex_init(&raid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_enter(&raid_lock);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	/* -1 lets devsw_attach pick the major numbers; EEXIST means the
	   entries are already present (builtin case) and is not fatal */
	bmajor = cmajor = -1;
	error = devsw_attach("raid", &raid_bdevsw, &bmajor,
	    &raid_cdevsw, &cmajor);
	if (error != 0 && error != EEXIST) {
		aprint_error("%s: devsw_attach failed %d\n", __func__, error);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_attach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: config_cfdriver_attach failed %d\n",
		    __func__, error);
		/* roll back the devsw attach */
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = config_cfattach_attach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: config_cfattach_attach failed %d\n",
		    __func__, error);
		/* roll back cfdriver and devsw, in reverse order */
#ifdef _MODULE
		config_cfdriver_detach(&raid_cd);
#endif
		devsw_detach(&raid_bdevsw, &raid_cdevsw);
		mutex_exit(&raid_lock);
		return error;
	}

	raidautoconfigdone = false;

	mutex_exit(&raid_lock);

	if (error == 0) {
		if (rf_BootRaidframe(true) == 0)
			aprint_verbose("Kernelized RAIDframe activated\n");
		else
			panic("Serious error activating RAID!!");
	}

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	error = config_finalize_register(NULL, rf_autoconfig);
	if (error != 0) {
		/* non-fatal: continue without autoconfiguration */
		aprint_error("WARNING: unable to register RAIDframe "
		    "finalizer\n");
		error = 0;
	}

	return error;
}

/*
 * Module finalization: refuse to unload while any raid device exists,
 * otherwise detach cfattach, cfdriver and devsw (restoring the earlier
 * steps if a later one fails), shut down the RAIDframe core, and tear
 * down the module lock.
 */
static int
raid_modcmd_fini(void)
{
	int error;

	mutex_enter(&raid_lock);

	/* Don't allow unload if raid device(s) exist. */
	if (!LIST_EMPTY(&raids)) {
		mutex_exit(&raid_lock);
		return EBUSY;
	}

	error = config_cfattach_detach(raid_cd.cd_name, &raid_ca);
	if (error != 0) {
		aprint_error("%s: cannot detach cfattach\n",__func__);
		mutex_exit(&raid_lock);
		return error;
	}
#ifdef _MODULE
	error = config_cfdriver_detach(&raid_cd);
	if (error != 0) {
		aprint_error("%s: cannot detach cfdriver\n",__func__);
		/* restore the cfattach removed above */
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
#endif
	error = devsw_detach(&raid_bdevsw, &raid_cdevsw);
	if (error != 0) {
		aprint_error("%s: cannot detach devsw\n",__func__);
		/* restore cfdriver and cfattach removed above */
#ifdef _MODULE
		config_cfdriver_attach(&raid_cd);
#endif
		config_cfattach_attach(raid_cd.cd_name, &raid_ca);
		mutex_exit(&raid_lock);
		return error;
	}
	rf_BootRaidframe(false);
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_destroy_mutex2(rf_sparet_wait_mutex);
	rf_destroy_cond2(rf_sparet_wait_cv);
	rf_destroy_cond2(rf_sparet_resp_cv);
#endif
	mutex_exit(&raid_lock);
	mutex_destroy(&raid_lock);

	return error;
}