1 /* $NetBSD: rf_netbsdkintf.c,v 1.281 2011/02/08 20:20:27 rmind Exp $ */ 2 3 /*- 4 * Copyright (c) 1996, 1997, 1998, 2008 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Greg Oster; Jason R. Thorpe. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * Copyright (c) 1988 University of Utah. 34 * Copyright (c) 1990, 1993 35 * The Regents of the University of California. All rights reserved. 36 * 37 * This code is derived from software contributed to Berkeley by 38 * the Systems Programming Group of the University of Utah Computer 39 * Science Department. 40 * 41 * Redistribution and use in source and binary forms, with or without 42 * modification, are permitted provided that the following conditions 43 * are met: 44 * 1. Redistributions of source code must retain the above copyright 45 * notice, this list of conditions and the following disclaimer. 46 * 2. Redistributions in binary form must reproduce the above copyright 47 * notice, this list of conditions and the following disclaimer in the 48 * documentation and/or other materials provided with the distribution. 49 * 3. Neither the name of the University nor the names of its contributors 50 * may be used to endorse or promote products derived from this software 51 * without specific prior written permission. 52 * 53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 63 * SUCH DAMAGE. 64 * 65 * from: Utah $Hdr: cd.c 1.6 90/11/28$ 66 * 67 * @(#)cd.c 8.2 (Berkeley) 11/16/93 68 */ 69 70 /* 71 * Copyright (c) 1995 Carnegie-Mellon University. 72 * All rights reserved. 73 * 74 * Authors: Mark Holland, Jim Zelenka 75 * 76 * Permission to use, copy, modify and distribute this software and 77 * its documentation is hereby granted, provided that both the copyright 78 * notice and this permission notice appear in all copies of the 79 * software, derivative works or modified versions, and any portions 80 * thereof, and that both notices appear in supporting documentation. 81 * 82 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 83 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 84 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 85 * 86 * Carnegie Mellon requests users of this software to return to 87 * 88 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 89 * School of Computer Science 90 * Carnegie Mellon University 91 * Pittsburgh PA 15213-3890 92 * 93 * any improvements or extensions that they make and grant Carnegie the 94 * rights to redistribute these changes. 95 */ 96 97 /*********************************************************** 98 * 99 * rf_kintf.c -- the kernel interface routines for RAIDframe 100 * 101 ***********************************************************/ 102 103 #include <sys/cdefs.h> 104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.281 2011/02/08 20:20:27 rmind Exp $"); 105 106 #ifdef _KERNEL_OPT 107 #include "opt_compat_netbsd.h" 108 #include "opt_raid_autoconfig.h" 109 #include "raid.h" 110 #endif 111 112 #include <sys/param.h> 113 #include <sys/errno.h> 114 #include <sys/pool.h> 115 #include <sys/proc.h> 116 #include <sys/queue.h> 117 #include <sys/disk.h> 118 #include <sys/device.h> 119 #include <sys/stat.h> 120 #include <sys/ioctl.h> 121 #include <sys/fcntl.h> 122 #include <sys/systm.h> 123 #include <sys/vnode.h> 124 #include <sys/disklabel.h> 125 #include <sys/conf.h> 126 #include <sys/buf.h> 127 #include <sys/bufq.h> 128 #include <sys/reboot.h> 129 #include <sys/kauth.h> 130 131 #include <prop/proplib.h> 132 133 #include <dev/raidframe/raidframevar.h> 134 #include <dev/raidframe/raidframeio.h> 135 #include <dev/raidframe/rf_paritymap.h> 136 137 #include "rf_raid.h" 138 #include "rf_copyback.h" 139 #include "rf_dag.h" 140 #include "rf_dagflags.h" 141 #include "rf_desc.h" 142 #include "rf_diskqueue.h" 143 #include "rf_etimer.h" 144 #include "rf_general.h" 145 #include "rf_kintf.h" 146 #include "rf_options.h" 147 #include "rf_driver.h" 148 #include "rf_parityscan.h" 149 #include "rf_threadstuff.h" 150 151 #ifdef COMPAT_50 152 #include "rf_compat50.h" 153 #endif 154 155 #ifdef DEBUG 156 int rf_kdebug_level = 0; 157 #define db1_printf(a) if (rf_kdebug_level > 0) printf a 158 #else /* DEBUG */ 159 #define db1_printf(a) { } 160 #endif /* DEBUG */ 161 162 static RF_Raid_t **raidPtrs; /* global raid device descriptors */ 163 164 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0) 165 RF_DECLARE_STATIC_MUTEX(rf_sparet_wait_mutex) 166 167 static RF_SparetWait_t *rf_sparet_wait_queue; /* requests to install a 168 * spare table */ 169 static RF_SparetWait_t *rf_sparet_resp_queue; /* responses from 170 * installation process */ 171 #endif 172 173 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures"); 174 175 /* prototypes */ 176 static void KernelWakeupFunc(struct buf *); 177 static void InitBP(struct buf *, struct vnode *, unsigned, 178 dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *), 179 void *, int, struct proc *); 180 static void raidinit(RF_Raid_t *); 181 182 void raidattach(int); 183 static int raid_match(device_t, cfdata_t, void *); 184 static void raid_attach(device_t, device_t, void *); 185 static int raid_detach(device_t, int); 186 187 static int raidread_component_area(dev_t, struct vnode *, void *, size_t, 188 daddr_t, daddr_t); 189 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t, 190 daddr_t, daddr_t, int); 191 192 static int raidwrite_component_label(unsigned, 193 dev_t, struct vnode *, RF_ComponentLabel_t *); 194 static int raidread_component_label(unsigned, 195 dev_t, struct vnode *, RF_ComponentLabel_t *); 196 197 198 dev_type_open(raidopen); 199 dev_type_close(raidclose); 200 dev_type_read(raidread); 201 dev_type_write(raidwrite); 202 dev_type_ioctl(raidioctl); 203 dev_type_strategy(raidstrategy); 204 dev_type_dump(raiddump); 205 dev_type_size(raidsize); 206 207 const struct bdevsw raid_bdevsw = { 208 raidopen, raidclose, raidstrategy, raidioctl, 209 raiddump, raidsize, D_DISK 210 }; 211 212 const struct cdevsw raid_cdevsw = { 213 raidopen, raidclose, raidread, raidwrite, raidioctl, 214 nostop, notty, nopoll, nommap, nokqfilter, D_DISK 215 }; 216 217 static struct dkdriver rf_dkdriver = { raidstrategy, minphys }; 218 219 /* XXX Not sure if the following should be replacing the raidPtrs above, 220 or if it should be used in conjunction with that... 221 */ 222 223 struct raid_softc { 224 device_t sc_dev; 225 int sc_flags; /* flags */ 226 int sc_cflags; /* configuration flags */ 227 uint64_t sc_size; /* size of the raid device */ 228 char sc_xname[20]; /* XXX external name */ 229 struct disk sc_dkdev; /* generic disk device info */ 230 struct bufq_state *buf_queue; /* used for the device queue */ 231 }; 232 /* sc_flags */ 233 #define RAIDF_INITED 0x01 /* unit has been initialized */ 234 #define RAIDF_WLABEL 0x02 /* label area is writable */ 235 #define RAIDF_LABELLING 0x04 /* unit is currently being labelled */ 236 #define RAIDF_SHUTDOWN 0x08 /* unit is being shutdown */ 237 #define RAIDF_WANTED 0x40 /* someone is waiting to obtain a lock */ 238 #define RAIDF_LOCKED 0x80 /* unit is locked */ 239 240 #define raidunit(x) DISKUNIT(x) 241 int numraid = 0; 242 243 extern struct cfdriver raid_cd; 244 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc), 245 raid_match, raid_attach, raid_detach, NULL, NULL, NULL, 246 DVF_DETACH_SHUTDOWN); 247 248 /* 249 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device. 250 * Be aware that large numbers can allow the driver to consume a lot of 251 * kernel memory, especially on writes, and in degraded mode reads. 252 * 253 * For example: with a stripe width of 64 blocks (32k) and 5 disks, 254 * a single 64K write will typically require 64K for the old data, 255 * 64K for the old parity, and 64K for the new parity, for a total 256 * of 192K (if the parity buffer is not re-used immediately). 257 * Even it if is used immediately, that's still 128K, which when multiplied 258 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data. 259 * 260 * Now in degraded mode, for example, a 64K read on the above setup may 261 * require data reconstruction, which will require *all* of the 4 remaining 262 * disks to participate -- 4 * 32K/disk == 128K again. 263 */ 264 265 #ifndef RAIDOUTSTANDING 266 #define RAIDOUTSTANDING 6 267 #endif 268 269 #define RAIDLABELDEV(dev) \ 270 (MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART)) 271 272 /* declared here, and made public, for the benefit of KVM stuff.. */ 273 struct raid_softc *raid_softc; 274 275 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *, 276 struct disklabel *); 277 static void raidgetdisklabel(dev_t); 278 static void raidmakedisklabel(struct raid_softc *); 279 280 static int raidlock(struct raid_softc *); 281 static void raidunlock(struct raid_softc *); 282 283 static int raid_detach_unlocked(struct raid_softc *); 284 285 static void rf_markalldirty(RF_Raid_t *); 286 static void rf_set_properties(struct raid_softc *, RF_Raid_t *); 287 288 void rf_ReconThread(struct rf_recon_req *); 289 void rf_RewriteParityThread(RF_Raid_t *raidPtr); 290 void rf_CopybackThread(RF_Raid_t *raidPtr); 291 void rf_ReconstructInPlaceThread(struct rf_recon_req *); 292 int rf_autoconfig(device_t); 293 void rf_buildroothack(RF_ConfigSet_t *); 294 295 RF_AutoConfig_t *rf_find_raid_components(void); 296 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *); 297 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *); 298 static int rf_reasonable_label(RF_ComponentLabel_t *); 299 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *); 300 int rf_set_autoconfig(RF_Raid_t *, int); 301 int rf_set_rootpartition(RF_Raid_t *, int); 302 void rf_release_all_vps(RF_ConfigSet_t *); 303 void rf_cleanup_config_set(RF_ConfigSet_t *); 304 int rf_have_enough_components(RF_ConfigSet_t *); 305 int rf_auto_config_set(RF_ConfigSet_t *, int *); 306 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t); 307 308 static int raidautoconfig = 0; /* Debugging, mostly. Set to 0 to not 309 allow autoconfig to take place. 310 Note that this is overridden by having 311 RAID_AUTOCONFIG as an option in the 312 kernel config file. */ 313 314 struct RF_Pools_s rf_pools; 315 316 void 317 raidattach(int num) 318 { 319 int raidID; 320 int i, rc; 321 322 aprint_debug("raidattach: Asked for %d units\n", num); 323 324 if (num <= 0) { 325 #ifdef DIAGNOSTIC 326 panic("raidattach: count <= 0"); 327 #endif 328 return; 329 } 330 /* This is where all the initialization stuff gets done. */ 331 332 numraid = num; 333 334 /* Make some space for requested number of units... */ 335 336 RF_Malloc(raidPtrs, num * sizeof(RF_Raid_t *), (RF_Raid_t **)); 337 if (raidPtrs == NULL) { 338 panic("raidPtrs is NULL!!"); 339 } 340 341 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0) 342 rf_mutex_init(&rf_sparet_wait_mutex); 343 344 rf_sparet_wait_queue = rf_sparet_resp_queue = NULL; 345 #endif 346 347 for (i = 0; i < num; i++) 348 raidPtrs[i] = NULL; 349 rc = rf_BootRaidframe(); 350 if (rc == 0) 351 aprint_verbose("Kernelized RAIDframe activated\n"); 352 else 353 panic("Serious error booting RAID!!"); 354 355 /* put together some datastructures like the CCD device does.. This 356 * lets us lock the device and what-not when it gets opened. */ 357 358 raid_softc = (struct raid_softc *) 359 malloc(num * sizeof(struct raid_softc), 360 M_RAIDFRAME, M_NOWAIT); 361 if (raid_softc == NULL) { 362 aprint_error("WARNING: no memory for RAIDframe driver\n"); 363 return; 364 } 365 366 memset(raid_softc, 0, num * sizeof(struct raid_softc)); 367 368 for (raidID = 0; raidID < num; raidID++) { 369 bufq_alloc(&raid_softc[raidID].buf_queue, "fcfs", 0); 370 371 RF_Malloc(raidPtrs[raidID], sizeof(RF_Raid_t), 372 (RF_Raid_t *)); 373 if (raidPtrs[raidID] == NULL) { 374 aprint_error("WARNING: raidPtrs[%d] is NULL\n", raidID); 375 numraid = raidID; 376 return; 377 } 378 } 379 380 if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) { 381 aprint_error("raidattach: config_cfattach_attach failed?\n"); 382 } 383 384 #ifdef RAID_AUTOCONFIG 385 raidautoconfig = 1; 386 #endif 387 388 /* 389 * Register a finalizer which will be used to auto-config RAID 390 * sets once all real hardware devices have been found. 391 */ 392 if (config_finalize_register(NULL, rf_autoconfig) != 0) 393 aprint_error("WARNING: unable to register RAIDframe finalizer\n"); 394 } 395 396 int 397 rf_autoconfig(device_t self) 398 { 399 RF_AutoConfig_t *ac_list; 400 RF_ConfigSet_t *config_sets; 401 402 if (raidautoconfig == 0) 403 return (0); 404 405 /* XXX This code can only be run once. */ 406 raidautoconfig = 0; 407 408 /* 1. locate all RAID components on the system */ 409 aprint_debug("Searching for RAID components...\n"); 410 ac_list = rf_find_raid_components(); 411 412 /* 2. Sort them into their respective sets. */ 413 config_sets = rf_create_auto_sets(ac_list); 414 415 /* 416 * 3. Evaluate each set andconfigure the valid ones. 417 * This gets done in rf_buildroothack(). 418 */ 419 rf_buildroothack(config_sets); 420 421 return 1; 422 } 423 424 void 425 rf_buildroothack(RF_ConfigSet_t *config_sets) 426 { 427 RF_ConfigSet_t *cset; 428 RF_ConfigSet_t *next_cset; 429 int retcode; 430 int raidID; 431 int rootID; 432 int col; 433 int num_root; 434 char *devname; 435 436 rootID = 0; 437 num_root = 0; 438 cset = config_sets; 439 while (cset != NULL) { 440 next_cset = cset->next; 441 if (rf_have_enough_components(cset) && 442 cset->ac->clabel->autoconfigure==1) { 443 retcode = rf_auto_config_set(cset,&raidID); 444 if (!retcode) { 445 aprint_debug("raid%d: configured ok\n", raidID); 446 if (cset->rootable) { 447 rootID = raidID; 448 num_root++; 449 } 450 } else { 451 /* The autoconfig didn't work :( */ 452 aprint_debug("Autoconfig failed with code %d for raid%d\n", retcode, raidID); 453 rf_release_all_vps(cset); 454 } 455 } else { 456 /* we're not autoconfiguring this set... 457 release the associated resources */ 458 rf_release_all_vps(cset); 459 } 460 /* cleanup */ 461 rf_cleanup_config_set(cset); 462 cset = next_cset; 463 } 464 465 /* if the user has specified what the root device should be 466 then we don't touch booted_device or boothowto... */ 467 468 if (rootspec != NULL) 469 return; 470 471 /* we found something bootable... */ 472 473 if (num_root == 1) { 474 booted_device = raid_softc[rootID].sc_dev; 475 } else if (num_root > 1) { 476 477 /* 478 * Maybe the MD code can help. If it cannot, then 479 * setroot() will discover that we have no 480 * booted_device and will ask the user if nothing was 481 * hardwired in the kernel config file 482 */ 483 484 if (booted_device == NULL) 485 cpu_rootconf(); 486 if (booted_device == NULL) 487 return; 488 489 num_root = 0; 490 for (raidID = 0; raidID < numraid; raidID++) { 491 if (raidPtrs[raidID]->valid == 0) 492 continue; 493 494 if (raidPtrs[raidID]->root_partition == 0) 495 continue; 496 497 for (col = 0; col < raidPtrs[raidID]->numCol; col++) { 498 devname = raidPtrs[raidID]->Disks[col].devname; 499 devname += sizeof("/dev/") - 1; 500 if (strncmp(devname, device_xname(booted_device), 501 strlen(device_xname(booted_device))) != 0) 502 continue; 503 aprint_debug("raid%d includes boot device %s\n", 504 raidID, devname); 505 num_root++; 506 rootID = raidID; 507 } 508 } 509 510 if (num_root == 1) { 511 booted_device = raid_softc[rootID].sc_dev; 512 } else { 513 /* we can't guess.. require the user to answer... */ 514 boothowto |= RB_ASKNAME; 515 } 516 } 517 } 518 519 520 int 521 raidsize(dev_t dev) 522 { 523 struct raid_softc *rs; 524 struct disklabel *lp; 525 int part, unit, omask, size; 526 527 unit = raidunit(dev); 528 if (unit >= numraid) 529 return (-1); 530 rs = &raid_softc[unit]; 531 532 if ((rs->sc_flags & RAIDF_INITED) == 0) 533 return (-1); 534 535 part = DISKPART(dev); 536 omask = rs->sc_dkdev.dk_openmask & (1 << part); 537 lp = rs->sc_dkdev.dk_label; 538 539 if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp)) 540 return (-1); 541 542 if (lp->d_partitions[part].p_fstype != FS_SWAP) 543 size = -1; 544 else 545 size = lp->d_partitions[part].p_size * 546 (lp->d_secsize / DEV_BSIZE); 547 548 if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp)) 549 return (-1); 550 551 return (size); 552 553 } 554 555 int 556 raiddump(dev_t dev, daddr_t blkno, void *va, size_t size) 557 { 558 int unit = raidunit(dev); 559 struct raid_softc *rs; 560 const struct bdevsw *bdev; 561 struct disklabel *lp; 562 RF_Raid_t *raidPtr; 563 daddr_t offset; 564 int part, c, sparecol, j, scol, dumpto; 565 int error = 0; 566 567 if (unit >= numraid) 568 return (ENXIO); 569 570 rs = &raid_softc[unit]; 571 raidPtr = raidPtrs[unit]; 572 573 if ((rs->sc_flags & RAIDF_INITED) == 0) 574 return ENXIO; 575 576 /* we only support dumping to RAID 1 sets */ 577 if (raidPtr->Layout.numDataCol != 1 || 578 raidPtr->Layout.numParityCol != 1) 579 return EINVAL; 580 581 582 if ((error = raidlock(rs)) != 0) 583 return error; 584 585 if (size % DEV_BSIZE != 0) { 586 error = EINVAL; 587 goto out; 588 } 589 590 if (blkno + size / DEV_BSIZE > rs->sc_size) { 591 printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > " 592 "sc->sc_size (%" PRIu64 ")\n", __func__, blkno, 593 size / DEV_BSIZE, rs->sc_size); 594 error = EINVAL; 595 goto out; 596 } 597 598 part = DISKPART(dev); 599 lp = rs->sc_dkdev.dk_label; 600 offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS; 601 602 /* figure out what device is alive.. */ 603 604 /* 605 Look for a component to dump to. The preference for the 606 component to dump to is as follows: 607 1) the master 608 2) a used_spare of the master 609 3) the slave 610 4) a used_spare of the slave 611 */ 612 613 dumpto = -1; 614 for (c = 0; c < raidPtr->numCol; c++) { 615 if (raidPtr->Disks[c].status == rf_ds_optimal) { 616 /* this might be the one */ 617 dumpto = c; 618 break; 619 } 620 } 621 622 /* 623 At this point we have possibly selected a live master or a 624 live slave. We now check to see if there is a spared 625 master (or a spared slave), if we didn't find a live master 626 or a live slave. 627 */ 628 629 for (c = 0; c < raidPtr->numSpare; c++) { 630 sparecol = raidPtr->numCol + c; 631 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) { 632 /* How about this one? */ 633 scol = -1; 634 for(j=0;j<raidPtr->numCol;j++) { 635 if (raidPtr->Disks[j].spareCol == sparecol) { 636 scol = j; 637 break; 638 } 639 } 640 if (scol == 0) { 641 /* 642 We must have found a spared master! 643 We'll take that over anything else 644 found so far. (We couldn't have 645 found a real master before, since 646 this is a used spare, and it's 647 saying that it's replacing the 648 master.) On reboot (with 649 autoconfiguration turned on) 650 sparecol will become the 1st 651 component (component0) of this set. 652 */ 653 dumpto = sparecol; 654 break; 655 } else if (scol != -1) { 656 /* 657 Must be a spared slave. We'll dump 658 to that if we havn't found anything 659 else so far. 660 */ 661 if (dumpto == -1) 662 dumpto = sparecol; 663 } 664 } 665 } 666 667 if (dumpto == -1) { 668 /* we couldn't find any live components to dump to!?!? 669 */ 670 error = EINVAL; 671 goto out; 672 } 673 674 bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev); 675 676 /* 677 Note that blkno is relative to this particular partition. 678 By adding the offset of this partition in the RAID 679 set, and also adding RF_PROTECTED_SECTORS, we get a 680 value that is relative to the partition used for the 681 underlying component. 682 */ 683 684 error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev, 685 blkno + offset, va, size); 686 687 out: 688 raidunlock(rs); 689 690 return error; 691 } 692 /* ARGSUSED */ 693 int 694 raidopen(dev_t dev, int flags, int fmt, 695 struct lwp *l) 696 { 697 int unit = raidunit(dev); 698 struct raid_softc *rs; 699 struct disklabel *lp; 700 int part, pmask; 701 int error = 0; 702 703 if (unit >= numraid) 704 return (ENXIO); 705 rs = &raid_softc[unit]; 706 707 if ((error = raidlock(rs)) != 0) 708 return (error); 709 710 if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) { 711 error = EBUSY; 712 goto bad; 713 } 714 715 lp = rs->sc_dkdev.dk_label; 716 717 part = DISKPART(dev); 718 719 /* 720 * If there are wedges, and this is not RAW_PART, then we 721 * need to fail. 722 */ 723 if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) { 724 error = EBUSY; 725 goto bad; 726 } 727 pmask = (1 << part); 728 729 if ((rs->sc_flags & RAIDF_INITED) && 730 (rs->sc_dkdev.dk_openmask == 0)) 731 raidgetdisklabel(dev); 732 733 /* make sure that this partition exists */ 734 735 if (part != RAW_PART) { 736 if (((rs->sc_flags & RAIDF_INITED) == 0) || 737 ((part >= lp->d_npartitions) || 738 (lp->d_partitions[part].p_fstype == FS_UNUSED))) { 739 error = ENXIO; 740 goto bad; 741 } 742 } 743 /* Prevent this unit from being unconfigured while open. */ 744 switch (fmt) { 745 case S_IFCHR: 746 rs->sc_dkdev.dk_copenmask |= pmask; 747 break; 748 749 case S_IFBLK: 750 rs->sc_dkdev.dk_bopenmask |= pmask; 751 break; 752 } 753 754 if ((rs->sc_dkdev.dk_openmask == 0) && 755 ((rs->sc_flags & RAIDF_INITED) != 0)) { 756 /* First one... mark things as dirty... Note that we *MUST* 757 have done a configure before this. I DO NOT WANT TO BE 758 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED 759 THAT THEY BELONG TOGETHER!!!!! */ 760 /* XXX should check to see if we're only open for reading 761 here... If so, we needn't do this, but then need some 762 other way of keeping track of what's happened.. */ 763 764 rf_markalldirty(raidPtrs[unit]); 765 } 766 767 768 rs->sc_dkdev.dk_openmask = 769 rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask; 770 771 bad: 772 raidunlock(rs); 773 774 return (error); 775 776 777 } 778 /* ARGSUSED */ 779 int 780 raidclose(dev_t dev, int flags, int fmt, struct lwp *l) 781 { 782 int unit = raidunit(dev); 783 struct raid_softc *rs; 784 int error = 0; 785 int part; 786 787 if (unit >= numraid) 788 return (ENXIO); 789 rs = &raid_softc[unit]; 790 791 if ((error = raidlock(rs)) != 0) 792 return (error); 793 794 part = DISKPART(dev); 795 796 /* ...that much closer to allowing unconfiguration... */ 797 switch (fmt) { 798 case S_IFCHR: 799 rs->sc_dkdev.dk_copenmask &= ~(1 << part); 800 break; 801 802 case S_IFBLK: 803 rs->sc_dkdev.dk_bopenmask &= ~(1 << part); 804 break; 805 } 806 rs->sc_dkdev.dk_openmask = 807 rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask; 808 809 if ((rs->sc_dkdev.dk_openmask == 0) && 810 ((rs->sc_flags & RAIDF_INITED) != 0)) { 811 /* Last one... device is not unconfigured yet. 812 Device shutdown has taken care of setting the 813 clean bits if RAIDF_INITED is not set 814 mark things as clean... */ 815 816 rf_update_component_labels(raidPtrs[unit], 817 RF_FINAL_COMPONENT_UPDATE); 818 819 /* If the kernel is shutting down, it will detach 820 * this RAID set soon enough. 821 */ 822 } 823 824 raidunlock(rs); 825 return (0); 826 827 } 828 829 void 830 raidstrategy(struct buf *bp) 831 { 832 int s; 833 834 unsigned int raidID = raidunit(bp->b_dev); 835 RF_Raid_t *raidPtr; 836 struct raid_softc *rs = &raid_softc[raidID]; 837 int wlabel; 838 839 if ((rs->sc_flags & RAIDF_INITED) ==0) { 840 bp->b_error = ENXIO; 841 goto done; 842 } 843 if (raidID >= numraid || !raidPtrs[raidID]) { 844 bp->b_error = ENODEV; 845 goto done; 846 } 847 raidPtr = raidPtrs[raidID]; 848 if (!raidPtr->valid) { 849 bp->b_error = ENODEV; 850 goto done; 851 } 852 if (bp->b_bcount == 0) { 853 db1_printf(("b_bcount is zero..\n")); 854 goto done; 855 } 856 857 /* 858 * Do bounds checking and adjust transfer. If there's an 859 * error, the bounds check will flag that for us. 860 */ 861 862 wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING); 863 if (DISKPART(bp->b_dev) == RAW_PART) { 864 uint64_t size; /* device size in DEV_BSIZE unit */ 865 866 if (raidPtr->logBytesPerSector > DEV_BSHIFT) { 867 size = raidPtr->totalSectors << 868 (raidPtr->logBytesPerSector - DEV_BSHIFT); 869 } else { 870 size = raidPtr->totalSectors >> 871 (DEV_BSHIFT - raidPtr->logBytesPerSector); 872 } 873 if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) { 874 goto done; 875 } 876 } else { 877 if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) { 878 db1_printf(("Bounds check failed!!:%d %d\n", 879 (int) bp->b_blkno, (int) wlabel)); 880 goto done; 881 } 882 } 883 s = splbio(); 884 885 bp->b_resid = 0; 886 887 /* stuff it onto our queue */ 888 bufq_put(rs->buf_queue, bp); 889 890 /* scheduled the IO to happen at the next convenient time */ 891 wakeup(&(raidPtrs[raidID]->iodone)); 892 893 splx(s); 894 return; 895 896 done: 897 bp->b_resid = bp->b_bcount; 898 biodone(bp); 899 } 900 /* ARGSUSED */ 901 int 902 raidread(dev_t dev, struct uio *uio, int flags) 903 { 904 int unit = raidunit(dev); 905 struct raid_softc *rs; 906 907 if (unit >= numraid) 908 return (ENXIO); 909 rs = &raid_softc[unit]; 910 911 if ((rs->sc_flags & RAIDF_INITED) == 0) 912 return (ENXIO); 913 914 return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio)); 915 916 } 917 /* ARGSUSED */ 918 int 919 raidwrite(dev_t dev, struct uio *uio, int flags) 920 { 921 int unit = raidunit(dev); 922 struct raid_softc *rs; 923 924 if (unit >= numraid) 925 return (ENXIO); 926 rs = &raid_softc[unit]; 927 928 if ((rs->sc_flags & RAIDF_INITED) == 0) 929 return (ENXIO); 930 931 return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio)); 932 933 } 934 935 static int 936 raid_detach_unlocked(struct raid_softc *rs) 937 { 938 int error; 939 RF_Raid_t *raidPtr; 940 941 raidPtr = raidPtrs[device_unit(rs->sc_dev)]; 942 943 /* 944 * If somebody has a partition mounted, we shouldn't 945 * shutdown. 946 */ 947 if (rs->sc_dkdev.dk_openmask != 0) 948 return EBUSY; 949 950 if ((rs->sc_flags & RAIDF_INITED) == 0) 951 ; /* not initialized: nothing to do */ 952 else if ((error = rf_Shutdown(raidPtr)) != 0) 953 return error; 954 else 955 rs->sc_flags &= ~(RAIDF_INITED|RAIDF_SHUTDOWN); 956 957 /* Detach the disk. */ 958 dkwedge_delall(&rs->sc_dkdev); 959 disk_detach(&rs->sc_dkdev); 960 disk_destroy(&rs->sc_dkdev); 961 962 return 0; 963 } 964 965 int 966 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l) 967 { 968 int unit = raidunit(dev); 969 int error = 0; 970 int part, pmask; 971 cfdata_t cf; 972 struct raid_softc *rs; 973 RF_Config_t *k_cfg, *u_cfg; 974 RF_Raid_t *raidPtr; 975 RF_RaidDisk_t *diskPtr; 976 RF_AccTotals_t *totals; 977 RF_DeviceConfig_t *d_cfg, **ucfgp; 978 u_char *specific_buf; 979 int retcode = 0; 980 int column; 981 /* int raidid; */ 982 struct rf_recon_req *rrcopy, *rr; 983 RF_ComponentLabel_t *clabel; 984 RF_ComponentLabel_t *ci_label; 985 RF_ComponentLabel_t **clabel_ptr; 986 RF_SingleComponent_t *sparePtr,*componentPtr; 987 RF_SingleComponent_t component; 988 RF_ProgressInfo_t progressInfo, **progressInfoPtr; 989 int i, j, d; 990 #ifdef __HAVE_OLD_DISKLABEL 991 struct disklabel newlabel; 992 #endif 993 struct dkwedge_info *dkw; 994 995 if (unit >= numraid) 996 return (ENXIO); 997 rs = &raid_softc[unit]; 998 raidPtr = raidPtrs[unit]; 999 1000 db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev, 1001 (int) DISKPART(dev), (int) unit, cmd)); 1002 1003 /* Must be open for writes for these commands... */ 1004 switch (cmd) { 1005 #ifdef DIOCGSECTORSIZE 1006 case DIOCGSECTORSIZE: 1007 *(u_int *)data = raidPtr->bytesPerSector; 1008 return 0; 1009 case DIOCGMEDIASIZE: 1010 *(off_t *)data = 1011 (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector; 1012 return 0; 1013 #endif 1014 case DIOCSDINFO: 1015 case DIOCWDINFO: 1016 #ifdef __HAVE_OLD_DISKLABEL 1017 case ODIOCWDINFO: 1018 case ODIOCSDINFO: 1019 #endif 1020 case DIOCWLABEL: 1021 case DIOCAWEDGE: 1022 case DIOCDWEDGE: 1023 if ((flag & FWRITE) == 0) 1024 return (EBADF); 1025 } 1026 1027 /* Must be initialized for these... */ 1028 switch (cmd) { 1029 case DIOCGDINFO: 1030 case DIOCSDINFO: 1031 case DIOCWDINFO: 1032 #ifdef __HAVE_OLD_DISKLABEL 1033 case ODIOCGDINFO: 1034 case ODIOCWDINFO: 1035 case ODIOCSDINFO: 1036 case ODIOCGDEFLABEL: 1037 #endif 1038 case DIOCGPART: 1039 case DIOCWLABEL: 1040 case DIOCGDEFLABEL: 1041 case DIOCAWEDGE: 1042 case DIOCDWEDGE: 1043 case DIOCLWEDGES: 1044 case DIOCCACHESYNC: 1045 case RAIDFRAME_SHUTDOWN: 1046 case RAIDFRAME_REWRITEPARITY: 1047 case RAIDFRAME_GET_INFO: 1048 case RAIDFRAME_RESET_ACCTOTALS: 1049 case RAIDFRAME_GET_ACCTOTALS: 1050 case RAIDFRAME_KEEP_ACCTOTALS: 1051 case RAIDFRAME_GET_SIZE: 1052 case RAIDFRAME_FAIL_DISK: 1053 case RAIDFRAME_COPYBACK: 1054 case RAIDFRAME_CHECK_RECON_STATUS: 1055 case RAIDFRAME_CHECK_RECON_STATUS_EXT: 1056 case RAIDFRAME_GET_COMPONENT_LABEL: 1057 case RAIDFRAME_SET_COMPONENT_LABEL: 1058 case RAIDFRAME_ADD_HOT_SPARE: 1059 case RAIDFRAME_REMOVE_HOT_SPARE: 1060 case RAIDFRAME_INIT_LABELS: 1061 case RAIDFRAME_REBUILD_IN_PLACE: 1062 case RAIDFRAME_CHECK_PARITY: 1063 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS: 1064 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT: 1065 case RAIDFRAME_CHECK_COPYBACK_STATUS: 1066 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT: 1067 case RAIDFRAME_SET_AUTOCONFIG: 1068 case RAIDFRAME_SET_ROOT: 1069 case RAIDFRAME_DELETE_COMPONENT: 1070 case RAIDFRAME_INCORPORATE_HOT_SPARE: 1071 case RAIDFRAME_PARITYMAP_STATUS: 1072 case RAIDFRAME_PARITYMAP_GET_DISABLE: 1073 case RAIDFRAME_PARITYMAP_SET_DISABLE: 1074 case RAIDFRAME_PARITYMAP_SET_PARAMS: 1075 if ((rs->sc_flags & RAIDF_INITED) == 0) 1076 return (ENXIO); 1077 } 1078 1079 switch (cmd) { 1080 #ifdef COMPAT_50 1081 case RAIDFRAME_GET_INFO50: 1082 return rf_get_info50(raidPtr, data); 1083 1084 case RAIDFRAME_CONFIGURE50: 1085 if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0) 1086 return retcode; 1087 goto config; 1088 #endif 1089 /* configure the system */ 1090 case RAIDFRAME_CONFIGURE: 1091 1092 if (raidPtr->valid) { 1093 /* There is a valid RAID set running on this unit! */ 1094 printf("raid%d: Device already configured!\n",unit); 1095 return(EINVAL); 1096 } 1097 1098 /* copy-in the configuration information */ 1099 /* data points to a pointer to the configuration structure */ 1100 1101 u_cfg = *((RF_Config_t **) data); 1102 RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *)); 1103 if (k_cfg == NULL) { 1104 return (ENOMEM); 1105 } 1106 retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t)); 1107 if (retcode) { 1108 RF_Free(k_cfg, sizeof(RF_Config_t)); 1109 db1_printf(("rf_ioctl: retcode=%d copyin.1\n", 1110 retcode)); 1111 return (retcode); 1112 } 1113 goto config; 1114 config: 1115 /* allocate a buffer for the layout-specific data, and copy it 1116 * in */ 1117 if (k_cfg->layoutSpecificSize) { 1118 if (k_cfg->layoutSpecificSize > 10000) { 1119 /* sanity check */ 1120 RF_Free(k_cfg, sizeof(RF_Config_t)); 1121 return (EINVAL); 1122 } 1123 RF_Malloc(specific_buf, k_cfg->layoutSpecificSize, 1124 (u_char *)); 1125 if (specific_buf == NULL) { 1126 RF_Free(k_cfg, sizeof(RF_Config_t)); 1127 return (ENOMEM); 1128 } 1129 retcode = copyin(k_cfg->layoutSpecific, specific_buf, 1130 k_cfg->layoutSpecificSize); 1131 if (retcode) { 1132 RF_Free(k_cfg, sizeof(RF_Config_t)); 1133 RF_Free(specific_buf, 1134 k_cfg->layoutSpecificSize); 1135 db1_printf(("rf_ioctl: retcode=%d copyin.2\n", 1136 retcode)); 1137 return (retcode); 1138 } 1139 } else 1140 specific_buf = NULL; 1141 k_cfg->layoutSpecific = specific_buf; 1142 1143 /* should do some kind of sanity check on the configuration. 1144 * Store the sum of all the bytes in the last byte? */ 1145 1146 /* configure the system */ 1147 1148 /* 1149 * Clear the entire RAID descriptor, just to make sure 1150 * there is no stale data left in the case of a 1151 * reconfiguration 1152 */ 1153 memset(raidPtr, 0, sizeof(*raidPtr)); 1154 raidPtr->raidid = unit; 1155 1156 retcode = rf_Configure(raidPtr, k_cfg, NULL); 1157 1158 if (retcode == 0) { 1159 1160 /* allow this many simultaneous IO's to 1161 this RAID device */ 1162 raidPtr->openings = RAIDOUTSTANDING; 1163 1164 raidinit(raidPtr); 1165 rf_markalldirty(raidPtr); 1166 } 1167 /* free the buffers. No return code here. */ 1168 if (k_cfg->layoutSpecificSize) { 1169 RF_Free(specific_buf, k_cfg->layoutSpecificSize); 1170 } 1171 RF_Free(k_cfg, sizeof(RF_Config_t)); 1172 1173 return (retcode); 1174 1175 /* shutdown the system */ 1176 case RAIDFRAME_SHUTDOWN: 1177 1178 part = DISKPART(dev); 1179 pmask = (1 << part); 1180 1181 if ((error = raidlock(rs)) != 0) 1182 return (error); 1183 1184 if ((rs->sc_dkdev.dk_openmask & ~pmask) || 1185 ((rs->sc_dkdev.dk_bopenmask & pmask) && 1186 (rs->sc_dkdev.dk_copenmask & pmask))) 1187 retcode = EBUSY; 1188 else { 1189 rs->sc_flags |= RAIDF_SHUTDOWN; 1190 rs->sc_dkdev.dk_copenmask &= ~pmask; 1191 rs->sc_dkdev.dk_bopenmask &= ~pmask; 1192 rs->sc_dkdev.dk_openmask &= ~pmask; 1193 retcode = 0; 1194 } 1195 1196 raidunlock(rs); 1197 1198 if (retcode != 0) 1199 return retcode; 1200 1201 /* free the pseudo device attach bits */ 1202 1203 cf = device_cfdata(rs->sc_dev); 1204 if ((retcode = config_detach(rs->sc_dev, DETACH_QUIET)) == 0) 1205 free(cf, M_RAIDFRAME); 1206 1207 return (retcode); 1208 case RAIDFRAME_GET_COMPONENT_LABEL: 1209 clabel_ptr = (RF_ComponentLabel_t **) data; 1210 /* need to read the component label for the disk indicated 1211 by row,column in clabel */ 1212 1213 /* 1214 * Perhaps there should be an option to skip the in-core 1215 * copy and hit the disk, as with disklabel(8). 1216 */ 1217 RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *)); 1218 1219 retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel)); 1220 1221 if (retcode) { 1222 RF_Free(clabel, sizeof(*clabel)); 1223 return retcode; 1224 } 1225 1226 clabel->row = 0; /* Don't allow looking at anything else.*/ 1227 1228 column = clabel->column; 1229 1230 if ((column < 0) || (column >= raidPtr->numCol + 1231 raidPtr->numSpare)) { 1232 RF_Free(clabel, sizeof(*clabel)); 1233 return EINVAL; 1234 } 1235 1236 RF_Free(clabel, sizeof(*clabel)); 1237 1238 clabel = raidget_component_label(raidPtr, column); 1239 1240 return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr)); 1241 1242 #if 0 1243 case RAIDFRAME_SET_COMPONENT_LABEL: 1244 clabel = (RF_ComponentLabel_t *) data; 1245 1246 /* XXX check the label for valid stuff... */ 1247 /* Note that some things *should not* get modified -- 1248 the user should be re-initing the labels instead of 1249 trying to patch things. 1250 */ 1251 1252 raidid = raidPtr->raidid; 1253 #ifdef DEBUG 1254 printf("raid%d: Got component label:\n", raidid); 1255 printf("raid%d: Version: %d\n", raidid, clabel->version); 1256 printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number); 1257 printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter); 1258 printf("raid%d: Column: %d\n", raidid, clabel->column); 1259 printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns); 1260 printf("raid%d: Clean: %d\n", raidid, clabel->clean); 1261 printf("raid%d: Status: %d\n", raidid, clabel->status); 1262 #endif 1263 clabel->row = 0; 1264 column = clabel->column; 1265 1266 if ((column < 0) || (column >= raidPtr->numCol)) { 1267 return(EINVAL); 1268 } 1269 1270 /* XXX this isn't allowed to do anything for now :-) */ 1271 1272 /* XXX and before it is, we need to fill in the rest 1273 of the fields!?!?!?! */ 1274 memcpy(raidget_component_label(raidPtr, column), 1275 clabel, sizeof(*clabel)); 1276 raidflush_component_label(raidPtr, column); 1277 return (0); 1278 #endif 1279 1280 case RAIDFRAME_INIT_LABELS: 1281 clabel = (RF_ComponentLabel_t *) data; 1282 /* 1283 we only want the serial number from 1284 the above. We get all the rest of the information 1285 from the config that was used to create this RAID 1286 set. 1287 */ 1288 1289 raidPtr->serial_number = clabel->serial_number; 1290 1291 for(column=0;column<raidPtr->numCol;column++) { 1292 diskPtr = &raidPtr->Disks[column]; 1293 if (!RF_DEAD_DISK(diskPtr->status)) { 1294 ci_label = raidget_component_label(raidPtr, 1295 column); 1296 /* Zeroing this is important. */ 1297 memset(ci_label, 0, sizeof(*ci_label)); 1298 raid_init_component_label(raidPtr, ci_label); 1299 ci_label->serial_number = 1300 raidPtr->serial_number; 1301 ci_label->row = 0; /* we dont' pretend to support more */ 1302 ci_label->partitionSize = 1303 diskPtr->partitionSize; 1304 ci_label->column = column; 1305 raidflush_component_label(raidPtr, column); 1306 } 1307 /* XXXjld what about the spares? */ 1308 } 1309 1310 return (retcode); 1311 case RAIDFRAME_SET_AUTOCONFIG: 1312 d = rf_set_autoconfig(raidPtr, *(int *) data); 1313 printf("raid%d: New autoconfig value is: %d\n", 1314 raidPtr->raidid, d); 1315 *(int *) data = d; 1316 return (retcode); 1317 1318 case RAIDFRAME_SET_ROOT: 1319 d = rf_set_rootpartition(raidPtr, *(int *) data); 1320 printf("raid%d: New rootpartition value is: %d\n", 1321 raidPtr->raidid, d); 1322 *(int *) data = d; 1323 return (retcode); 1324 1325 /* initialize all parity */ 1326 case RAIDFRAME_REWRITEPARITY: 1327 1328 if (raidPtr->Layout.map->faultsTolerated == 0) { 1329 /* Parity for RAID 0 is trivially correct */ 1330 raidPtr->parity_good = RF_RAID_CLEAN; 1331 return(0); 1332 } 1333 1334 if (raidPtr->parity_rewrite_in_progress == 1) { 1335 /* Re-write is already in progress! */ 1336 return(EINVAL); 1337 } 1338 1339 retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread, 1340 rf_RewriteParityThread, 1341 raidPtr,"raid_parity"); 1342 return (retcode); 1343 1344 1345 case RAIDFRAME_ADD_HOT_SPARE: 1346 sparePtr = (RF_SingleComponent_t *) data; 1347 memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t)); 1348 retcode = rf_add_hot_spare(raidPtr, &component); 1349 return(retcode); 1350 1351 case RAIDFRAME_REMOVE_HOT_SPARE: 1352 return(retcode); 1353 1354 case RAIDFRAME_DELETE_COMPONENT: 1355 componentPtr = (RF_SingleComponent_t *)data; 1356 memcpy( &component, componentPtr, 1357 sizeof(RF_SingleComponent_t)); 1358 retcode = rf_delete_component(raidPtr, &component); 1359 return(retcode); 1360 1361 case RAIDFRAME_INCORPORATE_HOT_SPARE: 1362 componentPtr = (RF_SingleComponent_t *)data; 1363 memcpy( &component, componentPtr, 1364 sizeof(RF_SingleComponent_t)); 1365 retcode = rf_incorporate_hot_spare(raidPtr, &component); 1366 return(retcode); 1367 1368 case RAIDFRAME_REBUILD_IN_PLACE: 1369 1370 if (raidPtr->Layout.map->faultsTolerated == 0) { 1371 /* Can't do this on a RAID 0!! */ 1372 return(EINVAL); 1373 } 1374 1375 if (raidPtr->recon_in_progress == 1) { 1376 /* a reconstruct is already in progress! */ 1377 return(EINVAL); 1378 } 1379 1380 componentPtr = (RF_SingleComponent_t *) data; 1381 memcpy( &component, componentPtr, 1382 sizeof(RF_SingleComponent_t)); 1383 component.row = 0; /* we don't support any more */ 1384 column = component.column; 1385 1386 if ((column < 0) || (column >= raidPtr->numCol)) { 1387 return(EINVAL); 1388 } 1389 1390 RF_LOCK_MUTEX(raidPtr->mutex); 1391 if ((raidPtr->Disks[column].status == rf_ds_optimal) && 1392 (raidPtr->numFailures > 0)) { 1393 /* XXX 0 above shouldn't be constant!!! */ 1394 /* some component other than this has failed. 1395 Let's not make things worse than they already 1396 are... */ 1397 printf("raid%d: Unable to reconstruct to disk at:\n", 1398 raidPtr->raidid); 1399 printf("raid%d: Col: %d Too many failures.\n", 1400 raidPtr->raidid, column); 1401 RF_UNLOCK_MUTEX(raidPtr->mutex); 1402 return (EINVAL); 1403 } 1404 if (raidPtr->Disks[column].status == 1405 rf_ds_reconstructing) { 1406 printf("raid%d: Unable to reconstruct to disk at:\n", 1407 raidPtr->raidid); 1408 printf("raid%d: Col: %d Reconstruction already occuring!\n", raidPtr->raidid, column); 1409 1410 RF_UNLOCK_MUTEX(raidPtr->mutex); 1411 return (EINVAL); 1412 } 1413 if (raidPtr->Disks[column].status == rf_ds_spared) { 1414 RF_UNLOCK_MUTEX(raidPtr->mutex); 1415 return (EINVAL); 1416 } 1417 RF_UNLOCK_MUTEX(raidPtr->mutex); 1418 1419 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *)); 1420 if (rrcopy == NULL) 1421 return(ENOMEM); 1422 1423 rrcopy->raidPtr = (void *) raidPtr; 1424 rrcopy->col = column; 1425 1426 retcode = RF_CREATE_THREAD(raidPtr->recon_thread, 1427 rf_ReconstructInPlaceThread, 1428 rrcopy,"raid_reconip"); 1429 return(retcode); 1430 1431 case RAIDFRAME_GET_INFO: 1432 if (!raidPtr->valid) 1433 return (ENODEV); 1434 ucfgp = (RF_DeviceConfig_t **) data; 1435 RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t), 1436 (RF_DeviceConfig_t *)); 1437 if (d_cfg == NULL) 1438 return (ENOMEM); 1439 d_cfg->rows = 1; /* there is only 1 row now */ 1440 d_cfg->cols = raidPtr->numCol; 1441 d_cfg->ndevs = raidPtr->numCol; 1442 if (d_cfg->ndevs >= RF_MAX_DISKS) { 1443 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t)); 1444 return (ENOMEM); 1445 } 1446 d_cfg->nspares = raidPtr->numSpare; 1447 if (d_cfg->nspares >= RF_MAX_DISKS) { 1448 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t)); 1449 return (ENOMEM); 1450 } 1451 d_cfg->maxqdepth = raidPtr->maxQueueDepth; 1452 d = 0; 1453 for (j = 0; j < d_cfg->cols; j++) { 1454 d_cfg->devs[d] = raidPtr->Disks[j]; 1455 d++; 1456 } 1457 for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) { 1458 d_cfg->spares[i] = raidPtr->Disks[j]; 1459 } 1460 retcode = copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t)); 1461 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t)); 1462 1463 return (retcode); 1464 1465 case RAIDFRAME_CHECK_PARITY: 1466 *(int *) data = raidPtr->parity_good; 1467 return (0); 1468 1469 case RAIDFRAME_PARITYMAP_STATUS: 1470 if (rf_paritymap_ineligible(raidPtr)) 1471 return EINVAL; 1472 rf_paritymap_status(raidPtr->parity_map, 1473 (struct rf_pmstat *)data); 1474 return 0; 1475 1476 case RAIDFRAME_PARITYMAP_SET_PARAMS: 1477 if (rf_paritymap_ineligible(raidPtr)) 1478 return EINVAL; 1479 if (raidPtr->parity_map == NULL) 1480 return ENOENT; /* ??? */ 1481 if (0 != rf_paritymap_set_params(raidPtr->parity_map, 1482 (struct rf_pmparams *)data, 1)) 1483 return EINVAL; 1484 return 0; 1485 1486 case RAIDFRAME_PARITYMAP_GET_DISABLE: 1487 if (rf_paritymap_ineligible(raidPtr)) 1488 return EINVAL; 1489 *(int *) data = rf_paritymap_get_disable(raidPtr); 1490 return 0; 1491 1492 case RAIDFRAME_PARITYMAP_SET_DISABLE: 1493 if (rf_paritymap_ineligible(raidPtr)) 1494 return EINVAL; 1495 rf_paritymap_set_disable(raidPtr, *(int *)data); 1496 /* XXX should errors be passed up? */ 1497 return 0; 1498 1499 case RAIDFRAME_RESET_ACCTOTALS: 1500 memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals)); 1501 return (0); 1502 1503 case RAIDFRAME_GET_ACCTOTALS: 1504 totals = (RF_AccTotals_t *) data; 1505 *totals = raidPtr->acc_totals; 1506 return (0); 1507 1508 case RAIDFRAME_KEEP_ACCTOTALS: 1509 raidPtr->keep_acc_totals = *(int *)data; 1510 return (0); 1511 1512 case RAIDFRAME_GET_SIZE: 1513 *(int *) data = raidPtr->totalSectors; 1514 return (0); 1515 1516 /* fail a disk & optionally start reconstruction */ 1517 case RAIDFRAME_FAIL_DISK: 1518 1519 if (raidPtr->Layout.map->faultsTolerated == 0) { 1520 /* Can't do this on a RAID 0!! */ 1521 return(EINVAL); 1522 } 1523 1524 rr = (struct rf_recon_req *) data; 1525 rr->row = 0; 1526 if (rr->col < 0 || rr->col >= raidPtr->numCol) 1527 return (EINVAL); 1528 1529 1530 RF_LOCK_MUTEX(raidPtr->mutex); 1531 if (raidPtr->status == rf_rs_reconstructing) { 1532 /* you can't fail a disk while we're reconstructing! */ 1533 /* XXX wrong for RAID6 */ 1534 RF_UNLOCK_MUTEX(raidPtr->mutex); 1535 return (EINVAL); 1536 } 1537 if ((raidPtr->Disks[rr->col].status == 1538 rf_ds_optimal) && (raidPtr->numFailures > 0)) { 1539 /* some other component has failed. Let's not make 1540 things worse. XXX wrong for RAID6 */ 1541 RF_UNLOCK_MUTEX(raidPtr->mutex); 1542 return (EINVAL); 1543 } 1544 if (raidPtr->Disks[rr->col].status == rf_ds_spared) { 1545 /* Can't fail a spared disk! */ 1546 RF_UNLOCK_MUTEX(raidPtr->mutex); 1547 return (EINVAL); 1548 } 1549 RF_UNLOCK_MUTEX(raidPtr->mutex); 1550 1551 /* make a copy of the recon request so that we don't rely on 1552 * the user's buffer */ 1553 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *)); 1554 if (rrcopy == NULL) 1555 return(ENOMEM); 1556 memcpy(rrcopy, rr, sizeof(*rr)); 1557 rrcopy->raidPtr = (void *) raidPtr; 1558 1559 retcode = RF_CREATE_THREAD(raidPtr->recon_thread, 1560 rf_ReconThread, 1561 rrcopy,"raid_recon"); 1562 return (0); 1563 1564 /* invoke a copyback operation after recon on whatever disk 1565 * needs it, if any */ 1566 case RAIDFRAME_COPYBACK: 1567 1568 if (raidPtr->Layout.map->faultsTolerated == 0) { 1569 /* This makes no sense on a RAID 0!! */ 1570 return(EINVAL); 1571 } 1572 1573 if (raidPtr->copyback_in_progress == 1) { 1574 /* Copyback is already in progress! */ 1575 return(EINVAL); 1576 } 1577 1578 retcode = RF_CREATE_THREAD(raidPtr->copyback_thread, 1579 rf_CopybackThread, 1580 raidPtr,"raid_copyback"); 1581 return (retcode); 1582 1583 /* return the percentage completion of reconstruction */ 1584 case RAIDFRAME_CHECK_RECON_STATUS: 1585 if (raidPtr->Layout.map->faultsTolerated == 0) { 1586 /* This makes no sense on a RAID 0, so tell the 1587 user it's done. */ 1588 *(int *) data = 100; 1589 return(0); 1590 } 1591 if (raidPtr->status != rf_rs_reconstructing) 1592 *(int *) data = 100; 1593 else { 1594 if (raidPtr->reconControl->numRUsTotal > 0) { 1595 *(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal); 1596 } else { 1597 *(int *) data = 0; 1598 } 1599 } 1600 return (0); 1601 case RAIDFRAME_CHECK_RECON_STATUS_EXT: 1602 progressInfoPtr = (RF_ProgressInfo_t **) data; 1603 if (raidPtr->status != rf_rs_reconstructing) { 1604 progressInfo.remaining = 0; 1605 progressInfo.completed = 100; 1606 progressInfo.total = 100; 1607 } else { 1608 progressInfo.total = 1609 raidPtr->reconControl->numRUsTotal; 1610 progressInfo.completed = 1611 raidPtr->reconControl->numRUsComplete; 1612 progressInfo.remaining = progressInfo.total - 1613 progressInfo.completed; 1614 } 1615 retcode = copyout(&progressInfo, *progressInfoPtr, 1616 sizeof(RF_ProgressInfo_t)); 1617 return (retcode); 1618 1619 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS: 1620 if (raidPtr->Layout.map->faultsTolerated == 0) { 1621 /* This makes no sense on a RAID 0, so tell the 1622 user it's done. */ 1623 *(int *) data = 100; 1624 return(0); 1625 } 1626 if (raidPtr->parity_rewrite_in_progress == 1) { 1627 *(int *) data = 100 * 1628 raidPtr->parity_rewrite_stripes_done / 1629 raidPtr->Layout.numStripe; 1630 } else { 1631 *(int *) data = 100; 1632 } 1633 return (0); 1634 1635 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT: 1636 progressInfoPtr = (RF_ProgressInfo_t **) data; 1637 if (raidPtr->parity_rewrite_in_progress == 1) { 1638 progressInfo.total = raidPtr->Layout.numStripe; 1639 progressInfo.completed = 1640 raidPtr->parity_rewrite_stripes_done; 1641 progressInfo.remaining = progressInfo.total - 1642 progressInfo.completed; 1643 } else { 1644 progressInfo.remaining = 0; 1645 progressInfo.completed = 100; 1646 progressInfo.total = 100; 1647 } 1648 retcode = copyout(&progressInfo, *progressInfoPtr, 1649 sizeof(RF_ProgressInfo_t)); 1650 return (retcode); 1651 1652 case RAIDFRAME_CHECK_COPYBACK_STATUS: 1653 if (raidPtr->Layout.map->faultsTolerated == 0) { 1654 /* This makes no sense on a RAID 0 */ 1655 *(int *) data = 100; 1656 return(0); 1657 } 1658 if (raidPtr->copyback_in_progress == 1) { 1659 *(int *) data = 100 * raidPtr->copyback_stripes_done / 1660 raidPtr->Layout.numStripe; 1661 } else { 1662 *(int *) data = 100; 1663 } 1664 return (0); 1665 1666 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT: 1667 progressInfoPtr = (RF_ProgressInfo_t **) data; 1668 if (raidPtr->copyback_in_progress == 1) { 1669 progressInfo.total = raidPtr->Layout.numStripe; 1670 progressInfo.completed = 1671 raidPtr->copyback_stripes_done; 1672 progressInfo.remaining = progressInfo.total - 1673 progressInfo.completed; 1674 } else { 1675 progressInfo.remaining = 0; 1676 progressInfo.completed = 100; 1677 progressInfo.total = 100; 1678 } 1679 retcode = copyout(&progressInfo, *progressInfoPtr, 1680 sizeof(RF_ProgressInfo_t)); 1681 return (retcode); 1682 1683 /* the sparetable daemon calls this to wait for the kernel to 1684 * need a spare table. this ioctl does not return until a 1685 * spare table is needed. XXX -- calling mpsleep here in the 1686 * ioctl code is almost certainly wrong and evil. -- XXX XXX 1687 * -- I should either compute the spare table in the kernel, 1688 * or have a different -- XXX XXX -- interface (a different 1689 * character device) for delivering the table -- XXX */ 1690 #if 0 1691 case RAIDFRAME_SPARET_WAIT: 1692 RF_LOCK_MUTEX(rf_sparet_wait_mutex); 1693 while (!rf_sparet_wait_queue) 1694 mpsleep(&rf_sparet_wait_queue, (PZERO + 1) | PCATCH, "sparet wait", 0, (void *) simple_lock_addr(rf_sparet_wait_mutex), MS_LOCK_SIMPLE); 1695 waitreq = rf_sparet_wait_queue; 1696 rf_sparet_wait_queue = rf_sparet_wait_queue->next; 1697 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex); 1698 1699 /* structure assignment */ 1700 *((RF_SparetWait_t *) data) = *waitreq; 1701 1702 RF_Free(waitreq, sizeof(*waitreq)); 1703 return (0); 1704 1705 /* wakes up a process waiting on SPARET_WAIT and puts an error 1706 * code in it that will cause the dameon to exit */ 1707 case RAIDFRAME_ABORT_SPARET_WAIT: 1708 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *)); 1709 waitreq->fcol = -1; 1710 RF_LOCK_MUTEX(rf_sparet_wait_mutex); 1711 waitreq->next = rf_sparet_wait_queue; 1712 rf_sparet_wait_queue = waitreq; 1713 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex); 1714 wakeup(&rf_sparet_wait_queue); 1715 return (0); 1716 1717 /* used by the spare table daemon to deliver a spare table 1718 * into the kernel */ 1719 case RAIDFRAME_SEND_SPARET: 1720 1721 /* install the spare table */ 1722 retcode = rf_SetSpareTable(raidPtr, *(void **) data); 1723 1724 /* respond to the requestor. the return status of the spare 1725 * table installation is passed in the "fcol" field */ 1726 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *)); 1727 waitreq->fcol = retcode; 1728 RF_LOCK_MUTEX(rf_sparet_wait_mutex); 1729 waitreq->next = rf_sparet_resp_queue; 1730 rf_sparet_resp_queue = waitreq; 1731 wakeup(&rf_sparet_resp_queue); 1732 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex); 1733 1734 return (retcode); 1735 #endif 1736 1737 default: 1738 break; /* fall through to the os-specific code below */ 1739 1740 } 1741 1742 if (!raidPtr->valid) 1743 return (EINVAL); 1744 1745 /* 1746 * Add support for "regular" device ioctls here. 1747 */ 1748 1749 error = disk_ioctl(&rs->sc_dkdev, cmd, data, flag, l); 1750 if (error != EPASSTHROUGH) 1751 return (error); 1752 1753 switch (cmd) { 1754 case DIOCGDINFO: 1755 *(struct disklabel *) data = *(rs->sc_dkdev.dk_label); 1756 break; 1757 #ifdef __HAVE_OLD_DISKLABEL 1758 case ODIOCGDINFO: 1759 newlabel = *(rs->sc_dkdev.dk_label); 1760 if (newlabel.d_npartitions > OLDMAXPARTITIONS) 1761 return ENOTTY; 1762 memcpy(data, &newlabel, sizeof (struct olddisklabel)); 1763 break; 1764 #endif 1765 1766 case DIOCGPART: 1767 ((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label; 1768 ((struct partinfo *) data)->part = 1769 &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)]; 1770 break; 1771 1772 case DIOCWDINFO: 1773 case DIOCSDINFO: 1774 #ifdef __HAVE_OLD_DISKLABEL 1775 case ODIOCWDINFO: 1776 case ODIOCSDINFO: 1777 #endif 1778 { 1779 struct disklabel *lp; 1780 #ifdef __HAVE_OLD_DISKLABEL 1781 if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) { 1782 memset(&newlabel, 0, sizeof newlabel); 1783 memcpy(&newlabel, data, sizeof (struct olddisklabel)); 1784 lp = &newlabel; 1785 } else 1786 #endif 1787 lp = (struct disklabel *)data; 1788 1789 if ((error = raidlock(rs)) != 0) 1790 return (error); 1791 1792 rs->sc_flags |= RAIDF_LABELLING; 1793 1794 error = setdisklabel(rs->sc_dkdev.dk_label, 1795 lp, 0, rs->sc_dkdev.dk_cpulabel); 1796 if (error == 0) { 1797 if (cmd == DIOCWDINFO 1798 #ifdef __HAVE_OLD_DISKLABEL 1799 || cmd == ODIOCWDINFO 1800 #endif 1801 ) 1802 error = writedisklabel(RAIDLABELDEV(dev), 1803 raidstrategy, rs->sc_dkdev.dk_label, 1804 rs->sc_dkdev.dk_cpulabel); 1805 } 1806 rs->sc_flags &= ~RAIDF_LABELLING; 1807 1808 raidunlock(rs); 1809 1810 if (error) 1811 return (error); 1812 break; 1813 } 1814 1815 case DIOCWLABEL: 1816 if (*(int *) data != 0) 1817 rs->sc_flags |= RAIDF_WLABEL; 1818 else 1819 rs->sc_flags &= ~RAIDF_WLABEL; 1820 break; 1821 1822 case DIOCGDEFLABEL: 1823 raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data); 1824 break; 1825 1826 #ifdef __HAVE_OLD_DISKLABEL 1827 case ODIOCGDEFLABEL: 1828 raidgetdefaultlabel(raidPtr, rs, &newlabel); 1829 if (newlabel.d_npartitions > OLDMAXPARTITIONS) 1830 return ENOTTY; 1831 memcpy(data, &newlabel, sizeof (struct olddisklabel)); 1832 break; 1833 #endif 1834 1835 case DIOCAWEDGE: 1836 case DIOCDWEDGE: 1837 dkw = (void *)data; 1838 1839 /* If the ioctl happens here, the parent is us. */ 1840 (void)strcpy(dkw->dkw_parent, rs->sc_xname); 1841 return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw); 1842 1843 case DIOCLWEDGES: 1844 return dkwedge_list(&rs->sc_dkdev, 1845 (struct dkwedge_list *)data, l); 1846 case DIOCCACHESYNC: 1847 return rf_sync_component_caches(raidPtr); 1848 default: 1849 retcode = ENOTTY; 1850 } 1851 return (retcode); 1852 1853 } 1854 1855 1856 /* raidinit -- complete the rest of the initialization for the 1857 RAIDframe device. */ 1858 1859 1860 static void 1861 raidinit(RF_Raid_t *raidPtr) 1862 { 1863 cfdata_t cf; 1864 struct raid_softc *rs; 1865 int unit; 1866 1867 unit = raidPtr->raidid; 1868 1869 rs = &raid_softc[unit]; 1870 1871 /* XXX should check return code first... */ 1872 rs->sc_flags |= RAIDF_INITED; 1873 1874 /* XXX doesn't check bounds. */ 1875 snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit); 1876 1877 /* attach the pseudo device */ 1878 cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK); 1879 cf->cf_name = raid_cd.cd_name; 1880 cf->cf_atname = raid_cd.cd_name; 1881 cf->cf_unit = unit; 1882 cf->cf_fstate = FSTATE_STAR; 1883 1884 rs->sc_dev = config_attach_pseudo(cf); 1885 1886 if (rs->sc_dev == NULL) { 1887 printf("raid%d: config_attach_pseudo failed\n", 1888 raidPtr->raidid); 1889 rs->sc_flags &= ~RAIDF_INITED; 1890 free(cf, M_RAIDFRAME); 1891 return; 1892 } 1893 1894 /* disk_attach actually creates space for the CPU disklabel, among 1895 * other things, so it's critical to call this *BEFORE* we try putzing 1896 * with disklabels. */ 1897 1898 disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver); 1899 disk_attach(&rs->sc_dkdev); 1900 disk_blocksize(&rs->sc_dkdev, raidPtr->bytesPerSector); 1901 1902 /* XXX There may be a weird interaction here between this, and 1903 * protectedSectors, as used in RAIDframe. */ 1904 1905 rs->sc_size = raidPtr->totalSectors; 1906 1907 dkwedge_discover(&rs->sc_dkdev); 1908 1909 rf_set_properties(rs, raidPtr); 1910 1911 } 1912 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0) 1913 /* wake up the daemon & tell it to get us a spare table 1914 * XXX 1915 * the entries in the queues should be tagged with the raidPtr 1916 * so that in the extremely rare case that two recons happen at once, 1917 * we know for which device were requesting a spare table 1918 * XXX 1919 * 1920 * XXX This code is not currently used. GO 1921 */ 1922 int 1923 rf_GetSpareTableFromDaemon(RF_SparetWait_t *req) 1924 { 1925 int retcode; 1926 1927 RF_LOCK_MUTEX(rf_sparet_wait_mutex); 1928 req->next = rf_sparet_wait_queue; 1929 rf_sparet_wait_queue = req; 1930 wakeup(&rf_sparet_wait_queue); 1931 1932 /* mpsleep unlocks the mutex */ 1933 while (!rf_sparet_resp_queue) { 1934 tsleep(&rf_sparet_resp_queue, PRIBIO, 1935 "raidframe getsparetable", 0); 1936 } 1937 req = rf_sparet_resp_queue; 1938 rf_sparet_resp_queue = req->next; 1939 RF_UNLOCK_MUTEX(rf_sparet_wait_mutex); 1940 1941 retcode = req->fcol; 1942 RF_Free(req, sizeof(*req)); /* this is not the same req as we 1943 * alloc'd */ 1944 return (retcode); 1945 } 1946 #endif 1947 1948 /* a wrapper around rf_DoAccess that extracts appropriate info from the 1949 * bp & passes it down. 1950 * any calls originating in the kernel must use non-blocking I/O 1951 * do some extra sanity checking to return "appropriate" error values for 1952 * certain conditions (to make some standard utilities work) 1953 * 1954 * Formerly known as: rf_DoAccessKernel 1955 */ 1956 void 1957 raidstart(RF_Raid_t *raidPtr) 1958 { 1959 RF_SectorCount_t num_blocks, pb, sum; 1960 RF_RaidAddr_t raid_addr; 1961 struct partition *pp; 1962 daddr_t blocknum; 1963 int unit; 1964 struct raid_softc *rs; 1965 int do_async; 1966 struct buf *bp; 1967 int rc; 1968 1969 unit = raidPtr->raidid; 1970 rs = &raid_softc[unit]; 1971 1972 /* quick check to see if anything has died recently */ 1973 RF_LOCK_MUTEX(raidPtr->mutex); 1974 if (raidPtr->numNewFailures > 0) { 1975 RF_UNLOCK_MUTEX(raidPtr->mutex); 1976 rf_update_component_labels(raidPtr, 1977 RF_NORMAL_COMPONENT_UPDATE); 1978 RF_LOCK_MUTEX(raidPtr->mutex); 1979 raidPtr->numNewFailures--; 1980 } 1981 1982 /* Check to see if we're at the limit... */ 1983 while (raidPtr->openings > 0) { 1984 RF_UNLOCK_MUTEX(raidPtr->mutex); 1985 1986 /* get the next item, if any, from the queue */ 1987 if ((bp = bufq_get(rs->buf_queue)) == NULL) { 1988 /* nothing more to do */ 1989 return; 1990 } 1991 1992 /* Ok, for the bp we have here, bp->b_blkno is relative to the 1993 * partition.. Need to make it absolute to the underlying 1994 * device.. */ 1995 1996 blocknum = bp->b_blkno << DEV_BSHIFT >> raidPtr->logBytesPerSector; 1997 if (DISKPART(bp->b_dev) != RAW_PART) { 1998 pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)]; 1999 blocknum += pp->p_offset; 2000 } 2001 2002 db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno, 2003 (int) blocknum)); 2004 2005 db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount)); 2006 db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid)); 2007 2008 /* *THIS* is where we adjust what block we're going to... 2009 * but DO NOT TOUCH bp->b_blkno!!! */ 2010 raid_addr = blocknum; 2011 2012 num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector; 2013 pb = (bp->b_bcount & raidPtr->sectorMask) ? 1 : 0; 2014 sum = raid_addr + num_blocks + pb; 2015 if (1 || rf_debugKernelAccess) { 2016 db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n", 2017 (int) raid_addr, (int) sum, (int) num_blocks, 2018 (int) pb, (int) bp->b_resid)); 2019 } 2020 if ((sum > raidPtr->totalSectors) || (sum < raid_addr) 2021 || (sum < num_blocks) || (sum < pb)) { 2022 bp->b_error = ENOSPC; 2023 bp->b_resid = bp->b_bcount; 2024 biodone(bp); 2025 RF_LOCK_MUTEX(raidPtr->mutex); 2026 continue; 2027 } 2028 /* 2029 * XXX rf_DoAccess() should do this, not just DoAccessKernel() 2030 */ 2031 2032 if (bp->b_bcount & raidPtr->sectorMask) { 2033 bp->b_error = EINVAL; 2034 bp->b_resid = bp->b_bcount; 2035 biodone(bp); 2036 RF_LOCK_MUTEX(raidPtr->mutex); 2037 continue; 2038 2039 } 2040 db1_printf(("Calling DoAccess..\n")); 2041 2042 2043 RF_LOCK_MUTEX(raidPtr->mutex); 2044 raidPtr->openings--; 2045 RF_UNLOCK_MUTEX(raidPtr->mutex); 2046 2047 /* 2048 * Everything is async. 2049 */ 2050 do_async = 1; 2051 2052 disk_busy(&rs->sc_dkdev); 2053 2054 /* XXX we're still at splbio() here... do we *really* 2055 need to be? */ 2056 2057 /* don't ever condition on bp->b_flags & B_WRITE. 2058 * always condition on B_READ instead */ 2059 2060 rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ? 2061 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE, 2062 do_async, raid_addr, num_blocks, 2063 bp->b_data, bp, RF_DAG_NONBLOCKING_IO); 2064 2065 if (rc) { 2066 bp->b_error = rc; 2067 bp->b_resid = bp->b_bcount; 2068 biodone(bp); 2069 /* continue loop */ 2070 } 2071 2072 RF_LOCK_MUTEX(raidPtr->mutex); 2073 } 2074 RF_UNLOCK_MUTEX(raidPtr->mutex); 2075 } 2076 2077 2078 2079 2080 /* invoke an I/O from kernel mode. Disk queue should be locked upon entry */ 2081 2082 int 2083 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req) 2084 { 2085 int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE; 2086 struct buf *bp; 2087 2088 req->queue = queue; 2089 bp = req->bp; 2090 2091 switch (req->type) { 2092 case RF_IO_TYPE_NOP: /* used primarily to unlock a locked queue */ 2093 /* XXX need to do something extra here.. */ 2094 /* I'm leaving this in, as I've never actually seen it used, 2095 * and I'd like folks to report it... GO */ 2096 printf(("WAKEUP CALLED\n")); 2097 queue->numOutstanding++; 2098 2099 bp->b_flags = 0; 2100 bp->b_private = req; 2101 2102 KernelWakeupFunc(bp); 2103 break; 2104 2105 case RF_IO_TYPE_READ: 2106 case RF_IO_TYPE_WRITE: 2107 #if RF_ACC_TRACE > 0 2108 if (req->tracerec) { 2109 RF_ETIMER_START(req->tracerec->timer); 2110 } 2111 #endif 2112 InitBP(bp, queue->rf_cinfo->ci_vp, 2113 op, queue->rf_cinfo->ci_dev, 2114 req->sectorOffset, req->numSector, 2115 req->buf, KernelWakeupFunc, (void *) req, 2116 queue->raidPtr->logBytesPerSector, req->b_proc); 2117 2118 if (rf_debugKernelAccess) { 2119 db1_printf(("dispatch: bp->b_blkno = %ld\n", 2120 (long) bp->b_blkno)); 2121 } 2122 queue->numOutstanding++; 2123 queue->last_deq_sector = req->sectorOffset; 2124 /* acc wouldn't have been let in if there were any pending 2125 * reqs at any other priority */ 2126 queue->curPriority = req->priority; 2127 2128 db1_printf(("Going for %c to unit %d col %d\n", 2129 req->type, queue->raidPtr->raidid, 2130 queue->col)); 2131 db1_printf(("sector %d count %d (%d bytes) %d\n", 2132 (int) req->sectorOffset, (int) req->numSector, 2133 (int) (req->numSector << 2134 queue->raidPtr->logBytesPerSector), 2135 (int) queue->raidPtr->logBytesPerSector)); 2136 2137 /* 2138 * XXX: drop lock here since this can block at 2139 * least with backing SCSI devices. Retake it 2140 * to minimize fuss with calling interfaces. 2141 */ 2142 2143 RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam"); 2144 bdev_strategy(bp); 2145 RF_LOCK_QUEUE_MUTEX(queue, "unusedparam"); 2146 break; 2147 2148 default: 2149 panic("bad req->type in rf_DispatchKernelIO"); 2150 } 2151 db1_printf(("Exiting from DispatchKernelIO\n")); 2152 2153 return (0); 2154 } 2155 /* this is the callback function associated with a I/O invoked from 2156 kernel code. 2157 */ 2158 static void 2159 KernelWakeupFunc(struct buf *bp) 2160 { 2161 RF_DiskQueueData_t *req = NULL; 2162 RF_DiskQueue_t *queue; 2163 int s; 2164 2165 s = splbio(); 2166 db1_printf(("recovering the request queue:\n")); 2167 req = bp->b_private; 2168 2169 queue = (RF_DiskQueue_t *) req->queue; 2170 2171 #if RF_ACC_TRACE > 0 2172 if (req->tracerec) { 2173 RF_ETIMER_STOP(req->tracerec->timer); 2174 RF_ETIMER_EVAL(req->tracerec->timer); 2175 RF_LOCK_MUTEX(rf_tracing_mutex); 2176 req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer); 2177 req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer); 2178 req->tracerec->num_phys_ios++; 2179 RF_UNLOCK_MUTEX(rf_tracing_mutex); 2180 } 2181 #endif 2182 2183 /* XXX Ok, let's get aggressive... If b_error is set, let's go 2184 * ballistic, and mark the component as hosed... */ 2185 2186 if (bp->b_error != 0) { 2187 /* Mark the disk as dead */ 2188 /* but only mark it once... */ 2189 /* and only if it wouldn't leave this RAID set 2190 completely broken */ 2191 if (((queue->raidPtr->Disks[queue->col].status == 2192 rf_ds_optimal) || 2193 (queue->raidPtr->Disks[queue->col].status == 2194 rf_ds_used_spare)) && 2195 (queue->raidPtr->numFailures < 2196 queue->raidPtr->Layout.map->faultsTolerated)) { 2197 printf("raid%d: IO Error. Marking %s as failed.\n", 2198 queue->raidPtr->raidid, 2199 queue->raidPtr->Disks[queue->col].devname); 2200 queue->raidPtr->Disks[queue->col].status = 2201 rf_ds_failed; 2202 queue->raidPtr->status = rf_rs_degraded; 2203 queue->raidPtr->numFailures++; 2204 queue->raidPtr->numNewFailures++; 2205 } else { /* Disk is already dead... */ 2206 /* printf("Disk already marked as dead!\n"); */ 2207 } 2208 2209 } 2210 2211 /* Fill in the error value */ 2212 2213 req->error = bp->b_error; 2214 2215 simple_lock(&queue->raidPtr->iodone_lock); 2216 2217 /* Drop this one on the "finished" queue... */ 2218 TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries); 2219 2220 /* Let the raidio thread know there is work to be done. */ 2221 wakeup(&(queue->raidPtr->iodone)); 2222 2223 simple_unlock(&queue->raidPtr->iodone_lock); 2224 2225 splx(s); 2226 } 2227 2228 2229 2230 /* 2231 * initialize a buf structure for doing an I/O in the kernel. 2232 */ 2233 static void 2234 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev, 2235 RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf, 2236 void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector, 2237 struct proc *b_proc) 2238 { 2239 /* bp->b_flags = B_PHYS | rw_flag; */ 2240 bp->b_flags = rw_flag; /* XXX need B_PHYS here too??? */ 2241 bp->b_oflags = 0; 2242 bp->b_cflags = 0; 2243 bp->b_bcount = numSect << logBytesPerSector; 2244 bp->b_bufsize = bp->b_bcount; 2245 bp->b_error = 0; 2246 bp->b_dev = dev; 2247 bp->b_data = bf; 2248 bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT; 2249 bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */ 2250 if (bp->b_bcount == 0) { 2251 panic("bp->b_bcount is zero in InitBP!!"); 2252 } 2253 bp->b_proc = b_proc; 2254 bp->b_iodone = cbFunc; 2255 bp->b_private = cbArg; 2256 } 2257 2258 static void 2259 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs, 2260 struct disklabel *lp) 2261 { 2262 memset(lp, 0, sizeof(*lp)); 2263 2264 /* fabricate a label... */ 2265 lp->d_secperunit = raidPtr->totalSectors; 2266 lp->d_secsize = raidPtr->bytesPerSector; 2267 lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe; 2268 lp->d_ntracks = 4 * raidPtr->numCol; 2269 lp->d_ncylinders = raidPtr->totalSectors / 2270 (lp->d_nsectors * lp->d_ntracks); 2271 lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors; 2272 2273 strncpy(lp->d_typename, "raid", sizeof(lp->d_typename)); 2274 lp->d_type = DTYPE_RAID; 2275 strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname)); 2276 lp->d_rpm = 3600; 2277 lp->d_interleave = 1; 2278 lp->d_flags = 0; 2279 2280 lp->d_partitions[RAW_PART].p_offset = 0; 2281 lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors; 2282 lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED; 2283 lp->d_npartitions = RAW_PART + 1; 2284 2285 lp->d_magic = DISKMAGIC; 2286 lp->d_magic2 = DISKMAGIC; 2287 lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label); 2288 2289 } 2290 /* 2291 * Read the disklabel from the raid device. If one is not present, fake one 2292 * up. 2293 */ 2294 static void 2295 raidgetdisklabel(dev_t dev) 2296 { 2297 int unit = raidunit(dev); 2298 struct raid_softc *rs = &raid_softc[unit]; 2299 const char *errstring; 2300 struct disklabel *lp = rs->sc_dkdev.dk_label; 2301 struct cpu_disklabel *clp = rs->sc_dkdev.dk_cpulabel; 2302 RF_Raid_t *raidPtr; 2303 2304 db1_printf(("Getting the disklabel...\n")); 2305 2306 memset(clp, 0, sizeof(*clp)); 2307 2308 raidPtr = raidPtrs[unit]; 2309 2310 raidgetdefaultlabel(raidPtr, rs, lp); 2311 2312 /* 2313 * Call the generic disklabel extraction routine. 2314 */ 2315 errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy, 2316 rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel); 2317 if (errstring) 2318 raidmakedisklabel(rs); 2319 else { 2320 int i; 2321 struct partition *pp; 2322 2323 /* 2324 * Sanity check whether the found disklabel is valid. 2325 * 2326 * This is necessary since total size of the raid device 2327 * may vary when an interleave is changed even though exactly 2328 * same components are used, and old disklabel may used 2329 * if that is found. 2330 */ 2331 if (lp->d_secperunit != rs->sc_size) 2332 printf("raid%d: WARNING: %s: " 2333 "total sector size in disklabel (%" PRIu32 ") != " 2334 "the size of raid (%" PRIu64 ")\n", unit, rs->sc_xname, 2335 lp->d_secperunit, rs->sc_size); 2336 for (i = 0; i < lp->d_npartitions; i++) { 2337 pp = &lp->d_partitions[i]; 2338 if (pp->p_offset + pp->p_size > rs->sc_size) 2339 printf("raid%d: WARNING: %s: end of partition `%c' " 2340 "exceeds the size of raid (%" PRIu64 ")\n", 2341 unit, rs->sc_xname, 'a' + i, rs->sc_size); 2342 } 2343 } 2344 2345 } 2346 /* 2347 * Take care of things one might want to take care of in the event 2348 * that a disklabel isn't present. 2349 */ 2350 static void 2351 raidmakedisklabel(struct raid_softc *rs) 2352 { 2353 struct disklabel *lp = rs->sc_dkdev.dk_label; 2354 db1_printf(("Making a label..\n")); 2355 2356 /* 2357 * For historical reasons, if there's no disklabel present 2358 * the raw partition must be marked FS_BSDFFS. 2359 */ 2360 2361 lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS; 2362 2363 strncpy(lp->d_packname, "default label", sizeof(lp->d_packname)); 2364 2365 lp->d_checksum = dkcksum(lp); 2366 } 2367 /* 2368 * Wait interruptibly for an exclusive lock. 2369 * 2370 * XXX 2371 * Several drivers do this; it should be abstracted and made MP-safe. 2372 * (Hmm... where have we seen this warning before :-> GO ) 2373 */ 2374 static int 2375 raidlock(struct raid_softc *rs) 2376 { 2377 int error; 2378 2379 while ((rs->sc_flags & RAIDF_LOCKED) != 0) { 2380 rs->sc_flags |= RAIDF_WANTED; 2381 if ((error = 2382 tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0) 2383 return (error); 2384 } 2385 rs->sc_flags |= RAIDF_LOCKED; 2386 return (0); 2387 } 2388 /* 2389 * Unlock and wake up any waiters. 2390 */ 2391 static void 2392 raidunlock(struct raid_softc *rs) 2393 { 2394 2395 rs->sc_flags &= ~RAIDF_LOCKED; 2396 if ((rs->sc_flags & RAIDF_WANTED) != 0) { 2397 rs->sc_flags &= ~RAIDF_WANTED; 2398 wakeup(rs); 2399 } 2400 } 2401 2402 2403 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */ 2404 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */ 2405 #define RF_PARITY_MAP_SIZE RF_PARITYMAP_NBYTE 2406 2407 static daddr_t 2408 rf_component_info_offset(void) 2409 { 2410 2411 return RF_COMPONENT_INFO_OFFSET; 2412 } 2413 2414 static daddr_t 2415 rf_component_info_size(unsigned secsize) 2416 { 2417 daddr_t info_size; 2418 2419 KASSERT(secsize); 2420 if (secsize > RF_COMPONENT_INFO_SIZE) 2421 info_size = secsize; 2422 else 2423 info_size = RF_COMPONENT_INFO_SIZE; 2424 2425 return info_size; 2426 } 2427 2428 static daddr_t 2429 rf_parity_map_offset(RF_Raid_t *raidPtr) 2430 { 2431 daddr_t map_offset; 2432 2433 KASSERT(raidPtr->bytesPerSector); 2434 if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE) 2435 map_offset = raidPtr->bytesPerSector; 2436 else 2437 map_offset = RF_COMPONENT_INFO_SIZE; 2438 map_offset += rf_component_info_offset(); 2439 2440 return map_offset; 2441 } 2442 2443 static daddr_t 2444 rf_parity_map_size(RF_Raid_t *raidPtr) 2445 { 2446 daddr_t map_size; 2447 2448 if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE) 2449 map_size = raidPtr->bytesPerSector; 2450 else 2451 map_size = RF_PARITY_MAP_SIZE; 2452 2453 return map_size; 2454 } 2455 2456 int 2457 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col) 2458 { 2459 RF_ComponentLabel_t *clabel; 2460 2461 clabel = raidget_component_label(raidPtr, col); 2462 clabel->clean = RF_RAID_CLEAN; 2463 raidflush_component_label(raidPtr, col); 2464 return(0); 2465 } 2466 2467 2468 int 2469 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col) 2470 { 2471 RF_ComponentLabel_t *clabel; 2472 2473 clabel = raidget_component_label(raidPtr, col); 2474 clabel->clean = RF_RAID_DIRTY; 2475 raidflush_component_label(raidPtr, col); 2476 return(0); 2477 } 2478 2479 int 2480 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col) 2481 { 2482 KASSERT(raidPtr->bytesPerSector); 2483 return raidread_component_label(raidPtr->bytesPerSector, 2484 raidPtr->Disks[col].dev, 2485 raidPtr->raid_cinfo[col].ci_vp, 2486 &raidPtr->raid_cinfo[col].ci_label); 2487 } 2488 2489 RF_ComponentLabel_t * 2490 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col) 2491 { 2492 return &raidPtr->raid_cinfo[col].ci_label; 2493 } 2494 2495 int 2496 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col) 2497 { 2498 RF_ComponentLabel_t *label; 2499 2500 label = &raidPtr->raid_cinfo[col].ci_label; 2501 label->mod_counter = raidPtr->mod_counter; 2502 #ifndef RF_NO_PARITY_MAP 2503 label->parity_map_modcount = label->mod_counter; 2504 #endif 2505 return raidwrite_component_label(raidPtr->bytesPerSector, 2506 raidPtr->Disks[col].dev, 2507 raidPtr->raid_cinfo[col].ci_vp, label); 2508 } 2509 2510 2511 static int 2512 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp, 2513 RF_ComponentLabel_t *clabel) 2514 { 2515 return raidread_component_area(dev, b_vp, clabel, 2516 sizeof(RF_ComponentLabel_t), 2517 rf_component_info_offset(), 2518 rf_component_info_size(secsize)); 2519 } 2520 2521 /* ARGSUSED */ 2522 static int 2523 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data, 2524 size_t msize, daddr_t offset, daddr_t dsize) 2525 { 2526 struct buf *bp; 2527 const struct bdevsw *bdev; 2528 int error; 2529 2530 /* XXX should probably ensure that we don't try to do this if 2531 someone has changed rf_protected_sectors. */ 2532 2533 if (b_vp == NULL) { 2534 /* For whatever reason, this component is not valid. 2535 Don't try to read a component label from it. */ 2536 return(EINVAL); 2537 } 2538 2539 /* get a block of the appropriate size... */ 2540 bp = geteblk((int)dsize); 2541 bp->b_dev = dev; 2542 2543 /* get our ducks in a row for the read */ 2544 bp->b_blkno = offset / DEV_BSIZE; 2545 bp->b_bcount = dsize; 2546 bp->b_flags |= B_READ; 2547 bp->b_resid = dsize; 2548 2549 bdev = bdevsw_lookup(bp->b_dev); 2550 if (bdev == NULL) 2551 return (ENXIO); 2552 (*bdev->d_strategy)(bp); 2553 2554 error = biowait(bp); 2555 2556 if (!error) { 2557 memcpy(data, bp->b_data, msize); 2558 } 2559 2560 brelse(bp, 0); 2561 return(error); 2562 } 2563 2564 2565 static int 2566 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp, 2567 RF_ComponentLabel_t *clabel) 2568 { 2569 return raidwrite_component_area(dev, b_vp, clabel, 2570 sizeof(RF_ComponentLabel_t), 2571 rf_component_info_offset(), 2572 rf_component_info_size(secsize), 0); 2573 } 2574 2575 /* ARGSUSED */ 2576 static int 2577 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data, 2578 size_t msize, daddr_t offset, daddr_t dsize, int asyncp) 2579 { 2580 struct buf *bp; 2581 const struct bdevsw *bdev; 2582 int error; 2583 2584 /* get a block of the appropriate size... */ 2585 bp = geteblk((int)dsize); 2586 bp->b_dev = dev; 2587 2588 /* get our ducks in a row for the write */ 2589 bp->b_blkno = offset / DEV_BSIZE; 2590 bp->b_bcount = dsize; 2591 bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0); 2592 bp->b_resid = dsize; 2593 2594 memset(bp->b_data, 0, dsize); 2595 memcpy(bp->b_data, data, msize); 2596 2597 bdev = bdevsw_lookup(bp->b_dev); 2598 if (bdev == NULL) 2599 return (ENXIO); 2600 (*bdev->d_strategy)(bp); 2601 if (asyncp) 2602 return 0; 2603 error = biowait(bp); 2604 brelse(bp, 0); 2605 if (error) { 2606 #if 1 2607 printf("Failed to write RAID component info!\n"); 2608 #endif 2609 } 2610 2611 return(error); 2612 } 2613 2614 void 2615 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map) 2616 { 2617 int c; 2618 2619 for (c = 0; c < raidPtr->numCol; c++) { 2620 /* Skip dead disks. */ 2621 if (RF_DEAD_DISK(raidPtr->Disks[c].status)) 2622 continue; 2623 /* XXXjld: what if an error occurs here? */ 2624 raidwrite_component_area(raidPtr->Disks[c].dev, 2625 raidPtr->raid_cinfo[c].ci_vp, map, 2626 RF_PARITYMAP_NBYTE, 2627 rf_parity_map_offset(raidPtr), 2628 rf_parity_map_size(raidPtr), 0); 2629 } 2630 } 2631 2632 void 2633 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map) 2634 { 2635 struct rf_paritymap_ondisk tmp; 2636 int c,first; 2637 2638 first=1; 2639 for (c = 0; c < raidPtr->numCol; c++) { 2640 /* Skip dead disks. */ 2641 if (RF_DEAD_DISK(raidPtr->Disks[c].status)) 2642 continue; 2643 raidread_component_area(raidPtr->Disks[c].dev, 2644 raidPtr->raid_cinfo[c].ci_vp, &tmp, 2645 RF_PARITYMAP_NBYTE, 2646 rf_parity_map_offset(raidPtr), 2647 rf_parity_map_size(raidPtr)); 2648 if (first) { 2649 memcpy(map, &tmp, sizeof(*map)); 2650 first = 0; 2651 } else { 2652 rf_paritymap_merge(map, &tmp); 2653 } 2654 } 2655 } 2656 2657 void 2658 rf_markalldirty(RF_Raid_t *raidPtr) 2659 { 2660 RF_ComponentLabel_t *clabel; 2661 int sparecol; 2662 int c; 2663 int j; 2664 int scol = -1; 2665 2666 raidPtr->mod_counter++; 2667 for (c = 0; c < raidPtr->numCol; c++) { 2668 /* we don't want to touch (at all) a disk that has 2669 failed */ 2670 if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) { 2671 clabel = raidget_component_label(raidPtr, c); 2672 if (clabel->status == rf_ds_spared) { 2673 /* XXX do something special... 2674 but whatever you do, don't 2675 try to access it!! */ 2676 } else { 2677 raidmarkdirty(raidPtr, c); 2678 } 2679 } 2680 } 2681 2682 for( c = 0; c < raidPtr->numSpare ; c++) { 2683 sparecol = raidPtr->numCol + c; 2684 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) { 2685 /* 2686 2687 we claim this disk is "optimal" if it's 2688 rf_ds_used_spare, as that means it should be 2689 directly substitutable for the disk it replaced. 2690 We note that too... 2691 2692 */ 2693 2694 for(j=0;j<raidPtr->numCol;j++) { 2695 if (raidPtr->Disks[j].spareCol == sparecol) { 2696 scol = j; 2697 break; 2698 } 2699 } 2700 2701 clabel = raidget_component_label(raidPtr, sparecol); 2702 /* make sure status is noted */ 2703 2704 raid_init_component_label(raidPtr, clabel); 2705 2706 clabel->row = 0; 2707 clabel->column = scol; 2708 /* Note: we *don't* change status from rf_ds_used_spare 2709 to rf_ds_optimal */ 2710 /* clabel.status = rf_ds_optimal; */ 2711 2712 raidmarkdirty(raidPtr, sparecol); 2713 } 2714 } 2715 } 2716 2717 2718 void 2719 rf_update_component_labels(RF_Raid_t *raidPtr, int final) 2720 { 2721 RF_ComponentLabel_t *clabel; 2722 int sparecol; 2723 int c; 2724 int j; 2725 int scol; 2726 2727 scol = -1; 2728 2729 /* XXX should do extra checks to make sure things really are clean, 2730 rather than blindly setting the clean bit... */ 2731 2732 raidPtr->mod_counter++; 2733 2734 for (c = 0; c < raidPtr->numCol; c++) { 2735 if (raidPtr->Disks[c].status == rf_ds_optimal) { 2736 clabel = raidget_component_label(raidPtr, c); 2737 /* make sure status is noted */ 2738 clabel->status = rf_ds_optimal; 2739 2740 /* note what unit we are configured as */ 2741 clabel->last_unit = raidPtr->raidid; 2742 2743 raidflush_component_label(raidPtr, c); 2744 if (final == RF_FINAL_COMPONENT_UPDATE) { 2745 if (raidPtr->parity_good == RF_RAID_CLEAN) { 2746 raidmarkclean(raidPtr, c); 2747 } 2748 } 2749 } 2750 /* else we don't touch it.. */ 2751 } 2752 2753 for( c = 0; c < raidPtr->numSpare ; c++) { 2754 sparecol = raidPtr->numCol + c; 2755 /* Need to ensure that the reconstruct actually completed! */ 2756 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) { 2757 /* 2758 2759 we claim this disk is "optimal" if it's 2760 rf_ds_used_spare, as that means it should be 2761 directly substitutable for the disk it replaced. 2762 We note that too... 2763 2764 */ 2765 2766 for(j=0;j<raidPtr->numCol;j++) { 2767 if (raidPtr->Disks[j].spareCol == sparecol) { 2768 scol = j; 2769 break; 2770 } 2771 } 2772 2773 /* XXX shouldn't *really* need this... */ 2774 clabel = raidget_component_label(raidPtr, sparecol); 2775 /* make sure status is noted */ 2776 2777 raid_init_component_label(raidPtr, clabel); 2778 2779 clabel->column = scol; 2780 clabel->status = rf_ds_optimal; 2781 clabel->last_unit = raidPtr->raidid; 2782 2783 raidflush_component_label(raidPtr, sparecol); 2784 if (final == RF_FINAL_COMPONENT_UPDATE) { 2785 if (raidPtr->parity_good == RF_RAID_CLEAN) { 2786 raidmarkclean(raidPtr, sparecol); 2787 } 2788 } 2789 } 2790 } 2791 } 2792 2793 void 2794 rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured) 2795 { 2796 2797 if (vp != NULL) { 2798 if (auto_configured == 1) { 2799 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 2800 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED); 2801 vput(vp); 2802 2803 } else { 2804 (void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred); 2805 } 2806 } 2807 } 2808 2809 2810 void 2811 rf_UnconfigureVnodes(RF_Raid_t *raidPtr) 2812 { 2813 int r,c; 2814 struct vnode *vp; 2815 int acd; 2816 2817 2818 /* We take this opportunity to close the vnodes like we should.. */ 2819 2820 for (c = 0; c < raidPtr->numCol; c++) { 2821 vp = raidPtr->raid_cinfo[c].ci_vp; 2822 acd = raidPtr->Disks[c].auto_configured; 2823 rf_close_component(raidPtr, vp, acd); 2824 raidPtr->raid_cinfo[c].ci_vp = NULL; 2825 raidPtr->Disks[c].auto_configured = 0; 2826 } 2827 2828 for (r = 0; r < raidPtr->numSpare; r++) { 2829 vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp; 2830 acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured; 2831 rf_close_component(raidPtr, vp, acd); 2832 raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL; 2833 raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0; 2834 } 2835 } 2836 2837 2838 void 2839 rf_ReconThread(struct rf_recon_req *req) 2840 { 2841 int s; 2842 RF_Raid_t *raidPtr; 2843 2844 s = splbio(); 2845 raidPtr = (RF_Raid_t *) req->raidPtr; 2846 raidPtr->recon_in_progress = 1; 2847 2848 rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col, 2849 ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0)); 2850 2851 RF_Free(req, sizeof(*req)); 2852 2853 raidPtr->recon_in_progress = 0; 2854 splx(s); 2855 2856 /* That's all... */ 2857 kthread_exit(0); /* does not return */ 2858 } 2859 2860 void 2861 rf_RewriteParityThread(RF_Raid_t *raidPtr) 2862 { 2863 int retcode; 2864 int s; 2865 2866 raidPtr->parity_rewrite_stripes_done = 0; 2867 raidPtr->parity_rewrite_in_progress = 1; 2868 s = splbio(); 2869 retcode = rf_RewriteParity(raidPtr); 2870 splx(s); 2871 if (retcode) { 2872 printf("raid%d: Error re-writing parity (%d)!\n", 2873 raidPtr->raidid, retcode); 2874 } else { 2875 /* set the clean bit! If we shutdown correctly, 2876 the clean bit on each component label will get 2877 set */ 2878 raidPtr->parity_good = RF_RAID_CLEAN; 2879 } 2880 raidPtr->parity_rewrite_in_progress = 0; 2881 2882 /* Anyone waiting for us to stop? If so, inform them... */ 2883 if (raidPtr->waitShutdown) { 2884 wakeup(&raidPtr->parity_rewrite_in_progress); 2885 } 2886 2887 /* That's all... */ 2888 kthread_exit(0); /* does not return */ 2889 } 2890 2891 2892 void 2893 rf_CopybackThread(RF_Raid_t *raidPtr) 2894 { 2895 int s; 2896 2897 raidPtr->copyback_in_progress = 1; 2898 s = splbio(); 2899 rf_CopybackReconstructedData(raidPtr); 2900 splx(s); 2901 raidPtr->copyback_in_progress = 0; 2902 2903 /* That's all... */ 2904 kthread_exit(0); /* does not return */ 2905 } 2906 2907 2908 void 2909 rf_ReconstructInPlaceThread(struct rf_recon_req *req) 2910 { 2911 int s; 2912 RF_Raid_t *raidPtr; 2913 2914 s = splbio(); 2915 raidPtr = req->raidPtr; 2916 raidPtr->recon_in_progress = 1; 2917 rf_ReconstructInPlace(raidPtr, req->col); 2918 RF_Free(req, sizeof(*req)); 2919 raidPtr->recon_in_progress = 0; 2920 splx(s); 2921 2922 /* That's all... */ 2923 kthread_exit(0); /* does not return */ 2924 } 2925 2926 static RF_AutoConfig_t * 2927 rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp, 2928 const char *cname, RF_SectorCount_t size, uint64_t numsecs, 2929 unsigned secsize) 2930 { 2931 int good_one = 0; 2932 RF_ComponentLabel_t *clabel; 2933 RF_AutoConfig_t *ac; 2934 2935 clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT); 2936 if (clabel == NULL) { 2937 oomem: 2938 while(ac_list) { 2939 ac = ac_list; 2940 if (ac->clabel) 2941 free(ac->clabel, M_RAIDFRAME); 2942 ac_list = ac_list->next; 2943 free(ac, M_RAIDFRAME); 2944 } 2945 printf("RAID auto config: out of memory!\n"); 2946 return NULL; /* XXX probably should panic? */ 2947 } 2948 2949 if (!raidread_component_label(secsize, dev, vp, clabel)) { 2950 /* Got the label. Does it look reasonable? */ 2951 if (rf_reasonable_label(clabel) && 2952 (clabel->partitionSize <= size)) { 2953 rf_fix_old_label_size(clabel, numsecs); 2954 #ifdef DEBUG 2955 printf("Component on: %s: %llu\n", 2956 cname, (unsigned long long)size); 2957 rf_print_component_label(clabel); 2958 #endif 2959 /* if it's reasonable, add it, else ignore it. */ 2960 ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME, 2961 M_NOWAIT); 2962 if (ac == NULL) { 2963 free(clabel, M_RAIDFRAME); 2964 goto oomem; 2965 } 2966 strlcpy(ac->devname, cname, sizeof(ac->devname)); 2967 ac->dev = dev; 2968 ac->vp = vp; 2969 ac->clabel = clabel; 2970 ac->next = ac_list; 2971 ac_list = ac; 2972 good_one = 1; 2973 } 2974 } 2975 if (!good_one) { 2976 /* cleanup */ 2977 free(clabel, M_RAIDFRAME); 2978 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 2979 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED); 2980 vput(vp); 2981 } 2982 return ac_list; 2983 } 2984 2985 RF_AutoConfig_t * 2986 rf_find_raid_components(void) 2987 { 2988 struct vnode *vp; 2989 struct disklabel label; 2990 device_t dv; 2991 deviter_t di; 2992 dev_t dev; 2993 int bmajor, bminor, wedge; 2994 int error; 2995 int i; 2996 RF_AutoConfig_t *ac_list; 2997 uint64_t numsecs; 2998 unsigned secsize; 2999 3000 RF_ASSERT(raidPtr->bytesPerSector < rf_component_info_offset()); 3001 3002 /* initialize the AutoConfig list */ 3003 ac_list = NULL; 3004 3005 /* we begin by trolling through *all* the devices on the system */ 3006 3007 for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL; 3008 dv = deviter_next(&di)) { 3009 3010 /* we are only interested in disks... */ 3011 if (device_class(dv) != DV_DISK) 3012 continue; 3013 3014 /* we don't care about floppies... */ 3015 if (device_is_a(dv, "fd")) { 3016 continue; 3017 } 3018 3019 /* we don't care about CD's... */ 3020 if (device_is_a(dv, "cd")) { 3021 continue; 3022 } 3023 3024 /* we don't care about md's... */ 3025 if (device_is_a(dv, "md")) { 3026 continue; 3027 } 3028 3029 /* hdfd is the Atari/Hades floppy driver */ 3030 if (device_is_a(dv, "hdfd")) { 3031 continue; 3032 } 3033 3034 /* fdisa is the Atari/Milan floppy driver */ 3035 if (device_is_a(dv, "fdisa")) { 3036 continue; 3037 } 3038 3039 /* need to find the device_name_to_block_device_major stuff */ 3040 bmajor = devsw_name2blk(device_xname(dv), NULL, 0); 3041 3042 /* get a vnode for the raw partition of this disk */ 3043 3044 wedge = device_is_a(dv, "dk"); 3045 bminor = minor(device_unit(dv)); 3046 dev = wedge ? makedev(bmajor, bminor) : 3047 MAKEDISKDEV(bmajor, bminor, RAW_PART); 3048 if (bdevvp(dev, &vp)) 3049 panic("RAID can't alloc vnode"); 3050 3051 error = VOP_OPEN(vp, FREAD, NOCRED); 3052 3053 if (error) { 3054 /* "Who cares." Continue looking 3055 for something that exists*/ 3056 vput(vp); 3057 continue; 3058 } 3059 3060 error = getdisksize(vp, &numsecs, &secsize); 3061 if (error) { 3062 vput(vp); 3063 continue; 3064 } 3065 if (wedge) { 3066 struct dkwedge_info dkw; 3067 error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD, 3068 NOCRED); 3069 if (error) { 3070 printf("RAIDframe: can't get wedge info for " 3071 "dev %s (%d)\n", device_xname(dv), error); 3072 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 3073 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED); 3074 vput(vp); 3075 continue; 3076 } 3077 3078 if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) { 3079 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 3080 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED); 3081 vput(vp); 3082 continue; 3083 } 3084 3085 ac_list = rf_get_component(ac_list, dev, vp, 3086 device_xname(dv), dkw.dkw_size, numsecs, secsize); 3087 continue; 3088 } 3089 3090 /* Ok, the disk exists. Go get the disklabel. */ 3091 error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED); 3092 if (error) { 3093 /* 3094 * XXX can't happen - open() would 3095 * have errored out (or faked up one) 3096 */ 3097 if (error != ENOTTY) 3098 printf("RAIDframe: can't get label for dev " 3099 "%s (%d)\n", device_xname(dv), error); 3100 } 3101 3102 /* don't need this any more. We'll allocate it again 3103 a little later if we really do... */ 3104 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 3105 VOP_CLOSE(vp, FREAD | FWRITE, NOCRED); 3106 vput(vp); 3107 3108 if (error) 3109 continue; 3110 3111 for (i = 0; i < label.d_npartitions; i++) { 3112 char cname[sizeof(ac_list->devname)]; 3113 3114 /* We only support partitions marked as RAID */ 3115 if (label.d_partitions[i].p_fstype != FS_RAID) 3116 continue; 3117 3118 dev = MAKEDISKDEV(bmajor, device_unit(dv), i); 3119 if (bdevvp(dev, &vp)) 3120 panic("RAID can't alloc vnode"); 3121 3122 error = VOP_OPEN(vp, FREAD, NOCRED); 3123 if (error) { 3124 /* Whatever... */ 3125 vput(vp); 3126 continue; 3127 } 3128 snprintf(cname, sizeof(cname), "%s%c", 3129 device_xname(dv), 'a' + i); 3130 ac_list = rf_get_component(ac_list, dev, vp, cname, 3131 label.d_partitions[i].p_size, numsecs, secsize); 3132 } 3133 } 3134 deviter_release(&di); 3135 return ac_list; 3136 } 3137 3138 3139 static int 3140 rf_reasonable_label(RF_ComponentLabel_t *clabel) 3141 { 3142 3143 if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) || 3144 (clabel->version==RF_COMPONENT_LABEL_VERSION)) && 3145 ((clabel->clean == RF_RAID_CLEAN) || 3146 (clabel->clean == RF_RAID_DIRTY)) && 3147 clabel->row >=0 && 3148 clabel->column >= 0 && 3149 clabel->num_rows > 0 && 3150 clabel->num_columns > 0 && 3151 clabel->row < clabel->num_rows && 3152 clabel->column < clabel->num_columns && 3153 clabel->blockSize > 0 && 3154 clabel->numBlocks > 0) { 3155 /* label looks reasonable enough... */ 3156 return(1); 3157 } 3158 return(0); 3159 } 3160 3161 3162 /* 3163 * For reasons yet unknown, some old component labels have garbage in 3164 * the newer numBlocksHi region, and this causes lossage. Since those 3165 * disks will also have numsecs set to less than 32 bits of sectors, 3166 * we can determine when this corruption has occured, and fix it. 3167 */ 3168 static void 3169 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs) 3170 { 3171 3172 if (clabel->numBlocksHi && numsecs < ((uint64_t)1 << 32)) { 3173 printf("WARNING: total sectors < 32 bits, yet numBlocksHi set\n" 3174 "WARNING: resetting numBlocksHi to zero.\n"); 3175 clabel->numBlocksHi = 0; 3176 } 3177 } 3178 3179 3180 #ifdef DEBUG 3181 void 3182 rf_print_component_label(RF_ComponentLabel_t *clabel) 3183 { 3184 uint64_t numBlocks = clabel->numBlocks; 3185 3186 numBlocks |= (uint64_t)clabel->numBlocksHi << 32; 3187 3188 printf(" Row: %d Column: %d Num Rows: %d Num Columns: %d\n", 3189 clabel->row, clabel->column, 3190 clabel->num_rows, clabel->num_columns); 3191 printf(" Version: %d Serial Number: %d Mod Counter: %d\n", 3192 clabel->version, clabel->serial_number, 3193 clabel->mod_counter); 3194 printf(" Clean: %s Status: %d\n", 3195 clabel->clean ? "Yes" : "No", clabel->status); 3196 printf(" sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n", 3197 clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU); 3198 printf(" RAID Level: %c blocksize: %d numBlocks: %"PRIu64"\n", 3199 (char) clabel->parityConfig, clabel->blockSize, numBlocks); 3200 printf(" Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No"); 3201 printf(" Contains root partition: %s\n", 3202 clabel->root_partition ? "Yes" : "No"); 3203 printf(" Last configured as: raid%d\n", clabel->last_unit); 3204 #if 0 3205 printf(" Config order: %d\n", clabel->config_order); 3206 #endif 3207 3208 } 3209 #endif 3210 3211 RF_ConfigSet_t * 3212 rf_create_auto_sets(RF_AutoConfig_t *ac_list) 3213 { 3214 RF_AutoConfig_t *ac; 3215 RF_ConfigSet_t *config_sets; 3216 RF_ConfigSet_t *cset; 3217 RF_AutoConfig_t *ac_next; 3218 3219 3220 config_sets = NULL; 3221 3222 /* Go through the AutoConfig list, and figure out which components 3223 belong to what sets. */ 3224 ac = ac_list; 3225 while(ac!=NULL) { 3226 /* we're going to putz with ac->next, so save it here 3227 for use at the end of the loop */ 3228 ac_next = ac->next; 3229 3230 if (config_sets == NULL) { 3231 /* will need at least this one... */ 3232 config_sets = (RF_ConfigSet_t *) 3233 malloc(sizeof(RF_ConfigSet_t), 3234 M_RAIDFRAME, M_NOWAIT); 3235 if (config_sets == NULL) { 3236 panic("rf_create_auto_sets: No memory!"); 3237 } 3238 /* this one is easy :) */ 3239 config_sets->ac = ac; 3240 config_sets->next = NULL; 3241 config_sets->rootable = 0; 3242 ac->next = NULL; 3243 } else { 3244 /* which set does this component fit into? */ 3245 cset = config_sets; 3246 while(cset!=NULL) { 3247 if (rf_does_it_fit(cset, ac)) { 3248 /* looks like it matches... */ 3249 ac->next = cset->ac; 3250 cset->ac = ac; 3251 break; 3252 } 3253 cset = cset->next; 3254 } 3255 if (cset==NULL) { 3256 /* didn't find a match above... new set..*/ 3257 cset = (RF_ConfigSet_t *) 3258 malloc(sizeof(RF_ConfigSet_t), 3259 M_RAIDFRAME, M_NOWAIT); 3260 if (cset == NULL) { 3261 panic("rf_create_auto_sets: No memory!"); 3262 } 3263 cset->ac = ac; 3264 ac->next = NULL; 3265 cset->next = config_sets; 3266 cset->rootable = 0; 3267 config_sets = cset; 3268 } 3269 } 3270 ac = ac_next; 3271 } 3272 3273 3274 return(config_sets); 3275 } 3276 3277 static int 3278 rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac) 3279 { 3280 RF_ComponentLabel_t *clabel1, *clabel2; 3281 3282 /* If this one matches the *first* one in the set, that's good 3283 enough, since the other members of the set would have been 3284 through here too... */ 3285 /* note that we are not checking partitionSize here.. 3286 3287 Note that we are also not checking the mod_counters here. 3288 If everything else matches execpt the mod_counter, that's 3289 good enough for this test. We will deal with the mod_counters 3290 a little later in the autoconfiguration process. 3291 3292 (clabel1->mod_counter == clabel2->mod_counter) && 3293 3294 The reason we don't check for this is that failed disks 3295 will have lower modification counts. If those disks are 3296 not added to the set they used to belong to, then they will 3297 form their own set, which may result in 2 different sets, 3298 for example, competing to be configured at raid0, and 3299 perhaps competing to be the root filesystem set. If the 3300 wrong ones get configured, or both attempt to become /, 3301 weird behaviour and or serious lossage will occur. Thus we 3302 need to bring them into the fold here, and kick them out at 3303 a later point. 3304 3305 */ 3306 3307 clabel1 = cset->ac->clabel; 3308 clabel2 = ac->clabel; 3309 if ((clabel1->version == clabel2->version) && 3310 (clabel1->serial_number == clabel2->serial_number) && 3311 (clabel1->num_rows == clabel2->num_rows) && 3312 (clabel1->num_columns == clabel2->num_columns) && 3313 (clabel1->sectPerSU == clabel2->sectPerSU) && 3314 (clabel1->SUsPerPU == clabel2->SUsPerPU) && 3315 (clabel1->SUsPerRU == clabel2->SUsPerRU) && 3316 (clabel1->parityConfig == clabel2->parityConfig) && 3317 (clabel1->maxOutstanding == clabel2->maxOutstanding) && 3318 (clabel1->blockSize == clabel2->blockSize) && 3319 (clabel1->numBlocks == clabel2->numBlocks) && 3320 (clabel1->numBlocksHi == clabel2->numBlocksHi) && 3321 (clabel1->autoconfigure == clabel2->autoconfigure) && 3322 (clabel1->root_partition == clabel2->root_partition) && 3323 (clabel1->last_unit == clabel2->last_unit) && 3324 (clabel1->config_order == clabel2->config_order)) { 3325 /* if it get's here, it almost *has* to be a match */ 3326 } else { 3327 /* it's not consistent with somebody in the set.. 3328 punt */ 3329 return(0); 3330 } 3331 /* all was fine.. it must fit... */ 3332 return(1); 3333 } 3334 3335 int 3336 rf_have_enough_components(RF_ConfigSet_t *cset) 3337 { 3338 RF_AutoConfig_t *ac; 3339 RF_AutoConfig_t *auto_config; 3340 RF_ComponentLabel_t *clabel; 3341 int c; 3342 int num_cols; 3343 int num_missing; 3344 int mod_counter; 3345 int mod_counter_found; 3346 int even_pair_failed; 3347 char parity_type; 3348 3349 3350 /* check to see that we have enough 'live' components 3351 of this set. If so, we can configure it if necessary */ 3352 3353 num_cols = cset->ac->clabel->num_columns; 3354 parity_type = cset->ac->clabel->parityConfig; 3355 3356 /* XXX Check for duplicate components!?!?!? */ 3357 3358 /* Determine what the mod_counter is supposed to be for this set. */ 3359 3360 mod_counter_found = 0; 3361 mod_counter = 0; 3362 ac = cset->ac; 3363 while(ac!=NULL) { 3364 if (mod_counter_found==0) { 3365 mod_counter = ac->clabel->mod_counter; 3366 mod_counter_found = 1; 3367 } else { 3368 if (ac->clabel->mod_counter > mod_counter) { 3369 mod_counter = ac->clabel->mod_counter; 3370 } 3371 } 3372 ac = ac->next; 3373 } 3374 3375 num_missing = 0; 3376 auto_config = cset->ac; 3377 3378 even_pair_failed = 0; 3379 for(c=0; c<num_cols; c++) { 3380 ac = auto_config; 3381 while(ac!=NULL) { 3382 if ((ac->clabel->column == c) && 3383 (ac->clabel->mod_counter == mod_counter)) { 3384 /* it's this one... */ 3385 #ifdef DEBUG 3386 printf("Found: %s at %d\n", 3387 ac->devname,c); 3388 #endif 3389 break; 3390 } 3391 ac=ac->next; 3392 } 3393 if (ac==NULL) { 3394 /* Didn't find one here! */ 3395 /* special case for RAID 1, especially 3396 where there are more than 2 3397 components (where RAIDframe treats 3398 things a little differently :( ) */ 3399 if (parity_type == '1') { 3400 if (c%2 == 0) { /* even component */ 3401 even_pair_failed = 1; 3402 } else { /* odd component. If 3403 we're failed, and 3404 so is the even 3405 component, it's 3406 "Good Night, Charlie" */ 3407 if (even_pair_failed == 1) { 3408 return(0); 3409 } 3410 } 3411 } else { 3412 /* normal accounting */ 3413 num_missing++; 3414 } 3415 } 3416 if ((parity_type == '1') && (c%2 == 1)) { 3417 /* Just did an even component, and we didn't 3418 bail.. reset the even_pair_failed flag, 3419 and go on to the next component.... */ 3420 even_pair_failed = 0; 3421 } 3422 } 3423 3424 clabel = cset->ac->clabel; 3425 3426 if (((clabel->parityConfig == '0') && (num_missing > 0)) || 3427 ((clabel->parityConfig == '4') && (num_missing > 1)) || 3428 ((clabel->parityConfig == '5') && (num_missing > 1))) { 3429 /* XXX this needs to be made *much* more general */ 3430 /* Too many failures */ 3431 return(0); 3432 } 3433 /* otherwise, all is well, and we've got enough to take a kick 3434 at autoconfiguring this set */ 3435 return(1); 3436 } 3437 3438 void 3439 rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config, 3440 RF_Raid_t *raidPtr) 3441 { 3442 RF_ComponentLabel_t *clabel; 3443 int i; 3444 3445 clabel = ac->clabel; 3446 3447 /* 1. Fill in the common stuff */ 3448 config->numRow = clabel->num_rows = 1; 3449 config->numCol = clabel->num_columns; 3450 config->numSpare = 0; /* XXX should this be set here? */ 3451 config->sectPerSU = clabel->sectPerSU; 3452 config->SUsPerPU = clabel->SUsPerPU; 3453 config->SUsPerRU = clabel->SUsPerRU; 3454 config->parityConfig = clabel->parityConfig; 3455 /* XXX... */ 3456 strcpy(config->diskQueueType,"fifo"); 3457 config->maxOutstandingDiskReqs = clabel->maxOutstanding; 3458 config->layoutSpecificSize = 0; /* XXX ?? */ 3459 3460 while(ac!=NULL) { 3461 /* row/col values will be in range due to the checks 3462 in reasonable_label() */ 3463 strcpy(config->devnames[0][ac->clabel->column], 3464 ac->devname); 3465 ac = ac->next; 3466 } 3467 3468 for(i=0;i<RF_MAXDBGV;i++) { 3469 config->debugVars[i][0] = 0; 3470 } 3471 } 3472 3473 int 3474 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value) 3475 { 3476 RF_ComponentLabel_t *clabel; 3477 int column; 3478 int sparecol; 3479 3480 raidPtr->autoconfigure = new_value; 3481 3482 for(column=0; column<raidPtr->numCol; column++) { 3483 if (raidPtr->Disks[column].status == rf_ds_optimal) { 3484 clabel = raidget_component_label(raidPtr, column); 3485 clabel->autoconfigure = new_value; 3486 raidflush_component_label(raidPtr, column); 3487 } 3488 } 3489 for(column = 0; column < raidPtr->numSpare ; column++) { 3490 sparecol = raidPtr->numCol + column; 3491 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) { 3492 clabel = raidget_component_label(raidPtr, sparecol); 3493 clabel->autoconfigure = new_value; 3494 raidflush_component_label(raidPtr, sparecol); 3495 } 3496 } 3497 return(new_value); 3498 } 3499 3500 int 3501 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value) 3502 { 3503 RF_ComponentLabel_t *clabel; 3504 int column; 3505 int sparecol; 3506 3507 raidPtr->root_partition = new_value; 3508 for(column=0; column<raidPtr->numCol; column++) { 3509 if (raidPtr->Disks[column].status == rf_ds_optimal) { 3510 clabel = raidget_component_label(raidPtr, column); 3511 clabel->root_partition = new_value; 3512 raidflush_component_label(raidPtr, column); 3513 } 3514 } 3515 for(column = 0; column < raidPtr->numSpare ; column++) { 3516 sparecol = raidPtr->numCol + column; 3517 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) { 3518 clabel = raidget_component_label(raidPtr, sparecol); 3519 clabel->root_partition = new_value; 3520 raidflush_component_label(raidPtr, sparecol); 3521 } 3522 } 3523 return(new_value); 3524 } 3525 3526 void 3527 rf_release_all_vps(RF_ConfigSet_t *cset) 3528 { 3529 RF_AutoConfig_t *ac; 3530 3531 ac = cset->ac; 3532 while(ac!=NULL) { 3533 /* Close the vp, and give it back */ 3534 if (ac->vp) { 3535 vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY); 3536 VOP_CLOSE(ac->vp, FREAD, NOCRED); 3537 vput(ac->vp); 3538 ac->vp = NULL; 3539 } 3540 ac = ac->next; 3541 } 3542 } 3543 3544 3545 void 3546 rf_cleanup_config_set(RF_ConfigSet_t *cset) 3547 { 3548 RF_AutoConfig_t *ac; 3549 RF_AutoConfig_t *next_ac; 3550 3551 ac = cset->ac; 3552 while(ac!=NULL) { 3553 next_ac = ac->next; 3554 /* nuke the label */ 3555 free(ac->clabel, M_RAIDFRAME); 3556 /* cleanup the config structure */ 3557 free(ac, M_RAIDFRAME); 3558 /* "next.." */ 3559 ac = next_ac; 3560 } 3561 /* and, finally, nuke the config set */ 3562 free(cset, M_RAIDFRAME); 3563 } 3564 3565 3566 void 3567 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel) 3568 { 3569 /* current version number */ 3570 clabel->version = RF_COMPONENT_LABEL_VERSION; 3571 clabel->serial_number = raidPtr->serial_number; 3572 clabel->mod_counter = raidPtr->mod_counter; 3573 3574 clabel->num_rows = 1; 3575 clabel->num_columns = raidPtr->numCol; 3576 clabel->clean = RF_RAID_DIRTY; /* not clean */ 3577 clabel->status = rf_ds_optimal; /* "It's good!" */ 3578 3579 clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit; 3580 clabel->SUsPerPU = raidPtr->Layout.SUsPerPU; 3581 clabel->SUsPerRU = raidPtr->Layout.SUsPerRU; 3582 3583 clabel->blockSize = raidPtr->bytesPerSector; 3584 clabel->numBlocks = raidPtr->sectorsPerDisk; 3585 clabel->numBlocksHi = raidPtr->sectorsPerDisk >> 32; 3586 3587 /* XXX not portable */ 3588 clabel->parityConfig = raidPtr->Layout.map->parityConfig; 3589 clabel->maxOutstanding = raidPtr->maxOutstanding; 3590 clabel->autoconfigure = raidPtr->autoconfigure; 3591 clabel->root_partition = raidPtr->root_partition; 3592 clabel->last_unit = raidPtr->raidid; 3593 clabel->config_order = raidPtr->config_order; 3594 3595 #ifndef RF_NO_PARITY_MAP 3596 rf_paritymap_init_label(raidPtr->parity_map, clabel); 3597 #endif 3598 } 3599 3600 int 3601 rf_auto_config_set(RF_ConfigSet_t *cset, int *unit) 3602 { 3603 RF_Raid_t *raidPtr; 3604 RF_Config_t *config; 3605 int raidID; 3606 int retcode; 3607 3608 #ifdef DEBUG 3609 printf("RAID autoconfigure\n"); 3610 #endif 3611 3612 retcode = 0; 3613 *unit = -1; 3614 3615 /* 1. Create a config structure */ 3616 3617 config = (RF_Config_t *)malloc(sizeof(RF_Config_t), 3618 M_RAIDFRAME, 3619 M_NOWAIT); 3620 if (config==NULL) { 3621 printf("Out of mem!?!?\n"); 3622 /* XXX do something more intelligent here. */ 3623 return(1); 3624 } 3625 3626 memset(config, 0, sizeof(RF_Config_t)); 3627 3628 /* 3629 2. Figure out what RAID ID this one is supposed to live at 3630 See if we can get the same RAID dev that it was configured 3631 on last time.. 3632 */ 3633 3634 raidID = cset->ac->clabel->last_unit; 3635 if ((raidID < 0) || (raidID >= numraid)) { 3636 /* let's not wander off into lala land. */ 3637 raidID = numraid - 1; 3638 } 3639 if (raidPtrs[raidID]->valid != 0) { 3640 3641 /* 3642 Nope... Go looking for an alternative... 3643 Start high so we don't immediately use raid0 if that's 3644 not taken. 3645 */ 3646 3647 for(raidID = numraid - 1; raidID >= 0; raidID--) { 3648 if (raidPtrs[raidID]->valid == 0) { 3649 /* can use this one! */ 3650 break; 3651 } 3652 } 3653 } 3654 3655 if (raidID < 0) { 3656 /* punt... */ 3657 printf("Unable to auto configure this set!\n"); 3658 printf("(Out of RAID devs!)\n"); 3659 free(config, M_RAIDFRAME); 3660 return(1); 3661 } 3662 3663 #ifdef DEBUG 3664 printf("Configuring raid%d:\n",raidID); 3665 #endif 3666 3667 raidPtr = raidPtrs[raidID]; 3668 3669 /* XXX all this stuff should be done SOMEWHERE ELSE! */ 3670 raidPtr->raidid = raidID; 3671 raidPtr->openings = RAIDOUTSTANDING; 3672 3673 /* 3. Build the configuration structure */ 3674 rf_create_configuration(cset->ac, config, raidPtr); 3675 3676 /* 4. Do the configuration */ 3677 retcode = rf_Configure(raidPtr, config, cset->ac); 3678 3679 if (retcode == 0) { 3680 3681 raidinit(raidPtrs[raidID]); 3682 3683 rf_markalldirty(raidPtrs[raidID]); 3684 raidPtrs[raidID]->autoconfigure = 1; /* XXX do this here? */ 3685 if (cset->ac->clabel->root_partition==1) { 3686 /* everything configured just fine. Make a note 3687 that this set is eligible to be root. */ 3688 cset->rootable = 1; 3689 /* XXX do this here? */ 3690 raidPtrs[raidID]->root_partition = 1; 3691 } 3692 } 3693 3694 /* 5. Cleanup */ 3695 free(config, M_RAIDFRAME); 3696 3697 *unit = raidID; 3698 return(retcode); 3699 } 3700 3701 void 3702 rf_disk_unbusy(RF_RaidAccessDesc_t *desc) 3703 { 3704 struct buf *bp; 3705 3706 bp = (struct buf *)desc->bp; 3707 disk_unbusy(&raid_softc[desc->raidPtr->raidid].sc_dkdev, 3708 (bp->b_bcount - bp->b_resid), (bp->b_flags & B_READ)); 3709 } 3710 3711 void 3712 rf_pool_init(struct pool *p, size_t size, const char *w_chan, 3713 size_t xmin, size_t xmax) 3714 { 3715 pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO); 3716 pool_sethiwat(p, xmax); 3717 pool_prime(p, xmin); 3718 pool_setlowat(p, xmin); 3719 } 3720 3721 /* 3722 * rf_buf_queue_check(int raidid) -- looks into the buf_queue to see 3723 * if there is IO pending and if that IO could possibly be done for a 3724 * given RAID set. Returns 0 if IO is waiting and can be done, 1 3725 * otherwise. 3726 * 3727 */ 3728 3729 int 3730 rf_buf_queue_check(int raidid) 3731 { 3732 if ((bufq_peek(raid_softc[raidid].buf_queue) != NULL) && 3733 raidPtrs[raidid]->openings > 0) { 3734 /* there is work to do */ 3735 return 0; 3736 } 3737 /* default is nothing to do */ 3738 return 1; 3739 } 3740 3741 int 3742 rf_getdisksize(struct vnode *vp, struct lwp *l, RF_RaidDisk_t *diskPtr) 3743 { 3744 uint64_t numsecs; 3745 unsigned secsize; 3746 int error; 3747 3748 error = getdisksize(vp, &numsecs, &secsize); 3749 if (error == 0) { 3750 diskPtr->blockSize = secsize; 3751 diskPtr->numBlocks = numsecs - rf_protectedSectors; 3752 diskPtr->partitionSize = numsecs; 3753 return 0; 3754 } 3755 return error; 3756 } 3757 3758 static int 3759 raid_match(device_t self, cfdata_t cfdata, void *aux) 3760 { 3761 return 1; 3762 } 3763 3764 static void 3765 raid_attach(device_t parent, device_t self, void *aux) 3766 { 3767 3768 } 3769 3770 3771 static int 3772 raid_detach(device_t self, int flags) 3773 { 3774 int error; 3775 struct raid_softc *rs = &raid_softc[device_unit(self)]; 3776 3777 if ((error = raidlock(rs)) != 0) 3778 return (error); 3779 3780 error = raid_detach_unlocked(rs); 3781 3782 raidunlock(rs); 3783 3784 return error; 3785 } 3786 3787 static void 3788 rf_set_properties(struct raid_softc *rs, RF_Raid_t *raidPtr) 3789 { 3790 prop_dictionary_t disk_info, odisk_info, geom; 3791 disk_info = prop_dictionary_create(); 3792 geom = prop_dictionary_create(); 3793 prop_dictionary_set_uint64(geom, "sectors-per-unit", 3794 raidPtr->totalSectors); 3795 prop_dictionary_set_uint32(geom, "sector-size", 3796 raidPtr->bytesPerSector); 3797 3798 prop_dictionary_set_uint16(geom, "sectors-per-track", 3799 raidPtr->Layout.dataSectorsPerStripe); 3800 prop_dictionary_set_uint16(geom, "tracks-per-cylinder", 3801 4 * raidPtr->numCol); 3802 3803 prop_dictionary_set_uint64(geom, "cylinders-per-unit", 3804 raidPtr->totalSectors / (raidPtr->Layout.dataSectorsPerStripe * 3805 (4 * raidPtr->numCol))); 3806 3807 prop_dictionary_set(disk_info, "geometry", geom); 3808 prop_object_release(geom); 3809 prop_dictionary_set(device_properties(rs->sc_dev), 3810 "disk-info", disk_info); 3811 odisk_info = rs->sc_dkdev.dk_info; 3812 rs->sc_dkdev.dk_info = disk_info; 3813 if (odisk_info) 3814 prop_object_release(odisk_info); 3815 } 3816 3817 /* 3818 * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components. 3819 * We end up returning whatever error was returned by the first cache flush 3820 * that fails. 3821 */ 3822 3823 int 3824 rf_sync_component_caches(RF_Raid_t *raidPtr) 3825 { 3826 int c, sparecol; 3827 int e,error; 3828 int force = 1; 3829 3830 error = 0; 3831 for (c = 0; c < raidPtr->numCol; c++) { 3832 if (raidPtr->Disks[c].status == rf_ds_optimal) { 3833 e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC, 3834 &force, FWRITE, NOCRED); 3835 if (e) { 3836 if (e != ENODEV) 3837 printf("raid%d: cache flush to component %s failed.\n", 3838 raidPtr->raidid, raidPtr->Disks[c].devname); 3839 if (error == 0) { 3840 error = e; 3841 } 3842 } 3843 } 3844 } 3845 3846 for( c = 0; c < raidPtr->numSpare ; c++) { 3847 sparecol = raidPtr->numCol + c; 3848 /* Need to ensure that the reconstruct actually completed! */ 3849 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) { 3850 e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp, 3851 DIOCCACHESYNC, &force, FWRITE, NOCRED); 3852 if (e) { 3853 if (e != ENODEV) 3854 printf("raid%d: cache flush to component %s failed.\n", 3855 raidPtr->raidid, raidPtr->Disks[sparecol].devname); 3856 if (error == 0) { 3857 error = e; 3858 } 3859 } 3860 } 3861 } 3862 return error; 3863 } 3864