1 /* $NetBSD: rf_netbsdkintf.c,v 1.291 2011/05/11 18:13:12 mrg Exp $ */ 2 3 /*- 4 * Copyright (c) 1996, 1997, 1998, 2008 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Greg Oster; Jason R. Thorpe. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * Copyright (c) 1988 University of Utah. 34 * Copyright (c) 1990, 1993 35 * The Regents of the University of California. All rights reserved. 36 * 37 * This code is derived from software contributed to Berkeley by 38 * the Systems Programming Group of the University of Utah Computer 39 * Science Department. 
40 * 41 * Redistribution and use in source and binary forms, with or without 42 * modification, are permitted provided that the following conditions 43 * are met: 44 * 1. Redistributions of source code must retain the above copyright 45 * notice, this list of conditions and the following disclaimer. 46 * 2. Redistributions in binary form must reproduce the above copyright 47 * notice, this list of conditions and the following disclaimer in the 48 * documentation and/or other materials provided with the distribution. 49 * 3. Neither the name of the University nor the names of its contributors 50 * may be used to endorse or promote products derived from this software 51 * without specific prior written permission. 52 * 53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 63 * SUCH DAMAGE. 64 * 65 * from: Utah $Hdr: cd.c 1.6 90/11/28$ 66 * 67 * @(#)cd.c 8.2 (Berkeley) 11/16/93 68 */ 69 70 /* 71 * Copyright (c) 1995 Carnegie-Mellon University. 72 * All rights reserved. 
73 * 74 * Authors: Mark Holland, Jim Zelenka 75 * 76 * Permission to use, copy, modify and distribute this software and 77 * its documentation is hereby granted, provided that both the copyright 78 * notice and this permission notice appear in all copies of the 79 * software, derivative works or modified versions, and any portions 80 * thereof, and that both notices appear in supporting documentation. 81 * 82 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 83 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 84 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 85 * 86 * Carnegie Mellon requests users of this software to return to 87 * 88 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 89 * School of Computer Science 90 * Carnegie Mellon University 91 * Pittsburgh PA 15213-3890 92 * 93 * any improvements or extensions that they make and grant Carnegie the 94 * rights to redistribute these changes. 95 */ 96 97 /*********************************************************** 98 * 99 * rf_kintf.c -- the kernel interface routines for RAIDframe 100 * 101 ***********************************************************/ 102 103 #include <sys/cdefs.h> 104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.291 2011/05/11 18:13:12 mrg Exp $"); 105 106 #ifdef _KERNEL_OPT 107 #include "opt_compat_netbsd.h" 108 #include "opt_raid_autoconfig.h" 109 #include "raid.h" 110 #endif 111 112 #include <sys/param.h> 113 #include <sys/errno.h> 114 #include <sys/pool.h> 115 #include <sys/proc.h> 116 #include <sys/queue.h> 117 #include <sys/disk.h> 118 #include <sys/device.h> 119 #include <sys/stat.h> 120 #include <sys/ioctl.h> 121 #include <sys/fcntl.h> 122 #include <sys/systm.h> 123 #include <sys/vnode.h> 124 #include <sys/disklabel.h> 125 #include <sys/conf.h> 126 #include <sys/buf.h> 127 #include <sys/bufq.h> 128 #include <sys/reboot.h> 129 #include <sys/kauth.h> 130 131 #include <prop/proplib.h> 132 133 #include 
<dev/raidframe/raidframevar.h> 134 #include <dev/raidframe/raidframeio.h> 135 #include <dev/raidframe/rf_paritymap.h> 136 137 #include "rf_raid.h" 138 #include "rf_copyback.h" 139 #include "rf_dag.h" 140 #include "rf_dagflags.h" 141 #include "rf_desc.h" 142 #include "rf_diskqueue.h" 143 #include "rf_etimer.h" 144 #include "rf_general.h" 145 #include "rf_kintf.h" 146 #include "rf_options.h" 147 #include "rf_driver.h" 148 #include "rf_parityscan.h" 149 #include "rf_threadstuff.h" 150 151 #ifdef COMPAT_50 152 #include "rf_compat50.h" 153 #endif 154 155 #ifdef DEBUG 156 int rf_kdebug_level = 0; 157 #define db1_printf(a) if (rf_kdebug_level > 0) printf a 158 #else /* DEBUG */ 159 #define db1_printf(a) { } 160 #endif /* DEBUG */ 161 162 static RF_Raid_t **raidPtrs; /* global raid device descriptors */ 163 164 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0) 165 static rf_declare_mutex2(rf_sparet_wait_mutex); 166 static rf_declare_cond2(rf_sparet_wait_cv); 167 static rf_declare_cond2(rf_sparet_resp_cv); 168 169 static RF_SparetWait_t *rf_sparet_wait_queue; /* requests to install a 170 * spare table */ 171 static RF_SparetWait_t *rf_sparet_resp_queue; /* responses from 172 * installation process */ 173 #endif 174 175 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures"); 176 177 /* prototypes */ 178 static void KernelWakeupFunc(struct buf *); 179 static void InitBP(struct buf *, struct vnode *, unsigned, 180 dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *), 181 void *, int, struct proc *); 182 static void raidinit(RF_Raid_t *); 183 184 void raidattach(int); 185 static int raid_match(device_t, cfdata_t, void *); 186 static void raid_attach(device_t, device_t, void *); 187 static int raid_detach(device_t, int); 188 189 static int raidread_component_area(dev_t, struct vnode *, void *, size_t, 190 daddr_t, daddr_t); 191 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t, 192 daddr_t, daddr_t, int); 193 194 static int 
raidwrite_component_label(unsigned, 195 dev_t, struct vnode *, RF_ComponentLabel_t *); 196 static int raidread_component_label(unsigned, 197 dev_t, struct vnode *, RF_ComponentLabel_t *); 198 199 200 dev_type_open(raidopen); 201 dev_type_close(raidclose); 202 dev_type_read(raidread); 203 dev_type_write(raidwrite); 204 dev_type_ioctl(raidioctl); 205 dev_type_strategy(raidstrategy); 206 dev_type_dump(raiddump); 207 dev_type_size(raidsize); 208 209 const struct bdevsw raid_bdevsw = { 210 raidopen, raidclose, raidstrategy, raidioctl, 211 raiddump, raidsize, D_DISK 212 }; 213 214 const struct cdevsw raid_cdevsw = { 215 raidopen, raidclose, raidread, raidwrite, raidioctl, 216 nostop, notty, nopoll, nommap, nokqfilter, D_DISK 217 }; 218 219 static struct dkdriver rf_dkdriver = { raidstrategy, minphys }; 220 221 /* XXX Not sure if the following should be replacing the raidPtrs above, 222 or if it should be used in conjunction with that... 223 */ 224 225 struct raid_softc { 226 device_t sc_dev; 227 int sc_flags; /* flags */ 228 int sc_cflags; /* configuration flags */ 229 uint64_t sc_size; /* size of the raid device */ 230 char sc_xname[20]; /* XXX external name */ 231 struct disk sc_dkdev; /* generic disk device info */ 232 struct bufq_state *buf_queue; /* used for the device queue */ 233 }; 234 /* sc_flags */ 235 #define RAIDF_INITED 0x01 /* unit has been initialized */ 236 #define RAIDF_WLABEL 0x02 /* label area is writable */ 237 #define RAIDF_LABELLING 0x04 /* unit is currently being labelled */ 238 #define RAIDF_SHUTDOWN 0x08 /* unit is being shutdown */ 239 #define RAIDF_WANTED 0x40 /* someone is waiting to obtain a lock */ 240 #define RAIDF_LOCKED 0x80 /* unit is locked */ 241 242 #define raidunit(x) DISKUNIT(x) 243 int numraid = 0; 244 245 extern struct cfdriver raid_cd; 246 CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc), 247 raid_match, raid_attach, raid_detach, NULL, NULL, NULL, 248 DVF_DETACH_SHUTDOWN); 249 250 /* 251 * Allow RAIDOUTSTANDING number of 
simultaneous IO's to this RAID device.
 * Be aware that large numbers can allow the driver to consume a lot of
 * kernel memory, especially on writes, and in degraded mode reads.
 *
 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
 * a single 64K write will typically require 64K for the old data,
 * 64K for the old parity, and 64K for the new parity, for a total
 * of 192K (if the parity buffer is not re-used immediately).
 * Even if it is used immediately, that's still 128K, which when multiplied
 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
 *
 * Now in degraded mode, for example, a 64K read on the above setup may
 * require data reconstruction, which will require *all* of the 4 remaining
 * disks to participate -- 4 * 32K/disk == 128K again.
 */

#ifndef RAIDOUTSTANDING
#define RAIDOUTSTANDING 6
#endif

#define RAIDLABELDEV(dev)	\
	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))

/* declared here, and made public, for the benefit of KVM stuff..
*/ 275 struct raid_softc *raid_softc; 276 277 static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *, 278 struct disklabel *); 279 static void raidgetdisklabel(dev_t); 280 static void raidmakedisklabel(struct raid_softc *); 281 282 static int raidlock(struct raid_softc *); 283 static void raidunlock(struct raid_softc *); 284 285 static int raid_detach_unlocked(struct raid_softc *); 286 287 static void rf_markalldirty(RF_Raid_t *); 288 static void rf_set_properties(struct raid_softc *, RF_Raid_t *); 289 290 void rf_ReconThread(struct rf_recon_req *); 291 void rf_RewriteParityThread(RF_Raid_t *raidPtr); 292 void rf_CopybackThread(RF_Raid_t *raidPtr); 293 void rf_ReconstructInPlaceThread(struct rf_recon_req *); 294 int rf_autoconfig(device_t); 295 void rf_buildroothack(RF_ConfigSet_t *); 296 297 RF_AutoConfig_t *rf_find_raid_components(void); 298 RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *); 299 static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *); 300 static int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t); 301 void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *); 302 int rf_set_autoconfig(RF_Raid_t *, int); 303 int rf_set_rootpartition(RF_Raid_t *, int); 304 void rf_release_all_vps(RF_ConfigSet_t *); 305 void rf_cleanup_config_set(RF_ConfigSet_t *); 306 int rf_have_enough_components(RF_ConfigSet_t *); 307 int rf_auto_config_set(RF_ConfigSet_t *, int *); 308 static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t); 309 310 static int raidautoconfig = 0; /* Debugging, mostly. Set to 0 to not 311 allow autoconfig to take place. 312 Note that this is overridden by having 313 RAID_AUTOCONFIG as an option in the 314 kernel config file. 
 */

struct RF_Pools_s rf_pools;

/*
 * raidattach: pseudo-device attach routine, called with the number of
 * units to support.  Allocates and zeroes the global raidPtrs[] and
 * raid_softc[] arrays, boots the RAIDframe core, hooks up the cfattach
 * and registers a config finalizer so autoconfiguration runs after all
 * real hardware has been found.  On allocation failure it prints a
 * warning and returns, leaving numraid trimmed to the units that exist.
 */
void
raidattach(int num)
{
	int raidID;
	int i, rc;

	aprint_debug("raidattach: Asked for %d units\n", num);

	if (num <= 0) {
#ifdef DIAGNOSTIC
		panic("raidattach: count <= 0");
#endif
		return;
	}
	/* This is where all the initialization stuff gets done. */

	numraid = num;

	/* Make some space for requested number of units... */

	RF_Malloc(raidPtrs, num * sizeof(RF_Raid_t *), (RF_Raid_t **));
	if (raidPtrs == NULL) {
		panic("raidPtrs is NULL!!");
	}

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	/* Spare-table plumbing is only needed for declustered parity. */
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	for (i = 0; i < num; i++)
		raidPtrs[i] = NULL;
	rc = rf_BootRaidframe();
	if (rc == 0)
		aprint_verbose("Kernelized RAIDframe activated\n");
	else
		panic("Serious error booting RAID!!");

	/* put together some datastructures like the CCD device does.. This
	 * lets us lock the device and what-not when it gets opened. */

	raid_softc = (struct raid_softc *)
	    malloc(num * sizeof(struct raid_softc),
		   M_RAIDFRAME, M_NOWAIT);
	if (raid_softc == NULL) {
		aprint_error("WARNING: no memory for RAIDframe driver\n");
		return;
	}

	memset(raid_softc, 0, num * sizeof(struct raid_softc));

	for (raidID = 0; raidID < num; raidID++) {
		bufq_alloc(&raid_softc[raidID].buf_queue, "fcfs", 0);

		RF_Malloc(raidPtrs[raidID], sizeof(RF_Raid_t),
			  (RF_Raid_t *));
		if (raidPtrs[raidID] == NULL) {
			aprint_error("WARNING: raidPtrs[%d] is NULL\n", raidID);
			/* Shrink numraid so later loops don't touch the
			   units we failed to allocate. */
			numraid = raidID;
			return;
		}
	}

	if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
		aprint_error("raidattach: config_cfattach_attach failed?\n");
	}

#ifdef RAID_AUTOCONFIG
	raidautoconfig = 1;
#endif

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	if (config_finalize_register(NULL, rf_autoconfig) != 0)
		aprint_error("WARNING: unable to register RAIDframe finalizer\n");
}

/*
 * rf_autoconfig: config finalizer callback.  Runs at most once
 * (raidautoconfig is cleared on entry): scans all disks for RAID
 * component labels, sorts the components into configuration sets,
 * and passes the sets to rf_buildroothack() for actual configuration.
 * Returns 1 the first time (work done), 0 on subsequent calls.
 */
int
rf_autoconfig(device_t self)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets;

	if (raidautoconfig == 0)
		return (0);

	/* XXX This code can only be run once. */
	raidautoconfig = 0;

	/* 1. locate all RAID components on the system */
	aprint_debug("Searching for RAID components...\n");
	ac_list = rf_find_raid_components();

	/* 2. Sort them into their respective sets. */
	config_sets = rf_create_auto_sets(ac_list);

	/*
	 * 3. Evaluate each set and configure the valid ones.
	 * This gets done in rf_buildroothack().
	 */
	rf_buildroothack(config_sets);

	return 1;
}

/*
 * rf_buildroothack: walk the list of auto-configuration sets,
 * configure each set that has enough components and has the
 * autoconfigure flag set in its component label, and release the
 * resources of the rest.  If exactly one configured set is marked
 * rootable it becomes booted_device; if several are, try to match
 * them against the MD-determined boot device, and failing that set
 * RB_ASKNAME so the user is asked at setroot() time.
 */
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int retcode;
	int raidID;
	int rootID;
	int col;
	int num_root;
	char *devname;

	rootID = 0;
	num_root = 0;
	cset = config_sets;
	while (cset != NULL) {
		/* rf_cleanup_config_set() frees cset, so grab the link now. */
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure==1) {
			retcode = rf_auto_config_set(cset,&raidID);
			if (!retcode) {
				aprint_debug("raid%d: configured ok\n", raidID);
				if (cset->rootable) {
					rootID = raidID;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed with code %d for raid%d\n", retcode, raidID);
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL)
		return;

	/* we found something bootable... */

	if (num_root == 1) {
		booted_device = raid_softc[rootID].sc_dev;
	} else if (num_root > 1) {

		/*
		 * Maybe the MD code can help. If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */

		if (booted_device == NULL)
			cpu_rootconf();
		if (booted_device == NULL)
			return;

		/* Re-count, keeping only rootable sets that actually
		   contain the device we booted from. */
		num_root = 0;
		for (raidID = 0; raidID < numraid; raidID++) {
			if (raidPtrs[raidID]->valid == 0)
				continue;

			if (raidPtrs[raidID]->root_partition == 0)
				continue;

			for (col = 0; col < raidPtrs[raidID]->numCol; col++) {
				devname = raidPtrs[raidID]->Disks[col].devname;
				/* skip the "/dev/" prefix before comparing */
				devname += sizeof("/dev/") - 1;
				if (strncmp(devname, device_xname(booted_device),
					    strlen(device_xname(booted_device))) != 0)
					continue;
				aprint_debug("raid%d includes boot device %s\n",
					     raidID, devname);
				num_root++;
				rootID = raidID;
			}
		}

		if (num_root == 1) {
			booted_device = raid_softc[rootID].sc_dev;
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}


/*
 * raidsize: return the size (in DEV_BSIZE units) of the swap partition
 * addressed by dev, or -1 if the unit is invalid, not initialized, or
 * the partition is not FS_SWAP.  Transiently opens the device if no
 * partition in its open mask is open yet.
 */
int
raidsize(dev_t dev)
{
	struct raid_softc *rs;
	struct disklabel *lp;
	int part, unit, omask, size;

	unit = raidunit(dev);
	if (unit >= numraid)
		return (-1);
	rs = &raid_softc[unit];

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return (-1);

	part = DISKPART(dev);
	omask = rs->sc_dkdev.dk_openmask & (1 << part);
	lp = rs->sc_dkdev.dk_label;

	if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
		return (-1);

	if (lp->d_partitions[part].p_fstype != FS_SWAP)
		size = -1;
	else
		size = lp->d_partitions[part].p_size *
		    (lp->d_secsize / DEV_BSIZE);

	/* undo the transient open from above */
	if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
		return (-1);

	return (size);

}

/*
 * raiddump: crash-dump entry point.  Only supported for RAID 1 sets
 * (one data column, one parity column); picks a live component (or a
 * used spare) and forwards the dump to that component's driver.
 */
int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	const struct bdevsw *bdev;
	struct disklabel *lp;
	RF_Raid_t *raidPtr;
	daddr_t offset;
	int part, c, sparecol, j, scol, dumpto;
	int error = 0;

	if (unit >= numraid)
		return (ENXIO);

	rs = &raid_softc[unit];
	raidPtr = raidPtrs[unit];

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENXIO;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;


	if ((error = raidlock(rs)) != 0)
		return error;

	/* dumps must be a whole number of DEV_BSIZE blocks */
	if (size % DEV_BSIZE != 0) {
		error = EINVAL;
		goto out;
	}

	if (blkno + size / DEV_BSIZE > rs->sc_size) {
		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
		    "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
		    size / DEV_BSIZE, rs->sc_size);
		error = EINVAL;
		goto out;
	}

	part = DISKPART(dev);
	lp = rs->sc_dkdev.dk_label;
	offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	 */

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	 */

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status ==  rf_ds_used_spare) {
			/* How about this one? */
			/* Find which column this spare is standing in for. */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				 */
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we haven't found anything
				   else so far.
				 */
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);

	/*
	   Note that blkno is relative to this particular partition.
	   By adding the offset of this partition in the RAID
	   set, and also adding RF_PROTECTED_SECTORS, we get a
	   value that is relative to the partition used for the
	   underlying component.
	 */

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
				blkno + offset, va, size);

out:
	raidunlock(rs);

	return error;
}
/* ARGSUSED */
/*
 * raidopen: block/character device open.  Validates the unit and
 * partition, reads the disklabel on first open, records the partition
 * in the appropriate open mask, and marks all components dirty on the
 * very first open of an initialized set.
 */
int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	struct disklabel *lp;
	int part, pmask;
	int error = 0;

	if (unit >= numraid)
		return (ENXIO);
	rs = &raid_softc[unit];

	if ((error = raidlock(rs)) != 0)
		return (error);

	/* unit is going away; don't allow new opens */
	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	lp = rs->sc_dkdev.dk_label;

	part = DISKPART(dev);

	/*
	 * If there are wedges, and this is not RAW_PART, then we
	 * need to fail.
	 */
	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
		error = EBUSY;
		goto bad;
	}
	pmask = (1 << part);

	/* first open of an initialized set: (re)read the disklabel */
	if ((rs->sc_flags & RAIDF_INITED) &&
	    (rs->sc_dkdev.dk_openmask == 0))
		raidgetdisklabel(dev);

	/* make sure that this partition exists */

	if (part != RAW_PART) {
		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
		    ((part >= lp->d_npartitions) ||
			(lp->d_partitions[part].p_fstype == FS_UNUSED))) {
			error = ENXIO;
			goto bad;
		}
	}
	/* Prevent this unit from being unconfigured while open. */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask |= pmask;
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask |= pmask;
		break;
	}

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		 have done a configure before this.  I DO NOT WANT TO BE
		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		 THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(raidPtrs[unit]);
	}


	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

bad:
	raidunlock(rs);

	return (error);


}
/* ARGSUSED */
/*
 * raidclose: device close.  Clears this partition from the relevant
 * open mask and, on last close of an initialized set, writes final
 * (clean) component labels.
 */
int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	int error = 0;
	int part;

	if (unit >= numraid)
		return (ENXIO);
	rs = &raid_softc[unit];

	if ((error = raidlock(rs)) != 0)
		return (error);

	part = DISKPART(dev);

	/* ...that much closer to allowing unconfiguration...
*/ 801 switch (fmt) { 802 case S_IFCHR: 803 rs->sc_dkdev.dk_copenmask &= ~(1 << part); 804 break; 805 806 case S_IFBLK: 807 rs->sc_dkdev.dk_bopenmask &= ~(1 << part); 808 break; 809 } 810 rs->sc_dkdev.dk_openmask = 811 rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask; 812 813 if ((rs->sc_dkdev.dk_openmask == 0) && 814 ((rs->sc_flags & RAIDF_INITED) != 0)) { 815 /* Last one... device is not unconfigured yet. 816 Device shutdown has taken care of setting the 817 clean bits if RAIDF_INITED is not set 818 mark things as clean... */ 819 820 rf_update_component_labels(raidPtrs[unit], 821 RF_FINAL_COMPONENT_UPDATE); 822 823 /* If the kernel is shutting down, it will detach 824 * this RAID set soon enough. 825 */ 826 } 827 828 raidunlock(rs); 829 return (0); 830 831 } 832 833 void 834 raidstrategy(struct buf *bp) 835 { 836 unsigned int raidID = raidunit(bp->b_dev); 837 RF_Raid_t *raidPtr; 838 struct raid_softc *rs = &raid_softc[raidID]; 839 int wlabel; 840 841 if ((rs->sc_flags & RAIDF_INITED) ==0) { 842 bp->b_error = ENXIO; 843 goto done; 844 } 845 if (raidID >= numraid || !raidPtrs[raidID]) { 846 bp->b_error = ENODEV; 847 goto done; 848 } 849 raidPtr = raidPtrs[raidID]; 850 if (!raidPtr->valid) { 851 bp->b_error = ENODEV; 852 goto done; 853 } 854 if (bp->b_bcount == 0) { 855 db1_printf(("b_bcount is zero..\n")); 856 goto done; 857 } 858 859 /* 860 * Do bounds checking and adjust transfer. If there's an 861 * error, the bounds check will flag that for us. 
862 */ 863 864 wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING); 865 if (DISKPART(bp->b_dev) == RAW_PART) { 866 uint64_t size; /* device size in DEV_BSIZE unit */ 867 868 if (raidPtr->logBytesPerSector > DEV_BSHIFT) { 869 size = raidPtr->totalSectors << 870 (raidPtr->logBytesPerSector - DEV_BSHIFT); 871 } else { 872 size = raidPtr->totalSectors >> 873 (DEV_BSHIFT - raidPtr->logBytesPerSector); 874 } 875 if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) { 876 goto done; 877 } 878 } else { 879 if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) { 880 db1_printf(("Bounds check failed!!:%d %d\n", 881 (int) bp->b_blkno, (int) wlabel)); 882 goto done; 883 } 884 } 885 886 rf_lock_mutex2(raidPtr->iodone_lock); 887 888 bp->b_resid = 0; 889 890 /* stuff it onto our queue */ 891 bufq_put(rs->buf_queue, bp); 892 893 /* scheduled the IO to happen at the next convenient time */ 894 rf_signal_cond2(raidPtr->iodone_cv); 895 rf_unlock_mutex2(raidPtr->iodone_lock); 896 897 return; 898 899 done: 900 bp->b_resid = bp->b_bcount; 901 biodone(bp); 902 } 903 /* ARGSUSED */ 904 int 905 raidread(dev_t dev, struct uio *uio, int flags) 906 { 907 int unit = raidunit(dev); 908 struct raid_softc *rs; 909 910 if (unit >= numraid) 911 return (ENXIO); 912 rs = &raid_softc[unit]; 913 914 if ((rs->sc_flags & RAIDF_INITED) == 0) 915 return (ENXIO); 916 917 return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio)); 918 919 } 920 /* ARGSUSED */ 921 int 922 raidwrite(dev_t dev, struct uio *uio, int flags) 923 { 924 int unit = raidunit(dev); 925 struct raid_softc *rs; 926 927 if (unit >= numraid) 928 return (ENXIO); 929 rs = &raid_softc[unit]; 930 931 if ((rs->sc_flags & RAIDF_INITED) == 0) 932 return (ENXIO); 933 934 return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio)); 935 936 } 937 938 static int 939 raid_detach_unlocked(struct raid_softc *rs) 940 { 941 int error; 942 RF_Raid_t *raidPtr; 943 944 raidPtr = raidPtrs[device_unit(rs->sc_dev)]; 945 946 /* 947 * If 
somebody has a partition mounted, we shouldn't
	 * shutdown.
	 */
	if (rs->sc_dkdev.dk_openmask != 0)
		return EBUSY;

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		;	/* not initialized: nothing to do */
	else if ((error = rf_Shutdown(raidPtr)) != 0)
		return error;
	else
		rs->sc_flags &= ~(RAIDF_INITED|RAIDF_SHUTDOWN);

	/* Detach the disk. */
	dkwedge_delall(&rs->sc_dkdev);
	disk_detach(&rs->sc_dkdev);
	disk_destroy(&rs->sc_dkdev);

	aprint_normal_dev(rs->sc_dev, "detached\n");

	return 0;
}

/*
 * raidioctl: ioctl entry point.  Handles both the standard disk
 * ioctls (disklabel, wedges, ...) and the RAIDFRAME_* control
 * ioctls (configure, shutdown, rebuild, component labels, ...).
 */
int
raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
	int unit = raidunit(dev);
	int error = 0;
	int part, pmask;
	cfdata_t cf;
	struct raid_softc *rs;
	RF_Config_t *k_cfg, *u_cfg;
	RF_Raid_t *raidPtr;
	RF_RaidDisk_t *diskPtr;
	RF_AccTotals_t *totals;
	RF_DeviceConfig_t *d_cfg, **ucfgp;
	u_char *specific_buf;
	int retcode = 0;
	int column;
	/* int raidid; */
	struct rf_recon_req *rrcopy, *rr;
	RF_ComponentLabel_t *clabel;
	RF_ComponentLabel_t *ci_label;
	RF_ComponentLabel_t **clabel_ptr;
	RF_SingleComponent_t *sparePtr,*componentPtr;
	RF_SingleComponent_t component;
	RF_ProgressInfo_t progressInfo, **progressInfoPtr;
	int i, j, d;
#ifdef __HAVE_OLD_DISKLABEL
	struct disklabel newlabel;
#endif
	struct dkwedge_info *dkw;

	if (unit >= numraid)
		return (ENXIO);
	rs = &raid_softc[unit];
	raidPtr = raidPtrs[unit];

	db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev,
		(int) DISKPART(dev), (int) unit, cmd));

	/* Must be open for writes for these commands...
*/ 1009 switch (cmd) { 1010 #ifdef DIOCGSECTORSIZE 1011 case DIOCGSECTORSIZE: 1012 *(u_int *)data = raidPtr->bytesPerSector; 1013 return 0; 1014 case DIOCGMEDIASIZE: 1015 *(off_t *)data = 1016 (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector; 1017 return 0; 1018 #endif 1019 case DIOCSDINFO: 1020 case DIOCWDINFO: 1021 #ifdef __HAVE_OLD_DISKLABEL 1022 case ODIOCWDINFO: 1023 case ODIOCSDINFO: 1024 #endif 1025 case DIOCWLABEL: 1026 case DIOCAWEDGE: 1027 case DIOCDWEDGE: 1028 if ((flag & FWRITE) == 0) 1029 return (EBADF); 1030 } 1031 1032 /* Must be initialized for these... */ 1033 switch (cmd) { 1034 case DIOCGDINFO: 1035 case DIOCSDINFO: 1036 case DIOCWDINFO: 1037 #ifdef __HAVE_OLD_DISKLABEL 1038 case ODIOCGDINFO: 1039 case ODIOCWDINFO: 1040 case ODIOCSDINFO: 1041 case ODIOCGDEFLABEL: 1042 #endif 1043 case DIOCGPART: 1044 case DIOCWLABEL: 1045 case DIOCGDEFLABEL: 1046 case DIOCAWEDGE: 1047 case DIOCDWEDGE: 1048 case DIOCLWEDGES: 1049 case DIOCCACHESYNC: 1050 case RAIDFRAME_SHUTDOWN: 1051 case RAIDFRAME_REWRITEPARITY: 1052 case RAIDFRAME_GET_INFO: 1053 case RAIDFRAME_RESET_ACCTOTALS: 1054 case RAIDFRAME_GET_ACCTOTALS: 1055 case RAIDFRAME_KEEP_ACCTOTALS: 1056 case RAIDFRAME_GET_SIZE: 1057 case RAIDFRAME_FAIL_DISK: 1058 case RAIDFRAME_COPYBACK: 1059 case RAIDFRAME_CHECK_RECON_STATUS: 1060 case RAIDFRAME_CHECK_RECON_STATUS_EXT: 1061 case RAIDFRAME_GET_COMPONENT_LABEL: 1062 case RAIDFRAME_SET_COMPONENT_LABEL: 1063 case RAIDFRAME_ADD_HOT_SPARE: 1064 case RAIDFRAME_REMOVE_HOT_SPARE: 1065 case RAIDFRAME_INIT_LABELS: 1066 case RAIDFRAME_REBUILD_IN_PLACE: 1067 case RAIDFRAME_CHECK_PARITY: 1068 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS: 1069 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT: 1070 case RAIDFRAME_CHECK_COPYBACK_STATUS: 1071 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT: 1072 case RAIDFRAME_SET_AUTOCONFIG: 1073 case RAIDFRAME_SET_ROOT: 1074 case RAIDFRAME_DELETE_COMPONENT: 1075 case RAIDFRAME_INCORPORATE_HOT_SPARE: 1076 case RAIDFRAME_PARITYMAP_STATUS: 1077 case 
RAIDFRAME_PARITYMAP_GET_DISABLE: 1078 case RAIDFRAME_PARITYMAP_SET_DISABLE: 1079 case RAIDFRAME_PARITYMAP_SET_PARAMS: 1080 if ((rs->sc_flags & RAIDF_INITED) == 0) 1081 return (ENXIO); 1082 } 1083 1084 switch (cmd) { 1085 #ifdef COMPAT_50 1086 case RAIDFRAME_GET_INFO50: 1087 return rf_get_info50(raidPtr, data); 1088 1089 case RAIDFRAME_CONFIGURE50: 1090 if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0) 1091 return retcode; 1092 goto config; 1093 #endif 1094 /* configure the system */ 1095 case RAIDFRAME_CONFIGURE: 1096 1097 if (raidPtr->valid) { 1098 /* There is a valid RAID set running on this unit! */ 1099 printf("raid%d: Device already configured!\n",unit); 1100 return(EINVAL); 1101 } 1102 1103 /* copy-in the configuration information */ 1104 /* data points to a pointer to the configuration structure */ 1105 1106 u_cfg = *((RF_Config_t **) data); 1107 RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *)); 1108 if (k_cfg == NULL) { 1109 return (ENOMEM); 1110 } 1111 retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t)); 1112 if (retcode) { 1113 RF_Free(k_cfg, sizeof(RF_Config_t)); 1114 db1_printf(("rf_ioctl: retcode=%d copyin.1\n", 1115 retcode)); 1116 return (retcode); 1117 } 1118 goto config; 1119 config: 1120 /* allocate a buffer for the layout-specific data, and copy it 1121 * in */ 1122 if (k_cfg->layoutSpecificSize) { 1123 if (k_cfg->layoutSpecificSize > 10000) { 1124 /* sanity check */ 1125 RF_Free(k_cfg, sizeof(RF_Config_t)); 1126 return (EINVAL); 1127 } 1128 RF_Malloc(specific_buf, k_cfg->layoutSpecificSize, 1129 (u_char *)); 1130 if (specific_buf == NULL) { 1131 RF_Free(k_cfg, sizeof(RF_Config_t)); 1132 return (ENOMEM); 1133 } 1134 retcode = copyin(k_cfg->layoutSpecific, specific_buf, 1135 k_cfg->layoutSpecificSize); 1136 if (retcode) { 1137 RF_Free(k_cfg, sizeof(RF_Config_t)); 1138 RF_Free(specific_buf, 1139 k_cfg->layoutSpecificSize); 1140 db1_printf(("rf_ioctl: retcode=%d copyin.2\n", 1141 retcode)); 1142 return (retcode); 1143 } 1144 } 
else 1145 specific_buf = NULL; 1146 k_cfg->layoutSpecific = specific_buf; 1147 1148 /* should do some kind of sanity check on the configuration. 1149 * Store the sum of all the bytes in the last byte? */ 1150 1151 /* configure the system */ 1152 1153 /* 1154 * Clear the entire RAID descriptor, just to make sure 1155 * there is no stale data left in the case of a 1156 * reconfiguration 1157 */ 1158 memset(raidPtr, 0, sizeof(*raidPtr)); 1159 raidPtr->raidid = unit; 1160 1161 retcode = rf_Configure(raidPtr, k_cfg, NULL); 1162 1163 if (retcode == 0) { 1164 1165 /* allow this many simultaneous IO's to 1166 this RAID device */ 1167 raidPtr->openings = RAIDOUTSTANDING; 1168 1169 raidinit(raidPtr); 1170 rf_markalldirty(raidPtr); 1171 } 1172 /* free the buffers. No return code here. */ 1173 if (k_cfg->layoutSpecificSize) { 1174 RF_Free(specific_buf, k_cfg->layoutSpecificSize); 1175 } 1176 RF_Free(k_cfg, sizeof(RF_Config_t)); 1177 1178 return (retcode); 1179 1180 /* shutdown the system */ 1181 case RAIDFRAME_SHUTDOWN: 1182 1183 part = DISKPART(dev); 1184 pmask = (1 << part); 1185 1186 if ((error = raidlock(rs)) != 0) 1187 return (error); 1188 1189 if ((rs->sc_dkdev.dk_openmask & ~pmask) || 1190 ((rs->sc_dkdev.dk_bopenmask & pmask) && 1191 (rs->sc_dkdev.dk_copenmask & pmask))) 1192 retcode = EBUSY; 1193 else { 1194 rs->sc_flags |= RAIDF_SHUTDOWN; 1195 rs->sc_dkdev.dk_copenmask &= ~pmask; 1196 rs->sc_dkdev.dk_bopenmask &= ~pmask; 1197 rs->sc_dkdev.dk_openmask &= ~pmask; 1198 retcode = 0; 1199 } 1200 1201 raidunlock(rs); 1202 1203 if (retcode != 0) 1204 return retcode; 1205 1206 /* free the pseudo device attach bits */ 1207 1208 cf = device_cfdata(rs->sc_dev); 1209 if ((retcode = config_detach(rs->sc_dev, DETACH_QUIET)) == 0) 1210 free(cf, M_RAIDFRAME); 1211 1212 return (retcode); 1213 case RAIDFRAME_GET_COMPONENT_LABEL: 1214 clabel_ptr = (RF_ComponentLabel_t **) data; 1215 /* need to read the component label for the disk indicated 1216 by row,column in clabel */ 1217 1218 /* 
1219 * Perhaps there should be an option to skip the in-core 1220 * copy and hit the disk, as with disklabel(8). 1221 */ 1222 RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *)); 1223 1224 retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel)); 1225 1226 if (retcode) { 1227 RF_Free(clabel, sizeof(*clabel)); 1228 return retcode; 1229 } 1230 1231 clabel->row = 0; /* Don't allow looking at anything else.*/ 1232 1233 column = clabel->column; 1234 1235 if ((column < 0) || (column >= raidPtr->numCol + 1236 raidPtr->numSpare)) { 1237 RF_Free(clabel, sizeof(*clabel)); 1238 return EINVAL; 1239 } 1240 1241 RF_Free(clabel, sizeof(*clabel)); 1242 1243 clabel = raidget_component_label(raidPtr, column); 1244 1245 return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr)); 1246 1247 #if 0 1248 case RAIDFRAME_SET_COMPONENT_LABEL: 1249 clabel = (RF_ComponentLabel_t *) data; 1250 1251 /* XXX check the label for valid stuff... */ 1252 /* Note that some things *should not* get modified -- 1253 the user should be re-initing the labels instead of 1254 trying to patch things. 1255 */ 1256 1257 raidid = raidPtr->raidid; 1258 #ifdef DEBUG 1259 printf("raid%d: Got component label:\n", raidid); 1260 printf("raid%d: Version: %d\n", raidid, clabel->version); 1261 printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number); 1262 printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter); 1263 printf("raid%d: Column: %d\n", raidid, clabel->column); 1264 printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns); 1265 printf("raid%d: Clean: %d\n", raidid, clabel->clean); 1266 printf("raid%d: Status: %d\n", raidid, clabel->status); 1267 #endif 1268 clabel->row = 0; 1269 column = clabel->column; 1270 1271 if ((column < 0) || (column >= raidPtr->numCol)) { 1272 return(EINVAL); 1273 } 1274 1275 /* XXX this isn't allowed to do anything for now :-) */ 1276 1277 /* XXX and before it is, we need to fill in the rest 1278 of the fields!?!?!?! 
*/ 1279 memcpy(raidget_component_label(raidPtr, column), 1280 clabel, sizeof(*clabel)); 1281 raidflush_component_label(raidPtr, column); 1282 return (0); 1283 #endif 1284 1285 case RAIDFRAME_INIT_LABELS: 1286 clabel = (RF_ComponentLabel_t *) data; 1287 /* 1288 we only want the serial number from 1289 the above. We get all the rest of the information 1290 from the config that was used to create this RAID 1291 set. 1292 */ 1293 1294 raidPtr->serial_number = clabel->serial_number; 1295 1296 for(column=0;column<raidPtr->numCol;column++) { 1297 diskPtr = &raidPtr->Disks[column]; 1298 if (!RF_DEAD_DISK(diskPtr->status)) { 1299 ci_label = raidget_component_label(raidPtr, 1300 column); 1301 /* Zeroing this is important. */ 1302 memset(ci_label, 0, sizeof(*ci_label)); 1303 raid_init_component_label(raidPtr, ci_label); 1304 ci_label->serial_number = 1305 raidPtr->serial_number; 1306 ci_label->row = 0; /* we dont' pretend to support more */ 1307 rf_component_label_set_partitionsize(ci_label, 1308 diskPtr->partitionSize); 1309 ci_label->column = column; 1310 raidflush_component_label(raidPtr, column); 1311 } 1312 /* XXXjld what about the spares? */ 1313 } 1314 1315 return (retcode); 1316 case RAIDFRAME_SET_AUTOCONFIG: 1317 d = rf_set_autoconfig(raidPtr, *(int *) data); 1318 printf("raid%d: New autoconfig value is: %d\n", 1319 raidPtr->raidid, d); 1320 *(int *) data = d; 1321 return (retcode); 1322 1323 case RAIDFRAME_SET_ROOT: 1324 d = rf_set_rootpartition(raidPtr, *(int *) data); 1325 printf("raid%d: New rootpartition value is: %d\n", 1326 raidPtr->raidid, d); 1327 *(int *) data = d; 1328 return (retcode); 1329 1330 /* initialize all parity */ 1331 case RAIDFRAME_REWRITEPARITY: 1332 1333 if (raidPtr->Layout.map->faultsTolerated == 0) { 1334 /* Parity for RAID 0 is trivially correct */ 1335 raidPtr->parity_good = RF_RAID_CLEAN; 1336 return(0); 1337 } 1338 1339 if (raidPtr->parity_rewrite_in_progress == 1) { 1340 /* Re-write is already in progress! 
*/ 1341 return(EINVAL); 1342 } 1343 1344 retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread, 1345 rf_RewriteParityThread, 1346 raidPtr,"raid_parity"); 1347 return (retcode); 1348 1349 1350 case RAIDFRAME_ADD_HOT_SPARE: 1351 sparePtr = (RF_SingleComponent_t *) data; 1352 memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t)); 1353 retcode = rf_add_hot_spare(raidPtr, &component); 1354 return(retcode); 1355 1356 case RAIDFRAME_REMOVE_HOT_SPARE: 1357 return(retcode); 1358 1359 case RAIDFRAME_DELETE_COMPONENT: 1360 componentPtr = (RF_SingleComponent_t *)data; 1361 memcpy( &component, componentPtr, 1362 sizeof(RF_SingleComponent_t)); 1363 retcode = rf_delete_component(raidPtr, &component); 1364 return(retcode); 1365 1366 case RAIDFRAME_INCORPORATE_HOT_SPARE: 1367 componentPtr = (RF_SingleComponent_t *)data; 1368 memcpy( &component, componentPtr, 1369 sizeof(RF_SingleComponent_t)); 1370 retcode = rf_incorporate_hot_spare(raidPtr, &component); 1371 return(retcode); 1372 1373 case RAIDFRAME_REBUILD_IN_PLACE: 1374 1375 if (raidPtr->Layout.map->faultsTolerated == 0) { 1376 /* Can't do this on a RAID 0!! */ 1377 return(EINVAL); 1378 } 1379 1380 if (raidPtr->recon_in_progress == 1) { 1381 /* a reconstruct is already in progress! */ 1382 return(EINVAL); 1383 } 1384 1385 componentPtr = (RF_SingleComponent_t *) data; 1386 memcpy( &component, componentPtr, 1387 sizeof(RF_SingleComponent_t)); 1388 component.row = 0; /* we don't support any more */ 1389 column = component.column; 1390 1391 if ((column < 0) || (column >= raidPtr->numCol)) { 1392 return(EINVAL); 1393 } 1394 1395 rf_lock_mutex2(raidPtr->mutex); 1396 if ((raidPtr->Disks[column].status == rf_ds_optimal) && 1397 (raidPtr->numFailures > 0)) { 1398 /* XXX 0 above shouldn't be constant!!! */ 1399 /* some component other than this has failed. 1400 Let's not make things worse than they already 1401 are... 
*/ 1402 printf("raid%d: Unable to reconstruct to disk at:\n", 1403 raidPtr->raidid); 1404 printf("raid%d: Col: %d Too many failures.\n", 1405 raidPtr->raidid, column); 1406 rf_unlock_mutex2(raidPtr->mutex); 1407 return (EINVAL); 1408 } 1409 if (raidPtr->Disks[column].status == 1410 rf_ds_reconstructing) { 1411 printf("raid%d: Unable to reconstruct to disk at:\n", 1412 raidPtr->raidid); 1413 printf("raid%d: Col: %d Reconstruction already occuring!\n", raidPtr->raidid, column); 1414 1415 rf_unlock_mutex2(raidPtr->mutex); 1416 return (EINVAL); 1417 } 1418 if (raidPtr->Disks[column].status == rf_ds_spared) { 1419 rf_unlock_mutex2(raidPtr->mutex); 1420 return (EINVAL); 1421 } 1422 rf_unlock_mutex2(raidPtr->mutex); 1423 1424 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *)); 1425 if (rrcopy == NULL) 1426 return(ENOMEM); 1427 1428 rrcopy->raidPtr = (void *) raidPtr; 1429 rrcopy->col = column; 1430 1431 retcode = RF_CREATE_THREAD(raidPtr->recon_thread, 1432 rf_ReconstructInPlaceThread, 1433 rrcopy,"raid_reconip"); 1434 return(retcode); 1435 1436 case RAIDFRAME_GET_INFO: 1437 if (!raidPtr->valid) 1438 return (ENODEV); 1439 ucfgp = (RF_DeviceConfig_t **) data; 1440 RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t), 1441 (RF_DeviceConfig_t *)); 1442 if (d_cfg == NULL) 1443 return (ENOMEM); 1444 d_cfg->rows = 1; /* there is only 1 row now */ 1445 d_cfg->cols = raidPtr->numCol; 1446 d_cfg->ndevs = raidPtr->numCol; 1447 if (d_cfg->ndevs >= RF_MAX_DISKS) { 1448 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t)); 1449 return (ENOMEM); 1450 } 1451 d_cfg->nspares = raidPtr->numSpare; 1452 if (d_cfg->nspares >= RF_MAX_DISKS) { 1453 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t)); 1454 return (ENOMEM); 1455 } 1456 d_cfg->maxqdepth = raidPtr->maxQueueDepth; 1457 d = 0; 1458 for (j = 0; j < d_cfg->cols; j++) { 1459 d_cfg->devs[d] = raidPtr->Disks[j]; 1460 d++; 1461 } 1462 for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) { 1463 d_cfg->spares[i] = raidPtr->Disks[j]; 1464 } 1465 retcode = 
copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t)); 1466 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t)); 1467 1468 return (retcode); 1469 1470 case RAIDFRAME_CHECK_PARITY: 1471 *(int *) data = raidPtr->parity_good; 1472 return (0); 1473 1474 case RAIDFRAME_PARITYMAP_STATUS: 1475 if (rf_paritymap_ineligible(raidPtr)) 1476 return EINVAL; 1477 rf_paritymap_status(raidPtr->parity_map, 1478 (struct rf_pmstat *)data); 1479 return 0; 1480 1481 case RAIDFRAME_PARITYMAP_SET_PARAMS: 1482 if (rf_paritymap_ineligible(raidPtr)) 1483 return EINVAL; 1484 if (raidPtr->parity_map == NULL) 1485 return ENOENT; /* ??? */ 1486 if (0 != rf_paritymap_set_params(raidPtr->parity_map, 1487 (struct rf_pmparams *)data, 1)) 1488 return EINVAL; 1489 return 0; 1490 1491 case RAIDFRAME_PARITYMAP_GET_DISABLE: 1492 if (rf_paritymap_ineligible(raidPtr)) 1493 return EINVAL; 1494 *(int *) data = rf_paritymap_get_disable(raidPtr); 1495 return 0; 1496 1497 case RAIDFRAME_PARITYMAP_SET_DISABLE: 1498 if (rf_paritymap_ineligible(raidPtr)) 1499 return EINVAL; 1500 rf_paritymap_set_disable(raidPtr, *(int *)data); 1501 /* XXX should errors be passed up? */ 1502 return 0; 1503 1504 case RAIDFRAME_RESET_ACCTOTALS: 1505 memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals)); 1506 return (0); 1507 1508 case RAIDFRAME_GET_ACCTOTALS: 1509 totals = (RF_AccTotals_t *) data; 1510 *totals = raidPtr->acc_totals; 1511 return (0); 1512 1513 case RAIDFRAME_KEEP_ACCTOTALS: 1514 raidPtr->keep_acc_totals = *(int *)data; 1515 return (0); 1516 1517 case RAIDFRAME_GET_SIZE: 1518 *(int *) data = raidPtr->totalSectors; 1519 return (0); 1520 1521 /* fail a disk & optionally start reconstruction */ 1522 case RAIDFRAME_FAIL_DISK: 1523 1524 if (raidPtr->Layout.map->faultsTolerated == 0) { 1525 /* Can't do this on a RAID 0!! 
*/ 1526 return(EINVAL); 1527 } 1528 1529 rr = (struct rf_recon_req *) data; 1530 rr->row = 0; 1531 if (rr->col < 0 || rr->col >= raidPtr->numCol) 1532 return (EINVAL); 1533 1534 1535 rf_lock_mutex2(raidPtr->mutex); 1536 if (raidPtr->status == rf_rs_reconstructing) { 1537 /* you can't fail a disk while we're reconstructing! */ 1538 /* XXX wrong for RAID6 */ 1539 rf_unlock_mutex2(raidPtr->mutex); 1540 return (EINVAL); 1541 } 1542 if ((raidPtr->Disks[rr->col].status == 1543 rf_ds_optimal) && (raidPtr->numFailures > 0)) { 1544 /* some other component has failed. Let's not make 1545 things worse. XXX wrong for RAID6 */ 1546 rf_unlock_mutex2(raidPtr->mutex); 1547 return (EINVAL); 1548 } 1549 if (raidPtr->Disks[rr->col].status == rf_ds_spared) { 1550 /* Can't fail a spared disk! */ 1551 rf_unlock_mutex2(raidPtr->mutex); 1552 return (EINVAL); 1553 } 1554 rf_unlock_mutex2(raidPtr->mutex); 1555 1556 /* make a copy of the recon request so that we don't rely on 1557 * the user's buffer */ 1558 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *)); 1559 if (rrcopy == NULL) 1560 return(ENOMEM); 1561 memcpy(rrcopy, rr, sizeof(*rr)); 1562 rrcopy->raidPtr = (void *) raidPtr; 1563 1564 retcode = RF_CREATE_THREAD(raidPtr->recon_thread, 1565 rf_ReconThread, 1566 rrcopy,"raid_recon"); 1567 return (0); 1568 1569 /* invoke a copyback operation after recon on whatever disk 1570 * needs it, if any */ 1571 case RAIDFRAME_COPYBACK: 1572 1573 if (raidPtr->Layout.map->faultsTolerated == 0) { 1574 /* This makes no sense on a RAID 0!! */ 1575 return(EINVAL); 1576 } 1577 1578 if (raidPtr->copyback_in_progress == 1) { 1579 /* Copyback is already in progress! 
*/ 1580 return(EINVAL); 1581 } 1582 1583 retcode = RF_CREATE_THREAD(raidPtr->copyback_thread, 1584 rf_CopybackThread, 1585 raidPtr,"raid_copyback"); 1586 return (retcode); 1587 1588 /* return the percentage completion of reconstruction */ 1589 case RAIDFRAME_CHECK_RECON_STATUS: 1590 if (raidPtr->Layout.map->faultsTolerated == 0) { 1591 /* This makes no sense on a RAID 0, so tell the 1592 user it's done. */ 1593 *(int *) data = 100; 1594 return(0); 1595 } 1596 if (raidPtr->status != rf_rs_reconstructing) 1597 *(int *) data = 100; 1598 else { 1599 if (raidPtr->reconControl->numRUsTotal > 0) { 1600 *(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal); 1601 } else { 1602 *(int *) data = 0; 1603 } 1604 } 1605 return (0); 1606 case RAIDFRAME_CHECK_RECON_STATUS_EXT: 1607 progressInfoPtr = (RF_ProgressInfo_t **) data; 1608 if (raidPtr->status != rf_rs_reconstructing) { 1609 progressInfo.remaining = 0; 1610 progressInfo.completed = 100; 1611 progressInfo.total = 100; 1612 } else { 1613 progressInfo.total = 1614 raidPtr->reconControl->numRUsTotal; 1615 progressInfo.completed = 1616 raidPtr->reconControl->numRUsComplete; 1617 progressInfo.remaining = progressInfo.total - 1618 progressInfo.completed; 1619 } 1620 retcode = copyout(&progressInfo, *progressInfoPtr, 1621 sizeof(RF_ProgressInfo_t)); 1622 return (retcode); 1623 1624 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS: 1625 if (raidPtr->Layout.map->faultsTolerated == 0) { 1626 /* This makes no sense on a RAID 0, so tell the 1627 user it's done. 
*/ 1628 *(int *) data = 100; 1629 return(0); 1630 } 1631 if (raidPtr->parity_rewrite_in_progress == 1) { 1632 *(int *) data = 100 * 1633 raidPtr->parity_rewrite_stripes_done / 1634 raidPtr->Layout.numStripe; 1635 } else { 1636 *(int *) data = 100; 1637 } 1638 return (0); 1639 1640 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT: 1641 progressInfoPtr = (RF_ProgressInfo_t **) data; 1642 if (raidPtr->parity_rewrite_in_progress == 1) { 1643 progressInfo.total = raidPtr->Layout.numStripe; 1644 progressInfo.completed = 1645 raidPtr->parity_rewrite_stripes_done; 1646 progressInfo.remaining = progressInfo.total - 1647 progressInfo.completed; 1648 } else { 1649 progressInfo.remaining = 0; 1650 progressInfo.completed = 100; 1651 progressInfo.total = 100; 1652 } 1653 retcode = copyout(&progressInfo, *progressInfoPtr, 1654 sizeof(RF_ProgressInfo_t)); 1655 return (retcode); 1656 1657 case RAIDFRAME_CHECK_COPYBACK_STATUS: 1658 if (raidPtr->Layout.map->faultsTolerated == 0) { 1659 /* This makes no sense on a RAID 0 */ 1660 *(int *) data = 100; 1661 return(0); 1662 } 1663 if (raidPtr->copyback_in_progress == 1) { 1664 *(int *) data = 100 * raidPtr->copyback_stripes_done / 1665 raidPtr->Layout.numStripe; 1666 } else { 1667 *(int *) data = 100; 1668 } 1669 return (0); 1670 1671 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT: 1672 progressInfoPtr = (RF_ProgressInfo_t **) data; 1673 if (raidPtr->copyback_in_progress == 1) { 1674 progressInfo.total = raidPtr->Layout.numStripe; 1675 progressInfo.completed = 1676 raidPtr->copyback_stripes_done; 1677 progressInfo.remaining = progressInfo.total - 1678 progressInfo.completed; 1679 } else { 1680 progressInfo.remaining = 0; 1681 progressInfo.completed = 100; 1682 progressInfo.total = 100; 1683 } 1684 retcode = copyout(&progressInfo, *progressInfoPtr, 1685 sizeof(RF_ProgressInfo_t)); 1686 return (retcode); 1687 1688 /* the sparetable daemon calls this to wait for the kernel to 1689 * need a spare table. 
this ioctl does not return until a 1690 * spare table is needed. XXX -- calling mpsleep here in the 1691 * ioctl code is almost certainly wrong and evil. -- XXX XXX 1692 * -- I should either compute the spare table in the kernel, 1693 * or have a different -- XXX XXX -- interface (a different 1694 * character device) for delivering the table -- XXX */ 1695 #if 0 1696 case RAIDFRAME_SPARET_WAIT: 1697 rf_lock_mutex2(rf_sparet_wait_mutex); 1698 while (!rf_sparet_wait_queue) 1699 rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex); 1700 waitreq = rf_sparet_wait_queue; 1701 rf_sparet_wait_queue = rf_sparet_wait_queue->next; 1702 rf_unlock_mutex2(rf_sparet_wait_mutex); 1703 1704 /* structure assignment */ 1705 *((RF_SparetWait_t *) data) = *waitreq; 1706 1707 RF_Free(waitreq, sizeof(*waitreq)); 1708 return (0); 1709 1710 /* wakes up a process waiting on SPARET_WAIT and puts an error 1711 * code in it that will cause the dameon to exit */ 1712 case RAIDFRAME_ABORT_SPARET_WAIT: 1713 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *)); 1714 waitreq->fcol = -1; 1715 rf_lock_mutex2(rf_sparet_wait_mutex); 1716 waitreq->next = rf_sparet_wait_queue; 1717 rf_sparet_wait_queue = waitreq; 1718 rf_broadcast_conf2(rf_sparet_wait_cv); 1719 rf_unlock_mutex2(rf_sparet_wait_mutex); 1720 return (0); 1721 1722 /* used by the spare table daemon to deliver a spare table 1723 * into the kernel */ 1724 case RAIDFRAME_SEND_SPARET: 1725 1726 /* install the spare table */ 1727 retcode = rf_SetSpareTable(raidPtr, *(void **) data); 1728 1729 /* respond to the requestor. 
the return status of the spare 1730 * table installation is passed in the "fcol" field */ 1731 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *)); 1732 waitreq->fcol = retcode; 1733 rf_lock_mutex2(rf_sparet_wait_mutex); 1734 waitreq->next = rf_sparet_resp_queue; 1735 rf_sparet_resp_queue = waitreq; 1736 rf_broadcast_cond2(rf_sparet_resp_cv); 1737 rf_unlock_mutex2(rf_sparet_wait_mutex); 1738 1739 return (retcode); 1740 #endif 1741 1742 default: 1743 break; /* fall through to the os-specific code below */ 1744 1745 } 1746 1747 if (!raidPtr->valid) 1748 return (EINVAL); 1749 1750 /* 1751 * Add support for "regular" device ioctls here. 1752 */ 1753 1754 error = disk_ioctl(&rs->sc_dkdev, cmd, data, flag, l); 1755 if (error != EPASSTHROUGH) 1756 return (error); 1757 1758 switch (cmd) { 1759 case DIOCGDINFO: 1760 *(struct disklabel *) data = *(rs->sc_dkdev.dk_label); 1761 break; 1762 #ifdef __HAVE_OLD_DISKLABEL 1763 case ODIOCGDINFO: 1764 newlabel = *(rs->sc_dkdev.dk_label); 1765 if (newlabel.d_npartitions > OLDMAXPARTITIONS) 1766 return ENOTTY; 1767 memcpy(data, &newlabel, sizeof (struct olddisklabel)); 1768 break; 1769 #endif 1770 1771 case DIOCGPART: 1772 ((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label; 1773 ((struct partinfo *) data)->part = 1774 &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)]; 1775 break; 1776 1777 case DIOCWDINFO: 1778 case DIOCSDINFO: 1779 #ifdef __HAVE_OLD_DISKLABEL 1780 case ODIOCWDINFO: 1781 case ODIOCSDINFO: 1782 #endif 1783 { 1784 struct disklabel *lp; 1785 #ifdef __HAVE_OLD_DISKLABEL 1786 if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) { 1787 memset(&newlabel, 0, sizeof newlabel); 1788 memcpy(&newlabel, data, sizeof (struct olddisklabel)); 1789 lp = &newlabel; 1790 } else 1791 #endif 1792 lp = (struct disklabel *)data; 1793 1794 if ((error = raidlock(rs)) != 0) 1795 return (error); 1796 1797 rs->sc_flags |= RAIDF_LABELLING; 1798 1799 error = setdisklabel(rs->sc_dkdev.dk_label, 1800 lp, 0, rs->sc_dkdev.dk_cpulabel); 1801 
if (error == 0) { 1802 if (cmd == DIOCWDINFO 1803 #ifdef __HAVE_OLD_DISKLABEL 1804 || cmd == ODIOCWDINFO 1805 #endif 1806 ) 1807 error = writedisklabel(RAIDLABELDEV(dev), 1808 raidstrategy, rs->sc_dkdev.dk_label, 1809 rs->sc_dkdev.dk_cpulabel); 1810 } 1811 rs->sc_flags &= ~RAIDF_LABELLING; 1812 1813 raidunlock(rs); 1814 1815 if (error) 1816 return (error); 1817 break; 1818 } 1819 1820 case DIOCWLABEL: 1821 if (*(int *) data != 0) 1822 rs->sc_flags |= RAIDF_WLABEL; 1823 else 1824 rs->sc_flags &= ~RAIDF_WLABEL; 1825 break; 1826 1827 case DIOCGDEFLABEL: 1828 raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data); 1829 break; 1830 1831 #ifdef __HAVE_OLD_DISKLABEL 1832 case ODIOCGDEFLABEL: 1833 raidgetdefaultlabel(raidPtr, rs, &newlabel); 1834 if (newlabel.d_npartitions > OLDMAXPARTITIONS) 1835 return ENOTTY; 1836 memcpy(data, &newlabel, sizeof (struct olddisklabel)); 1837 break; 1838 #endif 1839 1840 case DIOCAWEDGE: 1841 case DIOCDWEDGE: 1842 dkw = (void *)data; 1843 1844 /* If the ioctl happens here, the parent is us. */ 1845 (void)strcpy(dkw->dkw_parent, rs->sc_xname); 1846 return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw); 1847 1848 case DIOCLWEDGES: 1849 return dkwedge_list(&rs->sc_dkdev, 1850 (struct dkwedge_list *)data, l); 1851 case DIOCCACHESYNC: 1852 return rf_sync_component_caches(raidPtr); 1853 default: 1854 retcode = ENOTTY; 1855 } 1856 return (retcode); 1857 1858 } 1859 1860 1861 /* raidinit -- complete the rest of the initialization for the 1862 RAIDframe device. */ 1863 1864 1865 static void 1866 raidinit(RF_Raid_t *raidPtr) 1867 { 1868 cfdata_t cf; 1869 struct raid_softc *rs; 1870 int unit; 1871 1872 unit = raidPtr->raidid; 1873 1874 rs = &raid_softc[unit]; 1875 1876 /* XXX should check return code first... */ 1877 rs->sc_flags |= RAIDF_INITED; 1878 1879 /* XXX doesn't check bounds. 
*/ 1880 snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit); 1881 1882 /* attach the pseudo device */ 1883 cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK); 1884 cf->cf_name = raid_cd.cd_name; 1885 cf->cf_atname = raid_cd.cd_name; 1886 cf->cf_unit = unit; 1887 cf->cf_fstate = FSTATE_STAR; 1888 1889 rs->sc_dev = config_attach_pseudo(cf); 1890 1891 if (rs->sc_dev == NULL) { 1892 printf("raid%d: config_attach_pseudo failed\n", 1893 raidPtr->raidid); 1894 rs->sc_flags &= ~RAIDF_INITED; 1895 free(cf, M_RAIDFRAME); 1896 return; 1897 } 1898 1899 /* disk_attach actually creates space for the CPU disklabel, among 1900 * other things, so it's critical to call this *BEFORE* we try putzing 1901 * with disklabels. */ 1902 1903 disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver); 1904 disk_attach(&rs->sc_dkdev); 1905 disk_blocksize(&rs->sc_dkdev, raidPtr->bytesPerSector); 1906 1907 /* XXX There may be a weird interaction here between this, and 1908 * protectedSectors, as used in RAIDframe. */ 1909 1910 rs->sc_size = raidPtr->totalSectors; 1911 1912 dkwedge_discover(&rs->sc_dkdev); 1913 1914 rf_set_properties(rs, raidPtr); 1915 1916 } 1917 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0) 1918 /* wake up the daemon & tell it to get us a spare table 1919 * XXX 1920 * the entries in the queues should be tagged with the raidPtr 1921 * so that in the extremely rare case that two recons happen at once, 1922 * we know for which device were requesting a spare table 1923 * XXX 1924 * 1925 * XXX This code is not currently used. 
GO 1926 */ 1927 int 1928 rf_GetSpareTableFromDaemon(RF_SparetWait_t *req) 1929 { 1930 int retcode; 1931 1932 rf_lock_mutex2(rf_sparet_wait_mutex); 1933 req->next = rf_sparet_wait_queue; 1934 rf_sparet_wait_queue = req; 1935 rf_broadcast_cond2(rf_sparet_wait_cv); 1936 1937 /* mpsleep unlocks the mutex */ 1938 while (!rf_sparet_resp_queue) { 1939 rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex); 1940 } 1941 req = rf_sparet_resp_queue; 1942 rf_sparet_resp_queue = req->next; 1943 rf_unlock_mutex2(rf_sparet_wait_mutex); 1944 1945 retcode = req->fcol; 1946 RF_Free(req, sizeof(*req)); /* this is not the same req as we 1947 * alloc'd */ 1948 return (retcode); 1949 } 1950 #endif 1951 1952 /* a wrapper around rf_DoAccess that extracts appropriate info from the 1953 * bp & passes it down. 1954 * any calls originating in the kernel must use non-blocking I/O 1955 * do some extra sanity checking to return "appropriate" error values for 1956 * certain conditions (to make some standard utilities work) 1957 * 1958 * Formerly known as: rf_DoAccessKernel 1959 */ 1960 void 1961 raidstart(RF_Raid_t *raidPtr) 1962 { 1963 RF_SectorCount_t num_blocks, pb, sum; 1964 RF_RaidAddr_t raid_addr; 1965 struct partition *pp; 1966 daddr_t blocknum; 1967 int unit; 1968 struct raid_softc *rs; 1969 int do_async; 1970 struct buf *bp; 1971 int rc; 1972 1973 unit = raidPtr->raidid; 1974 rs = &raid_softc[unit]; 1975 1976 /* quick check to see if anything has died recently */ 1977 rf_lock_mutex2(raidPtr->mutex); 1978 if (raidPtr->numNewFailures > 0) { 1979 rf_unlock_mutex2(raidPtr->mutex); 1980 rf_update_component_labels(raidPtr, 1981 RF_NORMAL_COMPONENT_UPDATE); 1982 rf_lock_mutex2(raidPtr->mutex); 1983 raidPtr->numNewFailures--; 1984 } 1985 1986 /* Check to see if we're at the limit... 
*/ 1987 while (raidPtr->openings > 0) { 1988 rf_unlock_mutex2(raidPtr->mutex); 1989 1990 /* get the next item, if any, from the queue */ 1991 if ((bp = bufq_get(rs->buf_queue)) == NULL) { 1992 /* nothing more to do */ 1993 return; 1994 } 1995 1996 /* Ok, for the bp we have here, bp->b_blkno is relative to the 1997 * partition.. Need to make it absolute to the underlying 1998 * device.. */ 1999 2000 blocknum = bp->b_blkno << DEV_BSHIFT >> raidPtr->logBytesPerSector; 2001 if (DISKPART(bp->b_dev) != RAW_PART) { 2002 pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)]; 2003 blocknum += pp->p_offset; 2004 } 2005 2006 db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno, 2007 (int) blocknum)); 2008 2009 db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount)); 2010 db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid)); 2011 2012 /* *THIS* is where we adjust what block we're going to... 2013 * but DO NOT TOUCH bp->b_blkno!!! */ 2014 raid_addr = blocknum; 2015 2016 num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector; 2017 pb = (bp->b_bcount & raidPtr->sectorMask) ? 
1 : 0; 2018 sum = raid_addr + num_blocks + pb; 2019 if (1 || rf_debugKernelAccess) { 2020 db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n", 2021 (int) raid_addr, (int) sum, (int) num_blocks, 2022 (int) pb, (int) bp->b_resid)); 2023 } 2024 if ((sum > raidPtr->totalSectors) || (sum < raid_addr) 2025 || (sum < num_blocks) || (sum < pb)) { 2026 bp->b_error = ENOSPC; 2027 bp->b_resid = bp->b_bcount; 2028 biodone(bp); 2029 rf_lock_mutex2(raidPtr->mutex); 2030 continue; 2031 } 2032 /* 2033 * XXX rf_DoAccess() should do this, not just DoAccessKernel() 2034 */ 2035 2036 if (bp->b_bcount & raidPtr->sectorMask) { 2037 bp->b_error = EINVAL; 2038 bp->b_resid = bp->b_bcount; 2039 biodone(bp); 2040 rf_lock_mutex2(raidPtr->mutex); 2041 continue; 2042 2043 } 2044 db1_printf(("Calling DoAccess..\n")); 2045 2046 2047 rf_lock_mutex2(raidPtr->mutex); 2048 raidPtr->openings--; 2049 rf_unlock_mutex2(raidPtr->mutex); 2050 2051 /* 2052 * Everything is async. 2053 */ 2054 do_async = 1; 2055 2056 disk_busy(&rs->sc_dkdev); 2057 2058 /* XXX we're still at splbio() here... do we *really* 2059 need to be? */ 2060 2061 /* don't ever condition on bp->b_flags & B_WRITE. 2062 * always condition on B_READ instead */ 2063 2064 rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ? 2065 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE, 2066 do_async, raid_addr, num_blocks, 2067 bp->b_data, bp, RF_DAG_NONBLOCKING_IO); 2068 2069 if (rc) { 2070 bp->b_error = rc; 2071 bp->b_resid = bp->b_bcount; 2072 biodone(bp); 2073 /* continue loop */ 2074 } 2075 2076 rf_lock_mutex2(raidPtr->mutex); 2077 } 2078 rf_unlock_mutex2(raidPtr->mutex); 2079 } 2080 2081 2082 2083 2084 /* invoke an I/O from kernel mode. Disk queue should be locked upon entry */ 2085 2086 int 2087 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req) 2088 { 2089 int op = (req->type == RF_IO_TYPE_READ) ? 
B_READ : B_WRITE; 2090 struct buf *bp; 2091 2092 req->queue = queue; 2093 bp = req->bp; 2094 2095 switch (req->type) { 2096 case RF_IO_TYPE_NOP: /* used primarily to unlock a locked queue */ 2097 /* XXX need to do something extra here.. */ 2098 /* I'm leaving this in, as I've never actually seen it used, 2099 * and I'd like folks to report it... GO */ 2100 printf(("WAKEUP CALLED\n")); 2101 queue->numOutstanding++; 2102 2103 bp->b_flags = 0; 2104 bp->b_private = req; 2105 2106 KernelWakeupFunc(bp); 2107 break; 2108 2109 case RF_IO_TYPE_READ: 2110 case RF_IO_TYPE_WRITE: 2111 #if RF_ACC_TRACE > 0 2112 if (req->tracerec) { 2113 RF_ETIMER_START(req->tracerec->timer); 2114 } 2115 #endif 2116 InitBP(bp, queue->rf_cinfo->ci_vp, 2117 op, queue->rf_cinfo->ci_dev, 2118 req->sectorOffset, req->numSector, 2119 req->buf, KernelWakeupFunc, (void *) req, 2120 queue->raidPtr->logBytesPerSector, req->b_proc); 2121 2122 if (rf_debugKernelAccess) { 2123 db1_printf(("dispatch: bp->b_blkno = %ld\n", 2124 (long) bp->b_blkno)); 2125 } 2126 queue->numOutstanding++; 2127 queue->last_deq_sector = req->sectorOffset; 2128 /* acc wouldn't have been let in if there were any pending 2129 * reqs at any other priority */ 2130 queue->curPriority = req->priority; 2131 2132 db1_printf(("Going for %c to unit %d col %d\n", 2133 req->type, queue->raidPtr->raidid, 2134 queue->col)); 2135 db1_printf(("sector %d count %d (%d bytes) %d\n", 2136 (int) req->sectorOffset, (int) req->numSector, 2137 (int) (req->numSector << 2138 queue->raidPtr->logBytesPerSector), 2139 (int) queue->raidPtr->logBytesPerSector)); 2140 2141 /* 2142 * XXX: drop lock here since this can block at 2143 * least with backing SCSI devices. Retake it 2144 * to minimize fuss with calling interfaces. 
2145 */ 2146 2147 RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam"); 2148 bdev_strategy(bp); 2149 RF_LOCK_QUEUE_MUTEX(queue, "unusedparam"); 2150 break; 2151 2152 default: 2153 panic("bad req->type in rf_DispatchKernelIO"); 2154 } 2155 db1_printf(("Exiting from DispatchKernelIO\n")); 2156 2157 return (0); 2158 } 2159 /* this is the callback function associated with a I/O invoked from 2160 kernel code. 2161 */ 2162 static void 2163 KernelWakeupFunc(struct buf *bp) 2164 { 2165 RF_DiskQueueData_t *req = NULL; 2166 RF_DiskQueue_t *queue; 2167 2168 db1_printf(("recovering the request queue:\n")); 2169 2170 req = bp->b_private; 2171 2172 queue = (RF_DiskQueue_t *) req->queue; 2173 2174 rf_lock_mutex2(queue->raidPtr->iodone_lock); 2175 2176 #if RF_ACC_TRACE > 0 2177 if (req->tracerec) { 2178 RF_ETIMER_STOP(req->tracerec->timer); 2179 RF_ETIMER_EVAL(req->tracerec->timer); 2180 rf_lock_mutex2(rf_tracing_mutex); 2181 req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer); 2182 req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer); 2183 req->tracerec->num_phys_ios++; 2184 rf_unlock_mutex2(rf_tracing_mutex); 2185 } 2186 #endif 2187 2188 /* XXX Ok, let's get aggressive... If b_error is set, let's go 2189 * ballistic, and mark the component as hosed... */ 2190 2191 if (bp->b_error != 0) { 2192 /* Mark the disk as dead */ 2193 /* but only mark it once... */ 2194 /* and only if it wouldn't leave this RAID set 2195 completely broken */ 2196 if (((queue->raidPtr->Disks[queue->col].status == 2197 rf_ds_optimal) || 2198 (queue->raidPtr->Disks[queue->col].status == 2199 rf_ds_used_spare)) && 2200 (queue->raidPtr->numFailures < 2201 queue->raidPtr->Layout.map->faultsTolerated)) { 2202 printf("raid%d: IO Error. 
Marking %s as failed.\n", 2203 queue->raidPtr->raidid, 2204 queue->raidPtr->Disks[queue->col].devname); 2205 queue->raidPtr->Disks[queue->col].status = 2206 rf_ds_failed; 2207 queue->raidPtr->status = rf_rs_degraded; 2208 queue->raidPtr->numFailures++; 2209 queue->raidPtr->numNewFailures++; 2210 } else { /* Disk is already dead... */ 2211 /* printf("Disk already marked as dead!\n"); */ 2212 } 2213 2214 } 2215 2216 /* Fill in the error value */ 2217 req->error = bp->b_error; 2218 2219 /* Drop this one on the "finished" queue... */ 2220 TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries); 2221 2222 /* Let the raidio thread know there is work to be done. */ 2223 rf_signal_cond2(queue->raidPtr->iodone_cv); 2224 2225 rf_unlock_mutex2(queue->raidPtr->iodone_lock); 2226 } 2227 2228 2229 /* 2230 * initialize a buf structure for doing an I/O in the kernel. 2231 */ 2232 static void 2233 InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev, 2234 RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf, 2235 void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector, 2236 struct proc *b_proc) 2237 { 2238 /* bp->b_flags = B_PHYS | rw_flag; */ 2239 bp->b_flags = rw_flag; /* XXX need B_PHYS here too??? */ 2240 bp->b_oflags = 0; 2241 bp->b_cflags = 0; 2242 bp->b_bcount = numSect << logBytesPerSector; 2243 bp->b_bufsize = bp->b_bcount; 2244 bp->b_error = 0; 2245 bp->b_dev = dev; 2246 bp->b_data = bf; 2247 bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT; 2248 bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */ 2249 if (bp->b_bcount == 0) { 2250 panic("bp->b_bcount is zero in InitBP!!"); 2251 } 2252 bp->b_proc = b_proc; 2253 bp->b_iodone = cbFunc; 2254 bp->b_private = cbArg; 2255 } 2256 2257 static void 2258 raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs, 2259 struct disklabel *lp) 2260 { 2261 memset(lp, 0, sizeof(*lp)); 2262 2263 /* fabricate a label... 
 */
	lp->d_secperunit = raidPtr->totalSectors;
	lp->d_secsize = raidPtr->bytesPerSector;
	lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	lp->d_ntracks = 4 * raidPtr->numCol;
	lp->d_ncylinders = raidPtr->totalSectors /
	    (lp->d_nsectors * lp->d_ntracks);
	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;

	strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
	lp->d_type = DTYPE_RAID;
	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
	lp->d_rpm = 3600;
	lp->d_interleave = 1;
	lp->d_flags = 0;

	lp->d_partitions[RAW_PART].p_offset = 0;
	lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
	lp->d_npartitions = RAW_PART + 1;

	lp->d_magic = DISKMAGIC;
	lp->d_magic2 = DISKMAGIC;
	/* NOTE(review): checksums the softc's label rather than `lp' --
	   looks intentional only when lp == rs->sc_dkdev.dk_label; confirm
	   for other callers. */
	lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);

}
/*
 * Read the disklabel from the raid device.  If one is not present, fake one
 * up.
 */
static void
raidgetdisklabel(dev_t dev)
{
	int unit = raidunit(dev);
	struct raid_softc *rs = &raid_softc[unit];
	const char *errstring;
	struct disklabel *lp = rs->sc_dkdev.dk_label;
	struct cpu_disklabel *clp = rs->sc_dkdev.dk_cpulabel;
	RF_Raid_t *raidPtr;

	db1_printf(("Getting the disklabel...\n"));

	memset(clp, 0, sizeof(*clp));

	raidPtr = raidPtrs[unit];

	/* start from a fabricated label; readdisklabel() may replace it */
	raidgetdefaultlabel(raidPtr, rs, lp);

	/*
	 * Call the generic disklabel extraction routine.
	 */
	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
	if (errstring)
		raidmakedisklabel(rs);
	else {
		int i;
		struct partition *pp;

		/*
		 * Sanity check whether the found disklabel is valid.
		 *
		 * This is necessary since the total size of the raid device
		 * may vary when an interleave is changed even though exactly
		 * the same components are used, and an old disklabel may be
		 * used if that is found.
		 */
		if (lp->d_secperunit != rs->sc_size)
			printf("raid%d: WARNING: %s: "
			    "total sector size in disklabel (%" PRIu32 ") != "
			    "the size of raid (%" PRIu64 ")\n", unit, rs->sc_xname,
			    lp->d_secperunit, rs->sc_size);
		for (i = 0; i < lp->d_npartitions; i++) {
			pp = &lp->d_partitions[i];
			if (pp->p_offset + pp->p_size > rs->sc_size)
				printf("raid%d: WARNING: %s: end of partition `%c' "
				    "exceeds the size of raid (%" PRIu64 ")\n",
				    unit, rs->sc_xname, 'a' + i, rs->sc_size);
		}
	}

}
/*
 * Take care of things one might want to take care of in the event
 * that a disklabel isn't present.
 */
static void
raidmakedisklabel(struct raid_softc *rs)
{
	struct disklabel *lp = rs->sc_dkdev.dk_label;
	db1_printf(("Making a label..\n"));

	/*
	 * For historical reasons, if there's no disklabel present
	 * the raw partition must be marked FS_BSDFFS.
	 */

	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;

	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));

	lp->d_checksum = dkcksum(lp);
}
/*
 * Wait interruptibly for an exclusive lock.
 *
 * XXX
 * Several drivers do this; it should be abstracted and made MP-safe.
 * (Hmm... where have we seen this warning before :-> GO )
 */
static int
raidlock(struct raid_softc *rs)
{
	int error;

	/* sleep until whoever holds RAIDF_LOCKED releases it */
	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
		rs->sc_flags |= RAIDF_WANTED;
		if ((error =
			tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
			return (error);
	}
	rs->sc_flags |= RAIDF_LOCKED;
	return (0);
}
/*
 * Unlock and wake up any waiters.
 */
static void
raidunlock(struct raid_softc *rs)
{

	rs->sc_flags &= ~RAIDF_LOCKED;
	if ((rs->sc_flags & RAIDF_WANTED) != 0) {
		rs->sc_flags &= ~RAIDF_WANTED;
		wakeup(rs);
	}
}


#define RF_COMPONENT_INFO_OFFSET	16384	/* bytes */
#define RF_COMPONENT_INFO_SIZE		1024	/* bytes */
#define RF_PARITY_MAP_SIZE		RF_PARITYMAP_NBYTE

/*
 * Byte offset on each component where the component label lives.
 */
static daddr_t
rf_component_info_offset(void)
{

	return RF_COMPONENT_INFO_OFFSET;
}

/*
 * Size reserved for the component label: at least one sector, and at
 * least RF_COMPONENT_INFO_SIZE bytes.
 */
static daddr_t
rf_component_info_size(unsigned secsize)
{
	daddr_t info_size;

	KASSERT(secsize);
	if (secsize > RF_COMPONENT_INFO_SIZE)
		info_size = secsize;
	else
		info_size = RF_COMPONENT_INFO_SIZE;

	return info_size;
}

/*
 * Byte offset of the parity map area: immediately after the (possibly
 * sector-rounded) component label area.
 */
static daddr_t
rf_parity_map_offset(RF_Raid_t *raidPtr)
{
	daddr_t map_offset;

	KASSERT(raidPtr->bytesPerSector);
	if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE)
		map_offset = raidPtr->bytesPerSector;
	else
		map_offset = RF_COMPONENT_INFO_SIZE;
	map_offset += rf_component_info_offset();

	return map_offset;
}

/*
 * Size of the parity map area: at least one sector, and at least
 * RF_PARITY_MAP_SIZE bytes.
 */
static daddr_t
rf_parity_map_size(RF_Raid_t *raidPtr)
{
	daddr_t map_size;

	if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE)
		map_size = raidPtr->bytesPerSector;
	else
		map_size = RF_PARITY_MAP_SIZE;

	return map_size;
}

/*
 * Mark the component label of column `col' clean and write it back.
 * Always returns 0.
 */
int
raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *clabel;

	clabel = raidget_component_label(raidPtr, col);
	clabel->clean = RF_RAID_CLEAN;
	raidflush_component_label(raidPtr, col);
	return(0);
}


/*
 * Mark the component label of column `col' dirty and write it back.
 * Always returns 0.
 */
int
raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col)
{
	RF_ComponentLabel_t *clabel;

	clabel = raidget_component_label(raidPtr, col);
	clabel->clean = RF_RAID_DIRTY;
	raidflush_component_label(raidPtr, col);
return(0); 2476 } 2477 2478 int 2479 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col) 2480 { 2481 KASSERT(raidPtr->bytesPerSector); 2482 return raidread_component_label(raidPtr->bytesPerSector, 2483 raidPtr->Disks[col].dev, 2484 raidPtr->raid_cinfo[col].ci_vp, 2485 &raidPtr->raid_cinfo[col].ci_label); 2486 } 2487 2488 RF_ComponentLabel_t * 2489 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col) 2490 { 2491 return &raidPtr->raid_cinfo[col].ci_label; 2492 } 2493 2494 int 2495 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col) 2496 { 2497 RF_ComponentLabel_t *label; 2498 2499 label = &raidPtr->raid_cinfo[col].ci_label; 2500 label->mod_counter = raidPtr->mod_counter; 2501 #ifndef RF_NO_PARITY_MAP 2502 label->parity_map_modcount = label->mod_counter; 2503 #endif 2504 return raidwrite_component_label(raidPtr->bytesPerSector, 2505 raidPtr->Disks[col].dev, 2506 raidPtr->raid_cinfo[col].ci_vp, label); 2507 } 2508 2509 2510 static int 2511 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp, 2512 RF_ComponentLabel_t *clabel) 2513 { 2514 return raidread_component_area(dev, b_vp, clabel, 2515 sizeof(RF_ComponentLabel_t), 2516 rf_component_info_offset(), 2517 rf_component_info_size(secsize)); 2518 } 2519 2520 /* ARGSUSED */ 2521 static int 2522 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data, 2523 size_t msize, daddr_t offset, daddr_t dsize) 2524 { 2525 struct buf *bp; 2526 const struct bdevsw *bdev; 2527 int error; 2528 2529 /* XXX should probably ensure that we don't try to do this if 2530 someone has changed rf_protected_sectors. */ 2531 2532 if (b_vp == NULL) { 2533 /* For whatever reason, this component is not valid. 2534 Don't try to read a component label from it. */ 2535 return(EINVAL); 2536 } 2537 2538 /* get a block of the appropriate size... 
*/ 2539 bp = geteblk((int)dsize); 2540 bp->b_dev = dev; 2541 2542 /* get our ducks in a row for the read */ 2543 bp->b_blkno = offset / DEV_BSIZE; 2544 bp->b_bcount = dsize; 2545 bp->b_flags |= B_READ; 2546 bp->b_resid = dsize; 2547 2548 bdev = bdevsw_lookup(bp->b_dev); 2549 if (bdev == NULL) 2550 return (ENXIO); 2551 (*bdev->d_strategy)(bp); 2552 2553 error = biowait(bp); 2554 2555 if (!error) { 2556 memcpy(data, bp->b_data, msize); 2557 } 2558 2559 brelse(bp, 0); 2560 return(error); 2561 } 2562 2563 2564 static int 2565 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp, 2566 RF_ComponentLabel_t *clabel) 2567 { 2568 return raidwrite_component_area(dev, b_vp, clabel, 2569 sizeof(RF_ComponentLabel_t), 2570 rf_component_info_offset(), 2571 rf_component_info_size(secsize), 0); 2572 } 2573 2574 /* ARGSUSED */ 2575 static int 2576 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data, 2577 size_t msize, daddr_t offset, daddr_t dsize, int asyncp) 2578 { 2579 struct buf *bp; 2580 const struct bdevsw *bdev; 2581 int error; 2582 2583 /* get a block of the appropriate size... */ 2584 bp = geteblk((int)dsize); 2585 bp->b_dev = dev; 2586 2587 /* get our ducks in a row for the write */ 2588 bp->b_blkno = offset / DEV_BSIZE; 2589 bp->b_bcount = dsize; 2590 bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0); 2591 bp->b_resid = dsize; 2592 2593 memset(bp->b_data, 0, dsize); 2594 memcpy(bp->b_data, data, msize); 2595 2596 bdev = bdevsw_lookup(bp->b_dev); 2597 if (bdev == NULL) 2598 return (ENXIO); 2599 (*bdev->d_strategy)(bp); 2600 if (asyncp) 2601 return 0; 2602 error = biowait(bp); 2603 brelse(bp, 0); 2604 if (error) { 2605 #if 1 2606 printf("Failed to write RAID component info!\n"); 2607 #endif 2608 } 2609 2610 return(error); 2611 } 2612 2613 void 2614 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map) 2615 { 2616 int c; 2617 2618 for (c = 0; c < raidPtr->numCol; c++) { 2619 /* Skip dead disks. 
 */
		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
			continue;
		/* XXXjld: what if an error occurs here? */
		raidwrite_component_area(raidPtr->Disks[c].dev,
		    raidPtr->raid_cinfo[c].ci_vp, map,
		    RF_PARITYMAP_NBYTE,
		    rf_parity_map_offset(raidPtr),
		    rf_parity_map_size(raidPtr), 0);
	}
}

/*
 * Read the parity map from every live component and merge the results
 * into `map' (union of dirty regions across components).
 */
void
rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map)
{
	struct rf_paritymap_ondisk tmp;
	int c,first;

	first=1;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* Skip dead disks. */
		if (RF_DEAD_DISK(raidPtr->Disks[c].status))
			continue;
		raidread_component_area(raidPtr->Disks[c].dev,
		    raidPtr->raid_cinfo[c].ci_vp, &tmp,
		    RF_PARITYMAP_NBYTE,
		    rf_parity_map_offset(raidPtr),
		    rf_parity_map_size(raidPtr));
		if (first) {
			/* first live component seeds the map... */
			memcpy(map, &tmp, sizeof(*map));
			first = 0;
		} else {
			/* ...the rest are merged in */
			rf_paritymap_merge(map, &tmp);
		}
	}
}

/*
 * Bump the mod counter and mark every live component's label dirty.
 * Called when the set starts taking writes.
 */
void
rf_markalldirty(RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol = -1;

	raidPtr->mod_counter++;
	for (c = 0; c < raidPtr->numCol; c++) {
		/* we don't want to touch (at all) a disk that has
		   failed */
		if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) {
			clabel = raidget_component_label(raidPtr, c);
			if (clabel->status == rf_ds_spared) {
				/* XXX do something special...
				   but whatever you do, don't
				   try to access it!! */
			} else {
				raidmarkdirty(raidPtr, c);
			}
		}
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...
			 */

			/* find the column this spare is standing in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}


/*
 * Rewrite the component labels of every live component (and in-use
 * spare), noting the current unit and, when `final' is
 * RF_FINAL_COMPONENT_UPDATE and the parity is known good, marking
 * them clean.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...
			 */

			/* find the column this spare is standing in for */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}

/*
 * Close the vnode of a component.  Auto-configured components were
 * opened by the kernel (NOCRED); manually-configured ones were opened
 * on behalf of the configuring process, so close with its credentials.
 */
void
rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
{

	if (vp != NULL) {
		if (auto_configured == 1) {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

		} else {
			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
		}
	}
}


/*
 * Close and forget the vnodes of all components and spares of this set.
 */
void
rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
{
	int r,c;
	struct vnode *vp;
	int acd;


	/* We take this opportunity to close the vnodes like we should..
	 */

	for (c = 0; c < raidPtr->numCol; c++) {
		vp = raidPtr->raid_cinfo[c].ci_vp;
		acd = raidPtr->Disks[c].auto_configured;
		rf_close_component(raidPtr, vp, acd);
		raidPtr->raid_cinfo[c].ci_vp = NULL;
		raidPtr->Disks[c].auto_configured = 0;
	}

	for (r = 0; r < raidPtr->numSpare; r++) {
		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
		rf_close_component(raidPtr, vp, acd);
		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
	}
}


/*
 * Kernel thread body: fail a disk and (optionally) reconstruct onto a
 * spare.  Frees `req' and exits the thread when done.
 */
void
rf_ReconThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}

/*
 * Kernel thread body: rewrite all parity, then set the set-wide clean
 * bit on success.  Wakes any shutdown waiter and exits the thread.
 */
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit! If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop? If so, inform them... */
	if (raidPtr->waitShutdown) {
		wakeup(&raidPtr->parity_rewrite_in_progress);
	}

	/* That's all...
	 */
	kthread_exit(0);	/* does not return */
}


/*
 * Kernel thread body: copy reconstructed data back to the original
 * component, then exit the thread.
 */
void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}


/*
 * Kernel thread body: reconstruct in place onto the (replaced)
 * component in req->col.  Frees `req' and exits the thread.
 */
void
rf_ReconstructInPlaceThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}

/*
 * Try to read a component label from (dev, vp).  If it looks
 * reasonable, prepend a new RF_AutoConfig_t for it to ac_list and
 * return the new list head; otherwise close/release the vnode and
 * return ac_list unchanged.  On malloc failure the entire list is
 * freed and NULL is returned.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		/* free everything accumulated so far */
		while(ac_list) {
			ac = ac_list;
			if (ac->clabel)
				free(ac->clabel, M_RAIDFRAME);
			ac_list = ac_list->next;
			free(ac, M_RAIDFRAME);
		}
		printf("RAID auto config: out of memory!\n");
		return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label. Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
			    cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it.
			 */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
			    M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: no label, or an unreasonable one */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}

/*
 * Walk every disk-class device in the system looking for RAIDframe
 * components (RAID-type partitions and RAIDframe wedges) and return a
 * list of candidates for autoconfiguration.
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/* we begin by trolling through *all* the devices on the system */

	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
	     dv = deviter_next(&di)) {

		/* we are only interested in disks... */
		if (device_class(dv) != DV_DISK)
			continue;

		/* we don't care about floppies... */
		if (device_is_a(dv, "fd")) {
			continue;
		}
		/* we don't care about CD's... */
		if (device_is_a(dv, "cd")) {
			continue;
		}
		/* we don't care about md's... */
		if (device_is_a(dv, "md")) {
			continue;
		}
		/* hdfd is the Atari/Hades floppy driver */
		if (device_is_a(dv, "hdfd")) {
			continue;
		}
		/* fdisa is the Atari/Milan floppy driver */
		if (device_is_a(dv, "fdisa")) {
			continue;
		}
		/* need to find the device_name_to_block_device_major stuff */
		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

		/* get a vnode for the raw partition of this disk */

		wedge = device_is_a(dv, "dk");
		bminor = minor(device_unit(dv));
		dev = wedge ? makedev(bmajor, bminor) :
		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
		if (bdevvp(dev, &vp))
			panic("RAID can't alloc vnode");

		error = VOP_OPEN(vp, FREAD, NOCRED);

		if (error) {
			/* "Who cares." Continue looking
			   for something that exists*/
			vput(vp);
			continue;
		}

		error = getdisksize(vp, &numsecs, &secsize);
		if (error) {
			vput(vp);
			continue;
		}
		if (wedge) {
			/* wedges carry their type in the wedge info,
			   not in a disklabel */
			struct dkwedge_info dkw;
			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
			    NOCRED);
			if (error) {
				printf("RAIDframe: can't get wedge info for "
				    "dev %s (%d)\n", device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			ac_list = rf_get_component(ac_list, dev, vp,
			    device_xname(dv), dkw.dkw_size, numsecs, secsize);
			continue;
		}

		/* Ok, the disk exists. Go get the disklabel. */
		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
		if (error) {
			/*
			 * XXX can't happen - open() would
			 * have errored out (or faked up one)
			 */
			if (error != ENOTTY)
				printf("RAIDframe: can't get label for dev "
				    "%s (%d)\n", device_xname(dv), error);
		}

		/* don't need this any more. We'll allocate it again
		   a little later if we really do... */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);

		if (error)
			continue;

		for (i = 0; i < label.d_npartitions; i++) {
			char cname[sizeof(ac_list->devname)];

			/* We only support partitions marked as RAID */
			if (label.d_partitions[i].p_fstype != FS_RAID)
				continue;

			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + i);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
			    label.d_partitions[i].p_size, numsecs, secsize);
		}
	}
	deviter_release(&di);
	return ac_list;
}


/*
 * Return 1 if the component label passes basic sanity checks for a
 * component of `numsecs' sectors, fixing up old-label garbage as a
 * side effect; 0 otherwise.
 */
static int
rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
{

	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
	    ((clabel->clean == RF_RAID_CLEAN) ||
	     (clabel->clean == RF_RAID_DIRTY)) &&
	    clabel->row >=0 &&
	    clabel->column >= 0 &&
	    clabel->num_rows > 0 &&
	    clabel->num_columns > 0 &&
	    clabel->row < clabel->num_rows &&
	    clabel->column < clabel->num_columns &&
	    clabel->blockSize > 0 &&
	    /*
	     * numBlocksHi may contain garbage, but it is ok since
	     * the type is unsigned.  If it is really garbage,
	     * rf_fix_old_label_size() will fix it.
3154 */ 3155 rf_component_label_numblocks(clabel) > 0) { 3156 /* 3157 * label looks reasonable enough... 3158 * let's make sure it has no old garbage. 3159 */ 3160 rf_fix_old_label_size(clabel, numsecs); 3161 return(1); 3162 } 3163 return(0); 3164 } 3165 3166 3167 /* 3168 * For reasons yet unknown, some old component labels have garbage in 3169 * the newer numBlocksHi region, and this causes lossage. Since those 3170 * disks will also have numsecs set to less than 32 bits of sectors, 3171 * we can determine when this corruption has occured, and fix it. 3172 * 3173 * The exact same problem, with the same unknown reason, happens to 3174 * the partitionSizeHi member as well. 3175 */ 3176 static void 3177 rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs) 3178 { 3179 3180 if (numsecs < ((uint64_t)1 << 32)) { 3181 if (clabel->numBlocksHi) { 3182 printf("WARNING: total sectors < 32 bits, yet " 3183 "numBlocksHi set\n" 3184 "WARNING: resetting numBlocksHi to zero.\n"); 3185 clabel->numBlocksHi = 0; 3186 } 3187 3188 if (clabel->partitionSizeHi) { 3189 printf("WARNING: total sectors < 32 bits, yet " 3190 "partitionSizeHi set\n" 3191 "WARNING: resetting partitionSizeHi to zero.\n"); 3192 clabel->partitionSizeHi = 0; 3193 } 3194 } 3195 } 3196 3197 3198 #ifdef DEBUG 3199 void 3200 rf_print_component_label(RF_ComponentLabel_t *clabel) 3201 { 3202 uint64_t numBlocks; 3203 3204 numBlocks = rf_component_label_numblocks(clabel); 3205 3206 printf(" Row: %d Column: %d Num Rows: %d Num Columns: %d\n", 3207 clabel->row, clabel->column, 3208 clabel->num_rows, clabel->num_columns); 3209 printf(" Version: %d Serial Number: %d Mod Counter: %d\n", 3210 clabel->version, clabel->serial_number, 3211 clabel->mod_counter); 3212 printf(" Clean: %s Status: %d\n", 3213 clabel->clean ? 
	       "Yes" : "No", clabel->status);
	printf(" sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf(" RAID Level: %c blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf(" Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf(" Contains root partition: %s\n",
	       clabel->root_partition ? "Yes" : "No");
	printf(" Last configured as: raid%d\n", clabel->last_unit);
#if 0
	printf(" Config order: %d\n", clabel->config_order);
#endif

}
#endif

/*
 * Partition the AutoConfig list into config sets: each set groups the
 * components whose labels agree (per rf_does_it_fit()) and therefore
 * belong to the same RAID set.  Consumes ac_list (relinks its nodes).
 */
RF_ConfigSet_t *
rf_create_auto_sets(RF_AutoConfig_t *ac_list)
{
	RF_AutoConfig_t *ac;
	RF_ConfigSet_t *config_sets;
	RF_ConfigSet_t *cset;
	RF_AutoConfig_t *ac_next;


	config_sets = NULL;

	/* Go through the AutoConfig list, and figure out which components
	   belong to what sets. */
	ac = ac_list;
	while(ac!=NULL) {
		/* we're going to putz with ac->next, so save it here
		   for use at the end of the loop */
		ac_next = ac->next;

		if (config_sets == NULL) {
			/* will need at least this one... */
			config_sets = (RF_ConfigSet_t *)
				malloc(sizeof(RF_ConfigSet_t),
				       M_RAIDFRAME, M_NOWAIT);
			if (config_sets == NULL) {
				panic("rf_create_auto_sets: No memory!");
			}
			/* this one is easy :) */
			config_sets->ac = ac;
			config_sets->next = NULL;
			config_sets->rootable = 0;
			ac->next = NULL;
		} else {
			/* which set does this component fit into? */
			cset = config_sets;
			while(cset!=NULL) {
				if (rf_does_it_fit(cset, ac)) {
					/* looks like it matches... */
					ac->next = cset->ac;
					cset->ac = ac;
					break;
				}
				cset = cset->next;
			}
			if (cset==NULL) {
				/* didn't find a match above...
				   new set..*/
				cset = (RF_ConfigSet_t *)
					malloc(sizeof(RF_ConfigSet_t),
					       M_RAIDFRAME, M_NOWAIT);
				if (cset == NULL) {
					panic("rf_create_auto_sets: No memory!");
				}
				cset->ac = ac;
				ac->next = NULL;
				cset->next = config_sets;
				cset->rootable = 0;
				config_sets = cset;
			}
		}
		ac = ac_next;
	}


	return(config_sets);
}

/*
 * Return 1 if component `ac' belongs to config set `cset' (its label
 * is consistent with the set's first member), 0 otherwise.
 */
static int
rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
{
	RF_ComponentLabel_t *clabel1, *clabel2;

	/* If this one matches the *first* one in the set, that's good
	   enough, since the other members of the set would have been
	   through here too... */
	/* note that we are not checking partitionSize here..

	   Note that we are also not checking the mod_counters here.
	   If everything else matches except the mod_counter, that's
	   good enough for this test.  We will deal with the mod_counters
	   a little later in the autoconfiguration process.

	    (clabel1->mod_counter == clabel2->mod_counter) &&

	   The reason we don't check for this is that failed disks
	   will have lower modification counts.  If those disks are
	   not added to the set they used to belong to, then they will
	   form their own set, which may result in 2 different sets,
	   for example, competing to be configured at raid0, and
	   perhaps competing to be the root filesystem set.  If the
	   wrong ones get configured, or both attempt to become /,
	   weird behaviour and or serious lossage will occur.  Thus we
	   need to bring them into the fold here, and kick them out at
	   a later point.
	 */

	clabel1 = cset->ac->clabel;
	clabel2 = ac->clabel;
	if ((clabel1->version == clabel2->version) &&
	    (clabel1->serial_number == clabel2->serial_number) &&
	    (clabel1->num_rows == clabel2->num_rows) &&
	    (clabel1->num_columns == clabel2->num_columns) &&
	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
	    (clabel1->parityConfig == clabel2->parityConfig) &&
	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
	    (clabel1->blockSize == clabel2->blockSize) &&
	    rf_component_label_numblocks(clabel1) ==
	    rf_component_label_numblocks(clabel2) &&
	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
	    (clabel1->root_partition == clabel2->root_partition) &&
	    (clabel1->last_unit == clabel2->last_unit) &&
	    (clabel1->config_order == clabel2->config_order)) {
		/* if it get's here, it almost *has* to be a match */
	} else {
		/* it's not consistent with somebody in the set..
		   punt */
		return(0);
	}
	/* all was fine.. it must fit... */
	return(1);
}

/*
 * Return 1 if config set `cset' has enough live, up-to-date components
 * (per its RAID level's fault tolerance) to be autoconfigured; 0
 * otherwise.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set. */

	/* the highest mod_counter present wins: stale (failed) members
	   carry lower counters */
	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
			/* Didn't find one here! */
			/* special case for RAID 1, especially
			   where there are more than 2
			   components (where RAIDframe treats
			   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) {	/* even component */
					even_pair_failed = 1;
				} else {	/* odd component.  If
						   we're failed, and
						   so is the even
						   component, it's
						   "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
			/* Just did an even component, and we didn't
			   bail.. reset the even_pair_failed flag,
			   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}

/*
 * Build an RF_Config_t for autoconfiguring the set that `ac' heads,
 * from the information recorded in the component labels.
 */
void
rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
			RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int i;

	clabel = ac->clabel;

	/* 1. Fill in the common stuff */
	config->numRow = clabel->num_rows = 1;
	config->numCol = clabel->num_columns;
	config->numSpare = 0;	/* XXX should this be set here? */
	config->sectPerSU = clabel->sectPerSU;
	config->SUsPerPU = clabel->SUsPerPU;
	config->SUsPerRU = clabel->SUsPerRU;
	config->parityConfig = clabel->parityConfig;
	/* XXX... */
	strcpy(config->diskQueueType,"fifo");
	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
	config->layoutSpecificSize = 0;	/* XXX ??
*/ 3477 3478 while(ac!=NULL) { 3479 /* row/col values will be in range due to the checks 3480 in reasonable_label() */ 3481 strcpy(config->devnames[0][ac->clabel->column], 3482 ac->devname); 3483 ac = ac->next; 3484 } 3485 3486 for(i=0;i<RF_MAXDBGV;i++) { 3487 config->debugVars[i][0] = 0; 3488 } 3489 } 3490 3491 int 3492 rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value) 3493 { 3494 RF_ComponentLabel_t *clabel; 3495 int column; 3496 int sparecol; 3497 3498 raidPtr->autoconfigure = new_value; 3499 3500 for(column=0; column<raidPtr->numCol; column++) { 3501 if (raidPtr->Disks[column].status == rf_ds_optimal) { 3502 clabel = raidget_component_label(raidPtr, column); 3503 clabel->autoconfigure = new_value; 3504 raidflush_component_label(raidPtr, column); 3505 } 3506 } 3507 for(column = 0; column < raidPtr->numSpare ; column++) { 3508 sparecol = raidPtr->numCol + column; 3509 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) { 3510 clabel = raidget_component_label(raidPtr, sparecol); 3511 clabel->autoconfigure = new_value; 3512 raidflush_component_label(raidPtr, sparecol); 3513 } 3514 } 3515 return(new_value); 3516 } 3517 3518 int 3519 rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value) 3520 { 3521 RF_ComponentLabel_t *clabel; 3522 int column; 3523 int sparecol; 3524 3525 raidPtr->root_partition = new_value; 3526 for(column=0; column<raidPtr->numCol; column++) { 3527 if (raidPtr->Disks[column].status == rf_ds_optimal) { 3528 clabel = raidget_component_label(raidPtr, column); 3529 clabel->root_partition = new_value; 3530 raidflush_component_label(raidPtr, column); 3531 } 3532 } 3533 for(column = 0; column < raidPtr->numSpare ; column++) { 3534 sparecol = raidPtr->numCol + column; 3535 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) { 3536 clabel = raidget_component_label(raidPtr, sparecol); 3537 clabel->root_partition = new_value; 3538 raidflush_component_label(raidPtr, sparecol); 3539 } 3540 } 3541 return(new_value); 3542 } 3543 3544 void 3545 
rf_release_all_vps(RF_ConfigSet_t *cset) 3546 { 3547 RF_AutoConfig_t *ac; 3548 3549 ac = cset->ac; 3550 while(ac!=NULL) { 3551 /* Close the vp, and give it back */ 3552 if (ac->vp) { 3553 vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY); 3554 VOP_CLOSE(ac->vp, FREAD, NOCRED); 3555 vput(ac->vp); 3556 ac->vp = NULL; 3557 } 3558 ac = ac->next; 3559 } 3560 } 3561 3562 3563 void 3564 rf_cleanup_config_set(RF_ConfigSet_t *cset) 3565 { 3566 RF_AutoConfig_t *ac; 3567 RF_AutoConfig_t *next_ac; 3568 3569 ac = cset->ac; 3570 while(ac!=NULL) { 3571 next_ac = ac->next; 3572 /* nuke the label */ 3573 free(ac->clabel, M_RAIDFRAME); 3574 /* cleanup the config structure */ 3575 free(ac, M_RAIDFRAME); 3576 /* "next.." */ 3577 ac = next_ac; 3578 } 3579 /* and, finally, nuke the config set */ 3580 free(cset, M_RAIDFRAME); 3581 } 3582 3583 3584 void 3585 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel) 3586 { 3587 /* current version number */ 3588 clabel->version = RF_COMPONENT_LABEL_VERSION; 3589 clabel->serial_number = raidPtr->serial_number; 3590 clabel->mod_counter = raidPtr->mod_counter; 3591 3592 clabel->num_rows = 1; 3593 clabel->num_columns = raidPtr->numCol; 3594 clabel->clean = RF_RAID_DIRTY; /* not clean */ 3595 clabel->status = rf_ds_optimal; /* "It's good!" 
*/ 3596 3597 clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit; 3598 clabel->SUsPerPU = raidPtr->Layout.SUsPerPU; 3599 clabel->SUsPerRU = raidPtr->Layout.SUsPerRU; 3600 3601 clabel->blockSize = raidPtr->bytesPerSector; 3602 rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk); 3603 3604 /* XXX not portable */ 3605 clabel->parityConfig = raidPtr->Layout.map->parityConfig; 3606 clabel->maxOutstanding = raidPtr->maxOutstanding; 3607 clabel->autoconfigure = raidPtr->autoconfigure; 3608 clabel->root_partition = raidPtr->root_partition; 3609 clabel->last_unit = raidPtr->raidid; 3610 clabel->config_order = raidPtr->config_order; 3611 3612 #ifndef RF_NO_PARITY_MAP 3613 rf_paritymap_init_label(raidPtr->parity_map, clabel); 3614 #endif 3615 } 3616 3617 int 3618 rf_auto_config_set(RF_ConfigSet_t *cset, int *unit) 3619 { 3620 RF_Raid_t *raidPtr; 3621 RF_Config_t *config; 3622 int raidID; 3623 int retcode; 3624 3625 #ifdef DEBUG 3626 printf("RAID autoconfigure\n"); 3627 #endif 3628 3629 retcode = 0; 3630 *unit = -1; 3631 3632 /* 1. Create a config structure */ 3633 3634 config = (RF_Config_t *)malloc(sizeof(RF_Config_t), 3635 M_RAIDFRAME, 3636 M_NOWAIT); 3637 if (config==NULL) { 3638 printf("Out of mem!?!?\n"); 3639 /* XXX do something more intelligent here. */ 3640 return(1); 3641 } 3642 3643 memset(config, 0, sizeof(RF_Config_t)); 3644 3645 /* 3646 2. Figure out what RAID ID this one is supposed to live at 3647 See if we can get the same RAID dev that it was configured 3648 on last time.. 3649 */ 3650 3651 raidID = cset->ac->clabel->last_unit; 3652 if ((raidID < 0) || (raidID >= numraid)) { 3653 /* let's not wander off into lala land. */ 3654 raidID = numraid - 1; 3655 } 3656 if (raidPtrs[raidID]->valid != 0) { 3657 3658 /* 3659 Nope... Go looking for an alternative... 3660 Start high so we don't immediately use raid0 if that's 3661 not taken. 
3662 */ 3663 3664 for(raidID = numraid - 1; raidID >= 0; raidID--) { 3665 if (raidPtrs[raidID]->valid == 0) { 3666 /* can use this one! */ 3667 break; 3668 } 3669 } 3670 } 3671 3672 if (raidID < 0) { 3673 /* punt... */ 3674 printf("Unable to auto configure this set!\n"); 3675 printf("(Out of RAID devs!)\n"); 3676 free(config, M_RAIDFRAME); 3677 return(1); 3678 } 3679 3680 #ifdef DEBUG 3681 printf("Configuring raid%d:\n",raidID); 3682 #endif 3683 3684 raidPtr = raidPtrs[raidID]; 3685 3686 /* XXX all this stuff should be done SOMEWHERE ELSE! */ 3687 raidPtr->raidid = raidID; 3688 raidPtr->openings = RAIDOUTSTANDING; 3689 3690 /* 3. Build the configuration structure */ 3691 rf_create_configuration(cset->ac, config, raidPtr); 3692 3693 /* 4. Do the configuration */ 3694 retcode = rf_Configure(raidPtr, config, cset->ac); 3695 3696 if (retcode == 0) { 3697 3698 raidinit(raidPtrs[raidID]); 3699 3700 rf_markalldirty(raidPtrs[raidID]); 3701 raidPtrs[raidID]->autoconfigure = 1; /* XXX do this here? */ 3702 if (cset->ac->clabel->root_partition==1) { 3703 /* everything configured just fine. Make a note 3704 that this set is eligible to be root. */ 3705 cset->rootable = 1; 3706 /* XXX do this here? */ 3707 raidPtrs[raidID]->root_partition = 1; 3708 } 3709 } 3710 3711 /* 5. 
Cleanup */ 3712 free(config, M_RAIDFRAME); 3713 3714 *unit = raidID; 3715 return(retcode); 3716 } 3717 3718 void 3719 rf_disk_unbusy(RF_RaidAccessDesc_t *desc) 3720 { 3721 struct buf *bp; 3722 3723 bp = (struct buf *)desc->bp; 3724 disk_unbusy(&raid_softc[desc->raidPtr->raidid].sc_dkdev, 3725 (bp->b_bcount - bp->b_resid), (bp->b_flags & B_READ)); 3726 } 3727 3728 void 3729 rf_pool_init(struct pool *p, size_t size, const char *w_chan, 3730 size_t xmin, size_t xmax) 3731 { 3732 pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO); 3733 pool_sethiwat(p, xmax); 3734 pool_prime(p, xmin); 3735 pool_setlowat(p, xmin); 3736 } 3737 3738 /* 3739 * rf_buf_queue_check(int raidid) -- looks into the buf_queue to see 3740 * if there is IO pending and if that IO could possibly be done for a 3741 * given RAID set. Returns 0 if IO is waiting and can be done, 1 3742 * otherwise. 3743 * 3744 */ 3745 3746 int 3747 rf_buf_queue_check(int raidid) 3748 { 3749 if ((bufq_peek(raid_softc[raidid].buf_queue) != NULL) && 3750 raidPtrs[raidid]->openings > 0) { 3751 /* there is work to do */ 3752 return 0; 3753 } 3754 /* default is nothing to do */ 3755 return 1; 3756 } 3757 3758 int 3759 rf_getdisksize(struct vnode *vp, struct lwp *l, RF_RaidDisk_t *diskPtr) 3760 { 3761 uint64_t numsecs; 3762 unsigned secsize; 3763 int error; 3764 3765 error = getdisksize(vp, &numsecs, &secsize); 3766 if (error == 0) { 3767 diskPtr->blockSize = secsize; 3768 diskPtr->numBlocks = numsecs - rf_protectedSectors; 3769 diskPtr->partitionSize = numsecs; 3770 return 0; 3771 } 3772 return error; 3773 } 3774 3775 static int 3776 raid_match(device_t self, cfdata_t cfdata, void *aux) 3777 { 3778 return 1; 3779 } 3780 3781 static void 3782 raid_attach(device_t parent, device_t self, void *aux) 3783 { 3784 3785 } 3786 3787 3788 static int 3789 raid_detach(device_t self, int flags) 3790 { 3791 int error; 3792 struct raid_softc *rs = &raid_softc[device_unit(self)]; 3793 3794 if ((error = raidlock(rs)) != 0) 3795 return (error); 
3796 3797 error = raid_detach_unlocked(rs); 3798 3799 raidunlock(rs); 3800 3801 return error; 3802 } 3803 3804 static void 3805 rf_set_properties(struct raid_softc *rs, RF_Raid_t *raidPtr) 3806 { 3807 prop_dictionary_t disk_info, odisk_info, geom; 3808 disk_info = prop_dictionary_create(); 3809 geom = prop_dictionary_create(); 3810 prop_dictionary_set_uint64(geom, "sectors-per-unit", 3811 raidPtr->totalSectors); 3812 prop_dictionary_set_uint32(geom, "sector-size", 3813 raidPtr->bytesPerSector); 3814 3815 prop_dictionary_set_uint16(geom, "sectors-per-track", 3816 raidPtr->Layout.dataSectorsPerStripe); 3817 prop_dictionary_set_uint16(geom, "tracks-per-cylinder", 3818 4 * raidPtr->numCol); 3819 3820 prop_dictionary_set_uint64(geom, "cylinders-per-unit", 3821 raidPtr->totalSectors / (raidPtr->Layout.dataSectorsPerStripe * 3822 (4 * raidPtr->numCol))); 3823 3824 prop_dictionary_set(disk_info, "geometry", geom); 3825 prop_object_release(geom); 3826 prop_dictionary_set(device_properties(rs->sc_dev), 3827 "disk-info", disk_info); 3828 odisk_info = rs->sc_dkdev.dk_info; 3829 rs->sc_dkdev.dk_info = disk_info; 3830 if (odisk_info) 3831 prop_object_release(odisk_info); 3832 } 3833 3834 /* 3835 * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components. 3836 * We end up returning whatever error was returned by the first cache flush 3837 * that fails. 
3838 */ 3839 3840 int 3841 rf_sync_component_caches(RF_Raid_t *raidPtr) 3842 { 3843 int c, sparecol; 3844 int e,error; 3845 int force = 1; 3846 3847 error = 0; 3848 for (c = 0; c < raidPtr->numCol; c++) { 3849 if (raidPtr->Disks[c].status == rf_ds_optimal) { 3850 e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC, 3851 &force, FWRITE, NOCRED); 3852 if (e) { 3853 if (e != ENODEV) 3854 printf("raid%d: cache flush to component %s failed.\n", 3855 raidPtr->raidid, raidPtr->Disks[c].devname); 3856 if (error == 0) { 3857 error = e; 3858 } 3859 } 3860 } 3861 } 3862 3863 for( c = 0; c < raidPtr->numSpare ; c++) { 3864 sparecol = raidPtr->numCol + c; 3865 /* Need to ensure that the reconstruct actually completed! */ 3866 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) { 3867 e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp, 3868 DIOCCACHESYNC, &force, FWRITE, NOCRED); 3869 if (e) { 3870 if (e != ENODEV) 3871 printf("raid%d: cache flush to component %s failed.\n", 3872 raidPtr->raidid, raidPtr->Disks[sparecol].devname); 3873 if (error == 0) { 3874 error = e; 3875 } 3876 } 3877 } 3878 } 3879 return error; 3880 } 3881