1 /* $NetBSD: rf_netbsdkintf.c,v 1.296 2012/02/16 06:52:03 buhrow Exp $ */ 2 3 /*- 4 * Copyright (c) 1996, 1997, 1998, 2008-2011 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Greg Oster; Jason R. Thorpe. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * Copyright (c) 1988 University of Utah. 34 * Copyright (c) 1990, 1993 35 * The Regents of the University of California. All rights reserved. 36 * 37 * This code is derived from software contributed to Berkeley by 38 * the Systems Programming Group of the University of Utah Computer 39 * Science Department. 
40 * 41 * Redistribution and use in source and binary forms, with or without 42 * modification, are permitted provided that the following conditions 43 * are met: 44 * 1. Redistributions of source code must retain the above copyright 45 * notice, this list of conditions and the following disclaimer. 46 * 2. Redistributions in binary form must reproduce the above copyright 47 * notice, this list of conditions and the following disclaimer in the 48 * documentation and/or other materials provided with the distribution. 49 * 3. Neither the name of the University nor the names of its contributors 50 * may be used to endorse or promote products derived from this software 51 * without specific prior written permission. 52 * 53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 63 * SUCH DAMAGE. 64 * 65 * from: Utah $Hdr: cd.c 1.6 90/11/28$ 66 * 67 * @(#)cd.c 8.2 (Berkeley) 11/16/93 68 */ 69 70 /* 71 * Copyright (c) 1995 Carnegie-Mellon University. 72 * All rights reserved. 
73 * 74 * Authors: Mark Holland, Jim Zelenka 75 * 76 * Permission to use, copy, modify and distribute this software and 77 * its documentation is hereby granted, provided that both the copyright 78 * notice and this permission notice appear in all copies of the 79 * software, derivative works or modified versions, and any portions 80 * thereof, and that both notices appear in supporting documentation. 81 * 82 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 83 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 84 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 85 * 86 * Carnegie Mellon requests users of this software to return to 87 * 88 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 89 * School of Computer Science 90 * Carnegie Mellon University 91 * Pittsburgh PA 15213-3890 92 * 93 * any improvements or extensions that they make and grant Carnegie the 94 * rights to redistribute these changes. 95 */ 96 97 /*********************************************************** 98 * 99 * rf_kintf.c -- the kernel interface routines for RAIDframe 100 * 101 ***********************************************************/ 102 103 #include <sys/cdefs.h> 104 __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.296 2012/02/16 06:52:03 buhrow Exp $"); 105 106 #ifdef _KERNEL_OPT 107 #include "opt_compat_netbsd.h" 108 #include "opt_raid_autoconfig.h" 109 #include "raid.h" 110 #endif 111 112 #include <sys/param.h> 113 #include <sys/errno.h> 114 #include <sys/pool.h> 115 #include <sys/proc.h> 116 #include <sys/queue.h> 117 #include <sys/disk.h> 118 #include <sys/device.h> 119 #include <sys/stat.h> 120 #include <sys/ioctl.h> 121 #include <sys/fcntl.h> 122 #include <sys/systm.h> 123 #include <sys/vnode.h> 124 #include <sys/disklabel.h> 125 #include <sys/conf.h> 126 #include <sys/buf.h> 127 #include <sys/bufq.h> 128 #include <sys/reboot.h> 129 #include <sys/kauth.h> 130 131 #include <prop/proplib.h> 132 133 #include 
<dev/raidframe/raidframevar.h> 134 #include <dev/raidframe/raidframeio.h> 135 #include <dev/raidframe/rf_paritymap.h> 136 137 #include "rf_raid.h" 138 #include "rf_copyback.h" 139 #include "rf_dag.h" 140 #include "rf_dagflags.h" 141 #include "rf_desc.h" 142 #include "rf_diskqueue.h" 143 #include "rf_etimer.h" 144 #include "rf_general.h" 145 #include "rf_kintf.h" 146 #include "rf_options.h" 147 #include "rf_driver.h" 148 #include "rf_parityscan.h" 149 #include "rf_threadstuff.h" 150 151 #ifdef COMPAT_50 152 #include "rf_compat50.h" 153 #endif 154 155 #ifdef DEBUG 156 int rf_kdebug_level = 0; 157 #define db1_printf(a) if (rf_kdebug_level > 0) printf a 158 #else /* DEBUG */ 159 #define db1_printf(a) { } 160 #endif /* DEBUG */ 161 162 static RF_Raid_t **raidPtrs; /* global raid device descriptors */ 163 164 #if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0) 165 static rf_declare_mutex2(rf_sparet_wait_mutex); 166 static rf_declare_cond2(rf_sparet_wait_cv); 167 static rf_declare_cond2(rf_sparet_resp_cv); 168 169 static RF_SparetWait_t *rf_sparet_wait_queue; /* requests to install a 170 * spare table */ 171 static RF_SparetWait_t *rf_sparet_resp_queue; /* responses from 172 * installation process */ 173 #endif 174 175 MALLOC_DEFINE(M_RAIDFRAME, "RAIDframe", "RAIDframe structures"); 176 177 /* prototypes */ 178 static void KernelWakeupFunc(struct buf *); 179 static void InitBP(struct buf *, struct vnode *, unsigned, 180 dev_t, RF_SectorNum_t, RF_SectorCount_t, void *, void (*) (struct buf *), 181 void *, int, struct proc *); 182 static void raidinit(RF_Raid_t *); 183 184 void raidattach(int); 185 static int raid_match(device_t, cfdata_t, void *); 186 static void raid_attach(device_t, device_t, void *); 187 static int raid_detach(device_t, int); 188 189 static int raidread_component_area(dev_t, struct vnode *, void *, size_t, 190 daddr_t, daddr_t); 191 static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t, 192 daddr_t, daddr_t, int); 193 194 static int 
raidwrite_component_label(unsigned,
    dev_t, struct vnode *, RF_ComponentLabel_t *);
static int raidread_component_label(unsigned,
    dev_t, struct vnode *, RF_ComponentLabel_t *);


dev_type_open(raidopen);
dev_type_close(raidclose);
dev_type_read(raidread);
dev_type_write(raidwrite);
dev_type_ioctl(raidioctl);
dev_type_strategy(raidstrategy);
dev_type_dump(raiddump);
dev_type_size(raidsize);

/* Block-device switch entry points for the raid pseudo-device. */
const struct bdevsw raid_bdevsw = {
	raidopen, raidclose, raidstrategy, raidioctl,
	raiddump, raidsize, D_DISK
};

/* Character-device switch entry points. */
const struct cdevsw raid_cdevsw = {
	raidopen, raidclose, raidread, raidwrite, raidioctl,
	nostop, notty, nopoll, nommap, nokqfilter, D_DISK
};

static struct dkdriver rf_dkdriver = { raidstrategy, minphys };

/* XXX Not sure if the following should be replacing the raidPtrs above,
   or if it should be used in conjunction with that...
*/

/* Per-unit software state for each configured RAID pseudo-device. */
struct raid_softc {
	device_t sc_dev;		/* autoconf(9) device handle */
	int     sc_flags;		/* flags (RAIDF_* below) */
	int     sc_cflags;		/* configuration flags */
	uint64_t sc_size;		/* size of the raid device */
	char    sc_xname[20];		/* XXX external name */
	struct disk sc_dkdev;		/* generic disk device info */
	struct bufq_state *buf_queue;	/* used for the device queue */
};
/* sc_flags */
#define RAIDF_INITED	0x01	/* unit has been initialized */
#define RAIDF_WLABEL	0x02	/* label area is writable */
#define RAIDF_LABELLING	0x04	/* unit is currently being labelled */
#define RAIDF_SHUTDOWN	0x08	/* unit is being shutdown */
#define RAIDF_WANTED	0x40	/* someone is waiting to obtain a lock */
#define RAIDF_LOCKED	0x80	/* unit is locked */

#define	raidunit(x)	DISKUNIT(x)
int numraid = 0;

extern struct cfdriver raid_cd;
CFATTACH_DECL3_NEW(raid, sizeof(struct raid_softc),
    raid_match, raid_attach, raid_detach, NULL, NULL, NULL,
    DVF_DETACH_SHUTDOWN);

/*
 * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device.
 * Be aware that large numbers can allow the driver to consume a lot of
 * kernel memory, especially on writes, and in degraded mode reads.
 *
 * For example: with a stripe width of 64 blocks (32k) and 5 disks,
 * a single 64K write will typically require 64K for the old data,
 * 64K for the old parity, and 64K for the new parity, for a total
 * of 192K (if the parity buffer is not re-used immediately).
 * Even if it is used immediately, that's still 128K, which when multiplied
 * by say 10 requests, is 1280K, *on top* of the 640K of incoming data.
 *
 * Now in degraded mode, for example, a 64K read on the above setup may
 * require data reconstruction, which will require *all* of the 4 remaining
 * disks to participate -- 4 * 32K/disk == 128K again.
 */

#ifndef RAIDOUTSTANDING
#define RAIDOUTSTANDING   6
#endif

#define RAIDLABELDEV(dev)	\
	(MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))

/* declared here, and made public, for the benefit of KVM stuff.. */
struct raid_softc *raid_softc;

static void raidgetdefaultlabel(RF_Raid_t *, struct raid_softc *,
				struct disklabel *);
static void raidgetdisklabel(dev_t);
static void raidmakedisklabel(struct raid_softc *);

static int raidlock(struct raid_softc *);
static void raidunlock(struct raid_softc *);

static int raid_detach_unlocked(struct raid_softc *);

static void rf_markalldirty(RF_Raid_t *);
static void rf_set_properties(struct raid_softc *, RF_Raid_t *);

void rf_ReconThread(struct rf_recon_req *);
void rf_RewriteParityThread(RF_Raid_t *raidPtr);
void rf_CopybackThread(RF_Raid_t *raidPtr);
void rf_ReconstructInPlaceThread(struct rf_recon_req *);
int rf_autoconfig(device_t);
void rf_buildroothack(RF_ConfigSet_t *);

RF_AutoConfig_t *rf_find_raid_components(void);
RF_ConfigSet_t *rf_create_auto_sets(RF_AutoConfig_t *);
static int rf_does_it_fit(RF_ConfigSet_t *,RF_AutoConfig_t *);
int rf_reasonable_label(RF_ComponentLabel_t *, uint64_t);
void rf_create_configuration(RF_AutoConfig_t *,RF_Config_t *, RF_Raid_t *);
int rf_set_autoconfig(RF_Raid_t *, int);
int rf_set_rootpartition(RF_Raid_t *, int);
void rf_release_all_vps(RF_ConfigSet_t *);
void rf_cleanup_config_set(RF_ConfigSet_t *);
int rf_have_enough_components(RF_ConfigSet_t *);
int rf_auto_config_set(RF_ConfigSet_t *, int *);
static void rf_fix_old_label_size(RF_ComponentLabel_t *, uint64_t);

/*
 * Debugging, mostly.  Set to 0 to not allow autoconfig to take place.
 * Note that this is overridden by having RAID_AUTOCONFIG as an option
 * in the kernel config file
 */
#ifdef RAID_AUTOCONFIG
int raidautoconfig = 1;
#else
int raidautoconfig = 0;
#endif
static bool raidautoconfigdone = false;

struct RF_Pools_s rf_pools;

/*
 * raidattach(num):
 *	Pseudo-device attach routine.  Allocates the global raidPtrs[]
 *	and raid_softc[] arrays for "num" units, boots the RAIDframe
 *	core, attaches the cfattach, and registers a config finalizer
 *	so RAID auto-configuration runs once all real hardware devices
 *	have been found.
 */
void
raidattach(int num)
{
	int raidID;
	int i, rc;

	aprint_debug("raidattach: Asked for %d units\n", num);

	if (num <= 0) {
#ifdef DIAGNOSTIC
		panic("raidattach: count <= 0");
#endif
		return;
	}
	/* This is where all the initialization stuff gets done. */

	numraid = num;

	/* Make some space for requested number of units... */

	RF_Malloc(raidPtrs, num * sizeof(RF_Raid_t *), (RF_Raid_t **));
	if (raidPtrs == NULL) {
		panic("raidPtrs is NULL!!");
	}

#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
	rf_init_mutex2(rf_sparet_wait_mutex, IPL_VM);
	rf_init_cond2(rf_sparet_wait_cv, "sparetw");
	rf_init_cond2(rf_sparet_resp_cv, "rfgst");

	rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
#endif

	for (i = 0; i < num; i++)
		raidPtrs[i] = NULL;
	rc = rf_BootRaidframe();
	if (rc == 0)
		aprint_verbose("Kernelized RAIDframe activated\n");
	else
		panic("Serious error booting RAID!!");

	/* put together some datastructures like the CCD device does.. This
	 * lets us lock the device and what-not when it gets opened. */

	raid_softc = (struct raid_softc *)
	    malloc(num * sizeof(struct raid_softc),
		   M_RAIDFRAME, M_NOWAIT);
	if (raid_softc == NULL) {
		aprint_error("WARNING: no memory for RAIDframe driver\n");
		return;
	}

	memset(raid_softc, 0, num * sizeof(struct raid_softc));

	for (raidID = 0; raidID < num; raidID++) {
		bufq_alloc(&raid_softc[raidID].buf_queue, "fcfs", 0);

		RF_Malloc(raidPtrs[raidID], sizeof(RF_Raid_t),
			  (RF_Raid_t *));
		if (raidPtrs[raidID] == NULL) {
			aprint_error("WARNING: raidPtrs[%d] is NULL\n", raidID);
			/* truncate numraid so only usable units are visible */
			numraid = raidID;
			return;
		}
	}

	if (config_cfattach_attach(raid_cd.cd_name, &raid_ca)) {
		aprint_error("raidattach: config_cfattach_attach failed?\n");
	}

	raidautoconfigdone = false;

	/*
	 * Register a finalizer which will be used to auto-config RAID
	 * sets once all real hardware devices have been found.
	 */
	if (config_finalize_register(NULL, rf_autoconfig) != 0)
		aprint_error("WARNING: unable to register RAIDframe finalizer\n");
}

/*
 * Config finalizer: locate all RAID components in the system, sort
 * them into configuration sets, and configure the valid ones (the
 * actual configuration happens in rf_buildroothack()).  Runs at most
 * once; returns 1 if a scan was performed, 0 otherwise.
 */
int
rf_autoconfig(device_t self)
{
	RF_AutoConfig_t *ac_list;
	RF_ConfigSet_t *config_sets;

	if (!raidautoconfig || raidautoconfigdone == true)
		return (0);

	/* XXX This code can only be run once. */
	raidautoconfigdone = true;

	/* 1. locate all RAID components on the system */
	aprint_debug("Searching for RAID components...\n");
	ac_list = rf_find_raid_components();

	/* 2. Sort them into their respective sets. */
	config_sets = rf_create_auto_sets(ac_list);

	/*
	 * 3. Evaluate each set and configure the valid ones.
	 * This gets done in rf_buildroothack().
	 */
	rf_buildroothack(config_sets);

	return 1;
}

/*
 * Auto-configure each eligible configuration set, then try to decide
 * which configured set (if any) holds the root file system and point
 * booted_device at it.  If several sets are rootable and the boot
 * device cannot disambiguate, fall back to RB_ASKNAME so the user is
 * asked at setroot() time.
 */
void
rf_buildroothack(RF_ConfigSet_t *config_sets)
{
	RF_ConfigSet_t *cset;
	RF_ConfigSet_t *next_cset;
	int retcode;
	int raidID;
	int rootID;
	int col;
	int num_root;
	char *devname;

	rootID = 0;
	num_root = 0;
	cset = config_sets;
	while (cset != NULL) {
		next_cset = cset->next;
		if (rf_have_enough_components(cset) &&
		    cset->ac->clabel->autoconfigure==1) {
			retcode = rf_auto_config_set(cset,&raidID);
			if (!retcode) {
				aprint_debug("raid%d: configured ok\n", raidID);
				if (cset->rootable) {
					rootID = raidID;
					num_root++;
				}
			} else {
				/* The autoconfig didn't work :( */
				aprint_debug("Autoconfig failed with code %d for raid%d\n", retcode, raidID);
				rf_release_all_vps(cset);
			}
		} else {
			/* we're not autoconfiguring this set...
			   release the associated resources */
			rf_release_all_vps(cset);
		}
		/* cleanup */
		rf_cleanup_config_set(cset);
		cset = next_cset;
	}

	/* if the user has specified what the root device should be
	   then we don't touch booted_device or boothowto... */

	if (rootspec != NULL)
		return;

	/* we found something bootable... */

	if (num_root == 1) {
		booted_device = raid_softc[rootID].sc_dev;
	} else if (num_root > 1) {

		/*
		 * Maybe the MD code can help.  If it cannot, then
		 * setroot() will discover that we have no
		 * booted_device and will ask the user if nothing was
		 * hardwired in the kernel config file
		 */

		if (booted_device == NULL)
			cpu_rootconf();
		if (booted_device == NULL)
			return;

		/* Count only the sets that contain the boot device. */
		num_root = 0;
		for (raidID = 0; raidID < numraid; raidID++) {
			if (raidPtrs[raidID]->valid == 0)
				continue;

			if (raidPtrs[raidID]->root_partition == 0)
				continue;

			for (col = 0; col < raidPtrs[raidID]->numCol; col++) {
				devname = raidPtrs[raidID]->Disks[col].devname;
				/* skip the "/dev/" prefix for comparison */
				devname += sizeof("/dev/") - 1;
				if (strncmp(devname, device_xname(booted_device),
					    strlen(device_xname(booted_device))) != 0)
					continue;
				aprint_debug("raid%d includes boot device %s\n",
					     raidID, devname);
				num_root++;
				rootID = raidID;
			}
		}

		if (num_root == 1) {
			booted_device = raid_softc[rootID].sc_dev;
		} else {
			/* we can't guess.. require the user to answer... */
			boothowto |= RB_ASKNAME;
		}
	}
}


/*
 * Return the size (in DEV_BSIZE units) of the given partition, or -1
 * if the unit is not configured or the partition is not FS_SWAP.
 * Temporarily opens (and re-closes) the unit if it was not open.
 */
int
raidsize(dev_t dev)
{
	struct raid_softc *rs;
	struct disklabel *lp;
	int part, unit, omask, size;

	unit = raidunit(dev);
	if (unit >= numraid)
		return (-1);
	rs = &raid_softc[unit];

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return (-1);

	part = DISKPART(dev);
	omask = rs->sc_dkdev.dk_openmask & (1 << part);
	lp = rs->sc_dkdev.dk_label;

	if (omask == 0 && raidopen(dev, 0, S_IFBLK, curlwp))
		return (-1);

	if (lp->d_partitions[part].p_fstype != FS_SWAP)
		size = -1;
	else
		size = lp->d_partitions[part].p_size *
		    (lp->d_secsize / DEV_BSIZE);

	if (omask == 0 && raidclose(dev, 0, S_IFBLK, curlwp))
		return (-1);

	return (size);

}

/*
 * Crash-dump entry point.  Dumping is only supported onto RAID 1
 * sets: pick a live (or spared) component and pass the dump through
 * to the underlying component's d_dump routine.
 */
int
raiddump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	const struct bdevsw *bdev;
	struct disklabel *lp;
	RF_Raid_t *raidPtr;
	daddr_t offset;
	int part, c, sparecol, j, scol, dumpto;
	int error = 0;

	if (unit >= numraid)
		return (ENXIO);

	rs = &raid_softc[unit];
	raidPtr = raidPtrs[unit];

	if ((rs->sc_flags & RAIDF_INITED) == 0)
		return ENXIO;

	/* we only support dumping to RAID 1 sets */
	if (raidPtr->Layout.numDataCol != 1 ||
	    raidPtr->Layout.numParityCol != 1)
		return EINVAL;


	if ((error = raidlock(rs)) != 0)
		return error;

	/* dumps are done in whole DEV_BSIZE blocks */
	if (size % DEV_BSIZE != 0) {
		error = EINVAL;
		goto out;
	}

	if (blkno + size / DEV_BSIZE > rs->sc_size) {
		/*
		 * NOTE(review): blkno is daddr_t (signed) but printed
		 * with PRIu64 -- harmless for sane values; consider
		 * PRId64.
		 */
		printf("%s: blkno (%" PRIu64 ") + size / DEV_BSIZE (%zu) > "
		    "sc->sc_size (%" PRIu64 ")\n", __func__, blkno,
		    size / DEV_BSIZE, rs->sc_size);
		error = EINVAL;
		goto out;
	}

	part = DISKPART(dev);
	lp = rs->sc_dkdev.dk_label;
	offset = lp->d_partitions[part].p_offset + RF_PROTECTED_SECTORS;

	/* figure out what device is alive.. */

	/*
	   Look for a component to dump to.  The preference for the
	   component to dump to is as follows:
	   1) the master
	   2) a used_spare of the master
	   3) the slave
	   4) a used_spare of the slave
	*/

	dumpto = -1;
	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			/* this might be the one */
			dumpto = c;
			break;
		}
	}

	/*
	   At this point we have possibly selected a live master or a
	   live slave.  We now check to see if there is a spared
	   master (or a spared slave), if we didn't find a live master
	   or a live slave.
	*/

	for (c = 0; c < raidPtr->numSpare; c++) {
		sparecol = raidPtr->numCol + c;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/* How about this one? */
			scol = -1;
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}
			if (scol == 0) {
				/*
				   We must have found a spared master!
				   We'll take that over anything else
				   found so far.  (We couldn't have
				   found a real master before, since
				   this is a used spare, and it's
				   saying that it's replacing the
				   master.)  On reboot (with
				   autoconfiguration turned on)
				   sparecol will become the 1st
				   component (component0) of this set.
				*/
				dumpto = sparecol;
				break;
			} else if (scol != -1) {
				/*
				   Must be a spared slave.  We'll dump
				   to that if we haven't found anything
				   else so far.
				*/
				if (dumpto == -1)
					dumpto = sparecol;
			}
		}
	}

	if (dumpto == -1) {
		/* we couldn't find any live components to dump to!?!?
		 */
		error = EINVAL;
		goto out;
	}

	bdev = bdevsw_lookup(raidPtr->Disks[dumpto].dev);

	/*
	   Note that blkno is relative to this particular partition.
	   By adding the offset of this partition in the RAID
	   set, and also adding RF_PROTECTED_SECTORS, we get a
	   value that is relative to the partition used for the
	   underlying component.
	*/

	error = (*bdev->d_dump)(raidPtr->Disks[dumpto].dev,
	    blkno + offset, va, size);

out:
	raidunlock(rs);

	return error;
}

/*
 * Open the raid device: validate the unit and partition, read the
 * disklabel on first open, and mark all components dirty on the
 * first open of a configured set.
 */
/* ARGSUSED */
int
raidopen(dev_t dev, int flags, int fmt,
    struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	struct disklabel *lp;
	int part, pmask;
	int error = 0;

	if (unit >= numraid)
		return (ENXIO);
	rs = &raid_softc[unit];

	if ((error = raidlock(rs)) != 0)
		return (error);

	if ((rs->sc_flags & RAIDF_SHUTDOWN) != 0) {
		error = EBUSY;
		goto bad;
	}

	lp = rs->sc_dkdev.dk_label;

	part = DISKPART(dev);

	/*
	 * If there are wedges, and this is not RAW_PART, then we
	 * need to fail.
 */
	if (rs->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
		error = EBUSY;
		goto bad;
	}
	pmask = (1 << part);

	/* re-read the disklabel on the first open of a configured set */
	if ((rs->sc_flags & RAIDF_INITED) &&
	    (rs->sc_dkdev.dk_openmask == 0))
		raidgetdisklabel(dev);

	/* make sure that this partition exists */

	if (part != RAW_PART) {
		if (((rs->sc_flags & RAIDF_INITED) == 0) ||
		    ((part >= lp->d_npartitions) ||
		     (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
			error = ENXIO;
			goto bad;
		}
	}
	/* Prevent this unit from being unconfigured while open. */
	switch (fmt) {
	case S_IFCHR:
		rs->sc_dkdev.dk_copenmask |= pmask;
		break;

	case S_IFBLK:
		rs->sc_dkdev.dk_bopenmask |= pmask;
		break;
	}

	if ((rs->sc_dkdev.dk_openmask == 0) &&
	    ((rs->sc_flags & RAIDF_INITED) != 0)) {
		/* First one... mark things as dirty... Note that we *MUST*
		 have done a configure before this.  I DO NOT WANT TO BE
		 SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED
		 THAT THEY BELONG TOGETHER!!!!! */
		/* XXX should check to see if we're only open for reading
		   here... If so, we needn't do this, but then need some
		   other way of keeping track of what's happened.. */

		rf_markalldirty(raidPtrs[unit]);
	}


	rs->sc_dkdev.dk_openmask =
	    rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;

bad:
	raidunlock(rs);

	return (error);


}

/*
 * Close the raid device: clear the per-partition open masks and, on
 * last close of a configured set, write the final ("clean")
 * component labels.
 */
/* ARGSUSED */
int
raidclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
	int unit = raidunit(dev);
	struct raid_softc *rs;
	int error = 0;
	int part;

	if (unit >= numraid)
		return (ENXIO);
	rs = &raid_softc[unit];

	if ((error = raidlock(rs)) != 0)
		return (error);

	part = DISKPART(dev);

	/* ...that much closer to allowing unconfiguration...
*/ 805 switch (fmt) { 806 case S_IFCHR: 807 rs->sc_dkdev.dk_copenmask &= ~(1 << part); 808 break; 809 810 case S_IFBLK: 811 rs->sc_dkdev.dk_bopenmask &= ~(1 << part); 812 break; 813 } 814 rs->sc_dkdev.dk_openmask = 815 rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask; 816 817 if ((rs->sc_dkdev.dk_openmask == 0) && 818 ((rs->sc_flags & RAIDF_INITED) != 0)) { 819 /* Last one... device is not unconfigured yet. 820 Device shutdown has taken care of setting the 821 clean bits if RAIDF_INITED is not set 822 mark things as clean... */ 823 824 rf_update_component_labels(raidPtrs[unit], 825 RF_FINAL_COMPONENT_UPDATE); 826 827 /* If the kernel is shutting down, it will detach 828 * this RAID set soon enough. 829 */ 830 } 831 832 raidunlock(rs); 833 return (0); 834 835 } 836 837 void 838 raidstrategy(struct buf *bp) 839 { 840 unsigned int raidID = raidunit(bp->b_dev); 841 RF_Raid_t *raidPtr; 842 struct raid_softc *rs = &raid_softc[raidID]; 843 int wlabel; 844 845 if ((rs->sc_flags & RAIDF_INITED) ==0) { 846 bp->b_error = ENXIO; 847 goto done; 848 } 849 if (raidID >= numraid || !raidPtrs[raidID]) { 850 bp->b_error = ENODEV; 851 goto done; 852 } 853 raidPtr = raidPtrs[raidID]; 854 if (!raidPtr->valid) { 855 bp->b_error = ENODEV; 856 goto done; 857 } 858 if (bp->b_bcount == 0) { 859 db1_printf(("b_bcount is zero..\n")); 860 goto done; 861 } 862 863 /* 864 * Do bounds checking and adjust transfer. If there's an 865 * error, the bounds check will flag that for us. 
866 */ 867 868 wlabel = rs->sc_flags & (RAIDF_WLABEL | RAIDF_LABELLING); 869 if (DISKPART(bp->b_dev) == RAW_PART) { 870 uint64_t size; /* device size in DEV_BSIZE unit */ 871 872 if (raidPtr->logBytesPerSector > DEV_BSHIFT) { 873 size = raidPtr->totalSectors << 874 (raidPtr->logBytesPerSector - DEV_BSHIFT); 875 } else { 876 size = raidPtr->totalSectors >> 877 (DEV_BSHIFT - raidPtr->logBytesPerSector); 878 } 879 if (bounds_check_with_mediasize(bp, DEV_BSIZE, size) <= 0) { 880 goto done; 881 } 882 } else { 883 if (bounds_check_with_label(&rs->sc_dkdev, bp, wlabel) <= 0) { 884 db1_printf(("Bounds check failed!!:%d %d\n", 885 (int) bp->b_blkno, (int) wlabel)); 886 goto done; 887 } 888 } 889 890 rf_lock_mutex2(raidPtr->iodone_lock); 891 892 bp->b_resid = 0; 893 894 /* stuff it onto our queue */ 895 bufq_put(rs->buf_queue, bp); 896 897 /* scheduled the IO to happen at the next convenient time */ 898 rf_signal_cond2(raidPtr->iodone_cv); 899 rf_unlock_mutex2(raidPtr->iodone_lock); 900 901 return; 902 903 done: 904 bp->b_resid = bp->b_bcount; 905 biodone(bp); 906 } 907 /* ARGSUSED */ 908 int 909 raidread(dev_t dev, struct uio *uio, int flags) 910 { 911 int unit = raidunit(dev); 912 struct raid_softc *rs; 913 914 if (unit >= numraid) 915 return (ENXIO); 916 rs = &raid_softc[unit]; 917 918 if ((rs->sc_flags & RAIDF_INITED) == 0) 919 return (ENXIO); 920 921 return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio)); 922 923 } 924 /* ARGSUSED */ 925 int 926 raidwrite(dev_t dev, struct uio *uio, int flags) 927 { 928 int unit = raidunit(dev); 929 struct raid_softc *rs; 930 931 if (unit >= numraid) 932 return (ENXIO); 933 rs = &raid_softc[unit]; 934 935 if ((rs->sc_flags & RAIDF_INITED) == 0) 936 return (ENXIO); 937 938 return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio)); 939 940 } 941 942 static int 943 raid_detach_unlocked(struct raid_softc *rs) 944 { 945 int error; 946 RF_Raid_t *raidPtr; 947 948 raidPtr = raidPtrs[device_unit(rs->sc_dev)]; 949 950 /* 951 * If 
somebody has a partition mounted, we shouldn't 952 * shutdown. 953 */ 954 if (rs->sc_dkdev.dk_openmask != 0) 955 return EBUSY; 956 957 if ((rs->sc_flags & RAIDF_INITED) == 0) 958 ; /* not initialized: nothing to do */ 959 else if ((error = rf_Shutdown(raidPtr)) != 0) 960 return error; 961 else 962 rs->sc_flags &= ~(RAIDF_INITED|RAIDF_SHUTDOWN); 963 964 /* Detach the disk. */ 965 dkwedge_delall(&rs->sc_dkdev); 966 disk_detach(&rs->sc_dkdev); 967 disk_destroy(&rs->sc_dkdev); 968 969 aprint_normal_dev(rs->sc_dev, "detached\n"); 970 971 return 0; 972 } 973 974 int 975 raidioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l) 976 { 977 int unit = raidunit(dev); 978 int error = 0; 979 int part, pmask; 980 cfdata_t cf; 981 struct raid_softc *rs; 982 RF_Config_t *k_cfg, *u_cfg; 983 RF_Raid_t *raidPtr; 984 RF_RaidDisk_t *diskPtr; 985 RF_AccTotals_t *totals; 986 RF_DeviceConfig_t *d_cfg, **ucfgp; 987 u_char *specific_buf; 988 int retcode = 0; 989 int column; 990 /* int raidid; */ 991 struct rf_recon_req *rrcopy, *rr; 992 RF_ComponentLabel_t *clabel; 993 RF_ComponentLabel_t *ci_label; 994 RF_ComponentLabel_t **clabel_ptr; 995 RF_SingleComponent_t *sparePtr,*componentPtr; 996 RF_SingleComponent_t component; 997 RF_ProgressInfo_t progressInfo, **progressInfoPtr; 998 int i, j, d; 999 #ifdef __HAVE_OLD_DISKLABEL 1000 struct disklabel newlabel; 1001 #endif 1002 struct dkwedge_info *dkw; 1003 1004 if (unit >= numraid) 1005 return (ENXIO); 1006 rs = &raid_softc[unit]; 1007 raidPtr = raidPtrs[unit]; 1008 1009 db1_printf(("raidioctl: %d %d %d %lu\n", (int) dev, 1010 (int) DISKPART(dev), (int) unit, cmd)); 1011 1012 /* Must be open for writes for these commands... 
*/ 1013 switch (cmd) { 1014 #ifdef DIOCGSECTORSIZE 1015 case DIOCGSECTORSIZE: 1016 *(u_int *)data = raidPtr->bytesPerSector; 1017 return 0; 1018 case DIOCGMEDIASIZE: 1019 *(off_t *)data = 1020 (off_t)raidPtr->totalSectors * raidPtr->bytesPerSector; 1021 return 0; 1022 #endif 1023 case DIOCSDINFO: 1024 case DIOCWDINFO: 1025 #ifdef __HAVE_OLD_DISKLABEL 1026 case ODIOCWDINFO: 1027 case ODIOCSDINFO: 1028 #endif 1029 case DIOCWLABEL: 1030 case DIOCAWEDGE: 1031 case DIOCDWEDGE: 1032 if ((flag & FWRITE) == 0) 1033 return (EBADF); 1034 } 1035 1036 /* Must be initialized for these... */ 1037 switch (cmd) { 1038 case DIOCGDINFO: 1039 case DIOCSDINFO: 1040 case DIOCWDINFO: 1041 #ifdef __HAVE_OLD_DISKLABEL 1042 case ODIOCGDINFO: 1043 case ODIOCWDINFO: 1044 case ODIOCSDINFO: 1045 case ODIOCGDEFLABEL: 1046 #endif 1047 case DIOCGPART: 1048 case DIOCWLABEL: 1049 case DIOCGDEFLABEL: 1050 case DIOCAWEDGE: 1051 case DIOCDWEDGE: 1052 case DIOCLWEDGES: 1053 case DIOCCACHESYNC: 1054 case RAIDFRAME_SHUTDOWN: 1055 case RAIDFRAME_REWRITEPARITY: 1056 case RAIDFRAME_GET_INFO: 1057 case RAIDFRAME_RESET_ACCTOTALS: 1058 case RAIDFRAME_GET_ACCTOTALS: 1059 case RAIDFRAME_KEEP_ACCTOTALS: 1060 case RAIDFRAME_GET_SIZE: 1061 case RAIDFRAME_FAIL_DISK: 1062 case RAIDFRAME_COPYBACK: 1063 case RAIDFRAME_CHECK_RECON_STATUS: 1064 case RAIDFRAME_CHECK_RECON_STATUS_EXT: 1065 case RAIDFRAME_GET_COMPONENT_LABEL: 1066 case RAIDFRAME_SET_COMPONENT_LABEL: 1067 case RAIDFRAME_ADD_HOT_SPARE: 1068 case RAIDFRAME_REMOVE_HOT_SPARE: 1069 case RAIDFRAME_INIT_LABELS: 1070 case RAIDFRAME_REBUILD_IN_PLACE: 1071 case RAIDFRAME_CHECK_PARITY: 1072 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS: 1073 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT: 1074 case RAIDFRAME_CHECK_COPYBACK_STATUS: 1075 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT: 1076 case RAIDFRAME_SET_AUTOCONFIG: 1077 case RAIDFRAME_SET_ROOT: 1078 case RAIDFRAME_DELETE_COMPONENT: 1079 case RAIDFRAME_INCORPORATE_HOT_SPARE: 1080 case RAIDFRAME_PARITYMAP_STATUS: 1081 case 
RAIDFRAME_PARITYMAP_GET_DISABLE: 1082 case RAIDFRAME_PARITYMAP_SET_DISABLE: 1083 case RAIDFRAME_PARITYMAP_SET_PARAMS: 1084 if ((rs->sc_flags & RAIDF_INITED) == 0) 1085 return (ENXIO); 1086 } 1087 1088 switch (cmd) { 1089 #ifdef COMPAT_50 1090 case RAIDFRAME_GET_INFO50: 1091 return rf_get_info50(raidPtr, data); 1092 1093 case RAIDFRAME_CONFIGURE50: 1094 if ((retcode = rf_config50(raidPtr, unit, data, &k_cfg)) != 0) 1095 return retcode; 1096 goto config; 1097 #endif 1098 /* configure the system */ 1099 case RAIDFRAME_CONFIGURE: 1100 1101 if (raidPtr->valid) { 1102 /* There is a valid RAID set running on this unit! */ 1103 printf("raid%d: Device already configured!\n",unit); 1104 return(EINVAL); 1105 } 1106 1107 /* copy-in the configuration information */ 1108 /* data points to a pointer to the configuration structure */ 1109 1110 u_cfg = *((RF_Config_t **) data); 1111 RF_Malloc(k_cfg, sizeof(RF_Config_t), (RF_Config_t *)); 1112 if (k_cfg == NULL) { 1113 return (ENOMEM); 1114 } 1115 retcode = copyin(u_cfg, k_cfg, sizeof(RF_Config_t)); 1116 if (retcode) { 1117 RF_Free(k_cfg, sizeof(RF_Config_t)); 1118 db1_printf(("rf_ioctl: retcode=%d copyin.1\n", 1119 retcode)); 1120 return (retcode); 1121 } 1122 goto config; 1123 config: 1124 /* allocate a buffer for the layout-specific data, and copy it 1125 * in */ 1126 if (k_cfg->layoutSpecificSize) { 1127 if (k_cfg->layoutSpecificSize > 10000) { 1128 /* sanity check */ 1129 RF_Free(k_cfg, sizeof(RF_Config_t)); 1130 return (EINVAL); 1131 } 1132 RF_Malloc(specific_buf, k_cfg->layoutSpecificSize, 1133 (u_char *)); 1134 if (specific_buf == NULL) { 1135 RF_Free(k_cfg, sizeof(RF_Config_t)); 1136 return (ENOMEM); 1137 } 1138 retcode = copyin(k_cfg->layoutSpecific, specific_buf, 1139 k_cfg->layoutSpecificSize); 1140 if (retcode) { 1141 RF_Free(k_cfg, sizeof(RF_Config_t)); 1142 RF_Free(specific_buf, 1143 k_cfg->layoutSpecificSize); 1144 db1_printf(("rf_ioctl: retcode=%d copyin.2\n", 1145 retcode)); 1146 return (retcode); 1147 } 1148 } 
else 1149 specific_buf = NULL; 1150 k_cfg->layoutSpecific = specific_buf; 1151 1152 /* should do some kind of sanity check on the configuration. 1153 * Store the sum of all the bytes in the last byte? */ 1154 1155 /* configure the system */ 1156 1157 /* 1158 * Clear the entire RAID descriptor, just to make sure 1159 * there is no stale data left in the case of a 1160 * reconfiguration 1161 */ 1162 memset(raidPtr, 0, sizeof(*raidPtr)); 1163 raidPtr->raidid = unit; 1164 1165 retcode = rf_Configure(raidPtr, k_cfg, NULL); 1166 1167 if (retcode == 0) { 1168 1169 /* allow this many simultaneous IO's to 1170 this RAID device */ 1171 raidPtr->openings = RAIDOUTSTANDING; 1172 1173 raidinit(raidPtr); 1174 rf_markalldirty(raidPtr); 1175 } 1176 /* free the buffers. No return code here. */ 1177 if (k_cfg->layoutSpecificSize) { 1178 RF_Free(specific_buf, k_cfg->layoutSpecificSize); 1179 } 1180 RF_Free(k_cfg, sizeof(RF_Config_t)); 1181 1182 return (retcode); 1183 1184 /* shutdown the system */ 1185 case RAIDFRAME_SHUTDOWN: 1186 1187 part = DISKPART(dev); 1188 pmask = (1 << part); 1189 1190 if ((error = raidlock(rs)) != 0) 1191 return (error); 1192 1193 if ((rs->sc_dkdev.dk_openmask & ~pmask) || 1194 ((rs->sc_dkdev.dk_bopenmask & pmask) && 1195 (rs->sc_dkdev.dk_copenmask & pmask))) 1196 retcode = EBUSY; 1197 else { 1198 rs->sc_flags |= RAIDF_SHUTDOWN; 1199 rs->sc_dkdev.dk_copenmask &= ~pmask; 1200 rs->sc_dkdev.dk_bopenmask &= ~pmask; 1201 rs->sc_dkdev.dk_openmask &= ~pmask; 1202 retcode = 0; 1203 } 1204 1205 raidunlock(rs); 1206 1207 if (retcode != 0) 1208 return retcode; 1209 1210 /* free the pseudo device attach bits */ 1211 1212 cf = device_cfdata(rs->sc_dev); 1213 if ((retcode = config_detach(rs->sc_dev, DETACH_QUIET)) == 0) 1214 free(cf, M_RAIDFRAME); 1215 1216 return (retcode); 1217 case RAIDFRAME_GET_COMPONENT_LABEL: 1218 clabel_ptr = (RF_ComponentLabel_t **) data; 1219 /* need to read the component label for the disk indicated 1220 by row,column in clabel */ 1221 1222 /* 
1223 * Perhaps there should be an option to skip the in-core 1224 * copy and hit the disk, as with disklabel(8). 1225 */ 1226 RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *)); 1227 1228 retcode = copyin(*clabel_ptr, clabel, sizeof(*clabel)); 1229 1230 if (retcode) { 1231 RF_Free(clabel, sizeof(*clabel)); 1232 return retcode; 1233 } 1234 1235 clabel->row = 0; /* Don't allow looking at anything else.*/ 1236 1237 column = clabel->column; 1238 1239 if ((column < 0) || (column >= raidPtr->numCol + 1240 raidPtr->numSpare)) { 1241 RF_Free(clabel, sizeof(*clabel)); 1242 return EINVAL; 1243 } 1244 1245 RF_Free(clabel, sizeof(*clabel)); 1246 1247 clabel = raidget_component_label(raidPtr, column); 1248 1249 return copyout(clabel, *clabel_ptr, sizeof(**clabel_ptr)); 1250 1251 #if 0 1252 case RAIDFRAME_SET_COMPONENT_LABEL: 1253 clabel = (RF_ComponentLabel_t *) data; 1254 1255 /* XXX check the label for valid stuff... */ 1256 /* Note that some things *should not* get modified -- 1257 the user should be re-initing the labels instead of 1258 trying to patch things. 1259 */ 1260 1261 raidid = raidPtr->raidid; 1262 #ifdef DEBUG 1263 printf("raid%d: Got component label:\n", raidid); 1264 printf("raid%d: Version: %d\n", raidid, clabel->version); 1265 printf("raid%d: Serial Number: %d\n", raidid, clabel->serial_number); 1266 printf("raid%d: Mod counter: %d\n", raidid, clabel->mod_counter); 1267 printf("raid%d: Column: %d\n", raidid, clabel->column); 1268 printf("raid%d: Num Columns: %d\n", raidid, clabel->num_columns); 1269 printf("raid%d: Clean: %d\n", raidid, clabel->clean); 1270 printf("raid%d: Status: %d\n", raidid, clabel->status); 1271 #endif 1272 clabel->row = 0; 1273 column = clabel->column; 1274 1275 if ((column < 0) || (column >= raidPtr->numCol)) { 1276 return(EINVAL); 1277 } 1278 1279 /* XXX this isn't allowed to do anything for now :-) */ 1280 1281 /* XXX and before it is, we need to fill in the rest 1282 of the fields!?!?!?! 
*/ 1283 memcpy(raidget_component_label(raidPtr, column), 1284 clabel, sizeof(*clabel)); 1285 raidflush_component_label(raidPtr, column); 1286 return (0); 1287 #endif 1288 1289 case RAIDFRAME_INIT_LABELS: 1290 clabel = (RF_ComponentLabel_t *) data; 1291 /* 1292 we only want the serial number from 1293 the above. We get all the rest of the information 1294 from the config that was used to create this RAID 1295 set. 1296 */ 1297 1298 raidPtr->serial_number = clabel->serial_number; 1299 1300 for(column=0;column<raidPtr->numCol;column++) { 1301 diskPtr = &raidPtr->Disks[column]; 1302 if (!RF_DEAD_DISK(diskPtr->status)) { 1303 ci_label = raidget_component_label(raidPtr, 1304 column); 1305 /* Zeroing this is important. */ 1306 memset(ci_label, 0, sizeof(*ci_label)); 1307 raid_init_component_label(raidPtr, ci_label); 1308 ci_label->serial_number = 1309 raidPtr->serial_number; 1310 ci_label->row = 0; /* we dont' pretend to support more */ 1311 rf_component_label_set_partitionsize(ci_label, 1312 diskPtr->partitionSize); 1313 ci_label->column = column; 1314 raidflush_component_label(raidPtr, column); 1315 } 1316 /* XXXjld what about the spares? */ 1317 } 1318 1319 return (retcode); 1320 case RAIDFRAME_SET_AUTOCONFIG: 1321 d = rf_set_autoconfig(raidPtr, *(int *) data); 1322 printf("raid%d: New autoconfig value is: %d\n", 1323 raidPtr->raidid, d); 1324 *(int *) data = d; 1325 return (retcode); 1326 1327 case RAIDFRAME_SET_ROOT: 1328 d = rf_set_rootpartition(raidPtr, *(int *) data); 1329 printf("raid%d: New rootpartition value is: %d\n", 1330 raidPtr->raidid, d); 1331 *(int *) data = d; 1332 return (retcode); 1333 1334 /* initialize all parity */ 1335 case RAIDFRAME_REWRITEPARITY: 1336 1337 if (raidPtr->Layout.map->faultsTolerated == 0) { 1338 /* Parity for RAID 0 is trivially correct */ 1339 raidPtr->parity_good = RF_RAID_CLEAN; 1340 return(0); 1341 } 1342 1343 if (raidPtr->parity_rewrite_in_progress == 1) { 1344 /* Re-write is already in progress! 
*/ 1345 return(EINVAL); 1346 } 1347 1348 retcode = RF_CREATE_THREAD(raidPtr->parity_rewrite_thread, 1349 rf_RewriteParityThread, 1350 raidPtr,"raid_parity"); 1351 return (retcode); 1352 1353 1354 case RAIDFRAME_ADD_HOT_SPARE: 1355 sparePtr = (RF_SingleComponent_t *) data; 1356 memcpy( &component, sparePtr, sizeof(RF_SingleComponent_t)); 1357 retcode = rf_add_hot_spare(raidPtr, &component); 1358 return(retcode); 1359 1360 case RAIDFRAME_REMOVE_HOT_SPARE: 1361 return(retcode); 1362 1363 case RAIDFRAME_DELETE_COMPONENT: 1364 componentPtr = (RF_SingleComponent_t *)data; 1365 memcpy( &component, componentPtr, 1366 sizeof(RF_SingleComponent_t)); 1367 retcode = rf_delete_component(raidPtr, &component); 1368 return(retcode); 1369 1370 case RAIDFRAME_INCORPORATE_HOT_SPARE: 1371 componentPtr = (RF_SingleComponent_t *)data; 1372 memcpy( &component, componentPtr, 1373 sizeof(RF_SingleComponent_t)); 1374 retcode = rf_incorporate_hot_spare(raidPtr, &component); 1375 return(retcode); 1376 1377 case RAIDFRAME_REBUILD_IN_PLACE: 1378 1379 if (raidPtr->Layout.map->faultsTolerated == 0) { 1380 /* Can't do this on a RAID 0!! */ 1381 return(EINVAL); 1382 } 1383 1384 if (raidPtr->recon_in_progress == 1) { 1385 /* a reconstruct is already in progress! */ 1386 return(EINVAL); 1387 } 1388 1389 componentPtr = (RF_SingleComponent_t *) data; 1390 memcpy( &component, componentPtr, 1391 sizeof(RF_SingleComponent_t)); 1392 component.row = 0; /* we don't support any more */ 1393 column = component.column; 1394 1395 if ((column < 0) || (column >= raidPtr->numCol)) { 1396 return(EINVAL); 1397 } 1398 1399 rf_lock_mutex2(raidPtr->mutex); 1400 if ((raidPtr->Disks[column].status == rf_ds_optimal) && 1401 (raidPtr->numFailures > 0)) { 1402 /* XXX 0 above shouldn't be constant!!! */ 1403 /* some component other than this has failed. 1404 Let's not make things worse than they already 1405 are... 
*/ 1406 printf("raid%d: Unable to reconstruct to disk at:\n", 1407 raidPtr->raidid); 1408 printf("raid%d: Col: %d Too many failures.\n", 1409 raidPtr->raidid, column); 1410 rf_unlock_mutex2(raidPtr->mutex); 1411 return (EINVAL); 1412 } 1413 if (raidPtr->Disks[column].status == 1414 rf_ds_reconstructing) { 1415 printf("raid%d: Unable to reconstruct to disk at:\n", 1416 raidPtr->raidid); 1417 printf("raid%d: Col: %d Reconstruction already occuring!\n", raidPtr->raidid, column); 1418 1419 rf_unlock_mutex2(raidPtr->mutex); 1420 return (EINVAL); 1421 } 1422 if (raidPtr->Disks[column].status == rf_ds_spared) { 1423 rf_unlock_mutex2(raidPtr->mutex); 1424 return (EINVAL); 1425 } 1426 rf_unlock_mutex2(raidPtr->mutex); 1427 1428 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *)); 1429 if (rrcopy == NULL) 1430 return(ENOMEM); 1431 1432 rrcopy->raidPtr = (void *) raidPtr; 1433 rrcopy->col = column; 1434 1435 retcode = RF_CREATE_THREAD(raidPtr->recon_thread, 1436 rf_ReconstructInPlaceThread, 1437 rrcopy,"raid_reconip"); 1438 return(retcode); 1439 1440 case RAIDFRAME_GET_INFO: 1441 if (!raidPtr->valid) 1442 return (ENODEV); 1443 ucfgp = (RF_DeviceConfig_t **) data; 1444 RF_Malloc(d_cfg, sizeof(RF_DeviceConfig_t), 1445 (RF_DeviceConfig_t *)); 1446 if (d_cfg == NULL) 1447 return (ENOMEM); 1448 d_cfg->rows = 1; /* there is only 1 row now */ 1449 d_cfg->cols = raidPtr->numCol; 1450 d_cfg->ndevs = raidPtr->numCol; 1451 if (d_cfg->ndevs >= RF_MAX_DISKS) { 1452 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t)); 1453 return (ENOMEM); 1454 } 1455 d_cfg->nspares = raidPtr->numSpare; 1456 if (d_cfg->nspares >= RF_MAX_DISKS) { 1457 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t)); 1458 return (ENOMEM); 1459 } 1460 d_cfg->maxqdepth = raidPtr->maxQueueDepth; 1461 d = 0; 1462 for (j = 0; j < d_cfg->cols; j++) { 1463 d_cfg->devs[d] = raidPtr->Disks[j]; 1464 d++; 1465 } 1466 for (j = d_cfg->cols, i = 0; i < d_cfg->nspares; i++, j++) { 1467 d_cfg->spares[i] = raidPtr->Disks[j]; 1468 } 1469 retcode = 
copyout(d_cfg, *ucfgp, sizeof(RF_DeviceConfig_t)); 1470 RF_Free(d_cfg, sizeof(RF_DeviceConfig_t)); 1471 1472 return (retcode); 1473 1474 case RAIDFRAME_CHECK_PARITY: 1475 *(int *) data = raidPtr->parity_good; 1476 return (0); 1477 1478 case RAIDFRAME_PARITYMAP_STATUS: 1479 if (rf_paritymap_ineligible(raidPtr)) 1480 return EINVAL; 1481 rf_paritymap_status(raidPtr->parity_map, 1482 (struct rf_pmstat *)data); 1483 return 0; 1484 1485 case RAIDFRAME_PARITYMAP_SET_PARAMS: 1486 if (rf_paritymap_ineligible(raidPtr)) 1487 return EINVAL; 1488 if (raidPtr->parity_map == NULL) 1489 return ENOENT; /* ??? */ 1490 if (0 != rf_paritymap_set_params(raidPtr->parity_map, 1491 (struct rf_pmparams *)data, 1)) 1492 return EINVAL; 1493 return 0; 1494 1495 case RAIDFRAME_PARITYMAP_GET_DISABLE: 1496 if (rf_paritymap_ineligible(raidPtr)) 1497 return EINVAL; 1498 *(int *) data = rf_paritymap_get_disable(raidPtr); 1499 return 0; 1500 1501 case RAIDFRAME_PARITYMAP_SET_DISABLE: 1502 if (rf_paritymap_ineligible(raidPtr)) 1503 return EINVAL; 1504 rf_paritymap_set_disable(raidPtr, *(int *)data); 1505 /* XXX should errors be passed up? */ 1506 return 0; 1507 1508 case RAIDFRAME_RESET_ACCTOTALS: 1509 memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals)); 1510 return (0); 1511 1512 case RAIDFRAME_GET_ACCTOTALS: 1513 totals = (RF_AccTotals_t *) data; 1514 *totals = raidPtr->acc_totals; 1515 return (0); 1516 1517 case RAIDFRAME_KEEP_ACCTOTALS: 1518 raidPtr->keep_acc_totals = *(int *)data; 1519 return (0); 1520 1521 case RAIDFRAME_GET_SIZE: 1522 *(int *) data = raidPtr->totalSectors; 1523 return (0); 1524 1525 /* fail a disk & optionally start reconstruction */ 1526 case RAIDFRAME_FAIL_DISK: 1527 1528 if (raidPtr->Layout.map->faultsTolerated == 0) { 1529 /* Can't do this on a RAID 0!! 
*/ 1530 return(EINVAL); 1531 } 1532 1533 rr = (struct rf_recon_req *) data; 1534 rr->row = 0; 1535 if (rr->col < 0 || rr->col >= raidPtr->numCol) 1536 return (EINVAL); 1537 1538 1539 rf_lock_mutex2(raidPtr->mutex); 1540 if (raidPtr->status == rf_rs_reconstructing) { 1541 /* you can't fail a disk while we're reconstructing! */ 1542 /* XXX wrong for RAID6 */ 1543 rf_unlock_mutex2(raidPtr->mutex); 1544 return (EINVAL); 1545 } 1546 if ((raidPtr->Disks[rr->col].status == 1547 rf_ds_optimal) && (raidPtr->numFailures > 0)) { 1548 /* some other component has failed. Let's not make 1549 things worse. XXX wrong for RAID6 */ 1550 rf_unlock_mutex2(raidPtr->mutex); 1551 return (EINVAL); 1552 } 1553 if (raidPtr->Disks[rr->col].status == rf_ds_spared) { 1554 /* Can't fail a spared disk! */ 1555 rf_unlock_mutex2(raidPtr->mutex); 1556 return (EINVAL); 1557 } 1558 rf_unlock_mutex2(raidPtr->mutex); 1559 1560 /* make a copy of the recon request so that we don't rely on 1561 * the user's buffer */ 1562 RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *)); 1563 if (rrcopy == NULL) 1564 return(ENOMEM); 1565 memcpy(rrcopy, rr, sizeof(*rr)); 1566 rrcopy->raidPtr = (void *) raidPtr; 1567 1568 retcode = RF_CREATE_THREAD(raidPtr->recon_thread, 1569 rf_ReconThread, 1570 rrcopy,"raid_recon"); 1571 return (0); 1572 1573 /* invoke a copyback operation after recon on whatever disk 1574 * needs it, if any */ 1575 case RAIDFRAME_COPYBACK: 1576 1577 if (raidPtr->Layout.map->faultsTolerated == 0) { 1578 /* This makes no sense on a RAID 0!! */ 1579 return(EINVAL); 1580 } 1581 1582 if (raidPtr->copyback_in_progress == 1) { 1583 /* Copyback is already in progress! 
*/ 1584 return(EINVAL); 1585 } 1586 1587 retcode = RF_CREATE_THREAD(raidPtr->copyback_thread, 1588 rf_CopybackThread, 1589 raidPtr,"raid_copyback"); 1590 return (retcode); 1591 1592 /* return the percentage completion of reconstruction */ 1593 case RAIDFRAME_CHECK_RECON_STATUS: 1594 if (raidPtr->Layout.map->faultsTolerated == 0) { 1595 /* This makes no sense on a RAID 0, so tell the 1596 user it's done. */ 1597 *(int *) data = 100; 1598 return(0); 1599 } 1600 if (raidPtr->status != rf_rs_reconstructing) 1601 *(int *) data = 100; 1602 else { 1603 if (raidPtr->reconControl->numRUsTotal > 0) { 1604 *(int *) data = (raidPtr->reconControl->numRUsComplete * 100 / raidPtr->reconControl->numRUsTotal); 1605 } else { 1606 *(int *) data = 0; 1607 } 1608 } 1609 return (0); 1610 case RAIDFRAME_CHECK_RECON_STATUS_EXT: 1611 progressInfoPtr = (RF_ProgressInfo_t **) data; 1612 if (raidPtr->status != rf_rs_reconstructing) { 1613 progressInfo.remaining = 0; 1614 progressInfo.completed = 100; 1615 progressInfo.total = 100; 1616 } else { 1617 progressInfo.total = 1618 raidPtr->reconControl->numRUsTotal; 1619 progressInfo.completed = 1620 raidPtr->reconControl->numRUsComplete; 1621 progressInfo.remaining = progressInfo.total - 1622 progressInfo.completed; 1623 } 1624 retcode = copyout(&progressInfo, *progressInfoPtr, 1625 sizeof(RF_ProgressInfo_t)); 1626 return (retcode); 1627 1628 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS: 1629 if (raidPtr->Layout.map->faultsTolerated == 0) { 1630 /* This makes no sense on a RAID 0, so tell the 1631 user it's done. 
*/ 1632 *(int *) data = 100; 1633 return(0); 1634 } 1635 if (raidPtr->parity_rewrite_in_progress == 1) { 1636 *(int *) data = 100 * 1637 raidPtr->parity_rewrite_stripes_done / 1638 raidPtr->Layout.numStripe; 1639 } else { 1640 *(int *) data = 100; 1641 } 1642 return (0); 1643 1644 case RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT: 1645 progressInfoPtr = (RF_ProgressInfo_t **) data; 1646 if (raidPtr->parity_rewrite_in_progress == 1) { 1647 progressInfo.total = raidPtr->Layout.numStripe; 1648 progressInfo.completed = 1649 raidPtr->parity_rewrite_stripes_done; 1650 progressInfo.remaining = progressInfo.total - 1651 progressInfo.completed; 1652 } else { 1653 progressInfo.remaining = 0; 1654 progressInfo.completed = 100; 1655 progressInfo.total = 100; 1656 } 1657 retcode = copyout(&progressInfo, *progressInfoPtr, 1658 sizeof(RF_ProgressInfo_t)); 1659 return (retcode); 1660 1661 case RAIDFRAME_CHECK_COPYBACK_STATUS: 1662 if (raidPtr->Layout.map->faultsTolerated == 0) { 1663 /* This makes no sense on a RAID 0 */ 1664 *(int *) data = 100; 1665 return(0); 1666 } 1667 if (raidPtr->copyback_in_progress == 1) { 1668 *(int *) data = 100 * raidPtr->copyback_stripes_done / 1669 raidPtr->Layout.numStripe; 1670 } else { 1671 *(int *) data = 100; 1672 } 1673 return (0); 1674 1675 case RAIDFRAME_CHECK_COPYBACK_STATUS_EXT: 1676 progressInfoPtr = (RF_ProgressInfo_t **) data; 1677 if (raidPtr->copyback_in_progress == 1) { 1678 progressInfo.total = raidPtr->Layout.numStripe; 1679 progressInfo.completed = 1680 raidPtr->copyback_stripes_done; 1681 progressInfo.remaining = progressInfo.total - 1682 progressInfo.completed; 1683 } else { 1684 progressInfo.remaining = 0; 1685 progressInfo.completed = 100; 1686 progressInfo.total = 100; 1687 } 1688 retcode = copyout(&progressInfo, *progressInfoPtr, 1689 sizeof(RF_ProgressInfo_t)); 1690 return (retcode); 1691 1692 /* the sparetable daemon calls this to wait for the kernel to 1693 * need a spare table. 
this ioctl does not return until a 1694 * spare table is needed. XXX -- calling mpsleep here in the 1695 * ioctl code is almost certainly wrong and evil. -- XXX XXX 1696 * -- I should either compute the spare table in the kernel, 1697 * or have a different -- XXX XXX -- interface (a different 1698 * character device) for delivering the table -- XXX */ 1699 #if 0 1700 case RAIDFRAME_SPARET_WAIT: 1701 rf_lock_mutex2(rf_sparet_wait_mutex); 1702 while (!rf_sparet_wait_queue) 1703 rf_wait_cond2(rf_sparet_wait_cv, rf_sparet_wait_mutex); 1704 waitreq = rf_sparet_wait_queue; 1705 rf_sparet_wait_queue = rf_sparet_wait_queue->next; 1706 rf_unlock_mutex2(rf_sparet_wait_mutex); 1707 1708 /* structure assignment */ 1709 *((RF_SparetWait_t *) data) = *waitreq; 1710 1711 RF_Free(waitreq, sizeof(*waitreq)); 1712 return (0); 1713 1714 /* wakes up a process waiting on SPARET_WAIT and puts an error 1715 * code in it that will cause the dameon to exit */ 1716 case RAIDFRAME_ABORT_SPARET_WAIT: 1717 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *)); 1718 waitreq->fcol = -1; 1719 rf_lock_mutex2(rf_sparet_wait_mutex); 1720 waitreq->next = rf_sparet_wait_queue; 1721 rf_sparet_wait_queue = waitreq; 1722 rf_broadcast_conf2(rf_sparet_wait_cv); 1723 rf_unlock_mutex2(rf_sparet_wait_mutex); 1724 return (0); 1725 1726 /* used by the spare table daemon to deliver a spare table 1727 * into the kernel */ 1728 case RAIDFRAME_SEND_SPARET: 1729 1730 /* install the spare table */ 1731 retcode = rf_SetSpareTable(raidPtr, *(void **) data); 1732 1733 /* respond to the requestor. 
the return status of the spare 1734 * table installation is passed in the "fcol" field */ 1735 RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *)); 1736 waitreq->fcol = retcode; 1737 rf_lock_mutex2(rf_sparet_wait_mutex); 1738 waitreq->next = rf_sparet_resp_queue; 1739 rf_sparet_resp_queue = waitreq; 1740 rf_broadcast_cond2(rf_sparet_resp_cv); 1741 rf_unlock_mutex2(rf_sparet_wait_mutex); 1742 1743 return (retcode); 1744 #endif 1745 1746 default: 1747 break; /* fall through to the os-specific code below */ 1748 1749 } 1750 1751 if (!raidPtr->valid) 1752 return (EINVAL); 1753 1754 /* 1755 * Add support for "regular" device ioctls here. 1756 */ 1757 1758 error = disk_ioctl(&rs->sc_dkdev, cmd, data, flag, l); 1759 if (error != EPASSTHROUGH) 1760 return (error); 1761 1762 switch (cmd) { 1763 case DIOCGDINFO: 1764 *(struct disklabel *) data = *(rs->sc_dkdev.dk_label); 1765 break; 1766 #ifdef __HAVE_OLD_DISKLABEL 1767 case ODIOCGDINFO: 1768 newlabel = *(rs->sc_dkdev.dk_label); 1769 if (newlabel.d_npartitions > OLDMAXPARTITIONS) 1770 return ENOTTY; 1771 memcpy(data, &newlabel, sizeof (struct olddisklabel)); 1772 break; 1773 #endif 1774 1775 case DIOCGPART: 1776 ((struct partinfo *) data)->disklab = rs->sc_dkdev.dk_label; 1777 ((struct partinfo *) data)->part = 1778 &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)]; 1779 break; 1780 1781 case DIOCWDINFO: 1782 case DIOCSDINFO: 1783 #ifdef __HAVE_OLD_DISKLABEL 1784 case ODIOCWDINFO: 1785 case ODIOCSDINFO: 1786 #endif 1787 { 1788 struct disklabel *lp; 1789 #ifdef __HAVE_OLD_DISKLABEL 1790 if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) { 1791 memset(&newlabel, 0, sizeof newlabel); 1792 memcpy(&newlabel, data, sizeof (struct olddisklabel)); 1793 lp = &newlabel; 1794 } else 1795 #endif 1796 lp = (struct disklabel *)data; 1797 1798 if ((error = raidlock(rs)) != 0) 1799 return (error); 1800 1801 rs->sc_flags |= RAIDF_LABELLING; 1802 1803 error = setdisklabel(rs->sc_dkdev.dk_label, 1804 lp, 0, rs->sc_dkdev.dk_cpulabel); 1805 
		if (error == 0) {
			/* DIOCWDINFO (but not DIOCSDINFO) also writes the
			 * just-validated label back to the on-disk label
			 * area. */
			if (cmd == DIOCWDINFO
#ifdef __HAVE_OLD_DISKLABEL
			    || cmd == ODIOCWDINFO
#endif
			   )
				error = writedisklabel(RAIDLABELDEV(dev),
				    raidstrategy, rs->sc_dkdev.dk_label,
				    rs->sc_dkdev.dk_cpulabel);
		}
		rs->sc_flags &= ~RAIDF_LABELLING;

		raidunlock(rs);

		if (error)
			return (error);
		break;
	}

	case DIOCWLABEL:
		/* enable/disable writing to the label area of the raw
		 * device */
		if (*(int *) data != 0)
			rs->sc_flags |= RAIDF_WLABEL;
		else
			rs->sc_flags &= ~RAIDF_WLABEL;
		break;

	case DIOCGDEFLABEL:
		raidgetdefaultlabel(raidPtr, rs, (struct disklabel *) data);
		break;

#ifdef __HAVE_OLD_DISKLABEL
	case ODIOCGDEFLABEL:
		raidgetdefaultlabel(raidPtr, rs, &newlabel);
		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
			return ENOTTY;
		memcpy(data, &newlabel, sizeof (struct olddisklabel));
		break;
#endif

	case DIOCAWEDGE:
	case DIOCDWEDGE:
		dkw = (void *)data;

		/* If the ioctl happens here, the parent is us. */
		(void)strcpy(dkw->dkw_parent, rs->sc_xname);
		return cmd == DIOCAWEDGE ? dkwedge_add(dkw) : dkwedge_del(dkw);

	case DIOCLWEDGES:
		return dkwedge_list(&rs->sc_dkdev,
		    (struct dkwedge_list *)data, l);
	case DIOCCACHESYNC:
		return rf_sync_component_caches(raidPtr);
	default:
		retcode = ENOTTY;
	}
	return (retcode);

}


/* raidinit -- complete the rest of the initialization for the
   RAIDframe device: name the unit, attach the pseudo-device, and
   register the disk with the disk(9) framework so disklabels and
   wedges work.  Called once a configuration has succeeded. */


static void
raidinit(RF_Raid_t *raidPtr)
{
	cfdata_t cf;
	struct raid_softc *rs;
	int unit;

	unit = raidPtr->raidid;

	rs = &raid_softc[unit];

	/* XXX should check return code first... */
	rs->sc_flags |= RAIDF_INITED;

	/* XXX doesn't check bounds. */
	snprintf(rs->sc_xname, sizeof(rs->sc_xname), "raid%d", unit);

	/* attach the pseudo device */
	/* M_WAITOK: this allocation sleeps rather than failing.  The
	 * cfdata is freed again on detach (see RAIDFRAME_SHUTDOWN path)
	 * or below on attach failure. */
	cf = malloc(sizeof(*cf), M_RAIDFRAME, M_WAITOK);
	cf->cf_name = raid_cd.cd_name;
	cf->cf_atname = raid_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	rs->sc_dev = config_attach_pseudo(cf);

	if (rs->sc_dev == NULL) {
		printf("raid%d: config_attach_pseudo failed\n",
		    raidPtr->raidid);
		rs->sc_flags &= ~RAIDF_INITED;
		free(cf, M_RAIDFRAME);
		return;
	}

	/* disk_attach actually creates space for the CPU disklabel, among
	 * other things, so it's critical to call this *BEFORE* we try putzing
	 * with disklabels. */

	disk_init(&rs->sc_dkdev, rs->sc_xname, &rf_dkdriver);
	disk_attach(&rs->sc_dkdev);
	disk_blocksize(&rs->sc_dkdev, raidPtr->bytesPerSector);

	/* XXX There may be a weird interaction here between this, and
	 * protectedSectors, as used in RAIDframe. */

	rs->sc_size = raidPtr->totalSectors;

	/* probe for wedges (GPT etc.) now that the disk exists */
	dkwedge_discover(&rs->sc_dkdev);

	rf_set_properties(rs, raidPtr);

}
#if (RF_INCLUDE_PARITY_DECLUSTERING_DS > 0)
/* wake up the daemon & tell it to get us a spare table
 * XXX
 * the entries in the queues should be tagged with the raidPtr
 * so that in the extremely rare case that two recons happen at once,
 * we know for which device were requesting a spare table
 * XXX
 *
 * XXX This code is not currently used.
GO 1930 */ 1931 int 1932 rf_GetSpareTableFromDaemon(RF_SparetWait_t *req) 1933 { 1934 int retcode; 1935 1936 rf_lock_mutex2(rf_sparet_wait_mutex); 1937 req->next = rf_sparet_wait_queue; 1938 rf_sparet_wait_queue = req; 1939 rf_broadcast_cond2(rf_sparet_wait_cv); 1940 1941 /* mpsleep unlocks the mutex */ 1942 while (!rf_sparet_resp_queue) { 1943 rf_wait_cond2(rf_sparet_resp_cv, rf_sparet_wait_mutex); 1944 } 1945 req = rf_sparet_resp_queue; 1946 rf_sparet_resp_queue = req->next; 1947 rf_unlock_mutex2(rf_sparet_wait_mutex); 1948 1949 retcode = req->fcol; 1950 RF_Free(req, sizeof(*req)); /* this is not the same req as we 1951 * alloc'd */ 1952 return (retcode); 1953 } 1954 #endif 1955 1956 /* a wrapper around rf_DoAccess that extracts appropriate info from the 1957 * bp & passes it down. 1958 * any calls originating in the kernel must use non-blocking I/O 1959 * do some extra sanity checking to return "appropriate" error values for 1960 * certain conditions (to make some standard utilities work) 1961 * 1962 * Formerly known as: rf_DoAccessKernel 1963 */ 1964 void 1965 raidstart(RF_Raid_t *raidPtr) 1966 { 1967 RF_SectorCount_t num_blocks, pb, sum; 1968 RF_RaidAddr_t raid_addr; 1969 struct partition *pp; 1970 daddr_t blocknum; 1971 int unit; 1972 struct raid_softc *rs; 1973 int do_async; 1974 struct buf *bp; 1975 int rc; 1976 1977 unit = raidPtr->raidid; 1978 rs = &raid_softc[unit]; 1979 1980 /* quick check to see if anything has died recently */ 1981 rf_lock_mutex2(raidPtr->mutex); 1982 if (raidPtr->numNewFailures > 0) { 1983 rf_unlock_mutex2(raidPtr->mutex); 1984 rf_update_component_labels(raidPtr, 1985 RF_NORMAL_COMPONENT_UPDATE); 1986 rf_lock_mutex2(raidPtr->mutex); 1987 raidPtr->numNewFailures--; 1988 } 1989 1990 /* Check to see if we're at the limit... 
*/ 1991 while (raidPtr->openings > 0) { 1992 rf_unlock_mutex2(raidPtr->mutex); 1993 1994 /* get the next item, if any, from the queue */ 1995 if ((bp = bufq_get(rs->buf_queue)) == NULL) { 1996 /* nothing more to do */ 1997 return; 1998 } 1999 2000 /* Ok, for the bp we have here, bp->b_blkno is relative to the 2001 * partition.. Need to make it absolute to the underlying 2002 * device.. */ 2003 2004 blocknum = bp->b_blkno << DEV_BSHIFT >> raidPtr->logBytesPerSector; 2005 if (DISKPART(bp->b_dev) != RAW_PART) { 2006 pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)]; 2007 blocknum += pp->p_offset; 2008 } 2009 2010 db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno, 2011 (int) blocknum)); 2012 2013 db1_printf(("bp->b_bcount = %d\n", (int) bp->b_bcount)); 2014 db1_printf(("bp->b_resid = %d\n", (int) bp->b_resid)); 2015 2016 /* *THIS* is where we adjust what block we're going to... 2017 * but DO NOT TOUCH bp->b_blkno!!! */ 2018 raid_addr = blocknum; 2019 2020 num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector; 2021 pb = (bp->b_bcount & raidPtr->sectorMask) ? 
1 : 0; 2022 sum = raid_addr + num_blocks + pb; 2023 if (1 || rf_debugKernelAccess) { 2024 db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n", 2025 (int) raid_addr, (int) sum, (int) num_blocks, 2026 (int) pb, (int) bp->b_resid)); 2027 } 2028 if ((sum > raidPtr->totalSectors) || (sum < raid_addr) 2029 || (sum < num_blocks) || (sum < pb)) { 2030 bp->b_error = ENOSPC; 2031 bp->b_resid = bp->b_bcount; 2032 biodone(bp); 2033 rf_lock_mutex2(raidPtr->mutex); 2034 continue; 2035 } 2036 /* 2037 * XXX rf_DoAccess() should do this, not just DoAccessKernel() 2038 */ 2039 2040 if (bp->b_bcount & raidPtr->sectorMask) { 2041 bp->b_error = EINVAL; 2042 bp->b_resid = bp->b_bcount; 2043 biodone(bp); 2044 rf_lock_mutex2(raidPtr->mutex); 2045 continue; 2046 2047 } 2048 db1_printf(("Calling DoAccess..\n")); 2049 2050 2051 rf_lock_mutex2(raidPtr->mutex); 2052 raidPtr->openings--; 2053 rf_unlock_mutex2(raidPtr->mutex); 2054 2055 /* 2056 * Everything is async. 2057 */ 2058 do_async = 1; 2059 2060 disk_busy(&rs->sc_dkdev); 2061 2062 /* XXX we're still at splbio() here... do we *really* 2063 need to be? */ 2064 2065 /* don't ever condition on bp->b_flags & B_WRITE. 2066 * always condition on B_READ instead */ 2067 2068 rc = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ? 2069 RF_IO_TYPE_READ : RF_IO_TYPE_WRITE, 2070 do_async, raid_addr, num_blocks, 2071 bp->b_data, bp, RF_DAG_NONBLOCKING_IO); 2072 2073 if (rc) { 2074 bp->b_error = rc; 2075 bp->b_resid = bp->b_bcount; 2076 biodone(bp); 2077 /* continue loop */ 2078 } 2079 2080 rf_lock_mutex2(raidPtr->mutex); 2081 } 2082 rf_unlock_mutex2(raidPtr->mutex); 2083 } 2084 2085 2086 2087 2088 /* invoke an I/O from kernel mode. Disk queue should be locked upon entry */ 2089 2090 int 2091 rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req) 2092 { 2093 int op = (req->type == RF_IO_TYPE_READ) ? 
	    B_READ : B_WRITE;
	struct buf *bp;

	req->queue = queue;
	bp = req->bp;

	switch (req->type) {
	case RF_IO_TYPE_NOP:	/* used primarily to unlock a locked queue */
		/* XXX need to do something extra here.. */
		/* I'm leaving this in, as I've never actually seen it used,
		 * and I'd like folks to report it... GO */
		printf(("WAKEUP CALLED\n"));
		queue->numOutstanding++;

		bp->b_flags = 0;
		bp->b_private = req;

		/* complete immediately; no real I/O is issued for a NOP */
		KernelWakeupFunc(bp);
		break;

	case RF_IO_TYPE_READ:
	case RF_IO_TYPE_WRITE:
#if RF_ACC_TRACE > 0
		if (req->tracerec) {
			RF_ETIMER_START(req->tracerec->timer);
		}
#endif
		/* set up the buf for the component device and arrange for
		 * KernelWakeupFunc to run at biodone time */
		InitBP(bp, queue->rf_cinfo->ci_vp,
		    op, queue->rf_cinfo->ci_dev,
		    req->sectorOffset, req->numSector,
		    req->buf, KernelWakeupFunc, (void *) req,
		    queue->raidPtr->logBytesPerSector, req->b_proc);

		if (rf_debugKernelAccess) {
			db1_printf(("dispatch: bp->b_blkno = %ld\n",
				(long) bp->b_blkno));
		}
		queue->numOutstanding++;
		queue->last_deq_sector = req->sectorOffset;
		/* acc wouldn't have been let in if there were any pending
		 * reqs at any other priority */
		queue->curPriority = req->priority;

		db1_printf(("Going for %c to unit %d col %d\n",
			    req->type, queue->raidPtr->raidid,
			    queue->col));
		db1_printf(("sector %d count %d (%d bytes) %d\n",
			(int) req->sectorOffset, (int) req->numSector,
			(int) (req->numSector <<
			    queue->raidPtr->logBytesPerSector),
			(int) queue->raidPtr->logBytesPerSector));

		/*
		 * XXX: drop lock here since this can block at
		 * least with backing SCSI devices.  Retake it
		 * to minimize fuss with calling interfaces.
		 */

		RF_UNLOCK_QUEUE_MUTEX(queue, "unusedparam");
		bdev_strategy(bp);
		RF_LOCK_QUEUE_MUTEX(queue, "unusedparam");
		break;

	default:
		panic("bad req->type in rf_DispatchKernelIO");
	}
	db1_printf(("Exiting from DispatchKernelIO\n"));

	return (0);
}
/* this is the callback function associated with a I/O invoked from
   kernel code.  Runs at biodone time: records the error (failing the
   component if necessary), moves the request onto the raidPtr->iodone
   queue and signals the raidio thread.
 */
static void
KernelWakeupFunc(struct buf *bp)
{
	RF_DiskQueueData_t *req = NULL;
	RF_DiskQueue_t *queue;

	db1_printf(("recovering the request queue:\n"));

	/* the request was stashed in b_private by InitBP/dispatch */
	req = bp->b_private;

	queue = (RF_DiskQueue_t *) req->queue;

	rf_lock_mutex2(queue->raidPtr->iodone_lock);

#if RF_ACC_TRACE > 0
	if (req->tracerec) {
		RF_ETIMER_STOP(req->tracerec->timer);
		RF_ETIMER_EVAL(req->tracerec->timer);
		rf_lock_mutex2(rf_tracing_mutex);
		req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
		req->tracerec->num_phys_ios++;
		rf_unlock_mutex2(rf_tracing_mutex);
	}
#endif

	/* XXX Ok, let's get aggressive... If b_error is set, let's go
	 * ballistic, and mark the component as hosed... */

	if (bp->b_error != 0) {
		/* Mark the disk as dead */
		/* but only mark it once... */
		/* and only if it wouldn't leave this RAID set
		   completely broken */
		if (((queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_optimal) ||
		     (queue->raidPtr->Disks[queue->col].status ==
		      rf_ds_used_spare)) &&
		     (queue->raidPtr->numFailures <
		      queue->raidPtr->Layout.map->faultsTolerated)) {
			printf("raid%d: IO Error.
 Marking %s as failed.\n",
			       queue->raidPtr->raidid,
			       queue->raidPtr->Disks[queue->col].devname);
			queue->raidPtr->Disks[queue->col].status =
			    rf_ds_failed;
			queue->raidPtr->status = rf_rs_degraded;
			queue->raidPtr->numFailures++;
			queue->raidPtr->numNewFailures++;
		} else {	/* Disk is already dead... */
			/* printf("Disk already marked as dead!\n"); */
		}

	}

	/* Fill in the error value */
	req->error = bp->b_error;

	/* Drop this one on the "finished" queue... */
	TAILQ_INSERT_TAIL(&(queue->raidPtr->iodone), req, iodone_entries);

	/* Let the raidio thread know there is work to be done. */
	rf_signal_cond2(queue->raidPtr->iodone_cv);

	rf_unlock_mutex2(queue->raidPtr->iodone_lock);
}


/*
 * initialize a buf structure for doing an I/O in the kernel.
 * NOTE(review): the b_vp parameter is never referenced in this body --
 * confirm whether it is vestigial or should be assigned to the buf.
 */
static void
InitBP(struct buf *bp, struct vnode *b_vp, unsigned rw_flag, dev_t dev,
       RF_SectorNum_t startSect, RF_SectorCount_t numSect, void *bf,
       void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector,
       struct proc *b_proc)
{
	/* bp->b_flags = B_PHYS | rw_flag; */
	bp->b_flags = rw_flag;	/* XXX need B_PHYS here too??? */
	bp->b_oflags = 0;
	bp->b_cflags = 0;
	bp->b_bcount = numSect << logBytesPerSector;
	bp->b_bufsize = bp->b_bcount;
	bp->b_error = 0;
	bp->b_dev = dev;
	bp->b_data = bf;
	/* sectors -> bytes -> DEV_BSIZE blocks */
	bp->b_blkno = startSect << logBytesPerSector >> DEV_BSHIFT;
	bp->b_resid = bp->b_bcount;	/* XXX is this right!??!?!! */
	if (bp->b_bcount == 0) {
		panic("bp->b_bcount is zero in InitBP!!");
	}
	bp->b_proc = b_proc;
	bp->b_iodone = cbFunc;
	bp->b_private = cbArg;
}

/* Fabricate a default disklabel for the raid device from the RAIDframe
 * geometry; used when no on-disk label exists and for DIOCGDEFLABEL. */
static void
raidgetdefaultlabel(RF_Raid_t *raidPtr, struct raid_softc *rs,
		    struct disklabel *lp)
{
	memset(lp, 0, sizeof(*lp));

	/* fabricate a label... */
	lp->d_secperunit = raidPtr->totalSectors;
	lp->d_secsize = raidPtr->bytesPerSector;
	lp->d_nsectors = raidPtr->Layout.dataSectorsPerStripe;
	lp->d_ntracks = 4 * raidPtr->numCol;
	lp->d_ncylinders = raidPtr->totalSectors /
		(lp->d_nsectors * lp->d_ntracks);
	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;

	strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
	lp->d_type = DTYPE_RAID;
	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
	lp->d_rpm = 3600;
	lp->d_interleave = 1;
	lp->d_flags = 0;

	lp->d_partitions[RAW_PART].p_offset = 0;
	lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
	lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
	lp->d_npartitions = RAW_PART + 1;

	lp->d_magic = DISKMAGIC;
	lp->d_magic2 = DISKMAGIC;
	/* NOTE(review): checksum is taken over the in-core dk_label, not
	 * over lp; these are different objects when lp is a caller-supplied
	 * buffer (e.g. DIOCGDEFLABEL) -- confirm this is intended
	 * (cf. raidmakedisklabel, which uses dkcksum(lp)). */
	lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);

}
/*
 * Read the disklabel from the raid device.  If one is not present, fake one
 * up.
 */
static void
raidgetdisklabel(dev_t dev)
{
	int unit = raidunit(dev);
	struct raid_softc *rs = &raid_softc[unit];
	const char *errstring;
	struct disklabel *lp = rs->sc_dkdev.dk_label;
	struct cpu_disklabel *clp = rs->sc_dkdev.dk_cpulabel;
	RF_Raid_t *raidPtr;

	db1_printf(("Getting the disklabel...\n"));

	memset(clp, 0, sizeof(*clp));

	raidPtr = raidPtrs[unit];

	/* start from a fabricated label so readdisklabel has sane
	 * geometry to work with */
	raidgetdefaultlabel(raidPtr, rs, lp);

	/*
	 * Call the generic disklabel extraction routine.
	 */
	errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
	    rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
	if (errstring)
		raidmakedisklabel(rs);
	else {
		int i;
		struct partition *pp;

		/*
		 * Sanity check whether the found disklabel is valid.
		 *
		 * This is necessary since total size of the raid device
		 * may vary when an interleave is changed even though exactly
		 * same components are used, and old disklabel may used
		 * if that is found.
		 */
		if (lp->d_secperunit != rs->sc_size)
			printf("raid%d: WARNING: %s: "
			    "total sector size in disklabel (%" PRIu32 ") != "
			    "the size of raid (%" PRIu64 ")\n", unit, rs->sc_xname,
			    lp->d_secperunit, rs->sc_size);
		for (i = 0; i < lp->d_npartitions; i++) {
			pp = &lp->d_partitions[i];
			if (pp->p_offset + pp->p_size > rs->sc_size)
				printf("raid%d: WARNING: %s: end of partition `%c' "
				    "exceeds the size of raid (%" PRIu64 ")\n",
				    unit, rs->sc_xname, 'a' + i, rs->sc_size);
		}
	}

}
/*
 * Take care of things one might want to take care of in the event
 * that a disklabel isn't present.
 */
static void
raidmakedisklabel(struct raid_softc *rs)
{
	struct disklabel *lp = rs->sc_dkdev.dk_label;
	db1_printf(("Making a label..\n"));

	/*
	 * For historical reasons, if there's no disklabel present
	 * the raw partition must be marked FS_BSDFFS.
	 */

	lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;

	strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));

	lp->d_checksum = dkcksum(lp);
}
/*
 * Wait interruptibly for an exclusive lock.
 *
 * XXX
 * Several drivers do this; it should be abstracted and made MP-safe.
 * (Hmm... where have we seen this warning before :-> GO )
 */
static int
raidlock(struct raid_softc *rs)
{
	int error;

	/* sleep until the holder clears RAIDF_LOCKED; PCATCH makes the
	 * wait interruptible, in which case the tsleep error is returned */
	while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
		rs->sc_flags |= RAIDF_WANTED;
		if ((error =
			tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
			return (error);
	}
	rs->sc_flags |= RAIDF_LOCKED;
	return (0);
}
/*
 * Unlock and wake up any waiters.
2393 */ 2394 static void 2395 raidunlock(struct raid_softc *rs) 2396 { 2397 2398 rs->sc_flags &= ~RAIDF_LOCKED; 2399 if ((rs->sc_flags & RAIDF_WANTED) != 0) { 2400 rs->sc_flags &= ~RAIDF_WANTED; 2401 wakeup(rs); 2402 } 2403 } 2404 2405 2406 #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */ 2407 #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */ 2408 #define RF_PARITY_MAP_SIZE RF_PARITYMAP_NBYTE 2409 2410 static daddr_t 2411 rf_component_info_offset(void) 2412 { 2413 2414 return RF_COMPONENT_INFO_OFFSET; 2415 } 2416 2417 static daddr_t 2418 rf_component_info_size(unsigned secsize) 2419 { 2420 daddr_t info_size; 2421 2422 KASSERT(secsize); 2423 if (secsize > RF_COMPONENT_INFO_SIZE) 2424 info_size = secsize; 2425 else 2426 info_size = RF_COMPONENT_INFO_SIZE; 2427 2428 return info_size; 2429 } 2430 2431 static daddr_t 2432 rf_parity_map_offset(RF_Raid_t *raidPtr) 2433 { 2434 daddr_t map_offset; 2435 2436 KASSERT(raidPtr->bytesPerSector); 2437 if (raidPtr->bytesPerSector > RF_COMPONENT_INFO_SIZE) 2438 map_offset = raidPtr->bytesPerSector; 2439 else 2440 map_offset = RF_COMPONENT_INFO_SIZE; 2441 map_offset += rf_component_info_offset(); 2442 2443 return map_offset; 2444 } 2445 2446 static daddr_t 2447 rf_parity_map_size(RF_Raid_t *raidPtr) 2448 { 2449 daddr_t map_size; 2450 2451 if (raidPtr->bytesPerSector > RF_PARITY_MAP_SIZE) 2452 map_size = raidPtr->bytesPerSector; 2453 else 2454 map_size = RF_PARITY_MAP_SIZE; 2455 2456 return map_size; 2457 } 2458 2459 int 2460 raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col) 2461 { 2462 RF_ComponentLabel_t *clabel; 2463 2464 clabel = raidget_component_label(raidPtr, col); 2465 clabel->clean = RF_RAID_CLEAN; 2466 raidflush_component_label(raidPtr, col); 2467 return(0); 2468 } 2469 2470 2471 int 2472 raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col) 2473 { 2474 RF_ComponentLabel_t *clabel; 2475 2476 clabel = raidget_component_label(raidPtr, col); 2477 clabel->clean = RF_RAID_DIRTY; 2478 raidflush_component_label(raidPtr, col); 2479 
return(0); 2480 } 2481 2482 int 2483 raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col) 2484 { 2485 KASSERT(raidPtr->bytesPerSector); 2486 return raidread_component_label(raidPtr->bytesPerSector, 2487 raidPtr->Disks[col].dev, 2488 raidPtr->raid_cinfo[col].ci_vp, 2489 &raidPtr->raid_cinfo[col].ci_label); 2490 } 2491 2492 RF_ComponentLabel_t * 2493 raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col) 2494 { 2495 return &raidPtr->raid_cinfo[col].ci_label; 2496 } 2497 2498 int 2499 raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col) 2500 { 2501 RF_ComponentLabel_t *label; 2502 2503 label = &raidPtr->raid_cinfo[col].ci_label; 2504 label->mod_counter = raidPtr->mod_counter; 2505 #ifndef RF_NO_PARITY_MAP 2506 label->parity_map_modcount = label->mod_counter; 2507 #endif 2508 return raidwrite_component_label(raidPtr->bytesPerSector, 2509 raidPtr->Disks[col].dev, 2510 raidPtr->raid_cinfo[col].ci_vp, label); 2511 } 2512 2513 2514 static int 2515 raidread_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp, 2516 RF_ComponentLabel_t *clabel) 2517 { 2518 return raidread_component_area(dev, b_vp, clabel, 2519 sizeof(RF_ComponentLabel_t), 2520 rf_component_info_offset(), 2521 rf_component_info_size(secsize)); 2522 } 2523 2524 /* ARGSUSED */ 2525 static int 2526 raidread_component_area(dev_t dev, struct vnode *b_vp, void *data, 2527 size_t msize, daddr_t offset, daddr_t dsize) 2528 { 2529 struct buf *bp; 2530 const struct bdevsw *bdev; 2531 int error; 2532 2533 /* XXX should probably ensure that we don't try to do this if 2534 someone has changed rf_protected_sectors. */ 2535 2536 if (b_vp == NULL) { 2537 /* For whatever reason, this component is not valid. 2538 Don't try to read a component label from it. */ 2539 return(EINVAL); 2540 } 2541 2542 /* get a block of the appropriate size... 
*/ 2543 bp = geteblk((int)dsize); 2544 bp->b_dev = dev; 2545 2546 /* get our ducks in a row for the read */ 2547 bp->b_blkno = offset / DEV_BSIZE; 2548 bp->b_bcount = dsize; 2549 bp->b_flags |= B_READ; 2550 bp->b_resid = dsize; 2551 2552 bdev = bdevsw_lookup(bp->b_dev); 2553 if (bdev == NULL) 2554 return (ENXIO); 2555 (*bdev->d_strategy)(bp); 2556 2557 error = biowait(bp); 2558 2559 if (!error) { 2560 memcpy(data, bp->b_data, msize); 2561 } 2562 2563 brelse(bp, 0); 2564 return(error); 2565 } 2566 2567 2568 static int 2569 raidwrite_component_label(unsigned secsize, dev_t dev, struct vnode *b_vp, 2570 RF_ComponentLabel_t *clabel) 2571 { 2572 return raidwrite_component_area(dev, b_vp, clabel, 2573 sizeof(RF_ComponentLabel_t), 2574 rf_component_info_offset(), 2575 rf_component_info_size(secsize), 0); 2576 } 2577 2578 /* ARGSUSED */ 2579 static int 2580 raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data, 2581 size_t msize, daddr_t offset, daddr_t dsize, int asyncp) 2582 { 2583 struct buf *bp; 2584 const struct bdevsw *bdev; 2585 int error; 2586 2587 /* get a block of the appropriate size... */ 2588 bp = geteblk((int)dsize); 2589 bp->b_dev = dev; 2590 2591 /* get our ducks in a row for the write */ 2592 bp->b_blkno = offset / DEV_BSIZE; 2593 bp->b_bcount = dsize; 2594 bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0); 2595 bp->b_resid = dsize; 2596 2597 memset(bp->b_data, 0, dsize); 2598 memcpy(bp->b_data, data, msize); 2599 2600 bdev = bdevsw_lookup(bp->b_dev); 2601 if (bdev == NULL) 2602 return (ENXIO); 2603 (*bdev->d_strategy)(bp); 2604 if (asyncp) 2605 return 0; 2606 error = biowait(bp); 2607 brelse(bp, 0); 2608 if (error) { 2609 #if 1 2610 printf("Failed to write RAID component info!\n"); 2611 #endif 2612 } 2613 2614 return(error); 2615 } 2616 2617 void 2618 rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map) 2619 { 2620 int c; 2621 2622 for (c = 0; c < raidPtr->numCol; c++) { 2623 /* Skip dead disks. 
*/ 2624 if (RF_DEAD_DISK(raidPtr->Disks[c].status)) 2625 continue; 2626 /* XXXjld: what if an error occurs here? */ 2627 raidwrite_component_area(raidPtr->Disks[c].dev, 2628 raidPtr->raid_cinfo[c].ci_vp, map, 2629 RF_PARITYMAP_NBYTE, 2630 rf_parity_map_offset(raidPtr), 2631 rf_parity_map_size(raidPtr), 0); 2632 } 2633 } 2634 2635 void 2636 rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map) 2637 { 2638 struct rf_paritymap_ondisk tmp; 2639 int c,first; 2640 2641 first=1; 2642 for (c = 0; c < raidPtr->numCol; c++) { 2643 /* Skip dead disks. */ 2644 if (RF_DEAD_DISK(raidPtr->Disks[c].status)) 2645 continue; 2646 raidread_component_area(raidPtr->Disks[c].dev, 2647 raidPtr->raid_cinfo[c].ci_vp, &tmp, 2648 RF_PARITYMAP_NBYTE, 2649 rf_parity_map_offset(raidPtr), 2650 rf_parity_map_size(raidPtr)); 2651 if (first) { 2652 memcpy(map, &tmp, sizeof(*map)); 2653 first = 0; 2654 } else { 2655 rf_paritymap_merge(map, &tmp); 2656 } 2657 } 2658 } 2659 2660 void 2661 rf_markalldirty(RF_Raid_t *raidPtr) 2662 { 2663 RF_ComponentLabel_t *clabel; 2664 int sparecol; 2665 int c; 2666 int j; 2667 int scol = -1; 2668 2669 raidPtr->mod_counter++; 2670 for (c = 0; c < raidPtr->numCol; c++) { 2671 /* we don't want to touch (at all) a disk that has 2672 failed */ 2673 if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) { 2674 clabel = raidget_component_label(raidPtr, c); 2675 if (clabel->status == rf_ds_spared) { 2676 /* XXX do something special... 2677 but whatever you do, don't 2678 try to access it!! */ 2679 } else { 2680 raidmarkdirty(raidPtr, c); 2681 } 2682 } 2683 } 2684 2685 for( c = 0; c < raidPtr->numSpare ; c++) { 2686 sparecol = raidPtr->numCol + c; 2687 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) { 2688 /* 2689 2690 we claim this disk is "optimal" if it's 2691 rf_ds_used_spare, as that means it should be 2692 directly substitutable for the disk it replaced. 2693 We note that too... 

			 */

			/* Find the column this spare is standing in for. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->row = 0;
			clabel->column = scol;
			/* Note: we *don't* change status from rf_ds_used_spare
			   to rf_ds_optimal */
			/* clabel.status = rf_ds_optimal; */

			raidmarkdirty(raidPtr, sparecol);
		}
	}
}


/*
 * Refresh the component labels of all live components and used spares:
 * bump mod_counter, record status/last_unit, and, if this is the final
 * update at shutdown (RF_FINAL_COMPONENT_UPDATE) and parity is known
 * good, also mark the components clean.
 */
void
rf_update_component_labels(RF_Raid_t *raidPtr, int final)
{
	RF_ComponentLabel_t *clabel;
	int sparecol;
	int c;
	int j;
	int scol;

	scol = -1;

	/* XXX should do extra checks to make sure things really are clean,
	   rather than blindly setting the clean bit... */

	raidPtr->mod_counter++;

	for (c = 0; c < raidPtr->numCol; c++) {
		if (raidPtr->Disks[c].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, c);
			/* make sure status is noted */
			clabel->status = rf_ds_optimal;

			/* note what unit we are configured as */
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, c);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, c);
				}
			}
		}
		/* else we don't touch it.. */
	}

	for( c = 0; c < raidPtr->numSpare ; c++) {
		sparecol = raidPtr->numCol + c;
		/* Need to ensure that the reconstruct actually completed! */
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			/*

			   we claim this disk is "optimal" if it's
			   rf_ds_used_spare, as that means it should be
			   directly substitutable for the disk it replaced.
			   We note that too...

			 */

			/* Find the column this spare replaced. */
			for(j=0;j<raidPtr->numCol;j++) {
				if (raidPtr->Disks[j].spareCol == sparecol) {
					scol = j;
					break;
				}
			}

			/* XXX shouldn't *really* need this... */
			clabel = raidget_component_label(raidPtr, sparecol);
			/* make sure status is noted */

			raid_init_component_label(raidPtr, clabel);

			clabel->column = scol;
			clabel->status = rf_ds_optimal;
			clabel->last_unit = raidPtr->raidid;

			raidflush_component_label(raidPtr, sparecol);
			if (final == RF_FINAL_COMPONENT_UPDATE) {
				if (raidPtr->parity_good == RF_RAID_CLEAN) {
					raidmarkclean(raidPtr, sparecol);
				}
			}
		}
	}
}

/*
 * Close the vnode of a component.  Auto-configured components were
 * opened via VOP_OPEN on a locked vnode, so they are closed with
 * VOP_CLOSE/vput; manually configured ones came from vn_open and are
 * closed with vn_close.
 */
void
rf_close_component(RF_Raid_t *raidPtr, struct vnode *vp, int auto_configured)
{

	if (vp != NULL) {
		if (auto_configured == 1) {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
			vput(vp);

		} else {
			(void) vn_close(vp, FREAD | FWRITE, curlwp->l_cred);
		}
	}
}


/*
 * Close and forget the vnodes of all components and spares of the set.
 */
void
rf_UnconfigureVnodes(RF_Raid_t *raidPtr)
{
	int r,c;
	struct vnode *vp;
	int acd;


	/* We take this opportunity to close the vnodes like we should.. */

	for (c = 0; c < raidPtr->numCol; c++) {
		vp = raidPtr->raid_cinfo[c].ci_vp;
		acd = raidPtr->Disks[c].auto_configured;
		rf_close_component(raidPtr, vp, acd);
		raidPtr->raid_cinfo[c].ci_vp = NULL;
		raidPtr->Disks[c].auto_configured = 0;
	}

	for (r = 0; r < raidPtr->numSpare; r++) {
		vp = raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp;
		acd = raidPtr->Disks[raidPtr->numCol + r].auto_configured;
		rf_close_component(raidPtr, vp, acd);
		raidPtr->raid_cinfo[raidPtr->numCol + r].ci_vp = NULL;
		raidPtr->Disks[raidPtr->numCol + r].auto_configured = 0;
	}
}


/*
 * Kernel thread body: fail the requested component (optionally starting
 * reconstruction to a spare), then exit.  Frees `req'.
 */
void
rf_ReconThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = (RF_Raid_t *) req->raidPtr;
	raidPtr->recon_in_progress = 1;

	rf_FailDisk((RF_Raid_t *) req->raidPtr, req->col,
		    ((req->flags & RF_FDFLAGS_RECON) ? 1 : 0));

	RF_Free(req, sizeof(*req));

	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}

/*
 * Kernel thread body: rewrite all parity, then mark parity good on
 * success and wake anyone waiting for shutdown.
 */
void
rf_RewriteParityThread(RF_Raid_t *raidPtr)
{
	int retcode;
	int s;

	raidPtr->parity_rewrite_stripes_done = 0;
	raidPtr->parity_rewrite_in_progress = 1;
	s = splbio();
	retcode = rf_RewriteParity(raidPtr);
	splx(s);
	if (retcode) {
		printf("raid%d: Error re-writing parity (%d)!\n",
		    raidPtr->raidid, retcode);
	} else {
		/* set the clean bit!  If we shutdown correctly,
		   the clean bit on each component label will get
		   set */
		raidPtr->parity_good = RF_RAID_CLEAN;
	}
	raidPtr->parity_rewrite_in_progress = 0;

	/* Anyone waiting for us to stop?  If so, inform them... */
	if (raidPtr->waitShutdown) {
		wakeup(&raidPtr->parity_rewrite_in_progress);
	}

	/* That's all... */
	kthread_exit(0);	/* does not return */
}


/*
 * Kernel thread body: copy reconstructed data back from a spare to a
 * replaced component, then exit.
 */
void
rf_CopybackThread(RF_Raid_t *raidPtr)
{
	int s;

	raidPtr->copyback_in_progress = 1;
	s = splbio();
	rf_CopybackReconstructedData(raidPtr);
	splx(s);
	raidPtr->copyback_in_progress = 0;

	/* That's all... */
	kthread_exit(0);	/* does not return */
}


/*
 * Kernel thread body: reconstruct a failed component in place, then
 * exit.  Frees `req'.
 */
void
rf_ReconstructInPlaceThread(struct rf_recon_req *req)
{
	int s;
	RF_Raid_t *raidPtr;

	s = splbio();
	raidPtr = req->raidPtr;
	raidPtr->recon_in_progress = 1;
	rf_ReconstructInPlace(raidPtr, req->col);
	RF_Free(req, sizeof(*req));
	raidPtr->recon_in_progress = 0;
	splx(s);

	/* That's all... */
	kthread_exit(0);	/* does not return */
}

/*
 * Try to read a component label from (dev, vp).  If a reasonable label
 * is found, prepend a new RF_AutoConfig_t for it to ac_list and return
 * the new list head; otherwise close/release the vnode and return the
 * list unchanged.  On out-of-memory the whole list is freed and NULL
 * is returned.
 */
static RF_AutoConfig_t *
rf_get_component(RF_AutoConfig_t *ac_list, dev_t dev, struct vnode *vp,
    const char *cname, RF_SectorCount_t size, uint64_t numsecs,
    unsigned secsize)
{
	int good_one = 0;
	RF_ComponentLabel_t *clabel;
	RF_AutoConfig_t *ac;

	clabel = malloc(sizeof(RF_ComponentLabel_t), M_RAIDFRAME, M_NOWAIT);
	if (clabel == NULL) {
oomem:
		/* Out of memory: tear down everything collected so far. */
		while(ac_list) {
			ac = ac_list;
			if (ac->clabel)
				free(ac->clabel, M_RAIDFRAME);
			ac_list = ac_list->next;
			free(ac, M_RAIDFRAME);
		}
		printf("RAID auto config: out of memory!\n");
		return NULL; /* XXX probably should panic? */
	}

	if (!raidread_component_label(secsize, dev, vp, clabel)) {
		/* Got the label.  Does it look reasonable? */
		if (rf_reasonable_label(clabel, numsecs) &&
		    (rf_component_label_partitionsize(clabel) <= size)) {
#ifdef DEBUG
			printf("Component on: %s: %llu\n",
			    cname, (unsigned long long)size);
			rf_print_component_label(clabel);
#endif
			/* if it's reasonable, add it, else ignore it.
			 */
			ac = malloc(sizeof(RF_AutoConfig_t), M_RAIDFRAME,
			    M_NOWAIT);
			if (ac == NULL) {
				free(clabel, M_RAIDFRAME);
				goto oomem;
			}
			strlcpy(ac->devname, cname, sizeof(ac->devname));
			ac->dev = dev;
			ac->vp = vp;
			ac->clabel = clabel;
			ac->next = ac_list;
			ac_list = ac;
			good_one = 1;
		}
	}
	if (!good_one) {
		/* cleanup: no label was kept, so release both the label
		   buffer and the (open) vnode. */
		free(clabel, M_RAIDFRAME);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);
	}
	return ac_list;
}

/*
 * Scan every disk-class device in the system for RAIDframe component
 * labels: wedges whose partition type is RAIDframe, disklabel
 * partitions of type FS_RAID, and (failing both) the raw partition.
 * Returns a list of RF_AutoConfig_t candidates (possibly NULL).
 */
RF_AutoConfig_t *
rf_find_raid_components(void)
{
	struct vnode *vp;
	struct disklabel label;
	device_t dv;
	deviter_t di;
	dev_t dev;
	int bmajor, bminor, wedge, rf_part_found;
	int error;
	int i;
	RF_AutoConfig_t *ac_list;
	uint64_t numsecs;
	unsigned secsize;

	/* initialize the AutoConfig list */
	ac_list = NULL;

	/* we begin by trolling through *all* the devices on the system */

	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
	     dv = deviter_next(&di)) {

		/* we are only interested in disks... */
		if (device_class(dv) != DV_DISK)
			continue;

		/* we don't care about floppies... */
		if (device_is_a(dv, "fd")) {
			continue;
		}

		/* we don't care about CD's... */
		if (device_is_a(dv, "cd")) {
			continue;
		}

		/* we don't care about md's... */
		if (device_is_a(dv, "md")) {
			continue;
		}

		/* hdfd is the Atari/Hades floppy driver */
		if (device_is_a(dv, "hdfd")) {
			continue;
		}

		/* fdisa is the Atari/Milan floppy driver */
		if (device_is_a(dv, "fdisa")) {
			continue;
		}

		/* need to find the device_name_to_block_device_major stuff */
		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);

		rf_part_found = 0; /*No raid partition as yet*/

		/* get a vnode for the raw partition of this disk */

		wedge = device_is_a(dv, "dk");
		bminor = minor(device_unit(dv));
		dev = wedge ? makedev(bmajor, bminor) :
		    MAKEDISKDEV(bmajor, bminor, RAW_PART);
		if (bdevvp(dev, &vp))
			panic("RAID can't alloc vnode");

		error = VOP_OPEN(vp, FREAD | FSILENT, NOCRED);

		if (error) {
			/* "Who cares."  Continue looking
			   for something that exists*/
			vput(vp);
			continue;
		}

		error = getdisksize(vp, &numsecs, &secsize);
		if (error) {
			/* NOTE(review): this path vput()s without a
			   VOP_CLOSE even though the open above
			   succeeded -- verify against the other error
			   paths below, which close first. */
			vput(vp);
			continue;
		}
		if (wedge) {
			struct dkwedge_info dkw;
			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
			    NOCRED);
			if (error) {
				printf("RAIDframe: can't get wedge info for "
				    "dev %s (%d)\n", device_xname(dv), error);
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_RAIDFRAME) != 0) {
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
				VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
				vput(vp);
				continue;
			}

			ac_list = rf_get_component(ac_list, dev, vp,
			    device_xname(dv), dkw.dkw_size, numsecs, secsize);
			rf_part_found = 1; /*There is a raid component on this disk*/
			continue;
		}

		/* Ok, the disk exists.  Go get the disklabel. */
		error = VOP_IOCTL(vp, DIOCGDINFO, &label, FREAD, NOCRED);
		if (error) {
			/*
			 * XXX can't happen - open() would
			 * have errored out (or faked up one)
			 */
			if (error != ENOTTY)
				printf("RAIDframe: can't get label for dev "
				    "%s (%d)\n", device_xname(dv), error);
		}

		/* don't need this any more.  We'll allocate it again
		   a little later if we really do... */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(vp, FREAD | FWRITE, NOCRED);
		vput(vp);

		if (error)
			continue;

		rf_part_found = 0; /*No raid partitions yet*/
		for (i = 0; i < label.d_npartitions; i++) {
			char cname[sizeof(ac_list->devname)];

			/* We only support partitions marked as RAID */
			if (label.d_partitions[i].p_fstype != FS_RAID)
				continue;

			dev = MAKEDISKDEV(bmajor, device_unit(dv), i);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + i);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[i].p_size, numsecs, secsize);
			rf_part_found = 1; /*There is at least one raid partition on this disk*/
		}

		/*
		 *If there is no raid component on this disk, either in a
		 *disklabel or inside a wedge, check the raw partition as well,
		 *as it is possible to configure raid components on raw disk
		 *devices.
		 */

		if (!rf_part_found) {
			char cname[sizeof(ac_list->devname)];

			dev = MAKEDISKDEV(bmajor, device_unit(dv), RAW_PART);
			if (bdevvp(dev, &vp))
				panic("RAID can't alloc vnode");

			error = VOP_OPEN(vp, FREAD, NOCRED);
			if (error) {
				/* Whatever... */
				vput(vp);
				continue;
			}
			snprintf(cname, sizeof(cname), "%s%c",
			    device_xname(dv), 'a' + RAW_PART);
			ac_list = rf_get_component(ac_list, dev, vp, cname,
				label.d_partitions[RAW_PART].p_size, numsecs, secsize);
		}
	}
	deviter_release(&di);
	return ac_list;
}


/*
 * Sanity-check a component label: accepted version, clean flag in
 * range, row/column consistent with num_rows/num_columns, positive
 * block size and block count.  If `numsecs' is nonzero, also scrub the
 * legacy *Hi garbage (see rf_fix_old_label_size below).  Returns 1 if
 * the label looks usable, 0 otherwise.
 */
int
rf_reasonable_label(RF_ComponentLabel_t *clabel, uint64_t numsecs)
{

	if (((clabel->version==RF_COMPONENT_LABEL_VERSION_1) ||
	     (clabel->version==RF_COMPONENT_LABEL_VERSION)) &&
	    ((clabel->clean == RF_RAID_CLEAN) ||
	     (clabel->clean == RF_RAID_DIRTY)) &&
	    clabel->row >=0 &&
	    clabel->column >= 0 &&
	    clabel->num_rows > 0 &&
	    clabel->num_columns > 0 &&
	    clabel->row < clabel->num_rows &&
	    clabel->column < clabel->num_columns &&
	    clabel->blockSize > 0 &&
	    /*
	     * numBlocksHi may contain garbage, but it is ok since
	     * the type is unsigned.  If it is really garbage,
	     * rf_fix_old_label_size() will fix it.
	     */
	    rf_component_label_numblocks(clabel) > 0) {
		/*
		 * label looks reasonable enough...
		 * let's make sure it has no old garbage.
		 */
		if (numsecs)
			rf_fix_old_label_size(clabel, numsecs);
		return(1);
	}
	return(0);
}


/*
 * For reasons yet unknown, some old component labels have garbage in
 * the newer numBlocksHi region, and this causes lossage.  Since those
 * disks will also have numsecs set to less than 32 bits of sectors,
 * we can determine when this corruption has occured, and fix it.
 *
 * The exact same problem, with the same unknown reason, happens to
 * the partitionSizeHi member as well.
 */
static void
rf_fix_old_label_size(RF_ComponentLabel_t *clabel, uint64_t numsecs)
{

	/* Only applicable when the whole device fits in 32 bits of
	   sectors; then any nonzero *Hi field must be stale garbage. */
	if (numsecs < ((uint64_t)1 << 32)) {
		if (clabel->numBlocksHi) {
			printf("WARNING: total sectors < 32 bits, yet "
			       "numBlocksHi set\n"
			       "WARNING: resetting numBlocksHi to zero.\n");
			clabel->numBlocksHi = 0;
		}

		if (clabel->partitionSizeHi) {
			printf("WARNING: total sectors < 32 bits, yet "
			       "partitionSizeHi set\n"
			       "WARNING: resetting partitionSizeHi to zero.\n");
			clabel->partitionSizeHi = 0;
		}
	}
}


#ifdef DEBUG
/* Dump the interesting fields of a component label to the console. */
void
rf_print_component_label(RF_ComponentLabel_t *clabel)
{
	uint64_t numBlocks;

	numBlocks = rf_component_label_numblocks(clabel);

	printf("   Row: %d Column: %d Num Rows: %d Num Columns: %d\n",
	       clabel->row, clabel->column,
	       clabel->num_rows, clabel->num_columns);
	printf("   Version: %d Serial Number: %d Mod Counter: %d\n",
	       clabel->version, clabel->serial_number,
	       clabel->mod_counter);
	printf("   Clean: %s Status: %d\n",
	       clabel->clean ? "Yes" : "No", clabel->status);
	printf("   sectPerSU: %d SUsPerPU: %d SUsPerRU: %d\n",
	       clabel->sectPerSU, clabel->SUsPerPU, clabel->SUsPerRU);
	printf("   RAID Level: %c  blocksize: %d numBlocks: %"PRIu64"\n",
	       (char) clabel->parityConfig, clabel->blockSize, numBlocks);
	printf("   Autoconfig: %s\n", clabel->autoconfigure ? "Yes" : "No");
	printf("   Contains root partition: %s\n",
	       clabel->root_partition ? "Yes" : "No");
	printf("   Last configured as: raid%d\n", clabel->last_unit);
#if 0
	printf("   Config order: %d\n", clabel->config_order);
#endif

}
#endif

/*
 * Partition the flat list of auto-config candidates into config sets:
 * each set collects components whose labels are mutually consistent
 * (as judged by rf_does_it_fit) and so presumably belong to the same
 * RAID set.  The input list links are reused for the per-set lists.
 */
RF_ConfigSet_t *
rf_create_auto_sets(RF_AutoConfig_t *ac_list)
{
	RF_AutoConfig_t *ac;
	RF_ConfigSet_t *config_sets;
	RF_ConfigSet_t *cset;
	RF_AutoConfig_t *ac_next;


	config_sets = NULL;

	/* Go through the AutoConfig list, and figure out which components
	   belong to what sets.  */
	ac = ac_list;
	while(ac!=NULL) {
		/* we're going to putz with ac->next, so save it here
		   for use at the end of the loop */
		ac_next = ac->next;

		if (config_sets == NULL) {
			/* will need at least this one... */
			config_sets = (RF_ConfigSet_t *)
				malloc(sizeof(RF_ConfigSet_t),
				       M_RAIDFRAME, M_NOWAIT);
			if (config_sets == NULL) {
				panic("rf_create_auto_sets: No memory!");
			}
			/* this one is easy :) */
			config_sets->ac = ac;
			config_sets->next = NULL;
			config_sets->rootable = 0;
			ac->next = NULL;
		} else {
			/* which set does this component fit into? */
			cset = config_sets;
			while(cset!=NULL) {
				if (rf_does_it_fit(cset, ac)) {
					/* looks like it matches... */
					ac->next = cset->ac;
					cset->ac = ac;
					break;
				}
				cset = cset->next;
			}
			if (cset==NULL) {
				/* didn't find a match above... new set..*/
				cset = (RF_ConfigSet_t *)
					malloc(sizeof(RF_ConfigSet_t),
					       M_RAIDFRAME, M_NOWAIT);
				if (cset == NULL) {
					panic("rf_create_auto_sets: No memory!");
				}
				cset->ac = ac;
				ac->next = NULL;
				cset->next = config_sets;
				cset->rootable = 0;
				config_sets = cset;
			}
		}
		ac = ac_next;
	}


	return(config_sets);
}

/*
 * Decide whether auto-config candidate `ac' belongs to config set
 * `cset' by comparing its component label against the set's first
 * member.  Returns 1 on a match, 0 otherwise.
 */
static int
rf_does_it_fit(RF_ConfigSet_t *cset, RF_AutoConfig_t *ac)
{
	RF_ComponentLabel_t *clabel1, *clabel2;

	/* If this one matches the *first* one in the set, that's good
	   enough, since the other members of the set would have been
	   through here too... */
	/* note that we are not checking partitionSize here..

	   Note that we are also not checking the mod_counters here.
	   If everything else matches execpt the mod_counter, that's
	   good enough for this test.  We will deal with the mod_counters
	   a little later in the autoconfiguration process.

	    (clabel1->mod_counter == clabel2->mod_counter) &&

	   The reason we don't check for this is that failed disks
	   will have lower modification counts.  If those disks are
	   not added to the set they used to belong to, then they will
	   form their own set, which may result in 2 different sets,
	   for example, competing to be configured at raid0, and
	   perhaps competing to be the root filesystem set.  If the
	   wrong ones get configured, or both attempt to become /,
	   weird behaviour and or serious lossage will occur.  Thus we
	   need to bring them into the fold here, and kick them out at
	   a later point.

	*/

	clabel1 = cset->ac->clabel;
	clabel2 = ac->clabel;
	/* Every geometry/identity field must agree for membership;
	   mod_counter is deliberately excluded (see comment above). */
	if ((clabel1->version == clabel2->version) &&
	    (clabel1->serial_number == clabel2->serial_number) &&
	    (clabel1->num_rows == clabel2->num_rows) &&
	    (clabel1->num_columns == clabel2->num_columns) &&
	    (clabel1->sectPerSU == clabel2->sectPerSU) &&
	    (clabel1->SUsPerPU == clabel2->SUsPerPU) &&
	    (clabel1->SUsPerRU == clabel2->SUsPerRU) &&
	    (clabel1->parityConfig == clabel2->parityConfig) &&
	    (clabel1->maxOutstanding == clabel2->maxOutstanding) &&
	    (clabel1->blockSize == clabel2->blockSize) &&
	    rf_component_label_numblocks(clabel1) ==
	    rf_component_label_numblocks(clabel2) &&
	    (clabel1->autoconfigure == clabel2->autoconfigure) &&
	    (clabel1->root_partition == clabel2->root_partition) &&
	    (clabel1->last_unit == clabel2->last_unit) &&
	    (clabel1->config_order == clabel2->config_order)) {
		/* if it get's here, it almost *has* to be a match */
	} else {
		/* it's not consistent with somebody in the set..
		   punt */
		return(0);
	}
	/* all was fine.. it must fit... */
	return(1);
}

/*
 * Decide whether a config set has enough live components (at the
 * newest mod_counter) to be configured.  RAID 1 gets special pairwise
 * accounting: losing both members of a mirror pair is fatal; other
 * levels tolerate num_missing failures per their parityConfig.
 * Returns 1 if the set is configurable, 0 otherwise.
 */
int
rf_have_enough_components(RF_ConfigSet_t *cset)
{
	RF_AutoConfig_t *ac;
	RF_AutoConfig_t *auto_config;
	RF_ComponentLabel_t *clabel;
	int c;
	int num_cols;
	int num_missing;
	int mod_counter;
	int mod_counter_found;
	int even_pair_failed;
	char parity_type;


	/* check to see that we have enough 'live' components
	   of this set.  If so, we can configure it if necessary */

	num_cols = cset->ac->clabel->num_columns;
	parity_type = cset->ac->clabel->parityConfig;

	/* XXX Check for duplicate components!?!?!? */

	/* Determine what the mod_counter is supposed to be for this set.
	   The newest (largest) counter wins; stale components are
	   treated as missing below. */

	mod_counter_found = 0;
	mod_counter = 0;
	ac = cset->ac;
	while(ac!=NULL) {
		if (mod_counter_found==0) {
			mod_counter = ac->clabel->mod_counter;
			mod_counter_found = 1;
		} else {
			if (ac->clabel->mod_counter > mod_counter) {
				mod_counter = ac->clabel->mod_counter;
			}
		}
		ac = ac->next;
	}

	num_missing = 0;
	auto_config = cset->ac;

	even_pair_failed = 0;
	for(c=0; c<num_cols; c++) {
		ac = auto_config;
		while(ac!=NULL) {
			if ((ac->clabel->column == c) &&
			    (ac->clabel->mod_counter == mod_counter)) {
				/* it's this one... */
#ifdef DEBUG
				printf("Found: %s at %d\n",
				       ac->devname,c);
#endif
				break;
			}
			ac=ac->next;
		}
		if (ac==NULL) {
				/* Didn't find one here! */
				/* special case for RAID 1, especially
				   where there are more than 2
				   components (where RAIDframe treats
				   things a little differently :( ) */
			if (parity_type == '1') {
				if (c%2 == 0) { /* even component */
					even_pair_failed = 1;
				} else { /* odd component.  If
                                            we're failed, and
                                            so is the even
                                            component, it's
                                            "Good Night, Charlie" */
					if (even_pair_failed == 1) {
						return(0);
					}
				}
			} else {
				/* normal accounting */
				num_missing++;
			}
		}
		if ((parity_type == '1') && (c%2 == 1)) {
				/* Just did an even component, and we didn't
				   bail.. reset the even_pair_failed flag,
				   and go on to the next component.... */
			even_pair_failed = 0;
		}
	}

	clabel = cset->ac->clabel;

	if (((clabel->parityConfig == '0') && (num_missing > 0)) ||
	    ((clabel->parityConfig == '4') && (num_missing > 1)) ||
	    ((clabel->parityConfig == '5') && (num_missing > 1))) {
		/* XXX this needs to be made *much* more general */
		/* Too many failures */
		return(0);
	}
	/* otherwise, all is well, and we've got enough to take a kick
	   at autoconfiguring this set */
	return(1);
}

/*
 * Build an RF_Config_t from the component labels of an auto-config
 * set, filling in geometry, queue type, and per-column device names.
 * NOTE(review): strcpy into config->devnames relies on ac->devname
 * fitting -- devname is bounded by strlcpy in rf_get_component, but
 * confirm devnames[0][] is at least as large.
 */
void
rf_create_configuration(RF_AutoConfig_t *ac, RF_Config_t *config,
    RF_Raid_t *raidPtr)
{
	RF_ComponentLabel_t *clabel;
	int i;

	clabel = ac->clabel;

	/* 1. Fill in the common stuff */
	config->numRow = clabel->num_rows = 1;
	config->numCol = clabel->num_columns;
	config->numSpare = 0; /* XXX should this be set here? */
	config->sectPerSU = clabel->sectPerSU;
	config->SUsPerPU = clabel->SUsPerPU;
	config->SUsPerRU = clabel->SUsPerRU;
	config->parityConfig = clabel->parityConfig;
	/* XXX... */
	strcpy(config->diskQueueType,"fifo");
	config->maxOutstandingDiskReqs = clabel->maxOutstanding;
	config->layoutSpecificSize = 0; /* XXX ?? */

	while(ac!=NULL) {
		/* row/col values will be in range due to the checks
		   in reasonable_label() */
		strcpy(config->devnames[0][ac->clabel->column],
		       ac->devname);
		ac = ac->next;
	}

	for(i=0;i<RF_MAXDBGV;i++) {
		config->debugVars[i][0] = 0;
	}
}

/*
 * Set the autoconfigure flag in the component labels of all live
 * components and used spares; returns the new value.
 */
int
rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value)
{
	RF_ComponentLabel_t *clabel;
	int column;
	int sparecol;

	raidPtr->autoconfigure = new_value;

	for(column=0; column<raidPtr->numCol; column++) {
		if (raidPtr->Disks[column].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, column);
			clabel->autoconfigure = new_value;
			raidflush_component_label(raidPtr, column);
		}
	}
	for(column = 0; column < raidPtr->numSpare ; column++) {
		sparecol = raidPtr->numCol + column;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			clabel = raidget_component_label(raidPtr, sparecol);
			clabel->autoconfigure = new_value;
			raidflush_component_label(raidPtr, sparecol);
		}
	}
	return(new_value);
}

/*
 * Set the root_partition flag in the component labels of all live
 * components and used spares; returns the new value.
 */
int
rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value)
{
	RF_ComponentLabel_t *clabel;
	int column;
	int sparecol;

	raidPtr->root_partition = new_value;
	for(column=0; column<raidPtr->numCol; column++) {
		if (raidPtr->Disks[column].status == rf_ds_optimal) {
			clabel = raidget_component_label(raidPtr, column);
			clabel->root_partition = new_value;
			raidflush_component_label(raidPtr, column);
		}
	}
	for(column = 0; column < raidPtr->numSpare ; column++) {
		sparecol = raidPtr->numCol + column;
		if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) {
			clabel = raidget_component_label(raidPtr, sparecol);
			clabel->root_partition = new_value;
			raidflush_component_label(raidPtr, sparecol);
		}
	}
	return(new_value);
}

void
rf_release_all_vps(RF_ConfigSet_t *cset) 3582 { 3583 RF_AutoConfig_t *ac; 3584 3585 ac = cset->ac; 3586 while(ac!=NULL) { 3587 /* Close the vp, and give it back */ 3588 if (ac->vp) { 3589 vn_lock(ac->vp, LK_EXCLUSIVE | LK_RETRY); 3590 VOP_CLOSE(ac->vp, FREAD, NOCRED); 3591 vput(ac->vp); 3592 ac->vp = NULL; 3593 } 3594 ac = ac->next; 3595 } 3596 } 3597 3598 3599 void 3600 rf_cleanup_config_set(RF_ConfigSet_t *cset) 3601 { 3602 RF_AutoConfig_t *ac; 3603 RF_AutoConfig_t *next_ac; 3604 3605 ac = cset->ac; 3606 while(ac!=NULL) { 3607 next_ac = ac->next; 3608 /* nuke the label */ 3609 free(ac->clabel, M_RAIDFRAME); 3610 /* cleanup the config structure */ 3611 free(ac, M_RAIDFRAME); 3612 /* "next.." */ 3613 ac = next_ac; 3614 } 3615 /* and, finally, nuke the config set */ 3616 free(cset, M_RAIDFRAME); 3617 } 3618 3619 3620 void 3621 raid_init_component_label(RF_Raid_t *raidPtr, RF_ComponentLabel_t *clabel) 3622 { 3623 /* current version number */ 3624 clabel->version = RF_COMPONENT_LABEL_VERSION; 3625 clabel->serial_number = raidPtr->serial_number; 3626 clabel->mod_counter = raidPtr->mod_counter; 3627 3628 clabel->num_rows = 1; 3629 clabel->num_columns = raidPtr->numCol; 3630 clabel->clean = RF_RAID_DIRTY; /* not clean */ 3631 clabel->status = rf_ds_optimal; /* "It's good!" 
*/ 3632 3633 clabel->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit; 3634 clabel->SUsPerPU = raidPtr->Layout.SUsPerPU; 3635 clabel->SUsPerRU = raidPtr->Layout.SUsPerRU; 3636 3637 clabel->blockSize = raidPtr->bytesPerSector; 3638 rf_component_label_set_numblocks(clabel, raidPtr->sectorsPerDisk); 3639 3640 /* XXX not portable */ 3641 clabel->parityConfig = raidPtr->Layout.map->parityConfig; 3642 clabel->maxOutstanding = raidPtr->maxOutstanding; 3643 clabel->autoconfigure = raidPtr->autoconfigure; 3644 clabel->root_partition = raidPtr->root_partition; 3645 clabel->last_unit = raidPtr->raidid; 3646 clabel->config_order = raidPtr->config_order; 3647 3648 #ifndef RF_NO_PARITY_MAP 3649 rf_paritymap_init_label(raidPtr->parity_map, clabel); 3650 #endif 3651 } 3652 3653 int 3654 rf_auto_config_set(RF_ConfigSet_t *cset, int *unit) 3655 { 3656 RF_Raid_t *raidPtr; 3657 RF_Config_t *config; 3658 int raidID; 3659 int retcode; 3660 3661 #ifdef DEBUG 3662 printf("RAID autoconfigure\n"); 3663 #endif 3664 3665 retcode = 0; 3666 *unit = -1; 3667 3668 /* 1. Create a config structure */ 3669 3670 config = (RF_Config_t *)malloc(sizeof(RF_Config_t), 3671 M_RAIDFRAME, 3672 M_NOWAIT); 3673 if (config==NULL) { 3674 printf("Out of mem!?!?\n"); 3675 /* XXX do something more intelligent here. */ 3676 return(1); 3677 } 3678 3679 memset(config, 0, sizeof(RF_Config_t)); 3680 3681 /* 3682 2. Figure out what RAID ID this one is supposed to live at 3683 See if we can get the same RAID dev that it was configured 3684 on last time.. 3685 */ 3686 3687 raidID = cset->ac->clabel->last_unit; 3688 if ((raidID < 0) || (raidID >= numraid)) { 3689 /* let's not wander off into lala land. */ 3690 raidID = numraid - 1; 3691 } 3692 if (raidPtrs[raidID]->valid != 0) { 3693 3694 /* 3695 Nope... Go looking for an alternative... 3696 Start high so we don't immediately use raid0 if that's 3697 not taken. 
3698 */ 3699 3700 for(raidID = numraid - 1; raidID >= 0; raidID--) { 3701 if (raidPtrs[raidID]->valid == 0) { 3702 /* can use this one! */ 3703 break; 3704 } 3705 } 3706 } 3707 3708 if (raidID < 0) { 3709 /* punt... */ 3710 printf("Unable to auto configure this set!\n"); 3711 printf("(Out of RAID devs!)\n"); 3712 free(config, M_RAIDFRAME); 3713 return(1); 3714 } 3715 3716 #ifdef DEBUG 3717 printf("Configuring raid%d:\n",raidID); 3718 #endif 3719 3720 raidPtr = raidPtrs[raidID]; 3721 3722 /* XXX all this stuff should be done SOMEWHERE ELSE! */ 3723 raidPtr->raidid = raidID; 3724 raidPtr->openings = RAIDOUTSTANDING; 3725 3726 /* 3. Build the configuration structure */ 3727 rf_create_configuration(cset->ac, config, raidPtr); 3728 3729 /* 4. Do the configuration */ 3730 retcode = rf_Configure(raidPtr, config, cset->ac); 3731 3732 if (retcode == 0) { 3733 3734 raidinit(raidPtrs[raidID]); 3735 3736 rf_markalldirty(raidPtrs[raidID]); 3737 raidPtrs[raidID]->autoconfigure = 1; /* XXX do this here? */ 3738 if (cset->ac->clabel->root_partition==1) { 3739 /* everything configured just fine. Make a note 3740 that this set is eligible to be root. */ 3741 cset->rootable = 1; 3742 /* XXX do this here? */ 3743 raidPtrs[raidID]->root_partition = 1; 3744 } 3745 } 3746 3747 /* 5. 
Cleanup */ 3748 free(config, M_RAIDFRAME); 3749 3750 *unit = raidID; 3751 return(retcode); 3752 } 3753 3754 void 3755 rf_disk_unbusy(RF_RaidAccessDesc_t *desc) 3756 { 3757 struct buf *bp; 3758 3759 bp = (struct buf *)desc->bp; 3760 disk_unbusy(&raid_softc[desc->raidPtr->raidid].sc_dkdev, 3761 (bp->b_bcount - bp->b_resid), (bp->b_flags & B_READ)); 3762 } 3763 3764 void 3765 rf_pool_init(struct pool *p, size_t size, const char *w_chan, 3766 size_t xmin, size_t xmax) 3767 { 3768 pool_init(p, size, 0, 0, 0, w_chan, NULL, IPL_BIO); 3769 pool_sethiwat(p, xmax); 3770 pool_prime(p, xmin); 3771 pool_setlowat(p, xmin); 3772 } 3773 3774 /* 3775 * rf_buf_queue_check(int raidid) -- looks into the buf_queue to see 3776 * if there is IO pending and if that IO could possibly be done for a 3777 * given RAID set. Returns 0 if IO is waiting and can be done, 1 3778 * otherwise. 3779 * 3780 */ 3781 3782 int 3783 rf_buf_queue_check(int raidid) 3784 { 3785 if ((bufq_peek(raid_softc[raidid].buf_queue) != NULL) && 3786 raidPtrs[raidid]->openings > 0) { 3787 /* there is work to do */ 3788 return 0; 3789 } 3790 /* default is nothing to do */ 3791 return 1; 3792 } 3793 3794 int 3795 rf_getdisksize(struct vnode *vp, RF_RaidDisk_t *diskPtr) 3796 { 3797 uint64_t numsecs; 3798 unsigned secsize; 3799 int error; 3800 3801 error = getdisksize(vp, &numsecs, &secsize); 3802 if (error == 0) { 3803 diskPtr->blockSize = secsize; 3804 diskPtr->numBlocks = numsecs - rf_protectedSectors; 3805 diskPtr->partitionSize = numsecs; 3806 return 0; 3807 } 3808 return error; 3809 } 3810 3811 static int 3812 raid_match(device_t self, cfdata_t cfdata, void *aux) 3813 { 3814 return 1; 3815 } 3816 3817 static void 3818 raid_attach(device_t parent, device_t self, void *aux) 3819 { 3820 3821 } 3822 3823 3824 static int 3825 raid_detach(device_t self, int flags) 3826 { 3827 int error; 3828 struct raid_softc *rs = &raid_softc[device_unit(self)]; 3829 3830 if ((error = raidlock(rs)) != 0) 3831 return (error); 3832 3833 error 
= raid_detach_unlocked(rs); 3834 3835 raidunlock(rs); 3836 3837 return error; 3838 } 3839 3840 static void 3841 rf_set_properties(struct raid_softc *rs, RF_Raid_t *raidPtr) 3842 { 3843 prop_dictionary_t disk_info, odisk_info, geom; 3844 disk_info = prop_dictionary_create(); 3845 geom = prop_dictionary_create(); 3846 prop_dictionary_set_uint64(geom, "sectors-per-unit", 3847 raidPtr->totalSectors); 3848 prop_dictionary_set_uint32(geom, "sector-size", 3849 raidPtr->bytesPerSector); 3850 3851 prop_dictionary_set_uint16(geom, "sectors-per-track", 3852 raidPtr->Layout.dataSectorsPerStripe); 3853 prop_dictionary_set_uint16(geom, "tracks-per-cylinder", 3854 4 * raidPtr->numCol); 3855 3856 prop_dictionary_set_uint64(geom, "cylinders-per-unit", 3857 raidPtr->totalSectors / (raidPtr->Layout.dataSectorsPerStripe * 3858 (4 * raidPtr->numCol))); 3859 3860 prop_dictionary_set(disk_info, "geometry", geom); 3861 prop_object_release(geom); 3862 prop_dictionary_set(device_properties(rs->sc_dev), 3863 "disk-info", disk_info); 3864 odisk_info = rs->sc_dkdev.dk_info; 3865 rs->sc_dkdev.dk_info = disk_info; 3866 if (odisk_info) 3867 prop_object_release(odisk_info); 3868 } 3869 3870 /* 3871 * Implement forwarding of the DIOCCACHESYNC ioctl to each of the components. 3872 * We end up returning whatever error was returned by the first cache flush 3873 * that fails. 
3874 */ 3875 3876 int 3877 rf_sync_component_caches(RF_Raid_t *raidPtr) 3878 { 3879 int c, sparecol; 3880 int e,error; 3881 int force = 1; 3882 3883 error = 0; 3884 for (c = 0; c < raidPtr->numCol; c++) { 3885 if (raidPtr->Disks[c].status == rf_ds_optimal) { 3886 e = VOP_IOCTL(raidPtr->raid_cinfo[c].ci_vp, DIOCCACHESYNC, 3887 &force, FWRITE, NOCRED); 3888 if (e) { 3889 if (e != ENODEV) 3890 printf("raid%d: cache flush to component %s failed.\n", 3891 raidPtr->raidid, raidPtr->Disks[c].devname); 3892 if (error == 0) { 3893 error = e; 3894 } 3895 } 3896 } 3897 } 3898 3899 for( c = 0; c < raidPtr->numSpare ; c++) { 3900 sparecol = raidPtr->numCol + c; 3901 /* Need to ensure that the reconstruct actually completed! */ 3902 if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) { 3903 e = VOP_IOCTL(raidPtr->raid_cinfo[sparecol].ci_vp, 3904 DIOCCACHESYNC, &force, FWRITE, NOCRED); 3905 if (e) { 3906 if (e != ENODEV) 3907 printf("raid%d: cache flush to component %s failed.\n", 3908 raidPtr->raidid, raidPtr->Disks[sparecol].devname); 3909 if (error == 0) { 3910 error = e; 3911 } 3912 } 3913 } 3914 } 3915 return error; 3916 } 3917