/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Driver for Virtual Disk.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/user.h>
#include <sys/uio.h>
#include <sys/proc.h>
#include <sys/t_lock.h>
#include <sys/dkio.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/mkdev.h>
#include <sys/vtoc.h>
#include <sys/open.h>
#include <sys/file.h>
#include <vm/page.h>
#include <sys/callb.h>
#include <sys/disp.h>
#include <sys/modctl.h>
#include <sys/errno.h>
#include <sys/door.h>
#include <sys/lvm/mdmn_commd.h>
#include <sys/lvm/md_hotspares.h>

#include <sys/lvm/mdvar.h>
#include <sys/lvm/md_names.h>

#include <sys/ddi.h>
#include <sys/proc.h>
#include <sys/sunddi.h>
#include <sys/esunddi.h>

#include <sys/sysevent.h>
#include <sys/sysevent/eventdefs.h>

#include <sys/sysevent/svm.h>
#include <sys/lvm/md_basic.h>


/*
 * Machine specific Hertz is kept here
 */
extern clock_t md_hz;

/*
 * Externs.
 */
extern int (*mdv_strategy_tstpnt)(buf_t *, int, void *);
extern major_t md_major;
extern unit_t md_nunits;
extern set_t md_nsets;
extern md_set_t md_set[];
extern md_set_io_t md_set_io[];
extern md_ops_t **md_ops;
extern md_ops_t *md_opslist;
extern ddi_modhandle_t *md_mods;

extern md_krwlock_t md_unit_array_rw;
extern kmutex_t md_mx;
extern kcondvar_t md_cv;

extern md_krwlock_t hsp_rwlp;
extern md_krwlock_t ni_rwlp;

extern int md_num_daemons;
extern int md_status;
extern int md_ioctl_cnt;
extern int md_mtioctl_cnt;

extern struct metatransops metatransops;
extern md_event_queue_t *md_event_queue;
extern md_resync_t md_cpr_resync;
extern int md_done_daemon_threads;
extern int md_ff_daemon_threads;


extern mddb_set_t *mddb_setenter(set_t setno, int flag, int *errorcodep);
extern void mddb_setexit(mddb_set_t *s);
extern void *lookup_entry(struct nm_next_hdr *, set_t,
		side_t, mdkey_t, md_dev64_t, int);
extern struct nm_next_hdr *get_first_record(set_t, int, int);

struct mdq_anchor md_done_daemon;	/* done request queue */
struct mdq_anchor md_mstr_daemon;	/* mirror timeout requests */
struct mdq_anchor md_mhs_daemon;	/* mirror hotspare requests queue */
struct mdq_anchor md_hs_daemon;		/* raid hotspare requests queue */
struct mdq_anchor md_ff_daemonq;	/* failfast request queue */
struct mdq_anchor md_mirror_daemon;	/* mirror owner queue */
struct mdq_anchor md_mirror_io_daemon;	/* mirror owner i/o queue */
struct mdq_anchor md_mirror_rs_daemon;	/* mirror resync done queue */
struct mdq_anchor md_sp_daemon;		/* soft-part error daemon queue */

int md_done_daemon_threads = 1;	/* threads for md_done_daemon requestq */
int md_mstr_daemon_threads = 1;	/* threads for md_mstr_daemon requestq */
int md_mhs_daemon_threads = 1;	/* threads for md_mhs_daemon requestq */
int md_hs_daemon_threads = 1;	/* threads for md_hs_daemon requestq */
int md_ff_daemon_threads = 3;	/* threads for md_ff_daemon requestq */
int md_mirror_daemon_threads = 1;	/* threads for md_mirror_daemon requestq */
int md_sp_daemon_threads = 1;	/* threads for md_sp_daemon requestq */

#ifdef DEBUG
/* Flag to switch on debug messages */
int md_release_reacquire_debug = 0;	/* debug flag */
#endif

/*
 *
 * md_daemon_queues is a table of pointers to the request queues and the
 * number of threads associated with each request queue.
 * When the number of threads for a queue is set to 1, requests on that
 * queue are executed sequentially.
 * The number of threads for each queue has been defined as a global
 * variable to enable kernel tuning.
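 * For example, the failfast queue could be given more threads with a
 * line like the following in /etc/system (hypothetical tuning example):
 *	set md:md_ff_daemon_threads = 5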
147 * 148 */ 149 150 #define MD_DAEMON_QUEUES 10 151 152 md_requestq_entry_t md_daemon_queues[MD_DAEMON_QUEUES] = { 153 {&md_done_daemon, &md_done_daemon_threads}, 154 {&md_mstr_daemon, &md_mstr_daemon_threads}, 155 {&md_hs_daemon, &md_hs_daemon_threads}, 156 {&md_ff_daemonq, &md_ff_daemon_threads}, 157 {&md_mirror_daemon, &md_mirror_daemon_threads}, 158 {&md_mirror_io_daemon, &md_mirror_daemon_threads}, 159 {&md_mirror_rs_daemon, &md_mirror_daemon_threads}, 160 {&md_sp_daemon, &md_sp_daemon_threads}, 161 {&md_mhs_daemon, &md_mhs_daemon_threads}, 162 {0, 0} 163 }; 164 165 /* 166 * Number of times a message is retried before issuing a warning to the operator 167 */ 168 #define MD_MN_WARN_INTVL 10 169 170 /* 171 * Setting retry cnt to one (pre decremented) so that we actually do no 172 * retries when committing/deleting a mddb rec. The underlying disk driver 173 * does several retries to check if the disk is really dead or not so there 174 * is no reason for us to retry on top of the drivers retries. 175 */ 176 177 uint_t md_retry_cnt = 1; /* global so it can be patched */ 178 179 /* 180 * Bug # 1212146 181 * Before this change the user had to pass in a short aligned buffer because of 182 * problems in some underlying device drivers. This problem seems to have been 183 * corrected in the underlying drivers so we will default to not requiring any 184 * alignment. If the user needs to check for a specific alignment, 185 * md_uio_alignment_mask may be set in /etc/system to accomplish this. To get 186 * the behavior before this fix, the md_uio_alignment_mask would be set to 1, 187 * to check for word alignment, it can be set to 3, for double word alignment, 188 * it can be set to 7, etc. 189 * 190 * [Other part of fix is in function md_chk_uio()] 191 */ 192 static int md_uio_alignment_mask = 0; 193 194 /* 195 * for md_dev64_t translation 196 */ 197 struct md_xlate_table *md_tuple_table; 198 struct md_xlate_major_table *md_major_tuple_table; 199 int md_tuple_length; 200 uint_t md_majortab_len; 201 202 /* Function declarations */ 203 204 static int md_create_probe_rqlist(md_probedev_impl_t *plist, 205 daemon_queue_t **hdr, intptr_t (*probe_test)()); 206 207 /* 208 * manipulate global status 209 */ 210 void 211 md_set_status(int bits) 212 { 213 mutex_enter(&md_mx); 214 md_status |= bits; 215 mutex_exit(&md_mx); 216 } 217 218 void 219 md_clr_status(int bits) 220 { 221 mutex_enter(&md_mx); 222 md_status &= ~bits; 223 mutex_exit(&md_mx); 224 } 225 226 int 227 md_get_status() 228 { 229 int result; 230 mutex_enter(&md_mx); 231 result = md_status; 232 mutex_exit(&md_mx); 233 return (result); 234 } 235 236 void 237 md_set_setstatus(set_t setno, int bits) 238 { 239 ASSERT(setno != MD_SET_BAD && setno < MD_MAXSETS); 240 241 mutex_enter(&md_mx); 242 md_set[setno].s_status |= bits; 243 mutex_exit(&md_mx); 244 } 245 246 void 247 md_clr_setstatus(set_t setno, int bits) 248 { 249 ASSERT(setno != MD_SET_BAD && setno < MD_MAXSETS); 250 251 mutex_enter(&md_mx); 252 md_set[setno].s_status &= ~bits; 253 mutex_exit(&md_mx); 254 } 255 256 uint_t 257 md_get_setstatus(set_t setno) 258 { 259 uint_t result; 260 261 ASSERT(setno != MD_SET_BAD && setno < MD_MAXSETS); 262 263 mutex_enter(&md_mx); 264 result = md_set[setno].s_status; 265 mutex_exit(&md_mx); 266 return (result); 267 } 268 269 /* 270 * md_unit_readerlock_common: 271 * ------------------------- 272 * Mark the given unit as having a reader reference. Spin waiting for any 273 * writer references to be released. 
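 * A typical caller (sketch) pairs this with md_unit_readerexit(), e.g.:
 *	un = (md_unit_t *)md_unit_readerlock(ui);
 *	... examine unit state under the reader lock ...
 *	md_unit_readerexit(ui);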
274 * 275 * Input: 276 * ui unit reference 277 * lock_held 0 => ui_mx needs to be grabbed 278 * 1 => ui_mx already held 279 * Output: 280 * mm_unit_t corresponding to unit structure 281 * ui->ui_readercnt incremented 282 */ 283 static void * 284 md_unit_readerlock_common(mdi_unit_t *ui, int lock_held) 285 { 286 uint_t flag = MD_UL_WRITER | MD_UL_WANABEWRITER; 287 288 if (!lock_held) 289 mutex_enter(&ui->ui_mx); 290 while (ui->ui_lock & flag) { 291 if (panicstr) { 292 if (ui->ui_lock & MD_UL_WRITER) 293 panic("md: writer lock is held"); 294 break; 295 } 296 cv_wait(&ui->ui_cv, &ui->ui_mx); 297 } 298 ui->ui_readercnt++; 299 if (!lock_held) 300 mutex_exit(&ui->ui_mx); 301 return (MD_UNIT(ui->ui_link.ln_id)); 302 } 303 304 void * 305 md_unit_readerlock(mdi_unit_t *ui) 306 { 307 return (md_unit_readerlock_common(ui, 0)); 308 } 309 310 /* 311 * md_unit_writerlock_common: 312 * ------------------------- 313 * Acquire a unique writer reference. Causes previous readers to drain. 314 * Spins if a writer reference already exists or if a previous reader/writer 315 * dropped the lock to allow a ksend_message to be despatched. 316 * 317 * Input: 318 * ui unit reference 319 * lock_held 0 => grab ui_mx 320 * 1 => ui_mx already held on entry 321 * Output: 322 * mm_unit_t reference 323 */ 324 static void * 325 md_unit_writerlock_common(mdi_unit_t *ui, int lock_held) 326 { 327 uint_t flag = MD_UL_WRITER; 328 329 if (panicstr) 330 panic("md: writer lock not allowed"); 331 332 if (!lock_held) 333 mutex_enter(&ui->ui_mx); 334 335 while ((ui->ui_lock & flag) || (ui->ui_readercnt != 0)) { 336 ui->ui_wanabecnt++; 337 ui->ui_lock |= MD_UL_WANABEWRITER; 338 cv_wait(&ui->ui_cv, &ui->ui_mx); 339 if (--ui->ui_wanabecnt == 0) 340 ui->ui_lock &= ~MD_UL_WANABEWRITER; 341 } 342 ui->ui_lock |= MD_UL_WRITER; 343 ui->ui_owner = curthread; 344 345 if (!lock_held) 346 mutex_exit(&ui->ui_mx); 347 return (MD_UNIT(ui->ui_link.ln_id)); 348 } 349 350 void * 351 md_unit_writerlock(mdi_unit_t *ui) 352 { 353 return (md_unit_writerlock_common(ui, 0)); 354 } 355 356 /* 357 * md_unit_readerexit_common: 358 * ------------------------- 359 * Release the readerlock for the specified unit. If the reader count reaches 360 * zero and there are waiting writers (MD_UL_WANABEWRITER set) wake them up. 361 * 362 * Input: 363 * ui unit reference 364 * lock_held 0 => ui_mx needs to be acquired 365 * 1 => ui_mx already held 366 */ 367 static void 368 md_unit_readerexit_common(mdi_unit_t *ui, int lock_held) 369 { 370 if (!lock_held) 371 mutex_enter(&ui->ui_mx); 372 ASSERT((ui->ui_lock & MD_UL_WRITER) == 0); 373 ASSERT(ui->ui_readercnt != 0); 374 ui->ui_readercnt--; 375 if ((ui->ui_wanabecnt != 0) && (ui->ui_readercnt == 0)) 376 cv_broadcast(&ui->ui_cv); 377 378 if (!lock_held) 379 mutex_exit(&ui->ui_mx); 380 } 381 382 void 383 md_unit_readerexit(mdi_unit_t *ui) 384 { 385 md_unit_readerexit_common(ui, 0); 386 } 387 388 /* 389 * md_unit_writerexit_common: 390 * ------------------------- 391 * Release the writerlock currently held on the unit. Wake any threads waiting 392 * on becoming reader or writer (MD_UL_WANABEWRITER set). 
393 * 394 * Input: 395 * ui unit reference 396 * lock_held 0 => ui_mx to be acquired 397 * 1 => ui_mx already held 398 */ 399 static void 400 md_unit_writerexit_common(mdi_unit_t *ui, int lock_held) 401 { 402 if (!lock_held) 403 mutex_enter(&ui->ui_mx); 404 ASSERT((ui->ui_lock & MD_UL_WRITER) != 0); 405 ASSERT(ui->ui_readercnt == 0); 406 ui->ui_lock &= ~MD_UL_WRITER; 407 ui->ui_owner = NULL; 408 409 cv_broadcast(&ui->ui_cv); 410 if (!lock_held) 411 mutex_exit(&ui->ui_mx); 412 } 413 414 void 415 md_unit_writerexit(mdi_unit_t *ui) 416 { 417 md_unit_writerexit_common(ui, 0); 418 } 419 420 void * 421 md_io_readerlock(mdi_unit_t *ui) 422 { 423 md_io_lock_t *io = ui->ui_io_lock; 424 425 ASSERT(io); /* checks case where no io lock allocated */ 426 mutex_enter(&io->io_mx); 427 while (io->io_lock & (MD_UL_WRITER | MD_UL_WANABEWRITER)) { 428 if (panicstr) { 429 if (io->io_lock & MD_UL_WRITER) 430 panic("md: writer lock is held"); 431 break; 432 } 433 cv_wait(&io->io_cv, &io->io_mx); 434 } 435 io->io_readercnt++; 436 mutex_exit(&io->io_mx); 437 return (MD_UNIT(ui->ui_link.ln_id)); 438 } 439 440 void * 441 md_io_writerlock(mdi_unit_t *ui) 442 { 443 md_io_lock_t *io = ui->ui_io_lock; 444 445 ASSERT(io); /* checks case where no io lock allocated */ 446 if (panicstr) 447 panic("md: writer lock not allowed"); 448 449 mutex_enter(&io->io_mx); 450 while ((io->io_lock & MD_UL_WRITER) || (io->io_readercnt != 0)) { 451 io->io_wanabecnt++; 452 io->io_lock |= MD_UL_WANABEWRITER; 453 cv_wait(&io->io_cv, &io->io_mx); 454 if (--io->io_wanabecnt == 0) 455 io->io_lock &= ~MD_UL_WANABEWRITER; 456 } 457 io->io_lock |= MD_UL_WRITER; 458 io->io_owner = curthread; 459 460 mutex_exit(&io->io_mx); 461 return (MD_UNIT(ui->ui_link.ln_id)); 462 } 463 464 void 465 md_io_readerexit(mdi_unit_t *ui) 466 { 467 md_io_lock_t *io = ui->ui_io_lock; 468 469 mutex_enter(&io->io_mx); 470 ASSERT((io->io_lock & MD_UL_WRITER) == 0); 471 ASSERT(io->io_readercnt != 0); 472 io->io_readercnt--; 473 if ((io->io_wanabecnt != 0) && (io->io_readercnt == 0)) { 474 cv_broadcast(&io->io_cv); 475 } 476 mutex_exit(&io->io_mx); 477 } 478 479 void 480 md_io_writerexit(mdi_unit_t *ui) 481 { 482 md_io_lock_t *io = ui->ui_io_lock; 483 484 mutex_enter(&io->io_mx); 485 ASSERT((io->io_lock & MD_UL_WRITER) != 0); 486 ASSERT(io->io_readercnt == 0); 487 io->io_lock &= ~MD_UL_WRITER; 488 io->io_owner = NULL; 489 490 cv_broadcast(&io->io_cv); 491 mutex_exit(&io->io_mx); 492 } 493 494 /* 495 * Attempt to grab that set of locks defined as global. 496 * A mask containing the set of global locks that are owned upon 497 * entry is input. Any additional global locks are then grabbed. 498 * This keeps the caller from having to know the set of global 499 * locks. 500 */ 501 static int 502 md_global_lock_enter(int global_locks_owned_mask) 503 { 504 505 /* 506 * The current implementation has been verified by inspection 507 * and test to be deadlock free. If another global lock is 508 * added, changing the algorithm used by this function should 509 * be considered. With more than 2 locks it is difficult to 510 * guarantee that locks are being acquired in the correct order. 511 * The safe approach would be to drop all of the locks that are 512 * owned at function entry and then reacquire all of the locks 513 * in the order defined by the lock hierarchy. 
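	 * As an example of the mask convention, md_ioctl_lock_enter() below
	 * passes ~MD_GBL_IOCTL_LOCK (i.e. "everything but the ioctl lock is
	 * already owned"), so only the ioctl lock is acquired here.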
514 */ 515 mutex_enter(&md_mx); 516 if (!(global_locks_owned_mask & MD_GBL_IOCTL_LOCK)) { 517 while ((md_mtioctl_cnt != 0) || 518 (md_status & MD_GBL_IOCTL_LOCK)) { 519 if (cv_wait_sig_swap(&md_cv, &md_mx) == 0) { 520 mutex_exit(&md_mx); 521 return (EINTR); 522 } 523 } 524 md_status |= MD_GBL_IOCTL_LOCK; 525 md_ioctl_cnt++; 526 } 527 if (!(global_locks_owned_mask & MD_GBL_HS_LOCK)) { 528 while (md_status & MD_GBL_HS_LOCK) { 529 if (cv_wait_sig_swap(&md_cv, &md_mx) == 0) { 530 md_status &= ~MD_GBL_IOCTL_LOCK; 531 mutex_exit(&md_mx); 532 return (EINTR); 533 } 534 } 535 md_status |= MD_GBL_HS_LOCK; 536 } 537 mutex_exit(&md_mx); 538 return (0); 539 } 540 541 /* 542 * Release the set of global locks that were grabbed in md_global_lock_enter 543 * that were not already owned by the calling thread. The set of previously 544 * owned global locks is passed in as a mask parameter. 545 */ 546 static int 547 md_global_lock_exit(int global_locks_owned_mask, int code, 548 int flags, mdi_unit_t *ui) 549 { 550 mutex_enter(&md_mx); 551 552 /* If MT ioctl decrement mt_ioctl_cnt */ 553 if ((flags & MD_MT_IOCTL)) { 554 md_mtioctl_cnt--; 555 } else { 556 if (!(global_locks_owned_mask & MD_GBL_IOCTL_LOCK)) { 557 /* clear the lock and decrement count */ 558 ASSERT(md_ioctl_cnt == 1); 559 md_ioctl_cnt--; 560 md_status &= ~MD_GBL_IOCTL_LOCK; 561 } 562 if (!(global_locks_owned_mask & MD_GBL_HS_LOCK)) 563 md_status &= ~MD_GBL_HS_LOCK; 564 } 565 if (flags & MD_READER_HELD) 566 md_unit_readerexit(ui); 567 if (flags & MD_WRITER_HELD) 568 md_unit_writerexit(ui); 569 if (flags & MD_IO_HELD) 570 md_io_writerexit(ui); 571 if (flags & (MD_ARRAY_WRITER | MD_ARRAY_READER)) { 572 rw_exit(&md_unit_array_rw.lock); 573 } 574 cv_broadcast(&md_cv); 575 mutex_exit(&md_mx); 576 577 return (code); 578 } 579 580 /* 581 * The two functions, md_ioctl_lock_enter, and md_ioctl_lock_exit make 582 * use of the md_global_lock_{enter|exit} functions to avoid duplication 583 * of code. They rely upon the fact that the locks that are specified in 584 * the input mask are not acquired or freed. If this algorithm changes 585 * as described in the block comment at the beginning of md_global_lock_enter 586 * then it will be necessary to change these 2 functions. Otherwise these 587 * functions will be grabbing and holding global locks unnecessarily. 588 */ 589 int 590 md_ioctl_lock_enter(void) 591 { 592 /* grab only the ioctl lock */ 593 return (md_global_lock_enter(~MD_GBL_IOCTL_LOCK)); 594 } 595 596 /* 597 * If md_ioctl_lock_exit is being called at the end of an ioctl before 598 * returning to user space, then ioctl_end is set to 1. 599 * Otherwise, the ioctl lock is being dropped in the middle of handling 600 * an ioctl and will be reacquired before the end of the ioctl. 601 * Do not attempt to process the MN diskset mddb parse flags unless 602 * ioctl_end is true - otherwise a deadlock situation could arise. 603 */ 604 int 605 md_ioctl_lock_exit(int code, int flags, mdi_unit_t *ui, int ioctl_end) 606 { 607 int ret_val; 608 uint_t status; 609 mddb_set_t *s; 610 int i; 611 int err; 612 md_mn_msg_mddb_parse_t *mddb_parse_msg; 613 md_mn_kresult_t *kresult; 614 mddb_lb_t *lbp; 615 int rval = 1; 616 int flag; 617 618 /* release only the ioctl lock */ 619 ret_val = md_global_lock_exit(~MD_GBL_IOCTL_LOCK, code, flags, ui); 620 621 /* 622 * If md_ioctl_lock_exit is being called with a possible lock held 623 * (ioctl_end is 0), then don't check the MN disksets since the 624 * call to mddb_setenter may cause a lock ordering deadlock. 
625 */ 626 if (!ioctl_end) 627 return (ret_val); 628 629 /* 630 * Walk through disksets to see if there is a MN diskset that 631 * has messages that need to be sent. Set must be snarfed and 632 * be a MN diskset in order to be checked. 633 * 634 * In a MN diskset, this routine may send messages to the 635 * rpc.mdcommd in order to have the slave nodes re-parse parts 636 * of the mddb. Messages can only be sent with no locks held, 637 * so if mddb change occurred while the ioctl lock is held, this 638 * routine must send the messages. 639 */ 640 for (i = 1; i < md_nsets; i++) { 641 status = md_get_setstatus(i); 642 643 /* Set must be snarfed and be a MN diskset */ 644 if ((status & (MD_SET_SNARFED | MD_SET_MNSET)) != 645 (MD_SET_SNARFED | MD_SET_MNSET)) 646 continue; 647 648 /* Grab set lock so that set can't change */ 649 if ((s = mddb_setenter(i, MDDB_MUSTEXIST, &err)) == NULL) 650 continue; 651 652 lbp = s->s_lbp; 653 654 /* Re-get set status now that lock is held */ 655 status = md_get_setstatus(i); 656 657 /* 658 * If MN parsing block flag is set - continue to next set. 659 * 660 * If s_mn_parseflags_sending is non-zero, then another thread 661 * is already currently sending a parse message, so just 662 * release the set mutex. If this ioctl had caused an mddb 663 * change that results in a parse message to be generated, 664 * the thread that is currently sending a parse message would 665 * generate the additional parse message. 666 * 667 * If s_mn_parseflags_sending is zero then loop until 668 * s_mn_parseflags is 0 (until there are no more 669 * messages to send). 670 * While s_mn_parseflags is non-zero, 671 * put snapshot of parse_flags in s_mn_parseflags_sending 672 * set s_mn_parseflags to zero 673 * release set mutex 674 * send message 675 * re-grab set mutex 676 * set s_mn_parseflags_sending to zero 677 * 678 * If set is STALE, send message with NO_LOG flag so that 679 * rpc.mdcommd won't attempt to log message to non-writeable 680 * replica. 681 */ 682 mddb_parse_msg = kmem_zalloc(sizeof (md_mn_msg_mddb_parse_t), 683 KM_SLEEP); 684 while (((s->s_mn_parseflags_sending & MDDB_PARSE_MASK) == 0) && 685 (s->s_mn_parseflags & MDDB_PARSE_MASK) && 686 (!(status & MD_SET_MNPARSE_BLK))) { 687 688 /* Grab snapshot of parse flags */ 689 s->s_mn_parseflags_sending = s->s_mn_parseflags; 690 s->s_mn_parseflags = 0; 691 692 mutex_exit(&md_set[(s)->s_setno].s_dbmx); 693 694 /* 695 * Send the message to the slaves to re-parse 696 * the indicated portions of the mddb. Send the status 697 * of the 50 mddbs in this set so that slaves know 698 * which mddbs that the master node thinks are 'good'. 699 * Otherwise, slave may reparse, but from wrong 700 * replica. 701 */ 702 mddb_parse_msg->msg_parse_flags = 703 s->s_mn_parseflags_sending; 704 705 for (i = 0; i < MDDB_NLB; i++) { 706 mddb_parse_msg->msg_lb_flags[i] = 707 lbp->lb_locators[i].l_flags; 708 } 709 kresult = kmem_zalloc(sizeof (md_mn_kresult_t), 710 KM_SLEEP); 711 while (rval != 0) { 712 flag = 0; 713 if (status & MD_SET_STALE) 714 flag |= MD_MSGF_NO_LOG; 715 rval = mdmn_ksend_message(s->s_setno, 716 MD_MN_MSG_MDDB_PARSE, flag, 717 (char *)mddb_parse_msg, 718 sizeof (mddb_parse_msg), kresult); 719 /* if the node hasn't yet joined, it's Ok. 
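				 * (MDMNE_NOT_JOINED from the commd is
				 * therefore not treated as a send failure
				 * below.)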
*/ 720 if ((!MDMN_KSEND_MSG_OK(rval, kresult)) && 721 (kresult->kmmr_comm_state != 722 MDMNE_NOT_JOINED)) { 723 mdmn_ksend_show_error(rval, kresult, 724 "MD_MN_MSG_MDDB_PARSE"); 725 cmn_err(CE_WARN, "md_ioctl_lock_exit: " 726 "Unable to send mddb update " 727 "message to other nodes in " 728 "diskset %s\n", s->s_setname); 729 rval = 1; 730 } 731 } 732 kmem_free(kresult, sizeof (md_mn_kresult_t)); 733 734 /* 735 * Re-grab mutex to clear sending field and to 736 * see if another parse message needs to be generated. 737 */ 738 mutex_enter(&md_set[(s)->s_setno].s_dbmx); 739 s->s_mn_parseflags_sending = 0; 740 } 741 kmem_free(mddb_parse_msg, sizeof (md_mn_msg_mddb_parse_t)); 742 mutex_exit(&md_set[(s)->s_setno].s_dbmx); 743 } 744 return (ret_val); 745 } 746 747 /* 748 * Called when in an ioctl and need readerlock. 749 */ 750 void * 751 md_ioctl_readerlock(IOLOCK *lock, mdi_unit_t *ui) 752 { 753 ASSERT(lock != NULL); 754 lock->l_ui = ui; 755 lock->l_flags |= MD_READER_HELD; 756 return (md_unit_readerlock_common(ui, 0)); 757 } 758 759 /* 760 * Called when in an ioctl and need writerlock. 761 */ 762 void * 763 md_ioctl_writerlock(IOLOCK *lock, mdi_unit_t *ui) 764 { 765 ASSERT(lock != NULL); 766 lock->l_ui = ui; 767 lock->l_flags |= MD_WRITER_HELD; 768 return (md_unit_writerlock_common(ui, 0)); 769 } 770 771 void * 772 md_ioctl_io_lock(IOLOCK *lock, mdi_unit_t *ui) 773 { 774 ASSERT(lock != NULL); 775 lock->l_ui = ui; 776 lock->l_flags |= MD_IO_HELD; 777 return (md_io_writerlock(ui)); 778 } 779 780 void 781 md_ioctl_readerexit(IOLOCK *lock) 782 { 783 ASSERT(lock != NULL); 784 lock->l_flags &= ~MD_READER_HELD; 785 md_unit_readerexit(lock->l_ui); 786 } 787 788 void 789 md_ioctl_writerexit(IOLOCK *lock) 790 { 791 ASSERT(lock != NULL); 792 lock->l_flags &= ~MD_WRITER_HELD; 793 md_unit_writerexit(lock->l_ui); 794 } 795 796 void 797 md_ioctl_io_exit(IOLOCK *lock) 798 { 799 ASSERT(lock != NULL); 800 lock->l_flags &= ~MD_IO_HELD; 801 md_io_writerexit(lock->l_ui); 802 } 803 804 /* 805 * md_ioctl_releaselocks: 806 * -------------------- 807 * Release the unit locks that are held and stop subsequent 808 * md_unit_reader/writerlock calls from progressing. This allows the caller 809 * to send messages across the cluster when running in a multinode 810 * environment. 811 * ioctl originated locks (via md_ioctl_readerlock/md_ioctl_writerlock) are 812 * allowed to progress as normal. This is required as these typically are 813 * invoked by the message handler that may be called while a unit lock is 814 * marked as released. 815 * 816 * On entry: 817 * variety of unit locks may be held including ioctl lock 818 * 819 * On exit: 820 * locks released and unit structure updated to prevent subsequent reader/ 821 * writer locks being acquired until md_ioctl_reacquirelocks is called 822 */ 823 void 824 md_ioctl_releaselocks(int code, int flags, mdi_unit_t *ui) 825 { 826 /* This actually releases the locks. */ 827 (void) md_global_lock_exit(~MD_GBL_IOCTL_LOCK, code, flags, ui); 828 } 829 830 /* 831 * md_ioctl_reacquirelocks: 832 * ---------------------- 833 * Reacquire the locks that were held when md_ioctl_releaselocks 834 * was called. 835 * 836 * On entry: 837 * No unit locks held 838 * On exit: 839 * locks held that were held at md_ioctl_releaselocks time including 840 * the ioctl lock. 
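 * Typical multinode usage (sketch only, names illustrative):
 *	md_ioctl_releaselocks(0, flags, ui);
 *	rval = mdmn_ksend_message(setno, msgtype, 0, (char *)msg,
 *	    msgsize, kres);
 *	md_ioctl_reacquirelocks(flags, ui);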
841 */ 842 void 843 md_ioctl_reacquirelocks(int flags, mdi_unit_t *ui) 844 { 845 if (flags & MD_MT_IOCTL) { 846 mutex_enter(&md_mx); 847 md_mtioctl_cnt++; 848 mutex_exit(&md_mx); 849 } else { 850 while (md_ioctl_lock_enter() == EINTR); 851 } 852 if (flags & MD_ARRAY_WRITER) { 853 rw_enter(&md_unit_array_rw.lock, RW_WRITER); 854 } else if (flags & MD_ARRAY_READER) { 855 rw_enter(&md_unit_array_rw.lock, RW_READER); 856 } 857 if (ui != (mdi_unit_t *)NULL) { 858 if (flags & MD_IO_HELD) { 859 (void) md_io_writerlock(ui); 860 } 861 862 mutex_enter(&ui->ui_mx); 863 if (flags & MD_READER_HELD) { 864 (void) md_unit_readerlock_common(ui, 1); 865 } else if (flags & MD_WRITER_HELD) { 866 (void) md_unit_writerlock_common(ui, 1); 867 } 868 /* Wake up any blocked readerlock() calls */ 869 cv_broadcast(&ui->ui_cv); 870 mutex_exit(&ui->ui_mx); 871 } 872 } 873 874 void 875 md_ioctl_droplocks(IOLOCK *lock) 876 { 877 mdi_unit_t *ui; 878 int flags; 879 880 ASSERT(lock != NULL); 881 ui = lock->l_ui; 882 flags = lock->l_flags; 883 if (flags & MD_READER_HELD) { 884 lock->l_flags &= ~MD_READER_HELD; 885 md_unit_readerexit(ui); 886 } 887 if (flags & MD_WRITER_HELD) { 888 lock->l_flags &= ~MD_WRITER_HELD; 889 md_unit_writerexit(ui); 890 } 891 if (flags & MD_IO_HELD) { 892 lock->l_flags &= ~MD_IO_HELD; 893 md_io_writerexit(ui); 894 } 895 if (flags & (MD_ARRAY_WRITER | MD_ARRAY_READER)) { 896 lock->l_flags &= ~(MD_ARRAY_WRITER | MD_ARRAY_READER); 897 rw_exit(&md_unit_array_rw.lock); 898 } 899 } 900 901 void 902 md_array_writer(IOLOCK *lock) 903 { 904 ASSERT(lock != NULL); 905 lock->l_flags |= MD_ARRAY_WRITER; 906 rw_enter(&md_unit_array_rw.lock, RW_WRITER); 907 } 908 909 void 910 md_array_reader(IOLOCK *lock) 911 { 912 ASSERT(lock != NULL); 913 lock->l_flags |= MD_ARRAY_READER; 914 rw_enter(&md_unit_array_rw.lock, RW_READER); 915 } 916 917 /* 918 * Called when in an ioctl and need opencloselock. 919 * Sets flags in lockp for READER_HELD. 920 */ 921 void * 922 md_ioctl_openclose_enter(IOLOCK *lockp, mdi_unit_t *ui) 923 { 924 void *un; 925 926 ASSERT(lockp != NULL); 927 mutex_enter(&ui->ui_mx); 928 while (ui->ui_lock & MD_UL_OPENORCLOSE) 929 cv_wait(&ui->ui_cv, &ui->ui_mx); 930 ui->ui_lock |= MD_UL_OPENORCLOSE; 931 932 /* Maintain mutex across the readerlock call */ 933 lockp->l_ui = ui; 934 lockp->l_flags |= MD_READER_HELD; 935 un = md_unit_readerlock_common(ui, 1); 936 mutex_exit(&ui->ui_mx); 937 938 return (un); 939 } 940 941 /* 942 * Clears reader lock using md_ioctl instead of md_unit 943 * and updates lockp. 944 */ 945 void 946 md_ioctl_openclose_exit(IOLOCK *lockp) 947 { 948 mdi_unit_t *ui; 949 950 ASSERT(lockp != NULL); 951 ui = lockp->l_ui; 952 ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE); 953 954 md_ioctl_readerexit(lockp); 955 956 mutex_enter(&ui->ui_mx); 957 ui->ui_lock &= ~MD_UL_OPENORCLOSE; 958 959 cv_broadcast(&ui->ui_cv); 960 mutex_exit(&ui->ui_mx); 961 } 962 963 /* 964 * Clears reader lock using md_ioctl instead of md_unit 965 * and updates lockp. 966 * Does not acquire or release the ui_mx lock since the calling 967 * routine has already acquired this lock. 
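 * (The _lh suffix presumably stands for "lock held".)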
968 */ 969 void 970 md_ioctl_openclose_exit_lh(IOLOCK *lockp) 971 { 972 mdi_unit_t *ui; 973 974 ASSERT(lockp != NULL); 975 ui = lockp->l_ui; 976 ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE); 977 978 lockp->l_flags &= ~MD_READER_HELD; 979 md_unit_readerexit_common(lockp->l_ui, 1); 980 981 ui->ui_lock &= ~MD_UL_OPENORCLOSE; 982 cv_broadcast(&ui->ui_cv); 983 } 984 985 void * 986 md_unit_openclose_enter(mdi_unit_t *ui) 987 { 988 void *un; 989 990 mutex_enter(&ui->ui_mx); 991 while (ui->ui_lock & (MD_UL_OPENORCLOSE)) 992 cv_wait(&ui->ui_cv, &ui->ui_mx); 993 ui->ui_lock |= MD_UL_OPENORCLOSE; 994 995 /* Maintain mutex across the readerlock call */ 996 un = md_unit_readerlock_common(ui, 1); 997 mutex_exit(&ui->ui_mx); 998 999 return (un); 1000 } 1001 1002 void 1003 md_unit_openclose_exit(mdi_unit_t *ui) 1004 { 1005 md_unit_readerexit(ui); 1006 1007 mutex_enter(&ui->ui_mx); 1008 ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE); 1009 ui->ui_lock &= ~MD_UL_OPENORCLOSE; 1010 1011 cv_broadcast(&ui->ui_cv); 1012 mutex_exit(&ui->ui_mx); 1013 } 1014 1015 /* 1016 * Drop the openclose and readerlocks without acquiring or 1017 * releasing the ui_mx lock since the calling routine has 1018 * already acquired this lock. 1019 */ 1020 void 1021 md_unit_openclose_exit_lh(mdi_unit_t *ui) 1022 { 1023 md_unit_readerexit_common(ui, 1); 1024 ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE); 1025 ui->ui_lock &= ~MD_UL_OPENORCLOSE; 1026 cv_broadcast(&ui->ui_cv); 1027 } 1028 1029 int 1030 md_unit_isopen( 1031 mdi_unit_t *ui 1032 ) 1033 { 1034 int isopen; 1035 1036 /* check status */ 1037 mutex_enter(&ui->ui_mx); 1038 isopen = ((ui->ui_lock & MD_UL_OPEN) ? 1 : 0); 1039 mutex_exit(&ui->ui_mx); 1040 return (isopen); 1041 } 1042 1043 int 1044 md_unit_incopen( 1045 minor_t mnum, 1046 int flag, 1047 int otyp 1048 ) 1049 { 1050 mdi_unit_t *ui = MDI_UNIT(mnum); 1051 int err = 0; 1052 1053 /* check type and flags */ 1054 ASSERT(ui != NULL); 1055 mutex_enter(&ui->ui_mx); 1056 if ((otyp < 0) || (otyp >= OTYPCNT)) { 1057 err = EINVAL; 1058 goto out; 1059 } 1060 if (((flag & FEXCL) && (ui->ui_lock & MD_UL_OPEN)) || 1061 (ui->ui_lock & MD_UL_EXCL)) { 1062 err = EBUSY; 1063 goto out; 1064 } 1065 1066 /* count and flag open */ 1067 ui->ui_ocnt[otyp]++; 1068 ui->ui_lock |= MD_UL_OPEN; 1069 if (flag & FEXCL) 1070 ui->ui_lock |= MD_UL_EXCL; 1071 1072 /* setup kstat, return success */ 1073 mutex_exit(&ui->ui_mx); 1074 md_kstat_init(mnum); 1075 return (0); 1076 1077 /* return error */ 1078 out: 1079 mutex_exit(&ui->ui_mx); 1080 return (err); 1081 } 1082 1083 int 1084 md_unit_decopen( 1085 minor_t mnum, 1086 int otyp 1087 ) 1088 { 1089 mdi_unit_t *ui = MDI_UNIT(mnum); 1090 int err = 0; 1091 unsigned i; 1092 1093 /* check type and flags */ 1094 ASSERT(ui != NULL); 1095 mutex_enter(&ui->ui_mx); 1096 if ((otyp < 0) || (otyp >= OTYPCNT)) { 1097 err = EINVAL; 1098 goto out; 1099 } else if (ui->ui_ocnt[otyp] == 0) { 1100 err = ENXIO; 1101 goto out; 1102 } 1103 1104 /* count and flag closed */ 1105 if (otyp == OTYP_LYR) 1106 ui->ui_ocnt[otyp]--; 1107 else 1108 ui->ui_ocnt[otyp] = 0; 1109 ui->ui_lock &= ~MD_UL_OPEN; 1110 for (i = 0; (i < OTYPCNT); ++i) 1111 if (ui->ui_ocnt[i] != 0) 1112 ui->ui_lock |= MD_UL_OPEN; 1113 if (! (ui->ui_lock & MD_UL_OPEN)) 1114 ui->ui_lock &= ~MD_UL_EXCL; 1115 1116 /* teardown kstat, return success */ 1117 if (! 
(ui->ui_lock & MD_UL_OPEN)) { 1118 mutex_exit(&ui->ui_mx); 1119 md_kstat_destroy(mnum); 1120 return (0); 1121 } 1122 1123 /* return success */ 1124 out: 1125 mutex_exit(&ui->ui_mx); 1126 return (err); 1127 } 1128 1129 md_dev64_t 1130 md_xlate_targ_2_mini(md_dev64_t targ_devt) 1131 { 1132 dev32_t mini_32_devt, targ_32_devt; 1133 int i; 1134 1135 /* 1136 * check to see if we're in an upgrade situation 1137 * if we are not in upgrade just return the input device 1138 */ 1139 1140 if (!MD_UPGRADE) 1141 return (targ_devt); 1142 1143 targ_32_devt = md_cmpldev(targ_devt); 1144 1145 i = 0; 1146 while (i != md_tuple_length) { 1147 if (md_tuple_table[i].targ_devt == targ_32_devt) { 1148 mini_32_devt = md_tuple_table[i].mini_devt; 1149 return (md_expldev((md_dev64_t)mini_32_devt)); 1150 } 1151 i++; 1152 } 1153 return (NODEV64); 1154 } 1155 1156 md_dev64_t 1157 md_xlate_mini_2_targ(md_dev64_t mini_devt) 1158 { 1159 dev32_t mini_32_devt, targ_32_devt; 1160 int i; 1161 1162 if (!MD_UPGRADE) 1163 return (mini_devt); 1164 1165 mini_32_devt = md_cmpldev(mini_devt); 1166 1167 i = 0; 1168 while (i != md_tuple_length) { 1169 if (md_tuple_table[i].mini_devt == mini_32_devt) { 1170 targ_32_devt = md_tuple_table[i].targ_devt; 1171 return (md_expldev((md_dev64_t)targ_32_devt)); 1172 } 1173 i++; 1174 } 1175 return (NODEV64); 1176 } 1177 1178 void 1179 md_xlate_free(int size) 1180 { 1181 kmem_free(md_tuple_table, size); 1182 } 1183 1184 char * 1185 md_targ_major_to_name(major_t maj) 1186 { 1187 char *drv_name = NULL; 1188 int i; 1189 1190 if (!MD_UPGRADE) 1191 return (ddi_major_to_name(maj)); 1192 1193 for (i = 0; i < md_majortab_len; i++) { 1194 if (md_major_tuple_table[i].targ_maj == maj) { 1195 drv_name = md_major_tuple_table[i].drv_name; 1196 break; 1197 } 1198 } 1199 return (drv_name); 1200 } 1201 1202 major_t 1203 md_targ_name_to_major(char *drv_name) 1204 { 1205 major_t maj; 1206 int i; 1207 1208 maj = md_getmajor(NODEV64); 1209 if (!MD_UPGRADE) 1210 return (ddi_name_to_major(drv_name)); 1211 1212 for (i = 0; i < md_majortab_len; i++) { 1213 if ((strcmp(md_major_tuple_table[i].drv_name, 1214 drv_name)) == 0) { 1215 maj = md_major_tuple_table[i].targ_maj; 1216 break; 1217 } 1218 } 1219 1220 return (maj); 1221 } 1222 1223 void 1224 md_majortab_free() 1225 { 1226 size_t sz; 1227 int i; 1228 1229 for (i = 0; i < md_majortab_len; i++) { 1230 freestr(md_major_tuple_table[i].drv_name); 1231 } 1232 1233 sz = md_majortab_len * sizeof (struct md_xlate_major_table); 1234 kmem_free(md_major_tuple_table, sz); 1235 } 1236 1237 /* functions return a pointer to a function which returns an int */ 1238 1239 intptr_t (* 1240 md_get_named_service(md_dev64_t dev, int modindex, char *name, 1241 intptr_t (*Default)()))() 1242 { 1243 mdi_unit_t *ui; 1244 md_named_services_t *sp; 1245 int i; 1246 1247 /* 1248 * Return the first named service found. 1249 * Use this path when it is known that there is only 1250 * one named service possible (e.g., hotspare interface) 1251 */ 1252 if ((dev == NODEV64) && (modindex == ANY_SERVICE)) { 1253 for (i = 0; i < MD_NOPS; i++) { 1254 if (md_ops[i] == NULL) { 1255 continue; 1256 } 1257 sp = md_ops[i]->md_services; 1258 if (sp == NULL) 1259 continue; 1260 while (sp->md_service != NULL) { 1261 if (strcmp(name, sp->md_name) == 0) 1262 return (sp->md_service); 1263 sp++; 1264 } 1265 } 1266 return (Default); 1267 } 1268 1269 /* 1270 * Return the named service for the given modindex. 
1271 * This is used if there are multiple possible named services 1272 * and each one needs to be called (e.g., poke hotspares) 1273 */ 1274 if (dev == NODEV64) { 1275 if (modindex >= MD_NOPS) 1276 return (Default); 1277 1278 if (md_ops[modindex] == NULL) 1279 return (Default); 1280 1281 sp = md_ops[modindex]->md_services; 1282 if (sp == NULL) 1283 return (Default); 1284 1285 while (sp->md_service != NULL) { 1286 if (strcmp(name, sp->md_name) == 0) 1287 return (sp->md_service); 1288 sp++; 1289 } 1290 return (Default); 1291 } 1292 1293 /* 1294 * Return the named service for this md_dev64_t 1295 */ 1296 if (md_getmajor(dev) != md_major) 1297 return (Default); 1298 1299 if ((MD_MIN2SET(md_getminor(dev)) >= md_nsets) || 1300 (MD_MIN2UNIT(md_getminor(dev)) >= md_nunits)) 1301 return (NULL); 1302 1303 1304 if ((ui = MDI_UNIT(md_getminor(dev))) == NULL) 1305 return (NULL); 1306 1307 sp = md_ops[ui->ui_opsindex]->md_services; 1308 if (sp == NULL) 1309 return (Default); 1310 while (sp->md_service != NULL) { 1311 if (strcmp(name, sp->md_name) == 0) 1312 return (sp->md_service); 1313 sp++; 1314 } 1315 return (Default); 1316 } 1317 1318 /* 1319 * md_daemon callback routine 1320 */ 1321 boolean_t 1322 callb_md_cpr(void *arg, int code) 1323 { 1324 callb_cpr_t *cp = (callb_cpr_t *)arg; 1325 int ret = 0; /* assume success */ 1326 1327 mutex_enter(cp->cc_lockp); 1328 1329 switch (code) { 1330 case CB_CODE_CPR_CHKPT: 1331 /* 1332 * Check for active resync threads 1333 */ 1334 mutex_enter(&md_cpr_resync.md_resync_mutex); 1335 if ((md_cpr_resync.md_mirror_resync > 0) || 1336 (md_cpr_resync.md_raid_resync > 0)) { 1337 mutex_exit(&md_cpr_resync.md_resync_mutex); 1338 cmn_err(CE_WARN, "There are Solaris Volume Manager " 1339 "synchronization threads running."); 1340 cmn_err(CE_WARN, "Please try system suspension at " 1341 "a later time."); 1342 ret = -1; 1343 break; 1344 } 1345 mutex_exit(&md_cpr_resync.md_resync_mutex); 1346 1347 cp->cc_events |= CALLB_CPR_START; 1348 while (!(cp->cc_events & CALLB_CPR_SAFE)) 1349 /* cv_timedwait() returns -1 if it times out. 
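			 * On timeout we stop waiting for CALLB_CPR_SAFE and
			 * report failure back to the CPR framework (the
			 * function returns ret != -1).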
*/ 1350 if ((ret = cv_timedwait(&cp->cc_callb_cv, cp->cc_lockp, 1351 lbolt + CPR_KTHREAD_TIMEOUT_SEC * hz)) == -1) 1352 break; 1353 break; 1354 1355 case CB_CODE_CPR_RESUME: 1356 cp->cc_events &= ~CALLB_CPR_START; 1357 cv_signal(&cp->cc_stop_cv); 1358 break; 1359 } 1360 mutex_exit(cp->cc_lockp); 1361 return (ret != -1); 1362 } 1363 1364 void 1365 md_daemon(int pass_thru, mdq_anchor_t *anchor) 1366 { 1367 daemon_queue_t *dq; 1368 callb_cpr_t cprinfo; 1369 1370 if (pass_thru && (md_get_status() & MD_GBL_DAEMONS_LIVE)) 1371 return; 1372 /* 1373 * Register cpr callback 1374 */ 1375 CALLB_CPR_INIT(&cprinfo, &anchor->a_mx, callb_md_cpr, "md_daemon"); 1376 1377 /*CONSTCOND*/ 1378 while (1) { 1379 mutex_enter(&anchor->a_mx); 1380 while ((dq = anchor->dq.dq_next) == &(anchor->dq)) { 1381 if (pass_thru) { 1382 /* 1383 * CALLB_CPR_EXIT Will do 1384 * mutex_exit(&anchor->a_mx) 1385 */ 1386 CALLB_CPR_EXIT(&cprinfo); 1387 return; 1388 } 1389 if (md_get_status() & MD_GBL_DAEMONS_DIE) { 1390 mutex_exit(&anchor->a_mx); 1391 mutex_enter(&md_mx); 1392 md_num_daemons--; 1393 mutex_exit(&md_mx); 1394 /* 1395 * CALLB_CPR_EXIT will do 1396 * mutex_exit(&anchor->a_mx) 1397 */ 1398 mutex_enter(&anchor->a_mx); 1399 CALLB_CPR_EXIT(&cprinfo); 1400 thread_exit(); 1401 } 1402 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1403 cv_wait(&anchor->a_cv, &anchor->a_mx); 1404 CALLB_CPR_SAFE_END(&cprinfo, &anchor->a_mx); 1405 } 1406 dq->dq_prev->dq_next = dq->dq_next; 1407 dq->dq_next->dq_prev = dq->dq_prev; 1408 dq->dq_prev = dq->dq_next = NULL; 1409 anchor->dq.qlen--; 1410 mutex_exit(&anchor->a_mx); 1411 (*(dq->dq_call))(dq); 1412 } 1413 /*NOTREACHED*/ 1414 } 1415 1416 /* 1417 * daemon_request: 1418 * 1419 * Adds requests to appropriate requestq which is 1420 * anchored by *anchor. 1421 * The request is the first element of a doubly linked circular list. 1422 * When the request is a single element, the forward and backward 1423 * pointers MUST point to the element itself. 1424 */ 1425 1426 void 1427 daemon_request(mdq_anchor_t *anchor, void (*func)(), 1428 daemon_queue_t *request, callstyle_t style) 1429 { 1430 daemon_queue_t *rqtp; 1431 int i = 0; 1432 1433 rqtp = request; 1434 if (style == REQ_OLD) { 1435 ASSERT((rqtp->dq_next == NULL) && (rqtp->dq_prev == NULL)); 1436 /* set it to the new style */ 1437 rqtp->dq_prev = rqtp->dq_next = rqtp; 1438 } 1439 ASSERT((rqtp->dq_next != NULL) && (rqtp->dq_prev != NULL)); 1440 1441 /* scan the list and add the function to each element */ 1442 1443 do { 1444 rqtp->dq_call = func; 1445 i++; 1446 rqtp = rqtp->dq_next; 1447 } while (rqtp != request); 1448 1449 /* save pointer to tail of the request list */ 1450 rqtp = request->dq_prev; 1451 1452 mutex_enter(&anchor->a_mx); 1453 /* stats */ 1454 anchor->dq.qlen += i; 1455 anchor->dq.treqs += i; 1456 anchor->dq.maxq_len = (anchor->dq.qlen > anchor->dq.maxq_len) ? 1457 anchor->dq.qlen : anchor->dq.maxq_len; 1458 1459 /* now add the list to request queue */ 1460 request->dq_prev = anchor->dq.dq_prev; 1461 rqtp->dq_next = &anchor->dq; 1462 anchor->dq.dq_prev->dq_next = request; 1463 anchor->dq.dq_prev = rqtp; 1464 cv_broadcast(&anchor->a_cv); 1465 mutex_exit(&anchor->a_mx); 1466 } 1467 1468 void 1469 mddb_commitrec_wrapper(mddb_recid_t recid) 1470 { 1471 int sent_log = 0; 1472 uint_t retry = md_retry_cnt; 1473 set_t setno; 1474 1475 while (mddb_commitrec(recid)) { 1476 if (! 
sent_log) { 1477 cmn_err(CE_WARN, 1478 "md: state database commit failed"); 1479 sent_log = 1; 1480 } 1481 delay(md_hz); 1482 1483 /* 1484 * Setting retry cnt to one (pre decremented) so that we 1485 * actually do no retries when committing/deleting a mddb rec. 1486 * The underlying disk driver does several retries to check 1487 * if the disk is really dead or not so there 1488 * is no reason for us to retry on top of the drivers retries. 1489 */ 1490 1491 if (--retry == 0) { 1492 setno = mddb_getsetnum(recid); 1493 if (md_get_setstatus(setno) & MD_SET_TOOFEW) { 1494 panic( 1495 "md: Panic due to lack of DiskSuite state\n" 1496 " database replicas. Fewer than 50%% of " 1497 "the total were available,\n so panic to " 1498 "ensure data integrity."); 1499 } else { 1500 panic("md: state database problem"); 1501 } 1502 /*NOTREACHED*/ 1503 } 1504 } 1505 } 1506 1507 void 1508 mddb_commitrecs_wrapper(mddb_recid_t *recids) 1509 { 1510 int sent_log = 0; 1511 uint_t retry = md_retry_cnt; 1512 set_t setno; 1513 1514 while (mddb_commitrecs(recids)) { 1515 if (! sent_log) { 1516 cmn_err(CE_WARN, 1517 "md: state database commit failed"); 1518 sent_log = 1; 1519 } 1520 delay(md_hz); 1521 1522 /* 1523 * Setting retry cnt to one (pre decremented) so that we 1524 * actually do no retries when committing/deleting a mddb rec. 1525 * The underlying disk driver does several retries to check 1526 * if the disk is really dead or not so there 1527 * is no reason for us to retry on top of the drivers retries. 1528 */ 1529 1530 if (--retry == 0) { 1531 /* 1532 * since all the records are part of the same set 1533 * use the first one to get setno 1534 */ 1535 setno = mddb_getsetnum(*recids); 1536 if (md_get_setstatus(setno) & MD_SET_TOOFEW) { 1537 panic( 1538 "md: Panic due to lack of DiskSuite state\n" 1539 " database replicas. Fewer than 50%% of " 1540 "the total were available,\n so panic to " 1541 "ensure data integrity."); 1542 } else { 1543 panic("md: state database problem"); 1544 } 1545 /*NOTREACHED*/ 1546 } 1547 } 1548 } 1549 1550 void 1551 mddb_deleterec_wrapper(mddb_recid_t recid) 1552 { 1553 int sent_log = 0; 1554 uint_t retry = md_retry_cnt; 1555 set_t setno; 1556 1557 while (mddb_deleterec(recid)) { 1558 if (! sent_log) { 1559 cmn_err(CE_WARN, 1560 "md: state database delete failed"); 1561 sent_log = 1; 1562 } 1563 delay(md_hz); 1564 1565 /* 1566 * Setting retry cnt to one (pre decremented) so that we 1567 * actually do no retries when committing/deleting a mddb rec. 1568 * The underlying disk driver does several retries to check 1569 * if the disk is really dead or not so there 1570 * is no reason for us to retry on top of the drivers retries. 1571 */ 1572 1573 if (--retry == 0) { 1574 setno = mddb_getsetnum(recid); 1575 if (md_get_setstatus(setno) & MD_SET_TOOFEW) { 1576 panic( 1577 "md: Panic due to lack of DiskSuite state\n" 1578 " database replicas. Fewer than 50%% of " 1579 "the total were available,\n so panic to " 1580 "ensure data integrity."); 1581 } else { 1582 panic("md: state database problem"); 1583 } 1584 /*NOTREACHED*/ 1585 } 1586 } 1587 } 1588 1589 /* 1590 * md_holdset_enter is called in order to hold the set in its 1591 * current state (loaded, unloaded, snarfed, unsnarfed, etc) 1592 * until md_holdset_exit is called. This is used by the mirror 1593 * code to mark the set as HOLD so that the set won't be 1594 * unloaded while hotspares are being allocated in check_4_hotspares. 
1595 * The original fix to the mirror code to hold the set was to call 1596 * md_haltsnarf_enter, but this will block all ioctls and ioctls 1597 * must work for a MN diskset while hotspares are allocated. 1598 */ 1599 void 1600 md_holdset_enter(set_t setno) 1601 { 1602 mutex_enter(&md_mx); 1603 while (md_set[setno].s_status & MD_SET_HOLD) 1604 cv_wait(&md_cv, &md_mx); 1605 md_set[setno].s_status |= MD_SET_HOLD; 1606 mutex_exit(&md_mx); 1607 } 1608 1609 void 1610 md_holdset_exit(set_t setno) 1611 { 1612 mutex_enter(&md_mx); 1613 md_set[setno].s_status &= ~MD_SET_HOLD; 1614 cv_broadcast(&md_cv); 1615 mutex_exit(&md_mx); 1616 } 1617 1618 /* 1619 * Returns a 0 if this thread marked the set as HOLD (success), 1620 * returns a -1 if set was already marked HOLD (failure). 1621 * Used by the release_set code to see if set is marked HOLD. 1622 * HOLD is set by a daemon when hotspares are being allocated 1623 * to mirror units. 1624 */ 1625 int 1626 md_holdset_testandenter(set_t setno) 1627 { 1628 mutex_enter(&md_mx); 1629 if (md_set[setno].s_status & MD_SET_HOLD) { 1630 mutex_exit(&md_mx); 1631 return (-1); 1632 } 1633 md_set[setno].s_status |= MD_SET_HOLD; 1634 mutex_exit(&md_mx); 1635 return (0); 1636 } 1637 1638 void 1639 md_haltsnarf_enter(set_t setno) 1640 { 1641 mutex_enter(&md_mx); 1642 while (md_set[setno].s_status & MD_SET_SNARFING) 1643 cv_wait(&md_cv, &md_mx); 1644 1645 md_set[setno].s_status |= MD_SET_SNARFING; 1646 mutex_exit(&md_mx); 1647 } 1648 1649 void 1650 md_haltsnarf_exit(set_t setno) 1651 { 1652 mutex_enter(&md_mx); 1653 md_set[setno].s_status &= ~MD_SET_SNARFING; 1654 cv_broadcast(&md_cv); 1655 mutex_exit(&md_mx); 1656 } 1657 1658 void 1659 md_haltsnarf_wait(set_t setno) 1660 { 1661 mutex_enter(&md_mx); 1662 while (md_set[setno].s_status & MD_SET_SNARFING) 1663 cv_wait(&md_cv, &md_mx); 1664 mutex_exit(&md_mx); 1665 } 1666 1667 /* 1668 * ASSUMED that the md_unit_array_rw WRITER lock is held. 
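 * md_halt(), for example, takes md_unit_array_rw as RW_WRITER before
 * calling md_halt_set() for each set.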
1669 */ 1670 int 1671 md_halt_set(set_t setno, enum md_haltcmd cmd) 1672 { 1673 int i, err; 1674 1675 if (md_set[setno].s_un == NULL || md_set[setno].s_ui == NULL) { 1676 return (0); 1677 } 1678 1679 if ((cmd == MD_HALT_CHECK) || (cmd == MD_HALT_ALL)) { 1680 for (i = 0; i < MD_NOPS; i++) { 1681 if (md_ops[i] == NULL) 1682 continue; 1683 if ((*(md_ops[i]->md_halt))(MD_HALT_CLOSE, setno)) { 1684 for (--i; i > 0; --i) { 1685 if (md_ops[i] == NULL) 1686 continue; 1687 (void) (*(md_ops[i]->md_halt)) 1688 (MD_HALT_OPEN, setno); 1689 } 1690 return (EBUSY); 1691 } 1692 } 1693 1694 for (i = 0; i < MD_NOPS; i++) { 1695 if (md_ops[i] == NULL) 1696 continue; 1697 if ((*(md_ops[i]->md_halt))(MD_HALT_CHECK, setno)) { 1698 for (i = 0; i < MD_NOPS; i++) { 1699 if (md_ops[i] == NULL) 1700 continue; 1701 (void) (*(md_ops[i]->md_halt)) 1702 (MD_HALT_OPEN, setno); 1703 } 1704 return (EBUSY); 1705 } 1706 } 1707 } 1708 1709 if ((cmd == MD_HALT_DOIT) || (cmd == MD_HALT_ALL)) { 1710 for (i = 0; i < MD_NOPS; i++) { 1711 if (md_ops[i] == NULL) 1712 continue; 1713 err = (*(md_ops[i]->md_halt))(MD_HALT_DOIT, setno); 1714 if (err != 0) 1715 cmn_err(CE_NOTE, 1716 "md: halt failed for %s, error %d", 1717 md_ops[i]->md_driver.md_drivername, err); 1718 } 1719 1720 /* 1721 * Unload the devid namespace if it is loaded 1722 */ 1723 md_unload_namespace(setno, NM_DEVID); 1724 md_unload_namespace(setno, 0L); 1725 md_clr_setstatus(setno, MD_SET_SNARFED); 1726 } 1727 1728 return (0); 1729 } 1730 1731 int 1732 md_halt(int global_locks_owned_mask) 1733 { 1734 set_t i, j; 1735 int err; 1736 int init_queues; 1737 md_requestq_entry_t *rqp; 1738 md_ops_t **pops, *ops, *lops; 1739 ddi_modhandle_t mod; 1740 char *name; 1741 1742 rw_enter(&md_unit_array_rw.lock, RW_WRITER); 1743 1744 /* 1745 * Grab the all of the global locks that are not 1746 * already owned to ensure that there isn't another 1747 * thread trying to access a global resource 1748 * while the halt is in progress 1749 */ 1750 if (md_global_lock_enter(global_locks_owned_mask) == EINTR) 1751 return (EINTR); 1752 1753 for (i = 0; i < md_nsets; i++) 1754 md_haltsnarf_enter(i); 1755 1756 /* 1757 * Kill the daemon threads. 1758 */ 1759 init_queues = ((md_get_status() & MD_GBL_DAEMONS_LIVE) ? FALSE : TRUE); 1760 md_clr_status(MD_GBL_DAEMONS_LIVE); 1761 md_set_status(MD_GBL_DAEMONS_DIE); 1762 1763 rqp = &md_daemon_queues[0]; 1764 i = 0; 1765 while (!NULL_REQUESTQ_ENTRY(rqp)) { 1766 cv_broadcast(&rqp->dispq_headp->a_cv); 1767 rqp = &md_daemon_queues[++i]; 1768 } 1769 1770 mutex_enter(&md_mx); 1771 while (md_num_daemons != 0) { 1772 mutex_exit(&md_mx); 1773 delay(md_hz); 1774 mutex_enter(&md_mx); 1775 } 1776 mutex_exit(&md_mx); 1777 md_clr_status(MD_GBL_DAEMONS_DIE); 1778 1779 for (i = 0; i < md_nsets; i++) 1780 /* 1781 * Only call into md_halt_set if s_un / s_ui are both set. 1782 * If they are NULL this set hasn't been accessed, so its 1783 * pointless performing the call. 1784 */ 1785 if (md_set[i].s_un != NULL && md_set[i].s_ui != NULL) { 1786 if (md_halt_set(i, MD_HALT_CHECK)) { 1787 if (md_start_daemons(init_queues)) 1788 cmn_err(CE_WARN, 1789 "md: restart of daemon threads " 1790 "failed"); 1791 for (j = 0; j < md_nsets; j++) 1792 md_haltsnarf_exit(j); 1793 1794 return (md_global_lock_exit( 1795 global_locks_owned_mask, EBUSY, 1796 MD_ARRAY_WRITER, NULL)); 1797 } 1798 } 1799 1800 /* 1801 * if we get here we are going to do it 1802 */ 1803 for (i = 0; i < md_nsets; i++) { 1804 /* 1805 * Only call into md_halt_set if s_un / s_ui are both set. 
1806 * If they are NULL this set hasn't been accessed, so its 1807 * pointless performing the call. 1808 */ 1809 if (md_set[i].s_un != NULL && md_set[i].s_ui != NULL) { 1810 err = md_halt_set(i, MD_HALT_DOIT); 1811 if (err != 0) 1812 cmn_err(CE_NOTE, 1813 "md: halt failed set %u, error %d", 1814 (unsigned)i, err); 1815 } 1816 } 1817 1818 /* 1819 * issue a halt unload to each module to indicate that it 1820 * is about to be unloaded. Each module is called once, set 1821 * has no meaning at this point in time. 1822 */ 1823 for (i = 0; i < MD_NOPS; i++) { 1824 if (md_ops[i] == NULL) 1825 continue; 1826 err = (*(md_ops[i]->md_halt))(MD_HALT_UNLOAD, 0); 1827 if (err != 0) 1828 cmn_err(CE_NOTE, 1829 "md: halt failed for %s, error %d", 1830 md_ops[i]->md_driver.md_drivername, err); 1831 } 1832 1833 /* ddi_modclose the submodules */ 1834 for (i = 0; i < MD_NOPS; i++) { 1835 /* skip if not open */ 1836 if ((md_ops[i] == NULL) || (md_mods[i] == NULL)) 1837 continue; 1838 1839 /* find and unlink from md_opslist */ 1840 ops = md_ops[i]; 1841 mod = md_mods[i]; 1842 pops = &md_opslist; 1843 for (lops = *pops; lops; 1844 pops = &lops->md_next, lops = *pops) { 1845 if (lops == ops) { 1846 *pops = ops->md_next; 1847 ops->md_next = NULL; 1848 break; 1849 } 1850 } 1851 1852 /* uninitialize */ 1853 name = ops->md_driver.md_drivername, 1854 md_ops[i] = NULL; 1855 md_mods[i] = NULL; 1856 ops->md_selfindex = 0; 1857 ops->md_driver.md_drivername[0] = '\0'; 1858 rw_destroy(&ops->md_link_rw.lock); 1859 1860 /* close */ 1861 err = ddi_modclose(mod); 1862 if (err != 0) 1863 cmn_err(CE_NOTE, 1864 "md: halt close failed for %s, error %d", 1865 name ? name : "UNKNOWN", err); 1866 } 1867 1868 /* Unload the database */ 1869 mddb_unload(); 1870 1871 md_set_status(MD_GBL_HALTED); /* we are ready to be unloaded */ 1872 1873 for (i = 0; i < md_nsets; i++) 1874 md_haltsnarf_exit(i); 1875 1876 return (md_global_lock_exit(global_locks_owned_mask, 0, 1877 MD_ARRAY_WRITER, NULL)); 1878 } 1879 1880 /* 1881 * md_layered_open() is an internal routine only for SVM modules. 1882 * So the input device will be a md_dev64_t, because all SVM modules internally 1883 * work with that device type. 1884 * ddi routines on the other hand work with dev_t. So, if we call any ddi 1885 * routines from here we first have to convert that device into a dev_t. 1886 */ 1887 1888 int 1889 md_layered_open( 1890 minor_t mnum, 1891 md_dev64_t *dev, 1892 int md_oflags 1893 ) 1894 { 1895 int flag = (FREAD | FWRITE); 1896 cred_t *cred_p = kcred; 1897 major_t major; 1898 int err; 1899 dev_t ddi_dev = md_dev64_to_dev(*dev); 1900 1901 if (ddi_dev == NODEV) 1902 return (ENODEV); 1903 1904 major = getmajor(ddi_dev); 1905 1906 /* metadevice */ 1907 if (major == md_major) { 1908 mdi_unit_t *ui; 1909 1910 /* open underlying driver */ 1911 mnum = getminor(ddi_dev); 1912 1913 ui = MDI_UNIT(mnum); 1914 if (md_ops[ui->ui_opsindex]->md_open != NULL) { 1915 int ret = (*md_ops[ui->ui_opsindex]->md_open)(&ddi_dev, 1916 flag, OTYP_LYR, cred_p, md_oflags); 1917 /* 1918 * As open() may change the device, 1919 * send this info back to the caller. 
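			 * Both this path and the dev_lopen() path below hand
			 * the (possibly changed) dev_t back to the caller as
			 * a md_dev64_t via md_expldev().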
1920 */ 1921 *dev = md_expldev(ddi_dev); 1922 return (ret); 1923 } 1924 1925 /* or do it ourselves */ 1926 (void) md_unit_openclose_enter(ui); 1927 err = md_unit_incopen(mnum, flag, OTYP_LYR); 1928 md_unit_openclose_exit(ui); 1929 /* convert our ddi_dev back to the dev we were given */ 1930 *dev = md_expldev(ddi_dev); 1931 return (err); 1932 } 1933 1934 /* 1935 * Open regular device, since open() may change dev_t give new dev_t 1936 * back to the caller. 1937 */ 1938 err = dev_lopen(&ddi_dev, flag, OTYP_LYR, cred_p); 1939 *dev = md_expldev(ddi_dev); 1940 return (err); 1941 } 1942 1943 /* 1944 * md_layered_close() is an internal routine only for SVM modules. 1945 * So the input device will be a md_dev64_t, because all SVM modules internally 1946 * work with that device type. 1947 * ddi routines on the other hand work with dev_t. So, if we call any ddi 1948 * routines from here we first have to convert that device into a dev_t. 1949 */ 1950 void 1951 md_layered_close( 1952 md_dev64_t dev, 1953 int md_cflags 1954 ) 1955 { 1956 int flag = (FREAD | FWRITE); 1957 cred_t *cred_p = kcred; 1958 dev_t ddi_dev = md_dev64_to_dev(dev); 1959 major_t major = getmajor(ddi_dev); 1960 minor_t mnum = getminor(ddi_dev); 1961 1962 /* metadevice */ 1963 if (major == md_major) { 1964 mdi_unit_t *ui = MDI_UNIT(mnum); 1965 1966 /* close underlying driver */ 1967 if (md_ops[ui->ui_opsindex]->md_close != NULL) { 1968 (*md_ops[ui->ui_opsindex]->md_close) 1969 (ddi_dev, flag, OTYP_LYR, cred_p, md_cflags); 1970 return; 1971 } 1972 1973 /* or do it ourselves */ 1974 (void) md_unit_openclose_enter(ui); 1975 (void) md_unit_decopen(mnum, OTYP_LYR); 1976 md_unit_openclose_exit(ui); 1977 return; 1978 } 1979 1980 /* close regular device */ 1981 (void) dev_lclose(ddi_dev, flag, OTYP_LYR, cred_p); 1982 } 1983 1984 /* 1985 * saves a little code in mdstrategy 1986 */ 1987 int 1988 errdone(mdi_unit_t *ui, struct buf *bp, int err) 1989 { 1990 if ((bp->b_error = err) != 0) 1991 bp->b_flags |= B_ERROR; 1992 else 1993 bp->b_resid = bp->b_bcount; 1994 md_unit_readerexit(ui); 1995 md_biodone(bp); 1996 return (1); 1997 } 1998 1999 static int md_write_label = 0; 2000 2001 int 2002 md_checkbuf(mdi_unit_t *ui, md_unit_t *un, buf_t *bp) 2003 { 2004 diskaddr_t endblk; 2005 set_t setno = MD_UN2SET(un); 2006 2007 if ((md_get_setstatus(setno) & MD_SET_STALE) && 2008 (! (bp->b_flags & B_READ))) 2009 return (errdone(ui, bp, EROFS)); 2010 /* 2011 * Check early for unreasonable block number. 2012 * 2013 * b_blkno is defined as adaddr_t which is typedef'd to a long. 2014 * A problem occurs if b_blkno has bit 31 set and un_total_blocks 2015 * doesn't, b_blkno is then compared as a negative number which is 2016 * always less than a positive. 2017 */ 2018 if ((u_longlong_t)bp->b_lblkno > (u_longlong_t)un->c.un_total_blocks) 2019 return (errdone(ui, bp, EINVAL)); 2020 2021 if (bp->b_lblkno == un->c.un_total_blocks) 2022 return (errdone(ui, bp, 0)); 2023 2024 /* 2025 * make sure we don't clobber any labels 2026 */ 2027 if ((bp->b_lblkno == 0) && (! (bp->b_flags & B_READ)) && 2028 (un->c.un_flag & MD_LABELED) && (! 
md_write_label)) { 2029 cmn_err(CE_NOTE, "md: %s: write to label", 2030 md_shortname(getminor(bp->b_edev))); 2031 return (errdone(ui, bp, EINVAL)); 2032 } 2033 2034 bp->b_resid = 0; 2035 endblk = (diskaddr_t)(bp->b_lblkno + 2036 howmany(bp->b_bcount, DEV_BSIZE) - 1); 2037 2038 if (endblk > (un->c.un_total_blocks - 1)) { 2039 bp->b_resid = dbtob(endblk - (un->c.un_total_blocks - 1)); 2040 endblk = un->c.un_total_blocks - 1; 2041 bp->b_bcount -= bp->b_resid; 2042 } 2043 return (0); 2044 } 2045 2046 /* 2047 * init_request_queue: initializes the request queues and creates the threads. 2048 * return value = 0 :invalid num_threads 2049 * = n : n is the number of threads created. 2050 */ 2051 2052 int 2053 init_requestq( 2054 md_requestq_entry_t *rq, /* request queue info */ 2055 void (*threadfn)(), /* function to start the thread */ 2056 caddr_t threadfn_args, /* args to the function */ 2057 int pri, /* thread priority */ 2058 int init_queue) /* flag to init queues */ 2059 { 2060 struct mdq_anchor *rqhead; 2061 int i; 2062 int num_threads; 2063 2064 2065 num_threads = *(rq->num_threadsp); 2066 rqhead = rq->dispq_headp; 2067 2068 if (NULL_REQUESTQ_ENTRY(rq) || num_threads == 0) 2069 return (0); 2070 2071 if (init_queue) { 2072 rqhead->dq.maxq_len = 0; 2073 rqhead->dq.treqs = 0; 2074 rqhead->dq.dq_next = &rqhead->dq; 2075 rqhead->dq.dq_prev = &rqhead->dq; 2076 cv_init(&rqhead->a_cv, NULL, CV_DEFAULT, NULL); 2077 mutex_init(&rqhead->a_mx, NULL, MUTEX_DEFAULT, NULL); 2078 } 2079 for (i = 0; i < num_threads; i++) { 2080 (void) thread_create(NULL, 0, threadfn, threadfn_args, 0, &p0, 2081 TS_RUN, pri); 2082 } 2083 return (i); 2084 } 2085 2086 static void 2087 start_daemon(struct mdq_anchor *q) 2088 { 2089 md_daemon(0, q); 2090 ASSERT(0); 2091 } 2092 2093 /* 2094 * Creates all the md daemons. 2095 * Global: 2096 * md_num_daemons is set to number of daemons. 2097 * MD_GBL_DAEMONS_LIVE flag set to indicate the daemons are active. 2098 * 2099 * Return value: 0 success 2100 * 1 failure 2101 */ 2102 int 2103 md_start_daemons(int init_queue) 2104 { 2105 md_requestq_entry_t *rqp; 2106 int cnt; 2107 int i; 2108 int retval = 0; 2109 2110 2111 if (md_get_status() & MD_GBL_DAEMONS_LIVE) { 2112 return (retval); 2113 } 2114 md_clr_status(MD_GBL_DAEMONS_DIE); 2115 2116 rqp = &md_daemon_queues[0]; 2117 i = 0; 2118 while (!NULL_REQUESTQ_ENTRY(rqp)) { 2119 cnt = init_requestq(rqp, start_daemon, 2120 (caddr_t)rqp->dispq_headp, minclsyspri, init_queue); 2121 2122 if (cnt && cnt != *rqp->num_threadsp) { 2123 retval = 1; 2124 break; 2125 } 2126 /* 2127 * initialize variables 2128 */ 2129 md_num_daemons += cnt; 2130 rqp = &md_daemon_queues[++i]; 2131 } 2132 2133 md_set_status(MD_GBL_DAEMONS_LIVE); 2134 return (retval); 2135 } 2136 2137 int 2138 md_loadsubmod(set_t setno, char *name, int drvrid) 2139 { 2140 ddi_modhandle_t mod; 2141 md_ops_t **pops, *ops; 2142 int i, err; 2143 2144 /* 2145 * See if the submodule is mdopened. If not, i is the index of the 2146 * next empty slot. 2147 */ 2148 for (i = 0; md_ops[i] != NULL; i++) { 2149 if (strncmp(name, md_ops[i]->md_driver.md_drivername, 2150 MD_DRIVERNAMELEN) == 0) 2151 return (i); 2152 2153 if (i == (MD_NOPS - 1)) 2154 return (-1); 2155 } 2156 2157 if (drvrid < 0) { 2158 /* Do not try to add any records to the DB when stale. 
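		 * (md_setshared_name() below would have to write a new name
		 * record to the mddb.)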
*/ 2159 if (md_get_setstatus(setno) & MD_SET_STALE) 2160 return (-1); 2161 drvrid = md_setshared_name(setno, name, 0L); 2162 } 2163 2164 if (drvrid < 0) 2165 return (-1); 2166 2167 /* open and import the md_ops of the submodules */ 2168 mod = ddi_modopen(name, KRTLD_MODE_FIRST, &err); 2169 if (mod == NULL) { 2170 cmn_err(CE_WARN, "md_loadsubmod: " 2171 "unable to ddi_modopen %s, error %d\n", name, err); 2172 return (-1); 2173 } 2174 pops = ddi_modsym(mod, "md_interface_ops", &err); 2175 if (pops == NULL) { 2176 cmn_err(CE_WARN, "md_loadsubmod: " 2177 "unable to import md_interface_ops from %s, error %d\n", 2178 name, err); 2179 (void) ddi_modclose(mod); 2180 return (-1); 2181 } 2182 2183 /* ddi_modsym returns pointer to md_interface_ops in submod */ 2184 ops = *pops; 2185 2186 /* initialize */ 2187 ops->md_selfindex = i; 2188 rw_init(&ops->md_link_rw.lock, NULL, RW_DEFAULT, NULL); 2189 (void) strncpy(ops->md_driver.md_drivername, name, 2190 MD_DRIVERNAMELEN); 2191 2192 /* plumb */ 2193 md_ops[i] = ops; 2194 md_mods[i] = mod; 2195 ops->md_next = md_opslist; 2196 md_opslist = ops; 2197 2198 /* return index */ 2199 return (i); 2200 } 2201 2202 int 2203 md_getmodindex(md_driver_t *driver, int dont_load, int db_notrequired) 2204 { 2205 int i; 2206 int modindex; 2207 char *name = driver->md_drivername; 2208 set_t setno = driver->md_setno; 2209 int drvid; 2210 int local_dont_load; 2211 2212 if (setno >= md_nsets) 2213 return (-1); 2214 2215 for (i = 0; name[i] != 0; i++) 2216 if (i == (MD_DRIVERNAMELEN -1)) 2217 return (-1); 2218 2219 /* 2220 * If set is STALE, set local_dont_load to 1 since no records 2221 * should be added to DB when stale. 2222 */ 2223 if (md_get_setstatus(setno) & MD_SET_STALE) { 2224 local_dont_load = 1; 2225 } else { 2226 local_dont_load = dont_load; 2227 } 2228 2229 /* 2230 * Single thread ioctl module binding with respect to 2231 * similar code executed in md_loadsubmod that is called 2232 * from md_snarf_db_set (which is where that path does 2233 * its md_haltsnarf_enter call). 2234 */ 2235 md_haltsnarf_enter(setno); 2236 2237 /* See if the submodule is already ddi_modopened. */ 2238 for (i = 0; md_ops[i] != NULL; i++) { 2239 if (strncmp(name, md_ops[i]->md_driver.md_drivername, 2240 MD_DRIVERNAMELEN) == 0) { 2241 if (! local_dont_load && 2242 (md_getshared_key(setno, name) == MD_KEYBAD)) { 2243 if (md_setshared_name(setno, name, 0L) 2244 == MD_KEYBAD) { 2245 if (!db_notrequired) 2246 goto err; 2247 } 2248 } 2249 md_haltsnarf_exit(setno); 2250 return (i); 2251 } 2252 2253 if (i == (MD_NOPS -1)) 2254 break; 2255 } 2256 2257 if (local_dont_load) 2258 goto err; 2259 2260 drvid = ((db_notrequired) ? 
0 : (int)md_getshared_key(setno, name)); 2261 2262 /* ddi_modopen the submodule */ 2263 modindex = md_loadsubmod(setno, name, drvid); 2264 if (modindex < 0) 2265 goto err; 2266 2267 if (md_ops[modindex]->md_snarf != NULL) 2268 (*(md_ops[modindex]->md_snarf))(MD_SNARF_DOIT, setno); 2269 2270 md_haltsnarf_exit(setno); 2271 return (modindex); 2272 2273 err: md_haltsnarf_exit(setno); 2274 return (-1); 2275 } 2276 2277 void 2278 md_call_strategy(buf_t *bp, int flags, void *private) 2279 { 2280 mdi_unit_t *ui; 2281 2282 if (mdv_strategy_tstpnt) 2283 if ((*mdv_strategy_tstpnt)(bp, flags, private) != 0) 2284 return; 2285 if (getmajor(bp->b_edev) != md_major) { 2286 (void) bdev_strategy(bp); 2287 return; 2288 } 2289 2290 flags = (flags & MD_STR_PASSEDON) | MD_STR_NOTTOP; 2291 ui = MDI_UNIT(getminor(bp->b_edev)); 2292 ASSERT(ui != NULL); 2293 (*md_ops[ui->ui_opsindex]->md_strategy)(bp, flags, private); 2294 } 2295 2296 /* 2297 * md_call_ioctl: 2298 * ------------- 2299 * Issue the specified ioctl to the device associated with the given md_dev64_t 2300 * 2301 * Arguments: 2302 * dev - underlying device [md_dev64_t] 2303 * cmd - ioctl to perform 2304 * data - arguments / result location 2305 * mode - read/write/layered ioctl 2306 * lockp - lock reference 2307 * 2308 * Returns: 2309 * 0 success 2310 * !=0 Failure (error code) 2311 */ 2312 int 2313 md_call_ioctl(md_dev64_t dev, int cmd, void *data, int mode, IOLOCK *lockp) 2314 { 2315 dev_t device = md_dev64_to_dev(dev); 2316 int rval; 2317 mdi_unit_t *ui; 2318 2319 /* 2320 * See if device is a metadevice. If not call cdev_ioctl(), otherwise 2321 * call the ioctl entry-point in the metadevice. 2322 */ 2323 if (md_getmajor(dev) != md_major) { 2324 int rv; 2325 rval = cdev_ioctl(device, cmd, (intptr_t)data, mode, 2326 ddi_get_cred(), &rv); 2327 } else { 2328 ui = MDI_UNIT(md_getminor(dev)); 2329 ASSERT(ui != NULL); 2330 rval = (*md_ops[ui->ui_opsindex]->md_ioctl)(device, cmd, data, 2331 mode, lockp); 2332 } 2333 return (rval); 2334 } 2335 2336 void 2337 md_rem_link(set_t setno, int id, krwlock_t *rw, md_link_t **head) 2338 { 2339 md_link_t *next; 2340 md_link_t **pprev; 2341 2342 rw_enter(rw, RW_WRITER); 2343 2344 next = *head; 2345 pprev = head; 2346 while (next) { 2347 if ((next->ln_setno == setno) && (next->ln_id == id)) { 2348 *pprev = next->ln_next; 2349 rw_exit(rw); 2350 return; 2351 } 2352 pprev = &next->ln_next; 2353 next = next->ln_next; 2354 } 2355 2356 rw_exit(rw); 2357 } 2358 2359 int 2360 md_dev_exists(md_dev64_t dev) 2361 { 2362 2363 if (dev == NODEV64) 2364 return (0); 2365 2366 if (strcmp(ddi_major_to_name(md_getmajor(dev)), "md") != 0) 2367 return (1); 2368 2369 if ((MD_MIN2SET(md_getminor(dev)) >= md_nsets) || 2370 (MD_MIN2UNIT(md_getminor(dev)) >= md_nunits)) 2371 return (0); 2372 2373 if (MDI_UNIT(md_getminor(dev)) != NULL) 2374 return (1); 2375 2376 return (0); 2377 } 2378 2379 md_parent_t 2380 md_get_parent(md_dev64_t dev) 2381 { 2382 md_unit_t *un; 2383 mdi_unit_t *ui; 2384 md_parent_t parent; 2385 2386 if (md_getmajor(dev) != md_major) 2387 return (MD_NO_PARENT); 2388 2389 ui = MDI_UNIT(md_getminor(dev)); 2390 2391 un = (md_unit_t *)md_unit_readerlock(ui); 2392 parent = un->c.un_parent; 2393 md_unit_readerexit(ui); 2394 2395 return (parent); 2396 } 2397 2398 void 2399 md_set_parent(md_dev64_t dev, md_parent_t parent) 2400 { 2401 md_unit_t *un; 2402 mdi_unit_t *ui; 2403 2404 if (md_getmajor(dev) != md_major) 2405 return; 2406 2407 ui = MDI_UNIT(md_getminor(dev)); 2408 2409 un = (md_unit_t *)md_unit_readerlock(ui); 2410 
un->c.un_parent = parent; 2411 md_unit_readerexit(ui); 2412 } 2413 2414 void 2415 md_reset_parent(md_dev64_t dev) 2416 { 2417 md_unit_t *un; 2418 mdi_unit_t *ui; 2419 2420 if (md_getmajor(dev) != md_major) 2421 return; 2422 2423 ui = MDI_UNIT(md_getminor(dev)); 2424 2425 un = (md_unit_t *)md_unit_readerlock(ui); 2426 un->c.un_parent = MD_NO_PARENT; 2427 md_unit_readerexit(ui); 2428 } 2429 2430 2431 static intptr_t (*hot_spare_interface)() = (intptr_t (*)())NULL; 2432 2433 int 2434 md_hot_spare_ifc( 2435 hs_cmds_t cmd, 2436 mddb_recid_t id, 2437 u_longlong_t size, 2438 int labeled, 2439 mddb_recid_t *hs_id, 2440 mdkey_t *key, 2441 md_dev64_t *dev, 2442 diskaddr_t *sblock) 2443 { 2444 int err; 2445 2446 /* 2447 * RW lock on hot_spare_interface. We don't want it to change from 2448 * underneath us. If hot_spare_interface is NULL we're going to 2449 * need to set it. So we need to upgrade to a WRITER lock. If that 2450 * doesn't work, we drop the lock and reenter as WRITER. This leaves 2451 * a small hole during which hot_spare_interface could be modified 2452 * so we check it for NULL again. What a pain. Then if still null 2453 * load from md_get_named_service. 2454 */ 2455 2456 rw_enter(&hsp_rwlp.lock, RW_READER); 2457 if (hot_spare_interface == NULL) { 2458 if (rw_tryupgrade(&hsp_rwlp.lock) == 0) { 2459 rw_exit(&hsp_rwlp.lock); 2460 rw_enter(&hsp_rwlp.lock, RW_WRITER); 2461 if (hot_spare_interface != NULL) { 2462 err = ((*hot_spare_interface) 2463 (cmd, id, size, labeled, hs_id, key, dev, 2464 sblock)); 2465 rw_exit(&hsp_rwlp.lock); 2466 return (err); 2467 } 2468 } 2469 hot_spare_interface = md_get_named_service(NODEV64, ANY_SERVICE, 2470 "hot spare interface", 0); 2471 rw_downgrade(&hsp_rwlp.lock); 2472 } 2473 2474 if (hot_spare_interface == NULL) { 2475 cmn_err(CE_WARN, "md: no hotspare interface"); 2476 rw_exit(&hsp_rwlp.lock); 2477 return (0); 2478 } 2479 2480 err = ((*hot_spare_interface) 2481 (cmd, id, size, labeled, hs_id, key, dev, sblock)); 2482 rw_exit(&hsp_rwlp.lock); 2483 return (err); 2484 } 2485 2486 void 2487 md_clear_hot_spare_interface() 2488 { 2489 rw_enter(&hsp_rwlp.lock, RW_WRITER); 2490 hot_spare_interface = NULL; 2491 rw_exit(&hsp_rwlp.lock); 2492 } 2493 2494 2495 static intptr_t (*notify_interface)() = (intptr_t (*)())NULL; 2496 2497 int 2498 md_notify_interface( 2499 md_event_cmds_t cmd, 2500 md_tags_t tag, 2501 set_t set, 2502 md_dev64_t dev, 2503 md_event_type_t event 2504 ) 2505 { 2506 int err; 2507 2508 if (md_event_queue == NULL) 2509 return (0); 2510 rw_enter(&ni_rwlp.lock, RW_READER); 2511 if (notify_interface == NULL) { 2512 if (rw_tryupgrade(&ni_rwlp.lock) == 0) { 2513 rw_exit(&ni_rwlp.lock); 2514 rw_enter(&ni_rwlp.lock, RW_WRITER); 2515 if (notify_interface != NULL) { 2516 err = ((*notify_interface) 2517 (cmd, tag, set, dev, event)); 2518 rw_exit(&ni_rwlp.lock); 2519 return (err); 2520 } 2521 } 2522 notify_interface = md_get_named_service(NODEV64, ANY_SERVICE, 2523 "notify interface", 0); 2524 rw_downgrade(&ni_rwlp.lock); 2525 } 2526 if (notify_interface == NULL) { 2527 cmn_err(CE_WARN, "md: no notify interface"); 2528 rw_exit(&ni_rwlp.lock); 2529 return (0); 2530 } 2531 err = ((*notify_interface)(cmd, tag, set, dev, event)); 2532 rw_exit(&ni_rwlp.lock); 2533 return (err); 2534 } 2535 2536 char * 2537 obj2devname(uint32_t tag, uint_t setno, md_dev64_t dev) 2538 { 2539 char *setname; 2540 char name[MD_MAX_CTDLEN]; 2541 minor_t mnum = md_getminor(dev); 2542 major_t maj = md_getmajor(dev); 2543 int rtn = 0; 2544 2545 /* 2546 * Verify that the passed dev_t 
refers to a valid metadevice. 2547 * If it doesn't we can make no assumptions as to what the device 2548 * name is. Return NULL in these cases. 2549 */ 2550 if (((maj != md_major) || (MD_MIN2UNIT(mnum) >= md_nunits)) || 2551 (MD_MIN2SET(mnum) >= md_nsets)) { 2552 return (NULL); 2553 } 2554 2555 setname = NULL; 2556 name[0] = '\0'; 2557 switch (tag) { 2558 case SVM_TAG_HSP: 2559 if (setno == 0) { 2560 rtn = snprintf(name, sizeof (name), "hsp%u", 2561 (unsigned)MD_MIN2UNIT(mnum)); 2562 } else { 2563 setname = mddb_getsetname(setno); 2564 if (setname != NULL) { 2565 rtn = snprintf(name, sizeof (name), "%s/hsp%u", 2566 setname, (unsigned)MD_MIN2UNIT(mnum)); 2567 } 2568 } 2569 break; 2570 case SVM_TAG_DRIVE: 2571 (void) sprintf(name, "drive"); 2572 break; 2573 case SVM_TAG_HOST: 2574 (void) sprintf(name, "host"); 2575 break; 2576 case SVM_TAG_SET: 2577 rtn = snprintf(name, sizeof (name), "%s", 2578 mddb_getsetname(setno)); 2579 if ((name[0] == '\0') || (rtn >= sizeof (name))) { 2580 (void) sprintf(name, "diskset"); 2581 rtn = 0; 2582 } 2583 break; 2584 default: 2585 rtn = snprintf(name, sizeof (name), "%s", md_shortname(mnum)); 2586 break; 2587 } 2588 2589 /* Check if we got any rubbish for any of the snprintf's */ 2590 if ((name[0] == '\0') || (rtn >= sizeof (name))) { 2591 return (NULL); 2592 } 2593 2594 return (md_strdup(name)); 2595 } 2596 2597 /* Sysevent subclass and mdnotify event type pairs */ 2598 struct node { 2599 char *se_ev; 2600 md_event_type_t md_ev; 2601 }; 2602 2603 /* 2604 * Table must be sorted in case sensitive ascending order of 2605 * the sysevents values 2606 */ 2607 static struct node ev_table[] = { 2608 { ESC_SVM_ADD, EQ_ADD }, 2609 { ESC_SVM_ATTACH, EQ_ATTACH }, 2610 { ESC_SVM_ATTACHING, EQ_ATTACHING }, 2611 { ESC_SVM_CHANGE, EQ_CHANGE }, 2612 { ESC_SVM_CREATE, EQ_CREATE }, 2613 { ESC_SVM_DELETE, EQ_DELETE }, 2614 { ESC_SVM_DETACH, EQ_DETACH }, 2615 { ESC_SVM_DETACHING, EQ_DETACHING }, 2616 { ESC_SVM_DRIVE_ADD, EQ_DRIVE_ADD }, 2617 { ESC_SVM_DRIVE_DELETE, EQ_DRIVE_DELETE }, 2618 { ESC_SVM_ENABLE, EQ_ENABLE }, 2619 { ESC_SVM_ERRED, EQ_ERRED }, 2620 { ESC_SVM_EXCHANGE, EQ_EXCHANGE }, 2621 { ESC_SVM_GROW, EQ_GROW }, 2622 { ESC_SVM_HS_CHANGED, EQ_HS_CHANGED }, 2623 { ESC_SVM_HS_FREED, EQ_HS_FREED }, 2624 { ESC_SVM_HOST_ADD, EQ_HOST_ADD }, 2625 { ESC_SVM_HOST_DELETE, EQ_HOST_DELETE }, 2626 { ESC_SVM_HOTSPARED, EQ_HOTSPARED }, 2627 { ESC_SVM_INIT_FAILED, EQ_INIT_FAILED }, 2628 { ESC_SVM_INIT_FATAL, EQ_INIT_FATAL }, 2629 { ESC_SVM_INIT_START, EQ_INIT_START }, 2630 { ESC_SVM_INIT_SUCCESS, EQ_INIT_SUCCESS }, 2631 { ESC_SVM_IOERR, EQ_IOERR }, 2632 { ESC_SVM_LASTERRED, EQ_LASTERRED }, 2633 { ESC_SVM_MEDIATOR_ADD, EQ_MEDIATOR_ADD }, 2634 { ESC_SVM_MEDIATOR_DELETE, EQ_MEDIATOR_DELETE }, 2635 { ESC_SVM_OFFLINE, EQ_OFFLINE }, 2636 { ESC_SVM_OK, EQ_OK }, 2637 { ESC_SVM_ONLINE, EQ_ONLINE }, 2638 { ESC_SVM_OPEN_FAIL, EQ_OPEN_FAIL }, 2639 { ESC_SVM_REGEN_DONE, EQ_REGEN_DONE }, 2640 { ESC_SVM_REGEN_FAILED, EQ_REGEN_FAILED }, 2641 { ESC_SVM_REGEN_START, EQ_REGEN_START }, 2642 { ESC_SVM_RELEASE, EQ_RELEASE }, 2643 { ESC_SVM_REMOVE, EQ_REMOVE }, 2644 { ESC_SVM_RENAME_DST, EQ_RENAME_DST }, 2645 { ESC_SVM_RENAME_SRC, EQ_RENAME_SRC }, 2646 { ESC_SVM_REPLACE, EQ_REPLACE }, 2647 { ESC_SVM_RESYNC_DONE, EQ_RESYNC_DONE }, 2648 { ESC_SVM_RESYNC_FAILED, EQ_RESYNC_FAILED }, 2649 { ESC_SVM_RESYNC_START, EQ_RESYNC_START }, 2650 { ESC_SVM_RESYNC_SUCCESS, EQ_RESYNC_SUCCESS }, 2651 { ESC_SVM_TAKEOVER, EQ_TAKEOVER } 2652 }; 2653 2654 static md_tags_t md_tags[] = { 2655 TAG_UNK, 2656 TAG_METADEVICE, 2657 
TAG_UNK, 2658 TAG_UNK, 2659 TAG_UNK, 2660 TAG_UNK, 2661 TAG_REPLICA, 2662 TAG_HSP, 2663 TAG_HS, 2664 TAG_SET, 2665 TAG_DRIVE, 2666 TAG_HOST, 2667 TAG_MEDIATOR 2668 }; 2669 2670 md_event_type_t 2671 ev_get(char *subclass) 2672 { 2673 int high, mid, low, p; 2674 2675 low = 0; 2676 high = (sizeof (ev_table) / sizeof (ev_table[0])) - 1; 2677 while (low <= high) { 2678 mid = (high + low) / 2; 2679 p = strcmp(subclass, ev_table[mid].se_ev); 2680 if (p == 0) { 2681 return (ev_table[mid].md_ev); 2682 } else if (p < 0) { 2683 high = mid - 1; 2684 } else { 2685 low = mid + 1; 2686 } 2687 } 2688 2689 return (EQ_EMPTY); 2690 } 2691 2692 /* 2693 * Log mdnotify event 2694 */ 2695 void 2696 do_mdnotify(char *se_subclass, uint32_t tag, set_t setno, md_dev64_t devid) 2697 { 2698 md_event_type_t ev_type; 2699 md_tags_t md_tag; 2700 2701 /* Translate sysevent into mdnotify event */ 2702 ev_type = ev_get(se_subclass); 2703 2704 if (tag >= (sizeof (md_tags) / sizeof (md_tags[0]))) { 2705 md_tag = TAG_UNK; 2706 } else { 2707 md_tag = md_tags[tag]; 2708 } 2709 2710 NOTIFY_MD(md_tag, setno, devid, ev_type); 2711 } 2712 2713 /* 2714 * Log SVM sys events 2715 */ 2716 void 2717 svm_gen_sysevent( 2718 char *se_class, 2719 char *se_subclass, 2720 uint32_t tag, 2721 set_t setno, 2722 md_dev64_t devid 2723 ) 2724 { 2725 nvlist_t *attr_list; 2726 sysevent_id_t eid; 2727 int err = DDI_SUCCESS; 2728 char *devname; 2729 extern dev_info_t *md_devinfo; 2730 2731 /* Raise the mdnotify event before anything else */ 2732 do_mdnotify(se_subclass, tag, setno, devid); 2733 2734 if (md_devinfo == NULL) { 2735 return; 2736 } 2737 2738 err = nvlist_alloc(&attr_list, NV_UNIQUE_NAME, KM_NOSLEEP); 2739 2740 if (err == DDI_SUCCESS) { 2741 /* Add the version numver */ 2742 err = nvlist_add_uint32(attr_list, SVM_VERSION_NO, 2743 (uint32_t)SVM_VERSION); 2744 if (err != DDI_SUCCESS) { 2745 goto fail; 2746 } 2747 2748 /* Add the tag attribute */ 2749 err = nvlist_add_uint32(attr_list, SVM_TAG, (uint32_t)tag); 2750 if (err != DDI_SUCCESS) { 2751 goto fail; 2752 } 2753 2754 /* Add the set number attribute */ 2755 err = nvlist_add_uint32(attr_list, SVM_SET_NO, (uint32_t)setno); 2756 if (err != DDI_SUCCESS) { 2757 goto fail; 2758 } 2759 2760 /* Add the device id attribute */ 2761 err = nvlist_add_uint64(attr_list, SVM_DEV_ID, (uint64_t)devid); 2762 if (err != DDI_SUCCESS) { 2763 goto fail; 2764 } 2765 2766 /* Add the device name attribute */ 2767 devname = obj2devname(tag, setno, devid); 2768 if (devname != NULL) { 2769 err = nvlist_add_string(attr_list, SVM_DEV_NAME, 2770 devname); 2771 freestr(devname); 2772 } else { 2773 err = nvlist_add_string(attr_list, SVM_DEV_NAME, 2774 "unspecified"); 2775 } 2776 if (err != DDI_SUCCESS) { 2777 goto fail; 2778 } 2779 2780 /* Attempt to post event */ 2781 err = ddi_log_sysevent(md_devinfo, DDI_VENDOR_SUNW, se_class, 2782 se_subclass, attr_list, &eid, DDI_SLEEP); 2783 2784 nvlist_free(attr_list); 2785 if (err != DDI_SUCCESS) { 2786 cmn_err(CE_WARN, "Failed to log event for %s, %s," 2787 " err=%x", se_class, se_subclass, err); 2788 } 2789 } 2790 2791 return; 2792 2793 fail: 2794 nvlist_free(attr_list); 2795 cmn_err(CE_WARN, "Failed to setup attributes for event %s, %s, err=%x", 2796 se_class, se_subclass, err); 2797 } 2798 2799 void 2800 md_clear_named_service() 2801 { 2802 rw_enter(&ni_rwlp.lock, RW_WRITER); 2803 notify_interface = NULL; 2804 rw_exit(&ni_rwlp.lock); 2805 } 2806 2807 void 2808 md_create_unit_incore(minor_t mnum, md_ops_t *ops, int alloc_lock) 2809 { 2810 mdi_unit_t *ui; 2811 set_t setno = 
MD_MIN2SET(mnum);

	ui = (mdi_unit_t *)kmem_zalloc(sizeof (mdi_unit_t), KM_SLEEP);
	ui->ui_opsindex = ops->md_selfindex;

	/* initialize all the incore conditional variables */
	mutex_init(&ui->ui_mx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&ui->ui_cv, NULL, CV_DEFAULT, NULL);

	if (! (md_get_setstatus(setno) & MD_SET_SNARFING)) {
		rw_enter(&md_unit_array_rw.lock, RW_WRITER);
		MDI_VOIDUNIT(mnum) = (void *) ui;
		rw_exit(&md_unit_array_rw.lock);
	} else
		MDI_VOIDUNIT(mnum) = (void *) ui;

	rw_enter(&ops->md_link_rw.lock, RW_WRITER);
	ui->ui_link.ln_next = ops->md_head;
	ui->ui_link.ln_setno = setno;
	ui->ui_link.ln_id = mnum;
	ops->md_head = &ui->ui_link;
	if (alloc_lock) {
		ui->ui_io_lock = kmem_zalloc(sizeof (md_io_lock_t), KM_SLEEP);
		mutex_init(&ui->ui_io_lock->io_mx, NULL, MUTEX_DEFAULT, NULL);
		cv_init(&ui->ui_io_lock->io_cv, NULL, CV_DEFAULT, NULL);
		mutex_init(&ui->ui_io_lock->io_list_mutex, NULL,
		    MUTEX_DEFAULT, NULL);
		ui->ui_io_lock->io_list_front = NULL;
		ui->ui_io_lock->io_list_back = NULL;
	}
	/* setup the unavailable field */
#if defined(_ILP32)
	if (((md_unit_t *)MD_UNIT(mnum))->c.un_revision & MD_64BIT_META_DEV) {
		ui->ui_tstate |= MD_64MD_ON_32KERNEL;
		cmn_err(CE_NOTE, "d%d is unavailable because 64 bit "
		    "metadevices are not accessible on a 32 bit kernel",
		    mnum);
	}
#endif

	rw_exit(&ops->md_link_rw.lock);
}

void
md_destroy_unit_incore(minor_t mnum, md_ops_t *ops)
{
	mdi_unit_t	*ui;

	/*
	 * ASSUMPTION: md_unit_array_rw WRITER lock is held.
	 */
	ui = MDI_UNIT(mnum);
	if (ui == NULL)
		return;

	md_rem_link(MD_MIN2SET(mnum), mnum, &ops->md_link_rw.lock,
	    &ops->md_head);

	/* destroy the io lock if one is being used */
	if (ui->ui_io_lock) {
		mutex_destroy(&ui->ui_io_lock->io_mx);
		cv_destroy(&ui->ui_io_lock->io_cv);
		kmem_free(ui->ui_io_lock, sizeof (md_io_lock_t));
	}

	/* teardown kstat */
	md_kstat_destroy(mnum);

	/* destroy all the incore conditional variables */
	mutex_destroy(&ui->ui_mx);
	cv_destroy(&ui->ui_cv);

	kmem_free(ui, sizeof (mdi_unit_t));
	MDI_VOIDUNIT(mnum) = (void *) NULL;
}

void
md_rem_names(sv_dev_t *sv, int nsv)
{
	int	i, s;
	int	max_sides;

	if (nsv == 0)
		return;

	/* All entries removed are in the same diskset */
	if (md_get_setstatus(sv[0].setno) & MD_SET_MNSET)
		max_sides = MD_MNMAXSIDES;
	else
		max_sides = MD_MAXSIDES;

	for (i = 0; i < nsv; i++)
		for (s = 0; s < max_sides; s++)
			(void) md_remdevname(sv[i].setno, s, sv[i].key);
}

/*
 * Check user args before we get into physio - returns 0 for ok, else errno.
 * We do a lot of checking against illegal arguments here because some of the
 * real disk drivers don't like certain kinds of arguments (e.g., xy doesn't
 * like an odd-address user buffer).  Those drivers capture bad arguments in
 * xxread and xxwrite.  But since the meta-driver calls their strategy routines
 * directly, two bad scenarios might happen:
 * 1. the real strategy doesn't like it and panics.
 * 2. the real strategy doesn't like it and sets B_ERROR.
 *
 * The second case is no better than the first one, since the meta-driver
 * will treat it as a media error and offline the mirror metapartition.
2919 * (Too bad there is no way to tell what error it is.) 2920 * 2921 */ 2922 int 2923 md_chk_uio(struct uio *uio) 2924 { 2925 int i; 2926 struct iovec *iov; 2927 2928 /* 2929 * Check for negative or not block-aligned offset 2930 */ 2931 if ((uio->uio_loffset < 0) || 2932 ((uio->uio_loffset & (DEV_BSIZE - 1)) != 0)) { 2933 return (EINVAL); 2934 } 2935 iov = uio->uio_iov; 2936 i = uio->uio_iovcnt; 2937 2938 while (i--) { 2939 if ((iov->iov_len & (DEV_BSIZE - 1)) != 0) 2940 return (EINVAL); 2941 /* 2942 * Bug # 1212146 2943 * The default is to not check alignment, but we can now check 2944 * for a larger number of alignments if desired. 2945 */ 2946 if ((uintptr_t)(iov->iov_base) & md_uio_alignment_mask) 2947 return (EINVAL); 2948 iov++; 2949 } 2950 return (0); 2951 } 2952 2953 char * 2954 md_shortname( 2955 minor_t mnum 2956 ) 2957 { 2958 static char buf[MAXPATHLEN]; 2959 char *devname; 2960 char *invalid = " (Invalid minor number %u) "; 2961 char *metaname; 2962 mdc_unit_t *un; 2963 side_t side; 2964 set_t setno = MD_MIN2SET(mnum); 2965 unit_t unit = MD_MIN2UNIT(mnum); 2966 2967 if ((un = MD_UNIT(mnum)) == NULL) { 2968 (void) snprintf(buf, sizeof (buf), invalid, mnum); 2969 return (buf); 2970 } 2971 2972 /* 2973 * If unit is not a friendly name unit, derive the name from the 2974 * minor number. 2975 */ 2976 if ((un->un_revision & MD_FN_META_DEV) == 0) { 2977 /* This is a traditional metadevice */ 2978 if (setno == MD_LOCAL_SET) { 2979 (void) snprintf(buf, sizeof (buf), "d%u", 2980 (unsigned)unit); 2981 } else { 2982 (void) snprintf(buf, sizeof (buf), "%s/d%u", 2983 mddb_getsetname(setno), (unsigned)unit); 2984 } 2985 return (buf); 2986 } 2987 2988 /* 2989 * It is a friendly name metadevice, so we need to get its name. 2990 */ 2991 side = mddb_getsidenum(setno); 2992 devname = (char *)kmem_alloc(MAXPATHLEN, KM_SLEEP); 2993 if (md_getdevname(setno, side, MD_KEYWILD, 2994 md_makedevice(md_major, mnum), devname, MAXPATHLEN) == 0) { 2995 /* 2996 * md_getdevname has given us either /dev/md/dsk/<metaname> 2997 * or /dev/md/<setname>/dsk/<metname> depending on whether 2998 * or not we are in the local set. Thus, we'll pull the 2999 * metaname from this string. 3000 */ 3001 if ((metaname = strrchr(devname, '/')) == NULL) { 3002 (void) snprintf(buf, sizeof (buf), invalid, mnum); 3003 goto out; 3004 } 3005 metaname++; /* move past slash */ 3006 if (setno == MD_LOCAL_SET) { 3007 /* No set name. */ 3008 (void) snprintf(buf, sizeof (buf), "%s", metaname); 3009 } else { 3010 /* Include setname */ 3011 (void) snprintf(buf, sizeof (buf), "%s/%s", 3012 mddb_getsetname(setno), metaname); 3013 } 3014 } else { 3015 /* We couldn't find the name. 
*/ 3016 (void) snprintf(buf, sizeof (buf), invalid, mnum); 3017 } 3018 3019 out: 3020 kmem_free(devname, MAXPATHLEN); 3021 return (buf); 3022 } 3023 3024 char * 3025 md_devname( 3026 set_t setno, 3027 md_dev64_t dev, 3028 char *buf, 3029 size_t size 3030 ) 3031 { 3032 static char mybuf[MD_MAX_CTDLEN]; 3033 int err; 3034 3035 if (buf == NULL) { 3036 buf = mybuf; 3037 size = sizeof (mybuf); 3038 } else { 3039 ASSERT(size >= MD_MAX_CTDLEN); 3040 } 3041 3042 err = md_getdevname(setno, mddb_getsidenum(setno), 3043 0, dev, buf, size); 3044 if (err) { 3045 if (err == ENOENT) { 3046 (void) sprintf(buf, "(Unavailable)"); 3047 } else { 3048 (void) sprintf(buf, "(%u.%u)", 3049 md_getmajor(dev), md_getminor(dev)); 3050 } 3051 } 3052 3053 return (buf); 3054 } 3055 void 3056 md_minphys(buf_t *pb) 3057 { 3058 extern unsigned md_maxbcount; 3059 3060 if (pb->b_bcount > md_maxbcount) 3061 pb->b_bcount = md_maxbcount; 3062 } 3063 3064 void 3065 md_bioinit(struct buf *bp) 3066 { 3067 ASSERT(bp); 3068 3069 bioinit(bp); 3070 bp->b_back = bp; 3071 bp->b_forw = bp; 3072 bp->b_flags = B_BUSY; /* initialize flags */ 3073 } 3074 3075 void 3076 md_bioreset(struct buf *bp) 3077 { 3078 ASSERT(bp); 3079 3080 bioreset(bp); 3081 bp->b_back = bp; 3082 bp->b_forw = bp; 3083 bp->b_flags = B_BUSY; /* initialize flags */ 3084 } 3085 3086 /* 3087 * md_bioclone is needed as long as the real bioclone only takes a daddr_t 3088 * as block number. 3089 * We simply call bioclone with all input parameters but blkno, and set the 3090 * correct blkno afterwards. 3091 * Caveat Emptor: bp_mem must not be NULL! 3092 */ 3093 buf_t * 3094 md_bioclone(buf_t *bp, off_t off, size_t len, dev_t dev, diskaddr_t blkno, 3095 int (*iodone)(buf_t *), buf_t *bp_mem, int sleep) 3096 { 3097 (void) bioclone(bp, off, len, dev, 0, iodone, bp_mem, sleep); 3098 bp_mem->b_lblkno = blkno; 3099 return (bp_mem); 3100 } 3101 3102 3103 /* 3104 * kstat stuff 3105 */ 3106 void 3107 md_kstat_init_ui( 3108 minor_t mnum, 3109 mdi_unit_t *ui 3110 ) 3111 { 3112 if ((ui != NULL) && (ui->ui_kstat == NULL)) { 3113 set_t setno = MD_MIN2SET(mnum); 3114 unit_t unit = MD_MIN2UNIT(mnum); 3115 char module[KSTAT_STRLEN]; 3116 char *p = module; 3117 3118 if (setno != MD_LOCAL_SET) { 3119 char buf[64]; 3120 char *s = buf; 3121 char *e = module + sizeof (module) - 4; 3122 3123 (void) sprintf(buf, "%u", setno); 3124 while ((p < e) && (*s != '\0')) 3125 *p++ = *s++; 3126 *p++ = '/'; 3127 } 3128 *p++ = 'm'; 3129 *p++ = 'd'; 3130 *p = '\0'; 3131 if ((ui->ui_kstat = kstat_create(module, unit, NULL, "disk", 3132 KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) != NULL) { 3133 ui->ui_kstat->ks_lock = &ui->ui_mx; 3134 kstat_install(ui->ui_kstat); 3135 } 3136 } 3137 } 3138 3139 void 3140 md_kstat_init( 3141 minor_t mnum 3142 ) 3143 { 3144 md_kstat_init_ui(mnum, MDI_UNIT(mnum)); 3145 } 3146 3147 void 3148 md_kstat_destroy_ui( 3149 mdi_unit_t *ui 3150 ) 3151 { 3152 /* 3153 * kstat_delete() interface has it's own locking mechanism and 3154 * does not allow holding of kstat lock (ks_lock). 3155 * Note: ks_lock == ui_mx from the md_kstat_init_ui(). 3156 */ 3157 if ((ui != NULL) && (ui->ui_kstat != NULL)) { 3158 kstat_delete(ui->ui_kstat); 3159 ui->ui_kstat = NULL; 3160 } 3161 } 3162 3163 void 3164 md_kstat_destroy( 3165 minor_t mnum 3166 ) 3167 { 3168 md_kstat_destroy_ui(MDI_UNIT(mnum)); 3169 } 3170 3171 /* 3172 * In the following subsequent routines, locks are held before checking the 3173 * validity of ui_kstat. This is done to make sure that we don't trip over 3174 * a NULL ui_kstat anymore. 
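 *
 * As an illustrative sketch only (the calling routine and the exact sequence
 * are up to each submodule, not defined here), an I/O is typically bracketed
 * like this:
 *
 *	md_kstat_waitq_enter(ui);	-- request queued
 *	md_kstat_waitq_to_runq(ui);	-- request handed to the driver
 *	... perform the transfer ...
 *	md_kstat_done(ui, bp, 0);	-- count the I/O and exit the run queue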
 */

void
md_kstat_waitq_enter(
	mdi_unit_t	*ui
)
{
	mutex_enter(&ui->ui_mx);
	if (ui->ui_kstat != NULL)
		kstat_waitq_enter(KSTAT_IO_PTR(ui->ui_kstat));
	mutex_exit(&ui->ui_mx);
}

void
md_kstat_waitq_to_runq(
	mdi_unit_t	*ui
)
{
	mutex_enter(&ui->ui_mx);
	if (ui->ui_kstat != NULL)
		kstat_waitq_to_runq(KSTAT_IO_PTR(ui->ui_kstat));
	mutex_exit(&ui->ui_mx);
}

void
md_kstat_waitq_exit(
	mdi_unit_t	*ui
)
{
	mutex_enter(&ui->ui_mx);
	if (ui->ui_kstat != NULL)
		kstat_waitq_exit(KSTAT_IO_PTR(ui->ui_kstat));
	mutex_exit(&ui->ui_mx);
}

void
md_kstat_runq_enter(
	mdi_unit_t	*ui
)
{
	mutex_enter(&ui->ui_mx);
	if (ui->ui_kstat != NULL)
		kstat_runq_enter(KSTAT_IO_PTR(ui->ui_kstat));
	mutex_exit(&ui->ui_mx);
}

void
md_kstat_runq_exit(
	mdi_unit_t	*ui
)
{
	mutex_enter(&ui->ui_mx);
	if (ui->ui_kstat != NULL)
		kstat_runq_exit(KSTAT_IO_PTR(ui->ui_kstat));
	mutex_exit(&ui->ui_mx);
}

void
md_kstat_done(
	mdi_unit_t	*ui,
	buf_t		*bp,
	int		war
)
{
	size_t	n_done;

	/* check for end of device */
	if ((bp->b_resid != 0) && (! (bp->b_flags & B_ERROR))) {
		n_done = bp->b_bcount;
	} else if (bp->b_bcount < bp->b_resid) {
		n_done = 0;
	} else {
		n_done = bp->b_bcount - bp->b_resid;
	}

	/* do accounting */
	mutex_enter(&ui->ui_mx);
	if (ui->ui_kstat != NULL) {
		if ((! war) && (bp->b_flags & B_READ)) {
			KSTAT_IO_PTR(ui->ui_kstat)->reads++;
			KSTAT_IO_PTR(ui->ui_kstat)->nread += n_done;
		} else {
			KSTAT_IO_PTR(ui->ui_kstat)->writes++;
			KSTAT_IO_PTR(ui->ui_kstat)->nwritten += n_done;
		}
		kstat_runq_exit(KSTAT_IO_PTR(ui->ui_kstat));
	}
	mutex_exit(&ui->ui_mx);
}

pid_t
md_getpid()
{
	pid_t	valuep;
	if (drv_getparm(PPID, (pid_t *)&valuep) != 0) {
		ASSERT(0);
		return ((pid_t)0);
	} else {
		ASSERT(valuep);
		return (valuep);
	}
}


proc_t *
md_getproc()
{
	proc_t	*valuep;
	if (drv_getparm(UPROCP, (proc_t **)&valuep) != 0) {
		ASSERT(0);
		return ((proc_t *)NULL);
	} else {
		ASSERT(valuep);
		return (valuep);
	}
}

extern kmutex_t pidlock;

/*
 * This checks to see if a process/pid pair is still running.  For the
 * disk set lock, when both pid and proc are zero, the lock is not
 * currently held.
 */
int
md_checkpid(pid_t pid, proc_t *proc)
{
	int	retval = 1;

	if (pid == 0 && proc == NULL)
		return (0);

	mutex_enter(&pidlock);
	if (prfind(pid) != proc)
		retval = 0;
	mutex_exit(&pidlock);
	return (retval);
}

/*
 * NAME: md_init_probereq
 *
 * DESCRIPTION: initializes a probe request. Parcels out the mnums such that
 *		they can be dispatched to multiple daemon threads.
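 *		The ioctl path is assumed (sketch only, not the literal
 *		caller) to hand each probe_req_t built here to a daemon
 *		queue and then wait on PROBE_SEMA(p) once per device;
 *		md_probe_one() posts that semaphore as each probe finishes.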
3319 * 3320 * PARAMETERS: struct md_probedev *p pointer ioctl input 3321 * 3322 * RETURN VALUE: Returns errno 3323 * 3324 */ 3325 3326 int 3327 md_init_probereq(struct md_probedev_impl *p, daemon_queue_t **hdrpp) 3328 { 3329 int err = 0; 3330 int modindx; 3331 intptr_t (*probe_test)(); 3332 3333 /* 3334 * Initialize the semaphores and mutex 3335 * for the request 3336 */ 3337 3338 p->probe_sema = kmem_alloc(sizeof (ksema_t), KM_SLEEP); 3339 3340 p->probe_mx = kmem_alloc(sizeof (kmutex_t), KM_SLEEP); 3341 sema_init(PROBE_SEMA(p), 0, NULL, SEMA_DRIVER, NULL); 3342 mutex_init(PROBE_MX(p), NULL, MUTEX_DEFAULT, NULL); 3343 3344 modindx = md_getmodindex(&(p->probe.md_driver), 1, 1); 3345 probe_test = md_get_named_service(NODEV64, modindx, 3346 p->probe.test_name, 0); 3347 if (probe_test == NULL) { 3348 err = EINVAL; 3349 goto err_out; 3350 } 3351 3352 err = md_create_probe_rqlist(p, hdrpp, probe_test); 3353 err_out: 3354 return (err); 3355 } 3356 3357 /* 3358 * NAME: md_probe_one 3359 * 3360 * DESCRIPTION: Generic routine for probing disks. This is called from the 3361 * daemon. 3362 * 3363 * PARAMETERS: probe_req_t *reqp pointer to the probe request structure. 3364 * 3365 */ 3366 3367 void 3368 md_probe_one(probe_req_t *reqp) 3369 { 3370 mdi_unit_t *ui; 3371 md_probedev_impl_t *p; 3372 int err = 0; 3373 3374 p = (md_probedev_impl_t *)reqp->private_handle; 3375 /* 3376 * Validate the unit while holding the global ioctl lock, then 3377 * obtain the unit_writerlock. Once the writerlock has been obtained 3378 * we can release the global lock. As long as we hold one of these 3379 * locks this will prevent a metaclear operation being performed 3380 * on the metadevice because metaclear takes the readerlock (via 3381 * openclose lock). 3382 */ 3383 while (md_ioctl_lock_enter() == EINTR); 3384 ui = MDI_UNIT(reqp->mnum); 3385 if (ui != NULL) { 3386 (void) md_unit_writerlock_common(ui, 0); 3387 (void) md_ioctl_lock_exit(0, 0, 0, FALSE); 3388 err = (*reqp->probe_fcn)(ui, reqp->mnum); 3389 md_unit_writerexit(ui); 3390 } else { 3391 (void) md_ioctl_lock_exit(0, 0, 0, FALSE); 3392 } 3393 3394 /* update the info info in the probe structure */ 3395 3396 mutex_enter(PROBE_MX(p)); 3397 if (err != 0) { 3398 cmn_err(CE_NOTE, "md_probe_one: err %d mnum %d\n", err, 3399 reqp->mnum); 3400 (void) mdsyserror(&(p->probe.mde), err); 3401 } 3402 3403 mutex_exit(PROBE_MX(p)); 3404 sema_v(PROBE_SEMA(p)); 3405 3406 kmem_free(reqp, sizeof (probe_req_t)); 3407 } 3408 char * 3409 md_strdup(char *cp) 3410 { 3411 char *new_cp = NULL; 3412 3413 new_cp = kmem_alloc(strlen(cp) + 1, KM_SLEEP); 3414 3415 return (strcpy(new_cp, cp)); 3416 } 3417 3418 void 3419 freestr(char *cp) 3420 { 3421 kmem_free(cp, strlen(cp) + 1); 3422 } 3423 3424 /* 3425 * Validate the list and skip invalid devices. Then create 3426 * a doubly linked circular list of devices to probe. 3427 * The hdr points to the head and tail of this list. 
3428 */ 3429 3430 static int 3431 md_create_probe_rqlist(md_probedev_impl_t *plist, daemon_queue_t **hdr, 3432 intptr_t (*probe_test)()) 3433 { 3434 int i, err, nodevcnt; 3435 probe_req_t *tp; 3436 daemon_queue_t *hp; 3437 minor_t mnum; 3438 3439 nodevcnt = 0; 3440 3441 hp = NULL; 3442 3443 for (i = 0; i < plist->probe.nmdevs; i++) { 3444 mnum = ((minor_t *)(uintptr_t)(plist->probe.mnum_list))[i]; 3445 if (MDI_UNIT(mnum) == NULL) { 3446 cmn_err(CE_WARN, "md: Cannot probe %s since it does " 3447 "not exist", md_shortname(mnum)); 3448 nodevcnt++; 3449 continue; 3450 } 3451 tp = kmem_alloc(sizeof (probe_req_t), KM_SLEEP); 3452 tp->mnum = mnum; 3453 tp->private_handle = (void *)plist; 3454 tp->probe_fcn = probe_test; 3455 if (hp == NULL) { 3456 hp = (daemon_queue_t *)tp; 3457 hp->dq_prev = hp->dq_next = (daemon_queue_t *)tp; 3458 } else { 3459 tp->dq.dq_next = hp; 3460 tp->dq.dq_prev = hp->dq_prev; 3461 hp->dq_prev->dq_next = (daemon_queue_t *)tp; 3462 hp->dq_prev = (daemon_queue_t *)tp; 3463 } 3464 } 3465 3466 *hdr = hp; 3467 if (nodevcnt > 0) 3468 plist->probe.nmdevs -= nodevcnt; 3469 3470 /* 3471 * If there are no devices to be probed because they were 3472 * incorrect, then return an error. 3473 */ 3474 err = (plist->probe.nmdevs == 0) ? ENODEV : 0; 3475 3476 return (err); 3477 } 3478 3479 /* 3480 * This routine increments the I/O count for set I/O operations. This 3481 * value is used to determine if an I/O can done. If a release is in 3482 * process this will return an error and cause the I/O to be errored. 3483 */ 3484 int 3485 md_inc_iocount(set_t setno) 3486 { 3487 int rc = 0; 3488 3489 if (setno == 0) 3490 return (0); 3491 3492 mutex_enter(&md_set_io[setno].md_io_mx); 3493 if (!(md_set_io[setno].io_state & MD_SET_ACTIVE)) { 3494 rc = EIO; 3495 goto out; 3496 } 3497 3498 ASSERT(md_set_io[setno].io_cnt >= 0); 3499 md_set_io[setno].io_cnt++; 3500 3501 out: mutex_exit(&md_set_io[setno].md_io_mx); 3502 return (rc); 3503 } 3504 3505 void 3506 md_inc_iocount_noblock(set_t setno) 3507 { 3508 3509 if (setno == 0) 3510 return; 3511 3512 mutex_enter(&md_set_io[setno].md_io_mx); 3513 md_set_io[setno].io_cnt++; 3514 mutex_exit(&md_set_io[setno].md_io_mx); 3515 } 3516 void 3517 md_dec_iocount(set_t setno) 3518 { 3519 3520 if (setno == 0) 3521 return; 3522 3523 mutex_enter(&md_set_io[setno].md_io_mx); 3524 md_set_io[setno].io_cnt--; 3525 ASSERT(md_set_io[setno].io_cnt >= 0); 3526 if ((md_set_io[setno].io_state & MD_SET_RELEASE) && 3527 (md_set_io[setno].io_cnt == 0)) 3528 cv_broadcast(&md_set_io[setno].md_io_cv); 3529 mutex_exit(&md_set_io[setno].md_io_mx); 3530 } 3531 3532 int 3533 md_isblock_setio(set_t setno) 3534 { 3535 int rc = 0; 3536 3537 if (setno == 0) 3538 return (0); 3539 3540 mutex_enter(&md_set_io[setno].md_io_mx); 3541 if (md_set_io[setno].io_state & MD_SET_RELEASE) 3542 rc = 1; 3543 3544 mutex_exit(&md_set_io[setno].md_io_mx); 3545 return (rc); 3546 } 3547 3548 int 3549 md_block_setio(set_t setno) 3550 { 3551 int rc = 0; 3552 3553 if (setno == 0) 3554 return (1); 3555 3556 mutex_enter(&md_set_io[setno].md_io_mx); 3557 md_set_io[setno].io_state = MD_SET_RELEASE; 3558 3559 while (md_set_io[setno].io_cnt > 0) { 3560 cv_wait(&md_set_io[setno].md_io_cv, 3561 &md_set_io[setno].md_io_mx); 3562 } 3563 rc = 1; 3564 3565 3566 ASSERT(md_set_io[setno].io_cnt == 0); 3567 mutex_exit(&md_set_io[setno].md_io_mx); 3568 3569 return (rc); 3570 } 3571 3572 void 3573 md_clearblock_setio(set_t setno) 3574 { 3575 if (setno == 0) 3576 return; 3577 3578 mutex_enter(&md_set_io[setno].md_io_mx); 3579 
md_set_io[setno].io_state = MD_SET_ACTIVE; 3580 mutex_exit(&md_set_io[setno].md_io_mx); 3581 } 3582 3583 void 3584 md_unblock_setio(set_t setno) 3585 { 3586 if (setno == 0) 3587 return; 3588 3589 mutex_enter(&md_set_io[setno].md_io_mx); 3590 #ifdef DEBUG 3591 if (md_set_io[setno].io_cnt != 0) { 3592 cmn_err(CE_NOTE, "set %d count was %ld at take", 3593 setno, md_set_io[setno].io_cnt); 3594 } 3595 #endif /* DEBUG */ 3596 3597 md_set_io[setno].io_state = MD_SET_ACTIVE; 3598 md_set_io[setno].io_cnt = 0; 3599 mutex_exit(&md_set_io[setno].md_io_mx); 3600 } 3601 3602 /* 3603 * Test and set version of the md_block_setio. 3604 * Set the io_state to keep new I/O from being issued. 3605 * If there is I/O currently in progress, then set io_state to active 3606 * and return failure. Otherwise, return a 1 for success. 3607 * 3608 * Used in a MN diskset since the commd must be suspended before 3609 * this node can attempt to withdraw from a diskset. But, with commd 3610 * suspended, I/O may have been issued that can never finish until 3611 * commd is resumed (allocation of hotspare, etc). So, if I/O is 3612 * outstanding after diskset io_state is marked RELEASE, then set diskset 3613 * io_state back to ACTIVE and return failure. 3614 */ 3615 int 3616 md_tas_block_setio(set_t setno) 3617 { 3618 int rc; 3619 3620 if (setno == 0) 3621 return (1); 3622 3623 mutex_enter(&md_set_io[setno].md_io_mx); 3624 md_set_io[setno].io_state = MD_SET_RELEASE; 3625 3626 if (md_set_io[setno].io_cnt > 0) { 3627 md_set_io[setno].io_state = MD_SET_ACTIVE; 3628 rc = 0; 3629 } else { 3630 rc = 1; 3631 } 3632 3633 mutex_exit(&md_set_io[setno].md_io_mx); 3634 3635 return (rc); 3636 } 3637 3638 void 3639 md_biodone(struct buf *pb) 3640 { 3641 minor_t mnum; 3642 set_t setno; 3643 mdi_unit_t *ui; 3644 3645 mnum = getminor(pb->b_edev); 3646 setno = MD_MIN2SET(mnum); 3647 3648 if (setno == 0) { 3649 biodone(pb); 3650 return; 3651 } 3652 3653 #ifdef DEBUG 3654 ui = MDI_UNIT(mnum); 3655 if (!md_unit_isopen(ui)) 3656 cmn_err(CE_NOTE, "io after close on %s\n", md_shortname(mnum)); 3657 #endif /* DEBUG */ 3658 3659 /* 3660 * Handle the local diskset 3661 */ 3662 if (md_set_io[setno].io_cnt > 0) 3663 md_dec_iocount(setno); 3664 3665 #ifdef DEBUG 3666 /* 3667 * this is being done after the lock is dropped so there 3668 * are cases it may be invalid. It is advisory. 3669 */ 3670 if (md_set_io[setno].io_state & MD_SET_RELEASE) { 3671 /* Only display this error once for this metadevice */ 3672 if ((ui->ui_tstate & MD_RELEASE_IOERR_DONE) == 0) { 3673 cmn_err(CE_NOTE, 3674 "I/O to %s attempted during set RELEASE\n", 3675 md_shortname(mnum)); 3676 ui->ui_tstate |= MD_RELEASE_IOERR_DONE; 3677 } 3678 } 3679 #endif /* DEBUG */ 3680 3681 biodone(pb); 3682 } 3683 3684 3685 /* 3686 * Driver special private devt handling routine 3687 * INPUT: md_dev64_t 3688 * OUTPUT: dev_t, 32 bit on a 32 bit kernel, 64 bit on a 64 bit kernel. 3689 */ 3690 dev_t 3691 md_dev64_to_dev(md_dev64_t dev) 3692 { 3693 major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64; 3694 minor_t minor = (minor_t)(dev & MAXMIN64); 3695 3696 return (makedevice(major, minor)); 3697 3698 } 3699 3700 /* 3701 * Driver private makedevice routine 3702 * INPUT: major_t major, minor_t minor 3703 * OUTPUT: md_dev64_t, no matter if on 32 bit or 64 bit kernel. 
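 *
 * A minimal sketch of the round trip (the local names used here are
 * illustrative only):
 *
 *	md_dev64_t d64 = md_makedevice(md_major, mnum);
 *	ASSERT(md_getmajor(d64) == md_major);
 *	ASSERT(md_getminor(d64) == mnum);
 *	dev_t dev = md_dev64_to_dev(d64);	-- back to a native dev_t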
3704 */ 3705 md_dev64_t 3706 md_makedevice(major_t major, minor_t minor) 3707 { 3708 return (((md_dev64_t)major << NBITSMINOR64) | minor); 3709 3710 } 3711 3712 3713 /* 3714 * Driver private devt md_getmajor routine 3715 * INPUT: dev a 64 bit container holding either a 32 bit or a 64 bit device 3716 * OUTPUT: the appropriate major number 3717 */ 3718 major_t 3719 md_getmajor(md_dev64_t dev) 3720 { 3721 major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64; 3722 3723 if (major == 0) { 3724 /* Here we were given a 32bit dev */ 3725 major = (major_t)(dev >> NBITSMINOR32) & MAXMAJ32; 3726 } 3727 return (major); 3728 } 3729 3730 /* 3731 * Driver private devt md_getminor routine 3732 * INPUT: dev a 64 bit container holding either a 32 bit or a 64 bit device 3733 * OUTPUT: the appropriate minor number 3734 */ 3735 minor_t 3736 md_getminor(md_dev64_t dev) 3737 { 3738 minor_t minor; 3739 major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64; 3740 3741 if (major == 0) { 3742 /* Here we were given a 32bit dev */ 3743 minor = (minor_t)(dev & MAXMIN32); 3744 } else { 3745 minor = (minor_t)(dev & MAXMIN64); 3746 } 3747 return (minor); 3748 } 3749 3750 int 3751 md_check_ioctl_against_efi(int cmd, ushort_t flags) 3752 { 3753 /* 3754 * If the metadevice is an old style device, it has a vtoc, 3755 * in that case all reading EFI ioctls are not applicable. 3756 * If the metadevice has an EFI label, reading vtoc and geom ioctls 3757 * are not supposed to work. 3758 */ 3759 switch (cmd) { 3760 case DKIOCGGEOM: 3761 case DKIOCGVTOC: 3762 case DKIOCGAPART: 3763 if ((flags & MD_EFILABEL) != 0) { 3764 return (ENOTSUP); 3765 } 3766 break; 3767 case DKIOCGETEFI: 3768 case DKIOCPARTITION: 3769 if ((flags & MD_EFILABEL) == 0) { 3770 return (ENOTSUP); 3771 } 3772 break; 3773 3774 case DKIOCSETEFI: 3775 /* setting an EFI label should always be ok */ 3776 return (0); 3777 3778 case DKIOCSVTOC: 3779 /* 3780 * This one is ok for small devices, even if they have an EFI 3781 * label. The appropriate check is in md_set_vtoc 3782 */ 3783 return (0); 3784 } 3785 return (0); 3786 } 3787 3788 /* 3789 * md_vtoc_to_efi_record() 3790 * Input: record id of the vtoc record 3791 * Output: record id of the efi record 3792 * Function: 3793 * - reads the volume name from the vtoc record 3794 * - converts the volume name to a format, libefi understands 3795 * - creates a new record of size MD_EFI_PARTNAME_BYTES 3796 * - stores the volname in that record, 3797 * - commits that record 3798 * - returns the recid of the efi record. 3799 * Caveat Emptor: 3800 * The calling routine must do something like 3801 * - un->c.un_vtoc_id = md_vtoc_to_efi_record(vtoc_recid) 3802 * - commit(un) 3803 * - delete(vtoc_recid) 3804 * in order to keep the mddb consistent in case of a panic in the middle. 
3805 * Errors: 3806 * - returns 0 on any error 3807 */ 3808 mddb_recid_t 3809 md_vtoc_to_efi_record(mddb_recid_t vtoc_recid, set_t setno) 3810 { 3811 struct vtoc *vtoc; 3812 ushort_t *v; 3813 mddb_recid_t efi_recid; 3814 int i; 3815 3816 if (mddb_getrecstatus(vtoc_recid) != MDDB_OK) { 3817 return (0); 3818 } 3819 vtoc = (struct vtoc *)mddb_getrecaddr(vtoc_recid); 3820 efi_recid = mddb_createrec(MD_EFI_PARTNAME_BYTES, MDDB_EFILABEL, 0, 3821 MD_CRO_32BIT, setno); 3822 if (efi_recid < 0) { 3823 return (0); 3824 } 3825 v = (ushort_t *)mddb_getrecaddr(efi_recid); 3826 3827 /* This for loop read, converts and writes */ 3828 for (i = 0; i < LEN_DKL_VVOL; i++) { 3829 v[i] = LE_16((uint16_t)vtoc->v_volume[i]); 3830 } 3831 /* commit the new record */ 3832 mddb_commitrec_wrapper(efi_recid); 3833 3834 return (efi_recid); 3835 } 3836 3837 /* 3838 * Send a kernel message. 3839 * user has to provide for an allocated result structure 3840 * If the door handler disappears we retry forever emitting warnings every so 3841 * often. 3842 * TODO: make this a flaggable attribute so that the caller can decide if the 3843 * message is to be a 'one-shot' message or not. 3844 */ 3845 int 3846 mdmn_ksend_message( 3847 set_t setno, 3848 md_mn_msgtype_t type, 3849 uint_t flags, 3850 char *data, 3851 int size, 3852 md_mn_kresult_t *result) 3853 { 3854 door_arg_t da; 3855 md_mn_kmsg_t *kmsg; 3856 uint_t retry_cnt = 0; 3857 int rval; 3858 3859 if (size > MDMN_MAX_KMSG_DATA) 3860 return (ENOMEM); 3861 kmsg = kmem_zalloc(sizeof (md_mn_kmsg_t), KM_SLEEP); 3862 kmsg->kmsg_flags = flags; 3863 kmsg->kmsg_setno = setno; 3864 kmsg->kmsg_type = type; 3865 kmsg->kmsg_size = size; 3866 bcopy(data, &(kmsg->kmsg_data), size); 3867 3868 #ifdef DEBUG_COMM 3869 printf("send msg: set=%d, flags=%d, type=%d, txid = 0x%llx," 3870 " size=%d, data=%d, data2=%d\n", 3871 kmsg->kmsg_setno, 3872 kmsg->kmsg_flags, 3873 kmsg->kmsg_type, 3874 kmsg->kmsg_size, 3875 *(int *)data, 3876 *(int *)(char *)(&kmsg->kmsg_data)); 3877 3878 3879 #endif /* DEBUG_COMM */ 3880 3881 da.data_ptr = (char *)(kmsg); 3882 da.data_size = sizeof (md_mn_kmsg_t); 3883 da.desc_ptr = NULL; 3884 da.desc_num = 0; 3885 da.rbuf = (char *)result; 3886 da.rsize = sizeof (*result); 3887 3888 /* 3889 * Wait for the door handle to be established. 3890 */ 3891 3892 while (mdmn_door_did == -1) { 3893 if ((++retry_cnt % MD_MN_WARN_INTVL) == 0) { 3894 cmn_err(CE_WARN, "door handle not yet ready. " 3895 "Check if /usr/lib/lvm/mddoors is running"); 3896 } 3897 delay(md_hz); 3898 } 3899 retry_cnt = 0; 3900 3901 while ((rval = door_ki_upcall(mdmn_door_handle, &da)) != 0) { 3902 if (rval == EAGAIN) { 3903 if ((++retry_cnt % MD_MN_WARN_INTVL) == 0) { 3904 cmn_err(CE_WARN, "door call failed. " 3905 "Check if /usr/lib/lvm/mddoors is running"); 3906 } 3907 } else { 3908 cmn_err(CE_WARN, 3909 "md door call failed. Returned %d", rval); 3910 } 3911 delay(md_hz); 3912 } 3913 kmem_free(kmsg, sizeof (md_mn_kmsg_t)); 3914 3915 /* 3916 * Attempt to determine if the message failed (with an RPC_FAILURE) 3917 * because we are in the middle of shutting the system down. 3918 * 3919 * If message failed with an RPC_FAILURE when rpc.mdcommd had 3920 * been gracefully shutdown (md_mn_is_commd_present returns FALSE) 3921 * then don't retry the message anymore. If message 3922 * failed due to any other reason, then retry up to MD_MN_WARN_INTVL 3923 * times which should allow a shutting down system time to 3924 * notify the kernel of a graceful shutdown of rpc.mdcommd. 
	 *
	 * The caller of this routine will need to check the
	 * md_mn_commd_present flag and the failure error in order to
	 * determine whether to panic or not. If md_mn_commd_present is set
	 * to 0 and the failure error is RPC_FAILURE, the calling routine
	 * should not panic, since the system is in the process of being
	 * shut down.
	 *
	 */

	retry_cnt = 0;

	if (result->kmmr_comm_state == MDMNE_RPC_FAIL) {
		while (md_mn_is_commd_present() == 1) {
			if ((++retry_cnt % MD_MN_WARN_INTVL) == 0)
				break;
			delay(md_hz);
		}
	}

	return (0);
}

/*
 * Called to propagate the capability of a metadevice to all nodes in the set.
 *
 * On entry, lockp is set if the function has been called from within an ioctl.
 *
 * IOLOCK_RETURN_RELEASE, which drops the md_ioctl_lock, is called in this
 * routine to enable other mdioctls to enter the kernel while this
 * thread of execution waits on the completion of mdmn_ksend_message. When
 * the message is completed the thread continues and md_ioctl_lock must be
 * reacquired. Even though md_ioctl_lock is interruptible, we choose to
 * ignore EINTR as we must not return without acquiring md_ioctl_lock.
 */

int
mdmn_send_capability_message(minor_t mnum, volcap_t vc, IOLOCK *lockp)
{
	md_mn_msg_setcap_t	msg;
	md_mn_kresult_t		*kres;
	mdi_unit_t		*ui = MDI_UNIT(mnum);
	int			ret;
	k_sigset_t		oldmask, newmask;

	(void) strncpy((char *)&msg.msg_setcap_driver,
	    md_ops[ui->ui_opsindex]->md_driver.md_drivername, MD_DRIVERNAMELEN);
	msg.msg_setcap_mnum = mnum;
	msg.msg_setcap_set = vc.vc_set;

	if (lockp)
		IOLOCK_RETURN_RELEASE(0, lockp);
	kres = kmem_zalloc(sizeof (md_mn_kresult_t), KM_SLEEP);

	/*
	 * Mask signals for the mdmn_ksend_message call. This keeps the door
	 * interface from failing if the user process receives a signal while
	 * in mdmn_ksend_message.
	 */
	sigfillset(&newmask);
	sigreplace(&newmask, &oldmask);
	ret = (mdmn_ksend_message(MD_MIN2SET(mnum), MD_MN_MSG_SET_CAP,
	    MD_MSGF_NO_LOG, (char *)&msg, sizeof (md_mn_msg_setcap_t),
	    kres));
	sigreplace(&oldmask, (k_sigset_t *)NULL);

	if (!MDMN_KSEND_MSG_OK(ret, kres)) {
		mdmn_ksend_show_error(ret, kres, "MD_MN_MSG_SET_CAP");
		ret = EIO;
	}
	kmem_free(kres, sizeof (md_mn_kresult_t));

	if (lockp) {
		IOLOCK_RETURN_REACQUIRE(lockp);
	}
	return (ret);
}

/*
 * Called to clear all of the transient capabilities for a metadevice when
 * it is not open on any node in the cluster.
 * Called from close for mirror and sp.
 */

void
mdmn_clear_all_capabilities(minor_t mnum)
{
	md_isopen_t	clumsg;
	int		ret;
	md_mn_kresult_t	*kresult;
	volcap_t	vc;
	k_sigset_t	oldmask, newmask;

	clumsg.dev = md_makedevice(md_major, mnum);
	clumsg.mde = mdnullerror;
	/*
	 * The check open message doesn't have to be logged, nor should the
	 * result be stored in the MCT. We want an up-to-date state.
	 */
	kresult = kmem_zalloc(sizeof (md_mn_kresult_t), KM_SLEEP);

	/*
	 * Mask signals for the mdmn_ksend_message call. This keeps the door
	 * interface from failing if the user process receives a signal while
	 * in mdmn_ksend_message.
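	 *
	 * The typical calling sequence around mdmn_ksend_message() (sketch
	 * only, arguments abbreviated) is:
	 *
	 *	sigfillset(&newmask);
	 *	sigreplace(&newmask, &oldmask);
	 *	ret = mdmn_ksend_message(setno, type, flags, data, size, kres);
	 *	sigreplace(&oldmask, (k_sigset_t *)NULL);
	 *	if (!MDMN_KSEND_MSG_OK(ret, kres))
	 *		mdmn_ksend_show_error(ret, kres, "...");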
4029 */ 4030 sigfillset(&newmask); 4031 sigreplace(&newmask, &oldmask); 4032 ret = mdmn_ksend_message(MD_MIN2SET(mnum), 4033 MD_MN_MSG_CLU_CHECK, 4034 MD_MSGF_STOP_ON_ERROR | MD_MSGF_NO_LOG | MD_MSGF_NO_MCT, 4035 (char *)&clumsg, sizeof (clumsg), kresult); 4036 sigreplace(&oldmask, (k_sigset_t *)NULL); 4037 4038 if ((ret == 0) && (kresult->kmmr_exitval == 0)) { 4039 /* 4040 * Not open on any node, clear all capabilities, eg ABR and 4041 * DMR 4042 */ 4043 vc.vc_set = 0; 4044 (void) mdmn_send_capability_message(mnum, vc, NULL); 4045 } 4046 kmem_free(kresult, sizeof (md_mn_kresult_t)); 4047 } 4048 4049 /* 4050 * mdmn_ksend_show_error: 4051 * --------------------- 4052 * Called to display the error contents of a failing mdmn_ksend_message() result 4053 * 4054 * Input: 4055 * rv - return value from mdmn_ksend_message() 4056 * kres - pointer to result structure filled in by mdmn_ksend_message 4057 * s - Informative message to identify failing condition (e.g. 4058 * "Ownership change") This string will be displayed with 4059 * cmn_err(CE_WARN, "%s *FAILED*",...) to alert the system 4060 * administrator 4061 */ 4062 void 4063 mdmn_ksend_show_error(int rv, md_mn_kresult_t *kres, const char *s) 4064 { 4065 if (rv == 0) { 4066 cmn_err(CE_WARN, "%s *FAILED*", s); 4067 cmn_err(CE_CONT, "exit_val = %d, comm_state = %d, failing_node" 4068 " = %d", kres->kmmr_exitval, kres->kmmr_comm_state, 4069 kres->kmmr_failing_node); 4070 } else { 4071 cmn_err(CE_WARN, "%s *FAILED*, return value = %d", s, rv); 4072 } 4073 } 4074 4075 /* 4076 * Callback routine for resync thread. If requested to suspend we mark the 4077 * commd as not being present. 4078 */ 4079 boolean_t 4080 callb_md_mrs_cpr(void *arg, int code) 4081 { 4082 callb_cpr_t *cp = (callb_cpr_t *)arg; 4083 int ret = 0; /* assume success */ 4084 4085 mutex_enter(cp->cc_lockp); 4086 4087 switch (code) { 4088 case CB_CODE_CPR_CHKPT: 4089 /* 4090 * Mark the rpc.mdcommd as no longer present. We are trying to 4091 * suspend the system and so we should expect RPC failures to 4092 * occur. 4093 */ 4094 md_mn_clear_commd_present(); 4095 cp->cc_events |= CALLB_CPR_START; 4096 while (!(cp->cc_events & CALLB_CPR_SAFE)) 4097 /* cv_timedwait() returns -1 if it times out. 
*/ 4098 if ((ret = cv_timedwait(&cp->cc_callb_cv, cp->cc_lockp, 4099 lbolt + CPR_KTHREAD_TIMEOUT_SEC * hz)) == -1) 4100 break; 4101 break; 4102 4103 case CB_CODE_CPR_RESUME: 4104 cp->cc_events &= ~CALLB_CPR_START; 4105 cv_signal(&cp->cc_stop_cv); 4106 break; 4107 } 4108 mutex_exit(cp->cc_lockp); 4109 return (ret != -1); 4110 } 4111 4112 4113 void 4114 md_rem_hspname(set_t setno, mdkey_t n_key) 4115 { 4116 int s; 4117 int max_sides; 4118 4119 4120 /* All entries removed are in the same diskset */ 4121 if (md_get_setstatus(setno) & MD_SET_MNSET) 4122 max_sides = MD_MNMAXSIDES; 4123 else 4124 max_sides = MD_MAXSIDES; 4125 4126 for (s = 0; s < max_sides; s++) 4127 (void) md_remdevname(setno, s, n_key); 4128 } 4129 4130 4131 int 4132 md_rem_selfname(minor_t selfid) 4133 { 4134 int s; 4135 set_t setno = MD_MIN2SET(selfid); 4136 int max_sides; 4137 md_dev64_t dev; 4138 struct nm_next_hdr *nh; 4139 struct nm_name *n; 4140 mdkey_t key; 4141 4142 /* 4143 * Get the key since remove routine expects it 4144 */ 4145 dev = md_makedevice(md_major, selfid); 4146 if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) == NULL) { 4147 return (ENOENT); 4148 } 4149 4150 if ((n = (struct nm_name *)lookup_entry(nh, setno, MD_SIDEWILD, 4151 MD_KEYWILD, dev, 0L)) == NULL) { 4152 return (ENOENT); 4153 } 4154 4155 /* All entries removed are in the same diskset */ 4156 key = n->n_key; 4157 if (md_get_setstatus(setno) & MD_SET_MNSET) 4158 max_sides = MD_MNMAXSIDES; 4159 else 4160 max_sides = MD_MAXSIDES; 4161 4162 for (s = 0; s < max_sides; s++) 4163 (void) md_remdevname(setno, s, key); 4164 4165 return (0); 4166 } 4167 4168 void 4169 md_upd_set_unnext(set_t setno, unit_t un) 4170 { 4171 if (un < md_set[setno].s_un_next) { 4172 md_set[setno].s_un_next = un; 4173 } 4174 } 4175 4176 struct hot_spare_pool * 4177 find_hot_spare_pool(set_t setno, int hsp_id) 4178 { 4179 hot_spare_pool_t *hsp; 4180 4181 hsp = (hot_spare_pool_t *)md_set[setno].s_hsp; 4182 while (hsp != NULL) { 4183 if (hsp->hsp_self_id == hsp_id) 4184 return (hsp); 4185 hsp = hsp->hsp_next; 4186 } 4187 4188 return ((hot_spare_pool_t *)0); 4189 } 4190
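
/*
 * Usage sketch for find_hot_spare_pool() (illustrative only; the caller,
 * its locking, and the error value shown are not defined in this file):
 *
 *	hot_spare_pool_t *hsp;
 *
 *	hsp = find_hot_spare_pool(setno, hsp_self_id);
 *	if (hsp == NULL)
 *		return (ENOENT);
 */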