/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Driver for Virtual Disk.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/user.h>
#include <sys/uio.h>
#include <sys/proc.h>
#include <sys/t_lock.h>
#include <sys/dkio.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/mkdev.h>
#include <sys/vtoc.h>
#include <sys/open.h>
#include <sys/file.h>
#include <vm/page.h>
#include <sys/callb.h>
#include <sys/disp.h>
#include <sys/modctl.h>
#include <sys/errno.h>
#include <sys/door.h>
#include <sys/lvm/mdmn_commd.h>
#include <sys/lvm/md_hotspares.h>

#include <sys/lvm/mdvar.h>
#include <sys/lvm/md_names.h>

#include <sys/ddi.h>
#include <sys/proc.h>
#include <sys/sunddi.h>
#include <sys/esunddi.h>

#include <sys/sysevent.h>
#include <sys/sysevent/eventdefs.h>

#include <sys/sysevent/svm.h>
#include <sys/lvm/md_basic.h>


/*
 * Machine specific Hertz is kept here
 */
extern clock_t md_hz;

/*
 * Externs.
 */
extern int (*mdv_strategy_tstpnt)(buf_t *, int, void*);
extern major_t md_major;
extern unit_t md_nunits;
extern set_t md_nsets;
extern md_set_t md_set[];
extern md_set_io_t md_set_io[];
extern md_ops_t **md_ops;
extern md_ops_t *md_opslist;
extern ddi_modhandle_t *md_mods;
extern dev_info_t *md_devinfo;

extern md_krwlock_t md_unit_array_rw;
extern kmutex_t md_mx;
extern kcondvar_t md_cv;

extern md_krwlock_t hsp_rwlp;
extern md_krwlock_t ni_rwlp;

extern int md_num_daemons;
extern int md_status;
extern int md_ioctl_cnt;
extern int md_mtioctl_cnt;

extern struct metatransops metatransops;
extern md_event_queue_t *md_event_queue;
extern md_resync_t md_cpr_resync;
extern int md_done_daemon_threads;
extern int md_ff_daemon_threads;


extern mddb_set_t *mddb_setenter(set_t setno, int flag, int *errorcodep);
extern void mddb_setexit(mddb_set_t *s);
extern void *lookup_entry(struct nm_next_hdr *, set_t,
    side_t, mdkey_t, md_dev64_t, int);
extern struct nm_next_hdr *get_first_record(set_t, int, int);
extern dev_t getrootdev(void);

struct mdq_anchor md_done_daemon;	/* done request queue */
struct mdq_anchor md_mstr_daemon;	/* mirror error, WOW requests */
struct mdq_anchor md_mhs_daemon;	/* mirror hotspare requests queue */
struct mdq_anchor md_hs_daemon;		/* raid hotspare requests queue */
struct mdq_anchor md_ff_daemonq;	/* failfast request queue */
struct mdq_anchor md_mirror_daemon;	/* mirror owner queue */
struct mdq_anchor md_mirror_io_daemon;	/* mirror owner i/o queue */
struct mdq_anchor md_mirror_rs_daemon;	/* mirror resync done queue */
struct mdq_anchor md_sp_daemon;		/* soft-part error daemon queue */
struct mdq_anchor md_mto_daemon;	/* mirror timeout daemon queue */

int md_done_daemon_threads = 1;	/* threads for md_done_daemon requestq */
int md_mstr_daemon_threads = 1;	/* threads for md_mstr_daemon requestq */
int md_mhs_daemon_threads = 1;	/* threads for md_mhs_daemon requestq */
int md_hs_daemon_threads = 1;	/* threads for md_hs_daemon requestq */
int md_ff_daemon_threads = 3;	/* threads for md_ff_daemon requestq */
int md_mirror_daemon_threads = 1;	/* threads for md_mirror_daemon requestq */
int md_sp_daemon_threads = 1;	/* threads for md_sp_daemon requestq */
int md_mto_daemon_threads = 1;	/* threads for md_mto_daemon requestq */

#ifdef DEBUG
/* Flag to switch on debug messages */
int md_release_reacquire_debug = 0;	/* debug flag */
#endif

/*
 *
 * md_daemon_queues is a table of pointers to request queues and the number
 * of threads associated with the request queues.
 * When the number of threads is set to 1, the order of execution is
 * sequential.
 * The number of threads for each of the queues has been defined as a
 * global variable to enable kernel tuning.
 *
 */

#define	MD_DAEMON_QUEUES 11

md_requestq_entry_t md_daemon_queues[MD_DAEMON_QUEUES] = {
    {&md_done_daemon, &md_done_daemon_threads},
    {&md_mstr_daemon, &md_mstr_daemon_threads},
    {&md_hs_daemon, &md_hs_daemon_threads},
    {&md_ff_daemonq, &md_ff_daemon_threads},
    {&md_mirror_daemon, &md_mirror_daemon_threads},
    {&md_mirror_io_daemon, &md_mirror_daemon_threads},
    {&md_mirror_rs_daemon, &md_mirror_daemon_threads},
    {&md_sp_daemon, &md_sp_daemon_threads},
    {&md_mhs_daemon, &md_mhs_daemon_threads},
    {&md_mto_daemon, &md_mto_daemon_threads},
    {0, 0}
};

/*
 * Number of times a message is retried before issuing a warning to the operator
 */
#define	MD_MN_WARN_INTVL 10

/*
 * Setting retry cnt to one (pre decremented) so that we actually do no
 * retries when committing/deleting a mddb rec.  The underlying disk driver
 * does several retries to check if the disk is really dead or not so there
 * is no reason for us to retry on top of the drivers retries.
 */

uint_t md_retry_cnt = 1;	/* global so it can be patched */

/*
 * How many times to try to do the door_ki_upcall() in mdmn_ksend_message.
 * Again, made patchable here should it prove useful.
 */
uint_t md_send_retry_limit = 30;

/*
 * Bug # 1212146
 * Before this change the user had to pass in a short aligned buffer because of
 * problems in some underlying device drivers.  This problem seems to have been
 * corrected in the underlying drivers so we will default to not requiring any
 * alignment.  If the user needs to check for a specific alignment,
 * md_uio_alignment_mask may be set in /etc/system to accomplish this.  To get
 * the behavior before this fix, the md_uio_alignment_mask would be set to 1,
 * to check for word alignment, it can be set to 3, for double word alignment,
 * it can be set to 7, etc.
 *
 * [Other part of fix is in function md_chk_uio()]
 */
static int md_uio_alignment_mask = 0;

/*
 * for md_dev64_t translation
 */
struct md_xlate_table *md_tuple_table;
struct md_xlate_major_table *md_major_tuple_table;
int md_tuple_length;
uint_t md_majortab_len;

/* Function declarations */

static int md_create_probe_rqlist(md_probedev_impl_t *plist,
    daemon_queue_t **hdr, intptr_t (*probe_test)());

/*
 * manipulate global status
 */
void
md_set_status(int bits)
{
    mutex_enter(&md_mx);
    md_status |= bits;
    mutex_exit(&md_mx);
}

void
md_clr_status(int bits)
{
    mutex_enter(&md_mx);
    md_status &= ~bits;
    mutex_exit(&md_mx);
}

int
md_get_status()
{
    int result;
    mutex_enter(&md_mx);
    result = md_status;
    mutex_exit(&md_mx);
    return (result);
}

void
md_set_setstatus(set_t setno, int bits)
{
    ASSERT(setno != MD_SET_BAD && setno < MD_MAXSETS);

    mutex_enter(&md_mx);
    md_set[setno].s_status |= bits;
    mutex_exit(&md_mx);
}

void
md_clr_setstatus(set_t setno, int bits)
{
    ASSERT(setno != MD_SET_BAD && setno < MD_MAXSETS);

    mutex_enter(&md_mx);
    md_set[setno].s_status &= ~bits;
    mutex_exit(&md_mx);
}

uint_t
md_get_setstatus(set_t setno)
{
    uint_t result;

    ASSERT(setno != MD_SET_BAD && setno < MD_MAXSETS);

    mutex_enter(&md_mx);
    result = md_set[setno].s_status;
    mutex_exit(&md_mx);
    return (result);
}

/*
 * md_unit_readerlock_common:
 * -------------------------
 * Mark the given unit as having a reader reference. Spin waiting for any
 * writer references to be released.
 *
 * Input:
 *	ui		unit reference
 *	lock_held	0 => ui_mx needs to be grabbed
 *			1 => ui_mx already held
 * Output:
 *	mm_unit_t corresponding to unit structure
 *	ui->ui_readercnt incremented
 */
static void *
md_unit_readerlock_common(mdi_unit_t *ui, int lock_held)
{
    uint_t flag = MD_UL_WRITER | MD_UL_WANABEWRITER;

    if (!lock_held)
        mutex_enter(&ui->ui_mx);
    while (ui->ui_lock & flag) {
        if (panicstr) {
            if (ui->ui_lock & MD_UL_WRITER)
                panic("md: writer lock is held");
            break;
        }
        cv_wait(&ui->ui_cv, &ui->ui_mx);
    }
    ui->ui_readercnt++;
    if (!lock_held)
        mutex_exit(&ui->ui_mx);
    return (MD_UNIT(ui->ui_link.ln_id));
}

void *
md_unit_readerlock(mdi_unit_t *ui)
{
    return (md_unit_readerlock_common(ui, 0));
}
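/*
 * Usage sketch (illustrative only): callers that need a stable view of a
 * unit's in-core state typically bracket their access with the reader lock
 * pair, for example:
 *
 *	md_unit_t *un = (md_unit_t *)md_unit_readerlock(ui);
 *	... examine fields of *un ...
 *	md_unit_readerexit(ui);
 *
 * The writer lock pair (md_unit_writerlock/md_unit_writerexit, below) is
 * used the same way when the unit structure is being modified.
 */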
/*
 * md_unit_writerlock_common:
 * -------------------------
 * Acquire a unique writer reference. Causes previous readers to drain.
 * Spins if a writer reference already exists or if a previous reader/writer
 * dropped the lock to allow a ksend_message to be despatched.
 *
 * Input:
 *	ui		unit reference
 *	lock_held	0 => grab ui_mx
 *			1 => ui_mx already held on entry
 * Output:
 *	mm_unit_t reference
 */
static void *
md_unit_writerlock_common(mdi_unit_t *ui, int lock_held)
{
    uint_t flag = MD_UL_WRITER;

    if (panicstr)
        panic("md: writer lock not allowed");

    if (!lock_held)
        mutex_enter(&ui->ui_mx);

    while ((ui->ui_lock & flag) || (ui->ui_readercnt != 0)) {
        ui->ui_wanabecnt++;
        ui->ui_lock |= MD_UL_WANABEWRITER;
        cv_wait(&ui->ui_cv, &ui->ui_mx);
        if (--ui->ui_wanabecnt == 0)
            ui->ui_lock &= ~MD_UL_WANABEWRITER;
    }
    ui->ui_lock |= MD_UL_WRITER;
    ui->ui_owner = curthread;

    if (!lock_held)
        mutex_exit(&ui->ui_mx);
    return (MD_UNIT(ui->ui_link.ln_id));
}

void *
md_unit_writerlock(mdi_unit_t *ui)
{
    return (md_unit_writerlock_common(ui, 0));
}

/*
 * md_unit_readerexit_common:
 * -------------------------
 * Release the readerlock for the specified unit. If the reader count reaches
 * zero and there are waiting writers (MD_UL_WANABEWRITER set) wake them up.
 *
 * Input:
 *	ui		unit reference
 *	lock_held	0 => ui_mx needs to be acquired
 *			1 => ui_mx already held
 */
static void
md_unit_readerexit_common(mdi_unit_t *ui, int lock_held)
{
    if (!lock_held)
        mutex_enter(&ui->ui_mx);
    ASSERT((ui->ui_lock & MD_UL_WRITER) == 0);
    ASSERT(ui->ui_readercnt != 0);
    ui->ui_readercnt--;
    if ((ui->ui_wanabecnt != 0) && (ui->ui_readercnt == 0))
        cv_broadcast(&ui->ui_cv);

    if (!lock_held)
        mutex_exit(&ui->ui_mx);
}

void
md_unit_readerexit(mdi_unit_t *ui)
{
    md_unit_readerexit_common(ui, 0);
}

/*
 * md_unit_writerexit_common:
 * -------------------------
 * Release the writerlock currently held on the unit. Wake any threads waiting
 * on becoming reader or writer (MD_UL_WANABEWRITER set).
 *
 * Input:
 *	ui		unit reference
 *	lock_held	0 => ui_mx to be acquired
 *			1 => ui_mx already held
 */
static void
md_unit_writerexit_common(mdi_unit_t *ui, int lock_held)
{
    if (!lock_held)
        mutex_enter(&ui->ui_mx);
    ASSERT((ui->ui_lock & MD_UL_WRITER) != 0);
    ASSERT(ui->ui_readercnt == 0);
    ui->ui_lock &= ~MD_UL_WRITER;
    ui->ui_owner = NULL;

    cv_broadcast(&ui->ui_cv);
    if (!lock_held)
        mutex_exit(&ui->ui_mx);
}

void
md_unit_writerexit(mdi_unit_t *ui)
{
    md_unit_writerexit_common(ui, 0);
}

void *
md_io_readerlock(mdi_unit_t *ui)
{
    md_io_lock_t *io = ui->ui_io_lock;

    ASSERT(io);  /* checks case where no io lock allocated */
    mutex_enter(&io->io_mx);
    while (io->io_lock & (MD_UL_WRITER | MD_UL_WANABEWRITER)) {
        if (panicstr) {
            if (io->io_lock & MD_UL_WRITER)
                panic("md: writer lock is held");
            break;
        }
        cv_wait(&io->io_cv, &io->io_mx);
    }
    io->io_readercnt++;
    mutex_exit(&io->io_mx);
    return (MD_UNIT(ui->ui_link.ln_id));
}

void *
md_io_writerlock(mdi_unit_t *ui)
{
    md_io_lock_t *io = ui->ui_io_lock;

    ASSERT(io);  /* checks case where no io lock allocated */
    if (panicstr)
        panic("md: writer lock not allowed");

    mutex_enter(&io->io_mx);
    while ((io->io_lock & MD_UL_WRITER) || (io->io_readercnt != 0)) {
        io->io_wanabecnt++;
        io->io_lock |= MD_UL_WANABEWRITER;
        cv_wait(&io->io_cv, &io->io_mx);
        if (--io->io_wanabecnt == 0)
            io->io_lock &= ~MD_UL_WANABEWRITER;
    }
    io->io_lock |= MD_UL_WRITER;
    io->io_owner = curthread;

    mutex_exit(&io->io_mx);
    return (MD_UNIT(ui->ui_link.ln_id));
}

void
md_io_readerexit(mdi_unit_t *ui)
{
    md_io_lock_t *io = ui->ui_io_lock;

    mutex_enter(&io->io_mx);
    ASSERT((io->io_lock & MD_UL_WRITER) == 0);
    ASSERT(io->io_readercnt != 0);
    io->io_readercnt--;
    if ((io->io_wanabecnt != 0) && (io->io_readercnt == 0)) {
        cv_broadcast(&io->io_cv);
    }
    mutex_exit(&io->io_mx);
}

void
md_io_writerexit(mdi_unit_t *ui)
{
    md_io_lock_t *io = ui->ui_io_lock;

    mutex_enter(&io->io_mx);
    ASSERT((io->io_lock & MD_UL_WRITER) != 0);
    ASSERT(io->io_readercnt == 0);
    io->io_lock &= ~MD_UL_WRITER;
    io->io_owner = NULL;

    cv_broadcast(&io->io_cv);
    mutex_exit(&io->io_mx);
}
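/*
 * Usage sketch (illustrative only): the i/o lock pair above is used in the
 * same way as the unit reader/writer lock, but protects the i/o path of a
 * unit that has allocated an io lock, for example:
 *
 *	(void) md_io_readerlock(ui);
 *	... issue or complete i/o against the unit ...
 *	md_io_readerexit(ui);
 */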
/*
 * Attempt to grab that set of locks defined as global.
 * A mask containing the set of global locks that are owned upon
 * entry is input.  Any additional global locks are then grabbed.
 * This keeps the caller from having to know the set of global
 * locks.
 */
static int
md_global_lock_enter(int global_locks_owned_mask)
{

    /*
     * The current implementation has been verified by inspection
     * and test to be deadlock free.  If another global lock is
     * added, changing the algorithm used by this function should
     * be considered.  With more than 2 locks it is difficult to
     * guarantee that locks are being acquired in the correct order.
     * The safe approach would be to drop all of the locks that are
     * owned at function entry and then reacquire all of the locks
     * in the order defined by the lock hierarchy.
     */
    mutex_enter(&md_mx);
    if (!(global_locks_owned_mask & MD_GBL_IOCTL_LOCK)) {
        while ((md_mtioctl_cnt != 0) ||
            (md_status & MD_GBL_IOCTL_LOCK)) {
            if (cv_wait_sig_swap(&md_cv, &md_mx) == 0) {
                mutex_exit(&md_mx);
                return (EINTR);
            }
        }
        md_status |= MD_GBL_IOCTL_LOCK;
        md_ioctl_cnt++;
    }
    if (!(global_locks_owned_mask & MD_GBL_HS_LOCK)) {
        while (md_status & MD_GBL_HS_LOCK) {
            if (cv_wait_sig_swap(&md_cv, &md_mx) == 0) {
                md_status &= ~MD_GBL_IOCTL_LOCK;
                mutex_exit(&md_mx);
                return (EINTR);
            }
        }
        md_status |= MD_GBL_HS_LOCK;
    }
    mutex_exit(&md_mx);
    return (0);
}

/*
 * Release the set of global locks that were grabbed in md_global_lock_enter
 * that were not already owned by the calling thread.  The set of previously
 * owned global locks is passed in as a mask parameter.
 */
static int
md_global_lock_exit(int global_locks_owned_mask, int code,
    int flags, mdi_unit_t *ui)
{
    mutex_enter(&md_mx);

    /* If MT ioctl decrement mt_ioctl_cnt */
    if ((flags & MD_MT_IOCTL)) {
        md_mtioctl_cnt--;
    } else {
        if (!(global_locks_owned_mask & MD_GBL_IOCTL_LOCK)) {
            /* clear the lock and decrement count */
            ASSERT(md_ioctl_cnt == 1);
            md_ioctl_cnt--;
            md_status &= ~MD_GBL_IOCTL_LOCK;
        }
        if (!(global_locks_owned_mask & MD_GBL_HS_LOCK))
            md_status &= ~MD_GBL_HS_LOCK;
    }
    if (flags & MD_READER_HELD)
        md_unit_readerexit(ui);
    if (flags & MD_WRITER_HELD)
        md_unit_writerexit(ui);
    if (flags & MD_IO_HELD)
        md_io_writerexit(ui);
    if (flags & (MD_ARRAY_WRITER | MD_ARRAY_READER)) {
        rw_exit(&md_unit_array_rw.lock);
    }
    cv_broadcast(&md_cv);
    mutex_exit(&md_mx);

    return (code);
}

/*
 * The two functions, md_ioctl_lock_enter, and md_ioctl_lock_exit make
 * use of the md_global_lock_{enter|exit} functions to avoid duplication
 * of code.  They rely upon the fact that the locks that are specified in
 * the input mask are not acquired or freed.  If this algorithm changes
 * as described in the block comment at the beginning of md_global_lock_enter
 * then it will be necessary to change these 2 functions.  Otherwise these
 * functions will be grabbing and holding global locks unnecessarily.
 */
int
md_ioctl_lock_enter(void)
{
    /* grab only the ioctl lock */
    return (md_global_lock_enter(~MD_GBL_IOCTL_LOCK));
}

/*
 * If md_ioctl_lock_exit is being called at the end of an ioctl before
 * returning to user space, then ioctl_end is set to 1.
 * Otherwise, the ioctl lock is being dropped in the middle of handling
 * an ioctl and will be reacquired before the end of the ioctl.
 * Do not attempt to process the MN diskset mddb parse flags unless
 * ioctl_end is true - otherwise a deadlock situation could arise.
 */
int
md_ioctl_lock_exit(int code, int flags, mdi_unit_t *ui, int ioctl_end)
{
    int			ret_val;
    uint_t			status;
    mddb_set_t		*s;
    int			i;
    int			err;
    md_mn_msg_mddb_parse_t	*mddb_parse_msg;
    md_mn_kresult_t		*kresult;
    mddb_lb_t		*lbp;
    int			rval = 1;
    int			flag;

    /* release only the ioctl lock */
    ret_val = md_global_lock_exit(~MD_GBL_IOCTL_LOCK, code, flags, ui);

    /*
     * If md_ioctl_lock_exit is being called with a possible lock held
     * (ioctl_end is 0), then don't check the MN disksets since the
     * call to mddb_setenter may cause a lock ordering deadlock.
     */
    if (!ioctl_end)
        return (ret_val);

    /*
     * Walk through disksets to see if there is a MN diskset that
     * has messages that need to be sent.  Set must be snarfed and
     * be a MN diskset in order to be checked.
     *
     * In a MN diskset, this routine may send messages to the
     * rpc.mdcommd in order to have the slave nodes re-parse parts
     * of the mddb.  Messages can only be sent with no locks held,
     * so if mddb change occurred while the ioctl lock is held, this
     * routine must send the messages.
     */
    for (i = 1; i < md_nsets; i++) {
        status = md_get_setstatus(i);

        /* Set must be snarfed and be a MN diskset */
        if ((status & (MD_SET_SNARFED | MD_SET_MNSET)) !=
            (MD_SET_SNARFED | MD_SET_MNSET))
            continue;

        /* Grab set lock so that set can't change */
        if ((s = mddb_setenter(i, MDDB_MUSTEXIST, &err)) == NULL)
            continue;

        lbp = s->s_lbp;

        /* Re-get set status now that lock is held */
        status = md_get_setstatus(i);

        /*
         * If MN parsing block flag is set - continue to next set.
         *
         * If s_mn_parseflags_sending is non-zero, then another thread
         * is already currently sending a parse message, so just
         * release the set mutex.  If this ioctl had caused an mddb
         * change that results in a parse message to be generated,
         * the thread that is currently sending a parse message would
         * generate the additional parse message.
         *
         * If s_mn_parseflags_sending is zero then loop until
         * s_mn_parseflags is 0 (until there are no more
         * messages to send).
         * While s_mn_parseflags is non-zero,
         *	put snapshot of parse_flags in s_mn_parseflags_sending
         *	set s_mn_parseflags to zero
         *	release set mutex
         *	send message
         *	re-grab set mutex
         *	set s_mn_parseflags_sending to zero
         *
         * If set is STALE, send message with NO_LOG flag so that
         * rpc.mdcommd won't attempt to log message to non-writeable
         * replica.
         */
        mddb_parse_msg = kmem_zalloc(sizeof (md_mn_msg_mddb_parse_t),
            KM_SLEEP);
        while (((s->s_mn_parseflags_sending & MDDB_PARSE_MASK) == 0) &&
            (s->s_mn_parseflags & MDDB_PARSE_MASK) &&
            (!(status & MD_SET_MNPARSE_BLK))) {

            /* Grab snapshot of parse flags */
            s->s_mn_parseflags_sending = s->s_mn_parseflags;
            s->s_mn_parseflags = 0;

            mutex_exit(&md_set[(s)->s_setno].s_dbmx);

            /*
             * Send the message to the slaves to re-parse
             * the indicated portions of the mddb. Send the status
             * of the 50 mddbs in this set so that slaves know
             * which mddbs that the master node thinks are 'good'.
             * Otherwise, slave may reparse, but from wrong
             * replica.
             */
            mddb_parse_msg->msg_parse_flags =
                s->s_mn_parseflags_sending;

            for (i = 0; i < MDDB_NLB; i++) {
                mddb_parse_msg->msg_lb_flags[i] =
                    lbp->lb_locators[i].l_flags;
            }
            kresult = kmem_zalloc(sizeof (md_mn_kresult_t),
                KM_SLEEP);
            while (rval != 0) {
                flag = 0;
                if (status & MD_SET_STALE)
                    flag |= MD_MSGF_NO_LOG;
                rval = mdmn_ksend_message(s->s_setno,
                    MD_MN_MSG_MDDB_PARSE, flag, 0,
                    (char *)mddb_parse_msg,
                    sizeof (md_mn_msg_mddb_parse_t), kresult);
                /* if the node hasn't yet joined, it's Ok. */
                if ((!MDMN_KSEND_MSG_OK(rval, kresult)) &&
                    (kresult->kmmr_comm_state !=
                    MDMNE_NOT_JOINED)) {
                    mdmn_ksend_show_error(rval, kresult,
                        "MD_MN_MSG_MDDB_PARSE");
                    cmn_err(CE_WARN, "md_ioctl_lock_exit: "
                        "Unable to send mddb update "
                        "message to other nodes in "
                        "diskset %s\n", s->s_setname);
                    rval = 1;
                }
            }
            kmem_free(kresult, sizeof (md_mn_kresult_t));

            /*
             * Re-grab mutex to clear sending field and to
             * see if another parse message needs to be generated.
             */
            mutex_enter(&md_set[(s)->s_setno].s_dbmx);
            s->s_mn_parseflags_sending = 0;
        }
        kmem_free(mddb_parse_msg, sizeof (md_mn_msg_mddb_parse_t));
        mutex_exit(&md_set[(s)->s_setno].s_dbmx);
    }
    return (ret_val);
}

/*
 * Called when in an ioctl and need readerlock.
 */
void *
md_ioctl_readerlock(IOLOCK *lock, mdi_unit_t *ui)
{
    ASSERT(lock != NULL);
    lock->l_ui = ui;
    lock->l_flags |= MD_READER_HELD;
    return (md_unit_readerlock_common(ui, 0));
}

/*
 * Called when in an ioctl and need writerlock.
 */
void *
md_ioctl_writerlock(IOLOCK *lock, mdi_unit_t *ui)
{
    ASSERT(lock != NULL);
    lock->l_ui = ui;
    lock->l_flags |= MD_WRITER_HELD;
    return (md_unit_writerlock_common(ui, 0));
}

void *
md_ioctl_io_lock(IOLOCK *lock, mdi_unit_t *ui)
{
    ASSERT(lock != NULL);
    lock->l_ui = ui;
    lock->l_flags |= MD_IO_HELD;
    return (md_io_writerlock(ui));
}

void
md_ioctl_readerexit(IOLOCK *lock)
{
    ASSERT(lock != NULL);
    lock->l_flags &= ~MD_READER_HELD;
    md_unit_readerexit(lock->l_ui);
}

void
md_ioctl_writerexit(IOLOCK *lock)
{
    ASSERT(lock != NULL);
    lock->l_flags &= ~MD_WRITER_HELD;
    md_unit_writerexit(lock->l_ui);
}

void
md_ioctl_io_exit(IOLOCK *lock)
{
    ASSERT(lock != NULL);
    lock->l_flags &= ~MD_IO_HELD;
    md_io_writerexit(lock->l_ui);
}

/*
 * md_ioctl_releaselocks:
 * --------------------
 * Release the unit locks that are held and stop subsequent
 * md_unit_reader/writerlock calls from progressing. This allows the caller
 * to send messages across the cluster when running in a multinode
 * environment.
 * ioctl originated locks (via md_ioctl_readerlock/md_ioctl_writerlock) are
 * allowed to progress as normal. This is required as these typically are
 * invoked by the message handler that may be called while a unit lock is
 * marked as released.
 *
 * On entry:
 *	variety of unit locks may be held including ioctl lock
 *
 * On exit:
 *	locks released and unit structure updated to prevent subsequent reader/
 *	writer locks being acquired until md_ioctl_reacquirelocks is called
 */
void
md_ioctl_releaselocks(int code, int flags, mdi_unit_t *ui)
{
    /* This actually releases the locks. */
    (void) md_global_lock_exit(~MD_GBL_IOCTL_LOCK, code, flags, ui);
}
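/*
 * Usage sketch (illustrative only): a multinode-aware code path that must
 * send a cluster message while unit locks are held would typically do:
 *
 *	md_ioctl_releaselocks(0, flags, ui);
 *	... call mdmn_ksend_message() with no locks held ...
 *	md_ioctl_reacquirelocks(flags, ui);
 *
 * where "flags" records which locks (MD_READER_HELD, MD_WRITER_HELD,
 * MD_IO_HELD, MD_ARRAY_READER/MD_ARRAY_WRITER) were held on entry.
 */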
/*
 * md_ioctl_reacquirelocks:
 * ----------------------
 * Reacquire the locks that were held when md_ioctl_releaselocks
 * was called.
 *
 * On entry:
 *	No unit locks held
 * On exit:
 *	locks held that were held at md_ioctl_releaselocks time including
 *	the ioctl lock.
 */
void
md_ioctl_reacquirelocks(int flags, mdi_unit_t *ui)
{
    if (flags & MD_MT_IOCTL) {
        mutex_enter(&md_mx);
        md_mtioctl_cnt++;
        mutex_exit(&md_mx);
    } else {
        while (md_ioctl_lock_enter() == EINTR)
            ;
    }
    if (flags & MD_ARRAY_WRITER) {
        rw_enter(&md_unit_array_rw.lock, RW_WRITER);
    } else if (flags & MD_ARRAY_READER) {
        rw_enter(&md_unit_array_rw.lock, RW_READER);
    }
    if (ui != (mdi_unit_t *)NULL) {
        if (flags & MD_IO_HELD) {
            (void) md_io_writerlock(ui);
        }

        mutex_enter(&ui->ui_mx);
        if (flags & MD_READER_HELD) {
            (void) md_unit_readerlock_common(ui, 1);
        } else if (flags & MD_WRITER_HELD) {
            (void) md_unit_writerlock_common(ui, 1);
        }
        /* Wake up any blocked readerlock() calls */
        cv_broadcast(&ui->ui_cv);
        mutex_exit(&ui->ui_mx);
    }
}

void
md_ioctl_droplocks(IOLOCK *lock)
{
    mdi_unit_t	*ui;
    int		flags;

    ASSERT(lock != NULL);
    ui = lock->l_ui;
    flags = lock->l_flags;
    if (flags & MD_READER_HELD) {
        lock->l_flags &= ~MD_READER_HELD;
        md_unit_readerexit(ui);
    }
    if (flags & MD_WRITER_HELD) {
        lock->l_flags &= ~MD_WRITER_HELD;
        md_unit_writerexit(ui);
    }
    if (flags & MD_IO_HELD) {
        lock->l_flags &= ~MD_IO_HELD;
        md_io_writerexit(ui);
    }
    if (flags & (MD_ARRAY_WRITER | MD_ARRAY_READER)) {
        lock->l_flags &= ~(MD_ARRAY_WRITER | MD_ARRAY_READER);
        rw_exit(&md_unit_array_rw.lock);
    }
}

void
md_array_writer(IOLOCK *lock)
{
    ASSERT(lock != NULL);
    lock->l_flags |= MD_ARRAY_WRITER;
    rw_enter(&md_unit_array_rw.lock, RW_WRITER);
}

void
md_array_reader(IOLOCK *lock)
{
    ASSERT(lock != NULL);
    lock->l_flags |= MD_ARRAY_READER;
    rw_enter(&md_unit_array_rw.lock, RW_READER);
}

/*
 * Called when in an ioctl and need opencloselock.
 * Sets flags in lockp for READER_HELD.
 */
void *
md_ioctl_openclose_enter(IOLOCK *lockp, mdi_unit_t *ui)
{
    void	*un;

    ASSERT(lockp != NULL);
    mutex_enter(&ui->ui_mx);
    while (ui->ui_lock & MD_UL_OPENORCLOSE)
        cv_wait(&ui->ui_cv, &ui->ui_mx);
    ui->ui_lock |= MD_UL_OPENORCLOSE;

    /* Maintain mutex across the readerlock call */
    lockp->l_ui = ui;
    lockp->l_flags |= MD_READER_HELD;
    un = md_unit_readerlock_common(ui, 1);
    mutex_exit(&ui->ui_mx);

    return (un);
}

/*
 * Clears reader lock using md_ioctl instead of md_unit
 * and updates lockp.
 */
void
md_ioctl_openclose_exit(IOLOCK *lockp)
{
    mdi_unit_t	*ui;

    ASSERT(lockp != NULL);
    ui = lockp->l_ui;
    ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE);

    md_ioctl_readerexit(lockp);

    mutex_enter(&ui->ui_mx);
    ui->ui_lock &= ~MD_UL_OPENORCLOSE;

    cv_broadcast(&ui->ui_cv);
    mutex_exit(&ui->ui_mx);
}
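/*
 * Usage sketch (illustrative only): ioctl handlers record their unit locks
 * in an IOLOCK so that a single cleanup call can drop whatever is held,
 * for example:
 *
 *	un = (md_unit_t *)md_ioctl_readerlock(lockp, ui);
 *	... handle the ioctl ...
 *	md_ioctl_droplocks(lockp);
 */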
/*
 * Clears reader lock using md_ioctl instead of md_unit
 * and updates lockp.
 * Does not acquire or release the ui_mx lock since the calling
 * routine has already acquired this lock.
 */
void
md_ioctl_openclose_exit_lh(IOLOCK *lockp)
{
    mdi_unit_t	*ui;

    ASSERT(lockp != NULL);
    ui = lockp->l_ui;
    ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE);

    lockp->l_flags &= ~MD_READER_HELD;
    md_unit_readerexit_common(lockp->l_ui, 1);

    ui->ui_lock &= ~MD_UL_OPENORCLOSE;
    cv_broadcast(&ui->ui_cv);
}

void *
md_unit_openclose_enter(mdi_unit_t *ui)
{
    void	*un;

    mutex_enter(&ui->ui_mx);
    while (ui->ui_lock & (MD_UL_OPENORCLOSE))
        cv_wait(&ui->ui_cv, &ui->ui_mx);
    ui->ui_lock |= MD_UL_OPENORCLOSE;

    /* Maintain mutex across the readerlock call */
    un = md_unit_readerlock_common(ui, 1);
    mutex_exit(&ui->ui_mx);

    return (un);
}

void
md_unit_openclose_exit(mdi_unit_t *ui)
{
    md_unit_readerexit(ui);

    mutex_enter(&ui->ui_mx);
    ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE);
    ui->ui_lock &= ~MD_UL_OPENORCLOSE;

    cv_broadcast(&ui->ui_cv);
    mutex_exit(&ui->ui_mx);
}

/*
 * Drop the openclose and readerlocks without acquiring or
 * releasing the ui_mx lock since the calling routine has
 * already acquired this lock.
 */
void
md_unit_openclose_exit_lh(mdi_unit_t *ui)
{
    md_unit_readerexit_common(ui, 1);
    ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE);
    ui->ui_lock &= ~MD_UL_OPENORCLOSE;
    cv_broadcast(&ui->ui_cv);
}

int
md_unit_isopen(
    mdi_unit_t	*ui
)
{
    int	isopen;

    /* check status */
    mutex_enter(&ui->ui_mx);
    isopen = ((ui->ui_lock & MD_UL_OPEN) ? 1 : 0);
    mutex_exit(&ui->ui_mx);
    return (isopen);
}

int
md_unit_incopen(
    minor_t		mnum,
    int		flag,
    int		otyp
)
{
    mdi_unit_t	*ui = MDI_UNIT(mnum);
    int		err = 0;

    /* check type and flags */
    ASSERT(ui != NULL);
    mutex_enter(&ui->ui_mx);
    if ((otyp < 0) || (otyp >= OTYPCNT)) {
        err = EINVAL;
        goto out;
    }
    if (((flag & FEXCL) && (ui->ui_lock & MD_UL_OPEN)) ||
        (ui->ui_lock & MD_UL_EXCL)) {
        err = EBUSY;
        goto out;
    }

    /* count and flag open */
    ui->ui_ocnt[otyp]++;
    ui->ui_lock |= MD_UL_OPEN;
    if (flag & FEXCL)
        ui->ui_lock |= MD_UL_EXCL;

    /* setup kstat, return success */
    mutex_exit(&ui->ui_mx);
    md_kstat_init(mnum);
    return (0);

    /* return error */
out:
    mutex_exit(&ui->ui_mx);
    return (err);
}

int
md_unit_decopen(
    minor_t		mnum,
    int		otyp
)
{
    mdi_unit_t	*ui = MDI_UNIT(mnum);
    int		err = 0;
    unsigned	i;

    /* check type and flags */
    ASSERT(ui != NULL);
    mutex_enter(&ui->ui_mx);
    if ((otyp < 0) || (otyp >= OTYPCNT)) {
        err = EINVAL;
        goto out;
    } else if (ui->ui_ocnt[otyp] == 0) {
        err = ENXIO;
        goto out;
    }

    /* count and flag closed */
    if (otyp == OTYP_LYR)
        ui->ui_ocnt[otyp]--;
    else
        ui->ui_ocnt[otyp] = 0;
    ui->ui_lock &= ~MD_UL_OPEN;
    for (i = 0; (i < OTYPCNT); ++i)
        if (ui->ui_ocnt[i] != 0)
            ui->ui_lock |= MD_UL_OPEN;
    if (! (ui->ui_lock & MD_UL_OPEN))
        ui->ui_lock &= ~MD_UL_EXCL;

    /* teardown kstat, return success */
    if (! (ui->ui_lock & MD_UL_OPEN)) {
        mutex_exit(&ui->ui_mx);
        md_kstat_destroy(mnum);
        return (0);
    }

    /* return success */
out:
    mutex_exit(&ui->ui_mx);
    return (err);
}

md_dev64_t
md_xlate_targ_2_mini(md_dev64_t targ_devt)
{
    dev32_t	mini_32_devt, targ_32_devt;
    int	i;

    /*
     * check to see if we're in an upgrade situation
     * if we are not in upgrade just return the input device
     */

    if (!MD_UPGRADE)
        return (targ_devt);

    targ_32_devt = md_cmpldev(targ_devt);

    i = 0;
    while (i != md_tuple_length) {
        if (md_tuple_table[i].targ_devt == targ_32_devt) {
            mini_32_devt = md_tuple_table[i].mini_devt;
            return (md_expldev((md_dev64_t)mini_32_devt));
        }
        i++;
    }
    return (NODEV64);
}

md_dev64_t
md_xlate_mini_2_targ(md_dev64_t mini_devt)
{
    dev32_t	mini_32_devt, targ_32_devt;
    int	i;

    if (!MD_UPGRADE)
        return (mini_devt);

    mini_32_devt = md_cmpldev(mini_devt);

    i = 0;
    while (i != md_tuple_length) {
        if (md_tuple_table[i].mini_devt == mini_32_devt) {
            targ_32_devt = md_tuple_table[i].targ_devt;
            return (md_expldev((md_dev64_t)targ_32_devt));
        }
        i++;
    }
    return (NODEV64);
}

void
md_xlate_free(int size)
{
    kmem_free(md_tuple_table, size);
}

char *
md_targ_major_to_name(major_t maj)
{
    char *drv_name = NULL;
    int	i;

    if (!MD_UPGRADE)
        return (ddi_major_to_name(maj));

    for (i = 0; i < md_majortab_len; i++) {
        if (md_major_tuple_table[i].targ_maj == maj) {
            drv_name = md_major_tuple_table[i].drv_name;
            break;
        }
    }
    return (drv_name);
}

major_t
md_targ_name_to_major(char *drv_name)
{
    major_t	maj;
    int	i;

    maj = md_getmajor(NODEV64);
    if (!MD_UPGRADE)
        return (ddi_name_to_major(drv_name));

    for (i = 0; i < md_majortab_len; i++) {
        if ((strcmp(md_major_tuple_table[i].drv_name,
            drv_name)) == 0) {
            maj = md_major_tuple_table[i].targ_maj;
            break;
        }
    }

    return (maj);
}

void
md_majortab_free()
{
    size_t	sz;
    int	i;

    for (i = 0; i < md_majortab_len; i++) {
        freestr(md_major_tuple_table[i].drv_name);
    }

    sz = md_majortab_len * sizeof (struct md_xlate_major_table);
    kmem_free(md_major_tuple_table, sz);
}
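/*
 * Usage sketch (illustrative only): the translation routines above are only
 * meaningful during an upgrade (MD_UPGRADE), when a device recorded under
 * the target system's dev_t must be mapped to the miniroot's dev_t:
 *
 *	md_dev64_t mini_dev = md_xlate_targ_2_mini(targ_dev);
 *	if (mini_dev == NODEV64)
 *		... no translation exists, handle the error ...
 *
 * Outside of upgrade the input device is returned unchanged.
 */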
/* functions return a pointer to a function which returns an int */

intptr_t (*
md_get_named_service(md_dev64_t dev, int modindex, char *name,
    intptr_t (*Default)()))()
{
    mdi_unit_t		*ui;
    md_named_services_t	*sp;
    int			i;

    /*
     * Return the first named service found.
     * Use this path when it is known that there is only
     * one named service possible (e.g., hotspare interface)
     */
    if ((dev == NODEV64) && (modindex == ANY_SERVICE)) {
        for (i = 0; i < MD_NOPS; i++) {
            if (md_ops[i] == NULL) {
                continue;
            }
            sp = md_ops[i]->md_services;
            if (sp == NULL)
                continue;
            while (sp->md_service != NULL) {
                if (strcmp(name, sp->md_name) == 0)
                    return (sp->md_service);
                sp++;
            }
        }
        return (Default);
    }

    /*
     * Return the named service for the given modindex.
     * This is used if there are multiple possible named services
     * and each one needs to be called (e.g., poke hotspares)
     */
    if (dev == NODEV64) {
        if (modindex >= MD_NOPS)
            return (Default);

        if (md_ops[modindex] == NULL)
            return (Default);

        sp = md_ops[modindex]->md_services;
        if (sp == NULL)
            return (Default);

        while (sp->md_service != NULL) {
            if (strcmp(name, sp->md_name) == 0)
                return (sp->md_service);
            sp++;
        }
        return (Default);
    }

    /*
     * Return the named service for this md_dev64_t
     */
    if (md_getmajor(dev) != md_major)
        return (Default);

    if ((MD_MIN2SET(md_getminor(dev)) >= md_nsets) ||
        (MD_MIN2UNIT(md_getminor(dev)) >= md_nunits))
        return (NULL);


    if ((ui = MDI_UNIT(md_getminor(dev))) == NULL)
        return (NULL);

    sp = md_ops[ui->ui_opsindex]->md_services;
    if (sp == NULL)
        return (Default);
    while (sp->md_service != NULL) {
        if (strcmp(name, sp->md_name) == 0)
            return (sp->md_service);
        sp++;
    }
    return (Default);
}

/*
 * md_daemon callback routine
 */
boolean_t
callb_md_cpr(void *arg, int code)
{
    callb_cpr_t	*cp = (callb_cpr_t *)arg;
    int		ret = 0;		/* assume success */

    mutex_enter(cp->cc_lockp);

    switch (code) {
    case CB_CODE_CPR_CHKPT:
        /*
         * Check for active resync threads
         */
        mutex_enter(&md_cpr_resync.md_resync_mutex);
        if ((md_cpr_resync.md_mirror_resync > 0) ||
            (md_cpr_resync.md_raid_resync > 0)) {
            mutex_exit(&md_cpr_resync.md_resync_mutex);
            cmn_err(CE_WARN, "There are Solaris Volume Manager "
                "synchronization threads running.");
            cmn_err(CE_WARN, "Please try system suspension at "
                "a later time.");
            ret = -1;
            break;
        }
        mutex_exit(&md_cpr_resync.md_resync_mutex);

        cp->cc_events |= CALLB_CPR_START;
        while (!(cp->cc_events & CALLB_CPR_SAFE))
            /* cv_timedwait() returns -1 if it times out. */
            if ((ret = cv_timedwait(&cp->cc_callb_cv, cp->cc_lockp,
                lbolt + CPR_KTHREAD_TIMEOUT_SEC * hz)) == -1)
                break;
        break;

    case CB_CODE_CPR_RESUME:
        cp->cc_events &= ~CALLB_CPR_START;
        cv_signal(&cp->cc_stop_cv);
        break;
    }
    mutex_exit(cp->cc_lockp);
    return (ret != -1);
}

void
md_daemon(int pass_thru, mdq_anchor_t *anchor)
{
    daemon_queue_t	*dq;
    callb_cpr_t	cprinfo;

    if (pass_thru && (md_get_status() & MD_GBL_DAEMONS_LIVE))
        return;
    /*
     * Register cpr callback
     */
    CALLB_CPR_INIT(&cprinfo, &anchor->a_mx, callb_md_cpr, "md_daemon");

    /*CONSTCOND*/
    while (1) {
        mutex_enter(&anchor->a_mx);
        while ((dq = anchor->dq.dq_next) == &(anchor->dq)) {
            if (pass_thru) {
                /*
                 * CALLB_CPR_EXIT Will do
                 * mutex_exit(&anchor->a_mx)
                 */
                CALLB_CPR_EXIT(&cprinfo);
                return;
            }
            if (md_get_status() & MD_GBL_DAEMONS_DIE) {
                mutex_exit(&anchor->a_mx);
                mutex_enter(&md_mx);
                md_num_daemons--;
                mutex_exit(&md_mx);
                /*
                 * CALLB_CPR_EXIT will do
                 * mutex_exit(&anchor->a_mx)
                 */
                mutex_enter(&anchor->a_mx);
                CALLB_CPR_EXIT(&cprinfo);
                thread_exit();
            }
            CALLB_CPR_SAFE_BEGIN(&cprinfo);
            cv_wait(&anchor->a_cv, &anchor->a_mx);
            CALLB_CPR_SAFE_END(&cprinfo, &anchor->a_mx);
        }
        dq->dq_prev->dq_next = dq->dq_next;
        dq->dq_next->dq_prev = dq->dq_prev;
        dq->dq_prev = dq->dq_next = NULL;
        anchor->dq.qlen--;
        mutex_exit(&anchor->a_mx);
        (*(dq->dq_call))(dq);
    }
    /*NOTREACHED*/
}

/*
 * daemon_request:
 *
 * Adds requests to appropriate requestq which is
 * anchored by *anchor.
 * The request is the first element of a doubly linked circular list.
 * When the request is a single element, the forward and backward
 * pointers MUST point to the element itself.
 */

void
daemon_request(mdq_anchor_t *anchor, void (*func)(),
    daemon_queue_t *request, callstyle_t style)
{
    daemon_queue_t	*rqtp;
    int		i = 0;

    rqtp = request;
    if (style == REQ_OLD) {
        ASSERT((rqtp->dq_next == NULL) && (rqtp->dq_prev == NULL));
        /* set it to the new style */
        rqtp->dq_prev = rqtp->dq_next = rqtp;
    }
    ASSERT((rqtp->dq_next != NULL) && (rqtp->dq_prev != NULL));

    /* scan the list and add the function to each element */

    do {
        rqtp->dq_call = func;
        i++;
        rqtp = rqtp->dq_next;
    } while (rqtp != request);

    /* save pointer to tail of the request list */
    rqtp = request->dq_prev;

    mutex_enter(&anchor->a_mx);
    /* stats */
    anchor->dq.qlen += i;
    anchor->dq.treqs += i;
    anchor->dq.maxq_len = (anchor->dq.qlen > anchor->dq.maxq_len) ?
        anchor->dq.qlen : anchor->dq.maxq_len;

    /* now add the list to request queue */
    request->dq_prev = anchor->dq.dq_prev;
    rqtp->dq_next = &anchor->dq;
    anchor->dq.dq_prev->dq_next = request;
    anchor->dq.dq_prev = rqtp;
    cv_broadcast(&anchor->a_cv);
    mutex_exit(&anchor->a_mx);
}
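/*
 * Usage sketch (illustrative only): a caller hands work to one of the
 * daemon queues by embedding a daemon_queue_t at the start of its request
 * structure and queuing it, e.g. on the done queue:
 *
 *	req->dq.dq_next = req->dq.dq_prev = NULL;
 *	daemon_request(&md_done_daemon, my_handler,
 *	    (daemon_queue_t *)req, REQ_OLD);
 *
 * where "req" and my_handler() are hypothetical; my_handler() is what a
 * daemon thread will eventually invoke with the dequeued request.
 */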
void
mddb_commitrec_wrapper(mddb_recid_t recid)
{
    int		sent_log = 0;
    uint_t		retry = md_retry_cnt;
    set_t		setno;

    while (mddb_commitrec(recid)) {
        if (! sent_log) {
            cmn_err(CE_WARN,
                "md: state database commit failed");
            sent_log = 1;
        }
        delay(md_hz);

        /*
         * Setting retry cnt to one (pre decremented) so that we
         * actually do no retries when committing/deleting a mddb rec.
         * The underlying disk driver does several retries to check
         * if the disk is really dead or not so there
         * is no reason for us to retry on top of the drivers retries.
         */

        if (--retry == 0) {
            setno = mddb_getsetnum(recid);
            if (md_get_setstatus(setno) & MD_SET_TOOFEW) {
                panic(
                    "md: Panic due to lack of DiskSuite state\n"
                    " database replicas. Fewer than 50%% of "
                    "the total were available,\n so panic to "
                    "ensure data integrity.");
            } else {
                panic("md: state database problem");
            }
            /*NOTREACHED*/
        }
    }
}

void
mddb_commitrecs_wrapper(mddb_recid_t *recids)
{
    int		sent_log = 0;
    uint_t		retry = md_retry_cnt;
    set_t		setno;

    while (mddb_commitrecs(recids)) {
        if (! sent_log) {
            cmn_err(CE_WARN,
                "md: state database commit failed");
            sent_log = 1;
        }
        delay(md_hz);

        /*
         * Setting retry cnt to one (pre decremented) so that we
         * actually do no retries when committing/deleting a mddb rec.
         * The underlying disk driver does several retries to check
         * if the disk is really dead or not so there
         * is no reason for us to retry on top of the drivers retries.
         */

        if (--retry == 0) {
            /*
             * since all the records are part of the same set
             * use the first one to get setno
             */
            setno = mddb_getsetnum(*recids);
            if (md_get_setstatus(setno) & MD_SET_TOOFEW) {
                panic(
                    "md: Panic due to lack of DiskSuite state\n"
                    " database replicas. Fewer than 50%% of "
                    "the total were available,\n so panic to "
                    "ensure data integrity.");
            } else {
                panic("md: state database problem");
            }
            /*NOTREACHED*/
        }
    }
}

void
mddb_deleterec_wrapper(mddb_recid_t recid)
{
    int		sent_log = 0;
    uint_t		retry = md_retry_cnt;
    set_t		setno;

    while (mddb_deleterec(recid)) {
        if (! sent_log) {
            cmn_err(CE_WARN,
                "md: state database delete failed");
            sent_log = 1;
        }
        delay(md_hz);

        /*
         * Setting retry cnt to one (pre decremented) so that we
         * actually do no retries when committing/deleting a mddb rec.
         * The underlying disk driver does several retries to check
         * if the disk is really dead or not so there
         * is no reason for us to retry on top of the drivers retries.
         */

        if (--retry == 0) {
            setno = mddb_getsetnum(recid);
            if (md_get_setstatus(setno) & MD_SET_TOOFEW) {
                panic(
                    "md: Panic due to lack of DiskSuite state\n"
                    " database replicas. Fewer than 50%% of "
                    "the total were available,\n so panic to "
                    "ensure data integrity.");
            } else {
                panic("md: state database problem");
            }
            /*NOTREACHED*/
        }
    }
}
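/*
 * Usage sketch (illustrative only): submodules update their in-core unit
 * structure and then make the change durable through one of the wrappers
 * above, for example:
 *
 *	... modify the in-core unit structure ...
 *	mddb_commitrec_wrapper(un->c.un_record_id);
 *
 * The wrapper either succeeds or panics; callers do not handle a commit
 * failure themselves.
 */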
/*
 * md_holdset_enter is called in order to hold the set in its
 * current state (loaded, unloaded, snarfed, unsnarfed, etc)
 * until md_holdset_exit is called.  This is used by the mirror
 * code to mark the set as HOLD so that the set won't be
 * unloaded while hotspares are being allocated in check_4_hotspares.
 * The original fix to the mirror code to hold the set was to call
 * md_haltsnarf_enter, but this will block all ioctls and ioctls
 * must work for a MN diskset while hotspares are allocated.
 */
void
md_holdset_enter(set_t setno)
{
    mutex_enter(&md_mx);
    while (md_set[setno].s_status & MD_SET_HOLD)
        cv_wait(&md_cv, &md_mx);
    md_set[setno].s_status |= MD_SET_HOLD;
    mutex_exit(&md_mx);
}

void
md_holdset_exit(set_t setno)
{
    mutex_enter(&md_mx);
    md_set[setno].s_status &= ~MD_SET_HOLD;
    cv_broadcast(&md_cv);
    mutex_exit(&md_mx);
}

/*
 * Returns a 0 if this thread marked the set as HOLD (success),
 * returns a -1 if set was already marked HOLD (failure).
 * Used by the release_set code to see if set is marked HOLD.
 * HOLD is set by a daemon when hotspares are being allocated
 * to mirror units.
 */
int
md_holdset_testandenter(set_t setno)
{
    mutex_enter(&md_mx);
    if (md_set[setno].s_status & MD_SET_HOLD) {
        mutex_exit(&md_mx);
        return (-1);
    }
    md_set[setno].s_status |= MD_SET_HOLD;
    mutex_exit(&md_mx);
    return (0);
}

void
md_haltsnarf_enter(set_t setno)
{
    mutex_enter(&md_mx);
    while (md_set[setno].s_status & MD_SET_SNARFING)
        cv_wait(&md_cv, &md_mx);

    md_set[setno].s_status |= MD_SET_SNARFING;
    mutex_exit(&md_mx);
}

void
md_haltsnarf_exit(set_t setno)
{
    mutex_enter(&md_mx);
    md_set[setno].s_status &= ~MD_SET_SNARFING;
    cv_broadcast(&md_cv);
    mutex_exit(&md_mx);
}

void
md_haltsnarf_wait(set_t setno)
{
    mutex_enter(&md_mx);
    while (md_set[setno].s_status & MD_SET_SNARFING)
        cv_wait(&md_cv, &md_mx);
    mutex_exit(&md_mx);
}
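/*
 * Usage sketch (illustrative only): code that must not race with a set
 * being held (for example while hotspares are being allocated) can use the
 * test-and-enter form and back off on failure:
 *
 *	if (md_holdset_testandenter(setno) != 0)
 *		return;		(set already held, try again later)
 *	... operate on the held set ...
 *	md_holdset_exit(setno);
 */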
/*
 * ASSUMED that the md_unit_array_rw WRITER lock is held.
 */
int
md_halt_set(set_t setno, enum md_haltcmd cmd)
{
    int	i, err;

    if (md_set[setno].s_un == NULL || md_set[setno].s_ui == NULL) {
        return (0);
    }

    if ((cmd == MD_HALT_CHECK) || (cmd == MD_HALT_ALL)) {
        for (i = 0; i < MD_NOPS; i++) {
            if (md_ops[i] == NULL)
                continue;
            if ((*(md_ops[i]->md_halt))(MD_HALT_CLOSE, setno)) {
                for (--i; i > 0; --i) {
                    if (md_ops[i] == NULL)
                        continue;
                    (void) (*(md_ops[i]->md_halt))
                        (MD_HALT_OPEN, setno);
                }
                return (EBUSY);
            }
        }

        for (i = 0; i < MD_NOPS; i++) {
            if (md_ops[i] == NULL)
                continue;
            if ((*(md_ops[i]->md_halt))(MD_HALT_CHECK, setno)) {
                for (i = 0; i < MD_NOPS; i++) {
                    if (md_ops[i] == NULL)
                        continue;
                    (void) (*(md_ops[i]->md_halt))
                        (MD_HALT_OPEN, setno);
                }
                return (EBUSY);
            }
        }
    }

    if ((cmd == MD_HALT_DOIT) || (cmd == MD_HALT_ALL)) {
        for (i = 0; i < MD_NOPS; i++) {
            if (md_ops[i] == NULL)
                continue;
            err = (*(md_ops[i]->md_halt))(MD_HALT_DOIT, setno);
            if (err != 0)
                cmn_err(CE_NOTE,
                    "md: halt failed for %s, error %d",
                    md_ops[i]->md_driver.md_drivername, err);
        }

        /*
         * Unload the devid namespace if it is loaded
         */
        md_unload_namespace(setno, NM_DEVID);
        md_unload_namespace(setno, 0L);
        md_clr_setstatus(setno, MD_SET_SNARFED);
    }

    return (0);
}

int
md_halt(int global_locks_owned_mask)
{
    set_t		i, j;
    int		err;
    int		init_queues;
    md_requestq_entry_t	*rqp;
    md_ops_t	**pops, *ops, *lops;
    ddi_modhandle_t	mod;
    char		*name;

    rw_enter(&md_unit_array_rw.lock, RW_WRITER);

    /*
     * Grab all of the global locks that are not
     * already owned to ensure that there isn't another
     * thread trying to access a global resource
     * while the halt is in progress
     */
    if (md_global_lock_enter(global_locks_owned_mask) == EINTR)
        return (EINTR);

    for (i = 0; i < md_nsets; i++)
        md_haltsnarf_enter(i);

    /*
     * Kill the daemon threads.
     */
    init_queues = ((md_get_status() & MD_GBL_DAEMONS_LIVE) ? FALSE : TRUE);
    md_clr_status(MD_GBL_DAEMONS_LIVE);
    md_set_status(MD_GBL_DAEMONS_DIE);

    rqp = &md_daemon_queues[0];
    i = 0;
    while (!NULL_REQUESTQ_ENTRY(rqp)) {
        cv_broadcast(&rqp->dispq_headp->a_cv);
        rqp = &md_daemon_queues[++i];
    }

    mutex_enter(&md_mx);
    while (md_num_daemons != 0) {
        mutex_exit(&md_mx);
        delay(md_hz);
        mutex_enter(&md_mx);
    }
    mutex_exit(&md_mx);
    md_clr_status(MD_GBL_DAEMONS_DIE);

    for (i = 0; i < md_nsets; i++)
        /*
         * Only call into md_halt_set if s_un / s_ui are both set.
         * If they are NULL this set hasn't been accessed, so it's
         * pointless performing the call.
         */
        if (md_set[i].s_un != NULL && md_set[i].s_ui != NULL) {
            if (md_halt_set(i, MD_HALT_CHECK)) {
                if (md_start_daemons(init_queues))
                    cmn_err(CE_WARN,
                        "md: restart of daemon threads "
                        "failed");
                for (j = 0; j < md_nsets; j++)
                    md_haltsnarf_exit(j);

                return (md_global_lock_exit(
                    global_locks_owned_mask, EBUSY,
                    MD_ARRAY_WRITER, NULL));
            }
        }

    /*
     * if we get here we are going to do it
     */
    for (i = 0; i < md_nsets; i++) {
        /*
         * Only call into md_halt_set if s_un / s_ui are both set.
         * If they are NULL this set hasn't been accessed, so it's
         * pointless performing the call.
         */
        if (md_set[i].s_un != NULL && md_set[i].s_ui != NULL) {
            err = md_halt_set(i, MD_HALT_DOIT);
            if (err != 0)
                cmn_err(CE_NOTE,
                    "md: halt failed set %u, error %d",
                    (unsigned)i, err);
        }
    }

    /*
     * issue a halt unload to each module to indicate that it
     * is about to be unloaded.  Each module is called once, set
     * has no meaning at this point in time.
     */
    for (i = 0; i < MD_NOPS; i++) {
        if (md_ops[i] == NULL)
            continue;
        err = (*(md_ops[i]->md_halt))(MD_HALT_UNLOAD, 0);
        if (err != 0)
            cmn_err(CE_NOTE,
                "md: halt failed for %s, error %d",
                md_ops[i]->md_driver.md_drivername, err);
    }

    /* ddi_modclose the submodules */
    for (i = 0; i < MD_NOPS; i++) {
        /* skip if not open */
        if ((md_ops[i] == NULL) || (md_mods[i] == NULL))
            continue;

        /* find and unlink from md_opslist */
        ops = md_ops[i];
        mod = md_mods[i];
        pops = &md_opslist;
        for (lops = *pops; lops;
            pops = &lops->md_next, lops = *pops) {
            if (lops == ops) {
                *pops = ops->md_next;
                ops->md_next = NULL;
                break;
            }
        }

        /* uninitialize */
        name = ops->md_driver.md_drivername;
        md_ops[i] = NULL;
        md_mods[i] = NULL;
        ops->md_selfindex = 0;
        ops->md_driver.md_drivername[0] = '\0';
        rw_destroy(&ops->md_link_rw.lock);

        /* close */
        err = ddi_modclose(mod);
        if (err != 0)
            cmn_err(CE_NOTE,
                "md: halt close failed for %s, error %d",
                name ? name : "UNKNOWN", err);
    }

    /* Unload the database */
    mddb_unload();

    md_set_status(MD_GBL_HALTED);	/* we are ready to be unloaded */

    for (i = 0; i < md_nsets; i++)
        md_haltsnarf_exit(i);

    return (md_global_lock_exit(global_locks_owned_mask, 0,
        MD_ARRAY_WRITER, NULL));
}

/*
 * md_layered_open() is an internal routine only for SVM modules.
 * So the input device will be a md_dev64_t, because all SVM modules internally
 * work with that device type.
 * ddi routines on the other hand work with dev_t. So, if we call any ddi
 * routines from here we first have to convert that device into a dev_t.
 */

int
md_layered_open(
    minor_t		mnum,
    md_dev64_t	*dev,
    int		md_oflags
)
{
    int		flag = (FREAD | FWRITE);
    cred_t		*cred_p = kcred;
    major_t		major;
    int		err;
    dev_t		ddi_dev = md_dev64_to_dev(*dev);

    if (ddi_dev == NODEV)
        return (ENODEV);

    major = getmajor(ddi_dev);

    /* metadevice */
    if (major == md_major) {
        mdi_unit_t	*ui;

        /* open underlying driver */
        mnum = getminor(ddi_dev);

        ui = MDI_UNIT(mnum);
        if (md_ops[ui->ui_opsindex]->md_open != NULL) {
            int ret = (*md_ops[ui->ui_opsindex]->md_open)(&ddi_dev,
                flag, OTYP_LYR, cred_p, md_oflags);
            /*
             * As open() may change the device,
             * send this info back to the caller.
             */
            *dev = md_expldev(ddi_dev);
            return (ret);
        }

        /* or do it ourselves */
        (void) md_unit_openclose_enter(ui);
        err = md_unit_incopen(mnum, flag, OTYP_LYR);
        md_unit_openclose_exit(ui);
        /* convert our ddi_dev back to the dev we were given */
        *dev = md_expldev(ddi_dev);
        return (err);
    }

    /*
     * Open regular device, since open() may change dev_t give new dev_t
     * back to the caller.
     */
    err = dev_lopen(&ddi_dev, flag, OTYP_LYR, cred_p);
    *dev = md_expldev(ddi_dev);
    return (err);
}

/*
 * md_layered_close() is an internal routine only for SVM modules.
 * So the input device will be a md_dev64_t, because all SVM modules internally
 * work with that device type.
 * ddi routines on the other hand work with dev_t. So, if we call any ddi
 * routines from here we first have to convert that device into a dev_t.
 */
void
md_layered_close(
    md_dev64_t	dev,
    int		md_cflags
)
{
    int		flag = (FREAD | FWRITE);
    cred_t		*cred_p = kcred;
    dev_t		ddi_dev = md_dev64_to_dev(dev);
    major_t		major = getmajor(ddi_dev);
    minor_t		mnum = getminor(ddi_dev);

    /* metadevice */
    if (major == md_major) {
        mdi_unit_t	*ui = MDI_UNIT(mnum);

        /* close underlying driver */
        if (md_ops[ui->ui_opsindex]->md_close != NULL) {
            (*md_ops[ui->ui_opsindex]->md_close)
                (ddi_dev, flag, OTYP_LYR, cred_p, md_cflags);
            return;
        }

        /* or do it ourselves */
        (void) md_unit_openclose_enter(ui);
        (void) md_unit_decopen(mnum, OTYP_LYR);
        md_unit_openclose_exit(ui);
        return;
    }

    /* close regular device */
    (void) dev_lclose(ddi_dev, flag, OTYP_LYR, cred_p);
}
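/*
 * Usage sketch (illustrative only): a metadevice module that stacks on top
 * of another device (metadevice or physical slice) opens and closes it as
 * a layered consumer, e.g. with a hypothetical component device "dev":
 *
 *	if (md_layered_open(mnum, &dev, MD_OFLG_NULL) != 0)
 *		... component could not be opened ...
 *	...
 *	md_layered_close(dev, MD_OFLG_NULL);
 *
 * md_layered_open() may update "dev" if the underlying open changes the
 * device; the possibly-updated value is what must later be passed to
 * md_layered_close().
 */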
/*
 * saves a little code in mdstrategy
 */
int
errdone(mdi_unit_t *ui, struct buf *bp, int err)
{
    if ((bp->b_error = err) != 0)
        bp->b_flags |= B_ERROR;
    else
        bp->b_resid = bp->b_bcount;
    md_unit_readerexit(ui);
    md_biodone(bp);
    return (1);
}

static int md_write_label = 0;

int
md_checkbuf(mdi_unit_t *ui, md_unit_t *un, buf_t *bp)
{
    diskaddr_t endblk;
    set_t	setno = MD_UN2SET(un);

    if ((md_get_setstatus(setno) & MD_SET_STALE) &&
        (! (bp->b_flags & B_READ)))
        return (errdone(ui, bp, EROFS));
    /*
     * Check early for unreasonable block number.
     *
     * b_blkno is defined as a daddr_t which is typedef'd to a long.
     * A problem occurs if b_blkno has bit 31 set and un_total_blocks
     * doesn't, b_blkno is then compared as a negative number which is
     * always less than a positive.
     */
    if ((u_longlong_t)bp->b_lblkno > (u_longlong_t)un->c.un_total_blocks)
        return (errdone(ui, bp, EINVAL));

    if (bp->b_lblkno == un->c.un_total_blocks)
        return (errdone(ui, bp, 0));

    /*
     * make sure we don't clobber any labels
     */
    if ((bp->b_lblkno == 0) && (! (bp->b_flags & B_READ)) &&
        (un->c.un_flag & MD_LABELED) && (! md_write_label)) {
        cmn_err(CE_NOTE, "md: %s: write to label",
            md_shortname(getminor(bp->b_edev)));
        return (errdone(ui, bp, EINVAL));
    }

    bp->b_resid = 0;
    endblk = (diskaddr_t)(bp->b_lblkno +
        howmany(bp->b_bcount, DEV_BSIZE) - 1);

    if (endblk > (un->c.un_total_blocks - 1)) {
        bp->b_resid = dbtob(endblk - (un->c.un_total_blocks - 1));
        endblk = un->c.un_total_blocks - 1;
        bp->b_bcount -= bp->b_resid;
    }
    return (0);
}

/*
 * init_requestq: initializes the request queues and creates the threads.
 *	return value = 0  : invalid num_threads
 *		     = n  : n is the number of threads created.
 */

int
init_requestq(
    md_requestq_entry_t *rq,	/* request queue info */
    void (*threadfn)(),		/* function to start the thread */
    caddr_t threadfn_args,	/* args to the function */
    int pri,			/* thread priority */
    int init_queue)		/* flag to init queues */
{
    struct mdq_anchor *rqhead;
    int	i;
    int	num_threads;


    num_threads = *(rq->num_threadsp);
    rqhead = rq->dispq_headp;

    if (NULL_REQUESTQ_ENTRY(rq) || num_threads == 0)
        return (0);

    if (init_queue) {
        rqhead->dq.maxq_len = 0;
        rqhead->dq.treqs = 0;
        rqhead->dq.dq_next = &rqhead->dq;
        rqhead->dq.dq_prev = &rqhead->dq;
        cv_init(&rqhead->a_cv, NULL, CV_DEFAULT, NULL);
        mutex_init(&rqhead->a_mx, NULL, MUTEX_DEFAULT, NULL);
    }
    for (i = 0; i < num_threads; i++) {
        (void) thread_create(NULL, 0, threadfn, threadfn_args, 0, &p0,
            TS_RUN, pri);
    }
    return (i);
}

static void
start_daemon(struct mdq_anchor *q)
{
    md_daemon(0, q);
    ASSERT(0);
}

/*
 * Creates all the md daemons.
 * Global:
 *	md_num_daemons is set to number of daemons.
 *	MD_GBL_DAEMONS_LIVE flag set to indicate the daemons are active.
 *
 * Return value: 0  success
 *		 1  failure
 */
int
md_start_daemons(int init_queue)
{
    md_requestq_entry_t	*rqp;
    int	cnt;
    int	i;
    int	retval = 0;


    if (md_get_status() & MD_GBL_DAEMONS_LIVE) {
        return (retval);
    }
    md_clr_status(MD_GBL_DAEMONS_DIE);

    rqp = &md_daemon_queues[0];
    i = 0;
    while (!NULL_REQUESTQ_ENTRY(rqp)) {
        cnt = init_requestq(rqp, start_daemon,
            (caddr_t)rqp->dispq_headp, minclsyspri, init_queue);

        if (cnt && cnt != *rqp->num_threadsp) {
            retval = 1;
            break;
        }
        /*
         * initialize variables
         */
        md_num_daemons += cnt;
        rqp = &md_daemon_queues[++i];
    }

    md_set_status(MD_GBL_DAEMONS_LIVE);
    return (retval);
}

int
md_loadsubmod(set_t setno, char *name, int drvrid)
{
    ddi_modhandle_t	mod;
    md_ops_t	**pops, *ops;
    int		i, err;

    /*
     * See if the submodule is mdopened. If not, i is the index of the
     * next empty slot.
     */
    for (i = 0; md_ops[i] != NULL; i++) {
        if (strncmp(name, md_ops[i]->md_driver.md_drivername,
            MD_DRIVERNAMELEN) == 0)
            return (i);

        if (i == (MD_NOPS - 1))
            return (-1);
    }

    if (drvrid < 0) {
*/ 2170 if (md_get_setstatus(setno) & MD_SET_STALE) 2171 return (-1); 2172 drvrid = md_setshared_name(setno, name, 0L); 2173 } 2174 2175 if (drvrid < 0) 2176 return (-1); 2177 2178 /* open and import the md_ops of the submodules */ 2179 mod = ddi_modopen(name, KRTLD_MODE_FIRST, &err); 2180 if (mod == NULL) { 2181 cmn_err(CE_WARN, "md_loadsubmod: " 2182 "unable to ddi_modopen %s, error %d\n", name, err); 2183 return (-1); 2184 } 2185 pops = ddi_modsym(mod, "md_interface_ops", &err); 2186 if (pops == NULL) { 2187 cmn_err(CE_WARN, "md_loadsubmod: " 2188 "unable to import md_interface_ops from %s, error %d\n", 2189 name, err); 2190 (void) ddi_modclose(mod); 2191 return (-1); 2192 } 2193 2194 /* ddi_modsym returns pointer to md_interface_ops in submod */ 2195 ops = *pops; 2196 2197 /* initialize */ 2198 ops->md_selfindex = i; 2199 rw_init(&ops->md_link_rw.lock, NULL, RW_DEFAULT, NULL); 2200 (void) strncpy(ops->md_driver.md_drivername, name, 2201 MD_DRIVERNAMELEN); 2202 2203 /* plumb */ 2204 md_ops[i] = ops; 2205 md_mods[i] = mod; 2206 ops->md_next = md_opslist; 2207 md_opslist = ops; 2208 2209 /* return index */ 2210 return (i); 2211 } 2212 2213 int 2214 md_getmodindex(md_driver_t *driver, int dont_load, int db_notrequired) 2215 { 2216 int i; 2217 int modindex; 2218 char *name = driver->md_drivername; 2219 set_t setno = driver->md_setno; 2220 int drvid; 2221 int local_dont_load; 2222 2223 if (setno >= md_nsets) 2224 return (-1); 2225 2226 for (i = 0; name[i] != 0; i++) 2227 if (i == (MD_DRIVERNAMELEN -1)) 2228 return (-1); 2229 2230 /* 2231 * If set is STALE, set local_dont_load to 1 since no records 2232 * should be added to DB when stale. 2233 */ 2234 if (md_get_setstatus(setno) & MD_SET_STALE) { 2235 local_dont_load = 1; 2236 } else { 2237 local_dont_load = dont_load; 2238 } 2239 2240 /* 2241 * Single thread ioctl module binding with respect to 2242 * similar code executed in md_loadsubmod that is called 2243 * from md_snarf_db_set (which is where that path does 2244 * its md_haltsnarf_enter call). 2245 */ 2246 md_haltsnarf_enter(setno); 2247 2248 /* See if the submodule is already ddi_modopened. */ 2249 for (i = 0; md_ops[i] != NULL; i++) { 2250 if (strncmp(name, md_ops[i]->md_driver.md_drivername, 2251 MD_DRIVERNAMELEN) == 0) { 2252 if (! local_dont_load && 2253 (md_getshared_key(setno, name) == MD_KEYBAD)) { 2254 if (md_setshared_name(setno, name, 0L) 2255 == MD_KEYBAD) { 2256 if (!db_notrequired) 2257 goto err; 2258 } 2259 } 2260 md_haltsnarf_exit(setno); 2261 return (i); 2262 } 2263 2264 if (i == (MD_NOPS -1)) 2265 break; 2266 } 2267 2268 if (local_dont_load) 2269 goto err; 2270 2271 drvid = ((db_notrequired) ? 
0 : (int)md_getshared_key(setno, name)); 2272 2273 /* ddi_modopen the submodule */ 2274 modindex = md_loadsubmod(setno, name, drvid); 2275 if (modindex < 0) 2276 goto err; 2277 2278 if (md_ops[modindex]->md_snarf != NULL) 2279 (*(md_ops[modindex]->md_snarf))(MD_SNARF_DOIT, setno); 2280 2281 md_haltsnarf_exit(setno); 2282 return (modindex); 2283 2284 err: md_haltsnarf_exit(setno); 2285 return (-1); 2286 } 2287 2288 void 2289 md_call_strategy(buf_t *bp, int flags, void *private) 2290 { 2291 mdi_unit_t *ui; 2292 2293 if (mdv_strategy_tstpnt) 2294 if ((*mdv_strategy_tstpnt)(bp, flags, private) != 0) 2295 return; 2296 if (getmajor(bp->b_edev) != md_major) { 2297 (void) bdev_strategy(bp); 2298 return; 2299 } 2300 2301 flags = (flags & MD_STR_PASSEDON) | MD_STR_NOTTOP; 2302 ui = MDI_UNIT(getminor(bp->b_edev)); 2303 ASSERT(ui != NULL); 2304 (*md_ops[ui->ui_opsindex]->md_strategy)(bp, flags, private); 2305 } 2306 2307 /* 2308 * md_call_ioctl: 2309 * ------------- 2310 * Issue the specified ioctl to the device associated with the given md_dev64_t 2311 * 2312 * Arguments: 2313 * dev - underlying device [md_dev64_t] 2314 * cmd - ioctl to perform 2315 * data - arguments / result location 2316 * mode - read/write/layered ioctl 2317 * lockp - lock reference 2318 * 2319 * Returns: 2320 * 0 success 2321 * !=0 Failure (error code) 2322 */ 2323 int 2324 md_call_ioctl(md_dev64_t dev, int cmd, void *data, int mode, IOLOCK *lockp) 2325 { 2326 dev_t device = md_dev64_to_dev(dev); 2327 int rval; 2328 mdi_unit_t *ui; 2329 2330 /* 2331 * See if device is a metadevice. If not call cdev_ioctl(), otherwise 2332 * call the ioctl entry-point in the metadevice. 2333 */ 2334 if (md_getmajor(dev) != md_major) { 2335 int rv; 2336 rval = cdev_ioctl(device, cmd, (intptr_t)data, mode, 2337 ddi_get_cred(), &rv); 2338 } else { 2339 ui = MDI_UNIT(md_getminor(dev)); 2340 ASSERT(ui != NULL); 2341 rval = (*md_ops[ui->ui_opsindex]->md_ioctl)(device, cmd, data, 2342 mode, lockp); 2343 } 2344 return (rval); 2345 } 2346 2347 void 2348 md_rem_link(set_t setno, int id, krwlock_t *rw, md_link_t **head) 2349 { 2350 md_link_t *next; 2351 md_link_t **pprev; 2352 2353 rw_enter(rw, RW_WRITER); 2354 2355 next = *head; 2356 pprev = head; 2357 while (next) { 2358 if ((next->ln_setno == setno) && (next->ln_id == id)) { 2359 *pprev = next->ln_next; 2360 rw_exit(rw); 2361 return; 2362 } 2363 pprev = &next->ln_next; 2364 next = next->ln_next; 2365 } 2366 2367 rw_exit(rw); 2368 } 2369 2370 int 2371 md_dev_exists(md_dev64_t dev) 2372 { 2373 2374 if (dev == NODEV64) 2375 return (0); 2376 2377 if (strcmp(ddi_major_to_name(md_getmajor(dev)), "md") != 0) 2378 return (1); 2379 2380 if ((MD_MIN2SET(md_getminor(dev)) >= md_nsets) || 2381 (MD_MIN2UNIT(md_getminor(dev)) >= md_nunits)) 2382 return (0); 2383 2384 if (MDI_UNIT(md_getminor(dev)) != NULL) 2385 return (1); 2386 2387 return (0); 2388 } 2389 2390 md_parent_t 2391 md_get_parent(md_dev64_t dev) 2392 { 2393 md_unit_t *un; 2394 mdi_unit_t *ui; 2395 md_parent_t parent; 2396 2397 if (md_getmajor(dev) != md_major) 2398 return (MD_NO_PARENT); 2399 2400 ui = MDI_UNIT(md_getminor(dev)); 2401 2402 un = (md_unit_t *)md_unit_readerlock(ui); 2403 parent = un->c.un_parent; 2404 md_unit_readerexit(ui); 2405 2406 return (parent); 2407 } 2408 2409 void 2410 md_set_parent(md_dev64_t dev, md_parent_t parent) 2411 { 2412 md_unit_t *un; 2413 mdi_unit_t *ui; 2414 2415 if (md_getmajor(dev) != md_major) 2416 return; 2417 2418 ui = MDI_UNIT(md_getminor(dev)); 2419 2420 un = (md_unit_t *)md_unit_readerlock(ui); 2421 
un->c.un_parent = parent; 2422 md_unit_readerexit(ui); 2423 } 2424 2425 void 2426 md_reset_parent(md_dev64_t dev) 2427 { 2428 md_unit_t *un; 2429 mdi_unit_t *ui; 2430 2431 if (md_getmajor(dev) != md_major) 2432 return; 2433 2434 ui = MDI_UNIT(md_getminor(dev)); 2435 2436 un = (md_unit_t *)md_unit_readerlock(ui); 2437 un->c.un_parent = MD_NO_PARENT; 2438 md_unit_readerexit(ui); 2439 } 2440 2441 2442 static intptr_t (*hot_spare_interface)() = (intptr_t (*)())NULL; 2443 2444 int 2445 md_hot_spare_ifc( 2446 hs_cmds_t cmd, 2447 mddb_recid_t id, 2448 u_longlong_t size, 2449 int labeled, 2450 mddb_recid_t *hs_id, 2451 mdkey_t *key, 2452 md_dev64_t *dev, 2453 diskaddr_t *sblock) 2454 { 2455 int err; 2456 2457 /* 2458 * RW lock on hot_spare_interface. We don't want it to change from 2459 * underneath us. If hot_spare_interface is NULL we're going to 2460 * need to set it. So we need to upgrade to a WRITER lock. If that 2461 * doesn't work, we drop the lock and reenter as WRITER. This leaves 2462 * a small hole during which hot_spare_interface could be modified 2463 * so we check it for NULL again. What a pain. Then if still null 2464 * load from md_get_named_service. 2465 */ 2466 2467 rw_enter(&hsp_rwlp.lock, RW_READER); 2468 if (hot_spare_interface == NULL) { 2469 if (rw_tryupgrade(&hsp_rwlp.lock) == 0) { 2470 rw_exit(&hsp_rwlp.lock); 2471 rw_enter(&hsp_rwlp.lock, RW_WRITER); 2472 if (hot_spare_interface != NULL) { 2473 err = ((*hot_spare_interface) 2474 (cmd, id, size, labeled, hs_id, key, dev, 2475 sblock)); 2476 rw_exit(&hsp_rwlp.lock); 2477 return (err); 2478 } 2479 } 2480 hot_spare_interface = md_get_named_service(NODEV64, ANY_SERVICE, 2481 "hot spare interface", 0); 2482 rw_downgrade(&hsp_rwlp.lock); 2483 } 2484 2485 if (hot_spare_interface == NULL) { 2486 cmn_err(CE_WARN, "md: no hotspare interface"); 2487 rw_exit(&hsp_rwlp.lock); 2488 return (0); 2489 } 2490 2491 err = ((*hot_spare_interface) 2492 (cmd, id, size, labeled, hs_id, key, dev, sblock)); 2493 rw_exit(&hsp_rwlp.lock); 2494 return (err); 2495 } 2496 2497 void 2498 md_clear_hot_spare_interface() 2499 { 2500 rw_enter(&hsp_rwlp.lock, RW_WRITER); 2501 hot_spare_interface = NULL; 2502 rw_exit(&hsp_rwlp.lock); 2503 } 2504 2505 2506 static intptr_t (*notify_interface)() = (intptr_t (*)())NULL; 2507 2508 int 2509 md_notify_interface( 2510 md_event_cmds_t cmd, 2511 md_tags_t tag, 2512 set_t set, 2513 md_dev64_t dev, 2514 md_event_type_t event 2515 ) 2516 { 2517 int err; 2518 2519 if (md_event_queue == NULL) 2520 return (0); 2521 rw_enter(&ni_rwlp.lock, RW_READER); 2522 if (notify_interface == NULL) { 2523 if (rw_tryupgrade(&ni_rwlp.lock) == 0) { 2524 rw_exit(&ni_rwlp.lock); 2525 rw_enter(&ni_rwlp.lock, RW_WRITER); 2526 if (notify_interface != NULL) { 2527 err = ((*notify_interface) 2528 (cmd, tag, set, dev, event)); 2529 rw_exit(&ni_rwlp.lock); 2530 return (err); 2531 } 2532 } 2533 notify_interface = md_get_named_service(NODEV64, ANY_SERVICE, 2534 "notify interface", 0); 2535 rw_downgrade(&ni_rwlp.lock); 2536 } 2537 if (notify_interface == NULL) { 2538 cmn_err(CE_WARN, "md: no notify interface"); 2539 rw_exit(&ni_rwlp.lock); 2540 return (0); 2541 } 2542 err = ((*notify_interface)(cmd, tag, set, dev, event)); 2543 rw_exit(&ni_rwlp.lock); 2544 return (err); 2545 } 2546 2547 char * 2548 obj2devname(uint32_t tag, uint_t setno, md_dev64_t dev) 2549 { 2550 char *setname; 2551 char name[MD_MAX_CTDLEN]; 2552 minor_t mnum = md_getminor(dev); 2553 major_t maj = md_getmajor(dev); 2554 int rtn = 0; 2555 2556 /* 2557 * Verify that the passed dev_t 
refers to a valid metadevice. 2558 * If it doesn't we can make no assumptions as to what the device 2559 * name is. Return NULL in these cases. 2560 */ 2561 if (((maj != md_major) || (MD_MIN2UNIT(mnum) >= md_nunits)) || 2562 (MD_MIN2SET(mnum) >= md_nsets)) { 2563 return (NULL); 2564 } 2565 2566 setname = NULL; 2567 name[0] = '\0'; 2568 switch (tag) { 2569 case SVM_TAG_HSP: 2570 if (setno == 0) { 2571 rtn = snprintf(name, sizeof (name), "hsp%u", 2572 (unsigned)MD_MIN2UNIT(mnum)); 2573 } else { 2574 setname = mddb_getsetname(setno); 2575 if (setname != NULL) { 2576 rtn = snprintf(name, sizeof (name), "%s/hsp%u", 2577 setname, (unsigned)MD_MIN2UNIT(mnum)); 2578 } 2579 } 2580 break; 2581 case SVM_TAG_DRIVE: 2582 (void) sprintf(name, "drive"); 2583 break; 2584 case SVM_TAG_HOST: 2585 (void) sprintf(name, "host"); 2586 break; 2587 case SVM_TAG_SET: 2588 rtn = snprintf(name, sizeof (name), "%s", 2589 mddb_getsetname(setno)); 2590 if ((name[0] == '\0') || (rtn >= sizeof (name))) { 2591 (void) sprintf(name, "diskset"); 2592 rtn = 0; 2593 } 2594 break; 2595 default: 2596 rtn = snprintf(name, sizeof (name), "%s", md_shortname(mnum)); 2597 break; 2598 } 2599 2600 /* Check if we got any rubbish for any of the snprintf's */ 2601 if ((name[0] == '\0') || (rtn >= sizeof (name))) { 2602 return (NULL); 2603 } 2604 2605 return (md_strdup(name)); 2606 } 2607 2608 /* Sysevent subclass and mdnotify event type pairs */ 2609 struct node { 2610 char *se_ev; 2611 md_event_type_t md_ev; 2612 }; 2613 2614 /* 2615 * Table must be sorted in case sensitive ascending order of 2616 * the sysevents values 2617 */ 2618 static struct node ev_table[] = { 2619 { ESC_SVM_ADD, EQ_ADD }, 2620 { ESC_SVM_ATTACH, EQ_ATTACH }, 2621 { ESC_SVM_ATTACHING, EQ_ATTACHING }, 2622 { ESC_SVM_CHANGE, EQ_CHANGE }, 2623 { ESC_SVM_CREATE, EQ_CREATE }, 2624 { ESC_SVM_DELETE, EQ_DELETE }, 2625 { ESC_SVM_DETACH, EQ_DETACH }, 2626 { ESC_SVM_DETACHING, EQ_DETACHING }, 2627 { ESC_SVM_DRIVE_ADD, EQ_DRIVE_ADD }, 2628 { ESC_SVM_DRIVE_DELETE, EQ_DRIVE_DELETE }, 2629 { ESC_SVM_ENABLE, EQ_ENABLE }, 2630 { ESC_SVM_ERRED, EQ_ERRED }, 2631 { ESC_SVM_EXCHANGE, EQ_EXCHANGE }, 2632 { ESC_SVM_GROW, EQ_GROW }, 2633 { ESC_SVM_HS_CHANGED, EQ_HS_CHANGED }, 2634 { ESC_SVM_HS_FREED, EQ_HS_FREED }, 2635 { ESC_SVM_HOST_ADD, EQ_HOST_ADD }, 2636 { ESC_SVM_HOST_DELETE, EQ_HOST_DELETE }, 2637 { ESC_SVM_HOTSPARED, EQ_HOTSPARED }, 2638 { ESC_SVM_INIT_FAILED, EQ_INIT_FAILED }, 2639 { ESC_SVM_INIT_FATAL, EQ_INIT_FATAL }, 2640 { ESC_SVM_INIT_START, EQ_INIT_START }, 2641 { ESC_SVM_INIT_SUCCESS, EQ_INIT_SUCCESS }, 2642 { ESC_SVM_IOERR, EQ_IOERR }, 2643 { ESC_SVM_LASTERRED, EQ_LASTERRED }, 2644 { ESC_SVM_MEDIATOR_ADD, EQ_MEDIATOR_ADD }, 2645 { ESC_SVM_MEDIATOR_DELETE, EQ_MEDIATOR_DELETE }, 2646 { ESC_SVM_OFFLINE, EQ_OFFLINE }, 2647 { ESC_SVM_OK, EQ_OK }, 2648 { ESC_SVM_ONLINE, EQ_ONLINE }, 2649 { ESC_SVM_OPEN_FAIL, EQ_OPEN_FAIL }, 2650 { ESC_SVM_REGEN_DONE, EQ_REGEN_DONE }, 2651 { ESC_SVM_REGEN_FAILED, EQ_REGEN_FAILED }, 2652 { ESC_SVM_REGEN_START, EQ_REGEN_START }, 2653 { ESC_SVM_RELEASE, EQ_RELEASE }, 2654 { ESC_SVM_REMOVE, EQ_REMOVE }, 2655 { ESC_SVM_RENAME_DST, EQ_RENAME_DST }, 2656 { ESC_SVM_RENAME_SRC, EQ_RENAME_SRC }, 2657 { ESC_SVM_REPLACE, EQ_REPLACE }, 2658 { ESC_SVM_RESYNC_DONE, EQ_RESYNC_DONE }, 2659 { ESC_SVM_RESYNC_FAILED, EQ_RESYNC_FAILED }, 2660 { ESC_SVM_RESYNC_START, EQ_RESYNC_START }, 2661 { ESC_SVM_RESYNC_SUCCESS, EQ_RESYNC_SUCCESS }, 2662 { ESC_SVM_TAKEOVER, EQ_TAKEOVER } 2663 }; 2664 2665 static md_tags_t md_tags[] = { 2666 TAG_UNK, 2667 TAG_METADEVICE, 2668 
TAG_UNK, 2669 TAG_UNK, 2670 TAG_UNK, 2671 TAG_UNK, 2672 TAG_REPLICA, 2673 TAG_HSP, 2674 TAG_HS, 2675 TAG_SET, 2676 TAG_DRIVE, 2677 TAG_HOST, 2678 TAG_MEDIATOR 2679 }; 2680 2681 md_event_type_t 2682 ev_get(char *subclass) 2683 { 2684 int high, mid, low, p; 2685 2686 low = 0; 2687 high = (sizeof (ev_table) / sizeof (ev_table[0])) - 1; 2688 while (low <= high) { 2689 mid = (high + low) / 2; 2690 p = strcmp(subclass, ev_table[mid].se_ev); 2691 if (p == 0) { 2692 return (ev_table[mid].md_ev); 2693 } else if (p < 0) { 2694 high = mid - 1; 2695 } else { 2696 low = mid + 1; 2697 } 2698 } 2699 2700 return (EQ_EMPTY); 2701 } 2702 2703 /* 2704 * Log mdnotify event 2705 */ 2706 void 2707 do_mdnotify(char *se_subclass, uint32_t tag, set_t setno, md_dev64_t devid) 2708 { 2709 md_event_type_t ev_type; 2710 md_tags_t md_tag; 2711 2712 /* Translate sysevent into mdnotify event */ 2713 ev_type = ev_get(se_subclass); 2714 2715 if (tag >= (sizeof (md_tags) / sizeof (md_tags[0]))) { 2716 md_tag = TAG_UNK; 2717 } else { 2718 md_tag = md_tags[tag]; 2719 } 2720 2721 NOTIFY_MD(md_tag, setno, devid, ev_type); 2722 } 2723 2724 /* 2725 * Log SVM sys events 2726 */ 2727 void 2728 svm_gen_sysevent( 2729 char *se_class, 2730 char *se_subclass, 2731 uint32_t tag, 2732 set_t setno, 2733 md_dev64_t devid 2734 ) 2735 { 2736 nvlist_t *attr_list; 2737 sysevent_id_t eid; 2738 int err = DDI_SUCCESS; 2739 char *devname; 2740 extern dev_info_t *md_devinfo; 2741 2742 /* Raise the mdnotify event before anything else */ 2743 do_mdnotify(se_subclass, tag, setno, devid); 2744 2745 if (md_devinfo == NULL) { 2746 return; 2747 } 2748 2749 err = nvlist_alloc(&attr_list, NV_UNIQUE_NAME, KM_NOSLEEP); 2750 2751 if (err == DDI_SUCCESS) { 2752 /* Add the version numver */ 2753 err = nvlist_add_uint32(attr_list, SVM_VERSION_NO, 2754 (uint32_t)SVM_VERSION); 2755 if (err != DDI_SUCCESS) { 2756 goto fail; 2757 } 2758 2759 /* Add the tag attribute */ 2760 err = nvlist_add_uint32(attr_list, SVM_TAG, (uint32_t)tag); 2761 if (err != DDI_SUCCESS) { 2762 goto fail; 2763 } 2764 2765 /* Add the set number attribute */ 2766 err = nvlist_add_uint32(attr_list, SVM_SET_NO, (uint32_t)setno); 2767 if (err != DDI_SUCCESS) { 2768 goto fail; 2769 } 2770 2771 /* Add the device id attribute */ 2772 err = nvlist_add_uint64(attr_list, SVM_DEV_ID, (uint64_t)devid); 2773 if (err != DDI_SUCCESS) { 2774 goto fail; 2775 } 2776 2777 /* Add the device name attribute */ 2778 devname = obj2devname(tag, setno, devid); 2779 if (devname != NULL) { 2780 err = nvlist_add_string(attr_list, SVM_DEV_NAME, 2781 devname); 2782 freestr(devname); 2783 } else { 2784 err = nvlist_add_string(attr_list, SVM_DEV_NAME, 2785 "unspecified"); 2786 } 2787 if (err != DDI_SUCCESS) { 2788 goto fail; 2789 } 2790 2791 /* Attempt to post event */ 2792 err = ddi_log_sysevent(md_devinfo, DDI_VENDOR_SUNW, se_class, 2793 se_subclass, attr_list, &eid, DDI_SLEEP); 2794 2795 nvlist_free(attr_list); 2796 if (err != DDI_SUCCESS) { 2797 cmn_err(CE_WARN, "Failed to log event for %s, %s," 2798 " err=%x", se_class, se_subclass, err); 2799 } 2800 } 2801 2802 return; 2803 2804 fail: 2805 nvlist_free(attr_list); 2806 cmn_err(CE_WARN, "Failed to setup attributes for event %s, %s, err=%x", 2807 se_class, se_subclass, err); 2808 } 2809 2810 void 2811 md_clear_named_service() 2812 { 2813 rw_enter(&ni_rwlp.lock, RW_WRITER); 2814 notify_interface = NULL; 2815 rw_exit(&ni_rwlp.lock); 2816 } 2817 2818 void 2819 md_create_unit_incore(minor_t mnum, md_ops_t *ops, int alloc_lock) 2820 { 2821 mdi_unit_t *ui; 2822 set_t setno = 
MD_MIN2SET(mnum); 2823 2824 ui = (mdi_unit_t *)kmem_zalloc(sizeof (mdi_unit_t), KM_SLEEP); 2825 ui->ui_opsindex = ops->md_selfindex; 2826 2827 /* initialize all the incore condition variables */ 2828 mutex_init(&ui->ui_mx, NULL, MUTEX_DEFAULT, NULL); 2829 cv_init(&ui->ui_cv, NULL, CV_DEFAULT, NULL); 2830 2831 if (alloc_lock) { 2832 ui->ui_io_lock = kmem_zalloc(sizeof (md_io_lock_t), KM_SLEEP); 2833 mutex_init(&ui->ui_io_lock->io_mx, NULL, MUTEX_DEFAULT, NULL); 2834 cv_init(&ui->ui_io_lock->io_cv, NULL, CV_DEFAULT, NULL); 2835 mutex_init(&ui->ui_io_lock->io_list_mutex, NULL, 2836 MUTEX_DEFAULT, NULL); 2837 ui->ui_io_lock->io_list_front = NULL; 2838 ui->ui_io_lock->io_list_back = NULL; 2839 } 2840 if (! (md_get_setstatus(setno) & MD_SET_SNARFING)) { 2841 rw_enter(&md_unit_array_rw.lock, RW_WRITER); 2842 MDI_VOIDUNIT(mnum) = (void *) ui; 2843 rw_exit(&md_unit_array_rw.lock); 2844 } else 2845 MDI_VOIDUNIT(mnum) = (void *) ui; 2846 2847 rw_enter(&ops->md_link_rw.lock, RW_WRITER); 2848 ui->ui_link.ln_next = ops->md_head; 2849 ui->ui_link.ln_setno = setno; 2850 ui->ui_link.ln_id = mnum; 2851 ops->md_head = &ui->ui_link; 2852 /* set up the unavailable field */ 2853 #if defined(_ILP32) 2854 if (((md_unit_t *)MD_UNIT(mnum))->c.un_revision & MD_64BIT_META_DEV) { 2855 ui->ui_tstate |= MD_64MD_ON_32KERNEL; 2856 cmn_err(CE_NOTE, "d%d is unavailable because 64 bit " 2857 "metadevices are not accessible on a 32 bit kernel", 2858 mnum); 2859 } 2860 #endif 2861 2862 rw_exit(&ops->md_link_rw.lock); 2863 } 2864 2865 void 2866 md_destroy_unit_incore(minor_t mnum, md_ops_t *ops) 2867 { 2868 mdi_unit_t *ui; 2869 2870 /* 2871 * ASSUMPTION: md_unit_array_rw WRITER lock is held. 2872 */ 2873 ui = MDI_UNIT(mnum); 2874 if (ui == NULL) 2875 return; 2876 2877 md_rem_link(MD_MIN2SET(mnum), mnum, &ops->md_link_rw.lock, 2878 &ops->md_head); 2879 2880 /* destroy the io lock if one is being used */ 2881 if (ui->ui_io_lock) { 2882 mutex_destroy(&ui->ui_io_lock->io_mx); 2883 cv_destroy(&ui->ui_io_lock->io_cv); 2884 kmem_free(ui->ui_io_lock, sizeof (md_io_lock_t)); 2885 } 2886 2887 /* teardown kstat */ 2888 md_kstat_destroy(mnum); 2889 2890 /* destroy all the incore condition variables */ 2891 mutex_destroy(&ui->ui_mx); 2892 cv_destroy(&ui->ui_cv); 2893 2894 kmem_free(ui, sizeof (mdi_unit_t)); 2895 MDI_VOIDUNIT(mnum) = (void *) NULL; 2896 } 2897 2898 void 2899 md_rem_names(sv_dev_t *sv, int nsv) 2900 { 2901 int i, s; 2902 int max_sides; 2903 2904 if (nsv == 0) 2905 return; 2906 2907 /* All entries removed are in the same diskset */ 2908 if (md_get_setstatus(sv[0].setno) & MD_SET_MNSET) 2909 max_sides = MD_MNMAXSIDES; 2910 else 2911 max_sides = MD_MAXSIDES; 2912 2913 for (i = 0; i < nsv; i++) 2914 for (s = 0; s < max_sides; s++) 2915 (void) md_remdevname(sv[i].setno, s, sv[i].key); 2916 } 2917 2918 /* 2919 * Check user args before we get into physio - returns 0 if ok, else an errno. 2920 * We do a lot of checking against illegal arguments here because some of the 2921 * real disk drivers don't like certain kinds of arguments. (e.g. xy doesn't 2922 * like an odd-address user buffer.) Those drivers capture bad arguments in 2923 * xxread and xxwrite. But since the meta-driver calls their strategy routines 2924 * directly, two bad scenarios might happen: 2925 * 1. the real strategy doesn't like it and panics. 2926 * 2. the real strategy doesn't like it and sets B_ERROR. 2927 * 2928 * The second case is no better than the first one, since the meta-driver 2929 * will treat it as a media error and offline the mirror metapartition.
2930 * (Too bad there is no way to tell what error it is.) 2931 * 2932 */ 2933 int 2934 md_chk_uio(struct uio *uio) 2935 { 2936 int i; 2937 struct iovec *iov; 2938 2939 /* 2940 * Check for negative or not block-aligned offset 2941 */ 2942 if ((uio->uio_loffset < 0) || 2943 ((uio->uio_loffset & (DEV_BSIZE - 1)) != 0)) { 2944 return (EINVAL); 2945 } 2946 iov = uio->uio_iov; 2947 i = uio->uio_iovcnt; 2948 2949 while (i--) { 2950 if ((iov->iov_len & (DEV_BSIZE - 1)) != 0) 2951 return (EINVAL); 2952 /* 2953 * Bug # 1212146 2954 * The default is to not check alignment, but we can now check 2955 * for a larger number of alignments if desired. 2956 */ 2957 if ((uintptr_t)(iov->iov_base) & md_uio_alignment_mask) 2958 return (EINVAL); 2959 iov++; 2960 } 2961 return (0); 2962 } 2963 2964 char * 2965 md_shortname( 2966 minor_t mnum 2967 ) 2968 { 2969 static char buf[MAXPATHLEN]; 2970 char *devname; 2971 char *invalid = " (Invalid minor number %u) "; 2972 char *metaname; 2973 mdc_unit_t *un; 2974 side_t side; 2975 set_t setno = MD_MIN2SET(mnum); 2976 unit_t unit = MD_MIN2UNIT(mnum); 2977 2978 if ((un = MD_UNIT(mnum)) == NULL) { 2979 (void) snprintf(buf, sizeof (buf), invalid, mnum); 2980 return (buf); 2981 } 2982 2983 /* 2984 * If unit is not a friendly name unit, derive the name from the 2985 * minor number. 2986 */ 2987 if ((un->un_revision & MD_FN_META_DEV) == 0) { 2988 /* This is a traditional metadevice */ 2989 if (setno == MD_LOCAL_SET) { 2990 (void) snprintf(buf, sizeof (buf), "d%u", 2991 (unsigned)unit); 2992 } else { 2993 (void) snprintf(buf, sizeof (buf), "%s/d%u", 2994 mddb_getsetname(setno), (unsigned)unit); 2995 } 2996 return (buf); 2997 } 2998 2999 /* 3000 * It is a friendly name metadevice, so we need to get its name. 3001 */ 3002 side = mddb_getsidenum(setno); 3003 devname = (char *)kmem_alloc(MAXPATHLEN, KM_SLEEP); 3004 if (md_getdevname(setno, side, MD_KEYWILD, 3005 md_makedevice(md_major, mnum), devname, MAXPATHLEN) == 0) { 3006 /* 3007 * md_getdevname has given us either /dev/md/dsk/<metaname> 3008 * or /dev/md/<setname>/dsk/<metname> depending on whether 3009 * or not we are in the local set. Thus, we'll pull the 3010 * metaname from this string. 3011 */ 3012 if ((metaname = strrchr(devname, '/')) == NULL) { 3013 (void) snprintf(buf, sizeof (buf), invalid, mnum); 3014 goto out; 3015 } 3016 metaname++; /* move past slash */ 3017 if (setno == MD_LOCAL_SET) { 3018 /* No set name. */ 3019 (void) snprintf(buf, sizeof (buf), "%s", metaname); 3020 } else { 3021 /* Include setname */ 3022 (void) snprintf(buf, sizeof (buf), "%s/%s", 3023 mddb_getsetname(setno), metaname); 3024 } 3025 } else { 3026 /* We couldn't find the name. 
*/ 3027 (void) snprintf(buf, sizeof (buf), invalid, mnum); 3028 } 3029 3030 out: 3031 kmem_free(devname, MAXPATHLEN); 3032 return (buf); 3033 } 3034 3035 char * 3036 md_devname( 3037 set_t setno, 3038 md_dev64_t dev, 3039 char *buf, 3040 size_t size 3041 ) 3042 { 3043 static char mybuf[MD_MAX_CTDLEN]; 3044 int err; 3045 3046 if (buf == NULL) { 3047 buf = mybuf; 3048 size = sizeof (mybuf); 3049 } else { 3050 ASSERT(size >= MD_MAX_CTDLEN); 3051 } 3052 3053 err = md_getdevname_common(setno, mddb_getsidenum(setno), 3054 0, dev, buf, size, MD_NOWAIT_LOCK); 3055 if (err) { 3056 if (err == ENOENT) { 3057 (void) sprintf(buf, "(Unavailable)"); 3058 } else { 3059 (void) sprintf(buf, "(%u.%u)", 3060 md_getmajor(dev), md_getminor(dev)); 3061 } 3062 } 3063 3064 return (buf); 3065 } 3066 void 3067 md_minphys(buf_t *pb) 3068 { 3069 extern unsigned md_maxbcount; 3070 3071 if (pb->b_bcount > md_maxbcount) 3072 pb->b_bcount = md_maxbcount; 3073 } 3074 3075 void 3076 md_bioinit(struct buf *bp) 3077 { 3078 ASSERT(bp); 3079 3080 bioinit(bp); 3081 bp->b_back = bp; 3082 bp->b_forw = bp; 3083 bp->b_flags = B_BUSY; /* initialize flags */ 3084 } 3085 3086 void 3087 md_bioreset(struct buf *bp) 3088 { 3089 ASSERT(bp); 3090 3091 bioreset(bp); 3092 bp->b_back = bp; 3093 bp->b_forw = bp; 3094 bp->b_flags = B_BUSY; /* initialize flags */ 3095 } 3096 3097 /* 3098 * md_bioclone is needed as long as the real bioclone only takes a daddr_t 3099 * as block number. 3100 * We simply call bioclone with all input parameters but blkno, and set the 3101 * correct blkno afterwards. 3102 * Caveat Emptor: bp_mem must not be NULL! 3103 */ 3104 buf_t * 3105 md_bioclone(buf_t *bp, off_t off, size_t len, dev_t dev, diskaddr_t blkno, 3106 int (*iodone)(buf_t *), buf_t *bp_mem, int sleep) 3107 { 3108 (void) bioclone(bp, off, len, dev, 0, iodone, bp_mem, sleep); 3109 bp_mem->b_lblkno = blkno; 3110 return (bp_mem); 3111 } 3112 3113 3114 /* 3115 * kstat stuff 3116 */ 3117 void 3118 md_kstat_init_ui( 3119 minor_t mnum, 3120 mdi_unit_t *ui 3121 ) 3122 { 3123 if ((ui != NULL) && (ui->ui_kstat == NULL)) { 3124 set_t setno = MD_MIN2SET(mnum); 3125 unit_t unit = MD_MIN2UNIT(mnum); 3126 char module[KSTAT_STRLEN]; 3127 char *p = module; 3128 3129 if (setno != MD_LOCAL_SET) { 3130 char buf[64]; 3131 char *s = buf; 3132 char *e = module + sizeof (module) - 4; 3133 3134 (void) sprintf(buf, "%u", setno); 3135 while ((p < e) && (*s != '\0')) 3136 *p++ = *s++; 3137 *p++ = '/'; 3138 } 3139 *p++ = 'm'; 3140 *p++ = 'd'; 3141 *p = '\0'; 3142 if ((ui->ui_kstat = kstat_create(module, unit, NULL, "disk", 3143 KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) != NULL) { 3144 ui->ui_kstat->ks_lock = &ui->ui_mx; 3145 kstat_install(ui->ui_kstat); 3146 } 3147 } 3148 } 3149 3150 void 3151 md_kstat_init( 3152 minor_t mnum 3153 ) 3154 { 3155 md_kstat_init_ui(mnum, MDI_UNIT(mnum)); 3156 } 3157 3158 void 3159 md_kstat_destroy_ui( 3160 mdi_unit_t *ui 3161 ) 3162 { 3163 /* 3164 * kstat_delete() interface has it's own locking mechanism and 3165 * does not allow holding of kstat lock (ks_lock). 3166 * Note: ks_lock == ui_mx from the md_kstat_init_ui(). 3167 */ 3168 if ((ui != NULL) && (ui->ui_kstat != NULL)) { 3169 kstat_delete(ui->ui_kstat); 3170 ui->ui_kstat = NULL; 3171 } 3172 } 3173 3174 void 3175 md_kstat_destroy( 3176 minor_t mnum 3177 ) 3178 { 3179 md_kstat_destroy_ui(MDI_UNIT(mnum)); 3180 } 3181 3182 /* 3183 * In the following subsequent routines, locks are held before checking the 3184 * validity of ui_kstat. 
This is done to make sure that we don't trip over 3185 * a NULL ui_kstat anymore. 3186 */ 3187 3188 void 3189 md_kstat_waitq_enter( 3190 mdi_unit_t *ui 3191 ) 3192 { 3193 mutex_enter(&ui->ui_mx); 3194 if (ui->ui_kstat != NULL) 3195 kstat_waitq_enter(KSTAT_IO_PTR(ui->ui_kstat)); 3196 mutex_exit(&ui->ui_mx); 3197 } 3198 3199 void 3200 md_kstat_waitq_to_runq( 3201 mdi_unit_t *ui 3202 ) 3203 { 3204 mutex_enter(&ui->ui_mx); 3205 if (ui->ui_kstat != NULL) 3206 kstat_waitq_to_runq(KSTAT_IO_PTR(ui->ui_kstat)); 3207 mutex_exit(&ui->ui_mx); 3208 } 3209 3210 void 3211 md_kstat_waitq_exit( 3212 mdi_unit_t *ui 3213 ) 3214 { 3215 mutex_enter(&ui->ui_mx); 3216 if (ui->ui_kstat != NULL) 3217 kstat_waitq_exit(KSTAT_IO_PTR(ui->ui_kstat)); 3218 mutex_exit(&ui->ui_mx); 3219 } 3220 3221 void 3222 md_kstat_runq_enter( 3223 mdi_unit_t *ui 3224 ) 3225 { 3226 mutex_enter(&ui->ui_mx); 3227 if (ui->ui_kstat != NULL) 3228 kstat_runq_enter(KSTAT_IO_PTR(ui->ui_kstat)); 3229 mutex_exit(&ui->ui_mx); 3230 } 3231 3232 void 3233 md_kstat_runq_exit( 3234 mdi_unit_t *ui 3235 ) 3236 { 3237 mutex_enter(&ui->ui_mx); 3238 if (ui->ui_kstat != NULL) 3239 kstat_runq_exit(KSTAT_IO_PTR(ui->ui_kstat)); 3240 mutex_exit(&ui->ui_mx); 3241 } 3242 3243 void 3244 md_kstat_done( 3245 mdi_unit_t *ui, 3246 buf_t *bp, 3247 int war 3248 ) 3249 { 3250 size_t n_done; 3251 3252 /* check for end of device */ 3253 if ((bp->b_resid != 0) && (! (bp->b_flags & B_ERROR))) { 3254 n_done = bp->b_bcount; 3255 } else if (bp->b_bcount < bp->b_resid) { 3256 n_done = 0; 3257 } else { 3258 n_done = bp->b_bcount - bp->b_resid; 3259 } 3260 3261 /* do accounting */ 3262 mutex_enter(&ui->ui_mx); 3263 if (ui->ui_kstat != NULL) { 3264 if ((! war) && (bp->b_flags & B_READ)) { 3265 KSTAT_IO_PTR(ui->ui_kstat)->reads++; 3266 KSTAT_IO_PTR(ui->ui_kstat)->nread += n_done; 3267 } else { 3268 KSTAT_IO_PTR(ui->ui_kstat)->writes++; 3269 KSTAT_IO_PTR(ui->ui_kstat)->nwritten += n_done; 3270 } 3271 kstat_runq_exit(KSTAT_IO_PTR(ui->ui_kstat)); 3272 } 3273 mutex_exit(&ui->ui_mx); 3274 } 3275 3276 pid_t 3277 md_getpid() 3278 { 3279 pid_t valuep; 3280 if (drv_getparm(PPID, (pid_t *)&valuep) != 0) { 3281 ASSERT(0); 3282 return ((pid_t)0); 3283 } else { 3284 ASSERT(valuep); 3285 return (valuep); 3286 } 3287 } 3288 3289 3290 proc_t * 3291 md_getproc() 3292 { 3293 proc_t *valuep; 3294 if (drv_getparm(UPROCP, (proc_t **)&valuep) != 0) { 3295 ASSERT(0); 3296 return ((proc_t *)NULL); 3297 } else { 3298 ASSERT(valuep); 3299 return (valuep); 3300 } 3301 } 3302 3303 extern kmutex_t pidlock; 3304 3305 /* 3306 * this check to see if a process pid pair are still running. For the 3307 * disk set lock when both pid/proc are zero then the locks is not 3308 * currently held. 3309 */ 3310 int 3311 md_checkpid(pid_t pid, proc_t *proc) 3312 { 3313 int retval = 1; 3314 3315 if (pid == 0 && proc == NULL) 3316 return (0); 3317 3318 mutex_enter(&pidlock); 3319 if (prfind(pid) != proc) 3320 retval = 0; 3321 mutex_exit(&pidlock); 3322 return (retval); 3323 } 3324 3325 /* 3326 * NAME: md_init_probereq 3327 * 3328 * DESCRIPTION: initializes a probe request. Parcels out the mnums such that 3329 * they can be dispatched to multiple daemon threads. 
3330 * 3331 * PARAMETERS: struct md_probedev *p pointer ioctl input 3332 * 3333 * RETURN VALUE: Returns errno 3334 * 3335 */ 3336 3337 int 3338 md_init_probereq(struct md_probedev_impl *p, daemon_queue_t **hdrpp) 3339 { 3340 int err = 0; 3341 int modindx; 3342 intptr_t (*probe_test)(); 3343 3344 /* 3345 * Initialize the semaphores and mutex 3346 * for the request 3347 */ 3348 3349 p->probe_sema = kmem_alloc(sizeof (ksema_t), KM_SLEEP); 3350 3351 p->probe_mx = kmem_alloc(sizeof (kmutex_t), KM_SLEEP); 3352 sema_init(PROBE_SEMA(p), 0, NULL, SEMA_DRIVER, NULL); 3353 mutex_init(PROBE_MX(p), NULL, MUTEX_DEFAULT, NULL); 3354 3355 modindx = md_getmodindex(&(p->probe.md_driver), 1, 1); 3356 probe_test = md_get_named_service(NODEV64, modindx, 3357 p->probe.test_name, 0); 3358 if (probe_test == NULL) { 3359 err = EINVAL; 3360 goto err_out; 3361 } 3362 3363 err = md_create_probe_rqlist(p, hdrpp, probe_test); 3364 err_out: 3365 return (err); 3366 } 3367 3368 /* 3369 * NAME: md_probe_one 3370 * 3371 * DESCRIPTION: Generic routine for probing disks. This is called from the 3372 * daemon. 3373 * 3374 * PARAMETERS: probe_req_t *reqp pointer to the probe request structure. 3375 * 3376 */ 3377 3378 void 3379 md_probe_one(probe_req_t *reqp) 3380 { 3381 mdi_unit_t *ui; 3382 md_probedev_impl_t *p; 3383 int err = 0; 3384 set_t setno; 3385 3386 p = (md_probedev_impl_t *)reqp->private_handle; 3387 /* 3388 * Validate the unit while holding the global ioctl lock, then 3389 * obtain the unit_writerlock. Once the writerlock has been obtained 3390 * we can release the global lock. As long as we hold one of these 3391 * locks this will prevent a metaclear operation being performed 3392 * on the metadevice because metaclear takes the readerlock (via 3393 * openclose lock). 3394 * To avoid a potential deadlock with the probe_fcn() causing i/o to 3395 * be issued to the writerlock'd metadevice we only grab the writerlock 3396 * if the unit is not an SVM root device. 3397 */ 3398 while (md_ioctl_lock_enter() == EINTR) 3399 ; 3400 setno = MD_MIN2SET(reqp->mnum); 3401 ui = MDI_UNIT(reqp->mnum); 3402 if (ui != NULL) { 3403 int writer_grabbed; 3404 dev_t svm_root; 3405 3406 if ((setno == MD_LOCAL_SET) && root_is_svm) { 3407 svm_root = getrootdev(); 3408 3409 if (getminor(svm_root) == reqp->mnum) { 3410 writer_grabbed = 0; 3411 } else { 3412 writer_grabbed = 1; 3413 (void) md_unit_writerlock_common(ui, 0); 3414 } 3415 } else { 3416 writer_grabbed = 1; 3417 (void) md_unit_writerlock_common(ui, 0); 3418 } 3419 (void) md_ioctl_lock_exit(0, 0, 0, FALSE); 3420 err = (*reqp->probe_fcn)(ui, reqp->mnum); 3421 if (writer_grabbed) { 3422 md_unit_writerexit(ui); 3423 } 3424 } else { 3425 (void) md_ioctl_lock_exit(0, 0, 0, FALSE); 3426 } 3427 3428 /* update the info in the probe structure */ 3429 3430 mutex_enter(PROBE_MX(p)); 3431 if (err != 0) { 3432 cmn_err(CE_NOTE, "md_probe_one: err %d mnum %d\n", err, 3433 reqp->mnum); 3434 (void) mdsyserror(&(p->probe.mde), err); 3435 } 3436 3437 mutex_exit(PROBE_MX(p)); 3438 sema_v(PROBE_SEMA(p)); 3439 3440 kmem_free(reqp, sizeof (probe_req_t)); 3441 } 3442 char * 3443 md_strdup(char *cp) 3444 { 3445 char *new_cp = NULL; 3446 3447 new_cp = kmem_alloc(strlen(cp) + 1, KM_SLEEP); 3448 3449 return (strcpy(new_cp, cp)); 3450 } 3451 3452 void 3453 freestr(char *cp) 3454 { 3455 kmem_free(cp, strlen(cp) + 1); 3456 } 3457 3458 /* 3459 * Validate the list and skip invalid devices. Then create 3460 * a doubly linked circular list of devices to probe. 3461 * The hdr points to the head and tail of this list. 
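 *
 * Purely as an illustration of the list manipulation performed below (a
 * sketch of what the code does, not an additional interface): appending
 * a new request tp to a non-empty ring headed by hp updates four links
 * so that tp becomes the new tail:
 *
 *	tp->dq.dq_next = hp;
 *	tp->dq.dq_prev = hp->dq_prev;
 *	hp->dq_prev->dq_next = (daemon_queue_t *)tp;
 *	hp->dq_prev = (daemon_queue_t *)tp;
 *
 * For the first entry, hp is simply pointed at tp and both links of tp
 * refer back to tp itself.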
3462 */ 3463 3464 static int 3465 md_create_probe_rqlist(md_probedev_impl_t *plist, daemon_queue_t **hdr, 3466 intptr_t (*probe_test)()) 3467 { 3468 int i, err, nodevcnt; 3469 probe_req_t *tp; 3470 daemon_queue_t *hp; 3471 minor_t mnum; 3472 3473 nodevcnt = 0; 3474 3475 hp = NULL; 3476 3477 for (i = 0; i < plist->probe.nmdevs; i++) { 3478 mnum = ((minor_t *)(uintptr_t)(plist->probe.mnum_list))[i]; 3479 if (MDI_UNIT(mnum) == NULL) { 3480 cmn_err(CE_WARN, "md: Cannot probe %s since it does " 3481 "not exist", md_shortname(mnum)); 3482 nodevcnt++; 3483 continue; 3484 } 3485 tp = kmem_alloc(sizeof (probe_req_t), KM_SLEEP); 3486 tp->mnum = mnum; 3487 tp->private_handle = (void *)plist; 3488 tp->probe_fcn = probe_test; 3489 if (hp == NULL) { 3490 hp = (daemon_queue_t *)tp; 3491 hp->dq_prev = hp->dq_next = (daemon_queue_t *)tp; 3492 } else { 3493 tp->dq.dq_next = hp; 3494 tp->dq.dq_prev = hp->dq_prev; 3495 hp->dq_prev->dq_next = (daemon_queue_t *)tp; 3496 hp->dq_prev = (daemon_queue_t *)tp; 3497 } 3498 } 3499 3500 *hdr = hp; 3501 if (nodevcnt > 0) 3502 plist->probe.nmdevs -= nodevcnt; 3503 3504 /* 3505 * If there are no devices to be probed because they were 3506 * incorrect, then return an error. 3507 */ 3508 err = (plist->probe.nmdevs == 0) ? ENODEV : 0; 3509 3510 return (err); 3511 } 3512 3513 /* 3514 * This routine increments the I/O count for set I/O operations. This 3515 * value is used to determine if an I/O can done. If a release is in 3516 * process this will return an error and cause the I/O to be errored. 3517 */ 3518 int 3519 md_inc_iocount(set_t setno) 3520 { 3521 int rc = 0; 3522 3523 if (setno == 0) 3524 return (0); 3525 3526 mutex_enter(&md_set_io[setno].md_io_mx); 3527 if (!(md_set_io[setno].io_state & MD_SET_ACTIVE)) { 3528 rc = EIO; 3529 goto out; 3530 } 3531 3532 ASSERT(md_set_io[setno].io_cnt >= 0); 3533 md_set_io[setno].io_cnt++; 3534 3535 out: mutex_exit(&md_set_io[setno].md_io_mx); 3536 return (rc); 3537 } 3538 3539 void 3540 md_inc_iocount_noblock(set_t setno) 3541 { 3542 3543 if (setno == 0) 3544 return; 3545 3546 mutex_enter(&md_set_io[setno].md_io_mx); 3547 md_set_io[setno].io_cnt++; 3548 mutex_exit(&md_set_io[setno].md_io_mx); 3549 } 3550 void 3551 md_dec_iocount(set_t setno) 3552 { 3553 3554 if (setno == 0) 3555 return; 3556 3557 mutex_enter(&md_set_io[setno].md_io_mx); 3558 md_set_io[setno].io_cnt--; 3559 ASSERT(md_set_io[setno].io_cnt >= 0); 3560 if ((md_set_io[setno].io_state & MD_SET_RELEASE) && 3561 (md_set_io[setno].io_cnt == 0)) 3562 cv_broadcast(&md_set_io[setno].md_io_cv); 3563 mutex_exit(&md_set_io[setno].md_io_mx); 3564 } 3565 3566 int 3567 md_isblock_setio(set_t setno) 3568 { 3569 int rc = 0; 3570 3571 if (setno == 0) 3572 return (0); 3573 3574 mutex_enter(&md_set_io[setno].md_io_mx); 3575 if (md_set_io[setno].io_state & MD_SET_RELEASE) 3576 rc = 1; 3577 3578 mutex_exit(&md_set_io[setno].md_io_mx); 3579 return (rc); 3580 } 3581 3582 int 3583 md_block_setio(set_t setno) 3584 { 3585 int rc = 0; 3586 3587 if (setno == 0) 3588 return (1); 3589 3590 mutex_enter(&md_set_io[setno].md_io_mx); 3591 md_set_io[setno].io_state = MD_SET_RELEASE; 3592 3593 while (md_set_io[setno].io_cnt > 0) { 3594 cv_wait(&md_set_io[setno].md_io_cv, 3595 &md_set_io[setno].md_io_mx); 3596 } 3597 rc = 1; 3598 3599 3600 ASSERT(md_set_io[setno].io_cnt == 0); 3601 mutex_exit(&md_set_io[setno].md_io_mx); 3602 3603 return (rc); 3604 } 3605 3606 void 3607 md_clearblock_setio(set_t setno) 3608 { 3609 if (setno == 0) 3610 return; 3611 3612 mutex_enter(&md_set_io[setno].md_io_mx); 3613 
md_set_io[setno].io_state = MD_SET_ACTIVE; 3614 mutex_exit(&md_set_io[setno].md_io_mx); 3615 } 3616 3617 void 3618 md_unblock_setio(set_t setno) 3619 { 3620 if (setno == 0) 3621 return; 3622 3623 mutex_enter(&md_set_io[setno].md_io_mx); 3624 #ifdef DEBUG 3625 if (md_set_io[setno].io_cnt != 0) { 3626 cmn_err(CE_NOTE, "set %d count was %ld at take", 3627 setno, md_set_io[setno].io_cnt); 3628 } 3629 #endif /* DEBUG */ 3630 3631 md_set_io[setno].io_state = MD_SET_ACTIVE; 3632 md_set_io[setno].io_cnt = 0; 3633 mutex_exit(&md_set_io[setno].md_io_mx); 3634 } 3635 3636 /* 3637 * Test and set version of the md_block_setio. 3638 * Set the io_state to keep new I/O from being issued. 3639 * If there is I/O currently in progress, then set io_state to active 3640 * and return failure. Otherwise, return a 1 for success. 3641 * 3642 * Used in a MN diskset since the commd must be suspended before 3643 * this node can attempt to withdraw from a diskset. But, with commd 3644 * suspended, I/O may have been issued that can never finish until 3645 * commd is resumed (allocation of hotspare, etc). So, if I/O is 3646 * outstanding after diskset io_state is marked RELEASE, then set diskset 3647 * io_state back to ACTIVE and return failure. 3648 */ 3649 int 3650 md_tas_block_setio(set_t setno) 3651 { 3652 int rc; 3653 3654 if (setno == 0) 3655 return (1); 3656 3657 mutex_enter(&md_set_io[setno].md_io_mx); 3658 md_set_io[setno].io_state = MD_SET_RELEASE; 3659 3660 if (md_set_io[setno].io_cnt > 0) { 3661 md_set_io[setno].io_state = MD_SET_ACTIVE; 3662 rc = 0; 3663 } else { 3664 rc = 1; 3665 } 3666 3667 mutex_exit(&md_set_io[setno].md_io_mx); 3668 3669 return (rc); 3670 } 3671 3672 void 3673 md_biodone(struct buf *pb) 3674 { 3675 minor_t mnum; 3676 set_t setno; 3677 mdi_unit_t *ui; 3678 3679 mnum = getminor(pb->b_edev); 3680 setno = MD_MIN2SET(mnum); 3681 3682 if (setno == 0) { 3683 biodone(pb); 3684 return; 3685 } 3686 3687 #ifdef DEBUG 3688 ui = MDI_UNIT(mnum); 3689 if (!md_unit_isopen(ui)) 3690 cmn_err(CE_NOTE, "io after close on %s\n", md_shortname(mnum)); 3691 #endif /* DEBUG */ 3692 3693 /* 3694 * Handle the local diskset 3695 */ 3696 if (md_set_io[setno].io_cnt > 0) 3697 md_dec_iocount(setno); 3698 3699 #ifdef DEBUG 3700 /* 3701 * this is being done after the lock is dropped so there 3702 * are cases it may be invalid. It is advisory. 3703 */ 3704 if (md_set_io[setno].io_state & MD_SET_RELEASE) { 3705 /* Only display this error once for this metadevice */ 3706 if ((ui->ui_tstate & MD_RELEASE_IOERR_DONE) == 0) { 3707 cmn_err(CE_NOTE, 3708 "I/O to %s attempted during set RELEASE\n", 3709 md_shortname(mnum)); 3710 ui->ui_tstate |= MD_RELEASE_IOERR_DONE; 3711 } 3712 } 3713 #endif /* DEBUG */ 3714 3715 biodone(pb); 3716 } 3717 3718 3719 /* 3720 * Driver special private devt handling routine 3721 * INPUT: md_dev64_t 3722 * OUTPUT: dev_t, 32 bit on a 32 bit kernel, 64 bit on a 64 bit kernel. 3723 */ 3724 dev_t 3725 md_dev64_to_dev(md_dev64_t dev) 3726 { 3727 major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64; 3728 minor_t minor = (minor_t)(dev & MAXMIN64); 3729 3730 return (makedevice(major, minor)); 3731 3732 } 3733 3734 /* 3735 * Driver private makedevice routine 3736 * INPUT: major_t major, minor_t minor 3737 * OUTPUT: md_dev64_t, no matter if on 32 bit or 64 bit kernel. 
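 *
 * Worked example (illustrative only, and assuming NBITSMINOR64 is 32,
 * its usual value; the major/minor numbers are made up for the example):
 *
 *	md_dev64_t d = md_makedevice(85, 17);
 *	    d == ((md_dev64_t)85 << 32) | 17
 *	    md_getmajor(d) == 85, md_getminor(d) == 17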
3738 */ 3739 md_dev64_t 3740 md_makedevice(major_t major, minor_t minor) 3741 { 3742 return (((md_dev64_t)major << NBITSMINOR64) | minor); 3743 3744 } 3745 3746 3747 /* 3748 * Driver private devt md_getmajor routine 3749 * INPUT: dev a 64 bit container holding either a 32 bit or a 64 bit device 3750 * OUTPUT: the appropriate major number 3751 */ 3752 major_t 3753 md_getmajor(md_dev64_t dev) 3754 { 3755 major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64; 3756 3757 if (major == 0) { 3758 /* Here we were given a 32bit dev */ 3759 major = (major_t)(dev >> NBITSMINOR32) & MAXMAJ32; 3760 } 3761 return (major); 3762 } 3763 3764 /* 3765 * Driver private devt md_getminor routine 3766 * INPUT: dev a 64 bit container holding either a 32 bit or a 64 bit device 3767 * OUTPUT: the appropriate minor number 3768 */ 3769 minor_t 3770 md_getminor(md_dev64_t dev) 3771 { 3772 minor_t minor; 3773 major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64; 3774 3775 if (major == 0) { 3776 /* Here we were given a 32bit dev */ 3777 minor = (minor_t)(dev & MAXMIN32); 3778 } else { 3779 minor = (minor_t)(dev & MAXMIN64); 3780 } 3781 return (minor); 3782 } 3783 3784 int 3785 md_check_ioctl_against_unit(int cmd, mdc_unit_t c) 3786 { 3787 /* 3788 * If the metadevice is an old style device, it has a vtoc, 3789 * in that case all reading EFI ioctls are not applicable. 3790 * If the metadevice has an EFI label, reading vtoc and geom ioctls 3791 * are not supposed to work. 3792 */ 3793 switch (cmd) { 3794 case DKIOCGGEOM: 3795 case DKIOCGAPART: 3796 /* if > 2 TB then fail */ 3797 if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) { 3798 return (ENOTSUP); 3799 } 3800 break; 3801 case DKIOCGVTOC: 3802 /* if > 2 TB then fail */ 3803 if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) { 3804 return (ENOTSUP); 3805 } 3806 3807 /* if > 1 TB but < 2TB return overflow */ 3808 if (c.un_revision & MD_64BIT_META_DEV) { 3809 return (EOVERFLOW); 3810 } 3811 break; 3812 case DKIOCGEXTVTOC: 3813 /* if > 2 TB then fail */ 3814 if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) { 3815 return (ENOTSUP); 3816 } 3817 break; 3818 case DKIOCGETEFI: 3819 case DKIOCPARTITION: 3820 if ((c.un_flag & MD_EFILABEL) == 0) { 3821 return (ENOTSUP); 3822 } 3823 break; 3824 3825 case DKIOCSETEFI: 3826 /* setting an EFI label should always be ok */ 3827 return (0); 3828 3829 case DKIOCSVTOC: 3830 /* if > 2 TB then fail */ 3831 if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) { 3832 return (ENOTSUP); 3833 } 3834 3835 /* if > 1 TB but < 2TB return overflow */ 3836 if (c.un_revision & MD_64BIT_META_DEV) { 3837 return (EOVERFLOW); 3838 } 3839 break; 3840 case DKIOCSEXTVTOC: 3841 if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) { 3842 return (ENOTSUP); 3843 } 3844 break; 3845 } 3846 return (0); 3847 } 3848 3849 /* 3850 * md_vtoc_to_efi_record() 3851 * Input: record id of the vtoc record 3852 * Output: record id of the efi record 3853 * Function: 3854 * - reads the volume name from the vtoc record 3855 * - converts the volume name to a format, libefi understands 3856 * - creates a new record of size MD_EFI_PARTNAME_BYTES 3857 * - stores the volname in that record, 3858 * - commits that record 3859 * - returns the recid of the efi record. 3860 * Caveat Emptor: 3861 * The calling routine must do something like 3862 * - un->c.un_vtoc_id = md_vtoc_to_efi_record(vtoc_recid) 3863 * - commit(un) 3864 * - delete(vtoc_recid) 3865 * in order to keep the mddb consistent in case of a panic in the middle. 
3866 * Errors: 3867 * - returns 0 on any error 3868 */ 3869 mddb_recid_t 3870 md_vtoc_to_efi_record(mddb_recid_t vtoc_recid, set_t setno) 3871 { 3872 struct vtoc *vtoc; 3873 ushort_t *v; 3874 mddb_recid_t efi_recid; 3875 int i; 3876 3877 if (mddb_getrecstatus(vtoc_recid) != MDDB_OK) { 3878 return (0); 3879 } 3880 vtoc = (struct vtoc *)mddb_getrecaddr(vtoc_recid); 3881 efi_recid = mddb_createrec(MD_EFI_PARTNAME_BYTES, MDDB_EFILABEL, 0, 3882 MD_CRO_32BIT, setno); 3883 if (efi_recid < 0) { 3884 return (0); 3885 } 3886 v = (ushort_t *)mddb_getrecaddr(efi_recid); 3887 3888 /* This for loop read, converts and writes */ 3889 for (i = 0; i < LEN_DKL_VVOL; i++) { 3890 v[i] = LE_16((uint16_t)vtoc->v_volume[i]); 3891 } 3892 /* commit the new record */ 3893 mddb_commitrec_wrapper(efi_recid); 3894 3895 return (efi_recid); 3896 } 3897 3898 /* 3899 * Send a kernel message. 3900 * user has to provide for an allocated result structure 3901 * If the door handler disappears we retry, emitting warnings every so often. 3902 * 3903 * The recipient argument is almost always unused, and is therefore typically 3904 * set to zero, as zero is an invalid cluster nodeid. The exceptions are the 3905 * marking and clearing of the DRL from a node that is not currently the 3906 * owner. In these cases, the recipient argument will be the nodeid of the 3907 * mirror owner, and MD_MSGF_DIRECTED will be set in the flags. Non-owner 3908 * nodes will not receive these messages. 3909 * 3910 * For the case where md_mn_is_commd_present() is false, we rely on the 3911 * "result" having been kmem_zalloc()ed which, in effect, sets MDMNE_NULL for 3912 * kmmr_comm_state making MDMN_KSEND_MSG_OK() result in 0. 3913 */ 3914 int 3915 mdmn_ksend_message( 3916 set_t setno, 3917 md_mn_msgtype_t type, 3918 uint_t flags, 3919 md_mn_nodeid_t recipient, 3920 char *data, 3921 int size, 3922 md_mn_kresult_t *result) 3923 { 3924 door_arg_t da; 3925 md_mn_kmsg_t *kmsg; 3926 uint_t send_try_cnt = 0; 3927 uint_t retry_noise_cnt = 0; 3928 int rval; 3929 k_sigset_t oldmask, newmask; 3930 3931 if (size > MDMN_MAX_KMSG_DATA) 3932 return (ENOMEM); 3933 kmsg = kmem_zalloc(sizeof (md_mn_kmsg_t), KM_SLEEP); 3934 kmsg->kmsg_flags = flags; 3935 kmsg->kmsg_setno = setno; 3936 kmsg->kmsg_recipient = recipient; 3937 kmsg->kmsg_type = type; 3938 kmsg->kmsg_size = size; 3939 bcopy(data, &(kmsg->kmsg_data), size); 3940 3941 /* 3942 * Wait for the door handle to be established. 3943 */ 3944 while (mdmn_door_did == -1) { 3945 if ((++retry_noise_cnt % MD_MN_WARN_INTVL) == 0) { 3946 cmn_err(CE_WARN, "door handle not yet ready. " 3947 "Check if /usr/lib/lvm/mddoors is running"); 3948 } 3949 delay(md_hz); 3950 } 3951 3952 /* 3953 * If MD_MSGF_BLK_SIGNAL is set, mask out all signals so that we 3954 * do not fail if the user process receives a signal while we're 3955 * active in the door interface. 3956 */ 3957 if (flags & MD_MSGF_BLK_SIGNAL) { 3958 sigfillset(&newmask); 3959 sigreplace(&newmask, &oldmask); 3960 } 3961 3962 /* 3963 * If message failed with an RPC_FAILURE when rpc.mdcommd had 3964 * been gracefully shutdown (md_mn_is_commd_present returns FALSE) 3965 * then don't retry the message anymore. If message 3966 * failed due to any other reason, then retry up to MD_MN_WARN_INTVL 3967 * times which should allow a shutting down system time to 3968 * notify the kernel of a graceful shutdown of rpc.mdcommd. 
3969 * 3970 * Caller of this routine will need to check the md_mn_commd_present 3971 * flag and the failure error in order to determine whether to panic 3972 * or not. If md_mn_commd_present is set to 0 and failure error 3973 * is RPC_FAILURE, the calling routine should not panic since the 3974 * system is in the process of being shutdown. 3975 * 3976 */ 3977 3978 retry_noise_cnt = send_try_cnt = 0; 3979 while (md_mn_is_commd_present_lite()) { 3980 /* 3981 * data_ptr and data_size are initialized here because on 3982 * return from the upcall, they contain data duplicated from 3983 * rbuf and rsize. This causes subsequent upcalls to fail. 3984 */ 3985 da.data_ptr = (char *)(kmsg); 3986 da.data_size = sizeof (md_mn_kmsg_t); 3987 da.desc_ptr = NULL; 3988 da.desc_num = 0; 3989 da.rbuf = (char *)result; 3990 da.rsize = sizeof (*result); 3991 3992 while ((rval = door_ki_upcall_limited(mdmn_door_handle, &da, 3993 NULL, SIZE_MAX, 0)) != 0) { 3994 if ((++retry_noise_cnt % MD_MN_WARN_INTVL) == 0) { 3995 if (rval == EAGAIN) { 3996 cmn_err(CE_WARN, 3997 "md: door_upcall failed. " 3998 "Check if mddoors is running."); 3999 } else if (rval == EINTR) { 4000 cmn_err(CE_WARN, 4001 "md: door_upcall failed. " 4002 "Check if rpc.mdcommd is running."); 4003 } else { 4004 cmn_err(CE_WARN, 4005 "md: door_upcall failed. " 4006 "Returned %d", 4007 rval); 4008 } 4009 } 4010 if (++send_try_cnt >= md_send_retry_limit) 4011 break; 4012 4013 delay(md_hz); 4014 4015 /* 4016 * data_ptr and data_size are re-initialized here 4017 * because on return from the upcall, they contain 4018 * data duplicated from rbuf and rsize. This causes 4019 * subsequent upcalls to fail. 4020 */ 4021 da.data_ptr = (char *)(kmsg); 4022 da.data_size = sizeof (md_mn_kmsg_t); 4023 da.desc_ptr = NULL; 4024 da.desc_num = 0; 4025 da.rbuf = (char *)result; 4026 da.rsize = sizeof (*result); 4027 } 4028 4029 4030 /* 4031 * If: 4032 * - the send succeeded (MDMNE_ACK) 4033 * - we had an MDMNE_RPC_FAIL and commd is now gone 4034 * (note: since the outer loop is commd-dependent, 4035 * checking MDMN_RPC_FAIL here is meaningless) 4036 * - we were told not to retry 4037 * - we exceeded the RPC failure send limit 4038 * punch out of the outer loop prior to the delay() 4039 */ 4040 if (result->kmmr_comm_state == MDMNE_ACK || 4041 (flags & MD_MSGF_KSEND_NORETRY) || 4042 (++send_try_cnt % md_send_retry_limit) == 0 || 4043 !md_mn_is_commd_present()) 4044 break; 4045 delay(md_hz); 4046 } 4047 4048 if (flags & MD_MSGF_BLK_SIGNAL) { 4049 sigreplace(&oldmask, (k_sigset_t *)NULL); 4050 } 4051 kmem_free(kmsg, sizeof (md_mn_kmsg_t)); 4052 4053 return (0); 4054 } 4055 4056 /* 4057 * Called to propagate the capability of a metadevice to all nodes in the set. 4058 * 4059 * On entry, lockp is set if the function has been called from within an ioctl. 4060 * 4061 * IOLOCK_RETURN_RELEASE, which drops the md_ioctl_lock is called in this 4062 * routine to enable other mdioctls to enter the kernel while this 4063 * thread of execution waits on the completion of mdmn_ksend_message. When 4064 * the message is completed the thread continues and md_ioctl_lock must be 4065 * reacquired. Even though md_ioctl_lock is interruptable, we choose to 4066 * ignore EINTR as we must not return without acquiring md_ioctl_lock. 
4067 */ 4068 4069 int 4070 mdmn_send_capability_message(minor_t mnum, volcap_t vc, IOLOCK *lockp) 4071 { 4072 md_mn_msg_setcap_t msg; 4073 md_mn_kresult_t *kres; 4074 mdi_unit_t *ui = MDI_UNIT(mnum); 4075 int ret; 4076 k_sigset_t oldmask, newmask; 4077 4078 (void) strncpy((char *)&msg.msg_setcap_driver, 4079 md_ops[ui->ui_opsindex]->md_driver.md_drivername, MD_DRIVERNAMELEN); 4080 msg.msg_setcap_mnum = mnum; 4081 msg.msg_setcap_set = vc.vc_set; 4082 4083 if (lockp) 4084 IOLOCK_RETURN_RELEASE(0, lockp); 4085 kres = kmem_zalloc(sizeof (md_mn_kresult_t), KM_SLEEP); 4086 4087 /* 4088 * Mask signals for the mdmn_ksend_message call. This keeps the door 4089 * interface from failing if the user process receives a signal while 4090 * in mdmn_ksend_message. 4091 */ 4092 sigfillset(&newmask); 4093 sigreplace(&newmask, &oldmask); 4094 ret = (mdmn_ksend_message(MD_MIN2SET(mnum), MD_MN_MSG_SET_CAP, 4095 MD_MSGF_NO_LOG, 0, (char *)&msg, sizeof (md_mn_msg_setcap_t), 4096 kres)); 4097 sigreplace(&oldmask, (k_sigset_t *)NULL); 4098 4099 if (!MDMN_KSEND_MSG_OK(ret, kres)) { 4100 mdmn_ksend_show_error(ret, kres, "MD_MN_MSG_SET_CAP"); 4101 ret = EIO; 4102 } 4103 kmem_free(kres, sizeof (md_mn_kresult_t)); 4104 4105 if (lockp) { 4106 IOLOCK_RETURN_REACQUIRE(lockp); 4107 } 4108 return (ret); 4109 } 4110 4111 /* 4112 * Called to clear all of the transient capabilities for a metadevice when it is 4113 * not open on any node in the cluster. 4114 * Called from close for mirror and sp. 4115 */ 4116 4117 void 4118 mdmn_clear_all_capabilities(minor_t mnum) 4119 { 4120 md_isopen_t clumsg; 4121 int ret; 4122 md_mn_kresult_t *kresult; 4123 volcap_t vc; 4124 k_sigset_t oldmask, newmask; 4125 4126 clumsg.dev = md_makedevice(md_major, mnum); 4127 clumsg.mde = mdnullerror; 4128 /* 4129 * The check open message doesn't have to be logged, nor should the 4130 * result be stored in the MCT. We want an up-to-date state. 4131 */ 4132 kresult = kmem_zalloc(sizeof (md_mn_kresult_t), KM_SLEEP); 4133 4134 /* 4135 * Mask signals for the mdmn_ksend_message call. This keeps the door 4136 * interface from failing if the user process receives a signal while 4137 * in mdmn_ksend_message. 4138 */ 4139 sigfillset(&newmask); 4140 sigreplace(&newmask, &oldmask); 4141 ret = mdmn_ksend_message(MD_MIN2SET(mnum), 4142 MD_MN_MSG_CLU_CHECK, 4143 MD_MSGF_STOP_ON_ERROR | MD_MSGF_NO_LOG | MD_MSGF_NO_MCT, 0, 4144 (char *)&clumsg, sizeof (clumsg), kresult); 4145 sigreplace(&oldmask, (k_sigset_t *)NULL); 4146 4147 if ((ret == 0) && (kresult->kmmr_exitval == 0)) { 4148 /* 4149 * Not open on any node, clear all capabilities, e.g. ABR and 4150 * DMR. 4151 */ 4152 vc.vc_set = 0; 4153 (void) mdmn_send_capability_message(mnum, vc, NULL); 4154 } 4155 kmem_free(kresult, sizeof (md_mn_kresult_t)); 4156 } 4157 4158 /* 4159 * mdmn_ksend_show_error: 4160 * --------------------- 4161 * Called to display the error contents of a failing mdmn_ksend_message() result. 4162 * 4163 * Input: 4164 * rv - return value from mdmn_ksend_message() 4165 * kres - pointer to result structure filled in by mdmn_ksend_message 4166 * s - Informative message to identify failing condition (e.g. 4167 * "Ownership change"). This string will be displayed with 4168 * cmn_err(CE_WARN, "%s *FAILED*",...)
to alert the system 4169 * administrator 4170 */ 4171 void 4172 mdmn_ksend_show_error(int rv, md_mn_kresult_t *kres, const char *s) 4173 { 4174 if (rv == 0) { 4175 cmn_err(CE_WARN, "%s *FAILED*", s); 4176 cmn_err(CE_CONT, "exit_val = %d, comm_state = %d, failing_node" 4177 " = %d", kres->kmmr_exitval, kres->kmmr_comm_state, 4178 kres->kmmr_failing_node); 4179 } else { 4180 cmn_err(CE_WARN, "%s *FAILED*, return value = %d", s, rv); 4181 } 4182 } 4183 4184 /* 4185 * Callback routine for resync thread. If requested to suspend we mark the 4186 * commd as not being present. 4187 */ 4188 boolean_t 4189 callb_md_mrs_cpr(void *arg, int code) 4190 { 4191 callb_cpr_t *cp = (callb_cpr_t *)arg; 4192 int ret = 0; /* assume success */ 4193 4194 mutex_enter(cp->cc_lockp); 4195 4196 switch (code) { 4197 case CB_CODE_CPR_CHKPT: 4198 /* 4199 * Mark the rpc.mdcommd as no longer present. We are trying to 4200 * suspend the system and so we should expect RPC failures to 4201 * occur. 4202 */ 4203 md_mn_clear_commd_present(); 4204 cp->cc_events |= CALLB_CPR_START; 4205 while (!(cp->cc_events & CALLB_CPR_SAFE)) 4206 /* cv_timedwait() returns -1 if it times out. */ 4207 if ((ret = cv_timedwait(&cp->cc_callb_cv, cp->cc_lockp, 4208 lbolt + CPR_KTHREAD_TIMEOUT_SEC * hz)) == -1) 4209 break; 4210 break; 4211 4212 case CB_CODE_CPR_RESUME: 4213 cp->cc_events &= ~CALLB_CPR_START; 4214 cv_signal(&cp->cc_stop_cv); 4215 break; 4216 } 4217 mutex_exit(cp->cc_lockp); 4218 return (ret != -1); 4219 } 4220 4221 4222 void 4223 md_rem_hspname(set_t setno, mdkey_t n_key) 4224 { 4225 int s; 4226 int max_sides; 4227 4228 4229 /* All entries removed are in the same diskset */ 4230 if (md_get_setstatus(setno) & MD_SET_MNSET) 4231 max_sides = MD_MNMAXSIDES; 4232 else 4233 max_sides = MD_MAXSIDES; 4234 4235 for (s = 0; s < max_sides; s++) 4236 (void) md_remdevname(setno, s, n_key); 4237 } 4238 4239 4240 int 4241 md_rem_selfname(minor_t selfid) 4242 { 4243 int s; 4244 set_t setno = MD_MIN2SET(selfid); 4245 int max_sides; 4246 md_dev64_t dev; 4247 struct nm_next_hdr *nh; 4248 struct nm_name *n; 4249 mdkey_t key; 4250 4251 /* 4252 * Get the key since remove routine expects it 4253 */ 4254 dev = md_makedevice(md_major, selfid); 4255 if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) == NULL) { 4256 return (ENOENT); 4257 } 4258 4259 if ((n = (struct nm_name *)lookup_entry(nh, setno, MD_SIDEWILD, 4260 MD_KEYWILD, dev, 0L)) == NULL) { 4261 return (ENOENT); 4262 } 4263 4264 /* All entries removed are in the same diskset */ 4265 key = n->n_key; 4266 if (md_get_setstatus(setno) & MD_SET_MNSET) 4267 max_sides = MD_MNMAXSIDES; 4268 else 4269 max_sides = MD_MAXSIDES; 4270 4271 for (s = 0; s < max_sides; s++) 4272 (void) md_remdevname(setno, s, key); 4273 4274 return (0); 4275 } 4276 4277 void 4278 md_upd_set_unnext(set_t setno, unit_t un) 4279 { 4280 if (un < md_set[setno].s_un_next) { 4281 md_set[setno].s_un_next = un; 4282 } 4283 } 4284 4285 struct hot_spare_pool * 4286 find_hot_spare_pool(set_t setno, int hsp_id) 4287 { 4288 hot_spare_pool_t *hsp; 4289 4290 hsp = (hot_spare_pool_t *)md_set[setno].s_hsp; 4291 while (hsp != NULL) { 4292 if (hsp->hsp_self_id == hsp_id) 4293 return (hsp); 4294 hsp = hsp->hsp_next; 4295 } 4296 4297 return ((hot_spare_pool_t *)0); 4298 } 4299 4300 /* 4301 * md_create_taskq: 4302 * 4303 * Create a kernel taskq for the given set/unit combination. This is typically 4304 * used to complete a RR_CLEAN request when the callee is unable to obtain the 4305 * mutex / condvar access required to update the DRL safely. 
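 *
 * A hypothetical caller (sketch only; func and arg are placeholders and
 * not defined in this file) would feed work to the queue through the
 * standard DDI taskq interfaces and destroy it when finished:
 *
 *	ddi_taskq_t *tqp = (ddi_taskq_t *)md_create_taskq(setno, mnum);
 *	(void) ddi_taskq_dispatch(tqp, func, arg, DDI_SLEEP);
 *	...
 *	ddi_taskq_destroy(tqp);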
4306 */ 4307 void * 4308 md_create_taskq(set_t setno, minor_t mnum) 4309 { 4310 char name[20]; 4311 ddi_taskq_t *tqp; 4312 4313 (void) snprintf(name, 20, "%d/d%d", setno, MD_MIN2UNIT(mnum)); 4314 4315 tqp = ddi_taskq_create(md_devinfo, name, 1, TASKQ_DEFAULTPRI, 0); 4316 4317 return ((void *)tqp); 4318 } 4319