1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <sys/param.h> 28 #include <sys/systm.h> 29 #include <sys/conf.h> 30 #include <sys/file.h> 31 #include <sys/user.h> 32 #include <sys/uio.h> 33 #include <sys/t_lock.h> 34 #include <sys/buf.h> 35 #include <sys/dkio.h> 36 #include <sys/vtoc.h> 37 #include <sys/kmem.h> 38 #include <vm/page.h> 39 #include <sys/cmn_err.h> 40 #include <sys/sysmacros.h> 41 #include <sys/types.h> 42 #include <sys/mkdev.h> 43 #include <sys/stat.h> 44 #include <sys/open.h> 45 #include <sys/modctl.h> 46 #include <sys/ddi.h> 47 #include <sys/sunddi.h> 48 #include <sys/debug.h> 49 #include <sys/dklabel.h> 50 #include <vm/hat.h> 51 #include <sys/lvm/mdvar.h> 52 #include <sys/lvm/md_mirror.h> 53 #include <sys/lvm/md_convert.h> 54 #include <sys/lvm/md_mddb.h> 55 #include <sys/esunddi.h> 56 57 #include <sys/sysevent/eventdefs.h> 58 #include <sys/sysevent/svm.h> 59 #include <sys/lvm/mdmn_commd.h> 60 #include <sys/avl.h> 61 62 md_ops_t mirror_md_ops; 63 #ifndef lint 64 char _depends_on[] = "drv/md"; 65 md_ops_t *md_interface_ops = &mirror_md_ops; 66 #endif 67 68 extern mdq_anchor_t md_done_daemon; 69 extern mdq_anchor_t md_mstr_daemon; 70 extern mdq_anchor_t md_mirror_daemon; 71 extern mdq_anchor_t md_mirror_io_daemon; 72 extern mdq_anchor_t md_mirror_rs_daemon; 73 extern mdq_anchor_t md_mhs_daemon; 74 75 extern unit_t md_nunits; 76 extern set_t md_nsets; 77 extern md_set_t md_set[]; 78 79 extern int md_status; 80 extern clock_t md_hz; 81 82 extern md_krwlock_t md_unit_array_rw; 83 extern kmutex_t md_mx; 84 extern kcondvar_t md_cv; 85 extern int md_mtioctl_cnt; 86 87 daemon_request_t mirror_timeout; 88 static daemon_request_t hotspare_request; 89 static daemon_request_t mn_hs_request[MD_MAXSETS]; /* Multinode hs req */ 90 91 int md_mirror_mcs_buf_off; 92 93 /* Flags for mdmn_ksend_message to allow debugging */ 94 int md_mirror_msg_flags; 95 96 #ifdef DEBUG 97 /* Flag to switch on debug messages */ 98 int mirror_debug_flag = 0; 99 #endif 100 101 /* 102 * Struct used to hold count of DMR reads and the timestamp of last DMR read 103 * It is used to verify, using a debugger, that the DMR read ioctl has been 104 * executed. 105 */ 106 dmr_stats_t mirror_dmr_stats = {0, 0}; 107 108 /* 109 * Mutex protecting list of non-failfast drivers. 110 */ 111 static kmutex_t non_ff_drv_mutex; 112 extern char **non_ff_drivers; 113 114 extern major_t md_major; 115 116 /* 117 * Write-On-Write memory pool. 
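 *
 * (Illustrative sketch only, not a description of the exact code below:
 * if the pages backing a write buffer can be modified by the caller while
 * the submirror writes are still in flight, the submirrors could be
 * written with differing data.  The write-on-write handling guards against
 * that roughly as follows, where wow_enabled and pages_may_change() are
 * stand-ins for the real checks and helpers:
 *
 *	if (wow_enabled && pages_may_change(bp)) {
 *		wowhdr = kmem_cache_alloc(mirror_wowblk_cache, KM_SLEEP);
 *		copy the caller's data into the private wow buffer;
 *		issue the submirror writes from that private copy;
 *	}
 *
 * so that every submirror sees one consistent image of the data.)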
118 */ 119 static void copy_write_cont(wowhdr_t *wowhdr); 120 static kmem_cache_t *mirror_wowblk_cache = NULL; 121 static int md_wowbuf_size = 16384; 122 static size_t md_wowblk_size; 123 124 /* 125 * This is a flag that allows: 126 * - disabling the write-on-write mechanism. 127 * - logging occurrences of write-on-write 128 * - switching wow handling procedure processing 129 * Counter for occurences of WOW. 130 */ 131 static uint_t md_mirror_wow_flg = 0; 132 static int md_mirror_wow_cnt = 0; 133 134 /* 135 * Tunable to enable/disable dirty region 136 * processing when closing down a mirror. 137 */ 138 static int new_resync = 1; 139 kmem_cache_t *mirror_parent_cache = NULL; 140 kmem_cache_t *mirror_child_cache = NULL; 141 142 extern int md_ff_disable; /* disable failfast */ 143 144 static int mirror_map_write(mm_unit_t *, md_mcs_t *, md_mps_t *, int); 145 static void mirror_read_strategy(buf_t *, int, void *); 146 static void mirror_write_strategy(buf_t *, int, void *); 147 static void become_owner(daemon_queue_t *); 148 static int mirror_done(struct buf *cb); 149 static int mirror_done_common(struct buf *cb); 150 static void clear_retry_error(struct buf *cb); 151 152 /* 153 * patchables 154 */ 155 int md_min_rr_size = 200; /* 2000 blocks, or 100k */ 156 int md_def_num_rr = 1000; /* Default number of dirty regions */ 157 158 /* 159 * patchable to change delay before rescheduling mirror ownership request. 160 * Value is clock ticks, default 0.5 seconds 161 */ 162 clock_t md_mirror_owner_to = 500000; 163 164 /*ARGSUSED1*/ 165 static int 166 mirror_parent_constructor(void *p, void *d1, int d2) 167 { 168 mutex_init(&((md_mps_t *)p)->ps_mx, NULL, MUTEX_DEFAULT, NULL); 169 return (0); 170 } 171 172 static void 173 mirror_parent_init(md_mps_t *ps) 174 { 175 bzero(ps, offsetof(md_mps_t, ps_mx)); 176 bzero(&ps->ps_overlap_node, sizeof (avl_node_t)); 177 } 178 179 /*ARGSUSED1*/ 180 static void 181 mirror_parent_destructor(void *p, void *d) 182 { 183 mutex_destroy(&((md_mps_t *)p)->ps_mx); 184 } 185 186 /*ARGSUSED1*/ 187 static int 188 mirror_child_constructor(void *p, void *d1, int d2) 189 { 190 bioinit(&((md_mcs_t *)p)->cs_buf); 191 return (0); 192 } 193 194 void 195 mirror_child_init(md_mcs_t *cs) 196 { 197 cs->cs_ps = NULL; 198 cs->cs_mdunit = 0; 199 md_bioreset(&cs->cs_buf); 200 } 201 202 /*ARGSUSED1*/ 203 static void 204 mirror_child_destructor(void *p, void *d) 205 { 206 biofini(&((md_mcs_t *)p)->cs_buf); 207 } 208 209 static void 210 mirror_wowblk_init(wowhdr_t *p) 211 { 212 bzero(p, md_wowblk_size); 213 } 214 215 static void 216 send_poke_hotspares_msg(daemon_request_t *drq) 217 { 218 int rval; 219 int nretries = 0; 220 md_mn_msg_pokehsp_t pokehsp; 221 md_mn_kresult_t *kresult; 222 set_t setno = (set_t)drq->dq.qlen; 223 224 pokehsp.pokehsp_setno = setno; 225 226 kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); 227 228 retry_sphmsg: 229 rval = mdmn_ksend_message(setno, MD_MN_MSG_POKE_HOTSPARES, 230 MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST, 0, (char *)&pokehsp, 231 sizeof (pokehsp), kresult); 232 233 if (!MDMN_KSEND_MSG_OK(rval, kresult)) { 234 mdmn_ksend_show_error(rval, kresult, "POKE_HOTSPARES"); 235 /* If we're shutting down already, pause things here. */ 236 if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) { 237 while (!md_mn_is_commd_present()) { 238 delay(md_hz); 239 } 240 /* 241 * commd has become reachable again, so retry once. 242 * If this fails we'll panic as the system is in an 243 * unexpected state. 
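			 *
			 * The shape of the retry, in isolation (sketch only;
			 * send_once() is a stand-in for the
			 * mdmn_ksend_message() call above, not a real
			 * interface):
			 *
			 *	retry:
			 *		rval = send_once();
			 *		if (message was not delivered) {
			 *			while (!md_mn_is_commd_present())
			 *				delay(md_hz);
			 *			if (nretries++ == 0)
			 *				goto retry;
			 *			cmn_err(CE_PANIC, "...");
			 *		}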
			 */
			if (nretries++ == 0)
				goto retry_sphmsg;
		}
		cmn_err(CE_PANIC,
		    "ksend_message failure: POKE_HOTSPARES");
	}
	kmem_free(kresult, sizeof (md_mn_kresult_t));

	/* Allow further requests to use this set's queue structure */
	mutex_enter(&drq->dr_mx);
	drq->dr_pending = 0;
	mutex_exit(&drq->dr_mx);
}

/*
 * Send a poke_hotspares message to the master node. To avoid swamping the
 * commd handler with requests we only send a message if there is not one
 * already outstanding. We punt the request to a separate thread context as
 * we cannot afford to block waiting on the request to be serviced. This is
 * essential when a reconfig cycle is in progress as any open() of a multinode
 * metadevice may result in a livelock.
 */
static void
send_poke_hotspares(set_t setno)
{
	daemon_request_t	*drq = &mn_hs_request[setno];

	mutex_enter(&drq->dr_mx);
	if (drq->dr_pending == 0) {
		drq->dr_pending = 1;
		drq->dq.qlen = (int)setno;
		daemon_request(&md_mhs_daemon,
		    send_poke_hotspares_msg, (daemon_queue_t *)drq, REQ_OLD);
	}
	mutex_exit(&drq->dr_mx);
}

void
mirror_set_sm_state(
	mm_submirror_t		*sm,
	mm_submirror_ic_t	*smic,
	sm_state_t		newstate,
	int			force)
{
	int			compcnt;
	int			i;
	int			errcnt;
	sm_state_t		origstate;
	md_m_shared_t		*shared;

	if (force) {
		sm->sm_state = newstate;
		uniqtime32(&sm->sm_timestamp);
		return;
	}

	origstate = newstate;

	compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
	for (i = 0, errcnt = 0; i < compcnt; i++) {
		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
		    (sm->sm_dev, sm, i);
		if (shared->ms_state & (CS_ERRED | CS_LAST_ERRED))
			newstate |= SMS_COMP_ERRED;
		if (shared->ms_state & (CS_RESYNC))
			newstate |= SMS_COMP_RESYNC;
		if (shared->ms_state & CS_ERRED)
			errcnt++;
	}

	if ((newstate & (SMS_COMP_ERRED | SMS_COMP_RESYNC)) != 0)
		newstate &= ~origstate;

	if (errcnt == compcnt)
		newstate |= SMS_ALL_ERRED;
	else
		newstate &= ~SMS_ALL_ERRED;

	sm->sm_state = newstate;
	uniqtime32(&sm->sm_timestamp);
}

static int
mirror_geterror(mm_unit_t *un, int *smi, int *cip, int clr_error,
	int frm_probe)
{
	mm_submirror_t		*sm;
	mm_submirror_ic_t	*smic;
	md_m_shared_t		*shared;
	int			ci;
	int			i;
	int			compcnt;
	int			open_comp;	/* flag for open component */

	for (i = *smi; i < NMIRROR; i++) {
		sm = &un->un_sm[i];
		smic = &un->un_smic[i];

		if (!SMS_IS(sm, SMS_INUSE))
			continue;

		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
		for (ci = *cip; ci < compcnt; ci++) {
			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
			    (sm->sm_dev, sm, ci);
			/*
			 * if called from any routine but probe, we check for
			 * the MDM_S_ISOPEN flag. Since probe does a pseudo
			 * open, it sets the MDM_S_PROBEOPEN flag and we test
			 * for that flag instead. The two tests are mutually
			 * exclusive.
			 */
			open_comp = (frm_probe) ?
357 (shared->ms_flags & MDM_S_PROBEOPEN): 358 (shared->ms_flags & MDM_S_ISOPEN); 359 if ((shared->ms_flags & MDM_S_IOERR || !open_comp) && 360 ((shared->ms_state == CS_OKAY) || 361 (shared->ms_state == CS_RESYNC))) { 362 if (clr_error) { 363 shared->ms_flags &= ~MDM_S_IOERR; 364 } 365 *cip = ci; 366 *smi = i; 367 return (1); 368 } 369 370 if (clr_error && (shared->ms_flags & MDM_S_IOERR)) { 371 shared->ms_flags &= ~MDM_S_IOERR; 372 } 373 } 374 375 *cip = 0; 376 } 377 return (0); 378 } 379 380 /*ARGSUSED*/ 381 static void 382 mirror_run_queue(void *d) 383 { 384 if (!(md_status & MD_GBL_DAEMONS_LIVE)) 385 md_daemon(1, &md_done_daemon); 386 } 387 /* 388 * check_comp_4_hotspares 389 * 390 * This function attempts to allocate a hotspare for this component if the 391 * component is in error. In a MN set, the function can be called in 2 modes. 392 * It can be called either when a component error has been detected or when a 393 * new hotspare has been allocated. In this case, MD_HOTSPARE_XMIT is set 394 * in flags and the request is sent to all nodes. 395 * The handler on each of the nodes then calls this function with 396 * MD_HOTSPARE_XMIT unset and the hotspare allocation is then performed. 397 * 398 * For non-MN sets the function simply attempts to allocate a hotspare. 399 * 400 * On entry, the following locks are held 401 * mirror_md_ops.md_link_rw (if flags has MD_HOTSPARE_LINKHELD set) 402 * md_unit_writerlock 403 * 404 * Returns 0 if ok 405 * 1 if the unit containing the component has been cleared while 406 * the mdmn_ksend_message() was being executed 407 */ 408 extern int 409 check_comp_4_hotspares( 410 mm_unit_t *un, 411 int smi, 412 int ci, 413 uint_t flags, 414 mddb_recid_t hs_id, /* Only used by MN disksets */ 415 IOLOCK *lockp /* can be NULL */ 416 ) 417 { 418 mm_submirror_t *sm; 419 mm_submirror_ic_t *smic; 420 md_m_shared_t *shared; 421 mddb_recid_t recids[6]; 422 minor_t mnum; 423 intptr_t (*hs_dev)(); 424 void (*hs_done)(); 425 void *hs_data; 426 md_error_t mde = mdnullerror; 427 set_t setno; 428 md_mn_msg_allochsp_t allochspmsg; 429 md_mn_kresult_t *kresult; 430 mm_unit_t *new_un; 431 int rval; 432 int nretries = 0; 433 434 mnum = MD_SID(un); 435 setno = MD_UN2SET(un); 436 sm = &un->un_sm[smi]; 437 smic = &un->un_smic[smi]; 438 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) 439 (sm->sm_dev, sm, ci); 440 441 if (shared->ms_state != CS_ERRED) 442 return (0); 443 444 /* Don't start a new component resync if a resync is already running. */ 445 if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) 446 return (0); 447 448 if (MD_MNSET_SETNO(setno) && (flags & MD_HOTSPARE_XMIT)) { 449 uint_t msgflags; 450 md_mn_msgtype_t msgtype; 451 452 /* Send allocate hotspare message to all nodes */ 453 454 allochspmsg.msg_allochsp_mnum = un->c.un_self_id; 455 allochspmsg.msg_allochsp_sm = smi; 456 allochspmsg.msg_allochsp_comp = ci; 457 allochspmsg.msg_allochsp_hs_id = shared->ms_hs_id; 458 459 /* 460 * Before calling mdmn_ksend_message(), release locks 461 * Can never be in the context of an ioctl. 
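		 *
		 * The overall shape is drop-send-reacquire-revalidate
		 * (a sketch of the code that follows, not new interfaces):
		 *
		 *	md_unit_writerexit(MDI_UNIT(mnum));
		 *	rval = mdmn_ksend_message(...);     may block a while
		 *	new_un = md_unit_writerlock(MDI_UNIT(mnum));
		 *	if (new_un != un)	unit cleared or replaced while
		 *		return (1);	the lock was dropped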
462 */ 463 md_unit_writerexit(MDI_UNIT(mnum)); 464 if (flags & MD_HOTSPARE_LINKHELD) 465 rw_exit(&mirror_md_ops.md_link_rw.lock); 466 #ifdef DEBUG 467 if (mirror_debug_flag) 468 printf("send alloc hotspare, flags=" 469 "0x%x %x, %x, %x, %x\n", flags, 470 allochspmsg.msg_allochsp_mnum, 471 allochspmsg.msg_allochsp_sm, 472 allochspmsg.msg_allochsp_comp, 473 allochspmsg.msg_allochsp_hs_id); 474 #endif 475 if (flags & MD_HOTSPARE_WMUPDATE) { 476 msgtype = MD_MN_MSG_ALLOCATE_HOTSPARE2; 477 /* 478 * When coming from an update of watermarks, there 479 * must already be a message logged that triggered 480 * this action. So, no need to log this message, too. 481 */ 482 msgflags = MD_MSGF_NO_LOG; 483 } else { 484 msgtype = MD_MN_MSG_ALLOCATE_HOTSPARE; 485 msgflags = MD_MSGF_DEFAULT_FLAGS; 486 } 487 488 kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); 489 490 cc4hs_msg: 491 rval = mdmn_ksend_message(setno, msgtype, msgflags, 0, 492 (char *)&allochspmsg, sizeof (allochspmsg), 493 kresult); 494 495 if (!MDMN_KSEND_MSG_OK(rval, kresult)) { 496 #ifdef DEBUG 497 if (mirror_debug_flag) 498 mdmn_ksend_show_error(rval, kresult, 499 "ALLOCATE HOTSPARE"); 500 #endif 501 /* 502 * If message is sent ok but exitval indicates an error 503 * it must be because the mirror has been cleared. In 504 * this case re-obtain lock and return an error 505 */ 506 if ((rval == 0) && (kresult->kmmr_exitval != 0)) { 507 if (flags & MD_HOTSPARE_LINKHELD) { 508 rw_enter(&mirror_md_ops.md_link_rw.lock, 509 RW_READER); 510 } 511 kmem_free(kresult, sizeof (md_mn_kresult_t)); 512 return (1); 513 } 514 /* If we're shutting down already, pause things here. */ 515 if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) { 516 while (!md_mn_is_commd_present()) { 517 delay(md_hz); 518 } 519 /* 520 * commd has become reachable again, so retry 521 * once. If this fails we'll panic as the 522 * system is in an unexpected state. 523 */ 524 if (nretries++ == 0) 525 goto cc4hs_msg; 526 } 527 cmn_err(CE_PANIC, 528 "ksend_message failure: ALLOCATE_HOTSPARE"); 529 } 530 kmem_free(kresult, sizeof (md_mn_kresult_t)); 531 532 /* 533 * re-obtain the locks 534 */ 535 if (flags & MD_HOTSPARE_LINKHELD) 536 rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER); 537 new_un = md_unit_writerlock(MDI_UNIT(mnum)); 538 539 /* 540 * As we had to release the locks in order to send the 541 * message to all nodes, we need to check to see if the 542 * unit has changed. If it has we release the writerlock 543 * and return fail. 544 */ 545 if ((new_un != un) || (un->c.un_type != MD_METAMIRROR)) { 546 md_unit_writerexit(MDI_UNIT(mnum)); 547 return (1); 548 } 549 } else { 550 if (MD_MNSET_SETNO(setno)) { 551 /* 552 * If 2 or more nodes simultaneously see a 553 * component failure, these nodes will each 554 * send an ALLOCATE_HOTSPARE[2] message. 555 * The first message will allocate the hotspare 556 * and the subsequent messages should do nothing. 557 * 558 * If a slave node doesn't have a hotspare allocated 559 * at the time the message is initiated, then the 560 * passed in hs_id will be 0. If the node 561 * executing this routine has a component shared 562 * ms_hs_id of non-zero, but the message shows a 563 * hs_id of 0, then just return since a hotspare 564 * has already been allocated for this failing 565 * component. When the slave node returns from 566 * the ksend_message the hotspare will have 567 * already been allocated. 
568 * 569 * If the slave node does send an hs_id of non-zero, 570 * and the slave node's hs_id matches this node's 571 * ms_hs_id, then the hotspare has error'd and 572 * should be replaced. 573 * 574 * If the slave node sends an hs_id of non-zero and 575 * this node has a different shared ms_hs_id, then 576 * just return since this hotspare has already 577 * been hotspared. 578 */ 579 if (shared->ms_hs_id != 0) { 580 if (hs_id == 0) { 581 #ifdef DEBUG 582 if (mirror_debug_flag) { 583 printf("check_comp_4_hotspares" 584 "(NOXMIT), short circuit " 585 "hs_id=0x%x, " 586 "ms_hs_id=0x%x\n", 587 hs_id, shared->ms_hs_id); 588 } 589 #endif 590 return (0); 591 } 592 if (hs_id != shared->ms_hs_id) { 593 #ifdef DEBUG 594 if (mirror_debug_flag) { 595 printf("check_comp_4_hotspares" 596 "(NOXMIT), short circuit2 " 597 "hs_id=0x%x, " 598 "ms_hs_id=0x%x\n", 599 hs_id, shared->ms_hs_id); 600 } 601 #endif 602 return (0); 603 } 604 } 605 } 606 607 sm = &un->un_sm[smi]; 608 hs_dev = md_get_named_service(sm->sm_dev, 0, 609 "hotspare device", 0); 610 if ((*hs_dev)(sm->sm_dev, 0, ci, recids, 6, &hs_done, 611 &hs_data) != 0) 612 return (0); 613 614 /* 615 * set_sm_comp_state() commits the modified records. 616 * As we don't transmit the changes, no need to drop the lock. 617 */ 618 set_sm_comp_state(un, smi, ci, CS_RESYNC, recids, 619 MD_STATE_NO_XMIT, (IOLOCK *)NULL); 620 621 (*hs_done)(sm->sm_dev, hs_data); 622 623 mirror_check_failfast(mnum); 624 625 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_HOTSPARED, SVM_TAG_METADEVICE, 626 setno, MD_SID(un)); 627 628 /* 629 * For a multi-node set we need to reset the un_rs_type, 630 * un_rs_resync_done and un_rs_resync_2_do fields as the 631 * hot-spare resync must copy all applicable data. 632 */ 633 if (MD_MNSET_SETNO(setno)) { 634 un->un_rs_type = MD_RS_NONE; 635 un->un_rs_resync_done = 0; 636 un->un_rs_resync_2_do = 0; 637 } 638 639 /* 640 * Must drop writer lock since mirror_resync_unit will 641 * open devices and must be able to grab readerlock. 642 * Don't need to drop IOLOCK since any descendent routines 643 * calling ksend_messages will drop the IOLOCK as needed. 644 * 645 */ 646 if (lockp) { 647 md_ioctl_writerexit(lockp); 648 } else { 649 md_unit_writerexit(MDI_UNIT(mnum)); 650 } 651 652 /* start resync */ 653 (void) mirror_resync_unit(mnum, NULL, &mde, lockp); 654 655 if (lockp) { 656 new_un = md_ioctl_writerlock(lockp, MDI_UNIT(mnum)); 657 } else { 658 new_un = md_unit_writerlock(MDI_UNIT(mnum)); 659 } 660 } 661 return (0); 662 } 663 664 /* 665 * check_unit_4_hotspares 666 * 667 * For a given mirror, allocate hotspares, if available for any components 668 * that are in error 669 * 670 * Returns 0 if ok 671 * 1 if check_comp_4_hotspares returns non-zero. This will only 672 * happen for a MN unit where the unit has been cleared while 673 * the allocate hotspare message is sent to all nodes. 674 */ 675 static int 676 check_unit_4_hotspares(mm_unit_t *un, int flags) 677 { 678 mm_submirror_t *sm; 679 mm_submirror_ic_t *smic; 680 int ci; 681 int i; 682 int compcnt; 683 684 if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) 685 return (0); 686 687 for (i = 0; i < NMIRROR; i++) { 688 sm = &un->un_sm[i]; 689 smic = &un->un_smic[i]; 690 if (!SMS_IS(sm, SMS_INUSE)) 691 continue; 692 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, sm); 693 for (ci = 0; ci < compcnt; ci++) { 694 md_m_shared_t *shared; 695 696 shared = (md_m_shared_t *) 697 (*(smic->sm_shared_by_indx))(sm->sm_dev, sm, ci); 698 /* 699 * Never called from ioctl context, so pass in 700 * (IOLOCK *)NULL. 
Pass through flags from calling
			 * routine, also setting XMIT flag.
			 */
			if (check_comp_4_hotspares(un, i, ci,
			    (MD_HOTSPARE_XMIT | flags),
			    shared->ms_hs_id, (IOLOCK *)NULL) != 0)
				return (1);
		}
	}
	return (0);
}

static void
check_4_hotspares(daemon_request_t *drq)
{
	mdi_unit_t	*ui;
	mm_unit_t	*un;
	md_link_t	*next;
	int		x;

	mutex_enter(&drq->dr_mx);	/* clear up front so can poke */
	drq->dr_pending = 0;		/* again in low level routine if */
	mutex_exit(&drq->dr_mx);	/* something found to do */

	/*
	 * Used to have a problem here. The disksets weren't marked as being
	 * MNHOLD. This opened a window where we could be searching for
	 * hotspares and have the disk set unloaded (released) from under
	 * us causing a panic in stripe_component_count().
	 * The way to prevent that is to mark the set MNHOLD which prevents
	 * any diskset from being released while we are scanning the mirrors,
	 * submirrors and components.
	 */

	for (x = 0; x < md_nsets; x++)
		md_holdset_enter(x);

	rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
	for (next = mirror_md_ops.md_head; next != NULL; next = next->ln_next) {
		ui = MDI_UNIT(next->ln_id);

		un = (mm_unit_t *)md_unit_readerlock(ui);

		/*
		 * Only check the unit if we are the master for this set.
		 * For an MN set, poke_hotspares() is only effective on the
		 * master.
		 */
		if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
		    md_set[MD_UN2SET(un)].s_am_i_master == 0) {
			md_unit_readerexit(ui);
			continue;
		}
		if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) {
			md_unit_readerexit(ui);
			continue;
		}
		md_unit_readerexit(ui);

		un = (mm_unit_t *)md_unit_writerlock(ui);
		/*
		 * check_unit_4_hotspares will exit 1 if the unit has been
		 * removed during the process of allocating the hotspare.
		 * This can only happen for a MN metadevice. If the unit no
		 * longer exists, there is no need to release the writerlock.
		 */
		if (check_unit_4_hotspares(un, MD_HOTSPARE_LINKHELD) == 0)
			md_unit_writerexit(ui);
		else {
			/*
			 * If check_unit_4_hotspares failed, queue another
			 * request and break out of this one
			 */
			(void) poke_hotspares();
			break;
		}
	}
	rw_exit(&mirror_md_ops.md_link_rw.lock);

	for (x = 0; x < md_nsets; x++)
		md_holdset_exit(x);
}

/*
 * poke_hotspares
 *
 * If there is not already a poke_hotspares request pending, queue a request
 * to call check_4_hotspares(). This will scan all mirrors and attempt to
 * allocate hotspares for all components in error.
 */
int
poke_hotspares()
{
	mutex_enter(&hotspare_request.dr_mx);
	if (hotspare_request.dr_pending == 0) {
		hotspare_request.dr_pending = 1;
		daemon_request(&md_mhs_daemon,
		    check_4_hotspares, (daemon_queue_t *)&hotspare_request,
		    REQ_OLD);
	}
	mutex_exit(&hotspare_request.dr_mx);
	return (0);
}

static void
free_all_ecomps(err_comp_t *ecomp)
{
	err_comp_t	*d;

	while (ecomp != NULL) {
		d = ecomp;
		ecomp = ecomp->ec_next;
		kmem_free(d, sizeof (err_comp_t));
	}
}

/*
 * NAME: mirror_openfail_console_info
 *
 * DESCRIPTION: Prints an informative message to the console when a mirror
 *		cannot be opened.
821 * 822 * PARAMETERS: mm_unit_t un - pointer to mirror unit structure 823 * int smi - submirror index 824 * int ci - component index 825 */ 826 827 void 828 mirror_openfail_console_info(mm_unit_t *un, int smi, int ci) 829 { 830 void (*get_dev)(); 831 ms_cd_info_t cd; 832 md_dev64_t tmpdev; 833 834 tmpdev = un->un_sm[smi].sm_dev; 835 get_dev = (void (*)())md_get_named_service(tmpdev, 0, "get device", 0); 836 if (get_dev != NULL) { 837 (void) (*get_dev)(tmpdev, smi, ci, &cd); 838 cmn_err(CE_WARN, "md %s: open error on %s", 839 md_shortname(MD_SID(un)), md_devname(MD_UN2SET(un), 840 cd.cd_dev, NULL, 0)); 841 } else { 842 cmn_err(CE_WARN, "md %s: open error", 843 md_shortname(MD_SID(un))); 844 } 845 } 846 847 static int 848 mirror_close_all_devs(mm_unit_t *un, int md_cflags) 849 { 850 int i; 851 md_dev64_t dev; 852 853 for (i = 0; i < NMIRROR; i++) { 854 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) 855 continue; 856 dev = un->un_sm[i].sm_dev; 857 md_layered_close(dev, md_cflags); 858 } 859 return (0); 860 } 861 862 /* 863 * Keep track of drivers that don't support failfast. We use this so that 864 * we only log one diagnostic message for each of these drivers, no matter 865 * how many times we run the mirror_check_failfast function. 866 * Return 1 if this is a new driver that does not support failfast, 867 * return 0 if we have already seen this non-failfast driver. 868 */ 869 static int 870 new_non_ff_driver(const char *s) 871 { 872 mutex_enter(&non_ff_drv_mutex); 873 if (non_ff_drivers == NULL) { 874 non_ff_drivers = (char **)kmem_alloc(2 * sizeof (char *), 875 KM_NOSLEEP); 876 if (non_ff_drivers == NULL) { 877 mutex_exit(&non_ff_drv_mutex); 878 return (1); 879 } 880 881 non_ff_drivers[0] = (char *)kmem_alloc(strlen(s) + 1, 882 KM_NOSLEEP); 883 if (non_ff_drivers[0] == NULL) { 884 kmem_free(non_ff_drivers, 2 * sizeof (char *)); 885 non_ff_drivers = NULL; 886 mutex_exit(&non_ff_drv_mutex); 887 return (1); 888 } 889 890 (void) strcpy(non_ff_drivers[0], s); 891 non_ff_drivers[1] = NULL; 892 893 } else { 894 int i; 895 char **tnames; 896 char **tmp; 897 898 for (i = 0; non_ff_drivers[i] != NULL; i++) { 899 if (strcmp(s, non_ff_drivers[i]) == 0) { 900 mutex_exit(&non_ff_drv_mutex); 901 return (0); 902 } 903 } 904 905 /* allow for new element and null */ 906 i += 2; 907 tnames = (char **)kmem_alloc(i * sizeof (char *), KM_NOSLEEP); 908 if (tnames == NULL) { 909 mutex_exit(&non_ff_drv_mutex); 910 return (1); 911 } 912 913 for (i = 0; non_ff_drivers[i] != NULL; i++) 914 tnames[i] = non_ff_drivers[i]; 915 916 tnames[i] = (char *)kmem_alloc(strlen(s) + 1, KM_NOSLEEP); 917 if (tnames[i] == NULL) { 918 /* adjust i so that it is the right count to free */ 919 kmem_free(tnames, (i + 2) * sizeof (char *)); 920 mutex_exit(&non_ff_drv_mutex); 921 return (1); 922 } 923 924 (void) strcpy(tnames[i++], s); 925 tnames[i] = NULL; 926 927 tmp = non_ff_drivers; 928 non_ff_drivers = tnames; 929 /* i now represents the count we previously alloced */ 930 kmem_free(tmp, i * sizeof (char *)); 931 } 932 mutex_exit(&non_ff_drv_mutex); 933 934 return (1); 935 } 936 937 /* 938 * Check for the "ddi-failfast-supported" devtree property on each submirror 939 * component to indicate if we should do I/O to that submirror with the 940 * B_FAILFAST flag set or not. This check is made at various state transitions 941 * in the mirror code (e.g. open, enable, hotspare, etc.). Sometimes we 942 * only need to check one drive (e.g. 
hotspare) but since the check is 943 * fast and infrequent and sometimes needs to be done on all components we 944 * just check all components on each call. 945 */ 946 void 947 mirror_check_failfast(minor_t mnum) 948 { 949 int i; 950 mm_unit_t *un; 951 952 if (md_ff_disable) 953 return; 954 955 un = MD_UNIT(mnum); 956 957 for (i = 0; i < NMIRROR; i++) { 958 int ci; 959 int cnt; 960 int ff = 1; 961 mm_submirror_t *sm; 962 mm_submirror_ic_t *smic; 963 void (*get_dev)(); 964 965 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) 966 continue; 967 968 sm = &un->un_sm[i]; 969 smic = &un->un_smic[i]; 970 971 get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0, 972 "get device", 0); 973 974 cnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm); 975 for (ci = 0; ci < cnt; ci++) { 976 int found = 0; 977 dev_t ci_dev; 978 major_t major; 979 dev_info_t *devi; 980 ms_cd_info_t cd; 981 982 /* 983 * this already returns the hs 984 * dev if the device is spared 985 */ 986 (void) (*get_dev)(sm->sm_dev, sm, ci, &cd); 987 988 ci_dev = md_dev64_to_dev(cd.cd_dev); 989 major = getmajor(ci_dev); 990 991 if (major == md_major) { 992 /* 993 * this component must be a soft 994 * partition; get the real dev 995 */ 996 minor_t dev_mnum; 997 mdi_unit_t *ui; 998 mp_unit_t *un; 999 set_t setno; 1000 side_t side; 1001 md_dev64_t tmpdev; 1002 1003 ui = MDI_UNIT(getminor(ci_dev)); 1004 1005 /* grab necessary lock */ 1006 un = (mp_unit_t *)md_unit_readerlock(ui); 1007 1008 dev_mnum = MD_SID(un); 1009 setno = MD_MIN2SET(dev_mnum); 1010 side = mddb_getsidenum(setno); 1011 1012 tmpdev = un->un_dev; 1013 1014 /* Get dev by device id */ 1015 if (md_devid_found(setno, side, 1016 un->un_key) == 1) { 1017 tmpdev = md_resolve_bydevid(dev_mnum, 1018 tmpdev, un->un_key); 1019 } 1020 1021 md_unit_readerexit(ui); 1022 1023 ci_dev = md_dev64_to_dev(tmpdev); 1024 major = getmajor(ci_dev); 1025 } 1026 1027 if (ci_dev != NODEV32 && 1028 (devi = e_ddi_hold_devi_by_dev(ci_dev, 0)) 1029 != NULL) { 1030 ddi_prop_op_t prop_op = PROP_LEN_AND_VAL_BUF; 1031 int propvalue = 0; 1032 int proplength = sizeof (int); 1033 int error; 1034 struct cb_ops *cb; 1035 1036 if ((cb = devopsp[major]->devo_cb_ops) != 1037 NULL) { 1038 error = (*cb->cb_prop_op) 1039 (DDI_DEV_T_ANY, devi, prop_op, 1040 DDI_PROP_NOTPROM|DDI_PROP_DONTPASS, 1041 "ddi-failfast-supported", 1042 (caddr_t)&propvalue, &proplength); 1043 1044 if (error == DDI_PROP_SUCCESS) 1045 found = 1; 1046 } 1047 1048 if (!found && new_non_ff_driver( 1049 ddi_driver_name(devi))) { 1050 cmn_err(CE_NOTE, "!md: B_FAILFAST I/O" 1051 "disabled on %s", 1052 ddi_driver_name(devi)); 1053 } 1054 1055 ddi_release_devi(devi); 1056 } 1057 1058 /* 1059 * All components must support 1060 * failfast in the submirror. 1061 */ 1062 if (!found) { 1063 ff = 0; 1064 break; 1065 } 1066 } 1067 1068 if (ff) { 1069 sm->sm_flags |= MD_SM_FAILFAST; 1070 } else { 1071 sm->sm_flags &= ~MD_SM_FAILFAST; 1072 } 1073 } 1074 } 1075 1076 /* 1077 * Return true if the submirror is unavailable. 1078 * If any of the submirror components are opened then the submirror cannot 1079 * be unavailable (MD_INACCESSIBLE). 1080 * If any of the components are already in the errored state, then the submirror 1081 * cannot be unavailable (MD_INACCESSIBLE). 
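 *
 * Equivalently (a sketch of the predicate implemented below; the
 * "is open" test uses MDM_S_PROBEOPEN when called from probe and
 * MDM_S_ISOPEN otherwise):
 *
 *	for (ci = 0; ci < compcnt; ci++) {
 *		if (component ci is open)
 *			return (B_FALSE);
 *		if (ms_state is CS_ERRED or CS_LAST_ERRED)
 *			return (B_FALSE);
 *	}
 *	return (B_TRUE);	nothing open, nothing errored: treat the
 *				whole submirror as unavailable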
1082 */ 1083 static bool_t 1084 submirror_unavailable(mm_unit_t *un, int smi, int from_probe) 1085 { 1086 mm_submirror_t *sm; 1087 mm_submirror_ic_t *smic; 1088 md_m_shared_t *shared; 1089 int ci; 1090 int compcnt; 1091 1092 sm = &un->un_sm[smi]; 1093 smic = &un->un_smic[smi]; 1094 1095 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un); 1096 for (ci = 0; ci < compcnt; ci++) { 1097 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) 1098 (sm->sm_dev, sm, ci); 1099 if (from_probe) { 1100 if (shared->ms_flags & MDM_S_PROBEOPEN) 1101 return (B_FALSE); 1102 } else { 1103 if (shared->ms_flags & MDM_S_ISOPEN) 1104 return (B_FALSE); 1105 } 1106 if (shared->ms_state == CS_ERRED || 1107 shared->ms_state == CS_LAST_ERRED) 1108 return (B_FALSE); 1109 } 1110 1111 return (B_TRUE); 1112 } 1113 1114 static int 1115 mirror_open_all_devs(minor_t mnum, int md_oflags, IOLOCK *lockp) 1116 { 1117 int i; 1118 mm_unit_t *un; 1119 mdi_unit_t *ui; 1120 int err; 1121 int smi; 1122 int ci; 1123 err_comp_t *c; 1124 err_comp_t *ecomps = NULL; 1125 int smmask = 0; 1126 set_t setno; 1127 int sm_cnt; 1128 int sm_unavail_cnt; 1129 1130 mirror_check_failfast(mnum); 1131 1132 un = MD_UNIT(mnum); 1133 ui = MDI_UNIT(mnum); 1134 setno = MD_UN2SET(un); 1135 1136 for (i = 0; i < NMIRROR; i++) { 1137 md_dev64_t tmpdev = un->un_sm[i].sm_dev; 1138 1139 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) 1140 continue; 1141 if (md_layered_open(mnum, &tmpdev, md_oflags)) 1142 smmask |= SMI2BIT(i); 1143 un->un_sm[i].sm_dev = tmpdev; 1144 } 1145 1146 /* 1147 * If smmask is clear, all submirrors are accessible. Clear the 1148 * MD_INACCESSIBLE bit in this case. This bit is also cleared for the 1149 * mirror device. If smmask is set, we have to determine which of the 1150 * submirrors are in error. If no submirror is accessible we mark the 1151 * whole mirror as MD_INACCESSIBLE. 1152 */ 1153 if (smmask == 0) { 1154 if (lockp) { 1155 md_ioctl_readerexit(lockp); 1156 (void) md_ioctl_writerlock(lockp, ui); 1157 } else { 1158 md_unit_readerexit(ui); 1159 (void) md_unit_writerlock(ui); 1160 } 1161 ui->ui_tstate &= ~MD_INACCESSIBLE; 1162 if (lockp) { 1163 md_ioctl_writerexit(lockp); 1164 (void) md_ioctl_readerlock(lockp, ui); 1165 } else { 1166 md_unit_writerexit(ui); 1167 (void) md_unit_readerlock(ui); 1168 } 1169 1170 for (i = 0; i < NMIRROR; i++) { 1171 md_dev64_t tmpdev; 1172 mdi_unit_t *sm_ui; 1173 1174 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) 1175 continue; 1176 1177 tmpdev = un->un_sm[i].sm_dev; 1178 sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev))); 1179 (void) md_unit_writerlock(sm_ui); 1180 sm_ui->ui_tstate &= ~MD_INACCESSIBLE; 1181 md_unit_writerexit(sm_ui); 1182 } 1183 1184 return (0); 1185 } 1186 1187 for (i = 0; i < NMIRROR; i++) { 1188 md_dev64_t tmpdev; 1189 1190 if (!(smmask & SMI2BIT(i))) 1191 continue; 1192 1193 tmpdev = un->un_sm[i].sm_dev; 1194 err = md_layered_open(mnum, &tmpdev, MD_OFLG_CONT_ERRS); 1195 un->un_sm[i].sm_dev = tmpdev; 1196 ASSERT(err == 0); 1197 } 1198 1199 if (lockp) { 1200 md_ioctl_readerexit(lockp); 1201 un = (mm_unit_t *)md_ioctl_writerlock(lockp, ui); 1202 } else { 1203 md_unit_readerexit(ui); 1204 un = (mm_unit_t *)md_unit_writerlock(ui); 1205 } 1206 1207 /* 1208 * We want to make sure the unavailable flag is not masking a real 1209 * error on the submirror. 1210 * For each submirror, 1211 * if all of the submirror components couldn't be opened and there 1212 * are no errors on the submirror, then set the unavailable flag 1213 * otherwise, clear unavailable. 
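	 *
	 * In other words (a sketch of the accounting done below):
	 *
	 *	sm_cnt = sm_unavail_cnt = 0;
	 *	for each in-use submirror i:
	 *		sm_cnt++;
	 *		if (submirror_unavailable(un, i, 0))
	 *			mark it MD_INACCESSIBLE, sm_unavail_cnt++;
	 *		else
	 *			clear MD_INACCESSIBLE on it;
	 *	the mirror itself is MD_INACCESSIBLE only when
	 *	sm_cnt == sm_unavail_cnt
	 *
	 * e.g. a two-way mirror with one submirror wholly unreachable stays
	 * usable; only when every submirror is unavailable is the mirror
	 * marked inaccessible.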
1214 */ 1215 sm_cnt = 0; 1216 sm_unavail_cnt = 0; 1217 for (i = 0; i < NMIRROR; i++) { 1218 md_dev64_t tmpdev; 1219 mdi_unit_t *sm_ui; 1220 1221 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) 1222 continue; 1223 1224 sm_cnt++; 1225 tmpdev = un->un_sm[i].sm_dev; 1226 sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev))); 1227 1228 (void) md_unit_writerlock(sm_ui); 1229 if (submirror_unavailable(un, i, 0)) { 1230 sm_ui->ui_tstate |= MD_INACCESSIBLE; 1231 sm_unavail_cnt++; 1232 } else { 1233 sm_ui->ui_tstate &= ~MD_INACCESSIBLE; 1234 } 1235 md_unit_writerexit(sm_ui); 1236 } 1237 1238 /* 1239 * If all of the submirrors are unavailable, the mirror is also 1240 * unavailable. 1241 */ 1242 if (sm_cnt == sm_unavail_cnt) { 1243 ui->ui_tstate |= MD_INACCESSIBLE; 1244 } else { 1245 ui->ui_tstate &= ~MD_INACCESSIBLE; 1246 } 1247 1248 smi = 0; 1249 ci = 0; 1250 while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) { 1251 if (mirror_other_sources(un, smi, ci, 1) == 1) { 1252 1253 free_all_ecomps(ecomps); 1254 (void) mirror_close_all_devs(un, md_oflags); 1255 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, 1256 SVM_TAG_METADEVICE, setno, MD_SID(un)); 1257 mirror_openfail_console_info(un, smi, ci); 1258 if (lockp) { 1259 md_ioctl_writerexit(lockp); 1260 (void) md_ioctl_readerlock(lockp, ui); 1261 } else { 1262 md_unit_writerexit(ui); 1263 (void) md_unit_readerlock(ui); 1264 } 1265 return (ENXIO); 1266 } 1267 1268 /* track all component states that need changing */ 1269 c = (err_comp_t *)kmem_alloc(sizeof (err_comp_t), KM_SLEEP); 1270 c->ec_next = ecomps; 1271 c->ec_smi = smi; 1272 c->ec_ci = ci; 1273 ecomps = c; 1274 ci++; 1275 } 1276 1277 /* Make all state changes and commit them */ 1278 for (c = ecomps; c != NULL; c = c->ec_next) { 1279 /* 1280 * If lockp is set, then entering kernel through ioctl. 1281 * For a MN set, the only ioctl path is via a commd message 1282 * (ALLOCATE_HOTSPARE or *RESYNC* messages) that is already 1283 * being sent to each node. 1284 * In this case, set NO_XMIT so that set_sm_comp_state 1285 * won't attempt to send a message on a message. 1286 * 1287 * In !MN sets, the xmit flag is ignored, so it doesn't matter 1288 * which flag is passed. 1289 */ 1290 if (lockp) { 1291 set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0, 1292 MD_STATE_NO_XMIT, lockp); 1293 } else { 1294 set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0, 1295 (MD_STATE_XMIT | MD_STATE_OCHELD), lockp); 1296 } 1297 /* 1298 * For a MN set, the NOTIFY is done when the state change is 1299 * processed on each node 1300 */ 1301 if (!MD_MNSET_SETNO(setno)) { 1302 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, 1303 SVM_TAG_METADEVICE, setno, MD_SID(un)); 1304 } 1305 } 1306 1307 if (lockp) { 1308 md_ioctl_writerexit(lockp); 1309 (void) md_ioctl_readerlock(lockp, ui); 1310 } else { 1311 md_unit_writerexit(ui); 1312 (void) md_unit_readerlock(ui); 1313 } 1314 1315 free_all_ecomps(ecomps); 1316 1317 /* allocate hotspares for all errored components */ 1318 if (MD_MNSET_SETNO(setno)) { 1319 /* 1320 * If we're called from an ioctl (lockp set) then we cannot 1321 * directly call send_poke_hotspares as this will block until 1322 * the message gets despatched to all nodes. If the cluster is 1323 * going through a reconfig cycle then the message will block 1324 * until the cycle is complete, and as we originate from a 1325 * service call from commd we will livelock. 
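		 *
		 * Hence the guarded call below (sketch):
		 *
		 *	if (lockp == NULL) {
		 *		md_unit_readerexit(ui);	    safe to block here
		 *		send_poke_hotspares(setno);
		 *		(void) md_unit_readerlock(ui);
		 *	}
		 *
		 * In the ioctl (lockp != NULL) case the poke is simply
		 * skipped to avoid the livelock described above.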
1326 */ 1327 if (lockp == NULL) { 1328 md_unit_readerexit(ui); 1329 send_poke_hotspares(setno); 1330 (void) md_unit_readerlock(ui); 1331 } 1332 } else { 1333 (void) poke_hotspares(); 1334 } 1335 return (0); 1336 } 1337 1338 void 1339 mirror_overlap_tree_remove(md_mps_t *ps) 1340 { 1341 mm_unit_t *un; 1342 1343 if (panicstr) 1344 return; 1345 1346 VERIFY(ps->ps_flags & MD_MPS_ON_OVERLAP); 1347 un = ps->ps_un; 1348 1349 mutex_enter(&un->un_overlap_tree_mx); 1350 avl_remove(&un->un_overlap_root, ps); 1351 ps->ps_flags &= ~MD_MPS_ON_OVERLAP; 1352 if (un->un_overlap_tree_flag != 0) { 1353 un->un_overlap_tree_flag = 0; 1354 cv_broadcast(&un->un_overlap_tree_cv); 1355 } 1356 mutex_exit(&un->un_overlap_tree_mx); 1357 } 1358 1359 1360 /* 1361 * wait_for_overlaps: 1362 * ----------------- 1363 * Check that given i/o request does not cause an overlap with already pending 1364 * i/o. If it does, block until the overlapped i/o completes. 1365 * 1366 * The flag argument has MD_OVERLAP_ALLOW_REPEAT set if it is ok for the parent 1367 * structure to be already in the overlap tree and MD_OVERLAP_NO_REPEAT if 1368 * it must not already be in the tree. 1369 */ 1370 static void 1371 wait_for_overlaps(md_mps_t *ps, int flags) 1372 { 1373 mm_unit_t *un; 1374 avl_index_t where; 1375 md_mps_t *ps1; 1376 1377 if (panicstr) 1378 return; 1379 1380 un = ps->ps_un; 1381 mutex_enter(&un->un_overlap_tree_mx); 1382 if ((flags & MD_OVERLAP_ALLOW_REPEAT) && 1383 (ps->ps_flags & MD_MPS_ON_OVERLAP)) { 1384 mutex_exit(&un->un_overlap_tree_mx); 1385 return; 1386 } 1387 1388 VERIFY(!(ps->ps_flags & MD_MPS_ON_OVERLAP)); 1389 1390 do { 1391 ps1 = avl_find(&un->un_overlap_root, ps, &where); 1392 if (ps1 == NULL) { 1393 /* 1394 * The candidate range does not overlap with any 1395 * range in the tree. Insert it and be done. 1396 */ 1397 avl_insert(&un->un_overlap_root, ps, where); 1398 ps->ps_flags |= MD_MPS_ON_OVERLAP; 1399 } else { 1400 /* 1401 * The candidate range would overlap. Set the flag 1402 * indicating we need to be woken up, and sleep 1403 * until another thread removes a range. If upon 1404 * waking up we find this mps was put on the tree 1405 * by another thread, the loop terminates. 1406 */ 1407 un->un_overlap_tree_flag = 1; 1408 cv_wait(&un->un_overlap_tree_cv, 1409 &un->un_overlap_tree_mx); 1410 } 1411 } while (!(ps->ps_flags & MD_MPS_ON_OVERLAP)); 1412 mutex_exit(&un->un_overlap_tree_mx); 1413 } 1414 1415 /* 1416 * This function is called from mirror_done to check whether any pages have 1417 * been modified while a mirrored write was in progress. Returns 0 if 1418 * all pages associated with bp are clean, 1 otherwise. 
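 *
 * (Sketch of the intent: biomodified(9F) roughly reports 1 when a page
 * backing the buffer has been modified, 0 when none has, and -1 when it
 * cannot tell, e.g. when the buffer has no page mappings to inspect; the
 * wrapper below simply folds "cannot tell" into "clean":
 *
 *	rval = biomodified(bp);
 *	if (rval == -1)
 *		rval = 0;
 *
 * A non-zero result is what the write-on-write machinery is interested in,
 * since pages changing mid-write could leave submirrors with differing
 * data.)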
1419 */ 1420 static int 1421 any_pages_dirty(struct buf *bp) 1422 { 1423 int rval; 1424 1425 rval = biomodified(bp); 1426 if (rval == -1) 1427 rval = 0; 1428 1429 return (rval); 1430 } 1431 1432 #define MAX_EXTRAS 10 1433 1434 void 1435 mirror_commit( 1436 mm_unit_t *un, 1437 int smmask, 1438 mddb_recid_t *extras 1439 ) 1440 { 1441 mm_submirror_t *sm; 1442 md_unit_t *su; 1443 int i; 1444 1445 /* 2=mirror,null id */ 1446 mddb_recid_t recids[NMIRROR+2+MAX_EXTRAS]; 1447 1448 int ri = 0; 1449 1450 if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE) 1451 return; 1452 1453 /* Add two, this includes the mirror unit and the null recid */ 1454 if (extras != NULL) { 1455 int nrecids = 0; 1456 while (extras[nrecids] != 0) { 1457 nrecids++; 1458 } 1459 ASSERT(nrecids <= MAX_EXTRAS); 1460 } 1461 1462 if (un != NULL) 1463 recids[ri++] = un->c.un_record_id; 1464 for (i = 0; i < NMIRROR; i++) { 1465 if (!(smmask & SMI2BIT(i))) 1466 continue; 1467 sm = &un->un_sm[i]; 1468 if (!SMS_IS(sm, SMS_INUSE)) 1469 continue; 1470 if (md_getmajor(sm->sm_dev) != md_major) 1471 continue; 1472 su = MD_UNIT(md_getminor(sm->sm_dev)); 1473 recids[ri++] = su->c.un_record_id; 1474 } 1475 1476 if (extras != NULL) 1477 while (*extras != 0) { 1478 recids[ri++] = *extras; 1479 extras++; 1480 } 1481 1482 if (ri == 0) 1483 return; 1484 recids[ri] = 0; 1485 1486 /* 1487 * Ok to hold ioctl lock across record commit to mddb as 1488 * long as the record(s) being committed aren't resync records. 1489 */ 1490 mddb_commitrecs_wrapper(recids); 1491 } 1492 1493 1494 /* 1495 * This routine is used to set a bit in the writable_bm bitmap 1496 * which represents each submirror in a metamirror which 1497 * is writable. The first writable submirror index is assigned 1498 * to the sm_index. The number of writable submirrors are returned in nunits. 1499 * 1500 * This routine returns the submirror's unit number. 
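 *
 * Sketch of the bitmap construction (SMI2BIT(i) is the bit for submirror i):
 *
 *	writable_bm = 0; nunits = 0;
 *	for (i = 0; i < NMIRROR; i++) {
 *		if (SUBMIRROR_IS_WRITEABLE(un, i)) {
 *			writable_bm |= SMI2BIT(i);
 *			nunits++;
 *		}
 *	}
 *
 * e.g. with submirrors 0 and 2 writable, ps_writable_sm ends up as 0x5 and
 * ps_active_cnt as 2.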
1501 */ 1502 1503 static void 1504 select_write_units(struct mm_unit *un, md_mps_t *ps) 1505 { 1506 1507 int i; 1508 unsigned writable_bm = 0; 1509 unsigned nunits = 0; 1510 1511 for (i = 0; i < NMIRROR; i++) { 1512 if (SUBMIRROR_IS_WRITEABLE(un, i)) { 1513 /* set bit of all writable units */ 1514 writable_bm |= SMI2BIT(i); 1515 nunits++; 1516 } 1517 } 1518 ps->ps_writable_sm = writable_bm; 1519 ps->ps_active_cnt = nunits; 1520 ps->ps_current_sm = 0; 1521 } 1522 1523 static 1524 unsigned 1525 select_write_after_read_units(struct mm_unit *un, md_mps_t *ps) 1526 { 1527 1528 int i; 1529 unsigned writable_bm = 0; 1530 unsigned nunits = 0; 1531 1532 for (i = 0; i < NMIRROR; i++) { 1533 if (SUBMIRROR_IS_WRITEABLE(un, i) && 1534 un->un_sm[i].sm_flags & MD_SM_RESYNC_TARGET) { 1535 writable_bm |= SMI2BIT(i); 1536 nunits++; 1537 } 1538 } 1539 if ((writable_bm & ps->ps_allfrom_sm) != 0) { 1540 writable_bm &= ~ps->ps_allfrom_sm; 1541 nunits--; 1542 } 1543 ps->ps_writable_sm = writable_bm; 1544 ps->ps_active_cnt = nunits; 1545 ps->ps_current_sm = 0; 1546 return (nunits); 1547 } 1548 1549 static md_dev64_t 1550 select_read_unit( 1551 mm_unit_t *un, 1552 diskaddr_t blkno, 1553 u_longlong_t reqcount, 1554 u_longlong_t *cando, 1555 int must_be_opened, 1556 md_m_shared_t **shared, 1557 md_mcs_t *cs) 1558 { 1559 int i; 1560 md_m_shared_t *s; 1561 uint_t lasterrcnt = 0; 1562 md_dev64_t dev = 0; 1563 u_longlong_t cnt; 1564 u_longlong_t mincnt; 1565 mm_submirror_t *sm; 1566 mm_submirror_ic_t *smic; 1567 mdi_unit_t *ui; 1568 1569 mincnt = reqcount; 1570 for (i = 0; i < NMIRROR; i++) { 1571 if (!SUBMIRROR_IS_READABLE(un, i)) 1572 continue; 1573 sm = &un->un_sm[i]; 1574 smic = &un->un_smic[i]; 1575 cnt = reqcount; 1576 1577 /* 1578 * If the current submirror is marked as inaccessible, do not 1579 * try to access it. 1580 */ 1581 ui = MDI_UNIT(getminor(expldev(sm->sm_dev))); 1582 (void) md_unit_readerlock(ui); 1583 if (ui->ui_tstate & MD_INACCESSIBLE) { 1584 md_unit_readerexit(ui); 1585 continue; 1586 } 1587 md_unit_readerexit(ui); 1588 1589 s = (md_m_shared_t *)(*(smic->sm_shared_by_blk)) 1590 (sm->sm_dev, sm, blkno, &cnt); 1591 1592 if (must_be_opened && !(s->ms_flags & MDM_S_ISOPEN)) 1593 continue; 1594 if (s->ms_state == CS_OKAY) { 1595 *cando = cnt; 1596 if (shared != NULL) 1597 *shared = s; 1598 1599 if (un->un_sm[i].sm_flags & MD_SM_FAILFAST && 1600 cs != NULL) { 1601 cs->cs_buf.b_flags |= B_FAILFAST; 1602 } 1603 1604 return (un->un_sm[i].sm_dev); 1605 } 1606 if (s->ms_state != CS_LAST_ERRED) 1607 continue; 1608 1609 /* don't use B_FAILFAST since we're Last Erred */ 1610 1611 if (mincnt > cnt) 1612 mincnt = cnt; 1613 if (s->ms_lasterrcnt > lasterrcnt) { 1614 lasterrcnt = s->ms_lasterrcnt; 1615 if (shared != NULL) 1616 *shared = s; 1617 dev = un->un_sm[i].sm_dev; 1618 } 1619 } 1620 *cando = mincnt; 1621 return (dev); 1622 } 1623 1624 /* 1625 * Given a 32-bit bitmap, this routine will return the bit number 1626 * of the nth bit set. The nth bit set is passed via the index integer. 1627 * 1628 * This routine is used to run through the writable submirror bitmap 1629 * and starting all of the writes. See the value returned is the 1630 * index to appropriate submirror structure, in the md_sm 1631 * array for metamirrors. 
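 *
 * Worked example: for mask 0x29 (bits 0, 3 and 5 set),
 *
 *	md_find_nth_unit(0x29, 0) == 0
 *	md_find_nth_unit(0x29, 1) == 3
 *	md_find_nth_unit(0x29, 2) == 5
 *
 * so a round-robin counter 0..2 maps directly onto the indices of the
 * running submirrors when three are running.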
1632 */ 1633 static int 1634 md_find_nth_unit(uint_t mask, int index) 1635 { 1636 int bit, nfound; 1637 1638 for (bit = -1, nfound = -1; nfound != index; bit++) { 1639 ASSERT(mask != 0); 1640 nfound += (mask & 1); 1641 mask >>= 1; 1642 } 1643 return (bit); 1644 } 1645 1646 static int 1647 fast_select_read_unit(md_mps_t *ps, md_mcs_t *cs) 1648 { 1649 mm_unit_t *un; 1650 buf_t *bp; 1651 int i; 1652 unsigned nunits = 0; 1653 int iunit; 1654 uint_t running_bm = 0; 1655 uint_t sm_index; 1656 1657 bp = &cs->cs_buf; 1658 un = ps->ps_un; 1659 1660 for (i = 0; i < NMIRROR; i++) { 1661 if (!SMS_BY_INDEX_IS(un, i, SMS_RUNNING)) 1662 continue; 1663 running_bm |= SMI2BIT(i); 1664 nunits++; 1665 } 1666 if (nunits == 0) 1667 return (1); 1668 1669 /* 1670 * For directed mirror read (DMR) we only use the specified side and 1671 * do not compute the source of the read. 1672 * If we're running with MD_MPS_DIRTY_RD set we always return the 1673 * first mirror side (this prevents unnecessary ownership switching). 1674 * Otherwise we return the submirror according to the mirror read option 1675 */ 1676 if (ps->ps_flags & MD_MPS_DMR) { 1677 sm_index = un->un_dmr_last_read; 1678 } else if (ps->ps_flags & MD_MPS_DIRTY_RD) { 1679 sm_index = md_find_nth_unit(running_bm, 0); 1680 } else { 1681 /* Normal (non-DMR) operation */ 1682 switch (un->un_read_option) { 1683 case RD_GEOMETRY: 1684 iunit = (int)(bp->b_lblkno / 1685 howmany(un->c.un_total_blocks, nunits)); 1686 sm_index = md_find_nth_unit(running_bm, iunit); 1687 break; 1688 case RD_FIRST: 1689 sm_index = md_find_nth_unit(running_bm, 0); 1690 break; 1691 case RD_LOAD_BAL: 1692 /* this is intentional to fall into the default */ 1693 default: 1694 un->un_last_read = (un->un_last_read + 1) % nunits; 1695 sm_index = md_find_nth_unit(running_bm, 1696 un->un_last_read); 1697 break; 1698 } 1699 } 1700 bp->b_edev = md_dev64_to_dev(un->un_sm[sm_index].sm_dev); 1701 ps->ps_allfrom_sm = SMI2BIT(sm_index); 1702 1703 if (un->un_sm[sm_index].sm_flags & MD_SM_FAILFAST) { 1704 bp->b_flags |= B_FAILFAST; 1705 } 1706 1707 return (0); 1708 } 1709 1710 static 1711 int 1712 mirror_are_submirrors_available(mm_unit_t *un) 1713 { 1714 int i; 1715 for (i = 0; i < NMIRROR; i++) { 1716 md_dev64_t tmpdev = un->un_sm[i].sm_dev; 1717 1718 if ((!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) || 1719 md_getmajor(tmpdev) != md_major) 1720 continue; 1721 1722 if ((MD_MIN2SET(md_getminor(tmpdev)) >= md_nsets) || 1723 (MD_MIN2UNIT(md_getminor(tmpdev)) >= md_nunits)) 1724 return (0); 1725 1726 if (MDI_UNIT(md_getminor(tmpdev)) == NULL) 1727 return (0); 1728 } 1729 return (1); 1730 } 1731 1732 void 1733 build_submirror(mm_unit_t *un, int i, int snarfing) 1734 { 1735 struct mm_submirror *sm; 1736 struct mm_submirror_ic *smic; 1737 md_unit_t *su; 1738 set_t setno; 1739 1740 sm = &un->un_sm[i]; 1741 smic = &un->un_smic[i]; 1742 1743 sm->sm_flags = 0; /* sometime we may need to do more here */ 1744 1745 setno = MD_UN2SET(un); 1746 1747 if (!SMS_IS(sm, SMS_INUSE)) 1748 return; 1749 if (snarfing) { 1750 sm->sm_dev = md_getdevnum(setno, mddb_getsidenum(setno), 1751 sm->sm_key, MD_NOTRUST_DEVT); 1752 } else { 1753 if (md_getmajor(sm->sm_dev) == md_major) { 1754 su = MD_UNIT(md_getminor(sm->sm_dev)); 1755 un->c.un_flag |= (su->c.un_flag & MD_LABELED); 1756 /* submirror can no longer be soft partitioned */ 1757 MD_CAPAB(su) &= (~MD_CAN_SP); 1758 } 1759 } 1760 smic->sm_shared_by_blk = md_get_named_service(sm->sm_dev, 1761 0, "shared by blk", 0); 1762 smic->sm_shared_by_indx = md_get_named_service(sm->sm_dev, 1763 0, 
"shared by indx", 0); 1764 smic->sm_get_component_count = (int (*)())md_get_named_service( 1765 sm->sm_dev, 0, "get component count", 0); 1766 smic->sm_get_bcss = (int (*)())md_get_named_service(sm->sm_dev, 0, 1767 "get block count skip size", 0); 1768 sm->sm_state &= ~SMS_IGNORE; 1769 if (SMS_IS(sm, SMS_OFFLINE)) 1770 MD_STATUS(un) |= MD_UN_OFFLINE_SM; 1771 md_set_parent(sm->sm_dev, MD_SID(un)); 1772 } 1773 1774 static void 1775 mirror_cleanup(mm_unit_t *un) 1776 { 1777 mddb_recid_t recid; 1778 int smi; 1779 sv_dev_t sv[NMIRROR]; 1780 int nsv = 0; 1781 1782 /* 1783 * If a MN diskset and this node is not the master, do 1784 * not delete any records on snarf of the mirror records. 1785 */ 1786 if (MD_MNSET_SETNO(MD_UN2SET(un)) && 1787 md_set[MD_UN2SET(un)].s_am_i_master == 0) { 1788 return; 1789 } 1790 1791 for (smi = 0; smi < NMIRROR; smi++) { 1792 if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) 1793 continue; 1794 sv[nsv].setno = MD_UN2SET(un); 1795 sv[nsv++].key = un->un_sm[smi].sm_key; 1796 } 1797 1798 recid = un->un_rr_dirty_recid; 1799 mddb_deleterec_wrapper(un->c.un_record_id); 1800 if (recid > 0) 1801 mddb_deleterec_wrapper(recid); 1802 1803 md_rem_names(sv, nsv); 1804 } 1805 1806 /* 1807 * Comparison function for the avl tree which tracks 1808 * outstanding writes on submirrors. 1809 * 1810 * Returns: 1811 * -1: ps1 < ps2 1812 * 0: ps1 and ps2 overlap 1813 * 1: ps1 > ps2 1814 */ 1815 static int 1816 mirror_overlap_compare(const void *p1, const void *p2) 1817 { 1818 const md_mps_t *ps1 = (md_mps_t *)p1; 1819 const md_mps_t *ps2 = (md_mps_t *)p2; 1820 1821 if (ps1->ps_firstblk < ps2->ps_firstblk) { 1822 if (ps1->ps_lastblk >= ps2->ps_firstblk) 1823 return (0); 1824 return (-1); 1825 } 1826 1827 if (ps1->ps_firstblk > ps2->ps_firstblk) { 1828 if (ps1->ps_firstblk <= ps2->ps_lastblk) 1829 return (0); 1830 return (1); 1831 } 1832 1833 return (0); 1834 } 1835 1836 /* 1837 * Collapse any sparse submirror entries snarfed from the on-disk replica. 1838 * Only the in-core entries are updated. The replica will be updated on-disk 1839 * when the in-core replica is committed on shutdown of the SVM subsystem. 1840 */ 1841 static void 1842 collapse_submirrors(mm_unit_t *un) 1843 { 1844 int smi, nremovals, smiremove; 1845 mm_submirror_t *sm, *new_sm, *old_sm; 1846 mm_submirror_ic_t *smic; 1847 int nsmidx = un->un_nsm - 1; 1848 1849 rescan: 1850 nremovals = 0; 1851 smiremove = -1; 1852 1853 for (smi = 0; smi <= nsmidx; smi++) { 1854 sm = &un->un_sm[smi]; 1855 1856 /* 1857 * Check to see if this submirror is marked as in-use. 1858 * If it isn't then it is a potential sparse entry and 1859 * may need to be cleared from the configuration. 1860 * The records should _already_ have been cleared by the 1861 * original mirror_detach() code, but we need to shuffle 1862 * any NULL entries in un_sm[] to the end of the array. 1863 * Any NULL un_smic[] entries need to be reset to the underlying 1864 * submirror/slice accessor functions. 
1865 */ 1866 if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) { 1867 nremovals++; 1868 smiremove = smi; 1869 break; 1870 } 1871 } 1872 1873 if (nremovals == 0) { 1874 /* 1875 * Ensure that we have a matching contiguous set of un_smic[] 1876 * entries for the corresponding un_sm[] entries 1877 */ 1878 for (smi = 0; smi <= nsmidx; smi++) { 1879 smic = &un->un_smic[smi]; 1880 sm = &un->un_sm[smi]; 1881 1882 smic->sm_shared_by_blk = 1883 md_get_named_service(sm->sm_dev, 0, 1884 "shared by_blk", 0); 1885 smic->sm_shared_by_indx = 1886 md_get_named_service(sm->sm_dev, 0, 1887 "shared by indx", 0); 1888 smic->sm_get_component_count = 1889 (int (*)())md_get_named_service(sm->sm_dev, 0, 1890 "get component count", 0); 1891 smic->sm_get_bcss = 1892 (int (*)())md_get_named_service(sm->sm_dev, 0, 1893 "get block count skip size", 0); 1894 } 1895 return; 1896 } 1897 1898 /* 1899 * Reshuffle the submirror devices so that we do not have a dead record 1900 * in the middle of the array. Once we've done this we need to rescan 1901 * the mirror to check for any other holes. 1902 */ 1903 for (smi = 0; smi < NMIRROR; smi++) { 1904 if (smi < smiremove) 1905 continue; 1906 if (smi > smiremove) { 1907 old_sm = &un->un_sm[smi]; 1908 new_sm = &un->un_sm[smi - 1]; 1909 bcopy(old_sm, new_sm, sizeof (mm_submirror_t)); 1910 bzero(old_sm, sizeof (mm_submirror_t)); 1911 } 1912 } 1913 1914 /* 1915 * Now we need to rescan the array to find the next potential dead 1916 * entry. 1917 */ 1918 goto rescan; 1919 } 1920 1921 /* Return a -1 if optimized record unavailable and set should be released */ 1922 int 1923 mirror_build_incore(mm_unit_t *un, int snarfing) 1924 { 1925 int i; 1926 1927 if (MD_STATUS(un) & MD_UN_BEING_RESET) { 1928 mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCLEAN); 1929 return (1); 1930 } 1931 1932 if (mirror_are_submirrors_available(un) == 0) 1933 return (1); 1934 1935 if (MD_UNIT(MD_SID(un)) != NULL) 1936 return (0); 1937 1938 MD_STATUS(un) = 0; 1939 1940 /* pre-4.1 didn't define CAN_META_CHILD capability */ 1941 MD_CAPAB(un) = MD_CAN_META_CHILD | MD_CAN_PARENT | MD_CAN_SP; 1942 1943 un->un_overlap_tree_flag = 0; 1944 avl_create(&un->un_overlap_root, mirror_overlap_compare, 1945 sizeof (md_mps_t), offsetof(md_mps_t, ps_overlap_node)); 1946 1947 /* 1948 * We need to collapse any sparse submirror entries into a non-sparse 1949 * array. This is to cover the case where we have an old replica image 1950 * which has not been updated (i.e. snarfed) since being modified. 1951 * The new code expects all submirror access to be sequential (i.e. 1952 * both the un_sm[] and un_smic[] entries correspond to non-empty 1953 * submirrors. 1954 */ 1955 1956 collapse_submirrors(un); 1957 1958 for (i = 0; i < NMIRROR; i++) 1959 build_submirror(un, i, snarfing); 1960 1961 if (unit_setup_resync(un, snarfing) != 0) { 1962 if (snarfing) { 1963 mddb_setrecprivate(un->c.un_record_id, MD_PRV_GOTIT); 1964 /* 1965 * If a MN set and set is not stale, then return -1 1966 * which will force the caller to unload the set. 1967 * The MN diskset nodes will return failure if 1968 * unit_setup_resync fails so that nodes won't 1969 * get out of sync. 1970 * 1971 * If set is STALE, the master node can't allocate 1972 * a resync record (if needed), but node needs to 1973 * join the set so that user can delete broken mddbs. 1974 * So, if set is STALE, just continue on. 
1975 */ 1976 if (MD_MNSET_SETNO(MD_UN2SET(un)) && 1977 !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) { 1978 return (-1); 1979 } 1980 } else 1981 return (1); 1982 } 1983 1984 mutex_init(&un->un_overlap_tree_mx, NULL, MUTEX_DEFAULT, NULL); 1985 cv_init(&un->un_overlap_tree_cv, NULL, CV_DEFAULT, NULL); 1986 1987 un->un_suspend_wr_flag = 0; 1988 mutex_init(&un->un_suspend_wr_mx, NULL, MUTEX_DEFAULT, NULL); 1989 cv_init(&un->un_suspend_wr_cv, NULL, CV_DEFAULT, NULL); 1990 1991 /* 1992 * Allocate mutexes for mirror-owner and resync-owner changes. 1993 * All references to the owner message state field must be guarded 1994 * by this mutex. 1995 */ 1996 mutex_init(&un->un_owner_mx, NULL, MUTEX_DEFAULT, NULL); 1997 1998 /* 1999 * Allocate mutex and condvar for resync thread manipulation. These 2000 * will be used by mirror_resync_unit/mirror_ioctl_resync 2001 */ 2002 mutex_init(&un->un_rs_thread_mx, NULL, MUTEX_DEFAULT, NULL); 2003 cv_init(&un->un_rs_thread_cv, NULL, CV_DEFAULT, NULL); 2004 2005 /* 2006 * Allocate mutex and condvar for resync progress thread manipulation. 2007 * This allows resyncs to be continued across an intervening reboot. 2008 */ 2009 mutex_init(&un->un_rs_progress_mx, NULL, MUTEX_DEFAULT, NULL); 2010 cv_init(&un->un_rs_progress_cv, NULL, CV_DEFAULT, NULL); 2011 2012 /* 2013 * Allocate mutex and condvar for Directed Mirror Reads (DMR). This 2014 * provides synchronization between a user-ioctl and the resulting 2015 * strategy() call that performs the read(). 2016 */ 2017 mutex_init(&un->un_dmr_mx, NULL, MUTEX_DEFAULT, NULL); 2018 cv_init(&un->un_dmr_cv, NULL, CV_DEFAULT, NULL); 2019 2020 /* 2021 * Allocate rwlocks for un_pernode_dirty_bm accessing. 2022 */ 2023 for (i = 0; i < MD_MNMAXSIDES; i++) { 2024 rw_init(&un->un_pernode_dirty_mx[i], NULL, RW_DEFAULT, NULL); 2025 } 2026 2027 /* place various information in the in-core data structures */ 2028 md_nblocks_set(MD_SID(un), un->c.un_total_blocks); 2029 MD_UNIT(MD_SID(un)) = un; 2030 2031 return (0); 2032 } 2033 2034 2035 void 2036 reset_mirror(struct mm_unit *un, minor_t mnum, int removing) 2037 { 2038 mddb_recid_t recid, vtoc_id; 2039 size_t bitcnt; 2040 size_t shortcnt; 2041 int smi; 2042 sv_dev_t sv[NMIRROR]; 2043 int nsv = 0; 2044 uint_t bits = 0; 2045 minor_t selfid; 2046 md_unit_t *su; 2047 int i; 2048 2049 md_destroy_unit_incore(mnum, &mirror_md_ops); 2050 2051 shortcnt = un->un_rrd_num * sizeof (short); 2052 bitcnt = howmany(un->un_rrd_num, NBBY); 2053 2054 if (un->un_outstanding_writes) 2055 kmem_free((caddr_t)un->un_outstanding_writes, shortcnt); 2056 if (un->un_goingclean_bm) 2057 kmem_free((caddr_t)un->un_goingclean_bm, bitcnt); 2058 if (un->un_goingdirty_bm) 2059 kmem_free((caddr_t)un->un_goingdirty_bm, bitcnt); 2060 if (un->un_resync_bm) 2061 kmem_free((caddr_t)un->un_resync_bm, bitcnt); 2062 if (un->un_pernode_dirty_sum) 2063 kmem_free((caddr_t)un->un_pernode_dirty_sum, un->un_rrd_num); 2064 2065 /* 2066 * Destroy the taskq for deferred processing of DRL clean requests. 2067 * This taskq will only be present for Multi Owner mirrors. 
2068 */ 2069 if (un->un_drl_task != NULL) 2070 ddi_taskq_destroy(un->un_drl_task); 2071 2072 md_nblocks_set(mnum, -1ULL); 2073 MD_UNIT(mnum) = NULL; 2074 2075 /* 2076 * Attempt release of its minor node 2077 */ 2078 md_remove_minor_node(mnum); 2079 2080 if (!removing) 2081 return; 2082 2083 for (smi = 0; smi < NMIRROR; smi++) { 2084 if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) 2085 continue; 2086 /* reallow soft partitioning of submirror and reset parent */ 2087 su = MD_UNIT(md_getminor(un->un_sm[smi].sm_dev)); 2088 MD_CAPAB(su) |= MD_CAN_SP; 2089 md_reset_parent(un->un_sm[smi].sm_dev); 2090 reset_comp_states(&un->un_sm[smi], &un->un_smic[smi]); 2091 2092 sv[nsv].setno = MD_MIN2SET(mnum); 2093 sv[nsv++].key = un->un_sm[smi].sm_key; 2094 bits |= SMI2BIT(smi); 2095 } 2096 2097 MD_STATUS(un) |= MD_UN_BEING_RESET; 2098 recid = un->un_rr_dirty_recid; 2099 vtoc_id = un->c.un_vtoc_id; 2100 selfid = MD_SID(un); 2101 2102 mirror_commit(un, bits, 0); 2103 2104 avl_destroy(&un->un_overlap_root); 2105 2106 /* Destroy all mutexes and condvars before returning. */ 2107 mutex_destroy(&un->un_suspend_wr_mx); 2108 cv_destroy(&un->un_suspend_wr_cv); 2109 mutex_destroy(&un->un_overlap_tree_mx); 2110 cv_destroy(&un->un_overlap_tree_cv); 2111 mutex_destroy(&un->un_owner_mx); 2112 mutex_destroy(&un->un_rs_thread_mx); 2113 cv_destroy(&un->un_rs_thread_cv); 2114 mutex_destroy(&un->un_rs_progress_mx); 2115 cv_destroy(&un->un_rs_progress_cv); 2116 mutex_destroy(&un->un_dmr_mx); 2117 cv_destroy(&un->un_dmr_cv); 2118 2119 for (i = 0; i < MD_MNMAXSIDES; i++) { 2120 rw_destroy(&un->un_pernode_dirty_mx[i]); 2121 if (un->un_pernode_dirty_bm[i]) 2122 kmem_free((caddr_t)un->un_pernode_dirty_bm[i], bitcnt); 2123 } 2124 2125 /* 2126 * Remove self from the namespace 2127 */ 2128 if (un->c.un_revision & MD_FN_META_DEV) { 2129 (void) md_rem_selfname(un->c.un_self_id); 2130 } 2131 2132 /* This frees the unit structure. */ 2133 mddb_deleterec_wrapper(un->c.un_record_id); 2134 2135 if (recid != 0) 2136 mddb_deleterec_wrapper(recid); 2137 2138 /* Remove the vtoc, if present */ 2139 if (vtoc_id) 2140 mddb_deleterec_wrapper(vtoc_id); 2141 2142 md_rem_names(sv, nsv); 2143 2144 SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE, 2145 MD_MIN2SET(selfid), selfid); 2146 } 2147 2148 int 2149 mirror_internal_open( 2150 minor_t mnum, 2151 int flag, 2152 int otyp, 2153 int md_oflags, 2154 IOLOCK *lockp /* can be NULL */ 2155 ) 2156 { 2157 mdi_unit_t *ui = MDI_UNIT(mnum); 2158 int err = 0; 2159 2160 tryagain: 2161 /* single thread */ 2162 if (lockp) { 2163 /* 2164 * If ioctl lock is held, use openclose_enter 2165 * routine that will set the ioctl flag when 2166 * grabbing the readerlock. 2167 */ 2168 (void) md_ioctl_openclose_enter(lockp, ui); 2169 } else { 2170 (void) md_unit_openclose_enter(ui); 2171 } 2172 2173 /* 2174 * The mirror_open_all_devs routine may end up sending a STATE_UPDATE 2175 * message in a MN diskset and this requires that the openclose 2176 * lock is dropped in order to send this message. So, another 2177 * flag (MD_UL_OPENINPROGRESS) is used to keep another thread from 2178 * attempting an open while this thread has an open in progress. 2179 * Call the *_lh version of the lock exit routines since the ui_mx 2180 * mutex must be held from checking for OPENINPROGRESS until 2181 * after the cv_wait call. 
2182 */ 2183 mutex_enter(&ui->ui_mx); 2184 if (ui->ui_lock & MD_UL_OPENINPROGRESS) { 2185 if (lockp) { 2186 (void) md_ioctl_openclose_exit_lh(lockp); 2187 } else { 2188 md_unit_openclose_exit_lh(ui); 2189 } 2190 cv_wait(&ui->ui_cv, &ui->ui_mx); 2191 mutex_exit(&ui->ui_mx); 2192 goto tryagain; 2193 } 2194 2195 ui->ui_lock |= MD_UL_OPENINPROGRESS; 2196 mutex_exit(&ui->ui_mx); 2197 2198 /* open devices, if necessary */ 2199 if (! md_unit_isopen(ui) || (ui->ui_tstate & MD_INACCESSIBLE)) { 2200 if ((err = mirror_open_all_devs(mnum, md_oflags, lockp)) != 0) 2201 goto out; 2202 } 2203 2204 /* count open */ 2205 if ((err = md_unit_incopen(mnum, flag, otyp)) != 0) 2206 goto out; 2207 2208 /* unlock, return success */ 2209 out: 2210 mutex_enter(&ui->ui_mx); 2211 ui->ui_lock &= ~MD_UL_OPENINPROGRESS; 2212 mutex_exit(&ui->ui_mx); 2213 2214 if (lockp) { 2215 /* 2216 * If ioctl lock is held, use openclose_exit 2217 * routine that will clear the lockp reader flag. 2218 */ 2219 (void) md_ioctl_openclose_exit(lockp); 2220 } else { 2221 md_unit_openclose_exit(ui); 2222 } 2223 return (err); 2224 } 2225 2226 int 2227 mirror_internal_close( 2228 minor_t mnum, 2229 int otyp, 2230 int md_cflags, 2231 IOLOCK *lockp /* can be NULL */ 2232 ) 2233 { 2234 mdi_unit_t *ui = MDI_UNIT(mnum); 2235 mm_unit_t *un; 2236 int err = 0; 2237 2238 /* single thread */ 2239 if (lockp) { 2240 /* 2241 * If ioctl lock is held, use openclose_enter 2242 * routine that will set the ioctl flag when 2243 * grabbing the readerlock. 2244 */ 2245 un = (mm_unit_t *)md_ioctl_openclose_enter(lockp, ui); 2246 } else { 2247 un = (mm_unit_t *)md_unit_openclose_enter(ui); 2248 } 2249 2250 /* count closed */ 2251 if ((err = md_unit_decopen(mnum, otyp)) != 0) 2252 goto out; 2253 2254 /* close devices, if necessary */ 2255 if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) { 2256 /* 2257 * Clean up dirty bitmap for this unit. Do this 2258 * before closing the underlying devices to avoid 2259 * race conditions with reset_mirror() as a 2260 * result of a 'metaset -r' command running in 2261 * parallel. This might cause deallocation of 2262 * dirty region bitmaps; with underlying metadevices 2263 * in place this can't happen. 2264 * Don't do this if a MN set and ABR not set 2265 */ 2266 if (new_resync && !(MD_STATUS(un) & MD_UN_KEEP_DIRTY)) { 2267 if (!MD_MNSET_SETNO(MD_UN2SET(un)) || 2268 !(ui->ui_tstate & MD_ABR_CAP)) 2269 mirror_process_unit_resync(un); 2270 } 2271 (void) mirror_close_all_devs(un, md_cflags); 2272 2273 /* 2274 * For a MN set with transient capabilities (eg ABR/DMR) set, 2275 * clear these capabilities on the last open in the cluster. 2276 * To do this we send a message to all nodes to see of the 2277 * device is open. 2278 */ 2279 if (MD_MNSET_SETNO(MD_UN2SET(un)) && 2280 (ui->ui_tstate & (MD_ABR_CAP|MD_DMR_CAP))) { 2281 if (lockp) { 2282 (void) md_ioctl_openclose_exit(lockp); 2283 } else { 2284 md_unit_openclose_exit(ui); 2285 } 2286 2287 /* 2288 * if we are in the context of an ioctl, drop the 2289 * ioctl lock. 2290 * Otherwise, no other locks should be held. 2291 */ 2292 if (lockp) { 2293 IOLOCK_RETURN_RELEASE(0, lockp); 2294 } 2295 2296 mdmn_clear_all_capabilities(mnum); 2297 2298 /* if dropped the lock previously, regain it */ 2299 if (lockp) { 2300 IOLOCK_RETURN_REACQUIRE(lockp); 2301 } 2302 return (0); 2303 } 2304 /* unlock and return success */ 2305 } 2306 out: 2307 /* Call whether lockp is NULL or not. 
*/ 2308 if (lockp) { 2309 md_ioctl_openclose_exit(lockp); 2310 } else { 2311 md_unit_openclose_exit(ui); 2312 } 2313 return (err); 2314 } 2315 2316 /* 2317 * When a component has completed resyncing and is now ok, check if the 2318 * corresponding component in the other submirrors is in the Last Erred 2319 * state. If it is, we want to change that to the Erred state so we stop 2320 * using that component and start using this good component instead. 2321 * 2322 * This is called from set_sm_comp_state and recursively calls 2323 * set_sm_comp_state if it needs to change the Last Erred state. 2324 */ 2325 static void 2326 reset_lasterred(mm_unit_t *un, int smi, mddb_recid_t *extras, uint_t flags, 2327 IOLOCK *lockp) 2328 { 2329 mm_submirror_t *sm; 2330 mm_submirror_ic_t *smic; 2331 int ci; 2332 int i; 2333 int compcnt; 2334 int changed = 0; 2335 2336 for (i = 0; i < NMIRROR; i++) { 2337 sm = &un->un_sm[i]; 2338 smic = &un->un_smic[i]; 2339 2340 if (!SMS_IS(sm, SMS_INUSE)) 2341 continue; 2342 2343 /* ignore the submirror that we just made ok */ 2344 if (i == smi) 2345 continue; 2346 2347 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un); 2348 for (ci = 0; ci < compcnt; ci++) { 2349 md_m_shared_t *shared; 2350 2351 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) 2352 (sm->sm_dev, sm, ci); 2353 2354 if ((shared->ms_state & CS_LAST_ERRED) && 2355 !mirror_other_sources(un, i, ci, 1)) { 2356 2357 set_sm_comp_state(un, i, ci, CS_ERRED, extras, 2358 flags, lockp); 2359 changed = 1; 2360 } 2361 } 2362 } 2363 2364 /* maybe there is a hotspare for this newly erred component */ 2365 if (changed) { 2366 set_t setno; 2367 2368 setno = MD_UN2SET(un); 2369 if (MD_MNSET_SETNO(setno)) { 2370 send_poke_hotspares(setno); 2371 } else { 2372 (void) poke_hotspares(); 2373 } 2374 } 2375 } 2376 2377 /* 2378 * set_sm_comp_state 2379 * 2380 * Set the state of a submirror component to the specified new state. 2381 * If the mirror is in a multi-node set, send messages to all nodes to 2382 * block all writes to the mirror and then update the state and release the 2383 * writes. These messages are only sent if MD_STATE_XMIT is set in flags. 2384 * MD_STATE_XMIT will be unset in 2 cases: 2385 * 1. When the state is changed to CS_RESYNC as this state change 2386 * will already have been updated on each node by the processing of the 2387 * distributed metasync command, hence no need to xmit. 2388 * 2. When the state is change to CS_OKAY after a resync has completed. Again 2389 * the resync completion will already have been processed on each node by 2390 * the processing of the MD_MN_MSG_RESYNC_PHASE_DONE message for a component 2391 * resync, hence no need to xmit. 2392 * 2393 * In case we are called from the updates of a watermark, 2394 * (then MD_STATE_WMUPDATE will be set in the ps->flags) this is due to 2395 * a metainit or similar. In this case the message that we sent to propagate 2396 * the state change must not be a class1 message as that would deadlock with 2397 * the metainit command that is still being processed. 2398 * This we achieve by creating a class2 message MD_MN_MSG_STATE_UPDATE2 2399 * instead. This also makes the submessage generator to create a class2 2400 * submessage rather than a class1 (which would also block) 2401 * 2402 * On entry, unit_writerlock is held 2403 * If MD_STATE_OCHELD is set in flags, then unit_openclose lock is 2404 * also held. 
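 *
 * For example, the error-processing path that runs outside of ioctl
 * context (see error_update_unit()) marks a component errored with a
 * call of roughly this form:
 *
 *     set_sm_comp_state(un, smi, ci, CS_ERRED, 0, MD_STATE_XMIT,
 *         (IOLOCK *)NULL);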
2405 */ 2406 void 2407 set_sm_comp_state( 2408 mm_unit_t *un, 2409 int smi, 2410 int ci, 2411 int newstate, 2412 mddb_recid_t *extras, 2413 uint_t flags, 2414 IOLOCK *lockp 2415 ) 2416 { 2417 mm_submirror_t *sm; 2418 mm_submirror_ic_t *smic; 2419 md_m_shared_t *shared; 2420 int origstate; 2421 void (*get_dev)(); 2422 ms_cd_info_t cd; 2423 char devname[MD_MAX_CTDLEN]; 2424 int err; 2425 set_t setno = MD_UN2SET(un); 2426 md_mn_msg_stch_t stchmsg; 2427 mdi_unit_t *ui = MDI_UNIT(MD_SID(un)); 2428 md_mn_kresult_t *kresult; 2429 int rval; 2430 uint_t msgflags; 2431 md_mn_msgtype_t msgtype; 2432 int save_lock = 0; 2433 mdi_unit_t *ui_sm; 2434 int nretries = 0; 2435 2436 sm = &un->un_sm[smi]; 2437 smic = &un->un_smic[smi]; 2438 2439 /* If we have a real error status then turn off MD_INACCESSIBLE. */ 2440 ui_sm = MDI_UNIT(getminor(md_dev64_to_dev(sm->sm_dev))); 2441 if (newstate & (CS_ERRED | CS_RESYNC | CS_LAST_ERRED) && 2442 ui_sm->ui_tstate & MD_INACCESSIBLE) { 2443 ui_sm->ui_tstate &= ~MD_INACCESSIBLE; 2444 } 2445 2446 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) 2447 (sm->sm_dev, sm, ci); 2448 origstate = shared->ms_state; 2449 2450 /* 2451 * If the new state is an error and the old one wasn't, generate 2452 * a console message. We do this before we send the state to other 2453 * nodes in a MN set because the state change may change the component 2454 * name if a hotspare is allocated. 2455 */ 2456 if ((! (origstate & (CS_ERRED|CS_LAST_ERRED))) && 2457 (newstate & (CS_ERRED|CS_LAST_ERRED))) { 2458 2459 get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0, 2460 "get device", 0); 2461 (void) (*get_dev)(sm->sm_dev, sm, ci, &cd); 2462 2463 err = md_getdevname(setno, mddb_getsidenum(setno), 0, 2464 cd.cd_dev, devname, sizeof (devname)); 2465 2466 if (err == ENOENT) { 2467 (void) md_devname(setno, cd.cd_dev, devname, 2468 sizeof (devname)); 2469 } 2470 2471 cmn_err(CE_WARN, "md: %s: %s needs maintenance", 2472 md_shortname(md_getminor(sm->sm_dev)), devname); 2473 2474 if (newstate & CS_LAST_ERRED) { 2475 cmn_err(CE_WARN, "md: %s: %s last erred", 2476 md_shortname(md_getminor(sm->sm_dev)), 2477 devname); 2478 2479 } else if (shared->ms_flags & MDM_S_ISOPEN) { 2480 /* 2481 * Close the broken device and clear the open flag on 2482 * it. Closing the device means the RCM framework will 2483 * be able to unconfigure the device if required. 2484 * 2485 * We have to check that the device is open, otherwise 2486 * the first open on it has resulted in the error that 2487 * is being processed and the actual cd.cd_dev will be 2488 * NODEV64. 2489 * 2490 * If this is a multi-node mirror, then the multinode 2491 * state checks following this code will cause the 2492 * slave nodes to close the mirror in the function 2493 * mirror_set_state(). 2494 */ 2495 md_layered_close(cd.cd_dev, MD_OFLG_NULL); 2496 shared->ms_flags &= ~MDM_S_ISOPEN; 2497 } 2498 2499 } else if ((origstate & CS_LAST_ERRED) && (newstate & CS_ERRED) && 2500 (shared->ms_flags & MDM_S_ISOPEN)) { 2501 /* 2502 * Similar to logic above except no log messages since we 2503 * are just transitioning from Last Erred to Erred. 
2504 */ 2505 get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0, 2506 "get device", 0); 2507 (void) (*get_dev)(sm->sm_dev, sm, ci, &cd); 2508 2509 md_layered_close(cd.cd_dev, MD_OFLG_NULL); 2510 shared->ms_flags &= ~MDM_S_ISOPEN; 2511 } 2512 2513 if ((MD_MNSET_SETNO(setno)) && (origstate != newstate) && 2514 (flags & MD_STATE_XMIT) && !(ui->ui_tstate & MD_ERR_PENDING)) { 2515 /* 2516 * For a multi-node mirror, send the state change to the 2517 * master, which broadcasts to all nodes, including this 2518 * one. Once the message is received, the state is set 2519 * in-core and the master commits the change to disk. 2520 * There is a case, comp_replace, where this function 2521 * can be called from within an ioctl and therefore in this 2522 * case, as the ioctl will already be called on each node, 2523 * there is no need to xmit the state change to the master for 2524 * distribution to the other nodes. MD_STATE_XMIT flag is used 2525 * to indicate whether a xmit is required. The mirror's 2526 * transient state is set to MD_ERR_PENDING to avoid sending 2527 * multiple messages. 2528 */ 2529 if (newstate & (CS_ERRED|CS_LAST_ERRED)) 2530 ui->ui_tstate |= MD_ERR_PENDING; 2531 2532 /* 2533 * Send a state update message to all nodes. This message 2534 * will generate 2 submessages, the first one to suspend 2535 * all writes to the mirror and the second to update the 2536 * state and resume writes. 2537 */ 2538 stchmsg.msg_stch_mnum = un->c.un_self_id; 2539 stchmsg.msg_stch_sm = smi; 2540 stchmsg.msg_stch_comp = ci; 2541 stchmsg.msg_stch_new_state = newstate; 2542 stchmsg.msg_stch_hs_id = shared->ms_hs_id; 2543 #ifdef DEBUG 2544 if (mirror_debug_flag) 2545 printf("send set state, %x, %x, %x, %x, %x\n", 2546 stchmsg.msg_stch_mnum, stchmsg.msg_stch_sm, 2547 stchmsg.msg_stch_comp, stchmsg.msg_stch_new_state, 2548 stchmsg.msg_stch_hs_id); 2549 #endif 2550 if (flags & MD_STATE_WMUPDATE) { 2551 msgtype = MD_MN_MSG_STATE_UPDATE2; 2552 /* 2553 * When coming from an update of watermarks, there 2554 * must already be a message logged that triggered 2555 * this action. So, no need to log this message, too. 2556 */ 2557 msgflags = MD_MSGF_NO_LOG; 2558 } else { 2559 msgtype = MD_MN_MSG_STATE_UPDATE; 2560 msgflags = MD_MSGF_DEFAULT_FLAGS; 2561 } 2562 2563 /* 2564 * If we are in the context of an ioctl, drop the ioctl lock. 2565 * lockp holds the list of locks held. 2566 * 2567 * Otherwise, increment the appropriate reacquire counters. 2568 * If openclose lock is *held, then must reacquire reader 2569 * lock before releasing the openclose lock. 2570 * Do not drop the ARRAY_WRITER lock as we may not be able 2571 * to reacquire it. 2572 */ 2573 if (lockp) { 2574 if (lockp->l_flags & MD_ARRAY_WRITER) { 2575 save_lock = MD_ARRAY_WRITER; 2576 lockp->l_flags &= ~MD_ARRAY_WRITER; 2577 } else if (lockp->l_flags & MD_ARRAY_READER) { 2578 save_lock = MD_ARRAY_READER; 2579 lockp->l_flags &= ~MD_ARRAY_READER; 2580 } 2581 IOLOCK_RETURN_RELEASE(0, lockp); 2582 } else { 2583 if (flags & MD_STATE_OCHELD) { 2584 md_unit_writerexit(ui); 2585 (void) md_unit_readerlock(ui); 2586 md_unit_openclose_exit(ui); 2587 } else { 2588 md_unit_writerexit(ui); 2589 } 2590 } 2591 2592 kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); 2593 sscs_msg: 2594 rval = mdmn_ksend_message(setno, msgtype, msgflags, 0, 2595 (char *)&stchmsg, sizeof (stchmsg), kresult); 2596 2597 if (!MDMN_KSEND_MSG_OK(rval, kresult)) { 2598 mdmn_ksend_show_error(rval, kresult, "STATE UPDATE"); 2599 /* If we're shutting down already, pause things here. 
*/ 2600 if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) { 2601 while (!md_mn_is_commd_present()) { 2602 delay(md_hz); 2603 } 2604 /* 2605 * commd is now available; retry the message 2606 * one time. If that fails we fall through and 2607 * panic as the system is in an unexpected state 2608 */ 2609 if (nretries++ == 0) 2610 goto sscs_msg; 2611 } 2612 cmn_err(CE_PANIC, 2613 "ksend_message failure: STATE_UPDATE"); 2614 } 2615 kmem_free(kresult, sizeof (md_mn_kresult_t)); 2616 2617 /* if dropped the lock previously, regain it */ 2618 if (lockp) { 2619 IOLOCK_RETURN_REACQUIRE(lockp); 2620 lockp->l_flags |= save_lock; 2621 } else { 2622 /* 2623 * Reacquire dropped locks and update acquirecnts 2624 * appropriately. 2625 */ 2626 if (flags & MD_STATE_OCHELD) { 2627 /* 2628 * openclose also grabs readerlock. 2629 */ 2630 (void) md_unit_openclose_enter(ui); 2631 md_unit_readerexit(ui); 2632 (void) md_unit_writerlock(ui); 2633 } else { 2634 (void) md_unit_writerlock(ui); 2635 } 2636 } 2637 2638 ui->ui_tstate &= ~MD_ERR_PENDING; 2639 } else { 2640 shared->ms_state = newstate; 2641 uniqtime32(&shared->ms_timestamp); 2642 2643 if (newstate == CS_ERRED) 2644 shared->ms_flags |= MDM_S_NOWRITE; 2645 else 2646 shared->ms_flags &= ~MDM_S_NOWRITE; 2647 2648 shared->ms_flags &= ~MDM_S_IOERR; 2649 un->un_changecnt++; 2650 shared->ms_lasterrcnt = un->un_changecnt; 2651 2652 mirror_set_sm_state(sm, smic, SMS_RUNNING, 0); 2653 mirror_commit(un, SMI2BIT(smi), extras); 2654 } 2655 2656 if ((origstate & CS_RESYNC) && (newstate & CS_OKAY)) { 2657 /* 2658 * Resetting the Last Erred state will recursively call back 2659 * into this function (set_sm_comp_state) to update the state. 2660 */ 2661 reset_lasterred(un, smi, extras, flags, lockp); 2662 } 2663 } 2664 2665 static int 2666 find_another_logical( 2667 mm_unit_t *un, 2668 mm_submirror_t *esm, 2669 diskaddr_t blk, 2670 u_longlong_t cnt, 2671 int must_be_open, 2672 int state, 2673 int err_cnt) 2674 { 2675 u_longlong_t cando; 2676 md_dev64_t dev; 2677 md_m_shared_t *s; 2678 2679 esm->sm_state |= SMS_IGNORE; 2680 while (cnt != 0) { 2681 u_longlong_t mcnt; 2682 2683 mcnt = MIN(cnt, lbtodb(1024 * 1024 * 1024)); /* 1 Gig Blks */ 2684 2685 dev = select_read_unit(un, blk, mcnt, &cando, 2686 must_be_open, &s, NULL); 2687 if (dev == (md_dev64_t)0) 2688 break; 2689 2690 if ((state == CS_LAST_ERRED) && 2691 (s->ms_state == CS_LAST_ERRED) && 2692 (err_cnt > s->ms_lasterrcnt)) 2693 break; 2694 2695 cnt -= cando; 2696 blk += cando; 2697 } 2698 esm->sm_state &= ~SMS_IGNORE; 2699 return (cnt != 0); 2700 } 2701 2702 int 2703 mirror_other_sources(mm_unit_t *un, int smi, int ci, int must_be_open) 2704 { 2705 mm_submirror_t *sm; 2706 mm_submirror_ic_t *smic; 2707 size_t count; 2708 diskaddr_t block; 2709 u_longlong_t skip; 2710 u_longlong_t size; 2711 md_dev64_t dev; 2712 int cnt; 2713 md_m_shared_t *s; 2714 int not_found; 2715 2716 sm = &un->un_sm[smi]; 2717 smic = &un->un_smic[smi]; 2718 dev = sm->sm_dev; 2719 2720 /* 2721 * Make sure every component of the submirror 2722 * has other sources. 
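 *
 * A negative ci means "check every component of submirror smi"; the
 * per-component check is then applied to each component in turn.
 * Illustrative use (not a call taken from this file):
 *
 *     not_found = mirror_other_sources(un, smi, -1, 0);
 *
 * where a non-zero not_found means at least one component of the
 * submirror has no other valid source for its data.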
2723 */ 2724 if (ci < 0) { 2725 /* Find the highest lasterrcnt */ 2726 cnt = (*(smic->sm_get_component_count))(dev, sm); 2727 for (ci = 0; ci < cnt; ci++) { 2728 not_found = mirror_other_sources(un, smi, ci, 2729 must_be_open); 2730 if (not_found) 2731 return (1); 2732 } 2733 return (0); 2734 } 2735 2736 /* 2737 * Make sure this component has other sources 2738 */ 2739 (void) (*(smic->sm_get_bcss)) 2740 (dev, sm, ci, &block, &count, &skip, &size); 2741 2742 if (count == 0) 2743 return (1); 2744 2745 s = (md_m_shared_t *)(*(smic->sm_shared_by_indx))(dev, sm, ci); 2746 2747 while (count--) { 2748 if (block >= un->c.un_total_blocks) 2749 return (0); 2750 2751 if ((block + size) > un->c.un_total_blocks) 2752 size = un->c.un_total_blocks - block; 2753 2754 not_found = find_another_logical(un, sm, block, size, 2755 must_be_open, s->ms_state, s->ms_lasterrcnt); 2756 if (not_found) 2757 return (1); 2758 2759 block += size + skip; 2760 } 2761 return (0); 2762 } 2763 2764 static void 2765 finish_error(md_mps_t *ps) 2766 { 2767 struct buf *pb; 2768 mm_unit_t *un; 2769 mdi_unit_t *ui; 2770 uint_t new_str_flags; 2771 2772 pb = ps->ps_bp; 2773 un = ps->ps_un; 2774 ui = ps->ps_ui; 2775 2776 /* 2777 * Must flag any error to the resync originator if we're performing 2778 * a Write-after-Read. This corresponds to an i/o error on a resync 2779 * target device and in this case we ought to abort the resync as there 2780 * is nothing that can be done to recover from this without operator 2781 * intervention. If we don't set the B_ERROR flag we will continue 2782 * reading from the mirror but won't write to the target (as it will 2783 * have been placed into an errored state). 2784 * To handle the case of multiple components within a submirror we only 2785 * set the B_ERROR bit if explicitly requested to via MD_MPS_FLAG_ERROR. 2786 * The originator of the resync read will cause this bit to be set if 2787 * the underlying component count is one for a submirror resync. All 2788 * other resync types will have the flag set as there is no underlying 2789 * resync which can be performed on a contained metadevice for these 2790 * resync types (optimized or component). 2791 */ 2792 2793 if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ) { 2794 if (ps->ps_flags & MD_MPS_FLAG_ERROR) 2795 pb->b_flags |= B_ERROR; 2796 md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 2797 MPS_FREE(mirror_parent_cache, ps); 2798 md_unit_readerexit(ui); 2799 md_biodone(pb); 2800 return; 2801 } 2802 /* 2803 * Set the MD_IO_COUNTED flag as we are retrying the same I/O 2804 * operation therefore this I/O request has already been counted, 2805 * the I/O count variable will be decremented by mirror_done()'s 2806 * call to md_biodone(). 2807 */ 2808 if (ps->ps_changecnt != un->un_changecnt) { 2809 new_str_flags = MD_STR_NOTTOP | MD_IO_COUNTED; 2810 if (ps->ps_flags & MD_MPS_WOW) 2811 new_str_flags |= MD_STR_WOW; 2812 if (ps->ps_flags & MD_MPS_MAPPED) 2813 new_str_flags |= MD_STR_MAPPED; 2814 /* 2815 * If this I/O request was a read that was part of a resync, 2816 * set MD_STR_WAR for the retried read to ensure that the 2817 * resync write (i.e. 
write-after-read) will be performed 2818 */ 2819 if (ps->ps_flags & MD_MPS_RESYNC_READ) 2820 new_str_flags |= MD_STR_WAR; 2821 md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 2822 MPS_FREE(mirror_parent_cache, ps); 2823 md_unit_readerexit(ui); 2824 (void) md_mirror_strategy(pb, new_str_flags, NULL); 2825 return; 2826 } 2827 2828 pb->b_flags |= B_ERROR; 2829 md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 2830 MPS_FREE(mirror_parent_cache, ps); 2831 md_unit_readerexit(ui); 2832 md_biodone(pb); 2833 } 2834 2835 static void 2836 error_update_unit(md_mps_t *ps) 2837 { 2838 mm_unit_t *un; 2839 mdi_unit_t *ui; 2840 int smi; /* sub mirror index */ 2841 int ci; /* errored component */ 2842 set_t setno; 2843 uint_t flags; /* for set_sm_comp_state() */ 2844 uint_t hspflags; /* for check_comp_4_hotspares() */ 2845 2846 ui = ps->ps_ui; 2847 un = (mm_unit_t *)md_unit_writerlock(ui); 2848 setno = MD_UN2SET(un); 2849 2850 /* All of these updates have to propagated in case of MN set */ 2851 flags = MD_STATE_XMIT; 2852 hspflags = MD_HOTSPARE_XMIT; 2853 2854 /* special treatment if we are called during updating watermarks */ 2855 if (ps->ps_flags & MD_MPS_WMUPDATE) { 2856 flags |= MD_STATE_WMUPDATE; 2857 hspflags |= MD_HOTSPARE_WMUPDATE; 2858 } 2859 smi = 0; 2860 ci = 0; 2861 while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) { 2862 if (mirror_other_sources(un, smi, ci, 0) == 1) { 2863 2864 /* Never called from ioctl context, so (IOLOCK *)NULL */ 2865 set_sm_comp_state(un, smi, ci, CS_LAST_ERRED, 0, flags, 2866 (IOLOCK *)NULL); 2867 /* 2868 * For a MN set, the NOTIFY is done when the state 2869 * change is processed on each node 2870 */ 2871 if (!MD_MNSET_SETNO(MD_UN2SET(un))) { 2872 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED, 2873 SVM_TAG_METADEVICE, setno, MD_SID(un)); 2874 } 2875 continue; 2876 } 2877 /* Never called from ioctl context, so (IOLOCK *)NULL */ 2878 set_sm_comp_state(un, smi, ci, CS_ERRED, 0, flags, 2879 (IOLOCK *)NULL); 2880 /* 2881 * For a MN set, the NOTIFY is done when the state 2882 * change is processed on each node 2883 */ 2884 if (!MD_MNSET_SETNO(MD_UN2SET(un))) { 2885 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, 2886 SVM_TAG_METADEVICE, setno, MD_SID(un)); 2887 } 2888 smi = 0; 2889 ci = 0; 2890 } 2891 2892 md_unit_writerexit(ui); 2893 if (MD_MNSET_SETNO(setno)) { 2894 send_poke_hotspares(setno); 2895 } else { 2896 (void) poke_hotspares(); 2897 } 2898 (void) md_unit_readerlock(ui); 2899 2900 finish_error(ps); 2901 } 2902 2903 /* 2904 * When we have a B_FAILFAST IO error on a Last Erred component we need to 2905 * retry the IO without B_FAILFAST set so that we try to ensure that the 2906 * component "sees" each IO. 
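 *
 * In outline the retry path is:
 *
 *     mirror_done(): B_ERROR + B_FAILFAST i/o against a Last Erred
 *         submirror  ->  daemon_request(&md_done_daemon, last_err_retry)
 *     last_err_retry(): clear B_FAILFAST on the buf, call
 *         clear_retry_error(), then re-issue it with md_call_strategy()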
2907 */ 2908 static void 2909 last_err_retry(md_mcs_t *cs) 2910 { 2911 struct buf *cb; 2912 md_mps_t *ps; 2913 uint_t flags; 2914 2915 cb = &cs->cs_buf; 2916 cb->b_flags &= ~B_FAILFAST; 2917 2918 /* if we're panicking just let this I/O error out */ 2919 if (panicstr) { 2920 (void) mirror_done(cb); 2921 return; 2922 } 2923 2924 /* reissue the I/O */ 2925 2926 ps = cs->cs_ps; 2927 2928 bioerror(cb, 0); 2929 2930 mutex_enter(&ps->ps_mx); 2931 2932 flags = MD_STR_NOTTOP; 2933 if (ps->ps_flags & MD_MPS_MAPPED) 2934 flags |= MD_STR_MAPPED; 2935 if (ps->ps_flags & MD_MPS_NOBLOCK) 2936 flags |= MD_NOBLOCK; 2937 2938 mutex_exit(&ps->ps_mx); 2939 2940 clear_retry_error(cb); 2941 2942 cmn_err(CE_NOTE, "!md: %s: Last Erred, retry I/O without B_FAILFAST", 2943 md_shortname(getminor(cb->b_edev))); 2944 2945 md_call_strategy(cb, flags, NULL); 2946 } 2947 2948 static void 2949 mirror_error(md_mps_t *ps) 2950 { 2951 int smi; /* sub mirror index */ 2952 int ci; /* errored component */ 2953 2954 if (panicstr) { 2955 finish_error(ps); 2956 return; 2957 } 2958 2959 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 2960 mirror_overlap_tree_remove(ps); 2961 2962 smi = 0; 2963 ci = 0; 2964 if (mirror_geterror(ps->ps_un, &smi, &ci, 0, 0) != 0) { 2965 md_unit_readerexit(ps->ps_ui); 2966 daemon_request(&md_mstr_daemon, error_update_unit, 2967 (daemon_queue_t *)ps, REQ_OLD); 2968 return; 2969 } 2970 2971 finish_error(ps); 2972 } 2973 2974 static int 2975 copy_write_done(struct buf *cb) 2976 { 2977 md_mps_t *ps; 2978 buf_t *pb; 2979 char *wowbuf; 2980 wowhdr_t *wowhdr; 2981 ssize_t wow_resid; 2982 2983 /* get wowbuf and save structure */ 2984 wowbuf = cb->b_un.b_addr; 2985 wowhdr = WOWBUF_HDR(wowbuf); 2986 ps = wowhdr->wow_ps; 2987 pb = ps->ps_bp; 2988 2989 /* Save error information, then free cb */ 2990 if (cb->b_flags & B_ERROR) 2991 pb->b_flags |= B_ERROR; 2992 2993 if (cb->b_flags & B_REMAPPED) 2994 bp_mapout(cb); 2995 2996 freerbuf(cb); 2997 2998 /* update residual and continue if needed */ 2999 if ((pb->b_flags & B_ERROR) == 0) { 3000 wow_resid = pb->b_bcount - wowhdr->wow_offset; 3001 pb->b_resid = wow_resid; 3002 if (wow_resid > 0) { 3003 daemon_request(&md_mstr_daemon, copy_write_cont, 3004 (daemon_queue_t *)wowhdr, REQ_OLD); 3005 return (1); 3006 } 3007 } 3008 3009 /* Write is complete, release resources.
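 *
 * To illustrate the chunking: with a wow buffer of, say, 16K
 * (md_wowbuf_size), a 40K parent write is carried out as three child
 * writes of 16K, 16K and 8K; wow_offset advances by each child's
 * b_bcount, so wow_resid only reaches zero here on the final pass.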
*/ 3010 kmem_cache_free(mirror_wowblk_cache, wowhdr); 3011 ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP)); 3012 md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 3013 MPS_FREE(mirror_parent_cache, ps); 3014 md_biodone(pb); 3015 return (0); 3016 } 3017 3018 static void 3019 copy_write_cont(wowhdr_t *wowhdr) 3020 { 3021 buf_t *pb; 3022 buf_t *cb; 3023 char *wowbuf; 3024 int wow_offset; 3025 size_t wow_resid; 3026 diskaddr_t wow_blkno; 3027 3028 wowbuf = WOWHDR_BUF(wowhdr); 3029 pb = wowhdr->wow_ps->ps_bp; 3030 3031 /* get data on current location */ 3032 wow_offset = wowhdr->wow_offset; 3033 wow_resid = pb->b_bcount - wow_offset; 3034 wow_blkno = pb->b_lblkno + lbtodb(wow_offset); 3035 3036 /* setup child buffer */ 3037 cb = getrbuf(KM_SLEEP); 3038 cb->b_flags = B_WRITE; 3039 cb->b_edev = pb->b_edev; 3040 cb->b_un.b_addr = wowbuf; /* change to point at WOWBUF */ 3041 cb->b_bufsize = md_wowbuf_size; /* change to wowbuf_size */ 3042 cb->b_iodone = copy_write_done; 3043 cb->b_bcount = MIN(md_wowbuf_size, wow_resid); 3044 cb->b_lblkno = wow_blkno; 3045 3046 /* move offset to next section */ 3047 wowhdr->wow_offset += cb->b_bcount; 3048 3049 /* copy and setup write for current section */ 3050 bcopy(&pb->b_un.b_addr[wow_offset], wowbuf, cb->b_bcount); 3051 3052 /* do it */ 3053 /* 3054 * Do not set the MD_IO_COUNTED flag as this is a new I/O request 3055 * that handles the WOW condition. The resultant increment on the 3056 * I/O count variable is cleared by copy_write_done()'s call to 3057 * md_biodone(). 3058 */ 3059 (void) md_mirror_strategy(cb, MD_STR_NOTTOP | MD_STR_WOW 3060 | MD_STR_MAPPED, NULL); 3061 } 3062 3063 static void 3064 md_mirror_copy_write(md_mps_t *ps) 3065 { 3066 wowhdr_t *wowhdr; 3067 3068 wowhdr = kmem_cache_alloc(mirror_wowblk_cache, MD_ALLOCFLAGS); 3069 mirror_wowblk_init(wowhdr); 3070 wowhdr->wow_ps = ps; 3071 wowhdr->wow_offset = 0; 3072 copy_write_cont(wowhdr); 3073 } 3074 3075 static void 3076 handle_wow(md_mps_t *ps) 3077 { 3078 buf_t *pb; 3079 3080 pb = ps->ps_bp; 3081 3082 bp_mapin(pb); 3083 3084 md_mirror_wow_cnt++; 3085 if (!(pb->b_flags & B_PHYS) && (md_mirror_wow_flg & WOW_LOGIT)) { 3086 cmn_err(CE_NOTE, 3087 "md: %s, blk %lld, cnt %ld: Write on write %d occurred", 3088 md_shortname(getminor(pb->b_edev)), 3089 (longlong_t)pb->b_lblkno, pb->b_bcount, md_mirror_wow_cnt); 3090 } 3091 3092 /* 3093 * Set the MD_IO_COUNTED flag as we are retrying the same I/O 3094 * operation therefore this I/O request has already been counted, 3095 * the I/O count variable will be decremented by mirror_done()'s 3096 * call to md_biodone(). 3097 */ 3098 if (md_mirror_wow_flg & WOW_NOCOPY) 3099 (void) md_mirror_strategy(pb, MD_STR_NOTTOP | MD_STR_WOW | 3100 MD_STR_MAPPED | MD_IO_COUNTED, ps); 3101 else 3102 md_mirror_copy_write(ps); 3103 } 3104 3105 /* 3106 * Return true if the specified submirror is either in the Last Erred 3107 * state or is transitioning into the Last Erred state. 
3108 */ 3109 static bool_t 3110 submirror_is_lasterred(mm_unit_t *un, int smi) 3111 { 3112 mm_submirror_t *sm; 3113 mm_submirror_ic_t *smic; 3114 md_m_shared_t *shared; 3115 int ci; 3116 int compcnt; 3117 3118 sm = &un->un_sm[smi]; 3119 smic = &un->un_smic[smi]; 3120 3121 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un); 3122 for (ci = 0; ci < compcnt; ci++) { 3123 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) 3124 (sm->sm_dev, sm, ci); 3125 3126 if (shared->ms_state == CS_LAST_ERRED) 3127 return (B_TRUE); 3128 3129 /* 3130 * It is not currently Last Erred, check if entering Last Erred. 3131 */ 3132 if ((shared->ms_flags & MDM_S_IOERR) && 3133 ((shared->ms_state == CS_OKAY) || 3134 (shared->ms_state == CS_RESYNC))) { 3135 if (mirror_other_sources(un, smi, ci, 0) == 1) 3136 return (B_TRUE); 3137 } 3138 } 3139 3140 return (B_FALSE); 3141 } 3142 3143 3144 static int 3145 mirror_done(struct buf *cb) 3146 { 3147 md_mps_t *ps; 3148 md_mcs_t *cs; 3149 3150 /*LINTED*/ 3151 cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off); 3152 ps = cs->cs_ps; 3153 3154 mutex_enter(&ps->ps_mx); 3155 3156 /* check if we need to retry an errored failfast I/O */ 3157 if (cb->b_flags & B_ERROR) { 3158 struct buf *pb = ps->ps_bp; 3159 3160 if (cb->b_flags & B_FAILFAST) { 3161 int i; 3162 mm_unit_t *un = ps->ps_un; 3163 3164 for (i = 0; i < NMIRROR; i++) { 3165 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) 3166 continue; 3167 3168 if (cb->b_edev == 3169 md_dev64_to_dev(un->un_sm[i].sm_dev)) { 3170 3171 /* 3172 * This is the submirror that had the 3173 * error. Check if it is Last Erred. 3174 */ 3175 if (submirror_is_lasterred(un, i)) { 3176 daemon_queue_t *dqp; 3177 3178 mutex_exit(&ps->ps_mx); 3179 dqp = (daemon_queue_t *)cs; 3180 dqp->dq_prev = NULL; 3181 dqp->dq_next = NULL; 3182 daemon_request(&md_done_daemon, 3183 last_err_retry, dqp, 3184 REQ_OLD); 3185 return (1); 3186 } 3187 break; 3188 } 3189 } 3190 } 3191 3192 /* continue to process the buf without doing a retry */ 3193 ps->ps_flags |= MD_MPS_ERROR; 3194 pb->b_error = cb->b_error; 3195 } 3196 3197 return (mirror_done_common(cb)); 3198 } 3199 3200 /* 3201 * Split from the original mirror_done function so we can handle bufs after a 3202 * retry. 3203 * ps->ps_mx is already held in the caller of this function and the cb error 3204 * has already been checked and handled in the caller. 3205 */ 3206 static int 3207 mirror_done_common(struct buf *cb) 3208 { 3209 struct buf *pb; 3210 mm_unit_t *un; 3211 mdi_unit_t *ui; 3212 md_mps_t *ps; 3213 md_mcs_t *cs; 3214 size_t end_rr, start_rr, current_rr; 3215 3216 /*LINTED*/ 3217 cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off); 3218 ps = cs->cs_ps; 3219 pb = ps->ps_bp; 3220 3221 if (cb->b_flags & B_REMAPPED) 3222 bp_mapout(cb); 3223 3224 ps->ps_frags--; 3225 if (ps->ps_frags != 0) { 3226 mutex_exit(&ps->ps_mx); 3227 kmem_cache_free(mirror_child_cache, cs); 3228 return (1); 3229 } 3230 un = ps->ps_un; 3231 ui = ps->ps_ui; 3232 3233 /* 3234 * Do not update outstanding_writes if we're running with ABR 3235 * set for this mirror or the write() was issued with MD_STR_ABR set. 3236 * Also a resync initiated write() has no outstanding_writes update 3237 * either. 
3238 */ 3239 if (((cb->b_flags & B_READ) == 0) && 3240 (un->un_nsm >= 2) && 3241 (ps->ps_call == NULL) && 3242 !((ui->ui_tstate & MD_ABR_CAP) || (ps->ps_flags & MD_MPS_ABR)) && 3243 !(ps->ps_flags & MD_MPS_WRITE_AFTER_READ)) { 3244 BLK_TO_RR(end_rr, ps->ps_lastblk, un); 3245 BLK_TO_RR(start_rr, ps->ps_firstblk, un); 3246 mutex_enter(&un->un_resync_mx); 3247 for (current_rr = start_rr; current_rr <= end_rr; current_rr++) 3248 un->un_outstanding_writes[current_rr]--; 3249 mutex_exit(&un->un_resync_mx); 3250 } 3251 kmem_cache_free(mirror_child_cache, cs); 3252 mutex_exit(&ps->ps_mx); 3253 3254 if (ps->ps_call != NULL) { 3255 daemon_request(&md_done_daemon, ps->ps_call, 3256 (daemon_queue_t *)ps, REQ_OLD); 3257 return (1); 3258 } 3259 3260 if ((ps->ps_flags & MD_MPS_ERROR)) { 3261 daemon_request(&md_done_daemon, mirror_error, 3262 (daemon_queue_t *)ps, REQ_OLD); 3263 return (1); 3264 } 3265 3266 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 3267 mirror_overlap_tree_remove(ps); 3268 3269 /* 3270 * Handle Write-on-Write problem. 3271 * Skip In case of Raw and Direct I/O as they are 3272 * handled earlier. 3273 * 3274 */ 3275 if (!(md_mirror_wow_flg & WOW_DISABLE) && 3276 !(pb->b_flags & B_READ) && 3277 !(ps->ps_flags & MD_MPS_WOW) && 3278 !(pb->b_flags & B_PHYS) && 3279 any_pages_dirty(pb)) { 3280 md_unit_readerexit(ps->ps_ui); 3281 daemon_request(&md_mstr_daemon, handle_wow, 3282 (daemon_queue_t *)ps, REQ_OLD); 3283 return (1); 3284 } 3285 3286 md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 3287 MPS_FREE(mirror_parent_cache, ps); 3288 md_unit_readerexit(ui); 3289 md_biodone(pb); 3290 return (0); 3291 } 3292 3293 /* 3294 * Clear error state in submirror component if the retry worked after 3295 * a failfast error. 3296 */ 3297 static void 3298 clear_retry_error(struct buf *cb) 3299 { 3300 int smi; 3301 md_mcs_t *cs; 3302 mm_unit_t *un; 3303 mdi_unit_t *ui_sm; 3304 mm_submirror_t *sm; 3305 mm_submirror_ic_t *smic; 3306 u_longlong_t cnt; 3307 md_m_shared_t *shared; 3308 3309 /*LINTED*/ 3310 cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off); 3311 un = cs->cs_ps->ps_un; 3312 3313 for (smi = 0; smi < NMIRROR; smi++) { 3314 if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) 3315 continue; 3316 3317 if (cb->b_edev == md_dev64_to_dev(un->un_sm[smi].sm_dev)) 3318 break; 3319 } 3320 3321 if (smi >= NMIRROR) 3322 return; 3323 3324 sm = &un->un_sm[smi]; 3325 smic = &un->un_smic[smi]; 3326 cnt = cb->b_bcount; 3327 3328 ui_sm = MDI_UNIT(getminor(cb->b_edev)); 3329 (void) md_unit_writerlock(ui_sm); 3330 3331 shared = (md_m_shared_t *)(*(smic->sm_shared_by_blk))(sm->sm_dev, sm, 3332 cb->b_blkno, &cnt); 3333 3334 if (shared->ms_flags & MDM_S_IOERR) { 3335 shared->ms_flags &= ~MDM_S_IOERR; 3336 3337 } else { 3338 /* the buf spans components and the first one is not erred */ 3339 int cnt; 3340 int i; 3341 3342 cnt = (*(smic->sm_get_component_count))(sm->sm_dev, un); 3343 for (i = 0; i < cnt; i++) { 3344 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) 3345 (sm->sm_dev, sm, i); 3346 3347 if (shared->ms_flags & MDM_S_IOERR && 3348 shared->ms_state == CS_OKAY) { 3349 3350 shared->ms_flags &= ~MDM_S_IOERR; 3351 break; 3352 } 3353 } 3354 } 3355 3356 md_unit_writerexit(ui_sm); 3357 } 3358 3359 static size_t 3360 mirror_map_read( 3361 md_mps_t *ps, 3362 md_mcs_t *cs, 3363 diskaddr_t blkno, 3364 u_longlong_t count 3365 ) 3366 { 3367 mm_unit_t *un; 3368 buf_t *bp; 3369 u_longlong_t cando; 3370 3371 bp = &cs->cs_buf; 3372 un = ps->ps_un; 3373 3374 bp->b_lblkno = blkno; 3375 if (fast_select_read_unit(ps, cs) == 0) 
{ 3376 bp->b_bcount = ldbtob(count); 3377 return (0); 3378 } 3379 bp->b_edev = md_dev64_to_dev(select_read_unit(un, blkno, 3380 count, &cando, 0, NULL, cs)); 3381 bp->b_bcount = ldbtob(cando); 3382 if (count != cando) 3383 return (cando); 3384 return (0); 3385 } 3386 3387 static void 3388 write_after_read(md_mps_t *ps) 3389 { 3390 struct buf *pb; 3391 int flags; 3392 3393 if (ps->ps_flags & MD_MPS_ERROR) { 3394 mirror_error(ps); 3395 return; 3396 } 3397 3398 pb = ps->ps_bp; 3399 md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 3400 ps->ps_call = NULL; 3401 ps->ps_flags |= MD_MPS_WRITE_AFTER_READ; 3402 flags = MD_STR_NOTTOP | MD_STR_WAR; 3403 if (ps->ps_flags & MD_MPS_MAPPED) 3404 flags |= MD_STR_MAPPED; 3405 if (ps->ps_flags & MD_MPS_NOBLOCK) 3406 flags |= MD_NOBLOCK; 3407 if (ps->ps_flags & MD_MPS_DIRTY_RD) 3408 flags |= MD_STR_DIRTY_RD; 3409 (void) mirror_write_strategy(pb, flags, ps); 3410 } 3411 3412 static void 3413 continue_serial(md_mps_t *ps) 3414 { 3415 md_mcs_t *cs; 3416 buf_t *cb; 3417 mm_unit_t *un; 3418 int flags; 3419 3420 un = ps->ps_un; 3421 cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS); 3422 mirror_child_init(cs); 3423 cb = &cs->cs_buf; 3424 ps->ps_call = NULL; 3425 ps->ps_frags = 1; 3426 (void) mirror_map_write(un, cs, ps, 0); 3427 flags = MD_STR_NOTTOP; 3428 if (ps->ps_flags & MD_MPS_MAPPED) 3429 flags |= MD_STR_MAPPED; 3430 md_call_strategy(cb, flags, NULL); 3431 } 3432 3433 static int 3434 mirror_map_write(mm_unit_t *un, md_mcs_t *cs, md_mps_t *ps, int war) 3435 { 3436 int i; 3437 dev_t dev; /* needed for bioclone, so not md_dev64_t */ 3438 buf_t *cb; 3439 buf_t *pb; 3440 diskaddr_t blkno; 3441 size_t bcount; 3442 off_t offset; 3443 3444 pb = ps->ps_bp; 3445 cb = &cs->cs_buf; 3446 cs->cs_ps = ps; 3447 3448 i = md_find_nth_unit(ps->ps_writable_sm, ps->ps_current_sm); 3449 3450 dev = md_dev64_to_dev(un->un_sm[i].sm_dev); 3451 3452 blkno = pb->b_lblkno; 3453 bcount = pb->b_bcount; 3454 offset = 0; 3455 if (war && (blkno == 0) && (un->c.un_flag & MD_LABELED)) { 3456 blkno = DK_LABEL_LOC + 1; 3457 /* 3458 * This handles the case where we're requesting 3459 * a write to block 0 on a label partition 3460 * and the request size was smaller than the 3461 * size of the label. If this is the case 3462 * then we'll return -1. Failure to do so will 3463 * either cause the calling thread to hang due to 3464 * an ssd bug, or worse if the bcount were allowed 3465 * to go negative (ie large). 3466 */ 3467 if (bcount <= DEV_BSIZE*(DK_LABEL_LOC + 1)) 3468 return (-1); 3469 bcount -= (DEV_BSIZE*(DK_LABEL_LOC + 1)); 3470 offset = (DEV_BSIZE*(DK_LABEL_LOC + 1)); 3471 } 3472 3473 cb = md_bioclone(pb, offset, bcount, dev, blkno, mirror_done, 3474 cb, KM_NOSLEEP); 3475 if (war) 3476 cb->b_flags = (cb->b_flags & ~B_READ) | B_WRITE; 3477 3478 /* 3479 * If the submirror is in the erred stated, check if any component is 3480 * in the Last Erred state. If so, we don't want to use the B_FAILFAST 3481 * flag on the IO. 3482 * 3483 * Provide a fast path for the non-erred case (which should be the 3484 * normal case). 
3485 */ 3486 if (un->un_sm[i].sm_flags & MD_SM_FAILFAST) { 3487 if (un->un_sm[i].sm_state & SMS_COMP_ERRED) { 3488 mm_submirror_t *sm; 3489 mm_submirror_ic_t *smic; 3490 int ci; 3491 int compcnt; 3492 3493 sm = &un->un_sm[i]; 3494 smic = &un->un_smic[i]; 3495 3496 compcnt = (*(smic->sm_get_component_count)) 3497 (sm->sm_dev, un); 3498 for (ci = 0; ci < compcnt; ci++) { 3499 md_m_shared_t *shared; 3500 3501 shared = (md_m_shared_t *) 3502 (*(smic->sm_shared_by_indx))(sm->sm_dev, 3503 sm, ci); 3504 3505 if (shared->ms_state == CS_LAST_ERRED) 3506 break; 3507 } 3508 if (ci >= compcnt) 3509 cb->b_flags |= B_FAILFAST; 3510 3511 } else { 3512 cb->b_flags |= B_FAILFAST; 3513 } 3514 } 3515 3516 ps->ps_current_sm++; 3517 if (ps->ps_current_sm != ps->ps_active_cnt) { 3518 if (un->un_write_option == WR_SERIAL) { 3519 ps->ps_call = continue_serial; 3520 return (0); 3521 } 3522 return (1); 3523 } 3524 return (0); 3525 } 3526 3527 /* 3528 * directed_read_done: 3529 * ------------------ 3530 * Completion routine called when a DMR request has been returned from the 3531 * underlying driver. Wake-up the original ioctl() and return the data to 3532 * the user. 3533 */ 3534 static void 3535 directed_read_done(md_mps_t *ps) 3536 { 3537 mm_unit_t *un; 3538 mdi_unit_t *ui; 3539 3540 un = ps->ps_un; 3541 ui = ps->ps_ui; 3542 3543 md_unit_readerexit(ui); 3544 md_kstat_done(ui, ps->ps_bp, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 3545 ps->ps_call = NULL; 3546 3547 mutex_enter(&un->un_dmr_mx); 3548 cv_signal(&un->un_dmr_cv); 3549 mutex_exit(&un->un_dmr_mx); 3550 3551 /* release the parent structure */ 3552 kmem_cache_free(mirror_parent_cache, ps); 3553 } 3554 3555 /* 3556 * daemon_io: 3557 * ------------ 3558 * Called to issue a mirror_write_strategy() or mirror_read_strategy 3559 * call from a blockable context. NOTE: no mutex can be held on entry to this 3560 * routine 3561 */ 3562 static void 3563 daemon_io(daemon_queue_t *dq) 3564 { 3565 md_mps_t *ps = (md_mps_t *)dq; 3566 int flag = MD_STR_NOTTOP; 3567 buf_t *pb = ps->ps_bp; 3568 3569 if (ps->ps_flags & MD_MPS_MAPPED) 3570 flag |= MD_STR_MAPPED; 3571 if (ps->ps_flags & MD_MPS_WOW) 3572 flag |= MD_STR_WOW; 3573 if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ) 3574 flag |= MD_STR_WAR; 3575 if (ps->ps_flags & MD_MPS_ABR) 3576 flag |= MD_STR_ABR; 3577 if (ps->ps_flags & MD_MPS_BLOCKABLE_IO) 3578 flag |= MD_STR_BLOCK_OK; 3579 3580 /* 3581 * If this is a resync read, ie MD_STR_DIRTY_RD not set, set 3582 * MD_STR_WAR before calling mirror_read_strategy 3583 */ 3584 if (pb->b_flags & B_READ) { 3585 if (!(ps->ps_flags & MD_MPS_DIRTY_RD)) 3586 flag |= MD_STR_WAR; 3587 mirror_read_strategy(pb, flag, ps); 3588 } else 3589 mirror_write_strategy(pb, flag, ps); 3590 } 3591 3592 /* 3593 * update_resync: 3594 * ------------- 3595 * Called to update the in-core version of the resync record with the latest 3596 * version that was committed to disk when the previous mirror owner 3597 * relinquished ownership. This call is likely to block as we must hold-off 3598 * any current resync processing that may be occurring. 3599 * On completion of the resync record update we issue the mirror_write_strategy 3600 * call to complete the i/o that first started this sequence. To remove a race 3601 * condition between a new write() request which is submitted and the resync 3602 * record update we acquire the writerlock. This will hold off all i/o to the 3603 * mirror until the resync update has completed. 
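 *
 * For a deferred write the overall hand-off is, roughly:
 *
 *     become_owner():  MD_MN_MSG_REQUIRE_OWNER succeeds, queue here
 *     update_resync(): mirror_copy_rr() under the unit writerlock
 *     daemon_io():     re-issue the original request through
 *                      mirror_write_strategy()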
3604 * NOTE: no mutex can be held on entry to this routine 3605 */ 3606 static void 3607 update_resync(daemon_queue_t *dq) 3608 { 3609 md_mps_t *ps = (md_mps_t *)dq; 3610 buf_t *pb = ps->ps_bp; 3611 mdi_unit_t *ui = ps->ps_ui; 3612 mm_unit_t *un = MD_UNIT(ui->ui_link.ln_id); 3613 set_t setno; 3614 int restart_resync; 3615 3616 mutex_enter(&un->un_rrp_inflight_mx); 3617 (void) md_unit_writerlock(ui); 3618 ps->ps_un = un; 3619 setno = MD_MIN2SET(getminor(pb->b_edev)); 3620 if (mddb_reread_rr(setno, un->un_rr_dirty_recid) == 0) { 3621 /* 3622 * Synchronize our in-core view of what regions need to be 3623 * resync'd with the on-disk version. 3624 */ 3625 mirror_copy_rr(howmany(un->un_rrd_num, NBBY), un->un_resync_bm, 3626 un->un_dirty_bm); 3627 3628 /* Region dirty map is now up to date */ 3629 } 3630 restart_resync = (un->un_rs_thread_flags & MD_RI_BLOCK_OWNER) ? 1 : 0; 3631 md_unit_writerexit(ui); 3632 mutex_exit(&un->un_rrp_inflight_mx); 3633 3634 /* Restart the resync thread if it was previously blocked */ 3635 if (restart_resync) { 3636 mutex_enter(&un->un_rs_thread_mx); 3637 un->un_rs_thread_flags &= ~MD_RI_BLOCK_OWNER; 3638 cv_signal(&un->un_rs_thread_cv); 3639 mutex_exit(&un->un_rs_thread_mx); 3640 } 3641 /* Continue with original deferred i/o */ 3642 daemon_io(dq); 3643 } 3644 3645 /* 3646 * owner_timeout: 3647 * ------------- 3648 * Called if the original mdmn_ksend_message() failed and the request is to be 3649 * retried. Reattempt the original ownership change. 3650 * 3651 * NOTE: called at interrupt context (see timeout(9f)). 3652 */ 3653 static void 3654 owner_timeout(void *arg) 3655 { 3656 daemon_queue_t *dq = (daemon_queue_t *)arg; 3657 3658 daemon_request(&md_mirror_daemon, become_owner, dq, REQ_OLD); 3659 } 3660 3661 /* 3662 * become_owner: 3663 * ------------ 3664 * Called to issue RPC request to become the owner of the mirror 3665 * associated with this i/o request. We assume that the ownership request 3666 * is synchronous, so if it succeeds we will issue the request via 3667 * mirror_write_strategy(). 3668 * If multiple i/o's are outstanding we will be called from the mirror_daemon 3669 * service thread. 3670 * NOTE: no mutex should be held on entry to this routine. 3671 */ 3672 static void 3673 become_owner(daemon_queue_t *dq) 3674 { 3675 md_mps_t *ps = (md_mps_t *)dq; 3676 mm_unit_t *un = ps->ps_un; 3677 buf_t *pb = ps->ps_bp; 3678 set_t setno; 3679 md_mn_kresult_t *kres; 3680 int msg_flags = md_mirror_msg_flags; 3681 md_mps_t *ps1; 3682 3683 ASSERT(dq->dq_next == NULL && dq->dq_prev == NULL); 3684 3685 /* 3686 * If we're already the mirror owner we do not need to send a message 3687 * but can simply process the i/o request immediately. 3688 * If we've already sent the request to become owner we requeue the 3689 * request as we're waiting for the synchronous ownership message to 3690 * be processed. 3691 */ 3692 if (MD_MN_MIRROR_OWNER(un)) { 3693 /* 3694 * As the strategy() call will potentially block we need to 3695 * punt this to a separate thread and complete this request 3696 * as quickly as possible. Note: if we're a read request 3697 * this must be a resync, we cannot afford to be queued 3698 * behind any intervening i/o requests. In this case we put the 3699 * request on the md_mirror_rs_daemon queue. 
3700 */ 3701 if (pb->b_flags & B_READ) { 3702 daemon_request(&md_mirror_rs_daemon, daemon_io, dq, 3703 REQ_OLD); 3704 } else { 3705 daemon_request(&md_mirror_io_daemon, daemon_io, dq, 3706 REQ_OLD); 3707 } 3708 } else { 3709 mutex_enter(&un->un_owner_mx); 3710 if ((un->un_owner_state & MM_MN_OWNER_SENT) == 0) { 3711 md_mn_req_owner_t *msg; 3712 int rval = 0; 3713 3714 /* 3715 * Check to see that we haven't exceeded the maximum 3716 * retry count. If we have we fail the i/o as the 3717 * comms mechanism has become wedged beyond recovery. 3718 */ 3719 if (dq->qlen++ >= MD_OWNER_RETRIES) { 3720 mutex_exit(&un->un_owner_mx); 3721 cmn_err(CE_WARN, 3722 "md_mirror: Request exhausted ownership " 3723 "retry limit of %d attempts", dq->qlen); 3724 pb->b_error = EIO; 3725 pb->b_flags |= B_ERROR; 3726 pb->b_resid = pb->b_bcount; 3727 kmem_cache_free(mirror_parent_cache, ps); 3728 md_biodone(pb); 3729 return; 3730 } 3731 3732 /* 3733 * Issue request to change ownership. The call is 3734 * synchronous so when it returns we can complete the 3735 * i/o (if successful), or enqueue it again so that 3736 * the operation will be retried. 3737 */ 3738 un->un_owner_state |= MM_MN_OWNER_SENT; 3739 mutex_exit(&un->un_owner_mx); 3740 3741 msg = kmem_zalloc(sizeof (md_mn_req_owner_t), KM_SLEEP); 3742 setno = MD_MIN2SET(getminor(pb->b_edev)); 3743 msg->mnum = MD_SID(un); 3744 msg->owner = md_mn_mynode_id; 3745 msg_flags |= MD_MSGF_NO_LOG; 3746 /* 3747 * If this IO is triggered by updating a watermark, 3748 * it might be issued by the creation of a softpartition 3749 * while the commd subsystem is suspended. 3750 * We don't want this message to block. 3751 */ 3752 if (ps->ps_flags & MD_MPS_WMUPDATE) { 3753 msg_flags |= MD_MSGF_OVERRIDE_SUSPEND; 3754 } 3755 3756 kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); 3757 rval = mdmn_ksend_message(setno, 3758 MD_MN_MSG_REQUIRE_OWNER, msg_flags, 0, 3759 (char *)msg, sizeof (md_mn_req_owner_t), kres); 3760 3761 kmem_free(msg, sizeof (md_mn_req_owner_t)); 3762 3763 if (MDMN_KSEND_MSG_OK(rval, kres)) { 3764 dq->qlen = 0; 3765 /* 3766 * Successfully changed owner, reread the 3767 * resync record so that we have a valid idea of 3768 * any previously committed incomplete write()s. 3769 * NOTE: As we need to acquire the resync mutex 3770 * this may block, so we defer it to a separate 3771 * thread handler. This makes us (effectively) 3772 * non-blocking once the ownership message 3773 * handling has completed. 3774 */ 3775 mutex_enter(&un->un_owner_mx); 3776 if (un->un_owner_state & MM_MN_BECOME_OWNER) { 3777 un->un_mirror_owner = md_mn_mynode_id; 3778 /* Sets owner of un_rr_dirty record */ 3779 if (un->un_rr_dirty_recid) 3780 (void) mddb_setowner( 3781 un->un_rr_dirty_recid, 3782 md_mn_mynode_id); 3783 un->un_owner_state &= 3784 ~MM_MN_BECOME_OWNER; 3785 /* 3786 * Release the block on the current 3787 * resync region if it is blocked 3788 */ 3789 ps1 = un->un_rs_prev_overlap; 3790 if ((ps1 != NULL) && 3791 (ps1->ps_flags & MD_MPS_ON_OVERLAP)) 3792 mirror_overlap_tree_remove(ps1); 3793 mutex_exit(&un->un_owner_mx); 3794 3795 /* 3796 * If we're a read, this must be a 3797 * resync request, issue 3798 * the i/o request on the 3799 * md_mirror_rs_daemon queue. This is 3800 * to avoid a deadlock between the 3801 * resync_unit thread and 3802 * subsequent i/o requests that may 3803 * block on the resync region. 
3804 */ 3805 if (pb->b_flags & B_READ) { 3806 daemon_request( 3807 &md_mirror_rs_daemon, 3808 update_resync, dq, REQ_OLD); 3809 } else { 3810 daemon_request( 3811 &md_mirror_io_daemon, 3812 update_resync, dq, REQ_OLD); 3813 } 3814 kmem_free(kres, 3815 sizeof (md_mn_kresult_t)); 3816 return; 3817 } else { 3818 /* 3819 * Some other node has beaten us to 3820 * obtain ownership. We need to 3821 * reschedule our ownership request 3822 */ 3823 mutex_exit(&un->un_owner_mx); 3824 } 3825 } else { 3826 mdmn_ksend_show_error(rval, kres, 3827 "MD_MN_MSG_REQUIRE_OWNER"); 3828 /* 3829 * Message transport failure is handled by the 3830 * comms layer. If the ownership change request 3831 * does not succeed we need to flag the error to 3832 * the initiator of the i/o. This is handled by 3833 * the retry logic above. As the request failed 3834 * we do not know _who_ the owner of the mirror 3835 * currently is. We reset our idea of the owner 3836 * to None so that any further write()s will 3837 * attempt to become the owner again. This stops 3838 * multiple nodes writing to the same mirror 3839 * simultaneously. 3840 */ 3841 mutex_enter(&un->un_owner_mx); 3842 un->un_owner_state &= 3843 ~(MM_MN_OWNER_SENT|MM_MN_BECOME_OWNER); 3844 un->un_mirror_owner = MD_MN_MIRROR_UNOWNED; 3845 mutex_exit(&un->un_owner_mx); 3846 } 3847 kmem_free(kres, sizeof (md_mn_kresult_t)); 3848 } else 3849 mutex_exit(&un->un_owner_mx); 3850 3851 /* 3852 * Re-enqueue this request on the deferred i/o list. Delay the 3853 * request for md_mirror_owner_to usecs to stop thrashing. 3854 */ 3855 (void) timeout(owner_timeout, dq, 3856 drv_usectohz(md_mirror_owner_to)); 3857 } 3858 } 3859 3860 static void 3861 mirror_write_strategy(buf_t *pb, int flag, void *private) 3862 { 3863 md_mps_t *ps; 3864 md_mcs_t *cs; 3865 int more; 3866 mm_unit_t *un; 3867 mdi_unit_t *ui; 3868 buf_t *cb; /* child buf pointer */ 3869 set_t setno; 3870 int rs_on_overlap = 0; 3871 3872 ui = MDI_UNIT(getminor(pb->b_edev)); 3873 un = (mm_unit_t *)MD_UNIT(getminor(pb->b_edev)); 3874 3875 3876 md_kstat_waitq_enter(ui); 3877 3878 /* 3879 * If a state change is in progress for this mirror in a MN set, 3880 * suspend all non-resync writes until the state change is complete. 3881 * The objective of this suspend is to ensure that it is not 3882 * possible for one node to read data from a submirror that another node 3883 * has not written to because of the state change. Therefore we 3884 * suspend all writes until the state change has been made. As it is 3885 * not possible to read from the target of a resync, there is no need 3886 * to suspend resync writes. 3887 * Note that we only block here if the caller can handle a busy-wait. 3888 * The MD_STR_BLOCK_OK flag is set for daemon_io originated i/o only. 
3889 */ 3890 3891 if (!(flag & MD_STR_WAR)) { 3892 if (flag & MD_STR_BLOCK_OK) { 3893 mutex_enter(&un->un_suspend_wr_mx); 3894 while (un->un_suspend_wr_flag) { 3895 cv_wait(&un->un_suspend_wr_cv, 3896 &un->un_suspend_wr_mx); 3897 } 3898 mutex_exit(&un->un_suspend_wr_mx); 3899 } 3900 (void) md_unit_readerlock(ui); 3901 } 3902 3903 if (!(flag & MD_STR_NOTTOP)) { 3904 if (md_checkbuf(ui, (md_unit_t *)un, pb)) { 3905 md_kstat_waitq_exit(ui); 3906 return; 3907 } 3908 } 3909 3910 setno = MD_MIN2SET(getminor(pb->b_edev)); 3911 3912 /* If an ABR write has been requested, set MD_STR_ABR flag */ 3913 if (MD_MNSET_SETNO(setno) && (pb->b_flags & B_ABRWRITE)) 3914 flag |= MD_STR_ABR; 3915 3916 if (private == NULL) { 3917 ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS); 3918 mirror_parent_init(ps); 3919 } else { 3920 ps = private; 3921 private = NULL; 3922 } 3923 if (flag & MD_STR_MAPPED) 3924 ps->ps_flags |= MD_MPS_MAPPED; 3925 3926 if (flag & MD_STR_WOW) 3927 ps->ps_flags |= MD_MPS_WOW; 3928 3929 if (flag & MD_STR_ABR) 3930 ps->ps_flags |= MD_MPS_ABR; 3931 3932 if (flag & MD_STR_WMUPDATE) 3933 ps->ps_flags |= MD_MPS_WMUPDATE; 3934 3935 /* 3936 * Save essential information from the original buffhdr 3937 * in the md_save structure. 3938 */ 3939 ps->ps_un = un; 3940 ps->ps_ui = ui; 3941 ps->ps_bp = pb; 3942 ps->ps_addr = pb->b_un.b_addr; 3943 ps->ps_firstblk = pb->b_lblkno; 3944 ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1; 3945 ps->ps_changecnt = un->un_changecnt; 3946 3947 /* 3948 * Check for suspended writes here. This is where we can defer the 3949 * write request to the daemon_io queue which will then call us with 3950 * the MD_STR_BLOCK_OK flag set and we'll busy-wait (if necessary) at 3951 * the top of this routine. 3952 */ 3953 if (!(flag & MD_STR_WAR) && !(flag & MD_STR_BLOCK_OK)) { 3954 mutex_enter(&un->un_suspend_wr_mx); 3955 if (un->un_suspend_wr_flag) { 3956 ps->ps_flags |= MD_MPS_BLOCKABLE_IO; 3957 mutex_exit(&un->un_suspend_wr_mx); 3958 md_unit_readerexit(ui); 3959 daemon_request(&md_mirror_daemon, daemon_io, 3960 (daemon_queue_t *)ps, REQ_OLD); 3961 return; 3962 } 3963 mutex_exit(&un->un_suspend_wr_mx); 3964 } 3965 3966 /* 3967 * If not MN owner and this is an ABR write, make sure the current 3968 * resync region is in the overlaps tree 3969 */ 3970 mutex_enter(&un->un_owner_mx); 3971 if (MD_MNSET_SETNO(setno) && (!(MD_MN_MIRROR_OWNER(un))) && 3972 ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) { 3973 md_mps_t *ps1; 3974 /* Block the current resync region, if not already blocked */ 3975 ps1 = un->un_rs_prev_overlap; 3976 3977 if ((ps1 != NULL) && ((ps1->ps_firstblk != 0) || 3978 (ps1->ps_lastblk != 0))) { 3979 /* Drop locks to avoid deadlock */ 3980 mutex_exit(&un->un_owner_mx); 3981 md_unit_readerexit(ui); 3982 wait_for_overlaps(ps1, MD_OVERLAP_ALLOW_REPEAT); 3983 rs_on_overlap = 1; 3984 (void) md_unit_readerlock(ui); 3985 mutex_enter(&un->un_owner_mx); 3986 /* 3987 * Check to see if we have obtained ownership 3988 * while waiting for overlaps. 
If we have, remove 3989 * the resync_region entry from the overlap tree 3990 */ 3991 if (MD_MN_MIRROR_OWNER(un) && 3992 (ps1->ps_flags & MD_MPS_ON_OVERLAP)) { 3993 mirror_overlap_tree_remove(ps1); 3994 rs_on_overlap = 0; 3995 } 3996 } 3997 } 3998 mutex_exit(&un->un_owner_mx); 3999 4000 4001 /* 4002 * following keep write after read from writing to the 4003 * source in the case where it all came from one place 4004 */ 4005 if (flag & MD_STR_WAR) { 4006 int abort_write = 0; 4007 /* 4008 * We are perfoming a write-after-read. This is either as a 4009 * result of a resync read or as a result of a read in a 4010 * dirty resync region when the optimized resync is not 4011 * complete. If in a MN set and a resync generated i/o, 4012 * if the current block is not in the current 4013 * resync region terminate the write as another node must have 4014 * completed this resync region 4015 */ 4016 if ((MD_MNSET_SETNO(MD_UN2SET(un))) && 4017 (!flag & MD_STR_DIRTY_RD)) { 4018 if (!IN_RESYNC_REGION(un, ps)) 4019 abort_write = 1; 4020 } 4021 if ((select_write_after_read_units(un, ps) == 0) || 4022 (abort_write)) { 4023 #ifdef DEBUG 4024 if (mirror_debug_flag) 4025 printf("Abort resync write on %x, block %lld\n", 4026 MD_SID(un), ps->ps_firstblk); 4027 #endif 4028 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 4029 mirror_overlap_tree_remove(ps); 4030 kmem_cache_free(mirror_parent_cache, ps); 4031 md_kstat_waitq_exit(ui); 4032 md_unit_readerexit(ui); 4033 md_biodone(pb); 4034 return; 4035 } 4036 } else { 4037 select_write_units(un, ps); 4038 4039 /* Drop readerlock to avoid deadlock */ 4040 md_unit_readerexit(ui); 4041 wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT); 4042 un = md_unit_readerlock(ui); 4043 /* 4044 * For a MN set with an ABR write, if we are now the 4045 * owner and we have a resync region in the overlap 4046 * tree, remove the entry from overlaps and retry the write. 4047 */ 4048 4049 if (MD_MNSET_SETNO(setno) && 4050 ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) { 4051 mutex_enter(&un->un_owner_mx); 4052 if (((MD_MN_MIRROR_OWNER(un))) && rs_on_overlap) { 4053 mirror_overlap_tree_remove(ps); 4054 md_kstat_waitq_exit(ui); 4055 mutex_exit(&un->un_owner_mx); 4056 md_unit_readerexit(ui); 4057 daemon_request(&md_mirror_daemon, daemon_io, 4058 (daemon_queue_t *)ps, REQ_OLD); 4059 return; 4060 } 4061 mutex_exit(&un->un_owner_mx); 4062 } 4063 } 4064 4065 /* 4066 * For Multinode mirrors with no owner and a Resync Region (not ABR) 4067 * we need to become the mirror owner before continuing with the 4068 * write(). For ABR mirrors we check that we 'own' the resync if 4069 * we're in write-after-read mode. We do this _after_ ensuring that 4070 * there are no overlaps to ensure that once we know that we are 4071 * the owner, the readerlock will not be released until the write is 4072 * complete. As a change of ownership in a MN set requires the 4073 * writerlock, this ensures that ownership cannot be changed until 4074 * the write is complete. 
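 *
 * When we are not the owner the request is handed off rather than
 * issued from here; in outline:
 *
 *     MD_MN_NO_MIRROR_OWNER(un)
 *         ->  daemon_request(&md_mirror_daemon, become_owner, ps)
 *     become_owner(): synchronous MD_MN_MSG_REQUIRE_OWNER message;
 *         on success  ->  update_resync()  ->  daemon_io()  ->  re-enter
 *         this routine as the owner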
4075 */ 4076 if (MD_MNSET_SETNO(setno) && (!((ui->ui_tstate & MD_ABR_CAP) || 4077 (flag & MD_STR_ABR)) || (flag & MD_STR_WAR))) { 4078 if (MD_MN_NO_MIRROR_OWNER(un)) { 4079 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 4080 mirror_overlap_tree_remove(ps); 4081 md_kstat_waitq_exit(ui); 4082 ASSERT(!(flag & MD_STR_WAR)); 4083 md_unit_readerexit(ui); 4084 daemon_request(&md_mirror_daemon, become_owner, 4085 (daemon_queue_t *)ps, REQ_OLD); 4086 return; 4087 } 4088 } 4089 4090 /* 4091 * Mark resync region if mirror has a Resync Region _and_ we are not 4092 * a resync initiated write(). Don't mark region if we're flagged as 4093 * an ABR write. 4094 */ 4095 if (!((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR)) && 4096 !(flag & MD_STR_WAR)) { 4097 if (mirror_mark_resync_region(un, ps->ps_firstblk, 4098 ps->ps_lastblk, md_mn_mynode_id)) { 4099 pb->b_flags |= B_ERROR; 4100 pb->b_resid = pb->b_bcount; 4101 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 4102 mirror_overlap_tree_remove(ps); 4103 kmem_cache_free(mirror_parent_cache, ps); 4104 md_kstat_waitq_exit(ui); 4105 md_unit_readerexit(ui); 4106 md_biodone(pb); 4107 return; 4108 } 4109 } 4110 4111 ps->ps_childbflags = pb->b_flags | B_WRITE; 4112 ps->ps_childbflags &= ~B_READ; 4113 if (flag & MD_STR_MAPPED) 4114 ps->ps_childbflags &= ~B_PAGEIO; 4115 4116 if (!(flag & MD_STR_NOTTOP) && panicstr) 4117 /* Disable WOW and don't free ps */ 4118 ps->ps_flags |= (MD_MPS_WOW|MD_MPS_DONTFREE); 4119 4120 md_kstat_waitq_to_runq(ui); 4121 4122 /* 4123 * Treat Raw and Direct I/O as Write-on-Write always 4124 */ 4125 4126 if (!(md_mirror_wow_flg & WOW_DISABLE) && 4127 (md_mirror_wow_flg & WOW_PHYS_ENABLE) && 4128 (pb->b_flags & B_PHYS) && 4129 !(ps->ps_flags & MD_MPS_WOW)) { 4130 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 4131 mirror_overlap_tree_remove(ps); 4132 md_unit_readerexit(ui); 4133 daemon_request(&md_mstr_daemon, handle_wow, 4134 (daemon_queue_t *)ps, REQ_OLD); 4135 return; 4136 } 4137 4138 ps->ps_frags = 1; 4139 do { 4140 cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS); 4141 mirror_child_init(cs); 4142 cb = &cs->cs_buf; 4143 more = mirror_map_write(un, cs, ps, (flag & MD_STR_WAR)); 4144 4145 /* 4146 * This handles the case where we're requesting 4147 * a write to block 0 on a label partition. (more < 0) 4148 * means that the request size was smaller than the 4149 * size of the label. If so this request is done. 
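 * No child i/o is issued in that case; the parent buf is simply
 * completed via md_biodone() below.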
4150 */ 4151 if (more < 0) { 4152 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 4153 mirror_overlap_tree_remove(ps); 4154 md_kstat_runq_exit(ui); 4155 kmem_cache_free(mirror_child_cache, cs); 4156 kmem_cache_free(mirror_parent_cache, ps); 4157 md_unit_readerexit(ui); 4158 md_biodone(pb); 4159 return; 4160 } 4161 if (more) { 4162 mutex_enter(&ps->ps_mx); 4163 ps->ps_frags++; 4164 mutex_exit(&ps->ps_mx); 4165 } 4166 md_call_strategy(cb, flag, private); 4167 } while (more); 4168 4169 if (!(flag & MD_STR_NOTTOP) && panicstr) { 4170 while (!(ps->ps_flags & MD_MPS_DONE)) { 4171 md_daemon(1, &md_done_daemon); 4172 drv_usecwait(10); 4173 } 4174 kmem_cache_free(mirror_parent_cache, ps); 4175 } 4176 } 4177 4178 static void 4179 mirror_read_strategy(buf_t *pb, int flag, void *private) 4180 { 4181 md_mps_t *ps; 4182 md_mcs_t *cs; 4183 size_t more; 4184 mm_unit_t *un; 4185 mdi_unit_t *ui; 4186 size_t current_count; 4187 diskaddr_t current_blkno; 4188 off_t current_offset; 4189 buf_t *cb; /* child buf pointer */ 4190 set_t setno; 4191 4192 ui = MDI_UNIT(getminor(pb->b_edev)); 4193 4194 md_kstat_waitq_enter(ui); 4195 4196 un = (mm_unit_t *)md_unit_readerlock(ui); 4197 4198 if (!(flag & MD_STR_NOTTOP)) { 4199 if (md_checkbuf(ui, (md_unit_t *)un, pb)) { 4200 md_kstat_waitq_exit(ui); 4201 return; 4202 } 4203 } 4204 4205 if (private == NULL) { 4206 ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS); 4207 mirror_parent_init(ps); 4208 } else { 4209 ps = private; 4210 private = NULL; 4211 } 4212 4213 if (flag & MD_STR_MAPPED) 4214 ps->ps_flags |= MD_MPS_MAPPED; 4215 if (flag & MD_NOBLOCK) 4216 ps->ps_flags |= MD_MPS_NOBLOCK; 4217 if (flag & MD_STR_WMUPDATE) 4218 ps->ps_flags |= MD_MPS_WMUPDATE; 4219 4220 /* 4221 * Check to see if this is a DMR driven read. If so we need to use the 4222 * specified side (in un->un_dmr_last_read) for the source of the data. 4223 */ 4224 if (flag & MD_STR_DMR) 4225 ps->ps_flags |= MD_MPS_DMR; 4226 4227 /* 4228 * Save essential information from the original buffhdr 4229 * in the md_save structure. 4230 */ 4231 ps->ps_un = un; 4232 ps->ps_ui = ui; 4233 ps->ps_bp = pb; 4234 ps->ps_addr = pb->b_un.b_addr; 4235 ps->ps_firstblk = pb->b_lblkno; 4236 ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1; 4237 ps->ps_changecnt = un->un_changecnt; 4238 4239 current_count = btodb(pb->b_bcount); 4240 current_blkno = pb->b_lblkno; 4241 current_offset = 0; 4242 4243 /* 4244 * If flag has MD_STR_WAR set this means that the read is issued by a 4245 * resync thread which may or may not be an optimised resync. 4246 * 4247 * If MD_UN_OPT_NOT_DONE is set this means that the optimized resync 4248 * code has not completed; either a resync has not started since snarf, 4249 * or there is an optimized resync in progress. 4250 * 4251 * We need to generate a write after this read in the following two 4252 * cases, 4253 * 4254 * 1. Any Resync-Generated read 4255 * 4256 * 2. Any read to a DIRTY REGION if there is an optimized resync 4257 * pending or in progress. 4258 * 4259 * The write after read is done in these cases to ensure that all sides 4260 * of the mirror are in sync with the read data and that it is not 4261 * possible for an application to read the same block multiple times 4262 * and get different data. 4263 * 4264 * This would be possible if the block was in a dirty region. 4265 * 4266 * If we're performing a directed read we don't write the data out as 4267 * the application is responsible for restoring the mirror to a known 4268 * state. 
4269 */ 4270 if (((MD_STATUS(un) & MD_UN_OPT_NOT_DONE) || (flag & MD_STR_WAR)) && 4271 !(flag & MD_STR_DMR)) { 4272 size_t start_rr, i, end_rr; 4273 int region_dirty = 1; 4274 4275 /* 4276 * We enter here under three circumstances, 4277 * 4278 * MD_UN_OPT_NOT_DONE MD_STR_WAR 4279 * 0 1 4280 * 1 0 4281 * 1 1 4282 * 4283 * To be optimal we only care to explicitly check for dirty 4284 * regions in the second case since if MD_STR_WAR is set we 4285 * always do the write after read. 4286 */ 4287 if (!(flag & MD_STR_WAR)) { 4288 BLK_TO_RR(end_rr, ps->ps_lastblk, un); 4289 BLK_TO_RR(start_rr, ps->ps_firstblk, un); 4290 4291 for (i = start_rr; i <= end_rr; i++) 4292 if ((region_dirty = IS_KEEPDIRTY(i, un)) != 0) 4293 break; 4294 } 4295 4296 if ((region_dirty) && 4297 !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) { 4298 ps->ps_call = write_after_read; 4299 /* 4300 * Mark this as a RESYNC_READ in ps_flags. 4301 * This is used if the read fails during a 4302 * resync of a 3-way mirror to ensure that 4303 * the retried read to the remaining 4304 * good submirror has MD_STR_WAR set. This 4305 * is needed to ensure that the resync write 4306 * (write-after-read) takes place. 4307 */ 4308 ps->ps_flags |= MD_MPS_RESYNC_READ; 4309 4310 /* 4311 * If MD_STR_FLAG_ERR is set in the flags we 4312 * set MD_MPS_FLAG_ERROR so that an error on the resync 4313 * write (issued by write_after_read) will be flagged 4314 * to the biowait'ing resync thread. This allows us to 4315 * avoid issuing further resync requests to a device 4316 * that has had a write failure. 4317 */ 4318 if (flag & MD_STR_FLAG_ERR) 4319 ps->ps_flags |= MD_MPS_FLAG_ERROR; 4320 4321 setno = MD_UN2SET(un); 4322 /* 4323 * Drop the readerlock to avoid 4324 * deadlock 4325 */ 4326 md_unit_readerexit(ui); 4327 wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT); 4328 un = md_unit_readerlock(ui); 4329 /* 4330 * Ensure that we are owner 4331 */ 4332 if (MD_MNSET_SETNO(setno)) { 4333 /* 4334 * For a non-resync read that requires a 4335 * write-after-read to be done, set a flag 4336 * in the parent structure, so that the 4337 * write_strategy routine can omit the 4338 * test that the write is still within the 4339 * resync region 4340 */ 4341 if (!(flag & MD_STR_WAR)) 4342 ps->ps_flags |= MD_MPS_DIRTY_RD; 4343 4344 /* 4345 * Before reading the buffer, see if 4346 * there is an owner. 4347 */ 4348 if (MD_MN_NO_MIRROR_OWNER(un)) { 4349 ps->ps_call = NULL; 4350 mirror_overlap_tree_remove(ps); 4351 md_kstat_waitq_exit(ui); 4352 md_unit_readerexit(ui); 4353 daemon_request( 4354 &md_mirror_daemon, 4355 become_owner, 4356 (daemon_queue_t *)ps, 4357 REQ_OLD); 4358 return; 4359 } 4360 /* 4361 * For a resync read, check to see if I/O is 4362 * outside of the current resync region, or 4363 * the resync has finished. 
If so 4364 * just terminate the I/O 4365 */ 4366 if ((flag & MD_STR_WAR) && 4367 (!(un->c.un_status & MD_UN_WAR) || 4368 (!IN_RESYNC_REGION(un, ps)))) { 4369 #ifdef DEBUG 4370 if (mirror_debug_flag) 4371 printf("Abort resync read " 4372 "%x: %lld\n", 4373 MD_SID(un), 4374 ps->ps_firstblk); 4375 #endif 4376 mirror_overlap_tree_remove(ps); 4377 kmem_cache_free(mirror_parent_cache, 4378 ps); 4379 md_kstat_waitq_exit(ui); 4380 md_unit_readerexit(ui); 4381 md_biodone(pb); 4382 return; 4383 } 4384 } 4385 } 4386 } 4387 4388 if (flag & MD_STR_DMR) { 4389 ps->ps_call = directed_read_done; 4390 } 4391 4392 if (!(flag & MD_STR_NOTTOP) && panicstr) 4393 ps->ps_flags |= MD_MPS_DONTFREE; 4394 4395 md_kstat_waitq_to_runq(ui); 4396 4397 ps->ps_frags++; 4398 do { 4399 cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS); 4400 mirror_child_init(cs); 4401 cb = &cs->cs_buf; 4402 cs->cs_ps = ps; 4403 4404 cb = md_bioclone(pb, current_offset, current_count, NODEV, 4405 current_blkno, mirror_done, cb, KM_NOSLEEP); 4406 4407 more = mirror_map_read(ps, cs, current_blkno, 4408 (u_longlong_t)current_count); 4409 if (more) { 4410 mutex_enter(&ps->ps_mx); 4411 ps->ps_frags++; 4412 mutex_exit(&ps->ps_mx); 4413 } 4414 4415 /* 4416 * Do these calculations now, 4417 * so that we pickup a valid b_bcount from the chld_bp. 4418 */ 4419 current_count -= more; 4420 current_offset += cb->b_bcount; 4421 current_blkno += more; 4422 md_call_strategy(cb, flag, private); 4423 } while (more); 4424 4425 if (!(flag & MD_STR_NOTTOP) && panicstr) { 4426 while (!(ps->ps_flags & MD_MPS_DONE)) { 4427 md_daemon(1, &md_done_daemon); 4428 drv_usecwait(10); 4429 } 4430 kmem_cache_free(mirror_parent_cache, ps); 4431 } 4432 } 4433 4434 void 4435 md_mirror_strategy(buf_t *bp, int flag, void *private) 4436 { 4437 set_t setno = MD_MIN2SET(getminor(bp->b_edev)); 4438 4439 /* 4440 * When doing IO to a multi owner meta device, check if set is halted. 4441 * We do this check without the needed lock held, for performance 4442 * reasons. 4443 * If an IO just slips through while the set is locked via an 4444 * MD_MN_SUSPEND_SET, we don't care about it. 4445 * Only check for suspension if we are a top-level i/o request 4446 * (MD_STR_NOTTOP is cleared in 'flag'). 4447 */ 4448 if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) == 4449 (MD_SET_HALTED | MD_SET_MNSET)) { 4450 if ((flag & MD_STR_NOTTOP) == 0) { 4451 mutex_enter(&md_mx); 4452 /* Here we loop until the set is no longer halted */ 4453 while (md_set[setno].s_status & MD_SET_HALTED) { 4454 cv_wait(&md_cv, &md_mx); 4455 } 4456 mutex_exit(&md_mx); 4457 } 4458 } 4459 4460 if ((flag & MD_IO_COUNTED) == 0) { 4461 if ((flag & MD_NOBLOCK) == 0) { 4462 if (md_inc_iocount(setno) != 0) { 4463 bp->b_flags |= B_ERROR; 4464 bp->b_error = ENXIO; 4465 bp->b_resid = bp->b_bcount; 4466 biodone(bp); 4467 return; 4468 } 4469 } else { 4470 md_inc_iocount_noblock(setno); 4471 } 4472 } 4473 4474 if (bp->b_flags & B_READ) 4475 mirror_read_strategy(bp, flag, private); 4476 else 4477 mirror_write_strategy(bp, flag, private); 4478 } 4479 4480 /* 4481 * mirror_directed_read: 4482 * -------------------- 4483 * Entry-point for the DKIOCDMR ioctl. We issue a read to a specified sub-mirror 4484 * so that the application can determine what (if any) resync needs to be 4485 * performed. The data is copied out to the user-supplied buffer. 
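 *
 * An illustrative userland sketch (hypothetical descriptor and buffer
 * names; error handling omitted; not taken from any existing utility).
 * The caller starts with DKV_SIDE_INIT and reissues the ioctl, passing
 * back the vdr_side value returned by the previous call, until
 * DKV_DMR_DONE (or DKV_DMR_ERROR) is reported:
 *
 *	vol_directed_rd_t vdr;
 *
 *	(void) memset(&vdr, 0, sizeof (vdr));
 *	vdr.vdr_flags = DKV_DMR_NEXT_SIDE;
 *	vdr.vdr_side = DKV_SIDE_INIT;
 *	vdr.vdr_offset = 0;		(must be DEV_BSIZE aligned)
 *	vdr.vdr_nbytes = DEV_BSIZE;
 *	vdr.vdr_data = databuf;
 *	do {
 *		if (ioctl(mirror_fd, DKIOCDMR, &vdr) != 0)
 *			break;
 *		(compare/record vdr.vdr_bytesread bytes of databuf
 *		for submirror vdr.vdr_side_name here)
 *	} while (!(vdr.vdr_flags & (DKV_DMR_DONE | DKV_DMR_ERROR)));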
4486 * 4487 * Parameters: 4488 * mdev - dev_t for the mirror device 4489 * vdr - directed read parameters specifying location and submirror 4490 * to perform the read from 4491 * mode - used to ddi_copyout() any resulting data from the read 4492 * 4493 * Returns: 4494 * 0 success 4495 * !0 error code 4496 * EINVAL - invalid request format 4497 */ 4498 int 4499 mirror_directed_read(dev_t mdev, vol_directed_rd_t *vdr, int mode) 4500 { 4501 buf_t *bp; 4502 minor_t mnum = getminor(mdev); 4503 mdi_unit_t *ui = MDI_UNIT(mnum); 4504 mm_unit_t *un; 4505 mm_submirror_t *sm; 4506 char *sm_nm; 4507 uint_t next_side; 4508 void *kbuffer; 4509 4510 if (ui == NULL) 4511 return (ENXIO); 4512 4513 if (!(vdr->vdr_flags & DKV_DMR_NEXT_SIDE)) { 4514 return (EINVAL); 4515 } 4516 4517 /* Check for aligned block access. We disallow non-aligned requests. */ 4518 if (vdr->vdr_offset % DEV_BSIZE) { 4519 return (EINVAL); 4520 } 4521 4522 /* 4523 * Allocate kernel buffer for target of read(). If we had a reliable 4524 * (sorry functional) DDI this wouldn't be needed. 4525 */ 4526 kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP); 4527 if (kbuffer == NULL) { 4528 cmn_err(CE_WARN, "mirror_directed_read: couldn't allocate %lx" 4529 " bytes\n", vdr->vdr_nbytes); 4530 return (ENOMEM); 4531 } 4532 4533 bp = getrbuf(KM_SLEEP); 4534 4535 bp->b_un.b_addr = kbuffer; 4536 bp->b_flags = B_READ; 4537 bp->b_bcount = vdr->vdr_nbytes; 4538 bp->b_lblkno = lbtodb(vdr->vdr_offset); 4539 bp->b_edev = mdev; 4540 4541 un = md_unit_readerlock(ui); 4542 4543 /* 4544 * If DKV_SIDE_INIT is set we need to determine the first available 4545 * side to start reading from. If it isn't set we increment to the 4546 * next readable submirror. 4547 * If there are no readable submirrors we error out with DKV_DMR_ERROR. 4548 * Note: we check for a readable submirror on completion of the i/o so 4549 * we should _always_ have one available. If this becomes unavailable 4550 * we have missed the 'DKV_DMR_DONE' opportunity. This could happen if 4551 * a metadetach is made between the completion of one DKIOCDMR ioctl 4552 * and the start of the next (i.e. a sys-admin 'accident' occurred). 4553 * The chance of this is small, but not non-existent. 4554 */ 4555 if (vdr->vdr_side == DKV_SIDE_INIT) { 4556 next_side = 0; 4557 } else { 4558 next_side = vdr->vdr_side + 1; 4559 } 4560 while ((next_side < NMIRROR) && 4561 !SUBMIRROR_IS_READABLE(un, next_side)) 4562 next_side++; 4563 if (next_side >= NMIRROR) { 4564 vdr->vdr_flags |= DKV_DMR_ERROR; 4565 freerbuf(bp); 4566 vdr->vdr_bytesread = 0; 4567 md_unit_readerexit(ui); 4568 return (0); 4569 } 4570 4571 /* Set the side to read from */ 4572 un->un_dmr_last_read = next_side; 4573 4574 md_unit_readerexit(ui); 4575 4576 /* 4577 * Save timestamp for verification purposes. Can be read by debugger 4578 * to verify that this ioctl has been executed and to find the number 4579 * of DMR reads and the time of the last DMR read. 4580 */ 4581 uniqtime(&mirror_dmr_stats.dmr_timestamp); 4582 mirror_dmr_stats.dmr_count++; 4583 4584 /* Issue READ request and wait for completion */ 4585 mirror_read_strategy(bp, MD_STR_DMR|MD_NOBLOCK|MD_STR_NOTTOP, NULL); 4586 4587 mutex_enter(&un->un_dmr_mx); 4588 cv_wait(&un->un_dmr_cv, &un->un_dmr_mx); 4589 mutex_exit(&un->un_dmr_mx); 4590 4591 /* 4592 * Check to see if we encountered an error during the read. If so we 4593 * can make no guarantee about any possibly returned data. 
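 * On success DKV_DMR_SUCCESS (or DKV_DMR_SHORT together with the byte
 * count in vdr_bytesread) is reported back to the caller; on failure
 * DKV_DMR_ERROR is set and no data is copied out.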
4594 */ 4595 if ((bp->b_flags & B_ERROR) == 0) { 4596 vdr->vdr_flags &= ~DKV_DMR_ERROR; 4597 if (bp->b_resid) { 4598 vdr->vdr_flags |= DKV_DMR_SHORT; 4599 vdr->vdr_bytesread = vdr->vdr_nbytes - bp->b_resid; 4600 } else { 4601 vdr->vdr_flags |= DKV_DMR_SUCCESS; 4602 vdr->vdr_bytesread = vdr->vdr_nbytes; 4603 } 4604 /* Copy the data read back out to the user supplied buffer */ 4605 if (ddi_copyout(kbuffer, vdr->vdr_data, vdr->vdr_bytesread, 4606 mode)) { 4607 kmem_free(kbuffer, vdr->vdr_nbytes); 4608 return (EFAULT); 4609 } 4610 4611 } else { 4612 /* Error out with DKV_DMR_ERROR */ 4613 vdr->vdr_flags |= DKV_DMR_ERROR; 4614 vdr->vdr_flags &= ~(DKV_DMR_SUCCESS|DKV_DMR_SHORT|DKV_DMR_DONE); 4615 } 4616 /* 4617 * Update the DMR parameters with the side and name of submirror that 4618 * we have just read from (un->un_dmr_last_read) 4619 */ 4620 un = md_unit_readerlock(ui); 4621 4622 vdr->vdr_side = un->un_dmr_last_read; 4623 sm = &un->un_sm[un->un_dmr_last_read]; 4624 sm_nm = md_shortname(md_getminor(sm->sm_dev)); 4625 4626 (void) strncpy(vdr->vdr_side_name, sm_nm, sizeof (vdr->vdr_side_name)); 4627 4628 /* 4629 * Determine if we've completed the read cycle. This is true iff the 4630 * next computed submirror (side) equals or exceeds NMIRROR. We cannot 4631 * use un_nsm as we need to handle a sparse array of submirrors (which 4632 * can occur if a submirror is metadetached). 4633 */ 4634 next_side = un->un_dmr_last_read + 1; 4635 while ((next_side < NMIRROR) && 4636 !SUBMIRROR_IS_READABLE(un, next_side)) 4637 next_side++; 4638 if (next_side >= NMIRROR) { 4639 /* We've finished */ 4640 vdr->vdr_flags |= DKV_DMR_DONE; 4641 } 4642 4643 md_unit_readerexit(ui); 4644 freerbuf(bp); 4645 kmem_free(kbuffer, vdr->vdr_nbytes); 4646 4647 return (0); 4648 } 4649 4650 /* 4651 * mirror_resync_message: 4652 * --------------------- 4653 * Handle the multi-node resync messages that keep all nodes within a given 4654 * disk-set in sync with their view of a mirror's resync status. 4655 * 4656 * The message types dealt with are: 4657 * MD_MN_MSG_RESYNC_STARTING - start a resync thread for a unit 4658 * MD_MN_MSG_RESYNC_NEXT - specified next region to be resynced 4659 * MD_MN_MSG_RESYNC_FINISH - stop the resync thread for a unit 4660 * MD_MN_MSG_RESYNC_PHASE_DONE - end of a resync phase, opt, submirror or comp 4661 * 4662 * Returns: 4663 * 0 Success 4664 * >0 Failure error number 4665 */ 4666 int 4667 mirror_resync_message(md_mn_rs_params_t *p, IOLOCK *lockp) 4668 { 4669 mdi_unit_t *ui; 4670 mm_unit_t *un; 4671 set_t setno; 4672 int is_ABR; 4673 int smi; 4674 int ci; 4675 sm_state_t state; 4676 int broke_out; 4677 mm_submirror_t *sm; 4678 mm_submirror_ic_t *smic; 4679 md_m_shared_t *shared; 4680 md_error_t mde = mdnullerror; 4681 md_mps_t *ps; 4682 int rs_active; 4683 int rr, rr_start, rr_end; 4684 4685 /* Check that the given device is part of a multi-node set */ 4686 setno = MD_MIN2SET(p->mnum); 4687 if (setno >= md_nsets) { 4688 return (ENXIO); 4689 } 4690 if (!MD_MNSET_SETNO(setno)) { 4691 return (EINVAL); 4692 } 4693 4694 if ((un = mirror_getun(p->mnum, &p->mde, NO_LOCK, NULL)) == NULL) 4695 return (EINVAL); 4696 if ((ui = MDI_UNIT(p->mnum)) == NULL) 4697 return (EINVAL); 4698 is_ABR = (ui->ui_tstate & MD_ABR_CAP); 4699 4700 /* Obtain the current resync status */ 4701 (void) md_ioctl_readerlock(lockp, ui); 4702 rs_active = (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ? 
1 : 0; 4703 md_ioctl_readerexit(lockp); 4704 4705 switch ((md_mn_msgtype_t)p->msg_type) { 4706 case MD_MN_MSG_RESYNC_STARTING: 4707 /* Start the resync thread for the mirror */ 4708 (void) mirror_resync_unit(p->mnum, NULL, &p->mde, lockp); 4709 break; 4710 4711 case MD_MN_MSG_RESYNC_NEXT: 4712 /* 4713 * We have to release any previously marked overlap regions 4714 * so that i/o can resume. Then we need to block the region 4715 * from [rs_start..rs_start+rs_size) * so that no i/o is issued. 4716 * Update un_rs_resync_done and un_rs_resync_2_do. 4717 */ 4718 (void) md_ioctl_readerlock(lockp, ui); 4719 /* 4720 * Ignore the message if there is no active resync thread or 4721 * if it is for a resync type that we have already completed. 4722 * un_resync_completed is set to the last resync completed 4723 * when processing a PHASE_DONE message. 4724 */ 4725 if (!rs_active || (p->rs_type == un->un_resync_completed)) 4726 break; 4727 /* 4728 * If this message is for the same resync and is for an earlier 4729 * resync region, just ignore it. This can only occur if this 4730 * node has progressed on to the next resync region before 4731 * we receive this message. This can occur if the class for 4732 * this message is busy and the originator has to retry thus 4733 * allowing this node to move onto the next resync_region. 4734 */ 4735 if ((p->rs_type == un->un_rs_type) && 4736 (p->rs_start < un->un_resync_startbl)) 4737 break; 4738 ps = un->un_rs_prev_overlap; 4739 4740 /* Allocate previous overlap reference if needed */ 4741 if (ps == NULL) { 4742 ps = kmem_cache_alloc(mirror_parent_cache, 4743 MD_ALLOCFLAGS); 4744 ps->ps_un = un; 4745 ps->ps_ui = ui; 4746 ps->ps_firstblk = 0; 4747 ps->ps_lastblk = 0; 4748 ps->ps_flags = 0; 4749 md_ioctl_readerexit(lockp); 4750 (void) md_ioctl_writerlock(lockp, ui); 4751 un->un_rs_prev_overlap = ps; 4752 md_ioctl_writerexit(lockp); 4753 } else 4754 md_ioctl_readerexit(lockp); 4755 4756 if (p->rs_originator != md_mn_mynode_id) { 4757 /* 4758 * Clear our un_resync_bm for the regions completed. 4759 * The owner (originator) will take care of itself. 4760 */ 4761 BLK_TO_RR(rr_end, ps->ps_lastblk, un); 4762 BLK_TO_RR(rr_start, p->rs_start, un); 4763 if (ps->ps_lastblk && rr_end < rr_start) { 4764 BLK_TO_RR(rr_start, ps->ps_firstblk, un); 4765 mutex_enter(&un->un_resync_mx); 4766 /* 4767 * Update our resync bitmap to reflect that 4768 * another node has synchronized this range. 4769 */ 4770 for (rr = rr_start; rr <= rr_end; rr++) { 4771 CLR_KEEPDIRTY(rr, un); 4772 } 4773 mutex_exit(&un->un_resync_mx); 4774 } 4775 4776 /* 4777 * On all but the originating node, first update 4778 * the resync state, then unblock the previous 4779 * region and block the next one. No need 4780 * to do this if the region is already blocked. 4781 * Update the submirror state and flags from the 4782 * originator. This keeps the cluster in sync with 4783 * regards to the resync status. 
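 * The newly blocked range is [rs_start, rs_start + rs_size - 1]
 * blocks.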
4784 */ 4785 4786 (void) md_ioctl_writerlock(lockp, ui); 4787 un->un_rs_resync_done = p->rs_done; 4788 un->un_rs_resync_2_do = p->rs_2_do; 4789 un->un_rs_type = p->rs_type; 4790 un->un_resync_startbl = p->rs_start; 4791 md_ioctl_writerexit(lockp); 4792 /* 4793 * Use un_owner_mx to ensure that an ownership change 4794 * cannot happen at the same time as this message 4795 */ 4796 mutex_enter(&un->un_owner_mx); 4797 if (MD_MN_MIRROR_OWNER(un)) { 4798 ps->ps_firstblk = p->rs_start; 4799 ps->ps_lastblk = ps->ps_firstblk + 4800 p->rs_size - 1; 4801 } else { 4802 if ((ps->ps_firstblk != p->rs_start) || 4803 (ps->ps_lastblk != p->rs_start + 4804 p->rs_size - 1)) { 4805 /* Remove previous overlap range */ 4806 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 4807 mirror_overlap_tree_remove(ps); 4808 4809 ps->ps_firstblk = p->rs_start; 4810 ps->ps_lastblk = ps->ps_firstblk + 4811 p->rs_size - 1; 4812 4813 mutex_exit(&un->un_owner_mx); 4814 /* Block this range from all i/o. */ 4815 if (ps->ps_firstblk != 0 || 4816 ps->ps_lastblk != 0) 4817 wait_for_overlaps(ps, 4818 MD_OVERLAP_ALLOW_REPEAT); 4819 mutex_enter(&un->un_owner_mx); 4820 /* 4821 * Check to see if we have obtained 4822 * ownership while waiting for 4823 * overlaps. If we have, remove 4824 * the resync_region entry from the 4825 * overlap tree 4826 */ 4827 if (MD_MN_MIRROR_OWNER(un) && 4828 (ps->ps_flags & MD_MPS_ON_OVERLAP)) 4829 mirror_overlap_tree_remove(ps); 4830 } 4831 } 4832 mutex_exit(&un->un_owner_mx); 4833 4834 /* 4835 * If this is the first RESYNC_NEXT message (i.e. 4836 * MD_MN_RS_FIRST_RESYNC_NEXT set in p->rs_flags), 4837 * issue RESYNC_START NOTIFY event 4838 */ 4839 if (p->rs_flags & MD_MN_RS_FIRST_RESYNC_NEXT) { 4840 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_START, 4841 SVM_TAG_METADEVICE, MD_UN2SET(un), 4842 MD_SID(un)); 4843 } 4844 4845 /* Ensure that our local resync thread is running */ 4846 if (un->un_rs_thread == NULL) { 4847 (void) mirror_resync_unit(p->mnum, NULL, 4848 &p->mde, lockp); 4849 } 4850 } 4851 4852 break; 4853 case MD_MN_MSG_RESYNC_FINISH: 4854 /* 4855 * Complete the resync by stopping the resync thread. 4856 * Also release the previous overlap region field. 4857 * Update the resync_progress_thread by cv_signal'ing it so 4858 * that we mark the end of the resync as soon as possible. This 4859 * stops an unnecessary delay should be panic after resync 4860 * completion. 
4861 */ 4862 #ifdef DEBUG 4863 if (!rs_active) { 4864 if (mirror_debug_flag) 4865 printf("RESYNC_FINISH (mnum = %x), " 4866 "Resync *NOT* active", 4867 p->mnum); 4868 } 4869 #endif 4870 4871 if ((un->c.un_status & MD_UN_RESYNC_ACTIVE) && 4872 (p->rs_originator != md_mn_mynode_id)) { 4873 mutex_enter(&un->un_rs_thread_mx); 4874 un->c.un_status &= ~MD_UN_RESYNC_CANCEL; 4875 un->un_rs_thread_flags |= MD_RI_SHUTDOWN; 4876 un->un_rs_thread_flags &= 4877 ~(MD_RI_BLOCK|MD_RI_BLOCK_OWNER); 4878 cv_signal(&un->un_rs_thread_cv); 4879 mutex_exit(&un->un_rs_thread_mx); 4880 } 4881 if (is_ABR) { 4882 /* Resync finished, if ABR set owner to NULL */ 4883 mutex_enter(&un->un_owner_mx); 4884 un->un_mirror_owner = 0; 4885 mutex_exit(&un->un_owner_mx); 4886 } 4887 (void) md_ioctl_writerlock(lockp, ui); 4888 ps = un->un_rs_prev_overlap; 4889 if (ps != NULL) { 4890 /* Remove previous overlap range */ 4891 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 4892 mirror_overlap_tree_remove(ps); 4893 /* 4894 * Release the overlap range reference 4895 */ 4896 un->un_rs_prev_overlap = NULL; 4897 kmem_cache_free(mirror_parent_cache, 4898 ps); 4899 } 4900 md_ioctl_writerexit(lockp); 4901 4902 /* Mark the resync as complete in the metadb */ 4903 un->un_rs_resync_done = p->rs_done; 4904 un->un_rs_resync_2_do = p->rs_2_do; 4905 un->un_rs_type = p->rs_type; 4906 mutex_enter(&un->un_rs_progress_mx); 4907 cv_signal(&un->un_rs_progress_cv); 4908 mutex_exit(&un->un_rs_progress_mx); 4909 4910 un = md_ioctl_writerlock(lockp, ui); 4911 un->c.un_status &= ~MD_UN_RESYNC_ACTIVE; 4912 /* Deal with any pending grow_unit */ 4913 if (un->c.un_status & MD_UN_GROW_PENDING) { 4914 if ((mirror_grow_unit(un, &mde) != 0) || 4915 (! mdismderror(&mde, MDE_GROW_DELAYED))) { 4916 un->c.un_status &= ~MD_UN_GROW_PENDING; 4917 } 4918 } 4919 md_ioctl_writerexit(lockp); 4920 break; 4921 4922 case MD_MN_MSG_RESYNC_PHASE_DONE: 4923 /* 4924 * A phase of the resync, optimized. component or 4925 * submirror is complete. Update mirror status. 4926 * If the flag CLEAR_OPT_NOT_DONE is set, it means that the 4927 * mirror owner is peforming a resync. If we have just snarfed 4928 * this set, then we must clear any of the flags set at snarf 4929 * time by unit_setup_resync(). 4930 * Note that unit_setup_resync() sets up these flags to 4931 * indicate that an optimized resync is required. These flags 4932 * need to be reset because if we get here, the mirror owner 4933 * will have handled the optimized resync. 4934 * The flags that must be cleared are MD_UN_OPT_NOT_DONE and 4935 * MD_UN_WAR. In addition, for each submirror, 4936 * MD_SM_RESYNC_TARGET must be cleared and SMS_OFFLINE_RESYNC 4937 * set to SMS_OFFLINE. 4938 */ 4939 #ifdef DEBUG 4940 if (mirror_debug_flag) 4941 printf("phase done mess received from %d, mnum=%x," 4942 "type=%x, flags=%x\n", p->rs_originator, p->mnum, 4943 p->rs_type, p->rs_flags); 4944 #endif 4945 /* 4946 * Ignore the message if there is no active resync thread. 4947 */ 4948 if (!rs_active) 4949 break; 4950 4951 broke_out = p->rs_flags & MD_MN_RS_ERR; 4952 switch (RS_TYPE(p->rs_type)) { 4953 case MD_RS_OPTIMIZED: 4954 un = md_ioctl_writerlock(lockp, ui); 4955 if (p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE) { 4956 /* If we are originator, just clear rs_type */ 4957 if (p->rs_originator == md_mn_mynode_id) { 4958 SET_RS_TYPE_NONE(un->un_rs_type); 4959 md_ioctl_writerexit(lockp); 4960 break; 4961 } 4962 /* 4963 * If CLEAR_OPT_NOT_DONE is set, only clear the 4964 * flags if OPT_NOT_DONE is set *and* rs_type 4965 * is MD_RS_NONE. 
4966 */ 4967 if ((un->c.un_status & MD_UN_OPT_NOT_DONE) && 4968 (RS_TYPE(un->un_rs_type) == MD_RS_NONE)) { 4969 /* No resync in progress */ 4970 un->c.un_status &= ~MD_UN_OPT_NOT_DONE; 4971 un->c.un_status &= ~MD_UN_WAR; 4972 } else { 4973 /* 4974 * We are in the middle of an 4975 * optimized resync and this message 4976 * should be ignored. 4977 */ 4978 md_ioctl_writerexit(lockp); 4979 break; 4980 } 4981 } else { 4982 /* 4983 * This is the end of an optimized resync, 4984 * clear the OPT_NOT_DONE and OFFLINE_SM flags 4985 */ 4986 4987 un->c.un_status &= ~MD_UN_KEEP_DIRTY; 4988 if (!broke_out) 4989 un->c.un_status &= ~MD_UN_WAR; 4990 4991 /* 4992 * Clear our un_resync_bm for the regions 4993 * completed. The owner (originator) will 4994 * take care of itself. 4995 */ 4996 if (p->rs_originator != md_mn_mynode_id && 4997 (ps = un->un_rs_prev_overlap) != NULL) { 4998 BLK_TO_RR(rr_start, ps->ps_firstblk, 4999 un); 5000 BLK_TO_RR(rr_end, ps->ps_lastblk, un); 5001 mutex_enter(&un->un_resync_mx); 5002 for (rr = rr_start; rr <= rr_end; 5003 rr++) { 5004 CLR_KEEPDIRTY(rr, un); 5005 } 5006 mutex_exit(&un->un_resync_mx); 5007 } 5008 } 5009 5010 /* 5011 * Set resync_completed to last resync type and then 5012 * clear resync_type to indicate no resync in progress 5013 */ 5014 un->un_resync_completed = un->un_rs_type; 5015 SET_RS_TYPE_NONE(un->un_rs_type); 5016 5017 /* 5018 * If resync is as a result of a submirror ONLINE, 5019 * reset the submirror state to SMS_RUNNING if the 5020 * resync was ok else set back to SMS_OFFLINE. 5021 */ 5022 for (smi = 0; smi < NMIRROR; smi++) { 5023 un->un_sm[smi].sm_flags &= 5024 ~MD_SM_RESYNC_TARGET; 5025 if (SMS_BY_INDEX_IS(un, smi, 5026 SMS_OFFLINE_RESYNC)) { 5027 if (p->rs_flags & 5028 MD_MN_RS_CLEAR_OPT_NOT_DONE) { 5029 state = SMS_OFFLINE; 5030 } else { 5031 state = (broke_out ? 5032 SMS_OFFLINE : SMS_RUNNING); 5033 } 5034 mirror_set_sm_state( 5035 &un->un_sm[smi], 5036 &un->un_smic[smi], state, 5037 broke_out); 5038 mirror_commit(un, NO_SUBMIRRORS, 5039 0); 5040 } 5041 /* 5042 * If we still have an offline submirror, reset 5043 * the OFFLINE_SM flag in the mirror status 5044 */ 5045 if (SMS_BY_INDEX_IS(un, smi, 5046 SMS_OFFLINE)) 5047 un->c.un_status |= 5048 MD_UN_OFFLINE_SM; 5049 } 5050 md_ioctl_writerexit(lockp); 5051 break; 5052 case MD_RS_SUBMIRROR: 5053 un = md_ioctl_writerlock(lockp, ui); 5054 smi = RS_SMI(p->rs_type); 5055 sm = &un->un_sm[smi]; 5056 smic = &un->un_smic[smi]; 5057 /* Clear RESYNC target */ 5058 un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET; 5059 /* 5060 * Set resync_completed to last resync type and then 5061 * clear resync_type to indicate no resync in progress 5062 */ 5063 un->un_resync_completed = un->un_rs_type; 5064 SET_RS_TYPE_NONE(un->un_rs_type); 5065 /* 5066 * If the resync completed ok reset the submirror 5067 * state to SMS_RUNNING else reset it to SMS_ATTACHED 5068 */ 5069 state = (broke_out ? 
5070 SMS_ATTACHED : SMS_RUNNING); 5071 mirror_set_sm_state(sm, smic, state, broke_out); 5072 un->c.un_status &= ~MD_UN_WAR; 5073 mirror_commit(un, SMI2BIT(smi), 0); 5074 md_ioctl_writerexit(lockp); 5075 break; 5076 case MD_RS_COMPONENT: 5077 un = md_ioctl_writerlock(lockp, ui); 5078 smi = RS_SMI(p->rs_type); 5079 ci = RS_CI(p->rs_type); 5080 sm = &un->un_sm[smi]; 5081 smic = &un->un_smic[smi]; 5082 shared = (md_m_shared_t *) 5083 (*(smic->sm_shared_by_indx)) 5084 (sm->sm_dev, sm, ci); 5085 un->c.un_status &= ~MD_UN_WAR; 5086 /* Clear RESYNC target */ 5087 un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET; 5088 /* 5089 * Set resync_completed to last resync type and then 5090 * clear resync_type to indicate no resync in progress 5091 */ 5092 un->un_resync_completed = un->un_rs_type; 5093 SET_RS_TYPE_NONE(un->un_rs_type); 5094 5095 /* 5096 * If the resync completed ok, set the component state 5097 * to CS_OKAY. 5098 */ 5099 if (broke_out) 5100 shared->ms_flags |= MDM_S_RS_TRIED; 5101 else { 5102 /* 5103 * As we don't transmit the changes, 5104 * no need to drop the lock. 5105 */ 5106 set_sm_comp_state(un, smi, ci, CS_OKAY, 0, 5107 MD_STATE_NO_XMIT, (IOLOCK *)NULL); 5108 } 5109 md_ioctl_writerexit(lockp); 5110 default: 5111 break; 5112 } 5113 /* 5114 * If the purpose of this PHASE_DONE message is just to 5115 * indicate to all other nodes that the optimized resync 5116 * required (OPT_NOT_DONE) flag is to be cleared, there is 5117 * no need to generate a notify event as there has not 5118 * actually been a resync. 5119 */ 5120 if (!(p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE)) { 5121 if (broke_out) { 5122 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_FAILED, 5123 SVM_TAG_METADEVICE, MD_UN2SET(un), 5124 MD_SID(un)); 5125 } else { 5126 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_DONE, 5127 SVM_TAG_METADEVICE, MD_UN2SET(un), 5128 MD_SID(un)); 5129 } 5130 } 5131 break; 5132 5133 default: 5134 #ifdef DEBUG 5135 cmn_err(CE_PANIC, "mirror_resync_message: Unknown message type" 5136 " %x\n", p->msg_type); 5137 #endif 5138 return (EINVAL); 5139 } 5140 return (0); 5141 } 5142 5143 /* Return a -1 if snarf of optimized record failed and set should be released */ 5144 static int 5145 mirror_snarf(md_snarfcmd_t cmd, set_t setno) 5146 { 5147 mddb_recid_t recid; 5148 int gotsomething; 5149 int all_mirrors_gotten; 5150 mm_unit_t *un; 5151 mddb_type_t typ1; 5152 mddb_de_ic_t *dep; 5153 mddb_rb32_t *rbp; 5154 size_t newreqsize; 5155 mm_unit_t *big_un; 5156 mm_unit32_od_t *small_un; 5157 int retval; 5158 mdi_unit_t *ui; 5159 5160 if (cmd == MD_SNARF_CLEANUP) { 5161 if (md_get_setstatus(setno) & MD_SET_STALE) 5162 return (0); 5163 5164 recid = mddb_makerecid(setno, 0); 5165 typ1 = (mddb_type_t)md_getshared_key(setno, 5166 mirror_md_ops.md_driver.md_drivername); 5167 while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) { 5168 if (mddb_getrecprivate(recid) & MD_PRV_CLEANUP) { 5169 un = (mm_unit_t *)mddb_getrecaddr(recid); 5170 mirror_cleanup(un); 5171 recid = mddb_makerecid(setno, 0); 5172 } 5173 } 5174 return (0); 5175 } 5176 5177 all_mirrors_gotten = 1; 5178 gotsomething = 0; 5179 5180 recid = mddb_makerecid(setno, 0); 5181 typ1 = (mddb_type_t)md_getshared_key(setno, 5182 mirror_md_ops.md_driver.md_drivername); 5183 5184 while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) { 5185 if (mddb_getrecprivate(recid) & MD_PRV_GOTIT) 5186 continue; 5187 5188 dep = mddb_getrecdep(recid); 5189 dep->de_flags = MDDB_F_MIRROR; 5190 rbp = dep->de_rb; 5191 5192 switch (rbp->rb_revision) { 5193 case MDDB_REV_RB: 5194 case 
MDDB_REV_RBFN: 5195 if ((rbp->rb_private & MD_PRV_CONVD) == 0) { 5196 /* 5197 * This means, we have an old and small 5198 * record and this record hasn't already 5199 * been converted. Before we create an 5200 * incore metadevice from this we have to 5201 * convert it to a big record. 5202 */ 5203 small_un = 5204 (mm_unit32_od_t *)mddb_getrecaddr(recid); 5205 newreqsize = sizeof (mm_unit_t); 5206 big_un = (mm_unit_t *)kmem_zalloc(newreqsize, 5207 KM_SLEEP); 5208 mirror_convert((caddr_t)small_un, 5209 (caddr_t)big_un, SMALL_2_BIG); 5210 kmem_free(small_un, dep->de_reqsize); 5211 5212 /* 5213 * Update userdata and incore userdata 5214 * incores are at the end of un 5215 */ 5216 dep->de_rb_userdata_ic = big_un; 5217 dep->de_rb_userdata = big_un; 5218 dep->de_icreqsize = newreqsize; 5219 un = big_un; 5220 rbp->rb_private |= MD_PRV_CONVD; 5221 } else { 5222 /* 5223 * Unit already converted, just get the 5224 * record address. 5225 */ 5226 un = (mm_unit_t *)mddb_getrecaddr_resize(recid, 5227 sizeof (*un), 0); 5228 } 5229 un->c.un_revision &= ~MD_64BIT_META_DEV; 5230 break; 5231 case MDDB_REV_RB64: 5232 case MDDB_REV_RB64FN: 5233 /* Big device */ 5234 un = (mm_unit_t *)mddb_getrecaddr_resize(recid, 5235 sizeof (*un), 0); 5236 un->c.un_revision |= MD_64BIT_META_DEV; 5237 un->c.un_flag |= MD_EFILABEL; 5238 break; 5239 } 5240 MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision); 5241 5242 /* 5243 * Create minor device node for snarfed entry. 5244 */ 5245 (void) md_create_minor_node(setno, MD_SID(un)); 5246 5247 if (MD_UNIT(MD_SID(un)) != NULL) { 5248 mddb_setrecprivate(recid, MD_PRV_PENDDEL); 5249 continue; 5250 } 5251 all_mirrors_gotten = 0; 5252 retval = mirror_build_incore(un, 1); 5253 if (retval == 0) { 5254 mddb_setrecprivate(recid, MD_PRV_GOTIT); 5255 md_create_unit_incore(MD_SID(un), &mirror_md_ops, 0); 5256 resync_start_timeout(setno); 5257 gotsomething = 1; 5258 } else { 5259 return (retval); 5260 } 5261 /* 5262 * Set flag to indicate that the mirror has not yet 5263 * been through a reconfig. This flag is used for MN sets 5264 * when determining whether to update the mirror state from 5265 * the Master node. 
5266 */ 5267 if (MD_MNSET_SETNO(setno)) { 5268 ui = MDI_UNIT(MD_SID(un)); 5269 ui->ui_tstate |= MD_RESYNC_NOT_DONE; 5270 } 5271 } 5272 5273 if (!all_mirrors_gotten) 5274 return (gotsomething); 5275 5276 recid = mddb_makerecid(setno, 0); 5277 while ((recid = mddb_getnextrec(recid, typ1, RESYNC_REC)) > 0) 5278 if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT)) 5279 mddb_setrecprivate(recid, MD_PRV_PENDDEL); 5280 5281 return (0); 5282 } 5283 5284 static int 5285 mirror_halt(md_haltcmd_t cmd, set_t setno) 5286 { 5287 unit_t i; 5288 mdi_unit_t *ui; 5289 minor_t mnum; 5290 int reset_mirror_flag = 0; 5291 5292 if (cmd == MD_HALT_CLOSE) 5293 return (0); 5294 5295 if (cmd == MD_HALT_OPEN) 5296 return (0); 5297 5298 if (cmd == MD_HALT_UNLOAD) 5299 return (0); 5300 5301 if (cmd == MD_HALT_CHECK) { 5302 for (i = 0; i < md_nunits; i++) { 5303 mnum = MD_MKMIN(setno, i); 5304 if ((ui = MDI_UNIT(mnum)) == NULL) 5305 continue; 5306 if (ui->ui_opsindex != mirror_md_ops.md_selfindex) 5307 continue; 5308 if (md_unit_isopen(ui)) 5309 return (1); 5310 } 5311 return (0); 5312 } 5313 5314 if (cmd != MD_HALT_DOIT) 5315 return (1); 5316 5317 for (i = 0; i < md_nunits; i++) { 5318 mnum = MD_MKMIN(setno, i); 5319 if ((ui = MDI_UNIT(mnum)) == NULL) 5320 continue; 5321 if (ui->ui_opsindex != mirror_md_ops.md_selfindex) 5322 continue; 5323 reset_mirror((mm_unit_t *)MD_UNIT(mnum), mnum, 0); 5324 5325 /* Set a flag if there is at least one mirror metadevice. */ 5326 reset_mirror_flag = 1; 5327 } 5328 5329 /* 5330 * Only wait for the global dr_timeout to finish 5331 * - if there are mirror metadevices in this diskset or 5332 * - if this is the local set since an unload of the md_mirror 5333 * driver could follow a successful mirror halt in the local set. 5334 */ 5335 if ((reset_mirror_flag != 0) || (setno == MD_LOCAL_SET)) { 5336 while ((mirror_md_ops.md_head == NULL) && 5337 (mirror_timeout.dr_timeout_id != 0)) 5338 delay(md_hz); 5339 } 5340 5341 return (0); 5342 } 5343 5344 /*ARGSUSED3*/ 5345 static int 5346 mirror_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags) 5347 { 5348 IOLOCK lock; 5349 minor_t mnum = getminor(*dev); 5350 set_t setno; 5351 5352 /* 5353 * When doing an open of a multi owner metadevice, check to see if this 5354 * node is a starting node and if a reconfig cycle is underway. 5355 * If so, the system isn't sufficiently set up enough to handle the 5356 * open (which involves I/O during sp_validate), so fail with ENXIO. 5357 */ 5358 setno = MD_MIN2SET(mnum); 5359 if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) == 5360 (MD_SET_MNSET | MD_SET_MN_START_RC)) { 5361 return (ENXIO); 5362 } 5363 5364 if (md_oflags & MD_OFLG_FROMIOCTL) { 5365 /* 5366 * This indicates that the caller is an ioctl service routine. 5367 * In this case we initialise our stack-based IOLOCK and pass 5368 * this into the internal open routine. This allows multi-owner 5369 * metadevices to avoid deadlocking if an error is encountered 5370 * during the open() attempt. The failure case is: 5371 * s-p -> mirror -> s-p (with error). Attempting to metaclear 5372 * this configuration would deadlock as the mirror code has to 5373 * send a state-update to the other nodes when it detects the 5374 * failure of the underlying submirror with an errored soft-part 5375 * on it. 
As there is a class1 message in progress (metaclear) 5376 * set_sm_comp_state() cannot send another class1 message; 5377 * instead we do not send a state_update message as the 5378 * metaclear is distributed and the failed submirror will be 5379 * cleared from the configuration by the metaclear. 5380 */ 5381 IOLOCK_INIT(&lock); 5382 return (mirror_internal_open(getminor(*dev), flag, otyp, 5383 md_oflags, &lock)); 5384 } else { 5385 return (mirror_internal_open(getminor(*dev), flag, otyp, 5386 md_oflags, (IOLOCK *)NULL)); 5387 } 5388 } 5389 5390 5391 /*ARGSUSED1*/ 5392 static int 5393 mirror_close(dev_t dev, int flag, int otyp, cred_t *cred_p, int md_cflags) 5394 { 5395 return (mirror_internal_close(getminor(dev), otyp, md_cflags, 5396 (IOLOCK *)NULL)); 5397 } 5398 5399 5400 /* 5401 * This routine dumps memory to the disk. It assumes that the memory has 5402 * already been mapped into mainbus space. It is called at disk interrupt 5403 * priority when the system is in trouble. 5404 * 5405 */ 5406 static int 5407 mirror_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk) 5408 { 5409 mm_unit_t *un; 5410 dev_t mapdev; 5411 int result; 5412 int smi; 5413 int any_succeed = 0; 5414 int save_result = 0; 5415 5416 /* 5417 * Don't need to grab the unit lock. 5418 * Cause nothing else is suppose to be happenning. 5419 * Also dump is not suppose to sleep. 5420 */ 5421 un = (mm_unit_t *)MD_UNIT(getminor(dev)); 5422 5423 if ((diskaddr_t)blkno >= un->c.un_total_blocks) 5424 return (EINVAL); 5425 5426 if ((diskaddr_t)blkno + nblk > un->c.un_total_blocks) 5427 return (EINVAL); 5428 5429 for (smi = 0; smi < NMIRROR; smi++) { 5430 if (!SUBMIRROR_IS_WRITEABLE(un, smi)) 5431 continue; 5432 mapdev = md_dev64_to_dev(un->un_sm[smi].sm_dev); 5433 result = bdev_dump(mapdev, addr, blkno, nblk); 5434 if (result) 5435 save_result = result; 5436 5437 if (result == 0) 5438 any_succeed++; 5439 } 5440 5441 if (any_succeed) 5442 return (0); 5443 5444 return (save_result); 5445 } 5446 5447 /* 5448 * NAME: mirror_probe_dev 5449 * 5450 * DESCRITPION: force opens every component of a mirror. 5451 * 5452 * On entry the unit writerlock is held 5453 */ 5454 static int 5455 mirror_probe_dev(mdi_unit_t *ui, minor_t mnum) 5456 { 5457 int i; 5458 int smi; 5459 int ci; 5460 mm_unit_t *un; 5461 int md_devopen = 0; 5462 set_t setno; 5463 int sm_cnt; 5464 int sm_unavail_cnt; 5465 5466 if (md_unit_isopen(ui)) 5467 md_devopen++; 5468 5469 un = MD_UNIT(mnum); 5470 setno = MD_UN2SET(un); 5471 5472 sm_cnt = 0; 5473 sm_unavail_cnt = 0; 5474 for (i = 0; i < NMIRROR; i++) { 5475 md_dev64_t tmpdev; 5476 mdi_unit_t *sm_ui; 5477 5478 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) { 5479 continue; 5480 } 5481 5482 sm_cnt++; 5483 tmpdev = un->un_sm[i].sm_dev; 5484 (void) md_layered_open(mnum, &tmpdev, 5485 MD_OFLG_CONT_ERRS | MD_OFLG_PROBEDEV); 5486 un->un_sm[i].sm_dev = tmpdev; 5487 5488 sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev))); 5489 5490 /* 5491 * Logic similar to that in mirror_open_all_devs. We set or 5492 * clear the submirror Unavailable bit. 5493 */ 5494 (void) md_unit_writerlock(sm_ui); 5495 if (submirror_unavailable(un, i, 1)) { 5496 sm_ui->ui_tstate |= MD_INACCESSIBLE; 5497 sm_unavail_cnt++; 5498 } else { 5499 sm_ui->ui_tstate &= ~MD_INACCESSIBLE; 5500 } 5501 md_unit_writerexit(sm_ui); 5502 } 5503 5504 /* 5505 * If all of the submirrors are unavailable, the mirror is also 5506 * unavailable. 
5507 */ 5508 if (sm_cnt == sm_unavail_cnt) { 5509 ui->ui_tstate |= MD_INACCESSIBLE; 5510 } else { 5511 ui->ui_tstate &= ~MD_INACCESSIBLE; 5512 } 5513 5514 /* 5515 * Start checking from probe failures. If failures occur we 5516 * set the appropriate erred state only if the metadevice is in 5517 * use. This is specifically to prevent unnecessary resyncs. 5518 * For instance if the disks were accidentally disconnected when 5519 * the system booted up then until the metadevice is accessed 5520 * (like file system mount) the user can shutdown, recable and 5521 * reboot w/o incurring a potentially huge resync. 5522 */ 5523 5524 smi = 0; 5525 ci = 0; 5526 while (mirror_geterror(un, &smi, &ci, 1, 1) != 0) { 5527 5528 if (mirror_other_sources(un, smi, ci, 0) == 1) { 5529 /* 5530 * Note that for a MN set, there is no need to call 5531 * SE_NOTIFY as that is done when processing the 5532 * state change 5533 */ 5534 if (md_devopen) { 5535 /* 5536 * Never called from ioctl context, 5537 * so (IOLOCK *)NULL 5538 */ 5539 set_sm_comp_state(un, smi, ci, CS_LAST_ERRED, 5540 0, MD_STATE_XMIT, (IOLOCK *)NULL); 5541 if (!MD_MNSET_SETNO(setno)) { 5542 SE_NOTIFY(EC_SVM_STATE, 5543 ESC_SVM_LASTERRED, 5544 SVM_TAG_METADEVICE, setno, 5545 MD_SID(un)); 5546 } 5547 continue; 5548 } else { 5549 (void) mirror_close_all_devs(un, 5550 MD_OFLG_PROBEDEV); 5551 if (!MD_MNSET_SETNO(setno)) { 5552 SE_NOTIFY(EC_SVM_STATE, 5553 ESC_SVM_OPEN_FAIL, 5554 SVM_TAG_METADEVICE, setno, 5555 MD_SID(un)); 5556 } 5557 mirror_openfail_console_info(un, smi, ci); 5558 return (ENXIO); 5559 } 5560 } 5561 5562 /* 5563 * Note that for a MN set, there is no need to call 5564 * SE_NOTIFY as that is done when processing the 5565 * state change 5566 */ 5567 if (md_devopen) { 5568 /* Never called from ioctl context, so (IOLOCK *)NULL */ 5569 set_sm_comp_state(un, smi, ci, CS_ERRED, 0, 5570 MD_STATE_XMIT, (IOLOCK *)NULL); 5571 if (!MD_MNSET_SETNO(setno)) { 5572 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, 5573 SVM_TAG_METADEVICE, setno, 5574 MD_SID(un)); 5575 } 5576 } 5577 mirror_openfail_console_info(un, smi, ci); 5578 ci++; 5579 } 5580 5581 if (MD_MNSET_SETNO(setno)) { 5582 send_poke_hotspares(setno); 5583 } else { 5584 (void) poke_hotspares(); 5585 } 5586 (void) mirror_close_all_devs(un, MD_OFLG_PROBEDEV); 5587 5588 return (0); 5589 } 5590 5591 5592 static int 5593 mirror_imp_set( 5594 set_t setno 5595 ) 5596 { 5597 5598 mddb_recid_t recid; 5599 int gotsomething, i; 5600 mddb_type_t typ1; 5601 mddb_de_ic_t *dep; 5602 mddb_rb32_t *rbp; 5603 mm_unit32_od_t *un32; 5604 mm_unit_t *un64; 5605 md_dev64_t self_devt; 5606 minor_t *self_id; /* minor needs to be updated */ 5607 md_parent_t *parent_id; /* parent needs to be updated */ 5608 mddb_recid_t *record_id; /* record id needs to be updated */ 5609 mddb_recid_t *optrec_id; 5610 md_dev64_t tmpdev; 5611 5612 5613 gotsomething = 0; 5614 5615 typ1 = (mddb_type_t)md_getshared_key(setno, 5616 mirror_md_ops.md_driver.md_drivername); 5617 recid = mddb_makerecid(setno, 0); 5618 5619 while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) { 5620 if (mddb_getrecprivate(recid) & MD_PRV_GOTIT) 5621 continue; 5622 5623 dep = mddb_getrecdep(recid); 5624 rbp = dep->de_rb; 5625 5626 switch (rbp->rb_revision) { 5627 case MDDB_REV_RB: 5628 case MDDB_REV_RBFN: 5629 /* 5630 * Small device 5631 */ 5632 un32 = (mm_unit32_od_t *)mddb_getrecaddr(recid); 5633 self_id = &(un32->c.un_self_id); 5634 parent_id = &(un32->c.un_parent); 5635 record_id = &(un32->c.un_record_id); 5636 optrec_id = &(un32->un_rr_dirty_recid); 5637 5638 
for (i = 0; i < un32->un_nsm; i++) { 5639 tmpdev = md_expldev(un32->un_sm[i].sm_dev); 5640 un32->un_sm[i].sm_dev = md_cmpldev 5641 (md_makedevice(md_major, MD_MKMIN(setno, 5642 MD_MIN2UNIT(md_getminor(tmpdev))))); 5643 5644 if (!md_update_minor(setno, mddb_getsidenum 5645 (setno), un32->un_sm[i].sm_key)) 5646 goto out; 5647 } 5648 break; 5649 case MDDB_REV_RB64: 5650 case MDDB_REV_RB64FN: 5651 un64 = (mm_unit_t *)mddb_getrecaddr(recid); 5652 self_id = &(un64->c.un_self_id); 5653 parent_id = &(un64->c.un_parent); 5654 record_id = &(un64->c.un_record_id); 5655 optrec_id = &(un64->un_rr_dirty_recid); 5656 5657 for (i = 0; i < un64->un_nsm; i++) { 5658 tmpdev = un64->un_sm[i].sm_dev; 5659 un64->un_sm[i].sm_dev = md_makedevice 5660 (md_major, MD_MKMIN(setno, MD_MIN2UNIT 5661 (md_getminor(tmpdev)))); 5662 5663 if (!md_update_minor(setno, mddb_getsidenum 5664 (setno), un64->un_sm[i].sm_key)) 5665 goto out; 5666 } 5667 break; 5668 } 5669 5670 /* 5671 * If this is a top level and a friendly name metadevice, 5672 * update its minor in the namespace. 5673 */ 5674 if ((*parent_id == MD_NO_PARENT) && 5675 ((rbp->rb_revision == MDDB_REV_RBFN) || 5676 (rbp->rb_revision == MDDB_REV_RB64FN))) { 5677 5678 self_devt = md_makedevice(md_major, *self_id); 5679 if (!md_update_top_device_minor(setno, 5680 mddb_getsidenum(setno), self_devt)) 5681 goto out; 5682 } 5683 5684 /* 5685 * Update unit with the imported setno 5686 * 5687 */ 5688 mddb_setrecprivate(recid, MD_PRV_GOTIT); 5689 5690 *self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id)); 5691 if (*parent_id != MD_NO_PARENT) 5692 *parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id)); 5693 *record_id = MAKERECID(setno, DBID(*record_id)); 5694 *optrec_id = MAKERECID(setno, DBID(*optrec_id)); 5695 5696 gotsomething = 1; 5697 } 5698 5699 out: 5700 return (gotsomething); 5701 } 5702 5703 /* 5704 * NAME: mirror_check_offline 5705 * 5706 * DESCRIPTION: return offline_status = 1 if any submirrors are offline 5707 * 5708 * Called from ioctl, so access to MD_UN_OFFLINE_SM in un_status is 5709 * protected by the global ioctl lock as it is only set by the MD_IOCOFFLINE 5710 * ioctl. 5711 */ 5712 int 5713 mirror_check_offline(md_dev64_t dev, int *offline_status) 5714 { 5715 mm_unit_t *un; 5716 md_error_t mde = mdnullerror; 5717 5718 if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL) 5719 return (EINVAL); 5720 *offline_status = 0; 5721 if (un->c.un_status & MD_UN_OFFLINE_SM) 5722 *offline_status = 1; 5723 return (0); 5724 } 5725 5726 /* 5727 * NAME: mirror_inc_abr_count 5728 * 5729 * DESCRIPTION: increment the count of layered soft parts with ABR set 5730 * 5731 * Called from ioctl, so access to un_abr_count is protected by the global 5732 * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl. 5733 */ 5734 int 5735 mirror_inc_abr_count(md_dev64_t dev) 5736 { 5737 mm_unit_t *un; 5738 md_error_t mde = mdnullerror; 5739 5740 if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL) 5741 return (EINVAL); 5742 un->un_abr_count++; 5743 return (0); 5744 } 5745 5746 /* 5747 * NAME: mirror_dec_abr_count 5748 * 5749 * DESCRIPTION: decrement the count of layered soft parts with ABR set 5750 * 5751 * Called from ioctl, so access to un_abr_count is protected by the global 5752 * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl. 
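 * Callers are expected to balance each call with an earlier
 * mirror_inc_abr_count() on the same device.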
5753 */ 5754 int 5755 mirror_dec_abr_count(md_dev64_t dev) 5756 { 5757 mm_unit_t *un; 5758 md_error_t mde = mdnullerror; 5759 5760 if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL) 5761 return (EINVAL); 5762 un->un_abr_count--; 5763 return (0); 5764 } 5765 5766 static md_named_services_t mirror_named_services[] = { 5767 {(intptr_t (*)()) poke_hotspares, "poke hotspares" }, 5768 {(intptr_t (*)()) mirror_rename_listkids, MDRNM_LIST_URKIDS }, 5769 {mirror_rename_check, MDRNM_CHECK }, 5770 {(intptr_t (*)()) mirror_renexch_update_kids, MDRNM_UPDATE_KIDS }, 5771 {(intptr_t (*)()) mirror_exchange_parent_update_to, 5772 MDRNM_PARENT_UPDATE_TO}, 5773 {(intptr_t (*)()) mirror_exchange_self_update_from_down, 5774 MDRNM_SELF_UPDATE_FROM_DOWN }, 5775 {(intptr_t (*)())mirror_probe_dev, "probe open test" }, 5776 {(intptr_t (*)())mirror_check_offline, MD_CHECK_OFFLINE }, 5777 {(intptr_t (*)())mirror_inc_abr_count, MD_INC_ABR_COUNT }, 5778 {(intptr_t (*)())mirror_dec_abr_count, MD_DEC_ABR_COUNT }, 5779 { NULL, 0 } 5780 }; 5781 5782 md_ops_t mirror_md_ops = { 5783 mirror_open, /* open */ 5784 mirror_close, /* close */ 5785 md_mirror_strategy, /* strategy */ 5786 NULL, /* print */ 5787 mirror_dump, /* dump */ 5788 NULL, /* read */ 5789 NULL, /* write */ 5790 md_mirror_ioctl, /* mirror_ioctl, */ 5791 mirror_snarf, /* mirror_snarf */ 5792 mirror_halt, /* mirror_halt */ 5793 NULL, /* aread */ 5794 NULL, /* awrite */ 5795 mirror_imp_set, /* import set */ 5796 mirror_named_services 5797 }; 5798 5799 /* module specific initilization */ 5800 static void 5801 init_init() 5802 { 5803 md_mirror_mcs_buf_off = sizeof (md_mcs_t) - sizeof (buf_t); 5804 5805 /* Initialize the parent and child save memory pools */ 5806 mirror_parent_cache = kmem_cache_create("md_mirror_parent", 5807 sizeof (md_mps_t), 0, mirror_parent_constructor, 5808 mirror_parent_destructor, mirror_run_queue, NULL, NULL, 5809 0); 5810 5811 mirror_child_cache = kmem_cache_create("md_mirror_child", 5812 sizeof (md_mcs_t) - sizeof (buf_t) + biosize(), 0, 5813 mirror_child_constructor, mirror_child_destructor, 5814 mirror_run_queue, NULL, NULL, 0); 5815 5816 /* 5817 * Insure wowbuf_size is a multiple of DEV_BSIZE, 5818 * then initialize wowbuf memory pool. 5819 */ 5820 md_wowbuf_size = roundup(md_wowbuf_size, DEV_BSIZE); 5821 if (md_wowbuf_size <= 0) 5822 md_wowbuf_size = 2 * DEV_BSIZE; 5823 if (md_wowbuf_size > (32 * DEV_BSIZE)) 5824 md_wowbuf_size = (32 * DEV_BSIZE); 5825 5826 md_wowblk_size = md_wowbuf_size + sizeof (wowhdr_t); 5827 mirror_wowblk_cache = kmem_cache_create("md_mirror_wow", 5828 md_wowblk_size, 0, NULL, NULL, NULL, NULL, NULL, 0); 5829 5830 mutex_init(&mirror_timeout.dr_mx, NULL, MUTEX_DEFAULT, NULL); 5831 mutex_init(&hotspare_request.dr_mx, NULL, MUTEX_DEFAULT, NULL); 5832 5833 mutex_init(&non_ff_drv_mutex, NULL, MUTEX_DEFAULT, NULL); 5834 } 5835 5836 /* module specific uninitilization (undo init_init()) */ 5837 static void 5838 fini_uninit() 5839 { 5840 kmem_cache_destroy(mirror_parent_cache); 5841 kmem_cache_destroy(mirror_child_cache); 5842 kmem_cache_destroy(mirror_wowblk_cache); 5843 mirror_parent_cache = mirror_child_cache = 5844 mirror_wowblk_cache = NULL; 5845 5846 mutex_destroy(&mirror_timeout.dr_mx); 5847 mutex_destroy(&hotspare_request.dr_mx); 5848 mutex_destroy(&non_ff_drv_mutex); 5849 } 5850 5851 /* define the module linkage */ 5852 MD_PLUGIN_MISC_MODULE("mirrors module", init_init(), fini_uninit()) 5853