/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/user.h>
#include <sys/uio.h>
#include <sys/t_lock.h>
#include <sys/buf.h>
#include <sys/dkio.h>
#include <sys/vtoc.h>
#include <sys/kmem.h>
#include <vm/page.h>
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/mkdev.h>
#include <sys/stat.h>
#include <sys/open.h>
#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/debug.h>
#include <sys/dklabel.h>
#include <vm/hat.h>
#include <sys/lvm/mdvar.h>
#include <sys/lvm/md_mirror.h>
#include <sys/lvm/md_convert.h>
#include <sys/lvm/md_mddb.h>
#include <sys/esunddi.h>

#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/svm.h>
#include <sys/lvm/mdmn_commd.h>
#include <sys/avl.h>

md_ops_t mirror_md_ops;
#ifndef lint
char _depends_on[] = "drv/md";
md_ops_t *md_interface_ops = &mirror_md_ops;
#endif

extern mdq_anchor_t md_done_daemon;
extern mdq_anchor_t md_mstr_daemon;
extern mdq_anchor_t md_mirror_daemon;
extern mdq_anchor_t md_mirror_io_daemon;
extern mdq_anchor_t md_mirror_rs_daemon;
extern mdq_anchor_t md_mhs_daemon;

extern unit_t md_nunits;
extern set_t md_nsets;
extern md_set_t md_set[];

extern int md_status;
extern clock_t md_hz;

extern md_krwlock_t md_unit_array_rw;
extern kmutex_t md_mx;
extern kcondvar_t md_cv;
extern int md_mtioctl_cnt;

daemon_request_t mirror_timeout;
static daemon_request_t hotspare_request;
static daemon_request_t mn_hs_request[MD_MAXSETS];	/* Multinode hs req */

int md_mirror_mcs_buf_off;

/* Flags for mdmn_ksend_message to allow debugging */
int md_mirror_msg_flags;

#ifdef DEBUG
/* Flag to switch on debug messages */
int mirror_debug_flag = 0;
#endif

/*
 * Struct used to hold the count of DMR reads and the timestamp of the last
 * DMR read. It is used to verify, using a debugger, that the DMR read ioctl
 * has been executed.
 */
dmr_stats_t mirror_dmr_stats = {0, 0};

/*
 * Mutex protecting list of non-failfast drivers.
 */
static kmutex_t non_ff_drv_mutex;
extern char **non_ff_drivers;

extern major_t md_major;

/*
 * Write-On-Write memory pool.
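 *
 * (Added note, not in the original source: md_wowbuf_size below is the
 * size, in bytes, of the private buffer into which write-on-write data is
 * copied when a WOW condition is detected; md_wowblk_size, the full
 * allocation size for a wowblk cache entry, is set up elsewhere in this
 * driver before mirror_wowblk_cache is used.)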
 */
static void copy_write_cont(wowhdr_t *wowhdr);
static kmem_cache_t *mirror_wowblk_cache = NULL;
static int md_wowbuf_size = 16384;
static size_t md_wowblk_size;

/*
 * This is a flag that allows:
 *	- disabling the write-on-write mechanism.
 *	- logging occurrences of write-on-write
 *	- switching wow handling procedure processing
 * Counter for occurrences of WOW.
 */
static uint_t md_mirror_wow_flg = 0;
static int md_mirror_wow_cnt = 0;

/*
 * Tunable to enable/disable dirty region
 * processing when closing down a mirror.
 */
static int new_resync = 1;
kmem_cache_t *mirror_parent_cache = NULL;
kmem_cache_t *mirror_child_cache = NULL;

extern int md_ff_disable;		/* disable failfast */

static int mirror_map_write(mm_unit_t *, md_mcs_t *, md_mps_t *, int);
static void mirror_read_strategy(buf_t *, int, void *);
static void mirror_write_strategy(buf_t *, int, void *);
static void become_owner(daemon_queue_t *);
static int mirror_done(struct buf *cb);
static int mirror_done_common(struct buf *cb);
static void clear_retry_error(struct buf *cb);

/*
 * patchables
 */
int md_min_rr_size = 200;	/* 2000 blocks, or 100k */
int md_def_num_rr = 1000;	/* Default number of dirty regions */

/*
 * patchable to change delay before rescheduling mirror ownership request.
 * Value is clock ticks, default 0.5 seconds
 */
clock_t md_mirror_owner_to = 500000;

/*ARGSUSED1*/
static int
mirror_parent_constructor(void *p, void *d1, int d2)
{
	mutex_init(&((md_mps_t *)p)->ps_mx, NULL, MUTEX_DEFAULT, NULL);
	return (0);
}

static void
mirror_parent_init(md_mps_t *ps)
{
	bzero(ps, offsetof(md_mps_t, ps_mx));
	bzero(&ps->ps_overlap_node, sizeof (avl_node_t));
}

/*ARGSUSED1*/
static void
mirror_parent_destructor(void *p, void *d)
{
	mutex_destroy(&((md_mps_t *)p)->ps_mx);
}

/*ARGSUSED1*/
static int
mirror_child_constructor(void *p, void *d1, int d2)
{
	bioinit(&((md_mcs_t *)p)->cs_buf);
	return (0);
}

void
mirror_child_init(md_mcs_t *cs)
{
	cs->cs_ps = NULL;
	cs->cs_mdunit = 0;
	md_bioreset(&cs->cs_buf);
}

/*ARGSUSED1*/
static void
mirror_child_destructor(void *p, void *d)
{
	biofini(&((md_mcs_t *)p)->cs_buf);
}

static void
mirror_wowblk_init(wowhdr_t *p)
{
	bzero(p, md_wowblk_size);
}

static void
send_poke_hotspares_msg(daemon_request_t *drq)
{
	int rval;
	md_mn_msg_pokehsp_t pokehsp;
	md_mn_kresult_t *kresult;
	set_t setno = (set_t)drq->dq.qlen;

	pokehsp.pokehsp_setno = setno;

	kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
	rval = mdmn_ksend_message(setno, MD_MN_MSG_POKE_HOTSPARES,
	    MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST, 0, (char *)&pokehsp,
	    sizeof (pokehsp), kresult);

	if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
		mdmn_ksend_show_error(rval, kresult, "POKE_HOTSPARES");
		/* If we're shutting down already, pause things here.
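		 * (Added note, not in the original source: delay(md_hz)
		 * sleeps for roughly one second per iteration, so the loop
		 * below just polls for the message daemon to reappear. A
		 * node that really is shutting down never gets past this
		 * loop; any other delivery failure ends in the CE_PANIC
		 * below.)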
		 */
		if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
			while (!md_mn_is_commd_present()) {
				delay(md_hz);
			}
		}
		cmn_err(CE_PANIC,
		    "ksend_message failure: POKE_HOTSPARES");
	}
	kmem_free(kresult, sizeof (md_mn_kresult_t));

	/* Allow further requests to use this set's queue structure */
	mutex_enter(&drq->dr_mx);
	drq->dr_pending = 0;
	mutex_exit(&drq->dr_mx);
}

/*
 * Send a poke_hotspares message to the master node. To avoid swamping the
 * commd handler with requests we only send a message if there is not one
 * already outstanding. We punt the request to a separate thread context as
 * we cannot afford to block waiting on the request to be serviced. This is
 * essential when a reconfig cycle is in progress as any open() of a multinode
 * metadevice may result in a livelock.
 */
static void
send_poke_hotspares(set_t setno)
{
	daemon_request_t *drq = &mn_hs_request[setno];

	mutex_enter(&drq->dr_mx);
	if (drq->dr_pending == 0) {
		drq->dr_pending = 1;
		drq->dq.qlen = (int)setno;
		daemon_request(&md_mhs_daemon,
		    send_poke_hotspares_msg, (daemon_queue_t *)drq, REQ_OLD);
	}
	mutex_exit(&drq->dr_mx);
}

void
mirror_set_sm_state(
	mm_submirror_t *sm,
	mm_submirror_ic_t *smic,
	sm_state_t newstate,
	int force)
{
	int compcnt;
	int i;
	int errcnt;
	sm_state_t origstate;
	md_m_shared_t *shared;

	if (force) {
		sm->sm_state = newstate;
		uniqtime32(&sm->sm_timestamp);
		return;
	}

	origstate = newstate;

	compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
	for (i = 0, errcnt = 0; i < compcnt; i++) {
		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
		    (sm->sm_dev, sm, i);
		if (shared->ms_state & (CS_ERRED | CS_LAST_ERRED))
			newstate |= SMS_COMP_ERRED;
		if (shared->ms_state & (CS_RESYNC))
			newstate |= SMS_COMP_RESYNC;
		if (shared->ms_state & CS_ERRED)
			errcnt++;
	}

	if ((newstate & (SMS_COMP_ERRED | SMS_COMP_RESYNC)) != 0)
		newstate &= ~origstate;

	if (errcnt == compcnt)
		newstate |= SMS_ALL_ERRED;
	else
		newstate &= ~SMS_ALL_ERRED;

	sm->sm_state = newstate;
	uniqtime32(&sm->sm_timestamp);
}

static int
mirror_geterror(mm_unit_t *un, int *smi, int *cip, int clr_error,
	int frm_probe)
{
	mm_submirror_t *sm;
	mm_submirror_ic_t *smic;
	md_m_shared_t *shared;
	int ci;
	int i;
	int compcnt;
	int open_comp;	/* flag for open component */

	for (i = *smi; i < NMIRROR; i++) {
		sm = &un->un_sm[i];
		smic = &un->un_smic[i];

		if (!SMS_IS(sm, SMS_INUSE))
			continue;

		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
		for (ci = *cip; ci < compcnt; ci++) {
			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
			    (sm->sm_dev, sm, ci);
			/*
			 * If called from any routine but probe, we check for
			 * the MDM_S_ISOPEN flag. Since probe does a pseudo
			 * open, it sets the MDM_S_PROBEOPEN flag and we test
			 * for that flag instead. The two tests are mutually
			 * exclusive.
			 */
			open_comp = (frm_probe) ?
			    (shared->ms_flags & MDM_S_PROBEOPEN):
			    (shared->ms_flags & MDM_S_ISOPEN);
			if ((shared->ms_flags & MDM_S_IOERR || !open_comp) &&
			    ((shared->ms_state == CS_OKAY) ||
			    (shared->ms_state == CS_RESYNC))) {
				if (clr_error) {
					shared->ms_flags &= ~MDM_S_IOERR;
				}
				*cip = ci;
				*smi = i;
				return (1);
			}

			if (clr_error && (shared->ms_flags & MDM_S_IOERR)) {
				shared->ms_flags &= ~MDM_S_IOERR;
			}
		}

		*cip = 0;
	}
	return (0);
}

/*ARGSUSED*/
static void
mirror_run_queue(void *d)
{
	if (!(md_status & MD_GBL_DAEMONS_LIVE))
		md_daemon(1, &md_done_daemon);
}
/*
 * check_comp_4_hotspares
 *
 * This function attempts to allocate a hotspare for this component if the
 * component is in error. In a MN set, the function can be called in 2 modes.
 * It can be called either when a component error has been detected or when a
 * new hotspare has been allocated. In this case, MD_HOTSPARE_XMIT is set
 * in flags and the request is sent to all nodes.
 * The handler on each of the nodes then calls this function with
 * MD_HOTSPARE_XMIT unset and the hotspare allocation is then performed.
 *
 * For non-MN sets the function simply attempts to allocate a hotspare.
 *
 * On entry, the following locks are held
 *	mirror_md_ops.md_link_rw (if flags has MD_HOTSPARE_LINKHELD set)
 *	md_unit_writerlock
 *
 * Returns	0 if ok
 *		1 if the unit containing the component has been cleared while
 *		  the mdmn_ksend_message() was being executed
 */
extern int
check_comp_4_hotspares(
	mm_unit_t *un,
	int smi,
	int ci,
	uint_t flags,
	mddb_recid_t hs_id,	/* Only used by MN disksets */
	IOLOCK *lockp		/* can be NULL */
)
{
	mm_submirror_t *sm;
	mm_submirror_ic_t *smic;
	md_m_shared_t *shared;
	mddb_recid_t recids[6];
	minor_t mnum;
	intptr_t (*hs_dev)();
	void (*hs_done)();
	void *hs_data;
	md_error_t mde = mdnullerror;
	set_t setno;
	md_mn_msg_allochsp_t allochspmsg;
	md_mn_kresult_t *kresult;
	mm_unit_t *new_un;
	int rval;

	mnum = MD_SID(un);
	setno = MD_UN2SET(un);
	sm = &un->un_sm[smi];
	smic = &un->un_smic[smi];
	shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
	    (sm->sm_dev, sm, ci);

	if (shared->ms_state != CS_ERRED)
		return (0);

	/* Don't start a new component resync if a resync is already running. */
	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
		return (0);

	if (MD_MNSET_SETNO(setno) && (flags & MD_HOTSPARE_XMIT)) {
		uint_t msgflags;
		md_mn_msgtype_t msgtype;

		/* Send allocate hotspare message to all nodes */

		allochspmsg.msg_allochsp_mnum = un->c.un_self_id;
		allochspmsg.msg_allochsp_sm = smi;
		allochspmsg.msg_allochsp_comp = ci;
		allochspmsg.msg_allochsp_hs_id = shared->ms_hs_id;

		/*
		 * Before calling mdmn_ksend_message(), release locks.
		 * Can never be in the context of an ioctl.
451 */ 452 md_unit_writerexit(MDI_UNIT(mnum)); 453 if (flags & MD_HOTSPARE_LINKHELD) 454 rw_exit(&mirror_md_ops.md_link_rw.lock); 455 #ifdef DEBUG 456 if (mirror_debug_flag) 457 printf("send alloc hotspare, flags=" 458 "0x%x %x, %x, %x, %x\n", flags, 459 allochspmsg.msg_allochsp_mnum, 460 allochspmsg.msg_allochsp_sm, 461 allochspmsg.msg_allochsp_comp, 462 allochspmsg.msg_allochsp_hs_id); 463 #endif 464 if (flags & MD_HOTSPARE_WMUPDATE) { 465 msgtype = MD_MN_MSG_ALLOCATE_HOTSPARE2; 466 /* 467 * When coming from an update of watermarks, there 468 * must already be a message logged that triggered 469 * this action. So, no need to log this message, too. 470 */ 471 msgflags = MD_MSGF_NO_LOG; 472 } else { 473 msgtype = MD_MN_MSG_ALLOCATE_HOTSPARE; 474 msgflags = MD_MSGF_DEFAULT_FLAGS; 475 } 476 477 kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); 478 rval = mdmn_ksend_message(setno, msgtype, msgflags, 0, 479 (char *)&allochspmsg, sizeof (allochspmsg), 480 kresult); 481 482 if (!MDMN_KSEND_MSG_OK(rval, kresult)) { 483 #ifdef DEBUG 484 if (mirror_debug_flag) 485 mdmn_ksend_show_error(rval, kresult, 486 "ALLOCATE HOTSPARE"); 487 #endif 488 /* 489 * If message is sent ok but exitval indicates an error 490 * it must be because the mirror has been cleared. In 491 * this case re-obtain lock and return an error 492 */ 493 if ((rval == 0) && (kresult->kmmr_exitval != 0)) { 494 if (flags & MD_HOTSPARE_LINKHELD) { 495 rw_enter(&mirror_md_ops.md_link_rw.lock, 496 RW_READER); 497 } 498 kmem_free(kresult, sizeof (md_mn_kresult_t)); 499 return (1); 500 } 501 /* If we're shutting down already, pause things here. */ 502 if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) { 503 while (!md_mn_is_commd_present()) { 504 delay(md_hz); 505 } 506 } 507 cmn_err(CE_PANIC, 508 "ksend_message failure: ALLOCATE_HOTSPARE"); 509 } 510 kmem_free(kresult, sizeof (md_mn_kresult_t)); 511 512 /* 513 * re-obtain the locks 514 */ 515 if (flags & MD_HOTSPARE_LINKHELD) 516 rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER); 517 new_un = md_unit_writerlock(MDI_UNIT(mnum)); 518 519 /* 520 * As we had to release the locks in order to send the 521 * message to all nodes, we need to check to see if the 522 * unit has changed. If it has we release the writerlock 523 * and return fail. 524 */ 525 if ((new_un != un) || (un->c.un_type != MD_METAMIRROR)) { 526 md_unit_writerexit(MDI_UNIT(mnum)); 527 return (1); 528 } 529 } else { 530 if (MD_MNSET_SETNO(setno)) { 531 /* 532 * If 2 or more nodes simultaneously see a 533 * component failure, these nodes will each 534 * send an ALLOCATE_HOTSPARE[2] message. 535 * The first message will allocate the hotspare 536 * and the subsequent messages should do nothing. 537 * 538 * If a slave node doesn't have a hotspare allocated 539 * at the time the message is initiated, then the 540 * passed in hs_id will be 0. If the node 541 * executing this routine has a component shared 542 * ms_hs_id of non-zero, but the message shows a 543 * hs_id of 0, then just return since a hotspare 544 * has already been allocated for this failing 545 * component. When the slave node returns from 546 * the ksend_message the hotspare will have 547 * already been allocated. 548 * 549 * If the slave node does send an hs_id of non-zero, 550 * and the slave node's hs_id matches this node's 551 * ms_hs_id, then the hotspare has error'd and 552 * should be replaced. 
553 * 554 * If the slave node sends an hs_id of non-zero and 555 * this node has a different shared ms_hs_id, then 556 * just return since this hotspare has already 557 * been hotspared. 558 */ 559 if (shared->ms_hs_id != 0) { 560 if (hs_id == 0) { 561 #ifdef DEBUG 562 if (mirror_debug_flag) { 563 printf("check_comp_4_hotspares" 564 "(NOXMIT), short circuit " 565 "hs_id=0x%x, " 566 "ms_hs_id=0x%x\n", 567 hs_id, shared->ms_hs_id); 568 } 569 #endif 570 return (0); 571 } 572 if (hs_id != shared->ms_hs_id) { 573 #ifdef DEBUG 574 if (mirror_debug_flag) { 575 printf("check_comp_4_hotspares" 576 "(NOXMIT), short circuit2 " 577 "hs_id=0x%x, " 578 "ms_hs_id=0x%x\n", 579 hs_id, shared->ms_hs_id); 580 } 581 #endif 582 return (0); 583 } 584 } 585 } 586 587 sm = &un->un_sm[smi]; 588 hs_dev = md_get_named_service(sm->sm_dev, 0, 589 "hotspare device", 0); 590 if ((*hs_dev)(sm->sm_dev, 0, ci, recids, 6, &hs_done, 591 &hs_data) != 0) 592 return (0); 593 594 /* 595 * set_sm_comp_state() commits the modified records. 596 * As we don't transmit the changes, no need to drop the lock. 597 */ 598 set_sm_comp_state(un, smi, ci, CS_RESYNC, recids, 599 MD_STATE_NO_XMIT, (IOLOCK *)NULL); 600 601 (*hs_done)(sm->sm_dev, hs_data); 602 603 mirror_check_failfast(mnum); 604 605 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_HOTSPARED, SVM_TAG_METADEVICE, 606 setno, MD_SID(un)); 607 608 /* 609 * For a multi-node set we need to reset the un_rs_type, 610 * un_rs_resync_done and un_rs_resync_2_do fields as the 611 * hot-spare resync must copy all applicable data. 612 */ 613 if (MD_MNSET_SETNO(setno)) { 614 un->un_rs_type = MD_RS_NONE; 615 un->un_rs_resync_done = 0; 616 un->un_rs_resync_2_do = 0; 617 } 618 619 /* 620 * Must drop writer lock since mirror_resync_unit will 621 * open devices and must be able to grab readerlock. 622 * Don't need to drop IOLOCK since any descendent routines 623 * calling ksend_messages will drop the IOLOCK as needed. 624 * 625 */ 626 if (lockp) { 627 md_ioctl_writerexit(lockp); 628 } else { 629 md_unit_writerexit(MDI_UNIT(mnum)); 630 } 631 632 /* start resync */ 633 (void) mirror_resync_unit(mnum, NULL, &mde, lockp); 634 635 if (lockp) { 636 new_un = md_ioctl_writerlock(lockp, MDI_UNIT(mnum)); 637 } else { 638 new_un = md_unit_writerlock(MDI_UNIT(mnum)); 639 } 640 } 641 return (0); 642 } 643 644 /* 645 * check_unit_4_hotspares 646 * 647 * For a given mirror, allocate hotspares, if available for any components 648 * that are in error 649 * 650 * Returns 0 if ok 651 * 1 if check_comp_4_hotspares returns non-zero. This will only 652 * happen for a MN unit where the unit has been cleared while 653 * the allocate hotspare message is sent to all nodes. 654 */ 655 static int 656 check_unit_4_hotspares(mm_unit_t *un, int flags) 657 { 658 mm_submirror_t *sm; 659 mm_submirror_ic_t *smic; 660 int ci; 661 int i; 662 int compcnt; 663 664 if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) 665 return (0); 666 667 for (i = 0; i < NMIRROR; i++) { 668 sm = &un->un_sm[i]; 669 smic = &un->un_smic[i]; 670 if (!SMS_IS(sm, SMS_INUSE)) 671 continue; 672 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, sm); 673 for (ci = 0; ci < compcnt; ci++) { 674 md_m_shared_t *shared; 675 676 shared = (md_m_shared_t *) 677 (*(smic->sm_shared_by_indx))(sm->sm_dev, sm, ci); 678 /* 679 * Never called from ioctl context, so pass in 680 * (IOLOCK *)NULL. Pass through flags from calling 681 * routine, also setting XMIT flag. 
			 */
			if (check_comp_4_hotspares(un, i, ci,
			    (MD_HOTSPARE_XMIT | flags),
			    shared->ms_hs_id, (IOLOCK *)NULL) != 0)
				return (1);
		}
	}
	return (0);
}

static void
check_4_hotspares(daemon_request_t *drq)
{
	mdi_unit_t *ui;
	mm_unit_t *un;
	md_link_t *next;
	int x;

	mutex_enter(&drq->dr_mx);	/* clear up front so can poke */
	drq->dr_pending = 0;		/* again in low level routine if */
	mutex_exit(&drq->dr_mx);	/* something found to do	*/

	/*
	 * Used to have a problem here. The disksets weren't marked as being
	 * MNHOLD. This opened a window where we could be searching for
	 * hotspares and have the disk set unloaded (released) from under
	 * us causing a panic in stripe_component_count().
	 * The way to prevent that is to mark the set MNHOLD which prevents
	 * any diskset from being released while we are scanning the mirrors,
	 * submirrors and components.
	 */

	for (x = 0; x < md_nsets; x++)
		md_holdset_enter(x);

	rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
	for (next = mirror_md_ops.md_head; next != NULL; next = next->ln_next) {
		ui = MDI_UNIT(next->ln_id);

		un = (mm_unit_t *)md_unit_readerlock(ui);

		/*
		 * Only check the unit if we are the master for this set.
		 * For an MN set, poke_hotspares() is only effective on the
		 * master.
		 */
		if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
		    md_set[MD_UN2SET(un)].s_am_i_master == 0) {
			md_unit_readerexit(ui);
			continue;
		}
		if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) {
			md_unit_readerexit(ui);
			continue;
		}
		md_unit_readerexit(ui);

		un = (mm_unit_t *)md_unit_writerlock(ui);
		/*
		 * check_unit_4_hotspares will exit 1 if the unit has been
		 * removed during the process of allocating the hotspare.
		 * This can only happen for a MN metadevice. If the unit no
		 * longer exists, there is no need to release the writerlock.
		 */
		if (check_unit_4_hotspares(un, MD_HOTSPARE_LINKHELD) == 0)
			md_unit_writerexit(ui);
		else {
			/*
			 * If check_unit_4_hotspares failed, queue another
			 * request and break out of this one
			 */
			(void) poke_hotspares();
			break;
		}
	}
	rw_exit(&mirror_md_ops.md_link_rw.lock);

	for (x = 0; x < md_nsets; x++)
		md_holdset_exit(x);
}

/*
 * poke_hotspares
 *
 * If there is not a poke_hotspares request already pending, queue a request
 * to call check_4_hotspares(). This will scan all mirrors and attempt to
 * allocate hotspares for all components in error.
 */
int
poke_hotspares()
{
	mutex_enter(&hotspare_request.dr_mx);
	if (hotspare_request.dr_pending == 0) {
		hotspare_request.dr_pending = 1;
		daemon_request(&md_mhs_daemon,
		    check_4_hotspares, (daemon_queue_t *)&hotspare_request,
		    REQ_OLD);
	}
	mutex_exit(&hotspare_request.dr_mx);
	return (0);
}

static void
free_all_ecomps(err_comp_t *ecomp)
{
	err_comp_t *d;

	while (ecomp != NULL) {
		d = ecomp;
		ecomp = ecomp->ec_next;
		kmem_free(d, sizeof (err_comp_t));
	}
}

/*
 * NAME: mirror_openfail_console_info
 *
 * DESCRIPTION: Prints an informative message to the console when a mirror
 *		cannot be opened.
 *
 * PARAMETERS: mm_unit_t	un - pointer to mirror unit structure
 *	       int		smi - submirror index
 *	       int		ci - component index
 */

void
mirror_openfail_console_info(mm_unit_t *un, int smi, int ci)
{
	void (*get_dev)();
	ms_cd_info_t cd;
	md_dev64_t tmpdev;

	tmpdev = un->un_sm[smi].sm_dev;
	get_dev = (void (*)())md_get_named_service(tmpdev, 0, "get device", 0);
	if (get_dev != NULL) {
		(void) (*get_dev)(tmpdev, smi, ci, &cd);
		cmn_err(CE_WARN, "md %s: open error on %s",
		    md_shortname(MD_SID(un)), md_devname(MD_UN2SET(un),
		    cd.cd_dev, NULL, 0));
	} else {
		cmn_err(CE_WARN, "md %s: open error",
		    md_shortname(MD_SID(un)));
	}
}

static int
mirror_close_all_devs(mm_unit_t *un, int md_cflags)
{
	int i;
	md_dev64_t dev;

	for (i = 0; i < NMIRROR; i++) {
		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
			continue;
		dev = un->un_sm[i].sm_dev;
		md_layered_close(dev, md_cflags);
	}
	return (0);
}

/*
 * Keep track of drivers that don't support failfast. We use this so that
 * we only log one diagnostic message for each of these drivers, no matter
 * how many times we run the mirror_check_failfast function.
 * Return 1 if this is a new driver that does not support failfast,
 * return 0 if we have already seen this non-failfast driver.
 */
static int
new_non_ff_driver(const char *s)
{
	mutex_enter(&non_ff_drv_mutex);
	if (non_ff_drivers == NULL) {
		non_ff_drivers = (char **)kmem_alloc(2 * sizeof (char *),
		    KM_NOSLEEP);
		if (non_ff_drivers == NULL) {
			mutex_exit(&non_ff_drv_mutex);
			return (1);
		}

		non_ff_drivers[0] = (char *)kmem_alloc(strlen(s) + 1,
		    KM_NOSLEEP);
		if (non_ff_drivers[0] == NULL) {
			kmem_free(non_ff_drivers, 2 * sizeof (char *));
			non_ff_drivers = NULL;
			mutex_exit(&non_ff_drv_mutex);
			return (1);
		}

		(void) strcpy(non_ff_drivers[0], s);
		non_ff_drivers[1] = NULL;

	} else {
		int i;
		char **tnames;
		char **tmp;

		for (i = 0; non_ff_drivers[i] != NULL; i++) {
			if (strcmp(s, non_ff_drivers[i]) == 0) {
				mutex_exit(&non_ff_drv_mutex);
				return (0);
			}
		}

		/* allow for new element and null */
		i += 2;
		tnames = (char **)kmem_alloc(i * sizeof (char *), KM_NOSLEEP);
		if (tnames == NULL) {
			mutex_exit(&non_ff_drv_mutex);
			return (1);
		}

		for (i = 0; non_ff_drivers[i] != NULL; i++)
			tnames[i] = non_ff_drivers[i];

		tnames[i] = (char *)kmem_alloc(strlen(s) + 1, KM_NOSLEEP);
		if (tnames[i] == NULL) {
			/* adjust i so that it is the right count to free */
			kmem_free(tnames, (i + 2) * sizeof (char *));
			mutex_exit(&non_ff_drv_mutex);
			return (1);
		}

		(void) strcpy(tnames[i++], s);
		tnames[i] = NULL;

		tmp = non_ff_drivers;
		non_ff_drivers = tnames;
		/* i now represents the count we previously alloced */
		kmem_free(tmp, i * sizeof (char *));
	}
	mutex_exit(&non_ff_drv_mutex);

	return (1);
}

/*
 * Check for the "ddi-failfast-supported" devtree property on each submirror
 * component to indicate if we should do I/O to that submirror with the
 * B_FAILFAST flag set or not. This check is made at various state transitions
 * in the mirror code (e.g. open, enable, hotspare, etc.). Sometimes we
 * only need to check one drive (e.g.
 * hotspare) but since the check is fast and infrequent and sometimes needs
 * to be done on all components we just check all components on each call.
 */
void
mirror_check_failfast(minor_t mnum)
{
	int i;
	mm_unit_t *un;

	if (md_ff_disable)
		return;

	un = MD_UNIT(mnum);

	for (i = 0; i < NMIRROR; i++) {
		int ci;
		int cnt;
		int ff = 1;
		mm_submirror_t *sm;
		mm_submirror_ic_t *smic;
		void (*get_dev)();

		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
			continue;

		sm = &un->un_sm[i];
		smic = &un->un_smic[i];

		get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
		    "get device", 0);

		cnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
		for (ci = 0; ci < cnt; ci++) {
			int found = 0;
			dev_t ci_dev;
			major_t major;
			dev_info_t *devi;
			ms_cd_info_t cd;

			/*
			 * this already returns the hs
			 * dev if the device is spared
			 */
			(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);

			ci_dev = md_dev64_to_dev(cd.cd_dev);
			major = getmajor(ci_dev);

			if (major == md_major) {
				/*
				 * this component must be a soft
				 * partition; get the real dev
				 */
				minor_t dev_mnum;
				mdi_unit_t *ui;
				mp_unit_t *un;
				set_t setno;
				side_t side;
				md_dev64_t tmpdev;

				ui = MDI_UNIT(getminor(ci_dev));

				/* grab necessary lock */
				un = (mp_unit_t *)md_unit_readerlock(ui);

				dev_mnum = MD_SID(un);
				setno = MD_MIN2SET(dev_mnum);
				side = mddb_getsidenum(setno);

				tmpdev = un->un_dev;

				/* Get dev by device id */
				if (md_devid_found(setno, side,
				    un->un_key) == 1) {
					tmpdev = md_resolve_bydevid(dev_mnum,
					    tmpdev, un->un_key);
				}

				md_unit_readerexit(ui);

				ci_dev = md_dev64_to_dev(tmpdev);
				major = getmajor(ci_dev);
			}

			if (ci_dev != NODEV32 &&
			    (devi = e_ddi_hold_devi_by_dev(ci_dev, 0))
			    != NULL) {
				ddi_prop_op_t prop_op = PROP_LEN_AND_VAL_BUF;
				int propvalue = 0;
				int proplength = sizeof (int);
				int error;
				struct cb_ops *cb;

				if ((cb = devopsp[major]->devo_cb_ops) !=
				    NULL) {
					error = (*cb->cb_prop_op)
					    (DDI_DEV_T_ANY, devi, prop_op,
					    DDI_PROP_NOTPROM|DDI_PROP_DONTPASS,
					    "ddi-failfast-supported",
					    (caddr_t)&propvalue, &proplength);

					if (error == DDI_PROP_SUCCESS)
						found = 1;
				}

				if (!found && new_non_ff_driver(
				    ddi_driver_name(devi))) {
					cmn_err(CE_NOTE, "!md: B_FAILFAST I/O "
					    "disabled on %s",
					    ddi_driver_name(devi));
				}

				ddi_release_devi(devi);
			}

			/*
			 * All components must support
			 * failfast in the submirror.
			 */
			if (!found) {
				ff = 0;
				break;
			}
		}

		if (ff) {
			sm->sm_flags |= MD_SM_FAILFAST;
		} else {
			sm->sm_flags &= ~MD_SM_FAILFAST;
		}
	}
}

/*
 * Return true if the submirror is unavailable.
 * If any of the submirror components are opened then the submirror cannot
 * be unavailable (MD_INACCESSIBLE).
 * If any of the components are already in the errored state, then the
 * submirror cannot be unavailable (MD_INACCESSIBLE).
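 *
 * (Summary added for clarity, not in the original comment: B_TRUE is
 * returned only when every component is unopened, or not probe-opened, and
 * in a non-errored state; the caller then marks the submirror
 * MD_INACCESSIBLE.)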
 */
static bool_t
submirror_unavailable(mm_unit_t *un, int smi, int from_probe)
{
	mm_submirror_t *sm;
	mm_submirror_ic_t *smic;
	md_m_shared_t *shared;
	int ci;
	int compcnt;

	sm = &un->un_sm[smi];
	smic = &un->un_smic[smi];

	compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
	for (ci = 0; ci < compcnt; ci++) {
		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
		    (sm->sm_dev, sm, ci);
		if (from_probe) {
			if (shared->ms_flags & MDM_S_PROBEOPEN)
				return (B_FALSE);
		} else {
			if (shared->ms_flags & MDM_S_ISOPEN)
				return (B_FALSE);
		}
		if (shared->ms_state == CS_ERRED ||
		    shared->ms_state == CS_LAST_ERRED)
			return (B_FALSE);
	}

	return (B_TRUE);
}

static int
mirror_open_all_devs(minor_t mnum, int md_oflags, IOLOCK *lockp)
{
	int i;
	mm_unit_t *un;
	mdi_unit_t *ui;
	int err;
	int smi;
	int ci;
	err_comp_t *c;
	err_comp_t *ecomps = NULL;
	int smmask = 0;
	set_t setno;
	int sm_cnt;
	int sm_unavail_cnt;

	mirror_check_failfast(mnum);

	un = MD_UNIT(mnum);
	ui = MDI_UNIT(mnum);
	setno = MD_UN2SET(un);

	for (i = 0; i < NMIRROR; i++) {
		md_dev64_t tmpdev = un->un_sm[i].sm_dev;

		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
			continue;
		if (md_layered_open(mnum, &tmpdev, md_oflags))
			smmask |= SMI2BIT(i);
		un->un_sm[i].sm_dev = tmpdev;
	}

	/*
	 * If smmask is clear, all submirrors are accessible. Clear the
	 * MD_INACCESSIBLE bit in this case. This bit is also cleared for the
	 * mirror device. If smmask is set, we have to determine which of the
	 * submirrors are in error. If no submirror is accessible we mark the
	 * whole mirror as MD_INACCESSIBLE.
	 */
	if (smmask == 0) {
		if (lockp) {
			md_ioctl_readerexit(lockp);
			(void) md_ioctl_writerlock(lockp, ui);
		} else {
			md_unit_readerexit(ui);
			(void) md_unit_writerlock(ui);
		}
		ui->ui_tstate &= ~MD_INACCESSIBLE;
		if (lockp) {
			md_ioctl_writerexit(lockp);
			(void) md_ioctl_readerlock(lockp, ui);
		} else {
			md_unit_writerexit(ui);
			(void) md_unit_readerlock(ui);
		}

		for (i = 0; i < NMIRROR; i++) {
			md_dev64_t tmpdev;
			mdi_unit_t *sm_ui;

			if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
				continue;

			tmpdev = un->un_sm[i].sm_dev;
			sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
			(void) md_unit_writerlock(sm_ui);
			sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
			md_unit_writerexit(sm_ui);
		}

		return (0);
	}

	for (i = 0; i < NMIRROR; i++) {
		md_dev64_t tmpdev;

		if (!(smmask & SMI2BIT(i)))
			continue;

		tmpdev = un->un_sm[i].sm_dev;
		err = md_layered_open(mnum, &tmpdev, MD_OFLG_CONT_ERRS);
		un->un_sm[i].sm_dev = tmpdev;
		ASSERT(err == 0);
	}

	if (lockp) {
		md_ioctl_readerexit(lockp);
		un = (mm_unit_t *)md_ioctl_writerlock(lockp, ui);
	} else {
		md_unit_readerexit(ui);
		un = (mm_unit_t *)md_unit_writerlock(ui);
	}

	/*
	 * We want to make sure the unavailable flag is not masking a real
	 * error on the submirror.
	 * For each submirror,
	 *	if all of the submirror components couldn't be opened and there
	 *	are no errors on the submirror, then set the unavailable flag;
	 *	otherwise, clear unavailable.
	 */
	sm_cnt = 0;
	sm_unavail_cnt = 0;
	for (i = 0; i < NMIRROR; i++) {
		md_dev64_t tmpdev;
		mdi_unit_t *sm_ui;

		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
			continue;

		sm_cnt++;
		tmpdev = un->un_sm[i].sm_dev;
		sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));

		(void) md_unit_writerlock(sm_ui);
		if (submirror_unavailable(un, i, 0)) {
			sm_ui->ui_tstate |= MD_INACCESSIBLE;
			sm_unavail_cnt++;
		} else {
			sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
		}
		md_unit_writerexit(sm_ui);
	}

	/*
	 * If all of the submirrors are unavailable, the mirror is also
	 * unavailable.
	 */
	if (sm_cnt == sm_unavail_cnt) {
		ui->ui_tstate |= MD_INACCESSIBLE;
	} else {
		ui->ui_tstate &= ~MD_INACCESSIBLE;
	}

	smi = 0;
	ci = 0;
	while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) {
		if (mirror_other_sources(un, smi, ci, 1) == 1) {

			free_all_ecomps(ecomps);
			(void) mirror_close_all_devs(un, md_oflags);
			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL,
			    SVM_TAG_METADEVICE, setno, MD_SID(un));
			mirror_openfail_console_info(un, smi, ci);
			if (lockp) {
				md_ioctl_writerexit(lockp);
				(void) md_ioctl_readerlock(lockp, ui);
			} else {
				md_unit_writerexit(ui);
				(void) md_unit_readerlock(ui);
			}
			return (ENXIO);
		}

		/* track all component states that need changing */
		c = (err_comp_t *)kmem_alloc(sizeof (err_comp_t), KM_SLEEP);
		c->ec_next = ecomps;
		c->ec_smi = smi;
		c->ec_ci = ci;
		ecomps = c;
		ci++;
	}

	/* Make all state changes and commit them */
	for (c = ecomps; c != NULL; c = c->ec_next) {
		/*
		 * If lockp is set, then entering kernel through ioctl.
		 * For a MN set, the only ioctl path is via a commd message
		 * (ALLOCATE_HOTSPARE or *RESYNC* messages) that is already
		 * being sent to each node.
		 * In this case, set NO_XMIT so that set_sm_comp_state
		 * won't attempt to send a message from within a message
		 * handler.
		 *
		 * In !MN sets, the xmit flag is ignored, so it doesn't matter
		 * which flag is passed.
		 */
		if (lockp) {
			set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0,
			    MD_STATE_NO_XMIT, lockp);
		} else {
			set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0,
			    (MD_STATE_XMIT | MD_STATE_OCHELD), lockp);
		}
		/*
		 * For a MN set, the NOTIFY is done when the state change is
		 * processed on each node
		 */
		if (!MD_MNSET_SETNO(setno)) {
			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
			    SVM_TAG_METADEVICE, setno, MD_SID(un));
		}
	}

	if (lockp) {
		md_ioctl_writerexit(lockp);
		(void) md_ioctl_readerlock(lockp, ui);
	} else {
		md_unit_writerexit(ui);
		(void) md_unit_readerlock(ui);
	}

	free_all_ecomps(ecomps);

	/* allocate hotspares for all errored components */
	if (MD_MNSET_SETNO(setno)) {
		/*
		 * If we're called from an ioctl (lockp set) then we cannot
		 * directly call send_poke_hotspares as this will block until
		 * the message gets despatched to all nodes. If the cluster is
		 * going through a reconfig cycle then the message will block
		 * until the cycle is complete, and as we originate from a
		 * service call from commd we will livelock.
		 */
		if (lockp == NULL) {
			md_unit_readerexit(ui);
			send_poke_hotspares(setno);
			(void) md_unit_readerlock(ui);
		}
	} else {
		(void) poke_hotspares();
	}
	return (0);
}

void
mirror_overlap_tree_remove(md_mps_t *ps)
{
	mm_unit_t *un;

	if (panicstr)
		return;

	VERIFY(ps->ps_flags & MD_MPS_ON_OVERLAP);
	un = ps->ps_un;

	mutex_enter(&un->un_overlap_tree_mx);
	avl_remove(&un->un_overlap_root, ps);
	ps->ps_flags &= ~MD_MPS_ON_OVERLAP;
	if (un->un_overlap_tree_flag != 0) {
		un->un_overlap_tree_flag = 0;
		cv_broadcast(&un->un_overlap_tree_cv);
	}
	mutex_exit(&un->un_overlap_tree_mx);
}


/*
 * wait_for_overlaps:
 * -----------------
 * Check that the given i/o request does not cause an overlap with already
 * pending i/o. If it does, block until the overlapped i/o completes.
 *
 * The flag argument has MD_OVERLAP_ALLOW_REPEAT set if it is ok for the parent
 * structure to be already in the overlap tree and MD_OVERLAP_NO_REPEAT if
 * it must not already be in the tree.
 */
static void
wait_for_overlaps(md_mps_t *ps, int flags)
{
	mm_unit_t *un;
	avl_index_t where;
	md_mps_t *ps1;

	if (panicstr)
		return;

	un = ps->ps_un;
	mutex_enter(&un->un_overlap_tree_mx);
	if ((flags & MD_OVERLAP_ALLOW_REPEAT) &&
	    (ps->ps_flags & MD_MPS_ON_OVERLAP)) {
		mutex_exit(&un->un_overlap_tree_mx);
		return;
	}

	VERIFY(!(ps->ps_flags & MD_MPS_ON_OVERLAP));

	do {
		ps1 = avl_find(&un->un_overlap_root, ps, &where);
		if (ps1 == NULL) {
			/*
			 * The candidate range does not overlap with any
			 * range in the tree. Insert it and be done.
			 */
			avl_insert(&un->un_overlap_root, ps, where);
			ps->ps_flags |= MD_MPS_ON_OVERLAP;
		} else {
			/*
			 * The candidate range would overlap. Set the flag
			 * indicating we need to be woken up, and sleep
			 * until another thread removes a range. If upon
			 * waking up we find this mps was put on the tree
			 * by another thread, the loop terminates.
			 */
			un->un_overlap_tree_flag = 1;
			cv_wait(&un->un_overlap_tree_cv,
			    &un->un_overlap_tree_mx);
		}
	} while (!(ps->ps_flags & MD_MPS_ON_OVERLAP));
	mutex_exit(&un->un_overlap_tree_mx);
}

/*
 * This function is called from mirror_done to check whether any pages have
 * been modified while a mirrored write was in progress. Returns 0 if
 * all pages associated with bp are clean, 1 otherwise.
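 *
 * (Added note, not in the original comment: biomodified(9F) returns -1 when
 * the buffer has no page mappings to inspect; that case is treated as
 * "clean" below.)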
 */
static int
any_pages_dirty(struct buf *bp)
{
	int rval;

	rval = biomodified(bp);
	if (rval == -1)
		rval = 0;

	return (rval);
}

#define	MAX_EXTRAS	10

void
mirror_commit(
	mm_unit_t *un,
	int smmask,
	mddb_recid_t *extras
)
{
	mm_submirror_t *sm;
	md_unit_t *su;
	int i;

	/* 2=mirror,null id */
	mddb_recid_t recids[NMIRROR+2+MAX_EXTRAS];

	int ri = 0;

	if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)
		return;

	/* Add two, this includes the mirror unit and the null recid */
	if (extras != NULL) {
		int nrecids = 0;
		while (extras[nrecids] != 0) {
			nrecids++;
		}
		ASSERT(nrecids <= MAX_EXTRAS);
	}

	if (un != NULL)
		recids[ri++] = un->c.un_record_id;
	for (i = 0; i < NMIRROR; i++) {
		if (!(smmask & SMI2BIT(i)))
			continue;
		sm = &un->un_sm[i];
		if (!SMS_IS(sm, SMS_INUSE))
			continue;
		if (md_getmajor(sm->sm_dev) != md_major)
			continue;
		su = MD_UNIT(md_getminor(sm->sm_dev));
		recids[ri++] = su->c.un_record_id;
	}

	if (extras != NULL)
		while (*extras != 0) {
			recids[ri++] = *extras;
			extras++;
		}

	if (ri == 0)
		return;
	recids[ri] = 0;

	/*
	 * Ok to hold ioctl lock across record commit to mddb as
	 * long as the record(s) being committed aren't resync records.
	 */
	mddb_commitrecs_wrapper(recids);
}


/*
 * This routine is used to build a bitmap (writable_bm) with a bit set for
 * each writable submirror in the metamirror. The bitmap, the count of
 * writable submirrors and a zeroed current-submirror index are stored in
 * the parent save structure (ps_writable_sm, ps_active_cnt, ps_current_sm).
 */

static void
select_write_units(struct mm_unit *un, md_mps_t *ps)
{

	int i;
	unsigned writable_bm = 0;
	unsigned nunits = 0;

	for (i = 0; i < NMIRROR; i++) {
		if (SUBMIRROR_IS_WRITEABLE(un, i)) {
			/* set bit of all writable units */
			writable_bm |= SMI2BIT(i);
			nunits++;
		}
	}
	ps->ps_writable_sm = writable_bm;
	ps->ps_active_cnt = nunits;
	ps->ps_current_sm = 0;
}

static
unsigned
select_write_after_read_units(struct mm_unit *un, md_mps_t *ps)
{

	int i;
	unsigned writable_bm = 0;
	unsigned nunits = 0;

	for (i = 0; i < NMIRROR; i++) {
		if (SUBMIRROR_IS_WRITEABLE(un, i) &&
		    un->un_sm[i].sm_flags & MD_SM_RESYNC_TARGET) {
			writable_bm |= SMI2BIT(i);
			nunits++;
		}
	}
	if ((writable_bm & ps->ps_allfrom_sm) != 0) {
		writable_bm &= ~ps->ps_allfrom_sm;
		nunits--;
	}
	ps->ps_writable_sm = writable_bm;
	ps->ps_active_cnt = nunits;
	ps->ps_current_sm = 0;
	return (nunits);
}

static md_dev64_t
select_read_unit(
	mm_unit_t *un,
	diskaddr_t blkno,
	u_longlong_t reqcount,
	u_longlong_t *cando,
	int must_be_opened,
	md_m_shared_t **shared,
	md_mcs_t *cs)
{
	int i;
	md_m_shared_t *s;
	uint_t lasterrcnt = 0;
	md_dev64_t dev = 0;
	u_longlong_t cnt;
	u_longlong_t mincnt;
	mm_submirror_t *sm;
	mm_submirror_ic_t *smic;
	mdi_unit_t *ui;

	mincnt = reqcount;
	for (i = 0; i < NMIRROR; i++) {
		if (!SUBMIRROR_IS_READABLE(un, i))
			continue;
		sm = &un->un_sm[i];
		smic = &un->un_smic[i];
		cnt = reqcount;

		/*
		 * If the current submirror is marked as inaccessible, do not
		 * try to access it.
		 */
		ui = MDI_UNIT(getminor(expldev(sm->sm_dev)));
		(void) md_unit_readerlock(ui);
		if (ui->ui_tstate & MD_INACCESSIBLE) {
			md_unit_readerexit(ui);
			continue;
		}
		md_unit_readerexit(ui);

		s = (md_m_shared_t *)(*(smic->sm_shared_by_blk))
		    (sm->sm_dev, sm, blkno, &cnt);

		if (must_be_opened && !(s->ms_flags & MDM_S_ISOPEN))
			continue;
		if (s->ms_state == CS_OKAY) {
			*cando = cnt;
			if (shared != NULL)
				*shared = s;

			if (un->un_sm[i].sm_flags & MD_SM_FAILFAST &&
			    cs != NULL) {
				cs->cs_buf.b_flags |= B_FAILFAST;
			}

			return (un->un_sm[i].sm_dev);
		}
		if (s->ms_state != CS_LAST_ERRED)
			continue;

		/* don't use B_FAILFAST since we're Last Erred */

		if (mincnt > cnt)
			mincnt = cnt;
		if (s->ms_lasterrcnt > lasterrcnt) {
			lasterrcnt = s->ms_lasterrcnt;
			if (shared != NULL)
				*shared = s;
			dev = un->un_sm[i].sm_dev;
		}
	}
	*cando = mincnt;
	return (dev);
}

/*
 * Given a 32-bit bitmap, this routine will return the bit number
 * of the nth bit set. The nth bit set is passed via the index integer.
 *
 * This routine is used to run through the writable submirror bitmap
 * and start all of the writes. The value returned is the index of the
 * appropriate submirror structure in the mirror's submirror array.
 */
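/*
 * Worked example (added for illustration, not in the original source):
 * for mask = 0xA (bits 1 and 3 set), md_find_nth_unit(0xA, 0) returns 1
 * and md_find_nth_unit(0xA, 1) returns 3.
 */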
static int
md_find_nth_unit(uint_t mask, int index)
{
	int bit, nfound;

	for (bit = -1, nfound = -1; nfound != index; bit++) {
		ASSERT(mask != 0);
		nfound += (mask & 1);
		mask >>= 1;
	}
	return (bit);
}

static int
fast_select_read_unit(md_mps_t *ps, md_mcs_t *cs)
{
	mm_unit_t *un;
	buf_t *bp;
	int i;
	unsigned nunits = 0;
	int iunit;
	uint_t running_bm = 0;
	uint_t sm_index;

	bp = &cs->cs_buf;
	un = ps->ps_un;

	for (i = 0; i < NMIRROR; i++) {
		if (!SMS_BY_INDEX_IS(un, i, SMS_RUNNING))
			continue;
		running_bm |= SMI2BIT(i);
		nunits++;
	}
	if (nunits == 0)
		return (1);

	/*
	 * For directed mirror read (DMR) we only use the specified side and
	 * do not compute the source of the read.
	 * If we're running with MD_MPS_DIRTY_RD set we always return the
	 * first mirror side (this prevents unnecessary ownership switching).
	 * Otherwise we return the submirror according to the mirror read
	 * option.
	 */
	if (ps->ps_flags & MD_MPS_DMR) {
		sm_index = un->un_dmr_last_read;
	} else if (ps->ps_flags & MD_MPS_DIRTY_RD) {
		sm_index = md_find_nth_unit(running_bm, 0);
	} else {
		/* Normal (non-DMR) operation */
		switch (un->un_read_option) {
		case RD_GEOMETRY:
			iunit = (int)(bp->b_lblkno /
			    howmany(un->c.un_total_blocks, nunits));
			sm_index = md_find_nth_unit(running_bm, iunit);
			break;
		case RD_FIRST:
			sm_index = md_find_nth_unit(running_bm, 0);
			break;
		case RD_LOAD_BAL:
			/* intentionally falls through to the default case */
		default:
			un->un_last_read = (un->un_last_read + 1) % nunits;
			sm_index = md_find_nth_unit(running_bm,
			    un->un_last_read);
			break;
		}
	}
	bp->b_edev = md_dev64_to_dev(un->un_sm[sm_index].sm_dev);
	ps->ps_allfrom_sm = SMI2BIT(sm_index);

	if (un->un_sm[sm_index].sm_flags & MD_SM_FAILFAST) {
		bp->b_flags |= B_FAILFAST;
	}

	return (0);
}

static
int
mirror_are_submirrors_available(mm_unit_t *un)
{
	int i;
	for (i = 0; i < NMIRROR; i++) {
		md_dev64_t tmpdev = un->un_sm[i].sm_dev;

		if ((!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) ||
		    md_getmajor(tmpdev) != md_major)
			continue;

		if ((MD_MIN2SET(md_getminor(tmpdev)) >= md_nsets) ||
		    (MD_MIN2UNIT(md_getminor(tmpdev)) >= md_nunits))
			return (0);

		if (MDI_UNIT(md_getminor(tmpdev)) == NULL)
			return (0);
	}
	return (1);
}

void
build_submirror(mm_unit_t *un, int i, int snarfing)
{
	struct mm_submirror *sm;
	struct mm_submirror_ic *smic;
	md_unit_t *su;
	set_t setno;

	sm = &un->un_sm[i];
	smic = &un->un_smic[i];

	sm->sm_flags = 0;	/* sometime we may need to do more here */

	setno = MD_UN2SET(un);

	if (!SMS_IS(sm, SMS_INUSE))
		return;
	if (snarfing) {
		sm->sm_dev = md_getdevnum(setno, mddb_getsidenum(setno),
		    sm->sm_key, MD_NOTRUST_DEVT);
	} else {
		if (md_getmajor(sm->sm_dev) == md_major) {
			su = MD_UNIT(md_getminor(sm->sm_dev));
			un->c.un_flag |= (su->c.un_flag & MD_LABELED);
			/* submirror can no longer be soft partitioned */
			MD_CAPAB(su) &= (~MD_CAN_SP);
		}
	}
	smic->sm_shared_by_blk = md_get_named_service(sm->sm_dev,
	    0, "shared by blk", 0);
	smic->sm_shared_by_indx = md_get_named_service(sm->sm_dev,
	    0,
"shared by indx", 0); 1744 smic->sm_get_component_count = (int (*)())md_get_named_service( 1745 sm->sm_dev, 0, "get component count", 0); 1746 smic->sm_get_bcss = (int (*)())md_get_named_service(sm->sm_dev, 0, 1747 "get block count skip size", 0); 1748 sm->sm_state &= ~SMS_IGNORE; 1749 if (SMS_IS(sm, SMS_OFFLINE)) 1750 MD_STATUS(un) |= MD_UN_OFFLINE_SM; 1751 md_set_parent(sm->sm_dev, MD_SID(un)); 1752 } 1753 1754 static void 1755 mirror_cleanup(mm_unit_t *un) 1756 { 1757 mddb_recid_t recid; 1758 int smi; 1759 sv_dev_t sv[NMIRROR]; 1760 int nsv = 0; 1761 1762 /* 1763 * If a MN diskset and this node is not the master, do 1764 * not delete any records on snarf of the mirror records. 1765 */ 1766 if (MD_MNSET_SETNO(MD_UN2SET(un)) && 1767 md_set[MD_UN2SET(un)].s_am_i_master == 0) { 1768 return; 1769 } 1770 1771 for (smi = 0; smi < NMIRROR; smi++) { 1772 if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) 1773 continue; 1774 sv[nsv].setno = MD_UN2SET(un); 1775 sv[nsv++].key = un->un_sm[smi].sm_key; 1776 } 1777 1778 recid = un->un_rr_dirty_recid; 1779 mddb_deleterec_wrapper(un->c.un_record_id); 1780 if (recid > 0) 1781 mddb_deleterec_wrapper(recid); 1782 1783 md_rem_names(sv, nsv); 1784 } 1785 1786 /* 1787 * Comparison function for the avl tree which tracks 1788 * outstanding writes on submirrors. 1789 * 1790 * Returns: 1791 * -1: ps1 < ps2 1792 * 0: ps1 and ps2 overlap 1793 * 1: ps1 > ps2 1794 */ 1795 static int 1796 mirror_overlap_compare(const void *p1, const void *p2) 1797 { 1798 const md_mps_t *ps1 = (md_mps_t *)p1; 1799 const md_mps_t *ps2 = (md_mps_t *)p2; 1800 1801 if (ps1->ps_firstblk < ps2->ps_firstblk) { 1802 if (ps1->ps_lastblk >= ps2->ps_firstblk) 1803 return (0); 1804 return (-1); 1805 } 1806 1807 if (ps1->ps_firstblk > ps2->ps_firstblk) { 1808 if (ps1->ps_firstblk <= ps2->ps_lastblk) 1809 return (0); 1810 return (1); 1811 } 1812 1813 return (0); 1814 } 1815 1816 /* 1817 * Collapse any sparse submirror entries snarfed from the on-disk replica. 1818 * Only the in-core entries are updated. The replica will be updated on-disk 1819 * when the in-core replica is committed on shutdown of the SVM subsystem. 1820 */ 1821 static void 1822 collapse_submirrors(mm_unit_t *un) 1823 { 1824 int smi, nremovals, smiremove; 1825 mm_submirror_t *sm, *new_sm, *old_sm; 1826 mm_submirror_ic_t *smic; 1827 int nsmidx = un->un_nsm - 1; 1828 1829 rescan: 1830 nremovals = 0; 1831 smiremove = -1; 1832 1833 for (smi = 0; smi <= nsmidx; smi++) { 1834 sm = &un->un_sm[smi]; 1835 1836 /* 1837 * Check to see if this submirror is marked as in-use. 1838 * If it isn't then it is a potential sparse entry and 1839 * may need to be cleared from the configuration. 1840 * The records should _already_ have been cleared by the 1841 * original mirror_detach() code, but we need to shuffle 1842 * any NULL entries in un_sm[] to the end of the array. 1843 * Any NULL un_smic[] entries need to be reset to the underlying 1844 * submirror/slice accessor functions. 
		 */
		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) {
			nremovals++;
			smiremove = smi;
			break;
		}
	}

	if (nremovals == 0) {
		/*
		 * Ensure that we have a matching contiguous set of un_smic[]
		 * entries for the corresponding un_sm[] entries
		 */
		for (smi = 0; smi <= nsmidx; smi++) {
			smic = &un->un_smic[smi];
			sm = &un->un_sm[smi];

			smic->sm_shared_by_blk =
			    md_get_named_service(sm->sm_dev, 0,
			    "shared by blk", 0);
			smic->sm_shared_by_indx =
			    md_get_named_service(sm->sm_dev, 0,
			    "shared by indx", 0);
			smic->sm_get_component_count =
			    (int (*)())md_get_named_service(sm->sm_dev, 0,
			    "get component count", 0);
			smic->sm_get_bcss =
			    (int (*)())md_get_named_service(sm->sm_dev, 0,
			    "get block count skip size", 0);
		}
		return;
	}

	/*
	 * Reshuffle the submirror devices so that we do not have a dead record
	 * in the middle of the array. Once we've done this we need to rescan
	 * the mirror to check for any other holes.
	 */
	for (smi = 0; smi < NMIRROR; smi++) {
		if (smi < smiremove)
			continue;
		if (smi > smiremove) {
			old_sm = &un->un_sm[smi];
			new_sm = &un->un_sm[smi - 1];
			bcopy(old_sm, new_sm, sizeof (mm_submirror_t));
			bzero(old_sm, sizeof (mm_submirror_t));
		}
	}

	/*
	 * Now we need to rescan the array to find the next potential dead
	 * entry.
	 */
	goto rescan;
}

/* Return a -1 if optimized record unavailable and set should be released */
int
mirror_build_incore(mm_unit_t *un, int snarfing)
{
	int i;

	if (MD_STATUS(un) & MD_UN_BEING_RESET) {
		mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCLEAN);
		return (1);
	}

	if (mirror_are_submirrors_available(un) == 0)
		return (1);

	if (MD_UNIT(MD_SID(un)) != NULL)
		return (0);

	MD_STATUS(un) = 0;

	/* pre-4.1 didn't define CAN_META_CHILD capability */
	MD_CAPAB(un) = MD_CAN_META_CHILD | MD_CAN_PARENT | MD_CAN_SP;

	un->un_overlap_tree_flag = 0;
	avl_create(&un->un_overlap_root, mirror_overlap_compare,
	    sizeof (md_mps_t), offsetof(md_mps_t, ps_overlap_node));

	/*
	 * We need to collapse any sparse submirror entries into a non-sparse
	 * array. This is to cover the case where we have an old replica image
	 * which has not been updated (i.e. snarfed) since being modified.
	 * The new code expects all submirror access to be sequential (i.e.
	 * both the un_sm[] and un_smic[] entries correspond to non-empty
	 * submirrors).
	 */

	collapse_submirrors(un);

	for (i = 0; i < NMIRROR; i++)
		build_submirror(un, i, snarfing);

	if (unit_setup_resync(un, snarfing) != 0) {
		if (snarfing) {
			mddb_setrecprivate(un->c.un_record_id, MD_PRV_GOTIT);
			/*
			 * If a MN set and set is not stale, then return -1
			 * which will force the caller to unload the set.
			 * The MN diskset nodes will return failure if
			 * unit_setup_resync fails so that nodes won't
			 * get out of sync.
			 *
			 * If set is STALE, the master node can't allocate
			 * a resync record (if needed), but node needs to
			 * join the set so that user can delete broken mddbs.
			 * So, if set is STALE, just continue on.
			 */
			if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
			    !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) {
				return (-1);
			}
		} else
			return (1);
	}

	mutex_init(&un->un_overlap_tree_mx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&un->un_overlap_tree_cv, NULL, CV_DEFAULT, NULL);

	un->un_suspend_wr_flag = 0;
	mutex_init(&un->un_suspend_wr_mx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&un->un_suspend_wr_cv, NULL, CV_DEFAULT, NULL);

	/*
	 * Allocate mutexes for mirror-owner and resync-owner changes.
	 * All references to the owner message state field must be guarded
	 * by this mutex.
	 */
	mutex_init(&un->un_owner_mx, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Allocate mutex and condvar for resync thread manipulation. These
	 * will be used by mirror_resync_unit/mirror_ioctl_resync
	 */
	mutex_init(&un->un_rs_thread_mx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&un->un_rs_thread_cv, NULL, CV_DEFAULT, NULL);

	/*
	 * Allocate mutex and condvar for resync progress thread manipulation.
	 * This allows resyncs to be continued across an intervening reboot.
	 */
	mutex_init(&un->un_rs_progress_mx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&un->un_rs_progress_cv, NULL, CV_DEFAULT, NULL);

	/*
	 * Allocate mutex and condvar for Directed Mirror Reads (DMR). This
	 * provides synchronization between a user-ioctl and the resulting
	 * strategy() call that performs the read().
	 */
	mutex_init(&un->un_dmr_mx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&un->un_dmr_cv, NULL, CV_DEFAULT, NULL);

	/*
	 * Allocate rwlocks for un_pernode_dirty_bm accessing.
	 */
	for (i = 0; i < MD_MNMAXSIDES; i++) {
		rw_init(&un->un_pernode_dirty_mx[i], NULL, RW_DEFAULT, NULL);
	}

	/* place various information in the in-core data structures */
	md_nblocks_set(MD_SID(un), un->c.un_total_blocks);
	MD_UNIT(MD_SID(un)) = un;

	return (0);
}


void
reset_mirror(struct mm_unit *un, minor_t mnum, int removing)
{
	mddb_recid_t recid, vtoc_id;
	size_t bitcnt;
	size_t shortcnt;
	int smi;
	sv_dev_t sv[NMIRROR];
	int nsv = 0;
	uint_t bits = 0;
	minor_t selfid;
	md_unit_t *su;
	int i;

	md_destroy_unit_incore(mnum, &mirror_md_ops);

	shortcnt = un->un_rrd_num * sizeof (short);
	bitcnt = howmany(un->un_rrd_num, NBBY);

	if (un->un_outstanding_writes)
		kmem_free((caddr_t)un->un_outstanding_writes, shortcnt);
	if (un->un_goingclean_bm)
		kmem_free((caddr_t)un->un_goingclean_bm, bitcnt);
	if (un->un_goingdirty_bm)
		kmem_free((caddr_t)un->un_goingdirty_bm, bitcnt);
	if (un->un_resync_bm)
		kmem_free((caddr_t)un->un_resync_bm, bitcnt);
	if (un->un_pernode_dirty_sum)
		kmem_free((caddr_t)un->un_pernode_dirty_sum, un->un_rrd_num);

	/*
	 * Destroy the taskq for deferred processing of DRL clean requests.
	 * This taskq will only be present for Multi Owner mirrors.
2048 */ 2049 if (un->un_drl_task != NULL) 2050 ddi_taskq_destroy(un->un_drl_task); 2051 2052 md_nblocks_set(mnum, -1ULL); 2053 MD_UNIT(mnum) = NULL; 2054 2055 /* 2056 * Attempt release of its minor node 2057 */ 2058 md_remove_minor_node(mnum); 2059 2060 if (!removing) 2061 return; 2062 2063 for (smi = 0; smi < NMIRROR; smi++) { 2064 if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) 2065 continue; 2066 /* reallow soft partitioning of submirror and reset parent */ 2067 su = MD_UNIT(md_getminor(un->un_sm[smi].sm_dev)); 2068 MD_CAPAB(su) |= MD_CAN_SP; 2069 md_reset_parent(un->un_sm[smi].sm_dev); 2070 reset_comp_states(&un->un_sm[smi], &un->un_smic[smi]); 2071 2072 sv[nsv].setno = MD_MIN2SET(mnum); 2073 sv[nsv++].key = un->un_sm[smi].sm_key; 2074 bits |= SMI2BIT(smi); 2075 } 2076 2077 MD_STATUS(un) |= MD_UN_BEING_RESET; 2078 recid = un->un_rr_dirty_recid; 2079 vtoc_id = un->c.un_vtoc_id; 2080 selfid = MD_SID(un); 2081 2082 mirror_commit(un, bits, 0); 2083 2084 avl_destroy(&un->un_overlap_root); 2085 2086 /* Destroy all mutexes and condvars before returning. */ 2087 mutex_destroy(&un->un_suspend_wr_mx); 2088 cv_destroy(&un->un_suspend_wr_cv); 2089 mutex_destroy(&un->un_overlap_tree_mx); 2090 cv_destroy(&un->un_overlap_tree_cv); 2091 mutex_destroy(&un->un_owner_mx); 2092 mutex_destroy(&un->un_rs_thread_mx); 2093 cv_destroy(&un->un_rs_thread_cv); 2094 mutex_destroy(&un->un_rs_progress_mx); 2095 cv_destroy(&un->un_rs_progress_cv); 2096 mutex_destroy(&un->un_dmr_mx); 2097 cv_destroy(&un->un_dmr_cv); 2098 2099 for (i = 0; i < MD_MNMAXSIDES; i++) { 2100 rw_destroy(&un->un_pernode_dirty_mx[i]); 2101 if (un->un_pernode_dirty_bm[i]) 2102 kmem_free((caddr_t)un->un_pernode_dirty_bm[i], bitcnt); 2103 } 2104 2105 /* 2106 * Remove self from the namespace 2107 */ 2108 if (un->c.un_revision & MD_FN_META_DEV) { 2109 (void) md_rem_selfname(un->c.un_self_id); 2110 } 2111 2112 /* This frees the unit structure. */ 2113 mddb_deleterec_wrapper(un->c.un_record_id); 2114 2115 if (recid != 0) 2116 mddb_deleterec_wrapper(recid); 2117 2118 /* Remove the vtoc, if present */ 2119 if (vtoc_id) 2120 mddb_deleterec_wrapper(vtoc_id); 2121 2122 md_rem_names(sv, nsv); 2123 2124 SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE, 2125 MD_MIN2SET(selfid), selfid); 2126 } 2127 2128 int 2129 mirror_internal_open( 2130 minor_t mnum, 2131 int flag, 2132 int otyp, 2133 int md_oflags, 2134 IOLOCK *lockp /* can be NULL */ 2135 ) 2136 { 2137 mdi_unit_t *ui = MDI_UNIT(mnum); 2138 int err = 0; 2139 2140 tryagain: 2141 /* single thread */ 2142 if (lockp) { 2143 /* 2144 * If ioctl lock is held, use openclose_enter 2145 * routine that will set the ioctl flag when 2146 * grabbing the readerlock. 2147 */ 2148 (void) md_ioctl_openclose_enter(lockp, ui); 2149 } else { 2150 (void) md_unit_openclose_enter(ui); 2151 } 2152 2153 /* 2154 * The mirror_open_all_devs routine may end up sending a STATE_UPDATE 2155 * message in a MN diskset and this requires that the openclose 2156 * lock is dropped in order to send this message. So, another 2157 * flag (MD_UL_OPENINPROGRESS) is used to keep another thread from 2158 * attempting an open while this thread has an open in progress. 2159 * Call the *_lh version of the lock exit routines since the ui_mx 2160 * mutex must be held from checking for OPENINPROGRESS until 2161 * after the cv_wait call. 
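 *
 * The resulting open protocol is, in outline:
 *
 *     openclose_enter (ioctl or plain variant)
 *     mutex_enter(&ui->ui_mx)
 *     if MD_UL_OPENINPROGRESS is set:
 *         drop the openclose lock with the _lh routine,
 *         cv_wait() on ui_cv, drop ui_mx and retry from the top
 *     set MD_UL_OPENINPROGRESS, mutex_exit(&ui->ui_mx)
 *     open the devices and count the open
 *     clear MD_UL_OPENINPROGRESS and exit the openclose lock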
2162 */ 2163 mutex_enter(&ui->ui_mx); 2164 if (ui->ui_lock & MD_UL_OPENINPROGRESS) { 2165 if (lockp) { 2166 (void) md_ioctl_openclose_exit_lh(lockp); 2167 } else { 2168 md_unit_openclose_exit_lh(ui); 2169 } 2170 cv_wait(&ui->ui_cv, &ui->ui_mx); 2171 mutex_exit(&ui->ui_mx); 2172 goto tryagain; 2173 } 2174 2175 ui->ui_lock |= MD_UL_OPENINPROGRESS; 2176 mutex_exit(&ui->ui_mx); 2177 2178 /* open devices, if necessary */ 2179 if (! md_unit_isopen(ui) || (ui->ui_tstate & MD_INACCESSIBLE)) { 2180 if ((err = mirror_open_all_devs(mnum, md_oflags, lockp)) != 0) 2181 goto out; 2182 } 2183 2184 /* count open */ 2185 if ((err = md_unit_incopen(mnum, flag, otyp)) != 0) 2186 goto out; 2187 2188 /* unlock, return success */ 2189 out: 2190 mutex_enter(&ui->ui_mx); 2191 ui->ui_lock &= ~MD_UL_OPENINPROGRESS; 2192 mutex_exit(&ui->ui_mx); 2193 2194 if (lockp) { 2195 /* 2196 * If ioctl lock is held, use openclose_exit 2197 * routine that will clear the lockp reader flag. 2198 */ 2199 (void) md_ioctl_openclose_exit(lockp); 2200 } else { 2201 md_unit_openclose_exit(ui); 2202 } 2203 return (err); 2204 } 2205 2206 int 2207 mirror_internal_close( 2208 minor_t mnum, 2209 int otyp, 2210 int md_cflags, 2211 IOLOCK *lockp /* can be NULL */ 2212 ) 2213 { 2214 mdi_unit_t *ui = MDI_UNIT(mnum); 2215 mm_unit_t *un; 2216 int err = 0; 2217 2218 /* single thread */ 2219 if (lockp) { 2220 /* 2221 * If ioctl lock is held, use openclose_enter 2222 * routine that will set the ioctl flag when 2223 * grabbing the readerlock. 2224 */ 2225 un = (mm_unit_t *)md_ioctl_openclose_enter(lockp, ui); 2226 } else { 2227 un = (mm_unit_t *)md_unit_openclose_enter(ui); 2228 } 2229 2230 /* count closed */ 2231 if ((err = md_unit_decopen(mnum, otyp)) != 0) 2232 goto out; 2233 2234 /* close devices, if necessary */ 2235 if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) { 2236 /* 2237 * Clean up dirty bitmap for this unit. Do this 2238 * before closing the underlying devices to avoid 2239 * race conditions with reset_mirror() as a 2240 * result of a 'metaset -r' command running in 2241 * parallel. This might cause deallocation of 2242 * dirty region bitmaps; with underlying metadevices 2243 * in place this can't happen. 2244 * Don't do this if a MN set and ABR not set 2245 */ 2246 if (new_resync && !(MD_STATUS(un) & MD_UN_KEEP_DIRTY)) { 2247 if (!MD_MNSET_SETNO(MD_UN2SET(un)) || 2248 !(ui->ui_tstate & MD_ABR_CAP)) 2249 mirror_process_unit_resync(un); 2250 } 2251 (void) mirror_close_all_devs(un, md_cflags); 2252 2253 /* 2254 * For a MN set with transient capabilities (eg ABR/DMR) set, 2255 * clear these capabilities on the last open in the cluster. 2256 * To do this we send a message to all nodes to see of the 2257 * device is open. 2258 */ 2259 if (MD_MNSET_SETNO(MD_UN2SET(un)) && 2260 (ui->ui_tstate & (MD_ABR_CAP|MD_DMR_CAP))) { 2261 if (lockp) { 2262 (void) md_ioctl_openclose_exit(lockp); 2263 } else { 2264 md_unit_openclose_exit(ui); 2265 } 2266 2267 /* 2268 * if we are in the context of an ioctl, drop the 2269 * ioctl lock. 2270 * Otherwise, no other locks should be held. 2271 */ 2272 if (lockp) { 2273 IOLOCK_RETURN_RELEASE(0, lockp); 2274 } 2275 2276 mdmn_clear_all_capabilities(mnum); 2277 2278 /* if dropped the lock previously, regain it */ 2279 if (lockp) { 2280 IOLOCK_RETURN_REACQUIRE(lockp); 2281 } 2282 return (0); 2283 } 2284 /* unlock and return success */ 2285 } 2286 out: 2287 /* Call whether lockp is NULL or not. 
*/ 2288 if (lockp) { 2289 md_ioctl_openclose_exit(lockp); 2290 } else { 2291 md_unit_openclose_exit(ui); 2292 } 2293 return (err); 2294 } 2295 2296 /* 2297 * When a component has completed resyncing and is now ok, check if the 2298 * corresponding component in the other submirrors is in the Last Erred 2299 * state. If it is, we want to change that to the Erred state so we stop 2300 * using that component and start using this good component instead. 2301 * 2302 * This is called from set_sm_comp_state and recursively calls 2303 * set_sm_comp_state if it needs to change the Last Erred state. 2304 */ 2305 static void 2306 reset_lasterred(mm_unit_t *un, int smi, mddb_recid_t *extras, uint_t flags, 2307 IOLOCK *lockp) 2308 { 2309 mm_submirror_t *sm; 2310 mm_submirror_ic_t *smic; 2311 int ci; 2312 int i; 2313 int compcnt; 2314 int changed = 0; 2315 2316 for (i = 0; i < NMIRROR; i++) { 2317 sm = &un->un_sm[i]; 2318 smic = &un->un_smic[i]; 2319 2320 if (!SMS_IS(sm, SMS_INUSE)) 2321 continue; 2322 2323 /* ignore the submirror that we just made ok */ 2324 if (i == smi) 2325 continue; 2326 2327 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un); 2328 for (ci = 0; ci < compcnt; ci++) { 2329 md_m_shared_t *shared; 2330 2331 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) 2332 (sm->sm_dev, sm, ci); 2333 2334 if ((shared->ms_state & CS_LAST_ERRED) && 2335 !mirror_other_sources(un, i, ci, 1)) { 2336 2337 set_sm_comp_state(un, i, ci, CS_ERRED, extras, 2338 flags, lockp); 2339 changed = 1; 2340 } 2341 } 2342 } 2343 2344 /* maybe there is a hotspare for this newly erred component */ 2345 if (changed) { 2346 set_t setno; 2347 2348 setno = MD_UN2SET(un); 2349 if (MD_MNSET_SETNO(setno)) { 2350 send_poke_hotspares(setno); 2351 } else { 2352 (void) poke_hotspares(); 2353 } 2354 } 2355 } 2356 2357 /* 2358 * set_sm_comp_state 2359 * 2360 * Set the state of a submirror component to the specified new state. 2361 * If the mirror is in a multi-node set, send messages to all nodes to 2362 * block all writes to the mirror and then update the state and release the 2363 * writes. These messages are only sent if MD_STATE_XMIT is set in flags. 2364 * MD_STATE_XMIT will be unset in 2 cases: 2365 * 1. When the state is changed to CS_RESYNC as this state change 2366 * will already have been updated on each node by the processing of the 2367 * distributed metasync command, hence no need to xmit. 2368 * 2. When the state is change to CS_OKAY after a resync has completed. Again 2369 * the resync completion will already have been processed on each node by 2370 * the processing of the MD_MN_MSG_RESYNC_PHASE_DONE message for a component 2371 * resync, hence no need to xmit. 2372 * 2373 * In case we are called from the updates of a watermark, 2374 * (then MD_STATE_WMUPDATE will be set in the ps->flags) this is due to 2375 * a metainit or similar. In this case the message that we sent to propagate 2376 * the state change must not be a class1 message as that would deadlock with 2377 * the metainit command that is still being processed. 2378 * This we achieve by creating a class2 message MD_MN_MSG_STATE_UPDATE2 2379 * instead. This also makes the submessage generator to create a class2 2380 * submessage rather than a class1 (which would also block) 2381 * 2382 * On entry, unit_writerlock is held 2383 * If MD_STATE_OCHELD is set in flags, then unit_openclose lock is 2384 * also held. 
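 *
 * The message used for the multi-node propagation is therefore chosen
 * as follows in the body of this function:
 *   MD_STATE_WMUPDATE set - MD_MN_MSG_STATE_UPDATE2 sent with
 *                           MD_MSGF_NO_LOG (class2, so it cannot
 *                           block behind the in-flight metainit)
 *   otherwise             - MD_MN_MSG_STATE_UPDATE with the default
 *                           message flags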
2385 */ 2386 void 2387 set_sm_comp_state( 2388 mm_unit_t *un, 2389 int smi, 2390 int ci, 2391 int newstate, 2392 mddb_recid_t *extras, 2393 uint_t flags, 2394 IOLOCK *lockp 2395 ) 2396 { 2397 mm_submirror_t *sm; 2398 mm_submirror_ic_t *smic; 2399 md_m_shared_t *shared; 2400 int origstate; 2401 void (*get_dev)(); 2402 ms_cd_info_t cd; 2403 char devname[MD_MAX_CTDLEN]; 2404 int err; 2405 set_t setno = MD_UN2SET(un); 2406 md_mn_msg_stch_t stchmsg; 2407 mdi_unit_t *ui = MDI_UNIT(MD_SID(un)); 2408 md_mn_kresult_t *kresult; 2409 int rval; 2410 uint_t msgflags; 2411 md_mn_msgtype_t msgtype; 2412 int save_lock = 0; 2413 mdi_unit_t *ui_sm; 2414 2415 sm = &un->un_sm[smi]; 2416 smic = &un->un_smic[smi]; 2417 2418 /* If we have a real error status then turn off MD_INACCESSIBLE. */ 2419 ui_sm = MDI_UNIT(getminor(md_dev64_to_dev(sm->sm_dev))); 2420 if (newstate & (CS_ERRED | CS_RESYNC | CS_LAST_ERRED) && 2421 ui_sm->ui_tstate & MD_INACCESSIBLE) { 2422 ui_sm->ui_tstate &= ~MD_INACCESSIBLE; 2423 } 2424 2425 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) 2426 (sm->sm_dev, sm, ci); 2427 origstate = shared->ms_state; 2428 2429 /* 2430 * If the new state is an error and the old one wasn't, generate 2431 * a console message. We do this before we send the state to other 2432 * nodes in a MN set because the state change may change the component 2433 * name if a hotspare is allocated. 2434 */ 2435 if ((! (origstate & (CS_ERRED|CS_LAST_ERRED))) && 2436 (newstate & (CS_ERRED|CS_LAST_ERRED))) { 2437 2438 get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0, 2439 "get device", 0); 2440 (void) (*get_dev)(sm->sm_dev, sm, ci, &cd); 2441 2442 err = md_getdevname(setno, mddb_getsidenum(setno), 0, 2443 cd.cd_dev, devname, sizeof (devname)); 2444 2445 if (err == ENOENT) { 2446 (void) md_devname(setno, cd.cd_dev, devname, 2447 sizeof (devname)); 2448 } 2449 2450 cmn_err(CE_WARN, "md: %s: %s needs maintenance", 2451 md_shortname(md_getminor(sm->sm_dev)), devname); 2452 2453 if (newstate & CS_LAST_ERRED) { 2454 cmn_err(CE_WARN, "md: %s: %s last erred", 2455 md_shortname(md_getminor(sm->sm_dev)), 2456 devname); 2457 2458 } else if (shared->ms_flags & MDM_S_ISOPEN) { 2459 /* 2460 * Close the broken device and clear the open flag on 2461 * it. Closing the device means the RCM framework will 2462 * be able to unconfigure the device if required. 2463 * 2464 * We have to check that the device is open, otherwise 2465 * the first open on it has resulted in the error that 2466 * is being processed and the actual cd.cd_dev will be 2467 * NODEV64. 2468 * 2469 * If this is a multi-node mirror, then the multinode 2470 * state checks following this code will cause the 2471 * slave nodes to close the mirror in the function 2472 * mirror_set_state(). 2473 */ 2474 md_layered_close(cd.cd_dev, MD_OFLG_NULL); 2475 shared->ms_flags &= ~MDM_S_ISOPEN; 2476 } 2477 2478 } else if ((origstate & CS_LAST_ERRED) && (newstate & CS_ERRED) && 2479 (shared->ms_flags & MDM_S_ISOPEN)) { 2480 /* 2481 * Similar to logic above except no log messages since we 2482 * are just transitioning from Last Erred to Erred. 
2483 */ 2484 get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0, 2485 "get device", 0); 2486 (void) (*get_dev)(sm->sm_dev, sm, ci, &cd); 2487 2488 md_layered_close(cd.cd_dev, MD_OFLG_NULL); 2489 shared->ms_flags &= ~MDM_S_ISOPEN; 2490 } 2491 2492 if ((MD_MNSET_SETNO(setno)) && (origstate != newstate) && 2493 (flags & MD_STATE_XMIT) && !(ui->ui_tstate & MD_ERR_PENDING)) { 2494 /* 2495 * For a multi-node mirror, send the state change to the 2496 * master, which broadcasts to all nodes, including this 2497 * one. Once the message is received, the state is set 2498 * in-core and the master commits the change to disk. 2499 * There is a case, comp_replace, where this function 2500 * can be called from within an ioctl and therefore in this 2501 * case, as the ioctl will already be called on each node, 2502 * there is no need to xmit the state change to the master for 2503 * distribution to the other nodes. MD_STATE_XMIT flag is used 2504 * to indicate whether a xmit is required. The mirror's 2505 * transient state is set to MD_ERR_PENDING to avoid sending 2506 * multiple messages. 2507 */ 2508 if (newstate & (CS_ERRED|CS_LAST_ERRED)) 2509 ui->ui_tstate |= MD_ERR_PENDING; 2510 2511 /* 2512 * Send a state update message to all nodes. This message 2513 * will generate 2 submessages, the first one to suspend 2514 * all writes to the mirror and the second to update the 2515 * state and resume writes. 2516 */ 2517 stchmsg.msg_stch_mnum = un->c.un_self_id; 2518 stchmsg.msg_stch_sm = smi; 2519 stchmsg.msg_stch_comp = ci; 2520 stchmsg.msg_stch_new_state = newstate; 2521 stchmsg.msg_stch_hs_id = shared->ms_hs_id; 2522 #ifdef DEBUG 2523 if (mirror_debug_flag) 2524 printf("send set state, %x, %x, %x, %x, %x\n", 2525 stchmsg.msg_stch_mnum, stchmsg.msg_stch_sm, 2526 stchmsg.msg_stch_comp, stchmsg.msg_stch_new_state, 2527 stchmsg.msg_stch_hs_id); 2528 #endif 2529 if (flags & MD_STATE_WMUPDATE) { 2530 msgtype = MD_MN_MSG_STATE_UPDATE2; 2531 /* 2532 * When coming from an update of watermarks, there 2533 * must already be a message logged that triggered 2534 * this action. So, no need to log this message, too. 2535 */ 2536 msgflags = MD_MSGF_NO_LOG; 2537 } else { 2538 msgtype = MD_MN_MSG_STATE_UPDATE; 2539 msgflags = MD_MSGF_DEFAULT_FLAGS; 2540 } 2541 2542 /* 2543 * If we are in the context of an ioctl, drop the ioctl lock. 2544 * lockp holds the list of locks held. 2545 * 2546 * Otherwise, increment the appropriate reacquire counters. 2547 * If openclose lock is *held, then must reacquire reader 2548 * lock before releasing the openclose lock. 2549 * Do not drop the ARRAY_WRITER lock as we may not be able 2550 * to reacquire it. 2551 */ 2552 if (lockp) { 2553 if (lockp->l_flags & MD_ARRAY_WRITER) { 2554 save_lock = MD_ARRAY_WRITER; 2555 lockp->l_flags &= ~MD_ARRAY_WRITER; 2556 } else if (lockp->l_flags & MD_ARRAY_READER) { 2557 save_lock = MD_ARRAY_READER; 2558 lockp->l_flags &= ~MD_ARRAY_READER; 2559 } 2560 IOLOCK_RETURN_RELEASE(0, lockp); 2561 } else { 2562 if (flags & MD_STATE_OCHELD) { 2563 md_unit_writerexit(ui); 2564 (void) md_unit_readerlock(ui); 2565 md_unit_openclose_exit(ui); 2566 } else { 2567 md_unit_writerexit(ui); 2568 } 2569 } 2570 2571 kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); 2572 rval = mdmn_ksend_message(setno, msgtype, msgflags, 0, 2573 (char *)&stchmsg, sizeof (stchmsg), kresult); 2574 2575 if (!MDMN_KSEND_MSG_OK(rval, kresult)) { 2576 mdmn_ksend_show_error(rval, kresult, "STATE UPDATE"); 2577 /* If we're shutting down already, pause things here. 
*/ 2578 if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) { 2579 while (!md_mn_is_commd_present()) { 2580 delay(md_hz); 2581 } 2582 } 2583 cmn_err(CE_PANIC, 2584 "ksend_message failure: STATE_UPDATE"); 2585 } 2586 kmem_free(kresult, sizeof (md_mn_kresult_t)); 2587 2588 /* if dropped the lock previously, regain it */ 2589 if (lockp) { 2590 IOLOCK_RETURN_REACQUIRE(lockp); 2591 lockp->l_flags |= save_lock; 2592 } else { 2593 /* 2594 * Reacquire dropped locks and update acquirecnts 2595 * appropriately. 2596 */ 2597 if (flags & MD_STATE_OCHELD) { 2598 /* 2599 * openclose also grabs readerlock. 2600 */ 2601 (void) md_unit_openclose_enter(ui); 2602 md_unit_readerexit(ui); 2603 (void) md_unit_writerlock(ui); 2604 } else { 2605 (void) md_unit_writerlock(ui); 2606 } 2607 } 2608 2609 ui->ui_tstate &= ~MD_ERR_PENDING; 2610 } else { 2611 shared->ms_state = newstate; 2612 uniqtime32(&shared->ms_timestamp); 2613 2614 if (newstate == CS_ERRED) 2615 shared->ms_flags |= MDM_S_NOWRITE; 2616 else 2617 shared->ms_flags &= ~MDM_S_NOWRITE; 2618 2619 shared->ms_flags &= ~MDM_S_IOERR; 2620 un->un_changecnt++; 2621 shared->ms_lasterrcnt = un->un_changecnt; 2622 2623 mirror_set_sm_state(sm, smic, SMS_RUNNING, 0); 2624 mirror_commit(un, SMI2BIT(smi), extras); 2625 } 2626 2627 if ((origstate & CS_RESYNC) && (newstate & CS_OKAY)) { 2628 /* 2629 * Resetting the Last Erred state will recursively call back 2630 * into this function (set_sm_comp_state) to update the state. 2631 */ 2632 reset_lasterred(un, smi, extras, flags, lockp); 2633 } 2634 } 2635 2636 static int 2637 find_another_logical( 2638 mm_unit_t *un, 2639 mm_submirror_t *esm, 2640 diskaddr_t blk, 2641 u_longlong_t cnt, 2642 int must_be_open, 2643 int state, 2644 int err_cnt) 2645 { 2646 u_longlong_t cando; 2647 md_dev64_t dev; 2648 md_m_shared_t *s; 2649 2650 esm->sm_state |= SMS_IGNORE; 2651 while (cnt != 0) { 2652 u_longlong_t mcnt; 2653 2654 mcnt = MIN(cnt, lbtodb(1024 * 1024 * 1024)); /* 1 Gig Blks */ 2655 2656 dev = select_read_unit(un, blk, mcnt, &cando, 2657 must_be_open, &s, NULL); 2658 if (dev == (md_dev64_t)0) 2659 break; 2660 2661 if ((state == CS_LAST_ERRED) && 2662 (s->ms_state == CS_LAST_ERRED) && 2663 (err_cnt > s->ms_lasterrcnt)) 2664 break; 2665 2666 cnt -= cando; 2667 blk += cando; 2668 } 2669 esm->sm_state &= ~SMS_IGNORE; 2670 return (cnt != 0); 2671 } 2672 2673 int 2674 mirror_other_sources(mm_unit_t *un, int smi, int ci, int must_be_open) 2675 { 2676 mm_submirror_t *sm; 2677 mm_submirror_ic_t *smic; 2678 size_t count; 2679 diskaddr_t block; 2680 u_longlong_t skip; 2681 u_longlong_t size; 2682 md_dev64_t dev; 2683 int cnt; 2684 md_m_shared_t *s; 2685 int not_found; 2686 2687 sm = &un->un_sm[smi]; 2688 smic = &un->un_smic[smi]; 2689 dev = sm->sm_dev; 2690 2691 /* 2692 * Make sure every component of the submirror 2693 * has other sources. 
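 *
 * Calling convention, in outline: a negative ci means "check every
 * component of submirror smi" (handled by the recursion below), a
 * non-negative ci checks that single component.  The return value is
 * 1 if some block of the component(s) has no other readable source,
 * 0 if everything can be read from another submirror.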
2694 */ 2695 if (ci < 0) { 2696 /* Find the highest lasterrcnt */ 2697 cnt = (*(smic->sm_get_component_count))(dev, sm); 2698 for (ci = 0; ci < cnt; ci++) { 2699 not_found = mirror_other_sources(un, smi, ci, 2700 must_be_open); 2701 if (not_found) 2702 return (1); 2703 } 2704 return (0); 2705 } 2706 2707 /* 2708 * Make sure this component has other sources 2709 */ 2710 (void) (*(smic->sm_get_bcss)) 2711 (dev, sm, ci, &block, &count, &skip, &size); 2712 2713 if (count == 0) 2714 return (1); 2715 2716 s = (md_m_shared_t *)(*(smic->sm_shared_by_indx))(dev, sm, ci); 2717 2718 while (count--) { 2719 if (block >= un->c.un_total_blocks) 2720 return (0); 2721 2722 if ((block + size) > un->c.un_total_blocks) 2723 size = un->c.un_total_blocks - block; 2724 2725 not_found = find_another_logical(un, sm, block, size, 2726 must_be_open, s->ms_state, s->ms_lasterrcnt); 2727 if (not_found) 2728 return (1); 2729 2730 block += size + skip; 2731 } 2732 return (0); 2733 } 2734 2735 static void 2736 finish_error(md_mps_t *ps) 2737 { 2738 struct buf *pb; 2739 mm_unit_t *un; 2740 mdi_unit_t *ui; 2741 uint_t new_str_flags; 2742 2743 pb = ps->ps_bp; 2744 un = ps->ps_un; 2745 ui = ps->ps_ui; 2746 2747 /* 2748 * Must flag any error to the resync originator if we're performing 2749 * a Write-after-Read. This corresponds to an i/o error on a resync 2750 * target device and in this case we ought to abort the resync as there 2751 * is nothing that can be done to recover from this without operator 2752 * intervention. If we don't set the B_ERROR flag we will continue 2753 * reading from the mirror but won't write to the target (as it will 2754 * have been placed into an errored state). 2755 * To handle the case of multiple components within a submirror we only 2756 * set the B_ERROR bit if explicitly requested to via MD_MPS_FLAG_ERROR. 2757 * The originator of the resync read will cause this bit to be set if 2758 * the underlying component count is one for a submirror resync. All 2759 * other resync types will have the flag set as there is no underlying 2760 * resync which can be performed on a contained metadevice for these 2761 * resync types (optimized or component). 2762 */ 2763 2764 if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ) { 2765 if (ps->ps_flags & MD_MPS_FLAG_ERROR) 2766 pb->b_flags |= B_ERROR; 2767 md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 2768 MPS_FREE(mirror_parent_cache, ps); 2769 md_unit_readerexit(ui); 2770 md_biodone(pb); 2771 return; 2772 } 2773 /* 2774 * Set the MD_IO_COUNTED flag as we are retrying the same I/O 2775 * operation therefore this I/O request has already been counted, 2776 * the I/O count variable will be decremented by mirror_done()'s 2777 * call to md_biodone(). 2778 */ 2779 if (ps->ps_changecnt != un->un_changecnt) { 2780 new_str_flags = MD_STR_NOTTOP | MD_IO_COUNTED; 2781 if (ps->ps_flags & MD_MPS_WOW) 2782 new_str_flags |= MD_STR_WOW; 2783 if (ps->ps_flags & MD_MPS_MAPPED) 2784 new_str_flags |= MD_STR_MAPPED; 2785 /* 2786 * If this I/O request was a read that was part of a resync, 2787 * set MD_STR_WAR for the retried read to ensure that the 2788 * resync write (i.e. 
write-after-read) will be performed 2789 */ 2790 if (ps->ps_flags & MD_MPS_RESYNC_READ) 2791 new_str_flags |= MD_STR_WAR; 2792 md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 2793 MPS_FREE(mirror_parent_cache, ps); 2794 md_unit_readerexit(ui); 2795 (void) md_mirror_strategy(pb, new_str_flags, NULL); 2796 return; 2797 } 2798 2799 pb->b_flags |= B_ERROR; 2800 md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 2801 MPS_FREE(mirror_parent_cache, ps); 2802 md_unit_readerexit(ui); 2803 md_biodone(pb); 2804 } 2805 2806 static void 2807 error_update_unit(md_mps_t *ps) 2808 { 2809 mm_unit_t *un; 2810 mdi_unit_t *ui; 2811 int smi; /* sub mirror index */ 2812 int ci; /* errored component */ 2813 set_t setno; 2814 uint_t flags; /* for set_sm_comp_state() */ 2815 uint_t hspflags; /* for check_comp_4_hotspares() */ 2816 2817 ui = ps->ps_ui; 2818 un = (mm_unit_t *)md_unit_writerlock(ui); 2819 setno = MD_UN2SET(un); 2820 2821 /* All of these updates have to propagated in case of MN set */ 2822 flags = MD_STATE_XMIT; 2823 hspflags = MD_HOTSPARE_XMIT; 2824 2825 /* special treatment if we are called during updating watermarks */ 2826 if (ps->ps_flags & MD_MPS_WMUPDATE) { 2827 flags |= MD_STATE_WMUPDATE; 2828 hspflags |= MD_HOTSPARE_WMUPDATE; 2829 } 2830 smi = 0; 2831 ci = 0; 2832 while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) { 2833 if (mirror_other_sources(un, smi, ci, 0) == 1) { 2834 2835 /* Never called from ioctl context, so (IOLOCK *)NULL */ 2836 set_sm_comp_state(un, smi, ci, CS_LAST_ERRED, 0, flags, 2837 (IOLOCK *)NULL); 2838 /* 2839 * For a MN set, the NOTIFY is done when the state 2840 * change is processed on each node 2841 */ 2842 if (!MD_MNSET_SETNO(MD_UN2SET(un))) { 2843 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED, 2844 SVM_TAG_METADEVICE, setno, MD_SID(un)); 2845 } 2846 continue; 2847 } 2848 /* Never called from ioctl context, so (IOLOCK *)NULL */ 2849 set_sm_comp_state(un, smi, ci, CS_ERRED, 0, flags, 2850 (IOLOCK *)NULL); 2851 /* 2852 * For a MN set, the NOTIFY is done when the state 2853 * change is processed on each node 2854 */ 2855 if (!MD_MNSET_SETNO(MD_UN2SET(un))) { 2856 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, 2857 SVM_TAG_METADEVICE, setno, MD_SID(un)); 2858 } 2859 smi = 0; 2860 ci = 0; 2861 } 2862 2863 md_unit_writerexit(ui); 2864 if (MD_MNSET_SETNO(setno)) { 2865 send_poke_hotspares(setno); 2866 } else { 2867 (void) poke_hotspares(); 2868 } 2869 (void) md_unit_readerlock(ui); 2870 2871 finish_error(ps); 2872 } 2873 2874 /* 2875 * When we have a B_FAILFAST IO error on a Last Erred component we need to 2876 * retry the IO without B_FAILFAST set so that we try to ensure that the 2877 * component "sees" each IO. 
2878 */ 2879 static void 2880 last_err_retry(md_mcs_t *cs) 2881 { 2882 struct buf *cb; 2883 md_mps_t *ps; 2884 uint_t flags; 2885 2886 cb = &cs->cs_buf; 2887 cb->b_flags &= ~B_FAILFAST; 2888 2889 /* if we're panicing just let this I/O error out */ 2890 if (panicstr) { 2891 (void) mirror_done(cb); 2892 return; 2893 } 2894 2895 /* reissue the I/O */ 2896 2897 ps = cs->cs_ps; 2898 2899 bioerror(cb, 0); 2900 2901 mutex_enter(&ps->ps_mx); 2902 2903 flags = MD_STR_NOTTOP; 2904 if (ps->ps_flags & MD_MPS_MAPPED) 2905 flags |= MD_STR_MAPPED; 2906 if (ps->ps_flags & MD_MPS_NOBLOCK) 2907 flags |= MD_NOBLOCK; 2908 2909 mutex_exit(&ps->ps_mx); 2910 2911 clear_retry_error(cb); 2912 2913 cmn_err(CE_NOTE, "!md: %s: Last Erred, retry I/O without B_FAILFAST", 2914 md_shortname(getminor(cb->b_edev))); 2915 2916 md_call_strategy(cb, flags, NULL); 2917 } 2918 2919 static void 2920 mirror_error(md_mps_t *ps) 2921 { 2922 int smi; /* sub mirror index */ 2923 int ci; /* errored component */ 2924 2925 if (panicstr) { 2926 finish_error(ps); 2927 return; 2928 } 2929 2930 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 2931 mirror_overlap_tree_remove(ps); 2932 2933 smi = 0; 2934 ci = 0; 2935 if (mirror_geterror(ps->ps_un, &smi, &ci, 0, 0) != 0) { 2936 md_unit_readerexit(ps->ps_ui); 2937 daemon_request(&md_mstr_daemon, error_update_unit, 2938 (daemon_queue_t *)ps, REQ_OLD); 2939 return; 2940 } 2941 2942 finish_error(ps); 2943 } 2944 2945 static int 2946 copy_write_done(struct buf *cb) 2947 { 2948 md_mps_t *ps; 2949 buf_t *pb; 2950 char *wowbuf; 2951 wowhdr_t *wowhdr; 2952 ssize_t wow_resid; 2953 2954 /* get wowbuf ans save structure */ 2955 wowbuf = cb->b_un.b_addr; 2956 wowhdr = WOWBUF_HDR(wowbuf); 2957 ps = wowhdr->wow_ps; 2958 pb = ps->ps_bp; 2959 2960 /* Save error information, then free cb */ 2961 if (cb->b_flags & B_ERROR) 2962 pb->b_flags |= B_ERROR; 2963 2964 if (cb->b_flags & B_REMAPPED) 2965 bp_mapout(cb); 2966 2967 freerbuf(cb); 2968 2969 /* update residual and continue if needed */ 2970 if ((pb->b_flags & B_ERROR) == 0) { 2971 wow_resid = pb->b_bcount - wowhdr->wow_offset; 2972 pb->b_resid = wow_resid; 2973 if (wow_resid > 0) { 2974 daemon_request(&md_mstr_daemon, copy_write_cont, 2975 (daemon_queue_t *)wowhdr, REQ_OLD); 2976 return (1); 2977 } 2978 } 2979 2980 /* Write is complete, release resources. 
*/ 2981 kmem_cache_free(mirror_wowblk_cache, wowhdr); 2982 ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP)); 2983 md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 2984 MPS_FREE(mirror_parent_cache, ps); 2985 md_biodone(pb); 2986 return (0); 2987 } 2988 2989 static void 2990 copy_write_cont(wowhdr_t *wowhdr) 2991 { 2992 buf_t *pb; 2993 buf_t *cb; 2994 char *wowbuf; 2995 int wow_offset; 2996 size_t wow_resid; 2997 diskaddr_t wow_blkno; 2998 2999 wowbuf = WOWHDR_BUF(wowhdr); 3000 pb = wowhdr->wow_ps->ps_bp; 3001 3002 /* get data on current location */ 3003 wow_offset = wowhdr->wow_offset; 3004 wow_resid = pb->b_bcount - wow_offset; 3005 wow_blkno = pb->b_lblkno + lbtodb(wow_offset); 3006 3007 /* setup child buffer */ 3008 cb = getrbuf(KM_SLEEP); 3009 cb->b_flags = B_WRITE; 3010 cb->b_edev = pb->b_edev; 3011 cb->b_un.b_addr = wowbuf; /* change to point at WOWBUF */ 3012 cb->b_bufsize = md_wowbuf_size; /* change to wowbuf_size */ 3013 cb->b_iodone = copy_write_done; 3014 cb->b_bcount = MIN(md_wowbuf_size, wow_resid); 3015 cb->b_lblkno = wow_blkno; 3016 3017 /* move offset to next section */ 3018 wowhdr->wow_offset += cb->b_bcount; 3019 3020 /* copy and setup write for current section */ 3021 bcopy(&pb->b_un.b_addr[wow_offset], wowbuf, cb->b_bcount); 3022 3023 /* do it */ 3024 /* 3025 * Do not set the MD_IO_COUNTED flag as this is a new I/O request 3026 * that handles the WOW condition. The resultant increment on the 3027 * I/O count variable is cleared by copy_write_done()'s call to 3028 * md_biodone(). 3029 */ 3030 (void) md_mirror_strategy(cb, MD_STR_NOTTOP | MD_STR_WOW 3031 | MD_STR_MAPPED, NULL); 3032 } 3033 3034 static void 3035 md_mirror_copy_write(md_mps_t *ps) 3036 { 3037 wowhdr_t *wowhdr; 3038 3039 wowhdr = kmem_cache_alloc(mirror_wowblk_cache, MD_ALLOCFLAGS); 3040 mirror_wowblk_init(wowhdr); 3041 wowhdr->wow_ps = ps; 3042 wowhdr->wow_offset = 0; 3043 copy_write_cont(wowhdr); 3044 } 3045 3046 static void 3047 handle_wow(md_mps_t *ps) 3048 { 3049 buf_t *pb; 3050 3051 pb = ps->ps_bp; 3052 3053 bp_mapin(pb); 3054 3055 md_mirror_wow_cnt++; 3056 if (!(pb->b_flags & B_PHYS) && (md_mirror_wow_flg & WOW_LOGIT)) { 3057 cmn_err(CE_NOTE, 3058 "md: %s, blk %lld, cnt %ld: Write on write %d occurred", 3059 md_shortname(getminor(pb->b_edev)), 3060 (longlong_t)pb->b_lblkno, pb->b_bcount, md_mirror_wow_cnt); 3061 } 3062 3063 /* 3064 * Set the MD_IO_COUNTED flag as we are retrying the same I/O 3065 * operation therefore this I/O request has already been counted, 3066 * the I/O count variable will be decremented by mirror_done()'s 3067 * call to md_biodone(). 3068 */ 3069 if (md_mirror_wow_flg & WOW_NOCOPY) 3070 (void) md_mirror_strategy(pb, MD_STR_NOTTOP | MD_STR_WOW | 3071 MD_STR_MAPPED | MD_IO_COUNTED, ps); 3072 else 3073 md_mirror_copy_write(ps); 3074 } 3075 3076 /* 3077 * Return true if the specified submirror is either in the Last Erred 3078 * state or is transitioning into the Last Erred state. 
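 *
 * "Transitioning" is taken here to mean, per the checks below, that a
 * component still in the Okay or Resync state has MDM_S_IOERR pending
 * and mirror_other_sources() reports no alternative source, so the
 * pending error handling can only move the component to Last Erred.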
3079 */ 3080 static bool_t 3081 submirror_is_lasterred(mm_unit_t *un, int smi) 3082 { 3083 mm_submirror_t *sm; 3084 mm_submirror_ic_t *smic; 3085 md_m_shared_t *shared; 3086 int ci; 3087 int compcnt; 3088 3089 sm = &un->un_sm[smi]; 3090 smic = &un->un_smic[smi]; 3091 3092 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un); 3093 for (ci = 0; ci < compcnt; ci++) { 3094 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) 3095 (sm->sm_dev, sm, ci); 3096 3097 if (shared->ms_state == CS_LAST_ERRED) 3098 return (B_TRUE); 3099 3100 /* 3101 * It is not currently Last Erred, check if entering Last Erred. 3102 */ 3103 if ((shared->ms_flags & MDM_S_IOERR) && 3104 ((shared->ms_state == CS_OKAY) || 3105 (shared->ms_state == CS_RESYNC))) { 3106 if (mirror_other_sources(un, smi, ci, 0) == 1) 3107 return (B_TRUE); 3108 } 3109 } 3110 3111 return (B_FALSE); 3112 } 3113 3114 3115 static int 3116 mirror_done(struct buf *cb) 3117 { 3118 md_mps_t *ps; 3119 md_mcs_t *cs; 3120 3121 /*LINTED*/ 3122 cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off); 3123 ps = cs->cs_ps; 3124 3125 mutex_enter(&ps->ps_mx); 3126 3127 /* check if we need to retry an errored failfast I/O */ 3128 if (cb->b_flags & B_ERROR) { 3129 struct buf *pb = ps->ps_bp; 3130 3131 if (cb->b_flags & B_FAILFAST) { 3132 int i; 3133 mm_unit_t *un = ps->ps_un; 3134 3135 for (i = 0; i < NMIRROR; i++) { 3136 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) 3137 continue; 3138 3139 if (cb->b_edev == 3140 md_dev64_to_dev(un->un_sm[i].sm_dev)) { 3141 3142 /* 3143 * This is the submirror that had the 3144 * error. Check if it is Last Erred. 3145 */ 3146 if (submirror_is_lasterred(un, i)) { 3147 daemon_queue_t *dqp; 3148 3149 mutex_exit(&ps->ps_mx); 3150 dqp = (daemon_queue_t *)cs; 3151 dqp->dq_prev = NULL; 3152 dqp->dq_next = NULL; 3153 daemon_request(&md_done_daemon, 3154 last_err_retry, dqp, 3155 REQ_OLD); 3156 return (1); 3157 } 3158 break; 3159 } 3160 } 3161 } 3162 3163 /* continue to process the buf without doing a retry */ 3164 ps->ps_flags |= MD_MPS_ERROR; 3165 pb->b_error = cb->b_error; 3166 } 3167 3168 return (mirror_done_common(cb)); 3169 } 3170 3171 /* 3172 * Split from the original mirror_done function so we can handle bufs after a 3173 * retry. 3174 * ps->ps_mx is already held in the caller of this function and the cb error 3175 * has already been checked and handled in the caller. 3176 */ 3177 static int 3178 mirror_done_common(struct buf *cb) 3179 { 3180 struct buf *pb; 3181 mm_unit_t *un; 3182 mdi_unit_t *ui; 3183 md_mps_t *ps; 3184 md_mcs_t *cs; 3185 size_t end_rr, start_rr, current_rr; 3186 3187 /*LINTED*/ 3188 cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off); 3189 ps = cs->cs_ps; 3190 pb = ps->ps_bp; 3191 3192 if (cb->b_flags & B_REMAPPED) 3193 bp_mapout(cb); 3194 3195 ps->ps_frags--; 3196 if (ps->ps_frags != 0) { 3197 mutex_exit(&ps->ps_mx); 3198 kmem_cache_free(mirror_child_cache, cs); 3199 return (1); 3200 } 3201 un = ps->ps_un; 3202 ui = ps->ps_ui; 3203 3204 /* 3205 * Do not update outstanding_writes if we're running with ABR 3206 * set for this mirror or the write() was issued with MD_STR_ABR set. 3207 * Also a resync initiated write() has no outstanding_writes update 3208 * either. 
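 *
 * Putting the conditions below together, un_outstanding_writes[] is
 * only decremented for a plain mirrored write: B_READ clear, at least
 * two submirrors, no completion callback pending (ps_call == NULL),
 * no ABR in effect and not a resync write-after-read.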
3209 */ 3210 if (((cb->b_flags & B_READ) == 0) && 3211 (un->un_nsm >= 2) && 3212 (ps->ps_call == NULL) && 3213 !((ui->ui_tstate & MD_ABR_CAP) || (ps->ps_flags & MD_MPS_ABR)) && 3214 !(ps->ps_flags & MD_MPS_WRITE_AFTER_READ)) { 3215 BLK_TO_RR(end_rr, ps->ps_lastblk, un); 3216 BLK_TO_RR(start_rr, ps->ps_firstblk, un); 3217 mutex_enter(&un->un_resync_mx); 3218 for (current_rr = start_rr; current_rr <= end_rr; current_rr++) 3219 un->un_outstanding_writes[current_rr]--; 3220 mutex_exit(&un->un_resync_mx); 3221 } 3222 kmem_cache_free(mirror_child_cache, cs); 3223 mutex_exit(&ps->ps_mx); 3224 3225 if (ps->ps_call != NULL) { 3226 daemon_request(&md_done_daemon, ps->ps_call, 3227 (daemon_queue_t *)ps, REQ_OLD); 3228 return (1); 3229 } 3230 3231 if ((ps->ps_flags & MD_MPS_ERROR)) { 3232 daemon_request(&md_done_daemon, mirror_error, 3233 (daemon_queue_t *)ps, REQ_OLD); 3234 return (1); 3235 } 3236 3237 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 3238 mirror_overlap_tree_remove(ps); 3239 3240 /* 3241 * Handle Write-on-Write problem. 3242 * Skip In case of Raw and Direct I/O as they are 3243 * handled earlier. 3244 * 3245 */ 3246 if (!(md_mirror_wow_flg & WOW_DISABLE) && 3247 !(pb->b_flags & B_READ) && 3248 !(ps->ps_flags & MD_MPS_WOW) && 3249 !(pb->b_flags & B_PHYS) && 3250 any_pages_dirty(pb)) { 3251 md_unit_readerexit(ps->ps_ui); 3252 daemon_request(&md_mstr_daemon, handle_wow, 3253 (daemon_queue_t *)ps, REQ_OLD); 3254 return (1); 3255 } 3256 3257 md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 3258 MPS_FREE(mirror_parent_cache, ps); 3259 md_unit_readerexit(ui); 3260 md_biodone(pb); 3261 return (0); 3262 } 3263 3264 /* 3265 * Clear error state in submirror component if the retry worked after 3266 * a failfast error. 3267 */ 3268 static void 3269 clear_retry_error(struct buf *cb) 3270 { 3271 int smi; 3272 md_mcs_t *cs; 3273 mm_unit_t *un; 3274 mdi_unit_t *ui_sm; 3275 mm_submirror_t *sm; 3276 mm_submirror_ic_t *smic; 3277 u_longlong_t cnt; 3278 md_m_shared_t *shared; 3279 3280 /*LINTED*/ 3281 cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off); 3282 un = cs->cs_ps->ps_un; 3283 3284 for (smi = 0; smi < NMIRROR; smi++) { 3285 if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) 3286 continue; 3287 3288 if (cb->b_edev == md_dev64_to_dev(un->un_sm[smi].sm_dev)) 3289 break; 3290 } 3291 3292 if (smi >= NMIRROR) 3293 return; 3294 3295 sm = &un->un_sm[smi]; 3296 smic = &un->un_smic[smi]; 3297 cnt = cb->b_bcount; 3298 3299 ui_sm = MDI_UNIT(getminor(cb->b_edev)); 3300 (void) md_unit_writerlock(ui_sm); 3301 3302 shared = (md_m_shared_t *)(*(smic->sm_shared_by_blk))(sm->sm_dev, sm, 3303 cb->b_blkno, &cnt); 3304 3305 if (shared->ms_flags & MDM_S_IOERR) { 3306 shared->ms_flags &= ~MDM_S_IOERR; 3307 3308 } else { 3309 /* the buf spans components and the first one is not erred */ 3310 int cnt; 3311 int i; 3312 3313 cnt = (*(smic->sm_get_component_count))(sm->sm_dev, un); 3314 for (i = 0; i < cnt; i++) { 3315 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) 3316 (sm->sm_dev, sm, i); 3317 3318 if (shared->ms_flags & MDM_S_IOERR && 3319 shared->ms_state == CS_OKAY) { 3320 3321 shared->ms_flags &= ~MDM_S_IOERR; 3322 break; 3323 } 3324 } 3325 } 3326 3327 md_unit_writerexit(ui_sm); 3328 } 3329 3330 static size_t 3331 mirror_map_read( 3332 md_mps_t *ps, 3333 md_mcs_t *cs, 3334 diskaddr_t blkno, 3335 u_longlong_t count 3336 ) 3337 { 3338 mm_unit_t *un; 3339 buf_t *bp; 3340 u_longlong_t cando; 3341 3342 bp = &cs->cs_buf; 3343 un = ps->ps_un; 3344 3345 bp->b_lblkno = blkno; 3346 if (fast_select_read_unit(ps, cs) == 0) 
{ 3347 bp->b_bcount = ldbtob(count); 3348 return (0); 3349 } 3350 bp->b_edev = md_dev64_to_dev(select_read_unit(un, blkno, 3351 count, &cando, 0, NULL, cs)); 3352 bp->b_bcount = ldbtob(cando); 3353 if (count != cando) 3354 return (cando); 3355 return (0); 3356 } 3357 3358 static void 3359 write_after_read(md_mps_t *ps) 3360 { 3361 struct buf *pb; 3362 int flags; 3363 3364 if (ps->ps_flags & MD_MPS_ERROR) { 3365 mirror_error(ps); 3366 return; 3367 } 3368 3369 pb = ps->ps_bp; 3370 md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 3371 ps->ps_call = NULL; 3372 ps->ps_flags |= MD_MPS_WRITE_AFTER_READ; 3373 flags = MD_STR_NOTTOP | MD_STR_WAR; 3374 if (ps->ps_flags & MD_MPS_MAPPED) 3375 flags |= MD_STR_MAPPED; 3376 if (ps->ps_flags & MD_MPS_NOBLOCK) 3377 flags |= MD_NOBLOCK; 3378 if (ps->ps_flags & MD_MPS_DIRTY_RD) 3379 flags |= MD_STR_DIRTY_RD; 3380 (void) mirror_write_strategy(pb, flags, ps); 3381 } 3382 3383 static void 3384 continue_serial(md_mps_t *ps) 3385 { 3386 md_mcs_t *cs; 3387 buf_t *cb; 3388 mm_unit_t *un; 3389 int flags; 3390 3391 un = ps->ps_un; 3392 cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS); 3393 mirror_child_init(cs); 3394 cb = &cs->cs_buf; 3395 ps->ps_call = NULL; 3396 ps->ps_frags = 1; 3397 (void) mirror_map_write(un, cs, ps, 0); 3398 flags = MD_STR_NOTTOP; 3399 if (ps->ps_flags & MD_MPS_MAPPED) 3400 flags |= MD_STR_MAPPED; 3401 md_call_strategy(cb, flags, NULL); 3402 } 3403 3404 static int 3405 mirror_map_write(mm_unit_t *un, md_mcs_t *cs, md_mps_t *ps, int war) 3406 { 3407 int i; 3408 dev_t dev; /* needed for bioclone, so not md_dev64_t */ 3409 buf_t *cb; 3410 buf_t *pb; 3411 diskaddr_t blkno; 3412 size_t bcount; 3413 off_t offset; 3414 3415 pb = ps->ps_bp; 3416 cb = &cs->cs_buf; 3417 cs->cs_ps = ps; 3418 3419 i = md_find_nth_unit(ps->ps_writable_sm, ps->ps_current_sm); 3420 3421 dev = md_dev64_to_dev(un->un_sm[i].sm_dev); 3422 3423 blkno = pb->b_lblkno; 3424 bcount = pb->b_bcount; 3425 offset = 0; 3426 if (war && (blkno == 0) && (un->c.un_flag & MD_LABELED)) { 3427 blkno = DK_LABEL_LOC + 1; 3428 /* 3429 * This handles the case where we're requesting 3430 * a write to block 0 on a label partition 3431 * and the request size was smaller than the 3432 * size of the label. If this is the case 3433 * then we'll return -1. Failure to do so will 3434 * either cause the calling thread to hang due to 3435 * an ssd bug, or worse if the bcount were allowed 3436 * to go negative (ie large). 3437 */ 3438 if (bcount <= DEV_BSIZE*(DK_LABEL_LOC + 1)) 3439 return (-1); 3440 bcount -= (DEV_BSIZE*(DK_LABEL_LOC + 1)); 3441 offset = (DEV_BSIZE*(DK_LABEL_LOC + 1)); 3442 } 3443 3444 cb = md_bioclone(pb, offset, bcount, dev, blkno, mirror_done, 3445 cb, KM_NOSLEEP); 3446 if (war) 3447 cb->b_flags = (cb->b_flags & ~B_READ) | B_WRITE; 3448 3449 /* 3450 * If the submirror is in the erred stated, check if any component is 3451 * in the Last Erred state. If so, we don't want to use the B_FAILFAST 3452 * flag on the IO. 3453 * 3454 * Provide a fast path for the non-erred case (which should be the 3455 * normal case). 
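 *
 * Net effect of the test below: B_FAILFAST is added to the child buf
 * only when the submirror has MD_SM_FAILFAST set and, if the submirror
 * is in the component-erred state, none of its components is
 * Last Erred.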
3456 */ 3457 if (un->un_sm[i].sm_flags & MD_SM_FAILFAST) { 3458 if (un->un_sm[i].sm_state & SMS_COMP_ERRED) { 3459 mm_submirror_t *sm; 3460 mm_submirror_ic_t *smic; 3461 int ci; 3462 int compcnt; 3463 3464 sm = &un->un_sm[i]; 3465 smic = &un->un_smic[i]; 3466 3467 compcnt = (*(smic->sm_get_component_count)) 3468 (sm->sm_dev, un); 3469 for (ci = 0; ci < compcnt; ci++) { 3470 md_m_shared_t *shared; 3471 3472 shared = (md_m_shared_t *) 3473 (*(smic->sm_shared_by_indx))(sm->sm_dev, 3474 sm, ci); 3475 3476 if (shared->ms_state == CS_LAST_ERRED) 3477 break; 3478 } 3479 if (ci >= compcnt) 3480 cb->b_flags |= B_FAILFAST; 3481 3482 } else { 3483 cb->b_flags |= B_FAILFAST; 3484 } 3485 } 3486 3487 ps->ps_current_sm++; 3488 if (ps->ps_current_sm != ps->ps_active_cnt) { 3489 if (un->un_write_option == WR_SERIAL) { 3490 ps->ps_call = continue_serial; 3491 return (0); 3492 } 3493 return (1); 3494 } 3495 return (0); 3496 } 3497 3498 /* 3499 * directed_read_done: 3500 * ------------------ 3501 * Completion routine called when a DMR request has been returned from the 3502 * underlying driver. Wake-up the original ioctl() and return the data to 3503 * the user. 3504 */ 3505 static void 3506 directed_read_done(md_mps_t *ps) 3507 { 3508 mm_unit_t *un; 3509 mdi_unit_t *ui; 3510 3511 un = ps->ps_un; 3512 ui = ps->ps_ui; 3513 3514 md_unit_readerexit(ui); 3515 md_kstat_done(ui, ps->ps_bp, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 3516 ps->ps_call = NULL; 3517 3518 mutex_enter(&un->un_dmr_mx); 3519 cv_signal(&un->un_dmr_cv); 3520 mutex_exit(&un->un_dmr_mx); 3521 3522 /* release the parent structure */ 3523 kmem_cache_free(mirror_parent_cache, ps); 3524 } 3525 3526 /* 3527 * daemon_io: 3528 * ------------ 3529 * Called to issue a mirror_write_strategy() or mirror_read_strategy 3530 * call from a blockable context. NOTE: no mutex can be held on entry to this 3531 * routine 3532 */ 3533 static void 3534 daemon_io(daemon_queue_t *dq) 3535 { 3536 md_mps_t *ps = (md_mps_t *)dq; 3537 int flag = MD_STR_NOTTOP; 3538 buf_t *pb = ps->ps_bp; 3539 3540 if (ps->ps_flags & MD_MPS_MAPPED) 3541 flag |= MD_STR_MAPPED; 3542 if (ps->ps_flags & MD_MPS_WOW) 3543 flag |= MD_STR_WOW; 3544 if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ) 3545 flag |= MD_STR_WAR; 3546 if (ps->ps_flags & MD_MPS_ABR) 3547 flag |= MD_STR_ABR; 3548 if (ps->ps_flags & MD_MPS_BLOCKABLE_IO) 3549 flag |= MD_STR_BLOCK_OK; 3550 3551 /* 3552 * If this is a resync read, ie MD_STR_DIRTY_RD not set, set 3553 * MD_STR_WAR before calling mirror_read_strategy 3554 */ 3555 if (pb->b_flags & B_READ) { 3556 if (!(ps->ps_flags & MD_MPS_DIRTY_RD)) 3557 flag |= MD_STR_WAR; 3558 mirror_read_strategy(pb, flag, ps); 3559 } else 3560 mirror_write_strategy(pb, flag, ps); 3561 } 3562 3563 /* 3564 * update_resync: 3565 * ------------- 3566 * Called to update the in-core version of the resync record with the latest 3567 * version that was committed to disk when the previous mirror owner 3568 * relinquished ownership. This call is likely to block as we must hold-off 3569 * any current resync processing that may be occurring. 3570 * On completion of the resync record update we issue the mirror_write_strategy 3571 * call to complete the i/o that first started this sequence. To remove a race 3572 * condition between a new write() request which is submitted and the resync 3573 * record update we acquire the writerlock. This will hold off all i/o to the 3574 * mirror until the resync update has completed. 
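 *
 * In outline the update is:
 *     take un_rrp_inflight_mx, then the unit writerlock
 *     re-read the on-disk resync record (mddb_reread_rr) and, if
 *         that succeeds, bring un_dirty_bm up to date via
 *         mirror_copy_rr()
 *     drop the writerlock and un_rrp_inflight_mx
 *     wake the resync thread if it was blocked on MD_RI_BLOCK_OWNER
 *     re-issue the original i/o through daemon_io()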
3575 * NOTE: no mutex can be held on entry to this routine 3576 */ 3577 static void 3578 update_resync(daemon_queue_t *dq) 3579 { 3580 md_mps_t *ps = (md_mps_t *)dq; 3581 buf_t *pb = ps->ps_bp; 3582 mdi_unit_t *ui = ps->ps_ui; 3583 mm_unit_t *un = MD_UNIT(ui->ui_link.ln_id); 3584 set_t setno; 3585 int restart_resync; 3586 3587 mutex_enter(&un->un_rrp_inflight_mx); 3588 (void) md_unit_writerlock(ui); 3589 ps->ps_un = un; 3590 setno = MD_MIN2SET(getminor(pb->b_edev)); 3591 if (mddb_reread_rr(setno, un->un_rr_dirty_recid) == 0) { 3592 /* 3593 * Synchronize our in-core view of what regions need to be 3594 * resync'd with the on-disk version. 3595 */ 3596 mirror_copy_rr(howmany(un->un_rrd_num, NBBY), un->un_resync_bm, 3597 un->un_dirty_bm); 3598 3599 /* Region dirty map is now up to date */ 3600 } 3601 restart_resync = (un->un_rs_thread_flags & MD_RI_BLOCK_OWNER) ? 1 : 0; 3602 md_unit_writerexit(ui); 3603 mutex_exit(&un->un_rrp_inflight_mx); 3604 3605 /* Restart the resync thread if it was previously blocked */ 3606 if (restart_resync) { 3607 mutex_enter(&un->un_rs_thread_mx); 3608 un->un_rs_thread_flags &= ~MD_RI_BLOCK_OWNER; 3609 cv_signal(&un->un_rs_thread_cv); 3610 mutex_exit(&un->un_rs_thread_mx); 3611 } 3612 /* Continue with original deferred i/o */ 3613 daemon_io(dq); 3614 } 3615 3616 /* 3617 * owner_timeout: 3618 * ------------- 3619 * Called if the original mdmn_ksend_message() failed and the request is to be 3620 * retried. Reattempt the original ownership change. 3621 * 3622 * NOTE: called at interrupt context (see timeout(9f)). 3623 */ 3624 static void 3625 owner_timeout(void *arg) 3626 { 3627 daemon_queue_t *dq = (daemon_queue_t *)arg; 3628 3629 daemon_request(&md_mirror_daemon, become_owner, dq, REQ_OLD); 3630 } 3631 3632 /* 3633 * become_owner: 3634 * ------------ 3635 * Called to issue RPC request to become the owner of the mirror 3636 * associated with this i/o request. We assume that the ownership request 3637 * is synchronous, so if it succeeds we will issue the request via 3638 * mirror_write_strategy(). 3639 * If multiple i/o's are outstanding we will be called from the mirror_daemon 3640 * service thread. 3641 * NOTE: no mutex should be held on entry to this routine. 3642 */ 3643 static void 3644 become_owner(daemon_queue_t *dq) 3645 { 3646 md_mps_t *ps = (md_mps_t *)dq; 3647 mm_unit_t *un = ps->ps_un; 3648 buf_t *pb = ps->ps_bp; 3649 set_t setno; 3650 md_mn_kresult_t *kres; 3651 int msg_flags = md_mirror_msg_flags; 3652 md_mps_t *ps1; 3653 3654 ASSERT(dq->dq_next == NULL && dq->dq_prev == NULL); 3655 3656 /* 3657 * If we're already the mirror owner we do not need to send a message 3658 * but can simply process the i/o request immediately. 3659 * If we've already sent the request to become owner we requeue the 3660 * request as we're waiting for the synchronous ownership message to 3661 * be processed. 3662 */ 3663 if (MD_MN_MIRROR_OWNER(un)) { 3664 /* 3665 * As the strategy() call will potentially block we need to 3666 * punt this to a separate thread and complete this request 3667 * as quickly as possible. Note: if we're a read request 3668 * this must be a resync, we cannot afford to be queued 3669 * behind any intervening i/o requests. In this case we put the 3670 * request on the md_mirror_rs_daemon queue. 
3671 */ 3672 if (pb->b_flags & B_READ) { 3673 daemon_request(&md_mirror_rs_daemon, daemon_io, dq, 3674 REQ_OLD); 3675 } else { 3676 daemon_request(&md_mirror_io_daemon, daemon_io, dq, 3677 REQ_OLD); 3678 } 3679 } else { 3680 mutex_enter(&un->un_owner_mx); 3681 if ((un->un_owner_state & MM_MN_OWNER_SENT) == 0) { 3682 md_mn_req_owner_t *msg; 3683 int rval = 0; 3684 3685 /* 3686 * Check to see that we haven't exceeded the maximum 3687 * retry count. If we have we fail the i/o as the 3688 * comms mechanism has become wedged beyond recovery. 3689 */ 3690 if (dq->qlen++ >= MD_OWNER_RETRIES) { 3691 mutex_exit(&un->un_owner_mx); 3692 cmn_err(CE_WARN, 3693 "md_mirror: Request exhausted ownership " 3694 "retry limit of %d attempts", dq->qlen); 3695 pb->b_error = EIO; 3696 pb->b_flags |= B_ERROR; 3697 pb->b_resid = pb->b_bcount; 3698 kmem_cache_free(mirror_parent_cache, ps); 3699 md_biodone(pb); 3700 return; 3701 } 3702 3703 /* 3704 * Issue request to change ownership. The call is 3705 * synchronous so when it returns we can complete the 3706 * i/o (if successful), or enqueue it again so that 3707 * the operation will be retried. 3708 */ 3709 un->un_owner_state |= MM_MN_OWNER_SENT; 3710 mutex_exit(&un->un_owner_mx); 3711 3712 msg = kmem_zalloc(sizeof (md_mn_req_owner_t), KM_SLEEP); 3713 setno = MD_MIN2SET(getminor(pb->b_edev)); 3714 msg->mnum = MD_SID(un); 3715 msg->owner = md_mn_mynode_id; 3716 msg_flags |= MD_MSGF_NO_LOG; 3717 /* 3718 * If this IO is triggered by updating a watermark, 3719 * it might be issued by the creation of a softpartition 3720 * while the commd subsystem is suspended. 3721 * We don't want this message to block. 3722 */ 3723 if (ps->ps_flags & MD_MPS_WMUPDATE) { 3724 msg_flags |= MD_MSGF_OVERRIDE_SUSPEND; 3725 } 3726 3727 kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); 3728 rval = mdmn_ksend_message(setno, 3729 MD_MN_MSG_REQUIRE_OWNER, msg_flags, 0, 3730 (char *)msg, sizeof (md_mn_req_owner_t), kres); 3731 3732 kmem_free(msg, sizeof (md_mn_req_owner_t)); 3733 3734 if (MDMN_KSEND_MSG_OK(rval, kres)) { 3735 dq->qlen = 0; 3736 /* 3737 * Successfully changed owner, reread the 3738 * resync record so that we have a valid idea of 3739 * any previously committed incomplete write()s. 3740 * NOTE: As we need to acquire the resync mutex 3741 * this may block, so we defer it to a separate 3742 * thread handler. This makes us (effectively) 3743 * non-blocking once the ownership message 3744 * handling has completed. 3745 */ 3746 mutex_enter(&un->un_owner_mx); 3747 if (un->un_owner_state & MM_MN_BECOME_OWNER) { 3748 un->un_mirror_owner = md_mn_mynode_id; 3749 /* Sets owner of un_rr_dirty record */ 3750 if (un->un_rr_dirty_recid) 3751 (void) mddb_setowner( 3752 un->un_rr_dirty_recid, 3753 md_mn_mynode_id); 3754 un->un_owner_state &= 3755 ~MM_MN_BECOME_OWNER; 3756 /* 3757 * Release the block on the current 3758 * resync region if it is blocked 3759 */ 3760 ps1 = un->un_rs_prev_overlap; 3761 if ((ps1 != NULL) && 3762 (ps1->ps_flags & MD_MPS_ON_OVERLAP)) 3763 mirror_overlap_tree_remove(ps1); 3764 mutex_exit(&un->un_owner_mx); 3765 3766 /* 3767 * If we're a read, this must be a 3768 * resync request, issue 3769 * the i/o request on the 3770 * md_mirror_rs_daemon queue. This is 3771 * to avoid a deadlock between the 3772 * resync_unit thread and 3773 * subsequent i/o requests that may 3774 * block on the resync region. 
3775 */ 3776 if (pb->b_flags & B_READ) { 3777 daemon_request( 3778 &md_mirror_rs_daemon, 3779 update_resync, dq, REQ_OLD); 3780 } else { 3781 daemon_request( 3782 &md_mirror_io_daemon, 3783 update_resync, dq, REQ_OLD); 3784 } 3785 kmem_free(kres, 3786 sizeof (md_mn_kresult_t)); 3787 return; 3788 } else { 3789 /* 3790 * Some other node has beaten us to 3791 * obtain ownership. We need to 3792 * reschedule our ownership request 3793 */ 3794 mutex_exit(&un->un_owner_mx); 3795 } 3796 } else { 3797 mdmn_ksend_show_error(rval, kres, 3798 "MD_MN_MSG_REQUIRE_OWNER"); 3799 /* 3800 * Message transport failure is handled by the 3801 * comms layer. If the ownership change request 3802 * does not succeed we need to flag the error to 3803 * the initiator of the i/o. This is handled by 3804 * the retry logic above. As the request failed 3805 * we do not know _who_ the owner of the mirror 3806 * currently is. We reset our idea of the owner 3807 * to None so that any further write()s will 3808 * attempt to become the owner again. This stops 3809 * multiple nodes writing to the same mirror 3810 * simultaneously. 3811 */ 3812 mutex_enter(&un->un_owner_mx); 3813 un->un_owner_state &= 3814 ~(MM_MN_OWNER_SENT|MM_MN_BECOME_OWNER); 3815 un->un_mirror_owner = MD_MN_MIRROR_UNOWNED; 3816 mutex_exit(&un->un_owner_mx); 3817 } 3818 kmem_free(kres, sizeof (md_mn_kresult_t)); 3819 } else 3820 mutex_exit(&un->un_owner_mx); 3821 3822 /* 3823 * Re-enqueue this request on the deferred i/o list. Delay the 3824 * request for md_mirror_owner_to usecs to stop thrashing. 3825 */ 3826 (void) timeout(owner_timeout, dq, 3827 drv_usectohz(md_mirror_owner_to)); 3828 } 3829 } 3830 3831 static void 3832 mirror_write_strategy(buf_t *pb, int flag, void *private) 3833 { 3834 md_mps_t *ps; 3835 md_mcs_t *cs; 3836 int more; 3837 mm_unit_t *un; 3838 mdi_unit_t *ui; 3839 buf_t *cb; /* child buf pointer */ 3840 set_t setno; 3841 int rs_on_overlap = 0; 3842 3843 ui = MDI_UNIT(getminor(pb->b_edev)); 3844 un = (mm_unit_t *)MD_UNIT(getminor(pb->b_edev)); 3845 3846 3847 md_kstat_waitq_enter(ui); 3848 3849 /* 3850 * If a state change is in progress for this mirror in a MN set, 3851 * suspend all non-resync writes until the state change is complete. 3852 * The objective of this suspend is to ensure that it is not 3853 * possible for one node to read data from a submirror that another node 3854 * has not written to because of the state change. Therefore we 3855 * suspend all writes until the state change has been made. As it is 3856 * not possible to read from the target of a resync, there is no need 3857 * to suspend resync writes. 3858 * Note that we only block here if the caller can handle a busy-wait. 3859 * The MD_STR_BLOCK_OK flag is set for daemon_io originated i/o only. 
3860 */ 3861 3862 if (!(flag & MD_STR_WAR)) { 3863 if (flag & MD_STR_BLOCK_OK) { 3864 mutex_enter(&un->un_suspend_wr_mx); 3865 while (un->un_suspend_wr_flag) { 3866 cv_wait(&un->un_suspend_wr_cv, 3867 &un->un_suspend_wr_mx); 3868 } 3869 mutex_exit(&un->un_suspend_wr_mx); 3870 } 3871 (void) md_unit_readerlock(ui); 3872 } 3873 3874 if (!(flag & MD_STR_NOTTOP)) { 3875 if (md_checkbuf(ui, (md_unit_t *)un, pb)) { 3876 md_kstat_waitq_exit(ui); 3877 return; 3878 } 3879 } 3880 3881 setno = MD_MIN2SET(getminor(pb->b_edev)); 3882 3883 /* If an ABR write has been requested, set MD_STR_ABR flag */ 3884 if (MD_MNSET_SETNO(setno) && (pb->b_flags & B_ABRWRITE)) 3885 flag |= MD_STR_ABR; 3886 3887 if (private == NULL) { 3888 ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS); 3889 mirror_parent_init(ps); 3890 } else { 3891 ps = private; 3892 private = NULL; 3893 } 3894 if (flag & MD_STR_MAPPED) 3895 ps->ps_flags |= MD_MPS_MAPPED; 3896 3897 if (flag & MD_STR_WOW) 3898 ps->ps_flags |= MD_MPS_WOW; 3899 3900 if (flag & MD_STR_ABR) 3901 ps->ps_flags |= MD_MPS_ABR; 3902 3903 if (flag & MD_STR_WMUPDATE) 3904 ps->ps_flags |= MD_MPS_WMUPDATE; 3905 3906 /* 3907 * Save essential information from the original buffhdr 3908 * in the md_save structure. 3909 */ 3910 ps->ps_un = un; 3911 ps->ps_ui = ui; 3912 ps->ps_bp = pb; 3913 ps->ps_addr = pb->b_un.b_addr; 3914 ps->ps_firstblk = pb->b_lblkno; 3915 ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1; 3916 ps->ps_changecnt = un->un_changecnt; 3917 3918 /* 3919 * Check for suspended writes here. This is where we can defer the 3920 * write request to the daemon_io queue which will then call us with 3921 * the MD_STR_BLOCK_OK flag set and we'll busy-wait (if necessary) at 3922 * the top of this routine. 3923 */ 3924 if (!(flag & MD_STR_WAR) && !(flag & MD_STR_BLOCK_OK)) { 3925 mutex_enter(&un->un_suspend_wr_mx); 3926 if (un->un_suspend_wr_flag) { 3927 ps->ps_flags |= MD_MPS_BLOCKABLE_IO; 3928 mutex_exit(&un->un_suspend_wr_mx); 3929 md_unit_readerexit(ui); 3930 daemon_request(&md_mirror_daemon, daemon_io, 3931 (daemon_queue_t *)ps, REQ_OLD); 3932 return; 3933 } 3934 mutex_exit(&un->un_suspend_wr_mx); 3935 } 3936 3937 /* 3938 * If not MN owner and this is an ABR write, make sure the current 3939 * resync region is in the overlaps tree 3940 */ 3941 mutex_enter(&un->un_owner_mx); 3942 if (MD_MNSET_SETNO(setno) && (!(MD_MN_MIRROR_OWNER(un))) && 3943 ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) { 3944 md_mps_t *ps1; 3945 /* Block the current resync region, if not already blocked */ 3946 ps1 = un->un_rs_prev_overlap; 3947 3948 if ((ps1 != NULL) && ((ps1->ps_firstblk != 0) || 3949 (ps1->ps_lastblk != 0))) { 3950 /* Drop locks to avoid deadlock */ 3951 mutex_exit(&un->un_owner_mx); 3952 md_unit_readerexit(ui); 3953 wait_for_overlaps(ps1, MD_OVERLAP_ALLOW_REPEAT); 3954 rs_on_overlap = 1; 3955 (void) md_unit_readerlock(ui); 3956 mutex_enter(&un->un_owner_mx); 3957 /* 3958 * Check to see if we have obtained ownership 3959 * while waiting for overlaps. 
If we have, remove 3960 * the resync_region entry from the overlap tree 3961 */ 3962 if (MD_MN_MIRROR_OWNER(un) && 3963 (ps1->ps_flags & MD_MPS_ON_OVERLAP)) { 3964 mirror_overlap_tree_remove(ps1); 3965 rs_on_overlap = 0; 3966 } 3967 } 3968 } 3969 mutex_exit(&un->un_owner_mx); 3970 3971 3972 /* 3973 * following keep write after read from writing to the 3974 * source in the case where it all came from one place 3975 */ 3976 if (flag & MD_STR_WAR) { 3977 int abort_write = 0; 3978 /* 3979 * We are perfoming a write-after-read. This is either as a 3980 * result of a resync read or as a result of a read in a 3981 * dirty resync region when the optimized resync is not 3982 * complete. If in a MN set and a resync generated i/o, 3983 * if the current block is not in the current 3984 * resync region terminate the write as another node must have 3985 * completed this resync region 3986 */ 3987 if ((MD_MNSET_SETNO(MD_UN2SET(un))) && 3988 (!flag & MD_STR_DIRTY_RD)) { 3989 if (!IN_RESYNC_REGION(un, ps)) 3990 abort_write = 1; 3991 } 3992 if ((select_write_after_read_units(un, ps) == 0) || 3993 (abort_write)) { 3994 #ifdef DEBUG 3995 if (mirror_debug_flag) 3996 printf("Abort resync write on %x, block %lld\n", 3997 MD_SID(un), ps->ps_firstblk); 3998 #endif 3999 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 4000 mirror_overlap_tree_remove(ps); 4001 kmem_cache_free(mirror_parent_cache, ps); 4002 md_kstat_waitq_exit(ui); 4003 md_unit_readerexit(ui); 4004 md_biodone(pb); 4005 return; 4006 } 4007 } else { 4008 select_write_units(un, ps); 4009 4010 /* Drop readerlock to avoid deadlock */ 4011 md_unit_readerexit(ui); 4012 wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT); 4013 un = md_unit_readerlock(ui); 4014 /* 4015 * For a MN set with an ABR write, if we are now the 4016 * owner and we have a resync region in the overlap 4017 * tree, remove the entry from overlaps and retry the write. 4018 */ 4019 4020 if (MD_MNSET_SETNO(setno) && 4021 ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) { 4022 mutex_enter(&un->un_owner_mx); 4023 if (((MD_MN_MIRROR_OWNER(un))) && rs_on_overlap) { 4024 mirror_overlap_tree_remove(ps); 4025 md_kstat_waitq_exit(ui); 4026 mutex_exit(&un->un_owner_mx); 4027 md_unit_readerexit(ui); 4028 daemon_request(&md_mirror_daemon, daemon_io, 4029 (daemon_queue_t *)ps, REQ_OLD); 4030 return; 4031 } 4032 mutex_exit(&un->un_owner_mx); 4033 } 4034 } 4035 4036 /* 4037 * For Multinode mirrors with no owner and a Resync Region (not ABR) 4038 * we need to become the mirror owner before continuing with the 4039 * write(). For ABR mirrors we check that we 'own' the resync if 4040 * we're in write-after-read mode. We do this _after_ ensuring that 4041 * there are no overlaps to ensure that once we know that we are 4042 * the owner, the readerlock will not be released until the write is 4043 * complete. As a change of ownership in a MN set requires the 4044 * writerlock, this ensures that ownership cannot be changed until 4045 * the write is complete. 
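 *
 * When this node is not the owner, the request is handed to
 * become_owner() via the md_mirror_daemon queue just below and is
 * re-issued through daemon_io()/update_resync() once the ownership
 * exchange has completed.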
4046 */ 4047 if (MD_MNSET_SETNO(setno) && (!((ui->ui_tstate & MD_ABR_CAP) || 4048 (flag & MD_STR_ABR)) || (flag & MD_STR_WAR))) { 4049 if (MD_MN_NO_MIRROR_OWNER(un)) { 4050 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 4051 mirror_overlap_tree_remove(ps); 4052 md_kstat_waitq_exit(ui); 4053 ASSERT(!(flag & MD_STR_WAR)); 4054 md_unit_readerexit(ui); 4055 daemon_request(&md_mirror_daemon, become_owner, 4056 (daemon_queue_t *)ps, REQ_OLD); 4057 return; 4058 } 4059 } 4060 4061 /* 4062 * Mark resync region if mirror has a Resync Region _and_ we are not 4063 * a resync initiated write(). Don't mark region if we're flagged as 4064 * an ABR write. 4065 */ 4066 if (!((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR)) && 4067 !(flag & MD_STR_WAR)) { 4068 if (mirror_mark_resync_region(un, ps->ps_firstblk, 4069 ps->ps_lastblk, md_mn_mynode_id)) { 4070 pb->b_flags |= B_ERROR; 4071 pb->b_resid = pb->b_bcount; 4072 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 4073 mirror_overlap_tree_remove(ps); 4074 kmem_cache_free(mirror_parent_cache, ps); 4075 md_kstat_waitq_exit(ui); 4076 md_unit_readerexit(ui); 4077 md_biodone(pb); 4078 return; 4079 } 4080 } 4081 4082 ps->ps_childbflags = pb->b_flags | B_WRITE; 4083 ps->ps_childbflags &= ~B_READ; 4084 if (flag & MD_STR_MAPPED) 4085 ps->ps_childbflags &= ~B_PAGEIO; 4086 4087 if (!(flag & MD_STR_NOTTOP) && panicstr) 4088 /* Disable WOW and don't free ps */ 4089 ps->ps_flags |= (MD_MPS_WOW|MD_MPS_DONTFREE); 4090 4091 md_kstat_waitq_to_runq(ui); 4092 4093 /* 4094 * Treat Raw and Direct I/O as Write-on-Write always 4095 */ 4096 4097 if (!(md_mirror_wow_flg & WOW_DISABLE) && 4098 (md_mirror_wow_flg & WOW_PHYS_ENABLE) && 4099 (pb->b_flags & B_PHYS) && 4100 !(ps->ps_flags & MD_MPS_WOW)) { 4101 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 4102 mirror_overlap_tree_remove(ps); 4103 md_unit_readerexit(ui); 4104 daemon_request(&md_mstr_daemon, handle_wow, 4105 (daemon_queue_t *)ps, REQ_OLD); 4106 return; 4107 } 4108 4109 ps->ps_frags = 1; 4110 do { 4111 cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS); 4112 mirror_child_init(cs); 4113 cb = &cs->cs_buf; 4114 more = mirror_map_write(un, cs, ps, (flag & MD_STR_WAR)); 4115 4116 /* 4117 * This handles the case where we're requesting 4118 * a write to block 0 on a label partition. (more < 0) 4119 * means that the request size was smaller than the 4120 * size of the label. If so this request is done. 
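 *
 * The clean-up below matches the abort path used elsewhere in this
 * routine: remove any overlap-tree entry, leave the kstat run queue,
 * free the child and parent save structures, drop the unit
 * readerlock and biodone() the original buffer.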
4121 */ 4122 if (more < 0) { 4123 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 4124 mirror_overlap_tree_remove(ps); 4125 md_kstat_runq_exit(ui); 4126 kmem_cache_free(mirror_child_cache, cs); 4127 kmem_cache_free(mirror_parent_cache, ps); 4128 md_unit_readerexit(ui); 4129 md_biodone(pb); 4130 return; 4131 } 4132 if (more) { 4133 mutex_enter(&ps->ps_mx); 4134 ps->ps_frags++; 4135 mutex_exit(&ps->ps_mx); 4136 } 4137 md_call_strategy(cb, flag, private); 4138 } while (more); 4139 4140 if (!(flag & MD_STR_NOTTOP) && panicstr) { 4141 while (!(ps->ps_flags & MD_MPS_DONE)) { 4142 md_daemon(1, &md_done_daemon); 4143 drv_usecwait(10); 4144 } 4145 kmem_cache_free(mirror_parent_cache, ps); 4146 } 4147 } 4148 4149 static void 4150 mirror_read_strategy(buf_t *pb, int flag, void *private) 4151 { 4152 md_mps_t *ps; 4153 md_mcs_t *cs; 4154 size_t more; 4155 mm_unit_t *un; 4156 mdi_unit_t *ui; 4157 size_t current_count; 4158 diskaddr_t current_blkno; 4159 off_t current_offset; 4160 buf_t *cb; /* child buf pointer */ 4161 set_t setno; 4162 4163 ui = MDI_UNIT(getminor(pb->b_edev)); 4164 4165 md_kstat_waitq_enter(ui); 4166 4167 un = (mm_unit_t *)md_unit_readerlock(ui); 4168 4169 if (!(flag & MD_STR_NOTTOP)) { 4170 if (md_checkbuf(ui, (md_unit_t *)un, pb)) { 4171 md_kstat_waitq_exit(ui); 4172 return; 4173 } 4174 } 4175 4176 if (private == NULL) { 4177 ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS); 4178 mirror_parent_init(ps); 4179 } else { 4180 ps = private; 4181 private = NULL; 4182 } 4183 4184 if (flag & MD_STR_MAPPED) 4185 ps->ps_flags |= MD_MPS_MAPPED; 4186 if (flag & MD_NOBLOCK) 4187 ps->ps_flags |= MD_MPS_NOBLOCK; 4188 if (flag & MD_STR_WMUPDATE) 4189 ps->ps_flags |= MD_MPS_WMUPDATE; 4190 4191 /* 4192 * Check to see if this is a DMR driven read. If so we need to use the 4193 * specified side (in un->un_dmr_last_read) for the source of the data. 4194 */ 4195 if (flag & MD_STR_DMR) 4196 ps->ps_flags |= MD_MPS_DMR; 4197 4198 /* 4199 * Save essential information from the original buffhdr 4200 * in the md_save structure. 4201 */ 4202 ps->ps_un = un; 4203 ps->ps_ui = ui; 4204 ps->ps_bp = pb; 4205 ps->ps_addr = pb->b_un.b_addr; 4206 ps->ps_firstblk = pb->b_lblkno; 4207 ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1; 4208 ps->ps_changecnt = un->un_changecnt; 4209 4210 current_count = btodb(pb->b_bcount); 4211 current_blkno = pb->b_lblkno; 4212 current_offset = 0; 4213 4214 /* 4215 * If flag has MD_STR_WAR set this means that the read is issued by a 4216 * resync thread which may or may not be an optimised resync. 4217 * 4218 * If MD_UN_OPT_NOT_DONE is set this means that the optimized resync 4219 * code has not completed; either a resync has not started since snarf, 4220 * or there is an optimized resync in progress. 4221 * 4222 * We need to generate a write after this read in the following two 4223 * cases, 4224 * 4225 * 1. Any Resync-Generated read 4226 * 4227 * 2. Any read to a DIRTY REGION if there is an optimized resync 4228 * pending or in progress. 4229 * 4230 * The write after read is done in these cases to ensure that all sides 4231 * of the mirror are in sync with the read data and that it is not 4232 * possible for an application to read the same block multiple times 4233 * and get different data. 4234 * 4235 * This would be possible if the block was in a dirty region. 4236 * 4237 * If we're performing a directed read we don't write the data out as 4238 * the application is responsible for restoring the mirror to a known 4239 * state. 
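 *
 * As a summary of the test below (DMR = directed read, WAR = resync
 * generated read, OPT = MD_UN_OPT_NOT_DONE, dirty = the i/o touches
 * a dirty resync region):
 *
 *	DMR	WAR	OPT	dirty	write-after-read?
 *	 1	 x	 x	  x	no
 *	 0	 1	 x	  x	yes
 *	 0	 0	 1	  1	yes
 *	 0	 0	 1	  0	no
 *	 0	 0	 0	  x	no
 *
 * The 'yes' rows additionally require that the set is not marked
 * MD_SET_STALE.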
4240 */ 4241 if (((MD_STATUS(un) & MD_UN_OPT_NOT_DONE) || (flag & MD_STR_WAR)) && 4242 !(flag & MD_STR_DMR)) { 4243 size_t start_rr, i, end_rr; 4244 int region_dirty = 1; 4245 4246 /* 4247 * We enter here under three circumstances, 4248 * 4249 * MD_UN_OPT_NOT_DONE MD_STR_WAR 4250 * 0 1 4251 * 1 0 4252 * 1 1 4253 * 4254 * To be optimal we only care to explicitly check for dirty 4255 * regions in the second case since if MD_STR_WAR is set we 4256 * always do the write after read. 4257 */ 4258 if (!(flag & MD_STR_WAR)) { 4259 BLK_TO_RR(end_rr, ps->ps_lastblk, un); 4260 BLK_TO_RR(start_rr, ps->ps_firstblk, un); 4261 4262 for (i = start_rr; i <= end_rr; i++) 4263 if ((region_dirty = IS_KEEPDIRTY(i, un)) != 0) 4264 break; 4265 } 4266 4267 if ((region_dirty) && 4268 !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) { 4269 ps->ps_call = write_after_read; 4270 /* 4271 * Mark this as a RESYNC_READ in ps_flags. 4272 * This is used if the read fails during a 4273 * resync of a 3-way mirror to ensure that 4274 * the retried read to the remaining 4275 * good submirror has MD_STR_WAR set. This 4276 * is needed to ensure that the resync write 4277 * (write-after-read) takes place. 4278 */ 4279 ps->ps_flags |= MD_MPS_RESYNC_READ; 4280 4281 /* 4282 * If MD_STR_FLAG_ERR is set in the flags we 4283 * set MD_MPS_FLAG_ERROR so that an error on the resync 4284 * write (issued by write_after_read) will be flagged 4285 * to the biowait'ing resync thread. This allows us to 4286 * avoid issuing further resync requests to a device 4287 * that has had a write failure. 4288 */ 4289 if (flag & MD_STR_FLAG_ERR) 4290 ps->ps_flags |= MD_MPS_FLAG_ERROR; 4291 4292 setno = MD_UN2SET(un); 4293 /* 4294 * Drop the readerlock to avoid 4295 * deadlock 4296 */ 4297 md_unit_readerexit(ui); 4298 wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT); 4299 un = md_unit_readerlock(ui); 4300 /* 4301 * Ensure that we are owner 4302 */ 4303 if (MD_MNSET_SETNO(setno)) { 4304 /* 4305 * For a non-resync read that requires a 4306 * write-after-read to be done, set a flag 4307 * in the parent structure, so that the 4308 * write_strategy routine can omit the 4309 * test that the write is still within the 4310 * resync region 4311 */ 4312 if (!(flag & MD_STR_WAR)) 4313 ps->ps_flags |= MD_MPS_DIRTY_RD; 4314 4315 /* 4316 * Before reading the buffer, see if 4317 * there is an owner. 4318 */ 4319 if (MD_MN_NO_MIRROR_OWNER(un)) { 4320 ps->ps_call = NULL; 4321 mirror_overlap_tree_remove(ps); 4322 md_kstat_waitq_exit(ui); 4323 md_unit_readerexit(ui); 4324 daemon_request( 4325 &md_mirror_daemon, 4326 become_owner, 4327 (daemon_queue_t *)ps, 4328 REQ_OLD); 4329 return; 4330 } 4331 /* 4332 * For a resync read, check to see if I/O is 4333 * outside of the current resync region, or 4334 * the resync has finished. 
If so 4335 * just terminate the I/O 4336 */ 4337 if ((flag & MD_STR_WAR) && 4338 (!(un->c.un_status & MD_UN_WAR) || 4339 (!IN_RESYNC_REGION(un, ps)))) { 4340 #ifdef DEBUG 4341 if (mirror_debug_flag) 4342 printf("Abort resync read " 4343 "%x: %lld\n", 4344 MD_SID(un), 4345 ps->ps_firstblk); 4346 #endif 4347 mirror_overlap_tree_remove(ps); 4348 kmem_cache_free(mirror_parent_cache, 4349 ps); 4350 md_kstat_waitq_exit(ui); 4351 md_unit_readerexit(ui); 4352 md_biodone(pb); 4353 return; 4354 } 4355 } 4356 } 4357 } 4358 4359 if (flag & MD_STR_DMR) { 4360 ps->ps_call = directed_read_done; 4361 } 4362 4363 if (!(flag & MD_STR_NOTTOP) && panicstr) 4364 ps->ps_flags |= MD_MPS_DONTFREE; 4365 4366 md_kstat_waitq_to_runq(ui); 4367 4368 ps->ps_frags++; 4369 do { 4370 cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS); 4371 mirror_child_init(cs); 4372 cb = &cs->cs_buf; 4373 cs->cs_ps = ps; 4374 4375 cb = md_bioclone(pb, current_offset, current_count, NODEV, 4376 current_blkno, mirror_done, cb, KM_NOSLEEP); 4377 4378 more = mirror_map_read(ps, cs, current_blkno, 4379 (u_longlong_t)current_count); 4380 if (more) { 4381 mutex_enter(&ps->ps_mx); 4382 ps->ps_frags++; 4383 mutex_exit(&ps->ps_mx); 4384 } 4385 4386 /* 4387 * Do these calculations now, 4388 * so that we pickup a valid b_bcount from the chld_bp. 4389 */ 4390 current_count -= more; 4391 current_offset += cb->b_bcount; 4392 current_blkno += more; 4393 md_call_strategy(cb, flag, private); 4394 } while (more); 4395 4396 if (!(flag & MD_STR_NOTTOP) && panicstr) { 4397 while (!(ps->ps_flags & MD_MPS_DONE)) { 4398 md_daemon(1, &md_done_daemon); 4399 drv_usecwait(10); 4400 } 4401 kmem_cache_free(mirror_parent_cache, ps); 4402 } 4403 } 4404 4405 void 4406 md_mirror_strategy(buf_t *bp, int flag, void *private) 4407 { 4408 set_t setno = MD_MIN2SET(getminor(bp->b_edev)); 4409 4410 /* 4411 * When doing IO to a multi owner meta device, check if set is halted. 4412 * We do this check without the needed lock held, for performance 4413 * reasons. 4414 * If an IO just slips through while the set is locked via an 4415 * MD_MN_SUSPEND_SET, we don't care about it. 4416 * Only check for suspension if we are a top-level i/o request 4417 * (MD_STR_NOTTOP is cleared in 'flag'). 4418 */ 4419 if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) == 4420 (MD_SET_HALTED | MD_SET_MNSET)) { 4421 if ((flag & MD_STR_NOTTOP) == 0) { 4422 mutex_enter(&md_mx); 4423 /* Here we loop until the set is no longer halted */ 4424 while (md_set[setno].s_status & MD_SET_HALTED) { 4425 cv_wait(&md_cv, &md_mx); 4426 } 4427 mutex_exit(&md_mx); 4428 } 4429 } 4430 4431 if ((flag & MD_IO_COUNTED) == 0) { 4432 if ((flag & MD_NOBLOCK) == 0) { 4433 if (md_inc_iocount(setno) != 0) { 4434 bp->b_flags |= B_ERROR; 4435 bp->b_error = ENXIO; 4436 bp->b_resid = bp->b_bcount; 4437 biodone(bp); 4438 return; 4439 } 4440 } else { 4441 md_inc_iocount_noblock(setno); 4442 } 4443 } 4444 4445 if (bp->b_flags & B_READ) 4446 mirror_read_strategy(bp, flag, private); 4447 else 4448 mirror_write_strategy(bp, flag, private); 4449 } 4450 4451 /* 4452 * mirror_directed_read: 4453 * -------------------- 4454 * Entry-point for the DKIOCDMR ioctl. We issue a read to a specified sub-mirror 4455 * so that the application can determine what (if any) resync needs to be 4456 * performed. The data is copied out to the user-supplied buffer. 
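 *
 * A rough sketch of the caller's loop (illustrative only; the buffer
 * and file descriptor handling shown here are assumptions about the
 * application, not something this driver provides):
 *
 *	vol_directed_rd_t vdr;
 *
 *	bzero(&vdr, sizeof (vdr));
 *	vdr.vdr_flags = DKV_DMR_NEXT_SIDE;
 *	vdr.vdr_side = DKV_SIDE_INIT;
 *	vdr.vdr_offset = <DEV_BSIZE-aligned byte offset>;
 *	vdr.vdr_nbytes = <bytes to read>;
 *	vdr.vdr_data = <user buffer>;
 *	do {
 *		if (ioctl(fd, DKIOCDMR, &vdr) != 0)
 *			break;
 *		-- compare vdr.vdr_bytesread bytes for this side --
 *	} while (!(vdr.vdr_flags & (DKV_DMR_DONE | DKV_DMR_ERROR)));
 *
 * Each call reads from the next readable submirror (reported back in
 * vdr_side/vdr_side_name); DKV_DMR_DONE is set once the last readable
 * side has been read.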
4457 * 4458 * Parameters: 4459 * mdev - dev_t for the mirror device 4460 * vdr - directed read parameters specifying location and submirror 4461 * to perform the read from 4462 * mode - used to ddi_copyout() any resulting data from the read 4463 * 4464 * Returns: 4465 * 0 success 4466 * !0 error code 4467 * EINVAL - invalid request format 4468 */ 4469 int 4470 mirror_directed_read(dev_t mdev, vol_directed_rd_t *vdr, int mode) 4471 { 4472 buf_t *bp; 4473 minor_t mnum = getminor(mdev); 4474 mdi_unit_t *ui = MDI_UNIT(mnum); 4475 mm_unit_t *un; 4476 mm_submirror_t *sm; 4477 char *sm_nm; 4478 uint_t next_side; 4479 void *kbuffer; 4480 4481 if (ui == NULL) 4482 return (ENXIO); 4483 4484 if (!(vdr->vdr_flags & DKV_DMR_NEXT_SIDE)) { 4485 return (EINVAL); 4486 } 4487 4488 /* Check for aligned block access. We disallow non-aligned requests. */ 4489 if (vdr->vdr_offset % DEV_BSIZE) { 4490 return (EINVAL); 4491 } 4492 4493 /* 4494 * Allocate kernel buffer for target of read(). If we had a reliable 4495 * (sorry functional) DDI this wouldn't be needed. 4496 */ 4497 kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP); 4498 if (kbuffer == NULL) { 4499 cmn_err(CE_WARN, "mirror_directed_read: couldn't allocate %lx" 4500 " bytes\n", vdr->vdr_nbytes); 4501 return (ENOMEM); 4502 } 4503 4504 bp = getrbuf(KM_SLEEP); 4505 4506 bp->b_un.b_addr = kbuffer; 4507 bp->b_flags = B_READ; 4508 bp->b_bcount = vdr->vdr_nbytes; 4509 bp->b_lblkno = lbtodb(vdr->vdr_offset); 4510 bp->b_edev = mdev; 4511 4512 un = md_unit_readerlock(ui); 4513 4514 /* 4515 * If DKV_SIDE_INIT is set we need to determine the first available 4516 * side to start reading from. If it isn't set we increment to the 4517 * next readable submirror. 4518 * If there are no readable submirrors we error out with DKV_DMR_ERROR. 4519 * Note: we check for a readable submirror on completion of the i/o so 4520 * we should _always_ have one available. If this becomes unavailable 4521 * we have missed the 'DKV_DMR_DONE' opportunity. This could happen if 4522 * a metadetach is made between the completion of one DKIOCDMR ioctl 4523 * and the start of the next (i.e. a sys-admin 'accident' occurred). 4524 * The chance of this is small, but not non-existent. 4525 */ 4526 if (vdr->vdr_side == DKV_SIDE_INIT) { 4527 next_side = 0; 4528 } else { 4529 next_side = vdr->vdr_side + 1; 4530 } 4531 while ((next_side < NMIRROR) && 4532 !SUBMIRROR_IS_READABLE(un, next_side)) 4533 next_side++; 4534 if (next_side >= NMIRROR) { 4535 vdr->vdr_flags |= DKV_DMR_ERROR; 4536 freerbuf(bp); 4537 vdr->vdr_bytesread = 0; 4538 md_unit_readerexit(ui); 4539 return (0); 4540 } 4541 4542 /* Set the side to read from */ 4543 un->un_dmr_last_read = next_side; 4544 4545 md_unit_readerexit(ui); 4546 4547 /* 4548 * Save timestamp for verification purposes. Can be read by debugger 4549 * to verify that this ioctl has been executed and to find the number 4550 * of DMR reads and the time of the last DMR read. 4551 */ 4552 uniqtime(&mirror_dmr_stats.dmr_timestamp); 4553 mirror_dmr_stats.dmr_count++; 4554 4555 /* Issue READ request and wait for completion */ 4556 mirror_read_strategy(bp, MD_STR_DMR|MD_NOBLOCK|MD_STR_NOTTOP, NULL); 4557 4558 mutex_enter(&un->un_dmr_mx); 4559 cv_wait(&un->un_dmr_cv, &un->un_dmr_mx); 4560 mutex_exit(&un->un_dmr_mx); 4561 4562 /* 4563 * Check to see if we encountered an error during the read. If so we 4564 * can make no guarantee about any possibly returned data. 
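 *
 * On success the flags are set as follows by the code below: a short
 * transfer (b_resid != 0) reports DKV_DMR_SHORT with vdr_bytesread
 * set to the partial count, a full transfer reports DKV_DMR_SUCCESS,
 * and in both cases the kernel buffer is copied out to vdr_data with
 * ddi_copyout().  On error DKV_DMR_ERROR is set and the
 * SUCCESS/SHORT/DONE flags are cleared.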
4565 */ 4566 if ((bp->b_flags & B_ERROR) == 0) { 4567 vdr->vdr_flags &= ~DKV_DMR_ERROR; 4568 if (bp->b_resid) { 4569 vdr->vdr_flags |= DKV_DMR_SHORT; 4570 vdr->vdr_bytesread = vdr->vdr_nbytes - bp->b_resid; 4571 } else { 4572 vdr->vdr_flags |= DKV_DMR_SUCCESS; 4573 vdr->vdr_bytesread = vdr->vdr_nbytes; 4574 } 4575 /* Copy the data read back out to the user supplied buffer */ 4576 if (ddi_copyout(kbuffer, vdr->vdr_data, vdr->vdr_bytesread, 4577 mode)) { 4578 kmem_free(kbuffer, vdr->vdr_nbytes); 4579 return (EFAULT); 4580 } 4581 4582 } else { 4583 /* Error out with DKV_DMR_ERROR */ 4584 vdr->vdr_flags |= DKV_DMR_ERROR; 4585 vdr->vdr_flags &= ~(DKV_DMR_SUCCESS|DKV_DMR_SHORT|DKV_DMR_DONE); 4586 } 4587 /* 4588 * Update the DMR parameters with the side and name of submirror that 4589 * we have just read from (un->un_dmr_last_read) 4590 */ 4591 un = md_unit_readerlock(ui); 4592 4593 vdr->vdr_side = un->un_dmr_last_read; 4594 sm = &un->un_sm[un->un_dmr_last_read]; 4595 sm_nm = md_shortname(md_getminor(sm->sm_dev)); 4596 4597 (void) strncpy(vdr->vdr_side_name, sm_nm, sizeof (vdr->vdr_side_name)); 4598 4599 /* 4600 * Determine if we've completed the read cycle. This is true iff the 4601 * next computed submirror (side) equals or exceeds NMIRROR. We cannot 4602 * use un_nsm as we need to handle a sparse array of submirrors (which 4603 * can occur if a submirror is metadetached). 4604 */ 4605 next_side = un->un_dmr_last_read + 1; 4606 while ((next_side < NMIRROR) && 4607 !SUBMIRROR_IS_READABLE(un, next_side)) 4608 next_side++; 4609 if (next_side >= NMIRROR) { 4610 /* We've finished */ 4611 vdr->vdr_flags |= DKV_DMR_DONE; 4612 } 4613 4614 md_unit_readerexit(ui); 4615 freerbuf(bp); 4616 kmem_free(kbuffer, vdr->vdr_nbytes); 4617 4618 return (0); 4619 } 4620 4621 /* 4622 * mirror_resync_message: 4623 * --------------------- 4624 * Handle the multi-node resync messages that keep all nodes within a given 4625 * disk-set in sync with their view of a mirror's resync status. 4626 * 4627 * The message types dealt with are: 4628 * MD_MN_MSG_RESYNC_STARTING - start a resync thread for a unit 4629 * MD_MN_MSG_RESYNC_NEXT - specified next region to be resynced 4630 * MD_MN_MSG_RESYNC_FINISH - stop the resync thread for a unit 4631 * MD_MN_MSG_RESYNC_PHASE_DONE - end of a resync phase, opt, submirror or comp 4632 * 4633 * Returns: 4634 * 0 Success 4635 * >0 Failure error number 4636 */ 4637 int 4638 mirror_resync_message(md_mn_rs_params_t *p, IOLOCK *lockp) 4639 { 4640 mdi_unit_t *ui; 4641 mm_unit_t *un; 4642 set_t setno; 4643 int is_ABR; 4644 int smi; 4645 int ci; 4646 sm_state_t state; 4647 int broke_out; 4648 mm_submirror_t *sm; 4649 mm_submirror_ic_t *smic; 4650 md_m_shared_t *shared; 4651 md_error_t mde = mdnullerror; 4652 md_mps_t *ps; 4653 int rs_active; 4654 int rr, rr_start, rr_end; 4655 4656 /* Check that the given device is part of a multi-node set */ 4657 setno = MD_MIN2SET(p->mnum); 4658 if (setno >= md_nsets) { 4659 return (ENXIO); 4660 } 4661 if (!MD_MNSET_SETNO(setno)) { 4662 return (EINVAL); 4663 } 4664 4665 if ((un = mirror_getun(p->mnum, &p->mde, NO_LOCK, NULL)) == NULL) 4666 return (EINVAL); 4667 if ((ui = MDI_UNIT(p->mnum)) == NULL) 4668 return (EINVAL); 4669 is_ABR = (ui->ui_tstate & MD_ABR_CAP); 4670 4671 /* Obtain the current resync status */ 4672 (void) md_ioctl_readerlock(lockp, ui); 4673 rs_active = (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ? 
1 : 0; 4674 md_ioctl_readerexit(lockp); 4675 4676 switch ((md_mn_msgtype_t)p->msg_type) { 4677 case MD_MN_MSG_RESYNC_STARTING: 4678 /* Start the resync thread for the mirror */ 4679 (void) mirror_resync_unit(p->mnum, NULL, &p->mde, lockp); 4680 break; 4681 4682 case MD_MN_MSG_RESYNC_NEXT: 4683 /* 4684 * We have to release any previously marked overlap regions 4685 * so that i/o can resume. Then we need to block the region 4686 * from [rs_start..rs_start+rs_size) * so that no i/o is issued. 4687 * Update un_rs_resync_done and un_rs_resync_2_do. 4688 */ 4689 (void) md_ioctl_readerlock(lockp, ui); 4690 /* 4691 * Ignore the message if there is no active resync thread or 4692 * if it is for a resync type that we have already completed. 4693 * un_resync_completed is set to the last resync completed 4694 * when processing a PHASE_DONE message. 4695 */ 4696 if (!rs_active || (p->rs_type == un->un_resync_completed)) 4697 break; 4698 /* 4699 * If this message is for the same resync and is for an earlier 4700 * resync region, just ignore it. This can only occur if this 4701 * node has progressed on to the next resync region before 4702 * we receive this message. This can occur if the class for 4703 * this message is busy and the originator has to retry thus 4704 * allowing this node to move onto the next resync_region. 4705 */ 4706 if ((p->rs_type == un->un_rs_type) && 4707 (p->rs_start < un->un_resync_startbl)) 4708 break; 4709 ps = un->un_rs_prev_overlap; 4710 4711 /* Allocate previous overlap reference if needed */ 4712 if (ps == NULL) { 4713 ps = kmem_cache_alloc(mirror_parent_cache, 4714 MD_ALLOCFLAGS); 4715 ps->ps_un = un; 4716 ps->ps_ui = ui; 4717 ps->ps_firstblk = 0; 4718 ps->ps_lastblk = 0; 4719 ps->ps_flags = 0; 4720 md_ioctl_readerexit(lockp); 4721 (void) md_ioctl_writerlock(lockp, ui); 4722 un->un_rs_prev_overlap = ps; 4723 md_ioctl_writerexit(lockp); 4724 } else 4725 md_ioctl_readerexit(lockp); 4726 4727 if (p->rs_originator != md_mn_mynode_id) { 4728 /* 4729 * Clear our un_resync_bm for the regions completed. 4730 * The owner (originator) will take care of itself. 4731 */ 4732 BLK_TO_RR(rr_end, ps->ps_lastblk, un); 4733 BLK_TO_RR(rr_start, p->rs_start, un); 4734 if (ps->ps_lastblk && rr_end < rr_start) { 4735 BLK_TO_RR(rr_start, ps->ps_firstblk, un); 4736 mutex_enter(&un->un_resync_mx); 4737 /* 4738 * Update our resync bitmap to reflect that 4739 * another node has synchronized this range. 4740 */ 4741 for (rr = rr_start; rr <= rr_end; rr++) { 4742 CLR_KEEPDIRTY(rr, un); 4743 } 4744 mutex_exit(&un->un_resync_mx); 4745 } 4746 4747 /* 4748 * On all but the originating node, first update 4749 * the resync state, then unblock the previous 4750 * region and block the next one. No need 4751 * to do this if the region is already blocked. 4752 * Update the submirror state and flags from the 4753 * originator. This keeps the cluster in sync with 4754 * regards to the resync status. 
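 *
 * The sequence below is therefore:
 *	1. under the ioctl writerlock, copy rs_done, rs_2_do, rs_type
 *	   and rs_start into the unit so that our view matches the
 *	   originator's,
 *	2. under un_owner_mx, record the new
 *	   [rs_start, rs_start + rs_size) range in un_rs_prev_overlap;
 *	   if we are not the owner, the old overlap entry is removed
 *	   and the new range is blocked from normal i/o,
 *	3. on the first RESYNC_NEXT message, issue the
 *	   ESC_SVM_RESYNC_START notify event,
 *	4. make sure a local resync thread is running.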
4755 */ 4756 4757 (void) md_ioctl_writerlock(lockp, ui); 4758 un->un_rs_resync_done = p->rs_done; 4759 un->un_rs_resync_2_do = p->rs_2_do; 4760 un->un_rs_type = p->rs_type; 4761 un->un_resync_startbl = p->rs_start; 4762 md_ioctl_writerexit(lockp); 4763 /* 4764 * Use un_owner_mx to ensure that an ownership change 4765 * cannot happen at the same time as this message 4766 */ 4767 mutex_enter(&un->un_owner_mx); 4768 if (MD_MN_MIRROR_OWNER(un)) { 4769 ps->ps_firstblk = p->rs_start; 4770 ps->ps_lastblk = ps->ps_firstblk + 4771 p->rs_size - 1; 4772 } else { 4773 if ((ps->ps_firstblk != p->rs_start) || 4774 (ps->ps_lastblk != p->rs_start + 4775 p->rs_size - 1)) { 4776 /* Remove previous overlap range */ 4777 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 4778 mirror_overlap_tree_remove(ps); 4779 4780 ps->ps_firstblk = p->rs_start; 4781 ps->ps_lastblk = ps->ps_firstblk + 4782 p->rs_size - 1; 4783 4784 mutex_exit(&un->un_owner_mx); 4785 /* Block this range from all i/o. */ 4786 if (ps->ps_firstblk != 0 || 4787 ps->ps_lastblk != 0) 4788 wait_for_overlaps(ps, 4789 MD_OVERLAP_ALLOW_REPEAT); 4790 mutex_enter(&un->un_owner_mx); 4791 /* 4792 * Check to see if we have obtained 4793 * ownership while waiting for 4794 * overlaps. If we have, remove 4795 * the resync_region entry from the 4796 * overlap tree 4797 */ 4798 if (MD_MN_MIRROR_OWNER(un) && 4799 (ps->ps_flags & MD_MPS_ON_OVERLAP)) 4800 mirror_overlap_tree_remove(ps); 4801 } 4802 } 4803 mutex_exit(&un->un_owner_mx); 4804 4805 /* 4806 * If this is the first RESYNC_NEXT message (i.e. 4807 * MD_MN_RS_FIRST_RESYNC_NEXT set in p->rs_flags), 4808 * issue RESYNC_START NOTIFY event 4809 */ 4810 if (p->rs_flags & MD_MN_RS_FIRST_RESYNC_NEXT) { 4811 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_START, 4812 SVM_TAG_METADEVICE, MD_UN2SET(un), 4813 MD_SID(un)); 4814 } 4815 4816 /* Ensure that our local resync thread is running */ 4817 if (un->un_rs_thread == NULL) { 4818 (void) mirror_resync_unit(p->mnum, NULL, 4819 &p->mde, lockp); 4820 } 4821 } 4822 4823 break; 4824 case MD_MN_MSG_RESYNC_FINISH: 4825 /* 4826 * Complete the resync by stopping the resync thread. 4827 * Also release the previous overlap region field. 4828 * Update the resync_progress_thread by cv_signal'ing it so 4829 * that we mark the end of the resync as soon as possible. This 4830 * stops an unnecessary delay should be panic after resync 4831 * completion. 
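 *
 * In outline: tell the local resync thread to shut down (unless we
 * are the originator), clear the mirror owner if the mirror is ABR,
 * release the previous-overlap parent structure, record the final
 * resync counts, wake the progress thread, clear RESYNC_ACTIVE and
 * retry any grow_unit that was deferred while the resync ran.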
4832 */ 4833 #ifdef DEBUG 4834 if (!rs_active) { 4835 if (mirror_debug_flag) 4836 printf("RESYNC_FINISH (mnum = %x), " 4837 "Resync *NOT* active", 4838 p->mnum); 4839 } 4840 #endif 4841 4842 if ((un->c.un_status & MD_UN_RESYNC_ACTIVE) && 4843 (p->rs_originator != md_mn_mynode_id)) { 4844 mutex_enter(&un->un_rs_thread_mx); 4845 un->c.un_status &= ~MD_UN_RESYNC_CANCEL; 4846 un->un_rs_thread_flags |= MD_RI_SHUTDOWN; 4847 un->un_rs_thread_flags &= 4848 ~(MD_RI_BLOCK|MD_RI_BLOCK_OWNER); 4849 cv_signal(&un->un_rs_thread_cv); 4850 mutex_exit(&un->un_rs_thread_mx); 4851 } 4852 if (is_ABR) { 4853 /* Resync finished, if ABR set owner to NULL */ 4854 mutex_enter(&un->un_owner_mx); 4855 un->un_mirror_owner = 0; 4856 mutex_exit(&un->un_owner_mx); 4857 } 4858 (void) md_ioctl_writerlock(lockp, ui); 4859 ps = un->un_rs_prev_overlap; 4860 if (ps != NULL) { 4861 /* Remove previous overlap range */ 4862 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 4863 mirror_overlap_tree_remove(ps); 4864 /* 4865 * Release the overlap range reference 4866 */ 4867 un->un_rs_prev_overlap = NULL; 4868 kmem_cache_free(mirror_parent_cache, 4869 ps); 4870 } 4871 md_ioctl_writerexit(lockp); 4872 4873 /* Mark the resync as complete in the metadb */ 4874 un->un_rs_resync_done = p->rs_done; 4875 un->un_rs_resync_2_do = p->rs_2_do; 4876 un->un_rs_type = p->rs_type; 4877 mutex_enter(&un->un_rs_progress_mx); 4878 cv_signal(&un->un_rs_progress_cv); 4879 mutex_exit(&un->un_rs_progress_mx); 4880 4881 un = md_ioctl_writerlock(lockp, ui); 4882 un->c.un_status &= ~MD_UN_RESYNC_ACTIVE; 4883 /* Deal with any pending grow_unit */ 4884 if (un->c.un_status & MD_UN_GROW_PENDING) { 4885 if ((mirror_grow_unit(un, &mde) != 0) || 4886 (! mdismderror(&mde, MDE_GROW_DELAYED))) { 4887 un->c.un_status &= ~MD_UN_GROW_PENDING; 4888 } 4889 } 4890 md_ioctl_writerexit(lockp); 4891 break; 4892 4893 case MD_MN_MSG_RESYNC_PHASE_DONE: 4894 /* 4895 * A phase of the resync, optimized. component or 4896 * submirror is complete. Update mirror status. 4897 * If the flag CLEAR_OPT_NOT_DONE is set, it means that the 4898 * mirror owner is peforming a resync. If we have just snarfed 4899 * this set, then we must clear any of the flags set at snarf 4900 * time by unit_setup_resync(). 4901 * Note that unit_setup_resync() sets up these flags to 4902 * indicate that an optimized resync is required. These flags 4903 * need to be reset because if we get here, the mirror owner 4904 * will have handled the optimized resync. 4905 * The flags that must be cleared are MD_UN_OPT_NOT_DONE and 4906 * MD_UN_WAR. In addition, for each submirror, 4907 * MD_SM_RESYNC_TARGET must be cleared and SMS_OFFLINE_RESYNC 4908 * set to SMS_OFFLINE. 4909 */ 4910 #ifdef DEBUG 4911 if (mirror_debug_flag) 4912 printf("phase done mess received from %d, mnum=%x," 4913 "type=%x, flags=%x\n", p->rs_originator, p->mnum, 4914 p->rs_type, p->rs_flags); 4915 #endif 4916 /* 4917 * Ignore the message if there is no active resync thread. 4918 */ 4919 if (!rs_active) 4920 break; 4921 4922 broke_out = p->rs_flags & MD_MN_RS_ERR; 4923 switch (RS_TYPE(p->rs_type)) { 4924 case MD_RS_OPTIMIZED: 4925 un = md_ioctl_writerlock(lockp, ui); 4926 if (p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE) { 4927 /* If we are originator, just clear rs_type */ 4928 if (p->rs_originator == md_mn_mynode_id) { 4929 SET_RS_TYPE_NONE(un->un_rs_type); 4930 md_ioctl_writerexit(lockp); 4931 break; 4932 } 4933 /* 4934 * If CLEAR_OPT_NOT_DONE is set, only clear the 4935 * flags if OPT_NOT_DONE is set *and* rs_type 4936 * is MD_RS_NONE. 
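 *
 * In other words, the only state to undo here is what
 * unit_setup_resync() left behind at snarf time; if un_rs_type shows
 * a resync in progress on this node the message is stale and is
 * ignored.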
4937 */ 4938 if ((un->c.un_status & MD_UN_OPT_NOT_DONE) && 4939 (RS_TYPE(un->un_rs_type) == MD_RS_NONE)) { 4940 /* No resync in progress */ 4941 un->c.un_status &= ~MD_UN_OPT_NOT_DONE; 4942 un->c.un_status &= ~MD_UN_WAR; 4943 } else { 4944 /* 4945 * We are in the middle of an 4946 * optimized resync and this message 4947 * should be ignored. 4948 */ 4949 md_ioctl_writerexit(lockp); 4950 break; 4951 } 4952 } else { 4953 /* 4954 * This is the end of an optimized resync, 4955 * clear the OPT_NOT_DONE and OFFLINE_SM flags 4956 */ 4957 4958 un->c.un_status &= ~MD_UN_KEEP_DIRTY; 4959 if (!broke_out) 4960 un->c.un_status &= ~MD_UN_WAR; 4961 4962 /* 4963 * Clear our un_resync_bm for the regions 4964 * completed. The owner (originator) will 4965 * take care of itself. 4966 */ 4967 if (p->rs_originator != md_mn_mynode_id && 4968 (ps = un->un_rs_prev_overlap) != NULL) { 4969 BLK_TO_RR(rr_start, ps->ps_firstblk, 4970 un); 4971 BLK_TO_RR(rr_end, ps->ps_lastblk, un); 4972 mutex_enter(&un->un_resync_mx); 4973 for (rr = rr_start; rr <= rr_end; 4974 rr++) { 4975 CLR_KEEPDIRTY(rr, un); 4976 } 4977 mutex_exit(&un->un_resync_mx); 4978 } 4979 } 4980 4981 /* 4982 * Set resync_completed to last resync type and then 4983 * clear resync_type to indicate no resync in progress 4984 */ 4985 un->un_resync_completed = un->un_rs_type; 4986 SET_RS_TYPE_NONE(un->un_rs_type); 4987 4988 /* 4989 * If resync is as a result of a submirror ONLINE, 4990 * reset the submirror state to SMS_RUNNING if the 4991 * resync was ok else set back to SMS_OFFLINE. 4992 */ 4993 for (smi = 0; smi < NMIRROR; smi++) { 4994 un->un_sm[smi].sm_flags &= 4995 ~MD_SM_RESYNC_TARGET; 4996 if (SMS_BY_INDEX_IS(un, smi, 4997 SMS_OFFLINE_RESYNC)) { 4998 if (p->rs_flags & 4999 MD_MN_RS_CLEAR_OPT_NOT_DONE) { 5000 state = SMS_OFFLINE; 5001 } else { 5002 state = (broke_out ? 5003 SMS_OFFLINE : SMS_RUNNING); 5004 } 5005 mirror_set_sm_state( 5006 &un->un_sm[smi], 5007 &un->un_smic[smi], state, 5008 broke_out); 5009 mirror_commit(un, NO_SUBMIRRORS, 5010 0); 5011 } 5012 /* 5013 * If we still have an offline submirror, reset 5014 * the OFFLINE_SM flag in the mirror status 5015 */ 5016 if (SMS_BY_INDEX_IS(un, smi, 5017 SMS_OFFLINE)) 5018 un->c.un_status |= 5019 MD_UN_OFFLINE_SM; 5020 } 5021 md_ioctl_writerexit(lockp); 5022 break; 5023 case MD_RS_SUBMIRROR: 5024 un = md_ioctl_writerlock(lockp, ui); 5025 smi = RS_SMI(p->rs_type); 5026 sm = &un->un_sm[smi]; 5027 smic = &un->un_smic[smi]; 5028 /* Clear RESYNC target */ 5029 un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET; 5030 /* 5031 * Set resync_completed to last resync type and then 5032 * clear resync_type to indicate no resync in progress 5033 */ 5034 un->un_resync_completed = un->un_rs_type; 5035 SET_RS_TYPE_NONE(un->un_rs_type); 5036 /* 5037 * If the resync completed ok reset the submirror 5038 * state to SMS_RUNNING else reset it to SMS_ATTACHED 5039 */ 5040 state = (broke_out ? 
5041 SMS_ATTACHED : SMS_RUNNING); 5042 mirror_set_sm_state(sm, smic, state, broke_out); 5043 un->c.un_status &= ~MD_UN_WAR; 5044 mirror_commit(un, SMI2BIT(smi), 0); 5045 md_ioctl_writerexit(lockp); 5046 break; 5047 case MD_RS_COMPONENT: 5048 un = md_ioctl_writerlock(lockp, ui); 5049 smi = RS_SMI(p->rs_type); 5050 ci = RS_CI(p->rs_type); 5051 sm = &un->un_sm[smi]; 5052 smic = &un->un_smic[smi]; 5053 shared = (md_m_shared_t *) 5054 (*(smic->sm_shared_by_indx)) 5055 (sm->sm_dev, sm, ci); 5056 un->c.un_status &= ~MD_UN_WAR; 5057 /* Clear RESYNC target */ 5058 un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET; 5059 /* 5060 * Set resync_completed to last resync type and then 5061 * clear resync_type to indicate no resync in progress 5062 */ 5063 un->un_resync_completed = un->un_rs_type; 5064 SET_RS_TYPE_NONE(un->un_rs_type); 5065 5066 /* 5067 * If the resync completed ok, set the component state 5068 * to CS_OKAY. 5069 */ 5070 if (broke_out) 5071 shared->ms_flags |= MDM_S_RS_TRIED; 5072 else { 5073 /* 5074 * As we don't transmit the changes, 5075 * no need to drop the lock. 5076 */ 5077 set_sm_comp_state(un, smi, ci, CS_OKAY, 0, 5078 MD_STATE_NO_XMIT, (IOLOCK *)NULL); 5079 } 5080 md_ioctl_writerexit(lockp); 5081 default: 5082 break; 5083 } 5084 /* 5085 * If the purpose of this PHASE_DONE message is just to 5086 * indicate to all other nodes that the optimized resync 5087 * required (OPT_NOT_DONE) flag is to be cleared, there is 5088 * no need to generate a notify event as there has not 5089 * actually been a resync. 5090 */ 5091 if (!(p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE)) { 5092 if (broke_out) { 5093 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_FAILED, 5094 SVM_TAG_METADEVICE, MD_UN2SET(un), 5095 MD_SID(un)); 5096 } else { 5097 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_DONE, 5098 SVM_TAG_METADEVICE, MD_UN2SET(un), 5099 MD_SID(un)); 5100 } 5101 } 5102 break; 5103 5104 default: 5105 #ifdef DEBUG 5106 cmn_err(CE_PANIC, "mirror_resync_message: Unknown message type" 5107 " %x\n", p->msg_type); 5108 #endif 5109 return (EINVAL); 5110 } 5111 return (0); 5112 } 5113 5114 /* Return a -1 if snarf of optimized record failed and set should be released */ 5115 static int 5116 mirror_snarf(md_snarfcmd_t cmd, set_t setno) 5117 { 5118 mddb_recid_t recid; 5119 int gotsomething; 5120 int all_mirrors_gotten; 5121 mm_unit_t *un; 5122 mddb_type_t typ1; 5123 mddb_de_ic_t *dep; 5124 mddb_rb32_t *rbp; 5125 size_t newreqsize; 5126 mm_unit_t *big_un; 5127 mm_unit32_od_t *small_un; 5128 int retval; 5129 mdi_unit_t *ui; 5130 5131 if (cmd == MD_SNARF_CLEANUP) { 5132 if (md_get_setstatus(setno) & MD_SET_STALE) 5133 return (0); 5134 5135 recid = mddb_makerecid(setno, 0); 5136 typ1 = (mddb_type_t)md_getshared_key(setno, 5137 mirror_md_ops.md_driver.md_drivername); 5138 while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) { 5139 if (mddb_getrecprivate(recid) & MD_PRV_CLEANUP) { 5140 un = (mm_unit_t *)mddb_getrecaddr(recid); 5141 mirror_cleanup(un); 5142 recid = mddb_makerecid(setno, 0); 5143 } 5144 } 5145 return (0); 5146 } 5147 5148 all_mirrors_gotten = 1; 5149 gotsomething = 0; 5150 5151 recid = mddb_makerecid(setno, 0); 5152 typ1 = (mddb_type_t)md_getshared_key(setno, 5153 mirror_md_ops.md_driver.md_drivername); 5154 5155 while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) { 5156 if (mddb_getrecprivate(recid) & MD_PRV_GOTIT) 5157 continue; 5158 5159 dep = mddb_getrecdep(recid); 5160 dep->de_flags = MDDB_F_MIRROR; 5161 rbp = dep->de_rb; 5162 5163 switch (rbp->rb_revision) { 5164 case MDDB_REV_RB: 5165 case 
MDDB_REV_RBFN: 5166 if ((rbp->rb_private & MD_PRV_CONVD) == 0) { 5167 /* 5168 * This means, we have an old and small 5169 * record and this record hasn't already 5170 * been converted. Before we create an 5171 * incore metadevice from this we have to 5172 * convert it to a big record. 5173 */ 5174 small_un = 5175 (mm_unit32_od_t *)mddb_getrecaddr(recid); 5176 newreqsize = sizeof (mm_unit_t); 5177 big_un = (mm_unit_t *)kmem_zalloc(newreqsize, 5178 KM_SLEEP); 5179 mirror_convert((caddr_t)small_un, 5180 (caddr_t)big_un, SMALL_2_BIG); 5181 kmem_free(small_un, dep->de_reqsize); 5182 5183 /* 5184 * Update userdata and incore userdata 5185 * incores are at the end of un 5186 */ 5187 dep->de_rb_userdata_ic = big_un; 5188 dep->de_rb_userdata = big_un; 5189 dep->de_icreqsize = newreqsize; 5190 un = big_un; 5191 rbp->rb_private |= MD_PRV_CONVD; 5192 } else { 5193 /* 5194 * Unit already converted, just get the 5195 * record address. 5196 */ 5197 un = (mm_unit_t *)mddb_getrecaddr_resize(recid, 5198 sizeof (*un), 0); 5199 } 5200 un->c.un_revision &= ~MD_64BIT_META_DEV; 5201 break; 5202 case MDDB_REV_RB64: 5203 case MDDB_REV_RB64FN: 5204 /* Big device */ 5205 un = (mm_unit_t *)mddb_getrecaddr_resize(recid, 5206 sizeof (*un), 0); 5207 un->c.un_revision |= MD_64BIT_META_DEV; 5208 un->c.un_flag |= MD_EFILABEL; 5209 break; 5210 } 5211 MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision); 5212 5213 /* 5214 * Create minor device node for snarfed entry. 5215 */ 5216 (void) md_create_minor_node(setno, MD_SID(un)); 5217 5218 if (MD_UNIT(MD_SID(un)) != NULL) { 5219 mddb_setrecprivate(recid, MD_PRV_PENDDEL); 5220 continue; 5221 } 5222 all_mirrors_gotten = 0; 5223 retval = mirror_build_incore(un, 1); 5224 if (retval == 0) { 5225 mddb_setrecprivate(recid, MD_PRV_GOTIT); 5226 md_create_unit_incore(MD_SID(un), &mirror_md_ops, 0); 5227 resync_start_timeout(setno); 5228 gotsomething = 1; 5229 } else { 5230 return (retval); 5231 } 5232 /* 5233 * Set flag to indicate that the mirror has not yet 5234 * been through a reconfig. This flag is used for MN sets 5235 * when determining whether to update the mirror state from 5236 * the Master node. 
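 *
 * The flag in question is MD_RESYNC_NOT_DONE in ui_tstate; it is set
 * below for multi-node sets only, once the incore unit exists.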
5237 */ 5238 if (MD_MNSET_SETNO(setno)) { 5239 ui = MDI_UNIT(MD_SID(un)); 5240 ui->ui_tstate |= MD_RESYNC_NOT_DONE; 5241 } 5242 } 5243 5244 if (!all_mirrors_gotten) 5245 return (gotsomething); 5246 5247 recid = mddb_makerecid(setno, 0); 5248 while ((recid = mddb_getnextrec(recid, typ1, RESYNC_REC)) > 0) 5249 if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT)) 5250 mddb_setrecprivate(recid, MD_PRV_PENDDEL); 5251 5252 return (0); 5253 } 5254 5255 static int 5256 mirror_halt(md_haltcmd_t cmd, set_t setno) 5257 { 5258 unit_t i; 5259 mdi_unit_t *ui; 5260 minor_t mnum; 5261 int reset_mirror_flag = 0; 5262 5263 if (cmd == MD_HALT_CLOSE) 5264 return (0); 5265 5266 if (cmd == MD_HALT_OPEN) 5267 return (0); 5268 5269 if (cmd == MD_HALT_UNLOAD) 5270 return (0); 5271 5272 if (cmd == MD_HALT_CHECK) { 5273 for (i = 0; i < md_nunits; i++) { 5274 mnum = MD_MKMIN(setno, i); 5275 if ((ui = MDI_UNIT(mnum)) == NULL) 5276 continue; 5277 if (ui->ui_opsindex != mirror_md_ops.md_selfindex) 5278 continue; 5279 if (md_unit_isopen(ui)) 5280 return (1); 5281 } 5282 return (0); 5283 } 5284 5285 if (cmd != MD_HALT_DOIT) 5286 return (1); 5287 5288 for (i = 0; i < md_nunits; i++) { 5289 mnum = MD_MKMIN(setno, i); 5290 if ((ui = MDI_UNIT(mnum)) == NULL) 5291 continue; 5292 if (ui->ui_opsindex != mirror_md_ops.md_selfindex) 5293 continue; 5294 reset_mirror((mm_unit_t *)MD_UNIT(mnum), mnum, 0); 5295 5296 /* Set a flag if there is at least one mirror metadevice. */ 5297 reset_mirror_flag = 1; 5298 } 5299 5300 /* 5301 * Only wait for the global dr_timeout to finish 5302 * - if there are mirror metadevices in this diskset or 5303 * - if this is the local set since an unload of the md_mirror 5304 * driver could follow a successful mirror halt in the local set. 5305 */ 5306 if ((reset_mirror_flag != 0) || (setno == MD_LOCAL_SET)) { 5307 while ((mirror_md_ops.md_head == NULL) && 5308 (mirror_timeout.dr_timeout_id != 0)) 5309 delay(md_hz); 5310 } 5311 5312 return (0); 5313 } 5314 5315 /*ARGSUSED3*/ 5316 static int 5317 mirror_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags) 5318 { 5319 IOLOCK lock; 5320 minor_t mnum = getminor(*dev); 5321 set_t setno; 5322 5323 /* 5324 * When doing an open of a multi owner metadevice, check to see if this 5325 * node is a starting node and if a reconfig cycle is underway. 5326 * If so, the system isn't sufficiently set up enough to handle the 5327 * open (which involves I/O during sp_validate), so fail with ENXIO. 5328 */ 5329 setno = MD_MIN2SET(mnum); 5330 if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) == 5331 (MD_SET_MNSET | MD_SET_MN_START_RC)) { 5332 return (ENXIO); 5333 } 5334 5335 if (md_oflags & MD_OFLG_FROMIOCTL) { 5336 /* 5337 * This indicates that the caller is an ioctl service routine. 5338 * In this case we initialise our stack-based IOLOCK and pass 5339 * this into the internal open routine. This allows multi-owner 5340 * metadevices to avoid deadlocking if an error is encountered 5341 * during the open() attempt. The failure case is: 5342 * s-p -> mirror -> s-p (with error). Attempting to metaclear 5343 * this configuration would deadlock as the mirror code has to 5344 * send a state-update to the other nodes when it detects the 5345 * failure of the underlying submirror with an errored soft-part 5346 * on it. 
As there is a class1 message in progress (metaclear) 5347 * set_sm_comp_state() cannot send another class1 message; 5348 * instead we do not send a state_update message as the 5349 * metaclear is distributed and the failed submirror will be 5350 * cleared from the configuration by the metaclear. 5351 */ 5352 IOLOCK_INIT(&lock); 5353 return (mirror_internal_open(getminor(*dev), flag, otyp, 5354 md_oflags, &lock)); 5355 } else { 5356 return (mirror_internal_open(getminor(*dev), flag, otyp, 5357 md_oflags, (IOLOCK *)NULL)); 5358 } 5359 } 5360 5361 5362 /*ARGSUSED1*/ 5363 static int 5364 mirror_close(dev_t dev, int flag, int otyp, cred_t *cred_p, int md_cflags) 5365 { 5366 return (mirror_internal_close(getminor(dev), otyp, md_cflags, 5367 (IOLOCK *)NULL)); 5368 } 5369 5370 5371 /* 5372 * This routine dumps memory to the disk. It assumes that the memory has 5373 * already been mapped into mainbus space. It is called at disk interrupt 5374 * priority when the system is in trouble. 5375 * 5376 */ 5377 static int 5378 mirror_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk) 5379 { 5380 mm_unit_t *un; 5381 dev_t mapdev; 5382 int result; 5383 int smi; 5384 int any_succeed = 0; 5385 int save_result = 0; 5386 5387 /* 5388 * Don't need to grab the unit lock. 5389 * Cause nothing else is suppose to be happenning. 5390 * Also dump is not suppose to sleep. 5391 */ 5392 un = (mm_unit_t *)MD_UNIT(getminor(dev)); 5393 5394 if ((diskaddr_t)blkno >= un->c.un_total_blocks) 5395 return (EINVAL); 5396 5397 if ((diskaddr_t)blkno + nblk > un->c.un_total_blocks) 5398 return (EINVAL); 5399 5400 for (smi = 0; smi < NMIRROR; smi++) { 5401 if (!SUBMIRROR_IS_WRITEABLE(un, smi)) 5402 continue; 5403 mapdev = md_dev64_to_dev(un->un_sm[smi].sm_dev); 5404 result = bdev_dump(mapdev, addr, blkno, nblk); 5405 if (result) 5406 save_result = result; 5407 5408 if (result == 0) 5409 any_succeed++; 5410 } 5411 5412 if (any_succeed) 5413 return (0); 5414 5415 return (save_result); 5416 } 5417 5418 /* 5419 * NAME: mirror_probe_dev 5420 * 5421 * DESCRITPION: force opens every component of a mirror. 5422 * 5423 * On entry the unit writerlock is held 5424 */ 5425 static int 5426 mirror_probe_dev(mdi_unit_t *ui, minor_t mnum) 5427 { 5428 int i; 5429 int smi; 5430 int ci; 5431 mm_unit_t *un; 5432 int md_devopen = 0; 5433 set_t setno; 5434 int sm_cnt; 5435 int sm_unavail_cnt; 5436 5437 if (md_unit_isopen(ui)) 5438 md_devopen++; 5439 5440 un = MD_UNIT(mnum); 5441 setno = MD_UN2SET(un); 5442 5443 sm_cnt = 0; 5444 sm_unavail_cnt = 0; 5445 for (i = 0; i < NMIRROR; i++) { 5446 md_dev64_t tmpdev; 5447 mdi_unit_t *sm_ui; 5448 5449 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) { 5450 continue; 5451 } 5452 5453 sm_cnt++; 5454 tmpdev = un->un_sm[i].sm_dev; 5455 (void) md_layered_open(mnum, &tmpdev, 5456 MD_OFLG_CONT_ERRS | MD_OFLG_PROBEDEV); 5457 un->un_sm[i].sm_dev = tmpdev; 5458 5459 sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev))); 5460 5461 /* 5462 * Logic similar to that in mirror_open_all_devs. We set or 5463 * clear the submirror Unavailable bit. 5464 */ 5465 (void) md_unit_writerlock(sm_ui); 5466 if (submirror_unavailable(un, i, 1)) { 5467 sm_ui->ui_tstate |= MD_INACCESSIBLE; 5468 sm_unavail_cnt++; 5469 } else { 5470 sm_ui->ui_tstate &= ~MD_INACCESSIBLE; 5471 } 5472 md_unit_writerexit(sm_ui); 5473 } 5474 5475 /* 5476 * If all of the submirrors are unavailable, the mirror is also 5477 * unavailable. 
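 *
 * sm_cnt is the number of submirrors in use and sm_unavail_cnt the
 * number whose probe open left them marked MD_INACCESSIBLE above, so
 * the comparison below marks the whole mirror inaccessible only when
 * every submirror failed the probe.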
5478 */ 5479 if (sm_cnt == sm_unavail_cnt) { 5480 ui->ui_tstate |= MD_INACCESSIBLE; 5481 } else { 5482 ui->ui_tstate &= ~MD_INACCESSIBLE; 5483 } 5484 5485 /* 5486 * Start checking from probe failures. If failures occur we 5487 * set the appropriate erred state only if the metadevice is in 5488 * use. This is specifically to prevent unnecessary resyncs. 5489 * For instance if the disks were accidentally disconnected when 5490 * the system booted up then until the metadevice is accessed 5491 * (like file system mount) the user can shutdown, recable and 5492 * reboot w/o incurring a potentially huge resync. 5493 */ 5494 5495 smi = 0; 5496 ci = 0; 5497 while (mirror_geterror(un, &smi, &ci, 1, 1) != 0) { 5498 5499 if (mirror_other_sources(un, smi, ci, 0) == 1) { 5500 /* 5501 * Note that for a MN set, there is no need to call 5502 * SE_NOTIFY as that is done when processing the 5503 * state change 5504 */ 5505 if (md_devopen) { 5506 /* 5507 * Never called from ioctl context, 5508 * so (IOLOCK *)NULL 5509 */ 5510 set_sm_comp_state(un, smi, ci, CS_LAST_ERRED, 5511 0, MD_STATE_XMIT, (IOLOCK *)NULL); 5512 if (!MD_MNSET_SETNO(setno)) { 5513 SE_NOTIFY(EC_SVM_STATE, 5514 ESC_SVM_LASTERRED, 5515 SVM_TAG_METADEVICE, setno, 5516 MD_SID(un)); 5517 } 5518 continue; 5519 } else { 5520 (void) mirror_close_all_devs(un, 5521 MD_OFLG_PROBEDEV); 5522 if (!MD_MNSET_SETNO(setno)) { 5523 SE_NOTIFY(EC_SVM_STATE, 5524 ESC_SVM_OPEN_FAIL, 5525 SVM_TAG_METADEVICE, setno, 5526 MD_SID(un)); 5527 } 5528 mirror_openfail_console_info(un, smi, ci); 5529 return (ENXIO); 5530 } 5531 } 5532 5533 /* 5534 * Note that for a MN set, there is no need to call 5535 * SE_NOTIFY as that is done when processing the 5536 * state change 5537 */ 5538 if (md_devopen) { 5539 /* Never called from ioctl context, so (IOLOCK *)NULL */ 5540 set_sm_comp_state(un, smi, ci, CS_ERRED, 0, 5541 MD_STATE_XMIT, (IOLOCK *)NULL); 5542 if (!MD_MNSET_SETNO(setno)) { 5543 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, 5544 SVM_TAG_METADEVICE, setno, 5545 MD_SID(un)); 5546 } 5547 } 5548 mirror_openfail_console_info(un, smi, ci); 5549 ci++; 5550 } 5551 5552 if (MD_MNSET_SETNO(setno)) { 5553 send_poke_hotspares(setno); 5554 } else { 5555 (void) poke_hotspares(); 5556 } 5557 (void) mirror_close_all_devs(un, MD_OFLG_PROBEDEV); 5558 5559 return (0); 5560 } 5561 5562 5563 static int 5564 mirror_imp_set( 5565 set_t setno 5566 ) 5567 { 5568 5569 mddb_recid_t recid; 5570 int gotsomething, i; 5571 mddb_type_t typ1; 5572 mddb_de_ic_t *dep; 5573 mddb_rb32_t *rbp; 5574 mm_unit32_od_t *un32; 5575 mm_unit_t *un64; 5576 md_dev64_t self_devt; 5577 minor_t *self_id; /* minor needs to be updated */ 5578 md_parent_t *parent_id; /* parent needs to be updated */ 5579 mddb_recid_t *record_id; /* record id needs to be updated */ 5580 mddb_recid_t *optrec_id; 5581 md_dev64_t tmpdev; 5582 5583 5584 gotsomething = 0; 5585 5586 typ1 = (mddb_type_t)md_getshared_key(setno, 5587 mirror_md_ops.md_driver.md_drivername); 5588 recid = mddb_makerecid(setno, 0); 5589 5590 while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) { 5591 if (mddb_getrecprivate(recid) & MD_PRV_GOTIT) 5592 continue; 5593 5594 dep = mddb_getrecdep(recid); 5595 rbp = dep->de_rb; 5596 5597 switch (rbp->rb_revision) { 5598 case MDDB_REV_RB: 5599 case MDDB_REV_RBFN: 5600 /* 5601 * Small device 5602 */ 5603 un32 = (mm_unit32_od_t *)mddb_getrecaddr(recid); 5604 self_id = &(un32->c.un_self_id); 5605 parent_id = &(un32->c.un_parent); 5606 record_id = &(un32->c.un_record_id); 5607 optrec_id = &(un32->un_rr_dirty_recid); 5608 5609 
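			/*
			 * Walk the submirrors: rebuild each sm_dev so that
			 * its minor number carries the imported set (the
			 * unit part of the old minor is kept), then update
			 * the namespace entry via md_update_minor().  The
			 * 64-bit case below does the same without the
			 * 32/64-bit dev conversions.
			 */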
for (i = 0; i < un32->un_nsm; i++) { 5610 tmpdev = md_expldev(un32->un_sm[i].sm_dev); 5611 un32->un_sm[i].sm_dev = md_cmpldev 5612 (md_makedevice(md_major, MD_MKMIN(setno, 5613 MD_MIN2UNIT(md_getminor(tmpdev))))); 5614 5615 if (!md_update_minor(setno, mddb_getsidenum 5616 (setno), un32->un_sm[i].sm_key)) 5617 goto out; 5618 } 5619 break; 5620 case MDDB_REV_RB64: 5621 case MDDB_REV_RB64FN: 5622 un64 = (mm_unit_t *)mddb_getrecaddr(recid); 5623 self_id = &(un64->c.un_self_id); 5624 parent_id = &(un64->c.un_parent); 5625 record_id = &(un64->c.un_record_id); 5626 optrec_id = &(un64->un_rr_dirty_recid); 5627 5628 for (i = 0; i < un64->un_nsm; i++) { 5629 tmpdev = un64->un_sm[i].sm_dev; 5630 un64->un_sm[i].sm_dev = md_makedevice 5631 (md_major, MD_MKMIN(setno, MD_MIN2UNIT 5632 (md_getminor(tmpdev)))); 5633 5634 if (!md_update_minor(setno, mddb_getsidenum 5635 (setno), un64->un_sm[i].sm_key)) 5636 goto out; 5637 } 5638 break; 5639 } 5640 5641 /* 5642 * If this is a top level and a friendly name metadevice, 5643 * update its minor in the namespace. 5644 */ 5645 if ((*parent_id == MD_NO_PARENT) && 5646 ((rbp->rb_revision == MDDB_REV_RBFN) || 5647 (rbp->rb_revision == MDDB_REV_RB64FN))) { 5648 5649 self_devt = md_makedevice(md_major, *self_id); 5650 if (!md_update_top_device_minor(setno, 5651 mddb_getsidenum(setno), self_devt)) 5652 goto out; 5653 } 5654 5655 /* 5656 * Update unit with the imported setno 5657 * 5658 */ 5659 mddb_setrecprivate(recid, MD_PRV_GOTIT); 5660 5661 *self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id)); 5662 if (*parent_id != MD_NO_PARENT) 5663 *parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id)); 5664 *record_id = MAKERECID(setno, DBID(*record_id)); 5665 *optrec_id = MAKERECID(setno, DBID(*optrec_id)); 5666 5667 gotsomething = 1; 5668 } 5669 5670 out: 5671 return (gotsomething); 5672 } 5673 5674 /* 5675 * NAME: mirror_check_offline 5676 * 5677 * DESCRIPTION: return offline_status = 1 if any submirrors are offline 5678 * 5679 * Called from ioctl, so access to MD_UN_OFFLINE_SM in un_status is 5680 * protected by the global ioctl lock as it is only set by the MD_IOCOFFLINE 5681 * ioctl. 5682 */ 5683 int 5684 mirror_check_offline(md_dev64_t dev, int *offline_status) 5685 { 5686 mm_unit_t *un; 5687 md_error_t mde = mdnullerror; 5688 5689 if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL) 5690 return (EINVAL); 5691 *offline_status = 0; 5692 if (un->c.un_status & MD_UN_OFFLINE_SM) 5693 *offline_status = 1; 5694 return (0); 5695 } 5696 5697 /* 5698 * NAME: mirror_inc_abr_count 5699 * 5700 * DESCRIPTION: increment the count of layered soft parts with ABR set 5701 * 5702 * Called from ioctl, so access to un_abr_count is protected by the global 5703 * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl. 5704 */ 5705 int 5706 mirror_inc_abr_count(md_dev64_t dev) 5707 { 5708 mm_unit_t *un; 5709 md_error_t mde = mdnullerror; 5710 5711 if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL) 5712 return (EINVAL); 5713 un->un_abr_count++; 5714 return (0); 5715 } 5716 5717 /* 5718 * NAME: mirror_dec_abr_count 5719 * 5720 * DESCRIPTION: decrement the count of layered soft parts with ABR set 5721 * 5722 * Called from ioctl, so access to un_abr_count is protected by the global 5723 * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl. 
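 *
 * Like mirror_inc_abr_count() above, this routine is exported through
 * the mirror_named_services[] table (MD_DEC_ABR_COUNT), presumably so
 * that the layered soft-partition code which maintains this count can
 * look it up by name rather than calling into this module directly.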
5724 */ 5725 int 5726 mirror_dec_abr_count(md_dev64_t dev) 5727 { 5728 mm_unit_t *un; 5729 md_error_t mde = mdnullerror; 5730 5731 if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL) 5732 return (EINVAL); 5733 un->un_abr_count--; 5734 return (0); 5735 } 5736 5737 static md_named_services_t mirror_named_services[] = { 5738 {(intptr_t (*)()) poke_hotspares, "poke hotspares" }, 5739 {(intptr_t (*)()) mirror_rename_listkids, MDRNM_LIST_URKIDS }, 5740 {mirror_rename_check, MDRNM_CHECK }, 5741 {(intptr_t (*)()) mirror_renexch_update_kids, MDRNM_UPDATE_KIDS }, 5742 {(intptr_t (*)()) mirror_exchange_parent_update_to, 5743 MDRNM_PARENT_UPDATE_TO}, 5744 {(intptr_t (*)()) mirror_exchange_self_update_from_down, 5745 MDRNM_SELF_UPDATE_FROM_DOWN }, 5746 {(intptr_t (*)())mirror_probe_dev, "probe open test" }, 5747 {(intptr_t (*)())mirror_check_offline, MD_CHECK_OFFLINE }, 5748 {(intptr_t (*)())mirror_inc_abr_count, MD_INC_ABR_COUNT }, 5749 {(intptr_t (*)())mirror_dec_abr_count, MD_DEC_ABR_COUNT }, 5750 { NULL, 0 } 5751 }; 5752 5753 md_ops_t mirror_md_ops = { 5754 mirror_open, /* open */ 5755 mirror_close, /* close */ 5756 md_mirror_strategy, /* strategy */ 5757 NULL, /* print */ 5758 mirror_dump, /* dump */ 5759 NULL, /* read */ 5760 NULL, /* write */ 5761 md_mirror_ioctl, /* mirror_ioctl, */ 5762 mirror_snarf, /* mirror_snarf */ 5763 mirror_halt, /* mirror_halt */ 5764 NULL, /* aread */ 5765 NULL, /* awrite */ 5766 mirror_imp_set, /* import set */ 5767 mirror_named_services 5768 }; 5769 5770 /* module specific initialization */ 5771 static void 5772 init_init() 5773 { 5774 md_mirror_mcs_buf_off = sizeof (md_mcs_t) - sizeof (buf_t); 5775 5776 /* Initialize the parent and child save memory pools */ 5777 mirror_parent_cache = kmem_cache_create("md_mirror_parent", 5778 sizeof (md_mps_t), 0, mirror_parent_constructor, 5779 mirror_parent_destructor, mirror_run_queue, NULL, NULL, 5780 0); 5781 5782 mirror_child_cache = kmem_cache_create("md_mirror_child", 5783 sizeof (md_mcs_t) - sizeof (buf_t) + biosize(), 0, 5784 mirror_child_constructor, mirror_child_destructor, 5785 mirror_run_queue, NULL, NULL, 0); 5786 5787 /* 5788 * Ensure wowbuf_size is a multiple of DEV_BSIZE, 5789 * then initialize the wowbuf memory pool. 5790 */ 5791 md_wowbuf_size = roundup(md_wowbuf_size, DEV_BSIZE); 5792 if (md_wowbuf_size <= 0) 5793 md_wowbuf_size = 2 * DEV_BSIZE; 5794 if (md_wowbuf_size > (32 * DEV_BSIZE)) 5795 md_wowbuf_size = (32 * DEV_BSIZE); 5796 5797 md_wowblk_size = md_wowbuf_size + sizeof (wowhdr_t); 5798 mirror_wowblk_cache = kmem_cache_create("md_mirror_wow", 5799 md_wowblk_size, 0, NULL, NULL, NULL, NULL, NULL, 0); 5800 5801 mutex_init(&mirror_timeout.dr_mx, NULL, MUTEX_DEFAULT, NULL); 5802 mutex_init(&hotspare_request.dr_mx, NULL, MUTEX_DEFAULT, NULL); 5803 5804 mutex_init(&non_ff_drv_mutex, NULL, MUTEX_DEFAULT, NULL); 5805 } 5806 5807 /* module specific uninitialization (undo init_init()) */ 5808 static void 5809 fini_uninit() 5810 { 5811 kmem_cache_destroy(mirror_parent_cache); 5812 kmem_cache_destroy(mirror_child_cache); 5813 kmem_cache_destroy(mirror_wowblk_cache); 5814 mirror_parent_cache = mirror_child_cache = 5815 mirror_wowblk_cache = NULL; 5816 5817 mutex_destroy(&mirror_timeout.dr_mx); 5818 mutex_destroy(&hotspare_request.dr_mx); 5819 mutex_destroy(&non_ff_drv_mutex); 5820 } 5821 5822 /* define the module linkage */ 5823 MD_PLUGIN_MISC_MODULE("mirrors module", init_init(), fini_uninit()) 5824