1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <sys/param.h> 28 #include <sys/systm.h> 29 #include <sys/conf.h> 30 #include <sys/file.h> 31 #include <sys/user.h> 32 #include <sys/uio.h> 33 #include <sys/t_lock.h> 34 #include <sys/buf.h> 35 #include <sys/dkio.h> 36 #include <sys/vtoc.h> 37 #include <sys/kmem.h> 38 #include <vm/page.h> 39 #include <sys/cmn_err.h> 40 #include <sys/sysmacros.h> 41 #include <sys/types.h> 42 #include <sys/mkdev.h> 43 #include <sys/stat.h> 44 #include <sys/open.h> 45 #include <sys/modctl.h> 46 #include <sys/ddi.h> 47 #include <sys/sunddi.h> 48 #include <sys/debug.h> 49 #include <sys/dklabel.h> 50 #include <vm/hat.h> 51 #include <sys/lvm/mdvar.h> 52 #include <sys/lvm/md_mirror.h> 53 #include <sys/lvm/md_convert.h> 54 #include <sys/lvm/md_mddb.h> 55 #include <sys/esunddi.h> 56 57 #include <sys/sysevent/eventdefs.h> 58 #include <sys/sysevent/svm.h> 59 #include <sys/lvm/mdmn_commd.h> 60 #include <sys/avl.h> 61 62 md_ops_t mirror_md_ops; 63 #ifndef lint 64 char _depends_on[] = "drv/md"; 65 md_ops_t *md_interface_ops = &mirror_md_ops; 66 #endif 67 68 extern mdq_anchor_t md_done_daemon; 69 extern mdq_anchor_t md_mstr_daemon; 70 extern mdq_anchor_t md_mirror_daemon; 71 extern mdq_anchor_t md_mirror_io_daemon; 72 extern mdq_anchor_t md_mirror_rs_daemon; 73 extern mdq_anchor_t md_mhs_daemon; 74 75 extern unit_t md_nunits; 76 extern set_t md_nsets; 77 extern md_set_t md_set[]; 78 79 extern int md_status; 80 extern clock_t md_hz; 81 82 extern md_krwlock_t md_unit_array_rw; 83 extern kmutex_t md_mx; 84 extern kcondvar_t md_cv; 85 extern int md_mtioctl_cnt; 86 87 daemon_request_t mirror_timeout; 88 static daemon_request_t hotspare_request; 89 static daemon_request_t mn_hs_request[MD_MAXSETS]; /* Multinode hs req */ 90 91 int md_mirror_mcs_buf_off; 92 93 /* Flags for mdmn_ksend_message to allow debugging */ 94 int md_mirror_msg_flags; 95 96 #ifdef DEBUG 97 /* Flag to switch on debug messages */ 98 int mirror_debug_flag = 0; 99 #endif 100 101 /* 102 * Struct used to hold count of DMR reads and the timestamp of last DMR read 103 * It is used to verify, using a debugger, that the DMR read ioctl has been 104 * executed. 105 */ 106 dmr_stats_t mirror_dmr_stats = {0, 0}; 107 108 /* 109 * Mutex protecting list of non-failfast drivers. 110 */ 111 static kmutex_t non_ff_drv_mutex; 112 extern char **non_ff_drivers; 113 114 extern major_t md_major; 115 116 /* 117 * Write-On-Write memory pool. 
118 */ 119 static void copy_write_cont(wowhdr_t *wowhdr); 120 static kmem_cache_t *mirror_wowblk_cache = NULL; 121 static int md_wowbuf_size = 16384; 122 static size_t md_wowblk_size; 123 124 /* 125 * This is a flag that allows: 126 * - disabling the write-on-write mechanism. 127 * - logging occurrences of write-on-write 128 * - switching wow handling procedure processing 129 * Counter for occurences of WOW. 130 */ 131 static uint_t md_mirror_wow_flg = 0; 132 static int md_mirror_wow_cnt = 0; 133 134 /* 135 * Tunable to enable/disable dirty region 136 * processing when closing down a mirror. 137 */ 138 static int new_resync = 1; 139 kmem_cache_t *mirror_parent_cache = NULL; 140 kmem_cache_t *mirror_child_cache = NULL; 141 142 extern int md_ff_disable; /* disable failfast */ 143 144 static int mirror_map_write(mm_unit_t *, md_mcs_t *, md_mps_t *, int); 145 static void mirror_read_strategy(buf_t *, int, void *); 146 static void mirror_write_strategy(buf_t *, int, void *); 147 static void become_owner(daemon_queue_t *); 148 static int mirror_done(struct buf *cb); 149 static int mirror_done_common(struct buf *cb); 150 static void clear_retry_error(struct buf *cb); 151 152 /* 153 * patchables 154 */ 155 int md_min_rr_size = 200; /* 2000 blocks, or 100k */ 156 int md_def_num_rr = 1000; /* Default number of dirty regions */ 157 158 /* 159 * patchable to change delay before rescheduling mirror ownership request. 160 * Value is clock ticks, default 0.5 seconds 161 */ 162 clock_t md_mirror_owner_to = 500000; 163 164 /*ARGSUSED1*/ 165 static int 166 mirror_parent_constructor(void *p, void *d1, int d2) 167 { 168 mutex_init(&((md_mps_t *)p)->ps_mx, NULL, MUTEX_DEFAULT, NULL); 169 return (0); 170 } 171 172 static void 173 mirror_parent_init(md_mps_t *ps) 174 { 175 bzero(ps, offsetof(md_mps_t, ps_mx)); 176 } 177 178 /*ARGSUSED1*/ 179 static void 180 mirror_parent_destructor(void *p, void *d) 181 { 182 mutex_destroy(&((md_mps_t *)p)->ps_mx); 183 } 184 185 /*ARGSUSED1*/ 186 static int 187 mirror_child_constructor(void *p, void *d1, int d2) 188 { 189 bioinit(&((md_mcs_t *)p)->cs_buf); 190 return (0); 191 } 192 193 void 194 mirror_child_init(md_mcs_t *cs) 195 { 196 cs->cs_ps = NULL; 197 cs->cs_mdunit = 0; 198 md_bioreset(&cs->cs_buf); 199 } 200 201 /*ARGSUSED1*/ 202 static void 203 mirror_child_destructor(void *p, void *d) 204 { 205 biofini(&((md_mcs_t *)p)->cs_buf); 206 } 207 208 static void 209 mirror_wowblk_init(wowhdr_t *p) 210 { 211 bzero(p, md_wowblk_size); 212 } 213 214 static void 215 send_poke_hotspares_msg(daemon_request_t *drq) 216 { 217 int rval; 218 md_mn_msg_pokehsp_t pokehsp; 219 md_mn_kresult_t *kresult; 220 set_t setno = (set_t)drq->dq.qlen; 221 222 pokehsp.pokehsp_setno = setno; 223 224 kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); 225 rval = mdmn_ksend_message(setno, MD_MN_MSG_POKE_HOTSPARES, 226 MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST, (char *)&pokehsp, 227 sizeof (pokehsp), kresult); 228 229 if (!MDMN_KSEND_MSG_OK(rval, kresult)) { 230 mdmn_ksend_show_error(rval, kresult, "POKE_HOTSPARES"); 231 cmn_err(CE_PANIC, 232 "ksend_message failure: POKE_HOTSPARES"); 233 } 234 kmem_free(kresult, sizeof (md_mn_kresult_t)); 235 236 /* Allow further requests to use this set's queue structure */ 237 mutex_enter(&drq->dr_mx); 238 drq->dr_pending = 0; 239 mutex_exit(&drq->dr_mx); 240 } 241 242 /* 243 * Send a poke_hotspares message to the master node. To avoid swamping the 244 * commd handler with requests we only send a message if there is not one 245 * already outstanding. 
We punt the request to a separate thread context as 246 * cannot afford to block waiting on the request to be serviced. This is 247 * essential when a reconfig cycle is in progress as any open() of a multinode 248 * metadevice may result in a livelock. 249 */ 250 static void 251 send_poke_hotspares(set_t setno) 252 { 253 daemon_request_t *drq = &mn_hs_request[setno]; 254 255 mutex_enter(&drq->dr_mx); 256 if (drq->dr_pending == 0) { 257 drq->dr_pending = 1; 258 drq->dq.qlen = (int)setno; 259 daemon_request(&md_mhs_daemon, 260 send_poke_hotspares_msg, (daemon_queue_t *)drq, REQ_OLD); 261 } 262 mutex_exit(&drq->dr_mx); 263 } 264 265 void 266 mirror_set_sm_state( 267 mm_submirror_t *sm, 268 mm_submirror_ic_t *smic, 269 sm_state_t newstate, 270 int force) 271 { 272 int compcnt; 273 int i; 274 int errcnt; 275 sm_state_t origstate; 276 md_m_shared_t *shared; 277 278 if (force) { 279 sm->sm_state = newstate; 280 uniqtime32(&sm->sm_timestamp); 281 return; 282 } 283 284 origstate = newstate; 285 286 compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm); 287 for (i = 0, errcnt = 0; i < compcnt; i++) { 288 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) 289 (sm->sm_dev, sm, i); 290 if (shared->ms_state & (CS_ERRED | CS_LAST_ERRED)) 291 newstate |= SMS_COMP_ERRED; 292 if (shared->ms_state & (CS_RESYNC)) 293 newstate |= SMS_COMP_RESYNC; 294 if (shared->ms_state & CS_ERRED) 295 errcnt++; 296 } 297 298 if ((newstate & (SMS_COMP_ERRED | SMS_COMP_RESYNC)) != 0) 299 newstate &= ~origstate; 300 301 if (errcnt == compcnt) 302 newstate |= SMS_ALL_ERRED; 303 else 304 newstate &= ~SMS_ALL_ERRED; 305 306 sm->sm_state = newstate; 307 uniqtime32(&sm->sm_timestamp); 308 } 309 310 static int 311 mirror_geterror(mm_unit_t *un, int *smi, int *cip, int clr_error, 312 int frm_probe) 313 { 314 mm_submirror_t *sm; 315 mm_submirror_ic_t *smic; 316 md_m_shared_t *shared; 317 int ci; 318 int i; 319 int compcnt; 320 int open_comp; /* flag for open component */ 321 322 for (i = *smi; i < NMIRROR; i++) { 323 sm = &un->un_sm[i]; 324 smic = &un->un_smic[i]; 325 326 if (!SMS_IS(sm, SMS_INUSE)) 327 continue; 328 329 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un); 330 for (ci = *cip; ci < compcnt; ci++) { 331 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) 332 (sm->sm_dev, sm, ci); 333 /* 334 * if called from any routine but probe, we check for 335 * MDM_S_ISOPEN flag. Since probe does a pseduo open, 336 * it sets MDM_S_PROBEOPEN flag and we test for this 337 * flag. They are both exclusive tests. 338 */ 339 open_comp = (frm_probe) ? 340 (shared->ms_flags & MDM_S_PROBEOPEN): 341 (shared->ms_flags & MDM_S_ISOPEN); 342 if ((shared->ms_flags & MDM_S_IOERR || !open_comp) && 343 ((shared->ms_state == CS_OKAY) || 344 (shared->ms_state == CS_RESYNC))) { 345 if (clr_error) { 346 shared->ms_flags &= ~MDM_S_IOERR; 347 } 348 *cip = ci; 349 *smi = i; 350 return (1); 351 } 352 353 if (clr_error && (shared->ms_flags & MDM_S_IOERR)) { 354 shared->ms_flags &= ~MDM_S_IOERR; 355 } 356 } 357 358 *cip = 0; 359 } 360 return (0); 361 } 362 363 /*ARGSUSED*/ 364 static void 365 mirror_run_queue(void *d) 366 { 367 if (!(md_status & MD_GBL_DAEMONS_LIVE)) 368 md_daemon(1, &md_done_daemon); 369 } 370 /* 371 * check_comp_4_hotspares 372 * 373 * This function attempts to allocate a hotspare for this component if the 374 * component is in error. In a MN set, the function can be called in 2 modes. 375 * It can be called either when a component error has been detected or when a 376 * new hotspare has been allocated. 
In this case, MD_HOTSPARE_XMIT is set 377 * in flags and the request is sent to all nodes. 378 * The handler on each of the nodes then calls this function with 379 * MD_HOTSPARE_XMIT unset and the hotspare allocation is then performed. 380 * 381 * For non-MN sets the function simply attempts to allocate a hotspare. 382 * 383 * On entry, the following locks are held 384 * mirror_md_ops.md_link_rw (if flags has MD_HOTSPARE_LINKHELD set) 385 * md_unit_writerlock 386 * 387 * Returns 0 if ok 388 * 1 if the unit containing the component has been cleared while 389 * the mdmn_ksend_message() was being executed 390 */ 391 extern int 392 check_comp_4_hotspares( 393 mm_unit_t *un, 394 int smi, 395 int ci, 396 uint_t flags, 397 mddb_recid_t hs_id, /* Only used by MN disksets */ 398 IOLOCK *lockp /* can be NULL */ 399 ) 400 { 401 mm_submirror_t *sm; 402 mm_submirror_ic_t *smic; 403 md_m_shared_t *shared; 404 mddb_recid_t recids[6]; 405 minor_t mnum; 406 intptr_t (*hs_dev)(); 407 void (*hs_done)(); 408 void *hs_data; 409 md_error_t mde = mdnullerror; 410 set_t setno; 411 md_mn_msg_allochsp_t allochspmsg; 412 md_mn_kresult_t *kresult; 413 mm_unit_t *new_un; 414 int rval; 415 416 mnum = MD_SID(un); 417 setno = MD_UN2SET(un); 418 sm = &un->un_sm[smi]; 419 smic = &un->un_smic[smi]; 420 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) 421 (sm->sm_dev, sm, ci); 422 423 if (shared->ms_state != CS_ERRED) 424 return (0); 425 426 /* Don't start a new component resync if a resync is already running. */ 427 if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) 428 return (0); 429 430 if (MD_MNSET_SETNO(setno) && (flags & MD_HOTSPARE_XMIT)) { 431 uint_t msgflags; 432 md_mn_msgtype_t msgtype; 433 434 /* Send allocate hotspare message to all nodes */ 435 436 allochspmsg.msg_allochsp_mnum = un->c.un_self_id; 437 allochspmsg.msg_allochsp_sm = smi; 438 allochspmsg.msg_allochsp_comp = ci; 439 allochspmsg.msg_allochsp_hs_id = shared->ms_hs_id; 440 441 /* 442 * Before calling mdmn_ksend_message(), release locks 443 * Can never be in the context of an ioctl. 444 */ 445 md_unit_writerexit(MDI_UNIT(mnum)); 446 if (flags & MD_HOTSPARE_LINKHELD) 447 rw_exit(&mirror_md_ops.md_link_rw.lock); 448 #ifdef DEBUG 449 if (mirror_debug_flag) 450 printf("send alloc hotspare, flags=" 451 "0x%x %x, %x, %x, %x\n", flags, 452 allochspmsg.msg_allochsp_mnum, 453 allochspmsg.msg_allochsp_sm, 454 allochspmsg.msg_allochsp_comp, 455 allochspmsg.msg_allochsp_hs_id); 456 #endif 457 if (flags & MD_HOTSPARE_WMUPDATE) { 458 msgtype = MD_MN_MSG_ALLOCATE_HOTSPARE2; 459 /* 460 * When coming from an update of watermarks, there 461 * must already be a message logged that triggered 462 * this action. So, no need to log this message, too. 463 */ 464 msgflags = MD_MSGF_NO_LOG; 465 } else { 466 msgtype = MD_MN_MSG_ALLOCATE_HOTSPARE; 467 msgflags = MD_MSGF_DEFAULT_FLAGS; 468 } 469 470 kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); 471 rval = mdmn_ksend_message(setno, msgtype, msgflags, 472 (char *)&allochspmsg, sizeof (allochspmsg), 473 kresult); 474 475 if (!MDMN_KSEND_MSG_OK(rval, kresult)) { 476 #ifdef DEBUG 477 if (mirror_debug_flag) 478 mdmn_ksend_show_error(rval, kresult, 479 "ALLOCATE HOTSPARE"); 480 #endif 481 /* 482 * If message is sent ok but exitval indicates an error 483 * it must be because the mirror has been cleared. 
In 484 * this case re-obtain lock and return an error 485 */ 486 if ((rval == 0) && (kresult->kmmr_exitval != 0)) { 487 if (flags & MD_HOTSPARE_LINKHELD) { 488 rw_enter(&mirror_md_ops.md_link_rw.lock, 489 RW_READER); 490 } 491 kmem_free(kresult, sizeof (md_mn_kresult_t)); 492 return (1); 493 } 494 cmn_err(CE_PANIC, 495 "ksend_message failure: ALLOCATE_HOTSPARE"); 496 } 497 kmem_free(kresult, sizeof (md_mn_kresult_t)); 498 499 /* 500 * re-obtain the locks 501 */ 502 if (flags & MD_HOTSPARE_LINKHELD) 503 rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER); 504 new_un = md_unit_writerlock(MDI_UNIT(mnum)); 505 506 /* 507 * As we had to release the locks in order to send the 508 * message to all nodes, we need to check to see if the 509 * unit has changed. If it has we release the writerlock 510 * and return fail. 511 */ 512 if ((new_un != un) || (un->c.un_type != MD_METAMIRROR)) { 513 md_unit_writerexit(MDI_UNIT(mnum)); 514 return (1); 515 } 516 } else { 517 if (MD_MNSET_SETNO(setno)) { 518 /* 519 * If 2 or more nodes simultaneously see a 520 * component failure, these nodes will each 521 * send an ALLOCATE_HOTSPARE[2] message. 522 * The first message will allocate the hotspare 523 * and the subsequent messages should do nothing. 524 * 525 * If a slave node doesn't have a hotspare allocated 526 * at the time the message is initiated, then the 527 * passed in hs_id will be 0. If the node 528 * executing this routine has a component shared 529 * ms_hs_id of non-zero, but the message shows a 530 * hs_id of 0, then just return since a hotspare 531 * has already been allocated for this failing 532 * component. When the slave node returns from 533 * the ksend_message the hotspare will have 534 * already been allocated. 535 * 536 * If the slave node does send an hs_id of non-zero, 537 * and the slave node's hs_id matches this node's 538 * ms_hs_id, then the hotspare has error'd and 539 * should be replaced. 540 * 541 * If the slave node sends an hs_id of non-zero and 542 * this node has a different shared ms_hs_id, then 543 * just return since this hotspare has already 544 * been hotspared. 545 */ 546 if (shared->ms_hs_id != 0) { 547 if (hs_id == 0) { 548 #ifdef DEBUG 549 if (mirror_debug_flag) { 550 printf("check_comp_4_hotspares" 551 "(NOXMIT), short circuit " 552 "hs_id=0x%x, " 553 "ms_hs_id=0x%x\n", 554 hs_id, shared->ms_hs_id); 555 } 556 #endif 557 return (0); 558 } 559 if (hs_id != shared->ms_hs_id) { 560 #ifdef DEBUG 561 if (mirror_debug_flag) { 562 printf("check_comp_4_hotspares" 563 "(NOXMIT), short circuit2 " 564 "hs_id=0x%x, " 565 "ms_hs_id=0x%x\n", 566 hs_id, shared->ms_hs_id); 567 } 568 #endif 569 return (0); 570 } 571 } 572 } 573 574 sm = &un->un_sm[smi]; 575 hs_dev = md_get_named_service(sm->sm_dev, 0, 576 "hotspare device", 0); 577 if ((*hs_dev)(sm->sm_dev, 0, ci, recids, 6, &hs_done, 578 &hs_data) != 0) 579 return (0); 580 581 /* 582 * set_sm_comp_state() commits the modified records. 583 * As we don't transmit the changes, no need to drop the lock. 584 */ 585 set_sm_comp_state(un, smi, ci, CS_RESYNC, recids, 586 MD_STATE_NO_XMIT, (IOLOCK *)NULL); 587 588 (*hs_done)(sm->sm_dev, hs_data); 589 590 mirror_check_failfast(mnum); 591 592 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_HOTSPARED, SVM_TAG_METADEVICE, 593 setno, MD_SID(un)); 594 595 /* 596 * For a multi-node set we need to reset the un_rs_type, 597 * un_rs_resync_done and un_rs_resync_2_do fields as the 598 * hot-spare resync must copy all applicable data. 
599 */ 600 if (MD_MNSET_SETNO(setno)) { 601 un->un_rs_type = MD_RS_NONE; 602 un->un_rs_resync_done = 0; 603 un->un_rs_resync_2_do = 0; 604 } 605 606 /* 607 * Must drop writer lock since mirror_resync_unit will 608 * open devices and must be able to grab readerlock. 609 * Don't need to drop IOLOCK since any descendent routines 610 * calling ksend_messages will drop the IOLOCK as needed. 611 * 612 */ 613 if (lockp) { 614 md_ioctl_writerexit(lockp); 615 } else { 616 md_unit_writerexit(MDI_UNIT(mnum)); 617 } 618 619 /* start resync */ 620 (void) mirror_resync_unit(mnum, NULL, &mde, lockp); 621 622 if (lockp) { 623 new_un = md_ioctl_writerlock(lockp, MDI_UNIT(mnum)); 624 } else { 625 new_un = md_unit_writerlock(MDI_UNIT(mnum)); 626 } 627 } 628 return (0); 629 } 630 631 /* 632 * check_unit_4_hotspares 633 * 634 * For a given mirror, allocate hotspares, if available for any components 635 * that are in error 636 * 637 * Returns 0 if ok 638 * 1 if check_comp_4_hotspares returns non-zero. This will only 639 * happen for a MN unit where the unit has been cleared while 640 * the allocate hotspare message is sent to all nodes. 641 */ 642 static int 643 check_unit_4_hotspares(mm_unit_t *un, int flags) 644 { 645 mm_submirror_t *sm; 646 mm_submirror_ic_t *smic; 647 int ci; 648 int i; 649 int compcnt; 650 651 if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) 652 return (0); 653 654 for (i = 0; i < NMIRROR; i++) { 655 sm = &un->un_sm[i]; 656 smic = &un->un_smic[i]; 657 if (!SMS_IS(sm, SMS_INUSE)) 658 continue; 659 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, sm); 660 for (ci = 0; ci < compcnt; ci++) { 661 md_m_shared_t *shared; 662 663 shared = (md_m_shared_t *) 664 (*(smic->sm_shared_by_indx))(sm->sm_dev, sm, ci); 665 /* 666 * Never called from ioctl context, so pass in 667 * (IOLOCK *)NULL. Pass through flags from calling 668 * routine, also setting XMIT flag. 669 */ 670 if (check_comp_4_hotspares(un, i, ci, 671 (MD_HOTSPARE_XMIT | flags), 672 shared->ms_hs_id, (IOLOCK *)NULL) != 0) 673 return (1); 674 } 675 } 676 return (0); 677 } 678 679 static void 680 check_4_hotspares(daemon_request_t *drq) 681 { 682 mdi_unit_t *ui; 683 mm_unit_t *un; 684 md_link_t *next; 685 int x; 686 687 mutex_enter(&drq->dr_mx); /* clear up front so can poke */ 688 drq->dr_pending = 0; /* again in low level routine if */ 689 mutex_exit(&drq->dr_mx); /* something found to do */ 690 691 /* 692 * Used to have a problem here. The disksets weren't marked as being 693 * MNHOLD. This opened a window where we could be searching for 694 * hotspares and have the disk set unloaded (released) from under 695 * us causing a panic in stripe_component_count(). 696 * The way to prevent that is to mark the set MNHOLD which prevents 697 * any diskset from being released while we are scanning the mirrors, 698 * submirrors and components. 
	 */

	for (x = 0; x < md_nsets; x++)
		md_holdset_enter(x);

	rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
	for (next = mirror_md_ops.md_head; next != NULL; next = next->ln_next) {
		ui = MDI_UNIT(next->ln_id);

		un = (mm_unit_t *)md_unit_readerlock(ui);

		/*
		 * Only check the unit if we are the master for this set.
		 * For an MN set, poke_hotspares() is only effective on the
		 * master.
		 */
		if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
		    md_set[MD_UN2SET(un)].s_am_i_master == 0) {
			md_unit_readerexit(ui);
			continue;
		}
		if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) {
			md_unit_readerexit(ui);
			continue;
		}
		md_unit_readerexit(ui);

		un = (mm_unit_t *)md_unit_writerlock(ui);
		/*
		 * check_unit_4_hotspares will return 1 if the unit has been
		 * removed during the process of allocating the hotspare.
		 * This can only happen for a MN metadevice. If the unit no
		 * longer exists there is no need to release the writerlock.
		 */
		if (check_unit_4_hotspares(un, MD_HOTSPARE_LINKHELD) == 0)
			md_unit_writerexit(ui);
		else {
			/*
			 * If check_unit_4_hotspares failed, queue another
			 * request and break out of this one.
			 */
			(void) poke_hotspares();
			break;
		}
	}
	rw_exit(&mirror_md_ops.md_link_rw.lock);

	for (x = 0; x < md_nsets; x++)
		md_holdset_exit(x);
}

/*
 * poke_hotspares
 *
 * If a poke_hotspares request is not already pending, queue a request to
 * call check_4_hotspares(). This will scan all mirrors and attempt to
 * allocate hotspares for all components in error.
 */
int
poke_hotspares()
{
	mutex_enter(&hotspare_request.dr_mx);
	if (hotspare_request.dr_pending == 0) {
		hotspare_request.dr_pending = 1;
		daemon_request(&md_mhs_daemon,
		    check_4_hotspares, (daemon_queue_t *)&hotspare_request,
		    REQ_OLD);
	}
	mutex_exit(&hotspare_request.dr_mx);
	return (0);
}

static void
free_all_ecomps(err_comp_t *ecomp)
{
	err_comp_t *d;

	while (ecomp != NULL) {
		d = ecomp;
		ecomp = ecomp->ec_next;
		kmem_free(d, sizeof (err_comp_t));
	}
}

/*
 * NAME: mirror_openfail_console_info
 *
 * DESCRIPTION: Prints an informative message to the console when a mirror
 *		cannot be opened.
 *
 * PARAMETERS: mm_unit_t un - pointer to mirror unit structure
 *	       int smi - submirror index
 *	       int ci - component index
 */

void
mirror_openfail_console_info(mm_unit_t *un, int smi, int ci)
{
	void (*get_dev)();
	ms_cd_info_t cd;
	md_dev64_t tmpdev;

	tmpdev = un->un_sm[smi].sm_dev;
	get_dev = (void (*)())md_get_named_service(tmpdev, 0, "get device", 0);
	if (get_dev != NULL) {
		(void) (*get_dev)(tmpdev, smi, ci, &cd);
		cmn_err(CE_WARN, "md %s: open error on %s",
		    md_shortname(MD_SID(un)), md_devname(MD_UN2SET(un),
		    cd.cd_dev, NULL, 0));
	} else {
		cmn_err(CE_WARN, "md %s: open error",
		    md_shortname(MD_SID(un)));
	}
}

static int
mirror_close_all_devs(mm_unit_t *un, int md_cflags)
{
	int i;
	md_dev64_t dev;

	for (i = 0; i < NMIRROR; i++) {
		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
			continue;
		dev = un->un_sm[i].sm_dev;
		md_layered_close(dev, md_cflags);
	}
	return (0);
}

/*
 * Keep track of drivers that don't support failfast.
We use this so that 831 * we only log one diagnostic message for each of these drivers, no matter 832 * how many times we run the mirror_check_failfast function. 833 * Return 1 if this is a new driver that does not support failfast, 834 * return 0 if we have already seen this non-failfast driver. 835 */ 836 static int 837 new_non_ff_driver(const char *s) 838 { 839 mutex_enter(&non_ff_drv_mutex); 840 if (non_ff_drivers == NULL) { 841 non_ff_drivers = (char **)kmem_alloc(2 * sizeof (char *), 842 KM_NOSLEEP); 843 if (non_ff_drivers == NULL) { 844 mutex_exit(&non_ff_drv_mutex); 845 return (1); 846 } 847 848 non_ff_drivers[0] = (char *)kmem_alloc(strlen(s) + 1, 849 KM_NOSLEEP); 850 if (non_ff_drivers[0] == NULL) { 851 kmem_free(non_ff_drivers, 2 * sizeof (char *)); 852 non_ff_drivers = NULL; 853 mutex_exit(&non_ff_drv_mutex); 854 return (1); 855 } 856 857 (void) strcpy(non_ff_drivers[0], s); 858 non_ff_drivers[1] = NULL; 859 860 } else { 861 int i; 862 char **tnames; 863 char **tmp; 864 865 for (i = 0; non_ff_drivers[i] != NULL; i++) { 866 if (strcmp(s, non_ff_drivers[i]) == 0) { 867 mutex_exit(&non_ff_drv_mutex); 868 return (0); 869 } 870 } 871 872 /* allow for new element and null */ 873 i += 2; 874 tnames = (char **)kmem_alloc(i * sizeof (char *), KM_NOSLEEP); 875 if (tnames == NULL) { 876 mutex_exit(&non_ff_drv_mutex); 877 return (1); 878 } 879 880 for (i = 0; non_ff_drivers[i] != NULL; i++) 881 tnames[i] = non_ff_drivers[i]; 882 883 tnames[i] = (char *)kmem_alloc(strlen(s) + 1, KM_NOSLEEP); 884 if (tnames[i] == NULL) { 885 /* adjust i so that it is the right count to free */ 886 kmem_free(tnames, (i + 2) * sizeof (char *)); 887 mutex_exit(&non_ff_drv_mutex); 888 return (1); 889 } 890 891 (void) strcpy(tnames[i++], s); 892 tnames[i] = NULL; 893 894 tmp = non_ff_drivers; 895 non_ff_drivers = tnames; 896 /* i now represents the count we previously alloced */ 897 kmem_free(tmp, i * sizeof (char *)); 898 } 899 mutex_exit(&non_ff_drv_mutex); 900 901 return (1); 902 } 903 904 /* 905 * Check for the "ddi-failfast-supported" devtree property on each submirror 906 * component to indicate if we should do I/O to that submirror with the 907 * B_FAILFAST flag set or not. This check is made at various state transitions 908 * in the mirror code (e.g. open, enable, hotspare, etc.). Sometimes we 909 * only need to check one drive (e.g. hotspare) but since the check is 910 * fast and infrequent and sometimes needs to be done on all components we 911 * just check all components on each call. 
912 */ 913 void 914 mirror_check_failfast(minor_t mnum) 915 { 916 int i; 917 mm_unit_t *un; 918 919 if (md_ff_disable) 920 return; 921 922 un = MD_UNIT(mnum); 923 924 for (i = 0; i < NMIRROR; i++) { 925 int ci; 926 int cnt; 927 int ff = 1; 928 mm_submirror_t *sm; 929 mm_submirror_ic_t *smic; 930 void (*get_dev)(); 931 932 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) 933 continue; 934 935 sm = &un->un_sm[i]; 936 smic = &un->un_smic[i]; 937 938 get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0, 939 "get device", 0); 940 941 cnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm); 942 for (ci = 0; ci < cnt; ci++) { 943 int found = 0; 944 dev_t ci_dev; 945 major_t major; 946 dev_info_t *devi; 947 ms_cd_info_t cd; 948 949 /* 950 * this already returns the hs 951 * dev if the device is spared 952 */ 953 (void) (*get_dev)(sm->sm_dev, sm, ci, &cd); 954 955 ci_dev = md_dev64_to_dev(cd.cd_dev); 956 major = getmajor(ci_dev); 957 958 if (major == md_major) { 959 /* 960 * this component must be a soft 961 * partition; get the real dev 962 */ 963 minor_t dev_mnum; 964 mdi_unit_t *ui; 965 mp_unit_t *un; 966 set_t setno; 967 side_t side; 968 md_dev64_t tmpdev; 969 970 ui = MDI_UNIT(getminor(ci_dev)); 971 972 /* grab necessary lock */ 973 un = (mp_unit_t *)md_unit_readerlock(ui); 974 975 dev_mnum = MD_SID(un); 976 setno = MD_MIN2SET(dev_mnum); 977 side = mddb_getsidenum(setno); 978 979 tmpdev = un->un_dev; 980 981 /* Get dev by device id */ 982 if (md_devid_found(setno, side, 983 un->un_key) == 1) { 984 tmpdev = md_resolve_bydevid(dev_mnum, 985 tmpdev, un->un_key); 986 } 987 988 md_unit_readerexit(ui); 989 990 ci_dev = md_dev64_to_dev(tmpdev); 991 major = getmajor(ci_dev); 992 } 993 994 if (ci_dev != NODEV32 && 995 (devi = e_ddi_hold_devi_by_dev(ci_dev, 0)) 996 != NULL) { 997 ddi_prop_op_t prop_op = PROP_LEN_AND_VAL_BUF; 998 int propvalue = 0; 999 int proplength = sizeof (int); 1000 int error; 1001 struct cb_ops *cb; 1002 1003 if ((cb = devopsp[major]->devo_cb_ops) != 1004 NULL) { 1005 error = (*cb->cb_prop_op) 1006 (DDI_DEV_T_ANY, devi, prop_op, 1007 DDI_PROP_NOTPROM|DDI_PROP_DONTPASS, 1008 "ddi-failfast-supported", 1009 (caddr_t)&propvalue, &proplength); 1010 1011 if (error == DDI_PROP_SUCCESS) 1012 found = 1; 1013 } 1014 1015 if (!found && new_non_ff_driver( 1016 ddi_driver_name(devi))) { 1017 cmn_err(CE_NOTE, "!md: B_FAILFAST I/O" 1018 "disabled on %s", 1019 ddi_driver_name(devi)); 1020 } 1021 1022 ddi_release_devi(devi); 1023 } 1024 1025 /* 1026 * All components must support 1027 * failfast in the submirror. 1028 */ 1029 if (!found) { 1030 ff = 0; 1031 break; 1032 } 1033 } 1034 1035 if (ff) { 1036 sm->sm_flags |= MD_SM_FAILFAST; 1037 } else { 1038 sm->sm_flags &= ~MD_SM_FAILFAST; 1039 } 1040 } 1041 } 1042 1043 /* 1044 * Return true if the submirror is unavailable. 1045 * If any of the submirror components are opened then the submirror cannot 1046 * be unavailable (MD_INACCESSIBLE). 1047 * If any of the components are already in the errored state, then the submirror 1048 * cannot be unavailable (MD_INACCESSIBLE). 
1049 */ 1050 static bool_t 1051 submirror_unavailable(mm_unit_t *un, int smi, int from_probe) 1052 { 1053 mm_submirror_t *sm; 1054 mm_submirror_ic_t *smic; 1055 md_m_shared_t *shared; 1056 int ci; 1057 int compcnt; 1058 1059 sm = &un->un_sm[smi]; 1060 smic = &un->un_smic[smi]; 1061 1062 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un); 1063 for (ci = 0; ci < compcnt; ci++) { 1064 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) 1065 (sm->sm_dev, sm, ci); 1066 if (from_probe) { 1067 if (shared->ms_flags & MDM_S_PROBEOPEN) 1068 return (B_FALSE); 1069 } else { 1070 if (shared->ms_flags & MDM_S_ISOPEN) 1071 return (B_FALSE); 1072 } 1073 if (shared->ms_state == CS_ERRED || 1074 shared->ms_state == CS_LAST_ERRED) 1075 return (B_FALSE); 1076 } 1077 1078 return (B_TRUE); 1079 } 1080 1081 static int 1082 mirror_open_all_devs(minor_t mnum, int md_oflags, IOLOCK *lockp) 1083 { 1084 int i; 1085 mm_unit_t *un; 1086 mdi_unit_t *ui; 1087 int err; 1088 int smi; 1089 int ci; 1090 err_comp_t *c; 1091 err_comp_t *ecomps = NULL; 1092 int smmask = 0; 1093 set_t setno; 1094 int sm_cnt; 1095 int sm_unavail_cnt; 1096 1097 mirror_check_failfast(mnum); 1098 1099 un = MD_UNIT(mnum); 1100 ui = MDI_UNIT(mnum); 1101 setno = MD_UN2SET(un); 1102 1103 for (i = 0; i < NMIRROR; i++) { 1104 md_dev64_t tmpdev = un->un_sm[i].sm_dev; 1105 1106 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) 1107 continue; 1108 if (md_layered_open(mnum, &tmpdev, md_oflags)) 1109 smmask |= SMI2BIT(i); 1110 un->un_sm[i].sm_dev = tmpdev; 1111 } 1112 1113 /* 1114 * If smmask is clear, all submirrors are accessible. Clear the 1115 * MD_INACCESSIBLE bit in this case. This bit is also cleared for the 1116 * mirror device. If smmask is set, we have to determine which of the 1117 * submirrors are in error. If no submirror is accessible we mark the 1118 * whole mirror as MD_INACCESSIBLE. 1119 */ 1120 if (smmask == 0) { 1121 if (lockp) { 1122 md_ioctl_readerexit(lockp); 1123 (void) md_ioctl_writerlock(lockp, ui); 1124 } else { 1125 md_unit_readerexit(ui); 1126 (void) md_unit_writerlock(ui); 1127 } 1128 ui->ui_tstate &= ~MD_INACCESSIBLE; 1129 if (lockp) { 1130 md_ioctl_writerexit(lockp); 1131 (void) md_ioctl_readerlock(lockp, ui); 1132 } else { 1133 md_unit_writerexit(ui); 1134 (void) md_unit_readerlock(ui); 1135 } 1136 1137 for (i = 0; i < NMIRROR; i++) { 1138 md_dev64_t tmpdev; 1139 mdi_unit_t *sm_ui; 1140 1141 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) 1142 continue; 1143 1144 tmpdev = un->un_sm[i].sm_dev; 1145 sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev))); 1146 (void) md_unit_writerlock(sm_ui); 1147 sm_ui->ui_tstate &= ~MD_INACCESSIBLE; 1148 md_unit_writerexit(sm_ui); 1149 } 1150 1151 return (0); 1152 } 1153 1154 for (i = 0; i < NMIRROR; i++) { 1155 md_dev64_t tmpdev; 1156 1157 if (!(smmask & SMI2BIT(i))) 1158 continue; 1159 1160 tmpdev = un->un_sm[i].sm_dev; 1161 err = md_layered_open(mnum, &tmpdev, MD_OFLG_CONT_ERRS); 1162 un->un_sm[i].sm_dev = tmpdev; 1163 ASSERT(err == 0); 1164 } 1165 1166 if (lockp) { 1167 md_ioctl_readerexit(lockp); 1168 un = (mm_unit_t *)md_ioctl_writerlock(lockp, ui); 1169 } else { 1170 md_unit_readerexit(ui); 1171 un = (mm_unit_t *)md_unit_writerlock(ui); 1172 } 1173 1174 /* 1175 * We want to make sure the unavailable flag is not masking a real 1176 * error on the submirror. 1177 * For each submirror, 1178 * if all of the submirror components couldn't be opened and there 1179 * are no errors on the submirror, then set the unavailable flag 1180 * otherwise, clear unavailable. 
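	 *
	 * For example (an illustrative case, not an exhaustive rule): a
	 * submirror whose underlying disk is powered off opens none of its
	 * components and has none of them marked in error, so it is flagged
	 * MD_INACCESSIBLE; a submirror that already has a component in the
	 * CS_ERRED or CS_LAST_ERRED state is treated as a real error and is
	 * not flagged as unavailable.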
1181 */ 1182 sm_cnt = 0; 1183 sm_unavail_cnt = 0; 1184 for (i = 0; i < NMIRROR; i++) { 1185 md_dev64_t tmpdev; 1186 mdi_unit_t *sm_ui; 1187 1188 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) 1189 continue; 1190 1191 sm_cnt++; 1192 tmpdev = un->un_sm[i].sm_dev; 1193 sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev))); 1194 1195 (void) md_unit_writerlock(sm_ui); 1196 if (submirror_unavailable(un, i, 0)) { 1197 sm_ui->ui_tstate |= MD_INACCESSIBLE; 1198 sm_unavail_cnt++; 1199 } else { 1200 sm_ui->ui_tstate &= ~MD_INACCESSIBLE; 1201 } 1202 md_unit_writerexit(sm_ui); 1203 } 1204 1205 /* 1206 * If all of the submirrors are unavailable, the mirror is also 1207 * unavailable. 1208 */ 1209 if (sm_cnt == sm_unavail_cnt) { 1210 ui->ui_tstate |= MD_INACCESSIBLE; 1211 } else { 1212 ui->ui_tstate &= ~MD_INACCESSIBLE; 1213 } 1214 1215 smi = 0; 1216 ci = 0; 1217 while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) { 1218 if (mirror_other_sources(un, smi, ci, 1) == 1) { 1219 1220 free_all_ecomps(ecomps); 1221 (void) mirror_close_all_devs(un, md_oflags); 1222 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, 1223 SVM_TAG_METADEVICE, setno, MD_SID(un)); 1224 mirror_openfail_console_info(un, smi, ci); 1225 if (lockp) { 1226 md_ioctl_writerexit(lockp); 1227 (void) md_ioctl_readerlock(lockp, ui); 1228 } else { 1229 md_unit_writerexit(ui); 1230 (void) md_unit_readerlock(ui); 1231 } 1232 return (ENXIO); 1233 } 1234 1235 /* track all component states that need changing */ 1236 c = (err_comp_t *)kmem_alloc(sizeof (err_comp_t), KM_SLEEP); 1237 c->ec_next = ecomps; 1238 c->ec_smi = smi; 1239 c->ec_ci = ci; 1240 ecomps = c; 1241 ci++; 1242 } 1243 1244 /* Make all state changes and commit them */ 1245 for (c = ecomps; c != NULL; c = c->ec_next) { 1246 /* 1247 * If lockp is set, then entering kernel through ioctl. 1248 * For a MN set, the only ioctl path is via a commd message 1249 * (ALLOCATE_HOTSPARE or *RESYNC* messages) that is already 1250 * being sent to each node. 1251 * In this case, set NO_XMIT so that set_sm_comp_state 1252 * won't attempt to send a message on a message. 1253 * 1254 * In !MN sets, the xmit flag is ignored, so it doesn't matter 1255 * which flag is passed. 1256 */ 1257 if (lockp) { 1258 set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0, 1259 MD_STATE_NO_XMIT, lockp); 1260 } else { 1261 set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0, 1262 (MD_STATE_XMIT | MD_STATE_OCHELD), lockp); 1263 } 1264 /* 1265 * For a MN set, the NOTIFY is done when the state change is 1266 * processed on each node 1267 */ 1268 if (!MD_MNSET_SETNO(setno)) { 1269 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, 1270 SVM_TAG_METADEVICE, setno, MD_SID(un)); 1271 } 1272 } 1273 1274 if (lockp) { 1275 md_ioctl_writerexit(lockp); 1276 (void) md_ioctl_readerlock(lockp, ui); 1277 } else { 1278 md_unit_writerexit(ui); 1279 (void) md_unit_readerlock(ui); 1280 } 1281 1282 free_all_ecomps(ecomps); 1283 1284 /* allocate hotspares for all errored components */ 1285 if (MD_MNSET_SETNO(setno)) { 1286 /* 1287 * If we're called from an ioctl (lockp set) then we cannot 1288 * directly call send_poke_hotspares as this will block until 1289 * the message gets despatched to all nodes. If the cluster is 1290 * going through a reconfig cycle then the message will block 1291 * until the cycle is complete, and as we originate from a 1292 * service call from commd we will livelock. 
1293 */ 1294 if (lockp == NULL) { 1295 md_unit_readerexit(ui); 1296 send_poke_hotspares(setno); 1297 (void) md_unit_readerlock(ui); 1298 } 1299 } else { 1300 (void) poke_hotspares(); 1301 } 1302 return (0); 1303 } 1304 1305 void 1306 mirror_overlap_tree_remove(md_mps_t *ps) 1307 { 1308 mm_unit_t *un; 1309 1310 if (panicstr) 1311 return; 1312 1313 VERIFY(ps->ps_flags & MD_MPS_ON_OVERLAP); 1314 un = ps->ps_un; 1315 1316 mutex_enter(&un->un_overlap_tree_mx); 1317 avl_remove(&un->un_overlap_root, ps); 1318 ps->ps_flags &= ~MD_MPS_ON_OVERLAP; 1319 if (un->un_overlap_tree_flag != 0) { 1320 un->un_overlap_tree_flag = 0; 1321 cv_broadcast(&un->un_overlap_tree_cv); 1322 } 1323 mutex_exit(&un->un_overlap_tree_mx); 1324 } 1325 1326 1327 /* 1328 * wait_for_overlaps: 1329 * ----------------- 1330 * Check that given i/o request does not cause an overlap with already pending 1331 * i/o. If it does, block until the overlapped i/o completes. 1332 * 1333 * The flag argument has MD_OVERLAP_ALLOW_REPEAT set if it is ok for the parent 1334 * structure to be already in the overlap tree and MD_OVERLAP_NO_REPEAT if 1335 * it must not already be in the tree. 1336 */ 1337 static void 1338 wait_for_overlaps(md_mps_t *ps, int flags) 1339 { 1340 mm_unit_t *un; 1341 avl_index_t where; 1342 md_mps_t *ps1; 1343 1344 if (panicstr) 1345 return; 1346 1347 un = ps->ps_un; 1348 mutex_enter(&un->un_overlap_tree_mx); 1349 if ((flags & MD_OVERLAP_ALLOW_REPEAT) && 1350 (ps->ps_flags & MD_MPS_ON_OVERLAP)) { 1351 mutex_exit(&un->un_overlap_tree_mx); 1352 return; 1353 } 1354 1355 VERIFY(!(ps->ps_flags & MD_MPS_ON_OVERLAP)); 1356 1357 do { 1358 ps1 = avl_find(&un->un_overlap_root, ps, &where); 1359 if (ps1 == NULL) { 1360 /* 1361 * The candidate range does not overlap with any 1362 * range in the tree. Insert it and be done. 1363 */ 1364 avl_insert(&un->un_overlap_root, ps, where); 1365 ps->ps_flags |= MD_MPS_ON_OVERLAP; 1366 } else { 1367 /* 1368 * The candidate range would overlap. Set the flag 1369 * indicating we need to be woken up, and sleep 1370 * until another thread removes a range. If upon 1371 * waking up we find this mps was put on the tree 1372 * by another thread, the loop terminates. 1373 */ 1374 un->un_overlap_tree_flag = 1; 1375 cv_wait(&un->un_overlap_tree_cv, 1376 &un->un_overlap_tree_mx); 1377 } 1378 } while (!(ps->ps_flags & MD_MPS_ON_OVERLAP)); 1379 mutex_exit(&un->un_overlap_tree_mx); 1380 } 1381 1382 /* 1383 * This function is called from mirror_done to check whether any pages have 1384 * been modified while a mirrored write was in progress. Returns 0 if 1385 * all pages associated with bp are clean, 1 otherwise. 
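 *
 * (biomodified() returns 1 if the buffer's pages have been modified, 0 if
 * they have not, and -1 if this cannot be determined, e.g. because the
 * buffer's pages are not mapped in; the wrapper below folds the -1 case
 * into "clean".)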
1386 */ 1387 static int 1388 any_pages_dirty(struct buf *bp) 1389 { 1390 int rval; 1391 1392 rval = biomodified(bp); 1393 if (rval == -1) 1394 rval = 0; 1395 1396 return (rval); 1397 } 1398 1399 #define MAX_EXTRAS 10 1400 1401 void 1402 mirror_commit( 1403 mm_unit_t *un, 1404 int smmask, 1405 mddb_recid_t *extras 1406 ) 1407 { 1408 mm_submirror_t *sm; 1409 md_unit_t *su; 1410 int i; 1411 1412 /* 2=mirror,null id */ 1413 mddb_recid_t recids[NMIRROR+2+MAX_EXTRAS]; 1414 1415 int ri = 0; 1416 1417 if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE) 1418 return; 1419 1420 /* Add two, this includes the mirror unit and the null recid */ 1421 if (extras != NULL) { 1422 int nrecids = 0; 1423 while (extras[nrecids] != 0) { 1424 nrecids++; 1425 } 1426 ASSERT(nrecids <= MAX_EXTRAS); 1427 } 1428 1429 if (un != NULL) 1430 recids[ri++] = un->c.un_record_id; 1431 for (i = 0; i < NMIRROR; i++) { 1432 if (!(smmask & SMI2BIT(i))) 1433 continue; 1434 sm = &un->un_sm[i]; 1435 if (!SMS_IS(sm, SMS_INUSE)) 1436 continue; 1437 if (md_getmajor(sm->sm_dev) != md_major) 1438 continue; 1439 su = MD_UNIT(md_getminor(sm->sm_dev)); 1440 recids[ri++] = su->c.un_record_id; 1441 } 1442 1443 if (extras != NULL) 1444 while (*extras != 0) { 1445 recids[ri++] = *extras; 1446 extras++; 1447 } 1448 1449 if (ri == 0) 1450 return; 1451 recids[ri] = 0; 1452 1453 /* 1454 * Ok to hold ioctl lock across record commit to mddb as 1455 * long as the record(s) being committed aren't resync records. 1456 */ 1457 mddb_commitrecs_wrapper(recids); 1458 } 1459 1460 1461 /* 1462 * This routine is used to set a bit in the writable_bm bitmap 1463 * which represents each submirror in a metamirror which 1464 * is writable. The first writable submirror index is assigned 1465 * to the sm_index. The number of writable submirrors are returned in nunits. 1466 * 1467 * This routine returns the submirror's unit number. 
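 *
 * For example (illustrative, assuming SMI2BIT(i) is the single-bit mask for
 * submirror i, as its other uses in this file imply): with submirrors 0 and
 * 2 writable, writable_bm becomes SMI2BIT(0) | SMI2BIT(2) == 0x5 and nunits
 * becomes 2; select_write_units() stores these in ps->ps_writable_sm and
 * ps->ps_active_cnt.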
1468 */ 1469 1470 static void 1471 select_write_units(struct mm_unit *un, md_mps_t *ps) 1472 { 1473 1474 int i; 1475 unsigned writable_bm = 0; 1476 unsigned nunits = 0; 1477 1478 for (i = 0; i < NMIRROR; i++) { 1479 if (SUBMIRROR_IS_WRITEABLE(un, i)) { 1480 /* set bit of all writable units */ 1481 writable_bm |= SMI2BIT(i); 1482 nunits++; 1483 } 1484 } 1485 ps->ps_writable_sm = writable_bm; 1486 ps->ps_active_cnt = nunits; 1487 ps->ps_current_sm = 0; 1488 } 1489 1490 static 1491 unsigned 1492 select_write_after_read_units(struct mm_unit *un, md_mps_t *ps) 1493 { 1494 1495 int i; 1496 unsigned writable_bm = 0; 1497 unsigned nunits = 0; 1498 1499 for (i = 0; i < NMIRROR; i++) { 1500 if (SUBMIRROR_IS_WRITEABLE(un, i) && 1501 un->un_sm[i].sm_flags & MD_SM_RESYNC_TARGET) { 1502 writable_bm |= SMI2BIT(i); 1503 nunits++; 1504 } 1505 } 1506 if ((writable_bm & ps->ps_allfrom_sm) != 0) { 1507 writable_bm &= ~ps->ps_allfrom_sm; 1508 nunits--; 1509 } 1510 ps->ps_writable_sm = writable_bm; 1511 ps->ps_active_cnt = nunits; 1512 ps->ps_current_sm = 0; 1513 return (nunits); 1514 } 1515 1516 static md_dev64_t 1517 select_read_unit( 1518 mm_unit_t *un, 1519 diskaddr_t blkno, 1520 u_longlong_t reqcount, 1521 u_longlong_t *cando, 1522 int must_be_opened, 1523 md_m_shared_t **shared, 1524 md_mcs_t *cs) 1525 { 1526 int i; 1527 md_m_shared_t *s; 1528 uint_t lasterrcnt = 0; 1529 md_dev64_t dev = 0; 1530 u_longlong_t cnt; 1531 u_longlong_t mincnt; 1532 mm_submirror_t *sm; 1533 mm_submirror_ic_t *smic; 1534 mdi_unit_t *ui; 1535 1536 mincnt = reqcount; 1537 for (i = 0; i < NMIRROR; i++) { 1538 if (!SUBMIRROR_IS_READABLE(un, i)) 1539 continue; 1540 sm = &un->un_sm[i]; 1541 smic = &un->un_smic[i]; 1542 cnt = reqcount; 1543 1544 /* 1545 * If the current submirror is marked as inaccessible, do not 1546 * try to access it. 1547 */ 1548 ui = MDI_UNIT(getminor(expldev(sm->sm_dev))); 1549 (void) md_unit_readerlock(ui); 1550 if (ui->ui_tstate & MD_INACCESSIBLE) { 1551 md_unit_readerexit(ui); 1552 continue; 1553 } 1554 md_unit_readerexit(ui); 1555 1556 s = (md_m_shared_t *)(*(smic->sm_shared_by_blk)) 1557 (sm->sm_dev, sm, blkno, &cnt); 1558 1559 if (must_be_opened && !(s->ms_flags & MDM_S_ISOPEN)) 1560 continue; 1561 if (s->ms_state == CS_OKAY) { 1562 *cando = cnt; 1563 if (shared != NULL) 1564 *shared = s; 1565 1566 if (un->un_sm[i].sm_flags & MD_SM_FAILFAST && 1567 cs != NULL) { 1568 cs->cs_buf.b_flags |= B_FAILFAST; 1569 } 1570 1571 return (un->un_sm[i].sm_dev); 1572 } 1573 if (s->ms_state != CS_LAST_ERRED) 1574 continue; 1575 1576 /* don't use B_FAILFAST since we're Last Erred */ 1577 1578 if (mincnt > cnt) 1579 mincnt = cnt; 1580 if (s->ms_lasterrcnt > lasterrcnt) { 1581 lasterrcnt = s->ms_lasterrcnt; 1582 if (shared != NULL) 1583 *shared = s; 1584 dev = un->un_sm[i].sm_dev; 1585 } 1586 } 1587 *cando = mincnt; 1588 return (dev); 1589 } 1590 1591 /* 1592 * Given a 32-bit bitmap, this routine will return the bit number 1593 * of the nth bit set. The nth bit set is passed via the index integer. 1594 * 1595 * This routine is used to run through the writable submirror bitmap 1596 * and starting all of the writes. See the value returned is the 1597 * index to appropriate submirror structure, in the md_sm 1598 * array for metamirrors. 
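 *
 * For example, with mask 0x0d (bits 0, 2 and 3 set) and a zero-based index:
 * md_find_nth_unit(0x0d, 0) == 0, md_find_nth_unit(0x0d, 1) == 2 and
 * md_find_nth_unit(0x0d, 2) == 3.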
1599 */ 1600 static int 1601 md_find_nth_unit(uint_t mask, int index) 1602 { 1603 int bit, nfound; 1604 1605 for (bit = -1, nfound = -1; nfound != index; bit++) { 1606 ASSERT(mask != 0); 1607 nfound += (mask & 1); 1608 mask >>= 1; 1609 } 1610 return (bit); 1611 } 1612 1613 static int 1614 fast_select_read_unit(md_mps_t *ps, md_mcs_t *cs) 1615 { 1616 mm_unit_t *un; 1617 buf_t *bp; 1618 int i; 1619 unsigned nunits = 0; 1620 int iunit; 1621 uint_t running_bm = 0; 1622 uint_t sm_index; 1623 1624 bp = &cs->cs_buf; 1625 un = ps->ps_un; 1626 1627 for (i = 0; i < NMIRROR; i++) { 1628 if (!SMS_BY_INDEX_IS(un, i, SMS_RUNNING)) 1629 continue; 1630 running_bm |= SMI2BIT(i); 1631 nunits++; 1632 } 1633 if (nunits == 0) 1634 return (1); 1635 1636 /* 1637 * For directed mirror read (DMR) we only use the specified side and 1638 * do not compute the source of the read. 1639 */ 1640 if (ps->ps_flags & MD_MPS_DMR) { 1641 sm_index = un->un_dmr_last_read; 1642 } else { 1643 /* Normal (non-DMR) operation */ 1644 switch (un->un_read_option) { 1645 case RD_GEOMETRY: 1646 iunit = (int)(bp->b_lblkno / 1647 howmany(un->c.un_total_blocks, nunits)); 1648 sm_index = md_find_nth_unit(running_bm, iunit); 1649 break; 1650 case RD_FIRST: 1651 sm_index = md_find_nth_unit(running_bm, 0); 1652 break; 1653 case RD_LOAD_BAL: 1654 /* this is intentional to fall into the default */ 1655 default: 1656 un->un_last_read = (un->un_last_read + 1) % nunits; 1657 sm_index = md_find_nth_unit(running_bm, 1658 un->un_last_read); 1659 break; 1660 } 1661 } 1662 bp->b_edev = md_dev64_to_dev(un->un_sm[sm_index].sm_dev); 1663 ps->ps_allfrom_sm = SMI2BIT(sm_index); 1664 1665 if (un->un_sm[sm_index].sm_flags & MD_SM_FAILFAST) { 1666 bp->b_flags |= B_FAILFAST; 1667 } 1668 1669 return (0); 1670 } 1671 1672 static 1673 int 1674 mirror_are_submirrors_available(mm_unit_t *un) 1675 { 1676 int i; 1677 for (i = 0; i < NMIRROR; i++) { 1678 md_dev64_t tmpdev = un->un_sm[i].sm_dev; 1679 1680 if ((!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) || 1681 md_getmajor(tmpdev) != md_major) 1682 continue; 1683 1684 if ((MD_MIN2SET(md_getminor(tmpdev)) >= md_nsets) || 1685 (MD_MIN2UNIT(md_getminor(tmpdev)) >= md_nunits)) 1686 return (0); 1687 1688 if (MDI_UNIT(md_getminor(tmpdev)) == NULL) 1689 return (0); 1690 } 1691 return (1); 1692 } 1693 1694 void 1695 build_submirror(mm_unit_t *un, int i, int snarfing) 1696 { 1697 struct mm_submirror *sm; 1698 struct mm_submirror_ic *smic; 1699 md_unit_t *su; 1700 set_t setno; 1701 1702 sm = &un->un_sm[i]; 1703 smic = &un->un_smic[i]; 1704 1705 sm->sm_flags = 0; /* sometime we may need to do more here */ 1706 1707 setno = MD_UN2SET(un); 1708 1709 if (!SMS_IS(sm, SMS_INUSE)) 1710 return; 1711 if (snarfing) { 1712 sm->sm_dev = md_getdevnum(setno, mddb_getsidenum(setno), 1713 sm->sm_key, MD_NOTRUST_DEVT); 1714 } else { 1715 if (md_getmajor(sm->sm_dev) == md_major) { 1716 su = MD_UNIT(md_getminor(sm->sm_dev)); 1717 un->c.un_flag |= (su->c.un_flag & MD_LABELED); 1718 /* submirror can no longer be soft partitioned */ 1719 MD_CAPAB(su) &= (~MD_CAN_SP); 1720 } 1721 } 1722 smic->sm_shared_by_blk = md_get_named_service(sm->sm_dev, 1723 0, "shared by blk", 0); 1724 smic->sm_shared_by_indx = md_get_named_service(sm->sm_dev, 1725 0, "shared by indx", 0); 1726 smic->sm_get_component_count = (int (*)())md_get_named_service( 1727 sm->sm_dev, 0, "get component count", 0); 1728 smic->sm_get_bcss = (int (*)())md_get_named_service(sm->sm_dev, 0, 1729 "get block count skip size", 0); 1730 sm->sm_state &= ~SMS_IGNORE; 1731 if (SMS_IS(sm, SMS_OFFLINE)) 1732 
MD_STATUS(un) |= MD_UN_OFFLINE_SM; 1733 md_set_parent(sm->sm_dev, MD_SID(un)); 1734 } 1735 1736 static void 1737 mirror_cleanup(mm_unit_t *un) 1738 { 1739 mddb_recid_t recid; 1740 int smi; 1741 sv_dev_t sv[NMIRROR]; 1742 int nsv = 0; 1743 1744 /* 1745 * If a MN diskset and this node is not the master, do 1746 * not delete any records on snarf of the mirror records. 1747 */ 1748 if (MD_MNSET_SETNO(MD_UN2SET(un)) && 1749 md_set[MD_UN2SET(un)].s_am_i_master == 0) { 1750 return; 1751 } 1752 1753 for (smi = 0; smi < NMIRROR; smi++) { 1754 if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) 1755 continue; 1756 sv[nsv].setno = MD_UN2SET(un); 1757 sv[nsv++].key = un->un_sm[smi].sm_key; 1758 } 1759 1760 recid = un->un_rr_dirty_recid; 1761 mddb_deleterec_wrapper(un->c.un_record_id); 1762 if (recid > 0) 1763 mddb_deleterec_wrapper(recid); 1764 1765 md_rem_names(sv, nsv); 1766 } 1767 1768 /* 1769 * Comparison function for the avl tree which tracks 1770 * outstanding writes on submirrors. 1771 * 1772 * Returns: 1773 * -1: ps1 < ps2 1774 * 0: ps1 and ps2 overlap 1775 * 1: ps1 > ps2 1776 */ 1777 static int 1778 mirror_overlap_compare(const void *p1, const void *p2) 1779 { 1780 const md_mps_t *ps1 = (md_mps_t *)p1; 1781 const md_mps_t *ps2 = (md_mps_t *)p2; 1782 1783 if (ps1->ps_firstblk < ps2->ps_firstblk) { 1784 if (ps1->ps_lastblk >= ps2->ps_firstblk) 1785 return (0); 1786 return (-1); 1787 } 1788 1789 if (ps1->ps_firstblk > ps2->ps_firstblk) { 1790 if (ps1->ps_firstblk <= ps2->ps_lastblk) 1791 return (0); 1792 return (1); 1793 } 1794 1795 return (0); 1796 } 1797 1798 /* Return a -1 if optimized record unavailable and set should be released */ 1799 int 1800 mirror_build_incore(mm_unit_t *un, int snarfing) 1801 { 1802 int i; 1803 1804 if (MD_STATUS(un) & MD_UN_BEING_RESET) { 1805 mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCLEAN); 1806 return (1); 1807 } 1808 1809 if (mirror_are_submirrors_available(un) == 0) 1810 return (1); 1811 1812 if (MD_UNIT(MD_SID(un)) != NULL) 1813 return (0); 1814 1815 MD_STATUS(un) = 0; 1816 1817 /* pre-4.1 didn't define CAN_META_CHILD capability */ 1818 MD_CAPAB(un) = MD_CAN_META_CHILD | MD_CAN_PARENT | MD_CAN_SP; 1819 1820 un->un_overlap_tree_flag = 0; 1821 avl_create(&un->un_overlap_root, mirror_overlap_compare, 1822 sizeof (md_mps_t), offsetof(md_mps_t, ps_overlap_node)); 1823 1824 for (i = 0; i < NMIRROR; i++) 1825 build_submirror(un, i, snarfing); 1826 1827 if (unit_setup_resync(un, snarfing) != 0) { 1828 if (snarfing) { 1829 mddb_setrecprivate(un->c.un_record_id, MD_PRV_GOTIT); 1830 /* 1831 * If a MN set and set is not stale, then return -1 1832 * which will force the caller to unload the set. 1833 * The MN diskset nodes will return failure if 1834 * unit_setup_resync fails so that nodes won't 1835 * get out of sync. 1836 * 1837 * If set is STALE, the master node can't allocate 1838 * a resync record (if needed), but node needs to 1839 * join the set so that user can delete broken mddbs. 1840 * So, if set is STALE, just continue on. 1841 */ 1842 if (MD_MNSET_SETNO(MD_UN2SET(un)) && 1843 !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) { 1844 return (-1); 1845 } 1846 } else 1847 return (1); 1848 } 1849 1850 mutex_init(&un->un_overlap_tree_mx, NULL, MUTEX_DEFAULT, NULL); 1851 cv_init(&un->un_overlap_tree_cv, NULL, CV_DEFAULT, NULL); 1852 1853 un->un_suspend_wr_flag = 0; 1854 mutex_init(&un->un_suspend_wr_mx, NULL, MUTEX_DEFAULT, NULL); 1855 cv_init(&un->un_suspend_wr_cv, NULL, CV_DEFAULT, NULL); 1856 1857 /* 1858 * Allocate mutexes for mirror-owner and resync-owner changes. 
1859 * All references to the owner message state field must be guarded 1860 * by this mutex. 1861 */ 1862 mutex_init(&un->un_owner_mx, NULL, MUTEX_DEFAULT, NULL); 1863 1864 /* 1865 * Allocate mutex and condvar for resync thread manipulation. These 1866 * will be used by mirror_resync_unit/mirror_ioctl_resync 1867 */ 1868 mutex_init(&un->un_rs_thread_mx, NULL, MUTEX_DEFAULT, NULL); 1869 cv_init(&un->un_rs_thread_cv, NULL, CV_DEFAULT, NULL); 1870 1871 /* 1872 * Allocate mutex and condvar for resync progress thread manipulation. 1873 * This allows resyncs to be continued across an intervening reboot. 1874 */ 1875 mutex_init(&un->un_rs_progress_mx, NULL, MUTEX_DEFAULT, NULL); 1876 cv_init(&un->un_rs_progress_cv, NULL, CV_DEFAULT, NULL); 1877 1878 /* 1879 * Allocate mutex and condvar for Directed Mirror Reads (DMR). This 1880 * provides synchronization between a user-ioctl and the resulting 1881 * strategy() call that performs the read(). 1882 */ 1883 mutex_init(&un->un_dmr_mx, NULL, MUTEX_DEFAULT, NULL); 1884 cv_init(&un->un_dmr_cv, NULL, CV_DEFAULT, NULL); 1885 1886 /* place various information in the in-core data structures */ 1887 md_nblocks_set(MD_SID(un), un->c.un_total_blocks); 1888 MD_UNIT(MD_SID(un)) = un; 1889 1890 return (0); 1891 } 1892 1893 1894 void 1895 reset_mirror(struct mm_unit *un, minor_t mnum, int removing) 1896 { 1897 mddb_recid_t recid, vtoc_id; 1898 size_t bitcnt; 1899 size_t shortcnt; 1900 int smi; 1901 sv_dev_t sv[NMIRROR]; 1902 int nsv = 0; 1903 uint_t bits = 0; 1904 minor_t selfid; 1905 md_unit_t *su; 1906 1907 md_destroy_unit_incore(mnum, &mirror_md_ops); 1908 1909 shortcnt = un->un_rrd_num * sizeof (short); 1910 bitcnt = howmany(un->un_rrd_num, NBBY); 1911 1912 if (un->un_outstanding_writes) 1913 kmem_free((caddr_t)un->un_outstanding_writes, shortcnt); 1914 if (un->un_goingclean_bm) 1915 kmem_free((caddr_t)un->un_goingclean_bm, bitcnt); 1916 if (un->un_goingdirty_bm) 1917 kmem_free((caddr_t)un->un_goingdirty_bm, bitcnt); 1918 if (un->un_resync_bm) 1919 kmem_free((caddr_t)un->un_resync_bm, bitcnt); 1920 1921 md_nblocks_set(mnum, -1ULL); 1922 MD_UNIT(mnum) = NULL; 1923 1924 /* 1925 * Attempt release of its minor node 1926 */ 1927 md_remove_minor_node(mnum); 1928 1929 if (!removing) 1930 return; 1931 1932 for (smi = 0; smi < NMIRROR; smi++) { 1933 if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) 1934 continue; 1935 /* reallow soft partitioning of submirror and reset parent */ 1936 su = MD_UNIT(md_getminor(un->un_sm[smi].sm_dev)); 1937 MD_CAPAB(su) |= MD_CAN_SP; 1938 md_reset_parent(un->un_sm[smi].sm_dev); 1939 reset_comp_states(&un->un_sm[smi], &un->un_smic[smi]); 1940 1941 sv[nsv].setno = MD_MIN2SET(mnum); 1942 sv[nsv++].key = un->un_sm[smi].sm_key; 1943 bits |= SMI2BIT(smi); 1944 } 1945 1946 MD_STATUS(un) |= MD_UN_BEING_RESET; 1947 recid = un->un_rr_dirty_recid; 1948 vtoc_id = un->c.un_vtoc_id; 1949 selfid = MD_SID(un); 1950 1951 mirror_commit(un, bits, 0); 1952 1953 avl_destroy(&un->un_overlap_root); 1954 1955 /* Destroy all mutexes and condvars before returning. 
*/ 1956 mutex_destroy(&un->un_suspend_wr_mx); 1957 cv_destroy(&un->un_suspend_wr_cv); 1958 mutex_destroy(&un->un_overlap_tree_mx); 1959 cv_destroy(&un->un_overlap_tree_cv); 1960 mutex_destroy(&un->un_owner_mx); 1961 mutex_destroy(&un->un_rs_thread_mx); 1962 cv_destroy(&un->un_rs_thread_cv); 1963 mutex_destroy(&un->un_rs_progress_mx); 1964 cv_destroy(&un->un_rs_progress_cv); 1965 mutex_destroy(&un->un_dmr_mx); 1966 cv_destroy(&un->un_dmr_cv); 1967 1968 /* 1969 * Remove self from the namespace 1970 */ 1971 if (un->c.un_revision & MD_FN_META_DEV) { 1972 (void) md_rem_selfname(un->c.un_self_id); 1973 } 1974 1975 mddb_deleterec_wrapper(un->c.un_record_id); 1976 if (recid != 0) 1977 mddb_deleterec_wrapper(recid); 1978 1979 /* Remove the vtoc, if present */ 1980 if (vtoc_id) 1981 mddb_deleterec_wrapper(vtoc_id); 1982 1983 md_rem_names(sv, nsv); 1984 1985 SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE, 1986 MD_MIN2SET(selfid), selfid); 1987 } 1988 1989 int 1990 mirror_internal_open( 1991 minor_t mnum, 1992 int flag, 1993 int otyp, 1994 int md_oflags, 1995 IOLOCK *lockp /* can be NULL */ 1996 ) 1997 { 1998 mdi_unit_t *ui = MDI_UNIT(mnum); 1999 int err = 0; 2000 2001 tryagain: 2002 /* single thread */ 2003 if (lockp) { 2004 /* 2005 * If ioctl lock is held, use openclose_enter 2006 * routine that will set the ioctl flag when 2007 * grabbing the readerlock. 2008 */ 2009 (void) md_ioctl_openclose_enter(lockp, ui); 2010 } else { 2011 (void) md_unit_openclose_enter(ui); 2012 } 2013 2014 /* 2015 * The mirror_open_all_devs routine may end up sending a STATE_UPDATE 2016 * message in a MN diskset and this requires that the openclose 2017 * lock is dropped in order to send this message. So, another 2018 * flag (MD_UL_OPENINPROGRESS) is used to keep another thread from 2019 * attempting an open while this thread has an open in progress. 2020 * Call the *_lh version of the lock exit routines since the ui_mx 2021 * mutex must be held from checking for OPENINPROGRESS until 2022 * after the cv_wait call. 2023 */ 2024 mutex_enter(&ui->ui_mx); 2025 if (ui->ui_lock & MD_UL_OPENINPROGRESS) { 2026 if (lockp) { 2027 (void) md_ioctl_openclose_exit_lh(lockp); 2028 } else { 2029 md_unit_openclose_exit_lh(ui); 2030 } 2031 cv_wait(&ui->ui_cv, &ui->ui_mx); 2032 mutex_exit(&ui->ui_mx); 2033 goto tryagain; 2034 } 2035 2036 ui->ui_lock |= MD_UL_OPENINPROGRESS; 2037 mutex_exit(&ui->ui_mx); 2038 2039 /* open devices, if necessary */ 2040 if (! md_unit_isopen(ui) || (ui->ui_tstate & MD_INACCESSIBLE)) { 2041 if ((err = mirror_open_all_devs(mnum, md_oflags, lockp)) != 0) 2042 goto out; 2043 } 2044 2045 /* count open */ 2046 if ((err = md_unit_incopen(mnum, flag, otyp)) != 0) 2047 goto out; 2048 2049 /* unlock, return success */ 2050 out: 2051 mutex_enter(&ui->ui_mx); 2052 ui->ui_lock &= ~MD_UL_OPENINPROGRESS; 2053 mutex_exit(&ui->ui_mx); 2054 2055 if (lockp) { 2056 /* 2057 * If ioctl lock is held, use openclose_exit 2058 * routine that will clear the lockp reader flag. 2059 */ 2060 (void) md_ioctl_openclose_exit(lockp); 2061 } else { 2062 md_unit_openclose_exit(ui); 2063 } 2064 return (err); 2065 } 2066 2067 int 2068 mirror_internal_close( 2069 minor_t mnum, 2070 int otyp, 2071 int md_cflags, 2072 IOLOCK *lockp /* can be NULL */ 2073 ) 2074 { 2075 mdi_unit_t *ui = MDI_UNIT(mnum); 2076 mm_unit_t *un; 2077 int err = 0; 2078 2079 /* single thread */ 2080 if (lockp) { 2081 /* 2082 * If ioctl lock is held, use openclose_enter 2083 * routine that will set the ioctl flag when 2084 * grabbing the readerlock. 
2085 */ 2086 un = (mm_unit_t *)md_ioctl_openclose_enter(lockp, ui); 2087 } else { 2088 un = (mm_unit_t *)md_unit_openclose_enter(ui); 2089 } 2090 2091 /* count closed */ 2092 if ((err = md_unit_decopen(mnum, otyp)) != 0) 2093 goto out; 2094 2095 /* close devices, if necessary */ 2096 if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) { 2097 /* 2098 * Clean up dirty bitmap for this unit. Do this 2099 * before closing the underlying devices to avoid 2100 * race conditions with reset_mirror() as a 2101 * result of a 'metaset -r' command running in 2102 * parallel. This might cause deallocation of 2103 * dirty region bitmaps; with underlying metadevices 2104 * in place this can't happen. 2105 * Don't do this if a MN set and ABR not set 2106 */ 2107 if (new_resync && !(MD_STATUS(un) & MD_UN_KEEP_DIRTY)) { 2108 if (!MD_MNSET_SETNO(MD_UN2SET(un)) || 2109 !(ui->ui_tstate & MD_ABR_CAP)) 2110 mirror_process_unit_resync(un); 2111 } 2112 (void) mirror_close_all_devs(un, md_cflags); 2113 2114 /* 2115 * For a MN set with transient capabilities (eg ABR/DMR) set, 2116 * clear these capabilities on the last open in the cluster. 2117 * To do this we send a message to all nodes to see of the 2118 * device is open. 2119 */ 2120 if (MD_MNSET_SETNO(MD_UN2SET(un)) && 2121 (ui->ui_tstate & (MD_ABR_CAP|MD_DMR_CAP))) { 2122 if (lockp) { 2123 (void) md_ioctl_openclose_exit(lockp); 2124 } else { 2125 md_unit_openclose_exit(ui); 2126 } 2127 2128 /* 2129 * if we are in the context of an ioctl, drop the 2130 * ioctl lock. 2131 * Otherwise, no other locks should be held. 2132 */ 2133 if (lockp) { 2134 IOLOCK_RETURN_RELEASE(0, lockp); 2135 } 2136 2137 mdmn_clear_all_capabilities(mnum); 2138 2139 /* if dropped the lock previously, regain it */ 2140 if (lockp) { 2141 IOLOCK_RETURN_REACQUIRE(lockp); 2142 } 2143 return (0); 2144 } 2145 /* unlock and return success */ 2146 } 2147 out: 2148 /* Call whether lockp is NULL or not. */ 2149 if (lockp) { 2150 md_ioctl_openclose_exit(lockp); 2151 } else { 2152 md_unit_openclose_exit(ui); 2153 } 2154 return (err); 2155 } 2156 2157 /* 2158 * When a component has completed resyncing and is now ok, check if the 2159 * corresponding component in the other submirrors is in the Last Erred 2160 * state. If it is, we want to change that to the Erred state so we stop 2161 * using that component and start using this good component instead. 2162 * 2163 * This is called from set_sm_comp_state and recursively calls 2164 * set_sm_comp_state if it needs to change the Last Erred state. 
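 * The recursion is bounded: the nested call moves the component to
 * CS_ERRED, and reset_lasterred is only re-invoked for a CS_RESYNC to
 * CS_OKAY transition (checked at the end of set_sm_comp_state), so no
 * further level of recursion occurs.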
2165 */ 2166 static void 2167 reset_lasterred(mm_unit_t *un, int smi, mddb_recid_t *extras, uint_t flags, 2168 IOLOCK *lockp) 2169 { 2170 mm_submirror_t *sm; 2171 mm_submirror_ic_t *smic; 2172 int ci; 2173 int i; 2174 int compcnt; 2175 int changed = 0; 2176 2177 for (i = 0; i < NMIRROR; i++) { 2178 sm = &un->un_sm[i]; 2179 smic = &un->un_smic[i]; 2180 2181 if (!SMS_IS(sm, SMS_INUSE)) 2182 continue; 2183 2184 /* ignore the submirror that we just made ok */ 2185 if (i == smi) 2186 continue; 2187 2188 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un); 2189 for (ci = 0; ci < compcnt; ci++) { 2190 md_m_shared_t *shared; 2191 2192 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) 2193 (sm->sm_dev, sm, ci); 2194 2195 if ((shared->ms_state & CS_LAST_ERRED) && 2196 !mirror_other_sources(un, i, ci, 1)) { 2197 2198 set_sm_comp_state(un, i, ci, CS_ERRED, extras, 2199 flags, lockp); 2200 changed = 1; 2201 } 2202 } 2203 } 2204 2205 /* maybe there is a hotspare for this newly erred component */ 2206 if (changed) { 2207 set_t setno; 2208 2209 setno = MD_UN2SET(un); 2210 if (MD_MNSET_SETNO(setno)) { 2211 send_poke_hotspares(setno); 2212 } else { 2213 (void) poke_hotspares(); 2214 } 2215 } 2216 } 2217 2218 /* 2219 * set_sm_comp_state 2220 * 2221 * Set the state of a submirror component to the specified new state. 2222 * If the mirror is in a multi-node set, send messages to all nodes to 2223 * block all writes to the mirror and then update the state and release the 2224 * writes. These messages are only sent if MD_STATE_XMIT is set in flags. 2225 * MD_STATE_XMIT will be unset in 2 cases: 2226 * 1. When the state is changed to CS_RESYNC as this state change 2227 * will already have been updated on each node by the processing of the 2228 * distributed metasync command, hence no need to xmit. 2229 * 2. When the state is change to CS_OKAY after a resync has completed. Again 2230 * the resync completion will already have been processed on each node by 2231 * the processing of the MD_MN_MSG_RESYNC_PHASE_DONE message for a component 2232 * resync, hence no need to xmit. 2233 * 2234 * In case we are called from the updates of a watermark, 2235 * (then MD_STATE_WMUPDATE will be set in the ps->flags) this is due to 2236 * a metainit or similar. In this case the message that we sent to propagate 2237 * the state change must not be a class1 message as that would deadlock with 2238 * the metainit command that is still being processed. 2239 * This we achieve by creating a class2 message MD_MN_MSG_STATE_UPDATE2 2240 * instead. This also makes the submessage generator to create a class2 2241 * submessage rather than a class1 (which would also block) 2242 * 2243 * On entry, unit_writerlock is held 2244 * If MD_STATE_OCHELD is set in flags, then unit_openclose lock is 2245 * also held. 
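 * A typical caller outside of ioctl context, such as error_update_unit()
 * which already holds the unit writerlock, marks an errored component
 * with:
 *
 *	set_sm_comp_state(un, smi, ci, CS_ERRED, 0, MD_STATE_XMIT,
 *	    (IOLOCK *)NULL);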
2246 */ 2247 void 2248 set_sm_comp_state( 2249 mm_unit_t *un, 2250 int smi, 2251 int ci, 2252 int newstate, 2253 mddb_recid_t *extras, 2254 uint_t flags, 2255 IOLOCK *lockp 2256 ) 2257 { 2258 mm_submirror_t *sm; 2259 mm_submirror_ic_t *smic; 2260 md_m_shared_t *shared; 2261 int origstate; 2262 void (*get_dev)(); 2263 ms_cd_info_t cd; 2264 char devname[MD_MAX_CTDLEN]; 2265 int err; 2266 set_t setno = MD_UN2SET(un); 2267 md_mn_msg_stch_t stchmsg; 2268 mdi_unit_t *ui = MDI_UNIT(MD_SID(un)); 2269 md_mn_kresult_t *kresult; 2270 int rval; 2271 uint_t msgflags; 2272 md_mn_msgtype_t msgtype; 2273 int save_lock = 0; 2274 mdi_unit_t *ui_sm; 2275 2276 sm = &un->un_sm[smi]; 2277 smic = &un->un_smic[smi]; 2278 2279 /* If we have a real error status then turn off MD_INACCESSIBLE. */ 2280 ui_sm = MDI_UNIT(getminor(md_dev64_to_dev(sm->sm_dev))); 2281 if (newstate & (CS_ERRED | CS_RESYNC | CS_LAST_ERRED) && 2282 ui_sm->ui_tstate & MD_INACCESSIBLE) { 2283 ui_sm->ui_tstate &= ~MD_INACCESSIBLE; 2284 } 2285 2286 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) 2287 (sm->sm_dev, sm, ci); 2288 origstate = shared->ms_state; 2289 2290 /* 2291 * If the new state is an error and the old one wasn't, generate 2292 * a console message. We do this before we send the state to other 2293 * nodes in a MN set because the state change may change the component 2294 * name if a hotspare is allocated. 2295 */ 2296 if ((! (origstate & (CS_ERRED|CS_LAST_ERRED))) && 2297 (newstate & (CS_ERRED|CS_LAST_ERRED))) { 2298 2299 get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0, 2300 "get device", 0); 2301 (void) (*get_dev)(sm->sm_dev, sm, ci, &cd); 2302 2303 err = md_getdevname(setno, mddb_getsidenum(setno), 0, 2304 cd.cd_dev, devname, sizeof (devname)); 2305 2306 if (err == ENOENT) { 2307 (void) md_devname(setno, cd.cd_dev, devname, 2308 sizeof (devname)); 2309 } 2310 2311 cmn_err(CE_WARN, "md: %s: %s needs maintenance", 2312 md_shortname(md_getminor(sm->sm_dev)), devname); 2313 2314 if (newstate & CS_LAST_ERRED) { 2315 cmn_err(CE_WARN, "md: %s: %s last erred", 2316 md_shortname(md_getminor(sm->sm_dev)), 2317 devname); 2318 2319 } else if (shared->ms_flags & MDM_S_ISOPEN) { 2320 /* 2321 * Close the broken device and clear the open flag on 2322 * it. Closing the device means the RCM framework will 2323 * be able to unconfigure the device if required. 2324 * 2325 * We have to check that the device is open, otherwise 2326 * the first open on it has resulted in the error that 2327 * is being processed and the actual cd.cd_dev will be 2328 * NODEV64. 2329 * 2330 * If this is a multi-node mirror, then the multinode 2331 * state checks following this code will cause the 2332 * slave nodes to close the mirror in the function 2333 * mirror_set_state(). 2334 */ 2335 md_layered_close(cd.cd_dev, MD_OFLG_NULL); 2336 shared->ms_flags &= ~MDM_S_ISOPEN; 2337 } 2338 2339 } else if ((origstate & CS_LAST_ERRED) && (newstate & CS_ERRED) && 2340 (shared->ms_flags & MDM_S_ISOPEN)) { 2341 /* 2342 * Similar to logic above except no log messages since we 2343 * are just transitioning from Last Erred to Erred. 
2344 */ 2345 get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0, 2346 "get device", 0); 2347 (void) (*get_dev)(sm->sm_dev, sm, ci, &cd); 2348 2349 md_layered_close(cd.cd_dev, MD_OFLG_NULL); 2350 shared->ms_flags &= ~MDM_S_ISOPEN; 2351 } 2352 2353 if ((MD_MNSET_SETNO(setno)) && (origstate != newstate) && 2354 (flags & MD_STATE_XMIT) && !(ui->ui_tstate & MD_ERR_PENDING)) { 2355 /* 2356 * For a multi-node mirror, send the state change to the 2357 * master, which broadcasts to all nodes, including this 2358 * one. Once the message is received, the state is set 2359 * in-core and the master commits the change to disk. 2360 * There is a case, comp_replace, where this function 2361 * can be called from within an ioctl and therefore in this 2362 * case, as the ioctl will already be called on each node, 2363 * there is no need to xmit the state change to the master for 2364 * distribution to the other nodes. MD_STATE_XMIT flag is used 2365 * to indicate whether a xmit is required. The mirror's 2366 * transient state is set to MD_ERR_PENDING to avoid sending 2367 * multiple messages. 2368 */ 2369 if (newstate & (CS_ERRED|CS_LAST_ERRED)) 2370 ui->ui_tstate |= MD_ERR_PENDING; 2371 2372 /* 2373 * Send a state update message to all nodes. This message 2374 * will generate 2 submessages, the first one to suspend 2375 * all writes to the mirror and the second to update the 2376 * state and resume writes. 2377 */ 2378 stchmsg.msg_stch_mnum = un->c.un_self_id; 2379 stchmsg.msg_stch_sm = smi; 2380 stchmsg.msg_stch_comp = ci; 2381 stchmsg.msg_stch_new_state = newstate; 2382 stchmsg.msg_stch_hs_id = shared->ms_hs_id; 2383 #ifdef DEBUG 2384 if (mirror_debug_flag) 2385 printf("send set state, %x, %x, %x, %x, %x\n", 2386 stchmsg.msg_stch_mnum, stchmsg.msg_stch_sm, 2387 stchmsg.msg_stch_comp, stchmsg.msg_stch_new_state, 2388 stchmsg.msg_stch_hs_id); 2389 #endif 2390 if (flags & MD_STATE_WMUPDATE) { 2391 msgtype = MD_MN_MSG_STATE_UPDATE2; 2392 /* 2393 * When coming from an update of watermarks, there 2394 * must already be a message logged that triggered 2395 * this action. So, no need to log this message, too. 2396 */ 2397 msgflags = MD_MSGF_NO_LOG; 2398 } else { 2399 msgtype = MD_MN_MSG_STATE_UPDATE; 2400 msgflags = MD_MSGF_DEFAULT_FLAGS; 2401 } 2402 2403 /* 2404 * If we are in the context of an ioctl, drop the ioctl lock. 2405 * lockp holds the list of locks held. 2406 * 2407 * Otherwise, increment the appropriate reacquire counters. 2408 * If openclose lock is *held, then must reacquire reader 2409 * lock before releasing the openclose lock. 2410 * Do not drop the ARRAY_WRITER lock as we may not be able 2411 * to reacquire it. 
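 * Instead, the array lock flag (MD_ARRAY_WRITER or MD_ARRAY_READER) is
 * saved in save_lock and cleared from lockp->l_flags so that the
 * release below does not drop that lock; the flag is OR'd back into
 * l_flags once IOLOCK_RETURN_REACQUIRE() has reacquired the others.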
2412 */ 2413 if (lockp) { 2414 if (lockp->l_flags & MD_ARRAY_WRITER) { 2415 save_lock = MD_ARRAY_WRITER; 2416 lockp->l_flags &= ~MD_ARRAY_WRITER; 2417 } else if (lockp->l_flags & MD_ARRAY_READER) { 2418 save_lock = MD_ARRAY_READER; 2419 lockp->l_flags &= ~MD_ARRAY_READER; 2420 } 2421 IOLOCK_RETURN_RELEASE(0, lockp); 2422 } else { 2423 if (flags & MD_STATE_OCHELD) { 2424 md_unit_writerexit(ui); 2425 (void) md_unit_readerlock(ui); 2426 md_unit_openclose_exit(ui); 2427 } else { 2428 md_unit_writerexit(ui); 2429 } 2430 } 2431 2432 kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); 2433 rval = mdmn_ksend_message(setno, msgtype, msgflags, 2434 (char *)&stchmsg, sizeof (stchmsg), kresult); 2435 2436 if (!MDMN_KSEND_MSG_OK(rval, kresult)) { 2437 mdmn_ksend_show_error(rval, kresult, "STATE UPDATE"); 2438 cmn_err(CE_PANIC, 2439 "ksend_message failure: STATE_UPDATE"); 2440 } 2441 kmem_free(kresult, sizeof (md_mn_kresult_t)); 2442 2443 /* if dropped the lock previously, regain it */ 2444 if (lockp) { 2445 IOLOCK_RETURN_REACQUIRE(lockp); 2446 lockp->l_flags |= save_lock; 2447 } else { 2448 /* 2449 * Reacquire dropped locks and update acquirecnts 2450 * appropriately. 2451 */ 2452 if (flags & MD_STATE_OCHELD) { 2453 /* 2454 * openclose also grabs readerlock. 2455 */ 2456 (void) md_unit_openclose_enter(ui); 2457 md_unit_readerexit(ui); 2458 (void) md_unit_writerlock(ui); 2459 } else { 2460 (void) md_unit_writerlock(ui); 2461 } 2462 } 2463 2464 ui->ui_tstate &= ~MD_ERR_PENDING; 2465 } else { 2466 shared->ms_state = newstate; 2467 uniqtime32(&shared->ms_timestamp); 2468 2469 if (newstate == CS_ERRED) 2470 shared->ms_flags |= MDM_S_NOWRITE; 2471 else 2472 shared->ms_flags &= ~MDM_S_NOWRITE; 2473 2474 shared->ms_flags &= ~MDM_S_IOERR; 2475 un->un_changecnt++; 2476 shared->ms_lasterrcnt = un->un_changecnt; 2477 2478 mirror_set_sm_state(sm, smic, SMS_RUNNING, 0); 2479 mirror_commit(un, SMI2BIT(smi), extras); 2480 } 2481 2482 if ((origstate & CS_RESYNC) && (newstate & CS_OKAY)) { 2483 /* 2484 * Resetting the Last Erred state will recursively call back 2485 * into this function (set_sm_comp_state) to update the state. 2486 */ 2487 reset_lasterred(un, smi, extras, flags, lockp); 2488 } 2489 } 2490 2491 static int 2492 find_another_logical( 2493 mm_unit_t *un, 2494 mm_submirror_t *esm, 2495 diskaddr_t blk, 2496 u_longlong_t cnt, 2497 int must_be_open, 2498 int state, 2499 int err_cnt) 2500 { 2501 u_longlong_t cando; 2502 md_dev64_t dev; 2503 md_m_shared_t *s; 2504 2505 esm->sm_state |= SMS_IGNORE; 2506 while (cnt != 0) { 2507 u_longlong_t mcnt; 2508 2509 mcnt = MIN(cnt, lbtodb(1024 * 1024 * 1024)); /* 1 Gig Blks */ 2510 2511 dev = select_read_unit(un, blk, mcnt, &cando, 2512 must_be_open, &s, NULL); 2513 if (dev == (md_dev64_t)0) 2514 break; 2515 2516 if ((state == CS_LAST_ERRED) && 2517 (s->ms_state == CS_LAST_ERRED) && 2518 (err_cnt > s->ms_lasterrcnt)) 2519 break; 2520 2521 cnt -= cando; 2522 blk += cando; 2523 } 2524 esm->sm_state &= ~SMS_IGNORE; 2525 return (cnt != 0); 2526 } 2527 2528 int 2529 mirror_other_sources(mm_unit_t *un, int smi, int ci, int must_be_open) 2530 { 2531 mm_submirror_t *sm; 2532 mm_submirror_ic_t *smic; 2533 size_t count; 2534 diskaddr_t block; 2535 u_longlong_t skip; 2536 u_longlong_t size; 2537 md_dev64_t dev; 2538 int cnt; 2539 md_m_shared_t *s; 2540 int not_found; 2541 2542 sm = &un->un_sm[smi]; 2543 smic = &un->un_smic[smi]; 2544 dev = sm->sm_dev; 2545 2546 /* 2547 * Make sure every component of the submirror 2548 * has other sources. 
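 * A component index of -1 means check every component of the submirror;
 * the return value is nonzero if some part of the range has no
 * alternate source and zero if every block can be read from elsewhere.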
2549 */ 2550 if (ci < 0) { 2551 /* Find the highest lasterrcnt */ 2552 cnt = (*(smic->sm_get_component_count))(dev, sm); 2553 for (ci = 0; ci < cnt; ci++) { 2554 not_found = mirror_other_sources(un, smi, ci, 2555 must_be_open); 2556 if (not_found) 2557 return (1); 2558 } 2559 return (0); 2560 } 2561 2562 /* 2563 * Make sure this component has other sources 2564 */ 2565 (void) (*(smic->sm_get_bcss)) 2566 (dev, sm, ci, &block, &count, &skip, &size); 2567 2568 if (count == 0) 2569 return (1); 2570 2571 s = (md_m_shared_t *)(*(smic->sm_shared_by_indx))(dev, sm, ci); 2572 2573 while (count--) { 2574 if (block >= un->c.un_total_blocks) 2575 return (0); 2576 2577 if ((block + size) > un->c.un_total_blocks) 2578 size = un->c.un_total_blocks - block; 2579 2580 not_found = find_another_logical(un, sm, block, size, 2581 must_be_open, s->ms_state, s->ms_lasterrcnt); 2582 if (not_found) 2583 return (1); 2584 2585 block += size + skip; 2586 } 2587 return (0); 2588 } 2589 2590 static void 2591 finish_error(md_mps_t *ps) 2592 { 2593 struct buf *pb; 2594 mm_unit_t *un; 2595 mdi_unit_t *ui; 2596 uint_t new_str_flags; 2597 2598 pb = ps->ps_bp; 2599 un = ps->ps_un; 2600 ui = ps->ps_ui; 2601 2602 /* 2603 * Must flag any error to the resync originator if we're performing 2604 * a Write-after-Read. This corresponds to an i/o error on a resync 2605 * target device and in this case we ought to abort the resync as there 2606 * is nothing that can be done to recover from this without operator 2607 * intervention. If we don't set the B_ERROR flag we will continue 2608 * reading from the mirror but won't write to the target (as it will 2609 * have been placed into an errored state). 2610 * To handle the case of multiple components within a submirror we only 2611 * set the B_ERROR bit if explicitly requested to via MD_MPS_FLAG_ERROR. 2612 * The originator of the resync read will cause this bit to be set if 2613 * the underlying component count is one for a submirror resync. All 2614 * other resync types will have the flag set as there is no underlying 2615 * resync which can be performed on a contained metadevice for these 2616 * resync types (optimized or component). 2617 */ 2618 2619 if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ) { 2620 if (ps->ps_flags & MD_MPS_FLAG_ERROR) 2621 pb->b_flags |= B_ERROR; 2622 md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 2623 MPS_FREE(mirror_parent_cache, ps); 2624 md_unit_readerexit(ui); 2625 md_biodone(pb); 2626 return; 2627 } 2628 /* 2629 * Set the MD_IO_COUNTED flag as we are retrying the same I/O 2630 * operation therefore this I/O request has already been counted, 2631 * the I/O count variable will be decremented by mirror_done()'s 2632 * call to md_biodone(). 2633 */ 2634 if (ps->ps_changecnt != un->un_changecnt) { 2635 new_str_flags = MD_STR_NOTTOP | MD_IO_COUNTED; 2636 if (ps->ps_flags & MD_MPS_WOW) 2637 new_str_flags |= MD_STR_WOW; 2638 if (ps->ps_flags & MD_MPS_MAPPED) 2639 new_str_flags |= MD_STR_MAPPED; 2640 /* 2641 * If this I/O request was a read that was part of a resync, 2642 * set MD_STR_WAR for the retried read to ensure that the 2643 * resync write (i.e. 
write-after-read) will be performed 2644 */ 2645 if (ps->ps_flags & MD_MPS_RESYNC_READ) 2646 new_str_flags |= MD_STR_WAR; 2647 md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 2648 MPS_FREE(mirror_parent_cache, ps); 2649 md_unit_readerexit(ui); 2650 (void) md_mirror_strategy(pb, new_str_flags, NULL); 2651 return; 2652 } 2653 2654 pb->b_flags |= B_ERROR; 2655 md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 2656 MPS_FREE(mirror_parent_cache, ps); 2657 md_unit_readerexit(ui); 2658 md_biodone(pb); 2659 } 2660 2661 static void 2662 error_update_unit(md_mps_t *ps) 2663 { 2664 mm_unit_t *un; 2665 mdi_unit_t *ui; 2666 int smi; /* sub mirror index */ 2667 int ci; /* errored component */ 2668 set_t setno; 2669 uint_t flags; /* for set_sm_comp_state() */ 2670 uint_t hspflags; /* for check_comp_4_hotspares() */ 2671 2672 ui = ps->ps_ui; 2673 un = (mm_unit_t *)md_unit_writerlock(ui); 2674 setno = MD_UN2SET(un); 2675 2676 /* All of these updates have to propagated in case of MN set */ 2677 flags = MD_STATE_XMIT; 2678 hspflags = MD_HOTSPARE_XMIT; 2679 2680 /* special treatment if we are called during updating watermarks */ 2681 if (ps->ps_flags & MD_MPS_WMUPDATE) { 2682 flags |= MD_STATE_WMUPDATE; 2683 hspflags |= MD_HOTSPARE_WMUPDATE; 2684 } 2685 smi = 0; 2686 ci = 0; 2687 while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) { 2688 if (mirror_other_sources(un, smi, ci, 0) == 1) { 2689 2690 /* Never called from ioctl context, so (IOLOCK *)NULL */ 2691 set_sm_comp_state(un, smi, ci, CS_LAST_ERRED, 0, flags, 2692 (IOLOCK *)NULL); 2693 /* 2694 * For a MN set, the NOTIFY is done when the state 2695 * change is processed on each node 2696 */ 2697 if (!MD_MNSET_SETNO(MD_UN2SET(un))) { 2698 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED, 2699 SVM_TAG_METADEVICE, setno, MD_SID(un)); 2700 } 2701 continue; 2702 } 2703 /* Never called from ioctl context, so (IOLOCK *)NULL */ 2704 set_sm_comp_state(un, smi, ci, CS_ERRED, 0, flags, 2705 (IOLOCK *)NULL); 2706 /* 2707 * For a MN set, the NOTIFY is done when the state 2708 * change is processed on each node 2709 */ 2710 if (!MD_MNSET_SETNO(MD_UN2SET(un))) { 2711 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, 2712 SVM_TAG_METADEVICE, setno, MD_SID(un)); 2713 } 2714 smi = 0; 2715 ci = 0; 2716 } 2717 2718 md_unit_writerexit(ui); 2719 if (MD_MNSET_SETNO(setno)) { 2720 send_poke_hotspares(setno); 2721 } else { 2722 (void) poke_hotspares(); 2723 } 2724 (void) md_unit_readerlock(ui); 2725 2726 finish_error(ps); 2727 } 2728 2729 /* 2730 * When we have a B_FAILFAST IO error on a Last Erred component we need to 2731 * retry the IO without B_FAILFAST set so that we try to ensure that the 2732 * component "sees" each IO. 
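 * The retry reuses the original child buf: B_FAILFAST and the error
 * status are cleared and the buf is reissued through md_call_strategy()
 * with flags rebuilt from the parent save structure.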
2733 */ 2734 static void 2735 last_err_retry(md_mcs_t *cs) 2736 { 2737 struct buf *cb; 2738 md_mps_t *ps; 2739 uint_t flags; 2740 2741 cb = &cs->cs_buf; 2742 cb->b_flags &= ~B_FAILFAST; 2743 2744 /* if we're panicing just let this I/O error out */ 2745 if (panicstr) { 2746 (void) mirror_done(cb); 2747 return; 2748 } 2749 2750 /* reissue the I/O */ 2751 2752 ps = cs->cs_ps; 2753 2754 bioerror(cb, 0); 2755 2756 mutex_enter(&ps->ps_mx); 2757 2758 flags = MD_STR_NOTTOP; 2759 if (ps->ps_flags & MD_MPS_MAPPED) 2760 flags |= MD_STR_MAPPED; 2761 if (ps->ps_flags & MD_MPS_NOBLOCK) 2762 flags |= MD_NOBLOCK; 2763 2764 mutex_exit(&ps->ps_mx); 2765 2766 clear_retry_error(cb); 2767 2768 cmn_err(CE_NOTE, "!md: %s: Last Erred, retry I/O without B_FAILFAST", 2769 md_shortname(getminor(cb->b_edev))); 2770 2771 md_call_strategy(cb, flags, NULL); 2772 } 2773 2774 static void 2775 mirror_error(md_mps_t *ps) 2776 { 2777 int smi; /* sub mirror index */ 2778 int ci; /* errored component */ 2779 2780 if (panicstr) { 2781 finish_error(ps); 2782 return; 2783 } 2784 2785 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 2786 mirror_overlap_tree_remove(ps); 2787 2788 smi = 0; 2789 ci = 0; 2790 if (mirror_geterror(ps->ps_un, &smi, &ci, 0, 0) != 0) { 2791 md_unit_readerexit(ps->ps_ui); 2792 daemon_request(&md_mstr_daemon, error_update_unit, 2793 (daemon_queue_t *)ps, REQ_OLD); 2794 return; 2795 } 2796 2797 finish_error(ps); 2798 } 2799 2800 static int 2801 copy_write_done(struct buf *cb) 2802 { 2803 md_mps_t *ps; 2804 buf_t *pb; 2805 char *wowbuf; 2806 wowhdr_t *wowhdr; 2807 ssize_t wow_resid; 2808 2809 /* get wowbuf ans save structure */ 2810 wowbuf = cb->b_un.b_addr; 2811 wowhdr = WOWBUF_HDR(wowbuf); 2812 ps = wowhdr->wow_ps; 2813 pb = ps->ps_bp; 2814 2815 /* Save error information, then free cb */ 2816 if (cb->b_flags & B_ERROR) 2817 pb->b_flags |= B_ERROR; 2818 2819 if (cb->b_flags & B_REMAPPED) 2820 bp_mapout(cb); 2821 2822 freerbuf(cb); 2823 2824 /* update residual and continue if needed */ 2825 if ((pb->b_flags & B_ERROR) == 0) { 2826 wow_resid = pb->b_bcount - wowhdr->wow_offset; 2827 pb->b_resid = wow_resid; 2828 if (wow_resid > 0) { 2829 daemon_request(&md_mstr_daemon, copy_write_cont, 2830 (daemon_queue_t *)wowhdr, REQ_OLD); 2831 return (1); 2832 } 2833 } 2834 2835 /* Write is complete, release resources. 
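 * The write-on-write header and the parent save structure are freed
 * and the original buf is biodone'd.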
*/ 2836 kmem_cache_free(mirror_wowblk_cache, wowhdr); 2837 ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP)); 2838 md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 2839 MPS_FREE(mirror_parent_cache, ps); 2840 md_biodone(pb); 2841 return (0); 2842 } 2843 2844 static void 2845 copy_write_cont(wowhdr_t *wowhdr) 2846 { 2847 buf_t *pb; 2848 buf_t *cb; 2849 char *wowbuf; 2850 int wow_offset; 2851 size_t wow_resid; 2852 diskaddr_t wow_blkno; 2853 2854 wowbuf = WOWHDR_BUF(wowhdr); 2855 pb = wowhdr->wow_ps->ps_bp; 2856 2857 /* get data on current location */ 2858 wow_offset = wowhdr->wow_offset; 2859 wow_resid = pb->b_bcount - wow_offset; 2860 wow_blkno = pb->b_lblkno + lbtodb(wow_offset); 2861 2862 /* setup child buffer */ 2863 cb = getrbuf(KM_SLEEP); 2864 cb->b_flags = B_WRITE; 2865 cb->b_edev = pb->b_edev; 2866 cb->b_un.b_addr = wowbuf; /* change to point at WOWBUF */ 2867 cb->b_bufsize = md_wowbuf_size; /* change to wowbuf_size */ 2868 cb->b_iodone = copy_write_done; 2869 cb->b_bcount = MIN(md_wowbuf_size, wow_resid); 2870 cb->b_lblkno = wow_blkno; 2871 2872 /* move offset to next section */ 2873 wowhdr->wow_offset += cb->b_bcount; 2874 2875 /* copy and setup write for current section */ 2876 bcopy(&pb->b_un.b_addr[wow_offset], wowbuf, cb->b_bcount); 2877 2878 /* do it */ 2879 /* 2880 * Do not set the MD_IO_COUNTED flag as this is a new I/O request 2881 * that handles the WOW condition. The resultant increment on the 2882 * I/O count variable is cleared by copy_write_done()'s call to 2883 * md_biodone(). 2884 */ 2885 (void) md_mirror_strategy(cb, MD_STR_NOTTOP | MD_STR_WOW 2886 | MD_STR_MAPPED, NULL); 2887 } 2888 2889 static void 2890 md_mirror_copy_write(md_mps_t *ps) 2891 { 2892 wowhdr_t *wowhdr; 2893 2894 wowhdr = kmem_cache_alloc(mirror_wowblk_cache, MD_ALLOCFLAGS); 2895 mirror_wowblk_init(wowhdr); 2896 wowhdr->wow_ps = ps; 2897 wowhdr->wow_offset = 0; 2898 copy_write_cont(wowhdr); 2899 } 2900 2901 static void 2902 handle_wow(md_mps_t *ps) 2903 { 2904 buf_t *pb; 2905 2906 pb = ps->ps_bp; 2907 2908 bp_mapin(pb); 2909 2910 md_mirror_wow_cnt++; 2911 if (!(pb->b_flags & B_PHYS) && (md_mirror_wow_flg & WOW_LOGIT)) { 2912 cmn_err(CE_NOTE, 2913 "md: %s, blk %lld, cnt %ld: Write on write %d occurred", 2914 md_shortname(getminor(pb->b_edev)), 2915 (longlong_t)pb->b_lblkno, pb->b_bcount, md_mirror_wow_cnt); 2916 } 2917 2918 /* 2919 * Set the MD_IO_COUNTED flag as we are retrying the same I/O 2920 * operation therefore this I/O request has already been counted, 2921 * the I/O count variable will be decremented by mirror_done()'s 2922 * call to md_biodone(). 2923 */ 2924 if (md_mirror_wow_flg & WOW_NOCOPY) 2925 (void) md_mirror_strategy(pb, MD_STR_NOTTOP | MD_STR_WOW | 2926 MD_STR_MAPPED | MD_IO_COUNTED, ps); 2927 else 2928 md_mirror_copy_write(ps); 2929 } 2930 2931 /* 2932 * Return true if the specified submirror is either in the Last Erred 2933 * state or is transitioning into the Last Erred state. 
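 * "Transitioning" here means a component that is still CS_OKAY or
 * CS_RESYNC but has MDM_S_IOERR set and no remaining alternate source,
 * and so will be moved to Last Erred once the error is processed.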
2934 */ 2935 static bool_t 2936 submirror_is_lasterred(mm_unit_t *un, int smi) 2937 { 2938 mm_submirror_t *sm; 2939 mm_submirror_ic_t *smic; 2940 md_m_shared_t *shared; 2941 int ci; 2942 int compcnt; 2943 2944 sm = &un->un_sm[smi]; 2945 smic = &un->un_smic[smi]; 2946 2947 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un); 2948 for (ci = 0; ci < compcnt; ci++) { 2949 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) 2950 (sm->sm_dev, sm, ci); 2951 2952 if (shared->ms_state == CS_LAST_ERRED) 2953 return (B_TRUE); 2954 2955 /* 2956 * It is not currently Last Erred, check if entering Last Erred. 2957 */ 2958 if ((shared->ms_flags & MDM_S_IOERR) && 2959 ((shared->ms_state == CS_OKAY) || 2960 (shared->ms_state == CS_RESYNC))) { 2961 if (mirror_other_sources(un, smi, ci, 0) == 1) 2962 return (B_TRUE); 2963 } 2964 } 2965 2966 return (B_FALSE); 2967 } 2968 2969 2970 static int 2971 mirror_done(struct buf *cb) 2972 { 2973 md_mps_t *ps; 2974 md_mcs_t *cs; 2975 2976 /*LINTED*/ 2977 cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off); 2978 ps = cs->cs_ps; 2979 2980 mutex_enter(&ps->ps_mx); 2981 2982 /* check if we need to retry an errored failfast I/O */ 2983 if (cb->b_flags & B_ERROR) { 2984 struct buf *pb = ps->ps_bp; 2985 2986 if (cb->b_flags & B_FAILFAST) { 2987 int i; 2988 mm_unit_t *un = ps->ps_un; 2989 2990 for (i = 0; i < NMIRROR; i++) { 2991 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) 2992 continue; 2993 2994 if (cb->b_edev == 2995 md_dev64_to_dev(un->un_sm[i].sm_dev)) { 2996 2997 /* 2998 * This is the submirror that had the 2999 * error. Check if it is Last Erred. 3000 */ 3001 if (submirror_is_lasterred(un, i)) { 3002 daemon_queue_t *dqp; 3003 3004 mutex_exit(&ps->ps_mx); 3005 dqp = (daemon_queue_t *)cs; 3006 dqp->dq_prev = NULL; 3007 dqp->dq_next = NULL; 3008 daemon_request(&md_done_daemon, 3009 last_err_retry, dqp, 3010 REQ_OLD); 3011 return (1); 3012 } 3013 break; 3014 } 3015 } 3016 } 3017 3018 /* continue to process the buf without doing a retry */ 3019 ps->ps_flags |= MD_MPS_ERROR; 3020 pb->b_error = cb->b_error; 3021 } 3022 3023 return (mirror_done_common(cb)); 3024 } 3025 3026 /* 3027 * Split from the original mirror_done function so we can handle bufs after a 3028 * retry. 3029 * ps->ps_mx is already held in the caller of this function and the cb error 3030 * has already been checked and handled in the caller. 3031 */ 3032 static int 3033 mirror_done_common(struct buf *cb) 3034 { 3035 struct buf *pb; 3036 mm_unit_t *un; 3037 mdi_unit_t *ui; 3038 md_mps_t *ps; 3039 md_mcs_t *cs; 3040 size_t end_rr, start_rr, current_rr; 3041 3042 /*LINTED*/ 3043 cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off); 3044 ps = cs->cs_ps; 3045 pb = ps->ps_bp; 3046 3047 if (cb->b_flags & B_REMAPPED) 3048 bp_mapout(cb); 3049 3050 ps->ps_frags--; 3051 if (ps->ps_frags != 0) { 3052 mutex_exit(&ps->ps_mx); 3053 kmem_cache_free(mirror_child_cache, cs); 3054 return (1); 3055 } 3056 un = ps->ps_un; 3057 ui = ps->ps_ui; 3058 3059 /* 3060 * Do not update outstanding_writes if we're running with ABR 3061 * set for this mirror or the write() was issued with MD_STR_ABR set. 3062 * Also a resync initiated write() has no outstanding_writes update 3063 * either. 
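 * When the count is maintained, every resync region spanned by the
 * request (ps_firstblk through ps_lastblk) has its outstanding write
 * count decremented under un_resync_mx.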
3064 */ 3065 if (((cb->b_flags & B_READ) == 0) && 3066 (un->un_nsm >= 2) && 3067 (ps->ps_call == NULL) && 3068 !((ui->ui_tstate & MD_ABR_CAP) || (ps->ps_flags & MD_MPS_ABR)) && 3069 !(ps->ps_flags & MD_MPS_WRITE_AFTER_READ)) { 3070 BLK_TO_RR(end_rr, ps->ps_lastblk, un); 3071 BLK_TO_RR(start_rr, ps->ps_firstblk, un); 3072 mutex_enter(&un->un_resync_mx); 3073 for (current_rr = start_rr; current_rr <= end_rr; current_rr++) 3074 un->un_outstanding_writes[current_rr]--; 3075 mutex_exit(&un->un_resync_mx); 3076 } 3077 kmem_cache_free(mirror_child_cache, cs); 3078 mutex_exit(&ps->ps_mx); 3079 3080 if (ps->ps_call != NULL) { 3081 daemon_request(&md_done_daemon, ps->ps_call, 3082 (daemon_queue_t *)ps, REQ_OLD); 3083 return (1); 3084 } 3085 3086 if ((ps->ps_flags & MD_MPS_ERROR)) { 3087 daemon_request(&md_done_daemon, mirror_error, 3088 (daemon_queue_t *)ps, REQ_OLD); 3089 return (1); 3090 } 3091 3092 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 3093 mirror_overlap_tree_remove(ps); 3094 3095 /* 3096 * Handle Write-on-Write problem. 3097 * Skip In case of Raw and Direct I/O as they are 3098 * handled earlier. 3099 * 3100 */ 3101 if (!(md_mirror_wow_flg & WOW_DISABLE) && 3102 !(pb->b_flags & B_READ) && 3103 !(ps->ps_flags & MD_MPS_WOW) && 3104 !(pb->b_flags & B_PHYS) && 3105 any_pages_dirty(pb)) { 3106 md_unit_readerexit(ps->ps_ui); 3107 daemon_request(&md_mstr_daemon, handle_wow, 3108 (daemon_queue_t *)ps, REQ_OLD); 3109 return (1); 3110 } 3111 3112 md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 3113 MPS_FREE(mirror_parent_cache, ps); 3114 md_unit_readerexit(ui); 3115 md_biodone(pb); 3116 return (0); 3117 } 3118 3119 /* 3120 * Clear error state in submirror component if the retry worked after 3121 * a failfast error. 3122 */ 3123 static void 3124 clear_retry_error(struct buf *cb) 3125 { 3126 int smi; 3127 md_mcs_t *cs; 3128 mm_unit_t *un; 3129 mdi_unit_t *ui_sm; 3130 mm_submirror_t *sm; 3131 mm_submirror_ic_t *smic; 3132 u_longlong_t cnt; 3133 md_m_shared_t *shared; 3134 3135 /*LINTED*/ 3136 cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off); 3137 un = cs->cs_ps->ps_un; 3138 3139 for (smi = 0; smi < NMIRROR; smi++) { 3140 if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) 3141 continue; 3142 3143 if (cb->b_edev == md_dev64_to_dev(un->un_sm[smi].sm_dev)) 3144 break; 3145 } 3146 3147 if (smi >= NMIRROR) 3148 return; 3149 3150 sm = &un->un_sm[smi]; 3151 smic = &un->un_smic[smi]; 3152 cnt = cb->b_bcount; 3153 3154 ui_sm = MDI_UNIT(getminor(cb->b_edev)); 3155 (void) md_unit_writerlock(ui_sm); 3156 3157 shared = (md_m_shared_t *)(*(smic->sm_shared_by_blk))(sm->sm_dev, sm, 3158 cb->b_blkno, &cnt); 3159 3160 if (shared->ms_flags & MDM_S_IOERR) { 3161 shared->ms_flags &= ~MDM_S_IOERR; 3162 3163 } else { 3164 /* the buf spans components and the first one is not erred */ 3165 int cnt; 3166 int i; 3167 3168 cnt = (*(smic->sm_get_component_count))(sm->sm_dev, un); 3169 for (i = 0; i < cnt; i++) { 3170 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) 3171 (sm->sm_dev, sm, i); 3172 3173 if (shared->ms_flags & MDM_S_IOERR && 3174 shared->ms_state == CS_OKAY) { 3175 3176 shared->ms_flags &= ~MDM_S_IOERR; 3177 break; 3178 } 3179 } 3180 } 3181 3182 md_unit_writerexit(ui_sm); 3183 } 3184 3185 static size_t 3186 mirror_map_read( 3187 md_mps_t *ps, 3188 md_mcs_t *cs, 3189 diskaddr_t blkno, 3190 u_longlong_t count 3191 ) 3192 { 3193 mm_unit_t *un; 3194 buf_t *bp; 3195 u_longlong_t cando; 3196 3197 bp = &cs->cs_buf; 3198 un = ps->ps_un; 3199 3200 bp->b_lblkno = blkno; 3201 if (fast_select_read_unit(ps, cs) == 0) 
{ 3202 bp->b_bcount = ldbtob(count); 3203 return (0); 3204 } 3205 bp->b_edev = md_dev64_to_dev(select_read_unit(un, blkno, 3206 count, &cando, 0, NULL, cs)); 3207 bp->b_bcount = ldbtob(cando); 3208 if (count != cando) 3209 return (cando); 3210 return (0); 3211 } 3212 3213 static void 3214 write_after_read(md_mps_t *ps) 3215 { 3216 struct buf *pb; 3217 int flags; 3218 3219 if (ps->ps_flags & MD_MPS_ERROR) { 3220 mirror_error(ps); 3221 return; 3222 } 3223 3224 pb = ps->ps_bp; 3225 md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 3226 ps->ps_call = NULL; 3227 ps->ps_flags |= MD_MPS_WRITE_AFTER_READ; 3228 flags = MD_STR_NOTTOP | MD_STR_WAR; 3229 if (ps->ps_flags & MD_MPS_MAPPED) 3230 flags |= MD_STR_MAPPED; 3231 if (ps->ps_flags & MD_MPS_NOBLOCK) 3232 flags |= MD_NOBLOCK; 3233 if (ps->ps_flags & MD_MPS_DIRTY_RD) 3234 flags |= MD_STR_DIRTY_RD; 3235 (void) mirror_write_strategy(pb, flags, ps); 3236 } 3237 3238 static void 3239 continue_serial(md_mps_t *ps) 3240 { 3241 md_mcs_t *cs; 3242 buf_t *cb; 3243 mm_unit_t *un; 3244 int flags; 3245 3246 un = ps->ps_un; 3247 cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS); 3248 mirror_child_init(cs); 3249 cb = &cs->cs_buf; 3250 ps->ps_call = NULL; 3251 ps->ps_frags = 1; 3252 (void) mirror_map_write(un, cs, ps, 0); 3253 flags = MD_STR_NOTTOP; 3254 if (ps->ps_flags & MD_MPS_MAPPED) 3255 flags |= MD_STR_MAPPED; 3256 md_call_strategy(cb, flags, NULL); 3257 } 3258 3259 static int 3260 mirror_map_write(mm_unit_t *un, md_mcs_t *cs, md_mps_t *ps, int war) 3261 { 3262 int i; 3263 dev_t dev; /* needed for bioclone, so not md_dev64_t */ 3264 buf_t *cb; 3265 buf_t *pb; 3266 diskaddr_t blkno; 3267 size_t bcount; 3268 off_t offset; 3269 3270 pb = ps->ps_bp; 3271 cb = &cs->cs_buf; 3272 cs->cs_ps = ps; 3273 3274 i = md_find_nth_unit(ps->ps_writable_sm, ps->ps_current_sm); 3275 3276 dev = md_dev64_to_dev(un->un_sm[i].sm_dev); 3277 3278 blkno = pb->b_lblkno; 3279 bcount = pb->b_bcount; 3280 offset = 0; 3281 if (war && (blkno == 0) && (un->c.un_flag & MD_LABELED)) { 3282 blkno = DK_LABEL_LOC + 1; 3283 /* 3284 * This handles the case where we're requesting 3285 * a write to block 0 on a label partition 3286 * and the request size was smaller than the 3287 * size of the label. If this is the case 3288 * then we'll return -1. Failure to do so will 3289 * either cause the calling thread to hang due to 3290 * an ssd bug, or worse if the bcount were allowed 3291 * to go negative (ie large). 3292 */ 3293 if (bcount <= DEV_BSIZE*(DK_LABEL_LOC + 1)) 3294 return (-1); 3295 bcount -= (DEV_BSIZE*(DK_LABEL_LOC + 1)); 3296 offset = (DEV_BSIZE*(DK_LABEL_LOC + 1)); 3297 } 3298 3299 cb = md_bioclone(pb, offset, bcount, dev, blkno, mirror_done, 3300 cb, KM_NOSLEEP); 3301 if (war) 3302 cb->b_flags = (cb->b_flags & ~B_READ) | B_WRITE; 3303 3304 /* 3305 * If the submirror is in the erred stated, check if any component is 3306 * in the Last Erred state. If so, we don't want to use the B_FAILFAST 3307 * flag on the IO. 3308 * 3309 * Provide a fast path for the non-erred case (which should be the 3310 * normal case). 
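 * (B_FAILFAST is set on the child buf only when the submirror has
 * MD_SM_FAILFAST set and none of its components is in the Last Erred
 * state.)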
3311 */ 3312 if (un->un_sm[i].sm_flags & MD_SM_FAILFAST) { 3313 if (un->un_sm[i].sm_state & SMS_COMP_ERRED) { 3314 mm_submirror_t *sm; 3315 mm_submirror_ic_t *smic; 3316 int ci; 3317 int compcnt; 3318 3319 sm = &un->un_sm[i]; 3320 smic = &un->un_smic[i]; 3321 3322 compcnt = (*(smic->sm_get_component_count)) 3323 (sm->sm_dev, un); 3324 for (ci = 0; ci < compcnt; ci++) { 3325 md_m_shared_t *shared; 3326 3327 shared = (md_m_shared_t *) 3328 (*(smic->sm_shared_by_indx))(sm->sm_dev, 3329 sm, ci); 3330 3331 if (shared->ms_state == CS_LAST_ERRED) 3332 break; 3333 } 3334 if (ci >= compcnt) 3335 cb->b_flags |= B_FAILFAST; 3336 3337 } else { 3338 cb->b_flags |= B_FAILFAST; 3339 } 3340 } 3341 3342 ps->ps_current_sm++; 3343 if (ps->ps_current_sm != ps->ps_active_cnt) { 3344 if (un->un_write_option == WR_SERIAL) { 3345 ps->ps_call = continue_serial; 3346 return (0); 3347 } 3348 return (1); 3349 } 3350 return (0); 3351 } 3352 3353 /* 3354 * directed_read_done: 3355 * ------------------ 3356 * Completion routine called when a DMR request has been returned from the 3357 * underlying driver. Wake-up the original ioctl() and return the data to 3358 * the user. 3359 */ 3360 static void 3361 directed_read_done(md_mps_t *ps) 3362 { 3363 mm_unit_t *un; 3364 mdi_unit_t *ui; 3365 3366 un = ps->ps_un; 3367 ui = ps->ps_ui; 3368 3369 md_unit_readerexit(ui); 3370 md_kstat_done(ui, ps->ps_bp, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 3371 ps->ps_call = NULL; 3372 3373 mutex_enter(&un->un_dmr_mx); 3374 cv_signal(&un->un_dmr_cv); 3375 mutex_exit(&un->un_dmr_mx); 3376 3377 /* release the parent structure */ 3378 kmem_cache_free(mirror_parent_cache, ps); 3379 } 3380 3381 /* 3382 * daemon_io: 3383 * ------------ 3384 * Called to issue a mirror_write_strategy() or mirror_read_strategy 3385 * call from a blockable context. NOTE: no mutex can be held on entry to this 3386 * routine 3387 */ 3388 static void 3389 daemon_io(daemon_queue_t *dq) 3390 { 3391 md_mps_t *ps = (md_mps_t *)dq; 3392 int flag = MD_STR_NOTTOP; 3393 buf_t *pb = ps->ps_bp; 3394 3395 if (ps->ps_flags & MD_MPS_MAPPED) 3396 flag |= MD_STR_MAPPED; 3397 if (ps->ps_flags & MD_MPS_WOW) 3398 flag |= MD_STR_WOW; 3399 if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ) 3400 flag |= MD_STR_WAR; 3401 if (ps->ps_flags & MD_MPS_ABR) 3402 flag |= MD_STR_ABR; 3403 3404 /* 3405 * If this is a resync read, ie MD_STR_DIRTY_RD not set, set 3406 * MD_STR_WAR before calling mirror_read_strategy 3407 */ 3408 if (pb->b_flags & B_READ) { 3409 if (!(ps->ps_flags & MD_MPS_DIRTY_RD)) 3410 flag |= MD_STR_WAR; 3411 mirror_read_strategy(pb, flag, ps); 3412 } else 3413 mirror_write_strategy(pb, flag, ps); 3414 } 3415 3416 /* 3417 * update_resync: 3418 * ------------- 3419 * Called to update the in-core version of the resync record with the latest 3420 * version that was committed to disk when the previous mirror owner 3421 * relinquished ownership. This call is likely to block as we must hold-off 3422 * any current resync processing that may be occurring. 3423 * On completion of the resync record update we issue the mirror_write_strategy 3424 * call to complete the i/o that first started this sequence. To remove a race 3425 * condition between a new write() request which is submitted and the resync 3426 * record update we acquire the writerlock. This will hold off all i/o to the 3427 * mirror until the resync update has completed. 
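 * Once the re-read resync record has been folded into the in-core
 * dirty region map, the resync thread is woken if it was blocked on
 * MD_RI_BLOCK_OWNER and the deferred i/o is reissued via daemon_io().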
3428 * NOTE: no mutex can be held on entry to this routine 3429 */ 3430 static void 3431 update_resync(daemon_queue_t *dq) 3432 { 3433 md_mps_t *ps = (md_mps_t *)dq; 3434 buf_t *pb = ps->ps_bp; 3435 mdi_unit_t *ui = ps->ps_ui; 3436 mm_unit_t *un; 3437 set_t setno; 3438 int restart_resync; 3439 3440 un = md_unit_writerlock(ui); 3441 ps->ps_un = un; 3442 setno = MD_MIN2SET(getminor(pb->b_edev)); 3443 if (mddb_reread_rr(setno, un->un_rr_dirty_recid) == 0) { 3444 /* 3445 * Synchronize our in-core view of what regions need to be 3446 * resync'd with the on-disk version. 3447 */ 3448 mutex_enter(&un->un_rrp_inflight_mx); 3449 mirror_copy_rr(howmany(un->un_rrd_num, NBBY), un->un_resync_bm, 3450 un->un_dirty_bm); 3451 mutex_exit(&un->un_rrp_inflight_mx); 3452 3453 /* Region dirty map is now up to date */ 3454 } 3455 restart_resync = (un->un_rs_thread_flags & MD_RI_BLOCK_OWNER) ? 1 : 0; 3456 md_unit_writerexit(ui); 3457 3458 /* Restart the resync thread if it was previously blocked */ 3459 if (restart_resync) { 3460 mutex_enter(&un->un_rs_thread_mx); 3461 un->un_rs_thread_flags &= ~MD_RI_BLOCK_OWNER; 3462 cv_signal(&un->un_rs_thread_cv); 3463 mutex_exit(&un->un_rs_thread_mx); 3464 } 3465 /* Continue with original deferred i/o */ 3466 daemon_io(dq); 3467 } 3468 3469 /* 3470 * owner_timeout: 3471 * ------------- 3472 * Called if the original mdmn_ksend_message() failed and the request is to be 3473 * retried. Reattempt the original ownership change. 3474 * 3475 * NOTE: called at interrupt context (see timeout(9f)). 3476 */ 3477 static void 3478 owner_timeout(void *arg) 3479 { 3480 daemon_queue_t *dq = (daemon_queue_t *)arg; 3481 3482 daemon_request(&md_mirror_daemon, become_owner, dq, REQ_OLD); 3483 } 3484 3485 /* 3486 * become_owner: 3487 * ------------ 3488 * Called to issue RPC request to become the owner of the mirror 3489 * associated with this i/o request. We assume that the ownership request 3490 * is synchronous, so if it succeeds we will issue the request via 3491 * mirror_write_strategy(). 3492 * If multiple i/o's are outstanding we will be called from the mirror_daemon 3493 * service thread. 3494 * NOTE: no mutex should be held on entry to this routine. 3495 */ 3496 static void 3497 become_owner(daemon_queue_t *dq) 3498 { 3499 md_mps_t *ps = (md_mps_t *)dq; 3500 mm_unit_t *un = ps->ps_un; 3501 buf_t *pb = ps->ps_bp; 3502 set_t setno; 3503 md_mn_kresult_t *kres; 3504 int msg_flags = md_mirror_msg_flags; 3505 md_mps_t *ps1; 3506 3507 ASSERT(dq->dq_next == NULL && dq->dq_prev == NULL); 3508 3509 /* 3510 * If we're already the mirror owner we do not need to send a message 3511 * but can simply process the i/o request immediately. 3512 * If we've already sent the request to become owner we requeue the 3513 * request as we're waiting for the synchronous ownership message to 3514 * be processed. 3515 */ 3516 if (MD_MN_MIRROR_OWNER(un)) { 3517 /* 3518 * As the strategy() call will potentially block we need to 3519 * punt this to a separate thread and complete this request 3520 * as quickly as possible. Note: if we're a read request 3521 * this must be a resync, we cannot afford to be queued 3522 * behind any intervening i/o requests. In this case we put the 3523 * request on the md_mirror_rs_daemon queue. 
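 * Ordinary writes are handed to the md_mirror_io_daemon queue instead,
 * keeping resync reads and application i/o on separate queues.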
3524 */ 3525 if (pb->b_flags & B_READ) { 3526 daemon_request(&md_mirror_rs_daemon, daemon_io, dq, 3527 REQ_OLD); 3528 } else { 3529 daemon_request(&md_mirror_io_daemon, daemon_io, dq, 3530 REQ_OLD); 3531 } 3532 } else { 3533 mutex_enter(&un->un_owner_mx); 3534 if ((un->un_owner_state & MM_MN_OWNER_SENT) == 0) { 3535 md_mn_req_owner_t *msg; 3536 int rval = 0; 3537 3538 /* 3539 * Check to see that we haven't exceeded the maximum 3540 * retry count. If we have we fail the i/o as the 3541 * comms mechanism has become wedged beyond recovery. 3542 */ 3543 if (dq->qlen++ >= MD_OWNER_RETRIES) { 3544 mutex_exit(&un->un_owner_mx); 3545 cmn_err(CE_WARN, 3546 "md_mirror: Request exhausted ownership " 3547 "retry limit of %d attempts", dq->qlen); 3548 pb->b_error = EIO; 3549 pb->b_flags |= B_ERROR; 3550 pb->b_resid = pb->b_bcount; 3551 kmem_cache_free(mirror_parent_cache, ps); 3552 md_biodone(pb); 3553 return; 3554 } 3555 3556 /* 3557 * Issue request to change ownership. The call is 3558 * synchronous so when it returns we can complete the 3559 * i/o (if successful), or enqueue it again so that 3560 * the operation will be retried. 3561 */ 3562 un->un_owner_state |= MM_MN_OWNER_SENT; 3563 mutex_exit(&un->un_owner_mx); 3564 3565 msg = kmem_zalloc(sizeof (md_mn_req_owner_t), KM_SLEEP); 3566 setno = MD_MIN2SET(getminor(pb->b_edev)); 3567 msg->mnum = MD_SID(un); 3568 msg->owner = md_mn_mynode_id; 3569 msg_flags |= MD_MSGF_NO_LOG; 3570 /* 3571 * If this IO is triggered by updating a watermark, 3572 * it might be issued by the creation of a softpartition 3573 * while the commd subsystem is suspended. 3574 * We don't want this message to block. 3575 */ 3576 if (ps->ps_flags & MD_MPS_WMUPDATE) { 3577 msg_flags |= MD_MSGF_OVERRIDE_SUSPEND; 3578 } 3579 3580 kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); 3581 rval = mdmn_ksend_message(setno, 3582 MD_MN_MSG_REQUIRE_OWNER, msg_flags, 3583 /* flags */ (char *)msg, 3584 sizeof (md_mn_req_owner_t), kres); 3585 3586 kmem_free(msg, sizeof (md_mn_req_owner_t)); 3587 3588 if (MDMN_KSEND_MSG_OK(rval, kres)) { 3589 dq->qlen = 0; 3590 /* 3591 * Successfully changed owner, reread the 3592 * resync record so that we have a valid idea of 3593 * any previously committed incomplete write()s. 3594 * NOTE: As we need to acquire the resync mutex 3595 * this may block, so we defer it to a separate 3596 * thread handler. This makes us (effectively) 3597 * non-blocking once the ownership message 3598 * handling has completed. 3599 */ 3600 mutex_enter(&un->un_owner_mx); 3601 if (un->un_owner_state & MM_MN_BECOME_OWNER) { 3602 un->un_mirror_owner = md_mn_mynode_id; 3603 /* Sets owner of un_rr_dirty record */ 3604 if (un->un_rr_dirty_recid) 3605 (void) mddb_setowner( 3606 un->un_rr_dirty_recid, 3607 md_mn_mynode_id); 3608 un->un_owner_state &= 3609 ~MM_MN_BECOME_OWNER; 3610 /* 3611 * Release the block on the current 3612 * resync region if it is blocked 3613 */ 3614 ps1 = un->un_rs_prev_overlap; 3615 if ((ps1 != NULL) && 3616 (ps1->ps_flags & MD_MPS_ON_OVERLAP)) 3617 mirror_overlap_tree_remove(ps1); 3618 mutex_exit(&un->un_owner_mx); 3619 3620 /* 3621 * If we're a read, this must be a 3622 * resync request, issue 3623 * the i/o request on the 3624 * md_mirror_rs_daemon queue. This is 3625 * to avoid a deadlock between the 3626 * resync_unit thread and 3627 * subsequent i/o requests that may 3628 * block on the resync region. 
3629 */ 3630 if (pb->b_flags & B_READ) { 3631 daemon_request( 3632 &md_mirror_rs_daemon, 3633 update_resync, dq, REQ_OLD); 3634 } else { 3635 daemon_request( 3636 &md_mirror_io_daemon, 3637 update_resync, dq, REQ_OLD); 3638 } 3639 kmem_free(kres, 3640 sizeof (md_mn_kresult_t)); 3641 return; 3642 } else { 3643 /* 3644 * Some other node has beaten us to 3645 * obtain ownership. We need to 3646 * reschedule our ownership request 3647 */ 3648 mutex_exit(&un->un_owner_mx); 3649 } 3650 } else { 3651 mdmn_ksend_show_error(rval, kres, 3652 "MD_MN_MSG_REQUIRE_OWNER"); 3653 /* 3654 * Message transport failure is handled by the 3655 * comms layer. If the ownership change request 3656 * does not succeed we need to flag the error to 3657 * the initiator of the i/o. This is handled by 3658 * the retry logic above. As the request failed 3659 * we do not know _who_ the owner of the mirror 3660 * currently is. We reset our idea of the owner 3661 * to None so that any further write()s will 3662 * attempt to become the owner again. This stops 3663 * multiple nodes writing to the same mirror 3664 * simultaneously. 3665 */ 3666 mutex_enter(&un->un_owner_mx); 3667 un->un_owner_state &= 3668 ~(MM_MN_OWNER_SENT|MM_MN_BECOME_OWNER); 3669 un->un_mirror_owner = MD_MN_MIRROR_UNOWNED; 3670 mutex_exit(&un->un_owner_mx); 3671 } 3672 kmem_free(kres, sizeof (md_mn_kresult_t)); 3673 } else 3674 mutex_exit(&un->un_owner_mx); 3675 3676 /* 3677 * Re-enqueue this request on the deferred i/o list. Delay the 3678 * request for md_mirror_owner_to usecs to stop thrashing. 3679 */ 3680 (void) timeout(owner_timeout, dq, 3681 drv_usectohz(md_mirror_owner_to)); 3682 } 3683 } 3684 3685 static void 3686 mirror_write_strategy(buf_t *pb, int flag, void *private) 3687 { 3688 md_mps_t *ps; 3689 md_mcs_t *cs; 3690 int more; 3691 mm_unit_t *un; 3692 mdi_unit_t *ui; 3693 buf_t *cb; /* child buf pointer */ 3694 set_t setno; 3695 int rs_on_overlap = 0; 3696 3697 ui = MDI_UNIT(getminor(pb->b_edev)); 3698 un = (mm_unit_t *)MD_UNIT(getminor(pb->b_edev)); 3699 3700 3701 md_kstat_waitq_enter(ui); 3702 3703 /* 3704 * If a state change is in progress for this mirror in a MN set, 3705 * suspend all non-resync writes until the state change is complete. 3706 * The objective of this suspend is to ensure that it is not 3707 * possible for one node to read data from a submirror that another node 3708 * has not written to because of the state change. Therefore we 3709 * suspend all writes until the state change has been made. As it is 3710 * not possible to read from the target of a resync, there is no need 3711 * to suspend resync writes. 
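 * Resync writes are identified by MD_STR_WAR in 'flag'; all other
 * writes block on un_suspend_wr_cv until un_suspend_wr_flag is clear.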
3712 */ 3713 3714 if (!(flag & MD_STR_WAR)) { 3715 mutex_enter(&un->un_suspend_wr_mx); 3716 while (un->un_suspend_wr_flag) { 3717 cv_wait(&un->un_suspend_wr_cv, &un->un_suspend_wr_mx); 3718 } 3719 mutex_exit(&un->un_suspend_wr_mx); 3720 (void) md_unit_readerlock(ui); 3721 } 3722 3723 if (!(flag & MD_STR_NOTTOP)) { 3724 if (md_checkbuf(ui, (md_unit_t *)un, pb)) { 3725 md_kstat_waitq_exit(ui); 3726 return; 3727 } 3728 } 3729 3730 setno = MD_MIN2SET(getminor(pb->b_edev)); 3731 3732 /* If an ABR write has been requested, set MD_STR_ABR flag */ 3733 if (MD_MNSET_SETNO(setno) && (pb->b_flags & B_ABRWRITE)) 3734 flag |= MD_STR_ABR; 3735 3736 if (private == NULL) { 3737 ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS); 3738 mirror_parent_init(ps); 3739 } else { 3740 ps = private; 3741 private = NULL; 3742 } 3743 if (flag & MD_STR_MAPPED) 3744 ps->ps_flags |= MD_MPS_MAPPED; 3745 3746 if (flag & MD_STR_WOW) 3747 ps->ps_flags |= MD_MPS_WOW; 3748 3749 if (flag & MD_STR_ABR) 3750 ps->ps_flags |= MD_MPS_ABR; 3751 3752 if (flag & MD_STR_WMUPDATE) 3753 ps->ps_flags |= MD_MPS_WMUPDATE; 3754 3755 /* 3756 * Save essential information from the original buffhdr 3757 * in the md_save structure. 3758 */ 3759 ps->ps_un = un; 3760 ps->ps_ui = ui; 3761 ps->ps_bp = pb; 3762 ps->ps_addr = pb->b_un.b_addr; 3763 ps->ps_firstblk = pb->b_lblkno; 3764 ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1; 3765 ps->ps_changecnt = un->un_changecnt; 3766 3767 /* 3768 * If not MN owner and this is an ABR write, make sure the current 3769 * resync region is in the overlaps tree 3770 */ 3771 mutex_enter(&un->un_owner_mx); 3772 if (MD_MNSET_SETNO(setno) && (!(MD_MN_MIRROR_OWNER(un))) && 3773 ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) { 3774 md_mps_t *ps1; 3775 /* Block the current resync region, if not already blocked */ 3776 ps1 = un->un_rs_prev_overlap; 3777 3778 if ((ps1 != NULL) && ((ps1->ps_firstblk != 0) || 3779 (ps1->ps_lastblk != 0))) { 3780 /* Drop locks to avoid deadlock */ 3781 mutex_exit(&un->un_owner_mx); 3782 md_unit_readerexit(ui); 3783 wait_for_overlaps(ps1, MD_OVERLAP_ALLOW_REPEAT); 3784 rs_on_overlap = 1; 3785 (void) md_unit_readerlock(ui); 3786 mutex_enter(&un->un_owner_mx); 3787 /* 3788 * Check to see if we have obtained ownership 3789 * while waiting for overlaps. If we have, remove 3790 * the resync_region entry from the overlap tree 3791 */ 3792 if (MD_MN_MIRROR_OWNER(un) && 3793 (ps1->ps_flags & MD_MPS_ON_OVERLAP)) { 3794 mirror_overlap_tree_remove(ps1); 3795 rs_on_overlap = 0; 3796 } 3797 } 3798 } 3799 mutex_exit(&un->un_owner_mx); 3800 3801 3802 /* 3803 * following keep write after read from writing to the 3804 * source in the case where it all came from one place 3805 */ 3806 if (flag & MD_STR_WAR) { 3807 int abort_write = 0; 3808 /* 3809 * We are perfoming a write-after-read. This is either as a 3810 * result of a resync read or as a result of a read in a 3811 * dirty resync region when the optimized resync is not 3812 * complete. 
If in a MN set and a resync generated i/o, 3813 * if the current block is not in the current 3814 * resync region terminate the write as another node must have 3815 * completed this resync region 3816 */ 3817 if ((MD_MNSET_SETNO(MD_UN2SET(un))) && 3818 (!flag & MD_STR_DIRTY_RD)) { 3819 if (!IN_RESYNC_REGION(un, ps)) 3820 abort_write = 1; 3821 } 3822 if ((select_write_after_read_units(un, ps) == 0) || 3823 (abort_write)) { 3824 #ifdef DEBUG 3825 if (mirror_debug_flag) 3826 printf("Abort resync write on %x, block %lld\n", 3827 MD_SID(un), ps->ps_firstblk); 3828 #endif 3829 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 3830 mirror_overlap_tree_remove(ps); 3831 kmem_cache_free(mirror_parent_cache, ps); 3832 md_kstat_waitq_exit(ui); 3833 md_unit_readerexit(ui); 3834 md_biodone(pb); 3835 return; 3836 } 3837 } else { 3838 select_write_units(un, ps); 3839 3840 /* Drop readerlock to avoid deadlock */ 3841 md_unit_readerexit(ui); 3842 wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT); 3843 un = md_unit_readerlock(ui); 3844 /* 3845 * For a MN set with an ABR write, if we are now the 3846 * owner and we have a resync region in the overlap 3847 * tree, remove the entry from overlaps and retry the write. 3848 */ 3849 3850 if (MD_MNSET_SETNO(setno) && 3851 ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) { 3852 mutex_enter(&un->un_owner_mx); 3853 if (((MD_MN_MIRROR_OWNER(un))) && rs_on_overlap) { 3854 mirror_overlap_tree_remove(ps); 3855 md_kstat_waitq_exit(ui); 3856 mutex_exit(&un->un_owner_mx); 3857 md_unit_readerexit(ui); 3858 daemon_request(&md_mirror_daemon, daemon_io, 3859 (daemon_queue_t *)ps, REQ_OLD); 3860 return; 3861 } 3862 mutex_exit(&un->un_owner_mx); 3863 } 3864 } 3865 3866 /* 3867 * For Multinode mirrors with a Resync Region (not ABR) we need to 3868 * become the mirror owner before continuing with the write(). For ABR 3869 * mirrors we check that we 'own' the resync if we're in 3870 * write-after-read mode. We do this _after_ ensuring that there are no 3871 * overlaps to ensure that the once we know that we are the owner, the 3872 * readerlock will not released until the write is complete. As a 3873 * change of ownership in a MN set requires the writerlock, this 3874 * ensures that ownership cannot be changed until the write is 3875 * complete 3876 */ 3877 if (MD_MNSET_SETNO(setno) && (!((ui->ui_tstate & MD_ABR_CAP) || 3878 (flag & MD_STR_ABR)) || (flag & MD_STR_WAR))) { 3879 if (!MD_MN_MIRROR_OWNER(un)) { 3880 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 3881 mirror_overlap_tree_remove(ps); 3882 md_kstat_waitq_exit(ui); 3883 ASSERT(!(flag & MD_STR_WAR)); 3884 md_unit_readerexit(ui); 3885 daemon_request(&md_mirror_daemon, become_owner, 3886 (daemon_queue_t *)ps, REQ_OLD); 3887 return; 3888 } 3889 } 3890 3891 /* 3892 * Mark resync region if mirror has a Resync Region _and_ we are not 3893 * a resync initiated write(). Don't mark region if we're flagged as 3894 * an ABR write. 
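 * If mirror_mark_resync_region() fails, the write is failed with
 * B_ERROR and completed here without being issued to any submirror.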
3895 */ 3896 if (!((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR)) && 3897 !(flag & MD_STR_WAR)) { 3898 if (mirror_mark_resync_region(un, ps->ps_firstblk, 3899 ps->ps_lastblk)) { 3900 pb->b_flags |= B_ERROR; 3901 pb->b_resid = pb->b_bcount; 3902 ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP)); 3903 kmem_cache_free(mirror_parent_cache, ps); 3904 md_kstat_waitq_exit(ui); 3905 md_unit_readerexit(ui); 3906 md_biodone(pb); 3907 return; 3908 } 3909 } 3910 3911 ps->ps_childbflags = pb->b_flags | B_WRITE; 3912 ps->ps_childbflags &= ~B_READ; 3913 if (flag & MD_STR_MAPPED) 3914 ps->ps_childbflags &= ~B_PAGEIO; 3915 3916 if (!(flag & MD_STR_NOTTOP) && panicstr) 3917 /* Disable WOW and don't free ps */ 3918 ps->ps_flags |= (MD_MPS_WOW|MD_MPS_DONTFREE); 3919 3920 md_kstat_waitq_to_runq(ui); 3921 3922 /* 3923 * Treat Raw and Direct I/O as Write-on-Write always 3924 */ 3925 3926 if (!(md_mirror_wow_flg & WOW_DISABLE) && 3927 (md_mirror_wow_flg & WOW_PHYS_ENABLE) && 3928 (pb->b_flags & B_PHYS) && 3929 !(ps->ps_flags & MD_MPS_WOW)) { 3930 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 3931 mirror_overlap_tree_remove(ps); 3932 md_unit_readerexit(ui); 3933 daemon_request(&md_mstr_daemon, handle_wow, 3934 (daemon_queue_t *)ps, REQ_OLD); 3935 return; 3936 } 3937 3938 ps->ps_frags = 1; 3939 do { 3940 cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS); 3941 mirror_child_init(cs); 3942 cb = &cs->cs_buf; 3943 more = mirror_map_write(un, cs, ps, (flag & MD_STR_WAR)); 3944 3945 /* 3946 * This handles the case where we're requesting 3947 * a write to block 0 on a label partition. (more < 0) 3948 * means that the request size was smaller than the 3949 * size of the label. If so this request is done. 3950 */ 3951 if (more < 0) { 3952 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 3953 mirror_overlap_tree_remove(ps); 3954 md_kstat_runq_exit(ui); 3955 kmem_cache_free(mirror_child_cache, cs); 3956 kmem_cache_free(mirror_parent_cache, ps); 3957 md_unit_readerexit(ui); 3958 md_biodone(pb); 3959 return; 3960 } 3961 if (more) { 3962 mutex_enter(&ps->ps_mx); 3963 ps->ps_frags++; 3964 mutex_exit(&ps->ps_mx); 3965 } 3966 md_call_strategy(cb, flag, private); 3967 } while (more); 3968 3969 if (!(flag & MD_STR_NOTTOP) && panicstr) { 3970 while (!(ps->ps_flags & MD_MPS_DONE)) { 3971 md_daemon(1, &md_done_daemon); 3972 drv_usecwait(10); 3973 } 3974 kmem_cache_free(mirror_parent_cache, ps); 3975 } 3976 } 3977 3978 static void 3979 mirror_read_strategy(buf_t *pb, int flag, void *private) 3980 { 3981 md_mps_t *ps; 3982 md_mcs_t *cs; 3983 size_t more; 3984 mm_unit_t *un; 3985 mdi_unit_t *ui; 3986 size_t current_count; 3987 diskaddr_t current_blkno; 3988 off_t current_offset; 3989 buf_t *cb; /* child buf pointer */ 3990 set_t setno; 3991 3992 ui = MDI_UNIT(getminor(pb->b_edev)); 3993 3994 md_kstat_waitq_enter(ui); 3995 3996 un = (mm_unit_t *)md_unit_readerlock(ui); 3997 3998 if (!(flag & MD_STR_NOTTOP)) { 3999 if (md_checkbuf(ui, (md_unit_t *)un, pb)) { 4000 md_kstat_waitq_exit(ui); 4001 return; 4002 } 4003 } 4004 4005 if (private == NULL) { 4006 ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS); 4007 mirror_parent_init(ps); 4008 } else { 4009 ps = private; 4010 private = NULL; 4011 } 4012 4013 if (flag & MD_STR_MAPPED) 4014 ps->ps_flags |= MD_MPS_MAPPED; 4015 if (flag & MD_NOBLOCK) 4016 ps->ps_flags |= MD_MPS_NOBLOCK; 4017 if (flag & MD_STR_WMUPDATE) 4018 ps->ps_flags |= MD_MPS_WMUPDATE; 4019 4020 /* 4021 * Check to see if this is a DMR driven read. 
If so we need to use the 4022 * specified side (in un->un_dmr_last_read) for the source of the data. 4023 */ 4024 if (flag & MD_STR_DMR) 4025 ps->ps_flags |= MD_MPS_DMR; 4026 4027 /* 4028 * Save essential information from the original buffhdr 4029 * in the md_save structure. 4030 */ 4031 ps->ps_un = un; 4032 ps->ps_ui = ui; 4033 ps->ps_bp = pb; 4034 ps->ps_addr = pb->b_un.b_addr; 4035 ps->ps_firstblk = pb->b_lblkno; 4036 ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1; 4037 ps->ps_changecnt = un->un_changecnt; 4038 4039 current_count = btodb(pb->b_bcount); 4040 current_blkno = pb->b_lblkno; 4041 current_offset = 0; 4042 4043 /* 4044 * If flag has MD_STR_WAR set this means that the read is issued by a 4045 * resync thread which may or may not be an optimised resync. 4046 * 4047 * If MD_UN_OPT_NOT_DONE is set this means that the optimized resync 4048 * code has not completed; either a resync has not started since snarf, 4049 * or there is an optimized resync in progress. 4050 * 4051 * We need to generate a write after this read in the following two 4052 * cases, 4053 * 4054 * 1. Any Resync-Generated read 4055 * 4056 * 2. Any read to a DIRTY REGION if there is an optimized resync 4057 * pending or in progress. 4058 * 4059 * The write after read is done in these cases to ensure that all sides 4060 * of the mirror are in sync with the read data and that it is not 4061 * possible for an application to read the same block multiple times 4062 * and get different data. 4063 * 4064 * This would be possible if the block was in a dirty region. 4065 * 4066 * If we're performing a directed read we don't write the data out as 4067 * the application is responsible for restoring the mirror to a known 4068 * state. 4069 */ 4070 if (((MD_STATUS(un) & MD_UN_OPT_NOT_DONE) || (flag & MD_STR_WAR)) && 4071 !(flag & MD_STR_DMR)) { 4072 size_t start_rr, i, end_rr; 4073 int region_dirty = 1; 4074 4075 /* 4076 * We enter here under three circumstances, 4077 * 4078 * MD_UN_OPT_NOT_DONE MD_STR_WAR 4079 * 0 1 4080 * 1 0 4081 * 1 1 4082 * 4083 * To be optimal we only care to explicitly check for dirty 4084 * regions in the second case since if MD_STR_WAR is set we 4085 * always do the write after read. 4086 */ 4087 if (!(flag & MD_STR_WAR)) { 4088 BLK_TO_RR(end_rr, ps->ps_lastblk, un); 4089 BLK_TO_RR(start_rr, ps->ps_firstblk, un); 4090 4091 for (i = start_rr; i <= end_rr; i++) 4092 if ((region_dirty = IS_KEEPDIRTY(i, un)) != 0) 4093 break; 4094 } 4095 4096 if ((region_dirty) && 4097 !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) { 4098 ps->ps_call = write_after_read; 4099 /* 4100 * Mark this as a RESYNC_READ in ps_flags. 4101 * This is used if the read fails during a 4102 * resync of a 3-way mirror to ensure that 4103 * the retried read to the remaining 4104 * good submirror has MD_STR_WAR set. This 4105 * is needed to ensure that the resync write 4106 * (write-after-read) takes place. 4107 */ 4108 ps->ps_flags |= MD_MPS_RESYNC_READ; 4109 4110 /* 4111 * If MD_STR_FLAG_ERR is set in the flags we 4112 * set MD_MPS_FLAG_ERROR so that an error on the resync 4113 * write (issued by write_after_read) will be flagged 4114 * to the biowait'ing resync thread. This allows us to 4115 * avoid issuing further resync requests to a device 4116 * that has had a write failure. 
4117 */ 4118 if (flag & MD_STR_FLAG_ERR) 4119 ps->ps_flags |= MD_MPS_FLAG_ERROR; 4120 4121 setno = MD_UN2SET(un); 4122 /* 4123 * Drop the readerlock to avoid 4124 * deadlock 4125 */ 4126 md_unit_readerexit(ui); 4127 wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT); 4128 un = md_unit_readerlock(ui); 4129 /* 4130 * Ensure that we are owner 4131 */ 4132 if (MD_MNSET_SETNO(setno)) { 4133 /* 4134 * For a non-resync read that requires a 4135 * write-after-read to be done, set a flag 4136 * in the parent structure, so that the 4137 * write_strategy routine can omit the 4138 * test that the write is still within the 4139 * resync region 4140 */ 4141 if (!(flag & MD_STR_WAR)) 4142 ps->ps_flags |= MD_MPS_DIRTY_RD; 4143 4144 /* 4145 * Before reading the buffer, see if 4146 * we are the owner 4147 */ 4148 if (!MD_MN_MIRROR_OWNER(un)) { 4149 ps->ps_call = NULL; 4150 mirror_overlap_tree_remove(ps); 4151 md_kstat_waitq_exit(ui); 4152 md_unit_readerexit(ui); 4153 daemon_request( 4154 &md_mirror_daemon, 4155 become_owner, 4156 (daemon_queue_t *)ps, 4157 REQ_OLD); 4158 return; 4159 } 4160 /* 4161 * For a resync read, check to see if I/O is 4162 * outside of the current resync region, or 4163 * the resync has finished. If so 4164 * just terminate the I/O 4165 */ 4166 if ((flag & MD_STR_WAR) && 4167 (!(un->c.un_status & MD_UN_WAR) || 4168 (!IN_RESYNC_REGION(un, ps)))) { 4169 #ifdef DEBUG 4170 if (mirror_debug_flag) 4171 printf("Abort resync read " 4172 "%x: %lld\n", 4173 MD_SID(un), 4174 ps->ps_firstblk); 4175 #endif 4176 mirror_overlap_tree_remove(ps); 4177 kmem_cache_free(mirror_parent_cache, 4178 ps); 4179 md_kstat_waitq_exit(ui); 4180 md_unit_readerexit(ui); 4181 md_biodone(pb); 4182 return; 4183 } 4184 } 4185 } 4186 } 4187 4188 if (flag & MD_STR_DMR) { 4189 ps->ps_call = directed_read_done; 4190 } 4191 4192 if (!(flag & MD_STR_NOTTOP) && panicstr) 4193 ps->ps_flags |= MD_MPS_DONTFREE; 4194 4195 md_kstat_waitq_to_runq(ui); 4196 4197 ps->ps_frags++; 4198 do { 4199 cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS); 4200 mirror_child_init(cs); 4201 cb = &cs->cs_buf; 4202 cs->cs_ps = ps; 4203 4204 cb = md_bioclone(pb, current_offset, current_count, NODEV, 4205 current_blkno, mirror_done, cb, KM_NOSLEEP); 4206 4207 more = mirror_map_read(ps, cs, current_blkno, 4208 (u_longlong_t)current_count); 4209 if (more) { 4210 mutex_enter(&ps->ps_mx); 4211 ps->ps_frags++; 4212 mutex_exit(&ps->ps_mx); 4213 } 4214 4215 /* 4216 * Do these calculations now, 4217 * so that we pickup a valid b_bcount from the chld_bp. 4218 */ 4219 current_count -= more; 4220 current_offset += cb->b_bcount; 4221 current_blkno += more; 4222 md_call_strategy(cb, flag, private); 4223 } while (more); 4224 4225 if (!(flag & MD_STR_NOTTOP) && panicstr) { 4226 while (!(ps->ps_flags & MD_MPS_DONE)) { 4227 md_daemon(1, &md_done_daemon); 4228 drv_usecwait(10); 4229 } 4230 kmem_cache_free(mirror_parent_cache, ps); 4231 } 4232 } 4233 4234 void 4235 md_mirror_strategy(buf_t *bp, int flag, void *private) 4236 { 4237 set_t setno = MD_MIN2SET(getminor(bp->b_edev)); 4238 4239 /* 4240 * When doing IO to a multi owner meta device, check if set is halted. 4241 * We do this check without the needed lock held, for performance 4242 * reasons. 4243 * If an IO just slips through while the set is locked via an 4244 * MD_MN_SUSPEND_SET, we don't care about it. 4245 * Only check for suspension if we are a top-level i/o request 4246 * (MD_STR_NOTTOP is cleared in 'flag'). 
4247 */ 4248 if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) == 4249 (MD_SET_HALTED | MD_SET_MNSET)) { 4250 if ((flag & MD_STR_NOTTOP) == 0) { 4251 mutex_enter(&md_mx); 4252 /* Here we loop until the set is no longer halted */ 4253 while (md_set[setno].s_status & MD_SET_HALTED) { 4254 cv_wait(&md_cv, &md_mx); 4255 } 4256 mutex_exit(&md_mx); 4257 } 4258 } 4259 4260 if ((flag & MD_IO_COUNTED) == 0) { 4261 if ((flag & MD_NOBLOCK) == 0) { 4262 if (md_inc_iocount(setno) != 0) { 4263 bp->b_flags |= B_ERROR; 4264 bp->b_error = ENXIO; 4265 bp->b_resid = bp->b_bcount; 4266 biodone(bp); 4267 return; 4268 } 4269 } else { 4270 md_inc_iocount_noblock(setno); 4271 } 4272 } 4273 4274 if (bp->b_flags & B_READ) 4275 mirror_read_strategy(bp, flag, private); 4276 else 4277 mirror_write_strategy(bp, flag, private); 4278 } 4279 4280 /* 4281 * mirror_directed_read: 4282 * -------------------- 4283 * Entry-point for the DKIOCDMR ioctl. We issue a read to a specified sub-mirror 4284 * so that the application can determine what (if any) resync needs to be 4285 * performed. The data is copied out to the user-supplied buffer. 4286 * 4287 * Parameters: 4288 * mdev - dev_t for the mirror device 4289 * vdr - directed read parameters specifying location and submirror 4290 * to perform the read from 4291 * mode - used to ddi_copyout() any resulting data from the read 4292 * 4293 * Returns: 4294 * 0 success 4295 * !0 error code 4296 * EINVAL - invalid request format 4297 */ 4298 int 4299 mirror_directed_read(dev_t mdev, vol_directed_rd_t *vdr, int mode) 4300 { 4301 buf_t *bp; 4302 minor_t mnum = getminor(mdev); 4303 mdi_unit_t *ui = MDI_UNIT(mnum); 4304 mm_unit_t *un; 4305 mm_submirror_t *sm; 4306 char *sm_nm; 4307 uint_t next_side; 4308 void *kbuffer; 4309 4310 if (ui == NULL) 4311 return (ENXIO); 4312 4313 if (!(vdr->vdr_flags & DKV_DMR_NEXT_SIDE)) { 4314 return (EINVAL); 4315 } 4316 4317 /* Check for aligned block access. We disallow non-aligned requests. */ 4318 if (vdr->vdr_offset % DEV_BSIZE) { 4319 return (EINVAL); 4320 } 4321 4322 /* 4323 * Allocate kernel buffer for target of read(). If we had a reliable 4324 * (sorry functional) DDI this wouldn't be needed. 4325 */ 4326 kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP); 4327 if (kbuffer == NULL) { 4328 cmn_err(CE_WARN, "mirror_directed_read: couldn't allocate %lx" 4329 " bytes\n", vdr->vdr_nbytes); 4330 return (ENOMEM); 4331 } 4332 4333 bp = getrbuf(KM_SLEEP); 4334 4335 bp->b_un.b_addr = kbuffer; 4336 bp->b_flags = B_READ; 4337 bp->b_bcount = vdr->vdr_nbytes; 4338 bp->b_lblkno = lbtodb(vdr->vdr_offset); 4339 bp->b_edev = mdev; 4340 4341 un = md_unit_readerlock(ui); 4342 4343 /* 4344 * If DKV_SIDE_INIT is set we need to determine the first available 4345 * side to start reading from. If it isn't set we increment to the 4346 * next readable submirror. 4347 * If there are no readable submirrors we error out with DKV_DMR_ERROR. 4348 * Note: we check for a readable submirror on completion of the i/o so 4349 * we should _always_ have one available. If this becomes unavailable 4350 * we have missed the 'DKV_DMR_DONE' opportunity. This could happen if 4351 * a metadetach is made between the completion of one DKIOCDMR ioctl 4352 * and the start of the next (i.e. a sys-admin 'accident' occurred). 4353 * The chance of this is small, but not non-existent. 
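 *
 * Hedged usage sketch (an assumption about the userland consumer, not
 * code from this driver): an application would typically drive the
 * whole read cycle with something like the following, where fd, off,
 * len and buf are placeholders:
 *
 *	vol_directed_rd_t vdr;
 *
 *	vdr.vdr_flags = DKV_DMR_NEXT_SIDE;
 *	vdr.vdr_side = DKV_SIDE_INIT;
 *	vdr.vdr_offset = off;			(DEV_BSIZE aligned)
 *	vdr.vdr_nbytes = len;
 *	vdr.vdr_data = buf;
 *	do {
 *		if (ioctl(fd, DKIOCDMR, &vdr) != 0)
 *			break;
 *	} while (!(vdr.vdr_flags & (DKV_DMR_DONE | DKV_DMR_ERROR)));
 *
 * Each successful pass returns the data read from the next readable
 * submirror in vdr_data/vdr_bytesread and writes the side just read
 * back into vdr_side, so the loop walks all readable sides until
 * DKV_DMR_DONE (or DKV_DMR_ERROR) is set.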
4354 */ 4355 if (vdr->vdr_side == DKV_SIDE_INIT) { 4356 next_side = 0; 4357 } else { 4358 next_side = vdr->vdr_side + 1; 4359 } 4360 while ((next_side < NMIRROR) && 4361 !SUBMIRROR_IS_READABLE(un, next_side)) 4362 next_side++; 4363 if (next_side >= NMIRROR) { 4364 vdr->vdr_flags |= DKV_DMR_ERROR; 4365 freerbuf(bp); kmem_free(kbuffer, vdr->vdr_nbytes); 4366 vdr->vdr_bytesread = 0; 4367 md_unit_readerexit(ui); 4368 return (0); 4369 } 4370 4371 /* Set the side to read from */ 4372 un->un_dmr_last_read = next_side; 4373 4374 md_unit_readerexit(ui); 4375 4376 /* 4377 * Save timestamp for verification purposes. Can be read by debugger 4378 * to verify that this ioctl has been executed and to find the number 4379 * of DMR reads and the time of the last DMR read. 4380 */ 4381 uniqtime(&mirror_dmr_stats.dmr_timestamp); 4382 mirror_dmr_stats.dmr_count++; 4383 4384 /* Issue READ request and wait for completion */ 4385 mirror_read_strategy(bp, MD_STR_DMR|MD_NOBLOCK|MD_STR_NOTTOP, NULL); 4386 4387 mutex_enter(&un->un_dmr_mx); 4388 cv_wait(&un->un_dmr_cv, &un->un_dmr_mx); 4389 mutex_exit(&un->un_dmr_mx); 4390 4391 /* 4392 * Check to see if we encountered an error during the read. If so we 4393 * can make no guarantee about any possibly returned data. 4394 */ 4395 if ((bp->b_flags & B_ERROR) == 0) { 4396 vdr->vdr_flags &= ~DKV_DMR_ERROR; 4397 if (bp->b_resid) { 4398 vdr->vdr_flags |= DKV_DMR_SHORT; 4399 vdr->vdr_bytesread = vdr->vdr_nbytes - bp->b_resid; 4400 } else { 4401 vdr->vdr_flags |= DKV_DMR_SUCCESS; 4402 vdr->vdr_bytesread = vdr->vdr_nbytes; 4403 } 4404 /* Copy the data read back out to the user supplied buffer */ 4405 if (ddi_copyout(kbuffer, vdr->vdr_data, vdr->vdr_bytesread, 4406 mode)) { 4407 kmem_free(kbuffer, vdr->vdr_nbytes); freerbuf(bp); 4408 return (EFAULT); 4409 } 4410 4411 } else { 4412 /* Error out with DKV_DMR_ERROR */ 4413 vdr->vdr_flags |= DKV_DMR_ERROR; 4414 vdr->vdr_flags &= ~(DKV_DMR_SUCCESS|DKV_DMR_SHORT|DKV_DMR_DONE); 4415 } 4416 /* 4417 * Update the DMR parameters with the side and name of submirror that 4418 * we have just read from (un->un_dmr_last_read) 4419 */ 4420 un = md_unit_readerlock(ui); 4421 4422 vdr->vdr_side = un->un_dmr_last_read; 4423 sm = &un->un_sm[un->un_dmr_last_read]; 4424 sm_nm = md_shortname(md_getminor(sm->sm_dev)); 4425 4426 (void) strncpy(vdr->vdr_side_name, sm_nm, sizeof (vdr->vdr_side_name)); 4427 4428 /* 4429 * Determine if we've completed the read cycle. This is true iff the 4430 * next computed submirror (side) equals or exceeds NMIRROR. We cannot 4431 * use un_nsm as we need to handle a sparse array of submirrors (which 4432 * can occur if a submirror is metadetached). 4433 */ 4434 next_side = un->un_dmr_last_read + 1; 4435 while ((next_side < NMIRROR) && 4436 !SUBMIRROR_IS_READABLE(un, next_side)) 4437 next_side++; 4438 if (next_side >= NMIRROR) { 4439 /* We've finished */ 4440 vdr->vdr_flags |= DKV_DMR_DONE; 4441 } 4442 4443 md_unit_readerexit(ui); 4444 freerbuf(bp); 4445 kmem_free(kbuffer, vdr->vdr_nbytes); 4446 4447 return (0); 4448 } 4449 4450 /* 4451 * mirror_resync_message: 4452 * --------------------- 4453 * Handle the multi-node resync messages that keep all nodes within a given 4454 * disk-set in sync with their view of a mirror's resync status.
4455 * 4456 * The message types dealt with are: 4457 * MD_MN_MSG_RESYNC_STARTING - start a resync thread for a unit 4458 * MD_MN_MSG_RESYNC_NEXT - specifies the next region to be resynced 4459 * MD_MN_MSG_RESYNC_FINISH - stop the resync thread for a unit 4460 * MD_MN_MSG_RESYNC_PHASE_DONE - end of a resync phase (optimized, submirror or component) 4461 * 4462 * Returns: 4463 * 0 Success 4464 * >0 Failure error number 4465 */ 4466 int 4467 mirror_resync_message(md_mn_rs_params_t *p, IOLOCK *lockp) 4468 { 4469 mdi_unit_t *ui; 4470 mm_unit_t *un; 4471 set_t setno; 4472 int is_ABR; 4473 int smi; 4474 int ci; 4475 sm_state_t state; 4476 int broke_out; 4477 mm_submirror_t *sm; 4478 mm_submirror_ic_t *smic; 4479 md_m_shared_t *shared; 4480 md_error_t mde = mdnullerror; 4481 md_mps_t *ps; 4482 int rs_active; 4483 4484 /* Check that the given device is part of a multi-node set */ 4485 setno = MD_MIN2SET(p->mnum); 4486 if (setno >= md_nsets) { 4487 return (ENXIO); 4488 } 4489 if (!MD_MNSET_SETNO(setno)) { 4490 return (EINVAL); 4491 } 4492 4493 if ((un = mirror_getun(p->mnum, &p->mde, NO_LOCK, NULL)) == NULL) 4494 return (EINVAL); 4495 if ((ui = MDI_UNIT(p->mnum)) == NULL) 4496 return (EINVAL); 4497 is_ABR = (ui->ui_tstate & MD_ABR_CAP); 4498 4499 /* Obtain the current resync status */ 4500 (void) md_ioctl_readerlock(lockp, ui); 4501 rs_active = (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ? 1 : 0; 4502 md_ioctl_readerexit(lockp); 4503 4504 switch ((md_mn_msgtype_t)p->msg_type) { 4505 case MD_MN_MSG_RESYNC_STARTING: 4506 /* Start the resync thread for the mirror */ 4507 (void) mirror_resync_unit(p->mnum, NULL, &p->mde, lockp); 4508 break; 4509 4510 case MD_MN_MSG_RESYNC_NEXT: 4511 /* 4512 * We have to release any previously marked overlap regions 4513 * so that i/o can resume. Then we need to block the region 4514 * from [rs_start..rs_start+rs_size) so that no i/o is issued. 4515 * Update un_rs_resync_done and un_rs_resync_2_do. 4516 */ 4517 (void) md_ioctl_readerlock(lockp, ui); 4518 /* 4519 * Ignore the message if there is no active resync thread or 4520 * if it is for a resync type that we have already completed. 4521 * un_resync_completed is set to the last resync completed 4522 * when processing a PHASE_DONE message. 4523 */ 4524 if (!rs_active || (p->rs_type == un->un_resync_completed)) 4525 break; 4526 /* 4527 * If this message is for the same resync and is for an earlier 4528 * resync region, just ignore it. This can only occur if this 4529 * node has progressed on to the next resync region before 4530 * we receive this message, which can happen if the class for 4531 * this message is busy and the originator has to retry, thus 4532 * allowing this node to move on to the next resync_region. 4533 */ 4534 if ((p->rs_type == un->un_rs_type) && 4535 (p->rs_start < un->un_resync_startbl)) 4536 break; 4537 ps = un->un_rs_prev_overlap; 4538 4539 /* Allocate previous overlap reference if needed */ 4540 if (ps == NULL) { 4541 ps = kmem_cache_alloc(mirror_parent_cache, 4542 MD_ALLOCFLAGS); 4543 ps->ps_un = un; 4544 ps->ps_ui = ui; 4545 ps->ps_firstblk = 0; 4546 ps->ps_lastblk = 0; 4547 ps->ps_flags = 0; 4548 md_ioctl_readerexit(lockp); 4549 (void) md_ioctl_writerlock(lockp, ui); 4550 un->un_rs_prev_overlap = ps; 4551 md_ioctl_writerexit(lockp); 4552 } else 4553 md_ioctl_readerexit(lockp); 4554 4555 if (p->rs_originator != md_mn_mynode_id) { 4556 /* 4557 * On all but the originating node, first update 4558 * the resync state, then unblock the previous 4559 * region and block the next one.
No need 4560 * to do this if the region is already blocked. 4561 * Update the submirror state and flags from the 4562 * originator. This keeps the cluster in sync with 4563 * regard to the resync status. 4564 */ 4565 4566 (void) md_ioctl_writerlock(lockp, ui); 4567 un->un_rs_resync_done = p->rs_done; 4568 un->un_rs_resync_2_do = p->rs_2_do; 4569 un->un_rs_type = p->rs_type; 4570 un->un_resync_startbl = p->rs_start; 4571 md_ioctl_writerexit(lockp); 4572 /* 4573 * Use un_owner_mx to ensure that an ownership change 4574 * cannot happen at the same time as this message 4575 */ 4576 mutex_enter(&un->un_owner_mx); 4577 if (MD_MN_MIRROR_OWNER(un)) { 4578 ps->ps_firstblk = p->rs_start; 4579 ps->ps_lastblk = ps->ps_firstblk + 4580 p->rs_size - 1; 4581 } else { 4582 if ((ps->ps_firstblk != p->rs_start) || 4583 (ps->ps_lastblk != p->rs_start + 4584 p->rs_size - 1)) { 4585 /* Remove previous overlap range */ 4586 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 4587 mirror_overlap_tree_remove(ps); 4588 4589 ps->ps_firstblk = p->rs_start; 4590 ps->ps_lastblk = ps->ps_firstblk + 4591 p->rs_size - 1; 4592 4593 mutex_exit(&un->un_owner_mx); 4594 /* Block this range from all i/o. */ 4595 if (ps->ps_firstblk != 0 || 4596 ps->ps_lastblk != 0) 4597 wait_for_overlaps(ps, 4598 MD_OVERLAP_ALLOW_REPEAT); 4599 mutex_enter(&un->un_owner_mx); 4600 /* 4601 * Check to see if we have obtained 4602 * ownership while waiting for 4603 * overlaps. If we have, remove 4604 * the resync_region entry from the 4605 * overlap tree 4606 */ 4607 if (MD_MN_MIRROR_OWNER(un) && 4608 (ps->ps_flags & MD_MPS_ON_OVERLAP)) 4609 mirror_overlap_tree_remove(ps); 4610 } 4611 } 4612 mutex_exit(&un->un_owner_mx); 4613 4614 /* 4615 * If this is the first RESYNC_NEXT message (i.e. 4616 * MD_MN_RS_FIRST_RESYNC_NEXT set in p->rs_flags), 4617 * issue RESYNC_START NOTIFY event 4618 */ 4619 if (p->rs_flags & MD_MN_RS_FIRST_RESYNC_NEXT) { 4620 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_START, 4621 SVM_TAG_METADEVICE, MD_UN2SET(un), 4622 MD_SID(un)); 4623 } 4624 4625 /* Ensure that our local resync thread is running */ 4626 if (un->un_rs_thread == NULL) { 4627 (void) mirror_resync_unit(p->mnum, NULL, 4628 &p->mde, lockp); 4629 } 4630 } 4631 break; 4632 case MD_MN_MSG_RESYNC_FINISH: 4633 /* 4634 * Complete the resync by stopping the resync thread. 4635 * Also release the previous overlap region field. 4636 * Update the resync_progress_thread by cv_signal'ing it so 4637 * that we mark the end of the resync as soon as possible. This 4638 * avoids an unnecessary delay should the system panic after 4639 * resync completion.
4640 */ 4641 #ifdef DEBUG 4642 if (!rs_active) { 4643 if (mirror_debug_flag) 4644 printf("RESYNC_FINISH (mnum = %x), " 4645 "Resync *NOT* active\n", 4646 p->mnum); 4647 } 4648 #endif 4649 4650 if ((un->c.un_status & MD_UN_RESYNC_ACTIVE) && 4651 (p->rs_originator != md_mn_mynode_id)) { 4652 mutex_enter(&un->un_rs_thread_mx); 4653 un->c.un_status &= ~MD_UN_RESYNC_CANCEL; 4654 un->un_rs_thread_flags |= MD_RI_SHUTDOWN; 4655 un->un_rs_thread_flags &= 4656 ~(MD_RI_BLOCK|MD_RI_BLOCK_OWNER); 4657 cv_signal(&un->un_rs_thread_cv); 4658 mutex_exit(&un->un_rs_thread_mx); 4659 } 4660 if (is_ABR) { 4661 /* Resync finished; if ABR, set owner to NULL */ 4662 mutex_enter(&un->un_owner_mx); 4663 un->un_mirror_owner = 0; 4664 mutex_exit(&un->un_owner_mx); 4665 } 4666 (void) md_ioctl_writerlock(lockp, ui); 4667 ps = un->un_rs_prev_overlap; 4668 if (ps != NULL) { 4669 /* Remove previous overlap range */ 4670 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 4671 mirror_overlap_tree_remove(ps); 4672 /* 4673 * Release the overlap range reference 4674 */ 4675 un->un_rs_prev_overlap = NULL; 4676 kmem_cache_free(mirror_parent_cache, 4677 ps); 4678 } 4679 md_ioctl_writerexit(lockp); 4680 4681 /* Mark the resync as complete in the metadb */ 4682 un->un_rs_resync_done = p->rs_done; 4683 un->un_rs_resync_2_do = p->rs_2_do; 4684 un->un_rs_type = p->rs_type; 4685 mutex_enter(&un->un_rs_progress_mx); 4686 cv_signal(&un->un_rs_progress_cv); 4687 mutex_exit(&un->un_rs_progress_mx); 4688 4689 un = md_ioctl_writerlock(lockp, ui); 4690 un->c.un_status &= ~MD_UN_RESYNC_ACTIVE; 4691 /* Deal with any pending grow_unit */ 4692 if (un->c.un_status & MD_UN_GROW_PENDING) { 4693 if ((mirror_grow_unit(un, &mde) != 0) || 4694 (! mdismderror(&mde, MDE_GROW_DELAYED))) { 4695 un->c.un_status &= ~MD_UN_GROW_PENDING; 4696 } 4697 } 4698 md_ioctl_writerexit(lockp); 4699 break; 4700 4701 case MD_MN_MSG_RESYNC_PHASE_DONE: 4702 /* 4703 * A phase of the resync (optimized, component or 4704 * submirror) is complete. Update mirror status. 4705 * If the flag CLEAR_OPT_NOT_DONE is set, it means that the 4706 * mirror owner is performing a resync. If we have just snarfed 4707 * this set, then we must clear any of the flags set at snarf 4708 * time by unit_setup_resync(). 4709 * Note that unit_setup_resync() sets up these flags to 4710 * indicate that an optimized resync is required. These flags 4711 * need to be reset because if we get here, the mirror owner 4712 * will have handled the optimized resync. 4713 * The flags that must be cleared are MD_UN_OPT_NOT_DONE and 4714 * MD_UN_WAR. In addition, for each submirror, 4715 * MD_SM_RESYNC_TARGET must be cleared and SMS_OFFLINE_RESYNC 4716 * set to SMS_OFFLINE. 4717 */ 4718 #ifdef DEBUG 4719 if (mirror_debug_flag) 4720 printf("phase done message received from %d, mnum=%x," 4721 "type=%x, flags=%x\n", p->rs_originator, p->mnum, 4722 p->rs_type, p->rs_flags); 4723 #endif 4724 /* 4725 * Ignore the message if there is no active resync thread. 4726 */ 4727 if (!rs_active) 4728 break; 4729 4730 broke_out = p->rs_flags & MD_MN_RS_ERR; 4731 switch (RS_TYPE(p->rs_type)) { 4732 case MD_RS_OPTIMIZED: 4733 un = md_ioctl_writerlock(lockp, ui); 4734 if (p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE) { 4735 /* If we are originator, just clear rs_type */ 4736 if (p->rs_originator == md_mn_mynode_id) { 4737 SET_RS_TYPE_NONE(un->un_rs_type); 4738 md_ioctl_writerexit(lockp); 4739 break; 4740 } 4741 /* 4742 * If CLEAR_OPT_NOT_DONE is set, only clear the 4743 * flags if OPT_NOT_DONE is set *and* rs_type 4744 * is MD_RS_NONE.
4745 */ 4746 if ((un->c.un_status & MD_UN_OPT_NOT_DONE) && 4747 (RS_TYPE(un->un_rs_type) == MD_RS_NONE)) { 4748 /* No resync in progress */ 4749 un->c.un_status &= ~MD_UN_OPT_NOT_DONE; 4750 un->c.un_status &= ~MD_UN_WAR; 4751 } else { 4752 /* 4753 * We are in the middle of an 4754 * optimized resync and this message 4755 * should be ignored. 4756 */ 4757 md_ioctl_writerexit(lockp); 4758 break; 4759 } 4760 } else { 4761 /* 4762 * This is the end of an optimized resync, 4763 * clear the OPT_NOT_DONE and OFFLINE_SM flags 4764 */ 4765 4766 un->c.un_status &= ~MD_UN_KEEP_DIRTY; 4767 if (!broke_out) 4768 un->c.un_status &= ~MD_UN_WAR; 4769 } 4770 4771 /* 4772 * Set resync_completed to last resync type and then 4773 * clear resync_type to indicate no resync in progress 4774 */ 4775 un->un_resync_completed = un->un_rs_type; 4776 SET_RS_TYPE_NONE(un->un_rs_type); 4777 4778 /* 4779 * If resync is as a result of a submirror ONLINE, 4780 * reset the submirror state to SMS_RUNNING if the 4781 * resync was ok else set back to SMS_OFFLINE. 4782 */ 4783 for (smi = 0; smi < NMIRROR; smi++) { 4784 un->un_sm[smi].sm_flags &= 4785 ~MD_SM_RESYNC_TARGET; 4786 if (SMS_BY_INDEX_IS(un, smi, 4787 SMS_OFFLINE_RESYNC)) { 4788 if (p->rs_flags & 4789 MD_MN_RS_CLEAR_OPT_NOT_DONE) { 4790 state = SMS_OFFLINE; 4791 } else { 4792 state = (broke_out ? 4793 SMS_OFFLINE : SMS_RUNNING); 4794 } 4795 mirror_set_sm_state( 4796 &un->un_sm[smi], 4797 &un->un_smic[smi], state, 4798 broke_out); 4799 mirror_commit(un, NO_SUBMIRRORS, 4800 0); 4801 } 4802 /* 4803 * If we still have an offline submirror, reset 4804 * the OFFLINE_SM flag in the mirror status 4805 */ 4806 if (SMS_BY_INDEX_IS(un, smi, 4807 SMS_OFFLINE)) 4808 un->c.un_status |= 4809 MD_UN_OFFLINE_SM; 4810 } 4811 md_ioctl_writerexit(lockp); 4812 break; 4813 case MD_RS_SUBMIRROR: 4814 un = md_ioctl_writerlock(lockp, ui); 4815 smi = RS_SMI(p->rs_type); 4816 sm = &un->un_sm[smi]; 4817 smic = &un->un_smic[smi]; 4818 /* Clear RESYNC target */ 4819 un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET; 4820 /* 4821 * Set resync_completed to last resync type and then 4822 * clear resync_type to indicate no resync in progress 4823 */ 4824 un->un_resync_completed = un->un_rs_type; 4825 SET_RS_TYPE_NONE(un->un_rs_type); 4826 /* 4827 * If the resync completed ok reset the submirror 4828 * state to SMS_RUNNING else reset it to SMS_ATTACHED 4829 */ 4830 state = (broke_out ? 4831 SMS_ATTACHED : SMS_RUNNING); 4832 mirror_set_sm_state(sm, smic, state, broke_out); 4833 un->c.un_status &= ~MD_UN_WAR; 4834 mirror_commit(un, SMI2BIT(smi), 0); 4835 md_ioctl_writerexit(lockp); 4836 break; 4837 case MD_RS_COMPONENT: 4838 un = md_ioctl_writerlock(lockp, ui); 4839 smi = RS_SMI(p->rs_type); 4840 ci = RS_CI(p->rs_type); 4841 sm = &un->un_sm[smi]; 4842 smic = &un->un_smic[smi]; 4843 shared = (md_m_shared_t *) 4844 (*(smic->sm_shared_by_indx)) 4845 (sm->sm_dev, sm, ci); 4846 un->c.un_status &= ~MD_UN_WAR; 4847 /* Clear RESYNC target */ 4848 un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET; 4849 /* 4850 * Set resync_completed to last resync type and then 4851 * clear resync_type to indicate no resync in progress 4852 */ 4853 un->un_resync_completed = un->un_rs_type; 4854 SET_RS_TYPE_NONE(un->un_rs_type); 4855 4856 /* 4857 * If the resync completed ok, set the component state 4858 * to CS_OKAY. 4859 */ 4860 if (broke_out) 4861 shared->ms_flags |= MDM_S_RS_TRIED; 4862 else { 4863 /* 4864 * As we don't transmit the changes, 4865 * no need to drop the lock. 
4866 */ 4867 set_sm_comp_state(un, smi, ci, CS_OKAY, 0, 4868 MD_STATE_NO_XMIT, (IOLOCK *)NULL); 4869 } 4870 md_ioctl_writerexit(lockp); 4871 default: 4872 break; 4873 } 4874 /* 4875 * If the purpose of this PHASE_DONE message is just to 4876 * indicate to all other nodes that the optimized resync 4877 * required (OPT_NOT_DONE) flag is to be cleared, there is 4878 * no need to generate a notify event as there has not 4879 * actually been a resync. 4880 */ 4881 if (!(p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE)) { 4882 if (broke_out) { 4883 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_FAILED, 4884 SVM_TAG_METADEVICE, MD_UN2SET(un), 4885 MD_SID(un)); 4886 } else { 4887 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_DONE, 4888 SVM_TAG_METADEVICE, MD_UN2SET(un), 4889 MD_SID(un)); 4890 } 4891 } 4892 break; 4893 4894 default: 4895 #ifdef DEBUG 4896 cmn_err(CE_PANIC, "mirror_resync_message: Unknown message type" 4897 " %x\n", p->msg_type); 4898 #endif 4899 return (EINVAL); 4900 } 4901 return (0); 4902 } 4903 4904 /* Return a -1 if snarf of optimized record failed and set should be released */ 4905 static int 4906 mirror_snarf(md_snarfcmd_t cmd, set_t setno) 4907 { 4908 mddb_recid_t recid; 4909 int gotsomething; 4910 int all_mirrors_gotten; 4911 mm_unit_t *un; 4912 mddb_type_t typ1; 4913 mddb_de_ic_t *dep; 4914 mddb_rb32_t *rbp; 4915 size_t newreqsize; 4916 mm_unit_t *big_un; 4917 mm_unit32_od_t *small_un; 4918 int retval; 4919 mdi_unit_t *ui; 4920 4921 if (cmd == MD_SNARF_CLEANUP) { 4922 if (md_get_setstatus(setno) & MD_SET_STALE) 4923 return (0); 4924 4925 recid = mddb_makerecid(setno, 0); 4926 typ1 = (mddb_type_t)md_getshared_key(setno, 4927 mirror_md_ops.md_driver.md_drivername); 4928 while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) { 4929 if (mddb_getrecprivate(recid) & MD_PRV_CLEANUP) { 4930 un = (mm_unit_t *)mddb_getrecaddr(recid); 4931 mirror_cleanup(un); 4932 recid = mddb_makerecid(setno, 0); 4933 } 4934 } 4935 return (0); 4936 } 4937 4938 all_mirrors_gotten = 1; 4939 gotsomething = 0; 4940 4941 recid = mddb_makerecid(setno, 0); 4942 typ1 = (mddb_type_t)md_getshared_key(setno, 4943 mirror_md_ops.md_driver.md_drivername); 4944 4945 while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) { 4946 if (mddb_getrecprivate(recid) & MD_PRV_GOTIT) 4947 continue; 4948 4949 dep = mddb_getrecdep(recid); 4950 dep->de_flags = MDDB_F_MIRROR; 4951 rbp = dep->de_rb; 4952 4953 switch (rbp->rb_revision) { 4954 case MDDB_REV_RB: 4955 case MDDB_REV_RBFN: 4956 if ((rbp->rb_private & MD_PRV_CONVD) == 0) { 4957 /* 4958 * This means, we have an old and small 4959 * record and this record hasn't already 4960 * been converted. Before we create an 4961 * incore metadevice from this we have to 4962 * convert it to a big record. 4963 */ 4964 small_un = 4965 (mm_unit32_od_t *)mddb_getrecaddr(recid); 4966 newreqsize = sizeof (mm_unit_t); 4967 big_un = (mm_unit_t *)kmem_zalloc(newreqsize, 4968 KM_SLEEP); 4969 mirror_convert((caddr_t)small_un, 4970 (caddr_t)big_un, SMALL_2_BIG); 4971 kmem_free(small_un, dep->de_reqsize); 4972 4973 /* 4974 * Update userdata and incore userdata 4975 * incores are at the end of un 4976 */ 4977 dep->de_rb_userdata_ic = big_un; 4978 dep->de_rb_userdata = big_un; 4979 dep->de_icreqsize = newreqsize; 4980 un = big_un; 4981 rbp->rb_private |= MD_PRV_CONVD; 4982 } else { 4983 /* 4984 * Unit already converted, just get the 4985 * record address. 
4986 */ 4987 un = (mm_unit_t *)mddb_getrecaddr_resize(recid, 4988 sizeof (*un), 0); 4989 } 4990 un->c.un_revision &= ~MD_64BIT_META_DEV; 4991 break; 4992 case MDDB_REV_RB64: 4993 case MDDB_REV_RB64FN: 4994 /* Big device */ 4995 un = (mm_unit_t *)mddb_getrecaddr_resize(recid, 4996 sizeof (*un), 0); 4997 un->c.un_revision |= MD_64BIT_META_DEV; 4998 un->c.un_flag |= MD_EFILABEL; 4999 break; 5000 } 5001 MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision); 5002 5003 /* 5004 * Create minor device node for snarfed entry. 5005 */ 5006 (void) md_create_minor_node(setno, MD_SID(un)); 5007 5008 if (MD_UNIT(MD_SID(un)) != NULL) { 5009 mddb_setrecprivate(recid, MD_PRV_PENDDEL); 5010 continue; 5011 } 5012 all_mirrors_gotten = 0; 5013 retval = mirror_build_incore(un, 1); 5014 if (retval == 0) { 5015 mddb_setrecprivate(recid, MD_PRV_GOTIT); 5016 md_create_unit_incore(MD_SID(un), &mirror_md_ops, 0); 5017 resync_start_timeout(setno); 5018 gotsomething = 1; 5019 } else { 5020 return (retval); 5021 } 5022 /* 5023 * Set flag to indicate that the mirror has not yet 5024 * been through a reconfig. This flag is used for MN sets 5025 * when determining whether to update the mirror state from 5026 * the Master node. 5027 */ 5028 if (MD_MNSET_SETNO(setno)) { 5029 ui = MDI_UNIT(MD_SID(un)); 5030 ui->ui_tstate |= MD_RESYNC_NOT_DONE; 5031 } 5032 } 5033 5034 if (!all_mirrors_gotten) 5035 return (gotsomething); 5036 5037 recid = mddb_makerecid(setno, 0); 5038 while ((recid = mddb_getnextrec(recid, typ1, RESYNC_REC)) > 0) 5039 if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT)) 5040 mddb_setrecprivate(recid, MD_PRV_PENDDEL); 5041 5042 return (0); 5043 } 5044 5045 static int 5046 mirror_halt(md_haltcmd_t cmd, set_t setno) 5047 { 5048 unit_t i; 5049 mdi_unit_t *ui; 5050 minor_t mnum; 5051 int reset_mirror_flag = 0; 5052 5053 if (cmd == MD_HALT_CLOSE) 5054 return (0); 5055 5056 if (cmd == MD_HALT_OPEN) 5057 return (0); 5058 5059 if (cmd == MD_HALT_UNLOAD) 5060 return (0); 5061 5062 if (cmd == MD_HALT_CHECK) { 5063 for (i = 0; i < md_nunits; i++) { 5064 mnum = MD_MKMIN(setno, i); 5065 if ((ui = MDI_UNIT(mnum)) == NULL) 5066 continue; 5067 if (ui->ui_opsindex != mirror_md_ops.md_selfindex) 5068 continue; 5069 if (md_unit_isopen(ui)) 5070 return (1); 5071 } 5072 return (0); 5073 } 5074 5075 if (cmd != MD_HALT_DOIT) 5076 return (1); 5077 5078 for (i = 0; i < md_nunits; i++) { 5079 mnum = MD_MKMIN(setno, i); 5080 if ((ui = MDI_UNIT(mnum)) == NULL) 5081 continue; 5082 if (ui->ui_opsindex != mirror_md_ops.md_selfindex) 5083 continue; 5084 reset_mirror((mm_unit_t *)MD_UNIT(mnum), mnum, 0); 5085 5086 /* Set a flag if there is at least one mirror metadevice. */ 5087 reset_mirror_flag = 1; 5088 } 5089 5090 /* 5091 * Only wait for the global dr_timeout to finish 5092 * - if there are mirror metadevices in this diskset or 5093 * - if this is the local set since an unload of the md_mirror 5094 * driver could follow a successful mirror halt in the local set. 5095 */ 5096 if ((reset_mirror_flag != 0) || (setno == MD_LOCAL_SET)) { 5097 while ((mirror_md_ops.md_head == NULL) && 5098 (mirror_timeout.dr_timeout_id != 0)) 5099 delay(md_hz); 5100 } 5101 5102 return (0); 5103 } 5104 5105 /*ARGSUSED3*/ 5106 static int 5107 mirror_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags) 5108 { 5109 IOLOCK lock; 5110 minor_t mnum = getminor(*dev); 5111 set_t setno; 5112 5113 /* 5114 * When doing an open of a multi owner metadevice, check to see if this 5115 * node is a starting node and if a reconfig cycle is underway. 
5116 * If so, the system isn't sufficiently set up to handle the 5117 * open (which involves I/O during sp_validate), so fail with ENXIO. 5118 */ 5119 setno = MD_MIN2SET(mnum); 5120 if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) == 5121 (MD_SET_MNSET | MD_SET_MN_START_RC)) { 5122 return (ENXIO); 5123 } 5124 5125 if (md_oflags & MD_OFLG_FROMIOCTL) { 5126 /* 5127 * This indicates that the caller is an ioctl service routine. 5128 * In this case we initialize our stack-based IOLOCK and pass 5129 * this into the internal open routine. This allows multi-owner 5130 * metadevices to avoid deadlocking if an error is encountered 5131 * during the open() attempt. The failure case is: 5132 * s-p -> mirror -> s-p (with error). Attempting to metaclear 5133 * this configuration would deadlock as the mirror code has to 5134 * send a state-update to the other nodes when it detects the 5135 * failure of the underlying submirror with an errored soft-part 5136 * on it. As there is a class1 message in progress (metaclear) 5137 * set_sm_comp_state() cannot send another class1 message; 5138 * instead we do not send a state_update message as the 5139 * metaclear is distributed and the failed submirror will be 5140 * cleared from the configuration by the metaclear. 5141 */ 5142 IOLOCK_INIT(&lock); 5143 return (mirror_internal_open(getminor(*dev), flag, otyp, 5144 md_oflags, &lock)); 5145 } else { 5146 return (mirror_internal_open(getminor(*dev), flag, otyp, 5147 md_oflags, (IOLOCK *)NULL)); 5148 } 5149 } 5150 5151 5152 /*ARGSUSED1*/ 5153 static int 5154 mirror_close(dev_t dev, int flag, int otyp, cred_t *cred_p, int md_cflags) 5155 { 5156 return (mirror_internal_close(getminor(dev), otyp, md_cflags, 5157 (IOLOCK *)NULL)); 5158 } 5159 5160 5161 /* 5162 * This routine dumps memory to the disk. It assumes that the memory has 5163 * already been mapped into mainbus space. It is called at disk interrupt 5164 * priority when the system is in trouble. 5165 * 5166 */ 5167 static int 5168 mirror_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk) 5169 { 5170 mm_unit_t *un; 5171 dev_t mapdev; 5172 int result; 5173 int smi; 5174 int any_succeed = 0; 5175 int save_result = 0; 5176 5177 /* 5178 * Don't need to grab the unit lock, because nothing else is 5179 * supposed to be happening. Also, dump is not supposed to 5180 * sleep. 5181 */ 5182 un = (mm_unit_t *)MD_UNIT(getminor(dev)); 5183 5184 if ((diskaddr_t)blkno >= un->c.un_total_blocks) 5185 return (EINVAL); 5186 5187 if ((diskaddr_t)blkno + nblk > un->c.un_total_blocks) 5188 return (EINVAL); 5189 5190 for (smi = 0; smi < NMIRROR; smi++) { 5191 if (!SUBMIRROR_IS_WRITEABLE(un, smi)) 5192 continue; 5193 mapdev = md_dev64_to_dev(un->un_sm[smi].sm_dev); 5194 result = bdev_dump(mapdev, addr, blkno, nblk); 5195 if (result) 5196 save_result = result; 5197 5198 if (result == 0) 5199 any_succeed++; 5200 } 5201 5202 if (any_succeed) 5203 return (0); 5204 5205 return (save_result); 5206 } 5207 5208 /* 5209 * NAME: mirror_probe_dev 5210 * 5211 * DESCRIPTION: forcibly opens every component of a mirror.
5212 * 5213 * On entry the unit writerlock is held 5214 */ 5215 static int 5216 mirror_probe_dev(mdi_unit_t *ui, minor_t mnum) 5217 { 5218 int i; 5219 int smi; 5220 int ci; 5221 mm_unit_t *un; 5222 int md_devopen = 0; 5223 set_t setno; 5224 int sm_cnt; 5225 int sm_unavail_cnt; 5226 5227 if (md_unit_isopen(ui)) 5228 md_devopen++; 5229 5230 un = MD_UNIT(mnum); 5231 setno = MD_UN2SET(un); 5232 5233 sm_cnt = 0; 5234 sm_unavail_cnt = 0; 5235 for (i = 0; i < NMIRROR; i++) { 5236 md_dev64_t tmpdev; 5237 mdi_unit_t *sm_ui; 5238 5239 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) { 5240 continue; 5241 } 5242 5243 sm_cnt++; 5244 tmpdev = un->un_sm[i].sm_dev; 5245 (void) md_layered_open(mnum, &tmpdev, 5246 MD_OFLG_CONT_ERRS | MD_OFLG_PROBEDEV); 5247 un->un_sm[i].sm_dev = tmpdev; 5248 5249 sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev))); 5250 5251 /* 5252 * Logic similar to that in mirror_open_all_devs. We set or 5253 * clear the submirror Unavailable bit. 5254 */ 5255 (void) md_unit_writerlock(sm_ui); 5256 if (submirror_unavailable(un, i, 1)) { 5257 sm_ui->ui_tstate |= MD_INACCESSIBLE; 5258 sm_unavail_cnt++; 5259 } else { 5260 sm_ui->ui_tstate &= ~MD_INACCESSIBLE; 5261 } 5262 md_unit_writerexit(sm_ui); 5263 } 5264 5265 /* 5266 * If all of the submirrors are unavailable, the mirror is also 5267 * unavailable. 5268 */ 5269 if (sm_cnt == sm_unavail_cnt) { 5270 ui->ui_tstate |= MD_INACCESSIBLE; 5271 } else { 5272 ui->ui_tstate &= ~MD_INACCESSIBLE; 5273 } 5274 5275 /* 5276 * Start checking from probe failures. If failures occur we 5277 * set the appropriate erred state only if the metadevice is in 5278 * use. This is specifically to prevent unnecessary resyncs. 5279 * For instance if the disks were accidentally disconnected when 5280 * the system booted up then until the metadevice is accessed 5281 * (like file system mount) the user can shutdown, recable and 5282 * reboot w/o incurring a potentially huge resync. 
5283 */ 5284 5285 smi = 0; 5286 ci = 0; 5287 while (mirror_geterror(un, &smi, &ci, 1, 1) != 0) { 5288 5289 if (mirror_other_sources(un, smi, ci, 0) == 1) { 5290 /* 5291 * Note that for a MN set, there is no need to call 5292 * SE_NOTIFY as that is done when processing the 5293 * state change 5294 */ 5295 if (md_devopen) { 5296 /* 5297 * Never called from ioctl context, 5298 * so (IOLOCK *)NULL 5299 */ 5300 set_sm_comp_state(un, smi, ci, CS_LAST_ERRED, 5301 0, MD_STATE_XMIT, (IOLOCK *)NULL); 5302 if (!MD_MNSET_SETNO(setno)) { 5303 SE_NOTIFY(EC_SVM_STATE, 5304 ESC_SVM_LASTERRED, 5305 SVM_TAG_METADEVICE, setno, 5306 MD_SID(un)); 5307 } 5308 continue; 5309 } else { 5310 (void) mirror_close_all_devs(un, 5311 MD_OFLG_PROBEDEV); 5312 if (!MD_MNSET_SETNO(setno)) { 5313 SE_NOTIFY(EC_SVM_STATE, 5314 ESC_SVM_OPEN_FAIL, 5315 SVM_TAG_METADEVICE, setno, 5316 MD_SID(un)); 5317 } 5318 mirror_openfail_console_info(un, smi, ci); 5319 return (ENXIO); 5320 } 5321 } 5322 5323 /* 5324 * Note that for a MN set, there is no need to call 5325 * SE_NOTIFY as that is done when processing the 5326 * state change 5327 */ 5328 if (md_devopen) { 5329 /* Never called from ioctl context, so (IOLOCK *)NULL */ 5330 set_sm_comp_state(un, smi, ci, CS_ERRED, 0, 5331 MD_STATE_XMIT, (IOLOCK *)NULL); 5332 if (!MD_MNSET_SETNO(setno)) { 5333 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, 5334 SVM_TAG_METADEVICE, setno, 5335 MD_SID(un)); 5336 } 5337 } 5338 mirror_openfail_console_info(un, smi, ci); 5339 ci++; 5340 } 5341 5342 if (MD_MNSET_SETNO(setno)) { 5343 send_poke_hotspares(setno); 5344 } else { 5345 (void) poke_hotspares(); 5346 } 5347 (void) mirror_close_all_devs(un, MD_OFLG_PROBEDEV); 5348 5349 return (0); 5350 } 5351 5352 5353 static int 5354 mirror_imp_set( 5355 set_t setno 5356 ) 5357 { 5358 5359 mddb_recid_t recid; 5360 int gotsomething, i; 5361 mddb_type_t typ1; 5362 mddb_de_ic_t *dep; 5363 mddb_rb32_t *rbp; 5364 mm_unit32_od_t *un32; 5365 mm_unit_t *un64; 5366 md_dev64_t self_devt; 5367 minor_t *self_id; /* minor needs to be updated */ 5368 md_parent_t *parent_id; /* parent needs to be updated */ 5369 mddb_recid_t *record_id; /* record id needs to be updated */ 5370 mddb_recid_t *optrec_id; 5371 md_dev64_t tmpdev; 5372 5373 5374 gotsomething = 0; 5375 5376 typ1 = (mddb_type_t)md_getshared_key(setno, 5377 mirror_md_ops.md_driver.md_drivername); 5378 recid = mddb_makerecid(setno, 0); 5379 5380 while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) { 5381 if (mddb_getrecprivate(recid) & MD_PRV_GOTIT) 5382 continue; 5383 5384 dep = mddb_getrecdep(recid); 5385 rbp = dep->de_rb; 5386 5387 switch (rbp->rb_revision) { 5388 case MDDB_REV_RB: 5389 case MDDB_REV_RBFN: 5390 /* 5391 * Small device 5392 */ 5393 un32 = (mm_unit32_od_t *)mddb_getrecaddr(recid); 5394 self_id = &(un32->c.un_self_id); 5395 parent_id = &(un32->c.un_parent); 5396 record_id = &(un32->c.un_record_id); 5397 optrec_id = &(un32->un_rr_dirty_recid); 5398 5399 for (i = 0; i < un32->un_nsm; i++) { 5400 tmpdev = md_expldev(un32->un_sm[i].sm_dev); 5401 un32->un_sm[i].sm_dev = md_cmpldev 5402 (md_makedevice(md_major, MD_MKMIN(setno, 5403 MD_MIN2UNIT(md_getminor(tmpdev))))); 5404 5405 if (!md_update_minor(setno, mddb_getsidenum 5406 (setno), un32->un_sm[i].sm_key)) 5407 goto out; 5408 } 5409 break; 5410 case MDDB_REV_RB64: 5411 case MDDB_REV_RB64FN: 5412 un64 = (mm_unit_t *)mddb_getrecaddr(recid); 5413 self_id = &(un64->c.un_self_id); 5414 parent_id = &(un64->c.un_parent); 5415 record_id = &(un64->c.un_record_id); 5416 optrec_id = &(un64->un_rr_dirty_recid); 5417 5418 
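			/*
			 * Rebase each submirror dev_t into the importing
			 * set: keep the unit number but recombine it with
			 * the new setno (the same scheme as the 32-bit
			 * case above), then refresh the submirror's entry
			 * in the namespace via md_update_minor().
			 */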
for (i = 0; i < un64->un_nsm; i++) { 5419 tmpdev = un64->un_sm[i].sm_dev; 5420 un64->un_sm[i].sm_dev = md_makedevice 5421 (md_major, MD_MKMIN(setno, MD_MIN2UNIT 5422 (md_getminor(tmpdev)))); 5423 5424 if (!md_update_minor(setno, mddb_getsidenum 5425 (setno), un64->un_sm[i].sm_key)) 5426 goto out; 5427 } 5428 break; 5429 } 5430 5431 /* 5432 * If this is a top level and a friendly name metadevice, 5433 * update its minor in the namespace. 5434 */ 5435 if ((*parent_id == MD_NO_PARENT) && 5436 ((rbp->rb_revision == MDDB_REV_RBFN) || 5437 (rbp->rb_revision == MDDB_REV_RB64FN))) { 5438 5439 self_devt = md_makedevice(md_major, *self_id); 5440 if (!md_update_top_device_minor(setno, 5441 mddb_getsidenum(setno), self_devt)) 5442 goto out; 5443 } 5444 5445 /* 5446 * Update unit with the imported setno 5447 * 5448 */ 5449 mddb_setrecprivate(recid, MD_PRV_GOTIT); 5450 5451 *self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id)); 5452 if (*parent_id != MD_NO_PARENT) 5453 *parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id)); 5454 *record_id = MAKERECID(setno, DBID(*record_id)); 5455 *optrec_id = MAKERECID(setno, DBID(*optrec_id)); 5456 5457 gotsomething = 1; 5458 } 5459 5460 out: 5461 return (gotsomething); 5462 } 5463 5464 /* 5465 * NAME: mirror_check_offline 5466 * 5467 * DESCRIPTION: return offline_status = 1 if any submirrors are offline 5468 * 5469 * Called from ioctl, so access to MD_UN_OFFLINE_SM in un_status is 5470 * protected by the global ioctl lock as it is only set by the MD_IOCOFFLINE 5471 * ioctl. 5472 */ 5473 int 5474 mirror_check_offline(md_dev64_t dev, int *offline_status) 5475 { 5476 mm_unit_t *un; 5477 md_error_t mde = mdnullerror; 5478 5479 if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL) 5480 return (EINVAL); 5481 *offline_status = 0; 5482 if (un->c.un_status & MD_UN_OFFLINE_SM) 5483 *offline_status = 1; 5484 return (0); 5485 } 5486 5487 /* 5488 * NAME: mirror_inc_abr_count 5489 * 5490 * DESCRIPTION: increment the count of layered soft parts with ABR set 5491 * 5492 * Called from ioctl, so access to un_abr_count is protected by the global 5493 * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl. 5494 */ 5495 int 5496 mirror_inc_abr_count(md_dev64_t dev) 5497 { 5498 mm_unit_t *un; 5499 md_error_t mde = mdnullerror; 5500 5501 if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL) 5502 return (EINVAL); 5503 un->un_abr_count++; 5504 return (0); 5505 } 5506 5507 /* 5508 * NAME: mirror_dec_abr_count 5509 * 5510 * DESCRIPTION: decrement the count of layered soft parts with ABR set 5511 * 5512 * Called from ioctl, so access to un_abr_count is protected by the global 5513 * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl. 
5514 */ 5515 int 5516 mirror_dec_abr_count(md_dev64_t dev) 5517 { 5518 mm_unit_t *un; 5519 md_error_t mde = mdnullerror; 5520 5521 if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL) 5522 return (EINVAL); 5523 un->un_abr_count--; 5524 return (0); 5525 } 5526 5527 static md_named_services_t mirror_named_services[] = { 5528 {(intptr_t (*)()) poke_hotspares, "poke hotspares" }, 5529 {(intptr_t (*)()) mirror_rename_listkids, MDRNM_LIST_URKIDS }, 5530 {mirror_rename_check, MDRNM_CHECK }, 5531 {(intptr_t (*)()) mirror_renexch_update_kids, MDRNM_UPDATE_KIDS }, 5532 {(intptr_t (*)()) mirror_exchange_parent_update_to, 5533 MDRNM_PARENT_UPDATE_TO}, 5534 {(intptr_t (*)()) mirror_exchange_self_update_from_down, 5535 MDRNM_SELF_UPDATE_FROM_DOWN }, 5536 {(intptr_t (*)())mirror_probe_dev, "probe open test" }, 5537 {(intptr_t (*)())mirror_check_offline, MD_CHECK_OFFLINE }, 5538 {(intptr_t (*)())mirror_inc_abr_count, MD_INC_ABR_COUNT }, 5539 {(intptr_t (*)())mirror_dec_abr_count, MD_DEC_ABR_COUNT }, 5540 { NULL, 0 } 5541 }; 5542 5543 md_ops_t mirror_md_ops = { 5544 mirror_open, /* open */ 5545 mirror_close, /* close */ 5546 md_mirror_strategy, /* strategy */ 5547 NULL, /* print */ 5548 mirror_dump, /* dump */ 5549 NULL, /* read */ 5550 NULL, /* write */ 5551 md_mirror_ioctl, /* mirror_ioctl, */ 5552 mirror_snarf, /* mirror_snarf */ 5553 mirror_halt, /* mirror_halt */ 5554 NULL, /* aread */ 5555 NULL, /* awrite */ 5556 mirror_imp_set, /* import set */ 5557 mirror_named_services 5558 }; 5559 5560 /* module-specific initialization */ 5561 static void 5562 init_init() 5563 { 5564 md_mirror_mcs_buf_off = sizeof (md_mcs_t) - sizeof (buf_t); 5565 5566 /* Initialize the parent and child save memory pools */ 5567 mirror_parent_cache = kmem_cache_create("md_mirror_parent", 5568 sizeof (md_mps_t), 0, mirror_parent_constructor, 5569 mirror_parent_destructor, mirror_run_queue, NULL, NULL, 5570 0); 5571 5572 mirror_child_cache = kmem_cache_create("md_mirror_child", 5573 sizeof (md_mcs_t) - sizeof (buf_t) + biosize(), 0, 5574 mirror_child_constructor, mirror_child_destructor, 5575 mirror_run_queue, NULL, NULL, 0); 5576 5577 /* 5578 * Ensure wowbuf_size is a multiple of DEV_BSIZE, then initialize 5579 * the wowbuf memory pool (see the sizing note at the end of this file). 5580 */ 5581 md_wowbuf_size = roundup(md_wowbuf_size, DEV_BSIZE); 5582 if (md_wowbuf_size <= 0) 5583 md_wowbuf_size = 2 * DEV_BSIZE; 5584 if (md_wowbuf_size > (32 * DEV_BSIZE)) 5585 md_wowbuf_size = (32 * DEV_BSIZE); 5586 5587 md_wowblk_size = md_wowbuf_size + sizeof (wowhdr_t); 5588 mirror_wowblk_cache = kmem_cache_create("md_mirror_wow", 5589 md_wowblk_size, 0, NULL, NULL, NULL, NULL, NULL, 0); 5590 5591 mutex_init(&mirror_timeout.dr_mx, NULL, MUTEX_DEFAULT, NULL); 5592 mutex_init(&hotspare_request.dr_mx, NULL, MUTEX_DEFAULT, NULL); 5593 5594 mutex_init(&non_ff_drv_mutex, NULL, MUTEX_DEFAULT, NULL); 5595 } 5596 5597 /* module-specific uninitialization (undo init_init()) */ 5598 static void 5599 fini_uninit() 5600 { 5601 kmem_cache_destroy(mirror_parent_cache); 5602 kmem_cache_destroy(mirror_child_cache); 5603 kmem_cache_destroy(mirror_wowblk_cache); 5604 mirror_parent_cache = mirror_child_cache = 5605 mirror_wowblk_cache = NULL; 5606 5607 mutex_destroy(&mirror_timeout.dr_mx); 5608 mutex_destroy(&hotspare_request.dr_mx); 5609 mutex_destroy(&non_ff_drv_mutex); 5610 } 5611 5612 /* define the module linkage */ 5613 MD_PLUGIN_MISC_MODULE("mirrors module", init_init(), fini_uninit()) 5614
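
/*
 * Sizing note for init_init() above (illustrative arithmetic only, not
 * additional driver logic): md_wowbuf_size is rounded up to a multiple
 * of DEV_BSIZE, falls back to 2 * DEV_BSIZE if it was tuned to a
 * non-positive value, and is capped at 32 * DEV_BSIZE; with the usual
 * DEV_BSIZE of 512 the cap is 16384 bytes. md_wowblk_size, and hence
 * the object size of the md_mirror_wow cache, is that buffer size plus
 * sizeof (wowhdr_t).
 */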