1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/param.h> 29 #include <sys/systm.h> 30 #include <sys/conf.h> 31 #include <sys/file.h> 32 #include <sys/user.h> 33 #include <sys/uio.h> 34 #include <sys/t_lock.h> 35 #include <sys/buf.h> 36 #include <sys/dkio.h> 37 #include <sys/vtoc.h> 38 #include <sys/kmem.h> 39 #include <vm/page.h> 40 #include <sys/cmn_err.h> 41 #include <sys/sysmacros.h> 42 #include <sys/types.h> 43 #include <sys/mkdev.h> 44 #include <sys/stat.h> 45 #include <sys/open.h> 46 #include <sys/modctl.h> 47 #include <sys/ddi.h> 48 #include <sys/sunddi.h> 49 #include <sys/debug.h> 50 #include <sys/dklabel.h> 51 #include <vm/hat.h> 52 #include <sys/lvm/mdvar.h> 53 #include <sys/lvm/md_mirror.h> 54 #include <sys/lvm/md_convert.h> 55 #include <sys/lvm/md_mddb.h> 56 #include <sys/esunddi.h> 57 58 #include <sys/sysevent/eventdefs.h> 59 #include <sys/sysevent/svm.h> 60 #include <sys/lvm/mdmn_commd.h> 61 #include <sys/avl.h> 62 63 md_ops_t mirror_md_ops; 64 #ifndef lint 65 char _depends_on[] = "drv/md"; 66 md_ops_t *md_interface_ops = &mirror_md_ops; 67 #endif 68 69 extern mdq_anchor_t md_done_daemon; 70 extern mdq_anchor_t md_mstr_daemon; 71 extern mdq_anchor_t md_mirror_daemon; 72 extern mdq_anchor_t md_mirror_io_daemon; 73 extern mdq_anchor_t md_mirror_rs_daemon; 74 extern mdq_anchor_t md_mhs_daemon; 75 76 extern unit_t md_nunits; 77 extern set_t md_nsets; 78 extern md_set_t md_set[]; 79 80 extern int md_status; 81 extern clock_t md_hz; 82 83 extern md_krwlock_t md_unit_array_rw; 84 extern kmutex_t md_mx; 85 extern kcondvar_t md_cv; 86 extern int md_mtioctl_cnt; 87 88 daemon_request_t mirror_timeout; 89 static daemon_request_t hotspare_request; 90 static daemon_request_t mn_hs_request[MD_MAXSETS]; /* Multinode hs req */ 91 92 int md_mirror_mcs_buf_off; 93 94 /* Flags for mdmn_ksend_message to allow debugging */ 95 int md_mirror_msg_flags; 96 97 #ifdef DEBUG 98 /* Flag to switch on debug messages */ 99 int mirror_debug_flag = 0; 100 #endif 101 102 /* 103 * Struct used to hold count of DMR reads and the timestamp of last DMR read 104 * It is used to verify, using a debugger, that the DMR read ioctl has been 105 * executed. 106 */ 107 dmr_stats_t mirror_dmr_stats = {0, 0}; 108 109 /* 110 * Mutex protecting list of non-failfast drivers. 111 */ 112 static kmutex_t non_ff_drv_mutex; 113 extern char **non_ff_drivers; 114 115 extern major_t md_major; 116 117 /* 118 * Write-On-Write memory pool. 
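 * In outline: if the pages backing a mirrored write are modified while the
 * write is still in flight (see any_pages_dirty() and copy_write_cont()
 * later in this file), the data is re-issued from a private copy taken from
 * this pool so that every submirror receives identical data.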
 */
static void copy_write_cont(wowhdr_t *wowhdr);
static kmem_cache_t *mirror_wowblk_cache = NULL;
static int md_wowbuf_size = 16384;
static size_t md_wowblk_size;

/*
 * md_mirror_wow_flg is a flag that allows:
 *  - disabling the write-on-write mechanism.
 *  - logging occurrences of write-on-write.
 *  - switching wow handling procedure processing.
 * md_mirror_wow_cnt counts occurrences of WOW.
 */
static uint_t md_mirror_wow_flg = 0;
static int md_mirror_wow_cnt = 0;

/*
 * Tunable to enable/disable dirty region
 * processing when closing down a mirror.
 */
static int new_resync = 1;
kmem_cache_t *mirror_parent_cache = NULL;
kmem_cache_t *mirror_child_cache = NULL;

extern int md_ff_disable;		/* disable failfast */

static int mirror_map_write(mm_unit_t *, md_mcs_t *, md_mps_t *, int);
static void mirror_read_strategy(buf_t *, int, void *);
static void mirror_write_strategy(buf_t *, int, void *);
static void become_owner(daemon_queue_t *);
static int mirror_done(struct buf *cb);
static int mirror_done_common(struct buf *cb);
static void clear_retry_error(struct buf *cb);

/*
 * patchables
 */
int md_min_rr_size = 200;	/* 2000 blocks, or 100k */
int md_def_num_rr = 1000;	/* Default number of dirty regions */

/*
 * patchable to change delay before rescheduling mirror ownership request.
 * Value is clock ticks, default 0.5 seconds
 */
clock_t md_mirror_owner_to = 500000;

/*ARGSUSED1*/
static int
mirror_parent_constructor(void *p, void *d1, int d2)
{
	mutex_init(&((md_mps_t *)p)->ps_mx, NULL, MUTEX_DEFAULT, NULL);
	return (0);
}

static void
mirror_parent_init(md_mps_t *ps)
{
	bzero(ps, offsetof(md_mps_t, ps_mx));
}

/*ARGSUSED1*/
static void
mirror_parent_destructor(void *p, void *d)
{
	mutex_destroy(&((md_mps_t *)p)->ps_mx);
}

/*ARGSUSED1*/
static int
mirror_child_constructor(void *p, void *d1, int d2)
{
	bioinit(&((md_mcs_t *)p)->cs_buf);
	return (0);
}

void
mirror_child_init(md_mcs_t *cs)
{
	cs->cs_ps = NULL;
	cs->cs_mdunit = 0;
	md_bioreset(&cs->cs_buf);
}

/*ARGSUSED1*/
static void
mirror_child_destructor(void *p, void *d)
{
	biofini(&((md_mcs_t *)p)->cs_buf);
}

static void
mirror_wowblk_init(wowhdr_t *p)
{
	bzero(p, md_wowblk_size);
}

static void
send_poke_hotspares_msg(daemon_request_t *drq)
{
	int rval;
	md_mn_msg_pokehsp_t pokehsp;
	md_mn_kresult_t *kresult;
	set_t setno = (set_t)drq->dq.qlen;

	pokehsp.pokehsp_setno = setno;

	kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
	rval = mdmn_ksend_message(setno, MD_MN_MSG_POKE_HOTSPARES,
	    MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST, (char *)&pokehsp,
	    sizeof (pokehsp), kresult);

	if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
		mdmn_ksend_show_error(rval, kresult, "POKE_HOTSPARES");
		cmn_err(CE_PANIC,
		    "ksend_message failure: POKE_HOTSPARES");
	}
	kmem_free(kresult, sizeof (md_mn_kresult_t));

	/* Allow further requests to use this set's queue structure */
	mutex_enter(&drq->dr_mx);
	drq->dr_pending = 0;
	mutex_exit(&drq->dr_mx);
}

/*
 * Send a poke_hotspares message to the master node. To avoid swamping the
 * commd handler with requests we only send a message if there is not one
 * already outstanding. We punt the request to a separate thread context as we
 * cannot afford to block waiting on the request to be serviced. This is
 * essential when a reconfig cycle is in progress as any open() of a multinode
 * metadevice may result in a livelock.
 */
static void
send_poke_hotspares(set_t setno)
{
	daemon_request_t *drq = &mn_hs_request[setno];

	mutex_enter(&drq->dr_mx);
	if (drq->dr_pending == 0) {
		drq->dr_pending = 1;
		drq->dq.qlen = (int)setno;
		daemon_request(&md_mhs_daemon,
		    send_poke_hotspares_msg, (daemon_queue_t *)drq, REQ_OLD);
	}
	mutex_exit(&drq->dr_mx);
}

void
mirror_set_sm_state(
	mm_submirror_t *sm,
	mm_submirror_ic_t *smic,
	sm_state_t newstate,
	int force)
{
	int compcnt;
	int i;
	int errcnt;
	sm_state_t origstate;
	md_m_shared_t *shared;

	if (force) {
		sm->sm_state = newstate;
		uniqtime32(&sm->sm_timestamp);
		return;
	}

	origstate = newstate;

	compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
	for (i = 0, errcnt = 0; i < compcnt; i++) {
		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
		    (sm->sm_dev, sm, i);
		if (shared->ms_state & (CS_ERRED | CS_LAST_ERRED))
			newstate |= SMS_COMP_ERRED;
		if (shared->ms_state & (CS_RESYNC))
			newstate |= SMS_COMP_RESYNC;
		if (shared->ms_state & CS_ERRED)
			errcnt++;
	}

	if ((newstate & (SMS_COMP_ERRED | SMS_COMP_RESYNC)) != 0)
		newstate &= ~origstate;

	if (errcnt == compcnt)
		newstate |= SMS_ALL_ERRED;
	else
		newstate &= ~SMS_ALL_ERRED;

	sm->sm_state = newstate;
	uniqtime32(&sm->sm_timestamp);
}

static int
mirror_geterror(mm_unit_t *un, int *smi, int *cip, int clr_error,
    int frm_probe)
{
	mm_submirror_t *sm;
	mm_submirror_ic_t *smic;
	md_m_shared_t *shared;
	int ci;
	int i;
	int compcnt;
	int open_comp;		/* flag for open component */

	for (i = *smi; i < NMIRROR; i++) {
		sm = &un->un_sm[i];
		smic = &un->un_smic[i];

		if (!SMS_IS(sm, SMS_INUSE))
			continue;

		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
		for (ci = *cip; ci < compcnt; ci++) {
			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
			    (sm->sm_dev, sm, ci);
			/*
			 * If called from any routine but probe, we check for
			 * the MDM_S_ISOPEN flag. Since probe does a pseudo
			 * open, it sets the MDM_S_PROBEOPEN flag and we test
			 * for that flag instead. The two tests are mutually
			 * exclusive.
			 */
			open_comp = (frm_probe) ?
			    (shared->ms_flags & MDM_S_PROBEOPEN):
			    (shared->ms_flags & MDM_S_ISOPEN);
			if ((shared->ms_flags & MDM_S_IOERR || !open_comp) &&
			    ((shared->ms_state == CS_OKAY) ||
			    (shared->ms_state == CS_RESYNC))) {
				if (clr_error) {
					shared->ms_flags &= ~MDM_S_IOERR;
				}
				*cip = ci;
				*smi = i;
				return (1);
			}

			if (clr_error && (shared->ms_flags & MDM_S_IOERR)) {
				shared->ms_flags &= ~MDM_S_IOERR;
			}
		}

		*cip = 0;
	}
	return (0);
}

/*ARGSUSED*/
static void
mirror_run_queue(void *d)
{
	if (!(md_status & MD_GBL_DAEMONS_LIVE))
		md_daemon(1, &md_done_daemon);
}
/*
 * check_comp_4_hotspares
 *
 * This function attempts to allocate a hotspare for this component if the
 * component is in error. In an MN set, the function can be called in 2 modes.
 * It can be called either when a component error has been detected or when a
 * new hotspare has been allocated.
In this case, MD_HOTSPARE_XMIT is set 378 * in flags and the request is sent to all nodes. 379 * The handler on each of the nodes then calls this function with 380 * MD_HOTSPARE_XMIT unset and the hotspare allocation is then performed. 381 * 382 * For non-MN sets the function simply attempts to allocate a hotspare. 383 * 384 * On entry, the following locks are held 385 * mirror_md_ops.md_link_rw (if flags has MD_HOTSPARE_LINKHELD set) 386 * md_unit_writerlock 387 * 388 * Returns 0 if ok 389 * 1 if the unit containing the component has been cleared while 390 * the mdmn_ksend_message() was being executed 391 */ 392 extern int 393 check_comp_4_hotspares( 394 mm_unit_t *un, 395 int smi, 396 int ci, 397 uint_t flags, 398 mddb_recid_t hs_id, /* Only used by MN disksets */ 399 IOLOCK *lockp /* can be NULL */ 400 ) 401 { 402 mm_submirror_t *sm; 403 mm_submirror_ic_t *smic; 404 md_m_shared_t *shared; 405 mddb_recid_t recids[6]; 406 minor_t mnum; 407 intptr_t (*hs_dev)(); 408 void (*hs_done)(); 409 void *hs_data; 410 md_error_t mde = mdnullerror; 411 set_t setno; 412 md_mn_msg_allochsp_t allochspmsg; 413 md_mn_kresult_t *kresult; 414 mm_unit_t *new_un; 415 int rval; 416 417 mnum = MD_SID(un); 418 setno = MD_UN2SET(un); 419 sm = &un->un_sm[smi]; 420 smic = &un->un_smic[smi]; 421 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) 422 (sm->sm_dev, sm, ci); 423 424 if (shared->ms_state != CS_ERRED) 425 return (0); 426 427 /* Don't start a new component resync if a resync is already running. */ 428 if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) 429 return (0); 430 431 if (MD_MNSET_SETNO(setno) && (flags & MD_HOTSPARE_XMIT)) { 432 uint_t msgflags; 433 md_mn_msgtype_t msgtype; 434 435 /* Send allocate hotspare message to all nodes */ 436 437 allochspmsg.msg_allochsp_mnum = un->c.un_self_id; 438 allochspmsg.msg_allochsp_sm = smi; 439 allochspmsg.msg_allochsp_comp = ci; 440 allochspmsg.msg_allochsp_hs_id = shared->ms_hs_id; 441 442 /* 443 * Before calling mdmn_ksend_message(), release locks 444 * Can never be in the context of an ioctl. 445 */ 446 md_unit_writerexit(MDI_UNIT(mnum)); 447 if (flags & MD_HOTSPARE_LINKHELD) 448 rw_exit(&mirror_md_ops.md_link_rw.lock); 449 #ifdef DEBUG 450 if (mirror_debug_flag) 451 printf("send alloc hotspare, flags=" 452 "0x%x %x, %x, %x, %x\n", flags, 453 allochspmsg.msg_allochsp_mnum, 454 allochspmsg.msg_allochsp_sm, 455 allochspmsg.msg_allochsp_comp, 456 allochspmsg.msg_allochsp_hs_id); 457 #endif 458 if (flags & MD_HOTSPARE_WMUPDATE) { 459 msgtype = MD_MN_MSG_ALLOCATE_HOTSPARE2; 460 /* 461 * When coming from an update of watermarks, there 462 * must already be a message logged that triggered 463 * this action. So, no need to log this message, too. 464 */ 465 msgflags = MD_MSGF_NO_LOG; 466 } else { 467 msgtype = MD_MN_MSG_ALLOCATE_HOTSPARE; 468 msgflags = MD_MSGF_DEFAULT_FLAGS; 469 } 470 471 kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); 472 rval = mdmn_ksend_message(setno, msgtype, msgflags, 473 (char *)&allochspmsg, sizeof (allochspmsg), 474 kresult); 475 476 if (!MDMN_KSEND_MSG_OK(rval, kresult)) { 477 #ifdef DEBUG 478 if (mirror_debug_flag) 479 mdmn_ksend_show_error(rval, kresult, 480 "ALLOCATE HOTSPARE"); 481 #endif 482 /* 483 * If message is sent ok but exitval indicates an error 484 * it must be because the mirror has been cleared. 
In 485 * this case re-obtain lock and return an error 486 */ 487 if ((rval == 0) && (kresult->kmmr_exitval != 0)) { 488 if (flags & MD_HOTSPARE_LINKHELD) { 489 rw_enter(&mirror_md_ops.md_link_rw.lock, 490 RW_READER); 491 } 492 kmem_free(kresult, sizeof (md_mn_kresult_t)); 493 return (1); 494 } 495 cmn_err(CE_PANIC, 496 "ksend_message failure: ALLOCATE_HOTSPARE"); 497 } 498 kmem_free(kresult, sizeof (md_mn_kresult_t)); 499 500 /* 501 * re-obtain the locks 502 */ 503 if (flags & MD_HOTSPARE_LINKHELD) 504 rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER); 505 new_un = md_unit_writerlock(MDI_UNIT(mnum)); 506 507 /* 508 * As we had to release the locks in order to send the 509 * message to all nodes, we need to check to see if the 510 * unit has changed. If it has we release the writerlock 511 * and return fail. 512 */ 513 if ((new_un != un) || (un->c.un_type != MD_METAMIRROR)) { 514 md_unit_writerexit(MDI_UNIT(mnum)); 515 return (1); 516 } 517 } else { 518 if (MD_MNSET_SETNO(setno)) { 519 /* 520 * If 2 or more nodes simultaneously see a 521 * component failure, these nodes will each 522 * send an ALLOCATE_HOTSPARE[2] message. 523 * The first message will allocate the hotspare 524 * and the subsequent messages should do nothing. 525 * 526 * If a slave node doesn't have a hotspare allocated 527 * at the time the message is initiated, then the 528 * passed in hs_id will be 0. If the node 529 * executing this routine has a component shared 530 * ms_hs_id of non-zero, but the message shows a 531 * hs_id of 0, then just return since a hotspare 532 * has already been allocated for this failing 533 * component. When the slave node returns from 534 * the ksend_message the hotspare will have 535 * already been allocated. 536 * 537 * If the slave node does send an hs_id of non-zero, 538 * and the slave node's hs_id matches this node's 539 * ms_hs_id, then the hotspare has error'd and 540 * should be replaced. 541 * 542 * If the slave node sends an hs_id of non-zero and 543 * this node has a different shared ms_hs_id, then 544 * just return since this hotspare has already 545 * been hotspared. 546 */ 547 if (shared->ms_hs_id != 0) { 548 if (hs_id == 0) { 549 #ifdef DEBUG 550 if (mirror_debug_flag) { 551 printf("check_comp_4_hotspares" 552 "(NOXMIT), short circuit " 553 "hs_id=0x%x, " 554 "ms_hs_id=0x%x\n", 555 hs_id, shared->ms_hs_id); 556 } 557 #endif 558 return (0); 559 } 560 if (hs_id != shared->ms_hs_id) { 561 #ifdef DEBUG 562 if (mirror_debug_flag) { 563 printf("check_comp_4_hotspares" 564 "(NOXMIT), short circuit2 " 565 "hs_id=0x%x, " 566 "ms_hs_id=0x%x\n", 567 hs_id, shared->ms_hs_id); 568 } 569 #endif 570 return (0); 571 } 572 } 573 } 574 575 sm = &un->un_sm[smi]; 576 hs_dev = md_get_named_service(sm->sm_dev, 0, 577 "hotspare device", 0); 578 if ((*hs_dev)(sm->sm_dev, 0, ci, recids, 6, &hs_done, 579 &hs_data) != 0) 580 return (0); 581 582 /* 583 * set_sm_comp_state() commits the modified records. 584 * As we don't transmit the changes, no need to drop the lock. 585 */ 586 set_sm_comp_state(un, smi, ci, CS_RESYNC, recids, 587 MD_STATE_NO_XMIT, (IOLOCK *)NULL); 588 589 (*hs_done)(sm->sm_dev, hs_data); 590 591 mirror_check_failfast(mnum); 592 593 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_HOTSPARED, SVM_TAG_METADEVICE, 594 setno, MD_SID(un)); 595 596 /* 597 * For a multi-node set we need to reset the un_rs_type, 598 * un_rs_resync_done and un_rs_resync_2_do fields as the 599 * hot-spare resync must copy all applicable data. 
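 * Resetting these fields lets the resync kicked off below start from
 * scratch rather than trying to resume a previous resync's recorded
 * progress.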
600 */ 601 if (MD_MNSET_SETNO(setno)) { 602 un->un_rs_type = MD_RS_NONE; 603 un->un_rs_resync_done = 0; 604 un->un_rs_resync_2_do = 0; 605 } 606 607 /* 608 * Must drop writer lock since mirror_resync_unit will 609 * open devices and must be able to grab readerlock. 610 * Don't need to drop IOLOCK since any descendent routines 611 * calling ksend_messages will drop the IOLOCK as needed. 612 * 613 */ 614 if (lockp) { 615 md_ioctl_writerexit(lockp); 616 } else { 617 md_unit_writerexit(MDI_UNIT(mnum)); 618 } 619 620 /* start resync */ 621 (void) mirror_resync_unit(mnum, NULL, &mde, lockp); 622 623 if (lockp) { 624 new_un = md_ioctl_writerlock(lockp, MDI_UNIT(mnum)); 625 } else { 626 new_un = md_unit_writerlock(MDI_UNIT(mnum)); 627 } 628 } 629 return (0); 630 } 631 632 /* 633 * check_unit_4_hotspares 634 * 635 * For a given mirror, allocate hotspares, if available for any components 636 * that are in error 637 * 638 * Returns 0 if ok 639 * 1 if check_comp_4_hotspares returns non-zero. This will only 640 * happen for a MN unit where the unit has been cleared while 641 * the allocate hotspare message is sent to all nodes. 642 */ 643 static int 644 check_unit_4_hotspares(mm_unit_t *un, int flags) 645 { 646 mm_submirror_t *sm; 647 mm_submirror_ic_t *smic; 648 int ci; 649 int i; 650 int compcnt; 651 652 if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) 653 return (0); 654 655 for (i = 0; i < NMIRROR; i++) { 656 sm = &un->un_sm[i]; 657 smic = &un->un_smic[i]; 658 if (!SMS_IS(sm, SMS_INUSE)) 659 continue; 660 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, sm); 661 for (ci = 0; ci < compcnt; ci++) { 662 md_m_shared_t *shared; 663 664 shared = (md_m_shared_t *) 665 (*(smic->sm_shared_by_indx))(sm->sm_dev, sm, ci); 666 /* 667 * Never called from ioctl context, so pass in 668 * (IOLOCK *)NULL. Pass through flags from calling 669 * routine, also setting XMIT flag. 670 */ 671 if (check_comp_4_hotspares(un, i, ci, 672 (MD_HOTSPARE_XMIT | flags), 673 shared->ms_hs_id, (IOLOCK *)NULL) != 0) 674 return (1); 675 } 676 } 677 return (0); 678 } 679 680 static void 681 check_4_hotspares(daemon_request_t *drq) 682 { 683 mdi_unit_t *ui; 684 mm_unit_t *un; 685 md_link_t *next; 686 int x; 687 688 mutex_enter(&drq->dr_mx); /* clear up front so can poke */ 689 drq->dr_pending = 0; /* again in low level routine if */ 690 mutex_exit(&drq->dr_mx); /* something found to do */ 691 692 /* 693 * Used to have a problem here. The disksets weren't marked as being 694 * MNHOLD. This opened a window where we could be searching for 695 * hotspares and have the disk set unloaded (released) from under 696 * us causing a panic in stripe_component_count(). 697 * The way to prevent that is to mark the set MNHOLD which prevents 698 * any diskset from being released while we are scanning the mirrors, 699 * submirrors and components. 
 */

	for (x = 0; x < md_nsets; x++)
		md_holdset_enter(x);

	rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
	for (next = mirror_md_ops.md_head; next != NULL; next = next->ln_next) {
		ui = MDI_UNIT(next->ln_id);

		un = (mm_unit_t *)md_unit_readerlock(ui);

		/*
		 * Only check the unit if we are the master for this set.
		 * For an MN set, poke_hotspares() is only effective on the
		 * master
		 */
		if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
		    md_set[MD_UN2SET(un)].s_am_i_master == 0) {
			md_unit_readerexit(ui);
			continue;
		}
		if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) {
			md_unit_readerexit(ui);
			continue;
		}
		md_unit_readerexit(ui);

		un = (mm_unit_t *)md_unit_writerlock(ui);
		/*
		 * check_unit_4_hotspares will return 1 if the unit has been
		 * removed during the process of allocating the hotspare.
		 * This can only happen for an MN metadevice. If the unit no
		 * longer exists, there is no need to release the writerlock
		 */
		if (check_unit_4_hotspares(un, MD_HOTSPARE_LINKHELD) == 0)
			md_unit_writerexit(ui);
		else {
			/*
			 * If check_unit_4_hotspares failed, queue another
			 * request and break out of this one
			 */
			(void) poke_hotspares();
			break;
		}
	}
	rw_exit(&mirror_md_ops.md_link_rw.lock);

	for (x = 0; x < md_nsets; x++)
		md_holdset_exit(x);
}

/*
 * poke_hotspares
 *
 * If there is not a poke_hotspares request pending, queue a request
 * to call check_4_hotspares(). This will scan all mirrors and attempt to
 * allocate hotspares for all components in error.
 */
int
poke_hotspares()
{
	mutex_enter(&hotspare_request.dr_mx);
	if (hotspare_request.dr_pending == 0) {
		hotspare_request.dr_pending = 1;
		daemon_request(&md_mhs_daemon,
		    check_4_hotspares, (daemon_queue_t *)&hotspare_request,
		    REQ_OLD);
	}
	mutex_exit(&hotspare_request.dr_mx);
	return (0);
}

static void
free_all_ecomps(err_comp_t *ecomp)
{
	err_comp_t *d;

	while (ecomp != NULL) {
		d = ecomp;
		ecomp = ecomp->ec_next;
		kmem_free(d, sizeof (err_comp_t));
	}
}

/*
 * NAME: mirror_openfail_console_info
 *
 * DESCRIPTION: Prints an informative message to the console when a mirror
 *		cannot be opened.
 *
 * PARAMETERS: mm_unit_t *un - pointer to mirror unit structure
 *	       int smi - submirror index
 *	       int ci - component index
 */

void
mirror_openfail_console_info(mm_unit_t *un, int smi, int ci)
{
	void (*get_dev)();
	ms_cd_info_t cd;
	md_dev64_t tmpdev;

	tmpdev = un->un_sm[smi].sm_dev;
	get_dev = (void (*)())md_get_named_service(tmpdev, 0, "get device", 0);
	if (get_dev != NULL) {
		(void) (*get_dev)(tmpdev, smi, ci, &cd);
		cmn_err(CE_WARN, "md %s: open error on %s",
		    md_shortname(MD_SID(un)), md_devname(MD_UN2SET(un),
		    cd.cd_dev, NULL, 0));
	} else {
		cmn_err(CE_WARN, "md %s: open error",
		    md_shortname(MD_SID(un)));
	}
}

static int
mirror_close_all_devs(mm_unit_t *un, int md_cflags)
{
	int i;
	md_dev64_t dev;

	for (i = 0; i < NMIRROR; i++) {
		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
			continue;
		dev = un->un_sm[i].sm_dev;
		md_layered_close(dev, md_cflags);
	}
	return (0);
}

/*
 * Keep track of drivers that don't support failfast.
We use this so that 832 * we only log one diagnostic message for each of these drivers, no matter 833 * how many times we run the mirror_check_failfast function. 834 * Return 1 if this is a new driver that does not support failfast, 835 * return 0 if we have already seen this non-failfast driver. 836 */ 837 static int 838 new_non_ff_driver(const char *s) 839 { 840 mutex_enter(&non_ff_drv_mutex); 841 if (non_ff_drivers == NULL) { 842 non_ff_drivers = (char **)kmem_alloc(2 * sizeof (char *), 843 KM_NOSLEEP); 844 if (non_ff_drivers == NULL) { 845 mutex_exit(&non_ff_drv_mutex); 846 return (1); 847 } 848 849 non_ff_drivers[0] = (char *)kmem_alloc(strlen(s) + 1, 850 KM_NOSLEEP); 851 if (non_ff_drivers[0] == NULL) { 852 kmem_free(non_ff_drivers, 2 * sizeof (char *)); 853 non_ff_drivers = NULL; 854 mutex_exit(&non_ff_drv_mutex); 855 return (1); 856 } 857 858 (void) strcpy(non_ff_drivers[0], s); 859 non_ff_drivers[1] = NULL; 860 861 } else { 862 int i; 863 char **tnames; 864 char **tmp; 865 866 for (i = 0; non_ff_drivers[i] != NULL; i++) { 867 if (strcmp(s, non_ff_drivers[i]) == 0) { 868 mutex_exit(&non_ff_drv_mutex); 869 return (0); 870 } 871 } 872 873 /* allow for new element and null */ 874 i += 2; 875 tnames = (char **)kmem_alloc(i * sizeof (char *), KM_NOSLEEP); 876 if (tnames == NULL) { 877 mutex_exit(&non_ff_drv_mutex); 878 return (1); 879 } 880 881 for (i = 0; non_ff_drivers[i] != NULL; i++) 882 tnames[i] = non_ff_drivers[i]; 883 884 tnames[i] = (char *)kmem_alloc(strlen(s) + 1, KM_NOSLEEP); 885 if (tnames[i] == NULL) { 886 /* adjust i so that it is the right count to free */ 887 kmem_free(tnames, (i + 2) * sizeof (char *)); 888 mutex_exit(&non_ff_drv_mutex); 889 return (1); 890 } 891 892 (void) strcpy(tnames[i++], s); 893 tnames[i] = NULL; 894 895 tmp = non_ff_drivers; 896 non_ff_drivers = tnames; 897 /* i now represents the count we previously alloced */ 898 kmem_free(tmp, i * sizeof (char *)); 899 } 900 mutex_exit(&non_ff_drv_mutex); 901 902 return (1); 903 } 904 905 /* 906 * Check for the "ddi-failfast-supported" devtree property on each submirror 907 * component to indicate if we should do I/O to that submirror with the 908 * B_FAILFAST flag set or not. This check is made at various state transitions 909 * in the mirror code (e.g. open, enable, hotspare, etc.). Sometimes we 910 * only need to check one drive (e.g. hotspare) but since the check is 911 * fast and infrequent and sometimes needs to be done on all components we 912 * just check all components on each call. 
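 *
 * Note that a submirror only gets MD_SM_FAILFAST if every one of its
 * components reports the property; if any component's driver lacks it,
 * B_FAILFAST is not used for that submirror (see the per-component loop
 * below).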
913 */ 914 void 915 mirror_check_failfast(minor_t mnum) 916 { 917 int i; 918 mm_unit_t *un; 919 920 if (md_ff_disable) 921 return; 922 923 un = MD_UNIT(mnum); 924 925 for (i = 0; i < NMIRROR; i++) { 926 int ci; 927 int cnt; 928 int ff = 1; 929 mm_submirror_t *sm; 930 mm_submirror_ic_t *smic; 931 void (*get_dev)(); 932 933 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) 934 continue; 935 936 sm = &un->un_sm[i]; 937 smic = &un->un_smic[i]; 938 939 get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0, 940 "get device", 0); 941 942 cnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm); 943 for (ci = 0; ci < cnt; ci++) { 944 int found = 0; 945 dev_t ci_dev; 946 major_t major; 947 dev_info_t *devi; 948 ms_cd_info_t cd; 949 950 /* 951 * this already returns the hs 952 * dev if the device is spared 953 */ 954 (void) (*get_dev)(sm->sm_dev, sm, ci, &cd); 955 956 ci_dev = md_dev64_to_dev(cd.cd_dev); 957 major = getmajor(ci_dev); 958 959 if (major == md_major) { 960 /* 961 * this component must be a soft 962 * partition; get the real dev 963 */ 964 minor_t dev_mnum; 965 mdi_unit_t *ui; 966 mp_unit_t *un; 967 set_t setno; 968 side_t side; 969 md_dev64_t tmpdev; 970 971 ui = MDI_UNIT(getminor(ci_dev)); 972 973 /* grab necessary lock */ 974 un = (mp_unit_t *)md_unit_readerlock(ui); 975 976 dev_mnum = MD_SID(un); 977 setno = MD_MIN2SET(dev_mnum); 978 side = mddb_getsidenum(setno); 979 980 tmpdev = un->un_dev; 981 982 /* Get dev by device id */ 983 if (md_devid_found(setno, side, 984 un->un_key) == 1) { 985 tmpdev = md_resolve_bydevid(dev_mnum, 986 tmpdev, un->un_key); 987 } 988 989 md_unit_readerexit(ui); 990 991 ci_dev = md_dev64_to_dev(tmpdev); 992 major = getmajor(ci_dev); 993 } 994 995 if (ci_dev != NODEV32 && 996 (devi = e_ddi_hold_devi_by_dev(ci_dev, 0)) 997 != NULL) { 998 ddi_prop_op_t prop_op = PROP_LEN_AND_VAL_BUF; 999 int propvalue = 0; 1000 int proplength = sizeof (int); 1001 int error; 1002 struct cb_ops *cb; 1003 1004 if ((cb = devopsp[major]->devo_cb_ops) != 1005 NULL) { 1006 error = (*cb->cb_prop_op) 1007 (DDI_DEV_T_ANY, devi, prop_op, 1008 DDI_PROP_NOTPROM|DDI_PROP_DONTPASS, 1009 "ddi-failfast-supported", 1010 (caddr_t)&propvalue, &proplength); 1011 1012 if (error == DDI_PROP_SUCCESS) 1013 found = 1; 1014 } 1015 1016 if (!found && new_non_ff_driver( 1017 ddi_driver_name(devi))) { 1018 cmn_err(CE_NOTE, "!md: B_FAILFAST I/O" 1019 "disabled on %s", 1020 ddi_driver_name(devi)); 1021 } 1022 1023 ddi_release_devi(devi); 1024 } 1025 1026 /* 1027 * All components must support 1028 * failfast in the submirror. 1029 */ 1030 if (!found) { 1031 ff = 0; 1032 break; 1033 } 1034 } 1035 1036 if (ff) { 1037 sm->sm_flags |= MD_SM_FAILFAST; 1038 } else { 1039 sm->sm_flags &= ~MD_SM_FAILFAST; 1040 } 1041 } 1042 } 1043 1044 /* 1045 * Return true if the submirror is unavailable. 1046 * If any of the submirror components are opened then the submirror cannot 1047 * be unavailable (MD_INACCESSIBLE). 1048 * If any of the components are already in the errored state, then the submirror 1049 * cannot be unavailable (MD_INACCESSIBLE). 
1050 */ 1051 static bool_t 1052 submirror_unavailable(mm_unit_t *un, int smi, int from_probe) 1053 { 1054 mm_submirror_t *sm; 1055 mm_submirror_ic_t *smic; 1056 md_m_shared_t *shared; 1057 int ci; 1058 int compcnt; 1059 1060 sm = &un->un_sm[smi]; 1061 smic = &un->un_smic[smi]; 1062 1063 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un); 1064 for (ci = 0; ci < compcnt; ci++) { 1065 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) 1066 (sm->sm_dev, sm, ci); 1067 if (from_probe) { 1068 if (shared->ms_flags & MDM_S_PROBEOPEN) 1069 return (B_FALSE); 1070 } else { 1071 if (shared->ms_flags & MDM_S_ISOPEN) 1072 return (B_FALSE); 1073 } 1074 if (shared->ms_state == CS_ERRED || 1075 shared->ms_state == CS_LAST_ERRED) 1076 return (B_FALSE); 1077 } 1078 1079 return (B_TRUE); 1080 } 1081 1082 static int 1083 mirror_open_all_devs(minor_t mnum, int md_oflags, IOLOCK *lockp) 1084 { 1085 int i; 1086 mm_unit_t *un; 1087 mdi_unit_t *ui; 1088 int err; 1089 int smi; 1090 int ci; 1091 err_comp_t *c; 1092 err_comp_t *ecomps = NULL; 1093 int smmask = 0; 1094 set_t setno; 1095 int sm_cnt; 1096 int sm_unavail_cnt; 1097 1098 mirror_check_failfast(mnum); 1099 1100 un = MD_UNIT(mnum); 1101 ui = MDI_UNIT(mnum); 1102 setno = MD_UN2SET(un); 1103 1104 for (i = 0; i < NMIRROR; i++) { 1105 md_dev64_t tmpdev = un->un_sm[i].sm_dev; 1106 1107 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) 1108 continue; 1109 if (md_layered_open(mnum, &tmpdev, md_oflags)) 1110 smmask |= SMI2BIT(i); 1111 un->un_sm[i].sm_dev = tmpdev; 1112 } 1113 1114 /* 1115 * If smmask is clear, all submirrors are accessible. Clear the 1116 * MD_INACCESSIBLE bit in this case. This bit is also cleared for the 1117 * mirror device. If smmask is set, we have to determine which of the 1118 * submirrors are in error. If no submirror is accessible we mark the 1119 * whole mirror as MD_INACCESSIBLE. 1120 */ 1121 if (smmask == 0) { 1122 if (lockp) { 1123 md_ioctl_readerexit(lockp); 1124 (void) md_ioctl_writerlock(lockp, ui); 1125 } else { 1126 md_unit_readerexit(ui); 1127 (void) md_unit_writerlock(ui); 1128 } 1129 ui->ui_tstate &= ~MD_INACCESSIBLE; 1130 if (lockp) { 1131 md_ioctl_writerexit(lockp); 1132 (void) md_ioctl_readerlock(lockp, ui); 1133 } else { 1134 md_unit_writerexit(ui); 1135 (void) md_unit_readerlock(ui); 1136 } 1137 1138 for (i = 0; i < NMIRROR; i++) { 1139 md_dev64_t tmpdev; 1140 mdi_unit_t *sm_ui; 1141 1142 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) 1143 continue; 1144 1145 tmpdev = un->un_sm[i].sm_dev; 1146 sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev))); 1147 (void) md_unit_writerlock(sm_ui); 1148 sm_ui->ui_tstate &= ~MD_INACCESSIBLE; 1149 md_unit_writerexit(sm_ui); 1150 } 1151 1152 return (0); 1153 } 1154 1155 for (i = 0; i < NMIRROR; i++) { 1156 md_dev64_t tmpdev; 1157 1158 if (!(smmask & SMI2BIT(i))) 1159 continue; 1160 1161 tmpdev = un->un_sm[i].sm_dev; 1162 err = md_layered_open(mnum, &tmpdev, MD_OFLG_CONT_ERRS); 1163 un->un_sm[i].sm_dev = tmpdev; 1164 ASSERT(err == 0); 1165 } 1166 1167 if (lockp) { 1168 md_ioctl_readerexit(lockp); 1169 un = (mm_unit_t *)md_ioctl_writerlock(lockp, ui); 1170 } else { 1171 md_unit_readerexit(ui); 1172 un = (mm_unit_t *)md_unit_writerlock(ui); 1173 } 1174 1175 /* 1176 * We want to make sure the unavailable flag is not masking a real 1177 * error on the submirror. 1178 * For each submirror, 1179 * if all of the submirror components couldn't be opened and there 1180 * are no errors on the submirror, then set the unavailable flag 1181 * otherwise, clear unavailable. 
1182 */ 1183 sm_cnt = 0; 1184 sm_unavail_cnt = 0; 1185 for (i = 0; i < NMIRROR; i++) { 1186 md_dev64_t tmpdev; 1187 mdi_unit_t *sm_ui; 1188 1189 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) 1190 continue; 1191 1192 sm_cnt++; 1193 tmpdev = un->un_sm[i].sm_dev; 1194 sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev))); 1195 1196 (void) md_unit_writerlock(sm_ui); 1197 if (submirror_unavailable(un, i, 0)) { 1198 sm_ui->ui_tstate |= MD_INACCESSIBLE; 1199 sm_unavail_cnt++; 1200 } else { 1201 sm_ui->ui_tstate &= ~MD_INACCESSIBLE; 1202 } 1203 md_unit_writerexit(sm_ui); 1204 } 1205 1206 /* 1207 * If all of the submirrors are unavailable, the mirror is also 1208 * unavailable. 1209 */ 1210 if (sm_cnt == sm_unavail_cnt) { 1211 ui->ui_tstate |= MD_INACCESSIBLE; 1212 } else { 1213 ui->ui_tstate &= ~MD_INACCESSIBLE; 1214 } 1215 1216 smi = 0; 1217 ci = 0; 1218 while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) { 1219 if (mirror_other_sources(un, smi, ci, 1) == 1) { 1220 1221 free_all_ecomps(ecomps); 1222 (void) mirror_close_all_devs(un, md_oflags); 1223 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, 1224 SVM_TAG_METADEVICE, setno, MD_SID(un)); 1225 mirror_openfail_console_info(un, smi, ci); 1226 if (lockp) { 1227 md_ioctl_writerexit(lockp); 1228 (void) md_ioctl_readerlock(lockp, ui); 1229 } else { 1230 md_unit_writerexit(ui); 1231 (void) md_unit_readerlock(ui); 1232 } 1233 return (ENXIO); 1234 } 1235 1236 /* track all component states that need changing */ 1237 c = (err_comp_t *)kmem_alloc(sizeof (err_comp_t), KM_SLEEP); 1238 c->ec_next = ecomps; 1239 c->ec_smi = smi; 1240 c->ec_ci = ci; 1241 ecomps = c; 1242 ci++; 1243 } 1244 1245 /* Make all state changes and commit them */ 1246 for (c = ecomps; c != NULL; c = c->ec_next) { 1247 /* 1248 * If lockp is set, then entering kernel through ioctl. 1249 * For a MN set, the only ioctl path is via a commd message 1250 * (ALLOCATE_HOTSPARE or *RESYNC* messages) that is already 1251 * being sent to each node. 1252 * In this case, set NO_XMIT so that set_sm_comp_state 1253 * won't attempt to send a message on a message. 1254 * 1255 * In !MN sets, the xmit flag is ignored, so it doesn't matter 1256 * which flag is passed. 1257 */ 1258 if (lockp) { 1259 set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0, 1260 MD_STATE_NO_XMIT, lockp); 1261 } else { 1262 set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0, 1263 (MD_STATE_XMIT | MD_STATE_OCHELD), lockp); 1264 } 1265 /* 1266 * For a MN set, the NOTIFY is done when the state change is 1267 * processed on each node 1268 */ 1269 if (!MD_MNSET_SETNO(setno)) { 1270 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, 1271 SVM_TAG_METADEVICE, setno, MD_SID(un)); 1272 } 1273 } 1274 1275 if (lockp) { 1276 md_ioctl_writerexit(lockp); 1277 (void) md_ioctl_readerlock(lockp, ui); 1278 } else { 1279 md_unit_writerexit(ui); 1280 (void) md_unit_readerlock(ui); 1281 } 1282 1283 free_all_ecomps(ecomps); 1284 1285 /* allocate hotspares for all errored components */ 1286 if (MD_MNSET_SETNO(setno)) { 1287 /* 1288 * If we're called from an ioctl (lockp set) then we cannot 1289 * directly call send_poke_hotspares as this will block until 1290 * the message gets despatched to all nodes. If the cluster is 1291 * going through a reconfig cycle then the message will block 1292 * until the cycle is complete, and as we originate from a 1293 * service call from commd we will livelock. 
1294 */ 1295 if (lockp == NULL) { 1296 md_unit_readerexit(ui); 1297 send_poke_hotspares(setno); 1298 (void) md_unit_readerlock(ui); 1299 } 1300 } else { 1301 (void) poke_hotspares(); 1302 } 1303 return (0); 1304 } 1305 1306 void 1307 mirror_overlap_tree_remove(md_mps_t *ps) 1308 { 1309 mm_unit_t *un; 1310 1311 if (panicstr) 1312 return; 1313 1314 VERIFY(ps->ps_flags & MD_MPS_ON_OVERLAP); 1315 un = ps->ps_un; 1316 1317 mutex_enter(&un->un_overlap_tree_mx); 1318 avl_remove(&un->un_overlap_root, ps); 1319 ps->ps_flags &= ~MD_MPS_ON_OVERLAP; 1320 if (un->un_overlap_tree_flag != 0) { 1321 un->un_overlap_tree_flag = 0; 1322 cv_broadcast(&un->un_overlap_tree_cv); 1323 } 1324 mutex_exit(&un->un_overlap_tree_mx); 1325 } 1326 1327 1328 /* 1329 * wait_for_overlaps: 1330 * ----------------- 1331 * Check that given i/o request does not cause an overlap with already pending 1332 * i/o. If it does, block until the overlapped i/o completes. 1333 * 1334 * The flag argument has MD_OVERLAP_ALLOW_REPEAT set if it is ok for the parent 1335 * structure to be already in the overlap tree and MD_OVERLAP_NO_REPEAT if 1336 * it must not already be in the tree. 1337 */ 1338 static void 1339 wait_for_overlaps(md_mps_t *ps, int flags) 1340 { 1341 mm_unit_t *un; 1342 avl_index_t where; 1343 md_mps_t *ps1; 1344 1345 if (panicstr) 1346 return; 1347 1348 un = ps->ps_un; 1349 mutex_enter(&un->un_overlap_tree_mx); 1350 if ((flags & MD_OVERLAP_ALLOW_REPEAT) && 1351 (ps->ps_flags & MD_MPS_ON_OVERLAP)) { 1352 mutex_exit(&un->un_overlap_tree_mx); 1353 return; 1354 } 1355 1356 VERIFY(!(ps->ps_flags & MD_MPS_ON_OVERLAP)); 1357 1358 do { 1359 ps1 = avl_find(&un->un_overlap_root, ps, &where); 1360 if (ps1 == NULL) { 1361 /* 1362 * The candidate range does not overlap with any 1363 * range in the tree. Insert it and be done. 1364 */ 1365 avl_insert(&un->un_overlap_root, ps, where); 1366 ps->ps_flags |= MD_MPS_ON_OVERLAP; 1367 } else { 1368 /* 1369 * The candidate range would overlap. Set the flag 1370 * indicating we need to be woken up, and sleep 1371 * until another thread removes a range. If upon 1372 * waking up we find this mps was put on the tree 1373 * by another thread, the loop terminates. 1374 */ 1375 un->un_overlap_tree_flag = 1; 1376 cv_wait(&un->un_overlap_tree_cv, 1377 &un->un_overlap_tree_mx); 1378 } 1379 } while (!(ps->ps_flags & MD_MPS_ON_OVERLAP)); 1380 mutex_exit(&un->un_overlap_tree_mx); 1381 } 1382 1383 /* 1384 * This function is called from mirror_done to check whether any pages have 1385 * been modified while a mirrored write was in progress. Returns 0 if 1386 * all pages associated with bp are clean, 1 otherwise. 
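 * (biomodified(9F) returns -1 when the buffer's pages cannot be checked,
 * e.g. the buffer is not page mapped; that case is treated as clean here.)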
 */
static int
any_pages_dirty(struct buf *bp)
{
	int rval;

	rval = biomodified(bp);
	if (rval == -1)
		rval = 0;

	return (rval);
}

#define	MAX_EXTRAS	10

void
mirror_commit(
	mm_unit_t	*un,
	int		smmask,
	mddb_recid_t	*extras
)
{
	mm_submirror_t		*sm;
	md_unit_t		*su;
	int			i;

	/* 2=mirror,null id */
	mddb_recid_t		recids[NMIRROR+2+MAX_EXTRAS];

	int			ri = 0;

	if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)
		return;

	/* Add two, this includes the mirror unit and the null recid */
	if (extras != NULL) {
		int	nrecids = 0;
		while (extras[nrecids] != 0) {
			nrecids++;
		}
		ASSERT(nrecids <= MAX_EXTRAS);
	}

	if (un != NULL)
		recids[ri++] = un->c.un_record_id;
	for (i = 0; i < NMIRROR; i++) {
		if (!(smmask & SMI2BIT(i)))
			continue;
		sm = &un->un_sm[i];
		if (!SMS_IS(sm, SMS_INUSE))
			continue;
		if (md_getmajor(sm->sm_dev) != md_major)
			continue;
		su = MD_UNIT(md_getminor(sm->sm_dev));
		recids[ri++] = su->c.un_record_id;
	}

	if (extras != NULL)
		while (*extras != 0) {
			recids[ri++] = *extras;
			extras++;
		}

	if (ri == 0)
		return;
	recids[ri] = 0;

	/*
	 * Ok to hold ioctl lock across record commit to mddb as
	 * long as the record(s) being committed aren't resync records.
	 */
	mddb_commitrecs_wrapper(recids);
}


/*
 * This routine builds a bitmap (writable_bm) with a bit set for each
 * writable submirror of the mirror and counts those submirrors. The
 * bitmap is stored in ps->ps_writable_sm, the count of writable
 * submirrors in ps->ps_active_cnt, and ps->ps_current_sm is reset to 0.
 */

static void
select_write_units(struct mm_unit *un, md_mps_t *ps)
{

	int		i;
	unsigned	writable_bm = 0;
	unsigned	nunits = 0;

	for (i = 0; i < NMIRROR; i++) {
		if (SUBMIRROR_IS_WRITEABLE(un, i)) {
			/* set bit of all writable units */
			writable_bm |= SMI2BIT(i);
			nunits++;
		}
	}
	ps->ps_writable_sm = writable_bm;
	ps->ps_active_cnt = nunits;
	ps->ps_current_sm = 0;
}

static
unsigned
select_write_after_read_units(struct mm_unit *un, md_mps_t *ps)
{

	int		i;
	unsigned	writable_bm = 0;
	unsigned	nunits = 0;

	for (i = 0; i < NMIRROR; i++) {
		if (SUBMIRROR_IS_WRITEABLE(un, i) &&
		    un->un_sm[i].sm_flags & MD_SM_RESYNC_TARGET) {
			writable_bm |= SMI2BIT(i);
			nunits++;
		}
	}
	if ((writable_bm & ps->ps_allfrom_sm) != 0) {
		writable_bm &= ~ps->ps_allfrom_sm;
		nunits--;
	}
	ps->ps_writable_sm = writable_bm;
	ps->ps_active_cnt = nunits;
	ps->ps_current_sm = 0;
	return (nunits);
}

static md_dev64_t
select_read_unit(
	mm_unit_t	*un,
	diskaddr_t	blkno,
	u_longlong_t	reqcount,
	u_longlong_t	*cando,
	int		must_be_opened,
	md_m_shared_t	**shared,
	md_mcs_t	*cs)
{
	int			i;
	md_m_shared_t		*s;
	uint_t			lasterrcnt = 0;
	md_dev64_t		dev = 0;
	u_longlong_t		cnt;
	u_longlong_t		mincnt;
	mm_submirror_t		*sm;
	mm_submirror_ic_t	*smic;
	mdi_unit_t		*ui;

	mincnt = reqcount;
	for (i = 0; i < NMIRROR; i++) {
		if (!SUBMIRROR_IS_READABLE(un, i))
			continue;
		sm = &un->un_sm[i];
		smic = &un->un_smic[i];
		cnt = reqcount;

		/*
		 * If the current submirror is marked as inaccessible, do not
		 * try to access it.
		 */
		ui = MDI_UNIT(getminor(expldev(sm->sm_dev)));
		(void) md_unit_readerlock(ui);
		if (ui->ui_tstate & MD_INACCESSIBLE) {
			md_unit_readerexit(ui);
			continue;
		}
		md_unit_readerexit(ui);

		s = (md_m_shared_t *)(*(smic->sm_shared_by_blk))
		    (sm->sm_dev, sm, blkno, &cnt);

		if (must_be_opened && !(s->ms_flags & MDM_S_ISOPEN))
			continue;
		if (s->ms_state == CS_OKAY) {
			*cando = cnt;
			if (shared != NULL)
				*shared = s;

			if (un->un_sm[i].sm_flags & MD_SM_FAILFAST &&
			    cs != NULL) {
				cs->cs_buf.b_flags |= B_FAILFAST;
			}

			return (un->un_sm[i].sm_dev);
		}
		if (s->ms_state != CS_LAST_ERRED)
			continue;

		/* don't use B_FAILFAST since we're Last Erred */

		if (mincnt > cnt)
			mincnt = cnt;
		if (s->ms_lasterrcnt > lasterrcnt) {
			lasterrcnt = s->ms_lasterrcnt;
			if (shared != NULL)
				*shared = s;
			dev = un->un_sm[i].sm_dev;
		}
	}
	*cando = mincnt;
	return (dev);
}

/*
 * Given a 32-bit bitmap, this routine will return the bit number
 * of the nth bit set. The nth bit set is passed via the index integer.
 *
 * This routine is used to run through the writable submirror bitmap
 * and start all of the writes. The value returned is the index of the
 * appropriate submirror structure in the mirror's un_sm array.
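 *
 * For example, with mask 0x2d (binary 101101) and index 2, the third
 * set bit is bit 3, so 3 is returned.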
1600 */ 1601 static int 1602 md_find_nth_unit(uint_t mask, int index) 1603 { 1604 int bit, nfound; 1605 1606 for (bit = -1, nfound = -1; nfound != index; bit++) { 1607 ASSERT(mask != 0); 1608 nfound += (mask & 1); 1609 mask >>= 1; 1610 } 1611 return (bit); 1612 } 1613 1614 static int 1615 fast_select_read_unit(md_mps_t *ps, md_mcs_t *cs) 1616 { 1617 mm_unit_t *un; 1618 buf_t *bp; 1619 int i; 1620 unsigned nunits = 0; 1621 int iunit; 1622 uint_t running_bm = 0; 1623 uint_t sm_index; 1624 1625 bp = &cs->cs_buf; 1626 un = ps->ps_un; 1627 1628 for (i = 0; i < NMIRROR; i++) { 1629 if (!SMS_BY_INDEX_IS(un, i, SMS_RUNNING)) 1630 continue; 1631 running_bm |= SMI2BIT(i); 1632 nunits++; 1633 } 1634 if (nunits == 0) 1635 return (1); 1636 1637 /* 1638 * For directed mirror read (DMR) we only use the specified side and 1639 * do not compute the source of the read. 1640 */ 1641 if (ps->ps_flags & MD_MPS_DMR) { 1642 sm_index = un->un_dmr_last_read; 1643 } else { 1644 /* Normal (non-DMR) operation */ 1645 switch (un->un_read_option) { 1646 case RD_GEOMETRY: 1647 iunit = (int)(bp->b_lblkno / 1648 howmany(un->c.un_total_blocks, nunits)); 1649 sm_index = md_find_nth_unit(running_bm, iunit); 1650 break; 1651 case RD_FIRST: 1652 sm_index = md_find_nth_unit(running_bm, 0); 1653 break; 1654 case RD_LOAD_BAL: 1655 /* this is intentional to fall into the default */ 1656 default: 1657 un->un_last_read = (un->un_last_read + 1) % nunits; 1658 sm_index = md_find_nth_unit(running_bm, 1659 un->un_last_read); 1660 break; 1661 } 1662 } 1663 bp->b_edev = md_dev64_to_dev(un->un_sm[sm_index].sm_dev); 1664 ps->ps_allfrom_sm = SMI2BIT(sm_index); 1665 1666 if (un->un_sm[sm_index].sm_flags & MD_SM_FAILFAST) { 1667 bp->b_flags |= B_FAILFAST; 1668 } 1669 1670 return (0); 1671 } 1672 1673 static 1674 int 1675 mirror_are_submirrors_available(mm_unit_t *un) 1676 { 1677 int i; 1678 for (i = 0; i < NMIRROR; i++) { 1679 md_dev64_t tmpdev = un->un_sm[i].sm_dev; 1680 1681 if ((!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) || 1682 md_getmajor(tmpdev) != md_major) 1683 continue; 1684 1685 if ((MD_MIN2SET(md_getminor(tmpdev)) >= md_nsets) || 1686 (MD_MIN2UNIT(md_getminor(tmpdev)) >= md_nunits)) 1687 return (0); 1688 1689 if (MDI_UNIT(md_getminor(tmpdev)) == NULL) 1690 return (0); 1691 } 1692 return (1); 1693 } 1694 1695 void 1696 build_submirror(mm_unit_t *un, int i, int snarfing) 1697 { 1698 struct mm_submirror *sm; 1699 struct mm_submirror_ic *smic; 1700 md_unit_t *su; 1701 set_t setno; 1702 1703 sm = &un->un_sm[i]; 1704 smic = &un->un_smic[i]; 1705 1706 sm->sm_flags = 0; /* sometime we may need to do more here */ 1707 1708 setno = MD_UN2SET(un); 1709 1710 if (!SMS_IS(sm, SMS_INUSE)) 1711 return; 1712 if (snarfing) { 1713 sm->sm_dev = md_getdevnum(setno, mddb_getsidenum(setno), 1714 sm->sm_key, MD_NOTRUST_DEVT); 1715 } else { 1716 if (md_getmajor(sm->sm_dev) == md_major) { 1717 su = MD_UNIT(md_getminor(sm->sm_dev)); 1718 un->c.un_flag |= (su->c.un_flag & MD_LABELED); 1719 /* submirror can no longer be soft partitioned */ 1720 MD_CAPAB(su) &= (~MD_CAN_SP); 1721 } 1722 } 1723 smic->sm_shared_by_blk = md_get_named_service(sm->sm_dev, 1724 0, "shared by blk", 0); 1725 smic->sm_shared_by_indx = md_get_named_service(sm->sm_dev, 1726 0, "shared by indx", 0); 1727 smic->sm_get_component_count = (int (*)())md_get_named_service( 1728 sm->sm_dev, 0, "get component count", 0); 1729 smic->sm_get_bcss = (int (*)())md_get_named_service(sm->sm_dev, 0, 1730 "get block count skip size", 0); 1731 sm->sm_state &= ~SMS_IGNORE; 1732 if (SMS_IS(sm, SMS_OFFLINE)) 1733 
MD_STATUS(un) |= MD_UN_OFFLINE_SM; 1734 md_set_parent(sm->sm_dev, MD_SID(un)); 1735 } 1736 1737 static void 1738 mirror_cleanup(mm_unit_t *un) 1739 { 1740 mddb_recid_t recid; 1741 int smi; 1742 sv_dev_t sv[NMIRROR]; 1743 int nsv = 0; 1744 1745 /* 1746 * If a MN diskset and this node is not the master, do 1747 * not delete any records on snarf of the mirror records. 1748 */ 1749 if (MD_MNSET_SETNO(MD_UN2SET(un)) && 1750 md_set[MD_UN2SET(un)].s_am_i_master == 0) { 1751 return; 1752 } 1753 1754 for (smi = 0; smi < NMIRROR; smi++) { 1755 if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) 1756 continue; 1757 sv[nsv].setno = MD_UN2SET(un); 1758 sv[nsv++].key = un->un_sm[smi].sm_key; 1759 } 1760 1761 recid = un->un_rr_dirty_recid; 1762 mddb_deleterec_wrapper(un->c.un_record_id); 1763 if (recid > 0) 1764 mddb_deleterec_wrapper(recid); 1765 1766 md_rem_names(sv, nsv); 1767 } 1768 1769 /* 1770 * Comparison function for the avl tree which tracks 1771 * outstanding writes on submirrors. 1772 * 1773 * Returns: 1774 * -1: ps1 < ps2 1775 * 0: ps1 and ps2 overlap 1776 * 1: ps1 > ps2 1777 */ 1778 static int 1779 mirror_overlap_compare(const void *p1, const void *p2) 1780 { 1781 const md_mps_t *ps1 = (md_mps_t *)p1; 1782 const md_mps_t *ps2 = (md_mps_t *)p2; 1783 1784 if (ps1->ps_firstblk < ps2->ps_firstblk) { 1785 if (ps1->ps_lastblk >= ps2->ps_firstblk) 1786 return (0); 1787 return (-1); 1788 } 1789 1790 if (ps1->ps_firstblk > ps2->ps_firstblk) { 1791 if (ps1->ps_firstblk <= ps2->ps_lastblk) 1792 return (0); 1793 return (1); 1794 } 1795 1796 return (0); 1797 } 1798 1799 /* Return a -1 if optimized record unavailable and set should be released */ 1800 int 1801 mirror_build_incore(mm_unit_t *un, int snarfing) 1802 { 1803 int i; 1804 1805 if (MD_STATUS(un) & MD_UN_BEING_RESET) { 1806 mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCLEAN); 1807 return (1); 1808 } 1809 1810 if (mirror_are_submirrors_available(un) == 0) 1811 return (1); 1812 1813 if (MD_UNIT(MD_SID(un)) != NULL) 1814 return (0); 1815 1816 MD_STATUS(un) = 0; 1817 1818 /* pre-4.1 didn't define CAN_META_CHILD capability */ 1819 MD_CAPAB(un) = MD_CAN_META_CHILD | MD_CAN_PARENT | MD_CAN_SP; 1820 1821 un->un_overlap_tree_flag = 0; 1822 avl_create(&un->un_overlap_root, mirror_overlap_compare, 1823 sizeof (md_mps_t), offsetof(md_mps_t, ps_overlap_node)); 1824 1825 for (i = 0; i < NMIRROR; i++) 1826 build_submirror(un, i, snarfing); 1827 1828 if (unit_setup_resync(un, snarfing) != 0) { 1829 if (snarfing) { 1830 mddb_setrecprivate(un->c.un_record_id, MD_PRV_GOTIT); 1831 /* 1832 * If a MN set and set is not stale, then return -1 1833 * which will force the caller to unload the set. 1834 * The MN diskset nodes will return failure if 1835 * unit_setup_resync fails so that nodes won't 1836 * get out of sync. 1837 * 1838 * If set is STALE, the master node can't allocate 1839 * a resync record (if needed), but node needs to 1840 * join the set so that user can delete broken mddbs. 1841 * So, if set is STALE, just continue on. 1842 */ 1843 if (MD_MNSET_SETNO(MD_UN2SET(un)) && 1844 !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) { 1845 return (-1); 1846 } 1847 } else 1848 return (1); 1849 } 1850 1851 mutex_init(&un->un_overlap_tree_mx, NULL, MUTEX_DEFAULT, NULL); 1852 cv_init(&un->un_overlap_tree_cv, NULL, CV_DEFAULT, NULL); 1853 1854 un->un_suspend_wr_flag = 0; 1855 mutex_init(&un->un_suspend_wr_mx, NULL, MUTEX_DEFAULT, NULL); 1856 cv_init(&un->un_suspend_wr_cv, NULL, CV_DEFAULT, NULL); 1857 1858 /* 1859 * Allocate mutexes for mirror-owner and resync-owner changes. 
1860 * All references to the owner message state field must be guarded 1861 * by this mutex. 1862 */ 1863 mutex_init(&un->un_owner_mx, NULL, MUTEX_DEFAULT, NULL); 1864 1865 /* 1866 * Allocate mutex and condvar for resync thread manipulation. These 1867 * will be used by mirror_resync_unit/mirror_ioctl_resync 1868 */ 1869 mutex_init(&un->un_rs_thread_mx, NULL, MUTEX_DEFAULT, NULL); 1870 cv_init(&un->un_rs_thread_cv, NULL, CV_DEFAULT, NULL); 1871 1872 /* 1873 * Allocate mutex and condvar for resync progress thread manipulation. 1874 * This allows resyncs to be continued across an intervening reboot. 1875 */ 1876 mutex_init(&un->un_rs_progress_mx, NULL, MUTEX_DEFAULT, NULL); 1877 cv_init(&un->un_rs_progress_cv, NULL, CV_DEFAULT, NULL); 1878 1879 /* 1880 * Allocate mutex and condvar for Directed Mirror Reads (DMR). This 1881 * provides synchronization between a user-ioctl and the resulting 1882 * strategy() call that performs the read(). 1883 */ 1884 mutex_init(&un->un_dmr_mx, NULL, MUTEX_DEFAULT, NULL); 1885 cv_init(&un->un_dmr_cv, NULL, CV_DEFAULT, NULL); 1886 1887 MD_UNIT(MD_SID(un)) = un; 1888 return (0); 1889 } 1890 1891 1892 void 1893 reset_mirror(struct mm_unit *un, minor_t mnum, int removing) 1894 { 1895 mddb_recid_t recid, vtoc_id; 1896 size_t bitcnt; 1897 size_t shortcnt; 1898 int smi; 1899 sv_dev_t sv[NMIRROR]; 1900 int nsv = 0; 1901 uint_t bits = 0; 1902 minor_t selfid; 1903 md_unit_t *su; 1904 1905 md_destroy_unit_incore(mnum, &mirror_md_ops); 1906 1907 shortcnt = un->un_rrd_num * sizeof (short); 1908 bitcnt = howmany(un->un_rrd_num, NBBY); 1909 1910 if (un->un_outstanding_writes) 1911 kmem_free((caddr_t)un->un_outstanding_writes, shortcnt); 1912 if (un->un_goingclean_bm) 1913 kmem_free((caddr_t)un->un_goingclean_bm, bitcnt); 1914 if (un->un_goingdirty_bm) 1915 kmem_free((caddr_t)un->un_goingdirty_bm, bitcnt); 1916 if (un->un_resync_bm) 1917 kmem_free((caddr_t)un->un_resync_bm, bitcnt); 1918 1919 MD_UNIT(mnum) = NULL; 1920 1921 /* 1922 * Attempt release of its minor node 1923 */ 1924 md_remove_minor_node(mnum); 1925 1926 if (!removing) 1927 return; 1928 1929 for (smi = 0; smi < NMIRROR; smi++) { 1930 if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) 1931 continue; 1932 /* reallow soft partitioning of submirror and reset parent */ 1933 su = MD_UNIT(md_getminor(un->un_sm[smi].sm_dev)); 1934 MD_CAPAB(su) |= MD_CAN_SP; 1935 md_reset_parent(un->un_sm[smi].sm_dev); 1936 reset_comp_states(&un->un_sm[smi], &un->un_smic[smi]); 1937 1938 sv[nsv].setno = MD_MIN2SET(mnum); 1939 sv[nsv++].key = un->un_sm[smi].sm_key; 1940 bits |= SMI2BIT(smi); 1941 } 1942 1943 MD_STATUS(un) |= MD_UN_BEING_RESET; 1944 recid = un->un_rr_dirty_recid; 1945 vtoc_id = un->c.un_vtoc_id; 1946 selfid = MD_SID(un); 1947 1948 mirror_commit(un, bits, 0); 1949 1950 avl_destroy(&un->un_overlap_root); 1951 1952 /* Destroy all mutexes and condvars before returning. 
*/ 1953 mutex_destroy(&un->un_suspend_wr_mx); 1954 cv_destroy(&un->un_suspend_wr_cv); 1955 mutex_destroy(&un->un_overlap_tree_mx); 1956 cv_destroy(&un->un_overlap_tree_cv); 1957 mutex_destroy(&un->un_owner_mx); 1958 mutex_destroy(&un->un_rs_thread_mx); 1959 cv_destroy(&un->un_rs_thread_cv); 1960 mutex_destroy(&un->un_rs_progress_mx); 1961 cv_destroy(&un->un_rs_progress_cv); 1962 mutex_destroy(&un->un_dmr_mx); 1963 cv_destroy(&un->un_dmr_cv); 1964 1965 /* 1966 * Remove self from the namespace 1967 */ 1968 if (un->c.un_revision & MD_FN_META_DEV) { 1969 (void) md_rem_selfname(un->c.un_self_id); 1970 } 1971 1972 mddb_deleterec_wrapper(un->c.un_record_id); 1973 if (recid != 0) 1974 mddb_deleterec_wrapper(recid); 1975 1976 /* Remove the vtoc, if present */ 1977 if (vtoc_id) 1978 mddb_deleterec_wrapper(vtoc_id); 1979 1980 md_rem_names(sv, nsv); 1981 1982 SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE, 1983 MD_MIN2SET(selfid), selfid); 1984 } 1985 1986 int 1987 mirror_internal_open( 1988 minor_t mnum, 1989 int flag, 1990 int otyp, 1991 int md_oflags, 1992 IOLOCK *lockp /* can be NULL */ 1993 ) 1994 { 1995 mdi_unit_t *ui = MDI_UNIT(mnum); 1996 int err = 0; 1997 1998 tryagain: 1999 /* single thread */ 2000 if (lockp) { 2001 /* 2002 * If ioctl lock is held, use openclose_enter 2003 * routine that will set the ioctl flag when 2004 * grabbing the readerlock. 2005 */ 2006 (void) md_ioctl_openclose_enter(lockp, ui); 2007 } else { 2008 (void) md_unit_openclose_enter(ui); 2009 } 2010 2011 /* 2012 * The mirror_open_all_devs routine may end up sending a STATE_UPDATE 2013 * message in a MN diskset and this requires that the openclose 2014 * lock is dropped in order to send this message. So, another 2015 * flag (MD_UL_OPENINPROGRESS) is used to keep another thread from 2016 * attempting an open while this thread has an open in progress. 2017 * Call the *_lh version of the lock exit routines since the ui_mx 2018 * mutex must be held from checking for OPENINPROGRESS until 2019 * after the cv_wait call. 2020 */ 2021 mutex_enter(&ui->ui_mx); 2022 if (ui->ui_lock & MD_UL_OPENINPROGRESS) { 2023 if (lockp) { 2024 (void) md_ioctl_openclose_exit_lh(lockp); 2025 } else { 2026 md_unit_openclose_exit_lh(ui); 2027 } 2028 cv_wait(&ui->ui_cv, &ui->ui_mx); 2029 mutex_exit(&ui->ui_mx); 2030 goto tryagain; 2031 } 2032 2033 ui->ui_lock |= MD_UL_OPENINPROGRESS; 2034 mutex_exit(&ui->ui_mx); 2035 2036 /* open devices, if necessary */ 2037 if (! md_unit_isopen(ui) || (ui->ui_tstate & MD_INACCESSIBLE)) { 2038 if ((err = mirror_open_all_devs(mnum, md_oflags, lockp)) != 0) 2039 goto out; 2040 } 2041 2042 /* count open */ 2043 if ((err = md_unit_incopen(mnum, flag, otyp)) != 0) 2044 goto out; 2045 2046 /* unlock, return success */ 2047 out: 2048 mutex_enter(&ui->ui_mx); 2049 ui->ui_lock &= ~MD_UL_OPENINPROGRESS; 2050 mutex_exit(&ui->ui_mx); 2051 2052 if (lockp) { 2053 /* 2054 * If ioctl lock is held, use openclose_exit 2055 * routine that will clear the lockp reader flag. 2056 */ 2057 (void) md_ioctl_openclose_exit(lockp); 2058 } else { 2059 md_unit_openclose_exit(ui); 2060 } 2061 return (err); 2062 } 2063 2064 int 2065 mirror_internal_close( 2066 minor_t mnum, 2067 int otyp, 2068 int md_cflags, 2069 IOLOCK *lockp /* can be NULL */ 2070 ) 2071 { 2072 mdi_unit_t *ui = MDI_UNIT(mnum); 2073 mm_unit_t *un; 2074 int err = 0; 2075 2076 /* single thread */ 2077 if (lockp) { 2078 /* 2079 * If ioctl lock is held, use openclose_enter 2080 * routine that will set the ioctl flag when 2081 * grabbing the readerlock. 
2082 */ 2083 un = (mm_unit_t *)md_ioctl_openclose_enter(lockp, ui); 2084 } else { 2085 un = (mm_unit_t *)md_unit_openclose_enter(ui); 2086 } 2087 2088 /* count closed */ 2089 if ((err = md_unit_decopen(mnum, otyp)) != 0) 2090 goto out; 2091 2092 /* close devices, if necessary */ 2093 if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) { 2094 /* 2095 * Clean up dirty bitmap for this unit. Do this 2096 * before closing the underlying devices to avoid 2097 * race conditions with reset_mirror() as a 2098 * result of a 'metaset -r' command running in 2099 * parallel. This might cause deallocation of 2100 * dirty region bitmaps; with underlying metadevices 2101 * in place this can't happen. 2102 * Don't do this if a MN set and ABR not set 2103 */ 2104 if (new_resync && !(MD_STATUS(un) & MD_UN_KEEP_DIRTY)) { 2105 if (!MD_MNSET_SETNO(MD_UN2SET(un)) || 2106 !(ui->ui_tstate & MD_ABR_CAP)) 2107 mirror_process_unit_resync(un); 2108 } 2109 (void) mirror_close_all_devs(un, md_cflags); 2110 2111 /* 2112 * For a MN set with transient capabilities (eg ABR/DMR) set, 2113 * clear these capabilities on the last open in the cluster. 2114 * To do this we send a message to all nodes to see of the 2115 * device is open. 2116 */ 2117 if (MD_MNSET_SETNO(MD_UN2SET(un)) && 2118 (ui->ui_tstate & (MD_ABR_CAP|MD_DMR_CAP))) { 2119 if (lockp) { 2120 (void) md_ioctl_openclose_exit(lockp); 2121 } else { 2122 md_unit_openclose_exit(ui); 2123 } 2124 2125 /* 2126 * if we are in the context of an ioctl, drop the 2127 * ioctl lock. 2128 * Otherwise, no other locks should be held. 2129 */ 2130 if (lockp) { 2131 IOLOCK_RETURN_RELEASE(0, lockp); 2132 } 2133 2134 mdmn_clear_all_capabilities(mnum); 2135 2136 /* if dropped the lock previously, regain it */ 2137 if (lockp) { 2138 IOLOCK_RETURN_REACQUIRE(lockp); 2139 } 2140 return (0); 2141 } 2142 /* unlock and return success */ 2143 } 2144 out: 2145 /* Call whether lockp is NULL or not. */ 2146 if (lockp) { 2147 md_ioctl_openclose_exit(lockp); 2148 } else { 2149 md_unit_openclose_exit(ui); 2150 } 2151 return (err); 2152 } 2153 2154 /* 2155 * When a component has completed resyncing and is now ok, check if the 2156 * corresponding component in the other submirrors is in the Last Erred 2157 * state. If it is, we want to change that to the Erred state so we stop 2158 * using that component and start using this good component instead. 2159 * 2160 * This is called from set_sm_comp_state and recursively calls 2161 * set_sm_comp_state if it needs to change the Last Erred state. 
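 *
 * Illustrative sequence (a sketch of the intent, not an exact trace):
 * component ci of submirror 0 finishes a resync and goes to CS_OKAY; if
 * the same component index in submirror 1 is in CS_LAST_ERRED and another
 * readable source now exists, this routine drives it to CS_ERRED via
 * set_sm_comp_state(un, 1, ci, CS_ERRED, extras, flags, lockp), after
 * which a hotspare may be requested for the newly erred component.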
2162 */
2163 static void
2164 reset_lasterred(mm_unit_t *un, int smi, mddb_recid_t *extras, uint_t flags,
2165 IOLOCK *lockp)
2166 {
2167 mm_submirror_t *sm;
2168 mm_submirror_ic_t *smic;
2169 int ci;
2170 int i;
2171 int compcnt;
2172 int changed = 0;
2173
2174 for (i = 0; i < NMIRROR; i++) {
2175 sm = &un->un_sm[i];
2176 smic = &un->un_smic[i];
2177
2178 if (!SMS_IS(sm, SMS_INUSE))
2179 continue;
2180
2181 /* ignore the submirror that we just made ok */
2182 if (i == smi)
2183 continue;
2184
2185 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
2186 for (ci = 0; ci < compcnt; ci++) {
2187 md_m_shared_t *shared;
2188
2189 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
2190 (sm->sm_dev, sm, ci);
2191
2192 if ((shared->ms_state & CS_LAST_ERRED) &&
2193 !mirror_other_sources(un, i, ci, 1)) {
2194
2195 set_sm_comp_state(un, i, ci, CS_ERRED, extras,
2196 flags, lockp);
2197 changed = 1;
2198 }
2199 }
2200 }
2201
2202 /* maybe there is a hotspare for this newly erred component */
2203 if (changed) {
2204 set_t setno;
2205
2206 setno = MD_UN2SET(un);
2207 if (MD_MNSET_SETNO(setno)) {
2208 send_poke_hotspares(setno);
2209 } else {
2210 (void) poke_hotspares();
2211 }
2212 }
2213 }
2214
2215 /*
2216 * set_sm_comp_state
2217 *
2218 * Set the state of a submirror component to the specified new state.
2219 * If the mirror is in a multi-node set, send messages to all nodes to
2220 * block all writes to the mirror and then update the state and release the
2221 * writes. These messages are only sent if MD_STATE_XMIT is set in flags.
2222 * MD_STATE_XMIT will be unset in 2 cases:
2223 * 1. When the state is changed to CS_RESYNC as this state change
2224 * will already have been updated on each node by the processing of the
2225 * distributed metasync command, hence no need to xmit.
2226 * 2. When the state is changed to CS_OKAY after a resync has completed. Again
2227 * the resync completion will already have been processed on each node by
2228 * the processing of the MD_MN_MSG_RESYNC_PHASE_DONE message for a component
2229 * resync, hence no need to xmit.
2230 *
2231 * If we are called from the update of a watermark
2232 * (MD_STATE_WMUPDATE will then be set in ps->flags), this is due to
2233 * a metainit or similar. In this case the message that we send to propagate
2234 * the state change must not be a class1 message as that would deadlock with
2235 * the metainit command that is still being processed.
2236 * We achieve this by creating a class2 message, MD_MN_MSG_STATE_UPDATE2,
2237 * instead. This also makes the submessage generator create a class2
2238 * submessage rather than a class1 (which would also block).
2239 *
2240 * On entry, unit_writerlock is held.
2241 * If MD_STATE_OCHELD is set in flags, then the unit_openclose lock is
2242 * also held.
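 *
 * A minimal caller sketch under these rules (illustrative only; real call
 * sites pick their own newstate, flags and IOLOCK):
 *
 *	un = (mm_unit_t *)md_unit_writerlock(ui);
 *	set_sm_comp_state(un, smi, ci, CS_ERRED, 0, MD_STATE_XMIT,
 *	    (IOLOCK *)NULL);
 *	md_unit_writerexit(ui);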
2243 */ 2244 void 2245 set_sm_comp_state( 2246 mm_unit_t *un, 2247 int smi, 2248 int ci, 2249 int newstate, 2250 mddb_recid_t *extras, 2251 uint_t flags, 2252 IOLOCK *lockp 2253 ) 2254 { 2255 mm_submirror_t *sm; 2256 mm_submirror_ic_t *smic; 2257 md_m_shared_t *shared; 2258 int origstate; 2259 void (*get_dev)(); 2260 ms_cd_info_t cd; 2261 char devname[MD_MAX_CTDLEN]; 2262 int err; 2263 set_t setno = MD_UN2SET(un); 2264 md_mn_msg_stch_t stchmsg; 2265 mdi_unit_t *ui = MDI_UNIT(MD_SID(un)); 2266 md_mn_kresult_t *kresult; 2267 int rval; 2268 uint_t msgflags; 2269 md_mn_msgtype_t msgtype; 2270 int save_lock = 0; 2271 mdi_unit_t *ui_sm; 2272 2273 sm = &un->un_sm[smi]; 2274 smic = &un->un_smic[smi]; 2275 2276 /* If we have a real error status then turn off MD_INACCESSIBLE. */ 2277 ui_sm = MDI_UNIT(getminor(md_dev64_to_dev(sm->sm_dev))); 2278 if (newstate & (CS_ERRED | CS_RESYNC | CS_LAST_ERRED) && 2279 ui_sm->ui_tstate & MD_INACCESSIBLE) { 2280 ui_sm->ui_tstate &= ~MD_INACCESSIBLE; 2281 } 2282 2283 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) 2284 (sm->sm_dev, sm, ci); 2285 origstate = shared->ms_state; 2286 2287 /* 2288 * If the new state is an error and the old one wasn't, generate 2289 * a console message. We do this before we send the state to other 2290 * nodes in a MN set because the state change may change the component 2291 * name if a hotspare is allocated. 2292 */ 2293 if ((! (origstate & (CS_ERRED|CS_LAST_ERRED))) && 2294 (newstate & (CS_ERRED|CS_LAST_ERRED))) { 2295 2296 get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0, 2297 "get device", 0); 2298 (void) (*get_dev)(sm->sm_dev, sm, ci, &cd); 2299 2300 err = md_getdevname(setno, mddb_getsidenum(setno), 0, 2301 cd.cd_dev, devname, sizeof (devname)); 2302 2303 if (err == ENOENT) { 2304 (void) md_devname(setno, cd.cd_dev, devname, 2305 sizeof (devname)); 2306 } 2307 2308 cmn_err(CE_WARN, "md: %s: %s needs maintenance", 2309 md_shortname(md_getminor(sm->sm_dev)), devname); 2310 2311 if (newstate & CS_LAST_ERRED) { 2312 cmn_err(CE_WARN, "md: %s: %s last erred", 2313 md_shortname(md_getminor(sm->sm_dev)), 2314 devname); 2315 2316 } else if (shared->ms_flags & MDM_S_ISOPEN) { 2317 /* 2318 * Close the broken device and clear the open flag on 2319 * it. Closing the device means the RCM framework will 2320 * be able to unconfigure the device if required. 2321 * 2322 * We have to check that the device is open, otherwise 2323 * the first open on it has resulted in the error that 2324 * is being processed and the actual cd.cd_dev will be 2325 * NODEV64. 2326 * 2327 * If this is a multi-node mirror, then the multinode 2328 * state checks following this code will cause the 2329 * slave nodes to close the mirror in the function 2330 * mirror_set_state(). 2331 */ 2332 md_layered_close(cd.cd_dev, MD_OFLG_NULL); 2333 shared->ms_flags &= ~MDM_S_ISOPEN; 2334 } 2335 2336 } else if ((origstate & CS_LAST_ERRED) && (newstate & CS_ERRED) && 2337 (shared->ms_flags & MDM_S_ISOPEN)) { 2338 /* 2339 * Similar to logic above except no log messages since we 2340 * are just transitioning from Last Erred to Erred. 
2341 */ 2342 get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0, 2343 "get device", 0); 2344 (void) (*get_dev)(sm->sm_dev, sm, ci, &cd); 2345 2346 md_layered_close(cd.cd_dev, MD_OFLG_NULL); 2347 shared->ms_flags &= ~MDM_S_ISOPEN; 2348 } 2349 2350 if ((MD_MNSET_SETNO(setno)) && (origstate != newstate) && 2351 (flags & MD_STATE_XMIT) && !(ui->ui_tstate & MD_ERR_PENDING)) { 2352 /* 2353 * For a multi-node mirror, send the state change to the 2354 * master, which broadcasts to all nodes, including this 2355 * one. Once the message is received, the state is set 2356 * in-core and the master commits the change to disk. 2357 * There is a case, comp_replace, where this function 2358 * can be called from within an ioctl and therefore in this 2359 * case, as the ioctl will already be called on each node, 2360 * there is no need to xmit the state change to the master for 2361 * distribution to the other nodes. MD_STATE_XMIT flag is used 2362 * to indicate whether a xmit is required. The mirror's 2363 * transient state is set to MD_ERR_PENDING to avoid sending 2364 * multiple messages. 2365 */ 2366 if (newstate & (CS_ERRED|CS_LAST_ERRED)) 2367 ui->ui_tstate |= MD_ERR_PENDING; 2368 2369 /* 2370 * Send a state update message to all nodes. This message 2371 * will generate 2 submessages, the first one to suspend 2372 * all writes to the mirror and the second to update the 2373 * state and resume writes. 2374 */ 2375 stchmsg.msg_stch_mnum = un->c.un_self_id; 2376 stchmsg.msg_stch_sm = smi; 2377 stchmsg.msg_stch_comp = ci; 2378 stchmsg.msg_stch_new_state = newstate; 2379 stchmsg.msg_stch_hs_id = shared->ms_hs_id; 2380 #ifdef DEBUG 2381 if (mirror_debug_flag) 2382 printf("send set state, %x, %x, %x, %x, %x\n", 2383 stchmsg.msg_stch_mnum, stchmsg.msg_stch_sm, 2384 stchmsg.msg_stch_comp, stchmsg.msg_stch_new_state, 2385 stchmsg.msg_stch_hs_id); 2386 #endif 2387 if (flags & MD_STATE_WMUPDATE) { 2388 msgtype = MD_MN_MSG_STATE_UPDATE2; 2389 /* 2390 * When coming from an update of watermarks, there 2391 * must already be a message logged that triggered 2392 * this action. So, no need to log this message, too. 2393 */ 2394 msgflags = MD_MSGF_NO_LOG; 2395 } else { 2396 msgtype = MD_MN_MSG_STATE_UPDATE; 2397 msgflags = MD_MSGF_DEFAULT_FLAGS; 2398 } 2399 2400 /* 2401 * If we are in the context of an ioctl, drop the ioctl lock. 2402 * lockp holds the list of locks held. 2403 * 2404 * Otherwise, increment the appropriate reacquire counters. 2405 * If openclose lock is *held, then must reacquire reader 2406 * lock before releasing the openclose lock. 2407 * Do not drop the ARRAY_WRITER lock as we may not be able 2408 * to reacquire it. 
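 *
 * In outline (an illustrative summary of the code that follows): with
 * MD_STATE_OCHELD set, swap the writerlock for the readerlock and drop
 * the openclose lock before sending the message; afterwards re-enter
 * openclose (which grabs the readerlock), drop the readerlock and retake
 * the writerlock.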
2409 */ 2410 if (lockp) { 2411 if (lockp->l_flags & MD_ARRAY_WRITER) { 2412 save_lock = MD_ARRAY_WRITER; 2413 lockp->l_flags &= ~MD_ARRAY_WRITER; 2414 } else if (lockp->l_flags & MD_ARRAY_READER) { 2415 save_lock = MD_ARRAY_READER; 2416 lockp->l_flags &= ~MD_ARRAY_READER; 2417 } 2418 IOLOCK_RETURN_RELEASE(0, lockp); 2419 } else { 2420 if (flags & MD_STATE_OCHELD) { 2421 md_unit_writerexit(ui); 2422 (void) md_unit_readerlock(ui); 2423 md_unit_openclose_exit(ui); 2424 } else { 2425 md_unit_writerexit(ui); 2426 } 2427 } 2428 2429 kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); 2430 rval = mdmn_ksend_message(setno, msgtype, msgflags, 2431 (char *)&stchmsg, sizeof (stchmsg), kresult); 2432 2433 if (!MDMN_KSEND_MSG_OK(rval, kresult)) { 2434 mdmn_ksend_show_error(rval, kresult, "STATE UPDATE"); 2435 cmn_err(CE_PANIC, 2436 "ksend_message failure: STATE_UPDATE"); 2437 } 2438 kmem_free(kresult, sizeof (md_mn_kresult_t)); 2439 2440 /* if dropped the lock previously, regain it */ 2441 if (lockp) { 2442 IOLOCK_RETURN_REACQUIRE(lockp); 2443 lockp->l_flags |= save_lock; 2444 } else { 2445 /* 2446 * Reacquire dropped locks and update acquirecnts 2447 * appropriately. 2448 */ 2449 if (flags & MD_STATE_OCHELD) { 2450 /* 2451 * openclose also grabs readerlock. 2452 */ 2453 (void) md_unit_openclose_enter(ui); 2454 md_unit_readerexit(ui); 2455 (void) md_unit_writerlock(ui); 2456 } else { 2457 (void) md_unit_writerlock(ui); 2458 } 2459 } 2460 2461 ui->ui_tstate &= ~MD_ERR_PENDING; 2462 } else { 2463 shared->ms_state = newstate; 2464 uniqtime32(&shared->ms_timestamp); 2465 2466 if (newstate == CS_ERRED) 2467 shared->ms_flags |= MDM_S_NOWRITE; 2468 else 2469 shared->ms_flags &= ~MDM_S_NOWRITE; 2470 2471 shared->ms_flags &= ~MDM_S_IOERR; 2472 un->un_changecnt++; 2473 shared->ms_lasterrcnt = un->un_changecnt; 2474 2475 mirror_set_sm_state(sm, smic, SMS_RUNNING, 0); 2476 mirror_commit(un, SMI2BIT(smi), extras); 2477 } 2478 2479 if ((origstate & CS_RESYNC) && (newstate & CS_OKAY)) { 2480 /* 2481 * Resetting the Last Erred state will recursively call back 2482 * into this function (set_sm_comp_state) to update the state. 2483 */ 2484 reset_lasterred(un, smi, extras, flags, lockp); 2485 } 2486 } 2487 2488 static int 2489 find_another_logical( 2490 mm_unit_t *un, 2491 mm_submirror_t *esm, 2492 diskaddr_t blk, 2493 u_longlong_t cnt, 2494 int must_be_open, 2495 int state, 2496 int err_cnt) 2497 { 2498 u_longlong_t cando; 2499 md_dev64_t dev; 2500 md_m_shared_t *s; 2501 2502 esm->sm_state |= SMS_IGNORE; 2503 while (cnt != 0) { 2504 u_longlong_t mcnt; 2505 2506 mcnt = MIN(cnt, lbtodb(1024 * 1024 * 1024)); /* 1 Gig Blks */ 2507 2508 dev = select_read_unit(un, blk, mcnt, &cando, 2509 must_be_open, &s, NULL); 2510 if (dev == (md_dev64_t)0) 2511 break; 2512 2513 if ((state == CS_LAST_ERRED) && 2514 (s->ms_state == CS_LAST_ERRED) && 2515 (err_cnt > s->ms_lasterrcnt)) 2516 break; 2517 2518 cnt -= cando; 2519 blk += cando; 2520 } 2521 esm->sm_state &= ~SMS_IGNORE; 2522 return (cnt != 0); 2523 } 2524 2525 int 2526 mirror_other_sources(mm_unit_t *un, int smi, int ci, int must_be_open) 2527 { 2528 mm_submirror_t *sm; 2529 mm_submirror_ic_t *smic; 2530 size_t count; 2531 diskaddr_t block; 2532 u_longlong_t skip; 2533 u_longlong_t size; 2534 md_dev64_t dev; 2535 int cnt; 2536 md_m_shared_t *s; 2537 int not_found; 2538 2539 sm = &un->un_sm[smi]; 2540 smic = &un->un_smic[smi]; 2541 dev = sm->sm_dev; 2542 2543 /* 2544 * Make sure every component of the submirror 2545 * has other sources. 
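 *
 * For example (illustrative): a caller passing a negative ci is asking
 * whether every component of submirror smi has another readable source;
 * the loop below just re-invokes mirror_other_sources() for each
 * component index in turn.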
2546 */ 2547 if (ci < 0) { 2548 /* Find the highest lasterrcnt */ 2549 cnt = (*(smic->sm_get_component_count))(dev, sm); 2550 for (ci = 0; ci < cnt; ci++) { 2551 not_found = mirror_other_sources(un, smi, ci, 2552 must_be_open); 2553 if (not_found) 2554 return (1); 2555 } 2556 return (0); 2557 } 2558 2559 /* 2560 * Make sure this component has other sources 2561 */ 2562 (void) (*(smic->sm_get_bcss)) 2563 (dev, sm, ci, &block, &count, &skip, &size); 2564 2565 if (count == 0) 2566 return (1); 2567 2568 s = (md_m_shared_t *)(*(smic->sm_shared_by_indx))(dev, sm, ci); 2569 2570 while (count--) { 2571 if (block >= un->c.un_total_blocks) 2572 return (0); 2573 2574 if ((block + size) > un->c.un_total_blocks) 2575 size = un->c.un_total_blocks - block; 2576 2577 not_found = find_another_logical(un, sm, block, size, 2578 must_be_open, s->ms_state, s->ms_lasterrcnt); 2579 if (not_found) 2580 return (1); 2581 2582 block += size + skip; 2583 } 2584 return (0); 2585 } 2586 2587 static void 2588 finish_error(md_mps_t *ps) 2589 { 2590 struct buf *pb; 2591 mm_unit_t *un; 2592 mdi_unit_t *ui; 2593 uint_t new_str_flags; 2594 2595 pb = ps->ps_bp; 2596 un = ps->ps_un; 2597 ui = ps->ps_ui; 2598 2599 /* 2600 * Must flag any error to the resync originator if we're performing 2601 * a Write-after-Read. This corresponds to an i/o error on a resync 2602 * target device and in this case we ought to abort the resync as there 2603 * is nothing that can be done to recover from this without operator 2604 * intervention. If we don't set the B_ERROR flag we will continue 2605 * reading from the mirror but won't write to the target (as it will 2606 * have been placed into an errored state). 2607 * To handle the case of multiple components within a submirror we only 2608 * set the B_ERROR bit if explicitly requested to via MD_MPS_FLAG_ERROR. 2609 * The originator of the resync read will cause this bit to be set if 2610 * the underlying component count is one for a submirror resync. All 2611 * other resync types will have the flag set as there is no underlying 2612 * resync which can be performed on a contained metadevice for these 2613 * resync types (optimized or component). 2614 */ 2615 2616 if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ) { 2617 if (ps->ps_flags & MD_MPS_FLAG_ERROR) 2618 pb->b_flags |= B_ERROR; 2619 md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 2620 MPS_FREE(mirror_parent_cache, ps); 2621 md_unit_readerexit(ui); 2622 md_biodone(pb); 2623 return; 2624 } 2625 /* 2626 * Set the MD_IO_COUNTED flag as we are retrying the same I/O 2627 * operation therefore this I/O request has already been counted, 2628 * the I/O count variable will be decremented by mirror_done()'s 2629 * call to md_biodone(). 2630 */ 2631 if (ps->ps_changecnt != un->un_changecnt) { 2632 new_str_flags = MD_STR_NOTTOP | MD_IO_COUNTED; 2633 if (ps->ps_flags & MD_MPS_WOW) 2634 new_str_flags |= MD_STR_WOW; 2635 if (ps->ps_flags & MD_MPS_MAPPED) 2636 new_str_flags |= MD_STR_MAPPED; 2637 /* 2638 * If this I/O request was a read that was part of a resync, 2639 * set MD_STR_WAR for the retried read to ensure that the 2640 * resync write (i.e. 
write-after-read) will be performed 2641 */ 2642 if (ps->ps_flags & MD_MPS_RESYNC_READ) 2643 new_str_flags |= MD_STR_WAR; 2644 md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 2645 MPS_FREE(mirror_parent_cache, ps); 2646 md_unit_readerexit(ui); 2647 (void) md_mirror_strategy(pb, new_str_flags, NULL); 2648 return; 2649 } 2650 2651 pb->b_flags |= B_ERROR; 2652 md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 2653 MPS_FREE(mirror_parent_cache, ps); 2654 md_unit_readerexit(ui); 2655 md_biodone(pb); 2656 } 2657 2658 static void 2659 error_update_unit(md_mps_t *ps) 2660 { 2661 mm_unit_t *un; 2662 mdi_unit_t *ui; 2663 int smi; /* sub mirror index */ 2664 int ci; /* errored component */ 2665 set_t setno; 2666 uint_t flags; /* for set_sm_comp_state() */ 2667 uint_t hspflags; /* for check_comp_4_hotspares() */ 2668 2669 ui = ps->ps_ui; 2670 un = (mm_unit_t *)md_unit_writerlock(ui); 2671 setno = MD_UN2SET(un); 2672 2673 /* All of these updates have to propagated in case of MN set */ 2674 flags = MD_STATE_XMIT; 2675 hspflags = MD_HOTSPARE_XMIT; 2676 2677 /* special treatment if we are called during updating watermarks */ 2678 if (ps->ps_flags & MD_MPS_WMUPDATE) { 2679 flags |= MD_STATE_WMUPDATE; 2680 hspflags |= MD_HOTSPARE_WMUPDATE; 2681 } 2682 smi = 0; 2683 ci = 0; 2684 while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) { 2685 if (mirror_other_sources(un, smi, ci, 0) == 1) { 2686 2687 /* Never called from ioctl context, so (IOLOCK *)NULL */ 2688 set_sm_comp_state(un, smi, ci, CS_LAST_ERRED, 0, flags, 2689 (IOLOCK *)NULL); 2690 /* 2691 * For a MN set, the NOTIFY is done when the state 2692 * change is processed on each node 2693 */ 2694 if (!MD_MNSET_SETNO(MD_UN2SET(un))) { 2695 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED, 2696 SVM_TAG_METADEVICE, setno, MD_SID(un)); 2697 } 2698 continue; 2699 } 2700 /* Never called from ioctl context, so (IOLOCK *)NULL */ 2701 set_sm_comp_state(un, smi, ci, CS_ERRED, 0, flags, 2702 (IOLOCK *)NULL); 2703 /* 2704 * For a MN set, the NOTIFY is done when the state 2705 * change is processed on each node 2706 */ 2707 if (!MD_MNSET_SETNO(MD_UN2SET(un))) { 2708 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, 2709 SVM_TAG_METADEVICE, setno, MD_SID(un)); 2710 } 2711 smi = 0; 2712 ci = 0; 2713 } 2714 2715 md_unit_writerexit(ui); 2716 if (MD_MNSET_SETNO(setno)) { 2717 send_poke_hotspares(setno); 2718 } else { 2719 (void) poke_hotspares(); 2720 } 2721 (void) md_unit_readerlock(ui); 2722 2723 finish_error(ps); 2724 } 2725 2726 /* 2727 * When we have a B_FAILFAST IO error on a Last Erred component we need to 2728 * retry the IO without B_FAILFAST set so that we try to ensure that the 2729 * component "sees" each IO. 
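 *
 * Sketch of the flow (illustrative): mirror_done() spots a B_ERROR,
 * B_FAILFAST child buffer whose submirror is Last Erred and queues this
 * routine through daemon_request(); here B_FAILFAST is stripped, the error
 * is cleared and the buffer is reissued with md_call_strategy().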
2730 */ 2731 static void 2732 last_err_retry(md_mcs_t *cs) 2733 { 2734 struct buf *cb; 2735 md_mps_t *ps; 2736 uint_t flags; 2737 2738 cb = &cs->cs_buf; 2739 cb->b_flags &= ~B_FAILFAST; 2740 2741 /* if we're panicing just let this I/O error out */ 2742 if (panicstr) { 2743 (void) mirror_done(cb); 2744 return; 2745 } 2746 2747 /* reissue the I/O */ 2748 2749 ps = cs->cs_ps; 2750 2751 bioerror(cb, 0); 2752 2753 mutex_enter(&ps->ps_mx); 2754 2755 flags = MD_STR_NOTTOP; 2756 if (ps->ps_flags & MD_MPS_MAPPED) 2757 flags |= MD_STR_MAPPED; 2758 if (ps->ps_flags & MD_MPS_NOBLOCK) 2759 flags |= MD_NOBLOCK; 2760 2761 mutex_exit(&ps->ps_mx); 2762 2763 clear_retry_error(cb); 2764 2765 cmn_err(CE_NOTE, "!md: %s: Last Erred, retry I/O without B_FAILFAST", 2766 md_shortname(getminor(cb->b_edev))); 2767 2768 md_call_strategy(cb, flags, NULL); 2769 } 2770 2771 static void 2772 mirror_error(md_mps_t *ps) 2773 { 2774 int smi; /* sub mirror index */ 2775 int ci; /* errored component */ 2776 2777 if (panicstr) { 2778 finish_error(ps); 2779 return; 2780 } 2781 2782 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 2783 mirror_overlap_tree_remove(ps); 2784 2785 smi = 0; 2786 ci = 0; 2787 if (mirror_geterror(ps->ps_un, &smi, &ci, 0, 0) != 0) { 2788 md_unit_readerexit(ps->ps_ui); 2789 daemon_request(&md_mstr_daemon, error_update_unit, 2790 (daemon_queue_t *)ps, REQ_OLD); 2791 return; 2792 } 2793 2794 finish_error(ps); 2795 } 2796 2797 static int 2798 copy_write_done(struct buf *cb) 2799 { 2800 md_mps_t *ps; 2801 buf_t *pb; 2802 char *wowbuf; 2803 wowhdr_t *wowhdr; 2804 ssize_t wow_resid; 2805 2806 /* get wowbuf ans save structure */ 2807 wowbuf = cb->b_un.b_addr; 2808 wowhdr = WOWBUF_HDR(wowbuf); 2809 ps = wowhdr->wow_ps; 2810 pb = ps->ps_bp; 2811 2812 /* Save error information, then free cb */ 2813 if (cb->b_flags & B_ERROR) 2814 pb->b_flags |= B_ERROR; 2815 2816 if (cb->b_flags & B_REMAPPED) 2817 bp_mapout(cb); 2818 2819 freerbuf(cb); 2820 2821 /* update residual and continue if needed */ 2822 if ((pb->b_flags & B_ERROR) == 0) { 2823 wow_resid = pb->b_bcount - wowhdr->wow_offset; 2824 pb->b_resid = wow_resid; 2825 if (wow_resid > 0) { 2826 daemon_request(&md_mstr_daemon, copy_write_cont, 2827 (daemon_queue_t *)wowhdr, REQ_OLD); 2828 return (1); 2829 } 2830 } 2831 2832 /* Write is complete, release resources. 
*/ 2833 kmem_cache_free(mirror_wowblk_cache, wowhdr); 2834 ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP)); 2835 md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 2836 MPS_FREE(mirror_parent_cache, ps); 2837 md_biodone(pb); 2838 return (0); 2839 } 2840 2841 static void 2842 copy_write_cont(wowhdr_t *wowhdr) 2843 { 2844 buf_t *pb; 2845 buf_t *cb; 2846 char *wowbuf; 2847 int wow_offset; 2848 size_t wow_resid; 2849 diskaddr_t wow_blkno; 2850 2851 wowbuf = WOWHDR_BUF(wowhdr); 2852 pb = wowhdr->wow_ps->ps_bp; 2853 2854 /* get data on current location */ 2855 wow_offset = wowhdr->wow_offset; 2856 wow_resid = pb->b_bcount - wow_offset; 2857 wow_blkno = pb->b_lblkno + lbtodb(wow_offset); 2858 2859 /* setup child buffer */ 2860 cb = getrbuf(KM_SLEEP); 2861 cb->b_flags = B_WRITE; 2862 cb->b_edev = pb->b_edev; 2863 cb->b_un.b_addr = wowbuf; /* change to point at WOWBUF */ 2864 cb->b_bufsize = md_wowbuf_size; /* change to wowbuf_size */ 2865 cb->b_iodone = copy_write_done; 2866 cb->b_bcount = MIN(md_wowbuf_size, wow_resid); 2867 cb->b_lblkno = wow_blkno; 2868 2869 /* move offset to next section */ 2870 wowhdr->wow_offset += cb->b_bcount; 2871 2872 /* copy and setup write for current section */ 2873 bcopy(&pb->b_un.b_addr[wow_offset], wowbuf, cb->b_bcount); 2874 2875 /* do it */ 2876 /* 2877 * Do not set the MD_IO_COUNTED flag as this is a new I/O request 2878 * that handles the WOW condition. The resultant increment on the 2879 * I/O count variable is cleared by copy_write_done()'s call to 2880 * md_biodone(). 2881 */ 2882 (void) md_mirror_strategy(cb, MD_STR_NOTTOP | MD_STR_WOW 2883 | MD_STR_MAPPED, NULL); 2884 } 2885 2886 static void 2887 md_mirror_copy_write(md_mps_t *ps) 2888 { 2889 wowhdr_t *wowhdr; 2890 2891 wowhdr = kmem_cache_alloc(mirror_wowblk_cache, MD_ALLOCFLAGS); 2892 mirror_wowblk_init(wowhdr); 2893 wowhdr->wow_ps = ps; 2894 wowhdr->wow_offset = 0; 2895 copy_write_cont(wowhdr); 2896 } 2897 2898 static void 2899 handle_wow(md_mps_t *ps) 2900 { 2901 buf_t *pb; 2902 2903 pb = ps->ps_bp; 2904 2905 bp_mapin(pb); 2906 2907 md_mirror_wow_cnt++; 2908 if (!(pb->b_flags & B_PHYS) && (md_mirror_wow_flg & WOW_LOGIT)) { 2909 cmn_err(CE_NOTE, 2910 "md: %s, blk %lld, cnt %ld: Write on write %d occurred", 2911 md_shortname(getminor(pb->b_edev)), 2912 (longlong_t)pb->b_lblkno, pb->b_bcount, md_mirror_wow_cnt); 2913 } 2914 2915 /* 2916 * Set the MD_IO_COUNTED flag as we are retrying the same I/O 2917 * operation therefore this I/O request has already been counted, 2918 * the I/O count variable will be decremented by mirror_done()'s 2919 * call to md_biodone(). 2920 */ 2921 if (md_mirror_wow_flg & WOW_NOCOPY) 2922 (void) md_mirror_strategy(pb, MD_STR_NOTTOP | MD_STR_WOW | 2923 MD_STR_MAPPED | MD_IO_COUNTED, ps); 2924 else 2925 md_mirror_copy_write(ps); 2926 } 2927 2928 /* 2929 * Return true if the specified submirror is either in the Last Erred 2930 * state or is transitioning into the Last Erred state. 
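 *
 * For example (illustrative): a component still in CS_OKAY or CS_RESYNC
 * but flagged MDM_S_IOERR, with no other readable source available, counts
 * as "transitioning" and makes this routine return B_TRUE.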
2931 */ 2932 static bool_t 2933 submirror_is_lasterred(mm_unit_t *un, int smi) 2934 { 2935 mm_submirror_t *sm; 2936 mm_submirror_ic_t *smic; 2937 md_m_shared_t *shared; 2938 int ci; 2939 int compcnt; 2940 2941 sm = &un->un_sm[smi]; 2942 smic = &un->un_smic[smi]; 2943 2944 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un); 2945 for (ci = 0; ci < compcnt; ci++) { 2946 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) 2947 (sm->sm_dev, sm, ci); 2948 2949 if (shared->ms_state == CS_LAST_ERRED) 2950 return (B_TRUE); 2951 2952 /* 2953 * It is not currently Last Erred, check if entering Last Erred. 2954 */ 2955 if ((shared->ms_flags & MDM_S_IOERR) && 2956 ((shared->ms_state == CS_OKAY) || 2957 (shared->ms_state == CS_RESYNC))) { 2958 if (mirror_other_sources(un, smi, ci, 0) == 1) 2959 return (B_TRUE); 2960 } 2961 } 2962 2963 return (B_FALSE); 2964 } 2965 2966 2967 static int 2968 mirror_done(struct buf *cb) 2969 { 2970 md_mps_t *ps; 2971 md_mcs_t *cs; 2972 2973 /*LINTED*/ 2974 cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off); 2975 ps = cs->cs_ps; 2976 2977 mutex_enter(&ps->ps_mx); 2978 2979 /* check if we need to retry an errored failfast I/O */ 2980 if (cb->b_flags & B_ERROR) { 2981 struct buf *pb = ps->ps_bp; 2982 2983 if (cb->b_flags & B_FAILFAST) { 2984 int i; 2985 mm_unit_t *un = ps->ps_un; 2986 2987 for (i = 0; i < NMIRROR; i++) { 2988 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) 2989 continue; 2990 2991 if (cb->b_edev == 2992 md_dev64_to_dev(un->un_sm[i].sm_dev)) { 2993 2994 /* 2995 * This is the submirror that had the 2996 * error. Check if it is Last Erred. 2997 */ 2998 if (submirror_is_lasterred(un, i)) { 2999 daemon_queue_t *dqp; 3000 3001 mutex_exit(&ps->ps_mx); 3002 dqp = (daemon_queue_t *)cs; 3003 dqp->dq_prev = NULL; 3004 dqp->dq_next = NULL; 3005 daemon_request(&md_done_daemon, 3006 last_err_retry, dqp, 3007 REQ_OLD); 3008 return (1); 3009 } 3010 break; 3011 } 3012 } 3013 } 3014 3015 /* continue to process the buf without doing a retry */ 3016 ps->ps_flags |= MD_MPS_ERROR; 3017 pb->b_error = cb->b_error; 3018 } 3019 3020 return (mirror_done_common(cb)); 3021 } 3022 3023 /* 3024 * Split from the original mirror_done function so we can handle bufs after a 3025 * retry. 3026 * ps->ps_mx is already held in the caller of this function and the cb error 3027 * has already been checked and handled in the caller. 3028 */ 3029 static int 3030 mirror_done_common(struct buf *cb) 3031 { 3032 struct buf *pb; 3033 mm_unit_t *un; 3034 mdi_unit_t *ui; 3035 md_mps_t *ps; 3036 md_mcs_t *cs; 3037 size_t end_rr, start_rr, current_rr; 3038 3039 /*LINTED*/ 3040 cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off); 3041 ps = cs->cs_ps; 3042 pb = ps->ps_bp; 3043 3044 if (cb->b_flags & B_REMAPPED) 3045 bp_mapout(cb); 3046 3047 ps->ps_frags--; 3048 if (ps->ps_frags != 0) { 3049 mutex_exit(&ps->ps_mx); 3050 kmem_cache_free(mirror_child_cache, cs); 3051 return (1); 3052 } 3053 un = ps->ps_un; 3054 ui = ps->ps_ui; 3055 3056 /* 3057 * Do not update outstanding_writes if we're running with ABR 3058 * set for this mirror or the write() was issued with MD_STR_ABR set. 3059 * Also a resync initiated write() has no outstanding_writes update 3060 * either. 
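 *
 * Outline (illustrative): a normal mirrored write bumped
 * un_outstanding_writes[] for every resync region it touches when the
 * region was marked dirty, so the decrement below walks the same
 * BLK_TO_RR() range under un_resync_mx.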
3061 */ 3062 if (((cb->b_flags & B_READ) == 0) && 3063 (un->un_nsm >= 2) && 3064 (ps->ps_call == NULL) && 3065 !((ui->ui_tstate & MD_ABR_CAP) || (ps->ps_flags & MD_MPS_ABR)) && 3066 !(ps->ps_flags & MD_MPS_WRITE_AFTER_READ)) { 3067 BLK_TO_RR(end_rr, ps->ps_lastblk, un); 3068 BLK_TO_RR(start_rr, ps->ps_firstblk, un); 3069 mutex_enter(&un->un_resync_mx); 3070 for (current_rr = start_rr; current_rr <= end_rr; current_rr++) 3071 un->un_outstanding_writes[current_rr]--; 3072 mutex_exit(&un->un_resync_mx); 3073 } 3074 kmem_cache_free(mirror_child_cache, cs); 3075 mutex_exit(&ps->ps_mx); 3076 3077 if (ps->ps_call != NULL) { 3078 daemon_request(&md_done_daemon, ps->ps_call, 3079 (daemon_queue_t *)ps, REQ_OLD); 3080 return (1); 3081 } 3082 3083 if ((ps->ps_flags & MD_MPS_ERROR)) { 3084 daemon_request(&md_done_daemon, mirror_error, 3085 (daemon_queue_t *)ps, REQ_OLD); 3086 return (1); 3087 } 3088 3089 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 3090 mirror_overlap_tree_remove(ps); 3091 3092 /* 3093 * Handle Write-on-Write problem. 3094 * Skip In case of Raw and Direct I/O as they are 3095 * handled earlier. 3096 * 3097 */ 3098 if (!(md_mirror_wow_flg & WOW_DISABLE) && 3099 !(pb->b_flags & B_READ) && 3100 !(ps->ps_flags & MD_MPS_WOW) && 3101 !(pb->b_flags & B_PHYS) && 3102 any_pages_dirty(pb)) { 3103 md_unit_readerexit(ps->ps_ui); 3104 daemon_request(&md_mstr_daemon, handle_wow, 3105 (daemon_queue_t *)ps, REQ_OLD); 3106 return (1); 3107 } 3108 3109 md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 3110 MPS_FREE(mirror_parent_cache, ps); 3111 md_unit_readerexit(ui); 3112 md_biodone(pb); 3113 return (0); 3114 } 3115 3116 /* 3117 * Clear error state in submirror component if the retry worked after 3118 * a failfast error. 3119 */ 3120 static void 3121 clear_retry_error(struct buf *cb) 3122 { 3123 int smi; 3124 md_mcs_t *cs; 3125 mm_unit_t *un; 3126 mdi_unit_t *ui_sm; 3127 mm_submirror_t *sm; 3128 mm_submirror_ic_t *smic; 3129 u_longlong_t cnt; 3130 md_m_shared_t *shared; 3131 3132 /*LINTED*/ 3133 cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off); 3134 un = cs->cs_ps->ps_un; 3135 3136 for (smi = 0; smi < NMIRROR; smi++) { 3137 if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) 3138 continue; 3139 3140 if (cb->b_edev == md_dev64_to_dev(un->un_sm[smi].sm_dev)) 3141 break; 3142 } 3143 3144 if (smi >= NMIRROR) 3145 return; 3146 3147 sm = &un->un_sm[smi]; 3148 smic = &un->un_smic[smi]; 3149 cnt = cb->b_bcount; 3150 3151 ui_sm = MDI_UNIT(getminor(cb->b_edev)); 3152 (void) md_unit_writerlock(ui_sm); 3153 3154 shared = (md_m_shared_t *)(*(smic->sm_shared_by_blk))(sm->sm_dev, sm, 3155 cb->b_blkno, &cnt); 3156 3157 if (shared->ms_flags & MDM_S_IOERR) { 3158 shared->ms_flags &= ~MDM_S_IOERR; 3159 3160 } else { 3161 /* the buf spans components and the first one is not erred */ 3162 int cnt; 3163 int i; 3164 3165 cnt = (*(smic->sm_get_component_count))(sm->sm_dev, un); 3166 for (i = 0; i < cnt; i++) { 3167 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) 3168 (sm->sm_dev, sm, i); 3169 3170 if (shared->ms_flags & MDM_S_IOERR && 3171 shared->ms_state == CS_OKAY) { 3172 3173 shared->ms_flags &= ~MDM_S_IOERR; 3174 break; 3175 } 3176 } 3177 } 3178 3179 md_unit_writerexit(ui_sm); 3180 } 3181 3182 static size_t 3183 mirror_map_read( 3184 md_mps_t *ps, 3185 md_mcs_t *cs, 3186 diskaddr_t blkno, 3187 u_longlong_t count 3188 ) 3189 { 3190 mm_unit_t *un; 3191 buf_t *bp; 3192 u_longlong_t cando; 3193 3194 bp = &cs->cs_buf; 3195 un = ps->ps_un; 3196 3197 bp->b_lblkno = blkno; 3198 if (fast_select_read_unit(ps, cs) == 0) 
{ 3199 bp->b_bcount = ldbtob(count); 3200 return (0); 3201 } 3202 bp->b_edev = md_dev64_to_dev(select_read_unit(un, blkno, 3203 count, &cando, 0, NULL, cs)); 3204 bp->b_bcount = ldbtob(cando); 3205 if (count != cando) 3206 return (cando); 3207 return (0); 3208 } 3209 3210 static void 3211 write_after_read(md_mps_t *ps) 3212 { 3213 struct buf *pb; 3214 int flags; 3215 3216 if (ps->ps_flags & MD_MPS_ERROR) { 3217 mirror_error(ps); 3218 return; 3219 } 3220 3221 pb = ps->ps_bp; 3222 md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 3223 ps->ps_call = NULL; 3224 ps->ps_flags |= MD_MPS_WRITE_AFTER_READ; 3225 flags = MD_STR_NOTTOP | MD_STR_WAR; 3226 if (ps->ps_flags & MD_MPS_MAPPED) 3227 flags |= MD_STR_MAPPED; 3228 if (ps->ps_flags & MD_MPS_NOBLOCK) 3229 flags |= MD_NOBLOCK; 3230 if (ps->ps_flags & MD_MPS_DIRTY_RD) 3231 flags |= MD_STR_DIRTY_RD; 3232 (void) mirror_write_strategy(pb, flags, ps); 3233 } 3234 3235 static void 3236 continue_serial(md_mps_t *ps) 3237 { 3238 md_mcs_t *cs; 3239 buf_t *cb; 3240 mm_unit_t *un; 3241 int flags; 3242 3243 un = ps->ps_un; 3244 cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS); 3245 mirror_child_init(cs); 3246 cb = &cs->cs_buf; 3247 ps->ps_call = NULL; 3248 ps->ps_frags = 1; 3249 (void) mirror_map_write(un, cs, ps, 0); 3250 flags = MD_STR_NOTTOP; 3251 if (ps->ps_flags & MD_MPS_MAPPED) 3252 flags |= MD_STR_MAPPED; 3253 md_call_strategy(cb, flags, NULL); 3254 } 3255 3256 static int 3257 mirror_map_write(mm_unit_t *un, md_mcs_t *cs, md_mps_t *ps, int war) 3258 { 3259 int i; 3260 dev_t dev; /* needed for bioclone, so not md_dev64_t */ 3261 buf_t *cb; 3262 buf_t *pb; 3263 diskaddr_t blkno; 3264 size_t bcount; 3265 off_t offset; 3266 3267 pb = ps->ps_bp; 3268 cb = &cs->cs_buf; 3269 cs->cs_ps = ps; 3270 3271 i = md_find_nth_unit(ps->ps_writable_sm, ps->ps_current_sm); 3272 3273 dev = md_dev64_to_dev(un->un_sm[i].sm_dev); 3274 3275 blkno = pb->b_lblkno; 3276 bcount = pb->b_bcount; 3277 offset = 0; 3278 if (war && (blkno == 0) && (un->c.un_flag & MD_LABELED)) { 3279 blkno = DK_LABEL_LOC + 1; 3280 /* 3281 * This handles the case where we're requesting 3282 * a write to block 0 on a label partition 3283 * and the request size was smaller than the 3284 * size of the label. If this is the case 3285 * then we'll return -1. Failure to do so will 3286 * either cause the calling thread to hang due to 3287 * an ssd bug, or worse if the bcount were allowed 3288 * to go negative (ie large). 3289 */ 3290 if (bcount <= DEV_BSIZE*(DK_LABEL_LOC + 1)) 3291 return (-1); 3292 bcount -= (DEV_BSIZE*(DK_LABEL_LOC + 1)); 3293 offset = (DEV_BSIZE*(DK_LABEL_LOC + 1)); 3294 } 3295 3296 cb = md_bioclone(pb, offset, bcount, dev, blkno, mirror_done, 3297 cb, KM_NOSLEEP); 3298 if (war) 3299 cb->b_flags = (cb->b_flags & ~B_READ) | B_WRITE; 3300 3301 /* 3302 * If the submirror is in the erred stated, check if any component is 3303 * in the Last Erred state. If so, we don't want to use the B_FAILFAST 3304 * flag on the IO. 3305 * 3306 * Provide a fast path for the non-erred case (which should be the 3307 * normal case). 
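 *
 * Put another way (illustrative): SMS_COMP_ERRED forces a walk of every
 * component, and B_FAILFAST is only set on the child buffer if none of
 * them is in CS_LAST_ERRED; otherwise the write is issued without it.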
3308 */ 3309 if (un->un_sm[i].sm_flags & MD_SM_FAILFAST) { 3310 if (un->un_sm[i].sm_state & SMS_COMP_ERRED) { 3311 mm_submirror_t *sm; 3312 mm_submirror_ic_t *smic; 3313 int ci; 3314 int compcnt; 3315 3316 sm = &un->un_sm[i]; 3317 smic = &un->un_smic[i]; 3318 3319 compcnt = (*(smic->sm_get_component_count)) 3320 (sm->sm_dev, un); 3321 for (ci = 0; ci < compcnt; ci++) { 3322 md_m_shared_t *shared; 3323 3324 shared = (md_m_shared_t *) 3325 (*(smic->sm_shared_by_indx))(sm->sm_dev, 3326 sm, ci); 3327 3328 if (shared->ms_state == CS_LAST_ERRED) 3329 break; 3330 } 3331 if (ci >= compcnt) 3332 cb->b_flags |= B_FAILFAST; 3333 3334 } else { 3335 cb->b_flags |= B_FAILFAST; 3336 } 3337 } 3338 3339 ps->ps_current_sm++; 3340 if (ps->ps_current_sm != ps->ps_active_cnt) { 3341 if (un->un_write_option == WR_SERIAL) { 3342 ps->ps_call = continue_serial; 3343 return (0); 3344 } 3345 return (1); 3346 } 3347 return (0); 3348 } 3349 3350 /* 3351 * directed_read_done: 3352 * ------------------ 3353 * Completion routine called when a DMR request has been returned from the 3354 * underlying driver. Wake-up the original ioctl() and return the data to 3355 * the user. 3356 */ 3357 static void 3358 directed_read_done(md_mps_t *ps) 3359 { 3360 mm_unit_t *un; 3361 mdi_unit_t *ui; 3362 3363 un = ps->ps_un; 3364 ui = ps->ps_ui; 3365 3366 md_unit_readerexit(ui); 3367 md_kstat_done(ui, ps->ps_bp, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 3368 ps->ps_call = NULL; 3369 3370 mutex_enter(&un->un_dmr_mx); 3371 cv_signal(&un->un_dmr_cv); 3372 mutex_exit(&un->un_dmr_mx); 3373 3374 /* release the parent structure */ 3375 kmem_cache_free(mirror_parent_cache, ps); 3376 } 3377 3378 /* 3379 * daemon_io: 3380 * ------------ 3381 * Called to issue a mirror_write_strategy() or mirror_read_strategy 3382 * call from a blockable context. NOTE: no mutex can be held on entry to this 3383 * routine 3384 */ 3385 static void 3386 daemon_io(daemon_queue_t *dq) 3387 { 3388 md_mps_t *ps = (md_mps_t *)dq; 3389 int flag = MD_STR_NOTTOP; 3390 buf_t *pb = ps->ps_bp; 3391 3392 if (ps->ps_flags & MD_MPS_MAPPED) 3393 flag |= MD_STR_MAPPED; 3394 if (ps->ps_flags & MD_MPS_WOW) 3395 flag |= MD_STR_WOW; 3396 if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ) 3397 flag |= MD_STR_WAR; 3398 if (ps->ps_flags & MD_MPS_ABR) 3399 flag |= MD_STR_ABR; 3400 3401 /* 3402 * If this is a resync read, ie MD_STR_DIRTY_RD not set, set 3403 * MD_STR_WAR before calling mirror_read_strategy 3404 */ 3405 if (pb->b_flags & B_READ) { 3406 if (!(ps->ps_flags & MD_MPS_DIRTY_RD)) 3407 flag |= MD_STR_WAR; 3408 mirror_read_strategy(pb, flag, ps); 3409 } else 3410 mirror_write_strategy(pb, flag, ps); 3411 } 3412 3413 /* 3414 * update_resync: 3415 * ------------- 3416 * Called to update the in-core version of the resync record with the latest 3417 * version that was committed to disk when the previous mirror owner 3418 * relinquished ownership. This call is likely to block as we must hold-off 3419 * any current resync processing that may be occurring. 3420 * On completion of the resync record update we issue the mirror_write_strategy 3421 * call to complete the i/o that first started this sequence. To remove a race 3422 * condition between a new write() request which is submitted and the resync 3423 * record update we acquire the writerlock. This will hold off all i/o to the 3424 * mirror until the resync update has completed. 
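 *
 * Rough sequence (illustrative): take md_unit_writerlock(), re-read the
 * resync record with mddb_reread_rr(), reconcile the in-core bitmaps via
 * mirror_copy_rr(), wake a blocked resync thread if MD_RI_BLOCK_OWNER was
 * set, then hand the deferred buffer back to daemon_io().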
3425 * NOTE: no mutex can be held on entry to this routine 3426 */ 3427 static void 3428 update_resync(daemon_queue_t *dq) 3429 { 3430 md_mps_t *ps = (md_mps_t *)dq; 3431 buf_t *pb = ps->ps_bp; 3432 mdi_unit_t *ui = ps->ps_ui; 3433 mm_unit_t *un; 3434 set_t setno; 3435 int restart_resync; 3436 3437 un = md_unit_writerlock(ui); 3438 ps->ps_un = un; 3439 setno = MD_MIN2SET(getminor(pb->b_edev)); 3440 if (mddb_reread_rr(setno, un->un_rr_dirty_recid) == 0) { 3441 /* 3442 * Synchronize our in-core view of what regions need to be 3443 * resync'd with the on-disk version. 3444 */ 3445 mutex_enter(&un->un_rrp_inflight_mx); 3446 mirror_copy_rr(howmany(un->un_rrd_num, NBBY), un->un_resync_bm, 3447 un->un_dirty_bm); 3448 mutex_exit(&un->un_rrp_inflight_mx); 3449 3450 /* Region dirty map is now up to date */ 3451 } 3452 restart_resync = (un->un_rs_thread_flags & MD_RI_BLOCK_OWNER) ? 1 : 0; 3453 md_unit_writerexit(ui); 3454 3455 /* Restart the resync thread if it was previously blocked */ 3456 if (restart_resync) { 3457 mutex_enter(&un->un_rs_thread_mx); 3458 un->un_rs_thread_flags &= ~MD_RI_BLOCK_OWNER; 3459 cv_signal(&un->un_rs_thread_cv); 3460 mutex_exit(&un->un_rs_thread_mx); 3461 } 3462 /* Continue with original deferred i/o */ 3463 daemon_io(dq); 3464 } 3465 3466 /* 3467 * owner_timeout: 3468 * ------------- 3469 * Called if the original mdmn_ksend_message() failed and the request is to be 3470 * retried. Reattempt the original ownership change. 3471 * 3472 * NOTE: called at interrupt context (see timeout(9f)). 3473 */ 3474 static void 3475 owner_timeout(void *arg) 3476 { 3477 daemon_queue_t *dq = (daemon_queue_t *)arg; 3478 3479 daemon_request(&md_mirror_daemon, become_owner, dq, REQ_OLD); 3480 } 3481 3482 /* 3483 * become_owner: 3484 * ------------ 3485 * Called to issue RPC request to become the owner of the mirror 3486 * associated with this i/o request. We assume that the ownership request 3487 * is synchronous, so if it succeeds we will issue the request via 3488 * mirror_write_strategy(). 3489 * If multiple i/o's are outstanding we will be called from the mirror_daemon 3490 * service thread. 3491 * NOTE: no mutex should be held on entry to this routine. 3492 */ 3493 static void 3494 become_owner(daemon_queue_t *dq) 3495 { 3496 md_mps_t *ps = (md_mps_t *)dq; 3497 mm_unit_t *un = ps->ps_un; 3498 buf_t *pb = ps->ps_bp; 3499 set_t setno; 3500 md_mn_kresult_t *kres; 3501 int msg_flags = md_mirror_msg_flags; 3502 md_mps_t *ps1; 3503 3504 ASSERT(dq->dq_next == NULL && dq->dq_prev == NULL); 3505 3506 /* 3507 * If we're already the mirror owner we do not need to send a message 3508 * but can simply process the i/o request immediately. 3509 * If we've already sent the request to become owner we requeue the 3510 * request as we're waiting for the synchronous ownership message to 3511 * be processed. 3512 */ 3513 if (MD_MN_MIRROR_OWNER(un)) { 3514 /* 3515 * As the strategy() call will potentially block we need to 3516 * punt this to a separate thread and complete this request 3517 * as quickly as possible. Note: if we're a read request 3518 * this must be a resync, we cannot afford to be queued 3519 * behind any intervening i/o requests. In this case we put the 3520 * request on the md_mirror_rs_daemon queue. 
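 *
 * e.g. (illustrative): a resync read is handed to md_mirror_rs_daemon
 * while an ordinary application write goes to md_mirror_io_daemon, which
 * is exactly the split made by the two daemon_request() calls below.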
3521 */ 3522 if (pb->b_flags & B_READ) { 3523 daemon_request(&md_mirror_rs_daemon, daemon_io, dq, 3524 REQ_OLD); 3525 } else { 3526 daemon_request(&md_mirror_io_daemon, daemon_io, dq, 3527 REQ_OLD); 3528 } 3529 } else { 3530 mutex_enter(&un->un_owner_mx); 3531 if ((un->un_owner_state & MM_MN_OWNER_SENT) == 0) { 3532 md_mn_req_owner_t *msg; 3533 int rval = 0; 3534 3535 /* 3536 * Check to see that we haven't exceeded the maximum 3537 * retry count. If we have we fail the i/o as the 3538 * comms mechanism has become wedged beyond recovery. 3539 */ 3540 if (dq->qlen++ >= MD_OWNER_RETRIES) { 3541 mutex_exit(&un->un_owner_mx); 3542 cmn_err(CE_WARN, 3543 "md_mirror: Request exhausted ownership " 3544 "retry limit of %d attempts", dq->qlen); 3545 pb->b_error = EIO; 3546 pb->b_flags |= B_ERROR; 3547 pb->b_resid = pb->b_bcount; 3548 kmem_cache_free(mirror_parent_cache, ps); 3549 md_biodone(pb); 3550 return; 3551 } 3552 3553 /* 3554 * Issue request to change ownership. The call is 3555 * synchronous so when it returns we can complete the 3556 * i/o (if successful), or enqueue it again so that 3557 * the operation will be retried. 3558 */ 3559 un->un_owner_state |= MM_MN_OWNER_SENT; 3560 mutex_exit(&un->un_owner_mx); 3561 3562 msg = kmem_zalloc(sizeof (md_mn_req_owner_t), KM_SLEEP); 3563 setno = MD_MIN2SET(getminor(pb->b_edev)); 3564 msg->mnum = MD_SID(un); 3565 msg->owner = md_mn_mynode_id; 3566 msg_flags |= MD_MSGF_NO_LOG; 3567 /* 3568 * If this IO is triggered by updating a watermark, 3569 * it might be issued by the creation of a softpartition 3570 * while the commd subsystem is suspended. 3571 * We don't want this message to block. 3572 */ 3573 if (ps->ps_flags & MD_MPS_WMUPDATE) { 3574 msg_flags |= MD_MSGF_OVERRIDE_SUSPEND; 3575 } 3576 3577 kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); 3578 rval = mdmn_ksend_message(setno, 3579 MD_MN_MSG_REQUIRE_OWNER, msg_flags, 3580 /* flags */ (char *)msg, 3581 sizeof (md_mn_req_owner_t), kres); 3582 3583 kmem_free(msg, sizeof (md_mn_req_owner_t)); 3584 3585 if (MDMN_KSEND_MSG_OK(rval, kres)) { 3586 dq->qlen = 0; 3587 /* 3588 * Successfully changed owner, reread the 3589 * resync record so that we have a valid idea of 3590 * any previously committed incomplete write()s. 3591 * NOTE: As we need to acquire the resync mutex 3592 * this may block, so we defer it to a separate 3593 * thread handler. This makes us (effectively) 3594 * non-blocking once the ownership message 3595 * handling has completed. 3596 */ 3597 mutex_enter(&un->un_owner_mx); 3598 if (un->un_owner_state & MM_MN_BECOME_OWNER) { 3599 un->un_mirror_owner = md_mn_mynode_id; 3600 /* Sets owner of un_rr_dirty record */ 3601 if (un->un_rr_dirty_recid) 3602 (void) mddb_setowner( 3603 un->un_rr_dirty_recid, 3604 md_mn_mynode_id); 3605 un->un_owner_state &= 3606 ~MM_MN_BECOME_OWNER; 3607 /* 3608 * Release the block on the current 3609 * resync region if it is blocked 3610 */ 3611 ps1 = un->un_rs_prev_overlap; 3612 if ((ps1 != NULL) && 3613 (ps1->ps_flags & MD_MPS_ON_OVERLAP)) 3614 mirror_overlap_tree_remove(ps1); 3615 mutex_exit(&un->un_owner_mx); 3616 3617 /* 3618 * If we're a read, this must be a 3619 * resync request, issue 3620 * the i/o request on the 3621 * md_mirror_rs_daemon queue. This is 3622 * to avoid a deadlock between the 3623 * resync_unit thread and 3624 * subsequent i/o requests that may 3625 * block on the resync region. 
3626 */ 3627 if (pb->b_flags & B_READ) { 3628 daemon_request( 3629 &md_mirror_rs_daemon, 3630 update_resync, dq, REQ_OLD); 3631 } else { 3632 daemon_request( 3633 &md_mirror_io_daemon, 3634 update_resync, dq, REQ_OLD); 3635 } 3636 kmem_free(kres, 3637 sizeof (md_mn_kresult_t)); 3638 return; 3639 } else { 3640 /* 3641 * Some other node has beaten us to 3642 * obtain ownership. We need to 3643 * reschedule our ownership request 3644 */ 3645 mutex_exit(&un->un_owner_mx); 3646 } 3647 } else { 3648 mdmn_ksend_show_error(rval, kres, 3649 "MD_MN_MSG_REQUIRE_OWNER"); 3650 /* 3651 * Message transport failure is handled by the 3652 * comms layer. If the ownership change request 3653 * does not succeed we need to flag the error to 3654 * the initiator of the i/o. This is handled by 3655 * the retry logic above. As the request failed 3656 * we do not know _who_ the owner of the mirror 3657 * currently is. We reset our idea of the owner 3658 * to None so that any further write()s will 3659 * attempt to become the owner again. This stops 3660 * multiple nodes writing to the same mirror 3661 * simultaneously. 3662 */ 3663 mutex_enter(&un->un_owner_mx); 3664 un->un_owner_state &= 3665 ~(MM_MN_OWNER_SENT|MM_MN_BECOME_OWNER); 3666 un->un_mirror_owner = MD_MN_MIRROR_UNOWNED; 3667 mutex_exit(&un->un_owner_mx); 3668 } 3669 kmem_free(kres, sizeof (md_mn_kresult_t)); 3670 } else 3671 mutex_exit(&un->un_owner_mx); 3672 3673 /* 3674 * Re-enqueue this request on the deferred i/o list. Delay the 3675 * request for md_mirror_owner_to usecs to stop thrashing. 3676 */ 3677 (void) timeout(owner_timeout, dq, 3678 drv_usectohz(md_mirror_owner_to)); 3679 } 3680 } 3681 3682 static void 3683 mirror_write_strategy(buf_t *pb, int flag, void *private) 3684 { 3685 md_mps_t *ps; 3686 md_mcs_t *cs; 3687 int more; 3688 mm_unit_t *un; 3689 mdi_unit_t *ui; 3690 buf_t *cb; /* child buf pointer */ 3691 set_t setno; 3692 int rs_on_overlap = 0; 3693 3694 ui = MDI_UNIT(getminor(pb->b_edev)); 3695 un = (mm_unit_t *)MD_UNIT(getminor(pb->b_edev)); 3696 3697 3698 md_kstat_waitq_enter(ui); 3699 3700 /* 3701 * If a state change is in progress for this mirror in a MN set, 3702 * suspend all non-resync writes until the state change is complete. 3703 * The objective of this suspend is to ensure that it is not 3704 * possible for one node to read data from a submirror that another node 3705 * has not written to because of the state change. Therefore we 3706 * suspend all writes until the state change has been made. As it is 3707 * not possible to read from the target of a resync, there is no need 3708 * to suspend resync writes. 
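 *
 * Sketch (illustrative): non-resync writers sit in cv_wait() on
 * un_suspend_wr_cv until un_suspend_wr_flag is cleared when the state
 * change completes; resync writes (MD_STR_WAR) skip the wait entirely.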
3709 */ 3710 3711 if (!(flag & MD_STR_WAR)) { 3712 mutex_enter(&un->un_suspend_wr_mx); 3713 while (un->un_suspend_wr_flag) { 3714 cv_wait(&un->un_suspend_wr_cv, &un->un_suspend_wr_mx); 3715 } 3716 mutex_exit(&un->un_suspend_wr_mx); 3717 (void) md_unit_readerlock(ui); 3718 } 3719 3720 if (!(flag & MD_STR_NOTTOP)) { 3721 if (md_checkbuf(ui, (md_unit_t *)un, pb)) { 3722 md_kstat_waitq_exit(ui); 3723 return; 3724 } 3725 } 3726 3727 setno = MD_MIN2SET(getminor(pb->b_edev)); 3728 3729 /* If an ABR write has been requested, set MD_STR_ABR flag */ 3730 if (MD_MNSET_SETNO(setno) && (pb->b_flags & B_ABRWRITE)) 3731 flag |= MD_STR_ABR; 3732 3733 if (private == NULL) { 3734 ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS); 3735 mirror_parent_init(ps); 3736 } else { 3737 ps = private; 3738 private = NULL; 3739 } 3740 if (flag & MD_STR_MAPPED) 3741 ps->ps_flags |= MD_MPS_MAPPED; 3742 3743 if (flag & MD_STR_WOW) 3744 ps->ps_flags |= MD_MPS_WOW; 3745 3746 if (flag & MD_STR_ABR) 3747 ps->ps_flags |= MD_MPS_ABR; 3748 3749 if (flag & MD_STR_WMUPDATE) 3750 ps->ps_flags |= MD_MPS_WMUPDATE; 3751 3752 /* 3753 * Save essential information from the original buffhdr 3754 * in the md_save structure. 3755 */ 3756 ps->ps_un = un; 3757 ps->ps_ui = ui; 3758 ps->ps_bp = pb; 3759 ps->ps_addr = pb->b_un.b_addr; 3760 ps->ps_firstblk = pb->b_lblkno; 3761 ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1; 3762 ps->ps_changecnt = un->un_changecnt; 3763 3764 /* 3765 * If not MN owner and this is an ABR write, make sure the current 3766 * resync region is in the overlaps tree 3767 */ 3768 mutex_enter(&un->un_owner_mx); 3769 if (MD_MNSET_SETNO(setno) && (!(MD_MN_MIRROR_OWNER(un))) && 3770 ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) { 3771 md_mps_t *ps1; 3772 /* Block the current resync region, if not already blocked */ 3773 ps1 = un->un_rs_prev_overlap; 3774 3775 if ((ps1 != NULL) && ((ps1->ps_firstblk != 0) || 3776 (ps1->ps_lastblk != 0))) { 3777 /* Drop locks to avoid deadlock */ 3778 mutex_exit(&un->un_owner_mx); 3779 md_unit_readerexit(ui); 3780 wait_for_overlaps(ps1, MD_OVERLAP_ALLOW_REPEAT); 3781 rs_on_overlap = 1; 3782 (void) md_unit_readerlock(ui); 3783 mutex_enter(&un->un_owner_mx); 3784 /* 3785 * Check to see if we have obtained ownership 3786 * while waiting for overlaps. If we have, remove 3787 * the resync_region entry from the overlap tree 3788 */ 3789 if (MD_MN_MIRROR_OWNER(un) && 3790 (ps1->ps_flags & MD_MPS_ON_OVERLAP)) { 3791 mirror_overlap_tree_remove(ps1); 3792 rs_on_overlap = 0; 3793 } 3794 } 3795 } 3796 mutex_exit(&un->un_owner_mx); 3797 3798 3799 /* 3800 * following keep write after read from writing to the 3801 * source in the case where it all came from one place 3802 */ 3803 if (flag & MD_STR_WAR) { 3804 int abort_write = 0; 3805 /* 3806 * We are perfoming a write-after-read. This is either as a 3807 * result of a resync read or as a result of a read in a 3808 * dirty resync region when the optimized resync is not 3809 * complete. 
If in a MN set and a resync generated i/o,
3810 * if the current block is not in the current
3811 * resync region terminate the write as another node must have
3812 * completed this resync region.
3813 */
3814 if ((MD_MNSET_SETNO(MD_UN2SET(un))) &&
3815 !(flag & MD_STR_DIRTY_RD)) {
3816 if (!IN_RESYNC_REGION(un, ps))
3817 abort_write = 1;
3818 }
3819 if ((select_write_after_read_units(un, ps) == 0) ||
3820 (abort_write)) {
3821 #ifdef DEBUG
3822 if (mirror_debug_flag)
3823 printf("Abort resync write on %x, block %lld\n",
3824 MD_SID(un), ps->ps_firstblk);
3825 #endif
3826 if (ps->ps_flags & MD_MPS_ON_OVERLAP)
3827 mirror_overlap_tree_remove(ps);
3828 kmem_cache_free(mirror_parent_cache, ps);
3829 md_kstat_waitq_exit(ui);
3830 md_unit_readerexit(ui);
3831 md_biodone(pb);
3832 return;
3833 }
3834 } else {
3835 select_write_units(un, ps);
3836
3837 /* Drop readerlock to avoid deadlock */
3838 md_unit_readerexit(ui);
3839 wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT);
3840 un = md_unit_readerlock(ui);
3841 /*
3842 * For a MN set with an ABR write, if we are now the
3843 * owner and we have a resync region in the overlap
3844 * tree, remove the entry from overlaps and retry the write.
3845 */
3846
3847 if (MD_MNSET_SETNO(setno) &&
3848 ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) {
3849 mutex_enter(&un->un_owner_mx);
3850 if (((MD_MN_MIRROR_OWNER(un))) && rs_on_overlap) {
3851 mirror_overlap_tree_remove(ps);
3852 md_kstat_waitq_exit(ui);
3853 mutex_exit(&un->un_owner_mx);
3854 md_unit_readerexit(ui);
3855 daemon_request(&md_mirror_daemon, daemon_io,
3856 (daemon_queue_t *)ps, REQ_OLD);
3857 return;
3858 }
3859 mutex_exit(&un->un_owner_mx);
3860 }
3861 }
3862
3863 /*
3864 * For Multinode mirrors with a Resync Region (not ABR) we need to
3865 * become the mirror owner before continuing with the write(). For ABR
3866 * mirrors we check that we 'own' the resync if we're in
3867 * write-after-read mode. We do this _after_ ensuring that there are no
3868 * overlaps to ensure that, once we know that we are the owner, the
3869 * readerlock will not be released until the write is complete. As a
3870 * change of ownership in a MN set requires the writerlock, this
3871 * ensures that ownership cannot be changed until the write is
3872 * complete.
3873 */
3874 if (MD_MNSET_SETNO(setno) && (!((ui->ui_tstate & MD_ABR_CAP) ||
3875 (flag & MD_STR_ABR)) || (flag & MD_STR_WAR))) {
3876 if (!MD_MN_MIRROR_OWNER(un)) {
3877 if (ps->ps_flags & MD_MPS_ON_OVERLAP)
3878 mirror_overlap_tree_remove(ps);
3879 md_kstat_waitq_exit(ui);
3880 ASSERT(!(flag & MD_STR_WAR));
3881 md_unit_readerexit(ui);
3882 daemon_request(&md_mirror_daemon, become_owner,
3883 (daemon_queue_t *)ps, REQ_OLD);
3884 return;
3885 }
3886 }
3887
3888 /*
3889 * Mark resync region if mirror has a Resync Region _and_ we are not
3890 * a resync initiated write(). Don't mark region if we're flagged as
3891 * an ABR write.
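 *
 * Illustrative: a normal write covering [ps_firstblk, ps_lastblk] is
 * recorded via mirror_mark_resync_region() before any child buffer is
 * issued, so a crash in mid-write leaves those regions flagged for a
 * later resync.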
3892 */ 3893 if (!((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR)) && 3894 !(flag & MD_STR_WAR)) { 3895 if (mirror_mark_resync_region(un, ps->ps_firstblk, 3896 ps->ps_lastblk)) { 3897 pb->b_flags |= B_ERROR; 3898 pb->b_resid = pb->b_bcount; 3899 ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP)); 3900 kmem_cache_free(mirror_parent_cache, ps); 3901 md_kstat_waitq_exit(ui); 3902 md_unit_readerexit(ui); 3903 md_biodone(pb); 3904 return; 3905 } 3906 } 3907 3908 ps->ps_childbflags = pb->b_flags | B_WRITE; 3909 ps->ps_childbflags &= ~B_READ; 3910 if (flag & MD_STR_MAPPED) 3911 ps->ps_childbflags &= ~B_PAGEIO; 3912 3913 if (!(flag & MD_STR_NOTTOP) && panicstr) 3914 /* Disable WOW and don't free ps */ 3915 ps->ps_flags |= (MD_MPS_WOW|MD_MPS_DONTFREE); 3916 3917 md_kstat_waitq_to_runq(ui); 3918 3919 /* 3920 * Treat Raw and Direct I/O as Write-on-Write always 3921 */ 3922 3923 if (!(md_mirror_wow_flg & WOW_DISABLE) && 3924 (md_mirror_wow_flg & WOW_PHYS_ENABLE) && 3925 (pb->b_flags & B_PHYS) && 3926 !(ps->ps_flags & MD_MPS_WOW)) { 3927 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 3928 mirror_overlap_tree_remove(ps); 3929 md_unit_readerexit(ui); 3930 daemon_request(&md_mstr_daemon, handle_wow, 3931 (daemon_queue_t *)ps, REQ_OLD); 3932 return; 3933 } 3934 3935 ps->ps_frags = 1; 3936 do { 3937 cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS); 3938 mirror_child_init(cs); 3939 cb = &cs->cs_buf; 3940 more = mirror_map_write(un, cs, ps, (flag & MD_STR_WAR)); 3941 3942 /* 3943 * This handles the case where we're requesting 3944 * a write to block 0 on a label partition. (more < 0) 3945 * means that the request size was smaller than the 3946 * size of the label. If so this request is done. 3947 */ 3948 if (more < 0) { 3949 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 3950 mirror_overlap_tree_remove(ps); 3951 md_kstat_runq_exit(ui); 3952 kmem_cache_free(mirror_child_cache, cs); 3953 kmem_cache_free(mirror_parent_cache, ps); 3954 md_unit_readerexit(ui); 3955 md_biodone(pb); 3956 return; 3957 } 3958 if (more) { 3959 mutex_enter(&ps->ps_mx); 3960 ps->ps_frags++; 3961 mutex_exit(&ps->ps_mx); 3962 } 3963 md_call_strategy(cb, flag, private); 3964 } while (more); 3965 3966 if (!(flag & MD_STR_NOTTOP) && panicstr) { 3967 while (!(ps->ps_flags & MD_MPS_DONE)) { 3968 md_daemon(1, &md_done_daemon); 3969 drv_usecwait(10); 3970 } 3971 kmem_cache_free(mirror_parent_cache, ps); 3972 } 3973 } 3974 3975 static void 3976 mirror_read_strategy(buf_t *pb, int flag, void *private) 3977 { 3978 md_mps_t *ps; 3979 md_mcs_t *cs; 3980 size_t more; 3981 mm_unit_t *un; 3982 mdi_unit_t *ui; 3983 size_t current_count; 3984 diskaddr_t current_blkno; 3985 off_t current_offset; 3986 buf_t *cb; /* child buf pointer */ 3987 set_t setno; 3988 3989 ui = MDI_UNIT(getminor(pb->b_edev)); 3990 3991 md_kstat_waitq_enter(ui); 3992 3993 un = (mm_unit_t *)md_unit_readerlock(ui); 3994 3995 if (!(flag & MD_STR_NOTTOP)) { 3996 if (md_checkbuf(ui, (md_unit_t *)un, pb)) { 3997 md_kstat_waitq_exit(ui); 3998 return; 3999 } 4000 } 4001 4002 if (private == NULL) { 4003 ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS); 4004 mirror_parent_init(ps); 4005 } else { 4006 ps = private; 4007 private = NULL; 4008 } 4009 4010 if (flag & MD_STR_MAPPED) 4011 ps->ps_flags |= MD_MPS_MAPPED; 4012 if (flag & MD_NOBLOCK) 4013 ps->ps_flags |= MD_MPS_NOBLOCK; 4014 if (flag & MD_STR_WMUPDATE) 4015 ps->ps_flags |= MD_MPS_WMUPDATE; 4016 4017 /* 4018 * Check to see if this is a DMR driven read. 
If so we need to use the 4019 * specified side (in un->un_dmr_last_read) for the source of the data. 4020 */ 4021 if (flag & MD_STR_DMR) 4022 ps->ps_flags |= MD_MPS_DMR; 4023 4024 /* 4025 * Save essential information from the original buffhdr 4026 * in the md_save structure. 4027 */ 4028 ps->ps_un = un; 4029 ps->ps_ui = ui; 4030 ps->ps_bp = pb; 4031 ps->ps_addr = pb->b_un.b_addr; 4032 ps->ps_firstblk = pb->b_lblkno; 4033 ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1; 4034 ps->ps_changecnt = un->un_changecnt; 4035 4036 current_count = btodb(pb->b_bcount); 4037 current_blkno = pb->b_lblkno; 4038 current_offset = 0; 4039 4040 /* 4041 * If flag has MD_STR_WAR set this means that the read is issued by a 4042 * resync thread which may or may not be an optimised resync. 4043 * 4044 * If MD_UN_OPT_NOT_DONE is set this means that the optimized resync 4045 * code has not completed; either a resync has not started since snarf, 4046 * or there is an optimized resync in progress. 4047 * 4048 * We need to generate a write after this read in the following two 4049 * cases, 4050 * 4051 * 1. Any Resync-Generated read 4052 * 4053 * 2. Any read to a DIRTY REGION if there is an optimized resync 4054 * pending or in progress. 4055 * 4056 * The write after read is done in these cases to ensure that all sides 4057 * of the mirror are in sync with the read data and that it is not 4058 * possible for an application to read the same block multiple times 4059 * and get different data. 4060 * 4061 * This would be possible if the block was in a dirty region. 4062 * 4063 * If we're performing a directed read we don't write the data out as 4064 * the application is responsible for restoring the mirror to a known 4065 * state. 4066 */ 4067 if (((MD_STATUS(un) & MD_UN_OPT_NOT_DONE) || (flag & MD_STR_WAR)) && 4068 !(flag & MD_STR_DMR)) { 4069 size_t start_rr, i, end_rr; 4070 int region_dirty = 1; 4071 4072 /* 4073 * We enter here under three circumstances, 4074 * 4075 * MD_UN_OPT_NOT_DONE MD_STR_WAR 4076 * 0 1 4077 * 1 0 4078 * 1 1 4079 * 4080 * To be optimal we only care to explicitly check for dirty 4081 * regions in the second case since if MD_STR_WAR is set we 4082 * always do the write after read. 4083 */ 4084 if (!(flag & MD_STR_WAR)) { 4085 BLK_TO_RR(end_rr, ps->ps_lastblk, un); 4086 BLK_TO_RR(start_rr, ps->ps_firstblk, un); 4087 4088 for (i = start_rr; i <= end_rr; i++) 4089 if ((region_dirty = IS_KEEPDIRTY(i, un)) != 0) 4090 break; 4091 } 4092 4093 if ((region_dirty) && 4094 !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) { 4095 ps->ps_call = write_after_read; 4096 /* 4097 * Mark this as a RESYNC_READ in ps_flags. 4098 * This is used if the read fails during a 4099 * resync of a 3-way mirror to ensure that 4100 * the retried read to the remaining 4101 * good submirror has MD_STR_WAR set. This 4102 * is needed to ensure that the resync write 4103 * (write-after-read) takes place. 4104 */ 4105 ps->ps_flags |= MD_MPS_RESYNC_READ; 4106 4107 /* 4108 * If MD_STR_FLAG_ERR is set in the flags we 4109 * set MD_MPS_FLAG_ERROR so that an error on the resync 4110 * write (issued by write_after_read) will be flagged 4111 * to the biowait'ing resync thread. This allows us to 4112 * avoid issuing further resync requests to a device 4113 * that has had a write failure. 
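 *
 * Taken together, the write-after-read decision in this block
 * condenses to roughly the following (an illustrative restatement
 * of the tests above, not additional logic):
 *
 *	if (!(flag & MD_STR_DMR) &&
 *	    !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE) &&
 *	    ((flag & MD_STR_WAR) ||
 *	    ((MD_STATUS(un) & MD_UN_OPT_NOT_DONE) && region_dirty)))
 *		ps->ps_call = write_after_read;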
4114 */ 4115 if (flag & MD_STR_FLAG_ERR) 4116 ps->ps_flags |= MD_MPS_FLAG_ERROR; 4117 4118 setno = MD_UN2SET(un); 4119 /* 4120 * Drop the readerlock to avoid 4121 * deadlock 4122 */ 4123 md_unit_readerexit(ui); 4124 wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT); 4125 un = md_unit_readerlock(ui); 4126 /* 4127 * Ensure that we are owner 4128 */ 4129 if (MD_MNSET_SETNO(setno)) { 4130 /* 4131 * For a non-resync read that requires a 4132 * write-after-read to be done, set a flag 4133 * in the parent structure, so that the 4134 * write_strategy routine can omit the 4135 * test that the write is still within the 4136 * resync region 4137 */ 4138 if (!(flag & MD_STR_WAR)) 4139 ps->ps_flags |= MD_MPS_DIRTY_RD; 4140 4141 /* 4142 * Before reading the buffer, see if 4143 * we are the owner 4144 */ 4145 if (!MD_MN_MIRROR_OWNER(un)) { 4146 ps->ps_call = NULL; 4147 mirror_overlap_tree_remove(ps); 4148 md_kstat_waitq_exit(ui); 4149 md_unit_readerexit(ui); 4150 daemon_request( 4151 &md_mirror_daemon, 4152 become_owner, 4153 (daemon_queue_t *)ps, 4154 REQ_OLD); 4155 return; 4156 } 4157 /* 4158 * For a resync read, check to see if I/O is 4159 * outside of the current resync region, or 4160 * the resync has finished. If so 4161 * just terminate the I/O 4162 */ 4163 if ((flag & MD_STR_WAR) && 4164 (!(un->c.un_status & MD_UN_WAR) || 4165 (!IN_RESYNC_REGION(un, ps)))) { 4166 #ifdef DEBUG 4167 if (mirror_debug_flag) 4168 printf("Abort resync read " 4169 "%x: %lld\n", 4170 MD_SID(un), 4171 ps->ps_firstblk); 4172 #endif 4173 mirror_overlap_tree_remove(ps); 4174 kmem_cache_free(mirror_parent_cache, 4175 ps); 4176 md_kstat_waitq_exit(ui); 4177 md_unit_readerexit(ui); 4178 md_biodone(pb); 4179 return; 4180 } 4181 } 4182 } 4183 } 4184 4185 if (flag & MD_STR_DMR) { 4186 ps->ps_call = directed_read_done; 4187 } 4188 4189 if (!(flag & MD_STR_NOTTOP) && panicstr) 4190 ps->ps_flags |= MD_MPS_DONTFREE; 4191 4192 md_kstat_waitq_to_runq(ui); 4193 4194 ps->ps_frags++; 4195 do { 4196 cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS); 4197 mirror_child_init(cs); 4198 cb = &cs->cs_buf; 4199 cs->cs_ps = ps; 4200 4201 cb = md_bioclone(pb, current_offset, current_count, NODEV, 4202 current_blkno, mirror_done, cb, KM_NOSLEEP); 4203 4204 more = mirror_map_read(ps, cs, current_blkno, 4205 (u_longlong_t)current_count); 4206 if (more) { 4207 mutex_enter(&ps->ps_mx); 4208 ps->ps_frags++; 4209 mutex_exit(&ps->ps_mx); 4210 } 4211 4212 /* 4213 * Do these calculations now, 4214 * so that we pickup a valid b_bcount from the chld_bp. 4215 */ 4216 current_count -= more; 4217 current_offset += cb->b_bcount; 4218 current_blkno += more; 4219 md_call_strategy(cb, flag, private); 4220 } while (more); 4221 4222 if (!(flag & MD_STR_NOTTOP) && panicstr) { 4223 while (!(ps->ps_flags & MD_MPS_DONE)) { 4224 md_daemon(1, &md_done_daemon); 4225 drv_usecwait(10); 4226 } 4227 kmem_cache_free(mirror_parent_cache, ps); 4228 } 4229 } 4230 4231 void 4232 md_mirror_strategy(buf_t *bp, int flag, void *private) 4233 { 4234 set_t setno = MD_MIN2SET(getminor(bp->b_edev)); 4235 4236 /* 4237 * When doing IO to a multi owner meta device, check if set is halted. 4238 * We do this check without the needed lock held, for performance 4239 * reasons. 4240 * If an IO just slips through while the set is locked via an 4241 * MD_MN_SUSPEND_SET, we don't care about it. 4242 * Only check for suspension if we are a top-level i/o request 4243 * (MD_STR_NOTTOP is cleared in 'flag'). 
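 *
 * Note that the test below requires *both* MD_SET_HALTED and
 * MD_SET_MNSET to be set, so only a halted multi-owner set ever
 * blocks here; traditional (non-MN) sets and running MN sets fall
 * straight through to the i/o accounting.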
4244 */ 4245 if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) == 4246 (MD_SET_HALTED | MD_SET_MNSET)) { 4247 if ((flag & MD_STR_NOTTOP) == 0) { 4248 mutex_enter(&md_mx); 4249 /* Here we loop until the set is no longer halted */ 4250 while (md_set[setno].s_status & MD_SET_HALTED) { 4251 cv_wait(&md_cv, &md_mx); 4252 } 4253 mutex_exit(&md_mx); 4254 } 4255 } 4256 4257 if ((flag & MD_IO_COUNTED) == 0) { 4258 if ((flag & MD_NOBLOCK) == 0) { 4259 if (md_inc_iocount(setno) != 0) { 4260 bp->b_flags |= B_ERROR; 4261 bp->b_error = ENXIO; 4262 bp->b_resid = bp->b_bcount; 4263 biodone(bp); 4264 return; 4265 } 4266 } else { 4267 md_inc_iocount_noblock(setno); 4268 } 4269 } 4270 4271 if (bp->b_flags & B_READ) 4272 mirror_read_strategy(bp, flag, private); 4273 else 4274 mirror_write_strategy(bp, flag, private); 4275 } 4276 4277 /* 4278 * mirror_directed_read: 4279 * -------------------- 4280 * Entry-point for the DKIOCDMR ioctl. We issue a read to a specified sub-mirror 4281 * so that the application can determine what (if any) resync needs to be 4282 * performed. The data is copied out to the user-supplied buffer. 4283 * 4284 * Parameters: 4285 * mdev - dev_t for the mirror device 4286 * vdr - directed read parameters specifying location and submirror 4287 * to perform the read from 4288 * mode - used to ddi_copyout() any resulting data from the read 4289 * 4290 * Returns: 4291 * 0 success 4292 * !0 error code 4293 * EINVAL - invalid request format 4294 */ 4295 int 4296 mirror_directed_read(dev_t mdev, vol_directed_rd_t *vdr, int mode) 4297 { 4298 buf_t *bp; 4299 minor_t mnum = getminor(mdev); 4300 mdi_unit_t *ui = MDI_UNIT(mnum); 4301 mm_unit_t *un; 4302 mm_submirror_t *sm; 4303 char *sm_nm; 4304 uint_t next_side; 4305 void *kbuffer; 4306 4307 if (ui == NULL) 4308 return (ENXIO); 4309 4310 if (!(vdr->vdr_flags & DKV_DMR_NEXT_SIDE)) { 4311 return (EINVAL); 4312 } 4313 4314 /* Check for aligned block access. We disallow non-aligned requests. */ 4315 if (vdr->vdr_offset % DEV_BSIZE) { 4316 return (EINVAL); 4317 } 4318 4319 /* 4320 * Allocate kernel buffer for target of read(). If we had a reliable 4321 * (sorry functional) DDI this wouldn't be needed. 4322 */ 4323 kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP); 4324 if (kbuffer == NULL) { 4325 cmn_err(CE_WARN, "mirror_directed_read: couldn't allocate %lx" 4326 " bytes\n", vdr->vdr_nbytes); 4327 return (ENOMEM); 4328 } 4329 4330 bp = getrbuf(KM_SLEEP); 4331 4332 bp->b_un.b_addr = kbuffer; 4333 bp->b_flags = B_READ; 4334 bp->b_bcount = vdr->vdr_nbytes; 4335 bp->b_lblkno = lbtodb(vdr->vdr_offset); 4336 bp->b_edev = mdev; 4337 4338 un = md_unit_readerlock(ui); 4339 4340 /* 4341 * If DKV_SIDE_INIT is set we need to determine the first available 4342 * side to start reading from. If it isn't set we increment to the 4343 * next readable submirror. 4344 * If there are no readable submirrors we error out with DKV_DMR_ERROR. 4345 * Note: we check for a readable submirror on completion of the i/o so 4346 * we should _always_ have one available. If this becomes unavailable 4347 * we have missed the 'DKV_DMR_DONE' opportunity. This could happen if 4348 * a metadetach is made between the completion of one DKIOCDMR ioctl 4349 * and the start of the next (i.e. a sys-admin 'accident' occurred). 4350 * The chance of this is small, but not non-existent. 
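 *
 * For reference, a caller of DKIOCDMR is expected to drive the whole
 * read cycle with a loop along these lines (sketch only; error
 * handling and the declaration of the vol_directed_rd_t variable are
 * omitted, and the offset/buffer fields must be filled in first):
 *
 *	vdr.vdr_flags = DKV_DMR_NEXT_SIDE;
 *	vdr.vdr_side = DKV_SIDE_INIT;
 *	vdr.vdr_offset = <byte offset, DEV_BSIZE aligned>;
 *	vdr.vdr_nbytes = <size of the vdr.vdr_data buffer>;
 *	do {
 *		if (ioctl(fd, DKIOCDMR, &vdr) != 0)
 *			break;
 *		compare vdr.vdr_bytesread bytes of vdr.vdr_data,
 *		    noting vdr.vdr_side and vdr.vdr_side_name;
 *	} while (!(vdr.vdr_flags & (DKV_DMR_DONE | DKV_DMR_ERROR)));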
4351 */ 4352 if (vdr->vdr_side == DKV_SIDE_INIT) { 4353 next_side = 0; 4354 } else { 4355 next_side = vdr->vdr_side + 1; 4356 } 4357 while ((next_side < NMIRROR) && 4358 !SUBMIRROR_IS_READABLE(un, next_side)) 4359 next_side++; 4360 if (next_side >= NMIRROR) { 4361 vdr->vdr_flags |= DKV_DMR_ERROR; 4362 freerbuf(bp); 4363 vdr->vdr_bytesread = 0; 4364 md_unit_readerexit(ui); 4365 return (0); 4366 } 4367 4368 /* Set the side to read from */ 4369 un->un_dmr_last_read = next_side; 4370 4371 md_unit_readerexit(ui); 4372 4373 /* 4374 * Save timestamp for verification purposes. Can be read by debugger 4375 * to verify that this ioctl has been executed and to find the number 4376 * of DMR reads and the time of the last DMR read. 4377 */ 4378 uniqtime(&mirror_dmr_stats.dmr_timestamp); 4379 mirror_dmr_stats.dmr_count++; 4380 4381 /* Issue READ request and wait for completion */ 4382 mirror_read_strategy(bp, MD_STR_DMR|MD_NOBLOCK|MD_STR_NOTTOP, NULL); 4383 4384 mutex_enter(&un->un_dmr_mx); 4385 cv_wait(&un->un_dmr_cv, &un->un_dmr_mx); 4386 mutex_exit(&un->un_dmr_mx); 4387 4388 /* 4389 * Check to see if we encountered an error during the read. If so we 4390 * can make no guarantee about any possibly returned data. 4391 */ 4392 if ((bp->b_flags & B_ERROR) == 0) { 4393 vdr->vdr_flags &= ~DKV_DMR_ERROR; 4394 if (bp->b_resid) { 4395 vdr->vdr_flags |= DKV_DMR_SHORT; 4396 vdr->vdr_bytesread = vdr->vdr_nbytes - bp->b_resid; 4397 } else { 4398 vdr->vdr_flags |= DKV_DMR_SUCCESS; 4399 vdr->vdr_bytesread = vdr->vdr_nbytes; 4400 } 4401 /* Copy the data read back out to the user supplied buffer */ 4402 if (ddi_copyout(kbuffer, vdr->vdr_data, vdr->vdr_bytesread, 4403 mode)) { 4404 kmem_free(kbuffer, vdr->vdr_nbytes); 4405 return (EFAULT); 4406 } 4407 4408 } else { 4409 /* Error out with DKV_DMR_ERROR */ 4410 vdr->vdr_flags |= DKV_DMR_ERROR; 4411 vdr->vdr_flags &= ~(DKV_DMR_SUCCESS|DKV_DMR_SHORT|DKV_DMR_DONE); 4412 } 4413 /* 4414 * Update the DMR parameters with the side and name of submirror that 4415 * we have just read from (un->un_dmr_last_read) 4416 */ 4417 un = md_unit_readerlock(ui); 4418 4419 vdr->vdr_side = un->un_dmr_last_read; 4420 sm = &un->un_sm[un->un_dmr_last_read]; 4421 sm_nm = md_shortname(md_getminor(sm->sm_dev)); 4422 4423 (void) strncpy(vdr->vdr_side_name, sm_nm, sizeof (vdr->vdr_side_name)); 4424 4425 /* 4426 * Determine if we've completed the read cycle. This is true iff the 4427 * next computed submirror (side) equals or exceeds NMIRROR. We cannot 4428 * use un_nsm as we need to handle a sparse array of submirrors (which 4429 * can occur if a submirror is metadetached). 4430 */ 4431 next_side = un->un_dmr_last_read + 1; 4432 while ((next_side < NMIRROR) && 4433 !SUBMIRROR_IS_READABLE(un, next_side)) 4434 next_side++; 4435 if (next_side >= NMIRROR) { 4436 /* We've finished */ 4437 vdr->vdr_flags |= DKV_DMR_DONE; 4438 } 4439 4440 md_unit_readerexit(ui); 4441 freerbuf(bp); 4442 kmem_free(kbuffer, vdr->vdr_nbytes); 4443 4444 return (0); 4445 } 4446 4447 /* 4448 * mirror_resync_message: 4449 * --------------------- 4450 * Handle the multi-node resync messages that keep all nodes within a given 4451 * disk-set in sync with their view of a mirror's resync status. 
4452 * 4453 * The message types dealt with are: 4454 * MD_MN_MSG_RESYNC_STARTING - start a resync thread for a unit 4455 * MD_MN_MSG_RESYNC_NEXT - specified next region to be resynced 4456 * MD_MN_MSG_RESYNC_FINISH - stop the resync thread for a unit 4457 * MD_MN_MSG_RESYNC_PHASE_DONE - end of a resync phase, opt, submirror or comp 4458 * 4459 * Returns: 4460 * 0 Success 4461 * >0 Failure error number 4462 */ 4463 int 4464 mirror_resync_message(md_mn_rs_params_t *p, IOLOCK *lockp) 4465 { 4466 mdi_unit_t *ui; 4467 mm_unit_t *un; 4468 set_t setno; 4469 int is_ABR; 4470 int smi; 4471 int ci; 4472 sm_state_t state; 4473 int broke_out; 4474 mm_submirror_t *sm; 4475 mm_submirror_ic_t *smic; 4476 md_m_shared_t *shared; 4477 md_error_t mde = mdnullerror; 4478 md_mps_t *ps; 4479 int rs_active; 4480 4481 /* Check that the given device is part of a multi-node set */ 4482 setno = MD_MIN2SET(p->mnum); 4483 if (setno >= md_nsets) { 4484 return (ENXIO); 4485 } 4486 if (!MD_MNSET_SETNO(setno)) { 4487 return (EINVAL); 4488 } 4489 4490 if ((un = mirror_getun(p->mnum, &p->mde, NO_LOCK, NULL)) == NULL) 4491 return (EINVAL); 4492 if ((ui = MDI_UNIT(p->mnum)) == NULL) 4493 return (EINVAL); 4494 is_ABR = (ui->ui_tstate & MD_ABR_CAP); 4495 4496 /* Obtain the current resync status */ 4497 (void) md_ioctl_readerlock(lockp, ui); 4498 rs_active = (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ? 1 : 0; 4499 md_ioctl_readerexit(lockp); 4500 4501 switch ((md_mn_msgtype_t)p->msg_type) { 4502 case MD_MN_MSG_RESYNC_STARTING: 4503 /* Start the resync thread for the mirror */ 4504 (void) mirror_resync_unit(p->mnum, NULL, &p->mde, lockp); 4505 break; 4506 4507 case MD_MN_MSG_RESYNC_NEXT: 4508 /* 4509 * We have to release any previously marked overlap regions 4510 * so that i/o can resume. Then we need to block the region 4511 * from [rs_start..rs_start+rs_size) * so that no i/o is issued. 4512 * Update un_rs_resync_done and un_rs_resync_2_do. 4513 */ 4514 (void) md_ioctl_readerlock(lockp, ui); 4515 /* 4516 * Ignore the message if there is no active resync thread or 4517 * if it is for a resync type that we have already completed. 4518 * un_resync_completed is set to the last resync completed 4519 * when processing a PHASE_DONE message. 4520 */ 4521 if (!rs_active || (p->rs_type == un->un_resync_completed)) 4522 break; 4523 /* 4524 * If this message is for the same resync and is for an earlier 4525 * resync region, just ignore it. This can only occur if this 4526 * node has progressed on to the next resync region before 4527 * we receive this message. This can occur if the class for 4528 * this message is busy and the originator has to retry thus 4529 * allowing this node to move onto the next resync_region. 4530 */ 4531 if ((p->rs_type == un->un_rs_type) && 4532 (p->rs_start < un->un_resync_startbl)) 4533 break; 4534 ps = un->un_rs_prev_overlap; 4535 4536 /* Allocate previous overlap reference if needed */ 4537 if (ps == NULL) { 4538 ps = kmem_cache_alloc(mirror_parent_cache, 4539 MD_ALLOCFLAGS); 4540 ps->ps_un = un; 4541 ps->ps_ui = ui; 4542 ps->ps_firstblk = 0; 4543 ps->ps_lastblk = 0; 4544 ps->ps_flags = 0; 4545 md_ioctl_readerexit(lockp); 4546 (void) md_ioctl_writerlock(lockp, ui); 4547 un->un_rs_prev_overlap = ps; 4548 md_ioctl_writerexit(lockp); 4549 } else 4550 md_ioctl_readerexit(lockp); 4551 4552 if (p->rs_originator != md_mn_mynode_id) { 4553 /* 4554 * On all but the originating node, first update 4555 * the resync state, then unblock the previous 4556 * region and block the next one. 
No need 4557 * to do this if the region is already blocked. 4558 * Update the submirror state and flags from the 4559 * originator. This keeps the cluster in sync with 4560 * regards to the resync status. 4561 */ 4562 4563 (void) md_ioctl_writerlock(lockp, ui); 4564 un->un_rs_resync_done = p->rs_done; 4565 un->un_rs_resync_2_do = p->rs_2_do; 4566 un->un_rs_type = p->rs_type; 4567 un->un_resync_startbl = p->rs_start; 4568 md_ioctl_writerexit(lockp); 4569 /* 4570 * Use un_owner_mx to ensure that an ownership change 4571 * cannot happen at the same time as this message 4572 */ 4573 mutex_enter(&un->un_owner_mx); 4574 if (MD_MN_MIRROR_OWNER(un)) { 4575 ps->ps_firstblk = p->rs_start; 4576 ps->ps_lastblk = ps->ps_firstblk + 4577 p->rs_size - 1; 4578 } else { 4579 if ((ps->ps_firstblk != p->rs_start) || 4580 (ps->ps_lastblk != p->rs_start + 4581 p->rs_size - 1)) { 4582 /* Remove previous overlap range */ 4583 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 4584 mirror_overlap_tree_remove(ps); 4585 4586 ps->ps_firstblk = p->rs_start; 4587 ps->ps_lastblk = ps->ps_firstblk + 4588 p->rs_size - 1; 4589 4590 mutex_exit(&un->un_owner_mx); 4591 /* Block this range from all i/o. */ 4592 if (ps->ps_firstblk != 0 || 4593 ps->ps_lastblk != 0) 4594 wait_for_overlaps(ps, 4595 MD_OVERLAP_ALLOW_REPEAT); 4596 mutex_enter(&un->un_owner_mx); 4597 /* 4598 * Check to see if we have obtained 4599 * ownership while waiting for 4600 * overlaps. If we have, remove 4601 * the resync_region entry from the 4602 * overlap tree 4603 */ 4604 if (MD_MN_MIRROR_OWNER(un) && 4605 (ps->ps_flags & MD_MPS_ON_OVERLAP)) 4606 mirror_overlap_tree_remove(ps); 4607 } 4608 } 4609 mutex_exit(&un->un_owner_mx); 4610 4611 /* 4612 * If this is the first RESYNC_NEXT message (i.e. 4613 * MD_MN_RS_FIRST_RESYNC_NEXT set in p->rs_flags), 4614 * issue RESYNC_START NOTIFY event 4615 */ 4616 if (p->rs_flags & MD_MN_RS_FIRST_RESYNC_NEXT) { 4617 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_START, 4618 SVM_TAG_METADEVICE, MD_UN2SET(un), 4619 MD_SID(un)); 4620 } 4621 4622 /* Ensure that our local resync thread is running */ 4623 if (un->un_rs_thread == NULL) { 4624 (void) mirror_resync_unit(p->mnum, NULL, 4625 &p->mde, lockp); 4626 } 4627 } 4628 break; 4629 case MD_MN_MSG_RESYNC_FINISH: 4630 /* 4631 * Complete the resync by stopping the resync thread. 4632 * Also release the previous overlap region field. 4633 * Update the resync_progress_thread by cv_signal'ing it so 4634 * that we mark the end of the resync as soon as possible. This 4635 * stops an unnecessary delay should be panic after resync 4636 * completion. 
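 *
 * In outline the handling below is:
 *	- if a resync is still active and we are not the originator,
 *	  shut the local resync thread down
 *	- if the mirror is ABR, clear the mirror owner
 *	- release un_rs_prev_overlap (removing it from the overlap
 *	  tree if it is still on it)
 *	- record the final resync counts and wake the progress thread
 *	- clear MD_UN_RESYNC_ACTIVE and retry any pending grow_unit()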
4637 */ 4638 #ifdef DEBUG 4639 if (!rs_active) { 4640 if (mirror_debug_flag) 4641 printf("RESYNC_FINISH (mnum = %x), " 4642 "Resync *NOT* active", 4643 p->mnum); 4644 } 4645 #endif 4646 4647 if ((un->c.un_status & MD_UN_RESYNC_ACTIVE) && 4648 (p->rs_originator != md_mn_mynode_id)) { 4649 mutex_enter(&un->un_rs_thread_mx); 4650 un->c.un_status &= ~MD_UN_RESYNC_CANCEL; 4651 un->un_rs_thread_flags |= MD_RI_SHUTDOWN; 4652 un->un_rs_thread_flags &= 4653 ~(MD_RI_BLOCK|MD_RI_BLOCK_OWNER); 4654 cv_signal(&un->un_rs_thread_cv); 4655 mutex_exit(&un->un_rs_thread_mx); 4656 } 4657 if (is_ABR) { 4658 /* Resync finished, if ABR set owner to NULL */ 4659 mutex_enter(&un->un_owner_mx); 4660 un->un_mirror_owner = 0; 4661 mutex_exit(&un->un_owner_mx); 4662 } 4663 (void) md_ioctl_writerlock(lockp, ui); 4664 ps = un->un_rs_prev_overlap; 4665 if (ps != NULL) { 4666 /* Remove previous overlap range */ 4667 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 4668 mirror_overlap_tree_remove(ps); 4669 /* 4670 * Release the overlap range reference 4671 */ 4672 un->un_rs_prev_overlap = NULL; 4673 kmem_cache_free(mirror_parent_cache, 4674 ps); 4675 } 4676 md_ioctl_writerexit(lockp); 4677 4678 /* Mark the resync as complete in the metadb */ 4679 un->un_rs_resync_done = p->rs_done; 4680 un->un_rs_resync_2_do = p->rs_2_do; 4681 un->un_rs_type = p->rs_type; 4682 mutex_enter(&un->un_rs_progress_mx); 4683 cv_signal(&un->un_rs_progress_cv); 4684 mutex_exit(&un->un_rs_progress_mx); 4685 4686 un = md_ioctl_writerlock(lockp, ui); 4687 un->c.un_status &= ~MD_UN_RESYNC_ACTIVE; 4688 /* Deal with any pending grow_unit */ 4689 if (un->c.un_status & MD_UN_GROW_PENDING) { 4690 if ((mirror_grow_unit(un, &mde) != 0) || 4691 (! mdismderror(&mde, MDE_GROW_DELAYED))) { 4692 un->c.un_status &= ~MD_UN_GROW_PENDING; 4693 } 4694 } 4695 md_ioctl_writerexit(lockp); 4696 break; 4697 4698 case MD_MN_MSG_RESYNC_PHASE_DONE: 4699 /* 4700 * A phase of the resync, optimized. component or 4701 * submirror is complete. Update mirror status. 4702 * If the flag CLEAR_OPT_NOT_DONE is set, it means that the 4703 * mirror owner is peforming a resync. If we have just snarfed 4704 * this set, then we must clear any of the flags set at snarf 4705 * time by unit_setup_resync(). 4706 * Note that unit_setup_resync() sets up these flags to 4707 * indicate that an optimized resync is required. These flags 4708 * need to be reset because if we get here, the mirror owner 4709 * will have handled the optimized resync. 4710 * The flags that must be cleared are MD_UN_OPT_NOT_DONE and 4711 * MD_UN_WAR. In addition, for each submirror, 4712 * MD_SM_RESYNC_TARGET must be cleared and SMS_OFFLINE_RESYNC 4713 * set to SMS_OFFLINE. 4714 */ 4715 #ifdef DEBUG 4716 if (mirror_debug_flag) 4717 printf("phase done mess received from %d, mnum=%x," 4718 "type=%x, flags=%x\n", p->rs_originator, p->mnum, 4719 p->rs_type, p->rs_flags); 4720 #endif 4721 /* 4722 * Ignore the message if there is no active resync thread. 4723 */ 4724 if (!rs_active) 4725 break; 4726 4727 broke_out = p->rs_flags & MD_MN_RS_ERR; 4728 switch (RS_TYPE(p->rs_type)) { 4729 case MD_RS_OPTIMIZED: 4730 un = md_ioctl_writerlock(lockp, ui); 4731 if (p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE) { 4732 /* If we are originator, just clear rs_type */ 4733 if (p->rs_originator == md_mn_mynode_id) { 4734 SET_RS_TYPE_NONE(un->un_rs_type); 4735 md_ioctl_writerexit(lockp); 4736 break; 4737 } 4738 /* 4739 * If CLEAR_OPT_NOT_DONE is set, only clear the 4740 * flags if OPT_NOT_DONE is set *and* rs_type 4741 * is MD_RS_NONE. 
4742 */ 4743 if ((un->c.un_status & MD_UN_OPT_NOT_DONE) && 4744 (RS_TYPE(un->un_rs_type) == MD_RS_NONE)) { 4745 /* No resync in progress */ 4746 un->c.un_status &= ~MD_UN_OPT_NOT_DONE; 4747 un->c.un_status &= ~MD_UN_WAR; 4748 } else { 4749 /* 4750 * We are in the middle of an 4751 * optimized resync and this message 4752 * should be ignored. 4753 */ 4754 md_ioctl_writerexit(lockp); 4755 break; 4756 } 4757 } else { 4758 /* 4759 * This is the end of an optimized resync, 4760 * clear the OPT_NOT_DONE and OFFLINE_SM flags 4761 */ 4762 4763 un->c.un_status &= ~MD_UN_KEEP_DIRTY; 4764 if (!broke_out) 4765 un->c.un_status &= ~MD_UN_WAR; 4766 } 4767 4768 /* 4769 * Set resync_completed to last resync type and then 4770 * clear resync_type to indicate no resync in progress 4771 */ 4772 un->un_resync_completed = un->un_rs_type; 4773 SET_RS_TYPE_NONE(un->un_rs_type); 4774 4775 /* 4776 * If resync is as a result of a submirror ONLINE, 4777 * reset the submirror state to SMS_RUNNING if the 4778 * resync was ok else set back to SMS_OFFLINE. 4779 */ 4780 for (smi = 0; smi < NMIRROR; smi++) { 4781 un->un_sm[smi].sm_flags &= 4782 ~MD_SM_RESYNC_TARGET; 4783 if (SMS_BY_INDEX_IS(un, smi, 4784 SMS_OFFLINE_RESYNC)) { 4785 if (p->rs_flags & 4786 MD_MN_RS_CLEAR_OPT_NOT_DONE) { 4787 state = SMS_OFFLINE; 4788 } else { 4789 state = (broke_out ? 4790 SMS_OFFLINE : SMS_RUNNING); 4791 } 4792 mirror_set_sm_state( 4793 &un->un_sm[smi], 4794 &un->un_smic[smi], state, 4795 broke_out); 4796 mirror_commit(un, NO_SUBMIRRORS, 4797 0); 4798 } 4799 /* 4800 * If we still have an offline submirror, reset 4801 * the OFFLINE_SM flag in the mirror status 4802 */ 4803 if (SMS_BY_INDEX_IS(un, smi, 4804 SMS_OFFLINE)) 4805 un->c.un_status |= 4806 MD_UN_OFFLINE_SM; 4807 } 4808 md_ioctl_writerexit(lockp); 4809 break; 4810 case MD_RS_SUBMIRROR: 4811 un = md_ioctl_writerlock(lockp, ui); 4812 smi = RS_SMI(p->rs_type); 4813 sm = &un->un_sm[smi]; 4814 smic = &un->un_smic[smi]; 4815 /* Clear RESYNC target */ 4816 un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET; 4817 /* 4818 * Set resync_completed to last resync type and then 4819 * clear resync_type to indicate no resync in progress 4820 */ 4821 un->un_resync_completed = un->un_rs_type; 4822 SET_RS_TYPE_NONE(un->un_rs_type); 4823 /* 4824 * If the resync completed ok reset the submirror 4825 * state to SMS_RUNNING else reset it to SMS_ATTACHED 4826 */ 4827 state = (broke_out ? 4828 SMS_ATTACHED : SMS_RUNNING); 4829 mirror_set_sm_state(sm, smic, state, broke_out); 4830 un->c.un_status &= ~MD_UN_WAR; 4831 mirror_commit(un, SMI2BIT(smi), 0); 4832 md_ioctl_writerexit(lockp); 4833 break; 4834 case MD_RS_COMPONENT: 4835 un = md_ioctl_writerlock(lockp, ui); 4836 smi = RS_SMI(p->rs_type); 4837 ci = RS_CI(p->rs_type); 4838 sm = &un->un_sm[smi]; 4839 smic = &un->un_smic[smi]; 4840 shared = (md_m_shared_t *) 4841 (*(smic->sm_shared_by_indx)) 4842 (sm->sm_dev, sm, ci); 4843 un->c.un_status &= ~MD_UN_WAR; 4844 /* Clear RESYNC target */ 4845 un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET; 4846 /* 4847 * Set resync_completed to last resync type and then 4848 * clear resync_type to indicate no resync in progress 4849 */ 4850 un->un_resync_completed = un->un_rs_type; 4851 SET_RS_TYPE_NONE(un->un_rs_type); 4852 4853 /* 4854 * If the resync completed ok, set the component state 4855 * to CS_OKAY. 4856 */ 4857 if (broke_out) 4858 shared->ms_flags |= MDM_S_RS_TRIED; 4859 else { 4860 /* 4861 * As we don't transmit the changes, 4862 * no need to drop the lock. 
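 * (MD_STATE_NO_XMIT here means the component state change is applied
 * locally only; each node is expected to make the same change when it
 * processes its own copy of this PHASE_DONE message.)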
4863 */ 4864 set_sm_comp_state(un, smi, ci, CS_OKAY, 0, 4865 MD_STATE_NO_XMIT, (IOLOCK *)NULL); 4866 } 4867 md_ioctl_writerexit(lockp); 4868 default: 4869 break; 4870 } 4871 /* 4872 * If the purpose of this PHASE_DONE message is just to 4873 * indicate to all other nodes that the optimized resync 4874 * required (OPT_NOT_DONE) flag is to be cleared, there is 4875 * no need to generate a notify event as there has not 4876 * actually been a resync. 4877 */ 4878 if (!(p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE)) { 4879 if (broke_out) { 4880 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_FAILED, 4881 SVM_TAG_METADEVICE, MD_UN2SET(un), 4882 MD_SID(un)); 4883 } else { 4884 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_DONE, 4885 SVM_TAG_METADEVICE, MD_UN2SET(un), 4886 MD_SID(un)); 4887 } 4888 } 4889 break; 4890 4891 default: 4892 #ifdef DEBUG 4893 cmn_err(CE_PANIC, "mirror_resync_message: Unknown message type" 4894 " %x\n", p->msg_type); 4895 #endif 4896 return (EINVAL); 4897 } 4898 return (0); 4899 } 4900 4901 /* Return a -1 if snarf of optimized record failed and set should be released */ 4902 static int 4903 mirror_snarf(md_snarfcmd_t cmd, set_t setno) 4904 { 4905 mddb_recid_t recid; 4906 int gotsomething; 4907 int all_mirrors_gotten; 4908 mm_unit_t *un; 4909 mddb_type_t typ1; 4910 mddb_de_ic_t *dep; 4911 mddb_rb32_t *rbp; 4912 size_t newreqsize; 4913 mm_unit_t *big_un; 4914 mm_unit32_od_t *small_un; 4915 int retval; 4916 mdi_unit_t *ui; 4917 4918 if (cmd == MD_SNARF_CLEANUP) { 4919 if (md_get_setstatus(setno) & MD_SET_STALE) 4920 return (0); 4921 4922 recid = mddb_makerecid(setno, 0); 4923 typ1 = (mddb_type_t)md_getshared_key(setno, 4924 mirror_md_ops.md_driver.md_drivername); 4925 while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) { 4926 if (mddb_getrecprivate(recid) & MD_PRV_CLEANUP) { 4927 un = (mm_unit_t *)mddb_getrecaddr(recid); 4928 mirror_cleanup(un); 4929 recid = mddb_makerecid(setno, 0); 4930 } 4931 } 4932 return (0); 4933 } 4934 4935 all_mirrors_gotten = 1; 4936 gotsomething = 0; 4937 4938 recid = mddb_makerecid(setno, 0); 4939 typ1 = (mddb_type_t)md_getshared_key(setno, 4940 mirror_md_ops.md_driver.md_drivername); 4941 4942 while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) { 4943 if (mddb_getrecprivate(recid) & MD_PRV_GOTIT) 4944 continue; 4945 4946 dep = mddb_getrecdep(recid); 4947 dep->de_flags = MDDB_F_MIRROR; 4948 rbp = dep->de_rb; 4949 4950 switch (rbp->rb_revision) { 4951 case MDDB_REV_RB: 4952 case MDDB_REV_RBFN: 4953 if ((rbp->rb_private & MD_PRV_CONVD) == 0) { 4954 /* 4955 * This means, we have an old and small 4956 * record and this record hasn't already 4957 * been converted. Before we create an 4958 * incore metadevice from this we have to 4959 * convert it to a big record. 4960 */ 4961 small_un = 4962 (mm_unit32_od_t *)mddb_getrecaddr(recid); 4963 newreqsize = sizeof (mm_unit_t); 4964 big_un = (mm_unit_t *)kmem_zalloc(newreqsize, 4965 KM_SLEEP); 4966 mirror_convert((caddr_t)small_un, 4967 (caddr_t)big_un, SMALL_2_BIG); 4968 kmem_free(small_un, dep->de_reqsize); 4969 4970 /* 4971 * Update userdata and incore userdata 4972 * incores are at the end of un 4973 */ 4974 dep->de_rb_userdata_ic = big_un; 4975 dep->de_rb_userdata = big_un; 4976 dep->de_icreqsize = newreqsize; 4977 un = big_un; 4978 rbp->rb_private |= MD_PRV_CONVD; 4979 } else { 4980 /* 4981 * Unit already converted, just get the 4982 * record address. 
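 *
 * For reference, this switch handles the record revisions as
 * follows:
 *	MDDB_REV_RB / MDDB_REV_RBFN     small (32-bit) record; convert
 *	                                it with mirror_convert() the
 *	                                first time it is snarfed and
 *	                                mark it MD_PRV_CONVD so later
 *	                                snarfs take this branch instead
 *	MDDB_REV_RB64 / MDDB_REV_RB64FN native record; use it in place
 *	                                and flag MD_64BIT_META_DEV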
4983 */ 4984 un = (mm_unit_t *)mddb_getrecaddr_resize(recid, 4985 sizeof (*un), 0); 4986 } 4987 un->c.un_revision &= ~MD_64BIT_META_DEV; 4988 break; 4989 case MDDB_REV_RB64: 4990 case MDDB_REV_RB64FN: 4991 /* Big device */ 4992 un = (mm_unit_t *)mddb_getrecaddr_resize(recid, 4993 sizeof (*un), 0); 4994 un->c.un_revision |= MD_64BIT_META_DEV; 4995 un->c.un_flag |= MD_EFILABEL; 4996 break; 4997 } 4998 MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision); 4999 5000 /* 5001 * Create minor device node for snarfed entry. 5002 */ 5003 (void) md_create_minor_node(setno, MD_SID(un)); 5004 5005 if (MD_UNIT(MD_SID(un)) != NULL) { 5006 mddb_setrecprivate(recid, MD_PRV_PENDDEL); 5007 continue; 5008 } 5009 all_mirrors_gotten = 0; 5010 retval = mirror_build_incore(un, 1); 5011 if (retval == 0) { 5012 mddb_setrecprivate(recid, MD_PRV_GOTIT); 5013 md_create_unit_incore(MD_SID(un), &mirror_md_ops, 0); 5014 resync_start_timeout(setno); 5015 gotsomething = 1; 5016 } else { 5017 return (retval); 5018 } 5019 /* 5020 * Set flag to indicate that the mirror has not yet 5021 * been through a reconfig. This flag is used for MN sets 5022 * when determining whether to update the mirror state from 5023 * the Master node. 5024 */ 5025 if (MD_MNSET_SETNO(setno)) { 5026 ui = MDI_UNIT(MD_SID(un)); 5027 ui->ui_tstate |= MD_RESYNC_NOT_DONE; 5028 } 5029 } 5030 5031 if (!all_mirrors_gotten) 5032 return (gotsomething); 5033 5034 recid = mddb_makerecid(setno, 0); 5035 while ((recid = mddb_getnextrec(recid, typ1, RESYNC_REC)) > 0) 5036 if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT)) 5037 mddb_setrecprivate(recid, MD_PRV_PENDDEL); 5038 5039 return (0); 5040 } 5041 5042 static int 5043 mirror_halt(md_haltcmd_t cmd, set_t setno) 5044 { 5045 unit_t i; 5046 mdi_unit_t *ui; 5047 minor_t mnum; 5048 int reset_mirror_flag = 0; 5049 5050 if (cmd == MD_HALT_CLOSE) 5051 return (0); 5052 5053 if (cmd == MD_HALT_OPEN) 5054 return (0); 5055 5056 if (cmd == MD_HALT_UNLOAD) 5057 return (0); 5058 5059 if (cmd == MD_HALT_CHECK) { 5060 for (i = 0; i < md_nunits; i++) { 5061 mnum = MD_MKMIN(setno, i); 5062 if ((ui = MDI_UNIT(mnum)) == NULL) 5063 continue; 5064 if (ui->ui_opsindex != mirror_md_ops.md_selfindex) 5065 continue; 5066 if (md_unit_isopen(ui)) 5067 return (1); 5068 } 5069 return (0); 5070 } 5071 5072 if (cmd != MD_HALT_DOIT) 5073 return (1); 5074 5075 for (i = 0; i < md_nunits; i++) { 5076 mnum = MD_MKMIN(setno, i); 5077 if ((ui = MDI_UNIT(mnum)) == NULL) 5078 continue; 5079 if (ui->ui_opsindex != mirror_md_ops.md_selfindex) 5080 continue; 5081 reset_mirror((mm_unit_t *)MD_UNIT(mnum), mnum, 0); 5082 5083 /* Set a flag if there is at least one mirror metadevice. */ 5084 reset_mirror_flag = 1; 5085 } 5086 5087 /* 5088 * Only wait for the global dr_timeout to finish 5089 * - if there are mirror metadevices in this diskset or 5090 * - if this is the local set since an unload of the md_mirror 5091 * driver could follow a successful mirror halt in the local set. 5092 */ 5093 if ((reset_mirror_flag != 0) || (setno == MD_LOCAL_SET)) { 5094 while ((mirror_md_ops.md_head == NULL) && 5095 (mirror_timeout.dr_timeout_id != 0)) 5096 delay(md_hz); 5097 } 5098 5099 return (0); 5100 } 5101 5102 /*ARGSUSED3*/ 5103 static int 5104 mirror_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags) 5105 { 5106 IOLOCK lock; 5107 minor_t mnum = getminor(*dev); 5108 set_t setno; 5109 5110 /* 5111 * When doing an open of a multi owner metadevice, check to see if this 5112 * node is a starting node and if a reconfig cycle is underway. 
5113 * If so, the system isn't sufficiently set up enough to handle the 5114 * open (which involves I/O during sp_validate), so fail with ENXIO. 5115 */ 5116 setno = MD_MIN2SET(mnum); 5117 if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) == 5118 (MD_SET_MNSET | MD_SET_MN_START_RC)) { 5119 return (ENXIO); 5120 } 5121 5122 if (md_oflags & MD_OFLG_FROMIOCTL) { 5123 /* 5124 * This indicates that the caller is an ioctl service routine. 5125 * In this case we initialise our stack-based IOLOCK and pass 5126 * this into the internal open routine. This allows multi-owner 5127 * metadevices to avoid deadlocking if an error is encountered 5128 * during the open() attempt. The failure case is: 5129 * s-p -> mirror -> s-p (with error). Attempting to metaclear 5130 * this configuration would deadlock as the mirror code has to 5131 * send a state-update to the other nodes when it detects the 5132 * failure of the underlying submirror with an errored soft-part 5133 * on it. As there is a class1 message in progress (metaclear) 5134 * set_sm_comp_state() cannot send another class1 message; 5135 * instead we do not send a state_update message as the 5136 * metaclear is distributed and the failed submirror will be 5137 * cleared from the configuration by the metaclear. 5138 */ 5139 IOLOCK_INIT(&lock); 5140 return (mirror_internal_open(getminor(*dev), flag, otyp, 5141 md_oflags, &lock)); 5142 } else { 5143 return (mirror_internal_open(getminor(*dev), flag, otyp, 5144 md_oflags, (IOLOCK *)NULL)); 5145 } 5146 } 5147 5148 5149 /*ARGSUSED1*/ 5150 static int 5151 mirror_close(dev_t dev, int flag, int otyp, cred_t *cred_p, int md_cflags) 5152 { 5153 return (mirror_internal_close(getminor(dev), otyp, md_cflags, 5154 (IOLOCK *)NULL)); 5155 } 5156 5157 5158 /* 5159 * This routine dumps memory to the disk. It assumes that the memory has 5160 * already been mapped into mainbus space. It is called at disk interrupt 5161 * priority when the system is in trouble. 5162 * 5163 */ 5164 static int 5165 mirror_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk) 5166 { 5167 mm_unit_t *un; 5168 dev_t mapdev; 5169 int result; 5170 int smi; 5171 int any_succeed = 0; 5172 int save_result = 0; 5173 5174 /* 5175 * Don't need to grab the unit lock. 5176 * Cause nothing else is suppose to be happenning. 5177 * Also dump is not suppose to sleep. 5178 */ 5179 un = (mm_unit_t *)MD_UNIT(getminor(dev)); 5180 5181 if ((diskaddr_t)blkno >= un->c.un_total_blocks) 5182 return (EINVAL); 5183 5184 if ((diskaddr_t)blkno + nblk > un->c.un_total_blocks) 5185 return (EINVAL); 5186 5187 for (smi = 0; smi < NMIRROR; smi++) { 5188 if (!SUBMIRROR_IS_WRITEABLE(un, smi)) 5189 continue; 5190 mapdev = md_dev64_to_dev(un->un_sm[smi].sm_dev); 5191 result = bdev_dump(mapdev, addr, blkno, nblk); 5192 if (result) 5193 save_result = result; 5194 5195 if (result == 0) 5196 any_succeed++; 5197 } 5198 5199 if (any_succeed) 5200 return (0); 5201 5202 return (save_result); 5203 } 5204 5205 /* 5206 * NAME: mirror_probe_dev 5207 * 5208 * DESCRITPION: force opens every component of a mirror. 
5209 * 5210 * On entry the unit writerlock is held 5211 */ 5212 static int 5213 mirror_probe_dev(mdi_unit_t *ui, minor_t mnum) 5214 { 5215 int i; 5216 int smi; 5217 int ci; 5218 mm_unit_t *un; 5219 int md_devopen = 0; 5220 set_t setno; 5221 int sm_cnt; 5222 int sm_unavail_cnt; 5223 5224 if (md_unit_isopen(ui)) 5225 md_devopen++; 5226 5227 un = MD_UNIT(mnum); 5228 setno = MD_UN2SET(un); 5229 5230 sm_cnt = 0; 5231 sm_unavail_cnt = 0; 5232 for (i = 0; i < NMIRROR; i++) { 5233 md_dev64_t tmpdev; 5234 mdi_unit_t *sm_ui; 5235 5236 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) { 5237 continue; 5238 } 5239 5240 sm_cnt++; 5241 tmpdev = un->un_sm[i].sm_dev; 5242 (void) md_layered_open(mnum, &tmpdev, 5243 MD_OFLG_CONT_ERRS | MD_OFLG_PROBEDEV); 5244 un->un_sm[i].sm_dev = tmpdev; 5245 5246 sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev))); 5247 5248 /* 5249 * Logic similar to that in mirror_open_all_devs. We set or 5250 * clear the submirror Unavailable bit. 5251 */ 5252 (void) md_unit_writerlock(sm_ui); 5253 if (submirror_unavailable(un, i, 1)) { 5254 sm_ui->ui_tstate |= MD_INACCESSIBLE; 5255 sm_unavail_cnt++; 5256 } else { 5257 sm_ui->ui_tstate &= ~MD_INACCESSIBLE; 5258 } 5259 md_unit_writerexit(sm_ui); 5260 } 5261 5262 /* 5263 * If all of the submirrors are unavailable, the mirror is also 5264 * unavailable. 5265 */ 5266 if (sm_cnt == sm_unavail_cnt) { 5267 ui->ui_tstate |= MD_INACCESSIBLE; 5268 } else { 5269 ui->ui_tstate &= ~MD_INACCESSIBLE; 5270 } 5271 5272 /* 5273 * Start checking from probe failures. If failures occur we 5274 * set the appropriate erred state only if the metadevice is in 5275 * use. This is specifically to prevent unnecessary resyncs. 5276 * For instance if the disks were accidentally disconnected when 5277 * the system booted up then until the metadevice is accessed 5278 * (like file system mount) the user can shutdown, recable and 5279 * reboot w/o incurring a potentially huge resync. 
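 *
 * In outline, the loop below walks every errored component reported
 * by mirror_geterror() and, for each one:
 *	- if no other source for the data remains (mirror_other_sources()
 *	  returns 1) and the metadevice is open, marks it CS_LAST_ERRED
 *	  and carries on;
 *	- if no other source remains and the metadevice is not open,
 *	  closes the devices and fails the probe with ENXIO;
 *	- otherwise marks it CS_ERRED (only when the metadevice is open).
 * Once the loop completes, hotspares are poked and the devices are
 * closed again.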
5280 */ 5281 5282 smi = 0; 5283 ci = 0; 5284 while (mirror_geterror(un, &smi, &ci, 1, 1) != 0) { 5285 5286 if (mirror_other_sources(un, smi, ci, 0) == 1) { 5287 /* 5288 * Note that for a MN set, there is no need to call 5289 * SE_NOTIFY as that is done when processing the 5290 * state change 5291 */ 5292 if (md_devopen) { 5293 /* 5294 * Never called from ioctl context, 5295 * so (IOLOCK *)NULL 5296 */ 5297 set_sm_comp_state(un, smi, ci, CS_LAST_ERRED, 5298 0, MD_STATE_XMIT, (IOLOCK *)NULL); 5299 if (!MD_MNSET_SETNO(setno)) { 5300 SE_NOTIFY(EC_SVM_STATE, 5301 ESC_SVM_LASTERRED, 5302 SVM_TAG_METADEVICE, setno, 5303 MD_SID(un)); 5304 } 5305 continue; 5306 } else { 5307 (void) mirror_close_all_devs(un, 5308 MD_OFLG_PROBEDEV); 5309 if (!MD_MNSET_SETNO(setno)) { 5310 SE_NOTIFY(EC_SVM_STATE, 5311 ESC_SVM_OPEN_FAIL, 5312 SVM_TAG_METADEVICE, setno, 5313 MD_SID(un)); 5314 } 5315 mirror_openfail_console_info(un, smi, ci); 5316 return (ENXIO); 5317 } 5318 } 5319 5320 /* 5321 * Note that for a MN set, there is no need to call 5322 * SE_NOTIFY as that is done when processing the 5323 * state change 5324 */ 5325 if (md_devopen) { 5326 /* Never called from ioctl context, so (IOLOCK *)NULL */ 5327 set_sm_comp_state(un, smi, ci, CS_ERRED, 0, 5328 MD_STATE_XMIT, (IOLOCK *)NULL); 5329 if (!MD_MNSET_SETNO(setno)) { 5330 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, 5331 SVM_TAG_METADEVICE, setno, 5332 MD_SID(un)); 5333 } 5334 } 5335 mirror_openfail_console_info(un, smi, ci); 5336 ci++; 5337 } 5338 5339 if (MD_MNSET_SETNO(setno)) { 5340 send_poke_hotspares(setno); 5341 } else { 5342 (void) poke_hotspares(); 5343 } 5344 (void) mirror_close_all_devs(un, MD_OFLG_PROBEDEV); 5345 5346 return (0); 5347 } 5348 5349 5350 static int 5351 mirror_imp_set( 5352 set_t setno 5353 ) 5354 { 5355 5356 mddb_recid_t recid; 5357 int gotsomething, i; 5358 mddb_type_t typ1; 5359 mddb_de_ic_t *dep; 5360 mddb_rb32_t *rbp; 5361 mm_unit32_od_t *un32; 5362 mm_unit_t *un64; 5363 md_dev64_t self_devt; 5364 minor_t *self_id; /* minor needs to be updated */ 5365 md_parent_t *parent_id; /* parent needs to be updated */ 5366 mddb_recid_t *record_id; /* record id needs to be updated */ 5367 mddb_recid_t *optrec_id; 5368 md_dev64_t tmpdev; 5369 5370 5371 gotsomething = 0; 5372 5373 typ1 = (mddb_type_t)md_getshared_key(setno, 5374 mirror_md_ops.md_driver.md_drivername); 5375 recid = mddb_makerecid(setno, 0); 5376 5377 while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) { 5378 if (mddb_getrecprivate(recid) & MD_PRV_GOTIT) 5379 continue; 5380 5381 dep = mddb_getrecdep(recid); 5382 rbp = dep->de_rb; 5383 5384 switch (rbp->rb_revision) { 5385 case MDDB_REV_RB: 5386 case MDDB_REV_RBFN: 5387 /* 5388 * Small device 5389 */ 5390 un32 = (mm_unit32_od_t *)mddb_getrecaddr(recid); 5391 self_id = &(un32->c.un_self_id); 5392 parent_id = &(un32->c.un_parent); 5393 record_id = &(un32->c.un_record_id); 5394 optrec_id = &(un32->un_rr_dirty_recid); 5395 5396 for (i = 0; i < un32->un_nsm; i++) { 5397 tmpdev = md_expldev(un32->un_sm[i].sm_dev); 5398 un32->un_sm[i].sm_dev = md_cmpldev 5399 (md_makedevice(md_major, MD_MKMIN(setno, 5400 MD_MIN2UNIT(md_getminor(tmpdev))))); 5401 5402 if (!md_update_minor(setno, mddb_getsidenum 5403 (setno), un32->un_sm[i].sm_key)) 5404 goto out; 5405 } 5406 break; 5407 case MDDB_REV_RB64: 5408 case MDDB_REV_RB64FN: 5409 un64 = (mm_unit_t *)mddb_getrecaddr(recid); 5410 self_id = &(un64->c.un_self_id); 5411 parent_id = &(un64->c.un_parent); 5412 record_id = &(un64->c.un_record_id); 5413 optrec_id = &(un64->un_rr_dirty_recid); 5414 5415 
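            /*
             * As in the 32-bit case above, each submirror dev is
             * re-based into the importing set: the unit number is
             * kept and the set part of the minor is replaced, i.e.
             * roughly MD_MKMIN(setno, MD_MIN2UNIT(old_minor)), and
             * the namespace minor is updated via md_update_minor().
             */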
for (i = 0; i < un64->un_nsm; i++) { 5416 tmpdev = un64->un_sm[i].sm_dev; 5417 un64->un_sm[i].sm_dev = md_makedevice 5418 (md_major, MD_MKMIN(setno, MD_MIN2UNIT 5419 (md_getminor(tmpdev)))); 5420 5421 if (!md_update_minor(setno, mddb_getsidenum 5422 (setno), un64->un_sm[i].sm_key)) 5423 goto out; 5424 } 5425 break; 5426 } 5427 5428 /* 5429 * If this is a top level and a friendly name metadevice, 5430 * update its minor in the namespace. 5431 */ 5432 if ((*parent_id == MD_NO_PARENT) && 5433 ((rbp->rb_revision == MDDB_REV_RBFN) || 5434 (rbp->rb_revision == MDDB_REV_RB64FN))) { 5435 5436 self_devt = md_makedevice(md_major, *self_id); 5437 if (!md_update_top_device_minor(setno, 5438 mddb_getsidenum(setno), self_devt)) 5439 goto out; 5440 } 5441 5442 /* 5443 * Update unit with the imported setno 5444 * 5445 */ 5446 mddb_setrecprivate(recid, MD_PRV_GOTIT); 5447 5448 *self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id)); 5449 if (*parent_id != MD_NO_PARENT) 5450 *parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id)); 5451 *record_id = MAKERECID(setno, DBID(*record_id)); 5452 *optrec_id = MAKERECID(setno, DBID(*optrec_id)); 5453 5454 gotsomething = 1; 5455 } 5456 5457 out: 5458 return (gotsomething); 5459 } 5460 5461 /* 5462 * NAME: mirror_check_offline 5463 * 5464 * DESCRIPTION: return offline_status = 1 if any submirrors are offline 5465 * 5466 * Called from ioctl, so access to MD_UN_OFFLINE_SM in un_status is 5467 * protected by the global ioctl lock as it is only set by the MD_IOCOFFLINE 5468 * ioctl. 5469 */ 5470 int 5471 mirror_check_offline(md_dev64_t dev, int *offline_status) 5472 { 5473 mm_unit_t *un; 5474 md_error_t mde = mdnullerror; 5475 5476 if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL) 5477 return (EINVAL); 5478 *offline_status = 0; 5479 if (un->c.un_status & MD_UN_OFFLINE_SM) 5480 *offline_status = 1; 5481 return (0); 5482 } 5483 5484 /* 5485 * NAME: mirror_inc_abr_count 5486 * 5487 * DESCRIPTION: increment the count of layered soft parts with ABR set 5488 * 5489 * Called from ioctl, so access to un_abr_count is protected by the global 5490 * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl. 5491 */ 5492 int 5493 mirror_inc_abr_count(md_dev64_t dev) 5494 { 5495 mm_unit_t *un; 5496 md_error_t mde = mdnullerror; 5497 5498 if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL) 5499 return (EINVAL); 5500 un->un_abr_count++; 5501 return (0); 5502 } 5503 5504 /* 5505 * NAME: mirror_dec_abr_count 5506 * 5507 * DESCRIPTION: decrement the count of layered soft parts with ABR set 5508 * 5509 * Called from ioctl, so access to un_abr_count is protected by the global 5510 * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl. 
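 *
 * This is the counterpart of mirror_inc_abr_count() above; both are
 * exported through the mirror_named_services table below (as
 * MD_INC_ABR_COUNT and MD_DEC_ABR_COUNT) and are expected to be
 * called in matching pairs so that un_abr_count tracks the number of
 * layered ABR soft parts currently placed on this mirror.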
5511 */ 5512 int 5513 mirror_dec_abr_count(md_dev64_t dev) 5514 { 5515 mm_unit_t *un; 5516 md_error_t mde = mdnullerror; 5517 5518 if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL) 5519 return (EINVAL); 5520 un->un_abr_count--; 5521 return (0); 5522 } 5523 5524 static md_named_services_t mirror_named_services[] = { 5525 {(intptr_t (*)()) poke_hotspares, "poke hotspares" }, 5526 {(intptr_t (*)()) mirror_rename_listkids, MDRNM_LIST_URKIDS }, 5527 {mirror_rename_check, MDRNM_CHECK }, 5528 {(intptr_t (*)()) mirror_renexch_update_kids, MDRNM_UPDATE_KIDS }, 5529 {(intptr_t (*)()) mirror_exchange_parent_update_to, 5530 MDRNM_PARENT_UPDATE_TO}, 5531 {(intptr_t (*)()) mirror_exchange_self_update_from_down, 5532 MDRNM_SELF_UPDATE_FROM_DOWN }, 5533 {(intptr_t (*)())mirror_probe_dev, "probe open test" }, 5534 {(intptr_t (*)())mirror_check_offline, MD_CHECK_OFFLINE }, 5535 {(intptr_t (*)())mirror_inc_abr_count, MD_INC_ABR_COUNT }, 5536 {(intptr_t (*)())mirror_dec_abr_count, MD_DEC_ABR_COUNT }, 5537 { NULL, 0 } 5538 }; 5539 5540 md_ops_t mirror_md_ops = { 5541 mirror_open, /* open */ 5542 mirror_close, /* close */ 5543 md_mirror_strategy, /* strategy */ 5544 NULL, /* print */ 5545 mirror_dump, /* dump */ 5546 NULL, /* read */ 5547 NULL, /* write */ 5548 md_mirror_ioctl, /* mirror_ioctl, */ 5549 mirror_snarf, /* mirror_snarf */ 5550 mirror_halt, /* mirror_halt */ 5551 NULL, /* aread */ 5552 NULL, /* awrite */ 5553 mirror_imp_set, /* import set */ 5554 mirror_named_services 5555 }; 5556 5557 /* module specific initilization */ 5558 static void 5559 init_init() 5560 { 5561 md_mirror_mcs_buf_off = sizeof (md_mcs_t) - sizeof (buf_t); 5562 5563 /* Initialize the parent and child save memory pools */ 5564 mirror_parent_cache = kmem_cache_create("md_mirror_parent", 5565 sizeof (md_mps_t), 0, mirror_parent_constructor, 5566 mirror_parent_destructor, mirror_run_queue, NULL, NULL, 5567 0); 5568 5569 mirror_child_cache = kmem_cache_create("md_mirror_child", 5570 sizeof (md_mcs_t) - sizeof (buf_t) + biosize(), 0, 5571 mirror_child_constructor, mirror_child_destructor, 5572 mirror_run_queue, NULL, NULL, 0); 5573 5574 /* 5575 * Insure wowbuf_size is a multiple of DEV_BSIZE, 5576 * then initialize wowbuf memory pool. 5577 */ 5578 md_wowbuf_size = roundup(md_wowbuf_size, DEV_BSIZE); 5579 if (md_wowbuf_size <= 0) 5580 md_wowbuf_size = 2 * DEV_BSIZE; 5581 if (md_wowbuf_size > (32 * DEV_BSIZE)) 5582 md_wowbuf_size = (32 * DEV_BSIZE); 5583 5584 md_wowblk_size = md_wowbuf_size + sizeof (wowhdr_t); 5585 mirror_wowblk_cache = kmem_cache_create("md_mirror_wow", 5586 md_wowblk_size, 0, NULL, NULL, NULL, NULL, NULL, 0); 5587 5588 mutex_init(&mirror_timeout.dr_mx, NULL, MUTEX_DEFAULT, NULL); 5589 mutex_init(&hotspare_request.dr_mx, NULL, MUTEX_DEFAULT, NULL); 5590 5591 mutex_init(&non_ff_drv_mutex, NULL, MUTEX_DEFAULT, NULL); 5592 } 5593 5594 /* module specific uninitilization (undo init_init()) */ 5595 static void 5596 fini_uninit() 5597 { 5598 kmem_cache_destroy(mirror_parent_cache); 5599 kmem_cache_destroy(mirror_child_cache); 5600 kmem_cache_destroy(mirror_wowblk_cache); 5601 mirror_parent_cache = mirror_child_cache = 5602 mirror_wowblk_cache = NULL; 5603 5604 mutex_destroy(&mirror_timeout.dr_mx); 5605 mutex_destroy(&hotspare_request.dr_mx); 5606 mutex_destroy(&non_ff_drv_mutex); 5607 } 5608 5609 /* define the module linkage */ 5610 MD_PLUGIN_MISC_MODULE("mirrors module", init_init(), fini_uninit()) 5611
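
/*
 * Worked example of the write-on-write buffer sizing performed in
 * init_init() above: with md_wowbuf_size tuned to 16384 and DEV_BSIZE
 * at its usual 512, roundup() leaves the value unchanged and it sits
 * exactly at the 32 * DEV_BSIZE upper clamp, so each entry in the
 * md_mirror_wow cache is 16384 bytes of data plus sizeof (wowhdr_t).
 * Non-positive tuned values fall back to 2 * DEV_BSIZE, and anything
 * above 32 * DEV_BSIZE is clamped down to that limit.
 */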