1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/param.h> 29 #include <sys/systm.h> 30 #include <sys/conf.h> 31 #include <sys/file.h> 32 #include <sys/user.h> 33 #include <sys/uio.h> 34 #include <sys/t_lock.h> 35 #include <sys/buf.h> 36 #include <sys/dkio.h> 37 #include <sys/vtoc.h> 38 #include <sys/kmem.h> 39 #include <vm/page.h> 40 #include <sys/cmn_err.h> 41 #include <sys/sysmacros.h> 42 #include <sys/types.h> 43 #include <sys/mkdev.h> 44 #include <sys/stat.h> 45 #include <sys/open.h> 46 #include <sys/modctl.h> 47 #include <sys/ddi.h> 48 #include <sys/sunddi.h> 49 #include <sys/debug.h> 50 #include <sys/dklabel.h> 51 #include <vm/hat.h> 52 #include <sys/lvm/md_mirror.h> 53 #include <sys/lvm/md_convert.h> 54 #include <sys/lvm/md_mddb.h> 55 #include <sys/esunddi.h> 56 57 #include <sys/sysevent/eventdefs.h> 58 #include <sys/sysevent/svm.h> 59 #include <sys/lvm/mdmn_commd.h> 60 61 md_ops_t mirror_md_ops; 62 #ifndef lint 63 char _depends_on[] = "drv/md"; 64 md_ops_t *md_interface_ops = &mirror_md_ops; 65 #endif 66 67 extern mdq_anchor_t md_done_daemon; 68 extern mdq_anchor_t md_mstr_daemon; 69 extern mdq_anchor_t md_mirror_daemon; 70 extern mdq_anchor_t md_mirror_io_daemon; 71 extern mdq_anchor_t md_mirror_rs_daemon; 72 extern mdq_anchor_t md_mhs_daemon; 73 74 extern unit_t md_nunits; 75 extern set_t md_nsets; 76 extern md_set_t md_set[]; 77 78 extern int md_status; 79 extern clock_t md_hz; 80 81 extern md_krwlock_t md_unit_array_rw; 82 extern kmutex_t md_mx; 83 extern kcondvar_t md_cv; 84 extern int md_mtioctl_cnt; 85 86 daemon_request_t mirror_timeout; 87 static daemon_request_t hotspare_request; 88 static daemon_request_t mn_hs_request[MD_MAXSETS]; /* Multinode hs req */ 89 90 int md_mirror_mcs_buf_off; 91 92 /* Flags for mdmn_ksend_message to allow debugging */ 93 int md_mirror_msg_flags; 94 95 #ifdef DEBUG 96 /* Flag to switch on debug messages */ 97 int mirror_debug_flag = 0; 98 #endif 99 100 /* 101 * Struct used to hold count of DMR reads and the timestamp of last DMR read 102 * It is used to verify, using a debugger, that the DMR read ioctl has been 103 * executed. 104 */ 105 dmr_stats_t mirror_dmr_stats = {0, 0}; 106 107 /* 108 * Mutex protecting list of non-failfast drivers. 109 */ 110 static kmutex_t non_ff_drv_mutex; 111 static char **non_ff_drivers = NULL; 112 113 extern major_t md_major; 114 115 /* 116 * Write-On-Write memory pool. 
117 */ 118 static void copy_write_cont(wowhdr_t *wowhdr); 119 static kmem_cache_t *mirror_wowblk_cache = NULL; 120 static int md_wowbuf_size = 16384; 121 static size_t md_wowblk_size; 122 123 /* 124 * This is a flag that allows: 125 * - disabling the write-on-write mechanism. 126 * - logging occurrences of write-on-write 127 * - switching wow handling procedure processing 128 * Counter for occurences of WOW. 129 */ 130 static uint_t md_mirror_wow_flg = 0; 131 static int md_mirror_wow_cnt = 0; 132 133 /* 134 * Tunable to enable/disable dirty region 135 * processing when closing down a mirror. 136 */ 137 static int new_resync = 1; 138 kmem_cache_t *mirror_parent_cache = NULL; 139 kmem_cache_t *mirror_child_cache = NULL; 140 141 extern int md_ff_disable; /* disable failfast */ 142 143 static int mirror_map_write(mm_unit_t *, md_mcs_t *, md_mps_t *, int); 144 static void mirror_read_strategy(buf_t *, int, void *); 145 static void mirror_write_strategy(buf_t *, int, void *); 146 static void become_owner(daemon_queue_t *); 147 static int mirror_done(struct buf *cb); 148 static int mirror_done_common(struct buf *cb); 149 static void clear_retry_error(struct buf *cb); 150 151 /* 152 * patchables 153 */ 154 int md_min_rr_size = 200; /* 2000 blocks, or 100k */ 155 int md_def_num_rr = 1000; /* Default number of dirty regions */ 156 157 /* 158 * patchable to change delay before rescheduling mirror ownership request. 159 * Value is clock ticks, default 0.5 seconds 160 */ 161 clock_t md_mirror_owner_to = 500000; 162 163 /*ARGSUSED1*/ 164 static int 165 mirror_parent_constructor(void *p, void *d1, int d2) 166 { 167 mutex_init(&((md_mps_t *)p)->ps_mx, NULL, MUTEX_DEFAULT, NULL); 168 return (0); 169 } 170 171 static void 172 mirror_parent_init(md_mps_t *ps) 173 { 174 bzero(ps, offsetof(md_mps_t, ps_mx)); 175 } 176 177 /*ARGSUSED1*/ 178 static void 179 mirror_parent_destructor(void *p, void *d) 180 { 181 mutex_destroy(&((md_mps_t *)p)->ps_mx); 182 } 183 184 /*ARGSUSED1*/ 185 static int 186 mirror_child_constructor(void *p, void *d1, int d2) 187 { 188 bioinit(&((md_mcs_t *)p)->cs_buf); 189 return (0); 190 } 191 192 void 193 mirror_child_init(md_mcs_t *cs) 194 { 195 cs->cs_ps = NULL; 196 cs->cs_mdunit = 0; 197 md_bioreset(&cs->cs_buf); 198 } 199 200 /*ARGSUSED1*/ 201 static void 202 mirror_child_destructor(void *p, void *d) 203 { 204 biofini(&((md_mcs_t *)p)->cs_buf); 205 } 206 207 static void 208 mirror_wowblk_init(wowhdr_t *p) 209 { 210 bzero(p, md_wowblk_size); 211 } 212 213 static void 214 send_poke_hotspares_msg(daemon_request_t *drq) 215 { 216 int rval; 217 md_mn_msg_pokehsp_t pokehsp; 218 md_mn_kresult_t *kresult; 219 set_t setno = (set_t)drq->dq.qlen; 220 221 pokehsp.pokehsp_setno = setno; 222 223 kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); 224 rval = mdmn_ksend_message(setno, MD_MN_MSG_POKE_HOTSPARES, 225 MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST, (char *)&pokehsp, 226 sizeof (pokehsp), kresult); 227 228 if (!MDMN_KSEND_MSG_OK(rval, kresult)) { 229 mdmn_ksend_show_error(rval, kresult, "POKE_HOTSPARES"); 230 cmn_err(CE_PANIC, 231 "ksend_message failure: POKE_HOTSPARES"); 232 } 233 kmem_free(kresult, sizeof (md_mn_kresult_t)); 234 235 /* Allow further requests to use this set's queue structure */ 236 mutex_enter(&drq->dr_mx); 237 drq->dr_pending = 0; 238 mutex_exit(&drq->dr_mx); 239 } 240 241 /* 242 * Send a poke_hotspares message to the master node. To avoid swamping the 243 * commd handler with requests we only send a message if there is not one 244 * already outstanding. 
We punt the request to a separate thread context as 245 * cannot afford to block waiting on the request to be serviced. This is 246 * essential when a reconfig cycle is in progress as any open() of a multinode 247 * metadevice may result in a livelock. 248 */ 249 static void 250 send_poke_hotspares(set_t setno) 251 { 252 daemon_request_t *drq = &mn_hs_request[setno]; 253 254 mutex_enter(&drq->dr_mx); 255 if (drq->dr_pending == 0) { 256 drq->dr_pending = 1; 257 drq->dq.qlen = (int)setno; 258 daemon_request(&md_mhs_daemon, 259 send_poke_hotspares_msg, (daemon_queue_t *)drq, REQ_OLD); 260 } 261 mutex_exit(&drq->dr_mx); 262 } 263 264 void 265 mirror_set_sm_state( 266 mm_submirror_t *sm, 267 mm_submirror_ic_t *smic, 268 sm_state_t newstate, 269 int force) 270 { 271 int compcnt; 272 int i; 273 int errcnt; 274 sm_state_t origstate; 275 md_m_shared_t *shared; 276 277 if (force) { 278 sm->sm_state = newstate; 279 uniqtime32(&sm->sm_timestamp); 280 return; 281 } 282 283 origstate = newstate; 284 285 compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm); 286 for (i = 0, errcnt = 0; i < compcnt; i++) { 287 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) 288 (sm->sm_dev, sm, i); 289 if (shared->ms_state & (CS_ERRED | CS_LAST_ERRED)) 290 newstate |= SMS_COMP_ERRED; 291 if (shared->ms_state & (CS_RESYNC)) 292 newstate |= SMS_COMP_RESYNC; 293 if (shared->ms_state & CS_ERRED) 294 errcnt++; 295 } 296 297 if ((newstate & (SMS_COMP_ERRED | SMS_COMP_RESYNC)) != 0) 298 newstate &= ~origstate; 299 300 if (errcnt == compcnt) 301 newstate |= SMS_ALL_ERRED; 302 else 303 newstate &= ~SMS_ALL_ERRED; 304 305 sm->sm_state = newstate; 306 uniqtime32(&sm->sm_timestamp); 307 } 308 309 static int 310 mirror_geterror(mm_unit_t *un, int *smi, int *cip, int clr_error, 311 int frm_probe) 312 { 313 mm_submirror_t *sm; 314 mm_submirror_ic_t *smic; 315 md_m_shared_t *shared; 316 int ci; 317 int i; 318 int compcnt; 319 int open_comp; /* flag for open component */ 320 321 for (i = *smi; i < NMIRROR; i++) { 322 sm = &un->un_sm[i]; 323 smic = &un->un_smic[i]; 324 325 if (!SMS_IS(sm, SMS_INUSE)) 326 continue; 327 328 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un); 329 for (ci = *cip; ci < compcnt; ci++) { 330 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) 331 (sm->sm_dev, sm, ci); 332 /* 333 * if called from any routine but probe, we check for 334 * MDM_S_ISOPEN flag. Since probe does a pseduo open, 335 * it sets MDM_S_PROBEOPEN flag and we test for this 336 * flag. They are both exclusive tests. 337 */ 338 open_comp = (frm_probe) ? 339 (shared->ms_flags & MDM_S_PROBEOPEN): 340 (shared->ms_flags & MDM_S_ISOPEN); 341 if ((shared->ms_flags & MDM_S_IOERR || !open_comp) && 342 ((shared->ms_state == CS_OKAY) || 343 (shared->ms_state == CS_RESYNC))) { 344 if (clr_error) { 345 shared->ms_flags &= ~MDM_S_IOERR; 346 } 347 *cip = ci; 348 *smi = i; 349 return (1); 350 } 351 352 if (clr_error && (shared->ms_flags & MDM_S_IOERR)) { 353 shared->ms_flags &= ~MDM_S_IOERR; 354 } 355 } 356 357 *cip = 0; 358 } 359 return (0); 360 } 361 362 /*ARGSUSED*/ 363 static void 364 mirror_run_queue(void *d) 365 { 366 if (!(md_status & MD_GBL_DAEMONS_LIVE)) 367 md_daemon(1, &md_done_daemon); 368 } 369 /* 370 * check_comp_4_hotspares 371 * 372 * This function attempts to allocate a hotspare for this component if the 373 * component is in error. In a MN set, the function can be called in 2 modes. 374 * It can be called either when a component error has been detected or when a 375 * new hotspare has been allocated. 
In this case, MD_HOTSPARE_XMIT is set 376 * in flags and the request is sent to all nodes. 377 * The handler on each of the nodes then calls this function with 378 * MD_HOTSPARE_XMIT unset and the hotspare allocation is then performed. 379 * 380 * For non-MN sets the function simply attempts to allocate a hotspare. 381 * 382 * On entry, the following locks are held 383 * mirror_md_ops.md_link_rw (if flags has MD_HOTSPARE_LINKHELD set) 384 * md_unit_writerlock 385 * 386 * Returns 0 if ok 387 * 1 if the unit containing the component has been cleared while 388 * the mdmn_ksend_message() was being executed 389 */ 390 extern int 391 check_comp_4_hotspares( 392 mm_unit_t *un, 393 int smi, 394 int ci, 395 uint_t flags, 396 mddb_recid_t hs_id, /* Only used by MN disksets */ 397 IOLOCK *lockp /* can be NULL */ 398 ) 399 { 400 mm_submirror_t *sm; 401 mm_submirror_ic_t *smic; 402 md_m_shared_t *shared; 403 mddb_recid_t recids[6]; 404 minor_t mnum; 405 intptr_t (*hs_dev)(); 406 void (*hs_done)(); 407 void *hs_data; 408 md_error_t mde = mdnullerror; 409 set_t setno; 410 md_mn_msg_allochsp_t allochspmsg; 411 md_mn_kresult_t *kresult; 412 mm_unit_t *new_un; 413 int rval; 414 415 mnum = MD_SID(un); 416 setno = MD_UN2SET(un); 417 sm = &un->un_sm[smi]; 418 smic = &un->un_smic[smi]; 419 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) 420 (sm->sm_dev, sm, ci); 421 422 if (shared->ms_state != CS_ERRED) 423 return (0); 424 425 /* Don't start a new component resync if a resync is already running. */ 426 if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) 427 return (0); 428 429 if (MD_MNSET_SETNO(setno) && (flags & MD_HOTSPARE_XMIT)) { 430 uint_t msgflags; 431 md_mn_msgtype_t msgtype; 432 433 /* Send allocate hotspare message to all nodes */ 434 435 allochspmsg.msg_allochsp_mnum = un->c.un_self_id; 436 allochspmsg.msg_allochsp_sm = smi; 437 allochspmsg.msg_allochsp_comp = ci; 438 allochspmsg.msg_allochsp_hs_id = shared->ms_hs_id; 439 440 /* 441 * Before calling mdmn_ksend_message(), release locks 442 * Can never be in the context of an ioctl. 443 */ 444 md_unit_writerexit(MDI_UNIT(mnum)); 445 if (flags & MD_HOTSPARE_LINKHELD) 446 rw_exit(&mirror_md_ops.md_link_rw.lock); 447 #ifdef DEBUG 448 if (mirror_debug_flag) 449 printf("send alloc hotspare, flags=0x%x %x, %x, %x, %x\n", 450 flags, 451 allochspmsg.msg_allochsp_mnum, 452 allochspmsg.msg_allochsp_sm, 453 allochspmsg.msg_allochsp_comp, 454 allochspmsg.msg_allochsp_hs_id); 455 #endif 456 if (flags & MD_HOTSPARE_WMUPDATE) { 457 msgtype = MD_MN_MSG_ALLOCATE_HOTSPARE2; 458 /* 459 * When coming from an update of watermarks, there 460 * must already be a message logged that triggered 461 * this action. So, no need to log this message, too. 462 */ 463 msgflags = MD_MSGF_NO_LOG; 464 } else { 465 msgtype = MD_MN_MSG_ALLOCATE_HOTSPARE; 466 msgflags = MD_MSGF_DEFAULT_FLAGS; 467 } 468 469 kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); 470 rval = mdmn_ksend_message(setno, msgtype, msgflags, 471 (char *)&allochspmsg, sizeof (allochspmsg), 472 kresult); 473 474 if (!MDMN_KSEND_MSG_OK(rval, kresult)) { 475 #ifdef DEBUG 476 if (mirror_debug_flag) 477 mdmn_ksend_show_error(rval, kresult, 478 "ALLOCATE HOTSPARE"); 479 #endif 480 /* 481 * If message is sent ok but exitval indicates an error 482 * it must be because the mirror has been cleared. 
In 483 * this case re-obtain lock and return an error 484 */ 485 if ((rval == 0) && (kresult->kmmr_exitval != 0)) { 486 if (flags & MD_HOTSPARE_LINKHELD) { 487 rw_enter(&mirror_md_ops.md_link_rw.lock, 488 RW_READER); 489 } 490 kmem_free(kresult, sizeof (md_mn_kresult_t)); 491 return (1); 492 } 493 cmn_err(CE_PANIC, 494 "ksend_message failure: ALLOCATE_HOTSPARE"); 495 } 496 kmem_free(kresult, sizeof (md_mn_kresult_t)); 497 498 /* 499 * re-obtain the locks 500 */ 501 if (flags & MD_HOTSPARE_LINKHELD) 502 rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER); 503 new_un = md_unit_writerlock(MDI_UNIT(mnum)); 504 505 /* 506 * As we had to release the locks in order to send the 507 * message to all nodes, we need to check to see if the 508 * unit has changed. If it has we release the writerlock 509 * and return fail. 510 */ 511 if ((new_un != un) || (un->c.un_type != MD_METAMIRROR)) { 512 md_unit_writerexit(MDI_UNIT(mnum)); 513 return (1); 514 } 515 } else { 516 if (MD_MNSET_SETNO(setno)) { 517 /* 518 * If 2 or more nodes simultaneously see a 519 * component failure, these nodes will each 520 * send an ALLOCATE_HOTSPARE[2] message. 521 * The first message will allocate the hotspare 522 * and the subsequent messages should do nothing. 523 * 524 * If a slave node doesn't have a hotspare allocated 525 * at the time the message is initiated, then the 526 * passed in hs_id will be 0. If the node 527 * executing this routine has a component shared 528 * ms_hs_id of non-zero, but the message shows a 529 * hs_id of 0, then just return since a hotspare 530 * has already been allocated for this failing 531 * component. When the slave node returns from 532 * the ksend_message the hotspare will have 533 * already been allocated. 534 * 535 * If the slave node does send an hs_id of non-zero, 536 * and the slave node's hs_id matches this node's 537 * ms_hs_id, then the hotspare has error'd and 538 * should be replaced. 539 * 540 * If the slave node sends an hs_id of non-zero and 541 * this node has a different shared ms_hs_id, then 542 * just return since this hotspare has already 543 * been hotspared. 544 */ 545 if (shared->ms_hs_id != 0) { 546 if (hs_id == 0) { 547 #ifdef DEBUG 548 if (mirror_debug_flag) { 549 printf("check_comp_4_hotspares" 550 "(NOXMIT), short circuit " 551 "hs_id=0x%x, " 552 "ms_hs_id=0x%x\n", 553 hs_id, shared->ms_hs_id); 554 } 555 #endif 556 return (0); 557 } 558 if (hs_id != shared->ms_hs_id) { 559 #ifdef DEBUG 560 if (mirror_debug_flag) { 561 printf("check_comp_4_hotspares" 562 "(NOXMIT), short circuit2 " 563 "hs_id=0x%x, " 564 "ms_hs_id=0x%x\n", 565 hs_id, shared->ms_hs_id); 566 } 567 #endif 568 return (0); 569 } 570 } 571 } 572 573 sm = &un->un_sm[smi]; 574 hs_dev = md_get_named_service(sm->sm_dev, 0, 575 "hotspare device", 0); 576 if ((*hs_dev)(sm->sm_dev, 0, ci, recids, 6, &hs_done, 577 &hs_data) != 0) 578 return (0); 579 580 /* 581 * set_sm_comp_state() commits the modified records. 582 * As we don't transmit the changes, no need to drop the lock. 583 */ 584 set_sm_comp_state(un, smi, ci, CS_RESYNC, recids, 585 MD_STATE_NO_XMIT, (IOLOCK *)NULL); 586 587 (*hs_done)(sm->sm_dev, hs_data); 588 589 mirror_check_failfast(mnum); 590 591 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_HOTSPARED, SVM_TAG_METADEVICE, 592 setno, MD_SID(un)); 593 594 /* 595 * For a multi-node set we need to reset the un_rs_type, 596 * un_rs_resync_done and un_rs_resync_2_do fields as the 597 * hot-spare resync must copy all applicable data. 
598 */ 599 if (MD_MNSET_SETNO(setno)) { 600 un->un_rs_type = MD_RS_NONE; 601 un->un_rs_resync_done = 0; 602 un->un_rs_resync_2_do = 0; 603 } 604 605 /* 606 * Must drop writer lock since mirror_resync_unit will 607 * open devices and must be able to grab readerlock. 608 * Don't need to drop IOLOCK since any descendent routines 609 * calling ksend_messages will drop the IOLOCK as needed. 610 * 611 */ 612 if (lockp) { 613 md_ioctl_writerexit(lockp); 614 } else { 615 md_unit_writerexit(MDI_UNIT(mnum)); 616 } 617 618 /* start resync */ 619 (void) mirror_resync_unit(mnum, NULL, &mde, lockp); 620 621 if (lockp) { 622 new_un = md_ioctl_writerlock(lockp, MDI_UNIT(mnum)); 623 } else { 624 new_un = md_unit_writerlock(MDI_UNIT(mnum)); 625 } 626 } 627 return (0); 628 } 629 630 /* 631 * check_unit_4_hotspares 632 * 633 * For a given mirror, allocate hotspares, if available for any components 634 * that are in error 635 * 636 * Returns 0 if ok 637 * 1 if check_comp_4_hotspares returns non-zero. This will only 638 * happen for a MN unit where the unit has been cleared while 639 * the allocate hotspare message is sent to all nodes. 640 */ 641 static int 642 check_unit_4_hotspares(mm_unit_t *un, int flags) 643 { 644 mm_submirror_t *sm; 645 mm_submirror_ic_t *smic; 646 int ci; 647 int i; 648 int compcnt; 649 650 if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) 651 return (0); 652 653 for (i = 0; i < NMIRROR; i++) { 654 sm = &un->un_sm[i]; 655 smic = &un->un_smic[i]; 656 if (!SMS_IS(sm, SMS_INUSE)) 657 continue; 658 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, sm); 659 for (ci = 0; ci < compcnt; ci++) { 660 md_m_shared_t *shared; 661 662 shared = (md_m_shared_t *) 663 (*(smic->sm_shared_by_indx))(sm->sm_dev, 664 sm, ci); 665 /* 666 * Never called from ioctl context, so pass in 667 * (IOLOCK *)NULL. Pass through flags from calling 668 * routine, also setting XMIT flag. 669 */ 670 if (check_comp_4_hotspares(un, i, ci, 671 (MD_HOTSPARE_XMIT | flags), 672 shared->ms_hs_id, (IOLOCK *)NULL) != 0) 673 return (1); 674 } 675 } 676 return (0); 677 } 678 679 static void 680 check_4_hotspares(daemon_request_t *drq) 681 { 682 mdi_unit_t *ui; 683 mm_unit_t *un; 684 md_link_t *next; 685 int x; 686 687 mutex_enter(&drq->dr_mx); /* clear up front so can poke */ 688 drq->dr_pending = 0; /* again in low level routine if */ 689 mutex_exit(&drq->dr_mx); /* something found to do */ 690 691 /* 692 * Used to have a problem here. The disksets weren't marked as being 693 * MNHOLD. This opened a window where we could be searching for 694 * hotspares and have the disk set unloaded (released) from under 695 * us causing a panic in stripe_component_count(). 696 * The way to prevent that is to mark the set MNHOLD which prevents 697 * any diskset from being released while we are scanning the mirrors, 698 * submirrors and components. 
699 */ 700 701 for (x = 0; x < md_nsets; x++) 702 md_holdset_enter(x); 703 704 rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER); 705 for (next = mirror_md_ops.md_head; next != NULL; next = next->ln_next) { 706 ui = MDI_UNIT(next->ln_id); 707 708 un = (mm_unit_t *)md_unit_readerlock(ui); 709 710 /* 711 * Only check the unit if we are the master for this set 712 * For an MN set, poke_hotspares() is only effective on the 713 * master 714 */ 715 if (MD_MNSET_SETNO(MD_UN2SET(un)) && 716 md_set[MD_UN2SET(un)].s_am_i_master == 0) { 717 md_unit_readerexit(ui); 718 continue; 719 } 720 if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) { 721 md_unit_readerexit(ui); 722 continue; 723 } 724 md_unit_readerexit(ui); 725 726 un = (mm_unit_t *)md_unit_writerlock(ui); 727 /* 728 * check_unit_4_hotspares will exit 1 if the unit has been 729 * removed during the process of allocating the hotspare. 730 * This can only happen for a MN metadevice. If unit no longer 731 * exists, no need to release writerlock 732 */ 733 if (check_unit_4_hotspares(un, MD_HOTSPARE_LINKHELD) == 0) 734 md_unit_writerexit(ui); 735 else { 736 /* 737 * If check_unit_4_hotspares failed, queue another 738 * request and break out of this one 739 */ 740 (void) poke_hotspares(); 741 break; 742 } 743 } 744 rw_exit(&mirror_md_ops.md_link_rw.lock); 745 746 for (x = 0; x < md_nsets; x++) 747 md_holdset_exit(x); 748 } 749 750 /* 751 * poke_hotspares 752 * 753 * If there is not a pending poke_hotspares request pending, queue a requent 754 * to call check_4_hotspares(). This will scan all mirrors and attempt to 755 * allocate hotspares for all components in error. 756 */ 757 int 758 poke_hotspares() 759 { 760 mutex_enter(&hotspare_request.dr_mx); 761 if (hotspare_request.dr_pending == 0) { 762 hotspare_request.dr_pending = 1; 763 daemon_request(&md_mhs_daemon, 764 check_4_hotspares, 765 (daemon_queue_t *)&hotspare_request, REQ_OLD); 766 } 767 mutex_exit(&hotspare_request.dr_mx); 768 return (0); 769 } 770 771 static void 772 free_all_ecomps(err_comp_t *ecomp) 773 { 774 err_comp_t *d; 775 776 while (ecomp != NULL) { 777 d = ecomp; 778 ecomp = ecomp->ec_next; 779 kmem_free(d, sizeof (err_comp_t)); 780 } 781 } 782 783 /* 784 * NAME: mirror_openfail_console_info 785 * 786 * DESCRIPTION: Prints a informative message to the console when mirror 787 * cannot be opened. 788 * 789 * PARAMETERS: mm_unit_t un - pointer to mirror unit structure 790 * int smi - submirror index 791 * int ci - component index 792 */ 793 794 void 795 mirror_openfail_console_info(mm_unit_t *un, int smi, int ci) 796 { 797 void (*get_dev)(); 798 ms_cd_info_t cd; 799 md_dev64_t tmpdev; 800 801 tmpdev = un->un_sm[smi].sm_dev; 802 get_dev = (void (*)())md_get_named_service(tmpdev, 0, "get device", 0); 803 if (get_dev != NULL) { 804 (void) (*get_dev)(tmpdev, smi, ci, &cd); 805 cmn_err(CE_WARN, "md %s: open error on %s", 806 md_shortname(MD_SID(un)), 807 md_devname(MD_UN2SET(un), cd.cd_dev, 808 NULL, 0)); 809 } else { 810 cmn_err(CE_WARN, "md %s: open error", 811 md_shortname(MD_SID(un))); 812 } 813 } 814 815 static int 816 mirror_close_all_devs(mm_unit_t *un, int md_cflags) 817 { 818 int i; 819 md_dev64_t dev; 820 821 for (i = 0; i < NMIRROR; i++) { 822 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) 823 continue; 824 dev = un->un_sm[i].sm_dev; 825 md_layered_close(dev, md_cflags); 826 } 827 return (0); 828 } 829 830 /* 831 * Keep track of drivers that don't support failfast. 
We use this so that 832 * we only log one diagnostic message for each of these drivers, no matter 833 * how many times we run the mirror_check_failfast function. 834 * Return 1 if this is a new driver that does not support failfast, 835 * return 0 if we have already seen this non-failfast driver. 836 */ 837 static int 838 new_non_ff_driver(const char *s) 839 { 840 mutex_enter(&non_ff_drv_mutex); 841 if (non_ff_drivers == NULL) { 842 non_ff_drivers = (char **)kmem_alloc(2 * sizeof (char *), 843 KM_NOSLEEP); 844 if (non_ff_drivers == NULL) { 845 mutex_exit(&non_ff_drv_mutex); 846 return (1); 847 } 848 849 non_ff_drivers[0] = (char *)kmem_alloc(strlen(s) + 1, KM_NOSLEEP); 850 if (non_ff_drivers[0] == NULL) { 851 kmem_free(non_ff_drivers, 2 * sizeof (char *)); 852 non_ff_drivers = NULL; 853 mutex_exit(&non_ff_drv_mutex); 854 return (1); 855 } 856 857 (void) strcpy(non_ff_drivers[0], s); 858 non_ff_drivers[1] = NULL; 859 860 } else { 861 int i; 862 char **tnames; 863 char **tmp; 864 865 for (i = 0; non_ff_drivers[i] != NULL; i++) { 866 if (strcmp(s, non_ff_drivers[i]) == 0) { 867 mutex_exit(&non_ff_drv_mutex); 868 return (0); 869 } 870 } 871 872 /* allow for new element and null */ 873 i += 2; 874 tnames = (char **)kmem_alloc(i * sizeof (char *), KM_NOSLEEP); 875 if (tnames == NULL) { 876 mutex_exit(&non_ff_drv_mutex); 877 return (1); 878 } 879 880 for (i = 0; non_ff_drivers[i] != NULL; i++) 881 tnames[i] = non_ff_drivers[i]; 882 883 tnames[i] = (char *)kmem_alloc(strlen(s) + 1, KM_NOSLEEP); 884 if (tnames[i] == NULL) { 885 /* adjust i so that it is the right count to free */ 886 kmem_free(tnames, (i + 2) * sizeof (char *)); 887 mutex_exit(&non_ff_drv_mutex); 888 return (1); 889 } 890 891 (void) strcpy(tnames[i++], s); 892 tnames[i] = NULL; 893 894 tmp = non_ff_drivers; 895 non_ff_drivers = tnames; 896 /* i now represents the count we previously alloced */ 897 kmem_free(tmp, i * sizeof (char *)); 898 } 899 mutex_exit(&non_ff_drv_mutex); 900 901 return (1); 902 } 903 904 /* 905 * Check for the "ddi-failfast-supported" devtree property on each submirror 906 * component to indicate if we should do I/O to that submirror with the 907 * B_FAILFAST flag set or not. This check is made at various state transitions 908 * in the mirror code (e.g. open, enable, hotspare, etc.). Sometimes we 909 * only need to check one drive (e.g. hotspare) but since the check is 910 * fast and infrequent and sometimes needs to be done on all components we 911 * just check all components on each call. 
912 */ 913 void 914 mirror_check_failfast(minor_t mnum) 915 { 916 int i; 917 mm_unit_t *un; 918 919 if (md_ff_disable) 920 return; 921 922 un = MD_UNIT(mnum); 923 924 for (i = 0; i < NMIRROR; i++) { 925 int ci; 926 int cnt; 927 int ff = 1; 928 mm_submirror_t *sm; 929 mm_submirror_ic_t *smic; 930 void (*get_dev)(); 931 932 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) 933 continue; 934 935 sm = &un->un_sm[i]; 936 smic = &un->un_smic[i]; 937 938 get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0, 939 "get device", 0); 940 941 cnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm); 942 for (ci = 0; ci < cnt; ci++) { 943 int found = 0; 944 dev_t ci_dev; 945 major_t major; 946 dev_info_t *devi; 947 ms_cd_info_t cd; 948 949 /* this already returns the hs dev if the device is spared */ 950 (void) (*get_dev)(sm->sm_dev, sm, ci, &cd); 951 952 ci_dev = md_dev64_to_dev(cd.cd_dev); 953 major = getmajor(ci_dev); 954 955 if (major == md_major) { 956 /* this component must be a soft partition; get real dev */ 957 minor_t dev_mnum; 958 mdi_unit_t *ui; 959 mp_unit_t *un; 960 set_t setno; 961 side_t side; 962 md_dev64_t tmpdev; 963 964 ui = MDI_UNIT(getminor(ci_dev)); 965 966 /* grab necessary lock */ 967 un = (mp_unit_t *)md_unit_readerlock(ui); 968 969 dev_mnum = MD_SID(un); 970 setno = MD_MIN2SET(dev_mnum); 971 side = mddb_getsidenum(setno); 972 973 tmpdev = un->un_dev; 974 975 /* Get dev by device id */ 976 if (md_devid_found(setno, side, un->un_key) == 1) { 977 tmpdev = md_resolve_bydevid(dev_mnum, tmpdev, 978 un->un_key); 979 } 980 981 md_unit_readerexit(ui); 982 983 ci_dev = md_dev64_to_dev(tmpdev); 984 major = getmajor(ci_dev); 985 } 986 987 if (ci_dev != NODEV32 && 988 (devi = e_ddi_hold_devi_by_dev(ci_dev, 0)) != NULL) { 989 ddi_prop_op_t prop_op = PROP_LEN_AND_VAL_BUF; 990 int propvalue = 0; 991 int proplength = sizeof (int); 992 int error; 993 struct cb_ops *cb; 994 995 if ((cb = devopsp[major]->devo_cb_ops) != NULL) { 996 error = (*cb->cb_prop_op)(DDI_DEV_T_ANY, devi, prop_op, 997 DDI_PROP_NOTPROM|DDI_PROP_DONTPASS, 998 "ddi-failfast-supported", 999 (caddr_t)&propvalue, &proplength); 1000 1001 if (error == DDI_PROP_SUCCESS) 1002 found = 1; 1003 } 1004 1005 if (!found && new_non_ff_driver(ddi_driver_name(devi))) 1006 cmn_err(CE_NOTE, "!md: B_FAILFAST I/O disabled on %s", 1007 ddi_driver_name(devi)); 1008 1009 ddi_release_devi(devi); 1010 } 1011 1012 /* All components must support failfast in the submirror. */ 1013 if (!found) { 1014 ff = 0; 1015 break; 1016 } 1017 } 1018 1019 if (ff) { 1020 sm->sm_flags |= MD_SM_FAILFAST; 1021 } else { 1022 sm->sm_flags &= ~MD_SM_FAILFAST; 1023 } 1024 } 1025 } 1026 1027 /* 1028 * Return true if the submirror is unavailable. 1029 * If any of the submirror components are opened then the submirror cannot 1030 * be unavailable (MD_INACCESSIBLE). 1031 * If any of the components are already in the errored state, then the submirror 1032 * cannot be unavailable (MD_INACCESSIBLE). 
1033 */ 1034 static bool_t 1035 submirror_unavailable(mm_unit_t *un, int smi, int from_probe) 1036 { 1037 mm_submirror_t *sm; 1038 mm_submirror_ic_t *smic; 1039 md_m_shared_t *shared; 1040 int ci; 1041 int compcnt; 1042 1043 sm = &un->un_sm[smi]; 1044 smic = &un->un_smic[smi]; 1045 1046 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un); 1047 for (ci = 0; ci < compcnt; ci++) { 1048 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) 1049 (sm->sm_dev, sm, ci); 1050 if (from_probe) { 1051 if (shared->ms_flags & MDM_S_PROBEOPEN) 1052 return (B_FALSE); 1053 } else { 1054 if (shared->ms_flags & MDM_S_ISOPEN) 1055 return (B_FALSE); 1056 } 1057 if (shared->ms_state == CS_ERRED || 1058 shared->ms_state == CS_LAST_ERRED) 1059 return (B_FALSE); 1060 } 1061 1062 return (B_TRUE); 1063 } 1064 1065 static int 1066 mirror_open_all_devs(minor_t mnum, int md_oflags, IOLOCK *lockp) 1067 { 1068 int i; 1069 mm_unit_t *un; 1070 mdi_unit_t *ui; 1071 int err; 1072 int smi; 1073 int ci; 1074 err_comp_t *c; 1075 err_comp_t *ecomps = NULL; 1076 int smmask = 0; 1077 set_t setno; 1078 int sm_cnt; 1079 int sm_unavail_cnt; 1080 1081 mirror_check_failfast(mnum); 1082 1083 un = MD_UNIT(mnum); 1084 ui = MDI_UNIT(mnum); 1085 setno = MD_UN2SET(un); 1086 1087 for (i = 0; i < NMIRROR; i++) { 1088 md_dev64_t tmpdev = un->un_sm[i].sm_dev; 1089 1090 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) 1091 continue; 1092 if (md_layered_open(mnum, &tmpdev, md_oflags)) 1093 smmask |= SMI2BIT(i); 1094 un->un_sm[i].sm_dev = tmpdev; 1095 } 1096 1097 /* 1098 * If smmask is clear, all submirrors are accessible. Clear the 1099 * MD_INACCESSIBLE bit in this case. This bit is also cleared for the 1100 * mirror device. If smmask is set, we have to determine which of the 1101 * submirrors are in error. If no submirror is accessible we mark the 1102 * whole mirror as MD_INACCESSIBLE. 1103 */ 1104 if (smmask == 0) { 1105 if (lockp) { 1106 md_ioctl_readerexit(lockp); 1107 (void) md_ioctl_writerlock(lockp, ui); 1108 } else { 1109 md_unit_readerexit(ui); 1110 (void) md_unit_writerlock(ui); 1111 } 1112 ui->ui_tstate &= ~MD_INACCESSIBLE; 1113 if (lockp) { 1114 md_ioctl_writerexit(lockp); 1115 (void) md_ioctl_readerlock(lockp, ui); 1116 } else { 1117 md_unit_writerexit(ui); 1118 (void) md_unit_readerlock(ui); 1119 } 1120 1121 for (i = 0; i < NMIRROR; i++) { 1122 md_dev64_t tmpdev; 1123 mdi_unit_t *sm_ui; 1124 1125 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) 1126 continue; 1127 1128 tmpdev = un->un_sm[i].sm_dev; 1129 sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev))); 1130 (void) md_unit_writerlock(sm_ui); 1131 sm_ui->ui_tstate &= ~MD_INACCESSIBLE; 1132 md_unit_writerexit(sm_ui); 1133 } 1134 1135 return (0); 1136 } 1137 1138 for (i = 0; i < NMIRROR; i++) { 1139 md_dev64_t tmpdev; 1140 1141 if (!(smmask & SMI2BIT(i))) 1142 continue; 1143 1144 tmpdev = un->un_sm[i].sm_dev; 1145 err = md_layered_open(mnum, &tmpdev, MD_OFLG_CONT_ERRS); 1146 un->un_sm[i].sm_dev = tmpdev; 1147 ASSERT(err == 0); 1148 } 1149 1150 if (lockp) { 1151 md_ioctl_readerexit(lockp); 1152 un = (mm_unit_t *)md_ioctl_writerlock(lockp, ui); 1153 } else { 1154 md_unit_readerexit(ui); 1155 un = (mm_unit_t *)md_unit_writerlock(ui); 1156 } 1157 1158 /* 1159 * We want to make sure the unavailable flag is not masking a real 1160 * error on the submirror. 1161 * For each submirror, 1162 * if all of the submirror components couldn't be opened and there 1163 * are no errors on the submirror, then set the unavailable flag 1164 * otherwise, clear unavailable. 
1165 */ 1166 sm_cnt = 0; 1167 sm_unavail_cnt = 0; 1168 for (i = 0; i < NMIRROR; i++) { 1169 md_dev64_t tmpdev; 1170 mdi_unit_t *sm_ui; 1171 1172 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) 1173 continue; 1174 1175 sm_cnt++; 1176 tmpdev = un->un_sm[i].sm_dev; 1177 sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev))); 1178 1179 (void) md_unit_writerlock(sm_ui); 1180 if (submirror_unavailable(un, i, 0)) { 1181 sm_ui->ui_tstate |= MD_INACCESSIBLE; 1182 sm_unavail_cnt++; 1183 } else { 1184 sm_ui->ui_tstate &= ~MD_INACCESSIBLE; 1185 } 1186 md_unit_writerexit(sm_ui); 1187 } 1188 1189 /* 1190 * If all of the submirrors are unavailable, the mirror is also 1191 * unavailable. 1192 */ 1193 if (sm_cnt == sm_unavail_cnt) { 1194 ui->ui_tstate |= MD_INACCESSIBLE; 1195 } else { 1196 ui->ui_tstate &= ~MD_INACCESSIBLE; 1197 } 1198 1199 smi = 0; 1200 ci = 0; 1201 while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) { 1202 if (mirror_other_sources(un, smi, ci, 1) == 1) { 1203 1204 free_all_ecomps(ecomps); 1205 (void) mirror_close_all_devs(un, md_oflags); 1206 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, 1207 SVM_TAG_METADEVICE, setno, MD_SID(un)); 1208 mirror_openfail_console_info(un, smi, ci); 1209 if (lockp) { 1210 md_ioctl_writerexit(lockp); 1211 (void) md_ioctl_readerlock(lockp, ui); 1212 } else { 1213 md_unit_writerexit(ui); 1214 (void) md_unit_readerlock(ui); 1215 } 1216 return (ENXIO); 1217 } 1218 1219 /* track all component states that need changing */ 1220 c = (err_comp_t *)kmem_alloc(sizeof (err_comp_t), KM_SLEEP); 1221 c->ec_next = ecomps; 1222 c->ec_smi = smi; 1223 c->ec_ci = ci; 1224 ecomps = c; 1225 ci++; 1226 } 1227 1228 /* Make all state changes and commit them */ 1229 for (c = ecomps; c != NULL; c = c->ec_next) { 1230 /* 1231 * If lockp is set, then entering kernel through ioctl. 1232 * For a MN set, the only ioctl path is via a commd message 1233 * (ALLOCATE_HOTSPARE or *RESYNC* messages) that is already 1234 * being sent to each node. 1235 * In this case, set NO_XMIT so that set_sm_comp_state 1236 * won't attempt to send a message on a message. 1237 * 1238 * In !MN sets, the xmit flag is ignored, so it doesn't matter 1239 * which flag is passed. 1240 */ 1241 if (lockp) { 1242 set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0, 1243 MD_STATE_NO_XMIT, lockp); 1244 } else { 1245 set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0, 1246 (MD_STATE_XMIT | MD_STATE_OCHELD), lockp); 1247 } 1248 /* 1249 * For a MN set, the NOTIFY is done when the state change is 1250 * processed on each node 1251 */ 1252 if (!MD_MNSET_SETNO(setno)) { 1253 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, 1254 SVM_TAG_METADEVICE, setno, MD_SID(un)); 1255 } 1256 } 1257 1258 if (lockp) { 1259 md_ioctl_writerexit(lockp); 1260 (void) md_ioctl_readerlock(lockp, ui); 1261 } else { 1262 md_unit_writerexit(ui); 1263 (void) md_unit_readerlock(ui); 1264 } 1265 1266 free_all_ecomps(ecomps); 1267 1268 /* allocate hotspares for all errored components */ 1269 if (MD_MNSET_SETNO(setno)) { 1270 /* 1271 * If we're called from an ioctl (lockp set) then we cannot 1272 * directly call send_poke_hotspares as this will block until 1273 * the message gets despatched to all nodes. If the cluster is 1274 * going through a reconfig cycle then the message will block 1275 * until the cycle is complete, and as we originate from a 1276 * service call from commd we will livelock. 
1277 */ 1278 if (lockp == NULL) { 1279 md_unit_readerexit(ui); 1280 send_poke_hotspares(setno); 1281 (void) md_unit_readerlock(ui); 1282 } 1283 } else { 1284 (void) poke_hotspares(); 1285 } 1286 return (0); 1287 } 1288 1289 void 1290 mirror_overlap_chain_remove(md_mps_t *ps) 1291 { 1292 mm_unit_t *un; 1293 1294 if (panicstr) 1295 return; 1296 1297 ASSERT(ps->ps_flags & MD_MPS_ON_OVERLAP); 1298 1299 un = ps->ps_un; 1300 1301 mutex_enter(&un->un_ovrlap_chn_mx); 1302 if (ps->ps_ovrlap_prev != &un->un_ovrlap_chn) 1303 ps->ps_ovrlap_prev->ps_ovrlap_next = ps->ps_ovrlap_next; 1304 else 1305 un->un_ovrlap_chn.ps_ovrlap_next = ps->ps_ovrlap_next; 1306 if (ps->ps_ovrlap_next != &un->un_ovrlap_chn) 1307 ps->ps_ovrlap_next->ps_ovrlap_prev = ps->ps_ovrlap_prev; 1308 else 1309 un->un_ovrlap_chn.ps_ovrlap_prev = ps->ps_ovrlap_prev; 1310 /* Handle empty overlap chain */ 1311 if (un->un_ovrlap_chn.ps_ovrlap_prev == &un->un_ovrlap_chn) { 1312 un->un_ovrlap_chn.ps_ovrlap_prev = 1313 un->un_ovrlap_chn.ps_ovrlap_next = NULL; 1314 } 1315 if (un->un_ovrlap_chn_flg) { 1316 un->un_ovrlap_chn_flg = 0; 1317 cv_broadcast(&un->un_ovrlap_chn_cv); 1318 } 1319 ps->ps_flags &= ~MD_MPS_ON_OVERLAP; 1320 mutex_exit(&un->un_ovrlap_chn_mx); 1321 } 1322 1323 1324 /* 1325 * wait_for_overlaps: 1326 * ----------------- 1327 * Check that given i/o request does not cause an overlap with already pending 1328 * i/o. If it does, block until the overlapped i/o completes. 1329 * 1330 * Note: the overlap chain is held as a monotonically increasing doubly-linked 1331 * list with the sentinel contained in un->un_ovrlap_chn. We avoid a linear 1332 * search of the list by the following logic: 1333 * ps->ps_lastblk < un_ovrlap_chn.ps_ovrlap_next->ps_firstblk => No overlap 1334 * ps->ps_firstblk > un_ovrlap_chn.ps_ovrlap_prev->ps_lastblk => No overlap 1335 * otherwise 1336 * scan un_ovrlap_chn.ps_ovrlap_next for location where ps->ps_firstblk 1337 * > chain->ps_lastblk. This is the insertion point. As the list is 1338 * guaranteed to be ordered there is no need to continue scanning. 
1339 * 1340 * The flag argument has MD_OVERLAP_ALLOW_REPEAT set if it is ok for the parent 1341 * structure to be already on the overlap chain and MD_OVERLAP_NO_REPEAT 1342 * if it must not already be on the chain 1343 */ 1344 static void 1345 wait_for_overlaps(md_mps_t *ps, int flags) 1346 { 1347 mm_unit_t *un; 1348 md_mps_t *ps1, **head, **tail; 1349 1350 if (panicstr) 1351 return; 1352 1353 1354 un = ps->ps_un; 1355 1356 mutex_enter(&un->un_ovrlap_chn_mx); 1357 if ((flags & MD_OVERLAP_ALLOW_REPEAT) && 1358 (ps->ps_flags & MD_MPS_ON_OVERLAP)) { 1359 mutex_exit(&un->un_ovrlap_chn_mx); 1360 return; 1361 } 1362 1363 ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP)); 1364 head = &(un->un_ovrlap_chn.ps_ovrlap_next); 1365 tail = &(un->un_ovrlap_chn.ps_ovrlap_prev); 1366 ps1 = *head; 1367 /* 1368 * Check for simple limit cases: 1369 * *head == NULL 1370 * insert ps at head of list 1371 * lastblk < head->firstblk 1372 * insert at head of list 1373 * firstblk > tail->lastblk 1374 * insert at tail of list 1375 */ 1376 if (ps1 == NULL) { 1377 /* Insert at head */ 1378 ps->ps_ovrlap_next = &un->un_ovrlap_chn; 1379 ps->ps_ovrlap_prev = &un->un_ovrlap_chn; 1380 *head = ps; 1381 *tail = ps; 1382 ps->ps_flags |= MD_MPS_ON_OVERLAP; 1383 mutex_exit(&un->un_ovrlap_chn_mx); 1384 return; 1385 } else if (ps->ps_lastblk < (*head)->ps_firstblk) { 1386 /* Insert at head */ 1387 ps->ps_ovrlap_next = (*head); 1388 ps->ps_ovrlap_prev = &un->un_ovrlap_chn; 1389 (*head)->ps_ovrlap_prev = ps; 1390 *head = ps; 1391 ps->ps_flags |= MD_MPS_ON_OVERLAP; 1392 mutex_exit(&un->un_ovrlap_chn_mx); 1393 return; 1394 } else if (ps->ps_firstblk > (*tail)->ps_lastblk) { 1395 /* Insert at tail */ 1396 ps->ps_ovrlap_prev = (*tail); 1397 ps->ps_ovrlap_next = &un->un_ovrlap_chn; 1398 (*tail)->ps_ovrlap_next = ps; 1399 *tail = ps; 1400 ps->ps_flags |= MD_MPS_ON_OVERLAP; 1401 mutex_exit(&un->un_ovrlap_chn_mx); 1402 return; 1403 } 1404 /* Now we have to scan the list for possible overlaps */ 1405 while (ps1 != NULL) { 1406 /* 1407 * If this region has been put on the chain by another thread 1408 * just exit 1409 */ 1410 if ((flags & MD_OVERLAP_ALLOW_REPEAT) && 1411 (ps->ps_flags & MD_MPS_ON_OVERLAP)) { 1412 mutex_exit(&un->un_ovrlap_chn_mx); 1413 return; 1414 1415 } 1416 for (ps1 = *head; ps1 && (ps1 != &un->un_ovrlap_chn); 1417 ps1 = ps1->ps_ovrlap_next) { 1418 if (ps->ps_firstblk > (*tail)->ps_lastblk) { 1419 /* Insert at tail */ 1420 ps->ps_ovrlap_prev = (*tail); 1421 ps->ps_ovrlap_next = &un->un_ovrlap_chn; 1422 (*tail)->ps_ovrlap_next = ps; 1423 *tail = ps; 1424 ps->ps_flags |= MD_MPS_ON_OVERLAP; 1425 mutex_exit(&un->un_ovrlap_chn_mx); 1426 return; 1427 } 1428 if (ps->ps_firstblk > ps1->ps_lastblk) 1429 continue; 1430 if (ps->ps_lastblk < ps1->ps_firstblk) { 1431 /* Insert into list at current 'ps1' position */ 1432 ps->ps_ovrlap_next = ps1; 1433 ps->ps_ovrlap_prev = ps1->ps_ovrlap_prev; 1434 ps1->ps_ovrlap_prev->ps_ovrlap_next = ps; 1435 ps1->ps_ovrlap_prev = ps; 1436 ps->ps_flags |= MD_MPS_ON_OVERLAP; 1437 mutex_exit(&un->un_ovrlap_chn_mx); 1438 return; 1439 } 1440 break; 1441 } 1442 if (ps1 != NULL) { 1443 un->un_ovrlap_chn_flg = 1; 1444 cv_wait(&un->un_ovrlap_chn_cv, &un->un_ovrlap_chn_mx); 1445 /* 1446 * Now ps1 refers to the old insertion point and we 1447 * have to check the whole chain to see if we're still 1448 * overlapping any other i/o. 1449 */ 1450 } 1451 } 1452 1453 /* 1454 * Only get here if we had one overlapping i/o on the list and that 1455 * has now completed. 
In this case the list is empty so we insert <ps> 1456 * at the head of the chain. 1457 */ 1458 ASSERT(*head == NULL); 1459 *tail = *head = ps; 1460 ps->ps_ovrlap_next = ps->ps_ovrlap_prev = &un->un_ovrlap_chn; 1461 ps->ps_flags |= MD_MPS_ON_OVERLAP; 1462 mutex_exit(&un->un_ovrlap_chn_mx); 1463 } 1464 1465 /* 1466 * This function is called from mirror_done to check whether any pages have 1467 * been modified while a mirrored write was in progress. Returns 0 if 1468 * all pages associated with bp are clean, 1 otherwise. 1469 */ 1470 static int 1471 any_pages_dirty(struct buf *bp) 1472 { 1473 int rval; 1474 1475 rval = biomodified(bp); 1476 if (rval == -1) 1477 rval = 0; 1478 1479 return (rval); 1480 } 1481 1482 #define MAX_EXTRAS 10 1483 1484 void 1485 mirror_commit( 1486 mm_unit_t *un, 1487 int smmask, 1488 mddb_recid_t *extras 1489 ) 1490 { 1491 mm_submirror_t *sm; 1492 md_unit_t *su; 1493 int i; 1494 1495 /* 2=mirror,null id */ 1496 mddb_recid_t recids[NMIRROR+2+MAX_EXTRAS]; 1497 1498 int ri = 0; 1499 1500 if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE) 1501 return; 1502 1503 /* Add two, this includes the mirror unit and the null recid */ 1504 if (extras != NULL) { 1505 int nrecids = 0; 1506 while (extras[nrecids] != 0) { 1507 nrecids++; 1508 } 1509 ASSERT(nrecids <= MAX_EXTRAS); 1510 } 1511 1512 if (un != NULL) 1513 recids[ri++] = un->c.un_record_id; 1514 for (i = 0; i < NMIRROR; i++) { 1515 if (!(smmask & SMI2BIT(i))) 1516 continue; 1517 sm = &un->un_sm[i]; 1518 if (!SMS_IS(sm, SMS_INUSE)) 1519 continue; 1520 if (md_getmajor(sm->sm_dev) != md_major) 1521 continue; 1522 su = MD_UNIT(md_getminor(sm->sm_dev)); 1523 recids[ri++] = su->c.un_record_id; 1524 } 1525 1526 if (extras != NULL) 1527 while (*extras != 0) { 1528 recids[ri++] = *extras; 1529 extras++; 1530 } 1531 1532 if (ri == 0) 1533 return; 1534 recids[ri] = 0; 1535 1536 /* 1537 * Ok to hold ioctl lock across record commit to mddb as 1538 * long as the record(s) being committed aren't resync records. 1539 */ 1540 mddb_commitrecs_wrapper(recids); 1541 } 1542 1543 1544 /* 1545 * This routine is used to set a bit in the writable_bm bitmap 1546 * which represents each submirror in a metamirror which 1547 * is writable. The first writable submirror index is assigned 1548 * to the sm_index. The number of writable submirrors are returned in nunits. 1549 * 1550 * This routine returns the submirror's unit number. 
1551 */ 1552 1553 static void 1554 select_write_units(struct mm_unit *un, md_mps_t *ps) 1555 { 1556 1557 int i; 1558 unsigned writable_bm = 0; 1559 unsigned nunits = 0; 1560 1561 for (i = 0; i < NMIRROR; i++) { 1562 if (SUBMIRROR_IS_WRITEABLE(un, i)) { 1563 /* set bit of all writable units */ 1564 writable_bm |= SMI2BIT(i); 1565 nunits++; 1566 } 1567 } 1568 ps->ps_writable_sm = writable_bm; 1569 ps->ps_active_cnt = nunits; 1570 ps->ps_current_sm = 0; 1571 } 1572 1573 static 1574 unsigned 1575 select_write_after_read_units(struct mm_unit *un, md_mps_t *ps) 1576 { 1577 1578 int i; 1579 unsigned writable_bm = 0; 1580 unsigned nunits = 0; 1581 1582 for (i = 0; i < NMIRROR; i++) { 1583 if (SUBMIRROR_IS_WRITEABLE(un, i) && 1584 un->un_sm[i].sm_flags & MD_SM_RESYNC_TARGET) { 1585 writable_bm |= SMI2BIT(i); 1586 nunits++; 1587 } 1588 } 1589 if ((writable_bm & ps->ps_allfrom_sm) != 0) { 1590 writable_bm &= ~ps->ps_allfrom_sm; 1591 nunits--; 1592 } 1593 ps->ps_writable_sm = writable_bm; 1594 ps->ps_active_cnt = nunits; 1595 ps->ps_current_sm = 0; 1596 return (nunits); 1597 } 1598 1599 static md_dev64_t 1600 select_read_unit( 1601 mm_unit_t *un, 1602 diskaddr_t blkno, 1603 u_longlong_t reqcount, 1604 u_longlong_t *cando, 1605 int must_be_opened, 1606 md_m_shared_t **shared, 1607 md_mcs_t *cs) 1608 { 1609 int i; 1610 md_m_shared_t *s; 1611 uint_t lasterrcnt = 0; 1612 md_dev64_t dev = 0; 1613 u_longlong_t cnt; 1614 u_longlong_t mincnt; 1615 mm_submirror_t *sm; 1616 mm_submirror_ic_t *smic; 1617 mdi_unit_t *ui; 1618 1619 mincnt = reqcount; 1620 for (i = 0; i < NMIRROR; i++) { 1621 if (!SUBMIRROR_IS_READABLE(un, i)) 1622 continue; 1623 sm = &un->un_sm[i]; 1624 smic = &un->un_smic[i]; 1625 cnt = reqcount; 1626 1627 /* 1628 * If the current submirror is marked as inaccessible, do not 1629 * try to access it. 1630 */ 1631 ui = MDI_UNIT(getminor(expldev(sm->sm_dev))); 1632 (void) md_unit_readerlock(ui); 1633 if (ui->ui_tstate & MD_INACCESSIBLE) { 1634 md_unit_readerexit(ui); 1635 continue; 1636 } 1637 md_unit_readerexit(ui); 1638 1639 s = (md_m_shared_t *)(*(smic->sm_shared_by_blk)) 1640 (sm->sm_dev, sm, blkno, &cnt); 1641 1642 if (must_be_opened && !(s->ms_flags & MDM_S_ISOPEN)) 1643 continue; 1644 if (s->ms_state == CS_OKAY) { 1645 *cando = cnt; 1646 if (shared != NULL) 1647 *shared = s; 1648 1649 if (un->un_sm[i].sm_flags & MD_SM_FAILFAST && 1650 cs != NULL) { 1651 cs->cs_buf.b_flags |= B_FAILFAST; 1652 } 1653 1654 return (un->un_sm[i].sm_dev); 1655 } 1656 if (s->ms_state != CS_LAST_ERRED) 1657 continue; 1658 1659 /* don't use B_FAILFAST since we're Last Erred */ 1660 1661 if (mincnt > cnt) 1662 mincnt = cnt; 1663 if (s->ms_lasterrcnt > lasterrcnt) { 1664 lasterrcnt = s->ms_lasterrcnt; 1665 if (shared != NULL) 1666 *shared = s; 1667 dev = un->un_sm[i].sm_dev; 1668 } 1669 } 1670 *cando = mincnt; 1671 return (dev); 1672 } 1673 1674 /* 1675 * Given a 32-bit bitmap, this routine will return the bit number 1676 * of the nth bit set. The nth bit set is passed via the index integer. 1677 * 1678 * This routine is used to run through the writable submirror bitmap 1679 * and starting all of the writes. See the value returned is the 1680 * index to appropriate submirror structure, in the md_sm 1681 * array for metamirrors. 
1682 */ 1683 static int 1684 md_find_nth_unit(uint_t mask, int index) 1685 { 1686 int bit, nfound; 1687 1688 for (bit = -1, nfound = -1; nfound != index; bit++) { 1689 ASSERT(mask != 0); 1690 nfound += (mask & 1); 1691 mask >>= 1; 1692 } 1693 return (bit); 1694 } 1695 1696 static int 1697 fast_select_read_unit(md_mps_t *ps, md_mcs_t *cs) 1698 { 1699 mm_unit_t *un; 1700 buf_t *bp; 1701 int i; 1702 unsigned nunits = 0; 1703 int iunit; 1704 uint_t running_bm = 0; 1705 uint_t sm_index; 1706 1707 bp = &cs->cs_buf; 1708 un = ps->ps_un; 1709 1710 for (i = 0; i < NMIRROR; i++) { 1711 if (!SMS_BY_INDEX_IS(un, i, SMS_RUNNING)) 1712 continue; 1713 running_bm |= SMI2BIT(i); 1714 nunits++; 1715 } 1716 if (nunits == 0) 1717 return (1); 1718 1719 /* 1720 * For directed mirror read (DMR) we only use the specified side and 1721 * do not compute the source of the read. 1722 */ 1723 if (ps->ps_flags & MD_MPS_DMR) { 1724 sm_index = un->un_dmr_last_read; 1725 } else { 1726 /* Normal (non-DMR) operation */ 1727 switch (un->un_read_option) { 1728 case RD_GEOMETRY: 1729 iunit = (int)(bp->b_lblkno / 1730 howmany(un->c.un_total_blocks, nunits)); 1731 sm_index = md_find_nth_unit(running_bm, iunit); 1732 break; 1733 case RD_FIRST: 1734 sm_index = md_find_nth_unit(running_bm, 0); 1735 break; 1736 case RD_LOAD_BAL: 1737 /* this is intentional to fall into the default */ 1738 default: 1739 un->un_last_read = (un->un_last_read + 1) % nunits; 1740 sm_index = md_find_nth_unit(running_bm, 1741 un->un_last_read); 1742 break; 1743 } 1744 } 1745 bp->b_edev = md_dev64_to_dev(un->un_sm[sm_index].sm_dev); 1746 ps->ps_allfrom_sm = SMI2BIT(sm_index); 1747 1748 if (un->un_sm[sm_index].sm_flags & MD_SM_FAILFAST) { 1749 bp->b_flags |= B_FAILFAST; 1750 } 1751 1752 return (0); 1753 } 1754 1755 static 1756 int 1757 mirror_are_submirrors_available(mm_unit_t *un) 1758 { 1759 int i; 1760 for (i = 0; i < NMIRROR; i++) { 1761 md_dev64_t tmpdev = un->un_sm[i].sm_dev; 1762 1763 if ((!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) || 1764 md_getmajor(tmpdev) != md_major) 1765 continue; 1766 1767 if ((MD_MIN2SET(md_getminor(tmpdev)) >= md_nsets) || 1768 (MD_MIN2UNIT(md_getminor(tmpdev)) >= md_nunits)) 1769 return (0); 1770 1771 if (MDI_UNIT(md_getminor(tmpdev)) == NULL) 1772 return (0); 1773 } 1774 return (1); 1775 } 1776 1777 void 1778 build_submirror(mm_unit_t *un, int i, int snarfing) 1779 { 1780 struct mm_submirror *sm; 1781 struct mm_submirror_ic *smic; 1782 md_unit_t *su; 1783 set_t setno; 1784 1785 sm = &un->un_sm[i]; 1786 smic = &un->un_smic[i]; 1787 1788 sm->sm_flags = 0; /* sometime we may need to do more here */ 1789 1790 setno = MD_UN2SET(un); 1791 1792 if (!SMS_IS(sm, SMS_INUSE)) 1793 return; 1794 if (snarfing) { 1795 sm->sm_dev = md_getdevnum(setno, mddb_getsidenum(setno), 1796 sm->sm_key, MD_NOTRUST_DEVT); 1797 } else { 1798 if (md_getmajor(sm->sm_dev) == md_major) { 1799 su = MD_UNIT(md_getminor(sm->sm_dev)); 1800 un->c.un_flag |= (su->c.un_flag & MD_LABELED); 1801 /* submirror can no longer be soft partitioned */ 1802 MD_CAPAB(su) &= (~MD_CAN_SP); 1803 } 1804 } 1805 smic->sm_shared_by_blk = md_get_named_service(sm->sm_dev, 1806 0, "shared by blk", 0); 1807 smic->sm_shared_by_indx = md_get_named_service(sm->sm_dev, 1808 0, "shared by indx", 0); 1809 smic->sm_get_component_count = 1810 (int (*)())md_get_named_service(sm->sm_dev, 0, 1811 "get component count", 0); 1812 smic->sm_get_bcss = 1813 (int (*)())md_get_named_service(sm->sm_dev, 0, 1814 "get block count skip size", 0); 1815 sm->sm_state &= ~SMS_IGNORE; 1816 if (SMS_IS(sm, SMS_OFFLINE)) 
1817 MD_STATUS(un) |= MD_UN_OFFLINE_SM; 1818 md_set_parent(sm->sm_dev, MD_SID(un)); 1819 } 1820 1821 static void 1822 mirror_cleanup(mm_unit_t *un) 1823 { 1824 mddb_recid_t recid; 1825 int smi; 1826 sv_dev_t sv[NMIRROR]; 1827 int nsv = 0; 1828 1829 /* 1830 * If a MN diskset and this node is not the master, do 1831 * not delete any records on snarf of the mirror records. 1832 */ 1833 if (MD_MNSET_SETNO(MD_UN2SET(un)) && 1834 md_set[MD_UN2SET(un)].s_am_i_master == 0) { 1835 return; 1836 } 1837 1838 for (smi = 0; smi < NMIRROR; smi++) { 1839 if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) 1840 continue; 1841 sv[nsv].setno = MD_UN2SET(un); 1842 sv[nsv++].key = un->un_sm[smi].sm_key; 1843 } 1844 1845 recid = un->un_rr_dirty_recid; 1846 mddb_deleterec_wrapper(un->c.un_record_id); 1847 if (recid > 0) 1848 mddb_deleterec_wrapper(recid); 1849 1850 md_rem_names(sv, nsv); 1851 } 1852 1853 /* Return a -1 if optimized record unavailable and set should be released */ 1854 int 1855 mirror_build_incore(mm_unit_t *un, int snarfing) 1856 { 1857 int i; 1858 1859 if (MD_STATUS(un) & MD_UN_BEING_RESET) { 1860 mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCLEAN); 1861 return (1); 1862 } 1863 1864 if (mirror_are_submirrors_available(un) == 0) 1865 return (1); 1866 1867 if (MD_UNIT(MD_SID(un)) != NULL) 1868 return (0); 1869 1870 MD_STATUS(un) = 0; 1871 1872 /* pre-4.1 didn't define CAN_META_CHILD capability */ 1873 MD_CAPAB(un) = MD_CAN_META_CHILD | MD_CAN_PARENT | MD_CAN_SP; 1874 1875 un->un_ovrlap_chn_flg = 0; 1876 bzero(&un->un_ovrlap_chn, sizeof (un->un_ovrlap_chn)); 1877 1878 for (i = 0; i < NMIRROR; i++) 1879 build_submirror(un, i, snarfing); 1880 1881 if (unit_setup_resync(un, snarfing) != 0) { 1882 if (snarfing) { 1883 mddb_setrecprivate(un->c.un_record_id, MD_PRV_GOTIT); 1884 /* 1885 * If a MN set and set is not stale, then return -1 1886 * which will force the caller to unload the set. 1887 * The MN diskset nodes will return failure if 1888 * unit_setup_resync fails so that nodes won't 1889 * get out of sync. 1890 * 1891 * If set is STALE, the master node can't allocate 1892 * a resync record (if needed), but node needs to 1893 * join the set so that user can delete broken mddbs. 1894 * So, if set is STALE, just continue on. 1895 */ 1896 if (MD_MNSET_SETNO(MD_UN2SET(un)) && 1897 !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) { 1898 return (-1); 1899 } 1900 } else 1901 return (1); 1902 } 1903 1904 mutex_init(&un->un_ovrlap_chn_mx, NULL, MUTEX_DEFAULT, NULL); 1905 cv_init(&un->un_ovrlap_chn_cv, NULL, CV_DEFAULT, NULL); 1906 1907 un->un_suspend_wr_flag = 0; 1908 mutex_init(&un->un_suspend_wr_mx, NULL, MUTEX_DEFAULT, NULL); 1909 cv_init(&un->un_suspend_wr_cv, NULL, CV_DEFAULT, NULL); 1910 1911 /* 1912 * Allocate mutexes for mirror-owner and resync-owner changes. 1913 * All references to the owner message state field must be guarded 1914 * by this mutex. 1915 */ 1916 mutex_init(&un->un_owner_mx, NULL, MUTEX_DEFAULT, NULL); 1917 1918 /* 1919 * Allocate mutex and condvar for resync thread manipulation. These 1920 * will be used by mirror_resync_unit/mirror_ioctl_resync 1921 */ 1922 mutex_init(&un->un_rs_thread_mx, NULL, MUTEX_DEFAULT, NULL); 1923 cv_init(&un->un_rs_thread_cv, NULL, CV_DEFAULT, NULL); 1924 1925 /* 1926 * Allocate mutex and condvar for resync progress thread manipulation. 1927 * This allows resyncs to be continued across an intervening reboot. 
1928 */ 1929 mutex_init(&un->un_rs_progress_mx, NULL, MUTEX_DEFAULT, NULL); 1930 cv_init(&un->un_rs_progress_cv, NULL, CV_DEFAULT, NULL); 1931 1932 /* 1933 * Allocate mutex and condvar for Directed Mirror Reads (DMR). This 1934 * provides synchronization between a user-ioctl and the resulting 1935 * strategy() call that performs the read(). 1936 */ 1937 mutex_init(&un->un_dmr_mx, NULL, MUTEX_DEFAULT, NULL); 1938 cv_init(&un->un_dmr_cv, NULL, CV_DEFAULT, NULL); 1939 1940 MD_UNIT(MD_SID(un)) = un; 1941 return (0); 1942 } 1943 1944 1945 void 1946 reset_mirror(struct mm_unit *un, minor_t mnum, int removing) 1947 { 1948 mddb_recid_t recid, vtoc_id; 1949 size_t bitcnt; 1950 size_t shortcnt; 1951 int smi; 1952 sv_dev_t sv[NMIRROR]; 1953 int nsv = 0; 1954 uint_t bits = 0; 1955 minor_t selfid; 1956 md_unit_t *su; 1957 1958 md_destroy_unit_incore(mnum, &mirror_md_ops); 1959 1960 shortcnt = un->un_rrd_num * sizeof (short); 1961 bitcnt = howmany(un->un_rrd_num, NBBY); 1962 1963 if (un->un_outstanding_writes) 1964 kmem_free((caddr_t)un->un_outstanding_writes, shortcnt); 1965 if (un->un_goingclean_bm) 1966 kmem_free((caddr_t)un->un_goingclean_bm, bitcnt); 1967 if (un->un_goingdirty_bm) 1968 kmem_free((caddr_t)un->un_goingdirty_bm, bitcnt); 1969 if (un->un_resync_bm) 1970 kmem_free((caddr_t)un->un_resync_bm, bitcnt); 1971 1972 MD_UNIT(mnum) = NULL; 1973 1974 if (!removing) 1975 return; 1976 1977 for (smi = 0; smi < NMIRROR; smi++) { 1978 if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) 1979 continue; 1980 /* reallow soft partitioning of submirror and reset parent */ 1981 su = MD_UNIT(md_getminor(un->un_sm[smi].sm_dev)); 1982 MD_CAPAB(su) |= MD_CAN_SP; 1983 md_reset_parent(un->un_sm[smi].sm_dev); 1984 reset_comp_states(&un->un_sm[smi], &un->un_smic[smi]); 1985 1986 sv[nsv].setno = MD_MIN2SET(mnum); 1987 sv[nsv++].key = un->un_sm[smi].sm_key; 1988 bits |= SMI2BIT(smi); 1989 } 1990 1991 MD_STATUS(un) |= MD_UN_BEING_RESET; 1992 recid = un->un_rr_dirty_recid; 1993 vtoc_id = un->c.un_vtoc_id; 1994 selfid = MD_SID(un); 1995 1996 mirror_commit(un, bits, 0); 1997 1998 /* Destroy all mutexes and condvars before returning. */ 1999 mutex_destroy(&un->un_suspend_wr_mx); 2000 cv_destroy(&un->un_suspend_wr_cv); 2001 mutex_destroy(&un->un_ovrlap_chn_mx); 2002 cv_destroy(&un->un_ovrlap_chn_cv); 2003 mutex_destroy(&un->un_owner_mx); 2004 mutex_destroy(&un->un_rs_thread_mx); 2005 cv_destroy(&un->un_rs_thread_cv); 2006 mutex_destroy(&un->un_rs_progress_mx); 2007 cv_destroy(&un->un_rs_progress_cv); 2008 mutex_destroy(&un->un_dmr_mx); 2009 cv_destroy(&un->un_dmr_cv); 2010 mddb_deleterec_wrapper(un->c.un_record_id); 2011 if (recid != 0) 2012 mddb_deleterec_wrapper(recid); 2013 2014 /* Remove the vtoc, if present */ 2015 if (vtoc_id) 2016 mddb_deleterec_wrapper(vtoc_id); 2017 2018 md_rem_names(sv, nsv); 2019 2020 SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE, 2021 MD_MIN2SET(selfid), selfid); 2022 } 2023 2024 int 2025 mirror_internal_open( 2026 minor_t mnum, 2027 int flag, 2028 int otyp, 2029 int md_oflags, 2030 IOLOCK *lockp /* can be NULL */ 2031 ) 2032 { 2033 mdi_unit_t *ui = MDI_UNIT(mnum); 2034 int err = 0; 2035 2036 tryagain: 2037 /* single thread */ 2038 if (lockp) { 2039 /* 2040 * If ioctl lock is held, use openclose_enter 2041 * routine that will set the ioctl flag when 2042 * grabbing the readerlock. 
2043 */ 2044 (void) md_ioctl_openclose_enter(lockp, ui); 2045 } else { 2046 (void) md_unit_openclose_enter(ui); 2047 } 2048 2049 /* 2050 * The mirror_open_all_devs routine may end up sending a STATE_UPDATE 2051 * message in a MN diskset and this requires that the openclose 2052 * lock is dropped in order to send this message. So, another 2053 * flag (MD_UL_OPENINPROGRESS) is used to keep another thread from 2054 * attempting an open while this thread has an open in progress. 2055 * Call the *_lh version of the lock exit routines since the ui_mx 2056 * mutex must be held from checking for OPENINPROGRESS until 2057 * after the cv_wait call. 2058 */ 2059 mutex_enter(&ui->ui_mx); 2060 if (ui->ui_lock & MD_UL_OPENINPROGRESS) { 2061 if (lockp) { 2062 (void) md_ioctl_openclose_exit_lh(lockp); 2063 } else { 2064 md_unit_openclose_exit_lh(ui); 2065 } 2066 cv_wait(&ui->ui_cv, &ui->ui_mx); 2067 mutex_exit(&ui->ui_mx); 2068 goto tryagain; 2069 } 2070 2071 ui->ui_lock |= MD_UL_OPENINPROGRESS; 2072 mutex_exit(&ui->ui_mx); 2073 2074 /* open devices, if necessary */ 2075 if (! md_unit_isopen(ui) || (ui->ui_tstate & MD_INACCESSIBLE)) { 2076 if ((err = mirror_open_all_devs(mnum, md_oflags, lockp)) != 0) 2077 goto out; 2078 } 2079 2080 /* count open */ 2081 if ((err = md_unit_incopen(mnum, flag, otyp)) != 0) 2082 goto out; 2083 2084 /* unlock, return success */ 2085 out: 2086 mutex_enter(&ui->ui_mx); 2087 ui->ui_lock &= ~MD_UL_OPENINPROGRESS; 2088 mutex_exit(&ui->ui_mx); 2089 2090 if (lockp) { 2091 /* 2092 * If ioctl lock is held, use openclose_exit 2093 * routine that will clear the lockp reader flag. 2094 */ 2095 (void) md_ioctl_openclose_exit(lockp); 2096 } else { 2097 md_unit_openclose_exit(ui); 2098 } 2099 return (err); 2100 } 2101 2102 int 2103 mirror_internal_close( 2104 minor_t mnum, 2105 int otyp, 2106 int md_cflags, 2107 IOLOCK *lockp /* can be NULL */ 2108 ) 2109 { 2110 mdi_unit_t *ui = MDI_UNIT(mnum); 2111 mm_unit_t *un; 2112 int err = 0; 2113 2114 /* single thread */ 2115 if (lockp) { 2116 /* 2117 * If ioctl lock is held, use openclose_enter 2118 * routine that will set the ioctl flag when 2119 * grabbing the readerlock. 2120 */ 2121 un = (mm_unit_t *)md_ioctl_openclose_enter(lockp, ui); 2122 } else { 2123 un = (mm_unit_t *)md_unit_openclose_enter(ui); 2124 } 2125 2126 /* count closed */ 2127 if ((err = md_unit_decopen(mnum, otyp)) != 0) 2128 goto out; 2129 2130 /* close devices, if necessary */ 2131 if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) { 2132 /* 2133 * Clean up dirty bitmap for this unit. Do this 2134 * before closing the underlying devices to avoid 2135 * race conditions with reset_mirror() as a 2136 * result of a 'metaset -r' command running in 2137 * parallel. This might cause deallocation of 2138 * dirty region bitmaps; with underlying metadevices 2139 * in place this can't happen. 2140 * Don't do this if a MN set and ABR not set 2141 */ 2142 if (new_resync && !(MD_STATUS(un) & MD_UN_KEEP_DIRTY)) { 2143 if (!MD_MNSET_SETNO(MD_UN2SET(un)) || 2144 !(ui->ui_tstate & MD_ABR_CAP)) 2145 mirror_process_unit_resync(un); 2146 } 2147 (void) mirror_close_all_devs(un, md_cflags); 2148 2149 /* 2150 * For a MN set with transient capabilities (eg ABR/DMR) set, 2151 * clear these capabilities on the last open in the cluster. 2152 * To do this we send a message to all nodes to see of the 2153 * device is open. 
2154 */ 2155 if (MD_MNSET_SETNO(MD_UN2SET(un)) && 2156 (ui->ui_tstate & (MD_ABR_CAP|MD_DMR_CAP))) { 2157 if (lockp) { 2158 (void) md_ioctl_openclose_exit(lockp); 2159 } else { 2160 md_unit_openclose_exit(ui); 2161 } 2162 2163 /* 2164 * if we are in the context of an ioctl, drop the 2165 * ioctl lock. 2166 * Otherwise, no other locks should be held. 2167 */ 2168 if (lockp) { 2169 IOLOCK_RETURN_RELEASE(0, lockp); 2170 } 2171 2172 mdmn_clear_all_capabilities(mnum); 2173 2174 /* if dropped the lock previously, regain it */ 2175 if (lockp) { 2176 IOLOCK_RETURN_REACQUIRE(lockp); 2177 } 2178 return (0); 2179 } 2180 /* unlock and return success */ 2181 } 2182 out: 2183 /* Call whether lockp is NULL or not. */ 2184 if (lockp) { 2185 md_ioctl_openclose_exit(lockp); 2186 } else { 2187 md_unit_openclose_exit(ui); 2188 } 2189 return (err); 2190 } 2191 2192 /* 2193 * When a component has completed resyncing and is now ok, check if the 2194 * corresponding component in the other submirrors is in the Last Erred 2195 * state. If it is, we want to change that to the Erred state so we stop 2196 * using that component and start using this good component instead. 2197 * 2198 * This is called from set_sm_comp_state and recursively calls 2199 * set_sm_comp_state if it needs to change the Last Erred state. 2200 */ 2201 static void 2202 reset_lasterred(mm_unit_t *un, int smi, mddb_recid_t *extras, uint_t flags, 2203 IOLOCK *lockp) 2204 { 2205 mm_submirror_t *sm; 2206 mm_submirror_ic_t *smic; 2207 int ci; 2208 int i; 2209 int compcnt; 2210 int changed = 0; 2211 2212 for (i = 0; i < NMIRROR; i++) { 2213 sm = &un->un_sm[i]; 2214 smic = &un->un_smic[i]; 2215 2216 if (!SMS_IS(sm, SMS_INUSE)) 2217 continue; 2218 2219 /* ignore the submirror that we just made ok */ 2220 if (i == smi) 2221 continue; 2222 2223 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un); 2224 for (ci = 0; ci < compcnt; ci++) { 2225 md_m_shared_t *shared; 2226 2227 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) 2228 (sm->sm_dev, sm, ci); 2229 2230 if ((shared->ms_state & CS_LAST_ERRED) && 2231 !mirror_other_sources(un, i, ci, 1)) { 2232 2233 set_sm_comp_state(un, i, ci, CS_ERRED, extras, 2234 flags, lockp); 2235 changed = 1; 2236 } 2237 } 2238 } 2239 2240 /* maybe there is a hotspare for this newly erred component */ 2241 if (changed) { 2242 set_t setno; 2243 2244 setno = MD_UN2SET(un); 2245 if (MD_MNSET_SETNO(setno)) { 2246 send_poke_hotspares(setno); 2247 } else { 2248 (void) poke_hotspares(); 2249 } 2250 } 2251 } 2252 2253 /* 2254 * set_sm_comp_state 2255 * 2256 * Set the state of a submirror component to the specified new state. 2257 * If the mirror is in a multi-node set, send messages to all nodes to 2258 * block all writes to the mirror and then update the state and release the 2259 * writes. These messages are only sent if MD_STATE_XMIT is set in flags. 2260 * MD_STATE_XMIT will be unset in 2 cases: 2261 * 1. When the state is changed to CS_RESYNC as this state change 2262 * will already have been updated on each node by the processing of the 2263 * distributed metasync command, hence no need to xmit. 2264 * 2. When the state is change to CS_OKAY after a resync has completed. Again 2265 * the resync completion will already have been processed on each node by 2266 * the processing of the MD_MN_MSG_RESYNC_PHASE_DONE message for a component 2267 * resync, hence no need to xmit. 
2268 * 2269 * In case we are called from the updates of a watermark, 2270 * (then MD_STATE_WMUPDATE will be set in the ps->flags) this is due to 2271 * a metainit or similar. In this case the message that we sent to propagate 2272 * the state change must not be a class1 message as that would deadlock with 2273 * the metainit command that is still being processed. 2274 * This we achieve by creating a class2 message MD_MN_MSG_STATE_UPDATE2 2275 * instead. This also makes the submessage generator to create a class2 2276 * submessage rather than a class1 (which would also block) 2277 * 2278 * On entry, unit_writerlock is held 2279 * If MD_STATE_OCHELD is set in flags, then unit_openclose lock is 2280 * also held. 2281 */ 2282 void 2283 set_sm_comp_state( 2284 mm_unit_t *un, 2285 int smi, 2286 int ci, 2287 int newstate, 2288 mddb_recid_t *extras, 2289 uint_t flags, 2290 IOLOCK *lockp 2291 ) 2292 { 2293 mm_submirror_t *sm; 2294 mm_submirror_ic_t *smic; 2295 md_m_shared_t *shared; 2296 int origstate; 2297 void (*get_dev)(); 2298 ms_cd_info_t cd; 2299 char devname[MD_MAX_CTDLEN]; 2300 int err; 2301 set_t setno = MD_UN2SET(un); 2302 md_mn_msg_stch_t stchmsg; 2303 mdi_unit_t *ui = MDI_UNIT(MD_SID(un)); 2304 md_mn_kresult_t *kresult; 2305 int rval; 2306 uint_t msgflags; 2307 md_mn_msgtype_t msgtype; 2308 int save_lock = 0; 2309 mdi_unit_t *ui_sm; 2310 2311 sm = &un->un_sm[smi]; 2312 smic = &un->un_smic[smi]; 2313 2314 /* If we have a real error status then turn off MD_INACCESSIBLE. */ 2315 ui_sm = MDI_UNIT(getminor(md_dev64_to_dev(sm->sm_dev))); 2316 if (newstate & (CS_ERRED | CS_RESYNC | CS_LAST_ERRED) && 2317 ui_sm->ui_tstate & MD_INACCESSIBLE) { 2318 ui_sm->ui_tstate &= ~MD_INACCESSIBLE; 2319 } 2320 2321 shared = (md_m_shared_t *) 2322 (*(smic->sm_shared_by_indx))(sm->sm_dev, sm, ci); 2323 origstate = shared->ms_state; 2324 2325 /* 2326 * If the new state is an error and the old one wasn't, generate 2327 * a console message. We do this before we send the state to other 2328 * nodes in a MN set because the state change may change the component 2329 * name if a hotspare is allocated. 2330 */ 2331 if ((! (origstate & (CS_ERRED|CS_LAST_ERRED))) && 2332 (newstate & (CS_ERRED|CS_LAST_ERRED))) { 2333 2334 get_dev = 2335 (void (*)())md_get_named_service(sm->sm_dev, 0, 2336 "get device", 0); 2337 (void) (*get_dev)(sm->sm_dev, sm, ci, &cd); 2338 2339 err = md_getdevname(setno, mddb_getsidenum(setno), 0, 2340 cd.cd_dev, devname, sizeof (devname)); 2341 2342 if (err == ENOENT) { 2343 (void) md_devname(setno, cd.cd_dev, devname, 2344 sizeof (devname)); 2345 } 2346 2347 cmn_err(CE_WARN, "md: %s: %s needs maintenance", 2348 md_shortname(md_getminor(sm->sm_dev)), devname); 2349 2350 if (newstate & CS_LAST_ERRED) { 2351 cmn_err(CE_WARN, "md: %s: %s last erred", 2352 md_shortname(md_getminor(sm->sm_dev)), 2353 devname); 2354 2355 } else if (shared->ms_flags & MDM_S_ISOPEN) { 2356 /* 2357 * Close the broken device and clear the open flag on 2358 * it. Closing the device means the RCM framework will 2359 * be able to unconfigure the device if required. 2360 * 2361 * We have to check that the device is open, otherwise 2362 * the first open on it has resulted in the error that 2363 * is being processed and the actual cd.cd_dev will be 2364 * NODEV64. 2365 * 2366 * If this is a multi-node mirror, then the multinode 2367 * state checks following this code will cause the 2368 * slave nodes to close the mirror in the function 2369 * mirror_set_state(). 
2370 */ 2371 md_layered_close(cd.cd_dev, MD_OFLG_NULL); 2372 shared->ms_flags &= ~MDM_S_ISOPEN; 2373 } 2374 2375 } else if ((origstate & CS_LAST_ERRED) && (newstate & CS_ERRED) && 2376 (shared->ms_flags & MDM_S_ISOPEN)) { 2377 /* 2378 * Similar to logic above except no log messages since we 2379 * are just transitioning from Last Erred to Erred. 2380 */ 2381 get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0, 2382 "get device", 0); 2383 (void) (*get_dev)(sm->sm_dev, sm, ci, &cd); 2384 2385 md_layered_close(cd.cd_dev, MD_OFLG_NULL); 2386 shared->ms_flags &= ~MDM_S_ISOPEN; 2387 } 2388 2389 if ((MD_MNSET_SETNO(setno)) && (origstate != newstate) && 2390 (flags & MD_STATE_XMIT) && !(ui->ui_tstate & MD_ERR_PENDING)) { 2391 /* 2392 * For a multi-node mirror, send the state change to the 2393 * master, which broadcasts to all nodes, including this 2394 * one. Once the message is received, the state is set 2395 * in-core and the master commits the change to disk. 2396 * There is a case, comp_replace, where this function 2397 * can be called from within an ioctl and therefore in this 2398 * case, as the ioctl will already be called on each node, 2399 * there is no need to xmit the state change to the master for 2400 * distribution to the other nodes. MD_STATE_XMIT flag is used 2401 * to indicate whether a xmit is required. The mirror's 2402 * transient state is set to MD_ERR_PENDING to avoid sending 2403 * multiple messages. 2404 */ 2405 if (newstate & (CS_ERRED|CS_LAST_ERRED)) 2406 ui->ui_tstate |= MD_ERR_PENDING; 2407 2408 /* 2409 * Send a state update message to all nodes. This message 2410 * will generate 2 submessages, the first one to suspend 2411 * all writes to the mirror and the second to update the 2412 * state and resume writes. 2413 */ 2414 stchmsg.msg_stch_mnum = un->c.un_self_id; 2415 stchmsg.msg_stch_sm = smi; 2416 stchmsg.msg_stch_comp = ci; 2417 stchmsg.msg_stch_new_state = newstate; 2418 stchmsg.msg_stch_hs_id = shared->ms_hs_id; 2419 #ifdef DEBUG 2420 if (mirror_debug_flag) 2421 printf("send set state, %x, %x, %x, %x, %x\n", 2422 stchmsg.msg_stch_mnum, stchmsg.msg_stch_sm, 2423 stchmsg.msg_stch_comp, stchmsg.msg_stch_new_state, 2424 stchmsg.msg_stch_hs_id); 2425 #endif 2426 if (flags & MD_STATE_WMUPDATE) { 2427 msgtype = MD_MN_MSG_STATE_UPDATE2; 2428 /* 2429 * When coming from an update of watermarks, there 2430 * must already be a message logged that triggered 2431 * this action. So, no need to log this message, too. 2432 */ 2433 msgflags = MD_MSGF_NO_LOG; 2434 } else { 2435 msgtype = MD_MN_MSG_STATE_UPDATE; 2436 msgflags = MD_MSGF_DEFAULT_FLAGS; 2437 } 2438 2439 /* 2440 * If we are in the context of an ioctl, drop the ioctl lock. 2441 * lockp holds the list of locks held. 2442 * 2443 * Otherwise, increment the appropriate reacquire counters. 2444 * If openclose lock is *held, then must reacquire reader 2445 * lock before releasing the openclose lock. 2446 * Do not drop the ARRAY_WRITER lock as we may not be able 2447 * to reacquire it. 
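 *
 * Pairing sketch (a restatement of the code below; the ordering is
 * the point being illustrated):
 *
 *	IOLOCK_RETURN_RELEASE(0, lockp);	drop ioctl locks
 *	... mdmn_ksend_message() ...		may block
 *	IOLOCK_RETURN_REACQUIRE(lockp);		regain ioctl locks
 *	lockp->l_flags |= save_lock;		restore saved ARRAY_ flag
 *
 * The non-ioctl path instead swaps the unit writerlock for a
 * readerlock (and, with MD_STATE_OCHELD, drops the openclose lock)
 * around the send, as shown below.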
2448 */ 2449 if (lockp) { 2450 if (lockp->l_flags & MD_ARRAY_WRITER) { 2451 save_lock = MD_ARRAY_WRITER; 2452 lockp->l_flags &= ~MD_ARRAY_WRITER; 2453 } else if (lockp->l_flags & MD_ARRAY_READER) { 2454 save_lock = MD_ARRAY_READER; 2455 lockp->l_flags &= ~MD_ARRAY_READER; 2456 } 2457 IOLOCK_RETURN_RELEASE(0, lockp); 2458 } else { 2459 if (flags & MD_STATE_OCHELD) { 2460 md_unit_writerexit(ui); 2461 (void) md_unit_readerlock(ui); 2462 md_unit_openclose_exit(ui); 2463 } else { 2464 md_unit_writerexit(ui); 2465 } 2466 } 2467 2468 kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); 2469 rval = mdmn_ksend_message(setno, 2470 msgtype, 2471 msgflags, 2472 (char *)&stchmsg, 2473 sizeof (stchmsg), 2474 kresult); 2475 2476 if (!MDMN_KSEND_MSG_OK(rval, kresult)) { 2477 mdmn_ksend_show_error(rval, kresult, "STATE UPDATE"); 2478 cmn_err(CE_PANIC, 2479 "ksend_message failure: STATE_UPDATE"); 2480 } 2481 kmem_free(kresult, sizeof (md_mn_kresult_t)); 2482 2483 /* if dropped the lock previously, regain it */ 2484 if (lockp) { 2485 IOLOCK_RETURN_REACQUIRE(lockp); 2486 lockp->l_flags |= save_lock; 2487 } else { 2488 /* 2489 * Reacquire dropped locks and update acquirecnts 2490 * appropriately. 2491 */ 2492 if (flags & MD_STATE_OCHELD) { 2493 /* 2494 * openclose also grabs readerlock. 2495 */ 2496 (void) md_unit_openclose_enter(ui); 2497 md_unit_readerexit(ui); 2498 (void) md_unit_writerlock(ui); 2499 } else { 2500 (void) md_unit_writerlock(ui); 2501 } 2502 } 2503 2504 ui->ui_tstate &= ~MD_ERR_PENDING; 2505 } else { 2506 shared->ms_state = newstate; 2507 uniqtime32(&shared->ms_timestamp); 2508 2509 if (newstate == CS_ERRED) 2510 shared->ms_flags |= MDM_S_NOWRITE; 2511 else 2512 shared->ms_flags &= ~MDM_S_NOWRITE; 2513 2514 shared->ms_flags &= ~MDM_S_IOERR; 2515 un->un_changecnt++; 2516 shared->ms_lasterrcnt = un->un_changecnt; 2517 2518 mirror_set_sm_state(sm, smic, SMS_RUNNING, 0); 2519 mirror_commit(un, SMI2BIT(smi), extras); 2520 } 2521 2522 if ((origstate & CS_RESYNC) && (newstate & CS_OKAY)) { 2523 /* 2524 * Resetting the Last Erred state will recursively call back 2525 * into this function (set_sm_comp_state) to update the state. 2526 */ 2527 reset_lasterred(un, smi, extras, flags, lockp); 2528 } 2529 } 2530 2531 static int 2532 find_another_logical( 2533 mm_unit_t *un, 2534 mm_submirror_t *esm, 2535 diskaddr_t blk, 2536 u_longlong_t cnt, 2537 int must_be_open, 2538 int state, 2539 int err_cnt) 2540 { 2541 u_longlong_t cando; 2542 md_dev64_t dev; 2543 md_m_shared_t *s; 2544 2545 esm->sm_state |= SMS_IGNORE; 2546 while (cnt != 0) { 2547 u_longlong_t mcnt; 2548 2549 mcnt = MIN(cnt, lbtodb(1024 * 1024 * 1024)); /* 1 Gig Blks */ 2550 2551 dev = select_read_unit(un, blk, mcnt, &cando, must_be_open, &s, 2552 NULL); 2553 if (dev == (md_dev64_t)0) 2554 break; 2555 2556 if ((state == CS_LAST_ERRED) && 2557 (s->ms_state == CS_LAST_ERRED) && 2558 (err_cnt > s->ms_lasterrcnt)) 2559 break; 2560 2561 cnt -= cando; 2562 blk += cando; 2563 } 2564 esm->sm_state &= ~SMS_IGNORE; 2565 return (cnt != 0); 2566 } 2567 2568 int 2569 mirror_other_sources(mm_unit_t *un, int smi, int ci, int must_be_open) 2570 { 2571 mm_submirror_t *sm; 2572 mm_submirror_ic_t *smic; 2573 size_t count; 2574 diskaddr_t block; 2575 u_longlong_t skip; 2576 u_longlong_t size; 2577 md_dev64_t dev; 2578 int cnt; 2579 md_m_shared_t *s; 2580 int not_found; 2581 2582 sm = &un->un_sm[smi]; 2583 smic = &un->un_smic[smi]; 2584 dev = sm->sm_dev; 2585 2586 /* 2587 * Make sure every component of the submirror 2588 * has other sources. 
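 *
 * Calling convention sketch (derived from the code below): a negative
 * ci walks every component of submirror smi, a non-negative ci checks
 * just that component, and a return of 1 means some component has no
 * other readable source, e.g.
 *
 *	if (mirror_other_sources(un, smi, -1, 0) == 1)
 *		at least one component of smi has no other source
 *
 * The example arguments are illustrative only.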
2589 */ 2590 if (ci < 0) { 2591 /* Find the highest lasterrcnt */ 2592 cnt = (*(smic->sm_get_component_count))(dev, sm); 2593 for (ci = 0; ci < cnt; ci++) { 2594 not_found = mirror_other_sources(un, smi, ci, 2595 must_be_open); 2596 if (not_found) 2597 return (1); 2598 } 2599 return (0); 2600 } 2601 2602 /* 2603 * Make sure this component has other sources 2604 */ 2605 (void) (*(smic->sm_get_bcss)) 2606 (dev, sm, ci, &block, &count, &skip, &size); 2607 2608 if (count == 0) 2609 return (1); 2610 2611 s = (md_m_shared_t *)(*(smic->sm_shared_by_indx))(dev, sm, ci); 2612 2613 while (count--) { 2614 if (block >= un->c.un_total_blocks) 2615 return (0); 2616 2617 if ((block + size) > un->c.un_total_blocks) 2618 size = un->c.un_total_blocks - block; 2619 2620 not_found = find_another_logical(un, sm, block, size, 2621 must_be_open, s->ms_state, s->ms_lasterrcnt); 2622 if (not_found) 2623 return (1); 2624 2625 block += size + skip; 2626 } 2627 return (0); 2628 } 2629 2630 static void 2631 finish_error(md_mps_t *ps) 2632 { 2633 struct buf *pb; 2634 mm_unit_t *un; 2635 mdi_unit_t *ui; 2636 uint_t new_str_flags; 2637 2638 pb = ps->ps_bp; 2639 un = ps->ps_un; 2640 ui = ps->ps_ui; 2641 2642 /* 2643 * Must flag any error to the resync originator if we're performing 2644 * a Write-after-Read. This corresponds to an i/o error on a resync 2645 * target device and in this case we ought to abort the resync as there 2646 * is nothing that can be done to recover from this without operator 2647 * intervention. If we don't set the B_ERROR flag we will continue 2648 * reading from the mirror but won't write to the target (as it will 2649 * have been placed into an errored state). 2650 * To handle the case of multiple components within a submirror we only 2651 * set the B_ERROR bit if explicitly requested to via MD_MPS_FLAG_ERROR. 2652 * The originator of the resync read will cause this bit to be set if 2653 * the underlying component count is one for a submirror resync. All 2654 * other resync types will have the flag set as there is no underlying 2655 * resync which can be performed on a contained metadevice for these 2656 * resync types (optimized or component). 2657 */ 2658 2659 if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ) { 2660 if (ps->ps_flags & MD_MPS_FLAG_ERROR) 2661 pb->b_flags |= B_ERROR; 2662 md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 2663 MPS_FREE(mirror_parent_cache, ps); 2664 md_unit_readerexit(ui); 2665 md_biodone(pb); 2666 return; 2667 } 2668 /* 2669 * Set the MD_IO_COUNTED flag as we are retrying the same I/O 2670 * operation therefore this I/O request has already been counted, 2671 * the I/O count variable will be decremented by mirror_done()'s 2672 * call to md_biodone(). 2673 */ 2674 if (ps->ps_changecnt != un->un_changecnt) { 2675 new_str_flags = MD_STR_NOTTOP | MD_IO_COUNTED; 2676 if (ps->ps_flags & MD_MPS_WOW) 2677 new_str_flags |= MD_STR_WOW; 2678 if (ps->ps_flags & MD_MPS_MAPPED) 2679 new_str_flags |= MD_STR_MAPPED; 2680 /* 2681 * If this I/O request was a read that was part of a resync, 2682 * set MD_STR_WAR for the retried read to ensure that the 2683 * resync write (i.e. 
write-after-read) will be performed 2684 */ 2685 if (ps->ps_flags & MD_MPS_RESYNC_READ) 2686 new_str_flags |= MD_STR_WAR; 2687 md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 2688 MPS_FREE(mirror_parent_cache, ps); 2689 md_unit_readerexit(ui); 2690 (void) md_mirror_strategy(pb, new_str_flags, NULL); 2691 return; 2692 } 2693 2694 pb->b_flags |= B_ERROR; 2695 md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 2696 MPS_FREE(mirror_parent_cache, ps); 2697 md_unit_readerexit(ui); 2698 md_biodone(pb); 2699 } 2700 2701 static void 2702 error_update_unit(md_mps_t *ps) 2703 { 2704 mm_unit_t *un; 2705 mdi_unit_t *ui; 2706 int smi; /* sub mirror index */ 2707 int ci; /* errored component */ 2708 set_t setno; 2709 uint_t flags; /* for set_sm_comp_state() */ 2710 uint_t hspflags; /* for check_comp_4_hotspares() */ 2711 2712 ui = ps->ps_ui; 2713 un = (mm_unit_t *)md_unit_writerlock(ui); 2714 setno = MD_UN2SET(un); 2715 2716 /* All of these updates have to propagated in case of MN set */ 2717 flags = MD_STATE_XMIT; 2718 hspflags = MD_HOTSPARE_XMIT; 2719 2720 /* special treatment if we are called during updating watermarks */ 2721 if (ps->ps_flags & MD_MPS_WMUPDATE) { 2722 flags |= MD_STATE_WMUPDATE; 2723 hspflags |= MD_HOTSPARE_WMUPDATE; 2724 } 2725 smi = 0; 2726 ci = 0; 2727 while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) { 2728 if (mirror_other_sources(un, smi, ci, 0) == 1) { 2729 2730 /* Never called from ioctl context, so (IOLOCK *)NULL */ 2731 set_sm_comp_state(un, smi, ci, CS_LAST_ERRED, 0, flags, 2732 (IOLOCK *)NULL); 2733 /* 2734 * For a MN set, the NOTIFY is done when the state 2735 * change is processed on each node 2736 */ 2737 if (!MD_MNSET_SETNO(MD_UN2SET(un))) { 2738 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED, 2739 SVM_TAG_METADEVICE, setno, MD_SID(un)); 2740 } 2741 continue; 2742 } 2743 /* Never called from ioctl context, so (IOLOCK *)NULL */ 2744 set_sm_comp_state(un, smi, ci, CS_ERRED, 0, flags, 2745 (IOLOCK *)NULL); 2746 /* 2747 * For a MN set, the NOTIFY is done when the state 2748 * change is processed on each node 2749 */ 2750 if (!MD_MNSET_SETNO(MD_UN2SET(un))) { 2751 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, 2752 SVM_TAG_METADEVICE, setno, MD_SID(un)); 2753 } 2754 smi = 0; 2755 ci = 0; 2756 } 2757 2758 md_unit_writerexit(ui); 2759 if (MD_MNSET_SETNO(setno)) { 2760 send_poke_hotspares(setno); 2761 } else { 2762 (void) poke_hotspares(); 2763 } 2764 (void) md_unit_readerlock(ui); 2765 2766 finish_error(ps); 2767 } 2768 2769 /* 2770 * When we have a B_FAILFAST IO error on a Last Erred component we need to 2771 * retry the IO without B_FAILFAST set so that we try to ensure that the 2772 * component "sees" each IO. 
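 *
 * In outline (restating the function below, to make the retry path
 * easier to follow):
 *
 *	cb->b_flags &= ~B_FAILFAST;	retry without failfast
 *	bioerror(cb, 0);		clear the stale error
 *	clear_retry_error(cb);		clear MDM_S_IOERR bookkeeping
 *	md_call_strategy(cb, flags, NULL);
 *
 * so the component is given another chance to "see" the I/O before it
 * is marked erred.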
2773 */ 2774 static void 2775 last_err_retry(md_mcs_t *cs) 2776 { 2777 struct buf *cb; 2778 md_mps_t *ps; 2779 uint_t flags; 2780 2781 cb = &cs->cs_buf; 2782 cb->b_flags &= ~B_FAILFAST; 2783 2784 /* if we're panicking just let this I/O error out */ 2785 if (panicstr) { 2786 (void) mirror_done(cb); 2787 return; 2788 } 2789 2790 /* reissue the I/O */ 2791 2792 ps = cs->cs_ps; 2793 2794 bioerror(cb, 0); 2795 2796 mutex_enter(&ps->ps_mx); 2797 2798 flags = MD_STR_NOTTOP; 2799 if (ps->ps_flags & MD_MPS_MAPPED) 2800 flags |= MD_STR_MAPPED; 2801 if (ps->ps_flags & MD_MPS_NOBLOCK) 2802 flags |= MD_NOBLOCK; 2803 2804 mutex_exit(&ps->ps_mx); 2805 2806 clear_retry_error(cb); 2807 2808 cmn_err(CE_NOTE, "!md: %s: Last Erred, retry I/O without B_FAILFAST", 2809 md_shortname(getminor(cb->b_edev))); 2810 2811 md_call_strategy(cb, flags, NULL); 2812 } 2813 2814 static void 2815 mirror_error(md_mps_t *ps) 2816 { 2817 int smi; /* sub mirror index */ 2818 int ci; /* errored component */ 2819 2820 if (panicstr) { 2821 finish_error(ps); 2822 return; 2823 } 2824 2825 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 2826 mirror_overlap_chain_remove(ps); 2827 2828 smi = 0; 2829 ci = 0; 2830 if (mirror_geterror(ps->ps_un, &smi, &ci, 0, 0) != 0) { 2831 md_unit_readerexit(ps->ps_ui); 2832 daemon_request(&md_mstr_daemon, error_update_unit, 2833 (daemon_queue_t *)ps, REQ_OLD); 2834 return; 2835 } 2836 2837 finish_error(ps); 2838 } 2839 2840 static int 2841 copy_write_done(struct buf *cb) 2842 { 2843 md_mps_t *ps; 2844 buf_t *pb; 2845 char *wowbuf; 2846 wowhdr_t *wowhdr; 2847 ssize_t wow_resid; 2848 2849 /* get wowbuf and save structure */ 2850 wowbuf = cb->b_un.b_addr; 2851 wowhdr = WOWBUF_HDR(wowbuf); 2852 ps = wowhdr->wow_ps; 2853 pb = ps->ps_bp; 2854 2855 /* Save error information, then free cb */ 2856 if (cb->b_flags & B_ERROR) 2857 pb->b_flags |= B_ERROR; 2858 2859 if (cb->b_flags & B_REMAPPED) 2860 bp_mapout(cb); 2861 2862 freerbuf(cb); 2863 2864 /* update residual and continue if needed */ 2865 if ((pb->b_flags & B_ERROR) == 0) { 2866 wow_resid = pb->b_bcount - wowhdr->wow_offset; 2867 pb->b_resid = wow_resid; 2868 if (wow_resid > 0) { 2869 daemon_request(&md_mstr_daemon, copy_write_cont, 2870 (daemon_queue_t *)wowhdr, REQ_OLD); 2871 return (1); 2872 } 2873 } 2874 2875 /* Write is complete, release resources.
*/ 2876 kmem_cache_free(mirror_wowblk_cache, wowhdr); 2877 ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP)); 2878 md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 2879 MPS_FREE(mirror_parent_cache, ps); 2880 md_biodone(pb); 2881 return (0); 2882 } 2883 2884 static void 2885 copy_write_cont(wowhdr_t *wowhdr) 2886 { 2887 buf_t *pb; 2888 buf_t *cb; 2889 char *wowbuf; 2890 int wow_offset; 2891 size_t wow_resid; 2892 diskaddr_t wow_blkno; 2893 2894 wowbuf = WOWHDR_BUF(wowhdr); 2895 pb = wowhdr->wow_ps->ps_bp; 2896 2897 /* get data on current location */ 2898 wow_offset = wowhdr->wow_offset; 2899 wow_resid = pb->b_bcount - wow_offset; 2900 wow_blkno = pb->b_lblkno + lbtodb(wow_offset); 2901 2902 /* setup child buffer */ 2903 cb = getrbuf(KM_SLEEP); 2904 cb->b_flags = B_WRITE; 2905 cb->b_edev = pb->b_edev; 2906 cb->b_un.b_addr = wowbuf; /* change to point at WOWBUF */ 2907 cb->b_bufsize = md_wowbuf_size; /* change to wowbuf_size */ 2908 cb->b_iodone = copy_write_done; 2909 cb->b_bcount = MIN(md_wowbuf_size, wow_resid); 2910 cb->b_lblkno = wow_blkno; 2911 2912 /* move offset to next section */ 2913 wowhdr->wow_offset += cb->b_bcount; 2914 2915 /* copy and setup write for current section */ 2916 bcopy(&pb->b_un.b_addr[wow_offset], wowbuf, cb->b_bcount); 2917 2918 /* do it */ 2919 /* 2920 * Do not set the MD_IO_COUNTED flag as this is a new I/O request 2921 * that handles the WOW condition. The resultant increment on the 2922 * I/O count variable is cleared by copy_write_done()'s call to 2923 * md_biodone(). 2924 */ 2925 (void) md_mirror_strategy(cb, MD_STR_NOTTOP | MD_STR_WOW 2926 | MD_STR_MAPPED, NULL); 2927 } 2928 2929 static void 2930 md_mirror_copy_write(md_mps_t *ps) 2931 { 2932 wowhdr_t *wowhdr; 2933 2934 wowhdr = kmem_cache_alloc(mirror_wowblk_cache, MD_ALLOCFLAGS); 2935 mirror_wowblk_init(wowhdr); 2936 wowhdr->wow_ps = ps; 2937 wowhdr->wow_offset = 0; 2938 copy_write_cont(wowhdr); 2939 } 2940 2941 static void 2942 handle_wow(md_mps_t *ps) 2943 { 2944 buf_t *pb; 2945 2946 pb = ps->ps_bp; 2947 2948 bp_mapin(pb); 2949 2950 md_mirror_wow_cnt++; 2951 if (!(pb->b_flags & B_PHYS) && (md_mirror_wow_flg & WOW_LOGIT)) { 2952 cmn_err(CE_NOTE, 2953 "md: %s, blk %lld, cnt %ld: Write on write %d occurred", 2954 md_shortname(getminor(pb->b_edev)), 2955 (longlong_t)pb->b_lblkno, pb->b_bcount, md_mirror_wow_cnt); 2956 } 2957 2958 /* 2959 * Set the MD_IO_COUNTED flag as we are retrying the same I/O 2960 * operation therefore this I/O request has already been counted, 2961 * the I/O count variable will be decremented by mirror_done()'s 2962 * call to md_biodone(). 2963 */ 2964 if (md_mirror_wow_flg & WOW_NOCOPY) 2965 (void) md_mirror_strategy(pb, MD_STR_NOTTOP | MD_STR_WOW | 2966 MD_STR_MAPPED | MD_IO_COUNTED, ps); 2967 else 2968 md_mirror_copy_write(ps); 2969 } 2970 2971 /* 2972 * Return true if the specified submirror is either in the Last Erred 2973 * state or is transitioning into the Last Erred state. 
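 *
 * Illustrative use (see mirror_done() below): when a failfast child
 * buffer fails, the submirror that owns it is located by matching
 * b_edev against un_sm[i].sm_dev, and the I/O is re-driven without
 * B_FAILFAST only when this predicate is true, roughly
 *
 *	if (submirror_is_lasterred(un, i))
 *		daemon_request(&md_done_daemon, last_err_retry, ...);
 *
 * The elided arguments are the child structure cast to a daemon queue
 * entry, as in mirror_done().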
2974 */ 2975 static bool_t 2976 submirror_is_lasterred(mm_unit_t *un, int smi) 2977 { 2978 mm_submirror_t *sm; 2979 mm_submirror_ic_t *smic; 2980 md_m_shared_t *shared; 2981 int ci; 2982 int compcnt; 2983 2984 sm = &un->un_sm[smi]; 2985 smic = &un->un_smic[smi]; 2986 2987 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un); 2988 for (ci = 0; ci < compcnt; ci++) { 2989 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) 2990 (sm->sm_dev, sm, ci); 2991 2992 if (shared->ms_state == CS_LAST_ERRED) 2993 return (B_TRUE); 2994 2995 /* 2996 * It is not currently Last Erred, check if entering Last Erred. 2997 */ 2998 if ((shared->ms_flags & MDM_S_IOERR) && 2999 ((shared->ms_state == CS_OKAY) || 3000 (shared->ms_state == CS_RESYNC))) { 3001 if (mirror_other_sources(un, smi, ci, 0) == 1) 3002 return (B_TRUE); 3003 } 3004 } 3005 3006 return (B_FALSE); 3007 } 3008 3009 3010 static int 3011 mirror_done(struct buf *cb) 3012 { 3013 md_mps_t *ps; 3014 md_mcs_t *cs; 3015 3016 /*LINTED*/ 3017 cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off); 3018 ps = cs->cs_ps; 3019 3020 mutex_enter(&ps->ps_mx); 3021 3022 /* check if we need to retry an errored failfast I/O */ 3023 if (cb->b_flags & B_ERROR) { 3024 struct buf *pb = ps->ps_bp; 3025 3026 if (cb->b_flags & B_FAILFAST) { 3027 int i; 3028 mm_unit_t *un = ps->ps_un; 3029 3030 for (i = 0; i < NMIRROR; i++) { 3031 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) 3032 continue; 3033 3034 if (cb->b_edev == 3035 md_dev64_to_dev(un->un_sm[i].sm_dev)) { 3036 3037 /* 3038 * This is the submirror that had the 3039 * error. Check if it is Last Erred. 3040 */ 3041 if (submirror_is_lasterred(un, i)) { 3042 daemon_queue_t *dqp; 3043 3044 mutex_exit(&ps->ps_mx); 3045 dqp = (daemon_queue_t *)cs; 3046 dqp->dq_prev = NULL; 3047 dqp->dq_next = NULL; 3048 daemon_request(&md_done_daemon, 3049 last_err_retry, dqp, 3050 REQ_OLD); 3051 return (1); 3052 } 3053 break; 3054 } 3055 } 3056 } 3057 3058 /* continue to process the buf without doing a retry */ 3059 ps->ps_flags |= MD_MPS_ERROR; 3060 pb->b_error = cb->b_error; 3061 } 3062 3063 return (mirror_done_common(cb)); 3064 } 3065 3066 /* 3067 * Split from the original mirror_done function so we can handle bufs after a 3068 * retry. 3069 * ps->ps_mx is already held in the caller of this function and the cb error 3070 * has already been checked and handled in the caller. 3071 */ 3072 static int 3073 mirror_done_common(struct buf *cb) 3074 { 3075 struct buf *pb; 3076 mm_unit_t *un; 3077 mdi_unit_t *ui; 3078 md_mps_t *ps; 3079 md_mcs_t *cs; 3080 size_t end_rr, start_rr, current_rr; 3081 3082 /*LINTED*/ 3083 cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off); 3084 ps = cs->cs_ps; 3085 pb = ps->ps_bp; 3086 3087 if (cb->b_flags & B_REMAPPED) 3088 bp_mapout(cb); 3089 3090 ps->ps_frags--; 3091 if (ps->ps_frags != 0) { 3092 mutex_exit(&ps->ps_mx); 3093 kmem_cache_free(mirror_child_cache, cs); 3094 return (1); 3095 } 3096 un = ps->ps_un; 3097 ui = ps->ps_ui; 3098 3099 /* 3100 * Do not update outstanding_writes if we're running with ABR 3101 * set for this mirror or the write() was issued with MD_STR_ABR set. 3102 * Also a resync initiated write() has no outstanding_writes update 3103 * either. 
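 *
 * Bookkeeping sketch (the loop below in symbolic form): every dirty
 * region spanned by the request,
 *
 *	BLK_TO_RR(start_rr, ps->ps_firstblk, un);
 *	BLK_TO_RR(end_rr, ps->ps_lastblk, un);
 *	for (rr = start_rr; rr <= end_rr; rr++)
 *		un->un_outstanding_writes[rr]--;
 *
 * is decremented under un_resync_mx. The matching increment is assumed
 * to have been done when the write was marked in the resync record;
 * that code is not shown here.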
3104 */ 3105 if (((cb->b_flags & B_READ) == 0) && 3106 (un->un_nsm >= 2) && 3107 (ps->ps_call == NULL) && 3108 !((ui->ui_tstate & MD_ABR_CAP) || (ps->ps_flags & MD_MPS_ABR)) && 3109 !(ps->ps_flags & MD_MPS_WRITE_AFTER_READ)) { 3110 BLK_TO_RR(end_rr, ps->ps_lastblk, un); 3111 BLK_TO_RR(start_rr, ps->ps_firstblk, un); 3112 mutex_enter(&un->un_resync_mx); 3113 for (current_rr = start_rr; current_rr <= end_rr; current_rr++) 3114 un->un_outstanding_writes[current_rr]--; 3115 mutex_exit(&un->un_resync_mx); 3116 } 3117 kmem_cache_free(mirror_child_cache, cs); 3118 mutex_exit(&ps->ps_mx); 3119 3120 if (ps->ps_call != NULL) { 3121 daemon_request(&md_done_daemon, ps->ps_call, 3122 (daemon_queue_t *)ps, REQ_OLD); 3123 return (1); 3124 } 3125 3126 if ((ps->ps_flags & MD_MPS_ERROR)) { 3127 daemon_request(&md_done_daemon, mirror_error, 3128 (daemon_queue_t *)ps, REQ_OLD); 3129 return (1); 3130 } 3131 3132 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 3133 mirror_overlap_chain_remove(ps); 3134 3135 /* 3136 * Handle Write-on-Write problem. 3137 * Skip In case of Raw and Direct I/O as they are 3138 * handled earlier. 3139 * 3140 */ 3141 if (!(md_mirror_wow_flg & WOW_DISABLE) && 3142 !(pb->b_flags & B_READ) && 3143 !(ps->ps_flags & MD_MPS_WOW) && 3144 !(pb->b_flags & B_PHYS) && 3145 any_pages_dirty(pb)) { 3146 md_unit_readerexit(ps->ps_ui); 3147 daemon_request(&md_mstr_daemon, handle_wow, 3148 (daemon_queue_t *)ps, REQ_OLD); 3149 return (1); 3150 } 3151 3152 md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 3153 MPS_FREE(mirror_parent_cache, ps); 3154 md_unit_readerexit(ui); 3155 md_biodone(pb); 3156 return (0); 3157 } 3158 3159 /* 3160 * Clear error state in submirror component if the retry worked after 3161 * a failfast error. 3162 */ 3163 static void 3164 clear_retry_error(struct buf *cb) 3165 { 3166 int smi; 3167 md_mcs_t *cs; 3168 mm_unit_t *un; 3169 mdi_unit_t *ui_sm; 3170 mm_submirror_t *sm; 3171 mm_submirror_ic_t *smic; 3172 u_longlong_t cnt; 3173 md_m_shared_t *shared; 3174 3175 /*LINTED*/ 3176 cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off); 3177 un = cs->cs_ps->ps_un; 3178 3179 for (smi = 0; smi < NMIRROR; smi++) { 3180 if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) 3181 continue; 3182 3183 if (cb->b_edev == md_dev64_to_dev(un->un_sm[smi].sm_dev)) { 3184 break; 3185 } 3186 } 3187 3188 if (smi >= NMIRROR) 3189 return; 3190 3191 sm = &un->un_sm[smi]; 3192 smic = &un->un_smic[smi]; 3193 cnt = cb->b_bcount; 3194 3195 ui_sm = MDI_UNIT(getminor(cb->b_edev)); 3196 (void) md_unit_writerlock(ui_sm); 3197 3198 shared = (md_m_shared_t *)(*(smic->sm_shared_by_blk))(sm->sm_dev, sm, 3199 cb->b_blkno, &cnt); 3200 3201 if (shared->ms_flags & MDM_S_IOERR) { 3202 shared->ms_flags &= ~MDM_S_IOERR; 3203 3204 } else { 3205 /* the I/O buf spans components and the first one is not erred */ 3206 int cnt; 3207 int i; 3208 3209 cnt = (*(smic->sm_get_component_count))(sm->sm_dev, un); 3210 for (i = 0; i < cnt; i++) { 3211 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) 3212 (sm->sm_dev, sm, i); 3213 3214 if (shared->ms_flags & MDM_S_IOERR && 3215 shared->ms_state == CS_OKAY) { 3216 3217 shared->ms_flags &= ~MDM_S_IOERR; 3218 break; 3219 } 3220 } 3221 } 3222 3223 md_unit_writerexit(ui_sm); 3224 } 3225 3226 static size_t 3227 mirror_map_read( 3228 md_mps_t *ps, 3229 md_mcs_t *cs, 3230 diskaddr_t blkno, 3231 u_longlong_t count 3232 ) 3233 { 3234 mm_unit_t *un; 3235 buf_t *bp; 3236 u_longlong_t cando; 3237 3238 bp = &cs->cs_buf; 3239 un = ps->ps_un; 3240 3241 bp->b_lblkno = blkno; 3242 if 
(fast_select_read_unit(ps, cs) == 0) { 3243 bp->b_bcount = ldbtob(count); 3244 return (0); 3245 } 3246 bp->b_edev = md_dev64_to_dev(select_read_unit(un, blkno, count, &cando, 3247 0, NULL, cs)); 3248 bp->b_bcount = ldbtob(cando); 3249 if (count != cando) 3250 return (cando); 3251 return (0); 3252 } 3253 3254 static void 3255 write_after_read(md_mps_t *ps) 3256 { 3257 struct buf *pb; 3258 int flags; 3259 3260 if (ps->ps_flags & MD_MPS_ERROR) { 3261 mirror_error(ps); 3262 return; 3263 } 3264 3265 pb = ps->ps_bp; 3266 md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 3267 ps->ps_call = NULL; 3268 ps->ps_flags |= MD_MPS_WRITE_AFTER_READ; 3269 flags = MD_STR_NOTTOP | MD_STR_WAR; 3270 if (ps->ps_flags & MD_MPS_MAPPED) 3271 flags |= MD_STR_MAPPED; 3272 if (ps->ps_flags & MD_MPS_NOBLOCK) 3273 flags |= MD_NOBLOCK; 3274 if (ps->ps_flags & MD_MPS_DIRTY_RD) 3275 flags |= MD_STR_DIRTY_RD; 3276 (void) mirror_write_strategy(pb, flags, ps); 3277 } 3278 3279 static void 3280 continue_serial(md_mps_t *ps) 3281 { 3282 md_mcs_t *cs; 3283 buf_t *cb; 3284 mm_unit_t *un; 3285 int flags; 3286 3287 un = ps->ps_un; 3288 cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS); 3289 mirror_child_init(cs); 3290 cb = &cs->cs_buf; 3291 ps->ps_call = NULL; 3292 ps->ps_frags = 1; 3293 (void) mirror_map_write(un, cs, ps, 0); 3294 flags = MD_STR_NOTTOP; 3295 if (ps->ps_flags & MD_MPS_MAPPED) 3296 flags |= MD_STR_MAPPED; 3297 md_call_strategy(cb, flags, NULL); 3298 } 3299 3300 static int 3301 mirror_map_write(mm_unit_t *un, md_mcs_t *cs, md_mps_t *ps, int war) 3302 { 3303 int i; 3304 dev_t dev; /* needed for bioclone, so not md_dev64_t */ 3305 buf_t *cb; 3306 buf_t *pb; 3307 diskaddr_t blkno; 3308 size_t bcount; 3309 off_t offset; 3310 3311 pb = ps->ps_bp; 3312 cb = &cs->cs_buf; 3313 cs->cs_ps = ps; 3314 3315 i = md_find_nth_unit(ps->ps_writable_sm, ps->ps_current_sm); 3316 3317 dev = md_dev64_to_dev(un->un_sm[i].sm_dev); 3318 3319 blkno = pb->b_lblkno; 3320 bcount = pb->b_bcount; 3321 offset = 0; 3322 if (war && (blkno == 0) && (un->c.un_flag & MD_LABELED)) { 3323 blkno = DK_LABEL_LOC + 1; 3324 /* 3325 * This handles the case where we're requesting 3326 * a write to block 0 on a label partition 3327 * and the request size was smaller than the 3328 * size of the label. If this is the case 3329 * then we'll return -1. Failure to do so will 3330 * either cause the calling thread to hang due to 3331 * an ssd bug, or worse if the bcount were allowed 3332 * to go negative (ie large). 3333 */ 3334 if (bcount <= DEV_BSIZE*(DK_LABEL_LOC + 1)) 3335 return (-1); 3336 bcount -= (DEV_BSIZE*(DK_LABEL_LOC + 1)); 3337 offset = (DEV_BSIZE*(DK_LABEL_LOC + 1)); 3338 } 3339 3340 cb = md_bioclone(pb, offset, bcount, dev, blkno, mirror_done, 3341 cb, KM_NOSLEEP); 3342 if (war) 3343 cb->b_flags = (cb->b_flags & ~B_READ) | B_WRITE; 3344 3345 /* 3346 * If the submirror is in the erred stated, check if any component is 3347 * in the Last Erred state. If so, we don't want to use the B_FAILFAST 3348 * flag on the IO. 3349 * 3350 * Provide a fast path for the non-erred case (which should be the 3351 * normal case). 
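 *
 * Equivalent condition (a restatement of the code below): B_FAILFAST
 * is set on the child buf only when
 *
 *	(un->un_sm[i].sm_flags & MD_SM_FAILFAST) &&
 *	(!(un->un_sm[i].sm_state & SMS_COMP_ERRED) ||
 *	    no component of submirror i is in CS_LAST_ERRED)
 *
 * where the final clause is the ci loop just below.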
3352 */ 3353 if (un->un_sm[i].sm_flags & MD_SM_FAILFAST) { 3354 if (un->un_sm[i].sm_state & SMS_COMP_ERRED) { 3355 mm_submirror_t *sm; 3356 mm_submirror_ic_t *smic; 3357 int ci; 3358 int compcnt; 3359 3360 sm = &un->un_sm[i]; 3361 smic = &un->un_smic[i]; 3362 3363 compcnt = (*(smic->sm_get_component_count)) 3364 (sm->sm_dev, un); 3365 for (ci = 0; ci < compcnt; ci++) { 3366 md_m_shared_t *shared; 3367 3368 shared = (md_m_shared_t *) 3369 (*(smic->sm_shared_by_indx))(sm->sm_dev, 3370 sm, ci); 3371 3372 if (shared->ms_state == CS_LAST_ERRED) 3373 break; 3374 } 3375 if (ci >= compcnt) 3376 cb->b_flags |= B_FAILFAST; 3377 3378 } else { 3379 cb->b_flags |= B_FAILFAST; 3380 } 3381 } 3382 3383 ps->ps_current_sm++; 3384 if (ps->ps_current_sm != ps->ps_active_cnt) { 3385 if (un->un_write_option == WR_SERIAL) { 3386 ps->ps_call = continue_serial; 3387 return (0); 3388 } 3389 return (1); 3390 } 3391 return (0); 3392 } 3393 3394 /* 3395 * directed_read_done: 3396 * ------------------ 3397 * Completion routine called when a DMR request has been returned from the 3398 * underlying driver. Wake-up the original ioctl() and return the data to 3399 * the user. 3400 */ 3401 static void 3402 directed_read_done(md_mps_t *ps) 3403 { 3404 mm_unit_t *un; 3405 mdi_unit_t *ui; 3406 3407 un = ps->ps_un; 3408 ui = ps->ps_ui; 3409 3410 md_unit_readerexit(ui); 3411 md_kstat_done(ui, ps->ps_bp, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 3412 ps->ps_call = NULL; 3413 3414 mutex_enter(&un->un_dmr_mx); 3415 cv_signal(&un->un_dmr_cv); 3416 mutex_exit(&un->un_dmr_mx); 3417 3418 /* release the parent structure */ 3419 kmem_cache_free(mirror_parent_cache, ps); 3420 } 3421 3422 /* 3423 * daemon_io: 3424 * ------------ 3425 * Called to issue a mirror_write_strategy() or mirror_read_strategy 3426 * call from a blockable context. NOTE: no mutex can be held on entry to this 3427 * routine 3428 */ 3429 static void 3430 daemon_io(daemon_queue_t *dq) 3431 { 3432 md_mps_t *ps = (md_mps_t *)dq; 3433 int flag = MD_STR_NOTTOP; 3434 buf_t *pb = ps->ps_bp; 3435 3436 if (ps->ps_flags & MD_MPS_MAPPED) 3437 flag |= MD_STR_MAPPED; 3438 if (ps->ps_flags & MD_MPS_WOW) 3439 flag |= MD_STR_WOW; 3440 if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ) 3441 flag |= MD_STR_WAR; 3442 if (ps->ps_flags & MD_MPS_ABR) 3443 flag |= MD_STR_ABR; 3444 3445 /* 3446 * If this is a resync read, ie MD_STR_DIRTY_RD not set, set 3447 * MD_STR_WAR before calling mirror_read_strategy 3448 */ 3449 if (pb->b_flags & B_READ) { 3450 if (!(ps->ps_flags & MD_MPS_DIRTY_RD)) 3451 flag |= MD_STR_WAR; 3452 mirror_read_strategy(pb, flag, ps); 3453 } else 3454 mirror_write_strategy(pb, flag, ps); 3455 } 3456 3457 /* 3458 * update_resync: 3459 * ------------- 3460 * Called to update the in-core version of the resync record with the latest 3461 * version that was committed to disk when the previous mirror owner 3462 * relinquished ownership. This call is likely to block as we must hold-off 3463 * any current resync processing that may be occurring. 3464 * On completion of the resync record update we issue the mirror_write_strategy 3465 * call to complete the i/o that first started this sequence. To remove a race 3466 * condition between a new write() request which is submitted and the resync 3467 * record update we acquire the writerlock. This will hold off all i/o to the 3468 * mirror until the resync update has completed. 
3469 * NOTE: no mutex can be held on entry to this routine 3470 */ 3471 static void 3472 update_resync(daemon_queue_t *dq) 3473 { 3474 md_mps_t *ps = (md_mps_t *)dq; 3475 buf_t *pb = ps->ps_bp; 3476 mdi_unit_t *ui = ps->ps_ui; 3477 mm_unit_t *un; 3478 set_t setno; 3479 int restart_resync; 3480 3481 un = md_unit_writerlock(ui); 3482 ps->ps_un = un; 3483 setno = MD_MIN2SET(getminor(pb->b_edev)); 3484 if (mddb_reread_rr(setno, un->un_rr_dirty_recid) == 0) { 3485 /* 3486 * Synchronize our in-core view of what regions need to be 3487 * resync'd with the on-disk version. 3488 */ 3489 mutex_enter(&un->un_rrp_inflight_mx); 3490 mirror_copy_rr(howmany(un->un_rrd_num, NBBY), un->un_resync_bm, 3491 un->un_dirty_bm); 3492 mutex_exit(&un->un_rrp_inflight_mx); 3493 3494 /* Region dirty map is now up to date */ 3495 } 3496 restart_resync = (un->un_rs_thread_flags & MD_RI_BLOCK_OWNER) ? 1 : 0; 3497 md_unit_writerexit(ui); 3498 3499 /* Restart the resync thread if it was previously blocked */ 3500 if (restart_resync) { 3501 mutex_enter(&un->un_rs_thread_mx); 3502 un->un_rs_thread_flags &= ~MD_RI_BLOCK_OWNER; 3503 cv_signal(&un->un_rs_thread_cv); 3504 mutex_exit(&un->un_rs_thread_mx); 3505 } 3506 /* Continue with original deferred i/o */ 3507 daemon_io(dq); 3508 } 3509 3510 /* 3511 * owner_timeout: 3512 * ------------- 3513 * Called if the original mdmn_ksend_message() failed and the request is to be 3514 * retried. Reattempt the original ownership change. 3515 * 3516 * NOTE: called at interrupt context (see timeout(9f)). 3517 */ 3518 static void 3519 owner_timeout(void *arg) 3520 { 3521 daemon_queue_t *dq = (daemon_queue_t *)arg; 3522 3523 daemon_request(&md_mirror_daemon, become_owner, dq, REQ_OLD); 3524 } 3525 3526 /* 3527 * become_owner: 3528 * ------------ 3529 * Called to issue RPC request to become the owner of the mirror 3530 * associated with this i/o request. We assume that the ownership request 3531 * is synchronous, so if it succeeds we will issue the request via 3532 * mirror_write_strategy(). 3533 * If multiple i/o's are outstanding we will be called from the mirror_daemon 3534 * service thread. 3535 * NOTE: no mutex should be held on entry to this routine. 3536 */ 3537 static void 3538 become_owner(daemon_queue_t *dq) 3539 { 3540 md_mps_t *ps = (md_mps_t *)dq; 3541 mm_unit_t *un = ps->ps_un; 3542 buf_t *pb = ps->ps_bp; 3543 set_t setno; 3544 md_mn_kresult_t *kres; 3545 int msg_flags = md_mirror_msg_flags; 3546 md_mps_t *ps1; 3547 3548 ASSERT(dq->dq_next == NULL && dq->dq_prev == NULL); 3549 3550 /* 3551 * If we're already the mirror owner we do not need to send a message 3552 * but can simply process the i/o request immediately. 3553 * If we've already sent the request to become owner we requeue the 3554 * request as we're waiting for the synchronous ownership message to 3555 * be processed. 3556 */ 3557 if (MD_MN_MIRROR_OWNER(un)) { 3558 /* 3559 * As the strategy() call will potentially block we need to 3560 * punt this to a separate thread and complete this request 3561 * as quickly as possible. Note: if we're a read request 3562 * this must be a resync, we cannot afford to be queued 3563 * behind any intervening i/o requests. In this case we put the 3564 * request on the md_mirror_rs_daemon queue. 
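 *
 * Queue selection sketch (restating the dispatch just below):
 *
 *	resync read   -> md_mirror_rs_daemon -> daemon_io()
 *	other request -> md_mirror_io_daemon -> daemon_io()
 *
 * daemon_io() then re-enters mirror_read_strategy() or
 * mirror_write_strategy() with MD_STR_NOTTOP set.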
3565 */ 3566 if (pb->b_flags & B_READ) { 3567 daemon_request(&md_mirror_rs_daemon, daemon_io, dq, 3568 REQ_OLD); 3569 } else { 3570 daemon_request(&md_mirror_io_daemon, daemon_io, dq, 3571 REQ_OLD); 3572 } 3573 } else { 3574 mutex_enter(&un->un_owner_mx); 3575 if ((un->un_owner_state & MM_MN_OWNER_SENT) == 0) { 3576 md_mn_req_owner_t *msg; 3577 int rval = 0; 3578 3579 /* 3580 * Check to see that we haven't exceeded the maximum 3581 * retry count. If we have we fail the i/o as the 3582 * comms mechanism has become wedged beyond recovery. 3583 */ 3584 if (dq->qlen++ >= MD_OWNER_RETRIES) { 3585 mutex_exit(&un->un_owner_mx); 3586 cmn_err(CE_WARN, 3587 "md_mirror: Request exhausted ownership " 3588 "retry limit of %d attempts", dq->qlen); 3589 pb->b_error = EIO; 3590 pb->b_flags |= B_ERROR; 3591 pb->b_resid = pb->b_bcount; 3592 kmem_cache_free(mirror_parent_cache, ps); 3593 md_biodone(pb); 3594 return; 3595 } 3596 3597 /* 3598 * Issue request to change ownership. The call is 3599 * synchronous so when it returns we can complete the 3600 * i/o (if successful), or enqueue it again so that 3601 * the operation will be retried. 3602 */ 3603 un->un_owner_state |= MM_MN_OWNER_SENT; 3604 mutex_exit(&un->un_owner_mx); 3605 3606 msg = kmem_zalloc(sizeof (md_mn_req_owner_t), KM_SLEEP); 3607 setno = MD_MIN2SET(getminor(pb->b_edev)); 3608 msg->mnum = MD_SID(un); 3609 msg->owner = md_mn_mynode_id; 3610 msg_flags |= MD_MSGF_NO_LOG; 3611 /* 3612 * If this IO is triggered by updating a watermark, 3613 * it might be issued by the creation of a softpartition 3614 * while the commd subsystem is suspended. 3615 * We don't want this message to block. 3616 */ 3617 if (ps->ps_flags & MD_MPS_WMUPDATE) { 3618 msg_flags |= MD_MSGF_OVERRIDE_SUSPEND; 3619 } 3620 3621 kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); 3622 rval = mdmn_ksend_message(setno, 3623 MD_MN_MSG_REQUIRE_OWNER, 3624 msg_flags, /* flags */ 3625 (char *)msg, 3626 sizeof (md_mn_req_owner_t), 3627 kres); 3628 3629 kmem_free(msg, sizeof (md_mn_req_owner_t)); 3630 3631 if (MDMN_KSEND_MSG_OK(rval, kres)) { 3632 dq->qlen = 0; 3633 /* 3634 * Successfully changed owner, reread the 3635 * resync record so that we have a valid idea of 3636 * any previously committed incomplete write()s. 3637 * NOTE: As we need to acquire the resync mutex 3638 * this may block, so we defer it to a separate 3639 * thread handler. This makes us (effectively) 3640 * non-blocking once the ownership message 3641 * handling has completed. 3642 */ 3643 mutex_enter(&un->un_owner_mx); 3644 if (un->un_owner_state & MM_MN_BECOME_OWNER) { 3645 un->un_mirror_owner = md_mn_mynode_id; 3646 /* Sets owner of un_rr_dirty record */ 3647 if (un->un_rr_dirty_recid) 3648 (void) mddb_setowner( 3649 un->un_rr_dirty_recid, 3650 md_mn_mynode_id); 3651 un->un_owner_state &= 3652 ~MM_MN_BECOME_OWNER; 3653 /* 3654 * Release the block on the current 3655 * resync region if it is blocked 3656 */ 3657 ps1 = un->un_rs_prev_ovrlap; 3658 if ((ps1 != NULL) && 3659 (ps1->ps_flags & MD_MPS_ON_OVERLAP)) 3660 mirror_overlap_chain_remove( 3661 ps1); 3662 mutex_exit(&un->un_owner_mx); 3663 3664 /* 3665 * If we're a read, this must be a 3666 * resync request, issue 3667 * the i/o request on the 3668 * md_mirror_rs_daemon queue. This is 3669 * to avoid a deadlock between the 3670 * resync_unit thread and 3671 * subsequent i/o requests that may 3672 * block on the resync region. 
3673 */ 3674 if (pb->b_flags & B_READ) { 3675 daemon_request( 3676 &md_mirror_rs_daemon, 3677 update_resync, dq, REQ_OLD); 3678 } else { 3679 daemon_request( 3680 &md_mirror_io_daemon, 3681 update_resync, dq, REQ_OLD); 3682 } 3683 kmem_free(kres, 3684 sizeof (md_mn_kresult_t)); 3685 return; 3686 } else { 3687 /* 3688 * Some other node has beaten us to 3689 * obtain ownership. We need to 3690 * reschedule our ownership request 3691 */ 3692 mutex_exit(&un->un_owner_mx); 3693 } 3694 } else { 3695 mdmn_ksend_show_error(rval, kres, 3696 "MD_MN_MSG_REQUIRE_OWNER"); 3697 /* 3698 * Message transport failure is handled by the 3699 * comms layer. If the ownership change request 3700 * does not succeed we need to flag the error to 3701 * the initiator of the i/o. This is handled by 3702 * the retry logic above. As the request failed 3703 * we do not know _who_ the owner of the mirror 3704 * currently is. We reset our idea of the owner 3705 * to None so that any further write()s will 3706 * attempt to become the owner again. This stops 3707 * multiple nodes writing to the same mirror 3708 * simultaneously. 3709 */ 3710 mutex_enter(&un->un_owner_mx); 3711 un->un_owner_state &= 3712 ~(MM_MN_OWNER_SENT|MM_MN_BECOME_OWNER); 3713 un->un_mirror_owner = MD_MN_MIRROR_UNOWNED; 3714 mutex_exit(&un->un_owner_mx); 3715 } 3716 kmem_free(kres, sizeof (md_mn_kresult_t)); 3717 } else 3718 mutex_exit(&un->un_owner_mx); 3719 3720 /* 3721 * Re-enqueue this request on the deferred i/o list. Delay the 3722 * request for md_mirror_owner_to usecs to stop thrashing. 3723 */ 3724 (void) timeout(owner_timeout, dq, 3725 drv_usectohz(md_mirror_owner_to)); 3726 } 3727 } 3728 3729 static void 3730 mirror_write_strategy(buf_t *pb, int flag, void *private) 3731 { 3732 md_mps_t *ps; 3733 md_mcs_t *cs; 3734 int more; 3735 mm_unit_t *un; 3736 mdi_unit_t *ui; 3737 buf_t *cb; /* child buf pointer */ 3738 set_t setno; 3739 int rs_on_overlap = 0; 3740 3741 ui = MDI_UNIT(getminor(pb->b_edev)); 3742 un = (mm_unit_t *)MD_UNIT(getminor(pb->b_edev)); 3743 3744 3745 md_kstat_waitq_enter(ui); 3746 3747 /* 3748 * If a state change is in progress for this mirror in a MN set, 3749 * suspend all non-resync writes until the state change is complete. 3750 * The objective of this suspend is to ensure that it is not 3751 * possible for one node to read data from a submirror that another node 3752 * has not written to because of the state change. Therefore we 3753 * suspend all writes until the state change has been made. As it is 3754 * not possible to read from the target of a resync, there is no need 3755 * to suspend resync writes. 
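 *
 * The wait side of that handshake appears just below; the matching
 * resume, assumed to run once the state change completes (it is not
 * shown in this section), would take the form
 *
 *	mutex_enter(&un->un_suspend_wr_mx);
 *	un->un_suspend_wr_flag = 0;
 *	cv_broadcast(&un->un_suspend_wr_cv);
 *	mutex_exit(&un->un_suspend_wr_mx);
 *
 * Only the field, mutex and condvar names are taken from this file.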
3756 */ 3757 3758 if (!(flag & MD_STR_WAR)) { 3759 mutex_enter(&un->un_suspend_wr_mx); 3760 while (un->un_suspend_wr_flag) { 3761 cv_wait(&un->un_suspend_wr_cv, &un->un_suspend_wr_mx); 3762 } 3763 mutex_exit(&un->un_suspend_wr_mx); 3764 (void) md_unit_readerlock(ui); 3765 } 3766 3767 if (!(flag & MD_STR_NOTTOP)) { 3768 if (md_checkbuf(ui, (md_unit_t *)un, pb)) { 3769 md_kstat_waitq_exit(ui); 3770 return; 3771 } 3772 } 3773 3774 setno = MD_MIN2SET(getminor(pb->b_edev)); 3775 3776 /* If an ABR write has been requested, set MD_STR_ABR flag */ 3777 if (MD_MNSET_SETNO(setno) && (pb->b_flags & B_ABRWRITE)) 3778 flag |= MD_STR_ABR; 3779 3780 if (private == NULL) { 3781 ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS); 3782 mirror_parent_init(ps); 3783 } else { 3784 ps = private; 3785 private = NULL; 3786 } 3787 if (flag & MD_STR_MAPPED) 3788 ps->ps_flags |= MD_MPS_MAPPED; 3789 3790 if (flag & MD_STR_WOW) 3791 ps->ps_flags |= MD_MPS_WOW; 3792 3793 if (flag & MD_STR_ABR) 3794 ps->ps_flags |= MD_MPS_ABR; 3795 3796 if (flag & MD_STR_WMUPDATE) 3797 ps->ps_flags |= MD_MPS_WMUPDATE; 3798 3799 /* 3800 * Save essential information from the original buffhdr 3801 * in the md_save structure. 3802 */ 3803 ps->ps_un = un; 3804 ps->ps_ui = ui; 3805 ps->ps_bp = pb; 3806 ps->ps_addr = pb->b_un.b_addr; 3807 ps->ps_firstblk = pb->b_lblkno; 3808 ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1; 3809 ps->ps_changecnt = un->un_changecnt; 3810 3811 /* 3812 * If not MN owner and this is an ABR write, make sure the current 3813 * resync region is on the overlaps chain 3814 */ 3815 mutex_enter(&un->un_owner_mx); 3816 if (MD_MNSET_SETNO(setno) && (!(MD_MN_MIRROR_OWNER(un))) && 3817 ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) { 3818 md_mps_t *ps1; 3819 /* Block the current resync region, if not already blocked */ 3820 ps1 = un->un_rs_prev_ovrlap; 3821 3822 if ((ps1 != NULL) && ((ps1->ps_firstblk != 0) || 3823 (ps1->ps_lastblk != 0))) { 3824 /* Drop locks to avoid deadlock */ 3825 mutex_exit(&un->un_owner_mx); 3826 md_unit_readerexit(ui); 3827 wait_for_overlaps(ps1, MD_OVERLAP_ALLOW_REPEAT); 3828 rs_on_overlap = 1; 3829 (void) md_unit_readerlock(ui); 3830 mutex_enter(&un->un_owner_mx); 3831 /* 3832 * Check to see if we have obtained ownership 3833 * while waiting for overlaps. If we have, remove 3834 * the resync_region entry from the overlap chain 3835 */ 3836 if (MD_MN_MIRROR_OWNER(un) && 3837 (ps1->ps_flags & MD_MPS_ON_OVERLAP)) { 3838 mirror_overlap_chain_remove(ps1); 3839 rs_on_overlap = 0; 3840 } 3841 } 3842 } 3843 mutex_exit(&un->un_owner_mx); 3844 3845 3846 /* 3847 * The following keeps a write-after-read from writing to the 3848 * source in the case where it all came from one place. 3849 */ 3850 if (flag & MD_STR_WAR) { 3851 int abort_write = 0; 3852 /* 3853 * We are performing a write-after-read. This is either as a 3854 * result of a resync read or as a result of a read in a 3855 * dirty resync region when the optimized resync is not 3856 * complete.
If in a MN set and this is a resync-generated i/o, 3857 * terminate the write if the current block is not in the current 3858 * resync region, as another node must have 3859 * completed this resync region. 3860 */ 3861 if ((MD_MNSET_SETNO(MD_UN2SET(un))) && 3862 !(flag & MD_STR_DIRTY_RD)) { 3863 if (!IN_RESYNC_REGION(un, ps)) 3864 abort_write = 1; 3865 } 3866 if ((select_write_after_read_units(un, ps) == 0) || 3867 (abort_write)) { 3868 #ifdef DEBUG 3869 if (mirror_debug_flag) 3870 printf("Abort resync write on %x, block %lld\n", 3871 MD_SID(un), ps->ps_firstblk); 3872 #endif 3873 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 3874 mirror_overlap_chain_remove(ps); 3875 kmem_cache_free(mirror_parent_cache, ps); 3876 md_kstat_waitq_exit(ui); 3877 md_unit_readerexit(ui); 3878 md_biodone(pb); 3879 return; 3880 } 3881 } else { 3882 select_write_units(un, ps); 3883 3884 /* Drop readerlock to avoid deadlock */ 3885 md_unit_readerexit(ui); 3886 wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT); 3887 un = md_unit_readerlock(ui); 3888 /* 3889 * For a MN set with an ABR write, if we are now the 3890 * owner and we have a resync region on the overlap 3891 * chain, remove the entry from overlaps and retry the write. 3892 */ 3893 3894 if (MD_MNSET_SETNO(setno) && 3895 ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) { 3896 mutex_enter(&un->un_owner_mx); 3897 if (((MD_MN_MIRROR_OWNER(un))) && rs_on_overlap) { 3898 mirror_overlap_chain_remove(ps); 3899 md_kstat_waitq_exit(ui); 3900 mutex_exit(&un->un_owner_mx); 3901 md_unit_readerexit(ui); 3902 daemon_request(&md_mirror_daemon, daemon_io, 3903 (daemon_queue_t *)ps, REQ_OLD); 3904 return; 3905 } 3906 mutex_exit(&un->un_owner_mx); 3907 } 3908 } 3909 3910 /* 3911 * For Multinode mirrors with a Resync Region (not ABR) we need to 3912 * become the mirror owner before continuing with the write(). For ABR 3913 * mirrors we check that we 'own' the resync if we're in 3914 * write-after-read mode. We do this _after_ ensuring that there are no 3915 * overlaps to ensure that once we know that we are the owner, the 3916 * readerlock will not be released until the write is complete. As a 3917 * change of ownership in a MN set requires the writerlock, this 3918 * ensures that ownership cannot be changed until the write is 3919 * complete. 3920 */ 3921 if (MD_MNSET_SETNO(setno) && (!((ui->ui_tstate & MD_ABR_CAP) || 3922 (flag & MD_STR_ABR)) || (flag & MD_STR_WAR))) { 3923 if (!MD_MN_MIRROR_OWNER(un)) { 3924 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 3925 mirror_overlap_chain_remove(ps); 3926 md_kstat_waitq_exit(ui); 3927 ASSERT(!(flag & MD_STR_WAR)); 3928 md_unit_readerexit(ui); 3929 daemon_request(&md_mirror_daemon, become_owner, 3930 (daemon_queue_t *)ps, REQ_OLD); 3931 return; 3932 } 3933 } 3934 3935 /* 3936 * Mark resync region if mirror has a Resync Region _and_ we are not 3937 * a resync initiated write(). Don't mark region if we're flagged as 3938 * an ABR write.
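 *
 * Condition sketch (equivalent to the test below): the region is only
 * marked when
 *
 *	!(ui->ui_tstate & MD_ABR_CAP) && !(flag & MD_STR_ABR) &&
 *	    !(flag & MD_STR_WAR)
 *
 * i.e. for an ordinary application write to a non-ABR mirror.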
3939 */ 3940 if (!((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR)) && 3941 !(flag & MD_STR_WAR)) { 3942 if (mirror_mark_resync_region(un, ps->ps_firstblk, 3943 ps->ps_lastblk)) { 3944 pb->b_flags |= B_ERROR; 3945 pb->b_resid = pb->b_bcount; 3946 ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP)); 3947 kmem_cache_free(mirror_parent_cache, ps); 3948 md_kstat_waitq_exit(ui); 3949 md_unit_readerexit(ui); 3950 md_biodone(pb); 3951 return; 3952 } 3953 } 3954 3955 ps->ps_childbflags = pb->b_flags | B_WRITE; 3956 ps->ps_childbflags &= ~B_READ; 3957 if (flag & MD_STR_MAPPED) 3958 ps->ps_childbflags &= ~B_PAGEIO; 3959 3960 if (!(flag & MD_STR_NOTTOP) && panicstr) 3961 /* Disable WOW and don't free ps */ 3962 ps->ps_flags |= (MD_MPS_WOW|MD_MPS_DONTFREE); 3963 3964 md_kstat_waitq_to_runq(ui); 3965 3966 /* 3967 * Treat Raw and Direct I/O as Write-on-Write always 3968 */ 3969 3970 if (!(md_mirror_wow_flg & WOW_DISABLE) && 3971 (md_mirror_wow_flg & WOW_PHYS_ENABLE) && 3972 (pb->b_flags & B_PHYS) && 3973 !(ps->ps_flags & MD_MPS_WOW)) { 3974 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 3975 mirror_overlap_chain_remove(ps); 3976 md_unit_readerexit(ui); 3977 daemon_request(&md_mstr_daemon, handle_wow, 3978 (daemon_queue_t *)ps, REQ_OLD); 3979 return; 3980 } 3981 3982 ps->ps_frags = 1; 3983 do { 3984 cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS); 3985 mirror_child_init(cs); 3986 cb = &cs->cs_buf; 3987 more = mirror_map_write(un, cs, ps, (flag & MD_STR_WAR)); 3988 3989 /* 3990 * This handles the case where we're requesting 3991 * a write to block 0 on a label partition. (more < 0) 3992 * means that the request size was smaller than the 3993 * size of the label. If so this request is done. 3994 */ 3995 if (more < 0) { 3996 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 3997 mirror_overlap_chain_remove(ps); 3998 md_kstat_runq_exit(ui); 3999 kmem_cache_free(mirror_child_cache, cs); 4000 kmem_cache_free(mirror_parent_cache, ps); 4001 md_unit_readerexit(ui); 4002 md_biodone(pb); 4003 return; 4004 } 4005 if (more) { 4006 mutex_enter(&ps->ps_mx); 4007 ps->ps_frags++; 4008 mutex_exit(&ps->ps_mx); 4009 } 4010 md_call_strategy(cb, flag, private); 4011 } while (more); 4012 4013 if (!(flag & MD_STR_NOTTOP) && panicstr) { 4014 while (!(ps->ps_flags & MD_MPS_DONE)) { 4015 md_daemon(1, &md_done_daemon); 4016 drv_usecwait(10); 4017 } 4018 kmem_cache_free(mirror_parent_cache, ps); 4019 } 4020 } 4021 4022 static void 4023 mirror_read_strategy(buf_t *pb, int flag, void *private) 4024 { 4025 md_mps_t *ps; 4026 md_mcs_t *cs; 4027 size_t more; 4028 mm_unit_t *un; 4029 mdi_unit_t *ui; 4030 size_t current_count; 4031 diskaddr_t current_blkno; 4032 off_t current_offset; 4033 buf_t *cb; /* child buf pointer */ 4034 set_t setno; 4035 4036 ui = MDI_UNIT(getminor(pb->b_edev)); 4037 4038 md_kstat_waitq_enter(ui); 4039 4040 un = (mm_unit_t *)md_unit_readerlock(ui); 4041 4042 if (!(flag & MD_STR_NOTTOP)) { 4043 if (md_checkbuf(ui, (md_unit_t *)un, pb)) { 4044 md_kstat_waitq_exit(ui); 4045 return; 4046 } 4047 } 4048 4049 if (private == NULL) { 4050 ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS); 4051 mirror_parent_init(ps); 4052 } else { 4053 ps = private; 4054 private = NULL; 4055 } 4056 4057 if (flag & MD_STR_MAPPED) 4058 ps->ps_flags |= MD_MPS_MAPPED; 4059 if (flag & MD_NOBLOCK) 4060 ps->ps_flags |= MD_MPS_NOBLOCK; 4061 if (flag & MD_STR_WMUPDATE) 4062 ps->ps_flags |= MD_MPS_WMUPDATE; 4063 4064 /* 4065 * Check to see if this is a DMR driven read. 
If so we need to use the 4066 * specified side (in un->un_dmr_last_read) for the source of the data. 4067 */ 4068 if (flag & MD_STR_DMR) 4069 ps->ps_flags |= MD_MPS_DMR; 4070 4071 /* 4072 * Save essential information from the original buffhdr 4073 * in the md_save structure. 4074 */ 4075 ps->ps_un = un; 4076 ps->ps_ui = ui; 4077 ps->ps_bp = pb; 4078 ps->ps_addr = pb->b_un.b_addr; 4079 ps->ps_firstblk = pb->b_lblkno; 4080 ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1; 4081 ps->ps_changecnt = un->un_changecnt; 4082 4083 current_count = btodb(pb->b_bcount); 4084 current_blkno = pb->b_lblkno; 4085 current_offset = 0; 4086 4087 /* 4088 * If flag has MD_STR_WAR set this means that the read is issued by a 4089 * resync thread which may or may not be an optimised resync. 4090 * 4091 * If MD_UN_OPT_NOT_DONE is set this means that the optimized resync 4092 * code has not completed; either a resync has not started since snarf, 4093 * or there is an optimized resync in progress. 4094 * 4095 * We need to generate a write after this read in the following two 4096 * cases, 4097 * 4098 * 1. Any Resync-Generated read 4099 * 4100 * 2. Any read to a DIRTY REGION if there is an optimized resync 4101 * pending or in progress. 4102 * 4103 * The write after read is done in these cases to ensure that all sides 4104 * of the mirror are in sync with the read data and that it is not 4105 * possible for an application to read the same block multiple times 4106 * and get different data. 4107 * 4108 * This would be possible if the block was in a dirty region. 4109 * 4110 * If we're performing a directed read we don't write the data out as 4111 * the application is responsible for restoring the mirror to a known 4112 * state. 4113 */ 4114 if (((MD_STATUS(un) & MD_UN_OPT_NOT_DONE) || (flag & MD_STR_WAR)) && 4115 !(flag & MD_STR_DMR)) { 4116 size_t start_rr, i, end_rr; 4117 int region_dirty = 1; 4118 4119 /* 4120 * We enter here under three circumstances, 4121 * 4122 * MD_UN_OPT_NOT_DONE MD_STR_WAR 4123 * 0 1 4124 * 1 0 4125 * 1 1 4126 * 4127 * To be optimal we only care to explicitly check for dirty 4128 * regions in the second case since if MD_STR_WAR is set we 4129 * always do the write after read. 4130 */ 4131 if (!(flag & MD_STR_WAR)) { 4132 BLK_TO_RR(end_rr, ps->ps_lastblk, un); 4133 BLK_TO_RR(start_rr, ps->ps_firstblk, un); 4134 4135 for (i = start_rr; i <= end_rr; i++) 4136 if ((region_dirty = IS_KEEPDIRTY(i, un)) != 0) 4137 break; 4138 } 4139 4140 if ((region_dirty) && 4141 !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) { 4142 ps->ps_call = write_after_read; 4143 /* 4144 * Mark this as a RESYNC_READ in ps_flags. 4145 * This is used if the read fails during a 4146 * resync of a 3-way mirror to ensure that 4147 * the retried read to the remaining 4148 * good submirror has MD_STR_WAR set. This 4149 * is needed to ensure that the resync write 4150 * (write-after-read) takes place. 4151 */ 4152 ps->ps_flags |= MD_MPS_RESYNC_READ; 4153 4154 /* 4155 * If MD_STR_FLAG_ERR is set in the flags we 4156 * set MD_MPS_FLAG_ERROR so that an error on the resync 4157 * write (issued by write_after_read) will be flagged 4158 * to the biowait'ing resync thread. This allows us to 4159 * avoid issuing further resync requests to a device 4160 * that has had a write failure. 
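 *
 * Flag propagation, in brief (informal recap of the logic above and below):
 *  - MD_STR_WAR: ps_call is set to write_after_read so a resync write
 *    follows this read.
 *  - MD_MPS_RESYNC_READ: a read retried after a submirror failure keeps
 *    MD_STR_WAR set, so the write-after-read still takes place.
 *  - MD_STR_FLAG_ERR: mapped to MD_MPS_FLAG_ERROR so that a failed resync
 *    write is reported back to the biowait'ing resync thread.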
4161 */ 4162 if (flag & MD_STR_FLAG_ERR) 4163 ps->ps_flags |= MD_MPS_FLAG_ERROR; 4164 4165 setno = MD_UN2SET(un); 4166 /* 4167 * Drop the readerlock to avoid 4168 * deadlock 4169 */ 4170 md_unit_readerexit(ui); 4171 wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT); 4172 un = md_unit_readerlock(ui); 4173 /* 4174 * Ensure that we are owner 4175 */ 4176 if (MD_MNSET_SETNO(setno)) { 4177 /* 4178 * For a non-resync read that requires a 4179 * write-after-read to be done, set a flag 4180 * in the parent structure, so that the 4181 * write_strategy routine can omit the 4182 * test that the write is still within the 4183 * resync region 4184 */ 4185 if (!(flag & MD_STR_WAR)) 4186 ps->ps_flags |= MD_MPS_DIRTY_RD; 4187 4188 /* 4189 * Before reading the buffer, see if 4190 * we are the owner 4191 */ 4192 if (!MD_MN_MIRROR_OWNER(un)) { 4193 ps->ps_call = NULL; 4194 mirror_overlap_chain_remove(ps); 4195 md_kstat_waitq_exit(ui); 4196 md_unit_readerexit(ui); 4197 daemon_request( 4198 &md_mirror_daemon, 4199 become_owner, 4200 (daemon_queue_t *)ps, 4201 REQ_OLD); 4202 return; 4203 } 4204 /* 4205 * For a resync read, check to see if I/O is 4206 * outside of the current resync region, or 4207 * the resync has finished. If so 4208 * just terminate the I/O 4209 */ 4210 if ((flag & MD_STR_WAR) && 4211 (!(un->c.un_status & MD_UN_WAR) || 4212 (!IN_RESYNC_REGION(un, ps)))) { 4213 #ifdef DEBUG 4214 if (mirror_debug_flag) 4215 printf("Abort resync read " 4216 "%x: %lld\n", 4217 MD_SID(un), 4218 ps->ps_firstblk); 4219 #endif 4220 mirror_overlap_chain_remove(ps); 4221 kmem_cache_free(mirror_parent_cache, 4222 ps); 4223 md_kstat_waitq_exit(ui); 4224 md_unit_readerexit(ui); 4225 md_biodone(pb); 4226 return; 4227 } 4228 } 4229 } 4230 } 4231 4232 if (flag & MD_STR_DMR) { 4233 ps->ps_call = directed_read_done; 4234 } 4235 4236 if (!(flag & MD_STR_NOTTOP) && panicstr) 4237 ps->ps_flags |= MD_MPS_DONTFREE; 4238 4239 md_kstat_waitq_to_runq(ui); 4240 4241 ps->ps_frags++; 4242 do { 4243 cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS); 4244 mirror_child_init(cs); 4245 cb = &cs->cs_buf; 4246 cs->cs_ps = ps; 4247 4248 cb = md_bioclone(pb, current_offset, current_count, NODEV, 4249 current_blkno, mirror_done, cb, KM_NOSLEEP); 4250 4251 more = mirror_map_read(ps, cs, current_blkno, 4252 (u_longlong_t)current_count); 4253 if (more) { 4254 mutex_enter(&ps->ps_mx); 4255 ps->ps_frags++; 4256 mutex_exit(&ps->ps_mx); 4257 } 4258 4259 /* 4260 * Do these calculations now, 4261 * so that we pickup a valid b_bcount from the chld_bp. 4262 */ 4263 current_count -= more; 4264 current_offset += cb->b_bcount; 4265 current_blkno += more; 4266 md_call_strategy(cb, flag, private); 4267 } while (more); 4268 4269 if (!(flag & MD_STR_NOTTOP) && panicstr) { 4270 while (!(ps->ps_flags & MD_MPS_DONE)) { 4271 md_daemon(1, &md_done_daemon); 4272 drv_usecwait(10); 4273 } 4274 kmem_cache_free(mirror_parent_cache, ps); 4275 } 4276 } 4277 4278 void 4279 md_mirror_strategy(buf_t *bp, int flag, void *private) 4280 { 4281 set_t setno = MD_MIN2SET(getminor(bp->b_edev)); 4282 4283 /* 4284 * When doing IO to a multi owner meta device, check if set is halted. 4285 * We do this check without the needed lock held, for performance 4286 * reasons. 4287 * If an IO just slips through while the set is locked via an 4288 * MD_MN_SUSPEND_SET, we don't care about it. 4289 * Only check for suspension if we are a top-level i/o request 4290 * (MD_STR_NOTTOP is cleared in 'flag'). 
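 *
 * In other words: a top-level i/o against a halted multi-owner set simply
 * blocks in cv_wait(&md_cv, &md_mx) until MD_SET_HALTED is cleared again.
 * Nested requests (MD_STR_NOTTOP set) are let through, presumably so that
 * i/o that was already in flight before the suspend can run to completion.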
4291 */ 4292 if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) == 4293 (MD_SET_HALTED | MD_SET_MNSET)) { 4294 if ((flag & MD_STR_NOTTOP) == 0) { 4295 mutex_enter(&md_mx); 4296 /* Here we loop until the set is no longer halted */ 4297 while (md_set[setno].s_status & MD_SET_HALTED) { 4298 cv_wait(&md_cv, &md_mx); 4299 } 4300 mutex_exit(&md_mx); 4301 } 4302 } 4303 4304 if ((flag & MD_IO_COUNTED) == 0) { 4305 if ((flag & MD_NOBLOCK) == 0) { 4306 if (md_inc_iocount(setno) != 0) { 4307 bp->b_flags |= B_ERROR; 4308 bp->b_error = ENXIO; 4309 bp->b_resid = bp->b_bcount; 4310 biodone(bp); 4311 return; 4312 } 4313 } else { 4314 md_inc_iocount_noblock(setno); 4315 } 4316 } 4317 4318 if (bp->b_flags & B_READ) 4319 mirror_read_strategy(bp, flag, private); 4320 else 4321 mirror_write_strategy(bp, flag, private); 4322 } 4323 4324 /* 4325 * mirror_directed_read: 4326 * -------------------- 4327 * Entry-point for the DKIOCDMR ioctl. We issue a read to a specified sub-mirror 4328 * so that the application can determine what (if any) resync needs to be 4329 * performed. The data is copied out to the user-supplied buffer. 4330 * 4331 * Parameters: 4332 * mdev - dev_t for the mirror device 4333 * vdr - directed read parameters specifying location and submirror 4334 * to perform the read from 4335 * mode - used to ddi_copyout() any resulting data from the read 4336 * 4337 * Returns: 4338 * 0 success 4339 * !0 error code 4340 * EINVAL - invalid request format 4341 */ 4342 int 4343 mirror_directed_read(dev_t mdev, vol_directed_rd_t *vdr, int mode) 4344 { 4345 buf_t *bp; 4346 minor_t mnum = getminor(mdev); 4347 mdi_unit_t *ui = MDI_UNIT(mnum); 4348 mm_unit_t *un; 4349 mm_submirror_t *sm; 4350 char *sm_nm; 4351 size_t namelen; 4352 uint_t next_side; 4353 void *kbuffer; 4354 4355 if (ui == NULL) 4356 return (ENXIO); 4357 4358 if (!(vdr->vdr_flags & DKV_DMR_NEXT_SIDE)) { 4359 return (EINVAL); 4360 } 4361 4362 /* Check for aligned block access. We disallow non-aligned requests. */ 4363 if (vdr->vdr_offset % DEV_BSIZE) { 4364 return (EINVAL); 4365 } 4366 4367 /* 4368 * Allocate kernel buffer for target of read(). If we had a reliable 4369 * (sorry functional) DDI this wouldn't be needed. 4370 */ 4371 kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP); 4372 if (kbuffer == NULL) { 4373 cmn_err(CE_WARN, "mirror_directed_read: couldn't allocate %lx" 4374 " bytes\n", vdr->vdr_nbytes); 4375 return (ENOMEM); 4376 } 4377 4378 bp = getrbuf(KM_SLEEP); 4379 4380 bp->b_un.b_addr = kbuffer; 4381 bp->b_flags = B_READ; 4382 bp->b_bcount = vdr->vdr_nbytes; 4383 bp->b_lblkno = lbtodb(vdr->vdr_offset); 4384 bp->b_edev = mdev; 4385 4386 un = md_unit_readerlock(ui); 4387 4388 /* 4389 * If DKV_SIDE_INIT is set we need to determine the first available 4390 * side to start reading from. If it isn't set we increment to the 4391 * next readable submirror. 4392 * If there are no readable submirrors we error out with DKV_DMR_ERROR. 4393 * Note: we check for a readable submirror on completion of the i/o so 4394 * we should _always_ have one available. If this becomes unavailable 4395 * we have missed the 'DKV_DMR_DONE' opportunity. This could happen if 4396 * a metadetach is made between the completion of one DKIOCDMR ioctl 4397 * and the start of the next (i.e. a sys-admin 'accident' occurred). 4398 * The chance of this is small, but not non-existent. 
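 *
 * A caller-side sketch of the expected ioctl loop (illustrative only; the
 * file descriptor, buffer and error handling are assumptions, not part of
 * this file):
 *
 *	vdr.vdr_flags  = DKV_DMR_NEXT_SIDE;
 *	vdr.vdr_side   = DKV_SIDE_INIT;
 *	vdr.vdr_offset = offset;		(DEV_BSIZE aligned)
 *	vdr.vdr_nbytes = nbytes;
 *	vdr.vdr_data   = user_buffer;
 *	do {
 *		ioctl(fd, DKIOCDMR, &vdr);
 *		... compare vdr_bytesread bytes against the other sides ...
 *	} while (!(vdr.vdr_flags & (DKV_DMR_DONE | DKV_DMR_ERROR)));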
4399 */ 4400 if (vdr->vdr_side == DKV_SIDE_INIT) { 4401 next_side = 0; 4402 } else { 4403 next_side = vdr->vdr_side + 1; 4404 } 4405 while ((next_side < NMIRROR) && 4406 !SUBMIRROR_IS_READABLE(un, next_side)) 4407 next_side++; 4408 if (next_side >= NMIRROR) { 4409 vdr->vdr_flags |= DKV_DMR_ERROR; 4410 freerbuf(bp); 4411 vdr->vdr_bytesread = 0; 4412 md_unit_readerexit(ui); 4413 return (0); 4414 } 4415 4416 /* Set the side to read from */ 4417 un->un_dmr_last_read = next_side; 4418 4419 md_unit_readerexit(ui); 4420 4421 /* 4422 * Save timestamp for verification purposes. Can be read by debugger 4423 * to verify that this ioctl has been executed and to find the number 4424 * of DMR reads and the time of the last DMR read. 4425 */ 4426 uniqtime(&mirror_dmr_stats.dmr_timestamp); 4427 mirror_dmr_stats.dmr_count++; 4428 4429 /* Issue READ request and wait for completion */ 4430 mirror_read_strategy(bp, MD_STR_DMR|MD_NOBLOCK|MD_STR_NOTTOP, NULL); 4431 4432 mutex_enter(&un->un_dmr_mx); 4433 cv_wait(&un->un_dmr_cv, &un->un_dmr_mx); 4434 mutex_exit(&un->un_dmr_mx); 4435 4436 /* 4437 * Check to see if we encountered an error during the read. If so we 4438 * can make no guarantee about any possibly returned data. 4439 */ 4440 if ((bp->b_flags & B_ERROR) == 0) { 4441 vdr->vdr_flags &= ~DKV_DMR_ERROR; 4442 if (bp->b_resid) { 4443 vdr->vdr_flags |= DKV_DMR_SHORT; 4444 vdr->vdr_bytesread = vdr->vdr_nbytes - bp->b_resid; 4445 } else { 4446 vdr->vdr_flags |= DKV_DMR_SUCCESS; 4447 vdr->vdr_bytesread = vdr->vdr_nbytes; 4448 } 4449 /* Copy the data read back out to the user supplied buffer */ 4450 if (ddi_copyout(kbuffer, vdr->vdr_data, vdr->vdr_bytesread, 4451 mode)) { 4452 kmem_free(kbuffer, vdr->vdr_nbytes); 4453 return (EFAULT); 4454 } 4455 4456 } else { 4457 /* Error out with DKV_DMR_ERROR */ 4458 vdr->vdr_flags |= DKV_DMR_ERROR; 4459 vdr->vdr_flags &= ~(DKV_DMR_SUCCESS|DKV_DMR_SHORT|DKV_DMR_DONE); 4460 } 4461 /* 4462 * Update the DMR parameters with the side and name of submirror that 4463 * we have just read from (un->un_dmr_last_read) 4464 */ 4465 un = md_unit_readerlock(ui); 4466 4467 vdr->vdr_side = un->un_dmr_last_read; 4468 sm = &un->un_sm[un->un_dmr_last_read]; 4469 sm_nm = md_shortname(md_getminor(sm->sm_dev)); 4470 4471 namelen = MIN(MD_MAX_SIDENAME_LEN, VOL_SIDENAME); 4472 (void) strncpy(vdr->vdr_side_name, sm_nm, namelen); 4473 4474 /* 4475 * Determine if we've completed the read cycle. This is true iff the 4476 * next computed submirror (side) equals or exceeds NMIRROR. We cannot 4477 * use un_nsm as we need to handle a sparse array of submirrors (which 4478 * can occur if a submirror is metadetached). 4479 */ 4480 next_side = un->un_dmr_last_read + 1; 4481 while ((next_side < NMIRROR) && 4482 !SUBMIRROR_IS_READABLE(un, next_side)) 4483 next_side++; 4484 if (next_side >= NMIRROR) { 4485 /* We've finished */ 4486 vdr->vdr_flags |= DKV_DMR_DONE; 4487 } 4488 4489 md_unit_readerexit(ui); 4490 freerbuf(bp); 4491 kmem_free(kbuffer, vdr->vdr_nbytes); 4492 4493 return (0); 4494 } 4495 4496 /* 4497 * mirror_resync_message: 4498 * --------------------- 4499 * Handle the multi-node resync messages that keep all nodes within a given 4500 * disk-set in sync with their view of a mirror's resync status. 
4501 * 4502 * The message types dealt with are: 4503 * MD_MN_MSG_RESYNC_STARTING - start a resync thread for a unit 4504 * MD_MN_MSG_RESYNC_NEXT - specified next region to be resynced 4505 * MD_MN_MSG_RESYNC_FINISH - stop the resync thread for a unit 4506 * MD_MN_MSG_RESYNC_PHASE_DONE - end of a resync phase, opt, submirror or comp 4507 * 4508 * Returns: 4509 * 0 Success 4510 * >0 Failure error number 4511 */ 4512 int 4513 mirror_resync_message(md_mn_rs_params_t *p, IOLOCK *lockp) 4514 { 4515 mdi_unit_t *ui; 4516 mm_unit_t *un; 4517 set_t setno; 4518 int is_ABR; 4519 int smi; 4520 int ci; 4521 sm_state_t state; 4522 int broke_out; 4523 mm_submirror_t *sm; 4524 mm_submirror_ic_t *smic; 4525 md_m_shared_t *shared; 4526 md_error_t mde = mdnullerror; 4527 md_mps_t *ps; 4528 int rs_active; 4529 4530 /* Check that the given device is part of a multi-node set */ 4531 setno = MD_MIN2SET(p->mnum); 4532 if (setno >= md_nsets) { 4533 return (ENXIO); 4534 } 4535 if (!MD_MNSET_SETNO(setno)) { 4536 return (EINVAL); 4537 } 4538 4539 if ((un = mirror_getun(p->mnum, &p->mde, NO_LOCK, NULL)) == NULL) 4540 return (EINVAL); 4541 if ((ui = MDI_UNIT(p->mnum)) == NULL) 4542 return (EINVAL); 4543 is_ABR = (ui->ui_tstate & MD_ABR_CAP); 4544 4545 /* Obtain the current resync status */ 4546 (void) md_ioctl_readerlock(lockp, ui); 4547 rs_active = (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ? 1 : 0; 4548 md_ioctl_readerexit(lockp); 4549 4550 switch ((md_mn_msgtype_t)p->msg_type) { 4551 case MD_MN_MSG_RESYNC_STARTING: 4552 /* Start the resync thread for the mirror */ 4553 (void) mirror_resync_unit(p->mnum, NULL, &p->mde, lockp); 4554 break; 4555 4556 case MD_MN_MSG_RESYNC_NEXT: 4557 /* 4558 * We have to release any previously marked overlap regions 4559 * so that i/o can resume. Then we need to block the region 4560 * from [rs_start..rs_start+rs_size) * so that no i/o is issued. 4561 * Update un_rs_resync_done and un_rs_resync_2_do. 4562 */ 4563 (void) md_ioctl_readerlock(lockp, ui); 4564 /* 4565 * Ignore the message if there is no active resync thread or 4566 * if it is for a resync type that we have already completed. 4567 * un_resync_completed is set to the last resync completed 4568 * when processing a PHASE_DONE message. 4569 */ 4570 if (!rs_active || (p->rs_type == un->un_resync_completed)) 4571 break; 4572 /* 4573 * If this message is for the same resync and is for an earlier 4574 * resync region, just ignore it. This can only occur if this 4575 * node has progressed on to the next resync region before 4576 * we receive this message. This can occur if the class for 4577 * this message is busy and the originator has to retry thus 4578 * allowing this node to move onto the next resync_region. 4579 */ 4580 if ((p->rs_type == un->un_rs_type) && 4581 (p->rs_start < un->un_resync_startbl)) 4582 break; 4583 ps = un->un_rs_prev_ovrlap; 4584 4585 /* Allocate previous overlap reference if needed */ 4586 if (ps == NULL) { 4587 ps = kmem_cache_alloc(mirror_parent_cache, 4588 MD_ALLOCFLAGS); 4589 ps->ps_un = un; 4590 ps->ps_ui = ui; 4591 ps->ps_firstblk = 0; 4592 ps->ps_lastblk = 0; 4593 ps->ps_flags = 0; 4594 md_ioctl_readerexit(lockp); 4595 (void) md_ioctl_writerlock(lockp, ui); 4596 un->un_rs_prev_ovrlap = ps; 4597 md_ioctl_writerexit(lockp); 4598 } else 4599 md_ioctl_readerexit(lockp); 4600 4601 if (p->rs_originator != md_mn_mynode_id) { 4602 /* 4603 * On all but the originating node, first update 4604 * the resync state, then unblock the previous 4605 * region and block the next one. 
No need 4606 * to do this if the region is already blocked. 4607 * Update the submirror state and flags from the 4608 * originator. This keeps the cluster in sync with 4609 * regards to the resync status. 4610 */ 4611 4612 (void) md_ioctl_writerlock(lockp, ui); 4613 un->un_rs_resync_done = p->rs_done; 4614 un->un_rs_resync_2_do = p->rs_2_do; 4615 un->un_rs_type = p->rs_type; 4616 un->un_resync_startbl = p->rs_start; 4617 md_ioctl_writerexit(lockp); 4618 /* 4619 * Use un_owner_mx to ensure that an ownership change 4620 * cannot happen at the same time as this message 4621 */ 4622 mutex_enter(&un->un_owner_mx); 4623 if (MD_MN_MIRROR_OWNER(un)) { 4624 ps->ps_firstblk = p->rs_start; 4625 ps->ps_lastblk = ps->ps_firstblk + 4626 p->rs_size - 1; 4627 } else { 4628 if ((ps->ps_firstblk != p->rs_start) || 4629 (ps->ps_lastblk != p->rs_start + 4630 p->rs_size - 1)) { 4631 /* Remove previous overlap range */ 4632 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 4633 mirror_overlap_chain_remove(ps); 4634 4635 ps->ps_firstblk = p->rs_start; 4636 ps->ps_lastblk = ps->ps_firstblk + 4637 p->rs_size - 1; 4638 4639 mutex_exit(&un->un_owner_mx); 4640 /* Block this range from all i/o. */ 4641 if (ps->ps_firstblk != 0 || 4642 ps->ps_lastblk != 0) 4643 wait_for_overlaps(ps, 4644 MD_OVERLAP_ALLOW_REPEAT); 4645 mutex_enter(&un->un_owner_mx); 4646 /* 4647 * Check to see if we have obtained 4648 * ownership while waiting for 4649 * overlaps. If we have, remove 4650 * the resync_region entry from the 4651 * overlap chain 4652 */ 4653 if (MD_MN_MIRROR_OWNER(un) && 4654 (ps->ps_flags & MD_MPS_ON_OVERLAP)) 4655 mirror_overlap_chain_remove(ps); 4656 } 4657 } 4658 mutex_exit(&un->un_owner_mx); 4659 4660 /* 4661 * If this is the first RESYNC_NEXT message (i.e. 4662 * MD_MN_RS_FIRST_RESYNC_NEXT set in p->rs_flags), 4663 * issue RESYNC_START NOTIFY event 4664 */ 4665 if (p->rs_flags & MD_MN_RS_FIRST_RESYNC_NEXT) { 4666 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_START, 4667 SVM_TAG_METADEVICE, MD_UN2SET(un), 4668 MD_SID(un)); 4669 } 4670 4671 /* Ensure that our local resync thread is running */ 4672 if (un->un_rs_thread == NULL) { 4673 (void) mirror_resync_unit(p->mnum, NULL, 4674 &p->mde, lockp); 4675 } 4676 } 4677 break; 4678 case MD_MN_MSG_RESYNC_FINISH: 4679 /* 4680 * Complete the resync by stopping the resync thread. 4681 * Also release the previous overlap region field. 4682 * Update the resync_progress_thread by cv_signal'ing it so 4683 * that we mark the end of the resync as soon as possible. This 4684 * stops an unnecessary delay should be panic after resync 4685 * completion. 
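 *
 * Tear-down order below, in summary: shut down the local resync thread
 * (unless this node originated the message), drop mirror ownership if the
 * mirror is ABR, release un_rs_prev_ovrlap, wake the resync progress
 * thread, clear MD_UN_RESYNC_ACTIVE and finally retry any pending
 * grow_unit.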
4686 */ 4687 #ifdef DEBUG 4688 if (!rs_active) { 4689 if (mirror_debug_flag) 4690 printf("RESYNC_FINISH (mnum = %x), " 4691 "Resync *NOT* active", 4692 p->mnum); 4693 } 4694 #endif 4695 4696 if ((un->c.un_status & MD_UN_RESYNC_ACTIVE) && 4697 (p->rs_originator != md_mn_mynode_id)) { 4698 mutex_enter(&un->un_rs_thread_mx); 4699 un->c.un_status &= ~MD_UN_RESYNC_CANCEL; 4700 un->un_rs_thread_flags |= MD_RI_SHUTDOWN; 4701 un->un_rs_thread_flags &= 4702 ~(MD_RI_BLOCK|MD_RI_BLOCK_OWNER); 4703 cv_signal(&un->un_rs_thread_cv); 4704 mutex_exit(&un->un_rs_thread_mx); 4705 } 4706 if (is_ABR) { 4707 /* Resync finished, if ABR set owner to NULL */ 4708 mutex_enter(&un->un_owner_mx); 4709 un->un_mirror_owner = 0; 4710 mutex_exit(&un->un_owner_mx); 4711 } 4712 (void) md_ioctl_writerlock(lockp, ui); 4713 ps = un->un_rs_prev_ovrlap; 4714 if (ps != NULL) { 4715 /* Remove previous overlap range */ 4716 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 4717 mirror_overlap_chain_remove(ps); 4718 /* 4719 * Release the overlap range reference 4720 */ 4721 un->un_rs_prev_ovrlap = NULL; 4722 kmem_cache_free(mirror_parent_cache, 4723 ps); 4724 } 4725 md_ioctl_writerexit(lockp); 4726 4727 /* Mark the resync as complete in the metadb */ 4728 un->un_rs_resync_done = p->rs_done; 4729 un->un_rs_resync_2_do = p->rs_2_do; 4730 un->un_rs_type = p->rs_type; 4731 mutex_enter(&un->un_rs_progress_mx); 4732 cv_signal(&un->un_rs_progress_cv); 4733 mutex_exit(&un->un_rs_progress_mx); 4734 4735 un = md_ioctl_writerlock(lockp, ui); 4736 un->c.un_status &= ~MD_UN_RESYNC_ACTIVE; 4737 /* Deal with any pending grow_unit */ 4738 if (un->c.un_status & MD_UN_GROW_PENDING) { 4739 if ((mirror_grow_unit(un, &mde) != 0) || 4740 (! mdismderror(&mde, MDE_GROW_DELAYED))) { 4741 un->c.un_status &= ~MD_UN_GROW_PENDING; 4742 } 4743 } 4744 md_ioctl_writerexit(lockp); 4745 break; 4746 4747 case MD_MN_MSG_RESYNC_PHASE_DONE: 4748 /* 4749 * A phase of the resync, optimized. component or 4750 * submirror is complete. Update mirror status. 4751 * If the flag CLEAR_OPT_NOT_DONE is set, it means that the 4752 * mirror owner is peforming a resync. If we have just snarfed 4753 * this set, then we must clear any of the flags set at snarf 4754 * time by unit_setup_resync(). 4755 * Note that unit_setup_resync() sets up these flags to 4756 * indicate that an optimized resync is required. These flags 4757 * need to be reset because if we get here, the mirror owner 4758 * will have handled the optimized resync. 4759 * The flags that must be cleared are MD_UN_OPT_NOT_DONE and 4760 * MD_UN_WAR. In addition, for each submirror, 4761 * MD_SM_RESYNC_TARGET must be cleared and SMS_OFFLINE_RESYNC 4762 * set to SMS_OFFLINE. 4763 */ 4764 #ifdef DEBUG 4765 if (mirror_debug_flag) 4766 printf("phase done mess received from %d, mnum=%x," 4767 "type=%x, flags=%x\n", p->rs_originator, p->mnum, 4768 p->rs_type, p->rs_flags); 4769 #endif 4770 /* 4771 * Ignore the message if there is no active resync thread. 4772 */ 4773 if (!rs_active) 4774 break; 4775 4776 broke_out = p->rs_flags & MD_MN_RS_ERR; 4777 switch (RS_TYPE(p->rs_type)) { 4778 case MD_RS_OPTIMIZED: 4779 un = md_ioctl_writerlock(lockp, ui); 4780 if (p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE) { 4781 /* If we are originator, just clear rs_type */ 4782 if (p->rs_originator == md_mn_mynode_id) { 4783 SET_RS_TYPE_NONE(un->un_rs_type); 4784 md_ioctl_writerexit(lockp); 4785 break; 4786 } 4787 /* 4788 * If CLEAR_OPT_NOT_DONE is set, only clear the 4789 * flags if OPT_NOT_DONE is set *and* rs_type 4790 * is MD_RS_NONE. 
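 *
 * Decision summary for the CLEAR_OPT_NOT_DONE branch (informal):
 *  - this node is the originator: just clear un_rs_type.
 *  - OPT_NOT_DONE set and no resync in progress (rs_type is MD_RS_NONE):
 *    clear MD_UN_OPT_NOT_DONE and MD_UN_WAR.
 *  - otherwise: an optimized resync is running here, ignore the message.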
4791 */ 4792 if ((un->c.un_status & MD_UN_OPT_NOT_DONE) && 4793 (RS_TYPE(un->un_rs_type) == MD_RS_NONE)) { 4794 /* No resync in progress */ 4795 un->c.un_status &= ~MD_UN_OPT_NOT_DONE; 4796 un->c.un_status &= ~MD_UN_WAR; 4797 } else { 4798 /* 4799 * We are in the middle of an 4800 * optimized resync and this message 4801 * should be ignored. 4802 */ 4803 md_ioctl_writerexit(lockp); 4804 break; 4805 } 4806 } else { 4807 /* 4808 * This is the end of an optimized resync, 4809 * clear the OPT_NOT_DONE and OFFLINE_SM flags 4810 */ 4811 4812 un->c.un_status &= ~MD_UN_KEEP_DIRTY; 4813 if (!broke_out) 4814 un->c.un_status &= ~MD_UN_WAR; 4815 } 4816 4817 /* 4818 * Set resync_completed to last resync type and then 4819 * clear resync_type to indicate no resync in progress 4820 */ 4821 un->un_resync_completed = un->un_rs_type; 4822 SET_RS_TYPE_NONE(un->un_rs_type); 4823 4824 /* 4825 * If resync is as a result of a submirror ONLINE, 4826 * reset the submirror state to SMS_RUNNING if the 4827 * resync was ok else set back to SMS_OFFLINE. 4828 */ 4829 for (smi = 0; smi < NMIRROR; smi++) { 4830 un->un_sm[smi].sm_flags &= 4831 ~MD_SM_RESYNC_TARGET; 4832 if (SMS_BY_INDEX_IS(un, smi, 4833 SMS_OFFLINE_RESYNC)) { 4834 if (p->rs_flags & 4835 MD_MN_RS_CLEAR_OPT_NOT_DONE) { 4836 state = SMS_OFFLINE; 4837 } else { 4838 state = (broke_out ? 4839 SMS_OFFLINE : SMS_RUNNING); 4840 } 4841 mirror_set_sm_state( 4842 &un->un_sm[smi], 4843 &un->un_smic[smi], state, 4844 broke_out); 4845 mirror_commit(un, NO_SUBMIRRORS, 4846 0); 4847 } 4848 /* 4849 * If we still have an offline submirror, reset 4850 * the OFFLINE_SM flag in the mirror status 4851 */ 4852 if (SMS_BY_INDEX_IS(un, smi, 4853 SMS_OFFLINE)) 4854 un->c.un_status |= 4855 MD_UN_OFFLINE_SM; 4856 } 4857 md_ioctl_writerexit(lockp); 4858 break; 4859 case MD_RS_SUBMIRROR: 4860 un = md_ioctl_writerlock(lockp, ui); 4861 smi = RS_SMI(p->rs_type); 4862 sm = &un->un_sm[smi]; 4863 smic = &un->un_smic[smi]; 4864 /* Clear RESYNC target */ 4865 un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET; 4866 /* 4867 * Set resync_completed to last resync type and then 4868 * clear resync_type to indicate no resync in progress 4869 */ 4870 un->un_resync_completed = un->un_rs_type; 4871 SET_RS_TYPE_NONE(un->un_rs_type); 4872 /* 4873 * If the resync completed ok reset the submirror 4874 * state to SMS_RUNNING else reset it to SMS_ATTACHED 4875 */ 4876 state = (broke_out ? 4877 SMS_ATTACHED : SMS_RUNNING); 4878 mirror_set_sm_state(sm, smic, state, broke_out); 4879 un->c.un_status &= ~MD_UN_WAR; 4880 mirror_commit(un, SMI2BIT(smi), 0); 4881 md_ioctl_writerexit(lockp); 4882 break; 4883 case MD_RS_COMPONENT: 4884 un = md_ioctl_writerlock(lockp, ui); 4885 smi = RS_SMI(p->rs_type); 4886 ci = RS_CI(p->rs_type); 4887 sm = &un->un_sm[smi]; 4888 smic = &un->un_smic[smi]; 4889 shared = (md_m_shared_t *) 4890 (*(smic->sm_shared_by_indx)) 4891 (sm->sm_dev, sm, ci); 4892 un->c.un_status &= ~MD_UN_WAR; 4893 /* Clear RESYNC target */ 4894 un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET; 4895 /* 4896 * Set resync_completed to last resync type and then 4897 * clear resync_type to indicate no resync in progress 4898 */ 4899 un->un_resync_completed = un->un_rs_type; 4900 SET_RS_TYPE_NONE(un->un_rs_type); 4901 4902 /* 4903 * If the resync completed ok, set the component state 4904 * to CS_OKAY. 4905 */ 4906 if (broke_out) 4907 shared->ms_flags |= MDM_S_RS_TRIED; 4908 else { 4909 /* 4910 * As we don't transmit the changes, 4911 * no need to drop the lock. 
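 *
 * (Every node in the set receives this resync message itself, so with
 * MD_STATE_NO_XMIT the component state change is applied locally and not
 * re-broadcast; that is why the ioctl lock can stay held across
 * set_sm_comp_state().)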
4912 */ 4913 set_sm_comp_state(un, smi, ci, CS_OKAY, 0, 4914 MD_STATE_NO_XMIT, (IOLOCK *)NULL); 4915 } 4916 md_ioctl_writerexit(lockp); 4917 default: 4918 break; 4919 } 4920 /* 4921 * If the purpose of this PHASE_DONE message is just to 4922 * indicate to all other nodes that the optimized resync 4923 * required (OPT_NOT_DONE) flag is to be cleared, there is 4924 * no need to generate a notify event as there has not 4925 * actually been a resync. 4926 */ 4927 if (!(p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE)) { 4928 if (broke_out) { 4929 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_FAILED, 4930 SVM_TAG_METADEVICE, MD_UN2SET(un), 4931 MD_SID(un)); 4932 } else { 4933 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_DONE, 4934 SVM_TAG_METADEVICE, MD_UN2SET(un), 4935 MD_SID(un)); 4936 } 4937 } 4938 break; 4939 4940 default: 4941 #ifdef DEBUG 4942 cmn_err(CE_PANIC, "mirror_resync_message: Unknown message type" 4943 " %x\n", p->msg_type); 4944 #endif 4945 return (EINVAL); 4946 } 4947 return (0); 4948 } 4949 4950 /* Return a -1 if snarf of optimized record failed and set should be released */ 4951 static int 4952 mirror_snarf(md_snarfcmd_t cmd, set_t setno) 4953 { 4954 mddb_recid_t recid; 4955 int gotsomething; 4956 int all_mirrors_gotten; 4957 mm_unit_t *un; 4958 mddb_type_t typ1; 4959 mddb_de_ic_t *dep; 4960 mddb_rb32_t *rbp; 4961 size_t newreqsize; 4962 mm_unit_t *big_un; 4963 mm_unit32_od_t *small_un; 4964 int retval; 4965 mdi_unit_t *ui; 4966 4967 if (cmd == MD_SNARF_CLEANUP) { 4968 if (md_get_setstatus(setno) & MD_SET_STALE) 4969 return (0); 4970 4971 recid = mddb_makerecid(setno, 0); 4972 typ1 = (mddb_type_t)md_getshared_key(setno, 4973 mirror_md_ops.md_driver.md_drivername); 4974 while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) { 4975 if (mddb_getrecprivate(recid) & MD_PRV_CLEANUP) { 4976 un = (mm_unit_t *)mddb_getrecaddr(recid); 4977 mirror_cleanup(un); 4978 recid = mddb_makerecid(setno, 0); 4979 } 4980 } 4981 return (0); 4982 } 4983 4984 all_mirrors_gotten = 1; 4985 gotsomething = 0; 4986 4987 recid = mddb_makerecid(setno, 0); 4988 typ1 = (mddb_type_t)md_getshared_key(setno, 4989 mirror_md_ops.md_driver.md_drivername); 4990 4991 while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) { 4992 if (mddb_getrecprivate(recid) & MD_PRV_GOTIT) 4993 continue; 4994 4995 dep = mddb_getrecdep(recid); 4996 dep->de_flags = MDDB_F_MIRROR; 4997 rbp = dep->de_rb; 4998 4999 if ((rbp->rb_revision == MDDB_REV_RB) && 5000 ((rbp->rb_private & MD_PRV_CONVD) == 0)) { 5001 /* 5002 * This means, we have an old and small record 5003 * and this record hasn't already been converted. 5004 * Before we create an incore metadevice from this 5005 * we have to convert it to a big record. 
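 *
 * Conversion, in brief: allocate a full-size mm_unit_t, expand the 32-bit
 * record into it with mirror_convert(..., SMALL_2_BIG), free the small
 * copy and mark the record MD_PRV_CONVD so it is only converted once.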
5006 */ 5007 small_un = (mm_unit32_od_t *)mddb_getrecaddr(recid); 5008 newreqsize = sizeof (mm_unit_t); 5009 big_un = (mm_unit_t *)kmem_zalloc(newreqsize, KM_SLEEP); 5010 mirror_convert((caddr_t)small_un, (caddr_t)big_un, 5011 SMALL_2_BIG); 5012 kmem_free(small_un, dep->de_reqsize); 5013 5014 /* 5015 * Update userdata and incore userdata 5016 * incores are at the end of un 5017 */ 5018 dep->de_rb_userdata_ic = big_un; 5019 dep->de_rb_userdata = big_un; 5020 dep->de_icreqsize = newreqsize; 5021 un = big_un; 5022 rbp->rb_private |= MD_PRV_CONVD; 5023 } else { 5024 /* Big device */ 5025 un = (mm_unit_t *)mddb_getrecaddr_resize(recid, 5026 sizeof (*un), 0); 5027 } 5028 5029 /* Set revision and flag accordingly */ 5030 if (rbp->rb_revision == MDDB_REV_RB) { 5031 un->c.un_revision = MD_32BIT_META_DEV; 5032 } else { 5033 un->c.un_revision = MD_64BIT_META_DEV; 5034 un->c.un_flag |= MD_EFILABEL; 5035 } 5036 5037 /* 5038 * Create minor device node for snarfed entry. 5039 */ 5040 (void) md_create_minor_node(setno, MD_SID(un)); 5041 5042 if (MD_UNIT(MD_SID(un)) != NULL) { 5043 mddb_setrecprivate(recid, MD_PRV_PENDDEL); 5044 continue; 5045 } 5046 all_mirrors_gotten = 0; 5047 retval = mirror_build_incore(un, 1); 5048 if (retval == 0) { 5049 mddb_setrecprivate(recid, MD_PRV_GOTIT); 5050 md_create_unit_incore(MD_SID(un), &mirror_md_ops, 0); 5051 resync_start_timeout(setno); 5052 gotsomething = 1; 5053 } else if (retval == -1) { 5054 return (-1); 5055 } 5056 /* 5057 * Set flag to indicate that the mirror has not yet 5058 * been through a reconfig. This flag is used for MN sets 5059 * when determining whether to update the mirror state from 5060 * the Master node. 5061 */ 5062 if (MD_MNSET_SETNO(setno)) { 5063 ui = MDI_UNIT(MD_SID(un)); 5064 ui->ui_tstate |= MD_RESYNC_NOT_DONE; 5065 } 5066 } 5067 5068 if (!all_mirrors_gotten) 5069 return (gotsomething); 5070 5071 recid = mddb_makerecid(setno, 0); 5072 while ((recid = mddb_getnextrec(recid, typ1, RESYNC_REC)) > 0) 5073 if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT)) 5074 mddb_setrecprivate(recid, MD_PRV_PENDDEL); 5075 5076 return (0); 5077 } 5078 5079 static int 5080 mirror_halt(md_haltcmd_t cmd, set_t setno) 5081 { 5082 unit_t i; 5083 mdi_unit_t *ui; 5084 minor_t mnum; 5085 int reset_mirror_flag = 0; 5086 5087 if (cmd == MD_HALT_CLOSE) 5088 return (0); 5089 5090 if (cmd == MD_HALT_OPEN) 5091 return (0); 5092 5093 if (cmd == MD_HALT_UNLOAD) 5094 return (0); 5095 5096 if (cmd == MD_HALT_CHECK) { 5097 for (i = 0; i < md_nunits; i++) { 5098 mnum = MD_MKMIN(setno, i); 5099 if ((ui = MDI_UNIT(mnum)) == NULL) 5100 continue; 5101 if (ui->ui_opsindex != mirror_md_ops.md_selfindex) 5102 continue; 5103 if (md_unit_isopen(ui)) 5104 return (1); 5105 } 5106 return (0); 5107 } 5108 5109 if (cmd != MD_HALT_DOIT) 5110 return (1); 5111 5112 for (i = 0; i < md_nunits; i++) { 5113 mnum = MD_MKMIN(setno, i); 5114 if ((ui = MDI_UNIT(mnum)) == NULL) 5115 continue; 5116 if (ui->ui_opsindex != mirror_md_ops.md_selfindex) 5117 continue; 5118 reset_mirror((mm_unit_t *)MD_UNIT(mnum), mnum, 0); 5119 5120 /* Set a flag if there is at least one mirror metadevice. */ 5121 reset_mirror_flag = 1; 5122 } 5123 5124 /* 5125 * Only wait for the global dr_timeout to finish 5126 * - if there are mirror metadevices in this diskset or 5127 * - if this is the local set since an unload of the md_mirror 5128 * driver could follow a successful mirror halt in the local set. 
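 *
 * The loop below simply polls with delay(md_hz) (roughly one second per
 * iteration) until either a mirror unit appears on mirror_md_ops.md_head
 * or the outstanding mirror_timeout has fired (dr_timeout_id cleared), so
 * that a following module unload cannot race with the timeout callback.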
5129 */ 5130 if ((reset_mirror_flag != 0) || (setno == MD_LOCAL_SET)) { 5131 while ((mirror_md_ops.md_head == NULL) && 5132 (mirror_timeout.dr_timeout_id != 0)) 5133 delay(md_hz); 5134 } 5135 5136 return (0); 5137 } 5138 5139 /*ARGSUSED3*/ 5140 static int 5141 mirror_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags) 5142 { 5143 IOLOCK lock; 5144 minor_t mnum = getminor(*dev); 5145 set_t setno; 5146 5147 /* 5148 * When doing an open of a multi owner metadevice, check to see if this 5149 * node is a starting node and if a reconfig cycle is underway. 5150 * If so, the system isn't sufficiently set up enough to handle the 5151 * open (which involves I/O during sp_validate), so fail with ENXIO. 5152 */ 5153 setno = MD_MIN2SET(mnum); 5154 if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) == 5155 (MD_SET_MNSET | MD_SET_MN_START_RC)) { 5156 return (ENXIO); 5157 } 5158 5159 if (md_oflags & MD_OFLG_FROMIOCTL) { 5160 /* 5161 * This indicates that the caller is an ioctl service routine. 5162 * In this case we initialise our stack-based IOLOCK and pass 5163 * this into the internal open routine. This allows multi-owner 5164 * metadevices to avoid deadlocking if an error is encountered 5165 * during the open() attempt. The failure case is: 5166 * s-p -> mirror -> s-p (with error). Attempting to metaclear 5167 * this configuration would deadlock as the mirror code has to 5168 * send a state-update to the other nodes when it detects the 5169 * failure of the underlying submirror with an errored soft-part 5170 * on it. As there is a class1 message in progress (metaclear) 5171 * set_sm_comp_state() cannot send another class1 message; 5172 * instead we do not send a state_update message as the 5173 * metaclear is distributed and the failed submirror will be 5174 * cleared from the configuration by the metaclear. 5175 */ 5176 IOLOCK_INIT(&lock); 5177 return (mirror_internal_open(getminor(*dev), flag, otyp, 5178 md_oflags, &lock)); 5179 } else { 5180 return (mirror_internal_open(getminor(*dev), flag, otyp, 5181 md_oflags, (IOLOCK *)NULL)); 5182 } 5183 } 5184 5185 5186 /*ARGSUSED1*/ 5187 static int 5188 mirror_close(dev_t dev, int flag, int otyp, cred_t *cred_p, int md_cflags) 5189 { 5190 return (mirror_internal_close(getminor(dev), otyp, md_cflags, 5191 (IOLOCK *)NULL)); 5192 } 5193 5194 5195 /* 5196 * This routine dumps memory to the disk. It assumes that the memory has 5197 * already been mapped into mainbus space. It is called at disk interrupt 5198 * priority when the system is in trouble. 5199 * 5200 */ 5201 static int 5202 mirror_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk) 5203 { 5204 mm_unit_t *un; 5205 dev_t mapdev; 5206 int result; 5207 int smi; 5208 int any_succeed = 0; 5209 int save_result = 0; 5210 5211 /* 5212 * Don't need to grab the unit lock. 5213 * Cause nothing else is suppose to be happenning. 5214 * Also dump is not suppose to sleep. 
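 *
 * Strategy, in brief: bdev_dump() the requested range to every writeable
 * submirror; return success if at least one copy was written, otherwise
 * return the last error seen.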
5215 */ 5216 un = (mm_unit_t *)MD_UNIT(getminor(dev)); 5217 5218 if ((diskaddr_t)blkno >= un->c.un_total_blocks) 5219 return (EINVAL); 5220 5221 if ((diskaddr_t)blkno + nblk > un->c.un_total_blocks) 5222 return (EINVAL); 5223 5224 for (smi = 0; smi < NMIRROR; smi++) { 5225 if (!SUBMIRROR_IS_WRITEABLE(un, smi)) 5226 continue; 5227 mapdev = md_dev64_to_dev(un->un_sm[smi].sm_dev); 5228 result = bdev_dump(mapdev, addr, blkno, nblk); 5229 if (result) 5230 save_result = result; 5231 5232 if (result == 0) 5233 any_succeed++; 5234 } 5235 5236 if (any_succeed) 5237 return (0); 5238 5239 return (save_result); 5240 } 5241 5242 /* 5243 * NAME: mirror_probe_dev 5244 * 5245 * DESCRITPION: force opens every component of a mirror. 5246 * 5247 * On entry the unit writerlock is held 5248 */ 5249 static int 5250 mirror_probe_dev(mdi_unit_t *ui, minor_t mnum) 5251 { 5252 int i; 5253 int smi; 5254 int ci; 5255 mm_unit_t *un; 5256 int md_devopen = 0; 5257 set_t setno; 5258 int sm_cnt; 5259 int sm_unavail_cnt; 5260 5261 if (md_unit_isopen(ui)) 5262 md_devopen++; 5263 5264 un = MD_UNIT(mnum); 5265 setno = MD_UN2SET(un); 5266 5267 sm_cnt = 0; 5268 sm_unavail_cnt = 0; 5269 for (i = 0; i < NMIRROR; i++) { 5270 md_dev64_t tmpdev; 5271 mdi_unit_t *sm_ui; 5272 5273 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) { 5274 continue; 5275 } 5276 5277 sm_cnt++; 5278 tmpdev = un->un_sm[i].sm_dev; 5279 (void) md_layered_open(mnum, &tmpdev, 5280 MD_OFLG_CONT_ERRS | MD_OFLG_PROBEDEV); 5281 un->un_sm[i].sm_dev = tmpdev; 5282 5283 sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev))); 5284 5285 /* 5286 * Logic similar to that in mirror_open_all_devs. We set or 5287 * clear the submirror Unavailable bit. 5288 */ 5289 (void) md_unit_writerlock(sm_ui); 5290 if (submirror_unavailable(un, i, 1)) { 5291 sm_ui->ui_tstate |= MD_INACCESSIBLE; 5292 sm_unavail_cnt++; 5293 } else { 5294 sm_ui->ui_tstate &= ~MD_INACCESSIBLE; 5295 } 5296 md_unit_writerexit(sm_ui); 5297 } 5298 5299 /* 5300 * If all of the submirrors are unavailable, the mirror is also 5301 * unavailable. 5302 */ 5303 if (sm_cnt == sm_unavail_cnt) { 5304 ui->ui_tstate |= MD_INACCESSIBLE; 5305 } else { 5306 ui->ui_tstate &= ~MD_INACCESSIBLE; 5307 } 5308 5309 /* 5310 * Start checking from probe failures. If failures occur we 5311 * set the appropriate erred state only if the metadevice is in 5312 * use. This is specifically to prevent unnecessary resyncs. 5313 * For instance if the disks were accidentally disconnected when 5314 * the system booted up then until the metadevice is accessed 5315 * (like file system mount) the user can shutdown, recable and 5316 * reboot w/o incurring a potentially huge resync. 
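 *
 * In short: a probe failure only provokes a component state change
 * (CS_ERRED or CS_LAST_ERRED) when the metadevice is actually open; an
 * unopened mirror whose last remaining source has failed is simply closed
 * again and ENXIO returned, so the probe itself never forces a resync.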
5317 */ 5318 5319 smi = 0; 5320 ci = 0; 5321 while (mirror_geterror(un, &smi, &ci, 1, 1) != 0) { 5322 5323 if (mirror_other_sources(un, smi, ci, 0) == 1) { 5324 /* 5325 * Note that for a MN set, there is no need to call 5326 * SE_NOTIFY as that is done when processing the 5327 * state change 5328 */ 5329 if (md_devopen) { 5330 /* 5331 * Never called from ioctl context, 5332 * so (IOLOCK *)NULL 5333 */ 5334 set_sm_comp_state(un, smi, ci, CS_LAST_ERRED, 5335 0, MD_STATE_XMIT, (IOLOCK *)NULL); 5336 if (!MD_MNSET_SETNO(setno)) { 5337 SE_NOTIFY(EC_SVM_STATE, 5338 ESC_SVM_LASTERRED, 5339 SVM_TAG_METADEVICE, setno, 5340 MD_SID(un)); 5341 } 5342 continue; 5343 } else { 5344 (void) mirror_close_all_devs(un, 5345 MD_OFLG_PROBEDEV); 5346 if (!MD_MNSET_SETNO(setno)) { 5347 SE_NOTIFY(EC_SVM_STATE, 5348 ESC_SVM_OPEN_FAIL, 5349 SVM_TAG_METADEVICE, setno, 5350 MD_SID(un)); 5351 } 5352 mirror_openfail_console_info(un, smi, ci); 5353 return (ENXIO); 5354 } 5355 } 5356 5357 /* 5358 * Note that for a MN set, there is no need to call 5359 * SE_NOTIFY as that is done when processing the 5360 * state change 5361 */ 5362 if (md_devopen) { 5363 /* Never called from ioctl context, so (IOLOCK *)NULL */ 5364 set_sm_comp_state(un, smi, ci, CS_ERRED, 0, 5365 MD_STATE_XMIT, (IOLOCK *)NULL); 5366 if (!MD_MNSET_SETNO(setno)) { 5367 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, 5368 SVM_TAG_METADEVICE, setno, 5369 MD_SID(un)); 5370 } 5371 } 5372 mirror_openfail_console_info(un, smi, ci); 5373 ci++; 5374 } 5375 5376 if (MD_MNSET_SETNO(setno)) { 5377 send_poke_hotspares(setno); 5378 } else { 5379 (void) poke_hotspares(); 5380 } 5381 (void) mirror_close_all_devs(un, MD_OFLG_PROBEDEV); 5382 5383 return (0); 5384 } 5385 5386 5387 static int 5388 mirror_imp_set( 5389 set_t setno 5390 ) 5391 { 5392 5393 mddb_recid_t recid; 5394 int gotsomething, i; 5395 mddb_type_t typ1; 5396 mddb_de_ic_t *dep; 5397 mddb_rb32_t *rbp; 5398 mm_unit32_od_t *un32; 5399 mm_unit_t *un64; 5400 minor_t *self_id; /* minor needs to be updated */ 5401 md_parent_t *parent_id; /* parent needs to be updated */ 5402 mddb_recid_t *record_id; /* record id needs to be updated */ 5403 mddb_recid_t *optrec_id; 5404 md_dev64_t tmpdev; 5405 5406 5407 gotsomething = 0; 5408 5409 typ1 = (mddb_type_t)md_getshared_key(setno, 5410 mirror_md_ops.md_driver.md_drivername); 5411 recid = mddb_makerecid(setno, 0); 5412 5413 while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) { 5414 if (mddb_getrecprivate(recid) & MD_PRV_GOTIT) 5415 continue; 5416 5417 dep = mddb_getrecdep(recid); 5418 rbp = dep->de_rb; 5419 5420 if (rbp->rb_revision == MDDB_REV_RB) { 5421 /* 5422 * Small device 5423 */ 5424 un32 = (mm_unit32_od_t *)mddb_getrecaddr(recid); 5425 self_id = &(un32->c.un_self_id); 5426 parent_id = &(un32->c.un_parent); 5427 record_id = &(un32->c.un_record_id); 5428 optrec_id = &(un32->un_rr_dirty_recid); 5429 5430 for (i = 0; i < un32->un_nsm; i++) { 5431 tmpdev = md_expldev(un32->un_sm[i].sm_dev); 5432 un32->un_sm[i].sm_dev = md_cmpldev 5433 (md_makedevice(md_major, MD_MKMIN(setno, 5434 MD_MIN2UNIT(md_getminor(tmpdev))))); 5435 5436 if (!md_update_minor(setno, mddb_getsidenum 5437 (setno), un32->un_sm[i].sm_key)) 5438 goto out; 5439 } 5440 } else { 5441 un64 = (mm_unit_t *)mddb_getrecaddr(recid); 5442 self_id = &(un64->c.un_self_id); 5443 parent_id = &(un64->c.un_parent); 5444 record_id = &(un64->c.un_record_id); 5445 optrec_id = &(un64->un_rr_dirty_recid); 5446 5447 for (i = 0; i < un64->un_nsm; i++) { 5448 tmpdev = un64->un_sm[i].sm_dev; 5449 un64->un_sm[i].sm_dev = 
md_makedevice 5450 (md_major, MD_MKMIN(setno, MD_MIN2UNIT 5451 (md_getminor(tmpdev)))); 5452 5453 if (!md_update_minor(setno, mddb_getsidenum 5454 (setno), un64->un_sm[i].sm_key)) 5455 goto out; 5456 } 5457 } 5458 5459 /* 5460 * Update unit with the imported setno 5461 * 5462 */ 5463 mddb_setrecprivate(recid, MD_PRV_GOTIT); 5464 5465 *self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id)); 5466 if (*parent_id != MD_NO_PARENT) 5467 *parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id)); 5468 *record_id = MAKERECID(setno, DBID(*record_id)); 5469 *optrec_id = MAKERECID(setno, DBID(*optrec_id)); 5470 5471 gotsomething = 1; 5472 } 5473 5474 out: 5475 return (gotsomething); 5476 } 5477 5478 /* 5479 * NAME: mirror_check_offline 5480 * 5481 * DESCRIPTION: return offline_status = 1 if any submirrors are offline 5482 * 5483 * Called from ioctl, so access to MD_UN_OFFLINE_SM in un_status is 5484 * protected by the global ioctl lock as it is only set by the MD_IOCOFFLINE 5485 * ioctl. 5486 */ 5487 int 5488 mirror_check_offline(md_dev64_t dev, int *offline_status) 5489 { 5490 mm_unit_t *un; 5491 md_error_t mde = mdnullerror; 5492 5493 if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL) 5494 return (EINVAL); 5495 *offline_status = 0; 5496 if (un->c.un_status & MD_UN_OFFLINE_SM) 5497 *offline_status = 1; 5498 return (0); 5499 } 5500 5501 /* 5502 * NAME: mirror_inc_abr_count 5503 * 5504 * DESCRIPTION: increment the count of layered soft parts with ABR set 5505 * 5506 * Called from ioctl, so access to un_abr_count is protected by the global 5507 * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl. 5508 */ 5509 int 5510 mirror_inc_abr_count(md_dev64_t dev) 5511 { 5512 mm_unit_t *un; 5513 md_error_t mde = mdnullerror; 5514 5515 if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL) 5516 return (EINVAL); 5517 un->un_abr_count++; 5518 return (0); 5519 } 5520 5521 /* 5522 * NAME: mirror_dec_abr_count 5523 * 5524 * DESCRIPTION: decrement the count of layered soft parts with ABR set 5525 * 5526 * Called from ioctl, so access to un_abr_count is protected by the global 5527 * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl. 
5528 */ 5529 int 5530 mirror_dec_abr_count(md_dev64_t dev) 5531 { 5532 mm_unit_t *un; 5533 md_error_t mde = mdnullerror; 5534 5535 if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL) 5536 return (EINVAL); 5537 un->un_abr_count--; 5538 return (0); 5539 } 5540 5541 static md_named_services_t mirror_named_services[] = { 5542 {(intptr_t (*)()) poke_hotspares, "poke hotspares" }, 5543 {(intptr_t (*)()) mirror_rename_listkids, MDRNM_LIST_URKIDS }, 5544 {mirror_rename_check, MDRNM_CHECK }, 5545 {(intptr_t (*)()) mirror_renexch_update_kids, MDRNM_UPDATE_KIDS }, 5546 {(intptr_t (*)()) mirror_exchange_parent_update_to, 5547 MDRNM_PARENT_UPDATE_TO}, 5548 {(intptr_t (*)()) mirror_exchange_self_update_from_down, 5549 MDRNM_SELF_UPDATE_FROM_DOWN }, 5550 {(intptr_t (*)())mirror_probe_dev, "probe open test" }, 5551 {(intptr_t (*)())mirror_check_offline, MD_CHECK_OFFLINE }, 5552 {(intptr_t (*)())mirror_inc_abr_count, MD_INC_ABR_COUNT }, 5553 {(intptr_t (*)())mirror_dec_abr_count, MD_DEC_ABR_COUNT }, 5554 { NULL, 0 } 5555 }; 5556 5557 md_ops_t mirror_md_ops = { 5558 mirror_open, /* open */ 5559 mirror_close, /* close */ 5560 md_mirror_strategy, /* strategy */ 5561 NULL, /* print */ 5562 mirror_dump, /* dump */ 5563 NULL, /* read */ 5564 NULL, /* write */ 5565 md_mirror_ioctl, /* mirror_ioctl, */ 5566 mirror_snarf, /* mirror_snarf */ 5567 mirror_halt, /* mirror_halt */ 5568 NULL, /* aread */ 5569 NULL, /* awrite */ 5570 mirror_imp_set, /* import set */ 5571 mirror_named_services 5572 }; 5573 5574 /* module specific initilization */ 5575 static void 5576 init_init() 5577 { 5578 md_mirror_mcs_buf_off = sizeof (md_mcs_t) - sizeof (buf_t); 5579 5580 /* Initialize the parent and child save memory pools */ 5581 mirror_parent_cache = kmem_cache_create("md_mirror_parent", 5582 sizeof (md_mps_t), 0, mirror_parent_constructor, 5583 mirror_parent_destructor, mirror_run_queue, NULL, NULL, 5584 0); 5585 5586 mirror_child_cache = kmem_cache_create("md_mirror_child", 5587 sizeof (md_mcs_t) - sizeof (buf_t) + biosize(), 0, 5588 mirror_child_constructor, mirror_child_destructor, 5589 mirror_run_queue, NULL, NULL, 0); 5590 5591 /* 5592 * Insure wowbuf_size is a multiple of DEV_BSIZE, 5593 * then initialize wowbuf memory pool. 5594 */ 5595 md_wowbuf_size = roundup(md_wowbuf_size, DEV_BSIZE); 5596 if (md_wowbuf_size <= 0) 5597 md_wowbuf_size = 2 * DEV_BSIZE; 5598 if (md_wowbuf_size > (32 * DEV_BSIZE)) 5599 md_wowbuf_size = (32 * DEV_BSIZE); 5600 5601 md_wowblk_size = md_wowbuf_size + sizeof (wowhdr_t); 5602 mirror_wowblk_cache = kmem_cache_create("md_mirror_wow", 5603 md_wowblk_size, 0, NULL, NULL, NULL, NULL, NULL, 0); 5604 5605 mutex_init(&mirror_timeout.dr_mx, NULL, MUTEX_DEFAULT, NULL); 5606 mutex_init(&hotspare_request.dr_mx, NULL, MUTEX_DEFAULT, NULL); 5607 5608 mutex_init(&non_ff_drv_mutex, NULL, MUTEX_DEFAULT, NULL); 5609 } 5610 5611 /* module specific uninitilization (undo init_init()) */ 5612 static void 5613 fini_uninit() 5614 { 5615 kmem_cache_destroy(mirror_parent_cache); 5616 kmem_cache_destroy(mirror_child_cache); 5617 kmem_cache_destroy(mirror_wowblk_cache); 5618 mirror_parent_cache = mirror_child_cache = 5619 mirror_wowblk_cache = NULL; 5620 5621 mutex_destroy(&mirror_timeout.dr_mx); 5622 mutex_destroy(&hotspare_request.dr_mx); 5623 mutex_destroy(&non_ff_drv_mutex); 5624 } 5625 5626 /* define the module linkage */ 5627 MD_PLUGIN_MISC_MODULE("mirrors module %I%", init_init(), fini_uninit()) 5628