1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/param.h> 29 #include <sys/systm.h> 30 #include <sys/conf.h> 31 #include <sys/file.h> 32 #include <sys/user.h> 33 #include <sys/uio.h> 34 #include <sys/t_lock.h> 35 #include <sys/buf.h> 36 #include <sys/dkio.h> 37 #include <sys/vtoc.h> 38 #include <sys/kmem.h> 39 #include <vm/page.h> 40 #include <sys/cmn_err.h> 41 #include <sys/sysmacros.h> 42 #include <sys/types.h> 43 #include <sys/mkdev.h> 44 #include <sys/stat.h> 45 #include <sys/open.h> 46 #include <sys/modctl.h> 47 #include <sys/ddi.h> 48 #include <sys/sunddi.h> 49 #include <sys/debug.h> 50 #include <sys/dklabel.h> 51 #include <vm/hat.h> 52 #include <sys/lvm/mdvar.h> 53 #include <sys/lvm/md_mirror.h> 54 #include <sys/lvm/md_convert.h> 55 #include <sys/lvm/md_mddb.h> 56 #include <sys/esunddi.h> 57 58 #include <sys/sysevent/eventdefs.h> 59 #include <sys/sysevent/svm.h> 60 #include <sys/lvm/mdmn_commd.h> 61 62 md_ops_t mirror_md_ops; 63 #ifndef lint 64 char _depends_on[] = "drv/md"; 65 md_ops_t *md_interface_ops = &mirror_md_ops; 66 #endif 67 68 extern mdq_anchor_t md_done_daemon; 69 extern mdq_anchor_t md_mstr_daemon; 70 extern mdq_anchor_t md_mirror_daemon; 71 extern mdq_anchor_t md_mirror_io_daemon; 72 extern mdq_anchor_t md_mirror_rs_daemon; 73 extern mdq_anchor_t md_mhs_daemon; 74 75 extern unit_t md_nunits; 76 extern set_t md_nsets; 77 extern md_set_t md_set[]; 78 79 extern int md_status; 80 extern clock_t md_hz; 81 82 extern md_krwlock_t md_unit_array_rw; 83 extern kmutex_t md_mx; 84 extern kcondvar_t md_cv; 85 extern int md_mtioctl_cnt; 86 87 daemon_request_t mirror_timeout; 88 static daemon_request_t hotspare_request; 89 static daemon_request_t mn_hs_request[MD_MAXSETS]; /* Multinode hs req */ 90 91 int md_mirror_mcs_buf_off; 92 93 /* Flags for mdmn_ksend_message to allow debugging */ 94 int md_mirror_msg_flags; 95 96 #ifdef DEBUG 97 /* Flag to switch on debug messages */ 98 int mirror_debug_flag = 0; 99 #endif 100 101 /* 102 * Struct used to hold count of DMR reads and the timestamp of last DMR read 103 * It is used to verify, using a debugger, that the DMR read ioctl has been 104 * executed. 105 */ 106 dmr_stats_t mirror_dmr_stats = {0, 0}; 107 108 /* 109 * Mutex protecting list of non-failfast drivers. 110 */ 111 static kmutex_t non_ff_drv_mutex; 112 extern char **non_ff_drivers; 113 114 extern major_t md_major; 115 116 /* 117 * Write-On-Write memory pool. 
 */
static void copy_write_cont(wowhdr_t *wowhdr);
static kmem_cache_t *mirror_wowblk_cache = NULL;
static int md_wowbuf_size = 16384;
static size_t md_wowblk_size;

/*
 * This is a flag that allows:
 *	- disabling the write-on-write mechanism.
 *	- logging occurrences of write-on-write
 *	- switching wow handling procedure processing
 * Counter for occurrences of WOW.
 */
static uint_t md_mirror_wow_flg = 0;
static int md_mirror_wow_cnt = 0;

/*
 * Tunable to enable/disable dirty region
 * processing when closing down a mirror.
 */
static int new_resync = 1;
kmem_cache_t *mirror_parent_cache = NULL;
kmem_cache_t *mirror_child_cache = NULL;

extern int md_ff_disable;		/* disable failfast */

static int mirror_map_write(mm_unit_t *, md_mcs_t *, md_mps_t *, int);
static void mirror_read_strategy(buf_t *, int, void *);
static void mirror_write_strategy(buf_t *, int, void *);
static void become_owner(daemon_queue_t *);
static int mirror_done(struct buf *cb);
static int mirror_done_common(struct buf *cb);
static void clear_retry_error(struct buf *cb);

/*
 * patchables
 */
int md_min_rr_size = 200;	/* 2000 blocks, or 100k */
int md_def_num_rr = 1000;	/* Default number of dirty regions */

/*
 * patchable to change delay before rescheduling mirror ownership request.
 * Value is clock ticks, default 0.5 seconds
 */
clock_t md_mirror_owner_to = 500000;

/*ARGSUSED1*/
static int
mirror_parent_constructor(void *p, void *d1, int d2)
{
	mutex_init(&((md_mps_t *)p)->ps_mx, NULL, MUTEX_DEFAULT, NULL);
	return (0);
}

static void
mirror_parent_init(md_mps_t *ps)
{
	bzero(ps, offsetof(md_mps_t, ps_mx));
}

/*ARGSUSED1*/
static void
mirror_parent_destructor(void *p, void *d)
{
	mutex_destroy(&((md_mps_t *)p)->ps_mx);
}

/*ARGSUSED1*/
static int
mirror_child_constructor(void *p, void *d1, int d2)
{
	bioinit(&((md_mcs_t *)p)->cs_buf);
	return (0);
}

void
mirror_child_init(md_mcs_t *cs)
{
	cs->cs_ps = NULL;
	cs->cs_mdunit = 0;
	md_bioreset(&cs->cs_buf);
}

/*ARGSUSED1*/
static void
mirror_child_destructor(void *p, void *d)
{
	biofini(&((md_mcs_t *)p)->cs_buf);
}

static void
mirror_wowblk_init(wowhdr_t *p)
{
	bzero(p, md_wowblk_size);
}

static void
send_poke_hotspares_msg(daemon_request_t *drq)
{
	int			rval;
	md_mn_msg_pokehsp_t	pokehsp;
	md_mn_kresult_t		*kresult;
	set_t			setno = (set_t)drq->dq.qlen;

	pokehsp.pokehsp_setno = setno;

	kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
	rval = mdmn_ksend_message(setno, MD_MN_MSG_POKE_HOTSPARES,
	    MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST, (char *)&pokehsp,
	    sizeof (pokehsp), kresult);

	if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
		mdmn_ksend_show_error(rval, kresult, "POKE_HOTSPARES");
		cmn_err(CE_PANIC,
		    "ksend_message failure: POKE_HOTSPARES");
	}
	kmem_free(kresult, sizeof (md_mn_kresult_t));

	/* Allow further requests to use this set's queue structure */
	mutex_enter(&drq->dr_mx);
	drq->dr_pending = 0;
	mutex_exit(&drq->dr_mx);
}

/*
 * Send a poke_hotspares message to the master node. To avoid swamping the
 * commd handler with requests we only send a message if there is not one
 * already outstanding.
 * We punt the request to a separate thread context as we cannot afford to
 * block waiting on the request to be serviced. This is essential when a
 * reconfig cycle is in progress as any open() of a multinode metadevice may
 * result in a livelock.
 */
static void
send_poke_hotspares(set_t setno)
{
	daemon_request_t	*drq = &mn_hs_request[setno];

	mutex_enter(&drq->dr_mx);
	if (drq->dr_pending == 0) {
		drq->dr_pending = 1;
		drq->dq.qlen = (int)setno;
		daemon_request(&md_mhs_daemon,
		    send_poke_hotspares_msg, (daemon_queue_t *)drq, REQ_OLD);
	}
	mutex_exit(&drq->dr_mx);
}

void
mirror_set_sm_state(
	mm_submirror_t		*sm,
	mm_submirror_ic_t	*smic,
	sm_state_t		newstate,
	int			force)
{
	int			compcnt;
	int			i;
	int			errcnt;
	sm_state_t		origstate;
	md_m_shared_t		*shared;

	if (force) {
		sm->sm_state = newstate;
		uniqtime32(&sm->sm_timestamp);
		return;
	}

	origstate = newstate;

	compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
	for (i = 0, errcnt = 0; i < compcnt; i++) {
		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
		    (sm->sm_dev, sm, i);
		if (shared->ms_state & (CS_ERRED | CS_LAST_ERRED))
			newstate |= SMS_COMP_ERRED;
		if (shared->ms_state & (CS_RESYNC))
			newstate |= SMS_COMP_RESYNC;
		if (shared->ms_state & CS_ERRED)
			errcnt++;
	}

	if ((newstate & (SMS_COMP_ERRED | SMS_COMP_RESYNC)) != 0)
		newstate &= ~origstate;

	if (errcnt == compcnt)
		newstate |= SMS_ALL_ERRED;
	else
		newstate &= ~SMS_ALL_ERRED;

	sm->sm_state = newstate;
	uniqtime32(&sm->sm_timestamp);
}

static int
mirror_geterror(mm_unit_t *un, int *smi, int *cip, int clr_error,
	int frm_probe)
{
	mm_submirror_t		*sm;
	mm_submirror_ic_t	*smic;
	md_m_shared_t		*shared;
	int			ci;
	int			i;
	int			compcnt;
	int			open_comp;	/* flag for open component */

	for (i = *smi; i < NMIRROR; i++) {
		sm = &un->un_sm[i];
		smic = &un->un_smic[i];

		if (!SMS_IS(sm, SMS_INUSE))
			continue;

		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
		for (ci = *cip; ci < compcnt; ci++) {
			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
			    (sm->sm_dev, sm, ci);
			/*
			 * If called from any routine but probe, we check for
			 * the MDM_S_ISOPEN flag. Since probe does a pseudo
			 * open, it sets the MDM_S_PROBEOPEN flag and we test
			 * for that flag instead. The two tests are mutually
			 * exclusive.
			 */
			open_comp = (frm_probe) ?
			    (shared->ms_flags & MDM_S_PROBEOPEN):
			    (shared->ms_flags & MDM_S_ISOPEN);
			if ((shared->ms_flags & MDM_S_IOERR || !open_comp) &&
			    ((shared->ms_state == CS_OKAY) ||
			    (shared->ms_state == CS_RESYNC))) {
				if (clr_error) {
					shared->ms_flags &= ~MDM_S_IOERR;
				}
				*cip = ci;
				*smi = i;
				return (1);
			}

			if (clr_error && (shared->ms_flags & MDM_S_IOERR)) {
				shared->ms_flags &= ~MDM_S_IOERR;
			}
		}

		*cip = 0;
	}
	return (0);
}

/*ARGSUSED*/
static void
mirror_run_queue(void *d)
{
	if (!(md_status & MD_GBL_DAEMONS_LIVE))
		md_daemon(1, &md_done_daemon);
}
/*
 * check_comp_4_hotspares
 *
 * This function attempts to allocate a hotspare for this component if the
 * component is in error. In a MN set, the function can be called in 2 modes.
 * It can be called either when a component error has been detected or when a
 * new hotspare has been allocated.
 * In either case, MD_HOTSPARE_XMIT is set
 * in flags and the request is sent to all nodes.
 * The handler on each of the nodes then calls this function with
 * MD_HOTSPARE_XMIT unset and the hotspare allocation is then performed.
 *
 * For non-MN sets the function simply attempts to allocate a hotspare.
 *
 * On entry, the following locks are held
 *	mirror_md_ops.md_link_rw (if flags has MD_HOTSPARE_LINKHELD set)
 *	md_unit_writerlock
 *
 * Returns	0 if ok
 *		1 if the unit containing the component has been cleared while
 *		  the mdmn_ksend_message() was being executed
 */
extern int
check_comp_4_hotspares(
	mm_unit_t	*un,
	int		smi,
	int		ci,
	uint_t		flags,
	mddb_recid_t	hs_id,	/* Only used by MN disksets */
	IOLOCK		*lockp	/* can be NULL */
)
{
	mm_submirror_t		*sm;
	mm_submirror_ic_t	*smic;
	md_m_shared_t		*shared;
	mddb_recid_t		recids[6];
	minor_t			mnum;
	intptr_t		(*hs_dev)();
	void			(*hs_done)();
	void			*hs_data;
	md_error_t		mde = mdnullerror;
	set_t			setno;
	md_mn_msg_allochsp_t	allochspmsg;
	md_mn_kresult_t		*kresult;
	mm_unit_t		*new_un;
	int			rval;

	mnum = MD_SID(un);
	setno = MD_UN2SET(un);
	sm = &un->un_sm[smi];
	smic = &un->un_smic[smi];
	shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
	    (sm->sm_dev, sm, ci);

	if (shared->ms_state != CS_ERRED)
		return (0);

	/* Don't start a new component resync if a resync is already running. */
	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
		return (0);

	if (MD_MNSET_SETNO(setno) && (flags & MD_HOTSPARE_XMIT)) {
		uint_t		msgflags;
		md_mn_msgtype_t	msgtype;

		/* Send allocate hotspare message to all nodes */

		allochspmsg.msg_allochsp_mnum = un->c.un_self_id;
		allochspmsg.msg_allochsp_sm = smi;
		allochspmsg.msg_allochsp_comp = ci;
		allochspmsg.msg_allochsp_hs_id = shared->ms_hs_id;

		/*
		 * Before calling mdmn_ksend_message(), release locks.
		 * Can never be in the context of an ioctl.
		 */
		md_unit_writerexit(MDI_UNIT(mnum));
		if (flags & MD_HOTSPARE_LINKHELD)
			rw_exit(&mirror_md_ops.md_link_rw.lock);
#ifdef DEBUG
		if (mirror_debug_flag)
			printf("send alloc hotspare, flags=0x%x %x, %x, %x, %x\n",
			    flags,
			    allochspmsg.msg_allochsp_mnum,
			    allochspmsg.msg_allochsp_sm,
			    allochspmsg.msg_allochsp_comp,
			    allochspmsg.msg_allochsp_hs_id);
#endif
		if (flags & MD_HOTSPARE_WMUPDATE) {
			msgtype = MD_MN_MSG_ALLOCATE_HOTSPARE2;
			/*
			 * When coming from an update of watermarks, there
			 * must already be a message logged that triggered
			 * this action. So, no need to log this message, too.
			 */
			msgflags = MD_MSGF_NO_LOG;
		} else {
			msgtype = MD_MN_MSG_ALLOCATE_HOTSPARE;
			msgflags = MD_MSGF_DEFAULT_FLAGS;
		}

		kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
		rval = mdmn_ksend_message(setno, msgtype, msgflags,
		    (char *)&allochspmsg, sizeof (allochspmsg),
		    kresult);

		if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
#ifdef DEBUG
			if (mirror_debug_flag)
				mdmn_ksend_show_error(rval, kresult,
				    "ALLOCATE HOTSPARE");
#endif
			/*
			 * If message is sent ok but exitval indicates an error
			 * it must be because the mirror has been cleared.
In 484 * this case re-obtain lock and return an error 485 */ 486 if ((rval == 0) && (kresult->kmmr_exitval != 0)) { 487 if (flags & MD_HOTSPARE_LINKHELD) { 488 rw_enter(&mirror_md_ops.md_link_rw.lock, 489 RW_READER); 490 } 491 kmem_free(kresult, sizeof (md_mn_kresult_t)); 492 return (1); 493 } 494 cmn_err(CE_PANIC, 495 "ksend_message failure: ALLOCATE_HOTSPARE"); 496 } 497 kmem_free(kresult, sizeof (md_mn_kresult_t)); 498 499 /* 500 * re-obtain the locks 501 */ 502 if (flags & MD_HOTSPARE_LINKHELD) 503 rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER); 504 new_un = md_unit_writerlock(MDI_UNIT(mnum)); 505 506 /* 507 * As we had to release the locks in order to send the 508 * message to all nodes, we need to check to see if the 509 * unit has changed. If it has we release the writerlock 510 * and return fail. 511 */ 512 if ((new_un != un) || (un->c.un_type != MD_METAMIRROR)) { 513 md_unit_writerexit(MDI_UNIT(mnum)); 514 return (1); 515 } 516 } else { 517 if (MD_MNSET_SETNO(setno)) { 518 /* 519 * If 2 or more nodes simultaneously see a 520 * component failure, these nodes will each 521 * send an ALLOCATE_HOTSPARE[2] message. 522 * The first message will allocate the hotspare 523 * and the subsequent messages should do nothing. 524 * 525 * If a slave node doesn't have a hotspare allocated 526 * at the time the message is initiated, then the 527 * passed in hs_id will be 0. If the node 528 * executing this routine has a component shared 529 * ms_hs_id of non-zero, but the message shows a 530 * hs_id of 0, then just return since a hotspare 531 * has already been allocated for this failing 532 * component. When the slave node returns from 533 * the ksend_message the hotspare will have 534 * already been allocated. 535 * 536 * If the slave node does send an hs_id of non-zero, 537 * and the slave node's hs_id matches this node's 538 * ms_hs_id, then the hotspare has error'd and 539 * should be replaced. 540 * 541 * If the slave node sends an hs_id of non-zero and 542 * this node has a different shared ms_hs_id, then 543 * just return since this hotspare has already 544 * been hotspared. 545 */ 546 if (shared->ms_hs_id != 0) { 547 if (hs_id == 0) { 548 #ifdef DEBUG 549 if (mirror_debug_flag) { 550 printf("check_comp_4_hotspares" 551 "(NOXMIT), short circuit " 552 "hs_id=0x%x, " 553 "ms_hs_id=0x%x\n", 554 hs_id, shared->ms_hs_id); 555 } 556 #endif 557 return (0); 558 } 559 if (hs_id != shared->ms_hs_id) { 560 #ifdef DEBUG 561 if (mirror_debug_flag) { 562 printf("check_comp_4_hotspares" 563 "(NOXMIT), short circuit2 " 564 "hs_id=0x%x, " 565 "ms_hs_id=0x%x\n", 566 hs_id, shared->ms_hs_id); 567 } 568 #endif 569 return (0); 570 } 571 } 572 } 573 574 sm = &un->un_sm[smi]; 575 hs_dev = md_get_named_service(sm->sm_dev, 0, 576 "hotspare device", 0); 577 if ((*hs_dev)(sm->sm_dev, 0, ci, recids, 6, &hs_done, 578 &hs_data) != 0) 579 return (0); 580 581 /* 582 * set_sm_comp_state() commits the modified records. 583 * As we don't transmit the changes, no need to drop the lock. 584 */ 585 set_sm_comp_state(un, smi, ci, CS_RESYNC, recids, 586 MD_STATE_NO_XMIT, (IOLOCK *)NULL); 587 588 (*hs_done)(sm->sm_dev, hs_data); 589 590 mirror_check_failfast(mnum); 591 592 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_HOTSPARED, SVM_TAG_METADEVICE, 593 setno, MD_SID(un)); 594 595 /* 596 * For a multi-node set we need to reset the un_rs_type, 597 * un_rs_resync_done and un_rs_resync_2_do fields as the 598 * hot-spare resync must copy all applicable data. 
599 */ 600 if (MD_MNSET_SETNO(setno)) { 601 un->un_rs_type = MD_RS_NONE; 602 un->un_rs_resync_done = 0; 603 un->un_rs_resync_2_do = 0; 604 } 605 606 /* 607 * Must drop writer lock since mirror_resync_unit will 608 * open devices and must be able to grab readerlock. 609 * Don't need to drop IOLOCK since any descendent routines 610 * calling ksend_messages will drop the IOLOCK as needed. 611 * 612 */ 613 if (lockp) { 614 md_ioctl_writerexit(lockp); 615 } else { 616 md_unit_writerexit(MDI_UNIT(mnum)); 617 } 618 619 /* start resync */ 620 (void) mirror_resync_unit(mnum, NULL, &mde, lockp); 621 622 if (lockp) { 623 new_un = md_ioctl_writerlock(lockp, MDI_UNIT(mnum)); 624 } else { 625 new_un = md_unit_writerlock(MDI_UNIT(mnum)); 626 } 627 } 628 return (0); 629 } 630 631 /* 632 * check_unit_4_hotspares 633 * 634 * For a given mirror, allocate hotspares, if available for any components 635 * that are in error 636 * 637 * Returns 0 if ok 638 * 1 if check_comp_4_hotspares returns non-zero. This will only 639 * happen for a MN unit where the unit has been cleared while 640 * the allocate hotspare message is sent to all nodes. 641 */ 642 static int 643 check_unit_4_hotspares(mm_unit_t *un, int flags) 644 { 645 mm_submirror_t *sm; 646 mm_submirror_ic_t *smic; 647 int ci; 648 int i; 649 int compcnt; 650 651 if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) 652 return (0); 653 654 for (i = 0; i < NMIRROR; i++) { 655 sm = &un->un_sm[i]; 656 smic = &un->un_smic[i]; 657 if (!SMS_IS(sm, SMS_INUSE)) 658 continue; 659 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, sm); 660 for (ci = 0; ci < compcnt; ci++) { 661 md_m_shared_t *shared; 662 663 shared = (md_m_shared_t *) 664 (*(smic->sm_shared_by_indx))(sm->sm_dev, 665 sm, ci); 666 /* 667 * Never called from ioctl context, so pass in 668 * (IOLOCK *)NULL. Pass through flags from calling 669 * routine, also setting XMIT flag. 670 */ 671 if (check_comp_4_hotspares(un, i, ci, 672 (MD_HOTSPARE_XMIT | flags), 673 shared->ms_hs_id, (IOLOCK *)NULL) != 0) 674 return (1); 675 } 676 } 677 return (0); 678 } 679 680 static void 681 check_4_hotspares(daemon_request_t *drq) 682 { 683 mdi_unit_t *ui; 684 mm_unit_t *un; 685 md_link_t *next; 686 int x; 687 688 mutex_enter(&drq->dr_mx); /* clear up front so can poke */ 689 drq->dr_pending = 0; /* again in low level routine if */ 690 mutex_exit(&drq->dr_mx); /* something found to do */ 691 692 /* 693 * Used to have a problem here. The disksets weren't marked as being 694 * MNHOLD. This opened a window where we could be searching for 695 * hotspares and have the disk set unloaded (released) from under 696 * us causing a panic in stripe_component_count(). 697 * The way to prevent that is to mark the set MNHOLD which prevents 698 * any diskset from being released while we are scanning the mirrors, 699 * submirrors and components. 
	 */

	for (x = 0; x < md_nsets; x++)
		md_holdset_enter(x);

	rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
	for (next = mirror_md_ops.md_head; next != NULL; next = next->ln_next) {
		ui = MDI_UNIT(next->ln_id);

		un = (mm_unit_t *)md_unit_readerlock(ui);

		/*
		 * Only check the unit if we are the master for this set.
		 * For an MN set, poke_hotspares() is only effective on the
		 * master.
		 */
		if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
		    md_set[MD_UN2SET(un)].s_am_i_master == 0) {
			md_unit_readerexit(ui);
			continue;
		}
		if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) {
			md_unit_readerexit(ui);
			continue;
		}
		md_unit_readerexit(ui);

		un = (mm_unit_t *)md_unit_writerlock(ui);
		/*
		 * check_unit_4_hotspares will exit 1 if the unit has been
		 * removed during the process of allocating the hotspare.
		 * This can only happen for a MN metadevice. If unit no longer
		 * exists, no need to release writerlock
		 */
		if (check_unit_4_hotspares(un, MD_HOTSPARE_LINKHELD) == 0)
			md_unit_writerexit(ui);
		else {
			/*
			 * If check_unit_4_hotspares failed, queue another
			 * request and break out of this one
			 */
			(void) poke_hotspares();
			break;
		}
	}
	rw_exit(&mirror_md_ops.md_link_rw.lock);

	for (x = 0; x < md_nsets; x++)
		md_holdset_exit(x);
}

/*
 * poke_hotspares
 *
 * If there is not already a poke_hotspares request pending, queue a request
 * to call check_4_hotspares(). This will scan all mirrors and attempt to
 * allocate hotspares for all components in error.
 */
int
poke_hotspares()
{
	mutex_enter(&hotspare_request.dr_mx);
	if (hotspare_request.dr_pending == 0) {
		hotspare_request.dr_pending = 1;
		daemon_request(&md_mhs_daemon,
		    check_4_hotspares,
		    (daemon_queue_t *)&hotspare_request, REQ_OLD);
	}
	mutex_exit(&hotspare_request.dr_mx);
	return (0);
}

static void
free_all_ecomps(err_comp_t *ecomp)
{
	err_comp_t	*d;

	while (ecomp != NULL) {
		d = ecomp;
		ecomp = ecomp->ec_next;
		kmem_free(d, sizeof (err_comp_t));
	}
}

/*
 * NAME: mirror_openfail_console_info
 *
 * DESCRIPTION: Prints an informative message to the console when a mirror
 *		cannot be opened.
 *
 * PARAMETERS: mm_unit_t un - pointer to mirror unit structure
 *	       int smi - submirror index
 *	       int ci - component index
 */

void
mirror_openfail_console_info(mm_unit_t *un, int smi, int ci)
{
	void		(*get_dev)();
	ms_cd_info_t	cd;
	md_dev64_t	tmpdev;

	tmpdev = un->un_sm[smi].sm_dev;
	get_dev = (void (*)())md_get_named_service(tmpdev, 0, "get device", 0);
	if (get_dev != NULL) {
		(void) (*get_dev)(tmpdev, smi, ci, &cd);
		cmn_err(CE_WARN, "md %s: open error on %s",
		    md_shortname(MD_SID(un)),
		    md_devname(MD_UN2SET(un), cd.cd_dev,
		    NULL, 0));
	} else {
		cmn_err(CE_WARN, "md %s: open error",
		    md_shortname(MD_SID(un)));
	}
}

static int
mirror_close_all_devs(mm_unit_t *un, int md_cflags)
{
	int		i;
	md_dev64_t	dev;

	for (i = 0; i < NMIRROR; i++) {
		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
			continue;
		dev = un->un_sm[i].sm_dev;
		md_layered_close(dev, md_cflags);
	}
	return (0);
}

/*
 * Keep track of drivers that don't support failfast.
We use this so that 833 * we only log one diagnostic message for each of these drivers, no matter 834 * how many times we run the mirror_check_failfast function. 835 * Return 1 if this is a new driver that does not support failfast, 836 * return 0 if we have already seen this non-failfast driver. 837 */ 838 static int 839 new_non_ff_driver(const char *s) 840 { 841 mutex_enter(&non_ff_drv_mutex); 842 if (non_ff_drivers == NULL) { 843 non_ff_drivers = (char **)kmem_alloc(2 * sizeof (char *), 844 KM_NOSLEEP); 845 if (non_ff_drivers == NULL) { 846 mutex_exit(&non_ff_drv_mutex); 847 return (1); 848 } 849 850 non_ff_drivers[0] = (char *)kmem_alloc(strlen(s) + 1, KM_NOSLEEP); 851 if (non_ff_drivers[0] == NULL) { 852 kmem_free(non_ff_drivers, 2 * sizeof (char *)); 853 non_ff_drivers = NULL; 854 mutex_exit(&non_ff_drv_mutex); 855 return (1); 856 } 857 858 (void) strcpy(non_ff_drivers[0], s); 859 non_ff_drivers[1] = NULL; 860 861 } else { 862 int i; 863 char **tnames; 864 char **tmp; 865 866 for (i = 0; non_ff_drivers[i] != NULL; i++) { 867 if (strcmp(s, non_ff_drivers[i]) == 0) { 868 mutex_exit(&non_ff_drv_mutex); 869 return (0); 870 } 871 } 872 873 /* allow for new element and null */ 874 i += 2; 875 tnames = (char **)kmem_alloc(i * sizeof (char *), KM_NOSLEEP); 876 if (tnames == NULL) { 877 mutex_exit(&non_ff_drv_mutex); 878 return (1); 879 } 880 881 for (i = 0; non_ff_drivers[i] != NULL; i++) 882 tnames[i] = non_ff_drivers[i]; 883 884 tnames[i] = (char *)kmem_alloc(strlen(s) + 1, KM_NOSLEEP); 885 if (tnames[i] == NULL) { 886 /* adjust i so that it is the right count to free */ 887 kmem_free(tnames, (i + 2) * sizeof (char *)); 888 mutex_exit(&non_ff_drv_mutex); 889 return (1); 890 } 891 892 (void) strcpy(tnames[i++], s); 893 tnames[i] = NULL; 894 895 tmp = non_ff_drivers; 896 non_ff_drivers = tnames; 897 /* i now represents the count we previously alloced */ 898 kmem_free(tmp, i * sizeof (char *)); 899 } 900 mutex_exit(&non_ff_drv_mutex); 901 902 return (1); 903 } 904 905 /* 906 * Check for the "ddi-failfast-supported" devtree property on each submirror 907 * component to indicate if we should do I/O to that submirror with the 908 * B_FAILFAST flag set or not. This check is made at various state transitions 909 * in the mirror code (e.g. open, enable, hotspare, etc.). Sometimes we 910 * only need to check one drive (e.g. hotspare) but since the check is 911 * fast and infrequent and sometimes needs to be done on all components we 912 * just check all components on each call. 
913 */ 914 void 915 mirror_check_failfast(minor_t mnum) 916 { 917 int i; 918 mm_unit_t *un; 919 920 if (md_ff_disable) 921 return; 922 923 un = MD_UNIT(mnum); 924 925 for (i = 0; i < NMIRROR; i++) { 926 int ci; 927 int cnt; 928 int ff = 1; 929 mm_submirror_t *sm; 930 mm_submirror_ic_t *smic; 931 void (*get_dev)(); 932 933 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) 934 continue; 935 936 sm = &un->un_sm[i]; 937 smic = &un->un_smic[i]; 938 939 get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0, 940 "get device", 0); 941 942 cnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm); 943 for (ci = 0; ci < cnt; ci++) { 944 int found = 0; 945 dev_t ci_dev; 946 major_t major; 947 dev_info_t *devi; 948 ms_cd_info_t cd; 949 950 /* this already returns the hs dev if the device is spared */ 951 (void) (*get_dev)(sm->sm_dev, sm, ci, &cd); 952 953 ci_dev = md_dev64_to_dev(cd.cd_dev); 954 major = getmajor(ci_dev); 955 956 if (major == md_major) { 957 /* this component must be a soft partition; get real dev */ 958 minor_t dev_mnum; 959 mdi_unit_t *ui; 960 mp_unit_t *un; 961 set_t setno; 962 side_t side; 963 md_dev64_t tmpdev; 964 965 ui = MDI_UNIT(getminor(ci_dev)); 966 967 /* grab necessary lock */ 968 un = (mp_unit_t *)md_unit_readerlock(ui); 969 970 dev_mnum = MD_SID(un); 971 setno = MD_MIN2SET(dev_mnum); 972 side = mddb_getsidenum(setno); 973 974 tmpdev = un->un_dev; 975 976 /* Get dev by device id */ 977 if (md_devid_found(setno, side, un->un_key) == 1) { 978 tmpdev = md_resolve_bydevid(dev_mnum, tmpdev, 979 un->un_key); 980 } 981 982 md_unit_readerexit(ui); 983 984 ci_dev = md_dev64_to_dev(tmpdev); 985 major = getmajor(ci_dev); 986 } 987 988 if (ci_dev != NODEV32 && 989 (devi = e_ddi_hold_devi_by_dev(ci_dev, 0)) != NULL) { 990 ddi_prop_op_t prop_op = PROP_LEN_AND_VAL_BUF; 991 int propvalue = 0; 992 int proplength = sizeof (int); 993 int error; 994 struct cb_ops *cb; 995 996 if ((cb = devopsp[major]->devo_cb_ops) != NULL) { 997 error = (*cb->cb_prop_op)(DDI_DEV_T_ANY, devi, prop_op, 998 DDI_PROP_NOTPROM|DDI_PROP_DONTPASS, 999 "ddi-failfast-supported", 1000 (caddr_t)&propvalue, &proplength); 1001 1002 if (error == DDI_PROP_SUCCESS) 1003 found = 1; 1004 } 1005 1006 if (!found && new_non_ff_driver(ddi_driver_name(devi))) 1007 cmn_err(CE_NOTE, "!md: B_FAILFAST I/O disabled on %s", 1008 ddi_driver_name(devi)); 1009 1010 ddi_release_devi(devi); 1011 } 1012 1013 /* All components must support failfast in the submirror. */ 1014 if (!found) { 1015 ff = 0; 1016 break; 1017 } 1018 } 1019 1020 if (ff) { 1021 sm->sm_flags |= MD_SM_FAILFAST; 1022 } else { 1023 sm->sm_flags &= ~MD_SM_FAILFAST; 1024 } 1025 } 1026 } 1027 1028 /* 1029 * Return true if the submirror is unavailable. 1030 * If any of the submirror components are opened then the submirror cannot 1031 * be unavailable (MD_INACCESSIBLE). 1032 * If any of the components are already in the errored state, then the submirror 1033 * cannot be unavailable (MD_INACCESSIBLE). 
1034 */ 1035 static bool_t 1036 submirror_unavailable(mm_unit_t *un, int smi, int from_probe) 1037 { 1038 mm_submirror_t *sm; 1039 mm_submirror_ic_t *smic; 1040 md_m_shared_t *shared; 1041 int ci; 1042 int compcnt; 1043 1044 sm = &un->un_sm[smi]; 1045 smic = &un->un_smic[smi]; 1046 1047 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un); 1048 for (ci = 0; ci < compcnt; ci++) { 1049 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) 1050 (sm->sm_dev, sm, ci); 1051 if (from_probe) { 1052 if (shared->ms_flags & MDM_S_PROBEOPEN) 1053 return (B_FALSE); 1054 } else { 1055 if (shared->ms_flags & MDM_S_ISOPEN) 1056 return (B_FALSE); 1057 } 1058 if (shared->ms_state == CS_ERRED || 1059 shared->ms_state == CS_LAST_ERRED) 1060 return (B_FALSE); 1061 } 1062 1063 return (B_TRUE); 1064 } 1065 1066 static int 1067 mirror_open_all_devs(minor_t mnum, int md_oflags, IOLOCK *lockp) 1068 { 1069 int i; 1070 mm_unit_t *un; 1071 mdi_unit_t *ui; 1072 int err; 1073 int smi; 1074 int ci; 1075 err_comp_t *c; 1076 err_comp_t *ecomps = NULL; 1077 int smmask = 0; 1078 set_t setno; 1079 int sm_cnt; 1080 int sm_unavail_cnt; 1081 1082 mirror_check_failfast(mnum); 1083 1084 un = MD_UNIT(mnum); 1085 ui = MDI_UNIT(mnum); 1086 setno = MD_UN2SET(un); 1087 1088 for (i = 0; i < NMIRROR; i++) { 1089 md_dev64_t tmpdev = un->un_sm[i].sm_dev; 1090 1091 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) 1092 continue; 1093 if (md_layered_open(mnum, &tmpdev, md_oflags)) 1094 smmask |= SMI2BIT(i); 1095 un->un_sm[i].sm_dev = tmpdev; 1096 } 1097 1098 /* 1099 * If smmask is clear, all submirrors are accessible. Clear the 1100 * MD_INACCESSIBLE bit in this case. This bit is also cleared for the 1101 * mirror device. If smmask is set, we have to determine which of the 1102 * submirrors are in error. If no submirror is accessible we mark the 1103 * whole mirror as MD_INACCESSIBLE. 1104 */ 1105 if (smmask == 0) { 1106 if (lockp) { 1107 md_ioctl_readerexit(lockp); 1108 (void) md_ioctl_writerlock(lockp, ui); 1109 } else { 1110 md_unit_readerexit(ui); 1111 (void) md_unit_writerlock(ui); 1112 } 1113 ui->ui_tstate &= ~MD_INACCESSIBLE; 1114 if (lockp) { 1115 md_ioctl_writerexit(lockp); 1116 (void) md_ioctl_readerlock(lockp, ui); 1117 } else { 1118 md_unit_writerexit(ui); 1119 (void) md_unit_readerlock(ui); 1120 } 1121 1122 for (i = 0; i < NMIRROR; i++) { 1123 md_dev64_t tmpdev; 1124 mdi_unit_t *sm_ui; 1125 1126 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) 1127 continue; 1128 1129 tmpdev = un->un_sm[i].sm_dev; 1130 sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev))); 1131 (void) md_unit_writerlock(sm_ui); 1132 sm_ui->ui_tstate &= ~MD_INACCESSIBLE; 1133 md_unit_writerexit(sm_ui); 1134 } 1135 1136 return (0); 1137 } 1138 1139 for (i = 0; i < NMIRROR; i++) { 1140 md_dev64_t tmpdev; 1141 1142 if (!(smmask & SMI2BIT(i))) 1143 continue; 1144 1145 tmpdev = un->un_sm[i].sm_dev; 1146 err = md_layered_open(mnum, &tmpdev, MD_OFLG_CONT_ERRS); 1147 un->un_sm[i].sm_dev = tmpdev; 1148 ASSERT(err == 0); 1149 } 1150 1151 if (lockp) { 1152 md_ioctl_readerexit(lockp); 1153 un = (mm_unit_t *)md_ioctl_writerlock(lockp, ui); 1154 } else { 1155 md_unit_readerexit(ui); 1156 un = (mm_unit_t *)md_unit_writerlock(ui); 1157 } 1158 1159 /* 1160 * We want to make sure the unavailable flag is not masking a real 1161 * error on the submirror. 1162 * For each submirror, 1163 * if all of the submirror components couldn't be opened and there 1164 * are no errors on the submirror, then set the unavailable flag 1165 * otherwise, clear unavailable. 
1166 */ 1167 sm_cnt = 0; 1168 sm_unavail_cnt = 0; 1169 for (i = 0; i < NMIRROR; i++) { 1170 md_dev64_t tmpdev; 1171 mdi_unit_t *sm_ui; 1172 1173 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) 1174 continue; 1175 1176 sm_cnt++; 1177 tmpdev = un->un_sm[i].sm_dev; 1178 sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev))); 1179 1180 (void) md_unit_writerlock(sm_ui); 1181 if (submirror_unavailable(un, i, 0)) { 1182 sm_ui->ui_tstate |= MD_INACCESSIBLE; 1183 sm_unavail_cnt++; 1184 } else { 1185 sm_ui->ui_tstate &= ~MD_INACCESSIBLE; 1186 } 1187 md_unit_writerexit(sm_ui); 1188 } 1189 1190 /* 1191 * If all of the submirrors are unavailable, the mirror is also 1192 * unavailable. 1193 */ 1194 if (sm_cnt == sm_unavail_cnt) { 1195 ui->ui_tstate |= MD_INACCESSIBLE; 1196 } else { 1197 ui->ui_tstate &= ~MD_INACCESSIBLE; 1198 } 1199 1200 smi = 0; 1201 ci = 0; 1202 while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) { 1203 if (mirror_other_sources(un, smi, ci, 1) == 1) { 1204 1205 free_all_ecomps(ecomps); 1206 (void) mirror_close_all_devs(un, md_oflags); 1207 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, 1208 SVM_TAG_METADEVICE, setno, MD_SID(un)); 1209 mirror_openfail_console_info(un, smi, ci); 1210 if (lockp) { 1211 md_ioctl_writerexit(lockp); 1212 (void) md_ioctl_readerlock(lockp, ui); 1213 } else { 1214 md_unit_writerexit(ui); 1215 (void) md_unit_readerlock(ui); 1216 } 1217 return (ENXIO); 1218 } 1219 1220 /* track all component states that need changing */ 1221 c = (err_comp_t *)kmem_alloc(sizeof (err_comp_t), KM_SLEEP); 1222 c->ec_next = ecomps; 1223 c->ec_smi = smi; 1224 c->ec_ci = ci; 1225 ecomps = c; 1226 ci++; 1227 } 1228 1229 /* Make all state changes and commit them */ 1230 for (c = ecomps; c != NULL; c = c->ec_next) { 1231 /* 1232 * If lockp is set, then entering kernel through ioctl. 1233 * For a MN set, the only ioctl path is via a commd message 1234 * (ALLOCATE_HOTSPARE or *RESYNC* messages) that is already 1235 * being sent to each node. 1236 * In this case, set NO_XMIT so that set_sm_comp_state 1237 * won't attempt to send a message on a message. 1238 * 1239 * In !MN sets, the xmit flag is ignored, so it doesn't matter 1240 * which flag is passed. 1241 */ 1242 if (lockp) { 1243 set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0, 1244 MD_STATE_NO_XMIT, lockp); 1245 } else { 1246 set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0, 1247 (MD_STATE_XMIT | MD_STATE_OCHELD), lockp); 1248 } 1249 /* 1250 * For a MN set, the NOTIFY is done when the state change is 1251 * processed on each node 1252 */ 1253 if (!MD_MNSET_SETNO(setno)) { 1254 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, 1255 SVM_TAG_METADEVICE, setno, MD_SID(un)); 1256 } 1257 } 1258 1259 if (lockp) { 1260 md_ioctl_writerexit(lockp); 1261 (void) md_ioctl_readerlock(lockp, ui); 1262 } else { 1263 md_unit_writerexit(ui); 1264 (void) md_unit_readerlock(ui); 1265 } 1266 1267 free_all_ecomps(ecomps); 1268 1269 /* allocate hotspares for all errored components */ 1270 if (MD_MNSET_SETNO(setno)) { 1271 /* 1272 * If we're called from an ioctl (lockp set) then we cannot 1273 * directly call send_poke_hotspares as this will block until 1274 * the message gets despatched to all nodes. If the cluster is 1275 * going through a reconfig cycle then the message will block 1276 * until the cycle is complete, and as we originate from a 1277 * service call from commd we will livelock. 
1278 */ 1279 if (lockp == NULL) { 1280 md_unit_readerexit(ui); 1281 send_poke_hotspares(setno); 1282 (void) md_unit_readerlock(ui); 1283 } 1284 } else { 1285 (void) poke_hotspares(); 1286 } 1287 return (0); 1288 } 1289 1290 void 1291 mirror_overlap_chain_remove(md_mps_t *ps) 1292 { 1293 mm_unit_t *un; 1294 1295 if (panicstr) 1296 return; 1297 1298 ASSERT(ps->ps_flags & MD_MPS_ON_OVERLAP); 1299 1300 un = ps->ps_un; 1301 1302 mutex_enter(&un->un_ovrlap_chn_mx); 1303 if (ps->ps_ovrlap_prev != &un->un_ovrlap_chn) 1304 ps->ps_ovrlap_prev->ps_ovrlap_next = ps->ps_ovrlap_next; 1305 else 1306 un->un_ovrlap_chn.ps_ovrlap_next = ps->ps_ovrlap_next; 1307 if (ps->ps_ovrlap_next != &un->un_ovrlap_chn) 1308 ps->ps_ovrlap_next->ps_ovrlap_prev = ps->ps_ovrlap_prev; 1309 else 1310 un->un_ovrlap_chn.ps_ovrlap_prev = ps->ps_ovrlap_prev; 1311 /* Handle empty overlap chain */ 1312 if (un->un_ovrlap_chn.ps_ovrlap_prev == &un->un_ovrlap_chn) { 1313 un->un_ovrlap_chn.ps_ovrlap_prev = 1314 un->un_ovrlap_chn.ps_ovrlap_next = NULL; 1315 } 1316 if (un->un_ovrlap_chn_flg) { 1317 un->un_ovrlap_chn_flg = 0; 1318 cv_broadcast(&un->un_ovrlap_chn_cv); 1319 } 1320 ps->ps_flags &= ~MD_MPS_ON_OVERLAP; 1321 mutex_exit(&un->un_ovrlap_chn_mx); 1322 } 1323 1324 1325 /* 1326 * wait_for_overlaps: 1327 * ----------------- 1328 * Check that given i/o request does not cause an overlap with already pending 1329 * i/o. If it does, block until the overlapped i/o completes. 1330 * 1331 * Note: the overlap chain is held as a monotonically increasing doubly-linked 1332 * list with the sentinel contained in un->un_ovrlap_chn. We avoid a linear 1333 * search of the list by the following logic: 1334 * ps->ps_lastblk < un_ovrlap_chn.ps_ovrlap_next->ps_firstblk => No overlap 1335 * ps->ps_firstblk > un_ovrlap_chn.ps_ovrlap_prev->ps_lastblk => No overlap 1336 * otherwise 1337 * scan un_ovrlap_chn.ps_ovrlap_next for location where ps->ps_firstblk 1338 * > chain->ps_lastblk. This is the insertion point. As the list is 1339 * guaranteed to be ordered there is no need to continue scanning. 
1340 * 1341 * The flag argument has MD_OVERLAP_ALLOW_REPEAT set if it is ok for the parent 1342 * structure to be already on the overlap chain and MD_OVERLAP_NO_REPEAT 1343 * if it must not already be on the chain 1344 */ 1345 static void 1346 wait_for_overlaps(md_mps_t *ps, int flags) 1347 { 1348 mm_unit_t *un; 1349 md_mps_t *ps1, **head, **tail; 1350 1351 if (panicstr) 1352 return; 1353 1354 1355 un = ps->ps_un; 1356 1357 mutex_enter(&un->un_ovrlap_chn_mx); 1358 if ((flags & MD_OVERLAP_ALLOW_REPEAT) && 1359 (ps->ps_flags & MD_MPS_ON_OVERLAP)) { 1360 mutex_exit(&un->un_ovrlap_chn_mx); 1361 return; 1362 } 1363 1364 ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP)); 1365 head = &(un->un_ovrlap_chn.ps_ovrlap_next); 1366 tail = &(un->un_ovrlap_chn.ps_ovrlap_prev); 1367 ps1 = *head; 1368 /* 1369 * Check for simple limit cases: 1370 * *head == NULL 1371 * insert ps at head of list 1372 * lastblk < head->firstblk 1373 * insert at head of list 1374 * firstblk > tail->lastblk 1375 * insert at tail of list 1376 */ 1377 if (ps1 == NULL) { 1378 /* Insert at head */ 1379 ps->ps_ovrlap_next = &un->un_ovrlap_chn; 1380 ps->ps_ovrlap_prev = &un->un_ovrlap_chn; 1381 *head = ps; 1382 *tail = ps; 1383 ps->ps_flags |= MD_MPS_ON_OVERLAP; 1384 mutex_exit(&un->un_ovrlap_chn_mx); 1385 return; 1386 } else if (ps->ps_lastblk < (*head)->ps_firstblk) { 1387 /* Insert at head */ 1388 ps->ps_ovrlap_next = (*head); 1389 ps->ps_ovrlap_prev = &un->un_ovrlap_chn; 1390 (*head)->ps_ovrlap_prev = ps; 1391 *head = ps; 1392 ps->ps_flags |= MD_MPS_ON_OVERLAP; 1393 mutex_exit(&un->un_ovrlap_chn_mx); 1394 return; 1395 } else if (ps->ps_firstblk > (*tail)->ps_lastblk) { 1396 /* Insert at tail */ 1397 ps->ps_ovrlap_prev = (*tail); 1398 ps->ps_ovrlap_next = &un->un_ovrlap_chn; 1399 (*tail)->ps_ovrlap_next = ps; 1400 *tail = ps; 1401 ps->ps_flags |= MD_MPS_ON_OVERLAP; 1402 mutex_exit(&un->un_ovrlap_chn_mx); 1403 return; 1404 } 1405 /* Now we have to scan the list for possible overlaps */ 1406 while (ps1 != NULL) { 1407 /* 1408 * If this region has been put on the chain by another thread 1409 * just exit 1410 */ 1411 if ((flags & MD_OVERLAP_ALLOW_REPEAT) && 1412 (ps->ps_flags & MD_MPS_ON_OVERLAP)) { 1413 mutex_exit(&un->un_ovrlap_chn_mx); 1414 return; 1415 1416 } 1417 for (ps1 = *head; ps1 && (ps1 != &un->un_ovrlap_chn); 1418 ps1 = ps1->ps_ovrlap_next) { 1419 if (ps->ps_firstblk > (*tail)->ps_lastblk) { 1420 /* Insert at tail */ 1421 ps->ps_ovrlap_prev = (*tail); 1422 ps->ps_ovrlap_next = &un->un_ovrlap_chn; 1423 (*tail)->ps_ovrlap_next = ps; 1424 *tail = ps; 1425 ps->ps_flags |= MD_MPS_ON_OVERLAP; 1426 mutex_exit(&un->un_ovrlap_chn_mx); 1427 return; 1428 } 1429 if (ps->ps_firstblk > ps1->ps_lastblk) 1430 continue; 1431 if (ps->ps_lastblk < ps1->ps_firstblk) { 1432 /* Insert into list at current 'ps1' position */ 1433 ps->ps_ovrlap_next = ps1; 1434 ps->ps_ovrlap_prev = ps1->ps_ovrlap_prev; 1435 ps1->ps_ovrlap_prev->ps_ovrlap_next = ps; 1436 ps1->ps_ovrlap_prev = ps; 1437 ps->ps_flags |= MD_MPS_ON_OVERLAP; 1438 mutex_exit(&un->un_ovrlap_chn_mx); 1439 return; 1440 } 1441 break; 1442 } 1443 if (ps1 != NULL) { 1444 un->un_ovrlap_chn_flg = 1; 1445 cv_wait(&un->un_ovrlap_chn_cv, &un->un_ovrlap_chn_mx); 1446 /* 1447 * Now ps1 refers to the old insertion point and we 1448 * have to check the whole chain to see if we're still 1449 * overlapping any other i/o. 1450 */ 1451 } 1452 } 1453 1454 /* 1455 * Only get here if we had one overlapping i/o on the list and that 1456 * has now completed. 
	 * In this case the list is empty so we insert <ps>
	 * at the head of the chain.
	 */
	ASSERT(*head == NULL);
	*tail = *head = ps;
	ps->ps_ovrlap_next = ps->ps_ovrlap_prev = &un->un_ovrlap_chn;
	ps->ps_flags |= MD_MPS_ON_OVERLAP;
	mutex_exit(&un->un_ovrlap_chn_mx);
}

/*
 * This function is called from mirror_done to check whether any pages have
 * been modified while a mirrored write was in progress. Returns 0 if
 * all pages associated with bp are clean, 1 otherwise.
 */
static int
any_pages_dirty(struct buf *bp)
{
	int	rval;

	rval = biomodified(bp);
	if (rval == -1)
		rval = 0;

	return (rval);
}

#define	MAX_EXTRAS	10

void
mirror_commit(
	mm_unit_t	*un,
	int		smmask,
	mddb_recid_t	*extras
)
{
	mm_submirror_t		*sm;
	md_unit_t		*su;
	int			i;

	/* 2=mirror,null id */
	mddb_recid_t		recids[NMIRROR+2+MAX_EXTRAS];

	int			ri = 0;

	if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)
		return;

	/* Add two, this includes the mirror unit and the null recid */
	if (extras != NULL) {
		int	nrecids = 0;
		while (extras[nrecids] != 0) {
			nrecids++;
		}
		ASSERT(nrecids <= MAX_EXTRAS);
	}

	if (un != NULL)
		recids[ri++] = un->c.un_record_id;
	for (i = 0; i < NMIRROR; i++) {
		if (!(smmask & SMI2BIT(i)))
			continue;
		sm = &un->un_sm[i];
		if (!SMS_IS(sm, SMS_INUSE))
			continue;
		if (md_getmajor(sm->sm_dev) != md_major)
			continue;
		su = MD_UNIT(md_getminor(sm->sm_dev));
		recids[ri++] = su->c.un_record_id;
	}

	if (extras != NULL)
		while (*extras != 0) {
			recids[ri++] = *extras;
			extras++;
		}

	if (ri == 0)
		return;
	recids[ri] = 0;

	/*
	 * Ok to hold ioctl lock across record commit to mddb as
	 * long as the record(s) being committed aren't resync records.
	 */
	mddb_commitrecs_wrapper(recids);
}


/*
 * This routine is used to set a bit in the writable_bm bitmap
 * for each submirror in a metamirror which
 * is writable. The current submirror index is reset to zero and the
 * number of writable submirrors is recorded in nunits. Both values
 * are saved in the parent save structure; this routine does not
 * return a value.
 */

static void
select_write_units(struct mm_unit *un, md_mps_t *ps)
{

	int		i;
	unsigned	writable_bm = 0;
	unsigned	nunits = 0;

	for (i = 0; i < NMIRROR; i++) {
		if (SUBMIRROR_IS_WRITEABLE(un, i)) {
			/* set bit of all writable units */
			writable_bm |= SMI2BIT(i);
			nunits++;
		}
	}
	ps->ps_writable_sm = writable_bm;
	ps->ps_active_cnt = nunits;
	ps->ps_current_sm = 0;
}

static
unsigned
select_write_after_read_units(struct mm_unit *un, md_mps_t *ps)
{

	int		i;
	unsigned	writable_bm = 0;
	unsigned	nunits = 0;

	for (i = 0; i < NMIRROR; i++) {
		if (SUBMIRROR_IS_WRITEABLE(un, i) &&
		    un->un_sm[i].sm_flags & MD_SM_RESYNC_TARGET) {
			writable_bm |= SMI2BIT(i);
			nunits++;
		}
	}
	if ((writable_bm & ps->ps_allfrom_sm) != 0) {
		writable_bm &= ~ps->ps_allfrom_sm;
		nunits--;
	}
	ps->ps_writable_sm = writable_bm;
	ps->ps_active_cnt = nunits;
	ps->ps_current_sm = 0;
	return (nunits);
}

static md_dev64_t
select_read_unit(
	mm_unit_t	*un,
	diskaddr_t	blkno,
	u_longlong_t	reqcount,
	u_longlong_t	*cando,
	int		must_be_opened,
	md_m_shared_t	**shared,
	md_mcs_t	*cs)
{
	int			i;
	md_m_shared_t		*s;
	uint_t			lasterrcnt = 0;
	md_dev64_t		dev = 0;
	u_longlong_t		cnt;
	u_longlong_t		mincnt;
	mm_submirror_t		*sm;
	mm_submirror_ic_t	*smic;
	mdi_unit_t		*ui;

	mincnt = reqcount;
	for (i = 0; i < NMIRROR; i++) {
		if (!SUBMIRROR_IS_READABLE(un, i))
			continue;
		sm = &un->un_sm[i];
		smic = &un->un_smic[i];
		cnt = reqcount;

		/*
		 * If the current submirror is marked as inaccessible, do not
		 * try to access it.
		 */
		ui = MDI_UNIT(getminor(expldev(sm->sm_dev)));
		(void) md_unit_readerlock(ui);
		if (ui->ui_tstate & MD_INACCESSIBLE) {
			md_unit_readerexit(ui);
			continue;
		}
		md_unit_readerexit(ui);

		s = (md_m_shared_t *)(*(smic->sm_shared_by_blk))
		    (sm->sm_dev, sm, blkno, &cnt);

		if (must_be_opened && !(s->ms_flags & MDM_S_ISOPEN))
			continue;
		if (s->ms_state == CS_OKAY) {
			*cando = cnt;
			if (shared != NULL)
				*shared = s;

			if (un->un_sm[i].sm_flags & MD_SM_FAILFAST &&
			    cs != NULL) {
				cs->cs_buf.b_flags |= B_FAILFAST;
			}

			return (un->un_sm[i].sm_dev);
		}
		if (s->ms_state != CS_LAST_ERRED)
			continue;

		/* don't use B_FAILFAST since we're Last Erred */

		if (mincnt > cnt)
			mincnt = cnt;
		if (s->ms_lasterrcnt > lasterrcnt) {
			lasterrcnt = s->ms_lasterrcnt;
			if (shared != NULL)
				*shared = s;
			dev = un->un_sm[i].sm_dev;
		}
	}
	*cando = mincnt;
	return (dev);
}

/*
 * Given a 32-bit bitmap, this routine will return the bit number
 * of the nth bit set. The nth bit set is passed via the index integer.
 *
 * This routine is used to run through the writable submirror bitmap
 * and start all of the writes. The value returned is the
 * index of the appropriate submirror structure, in the un_sm
 * array for metamirrors.
1683 */ 1684 static int 1685 md_find_nth_unit(uint_t mask, int index) 1686 { 1687 int bit, nfound; 1688 1689 for (bit = -1, nfound = -1; nfound != index; bit++) { 1690 ASSERT(mask != 0); 1691 nfound += (mask & 1); 1692 mask >>= 1; 1693 } 1694 return (bit); 1695 } 1696 1697 static int 1698 fast_select_read_unit(md_mps_t *ps, md_mcs_t *cs) 1699 { 1700 mm_unit_t *un; 1701 buf_t *bp; 1702 int i; 1703 unsigned nunits = 0; 1704 int iunit; 1705 uint_t running_bm = 0; 1706 uint_t sm_index; 1707 1708 bp = &cs->cs_buf; 1709 un = ps->ps_un; 1710 1711 for (i = 0; i < NMIRROR; i++) { 1712 if (!SMS_BY_INDEX_IS(un, i, SMS_RUNNING)) 1713 continue; 1714 running_bm |= SMI2BIT(i); 1715 nunits++; 1716 } 1717 if (nunits == 0) 1718 return (1); 1719 1720 /* 1721 * For directed mirror read (DMR) we only use the specified side and 1722 * do not compute the source of the read. 1723 */ 1724 if (ps->ps_flags & MD_MPS_DMR) { 1725 sm_index = un->un_dmr_last_read; 1726 } else { 1727 /* Normal (non-DMR) operation */ 1728 switch (un->un_read_option) { 1729 case RD_GEOMETRY: 1730 iunit = (int)(bp->b_lblkno / 1731 howmany(un->c.un_total_blocks, nunits)); 1732 sm_index = md_find_nth_unit(running_bm, iunit); 1733 break; 1734 case RD_FIRST: 1735 sm_index = md_find_nth_unit(running_bm, 0); 1736 break; 1737 case RD_LOAD_BAL: 1738 /* this is intentional to fall into the default */ 1739 default: 1740 un->un_last_read = (un->un_last_read + 1) % nunits; 1741 sm_index = md_find_nth_unit(running_bm, 1742 un->un_last_read); 1743 break; 1744 } 1745 } 1746 bp->b_edev = md_dev64_to_dev(un->un_sm[sm_index].sm_dev); 1747 ps->ps_allfrom_sm = SMI2BIT(sm_index); 1748 1749 if (un->un_sm[sm_index].sm_flags & MD_SM_FAILFAST) { 1750 bp->b_flags |= B_FAILFAST; 1751 } 1752 1753 return (0); 1754 } 1755 1756 static 1757 int 1758 mirror_are_submirrors_available(mm_unit_t *un) 1759 { 1760 int i; 1761 for (i = 0; i < NMIRROR; i++) { 1762 md_dev64_t tmpdev = un->un_sm[i].sm_dev; 1763 1764 if ((!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) || 1765 md_getmajor(tmpdev) != md_major) 1766 continue; 1767 1768 if ((MD_MIN2SET(md_getminor(tmpdev)) >= md_nsets) || 1769 (MD_MIN2UNIT(md_getminor(tmpdev)) >= md_nunits)) 1770 return (0); 1771 1772 if (MDI_UNIT(md_getminor(tmpdev)) == NULL) 1773 return (0); 1774 } 1775 return (1); 1776 } 1777 1778 void 1779 build_submirror(mm_unit_t *un, int i, int snarfing) 1780 { 1781 struct mm_submirror *sm; 1782 struct mm_submirror_ic *smic; 1783 md_unit_t *su; 1784 set_t setno; 1785 1786 sm = &un->un_sm[i]; 1787 smic = &un->un_smic[i]; 1788 1789 sm->sm_flags = 0; /* sometime we may need to do more here */ 1790 1791 setno = MD_UN2SET(un); 1792 1793 if (!SMS_IS(sm, SMS_INUSE)) 1794 return; 1795 if (snarfing) { 1796 sm->sm_dev = md_getdevnum(setno, mddb_getsidenum(setno), 1797 sm->sm_key, MD_NOTRUST_DEVT); 1798 } else { 1799 if (md_getmajor(sm->sm_dev) == md_major) { 1800 su = MD_UNIT(md_getminor(sm->sm_dev)); 1801 un->c.un_flag |= (su->c.un_flag & MD_LABELED); 1802 /* submirror can no longer be soft partitioned */ 1803 MD_CAPAB(su) &= (~MD_CAN_SP); 1804 } 1805 } 1806 smic->sm_shared_by_blk = md_get_named_service(sm->sm_dev, 1807 0, "shared by blk", 0); 1808 smic->sm_shared_by_indx = md_get_named_service(sm->sm_dev, 1809 0, "shared by indx", 0); 1810 smic->sm_get_component_count = 1811 (int (*)())md_get_named_service(sm->sm_dev, 0, 1812 "get component count", 0); 1813 smic->sm_get_bcss = 1814 (int (*)())md_get_named_service(sm->sm_dev, 0, 1815 "get block count skip size", 0); 1816 sm->sm_state &= ~SMS_IGNORE; 1817 if (SMS_IS(sm, SMS_OFFLINE)) 
1818 MD_STATUS(un) |= MD_UN_OFFLINE_SM; 1819 md_set_parent(sm->sm_dev, MD_SID(un)); 1820 } 1821 1822 static void 1823 mirror_cleanup(mm_unit_t *un) 1824 { 1825 mddb_recid_t recid; 1826 int smi; 1827 sv_dev_t sv[NMIRROR]; 1828 int nsv = 0; 1829 1830 /* 1831 * If a MN diskset and this node is not the master, do 1832 * not delete any records on snarf of the mirror records. 1833 */ 1834 if (MD_MNSET_SETNO(MD_UN2SET(un)) && 1835 md_set[MD_UN2SET(un)].s_am_i_master == 0) { 1836 return; 1837 } 1838 1839 for (smi = 0; smi < NMIRROR; smi++) { 1840 if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) 1841 continue; 1842 sv[nsv].setno = MD_UN2SET(un); 1843 sv[nsv++].key = un->un_sm[smi].sm_key; 1844 } 1845 1846 recid = un->un_rr_dirty_recid; 1847 mddb_deleterec_wrapper(un->c.un_record_id); 1848 if (recid > 0) 1849 mddb_deleterec_wrapper(recid); 1850 1851 md_rem_names(sv, nsv); 1852 } 1853 1854 /* Return a -1 if optimized record unavailable and set should be released */ 1855 int 1856 mirror_build_incore(mm_unit_t *un, int snarfing) 1857 { 1858 int i; 1859 1860 if (MD_STATUS(un) & MD_UN_BEING_RESET) { 1861 mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCLEAN); 1862 return (1); 1863 } 1864 1865 if (mirror_are_submirrors_available(un) == 0) 1866 return (1); 1867 1868 if (MD_UNIT(MD_SID(un)) != NULL) 1869 return (0); 1870 1871 MD_STATUS(un) = 0; 1872 1873 /* pre-4.1 didn't define CAN_META_CHILD capability */ 1874 MD_CAPAB(un) = MD_CAN_META_CHILD | MD_CAN_PARENT | MD_CAN_SP; 1875 1876 un->un_ovrlap_chn_flg = 0; 1877 bzero(&un->un_ovrlap_chn, sizeof (un->un_ovrlap_chn)); 1878 1879 for (i = 0; i < NMIRROR; i++) 1880 build_submirror(un, i, snarfing); 1881 1882 if (unit_setup_resync(un, snarfing) != 0) { 1883 if (snarfing) { 1884 mddb_setrecprivate(un->c.un_record_id, MD_PRV_GOTIT); 1885 /* 1886 * If a MN set and set is not stale, then return -1 1887 * which will force the caller to unload the set. 1888 * The MN diskset nodes will return failure if 1889 * unit_setup_resync fails so that nodes won't 1890 * get out of sync. 1891 * 1892 * If set is STALE, the master node can't allocate 1893 * a resync record (if needed), but node needs to 1894 * join the set so that user can delete broken mddbs. 1895 * So, if set is STALE, just continue on. 1896 */ 1897 if (MD_MNSET_SETNO(MD_UN2SET(un)) && 1898 !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) { 1899 return (-1); 1900 } 1901 } else 1902 return (1); 1903 } 1904 1905 mutex_init(&un->un_ovrlap_chn_mx, NULL, MUTEX_DEFAULT, NULL); 1906 cv_init(&un->un_ovrlap_chn_cv, NULL, CV_DEFAULT, NULL); 1907 1908 un->un_suspend_wr_flag = 0; 1909 mutex_init(&un->un_suspend_wr_mx, NULL, MUTEX_DEFAULT, NULL); 1910 cv_init(&un->un_suspend_wr_cv, NULL, CV_DEFAULT, NULL); 1911 1912 /* 1913 * Allocate mutexes for mirror-owner and resync-owner changes. 1914 * All references to the owner message state field must be guarded 1915 * by this mutex. 1916 */ 1917 mutex_init(&un->un_owner_mx, NULL, MUTEX_DEFAULT, NULL); 1918 1919 /* 1920 * Allocate mutex and condvar for resync thread manipulation. These 1921 * will be used by mirror_resync_unit/mirror_ioctl_resync 1922 */ 1923 mutex_init(&un->un_rs_thread_mx, NULL, MUTEX_DEFAULT, NULL); 1924 cv_init(&un->un_rs_thread_cv, NULL, CV_DEFAULT, NULL); 1925 1926 /* 1927 * Allocate mutex and condvar for resync progress thread manipulation. 1928 * This allows resyncs to be continued across an intervening reboot. 
1929 */ 1930 mutex_init(&un->un_rs_progress_mx, NULL, MUTEX_DEFAULT, NULL); 1931 cv_init(&un->un_rs_progress_cv, NULL, CV_DEFAULT, NULL); 1932 1933 /* 1934 * Allocate mutex and condvar for Directed Mirror Reads (DMR). This 1935 * provides synchronization between a user-ioctl and the resulting 1936 * strategy() call that performs the read(). 1937 */ 1938 mutex_init(&un->un_dmr_mx, NULL, MUTEX_DEFAULT, NULL); 1939 cv_init(&un->un_dmr_cv, NULL, CV_DEFAULT, NULL); 1940 1941 MD_UNIT(MD_SID(un)) = un; 1942 return (0); 1943 } 1944 1945 1946 void 1947 reset_mirror(struct mm_unit *un, minor_t mnum, int removing) 1948 { 1949 mddb_recid_t recid, vtoc_id; 1950 size_t bitcnt; 1951 size_t shortcnt; 1952 int smi; 1953 sv_dev_t sv[NMIRROR]; 1954 int nsv = 0; 1955 uint_t bits = 0; 1956 minor_t selfid; 1957 md_unit_t *su; 1958 1959 md_destroy_unit_incore(mnum, &mirror_md_ops); 1960 1961 shortcnt = un->un_rrd_num * sizeof (short); 1962 bitcnt = howmany(un->un_rrd_num, NBBY); 1963 1964 if (un->un_outstanding_writes) 1965 kmem_free((caddr_t)un->un_outstanding_writes, shortcnt); 1966 if (un->un_goingclean_bm) 1967 kmem_free((caddr_t)un->un_goingclean_bm, bitcnt); 1968 if (un->un_goingdirty_bm) 1969 kmem_free((caddr_t)un->un_goingdirty_bm, bitcnt); 1970 if (un->un_resync_bm) 1971 kmem_free((caddr_t)un->un_resync_bm, bitcnt); 1972 1973 MD_UNIT(mnum) = NULL; 1974 1975 /* 1976 * Attempt release of its minor node 1977 */ 1978 (void) md_remove_minor_node(mnum); 1979 1980 if (!removing) 1981 return; 1982 1983 for (smi = 0; smi < NMIRROR; smi++) { 1984 if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) 1985 continue; 1986 /* reallow soft partitioning of submirror and reset parent */ 1987 su = MD_UNIT(md_getminor(un->un_sm[smi].sm_dev)); 1988 MD_CAPAB(su) |= MD_CAN_SP; 1989 md_reset_parent(un->un_sm[smi].sm_dev); 1990 reset_comp_states(&un->un_sm[smi], &un->un_smic[smi]); 1991 1992 sv[nsv].setno = MD_MIN2SET(mnum); 1993 sv[nsv++].key = un->un_sm[smi].sm_key; 1994 bits |= SMI2BIT(smi); 1995 } 1996 1997 MD_STATUS(un) |= MD_UN_BEING_RESET; 1998 recid = un->un_rr_dirty_recid; 1999 vtoc_id = un->c.un_vtoc_id; 2000 selfid = MD_SID(un); 2001 2002 mirror_commit(un, bits, 0); 2003 2004 /* Destroy all mutexes and condvars before returning. 
*/ 2005 mutex_destroy(&un->un_suspend_wr_mx); 2006 cv_destroy(&un->un_suspend_wr_cv); 2007 mutex_destroy(&un->un_ovrlap_chn_mx); 2008 cv_destroy(&un->un_ovrlap_chn_cv); 2009 mutex_destroy(&un->un_owner_mx); 2010 mutex_destroy(&un->un_rs_thread_mx); 2011 cv_destroy(&un->un_rs_thread_cv); 2012 mutex_destroy(&un->un_rs_progress_mx); 2013 cv_destroy(&un->un_rs_progress_cv); 2014 mutex_destroy(&un->un_dmr_mx); 2015 cv_destroy(&un->un_dmr_cv); 2016 2017 /* 2018 * Remove self from the namespace 2019 */ 2020 if (un->c.un_revision & MD_FN_META_DEV) { 2021 (void) md_rem_selfname(un->c.un_self_id); 2022 } 2023 2024 mddb_deleterec_wrapper(un->c.un_record_id); 2025 if (recid != 0) 2026 mddb_deleterec_wrapper(recid); 2027 2028 /* Remove the vtoc, if present */ 2029 if (vtoc_id) 2030 mddb_deleterec_wrapper(vtoc_id); 2031 2032 md_rem_names(sv, nsv); 2033 2034 SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE, 2035 MD_MIN2SET(selfid), selfid); 2036 } 2037 2038 int 2039 mirror_internal_open( 2040 minor_t mnum, 2041 int flag, 2042 int otyp, 2043 int md_oflags, 2044 IOLOCK *lockp /* can be NULL */ 2045 ) 2046 { 2047 mdi_unit_t *ui = MDI_UNIT(mnum); 2048 int err = 0; 2049 2050 tryagain: 2051 /* single thread */ 2052 if (lockp) { 2053 /* 2054 * If ioctl lock is held, use openclose_enter 2055 * routine that will set the ioctl flag when 2056 * grabbing the readerlock. 2057 */ 2058 (void) md_ioctl_openclose_enter(lockp, ui); 2059 } else { 2060 (void) md_unit_openclose_enter(ui); 2061 } 2062 2063 /* 2064 * The mirror_open_all_devs routine may end up sending a STATE_UPDATE 2065 * message in a MN diskset and this requires that the openclose 2066 * lock is dropped in order to send this message. So, another 2067 * flag (MD_UL_OPENINPROGRESS) is used to keep another thread from 2068 * attempting an open while this thread has an open in progress. 2069 * Call the *_lh version of the lock exit routines since the ui_mx 2070 * mutex must be held from checking for OPENINPROGRESS until 2071 * after the cv_wait call. 2072 */ 2073 mutex_enter(&ui->ui_mx); 2074 if (ui->ui_lock & MD_UL_OPENINPROGRESS) { 2075 if (lockp) { 2076 (void) md_ioctl_openclose_exit_lh(lockp); 2077 } else { 2078 md_unit_openclose_exit_lh(ui); 2079 } 2080 cv_wait(&ui->ui_cv, &ui->ui_mx); 2081 mutex_exit(&ui->ui_mx); 2082 goto tryagain; 2083 } 2084 2085 ui->ui_lock |= MD_UL_OPENINPROGRESS; 2086 mutex_exit(&ui->ui_mx); 2087 2088 /* open devices, if necessary */ 2089 if (! md_unit_isopen(ui) || (ui->ui_tstate & MD_INACCESSIBLE)) { 2090 if ((err = mirror_open_all_devs(mnum, md_oflags, lockp)) != 0) 2091 goto out; 2092 } 2093 2094 /* count open */ 2095 if ((err = md_unit_incopen(mnum, flag, otyp)) != 0) 2096 goto out; 2097 2098 /* unlock, return success */ 2099 out: 2100 mutex_enter(&ui->ui_mx); 2101 ui->ui_lock &= ~MD_UL_OPENINPROGRESS; 2102 mutex_exit(&ui->ui_mx); 2103 2104 if (lockp) { 2105 /* 2106 * If ioctl lock is held, use openclose_exit 2107 * routine that will clear the lockp reader flag. 2108 */ 2109 (void) md_ioctl_openclose_exit(lockp); 2110 } else { 2111 md_unit_openclose_exit(ui); 2112 } 2113 return (err); 2114 } 2115 2116 int 2117 mirror_internal_close( 2118 minor_t mnum, 2119 int otyp, 2120 int md_cflags, 2121 IOLOCK *lockp /* can be NULL */ 2122 ) 2123 { 2124 mdi_unit_t *ui = MDI_UNIT(mnum); 2125 mm_unit_t *un; 2126 int err = 0; 2127 2128 /* single thread */ 2129 if (lockp) { 2130 /* 2131 * If ioctl lock is held, use openclose_enter 2132 * routine that will set the ioctl flag when 2133 * grabbing the readerlock. 
2134 */ 2135 un = (mm_unit_t *)md_ioctl_openclose_enter(lockp, ui); 2136 } else { 2137 un = (mm_unit_t *)md_unit_openclose_enter(ui); 2138 } 2139 2140 /* count closed */ 2141 if ((err = md_unit_decopen(mnum, otyp)) != 0) 2142 goto out; 2143 2144 /* close devices, if necessary */ 2145 if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) { 2146 /* 2147 * Clean up dirty bitmap for this unit. Do this 2148 * before closing the underlying devices to avoid 2149 * race conditions with reset_mirror() as a 2150 * result of a 'metaset -r' command running in 2151 * parallel. This might cause deallocation of 2152 * dirty region bitmaps; with underlying metadevices 2153 * in place this can't happen. 2154 * Don't do this for a MN set with ABR set 2155 */ 2156 if (new_resync && !(MD_STATUS(un) & MD_UN_KEEP_DIRTY)) { 2157 if (!MD_MNSET_SETNO(MD_UN2SET(un)) || 2158 !(ui->ui_tstate & MD_ABR_CAP)) 2159 mirror_process_unit_resync(un); 2160 } 2161 (void) mirror_close_all_devs(un, md_cflags); 2162 2163 /* 2164 * For a MN set with transient capabilities (e.g. ABR/DMR) set, 2165 * clear these capabilities once the last open in the cluster 2166 * goes away. To do this we send a message to all nodes to see 2167 * if the device is open. 2168 */ 2169 if (MD_MNSET_SETNO(MD_UN2SET(un)) && 2170 (ui->ui_tstate & (MD_ABR_CAP|MD_DMR_CAP))) { 2171 if (lockp) { 2172 (void) md_ioctl_openclose_exit(lockp); 2173 } else { 2174 md_unit_openclose_exit(ui); 2175 } 2176 2177 /* 2178 * if we are in the context of an ioctl, drop the 2179 * ioctl lock. 2180 * Otherwise, no other locks should be held. 2181 */ 2182 if (lockp) { 2183 IOLOCK_RETURN_RELEASE(0, lockp); 2184 } 2185 2186 mdmn_clear_all_capabilities(mnum); 2187 2188 /* if dropped the lock previously, regain it */ 2189 if (lockp) { 2190 IOLOCK_RETURN_REACQUIRE(lockp); 2191 } 2192 return (0); 2193 } 2194 /* unlock and return success */ 2195 } 2196 out: 2197 /* Call whether lockp is NULL or not. */ 2198 if (lockp) { 2199 md_ioctl_openclose_exit(lockp); 2200 } else { 2201 md_unit_openclose_exit(ui); 2202 } 2203 return (err); 2204 } 2205 2206 /* 2207 * When a component has completed resyncing and is now ok, check if the 2208 * corresponding component in the other submirrors is in the Last Erred 2209 * state. If it is, we want to change that to the Erred state so we stop 2210 * using that component and start using this good component instead. 2211 * 2212 * This is called from set_sm_comp_state and recursively calls 2213 * set_sm_comp_state if it needs to change the Last Erred state.
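*
* For example (an illustrative sketch only, the numbers are made up):
* suppose component 2 of submirror 0 finishes its resync and is set to
* CS_OKAY, while component 2 of submirror 1 sits in CS_LAST_ERRED and
* mirror_other_sources() reports that an alternate source exists. The
* resulting call sequence is roughly:
*
*	set_sm_comp_state(un, 0, 2, CS_OKAY, extras, flags, lockp)
*	    -> reset_lasterred(un, 0, extras, flags, lockp)
*	        -> set_sm_comp_state(un, 1, 2, CS_ERRED, extras,
*	            flags, lockp)
*
* after which the Last Erred component is no longer used and a
* poke_hotspares request is issued for it.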
2214 */ 2215 static void 2216 reset_lasterred(mm_unit_t *un, int smi, mddb_recid_t *extras, uint_t flags, 2217 IOLOCK *lockp) 2218 { 2219 mm_submirror_t *sm; 2220 mm_submirror_ic_t *smic; 2221 int ci; 2222 int i; 2223 int compcnt; 2224 int changed = 0; 2225 2226 for (i = 0; i < NMIRROR; i++) { 2227 sm = &un->un_sm[i]; 2228 smic = &un->un_smic[i]; 2229 2230 if (!SMS_IS(sm, SMS_INUSE)) 2231 continue; 2232 2233 /* ignore the submirror that we just made ok */ 2234 if (i == smi) 2235 continue; 2236 2237 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un); 2238 for (ci = 0; ci < compcnt; ci++) { 2239 md_m_shared_t *shared; 2240 2241 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) 2242 (sm->sm_dev, sm, ci); 2243 2244 if ((shared->ms_state & CS_LAST_ERRED) && 2245 !mirror_other_sources(un, i, ci, 1)) { 2246 2247 set_sm_comp_state(un, i, ci, CS_ERRED, extras, 2248 flags, lockp); 2249 changed = 1; 2250 } 2251 } 2252 } 2253 2254 /* maybe there is a hotspare for this newly erred component */ 2255 if (changed) { 2256 set_t setno; 2257 2258 setno = MD_UN2SET(un); 2259 if (MD_MNSET_SETNO(setno)) { 2260 send_poke_hotspares(setno); 2261 } else { 2262 (void) poke_hotspares(); 2263 } 2264 } 2265 } 2266 2267 /* 2268 * set_sm_comp_state 2269 * 2270 * Set the state of a submirror component to the specified new state. 2271 * If the mirror is in a multi-node set, send messages to all nodes to 2272 * block all writes to the mirror and then update the state and release the 2273 * writes. These messages are only sent if MD_STATE_XMIT is set in flags. 2274 * MD_STATE_XMIT will be unset in 2 cases: 2275 * 1. When the state is changed to CS_RESYNC as this state change 2276 * will already have been updated on each node by the processing of the 2277 * distributed metasync command, hence no need to xmit. 2278 * 2. When the state is changed to CS_OKAY after a resync has completed. Again 2279 * the resync completion will already have been processed on each node by 2280 * the processing of the MD_MN_MSG_RESYNC_PHASE_DONE message for a component 2281 * resync, hence no need to xmit. 2282 * 2283 * In case we are called from the update of a watermark 2284 * (MD_STATE_WMUPDATE will then be set in ps->flags), this is due to 2285 * a metainit or similar. In this case the message that we send to propagate 2286 * the state change must not be a class1 message as that would deadlock with 2287 * the metainit command that is still being processed. 2288 * This we achieve by creating a class2 message MD_MN_MSG_STATE_UPDATE2 2289 * instead. This also makes the submessage generator create a class2 2290 * submessage rather than a class1 (which would also block). 2291 * 2292 * On entry, unit_writerlock is held 2293 * If MD_STATE_OCHELD is set in flags, then unit_openclose lock is 2294 * also held.
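*
* As an illustration only (this is not a call site in this file), a
* caller outside ioctl context that holds the unit writerlock and the
* unit openclose lock, and that needs the change propagated to the
* other nodes, would use:
*
*	set_sm_comp_state(un, smi, ci, CS_ERRED, extras,
*	    MD_STATE_XMIT | MD_STATE_OCHELD, (IOLOCK *)NULL);
*
* whereas the resync-completion path passes CS_OKAY without
* MD_STATE_XMIT, as described above.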
2295 */ 2296 void 2297 set_sm_comp_state( 2298 mm_unit_t *un, 2299 int smi, 2300 int ci, 2301 int newstate, 2302 mddb_recid_t *extras, 2303 uint_t flags, 2304 IOLOCK *lockp 2305 ) 2306 { 2307 mm_submirror_t *sm; 2308 mm_submirror_ic_t *smic; 2309 md_m_shared_t *shared; 2310 int origstate; 2311 void (*get_dev)(); 2312 ms_cd_info_t cd; 2313 char devname[MD_MAX_CTDLEN]; 2314 int err; 2315 set_t setno = MD_UN2SET(un); 2316 md_mn_msg_stch_t stchmsg; 2317 mdi_unit_t *ui = MDI_UNIT(MD_SID(un)); 2318 md_mn_kresult_t *kresult; 2319 int rval; 2320 uint_t msgflags; 2321 md_mn_msgtype_t msgtype; 2322 int save_lock = 0; 2323 mdi_unit_t *ui_sm; 2324 2325 sm = &un->un_sm[smi]; 2326 smic = &un->un_smic[smi]; 2327 2328 /* If we have a real error status then turn off MD_INACCESSIBLE. */ 2329 ui_sm = MDI_UNIT(getminor(md_dev64_to_dev(sm->sm_dev))); 2330 if (newstate & (CS_ERRED | CS_RESYNC | CS_LAST_ERRED) && 2331 ui_sm->ui_tstate & MD_INACCESSIBLE) { 2332 ui_sm->ui_tstate &= ~MD_INACCESSIBLE; 2333 } 2334 2335 shared = (md_m_shared_t *) 2336 (*(smic->sm_shared_by_indx))(sm->sm_dev, sm, ci); 2337 origstate = shared->ms_state; 2338 2339 /* 2340 * If the new state is an error and the old one wasn't, generate 2341 * a console message. We do this before we send the state to other 2342 * nodes in a MN set because the state change may change the component 2343 * name if a hotspare is allocated. 2344 */ 2345 if ((! (origstate & (CS_ERRED|CS_LAST_ERRED))) && 2346 (newstate & (CS_ERRED|CS_LAST_ERRED))) { 2347 2348 get_dev = 2349 (void (*)())md_get_named_service(sm->sm_dev, 0, 2350 "get device", 0); 2351 (void) (*get_dev)(sm->sm_dev, sm, ci, &cd); 2352 2353 err = md_getdevname(setno, mddb_getsidenum(setno), 0, 2354 cd.cd_dev, devname, sizeof (devname)); 2355 2356 if (err == ENOENT) { 2357 (void) md_devname(setno, cd.cd_dev, devname, 2358 sizeof (devname)); 2359 } 2360 2361 cmn_err(CE_WARN, "md: %s: %s needs maintenance", 2362 md_shortname(md_getminor(sm->sm_dev)), devname); 2363 2364 if (newstate & CS_LAST_ERRED) { 2365 cmn_err(CE_WARN, "md: %s: %s last erred", 2366 md_shortname(md_getminor(sm->sm_dev)), 2367 devname); 2368 2369 } else if (shared->ms_flags & MDM_S_ISOPEN) { 2370 /* 2371 * Close the broken device and clear the open flag on 2372 * it. Closing the device means the RCM framework will 2373 * be able to unconfigure the device if required. 2374 * 2375 * We have to check that the device is open, otherwise 2376 * the first open on it has resulted in the error that 2377 * is being processed and the actual cd.cd_dev will be 2378 * NODEV64. 2379 * 2380 * If this is a multi-node mirror, then the multinode 2381 * state checks following this code will cause the 2382 * slave nodes to close the mirror in the function 2383 * mirror_set_state(). 2384 */ 2385 md_layered_close(cd.cd_dev, MD_OFLG_NULL); 2386 shared->ms_flags &= ~MDM_S_ISOPEN; 2387 } 2388 2389 } else if ((origstate & CS_LAST_ERRED) && (newstate & CS_ERRED) && 2390 (shared->ms_flags & MDM_S_ISOPEN)) { 2391 /* 2392 * Similar to logic above except no log messages since we 2393 * are just transitioning from Last Erred to Erred. 
2394 */ 2395 get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0, 2396 "get device", 0); 2397 (void) (*get_dev)(sm->sm_dev, sm, ci, &cd); 2398 2399 md_layered_close(cd.cd_dev, MD_OFLG_NULL); 2400 shared->ms_flags &= ~MDM_S_ISOPEN; 2401 } 2402 2403 if ((MD_MNSET_SETNO(setno)) && (origstate != newstate) && 2404 (flags & MD_STATE_XMIT) && !(ui->ui_tstate & MD_ERR_PENDING)) { 2405 /* 2406 * For a multi-node mirror, send the state change to the 2407 * master, which broadcasts to all nodes, including this 2408 * one. Once the message is received, the state is set 2409 * in-core and the master commits the change to disk. 2410 * There is a case, comp_replace, where this function 2411 * can be called from within an ioctl and therefore in this 2412 * case, as the ioctl will already be called on each node, 2413 * there is no need to xmit the state change to the master for 2414 * distribution to the other nodes. MD_STATE_XMIT flag is used 2415 * to indicate whether a xmit is required. The mirror's 2416 * transient state is set to MD_ERR_PENDING to avoid sending 2417 * multiple messages. 2418 */ 2419 if (newstate & (CS_ERRED|CS_LAST_ERRED)) 2420 ui->ui_tstate |= MD_ERR_PENDING; 2421 2422 /* 2423 * Send a state update message to all nodes. This message 2424 * will generate 2 submessages, the first one to suspend 2425 * all writes to the mirror and the second to update the 2426 * state and resume writes. 2427 */ 2428 stchmsg.msg_stch_mnum = un->c.un_self_id; 2429 stchmsg.msg_stch_sm = smi; 2430 stchmsg.msg_stch_comp = ci; 2431 stchmsg.msg_stch_new_state = newstate; 2432 stchmsg.msg_stch_hs_id = shared->ms_hs_id; 2433 #ifdef DEBUG 2434 if (mirror_debug_flag) 2435 printf("send set state, %x, %x, %x, %x, %x\n", 2436 stchmsg.msg_stch_mnum, stchmsg.msg_stch_sm, 2437 stchmsg.msg_stch_comp, stchmsg.msg_stch_new_state, 2438 stchmsg.msg_stch_hs_id); 2439 #endif 2440 if (flags & MD_STATE_WMUPDATE) { 2441 msgtype = MD_MN_MSG_STATE_UPDATE2; 2442 /* 2443 * When coming from an update of watermarks, there 2444 * must already be a message logged that triggered 2445 * this action. So, no need to log this message, too. 2446 */ 2447 msgflags = MD_MSGF_NO_LOG; 2448 } else { 2449 msgtype = MD_MN_MSG_STATE_UPDATE; 2450 msgflags = MD_MSGF_DEFAULT_FLAGS; 2451 } 2452 2453 /* 2454 * If we are in the context of an ioctl, drop the ioctl lock. 2455 * lockp holds the list of locks held. 2456 * 2457 * Otherwise, increment the appropriate reacquire counters. 2458 * If openclose lock is *held, then must reacquire reader 2459 * lock before releasing the openclose lock. 2460 * Do not drop the ARRAY_WRITER lock as we may not be able 2461 * to reacquire it. 
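*
* In outline (this is only a restatement of the code below, not extra
* logic), the message send is bracketed as follows:
*
*	ioctl context:
*		IOLOCK_RETURN_RELEASE(0, lockp);
*		... mdmn_ksend_message(STATE UPDATE) ...
*		IOLOCK_RETURN_REACQUIRE(lockp);
*
*	non-ioctl context:
*		md_unit_writerexit(ui);
*		... mdmn_ksend_message(STATE UPDATE) ...
*		(void) md_unit_writerlock(ui);
*
* with the additional openclose/readerlock juggling when
* MD_STATE_OCHELD is set.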
2462 */ 2463 if (lockp) { 2464 if (lockp->l_flags & MD_ARRAY_WRITER) { 2465 save_lock = MD_ARRAY_WRITER; 2466 lockp->l_flags &= ~MD_ARRAY_WRITER; 2467 } else if (lockp->l_flags & MD_ARRAY_READER) { 2468 save_lock = MD_ARRAY_READER; 2469 lockp->l_flags &= ~MD_ARRAY_READER; 2470 } 2471 IOLOCK_RETURN_RELEASE(0, lockp); 2472 } else { 2473 if (flags & MD_STATE_OCHELD) { 2474 md_unit_writerexit(ui); 2475 (void) md_unit_readerlock(ui); 2476 md_unit_openclose_exit(ui); 2477 } else { 2478 md_unit_writerexit(ui); 2479 } 2480 } 2481 2482 kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); 2483 rval = mdmn_ksend_message(setno, 2484 msgtype, 2485 msgflags, 2486 (char *)&stchmsg, 2487 sizeof (stchmsg), 2488 kresult); 2489 2490 if (!MDMN_KSEND_MSG_OK(rval, kresult)) { 2491 mdmn_ksend_show_error(rval, kresult, "STATE UPDATE"); 2492 cmn_err(CE_PANIC, 2493 "ksend_message failure: STATE_UPDATE"); 2494 } 2495 kmem_free(kresult, sizeof (md_mn_kresult_t)); 2496 2497 /* if dropped the lock previously, regain it */ 2498 if (lockp) { 2499 IOLOCK_RETURN_REACQUIRE(lockp); 2500 lockp->l_flags |= save_lock; 2501 } else { 2502 /* 2503 * Reacquire dropped locks and update acquirecnts 2504 * appropriately. 2505 */ 2506 if (flags & MD_STATE_OCHELD) { 2507 /* 2508 * openclose also grabs readerlock. 2509 */ 2510 (void) md_unit_openclose_enter(ui); 2511 md_unit_readerexit(ui); 2512 (void) md_unit_writerlock(ui); 2513 } else { 2514 (void) md_unit_writerlock(ui); 2515 } 2516 } 2517 2518 ui->ui_tstate &= ~MD_ERR_PENDING; 2519 } else { 2520 shared->ms_state = newstate; 2521 uniqtime32(&shared->ms_timestamp); 2522 2523 if (newstate == CS_ERRED) 2524 shared->ms_flags |= MDM_S_NOWRITE; 2525 else 2526 shared->ms_flags &= ~MDM_S_NOWRITE; 2527 2528 shared->ms_flags &= ~MDM_S_IOERR; 2529 un->un_changecnt++; 2530 shared->ms_lasterrcnt = un->un_changecnt; 2531 2532 mirror_set_sm_state(sm, smic, SMS_RUNNING, 0); 2533 mirror_commit(un, SMI2BIT(smi), extras); 2534 } 2535 2536 if ((origstate & CS_RESYNC) && (newstate & CS_OKAY)) { 2537 /* 2538 * Resetting the Last Erred state will recursively call back 2539 * into this function (set_sm_comp_state) to update the state. 2540 */ 2541 reset_lasterred(un, smi, extras, flags, lockp); 2542 } 2543 } 2544 2545 static int 2546 find_another_logical( 2547 mm_unit_t *un, 2548 mm_submirror_t *esm, 2549 diskaddr_t blk, 2550 u_longlong_t cnt, 2551 int must_be_open, 2552 int state, 2553 int err_cnt) 2554 { 2555 u_longlong_t cando; 2556 md_dev64_t dev; 2557 md_m_shared_t *s; 2558 2559 esm->sm_state |= SMS_IGNORE; 2560 while (cnt != 0) { 2561 u_longlong_t mcnt; 2562 2563 mcnt = MIN(cnt, lbtodb(1024 * 1024 * 1024)); /* 1 Gig Blks */ 2564 2565 dev = select_read_unit(un, blk, mcnt, &cando, must_be_open, &s, 2566 NULL); 2567 if (dev == (md_dev64_t)0) 2568 break; 2569 2570 if ((state == CS_LAST_ERRED) && 2571 (s->ms_state == CS_LAST_ERRED) && 2572 (err_cnt > s->ms_lasterrcnt)) 2573 break; 2574 2575 cnt -= cando; 2576 blk += cando; 2577 } 2578 esm->sm_state &= ~SMS_IGNORE; 2579 return (cnt != 0); 2580 } 2581 2582 int 2583 mirror_other_sources(mm_unit_t *un, int smi, int ci, int must_be_open) 2584 { 2585 mm_submirror_t *sm; 2586 mm_submirror_ic_t *smic; 2587 size_t count; 2588 diskaddr_t block; 2589 u_longlong_t skip; 2590 u_longlong_t size; 2591 md_dev64_t dev; 2592 int cnt; 2593 md_m_shared_t *s; 2594 int not_found; 2595 2596 sm = &un->un_sm[smi]; 2597 smic = &un->un_smic[smi]; 2598 dev = sm->sm_dev; 2599 2600 /* 2601 * Make sure every component of the submirror 2602 * has other sources. 
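*
* For example, a caller that wants to know whether submirror smi as a
* whole still has alternate sources can pass a negative component
* index (an illustrative usage; the callers in this file pass a real
* component index):
*
*	if (mirror_other_sources(un, smi, -1, 0) == 1)
*		... at least one component has no other source ...
*
* in which case the code below simply recurses over every component of
* the submirror and returns 1 on the first component that has no
* alternative source.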
2603 */ 2604 if (ci < 0) { 2605 /* Find the highest lasterrcnt */ 2606 cnt = (*(smic->sm_get_component_count))(dev, sm); 2607 for (ci = 0; ci < cnt; ci++) { 2608 not_found = mirror_other_sources(un, smi, ci, 2609 must_be_open); 2610 if (not_found) 2611 return (1); 2612 } 2613 return (0); 2614 } 2615 2616 /* 2617 * Make sure this component has other sources 2618 */ 2619 (void) (*(smic->sm_get_bcss)) 2620 (dev, sm, ci, &block, &count, &skip, &size); 2621 2622 if (count == 0) 2623 return (1); 2624 2625 s = (md_m_shared_t *)(*(smic->sm_shared_by_indx))(dev, sm, ci); 2626 2627 while (count--) { 2628 if (block >= un->c.un_total_blocks) 2629 return (0); 2630 2631 if ((block + size) > un->c.un_total_blocks) 2632 size = un->c.un_total_blocks - block; 2633 2634 not_found = find_another_logical(un, sm, block, size, 2635 must_be_open, s->ms_state, s->ms_lasterrcnt); 2636 if (not_found) 2637 return (1); 2638 2639 block += size + skip; 2640 } 2641 return (0); 2642 } 2643 2644 static void 2645 finish_error(md_mps_t *ps) 2646 { 2647 struct buf *pb; 2648 mm_unit_t *un; 2649 mdi_unit_t *ui; 2650 uint_t new_str_flags; 2651 2652 pb = ps->ps_bp; 2653 un = ps->ps_un; 2654 ui = ps->ps_ui; 2655 2656 /* 2657 * Must flag any error to the resync originator if we're performing 2658 * a Write-after-Read. This corresponds to an i/o error on a resync 2659 * target device and in this case we ought to abort the resync as there 2660 * is nothing that can be done to recover from this without operator 2661 * intervention. If we don't set the B_ERROR flag we will continue 2662 * reading from the mirror but won't write to the target (as it will 2663 * have been placed into an errored state). 2664 * To handle the case of multiple components within a submirror we only 2665 * set the B_ERROR bit if explicitly requested to via MD_MPS_FLAG_ERROR. 2666 * The originator of the resync read will cause this bit to be set if 2667 * the underlying component count is one for a submirror resync. All 2668 * other resync types will have the flag set as there is no underlying 2669 * resync which can be performed on a contained metadevice for these 2670 * resync types (optimized or component). 2671 */ 2672 2673 if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ) { 2674 if (ps->ps_flags & MD_MPS_FLAG_ERROR) 2675 pb->b_flags |= B_ERROR; 2676 md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 2677 MPS_FREE(mirror_parent_cache, ps); 2678 md_unit_readerexit(ui); 2679 md_biodone(pb); 2680 return; 2681 } 2682 /* 2683 * Set the MD_IO_COUNTED flag as we are retrying the same I/O 2684 * operation therefore this I/O request has already been counted, 2685 * the I/O count variable will be decremented by mirror_done()'s 2686 * call to md_biodone(). 2687 */ 2688 if (ps->ps_changecnt != un->un_changecnt) { 2689 new_str_flags = MD_STR_NOTTOP | MD_IO_COUNTED; 2690 if (ps->ps_flags & MD_MPS_WOW) 2691 new_str_flags |= MD_STR_WOW; 2692 if (ps->ps_flags & MD_MPS_MAPPED) 2693 new_str_flags |= MD_STR_MAPPED; 2694 /* 2695 * If this I/O request was a read that was part of a resync, 2696 * set MD_STR_WAR for the retried read to ensure that the 2697 * resync write (i.e. 
write-after-read) will be performed 2698 */ 2699 if (ps->ps_flags & MD_MPS_RESYNC_READ) 2700 new_str_flags |= MD_STR_WAR; 2701 md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 2702 MPS_FREE(mirror_parent_cache, ps); 2703 md_unit_readerexit(ui); 2704 (void) md_mirror_strategy(pb, new_str_flags, NULL); 2705 return; 2706 } 2707 2708 pb->b_flags |= B_ERROR; 2709 md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 2710 MPS_FREE(mirror_parent_cache, ps); 2711 md_unit_readerexit(ui); 2712 md_biodone(pb); 2713 } 2714 2715 static void 2716 error_update_unit(md_mps_t *ps) 2717 { 2718 mm_unit_t *un; 2719 mdi_unit_t *ui; 2720 int smi; /* sub mirror index */ 2721 int ci; /* errored component */ 2722 set_t setno; 2723 uint_t flags; /* for set_sm_comp_state() */ 2724 uint_t hspflags; /* for check_comp_4_hotspares() */ 2725 2726 ui = ps->ps_ui; 2727 un = (mm_unit_t *)md_unit_writerlock(ui); 2728 setno = MD_UN2SET(un); 2729 2730 /* All of these updates have to be propagated in case of a MN set */ 2731 flags = MD_STATE_XMIT; 2732 hspflags = MD_HOTSPARE_XMIT; 2733 2734 /* special treatment if we are called during updating watermarks */ 2735 if (ps->ps_flags & MD_MPS_WMUPDATE) { 2736 flags |= MD_STATE_WMUPDATE; 2737 hspflags |= MD_HOTSPARE_WMUPDATE; 2738 } 2739 smi = 0; 2740 ci = 0; 2741 while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) { 2742 if (mirror_other_sources(un, smi, ci, 0) == 1) { 2743 2744 /* Never called from ioctl context, so (IOLOCK *)NULL */ 2745 set_sm_comp_state(un, smi, ci, CS_LAST_ERRED, 0, flags, 2746 (IOLOCK *)NULL); 2747 /* 2748 * For a MN set, the NOTIFY is done when the state 2749 * change is processed on each node 2750 */ 2751 if (!MD_MNSET_SETNO(MD_UN2SET(un))) { 2752 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED, 2753 SVM_TAG_METADEVICE, setno, MD_SID(un)); 2754 } 2755 continue; 2756 } 2757 /* Never called from ioctl context, so (IOLOCK *)NULL */ 2758 set_sm_comp_state(un, smi, ci, CS_ERRED, 0, flags, 2759 (IOLOCK *)NULL); 2760 /* 2761 * For a MN set, the NOTIFY is done when the state 2762 * change is processed on each node 2763 */ 2764 if (!MD_MNSET_SETNO(MD_UN2SET(un))) { 2765 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, 2766 SVM_TAG_METADEVICE, setno, MD_SID(un)); 2767 } 2768 smi = 0; 2769 ci = 0; 2770 } 2771 2772 md_unit_writerexit(ui); 2773 if (MD_MNSET_SETNO(setno)) { 2774 send_poke_hotspares(setno); 2775 } else { 2776 (void) poke_hotspares(); 2777 } 2778 (void) md_unit_readerlock(ui); 2779 2780 finish_error(ps); 2781 } 2782 2783 /* 2784 * When we have a B_FAILFAST IO error on a Last Erred component we need to 2785 * retry the IO without B_FAILFAST set so that we try to ensure that the 2786 * component "sees" each IO.
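*
* The retry is not issued from the biodone path directly: when
* mirror_done() below sees a failed B_FAILFAST buf whose submirror is
* (or is becoming) Last Erred, it hands the child structure to the
* done daemon, roughly:
*
*	daemon_request(&md_done_daemon, last_err_retry,
*	    (daemon_queue_t *)cs, REQ_OLD);
*
* so that the md_call_strategy() reissue runs in daemon context.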
2787 */ 2788 static void 2789 last_err_retry(md_mcs_t *cs) 2790 { 2791 struct buf *cb; 2792 md_mps_t *ps; 2793 uint_t flags; 2794 2795 cb = &cs->cs_buf; 2796 cb->b_flags &= ~B_FAILFAST; 2797 2798 /* if we're panicking just let this I/O error out */ 2799 if (panicstr) { 2800 (void) mirror_done(cb); 2801 return; 2802 } 2803 2804 /* reissue the I/O */ 2805 2806 ps = cs->cs_ps; 2807 2808 bioerror(cb, 0); 2809 2810 mutex_enter(&ps->ps_mx); 2811 2812 flags = MD_STR_NOTTOP; 2813 if (ps->ps_flags & MD_MPS_MAPPED) 2814 flags |= MD_STR_MAPPED; 2815 if (ps->ps_flags & MD_MPS_NOBLOCK) 2816 flags |= MD_NOBLOCK; 2817 2818 mutex_exit(&ps->ps_mx); 2819 2820 clear_retry_error(cb); 2821 2822 cmn_err(CE_NOTE, "!md: %s: Last Erred, retry I/O without B_FAILFAST", 2823 md_shortname(getminor(cb->b_edev))); 2824 2825 md_call_strategy(cb, flags, NULL); 2826 } 2827 2828 static void 2829 mirror_error(md_mps_t *ps) 2830 { 2831 int smi; /* sub mirror index */ 2832 int ci; /* errored component */ 2833 2834 if (panicstr) { 2835 finish_error(ps); 2836 return; 2837 } 2838 2839 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 2840 mirror_overlap_chain_remove(ps); 2841 2842 smi = 0; 2843 ci = 0; 2844 if (mirror_geterror(ps->ps_un, &smi, &ci, 0, 0) != 0) { 2845 md_unit_readerexit(ps->ps_ui); 2846 daemon_request(&md_mstr_daemon, error_update_unit, 2847 (daemon_queue_t *)ps, REQ_OLD); 2848 return; 2849 } 2850 2851 finish_error(ps); 2852 } 2853 2854 static int 2855 copy_write_done(struct buf *cb) 2856 { 2857 md_mps_t *ps; 2858 buf_t *pb; 2859 char *wowbuf; 2860 wowhdr_t *wowhdr; 2861 ssize_t wow_resid; 2862 2863 /* get wowbuf and its save structure */ 2864 wowbuf = cb->b_un.b_addr; 2865 wowhdr = WOWBUF_HDR(wowbuf); 2866 ps = wowhdr->wow_ps; 2867 pb = ps->ps_bp; 2868 2869 /* Save error information, then free cb */ 2870 if (cb->b_flags & B_ERROR) 2871 pb->b_flags |= B_ERROR; 2872 2873 if (cb->b_flags & B_REMAPPED) 2874 bp_mapout(cb); 2875 2876 freerbuf(cb); 2877 2878 /* update residual and continue if needed */ 2879 if ((pb->b_flags & B_ERROR) == 0) { 2880 wow_resid = pb->b_bcount - wowhdr->wow_offset; 2881 pb->b_resid = wow_resid; 2882 if (wow_resid > 0) { 2883 daemon_request(&md_mstr_daemon, copy_write_cont, 2884 (daemon_queue_t *)wowhdr, REQ_OLD); 2885 return (1); 2886 } 2887 } 2888 2889 /* Write is complete, release resources.
*/ 2890 kmem_cache_free(mirror_wowblk_cache, wowhdr); 2891 ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP)); 2892 md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 2893 MPS_FREE(mirror_parent_cache, ps); 2894 md_biodone(pb); 2895 return (0); 2896 } 2897 2898 static void 2899 copy_write_cont(wowhdr_t *wowhdr) 2900 { 2901 buf_t *pb; 2902 buf_t *cb; 2903 char *wowbuf; 2904 int wow_offset; 2905 size_t wow_resid; 2906 diskaddr_t wow_blkno; 2907 2908 wowbuf = WOWHDR_BUF(wowhdr); 2909 pb = wowhdr->wow_ps->ps_bp; 2910 2911 /* get data on current location */ 2912 wow_offset = wowhdr->wow_offset; 2913 wow_resid = pb->b_bcount - wow_offset; 2914 wow_blkno = pb->b_lblkno + lbtodb(wow_offset); 2915 2916 /* setup child buffer */ 2917 cb = getrbuf(KM_SLEEP); 2918 cb->b_flags = B_WRITE; 2919 cb->b_edev = pb->b_edev; 2920 cb->b_un.b_addr = wowbuf; /* change to point at WOWBUF */ 2921 cb->b_bufsize = md_wowbuf_size; /* change to wowbuf_size */ 2922 cb->b_iodone = copy_write_done; 2923 cb->b_bcount = MIN(md_wowbuf_size, wow_resid); 2924 cb->b_lblkno = wow_blkno; 2925 2926 /* move offset to next section */ 2927 wowhdr->wow_offset += cb->b_bcount; 2928 2929 /* copy and setup write for current section */ 2930 bcopy(&pb->b_un.b_addr[wow_offset], wowbuf, cb->b_bcount); 2931 2932 /* do it */ 2933 /* 2934 * Do not set the MD_IO_COUNTED flag as this is a new I/O request 2935 * that handles the WOW condition. The resultant increment on the 2936 * I/O count variable is cleared by copy_write_done()'s call to 2937 * md_biodone(). 2938 */ 2939 (void) md_mirror_strategy(cb, MD_STR_NOTTOP | MD_STR_WOW 2940 | MD_STR_MAPPED, NULL); 2941 } 2942 2943 static void 2944 md_mirror_copy_write(md_mps_t *ps) 2945 { 2946 wowhdr_t *wowhdr; 2947 2948 wowhdr = kmem_cache_alloc(mirror_wowblk_cache, MD_ALLOCFLAGS); 2949 mirror_wowblk_init(wowhdr); 2950 wowhdr->wow_ps = ps; 2951 wowhdr->wow_offset = 0; 2952 copy_write_cont(wowhdr); 2953 } 2954 2955 static void 2956 handle_wow(md_mps_t *ps) 2957 { 2958 buf_t *pb; 2959 2960 pb = ps->ps_bp; 2961 2962 bp_mapin(pb); 2963 2964 md_mirror_wow_cnt++; 2965 if (!(pb->b_flags & B_PHYS) && (md_mirror_wow_flg & WOW_LOGIT)) { 2966 cmn_err(CE_NOTE, 2967 "md: %s, blk %lld, cnt %ld: Write on write %d occurred", 2968 md_shortname(getminor(pb->b_edev)), 2969 (longlong_t)pb->b_lblkno, pb->b_bcount, md_mirror_wow_cnt); 2970 } 2971 2972 /* 2973 * Set the MD_IO_COUNTED flag as we are retrying the same I/O 2974 * operation therefore this I/O request has already been counted, 2975 * the I/O count variable will be decremented by mirror_done()'s 2976 * call to md_biodone(). 2977 */ 2978 if (md_mirror_wow_flg & WOW_NOCOPY) 2979 (void) md_mirror_strategy(pb, MD_STR_NOTTOP | MD_STR_WOW | 2980 MD_STR_MAPPED | MD_IO_COUNTED, ps); 2981 else 2982 md_mirror_copy_write(ps); 2983 } 2984 2985 /* 2986 * Return true if the specified submirror is either in the Last Erred 2987 * state or is transitioning into the Last Erred state. 
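*
* This is used by mirror_done() below to decide whether an errored
* B_FAILFAST buf should be handed to last_err_retry() (and so retried
* without B_FAILFAST), roughly:
*
*	if (submirror_is_lasterred(un, i))
*		daemon_request(&md_done_daemon, last_err_retry, ...);
*
* rather than being failed straight away.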
2988 */ 2989 static bool_t 2990 submirror_is_lasterred(mm_unit_t *un, int smi) 2991 { 2992 mm_submirror_t *sm; 2993 mm_submirror_ic_t *smic; 2994 md_m_shared_t *shared; 2995 int ci; 2996 int compcnt; 2997 2998 sm = &un->un_sm[smi]; 2999 smic = &un->un_smic[smi]; 3000 3001 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un); 3002 for (ci = 0; ci < compcnt; ci++) { 3003 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) 3004 (sm->sm_dev, sm, ci); 3005 3006 if (shared->ms_state == CS_LAST_ERRED) 3007 return (B_TRUE); 3008 3009 /* 3010 * It is not currently Last Erred, check if entering Last Erred. 3011 */ 3012 if ((shared->ms_flags & MDM_S_IOERR) && 3013 ((shared->ms_state == CS_OKAY) || 3014 (shared->ms_state == CS_RESYNC))) { 3015 if (mirror_other_sources(un, smi, ci, 0) == 1) 3016 return (B_TRUE); 3017 } 3018 } 3019 3020 return (B_FALSE); 3021 } 3022 3023 3024 static int 3025 mirror_done(struct buf *cb) 3026 { 3027 md_mps_t *ps; 3028 md_mcs_t *cs; 3029 3030 /*LINTED*/ 3031 cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off); 3032 ps = cs->cs_ps; 3033 3034 mutex_enter(&ps->ps_mx); 3035 3036 /* check if we need to retry an errored failfast I/O */ 3037 if (cb->b_flags & B_ERROR) { 3038 struct buf *pb = ps->ps_bp; 3039 3040 if (cb->b_flags & B_FAILFAST) { 3041 int i; 3042 mm_unit_t *un = ps->ps_un; 3043 3044 for (i = 0; i < NMIRROR; i++) { 3045 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) 3046 continue; 3047 3048 if (cb->b_edev == 3049 md_dev64_to_dev(un->un_sm[i].sm_dev)) { 3050 3051 /* 3052 * This is the submirror that had the 3053 * error. Check if it is Last Erred. 3054 */ 3055 if (submirror_is_lasterred(un, i)) { 3056 daemon_queue_t *dqp; 3057 3058 mutex_exit(&ps->ps_mx); 3059 dqp = (daemon_queue_t *)cs; 3060 dqp->dq_prev = NULL; 3061 dqp->dq_next = NULL; 3062 daemon_request(&md_done_daemon, 3063 last_err_retry, dqp, 3064 REQ_OLD); 3065 return (1); 3066 } 3067 break; 3068 } 3069 } 3070 } 3071 3072 /* continue to process the buf without doing a retry */ 3073 ps->ps_flags |= MD_MPS_ERROR; 3074 pb->b_error = cb->b_error; 3075 } 3076 3077 return (mirror_done_common(cb)); 3078 } 3079 3080 /* 3081 * Split from the original mirror_done function so we can handle bufs after a 3082 * retry. 3083 * ps->ps_mx is already held in the caller of this function and the cb error 3084 * has already been checked and handled in the caller. 3085 */ 3086 static int 3087 mirror_done_common(struct buf *cb) 3088 { 3089 struct buf *pb; 3090 mm_unit_t *un; 3091 mdi_unit_t *ui; 3092 md_mps_t *ps; 3093 md_mcs_t *cs; 3094 size_t end_rr, start_rr, current_rr; 3095 3096 /*LINTED*/ 3097 cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off); 3098 ps = cs->cs_ps; 3099 pb = ps->ps_bp; 3100 3101 if (cb->b_flags & B_REMAPPED) 3102 bp_mapout(cb); 3103 3104 ps->ps_frags--; 3105 if (ps->ps_frags != 0) { 3106 mutex_exit(&ps->ps_mx); 3107 kmem_cache_free(mirror_child_cache, cs); 3108 return (1); 3109 } 3110 un = ps->ps_un; 3111 ui = ps->ps_ui; 3112 3113 /* 3114 * Do not update outstanding_writes if we're running with ABR 3115 * set for this mirror or the write() was issued with MD_STR_ABR set. 3116 * Also a resync initiated write() has no outstanding_writes update 3117 * either. 
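*
* For the writes that do maintain the count, the decrement below undoes
* the per-region accounting taken when the write was issued; e.g. a
* write spanning ps_firstblk..ps_lastblk is mapped to a region index
* range with
*
*	BLK_TO_RR(start_rr, ps->ps_firstblk, un);
*	BLK_TO_RR(end_rr, ps->ps_lastblk, un);
*
* and every un_outstanding_writes[] entry in [start_rr, end_rr] is
* dropped by one, presumably matching an increment made when the
* region was marked dirty.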
3118 */ 3119 if (((cb->b_flags & B_READ) == 0) && 3120 (un->un_nsm >= 2) && 3121 (ps->ps_call == NULL) && 3122 !((ui->ui_tstate & MD_ABR_CAP) || (ps->ps_flags & MD_MPS_ABR)) && 3123 !(ps->ps_flags & MD_MPS_WRITE_AFTER_READ)) { 3124 BLK_TO_RR(end_rr, ps->ps_lastblk, un); 3125 BLK_TO_RR(start_rr, ps->ps_firstblk, un); 3126 mutex_enter(&un->un_resync_mx); 3127 for (current_rr = start_rr; current_rr <= end_rr; current_rr++) 3128 un->un_outstanding_writes[current_rr]--; 3129 mutex_exit(&un->un_resync_mx); 3130 } 3131 kmem_cache_free(mirror_child_cache, cs); 3132 mutex_exit(&ps->ps_mx); 3133 3134 if (ps->ps_call != NULL) { 3135 daemon_request(&md_done_daemon, ps->ps_call, 3136 (daemon_queue_t *)ps, REQ_OLD); 3137 return (1); 3138 } 3139 3140 if ((ps->ps_flags & MD_MPS_ERROR)) { 3141 daemon_request(&md_done_daemon, mirror_error, 3142 (daemon_queue_t *)ps, REQ_OLD); 3143 return (1); 3144 } 3145 3146 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 3147 mirror_overlap_chain_remove(ps); 3148 3149 /* 3150 * Handle Write-on-Write problem. 3151 * Skip In case of Raw and Direct I/O as they are 3152 * handled earlier. 3153 * 3154 */ 3155 if (!(md_mirror_wow_flg & WOW_DISABLE) && 3156 !(pb->b_flags & B_READ) && 3157 !(ps->ps_flags & MD_MPS_WOW) && 3158 !(pb->b_flags & B_PHYS) && 3159 any_pages_dirty(pb)) { 3160 md_unit_readerexit(ps->ps_ui); 3161 daemon_request(&md_mstr_daemon, handle_wow, 3162 (daemon_queue_t *)ps, REQ_OLD); 3163 return (1); 3164 } 3165 3166 md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 3167 MPS_FREE(mirror_parent_cache, ps); 3168 md_unit_readerexit(ui); 3169 md_biodone(pb); 3170 return (0); 3171 } 3172 3173 /* 3174 * Clear error state in submirror component if the retry worked after 3175 * a failfast error. 3176 */ 3177 static void 3178 clear_retry_error(struct buf *cb) 3179 { 3180 int smi; 3181 md_mcs_t *cs; 3182 mm_unit_t *un; 3183 mdi_unit_t *ui_sm; 3184 mm_submirror_t *sm; 3185 mm_submirror_ic_t *smic; 3186 u_longlong_t cnt; 3187 md_m_shared_t *shared; 3188 3189 /*LINTED*/ 3190 cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off); 3191 un = cs->cs_ps->ps_un; 3192 3193 for (smi = 0; smi < NMIRROR; smi++) { 3194 if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) 3195 continue; 3196 3197 if (cb->b_edev == md_dev64_to_dev(un->un_sm[smi].sm_dev)) { 3198 break; 3199 } 3200 } 3201 3202 if (smi >= NMIRROR) 3203 return; 3204 3205 sm = &un->un_sm[smi]; 3206 smic = &un->un_smic[smi]; 3207 cnt = cb->b_bcount; 3208 3209 ui_sm = MDI_UNIT(getminor(cb->b_edev)); 3210 (void) md_unit_writerlock(ui_sm); 3211 3212 shared = (md_m_shared_t *)(*(smic->sm_shared_by_blk))(sm->sm_dev, sm, 3213 cb->b_blkno, &cnt); 3214 3215 if (shared->ms_flags & MDM_S_IOERR) { 3216 shared->ms_flags &= ~MDM_S_IOERR; 3217 3218 } else { 3219 /* the I/O buf spans components and the first one is not erred */ 3220 int cnt; 3221 int i; 3222 3223 cnt = (*(smic->sm_get_component_count))(sm->sm_dev, un); 3224 for (i = 0; i < cnt; i++) { 3225 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx)) 3226 (sm->sm_dev, sm, i); 3227 3228 if (shared->ms_flags & MDM_S_IOERR && 3229 shared->ms_state == CS_OKAY) { 3230 3231 shared->ms_flags &= ~MDM_S_IOERR; 3232 break; 3233 } 3234 } 3235 } 3236 3237 md_unit_writerexit(ui_sm); 3238 } 3239 3240 static size_t 3241 mirror_map_read( 3242 md_mps_t *ps, 3243 md_mcs_t *cs, 3244 diskaddr_t blkno, 3245 u_longlong_t count 3246 ) 3247 { 3248 mm_unit_t *un; 3249 buf_t *bp; 3250 u_longlong_t cando; 3251 3252 bp = &cs->cs_buf; 3253 un = ps->ps_un; 3254 3255 bp->b_lblkno = blkno; 3256 if 
(fast_select_read_unit(ps, cs) == 0) { 3257 bp->b_bcount = ldbtob(count); 3258 return (0); 3259 } 3260 bp->b_edev = md_dev64_to_dev(select_read_unit(un, blkno, count, &cando, 3261 0, NULL, cs)); 3262 bp->b_bcount = ldbtob(cando); 3263 if (count != cando) 3264 return (cando); 3265 return (0); 3266 } 3267 3268 static void 3269 write_after_read(md_mps_t *ps) 3270 { 3271 struct buf *pb; 3272 int flags; 3273 3274 if (ps->ps_flags & MD_MPS_ERROR) { 3275 mirror_error(ps); 3276 return; 3277 } 3278 3279 pb = ps->ps_bp; 3280 md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 3281 ps->ps_call = NULL; 3282 ps->ps_flags |= MD_MPS_WRITE_AFTER_READ; 3283 flags = MD_STR_NOTTOP | MD_STR_WAR; 3284 if (ps->ps_flags & MD_MPS_MAPPED) 3285 flags |= MD_STR_MAPPED; 3286 if (ps->ps_flags & MD_MPS_NOBLOCK) 3287 flags |= MD_NOBLOCK; 3288 if (ps->ps_flags & MD_MPS_DIRTY_RD) 3289 flags |= MD_STR_DIRTY_RD; 3290 (void) mirror_write_strategy(pb, flags, ps); 3291 } 3292 3293 static void 3294 continue_serial(md_mps_t *ps) 3295 { 3296 md_mcs_t *cs; 3297 buf_t *cb; 3298 mm_unit_t *un; 3299 int flags; 3300 3301 un = ps->ps_un; 3302 cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS); 3303 mirror_child_init(cs); 3304 cb = &cs->cs_buf; 3305 ps->ps_call = NULL; 3306 ps->ps_frags = 1; 3307 (void) mirror_map_write(un, cs, ps, 0); 3308 flags = MD_STR_NOTTOP; 3309 if (ps->ps_flags & MD_MPS_MAPPED) 3310 flags |= MD_STR_MAPPED; 3311 md_call_strategy(cb, flags, NULL); 3312 } 3313 3314 static int 3315 mirror_map_write(mm_unit_t *un, md_mcs_t *cs, md_mps_t *ps, int war) 3316 { 3317 int i; 3318 dev_t dev; /* needed for bioclone, so not md_dev64_t */ 3319 buf_t *cb; 3320 buf_t *pb; 3321 diskaddr_t blkno; 3322 size_t bcount; 3323 off_t offset; 3324 3325 pb = ps->ps_bp; 3326 cb = &cs->cs_buf; 3327 cs->cs_ps = ps; 3328 3329 i = md_find_nth_unit(ps->ps_writable_sm, ps->ps_current_sm); 3330 3331 dev = md_dev64_to_dev(un->un_sm[i].sm_dev); 3332 3333 blkno = pb->b_lblkno; 3334 bcount = pb->b_bcount; 3335 offset = 0; 3336 if (war && (blkno == 0) && (un->c.un_flag & MD_LABELED)) { 3337 blkno = DK_LABEL_LOC + 1; 3338 /* 3339 * This handles the case where we're requesting 3340 * a write to block 0 on a label partition 3341 * and the request size was smaller than the 3342 * size of the label. If this is the case 3343 * then we'll return -1. Failure to do so will 3344 * either cause the calling thread to hang due to 3345 * an ssd bug, or worse if the bcount were allowed 3346 * to go negative (i.e. large). 3347 */ 3348 if (bcount <= DEV_BSIZE*(DK_LABEL_LOC + 1)) 3349 return (-1); 3350 bcount -= (DEV_BSIZE*(DK_LABEL_LOC + 1)); 3351 offset = (DEV_BSIZE*(DK_LABEL_LOC + 1)); 3352 } 3353 3354 cb = md_bioclone(pb, offset, bcount, dev, blkno, mirror_done, 3355 cb, KM_NOSLEEP); 3356 if (war) 3357 cb->b_flags = (cb->b_flags & ~B_READ) | B_WRITE; 3358 3359 /* 3360 * If the submirror is in the erred state, check if any component is 3361 * in the Last Erred state. If so, we don't want to use the B_FAILFAST 3362 * flag on the IO. 3363 * 3364 * Provide a fast path for the non-erred case (which should be the 3365 * normal case.
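*
* In summary (a restatement of the checks below, not extra logic), for
* a submirror with MD_SM_FAILFAST set:
*
*	submirror state		component states	B_FAILFAST used?
*	not SMS_COMP_ERRED	-			yes (fast path)
*	SMS_COMP_ERRED		none CS_LAST_ERRED	yes
*	SMS_COMP_ERRED		any CS_LAST_ERRED	no
*
* Submirrors without MD_SM_FAILFAST never get the flag.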
3366 */ 3367 if (un->un_sm[i].sm_flags & MD_SM_FAILFAST) { 3368 if (un->un_sm[i].sm_state & SMS_COMP_ERRED) { 3369 mm_submirror_t *sm; 3370 mm_submirror_ic_t *smic; 3371 int ci; 3372 int compcnt; 3373 3374 sm = &un->un_sm[i]; 3375 smic = &un->un_smic[i]; 3376 3377 compcnt = (*(smic->sm_get_component_count)) 3378 (sm->sm_dev, un); 3379 for (ci = 0; ci < compcnt; ci++) { 3380 md_m_shared_t *shared; 3381 3382 shared = (md_m_shared_t *) 3383 (*(smic->sm_shared_by_indx))(sm->sm_dev, 3384 sm, ci); 3385 3386 if (shared->ms_state == CS_LAST_ERRED) 3387 break; 3388 } 3389 if (ci >= compcnt) 3390 cb->b_flags |= B_FAILFAST; 3391 3392 } else { 3393 cb->b_flags |= B_FAILFAST; 3394 } 3395 } 3396 3397 ps->ps_current_sm++; 3398 if (ps->ps_current_sm != ps->ps_active_cnt) { 3399 if (un->un_write_option == WR_SERIAL) { 3400 ps->ps_call = continue_serial; 3401 return (0); 3402 } 3403 return (1); 3404 } 3405 return (0); 3406 } 3407 3408 /* 3409 * directed_read_done: 3410 * ------------------ 3411 * Completion routine called when a DMR request has been returned from the 3412 * underlying driver. Wake-up the original ioctl() and return the data to 3413 * the user. 3414 */ 3415 static void 3416 directed_read_done(md_mps_t *ps) 3417 { 3418 mm_unit_t *un; 3419 mdi_unit_t *ui; 3420 3421 un = ps->ps_un; 3422 ui = ps->ps_ui; 3423 3424 md_unit_readerexit(ui); 3425 md_kstat_done(ui, ps->ps_bp, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)); 3426 ps->ps_call = NULL; 3427 3428 mutex_enter(&un->un_dmr_mx); 3429 cv_signal(&un->un_dmr_cv); 3430 mutex_exit(&un->un_dmr_mx); 3431 3432 /* release the parent structure */ 3433 kmem_cache_free(mirror_parent_cache, ps); 3434 } 3435 3436 /* 3437 * daemon_io: 3438 * ------------ 3439 * Called to issue a mirror_write_strategy() or mirror_read_strategy 3440 * call from a blockable context. NOTE: no mutex can be held on entry to this 3441 * routine 3442 */ 3443 static void 3444 daemon_io(daemon_queue_t *dq) 3445 { 3446 md_mps_t *ps = (md_mps_t *)dq; 3447 int flag = MD_STR_NOTTOP; 3448 buf_t *pb = ps->ps_bp; 3449 3450 if (ps->ps_flags & MD_MPS_MAPPED) 3451 flag |= MD_STR_MAPPED; 3452 if (ps->ps_flags & MD_MPS_WOW) 3453 flag |= MD_STR_WOW; 3454 if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ) 3455 flag |= MD_STR_WAR; 3456 if (ps->ps_flags & MD_MPS_ABR) 3457 flag |= MD_STR_ABR; 3458 3459 /* 3460 * If this is a resync read, ie MD_STR_DIRTY_RD not set, set 3461 * MD_STR_WAR before calling mirror_read_strategy 3462 */ 3463 if (pb->b_flags & B_READ) { 3464 if (!(ps->ps_flags & MD_MPS_DIRTY_RD)) 3465 flag |= MD_STR_WAR; 3466 mirror_read_strategy(pb, flag, ps); 3467 } else 3468 mirror_write_strategy(pb, flag, ps); 3469 } 3470 3471 /* 3472 * update_resync: 3473 * ------------- 3474 * Called to update the in-core version of the resync record with the latest 3475 * version that was committed to disk when the previous mirror owner 3476 * relinquished ownership. This call is likely to block as we must hold-off 3477 * any current resync processing that may be occurring. 3478 * On completion of the resync record update we issue the mirror_write_strategy 3479 * call to complete the i/o that first started this sequence. To remove a race 3480 * condition between a new write() request which is submitted and the resync 3481 * record update we acquire the writerlock. This will hold off all i/o to the 3482 * mirror until the resync update has completed. 
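*
* In the multi-owner write path this routine sits in the middle of the
* following chain (sketch only), entered once ownership has been
* granted:
*
*	become_owner()
*	    -> daemon_request(..., update_resync, ...)
*	        -> update_resync()
*	            -> daemon_io()
*	                -> mirror_write_strategy()
*	                   (or mirror_read_strategy() for a resync read)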
3483 * NOTE: no mutex can be held on entry to this routine 3484 */ 3485 static void 3486 update_resync(daemon_queue_t *dq) 3487 { 3488 md_mps_t *ps = (md_mps_t *)dq; 3489 buf_t *pb = ps->ps_bp; 3490 mdi_unit_t *ui = ps->ps_ui; 3491 mm_unit_t *un; 3492 set_t setno; 3493 int restart_resync; 3494 3495 un = md_unit_writerlock(ui); 3496 ps->ps_un = un; 3497 setno = MD_MIN2SET(getminor(pb->b_edev)); 3498 if (mddb_reread_rr(setno, un->un_rr_dirty_recid) == 0) { 3499 /* 3500 * Synchronize our in-core view of what regions need to be 3501 * resync'd with the on-disk version. 3502 */ 3503 mutex_enter(&un->un_rrp_inflight_mx); 3504 mirror_copy_rr(howmany(un->un_rrd_num, NBBY), un->un_resync_bm, 3505 un->un_dirty_bm); 3506 mutex_exit(&un->un_rrp_inflight_mx); 3507 3508 /* Region dirty map is now up to date */ 3509 } 3510 restart_resync = (un->un_rs_thread_flags & MD_RI_BLOCK_OWNER) ? 1 : 0; 3511 md_unit_writerexit(ui); 3512 3513 /* Restart the resync thread if it was previously blocked */ 3514 if (restart_resync) { 3515 mutex_enter(&un->un_rs_thread_mx); 3516 un->un_rs_thread_flags &= ~MD_RI_BLOCK_OWNER; 3517 cv_signal(&un->un_rs_thread_cv); 3518 mutex_exit(&un->un_rs_thread_mx); 3519 } 3520 /* Continue with original deferred i/o */ 3521 daemon_io(dq); 3522 } 3523 3524 /* 3525 * owner_timeout: 3526 * ------------- 3527 * Called if the original mdmn_ksend_message() failed and the request is to be 3528 * retried. Reattempt the original ownership change. 3529 * 3530 * NOTE: called at interrupt context (see timeout(9f)). 3531 */ 3532 static void 3533 owner_timeout(void *arg) 3534 { 3535 daemon_queue_t *dq = (daemon_queue_t *)arg; 3536 3537 daemon_request(&md_mirror_daemon, become_owner, dq, REQ_OLD); 3538 } 3539 3540 /* 3541 * become_owner: 3542 * ------------ 3543 * Called to issue RPC request to become the owner of the mirror 3544 * associated with this i/o request. We assume that the ownership request 3545 * is synchronous, so if it succeeds we will issue the request via 3546 * mirror_write_strategy(). 3547 * If multiple i/o's are outstanding we will be called from the mirror_daemon 3548 * service thread. 3549 * NOTE: no mutex should be held on entry to this routine. 3550 */ 3551 static void 3552 become_owner(daemon_queue_t *dq) 3553 { 3554 md_mps_t *ps = (md_mps_t *)dq; 3555 mm_unit_t *un = ps->ps_un; 3556 buf_t *pb = ps->ps_bp; 3557 set_t setno; 3558 md_mn_kresult_t *kres; 3559 int msg_flags = md_mirror_msg_flags; 3560 md_mps_t *ps1; 3561 3562 ASSERT(dq->dq_next == NULL && dq->dq_prev == NULL); 3563 3564 /* 3565 * If we're already the mirror owner we do not need to send a message 3566 * but can simply process the i/o request immediately. 3567 * If we've already sent the request to become owner we requeue the 3568 * request as we're waiting for the synchronous ownership message to 3569 * be processed. 3570 */ 3571 if (MD_MN_MIRROR_OWNER(un)) { 3572 /* 3573 * As the strategy() call will potentially block we need to 3574 * punt this to a separate thread and complete this request 3575 * as quickly as possible. Note: if we're a read request 3576 * this must be a resync, we cannot afford to be queued 3577 * behind any intervening i/o requests. In this case we put the 3578 * request on the md_mirror_rs_daemon queue. 
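*
* The queue chosen below routes the deferred request as follows (this
* simply summarizes the code that follows):
*
*	i/o type			queue
*	read (resync traffic)		md_mirror_rs_daemon
*	write				md_mirror_io_daemon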
3579 */ 3580 if (pb->b_flags & B_READ) { 3581 daemon_request(&md_mirror_rs_daemon, daemon_io, dq, 3582 REQ_OLD); 3583 } else { 3584 daemon_request(&md_mirror_io_daemon, daemon_io, dq, 3585 REQ_OLD); 3586 } 3587 } else { 3588 mutex_enter(&un->un_owner_mx); 3589 if ((un->un_owner_state & MM_MN_OWNER_SENT) == 0) { 3590 md_mn_req_owner_t *msg; 3591 int rval = 0; 3592 3593 /* 3594 * Check to see that we haven't exceeded the maximum 3595 * retry count. If we have we fail the i/o as the 3596 * comms mechanism has become wedged beyond recovery. 3597 */ 3598 if (dq->qlen++ >= MD_OWNER_RETRIES) { 3599 mutex_exit(&un->un_owner_mx); 3600 cmn_err(CE_WARN, 3601 "md_mirror: Request exhausted ownership " 3602 "retry limit of %d attempts", dq->qlen); 3603 pb->b_error = EIO; 3604 pb->b_flags |= B_ERROR; 3605 pb->b_resid = pb->b_bcount; 3606 kmem_cache_free(mirror_parent_cache, ps); 3607 md_biodone(pb); 3608 return; 3609 } 3610 3611 /* 3612 * Issue request to change ownership. The call is 3613 * synchronous so when it returns we can complete the 3614 * i/o (if successful), or enqueue it again so that 3615 * the operation will be retried. 3616 */ 3617 un->un_owner_state |= MM_MN_OWNER_SENT; 3618 mutex_exit(&un->un_owner_mx); 3619 3620 msg = kmem_zalloc(sizeof (md_mn_req_owner_t), KM_SLEEP); 3621 setno = MD_MIN2SET(getminor(pb->b_edev)); 3622 msg->mnum = MD_SID(un); 3623 msg->owner = md_mn_mynode_id; 3624 msg_flags |= MD_MSGF_NO_LOG; 3625 /* 3626 * If this IO is triggered by updating a watermark, 3627 * it might be issued by the creation of a softpartition 3628 * while the commd subsystem is suspended. 3629 * We don't want this message to block. 3630 */ 3631 if (ps->ps_flags & MD_MPS_WMUPDATE) { 3632 msg_flags |= MD_MSGF_OVERRIDE_SUSPEND; 3633 } 3634 3635 kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); 3636 rval = mdmn_ksend_message(setno, 3637 MD_MN_MSG_REQUIRE_OWNER, 3638 msg_flags, /* flags */ 3639 (char *)msg, 3640 sizeof (md_mn_req_owner_t), 3641 kres); 3642 3643 kmem_free(msg, sizeof (md_mn_req_owner_t)); 3644 3645 if (MDMN_KSEND_MSG_OK(rval, kres)) { 3646 dq->qlen = 0; 3647 /* 3648 * Successfully changed owner, reread the 3649 * resync record so that we have a valid idea of 3650 * any previously committed incomplete write()s. 3651 * NOTE: As we need to acquire the resync mutex 3652 * this may block, so we defer it to a separate 3653 * thread handler. This makes us (effectively) 3654 * non-blocking once the ownership message 3655 * handling has completed. 3656 */ 3657 mutex_enter(&un->un_owner_mx); 3658 if (un->un_owner_state & MM_MN_BECOME_OWNER) { 3659 un->un_mirror_owner = md_mn_mynode_id; 3660 /* Sets owner of un_rr_dirty record */ 3661 if (un->un_rr_dirty_recid) 3662 (void) mddb_setowner( 3663 un->un_rr_dirty_recid, 3664 md_mn_mynode_id); 3665 un->un_owner_state &= 3666 ~MM_MN_BECOME_OWNER; 3667 /* 3668 * Release the block on the current 3669 * resync region if it is blocked 3670 */ 3671 ps1 = un->un_rs_prev_ovrlap; 3672 if ((ps1 != NULL) && 3673 (ps1->ps_flags & MD_MPS_ON_OVERLAP)) 3674 mirror_overlap_chain_remove( 3675 ps1); 3676 mutex_exit(&un->un_owner_mx); 3677 3678 /* 3679 * If we're a read, this must be a 3680 * resync request, issue 3681 * the i/o request on the 3682 * md_mirror_rs_daemon queue. This is 3683 * to avoid a deadlock between the 3684 * resync_unit thread and 3685 * subsequent i/o requests that may 3686 * block on the resync region. 
3687 */ 3688 if (pb->b_flags & B_READ) { 3689 daemon_request( 3690 &md_mirror_rs_daemon, 3691 update_resync, dq, REQ_OLD); 3692 } else { 3693 daemon_request( 3694 &md_mirror_io_daemon, 3695 update_resync, dq, REQ_OLD); 3696 } 3697 kmem_free(kres, 3698 sizeof (md_mn_kresult_t)); 3699 return; 3700 } else { 3701 /* 3702 * Some other node has beaten us to 3703 * obtain ownership. We need to 3704 * reschedule our ownership request 3705 */ 3706 mutex_exit(&un->un_owner_mx); 3707 } 3708 } else { 3709 mdmn_ksend_show_error(rval, kres, 3710 "MD_MN_MSG_REQUIRE_OWNER"); 3711 /* 3712 * Message transport failure is handled by the 3713 * comms layer. If the ownership change request 3714 * does not succeed we need to flag the error to 3715 * the initiator of the i/o. This is handled by 3716 * the retry logic above. As the request failed 3717 * we do not know _who_ the owner of the mirror 3718 * currently is. We reset our idea of the owner 3719 * to None so that any further write()s will 3720 * attempt to become the owner again. This stops 3721 * multiple nodes writing to the same mirror 3722 * simultaneously. 3723 */ 3724 mutex_enter(&un->un_owner_mx); 3725 un->un_owner_state &= 3726 ~(MM_MN_OWNER_SENT|MM_MN_BECOME_OWNER); 3727 un->un_mirror_owner = MD_MN_MIRROR_UNOWNED; 3728 mutex_exit(&un->un_owner_mx); 3729 } 3730 kmem_free(kres, sizeof (md_mn_kresult_t)); 3731 } else 3732 mutex_exit(&un->un_owner_mx); 3733 3734 /* 3735 * Re-enqueue this request on the deferred i/o list. Delay the 3736 * request for md_mirror_owner_to usecs to stop thrashing. 3737 */ 3738 (void) timeout(owner_timeout, dq, 3739 drv_usectohz(md_mirror_owner_to)); 3740 } 3741 } 3742 3743 static void 3744 mirror_write_strategy(buf_t *pb, int flag, void *private) 3745 { 3746 md_mps_t *ps; 3747 md_mcs_t *cs; 3748 int more; 3749 mm_unit_t *un; 3750 mdi_unit_t *ui; 3751 buf_t *cb; /* child buf pointer */ 3752 set_t setno; 3753 int rs_on_overlap = 0; 3754 3755 ui = MDI_UNIT(getminor(pb->b_edev)); 3756 un = (mm_unit_t *)MD_UNIT(getminor(pb->b_edev)); 3757 3758 3759 md_kstat_waitq_enter(ui); 3760 3761 /* 3762 * If a state change is in progress for this mirror in a MN set, 3763 * suspend all non-resync writes until the state change is complete. 3764 * The objective of this suspend is to ensure that it is not 3765 * possible for one node to read data from a submirror that another node 3766 * has not written to because of the state change. Therefore we 3767 * suspend all writes until the state change has been made. As it is 3768 * not possible to read from the target of a resync, there is no need 3769 * to suspend resync writes. 
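*
* The wait below is on un_suspend_wr_cv with un_suspend_wr_flag as the
* condition. The flag is expected to be raised by the first
* MD_MN_MSG_STATE_UPDATE submessage (suspend writes) and cleared, with
* the condvar signalled, by the second (update state and resume
* writes), as described above set_sm_comp_state(). The exact handler
* behaviour is only summarized here, not spelled out in this file.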
3770 */ 3771 3772 if (!(flag & MD_STR_WAR)) { 3773 mutex_enter(&un->un_suspend_wr_mx); 3774 while (un->un_suspend_wr_flag) { 3775 cv_wait(&un->un_suspend_wr_cv, &un->un_suspend_wr_mx); 3776 } 3777 mutex_exit(&un->un_suspend_wr_mx); 3778 (void) md_unit_readerlock(ui); 3779 } 3780 3781 if (!(flag & MD_STR_NOTTOP)) { 3782 if (md_checkbuf(ui, (md_unit_t *)un, pb)) { 3783 md_kstat_waitq_exit(ui); 3784 return; 3785 } 3786 } 3787 3788 setno = MD_MIN2SET(getminor(pb->b_edev)); 3789 3790 /* If an ABR write has been requested, set MD_STR_ABR flag */ 3791 if (MD_MNSET_SETNO(setno) && (pb->b_flags & B_ABRWRITE)) 3792 flag |= MD_STR_ABR; 3793 3794 if (private == NULL) { 3795 ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS); 3796 mirror_parent_init(ps); 3797 } else { 3798 ps = private; 3799 private = NULL; 3800 } 3801 if (flag & MD_STR_MAPPED) 3802 ps->ps_flags |= MD_MPS_MAPPED; 3803 3804 if (flag & MD_STR_WOW) 3805 ps->ps_flags |= MD_MPS_WOW; 3806 3807 if (flag & MD_STR_ABR) 3808 ps->ps_flags |= MD_MPS_ABR; 3809 3810 if (flag & MD_STR_WMUPDATE) 3811 ps->ps_flags |= MD_MPS_WMUPDATE; 3812 3813 /* 3814 * Save essential information from the original buffhdr 3815 * in the md_save structure. 3816 */ 3817 ps->ps_un = un; 3818 ps->ps_ui = ui; 3819 ps->ps_bp = pb; 3820 ps->ps_addr = pb->b_un.b_addr; 3821 ps->ps_firstblk = pb->b_lblkno; 3822 ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1; 3823 ps->ps_changecnt = un->un_changecnt; 3824 3825 /* 3826 * If not MN owner and this is an ABR write, make sure the current 3827 * resync region is on the overlaps chain 3828 */ 3829 mutex_enter(&un->un_owner_mx); 3830 if (MD_MNSET_SETNO(setno) && (!(MD_MN_MIRROR_OWNER(un))) && 3831 ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) { 3832 md_mps_t *ps1; 3833 /* Block the current resync region, if not already blocked */ 3834 ps1 = un->un_rs_prev_ovrlap; 3835 3836 if ((ps1 != NULL) && ((ps1->ps_firstblk != 0) || 3837 (ps1->ps_lastblk != 0))) { 3838 /* Drop locks to avoid deadlock */ 3839 mutex_exit(&un->un_owner_mx); 3840 md_unit_readerexit(ui); 3841 wait_for_overlaps(ps1, MD_OVERLAP_ALLOW_REPEAT); 3842 rs_on_overlap = 1; 3843 (void) md_unit_readerlock(ui); 3844 mutex_enter(&un->un_owner_mx); 3845 /* 3846 * Check to see if we have obtained ownership 3847 * while waiting for overlaps. If we have, remove 3848 * the resync_region entry from the overlap chain 3849 */ 3850 if (MD_MN_MIRROR_OWNER(un) && 3851 (ps1->ps_flags & MD_MPS_ON_OVERLAP)) { 3852 mirror_overlap_chain_remove(ps1); 3853 rs_on_overlap = 0; 3854 } 3855 } 3856 } 3857 mutex_exit(&un->un_owner_mx); 3858 3859 3860 /* 3861 * The following keeps a write-after-read from writing to the 3862 * source in the case where it all came from one place 3863 */ 3864 if (flag & MD_STR_WAR) { 3865 int abort_write = 0; 3866 /* 3867 * We are performing a write-after-read. This is either as a 3868 * result of a resync read or as a result of a read in a 3869 * dirty resync region when the optimized resync is not 3870 * complete.
If in a MN set and this is a resync generated i/o, 3871 * and the current block is not in the current 3872 * resync region, terminate the write as another node must have 3873 * completed this resync region 3874 */ 3875 if ((MD_MNSET_SETNO(MD_UN2SET(un))) && 3876 !(flag & MD_STR_DIRTY_RD)) { 3877 if (!IN_RESYNC_REGION(un, ps)) 3878 abort_write = 1; 3879 } 3880 if ((select_write_after_read_units(un, ps) == 0) || 3881 (abort_write)) { 3882 #ifdef DEBUG 3883 if (mirror_debug_flag) 3884 printf("Abort resync write on %x, block %lld\n", 3885 MD_SID(un), ps->ps_firstblk); 3886 #endif 3887 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 3888 mirror_overlap_chain_remove(ps); 3889 kmem_cache_free(mirror_parent_cache, ps); 3890 md_kstat_waitq_exit(ui); 3891 md_unit_readerexit(ui); 3892 md_biodone(pb); 3893 return; 3894 } 3895 } else { 3896 select_write_units(un, ps); 3897 3898 /* Drop readerlock to avoid deadlock */ 3899 md_unit_readerexit(ui); 3900 wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT); 3901 un = md_unit_readerlock(ui); 3902 /* 3903 * For a MN set with an ABR write, if we are now the 3904 * owner and we have a resync region on the overlap 3905 * chain, remove the entry from overlaps and retry the write. 3906 */ 3907 3908 if (MD_MNSET_SETNO(setno) && 3909 ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) { 3910 mutex_enter(&un->un_owner_mx); 3911 if (((MD_MN_MIRROR_OWNER(un))) && rs_on_overlap) { 3912 mirror_overlap_chain_remove(ps); 3913 md_kstat_waitq_exit(ui); 3914 mutex_exit(&un->un_owner_mx); 3915 md_unit_readerexit(ui); 3916 daemon_request(&md_mirror_daemon, daemon_io, 3917 (daemon_queue_t *)ps, REQ_OLD); 3918 return; 3919 } 3920 mutex_exit(&un->un_owner_mx); 3921 } 3922 } 3923 3924 /* 3925 * For Multinode mirrors with a Resync Region (not ABR) we need to 3926 * become the mirror owner before continuing with the write(). For ABR 3927 * mirrors we check that we 'own' the resync if we're in 3928 * write-after-read mode. We do this _after_ ensuring that there are no 3929 * overlaps to ensure that once we know that we are the owner, the 3930 * readerlock will not be released until the write is complete. As a 3931 * change of ownership in a MN set requires the writerlock, this 3932 * ensures that ownership cannot be changed until the write is 3933 * complete. 3934 */ 3935 if (MD_MNSET_SETNO(setno) && (!((ui->ui_tstate & MD_ABR_CAP) || 3936 (flag & MD_STR_ABR)) || (flag & MD_STR_WAR))) { 3937 if (!MD_MN_MIRROR_OWNER(un)) { 3938 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 3939 mirror_overlap_chain_remove(ps); 3940 md_kstat_waitq_exit(ui); 3941 ASSERT(!(flag & MD_STR_WAR)); 3942 md_unit_readerexit(ui); 3943 daemon_request(&md_mirror_daemon, become_owner, 3944 (daemon_queue_t *)ps, REQ_OLD); 3945 return; 3946 } 3947 } 3948 3949 /* 3950 * Mark resync region if mirror has a Resync Region _and_ we are not 3951 * a resync initiated write(). Don't mark region if we're flagged as 3952 * an ABR write.
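*
* For instance, for a non-ABR, non-resync write the call below is
* effectively:
*
*	if (mirror_mark_resync_region(un, ps->ps_firstblk,
*	    ps->ps_lastblk)) {
*		... fail the parent buf with B_ERROR ...
*	}
*
* and the dirty-region state updated here is what the IS_KEEPDIRTY()
* check in mirror_read_strategy() later looks at, and what
* mirror_process_unit_resync() cleans up on the last close (see
* mirror_internal_close()).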
3953 */ 3954 if (!((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR)) && 3955 !(flag & MD_STR_WAR)) { 3956 if (mirror_mark_resync_region(un, ps->ps_firstblk, 3957 ps->ps_lastblk)) { 3958 pb->b_flags |= B_ERROR; 3959 pb->b_resid = pb->b_bcount; 3960 ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP)); 3961 kmem_cache_free(mirror_parent_cache, ps); 3962 md_kstat_waitq_exit(ui); 3963 md_unit_readerexit(ui); 3964 md_biodone(pb); 3965 return; 3966 } 3967 } 3968 3969 ps->ps_childbflags = pb->b_flags | B_WRITE; 3970 ps->ps_childbflags &= ~B_READ; 3971 if (flag & MD_STR_MAPPED) 3972 ps->ps_childbflags &= ~B_PAGEIO; 3973 3974 if (!(flag & MD_STR_NOTTOP) && panicstr) 3975 /* Disable WOW and don't free ps */ 3976 ps->ps_flags |= (MD_MPS_WOW|MD_MPS_DONTFREE); 3977 3978 md_kstat_waitq_to_runq(ui); 3979 3980 /* 3981 * Treat Raw and Direct I/O as Write-on-Write always 3982 */ 3983 3984 if (!(md_mirror_wow_flg & WOW_DISABLE) && 3985 (md_mirror_wow_flg & WOW_PHYS_ENABLE) && 3986 (pb->b_flags & B_PHYS) && 3987 !(ps->ps_flags & MD_MPS_WOW)) { 3988 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 3989 mirror_overlap_chain_remove(ps); 3990 md_unit_readerexit(ui); 3991 daemon_request(&md_mstr_daemon, handle_wow, 3992 (daemon_queue_t *)ps, REQ_OLD); 3993 return; 3994 } 3995 3996 ps->ps_frags = 1; 3997 do { 3998 cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS); 3999 mirror_child_init(cs); 4000 cb = &cs->cs_buf; 4001 more = mirror_map_write(un, cs, ps, (flag & MD_STR_WAR)); 4002 4003 /* 4004 * This handles the case where we're requesting 4005 * a write to block 0 on a label partition. (more < 0) 4006 * means that the request size was smaller than the 4007 * size of the label. If so this request is done. 4008 */ 4009 if (more < 0) { 4010 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 4011 mirror_overlap_chain_remove(ps); 4012 md_kstat_runq_exit(ui); 4013 kmem_cache_free(mirror_child_cache, cs); 4014 kmem_cache_free(mirror_parent_cache, ps); 4015 md_unit_readerexit(ui); 4016 md_biodone(pb); 4017 return; 4018 } 4019 if (more) { 4020 mutex_enter(&ps->ps_mx); 4021 ps->ps_frags++; 4022 mutex_exit(&ps->ps_mx); 4023 } 4024 md_call_strategy(cb, flag, private); 4025 } while (more); 4026 4027 if (!(flag & MD_STR_NOTTOP) && panicstr) { 4028 while (!(ps->ps_flags & MD_MPS_DONE)) { 4029 md_daemon(1, &md_done_daemon); 4030 drv_usecwait(10); 4031 } 4032 kmem_cache_free(mirror_parent_cache, ps); 4033 } 4034 } 4035 4036 static void 4037 mirror_read_strategy(buf_t *pb, int flag, void *private) 4038 { 4039 md_mps_t *ps; 4040 md_mcs_t *cs; 4041 size_t more; 4042 mm_unit_t *un; 4043 mdi_unit_t *ui; 4044 size_t current_count; 4045 diskaddr_t current_blkno; 4046 off_t current_offset; 4047 buf_t *cb; /* child buf pointer */ 4048 set_t setno; 4049 4050 ui = MDI_UNIT(getminor(pb->b_edev)); 4051 4052 md_kstat_waitq_enter(ui); 4053 4054 un = (mm_unit_t *)md_unit_readerlock(ui); 4055 4056 if (!(flag & MD_STR_NOTTOP)) { 4057 if (md_checkbuf(ui, (md_unit_t *)un, pb)) { 4058 md_kstat_waitq_exit(ui); 4059 return; 4060 } 4061 } 4062 4063 if (private == NULL) { 4064 ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS); 4065 mirror_parent_init(ps); 4066 } else { 4067 ps = private; 4068 private = NULL; 4069 } 4070 4071 if (flag & MD_STR_MAPPED) 4072 ps->ps_flags |= MD_MPS_MAPPED; 4073 if (flag & MD_NOBLOCK) 4074 ps->ps_flags |= MD_MPS_NOBLOCK; 4075 if (flag & MD_STR_WMUPDATE) 4076 ps->ps_flags |= MD_MPS_WMUPDATE; 4077 4078 /* 4079 * Check to see if this is a DMR driven read. 
If so we need to use the 4080 * specified side (in un->un_dmr_last_read) for the source of the data. 4081 */ 4082 if (flag & MD_STR_DMR) 4083 ps->ps_flags |= MD_MPS_DMR; 4084 4085 /* 4086 * Save essential information from the original buffhdr 4087 * in the md_save structure. 4088 */ 4089 ps->ps_un = un; 4090 ps->ps_ui = ui; 4091 ps->ps_bp = pb; 4092 ps->ps_addr = pb->b_un.b_addr; 4093 ps->ps_firstblk = pb->b_lblkno; 4094 ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1; 4095 ps->ps_changecnt = un->un_changecnt; 4096 4097 current_count = btodb(pb->b_bcount); 4098 current_blkno = pb->b_lblkno; 4099 current_offset = 0; 4100 4101 /* 4102 * If flag has MD_STR_WAR set this means that the read is issued by a 4103 * resync thread which may or may not be an optimised resync. 4104 * 4105 * If MD_UN_OPT_NOT_DONE is set this means that the optimized resync 4106 * code has not completed; either a resync has not started since snarf, 4107 * or there is an optimized resync in progress. 4108 * 4109 * We need to generate a write after this read in the following two 4110 * cases, 4111 * 4112 * 1. Any Resync-Generated read 4113 * 4114 * 2. Any read to a DIRTY REGION if there is an optimized resync 4115 * pending or in progress. 4116 * 4117 * The write after read is done in these cases to ensure that all sides 4118 * of the mirror are in sync with the read data and that it is not 4119 * possible for an application to read the same block multiple times 4120 * and get different data. 4121 * 4122 * This would be possible if the block was in a dirty region. 4123 * 4124 * If we're performing a directed read we don't write the data out as 4125 * the application is responsible for restoring the mirror to a known 4126 * state. 4127 */ 4128 if (((MD_STATUS(un) & MD_UN_OPT_NOT_DONE) || (flag & MD_STR_WAR)) && 4129 !(flag & MD_STR_DMR)) { 4130 size_t start_rr, i, end_rr; 4131 int region_dirty = 1; 4132 4133 /* 4134 * We enter here under three circumstances, 4135 * 4136 * MD_UN_OPT_NOT_DONE MD_STR_WAR 4137 * 0 1 4138 * 1 0 4139 * 1 1 4140 * 4141 * To be optimal we only care to explicitly check for dirty 4142 * regions in the second case since if MD_STR_WAR is set we 4143 * always do the write after read. 4144 */ 4145 if (!(flag & MD_STR_WAR)) { 4146 BLK_TO_RR(end_rr, ps->ps_lastblk, un); 4147 BLK_TO_RR(start_rr, ps->ps_firstblk, un); 4148 4149 for (i = start_rr; i <= end_rr; i++) 4150 if ((region_dirty = IS_KEEPDIRTY(i, un)) != 0) 4151 break; 4152 } 4153 4154 if ((region_dirty) && 4155 !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) { 4156 ps->ps_call = write_after_read; 4157 /* 4158 * Mark this as a RESYNC_READ in ps_flags. 4159 * This is used if the read fails during a 4160 * resync of a 3-way mirror to ensure that 4161 * the retried read to the remaining 4162 * good submirror has MD_STR_WAR set. This 4163 * is needed to ensure that the resync write 4164 * (write-after-read) takes place. 4165 */ 4166 ps->ps_flags |= MD_MPS_RESYNC_READ; 4167 4168 /* 4169 * If MD_STR_FLAG_ERR is set in the flags we 4170 * set MD_MPS_FLAG_ERROR so that an error on the resync 4171 * write (issued by write_after_read) will be flagged 4172 * to the biowait'ing resync thread. This allows us to 4173 * avoid issuing further resync requests to a device 4174 * that has had a write failure. 
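 *
 * As a rough sketch of the write-after-read flow (illustrative only;
 * the exact flag handling is done by write_after_read() elsewhere in
 * this file):
 *
 *    read of a dirty/resync region completes (mirror_done)
 *      -> ps_call == write_after_read
 *      -> mirror_write_strategy() is re-entered with MD_STR_WAR set
 *      -> the data just read is written back to the other submirrors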
4175 */ 4176 if (flag & MD_STR_FLAG_ERR) 4177 ps->ps_flags |= MD_MPS_FLAG_ERROR; 4178 4179 setno = MD_UN2SET(un); 4180 /* 4181 * Drop the readerlock to avoid 4182 * deadlock 4183 */ 4184 md_unit_readerexit(ui); 4185 wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT); 4186 un = md_unit_readerlock(ui); 4187 /* 4188 * Ensure that we are owner 4189 */ 4190 if (MD_MNSET_SETNO(setno)) { 4191 /* 4192 * For a non-resync read that requires a 4193 * write-after-read to be done, set a flag 4194 * in the parent structure, so that the 4195 * write_strategy routine can omit the 4196 * test that the write is still within the 4197 * resync region 4198 */ 4199 if (!(flag & MD_STR_WAR)) 4200 ps->ps_flags |= MD_MPS_DIRTY_RD; 4201 4202 /* 4203 * Before reading the buffer, see if 4204 * we are the owner 4205 */ 4206 if (!MD_MN_MIRROR_OWNER(un)) { 4207 ps->ps_call = NULL; 4208 mirror_overlap_chain_remove(ps); 4209 md_kstat_waitq_exit(ui); 4210 md_unit_readerexit(ui); 4211 daemon_request( 4212 &md_mirror_daemon, 4213 become_owner, 4214 (daemon_queue_t *)ps, 4215 REQ_OLD); 4216 return; 4217 } 4218 /* 4219 * For a resync read, check to see if I/O is 4220 * outside of the current resync region, or 4221 * the resync has finished. If so 4222 * just terminate the I/O 4223 */ 4224 if ((flag & MD_STR_WAR) && 4225 (!(un->c.un_status & MD_UN_WAR) || 4226 (!IN_RESYNC_REGION(un, ps)))) { 4227 #ifdef DEBUG 4228 if (mirror_debug_flag) 4229 printf("Abort resync read " 4230 "%x: %lld\n", 4231 MD_SID(un), 4232 ps->ps_firstblk); 4233 #endif 4234 mirror_overlap_chain_remove(ps); 4235 kmem_cache_free(mirror_parent_cache, 4236 ps); 4237 md_kstat_waitq_exit(ui); 4238 md_unit_readerexit(ui); 4239 md_biodone(pb); 4240 return; 4241 } 4242 } 4243 } 4244 } 4245 4246 if (flag & MD_STR_DMR) { 4247 ps->ps_call = directed_read_done; 4248 } 4249 4250 if (!(flag & MD_STR_NOTTOP) && panicstr) 4251 ps->ps_flags |= MD_MPS_DONTFREE; 4252 4253 md_kstat_waitq_to_runq(ui); 4254 4255 ps->ps_frags++; 4256 do { 4257 cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS); 4258 mirror_child_init(cs); 4259 cb = &cs->cs_buf; 4260 cs->cs_ps = ps; 4261 4262 cb = md_bioclone(pb, current_offset, current_count, NODEV, 4263 current_blkno, mirror_done, cb, KM_NOSLEEP); 4264 4265 more = mirror_map_read(ps, cs, current_blkno, 4266 (u_longlong_t)current_count); 4267 if (more) { 4268 mutex_enter(&ps->ps_mx); 4269 ps->ps_frags++; 4270 mutex_exit(&ps->ps_mx); 4271 } 4272 4273 /* 4274 * Do these calculations now, 4275 * so that we pickup a valid b_bcount from the chld_bp. 4276 */ 4277 current_count -= more; 4278 current_offset += cb->b_bcount; 4279 current_blkno += more; 4280 md_call_strategy(cb, flag, private); 4281 } while (more); 4282 4283 if (!(flag & MD_STR_NOTTOP) && panicstr) { 4284 while (!(ps->ps_flags & MD_MPS_DONE)) { 4285 md_daemon(1, &md_done_daemon); 4286 drv_usecwait(10); 4287 } 4288 kmem_cache_free(mirror_parent_cache, ps); 4289 } 4290 } 4291 4292 void 4293 md_mirror_strategy(buf_t *bp, int flag, void *private) 4294 { 4295 set_t setno = MD_MIN2SET(getminor(bp->b_edev)); 4296 4297 /* 4298 * When doing IO to a multi owner meta device, check if set is halted. 4299 * We do this check without the needed lock held, for performance 4300 * reasons. 4301 * If an IO just slips through while the set is locked via an 4302 * MD_MN_SUSPEND_SET, we don't care about it. 4303 * Only check for suspension if we are a top-level i/o request 4304 * (MD_STR_NOTTOP is cleared in 'flag'). 
4305 */ 4306 if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) == 4307 (MD_SET_HALTED | MD_SET_MNSET)) { 4308 if ((flag & MD_STR_NOTTOP) == 0) { 4309 mutex_enter(&md_mx); 4310 /* Here we loop until the set is no longer halted */ 4311 while (md_set[setno].s_status & MD_SET_HALTED) { 4312 cv_wait(&md_cv, &md_mx); 4313 } 4314 mutex_exit(&md_mx); 4315 } 4316 } 4317 4318 if ((flag & MD_IO_COUNTED) == 0) { 4319 if ((flag & MD_NOBLOCK) == 0) { 4320 if (md_inc_iocount(setno) != 0) { 4321 bp->b_flags |= B_ERROR; 4322 bp->b_error = ENXIO; 4323 bp->b_resid = bp->b_bcount; 4324 biodone(bp); 4325 return; 4326 } 4327 } else { 4328 md_inc_iocount_noblock(setno); 4329 } 4330 } 4331 4332 if (bp->b_flags & B_READ) 4333 mirror_read_strategy(bp, flag, private); 4334 else 4335 mirror_write_strategy(bp, flag, private); 4336 } 4337 4338 /* 4339 * mirror_directed_read: 4340 * -------------------- 4341 * Entry-point for the DKIOCDMR ioctl. We issue a read to a specified sub-mirror 4342 * so that the application can determine what (if any) resync needs to be 4343 * performed. The data is copied out to the user-supplied buffer. 4344 * 4345 * Parameters: 4346 * mdev - dev_t for the mirror device 4347 * vdr - directed read parameters specifying location and submirror 4348 * to perform the read from 4349 * mode - used to ddi_copyout() any resulting data from the read 4350 * 4351 * Returns: 4352 * 0 success 4353 * !0 error code 4354 * EINVAL - invalid request format 4355 */ 4356 int 4357 mirror_directed_read(dev_t mdev, vol_directed_rd_t *vdr, int mode) 4358 { 4359 buf_t *bp; 4360 minor_t mnum = getminor(mdev); 4361 mdi_unit_t *ui = MDI_UNIT(mnum); 4362 mm_unit_t *un; 4363 mm_submirror_t *sm; 4364 char *sm_nm; 4365 uint_t next_side; 4366 void *kbuffer; 4367 4368 if (ui == NULL) 4369 return (ENXIO); 4370 4371 if (!(vdr->vdr_flags & DKV_DMR_NEXT_SIDE)) { 4372 return (EINVAL); 4373 } 4374 4375 /* Check for aligned block access. We disallow non-aligned requests. */ 4376 if (vdr->vdr_offset % DEV_BSIZE) { 4377 return (EINVAL); 4378 } 4379 4380 /* 4381 * Allocate kernel buffer for target of read(). If we had a reliable 4382 * (sorry functional) DDI this wouldn't be needed. 4383 */ 4384 kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP); 4385 if (kbuffer == NULL) { 4386 cmn_err(CE_WARN, "mirror_directed_read: couldn't allocate %lx" 4387 " bytes\n", vdr->vdr_nbytes); 4388 return (ENOMEM); 4389 } 4390 4391 bp = getrbuf(KM_SLEEP); 4392 4393 bp->b_un.b_addr = kbuffer; 4394 bp->b_flags = B_READ; 4395 bp->b_bcount = vdr->vdr_nbytes; 4396 bp->b_lblkno = lbtodb(vdr->vdr_offset); 4397 bp->b_edev = mdev; 4398 4399 un = md_unit_readerlock(ui); 4400 4401 /* 4402 * If DKV_SIDE_INIT is set we need to determine the first available 4403 * side to start reading from. If it isn't set we increment to the 4404 * next readable submirror. 4405 * If there are no readable submirrors we error out with DKV_DMR_ERROR. 4406 * Note: we check for a readable submirror on completion of the i/o so 4407 * we should _always_ have one available. If this becomes unavailable 4408 * we have missed the 'DKV_DMR_DONE' opportunity. This could happen if 4409 * a metadetach is made between the completion of one DKIOCDMR ioctl 4410 * and the start of the next (i.e. a sys-admin 'accident' occurred). 4411 * The chance of this is small, but not non-existent. 
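 *
 * As a worked example (illustrative only): on a mirror where only
 * sides 0 and 2 are readable (side 1 having been metadetached), the
 * first DKIOCDMR call (vdr_side == DKV_SIDE_INIT) reads side 0, the
 * second call reads side 2, and on completion of that second read no
 * further readable side exists, so DKV_DMR_DONE is set in vdr_flags.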
4412 */ 4413 if (vdr->vdr_side == DKV_SIDE_INIT) { 4414 next_side = 0; 4415 } else { 4416 next_side = vdr->vdr_side + 1; 4417 } 4418 while ((next_side < NMIRROR) && 4419 !SUBMIRROR_IS_READABLE(un, next_side)) 4420 next_side++; 4421 if (next_side >= NMIRROR) { 4422 vdr->vdr_flags |= DKV_DMR_ERROR; 4423 freerbuf(bp); 4424 vdr->vdr_bytesread = 0; 4425 md_unit_readerexit(ui); 4426 return (0); 4427 } 4428 4429 /* Set the side to read from */ 4430 un->un_dmr_last_read = next_side; 4431 4432 md_unit_readerexit(ui); 4433 4434 /* 4435 * Save timestamp for verification purposes. Can be read by debugger 4436 * to verify that this ioctl has been executed and to find the number 4437 * of DMR reads and the time of the last DMR read. 4438 */ 4439 uniqtime(&mirror_dmr_stats.dmr_timestamp); 4440 mirror_dmr_stats.dmr_count++; 4441 4442 /* Issue READ request and wait for completion */ 4443 mirror_read_strategy(bp, MD_STR_DMR|MD_NOBLOCK|MD_STR_NOTTOP, NULL); 4444 4445 mutex_enter(&un->un_dmr_mx); 4446 cv_wait(&un->un_dmr_cv, &un->un_dmr_mx); 4447 mutex_exit(&un->un_dmr_mx); 4448 4449 /* 4450 * Check to see if we encountered an error during the read. If so we 4451 * can make no guarantee about any possibly returned data. 4452 */ 4453 if ((bp->b_flags & B_ERROR) == 0) { 4454 vdr->vdr_flags &= ~DKV_DMR_ERROR; 4455 if (bp->b_resid) { 4456 vdr->vdr_flags |= DKV_DMR_SHORT; 4457 vdr->vdr_bytesread = vdr->vdr_nbytes - bp->b_resid; 4458 } else { 4459 vdr->vdr_flags |= DKV_DMR_SUCCESS; 4460 vdr->vdr_bytesread = vdr->vdr_nbytes; 4461 } 4462 /* Copy the data read back out to the user supplied buffer */ 4463 if (ddi_copyout(kbuffer, vdr->vdr_data, vdr->vdr_bytesread, 4464 mode)) { 4465 kmem_free(kbuffer, vdr->vdr_nbytes); 4466 return (EFAULT); 4467 } 4468 4469 } else { 4470 /* Error out with DKV_DMR_ERROR */ 4471 vdr->vdr_flags |= DKV_DMR_ERROR; 4472 vdr->vdr_flags &= ~(DKV_DMR_SUCCESS|DKV_DMR_SHORT|DKV_DMR_DONE); 4473 } 4474 /* 4475 * Update the DMR parameters with the side and name of submirror that 4476 * we have just read from (un->un_dmr_last_read) 4477 */ 4478 un = md_unit_readerlock(ui); 4479 4480 vdr->vdr_side = un->un_dmr_last_read; 4481 sm = &un->un_sm[un->un_dmr_last_read]; 4482 sm_nm = md_shortname(md_getminor(sm->sm_dev)); 4483 4484 (void) strncpy(vdr->vdr_side_name, sm_nm, sizeof (vdr->vdr_side_name)); 4485 4486 /* 4487 * Determine if we've completed the read cycle. This is true iff the 4488 * next computed submirror (side) equals or exceeds NMIRROR. We cannot 4489 * use un_nsm as we need to handle a sparse array of submirrors (which 4490 * can occur if a submirror is metadetached). 4491 */ 4492 next_side = un->un_dmr_last_read + 1; 4493 while ((next_side < NMIRROR) && 4494 !SUBMIRROR_IS_READABLE(un, next_side)) 4495 next_side++; 4496 if (next_side >= NMIRROR) { 4497 /* We've finished */ 4498 vdr->vdr_flags |= DKV_DMR_DONE; 4499 } 4500 4501 md_unit_readerexit(ui); 4502 freerbuf(bp); 4503 kmem_free(kbuffer, vdr->vdr_nbytes); 4504 4505 return (0); 4506 } 4507 4508 /* 4509 * mirror_resync_message: 4510 * --------------------- 4511 * Handle the multi-node resync messages that keep all nodes within a given 4512 * disk-set in sync with their view of a mirror's resync status. 
4513 * 4514 * The message types dealt with are: 4515 * MD_MN_MSG_RESYNC_STARTING - start a resync thread for a unit 4516 * MD_MN_MSG_RESYNC_NEXT - specified next region to be resynced 4517 * MD_MN_MSG_RESYNC_FINISH - stop the resync thread for a unit 4518 * MD_MN_MSG_RESYNC_PHASE_DONE - end of a resync phase, opt, submirror or comp 4519 * 4520 * Returns: 4521 * 0 Success 4522 * >0 Failure error number 4523 */ 4524 int 4525 mirror_resync_message(md_mn_rs_params_t *p, IOLOCK *lockp) 4526 { 4527 mdi_unit_t *ui; 4528 mm_unit_t *un; 4529 set_t setno; 4530 int is_ABR; 4531 int smi; 4532 int ci; 4533 sm_state_t state; 4534 int broke_out; 4535 mm_submirror_t *sm; 4536 mm_submirror_ic_t *smic; 4537 md_m_shared_t *shared; 4538 md_error_t mde = mdnullerror; 4539 md_mps_t *ps; 4540 int rs_active; 4541 4542 /* Check that the given device is part of a multi-node set */ 4543 setno = MD_MIN2SET(p->mnum); 4544 if (setno >= md_nsets) { 4545 return (ENXIO); 4546 } 4547 if (!MD_MNSET_SETNO(setno)) { 4548 return (EINVAL); 4549 } 4550 4551 if ((un = mirror_getun(p->mnum, &p->mde, NO_LOCK, NULL)) == NULL) 4552 return (EINVAL); 4553 if ((ui = MDI_UNIT(p->mnum)) == NULL) 4554 return (EINVAL); 4555 is_ABR = (ui->ui_tstate & MD_ABR_CAP); 4556 4557 /* Obtain the current resync status */ 4558 (void) md_ioctl_readerlock(lockp, ui); 4559 rs_active = (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ? 1 : 0; 4560 md_ioctl_readerexit(lockp); 4561 4562 switch ((md_mn_msgtype_t)p->msg_type) { 4563 case MD_MN_MSG_RESYNC_STARTING: 4564 /* Start the resync thread for the mirror */ 4565 (void) mirror_resync_unit(p->mnum, NULL, &p->mde, lockp); 4566 break; 4567 4568 case MD_MN_MSG_RESYNC_NEXT: 4569 /* 4570 * We have to release any previously marked overlap regions 4571 * so that i/o can resume. Then we need to block the region 4572 * from [rs_start..rs_start+rs_size) * so that no i/o is issued. 4573 * Update un_rs_resync_done and un_rs_resync_2_do. 4574 */ 4575 (void) md_ioctl_readerlock(lockp, ui); 4576 /* 4577 * Ignore the message if there is no active resync thread or 4578 * if it is for a resync type that we have already completed. 4579 * un_resync_completed is set to the last resync completed 4580 * when processing a PHASE_DONE message. 4581 */ 4582 if (!rs_active || (p->rs_type == un->un_resync_completed)) 4583 break; 4584 /* 4585 * If this message is for the same resync and is for an earlier 4586 * resync region, just ignore it. This can only occur if this 4587 * node has progressed on to the next resync region before 4588 * we receive this message. This can occur if the class for 4589 * this message is busy and the originator has to retry thus 4590 * allowing this node to move onto the next resync_region. 4591 */ 4592 if ((p->rs_type == un->un_rs_type) && 4593 (p->rs_start < un->un_resync_startbl)) 4594 break; 4595 ps = un->un_rs_prev_ovrlap; 4596 4597 /* Allocate previous overlap reference if needed */ 4598 if (ps == NULL) { 4599 ps = kmem_cache_alloc(mirror_parent_cache, 4600 MD_ALLOCFLAGS); 4601 ps->ps_un = un; 4602 ps->ps_ui = ui; 4603 ps->ps_firstblk = 0; 4604 ps->ps_lastblk = 0; 4605 ps->ps_flags = 0; 4606 md_ioctl_readerexit(lockp); 4607 (void) md_ioctl_writerlock(lockp, ui); 4608 un->un_rs_prev_ovrlap = ps; 4609 md_ioctl_writerexit(lockp); 4610 } else 4611 md_ioctl_readerexit(lockp); 4612 4613 if (p->rs_originator != md_mn_mynode_id) { 4614 /* 4615 * On all but the originating node, first update 4616 * the resync state, then unblock the previous 4617 * region and block the next one. 
No need 4618 * to do this if the region is already blocked. 4619 * Update the submirror state and flags from the 4620 * originator. This keeps the cluster in sync with 4621 * regard to the resync status. 4622 */ 4623 4624 (void) md_ioctl_writerlock(lockp, ui); 4625 un->un_rs_resync_done = p->rs_done; 4626 un->un_rs_resync_2_do = p->rs_2_do; 4627 un->un_rs_type = p->rs_type; 4628 un->un_resync_startbl = p->rs_start; 4629 md_ioctl_writerexit(lockp); 4630 /* 4631 * Use un_owner_mx to ensure that an ownership change 4632 * cannot happen at the same time as this message 4633 */ 4634 mutex_enter(&un->un_owner_mx); 4635 if (MD_MN_MIRROR_OWNER(un)) { 4636 ps->ps_firstblk = p->rs_start; 4637 ps->ps_lastblk = ps->ps_firstblk + 4638 p->rs_size - 1; 4639 } else { 4640 if ((ps->ps_firstblk != p->rs_start) || 4641 (ps->ps_lastblk != p->rs_start + 4642 p->rs_size - 1)) { 4643 /* Remove previous overlap range */ 4644 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 4645 mirror_overlap_chain_remove(ps); 4646 4647 ps->ps_firstblk = p->rs_start; 4648 ps->ps_lastblk = ps->ps_firstblk + 4649 p->rs_size - 1; 4650 4651 mutex_exit(&un->un_owner_mx); 4652 /* Block this range from all i/o. */ 4653 if (ps->ps_firstblk != 0 || 4654 ps->ps_lastblk != 0) 4655 wait_for_overlaps(ps, 4656 MD_OVERLAP_ALLOW_REPEAT); 4657 mutex_enter(&un->un_owner_mx); 4658 /* 4659 * Check to see if we have obtained 4660 * ownership while waiting for 4661 * overlaps. If we have, remove 4662 * the resync_region entry from the 4663 * overlap chain 4664 */ 4665 if (MD_MN_MIRROR_OWNER(un) && 4666 (ps->ps_flags & MD_MPS_ON_OVERLAP)) 4667 mirror_overlap_chain_remove(ps); 4668 } 4669 } 4670 mutex_exit(&un->un_owner_mx); 4671 4672 /* 4673 * If this is the first RESYNC_NEXT message (i.e. 4674 * MD_MN_RS_FIRST_RESYNC_NEXT set in p->rs_flags), 4675 * issue RESYNC_START NOTIFY event 4676 */ 4677 if (p->rs_flags & MD_MN_RS_FIRST_RESYNC_NEXT) { 4678 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_START, 4679 SVM_TAG_METADEVICE, MD_UN2SET(un), 4680 MD_SID(un)); 4681 } 4682 4683 /* Ensure that our local resync thread is running */ 4684 if (un->un_rs_thread == NULL) { 4685 (void) mirror_resync_unit(p->mnum, NULL, 4686 &p->mde, lockp); 4687 } 4688 } 4689 break; 4690 case MD_MN_MSG_RESYNC_FINISH: 4691 /* 4692 * Complete the resync by stopping the resync thread. 4693 * Also release the previous overlap region field. 4694 * Update the resync_progress_thread by cv_signal'ing it so 4695 * that we mark the end of the resync as soon as possible. This 4696 * avoids an unnecessary delay should a panic occur after resync 4697 * completion.
4698 */ 4699 #ifdef DEBUG 4700 if (!rs_active) { 4701 if (mirror_debug_flag) 4702 printf("RESYNC_FINISH (mnum = %x), " 4703 "Resync *NOT* active", 4704 p->mnum); 4705 } 4706 #endif 4707 4708 if ((un->c.un_status & MD_UN_RESYNC_ACTIVE) && 4709 (p->rs_originator != md_mn_mynode_id)) { 4710 mutex_enter(&un->un_rs_thread_mx); 4711 un->c.un_status &= ~MD_UN_RESYNC_CANCEL; 4712 un->un_rs_thread_flags |= MD_RI_SHUTDOWN; 4713 un->un_rs_thread_flags &= 4714 ~(MD_RI_BLOCK|MD_RI_BLOCK_OWNER); 4715 cv_signal(&un->un_rs_thread_cv); 4716 mutex_exit(&un->un_rs_thread_mx); 4717 } 4718 if (is_ABR) { 4719 /* Resync finished, if ABR set owner to NULL */ 4720 mutex_enter(&un->un_owner_mx); 4721 un->un_mirror_owner = 0; 4722 mutex_exit(&un->un_owner_mx); 4723 } 4724 (void) md_ioctl_writerlock(lockp, ui); 4725 ps = un->un_rs_prev_ovrlap; 4726 if (ps != NULL) { 4727 /* Remove previous overlap range */ 4728 if (ps->ps_flags & MD_MPS_ON_OVERLAP) 4729 mirror_overlap_chain_remove(ps); 4730 /* 4731 * Release the overlap range reference 4732 */ 4733 un->un_rs_prev_ovrlap = NULL; 4734 kmem_cache_free(mirror_parent_cache, 4735 ps); 4736 } 4737 md_ioctl_writerexit(lockp); 4738 4739 /* Mark the resync as complete in the metadb */ 4740 un->un_rs_resync_done = p->rs_done; 4741 un->un_rs_resync_2_do = p->rs_2_do; 4742 un->un_rs_type = p->rs_type; 4743 mutex_enter(&un->un_rs_progress_mx); 4744 cv_signal(&un->un_rs_progress_cv); 4745 mutex_exit(&un->un_rs_progress_mx); 4746 4747 un = md_ioctl_writerlock(lockp, ui); 4748 un->c.un_status &= ~MD_UN_RESYNC_ACTIVE; 4749 /* Deal with any pending grow_unit */ 4750 if (un->c.un_status & MD_UN_GROW_PENDING) { 4751 if ((mirror_grow_unit(un, &mde) != 0) || 4752 (! mdismderror(&mde, MDE_GROW_DELAYED))) { 4753 un->c.un_status &= ~MD_UN_GROW_PENDING; 4754 } 4755 } 4756 md_ioctl_writerexit(lockp); 4757 break; 4758 4759 case MD_MN_MSG_RESYNC_PHASE_DONE: 4760 /* 4761 * A phase of the resync (optimized, component or 4762 * submirror) is complete. Update mirror status. 4763 * If the flag CLEAR_OPT_NOT_DONE is set, it means that the 4764 * mirror owner is performing a resync. If we have just snarfed 4765 * this set, then we must clear any of the flags set at snarf 4766 * time by unit_setup_resync(). 4767 * Note that unit_setup_resync() sets up these flags to 4768 * indicate that an optimized resync is required. These flags 4769 * need to be reset because if we get here, the mirror owner 4770 * will have handled the optimized resync. 4771 * The flags that must be cleared are MD_UN_OPT_NOT_DONE and 4772 * MD_UN_WAR. In addition, for each submirror, 4773 * MD_SM_RESYNC_TARGET must be cleared and SMS_OFFLINE_RESYNC 4774 * set to SMS_OFFLINE. 4775 */ 4776 #ifdef DEBUG 4777 if (mirror_debug_flag) 4778 printf("phase done mess received from %d, mnum=%x," 4779 "type=%x, flags=%x\n", p->rs_originator, p->mnum, 4780 p->rs_type, p->rs_flags); 4781 #endif 4782 /* 4783 * Ignore the message if there is no active resync thread. 4784 */ 4785 if (!rs_active) 4786 break; 4787 4788 broke_out = p->rs_flags & MD_MN_RS_ERR; 4789 switch (RS_TYPE(p->rs_type)) { 4790 case MD_RS_OPTIMIZED: 4791 un = md_ioctl_writerlock(lockp, ui); 4792 if (p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE) { 4793 /* If we are originator, just clear rs_type */ 4794 if (p->rs_originator == md_mn_mynode_id) { 4795 SET_RS_TYPE_NONE(un->un_rs_type); 4796 md_ioctl_writerexit(lockp); 4797 break; 4798 } 4799 /* 4800 * If CLEAR_OPT_NOT_DONE is set, only clear the 4801 * flags if OPT_NOT_DONE is set *and* rs_type 4802 * is MD_RS_NONE.
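 *
 * Summarising the CLEAR_OPT_NOT_DONE handling (illustrative only):
 *
 *    originator node                          -> just clear un_rs_type
 *    other node, OPT_NOT_DONE set and no
 *    resync in progress                       -> clear OPT_NOT_DONE/WAR
 *    other node, optimized resync in progress -> ignore the message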
4803 */ 4804 if ((un->c.un_status & MD_UN_OPT_NOT_DONE) && 4805 (RS_TYPE(un->un_rs_type) == MD_RS_NONE)) { 4806 /* No resync in progress */ 4807 un->c.un_status &= ~MD_UN_OPT_NOT_DONE; 4808 un->c.un_status &= ~MD_UN_WAR; 4809 } else { 4810 /* 4811 * We are in the middle of an 4812 * optimized resync and this message 4813 * should be ignored. 4814 */ 4815 md_ioctl_writerexit(lockp); 4816 break; 4817 } 4818 } else { 4819 /* 4820 * This is the end of an optimized resync, 4821 * clear the OPT_NOT_DONE and OFFLINE_SM flags 4822 */ 4823 4824 un->c.un_status &= ~MD_UN_KEEP_DIRTY; 4825 if (!broke_out) 4826 un->c.un_status &= ~MD_UN_WAR; 4827 } 4828 4829 /* 4830 * Set resync_completed to last resync type and then 4831 * clear resync_type to indicate no resync in progress 4832 */ 4833 un->un_resync_completed = un->un_rs_type; 4834 SET_RS_TYPE_NONE(un->un_rs_type); 4835 4836 /* 4837 * If resync is as a result of a submirror ONLINE, 4838 * reset the submirror state to SMS_RUNNING if the 4839 * resync was ok else set back to SMS_OFFLINE. 4840 */ 4841 for (smi = 0; smi < NMIRROR; smi++) { 4842 un->un_sm[smi].sm_flags &= 4843 ~MD_SM_RESYNC_TARGET; 4844 if (SMS_BY_INDEX_IS(un, smi, 4845 SMS_OFFLINE_RESYNC)) { 4846 if (p->rs_flags & 4847 MD_MN_RS_CLEAR_OPT_NOT_DONE) { 4848 state = SMS_OFFLINE; 4849 } else { 4850 state = (broke_out ? 4851 SMS_OFFLINE : SMS_RUNNING); 4852 } 4853 mirror_set_sm_state( 4854 &un->un_sm[smi], 4855 &un->un_smic[smi], state, 4856 broke_out); 4857 mirror_commit(un, NO_SUBMIRRORS, 4858 0); 4859 } 4860 /* 4861 * If we still have an offline submirror, reset 4862 * the OFFLINE_SM flag in the mirror status 4863 */ 4864 if (SMS_BY_INDEX_IS(un, smi, 4865 SMS_OFFLINE)) 4866 un->c.un_status |= 4867 MD_UN_OFFLINE_SM; 4868 } 4869 md_ioctl_writerexit(lockp); 4870 break; 4871 case MD_RS_SUBMIRROR: 4872 un = md_ioctl_writerlock(lockp, ui); 4873 smi = RS_SMI(p->rs_type); 4874 sm = &un->un_sm[smi]; 4875 smic = &un->un_smic[smi]; 4876 /* Clear RESYNC target */ 4877 un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET; 4878 /* 4879 * Set resync_completed to last resync type and then 4880 * clear resync_type to indicate no resync in progress 4881 */ 4882 un->un_resync_completed = un->un_rs_type; 4883 SET_RS_TYPE_NONE(un->un_rs_type); 4884 /* 4885 * If the resync completed ok reset the submirror 4886 * state to SMS_RUNNING else reset it to SMS_ATTACHED 4887 */ 4888 state = (broke_out ? 4889 SMS_ATTACHED : SMS_RUNNING); 4890 mirror_set_sm_state(sm, smic, state, broke_out); 4891 un->c.un_status &= ~MD_UN_WAR; 4892 mirror_commit(un, SMI2BIT(smi), 0); 4893 md_ioctl_writerexit(lockp); 4894 break; 4895 case MD_RS_COMPONENT: 4896 un = md_ioctl_writerlock(lockp, ui); 4897 smi = RS_SMI(p->rs_type); 4898 ci = RS_CI(p->rs_type); 4899 sm = &un->un_sm[smi]; 4900 smic = &un->un_smic[smi]; 4901 shared = (md_m_shared_t *) 4902 (*(smic->sm_shared_by_indx)) 4903 (sm->sm_dev, sm, ci); 4904 un->c.un_status &= ~MD_UN_WAR; 4905 /* Clear RESYNC target */ 4906 un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET; 4907 /* 4908 * Set resync_completed to last resync type and then 4909 * clear resync_type to indicate no resync in progress 4910 */ 4911 un->un_resync_completed = un->un_rs_type; 4912 SET_RS_TYPE_NONE(un->un_rs_type); 4913 4914 /* 4915 * If the resync completed ok, set the component state 4916 * to CS_OKAY. 4917 */ 4918 if (broke_out) 4919 shared->ms_flags |= MDM_S_RS_TRIED; 4920 else { 4921 /* 4922 * As we don't transmit the changes, 4923 * no need to drop the lock. 
4924 */ 4925 set_sm_comp_state(un, smi, ci, CS_OKAY, 0, 4926 MD_STATE_NO_XMIT, (IOLOCK *)NULL); 4927 } 4928 md_ioctl_writerexit(lockp); 4929 default: 4930 break; 4931 } 4932 /* 4933 * If the purpose of this PHASE_DONE message is just to 4934 * indicate to all other nodes that the optimized resync 4935 * required (OPT_NOT_DONE) flag is to be cleared, there is 4936 * no need to generate a notify event as there has not 4937 * actually been a resync. 4938 */ 4939 if (!(p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE)) { 4940 if (broke_out) { 4941 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_FAILED, 4942 SVM_TAG_METADEVICE, MD_UN2SET(un), 4943 MD_SID(un)); 4944 } else { 4945 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_DONE, 4946 SVM_TAG_METADEVICE, MD_UN2SET(un), 4947 MD_SID(un)); 4948 } 4949 } 4950 break; 4951 4952 default: 4953 #ifdef DEBUG 4954 cmn_err(CE_PANIC, "mirror_resync_message: Unknown message type" 4955 " %x\n", p->msg_type); 4956 #endif 4957 return (EINVAL); 4958 } 4959 return (0); 4960 } 4961 4962 /* Return a -1 if snarf of optimized record failed and set should be released */ 4963 static int 4964 mirror_snarf(md_snarfcmd_t cmd, set_t setno) 4965 { 4966 mddb_recid_t recid; 4967 int gotsomething; 4968 int all_mirrors_gotten; 4969 mm_unit_t *un; 4970 mddb_type_t typ1; 4971 mddb_de_ic_t *dep; 4972 mddb_rb32_t *rbp; 4973 size_t newreqsize; 4974 mm_unit_t *big_un; 4975 mm_unit32_od_t *small_un; 4976 int retval; 4977 mdi_unit_t *ui; 4978 4979 if (cmd == MD_SNARF_CLEANUP) { 4980 if (md_get_setstatus(setno) & MD_SET_STALE) 4981 return (0); 4982 4983 recid = mddb_makerecid(setno, 0); 4984 typ1 = (mddb_type_t)md_getshared_key(setno, 4985 mirror_md_ops.md_driver.md_drivername); 4986 while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) { 4987 if (mddb_getrecprivate(recid) & MD_PRV_CLEANUP) { 4988 un = (mm_unit_t *)mddb_getrecaddr(recid); 4989 mirror_cleanup(un); 4990 recid = mddb_makerecid(setno, 0); 4991 } 4992 } 4993 return (0); 4994 } 4995 4996 all_mirrors_gotten = 1; 4997 gotsomething = 0; 4998 4999 recid = mddb_makerecid(setno, 0); 5000 typ1 = (mddb_type_t)md_getshared_key(setno, 5001 mirror_md_ops.md_driver.md_drivername); 5002 5003 while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) { 5004 if (mddb_getrecprivate(recid) & MD_PRV_GOTIT) 5005 continue; 5006 5007 dep = mddb_getrecdep(recid); 5008 dep->de_flags = MDDB_F_MIRROR; 5009 rbp = dep->de_rb; 5010 5011 switch (rbp->rb_revision) { 5012 case MDDB_REV_RB: 5013 case MDDB_REV_RBFN: 5014 if ((rbp->rb_private & MD_PRV_CONVD) == 0) { 5015 /* 5016 * This means, we have an old and small 5017 * record and this record hasn't already 5018 * been converted. Before we create an 5019 * incore metadevice from this we have to 5020 * convert it to a big record. 5021 */ 5022 small_un = 5023 (mm_unit32_od_t *)mddb_getrecaddr(recid); 5024 newreqsize = sizeof (mm_unit_t); 5025 big_un = (mm_unit_t *)kmem_zalloc(newreqsize, 5026 KM_SLEEP); 5027 mirror_convert((caddr_t)small_un, 5028 (caddr_t)big_un, SMALL_2_BIG); 5029 kmem_free(small_un, dep->de_reqsize); 5030 5031 /* 5032 * Update userdata and incore userdata 5033 * incores are at the end of un 5034 */ 5035 dep->de_rb_userdata_ic = big_un; 5036 dep->de_rb_userdata = big_un; 5037 dep->de_icreqsize = newreqsize; 5038 un = big_un; 5039 rbp->rb_private |= MD_PRV_CONVD; 5040 } else { 5041 /* 5042 * Unit already converted, just get the 5043 * record address. 
5044 */ 5045 un = (mm_unit_t *)mddb_getrecaddr_resize(recid, 5046 sizeof (*un), 0); 5047 } 5048 un->c.un_revision &= ~MD_64BIT_META_DEV; 5049 break; 5050 case MDDB_REV_RB64: 5051 case MDDB_REV_RB64FN: 5052 /* Big device */ 5053 un = (mm_unit_t *)mddb_getrecaddr_resize(recid, 5054 sizeof (*un), 0); 5055 un->c.un_revision |= MD_64BIT_META_DEV; 5056 un->c.un_flag |= MD_EFILABEL; 5057 break; 5058 } 5059 NOTE_FN(rbp->rb_revision, un->c.un_revision); 5060 5061 /* 5062 * Create minor device node for snarfed entry. 5063 */ 5064 (void) md_create_minor_node(setno, MD_SID(un)); 5065 5066 if (MD_UNIT(MD_SID(un)) != NULL) { 5067 mddb_setrecprivate(recid, MD_PRV_PENDDEL); 5068 continue; 5069 } 5070 all_mirrors_gotten = 0; 5071 retval = mirror_build_incore(un, 1); 5072 if (retval == 0) { 5073 mddb_setrecprivate(recid, MD_PRV_GOTIT); 5074 md_create_unit_incore(MD_SID(un), &mirror_md_ops, 0); 5075 resync_start_timeout(setno); 5076 gotsomething = 1; 5077 } else { 5078 return (retval); 5079 } 5080 /* 5081 * Set flag to indicate that the mirror has not yet 5082 * been through a reconfig. This flag is used for MN sets 5083 * when determining whether to update the mirror state from 5084 * the Master node. 5085 */ 5086 if (MD_MNSET_SETNO(setno)) { 5087 ui = MDI_UNIT(MD_SID(un)); 5088 ui->ui_tstate |= MD_RESYNC_NOT_DONE; 5089 } 5090 } 5091 5092 if (!all_mirrors_gotten) 5093 return (gotsomething); 5094 5095 recid = mddb_makerecid(setno, 0); 5096 while ((recid = mddb_getnextrec(recid, typ1, RESYNC_REC)) > 0) 5097 if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT)) 5098 mddb_setrecprivate(recid, MD_PRV_PENDDEL); 5099 5100 return (0); 5101 } 5102 5103 static int 5104 mirror_halt(md_haltcmd_t cmd, set_t setno) 5105 { 5106 unit_t i; 5107 mdi_unit_t *ui; 5108 minor_t mnum; 5109 int reset_mirror_flag = 0; 5110 5111 if (cmd == MD_HALT_CLOSE) 5112 return (0); 5113 5114 if (cmd == MD_HALT_OPEN) 5115 return (0); 5116 5117 if (cmd == MD_HALT_UNLOAD) 5118 return (0); 5119 5120 if (cmd == MD_HALT_CHECK) { 5121 for (i = 0; i < md_nunits; i++) { 5122 mnum = MD_MKMIN(setno, i); 5123 if ((ui = MDI_UNIT(mnum)) == NULL) 5124 continue; 5125 if (ui->ui_opsindex != mirror_md_ops.md_selfindex) 5126 continue; 5127 if (md_unit_isopen(ui)) 5128 return (1); 5129 } 5130 return (0); 5131 } 5132 5133 if (cmd != MD_HALT_DOIT) 5134 return (1); 5135 5136 for (i = 0; i < md_nunits; i++) { 5137 mnum = MD_MKMIN(setno, i); 5138 if ((ui = MDI_UNIT(mnum)) == NULL) 5139 continue; 5140 if (ui->ui_opsindex != mirror_md_ops.md_selfindex) 5141 continue; 5142 reset_mirror((mm_unit_t *)MD_UNIT(mnum), mnum, 0); 5143 5144 /* Set a flag if there is at least one mirror metadevice. */ 5145 reset_mirror_flag = 1; 5146 } 5147 5148 /* 5149 * Only wait for the global dr_timeout to finish 5150 * - if there are mirror metadevices in this diskset or 5151 * - if this is the local set since an unload of the md_mirror 5152 * driver could follow a successful mirror halt in the local set. 5153 */ 5154 if ((reset_mirror_flag != 0) || (setno == MD_LOCAL_SET)) { 5155 while ((mirror_md_ops.md_head == NULL) && 5156 (mirror_timeout.dr_timeout_id != 0)) 5157 delay(md_hz); 5158 } 5159 5160 return (0); 5161 } 5162 5163 /*ARGSUSED3*/ 5164 static int 5165 mirror_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags) 5166 { 5167 IOLOCK lock; 5168 minor_t mnum = getminor(*dev); 5169 set_t setno; 5170 5171 /* 5172 * When doing an open of a multi owner metadevice, check to see if this 5173 * node is a starting node and if a reconfig cycle is underway. 
If so, the system isn't sufficiently set up to handle the 5175 * open (which involves I/O during sp_validate), so fail with ENXIO. 5176 */ 5177 setno = MD_MIN2SET(mnum); 5178 if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) == 5179 (MD_SET_MNSET | MD_SET_MN_START_RC)) { 5180 return (ENXIO); 5181 } 5182 5183 if (md_oflags & MD_OFLG_FROMIOCTL) { 5184 /* 5185 * This indicates that the caller is an ioctl service routine. 5186 * In this case we initialise our stack-based IOLOCK and pass 5187 * this into the internal open routine. This allows multi-owner 5188 * metadevices to avoid deadlocking if an error is encountered 5189 * during the open() attempt. The failure case is: 5190 * s-p -> mirror -> s-p (with error). Attempting to metaclear 5191 * this configuration would deadlock as the mirror code has to 5192 * send a state-update to the other nodes when it detects the 5193 * failure of the underlying submirror with an errored soft-part 5194 * on it. As there is a class1 message in progress (metaclear) 5195 * set_sm_comp_state() cannot send another class1 message; 5196 * instead we do not send a state_update message as the 5197 * metaclear is distributed and the failed submirror will be 5198 * cleared from the configuration by the metaclear. 5199 */ 5200 IOLOCK_INIT(&lock); 5201 return (mirror_internal_open(getminor(*dev), flag, otyp, 5202 md_oflags, &lock)); 5203 } else { 5204 return (mirror_internal_open(getminor(*dev), flag, otyp, 5205 md_oflags, (IOLOCK *)NULL)); 5206 } 5207 } 5208 5209 5210 /*ARGSUSED1*/ 5211 static int 5212 mirror_close(dev_t dev, int flag, int otyp, cred_t *cred_p, int md_cflags) 5213 { 5214 return (mirror_internal_close(getminor(dev), otyp, md_cflags, 5215 (IOLOCK *)NULL)); 5216 } 5217 5218 5219 /* 5220 * This routine dumps memory to the disk. It assumes that the memory has 5221 * already been mapped into mainbus space. It is called at disk interrupt 5222 * priority when the system is in trouble. 5223 * 5224 */ 5225 static int 5226 mirror_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk) 5227 { 5228 mm_unit_t *un; 5229 dev_t mapdev; 5230 int result; 5231 int smi; 5232 int any_succeed = 0; 5233 int save_result = 0; 5234 5235 /* 5236 * Don't need to grab the unit lock 5237 * because nothing else is supposed to be happening. 5238 * Also, dump is not supposed to sleep. 5239 */ 5240 un = (mm_unit_t *)MD_UNIT(getminor(dev)); 5241 5242 if ((diskaddr_t)blkno >= un->c.un_total_blocks) 5243 return (EINVAL); 5244 5245 if ((diskaddr_t)blkno + nblk > un->c.un_total_blocks) 5246 return (EINVAL); 5247 5248 for (smi = 0; smi < NMIRROR; smi++) { 5249 if (!SUBMIRROR_IS_WRITEABLE(un, smi)) 5250 continue; 5251 mapdev = md_dev64_to_dev(un->un_sm[smi].sm_dev); 5252 result = bdev_dump(mapdev, addr, blkno, nblk); 5253 if (result) 5254 save_result = result; 5255 5256 if (result == 0) 5257 any_succeed++; 5258 } 5259 5260 if (any_succeed) 5261 return (0); 5262 5263 return (save_result); 5264 } 5265 5266 /* 5267 * NAME: mirror_probe_dev 5268 * 5269 * DESCRIPTION: force opens every component of a mirror.
5270 * 5271 * On entry the unit writerlock is held 5272 */ 5273 static int 5274 mirror_probe_dev(mdi_unit_t *ui, minor_t mnum) 5275 { 5276 int i; 5277 int smi; 5278 int ci; 5279 mm_unit_t *un; 5280 int md_devopen = 0; 5281 set_t setno; 5282 int sm_cnt; 5283 int sm_unavail_cnt; 5284 5285 if (md_unit_isopen(ui)) 5286 md_devopen++; 5287 5288 un = MD_UNIT(mnum); 5289 setno = MD_UN2SET(un); 5290 5291 sm_cnt = 0; 5292 sm_unavail_cnt = 0; 5293 for (i = 0; i < NMIRROR; i++) { 5294 md_dev64_t tmpdev; 5295 mdi_unit_t *sm_ui; 5296 5297 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) { 5298 continue; 5299 } 5300 5301 sm_cnt++; 5302 tmpdev = un->un_sm[i].sm_dev; 5303 (void) md_layered_open(mnum, &tmpdev, 5304 MD_OFLG_CONT_ERRS | MD_OFLG_PROBEDEV); 5305 un->un_sm[i].sm_dev = tmpdev; 5306 5307 sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev))); 5308 5309 /* 5310 * Logic similar to that in mirror_open_all_devs. We set or 5311 * clear the submirror Unavailable bit. 5312 */ 5313 (void) md_unit_writerlock(sm_ui); 5314 if (submirror_unavailable(un, i, 1)) { 5315 sm_ui->ui_tstate |= MD_INACCESSIBLE; 5316 sm_unavail_cnt++; 5317 } else { 5318 sm_ui->ui_tstate &= ~MD_INACCESSIBLE; 5319 } 5320 md_unit_writerexit(sm_ui); 5321 } 5322 5323 /* 5324 * If all of the submirrors are unavailable, the mirror is also 5325 * unavailable. 5326 */ 5327 if (sm_cnt == sm_unavail_cnt) { 5328 ui->ui_tstate |= MD_INACCESSIBLE; 5329 } else { 5330 ui->ui_tstate &= ~MD_INACCESSIBLE; 5331 } 5332 5333 /* 5334 * Start checking from probe failures. If failures occur we 5335 * set the appropriate erred state only if the metadevice is in 5336 * use. This is specifically to prevent unnecessary resyncs. 5337 * For instance if the disks were accidentally disconnected when 5338 * the system booted up then until the metadevice is accessed 5339 * (like file system mount) the user can shutdown, recable and 5340 * reboot w/o incurring a potentially huge resync. 
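 *
 * Summarised as a decision table (illustrative only):
 *
 *    another copy of the data exists, device open -> mark CS_ERRED (and log)
 *    another copy exists, device closed           -> console info, no change
 *    this is the last copy, device open           -> mark CS_LAST_ERRED
 *    this is the last copy, device closed         -> close devs, return ENXIO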
5341 */ 5342 5343 smi = 0; 5344 ci = 0; 5345 while (mirror_geterror(un, &smi, &ci, 1, 1) != 0) { 5346 5347 if (mirror_other_sources(un, smi, ci, 0) == 1) { 5348 /* 5349 * Note that for a MN set, there is no need to call 5350 * SE_NOTIFY as that is done when processing the 5351 * state change 5352 */ 5353 if (md_devopen) { 5354 /* 5355 * Never called from ioctl context, 5356 * so (IOLOCK *)NULL 5357 */ 5358 set_sm_comp_state(un, smi, ci, CS_LAST_ERRED, 5359 0, MD_STATE_XMIT, (IOLOCK *)NULL); 5360 if (!MD_MNSET_SETNO(setno)) { 5361 SE_NOTIFY(EC_SVM_STATE, 5362 ESC_SVM_LASTERRED, 5363 SVM_TAG_METADEVICE, setno, 5364 MD_SID(un)); 5365 } 5366 continue; 5367 } else { 5368 (void) mirror_close_all_devs(un, 5369 MD_OFLG_PROBEDEV); 5370 if (!MD_MNSET_SETNO(setno)) { 5371 SE_NOTIFY(EC_SVM_STATE, 5372 ESC_SVM_OPEN_FAIL, 5373 SVM_TAG_METADEVICE, setno, 5374 MD_SID(un)); 5375 } 5376 mirror_openfail_console_info(un, smi, ci); 5377 return (ENXIO); 5378 } 5379 } 5380 5381 /* 5382 * Note that for a MN set, there is no need to call 5383 * SE_NOTIFY as that is done when processing the 5384 * state change 5385 */ 5386 if (md_devopen) { 5387 /* Never called from ioctl context, so (IOLOCK *)NULL */ 5388 set_sm_comp_state(un, smi, ci, CS_ERRED, 0, 5389 MD_STATE_XMIT, (IOLOCK *)NULL); 5390 if (!MD_MNSET_SETNO(setno)) { 5391 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, 5392 SVM_TAG_METADEVICE, setno, 5393 MD_SID(un)); 5394 } 5395 } 5396 mirror_openfail_console_info(un, smi, ci); 5397 ci++; 5398 } 5399 5400 if (MD_MNSET_SETNO(setno)) { 5401 send_poke_hotspares(setno); 5402 } else { 5403 (void) poke_hotspares(); 5404 } 5405 (void) mirror_close_all_devs(un, MD_OFLG_PROBEDEV); 5406 5407 return (0); 5408 } 5409 5410 5411 static int 5412 mirror_imp_set( 5413 set_t setno 5414 ) 5415 { 5416 5417 mddb_recid_t recid; 5418 int gotsomething, i; 5419 mddb_type_t typ1; 5420 mddb_de_ic_t *dep; 5421 mddb_rb32_t *rbp; 5422 mm_unit32_od_t *un32; 5423 mm_unit_t *un64; 5424 md_dev64_t self_devt; 5425 minor_t *self_id; /* minor needs to be updated */ 5426 md_parent_t *parent_id; /* parent needs to be updated */ 5427 mddb_recid_t *record_id; /* record id needs to be updated */ 5428 mddb_recid_t *optrec_id; 5429 md_dev64_t tmpdev; 5430 5431 5432 gotsomething = 0; 5433 5434 typ1 = (mddb_type_t)md_getshared_key(setno, 5435 mirror_md_ops.md_driver.md_drivername); 5436 recid = mddb_makerecid(setno, 0); 5437 5438 while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) { 5439 if (mddb_getrecprivate(recid) & MD_PRV_GOTIT) 5440 continue; 5441 5442 dep = mddb_getrecdep(recid); 5443 rbp = dep->de_rb; 5444 5445 switch (rbp->rb_revision) { 5446 case MDDB_REV_RB: 5447 case MDDB_REV_RBFN: 5448 /* 5449 * Small device 5450 */ 5451 un32 = (mm_unit32_od_t *)mddb_getrecaddr(recid); 5452 self_id = &(un32->c.un_self_id); 5453 parent_id = &(un32->c.un_parent); 5454 record_id = &(un32->c.un_record_id); 5455 optrec_id = &(un32->un_rr_dirty_recid); 5456 5457 for (i = 0; i < un32->un_nsm; i++) { 5458 tmpdev = md_expldev(un32->un_sm[i].sm_dev); 5459 un32->un_sm[i].sm_dev = md_cmpldev 5460 (md_makedevice(md_major, MD_MKMIN(setno, 5461 MD_MIN2UNIT(md_getminor(tmpdev))))); 5462 5463 if (!md_update_minor(setno, mddb_getsidenum 5464 (setno), un32->un_sm[i].sm_key)) 5465 goto out; 5466 } 5467 break; 5468 case MDDB_REV_RB64: 5469 case MDDB_REV_RB64FN: 5470 un64 = (mm_unit_t *)mddb_getrecaddr(recid); 5471 self_id = &(un64->c.un_self_id); 5472 parent_id = &(un64->c.un_parent); 5473 record_id = &(un64->c.un_record_id); 5474 optrec_id = &(un64->un_rr_dirty_recid); 5475 5476 
for (i = 0; i < un64->un_nsm; i++) { 5477 tmpdev = un64->un_sm[i].sm_dev; 5478 un64->un_sm[i].sm_dev = md_makedevice 5479 (md_major, MD_MKMIN(setno, MD_MIN2UNIT 5480 (md_getminor(tmpdev)))); 5481 5482 if (!md_update_minor(setno, mddb_getsidenum 5483 (setno), un64->un_sm[i].sm_key)) 5484 goto out; 5485 } 5486 break; 5487 } 5488 5489 /* 5490 * If this is a top level and a friendly name metadevice, 5491 * update its minor in the namespace. 5492 */ 5493 if ((*parent_id == MD_NO_PARENT) && 5494 ((rbp->rb_revision == MDDB_REV_RBFN) || 5495 (rbp->rb_revision == MDDB_REV_RB64FN))) { 5496 5497 self_devt = md_makedevice(md_major, *self_id); 5498 if (!md_update_top_device_minor(setno, 5499 mddb_getsidenum(setno), self_devt)) 5500 goto out; 5501 } 5502 5503 /* 5504 * Update unit with the imported setno 5505 * 5506 */ 5507 mddb_setrecprivate(recid, MD_PRV_GOTIT); 5508 5509 *self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id)); 5510 if (*parent_id != MD_NO_PARENT) 5511 *parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id)); 5512 *record_id = MAKERECID(setno, DBID(*record_id)); 5513 *optrec_id = MAKERECID(setno, DBID(*optrec_id)); 5514 5515 gotsomething = 1; 5516 } 5517 5518 out: 5519 return (gotsomething); 5520 } 5521 5522 /* 5523 * NAME: mirror_check_offline 5524 * 5525 * DESCRIPTION: return offline_status = 1 if any submirrors are offline 5526 * 5527 * Called from ioctl, so access to MD_UN_OFFLINE_SM in un_status is 5528 * protected by the global ioctl lock as it is only set by the MD_IOCOFFLINE 5529 * ioctl. 5530 */ 5531 int 5532 mirror_check_offline(md_dev64_t dev, int *offline_status) 5533 { 5534 mm_unit_t *un; 5535 md_error_t mde = mdnullerror; 5536 5537 if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL) 5538 return (EINVAL); 5539 *offline_status = 0; 5540 if (un->c.un_status & MD_UN_OFFLINE_SM) 5541 *offline_status = 1; 5542 return (0); 5543 } 5544 5545 /* 5546 * NAME: mirror_inc_abr_count 5547 * 5548 * DESCRIPTION: increment the count of layered soft parts with ABR set 5549 * 5550 * Called from ioctl, so access to un_abr_count is protected by the global 5551 * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl. 5552 */ 5553 int 5554 mirror_inc_abr_count(md_dev64_t dev) 5555 { 5556 mm_unit_t *un; 5557 md_error_t mde = mdnullerror; 5558 5559 if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL) 5560 return (EINVAL); 5561 un->un_abr_count++; 5562 return (0); 5563 } 5564 5565 /* 5566 * NAME: mirror_dec_abr_count 5567 * 5568 * DESCRIPTION: decrement the count of layered soft parts with ABR set 5569 * 5570 * Called from ioctl, so access to un_abr_count is protected by the global 5571 * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl. 
5572 */ 5573 int 5574 mirror_dec_abr_count(md_dev64_t dev) 5575 { 5576 mm_unit_t *un; 5577 md_error_t mde = mdnullerror; 5578 5579 if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL) 5580 return (EINVAL); 5581 un->un_abr_count--; 5582 return (0); 5583 } 5584 5585 static md_named_services_t mirror_named_services[] = { 5586 {(intptr_t (*)()) poke_hotspares, "poke hotspares" }, 5587 {(intptr_t (*)()) mirror_rename_listkids, MDRNM_LIST_URKIDS }, 5588 {mirror_rename_check, MDRNM_CHECK }, 5589 {(intptr_t (*)()) mirror_renexch_update_kids, MDRNM_UPDATE_KIDS }, 5590 {(intptr_t (*)()) mirror_exchange_parent_update_to, 5591 MDRNM_PARENT_UPDATE_TO}, 5592 {(intptr_t (*)()) mirror_exchange_self_update_from_down, 5593 MDRNM_SELF_UPDATE_FROM_DOWN }, 5594 {(intptr_t (*)())mirror_probe_dev, "probe open test" }, 5595 {(intptr_t (*)())mirror_check_offline, MD_CHECK_OFFLINE }, 5596 {(intptr_t (*)())mirror_inc_abr_count, MD_INC_ABR_COUNT }, 5597 {(intptr_t (*)())mirror_dec_abr_count, MD_DEC_ABR_COUNT }, 5598 { NULL, 0 } 5599 }; 5600 5601 md_ops_t mirror_md_ops = { 5602 mirror_open, /* open */ 5603 mirror_close, /* close */ 5604 md_mirror_strategy, /* strategy */ 5605 NULL, /* print */ 5606 mirror_dump, /* dump */ 5607 NULL, /* read */ 5608 NULL, /* write */ 5609 md_mirror_ioctl, /* mirror_ioctl, */ 5610 mirror_snarf, /* mirror_snarf */ 5611 mirror_halt, /* mirror_halt */ 5612 NULL, /* aread */ 5613 NULL, /* awrite */ 5614 mirror_imp_set, /* import set */ 5615 mirror_named_services 5616 }; 5617 5618 /* module specific initialization */ 5619 static void 5620 init_init() 5621 { 5622 md_mirror_mcs_buf_off = sizeof (md_mcs_t) - sizeof (buf_t); 5623 5624 /* Initialize the parent and child save memory pools */ 5625 mirror_parent_cache = kmem_cache_create("md_mirror_parent", 5626 sizeof (md_mps_t), 0, mirror_parent_constructor, 5627 mirror_parent_destructor, mirror_run_queue, NULL, NULL, 5628 0); 5629 5630 mirror_child_cache = kmem_cache_create("md_mirror_child", 5631 sizeof (md_mcs_t) - sizeof (buf_t) + biosize(), 0, 5632 mirror_child_constructor, mirror_child_destructor, 5633 mirror_run_queue, NULL, NULL, 0); 5634 5635 /* 5636 * Ensure wowbuf_size is a multiple of DEV_BSIZE, 5637 * then initialize wowbuf memory pool (see the sizing note at the end of this file). 5638 */ 5639 md_wowbuf_size = roundup(md_wowbuf_size, DEV_BSIZE); 5640 if (md_wowbuf_size <= 0) 5641 md_wowbuf_size = 2 * DEV_BSIZE; 5642 if (md_wowbuf_size > (32 * DEV_BSIZE)) 5643 md_wowbuf_size = (32 * DEV_BSIZE); 5644 5645 md_wowblk_size = md_wowbuf_size + sizeof (wowhdr_t); 5646 mirror_wowblk_cache = kmem_cache_create("md_mirror_wow", 5647 md_wowblk_size, 0, NULL, NULL, NULL, NULL, NULL, 0); 5648 5649 mutex_init(&mirror_timeout.dr_mx, NULL, MUTEX_DEFAULT, NULL); 5650 mutex_init(&hotspare_request.dr_mx, NULL, MUTEX_DEFAULT, NULL); 5651 5652 mutex_init(&non_ff_drv_mutex, NULL, MUTEX_DEFAULT, NULL); 5653 } 5654 5655 /* module specific uninitialization (undo init_init()) */ 5656 static void 5657 fini_uninit() 5658 { 5659 kmem_cache_destroy(mirror_parent_cache); 5660 kmem_cache_destroy(mirror_child_cache); 5661 kmem_cache_destroy(mirror_wowblk_cache); 5662 mirror_parent_cache = mirror_child_cache = 5663 mirror_wowblk_cache = NULL; 5664 5665 mutex_destroy(&mirror_timeout.dr_mx); 5666 mutex_destroy(&hotspare_request.dr_mx); 5667 mutex_destroy(&non_ff_drv_mutex); 5668 } 5669 5670 /* define the module linkage */ 5671 MD_PLUGIN_MISC_MODULE("mirrors module %I%", init_init(), fini_uninit()) 5672
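/*
 * Sizing note for init_init() (illustrative arithmetic, assuming the
 * usual DEV_BSIZE of 512): the default md_wowbuf_size of 16384 is
 * already a multiple of DEV_BSIZE and sits exactly at the
 * 32 * DEV_BSIZE cap, so roundup() and the clamps leave it unchanged,
 * and each write-on-write cache object is
 * md_wowblk_size = 16384 + sizeof (wowhdr_t) bytes.
 */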