1*0Sstevel@tonic-gate /* 2*0Sstevel@tonic-gate * CDDL HEADER START 3*0Sstevel@tonic-gate * 4*0Sstevel@tonic-gate * The contents of this file are subject to the terms of the 5*0Sstevel@tonic-gate * Common Development and Distribution License, Version 1.0 only 6*0Sstevel@tonic-gate * (the "License"). You may not use this file except in compliance 7*0Sstevel@tonic-gate * with the License. 8*0Sstevel@tonic-gate * 9*0Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10*0Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing. 11*0Sstevel@tonic-gate * See the License for the specific language governing permissions 12*0Sstevel@tonic-gate * and limitations under the License. 13*0Sstevel@tonic-gate * 14*0Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each 15*0Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16*0Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the 17*0Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying 18*0Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner] 19*0Sstevel@tonic-gate * 20*0Sstevel@tonic-gate * CDDL HEADER END 21*0Sstevel@tonic-gate */ 22*0Sstevel@tonic-gate /* 23*0Sstevel@tonic-gate * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24*0Sstevel@tonic-gate * Use is subject to license terms. 25*0Sstevel@tonic-gate */ 26*0Sstevel@tonic-gate 27*0Sstevel@tonic-gate #pragma ident "%Z%%M% %I% %E% SMI" 28*0Sstevel@tonic-gate 29*0Sstevel@tonic-gate /* 30*0Sstevel@tonic-gate * Soft partitioning metadevice driver (md_sp). 31*0Sstevel@tonic-gate * 32*0Sstevel@tonic-gate * This file contains the primary operations of the soft partitioning 33*0Sstevel@tonic-gate * metadevice driver. This includes all routines for normal operation 34*0Sstevel@tonic-gate * (open/close/read/write). 
Please see mdvar.h for a definition of 35*0Sstevel@tonic-gate * metadevice operations vector (md_ops_t). This driver is loosely 36*0Sstevel@tonic-gate * based on the stripe driver (md_stripe). 37*0Sstevel@tonic-gate * 38*0Sstevel@tonic-gate * All metadevice administration is done through the use of ioctl's. 39*0Sstevel@tonic-gate * As such, all administrative routines appear in sp_ioctl.c. 40*0Sstevel@tonic-gate * 41*0Sstevel@tonic-gate * Soft partitions are represented both in-core and in the metadb with a 42*0Sstevel@tonic-gate * unit structure. The soft partition-specific information in the unit 43*0Sstevel@tonic-gate * structure includes the following information: 44*0Sstevel@tonic-gate * - Device information (md_dev64_t & md key) about the device on which 45*0Sstevel@tonic-gate * the soft partition is built. 46*0Sstevel@tonic-gate * - Soft partition status information. 47*0Sstevel@tonic-gate * - The size of the soft partition and number of extents used to 48*0Sstevel@tonic-gate * make up that size. 49*0Sstevel@tonic-gate * - An array of exents which define virtual/physical offset 50*0Sstevel@tonic-gate * mappings and lengths for each extent. 51*0Sstevel@tonic-gate * 52*0Sstevel@tonic-gate * Typical soft partition operation proceeds as follows: 53*0Sstevel@tonic-gate * - The unit structure is fetched from the metadb and placed into 54*0Sstevel@tonic-gate * an in-core array (as with other metadevices). This operation 55*0Sstevel@tonic-gate * is performed via sp_build_incore( ) and takes place during 56*0Sstevel@tonic-gate * "snarfing" (when all metadevices are brought in-core at 57*0Sstevel@tonic-gate * once) and when a new soft partition is created. 58*0Sstevel@tonic-gate * - A soft partition is opened via sp_open( ). At open time the 59*0Sstevel@tonic-gate * the soft partition unit structure is verified with the soft 60*0Sstevel@tonic-gate * partition on-disk structures. 
Additionally, the soft partition 61*0Sstevel@tonic-gate * status is checked (only soft partitions in the OK state may be 62*0Sstevel@tonic-gate * opened). 63*0Sstevel@tonic-gate * - Soft partition I/O is performed via sp_strategy( ) which relies on 64*0Sstevel@tonic-gate * a support routine, sp_mapbuf( ), to do most of the work. 65*0Sstevel@tonic-gate * sp_mapbuf( ) maps a buffer to a particular extent via a binary 66*0Sstevel@tonic-gate * search of the extent array in the soft partition unit structure. 67*0Sstevel@tonic-gate * Once a translation has been performed, the I/O is passed down 68*0Sstevel@tonic-gate * to the next layer, which may be another metadevice or a physical 69*0Sstevel@tonic-gate * disk. Since a soft partition may contain multiple, non-contiguous 70*0Sstevel@tonic-gate * extents, a single I/O may have to be fragmented. 71*0Sstevel@tonic-gate * - Soft partitions are closed using sp_close. 72*0Sstevel@tonic-gate * 73*0Sstevel@tonic-gate */ 74*0Sstevel@tonic-gate 75*0Sstevel@tonic-gate #include <sys/param.h> 76*0Sstevel@tonic-gate #include <sys/systm.h> 77*0Sstevel@tonic-gate #include <sys/conf.h> 78*0Sstevel@tonic-gate #include <sys/file.h> 79*0Sstevel@tonic-gate #include <sys/user.h> 80*0Sstevel@tonic-gate #include <sys/uio.h> 81*0Sstevel@tonic-gate #include <sys/t_lock.h> 82*0Sstevel@tonic-gate #include <sys/buf.h> 83*0Sstevel@tonic-gate #include <sys/dkio.h> 84*0Sstevel@tonic-gate #include <sys/vtoc.h> 85*0Sstevel@tonic-gate #include <sys/kmem.h> 86*0Sstevel@tonic-gate #include <vm/page.h> 87*0Sstevel@tonic-gate #include <sys/cmn_err.h> 88*0Sstevel@tonic-gate #include <sys/sysmacros.h> 89*0Sstevel@tonic-gate #include <sys/types.h> 90*0Sstevel@tonic-gate #include <sys/mkdev.h> 91*0Sstevel@tonic-gate #include <sys/stat.h> 92*0Sstevel@tonic-gate #include <sys/open.h> 93*0Sstevel@tonic-gate #include <sys/lvm/mdvar.h> 94*0Sstevel@tonic-gate #include <sys/lvm/md_sp.h> 95*0Sstevel@tonic-gate #include <sys/lvm/md_convert.h> 96*0Sstevel@tonic-gate 
#include <sys/lvm/md_notify.h> 97*0Sstevel@tonic-gate #include <sys/lvm/md_crc.h> 98*0Sstevel@tonic-gate #include <sys/modctl.h> 99*0Sstevel@tonic-gate #include <sys/ddi.h> 100*0Sstevel@tonic-gate #include <sys/sunddi.h> 101*0Sstevel@tonic-gate #include <sys/debug.h> 102*0Sstevel@tonic-gate 103*0Sstevel@tonic-gate #include <sys/sysevent/eventdefs.h> 104*0Sstevel@tonic-gate #include <sys/sysevent/svm.h> 105*0Sstevel@tonic-gate 106*0Sstevel@tonic-gate md_ops_t sp_md_ops; 107*0Sstevel@tonic-gate #ifndef lint 108*0Sstevel@tonic-gate static char _depends_on[] = "drv/md"; 109*0Sstevel@tonic-gate md_ops_t *md_interface_ops = &sp_md_ops; 110*0Sstevel@tonic-gate #endif 111*0Sstevel@tonic-gate 112*0Sstevel@tonic-gate extern unit_t md_nunits; 113*0Sstevel@tonic-gate extern set_t md_nsets; 114*0Sstevel@tonic-gate extern md_set_t md_set[]; 115*0Sstevel@tonic-gate 116*0Sstevel@tonic-gate extern int md_status; 117*0Sstevel@tonic-gate extern major_t md_major; 118*0Sstevel@tonic-gate extern mdq_anchor_t md_done_daemon; 119*0Sstevel@tonic-gate extern mdq_anchor_t md_sp_daemon; 120*0Sstevel@tonic-gate extern kmutex_t md_mx; 121*0Sstevel@tonic-gate extern kcondvar_t md_cv; 122*0Sstevel@tonic-gate extern md_krwlock_t md_unit_array_rw; 123*0Sstevel@tonic-gate 124*0Sstevel@tonic-gate static kmem_cache_t *sp_parent_cache = NULL; 125*0Sstevel@tonic-gate static kmem_cache_t *sp_child_cache = NULL; 126*0Sstevel@tonic-gate static void sp_send_stat_ok(mp_unit_t *); 127*0Sstevel@tonic-gate static void sp_send_stat_err(mp_unit_t *); 128*0Sstevel@tonic-gate 129*0Sstevel@tonic-gate /* 130*0Sstevel@tonic-gate * FUNCTION: sp_parent_constructor() 131*0Sstevel@tonic-gate * INPUT: none. 132*0Sstevel@tonic-gate * OUTPUT: ps - parent save structure initialized. 133*0Sstevel@tonic-gate * RETURNS: void * - ptr to initialized parent save structure. 134*0Sstevel@tonic-gate * PURPOSE: initialize parent save structure. 
135*0Sstevel@tonic-gate */ 136*0Sstevel@tonic-gate /*ARGSUSED1*/ 137*0Sstevel@tonic-gate static int 138*0Sstevel@tonic-gate sp_parent_constructor(void *p, void *d1, int d2) 139*0Sstevel@tonic-gate { 140*0Sstevel@tonic-gate mutex_init(&((md_spps_t *)p)->ps_mx, 141*0Sstevel@tonic-gate NULL, MUTEX_DEFAULT, NULL); 142*0Sstevel@tonic-gate return (0); 143*0Sstevel@tonic-gate } 144*0Sstevel@tonic-gate 145*0Sstevel@tonic-gate static void 146*0Sstevel@tonic-gate sp_parent_init(md_spps_t *ps) 147*0Sstevel@tonic-gate { 148*0Sstevel@tonic-gate bzero(ps, offsetof(md_spps_t, ps_mx)); 149*0Sstevel@tonic-gate } 150*0Sstevel@tonic-gate 151*0Sstevel@tonic-gate /*ARGSUSED1*/ 152*0Sstevel@tonic-gate static void 153*0Sstevel@tonic-gate sp_parent_destructor(void *p, void *d) 154*0Sstevel@tonic-gate { 155*0Sstevel@tonic-gate mutex_destroy(&((md_spps_t *)p)->ps_mx); 156*0Sstevel@tonic-gate } 157*0Sstevel@tonic-gate 158*0Sstevel@tonic-gate /* 159*0Sstevel@tonic-gate * FUNCTION: sp_child_constructor() 160*0Sstevel@tonic-gate * INPUT: none. 161*0Sstevel@tonic-gate * OUTPUT: cs - child save structure initialized. 162*0Sstevel@tonic-gate * RETURNS: void * - ptr to initialized child save structure. 163*0Sstevel@tonic-gate * PURPOSE: initialize child save structure. 
164*0Sstevel@tonic-gate */ 165*0Sstevel@tonic-gate /*ARGSUSED1*/ 166*0Sstevel@tonic-gate static int 167*0Sstevel@tonic-gate sp_child_constructor(void *p, void *d1, int d2) 168*0Sstevel@tonic-gate { 169*0Sstevel@tonic-gate bioinit(&((md_spcs_t *)p)->cs_buf); 170*0Sstevel@tonic-gate return (0); 171*0Sstevel@tonic-gate } 172*0Sstevel@tonic-gate 173*0Sstevel@tonic-gate static void 174*0Sstevel@tonic-gate sp_child_init(md_spcs_t *cs) 175*0Sstevel@tonic-gate { 176*0Sstevel@tonic-gate cs->cs_mdunit = 0; 177*0Sstevel@tonic-gate cs->cs_ps = NULL; 178*0Sstevel@tonic-gate md_bioreset(&cs->cs_buf); 179*0Sstevel@tonic-gate } 180*0Sstevel@tonic-gate 181*0Sstevel@tonic-gate /*ARGSUSED1*/ 182*0Sstevel@tonic-gate static void 183*0Sstevel@tonic-gate sp_child_destructor(void *p, void *d) 184*0Sstevel@tonic-gate { 185*0Sstevel@tonic-gate biofini(&((md_spcs_t *)p)->cs_buf); 186*0Sstevel@tonic-gate } 187*0Sstevel@tonic-gate 188*0Sstevel@tonic-gate /* 189*0Sstevel@tonic-gate * FUNCTION: sp_run_queue() 190*0Sstevel@tonic-gate * INPUT: none. 191*0Sstevel@tonic-gate * OUTPUT: none. 192*0Sstevel@tonic-gate * RETURNS: void. 193*0Sstevel@tonic-gate * PURPOSE: run the md_daemon to clean up memory pool. 194*0Sstevel@tonic-gate */ 195*0Sstevel@tonic-gate /*ARGSUSED*/ 196*0Sstevel@tonic-gate static void 197*0Sstevel@tonic-gate sp_run_queue(void *d) 198*0Sstevel@tonic-gate { 199*0Sstevel@tonic-gate if (!(md_status & MD_GBL_DAEMONS_LIVE)) 200*0Sstevel@tonic-gate md_daemon(1, &md_done_daemon); 201*0Sstevel@tonic-gate } 202*0Sstevel@tonic-gate 203*0Sstevel@tonic-gate 204*0Sstevel@tonic-gate /* 205*0Sstevel@tonic-gate * FUNCTION: sp_build_incore() 206*0Sstevel@tonic-gate * INPUT: p - ptr to unit structure. 207*0Sstevel@tonic-gate * snarfing - flag to tell us we are snarfing. 208*0Sstevel@tonic-gate * OUTPUT: non. 209*0Sstevel@tonic-gate * RETURNS: int - 0 (always). 210*0Sstevel@tonic-gate * PURPOSE: place unit structure into in-core unit array (keyed from 211*0Sstevel@tonic-gate * minor number). 
212*0Sstevel@tonic-gate */ 213*0Sstevel@tonic-gate int 214*0Sstevel@tonic-gate sp_build_incore(void *p, int snarfing) 215*0Sstevel@tonic-gate { 216*0Sstevel@tonic-gate mp_unit_t *un = (mp_unit_t *)p; 217*0Sstevel@tonic-gate minor_t mnum; 218*0Sstevel@tonic-gate set_t setno; 219*0Sstevel@tonic-gate md_dev64_t tmpdev; 220*0Sstevel@tonic-gate 221*0Sstevel@tonic-gate mnum = MD_SID(un); 222*0Sstevel@tonic-gate 223*0Sstevel@tonic-gate if (MD_UNIT(mnum) != NULL) 224*0Sstevel@tonic-gate return (0); 225*0Sstevel@tonic-gate 226*0Sstevel@tonic-gate MD_STATUS(un) = 0; 227*0Sstevel@tonic-gate 228*0Sstevel@tonic-gate if (snarfing) { 229*0Sstevel@tonic-gate /* 230*0Sstevel@tonic-gate * if we are snarfing, we get the device information 231*0Sstevel@tonic-gate * from the metadb record (using the metadb key for 232*0Sstevel@tonic-gate * that device). 233*0Sstevel@tonic-gate */ 234*0Sstevel@tonic-gate setno = MD_MIN2SET(mnum); 235*0Sstevel@tonic-gate 236*0Sstevel@tonic-gate tmpdev = md_getdevnum(setno, mddb_getsidenum(setno), 237*0Sstevel@tonic-gate un->un_key, MD_NOTRUST_DEVT); 238*0Sstevel@tonic-gate un->un_dev = tmpdev; 239*0Sstevel@tonic-gate } 240*0Sstevel@tonic-gate 241*0Sstevel@tonic-gate /* place unit in in-core array */ 242*0Sstevel@tonic-gate MD_UNIT(mnum) = un; 243*0Sstevel@tonic-gate return (0); 244*0Sstevel@tonic-gate } 245*0Sstevel@tonic-gate 246*0Sstevel@tonic-gate /* 247*0Sstevel@tonic-gate * FUNCTION: reset_sp() 248*0Sstevel@tonic-gate * INPUT: un - unit structure to be reset/removed. 249*0Sstevel@tonic-gate * mnum - minor number to be reset/removed. 250*0Sstevel@tonic-gate * removing - flag to tell us if we are removing 251*0Sstevel@tonic-gate * permanently or just reseting in-core 252*0Sstevel@tonic-gate * structures. 253*0Sstevel@tonic-gate * OUTPUT: none. 254*0Sstevel@tonic-gate * RETURNS: void. 255*0Sstevel@tonic-gate * PURPOSE: used to either simply reset in-core structures or to 256*0Sstevel@tonic-gate * permanently remove metadevices from the metadb. 
257*0Sstevel@tonic-gate */ 258*0Sstevel@tonic-gate void 259*0Sstevel@tonic-gate reset_sp(mp_unit_t *un, minor_t mnum, int removing) 260*0Sstevel@tonic-gate { 261*0Sstevel@tonic-gate sv_dev_t *sv; 262*0Sstevel@tonic-gate mddb_recid_t vtoc_id; 263*0Sstevel@tonic-gate 264*0Sstevel@tonic-gate /* clean up in-core structures */ 265*0Sstevel@tonic-gate md_destroy_unit_incore(mnum, &sp_md_ops); 266*0Sstevel@tonic-gate 267*0Sstevel@tonic-gate MD_UNIT(mnum) = NULL; 268*0Sstevel@tonic-gate 269*0Sstevel@tonic-gate if (!removing) 270*0Sstevel@tonic-gate return; 271*0Sstevel@tonic-gate 272*0Sstevel@tonic-gate /* we are removing the soft partition from the metadb */ 273*0Sstevel@tonic-gate 274*0Sstevel@tonic-gate /* 275*0Sstevel@tonic-gate * Save off device information so we can get to 276*0Sstevel@tonic-gate * it after we do the mddb_deleterec(). 277*0Sstevel@tonic-gate */ 278*0Sstevel@tonic-gate sv = (sv_dev_t *)kmem_alloc(sizeof (sv_dev_t), KM_SLEEP); 279*0Sstevel@tonic-gate sv->setno = MD_MIN2SET(mnum); 280*0Sstevel@tonic-gate sv->key = un->un_key; 281*0Sstevel@tonic-gate vtoc_id = un->c.un_vtoc_id; 282*0Sstevel@tonic-gate 283*0Sstevel@tonic-gate /* Remove the unit structure */ 284*0Sstevel@tonic-gate mddb_deleterec_wrapper(un->c.un_record_id); 285*0Sstevel@tonic-gate 286*0Sstevel@tonic-gate if (vtoc_id) 287*0Sstevel@tonic-gate mddb_deleterec_wrapper(vtoc_id); 288*0Sstevel@tonic-gate 289*0Sstevel@tonic-gate SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, TAG_METADEVICE, 290*0Sstevel@tonic-gate MD_MIN2SET(mnum), MD_MIN2UNIT(mnum)); 291*0Sstevel@tonic-gate 292*0Sstevel@tonic-gate /* 293*0Sstevel@tonic-gate * remove the underlying device name from the metadb. if other 294*0Sstevel@tonic-gate * soft partitions are built on this device, this will simply 295*0Sstevel@tonic-gate * decrease the reference count for this device. otherwise the 296*0Sstevel@tonic-gate * name record for this device will be removed from the metadb. 
297*0Sstevel@tonic-gate */ 298*0Sstevel@tonic-gate md_rem_names(sv, 1); 299*0Sstevel@tonic-gate kmem_free(sv, sizeof (sv_dev_t)); 300*0Sstevel@tonic-gate } 301*0Sstevel@tonic-gate 302*0Sstevel@tonic-gate /* 303*0Sstevel@tonic-gate * FUNCTION: sp_send_stat_msg 304*0Sstevel@tonic-gate * INPUT: un - unit reference 305*0Sstevel@tonic-gate * status - status to be sent to master node 306*0Sstevel@tonic-gate * MD_SP_OK - soft-partition is now OK 307*0Sstevel@tonic-gate * MD_SP_ERR " " errored 308*0Sstevel@tonic-gate * OUTPUT: none. 309*0Sstevel@tonic-gate * RETURNS: void. 310*0Sstevel@tonic-gate * PURPOSE: send a soft-partition status change to the master node. If the 311*0Sstevel@tonic-gate * message succeeds we simply return. If it fails we panic as the 312*0Sstevel@tonic-gate * cluster-wide view of the metadevices is now inconsistent. 313*0Sstevel@tonic-gate * CALLING CONTEXT: 314*0Sstevel@tonic-gate * Blockable. No locks can be held. 315*0Sstevel@tonic-gate */ 316*0Sstevel@tonic-gate static void 317*0Sstevel@tonic-gate sp_send_stat_msg(mp_unit_t *un, sp_status_t status) 318*0Sstevel@tonic-gate { 319*0Sstevel@tonic-gate md_mn_msg_sp_setstat_t sp_msg; 320*0Sstevel@tonic-gate md_mn_kresult_t *kres; 321*0Sstevel@tonic-gate set_t setno = MD_UN2SET(un); 322*0Sstevel@tonic-gate int rval; 323*0Sstevel@tonic-gate const char *str = (status == MD_SP_ERR) ? 
"MD_SP_ERR" : "MD_SP_OK"; 324*0Sstevel@tonic-gate 325*0Sstevel@tonic-gate sp_msg.sp_setstat_mnum = MD_SID(un); 326*0Sstevel@tonic-gate sp_msg.sp_setstat_status = status; 327*0Sstevel@tonic-gate 328*0Sstevel@tonic-gate kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); 329*0Sstevel@tonic-gate 330*0Sstevel@tonic-gate rval = mdmn_ksend_message(setno, MD_MN_MSG_SP_SETSTAT2, MD_MSGF_NO_LOG, 331*0Sstevel@tonic-gate (char *)&sp_msg, sizeof (sp_msg), kres); 332*0Sstevel@tonic-gate 333*0Sstevel@tonic-gate if (!MDMN_KSEND_MSG_OK(rval, kres)) { 334*0Sstevel@tonic-gate mdmn_ksend_show_error(rval, kres, "MD_MN_MSG_SP_SETSTAT2"); 335*0Sstevel@tonic-gate 336*0Sstevel@tonic-gate /* 337*0Sstevel@tonic-gate * Panic as we are now in an inconsistent state. 338*0Sstevel@tonic-gate */ 339*0Sstevel@tonic-gate 340*0Sstevel@tonic-gate cmn_err(CE_PANIC, "md: %s: %s could not be set on all nodes\n", 341*0Sstevel@tonic-gate md_shortname(MD_SID(un)), str); 342*0Sstevel@tonic-gate } 343*0Sstevel@tonic-gate 344*0Sstevel@tonic-gate kmem_free(kres, sizeof (md_mn_kresult_t)); 345*0Sstevel@tonic-gate } 346*0Sstevel@tonic-gate 347*0Sstevel@tonic-gate /* 348*0Sstevel@tonic-gate * FUNCTION: sp_finish_error 349*0Sstevel@tonic-gate * INPUT: ps - parent save structure for error-ed I/O. 350*0Sstevel@tonic-gate * lock_held - set if the unit readerlock is held 351*0Sstevel@tonic-gate * OUTPUT: none. 352*0Sstevel@tonic-gate * RETURNS: void. 
353*0Sstevel@tonic-gate * PURPOSE: report a driver error 354*0Sstevel@tonic-gate */ 355*0Sstevel@tonic-gate static void 356*0Sstevel@tonic-gate sp_finish_error(md_spps_t *ps, int lock_held) 357*0Sstevel@tonic-gate { 358*0Sstevel@tonic-gate struct buf *pb = ps->ps_bp; 359*0Sstevel@tonic-gate mdi_unit_t *ui = ps->ps_ui; 360*0Sstevel@tonic-gate md_dev64_t un_dev; /* underlying device */ 361*0Sstevel@tonic-gate md_dev64_t md_dev = md_expldev(pb->b_edev); /* metadev in error */ 362*0Sstevel@tonic-gate char *str; 363*0Sstevel@tonic-gate 364*0Sstevel@tonic-gate un_dev = md_expldev(ps->ps_un->un_dev); 365*0Sstevel@tonic-gate /* set error type */ 366*0Sstevel@tonic-gate if (pb->b_flags & B_READ) { 367*0Sstevel@tonic-gate str = "read"; 368*0Sstevel@tonic-gate } else { 369*0Sstevel@tonic-gate str = "write"; 370*0Sstevel@tonic-gate } 371*0Sstevel@tonic-gate 372*0Sstevel@tonic-gate 373*0Sstevel@tonic-gate SPPS_FREE(sp_parent_cache, ps); 374*0Sstevel@tonic-gate pb->b_flags |= B_ERROR; 375*0Sstevel@tonic-gate 376*0Sstevel@tonic-gate md_kstat_done(ui, pb, 0); 377*0Sstevel@tonic-gate 378*0Sstevel@tonic-gate if (lock_held) { 379*0Sstevel@tonic-gate md_unit_readerexit(ui); 380*0Sstevel@tonic-gate } 381*0Sstevel@tonic-gate md_biodone(pb); 382*0Sstevel@tonic-gate 383*0Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: %s error on %s", 384*0Sstevel@tonic-gate md_shortname(md_getminor(md_dev)), str, 385*0Sstevel@tonic-gate md_devname(MD_DEV2SET(md_dev), un_dev, NULL, 0)); 386*0Sstevel@tonic-gate } 387*0Sstevel@tonic-gate 388*0Sstevel@tonic-gate 389*0Sstevel@tonic-gate /* 390*0Sstevel@tonic-gate * FUNCTION: sp_xmit_ok 391*0Sstevel@tonic-gate * INPUT: dq - daemon queue referencing failing ps structure 392*0Sstevel@tonic-gate * OUTPUT: none. 393*0Sstevel@tonic-gate * RETURNS: void. 394*0Sstevel@tonic-gate * PURPOSE: send a message to the master node in a multi-owner diskset to 395*0Sstevel@tonic-gate * update all attached nodes view of the soft-part to be MD_SP_OK. 
396*0Sstevel@tonic-gate * CALLING CONTEXT: 397*0Sstevel@tonic-gate * Blockable. No unit lock held. 398*0Sstevel@tonic-gate */ 399*0Sstevel@tonic-gate static void 400*0Sstevel@tonic-gate sp_xmit_ok(daemon_queue_t *dq) 401*0Sstevel@tonic-gate { 402*0Sstevel@tonic-gate md_spps_t *ps = (md_spps_t *)dq; 403*0Sstevel@tonic-gate 404*0Sstevel@tonic-gate /* Send a MD_MN_MSG_SP_SETSTAT to the master */ 405*0Sstevel@tonic-gate sp_send_stat_msg(ps->ps_un, MD_SP_OK); 406*0Sstevel@tonic-gate 407*0Sstevel@tonic-gate /* 408*0Sstevel@tonic-gate * Successfully transmitted error state to all nodes, now release this 409*0Sstevel@tonic-gate * parent structure. 410*0Sstevel@tonic-gate */ 411*0Sstevel@tonic-gate SPPS_FREE(sp_parent_cache, ps); 412*0Sstevel@tonic-gate } 413*0Sstevel@tonic-gate 414*0Sstevel@tonic-gate /* 415*0Sstevel@tonic-gate * FUNCTION: sp_xmit_error 416*0Sstevel@tonic-gate * INPUT: dq - daemon queue referencing failing ps structure 417*0Sstevel@tonic-gate * OUTPUT: none. 418*0Sstevel@tonic-gate * RETURNS: void. 419*0Sstevel@tonic-gate * PURPOSE: send a message to the master node in a multi-owner diskset to 420*0Sstevel@tonic-gate * update all attached nodes view of the soft-part to be MD_SP_ERR. 421*0Sstevel@tonic-gate * CALLING CONTEXT: 422*0Sstevel@tonic-gate * Blockable. No unit lock held. 423*0Sstevel@tonic-gate */ 424*0Sstevel@tonic-gate static void 425*0Sstevel@tonic-gate sp_xmit_error(daemon_queue_t *dq) 426*0Sstevel@tonic-gate { 427*0Sstevel@tonic-gate md_spps_t *ps = (md_spps_t *)dq; 428*0Sstevel@tonic-gate 429*0Sstevel@tonic-gate /* Send a MD_MN_MSG_SP_SETSTAT to the master */ 430*0Sstevel@tonic-gate sp_send_stat_msg(ps->ps_un, MD_SP_ERR); 431*0Sstevel@tonic-gate 432*0Sstevel@tonic-gate /* 433*0Sstevel@tonic-gate * Successfully transmitted error state to all nodes, now release this 434*0Sstevel@tonic-gate * parent structure. 
435*0Sstevel@tonic-gate */ 436*0Sstevel@tonic-gate SPPS_FREE(sp_parent_cache, ps); 437*0Sstevel@tonic-gate } 438*0Sstevel@tonic-gate static void 439*0Sstevel@tonic-gate sp_send_stat_ok(mp_unit_t *un) 440*0Sstevel@tonic-gate { 441*0Sstevel@tonic-gate minor_t mnum = MD_SID(un); 442*0Sstevel@tonic-gate md_spps_t *ps; 443*0Sstevel@tonic-gate 444*0Sstevel@tonic-gate ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS); 445*0Sstevel@tonic-gate sp_parent_init(ps); 446*0Sstevel@tonic-gate ps->ps_un = un; 447*0Sstevel@tonic-gate ps->ps_ui = MDI_UNIT(mnum); 448*0Sstevel@tonic-gate 449*0Sstevel@tonic-gate daemon_request(&md_sp_daemon, sp_xmit_ok, (daemon_queue_t *)ps, 450*0Sstevel@tonic-gate REQ_OLD); 451*0Sstevel@tonic-gate } 452*0Sstevel@tonic-gate 453*0Sstevel@tonic-gate static void 454*0Sstevel@tonic-gate sp_send_stat_err(mp_unit_t *un) 455*0Sstevel@tonic-gate { 456*0Sstevel@tonic-gate minor_t mnum = MD_SID(un); 457*0Sstevel@tonic-gate md_spps_t *ps; 458*0Sstevel@tonic-gate 459*0Sstevel@tonic-gate ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS); 460*0Sstevel@tonic-gate sp_parent_init(ps); 461*0Sstevel@tonic-gate ps->ps_un = un; 462*0Sstevel@tonic-gate ps->ps_ui = MDI_UNIT(mnum); 463*0Sstevel@tonic-gate 464*0Sstevel@tonic-gate daemon_request(&md_sp_daemon, sp_xmit_error, (daemon_queue_t *)ps, 465*0Sstevel@tonic-gate REQ_OLD); 466*0Sstevel@tonic-gate } 467*0Sstevel@tonic-gate 468*0Sstevel@tonic-gate 469*0Sstevel@tonic-gate /* 470*0Sstevel@tonic-gate * FUNCTION: sp_error() 471*0Sstevel@tonic-gate * INPUT: ps - parent save structure for error-ed I/O. 472*0Sstevel@tonic-gate * OUTPUT: none. 473*0Sstevel@tonic-gate * RETURNS: void. 474*0Sstevel@tonic-gate * PURPOSE: report a driver error. 
475*0Sstevel@tonic-gate * CALLING CONTEXT: 476*0Sstevel@tonic-gate * Interrupt - non-blockable 477*0Sstevel@tonic-gate */ 478*0Sstevel@tonic-gate static void 479*0Sstevel@tonic-gate sp_error(md_spps_t *ps) 480*0Sstevel@tonic-gate { 481*0Sstevel@tonic-gate set_t setno = MD_UN2SET(ps->ps_un); 482*0Sstevel@tonic-gate 483*0Sstevel@tonic-gate /* 484*0Sstevel@tonic-gate * Drop the mutex associated with this request before (potentially) 485*0Sstevel@tonic-gate * enqueuing the free onto a separate thread. We have to release the 486*0Sstevel@tonic-gate * mutex before destroying the parent structure. 487*0Sstevel@tonic-gate */ 488*0Sstevel@tonic-gate if (!(ps->ps_flags & MD_SPPS_DONTFREE)) { 489*0Sstevel@tonic-gate if (MUTEX_HELD(&ps->ps_mx)) { 490*0Sstevel@tonic-gate mutex_exit(&ps->ps_mx); 491*0Sstevel@tonic-gate } 492*0Sstevel@tonic-gate } else { 493*0Sstevel@tonic-gate /* 494*0Sstevel@tonic-gate * this should only ever happen if we are panicking, 495*0Sstevel@tonic-gate * since DONTFREE is only set on the parent if panicstr 496*0Sstevel@tonic-gate * is non-NULL. 497*0Sstevel@tonic-gate */ 498*0Sstevel@tonic-gate ASSERT(panicstr); 499*0Sstevel@tonic-gate } 500*0Sstevel@tonic-gate 501*0Sstevel@tonic-gate /* 502*0Sstevel@tonic-gate * For a multi-owner set we need to send a message to the master so that 503*0Sstevel@tonic-gate * all nodes get the errored status when we first encounter it. To avoid 504*0Sstevel@tonic-gate * deadlocking when multiple soft-partitions encounter an error on one 505*0Sstevel@tonic-gate * physical unit we drop the unit readerlock before enqueueing the 506*0Sstevel@tonic-gate * request. That way we can service any messages that require a 507*0Sstevel@tonic-gate * writerlock to be held. 
Additionally, to avoid deadlocking when at 508*0Sstevel@tonic-gate * the bottom of a metadevice stack and a higher level mirror has 509*0Sstevel@tonic-gate * multiple requests outstanding on this soft-part, we clone the ps 510*0Sstevel@tonic-gate * that failed and pass the error back up the stack to release the 511*0Sstevel@tonic-gate * reference that this i/o may have in the higher-level metadevice. 512*0Sstevel@tonic-gate * The other nodes in the cluster just have to modify the soft-part 513*0Sstevel@tonic-gate * status and we do not need to block the i/o completion for this. 514*0Sstevel@tonic-gate */ 515*0Sstevel@tonic-gate if (MD_MNSET_SETNO(setno)) { 516*0Sstevel@tonic-gate md_spps_t *err_ps; 517*0Sstevel@tonic-gate err_ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS); 518*0Sstevel@tonic-gate sp_parent_init(err_ps); 519*0Sstevel@tonic-gate 520*0Sstevel@tonic-gate err_ps->ps_un = ps->ps_un; 521*0Sstevel@tonic-gate err_ps->ps_ui = ps->ps_ui; 522*0Sstevel@tonic-gate 523*0Sstevel@tonic-gate md_unit_readerexit(ps->ps_ui); 524*0Sstevel@tonic-gate 525*0Sstevel@tonic-gate daemon_request(&md_sp_daemon, sp_xmit_error, 526*0Sstevel@tonic-gate (daemon_queue_t *)err_ps, REQ_OLD); 527*0Sstevel@tonic-gate 528*0Sstevel@tonic-gate sp_finish_error(ps, 0); 529*0Sstevel@tonic-gate 530*0Sstevel@tonic-gate return; 531*0Sstevel@tonic-gate } else { 532*0Sstevel@tonic-gate ps->ps_un->un_status = MD_SP_ERR; 533*0Sstevel@tonic-gate } 534*0Sstevel@tonic-gate 535*0Sstevel@tonic-gate /* Flag the error */ 536*0Sstevel@tonic-gate sp_finish_error(ps, 1); 537*0Sstevel@tonic-gate 538*0Sstevel@tonic-gate } 539*0Sstevel@tonic-gate 540*0Sstevel@tonic-gate /* 541*0Sstevel@tonic-gate * FUNCTION: sp_mapbuf() 542*0Sstevel@tonic-gate * INPUT: un - unit structure for soft partition we are doing 543*0Sstevel@tonic-gate * I/O on. 544*0Sstevel@tonic-gate * voff - virtual offset in soft partition to map. 545*0Sstevel@tonic-gate * bcount - # of blocks in the I/O. 
546*0Sstevel@tonic-gate * OUTPUT: bp - translated buffer to be passed down to next layer. 547*0Sstevel@tonic-gate * RETURNS: 1 - request must be fragmented, more work to do, 548*0Sstevel@tonic-gate * 0 - request satisified, no more work to do 549*0Sstevel@tonic-gate * -1 - error 550*0Sstevel@tonic-gate * PURPOSE: Map the the virtual offset in the soft partition (passed 551*0Sstevel@tonic-gate * in via voff) to the "physical" offset on whatever the soft 552*0Sstevel@tonic-gate * partition is built on top of. We do this by doing a binary 553*0Sstevel@tonic-gate * search of the extent array in the soft partition unit 554*0Sstevel@tonic-gate * structure. Once the current extent is found, we do the 555*0Sstevel@tonic-gate * translation, determine if the I/O will cross extent 556*0Sstevel@tonic-gate * boundaries (if so, we have to fragment the I/O), then 557*0Sstevel@tonic-gate * fill in the buf structure to be passed down to the next layer. 558*0Sstevel@tonic-gate */ 559*0Sstevel@tonic-gate static int 560*0Sstevel@tonic-gate sp_mapbuf( 561*0Sstevel@tonic-gate mp_unit_t *un, 562*0Sstevel@tonic-gate sp_ext_offset_t voff, 563*0Sstevel@tonic-gate sp_ext_length_t bcount, 564*0Sstevel@tonic-gate buf_t *bp 565*0Sstevel@tonic-gate ) 566*0Sstevel@tonic-gate { 567*0Sstevel@tonic-gate int lo, mid, hi, found, more; 568*0Sstevel@tonic-gate size_t new_bcount; 569*0Sstevel@tonic-gate sp_ext_offset_t new_blkno; 570*0Sstevel@tonic-gate sp_ext_offset_t new_offset; 571*0Sstevel@tonic-gate sp_ext_offset_t ext_endblk; 572*0Sstevel@tonic-gate md_dev64_t new_edev; 573*0Sstevel@tonic-gate extern unsigned md_maxphys; 574*0Sstevel@tonic-gate 575*0Sstevel@tonic-gate found = 0; 576*0Sstevel@tonic-gate lo = 0; 577*0Sstevel@tonic-gate hi = un->un_numexts - 1; 578*0Sstevel@tonic-gate 579*0Sstevel@tonic-gate /* 580*0Sstevel@tonic-gate * do a binary search to find the extent that contains the 581*0Sstevel@tonic-gate * starting offset. 
after this loop, mid contains the index 582*0Sstevel@tonic-gate * of the correct extent. 583*0Sstevel@tonic-gate */ 584*0Sstevel@tonic-gate while (lo <= hi && !found) { 585*0Sstevel@tonic-gate mid = (lo + hi) / 2; 586*0Sstevel@tonic-gate /* is the starting offset contained within the mid-ext? */ 587*0Sstevel@tonic-gate if (voff >= un->un_ext[mid].un_voff && 588*0Sstevel@tonic-gate voff < un->un_ext[mid].un_voff + un->un_ext[mid].un_len) 589*0Sstevel@tonic-gate found = 1; 590*0Sstevel@tonic-gate else if (voff < un->un_ext[mid].un_voff) 591*0Sstevel@tonic-gate hi = mid - 1; 592*0Sstevel@tonic-gate else /* voff > un->un_ext[mid].un_voff + un->un_ext[mid].len */ 593*0Sstevel@tonic-gate lo = mid + 1; 594*0Sstevel@tonic-gate } 595*0Sstevel@tonic-gate 596*0Sstevel@tonic-gate if (!found) { 597*0Sstevel@tonic-gate cmn_err(CE_WARN, "sp_mapbuf: invalid offset %llu.\n", voff); 598*0Sstevel@tonic-gate return (-1); 599*0Sstevel@tonic-gate } 600*0Sstevel@tonic-gate 601*0Sstevel@tonic-gate /* translate to underlying physical offset/device */ 602*0Sstevel@tonic-gate new_offset = voff - un->un_ext[mid].un_voff; 603*0Sstevel@tonic-gate new_blkno = un->un_ext[mid].un_poff + new_offset; 604*0Sstevel@tonic-gate new_edev = un->un_dev; 605*0Sstevel@tonic-gate 606*0Sstevel@tonic-gate /* determine if we need to break the I/O into fragments */ 607*0Sstevel@tonic-gate ext_endblk = un->un_ext[mid].un_voff + un->un_ext[mid].un_len; 608*0Sstevel@tonic-gate if (voff + btodb(bcount) > ext_endblk) { 609*0Sstevel@tonic-gate new_bcount = dbtob(ext_endblk - voff); 610*0Sstevel@tonic-gate more = 1; 611*0Sstevel@tonic-gate } else { 612*0Sstevel@tonic-gate new_bcount = bcount; 613*0Sstevel@tonic-gate more = 0; 614*0Sstevel@tonic-gate } 615*0Sstevel@tonic-gate 616*0Sstevel@tonic-gate /* only break up the I/O if we're not built on another metadevice */ 617*0Sstevel@tonic-gate if ((md_getmajor(new_edev) != md_major) && (new_bcount > md_maxphys)) { 618*0Sstevel@tonic-gate new_bcount = md_maxphys; 
619*0Sstevel@tonic-gate more = 1; 620*0Sstevel@tonic-gate } 621*0Sstevel@tonic-gate if (bp != (buf_t *)NULL) { 622*0Sstevel@tonic-gate /* do bp updates */ 623*0Sstevel@tonic-gate bp->b_bcount = new_bcount; 624*0Sstevel@tonic-gate bp->b_lblkno = new_blkno; 625*0Sstevel@tonic-gate bp->b_edev = md_dev64_to_dev(new_edev); 626*0Sstevel@tonic-gate } 627*0Sstevel@tonic-gate return (more); 628*0Sstevel@tonic-gate } 629*0Sstevel@tonic-gate 630*0Sstevel@tonic-gate /* 631*0Sstevel@tonic-gate * FUNCTION: sp_validate() 632*0Sstevel@tonic-gate * INPUT: un - unit structure to be validated. 633*0Sstevel@tonic-gate * OUTPUT: none. 634*0Sstevel@tonic-gate * RETURNS: 0 - soft partition ok. 635*0Sstevel@tonic-gate * -1 - error. 636*0Sstevel@tonic-gate * PURPOSE: called on open to sanity check the soft partition. In 637*0Sstevel@tonic-gate * order to open a soft partition: 638*0Sstevel@tonic-gate * - it must have at least one extent 639*0Sstevel@tonic-gate * - the extent info in core and on disk must match 640*0Sstevel@tonic-gate * - it may not be in an intermediate state (which would 641*0Sstevel@tonic-gate * imply that a two-phase commit was interrupted) 642*0Sstevel@tonic-gate * 643*0Sstevel@tonic-gate * If the extent checking fails (B_ERROR returned from the read 644*0Sstevel@tonic-gate * strategy call) _and_ we're a multi-owner diskset, we send a 645*0Sstevel@tonic-gate * message to the master so that all nodes inherit the same view 646*0Sstevel@tonic-gate * of the soft partition. 647*0Sstevel@tonic-gate * If we are checking a soft-part that is marked as in error, and 648*0Sstevel@tonic-gate * we can actually read and validate the watermarks we send a 649*0Sstevel@tonic-gate * message to clear the error to the master node. 
650*0Sstevel@tonic-gate */ 651*0Sstevel@tonic-gate static int 652*0Sstevel@tonic-gate sp_validate(mp_unit_t *un) 653*0Sstevel@tonic-gate { 654*0Sstevel@tonic-gate uint_t ext; 655*0Sstevel@tonic-gate struct buf *buf; 656*0Sstevel@tonic-gate sp_ext_length_t len; 657*0Sstevel@tonic-gate mp_watermark_t *wm; 658*0Sstevel@tonic-gate set_t setno; 659*0Sstevel@tonic-gate int reset_error = 0; 660*0Sstevel@tonic-gate 661*0Sstevel@tonic-gate setno = MD_UN2SET(un); 662*0Sstevel@tonic-gate 663*0Sstevel@tonic-gate /* sanity check unit structure components ?? */ 664*0Sstevel@tonic-gate if (un->un_status != MD_SP_OK) { 665*0Sstevel@tonic-gate if (un->un_status != MD_SP_ERR) { 666*0Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: open failed, soft partition " 667*0Sstevel@tonic-gate "status is %u.", 668*0Sstevel@tonic-gate md_shortname(MD_SID(un)), 669*0Sstevel@tonic-gate un->un_status); 670*0Sstevel@tonic-gate return (-1); 671*0Sstevel@tonic-gate } else { 672*0Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: open of soft partition " 673*0Sstevel@tonic-gate "in Errored state.", 674*0Sstevel@tonic-gate md_shortname(MD_SID(un))); 675*0Sstevel@tonic-gate reset_error = 1; 676*0Sstevel@tonic-gate } 677*0Sstevel@tonic-gate } 678*0Sstevel@tonic-gate 679*0Sstevel@tonic-gate if (un->un_numexts == 0) { 680*0Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: open failed, soft partition does " 681*0Sstevel@tonic-gate "not have any extents.", md_shortname(MD_SID(un))); 682*0Sstevel@tonic-gate return (-1); 683*0Sstevel@tonic-gate } 684*0Sstevel@tonic-gate 685*0Sstevel@tonic-gate len = 0LL; 686*0Sstevel@tonic-gate for (ext = 0; ext < un->un_numexts; ext++) { 687*0Sstevel@tonic-gate 688*0Sstevel@tonic-gate /* tally extent lengths to check total size */ 689*0Sstevel@tonic-gate len += un->un_ext[ext].un_len; 690*0Sstevel@tonic-gate 691*0Sstevel@tonic-gate /* allocate buffer for watermark */ 692*0Sstevel@tonic-gate buf = getrbuf(KM_SLEEP); 693*0Sstevel@tonic-gate 694*0Sstevel@tonic-gate /* read watermark */ 
695*0Sstevel@tonic-gate buf->b_flags = B_READ; 696*0Sstevel@tonic-gate buf->b_edev = md_dev64_to_dev(un->un_dev); 697*0Sstevel@tonic-gate buf->b_iodone = NULL; 698*0Sstevel@tonic-gate buf->b_proc = NULL; 699*0Sstevel@tonic-gate buf->b_bcount = sizeof (mp_watermark_t); 700*0Sstevel@tonic-gate buf->b_lblkno = un->un_ext[ext].un_poff - 1; 701*0Sstevel@tonic-gate buf->b_bufsize = sizeof (mp_watermark_t); 702*0Sstevel@tonic-gate buf->b_un.b_addr = kmem_alloc(sizeof (mp_watermark_t), 703*0Sstevel@tonic-gate KM_SLEEP); 704*0Sstevel@tonic-gate 705*0Sstevel@tonic-gate /* 706*0Sstevel@tonic-gate * make the call non-blocking so that it is not affected 707*0Sstevel@tonic-gate * by a set take. 708*0Sstevel@tonic-gate */ 709*0Sstevel@tonic-gate md_call_strategy(buf, MD_STR_MAPPED|MD_NOBLOCK, NULL); 710*0Sstevel@tonic-gate (void) biowait(buf); 711*0Sstevel@tonic-gate 712*0Sstevel@tonic-gate if (buf->b_flags & B_ERROR) { 713*0Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: open failed, could not " 714*0Sstevel@tonic-gate "read watermark at block %llu for extent %u, " 715*0Sstevel@tonic-gate "error %d.", md_shortname(MD_SID(un)), 716*0Sstevel@tonic-gate buf->b_lblkno, ext, buf->b_error); 717*0Sstevel@tonic-gate kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t)); 718*0Sstevel@tonic-gate freerbuf(buf); 719*0Sstevel@tonic-gate 720*0Sstevel@tonic-gate /* 721*0Sstevel@tonic-gate * If we're a multi-owner diskset we send a message 722*0Sstevel@tonic-gate * indicating that this soft-part has an invalid 723*0Sstevel@tonic-gate * extent to the master node. This ensures a consistent 724*0Sstevel@tonic-gate * view of the soft-part across the cluster. 
725*0Sstevel@tonic-gate */ 726*0Sstevel@tonic-gate if (MD_MNSET_SETNO(setno)) { 727*0Sstevel@tonic-gate sp_send_stat_err(un); 728*0Sstevel@tonic-gate } 729*0Sstevel@tonic-gate return (-1); 730*0Sstevel@tonic-gate } 731*0Sstevel@tonic-gate 732*0Sstevel@tonic-gate wm = (mp_watermark_t *)buf->b_un.b_addr; 733*0Sstevel@tonic-gate 734*0Sstevel@tonic-gate /* make sure the checksum is correct first */ 735*0Sstevel@tonic-gate if (crcchk((uchar_t *)wm, (uint_t *)&wm->wm_checksum, 736*0Sstevel@tonic-gate (uint_t)sizeof (mp_watermark_t), (uchar_t *)NULL)) { 737*0Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: open failed, watermark " 738*0Sstevel@tonic-gate "at block %llu for extent %u does not have a " 739*0Sstevel@tonic-gate "valid checksum 0x%08x.", md_shortname(MD_SID(un)), 740*0Sstevel@tonic-gate buf->b_lblkno, ext, wm->wm_checksum); 741*0Sstevel@tonic-gate kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t)); 742*0Sstevel@tonic-gate freerbuf(buf); 743*0Sstevel@tonic-gate return (-1); 744*0Sstevel@tonic-gate } 745*0Sstevel@tonic-gate 746*0Sstevel@tonic-gate if (wm->wm_magic != MD_SP_MAGIC) { 747*0Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: open failed, watermark " 748*0Sstevel@tonic-gate "at block %llu for extent %u does not have a " 749*0Sstevel@tonic-gate "valid watermark magic number, expected 0x%x, " 750*0Sstevel@tonic-gate "found 0x%x.", md_shortname(MD_SID(un)), 751*0Sstevel@tonic-gate buf->b_lblkno, ext, MD_SP_MAGIC, wm->wm_magic); 752*0Sstevel@tonic-gate kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t)); 753*0Sstevel@tonic-gate freerbuf(buf); 754*0Sstevel@tonic-gate return (-1); 755*0Sstevel@tonic-gate } 756*0Sstevel@tonic-gate 757*0Sstevel@tonic-gate /* make sure sequence number matches the current extent */ 758*0Sstevel@tonic-gate if (wm->wm_seq != ext) { 759*0Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: open failed, watermark " 760*0Sstevel@tonic-gate "at block %llu for extent %u has invalid " 761*0Sstevel@tonic-gate "sequence number %u.", 
md_shortname(MD_SID(un)), 762*0Sstevel@tonic-gate buf->b_lblkno, ext, wm->wm_seq); 763*0Sstevel@tonic-gate kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t)); 764*0Sstevel@tonic-gate freerbuf(buf); 765*0Sstevel@tonic-gate return (-1); 766*0Sstevel@tonic-gate } 767*0Sstevel@tonic-gate 768*0Sstevel@tonic-gate /* make sure watermark length matches unit structure */ 769*0Sstevel@tonic-gate if (wm->wm_length != un->un_ext[ext].un_len) { 770*0Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: open failed, watermark " 771*0Sstevel@tonic-gate "at block %llu for extent %u has inconsistent " 772*0Sstevel@tonic-gate "length, expected %llu, found %llu.", 773*0Sstevel@tonic-gate md_shortname(MD_SID(un)), buf->b_lblkno, 774*0Sstevel@tonic-gate ext, un->un_ext[ext].un_len, 775*0Sstevel@tonic-gate (u_longlong_t)wm->wm_length); 776*0Sstevel@tonic-gate kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t)); 777*0Sstevel@tonic-gate freerbuf(buf); 778*0Sstevel@tonic-gate return (-1); 779*0Sstevel@tonic-gate } 780*0Sstevel@tonic-gate 781*0Sstevel@tonic-gate /* 782*0Sstevel@tonic-gate * make sure the type is a valid soft partition and not 783*0Sstevel@tonic-gate * a free extent or the end. 
784*0Sstevel@tonic-gate */ 785*0Sstevel@tonic-gate if (wm->wm_type != EXTTYP_ALLOC) { 786*0Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: open failed, watermark " 787*0Sstevel@tonic-gate "at block %llu for extent %u is not marked " 788*0Sstevel@tonic-gate "as in-use, type = %u.", md_shortname(MD_SID(un)), 789*0Sstevel@tonic-gate buf->b_lblkno, ext, wm->wm_type); 790*0Sstevel@tonic-gate kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t)); 791*0Sstevel@tonic-gate freerbuf(buf); 792*0Sstevel@tonic-gate return (-1); 793*0Sstevel@tonic-gate } 794*0Sstevel@tonic-gate /* free up buffer */ 795*0Sstevel@tonic-gate kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t)); 796*0Sstevel@tonic-gate freerbuf(buf); 797*0Sstevel@tonic-gate } 798*0Sstevel@tonic-gate 799*0Sstevel@tonic-gate if (len != un->un_length) { 800*0Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: open failed, computed length " 801*0Sstevel@tonic-gate "%llu != expected length %llu.", md_shortname(MD_SID(un)), 802*0Sstevel@tonic-gate len, un->un_length); 803*0Sstevel@tonic-gate return (-1); 804*0Sstevel@tonic-gate } 805*0Sstevel@tonic-gate 806*0Sstevel@tonic-gate /* 807*0Sstevel@tonic-gate * If we're a multi-owner set _and_ reset_error is set, we should clear 808*0Sstevel@tonic-gate * the error condition on all nodes in the set. Use SP_SETSTAT2 with 809*0Sstevel@tonic-gate * MD_SP_OK. 810*0Sstevel@tonic-gate */ 811*0Sstevel@tonic-gate if (MD_MNSET_SETNO(setno) && reset_error) { 812*0Sstevel@tonic-gate sp_send_stat_ok(un); 813*0Sstevel@tonic-gate } 814*0Sstevel@tonic-gate return (0); 815*0Sstevel@tonic-gate } 816*0Sstevel@tonic-gate 817*0Sstevel@tonic-gate /* 818*0Sstevel@tonic-gate * FUNCTION: sp_done() 819*0Sstevel@tonic-gate * INPUT: child_buf - buffer attached to child save structure. 820*0Sstevel@tonic-gate * this is the buffer on which I/O has just 821*0Sstevel@tonic-gate * completed. 822*0Sstevel@tonic-gate * OUTPUT: none. 823*0Sstevel@tonic-gate * RETURNS: 0 - success. 824*0Sstevel@tonic-gate * 1 - error. 
 * PURPOSE:	called on I/O completion.
 */
static int
sp_done(struct buf *child_buf)
{
	struct buf	*parent_buf;
	mdi_unit_t	*ui;
	md_spps_t	*ps;
	md_spcs_t	*cs;

	/*
	 * Find the child save structure to which this buffer belongs:
	 * the buf is embedded inside the md_spcs_t, so back up from the
	 * buf address by the size of the members that precede it.
	 */
	cs = (md_spcs_t *)((caddr_t)child_buf -
	    (sizeof (md_spcs_t) - sizeof (buf_t)));
	/* now get the parent save structure */
	ps = cs->cs_ps;
	parent_buf = ps->ps_bp;

	mutex_enter(&ps->ps_mx);
	/* pass any errors back up to the parent */
	if (child_buf->b_flags & B_ERROR) {
		ps->ps_flags |= MD_SPPS_ERROR;
		parent_buf->b_error = child_buf->b_error;
	}
	/* mapout, if needed */
	if (child_buf->b_flags & B_REMAPPED)
		bp_mapout(child_buf);

	ps->ps_frags--;
	if (ps->ps_frags != 0) {
		/*
		 * if this parent has more children, we just free the
		 * child and return.  Note: return value 1 here means
		 * "parent not yet complete", not an error.
		 */
		kmem_cache_free(sp_child_cache, cs);
		mutex_exit(&ps->ps_mx);
		return (1);
	}
	/* there are no more children */
	kmem_cache_free(sp_child_cache, cs);
	if (ps->ps_flags & MD_SPPS_ERROR) {
		/*
		 * NOTE(review): ps_mx is still held here; sp_error()
		 * presumably releases it and completes the parent --
		 * confirm against sp_error()'s definition (not visible
		 * in this chunk).
		 */
		sp_error(ps);
		return (1);
	}
	ui = ps->ps_ui;
	if (!(ps->ps_flags & MD_SPPS_DONTFREE)) {
		mutex_exit(&ps->ps_mx);
	} else {
		/*
		 * this should only ever happen if we are panicking,
		 * since DONTFREE is only set on the parent if panicstr
		 * is non-NULL.
		 */
		ASSERT(panicstr);
	}
	/* SPPS_FREE is expected to honor the DONTFREE flag on ps */
	SPPS_FREE(sp_parent_cache, ps);
	md_kstat_done(ui, parent_buf, 0);
	md_unit_readerexit(ui);
	md_biodone(parent_buf);
	return (0);
}

/*
 * FUNCTION:	md_sp_strategy()
 * INPUT:	parent_buf - parent buffer
 *		flag - flags
 *		private - private data
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	Soft partitioning I/O strategy. Performs the main work
 *		needed to do I/O to a soft partition. The basic
 *		algorithm is as follows:
 *		- Allocate a child save structure to keep track
 *		  of the I/O we are going to pass down.
 *		- Map the I/O to the correct extent in the soft
 *		  partition (see sp_mapbuf()).
 *		- bioclone() the buffer and pass it down the
 *		  stack using md_call_strategy.
 *		- If the I/O needs to split across extents,
 *		  repeat the above steps until all fragments
 *		  are finished.
 */
static void
md_sp_strategy(buf_t *parent_buf, int flag, void *private)
{
	md_spps_t	*ps;
	md_spcs_t	*cs;
	int		more;
	mp_unit_t	*un;
	mdi_unit_t	*ui;
	size_t		current_count;
	off_t		current_offset;
	sp_ext_offset_t	current_blkno;
	buf_t		*child_buf;
	set_t		setno = MD_MIN2SET(getminor(parent_buf->b_edev));
	int		strat_flag = flag;

	/*
	 * When doing IO to a multi owner meta device, check if set is halted.
	 * We do this check without the needed lock held, for performance
	 * reasons.
	 * If an IO just slips through while the set is locked via an
	 * MD_MN_SUSPEND_SET, we don't care about it.
	 * Only check for suspension if we are a top-level i/o request
	 * (MD_STR_NOTTOP is cleared in 'flag');
	 */
	if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) ==
	    (MD_SET_HALTED | MD_SET_MNSET)) {
		if ((flag & MD_STR_NOTTOP) == 0) {
			mutex_enter(&md_mx);
			/* Here we loop until the set is no longer halted */
			while (md_set[setno].s_status & MD_SET_HALTED) {
				cv_wait(&md_cv, &md_mx);
			}
			mutex_exit(&md_mx);
		}
	}

	ui = MDI_UNIT(getminor(parent_buf->b_edev));

	md_kstat_waitq_enter(ui);

	un = (mp_unit_t *)md_unit_readerlock(ui);

	/*
	 * Account for this i/o against the set unless the caller asked
	 * for non-blocking behavior (MD_NOBLOCK); a non-zero return
	 * means the set is going away, so fail the parent with ENXIO.
	 */
	if ((flag & MD_NOBLOCK) == 0) {
		if (md_inc_iocount(setno) != 0) {
			parent_buf->b_flags |= B_ERROR;
			parent_buf->b_error = ENXIO;
			parent_buf->b_resid = parent_buf->b_bcount;
			md_unit_readerexit(ui);
			biodone(parent_buf);
			return;
		}
	} else {
		md_inc_iocount_noblock(setno);
	}

	if (!(flag & MD_STR_NOTTOP)) {
		/*
		 * NOTE(review): on failure md_checkbuf() presumably
		 * completes the buf and drops the reader lock itself,
		 * since neither is done here -- confirm in md_checkbuf().
		 */
		if (md_checkbuf(ui, (md_unit_t *)un, parent_buf) != 0) {
			md_kstat_waitq_exit(ui);
			return;
		}
	}

	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
	sp_parent_init(ps);

	/*
	 * Save essential information from the original buffhdr
	 * in the parent.
	 */
	ps->ps_un = un;
	ps->ps_ui = ui;
	ps->ps_bp = parent_buf;
	ps->ps_addr = parent_buf->b_un.b_addr;

	current_count = parent_buf->b_bcount;
	current_blkno = (sp_ext_offset_t)parent_buf->b_blkno;
	current_offset = 0;

	/*
	 * if we are at the top and we are panicking,
	 * we don't free in order to save state.
	 */
	if (!(flag & MD_STR_NOTTOP) && (panicstr != NULL))
		ps->ps_flags |= MD_SPPS_DONTFREE;

	md_kstat_waitq_to_runq(ui);

	ps->ps_frags++;

	/*
	 * Mark this i/o as MD_STR_ABR if we've had ABR enabled on this
	 * metadevice.
	 */
	if (ui->ui_tstate & MD_ABR_CAP)
		strat_flag |= MD_STR_ABR;

	/*
	 * this loop does the main work of an I/O. we allocate a
	 * a child save for each buf, do the logical to physical
	 * mapping, decide if we need to frag the I/O, clone the
	 * new I/O to pass down the stack. repeat until we've
	 * taken care of the entire buf that was passed to us.
	 */
	do {
		cs = kmem_cache_alloc(sp_child_cache, MD_ALLOCFLAGS);
		sp_child_init(cs);
		child_buf = &cs->cs_buf;
		cs->cs_ps = ps;

		more = sp_mapbuf(un, current_blkno, current_count, child_buf);
		if (more == -1) {
			/* offset did not map to any extent: fail parent */
			parent_buf->b_flags |= B_ERROR;
			parent_buf->b_error = EIO;
			md_kstat_done(ui, parent_buf, 0);
			md_unit_readerexit(ui);
			md_biodone(parent_buf);
			kmem_cache_free(sp_parent_cache, ps);
			return;
		}

		child_buf = md_bioclone(parent_buf, current_offset,
		    child_buf->b_bcount, child_buf->b_edev,
		    child_buf->b_blkno, sp_done, child_buf,
		    KM_NOSLEEP);
		/* calculate new offset, counts, etc... */
		current_offset += child_buf->b_bcount;
		current_count -=  child_buf->b_bcount;
		current_blkno +=  (sp_ext_offset_t)(btodb(child_buf->b_bcount));

		if (more) {
			/*
			 * another fragment follows; bump the outstanding
			 * fragment count under the parent's mutex since
			 * sp_done() may be decrementing it concurrently.
			 */
			mutex_enter(&ps->ps_mx);
			ps->ps_frags++;
			mutex_exit(&ps->ps_mx);
		}

		md_call_strategy(child_buf, strat_flag, private);
	} while (more);

	if (!(flag & MD_STR_NOTTOP) && (panicstr != NULL)) {
		/*
		 * Panicking: no completion threads run, so drive the
		 * done daemon by hand until all children have finished,
		 * then free the parent explicitly (DONTFREE suppressed
		 * the normal free in sp_done()).
		 */
		while (!(ps->ps_flags & MD_SPPS_DONE)) {
			md_daemon(1, &md_done_daemon);
		}
		kmem_cache_free(sp_parent_cache, ps);
	}
}

/*
 * FUNCTION:	sp_directed_read()
 * INPUT:	mnum - minor number
 *		vdr - vol_directed_rd_t from user
 *		mode - access mode for copying data out.
 * OUTPUT:	none.
 * RETURNS:	0 - success
 *		Exxxxx - failure error-code
 * PURPOSE:	Construct the necessary sub-device i/o requests to perform the
 *		directed read as requested by the user. This is essentially the
 *		same as md_sp_strategy() with the exception being that the
 *		underlying 'md_call_strategy' is replaced with an ioctl call.
 */
int
sp_directed_read(minor_t mnum, vol_directed_rd_t *vdr, int mode)
{
	md_spps_t	*ps;
	md_spcs_t	*cs;
	int		more;
	mp_unit_t	*un;
	mdi_unit_t	*ui;
	size_t		current_count;
	off_t		current_offset;
	sp_ext_offset_t	current_blkno;
	buf_t		*child_buf, *parent_buf;
	void		*kbuffer;
	vol_directed_rd_t	cvdr;
	caddr_t		userbuf;
	offset_t	useroff;
	int		ret = 0;

	ui = MDI_UNIT(mnum);

	/*
	 * NOTE(review): md_kstat_waitq_enter() has no matching
	 * waitq_exit/done call anywhere in this function -- verify
	 * whether the kstat queue accounting is intentionally left
	 * unbalanced here or handled elsewhere.
	 */
	md_kstat_waitq_enter(ui);

	bzero(&cvdr, sizeof (cvdr));

	un = (mp_unit_t *)md_unit_readerlock(ui);

	/*
	 * Construct a parent_buf header which reflects the user-supplied
	 * request.
	 */

	kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP);
	if (kbuffer == NULL) {
		vdr->vdr_flags |= DKV_DMR_ERROR;
		md_unit_readerexit(ui);
		return (ENOMEM);
	}

	parent_buf = getrbuf(KM_NOSLEEP);
	if (parent_buf == NULL) {
		vdr->vdr_flags |= DKV_DMR_ERROR;
		md_unit_readerexit(ui);
		kmem_free(kbuffer, vdr->vdr_nbytes);
		return (ENOMEM);
	}
	parent_buf->b_un.b_addr = kbuffer;
	parent_buf->b_flags = B_READ;
	parent_buf->b_bcount = vdr->vdr_nbytes;
	parent_buf->b_lblkno = lbtodb(vdr->vdr_offset);
	parent_buf->b_edev = un->un_dev;


	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
	sp_parent_init(ps);

	/*
	 * Save essential information from the original buffhdr
	 * in the parent.
	 */
	ps->ps_un = un;
	ps->ps_ui = ui;
	ps->ps_bp = parent_buf;
	ps->ps_addr = parent_buf->b_un.b_addr;

	current_count = parent_buf->b_bcount;
	current_blkno = (sp_ext_offset_t)parent_buf->b_lblkno;
	current_offset  = 0;

	ps->ps_frags++;
	vdr->vdr_bytesread = 0;

	/*
	 * this loop does the main work of an I/O. we allocate a
	 * a child save for each buf, do the logical to physical
	 * mapping, decide if we need to frag the I/O, clone the
	 * new I/O to pass down the stack. repeat until we've
	 * taken care of the entire buf that was passed to us.
	 */
	do {
		cs = kmem_cache_alloc(sp_child_cache, MD_ALLOCFLAGS);
		sp_child_init(cs);
		child_buf = &cs->cs_buf;
		cs->cs_ps = ps;

		more = sp_mapbuf(un, current_blkno, current_count, child_buf);
		if (more == -1) {
			ret = EIO;
			vdr->vdr_flags |= DKV_DMR_SHORT;
			kmem_cache_free(sp_child_cache, cs);
			goto err_out;
		}

		/*
		 * Build a per-fragment directed-read request describing
		 * the slice of the kernel buffer this child covers.
		 */
		cvdr.vdr_flags = vdr->vdr_flags;
		cvdr.vdr_side = vdr->vdr_side;
		cvdr.vdr_nbytes = child_buf->b_bcount;
		cvdr.vdr_offset = ldbtob(child_buf->b_lblkno);
		/* Work out where we are in the allocated buffer */
		useroff = (offset_t)kbuffer;
		useroff = useroff + (offset_t)current_offset;
		cvdr.vdr_data = (void *)useroff;
		child_buf = md_bioclone(parent_buf, current_offset,
		    child_buf->b_bcount, child_buf->b_edev,
		    child_buf->b_blkno, NULL,
		    child_buf, KM_NOSLEEP);
		/* calculate new offset, counts, etc... */
		current_offset += child_buf->b_bcount;
		current_count -=  child_buf->b_bcount;
		current_blkno +=  (sp_ext_offset_t)(btodb(child_buf->b_bcount));

		if (more) {
			mutex_enter(&ps->ps_mx);
			ps->ps_frags++;
			mutex_exit(&ps->ps_mx);
		}

		/*
		 * Issue the fragment to the underlying device as a
		 * directed-read ioctl instead of a strategy call.
		 */
		ret = md_call_ioctl(child_buf->b_edev, DKIOCDMR, &cvdr,
		    (mode | FKIOCTL), NULL);

		/*
		 * Free the child structure as we've finished with it.
		 * Normally this would be done by sp_done() but we're just
		 * using md_bioclone() to segment the transfer and we never
		 * issue a strategy request so the iodone will not be called.
		 */
		kmem_cache_free(sp_child_cache, cs);
		if (ret == 0) {
			/* copyout the returned data to vdr_data + offset */
			userbuf = (caddr_t)kbuffer;
			userbuf += (caddr_t)(cvdr.vdr_data) - (caddr_t)kbuffer;
			if (ddi_copyout(userbuf, vdr->vdr_data,
			    cvdr.vdr_bytesread, mode)) {
				ret = EFAULT;
				goto err_out;
			}
			vdr->vdr_bytesread += cvdr.vdr_bytesread;
		} else {
			goto err_out;
		}
	} while (more);

	/*
	 * Update the user-supplied vol_directed_rd_t structure with the
	 * contents of the last issued child request.
	 */
	vdr->vdr_flags = cvdr.vdr_flags;
	vdr->vdr_side = cvdr.vdr_side;
	bcopy(cvdr.vdr_side_name, vdr->vdr_side_name, VOL_SIDENAME);

err_out:
	/* common exit: flag errors/short reads, release all resources */
	if (ret != 0) {
		vdr->vdr_flags |= DKV_DMR_ERROR;
	}
	if (vdr->vdr_bytesread != vdr->vdr_nbytes) {
		vdr->vdr_flags |= DKV_DMR_SHORT;
	}
	kmem_cache_free(sp_parent_cache, ps);
	kmem_free(kbuffer, vdr->vdr_nbytes);
	freerbuf(parent_buf);
	md_unit_readerexit(ui);
	return (ret);
}

/*
 * FUNCTION:	sp_snarf()
 * INPUT:	cmd - snarf cmd.
 *		setno - set number.
 * OUTPUT:	none.
 * RETURNS:	1 - soft partitions were snarfed.
 *		0 - no soft partitions were snarfed.
 * PURPOSE:	Snarf soft partition metadb records into their in-core
 *		structures.  This routine is called at "snarf time" when
 *		md loads and gets all metadevices records into memory.
 *		The basic algorithm is simply to walk the soft partition
 *		records in the metadb and call the soft partitioning
 *		build_incore routine to set up the in-core structures.
 */
static int
sp_snarf(md_snarfcmd_t cmd, set_t setno)
{
	mp_unit_t	*un;
	mddb_recid_t	recid;
	int		gotsomething;
	int		all_sp_gotten;
	mddb_type_t	rec_type;
	mddb_de_ic_t	*dep;
	mddb_rb32_t	*rbp;
	mp_unit_t	*big_un;
	mp_unit32_od_t	*small_un;
	size_t		newreqsize;


	if (cmd == MD_SNARF_CLEANUP)
		return (0);

	all_sp_gotten = 1;
	gotsomething = 0;

	/* get the record type */
	rec_type = (mddb_type_t)md_getshared_key(setno,
	    sp_md_ops.md_driver.md_drivername);
	recid = mddb_makerecid(setno, 0);

	/*
	 * walk soft partition records in the metadb and call
	 * sp_build_incore to build in-core structures.
	 */
	while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0) {
		/* if we've already gotten this record, go to the next one */
		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
			continue;


		dep = mddb_getrecdep(recid);
		dep->de_flags = MDDB_F_SOFTPART;
		rbp = dep->de_rb;

		if ((rbp->rb_revision == MDDB_REV_RB) &&
		    ((rbp->rb_private & MD_PRV_CONVD) == 0)) {
			/*
			 * This means, we have an old and small record.
			 * And this record hasn't already been converted :-o
			 * before we create an incore metadevice from this
			 * we have to convert it to a big record.
			 *
			 * The new size is the 64-bit unit header plus one
			 * mp_ext per extent beyond the first (the first is
			 * part of mp_unit_t itself).
			 */
			small_un = (mp_unit32_od_t *)mddb_getrecaddr(recid);
			newreqsize = sizeof (mp_unit_t) +
			    ((small_un->un_numexts - 1) *
			    sizeof (struct mp_ext));
			big_un = (mp_unit_t *)kmem_zalloc(newreqsize, KM_SLEEP);
			softpart_convert((caddr_t)small_un, (caddr_t)big_un,
			    SMALL_2_BIG);
			/*
			 * Ownership transfer: the old small record is
			 * freed and the record's userdata now points at
			 * the converted big record.
			 */
			kmem_free(small_un, dep->de_reqsize);
			dep->de_rb_userdata = big_un;
			dep->de_reqsize = newreqsize;
			rbp->rb_private |= MD_PRV_CONVD;
			un = big_un;
		} else {
			/* Large device */
			un = (mp_unit_t *)mddb_getrecaddr(recid);
		}

		/* Set revision and flag accordingly */
		if (rbp->rb_revision == MDDB_REV_RB) {
			un->c.un_revision = MD_32BIT_META_DEV;
		} else {
			un->c.un_revision = MD_64BIT_META_DEV;
			un->c.un_flag |= MD_EFILABEL;
		}

		/*
		 * Create minor node for snarfed entry.
		 */
		(void) md_create_minor_node(MD_MIN2SET(MD_SID(un)), MD_SID(un));

		if (MD_UNIT(MD_SID(un)) != NULL) {
			/* unit is already in-core */
			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
			continue;
		}
		all_sp_gotten = 0;
		if (sp_build_incore((void *)un, 1) == 0) {
			mddb_setrecprivate(recid, MD_PRV_GOTIT);
			md_create_unit_incore(MD_SID(un), &sp_md_ops, 0);
			gotsomething = 1;
		}
	}

	if (!all_sp_gotten)
		return (gotsomething);
	/* double-check records */
	recid = mddb_makerecid(setno, 0);
	while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0)
		if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
			mddb_setrecprivate(recid, MD_PRV_PENDDEL);

	return (0);
}

/*
 * FUNCTION:	sp_halt()
 * INPUT:	cmd - halt cmd.
 *		setno - set number.
 * RETURNS:	0 - success.
 *		1 - err.
1349*0Sstevel@tonic-gate * PURPOSE: Perform driver halt operations. As with stripe, we 1350*0Sstevel@tonic-gate * support MD_HALT_CHECK and MD_HALT_DOIT. The first 1351*0Sstevel@tonic-gate * does a check to see if halting can be done safely 1352*0Sstevel@tonic-gate * (no open soft partitions), the second cleans up and 1353*0Sstevel@tonic-gate * shuts down the driver. 1354*0Sstevel@tonic-gate */ 1355*0Sstevel@tonic-gate static int 1356*0Sstevel@tonic-gate sp_halt(md_haltcmd_t cmd, set_t setno) 1357*0Sstevel@tonic-gate { 1358*0Sstevel@tonic-gate int i; 1359*0Sstevel@tonic-gate mdi_unit_t *ui; 1360*0Sstevel@tonic-gate minor_t mnum; 1361*0Sstevel@tonic-gate 1362*0Sstevel@tonic-gate if (cmd == MD_HALT_CLOSE) 1363*0Sstevel@tonic-gate return (0); 1364*0Sstevel@tonic-gate 1365*0Sstevel@tonic-gate if (cmd == MD_HALT_OPEN) 1366*0Sstevel@tonic-gate return (0); 1367*0Sstevel@tonic-gate 1368*0Sstevel@tonic-gate if (cmd == MD_HALT_UNLOAD) 1369*0Sstevel@tonic-gate return (0); 1370*0Sstevel@tonic-gate 1371*0Sstevel@tonic-gate if (cmd == MD_HALT_CHECK) { 1372*0Sstevel@tonic-gate for (i = 0; i < md_nunits; i++) { 1373*0Sstevel@tonic-gate mnum = MD_MKMIN(setno, i); 1374*0Sstevel@tonic-gate if ((ui = MDI_UNIT(mnum)) == NULL) 1375*0Sstevel@tonic-gate continue; 1376*0Sstevel@tonic-gate if (ui->ui_opsindex != sp_md_ops.md_selfindex) 1377*0Sstevel@tonic-gate continue; 1378*0Sstevel@tonic-gate if (md_unit_isopen(ui)) 1379*0Sstevel@tonic-gate return (1); 1380*0Sstevel@tonic-gate } 1381*0Sstevel@tonic-gate return (0); 1382*0Sstevel@tonic-gate } 1383*0Sstevel@tonic-gate 1384*0Sstevel@tonic-gate if (cmd != MD_HALT_DOIT) 1385*0Sstevel@tonic-gate return (1); 1386*0Sstevel@tonic-gate 1387*0Sstevel@tonic-gate for (i = 0; i < md_nunits; i++) { 1388*0Sstevel@tonic-gate mnum = MD_MKMIN(setno, i); 1389*0Sstevel@tonic-gate if ((ui = MDI_UNIT(mnum)) == NULL) 1390*0Sstevel@tonic-gate continue; 1391*0Sstevel@tonic-gate if (ui->ui_opsindex != sp_md_ops.md_selfindex) 1392*0Sstevel@tonic-gate continue; 
1393*0Sstevel@tonic-gate reset_sp((mp_unit_t *)MD_UNIT(mnum), mnum, 0); 1394*0Sstevel@tonic-gate } 1395*0Sstevel@tonic-gate 1396*0Sstevel@tonic-gate return (0); 1397*0Sstevel@tonic-gate } 1398*0Sstevel@tonic-gate 1399*0Sstevel@tonic-gate /* 1400*0Sstevel@tonic-gate * FUNCTION: sp_open_dev() 1401*0Sstevel@tonic-gate * INPUT: un - unit structure. 1402*0Sstevel@tonic-gate * oflags - open flags. 1403*0Sstevel@tonic-gate * OUTPUT: none. 1404*0Sstevel@tonic-gate * RETURNS: 0 - success. 1405*0Sstevel@tonic-gate * non-zero - err. 1406*0Sstevel@tonic-gate * PURPOSE: open underlying device via md_layered_open. 1407*0Sstevel@tonic-gate */ 1408*0Sstevel@tonic-gate static int 1409*0Sstevel@tonic-gate sp_open_dev(mp_unit_t *un, int oflags) 1410*0Sstevel@tonic-gate { 1411*0Sstevel@tonic-gate minor_t mnum = MD_SID(un); 1412*0Sstevel@tonic-gate int err; 1413*0Sstevel@tonic-gate md_dev64_t tmpdev; 1414*0Sstevel@tonic-gate set_t setno = MD_MIN2SET(MD_SID(un)); 1415*0Sstevel@tonic-gate side_t side = mddb_getsidenum(setno); 1416*0Sstevel@tonic-gate 1417*0Sstevel@tonic-gate tmpdev = un->un_dev; 1418*0Sstevel@tonic-gate /* 1419*0Sstevel@tonic-gate * Do the open by device id if underlying is regular 1420*0Sstevel@tonic-gate */ 1421*0Sstevel@tonic-gate if ((md_getmajor(tmpdev) != md_major) && 1422*0Sstevel@tonic-gate md_devid_found(setno, side, un->un_key) == 1) { 1423*0Sstevel@tonic-gate tmpdev = md_resolve_bydevid(mnum, tmpdev, un->un_key); 1424*0Sstevel@tonic-gate } 1425*0Sstevel@tonic-gate err = md_layered_open(mnum, &tmpdev, oflags); 1426*0Sstevel@tonic-gate un->un_dev = tmpdev; 1427*0Sstevel@tonic-gate 1428*0Sstevel@tonic-gate if (err) 1429*0Sstevel@tonic-gate return (ENXIO); 1430*0Sstevel@tonic-gate 1431*0Sstevel@tonic-gate return (0); 1432*0Sstevel@tonic-gate } 1433*0Sstevel@tonic-gate 1434*0Sstevel@tonic-gate /* 1435*0Sstevel@tonic-gate * FUNCTION: sp_open() 1436*0Sstevel@tonic-gate * INPUT: dev - device to open. 1437*0Sstevel@tonic-gate * flag - pass-through flag. 
 *		otyp	- pass-through open type.
 *		cred_p	- credentials.
 *		md_oflags - open flags.
 * OUTPUT:	none.
 * RETURNS:	0 - success.
 *		non-zero - err.
 * PURPOSE:	open a soft partition.
 */
/* ARGSUSED */
static int
sp_open(
	dev_t		*dev,
	int		flag,
	int		otyp,
	cred_t		*cred_p,
	int		md_oflags
)
{
	minor_t		mnum = getminor(*dev);
	mdi_unit_t	*ui = MDI_UNIT(mnum);
	mp_unit_t	*un;
	int		err = 0;
	set_t		setno;

	/* grab necessary locks */
	un = (mp_unit_t *)md_unit_openclose_enter(ui);
	setno = MD_UN2SET(un);

	/* open underlying device, if necessary (first open or probe) */
	if (! md_unit_isopen(ui) || (md_oflags & MD_OFLG_PROBEDEV)) {
		if ((err = sp_open_dev(un, md_oflags)) != 0)
			goto out;

		if (MD_MNSET_SETNO(setno)) {
			/* For probe, don't incur the overhead of validate */
			if (!(md_oflags & MD_OFLG_PROBEDEV)) {
				/*
				 * Don't call sp_validate while
				 * unit_openclose lock is held.  So, actually
				 * open the device, drop openclose lock,
				 * call sp_validate, reacquire openclose lock,
				 * and close the device.  If sp_validate
				 * succeeds, then device will be re-opened.
				 */
				if ((err = md_unit_incopen(mnum, flag,
				    otyp)) != 0)
					goto out;

				/*
				 * Flag the open-in-progress so other threads
				 * can wait for it while the lock is dropped.
				 */
				mutex_enter(&ui->ui_mx);
				ui->ui_lock |= MD_UL_OPENINPROGRESS;
				mutex_exit(&ui->ui_mx);
				md_unit_openclose_exit(ui);
				/* non-layered opens hold md_unit_array_rw */
				if (otyp != OTYP_LYR)
					rw_exit(&md_unit_array_rw.lock);

				err = sp_validate(un);

				if (otyp != OTYP_LYR)
					rw_enter(&md_unit_array_rw.lock,
					    RW_READER);
				(void) md_unit_openclose_enter(ui);
				/* undo the temporary open count from above */
				(void) md_unit_decopen(mnum, otyp);
				mutex_enter(&ui->ui_mx);
				ui->ui_lock &= ~MD_UL_OPENINPROGRESS;
				cv_broadcast(&ui->ui_cv);
				mutex_exit(&ui->ui_mx);
				/*
				 * Should be in the same state as before
				 * the sp_validate.
				 */
				if (err != 0) {
					/* close the device opened above */
					md_layered_close(un->un_dev, md_oflags);
					err = EIO;
					goto out;
				}
			}
			/*
			 * As we're a multi-owner metadevice we need to ensure
			 * that all nodes have the same idea of the status.
			 * sp_validate() will mark the device as errored (if
			 * it cannot read the watermark) or ok (if it was
			 * previously errored but the watermark is now valid).
			 * This code-path is only entered on the non-probe open
			 * so we will maintain the errored state during a probe
			 * call. This means the sys-admin must metarecover -m
			 * to reset the soft-partition error.
			 */
		} else {
			/* For probe, don't incur the overhead of validate */
			if (!(md_oflags & MD_OFLG_PROBEDEV) &&
			    (err = sp_validate(un)) != 0) {
				/* close the device opened above */
				md_layered_close(un->un_dev, md_oflags);
				err = EIO;
				goto out;
			} else {
				/*
				 * we succeeded in validating the on disk
				 * format versus the in core, so reset the
				 * status if it's in error
				 */
				if (un->un_status == MD_SP_ERR) {
					un->un_status = MD_SP_OK;
				}
			}
		}
	}

	/* count open */
	if ((err = md_unit_incopen(mnum, flag, otyp)) != 0)
		goto out;

out:
	md_unit_openclose_exit(ui);
	return (err);
}

/*
 * FUNCTION:	sp_close()
 * INPUT:	dev	- device to close.
 *		flag	- pass-through flag.
 *		otyp	- pass-through type.
 *		cred_p	- credentials.
 *		md_cflags - close flags.
 * OUTPUT:	none.
 * RETURNS:	0 - success.
 *		non-zero - err.
 * PURPOSE:	close a soft partition.
 */
/* ARGSUSED */
static int
sp_close(
	dev_t		dev,
	int		flag,
	int		otyp,
	cred_t		*cred_p,
	int		md_cflags
)
{
	minor_t		mnum = getminor(dev);
	mdi_unit_t	*ui = MDI_UNIT(mnum);
	mp_unit_t	*un;
	int		err = 0;

	/* grab necessary locks */
	un = (mp_unit_t *)md_unit_openclose_enter(ui);

	/* count closed */
	if ((err = md_unit_decopen(mnum, otyp)) != 0)
		goto out;

	/*
	 * close devices, if necessary (last close or probe close)
	 * NOTE(review): md_cflags is tested against MD_OFLG_PROBEDEV (an
	 * open flag) — presumably the probe bit is shared between open
	 * and close flags; confirm against mdvar.h.
	 */
	if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
		md_layered_close(un->un_dev, md_cflags);
	}

	/*
	 * If a MN set and transient capabilities (eg ABR/DMR) are set,
	 * clear these capabilities if this is the last close in
	 * the cluster
	 */
	if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
	    (ui->ui_tstate & MD_ABR_CAP)) {
		/* drop the openclose lock before the cluster-wide call */
		md_unit_openclose_exit(ui);
		mdmn_clear_all_capabilities(mnum);
		return (0);
	}
	/* unlock, return success */
out:
	md_unit_openclose_exit(ui);
	return (err);
}


/* used in sp_dump routine */
static struct buf	dumpbuf;

/*
 * FUNCTION:	sp_dump()
 * INPUT:	dev	- device to dump to.
 *		addr	- address to dump.
 *		blkno	- blkno on device.
 *		nblk	- number of blocks to dump.
 * OUTPUT:	none.
 * RETURNS:	result from bdev_dump.
 * PURPOSE:	This routine dumps memory to the disk.  It assumes that
 *		the memory has already been mapped into mainbus space.
 *		It is called at disk interrupt priority when the system
 *		is in trouble.
 *		NOTE: this function is defined using 32-bit arguments,
 *		but soft partitioning is internally 64-bit.  Arguments
 *		are casted where appropriate.
 */
static int
sp_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
{
	mp_unit_t	*un;
	buf_t		*bp;
	sp_ext_length_t	nb;
	daddr_t		mapblk;
	int		result;
	int		more;
	int		saveresult = 0;

	/*
	 * Don't need to grab the unit lock.
	 * Cause nothing else is supposed to be happening.
	 * Also dump is not supposed to sleep.
	 */
	un = (mp_unit_t *)MD_UNIT(getminor(dev));

	/* reject a dump that starts or extends past the end of the unit */
	if ((diskaddr_t)blkno >= un->c.un_total_blocks)
		return (EINVAL);

	if (((diskaddr_t)blkno + nblk) > un->c.un_total_blocks)
		return (EINVAL);

	bp = &dumpbuf;
	nb = (sp_ext_length_t)dbtob(nblk);
	/*
	 * sp_mapbuf maps one extent-contiguous piece at a time; loop until
	 * it reports there is no more to map (more == 0).
	 */
	do {
		bzero((caddr_t)bp, sizeof (*bp));
		more = sp_mapbuf(un, (sp_ext_offset_t)blkno, nb, bp);
		nblk = (int)(btodb(bp->b_bcount));
		mapblk = bp->b_blkno;
		result = bdev_dump(bp->b_edev, addr, mapblk, nblk);
		/* remember the last failure but keep dumping */
		if (result)
			saveresult = result;

		/* advance past the piece just written */
		nb -= bp->b_bcount;
		addr += bp->b_bcount;
		blkno += nblk;
	} while (more);

	return (saveresult);
}

/*
 * FUNCTION:	sp_imp_set()
 * INPUT:	setno	- set number of the imported set.
 * RETURNS:	1 - at least one record was updated.
 *		0 - no records updated (or md_update_minor failed).
 * PURPOSE:	Rewrite the self id, parent id and record id of every soft
 *		partition record in an imported set so they refer to the
 *		new (local) set number.
 */
static int
sp_imp_set(
	set_t	setno
)
{
	mddb_recid_t	recid;
	int		gotsomething;
	mddb_type_t	rec_type;
	mddb_de_ic_t	*dep;
	mddb_rb32_t	*rbp;
	mp_unit_t	*un64;
	mp_unit32_od_t	*un32;
	minor_t		*self_id;	/* minor needs to be updated */
	md_parent_t	*parent_id;	/* parent needs to be updated */
	mddb_recid_t	*record_id;	/* record id needs to be updated */

	gotsomething = 0;

	rec_type = (mddb_type_t)md_getshared_key(setno,
	    sp_md_ops.md_driver.md_drivername);
	recid = mddb_makerecid(setno, 0);

	/* walk all soft partition records belonging to this set */
	while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0) {
		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
			continue;

		dep = mddb_getrecdep(recid);
		rbp = dep->de_rb;

		if (rbp->rb_revision == MDDB_REV_RB) {
			/*
			 * Small device
			 */
			un32 = (mp_unit32_od_t *)mddb_getrecaddr(recid);
			self_id = &(un32->c.un_self_id);
			parent_id = &(un32->c.un_parent);
			record_id = &(un32->c.un_record_id);

			if (!md_update_minor(setno, mddb_getsidenum
			    (setno), un32->un_key))
				goto out;
		} else {
			/* Large device */
			un64 = (mp_unit_t *)mddb_getrecaddr(recid);
			self_id = &(un64->c.un_self_id);
			parent_id = &(un64->c.un_parent);
			record_id = &(un64->c.un_record_id);

			if (!md_update_minor(setno, mddb_getsidenum
			    (setno), un64->un_key))
				goto out;
		}

		/*
		 * Update unit with the imported setno
		 *
		 */
		mddb_setrecprivate(recid, MD_PRV_GOTIT);

		/* re-mint minors and record id in the new set's number space */
		*self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
		if (*parent_id != MD_NO_PARENT)
			*parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
		*record_id = MAKERECID(setno, DBID(*record_id));

		gotsomething = 1;
	}

out:
	return (gotsomething);
}

/* no named services exported by the soft partition driver */
static md_named_services_t sp_named_services[] = {
	{NULL, 0}
};

/*
 * Metadevice operations vector for the soft partition driver; see
 * mdvar.h (md_ops_t) for slot semantics.
 */
md_ops_t sp_md_ops = {
	sp_open,		/* open */
	sp_close,		/* close */
	md_sp_strategy,		/* strategy */
	NULL,			/* print */
	sp_dump,		/* dump */
	NULL,			/* read */
	NULL,			/* write */
	md_sp_ioctl,		/* ioctl, */
	sp_snarf,		/* snarf */
	sp_halt,		/* halt */
	NULL,			/* aread */
	NULL,			/* awrite */
	sp_imp_set,		/* import set */
	sp_named_services
};

/*
 * Module init: create the kmem caches used for per-I/O parent and
 * child save structures.
 */
static void
init_init()
{
	sp_parent_cache = kmem_cache_create("md_softpart_parent",
	    sizeof (md_spps_t), 0, sp_parent_constructor,
	    sp_parent_destructor, sp_run_queue, NULL, NULL, 0);
	/* child structure embeds a buf; size it for the platform biosize */
	sp_child_cache = kmem_cache_create("md_softpart_child",
	    sizeof (md_spcs_t) - sizeof (buf_t) + biosize(), 0,
	    sp_child_constructor, sp_child_destructor, sp_run_queue,
	    NULL, NULL, 0);
}

/*
 * Module fini: destroy the caches created in init_init().
 */
static void
fini_uninit()
{
	kmem_cache_destroy(sp_parent_cache);
	kmem_cache_destroy(sp_child_cache);
	sp_parent_cache = sp_child_cache = NULL;
}

/* define the module linkage */
MD_PLUGIN_MISC_MODULE("soft partition module %I%", init_init(), fini_uninit())