10Sstevel@tonic-gate /* 20Sstevel@tonic-gate * CDDL HEADER START 30Sstevel@tonic-gate * 40Sstevel@tonic-gate * The contents of this file are subject to the terms of the 5*1366Spetede * Common Development and Distribution License (the "License"). 6*1366Spetede * You may not use this file except in compliance with the License. 70Sstevel@tonic-gate * 80Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 90Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing. 100Sstevel@tonic-gate * See the License for the specific language governing permissions 110Sstevel@tonic-gate * and limitations under the License. 120Sstevel@tonic-gate * 130Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each 140Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 150Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the 160Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying 170Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner] 180Sstevel@tonic-gate * 190Sstevel@tonic-gate * CDDL HEADER END 200Sstevel@tonic-gate */ 210Sstevel@tonic-gate /* 22*1366Spetede * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 230Sstevel@tonic-gate * Use is subject to license terms. 240Sstevel@tonic-gate */ 250Sstevel@tonic-gate 260Sstevel@tonic-gate #pragma ident "%Z%%M% %I% %E% SMI" 270Sstevel@tonic-gate 280Sstevel@tonic-gate /* 290Sstevel@tonic-gate * Soft partitioning metadevice driver (md_sp). 300Sstevel@tonic-gate * 310Sstevel@tonic-gate * This file contains the primary operations of the soft partitioning 320Sstevel@tonic-gate * metadevice driver. This includes all routines for normal operation 330Sstevel@tonic-gate * (open/close/read/write). Please see mdvar.h for a definition of 340Sstevel@tonic-gate * metadevice operations vector (md_ops_t). 
This driver is loosely
 * based on the stripe driver (md_stripe).
 *
 * All metadevice administration is done through the use of ioctl's.
 * As such, all administrative routines appear in sp_ioctl.c.
 *
 * Soft partitions are represented both in-core and in the metadb with a
 * unit structure.  The soft partition-specific information in the unit
 * structure includes the following information:
 *	- Device information (md_dev64_t & md key) about the device on which
 *	  the soft partition is built.
 *	- Soft partition status information.
 *	- The size of the soft partition and number of extents used to
 *	  make up that size.
 *	- An array of extents which define virtual/physical offset
 *	  mappings and lengths for each extent.
 *
 * Typical soft partition operation proceeds as follows:
 *	- The unit structure is fetched from the metadb and placed into
 *	  an in-core array (as with other metadevices).  This operation
 *	  is performed via sp_build_incore( ) and takes place during
 *	  "snarfing" (when all metadevices are brought in-core at
 *	  once) and when a new soft partition is created.
 *	- A soft partition is opened via sp_open( ).  At open time
 *	  the soft partition unit structure is verified with the soft
 *	  partition on-disk structures.  Additionally, the soft partition
 *	  status is checked (only soft partitions in the OK state may be
 *	  opened).
620Sstevel@tonic-gate * - Soft partition I/O is performed via sp_strategy( ) which relies on 630Sstevel@tonic-gate * a support routine, sp_mapbuf( ), to do most of the work. 640Sstevel@tonic-gate * sp_mapbuf( ) maps a buffer to a particular extent via a binary 650Sstevel@tonic-gate * search of the extent array in the soft partition unit structure. 660Sstevel@tonic-gate * Once a translation has been performed, the I/O is passed down 670Sstevel@tonic-gate * to the next layer, which may be another metadevice or a physical 680Sstevel@tonic-gate * disk. Since a soft partition may contain multiple, non-contiguous 690Sstevel@tonic-gate * extents, a single I/O may have to be fragmented. 700Sstevel@tonic-gate * - Soft partitions are closed using sp_close. 710Sstevel@tonic-gate * 720Sstevel@tonic-gate */ 730Sstevel@tonic-gate 740Sstevel@tonic-gate #include <sys/param.h> 750Sstevel@tonic-gate #include <sys/systm.h> 760Sstevel@tonic-gate #include <sys/conf.h> 770Sstevel@tonic-gate #include <sys/file.h> 780Sstevel@tonic-gate #include <sys/user.h> 790Sstevel@tonic-gate #include <sys/uio.h> 800Sstevel@tonic-gate #include <sys/t_lock.h> 810Sstevel@tonic-gate #include <sys/buf.h> 820Sstevel@tonic-gate #include <sys/dkio.h> 830Sstevel@tonic-gate #include <sys/vtoc.h> 840Sstevel@tonic-gate #include <sys/kmem.h> 850Sstevel@tonic-gate #include <vm/page.h> 860Sstevel@tonic-gate #include <sys/cmn_err.h> 870Sstevel@tonic-gate #include <sys/sysmacros.h> 880Sstevel@tonic-gate #include <sys/types.h> 890Sstevel@tonic-gate #include <sys/mkdev.h> 900Sstevel@tonic-gate #include <sys/stat.h> 910Sstevel@tonic-gate #include <sys/open.h> 920Sstevel@tonic-gate #include <sys/lvm/mdvar.h> 930Sstevel@tonic-gate #include <sys/lvm/md_sp.h> 940Sstevel@tonic-gate #include <sys/lvm/md_convert.h> 950Sstevel@tonic-gate #include <sys/lvm/md_notify.h> 960Sstevel@tonic-gate #include <sys/lvm/md_crc.h> 970Sstevel@tonic-gate #include <sys/modctl.h> 980Sstevel@tonic-gate #include <sys/ddi.h> 990Sstevel@tonic-gate 
#include <sys/sunddi.h>
#include <sys/debug.h>

#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/svm.h>

md_ops_t		sp_md_ops;
#ifndef	lint
char			_depends_on[] = "drv/md";
md_ops_t		*md_interface_ops = &sp_md_ops;
#endif

extern unit_t		md_nunits;
extern set_t		md_nsets;
extern md_set_t		md_set[];

extern int		md_status;
extern major_t		md_major;
extern mdq_anchor_t	md_done_daemon;
extern mdq_anchor_t	md_sp_daemon;
extern kmutex_t		md_mx;
extern kcondvar_t	md_cv;
extern md_krwlock_t	md_unit_array_rw;

static kmem_cache_t	*sp_parent_cache = NULL;
static kmem_cache_t	*sp_child_cache = NULL;
static void		sp_send_stat_ok(mp_unit_t *);
static void		sp_send_stat_err(mp_unit_t *);

/*
 * FUNCTION:	sp_parent_constructor()
 * INPUT:	p	- memory holding a parent save structure (md_spps_t).
 *		d1, d2	- unused.
 * OUTPUT:	p	- ps_mx mutex initialized.
 * RETURNS:	int	- 0 (always).
 * PURPOSE:	initialize the mutex embedded in a parent save structure.
 *		The signature matches a kmem cache constructor; presumably
 *		it is registered for sp_parent_cache (registration is not in
 *		this part of the file).
 */
/*ARGSUSED1*/
static int
sp_parent_constructor(void *p, void *d1, int d2)
{
	md_spps_t	*ps = p;

	mutex_init(&ps->ps_mx, NULL, MUTEX_DEFAULT, NULL);
	return (0);
}

/*
 * FUNCTION:	sp_parent_init()
 * INPUT:	ps	- parent save structure to reset.
 * OUTPUT:	ps	- every byte located before ps_mx is zeroed.
 * RETURNS:	void.
 * PURPOSE:	clear the leading part of a parent save structure.  The
 *		clear stops at ps_mx, so the mutex (and anything laid out
 *		after it) is left untouched.
 */
static void
sp_parent_init(md_spps_t *ps)
{
	size_t	clear_len = offsetof(md_spps_t, ps_mx);

	bzero(ps, clear_len);
}

/*
 * FUNCTION:	sp_parent_destructor()
 * INPUT:	p	- memory holding a parent save structure (md_spps_t).
 *		d	- unused.
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	destroy the ps_mx mutex set up by sp_parent_constructor().
 */
/*ARGSUSED1*/
static void
sp_parent_destructor(void *p, void *d)
{
	md_spps_t	*ps = p;

	mutex_destroy(&ps->ps_mx);
}

/*
 * FUNCTION:	sp_child_constructor()
 * INPUT:	none.
 * OUTPUT:	cs	- child save structure initialized.
 * RETURNS:	void *	- ptr to initialized child save structure.
 * PURPOSE:	initialize child save structure.
1630Sstevel@tonic-gate */ 1640Sstevel@tonic-gate /*ARGSUSED1*/ 1650Sstevel@tonic-gate static int 1660Sstevel@tonic-gate sp_child_constructor(void *p, void *d1, int d2) 1670Sstevel@tonic-gate { 1680Sstevel@tonic-gate bioinit(&((md_spcs_t *)p)->cs_buf); 1690Sstevel@tonic-gate return (0); 1700Sstevel@tonic-gate } 1710Sstevel@tonic-gate 1720Sstevel@tonic-gate static void 1730Sstevel@tonic-gate sp_child_init(md_spcs_t *cs) 1740Sstevel@tonic-gate { 1750Sstevel@tonic-gate cs->cs_mdunit = 0; 1760Sstevel@tonic-gate cs->cs_ps = NULL; 1770Sstevel@tonic-gate md_bioreset(&cs->cs_buf); 1780Sstevel@tonic-gate } 1790Sstevel@tonic-gate 1800Sstevel@tonic-gate /*ARGSUSED1*/ 1810Sstevel@tonic-gate static void 1820Sstevel@tonic-gate sp_child_destructor(void *p, void *d) 1830Sstevel@tonic-gate { 1840Sstevel@tonic-gate biofini(&((md_spcs_t *)p)->cs_buf); 1850Sstevel@tonic-gate } 1860Sstevel@tonic-gate 1870Sstevel@tonic-gate /* 1880Sstevel@tonic-gate * FUNCTION: sp_run_queue() 1890Sstevel@tonic-gate * INPUT: none. 1900Sstevel@tonic-gate * OUTPUT: none. 1910Sstevel@tonic-gate * RETURNS: void. 1920Sstevel@tonic-gate * PURPOSE: run the md_daemon to clean up memory pool. 1930Sstevel@tonic-gate */ 1940Sstevel@tonic-gate /*ARGSUSED*/ 1950Sstevel@tonic-gate static void 1960Sstevel@tonic-gate sp_run_queue(void *d) 1970Sstevel@tonic-gate { 1980Sstevel@tonic-gate if (!(md_status & MD_GBL_DAEMONS_LIVE)) 1990Sstevel@tonic-gate md_daemon(1, &md_done_daemon); 2000Sstevel@tonic-gate } 2010Sstevel@tonic-gate 2020Sstevel@tonic-gate 2030Sstevel@tonic-gate /* 2040Sstevel@tonic-gate * FUNCTION: sp_build_incore() 2050Sstevel@tonic-gate * INPUT: p - ptr to unit structure. 2060Sstevel@tonic-gate * snarfing - flag to tell us we are snarfing. 2070Sstevel@tonic-gate * OUTPUT: non. 2080Sstevel@tonic-gate * RETURNS: int - 0 (always). 2090Sstevel@tonic-gate * PURPOSE: place unit structure into in-core unit array (keyed from 2100Sstevel@tonic-gate * minor number). 
2110Sstevel@tonic-gate */ 2120Sstevel@tonic-gate int 2130Sstevel@tonic-gate sp_build_incore(void *p, int snarfing) 2140Sstevel@tonic-gate { 2150Sstevel@tonic-gate mp_unit_t *un = (mp_unit_t *)p; 2160Sstevel@tonic-gate minor_t mnum; 2170Sstevel@tonic-gate set_t setno; 2180Sstevel@tonic-gate md_dev64_t tmpdev; 2190Sstevel@tonic-gate 2200Sstevel@tonic-gate mnum = MD_SID(un); 2210Sstevel@tonic-gate 2220Sstevel@tonic-gate if (MD_UNIT(mnum) != NULL) 2230Sstevel@tonic-gate return (0); 2240Sstevel@tonic-gate 2250Sstevel@tonic-gate MD_STATUS(un) = 0; 2260Sstevel@tonic-gate 2270Sstevel@tonic-gate if (snarfing) { 2280Sstevel@tonic-gate /* 2290Sstevel@tonic-gate * if we are snarfing, we get the device information 2300Sstevel@tonic-gate * from the metadb record (using the metadb key for 2310Sstevel@tonic-gate * that device). 2320Sstevel@tonic-gate */ 2330Sstevel@tonic-gate setno = MD_MIN2SET(mnum); 2340Sstevel@tonic-gate 2350Sstevel@tonic-gate tmpdev = md_getdevnum(setno, mddb_getsidenum(setno), 2360Sstevel@tonic-gate un->un_key, MD_NOTRUST_DEVT); 2370Sstevel@tonic-gate un->un_dev = tmpdev; 2380Sstevel@tonic-gate } 2390Sstevel@tonic-gate 2400Sstevel@tonic-gate /* place unit in in-core array */ 2410Sstevel@tonic-gate MD_UNIT(mnum) = un; 2420Sstevel@tonic-gate return (0); 2430Sstevel@tonic-gate } 2440Sstevel@tonic-gate 2450Sstevel@tonic-gate /* 2460Sstevel@tonic-gate * FUNCTION: reset_sp() 2470Sstevel@tonic-gate * INPUT: un - unit structure to be reset/removed. 2480Sstevel@tonic-gate * mnum - minor number to be reset/removed. 2490Sstevel@tonic-gate * removing - flag to tell us if we are removing 2500Sstevel@tonic-gate * permanently or just reseting in-core 2510Sstevel@tonic-gate * structures. 2520Sstevel@tonic-gate * OUTPUT: none. 2530Sstevel@tonic-gate * RETURNS: void. 2540Sstevel@tonic-gate * PURPOSE: used to either simply reset in-core structures or to 2550Sstevel@tonic-gate * permanently remove metadevices from the metadb. 
2560Sstevel@tonic-gate */ 2570Sstevel@tonic-gate void 2580Sstevel@tonic-gate reset_sp(mp_unit_t *un, minor_t mnum, int removing) 2590Sstevel@tonic-gate { 2600Sstevel@tonic-gate sv_dev_t *sv; 2610Sstevel@tonic-gate mddb_recid_t vtoc_id; 2620Sstevel@tonic-gate 2630Sstevel@tonic-gate /* clean up in-core structures */ 2640Sstevel@tonic-gate md_destroy_unit_incore(mnum, &sp_md_ops); 2650Sstevel@tonic-gate 2660Sstevel@tonic-gate MD_UNIT(mnum) = NULL; 2670Sstevel@tonic-gate 2680Sstevel@tonic-gate if (!removing) 2690Sstevel@tonic-gate return; 2700Sstevel@tonic-gate 2710Sstevel@tonic-gate /* we are removing the soft partition from the metadb */ 2720Sstevel@tonic-gate 2730Sstevel@tonic-gate /* 2740Sstevel@tonic-gate * Save off device information so we can get to 2750Sstevel@tonic-gate * it after we do the mddb_deleterec(). 2760Sstevel@tonic-gate */ 2770Sstevel@tonic-gate sv = (sv_dev_t *)kmem_alloc(sizeof (sv_dev_t), KM_SLEEP); 2780Sstevel@tonic-gate sv->setno = MD_MIN2SET(mnum); 2790Sstevel@tonic-gate sv->key = un->un_key; 2800Sstevel@tonic-gate vtoc_id = un->c.un_vtoc_id; 2810Sstevel@tonic-gate 2820Sstevel@tonic-gate /* Remove the unit structure */ 2830Sstevel@tonic-gate mddb_deleterec_wrapper(un->c.un_record_id); 2840Sstevel@tonic-gate 2850Sstevel@tonic-gate if (vtoc_id) 2860Sstevel@tonic-gate mddb_deleterec_wrapper(vtoc_id); 2870Sstevel@tonic-gate 2880Sstevel@tonic-gate SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, TAG_METADEVICE, 2890Sstevel@tonic-gate MD_MIN2SET(mnum), MD_MIN2UNIT(mnum)); 2900Sstevel@tonic-gate 2910Sstevel@tonic-gate /* 2920Sstevel@tonic-gate * remove the underlying device name from the metadb. if other 2930Sstevel@tonic-gate * soft partitions are built on this device, this will simply 2940Sstevel@tonic-gate * decrease the reference count for this device. otherwise the 2950Sstevel@tonic-gate * name record for this device will be removed from the metadb. 
2960Sstevel@tonic-gate */ 2970Sstevel@tonic-gate md_rem_names(sv, 1); 2980Sstevel@tonic-gate kmem_free(sv, sizeof (sv_dev_t)); 2990Sstevel@tonic-gate } 3000Sstevel@tonic-gate 3010Sstevel@tonic-gate /* 3020Sstevel@tonic-gate * FUNCTION: sp_send_stat_msg 3030Sstevel@tonic-gate * INPUT: un - unit reference 3040Sstevel@tonic-gate * status - status to be sent to master node 3050Sstevel@tonic-gate * MD_SP_OK - soft-partition is now OK 3060Sstevel@tonic-gate * MD_SP_ERR " " errored 3070Sstevel@tonic-gate * OUTPUT: none. 3080Sstevel@tonic-gate * RETURNS: void. 3090Sstevel@tonic-gate * PURPOSE: send a soft-partition status change to the master node. If the 3100Sstevel@tonic-gate * message succeeds we simply return. If it fails we panic as the 3110Sstevel@tonic-gate * cluster-wide view of the metadevices is now inconsistent. 3120Sstevel@tonic-gate * CALLING CONTEXT: 3130Sstevel@tonic-gate * Blockable. No locks can be held. 3140Sstevel@tonic-gate */ 3150Sstevel@tonic-gate static void 3160Sstevel@tonic-gate sp_send_stat_msg(mp_unit_t *un, sp_status_t status) 3170Sstevel@tonic-gate { 3180Sstevel@tonic-gate md_mn_msg_sp_setstat_t sp_msg; 3190Sstevel@tonic-gate md_mn_kresult_t *kres; 3200Sstevel@tonic-gate set_t setno = MD_UN2SET(un); 3210Sstevel@tonic-gate int rval; 3220Sstevel@tonic-gate const char *str = (status == MD_SP_ERR) ? 
"MD_SP_ERR" : "MD_SP_OK"; 3230Sstevel@tonic-gate 3240Sstevel@tonic-gate sp_msg.sp_setstat_mnum = MD_SID(un); 3250Sstevel@tonic-gate sp_msg.sp_setstat_status = status; 3260Sstevel@tonic-gate 3270Sstevel@tonic-gate kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); 3280Sstevel@tonic-gate 3290Sstevel@tonic-gate rval = mdmn_ksend_message(setno, MD_MN_MSG_SP_SETSTAT2, MD_MSGF_NO_LOG, 3300Sstevel@tonic-gate (char *)&sp_msg, sizeof (sp_msg), kres); 3310Sstevel@tonic-gate 3320Sstevel@tonic-gate if (!MDMN_KSEND_MSG_OK(rval, kres)) { 3330Sstevel@tonic-gate mdmn_ksend_show_error(rval, kres, "MD_MN_MSG_SP_SETSTAT2"); 3340Sstevel@tonic-gate 3350Sstevel@tonic-gate /* 3360Sstevel@tonic-gate * Panic as we are now in an inconsistent state. 3370Sstevel@tonic-gate */ 3380Sstevel@tonic-gate 3390Sstevel@tonic-gate cmn_err(CE_PANIC, "md: %s: %s could not be set on all nodes\n", 3400Sstevel@tonic-gate md_shortname(MD_SID(un)), str); 3410Sstevel@tonic-gate } 3420Sstevel@tonic-gate 3430Sstevel@tonic-gate kmem_free(kres, sizeof (md_mn_kresult_t)); 3440Sstevel@tonic-gate } 3450Sstevel@tonic-gate 3460Sstevel@tonic-gate /* 3470Sstevel@tonic-gate * FUNCTION: sp_finish_error 3480Sstevel@tonic-gate * INPUT: ps - parent save structure for error-ed I/O. 3490Sstevel@tonic-gate * lock_held - set if the unit readerlock is held 3500Sstevel@tonic-gate * OUTPUT: none. 3510Sstevel@tonic-gate * RETURNS: void. 
3520Sstevel@tonic-gate * PURPOSE: report a driver error 3530Sstevel@tonic-gate */ 3540Sstevel@tonic-gate static void 3550Sstevel@tonic-gate sp_finish_error(md_spps_t *ps, int lock_held) 3560Sstevel@tonic-gate { 3570Sstevel@tonic-gate struct buf *pb = ps->ps_bp; 3580Sstevel@tonic-gate mdi_unit_t *ui = ps->ps_ui; 3590Sstevel@tonic-gate md_dev64_t un_dev; /* underlying device */ 3600Sstevel@tonic-gate md_dev64_t md_dev = md_expldev(pb->b_edev); /* metadev in error */ 3610Sstevel@tonic-gate char *str; 3620Sstevel@tonic-gate 3630Sstevel@tonic-gate un_dev = md_expldev(ps->ps_un->un_dev); 3640Sstevel@tonic-gate /* set error type */ 3650Sstevel@tonic-gate if (pb->b_flags & B_READ) { 3660Sstevel@tonic-gate str = "read"; 3670Sstevel@tonic-gate } else { 3680Sstevel@tonic-gate str = "write"; 3690Sstevel@tonic-gate } 3700Sstevel@tonic-gate 3710Sstevel@tonic-gate 3720Sstevel@tonic-gate SPPS_FREE(sp_parent_cache, ps); 3730Sstevel@tonic-gate pb->b_flags |= B_ERROR; 3740Sstevel@tonic-gate 3750Sstevel@tonic-gate md_kstat_done(ui, pb, 0); 3760Sstevel@tonic-gate 3770Sstevel@tonic-gate if (lock_held) { 3780Sstevel@tonic-gate md_unit_readerexit(ui); 3790Sstevel@tonic-gate } 3800Sstevel@tonic-gate md_biodone(pb); 3810Sstevel@tonic-gate 3820Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: %s error on %s", 3830Sstevel@tonic-gate md_shortname(md_getminor(md_dev)), str, 3840Sstevel@tonic-gate md_devname(MD_DEV2SET(md_dev), un_dev, NULL, 0)); 3850Sstevel@tonic-gate } 3860Sstevel@tonic-gate 3870Sstevel@tonic-gate 3880Sstevel@tonic-gate /* 3890Sstevel@tonic-gate * FUNCTION: sp_xmit_ok 3900Sstevel@tonic-gate * INPUT: dq - daemon queue referencing failing ps structure 3910Sstevel@tonic-gate * OUTPUT: none. 3920Sstevel@tonic-gate * RETURNS: void. 3930Sstevel@tonic-gate * PURPOSE: send a message to the master node in a multi-owner diskset to 3940Sstevel@tonic-gate * update all attached nodes view of the soft-part to be MD_SP_OK. 
3950Sstevel@tonic-gate * CALLING CONTEXT: 3960Sstevel@tonic-gate * Blockable. No unit lock held. 3970Sstevel@tonic-gate */ 3980Sstevel@tonic-gate static void 3990Sstevel@tonic-gate sp_xmit_ok(daemon_queue_t *dq) 4000Sstevel@tonic-gate { 4010Sstevel@tonic-gate md_spps_t *ps = (md_spps_t *)dq; 4020Sstevel@tonic-gate 4030Sstevel@tonic-gate /* Send a MD_MN_MSG_SP_SETSTAT to the master */ 4040Sstevel@tonic-gate sp_send_stat_msg(ps->ps_un, MD_SP_OK); 4050Sstevel@tonic-gate 4060Sstevel@tonic-gate /* 4070Sstevel@tonic-gate * Successfully transmitted error state to all nodes, now release this 4080Sstevel@tonic-gate * parent structure. 4090Sstevel@tonic-gate */ 4100Sstevel@tonic-gate SPPS_FREE(sp_parent_cache, ps); 4110Sstevel@tonic-gate } 4120Sstevel@tonic-gate 4130Sstevel@tonic-gate /* 4140Sstevel@tonic-gate * FUNCTION: sp_xmit_error 4150Sstevel@tonic-gate * INPUT: dq - daemon queue referencing failing ps structure 4160Sstevel@tonic-gate * OUTPUT: none. 4170Sstevel@tonic-gate * RETURNS: void. 4180Sstevel@tonic-gate * PURPOSE: send a message to the master node in a multi-owner diskset to 4190Sstevel@tonic-gate * update all attached nodes view of the soft-part to be MD_SP_ERR. 4200Sstevel@tonic-gate * CALLING CONTEXT: 4210Sstevel@tonic-gate * Blockable. No unit lock held. 4220Sstevel@tonic-gate */ 4230Sstevel@tonic-gate static void 4240Sstevel@tonic-gate sp_xmit_error(daemon_queue_t *dq) 4250Sstevel@tonic-gate { 4260Sstevel@tonic-gate md_spps_t *ps = (md_spps_t *)dq; 4270Sstevel@tonic-gate 4280Sstevel@tonic-gate /* Send a MD_MN_MSG_SP_SETSTAT to the master */ 4290Sstevel@tonic-gate sp_send_stat_msg(ps->ps_un, MD_SP_ERR); 4300Sstevel@tonic-gate 4310Sstevel@tonic-gate /* 4320Sstevel@tonic-gate * Successfully transmitted error state to all nodes, now release this 4330Sstevel@tonic-gate * parent structure. 
4340Sstevel@tonic-gate */ 4350Sstevel@tonic-gate SPPS_FREE(sp_parent_cache, ps); 4360Sstevel@tonic-gate } 4370Sstevel@tonic-gate static void 4380Sstevel@tonic-gate sp_send_stat_ok(mp_unit_t *un) 4390Sstevel@tonic-gate { 4400Sstevel@tonic-gate minor_t mnum = MD_SID(un); 4410Sstevel@tonic-gate md_spps_t *ps; 4420Sstevel@tonic-gate 4430Sstevel@tonic-gate ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS); 4440Sstevel@tonic-gate sp_parent_init(ps); 4450Sstevel@tonic-gate ps->ps_un = un; 4460Sstevel@tonic-gate ps->ps_ui = MDI_UNIT(mnum); 4470Sstevel@tonic-gate 4480Sstevel@tonic-gate daemon_request(&md_sp_daemon, sp_xmit_ok, (daemon_queue_t *)ps, 4490Sstevel@tonic-gate REQ_OLD); 4500Sstevel@tonic-gate } 4510Sstevel@tonic-gate 4520Sstevel@tonic-gate static void 4530Sstevel@tonic-gate sp_send_stat_err(mp_unit_t *un) 4540Sstevel@tonic-gate { 4550Sstevel@tonic-gate minor_t mnum = MD_SID(un); 4560Sstevel@tonic-gate md_spps_t *ps; 4570Sstevel@tonic-gate 4580Sstevel@tonic-gate ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS); 4590Sstevel@tonic-gate sp_parent_init(ps); 4600Sstevel@tonic-gate ps->ps_un = un; 4610Sstevel@tonic-gate ps->ps_ui = MDI_UNIT(mnum); 4620Sstevel@tonic-gate 4630Sstevel@tonic-gate daemon_request(&md_sp_daemon, sp_xmit_error, (daemon_queue_t *)ps, 4640Sstevel@tonic-gate REQ_OLD); 4650Sstevel@tonic-gate } 4660Sstevel@tonic-gate 4670Sstevel@tonic-gate 4680Sstevel@tonic-gate /* 4690Sstevel@tonic-gate * FUNCTION: sp_error() 4700Sstevel@tonic-gate * INPUT: ps - parent save structure for error-ed I/O. 4710Sstevel@tonic-gate * OUTPUT: none. 4720Sstevel@tonic-gate * RETURNS: void. 4730Sstevel@tonic-gate * PURPOSE: report a driver error. 
4740Sstevel@tonic-gate * CALLING CONTEXT: 4750Sstevel@tonic-gate * Interrupt - non-blockable 4760Sstevel@tonic-gate */ 4770Sstevel@tonic-gate static void 4780Sstevel@tonic-gate sp_error(md_spps_t *ps) 4790Sstevel@tonic-gate { 4800Sstevel@tonic-gate set_t setno = MD_UN2SET(ps->ps_un); 4810Sstevel@tonic-gate 4820Sstevel@tonic-gate /* 4830Sstevel@tonic-gate * Drop the mutex associated with this request before (potentially) 4840Sstevel@tonic-gate * enqueuing the free onto a separate thread. We have to release the 4850Sstevel@tonic-gate * mutex before destroying the parent structure. 4860Sstevel@tonic-gate */ 4870Sstevel@tonic-gate if (!(ps->ps_flags & MD_SPPS_DONTFREE)) { 4880Sstevel@tonic-gate if (MUTEX_HELD(&ps->ps_mx)) { 4890Sstevel@tonic-gate mutex_exit(&ps->ps_mx); 4900Sstevel@tonic-gate } 4910Sstevel@tonic-gate } else { 4920Sstevel@tonic-gate /* 4930Sstevel@tonic-gate * this should only ever happen if we are panicking, 4940Sstevel@tonic-gate * since DONTFREE is only set on the parent if panicstr 4950Sstevel@tonic-gate * is non-NULL. 4960Sstevel@tonic-gate */ 4970Sstevel@tonic-gate ASSERT(panicstr); 4980Sstevel@tonic-gate } 4990Sstevel@tonic-gate 5000Sstevel@tonic-gate /* 5010Sstevel@tonic-gate * For a multi-owner set we need to send a message to the master so that 5020Sstevel@tonic-gate * all nodes get the errored status when we first encounter it. To avoid 5030Sstevel@tonic-gate * deadlocking when multiple soft-partitions encounter an error on one 5040Sstevel@tonic-gate * physical unit we drop the unit readerlock before enqueueing the 5050Sstevel@tonic-gate * request. That way we can service any messages that require a 5060Sstevel@tonic-gate * writerlock to be held. 
Additionally, to avoid deadlocking when at 5070Sstevel@tonic-gate * the bottom of a metadevice stack and a higher level mirror has 5080Sstevel@tonic-gate * multiple requests outstanding on this soft-part, we clone the ps 5090Sstevel@tonic-gate * that failed and pass the error back up the stack to release the 5100Sstevel@tonic-gate * reference that this i/o may have in the higher-level metadevice. 5110Sstevel@tonic-gate * The other nodes in the cluster just have to modify the soft-part 5120Sstevel@tonic-gate * status and we do not need to block the i/o completion for this. 5130Sstevel@tonic-gate */ 5140Sstevel@tonic-gate if (MD_MNSET_SETNO(setno)) { 5150Sstevel@tonic-gate md_spps_t *err_ps; 5160Sstevel@tonic-gate err_ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS); 5170Sstevel@tonic-gate sp_parent_init(err_ps); 5180Sstevel@tonic-gate 5190Sstevel@tonic-gate err_ps->ps_un = ps->ps_un; 5200Sstevel@tonic-gate err_ps->ps_ui = ps->ps_ui; 5210Sstevel@tonic-gate 5220Sstevel@tonic-gate md_unit_readerexit(ps->ps_ui); 5230Sstevel@tonic-gate 5240Sstevel@tonic-gate daemon_request(&md_sp_daemon, sp_xmit_error, 5250Sstevel@tonic-gate (daemon_queue_t *)err_ps, REQ_OLD); 5260Sstevel@tonic-gate 5270Sstevel@tonic-gate sp_finish_error(ps, 0); 5280Sstevel@tonic-gate 5290Sstevel@tonic-gate return; 5300Sstevel@tonic-gate } else { 5310Sstevel@tonic-gate ps->ps_un->un_status = MD_SP_ERR; 5320Sstevel@tonic-gate } 5330Sstevel@tonic-gate 5340Sstevel@tonic-gate /* Flag the error */ 5350Sstevel@tonic-gate sp_finish_error(ps, 1); 5360Sstevel@tonic-gate 5370Sstevel@tonic-gate } 5380Sstevel@tonic-gate 5390Sstevel@tonic-gate /* 5400Sstevel@tonic-gate * FUNCTION: sp_mapbuf() 5410Sstevel@tonic-gate * INPUT: un - unit structure for soft partition we are doing 5420Sstevel@tonic-gate * I/O on. 5430Sstevel@tonic-gate * voff - virtual offset in soft partition to map. 5440Sstevel@tonic-gate * bcount - # of blocks in the I/O. 
5450Sstevel@tonic-gate * OUTPUT: bp - translated buffer to be passed down to next layer. 5460Sstevel@tonic-gate * RETURNS: 1 - request must be fragmented, more work to do, 5470Sstevel@tonic-gate * 0 - request satisified, no more work to do 5480Sstevel@tonic-gate * -1 - error 5490Sstevel@tonic-gate * PURPOSE: Map the the virtual offset in the soft partition (passed 5500Sstevel@tonic-gate * in via voff) to the "physical" offset on whatever the soft 5510Sstevel@tonic-gate * partition is built on top of. We do this by doing a binary 5520Sstevel@tonic-gate * search of the extent array in the soft partition unit 5530Sstevel@tonic-gate * structure. Once the current extent is found, we do the 5540Sstevel@tonic-gate * translation, determine if the I/O will cross extent 5550Sstevel@tonic-gate * boundaries (if so, we have to fragment the I/O), then 5560Sstevel@tonic-gate * fill in the buf structure to be passed down to the next layer. 5570Sstevel@tonic-gate */ 5580Sstevel@tonic-gate static int 5590Sstevel@tonic-gate sp_mapbuf( 5600Sstevel@tonic-gate mp_unit_t *un, 5610Sstevel@tonic-gate sp_ext_offset_t voff, 5620Sstevel@tonic-gate sp_ext_length_t bcount, 5630Sstevel@tonic-gate buf_t *bp 5640Sstevel@tonic-gate ) 5650Sstevel@tonic-gate { 5660Sstevel@tonic-gate int lo, mid, hi, found, more; 5670Sstevel@tonic-gate size_t new_bcount; 5680Sstevel@tonic-gate sp_ext_offset_t new_blkno; 5690Sstevel@tonic-gate sp_ext_offset_t new_offset; 5700Sstevel@tonic-gate sp_ext_offset_t ext_endblk; 5710Sstevel@tonic-gate md_dev64_t new_edev; 5720Sstevel@tonic-gate extern unsigned md_maxphys; 5730Sstevel@tonic-gate 5740Sstevel@tonic-gate found = 0; 5750Sstevel@tonic-gate lo = 0; 5760Sstevel@tonic-gate hi = un->un_numexts - 1; 5770Sstevel@tonic-gate 5780Sstevel@tonic-gate /* 5790Sstevel@tonic-gate * do a binary search to find the extent that contains the 5800Sstevel@tonic-gate * starting offset. after this loop, mid contains the index 5810Sstevel@tonic-gate * of the correct extent. 
5820Sstevel@tonic-gate */ 5830Sstevel@tonic-gate while (lo <= hi && !found) { 5840Sstevel@tonic-gate mid = (lo + hi) / 2; 5850Sstevel@tonic-gate /* is the starting offset contained within the mid-ext? */ 5860Sstevel@tonic-gate if (voff >= un->un_ext[mid].un_voff && 5870Sstevel@tonic-gate voff < un->un_ext[mid].un_voff + un->un_ext[mid].un_len) 5880Sstevel@tonic-gate found = 1; 5890Sstevel@tonic-gate else if (voff < un->un_ext[mid].un_voff) 5900Sstevel@tonic-gate hi = mid - 1; 5910Sstevel@tonic-gate else /* voff > un->un_ext[mid].un_voff + un->un_ext[mid].len */ 5920Sstevel@tonic-gate lo = mid + 1; 5930Sstevel@tonic-gate } 5940Sstevel@tonic-gate 5950Sstevel@tonic-gate if (!found) { 5960Sstevel@tonic-gate cmn_err(CE_WARN, "sp_mapbuf: invalid offset %llu.\n", voff); 5970Sstevel@tonic-gate return (-1); 5980Sstevel@tonic-gate } 5990Sstevel@tonic-gate 6000Sstevel@tonic-gate /* translate to underlying physical offset/device */ 6010Sstevel@tonic-gate new_offset = voff - un->un_ext[mid].un_voff; 6020Sstevel@tonic-gate new_blkno = un->un_ext[mid].un_poff + new_offset; 6030Sstevel@tonic-gate new_edev = un->un_dev; 6040Sstevel@tonic-gate 6050Sstevel@tonic-gate /* determine if we need to break the I/O into fragments */ 6060Sstevel@tonic-gate ext_endblk = un->un_ext[mid].un_voff + un->un_ext[mid].un_len; 6070Sstevel@tonic-gate if (voff + btodb(bcount) > ext_endblk) { 6080Sstevel@tonic-gate new_bcount = dbtob(ext_endblk - voff); 6090Sstevel@tonic-gate more = 1; 6100Sstevel@tonic-gate } else { 6110Sstevel@tonic-gate new_bcount = bcount; 6120Sstevel@tonic-gate more = 0; 6130Sstevel@tonic-gate } 6140Sstevel@tonic-gate 6150Sstevel@tonic-gate /* only break up the I/O if we're not built on another metadevice */ 6160Sstevel@tonic-gate if ((md_getmajor(new_edev) != md_major) && (new_bcount > md_maxphys)) { 6170Sstevel@tonic-gate new_bcount = md_maxphys; 6180Sstevel@tonic-gate more = 1; 6190Sstevel@tonic-gate } 6200Sstevel@tonic-gate if (bp != (buf_t *)NULL) { 6210Sstevel@tonic-gate /* do 
bp updates */ 6220Sstevel@tonic-gate bp->b_bcount = new_bcount; 6230Sstevel@tonic-gate bp->b_lblkno = new_blkno; 6240Sstevel@tonic-gate bp->b_edev = md_dev64_to_dev(new_edev); 6250Sstevel@tonic-gate } 6260Sstevel@tonic-gate return (more); 6270Sstevel@tonic-gate } 6280Sstevel@tonic-gate 6290Sstevel@tonic-gate /* 6300Sstevel@tonic-gate * FUNCTION: sp_validate() 6310Sstevel@tonic-gate * INPUT: un - unit structure to be validated. 6320Sstevel@tonic-gate * OUTPUT: none. 6330Sstevel@tonic-gate * RETURNS: 0 - soft partition ok. 6340Sstevel@tonic-gate * -1 - error. 6350Sstevel@tonic-gate * PURPOSE: called on open to sanity check the soft partition. In 6360Sstevel@tonic-gate * order to open a soft partition: 6370Sstevel@tonic-gate * - it must have at least one extent 6380Sstevel@tonic-gate * - the extent info in core and on disk must match 6390Sstevel@tonic-gate * - it may not be in an intermediate state (which would 6400Sstevel@tonic-gate * imply that a two-phase commit was interrupted) 6410Sstevel@tonic-gate * 6420Sstevel@tonic-gate * If the extent checking fails (B_ERROR returned from the read 6430Sstevel@tonic-gate * strategy call) _and_ we're a multi-owner diskset, we send a 6440Sstevel@tonic-gate * message to the master so that all nodes inherit the same view 6450Sstevel@tonic-gate * of the soft partition. 6460Sstevel@tonic-gate * If we are checking a soft-part that is marked as in error, and 6470Sstevel@tonic-gate * we can actually read and validate the watermarks we send a 6480Sstevel@tonic-gate * message to clear the error to the master node. 
6490Sstevel@tonic-gate */ 6500Sstevel@tonic-gate static int 6510Sstevel@tonic-gate sp_validate(mp_unit_t *un) 6520Sstevel@tonic-gate { 6530Sstevel@tonic-gate uint_t ext; 6540Sstevel@tonic-gate struct buf *buf; 6550Sstevel@tonic-gate sp_ext_length_t len; 6560Sstevel@tonic-gate mp_watermark_t *wm; 6570Sstevel@tonic-gate set_t setno; 6580Sstevel@tonic-gate int reset_error = 0; 6590Sstevel@tonic-gate 6600Sstevel@tonic-gate setno = MD_UN2SET(un); 6610Sstevel@tonic-gate 6620Sstevel@tonic-gate /* sanity check unit structure components ?? */ 6630Sstevel@tonic-gate if (un->un_status != MD_SP_OK) { 6640Sstevel@tonic-gate if (un->un_status != MD_SP_ERR) { 6650Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: open failed, soft partition " 6660Sstevel@tonic-gate "status is %u.", 6670Sstevel@tonic-gate md_shortname(MD_SID(un)), 6680Sstevel@tonic-gate un->un_status); 6690Sstevel@tonic-gate return (-1); 6700Sstevel@tonic-gate } else { 6710Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: open of soft partition " 6720Sstevel@tonic-gate "in Errored state.", 6730Sstevel@tonic-gate md_shortname(MD_SID(un))); 6740Sstevel@tonic-gate reset_error = 1; 6750Sstevel@tonic-gate } 6760Sstevel@tonic-gate } 6770Sstevel@tonic-gate 6780Sstevel@tonic-gate if (un->un_numexts == 0) { 6790Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: open failed, soft partition does " 6800Sstevel@tonic-gate "not have any extents.", md_shortname(MD_SID(un))); 6810Sstevel@tonic-gate return (-1); 6820Sstevel@tonic-gate } 6830Sstevel@tonic-gate 6840Sstevel@tonic-gate len = 0LL; 6850Sstevel@tonic-gate for (ext = 0; ext < un->un_numexts; ext++) { 6860Sstevel@tonic-gate 6870Sstevel@tonic-gate /* tally extent lengths to check total size */ 6880Sstevel@tonic-gate len += un->un_ext[ext].un_len; 6890Sstevel@tonic-gate 6900Sstevel@tonic-gate /* allocate buffer for watermark */ 6910Sstevel@tonic-gate buf = getrbuf(KM_SLEEP); 6920Sstevel@tonic-gate 6930Sstevel@tonic-gate /* read watermark */ 6940Sstevel@tonic-gate buf->b_flags = B_READ; 
6950Sstevel@tonic-gate buf->b_edev = md_dev64_to_dev(un->un_dev); 6960Sstevel@tonic-gate buf->b_iodone = NULL; 6970Sstevel@tonic-gate buf->b_proc = NULL; 6980Sstevel@tonic-gate buf->b_bcount = sizeof (mp_watermark_t); 6990Sstevel@tonic-gate buf->b_lblkno = un->un_ext[ext].un_poff - 1; 7000Sstevel@tonic-gate buf->b_bufsize = sizeof (mp_watermark_t); 7010Sstevel@tonic-gate buf->b_un.b_addr = kmem_alloc(sizeof (mp_watermark_t), 7020Sstevel@tonic-gate KM_SLEEP); 7030Sstevel@tonic-gate 7040Sstevel@tonic-gate /* 7050Sstevel@tonic-gate * make the call non-blocking so that it is not affected 7060Sstevel@tonic-gate * by a set take. 7070Sstevel@tonic-gate */ 7080Sstevel@tonic-gate md_call_strategy(buf, MD_STR_MAPPED|MD_NOBLOCK, NULL); 7090Sstevel@tonic-gate (void) biowait(buf); 7100Sstevel@tonic-gate 7110Sstevel@tonic-gate if (buf->b_flags & B_ERROR) { 7120Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: open failed, could not " 7130Sstevel@tonic-gate "read watermark at block %llu for extent %u, " 7140Sstevel@tonic-gate "error %d.", md_shortname(MD_SID(un)), 7150Sstevel@tonic-gate buf->b_lblkno, ext, buf->b_error); 7160Sstevel@tonic-gate kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t)); 7170Sstevel@tonic-gate freerbuf(buf); 7180Sstevel@tonic-gate 7190Sstevel@tonic-gate /* 7200Sstevel@tonic-gate * If we're a multi-owner diskset we send a message 7210Sstevel@tonic-gate * indicating that this soft-part has an invalid 7220Sstevel@tonic-gate * extent to the master node. This ensures a consistent 7230Sstevel@tonic-gate * view of the soft-part across the cluster. 
7240Sstevel@tonic-gate */ 7250Sstevel@tonic-gate if (MD_MNSET_SETNO(setno)) { 7260Sstevel@tonic-gate sp_send_stat_err(un); 7270Sstevel@tonic-gate } 7280Sstevel@tonic-gate return (-1); 7290Sstevel@tonic-gate } 7300Sstevel@tonic-gate 7310Sstevel@tonic-gate wm = (mp_watermark_t *)buf->b_un.b_addr; 7320Sstevel@tonic-gate 7330Sstevel@tonic-gate /* make sure the checksum is correct first */ 7340Sstevel@tonic-gate if (crcchk((uchar_t *)wm, (uint_t *)&wm->wm_checksum, 7350Sstevel@tonic-gate (uint_t)sizeof (mp_watermark_t), (uchar_t *)NULL)) { 7360Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: open failed, watermark " 7370Sstevel@tonic-gate "at block %llu for extent %u does not have a " 7380Sstevel@tonic-gate "valid checksum 0x%08x.", md_shortname(MD_SID(un)), 7390Sstevel@tonic-gate buf->b_lblkno, ext, wm->wm_checksum); 7400Sstevel@tonic-gate kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t)); 7410Sstevel@tonic-gate freerbuf(buf); 7420Sstevel@tonic-gate return (-1); 7430Sstevel@tonic-gate } 7440Sstevel@tonic-gate 7450Sstevel@tonic-gate if (wm->wm_magic != MD_SP_MAGIC) { 7460Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: open failed, watermark " 7470Sstevel@tonic-gate "at block %llu for extent %u does not have a " 7480Sstevel@tonic-gate "valid watermark magic number, expected 0x%x, " 7490Sstevel@tonic-gate "found 0x%x.", md_shortname(MD_SID(un)), 7500Sstevel@tonic-gate buf->b_lblkno, ext, MD_SP_MAGIC, wm->wm_magic); 7510Sstevel@tonic-gate kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t)); 7520Sstevel@tonic-gate freerbuf(buf); 7530Sstevel@tonic-gate return (-1); 7540Sstevel@tonic-gate } 7550Sstevel@tonic-gate 7560Sstevel@tonic-gate /* make sure sequence number matches the current extent */ 7570Sstevel@tonic-gate if (wm->wm_seq != ext) { 7580Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: open failed, watermark " 7590Sstevel@tonic-gate "at block %llu for extent %u has invalid " 7600Sstevel@tonic-gate "sequence number %u.", md_shortname(MD_SID(un)), 7610Sstevel@tonic-gate 
buf->b_lblkno, ext, wm->wm_seq); 7620Sstevel@tonic-gate kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t)); 7630Sstevel@tonic-gate freerbuf(buf); 7640Sstevel@tonic-gate return (-1); 7650Sstevel@tonic-gate } 7660Sstevel@tonic-gate 7670Sstevel@tonic-gate /* make sure watermark length matches unit structure */ 7680Sstevel@tonic-gate if (wm->wm_length != un->un_ext[ext].un_len) { 7690Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: open failed, watermark " 7700Sstevel@tonic-gate "at block %llu for extent %u has inconsistent " 7710Sstevel@tonic-gate "length, expected %llu, found %llu.", 7720Sstevel@tonic-gate md_shortname(MD_SID(un)), buf->b_lblkno, 7730Sstevel@tonic-gate ext, un->un_ext[ext].un_len, 7740Sstevel@tonic-gate (u_longlong_t)wm->wm_length); 7750Sstevel@tonic-gate kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t)); 7760Sstevel@tonic-gate freerbuf(buf); 7770Sstevel@tonic-gate return (-1); 7780Sstevel@tonic-gate } 7790Sstevel@tonic-gate 7800Sstevel@tonic-gate /* 7810Sstevel@tonic-gate * make sure the type is a valid soft partition and not 7820Sstevel@tonic-gate * a free extent or the end. 
7830Sstevel@tonic-gate */ 7840Sstevel@tonic-gate if (wm->wm_type != EXTTYP_ALLOC) { 7850Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: open failed, watermark " 7860Sstevel@tonic-gate "at block %llu for extent %u is not marked " 7870Sstevel@tonic-gate "as in-use, type = %u.", md_shortname(MD_SID(un)), 7880Sstevel@tonic-gate buf->b_lblkno, ext, wm->wm_type); 7890Sstevel@tonic-gate kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t)); 7900Sstevel@tonic-gate freerbuf(buf); 7910Sstevel@tonic-gate return (-1); 7920Sstevel@tonic-gate } 7930Sstevel@tonic-gate /* free up buffer */ 7940Sstevel@tonic-gate kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t)); 7950Sstevel@tonic-gate freerbuf(buf); 7960Sstevel@tonic-gate } 7970Sstevel@tonic-gate 7980Sstevel@tonic-gate if (len != un->un_length) { 7990Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: open failed, computed length " 8000Sstevel@tonic-gate "%llu != expected length %llu.", md_shortname(MD_SID(un)), 8010Sstevel@tonic-gate len, un->un_length); 8020Sstevel@tonic-gate return (-1); 8030Sstevel@tonic-gate } 8040Sstevel@tonic-gate 8050Sstevel@tonic-gate /* 8060Sstevel@tonic-gate * If we're a multi-owner set _and_ reset_error is set, we should clear 8070Sstevel@tonic-gate * the error condition on all nodes in the set. Use SP_SETSTAT2 with 8080Sstevel@tonic-gate * MD_SP_OK. 8090Sstevel@tonic-gate */ 8100Sstevel@tonic-gate if (MD_MNSET_SETNO(setno) && reset_error) { 8110Sstevel@tonic-gate sp_send_stat_ok(un); 8120Sstevel@tonic-gate } 8130Sstevel@tonic-gate return (0); 8140Sstevel@tonic-gate } 8150Sstevel@tonic-gate 8160Sstevel@tonic-gate /* 8170Sstevel@tonic-gate * FUNCTION: sp_done() 8180Sstevel@tonic-gate * INPUT: child_buf - buffer attached to child save structure. 8190Sstevel@tonic-gate * this is the buffer on which I/O has just 8200Sstevel@tonic-gate * completed. 8210Sstevel@tonic-gate * OUTPUT: none. 8220Sstevel@tonic-gate * RETURNS: 0 - success. 8230Sstevel@tonic-gate * 1 - error. 
8240Sstevel@tonic-gate * PURPOSE: called on I/O completion. 8250Sstevel@tonic-gate */ 8260Sstevel@tonic-gate static int 8270Sstevel@tonic-gate sp_done(struct buf *child_buf) 8280Sstevel@tonic-gate { 8290Sstevel@tonic-gate struct buf *parent_buf; 8300Sstevel@tonic-gate mdi_unit_t *ui; 8310Sstevel@tonic-gate md_spps_t *ps; 8320Sstevel@tonic-gate md_spcs_t *cs; 8330Sstevel@tonic-gate 8340Sstevel@tonic-gate /* find the child save structure to which this buffer belongs */ 8350Sstevel@tonic-gate cs = (md_spcs_t *)((caddr_t)child_buf - 8360Sstevel@tonic-gate (sizeof (md_spcs_t) - sizeof (buf_t))); 8370Sstevel@tonic-gate /* now get the parent save structure */ 8380Sstevel@tonic-gate ps = cs->cs_ps; 8390Sstevel@tonic-gate parent_buf = ps->ps_bp; 8400Sstevel@tonic-gate 8410Sstevel@tonic-gate mutex_enter(&ps->ps_mx); 8420Sstevel@tonic-gate /* pass any errors back up to the parent */ 8430Sstevel@tonic-gate if (child_buf->b_flags & B_ERROR) { 8440Sstevel@tonic-gate ps->ps_flags |= MD_SPPS_ERROR; 8450Sstevel@tonic-gate parent_buf->b_error = child_buf->b_error; 8460Sstevel@tonic-gate } 8470Sstevel@tonic-gate /* mapout, if needed */ 8480Sstevel@tonic-gate if (child_buf->b_flags & B_REMAPPED) 8490Sstevel@tonic-gate bp_mapout(child_buf); 8500Sstevel@tonic-gate 8510Sstevel@tonic-gate ps->ps_frags--; 8520Sstevel@tonic-gate if (ps->ps_frags != 0) { 8530Sstevel@tonic-gate /* 8540Sstevel@tonic-gate * if this parent has more children, we just free the 8550Sstevel@tonic-gate * child and return. 
8560Sstevel@tonic-gate */ 8570Sstevel@tonic-gate kmem_cache_free(sp_child_cache, cs); 8580Sstevel@tonic-gate mutex_exit(&ps->ps_mx); 8590Sstevel@tonic-gate return (1); 8600Sstevel@tonic-gate } 8610Sstevel@tonic-gate /* there are no more children */ 8620Sstevel@tonic-gate kmem_cache_free(sp_child_cache, cs); 8630Sstevel@tonic-gate if (ps->ps_flags & MD_SPPS_ERROR) { 8640Sstevel@tonic-gate sp_error(ps); 8650Sstevel@tonic-gate return (1); 8660Sstevel@tonic-gate } 8670Sstevel@tonic-gate ui = ps->ps_ui; 8680Sstevel@tonic-gate if (!(ps->ps_flags & MD_SPPS_DONTFREE)) { 8690Sstevel@tonic-gate mutex_exit(&ps->ps_mx); 8700Sstevel@tonic-gate } else { 8710Sstevel@tonic-gate /* 8720Sstevel@tonic-gate * this should only ever happen if we are panicking, 8730Sstevel@tonic-gate * since DONTFREE is only set on the parent if panicstr 8740Sstevel@tonic-gate * is non-NULL. 8750Sstevel@tonic-gate */ 8760Sstevel@tonic-gate ASSERT(panicstr); 8770Sstevel@tonic-gate } 8780Sstevel@tonic-gate SPPS_FREE(sp_parent_cache, ps); 8790Sstevel@tonic-gate md_kstat_done(ui, parent_buf, 0); 8800Sstevel@tonic-gate md_unit_readerexit(ui); 8810Sstevel@tonic-gate md_biodone(parent_buf); 8820Sstevel@tonic-gate return (0); 8830Sstevel@tonic-gate } 8840Sstevel@tonic-gate 8850Sstevel@tonic-gate /* 8860Sstevel@tonic-gate * FUNCTION: md_sp_strategy() 8870Sstevel@tonic-gate * INPUT: parent_buf - parent buffer 8880Sstevel@tonic-gate * flag - flags 8890Sstevel@tonic-gate * private - private data 8900Sstevel@tonic-gate * OUTPUT: none. 8910Sstevel@tonic-gate * RETURNS: void. 8920Sstevel@tonic-gate * PURPOSE: Soft partitioning I/O strategy. Performs the main work 8930Sstevel@tonic-gate * needed to do I/O to a soft partition. The basic 8940Sstevel@tonic-gate * algorithm is as follows: 8950Sstevel@tonic-gate * - Allocate a child save structure to keep track 8960Sstevel@tonic-gate * of the I/O we are going to pass down. 
8970Sstevel@tonic-gate * - Map the I/O to the correct extent in the soft 8980Sstevel@tonic-gate * partition (see sp_mapbuf()). 8990Sstevel@tonic-gate * - bioclone() the buffer and pass it down the 9000Sstevel@tonic-gate * stack using md_call_strategy. 9010Sstevel@tonic-gate * - If the I/O needs to split across extents, 9020Sstevel@tonic-gate * repeat the above steps until all fragments 9030Sstevel@tonic-gate * are finished. 9040Sstevel@tonic-gate */ 9050Sstevel@tonic-gate static void 9060Sstevel@tonic-gate md_sp_strategy(buf_t *parent_buf, int flag, void *private) 9070Sstevel@tonic-gate { 9080Sstevel@tonic-gate md_spps_t *ps; 9090Sstevel@tonic-gate md_spcs_t *cs; 9100Sstevel@tonic-gate int more; 9110Sstevel@tonic-gate mp_unit_t *un; 9120Sstevel@tonic-gate mdi_unit_t *ui; 9130Sstevel@tonic-gate size_t current_count; 9140Sstevel@tonic-gate off_t current_offset; 9150Sstevel@tonic-gate sp_ext_offset_t current_blkno; 9160Sstevel@tonic-gate buf_t *child_buf; 9170Sstevel@tonic-gate set_t setno = MD_MIN2SET(getminor(parent_buf->b_edev)); 9180Sstevel@tonic-gate int strat_flag = flag; 9190Sstevel@tonic-gate 9200Sstevel@tonic-gate /* 9210Sstevel@tonic-gate * When doing IO to a multi owner meta device, check if set is halted. 9220Sstevel@tonic-gate * We do this check without the needed lock held, for performance 9230Sstevel@tonic-gate * reasons. 9240Sstevel@tonic-gate * If an IO just slips through while the set is locked via an 9250Sstevel@tonic-gate * MD_MN_SUSPEND_SET, we don't care about it. 
9260Sstevel@tonic-gate * Only check for suspension if we are a top-level i/o request 9270Sstevel@tonic-gate * (MD_STR_NOTTOP is cleared in 'flag'); 9280Sstevel@tonic-gate */ 9290Sstevel@tonic-gate if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) == 9300Sstevel@tonic-gate (MD_SET_HALTED | MD_SET_MNSET)) { 9310Sstevel@tonic-gate if ((flag & MD_STR_NOTTOP) == 0) { 9320Sstevel@tonic-gate mutex_enter(&md_mx); 9330Sstevel@tonic-gate /* Here we loop until the set is no longer halted */ 9340Sstevel@tonic-gate while (md_set[setno].s_status & MD_SET_HALTED) { 9350Sstevel@tonic-gate cv_wait(&md_cv, &md_mx); 9360Sstevel@tonic-gate } 9370Sstevel@tonic-gate mutex_exit(&md_mx); 9380Sstevel@tonic-gate } 9390Sstevel@tonic-gate } 9400Sstevel@tonic-gate 9410Sstevel@tonic-gate ui = MDI_UNIT(getminor(parent_buf->b_edev)); 9420Sstevel@tonic-gate 9430Sstevel@tonic-gate md_kstat_waitq_enter(ui); 9440Sstevel@tonic-gate 9450Sstevel@tonic-gate un = (mp_unit_t *)md_unit_readerlock(ui); 9460Sstevel@tonic-gate 9470Sstevel@tonic-gate if ((flag & MD_NOBLOCK) == 0) { 9480Sstevel@tonic-gate if (md_inc_iocount(setno) != 0) { 9490Sstevel@tonic-gate parent_buf->b_flags |= B_ERROR; 9500Sstevel@tonic-gate parent_buf->b_error = ENXIO; 9510Sstevel@tonic-gate parent_buf->b_resid = parent_buf->b_bcount; 9520Sstevel@tonic-gate md_unit_readerexit(ui); 9530Sstevel@tonic-gate biodone(parent_buf); 9540Sstevel@tonic-gate return; 9550Sstevel@tonic-gate } 9560Sstevel@tonic-gate } else { 9570Sstevel@tonic-gate md_inc_iocount_noblock(setno); 9580Sstevel@tonic-gate } 9590Sstevel@tonic-gate 9600Sstevel@tonic-gate if (!(flag & MD_STR_NOTTOP)) { 9610Sstevel@tonic-gate if (md_checkbuf(ui, (md_unit_t *)un, parent_buf) != 0) { 9620Sstevel@tonic-gate md_kstat_waitq_exit(ui); 9630Sstevel@tonic-gate return; 9640Sstevel@tonic-gate } 9650Sstevel@tonic-gate } 9660Sstevel@tonic-gate 9670Sstevel@tonic-gate ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS); 9680Sstevel@tonic-gate sp_parent_init(ps); 
9690Sstevel@tonic-gate 9700Sstevel@tonic-gate /* 9710Sstevel@tonic-gate * Save essential information from the original buffhdr 9720Sstevel@tonic-gate * in the parent. 9730Sstevel@tonic-gate */ 9740Sstevel@tonic-gate ps->ps_un = un; 9750Sstevel@tonic-gate ps->ps_ui = ui; 9760Sstevel@tonic-gate ps->ps_bp = parent_buf; 9770Sstevel@tonic-gate ps->ps_addr = parent_buf->b_un.b_addr; 9780Sstevel@tonic-gate 9790Sstevel@tonic-gate current_count = parent_buf->b_bcount; 9800Sstevel@tonic-gate current_blkno = (sp_ext_offset_t)parent_buf->b_blkno; 9810Sstevel@tonic-gate current_offset = 0; 9820Sstevel@tonic-gate 9830Sstevel@tonic-gate /* 9840Sstevel@tonic-gate * if we are at the top and we are panicking, 9850Sstevel@tonic-gate * we don't free in order to save state. 9860Sstevel@tonic-gate */ 9870Sstevel@tonic-gate if (!(flag & MD_STR_NOTTOP) && (panicstr != NULL)) 9880Sstevel@tonic-gate ps->ps_flags |= MD_SPPS_DONTFREE; 9890Sstevel@tonic-gate 9900Sstevel@tonic-gate md_kstat_waitq_to_runq(ui); 9910Sstevel@tonic-gate 9920Sstevel@tonic-gate ps->ps_frags++; 9930Sstevel@tonic-gate 9940Sstevel@tonic-gate /* 9950Sstevel@tonic-gate * Mark this i/o as MD_STR_ABR if we've had ABR enabled on this 9960Sstevel@tonic-gate * metadevice. 9970Sstevel@tonic-gate */ 9980Sstevel@tonic-gate if (ui->ui_tstate & MD_ABR_CAP) 9990Sstevel@tonic-gate strat_flag |= MD_STR_ABR; 10000Sstevel@tonic-gate 10010Sstevel@tonic-gate /* 10020Sstevel@tonic-gate * this loop does the main work of an I/O. we allocate a 10030Sstevel@tonic-gate * a child save for each buf, do the logical to physical 10040Sstevel@tonic-gate * mapping, decide if we need to frag the I/O, clone the 10050Sstevel@tonic-gate * new I/O to pass down the stack. repeat until we've 10060Sstevel@tonic-gate * taken care of the entire buf that was passed to us. 
10070Sstevel@tonic-gate */ 10080Sstevel@tonic-gate do { 10090Sstevel@tonic-gate cs = kmem_cache_alloc(sp_child_cache, MD_ALLOCFLAGS); 10100Sstevel@tonic-gate sp_child_init(cs); 10110Sstevel@tonic-gate child_buf = &cs->cs_buf; 10120Sstevel@tonic-gate cs->cs_ps = ps; 10130Sstevel@tonic-gate 10140Sstevel@tonic-gate more = sp_mapbuf(un, current_blkno, current_count, child_buf); 10150Sstevel@tonic-gate if (more == -1) { 10160Sstevel@tonic-gate parent_buf->b_flags |= B_ERROR; 10170Sstevel@tonic-gate parent_buf->b_error = EIO; 10180Sstevel@tonic-gate md_kstat_done(ui, parent_buf, 0); 10190Sstevel@tonic-gate md_unit_readerexit(ui); 10200Sstevel@tonic-gate md_biodone(parent_buf); 10210Sstevel@tonic-gate kmem_cache_free(sp_parent_cache, ps); 10220Sstevel@tonic-gate return; 10230Sstevel@tonic-gate } 10240Sstevel@tonic-gate 10250Sstevel@tonic-gate child_buf = md_bioclone(parent_buf, current_offset, 10260Sstevel@tonic-gate child_buf->b_bcount, child_buf->b_edev, 10270Sstevel@tonic-gate child_buf->b_blkno, sp_done, child_buf, 10280Sstevel@tonic-gate KM_NOSLEEP); 10290Sstevel@tonic-gate /* calculate new offset, counts, etc... 
*/ 10300Sstevel@tonic-gate current_offset += child_buf->b_bcount; 10310Sstevel@tonic-gate current_count -= child_buf->b_bcount; 10320Sstevel@tonic-gate current_blkno += (sp_ext_offset_t)(btodb(child_buf->b_bcount)); 10330Sstevel@tonic-gate 10340Sstevel@tonic-gate if (more) { 10350Sstevel@tonic-gate mutex_enter(&ps->ps_mx); 10360Sstevel@tonic-gate ps->ps_frags++; 10370Sstevel@tonic-gate mutex_exit(&ps->ps_mx); 10380Sstevel@tonic-gate } 10390Sstevel@tonic-gate 10400Sstevel@tonic-gate md_call_strategy(child_buf, strat_flag, private); 10410Sstevel@tonic-gate } while (more); 10420Sstevel@tonic-gate 10430Sstevel@tonic-gate if (!(flag & MD_STR_NOTTOP) && (panicstr != NULL)) { 10440Sstevel@tonic-gate while (!(ps->ps_flags & MD_SPPS_DONE)) { 10450Sstevel@tonic-gate md_daemon(1, &md_done_daemon); 10460Sstevel@tonic-gate } 10470Sstevel@tonic-gate kmem_cache_free(sp_parent_cache, ps); 10480Sstevel@tonic-gate } 10490Sstevel@tonic-gate } 10500Sstevel@tonic-gate 10510Sstevel@tonic-gate /* 10520Sstevel@tonic-gate * FUNCTION: sp_directed_read() 10530Sstevel@tonic-gate * INPUT: mnum - minor number 10540Sstevel@tonic-gate * vdr - vol_directed_rd_t from user 10550Sstevel@tonic-gate * mode - access mode for copying data out. 10560Sstevel@tonic-gate * OUTPUT: none. 10570Sstevel@tonic-gate * RETURNS: 0 - success 10580Sstevel@tonic-gate * Exxxxx - failure error-code 10590Sstevel@tonic-gate * PURPOSE: Construct the necessary sub-device i/o requests to perform the 10600Sstevel@tonic-gate * directed read as requested by the user. This is essentially the 10610Sstevel@tonic-gate * same as md_sp_strategy() with the exception being that the 10620Sstevel@tonic-gate * underlying 'md_call_strategy' is replaced with an ioctl call. 
10630Sstevel@tonic-gate */ 10640Sstevel@tonic-gate int 10650Sstevel@tonic-gate sp_directed_read(minor_t mnum, vol_directed_rd_t *vdr, int mode) 10660Sstevel@tonic-gate { 10670Sstevel@tonic-gate md_spps_t *ps; 10680Sstevel@tonic-gate md_spcs_t *cs; 10690Sstevel@tonic-gate int more; 10700Sstevel@tonic-gate mp_unit_t *un; 10710Sstevel@tonic-gate mdi_unit_t *ui; 10720Sstevel@tonic-gate size_t current_count; 10730Sstevel@tonic-gate off_t current_offset; 10740Sstevel@tonic-gate sp_ext_offset_t current_blkno; 10750Sstevel@tonic-gate buf_t *child_buf, *parent_buf; 10760Sstevel@tonic-gate void *kbuffer; 10770Sstevel@tonic-gate vol_directed_rd_t cvdr; 10780Sstevel@tonic-gate caddr_t userbuf; 10790Sstevel@tonic-gate offset_t useroff; 10800Sstevel@tonic-gate int ret = 0; 10810Sstevel@tonic-gate 10820Sstevel@tonic-gate ui = MDI_UNIT(mnum); 10830Sstevel@tonic-gate 10840Sstevel@tonic-gate md_kstat_waitq_enter(ui); 10850Sstevel@tonic-gate 10860Sstevel@tonic-gate bzero(&cvdr, sizeof (cvdr)); 10870Sstevel@tonic-gate 10880Sstevel@tonic-gate un = (mp_unit_t *)md_unit_readerlock(ui); 10890Sstevel@tonic-gate 10900Sstevel@tonic-gate /* 10910Sstevel@tonic-gate * Construct a parent_buf header which reflects the user-supplied 10920Sstevel@tonic-gate * request. 
10930Sstevel@tonic-gate */ 10940Sstevel@tonic-gate 10950Sstevel@tonic-gate kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP); 10960Sstevel@tonic-gate if (kbuffer == NULL) { 10970Sstevel@tonic-gate vdr->vdr_flags |= DKV_DMR_ERROR; 10980Sstevel@tonic-gate md_unit_readerexit(ui); 10990Sstevel@tonic-gate return (ENOMEM); 11000Sstevel@tonic-gate } 11010Sstevel@tonic-gate 11020Sstevel@tonic-gate parent_buf = getrbuf(KM_NOSLEEP); 11030Sstevel@tonic-gate if (parent_buf == NULL) { 11040Sstevel@tonic-gate vdr->vdr_flags |= DKV_DMR_ERROR; 11050Sstevel@tonic-gate md_unit_readerexit(ui); 11060Sstevel@tonic-gate kmem_free(kbuffer, vdr->vdr_nbytes); 11070Sstevel@tonic-gate return (ENOMEM); 11080Sstevel@tonic-gate } 11090Sstevel@tonic-gate parent_buf->b_un.b_addr = kbuffer; 11100Sstevel@tonic-gate parent_buf->b_flags = B_READ; 11110Sstevel@tonic-gate parent_buf->b_bcount = vdr->vdr_nbytes; 11120Sstevel@tonic-gate parent_buf->b_lblkno = lbtodb(vdr->vdr_offset); 11130Sstevel@tonic-gate parent_buf->b_edev = un->un_dev; 11140Sstevel@tonic-gate 11150Sstevel@tonic-gate 11160Sstevel@tonic-gate ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS); 11170Sstevel@tonic-gate sp_parent_init(ps); 11180Sstevel@tonic-gate 11190Sstevel@tonic-gate /* 11200Sstevel@tonic-gate * Save essential information from the original buffhdr 11210Sstevel@tonic-gate * in the parent. 
11220Sstevel@tonic-gate */ 11230Sstevel@tonic-gate ps->ps_un = un; 11240Sstevel@tonic-gate ps->ps_ui = ui; 11250Sstevel@tonic-gate ps->ps_bp = parent_buf; 11260Sstevel@tonic-gate ps->ps_addr = parent_buf->b_un.b_addr; 11270Sstevel@tonic-gate 11280Sstevel@tonic-gate current_count = parent_buf->b_bcount; 11290Sstevel@tonic-gate current_blkno = (sp_ext_offset_t)parent_buf->b_lblkno; 11300Sstevel@tonic-gate current_offset = 0; 11310Sstevel@tonic-gate 11320Sstevel@tonic-gate ps->ps_frags++; 11330Sstevel@tonic-gate vdr->vdr_bytesread = 0; 11340Sstevel@tonic-gate 11350Sstevel@tonic-gate /* 11360Sstevel@tonic-gate * this loop does the main work of an I/O. we allocate a 11370Sstevel@tonic-gate * a child save for each buf, do the logical to physical 11380Sstevel@tonic-gate * mapping, decide if we need to frag the I/O, clone the 11390Sstevel@tonic-gate * new I/O to pass down the stack. repeat until we've 11400Sstevel@tonic-gate * taken care of the entire buf that was passed to us. 11410Sstevel@tonic-gate */ 11420Sstevel@tonic-gate do { 11430Sstevel@tonic-gate cs = kmem_cache_alloc(sp_child_cache, MD_ALLOCFLAGS); 11440Sstevel@tonic-gate sp_child_init(cs); 11450Sstevel@tonic-gate child_buf = &cs->cs_buf; 11460Sstevel@tonic-gate cs->cs_ps = ps; 11470Sstevel@tonic-gate 11480Sstevel@tonic-gate more = sp_mapbuf(un, current_blkno, current_count, child_buf); 11490Sstevel@tonic-gate if (more == -1) { 11500Sstevel@tonic-gate ret = EIO; 11510Sstevel@tonic-gate vdr->vdr_flags |= DKV_DMR_SHORT; 11520Sstevel@tonic-gate kmem_cache_free(sp_child_cache, cs); 11530Sstevel@tonic-gate goto err_out; 11540Sstevel@tonic-gate } 11550Sstevel@tonic-gate 11560Sstevel@tonic-gate cvdr.vdr_flags = vdr->vdr_flags; 11570Sstevel@tonic-gate cvdr.vdr_side = vdr->vdr_side; 11580Sstevel@tonic-gate cvdr.vdr_nbytes = child_buf->b_bcount; 11590Sstevel@tonic-gate cvdr.vdr_offset = ldbtob(child_buf->b_lblkno); 11600Sstevel@tonic-gate /* Work out where we are in the allocated buffer */ 116162Sjeanm useroff = 
(offset_t)(uintptr_t)kbuffer; 11620Sstevel@tonic-gate useroff = useroff + (offset_t)current_offset; 116362Sjeanm cvdr.vdr_data = (void *)(uintptr_t)useroff; 11640Sstevel@tonic-gate child_buf = md_bioclone(parent_buf, current_offset, 11650Sstevel@tonic-gate child_buf->b_bcount, child_buf->b_edev, 11660Sstevel@tonic-gate child_buf->b_blkno, NULL, 11670Sstevel@tonic-gate child_buf, KM_NOSLEEP); 11680Sstevel@tonic-gate /* calculate new offset, counts, etc... */ 11690Sstevel@tonic-gate current_offset += child_buf->b_bcount; 11700Sstevel@tonic-gate current_count -= child_buf->b_bcount; 11710Sstevel@tonic-gate current_blkno += (sp_ext_offset_t)(btodb(child_buf->b_bcount)); 11720Sstevel@tonic-gate 11730Sstevel@tonic-gate if (more) { 11740Sstevel@tonic-gate mutex_enter(&ps->ps_mx); 11750Sstevel@tonic-gate ps->ps_frags++; 11760Sstevel@tonic-gate mutex_exit(&ps->ps_mx); 11770Sstevel@tonic-gate } 11780Sstevel@tonic-gate 11790Sstevel@tonic-gate ret = md_call_ioctl(child_buf->b_edev, DKIOCDMR, &cvdr, 11800Sstevel@tonic-gate (mode | FKIOCTL), NULL); 11810Sstevel@tonic-gate 11820Sstevel@tonic-gate /* 11830Sstevel@tonic-gate * Free the child structure as we've finished with it. 11840Sstevel@tonic-gate * Normally this would be done by sp_done() but we're just 11850Sstevel@tonic-gate * using md_bioclone() to segment the transfer and we never 11860Sstevel@tonic-gate * issue a strategy request so the iodone will not be called. 
11870Sstevel@tonic-gate */ 11880Sstevel@tonic-gate kmem_cache_free(sp_child_cache, cs); 11890Sstevel@tonic-gate if (ret == 0) { 11900Sstevel@tonic-gate /* copyout the returned data to vdr_data + offset */ 11910Sstevel@tonic-gate userbuf = (caddr_t)kbuffer; 11920Sstevel@tonic-gate userbuf += (caddr_t)(cvdr.vdr_data) - (caddr_t)kbuffer; 11930Sstevel@tonic-gate if (ddi_copyout(userbuf, vdr->vdr_data, 11940Sstevel@tonic-gate cvdr.vdr_bytesread, mode)) { 11950Sstevel@tonic-gate ret = EFAULT; 11960Sstevel@tonic-gate goto err_out; 11970Sstevel@tonic-gate } 11980Sstevel@tonic-gate vdr->vdr_bytesread += cvdr.vdr_bytesread; 11990Sstevel@tonic-gate } else { 12000Sstevel@tonic-gate goto err_out; 12010Sstevel@tonic-gate } 12020Sstevel@tonic-gate } while (more); 12030Sstevel@tonic-gate 12040Sstevel@tonic-gate /* 12050Sstevel@tonic-gate * Update the user-supplied vol_directed_rd_t structure with the 12060Sstevel@tonic-gate * contents of the last issued child request. 12070Sstevel@tonic-gate */ 12080Sstevel@tonic-gate vdr->vdr_flags = cvdr.vdr_flags; 12090Sstevel@tonic-gate vdr->vdr_side = cvdr.vdr_side; 12100Sstevel@tonic-gate bcopy(cvdr.vdr_side_name, vdr->vdr_side_name, VOL_SIDENAME); 12110Sstevel@tonic-gate 12120Sstevel@tonic-gate err_out: 12130Sstevel@tonic-gate if (ret != 0) { 12140Sstevel@tonic-gate vdr->vdr_flags |= DKV_DMR_ERROR; 12150Sstevel@tonic-gate } 12160Sstevel@tonic-gate if (vdr->vdr_bytesread != vdr->vdr_nbytes) { 12170Sstevel@tonic-gate vdr->vdr_flags |= DKV_DMR_SHORT; 12180Sstevel@tonic-gate } 12190Sstevel@tonic-gate kmem_cache_free(sp_parent_cache, ps); 12200Sstevel@tonic-gate kmem_free(kbuffer, vdr->vdr_nbytes); 12210Sstevel@tonic-gate freerbuf(parent_buf); 12220Sstevel@tonic-gate md_unit_readerexit(ui); 12230Sstevel@tonic-gate return (ret); 12240Sstevel@tonic-gate } 12250Sstevel@tonic-gate 12260Sstevel@tonic-gate /* 12270Sstevel@tonic-gate * FUNCTION: sp_snarf() 12280Sstevel@tonic-gate * INPUT: cmd - snarf cmd. 12290Sstevel@tonic-gate * setno - set number. 
12300Sstevel@tonic-gate * OUTPUT: none. 12310Sstevel@tonic-gate * RETURNS: 1 - soft partitions were snarfed. 12320Sstevel@tonic-gate * 0 - no soft partitions were snarfed. 12330Sstevel@tonic-gate * PURPOSE: Snarf soft partition metadb records into their in-core 12340Sstevel@tonic-gate * structures. This routine is called at "snarf time" when 12350Sstevel@tonic-gate * md loads and gets all metadevices records into memory. 12360Sstevel@tonic-gate * The basic algorithm is simply to walk the soft partition 12370Sstevel@tonic-gate * records in the metadb and call the soft partitioning 12380Sstevel@tonic-gate * build_incore routine to set up the in-core structures. 12390Sstevel@tonic-gate */ 12400Sstevel@tonic-gate static int 12410Sstevel@tonic-gate sp_snarf(md_snarfcmd_t cmd, set_t setno) 12420Sstevel@tonic-gate { 12430Sstevel@tonic-gate mp_unit_t *un; 12440Sstevel@tonic-gate mddb_recid_t recid; 12450Sstevel@tonic-gate int gotsomething; 12460Sstevel@tonic-gate int all_sp_gotten; 12470Sstevel@tonic-gate mddb_type_t rec_type; 12480Sstevel@tonic-gate mddb_de_ic_t *dep; 12490Sstevel@tonic-gate mddb_rb32_t *rbp; 12500Sstevel@tonic-gate mp_unit_t *big_un; 12510Sstevel@tonic-gate mp_unit32_od_t *small_un; 12520Sstevel@tonic-gate size_t newreqsize; 12530Sstevel@tonic-gate 12540Sstevel@tonic-gate 12550Sstevel@tonic-gate if (cmd == MD_SNARF_CLEANUP) 12560Sstevel@tonic-gate return (0); 12570Sstevel@tonic-gate 12580Sstevel@tonic-gate all_sp_gotten = 1; 12590Sstevel@tonic-gate gotsomething = 0; 12600Sstevel@tonic-gate 12610Sstevel@tonic-gate /* get the record type */ 12620Sstevel@tonic-gate rec_type = (mddb_type_t)md_getshared_key(setno, 12630Sstevel@tonic-gate sp_md_ops.md_driver.md_drivername); 12640Sstevel@tonic-gate recid = mddb_makerecid(setno, 0); 12650Sstevel@tonic-gate 12660Sstevel@tonic-gate /* 12670Sstevel@tonic-gate * walk soft partition records in the metadb and call 12680Sstevel@tonic-gate * sp_build_incore to build in-core structures. 
12690Sstevel@tonic-gate */ 12700Sstevel@tonic-gate while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0) { 12710Sstevel@tonic-gate /* if we've already gotten this record, go to the next one */ 12720Sstevel@tonic-gate if (mddb_getrecprivate(recid) & MD_PRV_GOTIT) 12730Sstevel@tonic-gate continue; 12740Sstevel@tonic-gate 12750Sstevel@tonic-gate 12760Sstevel@tonic-gate dep = mddb_getrecdep(recid); 12770Sstevel@tonic-gate dep->de_flags = MDDB_F_SOFTPART; 12780Sstevel@tonic-gate rbp = dep->de_rb; 12790Sstevel@tonic-gate 12800Sstevel@tonic-gate if ((rbp->rb_revision == MDDB_REV_RB) && 12810Sstevel@tonic-gate ((rbp->rb_private & MD_PRV_CONVD) == 0)) { 12820Sstevel@tonic-gate /* 12830Sstevel@tonic-gate * This means, we have an old and small record. 12840Sstevel@tonic-gate * And this record hasn't already been converted :-o 12850Sstevel@tonic-gate * before we create an incore metadevice from this 12860Sstevel@tonic-gate * we have to convert it to a big record. 12870Sstevel@tonic-gate */ 12880Sstevel@tonic-gate small_un = (mp_unit32_od_t *)mddb_getrecaddr(recid); 12890Sstevel@tonic-gate newreqsize = sizeof (mp_unit_t) + 12900Sstevel@tonic-gate ((small_un->un_numexts - 1) * 12910Sstevel@tonic-gate sizeof (struct mp_ext)); 12920Sstevel@tonic-gate big_un = (mp_unit_t *)kmem_zalloc(newreqsize, KM_SLEEP); 12930Sstevel@tonic-gate softpart_convert((caddr_t)small_un, (caddr_t)big_un, 12940Sstevel@tonic-gate SMALL_2_BIG); 12950Sstevel@tonic-gate kmem_free(small_un, dep->de_reqsize); 12960Sstevel@tonic-gate dep->de_rb_userdata = big_un; 12970Sstevel@tonic-gate dep->de_reqsize = newreqsize; 12980Sstevel@tonic-gate rbp->rb_private |= MD_PRV_CONVD; 12990Sstevel@tonic-gate un = big_un; 13000Sstevel@tonic-gate } else { 13010Sstevel@tonic-gate /* Large device */ 13020Sstevel@tonic-gate un = (mp_unit_t *)mddb_getrecaddr(recid); 13030Sstevel@tonic-gate } 13040Sstevel@tonic-gate 13050Sstevel@tonic-gate /* Set revision and flag accordingly */ 13060Sstevel@tonic-gate if (rbp->rb_revision == 
MDDB_REV_RB) { 13070Sstevel@tonic-gate un->c.un_revision = MD_32BIT_META_DEV; 13080Sstevel@tonic-gate } else { 13090Sstevel@tonic-gate un->c.un_revision = MD_64BIT_META_DEV; 13100Sstevel@tonic-gate un->c.un_flag |= MD_EFILABEL; 13110Sstevel@tonic-gate } 13120Sstevel@tonic-gate 13130Sstevel@tonic-gate /* 13140Sstevel@tonic-gate * Create minor node for snarfed entry. 13150Sstevel@tonic-gate */ 13160Sstevel@tonic-gate (void) md_create_minor_node(MD_MIN2SET(MD_SID(un)), MD_SID(un)); 13170Sstevel@tonic-gate 13180Sstevel@tonic-gate if (MD_UNIT(MD_SID(un)) != NULL) { 13190Sstevel@tonic-gate /* unit is already in-core */ 13200Sstevel@tonic-gate mddb_setrecprivate(recid, MD_PRV_PENDDEL); 13210Sstevel@tonic-gate continue; 13220Sstevel@tonic-gate } 13230Sstevel@tonic-gate all_sp_gotten = 0; 13240Sstevel@tonic-gate if (sp_build_incore((void *)un, 1) == 0) { 13250Sstevel@tonic-gate mddb_setrecprivate(recid, MD_PRV_GOTIT); 13260Sstevel@tonic-gate md_create_unit_incore(MD_SID(un), &sp_md_ops, 0); 13270Sstevel@tonic-gate gotsomething = 1; 13280Sstevel@tonic-gate } 13290Sstevel@tonic-gate } 13300Sstevel@tonic-gate 13310Sstevel@tonic-gate if (!all_sp_gotten) 13320Sstevel@tonic-gate return (gotsomething); 13330Sstevel@tonic-gate /* double-check records */ 13340Sstevel@tonic-gate recid = mddb_makerecid(setno, 0); 13350Sstevel@tonic-gate while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0) 13360Sstevel@tonic-gate if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT)) 13370Sstevel@tonic-gate mddb_setrecprivate(recid, MD_PRV_PENDDEL); 13380Sstevel@tonic-gate 13390Sstevel@tonic-gate return (0); 13400Sstevel@tonic-gate } 13410Sstevel@tonic-gate 13420Sstevel@tonic-gate /* 13430Sstevel@tonic-gate * FUNCTION: sp_halt() 13440Sstevel@tonic-gate * INPUT: cmd - halt cmd. 13450Sstevel@tonic-gate * setno - set number. 13460Sstevel@tonic-gate * RETURNS: 0 - success. 13470Sstevel@tonic-gate * 1 - err. 13480Sstevel@tonic-gate * PURPOSE: Perform driver halt operations. 
As with stripe, we 13490Sstevel@tonic-gate * support MD_HALT_CHECK and MD_HALT_DOIT. The first 13500Sstevel@tonic-gate * does a check to see if halting can be done safely 13510Sstevel@tonic-gate * (no open soft partitions), the second cleans up and 13520Sstevel@tonic-gate * shuts down the driver. 13530Sstevel@tonic-gate */ 13540Sstevel@tonic-gate static int 13550Sstevel@tonic-gate sp_halt(md_haltcmd_t cmd, set_t setno) 13560Sstevel@tonic-gate { 13570Sstevel@tonic-gate int i; 13580Sstevel@tonic-gate mdi_unit_t *ui; 13590Sstevel@tonic-gate minor_t mnum; 13600Sstevel@tonic-gate 13610Sstevel@tonic-gate if (cmd == MD_HALT_CLOSE) 13620Sstevel@tonic-gate return (0); 13630Sstevel@tonic-gate 13640Sstevel@tonic-gate if (cmd == MD_HALT_OPEN) 13650Sstevel@tonic-gate return (0); 13660Sstevel@tonic-gate 13670Sstevel@tonic-gate if (cmd == MD_HALT_UNLOAD) 13680Sstevel@tonic-gate return (0); 13690Sstevel@tonic-gate 13700Sstevel@tonic-gate if (cmd == MD_HALT_CHECK) { 13710Sstevel@tonic-gate for (i = 0; i < md_nunits; i++) { 13720Sstevel@tonic-gate mnum = MD_MKMIN(setno, i); 13730Sstevel@tonic-gate if ((ui = MDI_UNIT(mnum)) == NULL) 13740Sstevel@tonic-gate continue; 13750Sstevel@tonic-gate if (ui->ui_opsindex != sp_md_ops.md_selfindex) 13760Sstevel@tonic-gate continue; 13770Sstevel@tonic-gate if (md_unit_isopen(ui)) 13780Sstevel@tonic-gate return (1); 13790Sstevel@tonic-gate } 13800Sstevel@tonic-gate return (0); 13810Sstevel@tonic-gate } 13820Sstevel@tonic-gate 13830Sstevel@tonic-gate if (cmd != MD_HALT_DOIT) 13840Sstevel@tonic-gate return (1); 13850Sstevel@tonic-gate 13860Sstevel@tonic-gate for (i = 0; i < md_nunits; i++) { 13870Sstevel@tonic-gate mnum = MD_MKMIN(setno, i); 13880Sstevel@tonic-gate if ((ui = MDI_UNIT(mnum)) == NULL) 13890Sstevel@tonic-gate continue; 13900Sstevel@tonic-gate if (ui->ui_opsindex != sp_md_ops.md_selfindex) 13910Sstevel@tonic-gate continue; 13920Sstevel@tonic-gate reset_sp((mp_unit_t *)MD_UNIT(mnum), mnum, 0); 13930Sstevel@tonic-gate } 
13940Sstevel@tonic-gate 13950Sstevel@tonic-gate return (0); 13960Sstevel@tonic-gate } 13970Sstevel@tonic-gate 13980Sstevel@tonic-gate /* 13990Sstevel@tonic-gate * FUNCTION: sp_open_dev() 14000Sstevel@tonic-gate * INPUT: un - unit structure. 14010Sstevel@tonic-gate * oflags - open flags. 14020Sstevel@tonic-gate * OUTPUT: none. 14030Sstevel@tonic-gate * RETURNS: 0 - success. 14040Sstevel@tonic-gate * non-zero - err. 14050Sstevel@tonic-gate * PURPOSE: open underlying device via md_layered_open. 14060Sstevel@tonic-gate */ 14070Sstevel@tonic-gate static int 14080Sstevel@tonic-gate sp_open_dev(mp_unit_t *un, int oflags) 14090Sstevel@tonic-gate { 14100Sstevel@tonic-gate minor_t mnum = MD_SID(un); 14110Sstevel@tonic-gate int err; 14120Sstevel@tonic-gate md_dev64_t tmpdev; 14130Sstevel@tonic-gate set_t setno = MD_MIN2SET(MD_SID(un)); 14140Sstevel@tonic-gate side_t side = mddb_getsidenum(setno); 14150Sstevel@tonic-gate 14160Sstevel@tonic-gate tmpdev = un->un_dev; 14170Sstevel@tonic-gate /* 14180Sstevel@tonic-gate * Do the open by device id if underlying is regular 14190Sstevel@tonic-gate */ 14200Sstevel@tonic-gate if ((md_getmajor(tmpdev) != md_major) && 14210Sstevel@tonic-gate md_devid_found(setno, side, un->un_key) == 1) { 14220Sstevel@tonic-gate tmpdev = md_resolve_bydevid(mnum, tmpdev, un->un_key); 14230Sstevel@tonic-gate } 14240Sstevel@tonic-gate err = md_layered_open(mnum, &tmpdev, oflags); 14250Sstevel@tonic-gate un->un_dev = tmpdev; 14260Sstevel@tonic-gate 14270Sstevel@tonic-gate if (err) 14280Sstevel@tonic-gate return (ENXIO); 14290Sstevel@tonic-gate 14300Sstevel@tonic-gate return (0); 14310Sstevel@tonic-gate } 14320Sstevel@tonic-gate 14330Sstevel@tonic-gate /* 14340Sstevel@tonic-gate * FUNCTION: sp_open() 14350Sstevel@tonic-gate * INPUT: dev - device to open. 14360Sstevel@tonic-gate * flag - pass-through flag. 14370Sstevel@tonic-gate * otyp - pass-through open type. 14380Sstevel@tonic-gate * cred_p - credentials. 14390Sstevel@tonic-gate * md_oflags - open flags. 
14400Sstevel@tonic-gate * OUTPUT: none. 14410Sstevel@tonic-gate * RETURNS: 0 - success. 14420Sstevel@tonic-gate * non-zero - err. 14430Sstevel@tonic-gate * PURPOSE: open a soft partition. 14440Sstevel@tonic-gate */ 14450Sstevel@tonic-gate /* ARGSUSED */ 14460Sstevel@tonic-gate static int 14470Sstevel@tonic-gate sp_open( 14480Sstevel@tonic-gate dev_t *dev, 14490Sstevel@tonic-gate int flag, 14500Sstevel@tonic-gate int otyp, 14510Sstevel@tonic-gate cred_t *cred_p, 14520Sstevel@tonic-gate int md_oflags 14530Sstevel@tonic-gate ) 14540Sstevel@tonic-gate { 14550Sstevel@tonic-gate minor_t mnum = getminor(*dev); 14560Sstevel@tonic-gate mdi_unit_t *ui = MDI_UNIT(mnum); 14570Sstevel@tonic-gate mp_unit_t *un; 14580Sstevel@tonic-gate int err = 0; 14590Sstevel@tonic-gate set_t setno; 14600Sstevel@tonic-gate 146146Sskamm /* 146246Sskamm * When doing an open of a multi owner metadevice, check to see if this 146346Sskamm * node is a starting node and if a reconfig cycle is underway. 146446Sskamm * If so, the system isn't sufficiently set up enough to handle the 146546Sskamm * open (which involves I/O during sp_validate), so fail with ENXIO. 146646Sskamm */ 146746Sskamm setno = MD_MIN2SET(mnum); 146846Sskamm if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) == 146946Sskamm (MD_SET_MNSET | MD_SET_MN_START_RC)) { 147046Sskamm return (ENXIO); 147146Sskamm } 147246Sskamm 14730Sstevel@tonic-gate /* grab necessary locks */ 14740Sstevel@tonic-gate un = (mp_unit_t *)md_unit_openclose_enter(ui); 14750Sstevel@tonic-gate setno = MD_UN2SET(un); 14760Sstevel@tonic-gate 14770Sstevel@tonic-gate /* open underlying device, if necessary */ 14780Sstevel@tonic-gate if (! 
md_unit_isopen(ui) || (md_oflags & MD_OFLG_PROBEDEV)) { 14790Sstevel@tonic-gate if ((err = sp_open_dev(un, md_oflags)) != 0) 14800Sstevel@tonic-gate goto out; 14810Sstevel@tonic-gate 14820Sstevel@tonic-gate if (MD_MNSET_SETNO(setno)) { 14830Sstevel@tonic-gate /* For probe, don't incur the overhead of validate */ 14840Sstevel@tonic-gate if (!(md_oflags & MD_OFLG_PROBEDEV)) { 14850Sstevel@tonic-gate /* 14860Sstevel@tonic-gate * Don't call sp_validate while 14870Sstevel@tonic-gate * unit_openclose lock is held. So, actually 14880Sstevel@tonic-gate * open the device, drop openclose lock, 14890Sstevel@tonic-gate * call sp_validate, reacquire openclose lock, 14900Sstevel@tonic-gate * and close the device. If sp_validate 14910Sstevel@tonic-gate * succeeds, then device will be re-opened. 14920Sstevel@tonic-gate */ 14930Sstevel@tonic-gate if ((err = md_unit_incopen(mnum, flag, 14940Sstevel@tonic-gate otyp)) != 0) 14950Sstevel@tonic-gate goto out; 14960Sstevel@tonic-gate 14970Sstevel@tonic-gate mutex_enter(&ui->ui_mx); 14980Sstevel@tonic-gate ui->ui_lock |= MD_UL_OPENINPROGRESS; 14990Sstevel@tonic-gate mutex_exit(&ui->ui_mx); 15000Sstevel@tonic-gate md_unit_openclose_exit(ui); 15010Sstevel@tonic-gate if (otyp != OTYP_LYR) 15020Sstevel@tonic-gate rw_exit(&md_unit_array_rw.lock); 15030Sstevel@tonic-gate 15040Sstevel@tonic-gate err = sp_validate(un); 15050Sstevel@tonic-gate 15060Sstevel@tonic-gate if (otyp != OTYP_LYR) 15070Sstevel@tonic-gate rw_enter(&md_unit_array_rw.lock, 15080Sstevel@tonic-gate RW_READER); 15090Sstevel@tonic-gate (void) md_unit_openclose_enter(ui); 15100Sstevel@tonic-gate (void) md_unit_decopen(mnum, otyp); 15110Sstevel@tonic-gate mutex_enter(&ui->ui_mx); 15120Sstevel@tonic-gate ui->ui_lock &= ~MD_UL_OPENINPROGRESS; 15130Sstevel@tonic-gate cv_broadcast(&ui->ui_cv); 15140Sstevel@tonic-gate mutex_exit(&ui->ui_mx); 15150Sstevel@tonic-gate /* 15160Sstevel@tonic-gate * Should be in the same state as before 15170Sstevel@tonic-gate * the sp_validate. 
15180Sstevel@tonic-gate */ 15190Sstevel@tonic-gate if (err != 0) { 15200Sstevel@tonic-gate /* close the device opened above */ 15210Sstevel@tonic-gate md_layered_close(un->un_dev, md_oflags); 15220Sstevel@tonic-gate err = EIO; 15230Sstevel@tonic-gate goto out; 15240Sstevel@tonic-gate } 15250Sstevel@tonic-gate } 15260Sstevel@tonic-gate /* 15270Sstevel@tonic-gate * As we're a multi-owner metadevice we need to ensure 15280Sstevel@tonic-gate * that all nodes have the same idea of the status. 15290Sstevel@tonic-gate * sp_validate() will mark the device as errored (if 15300Sstevel@tonic-gate * it cannot read the watermark) or ok (if it was 15310Sstevel@tonic-gate * previously errored but the watermark is now valid). 15320Sstevel@tonic-gate * This code-path is only entered on the non-probe open 15330Sstevel@tonic-gate * so we will maintain the errored state during a probe 15340Sstevel@tonic-gate * call. This means the sys-admin must metarecover -m 15350Sstevel@tonic-gate * to reset the soft-partition error. 
15360Sstevel@tonic-gate */ 15370Sstevel@tonic-gate } else { 15380Sstevel@tonic-gate /* For probe, don't incur the overhead of validate */ 15390Sstevel@tonic-gate if (!(md_oflags & MD_OFLG_PROBEDEV) && 15400Sstevel@tonic-gate (err = sp_validate(un)) != 0) { 15410Sstevel@tonic-gate /* close the device opened above */ 15420Sstevel@tonic-gate md_layered_close(un->un_dev, md_oflags); 15430Sstevel@tonic-gate err = EIO; 15440Sstevel@tonic-gate goto out; 15450Sstevel@tonic-gate } else { 15460Sstevel@tonic-gate /* 15470Sstevel@tonic-gate * we succeeded in validating the on disk 15480Sstevel@tonic-gate * format versus the in core, so reset the 15490Sstevel@tonic-gate * status if it's in error 15500Sstevel@tonic-gate */ 15510Sstevel@tonic-gate if (un->un_status == MD_SP_ERR) { 15520Sstevel@tonic-gate un->un_status = MD_SP_OK; 15530Sstevel@tonic-gate } 15540Sstevel@tonic-gate } 15550Sstevel@tonic-gate } 15560Sstevel@tonic-gate } 15570Sstevel@tonic-gate 15580Sstevel@tonic-gate /* count open */ 15590Sstevel@tonic-gate if ((err = md_unit_incopen(mnum, flag, otyp)) != 0) 15600Sstevel@tonic-gate goto out; 15610Sstevel@tonic-gate 15620Sstevel@tonic-gate out: 15630Sstevel@tonic-gate md_unit_openclose_exit(ui); 15640Sstevel@tonic-gate return (err); 15650Sstevel@tonic-gate } 15660Sstevel@tonic-gate 15670Sstevel@tonic-gate /* 15680Sstevel@tonic-gate * FUNCTION: sp_close() 15690Sstevel@tonic-gate * INPUT: dev - device to close. 15700Sstevel@tonic-gate * flag - pass-through flag. 15710Sstevel@tonic-gate * otyp - pass-through type. 15720Sstevel@tonic-gate * cred_p - credentials. 15730Sstevel@tonic-gate * md_cflags - close flags. 15740Sstevel@tonic-gate * OUTPUT: none. 15750Sstevel@tonic-gate * RETURNS: 0 - success. 15760Sstevel@tonic-gate * non-zero - err. 15770Sstevel@tonic-gate * PURPOSE: close a soft paritition. 
15780Sstevel@tonic-gate */ 15790Sstevel@tonic-gate /* ARGSUSED */ 15800Sstevel@tonic-gate static int 15810Sstevel@tonic-gate sp_close( 15820Sstevel@tonic-gate dev_t dev, 15830Sstevel@tonic-gate int flag, 15840Sstevel@tonic-gate int otyp, 15850Sstevel@tonic-gate cred_t *cred_p, 15860Sstevel@tonic-gate int md_cflags 15870Sstevel@tonic-gate ) 15880Sstevel@tonic-gate { 15890Sstevel@tonic-gate minor_t mnum = getminor(dev); 15900Sstevel@tonic-gate mdi_unit_t *ui = MDI_UNIT(mnum); 15910Sstevel@tonic-gate mp_unit_t *un; 15920Sstevel@tonic-gate int err = 0; 15930Sstevel@tonic-gate 15940Sstevel@tonic-gate /* grab necessary locks */ 15950Sstevel@tonic-gate un = (mp_unit_t *)md_unit_openclose_enter(ui); 15960Sstevel@tonic-gate 15970Sstevel@tonic-gate /* count closed */ 15980Sstevel@tonic-gate if ((err = md_unit_decopen(mnum, otyp)) != 0) 15990Sstevel@tonic-gate goto out; 16000Sstevel@tonic-gate 16010Sstevel@tonic-gate /* close devices, if necessary */ 16020Sstevel@tonic-gate if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) { 16030Sstevel@tonic-gate md_layered_close(un->un_dev, md_cflags); 16040Sstevel@tonic-gate } 16050Sstevel@tonic-gate 16060Sstevel@tonic-gate /* 16070Sstevel@tonic-gate * If a MN set and transient capabilities (eg ABR/DMR) are set, 16080Sstevel@tonic-gate * clear these capabilities if this is the last close in 16090Sstevel@tonic-gate * the cluster 16100Sstevel@tonic-gate */ 16110Sstevel@tonic-gate if (MD_MNSET_SETNO(MD_UN2SET(un)) && 16120Sstevel@tonic-gate (ui->ui_tstate & MD_ABR_CAP)) { 16130Sstevel@tonic-gate md_unit_openclose_exit(ui); 16140Sstevel@tonic-gate mdmn_clear_all_capabilities(mnum); 16150Sstevel@tonic-gate return (0); 16160Sstevel@tonic-gate } 16170Sstevel@tonic-gate /* unlock, return success */ 16180Sstevel@tonic-gate out: 16190Sstevel@tonic-gate md_unit_openclose_exit(ui); 16200Sstevel@tonic-gate return (err); 16210Sstevel@tonic-gate } 16220Sstevel@tonic-gate 16230Sstevel@tonic-gate 16240Sstevel@tonic-gate /* used in sp_dump 
routine */ 16250Sstevel@tonic-gate static struct buf dumpbuf; 16260Sstevel@tonic-gate 16270Sstevel@tonic-gate /* 16280Sstevel@tonic-gate * FUNCTION: sp_dump() 16290Sstevel@tonic-gate * INPUT: dev - device to dump to. 16300Sstevel@tonic-gate * addr - address to dump. 16310Sstevel@tonic-gate * blkno - blkno on device. 16320Sstevel@tonic-gate * nblk - number of blocks to dump. 16330Sstevel@tonic-gate * OUTPUT: none. 16340Sstevel@tonic-gate * RETURNS: result from bdev_dump. 16350Sstevel@tonic-gate * PURPOSE: This routine dumps memory to the disk. It assumes that 16360Sstevel@tonic-gate * the memory has already been mapped into mainbus space. 16370Sstevel@tonic-gate * It is called at disk interrupt priority when the system 16380Sstevel@tonic-gate * is in trouble. 16390Sstevel@tonic-gate * NOTE: this function is defined using 32-bit arguments, 16400Sstevel@tonic-gate * but soft partitioning is internally 64-bit. Arguments 16410Sstevel@tonic-gate * are casted where appropriate. 16420Sstevel@tonic-gate */ 16430Sstevel@tonic-gate static int 16440Sstevel@tonic-gate sp_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk) 16450Sstevel@tonic-gate { 16460Sstevel@tonic-gate mp_unit_t *un; 16470Sstevel@tonic-gate buf_t *bp; 16480Sstevel@tonic-gate sp_ext_length_t nb; 16490Sstevel@tonic-gate daddr_t mapblk; 16500Sstevel@tonic-gate int result; 16510Sstevel@tonic-gate int more; 16520Sstevel@tonic-gate int saveresult = 0; 16530Sstevel@tonic-gate 16540Sstevel@tonic-gate /* 16550Sstevel@tonic-gate * Don't need to grab the unit lock. 16560Sstevel@tonic-gate * Cause nothing else is supposed to be happenning. 16570Sstevel@tonic-gate * Also dump is not supposed to sleep. 
16580Sstevel@tonic-gate */ 16590Sstevel@tonic-gate un = (mp_unit_t *)MD_UNIT(getminor(dev)); 16600Sstevel@tonic-gate 16610Sstevel@tonic-gate if ((diskaddr_t)blkno >= un->c.un_total_blocks) 16620Sstevel@tonic-gate return (EINVAL); 16630Sstevel@tonic-gate 16640Sstevel@tonic-gate if (((diskaddr_t)blkno + nblk) > un->c.un_total_blocks) 16650Sstevel@tonic-gate return (EINVAL); 16660Sstevel@tonic-gate 16670Sstevel@tonic-gate bp = &dumpbuf; 16680Sstevel@tonic-gate nb = (sp_ext_length_t)dbtob(nblk); 16690Sstevel@tonic-gate do { 16700Sstevel@tonic-gate bzero((caddr_t)bp, sizeof (*bp)); 16710Sstevel@tonic-gate more = sp_mapbuf(un, (sp_ext_offset_t)blkno, nb, bp); 16720Sstevel@tonic-gate nblk = (int)(btodb(bp->b_bcount)); 16730Sstevel@tonic-gate mapblk = bp->b_blkno; 16740Sstevel@tonic-gate result = bdev_dump(bp->b_edev, addr, mapblk, nblk); 16750Sstevel@tonic-gate if (result) 16760Sstevel@tonic-gate saveresult = result; 16770Sstevel@tonic-gate 16780Sstevel@tonic-gate nb -= bp->b_bcount; 16790Sstevel@tonic-gate addr += bp->b_bcount; 16800Sstevel@tonic-gate blkno += nblk; 16810Sstevel@tonic-gate } while (more); 16820Sstevel@tonic-gate 16830Sstevel@tonic-gate return (saveresult); 16840Sstevel@tonic-gate } 16850Sstevel@tonic-gate 16860Sstevel@tonic-gate static int 16870Sstevel@tonic-gate sp_imp_set( 16880Sstevel@tonic-gate set_t setno 16890Sstevel@tonic-gate ) 16900Sstevel@tonic-gate { 16910Sstevel@tonic-gate mddb_recid_t recid; 16920Sstevel@tonic-gate int gotsomething; 16930Sstevel@tonic-gate mddb_type_t rec_type; 16940Sstevel@tonic-gate mddb_de_ic_t *dep; 16950Sstevel@tonic-gate mddb_rb32_t *rbp; 16960Sstevel@tonic-gate mp_unit_t *un64; 16970Sstevel@tonic-gate mp_unit32_od_t *un32; 16980Sstevel@tonic-gate minor_t *self_id; /* minor needs to be updated */ 16990Sstevel@tonic-gate md_parent_t *parent_id; /* parent needs to be updated */ 17000Sstevel@tonic-gate mddb_recid_t *record_id; /* record id needs to be updated */ 17010Sstevel@tonic-gate 17020Sstevel@tonic-gate gotsomething 
= 0; 17030Sstevel@tonic-gate 17040Sstevel@tonic-gate rec_type = (mddb_type_t)md_getshared_key(setno, 17050Sstevel@tonic-gate sp_md_ops.md_driver.md_drivername); 17060Sstevel@tonic-gate recid = mddb_makerecid(setno, 0); 17070Sstevel@tonic-gate 17080Sstevel@tonic-gate while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0) { 17090Sstevel@tonic-gate if (mddb_getrecprivate(recid) & MD_PRV_GOTIT) 17100Sstevel@tonic-gate continue; 17110Sstevel@tonic-gate 17120Sstevel@tonic-gate dep = mddb_getrecdep(recid); 17130Sstevel@tonic-gate rbp = dep->de_rb; 17140Sstevel@tonic-gate 17150Sstevel@tonic-gate if (rbp->rb_revision == MDDB_REV_RB) { 17160Sstevel@tonic-gate /* 17170Sstevel@tonic-gate * Small device 17180Sstevel@tonic-gate */ 17190Sstevel@tonic-gate un32 = (mp_unit32_od_t *)mddb_getrecaddr(recid); 17200Sstevel@tonic-gate self_id = &(un32->c.un_self_id); 17210Sstevel@tonic-gate parent_id = &(un32->c.un_parent); 17220Sstevel@tonic-gate record_id = &(un32->c.un_record_id); 17230Sstevel@tonic-gate 17240Sstevel@tonic-gate if (!md_update_minor(setno, mddb_getsidenum 17250Sstevel@tonic-gate (setno), un32->un_key)) 17260Sstevel@tonic-gate goto out; 17270Sstevel@tonic-gate } else { 17280Sstevel@tonic-gate un64 = (mp_unit_t *)mddb_getrecaddr(recid); 17290Sstevel@tonic-gate self_id = &(un64->c.un_self_id); 17300Sstevel@tonic-gate parent_id = &(un64->c.un_parent); 17310Sstevel@tonic-gate record_id = &(un64->c.un_record_id); 17320Sstevel@tonic-gate 17330Sstevel@tonic-gate if (!md_update_minor(setno, mddb_getsidenum 17340Sstevel@tonic-gate (setno), un64->un_key)) 17350Sstevel@tonic-gate goto out; 17360Sstevel@tonic-gate } 17370Sstevel@tonic-gate 17380Sstevel@tonic-gate /* 17390Sstevel@tonic-gate * Update unit with the imported setno 17400Sstevel@tonic-gate * 17410Sstevel@tonic-gate */ 17420Sstevel@tonic-gate mddb_setrecprivate(recid, MD_PRV_GOTIT); 17430Sstevel@tonic-gate 17440Sstevel@tonic-gate *self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id)); 17450Sstevel@tonic-gate if (*parent_id 
!= MD_NO_PARENT) 17460Sstevel@tonic-gate *parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id)); 17470Sstevel@tonic-gate *record_id = MAKERECID(setno, DBID(*record_id)); 17480Sstevel@tonic-gate 17490Sstevel@tonic-gate gotsomething = 1; 17500Sstevel@tonic-gate } 17510Sstevel@tonic-gate 17520Sstevel@tonic-gate out: 17530Sstevel@tonic-gate return (gotsomething); 17540Sstevel@tonic-gate } 17550Sstevel@tonic-gate 17560Sstevel@tonic-gate static md_named_services_t sp_named_services[] = { 17570Sstevel@tonic-gate {NULL, 0} 17580Sstevel@tonic-gate }; 17590Sstevel@tonic-gate 17600Sstevel@tonic-gate md_ops_t sp_md_ops = { 17610Sstevel@tonic-gate sp_open, /* open */ 17620Sstevel@tonic-gate sp_close, /* close */ 17630Sstevel@tonic-gate md_sp_strategy, /* strategy */ 17640Sstevel@tonic-gate NULL, /* print */ 17650Sstevel@tonic-gate sp_dump, /* dump */ 17660Sstevel@tonic-gate NULL, /* read */ 17670Sstevel@tonic-gate NULL, /* write */ 17680Sstevel@tonic-gate md_sp_ioctl, /* ioctl, */ 17690Sstevel@tonic-gate sp_snarf, /* snarf */ 17700Sstevel@tonic-gate sp_halt, /* halt */ 17710Sstevel@tonic-gate NULL, /* aread */ 17720Sstevel@tonic-gate NULL, /* awrite */ 17730Sstevel@tonic-gate sp_imp_set, /* import set */ 17740Sstevel@tonic-gate sp_named_services 17750Sstevel@tonic-gate }; 17760Sstevel@tonic-gate 17770Sstevel@tonic-gate static void 17780Sstevel@tonic-gate init_init() 17790Sstevel@tonic-gate { 17800Sstevel@tonic-gate sp_parent_cache = kmem_cache_create("md_softpart_parent", 17810Sstevel@tonic-gate sizeof (md_spps_t), 0, sp_parent_constructor, 17820Sstevel@tonic-gate sp_parent_destructor, sp_run_queue, NULL, NULL, 0); 17830Sstevel@tonic-gate sp_child_cache = kmem_cache_create("md_softpart_child", 17840Sstevel@tonic-gate sizeof (md_spcs_t) - sizeof (buf_t) + biosize(), 0, 17850Sstevel@tonic-gate sp_child_constructor, sp_child_destructor, sp_run_queue, 17860Sstevel@tonic-gate NULL, NULL, 0); 17870Sstevel@tonic-gate } 17880Sstevel@tonic-gate 17890Sstevel@tonic-gate static void 
17900Sstevel@tonic-gate fini_uninit() 17910Sstevel@tonic-gate { 17920Sstevel@tonic-gate kmem_cache_destroy(sp_parent_cache); 17930Sstevel@tonic-gate kmem_cache_destroy(sp_child_cache); 17940Sstevel@tonic-gate sp_parent_cache = sp_child_cache = NULL; 17950Sstevel@tonic-gate } 17960Sstevel@tonic-gate 17970Sstevel@tonic-gate /* define the module linkage */ 17980Sstevel@tonic-gate MD_PLUGIN_MISC_MODULE("soft partition module %I%", init_init(), fini_uninit()) 1799