10Sstevel@tonic-gate /* 20Sstevel@tonic-gate * CDDL HEADER START 30Sstevel@tonic-gate * 40Sstevel@tonic-gate * The contents of this file are subject to the terms of the 51366Spetede * Common Development and Distribution License (the "License"). 61366Spetede * You may not use this file except in compliance with the License. 70Sstevel@tonic-gate * 80Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 90Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing. 100Sstevel@tonic-gate * See the License for the specific language governing permissions 110Sstevel@tonic-gate * and limitations under the License. 120Sstevel@tonic-gate * 130Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each 140Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 150Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the 160Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying 170Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner] 180Sstevel@tonic-gate * 190Sstevel@tonic-gate * CDDL HEADER END 200Sstevel@tonic-gate */ 21*7627SChris.Horne@Sun.COM 220Sstevel@tonic-gate /* 23*7627SChris.Horne@Sun.COM * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 240Sstevel@tonic-gate * Use is subject to license terms. 250Sstevel@tonic-gate */ 260Sstevel@tonic-gate 270Sstevel@tonic-gate /* 280Sstevel@tonic-gate * Soft partitioning metadevice driver (md_sp). 290Sstevel@tonic-gate * 300Sstevel@tonic-gate * This file contains the primary operations of the soft partitioning 310Sstevel@tonic-gate * metadevice driver. This includes all routines for normal operation 320Sstevel@tonic-gate * (open/close/read/write). Please see mdvar.h for a definition of 330Sstevel@tonic-gate * metadevice operations vector (md_ops_t). 
This driver is loosely 340Sstevel@tonic-gate * based on the stripe driver (md_stripe). 350Sstevel@tonic-gate * 360Sstevel@tonic-gate * All metadevice administration is done through the use of ioctl's. 370Sstevel@tonic-gate * As such, all administrative routines appear in sp_ioctl.c. 380Sstevel@tonic-gate * 390Sstevel@tonic-gate * Soft partitions are represented both in-core and in the metadb with a 400Sstevel@tonic-gate * unit structure. The soft partition-specific information in the unit 410Sstevel@tonic-gate * structure includes the following information: 420Sstevel@tonic-gate * - Device information (md_dev64_t & md key) about the device on which 430Sstevel@tonic-gate * the soft partition is built. 440Sstevel@tonic-gate * - Soft partition status information. 450Sstevel@tonic-gate * - The size of the soft partition and number of extents used to 460Sstevel@tonic-gate * make up that size. 470Sstevel@tonic-gate * - An array of extents which define virtual/physical offset 480Sstevel@tonic-gate * mappings and lengths for each extent. 490Sstevel@tonic-gate * 500Sstevel@tonic-gate * Typical soft partition operation proceeds as follows: 510Sstevel@tonic-gate * - The unit structure is fetched from the metadb and placed into 520Sstevel@tonic-gate * an in-core array (as with other metadevices). This operation 530Sstevel@tonic-gate * is performed via sp_build_incore( ) and takes place during 540Sstevel@tonic-gate * "snarfing" (when all metadevices are brought in-core at 550Sstevel@tonic-gate * once) and when a new soft partition is created. 560Sstevel@tonic-gate * - A soft partition is opened via sp_open( ). At open time the 570Sstevel@tonic-gate * soft partition unit structure is verified with the soft 580Sstevel@tonic-gate * partition on-disk structures. Additionally, the soft partition 590Sstevel@tonic-gate * status is checked (only soft partitions in the OK state may be 600Sstevel@tonic-gate * opened). 
610Sstevel@tonic-gate * - Soft partition I/O is performed via sp_strategy( ) which relies on 620Sstevel@tonic-gate * a support routine, sp_mapbuf( ), to do most of the work. 630Sstevel@tonic-gate * sp_mapbuf( ) maps a buffer to a particular extent via a binary 640Sstevel@tonic-gate * search of the extent array in the soft partition unit structure. 650Sstevel@tonic-gate * Once a translation has been performed, the I/O is passed down 660Sstevel@tonic-gate * to the next layer, which may be another metadevice or a physical 670Sstevel@tonic-gate * disk. Since a soft partition may contain multiple, non-contiguous 680Sstevel@tonic-gate * extents, a single I/O may have to be fragmented. 690Sstevel@tonic-gate * - Soft partitions are closed using sp_close. 700Sstevel@tonic-gate * 710Sstevel@tonic-gate */ 720Sstevel@tonic-gate 730Sstevel@tonic-gate #include <sys/param.h> 740Sstevel@tonic-gate #include <sys/systm.h> 750Sstevel@tonic-gate #include <sys/conf.h> 760Sstevel@tonic-gate #include <sys/file.h> 770Sstevel@tonic-gate #include <sys/user.h> 780Sstevel@tonic-gate #include <sys/uio.h> 790Sstevel@tonic-gate #include <sys/t_lock.h> 800Sstevel@tonic-gate #include <sys/buf.h> 810Sstevel@tonic-gate #include <sys/dkio.h> 820Sstevel@tonic-gate #include <sys/vtoc.h> 830Sstevel@tonic-gate #include <sys/kmem.h> 840Sstevel@tonic-gate #include <vm/page.h> 850Sstevel@tonic-gate #include <sys/cmn_err.h> 860Sstevel@tonic-gate #include <sys/sysmacros.h> 870Sstevel@tonic-gate #include <sys/types.h> 880Sstevel@tonic-gate #include <sys/mkdev.h> 890Sstevel@tonic-gate #include <sys/stat.h> 900Sstevel@tonic-gate #include <sys/open.h> 910Sstevel@tonic-gate #include <sys/lvm/mdvar.h> 920Sstevel@tonic-gate #include <sys/lvm/md_sp.h> 930Sstevel@tonic-gate #include <sys/lvm/md_convert.h> 940Sstevel@tonic-gate #include <sys/lvm/md_notify.h> 950Sstevel@tonic-gate #include <sys/lvm/md_crc.h> 960Sstevel@tonic-gate #include <sys/modctl.h> 970Sstevel@tonic-gate #include <sys/ddi.h> 980Sstevel@tonic-gate 
#include <sys/sunddi.h> 990Sstevel@tonic-gate #include <sys/debug.h> 1000Sstevel@tonic-gate 1010Sstevel@tonic-gate #include <sys/sysevent/eventdefs.h> 1020Sstevel@tonic-gate #include <sys/sysevent/svm.h> 1030Sstevel@tonic-gate 1040Sstevel@tonic-gate md_ops_t sp_md_ops; 1050Sstevel@tonic-gate #ifndef lint 1061366Spetede char _depends_on[] = "drv/md"; 1070Sstevel@tonic-gate md_ops_t *md_interface_ops = &sp_md_ops; 1080Sstevel@tonic-gate #endif 1090Sstevel@tonic-gate 1100Sstevel@tonic-gate extern unit_t md_nunits; 1110Sstevel@tonic-gate extern set_t md_nsets; 1120Sstevel@tonic-gate extern md_set_t md_set[]; 1130Sstevel@tonic-gate 1140Sstevel@tonic-gate extern int md_status; 1150Sstevel@tonic-gate extern major_t md_major; 1160Sstevel@tonic-gate extern mdq_anchor_t md_done_daemon; 1170Sstevel@tonic-gate extern mdq_anchor_t md_sp_daemon; 1180Sstevel@tonic-gate extern kmutex_t md_mx; 1190Sstevel@tonic-gate extern kcondvar_t md_cv; 1200Sstevel@tonic-gate extern md_krwlock_t md_unit_array_rw; 1210Sstevel@tonic-gate 1220Sstevel@tonic-gate static kmem_cache_t *sp_parent_cache = NULL; 1230Sstevel@tonic-gate static kmem_cache_t *sp_child_cache = NULL; 1240Sstevel@tonic-gate static void sp_send_stat_ok(mp_unit_t *); 1250Sstevel@tonic-gate static void sp_send_stat_err(mp_unit_t *); 1260Sstevel@tonic-gate 1270Sstevel@tonic-gate /* 1280Sstevel@tonic-gate * FUNCTION: sp_parent_constructor() 1290Sstevel@tonic-gate * INPUT: none. 1300Sstevel@tonic-gate * OUTPUT: ps - parent save structure initialized. 1310Sstevel@tonic-gate * RETURNS: void * - ptr to initialized parent save structure. 1320Sstevel@tonic-gate * PURPOSE: initialize parent save structure. 
1330Sstevel@tonic-gate */ 1340Sstevel@tonic-gate /*ARGSUSED1*/ 1350Sstevel@tonic-gate static int 1360Sstevel@tonic-gate sp_parent_constructor(void *p, void *d1, int d2) 1370Sstevel@tonic-gate { 1380Sstevel@tonic-gate mutex_init(&((md_spps_t *)p)->ps_mx, 1390Sstevel@tonic-gate NULL, MUTEX_DEFAULT, NULL); 1400Sstevel@tonic-gate return (0); 1410Sstevel@tonic-gate } 1420Sstevel@tonic-gate 1430Sstevel@tonic-gate static void 1440Sstevel@tonic-gate sp_parent_init(md_spps_t *ps) 1450Sstevel@tonic-gate { 1460Sstevel@tonic-gate bzero(ps, offsetof(md_spps_t, ps_mx)); 1470Sstevel@tonic-gate } 1480Sstevel@tonic-gate 1490Sstevel@tonic-gate /*ARGSUSED1*/ 1500Sstevel@tonic-gate static void 1510Sstevel@tonic-gate sp_parent_destructor(void *p, void *d) 1520Sstevel@tonic-gate { 1530Sstevel@tonic-gate mutex_destroy(&((md_spps_t *)p)->ps_mx); 1540Sstevel@tonic-gate } 1550Sstevel@tonic-gate 1560Sstevel@tonic-gate /* 1570Sstevel@tonic-gate * FUNCTION: sp_child_constructor() 1580Sstevel@tonic-gate * INPUT: none. 1590Sstevel@tonic-gate * OUTPUT: cs - child save structure initialized. 1600Sstevel@tonic-gate * RETURNS: void * - ptr to initialized child save structure. 1610Sstevel@tonic-gate * PURPOSE: initialize child save structure. 
1620Sstevel@tonic-gate */ 1630Sstevel@tonic-gate /*ARGSUSED1*/ 1640Sstevel@tonic-gate static int 1650Sstevel@tonic-gate sp_child_constructor(void *p, void *d1, int d2) 1660Sstevel@tonic-gate { 1670Sstevel@tonic-gate bioinit(&((md_spcs_t *)p)->cs_buf); 1680Sstevel@tonic-gate return (0); 1690Sstevel@tonic-gate } 1700Sstevel@tonic-gate 1710Sstevel@tonic-gate static void 1720Sstevel@tonic-gate sp_child_init(md_spcs_t *cs) 1730Sstevel@tonic-gate { 1740Sstevel@tonic-gate cs->cs_mdunit = 0; 1750Sstevel@tonic-gate cs->cs_ps = NULL; 1760Sstevel@tonic-gate md_bioreset(&cs->cs_buf); 1770Sstevel@tonic-gate } 1780Sstevel@tonic-gate 1790Sstevel@tonic-gate /*ARGSUSED1*/ 1800Sstevel@tonic-gate static void 1810Sstevel@tonic-gate sp_child_destructor(void *p, void *d) 1820Sstevel@tonic-gate { 1830Sstevel@tonic-gate biofini(&((md_spcs_t *)p)->cs_buf); 1840Sstevel@tonic-gate } 1850Sstevel@tonic-gate 1860Sstevel@tonic-gate /* 1870Sstevel@tonic-gate * FUNCTION: sp_run_queue() 1880Sstevel@tonic-gate * INPUT: none. 1890Sstevel@tonic-gate * OUTPUT: none. 1900Sstevel@tonic-gate * RETURNS: void. 1910Sstevel@tonic-gate * PURPOSE: run the md_daemon to clean up memory pool. 1920Sstevel@tonic-gate */ 1930Sstevel@tonic-gate /*ARGSUSED*/ 1940Sstevel@tonic-gate static void 1950Sstevel@tonic-gate sp_run_queue(void *d) 1960Sstevel@tonic-gate { 1970Sstevel@tonic-gate if (!(md_status & MD_GBL_DAEMONS_LIVE)) 1980Sstevel@tonic-gate md_daemon(1, &md_done_daemon); 1990Sstevel@tonic-gate } 2000Sstevel@tonic-gate 2010Sstevel@tonic-gate 2020Sstevel@tonic-gate /* 2030Sstevel@tonic-gate * FUNCTION: sp_build_incore() 2040Sstevel@tonic-gate * INPUT: p - ptr to unit structure. 2050Sstevel@tonic-gate * snarfing - flag to tell us we are snarfing. 2060Sstevel@tonic-gate * OUTPUT: non. 2070Sstevel@tonic-gate * RETURNS: int - 0 (always). 2080Sstevel@tonic-gate * PURPOSE: place unit structure into in-core unit array (keyed from 2090Sstevel@tonic-gate * minor number). 
2100Sstevel@tonic-gate */ 2110Sstevel@tonic-gate int 2120Sstevel@tonic-gate sp_build_incore(void *p, int snarfing) 2130Sstevel@tonic-gate { 2140Sstevel@tonic-gate mp_unit_t *un = (mp_unit_t *)p; 2150Sstevel@tonic-gate minor_t mnum; 2160Sstevel@tonic-gate set_t setno; 2170Sstevel@tonic-gate md_dev64_t tmpdev; 2180Sstevel@tonic-gate 2190Sstevel@tonic-gate mnum = MD_SID(un); 2200Sstevel@tonic-gate 2210Sstevel@tonic-gate if (MD_UNIT(mnum) != NULL) 2220Sstevel@tonic-gate return (0); 2230Sstevel@tonic-gate 2240Sstevel@tonic-gate MD_STATUS(un) = 0; 2250Sstevel@tonic-gate 2260Sstevel@tonic-gate if (snarfing) { 2270Sstevel@tonic-gate /* 2280Sstevel@tonic-gate * if we are snarfing, we get the device information 2290Sstevel@tonic-gate * from the metadb record (using the metadb key for 2300Sstevel@tonic-gate * that device). 2310Sstevel@tonic-gate */ 2320Sstevel@tonic-gate setno = MD_MIN2SET(mnum); 2330Sstevel@tonic-gate 2340Sstevel@tonic-gate tmpdev = md_getdevnum(setno, mddb_getsidenum(setno), 2350Sstevel@tonic-gate un->un_key, MD_NOTRUST_DEVT); 2360Sstevel@tonic-gate un->un_dev = tmpdev; 2370Sstevel@tonic-gate } 2380Sstevel@tonic-gate 239*7627SChris.Horne@Sun.COM /* place various information in the in-core data structures */ 240*7627SChris.Horne@Sun.COM md_nblocks_set(mnum, un->c.un_total_blocks); 2410Sstevel@tonic-gate MD_UNIT(mnum) = un; 242*7627SChris.Horne@Sun.COM 2430Sstevel@tonic-gate return (0); 2440Sstevel@tonic-gate } 2450Sstevel@tonic-gate 2460Sstevel@tonic-gate /* 2470Sstevel@tonic-gate * FUNCTION: reset_sp() 2480Sstevel@tonic-gate * INPUT: un - unit structure to be reset/removed. 2490Sstevel@tonic-gate * mnum - minor number to be reset/removed. 2500Sstevel@tonic-gate * removing - flag to tell us if we are removing 2510Sstevel@tonic-gate * permanently or just reseting in-core 2520Sstevel@tonic-gate * structures. 2530Sstevel@tonic-gate * OUTPUT: none. 2540Sstevel@tonic-gate * RETURNS: void. 
2550Sstevel@tonic-gate * PURPOSE: used to either simply reset in-core structures or to 2560Sstevel@tonic-gate * permanently remove metadevices from the metadb. 2570Sstevel@tonic-gate */ 2580Sstevel@tonic-gate void 2590Sstevel@tonic-gate reset_sp(mp_unit_t *un, minor_t mnum, int removing) 2600Sstevel@tonic-gate { 2610Sstevel@tonic-gate sv_dev_t *sv; 2620Sstevel@tonic-gate mddb_recid_t vtoc_id; 2630Sstevel@tonic-gate 2640Sstevel@tonic-gate /* clean up in-core structures */ 2650Sstevel@tonic-gate md_destroy_unit_incore(mnum, &sp_md_ops); 2660Sstevel@tonic-gate 267*7627SChris.Horne@Sun.COM md_nblocks_set(mnum, -1ULL); 2680Sstevel@tonic-gate MD_UNIT(mnum) = NULL; 2690Sstevel@tonic-gate 2701623Stw21770 /* 2711623Stw21770 * Attempt release of minor node 2721623Stw21770 */ 2732077Stw21770 md_remove_minor_node(mnum); 2741623Stw21770 2750Sstevel@tonic-gate if (!removing) 2760Sstevel@tonic-gate return; 2770Sstevel@tonic-gate 2780Sstevel@tonic-gate /* we are removing the soft partition from the metadb */ 2790Sstevel@tonic-gate 2800Sstevel@tonic-gate /* 2810Sstevel@tonic-gate * Save off device information so we can get to 2820Sstevel@tonic-gate * it after we do the mddb_deleterec(). 
2830Sstevel@tonic-gate */ 2840Sstevel@tonic-gate sv = (sv_dev_t *)kmem_alloc(sizeof (sv_dev_t), KM_SLEEP); 2850Sstevel@tonic-gate sv->setno = MD_MIN2SET(mnum); 2860Sstevel@tonic-gate sv->key = un->un_key; 2870Sstevel@tonic-gate vtoc_id = un->c.un_vtoc_id; 2880Sstevel@tonic-gate 2891623Stw21770 /* 2901623Stw21770 * Remove self from the namespace 2911623Stw21770 */ 2921623Stw21770 if (un->c.un_revision & MD_FN_META_DEV) { 2931623Stw21770 (void) md_rem_selfname(un->c.un_self_id); 2941623Stw21770 } 2951623Stw21770 2960Sstevel@tonic-gate /* Remove the unit structure */ 2970Sstevel@tonic-gate mddb_deleterec_wrapper(un->c.un_record_id); 2980Sstevel@tonic-gate 2990Sstevel@tonic-gate if (vtoc_id) 3000Sstevel@tonic-gate mddb_deleterec_wrapper(vtoc_id); 3010Sstevel@tonic-gate 3020Sstevel@tonic-gate SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, TAG_METADEVICE, 3030Sstevel@tonic-gate MD_MIN2SET(mnum), MD_MIN2UNIT(mnum)); 3040Sstevel@tonic-gate 3050Sstevel@tonic-gate /* 3060Sstevel@tonic-gate * remove the underlying device name from the metadb. if other 3070Sstevel@tonic-gate * soft partitions are built on this device, this will simply 3080Sstevel@tonic-gate * decrease the reference count for this device. otherwise the 3090Sstevel@tonic-gate * name record for this device will be removed from the metadb. 3100Sstevel@tonic-gate */ 3110Sstevel@tonic-gate md_rem_names(sv, 1); 3120Sstevel@tonic-gate kmem_free(sv, sizeof (sv_dev_t)); 3130Sstevel@tonic-gate } 3140Sstevel@tonic-gate 3150Sstevel@tonic-gate /* 3160Sstevel@tonic-gate * FUNCTION: sp_send_stat_msg 3170Sstevel@tonic-gate * INPUT: un - unit reference 3180Sstevel@tonic-gate * status - status to be sent to master node 3190Sstevel@tonic-gate * MD_SP_OK - soft-partition is now OK 3200Sstevel@tonic-gate * MD_SP_ERR " " errored 3210Sstevel@tonic-gate * OUTPUT: none. 3220Sstevel@tonic-gate * RETURNS: void. 3230Sstevel@tonic-gate * PURPOSE: send a soft-partition status change to the master node. 
If the 3240Sstevel@tonic-gate * message succeeds we simply return. If it fails we panic as the 3250Sstevel@tonic-gate * cluster-wide view of the metadevices is now inconsistent. 3260Sstevel@tonic-gate * CALLING CONTEXT: 3270Sstevel@tonic-gate * Blockable. No locks can be held. 3280Sstevel@tonic-gate */ 3290Sstevel@tonic-gate static void 3300Sstevel@tonic-gate sp_send_stat_msg(mp_unit_t *un, sp_status_t status) 3310Sstevel@tonic-gate { 3320Sstevel@tonic-gate md_mn_msg_sp_setstat_t sp_msg; 3330Sstevel@tonic-gate md_mn_kresult_t *kres; 3340Sstevel@tonic-gate set_t setno = MD_UN2SET(un); 3350Sstevel@tonic-gate int rval; 3360Sstevel@tonic-gate const char *str = (status == MD_SP_ERR) ? "MD_SP_ERR" : "MD_SP_OK"; 3370Sstevel@tonic-gate 3380Sstevel@tonic-gate sp_msg.sp_setstat_mnum = MD_SID(un); 3390Sstevel@tonic-gate sp_msg.sp_setstat_status = status; 3400Sstevel@tonic-gate 3410Sstevel@tonic-gate kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); 3420Sstevel@tonic-gate 3430Sstevel@tonic-gate rval = mdmn_ksend_message(setno, MD_MN_MSG_SP_SETSTAT2, MD_MSGF_NO_LOG, 3440Sstevel@tonic-gate (char *)&sp_msg, sizeof (sp_msg), kres); 3450Sstevel@tonic-gate 3460Sstevel@tonic-gate if (!MDMN_KSEND_MSG_OK(rval, kres)) { 3470Sstevel@tonic-gate mdmn_ksend_show_error(rval, kres, "MD_MN_MSG_SP_SETSTAT2"); 3480Sstevel@tonic-gate 3490Sstevel@tonic-gate /* 3500Sstevel@tonic-gate * Panic as we are now in an inconsistent state. 3510Sstevel@tonic-gate */ 3520Sstevel@tonic-gate 3530Sstevel@tonic-gate cmn_err(CE_PANIC, "md: %s: %s could not be set on all nodes\n", 3540Sstevel@tonic-gate md_shortname(MD_SID(un)), str); 3550Sstevel@tonic-gate } 3560Sstevel@tonic-gate 3570Sstevel@tonic-gate kmem_free(kres, sizeof (md_mn_kresult_t)); 3580Sstevel@tonic-gate } 3590Sstevel@tonic-gate 3600Sstevel@tonic-gate /* 3610Sstevel@tonic-gate * FUNCTION: sp_finish_error 3620Sstevel@tonic-gate * INPUT: ps - parent save structure for error-ed I/O. 
3630Sstevel@tonic-gate * lock_held - set if the unit readerlock is held 3640Sstevel@tonic-gate * OUTPUT: none. 3650Sstevel@tonic-gate * RETURNS: void. 3660Sstevel@tonic-gate * PURPOSE: report a driver error 3670Sstevel@tonic-gate */ 3680Sstevel@tonic-gate static void 3690Sstevel@tonic-gate sp_finish_error(md_spps_t *ps, int lock_held) 3700Sstevel@tonic-gate { 3710Sstevel@tonic-gate struct buf *pb = ps->ps_bp; 3720Sstevel@tonic-gate mdi_unit_t *ui = ps->ps_ui; 3730Sstevel@tonic-gate md_dev64_t un_dev; /* underlying device */ 3740Sstevel@tonic-gate md_dev64_t md_dev = md_expldev(pb->b_edev); /* metadev in error */ 3750Sstevel@tonic-gate char *str; 3760Sstevel@tonic-gate 3770Sstevel@tonic-gate un_dev = md_expldev(ps->ps_un->un_dev); 3780Sstevel@tonic-gate /* set error type */ 3790Sstevel@tonic-gate if (pb->b_flags & B_READ) { 3800Sstevel@tonic-gate str = "read"; 3810Sstevel@tonic-gate } else { 3820Sstevel@tonic-gate str = "write"; 3830Sstevel@tonic-gate } 3840Sstevel@tonic-gate 3850Sstevel@tonic-gate 3860Sstevel@tonic-gate SPPS_FREE(sp_parent_cache, ps); 3870Sstevel@tonic-gate pb->b_flags |= B_ERROR; 3880Sstevel@tonic-gate 3890Sstevel@tonic-gate md_kstat_done(ui, pb, 0); 3900Sstevel@tonic-gate 3910Sstevel@tonic-gate if (lock_held) { 3920Sstevel@tonic-gate md_unit_readerexit(ui); 3930Sstevel@tonic-gate } 3940Sstevel@tonic-gate md_biodone(pb); 3950Sstevel@tonic-gate 3960Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: %s error on %s", 3970Sstevel@tonic-gate md_shortname(md_getminor(md_dev)), str, 3980Sstevel@tonic-gate md_devname(MD_DEV2SET(md_dev), un_dev, NULL, 0)); 3990Sstevel@tonic-gate } 4000Sstevel@tonic-gate 4010Sstevel@tonic-gate 4020Sstevel@tonic-gate /* 4030Sstevel@tonic-gate * FUNCTION: sp_xmit_ok 4040Sstevel@tonic-gate * INPUT: dq - daemon queue referencing failing ps structure 4050Sstevel@tonic-gate * OUTPUT: none. 4060Sstevel@tonic-gate * RETURNS: void. 
4070Sstevel@tonic-gate * PURPOSE: send a message to the master node in a multi-owner diskset to 4080Sstevel@tonic-gate * update all attached nodes view of the soft-part to be MD_SP_OK. 4090Sstevel@tonic-gate * CALLING CONTEXT: 4100Sstevel@tonic-gate * Blockable. No unit lock held. 4110Sstevel@tonic-gate */ 4120Sstevel@tonic-gate static void 4130Sstevel@tonic-gate sp_xmit_ok(daemon_queue_t *dq) 4140Sstevel@tonic-gate { 4150Sstevel@tonic-gate md_spps_t *ps = (md_spps_t *)dq; 4160Sstevel@tonic-gate 4170Sstevel@tonic-gate /* Send a MD_MN_MSG_SP_SETSTAT to the master */ 4180Sstevel@tonic-gate sp_send_stat_msg(ps->ps_un, MD_SP_OK); 4190Sstevel@tonic-gate 4200Sstevel@tonic-gate /* 4210Sstevel@tonic-gate * Successfully transmitted error state to all nodes, now release this 4220Sstevel@tonic-gate * parent structure. 4230Sstevel@tonic-gate */ 4240Sstevel@tonic-gate SPPS_FREE(sp_parent_cache, ps); 4250Sstevel@tonic-gate } 4260Sstevel@tonic-gate 4270Sstevel@tonic-gate /* 4280Sstevel@tonic-gate * FUNCTION: sp_xmit_error 4290Sstevel@tonic-gate * INPUT: dq - daemon queue referencing failing ps structure 4300Sstevel@tonic-gate * OUTPUT: none. 4310Sstevel@tonic-gate * RETURNS: void. 4320Sstevel@tonic-gate * PURPOSE: send a message to the master node in a multi-owner diskset to 4330Sstevel@tonic-gate * update all attached nodes view of the soft-part to be MD_SP_ERR. 4340Sstevel@tonic-gate * CALLING CONTEXT: 4350Sstevel@tonic-gate * Blockable. No unit lock held. 
4360Sstevel@tonic-gate */ 4370Sstevel@tonic-gate static void 4380Sstevel@tonic-gate sp_xmit_error(daemon_queue_t *dq) 4390Sstevel@tonic-gate { 4400Sstevel@tonic-gate md_spps_t *ps = (md_spps_t *)dq; 4410Sstevel@tonic-gate 4420Sstevel@tonic-gate /* Send a MD_MN_MSG_SP_SETSTAT to the master */ 4430Sstevel@tonic-gate sp_send_stat_msg(ps->ps_un, MD_SP_ERR); 4440Sstevel@tonic-gate 4450Sstevel@tonic-gate /* 4460Sstevel@tonic-gate * Successfully transmitted error state to all nodes, now release this 4470Sstevel@tonic-gate * parent structure. 4480Sstevel@tonic-gate */ 4490Sstevel@tonic-gate SPPS_FREE(sp_parent_cache, ps); 4500Sstevel@tonic-gate } 4510Sstevel@tonic-gate static void 4520Sstevel@tonic-gate sp_send_stat_ok(mp_unit_t *un) 4530Sstevel@tonic-gate { 4540Sstevel@tonic-gate minor_t mnum = MD_SID(un); 4550Sstevel@tonic-gate md_spps_t *ps; 4560Sstevel@tonic-gate 4570Sstevel@tonic-gate ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS); 4580Sstevel@tonic-gate sp_parent_init(ps); 4590Sstevel@tonic-gate ps->ps_un = un; 4600Sstevel@tonic-gate ps->ps_ui = MDI_UNIT(mnum); 4610Sstevel@tonic-gate 4620Sstevel@tonic-gate daemon_request(&md_sp_daemon, sp_xmit_ok, (daemon_queue_t *)ps, 463*7627SChris.Horne@Sun.COM REQ_OLD); 4640Sstevel@tonic-gate } 4650Sstevel@tonic-gate 4660Sstevel@tonic-gate static void 4670Sstevel@tonic-gate sp_send_stat_err(mp_unit_t *un) 4680Sstevel@tonic-gate { 4690Sstevel@tonic-gate minor_t mnum = MD_SID(un); 4700Sstevel@tonic-gate md_spps_t *ps; 4710Sstevel@tonic-gate 4720Sstevel@tonic-gate ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS); 4730Sstevel@tonic-gate sp_parent_init(ps); 4740Sstevel@tonic-gate ps->ps_un = un; 4750Sstevel@tonic-gate ps->ps_ui = MDI_UNIT(mnum); 4760Sstevel@tonic-gate 4770Sstevel@tonic-gate daemon_request(&md_sp_daemon, sp_xmit_error, (daemon_queue_t *)ps, 478*7627SChris.Horne@Sun.COM REQ_OLD); 4790Sstevel@tonic-gate } 4800Sstevel@tonic-gate 4810Sstevel@tonic-gate 4820Sstevel@tonic-gate /* 4830Sstevel@tonic-gate * 
FUNCTION: sp_error() 4840Sstevel@tonic-gate * INPUT: ps - parent save structure for error-ed I/O. 4850Sstevel@tonic-gate * OUTPUT: none. 4860Sstevel@tonic-gate * RETURNS: void. 4870Sstevel@tonic-gate * PURPOSE: report a driver error. 4880Sstevel@tonic-gate * CALLING CONTEXT: 4890Sstevel@tonic-gate * Interrupt - non-blockable 4900Sstevel@tonic-gate */ 4910Sstevel@tonic-gate static void 4920Sstevel@tonic-gate sp_error(md_spps_t *ps) 4930Sstevel@tonic-gate { 4940Sstevel@tonic-gate set_t setno = MD_UN2SET(ps->ps_un); 4950Sstevel@tonic-gate 4960Sstevel@tonic-gate /* 4970Sstevel@tonic-gate * Drop the mutex associated with this request before (potentially) 4980Sstevel@tonic-gate * enqueuing the free onto a separate thread. We have to release the 4990Sstevel@tonic-gate * mutex before destroying the parent structure. 5000Sstevel@tonic-gate */ 5010Sstevel@tonic-gate if (!(ps->ps_flags & MD_SPPS_DONTFREE)) { 5020Sstevel@tonic-gate if (MUTEX_HELD(&ps->ps_mx)) { 5030Sstevel@tonic-gate mutex_exit(&ps->ps_mx); 5040Sstevel@tonic-gate } 5050Sstevel@tonic-gate } else { 5060Sstevel@tonic-gate /* 5070Sstevel@tonic-gate * this should only ever happen if we are panicking, 5080Sstevel@tonic-gate * since DONTFREE is only set on the parent if panicstr 5090Sstevel@tonic-gate * is non-NULL. 5100Sstevel@tonic-gate */ 5110Sstevel@tonic-gate ASSERT(panicstr); 5120Sstevel@tonic-gate } 5130Sstevel@tonic-gate 5140Sstevel@tonic-gate /* 5150Sstevel@tonic-gate * For a multi-owner set we need to send a message to the master so that 5160Sstevel@tonic-gate * all nodes get the errored status when we first encounter it. To avoid 5170Sstevel@tonic-gate * deadlocking when multiple soft-partitions encounter an error on one 5180Sstevel@tonic-gate * physical unit we drop the unit readerlock before enqueueing the 5190Sstevel@tonic-gate * request. That way we can service any messages that require a 5200Sstevel@tonic-gate * writerlock to be held. 
Additionally, to avoid deadlocking when at 5210Sstevel@tonic-gate * the bottom of a metadevice stack and a higher level mirror has 5220Sstevel@tonic-gate * multiple requests outstanding on this soft-part, we clone the ps 5230Sstevel@tonic-gate * that failed and pass the error back up the stack to release the 5240Sstevel@tonic-gate * reference that this i/o may have in the higher-level metadevice. 5250Sstevel@tonic-gate * The other nodes in the cluster just have to modify the soft-part 5260Sstevel@tonic-gate * status and we do not need to block the i/o completion for this. 5270Sstevel@tonic-gate */ 5280Sstevel@tonic-gate if (MD_MNSET_SETNO(setno)) { 5290Sstevel@tonic-gate md_spps_t *err_ps; 5300Sstevel@tonic-gate err_ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS); 5310Sstevel@tonic-gate sp_parent_init(err_ps); 5320Sstevel@tonic-gate 5330Sstevel@tonic-gate err_ps->ps_un = ps->ps_un; 5340Sstevel@tonic-gate err_ps->ps_ui = ps->ps_ui; 5350Sstevel@tonic-gate 5360Sstevel@tonic-gate md_unit_readerexit(ps->ps_ui); 5370Sstevel@tonic-gate 5380Sstevel@tonic-gate daemon_request(&md_sp_daemon, sp_xmit_error, 5390Sstevel@tonic-gate (daemon_queue_t *)err_ps, REQ_OLD); 5400Sstevel@tonic-gate 5410Sstevel@tonic-gate sp_finish_error(ps, 0); 5420Sstevel@tonic-gate 5430Sstevel@tonic-gate return; 5440Sstevel@tonic-gate } else { 5450Sstevel@tonic-gate ps->ps_un->un_status = MD_SP_ERR; 5460Sstevel@tonic-gate } 5470Sstevel@tonic-gate 5480Sstevel@tonic-gate /* Flag the error */ 5490Sstevel@tonic-gate sp_finish_error(ps, 1); 5500Sstevel@tonic-gate 5510Sstevel@tonic-gate } 5520Sstevel@tonic-gate 5530Sstevel@tonic-gate /* 5540Sstevel@tonic-gate * FUNCTION: sp_mapbuf() 5550Sstevel@tonic-gate * INPUT: un - unit structure for soft partition we are doing 5560Sstevel@tonic-gate * I/O on. 5570Sstevel@tonic-gate * voff - virtual offset in soft partition to map. 5580Sstevel@tonic-gate * bcount - # of blocks in the I/O. 
5590Sstevel@tonic-gate * OUTPUT: bp - translated buffer to be passed down to next layer. 5600Sstevel@tonic-gate * RETURNS: 1 - request must be fragmented, more work to do, 5610Sstevel@tonic-gate * 0 - request satisified, no more work to do 5620Sstevel@tonic-gate * -1 - error 5630Sstevel@tonic-gate * PURPOSE: Map the the virtual offset in the soft partition (passed 5640Sstevel@tonic-gate * in via voff) to the "physical" offset on whatever the soft 5650Sstevel@tonic-gate * partition is built on top of. We do this by doing a binary 5660Sstevel@tonic-gate * search of the extent array in the soft partition unit 5670Sstevel@tonic-gate * structure. Once the current extent is found, we do the 5680Sstevel@tonic-gate * translation, determine if the I/O will cross extent 5690Sstevel@tonic-gate * boundaries (if so, we have to fragment the I/O), then 5700Sstevel@tonic-gate * fill in the buf structure to be passed down to the next layer. 5710Sstevel@tonic-gate */ 5720Sstevel@tonic-gate static int 5730Sstevel@tonic-gate sp_mapbuf( 5740Sstevel@tonic-gate mp_unit_t *un, 5750Sstevel@tonic-gate sp_ext_offset_t voff, 5760Sstevel@tonic-gate sp_ext_length_t bcount, 5770Sstevel@tonic-gate buf_t *bp 5780Sstevel@tonic-gate ) 5790Sstevel@tonic-gate { 5800Sstevel@tonic-gate int lo, mid, hi, found, more; 5810Sstevel@tonic-gate size_t new_bcount; 5820Sstevel@tonic-gate sp_ext_offset_t new_blkno; 5830Sstevel@tonic-gate sp_ext_offset_t new_offset; 5840Sstevel@tonic-gate sp_ext_offset_t ext_endblk; 5850Sstevel@tonic-gate md_dev64_t new_edev; 5860Sstevel@tonic-gate extern unsigned md_maxphys; 5870Sstevel@tonic-gate 5880Sstevel@tonic-gate found = 0; 5890Sstevel@tonic-gate lo = 0; 5900Sstevel@tonic-gate hi = un->un_numexts - 1; 5910Sstevel@tonic-gate 5920Sstevel@tonic-gate /* 5930Sstevel@tonic-gate * do a binary search to find the extent that contains the 5940Sstevel@tonic-gate * starting offset. after this loop, mid contains the index 5950Sstevel@tonic-gate * of the correct extent. 
5960Sstevel@tonic-gate */ 5970Sstevel@tonic-gate while (lo <= hi && !found) { 5980Sstevel@tonic-gate mid = (lo + hi) / 2; 5990Sstevel@tonic-gate /* is the starting offset contained within the mid-ext? */ 6000Sstevel@tonic-gate if (voff >= un->un_ext[mid].un_voff && 6010Sstevel@tonic-gate voff < un->un_ext[mid].un_voff + un->un_ext[mid].un_len) 6020Sstevel@tonic-gate found = 1; 6030Sstevel@tonic-gate else if (voff < un->un_ext[mid].un_voff) 6040Sstevel@tonic-gate hi = mid - 1; 6050Sstevel@tonic-gate else /* voff > un->un_ext[mid].un_voff + un->un_ext[mid].len */ 6060Sstevel@tonic-gate lo = mid + 1; 6070Sstevel@tonic-gate } 6080Sstevel@tonic-gate 6090Sstevel@tonic-gate if (!found) { 6100Sstevel@tonic-gate cmn_err(CE_WARN, "sp_mapbuf: invalid offset %llu.\n", voff); 6110Sstevel@tonic-gate return (-1); 6120Sstevel@tonic-gate } 6130Sstevel@tonic-gate 6140Sstevel@tonic-gate /* translate to underlying physical offset/device */ 6150Sstevel@tonic-gate new_offset = voff - un->un_ext[mid].un_voff; 6160Sstevel@tonic-gate new_blkno = un->un_ext[mid].un_poff + new_offset; 6170Sstevel@tonic-gate new_edev = un->un_dev; 6180Sstevel@tonic-gate 6190Sstevel@tonic-gate /* determine if we need to break the I/O into fragments */ 6200Sstevel@tonic-gate ext_endblk = un->un_ext[mid].un_voff + un->un_ext[mid].un_len; 6210Sstevel@tonic-gate if (voff + btodb(bcount) > ext_endblk) { 6220Sstevel@tonic-gate new_bcount = dbtob(ext_endblk - voff); 6230Sstevel@tonic-gate more = 1; 6240Sstevel@tonic-gate } else { 6250Sstevel@tonic-gate new_bcount = bcount; 6260Sstevel@tonic-gate more = 0; 6270Sstevel@tonic-gate } 6280Sstevel@tonic-gate 6290Sstevel@tonic-gate /* only break up the I/O if we're not built on another metadevice */ 6300Sstevel@tonic-gate if ((md_getmajor(new_edev) != md_major) && (new_bcount > md_maxphys)) { 6310Sstevel@tonic-gate new_bcount = md_maxphys; 6320Sstevel@tonic-gate more = 1; 6330Sstevel@tonic-gate } 6340Sstevel@tonic-gate if (bp != (buf_t *)NULL) { 6350Sstevel@tonic-gate /* do 
bp updates */ 6360Sstevel@tonic-gate bp->b_bcount = new_bcount; 6370Sstevel@tonic-gate bp->b_lblkno = new_blkno; 6380Sstevel@tonic-gate bp->b_edev = md_dev64_to_dev(new_edev); 6390Sstevel@tonic-gate } 6400Sstevel@tonic-gate return (more); 6410Sstevel@tonic-gate } 6420Sstevel@tonic-gate 6430Sstevel@tonic-gate /* 6440Sstevel@tonic-gate * FUNCTION: sp_validate() 6450Sstevel@tonic-gate * INPUT: un - unit structure to be validated. 6460Sstevel@tonic-gate * OUTPUT: none. 6470Sstevel@tonic-gate * RETURNS: 0 - soft partition ok. 6480Sstevel@tonic-gate * -1 - error. 6490Sstevel@tonic-gate * PURPOSE: called on open to sanity check the soft partition. In 6500Sstevel@tonic-gate * order to open a soft partition: 6510Sstevel@tonic-gate * - it must have at least one extent 6520Sstevel@tonic-gate * - the extent info in core and on disk must match 6530Sstevel@tonic-gate * - it may not be in an intermediate state (which would 6540Sstevel@tonic-gate * imply that a two-phase commit was interrupted) 6550Sstevel@tonic-gate * 6560Sstevel@tonic-gate * If the extent checking fails (B_ERROR returned from the read 6570Sstevel@tonic-gate * strategy call) _and_ we're a multi-owner diskset, we send a 6580Sstevel@tonic-gate * message to the master so that all nodes inherit the same view 6590Sstevel@tonic-gate * of the soft partition. 6600Sstevel@tonic-gate * If we are checking a soft-part that is marked as in error, and 6610Sstevel@tonic-gate * we can actually read and validate the watermarks we send a 6620Sstevel@tonic-gate * message to clear the error to the master node. 
6630Sstevel@tonic-gate */ 6640Sstevel@tonic-gate static int 6650Sstevel@tonic-gate sp_validate(mp_unit_t *un) 6660Sstevel@tonic-gate { 6670Sstevel@tonic-gate uint_t ext; 6680Sstevel@tonic-gate struct buf *buf; 6690Sstevel@tonic-gate sp_ext_length_t len; 6700Sstevel@tonic-gate mp_watermark_t *wm; 6710Sstevel@tonic-gate set_t setno; 6720Sstevel@tonic-gate int reset_error = 0; 6730Sstevel@tonic-gate 6740Sstevel@tonic-gate setno = MD_UN2SET(un); 6750Sstevel@tonic-gate 6760Sstevel@tonic-gate /* sanity check unit structure components ?? */ 6770Sstevel@tonic-gate if (un->un_status != MD_SP_OK) { 6780Sstevel@tonic-gate if (un->un_status != MD_SP_ERR) { 6790Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: open failed, soft partition " 6800Sstevel@tonic-gate "status is %u.", 6810Sstevel@tonic-gate md_shortname(MD_SID(un)), 6820Sstevel@tonic-gate un->un_status); 6830Sstevel@tonic-gate return (-1); 6840Sstevel@tonic-gate } else { 6850Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: open of soft partition " 6860Sstevel@tonic-gate "in Errored state.", 6870Sstevel@tonic-gate md_shortname(MD_SID(un))); 6880Sstevel@tonic-gate reset_error = 1; 6890Sstevel@tonic-gate } 6900Sstevel@tonic-gate } 6910Sstevel@tonic-gate 6920Sstevel@tonic-gate if (un->un_numexts == 0) { 6930Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: open failed, soft partition does " 6940Sstevel@tonic-gate "not have any extents.", md_shortname(MD_SID(un))); 6950Sstevel@tonic-gate return (-1); 6960Sstevel@tonic-gate } 6970Sstevel@tonic-gate 6980Sstevel@tonic-gate len = 0LL; 6990Sstevel@tonic-gate for (ext = 0; ext < un->un_numexts; ext++) { 7000Sstevel@tonic-gate 7010Sstevel@tonic-gate /* tally extent lengths to check total size */ 7020Sstevel@tonic-gate len += un->un_ext[ext].un_len; 7030Sstevel@tonic-gate 7040Sstevel@tonic-gate /* allocate buffer for watermark */ 7050Sstevel@tonic-gate buf = getrbuf(KM_SLEEP); 7060Sstevel@tonic-gate 7070Sstevel@tonic-gate /* read watermark */ 7080Sstevel@tonic-gate buf->b_flags = B_READ; 
7090Sstevel@tonic-gate buf->b_edev = md_dev64_to_dev(un->un_dev); 7100Sstevel@tonic-gate buf->b_iodone = NULL; 7110Sstevel@tonic-gate buf->b_proc = NULL; 7120Sstevel@tonic-gate buf->b_bcount = sizeof (mp_watermark_t); 7130Sstevel@tonic-gate buf->b_lblkno = un->un_ext[ext].un_poff - 1; 7140Sstevel@tonic-gate buf->b_bufsize = sizeof (mp_watermark_t); 7150Sstevel@tonic-gate buf->b_un.b_addr = kmem_alloc(sizeof (mp_watermark_t), 7160Sstevel@tonic-gate KM_SLEEP); 7170Sstevel@tonic-gate 7180Sstevel@tonic-gate /* 7190Sstevel@tonic-gate * make the call non-blocking so that it is not affected 7200Sstevel@tonic-gate * by a set take. 7210Sstevel@tonic-gate */ 7220Sstevel@tonic-gate md_call_strategy(buf, MD_STR_MAPPED|MD_NOBLOCK, NULL); 7230Sstevel@tonic-gate (void) biowait(buf); 7240Sstevel@tonic-gate 7250Sstevel@tonic-gate if (buf->b_flags & B_ERROR) { 7260Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: open failed, could not " 7270Sstevel@tonic-gate "read watermark at block %llu for extent %u, " 7280Sstevel@tonic-gate "error %d.", md_shortname(MD_SID(un)), 7290Sstevel@tonic-gate buf->b_lblkno, ext, buf->b_error); 7300Sstevel@tonic-gate kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t)); 7310Sstevel@tonic-gate freerbuf(buf); 7320Sstevel@tonic-gate 7330Sstevel@tonic-gate /* 7340Sstevel@tonic-gate * If we're a multi-owner diskset we send a message 7350Sstevel@tonic-gate * indicating that this soft-part has an invalid 7360Sstevel@tonic-gate * extent to the master node. This ensures a consistent 7370Sstevel@tonic-gate * view of the soft-part across the cluster. 
7380Sstevel@tonic-gate */ 7390Sstevel@tonic-gate if (MD_MNSET_SETNO(setno)) { 7400Sstevel@tonic-gate sp_send_stat_err(un); 7410Sstevel@tonic-gate } 7420Sstevel@tonic-gate return (-1); 7430Sstevel@tonic-gate } 7440Sstevel@tonic-gate 7450Sstevel@tonic-gate wm = (mp_watermark_t *)buf->b_un.b_addr; 7460Sstevel@tonic-gate 7470Sstevel@tonic-gate /* make sure the checksum is correct first */ 7480Sstevel@tonic-gate if (crcchk((uchar_t *)wm, (uint_t *)&wm->wm_checksum, 7490Sstevel@tonic-gate (uint_t)sizeof (mp_watermark_t), (uchar_t *)NULL)) { 7500Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: open failed, watermark " 7510Sstevel@tonic-gate "at block %llu for extent %u does not have a " 7520Sstevel@tonic-gate "valid checksum 0x%08x.", md_shortname(MD_SID(un)), 7530Sstevel@tonic-gate buf->b_lblkno, ext, wm->wm_checksum); 7540Sstevel@tonic-gate kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t)); 7550Sstevel@tonic-gate freerbuf(buf); 7560Sstevel@tonic-gate return (-1); 7570Sstevel@tonic-gate } 7580Sstevel@tonic-gate 7590Sstevel@tonic-gate if (wm->wm_magic != MD_SP_MAGIC) { 7600Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: open failed, watermark " 7610Sstevel@tonic-gate "at block %llu for extent %u does not have a " 7620Sstevel@tonic-gate "valid watermark magic number, expected 0x%x, " 7630Sstevel@tonic-gate "found 0x%x.", md_shortname(MD_SID(un)), 7640Sstevel@tonic-gate buf->b_lblkno, ext, MD_SP_MAGIC, wm->wm_magic); 7650Sstevel@tonic-gate kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t)); 7660Sstevel@tonic-gate freerbuf(buf); 7670Sstevel@tonic-gate return (-1); 7680Sstevel@tonic-gate } 7690Sstevel@tonic-gate 7700Sstevel@tonic-gate /* make sure sequence number matches the current extent */ 7710Sstevel@tonic-gate if (wm->wm_seq != ext) { 7720Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: open failed, watermark " 7730Sstevel@tonic-gate "at block %llu for extent %u has invalid " 7740Sstevel@tonic-gate "sequence number %u.", md_shortname(MD_SID(un)), 7750Sstevel@tonic-gate 
buf->b_lblkno, ext, wm->wm_seq); 7760Sstevel@tonic-gate kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t)); 7770Sstevel@tonic-gate freerbuf(buf); 7780Sstevel@tonic-gate return (-1); 7790Sstevel@tonic-gate } 7800Sstevel@tonic-gate 7810Sstevel@tonic-gate /* make sure watermark length matches unit structure */ 7820Sstevel@tonic-gate if (wm->wm_length != un->un_ext[ext].un_len) { 7830Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: open failed, watermark " 7840Sstevel@tonic-gate "at block %llu for extent %u has inconsistent " 7850Sstevel@tonic-gate "length, expected %llu, found %llu.", 7860Sstevel@tonic-gate md_shortname(MD_SID(un)), buf->b_lblkno, 7870Sstevel@tonic-gate ext, un->un_ext[ext].un_len, 7880Sstevel@tonic-gate (u_longlong_t)wm->wm_length); 7890Sstevel@tonic-gate kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t)); 7900Sstevel@tonic-gate freerbuf(buf); 7910Sstevel@tonic-gate return (-1); 7920Sstevel@tonic-gate } 7930Sstevel@tonic-gate 7940Sstevel@tonic-gate /* 7950Sstevel@tonic-gate * make sure the type is a valid soft partition and not 7960Sstevel@tonic-gate * a free extent or the end. 
7970Sstevel@tonic-gate */ 7980Sstevel@tonic-gate if (wm->wm_type != EXTTYP_ALLOC) { 7990Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: open failed, watermark " 8000Sstevel@tonic-gate "at block %llu for extent %u is not marked " 8010Sstevel@tonic-gate "as in-use, type = %u.", md_shortname(MD_SID(un)), 8020Sstevel@tonic-gate buf->b_lblkno, ext, wm->wm_type); 8030Sstevel@tonic-gate kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t)); 8040Sstevel@tonic-gate freerbuf(buf); 8050Sstevel@tonic-gate return (-1); 8060Sstevel@tonic-gate } 8070Sstevel@tonic-gate /* free up buffer */ 8080Sstevel@tonic-gate kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t)); 8090Sstevel@tonic-gate freerbuf(buf); 8100Sstevel@tonic-gate } 8110Sstevel@tonic-gate 8120Sstevel@tonic-gate if (len != un->un_length) { 8130Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: open failed, computed length " 8140Sstevel@tonic-gate "%llu != expected length %llu.", md_shortname(MD_SID(un)), 8150Sstevel@tonic-gate len, un->un_length); 8160Sstevel@tonic-gate return (-1); 8170Sstevel@tonic-gate } 8180Sstevel@tonic-gate 8190Sstevel@tonic-gate /* 8200Sstevel@tonic-gate * If we're a multi-owner set _and_ reset_error is set, we should clear 8210Sstevel@tonic-gate * the error condition on all nodes in the set. Use SP_SETSTAT2 with 8220Sstevel@tonic-gate * MD_SP_OK. 8230Sstevel@tonic-gate */ 8240Sstevel@tonic-gate if (MD_MNSET_SETNO(setno) && reset_error) { 8250Sstevel@tonic-gate sp_send_stat_ok(un); 8260Sstevel@tonic-gate } 8270Sstevel@tonic-gate return (0); 8280Sstevel@tonic-gate } 8290Sstevel@tonic-gate 8300Sstevel@tonic-gate /* 8310Sstevel@tonic-gate * FUNCTION: sp_done() 8320Sstevel@tonic-gate * INPUT: child_buf - buffer attached to child save structure. 8330Sstevel@tonic-gate * this is the buffer on which I/O has just 8340Sstevel@tonic-gate * completed. 8350Sstevel@tonic-gate * OUTPUT: none. 8360Sstevel@tonic-gate * RETURNS: 0 - success. 8370Sstevel@tonic-gate * 1 - error. 
8380Sstevel@tonic-gate * PURPOSE: called on I/O completion. 8390Sstevel@tonic-gate */ 8400Sstevel@tonic-gate static int 8410Sstevel@tonic-gate sp_done(struct buf *child_buf) 8420Sstevel@tonic-gate { 8430Sstevel@tonic-gate struct buf *parent_buf; 8440Sstevel@tonic-gate mdi_unit_t *ui; 8450Sstevel@tonic-gate md_spps_t *ps; 8460Sstevel@tonic-gate md_spcs_t *cs; 8470Sstevel@tonic-gate 8480Sstevel@tonic-gate /* find the child save structure to which this buffer belongs */ 8490Sstevel@tonic-gate cs = (md_spcs_t *)((caddr_t)child_buf - 8500Sstevel@tonic-gate (sizeof (md_spcs_t) - sizeof (buf_t))); 8510Sstevel@tonic-gate /* now get the parent save structure */ 8520Sstevel@tonic-gate ps = cs->cs_ps; 8530Sstevel@tonic-gate parent_buf = ps->ps_bp; 8540Sstevel@tonic-gate 8550Sstevel@tonic-gate mutex_enter(&ps->ps_mx); 8560Sstevel@tonic-gate /* pass any errors back up to the parent */ 8570Sstevel@tonic-gate if (child_buf->b_flags & B_ERROR) { 8580Sstevel@tonic-gate ps->ps_flags |= MD_SPPS_ERROR; 8590Sstevel@tonic-gate parent_buf->b_error = child_buf->b_error; 8600Sstevel@tonic-gate } 8610Sstevel@tonic-gate /* mapout, if needed */ 8620Sstevel@tonic-gate if (child_buf->b_flags & B_REMAPPED) 8630Sstevel@tonic-gate bp_mapout(child_buf); 8640Sstevel@tonic-gate 8650Sstevel@tonic-gate ps->ps_frags--; 8660Sstevel@tonic-gate if (ps->ps_frags != 0) { 8670Sstevel@tonic-gate /* 8680Sstevel@tonic-gate * if this parent has more children, we just free the 8690Sstevel@tonic-gate * child and return. 
8700Sstevel@tonic-gate */ 8710Sstevel@tonic-gate kmem_cache_free(sp_child_cache, cs); 8720Sstevel@tonic-gate mutex_exit(&ps->ps_mx); 8730Sstevel@tonic-gate return (1); 8740Sstevel@tonic-gate } 8750Sstevel@tonic-gate /* there are no more children */ 8760Sstevel@tonic-gate kmem_cache_free(sp_child_cache, cs); 8770Sstevel@tonic-gate if (ps->ps_flags & MD_SPPS_ERROR) { 8780Sstevel@tonic-gate sp_error(ps); 8790Sstevel@tonic-gate return (1); 8800Sstevel@tonic-gate } 8810Sstevel@tonic-gate ui = ps->ps_ui; 8820Sstevel@tonic-gate if (!(ps->ps_flags & MD_SPPS_DONTFREE)) { 8830Sstevel@tonic-gate mutex_exit(&ps->ps_mx); 8840Sstevel@tonic-gate } else { 8850Sstevel@tonic-gate /* 8860Sstevel@tonic-gate * this should only ever happen if we are panicking, 8870Sstevel@tonic-gate * since DONTFREE is only set on the parent if panicstr 8880Sstevel@tonic-gate * is non-NULL. 8890Sstevel@tonic-gate */ 8900Sstevel@tonic-gate ASSERT(panicstr); 8910Sstevel@tonic-gate } 8920Sstevel@tonic-gate SPPS_FREE(sp_parent_cache, ps); 8930Sstevel@tonic-gate md_kstat_done(ui, parent_buf, 0); 8940Sstevel@tonic-gate md_unit_readerexit(ui); 8950Sstevel@tonic-gate md_biodone(parent_buf); 8960Sstevel@tonic-gate return (0); 8970Sstevel@tonic-gate } 8980Sstevel@tonic-gate 8990Sstevel@tonic-gate /* 9000Sstevel@tonic-gate * FUNCTION: md_sp_strategy() 9010Sstevel@tonic-gate * INPUT: parent_buf - parent buffer 9020Sstevel@tonic-gate * flag - flags 9030Sstevel@tonic-gate * private - private data 9040Sstevel@tonic-gate * OUTPUT: none. 9050Sstevel@tonic-gate * RETURNS: void. 9060Sstevel@tonic-gate * PURPOSE: Soft partitioning I/O strategy. Performs the main work 9070Sstevel@tonic-gate * needed to do I/O to a soft partition. The basic 9080Sstevel@tonic-gate * algorithm is as follows: 9090Sstevel@tonic-gate * - Allocate a child save structure to keep track 9100Sstevel@tonic-gate * of the I/O we are going to pass down. 
9110Sstevel@tonic-gate * - Map the I/O to the correct extent in the soft 9120Sstevel@tonic-gate * partition (see sp_mapbuf()). 9130Sstevel@tonic-gate * - bioclone() the buffer and pass it down the 9140Sstevel@tonic-gate * stack using md_call_strategy. 9150Sstevel@tonic-gate * - If the I/O needs to split across extents, 9160Sstevel@tonic-gate * repeat the above steps until all fragments 9170Sstevel@tonic-gate * are finished. 9180Sstevel@tonic-gate */ 9190Sstevel@tonic-gate static void 9200Sstevel@tonic-gate md_sp_strategy(buf_t *parent_buf, int flag, void *private) 9210Sstevel@tonic-gate { 9220Sstevel@tonic-gate md_spps_t *ps; 9230Sstevel@tonic-gate md_spcs_t *cs; 9240Sstevel@tonic-gate int more; 9250Sstevel@tonic-gate mp_unit_t *un; 9260Sstevel@tonic-gate mdi_unit_t *ui; 9270Sstevel@tonic-gate size_t current_count; 9280Sstevel@tonic-gate off_t current_offset; 9290Sstevel@tonic-gate sp_ext_offset_t current_blkno; 9300Sstevel@tonic-gate buf_t *child_buf; 9310Sstevel@tonic-gate set_t setno = MD_MIN2SET(getminor(parent_buf->b_edev)); 9320Sstevel@tonic-gate int strat_flag = flag; 9330Sstevel@tonic-gate 9340Sstevel@tonic-gate /* 9350Sstevel@tonic-gate * When doing IO to a multi owner meta device, check if set is halted. 9360Sstevel@tonic-gate * We do this check without the needed lock held, for performance 9370Sstevel@tonic-gate * reasons. 9380Sstevel@tonic-gate * If an IO just slips through while the set is locked via an 9390Sstevel@tonic-gate * MD_MN_SUSPEND_SET, we don't care about it. 
9400Sstevel@tonic-gate * Only check for suspension if we are a top-level i/o request 9410Sstevel@tonic-gate * (MD_STR_NOTTOP is cleared in 'flag'); 9420Sstevel@tonic-gate */ 9430Sstevel@tonic-gate if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) == 9440Sstevel@tonic-gate (MD_SET_HALTED | MD_SET_MNSET)) { 9450Sstevel@tonic-gate if ((flag & MD_STR_NOTTOP) == 0) { 9460Sstevel@tonic-gate mutex_enter(&md_mx); 9470Sstevel@tonic-gate /* Here we loop until the set is no longer halted */ 9480Sstevel@tonic-gate while (md_set[setno].s_status & MD_SET_HALTED) { 9490Sstevel@tonic-gate cv_wait(&md_cv, &md_mx); 9500Sstevel@tonic-gate } 9510Sstevel@tonic-gate mutex_exit(&md_mx); 9520Sstevel@tonic-gate } 9530Sstevel@tonic-gate } 9540Sstevel@tonic-gate 9550Sstevel@tonic-gate ui = MDI_UNIT(getminor(parent_buf->b_edev)); 9560Sstevel@tonic-gate 9570Sstevel@tonic-gate md_kstat_waitq_enter(ui); 9580Sstevel@tonic-gate 9590Sstevel@tonic-gate un = (mp_unit_t *)md_unit_readerlock(ui); 9600Sstevel@tonic-gate 9610Sstevel@tonic-gate if ((flag & MD_NOBLOCK) == 0) { 9620Sstevel@tonic-gate if (md_inc_iocount(setno) != 0) { 9630Sstevel@tonic-gate parent_buf->b_flags |= B_ERROR; 9640Sstevel@tonic-gate parent_buf->b_error = ENXIO; 9650Sstevel@tonic-gate parent_buf->b_resid = parent_buf->b_bcount; 9662150Sjeanm md_kstat_waitq_exit(ui); 9670Sstevel@tonic-gate md_unit_readerexit(ui); 9680Sstevel@tonic-gate biodone(parent_buf); 9690Sstevel@tonic-gate return; 9700Sstevel@tonic-gate } 9710Sstevel@tonic-gate } else { 9720Sstevel@tonic-gate md_inc_iocount_noblock(setno); 9730Sstevel@tonic-gate } 9740Sstevel@tonic-gate 9750Sstevel@tonic-gate if (!(flag & MD_STR_NOTTOP)) { 9760Sstevel@tonic-gate if (md_checkbuf(ui, (md_unit_t *)un, parent_buf) != 0) { 9770Sstevel@tonic-gate md_kstat_waitq_exit(ui); 9780Sstevel@tonic-gate return; 9790Sstevel@tonic-gate } 9800Sstevel@tonic-gate } 9810Sstevel@tonic-gate 9820Sstevel@tonic-gate ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS); 
9830Sstevel@tonic-gate sp_parent_init(ps); 9840Sstevel@tonic-gate 9850Sstevel@tonic-gate /* 9860Sstevel@tonic-gate * Save essential information from the original buffhdr 9870Sstevel@tonic-gate * in the parent. 9880Sstevel@tonic-gate */ 9890Sstevel@tonic-gate ps->ps_un = un; 9900Sstevel@tonic-gate ps->ps_ui = ui; 9910Sstevel@tonic-gate ps->ps_bp = parent_buf; 9920Sstevel@tonic-gate ps->ps_addr = parent_buf->b_un.b_addr; 9930Sstevel@tonic-gate 9940Sstevel@tonic-gate current_count = parent_buf->b_bcount; 9950Sstevel@tonic-gate current_blkno = (sp_ext_offset_t)parent_buf->b_blkno; 9960Sstevel@tonic-gate current_offset = 0; 9970Sstevel@tonic-gate 9980Sstevel@tonic-gate /* 9990Sstevel@tonic-gate * if we are at the top and we are panicking, 10000Sstevel@tonic-gate * we don't free in order to save state. 10010Sstevel@tonic-gate */ 10020Sstevel@tonic-gate if (!(flag & MD_STR_NOTTOP) && (panicstr != NULL)) 10030Sstevel@tonic-gate ps->ps_flags |= MD_SPPS_DONTFREE; 10040Sstevel@tonic-gate 10050Sstevel@tonic-gate md_kstat_waitq_to_runq(ui); 10060Sstevel@tonic-gate 10070Sstevel@tonic-gate ps->ps_frags++; 10080Sstevel@tonic-gate 10090Sstevel@tonic-gate /* 10100Sstevel@tonic-gate * Mark this i/o as MD_STR_ABR if we've had ABR enabled on this 10110Sstevel@tonic-gate * metadevice. 10120Sstevel@tonic-gate */ 10130Sstevel@tonic-gate if (ui->ui_tstate & MD_ABR_CAP) 10140Sstevel@tonic-gate strat_flag |= MD_STR_ABR; 10150Sstevel@tonic-gate 10160Sstevel@tonic-gate /* 10170Sstevel@tonic-gate * this loop does the main work of an I/O. we allocate a 10180Sstevel@tonic-gate * a child save for each buf, do the logical to physical 10190Sstevel@tonic-gate * mapping, decide if we need to frag the I/O, clone the 10200Sstevel@tonic-gate * new I/O to pass down the stack. repeat until we've 10210Sstevel@tonic-gate * taken care of the entire buf that was passed to us. 
10220Sstevel@tonic-gate */ 10230Sstevel@tonic-gate do { 10240Sstevel@tonic-gate cs = kmem_cache_alloc(sp_child_cache, MD_ALLOCFLAGS); 10250Sstevel@tonic-gate sp_child_init(cs); 10260Sstevel@tonic-gate child_buf = &cs->cs_buf; 10270Sstevel@tonic-gate cs->cs_ps = ps; 10280Sstevel@tonic-gate 10290Sstevel@tonic-gate more = sp_mapbuf(un, current_blkno, current_count, child_buf); 10300Sstevel@tonic-gate if (more == -1) { 10310Sstevel@tonic-gate parent_buf->b_flags |= B_ERROR; 10320Sstevel@tonic-gate parent_buf->b_error = EIO; 10330Sstevel@tonic-gate md_kstat_done(ui, parent_buf, 0); 10340Sstevel@tonic-gate md_unit_readerexit(ui); 10350Sstevel@tonic-gate md_biodone(parent_buf); 10360Sstevel@tonic-gate kmem_cache_free(sp_parent_cache, ps); 10370Sstevel@tonic-gate return; 10380Sstevel@tonic-gate } 10390Sstevel@tonic-gate 10400Sstevel@tonic-gate child_buf = md_bioclone(parent_buf, current_offset, 1041*7627SChris.Horne@Sun.COM child_buf->b_bcount, child_buf->b_edev, 1042*7627SChris.Horne@Sun.COM child_buf->b_blkno, sp_done, child_buf, 1043*7627SChris.Horne@Sun.COM KM_NOSLEEP); 10440Sstevel@tonic-gate /* calculate new offset, counts, etc... 
*/ 10450Sstevel@tonic-gate current_offset += child_buf->b_bcount; 10460Sstevel@tonic-gate current_count -= child_buf->b_bcount; 10470Sstevel@tonic-gate current_blkno += (sp_ext_offset_t)(btodb(child_buf->b_bcount)); 10480Sstevel@tonic-gate 10490Sstevel@tonic-gate if (more) { 10500Sstevel@tonic-gate mutex_enter(&ps->ps_mx); 10510Sstevel@tonic-gate ps->ps_frags++; 10520Sstevel@tonic-gate mutex_exit(&ps->ps_mx); 10530Sstevel@tonic-gate } 10540Sstevel@tonic-gate 10550Sstevel@tonic-gate md_call_strategy(child_buf, strat_flag, private); 10560Sstevel@tonic-gate } while (more); 10570Sstevel@tonic-gate 10580Sstevel@tonic-gate if (!(flag & MD_STR_NOTTOP) && (panicstr != NULL)) { 10590Sstevel@tonic-gate while (!(ps->ps_flags & MD_SPPS_DONE)) { 10600Sstevel@tonic-gate md_daemon(1, &md_done_daemon); 10610Sstevel@tonic-gate } 10620Sstevel@tonic-gate kmem_cache_free(sp_parent_cache, ps); 10630Sstevel@tonic-gate } 10640Sstevel@tonic-gate } 10650Sstevel@tonic-gate 10660Sstevel@tonic-gate /* 10670Sstevel@tonic-gate * FUNCTION: sp_directed_read() 10680Sstevel@tonic-gate * INPUT: mnum - minor number 10690Sstevel@tonic-gate * vdr - vol_directed_rd_t from user 10700Sstevel@tonic-gate * mode - access mode for copying data out. 10710Sstevel@tonic-gate * OUTPUT: none. 10720Sstevel@tonic-gate * RETURNS: 0 - success 10730Sstevel@tonic-gate * Exxxxx - failure error-code 10740Sstevel@tonic-gate * PURPOSE: Construct the necessary sub-device i/o requests to perform the 10750Sstevel@tonic-gate * directed read as requested by the user. This is essentially the 10760Sstevel@tonic-gate * same as md_sp_strategy() with the exception being that the 10770Sstevel@tonic-gate * underlying 'md_call_strategy' is replaced with an ioctl call. 
10780Sstevel@tonic-gate */ 10790Sstevel@tonic-gate int 10800Sstevel@tonic-gate sp_directed_read(minor_t mnum, vol_directed_rd_t *vdr, int mode) 10810Sstevel@tonic-gate { 10820Sstevel@tonic-gate md_spps_t *ps; 10830Sstevel@tonic-gate md_spcs_t *cs; 10840Sstevel@tonic-gate int more; 10850Sstevel@tonic-gate mp_unit_t *un; 10860Sstevel@tonic-gate mdi_unit_t *ui; 10870Sstevel@tonic-gate size_t current_count; 10880Sstevel@tonic-gate off_t current_offset; 10890Sstevel@tonic-gate sp_ext_offset_t current_blkno; 10900Sstevel@tonic-gate buf_t *child_buf, *parent_buf; 10910Sstevel@tonic-gate void *kbuffer; 10920Sstevel@tonic-gate vol_directed_rd_t cvdr; 10930Sstevel@tonic-gate caddr_t userbuf; 10940Sstevel@tonic-gate offset_t useroff; 10950Sstevel@tonic-gate int ret = 0; 10960Sstevel@tonic-gate 10970Sstevel@tonic-gate ui = MDI_UNIT(mnum); 10980Sstevel@tonic-gate 10990Sstevel@tonic-gate md_kstat_waitq_enter(ui); 11000Sstevel@tonic-gate 11010Sstevel@tonic-gate bzero(&cvdr, sizeof (cvdr)); 11020Sstevel@tonic-gate 11030Sstevel@tonic-gate un = (mp_unit_t *)md_unit_readerlock(ui); 11040Sstevel@tonic-gate 11050Sstevel@tonic-gate /* 11060Sstevel@tonic-gate * Construct a parent_buf header which reflects the user-supplied 11070Sstevel@tonic-gate * request. 
11080Sstevel@tonic-gate */ 11090Sstevel@tonic-gate 11100Sstevel@tonic-gate kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP); 11110Sstevel@tonic-gate if (kbuffer == NULL) { 11120Sstevel@tonic-gate vdr->vdr_flags |= DKV_DMR_ERROR; 11132150Sjeanm md_kstat_waitq_exit(ui); 11140Sstevel@tonic-gate md_unit_readerexit(ui); 11150Sstevel@tonic-gate return (ENOMEM); 11160Sstevel@tonic-gate } 11170Sstevel@tonic-gate 11180Sstevel@tonic-gate parent_buf = getrbuf(KM_NOSLEEP); 11190Sstevel@tonic-gate if (parent_buf == NULL) { 11200Sstevel@tonic-gate vdr->vdr_flags |= DKV_DMR_ERROR; 11212150Sjeanm md_kstat_waitq_exit(ui); 11220Sstevel@tonic-gate md_unit_readerexit(ui); 11230Sstevel@tonic-gate kmem_free(kbuffer, vdr->vdr_nbytes); 11240Sstevel@tonic-gate return (ENOMEM); 11250Sstevel@tonic-gate } 11260Sstevel@tonic-gate parent_buf->b_un.b_addr = kbuffer; 11270Sstevel@tonic-gate parent_buf->b_flags = B_READ; 11280Sstevel@tonic-gate parent_buf->b_bcount = vdr->vdr_nbytes; 11290Sstevel@tonic-gate parent_buf->b_lblkno = lbtodb(vdr->vdr_offset); 11300Sstevel@tonic-gate parent_buf->b_edev = un->un_dev; 11310Sstevel@tonic-gate 11320Sstevel@tonic-gate 11330Sstevel@tonic-gate ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS); 11340Sstevel@tonic-gate sp_parent_init(ps); 11350Sstevel@tonic-gate 11360Sstevel@tonic-gate /* 11370Sstevel@tonic-gate * Save essential information from the original buffhdr 11380Sstevel@tonic-gate * in the parent. 
11390Sstevel@tonic-gate */ 11400Sstevel@tonic-gate ps->ps_un = un; 11410Sstevel@tonic-gate ps->ps_ui = ui; 11420Sstevel@tonic-gate ps->ps_bp = parent_buf; 11430Sstevel@tonic-gate ps->ps_addr = parent_buf->b_un.b_addr; 11440Sstevel@tonic-gate 11450Sstevel@tonic-gate current_count = parent_buf->b_bcount; 11460Sstevel@tonic-gate current_blkno = (sp_ext_offset_t)parent_buf->b_lblkno; 11470Sstevel@tonic-gate current_offset = 0; 11480Sstevel@tonic-gate 11492150Sjeanm md_kstat_waitq_to_runq(ui); 11502150Sjeanm 11510Sstevel@tonic-gate ps->ps_frags++; 11520Sstevel@tonic-gate vdr->vdr_bytesread = 0; 11530Sstevel@tonic-gate 11540Sstevel@tonic-gate /* 11550Sstevel@tonic-gate * this loop does the main work of an I/O. we allocate a 11560Sstevel@tonic-gate * a child save for each buf, do the logical to physical 11570Sstevel@tonic-gate * mapping, decide if we need to frag the I/O, clone the 11580Sstevel@tonic-gate * new I/O to pass down the stack. repeat until we've 11590Sstevel@tonic-gate * taken care of the entire buf that was passed to us. 
11600Sstevel@tonic-gate */ 11610Sstevel@tonic-gate do { 11620Sstevel@tonic-gate cs = kmem_cache_alloc(sp_child_cache, MD_ALLOCFLAGS); 11630Sstevel@tonic-gate sp_child_init(cs); 11640Sstevel@tonic-gate child_buf = &cs->cs_buf; 11650Sstevel@tonic-gate cs->cs_ps = ps; 11660Sstevel@tonic-gate 11670Sstevel@tonic-gate more = sp_mapbuf(un, current_blkno, current_count, child_buf); 11680Sstevel@tonic-gate if (more == -1) { 11690Sstevel@tonic-gate ret = EIO; 11700Sstevel@tonic-gate vdr->vdr_flags |= DKV_DMR_SHORT; 11710Sstevel@tonic-gate kmem_cache_free(sp_child_cache, cs); 11720Sstevel@tonic-gate goto err_out; 11730Sstevel@tonic-gate } 11740Sstevel@tonic-gate 11750Sstevel@tonic-gate cvdr.vdr_flags = vdr->vdr_flags; 11760Sstevel@tonic-gate cvdr.vdr_side = vdr->vdr_side; 11770Sstevel@tonic-gate cvdr.vdr_nbytes = child_buf->b_bcount; 11780Sstevel@tonic-gate cvdr.vdr_offset = ldbtob(child_buf->b_lblkno); 11790Sstevel@tonic-gate /* Work out where we are in the allocated buffer */ 118062Sjeanm useroff = (offset_t)(uintptr_t)kbuffer; 11810Sstevel@tonic-gate useroff = useroff + (offset_t)current_offset; 118262Sjeanm cvdr.vdr_data = (void *)(uintptr_t)useroff; 11830Sstevel@tonic-gate child_buf = md_bioclone(parent_buf, current_offset, 1184*7627SChris.Horne@Sun.COM child_buf->b_bcount, child_buf->b_edev, 1185*7627SChris.Horne@Sun.COM child_buf->b_blkno, NULL, 1186*7627SChris.Horne@Sun.COM child_buf, KM_NOSLEEP); 11870Sstevel@tonic-gate /* calculate new offset, counts, etc... 
*/ 11880Sstevel@tonic-gate current_offset += child_buf->b_bcount; 11890Sstevel@tonic-gate current_count -= child_buf->b_bcount; 11900Sstevel@tonic-gate current_blkno += (sp_ext_offset_t)(btodb(child_buf->b_bcount)); 11910Sstevel@tonic-gate 11920Sstevel@tonic-gate if (more) { 11930Sstevel@tonic-gate mutex_enter(&ps->ps_mx); 11940Sstevel@tonic-gate ps->ps_frags++; 11950Sstevel@tonic-gate mutex_exit(&ps->ps_mx); 11960Sstevel@tonic-gate } 11970Sstevel@tonic-gate 11980Sstevel@tonic-gate ret = md_call_ioctl(child_buf->b_edev, DKIOCDMR, &cvdr, 11990Sstevel@tonic-gate (mode | FKIOCTL), NULL); 12000Sstevel@tonic-gate 12010Sstevel@tonic-gate /* 12020Sstevel@tonic-gate * Free the child structure as we've finished with it. 12030Sstevel@tonic-gate * Normally this would be done by sp_done() but we're just 12040Sstevel@tonic-gate * using md_bioclone() to segment the transfer and we never 12050Sstevel@tonic-gate * issue a strategy request so the iodone will not be called. 12060Sstevel@tonic-gate */ 12070Sstevel@tonic-gate kmem_cache_free(sp_child_cache, cs); 12080Sstevel@tonic-gate if (ret == 0) { 12090Sstevel@tonic-gate /* copyout the returned data to vdr_data + offset */ 12100Sstevel@tonic-gate userbuf = (caddr_t)kbuffer; 12110Sstevel@tonic-gate userbuf += (caddr_t)(cvdr.vdr_data) - (caddr_t)kbuffer; 12120Sstevel@tonic-gate if (ddi_copyout(userbuf, vdr->vdr_data, 12130Sstevel@tonic-gate cvdr.vdr_bytesread, mode)) { 12140Sstevel@tonic-gate ret = EFAULT; 12150Sstevel@tonic-gate goto err_out; 12160Sstevel@tonic-gate } 12170Sstevel@tonic-gate vdr->vdr_bytesread += cvdr.vdr_bytesread; 12180Sstevel@tonic-gate } else { 12190Sstevel@tonic-gate goto err_out; 12200Sstevel@tonic-gate } 12210Sstevel@tonic-gate } while (more); 12220Sstevel@tonic-gate 12230Sstevel@tonic-gate /* 12240Sstevel@tonic-gate * Update the user-supplied vol_directed_rd_t structure with the 12250Sstevel@tonic-gate * contents of the last issued child request. 
12260Sstevel@tonic-gate */ 12270Sstevel@tonic-gate vdr->vdr_flags = cvdr.vdr_flags; 12280Sstevel@tonic-gate vdr->vdr_side = cvdr.vdr_side; 12290Sstevel@tonic-gate bcopy(cvdr.vdr_side_name, vdr->vdr_side_name, VOL_SIDENAME); 12300Sstevel@tonic-gate 12310Sstevel@tonic-gate err_out: 12320Sstevel@tonic-gate if (ret != 0) { 12330Sstevel@tonic-gate vdr->vdr_flags |= DKV_DMR_ERROR; 12340Sstevel@tonic-gate } 12350Sstevel@tonic-gate if (vdr->vdr_bytesread != vdr->vdr_nbytes) { 12360Sstevel@tonic-gate vdr->vdr_flags |= DKV_DMR_SHORT; 12370Sstevel@tonic-gate } 12380Sstevel@tonic-gate kmem_cache_free(sp_parent_cache, ps); 12390Sstevel@tonic-gate kmem_free(kbuffer, vdr->vdr_nbytes); 12400Sstevel@tonic-gate freerbuf(parent_buf); 12410Sstevel@tonic-gate md_unit_readerexit(ui); 12420Sstevel@tonic-gate return (ret); 12430Sstevel@tonic-gate } 12440Sstevel@tonic-gate 12450Sstevel@tonic-gate /* 12460Sstevel@tonic-gate * FUNCTION: sp_snarf() 12470Sstevel@tonic-gate * INPUT: cmd - snarf cmd. 12480Sstevel@tonic-gate * setno - set number. 12490Sstevel@tonic-gate * OUTPUT: none. 12500Sstevel@tonic-gate * RETURNS: 1 - soft partitions were snarfed. 12510Sstevel@tonic-gate * 0 - no soft partitions were snarfed. 12520Sstevel@tonic-gate * PURPOSE: Snarf soft partition metadb records into their in-core 12530Sstevel@tonic-gate * structures. This routine is called at "snarf time" when 12540Sstevel@tonic-gate * md loads and gets all metadevices records into memory. 12550Sstevel@tonic-gate * The basic algorithm is simply to walk the soft partition 12560Sstevel@tonic-gate * records in the metadb and call the soft partitioning 12570Sstevel@tonic-gate * build_incore routine to set up the in-core structures. 
12580Sstevel@tonic-gate */ 12590Sstevel@tonic-gate static int 12600Sstevel@tonic-gate sp_snarf(md_snarfcmd_t cmd, set_t setno) 12610Sstevel@tonic-gate { 12620Sstevel@tonic-gate mp_unit_t *un; 12630Sstevel@tonic-gate mddb_recid_t recid; 12640Sstevel@tonic-gate int gotsomething; 12650Sstevel@tonic-gate int all_sp_gotten; 12660Sstevel@tonic-gate mddb_type_t rec_type; 12670Sstevel@tonic-gate mddb_de_ic_t *dep; 12680Sstevel@tonic-gate mddb_rb32_t *rbp; 12690Sstevel@tonic-gate mp_unit_t *big_un; 12700Sstevel@tonic-gate mp_unit32_od_t *small_un; 12710Sstevel@tonic-gate size_t newreqsize; 12720Sstevel@tonic-gate 12730Sstevel@tonic-gate 12740Sstevel@tonic-gate if (cmd == MD_SNARF_CLEANUP) 12750Sstevel@tonic-gate return (0); 12760Sstevel@tonic-gate 12770Sstevel@tonic-gate all_sp_gotten = 1; 12780Sstevel@tonic-gate gotsomething = 0; 12790Sstevel@tonic-gate 12800Sstevel@tonic-gate /* get the record type */ 12810Sstevel@tonic-gate rec_type = (mddb_type_t)md_getshared_key(setno, 12820Sstevel@tonic-gate sp_md_ops.md_driver.md_drivername); 12830Sstevel@tonic-gate recid = mddb_makerecid(setno, 0); 12840Sstevel@tonic-gate 12850Sstevel@tonic-gate /* 12860Sstevel@tonic-gate * walk soft partition records in the metadb and call 12870Sstevel@tonic-gate * sp_build_incore to build in-core structures. 
12880Sstevel@tonic-gate */ 12890Sstevel@tonic-gate while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0) { 12900Sstevel@tonic-gate /* if we've already gotten this record, go to the next one */ 12910Sstevel@tonic-gate if (mddb_getrecprivate(recid) & MD_PRV_GOTIT) 12920Sstevel@tonic-gate continue; 12930Sstevel@tonic-gate 12940Sstevel@tonic-gate 12950Sstevel@tonic-gate dep = mddb_getrecdep(recid); 12960Sstevel@tonic-gate dep->de_flags = MDDB_F_SOFTPART; 12970Sstevel@tonic-gate rbp = dep->de_rb; 12980Sstevel@tonic-gate 12991623Stw21770 switch (rbp->rb_revision) { 13001623Stw21770 case MDDB_REV_RB: 13011623Stw21770 case MDDB_REV_RBFN: 13021623Stw21770 if ((rbp->rb_private & MD_PRV_CONVD) == 0) { 13031623Stw21770 /* 13041623Stw21770 * This means, we have an old and small record. 13051623Stw21770 * And this record hasn't already been converted 13061623Stw21770 * :-o before we create an incore metadevice 13071623Stw21770 * from this we have to convert it to a big 13081623Stw21770 * record. 13091623Stw21770 */ 13101623Stw21770 small_un = 13111623Stw21770 (mp_unit32_od_t *)mddb_getrecaddr(recid); 13121623Stw21770 newreqsize = sizeof (mp_unit_t) + 1313*7627SChris.Horne@Sun.COM ((small_un->un_numexts - 1) * 1314*7627SChris.Horne@Sun.COM sizeof (struct mp_ext)); 13151623Stw21770 big_un = (mp_unit_t *)kmem_zalloc(newreqsize, 1316*7627SChris.Horne@Sun.COM KM_SLEEP); 13171623Stw21770 softpart_convert((caddr_t)small_un, 1318*7627SChris.Horne@Sun.COM (caddr_t)big_un, SMALL_2_BIG); 13191623Stw21770 kmem_free(small_un, dep->de_reqsize); 13201623Stw21770 dep->de_rb_userdata = big_un; 13211623Stw21770 dep->de_reqsize = newreqsize; 13221623Stw21770 rbp->rb_private |= MD_PRV_CONVD; 13231623Stw21770 un = big_un; 13241623Stw21770 } else { 13251623Stw21770 /* Record has already been converted */ 13261623Stw21770 un = (mp_unit_t *)mddb_getrecaddr(recid); 13271623Stw21770 } 13281623Stw21770 un->c.un_revision &= ~MD_64BIT_META_DEV; 13291623Stw21770 break; 13301623Stw21770 case 
MDDB_REV_RB64: 13311623Stw21770 case MDDB_REV_RB64FN: 13320Sstevel@tonic-gate /* Large device */ 13330Sstevel@tonic-gate un = (mp_unit_t *)mddb_getrecaddr(recid); 13341623Stw21770 un->c.un_revision |= MD_64BIT_META_DEV; 13351623Stw21770 un->c.un_flag |= MD_EFILABEL; 13361623Stw21770 break; 13370Sstevel@tonic-gate } 13382077Stw21770 MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision); 13390Sstevel@tonic-gate 13400Sstevel@tonic-gate /* 13410Sstevel@tonic-gate * Create minor node for snarfed entry. 13420Sstevel@tonic-gate */ 13430Sstevel@tonic-gate (void) md_create_minor_node(MD_MIN2SET(MD_SID(un)), MD_SID(un)); 13440Sstevel@tonic-gate 13450Sstevel@tonic-gate if (MD_UNIT(MD_SID(un)) != NULL) { 13460Sstevel@tonic-gate /* unit is already in-core */ 13470Sstevel@tonic-gate mddb_setrecprivate(recid, MD_PRV_PENDDEL); 13480Sstevel@tonic-gate continue; 13490Sstevel@tonic-gate } 13500Sstevel@tonic-gate all_sp_gotten = 0; 13510Sstevel@tonic-gate if (sp_build_incore((void *)un, 1) == 0) { 13520Sstevel@tonic-gate mddb_setrecprivate(recid, MD_PRV_GOTIT); 13530Sstevel@tonic-gate md_create_unit_incore(MD_SID(un), &sp_md_ops, 0); 13540Sstevel@tonic-gate gotsomething = 1; 13550Sstevel@tonic-gate } 13560Sstevel@tonic-gate } 13570Sstevel@tonic-gate 13580Sstevel@tonic-gate if (!all_sp_gotten) 13590Sstevel@tonic-gate return (gotsomething); 13600Sstevel@tonic-gate /* double-check records */ 13610Sstevel@tonic-gate recid = mddb_makerecid(setno, 0); 13620Sstevel@tonic-gate while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0) 13630Sstevel@tonic-gate if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT)) 13640Sstevel@tonic-gate mddb_setrecprivate(recid, MD_PRV_PENDDEL); 13650Sstevel@tonic-gate 13660Sstevel@tonic-gate return (0); 13670Sstevel@tonic-gate } 13680Sstevel@tonic-gate 13690Sstevel@tonic-gate /* 13700Sstevel@tonic-gate * FUNCTION: sp_halt() 13710Sstevel@tonic-gate * INPUT: cmd - halt cmd. 13720Sstevel@tonic-gate * setno - set number. 13730Sstevel@tonic-gate * RETURNS: 0 - success. 
13740Sstevel@tonic-gate * 1 - err. 13750Sstevel@tonic-gate * PURPOSE: Perform driver halt operations. As with stripe, we 13760Sstevel@tonic-gate * support MD_HALT_CHECK and MD_HALT_DOIT. The first 13770Sstevel@tonic-gate * does a check to see if halting can be done safely 13780Sstevel@tonic-gate * (no open soft partitions), the second cleans up and 13790Sstevel@tonic-gate * shuts down the driver. 13800Sstevel@tonic-gate */ 13810Sstevel@tonic-gate static int 13820Sstevel@tonic-gate sp_halt(md_haltcmd_t cmd, set_t setno) 13830Sstevel@tonic-gate { 13840Sstevel@tonic-gate int i; 13850Sstevel@tonic-gate mdi_unit_t *ui; 13860Sstevel@tonic-gate minor_t mnum; 13870Sstevel@tonic-gate 13880Sstevel@tonic-gate if (cmd == MD_HALT_CLOSE) 13890Sstevel@tonic-gate return (0); 13900Sstevel@tonic-gate 13910Sstevel@tonic-gate if (cmd == MD_HALT_OPEN) 13920Sstevel@tonic-gate return (0); 13930Sstevel@tonic-gate 13940Sstevel@tonic-gate if (cmd == MD_HALT_UNLOAD) 13950Sstevel@tonic-gate return (0); 13960Sstevel@tonic-gate 13970Sstevel@tonic-gate if (cmd == MD_HALT_CHECK) { 13980Sstevel@tonic-gate for (i = 0; i < md_nunits; i++) { 13990Sstevel@tonic-gate mnum = MD_MKMIN(setno, i); 14000Sstevel@tonic-gate if ((ui = MDI_UNIT(mnum)) == NULL) 14010Sstevel@tonic-gate continue; 14020Sstevel@tonic-gate if (ui->ui_opsindex != sp_md_ops.md_selfindex) 14030Sstevel@tonic-gate continue; 14040Sstevel@tonic-gate if (md_unit_isopen(ui)) 14050Sstevel@tonic-gate return (1); 14060Sstevel@tonic-gate } 14070Sstevel@tonic-gate return (0); 14080Sstevel@tonic-gate } 14090Sstevel@tonic-gate 14100Sstevel@tonic-gate if (cmd != MD_HALT_DOIT) 14110Sstevel@tonic-gate return (1); 14120Sstevel@tonic-gate 14130Sstevel@tonic-gate for (i = 0; i < md_nunits; i++) { 14140Sstevel@tonic-gate mnum = MD_MKMIN(setno, i); 14150Sstevel@tonic-gate if ((ui = MDI_UNIT(mnum)) == NULL) 14160Sstevel@tonic-gate continue; 14170Sstevel@tonic-gate if (ui->ui_opsindex != sp_md_ops.md_selfindex) 14180Sstevel@tonic-gate continue; 
14190Sstevel@tonic-gate reset_sp((mp_unit_t *)MD_UNIT(mnum), mnum, 0); 14200Sstevel@tonic-gate } 14210Sstevel@tonic-gate 14220Sstevel@tonic-gate return (0); 14230Sstevel@tonic-gate } 14240Sstevel@tonic-gate 14250Sstevel@tonic-gate /* 14260Sstevel@tonic-gate * FUNCTION: sp_open_dev() 14270Sstevel@tonic-gate * INPUT: un - unit structure. 14280Sstevel@tonic-gate * oflags - open flags. 14290Sstevel@tonic-gate * OUTPUT: none. 14300Sstevel@tonic-gate * RETURNS: 0 - success. 14310Sstevel@tonic-gate * non-zero - err. 14320Sstevel@tonic-gate * PURPOSE: open underlying device via md_layered_open. 14330Sstevel@tonic-gate */ 14340Sstevel@tonic-gate static int 14350Sstevel@tonic-gate sp_open_dev(mp_unit_t *un, int oflags) 14360Sstevel@tonic-gate { 14370Sstevel@tonic-gate minor_t mnum = MD_SID(un); 14380Sstevel@tonic-gate int err; 14390Sstevel@tonic-gate md_dev64_t tmpdev; 14400Sstevel@tonic-gate set_t setno = MD_MIN2SET(MD_SID(un)); 14410Sstevel@tonic-gate side_t side = mddb_getsidenum(setno); 14420Sstevel@tonic-gate 14430Sstevel@tonic-gate tmpdev = un->un_dev; 14440Sstevel@tonic-gate /* 14450Sstevel@tonic-gate * Do the open by device id if underlying is regular 14460Sstevel@tonic-gate */ 14470Sstevel@tonic-gate if ((md_getmajor(tmpdev) != md_major) && 1448*7627SChris.Horne@Sun.COM md_devid_found(setno, side, un->un_key) == 1) { 14490Sstevel@tonic-gate tmpdev = md_resolve_bydevid(mnum, tmpdev, un->un_key); 14500Sstevel@tonic-gate } 14510Sstevel@tonic-gate err = md_layered_open(mnum, &tmpdev, oflags); 14520Sstevel@tonic-gate un->un_dev = tmpdev; 14530Sstevel@tonic-gate 14540Sstevel@tonic-gate if (err) 14550Sstevel@tonic-gate return (ENXIO); 14560Sstevel@tonic-gate 14570Sstevel@tonic-gate return (0); 14580Sstevel@tonic-gate } 14590Sstevel@tonic-gate 14600Sstevel@tonic-gate /* 14610Sstevel@tonic-gate * FUNCTION: sp_open() 14620Sstevel@tonic-gate * INPUT: dev - device to open. 14630Sstevel@tonic-gate * flag - pass-through flag. 
14640Sstevel@tonic-gate * otyp - pass-through open type. 14650Sstevel@tonic-gate * cred_p - credentials. 14660Sstevel@tonic-gate * md_oflags - open flags. 14670Sstevel@tonic-gate * OUTPUT: none. 14680Sstevel@tonic-gate * RETURNS: 0 - success. 14690Sstevel@tonic-gate * non-zero - err. 14700Sstevel@tonic-gate * PURPOSE: open a soft partition. 14710Sstevel@tonic-gate */ 14720Sstevel@tonic-gate /* ARGSUSED */ 14730Sstevel@tonic-gate static int 14740Sstevel@tonic-gate sp_open( 14750Sstevel@tonic-gate dev_t *dev, 14760Sstevel@tonic-gate int flag, 14770Sstevel@tonic-gate int otyp, 14780Sstevel@tonic-gate cred_t *cred_p, 14790Sstevel@tonic-gate int md_oflags 14800Sstevel@tonic-gate ) 14810Sstevel@tonic-gate { 14820Sstevel@tonic-gate minor_t mnum = getminor(*dev); 14830Sstevel@tonic-gate mdi_unit_t *ui = MDI_UNIT(mnum); 14840Sstevel@tonic-gate mp_unit_t *un; 14850Sstevel@tonic-gate int err = 0; 14860Sstevel@tonic-gate set_t setno; 14870Sstevel@tonic-gate 148846Sskamm /* 148946Sskamm * When doing an open of a multi owner metadevice, check to see if this 149046Sskamm * node is a starting node and if a reconfig cycle is underway. 149146Sskamm * If so, the system isn't sufficiently set up enough to handle the 149246Sskamm * open (which involves I/O during sp_validate), so fail with ENXIO. 149346Sskamm */ 149446Sskamm setno = MD_MIN2SET(mnum); 149546Sskamm if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) == 149646Sskamm (MD_SET_MNSET | MD_SET_MN_START_RC)) { 149746Sskamm return (ENXIO); 149846Sskamm } 149946Sskamm 15000Sstevel@tonic-gate /* grab necessary locks */ 15010Sstevel@tonic-gate un = (mp_unit_t *)md_unit_openclose_enter(ui); 15020Sstevel@tonic-gate setno = MD_UN2SET(un); 15030Sstevel@tonic-gate 15040Sstevel@tonic-gate /* open underlying device, if necessary */ 15050Sstevel@tonic-gate if (! 
md_unit_isopen(ui) || (md_oflags & MD_OFLG_PROBEDEV)) { 15060Sstevel@tonic-gate if ((err = sp_open_dev(un, md_oflags)) != 0) 15070Sstevel@tonic-gate goto out; 15080Sstevel@tonic-gate 15090Sstevel@tonic-gate if (MD_MNSET_SETNO(setno)) { 15100Sstevel@tonic-gate /* For probe, don't incur the overhead of validate */ 15110Sstevel@tonic-gate if (!(md_oflags & MD_OFLG_PROBEDEV)) { 15120Sstevel@tonic-gate /* 15130Sstevel@tonic-gate * Don't call sp_validate while 15140Sstevel@tonic-gate * unit_openclose lock is held. So, actually 15150Sstevel@tonic-gate * open the device, drop openclose lock, 15160Sstevel@tonic-gate * call sp_validate, reacquire openclose lock, 15170Sstevel@tonic-gate * and close the device. If sp_validate 15180Sstevel@tonic-gate * succeeds, then device will be re-opened. 15190Sstevel@tonic-gate */ 15200Sstevel@tonic-gate if ((err = md_unit_incopen(mnum, flag, 15210Sstevel@tonic-gate otyp)) != 0) 15220Sstevel@tonic-gate goto out; 15230Sstevel@tonic-gate 15240Sstevel@tonic-gate mutex_enter(&ui->ui_mx); 15250Sstevel@tonic-gate ui->ui_lock |= MD_UL_OPENINPROGRESS; 15260Sstevel@tonic-gate mutex_exit(&ui->ui_mx); 15270Sstevel@tonic-gate md_unit_openclose_exit(ui); 15280Sstevel@tonic-gate if (otyp != OTYP_LYR) 15290Sstevel@tonic-gate rw_exit(&md_unit_array_rw.lock); 15300Sstevel@tonic-gate 15310Sstevel@tonic-gate err = sp_validate(un); 15320Sstevel@tonic-gate 15330Sstevel@tonic-gate if (otyp != OTYP_LYR) 15340Sstevel@tonic-gate rw_enter(&md_unit_array_rw.lock, 15350Sstevel@tonic-gate RW_READER); 15360Sstevel@tonic-gate (void) md_unit_openclose_enter(ui); 15370Sstevel@tonic-gate (void) md_unit_decopen(mnum, otyp); 15380Sstevel@tonic-gate mutex_enter(&ui->ui_mx); 15390Sstevel@tonic-gate ui->ui_lock &= ~MD_UL_OPENINPROGRESS; 15400Sstevel@tonic-gate cv_broadcast(&ui->ui_cv); 15410Sstevel@tonic-gate mutex_exit(&ui->ui_mx); 15420Sstevel@tonic-gate /* 15430Sstevel@tonic-gate * Should be in the same state as before 15440Sstevel@tonic-gate * the sp_validate. 
15450Sstevel@tonic-gate */ 15460Sstevel@tonic-gate if (err != 0) { 15470Sstevel@tonic-gate /* close the device opened above */ 15480Sstevel@tonic-gate md_layered_close(un->un_dev, md_oflags); 15490Sstevel@tonic-gate err = EIO; 15500Sstevel@tonic-gate goto out; 15510Sstevel@tonic-gate } 15520Sstevel@tonic-gate } 15530Sstevel@tonic-gate /* 15540Sstevel@tonic-gate * As we're a multi-owner metadevice we need to ensure 15550Sstevel@tonic-gate * that all nodes have the same idea of the status. 15560Sstevel@tonic-gate * sp_validate() will mark the device as errored (if 15570Sstevel@tonic-gate * it cannot read the watermark) or ok (if it was 15580Sstevel@tonic-gate * previously errored but the watermark is now valid). 15590Sstevel@tonic-gate * This code-path is only entered on the non-probe open 15600Sstevel@tonic-gate * so we will maintain the errored state during a probe 15610Sstevel@tonic-gate * call. This means the sys-admin must metarecover -m 15620Sstevel@tonic-gate * to reset the soft-partition error. 
15630Sstevel@tonic-gate */ 15640Sstevel@tonic-gate } else { 15650Sstevel@tonic-gate /* For probe, don't incur the overhead of validate */ 15660Sstevel@tonic-gate if (!(md_oflags & MD_OFLG_PROBEDEV) && 15670Sstevel@tonic-gate (err = sp_validate(un)) != 0) { 15680Sstevel@tonic-gate /* close the device opened above */ 15690Sstevel@tonic-gate md_layered_close(un->un_dev, md_oflags); 15700Sstevel@tonic-gate err = EIO; 15710Sstevel@tonic-gate goto out; 15720Sstevel@tonic-gate } else { 15730Sstevel@tonic-gate /* 15740Sstevel@tonic-gate * we succeeded in validating the on disk 15750Sstevel@tonic-gate * format versus the in core, so reset the 15760Sstevel@tonic-gate * status if it's in error 15770Sstevel@tonic-gate */ 15780Sstevel@tonic-gate if (un->un_status == MD_SP_ERR) { 15790Sstevel@tonic-gate un->un_status = MD_SP_OK; 15800Sstevel@tonic-gate } 15810Sstevel@tonic-gate } 15820Sstevel@tonic-gate } 15830Sstevel@tonic-gate } 15840Sstevel@tonic-gate 15850Sstevel@tonic-gate /* count open */ 15860Sstevel@tonic-gate if ((err = md_unit_incopen(mnum, flag, otyp)) != 0) 15870Sstevel@tonic-gate goto out; 15880Sstevel@tonic-gate 15890Sstevel@tonic-gate out: 15900Sstevel@tonic-gate md_unit_openclose_exit(ui); 15910Sstevel@tonic-gate return (err); 15920Sstevel@tonic-gate } 15930Sstevel@tonic-gate 15940Sstevel@tonic-gate /* 15950Sstevel@tonic-gate * FUNCTION: sp_close() 15960Sstevel@tonic-gate * INPUT: dev - device to close. 15970Sstevel@tonic-gate * flag - pass-through flag. 15980Sstevel@tonic-gate * otyp - pass-through type. 15990Sstevel@tonic-gate * cred_p - credentials. 16000Sstevel@tonic-gate * md_cflags - close flags. 16010Sstevel@tonic-gate * OUTPUT: none. 16020Sstevel@tonic-gate * RETURNS: 0 - success. 16030Sstevel@tonic-gate * non-zero - err. 16040Sstevel@tonic-gate * PURPOSE: close a soft paritition. 
16050Sstevel@tonic-gate */ 16060Sstevel@tonic-gate /* ARGSUSED */ 16070Sstevel@tonic-gate static int 16080Sstevel@tonic-gate sp_close( 16090Sstevel@tonic-gate dev_t dev, 16100Sstevel@tonic-gate int flag, 16110Sstevel@tonic-gate int otyp, 16120Sstevel@tonic-gate cred_t *cred_p, 16130Sstevel@tonic-gate int md_cflags 16140Sstevel@tonic-gate ) 16150Sstevel@tonic-gate { 16160Sstevel@tonic-gate minor_t mnum = getminor(dev); 16170Sstevel@tonic-gate mdi_unit_t *ui = MDI_UNIT(mnum); 16180Sstevel@tonic-gate mp_unit_t *un; 16190Sstevel@tonic-gate int err = 0; 16200Sstevel@tonic-gate 16210Sstevel@tonic-gate /* grab necessary locks */ 16220Sstevel@tonic-gate un = (mp_unit_t *)md_unit_openclose_enter(ui); 16230Sstevel@tonic-gate 16240Sstevel@tonic-gate /* count closed */ 16250Sstevel@tonic-gate if ((err = md_unit_decopen(mnum, otyp)) != 0) 16260Sstevel@tonic-gate goto out; 16270Sstevel@tonic-gate 16280Sstevel@tonic-gate /* close devices, if necessary */ 16290Sstevel@tonic-gate if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) { 16300Sstevel@tonic-gate md_layered_close(un->un_dev, md_cflags); 16310Sstevel@tonic-gate } 16320Sstevel@tonic-gate 16330Sstevel@tonic-gate /* 16340Sstevel@tonic-gate * If a MN set and transient capabilities (eg ABR/DMR) are set, 16350Sstevel@tonic-gate * clear these capabilities if this is the last close in 16360Sstevel@tonic-gate * the cluster 16370Sstevel@tonic-gate */ 16380Sstevel@tonic-gate if (MD_MNSET_SETNO(MD_UN2SET(un)) && 16390Sstevel@tonic-gate (ui->ui_tstate & MD_ABR_CAP)) { 16400Sstevel@tonic-gate md_unit_openclose_exit(ui); 16410Sstevel@tonic-gate mdmn_clear_all_capabilities(mnum); 16420Sstevel@tonic-gate return (0); 16430Sstevel@tonic-gate } 16440Sstevel@tonic-gate /* unlock, return success */ 16450Sstevel@tonic-gate out: 16460Sstevel@tonic-gate md_unit_openclose_exit(ui); 16470Sstevel@tonic-gate return (err); 16480Sstevel@tonic-gate } 16490Sstevel@tonic-gate 16500Sstevel@tonic-gate 16510Sstevel@tonic-gate /* used in sp_dump 
routine */ 16520Sstevel@tonic-gate static struct buf dumpbuf; 16530Sstevel@tonic-gate 16540Sstevel@tonic-gate /* 16550Sstevel@tonic-gate * FUNCTION: sp_dump() 16560Sstevel@tonic-gate * INPUT: dev - device to dump to. 16570Sstevel@tonic-gate * addr - address to dump. 16580Sstevel@tonic-gate * blkno - blkno on device. 16590Sstevel@tonic-gate * nblk - number of blocks to dump. 16600Sstevel@tonic-gate * OUTPUT: none. 16610Sstevel@tonic-gate * RETURNS: result from bdev_dump. 16620Sstevel@tonic-gate * PURPOSE: This routine dumps memory to the disk. It assumes that 16630Sstevel@tonic-gate * the memory has already been mapped into mainbus space. 16640Sstevel@tonic-gate * It is called at disk interrupt priority when the system 16650Sstevel@tonic-gate * is in trouble. 16660Sstevel@tonic-gate * NOTE: this function is defined using 32-bit arguments, 16670Sstevel@tonic-gate * but soft partitioning is internally 64-bit. Arguments 16680Sstevel@tonic-gate * are casted where appropriate. 16690Sstevel@tonic-gate */ 16700Sstevel@tonic-gate static int 16710Sstevel@tonic-gate sp_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk) 16720Sstevel@tonic-gate { 16730Sstevel@tonic-gate mp_unit_t *un; 16740Sstevel@tonic-gate buf_t *bp; 16750Sstevel@tonic-gate sp_ext_length_t nb; 16760Sstevel@tonic-gate daddr_t mapblk; 16770Sstevel@tonic-gate int result; 16780Sstevel@tonic-gate int more; 16790Sstevel@tonic-gate int saveresult = 0; 16800Sstevel@tonic-gate 16810Sstevel@tonic-gate /* 16820Sstevel@tonic-gate * Don't need to grab the unit lock. 16830Sstevel@tonic-gate * Cause nothing else is supposed to be happenning. 16840Sstevel@tonic-gate * Also dump is not supposed to sleep. 
16850Sstevel@tonic-gate */ 16860Sstevel@tonic-gate un = (mp_unit_t *)MD_UNIT(getminor(dev)); 16870Sstevel@tonic-gate 16880Sstevel@tonic-gate if ((diskaddr_t)blkno >= un->c.un_total_blocks) 16890Sstevel@tonic-gate return (EINVAL); 16900Sstevel@tonic-gate 16910Sstevel@tonic-gate if (((diskaddr_t)blkno + nblk) > un->c.un_total_blocks) 16920Sstevel@tonic-gate return (EINVAL); 16930Sstevel@tonic-gate 16940Sstevel@tonic-gate bp = &dumpbuf; 16950Sstevel@tonic-gate nb = (sp_ext_length_t)dbtob(nblk); 16960Sstevel@tonic-gate do { 16970Sstevel@tonic-gate bzero((caddr_t)bp, sizeof (*bp)); 16980Sstevel@tonic-gate more = sp_mapbuf(un, (sp_ext_offset_t)blkno, nb, bp); 16990Sstevel@tonic-gate nblk = (int)(btodb(bp->b_bcount)); 17000Sstevel@tonic-gate mapblk = bp->b_blkno; 17010Sstevel@tonic-gate result = bdev_dump(bp->b_edev, addr, mapblk, nblk); 17020Sstevel@tonic-gate if (result) 17030Sstevel@tonic-gate saveresult = result; 17040Sstevel@tonic-gate 17050Sstevel@tonic-gate nb -= bp->b_bcount; 17060Sstevel@tonic-gate addr += bp->b_bcount; 17070Sstevel@tonic-gate blkno += nblk; 17080Sstevel@tonic-gate } while (more); 17090Sstevel@tonic-gate 17100Sstevel@tonic-gate return (saveresult); 17110Sstevel@tonic-gate } 17120Sstevel@tonic-gate 17130Sstevel@tonic-gate static int 17140Sstevel@tonic-gate sp_imp_set( 17150Sstevel@tonic-gate set_t setno 17160Sstevel@tonic-gate ) 17170Sstevel@tonic-gate { 17180Sstevel@tonic-gate mddb_recid_t recid; 17190Sstevel@tonic-gate int gotsomething; 17200Sstevel@tonic-gate mddb_type_t rec_type; 17210Sstevel@tonic-gate mddb_de_ic_t *dep; 17220Sstevel@tonic-gate mddb_rb32_t *rbp; 17230Sstevel@tonic-gate mp_unit_t *un64; 17240Sstevel@tonic-gate mp_unit32_od_t *un32; 17251623Stw21770 md_dev64_t self_devt; 17260Sstevel@tonic-gate minor_t *self_id; /* minor needs to be updated */ 17270Sstevel@tonic-gate md_parent_t *parent_id; /* parent needs to be updated */ 17280Sstevel@tonic-gate mddb_recid_t *record_id; /* record id needs to be updated */ 
17290Sstevel@tonic-gate 17300Sstevel@tonic-gate gotsomething = 0; 17310Sstevel@tonic-gate 17320Sstevel@tonic-gate rec_type = (mddb_type_t)md_getshared_key(setno, 1733*7627SChris.Horne@Sun.COM sp_md_ops.md_driver.md_drivername); 17340Sstevel@tonic-gate recid = mddb_makerecid(setno, 0); 17350Sstevel@tonic-gate 17360Sstevel@tonic-gate while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0) { 17370Sstevel@tonic-gate if (mddb_getrecprivate(recid) & MD_PRV_GOTIT) 17380Sstevel@tonic-gate continue; 17390Sstevel@tonic-gate 17400Sstevel@tonic-gate dep = mddb_getrecdep(recid); 17410Sstevel@tonic-gate rbp = dep->de_rb; 17420Sstevel@tonic-gate 17431623Stw21770 switch (rbp->rb_revision) { 17441623Stw21770 case MDDB_REV_RB: 17451623Stw21770 case MDDB_REV_RBFN: 17460Sstevel@tonic-gate /* 17470Sstevel@tonic-gate * Small device 17480Sstevel@tonic-gate */ 17490Sstevel@tonic-gate un32 = (mp_unit32_od_t *)mddb_getrecaddr(recid); 17500Sstevel@tonic-gate self_id = &(un32->c.un_self_id); 17510Sstevel@tonic-gate parent_id = &(un32->c.un_parent); 17520Sstevel@tonic-gate record_id = &(un32->c.un_record_id); 17530Sstevel@tonic-gate 17540Sstevel@tonic-gate if (!md_update_minor(setno, mddb_getsidenum 1755*7627SChris.Horne@Sun.COM (setno), un32->un_key)) 17560Sstevel@tonic-gate goto out; 17571623Stw21770 break; 17581623Stw21770 17591623Stw21770 case MDDB_REV_RB64: 17601623Stw21770 case MDDB_REV_RB64FN: 17610Sstevel@tonic-gate un64 = (mp_unit_t *)mddb_getrecaddr(recid); 17620Sstevel@tonic-gate self_id = &(un64->c.un_self_id); 17630Sstevel@tonic-gate parent_id = &(un64->c.un_parent); 17640Sstevel@tonic-gate record_id = &(un64->c.un_record_id); 17650Sstevel@tonic-gate 17660Sstevel@tonic-gate if (!md_update_minor(setno, mddb_getsidenum 1767*7627SChris.Horne@Sun.COM (setno), un64->un_key)) 17680Sstevel@tonic-gate goto out; 17691623Stw21770 break; 17701623Stw21770 } 17711623Stw21770 17721623Stw21770 /* 17731623Stw21770 * If this is a top level and a friendly name metadevice, 17741623Stw21770 * 
update its minor in the namespace. 17751623Stw21770 */ 17761623Stw21770 if ((*parent_id == MD_NO_PARENT) && 17771623Stw21770 ((rbp->rb_revision == MDDB_REV_RBFN) || 17781623Stw21770 (rbp->rb_revision == MDDB_REV_RB64FN))) { 17791623Stw21770 17801623Stw21770 self_devt = md_makedevice(md_major, *self_id); 17811623Stw21770 if (!md_update_top_device_minor(setno, 17821623Stw21770 mddb_getsidenum(setno), self_devt)) 17831623Stw21770 goto out; 17840Sstevel@tonic-gate } 17850Sstevel@tonic-gate 17860Sstevel@tonic-gate /* 17870Sstevel@tonic-gate * Update unit with the imported setno 17880Sstevel@tonic-gate * 17890Sstevel@tonic-gate */ 17900Sstevel@tonic-gate mddb_setrecprivate(recid, MD_PRV_GOTIT); 17910Sstevel@tonic-gate 17920Sstevel@tonic-gate *self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id)); 17930Sstevel@tonic-gate if (*parent_id != MD_NO_PARENT) 17940Sstevel@tonic-gate *parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id)); 17950Sstevel@tonic-gate *record_id = MAKERECID(setno, DBID(*record_id)); 17960Sstevel@tonic-gate 17970Sstevel@tonic-gate gotsomething = 1; 17980Sstevel@tonic-gate } 17990Sstevel@tonic-gate 18000Sstevel@tonic-gate out: 18010Sstevel@tonic-gate return (gotsomething); 18020Sstevel@tonic-gate } 18030Sstevel@tonic-gate 18040Sstevel@tonic-gate static md_named_services_t sp_named_services[] = { 18050Sstevel@tonic-gate {NULL, 0} 18060Sstevel@tonic-gate }; 18070Sstevel@tonic-gate 18080Sstevel@tonic-gate md_ops_t sp_md_ops = { 18090Sstevel@tonic-gate sp_open, /* open */ 18100Sstevel@tonic-gate sp_close, /* close */ 18110Sstevel@tonic-gate md_sp_strategy, /* strategy */ 18120Sstevel@tonic-gate NULL, /* print */ 18130Sstevel@tonic-gate sp_dump, /* dump */ 18140Sstevel@tonic-gate NULL, /* read */ 18150Sstevel@tonic-gate NULL, /* write */ 18160Sstevel@tonic-gate md_sp_ioctl, /* ioctl, */ 18170Sstevel@tonic-gate sp_snarf, /* snarf */ 18180Sstevel@tonic-gate sp_halt, /* halt */ 18190Sstevel@tonic-gate NULL, /* aread */ 18200Sstevel@tonic-gate NULL, /* awrite */ 
18210Sstevel@tonic-gate sp_imp_set, /* import set */ 18220Sstevel@tonic-gate sp_named_services 18230Sstevel@tonic-gate }; 18240Sstevel@tonic-gate 18250Sstevel@tonic-gate static void 18260Sstevel@tonic-gate init_init() 18270Sstevel@tonic-gate { 18280Sstevel@tonic-gate sp_parent_cache = kmem_cache_create("md_softpart_parent", 18290Sstevel@tonic-gate sizeof (md_spps_t), 0, sp_parent_constructor, 18300Sstevel@tonic-gate sp_parent_destructor, sp_run_queue, NULL, NULL, 0); 18310Sstevel@tonic-gate sp_child_cache = kmem_cache_create("md_softpart_child", 18320Sstevel@tonic-gate sizeof (md_spcs_t) - sizeof (buf_t) + biosize(), 0, 18330Sstevel@tonic-gate sp_child_constructor, sp_child_destructor, sp_run_queue, 18340Sstevel@tonic-gate NULL, NULL, 0); 18350Sstevel@tonic-gate } 18360Sstevel@tonic-gate 18370Sstevel@tonic-gate static void 18380Sstevel@tonic-gate fini_uninit() 18390Sstevel@tonic-gate { 18400Sstevel@tonic-gate kmem_cache_destroy(sp_parent_cache); 18410Sstevel@tonic-gate kmem_cache_destroy(sp_child_cache); 18420Sstevel@tonic-gate sp_parent_cache = sp_child_cache = NULL; 18430Sstevel@tonic-gate } 18440Sstevel@tonic-gate 18450Sstevel@tonic-gate /* define the module linkage */ 18464932Spetede MD_PLUGIN_MISC_MODULE("soft partition module", init_init(), fini_uninit()) 1847