10Sstevel@tonic-gate /*
20Sstevel@tonic-gate * CDDL HEADER START
30Sstevel@tonic-gate *
40Sstevel@tonic-gate * The contents of this file are subject to the terms of the
51366Spetede * Common Development and Distribution License (the "License").
61366Spetede * You may not use this file except in compliance with the License.
70Sstevel@tonic-gate *
80Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
90Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing.
100Sstevel@tonic-gate * See the License for the specific language governing permissions
110Sstevel@tonic-gate * and limitations under the License.
120Sstevel@tonic-gate *
130Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each
140Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
150Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the
160Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying
170Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner]
180Sstevel@tonic-gate *
190Sstevel@tonic-gate * CDDL HEADER END
200Sstevel@tonic-gate */
217627SChris.Horne@Sun.COM
220Sstevel@tonic-gate /*
23*11130SJames.Hall@Sun.COM * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
240Sstevel@tonic-gate * Use is subject to license terms.
250Sstevel@tonic-gate */
260Sstevel@tonic-gate
/*
 * Soft partitioning metadevice driver (md_sp).
 *
 * This file contains the primary operations of the soft partitioning
 * metadevice driver.  This includes all routines for normal operation
 * (open/close/read/write).  Please see mdvar.h for a definition of
 * metadevice operations vector (md_ops_t).  This driver is loosely
 * based on the stripe driver (md_stripe).
 *
 * All metadevice administration is done through the use of ioctl's.
 * As such, all administrative routines appear in sp_ioctl.c.
 *
 * Soft partitions are represented both in-core and in the metadb with a
 * unit structure.  The soft partition-specific information in the unit
 * structure includes the following information:
 *	- Device information (md_dev64_t & md key) about the device on which
 *	  the soft partition is built.
 *	- Soft partition status information.
 *	- The size of the soft partition and number of extents used to
 *	  make up that size.
 *	- An array of extents which define virtual/physical offset
 *	  mappings and lengths for each extent.
 *
 * Typical soft partition operation proceeds as follows:
 *	- The unit structure is fetched from the metadb and placed into
 *	  an in-core array (as with other metadevices).  This operation
 *	  is performed via sp_build_incore( ) and takes place during
 *	  "snarfing" (when all metadevices are brought in-core at
 *	  once) and when a new soft partition is created.
 *	- A soft partition is opened via sp_open( ).  At open time the
 *	  soft partition unit structure is verified with the soft
 *	  partition on-disk structures.  Additionally, the soft partition
 *	  status is checked (only soft partitions in the OK state may be
 *	  opened).
 *	- Soft partition I/O is performed via sp_strategy( ) which relies on
 *	  a support routine, sp_mapbuf( ), to do most of the work.
 *	  sp_mapbuf( ) maps a buffer to a particular extent via a binary
 *	  search of the extent array in the soft partition unit structure.
 *	  Once a translation has been performed, the I/O is passed down
 *	  to the next layer, which may be another metadevice or a physical
 *	  disk.  Since a soft partition may contain multiple, non-contiguous
 *	  extents, a single I/O may have to be fragmented.
 *	- Soft partitions are closed using sp_close.
 *
 */
720Sstevel@tonic-gate
730Sstevel@tonic-gate #include <sys/param.h>
740Sstevel@tonic-gate #include <sys/systm.h>
750Sstevel@tonic-gate #include <sys/conf.h>
760Sstevel@tonic-gate #include <sys/file.h>
770Sstevel@tonic-gate #include <sys/user.h>
780Sstevel@tonic-gate #include <sys/uio.h>
790Sstevel@tonic-gate #include <sys/t_lock.h>
800Sstevel@tonic-gate #include <sys/buf.h>
810Sstevel@tonic-gate #include <sys/dkio.h>
820Sstevel@tonic-gate #include <sys/vtoc.h>
830Sstevel@tonic-gate #include <sys/kmem.h>
840Sstevel@tonic-gate #include <vm/page.h>
850Sstevel@tonic-gate #include <sys/cmn_err.h>
860Sstevel@tonic-gate #include <sys/sysmacros.h>
870Sstevel@tonic-gate #include <sys/types.h>
880Sstevel@tonic-gate #include <sys/mkdev.h>
890Sstevel@tonic-gate #include <sys/stat.h>
900Sstevel@tonic-gate #include <sys/open.h>
910Sstevel@tonic-gate #include <sys/lvm/mdvar.h>
920Sstevel@tonic-gate #include <sys/lvm/md_sp.h>
930Sstevel@tonic-gate #include <sys/lvm/md_convert.h>
940Sstevel@tonic-gate #include <sys/lvm/md_notify.h>
950Sstevel@tonic-gate #include <sys/lvm/md_crc.h>
960Sstevel@tonic-gate #include <sys/modctl.h>
970Sstevel@tonic-gate #include <sys/ddi.h>
980Sstevel@tonic-gate #include <sys/sunddi.h>
990Sstevel@tonic-gate #include <sys/debug.h>
1000Sstevel@tonic-gate
1010Sstevel@tonic-gate #include <sys/sysevent/eventdefs.h>
1020Sstevel@tonic-gate #include <sys/sysevent/svm.h>
1030Sstevel@tonic-gate
1040Sstevel@tonic-gate md_ops_t sp_md_ops;
1050Sstevel@tonic-gate #ifndef lint
1061366Spetede char _depends_on[] = "drv/md";
1070Sstevel@tonic-gate md_ops_t *md_interface_ops = &sp_md_ops;
1080Sstevel@tonic-gate #endif
1090Sstevel@tonic-gate
1100Sstevel@tonic-gate extern unit_t md_nunits;
1110Sstevel@tonic-gate extern set_t md_nsets;
1120Sstevel@tonic-gate extern md_set_t md_set[];
1130Sstevel@tonic-gate
1140Sstevel@tonic-gate extern int md_status;
1150Sstevel@tonic-gate extern major_t md_major;
1160Sstevel@tonic-gate extern mdq_anchor_t md_done_daemon;
1170Sstevel@tonic-gate extern mdq_anchor_t md_sp_daemon;
1180Sstevel@tonic-gate extern kmutex_t md_mx;
1190Sstevel@tonic-gate extern kcondvar_t md_cv;
1200Sstevel@tonic-gate extern md_krwlock_t md_unit_array_rw;
1218452SJohn.Wren.Kennedy@Sun.COM extern clock_t md_hz;
1220Sstevel@tonic-gate
1230Sstevel@tonic-gate static kmem_cache_t *sp_parent_cache = NULL;
1240Sstevel@tonic-gate static kmem_cache_t *sp_child_cache = NULL;
1250Sstevel@tonic-gate static void sp_send_stat_ok(mp_unit_t *);
1260Sstevel@tonic-gate static void sp_send_stat_err(mp_unit_t *);
1270Sstevel@tonic-gate
1280Sstevel@tonic-gate /*
1290Sstevel@tonic-gate * FUNCTION: sp_parent_constructor()
1300Sstevel@tonic-gate * INPUT: none.
1310Sstevel@tonic-gate * OUTPUT: ps - parent save structure initialized.
1320Sstevel@tonic-gate * RETURNS: void * - ptr to initialized parent save structure.
1330Sstevel@tonic-gate * PURPOSE: initialize parent save structure.
1340Sstevel@tonic-gate */
1350Sstevel@tonic-gate /*ARGSUSED1*/
1360Sstevel@tonic-gate static int
sp_parent_constructor(void * p,void * d1,int d2)1370Sstevel@tonic-gate sp_parent_constructor(void *p, void *d1, int d2)
1380Sstevel@tonic-gate {
1390Sstevel@tonic-gate mutex_init(&((md_spps_t *)p)->ps_mx,
1400Sstevel@tonic-gate NULL, MUTEX_DEFAULT, NULL);
1410Sstevel@tonic-gate return (0);
1420Sstevel@tonic-gate }
1430Sstevel@tonic-gate
1440Sstevel@tonic-gate static void
sp_parent_init(md_spps_t * ps)1450Sstevel@tonic-gate sp_parent_init(md_spps_t *ps)
1460Sstevel@tonic-gate {
1470Sstevel@tonic-gate bzero(ps, offsetof(md_spps_t, ps_mx));
1480Sstevel@tonic-gate }
1490Sstevel@tonic-gate
1500Sstevel@tonic-gate /*ARGSUSED1*/
1510Sstevel@tonic-gate static void
sp_parent_destructor(void * p,void * d)1520Sstevel@tonic-gate sp_parent_destructor(void *p, void *d)
1530Sstevel@tonic-gate {
1540Sstevel@tonic-gate mutex_destroy(&((md_spps_t *)p)->ps_mx);
1550Sstevel@tonic-gate }
1560Sstevel@tonic-gate
1570Sstevel@tonic-gate /*
1580Sstevel@tonic-gate * FUNCTION: sp_child_constructor()
1590Sstevel@tonic-gate * INPUT: none.
1600Sstevel@tonic-gate * OUTPUT: cs - child save structure initialized.
1610Sstevel@tonic-gate * RETURNS: void * - ptr to initialized child save structure.
1620Sstevel@tonic-gate * PURPOSE: initialize child save structure.
1630Sstevel@tonic-gate */
1640Sstevel@tonic-gate /*ARGSUSED1*/
1650Sstevel@tonic-gate static int
sp_child_constructor(void * p,void * d1,int d2)1660Sstevel@tonic-gate sp_child_constructor(void *p, void *d1, int d2)
1670Sstevel@tonic-gate {
1680Sstevel@tonic-gate bioinit(&((md_spcs_t *)p)->cs_buf);
1690Sstevel@tonic-gate return (0);
1700Sstevel@tonic-gate }
1710Sstevel@tonic-gate
1720Sstevel@tonic-gate static void
sp_child_init(md_spcs_t * cs)1730Sstevel@tonic-gate sp_child_init(md_spcs_t *cs)
1740Sstevel@tonic-gate {
1750Sstevel@tonic-gate cs->cs_mdunit = 0;
1760Sstevel@tonic-gate cs->cs_ps = NULL;
1770Sstevel@tonic-gate md_bioreset(&cs->cs_buf);
1780Sstevel@tonic-gate }
1790Sstevel@tonic-gate
1800Sstevel@tonic-gate /*ARGSUSED1*/
1810Sstevel@tonic-gate static void
sp_child_destructor(void * p,void * d)1820Sstevel@tonic-gate sp_child_destructor(void *p, void *d)
1830Sstevel@tonic-gate {
1840Sstevel@tonic-gate biofini(&((md_spcs_t *)p)->cs_buf);
1850Sstevel@tonic-gate }
1860Sstevel@tonic-gate
1870Sstevel@tonic-gate /*
1880Sstevel@tonic-gate * FUNCTION: sp_run_queue()
1890Sstevel@tonic-gate * INPUT: none.
1900Sstevel@tonic-gate * OUTPUT: none.
1910Sstevel@tonic-gate * RETURNS: void.
1920Sstevel@tonic-gate * PURPOSE: run the md_daemon to clean up memory pool.
1930Sstevel@tonic-gate */
1940Sstevel@tonic-gate /*ARGSUSED*/
1950Sstevel@tonic-gate static void
sp_run_queue(void * d)1960Sstevel@tonic-gate sp_run_queue(void *d)
1970Sstevel@tonic-gate {
1980Sstevel@tonic-gate if (!(md_status & MD_GBL_DAEMONS_LIVE))
1990Sstevel@tonic-gate md_daemon(1, &md_done_daemon);
2000Sstevel@tonic-gate }
2010Sstevel@tonic-gate
2020Sstevel@tonic-gate
2030Sstevel@tonic-gate /*
2040Sstevel@tonic-gate * FUNCTION: sp_build_incore()
2050Sstevel@tonic-gate * INPUT: p - ptr to unit structure.
2060Sstevel@tonic-gate * snarfing - flag to tell us we are snarfing.
2070Sstevel@tonic-gate * OUTPUT: non.
2080Sstevel@tonic-gate * RETURNS: int - 0 (always).
2090Sstevel@tonic-gate * PURPOSE: place unit structure into in-core unit array (keyed from
2100Sstevel@tonic-gate * minor number).
2110Sstevel@tonic-gate */
2120Sstevel@tonic-gate int
sp_build_incore(void * p,int snarfing)2130Sstevel@tonic-gate sp_build_incore(void *p, int snarfing)
2140Sstevel@tonic-gate {
2150Sstevel@tonic-gate mp_unit_t *un = (mp_unit_t *)p;
2160Sstevel@tonic-gate minor_t mnum;
2170Sstevel@tonic-gate set_t setno;
2180Sstevel@tonic-gate md_dev64_t tmpdev;
2190Sstevel@tonic-gate
2200Sstevel@tonic-gate mnum = MD_SID(un);
2210Sstevel@tonic-gate
2220Sstevel@tonic-gate if (MD_UNIT(mnum) != NULL)
2230Sstevel@tonic-gate return (0);
2240Sstevel@tonic-gate
2250Sstevel@tonic-gate MD_STATUS(un) = 0;
2260Sstevel@tonic-gate
2270Sstevel@tonic-gate if (snarfing) {
2280Sstevel@tonic-gate /*
2290Sstevel@tonic-gate * if we are snarfing, we get the device information
2300Sstevel@tonic-gate * from the metadb record (using the metadb key for
2310Sstevel@tonic-gate * that device).
2320Sstevel@tonic-gate */
2330Sstevel@tonic-gate setno = MD_MIN2SET(mnum);
2340Sstevel@tonic-gate
2350Sstevel@tonic-gate tmpdev = md_getdevnum(setno, mddb_getsidenum(setno),
2360Sstevel@tonic-gate un->un_key, MD_NOTRUST_DEVT);
2370Sstevel@tonic-gate un->un_dev = tmpdev;
2380Sstevel@tonic-gate }
2390Sstevel@tonic-gate
2407627SChris.Horne@Sun.COM /* place various information in the in-core data structures */
2417627SChris.Horne@Sun.COM md_nblocks_set(mnum, un->c.un_total_blocks);
2420Sstevel@tonic-gate MD_UNIT(mnum) = un;
2437627SChris.Horne@Sun.COM
2440Sstevel@tonic-gate return (0);
2450Sstevel@tonic-gate }
2460Sstevel@tonic-gate
2470Sstevel@tonic-gate /*
2480Sstevel@tonic-gate * FUNCTION: reset_sp()
2490Sstevel@tonic-gate * INPUT: un - unit structure to be reset/removed.
2500Sstevel@tonic-gate * mnum - minor number to be reset/removed.
2510Sstevel@tonic-gate * removing - flag to tell us if we are removing
2520Sstevel@tonic-gate * permanently or just reseting in-core
2530Sstevel@tonic-gate * structures.
2540Sstevel@tonic-gate * OUTPUT: none.
2550Sstevel@tonic-gate * RETURNS: void.
2560Sstevel@tonic-gate * PURPOSE: used to either simply reset in-core structures or to
2570Sstevel@tonic-gate * permanently remove metadevices from the metadb.
2580Sstevel@tonic-gate */
2590Sstevel@tonic-gate void
reset_sp(mp_unit_t * un,minor_t mnum,int removing)2600Sstevel@tonic-gate reset_sp(mp_unit_t *un, minor_t mnum, int removing)
2610Sstevel@tonic-gate {
2620Sstevel@tonic-gate sv_dev_t *sv;
2630Sstevel@tonic-gate mddb_recid_t vtoc_id;
2640Sstevel@tonic-gate
2650Sstevel@tonic-gate /* clean up in-core structures */
2660Sstevel@tonic-gate md_destroy_unit_incore(mnum, &sp_md_ops);
2670Sstevel@tonic-gate
2687627SChris.Horne@Sun.COM md_nblocks_set(mnum, -1ULL);
2690Sstevel@tonic-gate MD_UNIT(mnum) = NULL;
2700Sstevel@tonic-gate
2711623Stw21770 /*
2721623Stw21770 * Attempt release of minor node
2731623Stw21770 */
2742077Stw21770 md_remove_minor_node(mnum);
2751623Stw21770
2760Sstevel@tonic-gate if (!removing)
2770Sstevel@tonic-gate return;
2780Sstevel@tonic-gate
2790Sstevel@tonic-gate /* we are removing the soft partition from the metadb */
2800Sstevel@tonic-gate
2810Sstevel@tonic-gate /*
2820Sstevel@tonic-gate * Save off device information so we can get to
2830Sstevel@tonic-gate * it after we do the mddb_deleterec().
2840Sstevel@tonic-gate */
2850Sstevel@tonic-gate sv = (sv_dev_t *)kmem_alloc(sizeof (sv_dev_t), KM_SLEEP);
2860Sstevel@tonic-gate sv->setno = MD_MIN2SET(mnum);
2870Sstevel@tonic-gate sv->key = un->un_key;
2880Sstevel@tonic-gate vtoc_id = un->c.un_vtoc_id;
2890Sstevel@tonic-gate
2901623Stw21770 /*
2911623Stw21770 * Remove self from the namespace
2921623Stw21770 */
2931623Stw21770 if (un->c.un_revision & MD_FN_META_DEV) {
2941623Stw21770 (void) md_rem_selfname(un->c.un_self_id);
2951623Stw21770 }
2961623Stw21770
2970Sstevel@tonic-gate /* Remove the unit structure */
2980Sstevel@tonic-gate mddb_deleterec_wrapper(un->c.un_record_id);
2990Sstevel@tonic-gate
3000Sstevel@tonic-gate if (vtoc_id)
3010Sstevel@tonic-gate mddb_deleterec_wrapper(vtoc_id);
3020Sstevel@tonic-gate
3030Sstevel@tonic-gate SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, TAG_METADEVICE,
3040Sstevel@tonic-gate MD_MIN2SET(mnum), MD_MIN2UNIT(mnum));
3050Sstevel@tonic-gate
3060Sstevel@tonic-gate /*
3070Sstevel@tonic-gate * remove the underlying device name from the metadb. if other
3080Sstevel@tonic-gate * soft partitions are built on this device, this will simply
3090Sstevel@tonic-gate * decrease the reference count for this device. otherwise the
3100Sstevel@tonic-gate * name record for this device will be removed from the metadb.
3110Sstevel@tonic-gate */
3120Sstevel@tonic-gate md_rem_names(sv, 1);
3130Sstevel@tonic-gate kmem_free(sv, sizeof (sv_dev_t));
3140Sstevel@tonic-gate }
3150Sstevel@tonic-gate
3160Sstevel@tonic-gate /*
3170Sstevel@tonic-gate * FUNCTION: sp_send_stat_msg
3180Sstevel@tonic-gate * INPUT: un - unit reference
3190Sstevel@tonic-gate * status - status to be sent to master node
3200Sstevel@tonic-gate * MD_SP_OK - soft-partition is now OK
3210Sstevel@tonic-gate * MD_SP_ERR " " errored
3220Sstevel@tonic-gate * OUTPUT: none.
3230Sstevel@tonic-gate * RETURNS: void.
3240Sstevel@tonic-gate * PURPOSE: send a soft-partition status change to the master node. If the
3250Sstevel@tonic-gate * message succeeds we simply return. If it fails we panic as the
3260Sstevel@tonic-gate * cluster-wide view of the metadevices is now inconsistent.
3270Sstevel@tonic-gate * CALLING CONTEXT:
3280Sstevel@tonic-gate * Blockable. No locks can be held.
3290Sstevel@tonic-gate */
3300Sstevel@tonic-gate static void
sp_send_stat_msg(mp_unit_t * un,sp_status_t status)3310Sstevel@tonic-gate sp_send_stat_msg(mp_unit_t *un, sp_status_t status)
3320Sstevel@tonic-gate {
3330Sstevel@tonic-gate md_mn_msg_sp_setstat_t sp_msg;
3340Sstevel@tonic-gate md_mn_kresult_t *kres;
3350Sstevel@tonic-gate set_t setno = MD_UN2SET(un);
3360Sstevel@tonic-gate int rval;
3370Sstevel@tonic-gate const char *str = (status == MD_SP_ERR) ? "MD_SP_ERR" : "MD_SP_OK";
338*11130SJames.Hall@Sun.COM int nretries = 0;
3390Sstevel@tonic-gate
3400Sstevel@tonic-gate sp_msg.sp_setstat_mnum = MD_SID(un);
3410Sstevel@tonic-gate sp_msg.sp_setstat_status = status;
3420Sstevel@tonic-gate
3430Sstevel@tonic-gate kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
3440Sstevel@tonic-gate
345*11130SJames.Hall@Sun.COM spss_msg:
3460Sstevel@tonic-gate rval = mdmn_ksend_message(setno, MD_MN_MSG_SP_SETSTAT2, MD_MSGF_NO_LOG,
3478452SJohn.Wren.Kennedy@Sun.COM 0, (char *)&sp_msg, sizeof (sp_msg), kres);
3480Sstevel@tonic-gate
3490Sstevel@tonic-gate if (!MDMN_KSEND_MSG_OK(rval, kres)) {
3500Sstevel@tonic-gate mdmn_ksend_show_error(rval, kres, "MD_MN_MSG_SP_SETSTAT2");
3518452SJohn.Wren.Kennedy@Sun.COM /* If we're shutting down already, pause things here. */
3528452SJohn.Wren.Kennedy@Sun.COM if (kres->kmmr_comm_state == MDMNE_RPC_FAIL) {
3538452SJohn.Wren.Kennedy@Sun.COM while (!md_mn_is_commd_present()) {
3548452SJohn.Wren.Kennedy@Sun.COM delay(md_hz);
3558452SJohn.Wren.Kennedy@Sun.COM }
356*11130SJames.Hall@Sun.COM /*
357*11130SJames.Hall@Sun.COM * commd is available again. Retry the message once.
358*11130SJames.Hall@Sun.COM * If it fails we panic as the system is in an
359*11130SJames.Hall@Sun.COM * unexpected state.
360*11130SJames.Hall@Sun.COM */
361*11130SJames.Hall@Sun.COM if (nretries++ == 0)
362*11130SJames.Hall@Sun.COM goto spss_msg;
3638452SJohn.Wren.Kennedy@Sun.COM }
3640Sstevel@tonic-gate /*
3650Sstevel@tonic-gate * Panic as we are now in an inconsistent state.
3660Sstevel@tonic-gate */
3670Sstevel@tonic-gate cmn_err(CE_PANIC, "md: %s: %s could not be set on all nodes\n",
3680Sstevel@tonic-gate md_shortname(MD_SID(un)), str);
3690Sstevel@tonic-gate }
3700Sstevel@tonic-gate
3710Sstevel@tonic-gate kmem_free(kres, sizeof (md_mn_kresult_t));
3720Sstevel@tonic-gate }
3730Sstevel@tonic-gate
3740Sstevel@tonic-gate /*
3750Sstevel@tonic-gate * FUNCTION: sp_finish_error
3760Sstevel@tonic-gate * INPUT: ps - parent save structure for error-ed I/O.
3770Sstevel@tonic-gate * lock_held - set if the unit readerlock is held
3780Sstevel@tonic-gate * OUTPUT: none.
3790Sstevel@tonic-gate * RETURNS: void.
3800Sstevel@tonic-gate * PURPOSE: report a driver error
3810Sstevel@tonic-gate */
3820Sstevel@tonic-gate static void
sp_finish_error(md_spps_t * ps,int lock_held)3830Sstevel@tonic-gate sp_finish_error(md_spps_t *ps, int lock_held)
3840Sstevel@tonic-gate {
3850Sstevel@tonic-gate struct buf *pb = ps->ps_bp;
3860Sstevel@tonic-gate mdi_unit_t *ui = ps->ps_ui;
3870Sstevel@tonic-gate md_dev64_t un_dev; /* underlying device */
3880Sstevel@tonic-gate md_dev64_t md_dev = md_expldev(pb->b_edev); /* metadev in error */
3890Sstevel@tonic-gate char *str;
3900Sstevel@tonic-gate
3910Sstevel@tonic-gate un_dev = md_expldev(ps->ps_un->un_dev);
3920Sstevel@tonic-gate /* set error type */
3930Sstevel@tonic-gate if (pb->b_flags & B_READ) {
3940Sstevel@tonic-gate str = "read";
3950Sstevel@tonic-gate } else {
3960Sstevel@tonic-gate str = "write";
3970Sstevel@tonic-gate }
3980Sstevel@tonic-gate
3990Sstevel@tonic-gate
4000Sstevel@tonic-gate SPPS_FREE(sp_parent_cache, ps);
4010Sstevel@tonic-gate pb->b_flags |= B_ERROR;
4020Sstevel@tonic-gate
4030Sstevel@tonic-gate md_kstat_done(ui, pb, 0);
4040Sstevel@tonic-gate
4050Sstevel@tonic-gate if (lock_held) {
4060Sstevel@tonic-gate md_unit_readerexit(ui);
4070Sstevel@tonic-gate }
4080Sstevel@tonic-gate md_biodone(pb);
4090Sstevel@tonic-gate
4100Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: %s error on %s",
4110Sstevel@tonic-gate md_shortname(md_getminor(md_dev)), str,
4120Sstevel@tonic-gate md_devname(MD_DEV2SET(md_dev), un_dev, NULL, 0));
4130Sstevel@tonic-gate }
4140Sstevel@tonic-gate
4150Sstevel@tonic-gate
4160Sstevel@tonic-gate /*
4170Sstevel@tonic-gate * FUNCTION: sp_xmit_ok
4180Sstevel@tonic-gate * INPUT: dq - daemon queue referencing failing ps structure
4190Sstevel@tonic-gate * OUTPUT: none.
4200Sstevel@tonic-gate * RETURNS: void.
4210Sstevel@tonic-gate * PURPOSE: send a message to the master node in a multi-owner diskset to
4220Sstevel@tonic-gate * update all attached nodes view of the soft-part to be MD_SP_OK.
4230Sstevel@tonic-gate * CALLING CONTEXT:
4240Sstevel@tonic-gate * Blockable. No unit lock held.
4250Sstevel@tonic-gate */
4260Sstevel@tonic-gate static void
sp_xmit_ok(daemon_queue_t * dq)4270Sstevel@tonic-gate sp_xmit_ok(daemon_queue_t *dq)
4280Sstevel@tonic-gate {
4290Sstevel@tonic-gate md_spps_t *ps = (md_spps_t *)dq;
4300Sstevel@tonic-gate
4310Sstevel@tonic-gate /* Send a MD_MN_MSG_SP_SETSTAT to the master */
4320Sstevel@tonic-gate sp_send_stat_msg(ps->ps_un, MD_SP_OK);
4330Sstevel@tonic-gate
4340Sstevel@tonic-gate /*
4350Sstevel@tonic-gate * Successfully transmitted error state to all nodes, now release this
4360Sstevel@tonic-gate * parent structure.
4370Sstevel@tonic-gate */
4380Sstevel@tonic-gate SPPS_FREE(sp_parent_cache, ps);
4390Sstevel@tonic-gate }
4400Sstevel@tonic-gate
4410Sstevel@tonic-gate /*
4420Sstevel@tonic-gate * FUNCTION: sp_xmit_error
4430Sstevel@tonic-gate * INPUT: dq - daemon queue referencing failing ps structure
4440Sstevel@tonic-gate * OUTPUT: none.
4450Sstevel@tonic-gate * RETURNS: void.
4460Sstevel@tonic-gate * PURPOSE: send a message to the master node in a multi-owner diskset to
4470Sstevel@tonic-gate * update all attached nodes view of the soft-part to be MD_SP_ERR.
4480Sstevel@tonic-gate * CALLING CONTEXT:
4490Sstevel@tonic-gate * Blockable. No unit lock held.
4500Sstevel@tonic-gate */
4510Sstevel@tonic-gate static void
sp_xmit_error(daemon_queue_t * dq)4520Sstevel@tonic-gate sp_xmit_error(daemon_queue_t *dq)
4530Sstevel@tonic-gate {
4540Sstevel@tonic-gate md_spps_t *ps = (md_spps_t *)dq;
4550Sstevel@tonic-gate
4560Sstevel@tonic-gate /* Send a MD_MN_MSG_SP_SETSTAT to the master */
4570Sstevel@tonic-gate sp_send_stat_msg(ps->ps_un, MD_SP_ERR);
4580Sstevel@tonic-gate
4590Sstevel@tonic-gate /*
4600Sstevel@tonic-gate * Successfully transmitted error state to all nodes, now release this
4610Sstevel@tonic-gate * parent structure.
4620Sstevel@tonic-gate */
4630Sstevel@tonic-gate SPPS_FREE(sp_parent_cache, ps);
4640Sstevel@tonic-gate }
4650Sstevel@tonic-gate static void
sp_send_stat_ok(mp_unit_t * un)4660Sstevel@tonic-gate sp_send_stat_ok(mp_unit_t *un)
4670Sstevel@tonic-gate {
4680Sstevel@tonic-gate minor_t mnum = MD_SID(un);
4690Sstevel@tonic-gate md_spps_t *ps;
4700Sstevel@tonic-gate
4710Sstevel@tonic-gate ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
4720Sstevel@tonic-gate sp_parent_init(ps);
4730Sstevel@tonic-gate ps->ps_un = un;
4740Sstevel@tonic-gate ps->ps_ui = MDI_UNIT(mnum);
4750Sstevel@tonic-gate
4760Sstevel@tonic-gate daemon_request(&md_sp_daemon, sp_xmit_ok, (daemon_queue_t *)ps,
4777627SChris.Horne@Sun.COM REQ_OLD);
4780Sstevel@tonic-gate }
4790Sstevel@tonic-gate
4800Sstevel@tonic-gate static void
sp_send_stat_err(mp_unit_t * un)4810Sstevel@tonic-gate sp_send_stat_err(mp_unit_t *un)
4820Sstevel@tonic-gate {
4830Sstevel@tonic-gate minor_t mnum = MD_SID(un);
4840Sstevel@tonic-gate md_spps_t *ps;
4850Sstevel@tonic-gate
4860Sstevel@tonic-gate ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
4870Sstevel@tonic-gate sp_parent_init(ps);
4880Sstevel@tonic-gate ps->ps_un = un;
4890Sstevel@tonic-gate ps->ps_ui = MDI_UNIT(mnum);
4900Sstevel@tonic-gate
4910Sstevel@tonic-gate daemon_request(&md_sp_daemon, sp_xmit_error, (daemon_queue_t *)ps,
4927627SChris.Horne@Sun.COM REQ_OLD);
4930Sstevel@tonic-gate }
4940Sstevel@tonic-gate
4950Sstevel@tonic-gate
4960Sstevel@tonic-gate /*
4970Sstevel@tonic-gate * FUNCTION: sp_error()
4980Sstevel@tonic-gate * INPUT: ps - parent save structure for error-ed I/O.
4990Sstevel@tonic-gate * OUTPUT: none.
5000Sstevel@tonic-gate * RETURNS: void.
5010Sstevel@tonic-gate * PURPOSE: report a driver error.
5020Sstevel@tonic-gate * CALLING CONTEXT:
5030Sstevel@tonic-gate * Interrupt - non-blockable
5040Sstevel@tonic-gate */
5050Sstevel@tonic-gate static void
sp_error(md_spps_t * ps)5060Sstevel@tonic-gate sp_error(md_spps_t *ps)
5070Sstevel@tonic-gate {
5080Sstevel@tonic-gate set_t setno = MD_UN2SET(ps->ps_un);
5090Sstevel@tonic-gate
5100Sstevel@tonic-gate /*
5110Sstevel@tonic-gate * Drop the mutex associated with this request before (potentially)
5120Sstevel@tonic-gate * enqueuing the free onto a separate thread. We have to release the
5130Sstevel@tonic-gate * mutex before destroying the parent structure.
5140Sstevel@tonic-gate */
5150Sstevel@tonic-gate if (!(ps->ps_flags & MD_SPPS_DONTFREE)) {
5160Sstevel@tonic-gate if (MUTEX_HELD(&ps->ps_mx)) {
5170Sstevel@tonic-gate mutex_exit(&ps->ps_mx);
5180Sstevel@tonic-gate }
5190Sstevel@tonic-gate } else {
5200Sstevel@tonic-gate /*
5210Sstevel@tonic-gate * this should only ever happen if we are panicking,
5220Sstevel@tonic-gate * since DONTFREE is only set on the parent if panicstr
5230Sstevel@tonic-gate * is non-NULL.
5240Sstevel@tonic-gate */
5250Sstevel@tonic-gate ASSERT(panicstr);
5260Sstevel@tonic-gate }
5270Sstevel@tonic-gate
5280Sstevel@tonic-gate /*
5290Sstevel@tonic-gate * For a multi-owner set we need to send a message to the master so that
5300Sstevel@tonic-gate * all nodes get the errored status when we first encounter it. To avoid
5310Sstevel@tonic-gate * deadlocking when multiple soft-partitions encounter an error on one
5320Sstevel@tonic-gate * physical unit we drop the unit readerlock before enqueueing the
5330Sstevel@tonic-gate * request. That way we can service any messages that require a
5340Sstevel@tonic-gate * writerlock to be held. Additionally, to avoid deadlocking when at
5350Sstevel@tonic-gate * the bottom of a metadevice stack and a higher level mirror has
5360Sstevel@tonic-gate * multiple requests outstanding on this soft-part, we clone the ps
5370Sstevel@tonic-gate * that failed and pass the error back up the stack to release the
5380Sstevel@tonic-gate * reference that this i/o may have in the higher-level metadevice.
5390Sstevel@tonic-gate * The other nodes in the cluster just have to modify the soft-part
5400Sstevel@tonic-gate * status and we do not need to block the i/o completion for this.
5410Sstevel@tonic-gate */
5420Sstevel@tonic-gate if (MD_MNSET_SETNO(setno)) {
5430Sstevel@tonic-gate md_spps_t *err_ps;
5440Sstevel@tonic-gate err_ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
5450Sstevel@tonic-gate sp_parent_init(err_ps);
5460Sstevel@tonic-gate
5470Sstevel@tonic-gate err_ps->ps_un = ps->ps_un;
5480Sstevel@tonic-gate err_ps->ps_ui = ps->ps_ui;
5490Sstevel@tonic-gate
5500Sstevel@tonic-gate md_unit_readerexit(ps->ps_ui);
5510Sstevel@tonic-gate
5520Sstevel@tonic-gate daemon_request(&md_sp_daemon, sp_xmit_error,
5530Sstevel@tonic-gate (daemon_queue_t *)err_ps, REQ_OLD);
5540Sstevel@tonic-gate
5550Sstevel@tonic-gate sp_finish_error(ps, 0);
5560Sstevel@tonic-gate
5570Sstevel@tonic-gate return;
5580Sstevel@tonic-gate } else {
5590Sstevel@tonic-gate ps->ps_un->un_status = MD_SP_ERR;
5600Sstevel@tonic-gate }
5610Sstevel@tonic-gate
5620Sstevel@tonic-gate /* Flag the error */
5630Sstevel@tonic-gate sp_finish_error(ps, 1);
5640Sstevel@tonic-gate
5650Sstevel@tonic-gate }
5660Sstevel@tonic-gate
5670Sstevel@tonic-gate /*
5680Sstevel@tonic-gate * FUNCTION: sp_mapbuf()
5690Sstevel@tonic-gate * INPUT: un - unit structure for soft partition we are doing
5700Sstevel@tonic-gate * I/O on.
5710Sstevel@tonic-gate * voff - virtual offset in soft partition to map.
5720Sstevel@tonic-gate * bcount - # of blocks in the I/O.
5730Sstevel@tonic-gate * OUTPUT: bp - translated buffer to be passed down to next layer.
5740Sstevel@tonic-gate * RETURNS: 1 - request must be fragmented, more work to do,
5750Sstevel@tonic-gate * 0 - request satisified, no more work to do
5760Sstevel@tonic-gate * -1 - error
5770Sstevel@tonic-gate * PURPOSE: Map the the virtual offset in the soft partition (passed
5780Sstevel@tonic-gate * in via voff) to the "physical" offset on whatever the soft
5790Sstevel@tonic-gate * partition is built on top of. We do this by doing a binary
5800Sstevel@tonic-gate * search of the extent array in the soft partition unit
5810Sstevel@tonic-gate * structure. Once the current extent is found, we do the
5820Sstevel@tonic-gate * translation, determine if the I/O will cross extent
5830Sstevel@tonic-gate * boundaries (if so, we have to fragment the I/O), then
5840Sstevel@tonic-gate * fill in the buf structure to be passed down to the next layer.
5850Sstevel@tonic-gate */
5860Sstevel@tonic-gate static int
sp_mapbuf(mp_unit_t * un,sp_ext_offset_t voff,sp_ext_length_t bcount,buf_t * bp)5870Sstevel@tonic-gate sp_mapbuf(
5880Sstevel@tonic-gate mp_unit_t *un,
5890Sstevel@tonic-gate sp_ext_offset_t voff,
5900Sstevel@tonic-gate sp_ext_length_t bcount,
5910Sstevel@tonic-gate buf_t *bp
5920Sstevel@tonic-gate )
5930Sstevel@tonic-gate {
5940Sstevel@tonic-gate int lo, mid, hi, found, more;
5950Sstevel@tonic-gate size_t new_bcount;
5960Sstevel@tonic-gate sp_ext_offset_t new_blkno;
5970Sstevel@tonic-gate sp_ext_offset_t new_offset;
5980Sstevel@tonic-gate sp_ext_offset_t ext_endblk;
5990Sstevel@tonic-gate md_dev64_t new_edev;
6000Sstevel@tonic-gate extern unsigned md_maxphys;
6010Sstevel@tonic-gate
6020Sstevel@tonic-gate found = 0;
6030Sstevel@tonic-gate lo = 0;
6040Sstevel@tonic-gate hi = un->un_numexts - 1;
6050Sstevel@tonic-gate
6060Sstevel@tonic-gate /*
6070Sstevel@tonic-gate * do a binary search to find the extent that contains the
6080Sstevel@tonic-gate * starting offset. after this loop, mid contains the index
6090Sstevel@tonic-gate * of the correct extent.
6100Sstevel@tonic-gate */
6110Sstevel@tonic-gate while (lo <= hi && !found) {
6120Sstevel@tonic-gate mid = (lo + hi) / 2;
6130Sstevel@tonic-gate /* is the starting offset contained within the mid-ext? */
6140Sstevel@tonic-gate if (voff >= un->un_ext[mid].un_voff &&
6150Sstevel@tonic-gate voff < un->un_ext[mid].un_voff + un->un_ext[mid].un_len)
6160Sstevel@tonic-gate found = 1;
6170Sstevel@tonic-gate else if (voff < un->un_ext[mid].un_voff)
6180Sstevel@tonic-gate hi = mid - 1;
6190Sstevel@tonic-gate else /* voff > un->un_ext[mid].un_voff + un->un_ext[mid].len */
6200Sstevel@tonic-gate lo = mid + 1;
6210Sstevel@tonic-gate }
6220Sstevel@tonic-gate
6230Sstevel@tonic-gate if (!found) {
6240Sstevel@tonic-gate cmn_err(CE_WARN, "sp_mapbuf: invalid offset %llu.\n", voff);
6250Sstevel@tonic-gate return (-1);
6260Sstevel@tonic-gate }
6270Sstevel@tonic-gate
6280Sstevel@tonic-gate /* translate to underlying physical offset/device */
6290Sstevel@tonic-gate new_offset = voff - un->un_ext[mid].un_voff;
6300Sstevel@tonic-gate new_blkno = un->un_ext[mid].un_poff + new_offset;
6310Sstevel@tonic-gate new_edev = un->un_dev;
6320Sstevel@tonic-gate
6330Sstevel@tonic-gate /* determine if we need to break the I/O into fragments */
6340Sstevel@tonic-gate ext_endblk = un->un_ext[mid].un_voff + un->un_ext[mid].un_len;
6350Sstevel@tonic-gate if (voff + btodb(bcount) > ext_endblk) {
6360Sstevel@tonic-gate new_bcount = dbtob(ext_endblk - voff);
6370Sstevel@tonic-gate more = 1;
6380Sstevel@tonic-gate } else {
6390Sstevel@tonic-gate new_bcount = bcount;
6400Sstevel@tonic-gate more = 0;
6410Sstevel@tonic-gate }
6420Sstevel@tonic-gate
6430Sstevel@tonic-gate /* only break up the I/O if we're not built on another metadevice */
6440Sstevel@tonic-gate if ((md_getmajor(new_edev) != md_major) && (new_bcount > md_maxphys)) {
6450Sstevel@tonic-gate new_bcount = md_maxphys;
6460Sstevel@tonic-gate more = 1;
6470Sstevel@tonic-gate }
6480Sstevel@tonic-gate if (bp != (buf_t *)NULL) {
6490Sstevel@tonic-gate /* do bp updates */
6500Sstevel@tonic-gate bp->b_bcount = new_bcount;
6510Sstevel@tonic-gate bp->b_lblkno = new_blkno;
6520Sstevel@tonic-gate bp->b_edev = md_dev64_to_dev(new_edev);
6530Sstevel@tonic-gate }
6540Sstevel@tonic-gate return (more);
6550Sstevel@tonic-gate }
6560Sstevel@tonic-gate
6570Sstevel@tonic-gate /*
6580Sstevel@tonic-gate * FUNCTION: sp_validate()
6590Sstevel@tonic-gate * INPUT: un - unit structure to be validated.
6600Sstevel@tonic-gate * OUTPUT: none.
6610Sstevel@tonic-gate * RETURNS: 0 - soft partition ok.
6620Sstevel@tonic-gate * -1 - error.
6630Sstevel@tonic-gate * PURPOSE: called on open to sanity check the soft partition. In
6640Sstevel@tonic-gate * order to open a soft partition:
6650Sstevel@tonic-gate * - it must have at least one extent
6660Sstevel@tonic-gate * - the extent info in core and on disk must match
6670Sstevel@tonic-gate * - it may not be in an intermediate state (which would
6680Sstevel@tonic-gate * imply that a two-phase commit was interrupted)
6690Sstevel@tonic-gate *
6700Sstevel@tonic-gate * If the extent checking fails (B_ERROR returned from the read
6710Sstevel@tonic-gate * strategy call) _and_ we're a multi-owner diskset, we send a
6720Sstevel@tonic-gate * message to the master so that all nodes inherit the same view
6730Sstevel@tonic-gate * of the soft partition.
6740Sstevel@tonic-gate * If we are checking a soft-part that is marked as in error, and
6750Sstevel@tonic-gate * we can actually read and validate the watermarks we send a
6760Sstevel@tonic-gate * message to clear the error to the master node.
6770Sstevel@tonic-gate */
6780Sstevel@tonic-gate static int
sp_validate(mp_unit_t * un)6790Sstevel@tonic-gate sp_validate(mp_unit_t *un)
6800Sstevel@tonic-gate {
6810Sstevel@tonic-gate uint_t ext;
6820Sstevel@tonic-gate struct buf *buf;
6830Sstevel@tonic-gate sp_ext_length_t len;
6840Sstevel@tonic-gate mp_watermark_t *wm;
6850Sstevel@tonic-gate set_t setno;
6860Sstevel@tonic-gate int reset_error = 0;
6870Sstevel@tonic-gate
6880Sstevel@tonic-gate setno = MD_UN2SET(un);
6890Sstevel@tonic-gate
6900Sstevel@tonic-gate /* sanity check unit structure components ?? */
6910Sstevel@tonic-gate if (un->un_status != MD_SP_OK) {
6920Sstevel@tonic-gate if (un->un_status != MD_SP_ERR) {
6930Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: open failed, soft partition "
6940Sstevel@tonic-gate "status is %u.",
6950Sstevel@tonic-gate md_shortname(MD_SID(un)),
6960Sstevel@tonic-gate un->un_status);
6970Sstevel@tonic-gate return (-1);
6980Sstevel@tonic-gate } else {
6990Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: open of soft partition "
7000Sstevel@tonic-gate "in Errored state.",
7010Sstevel@tonic-gate md_shortname(MD_SID(un)));
7020Sstevel@tonic-gate reset_error = 1;
7030Sstevel@tonic-gate }
7040Sstevel@tonic-gate }
7050Sstevel@tonic-gate
7060Sstevel@tonic-gate if (un->un_numexts == 0) {
7070Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: open failed, soft partition does "
7080Sstevel@tonic-gate "not have any extents.", md_shortname(MD_SID(un)));
7090Sstevel@tonic-gate return (-1);
7100Sstevel@tonic-gate }
7110Sstevel@tonic-gate
7120Sstevel@tonic-gate len = 0LL;
7130Sstevel@tonic-gate for (ext = 0; ext < un->un_numexts; ext++) {
7140Sstevel@tonic-gate
7150Sstevel@tonic-gate /* tally extent lengths to check total size */
7160Sstevel@tonic-gate len += un->un_ext[ext].un_len;
7170Sstevel@tonic-gate
7180Sstevel@tonic-gate /* allocate buffer for watermark */
7190Sstevel@tonic-gate buf = getrbuf(KM_SLEEP);
7200Sstevel@tonic-gate
7210Sstevel@tonic-gate /* read watermark */
7220Sstevel@tonic-gate buf->b_flags = B_READ;
7230Sstevel@tonic-gate buf->b_edev = md_dev64_to_dev(un->un_dev);
7240Sstevel@tonic-gate buf->b_iodone = NULL;
7250Sstevel@tonic-gate buf->b_proc = NULL;
7260Sstevel@tonic-gate buf->b_bcount = sizeof (mp_watermark_t);
7270Sstevel@tonic-gate buf->b_lblkno = un->un_ext[ext].un_poff - 1;
7280Sstevel@tonic-gate buf->b_bufsize = sizeof (mp_watermark_t);
7290Sstevel@tonic-gate buf->b_un.b_addr = kmem_alloc(sizeof (mp_watermark_t),
7300Sstevel@tonic-gate KM_SLEEP);
7310Sstevel@tonic-gate
7320Sstevel@tonic-gate /*
7330Sstevel@tonic-gate * make the call non-blocking so that it is not affected
7340Sstevel@tonic-gate * by a set take.
7350Sstevel@tonic-gate */
7360Sstevel@tonic-gate md_call_strategy(buf, MD_STR_MAPPED|MD_NOBLOCK, NULL);
7370Sstevel@tonic-gate (void) biowait(buf);
7380Sstevel@tonic-gate
7390Sstevel@tonic-gate if (buf->b_flags & B_ERROR) {
7400Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: open failed, could not "
7410Sstevel@tonic-gate "read watermark at block %llu for extent %u, "
7420Sstevel@tonic-gate "error %d.", md_shortname(MD_SID(un)),
7430Sstevel@tonic-gate buf->b_lblkno, ext, buf->b_error);
7440Sstevel@tonic-gate kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
7450Sstevel@tonic-gate freerbuf(buf);
7460Sstevel@tonic-gate
7470Sstevel@tonic-gate /*
7480Sstevel@tonic-gate * If we're a multi-owner diskset we send a message
7490Sstevel@tonic-gate * indicating that this soft-part has an invalid
7500Sstevel@tonic-gate * extent to the master node. This ensures a consistent
7510Sstevel@tonic-gate * view of the soft-part across the cluster.
7520Sstevel@tonic-gate */
7530Sstevel@tonic-gate if (MD_MNSET_SETNO(setno)) {
7540Sstevel@tonic-gate sp_send_stat_err(un);
7550Sstevel@tonic-gate }
7560Sstevel@tonic-gate return (-1);
7570Sstevel@tonic-gate }
7580Sstevel@tonic-gate
7590Sstevel@tonic-gate wm = (mp_watermark_t *)buf->b_un.b_addr;
7600Sstevel@tonic-gate
7610Sstevel@tonic-gate /* make sure the checksum is correct first */
7620Sstevel@tonic-gate if (crcchk((uchar_t *)wm, (uint_t *)&wm->wm_checksum,
7630Sstevel@tonic-gate (uint_t)sizeof (mp_watermark_t), (uchar_t *)NULL)) {
7640Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: open failed, watermark "
7650Sstevel@tonic-gate "at block %llu for extent %u does not have a "
7660Sstevel@tonic-gate "valid checksum 0x%08x.", md_shortname(MD_SID(un)),
7670Sstevel@tonic-gate buf->b_lblkno, ext, wm->wm_checksum);
7680Sstevel@tonic-gate kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
7690Sstevel@tonic-gate freerbuf(buf);
7700Sstevel@tonic-gate return (-1);
7710Sstevel@tonic-gate }
7720Sstevel@tonic-gate
7730Sstevel@tonic-gate if (wm->wm_magic != MD_SP_MAGIC) {
7740Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: open failed, watermark "
7750Sstevel@tonic-gate "at block %llu for extent %u does not have a "
7760Sstevel@tonic-gate "valid watermark magic number, expected 0x%x, "
7770Sstevel@tonic-gate "found 0x%x.", md_shortname(MD_SID(un)),
7780Sstevel@tonic-gate buf->b_lblkno, ext, MD_SP_MAGIC, wm->wm_magic);
7790Sstevel@tonic-gate kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
7800Sstevel@tonic-gate freerbuf(buf);
7810Sstevel@tonic-gate return (-1);
7820Sstevel@tonic-gate }
7830Sstevel@tonic-gate
7840Sstevel@tonic-gate /* make sure sequence number matches the current extent */
7850Sstevel@tonic-gate if (wm->wm_seq != ext) {
7860Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: open failed, watermark "
7870Sstevel@tonic-gate "at block %llu for extent %u has invalid "
7880Sstevel@tonic-gate "sequence number %u.", md_shortname(MD_SID(un)),
7890Sstevel@tonic-gate buf->b_lblkno, ext, wm->wm_seq);
7900Sstevel@tonic-gate kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
7910Sstevel@tonic-gate freerbuf(buf);
7920Sstevel@tonic-gate return (-1);
7930Sstevel@tonic-gate }
7940Sstevel@tonic-gate
7950Sstevel@tonic-gate /* make sure watermark length matches unit structure */
7960Sstevel@tonic-gate if (wm->wm_length != un->un_ext[ext].un_len) {
7970Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: open failed, watermark "
7980Sstevel@tonic-gate "at block %llu for extent %u has inconsistent "
7990Sstevel@tonic-gate "length, expected %llu, found %llu.",
8000Sstevel@tonic-gate md_shortname(MD_SID(un)), buf->b_lblkno,
8010Sstevel@tonic-gate ext, un->un_ext[ext].un_len,
8020Sstevel@tonic-gate (u_longlong_t)wm->wm_length);
8030Sstevel@tonic-gate kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
8040Sstevel@tonic-gate freerbuf(buf);
8050Sstevel@tonic-gate return (-1);
8060Sstevel@tonic-gate }
8070Sstevel@tonic-gate
8080Sstevel@tonic-gate /*
8090Sstevel@tonic-gate * make sure the type is a valid soft partition and not
8100Sstevel@tonic-gate * a free extent or the end.
8110Sstevel@tonic-gate */
8120Sstevel@tonic-gate if (wm->wm_type != EXTTYP_ALLOC) {
8130Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: open failed, watermark "
8140Sstevel@tonic-gate "at block %llu for extent %u is not marked "
8150Sstevel@tonic-gate "as in-use, type = %u.", md_shortname(MD_SID(un)),
8160Sstevel@tonic-gate buf->b_lblkno, ext, wm->wm_type);
8170Sstevel@tonic-gate kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
8180Sstevel@tonic-gate freerbuf(buf);
8190Sstevel@tonic-gate return (-1);
8200Sstevel@tonic-gate }
8210Sstevel@tonic-gate /* free up buffer */
8220Sstevel@tonic-gate kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
8230Sstevel@tonic-gate freerbuf(buf);
8240Sstevel@tonic-gate }
8250Sstevel@tonic-gate
8260Sstevel@tonic-gate if (len != un->un_length) {
8270Sstevel@tonic-gate cmn_err(CE_WARN, "md: %s: open failed, computed length "
8280Sstevel@tonic-gate "%llu != expected length %llu.", md_shortname(MD_SID(un)),
8290Sstevel@tonic-gate len, un->un_length);
8300Sstevel@tonic-gate return (-1);
8310Sstevel@tonic-gate }
8320Sstevel@tonic-gate
8330Sstevel@tonic-gate /*
8340Sstevel@tonic-gate * If we're a multi-owner set _and_ reset_error is set, we should clear
8350Sstevel@tonic-gate * the error condition on all nodes in the set. Use SP_SETSTAT2 with
8360Sstevel@tonic-gate * MD_SP_OK.
8370Sstevel@tonic-gate */
8380Sstevel@tonic-gate if (MD_MNSET_SETNO(setno) && reset_error) {
8390Sstevel@tonic-gate sp_send_stat_ok(un);
8400Sstevel@tonic-gate }
8410Sstevel@tonic-gate return (0);
8420Sstevel@tonic-gate }
8430Sstevel@tonic-gate
8440Sstevel@tonic-gate /*
8450Sstevel@tonic-gate * FUNCTION: sp_done()
8460Sstevel@tonic-gate * INPUT: child_buf - buffer attached to child save structure.
8470Sstevel@tonic-gate * this is the buffer on which I/O has just
8480Sstevel@tonic-gate * completed.
8490Sstevel@tonic-gate * OUTPUT: none.
8500Sstevel@tonic-gate * RETURNS: 0 - success.
8510Sstevel@tonic-gate * 1 - error.
8520Sstevel@tonic-gate * PURPOSE: called on I/O completion.
8530Sstevel@tonic-gate */
8540Sstevel@tonic-gate static int
sp_done(struct buf * child_buf)8550Sstevel@tonic-gate sp_done(struct buf *child_buf)
8560Sstevel@tonic-gate {
8570Sstevel@tonic-gate struct buf *parent_buf;
8580Sstevel@tonic-gate mdi_unit_t *ui;
8590Sstevel@tonic-gate md_spps_t *ps;
8600Sstevel@tonic-gate md_spcs_t *cs;
8610Sstevel@tonic-gate
8620Sstevel@tonic-gate /* find the child save structure to which this buffer belongs */
8630Sstevel@tonic-gate cs = (md_spcs_t *)((caddr_t)child_buf -
8640Sstevel@tonic-gate (sizeof (md_spcs_t) - sizeof (buf_t)));
8650Sstevel@tonic-gate /* now get the parent save structure */
8660Sstevel@tonic-gate ps = cs->cs_ps;
8670Sstevel@tonic-gate parent_buf = ps->ps_bp;
8680Sstevel@tonic-gate
8690Sstevel@tonic-gate mutex_enter(&ps->ps_mx);
8700Sstevel@tonic-gate /* pass any errors back up to the parent */
8710Sstevel@tonic-gate if (child_buf->b_flags & B_ERROR) {
8720Sstevel@tonic-gate ps->ps_flags |= MD_SPPS_ERROR;
8730Sstevel@tonic-gate parent_buf->b_error = child_buf->b_error;
8740Sstevel@tonic-gate }
8750Sstevel@tonic-gate /* mapout, if needed */
8760Sstevel@tonic-gate if (child_buf->b_flags & B_REMAPPED)
8770Sstevel@tonic-gate bp_mapout(child_buf);
8780Sstevel@tonic-gate
8790Sstevel@tonic-gate ps->ps_frags--;
8800Sstevel@tonic-gate if (ps->ps_frags != 0) {
8810Sstevel@tonic-gate /*
8820Sstevel@tonic-gate * if this parent has more children, we just free the
8830Sstevel@tonic-gate * child and return.
8840Sstevel@tonic-gate */
8850Sstevel@tonic-gate kmem_cache_free(sp_child_cache, cs);
8860Sstevel@tonic-gate mutex_exit(&ps->ps_mx);
8870Sstevel@tonic-gate return (1);
8880Sstevel@tonic-gate }
8890Sstevel@tonic-gate /* there are no more children */
8900Sstevel@tonic-gate kmem_cache_free(sp_child_cache, cs);
8910Sstevel@tonic-gate if (ps->ps_flags & MD_SPPS_ERROR) {
8920Sstevel@tonic-gate sp_error(ps);
8930Sstevel@tonic-gate return (1);
8940Sstevel@tonic-gate }
8950Sstevel@tonic-gate ui = ps->ps_ui;
8960Sstevel@tonic-gate if (!(ps->ps_flags & MD_SPPS_DONTFREE)) {
8970Sstevel@tonic-gate mutex_exit(&ps->ps_mx);
8980Sstevel@tonic-gate } else {
8990Sstevel@tonic-gate /*
9000Sstevel@tonic-gate * this should only ever happen if we are panicking,
9010Sstevel@tonic-gate * since DONTFREE is only set on the parent if panicstr
9020Sstevel@tonic-gate * is non-NULL.
9030Sstevel@tonic-gate */
9040Sstevel@tonic-gate ASSERT(panicstr);
9050Sstevel@tonic-gate }
9060Sstevel@tonic-gate SPPS_FREE(sp_parent_cache, ps);
9070Sstevel@tonic-gate md_kstat_done(ui, parent_buf, 0);
9080Sstevel@tonic-gate md_unit_readerexit(ui);
9090Sstevel@tonic-gate md_biodone(parent_buf);
9100Sstevel@tonic-gate return (0);
9110Sstevel@tonic-gate }
9120Sstevel@tonic-gate
9130Sstevel@tonic-gate /*
9140Sstevel@tonic-gate * FUNCTION: md_sp_strategy()
9150Sstevel@tonic-gate * INPUT: parent_buf - parent buffer
9160Sstevel@tonic-gate * flag - flags
9170Sstevel@tonic-gate * private - private data
9180Sstevel@tonic-gate * OUTPUT: none.
9190Sstevel@tonic-gate * RETURNS: void.
9200Sstevel@tonic-gate * PURPOSE: Soft partitioning I/O strategy. Performs the main work
9210Sstevel@tonic-gate * needed to do I/O to a soft partition. The basic
9220Sstevel@tonic-gate * algorithm is as follows:
9230Sstevel@tonic-gate * - Allocate a child save structure to keep track
9240Sstevel@tonic-gate * of the I/O we are going to pass down.
9250Sstevel@tonic-gate * - Map the I/O to the correct extent in the soft
9260Sstevel@tonic-gate * partition (see sp_mapbuf()).
9270Sstevel@tonic-gate * - bioclone() the buffer and pass it down the
9280Sstevel@tonic-gate * stack using md_call_strategy.
9290Sstevel@tonic-gate * - If the I/O needs to split across extents,
9300Sstevel@tonic-gate * repeat the above steps until all fragments
9310Sstevel@tonic-gate * are finished.
9320Sstevel@tonic-gate */
9330Sstevel@tonic-gate static void
md_sp_strategy(buf_t * parent_buf,int flag,void * private)9340Sstevel@tonic-gate md_sp_strategy(buf_t *parent_buf, int flag, void *private)
9350Sstevel@tonic-gate {
9360Sstevel@tonic-gate md_spps_t *ps;
9370Sstevel@tonic-gate md_spcs_t *cs;
9380Sstevel@tonic-gate int more;
9390Sstevel@tonic-gate mp_unit_t *un;
9400Sstevel@tonic-gate mdi_unit_t *ui;
9410Sstevel@tonic-gate size_t current_count;
9420Sstevel@tonic-gate off_t current_offset;
9430Sstevel@tonic-gate sp_ext_offset_t current_blkno;
9440Sstevel@tonic-gate buf_t *child_buf;
9450Sstevel@tonic-gate set_t setno = MD_MIN2SET(getminor(parent_buf->b_edev));
9460Sstevel@tonic-gate int strat_flag = flag;
9470Sstevel@tonic-gate
9480Sstevel@tonic-gate /*
9490Sstevel@tonic-gate * When doing IO to a multi owner meta device, check if set is halted.
9500Sstevel@tonic-gate * We do this check without the needed lock held, for performance
9510Sstevel@tonic-gate * reasons.
9520Sstevel@tonic-gate * If an IO just slips through while the set is locked via an
9530Sstevel@tonic-gate * MD_MN_SUSPEND_SET, we don't care about it.
9540Sstevel@tonic-gate * Only check for suspension if we are a top-level i/o request
9550Sstevel@tonic-gate * (MD_STR_NOTTOP is cleared in 'flag');
9560Sstevel@tonic-gate */
9570Sstevel@tonic-gate if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) ==
9580Sstevel@tonic-gate (MD_SET_HALTED | MD_SET_MNSET)) {
9590Sstevel@tonic-gate if ((flag & MD_STR_NOTTOP) == 0) {
9600Sstevel@tonic-gate mutex_enter(&md_mx);
9610Sstevel@tonic-gate /* Here we loop until the set is no longer halted */
9620Sstevel@tonic-gate while (md_set[setno].s_status & MD_SET_HALTED) {
9630Sstevel@tonic-gate cv_wait(&md_cv, &md_mx);
9640Sstevel@tonic-gate }
9650Sstevel@tonic-gate mutex_exit(&md_mx);
9660Sstevel@tonic-gate }
9670Sstevel@tonic-gate }
9680Sstevel@tonic-gate
9690Sstevel@tonic-gate ui = MDI_UNIT(getminor(parent_buf->b_edev));
9700Sstevel@tonic-gate
9710Sstevel@tonic-gate md_kstat_waitq_enter(ui);
9720Sstevel@tonic-gate
9730Sstevel@tonic-gate un = (mp_unit_t *)md_unit_readerlock(ui);
9740Sstevel@tonic-gate
9750Sstevel@tonic-gate if ((flag & MD_NOBLOCK) == 0) {
9760Sstevel@tonic-gate if (md_inc_iocount(setno) != 0) {
9770Sstevel@tonic-gate parent_buf->b_flags |= B_ERROR;
9780Sstevel@tonic-gate parent_buf->b_error = ENXIO;
9790Sstevel@tonic-gate parent_buf->b_resid = parent_buf->b_bcount;
9802150Sjeanm md_kstat_waitq_exit(ui);
9810Sstevel@tonic-gate md_unit_readerexit(ui);
9820Sstevel@tonic-gate biodone(parent_buf);
9830Sstevel@tonic-gate return;
9840Sstevel@tonic-gate }
9850Sstevel@tonic-gate } else {
9860Sstevel@tonic-gate md_inc_iocount_noblock(setno);
9870Sstevel@tonic-gate }
9880Sstevel@tonic-gate
9890Sstevel@tonic-gate if (!(flag & MD_STR_NOTTOP)) {
9900Sstevel@tonic-gate if (md_checkbuf(ui, (md_unit_t *)un, parent_buf) != 0) {
9910Sstevel@tonic-gate md_kstat_waitq_exit(ui);
9920Sstevel@tonic-gate return;
9930Sstevel@tonic-gate }
9940Sstevel@tonic-gate }
9950Sstevel@tonic-gate
9960Sstevel@tonic-gate ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
9970Sstevel@tonic-gate sp_parent_init(ps);
9980Sstevel@tonic-gate
9990Sstevel@tonic-gate /*
10000Sstevel@tonic-gate * Save essential information from the original buffhdr
10010Sstevel@tonic-gate * in the parent.
10020Sstevel@tonic-gate */
10030Sstevel@tonic-gate ps->ps_un = un;
10040Sstevel@tonic-gate ps->ps_ui = ui;
10050Sstevel@tonic-gate ps->ps_bp = parent_buf;
10060Sstevel@tonic-gate ps->ps_addr = parent_buf->b_un.b_addr;
10070Sstevel@tonic-gate
10080Sstevel@tonic-gate current_count = parent_buf->b_bcount;
10090Sstevel@tonic-gate current_blkno = (sp_ext_offset_t)parent_buf->b_blkno;
10100Sstevel@tonic-gate current_offset = 0;
10110Sstevel@tonic-gate
10120Sstevel@tonic-gate /*
10130Sstevel@tonic-gate * if we are at the top and we are panicking,
10140Sstevel@tonic-gate * we don't free in order to save state.
10150Sstevel@tonic-gate */
10160Sstevel@tonic-gate if (!(flag & MD_STR_NOTTOP) && (panicstr != NULL))
10170Sstevel@tonic-gate ps->ps_flags |= MD_SPPS_DONTFREE;
10180Sstevel@tonic-gate
10190Sstevel@tonic-gate md_kstat_waitq_to_runq(ui);
10200Sstevel@tonic-gate
10210Sstevel@tonic-gate ps->ps_frags++;
10220Sstevel@tonic-gate
10230Sstevel@tonic-gate /*
10240Sstevel@tonic-gate * Mark this i/o as MD_STR_ABR if we've had ABR enabled on this
10250Sstevel@tonic-gate * metadevice.
10260Sstevel@tonic-gate */
10270Sstevel@tonic-gate if (ui->ui_tstate & MD_ABR_CAP)
10280Sstevel@tonic-gate strat_flag |= MD_STR_ABR;
10290Sstevel@tonic-gate
10300Sstevel@tonic-gate /*
10310Sstevel@tonic-gate * this loop does the main work of an I/O. we allocate a
10320Sstevel@tonic-gate * a child save for each buf, do the logical to physical
10330Sstevel@tonic-gate * mapping, decide if we need to frag the I/O, clone the
10340Sstevel@tonic-gate * new I/O to pass down the stack. repeat until we've
10350Sstevel@tonic-gate * taken care of the entire buf that was passed to us.
10360Sstevel@tonic-gate */
10370Sstevel@tonic-gate do {
10380Sstevel@tonic-gate cs = kmem_cache_alloc(sp_child_cache, MD_ALLOCFLAGS);
10390Sstevel@tonic-gate sp_child_init(cs);
10400Sstevel@tonic-gate child_buf = &cs->cs_buf;
10410Sstevel@tonic-gate cs->cs_ps = ps;
10420Sstevel@tonic-gate
10430Sstevel@tonic-gate more = sp_mapbuf(un, current_blkno, current_count, child_buf);
10440Sstevel@tonic-gate if (more == -1) {
10450Sstevel@tonic-gate parent_buf->b_flags |= B_ERROR;
10460Sstevel@tonic-gate parent_buf->b_error = EIO;
10470Sstevel@tonic-gate md_kstat_done(ui, parent_buf, 0);
10480Sstevel@tonic-gate md_unit_readerexit(ui);
10490Sstevel@tonic-gate md_biodone(parent_buf);
10500Sstevel@tonic-gate kmem_cache_free(sp_parent_cache, ps);
10510Sstevel@tonic-gate return;
10520Sstevel@tonic-gate }
10530Sstevel@tonic-gate
10540Sstevel@tonic-gate child_buf = md_bioclone(parent_buf, current_offset,
10557627SChris.Horne@Sun.COM child_buf->b_bcount, child_buf->b_edev,
10567627SChris.Horne@Sun.COM child_buf->b_blkno, sp_done, child_buf,
10577627SChris.Horne@Sun.COM KM_NOSLEEP);
10580Sstevel@tonic-gate /* calculate new offset, counts, etc... */
10590Sstevel@tonic-gate current_offset += child_buf->b_bcount;
10600Sstevel@tonic-gate current_count -= child_buf->b_bcount;
10610Sstevel@tonic-gate current_blkno += (sp_ext_offset_t)(btodb(child_buf->b_bcount));
10620Sstevel@tonic-gate
10630Sstevel@tonic-gate if (more) {
10640Sstevel@tonic-gate mutex_enter(&ps->ps_mx);
10650Sstevel@tonic-gate ps->ps_frags++;
10660Sstevel@tonic-gate mutex_exit(&ps->ps_mx);
10670Sstevel@tonic-gate }
10680Sstevel@tonic-gate
10690Sstevel@tonic-gate md_call_strategy(child_buf, strat_flag, private);
10700Sstevel@tonic-gate } while (more);
10710Sstevel@tonic-gate
10720Sstevel@tonic-gate if (!(flag & MD_STR_NOTTOP) && (panicstr != NULL)) {
10730Sstevel@tonic-gate while (!(ps->ps_flags & MD_SPPS_DONE)) {
10740Sstevel@tonic-gate md_daemon(1, &md_done_daemon);
10750Sstevel@tonic-gate }
10760Sstevel@tonic-gate kmem_cache_free(sp_parent_cache, ps);
10770Sstevel@tonic-gate }
10780Sstevel@tonic-gate }
10790Sstevel@tonic-gate
10800Sstevel@tonic-gate /*
10810Sstevel@tonic-gate * FUNCTION: sp_directed_read()
10820Sstevel@tonic-gate * INPUT: mnum - minor number
10830Sstevel@tonic-gate * vdr - vol_directed_rd_t from user
10840Sstevel@tonic-gate * mode - access mode for copying data out.
10850Sstevel@tonic-gate * OUTPUT: none.
10860Sstevel@tonic-gate * RETURNS: 0 - success
10870Sstevel@tonic-gate * Exxxxx - failure error-code
10880Sstevel@tonic-gate * PURPOSE: Construct the necessary sub-device i/o requests to perform the
10890Sstevel@tonic-gate * directed read as requested by the user. This is essentially the
10900Sstevel@tonic-gate * same as md_sp_strategy() with the exception being that the
10910Sstevel@tonic-gate * underlying 'md_call_strategy' is replaced with an ioctl call.
10920Sstevel@tonic-gate */
10930Sstevel@tonic-gate int
sp_directed_read(minor_t mnum,vol_directed_rd_t * vdr,int mode)10940Sstevel@tonic-gate sp_directed_read(minor_t mnum, vol_directed_rd_t *vdr, int mode)
10950Sstevel@tonic-gate {
10960Sstevel@tonic-gate md_spps_t *ps;
10970Sstevel@tonic-gate md_spcs_t *cs;
10980Sstevel@tonic-gate int more;
10990Sstevel@tonic-gate mp_unit_t *un;
11000Sstevel@tonic-gate mdi_unit_t *ui;
11010Sstevel@tonic-gate size_t current_count;
11020Sstevel@tonic-gate off_t current_offset;
11030Sstevel@tonic-gate sp_ext_offset_t current_blkno;
11040Sstevel@tonic-gate buf_t *child_buf, *parent_buf;
11050Sstevel@tonic-gate void *kbuffer;
11060Sstevel@tonic-gate vol_directed_rd_t cvdr;
11070Sstevel@tonic-gate caddr_t userbuf;
11080Sstevel@tonic-gate offset_t useroff;
11090Sstevel@tonic-gate int ret = 0;
11100Sstevel@tonic-gate
11110Sstevel@tonic-gate ui = MDI_UNIT(mnum);
11120Sstevel@tonic-gate
11130Sstevel@tonic-gate md_kstat_waitq_enter(ui);
11140Sstevel@tonic-gate
11150Sstevel@tonic-gate bzero(&cvdr, sizeof (cvdr));
11160Sstevel@tonic-gate
11170Sstevel@tonic-gate un = (mp_unit_t *)md_unit_readerlock(ui);
11180Sstevel@tonic-gate
11190Sstevel@tonic-gate /*
11200Sstevel@tonic-gate * Construct a parent_buf header which reflects the user-supplied
11210Sstevel@tonic-gate * request.
11220Sstevel@tonic-gate */
11230Sstevel@tonic-gate
11240Sstevel@tonic-gate kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP);
11250Sstevel@tonic-gate if (kbuffer == NULL) {
11260Sstevel@tonic-gate vdr->vdr_flags |= DKV_DMR_ERROR;
11272150Sjeanm md_kstat_waitq_exit(ui);
11280Sstevel@tonic-gate md_unit_readerexit(ui);
11290Sstevel@tonic-gate return (ENOMEM);
11300Sstevel@tonic-gate }
11310Sstevel@tonic-gate
11320Sstevel@tonic-gate parent_buf = getrbuf(KM_NOSLEEP);
11330Sstevel@tonic-gate if (parent_buf == NULL) {
11340Sstevel@tonic-gate vdr->vdr_flags |= DKV_DMR_ERROR;
11352150Sjeanm md_kstat_waitq_exit(ui);
11360Sstevel@tonic-gate md_unit_readerexit(ui);
11370Sstevel@tonic-gate kmem_free(kbuffer, vdr->vdr_nbytes);
11380Sstevel@tonic-gate return (ENOMEM);
11390Sstevel@tonic-gate }
11400Sstevel@tonic-gate parent_buf->b_un.b_addr = kbuffer;
11410Sstevel@tonic-gate parent_buf->b_flags = B_READ;
11420Sstevel@tonic-gate parent_buf->b_bcount = vdr->vdr_nbytes;
11430Sstevel@tonic-gate parent_buf->b_lblkno = lbtodb(vdr->vdr_offset);
11440Sstevel@tonic-gate parent_buf->b_edev = un->un_dev;
11450Sstevel@tonic-gate
11460Sstevel@tonic-gate
11470Sstevel@tonic-gate ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
11480Sstevel@tonic-gate sp_parent_init(ps);
11490Sstevel@tonic-gate
11500Sstevel@tonic-gate /*
11510Sstevel@tonic-gate * Save essential information from the original buffhdr
11520Sstevel@tonic-gate * in the parent.
11530Sstevel@tonic-gate */
11540Sstevel@tonic-gate ps->ps_un = un;
11550Sstevel@tonic-gate ps->ps_ui = ui;
11560Sstevel@tonic-gate ps->ps_bp = parent_buf;
11570Sstevel@tonic-gate ps->ps_addr = parent_buf->b_un.b_addr;
11580Sstevel@tonic-gate
11590Sstevel@tonic-gate current_count = parent_buf->b_bcount;
11600Sstevel@tonic-gate current_blkno = (sp_ext_offset_t)parent_buf->b_lblkno;
11610Sstevel@tonic-gate current_offset = 0;
11620Sstevel@tonic-gate
11632150Sjeanm md_kstat_waitq_to_runq(ui);
11642150Sjeanm
11650Sstevel@tonic-gate ps->ps_frags++;
11660Sstevel@tonic-gate vdr->vdr_bytesread = 0;
11670Sstevel@tonic-gate
11680Sstevel@tonic-gate /*
11690Sstevel@tonic-gate * this loop does the main work of an I/O. we allocate a
11700Sstevel@tonic-gate * a child save for each buf, do the logical to physical
11710Sstevel@tonic-gate * mapping, decide if we need to frag the I/O, clone the
11720Sstevel@tonic-gate * new I/O to pass down the stack. repeat until we've
11730Sstevel@tonic-gate * taken care of the entire buf that was passed to us.
11740Sstevel@tonic-gate */
11750Sstevel@tonic-gate do {
11760Sstevel@tonic-gate cs = kmem_cache_alloc(sp_child_cache, MD_ALLOCFLAGS);
11770Sstevel@tonic-gate sp_child_init(cs);
11780Sstevel@tonic-gate child_buf = &cs->cs_buf;
11790Sstevel@tonic-gate cs->cs_ps = ps;
11800Sstevel@tonic-gate
11810Sstevel@tonic-gate more = sp_mapbuf(un, current_blkno, current_count, child_buf);
11820Sstevel@tonic-gate if (more == -1) {
11830Sstevel@tonic-gate ret = EIO;
11840Sstevel@tonic-gate vdr->vdr_flags |= DKV_DMR_SHORT;
11850Sstevel@tonic-gate kmem_cache_free(sp_child_cache, cs);
11860Sstevel@tonic-gate goto err_out;
11870Sstevel@tonic-gate }
11880Sstevel@tonic-gate
11890Sstevel@tonic-gate cvdr.vdr_flags = vdr->vdr_flags;
11900Sstevel@tonic-gate cvdr.vdr_side = vdr->vdr_side;
11910Sstevel@tonic-gate cvdr.vdr_nbytes = child_buf->b_bcount;
11920Sstevel@tonic-gate cvdr.vdr_offset = ldbtob(child_buf->b_lblkno);
11930Sstevel@tonic-gate /* Work out where we are in the allocated buffer */
119462Sjeanm useroff = (offset_t)(uintptr_t)kbuffer;
11950Sstevel@tonic-gate useroff = useroff + (offset_t)current_offset;
119662Sjeanm cvdr.vdr_data = (void *)(uintptr_t)useroff;
11970Sstevel@tonic-gate child_buf = md_bioclone(parent_buf, current_offset,
11987627SChris.Horne@Sun.COM child_buf->b_bcount, child_buf->b_edev,
11997627SChris.Horne@Sun.COM child_buf->b_blkno, NULL,
12007627SChris.Horne@Sun.COM child_buf, KM_NOSLEEP);
12010Sstevel@tonic-gate /* calculate new offset, counts, etc... */
12020Sstevel@tonic-gate current_offset += child_buf->b_bcount;
12030Sstevel@tonic-gate current_count -= child_buf->b_bcount;
12040Sstevel@tonic-gate current_blkno += (sp_ext_offset_t)(btodb(child_buf->b_bcount));
12050Sstevel@tonic-gate
12060Sstevel@tonic-gate if (more) {
12070Sstevel@tonic-gate mutex_enter(&ps->ps_mx);
12080Sstevel@tonic-gate ps->ps_frags++;
12090Sstevel@tonic-gate mutex_exit(&ps->ps_mx);
12100Sstevel@tonic-gate }
12110Sstevel@tonic-gate
12120Sstevel@tonic-gate ret = md_call_ioctl(child_buf->b_edev, DKIOCDMR, &cvdr,
12130Sstevel@tonic-gate (mode | FKIOCTL), NULL);
12140Sstevel@tonic-gate
12150Sstevel@tonic-gate /*
12160Sstevel@tonic-gate * Free the child structure as we've finished with it.
12170Sstevel@tonic-gate * Normally this would be done by sp_done() but we're just
12180Sstevel@tonic-gate * using md_bioclone() to segment the transfer and we never
12190Sstevel@tonic-gate * issue a strategy request so the iodone will not be called.
12200Sstevel@tonic-gate */
12210Sstevel@tonic-gate kmem_cache_free(sp_child_cache, cs);
12220Sstevel@tonic-gate if (ret == 0) {
12230Sstevel@tonic-gate /* copyout the returned data to vdr_data + offset */
12240Sstevel@tonic-gate userbuf = (caddr_t)kbuffer;
12250Sstevel@tonic-gate userbuf += (caddr_t)(cvdr.vdr_data) - (caddr_t)kbuffer;
12260Sstevel@tonic-gate if (ddi_copyout(userbuf, vdr->vdr_data,
12270Sstevel@tonic-gate cvdr.vdr_bytesread, mode)) {
12280Sstevel@tonic-gate ret = EFAULT;
12290Sstevel@tonic-gate goto err_out;
12300Sstevel@tonic-gate }
12310Sstevel@tonic-gate vdr->vdr_bytesread += cvdr.vdr_bytesread;
12320Sstevel@tonic-gate } else {
12330Sstevel@tonic-gate goto err_out;
12340Sstevel@tonic-gate }
12350Sstevel@tonic-gate } while (more);
12360Sstevel@tonic-gate
12370Sstevel@tonic-gate /*
12380Sstevel@tonic-gate * Update the user-supplied vol_directed_rd_t structure with the
12390Sstevel@tonic-gate * contents of the last issued child request.
12400Sstevel@tonic-gate */
12410Sstevel@tonic-gate vdr->vdr_flags = cvdr.vdr_flags;
12420Sstevel@tonic-gate vdr->vdr_side = cvdr.vdr_side;
12430Sstevel@tonic-gate bcopy(cvdr.vdr_side_name, vdr->vdr_side_name, VOL_SIDENAME);
12440Sstevel@tonic-gate
12450Sstevel@tonic-gate err_out:
12460Sstevel@tonic-gate if (ret != 0) {
12470Sstevel@tonic-gate vdr->vdr_flags |= DKV_DMR_ERROR;
12480Sstevel@tonic-gate }
12490Sstevel@tonic-gate if (vdr->vdr_bytesread != vdr->vdr_nbytes) {
12500Sstevel@tonic-gate vdr->vdr_flags |= DKV_DMR_SHORT;
12510Sstevel@tonic-gate }
12520Sstevel@tonic-gate kmem_cache_free(sp_parent_cache, ps);
12530Sstevel@tonic-gate kmem_free(kbuffer, vdr->vdr_nbytes);
12540Sstevel@tonic-gate freerbuf(parent_buf);
12550Sstevel@tonic-gate md_unit_readerexit(ui);
12560Sstevel@tonic-gate return (ret);
12570Sstevel@tonic-gate }
12580Sstevel@tonic-gate
12590Sstevel@tonic-gate /*
12600Sstevel@tonic-gate * FUNCTION: sp_snarf()
12610Sstevel@tonic-gate * INPUT: cmd - snarf cmd.
12620Sstevel@tonic-gate * setno - set number.
12630Sstevel@tonic-gate * OUTPUT: none.
12640Sstevel@tonic-gate * RETURNS: 1 - soft partitions were snarfed.
12650Sstevel@tonic-gate * 0 - no soft partitions were snarfed.
12660Sstevel@tonic-gate * PURPOSE: Snarf soft partition metadb records into their in-core
12670Sstevel@tonic-gate * structures. This routine is called at "snarf time" when
12680Sstevel@tonic-gate * md loads and gets all metadevices records into memory.
12690Sstevel@tonic-gate * The basic algorithm is simply to walk the soft partition
12700Sstevel@tonic-gate * records in the metadb and call the soft partitioning
12710Sstevel@tonic-gate * build_incore routine to set up the in-core structures.
12720Sstevel@tonic-gate */
12730Sstevel@tonic-gate static int
sp_snarf(md_snarfcmd_t cmd,set_t setno)12740Sstevel@tonic-gate sp_snarf(md_snarfcmd_t cmd, set_t setno)
12750Sstevel@tonic-gate {
12760Sstevel@tonic-gate mp_unit_t *un;
12770Sstevel@tonic-gate mddb_recid_t recid;
12780Sstevel@tonic-gate int gotsomething;
12790Sstevel@tonic-gate int all_sp_gotten;
12800Sstevel@tonic-gate mddb_type_t rec_type;
12810Sstevel@tonic-gate mddb_de_ic_t *dep;
12820Sstevel@tonic-gate mddb_rb32_t *rbp;
12830Sstevel@tonic-gate mp_unit_t *big_un;
12840Sstevel@tonic-gate mp_unit32_od_t *small_un;
12850Sstevel@tonic-gate size_t newreqsize;
12860Sstevel@tonic-gate
12870Sstevel@tonic-gate
12880Sstevel@tonic-gate if (cmd == MD_SNARF_CLEANUP)
12890Sstevel@tonic-gate return (0);
12900Sstevel@tonic-gate
12910Sstevel@tonic-gate all_sp_gotten = 1;
12920Sstevel@tonic-gate gotsomething = 0;
12930Sstevel@tonic-gate
12940Sstevel@tonic-gate /* get the record type */
12950Sstevel@tonic-gate rec_type = (mddb_type_t)md_getshared_key(setno,
12960Sstevel@tonic-gate sp_md_ops.md_driver.md_drivername);
12970Sstevel@tonic-gate recid = mddb_makerecid(setno, 0);
12980Sstevel@tonic-gate
12990Sstevel@tonic-gate /*
13000Sstevel@tonic-gate * walk soft partition records in the metadb and call
13010Sstevel@tonic-gate * sp_build_incore to build in-core structures.
13020Sstevel@tonic-gate */
13030Sstevel@tonic-gate while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0) {
13040Sstevel@tonic-gate /* if we've already gotten this record, go to the next one */
13050Sstevel@tonic-gate if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
13060Sstevel@tonic-gate continue;
13070Sstevel@tonic-gate
13080Sstevel@tonic-gate
13090Sstevel@tonic-gate dep = mddb_getrecdep(recid);
13100Sstevel@tonic-gate dep->de_flags = MDDB_F_SOFTPART;
13110Sstevel@tonic-gate rbp = dep->de_rb;
13120Sstevel@tonic-gate
13131623Stw21770 switch (rbp->rb_revision) {
13141623Stw21770 case MDDB_REV_RB:
13151623Stw21770 case MDDB_REV_RBFN:
13161623Stw21770 if ((rbp->rb_private & MD_PRV_CONVD) == 0) {
13171623Stw21770 /*
13181623Stw21770 * This means, we have an old and small record.
13191623Stw21770 * And this record hasn't already been converted
13201623Stw21770 * :-o before we create an incore metadevice
13211623Stw21770 * from this we have to convert it to a big
13221623Stw21770 * record.
13231623Stw21770 */
13241623Stw21770 small_un =
13251623Stw21770 (mp_unit32_od_t *)mddb_getrecaddr(recid);
13261623Stw21770 newreqsize = sizeof (mp_unit_t) +
13277627SChris.Horne@Sun.COM ((small_un->un_numexts - 1) *
13287627SChris.Horne@Sun.COM sizeof (struct mp_ext));
13291623Stw21770 big_un = (mp_unit_t *)kmem_zalloc(newreqsize,
13307627SChris.Horne@Sun.COM KM_SLEEP);
13311623Stw21770 softpart_convert((caddr_t)small_un,
13327627SChris.Horne@Sun.COM (caddr_t)big_un, SMALL_2_BIG);
13331623Stw21770 kmem_free(small_un, dep->de_reqsize);
13341623Stw21770 dep->de_rb_userdata = big_un;
13351623Stw21770 dep->de_reqsize = newreqsize;
13361623Stw21770 rbp->rb_private |= MD_PRV_CONVD;
13371623Stw21770 un = big_un;
13381623Stw21770 } else {
13391623Stw21770 /* Record has already been converted */
13401623Stw21770 un = (mp_unit_t *)mddb_getrecaddr(recid);
13411623Stw21770 }
13421623Stw21770 un->c.un_revision &= ~MD_64BIT_META_DEV;
13431623Stw21770 break;
13441623Stw21770 case MDDB_REV_RB64:
13451623Stw21770 case MDDB_REV_RB64FN:
13460Sstevel@tonic-gate /* Large device */
13470Sstevel@tonic-gate un = (mp_unit_t *)mddb_getrecaddr(recid);
13481623Stw21770 un->c.un_revision |= MD_64BIT_META_DEV;
13491623Stw21770 un->c.un_flag |= MD_EFILABEL;
13501623Stw21770 break;
13510Sstevel@tonic-gate }
13522077Stw21770 MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision);
13530Sstevel@tonic-gate
13540Sstevel@tonic-gate /*
13550Sstevel@tonic-gate * Create minor node for snarfed entry.
13560Sstevel@tonic-gate */
13570Sstevel@tonic-gate (void) md_create_minor_node(MD_MIN2SET(MD_SID(un)), MD_SID(un));
13580Sstevel@tonic-gate
13590Sstevel@tonic-gate if (MD_UNIT(MD_SID(un)) != NULL) {
13600Sstevel@tonic-gate /* unit is already in-core */
13610Sstevel@tonic-gate mddb_setrecprivate(recid, MD_PRV_PENDDEL);
13620Sstevel@tonic-gate continue;
13630Sstevel@tonic-gate }
13640Sstevel@tonic-gate all_sp_gotten = 0;
13650Sstevel@tonic-gate if (sp_build_incore((void *)un, 1) == 0) {
13660Sstevel@tonic-gate mddb_setrecprivate(recid, MD_PRV_GOTIT);
13670Sstevel@tonic-gate md_create_unit_incore(MD_SID(un), &sp_md_ops, 0);
13680Sstevel@tonic-gate gotsomething = 1;
13690Sstevel@tonic-gate }
13700Sstevel@tonic-gate }
13710Sstevel@tonic-gate
13720Sstevel@tonic-gate if (!all_sp_gotten)
13730Sstevel@tonic-gate return (gotsomething);
13740Sstevel@tonic-gate /* double-check records */
13750Sstevel@tonic-gate recid = mddb_makerecid(setno, 0);
13760Sstevel@tonic-gate while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0)
13770Sstevel@tonic-gate if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
13780Sstevel@tonic-gate mddb_setrecprivate(recid, MD_PRV_PENDDEL);
13790Sstevel@tonic-gate
13800Sstevel@tonic-gate return (0);
13810Sstevel@tonic-gate }
13820Sstevel@tonic-gate
13830Sstevel@tonic-gate /*
13840Sstevel@tonic-gate * FUNCTION: sp_halt()
13850Sstevel@tonic-gate * INPUT: cmd - halt cmd.
13860Sstevel@tonic-gate * setno - set number.
13870Sstevel@tonic-gate * RETURNS: 0 - success.
13880Sstevel@tonic-gate * 1 - err.
13890Sstevel@tonic-gate * PURPOSE: Perform driver halt operations. As with stripe, we
13900Sstevel@tonic-gate * support MD_HALT_CHECK and MD_HALT_DOIT. The first
13910Sstevel@tonic-gate * does a check to see if halting can be done safely
13920Sstevel@tonic-gate * (no open soft partitions), the second cleans up and
13930Sstevel@tonic-gate * shuts down the driver.
13940Sstevel@tonic-gate */
13950Sstevel@tonic-gate static int
sp_halt(md_haltcmd_t cmd,set_t setno)13960Sstevel@tonic-gate sp_halt(md_haltcmd_t cmd, set_t setno)
13970Sstevel@tonic-gate {
13980Sstevel@tonic-gate int i;
13990Sstevel@tonic-gate mdi_unit_t *ui;
14000Sstevel@tonic-gate minor_t mnum;
14010Sstevel@tonic-gate
14020Sstevel@tonic-gate if (cmd == MD_HALT_CLOSE)
14030Sstevel@tonic-gate return (0);
14040Sstevel@tonic-gate
14050Sstevel@tonic-gate if (cmd == MD_HALT_OPEN)
14060Sstevel@tonic-gate return (0);
14070Sstevel@tonic-gate
14080Sstevel@tonic-gate if (cmd == MD_HALT_UNLOAD)
14090Sstevel@tonic-gate return (0);
14100Sstevel@tonic-gate
14110Sstevel@tonic-gate if (cmd == MD_HALT_CHECK) {
14120Sstevel@tonic-gate for (i = 0; i < md_nunits; i++) {
14130Sstevel@tonic-gate mnum = MD_MKMIN(setno, i);
14140Sstevel@tonic-gate if ((ui = MDI_UNIT(mnum)) == NULL)
14150Sstevel@tonic-gate continue;
14160Sstevel@tonic-gate if (ui->ui_opsindex != sp_md_ops.md_selfindex)
14170Sstevel@tonic-gate continue;
14180Sstevel@tonic-gate if (md_unit_isopen(ui))
14190Sstevel@tonic-gate return (1);
14200Sstevel@tonic-gate }
14210Sstevel@tonic-gate return (0);
14220Sstevel@tonic-gate }
14230Sstevel@tonic-gate
14240Sstevel@tonic-gate if (cmd != MD_HALT_DOIT)
14250Sstevel@tonic-gate return (1);
14260Sstevel@tonic-gate
14270Sstevel@tonic-gate for (i = 0; i < md_nunits; i++) {
14280Sstevel@tonic-gate mnum = MD_MKMIN(setno, i);
14290Sstevel@tonic-gate if ((ui = MDI_UNIT(mnum)) == NULL)
14300Sstevel@tonic-gate continue;
14310Sstevel@tonic-gate if (ui->ui_opsindex != sp_md_ops.md_selfindex)
14320Sstevel@tonic-gate continue;
14330Sstevel@tonic-gate reset_sp((mp_unit_t *)MD_UNIT(mnum), mnum, 0);
14340Sstevel@tonic-gate }
14350Sstevel@tonic-gate
14360Sstevel@tonic-gate return (0);
14370Sstevel@tonic-gate }
14380Sstevel@tonic-gate
14390Sstevel@tonic-gate /*
14400Sstevel@tonic-gate * FUNCTION: sp_open_dev()
14410Sstevel@tonic-gate * INPUT: un - unit structure.
14420Sstevel@tonic-gate * oflags - open flags.
14430Sstevel@tonic-gate * OUTPUT: none.
14440Sstevel@tonic-gate * RETURNS: 0 - success.
14450Sstevel@tonic-gate * non-zero - err.
14460Sstevel@tonic-gate * PURPOSE: open underlying device via md_layered_open.
14470Sstevel@tonic-gate */
14480Sstevel@tonic-gate static int
sp_open_dev(mp_unit_t * un,int oflags)14490Sstevel@tonic-gate sp_open_dev(mp_unit_t *un, int oflags)
14500Sstevel@tonic-gate {
14510Sstevel@tonic-gate minor_t mnum = MD_SID(un);
14520Sstevel@tonic-gate int err;
14530Sstevel@tonic-gate md_dev64_t tmpdev;
14540Sstevel@tonic-gate set_t setno = MD_MIN2SET(MD_SID(un));
14550Sstevel@tonic-gate side_t side = mddb_getsidenum(setno);
14560Sstevel@tonic-gate
14570Sstevel@tonic-gate tmpdev = un->un_dev;
14580Sstevel@tonic-gate /*
14590Sstevel@tonic-gate * Do the open by device id if underlying is regular
14600Sstevel@tonic-gate */
14610Sstevel@tonic-gate if ((md_getmajor(tmpdev) != md_major) &&
14627627SChris.Horne@Sun.COM md_devid_found(setno, side, un->un_key) == 1) {
14630Sstevel@tonic-gate tmpdev = md_resolve_bydevid(mnum, tmpdev, un->un_key);
14640Sstevel@tonic-gate }
14650Sstevel@tonic-gate err = md_layered_open(mnum, &tmpdev, oflags);
14660Sstevel@tonic-gate un->un_dev = tmpdev;
14670Sstevel@tonic-gate
14680Sstevel@tonic-gate if (err)
14690Sstevel@tonic-gate return (ENXIO);
14700Sstevel@tonic-gate
14710Sstevel@tonic-gate return (0);
14720Sstevel@tonic-gate }
14730Sstevel@tonic-gate
14740Sstevel@tonic-gate /*
14750Sstevel@tonic-gate * FUNCTION: sp_open()
14760Sstevel@tonic-gate * INPUT: dev - device to open.
14770Sstevel@tonic-gate * flag - pass-through flag.
14780Sstevel@tonic-gate * otyp - pass-through open type.
14790Sstevel@tonic-gate * cred_p - credentials.
14800Sstevel@tonic-gate * md_oflags - open flags.
14810Sstevel@tonic-gate * OUTPUT: none.
14820Sstevel@tonic-gate * RETURNS: 0 - success.
14830Sstevel@tonic-gate * non-zero - err.
14840Sstevel@tonic-gate * PURPOSE: open a soft partition.
14850Sstevel@tonic-gate */
14860Sstevel@tonic-gate /* ARGSUSED */
14870Sstevel@tonic-gate static int
sp_open(dev_t * dev,int flag,int otyp,cred_t * cred_p,int md_oflags)14880Sstevel@tonic-gate sp_open(
14890Sstevel@tonic-gate dev_t *dev,
14900Sstevel@tonic-gate int flag,
14910Sstevel@tonic-gate int otyp,
14920Sstevel@tonic-gate cred_t *cred_p,
14930Sstevel@tonic-gate int md_oflags
14940Sstevel@tonic-gate )
14950Sstevel@tonic-gate {
14960Sstevel@tonic-gate minor_t mnum = getminor(*dev);
14970Sstevel@tonic-gate mdi_unit_t *ui = MDI_UNIT(mnum);
14980Sstevel@tonic-gate mp_unit_t *un;
14990Sstevel@tonic-gate int err = 0;
15000Sstevel@tonic-gate set_t setno;
15010Sstevel@tonic-gate
150246Sskamm /*
150346Sskamm * When doing an open of a multi owner metadevice, check to see if this
150446Sskamm * node is a starting node and if a reconfig cycle is underway.
150546Sskamm * If so, the system isn't sufficiently set up enough to handle the
150646Sskamm * open (which involves I/O during sp_validate), so fail with ENXIO.
150746Sskamm */
150846Sskamm setno = MD_MIN2SET(mnum);
150946Sskamm if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) ==
151046Sskamm (MD_SET_MNSET | MD_SET_MN_START_RC)) {
151146Sskamm return (ENXIO);
151246Sskamm }
151346Sskamm
15140Sstevel@tonic-gate /* grab necessary locks */
15150Sstevel@tonic-gate un = (mp_unit_t *)md_unit_openclose_enter(ui);
15160Sstevel@tonic-gate setno = MD_UN2SET(un);
15170Sstevel@tonic-gate
15180Sstevel@tonic-gate /* open underlying device, if necessary */
15190Sstevel@tonic-gate if (! md_unit_isopen(ui) || (md_oflags & MD_OFLG_PROBEDEV)) {
15200Sstevel@tonic-gate if ((err = sp_open_dev(un, md_oflags)) != 0)
15210Sstevel@tonic-gate goto out;
15220Sstevel@tonic-gate
15230Sstevel@tonic-gate if (MD_MNSET_SETNO(setno)) {
15240Sstevel@tonic-gate /* For probe, don't incur the overhead of validate */
15250Sstevel@tonic-gate if (!(md_oflags & MD_OFLG_PROBEDEV)) {
15260Sstevel@tonic-gate /*
15270Sstevel@tonic-gate * Don't call sp_validate while
15280Sstevel@tonic-gate * unit_openclose lock is held. So, actually
15290Sstevel@tonic-gate * open the device, drop openclose lock,
15300Sstevel@tonic-gate * call sp_validate, reacquire openclose lock,
15310Sstevel@tonic-gate * and close the device. If sp_validate
15320Sstevel@tonic-gate * succeeds, then device will be re-opened.
15330Sstevel@tonic-gate */
15340Sstevel@tonic-gate if ((err = md_unit_incopen(mnum, flag,
15350Sstevel@tonic-gate otyp)) != 0)
15360Sstevel@tonic-gate goto out;
15370Sstevel@tonic-gate
15380Sstevel@tonic-gate mutex_enter(&ui->ui_mx);
15390Sstevel@tonic-gate ui->ui_lock |= MD_UL_OPENINPROGRESS;
15400Sstevel@tonic-gate mutex_exit(&ui->ui_mx);
15410Sstevel@tonic-gate md_unit_openclose_exit(ui);
15420Sstevel@tonic-gate if (otyp != OTYP_LYR)
15430Sstevel@tonic-gate rw_exit(&md_unit_array_rw.lock);
15440Sstevel@tonic-gate
15450Sstevel@tonic-gate err = sp_validate(un);
15460Sstevel@tonic-gate
15470Sstevel@tonic-gate if (otyp != OTYP_LYR)
15480Sstevel@tonic-gate rw_enter(&md_unit_array_rw.lock,
15490Sstevel@tonic-gate RW_READER);
15500Sstevel@tonic-gate (void) md_unit_openclose_enter(ui);
15510Sstevel@tonic-gate (void) md_unit_decopen(mnum, otyp);
15520Sstevel@tonic-gate mutex_enter(&ui->ui_mx);
15530Sstevel@tonic-gate ui->ui_lock &= ~MD_UL_OPENINPROGRESS;
15540Sstevel@tonic-gate cv_broadcast(&ui->ui_cv);
15550Sstevel@tonic-gate mutex_exit(&ui->ui_mx);
15560Sstevel@tonic-gate /*
15570Sstevel@tonic-gate * Should be in the same state as before
15580Sstevel@tonic-gate * the sp_validate.
15590Sstevel@tonic-gate */
15600Sstevel@tonic-gate if (err != 0) {
15610Sstevel@tonic-gate /* close the device opened above */
15620Sstevel@tonic-gate md_layered_close(un->un_dev, md_oflags);
15630Sstevel@tonic-gate err = EIO;
15640Sstevel@tonic-gate goto out;
15650Sstevel@tonic-gate }
15660Sstevel@tonic-gate }
15670Sstevel@tonic-gate /*
15680Sstevel@tonic-gate * As we're a multi-owner metadevice we need to ensure
15690Sstevel@tonic-gate * that all nodes have the same idea of the status.
15700Sstevel@tonic-gate * sp_validate() will mark the device as errored (if
15710Sstevel@tonic-gate * it cannot read the watermark) or ok (if it was
15720Sstevel@tonic-gate * previously errored but the watermark is now valid).
15730Sstevel@tonic-gate * This code-path is only entered on the non-probe open
15740Sstevel@tonic-gate * so we will maintain the errored state during a probe
15750Sstevel@tonic-gate * call. This means the sys-admin must metarecover -m
15760Sstevel@tonic-gate * to reset the soft-partition error.
15770Sstevel@tonic-gate */
15780Sstevel@tonic-gate } else {
15790Sstevel@tonic-gate /* For probe, don't incur the overhead of validate */
15800Sstevel@tonic-gate if (!(md_oflags & MD_OFLG_PROBEDEV) &&
15810Sstevel@tonic-gate (err = sp_validate(un)) != 0) {
15820Sstevel@tonic-gate /* close the device opened above */
15830Sstevel@tonic-gate md_layered_close(un->un_dev, md_oflags);
15840Sstevel@tonic-gate err = EIO;
15850Sstevel@tonic-gate goto out;
15860Sstevel@tonic-gate } else {
15870Sstevel@tonic-gate /*
15880Sstevel@tonic-gate * we succeeded in validating the on disk
15890Sstevel@tonic-gate * format versus the in core, so reset the
15900Sstevel@tonic-gate * status if it's in error
15910Sstevel@tonic-gate */
15920Sstevel@tonic-gate if (un->un_status == MD_SP_ERR) {
15930Sstevel@tonic-gate un->un_status = MD_SP_OK;
15940Sstevel@tonic-gate }
15950Sstevel@tonic-gate }
15960Sstevel@tonic-gate }
15970Sstevel@tonic-gate }
15980Sstevel@tonic-gate
15990Sstevel@tonic-gate /* count open */
16000Sstevel@tonic-gate if ((err = md_unit_incopen(mnum, flag, otyp)) != 0)
16010Sstevel@tonic-gate goto out;
16020Sstevel@tonic-gate
16030Sstevel@tonic-gate out:
16040Sstevel@tonic-gate md_unit_openclose_exit(ui);
16050Sstevel@tonic-gate return (err);
16060Sstevel@tonic-gate }
16070Sstevel@tonic-gate
16080Sstevel@tonic-gate /*
16090Sstevel@tonic-gate * FUNCTION: sp_close()
16100Sstevel@tonic-gate * INPUT: dev - device to close.
16110Sstevel@tonic-gate * flag - pass-through flag.
16120Sstevel@tonic-gate * otyp - pass-through type.
16130Sstevel@tonic-gate * cred_p - credentials.
16140Sstevel@tonic-gate * md_cflags - close flags.
16150Sstevel@tonic-gate * OUTPUT: none.
16160Sstevel@tonic-gate * RETURNS: 0 - success.
16170Sstevel@tonic-gate * non-zero - err.
16180Sstevel@tonic-gate * PURPOSE: close a soft paritition.
16190Sstevel@tonic-gate */
16200Sstevel@tonic-gate /* ARGSUSED */
16210Sstevel@tonic-gate static int
sp_close(dev_t dev,int flag,int otyp,cred_t * cred_p,int md_cflags)16220Sstevel@tonic-gate sp_close(
16230Sstevel@tonic-gate dev_t dev,
16240Sstevel@tonic-gate int flag,
16250Sstevel@tonic-gate int otyp,
16260Sstevel@tonic-gate cred_t *cred_p,
16270Sstevel@tonic-gate int md_cflags
16280Sstevel@tonic-gate )
16290Sstevel@tonic-gate {
16300Sstevel@tonic-gate minor_t mnum = getminor(dev);
16310Sstevel@tonic-gate mdi_unit_t *ui = MDI_UNIT(mnum);
16320Sstevel@tonic-gate mp_unit_t *un;
16330Sstevel@tonic-gate int err = 0;
16340Sstevel@tonic-gate
16350Sstevel@tonic-gate /* grab necessary locks */
16360Sstevel@tonic-gate un = (mp_unit_t *)md_unit_openclose_enter(ui);
16370Sstevel@tonic-gate
16380Sstevel@tonic-gate /* count closed */
16390Sstevel@tonic-gate if ((err = md_unit_decopen(mnum, otyp)) != 0)
16400Sstevel@tonic-gate goto out;
16410Sstevel@tonic-gate
16420Sstevel@tonic-gate /* close devices, if necessary */
16430Sstevel@tonic-gate if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
16440Sstevel@tonic-gate md_layered_close(un->un_dev, md_cflags);
16450Sstevel@tonic-gate }
16460Sstevel@tonic-gate
16470Sstevel@tonic-gate /*
16480Sstevel@tonic-gate * If a MN set and transient capabilities (eg ABR/DMR) are set,
16490Sstevel@tonic-gate * clear these capabilities if this is the last close in
16500Sstevel@tonic-gate * the cluster
16510Sstevel@tonic-gate */
16520Sstevel@tonic-gate if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
16530Sstevel@tonic-gate (ui->ui_tstate & MD_ABR_CAP)) {
16540Sstevel@tonic-gate md_unit_openclose_exit(ui);
16550Sstevel@tonic-gate mdmn_clear_all_capabilities(mnum);
16560Sstevel@tonic-gate return (0);
16570Sstevel@tonic-gate }
16580Sstevel@tonic-gate /* unlock, return success */
16590Sstevel@tonic-gate out:
16600Sstevel@tonic-gate md_unit_openclose_exit(ui);
16610Sstevel@tonic-gate return (err);
16620Sstevel@tonic-gate }
16630Sstevel@tonic-gate
16640Sstevel@tonic-gate
16650Sstevel@tonic-gate /* used in sp_dump routine */
16660Sstevel@tonic-gate static struct buf dumpbuf;
16670Sstevel@tonic-gate
16680Sstevel@tonic-gate /*
16690Sstevel@tonic-gate * FUNCTION: sp_dump()
16700Sstevel@tonic-gate * INPUT: dev - device to dump to.
16710Sstevel@tonic-gate * addr - address to dump.
16720Sstevel@tonic-gate * blkno - blkno on device.
16730Sstevel@tonic-gate * nblk - number of blocks to dump.
16740Sstevel@tonic-gate * OUTPUT: none.
16750Sstevel@tonic-gate * RETURNS: result from bdev_dump.
16760Sstevel@tonic-gate * PURPOSE: This routine dumps memory to the disk. It assumes that
16770Sstevel@tonic-gate * the memory has already been mapped into mainbus space.
16780Sstevel@tonic-gate * It is called at disk interrupt priority when the system
16790Sstevel@tonic-gate * is in trouble.
16800Sstevel@tonic-gate * NOTE: this function is defined using 32-bit arguments,
16810Sstevel@tonic-gate * but soft partitioning is internally 64-bit. Arguments
16820Sstevel@tonic-gate * are casted where appropriate.
16830Sstevel@tonic-gate */
16840Sstevel@tonic-gate static int
sp_dump(dev_t dev,caddr_t addr,daddr_t blkno,int nblk)16850Sstevel@tonic-gate sp_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
16860Sstevel@tonic-gate {
16870Sstevel@tonic-gate mp_unit_t *un;
16880Sstevel@tonic-gate buf_t *bp;
16890Sstevel@tonic-gate sp_ext_length_t nb;
16900Sstevel@tonic-gate daddr_t mapblk;
16910Sstevel@tonic-gate int result;
16920Sstevel@tonic-gate int more;
16930Sstevel@tonic-gate int saveresult = 0;
16940Sstevel@tonic-gate
16950Sstevel@tonic-gate /*
16960Sstevel@tonic-gate * Don't need to grab the unit lock.
16970Sstevel@tonic-gate * Cause nothing else is supposed to be happenning.
16980Sstevel@tonic-gate * Also dump is not supposed to sleep.
16990Sstevel@tonic-gate */
17000Sstevel@tonic-gate un = (mp_unit_t *)MD_UNIT(getminor(dev));
17010Sstevel@tonic-gate
17020Sstevel@tonic-gate if ((diskaddr_t)blkno >= un->c.un_total_blocks)
17030Sstevel@tonic-gate return (EINVAL);
17040Sstevel@tonic-gate
17050Sstevel@tonic-gate if (((diskaddr_t)blkno + nblk) > un->c.un_total_blocks)
17060Sstevel@tonic-gate return (EINVAL);
17070Sstevel@tonic-gate
17080Sstevel@tonic-gate bp = &dumpbuf;
17090Sstevel@tonic-gate nb = (sp_ext_length_t)dbtob(nblk);
17100Sstevel@tonic-gate do {
17110Sstevel@tonic-gate bzero((caddr_t)bp, sizeof (*bp));
17120Sstevel@tonic-gate more = sp_mapbuf(un, (sp_ext_offset_t)blkno, nb, bp);
17130Sstevel@tonic-gate nblk = (int)(btodb(bp->b_bcount));
17140Sstevel@tonic-gate mapblk = bp->b_blkno;
17150Sstevel@tonic-gate result = bdev_dump(bp->b_edev, addr, mapblk, nblk);
17160Sstevel@tonic-gate if (result)
17170Sstevel@tonic-gate saveresult = result;
17180Sstevel@tonic-gate
17190Sstevel@tonic-gate nb -= bp->b_bcount;
17200Sstevel@tonic-gate addr += bp->b_bcount;
17210Sstevel@tonic-gate blkno += nblk;
17220Sstevel@tonic-gate } while (more);
17230Sstevel@tonic-gate
17240Sstevel@tonic-gate return (saveresult);
17250Sstevel@tonic-gate }
17260Sstevel@tonic-gate
17270Sstevel@tonic-gate static int
sp_imp_set(set_t setno)17280Sstevel@tonic-gate sp_imp_set(
17290Sstevel@tonic-gate set_t setno
17300Sstevel@tonic-gate )
17310Sstevel@tonic-gate {
17320Sstevel@tonic-gate mddb_recid_t recid;
17330Sstevel@tonic-gate int gotsomething;
17340Sstevel@tonic-gate mddb_type_t rec_type;
17350Sstevel@tonic-gate mddb_de_ic_t *dep;
17360Sstevel@tonic-gate mddb_rb32_t *rbp;
17370Sstevel@tonic-gate mp_unit_t *un64;
17380Sstevel@tonic-gate mp_unit32_od_t *un32;
17391623Stw21770 md_dev64_t self_devt;
17400Sstevel@tonic-gate minor_t *self_id; /* minor needs to be updated */
17410Sstevel@tonic-gate md_parent_t *parent_id; /* parent needs to be updated */
17420Sstevel@tonic-gate mddb_recid_t *record_id; /* record id needs to be updated */
17430Sstevel@tonic-gate
17440Sstevel@tonic-gate gotsomething = 0;
17450Sstevel@tonic-gate
17460Sstevel@tonic-gate rec_type = (mddb_type_t)md_getshared_key(setno,
17477627SChris.Horne@Sun.COM sp_md_ops.md_driver.md_drivername);
17480Sstevel@tonic-gate recid = mddb_makerecid(setno, 0);
17490Sstevel@tonic-gate
17500Sstevel@tonic-gate while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0) {
17510Sstevel@tonic-gate if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
17520Sstevel@tonic-gate continue;
17530Sstevel@tonic-gate
17540Sstevel@tonic-gate dep = mddb_getrecdep(recid);
17550Sstevel@tonic-gate rbp = dep->de_rb;
17560Sstevel@tonic-gate
17571623Stw21770 switch (rbp->rb_revision) {
17581623Stw21770 case MDDB_REV_RB:
17591623Stw21770 case MDDB_REV_RBFN:
17600Sstevel@tonic-gate /*
17610Sstevel@tonic-gate * Small device
17620Sstevel@tonic-gate */
17630Sstevel@tonic-gate un32 = (mp_unit32_od_t *)mddb_getrecaddr(recid);
17640Sstevel@tonic-gate self_id = &(un32->c.un_self_id);
17650Sstevel@tonic-gate parent_id = &(un32->c.un_parent);
17660Sstevel@tonic-gate record_id = &(un32->c.un_record_id);
17670Sstevel@tonic-gate
17680Sstevel@tonic-gate if (!md_update_minor(setno, mddb_getsidenum
17697627SChris.Horne@Sun.COM (setno), un32->un_key))
17700Sstevel@tonic-gate goto out;
17711623Stw21770 break;
17721623Stw21770
17731623Stw21770 case MDDB_REV_RB64:
17741623Stw21770 case MDDB_REV_RB64FN:
17750Sstevel@tonic-gate un64 = (mp_unit_t *)mddb_getrecaddr(recid);
17760Sstevel@tonic-gate self_id = &(un64->c.un_self_id);
17770Sstevel@tonic-gate parent_id = &(un64->c.un_parent);
17780Sstevel@tonic-gate record_id = &(un64->c.un_record_id);
17790Sstevel@tonic-gate
17800Sstevel@tonic-gate if (!md_update_minor(setno, mddb_getsidenum
17817627SChris.Horne@Sun.COM (setno), un64->un_key))
17820Sstevel@tonic-gate goto out;
17831623Stw21770 break;
17841623Stw21770 }
17851623Stw21770
17861623Stw21770 /*
17871623Stw21770 * If this is a top level and a friendly name metadevice,
17881623Stw21770 * update its minor in the namespace.
17891623Stw21770 */
17901623Stw21770 if ((*parent_id == MD_NO_PARENT) &&
17911623Stw21770 ((rbp->rb_revision == MDDB_REV_RBFN) ||
17921623Stw21770 (rbp->rb_revision == MDDB_REV_RB64FN))) {
17931623Stw21770
17941623Stw21770 self_devt = md_makedevice(md_major, *self_id);
17951623Stw21770 if (!md_update_top_device_minor(setno,
17961623Stw21770 mddb_getsidenum(setno), self_devt))
17971623Stw21770 goto out;
17980Sstevel@tonic-gate }
17990Sstevel@tonic-gate
18000Sstevel@tonic-gate /*
18010Sstevel@tonic-gate * Update unit with the imported setno
18020Sstevel@tonic-gate *
18030Sstevel@tonic-gate */
18040Sstevel@tonic-gate mddb_setrecprivate(recid, MD_PRV_GOTIT);
18050Sstevel@tonic-gate
18060Sstevel@tonic-gate *self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
18070Sstevel@tonic-gate if (*parent_id != MD_NO_PARENT)
18080Sstevel@tonic-gate *parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
18090Sstevel@tonic-gate *record_id = MAKERECID(setno, DBID(*record_id));
18100Sstevel@tonic-gate
18110Sstevel@tonic-gate gotsomething = 1;
18120Sstevel@tonic-gate }
18130Sstevel@tonic-gate
18140Sstevel@tonic-gate out:
18150Sstevel@tonic-gate return (gotsomething);
18160Sstevel@tonic-gate }
18170Sstevel@tonic-gate
18180Sstevel@tonic-gate static md_named_services_t sp_named_services[] = {
18190Sstevel@tonic-gate {NULL, 0}
18200Sstevel@tonic-gate };
18210Sstevel@tonic-gate
18220Sstevel@tonic-gate md_ops_t sp_md_ops = {
18230Sstevel@tonic-gate sp_open, /* open */
18240Sstevel@tonic-gate sp_close, /* close */
18250Sstevel@tonic-gate md_sp_strategy, /* strategy */
18260Sstevel@tonic-gate NULL, /* print */
18270Sstevel@tonic-gate sp_dump, /* dump */
18280Sstevel@tonic-gate NULL, /* read */
18290Sstevel@tonic-gate NULL, /* write */
18300Sstevel@tonic-gate md_sp_ioctl, /* ioctl, */
18310Sstevel@tonic-gate sp_snarf, /* snarf */
18320Sstevel@tonic-gate sp_halt, /* halt */
18330Sstevel@tonic-gate NULL, /* aread */
18340Sstevel@tonic-gate NULL, /* awrite */
18350Sstevel@tonic-gate sp_imp_set, /* import set */
18360Sstevel@tonic-gate sp_named_services
18370Sstevel@tonic-gate };
18380Sstevel@tonic-gate
18390Sstevel@tonic-gate static void
init_init()18400Sstevel@tonic-gate init_init()
18410Sstevel@tonic-gate {
18420Sstevel@tonic-gate sp_parent_cache = kmem_cache_create("md_softpart_parent",
18430Sstevel@tonic-gate sizeof (md_spps_t), 0, sp_parent_constructor,
18440Sstevel@tonic-gate sp_parent_destructor, sp_run_queue, NULL, NULL, 0);
18450Sstevel@tonic-gate sp_child_cache = kmem_cache_create("md_softpart_child",
18460Sstevel@tonic-gate sizeof (md_spcs_t) - sizeof (buf_t) + biosize(), 0,
18470Sstevel@tonic-gate sp_child_constructor, sp_child_destructor, sp_run_queue,
18480Sstevel@tonic-gate NULL, NULL, 0);
18490Sstevel@tonic-gate }
18500Sstevel@tonic-gate
18510Sstevel@tonic-gate static void
fini_uninit()18520Sstevel@tonic-gate fini_uninit()
18530Sstevel@tonic-gate {
18540Sstevel@tonic-gate kmem_cache_destroy(sp_parent_cache);
18550Sstevel@tonic-gate kmem_cache_destroy(sp_child_cache);
18560Sstevel@tonic-gate sp_parent_cache = sp_child_cache = NULL;
18570Sstevel@tonic-gate }
18580Sstevel@tonic-gate
18590Sstevel@tonic-gate /* define the module linkage */
18604932Spetede MD_PLUGIN_MISC_MODULE("soft partition module", init_init(), fini_uninit())
1861