/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Driver for Virtual Disk.
 */

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

/*
 * Machine specific Hertz is kept here
 */
extern clock_t			md_hz;

/*
 * Externs.
 */
extern int			(*mdv_strategy_tstpnt)(buf_t *, int, void *);
extern major_t			md_major;
extern unit_t			md_nunits;
extern set_t			md_nsets;
extern md_set_t			md_set[];
extern md_set_io_t		md_set_io[];
extern md_ops_t			**md_ops;
extern md_ops_t			*md_opslist;
extern ddi_modhandle_t		*md_mods;
extern dev_info_t		*md_devinfo;
extern md_krwlock_t		md_unit_array_rw;
extern kmutex_t			md_mx;
extern kcondvar_t		md_cv;
extern md_krwlock_t		hsp_rwlp;
extern md_krwlock_t		ni_rwlp;
extern int			md_num_daemons;
extern int			md_status;
extern int			md_ioctl_cnt;
extern int			md_mtioctl_cnt;
extern struct metatransops	metatransops;
extern md_event_queue_t		*md_event_queue;
extern md_resync_t		md_cpr_resync;
extern int			md_done_daemon_threads;
extern int			md_ff_daemon_threads;

extern mddb_set_t	*mddb_setenter(set_t setno, int flag, int *errorcodep);
extern void		mddb_setexit(mddb_set_t *s);
extern void		*lookup_entry(struct nm_next_hdr *, set_t,
			    side_t, mdkey_t, md_dev64_t, int);
extern struct nm_next_hdr	*get_first_record(set_t, int, int);
extern dev_t		getrootdev(void);

struct mdq_anchor	md_done_daemon;		/* done request queue */
struct mdq_anchor	md_mstr_daemon;		/* mirror error, WOW requests */
struct mdq_anchor	md_mhs_daemon;		/* mirror hotspare requests queue */
struct mdq_anchor	md_hs_daemon;		/* raid hotspare requests queue */
struct mdq_anchor	md_ff_daemonq;		/* failfast request queue */
struct mdq_anchor	md_mirror_daemon;	/* mirror owner queue */
struct mdq_anchor	md_mirror_io_daemon;	/* mirror owner i/o queue */
struct mdq_anchor	md_mirror_rs_daemon;	/* mirror resync done queue */
struct mdq_anchor	md_sp_daemon;		/* soft-part error daemon queue */
struct mdq_anchor	md_mto_daemon;		/* mirror timeout daemon queue */

int md_done_daemon_threads = 1;	/* threads for md_done_daemon requestq */
int md_mstr_daemon_threads = 1;	/* threads for md_mstr_daemon requestq */
int md_mhs_daemon_threads = 1;	/* threads for md_mhs_daemon requestq */
int md_hs_daemon_threads = 1;	/* threads for md_hs_daemon requestq */
int md_ff_daemon_threads = 3;	/* threads for md_ff_daemon requestq */
int md_mirror_daemon_threads = 1; /* threads for md_mirror_daemon requestq */
int md_sp_daemon_threads = 1;	/* threads for md_sp_daemon requestq */
int md_mto_daemon_threads = 1;	/* threads for md_mto_daemon requestq */

#ifdef DEBUG
/* Flag to switch on debug messages */
int md_release_reacquire_debug = 0;	/* debug flag */
#endif

/*
 * The md_daemon_queues array is a table of pointers to request queues
 * and the number of threads associated with each request queue.
 * When the number of threads is set to 1, the order of execution is
 * sequential.
 * The number of threads for each queue is defined as a global variable
 * to enable kernel tuning.
 */
#define	MD_DAEMON_QUEUES 11

md_requestq_entry_t md_daemon_queues[MD_DAEMON_QUEUES] = {
	{&md_done_daemon, &md_done_daemon_threads},
	{&md_mstr_daemon, &md_mstr_daemon_threads},
	{&md_hs_daemon, &md_hs_daemon_threads},
	{&md_ff_daemonq, &md_ff_daemon_threads},
	{&md_mirror_daemon, &md_mirror_daemon_threads},
	{&md_mirror_io_daemon, &md_mirror_daemon_threads},
	{&md_mirror_rs_daemon, &md_mirror_daemon_threads},
	{&md_sp_daemon, &md_sp_daemon_threads},
	{&md_mhs_daemon, &md_mhs_daemon_threads},
	{&md_mto_daemon, &md_mto_daemon_threads},
	{0, 0}
};

/*
 * Number of times a message is retried before a warning is issued
 * to the operator.
 */
#define	MD_MN_WARN_INTVL	10

/*
 * Setting the retry count to one (pre-decremented) so that we actually
 * do no retries when committing/deleting a mddb rec.  The underlying
 * disk driver does several retries to check if the disk is really dead
 * or not, so there is no reason for us to retry on top of the driver's
 * retries.
 */
uint_t md_retry_cnt = 1;	/* global so it can be patched */

/*
 * How many times to try to do the door_ki_upcall() in mdmn_ksend_message.
 * Again, made patchable here should it prove useful.
 */
uint_t md_send_retry_limit = 30;

/*
 * Bug # 1212146
 * Before this change the user had to pass in a short-aligned buffer
 * because of problems in some underlying device drivers.  This problem
 * seems to have been corrected in the underlying drivers, so we default
 * to not requiring any alignment.  If the user needs to check for a
 * specific alignment, md_uio_alignment_mask may be set in /etc/system
 * to accomplish this.  To get the behavior from before this fix, set
 * md_uio_alignment_mask to 1; to check for word alignment, set it to 3;
 * for double-word alignment, set it to 7; etc.
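 *
 * As an illustrative sketch only (not the actual md_chk_uio() code):
 * a mask of (2^n - 1) rejects any transfer whose base address has one
 * of the low n bits set, along the lines of
 *
 *	if (((uintptr_t)iov->iov_base & md_uio_alignment_mask) != 0)
 *		return (EINVAL);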
* * [Other part of fix is in function md_chk_uio()] */ static int md_uio_alignment_mask = 0; /* * for md_dev64_t translation */ struct md_xlate_table *md_tuple_table; struct md_xlate_major_table *md_major_tuple_table; int md_tuple_length; uint_t md_majortab_len; /* Function declarations */ static int md_create_probe_rqlist(md_probedev_impl_t *plist, daemon_queue_t **hdr, intptr_t (*probe_test)()); /* * manipulate global status */ void md_set_status(int bits) { mutex_enter(&md_mx); md_status |= bits; mutex_exit(&md_mx); } void md_clr_status(int bits) { mutex_enter(&md_mx); md_status &= ~bits; mutex_exit(&md_mx); } int md_get_status() { int result; mutex_enter(&md_mx); result = md_status; mutex_exit(&md_mx); return (result); } void md_set_setstatus(set_t setno, int bits) { ASSERT(setno != MD_SET_BAD && setno < MD_MAXSETS); mutex_enter(&md_mx); md_set[setno].s_status |= bits; mutex_exit(&md_mx); } void md_clr_setstatus(set_t setno, int bits) { ASSERT(setno != MD_SET_BAD && setno < MD_MAXSETS); mutex_enter(&md_mx); md_set[setno].s_status &= ~bits; mutex_exit(&md_mx); } uint_t md_get_setstatus(set_t setno) { uint_t result; ASSERT(setno != MD_SET_BAD && setno < MD_MAXSETS); mutex_enter(&md_mx); result = md_set[setno].s_status; mutex_exit(&md_mx); return (result); } /* * md_unit_readerlock_common: * ------------------------- * Mark the given unit as having a reader reference. Spin waiting for any * writer references to be released. * * Input: * ui unit reference * lock_held 0 => ui_mx needs to be grabbed * 1 => ui_mx already held * Output: * mm_unit_t corresponding to unit structure * ui->ui_readercnt incremented */ static void * md_unit_readerlock_common(mdi_unit_t *ui, int lock_held) { uint_t flag = MD_UL_WRITER | MD_UL_WANABEWRITER; if (!lock_held) mutex_enter(&ui->ui_mx); while (ui->ui_lock & flag) { if (panicstr) { if (ui->ui_lock & MD_UL_WRITER) panic("md: writer lock is held"); break; } cv_wait(&ui->ui_cv, &ui->ui_mx); } ui->ui_readercnt++; if (!lock_held) mutex_exit(&ui->ui_mx); return (MD_UNIT(ui->ui_link.ln_id)); } void * md_unit_readerlock(mdi_unit_t *ui) { return (md_unit_readerlock_common(ui, 0)); } /* * md_unit_writerlock_common: * ------------------------- * Acquire a unique writer reference. Causes previous readers to drain. * Spins if a writer reference already exists or if a previous reader/writer * dropped the lock to allow a ksend_message to be despatched. * * Input: * ui unit reference * lock_held 0 => grab ui_mx * 1 => ui_mx already held on entry * Output: * mm_unit_t reference */ static void * md_unit_writerlock_common(mdi_unit_t *ui, int lock_held) { uint_t flag = MD_UL_WRITER; if (panicstr) panic("md: writer lock not allowed"); if (!lock_held) mutex_enter(&ui->ui_mx); while ((ui->ui_lock & flag) || (ui->ui_readercnt != 0)) { ui->ui_wanabecnt++; ui->ui_lock |= MD_UL_WANABEWRITER; cv_wait(&ui->ui_cv, &ui->ui_mx); if (--ui->ui_wanabecnt == 0) ui->ui_lock &= ~MD_UL_WANABEWRITER; } ui->ui_lock |= MD_UL_WRITER; ui->ui_owner = curthread; if (!lock_held) mutex_exit(&ui->ui_mx); return (MD_UNIT(ui->ui_link.ln_id)); } void * md_unit_writerlock(mdi_unit_t *ui) { return (md_unit_writerlock_common(ui, 0)); } /* * md_unit_readerexit_common: * ------------------------- * Release the readerlock for the specified unit. If the reader count reaches * zero and there are waiting writers (MD_UL_WANABEWRITER set) wake them up. 
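 *
 * For context, callers pair the lock/unlock primitives as in this
 * sketch (the same pattern md_get_parent() uses later in this file):
 *
 *	un = (md_unit_t *)md_unit_readerlock(ui);
 *	... examine un ...
 *	md_unit_readerexit(ui);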
* * Input: * ui unit reference * lock_held 0 => ui_mx needs to be acquired * 1 => ui_mx already held */ static void md_unit_readerexit_common(mdi_unit_t *ui, int lock_held) { if (!lock_held) mutex_enter(&ui->ui_mx); ASSERT((ui->ui_lock & MD_UL_WRITER) == 0); ASSERT(ui->ui_readercnt != 0); ui->ui_readercnt--; if ((ui->ui_wanabecnt != 0) && (ui->ui_readercnt == 0)) cv_broadcast(&ui->ui_cv); if (!lock_held) mutex_exit(&ui->ui_mx); } void md_unit_readerexit(mdi_unit_t *ui) { md_unit_readerexit_common(ui, 0); } /* * md_unit_writerexit_common: * ------------------------- * Release the writerlock currently held on the unit. Wake any threads waiting * on becoming reader or writer (MD_UL_WANABEWRITER set). * * Input: * ui unit reference * lock_held 0 => ui_mx to be acquired * 1 => ui_mx already held */ static void md_unit_writerexit_common(mdi_unit_t *ui, int lock_held) { if (!lock_held) mutex_enter(&ui->ui_mx); ASSERT((ui->ui_lock & MD_UL_WRITER) != 0); ASSERT(ui->ui_readercnt == 0); ui->ui_lock &= ~MD_UL_WRITER; ui->ui_owner = NULL; cv_broadcast(&ui->ui_cv); if (!lock_held) mutex_exit(&ui->ui_mx); } void md_unit_writerexit(mdi_unit_t *ui) { md_unit_writerexit_common(ui, 0); } void * md_io_readerlock(mdi_unit_t *ui) { md_io_lock_t *io = ui->ui_io_lock; ASSERT(io); /* checks case where no io lock allocated */ mutex_enter(&io->io_mx); while (io->io_lock & (MD_UL_WRITER | MD_UL_WANABEWRITER)) { if (panicstr) { if (io->io_lock & MD_UL_WRITER) panic("md: writer lock is held"); break; } cv_wait(&io->io_cv, &io->io_mx); } io->io_readercnt++; mutex_exit(&io->io_mx); return (MD_UNIT(ui->ui_link.ln_id)); } void * md_io_writerlock(mdi_unit_t *ui) { md_io_lock_t *io = ui->ui_io_lock; ASSERT(io); /* checks case where no io lock allocated */ if (panicstr) panic("md: writer lock not allowed"); mutex_enter(&io->io_mx); while ((io->io_lock & MD_UL_WRITER) || (io->io_readercnt != 0)) { io->io_wanabecnt++; io->io_lock |= MD_UL_WANABEWRITER; cv_wait(&io->io_cv, &io->io_mx); if (--io->io_wanabecnt == 0) io->io_lock &= ~MD_UL_WANABEWRITER; } io->io_lock |= MD_UL_WRITER; io->io_owner = curthread; mutex_exit(&io->io_mx); return (MD_UNIT(ui->ui_link.ln_id)); } void md_io_readerexit(mdi_unit_t *ui) { md_io_lock_t *io = ui->ui_io_lock; mutex_enter(&io->io_mx); ASSERT((io->io_lock & MD_UL_WRITER) == 0); ASSERT(io->io_readercnt != 0); io->io_readercnt--; if ((io->io_wanabecnt != 0) && (io->io_readercnt == 0)) { cv_broadcast(&io->io_cv); } mutex_exit(&io->io_mx); } void md_io_writerexit(mdi_unit_t *ui) { md_io_lock_t *io = ui->ui_io_lock; mutex_enter(&io->io_mx); ASSERT((io->io_lock & MD_UL_WRITER) != 0); ASSERT(io->io_readercnt == 0); io->io_lock &= ~MD_UL_WRITER; io->io_owner = NULL; cv_broadcast(&io->io_cv); mutex_exit(&io->io_mx); } /* * Attempt to grab that set of locks defined as global. * A mask containing the set of global locks that are owned upon * entry is input. Any additional global locks are then grabbed. * This keeps the caller from having to know the set of global * locks. */ static int md_global_lock_enter(int global_locks_owned_mask) { /* * The current implementation has been verified by inspection * and test to be deadlock free. If another global lock is * added, changing the algorithm used by this function should * be considered. With more than 2 locks it is difficult to * guarantee that locks are being acquired in the correct order. * The safe approach would be to drop all of the locks that are * owned at function entry and then reacquire all of the locks * in the order defined by the lock hierarchy. 
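 *
 * Note that the mask is expressed in terms of locks already owned, so
 * callers that want just one lock pass the complement; for example,
 * md_ioctl_lock_enter() below does
 *
 *	return (md_global_lock_enter(~MD_GBL_IOCTL_LOCK));
 *
 * which treats every global lock except MD_GBL_IOCTL_LOCK as owned
 * and therefore acquires only the ioctl lock.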
*/ mutex_enter(&md_mx); if (!(global_locks_owned_mask & MD_GBL_IOCTL_LOCK)) { while ((md_mtioctl_cnt != 0) || (md_status & MD_GBL_IOCTL_LOCK)) { if (cv_wait_sig_swap(&md_cv, &md_mx) == 0) { mutex_exit(&md_mx); return (EINTR); } } md_status |= MD_GBL_IOCTL_LOCK; md_ioctl_cnt++; } if (!(global_locks_owned_mask & MD_GBL_HS_LOCK)) { while (md_status & MD_GBL_HS_LOCK) { if (cv_wait_sig_swap(&md_cv, &md_mx) == 0) { md_status &= ~MD_GBL_IOCTL_LOCK; mutex_exit(&md_mx); return (EINTR); } } md_status |= MD_GBL_HS_LOCK; } mutex_exit(&md_mx); return (0); } /* * Release the set of global locks that were grabbed in md_global_lock_enter * that were not already owned by the calling thread. The set of previously * owned global locks is passed in as a mask parameter. */ static int md_global_lock_exit(int global_locks_owned_mask, int code, int flags, mdi_unit_t *ui) { mutex_enter(&md_mx); /* If MT ioctl decrement mt_ioctl_cnt */ if ((flags & MD_MT_IOCTL)) { md_mtioctl_cnt--; } else { if (!(global_locks_owned_mask & MD_GBL_IOCTL_LOCK)) { /* clear the lock and decrement count */ ASSERT(md_ioctl_cnt == 1); md_ioctl_cnt--; md_status &= ~MD_GBL_IOCTL_LOCK; } if (!(global_locks_owned_mask & MD_GBL_HS_LOCK)) md_status &= ~MD_GBL_HS_LOCK; } if (flags & MD_READER_HELD) md_unit_readerexit(ui); if (flags & MD_WRITER_HELD) md_unit_writerexit(ui); if (flags & MD_IO_HELD) md_io_writerexit(ui); if (flags & (MD_ARRAY_WRITER | MD_ARRAY_READER)) { rw_exit(&md_unit_array_rw.lock); } cv_broadcast(&md_cv); mutex_exit(&md_mx); return (code); } /* * The two functions, md_ioctl_lock_enter, and md_ioctl_lock_exit make * use of the md_global_lock_{enter|exit} functions to avoid duplication * of code. They rely upon the fact that the locks that are specified in * the input mask are not acquired or freed. If this algorithm changes * as described in the block comment at the beginning of md_global_lock_enter * then it will be necessary to change these 2 functions. Otherwise these * functions will be grabbing and holding global locks unnecessarily. */ int md_ioctl_lock_enter(void) { /* grab only the ioctl lock */ return (md_global_lock_enter(~MD_GBL_IOCTL_LOCK)); } /* * If md_ioctl_lock_exit is being called at the end of an ioctl before * returning to user space, then ioctl_end is set to 1. * Otherwise, the ioctl lock is being dropped in the middle of handling * an ioctl and will be reacquired before the end of the ioctl. * Do not attempt to process the MN diskset mddb parse flags unless * ioctl_end is true - otherwise a deadlock situation could arise. */ int md_ioctl_lock_exit(int code, int flags, mdi_unit_t *ui, int ioctl_end) { int ret_val; uint_t status; mddb_set_t *s; int i; int err; md_mn_msg_mddb_parse_t *mddb_parse_msg; md_mn_kresult_t *kresult; mddb_lb_t *lbp; int rval = 1; int flag; /* release only the ioctl lock */ ret_val = md_global_lock_exit(~MD_GBL_IOCTL_LOCK, code, flags, ui); /* * If md_ioctl_lock_exit is being called with a possible lock held * (ioctl_end is 0), then don't check the MN disksets since the * call to mddb_setenter may cause a lock ordering deadlock. */ if (!ioctl_end) return (ret_val); /* * Walk through disksets to see if there is a MN diskset that * has messages that need to be sent. Set must be snarfed and * be a MN diskset in order to be checked. * * In a MN diskset, this routine may send messages to the * rpc.mdcommd in order to have the slave nodes re-parse parts * of the mddb. 
Messages can only be sent with no locks held, * so if mddb change occurred while the ioctl lock is held, this * routine must send the messages. */ for (i = 1; i < md_nsets; i++) { status = md_get_setstatus(i); /* Set must be snarfed and be a MN diskset */ if ((status & (MD_SET_SNARFED | MD_SET_MNSET)) != (MD_SET_SNARFED | MD_SET_MNSET)) continue; /* Grab set lock so that set can't change */ if ((s = mddb_setenter(i, MDDB_MUSTEXIST, &err)) == NULL) continue; lbp = s->s_lbp; /* Re-get set status now that lock is held */ status = md_get_setstatus(i); /* * If MN parsing block flag is set - continue to next set. * * If s_mn_parseflags_sending is non-zero, then another thread * is already currently sending a parse message, so just * release the set mutex. If this ioctl had caused an mddb * change that results in a parse message to be generated, * the thread that is currently sending a parse message would * generate the additional parse message. * * If s_mn_parseflags_sending is zero then loop until * s_mn_parseflags is 0 (until there are no more * messages to send). * While s_mn_parseflags is non-zero, * put snapshot of parse_flags in s_mn_parseflags_sending * set s_mn_parseflags to zero * release set mutex * send message * re-grab set mutex * set s_mn_parseflags_sending to zero * * If set is STALE, send message with NO_LOG flag so that * rpc.mdcommd won't attempt to log message to non-writeable * replica. */ mddb_parse_msg = kmem_zalloc(sizeof (md_mn_msg_mddb_parse_t), KM_SLEEP); while (((s->s_mn_parseflags_sending & MDDB_PARSE_MASK) == 0) && (s->s_mn_parseflags & MDDB_PARSE_MASK) && (!(status & MD_SET_MNPARSE_BLK))) { /* Grab snapshot of parse flags */ s->s_mn_parseflags_sending = s->s_mn_parseflags; s->s_mn_parseflags = 0; mutex_exit(&md_set[(s)->s_setno].s_dbmx); /* * Send the message to the slaves to re-parse * the indicated portions of the mddb. Send the status * of the 50 mddbs in this set so that slaves know * which mddbs that the master node thinks are 'good'. * Otherwise, slave may reparse, but from wrong * replica. */ mddb_parse_msg->msg_parse_flags = s->s_mn_parseflags_sending; for (i = 0; i < MDDB_NLB; i++) { mddb_parse_msg->msg_lb_flags[i] = lbp->lb_locators[i].l_flags; } kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); while (rval != 0) { flag = 0; if (status & MD_SET_STALE) flag |= MD_MSGF_NO_LOG; rval = mdmn_ksend_message(s->s_setno, MD_MN_MSG_MDDB_PARSE, flag, 0, (char *)mddb_parse_msg, sizeof (md_mn_msg_mddb_parse_t), kresult); /* if the node hasn't yet joined, it's Ok. */ if ((!MDMN_KSEND_MSG_OK(rval, kresult)) && (kresult->kmmr_comm_state != MDMNE_NOT_JOINED)) { mdmn_ksend_show_error(rval, kresult, "MD_MN_MSG_MDDB_PARSE"); cmn_err(CE_WARN, "md_ioctl_lock_exit: " "Unable to send mddb update " "message to other nodes in " "diskset %s\n", s->s_setname); rval = 1; } } kmem_free(kresult, sizeof (md_mn_kresult_t)); /* * Re-grab mutex to clear sending field and to * see if another parse message needs to be generated. */ mutex_enter(&md_set[(s)->s_setno].s_dbmx); s->s_mn_parseflags_sending = 0; } kmem_free(mddb_parse_msg, sizeof (md_mn_msg_mddb_parse_t)); mutex_exit(&md_set[(s)->s_setno].s_dbmx); } return (ret_val); } /* * Called when in an ioctl and need readerlock. */ void * md_ioctl_readerlock(IOLOCK *lock, mdi_unit_t *ui) { ASSERT(lock != NULL); lock->l_ui = ui; lock->l_flags |= MD_READER_HELD; return (md_unit_readerlock_common(ui, 0)); } /* * Called when in an ioctl and need writerlock. 
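 * A hedged sketch of the usual pattern (release is via
 * md_ioctl_writerexit(), or collectively via md_ioctl_droplocks()):
 *
 *	un = (md_unit_t *)md_ioctl_writerlock(lockp, ui);
 *	... modify un ...
 *	md_ioctl_droplocks(lockp);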
*/ void * md_ioctl_writerlock(IOLOCK *lock, mdi_unit_t *ui) { ASSERT(lock != NULL); lock->l_ui = ui; lock->l_flags |= MD_WRITER_HELD; return (md_unit_writerlock_common(ui, 0)); } void * md_ioctl_io_lock(IOLOCK *lock, mdi_unit_t *ui) { ASSERT(lock != NULL); lock->l_ui = ui; lock->l_flags |= MD_IO_HELD; return (md_io_writerlock(ui)); } void md_ioctl_readerexit(IOLOCK *lock) { ASSERT(lock != NULL); lock->l_flags &= ~MD_READER_HELD; md_unit_readerexit(lock->l_ui); } void md_ioctl_writerexit(IOLOCK *lock) { ASSERT(lock != NULL); lock->l_flags &= ~MD_WRITER_HELD; md_unit_writerexit(lock->l_ui); } void md_ioctl_io_exit(IOLOCK *lock) { ASSERT(lock != NULL); lock->l_flags &= ~MD_IO_HELD; md_io_writerexit(lock->l_ui); } /* * md_ioctl_releaselocks: * -------------------- * Release the unit locks that are held and stop subsequent * md_unit_reader/writerlock calls from progressing. This allows the caller * to send messages across the cluster when running in a multinode * environment. * ioctl originated locks (via md_ioctl_readerlock/md_ioctl_writerlock) are * allowed to progress as normal. This is required as these typically are * invoked by the message handler that may be called while a unit lock is * marked as released. * * On entry: * variety of unit locks may be held including ioctl lock * * On exit: * locks released and unit structure updated to prevent subsequent reader/ * writer locks being acquired until md_ioctl_reacquirelocks is called */ void md_ioctl_releaselocks(int code, int flags, mdi_unit_t *ui) { /* This actually releases the locks. */ (void) md_global_lock_exit(~MD_GBL_IOCTL_LOCK, code, flags, ui); } /* * md_ioctl_reacquirelocks: * ---------------------- * Reacquire the locks that were held when md_ioctl_releaselocks * was called. * * On entry: * No unit locks held * On exit: * locks held that were held at md_ioctl_releaselocks time including * the ioctl lock. */ void md_ioctl_reacquirelocks(int flags, mdi_unit_t *ui) { if (flags & MD_MT_IOCTL) { mutex_enter(&md_mx); md_mtioctl_cnt++; mutex_exit(&md_mx); } else { while (md_ioctl_lock_enter() == EINTR) ; } if (flags & MD_ARRAY_WRITER) { rw_enter(&md_unit_array_rw.lock, RW_WRITER); } else if (flags & MD_ARRAY_READER) { rw_enter(&md_unit_array_rw.lock, RW_READER); } if (ui != (mdi_unit_t *)NULL) { if (flags & MD_IO_HELD) { (void) md_io_writerlock(ui); } mutex_enter(&ui->ui_mx); if (flags & MD_READER_HELD) { (void) md_unit_readerlock_common(ui, 1); } else if (flags & MD_WRITER_HELD) { (void) md_unit_writerlock_common(ui, 1); } /* Wake up any blocked readerlock() calls */ cv_broadcast(&ui->ui_cv); mutex_exit(&ui->ui_mx); } } void md_ioctl_droplocks(IOLOCK *lock) { mdi_unit_t *ui; int flags; ASSERT(lock != NULL); ui = lock->l_ui; flags = lock->l_flags; if (flags & MD_READER_HELD) { lock->l_flags &= ~MD_READER_HELD; md_unit_readerexit(ui); } if (flags & MD_WRITER_HELD) { lock->l_flags &= ~MD_WRITER_HELD; md_unit_writerexit(ui); } if (flags & MD_IO_HELD) { lock->l_flags &= ~MD_IO_HELD; md_io_writerexit(ui); } if (flags & (MD_ARRAY_WRITER | MD_ARRAY_READER)) { lock->l_flags &= ~(MD_ARRAY_WRITER | MD_ARRAY_READER); rw_exit(&md_unit_array_rw.lock); } } void md_array_writer(IOLOCK *lock) { ASSERT(lock != NULL); lock->l_flags |= MD_ARRAY_WRITER; rw_enter(&md_unit_array_rw.lock, RW_WRITER); } void md_array_reader(IOLOCK *lock) { ASSERT(lock != NULL); lock->l_flags |= MD_ARRAY_READER; rw_enter(&md_unit_array_rw.lock, RW_READER); } /* * Called when in an ioctl and need opencloselock. * Sets flags in lockp for READER_HELD. 
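 * The matching release is md_ioctl_openclose_exit(), which drops the
 * readerlock and clears MD_UL_OPENORCLOSE; sketched usage:
 *
 *	un = md_ioctl_openclose_enter(lockp, ui);
 *	... open/close processing ...
 *	md_ioctl_openclose_exit(lockp);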
*/ void * md_ioctl_openclose_enter(IOLOCK *lockp, mdi_unit_t *ui) { void *un; ASSERT(lockp != NULL); mutex_enter(&ui->ui_mx); while (ui->ui_lock & MD_UL_OPENORCLOSE) cv_wait(&ui->ui_cv, &ui->ui_mx); ui->ui_lock |= MD_UL_OPENORCLOSE; /* Maintain mutex across the readerlock call */ lockp->l_ui = ui; lockp->l_flags |= MD_READER_HELD; un = md_unit_readerlock_common(ui, 1); mutex_exit(&ui->ui_mx); return (un); } /* * Clears reader lock using md_ioctl instead of md_unit * and updates lockp. */ void md_ioctl_openclose_exit(IOLOCK *lockp) { mdi_unit_t *ui; ASSERT(lockp != NULL); ui = lockp->l_ui; ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE); md_ioctl_readerexit(lockp); mutex_enter(&ui->ui_mx); ui->ui_lock &= ~MD_UL_OPENORCLOSE; cv_broadcast(&ui->ui_cv); mutex_exit(&ui->ui_mx); } /* * Clears reader lock using md_ioctl instead of md_unit * and updates lockp. * Does not acquire or release the ui_mx lock since the calling * routine has already acquired this lock. */ void md_ioctl_openclose_exit_lh(IOLOCK *lockp) { mdi_unit_t *ui; ASSERT(lockp != NULL); ui = lockp->l_ui; ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE); lockp->l_flags &= ~MD_READER_HELD; md_unit_readerexit_common(lockp->l_ui, 1); ui->ui_lock &= ~MD_UL_OPENORCLOSE; cv_broadcast(&ui->ui_cv); } void * md_unit_openclose_enter(mdi_unit_t *ui) { void *un; mutex_enter(&ui->ui_mx); while (ui->ui_lock & (MD_UL_OPENORCLOSE)) cv_wait(&ui->ui_cv, &ui->ui_mx); ui->ui_lock |= MD_UL_OPENORCLOSE; /* Maintain mutex across the readerlock call */ un = md_unit_readerlock_common(ui, 1); mutex_exit(&ui->ui_mx); return (un); } void md_unit_openclose_exit(mdi_unit_t *ui) { md_unit_readerexit(ui); mutex_enter(&ui->ui_mx); ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE); ui->ui_lock &= ~MD_UL_OPENORCLOSE; cv_broadcast(&ui->ui_cv); mutex_exit(&ui->ui_mx); } /* * Drop the openclose and readerlocks without acquiring or * releasing the ui_mx lock since the calling routine has * already acquired this lock. */ void md_unit_openclose_exit_lh(mdi_unit_t *ui) { md_unit_readerexit_common(ui, 1); ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE); ui->ui_lock &= ~MD_UL_OPENORCLOSE; cv_broadcast(&ui->ui_cv); } int md_unit_isopen( mdi_unit_t *ui ) { int isopen; /* check status */ mutex_enter(&ui->ui_mx); isopen = ((ui->ui_lock & MD_UL_OPEN) ? 1 : 0); mutex_exit(&ui->ui_mx); return (isopen); } int md_unit_incopen( minor_t mnum, int flag, int otyp ) { mdi_unit_t *ui = MDI_UNIT(mnum); int err = 0; /* check type and flags */ ASSERT(ui != NULL); mutex_enter(&ui->ui_mx); if ((otyp < 0) || (otyp >= OTYPCNT)) { err = EINVAL; goto out; } if (((flag & FEXCL) && (ui->ui_lock & MD_UL_OPEN)) || (ui->ui_lock & MD_UL_EXCL)) { err = EBUSY; goto out; } /* count and flag open */ ui->ui_ocnt[otyp]++; ui->ui_lock |= MD_UL_OPEN; if (flag & FEXCL) ui->ui_lock |= MD_UL_EXCL; /* setup kstat, return success */ mutex_exit(&ui->ui_mx); md_kstat_init(mnum); return (0); /* return error */ out: mutex_exit(&ui->ui_mx); return (err); } int md_unit_decopen( minor_t mnum, int otyp ) { mdi_unit_t *ui = MDI_UNIT(mnum); int err = 0; unsigned i; /* check type and flags */ ASSERT(ui != NULL); mutex_enter(&ui->ui_mx); if ((otyp < 0) || (otyp >= OTYPCNT)) { err = EINVAL; goto out; } else if (ui->ui_ocnt[otyp] == 0) { err = ENXIO; goto out; } /* count and flag closed */ if (otyp == OTYP_LYR) ui->ui_ocnt[otyp]--; else ui->ui_ocnt[otyp] = 0; ui->ui_lock &= ~MD_UL_OPEN; for (i = 0; (i < OTYPCNT); ++i) if (ui->ui_ocnt[i] != 0) ui->ui_lock |= MD_UL_OPEN; if (! 
(ui->ui_lock & MD_UL_OPEN)) ui->ui_lock &= ~MD_UL_EXCL; /* teardown kstat, return success */ if (! (ui->ui_lock & MD_UL_OPEN)) { /* * We have a race condition inherited from specfs between * open() and close() calls. This results in the kstat * for a pending I/O being torn down, and then a panic. * To avoid this, only tear the kstat down if there are * no other readers on this device. */ if (ui->ui_readercnt > 1) { mutex_exit(&ui->ui_mx); } else { mutex_exit(&ui->ui_mx); md_kstat_destroy(mnum); } return (0); } /* return success */ out: mutex_exit(&ui->ui_mx); return (err); } md_dev64_t md_xlate_targ_2_mini(md_dev64_t targ_devt) { dev32_t mini_32_devt, targ_32_devt; int i; /* * check to see if we're in an upgrade situation * if we are not in upgrade just return the input device */ if (!MD_UPGRADE) return (targ_devt); targ_32_devt = md_cmpldev(targ_devt); i = 0; while (i != md_tuple_length) { if (md_tuple_table[i].targ_devt == targ_32_devt) { mini_32_devt = md_tuple_table[i].mini_devt; return (md_expldev((md_dev64_t)mini_32_devt)); } i++; } return (NODEV64); } md_dev64_t md_xlate_mini_2_targ(md_dev64_t mini_devt) { dev32_t mini_32_devt, targ_32_devt; int i; if (!MD_UPGRADE) return (mini_devt); mini_32_devt = md_cmpldev(mini_devt); i = 0; while (i != md_tuple_length) { if (md_tuple_table[i].mini_devt == mini_32_devt) { targ_32_devt = md_tuple_table[i].targ_devt; return (md_expldev((md_dev64_t)targ_32_devt)); } i++; } return (NODEV64); } void md_xlate_free(int size) { kmem_free(md_tuple_table, size); } char * md_targ_major_to_name(major_t maj) { char *drv_name = NULL; int i; if (!MD_UPGRADE) return (ddi_major_to_name(maj)); for (i = 0; i < md_majortab_len; i++) { if (md_major_tuple_table[i].targ_maj == maj) { drv_name = md_major_tuple_table[i].drv_name; break; } } return (drv_name); } major_t md_targ_name_to_major(char *drv_name) { major_t maj; int i; maj = md_getmajor(NODEV64); if (!MD_UPGRADE) return (ddi_name_to_major(drv_name)); for (i = 0; i < md_majortab_len; i++) { if ((strcmp(md_major_tuple_table[i].drv_name, drv_name)) == 0) { maj = md_major_tuple_table[i].targ_maj; break; } } return (maj); } void md_majortab_free() { size_t sz; int i; for (i = 0; i < md_majortab_len; i++) { freestr(md_major_tuple_table[i].drv_name); } sz = md_majortab_len * sizeof (struct md_xlate_major_table); kmem_free(md_major_tuple_table, sz); } /* functions return a pointer to a function which returns an int */ intptr_t (* md_get_named_service(md_dev64_t dev, int modindex, char *name, intptr_t (*Default)()))() { mdi_unit_t *ui; md_named_services_t *sp; int i; /* * Return the first named service found. * Use this path when it is known that there is only * one named service possible (e.g., hotspare interface) */ if ((dev == NODEV64) && (modindex == ANY_SERVICE)) { for (i = 0; i < MD_NOPS; i++) { if (md_ops[i] == NULL) { continue; } sp = md_ops[i]->md_services; if (sp == NULL) continue; while (sp->md_service != NULL) { if (strcmp(name, sp->md_name) == 0) return (sp->md_service); sp++; } } return (Default); } /* * Return the named service for the given modindex. 
* This is used if there are multiple possible named services * and each one needs to be called (e.g., poke hotspares) */ if (dev == NODEV64) { if (modindex >= MD_NOPS) return (Default); if (md_ops[modindex] == NULL) return (Default); sp = md_ops[modindex]->md_services; if (sp == NULL) return (Default); while (sp->md_service != NULL) { if (strcmp(name, sp->md_name) == 0) return (sp->md_service); sp++; } return (Default); } /* * Return the named service for this md_dev64_t */ if (md_getmajor(dev) != md_major) return (Default); if ((MD_MIN2SET(md_getminor(dev)) >= md_nsets) || (MD_MIN2UNIT(md_getminor(dev)) >= md_nunits)) return (NULL); if ((ui = MDI_UNIT(md_getminor(dev))) == NULL) return (NULL); sp = md_ops[ui->ui_opsindex]->md_services; if (sp == NULL) return (Default); while (sp->md_service != NULL) { if (strcmp(name, sp->md_name) == 0) return (sp->md_service); sp++; } return (Default); } /* * md_daemon callback routine */ boolean_t callb_md_cpr(void *arg, int code) { callb_cpr_t *cp = (callb_cpr_t *)arg; int ret = 0; /* assume success */ clock_t delta; mutex_enter(cp->cc_lockp); switch (code) { case CB_CODE_CPR_CHKPT: /* * Check for active resync threads */ mutex_enter(&md_cpr_resync.md_resync_mutex); if ((md_cpr_resync.md_mirror_resync > 0) || (md_cpr_resync.md_raid_resync > 0)) { mutex_exit(&md_cpr_resync.md_resync_mutex); cmn_err(CE_WARN, "There are Solaris Volume Manager " "synchronization threads running."); cmn_err(CE_WARN, "Please try system suspension at " "a later time."); ret = -1; break; } mutex_exit(&md_cpr_resync.md_resync_mutex); cp->cc_events |= CALLB_CPR_START; delta = CPR_KTHREAD_TIMEOUT_SEC * hz; while (!(cp->cc_events & CALLB_CPR_SAFE)) /* cv_reltimedwait() returns -1 if it times out. */ if ((ret = cv_reltimedwait(&cp->cc_callb_cv, cp->cc_lockp, delta, TR_CLOCK_TICK)) == -1) break; break; case CB_CODE_CPR_RESUME: cp->cc_events &= ~CALLB_CPR_START; cv_signal(&cp->cc_stop_cv); break; } mutex_exit(cp->cc_lockp); return (ret != -1); } void md_daemon(int pass_thru, mdq_anchor_t *anchor) { daemon_queue_t *dq; callb_cpr_t cprinfo; if (pass_thru && (md_get_status() & MD_GBL_DAEMONS_LIVE)) return; /* * Register cpr callback */ CALLB_CPR_INIT(&cprinfo, &anchor->a_mx, callb_md_cpr, "md_daemon"); /*CONSTCOND*/ while (1) { mutex_enter(&anchor->a_mx); while ((dq = anchor->dq.dq_next) == &(anchor->dq)) { if (pass_thru) { /* * CALLB_CPR_EXIT Will do * mutex_exit(&anchor->a_mx) */ CALLB_CPR_EXIT(&cprinfo); return; } if (md_get_status() & MD_GBL_DAEMONS_DIE) { mutex_exit(&anchor->a_mx); mutex_enter(&md_mx); md_num_daemons--; mutex_exit(&md_mx); /* * CALLB_CPR_EXIT will do * mutex_exit(&anchor->a_mx) */ mutex_enter(&anchor->a_mx); CALLB_CPR_EXIT(&cprinfo); thread_exit(); } CALLB_CPR_SAFE_BEGIN(&cprinfo); cv_wait(&anchor->a_cv, &anchor->a_mx); CALLB_CPR_SAFE_END(&cprinfo, &anchor->a_mx); } dq->dq_prev->dq_next = dq->dq_next; dq->dq_next->dq_prev = dq->dq_prev; dq->dq_prev = dq->dq_next = NULL; anchor->dq.qlen--; mutex_exit(&anchor->a_mx); (*(dq->dq_call))(dq); } /*NOTREACHED*/ } /* * daemon_request: * * Adds requests to appropriate requestq which is * anchored by *anchor. * The request is the first element of a doubly linked circular list. * When the request is a single element, the forward and backward * pointers MUST point to the element itself. 
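 *
 * E.g., queueing a single old-style request (a sketch; done_handler is
 * hypothetical and dqp is a hypothetical daemon_queue_t embedded in the
 * request structure, with both links NULL as REQ_OLD demands):
 *
 *	daemon_request(&md_done_daemon, done_handler, dqp, REQ_OLD);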
*/ void daemon_request(mdq_anchor_t *anchor, void (*func)(), daemon_queue_t *request, callstyle_t style) { daemon_queue_t *rqtp; int i = 0; rqtp = request; if (style == REQ_OLD) { ASSERT((rqtp->dq_next == NULL) && (rqtp->dq_prev == NULL)); /* set it to the new style */ rqtp->dq_prev = rqtp->dq_next = rqtp; } ASSERT((rqtp->dq_next != NULL) && (rqtp->dq_prev != NULL)); /* scan the list and add the function to each element */ do { rqtp->dq_call = func; i++; rqtp = rqtp->dq_next; } while (rqtp != request); /* save pointer to tail of the request list */ rqtp = request->dq_prev; mutex_enter(&anchor->a_mx); /* stats */ anchor->dq.qlen += i; anchor->dq.treqs += i; anchor->dq.maxq_len = (anchor->dq.qlen > anchor->dq.maxq_len) ? anchor->dq.qlen : anchor->dq.maxq_len; /* now add the list to request queue */ request->dq_prev = anchor->dq.dq_prev; rqtp->dq_next = &anchor->dq; anchor->dq.dq_prev->dq_next = request; anchor->dq.dq_prev = rqtp; cv_broadcast(&anchor->a_cv); mutex_exit(&anchor->a_mx); } void mddb_commitrec_wrapper(mddb_recid_t recid) { int sent_log = 0; uint_t retry = md_retry_cnt; set_t setno; while (mddb_commitrec(recid)) { if (! sent_log) { cmn_err(CE_WARN, "md: state database commit failed"); sent_log = 1; } delay(md_hz); /* * Setting retry cnt to one (pre decremented) so that we * actually do no retries when committing/deleting a mddb rec. * The underlying disk driver does several retries to check * if the disk is really dead or not so there * is no reason for us to retry on top of the drivers retries. */ if (--retry == 0) { setno = mddb_getsetnum(recid); if (md_get_setstatus(setno) & MD_SET_TOOFEW) { panic( "md: Panic due to lack of DiskSuite state\n" " database replicas. Fewer than 50%% of " "the total were available,\n so panic to " "ensure data integrity."); } else { panic("md: state database problem"); } /*NOTREACHED*/ } } } void mddb_commitrecs_wrapper(mddb_recid_t *recids) { int sent_log = 0; uint_t retry = md_retry_cnt; set_t setno; while (mddb_commitrecs(recids)) { if (! sent_log) { cmn_err(CE_WARN, "md: state database commit failed"); sent_log = 1; } delay(md_hz); /* * Setting retry cnt to one (pre decremented) so that we * actually do no retries when committing/deleting a mddb rec. * The underlying disk driver does several retries to check * if the disk is really dead or not so there * is no reason for us to retry on top of the drivers retries. */ if (--retry == 0) { /* * since all the records are part of the same set * use the first one to get setno */ setno = mddb_getsetnum(*recids); if (md_get_setstatus(setno) & MD_SET_TOOFEW) { panic( "md: Panic due to lack of DiskSuite state\n" " database replicas. Fewer than 50%% of " "the total were available,\n so panic to " "ensure data integrity."); } else { panic("md: state database problem"); } /*NOTREACHED*/ } } } void mddb_deleterec_wrapper(mddb_recid_t recid) { int sent_log = 0; uint_t retry = md_retry_cnt; set_t setno; while (mddb_deleterec(recid)) { if (! sent_log) { cmn_err(CE_WARN, "md: state database delete failed"); sent_log = 1; } delay(md_hz); /* * Setting retry cnt to one (pre decremented) so that we * actually do no retries when committing/deleting a mddb rec. * The underlying disk driver does several retries to check * if the disk is really dead or not so there * is no reason for us to retry on top of the drivers retries. */ if (--retry == 0) { setno = mddb_getsetnum(recid); if (md_get_setstatus(setno) & MD_SET_TOOFEW) { panic( "md: Panic due to lack of DiskSuite state\n" " database replicas. 
Fewer than 50%% of " "the total were available,\n so panic to " "ensure data integrity."); } else { panic("md: state database problem"); } /*NOTREACHED*/ } } } /* * md_holdset_enter is called in order to hold the set in its * current state (loaded, unloaded, snarfed, unsnarfed, etc) * until md_holdset_exit is called. This is used by the mirror * code to mark the set as HOLD so that the set won't be * unloaded while hotspares are being allocated in check_4_hotspares. * The original fix to the mirror code to hold the set was to call * md_haltsnarf_enter, but this will block all ioctls and ioctls * must work for a MN diskset while hotspares are allocated. */ void md_holdset_enter(set_t setno) { mutex_enter(&md_mx); while (md_set[setno].s_status & MD_SET_HOLD) cv_wait(&md_cv, &md_mx); md_set[setno].s_status |= MD_SET_HOLD; mutex_exit(&md_mx); } void md_holdset_exit(set_t setno) { mutex_enter(&md_mx); md_set[setno].s_status &= ~MD_SET_HOLD; cv_broadcast(&md_cv); mutex_exit(&md_mx); } /* * Returns a 0 if this thread marked the set as HOLD (success), * returns a -1 if set was already marked HOLD (failure). * Used by the release_set code to see if set is marked HOLD. * HOLD is set by a daemon when hotspares are being allocated * to mirror units. */ int md_holdset_testandenter(set_t setno) { mutex_enter(&md_mx); if (md_set[setno].s_status & MD_SET_HOLD) { mutex_exit(&md_mx); return (-1); } md_set[setno].s_status |= MD_SET_HOLD; mutex_exit(&md_mx); return (0); } void md_haltsnarf_enter(set_t setno) { mutex_enter(&md_mx); while (md_set[setno].s_status & MD_SET_SNARFING) cv_wait(&md_cv, &md_mx); md_set[setno].s_status |= MD_SET_SNARFING; mutex_exit(&md_mx); } void md_haltsnarf_exit(set_t setno) { mutex_enter(&md_mx); md_set[setno].s_status &= ~MD_SET_SNARFING; cv_broadcast(&md_cv); mutex_exit(&md_mx); } void md_haltsnarf_wait(set_t setno) { mutex_enter(&md_mx); while (md_set[setno].s_status & MD_SET_SNARFING) cv_wait(&md_cv, &md_mx); mutex_exit(&md_mx); } /* * ASSUMED that the md_unit_array_rw WRITER lock is held. 
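 * A caller sketch consistent with that assumption (md_halt() below
 * follows this pattern, releasing the lock via md_global_lock_exit()):
 *
 *	rw_enter(&md_unit_array_rw.lock, RW_WRITER);
 *	err = md_halt_set(setno, MD_HALT_CHECK);
 *	rw_exit(&md_unit_array_rw.lock);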
*/ int md_halt_set(set_t setno, enum md_haltcmd cmd) { int i, err; if (md_set[setno].s_un == NULL || md_set[setno].s_ui == NULL) { return (0); } if ((cmd == MD_HALT_CHECK) || (cmd == MD_HALT_ALL)) { for (i = 0; i < MD_NOPS; i++) { if (md_ops[i] == NULL) continue; if ((*(md_ops[i]->md_halt))(MD_HALT_CLOSE, setno)) { for (--i; i > 0; --i) { if (md_ops[i] == NULL) continue; (void) (*(md_ops[i]->md_halt)) (MD_HALT_OPEN, setno); } return (EBUSY); } } for (i = 0; i < MD_NOPS; i++) { if (md_ops[i] == NULL) continue; if ((*(md_ops[i]->md_halt))(MD_HALT_CHECK, setno)) { for (i = 0; i < MD_NOPS; i++) { if (md_ops[i] == NULL) continue; (void) (*(md_ops[i]->md_halt)) (MD_HALT_OPEN, setno); } return (EBUSY); } } } if ((cmd == MD_HALT_DOIT) || (cmd == MD_HALT_ALL)) { for (i = 0; i < MD_NOPS; i++) { if (md_ops[i] == NULL) continue; err = (*(md_ops[i]->md_halt))(MD_HALT_DOIT, setno); if (err != 0) cmn_err(CE_NOTE, "md: halt failed for %s, error %d", md_ops[i]->md_driver.md_drivername, err); } /* * Unload the devid namespace if it is loaded */ md_unload_namespace(setno, NM_DEVID); md_unload_namespace(setno, 0L); md_clr_setstatus(setno, MD_SET_SNARFED); } return (0); } int md_halt(int global_locks_owned_mask) { set_t i, j; int err; int init_queues; md_requestq_entry_t *rqp; md_ops_t **pops, *ops, *lops; ddi_modhandle_t mod; char *name; rw_enter(&md_unit_array_rw.lock, RW_WRITER); /* * Grab the all of the global locks that are not * already owned to ensure that there isn't another * thread trying to access a global resource * while the halt is in progress */ if (md_global_lock_enter(global_locks_owned_mask) == EINTR) return (EINTR); for (i = 0; i < md_nsets; i++) md_haltsnarf_enter(i); /* * Kill the daemon threads. */ init_queues = ((md_get_status() & MD_GBL_DAEMONS_LIVE) ? FALSE : TRUE); md_clr_status(MD_GBL_DAEMONS_LIVE); md_set_status(MD_GBL_DAEMONS_DIE); rqp = &md_daemon_queues[0]; i = 0; while (!NULL_REQUESTQ_ENTRY(rqp)) { cv_broadcast(&rqp->dispq_headp->a_cv); rqp = &md_daemon_queues[++i]; } mutex_enter(&md_mx); while (md_num_daemons != 0) { mutex_exit(&md_mx); delay(md_hz); mutex_enter(&md_mx); } mutex_exit(&md_mx); md_clr_status(MD_GBL_DAEMONS_DIE); for (i = 0; i < md_nsets; i++) /* * Only call into md_halt_set if s_un / s_ui are both set. * If they are NULL this set hasn't been accessed, so its * pointless performing the call. */ if (md_set[i].s_un != NULL && md_set[i].s_ui != NULL) { if (md_halt_set(i, MD_HALT_CHECK)) { if (md_start_daemons(init_queues)) cmn_err(CE_WARN, "md: restart of daemon threads " "failed"); for (j = 0; j < md_nsets; j++) md_haltsnarf_exit(j); return (md_global_lock_exit( global_locks_owned_mask, EBUSY, MD_ARRAY_WRITER, NULL)); } } /* * if we get here we are going to do it */ for (i = 0; i < md_nsets; i++) { /* * Only call into md_halt_set if s_un / s_ui are both set. * If they are NULL this set hasn't been accessed, so its * pointless performing the call. */ if (md_set[i].s_un != NULL && md_set[i].s_ui != NULL) { err = md_halt_set(i, MD_HALT_DOIT); if (err != 0) cmn_err(CE_NOTE, "md: halt failed set %u, error %d", (unsigned)i, err); } } /* * issue a halt unload to each module to indicate that it * is about to be unloaded. Each module is called once, set * has no meaning at this point in time. 
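 * (The set number passed below is 0 purely as a placeholder; each
 * module's md_halt entry point is assumed to dispatch on the command,
 * e.g. MD_HALT_UNLOAD versus MD_HALT_DOIT.)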
*/ for (i = 0; i < MD_NOPS; i++) { if (md_ops[i] == NULL) continue; err = (*(md_ops[i]->md_halt))(MD_HALT_UNLOAD, 0); if (err != 0) cmn_err(CE_NOTE, "md: halt failed for %s, error %d", md_ops[i]->md_driver.md_drivername, err); } /* ddi_modclose the submodules */ for (i = 0; i < MD_NOPS; i++) { /* skip if not open */ if ((md_ops[i] == NULL) || (md_mods[i] == NULL)) continue; /* find and unlink from md_opslist */ ops = md_ops[i]; mod = md_mods[i]; pops = &md_opslist; for (lops = *pops; lops; pops = &lops->md_next, lops = *pops) { if (lops == ops) { *pops = ops->md_next; ops->md_next = NULL; break; } } /* uninitialize */ name = ops->md_driver.md_drivername; md_ops[i] = NULL; md_mods[i] = NULL; ops->md_selfindex = 0; ops->md_driver.md_drivername[0] = '\0'; rw_destroy(&ops->md_link_rw.lock); /* close */ err = ddi_modclose(mod); if (err != 0) cmn_err(CE_NOTE, "md: halt close failed for %s, error %d", name ? name : "UNKNOWN", err); } /* Unload the database */ mddb_unload(); md_set_status(MD_GBL_HALTED); /* we are ready to be unloaded */ for (i = 0; i < md_nsets; i++) md_haltsnarf_exit(i); return (md_global_lock_exit(global_locks_owned_mask, 0, MD_ARRAY_WRITER, NULL)); } /* * md_layered_open() is an internal routine only for SVM modules. * So the input device will be a md_dev64_t, because all SVM modules internally * work with that device type. * ddi routines on the other hand work with dev_t. So, if we call any ddi * routines from here we first have to convert that device into a dev_t. */ int md_layered_open( minor_t mnum, md_dev64_t *dev, int md_oflags ) { int flag = (FREAD | FWRITE); cred_t *cred_p = kcred; major_t major; int err; dev_t ddi_dev = md_dev64_to_dev(*dev); if (ddi_dev == NODEV) return (ENODEV); major = getmajor(ddi_dev); /* metadevice */ if (major == md_major) { mdi_unit_t *ui; /* open underlying driver */ mnum = getminor(ddi_dev); ui = MDI_UNIT(mnum); if (md_ops[ui->ui_opsindex]->md_open != NULL) { int ret = (*md_ops[ui->ui_opsindex]->md_open)(&ddi_dev, flag, OTYP_LYR, cred_p, md_oflags); /* * As open() may change the device, * send this info back to the caller. */ *dev = md_expldev(ddi_dev); return (ret); } /* or do it ourselves */ (void) md_unit_openclose_enter(ui); err = md_unit_incopen(mnum, flag, OTYP_LYR); md_unit_openclose_exit(ui); /* convert our ddi_dev back to the dev we were given */ *dev = md_expldev(ddi_dev); return (err); } /* * Open regular device, since open() may change dev_t give new dev_t * back to the caller. */ err = dev_lopen(&ddi_dev, flag, OTYP_LYR, cred_p); *dev = md_expldev(ddi_dev); return (err); } /* * md_layered_close() is an internal routine only for SVM modules. * So the input device will be a md_dev64_t, because all SVM modules internally * work with that device type. * ddi routines on the other hand work with dev_t. So, if we call any ddi * routines from here we first have to convert that device into a dev_t. 
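 *
 * A hedged usage sketch pairing this with md_layered_open(), which may
 * update the md_dev64_t it is given (0 here means no special oflags):
 *
 *	md_dev64_t dev = ...;
 *	if (md_layered_open(mnum, &dev, 0) == 0) {
 *		... I/O through dev ...
 *		md_layered_close(dev, 0);
 *	}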
*/ void md_layered_close( md_dev64_t dev, int md_cflags ) { int flag = (FREAD | FWRITE); cred_t *cred_p = kcred; dev_t ddi_dev = md_dev64_to_dev(dev); major_t major = getmajor(ddi_dev); minor_t mnum = getminor(ddi_dev); /* metadevice */ if (major == md_major) { mdi_unit_t *ui = MDI_UNIT(mnum); /* close underlying driver */ if (md_ops[ui->ui_opsindex]->md_close != NULL) { (*md_ops[ui->ui_opsindex]->md_close) (ddi_dev, flag, OTYP_LYR, cred_p, md_cflags); return; } /* or do it ourselves */ (void) md_unit_openclose_enter(ui); (void) md_unit_decopen(mnum, OTYP_LYR); md_unit_openclose_exit(ui); return; } /* close regular device */ (void) dev_lclose(ddi_dev, flag, OTYP_LYR, cred_p); } /* * saves a little code in mdstrategy */ int errdone(mdi_unit_t *ui, struct buf *bp, int err) { if ((bp->b_error = err) != 0) bp->b_flags |= B_ERROR; else bp->b_resid = bp->b_bcount; md_unit_readerexit(ui); md_biodone(bp); return (1); } static int md_write_label = 0; int md_checkbuf(mdi_unit_t *ui, md_unit_t *un, buf_t *bp) { diskaddr_t endblk; set_t setno = MD_UN2SET(un); if ((md_get_setstatus(setno) & MD_SET_STALE) && (! (bp->b_flags & B_READ))) return (errdone(ui, bp, EROFS)); /* * Check early for unreasonable block number. * * b_blkno is defined as adaddr_t which is typedef'd to a long. * A problem occurs if b_blkno has bit 31 set and un_total_blocks * doesn't, b_blkno is then compared as a negative number which is * always less than a positive. */ if ((u_longlong_t)bp->b_lblkno > (u_longlong_t)un->c.un_total_blocks) return (errdone(ui, bp, EINVAL)); if (bp->b_lblkno == un->c.un_total_blocks) return (errdone(ui, bp, 0)); /* * make sure we don't clobber any labels */ if ((bp->b_lblkno == 0) && (! (bp->b_flags & B_READ)) && (un->c.un_flag & MD_LABELED) && (! md_write_label)) { cmn_err(CE_NOTE, "md: %s: write to label", md_shortname(getminor(bp->b_edev))); return (errdone(ui, bp, EINVAL)); } bp->b_resid = 0; endblk = (diskaddr_t)(bp->b_lblkno + howmany(bp->b_bcount, DEV_BSIZE) - 1); if (endblk > (un->c.un_total_blocks - 1)) { bp->b_resid = dbtob(endblk - (un->c.un_total_blocks - 1)); endblk = un->c.un_total_blocks - 1; bp->b_bcount -= bp->b_resid; } return (0); } /* * init_request_queue: initializes the request queues and creates the threads. * return value = 0 :invalid num_threads * = n : n is the number of threads created. */ int init_requestq( md_requestq_entry_t *rq, /* request queue info */ void (*threadfn)(), /* function to start the thread */ caddr_t threadfn_args, /* args to the function */ int pri, /* thread priority */ int init_queue) /* flag to init queues */ { struct mdq_anchor *rqhead; int i; int num_threads; num_threads = *(rq->num_threadsp); rqhead = rq->dispq_headp; if (NULL_REQUESTQ_ENTRY(rq) || num_threads == 0) return (0); if (init_queue) { rqhead->dq.maxq_len = 0; rqhead->dq.treqs = 0; rqhead->dq.dq_next = &rqhead->dq; rqhead->dq.dq_prev = &rqhead->dq; cv_init(&rqhead->a_cv, NULL, CV_DEFAULT, NULL); mutex_init(&rqhead->a_mx, NULL, MUTEX_DEFAULT, NULL); } for (i = 0; i < num_threads; i++) { (void) thread_create(NULL, 0, threadfn, threadfn_args, 0, &p0, TS_RUN, pri); } return (i); } static void start_daemon(struct mdq_anchor *q) { md_daemon(0, q); ASSERT(0); } /* * Creates all the md daemons. * Global: * md_num_daemons is set to number of daemons. * MD_GBL_DAEMONS_LIVE flag set to indicate the daemons are active. 
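 *
 * Callers typically just check the return value, as md_halt() does
 * when restarting the daemons (sketch):
 *
 *	if (md_start_daemons(init_queues))
 *		cmn_err(CE_WARN, "md: restart of daemon threads failed");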
* * Return value: 0 success * 1 failure */ int md_start_daemons(int init_queue) { md_requestq_entry_t *rqp; int cnt; int i; int retval = 0; if (md_get_status() & MD_GBL_DAEMONS_LIVE) { return (retval); } md_clr_status(MD_GBL_DAEMONS_DIE); rqp = &md_daemon_queues[0]; i = 0; while (!NULL_REQUESTQ_ENTRY(rqp)) { cnt = init_requestq(rqp, start_daemon, (caddr_t)rqp->dispq_headp, minclsyspri, init_queue); if (cnt && cnt != *rqp->num_threadsp) { retval = 1; break; } /* * initialize variables */ md_num_daemons += cnt; rqp = &md_daemon_queues[++i]; } md_set_status(MD_GBL_DAEMONS_LIVE); return (retval); } int md_loadsubmod(set_t setno, char *name, int drvrid) { ddi_modhandle_t mod; md_ops_t **pops, *ops; int i, err; /* * See if the submodule is mdopened. If not, i is the index of the * next empty slot. */ for (i = 0; md_ops[i] != NULL; i++) { if (strncmp(name, md_ops[i]->md_driver.md_drivername, MD_DRIVERNAMELEN) == 0) return (i); if (i == (MD_NOPS - 1)) return (-1); } if (drvrid < 0) { /* Do not try to add any records to the DB when stale. */ if (md_get_setstatus(setno) & MD_SET_STALE) return (-1); drvrid = md_setshared_name(setno, name, 0L); } if (drvrid < 0) return (-1); /* open and import the md_ops of the submodules */ mod = ddi_modopen(name, KRTLD_MODE_FIRST, &err); if (mod == NULL) { cmn_err(CE_WARN, "md_loadsubmod: " "unable to ddi_modopen %s, error %d\n", name, err); return (-1); } pops = ddi_modsym(mod, "md_interface_ops", &err); if (pops == NULL) { cmn_err(CE_WARN, "md_loadsubmod: " "unable to import md_interface_ops from %s, error %d\n", name, err); (void) ddi_modclose(mod); return (-1); } /* ddi_modsym returns pointer to md_interface_ops in submod */ ops = *pops; /* initialize */ ops->md_selfindex = i; rw_init(&ops->md_link_rw.lock, NULL, RW_DEFAULT, NULL); (void) strncpy(ops->md_driver.md_drivername, name, MD_DRIVERNAMELEN); /* plumb */ md_ops[i] = ops; md_mods[i] = mod; ops->md_next = md_opslist; md_opslist = ops; /* return index */ return (i); } int md_getmodindex(md_driver_t *driver, int dont_load, int db_notrequired) { int i; int modindex; char *name = driver->md_drivername; set_t setno = driver->md_setno; int drvid; int local_dont_load; if (setno >= md_nsets) return (-1); for (i = 0; name[i] != 0; i++) if (i == (MD_DRIVERNAMELEN -1)) return (-1); /* * If set is STALE, set local_dont_load to 1 since no records * should be added to DB when stale. */ if (md_get_setstatus(setno) & MD_SET_STALE) { local_dont_load = 1; } else { local_dont_load = dont_load; } /* * Single thread ioctl module binding with respect to * similar code executed in md_loadsubmod that is called * from md_snarf_db_set (which is where that path does * its md_haltsnarf_enter call). */ md_haltsnarf_enter(setno); /* See if the submodule is already ddi_modopened. */ for (i = 0; md_ops[i] != NULL; i++) { if (strncmp(name, md_ops[i]->md_driver.md_drivername, MD_DRIVERNAMELEN) == 0) { if (! local_dont_load && (md_getshared_key(setno, name) == MD_KEYBAD)) { if (md_setshared_name(setno, name, 0L) == MD_KEYBAD) { if (!db_notrequired) goto err; } } md_haltsnarf_exit(setno); return (i); } if (i == (MD_NOPS -1)) break; } if (local_dont_load) goto err; drvid = ((db_notrequired) ? 
0 : (int)md_getshared_key(setno, name)); /* ddi_modopen the submodule */ modindex = md_loadsubmod(setno, name, drvid); if (modindex < 0) goto err; if (md_ops[modindex]->md_snarf != NULL) (*(md_ops[modindex]->md_snarf))(MD_SNARF_DOIT, setno); md_haltsnarf_exit(setno); return (modindex); err: md_haltsnarf_exit(setno); return (-1); } void md_call_strategy(buf_t *bp, int flags, void *private) { mdi_unit_t *ui; if (mdv_strategy_tstpnt) if ((*mdv_strategy_tstpnt)(bp, flags, private) != 0) return; if (getmajor(bp->b_edev) != md_major) { (void) bdev_strategy(bp); return; } flags = (flags & MD_STR_PASSEDON) | MD_STR_NOTTOP; ui = MDI_UNIT(getminor(bp->b_edev)); ASSERT(ui != NULL); (*md_ops[ui->ui_opsindex]->md_strategy)(bp, flags, private); } /* * md_call_ioctl: * ------------- * Issue the specified ioctl to the device associated with the given md_dev64_t * * Arguments: * dev - underlying device [md_dev64_t] * cmd - ioctl to perform * data - arguments / result location * mode - read/write/layered ioctl * lockp - lock reference * * Returns: * 0 success * !=0 Failure (error code) */ int md_call_ioctl(md_dev64_t dev, int cmd, void *data, int mode, IOLOCK *lockp) { dev_t device = md_dev64_to_dev(dev); int rval; mdi_unit_t *ui; /* * See if device is a metadevice. If not call cdev_ioctl(), otherwise * call the ioctl entry-point in the metadevice. */ if (md_getmajor(dev) != md_major) { int rv; rval = cdev_ioctl(device, cmd, (intptr_t)data, mode, ddi_get_cred(), &rv); } else { ui = MDI_UNIT(md_getminor(dev)); ASSERT(ui != NULL); rval = (*md_ops[ui->ui_opsindex]->md_ioctl)(device, cmd, data, mode, lockp); } return (rval); } void md_rem_link(set_t setno, int id, krwlock_t *rw, md_link_t **head) { md_link_t *next; md_link_t **pprev; rw_enter(rw, RW_WRITER); next = *head; pprev = head; while (next) { if ((next->ln_setno == setno) && (next->ln_id == id)) { *pprev = next->ln_next; rw_exit(rw); return; } pprev = &next->ln_next; next = next->ln_next; } rw_exit(rw); } int md_dev_exists(md_dev64_t dev) { if (dev == NODEV64) return (0); if (strcmp(ddi_major_to_name(md_getmajor(dev)), "md") != 0) return (1); if ((MD_MIN2SET(md_getminor(dev)) >= md_nsets) || (MD_MIN2UNIT(md_getminor(dev)) >= md_nunits)) return (0); if (MDI_UNIT(md_getminor(dev)) != NULL) return (1); return (0); } md_parent_t md_get_parent(md_dev64_t dev) { md_unit_t *un; mdi_unit_t *ui; md_parent_t parent; if (md_getmajor(dev) != md_major) return (MD_NO_PARENT); ui = MDI_UNIT(md_getminor(dev)); un = (md_unit_t *)md_unit_readerlock(ui); parent = un->c.un_parent; md_unit_readerexit(ui); return (parent); } void md_set_parent(md_dev64_t dev, md_parent_t parent) { md_unit_t *un; mdi_unit_t *ui; if (md_getmajor(dev) != md_major) return; ui = MDI_UNIT(md_getminor(dev)); un = (md_unit_t *)md_unit_readerlock(ui); un->c.un_parent = parent; md_unit_readerexit(ui); } void md_reset_parent(md_dev64_t dev) { md_unit_t *un; mdi_unit_t *ui; if (md_getmajor(dev) != md_major) return; ui = MDI_UNIT(md_getminor(dev)); un = (md_unit_t *)md_unit_readerlock(ui); un->c.un_parent = MD_NO_PARENT; md_unit_readerexit(ui); } static intptr_t (*hot_spare_interface)() = (intptr_t (*)())NULL; int md_hot_spare_ifc( hs_cmds_t cmd, mddb_recid_t id, u_longlong_t size, int labeled, mddb_recid_t *hs_id, mdkey_t *key, md_dev64_t *dev, diskaddr_t *sblock) { int err; /* * RW lock on hot_spare_interface. We don't want it to change from * underneath us. If hot_spare_interface is NULL we're going to * need to set it. So we need to upgrade to a WRITER lock. 
If that * doesn't work, we drop the lock and reenter as WRITER. This leaves * a small hole during which hot_spare_interface could be modified * so we check it for NULL again. What a pain. Then if still null * load from md_get_named_service. */ rw_enter(&hsp_rwlp.lock, RW_READER); if (hot_spare_interface == NULL) { if (rw_tryupgrade(&hsp_rwlp.lock) == 0) { rw_exit(&hsp_rwlp.lock); rw_enter(&hsp_rwlp.lock, RW_WRITER); if (hot_spare_interface != NULL) { err = ((*hot_spare_interface) (cmd, id, size, labeled, hs_id, key, dev, sblock)); rw_exit(&hsp_rwlp.lock); return (err); } } hot_spare_interface = md_get_named_service(NODEV64, ANY_SERVICE, "hot spare interface", 0); rw_downgrade(&hsp_rwlp.lock); } if (hot_spare_interface == NULL) { cmn_err(CE_WARN, "md: no hotspare interface"); rw_exit(&hsp_rwlp.lock); return (0); } err = ((*hot_spare_interface) (cmd, id, size, labeled, hs_id, key, dev, sblock)); rw_exit(&hsp_rwlp.lock); return (err); } void md_clear_hot_spare_interface() { rw_enter(&hsp_rwlp.lock, RW_WRITER); hot_spare_interface = NULL; rw_exit(&hsp_rwlp.lock); } static intptr_t (*notify_interface)() = (intptr_t (*)())NULL; int md_notify_interface( md_event_cmds_t cmd, md_tags_t tag, set_t set, md_dev64_t dev, md_event_type_t event ) { int err; if (md_event_queue == NULL) return (0); rw_enter(&ni_rwlp.lock, RW_READER); if (notify_interface == NULL) { if (rw_tryupgrade(&ni_rwlp.lock) == 0) { rw_exit(&ni_rwlp.lock); rw_enter(&ni_rwlp.lock, RW_WRITER); if (notify_interface != NULL) { err = ((*notify_interface) (cmd, tag, set, dev, event)); rw_exit(&ni_rwlp.lock); return (err); } } notify_interface = md_get_named_service(NODEV64, ANY_SERVICE, "notify interface", 0); rw_downgrade(&ni_rwlp.lock); } if (notify_interface == NULL) { cmn_err(CE_WARN, "md: no notify interface"); rw_exit(&ni_rwlp.lock); return (0); } err = ((*notify_interface)(cmd, tag, set, dev, event)); rw_exit(&ni_rwlp.lock); return (err); } char * obj2devname(uint32_t tag, uint_t setno, md_dev64_t dev) { char *setname; char name[MD_MAX_CTDLEN]; minor_t mnum = md_getminor(dev); major_t maj = md_getmajor(dev); int rtn = 0; /* * Verify that the passed dev_t refers to a valid metadevice. * If it doesn't we can make no assumptions as to what the device * name is. Return NULL in these cases. 
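 *
 * For example (values hypothetical): a hotspare pool in the local set
 * maps to "hsp<unit>", one in a named set to "<setname>/hsp<unit>",
 * and the default case to the metadevice name from md_shortname().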
char *
obj2devname(uint32_t tag, uint_t setno, md_dev64_t dev)
{
	char		*setname;
	char		name[MD_MAX_CTDLEN];
	minor_t		mnum = md_getminor(dev);
	major_t		maj = md_getmajor(dev);
	int		rtn = 0;

	/*
	 * Verify that the passed md_dev64_t refers to a valid metadevice.
	 * If it doesn't, we can make no assumptions as to what the device
	 * name is. Return NULL in these cases.
	 */
	if (((maj != md_major) || (MD_MIN2UNIT(mnum) >= md_nunits)) ||
	    (MD_MIN2SET(mnum) >= md_nsets)) {
		return (NULL);
	}

	setname = NULL;
	name[0] = '\0';

	switch (tag) {
	case SVM_TAG_HSP:
		if (setno == 0) {
			rtn = snprintf(name, sizeof (name), "hsp%u",
			    (unsigned)MD_MIN2UNIT(mnum));
		} else {
			setname = mddb_getsetname(setno);
			if (setname != NULL) {
				rtn = snprintf(name, sizeof (name),
				    "%s/hsp%u", setname,
				    (unsigned)MD_MIN2UNIT(mnum));
			}
		}
		break;
	case SVM_TAG_DRIVE:
		(void) sprintf(name, "drive");
		break;
	case SVM_TAG_HOST:
		(void) sprintf(name, "host");
		break;
	case SVM_TAG_SET:
		rtn = snprintf(name, sizeof (name), "%s",
		    mddb_getsetname(setno));
		if ((name[0] == '\0') || (rtn >= sizeof (name))) {
			(void) sprintf(name, "diskset");
			rtn = 0;
		}
		break;
	default:
		rtn = snprintf(name, sizeof (name), "%s", md_shortname(mnum));
		break;
	}

	/* Check if we got any rubbish for any of the snprintf's */
	if ((name[0] == '\0') || (rtn >= sizeof (name))) {
		return (NULL);
	}

	return (md_strdup(name));
}

/* Sysevent subclass and mdnotify event type pairs */
struct node {
	char		*se_ev;
	md_event_type_t	md_ev;
};

/*
 * Table must be sorted in case-sensitive ascending order of
 * the sysevent values.
 */
static struct node ev_table[] = {
	{ ESC_SVM_ADD,			EQ_ADD },
	{ ESC_SVM_ATTACH,		EQ_ATTACH },
	{ ESC_SVM_ATTACHING,		EQ_ATTACHING },
	{ ESC_SVM_CHANGE,		EQ_CHANGE },
	{ ESC_SVM_CREATE,		EQ_CREATE },
	{ ESC_SVM_DELETE,		EQ_DELETE },
	{ ESC_SVM_DETACH,		EQ_DETACH },
	{ ESC_SVM_DETACHING,		EQ_DETACHING },
	{ ESC_SVM_DRIVE_ADD,		EQ_DRIVE_ADD },
	{ ESC_SVM_DRIVE_DELETE,		EQ_DRIVE_DELETE },
	{ ESC_SVM_ENABLE,		EQ_ENABLE },
	{ ESC_SVM_ERRED,		EQ_ERRED },
	{ ESC_SVM_EXCHANGE,		EQ_EXCHANGE },
	{ ESC_SVM_GROW,			EQ_GROW },
	{ ESC_SVM_HS_CHANGED,		EQ_HS_CHANGED },
	{ ESC_SVM_HS_FREED,		EQ_HS_FREED },
	{ ESC_SVM_HOST_ADD,		EQ_HOST_ADD },
	{ ESC_SVM_HOST_DELETE,		EQ_HOST_DELETE },
	{ ESC_SVM_HOTSPARED,		EQ_HOTSPARED },
	{ ESC_SVM_INIT_FAILED,		EQ_INIT_FAILED },
	{ ESC_SVM_INIT_FATAL,		EQ_INIT_FATAL },
	{ ESC_SVM_INIT_START,		EQ_INIT_START },
	{ ESC_SVM_INIT_SUCCESS,		EQ_INIT_SUCCESS },
	{ ESC_SVM_IOERR,		EQ_IOERR },
	{ ESC_SVM_LASTERRED,		EQ_LASTERRED },
	{ ESC_SVM_MEDIATOR_ADD,		EQ_MEDIATOR_ADD },
	{ ESC_SVM_MEDIATOR_DELETE,	EQ_MEDIATOR_DELETE },
	{ ESC_SVM_OFFLINE,		EQ_OFFLINE },
	{ ESC_SVM_OK,			EQ_OK },
	{ ESC_SVM_ONLINE,		EQ_ONLINE },
	{ ESC_SVM_OPEN_FAIL,		EQ_OPEN_FAIL },
	{ ESC_SVM_REGEN_DONE,		EQ_REGEN_DONE },
	{ ESC_SVM_REGEN_FAILED,		EQ_REGEN_FAILED },
	{ ESC_SVM_REGEN_START,		EQ_REGEN_START },
	{ ESC_SVM_RELEASE,		EQ_RELEASE },
	{ ESC_SVM_REMOVE,		EQ_REMOVE },
	{ ESC_SVM_RENAME_DST,		EQ_RENAME_DST },
	{ ESC_SVM_RENAME_SRC,		EQ_RENAME_SRC },
	{ ESC_SVM_REPLACE,		EQ_REPLACE },
	{ ESC_SVM_RESYNC_DONE,		EQ_RESYNC_DONE },
	{ ESC_SVM_RESYNC_FAILED,	EQ_RESYNC_FAILED },
	{ ESC_SVM_RESYNC_START,		EQ_RESYNC_START },
	{ ESC_SVM_RESYNC_SUCCESS,	EQ_RESYNC_SUCCESS },
	{ ESC_SVM_TAKEOVER,		EQ_TAKEOVER }
};

static md_tags_t md_tags[] = {
	TAG_UNK,
	TAG_METADEVICE,
	TAG_UNK,
	TAG_UNK,
	TAG_UNK,
	TAG_UNK,
	TAG_REPLICA,
	TAG_HSP,
	TAG_HS,
	TAG_SET,
	TAG_DRIVE,
	TAG_HOST,
	TAG_MEDIATOR
};

md_event_type_t
ev_get(char *subclass)
{
	int	high, mid, low, p;

	low = 0;
	high = (sizeof (ev_table) / sizeof (ev_table[0])) - 1;
	while (low <= high) {
		mid = (high + low) / 2;
		p = strcmp(subclass, ev_table[mid].se_ev);
		if (p == 0) {
			return (ev_table[mid].md_ev);
		} else if (p < 0) {
			high = mid - 1;
		} else {
			low = mid + 1;
		}
	}

	return (EQ_EMPTY);
}
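/*
 * Illustrative sketch only, not part of the original driver: ev_get()
 * above is a plain binary search over the sorted ev_table, so a sysevent
 * subclass string maps to its mdnotify event in O(log n) compares, and
 * unknown subclasses come back as EQ_EMPTY. The MD_EXAMPLES guard is
 * hypothetical.
 */
#ifdef	MD_EXAMPLES
static void
md_example_ev_get(void)
{
	/* ESC_SVM_CREATE is present in ev_table, so this yields EQ_CREATE */
	ASSERT(ev_get(ESC_SVM_CREATE) == EQ_CREATE);

	/* a string that matches no table entry yields EQ_EMPTY */
	ASSERT(ev_get("not.a.subclass") == EQ_EMPTY);
}
#endif	/* MD_EXAMPLES */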
/*
 * Log mdnotify event
 */
void
do_mdnotify(char *se_subclass, uint32_t tag, set_t setno, md_dev64_t devid)
{
	md_event_type_t	ev_type;
	md_tags_t	md_tag;

	/* Translate sysevent into mdnotify event */
	ev_type = ev_get(se_subclass);

	if (tag >= (sizeof (md_tags) / sizeof (md_tags[0]))) {
		md_tag = TAG_UNK;
	} else {
		md_tag = md_tags[tag];
	}

	NOTIFY_MD(md_tag, setno, devid, ev_type);
}

/*
 * Log SVM sysevents
 */
void
svm_gen_sysevent(
	char		*se_class,
	char		*se_subclass,
	uint32_t	tag,
	set_t		setno,
	md_dev64_t	devid)
{
	nvlist_t		*attr_list;
	sysevent_id_t		eid;
	int			err = DDI_SUCCESS;
	char			*devname;
	extern dev_info_t	*md_devinfo;

	/* Raise the mdnotify event before anything else */
	do_mdnotify(se_subclass, tag, setno, devid);

	if (md_devinfo == NULL) {
		return;
	}

	err = nvlist_alloc(&attr_list, NV_UNIQUE_NAME, KM_NOSLEEP);

	if (err == DDI_SUCCESS) {
		/* Add the version number */
		err = nvlist_add_uint32(attr_list, SVM_VERSION_NO,
		    (uint32_t)SVM_VERSION);
		if (err != DDI_SUCCESS) {
			goto fail;
		}

		/* Add the tag attribute */
		err = nvlist_add_uint32(attr_list, SVM_TAG, (uint32_t)tag);
		if (err != DDI_SUCCESS) {
			goto fail;
		}

		/* Add the set number attribute */
		err = nvlist_add_uint32(attr_list, SVM_SET_NO,
		    (uint32_t)setno);
		if (err != DDI_SUCCESS) {
			goto fail;
		}

		/* Add the device id attribute */
		err = nvlist_add_uint64(attr_list, SVM_DEV_ID,
		    (uint64_t)devid);
		if (err != DDI_SUCCESS) {
			goto fail;
		}

		/* Add the device name attribute */
		devname = obj2devname(tag, setno, devid);
		if (devname != NULL) {
			err = nvlist_add_string(attr_list, SVM_DEV_NAME,
			    devname);
			freestr(devname);
		} else {
			err = nvlist_add_string(attr_list, SVM_DEV_NAME,
			    "unspecified");
		}
		if (err != DDI_SUCCESS) {
			goto fail;
		}

		/* Attempt to post event */
		err = ddi_log_sysevent(md_devinfo, DDI_VENDOR_SUNW, se_class,
		    se_subclass, attr_list, &eid, DDI_SLEEP);

		nvlist_free(attr_list);
		if (err != DDI_SUCCESS) {
			cmn_err(CE_WARN, "Failed to log event for %s, %s,"
			    " err=%x", se_class, se_subclass, err);
		}
	}

	return;

fail:
	nvlist_free(attr_list);
	cmn_err(CE_WARN, "Failed to setup attributes for event %s, %s, err=%x",
	    se_class, se_subclass, err);
}

void
md_clear_named_service()
{
	rw_enter(&ni_rwlp.lock, RW_WRITER);
	notify_interface = NULL;
	rw_exit(&ni_rwlp.lock);
}

void
md_create_unit_incore(minor_t mnum, md_ops_t *ops, int alloc_lock)
{
	mdi_unit_t	*ui;
	set_t		setno = MD_MIN2SET(mnum);

	ui = (mdi_unit_t *)kmem_zalloc(sizeof (mdi_unit_t), KM_SLEEP);
	ui->ui_opsindex = ops->md_selfindex;

	/* initialize all the incore conditional variables */
	mutex_init(&ui->ui_mx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&ui->ui_cv, NULL, CV_DEFAULT, NULL);

	if (alloc_lock) {
		ui->ui_io_lock = kmem_zalloc(sizeof (md_io_lock_t), KM_SLEEP);
		mutex_init(&ui->ui_io_lock->io_mx, NULL, MUTEX_DEFAULT, NULL);
		cv_init(&ui->ui_io_lock->io_cv, NULL, CV_DEFAULT, NULL);
		mutex_init(&ui->ui_io_lock->io_list_mutex, NULL,
		    MUTEX_DEFAULT, NULL);
		ui->ui_io_lock->io_list_front = NULL;
		ui->ui_io_lock->io_list_back = NULL;
	}
	if (! (md_get_setstatus(setno) & MD_SET_SNARFING)) {
		rw_enter(&md_unit_array_rw.lock, RW_WRITER);
		MDI_VOIDUNIT(mnum) = (void *) ui;
		rw_exit(&md_unit_array_rw.lock);
	} else
		MDI_VOIDUNIT(mnum) = (void *) ui;

	rw_enter(&ops->md_link_rw.lock, RW_WRITER);
	ui->ui_link.ln_next = ops->md_head;
	ui->ui_link.ln_setno = setno;
	ui->ui_link.ln_id = mnum;
	ops->md_head = &ui->ui_link;
	/* setup the unavailable field */
#if defined(_ILP32)
	if (((md_unit_t *)MD_UNIT(mnum))->c.un_revision & MD_64BIT_META_DEV) {
		ui->ui_tstate |= MD_64MD_ON_32KERNEL;
		cmn_err(CE_NOTE, "d%d is unavailable because 64 bit "
		    "metadevices are not accessible on a 32 bit kernel",
		    mnum);
	}
#endif
	rw_exit(&ops->md_link_rw.lock);
}
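/*
 * Illustrative sketch only, not part of the original driver: a typical
 * caller of svm_gen_sysevent() above posts a config-class event for a
 * newly created metadevice. EC_SVM_CONFIG and ESC_SVM_CREATE are the
 * sysevent class/subclass names from <sys/sysevent/eventdefs.h>; the
 * SVM_TAG_METADEVICE tag value and the MD_EXAMPLES guard are assumptions
 * made for this sketch.
 */
#ifdef	MD_EXAMPLES
static void
md_example_post_create(set_t setno, md_dev64_t dev)
{
	svm_gen_sysevent(EC_SVM_CONFIG, ESC_SVM_CREATE, SVM_TAG_METADEVICE,
	    setno, dev);
}
#endif	/* MD_EXAMPLES */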
void
md_destroy_unit_incore(minor_t mnum, md_ops_t *ops)
{
	mdi_unit_t	*ui;

	/*
	 * ASSUMPTION: md_unit_array_rw WRITER lock is held.
	 */
	ui = MDI_UNIT(mnum);
	if (ui == NULL)
		return;

	md_rem_link(MD_MIN2SET(mnum), mnum, &ops->md_link_rw.lock,
	    &ops->md_head);

	/* destroy the io lock if one is being used */
	if (ui->ui_io_lock) {
		mutex_destroy(&ui->ui_io_lock->io_mx);
		cv_destroy(&ui->ui_io_lock->io_cv);
		kmem_free(ui->ui_io_lock, sizeof (md_io_lock_t));
	}

	/* teardown kstat */
	md_kstat_destroy(mnum);

	/* destroy all the incore conditional variables */
	mutex_destroy(&ui->ui_mx);
	cv_destroy(&ui->ui_cv);

	kmem_free(ui, sizeof (mdi_unit_t));
	MDI_VOIDUNIT(mnum) = (void *) NULL;
}

void
md_rem_names(sv_dev_t *sv, int nsv)
{
	int	i, s;
	int	max_sides;

	if (nsv == 0)
		return;

	/* All entries removed are in the same diskset */
	if (md_get_setstatus(sv[0].setno) & MD_SET_MNSET)
		max_sides = MD_MNMAXSIDES;
	else
		max_sides = MD_MAXSIDES;

	for (i = 0; i < nsv; i++)
		for (s = 0; s < max_sides; s++)
			(void) md_remdevname(sv[i].setno, s, sv[i].key);
}

/*
 * Checking user args before we get into physio - returns 0 for ok, else errno.
 * We do a lot of checking against illegal arguments here because some of the
 * real disk drivers don't like certain kinds of arguments (e.g. xy doesn't
 * like odd-address user buffers). Those drivers capture bad arguments in
 * xxread and xxwrite. But since the meta-driver calls their strategy routines
 * directly, two bad scenarios might happen:
 *	1. the real strategy doesn't like it and panics.
 *	2. the real strategy doesn't like it and sets B_ERROR.
 *
 * The second case is no better than the first one, since the meta-driver
 * will treat it as a media-error and offline the mirror metapartition.
 * (Too bad there is no way to tell what error it is.)
 */
int
md_chk_uio(struct uio *uio)
{
	int		i;
	struct iovec	*iov;

	/*
	 * Check for negative or not block-aligned offset
	 */
	if ((uio->uio_loffset < 0) ||
	    ((uio->uio_loffset & (DEV_BSIZE - 1)) != 0)) {
		return (EINVAL);
	}
	iov = uio->uio_iov;
	i = uio->uio_iovcnt;

	while (i--) {
		if ((iov->iov_len & (DEV_BSIZE - 1)) != 0)
			return (EINVAL);
		/*
		 * Bug # 1212146
		 * The default is to not check alignment, but we can now check
		 * for a larger number of alignments if desired.
		 */
		if ((uintptr_t)(iov->iov_base) & md_uio_alignment_mask)
			return (EINVAL);
		iov++;
	}

	return (0);
}

char *
md_shortname(
	minor_t		mnum)
{
	static char	buf[MAXPATHLEN];
	char		*devname;
	char		*invalid = " (Invalid minor number %u) ";
	char		*metaname;
	mdc_unit_t	*un;
	side_t		side;
	set_t		setno = MD_MIN2SET(mnum);
	unit_t		unit = MD_MIN2UNIT(mnum);

	if ((un = MD_UNIT(mnum)) == NULL) {
		(void) snprintf(buf, sizeof (buf), invalid, mnum);
		return (buf);
	}

	/*
	 * If unit is not a friendly name unit, derive the name from the
	 * minor number.
	 */
	if ((un->un_revision & MD_FN_META_DEV) == 0) {
		/* This is a traditional metadevice */
		if (setno == MD_LOCAL_SET) {
			(void) snprintf(buf, sizeof (buf), "d%u",
			    (unsigned)unit);
		} else {
			(void) snprintf(buf, sizeof (buf), "%s/d%u",
			    mddb_getsetname(setno), (unsigned)unit);
		}
		return (buf);
	}

	/*
	 * It is a friendly name metadevice, so we need to get its name.
	 */
	side = mddb_getsidenum(setno);
	devname = (char *)kmem_alloc(MAXPATHLEN, KM_SLEEP);
	if (md_getdevname(setno, side, MD_KEYWILD,
	    md_makedevice(md_major, mnum), devname, MAXPATHLEN) == 0) {
		/*
		 * md_getdevname has given us either /dev/md/dsk/<name>
		 * or /dev/md/<setname>/dsk/<name> depending on whether
		 * or not we are in the local set. Thus, we'll pull the
		 * metaname from this string.
		 */
		if ((metaname = strrchr(devname, '/')) == NULL) {
			(void) snprintf(buf, sizeof (buf), invalid, mnum);
			goto out;
		}
		metaname++;	/* move past slash */
		if (setno == MD_LOCAL_SET) {
			/* No set name. */
			(void) snprintf(buf, sizeof (buf), "%s", metaname);
		} else {
			/* Include setname */
			(void) snprintf(buf, sizeof (buf), "%s/%s",
			    mddb_getsetname(setno), metaname);
		}
	} else {
		/* We couldn't find the name. */
		(void) snprintf(buf, sizeof (buf), invalid, mnum);
	}

out:
	kmem_free(devname, MAXPATHLEN);
	return (buf);
}

char *
md_devname(
	set_t		setno,
	md_dev64_t	dev,
	char		*buf,
	size_t		size)
{
	static char	mybuf[MD_MAX_CTDLEN];
	int		err;

	if (buf == NULL) {
		buf = mybuf;
		size = sizeof (mybuf);
	} else {
		ASSERT(size >= MD_MAX_CTDLEN);
	}

	err = md_getdevname_common(setno, mddb_getsidenum(setno),
	    0, dev, buf, size, MD_NOWAIT_LOCK);
	if (err) {
		if (err == ENOENT) {
			(void) sprintf(buf, "(Unavailable)");
		} else {
			(void) sprintf(buf, "(%u.%u)",
			    md_getmajor(dev), md_getminor(dev));
		}
	}

	return (buf);
}

void
md_minphys(buf_t *pb)
{
	extern unsigned	md_maxbcount;

	if (pb->b_bcount > md_maxbcount)
		pb->b_bcount = md_maxbcount;
}

void
md_bioinit(struct buf *bp)
{
	ASSERT(bp);

	bioinit(bp);
	bp->b_back = bp;
	bp->b_forw = bp;
	bp->b_flags = B_BUSY;	/* initialize flags */
}

void
md_bioreset(struct buf *bp)
{
	ASSERT(bp);

	bioreset(bp);
	bp->b_back = bp;
	bp->b_forw = bp;
	bp->b_flags = B_BUSY;	/* initialize flags */
}

/*
 * md_bioclone is needed as long as the real bioclone only takes a daddr_t
 * as block number.
 * We simply call bioclone with all input parameters but blkno, and set the
 * correct blkno afterwards.
 * Caveat Emptor: bp_mem must not be NULL!
 */
buf_t *
md_bioclone(buf_t *bp, off_t off, size_t len, dev_t dev, diskaddr_t blkno,
    int (*iodone)(buf_t *), buf_t *bp_mem, int sleep)
{
	(void) bioclone(bp, off, len, dev, 0, iodone, bp_mem, sleep);
	bp_mem->b_lblkno = blkno;
	return (bp_mem);
}

/*
 * kstat stuff
 */
void
md_kstat_init_ui(
	minor_t		mnum,
	mdi_unit_t	*ui)
{
	if ((ui != NULL) && (ui->ui_kstat == NULL)) {
		set_t	setno = MD_MIN2SET(mnum);
		unit_t	unit = MD_MIN2UNIT(mnum);
		char	module[KSTAT_STRLEN];
		char	*p = module;

		if (setno != MD_LOCAL_SET) {
			char	buf[64];
			char	*s = buf;
			char	*e = module + sizeof (module) - 4;

			(void) sprintf(buf, "%u", setno);
			while ((p < e) && (*s != '\0'))
				*p++ = *s++;
			*p++ = '/';
		}
		*p++ = 'm';
		*p++ = 'd';
		*p = '\0';
		if ((ui->ui_kstat = kstat_create(module, unit, NULL, "disk",
		    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) != NULL) {
			ui->ui_kstat->ks_lock = &ui->ui_mx;
			kstat_install(ui->ui_kstat);
		}
	}
}

void
md_kstat_init(
	minor_t		mnum)
{
	md_kstat_init_ui(mnum, MDI_UNIT(mnum));
}

void
md_kstat_destroy_ui(
	mdi_unit_t	*ui)
{
	/*
	 * The kstat_delete() interface has its own locking mechanism and
	 * does not allow holding of the kstat lock (ks_lock).
	 * Note: ks_lock == ui_mx from md_kstat_init_ui().
	 */
	if ((ui != NULL) && (ui->ui_kstat != NULL)) {
		kstat_delete(ui->ui_kstat);
		ui->ui_kstat = NULL;
	}
}

void
md_kstat_destroy(
	minor_t		mnum)
{
	md_kstat_destroy_ui(MDI_UNIT(mnum));
}
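/*
 * Illustrative sketch only, not part of the original driver: how a caller
 * might use md_bioclone() above to redirect a parent buf at a component
 * device. "component_dev" and "component_blkno" are hypothetical values
 * supplied by the caller, and the MD_EXAMPLES guard is an assumption;
 * getrbuf(9F) supplies the non-NULL bp_mem that md_bioclone requires.
 */
#ifdef	MD_EXAMPLES
static buf_t *
md_example_clone(buf_t *parent, dev_t component_dev,
    diskaddr_t component_blkno, int (*done)(buf_t *))
{
	buf_t	*cb = getrbuf(KM_SLEEP);

	/* clone the whole parent request, retargeted at the component */
	return (md_bioclone(parent, 0, parent->b_bcount, component_dev,
	    component_blkno, done, cb, KM_SLEEP));
}
#endif	/* MD_EXAMPLES */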
/*
 * In the following routines, locks are held before checking the
 * validity of ui_kstat. This is done to make sure that we don't trip
 * over a NULL ui_kstat.
 */
void
md_kstat_waitq_enter(
	mdi_unit_t	*ui)
{
	mutex_enter(&ui->ui_mx);
	if (ui->ui_kstat != NULL)
		kstat_waitq_enter(KSTAT_IO_PTR(ui->ui_kstat));
	mutex_exit(&ui->ui_mx);
}

void
md_kstat_waitq_to_runq(
	mdi_unit_t	*ui)
{
	mutex_enter(&ui->ui_mx);
	if (ui->ui_kstat != NULL)
		kstat_waitq_to_runq(KSTAT_IO_PTR(ui->ui_kstat));
	mutex_exit(&ui->ui_mx);
}

void
md_kstat_waitq_exit(
	mdi_unit_t	*ui)
{
	mutex_enter(&ui->ui_mx);
	if (ui->ui_kstat != NULL)
		kstat_waitq_exit(KSTAT_IO_PTR(ui->ui_kstat));
	mutex_exit(&ui->ui_mx);
}

void
md_kstat_runq_enter(
	mdi_unit_t	*ui)
{
	mutex_enter(&ui->ui_mx);
	if (ui->ui_kstat != NULL)
		kstat_runq_enter(KSTAT_IO_PTR(ui->ui_kstat));
	mutex_exit(&ui->ui_mx);
}

void
md_kstat_runq_exit(
	mdi_unit_t	*ui)
{
	mutex_enter(&ui->ui_mx);
	if (ui->ui_kstat != NULL)
		kstat_runq_exit(KSTAT_IO_PTR(ui->ui_kstat));
	mutex_exit(&ui->ui_mx);
}

void
md_kstat_done(
	mdi_unit_t	*ui,
	buf_t		*bp,
	int		war)
{
	size_t	n_done;

	/* check for end of device */
	if ((bp->b_resid != 0) && (! (bp->b_flags & B_ERROR))) {
		n_done = bp->b_bcount;
	} else if (bp->b_bcount < bp->b_resid) {
		n_done = 0;
	} else {
		n_done = bp->b_bcount - bp->b_resid;
	}

	/* do accounting */
	mutex_enter(&ui->ui_mx);
	if (ui->ui_kstat != NULL) {
		if ((! war) && (bp->b_flags & B_READ)) {
			KSTAT_IO_PTR(ui->ui_kstat)->reads++;
			KSTAT_IO_PTR(ui->ui_kstat)->nread += n_done;
		} else {
			KSTAT_IO_PTR(ui->ui_kstat)->writes++;
			KSTAT_IO_PTR(ui->ui_kstat)->nwritten += n_done;
		}
		kstat_runq_exit(KSTAT_IO_PTR(ui->ui_kstat));
	}
	mutex_exit(&ui->ui_mx);
}

pid_t
md_getpid()
{
	pid_t	valuep;

	if (drv_getparm(PPID, (pid_t *)&valuep) != 0) {
		ASSERT(0);
		return ((pid_t)0);
	} else {
		ASSERT(valuep);
		return (valuep);
	}
}

proc_t *
md_getproc()
{
	proc_t	*valuep;

	if (drv_getparm(UPROCP, (proc_t **)&valuep) != 0) {
		ASSERT(0);
		return ((proc_t *)NULL);
	} else {
		ASSERT(valuep);
		return (valuep);
	}
}

extern kmutex_t	pidlock;

/*
 * This checks whether a given pid/proc pair is still running. For the
 * diskset lock, when both pid and proc are zero the lock is not
 * currently held.
 */
int
md_checkpid(pid_t pid, proc_t *proc)
{
	int	retval = 1;

	if (pid == 0 && proc == NULL)
		return (0);

	mutex_enter(&pidlock);
	if (prfind(pid) != proc)
		retval = 0;
	mutex_exit(&pidlock);

	return (retval);
}

/*
 * NAME:	md_init_probereq
 *
 * DESCRIPTION: initializes a probe request. Parcels out the mnums such that
 *		they can be dispatched to multiple daemon threads.
 *
 * PARAMETERS:	struct md_probedev *p	pointer to the ioctl input
 *
 * RETURN VALUE: returns errno
 */
int
md_init_probereq(struct md_probedev_impl *p, daemon_queue_t **hdrpp)
{
	int		err = 0;
	int		modindx;
	intptr_t	(*probe_test)();

	/*
	 * Initialize the semaphores and mutex
	 * for the request
	 */
	p->probe_sema = kmem_alloc(sizeof (ksema_t), KM_SLEEP);
	p->probe_mx = kmem_alloc(sizeof (kmutex_t), KM_SLEEP);
	sema_init(PROBE_SEMA(p), 0, NULL, SEMA_DRIVER, NULL);
	mutex_init(PROBE_MX(p), NULL, MUTEX_DEFAULT, NULL);

	modindx = md_getmodindex(&(p->probe.md_driver), 1, 1);
	probe_test = md_get_named_service(NODEV64, modindx,
	    p->probe.test_name, 0);
	if (probe_test == NULL) {
		err = EINVAL;
		goto err_out;
	}

	err = md_create_probe_rqlist(p, hdrpp, probe_test);
err_out:
	return (err);
}
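/*
 * Illustrative sketch only, not part of the original driver: the intended
 * per-I/O accounting sequence for the kstat helpers above. A request
 * enters the wait queue when queued, moves to the run queue when issued,
 * and md_kstat_done() tallies the transfer and exits the run queue on
 * completion. The MD_EXAMPLES guard is an assumption of this sketch.
 */
#ifdef	MD_EXAMPLES
static void
md_example_account_io(mdi_unit_t *ui, buf_t *bp)
{
	md_kstat_waitq_enter(ui);	/* request queued */
	md_kstat_waitq_to_runq(ui);	/* request issued to the device */
	/* ... I/O completes; tally bytes and leave the run queue ... */
	md_kstat_done(ui, bp, 0);
}
#endif	/* MD_EXAMPLES */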
/*
 * NAME:	md_probe_one
 *
 * DESCRIPTION: Generic routine for probing disks. This is called from the
 *		daemon.
 *
 * PARAMETERS:	probe_req_t *reqp	pointer to the probe request structure
 */
void
md_probe_one(probe_req_t *reqp)
{
	mdi_unit_t		*ui;
	md_probedev_impl_t	*p;
	int			err = 0;
	set_t			setno;

	p = (md_probedev_impl_t *)reqp->private_handle;

	/*
	 * Validate the unit while holding the global ioctl lock, then
	 * obtain the unit_writerlock. Once the writerlock has been obtained
	 * we can release the global lock. As long as we hold one of these
	 * locks this will prevent a metaclear operation being performed
	 * on the metadevice because metaclear takes the readerlock (via
	 * openclose lock).
	 * To avoid a potential deadlock with the probe_fcn() causing i/o to
	 * be issued to the writerlock'd metadevice we only grab the
	 * writerlock if the unit is not an SVM root device.
	 */
	while (md_ioctl_lock_enter() == EINTR)
		;
	setno = MD_MIN2SET(reqp->mnum);
	ui = MDI_UNIT(reqp->mnum);
	if (ui != NULL) {
		int	writer_grabbed;
		dev_t	svm_root;

		if ((setno == MD_LOCAL_SET) && root_is_svm) {
			svm_root = getrootdev();

			if (getminor(svm_root) == reqp->mnum) {
				writer_grabbed = 0;
			} else {
				writer_grabbed = 1;
				(void) md_unit_writerlock_common(ui, 0);
			}
		} else {
			writer_grabbed = 1;
			(void) md_unit_writerlock_common(ui, 0);
		}
		(void) md_ioctl_lock_exit(0, 0, 0, FALSE);
		err = (*reqp->probe_fcn)(ui, reqp->mnum);
		if (writer_grabbed) {
			md_unit_writerexit(ui);
		}
	} else {
		(void) md_ioctl_lock_exit(0, 0, 0, FALSE);
	}

	/* update the info in the probe structure */
	mutex_enter(PROBE_MX(p));
	if (err != 0) {
		cmn_err(CE_NOTE, "md_probe_one: err %d mnum %d\n", err,
		    reqp->mnum);
		(void) mdsyserror(&(p->probe.mde), err);
	}
	mutex_exit(PROBE_MX(p));
	sema_v(PROBE_SEMA(p));

	kmem_free(reqp, sizeof (probe_req_t));
}

char *
md_strdup(char *cp)
{
	char	*new_cp = NULL;

	new_cp = kmem_alloc(strlen(cp) + 1, KM_SLEEP);

	return (strcpy(new_cp, cp));
}

void
freestr(char *cp)
{
	kmem_free(cp, strlen(cp) + 1);
}

/*
 * Validate the list and skip invalid devices. Then create
 * a doubly linked circular list of devices to probe.
 * The hdr points to the head and tail of this list.
 */
static int
md_create_probe_rqlist(md_probedev_impl_t *plist, daemon_queue_t **hdr,
    intptr_t (*probe_test)())
{
	int		i, err, nodevcnt;
	probe_req_t	*tp;
	daemon_queue_t	*hp;
	minor_t		mnum;

	nodevcnt = 0;

	hp = NULL;

	for (i = 0; i < plist->probe.nmdevs; i++) {
		mnum = ((minor_t *)(uintptr_t)(plist->probe.mnum_list))[i];
		if (MDI_UNIT(mnum) == NULL) {
			cmn_err(CE_WARN, "md: Cannot probe %s since it does "
			    "not exist", md_shortname(mnum));
			nodevcnt++;
			continue;
		}

		tp = kmem_alloc(sizeof (probe_req_t), KM_SLEEP);
		tp->mnum = mnum;
		tp->private_handle = (void *)plist;
		tp->probe_fcn = probe_test;
		if (hp == NULL) {
			hp = (daemon_queue_t *)tp;
			hp->dq_prev = hp->dq_next = (daemon_queue_t *)tp;
		} else {
			tp->dq.dq_next = hp;
			tp->dq.dq_prev = hp->dq_prev;
			hp->dq_prev->dq_next = (daemon_queue_t *)tp;
			hp->dq_prev = (daemon_queue_t *)tp;
		}
	}

	*hdr = hp;

	if (nodevcnt > 0)
		plist->probe.nmdevs -= nodevcnt;

	/*
	 * If there are no devices to be probed because they were
	 * incorrect, then return an error.
	 */
	err = (plist->probe.nmdevs == 0) ? ENODEV : 0;

	return (err);
}
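/*
 * Illustrative sketch only, not part of the original driver: the
 * completion pattern implied by md_init_probereq()/md_probe_one() above.
 * Each queued md_probe_one() call does a sema_v() when it finishes, so a
 * dispatcher that queued "nmdevs" requests can wait for them all with the
 * matching number of sema_p() operations. The function name and the
 * MD_EXAMPLES guard are hypothetical.
 */
#ifdef	MD_EXAMPLES
static void
md_example_wait_probes(md_probedev_impl_t *p)
{
	int	i;

	for (i = 0; i < p->probe.nmdevs; i++)
		sema_p(PROBE_SEMA(p));	/* one sema_v() per completed probe */
}
#endif	/* MD_EXAMPLES */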
/*
 * This routine increments the I/O count for set I/O operations. This
 * value is used to determine if an I/O can be done. If a release is in
 * progress, this will return an error and cause the I/O to be errored.
 */
int
md_inc_iocount(set_t setno)
{
	int	rc = 0;

	if (setno == 0)
		return (0);

	mutex_enter(&md_set_io[setno].md_io_mx);
	if (!(md_set_io[setno].io_state & MD_SET_ACTIVE)) {
		rc = EIO;
		goto out;
	}

	ASSERT(md_set_io[setno].io_cnt >= 0);
	md_set_io[setno].io_cnt++;

out:
	mutex_exit(&md_set_io[setno].md_io_mx);
	return (rc);
}

void
md_inc_iocount_noblock(set_t setno)
{
	if (setno == 0)
		return;

	mutex_enter(&md_set_io[setno].md_io_mx);
	md_set_io[setno].io_cnt++;
	mutex_exit(&md_set_io[setno].md_io_mx);
}

void
md_dec_iocount(set_t setno)
{
	if (setno == 0)
		return;

	mutex_enter(&md_set_io[setno].md_io_mx);
	md_set_io[setno].io_cnt--;
	ASSERT(md_set_io[setno].io_cnt >= 0);
	if ((md_set_io[setno].io_state & MD_SET_RELEASE) &&
	    (md_set_io[setno].io_cnt == 0))
		cv_broadcast(&md_set_io[setno].md_io_cv);
	mutex_exit(&md_set_io[setno].md_io_mx);
}

int
md_isblock_setio(set_t setno)
{
	int	rc = 0;

	if (setno == 0)
		return (0);

	mutex_enter(&md_set_io[setno].md_io_mx);
	if (md_set_io[setno].io_state & MD_SET_RELEASE)
		rc = 1;
	mutex_exit(&md_set_io[setno].md_io_mx);
	return (rc);
}

int
md_block_setio(set_t setno)
{
	int	rc = 0;

	if (setno == 0)
		return (1);

	mutex_enter(&md_set_io[setno].md_io_mx);
	md_set_io[setno].io_state = MD_SET_RELEASE;

	while (md_set_io[setno].io_cnt > 0) {
		cv_wait(&md_set_io[setno].md_io_cv,
		    &md_set_io[setno].md_io_mx);
	}
	rc = 1;

	ASSERT(md_set_io[setno].io_cnt == 0);
	mutex_exit(&md_set_io[setno].md_io_mx);

	return (rc);
}

void
md_clearblock_setio(set_t setno)
{
	if (setno == 0)
		return;

	mutex_enter(&md_set_io[setno].md_io_mx);
	md_set_io[setno].io_state = MD_SET_ACTIVE;
	mutex_exit(&md_set_io[setno].md_io_mx);
}

void
md_unblock_setio(set_t setno)
{
	if (setno == 0)
		return;

	mutex_enter(&md_set_io[setno].md_io_mx);
#ifdef DEBUG
	if (md_set_io[setno].io_cnt != 0) {
		cmn_err(CE_NOTE, "set %d count was %ld at take",
		    setno, md_set_io[setno].io_cnt);
	}
#endif /* DEBUG */
	md_set_io[setno].io_state = MD_SET_ACTIVE;
	md_set_io[setno].io_cnt = 0;
	mutex_exit(&md_set_io[setno].md_io_mx);
}

/*
 * Test-and-set version of md_block_setio.
 * Set the io_state to keep new I/O from being issued.
 * If there is I/O currently in progress, then set io_state back to active
 * and return failure. Otherwise, return 1 for success.
 *
 * Used in a MN diskset since the commd must be suspended before
 * this node can attempt to withdraw from a diskset. But, with commd
 * suspended, I/O may have been issued that can never finish until
 * commd is resumed (allocation of hotspare, etc). So, if I/O is
 * outstanding after the diskset io_state is marked RELEASE, then set the
 * diskset io_state back to ACTIVE and return failure.
 */
int
md_tas_block_setio(set_t setno)
{
	int	rc;

	if (setno == 0)
		return (1);

	mutex_enter(&md_set_io[setno].md_io_mx);
	md_set_io[setno].io_state = MD_SET_RELEASE;

	if (md_set_io[setno].io_cnt > 0) {
		md_set_io[setno].io_state = MD_SET_ACTIVE;
		rc = 0;
	} else {
		rc = 1;
	}

	mutex_exit(&md_set_io[setno].md_io_mx);

	return (rc);
}
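/*
 * Illustrative sketch only, not part of the original driver: the
 * quiesce/resume sequence the set-I/O routines above are built for. A
 * release marks the set MD_SET_RELEASE, waits for outstanding I/O to
 * drain, performs its work, and then reopens the gate. The function name
 * and the MD_EXAMPLES guard are hypothetical.
 */
#ifdef	MD_EXAMPLES
static void
md_example_quiesce_set(set_t setno)
{
	/* block new I/O and wait for io_cnt to drain to zero */
	(void) md_block_setio(setno);

	/* ... perform the release-time work here ... */

	/* allow I/O to be issued to the set again */
	md_clearblock_setio(setno);
}
#endif	/* MD_EXAMPLES */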
void
md_biodone(struct buf *pb)
{
	minor_t		mnum;
	set_t		setno;
	mdi_unit_t	*ui;

	mnum = getminor(pb->b_edev);
	setno = MD_MIN2SET(mnum);

	if (setno == 0) {
		biodone(pb);
		return;
	}

#ifdef DEBUG
	ui = MDI_UNIT(mnum);
	if (!md_unit_isopen(ui))
		cmn_err(CE_NOTE, "io after close on %s\n", md_shortname(mnum));
#endif /* DEBUG */

	/*
	 * Handle the local diskset
	 */
	if (md_set_io[setno].io_cnt > 0)
		md_dec_iocount(setno);

#ifdef DEBUG
	/*
	 * This check is done after the lock is dropped, so there are
	 * cases where it may be invalid. It is advisory.
	 */
	if (md_set_io[setno].io_state & MD_SET_RELEASE) {
		/* Only display this error once for this metadevice */
		if ((ui->ui_tstate & MD_RELEASE_IOERR_DONE) == 0) {
			cmn_err(CE_NOTE,
			    "I/O to %s attempted during set RELEASE\n",
			    md_shortname(mnum));
			ui->ui_tstate |= MD_RELEASE_IOERR_DONE;
		}
	}
#endif /* DEBUG */

	biodone(pb);
}

/*
 * Driver special private devt handling routine
 * INPUT:  md_dev64_t
 * OUTPUT: dev_t, 32 bit on a 32 bit kernel, 64 bit on a 64 bit kernel.
 */
dev_t
md_dev64_to_dev(md_dev64_t dev)
{
	major_t	major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;
	minor_t	minor = (minor_t)(dev & MAXMIN64);

	return (makedevice(major, minor));
}

/*
 * Driver private makedevice routine
 * INPUT:  major_t major, minor_t minor
 * OUTPUT: md_dev64_t, no matter if on a 32 bit or 64 bit kernel.
 */
md_dev64_t
md_makedevice(major_t major, minor_t minor)
{
	return (((md_dev64_t)major << NBITSMINOR64) | minor);
}

/*
 * Driver private devt md_getmajor routine
 * INPUT:  dev	a 64 bit container holding either a 32 bit or a 64 bit device
 * OUTPUT: the appropriate major number
 */
major_t
md_getmajor(md_dev64_t dev)
{
	major_t	major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;

	if (major == 0) {
		/* Here we were given a 32bit dev */
		major = (major_t)(dev >> NBITSMINOR32) & MAXMAJ32;
	}
	return (major);
}

/*
 * Driver private devt md_getminor routine
 * INPUT:  dev	a 64 bit container holding either a 32 bit or a 64 bit device
 * OUTPUT: the appropriate minor number
 */
minor_t
md_getminor(md_dev64_t dev)
{
	minor_t	minor;
	major_t	major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;

	if (major == 0) {
		/* Here we were given a 32bit dev */
		minor = (minor_t)(dev & MAXMIN32);
	} else {
		minor = (minor_t)(dev & MAXMIN64);
	}
	return (minor);
}

int
md_check_ioctl_against_unit(int cmd, mdc_unit_t c)
{
	/*
	 * If the metadevice is an old style device, it has a vtoc;
	 * in that case the EFI-reading ioctls are not applicable.
	 * If the metadevice has an EFI label, the vtoc- and geom-reading
	 * ioctls are not supposed to work.
	 */
	switch (cmd) {
	case DKIOCGGEOM:
	case DKIOCGAPART:
		/* if > 2 TB then fail */
		if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) {
			return (ENOTSUP);
		}
		break;
	case DKIOCGVTOC:
		/* if > 2 TB then fail */
		if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) {
			return (ENOTSUP);
		}
		/* if > 1 TB but < 2 TB return overflow */
		if (c.un_revision & MD_64BIT_META_DEV) {
			return (EOVERFLOW);
		}
		break;
	case DKIOCGEXTVTOC:
		/* if > 2 TB then fail */
		if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) {
			return (ENOTSUP);
		}
		break;
	case DKIOCGETEFI:
	case DKIOCPARTITION:
		if ((c.un_flag & MD_EFILABEL) == 0) {
			return (ENOTSUP);
		}
		break;
	case DKIOCSETEFI:
		/* setting an EFI label should always be ok */
		return (0);
	case DKIOCSVTOC:
		/* if > 2 TB then fail */
		if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) {
			return (ENOTSUP);
		}
		/* if > 1 TB but < 2 TB return overflow */
		if (c.un_revision & MD_64BIT_META_DEV) {
			return (EOVERFLOW);
		}
		break;
	case DKIOCSEXTVTOC:
		if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) {
			return (ENOTSUP);
		}
		break;
	}
	return (0);
}
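/*
 * Illustrative sketch only, not part of the original driver: the
 * md_dev64_t helpers above pack the major number in the bits above
 * NBITSMINOR64 and the minor number below it, so an encode/decode
 * round-trip is lossless for a non-zero major (a zero major flags a
 * 32-bit dev and takes the fallback decode path). The MD_EXAMPLES guard
 * is an assumption of this sketch.
 */
#ifdef	MD_EXAMPLES
static void
md_example_dev64_roundtrip(major_t maj, minor_t min)
{
	md_dev64_t	dev = md_makedevice(maj, min);

	ASSERT(maj != 0);
	ASSERT(md_getmajor(dev) == maj);
	ASSERT(md_getminor(dev) == min);

	/* md_dev64_to_dev() yields the native dev_t for the same pair */
	ASSERT(md_dev64_to_dev(dev) == makedevice(maj, min));
}
#endif	/* MD_EXAMPLES */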
/*
 * md_vtoc_to_efi_record()
 * Input:  record id of the vtoc record
 * Output: record id of the efi record
 * Function:
 *	- reads the volume name from the vtoc record
 *	- converts the volume name to a format libefi understands
 *	- creates a new record of size MD_EFI_PARTNAME_BYTES
 *	- stores the volname in that record
 *	- commits that record
 *	- returns the recid of the efi record.
 * Caveat Emptor:
 *	The calling routine must do something like
 *	- un->c.un_vtoc_id = md_vtoc_to_efi_record(vtoc_recid)
 *	- commit(un)
 *	- delete(vtoc_recid)
 *	in order to keep the mddb consistent in case of a panic in the middle.
 * Errors:
 *	- returns 0 on any error
 */
mddb_recid_t
md_vtoc_to_efi_record(mddb_recid_t vtoc_recid, set_t setno)
{
	struct vtoc	*vtoc;
	ushort_t	*v;
	mddb_recid_t	efi_recid;
	int		i;

	if (mddb_getrecstatus(vtoc_recid) != MDDB_OK) {
		return (0);
	}
	vtoc = (struct vtoc *)mddb_getrecaddr(vtoc_recid);
	efi_recid = mddb_createrec(MD_EFI_PARTNAME_BYTES, MDDB_EFILABEL, 0,
	    MD_CRO_32BIT, setno);
	if (efi_recid < 0) {
		return (0);
	}
	v = (ushort_t *)mddb_getrecaddr(efi_recid);

	/* This for loop reads, converts and writes */
	for (i = 0; i < LEN_DKL_VVOL; i++) {
		v[i] = LE_16((uint16_t)vtoc->v_volume[i]);
	}

	/* commit the new record */
	mddb_commitrec_wrapper(efi_recid);

	return (efi_recid);
}

/*
 * Send a kernel message.
 * The user has to provide for an allocated result structure.
 * If the door handler disappears we retry, emitting warnings every so often.
 *
 * The recipient argument is almost always unused, and is therefore typically
 * set to zero, as zero is an invalid cluster nodeid. The exceptions are the
 * marking and clearing of the DRL from a node that is not currently the
 * owner. In these cases, the recipient argument will be the nodeid of the
 * mirror owner, and MD_MSGF_DIRECTED will be set in the flags. Non-owner
 * nodes will not receive these messages.
 *
 * For the case where md_mn_is_commd_present() is false, we simply pre-set
 * the result->kmmr_comm_state to MDMNE_RPC_FAIL.
 * This covers the case where the service mdcommd has been killed and so we do
 * not get a 'new' result structure copied back. Instead we return with the
 * supplied result field, and we need to flag a failure to the caller.
 */
int
mdmn_ksend_message(
	set_t		setno,
	md_mn_msgtype_t	type,
	uint_t		flags,
	md_mn_nodeid_t	recipient,
	char		*data,
	int		size,
	md_mn_kresult_t	*result)
{
	door_arg_t	da;
	md_mn_kmsg_t	*kmsg;
	uint_t		send_try_cnt = 0;
	uint_t		retry_noise_cnt = 0;
	int		rval;
	k_sigset_t	oldmask, newmask;

	/*
	 * Ensure that we default to a recoverable failure state if the
	 * door upcall cannot pass the request on to rpc.mdcommd.
	 * This may occur when shutting the node down while there is still
	 * a mirror resync or metadevice state update occurring.
	 */
	result->kmmr_comm_state = MDMNE_RPC_FAIL;
	result->kmmr_exitval = ~0;

	if (size > MDMN_MAX_KMSG_DATA)
		return (ENOMEM);
	kmsg = kmem_zalloc(sizeof (md_mn_kmsg_t), KM_SLEEP);
	kmsg->kmsg_flags = flags;
	kmsg->kmsg_setno = setno;
	kmsg->kmsg_recipient = recipient;
	kmsg->kmsg_type = type;
	kmsg->kmsg_size = size;
	bcopy(data, &(kmsg->kmsg_data), size);

	/*
	 * Wait for the door handle to be established.
	 */
	while (mdmn_door_did == -1) {
		if ((++retry_noise_cnt % MD_MN_WARN_INTVL) == 0) {
			cmn_err(CE_WARN, "door handle not yet ready. "
			    "Check if /usr/lib/lvm/mddoors is running");
		}
		delay(md_hz);
	}

	/*
	 * If MD_MSGF_BLK_SIGNAL is set, mask out all signals so that we
	 * do not fail if the user process receives a signal while we're
	 * active in the door interface.
	 */
	if (flags & MD_MSGF_BLK_SIGNAL) {
		sigfillset(&newmask);
		sigreplace(&newmask, &oldmask);
	}

	/*
	 * If the message failed with an RPC_FAILURE when rpc.mdcommd had
	 * been gracefully shut down (md_mn_is_commd_present returns FALSE),
	 * then don't retry the message anymore.
	 * If the message failed due to any other reason, then retry up to
	 * MD_MN_WARN_INTVL times, which should allow a shutting-down system
	 * time to notify the kernel of a graceful shutdown of rpc.mdcommd.
	 *
	 * The caller of this routine will need to check the
	 * md_mn_commd_present flag and the failure error in order to
	 * determine whether to panic or not. If md_mn_commd_present is set
	 * to 0 and the failure error is RPC_FAILURE, the calling routine
	 * should not panic, since the system is in the process of being
	 * shut down.
	 */
	retry_noise_cnt = send_try_cnt = 0;
	while (md_mn_is_commd_present_lite()) {
		/*
		 * data_ptr and data_size are initialized here because on
		 * return from the upcall, they contain data duplicated from
		 * rbuf and rsize. This causes subsequent upcalls to fail.
		 */
		da.data_ptr = (char *)(kmsg);
		da.data_size = sizeof (md_mn_kmsg_t);
		da.desc_ptr = NULL;
		da.desc_num = 0;
		da.rbuf = (char *)result;
		da.rsize = sizeof (*result);

		while ((rval = door_ki_upcall_limited(mdmn_door_handle, &da,
		    NULL, SIZE_MAX, 0)) != 0) {
			if ((++retry_noise_cnt % MD_MN_WARN_INTVL) == 0) {
				if (rval == EAGAIN) {
					cmn_err(CE_WARN,
					    "md: door_upcall failed. "
					    "Check if mddoors is running.");
				} else if (rval == EINTR) {
					cmn_err(CE_WARN,
					    "md: door_upcall failed. "
					    "Check if rpc.mdcommd is running.");
				} else {
					cmn_err(CE_WARN,
					    "md: door_upcall failed. "
					    "Returned %d", rval);
				}
			}
			if (++send_try_cnt >= md_send_retry_limit)
				break;

			delay(md_hz);

			/*
			 * data_ptr and data_size are re-initialized here
			 * because on return from the upcall, they contain
			 * data duplicated from rbuf and rsize. This causes
			 * subsequent upcalls to fail.
			 */
			da.data_ptr = (char *)(kmsg);
			da.data_size = sizeof (md_mn_kmsg_t);
			da.desc_ptr = NULL;
			da.desc_num = 0;
			da.rbuf = (char *)result;
			da.rsize = sizeof (*result);
		}

		/*
		 * If:
		 * - the send succeeded (MDMNE_ACK)
		 * - we had an MDMNE_RPC_FAIL and commd is now gone
		 *   (note: since the outer loop is commd-dependent,
		 *   checking MDMNE_RPC_FAIL here is meaningless)
		 * - we were told not to retry
		 * - we exceeded the RPC failure send limit
		 * punch out of the outer loop prior to the delay()
		 */
		if (result->kmmr_comm_state == MDMNE_ACK ||
		    (flags & MD_MSGF_KSEND_NORETRY) ||
		    (++send_try_cnt % md_send_retry_limit) == 0 ||
		    !md_mn_is_commd_present())
			break;
		delay(md_hz);
	}

	if (flags & MD_MSGF_BLK_SIGNAL) {
		sigreplace(&oldmask, (k_sigset_t *)NULL);
	}
	kmem_free(kmsg, sizeof (md_mn_kmsg_t));

	return (0);
}

/*
 * Called to propagate the capability of a metadevice to all nodes in the set.
 *
 * On entry, lockp is set if the function has been called from within an ioctl.
 *
 * IOLOCK_RETURN_RELEASE, which drops the md_ioctl_lock, is called in this
 * routine to enable other mdioctls to enter the kernel while this
 * thread of execution waits on the completion of mdmn_ksend_message. When
 * the message is completed the thread continues and md_ioctl_lock must be
 * reacquired. Even though md_ioctl_lock is interruptible, we choose to
 * ignore EINTR as we must not return without acquiring md_ioctl_lock.
 */
int
mdmn_send_capability_message(minor_t mnum, volcap_t vc, IOLOCK *lockp)
{
	md_mn_msg_setcap_t	msg;
	md_mn_kresult_t		*kres;
	mdi_unit_t		*ui = MDI_UNIT(mnum);
	int			ret;
	k_sigset_t		oldmask, newmask;

	(void) strncpy((char *)&msg.msg_setcap_driver,
	    md_ops[ui->ui_opsindex]->md_driver.md_drivername,
	    MD_DRIVERNAMELEN);
	msg.msg_setcap_mnum = mnum;
	msg.msg_setcap_set = vc.vc_set;

	if (lockp)
		IOLOCK_RETURN_RELEASE(0, lockp);
	kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);

	/*
	 * Mask signals for the mdmn_ksend_message call.
	 * This keeps the door interface from failing if the user process
	 * receives a signal while in mdmn_ksend_message.
	 */
	sigfillset(&newmask);
	sigreplace(&newmask, &oldmask);
	ret = (mdmn_ksend_message(MD_MIN2SET(mnum), MD_MN_MSG_SET_CAP,
	    MD_MSGF_NO_LOG, 0, (char *)&msg, sizeof (md_mn_msg_setcap_t),
	    kres));
	sigreplace(&oldmask, (k_sigset_t *)NULL);

	if (!MDMN_KSEND_MSG_OK(ret, kres)) {
		mdmn_ksend_show_error(ret, kres, "MD_MN_MSG_SET_CAP");
		ret = EIO;
	}
	kmem_free(kres, sizeof (md_mn_kresult_t));

	if (lockp) {
		IOLOCK_RETURN_REACQUIRE(lockp);
	}
	return (ret);
}

/*
 * Called to clear all of the transient capabilities for a metadevice when
 * it is not open on any node in the cluster.
 * Called from close for mirror and sp.
 */
void
mdmn_clear_all_capabilities(minor_t mnum)
{
	md_isopen_t	clumsg;
	int		ret;
	md_mn_kresult_t	*kresult;
	volcap_t	vc;
	k_sigset_t	oldmask, newmask;

	clumsg.dev = md_makedevice(md_major, mnum);
	clumsg.mde = mdnullerror;
	/*
	 * The check open message doesn't have to be logged, nor should the
	 * result be stored in the MCT. We want an up-to-date state.
	 */
	kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);

	/*
	 * Mask signals for the mdmn_ksend_message call. This keeps the door
	 * interface from failing if the user process receives a signal while
	 * in mdmn_ksend_message.
	 */
	sigfillset(&newmask);
	sigreplace(&newmask, &oldmask);
	ret = mdmn_ksend_message(MD_MIN2SET(mnum),
	    MD_MN_MSG_CLU_CHECK,
	    MD_MSGF_STOP_ON_ERROR | MD_MSGF_NO_LOG | MD_MSGF_NO_MCT, 0,
	    (char *)&clumsg, sizeof (clumsg), kresult);
	sigreplace(&oldmask, (k_sigset_t *)NULL);

	if ((ret == 0) && (kresult->kmmr_exitval == 0)) {
		/*
		 * Not open on any node, clear all capabilities, e.g. ABR
		 * and DMR.
		 */
		vc.vc_set = 0;
		(void) mdmn_send_capability_message(mnum, vc, NULL);
	}
	kmem_free(kresult, sizeof (md_mn_kresult_t));
}

/*
 * mdmn_ksend_show_error:
 * ---------------------
 * Called to display the error contents of a failing mdmn_ksend_message()
 * result.
 *
 * Input:
 *	rv	- return value from mdmn_ksend_message()
 *	kres	- pointer to result structure filled in by mdmn_ksend_message
 *	s	- informative message to identify the failing condition (e.g.
 *		  "Ownership change"). This string will be displayed with
 *		  cmn_err(CE_WARN, "%s *FAILED*", ...) to alert the system
 *		  administrator.
 */
void
mdmn_ksend_show_error(int rv, md_mn_kresult_t *kres, const char *s)
{
	if (rv == 0) {
		cmn_err(CE_WARN, "%s *FAILED*", s);
		cmn_err(CE_CONT, "exit_val = %d, comm_state = %d, "
		    "failing_node = %d", kres->kmmr_exitval,
		    kres->kmmr_comm_state, kres->kmmr_failing_node);
	} else {
		cmn_err(CE_WARN, "%s *FAILED*, return value = %d", s, rv);
	}
}

/*
 * Callback routine for the resync thread. If requested to suspend we mark
 * the commd as not being present.
 */
boolean_t
callb_md_mrs_cpr(void *arg, int code)
{
	callb_cpr_t	*cp = (callb_cpr_t *)arg;
	int		ret = 0;	/* assume success */
	clock_t		delta;

	mutex_enter(cp->cc_lockp);

	switch (code) {
	case CB_CODE_CPR_CHKPT:
		/*
		 * Mark the rpc.mdcommd as no longer present. We are trying to
		 * suspend the system and so we should expect RPC failures to
		 * occur.
		 */
		md_mn_clear_commd_present();
		cp->cc_events |= CALLB_CPR_START;
		delta = CPR_KTHREAD_TIMEOUT_SEC * hz;
		while (!(cp->cc_events & CALLB_CPR_SAFE))
			/* cv_reltimedwait() returns -1 if it times out. */
			if ((ret = cv_reltimedwait(&cp->cc_callb_cv,
			    cp->cc_lockp, delta, TR_CLOCK_TICK)) == -1)
				break;

		break;

	case CB_CODE_CPR_RESUME:
		cp->cc_events &= ~CALLB_CPR_START;
		cv_signal(&cp->cc_stop_cv);
		break;
	}
	mutex_exit(cp->cc_lockp);
	return (ret != -1);
}

void
md_rem_hspname(set_t setno, mdkey_t n_key)
{
	int	s;
	int	max_sides;

	/* All entries removed are in the same diskset */
	if (md_get_setstatus(setno) & MD_SET_MNSET)
		max_sides = MD_MNMAXSIDES;
	else
		max_sides = MD_MAXSIDES;

	for (s = 0; s < max_sides; s++)
		(void) md_remdevname(setno, s, n_key);
}

int
md_rem_selfname(minor_t selfid)
{
	int			s;
	set_t			setno = MD_MIN2SET(selfid);
	int			max_sides;
	md_dev64_t		dev;
	struct nm_next_hdr	*nh;
	struct nm_name		*n;
	mdkey_t			key;

	/*
	 * Get the key, since the remove routine expects it.
	 */
	dev = md_makedevice(md_major, selfid);
	if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) == NULL) {
		return (ENOENT);
	}
	if ((n = (struct nm_name *)lookup_entry(nh, setno, MD_SIDEWILD,
	    MD_KEYWILD, dev, 0L)) == NULL) {
		return (ENOENT);
	}

	/* All entries removed are in the same diskset */
	key = n->n_key;
	if (md_get_setstatus(setno) & MD_SET_MNSET)
		max_sides = MD_MNMAXSIDES;
	else
		max_sides = MD_MAXSIDES;

	for (s = 0; s < max_sides; s++)
		(void) md_remdevname(setno, s, key);

	return (0);
}

void
md_upd_set_unnext(set_t setno, unit_t un)
{
	if (un < md_set[setno].s_un_next) {
		md_set[setno].s_un_next = un;
	}
}

struct hot_spare_pool *
find_hot_spare_pool(set_t setno, int hsp_id)
{
	hot_spare_pool_t	*hsp;

	hsp = (hot_spare_pool_t *)md_set[setno].s_hsp;
	while (hsp != NULL) {
		if (hsp->hsp_self_id == hsp_id)
			return (hsp);
		hsp = hsp->hsp_next;
	}

	return ((hot_spare_pool_t *)0);
}

/*
 * md_create_taskq:
 *
 * Create a kernel taskq for the given set/unit combination. This is typically
 * used to complete a RR_CLEAN request when the callee is unable to obtain the
 * mutex / condvar access required to update the DRL safely.
 */
void *
md_create_taskq(set_t setno, minor_t mnum)
{
	char		name[20];
	ddi_taskq_t	*tqp;

	(void) snprintf(name, sizeof (name), "%d/d%d", setno,
	    MD_MIN2UNIT(mnum));

	tqp = ddi_taskq_create(md_devinfo, name, 1, TASKQ_DEFAULTPRI, 0);

	return ((void *)tqp);
}
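/*
 * Illustrative sketch only, not part of the original driver: a caller
 * might use the taskq returned by md_create_taskq() above to defer a
 * unit of work and then tear the queue down once the work is queued.
 * "md_example_work", "md_example_defer" and the MD_EXAMPLES guard are
 * hypothetical names.
 */
#ifdef	MD_EXAMPLES
static void
md_example_work(void *arg)
{
	/* ... complete the deferred request here ... */
}

static void
md_example_defer(set_t setno, minor_t mnum, void *arg)
{
	ddi_taskq_t	*tqp = (ddi_taskq_t *)md_create_taskq(setno, mnum);

	if (tqp == NULL)
		return;

	/* queue the work; DDI_SLEEP waits for resources if necessary */
	(void) ddi_taskq_dispatch(tqp, md_example_work, arg, DDI_SLEEP);

	/* tear down the taskq once its scheduled tasks have completed */
	ddi_taskq_destroy(tqp);
}
#endif	/* MD_EXAMPLES */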