/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * NAME:	raid_ioctl.c
 *
 * DESCRIPTION: RAID driver source file containing IOCTL operations.
 *
 * ROUTINES PROVIDED FOR EXTERNAL USE:
 *	raid_commit() - commits MD database updates for a RAID metadevice
 *	md_raid_ioctl() - RAID metadevice IOCTL operations entry point.
 *
 * ROUTINES PROVIDED FOR INTERNAL USE:
 *	raid_getun() - Performs unit checking on a RAID metadevice
 *	init_col_nextio() - normal backend when zeroing column of RAID
 *	    metadevice.
 *	init_col_int() - I/O interrupt while zeroing column of RAID metadevice.
 *	raid_init_columns() - Zero one or more columns of a RAID metadevice.
 *	raid_set() - used to create a RAID metadevice
 *	raid_get() - used to get the unit structure of a RAID metadevice
 *	raid_replace() - used to replace a component of a RAID metadevice
 *	raid_grow() - Concatenate to a RAID metadevice
 *	raid_change() - change dynamic values of a RAID metadevice
 *	raid_reset() - used to reset (clear / remove) a RAID metadevice
 *	raid_get_geom() - used to get the geometry of a RAID metadevice
 *	raid_get_vtoc() - used to get the VTOC on a RAID metadevice
 *	raid_set_vtoc() - used to set the VTOC on a RAID metadevice
 *	raid_get_extvtoc() - used to get the extended VTOC on a RAID metadevice
 *	raid_set_extvtoc() - used to set the extended VTOC on a RAID metadevice
 *	raid_getdevs() - return all devices within a RAID metadevice
 *	raid_admin_ioctl() - IOCTL operations unique to metadevices and RAID
 */


#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/user.h>
#include <sys/uio.h>
#include <sys/t_lock.h>
#include <sys/buf.h>
#include <sys/dkio.h>
#include <sys/vtoc.h>
#include <sys/kmem.h>
#include <vm/page.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/mkdev.h>
#include <sys/stat.h>
#include <sys/open.h>
#include <sys/disp.h>
#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/cred.h>
#include <sys/lvm/mdvar.h>
#include <sys/lvm/md_names.h>
#include <sys/lvm/md_mddb.h>
#include <sys/lvm/md_raid.h>
#include <sys/lvm/md_convert.h>

#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/svm.h>

extern int		md_status;
extern unit_t		md_nunits;
extern set_t		md_nsets;
extern md_set_t		md_set[];
extern md_ops_t		raid_md_ops;
extern major_t		md_major;
extern md_krwlock_t	md_unit_array_rw;
extern mdq_anchor_t	md_done_daemon;
extern mdq_anchor_t	md_ff_daemonq;
extern int		mdopen();
extern int		mdclose();
extern void		md_probe_one();
extern int		md_init_probereq(md_probedev_impl_t *,
			    daemon_queue_t **);
extern md_resync_t	md_cpr_resync;


extern void		dump_mr_unit(mr_unit_t *);

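/*
 * raid_ci_t tracks the progress of zeroing one column during
 * initialization.  One is allocated per column being initialized and
 * chained through ci_next; ci_blkno is the next block to zero and
 * ci_lastblk the end of the column (pre-write area plus data).
 */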
typedef struct raid_ci {
	DAEMON_QUEUE
	struct raid_ci	*ci_next;
	mr_unit_t	*ci_un;
	int		ci_col;
	int		ci_err;
	int		ci_flag;
	size_t		ci_zerosize;
	diskaddr_t	ci_blkno;
	diskaddr_t	ci_lastblk;
	buf_t		ci_buf;
} raid_ci_t;
/* values for the ci_flag */
#define	COL_INITING	(0x0001)
#define	COL_INIT_DONE	(0x0002)
#define	COL_READY	(0x0004)
/*
 * NAME:	raid_getun
 * DESCRIPTION: performs unit checking on a RAID metadevice
 * PARAMETERS:	minor_t mnum - minor device number for RAID unit
 *		md_error_t *mde - pointer to error reporting structure
 *		int flags - flags controlling the checks and locking:
 *			STALE_OK - allow stale MD memory
 *			NO_OLD - unit must not exist
 *			NO_LOCK - no IOCTL lock needed
 *			WR_LOCK - write IOCTL lock needed
 *			RD_LOCK - read IOCTL lock needed
 *			ARRAY_WRITER - array write lock needed
 *			ARRAY_READER - array read lock needed
 *		IOLOCK *lock - pointer to IOCTL lock
 *
 * LOCKS:	obtains unit reader or writer lock via IOLOCK
 *
 */
static mr_unit_t *
raid_getun(minor_t mnum, md_error_t *mde, int flags, IOLOCK *lock)
{
	mr_unit_t	*un;
	mdi_unit_t	*ui;
	set_t		setno = MD_MIN2SET(mnum);

	if ((setno >= md_nsets) || (MD_MIN2UNIT(mnum) >= md_nunits)) {
		(void) mdmderror(mde, MDE_INVAL_UNIT, mnum);
		return (NULL);
	}

	if (!(flags & STALE_OK)) {
		if (md_get_setstatus(setno) & MD_SET_STALE) {
			(void) mdmddberror(mde, MDE_DB_STALE, mnum, setno);
			return (NULL);
		}
	}

	ui = MDI_UNIT(mnum);
	if (flags & NO_OLD) {
		if (ui != NULL) {
			(void) mdmderror(mde, MDE_UNIT_ALREADY_SETUP, mnum);
			return (NULL);
		}
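		/*
		 * The unit must not already exist; hand back a non-NULL
		 * sentinel (the callers in this file only compare the
		 * result against NULL, so it is never dereferenced).
		 */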
		return ((mr_unit_t *)1);
	}

	if (ui == NULL) {
		(void) mdmderror(mde, MDE_UNIT_NOT_SETUP, mnum);
		return (NULL);
	}
	if (flags & ARRAY_WRITER)
		md_array_writer(lock);
	else if (flags & ARRAY_READER)
		md_array_reader(lock);

	if (!(flags & NO_LOCK)) {
		if (flags & WR_LOCK) {
			(void) md_ioctl_io_lock(lock, ui);
			(void) md_ioctl_writerlock(lock, ui);
		} else /* RD_LOCK */
			(void) md_ioctl_readerlock(lock, ui);
	}
	un = (mr_unit_t *)MD_UNIT(mnum);

	if (un->c.un_type != MD_METARAID) {
		(void) mdmderror(mde, MDE_NOT_RAID, mnum);
		return (NULL);
	}

	return (un);
}


/*
 * NAME:	raid_commit
 * DESCRIPTION: commits MD database updates for a RAID metadevice
 * PARAMETERS:	mr_unit_t *un - RAID unit to update in the MD database
 *		mddb_recid_t *extras - array of other record IDs to update
 *
 * LOCKS:	assumes caller holds unit writer lock
 *
 */
void
raid_commit(mr_unit_t *un, mddb_recid_t *extras)
{
	mddb_recid_t	*recids;
	int		ri = 0;
	int		nrecids = 0;

	if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)
		return;

	/* Count the extra recids */
	if (extras != NULL) {
		while (extras[nrecids] != 0) {
			nrecids++;
		}
	}

	/*
	 * Allocate space for two recids in addition to the extras:
	 * one for the unit structure, one for the null terminator.
	 */
	nrecids += 2;
	recids = (mddb_recid_t *)
	    kmem_zalloc(nrecids * sizeof (mddb_recid_t), KM_SLEEP);

	if (un != NULL) {
		ASSERT(MDI_UNIT(MD_SID(un)) ? UNIT_WRITER_HELD(un) : 1);
		recids[ri++] = un->c.un_record_id;
	}

	if (extras != NULL) {
		while (*extras != 0) {
			recids[ri++] = *extras;
			extras++;
		}
	}

	if (ri > 0) {
		mddb_commitrecs_wrapper(recids);
	}

	kmem_free(recids, nrecids * sizeof (mddb_recid_t));
}

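/*
 * NAME:	raid_check_pw
 * DESCRIPTION: reads the pre-write header of every column of a RAID
 *		metadevice and verifies that the column and unit numbers
 *		recorded on disk match the column's position in this
 *		unit; returns 0 if the headers are consistent and 1
 *		otherwise.
 * PARAMETERS:	mr_unit_t *un - RAID unit to check
 */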
static int
raid_check_pw(mr_unit_t *un)
{
	buf_t		bp;
	char		*buf;
	mr_column_t	*colptr;
	minor_t		mnum = MD_SID(un);
	int		i;
	int		err = 0;
	minor_t		unit;

	buf = kmem_zalloc((uint_t)DEV_BSIZE, KM_SLEEP);

	for (i = 0; i < un->un_totalcolumncnt; i++) {
		md_dev64_t tmpdev;

		colptr = &un->un_column[i];

		tmpdev = colptr->un_dev;
		/*
		 * Open by device id
		 * If this device is hotspared
		 * use the hotspare key
		 */
		tmpdev = md_resolve_bydevid(mnum, tmpdev, HOTSPARED(un, i) ?
		    colptr->un_hs_key : colptr->un_orig_key);
		if (md_layered_open(mnum, &tmpdev, MD_OFLG_NULL)) {
			colptr->un_dev = tmpdev;
			return (1);
		}
		colptr->un_dev = tmpdev;

		bzero((caddr_t)&bp, sizeof (buf_t));
		bp.b_back = &bp;
		bp.b_forw = &bp;
		bp.b_flags = B_READ | B_BUSY;
		sema_init(&bp.b_io, 0, NULL,
		    SEMA_DEFAULT, NULL);
		sema_init(&bp.b_sem, 0, NULL,
		    SEMA_DEFAULT, NULL);
		bp.b_edev = md_dev64_to_dev(colptr->un_dev);
		bp.b_lblkno = colptr->un_pwstart;
		bp.b_bcount = DEV_BSIZE;
		bp.b_bufsize = DEV_BSIZE;
		bp.b_un.b_addr = (caddr_t)buf;
		bp.b_offset = -1;
		(void) md_call_strategy(&bp, 0, NULL);
		if (biowait(&bp))
			err = 1;
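		/*
		 * Remember the unit number recorded in column 0's
		 * pre-write header; every other column must record the
		 * same unit number.
		 */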
		if (i == 0) {
			if (un->c.un_revision & MD_64BIT_META_DEV) {
				unit = ((raid_pwhdr_t *)buf)->rpw_unit;
			} else {
				unit = ((raid_pwhdr32_od_t *)buf)->rpw_unit;
			}
		}
		/*
		 * Depending upon whether this is a 64-bit or a 32-bit
		 * RAID, the pre-write headers have different layouts.
		 */
		if (un->c.un_revision & MD_64BIT_META_DEV) {
			if ((((raid_pwhdr_t *)buf)->rpw_column != i) ||
			    (((raid_pwhdr_t *)buf)->rpw_unit != unit))
				err = 1;
		} else {
			if ((((raid_pwhdr32_od_t *)buf)->rpw_column != i) ||
			    (((raid_pwhdr32_od_t *)buf)->rpw_unit != unit))
				err = 1;
		}
		md_layered_close(colptr->un_dev, MD_OFLG_NULL);
		if (err)
			break;
	}
	kmem_free(buf, DEV_BSIZE);
	return (err);
}

/*
 * NAME:	init_col_nextio
 * DESCRIPTION: normal backend process when zeroing a column of a RAID
 *		metadevice.
 * PARAMETERS:	raid_ci_t *cur - struct for column being zeroed
 *
 * LOCKS:	assumes caller holds unit reader lock,
 *		periodically releases and reacquires unit reader lock,
 *		broadcasts on unit conditional variable (un_cv)
 *
 */
#define	INIT_RLS_CNT	10
static void
init_col_nextio(raid_ci_t *cur)
{
	mr_unit_t	*un;

	un = cur->ci_un;

	cur->ci_blkno += cur->ci_zerosize;

	mutex_enter(&un->un_mx);
	/* ===> update un_percent_done */
	un->un_init_iocnt += btodb(cur->ci_buf.b_bcount);
	mutex_exit(&un->un_mx);

	/*
	 * When growing a device, normal I/O is still going on.
	 * The init thread still holds the unit reader lock, which
	 * prevents I/O from doing state changes.
	 * So every INIT_RLS_CNT init I/Os, we will release the
	 * unit reader lock.
	 *
	 * CAVEAT:
	 * We know we are in the middle of a grow operation and the
	 * unit cannot be grown or removed (through reset or halt)
	 * so the mr_unit_t structure will not move or disappear.
	 * In addition, we know that only one of the init I/Os
	 * can be in col_init_nextio at a time because they are
	 * placed on the md_done_daemon queue and md only processes
	 * one element of this queue at a time. In addition, any
	 * code that needs to acquire the unit writer lock to change
	 * state is supposed to be on the md_mstr_daemon queue so
	 * it can be processing while we sit here waiting to get the
	 * unit reader lock back.
	 */

	if (cur->ci_blkno < cur->ci_lastblk) {
		/* truncate last chunk to end_addr if needed */
		if (cur->ci_blkno + cur->ci_zerosize > cur->ci_lastblk) {
			cur->ci_zerosize = (size_t)
			    (cur->ci_lastblk - cur->ci_blkno);
		}

		/* set address and length for I/O bufs */
		cur->ci_buf.b_bufsize = dbtob(cur->ci_zerosize);
		cur->ci_buf.b_bcount = dbtob(cur->ci_zerosize);
		cur->ci_buf.b_lblkno = cur->ci_blkno;

		(void) md_call_strategy(&cur->ci_buf, MD_STR_NOTTOP, NULL);
		return;
	}
	/* finished initializing this column */
	mutex_enter(&un->un_mx);
	cur->ci_flag = COL_INIT_DONE;
	uniqtime32(&un->un_column[cur->ci_col].un_devtimestamp);
	mutex_exit(&un->un_mx);
	cv_broadcast(&un->un_cv);
}

/*
 * NAME:	init_col_int
 * DESCRIPTION: I/O interrupt while zeroing column of a RAID metadevice.
 * PARAMETERS:	buf_t *cb - I/O buffer for which interrupt occurred
 *
 * LOCKS:	assumes caller holds unit reader or writer lock
 *
 */
static int
init_col_int(buf_t *cb)
{
	raid_ci_t	*cur;

	cur = (raid_ci_t *)cb->b_chain;
	if (cb->b_flags & B_ERROR) {
		mutex_enter(&cur->ci_un->un_mx);
		cur->ci_err = EIO;
		mutex_exit(&cur->ci_un->un_mx);
		cv_broadcast(&cur->ci_un->un_cv);
		return (1);
	}
	daemon_request(&md_done_daemon, init_col_nextio,
	    (daemon_queue_t *)cur, REQ_OLD);
	return (1);
}

/*
 * NAME:	raid_init_columns
 * DESCRIPTION: Zero one or more columns of a RAID metadevice.
 * PARAMETERS:	minor_t mnum - RAID unit minor identifier
 *
 * LOCKS:	obtains and releases unit reader lock,
 *		obtains and releases unit writer lock,
 *		obtains and releases md_unit_array_rw write lock,
 *		obtains and releases unit mutex (un_mx) lock,
 *		waits on unit conditional variable (un_cv)
 *
 */
static void
raid_init_columns(minor_t mnum)
{
	mr_unit_t	*un;
	mdi_unit_t	*ui;
	raid_ci_t	*ci_chain = NULL, *cur;
	rus_state_t	state;
	caddr_t		zero_addr;
	diskaddr_t	end_off;
	size_t		zerosize;
	int		err = 0;
	int		ix;
	int		colcnt = 0;
	int		col;
	set_t		setno = MD_MIN2SET(mnum);

	/*
	 * Increment the raid resync count for cpr
	 */
	mutex_enter(&md_cpr_resync.md_resync_mutex);
	md_cpr_resync.md_raid_resync++;
	mutex_exit(&md_cpr_resync.md_resync_mutex);

	/*
	 * Initialization is a multiple-step process. The first step
	 * is to go through the unit structure and start each device
	 * in the init state writing zeros over the component.
	 * Next initialize the prewrite areas, so the device can be
	 * used if a metainit -k is done. Now close the components.
	 *
	 * Once this completes, set the state of each component being
	 * zeroed and set the correct state for the unit.
	 *
	 * Last, commit the records.
	 */

	ui = MDI_UNIT(mnum);
	un = md_unit_readerlock(ui);

	/* check for active init on this column */
	/* exiting is cpr safe */
	if ((un->un_init_colcnt > 0) && (un->un_resync_index != -1)) {
		md_unit_readerexit(ui);
		(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
		/*
		 * Decrement the raid resync count for cpr
		 */
		mutex_enter(&md_cpr_resync.md_resync_mutex);
		md_cpr_resync.md_raid_resync--;
		mutex_exit(&md_cpr_resync.md_resync_mutex);
		thread_exit();
	}

	SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_START, SVM_TAG_METADEVICE, setno,
	    MD_SID(un));
	un->un_init_colcnt = 0;
	un->un_init_iocnt = 0;
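	/*
	 * Each column is zeroed from the start of its pre-write area
	 * through the end of its data, in chunks of at most un_maxio
	 * blocks.
	 */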
	end_off = un->un_pwsize + (un->un_segsize * un->un_segsincolumn);
	zerosize = (size_t)MIN((diskaddr_t)un->un_maxio, end_off);

	/* allocate zero-filled buffer */
	zero_addr = kmem_zalloc(dbtob(zerosize), KM_SLEEP);

	for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
		if (un->un_column[ix].un_devstate != RCS_INIT)
			continue;
		/* allocate new column init structure */
		cur = (raid_ci_t *)kmem_zalloc((sizeof (raid_ci_t)), KM_SLEEP);
		ASSERT(cur != NULL);
		un->un_init_colcnt++;
		cur->ci_next = ci_chain;
		ci_chain = cur;
		cur->ci_un = un;
		cur->ci_col = ix;
		cur->ci_err = 0;
		cur->ci_flag = COL_INITING;
		cur->ci_zerosize = zerosize;
		cur->ci_blkno = un->un_column[ix].un_pwstart;
		cur->ci_lastblk = cur->ci_blkno + un->un_pwsize
		    + (un->un_segsize * un->un_segsincolumn);
		/* initialize static buf fields */
		cur->ci_buf.b_un.b_addr = zero_addr;
		cur->ci_buf.b_chain = (buf_t *)cur;
		cur->ci_buf.b_back = &cur->ci_buf;
		cur->ci_buf.b_forw = &cur->ci_buf;
		cur->ci_buf.b_iodone = init_col_int;
		cur->ci_buf.b_flags = B_BUSY | B_WRITE;
		cur->ci_buf.b_edev = md_dev64_to_dev(un->un_column[ix].un_dev);
		sema_init(&cur->ci_buf.b_io, 0, NULL, SEMA_DEFAULT, NULL);
		sema_init(&cur->ci_buf.b_sem, 0, NULL, SEMA_DEFAULT, NULL);
		/* set address and length for I/O bufs */
		cur->ci_buf.b_bufsize = dbtob(zerosize);
		cur->ci_buf.b_bcount = dbtob(zerosize);
		cur->ci_buf.b_lblkno = un->un_column[ix].un_pwstart;
		cur->ci_buf.b_offset = -1;

		if (! (un->un_column[ix].un_devflags & MD_RAID_DEV_ISOPEN)) {
			md_dev64_t tmpdev = un->un_column[ix].un_dev;
			/*
			 * Open by device id
			 * If this column is hotspared then
			 * use the hotspare key
			 */
			tmpdev = md_resolve_bydevid(mnum, tmpdev,
			    HOTSPARED(un, ix) ?
			    un->un_column[ix].un_hs_key :
			    un->un_column[ix].un_orig_key);
			if ((cur->ci_err = md_layered_open(mnum, &tmpdev,
			    MD_OFLG_NULL)) == 0)
				un->un_column[ix].un_devflags |=
				    MD_RAID_DEV_ISOPEN;
			un->un_column[ix].un_dev = tmpdev;
		}
		if (cur->ci_err == 0)
			md_call_strategy(&cur->ci_buf, MD_STR_NOTTOP, NULL);
	}

	md_unit_readerexit(ui);
	state = un->un_state;
	colcnt = un->un_init_colcnt;
	mutex_enter(&un->un_mx);
	while (colcnt) {
		cv_wait(&un->un_cv, &un->un_mx);

		colcnt = 0;
		for (cur = ci_chain; cur != NULL; cur = cur->ci_next) {
			col = cur->ci_col;
			if ((cur->ci_flag != COL_INITING) || (cur->ci_err)) {
				if (cur->ci_err)
					err = cur->ci_err;
				else if (cur->ci_flag == COL_INIT_DONE) {
					(void) init_pw_area(un,
					    un->un_column[col].un_dev,
					    un->un_column[col].un_pwstart,
					    col);
					cur->ci_flag = COL_READY;
				}
			} else {
				colcnt++;
			}
		}
	}
	mutex_exit(&un->un_mx);

	/* This prevents new opens */
	rw_enter(&md_unit_array_rw.lock, RW_WRITER);
	(void) md_io_writerlock(ui);
	un = (mr_unit_t *)md_unit_writerlock(ui);
	while (ci_chain) {
		cur = ci_chain;

		/* take this element out of the chain */
		ci_chain = cur->ci_next;
		/* free this element */
		sema_destroy(&cur->ci_buf.b_io);
		sema_destroy(&cur->ci_buf.b_sem);
		if (cur->ci_err)
			raid_set_state(cur->ci_un, cur->ci_col,
			    RCS_INIT_ERRED, 0);
		else
			raid_set_state(cur->ci_un, cur->ci_col,
			    RCS_OKAY, 0);
		kmem_free(cur, sizeof (raid_ci_t));
	}

	/* free the zeroed buffer */
	kmem_free(zero_addr, dbtob(zerosize));

	/* determine new unit state */
	if (err == 0) {
		if (state == RUS_INIT)
			un->un_state = RUS_OKAY;
		else {
			un->c.un_total_blocks = un->un_grow_tb;
			md_nblocks_set(mnum, un->c.un_total_blocks);
			un->un_grow_tb = 0;
			if (raid_state_cnt(un, RCS_OKAY) ==
			    un->un_totalcolumncnt)
				un->un_state = RUS_OKAY;
		}
	} else { /* error occurred */
		if (state & RUS_INIT)
			un->un_state = RUS_DOI;
	}
	uniqtime32(&un->un_timestamp);
	MD_STATUS(un) &= ~MD_UN_GROW_PENDING;
	un->un_init_colcnt = 0;
	un->un_init_iocnt = 0;
	raid_commit(un, NULL);
	md_unit_writerexit(ui);
	(void) md_io_writerexit(ui);
	rw_exit(&md_unit_array_rw.lock);
	if (err) {
		if (un->un_state & RUS_DOI) {
			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_FATAL,
			    SVM_TAG_METADEVICE, setno, MD_SID(un));
		} else {
			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_FAILED,
			    SVM_TAG_METADEVICE, setno, MD_SID(un));
		}
	} else {
		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_SUCCESS,
		    SVM_TAG_METADEVICE, setno, MD_SID(un));
	}
	(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
	/*
	 * Decrement the raid resync count for cpr
	 */
	mutex_enter(&md_cpr_resync.md_resync_mutex);
	md_cpr_resync.md_raid_resync--;
	mutex_exit(&md_cpr_resync.md_resync_mutex);
	thread_exit();
	/*NOTREACHED*/
}

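/*
 * NAME:	raid_init_unit
 * DESCRIPTION: opens a RAID metadevice and creates a thread running
 *		raid_init_columns() to zero the columns that are in the
 *		init state; on an open failure the init columns are
 *		moved to the erred state.
 * PARAMETERS:	minor_t mnum - RAID unit minor identifier
 *		md_error_t *ep - pointer to error reporting structure
 */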
static int
raid_init_unit(minor_t mnum, md_error_t *ep)
{
	mdi_unit_t	*ui;
	mr_unit_t	*un;
	int		rval, i;
	set_t		setno = MD_MIN2SET(mnum);

	ui = MDI_UNIT(mnum);
	if (md_get_setstatus(setno) & MD_SET_STALE)
		return (mdmddberror(ep, MDE_DB_STALE, mnum, setno));

	/* Don't start an init if the device is not available */
	if ((ui == NULL) || (ui->ui_tstate & MD_DEV_ERRORED)) {
		return (mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum));
	}

	if (raid_internal_open(mnum, (FREAD | FWRITE),
	    OTYP_LYR, MD_OFLG_ISINIT)) {
		rval = mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum);
		goto out;
	}

	un = md_unit_readerlock(ui);
	un->un_percent_done = 0;
	md_unit_readerexit(ui);
	/* start resync_unit thread */
	(void) thread_create(NULL, 0, raid_init_columns,
	    (void *)(uintptr_t)mnum, 0, &p0, TS_RUN, minclsyspri);

	return (0);

out:
	un = md_unit_writerlock(ui);
	MD_STATUS(un) &= ~MD_UN_GROW_PENDING;
	/* recover state */
	for (i = 0; i < un->un_totalcolumncnt; i++)
		if (COLUMN_STATE(un, i) == RCS_INIT)
			raid_set_state(un, i, RCS_ERRED, 0);
	if (un->un_state & RUS_INIT)
		un->un_state = RUS_DOI;
	raid_commit(un, NULL);
	md_unit_writerexit(ui);
	if (un->un_state & RUS_DOI) {
		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_FATAL,
		    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
	} else {
		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_FAILED,
		    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
	}
	return (rval);
}

/*
 * NAME:	regen_unit
 *
 * DESCRIPTION: regenerate all the parity on the raid device. This
 *		routine runs in a thread started by raid_regen_unit()
 *		and does the actual regeneration. If an I/O error
 *		occurs during this process the entire device is
 *		placed in error.
 *
 * PARAMETERS:	minor_t mnum - RAID unit minor identifier
 */
static void
regen_unit(minor_t mnum)
{
	mdi_unit_t	*ui = MDI_UNIT(mnum);
	mr_unit_t	*un = MD_UNIT(mnum);
	buf_t		buf, *bp;
	caddr_t		buffer;
	int		err = 0;
	diskaddr_t	total_segments;
	diskaddr_t	line;
	size_t		iosize;

	/*
	 * Increment raid resync count for cpr
	 */
	mutex_enter(&md_cpr_resync.md_resync_mutex);
	md_cpr_resync.md_raid_resync++;
	mutex_exit(&md_cpr_resync.md_resync_mutex);

	iosize = dbtob(un->un_segsize);
	buffer = kmem_alloc(iosize, KM_SLEEP);
	bp = &buf;
	total_segments = un->un_segsincolumn;
	SE_NOTIFY(EC_SVM_STATE, ESC_SVM_REGEN_START, SVM_TAG_METADEVICE,
	    MD_UN2SET(un), MD_SID(un));
	un->un_percent_done = 0;
	init_buf(bp, B_READ | B_BUSY, iosize);

	for (line = 0; line < total_segments; line++) {
		bp->b_lblkno = line *
		    ((un->un_origcolumncnt - 1) * un->un_segsize);
		bp->b_un.b_addr = buffer;
		bp->b_bcount = iosize;
		bp->b_iodone = NULL;
		/*
		 * The following assignment is only correct because
		 * md_raid_strategy is fine when it's only a minor number
		 * and not a real dev_t. Yuck.
		 */
		bp->b_edev = mnum;
		md_raid_strategy(bp, MD_STR_NOTTOP, NULL);
		if (biowait(bp)) {
			err = 1;
			break;
		}
		un->un_percent_done = (uint_t)((line * 1000) /
		    un->un_segsincolumn);
		/* just to avoid rounding errors */
		if (un->un_percent_done > 1000)
			un->un_percent_done = 1000;
		reset_buf(bp, B_READ | B_BUSY, iosize);
	}
	destroy_buf(bp);
	kmem_free(buffer, iosize);

	(void) md_io_writerlock(ui);
	(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
	(void) md_io_writerexit(ui);
	un = md_unit_writerlock(ui);
	if (!err &&
	    (raid_state_cnt(un, RCS_OKAY) == un->un_totalcolumncnt))
		un->un_state = RUS_OKAY;
	raid_commit(un, NULL);
	md_unit_writerexit(ui);
	if (err ||
	    raid_state_cnt(un, RCS_OKAY) != un->un_totalcolumncnt) {
		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_REGEN_FAILED,
		    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
	} else {
		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_REGEN_DONE, SVM_TAG_METADEVICE,
		    MD_UN2SET(un), MD_SID(un));
	}

	/*
	 * Decrement the raid resync count for cpr
	 */
	mutex_enter(&md_cpr_resync.md_resync_mutex);
	md_cpr_resync.md_raid_resync--;
	mutex_exit(&md_cpr_resync.md_resync_mutex);
	thread_exit();
}

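/*
 * NAME:	raid_regen_unit
 * DESCRIPTION: opens a RAID metadevice and creates a thread running
 *		regen_unit() to regenerate the unit's parity; on an open
 *		failure every column is moved to the erred state.
 * PARAMETERS:	minor_t mnum - RAID unit minor identifier
 *		md_error_t *ep - pointer to error reporting structure
 */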
static int
raid_regen_unit(minor_t mnum, md_error_t *ep)
{
	mdi_unit_t	*ui;
	mr_unit_t	*un;
	int		i;
	set_t		setno = MD_MIN2SET(mnum);

	ui = MDI_UNIT(mnum);
	un = (mr_unit_t *)MD_UNIT(mnum);

	if (md_get_setstatus(setno) & MD_SET_STALE)
		return (mdmddberror(ep, MDE_DB_STALE, mnum, setno));

	/* Don't start a regen if the device is not available */
	if ((ui == NULL) || (ui->ui_tstate & MD_DEV_ERRORED)) {
		return (mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum));
	}

	if (raid_internal_open(mnum, (FREAD | FWRITE), OTYP_LYR, 0)) {
		(void) md_unit_writerlock(ui);
		for (i = 0; i < un->un_totalcolumncnt; i++)
			raid_set_state(un, i, RCS_ERRED, 0);
		md_unit_writerexit(ui);
		return (mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum));
	}

	/* start resync_unit thread */
	(void) thread_create(NULL, 0, regen_unit,
	    (void *)(uintptr_t)mnum, 0, &p0, TS_RUN, minclsyspri);

	return (0);
}

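/*
 * NAME:	raid_regen
 * DESCRIPTION: checks that a RAID metadevice is fully okay and neither
 *		initializing nor resyncing, marks it for regeneration,
 *		and starts the regen through raid_regen_unit().
 * PARAMETERS:	md_regen_param_t *mrp - pointer to regen data structure
 *		IOLOCK *lock - pointer to IOCTL lock
 */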
static int
raid_regen(md_regen_param_t *mrp, IOLOCK *lock)
{
	minor_t		mnum = mrp->mnum;
	mr_unit_t	*un;

	mdclrerror(&mrp->mde);

	un = md_unit_readerlock(MDI_UNIT(mnum));

	if (MD_STATUS(un) & MD_UN_GROW_PENDING) {
		md_unit_readerexit(MDI_UNIT(mnum));
		return (mdmderror(&mrp->mde, MDE_IN_USE, mnum));
	}

	if ((MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ||
	    (raid_state_cnt(un, RCS_RESYNC))) {
		md_unit_readerexit(MDI_UNIT(mnum));
		return (mdmderror(&mrp->mde, MDE_RESYNC_ACTIVE, mnum));
	}

	if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT)) {
		md_unit_readerexit(MDI_UNIT(mnum));
		return (mdmderror(&mrp->mde, MDE_IN_USE, mnum));
	}

	if ((raid_state_cnt(un, RCS_OKAY) != un->un_totalcolumncnt) ||
	    (! (un->un_state & RUS_OKAY))) {
		md_unit_readerexit(MDI_UNIT(mnum));
		return (mdmderror(&mrp->mde, MDE_RAID_NOT_OKAY, mnum));
	}

	md_unit_readerexit(MDI_UNIT(mnum));

	/* get locks and recheck to be sure something did not change */
	if ((un = raid_getun(mnum, &mrp->mde, WRITERS, lock)) == NULL)
		return (0);

	if ((raid_state_cnt(un, RCS_OKAY) != un->un_totalcolumncnt) ||
	    (! (un->un_state & RUS_OKAY))) {
		return (mdmderror(&mrp->mde, MDE_RAID_NOT_OKAY, mnum));
	}

	raid_set_state(un, 0, RCS_REGEN, 0);
	raid_commit(un, NULL);
	md_ioctl_droplocks(lock);
	return (raid_regen_unit(mnum, &mrp->mde));
}

/*
 * NAME:	raid_set
 * DESCRIPTION: used to create a RAID metadevice
 * PARAMETERS:	md_set_params_t *d - pointer to set data structure
 *		int mode - must be FWRITE
 *
 * LOCKS:	none
 *
 */
static int
raid_set(void *d, int mode)
{
	minor_t		mnum;
	mr_unit_t	*un;
	mddb_recid_t	mr_recid;
	mddb_recid_t	*recids;
	mddb_type_t	typ1;
	int		err;
	set_t		setno;
	int		num_recs;
	int		rid;
	int		col;
	md_set_params_t	*msp = d;


	mnum = msp->mnum;
	setno = MD_MIN2SET(mnum);

	mdclrerror(&msp->mde);

	if (raid_getun(mnum, &msp->mde, NO_OLD, NULL) == NULL)
		return (0);

	typ1 = (mddb_type_t)md_getshared_key(setno,
	    raid_md_ops.md_driver.md_drivername);

	/* create the db record for this mdstruct */

	if (msp->options & MD_CRO_64BIT) {
#if defined(_ILP32)
		return (mdmderror(&msp->mde, MDE_UNIT_TOO_LARGE, mnum));
#else
		mr_recid = mddb_createrec(msp->size, typ1, 0,
		    MD_CRO_64BIT | MD_CRO_RAID | MD_CRO_FN, setno);
#endif
	} else {
		mr_recid = mddb_createrec(msp->size, typ1, 0,
		    MD_CRO_32BIT | MD_CRO_RAID | MD_CRO_FN, setno);
	}

	if (mr_recid < 0)
		return (mddbstatus2error(&msp->mde,
		    (int)mr_recid, mnum, setno));

	/* get the address of the mdstruct */
	un = (mr_unit_t *)mddb_getrecaddr(mr_recid);
	/*
	 * It is okay that we muck with the mdstruct here,
	 * since no one else will know about the mdstruct
	 * until we commit it. If we crash, the record will
	 * be automatically purged, since we haven't
	 * committed it yet.
	 */

	/* copy in the user's mdstruct */
	if (err = ddi_copyin((caddr_t)(uintptr_t)msp->mdp, un,
	    msp->size, mode)) {
		mddb_deleterec_wrapper(mr_recid);
		return (EFAULT);
	}
	/* All 64 bit metadevices only support EFI labels. */
	if (msp->options & MD_CRO_64BIT) {
		un->c.un_flag |= MD_EFILABEL;
	}

	/*
	 * allocate the real recids array. since we may have to commit
	 * underlying metadevice records, we need an array of size:
	 * total number of components in raid + 3 (1 for the raid itself,
	 * one for the hotspare, one for the end marker).
	 */
	num_recs = un->un_totalcolumncnt + 3;
	rid = 0;
	recids = kmem_alloc(num_recs * sizeof (mddb_recid_t), KM_SLEEP);
	recids[rid++] = mr_recid;

	MD_SID(un) = mnum;
	MD_RECID(un) = recids[0];
	MD_CAPAB(un) = MD_CAN_PARENT | MD_CAN_SP;
	MD_PARENT(un) = MD_NO_PARENT;
	un->un_resync_copysize = 0;
	un->c.un_revision |= MD_FN_META_DEV;

	if (UNIT_STATE(un) == RUS_INIT)
		MD_STATUS(un) |= MD_UN_GROW_PENDING;

	if ((UNIT_STATE(un) != RUS_INIT) && raid_check_pw(un)) {
		mddb_deleterec_wrapper(mr_recid);
		err = mderror(&msp->mde, MDE_RAID_INVALID);
		goto out;
	}

	if (err = raid_build_incore(un, 0)) {
		if (un->mr_ic) {
			kmem_free(un->un_column_ic, sizeof (mr_column_ic_t) *
			    un->un_totalcolumncnt);
			kmem_free(un->mr_ic, sizeof (*un->mr_ic));
		}

		md_nblocks_set(mnum, -1ULL);
		MD_UNIT(mnum) = NULL;

		mddb_deleterec_wrapper(mr_recid);
		goto out;
	}

	/*
	 * Update unit availability
	 */
	md_set[setno].s_un_avail--;

	recids[rid] = 0;
	if (un->un_hsp_id != -1) {
		/* increment the reference count of the hot spare pool */
		err = md_hot_spare_ifc(HSP_INCREF, un->un_hsp_id, 0, 0,
		    &recids[rid], NULL, NULL, NULL);
		if (err) {
			md_nblocks_set(mnum, -1ULL);
			MD_UNIT(mnum) = NULL;

			mddb_deleterec_wrapper(mr_recid);
			goto out;
		}
		rid++;
	}

	/*
	 * set the parent on any metadevice components.
	 * NOTE: currently soft partitions are the only metadevices
	 * which can appear within a RAID metadevice.
	 */
	for (col = 0; col < un->un_totalcolumncnt; col++) {
		mr_column_t	*mr_col = &un->un_column[col];
		md_unit_t	*comp_un;

		if (md_getmajor(mr_col->un_dev) == md_major) {
			comp_un = MD_UNIT(md_getminor(mr_col->un_dev));
			recids[rid++] = MD_RECID(comp_un);
			md_set_parent(mr_col->un_dev, MD_SID(un));
		}
	}

	/* set the end marker */
	recids[rid] = 0;

	mddb_commitrecs_wrapper(recids);
	md_create_unit_incore(mnum, &raid_md_ops, 1);

	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_CREATE, SVM_TAG_METADEVICE, setno,
	    MD_SID(un));

out:
	kmem_free(recids, (num_recs * sizeof (mddb_recid_t)));
	if (err)
		return (err);

	/* only attempt to init a device that is in the init state */
	if (UNIT_STATE(un) != RUS_INIT)
		return (0);

	return (raid_init_unit(mnum, &msp->mde));
}

/*
 * NAME:	raid_get
 * DESCRIPTION: used to get the unit structure of a RAID metadevice
 * PARAMETERS:	md_i_get_t *migp - pointer to get data structure
 *		int mode - must be FREAD
 *		IOLOCK *lock - pointer to IOCTL lock
 *
 * LOCKS:	obtains unit reader lock via IOLOCK
 *
 */
static int
raid_get(
	void		*migp,
	int		mode,
	IOLOCK		*lock
)
{
	minor_t		mnum;
	mr_unit_t	*un;
	md_i_get_t	*migph = migp;


	mnum = migph->id;

	mdclrerror(&migph->mde);

	if ((un = raid_getun(mnum, &migph->mde,
	    RD_LOCK, lock)) == NULL)
		return (0);

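	/* a size of zero is a query for the size of the unit structure */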
	if (migph->size == 0) {
		migph->size = un->c.un_size;
		return (0);
	}

	if (migph->size < un->c.un_size) {
		return (EFAULT);
	}
	if (ddi_copyout(un, (void *)(uintptr_t)migph->mdp,
	    un->c.un_size, mode))
		return (EFAULT);

	return (0);
}


/*
 * NAME:	raid_replace
 * DESCRIPTION: used to replace a component of a RAID metadevice
 * PARAMETERS:	replace_params_t *mrp - pointer to replace data structure
 *		IOLOCK *lock - pointer to IOCTL lock
 *
 * LOCKS:	obtains unit writer lock via IOLOCK (through raid_getun),
 *		obtains and releases md_unit_array_rw write lock
 *
 */
static int
raid_replace(
	replace_params_t	*mrp,
	IOLOCK			*lock
)
{
	minor_t		mnum = mrp->mnum;
	md_dev64_t	odev = mrp->old_dev;
	md_error_t	*ep = &mrp->mde;
	mr_unit_t	*un;
	rcs_state_t	state;
	int		ix, col = -1;
	int		force = 0;
	int		err = 0;
	replace_cmd_t	cmd;
	set_t		setno;
	side_t		side;
	mdkey_t		devkey;
	int		nkeys;
	mddb_recid_t	extra_recids[3] = { 0, 0, 0 };
	int		extra_rids = 0;
	md_error_t	mde = mdnullerror;
	sv_dev_t	sv = {MD_SET_BAD, MD_SIDEWILD, MD_KEYWILD};

	mdclrerror(ep);
	setno = MD_MIN2SET(mnum);
	side = mddb_getsidenum(setno);

	un = md_unit_readerlock(MDI_UNIT(mnum));

	if ((MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ||
	    (raid_state_cnt(un, RCS_RESYNC) != 0)) {
		md_unit_readerexit(MDI_UNIT(mnum));
		return (mdmderror(ep, MDE_RESYNC_ACTIVE, mnum));
	}

	if (un->un_state & RUS_DOI) {
		md_unit_readerexit(MDI_UNIT(mnum));
		return (mdmderror(ep, MDE_RAID_DOI, mnum));
	}

	if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT) ||
	    (MD_STATUS(un) & MD_UN_GROW_PENDING)) {
		md_unit_readerexit(MDI_UNIT(mnum));
		return (mdmderror(ep, MDE_IN_USE, mnum));
	}

	md_unit_readerexit(MDI_UNIT(mnum));

	/* get locks and recheck to be sure something did not change */
	if ((un = raid_getun(mnum, ep, WRITERS, lock)) == NULL)
		return (0);

	if (md_getkeyfromdev(setno, side, odev, &devkey, &nkeys) != 0) {
		return (mddeverror(ep, MDE_NAME_SPACE, odev));
	}

	for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
		md_dev64_t tmpdevt = un->un_column[ix].un_orig_dev;
		/*
		 * Try to resolve devt again if NODEV64
		 */
		if (tmpdevt == NODEV64) {
			tmpdevt = md_resolve_bydevid(mnum, tmpdevt,
			    un->un_column[ix].un_orig_key);
			un->un_column[ix].un_orig_dev = tmpdevt;
		}

		if (un->un_column[ix].un_orig_dev == odev) {
			col = ix;
			break;
		} else {
			if (un->un_column[ix].un_orig_dev == NODEV64) {
				/*
				 * Now we use the keys to match.
				 * If no key found, continue.
				 */
				if (nkeys == 0) {
					continue;
				}
				if (un->un_column[ix].un_orig_key == devkey) {
					if (nkeys > 1)
						return (mddeverror(ep,
						    MDE_MULTNM, odev));
					col = ix;
					break;
				}
			}
		}
	}

	if (col == -1)
		return (mdcomperror(ep, MDE_CANT_FIND_COMP,
		    mnum, odev));

	if ((MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ||
	    (raid_state_cnt(un, RCS_RESYNC) != 0))
		return (mdmderror(ep, MDE_RESYNC_ACTIVE, mnum));

	if (un->un_state & RUS_DOI)
		return (mdcomperror(ep, MDE_REPL_INVAL_STATE, mnum,
		    un->un_column[col].un_dev));

	if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT) ||
	    (MD_STATUS(un) & MD_UN_GROW_PENDING))
		return (mdmderror(ep, MDE_IN_USE, mnum));

	if ((mrp->cmd == FORCE_ENABLE_COMP) || (mrp->cmd == FORCE_REPLACE_COMP))
		force = 1;
	if ((mrp->cmd == FORCE_ENABLE_COMP) || (mrp->cmd == ENABLE_COMP))
		cmd = ENABLE_COMP;
	if ((mrp->cmd == FORCE_REPLACE_COMP) || (mrp->cmd == REPLACE_COMP))
		cmd = REPLACE_COMP;

	if (un->un_state == RUS_LAST_ERRED) {
		/* Must use -f force flag for unit in LAST_ERRED state */
		if (!force)
			return (mdmderror(ep, MDE_RAID_NEED_FORCE, mnum));

		/* Must use -f force flag on ERRED column first */
		if (un->un_column[col].un_devstate != RCS_ERRED) {
			for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
				if (un->un_column[ix].un_devstate & RCS_ERRED)
					return (mdcomperror(ep,
					    MDE_RAID_COMP_ERRED, mnum,
					    un->un_column[ix].un_dev));
			}
		}

		/* must use -f force flag on LAST_ERRED columns next */
		if ((un->un_column[col].un_devstate != RCS_LAST_ERRED) &&
		    (un->un_column[col].un_devstate != RCS_ERRED))
			return (mdcomperror(ep, MDE_RAID_COMP_ERRED,
			    mnum, un->un_column[col].un_dev));
	}

	if (un->un_state == RUS_ERRED) {
		if (! (un->un_column[col].un_devstate &
		    (RCS_ERRED | RCS_INIT_ERRED)))
			return (mdcomperror(ep, MDE_RAID_COMP_ERRED,
			    mnum, un->un_column[ix].un_dev));
	}

	ASSERT(!(un->un_column[col].un_devflags & MD_RAID_ALT_ISOPEN));
	ASSERT(!(un->un_column[col].un_devflags & MD_RAID_WRITE_ALT));

	state = un->un_column[col].un_devstate;
	if (state & RCS_INIT_ERRED) {
		MD_STATUS(un) |= MD_UN_GROW_PENDING;
		un->un_percent_done = 0;
		raid_set_state(un, col, RCS_INIT, 0);
	} else if (((mrp->options & MDIOCTL_NO_RESYNC_RAID) == 0) &&
	    resync_request(mnum, col, 0, ep))
		return (mdmderror(ep, MDE_RESYNC_ACTIVE, mnum));


	if (cmd == REPLACE_COMP) {
		md_dev64_t tmpdev = mrp->new_dev;

		/*
		 * open the device by device id
		 */
		tmpdev = md_resolve_bydevid(mnum, tmpdev, mrp->new_key);
		if (md_layered_open(mnum, &tmpdev, MD_OFLG_NULL)) {
			return (mdcomperror(ep, MDE_COMP_OPEN_ERR, mnum,
			    tmpdev));
		}

		/*
		 * If it's a metadevice, make sure it gets reparented
		 */
		if (md_getmajor(tmpdev) == md_major) {
			minor_t new_mnum = md_getminor(tmpdev);
			md_unit_t *new_un = MD_UNIT(new_mnum);

			md_set_parent(tmpdev, MD_SID(un));
			extra_recids[extra_rids++] = MD_RECID(new_un);
		}

		mrp->new_dev = tmpdev;
		un->un_column[col].un_orig_dev = tmpdev;
		un->un_column[col].un_orig_key = mrp->new_key;
		un->un_column[col].un_orig_pwstart = mrp->start_blk;
		un->un_column[col].un_orig_devstart =
		    mrp->start_blk + un->un_pwsize;

		/*
		 * If the old device was a metadevice, make sure to
		 * reset its parent.
		 */
		if (md_getmajor(odev) == md_major) {
			minor_t old_mnum = md_getminor(odev);
			md_unit_t *old_un = MD_UNIT(old_mnum);

			md_reset_parent(odev);
			extra_recids[extra_rids++] =
			    MD_RECID(old_un);
		}

		if (HOTSPARED(un, col)) {
			md_layered_close(mrp->new_dev, MD_OFLG_NULL);
			un->un_column[col].un_alt_dev = mrp->new_dev;
			un->un_column[col].un_alt_pwstart = mrp->start_blk;
			un->un_column[col].un_alt_devstart =
			    mrp->start_blk + un->un_pwsize;
			un->un_column[col].un_devflags |= MD_RAID_COPY_RESYNC;
		} else {
			/*
			 * not hot spared. Close the old device and
			 * move the new device in.
			 */
			if (un->un_column[col].un_devflags & MD_RAID_DEV_ISOPEN)
				md_layered_close(odev, MD_OFLG_NULL);
			un->un_column[col].un_devflags |= MD_RAID_DEV_ISOPEN;
			un->un_column[col].un_dev = mrp->new_dev;
			un->un_column[col].un_pwstart = mrp->start_blk;
			un->un_column[col].un_devstart =
			    mrp->start_blk + un->un_pwsize;
			if ((mrp->options & MDIOCTL_NO_RESYNC_RAID) == 0) {
				un->un_column[col].un_devflags |=
				    MD_RAID_REGEN_RESYNC;
			}
		}
		/*
		 * If the old device is not a metadevice then
		 * save off the set number and key so that it
		 * can be removed from the namespace later.
		 */
		if (md_getmajor(odev) != md_major) {
			sv.setno = setno;
			sv.key = devkey;
		}
	}

	if (cmd == ENABLE_COMP) {
		md_dev64_t tmpdev = un->un_column[col].un_orig_dev;
		mdkey_t raidkey = un->un_column[col].un_orig_key;

		/*
		 * We trust the dev_t because we cannot determine the
		 * dev_t from the device id since a new disk is in the
		 * same location. Since this is a call from metareplace -e dx
		 * and the device is SCSI, a new dev_t is not generated. So
		 * the dev_t from the mddb is used. Before enabling the device
		 * we check to make sure that multiple entries for the same
		 * device do not exist in the namespace. If they do, we
		 * fail the ioctl.
		 * One of the many ways multiple entries in the name space
		 * can occur is if one removed the failed component in a
		 * RAID metadevice and put in another disk that was part of
		 * another metadevice. After reboot metadevadm would correctly
		 * update the device name for the metadevice whose component
		 * has moved. However, now in the metadb there are two entries
		 * for the same name (ctds) that belong to different
		 * metadevices. One is valid, the other is a ghost or "last
		 * known as" ctds.
		 */
		tmpdev = md_resolve_bydevid(mnum, tmpdev, raidkey);
		if (tmpdev == NODEV64)
			tmpdev = md_getdevnum(setno, side, raidkey,
			    MD_TRUST_DEVT);
		/*
		 * check for multiple entries in namespace for the
		 * same dev
		 */

		if (md_getkeyfromdev(setno, side, tmpdev, &devkey,
		    &nkeys) != 0)
			return (mddeverror(ep, MDE_NAME_SPACE, tmpdev));
		/*
		 * If the number of keys is greater than 1, then we
		 * have an invalid namespace. STOP and return.
		 */
		if (nkeys > 1)
			return (mddeverror(ep, MDE_MULTNM, tmpdev));
		if (devkey != raidkey)
			return (mdcomperror(ep, MDE_CANT_FIND_COMP,
			    mnum, tmpdev));

		if (un->un_column[col].un_orig_dev == NODEV64)
			un->un_column[col].un_orig_dev = tmpdev;

		if (HOTSPARED(un, col)) {
			un->un_column[col].un_alt_dev =
			    un->un_column[col].un_orig_dev;
			un->un_column[col].un_alt_pwstart =
			    un->un_column[col].un_orig_pwstart;
			un->un_column[col].un_alt_devstart =
			    un->un_column[col].un_orig_devstart;
			un->un_column[col].un_devflags |= MD_RAID_COPY_RESYNC;
		} else {
			if (!(un->un_column[col].un_devflags &
			    MD_RAID_DEV_ISOPEN)) {
				if (md_layered_open(mnum, &tmpdev,
				    MD_OFLG_NULL)) {
					un->un_column[col].un_dev = tmpdev;
					return (mdcomperror(ep,
					    MDE_COMP_OPEN_ERR, mnum, tmpdev));
				}
				ASSERT(tmpdev != NODEV64 &&
				    tmpdev != 0);

				if ((md_getmajor(tmpdev) != md_major) &&
				    (md_devid_found(setno, side, raidkey)
				    == 1)) {
					if (md_update_namespace_did(setno, side,
					    raidkey, &mde) != 0) {
						cmn_err(CE_WARN,
						    "md: could not"
						    " update namespace\n");
					}
				}
				un->un_column[col].un_dev =
				    un->un_column[col].un_orig_dev;
			}
			un->un_column[col].un_devflags |= MD_RAID_DEV_ISOPEN;
			un->un_column[col].un_devflags |= MD_RAID_REGEN_RESYNC;
		}
	}
	if (mrp->has_label) {
		un->un_column[col].un_devflags |= MD_RAID_HAS_LABEL;
	} else {
		un->un_column[col].un_devflags &= ~MD_RAID_HAS_LABEL;
	}

	raid_commit(un, extra_recids);

	/* If the component has been replaced - clean up the name space */
	if (sv.setno != MD_SET_BAD) {
		md_rem_names(&sv, 1);
	}

	md_ioctl_droplocks(lock);

	if ((cmd == ENABLE_COMP) || (cmd == FORCE_ENABLE_COMP)) {
		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ENABLE, SVM_TAG_METADEVICE,
		    setno, MD_SID(un));
	} else {
		SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REPLACE, SVM_TAG_METADEVICE,
		    setno, MD_SID(un));
	}

	if (un->un_column[col].un_devstate & RCS_INIT)
		err = raid_init_unit(mnum, ep);
	else if ((mrp->options & MDIOCTL_NO_RESYNC_RAID) == 0)
		err = raid_resync_unit(mnum, ep);

	mdclrerror(ep);
	if (!err)
		return (0);

	/* be sure state */
	/* is already set by this time */
	/* fix state and commit record */
	un = md_unit_writerlock(MDI_UNIT(mnum));
	if (state & RCS_INIT_ERRED)
		raid_set_state(un, col, state, 1);
	else if (state & RCS_OKAY)
		raid_set_state(un, col, RCS_ERRED, 0);
	else
		raid_set_state(un, col, state, 1);
	raid_commit(un, NULL);
	md_unit_writerexit(MDI_UNIT(mnum));
	mdclrerror(ep);
	return (0);
}


/*
 * NAME:	raid_set_sync
 * DESCRIPTION: used to sync a component of a RAID metadevice
 * PARAMETERS:	md_resync_ioctl_t *rip - pointer to resync data structure
 *		IOLOCK *lock - pointer to IOCTL lock
 *
 * LOCKS:	obtains unit writer lock via IOLOCK (through raid_getun),
 *		obtains and releases md_unit_array_rw write lock
 *
 */
static int
raid_set_sync(
	md_resync_ioctl_t	*rip,
	IOLOCK			*lock
)
{
	minor_t		mnum = rip->ri_mnum;
	mr_unit_t	*un;
	int		init = 0;
	int		resync = 0;
	int		regen = 0;
	int		ix;
	int		err;

	mdclrerror(&rip->mde);

	if ((un = raid_getun(mnum, &rip->mde, WRITERS, lock)) == NULL)
		return (0);

	if (un->un_state & RUS_DOI)
		return (mdmderror(&rip->mde, MDE_RAID_DOI, mnum));

	if (un->c.un_status & MD_UN_RESYNC_ACTIVE)
		return (mdmderror(&rip->mde, MDE_RESYNC_ACTIVE, mnum));

	/* This prevents new opens */

	rip->ri_flags = 0;
	if (un->un_state & RUS_REGEN)
		regen++;

	if (raid_state_cnt(un, RCS_RESYNC))
		resync++;

	if (raid_state_cnt(un, RCS_INIT) || (un->un_state & RUS_INIT))
		init++;

	ASSERT(!(resync && init && regen));
	md_ioctl_droplocks(lock);
	rip->ri_percent_done = 0;

	if (init) {
		MD_STATUS(un) |= MD_UN_GROW_PENDING;
		return (raid_init_unit(mnum, &rip->mde));
	}

	/*
	 * If resync is needed, it will call raid_internal_open forcing
	 * replay before the open completes.
	 * Otherwise, call raid_internal_open directly to force
	 * replay to complete during boot (metasync -r).
	 * NOTE: the unit writer lock must remain held while setting
	 *	 MD_UN_RESYNC_ACTIVE but must be released before
	 *	 calling raid_resync_unit or raid_internal_open.
	 */
	if (resync) {
		ASSERT(resync < 2);
		un = md_unit_writerlock(MDI_UNIT(mnum));
		MD_STATUS(un) |= MD_UN_RESYNC_ACTIVE;
		/* Must release unit writer lock for resync */
		/*
		 * correctly setup the devices before trying to start the
		 * resync operation.
		 */
		for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
			if (un->un_column[ix].un_devstate & RCS_RESYNC) {
				if ((un->un_column[ix].un_devflags &
				    MD_RAID_COPY_RESYNC) &&
				    HOTSPARED(un, ix)) {
					un->un_column[ix].un_alt_dev =
					    un->un_column[ix].un_orig_dev;
					un->un_column[ix].un_alt_devstart =
					    un->un_column[ix].un_orig_devstart;
					un->un_column[ix].un_alt_pwstart =
					    un->un_column[ix].un_orig_pwstart;
				}
				break;
			}
		}
		ASSERT(un->un_column[ix].un_devflags &
		    (MD_RAID_COPY_RESYNC | MD_RAID_REGEN_RESYNC));
		rip->ri_percent_done = 0;
		un->un_column[ix].un_devflags |= MD_RAID_RESYNC;
		(void) resync_request(mnum, ix, 0, NULL);
		md_unit_writerexit(MDI_UNIT(mnum));
		err = raid_resync_unit(mnum, &rip->mde);
		return (err);
	}

	if (regen) {
		err = raid_regen_unit(mnum, &rip->mde);
		return (err);
	}

	/* The unit requires no work, so just force replay of the device */
	if (raid_internal_open(mnum, (FREAD | FWRITE), OTYP_LYR, 0))
		return (mdmderror(&rip->mde,
		    MDE_RAID_OPEN_FAILURE, mnum));
	(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);

	return (0);
}

/*
 * NAME:	raid_get_resync
 * DESCRIPTION: used to check resync status on a component of a RAID
 *		metadevice
 * PARAMETERS:	md_resync_ioctl_t *rip - pointer to resync data structure
 *		IOLOCK *lock - pointer to IOCTL lock
 *
 * LOCKS:	obtains unit reader lock via IOLOCK (through raid_getun)
 *
 */
static int
raid_get_resync(
	md_resync_ioctl_t	*rip,
	IOLOCK			*lock
)
{
	minor_t		mnum = rip->ri_mnum;
	mr_unit_t	*un;
	u_longlong_t	percent;
	int		cnt;
	int		ix;
	uint64_t	d;

	mdclrerror(&rip->mde);

	if ((un = raid_getun(mnum, &rip->mde, RD_LOCK, lock)) == NULL)
		return (0);

	rip->ri_flags = 0;
	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) {
		d = un->un_segsincolumn;
		percent = d ? ((1000 * un->un_resync_line_index) / d) : 0;
		if (percent > 1000)
			percent = 1000;	/* can't go over 100% */
		rip->ri_percent_done = (int)percent;
		rip->ri_flags |= MD_RI_INPROGRESS;
	}

	if (UNIT_STATE(un) & RUS_INIT) {
		d = un->un_segsize * un->un_segsincolumn *
		    un->un_totalcolumncnt;
		percent =
		    d ? ((1000 * (u_longlong_t)un->un_init_iocnt) / d) : 0;
		if (percent > 1000)
			percent = 1000;	/* can't go over 100% */
		rip->ri_percent_done = (int)percent;
		rip->ri_flags |= MD_GROW_INPROGRESS;
	} else if (MD_STATUS(un) & MD_UN_GROW_PENDING) {
		d = un->un_segsize * un->un_segsincolumn * un->un_init_colcnt;
		percent =
		    d ? (((u_longlong_t)un->un_init_iocnt * 1000) / d) : 0;
		if (percent > 1000)
			percent = 1000;
		rip->ri_percent_done = (int)percent;
		rip->ri_flags |= MD_GROW_INPROGRESS;
	}

	if (un->un_state & RUS_REGEN)
		rip->ri_percent_done = un->un_percent_done;

	cnt = 0;
	for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
		switch (un->un_column[ix].un_devstate) {
		case RCS_INIT:
		case RCS_ERRED:
		case RCS_LAST_ERRED:
			cnt++;
			break;
		default:
			break;
		}
	}
	d = un->un_totalcolumncnt;
	rip->ri_percent_dirty = d ? (((u_longlong_t)cnt * 100) / d) : 0;
	return (0);
}

/*
 * NAME:	raid_grow
 * DESCRIPTION: Concatenate to a RAID metadevice
 * PARAMETERS:	md_grow_params_t *mgp
 *			- pointer to IOCGROW data structure
 *		int mode - must be FWRITE
 *		IOLOCK *lock - IOCTL read/write and unit_array_rw lock
 *
 * LOCKS:	obtains unit writer lock via IOLOCK (through raid_getun),
 *		obtains and releases md_unit_array_rw write lock
 *
 */
static int
raid_grow(void *mgp, int mode, IOLOCK *lock)
{
	minor_t		mnum;
	mr_unit_t	*un, *new_un;
	mdi_unit_t	*ui;
	mddb_type_t	typ1;
	mddb_recid_t	mr_recid;
	mddb_recid_t	old_vtoc = 0;
	mddb_recid_t	*recids;
	md_create_rec_option_t	options;
	int		err;
	int		col, i;
	int64_t		tb, atb;
	u_longlong_t	unrev;
	int		tc;
	int		rval = 0;
	set_t		setno;
	mr_column_ic_t	*mrc;
	int		num_recs, rid;
	md_grow_params_t	*mgph = mgp;


	mnum = mgph->mnum;

	mdclrerror(&mgph->mde);

	ui = MDI_UNIT(mnum);
	un = md_unit_readerlock(ui);

	if (MD_STATUS(un) & MD_UN_GROW_PENDING) {
		md_unit_readerexit(ui);
		return (mdmderror(&mgph->mde, MDE_IN_USE, mnum));
	}

	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) {
		md_unit_readerexit(ui);
		return (mdmderror(&mgph->mde, MDE_RESYNC_ACTIVE, mnum));
	}

	if (UNIT_STATE(un) & RUS_LAST_ERRED) {
		md_unit_readerexit(ui);
		return (mdmderror(&mgph->mde, MDE_RAID_LAST_ERRED, mnum));
	}

	if (UNIT_STATE(un) & RUS_DOI) {
		md_unit_readerexit(ui);
		return (mdmderror(&mgph->mde, MDE_RAID_DOI, mnum));
	}

	if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT)) {
		md_unit_readerexit(ui);
		return (mdmderror(&mgph->mde, MDE_IN_USE, mnum));
	}

	md_unit_readerexit(ui);

	if ((un = raid_getun(mnum, &mgph->mde, WRITERS, lock)) ==
	    NULL)
		return (0);

	if (MD_STATUS(un) & MD_UN_GROW_PENDING)
		return (mdmderror(&mgph->mde, MDE_IN_USE, mnum));

	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
		return (mdmderror(&mgph->mde, MDE_RESYNC_ACTIVE, mnum));

	if (un->c.un_size >= mgph->size)
		return (EINVAL);

	if (UNIT_STATE(un) & RUS_LAST_ERRED)
		return (mdmderror(&mgph->mde, MDE_RAID_LAST_ERRED, mnum));

	if (UNIT_STATE(un) & RUS_DOI)
		return (mdmderror(&mgph->mde, MDE_RAID_DOI, mnum));

	if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT))
		return (mdmderror(&mgph->mde, MDE_IN_USE, mnum));

	setno = MD_MIN2SET(mnum);

	typ1 = (mddb_type_t)md_getshared_key(setno,
	    raid_md_ops.md_driver.md_drivername);

	/*
	 * Preserve the friendly name nature of the device that is
	 * growing.
	 */
	options = MD_CRO_RAID;
	if (un->c.un_revision & MD_FN_META_DEV)
		options |= MD_CRO_FN;
	if (mgph->options & MD_CRO_64BIT) {
#if defined(_ILP32)
		return (mdmderror(&mgph->mde, MDE_UNIT_TOO_LARGE, mnum));
#else
		mr_recid = mddb_createrec(mgph->size, typ1, 0,
		    MD_CRO_64BIT | options, setno);
#endif
	} else {
		mr_recid = mddb_createrec(mgph->size, typ1, 0,
		    MD_CRO_32BIT | options, setno);
	}
	if (mr_recid < 0) {
		rval = mddbstatus2error(&mgph->mde, (int)mr_recid,
		    mnum, setno);
		return (rval);
	}

	/* get the address of the new unit */
	new_un = (mr_unit_t *)mddb_getrecaddr(mr_recid);

	/*
	 * It is okay that we muck with the new unit here,
	 * since no one else will know about the unit struct
	 * until we commit it. If we crash, the record will
	 * be automatically purged, since we haven't
	 * committed it yet and the old unit struct will be found.
	 */

	/* copy in the user's unit struct */
	err = ddi_copyin((void *)(uintptr_t)mgph->mdp, new_un,
	    mgph->size, mode);
	if (err) {
		mddb_deleterec_wrapper(mr_recid);
		return (EFAULT);
	}

	/* make sure columns are being added */
	if (un->un_totalcolumncnt >= new_un->un_totalcolumncnt) {
		mddb_deleterec_wrapper(mr_recid);
		return (EINVAL);
	}

	/*
	 * Save a few of the new unit struct's fields before they
	 * get clobbered.
	 */
	tc = new_un->un_totalcolumncnt;
	tb = new_un->c.un_total_blocks;
	atb = new_un->c.un_actual_tb;
	unrev = new_un->c.un_revision;

	/*
	 * Copy the old unit struct (static stuff)
	 * into new unit struct
	 */
	bcopy((caddr_t)un, (caddr_t)new_un, un->c.un_size);

	/*
	 * Restore a few of the new unit struct values.
	 */
	new_un->un_totalcolumncnt = tc;
	new_un->c.un_actual_tb = atb;
	new_un->un_grow_tb = tb;
	new_un->c.un_revision = unrev;
	new_un->c.un_record_id = mr_recid;
	new_un->c.un_size = mgph->size;

	ASSERT(new_un->mr_ic == un->mr_ic);

	/*
	 * Save old column slots
	 */
	mrc = un->un_column_ic;

	/*
	 * Allocate new column slot
	 */
	new_un->un_column_ic = (mr_column_ic_t *)
	    kmem_zalloc(sizeof (mr_column_ic_t) * new_un->un_totalcolumncnt,
	    KM_SLEEP);

	/*
	 * Restore old column slots
	 * Free the old column slots
	 */
	bcopy(mrc, new_un->un_column_ic,
	    sizeof (mr_column_ic_t) * un->un_totalcolumncnt);
	kmem_free(mrc, sizeof (mr_column_ic_t) * un->un_totalcolumncnt);

	/* All 64 bit metadevices only support EFI labels. */
	if (mgph->options & MD_CRO_64BIT) {
		new_un->c.un_flag |= MD_EFILABEL;
		/*
		 * If the device was previously smaller than a terabyte,
		 * and had a vtoc record attached to it, we remove the
		 * vtoc record, because the layout has changed completely.
		 */
		if (((un->c.un_revision & MD_64BIT_META_DEV) == 0) &&
		    (un->c.un_vtoc_id != 0)) {
			old_vtoc = un->c.un_vtoc_id;
			new_un->c.un_vtoc_id =
			    md_vtoc_to_efi_record(old_vtoc, setno);
		}
	}


	/*
	 * allocate the real recids array. since we may have to commit
	 * underlying metadevice records, we need an array of size:
	 * total number of new components being attached + 2 (one for
	 * the raid itself, one for the end marker).
	 */
	num_recs = new_un->un_totalcolumncnt + 2;
	rid = 0;
	recids = kmem_alloc(num_recs * sizeof (mddb_recid_t), KM_SLEEP);
	recids[rid++] = mr_recid;

	for (col = un->un_totalcolumncnt;
	    (col < new_un->un_totalcolumncnt); col++) {
		mr_column_t	*mr_col = &new_un->un_column[col];
		md_unit_t	*comp_un;

		if (raid_build_pw_reservation(new_un, col) != 0) {
			/* release pwslots already allocated by grow */
			for (i = un->un_totalcolumncnt; i < col; i++) {
				raid_free_pw_reservation(new_un, i);
			}
			kmem_free(new_un->un_column_ic,
			    sizeof (mr_column_ic_t) *
			    new_un->un_totalcolumncnt);
			kmem_free(new_un->mr_ic, sizeof (*un->mr_ic));
			kmem_free(recids, num_recs * sizeof (mddb_recid_t));
			mddb_deleterec_wrapper(mr_recid);
			return (EINVAL);
		}
		/*
		 * set parent on metadevices being added.
		 * NOTE: currently soft partitions are the only metadevices
		 * which can appear within a RAID metadevice.
		 */
		if (md_getmajor(mr_col->un_dev) == md_major) {
			comp_un = MD_UNIT(md_getminor(mr_col->un_dev));
			recids[rid++] = MD_RECID(comp_un);
			md_set_parent(mr_col->un_dev, MD_SID(new_un));
		}
		new_un->un_column[col].un_devflags = 0;
	}

	/* set end marker */
	recids[rid] = 0;

	/* commit new unit struct */
	mddb_commitrecs_wrapper(recids);

	/* delete old unit struct */
	mddb_deleterec_wrapper(un->c.un_record_id);

	/* place new unit in in-core array */
	md_nblocks_set(mnum, new_un->c.un_total_blocks);
	MD_UNIT(mnum) = new_un;

1928 /*
1929 	 * If old_vtoc has a non-zero value, we know:
1930 	 * - this unit just crossed the one terabyte boundary,
1931 	 * - there was a vtoc record for the unit, and
1932 	 * - that vtoc record is no longer needed, because a new
1933 	 *   EFI record has been created for this unit.
1934 */
1935 if (old_vtoc != 0) {
1936 mddb_deleterec_wrapper(old_vtoc);
1937 }
1938
1939 /* free recids */
1940 kmem_free(recids, num_recs * sizeof (mddb_recid_t));
1941
1942 SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_GROW, SVM_TAG_METADEVICE,
1943 MD_UN2SET(new_un), MD_SID(new_un));
1944 MD_STATUS(new_un) |= MD_UN_GROW_PENDING;
1945
1946 /*
1947 	 * Since md_ioctl_writelock acquires the unit write lock
1948 	 * and open/close acquires the unit reader lock, it is
1949 	 * necessary to drop the unit write lock here and then
1950 	 * reacquire it as needed later.
1951 */
1952 md_unit_writerexit(ui);
1953
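	/*
	 * Open the unit internally so the pre-write areas of the
	 * newly attached columns can be initialized, then close it
	 * again below.
	 */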
1954 if (raid_internal_open(mnum, (FREAD | FWRITE), OTYP_LYR, 0)) {
1955 rval = mdmderror(&mgph->mde, MDE_RAID_OPEN_FAILURE, mnum);
1956 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE,
1957 MD_UN2SET(new_un), MD_SID(new_un));
1958 return (rval);
1959 }
1960 (void) md_unit_writerlock(ui);
1961 for (i = 0; i < new_un->un_totalcolumncnt; i++) {
1962 if (new_un->un_column[i].un_devstate & RCS_OKAY)
1963 (void) init_pw_area(new_un, new_un->un_column[i].un_dev,
1964 new_un->un_column[i].un_pwstart, i);
1965 }
1966 md_unit_writerexit(ui);
1967 (void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
1968 (void) md_unit_writerlock(ui);
1969 /* create a background thread to initialize the columns */
1970 md_ioctl_droplocks(lock);
1971
1972 return (raid_init_unit(mnum, &mgph->mde));
1973 }
1974
1975 /*
1976 * NAME: raid_reset
1977 * DESCRIPTION: used to reset (clear / remove) a RAID metadevice
1978 * PARAMETERS: md_i_reset_t *mirp - pointer to reset data structure
1979 *
1980 * LOCKS: obtains and releases md_unit_array_rw write lock
1981 *
1982 */
1983 static int
1984 raid_reset(md_i_reset_t *mirp)
1985 {
1986 minor_t mnum = mirp->mnum;
1987 mr_unit_t *un;
1988 mdi_unit_t *ui;
1989 set_t setno = MD_MIN2SET(mnum);
1990
1991 mdclrerror(&mirp->mde);
1992
1993 rw_enter(&md_unit_array_rw.lock, RW_WRITER);
1994 /*
1995 * NOTE: need to get md_unit_writerlock to avoid conflict
1996 * with raid_init thread.
1997 */
1998 if ((un = raid_getun(mnum, &mirp->mde, NO_LOCK, NULL)) ==
1999 NULL) {
2000 rw_exit(&md_unit_array_rw.lock);
2001 return (0);
2002 }
2003 ui = MDI_UNIT(mnum);
2004
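	/*
	 * A unit that has a parent is a component of another
	 * metadevice and cannot be reset directly.
	 */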
2005 if (MD_HAS_PARENT(MD_PARENT(un))) {
2006 rw_exit(&md_unit_array_rw.lock);
2007 return (mdmderror(&mirp->mde, MDE_IN_USE, mnum));
2008 }
2009
2010 un = (mr_unit_t *)md_unit_openclose_enter(ui);
2011 if (md_unit_isopen(MDI_UNIT(mnum))) {
2012 md_unit_openclose_exit(ui);
2013 rw_exit(&md_unit_array_rw.lock);
2014 return (mdmderror(&mirp->mde, MDE_IS_OPEN, mnum));
2015 }
2016 md_unit_openclose_exit(ui);
2017 if (UNIT_STATE(un) != RUS_OKAY && !mirp->force) {
2018 rw_exit(&md_unit_array_rw.lock);
2019 return (mdmderror(&mirp->mde, MDE_RAID_NEED_FORCE, mnum));
2020 }
2021
2022 reset_raid(un, mnum, 1);
2023
2024 /*
2025 * Update unit availability
2026 */
2027 md_set[setno].s_un_avail++;
2028
2029 /*
2030 	 * If this is a multi-node set, reset s_un_next so all
2031 	 * nodes have the same view of the next available slot
2032 	 * after nodes are withdrawn (-w) and rejoined (-j).
2033 */
2034 if (MD_MNSET_SETNO(setno)) {
2035 (void) md_upd_set_unnext(setno, MD_MIN2UNIT(mnum));
2036 }
2037
2038 rw_exit(&md_unit_array_rw.lock);
2039
2040 return (0);
2041 }
2042
2043 /*
2044 * NAME: raid_get_geom
2045 * DESCRIPTION: used to get the geometry of a RAID metadevice
2046 * PARAMETERS: mr_unit_t *un - RAID unit to get the geometry for
2047 * struct dk_geom *gp - pointer to geometry data structure
2048 *
2049 * LOCKS: none
2050 *
2051 */
2052 static int
2053 raid_get_geom(
2054 mr_unit_t *un,
2055 struct dk_geom *geomp
2056 )
2057 {
2058 md_get_geom((md_unit_t *)un, geomp);
2059
2060 return (0);
2061 }
2062
2063 /*
2064 * NAME: raid_get_vtoc
2065 * DESCRIPTION: used to get the VTOC on a RAID metadevice
2066 * PARAMETERS: mr_unit_t *un - RAID unit to get the VTOC from
2067 * struct vtoc *vtocp - pointer to VTOC data structure
2068 *
2069 * LOCKS: none
2070 *
2071 */
2072 static int
2073 raid_get_vtoc(
2074 mr_unit_t *un,
2075 struct vtoc *vtocp
2076 )
2077 {
2078 md_get_vtoc((md_unit_t *)un, vtocp);
2079
2080 return (0);
2081 }
2082
2083 /*
2084 * NAME: raid_set_vtoc
2085 * DESCRIPTION: used to set the VTOC on a RAID metadevice
2086 * PARAMETERS: mr_unit_t *un - RAID unit to set the VTOC on
2087 * struct vtoc *vtocp - pointer to VTOC data structure
2088 *
2089 * LOCKS: none
2090 *
2091 */
2092 static int
2093 raid_set_vtoc(
2094 mr_unit_t *un,
2095 struct vtoc *vtocp
2096 )
2097 {
2098 return (md_set_vtoc((md_unit_t *)un, vtocp));
2099 }
2100
2101
2102 /*
2103 * NAME: raid_get_extvtoc
2104 * DESCRIPTION: used to get the extended VTOC on a RAID metadevice
2105 * PARAMETERS: mr_unit_t *un - RAID unit to get the VTOC from
2106 * struct extvtoc *vtocp - pointer to extended VTOC data structure
2107 *
2108 * LOCKS: none
2109 *
2110 */
2111 static int
2112 raid_get_extvtoc(
2113 mr_unit_t *un,
2114 struct extvtoc *vtocp
2115 )
2116 {
2117 md_get_extvtoc((md_unit_t *)un, vtocp);
2118
2119 return (0);
2120 }
2121
2122 /*
2123 * NAME: raid_set_extvtoc
2124 * DESCRIPTION: used to set the extended VTOC on a RAID metadevice
2125 * PARAMETERS: mr_unit_t *un - RAID unit to set the VTOC on
2126 * struct extvtoc *vtocp - pointer to extended VTOC data structure
2127 *
2128 * LOCKS: none
2129 *
2130 */
2131 static int
2132 raid_set_extvtoc(
2133 mr_unit_t *un,
2134 struct extvtoc *vtocp
2135 )
2136 {
2137 return (md_set_extvtoc((md_unit_t *)un, vtocp));
2138 }
2139
2140
2141
2142 /*
2143 * NAME: raid_get_cgapart
2144 * DESCRIPTION: used to get the dk_map on a RAID metadevice
2145  * PARAMETERS:	mr_unit_t *un - RAID unit to get the dk_map from
2146  *		struct dk_map *dkmapp - pointer to dk_map data structure
2147 *
2148 * LOCKS: none
2149 *
2150 */
2151
2152 static int
2153 raid_get_cgapart(
2154 mr_unit_t *un,
2155 struct dk_map *dkmapp
2156 )
2157 {
2158 md_get_cgapart((md_unit_t *)un, dkmapp);
2159 return (0);
2160 }
2161
2162 /*
2163 * NAME: raid_getdevs
2164 * DESCRIPTION: return all devices within a RAID metadevice
2165 * PARAMETERS: md_getdevs_params_t *mgdp
2166 * - pointer to getdevs IOCTL data structure
2167 * int mode - should be FREAD
2168 * IOLOCK *lockp - IOCTL read/write lock
2169 *
2170 * LOCKS: obtains unit reader lock via IOLOCK
2171 *
2172 */
2173 static int
2174 raid_getdevs(
2175 void *mgdp,
2176 int mode,
2177 IOLOCK *lock
2178 )
2179 {
2180 minor_t mnum;
2181 mr_unit_t *un;
2182 md_dev64_t *udevs;
2183 int i, cnt;
2184 md_dev64_t unit_dev;
2185 md_getdevs_params_t *mgdph = mgdp;
2186
2187
2188 mnum = mgdph->mnum;
2189
2190 /* check out unit */
2191 mdclrerror(&mgdph->mde);
2192
2193 if ((un = raid_getun(mnum, &mgdph->mde, RD_LOCK, lock)) == NULL)
2194 return (0);
2195
2196 udevs = (md_dev64_t *)(uintptr_t)mgdph->devs;
2197
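	/*
	 * For every column report the original device; if the column
	 * is currently hotspared, also report the hot spare device.
	 * cnt counts every device, copied out or not, so the caller
	 * can learn the required array size.
	 */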
2198 for (cnt = 0, i = 0; i < un->un_totalcolumncnt; i++, cnt++) {
2199 if (cnt < mgdph->cnt) {
2200 unit_dev = un->un_column[i].un_orig_dev;
2201 if (md_getmajor(unit_dev) != md_major) {
2202 if ((unit_dev = md_xlate_mini_2_targ
2203 (unit_dev)) == NODEV64)
2204 return (ENODEV);
2205 }
2206
2207 if (ddi_copyout((caddr_t)&unit_dev,
2208 (caddr_t)&udevs[cnt], sizeof (*udevs), mode) != 0)
2209 return (EFAULT);
2210 }
2211 if (HOTSPARED(un, i)) {
2212 cnt++;
2213 if (cnt >= mgdph->cnt)
2214 continue;
2215
2216 unit_dev = un->un_column[i].un_dev;
2217 if (md_getmajor(unit_dev) != md_major) {
2218 if ((unit_dev = md_xlate_mini_2_targ
2219 (unit_dev)) == NODEV64)
2220 return (ENODEV);
2221 }
2222
2223 if (ddi_copyout((caddr_t)&unit_dev,
2224 (caddr_t)&udevs[cnt], sizeof (*udevs), mode) != 0)
2225 return (EFAULT);
2226 }
2227 }
2228 mgdph->cnt = cnt;
2229 return (0);
2230 }
2231
2232 /*
2233 * NAME: raid_change
2234 * DESCRIPTION: used to change the following dynamic values:
2235 * the hot spare pool
2236 * in the unit structure of a RAID metadevice
2237  * PARAMETERS:	md_raid_params_t *mrp - pointer to change data structure
2238 * IOLOCK *lock - pointer to IOCTL lock
2239 *
2240 * LOCKS: obtains unit writer lock via IOLOCK (through raid_getun)
2241 *
2242 */
2243 static int
2244 raid_change(
2245 md_raid_params_t *mrp,
2246 IOLOCK *lock
2247 )
2248 {
2249 minor_t mnum = mrp->mnum;
2250 mr_unit_t *un;
2251 int ix;
2252 mddb_recid_t recids[3] = {0, 0, 0};
2253 int err;
2254 int irecid;
2255 int inc_new_hsp = 0;
2256
2257 mdclrerror(&mrp->mde);
2258
2259 if ((un = raid_getun(mnum, &mrp->mde, WR_LOCK, lock)) == NULL)
2260 return (0);
2261
2262 if (!mrp->params.change_hsp_id)
2263 return (0);
2264
2265 /* verify that no hotspare is in use */
2266 for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
2267 if (HOTSPARED(un, ix)) {
2268 return (mdmderror(&mrp->mde, MDE_HS_IN_USE, mnum));
2269 }
2270 }
2271
2272 /* replace the hot spare pool */
2273
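	/*
	 * recids[] holds at most two hot spare pool records (the
	 * incref of the new pool and the decref of the old one) plus
	 * a terminating zero; they are committed together with the
	 * unit record by raid_commit() below.
	 */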
2274 irecid = 0;
2275 if (mrp->params.hsp_id != -1) {
2276 /* increment the reference count of the new hsp */
2277 err = md_hot_spare_ifc(HSP_INCREF, mrp->params.hsp_id, 0, 0,
2278 &recids[0], NULL, NULL, NULL);
2279 if (err) {
2280 return (mdhsperror(&mrp->mde, MDE_INVAL_HSP,
2281 mrp->params.hsp_id));
2282 }
2283 inc_new_hsp = 1;
2284 irecid++;
2285 }
2286
2287 if (un->un_hsp_id != -1) {
2288 /* decrement the reference count of the old hsp */
2289 err = md_hot_spare_ifc(HSP_DECREF, un->un_hsp_id, 0, 0,
2290 &recids[irecid], NULL, NULL, NULL);
2291 if (err) {
2292 			err = mdhsperror(&mrp->mde, MDE_INVAL_HSP,
2293 			    un->un_hsp_id);
2294 if (inc_new_hsp) {
2295 (void) md_hot_spare_ifc(HSP_DECREF,
2296 mrp->params.hsp_id, 0, 0,
2297 &recids[0], NULL, NULL, NULL);
2298 /*
2299 * Don't need to commit the record,
2300 * because it wasn't committed before
2301 */
2302 }
2303 return (err);
2304 }
2305 }
2306
2307 un->un_hsp_id = mrp->params.hsp_id;
2308
2309 raid_commit(un, recids);
2310 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_CHANGE, SVM_TAG_METADEVICE,
2311 MD_UN2SET(un), MD_SID(un));
2312
2313 /* Now trigger hot spare processing in case one is needed. */
2314 if ((un->un_hsp_id != -1) && (un->un_state == RUS_ERRED))
2315 (void) raid_hotspares();
2316
2317 return (0);
2318 }
2319
2320 /*
2321 * NAME: raid_admin_ioctl
2322 * DESCRIPTION: IOCTL operations unique to metadevices and RAID
2323 * PARAMETERS: int cmd - IOCTL command to be executed
2324 * void *data - pointer to IOCTL data structure
2325 * int mode - either FREAD or FWRITE
2326 * IOLOCK *lockp - IOCTL read/write lock
2327 *
2328 * LOCKS: none
2329 *
2330 */
2331 static int
2332 raid_admin_ioctl(
2333 int cmd,
2334 void *data,
2335 int mode,
2336 IOLOCK *lockp
2337 )
2338 {
2339 size_t sz = 0;
2340 void *d = NULL;
2341 int err = 0;
2342
2343 /* We can only handle 32-bit clients for internal commands */
2344 if ((mode & DATAMODEL_MASK) != DATAMODEL_ILP32) {
2345 return (EINVAL);
2346 }
2347
2348
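	/*
	 * Each case below copies the user's argument structure into
	 * kernel memory, dispatches to the appropriate handler, and
	 * relies on the common exit path at the bottom to copy the
	 * (possibly updated) structure back out and free it.
	 */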
2349 /* dispatch ioctl */
2350 switch (cmd) {
2351
2352 case MD_IOCSET:
2353 {
2354 if (! (mode & FWRITE))
2355 return (EACCES);
2356
2357 sz = sizeof (md_set_params_t);
2358 d = kmem_alloc(sz, KM_SLEEP);
2359
2360 if (ddi_copyin(data, d, sz, mode)) {
2361 err = EFAULT;
2362 break;
2363 }
2364
2365 err = raid_set(d, mode);
2366 break;
2367 }
2368
2369 case MD_IOCGET:
2370 {
2371 if (! (mode & FREAD))
2372 return (EACCES);
2373
2374 sz = sizeof (md_i_get_t);
2375 d = kmem_alloc(sz, KM_SLEEP);
2376
2377 if (ddi_copyin(data, d, sz, mode)) {
2378 err = EFAULT;
2379 break;
2380 }
2381
2382 err = raid_get(d, mode, lockp);
2383 break;
2384 }
2385
2386 case MD_IOCREPLACE:
2387 {
2388 if (! (mode & FWRITE))
2389 return (EACCES);
2390
2391 sz = sizeof (replace_params_t);
2392 d = kmem_alloc(sz, KM_SLEEP);
2393
2394 if (ddi_copyin(data, d, sz, mode)) {
2395 err = EFAULT;
2396 break;
2397 }
2398
2399 err = raid_replace((replace_params_t *)d, lockp);
2400 break;
2401 }
2402
2403 case MD_IOCSETSYNC:
2404 {
2405 if (! (mode & FWRITE))
2406 return (EACCES);
2407
2408 sz = sizeof (md_resync_ioctl_t);
2409 d = kmem_alloc(sz, KM_SLEEP);
2410
2411 if (ddi_copyin(data, d, sz, mode)) {
2412 err = EFAULT;
2413 break;
2414 }
2415
2416 err = raid_set_sync((md_resync_ioctl_t *)d, lockp);
2417 break;
2418 }
2419
2420 case MD_IOCGETSYNC:
2421 {
2422 if (! (mode & FREAD))
2423 return (EACCES);
2424
2425 sz = sizeof (md_resync_ioctl_t);
2426 d = kmem_alloc(sz, KM_SLEEP);
2427
2428 if (ddi_copyin(data, d, sz, mode)) {
2429 err = EFAULT;
2430 break;
2431 }
2432 err = raid_get_resync((md_resync_ioctl_t *)d, lockp);
2433
2434 break;
2435 }
2436
2437 case MD_IOCGROW:
2438 {
2439 if (! (mode & FWRITE))
2440 return (EACCES);
2441
2442 sz = sizeof (md_grow_params_t);
2443 d = kmem_alloc(sz, KM_SLEEP);
2444
2445 if (ddi_copyin(data, d, sz, mode)) {
2446 err = EFAULT;
2447 break;
2448 }
2449
2450 err = raid_grow(d, mode, lockp);
2451 break;
2452 }
2453
2454 case MD_IOCCHANGE:
2455 {
2456 if (! (mode & FWRITE))
2457 return (EACCES);
2458
2459 sz = sizeof (md_raid_params_t);
2460 d = kmem_alloc(sz, KM_SLEEP);
2461
2462 if (ddi_copyin(data, d, sz, mode)) {
2463 err = EFAULT;
2464 break;
2465 }
2466
2467 err = raid_change((md_raid_params_t *)d, lockp);
2468 break;
2469 }
2470
2471 case MD_IOCRESET:
2472 {
2473 if (! (mode & FWRITE))
2474 return (EACCES);
2475
2476 sz = sizeof (md_i_reset_t);
2477 d = kmem_alloc(sz, KM_SLEEP);
2478
2479 if (ddi_copyin(data, d, sz, mode)) {
2480 err = EFAULT;
2481 break;
2482 }
2483
2484 err = raid_reset((md_i_reset_t *)d);
2485 break;
2486 }
2487
2488 case MD_IOCGET_DEVS:
2489 {
2490 if (! (mode & FREAD))
2491 return (EACCES);
2492
2493 sz = sizeof (md_getdevs_params_t);
2494 d = kmem_alloc(sz, KM_SLEEP);
2495
2496 if (ddi_copyin(data, d, sz, mode)) {
2497 err = EFAULT;
2498 break;
2499 }
2500
2501 err = raid_getdevs(d, mode, lockp);
2502 break;
2503 }
2504
2505 case MD_IOCSETREGEN:
2506 {
2507 if (! (mode & FWRITE))
2508 return (EACCES);
2509
2510 sz = sizeof (md_regen_param_t);
2511 d = kmem_alloc(sz, KM_SLEEP);
2512
2513 if (ddi_copyin(data, d, sz, mode)) {
2514 err = EFAULT;
2515 break;
2516 }
2517
2518 err = raid_regen((md_regen_param_t *)d, lockp);
2519 break;
2520 }
2521
2522 case MD_IOCPROBE_DEV:
2523 {
2524 md_probedev_impl_t *p = NULL;
2525 md_probedev_t *ph = NULL;
2526 daemon_queue_t *hdr = NULL;
2527 int i;
2528 size_t sz1 = 0;
2529
2530
2531 if (! (mode & FREAD))
2532 return (EACCES);
2533
2534 sz = sizeof (md_probedev_t);
2535
2536 d = kmem_alloc(sz, KM_SLEEP);
2537
2538 /* now copy in the data */
2539 if (ddi_copyin(data, d, sz, mode)) {
2540 err = EFAULT;
2541 goto free_mem;
2542 }
2543
2544 /*
2545 		 * Sanity-test the args. The test name must contain the
2546 		 * keyword "probe".
2547 */
2548 p = kmem_alloc(sizeof (md_probedev_impl_t), KM_SLEEP);
2549 p->probe_sema = NULL;
2550 p->probe_mx = NULL;
2551 p->probe.mnum_list = (uint64_t)NULL;
2552
2553 ph = (md_probedev_t *)d;
2554 p->probe.nmdevs = ph->nmdevs;
2555 (void) strcpy(p->probe.test_name, ph->test_name);
2556 bcopy(&ph->md_driver, &(p->probe.md_driver),
2557 sizeof (md_driver_t));
2558
2559 if ((p->probe.nmdevs < 1) ||
2560 (strstr(p->probe.test_name, "probe") == NULL)) {
2561 err = EINVAL;
2562 goto free_mem;
2563 }
2564
2565 sz1 = sizeof (minor_t) * p->probe.nmdevs;
2566
2567 p->probe.mnum_list = (uint64_t)(uintptr_t)kmem_alloc(sz1,
2568 KM_SLEEP);
2569
2570 if (ddi_copyin((caddr_t)(uintptr_t)ph->mnum_list,
2571 (caddr_t)(uintptr_t)p->probe.mnum_list, sz1, mode)) {
2572 err = EFAULT;
2573 goto free_mem;
2574 }
2575
2576 		if ((err = md_init_probereq(p, &hdr)) != 0)
2577 goto free_mem;
2578
2579 /*
2580 * put the request on the queue and wait.
2581 */
2582
2583 daemon_request_new(&md_ff_daemonq, md_probe_one, hdr, REQ_NEW);
2584
2585 (void) IOLOCK_RETURN(0, lockp);
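		/*
		 * IOLOCK_RETURN() above released the ioctl lock so the
		 * probe daemons can run; the lock is reacquired below
		 * before the results are cleaned up.
		 */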
2586 /* wait for the events to occur */
2587 for (i = 0; i < p->probe.nmdevs; i++) {
2588 sema_p(PROBE_SEMA(p));
2589 }
2590 while (md_ioctl_lock_enter() == EINTR)
2591 ;
2592
2593 /*
2594 		 * Clean up. The hdr list is freed in the probe routines,
2595 		 * so it is already NULL by the time we get here.
2596 */
2597 free_mem:
2598 if (p) {
2599 if (p->probe_sema != NULL) {
2600 sema_destroy(PROBE_SEMA(p));
2601 kmem_free(p->probe_sema, sizeof (ksema_t));
2602 }
2603 if (p->probe_mx != NULL) {
2604 mutex_destroy(PROBE_MX(p));
2605 kmem_free(p->probe_mx, sizeof (kmutex_t));
2606 }
2607 if (p->probe.mnum_list)
2608 kmem_free((caddr_t)(uintptr_t)
2609 p->probe.mnum_list, sz1);
2610
2611 kmem_free(p, sizeof (md_probedev_impl_t));
2612 }
2613 break;
2614 }
2615
2616 default:
2617 return (ENOTTY);
2618 }
2619
2620 /*
2621 * copyout and free any args
2622 */
2623 if (sz != 0) {
2624 if (err == 0) {
2625 if (ddi_copyout(d, data, sz, mode) != 0) {
2626 err = EFAULT;
2627 }
2628 }
2629 kmem_free(d, sz);
2630 }
2631 return (err);
2632 }
2633
2634 /*
2635 * NAME: md_raid_ioctl
2636 * DESCRIPTION: RAID metadevice IOCTL operations entry point.
2637 * PARAMETERS: md_dev64_t dev - RAID device identifier
2638 * int cmd - IOCTL command to be executed
2639 * void *data - pointer to IOCTL data structure
2640 * int mode - either FREAD or FWRITE
2641 * IOLOCK *lockp - IOCTL read/write lock
2642 *
2643 * LOCKS: none
2644 *
2645 */
2646 int
2647 md_raid_ioctl(
2648 dev_t dev,
2649 int cmd,
2650 void *data,
2651 int mode,
2652 IOLOCK *lockp
2653 )
2654 {
2655 minor_t mnum = getminor(dev);
2656 mr_unit_t *un;
2657 int err = 0;
2658
2659 /* handle admin ioctls */
2660 if (mnum == MD_ADM_MINOR)
2661 return (raid_admin_ioctl(cmd, data, mode, lockp));
2662
2663 /* check unit */
2664 if ((MD_MIN2SET(mnum) >= md_nsets) ||
2665 (MD_MIN2UNIT(mnum) >= md_nunits) ||
2666 ((un = MD_UNIT(mnum)) == NULL))
2667 return (ENXIO);
2668
2669 /* is this a supported ioctl? */
2670 err = md_check_ioctl_against_unit(cmd, un->c);
2671 if (err != 0) {
2672 return (err);
2673 }
2674
2675 /* dispatch ioctl */
2676 switch (cmd) {
2677
2678 case DKIOCINFO:
2679 {
2680 struct dk_cinfo *p;
2681
2682 if (! (mode & FREAD))
2683 return (EACCES);
2684
2685 p = kmem_alloc(sizeof (*p), KM_SLEEP);
2686
2687 get_info(p, mnum);
2688 if (ddi_copyout((caddr_t)p, data, sizeof (*p), mode) != 0)
2689 err = EFAULT;
2690
2691 kmem_free(p, sizeof (*p));
2692 return (err);
2693 }
2694
2695 case DKIOCGMEDIAINFO:
2696 {
2697 struct dk_minfo p;
2698
2699 if (! (mode & FREAD))
2700 return (EACCES);
2701
2702 get_minfo(&p, mnum);
2703 if (ddi_copyout(&p, data, sizeof (struct dk_minfo), mode) != 0)
2704 err = EFAULT;
2705
2706 return (err);
2707 }
2708
2709 case DKIOCGGEOM:
2710 {
2711 struct dk_geom *p;
2712
2713 if (! (mode & FREAD))
2714 return (EACCES);
2715
2716 p = kmem_alloc(sizeof (*p), KM_SLEEP);
2717
2718 if ((err = raid_get_geom(un, p)) == 0) {
2719 if (ddi_copyout((caddr_t)p, data, sizeof (*p),
2720 mode) != 0)
2721 err = EFAULT;
2722 }
2723
2724 kmem_free(p, sizeof (*p));
2725 return (err);
2726 }
2727
2728 case DKIOCGVTOC:
2729 {
2730 struct vtoc *vtoc;
2731
2732 if (! (mode & FREAD))
2733 return (EACCES);
2734
2735 vtoc = kmem_zalloc(sizeof (*vtoc), KM_SLEEP);
2736 if ((err = raid_get_vtoc(un, vtoc)) != 0) {
2737 kmem_free(vtoc, sizeof (*vtoc));
2738 return (err);
2739 }
2740
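		/*
		 * Return the vtoc in the caller's data model; a 32-bit
		 * client receives a converted struct vtoc32.
		 */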
2741 if ((mode & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
2742 if (ddi_copyout(vtoc, data, sizeof (*vtoc), mode))
2743 err = EFAULT;
2744 }
2745 #ifdef _SYSCALL32
2746 else {
2747 struct vtoc32 *vtoc32;
2748
2749 vtoc32 = kmem_zalloc(sizeof (*vtoc32), KM_SLEEP);
2750
2751 vtoctovtoc32((*vtoc), (*vtoc32));
2752 if (ddi_copyout(vtoc32, data, sizeof (*vtoc32), mode))
2753 err = EFAULT;
2754 kmem_free(vtoc32, sizeof (*vtoc32));
2755 }
2756 #endif /* _SYSCALL32 */
2757
2758 kmem_free(vtoc, sizeof (*vtoc));
2759 return (err);
2760 }
2761
2762 case DKIOCSVTOC:
2763 {
2764 struct vtoc *vtoc;
2765
2766 if (! (mode & FWRITE))
2767 return (EACCES);
2768
2769 vtoc = kmem_zalloc(sizeof (*vtoc), KM_SLEEP);
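		/*
		 * A 32-bit client passes a struct vtoc32, which must be
		 * converted to the native struct vtoc before use.
		 */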
2770 if ((mode & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
2771 if (ddi_copyin(data, vtoc, sizeof (*vtoc), mode)) {
2772 err = EFAULT;
2773 }
2774 }
2775 #ifdef _SYSCALL32
2776 else {
2777 struct vtoc32 *vtoc32;
2778
2779 vtoc32 = kmem_zalloc(sizeof (*vtoc32), KM_SLEEP);
2780
2781 if (ddi_copyin(data, vtoc32, sizeof (*vtoc32), mode)) {
2782 err = EFAULT;
2783 } else {
2784 vtoc32tovtoc((*vtoc32), (*vtoc));
2785 }
2786 kmem_free(vtoc32, sizeof (*vtoc32));
2787 }
2788 #endif /* _SYSCALL32 */
2789
2790 if (err == 0)
2791 err = raid_set_vtoc(un, vtoc);
2792
2793 kmem_free(vtoc, sizeof (*vtoc));
2794 return (err);
2795 }
2796
2797 case DKIOCGEXTVTOC:
2798 {
2799 struct extvtoc *extvtoc;
2800
2801 if (! (mode & FREAD))
2802 return (EACCES);
2803
2804 extvtoc = kmem_zalloc(sizeof (*extvtoc), KM_SLEEP);
2805 if ((err = raid_get_extvtoc(un, extvtoc)) != 0) {
2806 kmem_free(extvtoc, sizeof (*extvtoc));
2807 return (err);
2808 }
2809
2810 if (ddi_copyout(extvtoc, data, sizeof (*extvtoc), mode))
2811 err = EFAULT;
2812
2813 kmem_free(extvtoc, sizeof (*extvtoc));
2814 return (err);
2815 }
2816
2817 case DKIOCSEXTVTOC:
2818 {
2819 struct extvtoc *extvtoc;
2820
2821 if (! (mode & FWRITE))
2822 return (EACCES);
2823
2824 extvtoc = kmem_zalloc(sizeof (*extvtoc), KM_SLEEP);
2825 if (ddi_copyin(data, extvtoc, sizeof (*extvtoc), mode)) {
2826 err = EFAULT;
2827 }
2828
2829 if (err == 0)
2830 err = raid_set_extvtoc(un, extvtoc);
2831
2832 kmem_free(extvtoc, sizeof (*extvtoc));
2833 return (err);
2834 }
2835
2836 case DKIOCGAPART:
2837 {
2838 struct dk_map dmp;
2839
2840 if ((err = raid_get_cgapart(un, &dmp)) != 0) {
2841 return (err);
2842 }
2843
2844 if ((mode & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
2845 if (ddi_copyout((caddr_t)&dmp, data, sizeof (dmp),
2846 mode) != 0)
2847 err = EFAULT;
2848 }
2849 #ifdef _SYSCALL32
2850 else {
2851 struct dk_map32 dmp32;
2852
2853 dmp32.dkl_cylno = dmp.dkl_cylno;
2854 dmp32.dkl_nblk = dmp.dkl_nblk;
2855
2856 if (ddi_copyout((caddr_t)&dmp32, data, sizeof (dmp32),
2857 mode) != 0)
2858 err = EFAULT;
2859 }
2860 #endif /* _SYSCALL32 */
2861
2862 return (err);
2863 }
2864 case DKIOCGETEFI:
2865 {
2866 /*
2867 		 * This one can be handled centrally; there is no need to
2868 		 * duplicate the code for every type of metadevice.
2869 */
2870 return (md_dkiocgetefi(mnum, data, mode));
2871 }
2872
2873 case DKIOCSETEFI:
2874 {
2875 /*
2876 		 * This one can be handled centrally; there is no need to
2877 		 * duplicate the code for every type of metadevice.
2878 */
2879 return (md_dkiocsetefi(mnum, data, mode));
2880 }
2881
2882 case DKIOCPARTITION:
2883 {
2884 return (md_dkiocpartition(mnum, data, mode));
2885 }
2886
2887 default:
2888 return (ENOTTY);
2889 }
2890 }
2891
2892 /*
2893 * rename/exchange named service entry points and support functions follow.
2894 * Most functions are handled generically, except for raid-specific locking
2895 * and checking
2896 */
2897
2898 /*
2899 * NAME: raid_may_renexch_self
2900 * DESCRIPTION: support routine for rename check ("MDRNM_CHECK") named service
2901 * PARAMETERS: mr_unit_t *un - unit struct of raid unit to be renamed
2902 * mdi_unit_t *ui - in-core unit struct of same raid unit
2903 * md_rentxn_t *rtxnp - rename transaction state
2904 *
2905 * LOCKS: none
2906 *
2907 */
2908 static int
2909 raid_may_renexch_self(
2910 mr_unit_t *un,
2911 mdi_unit_t *ui,
2912 md_rentxn_t *rtxnp)
2913 {
2914 minor_t from_min;
2915 minor_t to_min;
2916 bool_t toplevel;
2917 bool_t related;
2918
2919 from_min = rtxnp->from.mnum;
2920 to_min = rtxnp->to.mnum;
2921
2922 if (!un || !ui) {
2923 (void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR,
2924 from_min);
2925 return (EINVAL);
2926 }
2927
2928 ASSERT(!(MD_CAPAB(un) & MD_CAN_META_CHILD));
2929 if (MD_CAPAB(un) & MD_CAN_META_CHILD) {
2930 (void) mdmderror(&rtxnp->mde, MDE_RENAME_SOURCE_BAD, from_min);
2931 return (EINVAL);
2932 }
2933
2934 if (MD_PARENT(un) == MD_MULTI_PARENT) {
2935 (void) mdmderror(&rtxnp->mde, MDE_RENAME_SOURCE_BAD, from_min);
2936 return (EINVAL);
2937 }
2938
2939 toplevel = !MD_HAS_PARENT(MD_PARENT(un));
2940
2941 /* we're related if trying to swap with our parent */
2942 related = (!toplevel) && (MD_PARENT(un) == to_min);
2943
2944 switch (rtxnp->op) {
2945 case MDRNOP_EXCHANGE:
2946
2947 if (!related) {
2948 (void) mdmderror(&rtxnp->mde,
2949 MDE_RENAME_TARGET_UNRELATED, to_min);
2950 return (EINVAL);
2951 }
2952
2953 break;
2954
2955 case MDRNOP_RENAME:
2956 /*
2957 * if from is top-level and is open, then the kernel is using
2958 * the md_dev64_t.
2959 */
2960
2961 if (toplevel && md_unit_isopen(ui)) {
2962 (void) mdmderror(&rtxnp->mde, MDE_RENAME_BUSY,
2963 from_min);
2964 return (EBUSY);
2965 }
2966 break;
2967
2968 default:
2969 (void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR,
2970 from_min);
2971 return (EINVAL);
2972 }
2973
2974 return (0); /* ok */
2975 }
2976
2977 /*
2978 * NAME: raid_rename_check
2979 * DESCRIPTION: ("MDRNM_CHECK") rename/exchange named service entry point
2980 * PARAMETERS: md_rendelta_t *delta - describes changes to be made to this
2981 * raid device for rename transaction
2982 * md_rentxn_t *rtxnp - rename transaction state
2983 *
2984 * LOCKS: none
2985 *
2986 */
2987 intptr_t
2988 raid_rename_check(
2989 md_rendelta_t *delta,
2990 md_rentxn_t *rtxnp)
2991 {
2992 int err = 0;
2993 int column;
2994 mr_unit_t *un;
2995
2996 ASSERT(delta);
2997 ASSERT(rtxnp);
2998 ASSERT(delta->unp);
2999 ASSERT(delta->uip);
3000
3001 if (!delta || !rtxnp || !delta->unp || !delta->uip) {
3002 (void) mdsyserror(&rtxnp->mde, EINVAL);
3003 return (EINVAL);
3004 }
3005
3006 un = (mr_unit_t *)delta->unp;
3007
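	/*
	 * Every column must be in the OKAY state and not hotspared
	 * for the rename/exchange to be allowed.
	 */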
3008 for (column = 0; column < un->un_totalcolumncnt; column++) {
3009 rcs_state_t colstate;
3010
3011 colstate = un->un_column[column].un_devstate;
3012
3013 if (colstate & RCS_LAST_ERRED) {
3014 (void) mdmderror(&rtxnp->mde, MDE_RAID_LAST_ERRED,
3015 md_getminor(delta->dev));
3016 return (EINVAL);
3017 }
3018
3019 if (colstate & RCS_INIT_ERRED) {
3020 (void) mdmderror(&rtxnp->mde, MDE_RAID_DOI,
3021 md_getminor(delta->dev));
3022 return (EINVAL);
3023 }
3024
3025 /* How did we get this far before detecting this? */
3026 if (colstate & RCS_RESYNC) {
3027 (void) mdmderror(&rtxnp->mde, MDE_RENAME_BUSY,
3028 md_getminor(delta->dev));
3029 return (EBUSY);
3030 }
3031
3032 if (colstate & RCS_ERRED) {
3033 (void) mdmderror(&rtxnp->mde, MDE_RAID_NOT_OKAY,
3034 md_getminor(delta->dev));
3035 return (EINVAL);
3036 }
3037
3038 if (!(colstate & RCS_OKAY)) {
3039 (void) mdmderror(&rtxnp->mde, MDE_RAID_NOT_OKAY,
3040 md_getminor(delta->dev));
3041 return (EINVAL);
3042 }
3043
3044 if (HOTSPARED(un, column)) {
3045 (void) mdmderror(&rtxnp->mde, MDE_RAID_NOT_OKAY,
3046 md_getminor(delta->dev));
3047 return (EINVAL);
3048 }
3049 }
3050
3051 /* self does additional checks */
3052 if (delta->old_role == MDRR_SELF) {
3053 err = raid_may_renexch_self((mr_unit_t *)delta->unp,
3054 delta->uip, rtxnp);
3055 }
3056 return (err);
3057 }
3058
3059 /*
3060 * NAME: raid_rename_lock
3061 * DESCRIPTION: ("MDRNM_LOCK") rename/exchange named service entry point
3062 * PARAMETERS: md_rendelta_t *delta - describes changes to be made to this
3063 * raid device for rename transaction
3064 * md_rentxn_t *rtxnp - rename transaction state
3065 *
3066 * LOCKS: io and unit locks (taken explicitly *not* via ioctl wrappers)
3067 *
3068 */
3069 intptr_t
3070 raid_rename_lock(
3071 md_rendelta_t *delta,
3072 md_rentxn_t *rtxnp)
3073 {
3074 minor_t mnum;
3075
3076 ASSERT(delta);
3077 ASSERT(rtxnp);
3078
3079 mnum = md_getminor(delta->dev);
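	/*
	 * The "to" device of a rename is just a new name; no unit
	 * exists for it yet, so there is nothing to lock.
	 */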
3080 if (mnum == rtxnp->to.mnum && rtxnp->op == MDRNOP_RENAME) {
3081 return (0);
3082 }
3083
3084 ASSERT(delta->uip);
3085 if (!delta->uip) {
3086 (void) mdmderror(&rtxnp->mde, MDE_UNIT_NOT_SETUP, mnum);
3087 return (ENODEV);
3088 }
3089
3090 ASSERT(delta->unp);
3091 if (!delta->unp) {
3092
3093 return (ENODEV);
3094 }
3095
3096 ASSERT(!IO_WRITER_HELD(delta->unp));
3097 (void) md_io_writerlock(delta->uip);
3098 ASSERT(IO_WRITER_HELD(delta->unp));
3099
3100
3101 ASSERT(!UNIT_WRITER_HELD(delta->unp));
3102 (void) md_unit_writerlock(delta->uip);
3103 ASSERT(UNIT_WRITER_HELD(delta->unp));
3104
3105 return (0);
3106 }
3107
3108 /*
3109 * NAME: raid_rename_unlock
3110 * DESCRIPTION: ("MDRNM_UNLOCK") rename/exchange named service entry point
3111 * PARAMETERS: md_rendelta_t *delta - describes changes to be made to this
3112 * raid device for rename transaction
3113 * md_rentxn_t *rtxnp - rename transaction state
3114 *
3115 * LOCKS: drops io and unit locks
3116 *
3117 */
3118 /* ARGSUSED */
3119 void
3120 raid_rename_unlock(
3121 md_rendelta_t *delta,
3122 md_rentxn_t *rtxnp)
3123 {
3124 mr_unit_t *un = (mr_unit_t *)delta->unp;
3125 minor_t mnum = MD_SID(un);
3126 int col;
3127
3128 ASSERT(delta);
3129 ASSERT(delta->unp);
3130 ASSERT(delta->uip);
3131
3132 ASSERT(UNIT_WRITER_HELD(delta->unp));
3133 md_unit_writerexit(delta->uip);
3134 ASSERT(!UNIT_WRITER_HELD(delta->unp));
3135
3136 if (! (delta->txn_stat.role_swapped) || ! (delta->txn_stat.is_open)) {
3137 goto out;
3138 }
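	/*
	 * The roles were swapped while the unit was open, so reopen
	 * it and reinitialize the pre-write areas of its columns
	 * under the new identity.
	 */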
3139 if (raid_internal_open(mnum, (FREAD | FWRITE),
3140 OTYP_LYR, MD_OFLG_ISINIT) == 0) {
3141 for (col = 0; col < un->un_totalcolumncnt; col++) {
3142 if (un->un_column[col].un_devstate & RCS_OKAY)
3143 (void) init_pw_area(un,
3144 un->un_column[col].un_dev,
3145 un->un_column[col].un_pwstart, col);
3146 }
3147 (void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
3148 }
3149
3150 out:
3151 ASSERT(IO_WRITER_HELD(delta->unp));
3152 md_io_writerexit(delta->uip);
3153 ASSERT(!IO_WRITER_HELD(delta->unp));
3154 }
3155 /* end of rename/exchange named service and support functions */
3156