1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * NAME: raid_ioctl.c 30 * 31 * DESCRIPTION: RAID driver source file containing IOCTL operations. 32 * 33 * ROUTINES PROVIDED FOR EXTERNAL USE: 34 * raid_commit() - commits MD database updates for a RAID metadevice 35 * md_raid_ioctl() - RAID metadevice IOCTL operations entry point. 36 * 37 * ROUTINES PROVIDED FOR INTERNAL USE: 38 * raid_getun() - Performs unit checking on a RAID metadevice 39 * init_col_nextio() - normal backend when zeroing column of RAID metadevice. 40 * init_col_int() - I/O interrupt while zeroing column of RAID metadevice. 41 * raid_init_columns() - Zero one or more columns of a RAID metadevice. 42 * raid_set() - used to create a RAID metadevice 43 * raid_get() - used to get the unit structure of a RAID metadevice 44 * raid_replace() - used to replace a component of a RAID metadevice 45 * raid_grow() - Concatenate to a RAID metadevice 46 * raid_change() - change dynamic values of a RAID metadevice 47 * raid_reset() - used to reset (clear / remove) a RAID metadevice 48 * raid_get_geom() - used to get the geometry of a RAID metadevice 49 * raid_get_vtoc() - used to get the VTOC on a RAID metadevice 50 * raid_set_vtoc() - used to set the VTOC on a RAID metadevice 51 * raid_getdevs() - return all devices within a RAID metadevice 52 * raid_admin_ioctl() - IOCTL operations unique to metadevices and RAID 53 */ 54 55 56 #include <sys/param.h> 57 #include <sys/systm.h> 58 #include <sys/conf.h> 59 #include <sys/file.h> 60 #include <sys/user.h> 61 #include <sys/uio.h> 62 #include <sys/t_lock.h> 63 #include <sys/buf.h> 64 #include <sys/dkio.h> 65 #include <sys/vtoc.h> 66 #include <sys/kmem.h> 67 #include <vm/page.h> 68 #include <sys/sysmacros.h> 69 #include <sys/types.h> 70 #include <sys/mkdev.h> 71 #include <sys/stat.h> 72 #include <sys/open.h> 73 #include <sys/disp.h> 74 #include <sys/modctl.h> 75 #include <sys/ddi.h> 76 #include <sys/sunddi.h> 77 #include <sys/cred.h> 78 #include <sys/lvm/mdvar.h> 79 #include <sys/lvm/md_names.h> 80 #include <sys/lvm/md_mddb.h> 81 #include <sys/lvm/md_raid.h> 82 #include <sys/lvm/md_convert.h> 83 84 #include <sys/sysevent/eventdefs.h> 85 #include <sys/sysevent/svm.h> 86 87 extern int md_status; 88 extern unit_t md_nunits; 89 extern set_t md_nsets; 90 extern md_set_t md_set[]; 91 extern md_ops_t raid_md_ops; 92 extern major_t md_major; 93 extern md_krwlock_t md_unit_array_rw; 94 extern mdq_anchor_t md_done_daemon; 95 extern mdq_anchor_t md_ff_daemonq; 96 extern int mdopen(); 97 extern int mdclose(); 98 extern void md_probe_one(); 99 extern int md_init_probereq(md_probedev_impl_t *, 100 daemon_queue_t **); 101 extern md_resync_t md_cpr_resync; 102 103 104 extern void dump_mr_unit(mr_unit_t *); 105 106 typedef struct raid_ci { 107 DAEMON_QUEUE 108 struct raid_ci *ci_next; 109 mr_unit_t *ci_un; 110 int ci_col; 111 int ci_err; 112 int ci_flag; 113 size_t ci_zerosize; 114 diskaddr_t ci_blkno; 115 diskaddr_t ci_lastblk; 116 buf_t ci_buf; 117 } raid_ci_t; 118 /* values for the ci_flag */ 119 #define COL_INITING (0x0001) 120 #define COL_INIT_DONE (0x0002) 121 #define COL_READY (0x0004) 122 123 /* 124 * NAME: raid_getun 125 * DESCRIPTION: performs a lot of unit checking on a RAID metadevice 126 * PARAMETERS: minor_t mnum - minor device number for RAID unit 127 * md_error_t *mde - pointer to error reporting structure 128 * int flags - pointer to error reporting structure 129 * STALE_OK - allow stale MD memory 130 * NO_OLD - unit must not exist 131 * NO_LOCK - no IOCTL lock needed 132 * WR_LOCK - write IOCTL lock needed 133 * RD_LOCK - read IOCTL lock needed 134 * IOLOCK *lock - pointer to IOCTL lock 135 * 136 * LOCKS: obtains unit reader or writer lock via IOLOCK 137 * 138 */ 139 static mr_unit_t * 140 raid_getun(minor_t mnum, md_error_t *mde, int flags, IOLOCK *lock) 141 { 142 mr_unit_t *un; 143 mdi_unit_t *ui; 144 set_t setno = MD_MIN2SET(mnum); 145 146 if ((setno >= md_nsets) || (MD_MIN2UNIT(mnum) >= md_nunits)) { 147 (void) mdmderror(mde, MDE_INVAL_UNIT, mnum); 148 return (NULL); 149 } 150 151 if (!(flags & STALE_OK)) { 152 if (md_get_setstatus(setno) & MD_SET_STALE) { 153 (void) mdmddberror(mde, MDE_DB_STALE, mnum, setno); 154 return (NULL); 155 } 156 } 157 158 ui = MDI_UNIT(mnum); 159 if (flags & NO_OLD) { 160 if (ui != NULL) { 161 (void) mdmderror(mde, MDE_UNIT_ALREADY_SETUP, mnum); 162 return (NULL); 163 } 164 return ((mr_unit_t *)1); 165 } 166 167 if (ui == NULL) { 168 (void) mdmderror(mde, MDE_UNIT_NOT_SETUP, mnum); 169 return (NULL); 170 } 171 if (flags & ARRAY_WRITER) 172 md_array_writer(lock); 173 else if (flags & ARRAY_READER) 174 md_array_reader(lock); 175 176 if (!(flags & NO_LOCK)) { 177 if (flags & WR_LOCK) { 178 (void) md_ioctl_io_lock(lock, ui); 179 (void) md_ioctl_writerlock(lock, ui); 180 } else /* RD_LOCK */ 181 (void) md_ioctl_readerlock(lock, ui); 182 } 183 un = (mr_unit_t *)MD_UNIT(mnum); 184 185 if (un->c.un_type != MD_METARAID) { 186 (void) mdmderror(mde, MDE_NOT_RAID, mnum); 187 return (NULL); 188 } 189 190 return (un); 191 } 192 193 194 /* 195 * NAME: raid_commit 196 * DESCRIPTION: commits MD database updates for a RAID metadevice 197 * PARAMETERS: mr_unit_t *un - RAID unit to update in the MD database 198 * mddb_recid_t *extras - array of other record IDs to update 199 * 200 * LOCKS: assumes caller holds unit writer lock 201 * 202 */ 203 void 204 raid_commit(mr_unit_t *un, mddb_recid_t *extras) 205 { 206 mddb_recid_t *recids; 207 int ri = 0; 208 int nrecids = 0; 209 210 if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE) 211 return; 212 213 /* Count the extra recids */ 214 if (extras != NULL) { 215 while (extras[nrecids] != 0) { 216 nrecids++; 217 } 218 } 219 220 /* 221 * Allocate space for two recids in addition to the extras: 222 * one for the unit structure, one for the null terminator. 223 */ 224 nrecids += 2; 225 recids = (mddb_recid_t *) 226 kmem_zalloc(nrecids * sizeof (mddb_recid_t), KM_SLEEP); 227 228 if (un != NULL) { 229 ASSERT(MDI_UNIT(MD_SID(un)) ? UNIT_WRITER_HELD(un) : 1); 230 recids[ri++] = un->c.un_record_id; 231 } 232 233 if (extras != NULL) { 234 while (*extras != 0) { 235 recids[ri++] = *extras; 236 extras++; 237 } 238 } 239 240 if (ri > 0) { 241 mddb_commitrecs_wrapper(recids); 242 } 243 244 kmem_free(recids, nrecids * sizeof (mddb_recid_t)); 245 } 246 247 static int 248 raid_check_pw(mr_unit_t *un) 249 { 250 buf_t bp; 251 char *buf; 252 mr_column_t *colptr; 253 minor_t mnum = MD_SID(un); 254 int i; 255 int err = 0; 256 minor_t unit; 257 258 buf = kmem_zalloc((uint_t)DEV_BSIZE, KM_SLEEP); 259 260 for (i = 0; i < un->un_totalcolumncnt; i++) { 261 md_dev64_t tmpdev; 262 263 colptr = &un->un_column[i]; 264 265 tmpdev = colptr->un_dev; 266 /* 267 * Open by device id 268 * If this device is hotspared 269 * use the hotspare key 270 */ 271 tmpdev = md_resolve_bydevid(mnum, tmpdev, HOTSPARED(un, i) ? 272 colptr->un_hs_key : colptr->un_orig_key); 273 if (md_layered_open(mnum, &tmpdev, MD_OFLG_NULL)) { 274 colptr->un_dev = tmpdev; 275 return (1); 276 } 277 colptr->un_dev = tmpdev; 278 279 bzero((caddr_t)&bp, sizeof (buf_t)); 280 bp.b_back = &bp; 281 bp.b_forw = &bp; 282 bp.b_flags = B_READ | B_BUSY; 283 sema_init(&bp.b_io, 0, NULL, 284 SEMA_DEFAULT, NULL); 285 sema_init(&bp.b_sem, 0, NULL, 286 SEMA_DEFAULT, NULL); 287 bp.b_edev = md_dev64_to_dev(colptr->un_dev); 288 bp.b_lblkno = colptr->un_pwstart; 289 bp.b_bcount = DEV_BSIZE; 290 bp.b_bufsize = DEV_BSIZE; 291 bp.b_un.b_addr = (caddr_t)buf; 292 bp.b_offset = -1; 293 (void) md_call_strategy(&bp, 0, NULL); 294 if (biowait(&bp)) 295 err = 1; 296 if (i == 0) { 297 if (un->c.un_revision & MD_64BIT_META_DEV) { 298 unit = ((raid_pwhdr_t *)buf)->rpw_unit; 299 } else { 300 unit = ((raid_pwhdr32_od_t *)buf)->rpw_unit; 301 } 302 } 303 /* 304 * depending upon being an 64bit or 32 bit raid, the 305 * pre write headers have different layout 306 */ 307 if (un->c.un_revision & MD_64BIT_META_DEV) { 308 if ((((raid_pwhdr_t *)buf)->rpw_column != i) || 309 (((raid_pwhdr_t *)buf)->rpw_unit != unit)) 310 err = 1; 311 } else { 312 if ((((raid_pwhdr32_od_t *)buf)->rpw_column != i) || 313 (((raid_pwhdr32_od_t *)buf)->rpw_unit != unit)) 314 err = 1; 315 } 316 md_layered_close(colptr->un_dev, MD_OFLG_NULL); 317 if (err) 318 break; 319 } 320 kmem_free(buf, DEV_BSIZE); 321 return (err); 322 } 323 324 /* 325 * NAME: init_col_nextio 326 * DESCRIPTION: normal backend process when zeroing column of a RAID metadevice. 327 * PARAMETERS: raid_ci_t *cur - struct for column being zeroed 328 * 329 * LOCKS: assumes caller holds unit reader lock, 330 * preiodically releases and reacquires unit reader lock, 331 * broadcasts on unit conditional variable (un_cv) 332 * 333 */ 334 #define INIT_RLS_CNT 10 335 static void 336 init_col_nextio(raid_ci_t *cur) 337 { 338 mr_unit_t *un; 339 340 un = cur->ci_un; 341 342 cur->ci_blkno += cur->ci_zerosize; 343 344 mutex_enter(&un->un_mx); 345 /* ===> update un_percent_done */ 346 un->un_init_iocnt += btodb(cur->ci_buf.b_bcount); 347 mutex_exit(&un->un_mx); 348 349 /* 350 * When gorwing a device, normal I/O is still going on. 351 * The init thread still holds the unit reader lock which 352 * prevents I/O from doing state changes. 353 * So every INIT_RLS_CNT init I/Os, we will release the 354 * unit reader lock. 355 * 356 * CAVEAT: 357 * We know we are in the middle of a grow operation and the 358 * unit cannot be grown or removed (through reset or halt) 359 * so the mr_unit_t structure will not move or disappear. 360 * In addition, we know that only one of the init I/Os 361 * can be in col_init_nextio at a time because they are 362 * placed on the md_done_daemon queue and md only processes 363 * one element of this queue at a time. In addition, any 364 * code that needs to acquire the unit writer lock to change 365 * state is supposed to be on the md_mstr_daemon queue so 366 * it can be processing while we sit here waiting to get the 367 * unit reader lock back. 368 */ 369 370 if (cur->ci_blkno < cur->ci_lastblk) { 371 /* truncate last chunk to end_addr if needed */ 372 if (cur->ci_blkno + cur->ci_zerosize > cur->ci_lastblk) { 373 cur->ci_zerosize = (size_t) 374 (cur->ci_lastblk - cur->ci_blkno); 375 } 376 377 /* set address and length for I/O bufs */ 378 cur->ci_buf.b_bufsize = dbtob(cur->ci_zerosize); 379 cur->ci_buf.b_bcount = dbtob(cur->ci_zerosize); 380 cur->ci_buf.b_lblkno = cur->ci_blkno; 381 382 (void) md_call_strategy(&cur->ci_buf, MD_STR_NOTTOP, NULL); 383 return; 384 } 385 /* finished initializing this column */ 386 mutex_enter(&un->un_mx); 387 cur->ci_flag = COL_INIT_DONE; 388 uniqtime32(&un->un_column[cur->ci_col].un_devtimestamp); 389 mutex_exit(&un->un_mx); 390 cv_broadcast(&un->un_cv); 391 } 392 393 /* 394 * NAME: init_col_int 395 * DESCRIPTION: I/O interrupt while zeroing column of a RAID metadevice. 396 * PARAMETERS: buf_t *cb - I/O buffer for which interrupt occurred 397 * 398 * LOCKS: assumes caller holds unit reader or writer lock 399 * 400 */ 401 static int 402 init_col_int(buf_t *cb) 403 { 404 raid_ci_t *cur; 405 406 cur = (raid_ci_t *)cb->b_chain; 407 if (cb->b_flags & B_ERROR) { 408 mutex_enter(&cur->ci_un->un_mx); 409 cur->ci_err = EIO; 410 mutex_exit(&cur->ci_un->un_mx); 411 cv_broadcast(&cur->ci_un->un_cv); 412 return (1); 413 } 414 daemon_request(&md_done_daemon, init_col_nextio, 415 (daemon_queue_t *)cur, REQ_OLD); 416 return (1); 417 } 418 419 /* 420 * NAME: raid_init_columns 421 * DESCRIPTION: Zero one or more columns of a RAID metadevice. 422 * PARAMETERS: minor_t mnum - RAID unit minor identifier 423 * 424 * LOCKS: obtains and releases unit reader lock, 425 * obtains and releases unit writer lock, 426 * obtains and releases md_unit_array_rw write lock, 427 * obtains and releases unit mutex (un_mx) lock, 428 * waits on unit conditional variable (un_cv) 429 * 430 */ 431 static void 432 raid_init_columns(minor_t mnum) 433 { 434 mr_unit_t *un; 435 mdi_unit_t *ui; 436 raid_ci_t *ci_chain = NULL, *cur; 437 rus_state_t state; 438 caddr_t zero_addr; 439 diskaddr_t end_off; 440 size_t zerosize; 441 int err = 0; 442 int ix; 443 int colcnt = 0; 444 int col; 445 set_t setno = MD_MIN2SET(mnum); 446 447 /* 448 * Increment the raid resync count for cpr 449 */ 450 mutex_enter(&md_cpr_resync.md_resync_mutex); 451 md_cpr_resync.md_raid_resync++; 452 mutex_exit(&md_cpr_resync.md_resync_mutex); 453 454 /* 455 * initialization is a multiple step process. The first step 456 * is to go through the unit structure and start each device 457 * in the init state writing zeros over the component. 458 * Next initialize the prewrite areas, so the device can be 459 * used if a metainit -k is done. Now close the componenets. 460 * 461 * Once this complete set the state of each component being 462 * zeroed and set the correct state for the unit. 463 * 464 * last commit the records. 465 */ 466 467 ui = MDI_UNIT(mnum); 468 un = md_unit_readerlock(ui); 469 470 /* check for active init on this column */ 471 /* exiting is cpr safe */ 472 if ((un->un_init_colcnt > 0) && (un->un_resync_index != -1)) { 473 md_unit_readerexit(ui); 474 (void) raid_internal_close(mnum, OTYP_LYR, 0, 0); 475 /* 476 * Decrement the raid resync count for cpr 477 */ 478 mutex_enter(&md_cpr_resync.md_resync_mutex); 479 md_cpr_resync.md_raid_resync--; 480 mutex_exit(&md_cpr_resync.md_resync_mutex); 481 thread_exit(); 482 } 483 484 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_START, SVM_TAG_METADEVICE, setno, 485 MD_SID(un)); 486 un->un_init_colcnt = 0; 487 un->un_init_iocnt = 0; 488 end_off = un->un_pwsize + (un->un_segsize * un->un_segsincolumn); 489 zerosize = (size_t)MIN((diskaddr_t)un->un_maxio, end_off); 490 491 /* allocate zero-filled buffer */ 492 zero_addr = kmem_zalloc(dbtob(zerosize), KM_SLEEP); 493 494 for (ix = 0; ix < un->un_totalcolumncnt; ix++) { 495 if (un->un_column[ix].un_devstate != RCS_INIT) 496 continue; 497 /* allocate new column init structure */ 498 cur = (raid_ci_t *)kmem_zalloc((sizeof (raid_ci_t)), KM_SLEEP); 499 ASSERT(cur != NULL); 500 un->un_init_colcnt++; 501 cur->ci_next = ci_chain; 502 ci_chain = cur; 503 cur->ci_un = un; 504 cur->ci_col = ix; 505 cur->ci_err = 0; 506 cur->ci_flag = COL_INITING; 507 cur->ci_zerosize = zerosize; 508 cur->ci_blkno = un->un_column[ix].un_pwstart; 509 cur->ci_lastblk = cur->ci_blkno + un->un_pwsize 510 + (un->un_segsize * un->un_segsincolumn); 511 /* initialize static buf fields */ 512 cur->ci_buf.b_un.b_addr = zero_addr; 513 cur->ci_buf.b_chain = (buf_t *)cur; 514 cur->ci_buf.b_back = &cur->ci_buf; 515 cur->ci_buf.b_forw = &cur->ci_buf; 516 cur->ci_buf.b_iodone = init_col_int; 517 cur->ci_buf.b_flags = B_BUSY | B_WRITE; 518 cur->ci_buf.b_edev = md_dev64_to_dev(un->un_column[ix].un_dev); 519 sema_init(&cur->ci_buf.b_io, 0, NULL, 520 SEMA_DEFAULT, NULL); 521 sema_init(&cur->ci_buf.b_sem, 0, NULL, 522 SEMA_DEFAULT, NULL); 523 /* set address and length for I/O bufs */ 524 cur->ci_buf.b_bufsize = dbtob(zerosize); 525 cur->ci_buf.b_bcount = dbtob(zerosize); 526 cur->ci_buf.b_lblkno = un->un_column[ix].un_pwstart; 527 cur->ci_buf.b_offset = -1; 528 529 if (! (un->un_column[ix].un_devflags & MD_RAID_DEV_ISOPEN)) { 530 md_dev64_t tmpdev = un->un_column[ix].un_dev; 531 /* 532 * Open by device id 533 * If this column is hotspared then 534 * use the hotspare key 535 */ 536 tmpdev = md_resolve_bydevid(mnum, tmpdev, 537 HOTSPARED(un, ix) ? 538 un->un_column[ix].un_hs_key : 539 un->un_column[ix].un_orig_key); 540 if ((cur->ci_err = md_layered_open(mnum, &tmpdev, 541 MD_OFLG_NULL)) == 0) 542 un->un_column[ix].un_devflags |= 543 MD_RAID_DEV_ISOPEN; 544 un->un_column[ix].un_dev = tmpdev; 545 } 546 if (cur->ci_err == 0) 547 md_call_strategy(&cur->ci_buf, MD_STR_NOTTOP, NULL); 548 } 549 550 md_unit_readerexit(ui); 551 state = un->un_state; 552 colcnt = un->un_init_colcnt; 553 mutex_enter(&un->un_mx); 554 while (colcnt) { 555 cv_wait(&un->un_cv, &un->un_mx); 556 557 colcnt = 0; 558 for (cur = ci_chain; cur != NULL; cur = cur->ci_next) { 559 col = cur->ci_col; 560 if ((cur->ci_flag != COL_INITING) || (cur->ci_err)) { 561 if (cur->ci_err) 562 err = cur->ci_err; 563 else if (cur->ci_flag == COL_INIT_DONE) { 564 (void) init_pw_area(un, 565 un->un_column[col].un_dev, 566 un->un_column[col].un_pwstart, 567 col); 568 cur->ci_flag = COL_READY; 569 } 570 } else { 571 colcnt++; 572 } 573 } 574 } 575 mutex_exit(&un->un_mx); 576 577 /* This prevents new opens */ 578 rw_enter(&md_unit_array_rw.lock, RW_WRITER); 579 (void) md_io_writerlock(ui); 580 un = (mr_unit_t *)md_unit_writerlock(ui); 581 while (ci_chain) { 582 cur = ci_chain; 583 584 /* take this element out of the chain */ 585 ci_chain = cur->ci_next; 586 /* free this element */ 587 sema_destroy(&cur->ci_buf.b_io); 588 sema_destroy(&cur->ci_buf.b_sem); 589 if (cur->ci_err) 590 raid_set_state(cur->ci_un, cur->ci_col, 591 RCS_INIT_ERRED, 0); 592 else 593 raid_set_state(cur->ci_un, cur->ci_col, 594 RCS_OKAY, 0); 595 kmem_free(cur, sizeof (raid_ci_t)); 596 } 597 598 /* free the zeroed buffer */ 599 kmem_free(zero_addr, dbtob(zerosize)); 600 601 /* determine new unit state */ 602 if (err == 0) { 603 if (state == RUS_INIT) 604 un->un_state = RUS_OKAY; 605 else { 606 un->c.un_total_blocks = un->un_grow_tb; 607 un->un_grow_tb = 0; 608 if (raid_state_cnt(un, RCS_OKAY) == 609 un->un_totalcolumncnt) 610 un->un_state = RUS_OKAY; 611 } 612 } else { /* error orcurred */ 613 if (state & RUS_INIT) 614 un->un_state = RUS_DOI; 615 } 616 uniqtime32(&un->un_timestamp); 617 MD_STATUS(un) &= ~MD_UN_GROW_PENDING; 618 un->un_init_colcnt = 0; 619 un->un_init_iocnt = 0; 620 raid_commit(un, NULL); 621 md_unit_writerexit(ui); 622 (void) md_io_writerexit(ui); 623 rw_exit(&md_unit_array_rw.lock); 624 if (err) { 625 if (un->un_state & RUS_DOI) { 626 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_FATAL, 627 SVM_TAG_METADEVICE, setno, MD_SID(un)); 628 } else { 629 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_FAILED, 630 SVM_TAG_METADEVICE, setno, MD_SID(un)); 631 } 632 } else { 633 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_SUCCESS, 634 SVM_TAG_METADEVICE, setno, MD_SID(un)); 635 } 636 (void) raid_internal_close(mnum, OTYP_LYR, 0, 0); 637 /* 638 * Decrement the raid resync count for cpr 639 */ 640 mutex_enter(&md_cpr_resync.md_resync_mutex); 641 md_cpr_resync.md_raid_resync--; 642 mutex_exit(&md_cpr_resync.md_resync_mutex); 643 thread_exit(); 644 /*NOTREACHED*/ 645 } 646 647 static int 648 raid_init_unit(minor_t mnum, md_error_t *ep) 649 { 650 mdi_unit_t *ui; 651 mr_unit_t *un; 652 int rval, i; 653 set_t setno = MD_MIN2SET(mnum); 654 655 ui = MDI_UNIT(mnum); 656 if (md_get_setstatus(setno) & MD_SET_STALE) 657 return (mdmddberror(ep, MDE_DB_STALE, mnum, setno)); 658 659 /* Don't start an init if the device is not available */ 660 if ((ui == NULL) || (ui->ui_tstate & MD_DEV_ERRORED)) { 661 return (mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum)); 662 } 663 664 if (raid_internal_open(mnum, (FREAD | FWRITE), 665 OTYP_LYR, MD_OFLG_ISINIT)) { 666 rval = mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum); 667 goto out; 668 } 669 670 un = md_unit_readerlock(ui); 671 un->un_percent_done = 0; 672 md_unit_readerexit(ui); 673 /* start resync_unit thread */ 674 (void) thread_create(NULL, 0, raid_init_columns, 675 (void *)(uintptr_t)mnum, 0, &p0, TS_RUN, minclsyspri); 676 677 return (0); 678 679 out: 680 un = md_unit_writerlock(ui); 681 MD_STATUS(un) &= ~MD_UN_GROW_PENDING; 682 /* recover state */ 683 for (i = 0; i < un->un_totalcolumncnt; i++) 684 if (COLUMN_STATE(un, i) == RCS_INIT) 685 raid_set_state(un, i, RCS_ERRED, 0); 686 if (un->un_state & RUS_INIT) 687 un->un_state = RUS_DOI; 688 raid_commit(un, NULL); 689 md_unit_writerexit(ui); 690 if (un->un_state & RUS_DOI) { 691 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_FATAL, 692 SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un)); 693 } else { 694 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_FAILED, 695 SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un)); 696 } 697 return (rval); 698 } 699 700 /* 701 * NAME: raid_regen 702 * 703 * DESCRIPTION: regenerate all the parity on the raid device. This 704 * routine starts a thread that will regenerate the 705 * parity on a raid device. If an I/O error occurs during 706 * this process the entire device is placed in error. 707 * 708 * PARAMETERS: md_set_params_t *msp - ioctl packet 709 */ 710 static void 711 regen_unit(minor_t mnum) 712 { 713 mdi_unit_t *ui = MDI_UNIT(mnum); 714 mr_unit_t *un = MD_UNIT(mnum); 715 buf_t buf, *bp; 716 caddr_t buffer; 717 int err = 0; 718 diskaddr_t total_segments; 719 diskaddr_t line; 720 size_t iosize; 721 722 /* 723 * Increment raid resync count for cpr 724 */ 725 mutex_enter(&md_cpr_resync.md_resync_mutex); 726 md_cpr_resync.md_raid_resync++; 727 mutex_exit(&md_cpr_resync.md_resync_mutex); 728 729 iosize = dbtob(un->un_segsize); 730 buffer = kmem_alloc(iosize, KM_SLEEP); 731 bp = &buf; 732 total_segments = un->un_segsincolumn; 733 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_REGEN_START, SVM_TAG_METADEVICE, 734 MD_UN2SET(un), MD_SID(un)); 735 un->un_percent_done = 0; 736 init_buf(bp, B_READ | B_BUSY, iosize); 737 738 for (line = 0; line < total_segments; line++) { 739 bp->b_lblkno = line * 740 ((un->un_origcolumncnt - 1) * un->un_segsize); 741 bp->b_un.b_addr = buffer; 742 bp->b_bcount = iosize; 743 bp->b_iodone = NULL; 744 /* 745 * The following assignment is only correct because 746 * md_raid_strategy is fine when it's only a minor number 747 * and not a real dev_t. Yuck. 748 */ 749 bp->b_edev = mnum; 750 md_raid_strategy(bp, MD_STR_NOTTOP, NULL); 751 if (biowait(bp)) { 752 err = 1; 753 break; 754 } 755 un->un_percent_done = (uint_t)((line * 1000) / 756 un->un_segsincolumn); 757 /* just to avoid rounding errors */ 758 if (un->un_percent_done > 1000) 759 un->un_percent_done = 1000; 760 reset_buf(bp, B_READ | B_BUSY, iosize); 761 } 762 destroy_buf(bp); 763 kmem_free(buffer, iosize); 764 765 (void) md_io_writerlock(ui); 766 (void) raid_internal_close(mnum, OTYP_LYR, 0, 0); 767 (void) md_io_writerexit(ui); 768 un = md_unit_writerlock(ui); 769 if (!err && 770 (raid_state_cnt(un, RCS_OKAY) == un->un_totalcolumncnt)) 771 un->un_state = RUS_OKAY; 772 raid_commit(un, NULL); 773 md_unit_writerexit(ui); 774 if (err || 775 raid_state_cnt(un, RCS_OKAY) != un->un_totalcolumncnt) { 776 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_REGEN_FAILED, 777 SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un)); 778 } else { 779 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_REGEN_DONE, SVM_TAG_METADEVICE, 780 MD_UN2SET(un), MD_SID(un)); 781 } 782 783 /* 784 * Decrement the raid resync count for cpr 785 */ 786 mutex_enter(&md_cpr_resync.md_resync_mutex); 787 md_cpr_resync.md_raid_resync--; 788 mutex_exit(&md_cpr_resync.md_resync_mutex); 789 thread_exit(); 790 } 791 792 static int 793 raid_regen_unit(minor_t mnum, md_error_t *ep) 794 { 795 mdi_unit_t *ui; 796 mr_unit_t *un; 797 int i; 798 set_t setno = MD_MIN2SET(mnum); 799 800 ui = MDI_UNIT(mnum); 801 un = (mr_unit_t *)MD_UNIT(mnum); 802 803 if (md_get_setstatus(setno) & MD_SET_STALE) 804 return (mdmddberror(ep, MDE_DB_STALE, mnum, setno)); 805 806 /* Don't start a regen if the device is not available */ 807 if ((ui == NULL) || (ui->ui_tstate & MD_DEV_ERRORED)) { 808 return (mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum)); 809 } 810 811 if (raid_internal_open(mnum, (FREAD | FWRITE), OTYP_LYR, 0)) { 812 (void) md_unit_writerlock(ui); 813 for (i = 0; i < un->un_totalcolumncnt; i++) 814 raid_set_state(un, i, RCS_ERRED, 0); 815 md_unit_writerexit(ui); 816 return (mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum)); 817 } 818 819 /* start resync_unit thread */ 820 (void) thread_create(NULL, 0, regen_unit, 821 (void *)(uintptr_t)mnum, 0, &p0, TS_RUN, minclsyspri); 822 823 return (0); 824 } 825 826 static int 827 raid_regen(md_regen_param_t *mrp, IOLOCK *lock) 828 { 829 minor_t mnum = mrp->mnum; 830 mr_unit_t *un; 831 832 mdclrerror(&mrp->mde); 833 834 un = md_unit_readerlock(MDI_UNIT(mnum)); 835 836 if (MD_STATUS(un) & MD_UN_GROW_PENDING) { 837 md_unit_readerexit(MDI_UNIT(mnum)); 838 return (mdmderror(&mrp->mde, MDE_IN_USE, mnum)); 839 } 840 841 if ((MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) || 842 (raid_state_cnt(un, RCS_RESYNC))) { 843 md_unit_readerexit(MDI_UNIT(mnum)); 844 return (mdmderror(&mrp->mde, MDE_RESYNC_ACTIVE, mnum)); 845 } 846 847 if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT)) { 848 md_unit_readerexit(MDI_UNIT(mnum)); 849 return (mdmderror(&mrp->mde, MDE_IN_USE, mnum)); 850 } 851 852 if ((raid_state_cnt(un, RCS_OKAY) != un->un_totalcolumncnt) || 853 (! (un->un_state & RUS_OKAY))) { 854 md_unit_readerexit(MDI_UNIT(mnum)); 855 return (mdmderror(&mrp->mde, MDE_RAID_NOT_OKAY, mnum)); 856 } 857 858 md_unit_readerexit(MDI_UNIT(mnum)); 859 860 /* get locks and recheck to be sure something did not change */ 861 if ((un = raid_getun(mnum, &mrp->mde, WRITERS, lock)) == NULL) 862 return (0); 863 864 if ((raid_state_cnt(un, RCS_OKAY) != un->un_totalcolumncnt) || 865 (! (un->un_state & RUS_OKAY))) { 866 return (mdmderror(&mrp->mde, MDE_RAID_NOT_OKAY, mnum)); 867 } 868 869 raid_set_state(un, 0, RCS_REGEN, 0); 870 raid_commit(un, NULL); 871 md_ioctl_droplocks(lock); 872 return (raid_regen_unit(mnum, &mrp->mde)); 873 } 874 875 /* 876 * NAME: raid_set 877 * DESCRIPTION: used to create a RAID metadevice 878 * PARAMETERS: md_set_params_t *d - pointer to set data structure 879 * int mode - must be FWRITE 880 * 881 * LOCKS: none 882 * 883 */ 884 static int 885 raid_set(void *d, int mode) 886 { 887 minor_t mnum; 888 mr_unit_t *un; 889 mddb_recid_t mr_recid; 890 mddb_recid_t *recids; 891 mddb_type_t typ1; 892 int err; 893 set_t setno; 894 int num_recs; 895 int rid; 896 int col; 897 md_set_params_t *msp = d; 898 899 900 mnum = msp->mnum; 901 setno = MD_MIN2SET(mnum); 902 903 mdclrerror(&msp->mde); 904 905 if (raid_getun(mnum, &msp->mde, NO_OLD, NULL) == NULL) 906 return (0); 907 908 typ1 = (mddb_type_t)md_getshared_key(setno, 909 raid_md_ops.md_driver.md_drivername); 910 911 /* create the db record for this mdstruct */ 912 913 if (msp->options & MD_CRO_64BIT) { 914 #if defined(_ILP32) 915 return (mdmderror(&msp->mde, MDE_UNIT_TOO_LARGE, mnum)); 916 #else 917 mr_recid = mddb_createrec(msp->size, typ1, 0, 918 MD_CRO_64BIT | MD_CRO_RAID | MD_CRO_FN, setno); 919 #endif 920 } else { 921 mr_recid = mddb_createrec(msp->size, typ1, 0, 922 MD_CRO_32BIT | MD_CRO_RAID | MD_CRO_FN, setno); 923 } 924 925 if (mr_recid < 0) 926 return (mddbstatus2error(&msp->mde, 927 (int)mr_recid, mnum, setno)); 928 929 /* get the address of the mdstruct */ 930 un = (mr_unit_t *)mddb_getrecaddr(mr_recid); 931 /* 932 * It is okay that we muck with the mdstruct here, 933 * since no one else will know about the mdstruct 934 * until we commit it. If we crash, the record will 935 * be automatically purged, since we haven't 936 * committed it yet. 937 */ 938 939 /* copy in the user's mdstruct */ 940 if (err = ddi_copyin((caddr_t)(uintptr_t)msp->mdp, un, 941 msp->size, mode)) { 942 mddb_deleterec_wrapper(mr_recid); 943 return (EFAULT); 944 } 945 /* All 64 bit metadevices only support EFI labels. */ 946 if (msp->options & MD_CRO_64BIT) { 947 un->c.un_flag |= MD_EFILABEL; 948 } 949 950 /* 951 * allocate the real recids array. since we may have to commit 952 * underlying metadevice records, we need an array of size: 953 * total number of components in raid + 3 (1 for the raid itself, 954 * one for the hotspare, one for the end marker). 955 */ 956 num_recs = un->un_totalcolumncnt + 3; 957 rid = 0; 958 recids = kmem_alloc(num_recs * sizeof (mddb_recid_t), KM_SLEEP); 959 recids[rid++] = mr_recid; 960 961 MD_SID(un) = mnum; 962 MD_RECID(un) = recids[0]; 963 MD_CAPAB(un) = MD_CAN_PARENT | MD_CAN_SP; 964 MD_PARENT(un) = MD_NO_PARENT; 965 un->un_resync_copysize = 0; 966 un->c.un_revision |= MD_FN_META_DEV; 967 968 if (UNIT_STATE(un) == RUS_INIT) 969 MD_STATUS(un) |= MD_UN_GROW_PENDING; 970 971 if ((UNIT_STATE(un) != RUS_INIT) && raid_check_pw(un)) { 972 mddb_deleterec_wrapper(mr_recid); 973 err = mderror(&msp->mde, MDE_RAID_INVALID); 974 goto out; 975 } 976 977 if (err = raid_build_incore(un, 0)) { 978 if (un->mr_ic) { 979 kmem_free(un->un_column_ic, sizeof (mr_column_ic_t) * 980 un->un_totalcolumncnt); 981 kmem_free(un->mr_ic, sizeof (*un->mr_ic)); 982 } 983 MD_UNIT(mnum) = NULL; 984 mddb_deleterec_wrapper(mr_recid); 985 goto out; 986 } 987 988 /* 989 * Update unit availability 990 */ 991 md_set[setno].s_un_avail--; 992 993 recids[rid] = 0; 994 if (un->un_hsp_id != -1) { 995 /* increment the reference count of the hot spare pool */ 996 err = md_hot_spare_ifc(HSP_INCREF, un->un_hsp_id, 0, 0, 997 &recids[rid], NULL, NULL, NULL); 998 if (err) { 999 MD_UNIT(mnum) = NULL; 1000 mddb_deleterec_wrapper(mr_recid); 1001 goto out; 1002 } 1003 rid++; 1004 } 1005 1006 /* 1007 * set the parent on any metadevice components. 1008 * NOTE: currently soft partitions are the only metadevices 1009 * which can appear within a RAID metadevice. 1010 */ 1011 for (col = 0; col < un->un_totalcolumncnt; col++) { 1012 mr_column_t *mr_col = &un->un_column[col]; 1013 md_unit_t *comp_un; 1014 1015 if (md_getmajor(mr_col->un_dev) == md_major) { 1016 comp_un = MD_UNIT(md_getminor(mr_col->un_dev)); 1017 recids[rid++] = MD_RECID(comp_un); 1018 md_set_parent(mr_col->un_dev, MD_SID(un)); 1019 } 1020 } 1021 1022 /* set the end marker */ 1023 recids[rid] = 0; 1024 1025 mddb_commitrecs_wrapper(recids); 1026 md_create_unit_incore(mnum, &raid_md_ops, 1); 1027 1028 SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_CREATE, SVM_TAG_METADEVICE, setno, 1029 MD_SID(un)); 1030 1031 out: 1032 kmem_free(recids, (num_recs * sizeof (mddb_recid_t))); 1033 if (err) 1034 return (err); 1035 1036 /* only attempt to init a device that is in the init state */ 1037 if (UNIT_STATE(un) != RUS_INIT) 1038 return (0); 1039 1040 return (raid_init_unit(mnum, &msp->mde)); 1041 } 1042 1043 /* 1044 * NAME: raid_get 1045 * DESCRIPTION: used to get the unit structure of a RAID metadevice 1046 * PARAMETERS: md_i_get_t *migp - pointer to get data structure 1047 * int mode - must be FREAD 1048 * IOLOCK *lock - pointer to IOCTL lock 1049 * 1050 * LOCKS: obtains unit reader lock via IOLOCK 1051 * 1052 */ 1053 static int 1054 raid_get( 1055 void *migp, 1056 int mode, 1057 IOLOCK *lock 1058 ) 1059 { 1060 minor_t mnum; 1061 mr_unit_t *un; 1062 md_i_get_t *migph = migp; 1063 1064 1065 mnum = migph->id; 1066 1067 mdclrerror(&migph->mde); 1068 1069 if ((un = raid_getun(mnum, &migph->mde, 1070 RD_LOCK, lock)) == NULL) 1071 return (0); 1072 1073 if (migph->size == 0) { 1074 migph->size = un->c.un_size; 1075 return (0); 1076 } 1077 1078 if (migph->size < un->c.un_size) { 1079 return (EFAULT); 1080 } 1081 if (ddi_copyout(un, (void *)(uintptr_t)migph->mdp, 1082 un->c.un_size, mode)) 1083 return (EFAULT); 1084 1085 return (0); 1086 } 1087 1088 1089 /* 1090 * NAME: raid_replace 1091 * DESCRIPTION: used to replace a component of a RAID metadevice 1092 * PARAMETERS: replace_params_t *mrp - pointer to replace data structure 1093 * IOLOCK *lock - pointer to IOCTL lock 1094 * 1095 * LOCKS: obtains unit writer lock via IOLOCK (through raid_getun), 1096 * obtains and releases md_unit_array_rw write lock 1097 * 1098 */ 1099 static int 1100 raid_replace( 1101 replace_params_t *mrp, 1102 IOLOCK *lock 1103 ) 1104 { 1105 minor_t mnum = mrp->mnum; 1106 md_dev64_t odev = mrp->old_dev; 1107 md_error_t *ep = &mrp->mde; 1108 mr_unit_t *un; 1109 rcs_state_t state; 1110 int ix, col = -1; 1111 int force = 0; 1112 int err = 0; 1113 replace_cmd_t cmd; 1114 set_t setno; 1115 side_t side; 1116 mdkey_t devkey; 1117 int nkeys; 1118 mddb_recid_t extra_recids[3] = { 0, 0, 0 }; 1119 int extra_rids = 0; 1120 md_error_t mde = mdnullerror; 1121 sv_dev_t sv = {MD_SET_BAD, MD_SIDEWILD, MD_KEYWILD}; 1122 1123 mdclrerror(ep); 1124 setno = MD_MIN2SET(mnum); 1125 side = mddb_getsidenum(setno); 1126 1127 un = md_unit_readerlock(MDI_UNIT(mnum)); 1128 1129 if ((MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) || 1130 (raid_state_cnt(un, RCS_RESYNC) != 0)) { 1131 md_unit_readerexit(MDI_UNIT(mnum)); 1132 return (mdmderror(ep, MDE_RESYNC_ACTIVE, mnum)); 1133 } 1134 1135 if (un->un_state & RUS_DOI) { 1136 md_unit_readerexit(MDI_UNIT(mnum)); 1137 return (mdmderror(ep, MDE_RAID_DOI, mnum)); 1138 } 1139 1140 if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT) || 1141 (MD_STATUS(un) & MD_UN_GROW_PENDING)) { 1142 md_unit_readerexit(MDI_UNIT(mnum)); 1143 return (mdmderror(ep, MDE_IN_USE, mnum)); 1144 } 1145 1146 md_unit_readerexit(MDI_UNIT(mnum)); 1147 1148 /* get locks and recheck to be sure something did not change */ 1149 if ((un = raid_getun(mnum, ep, WRITERS, lock)) == NULL) 1150 return (0); 1151 1152 if (md_getkeyfromdev(setno, side, odev, &devkey, &nkeys) != 0) { 1153 return (mddeverror(ep, MDE_NAME_SPACE, odev)); 1154 } 1155 1156 for (ix = 0; ix < un->un_totalcolumncnt; ix++) { 1157 md_dev64_t tmpdevt = un->un_column[ix].un_orig_dev; 1158 /* 1159 * Try to resolve devt again if NODEV64 1160 */ 1161 if (tmpdevt == NODEV64) { 1162 tmpdevt = md_resolve_bydevid(mnum, tmpdevt, 1163 un->un_column[ix].un_orig_key); 1164 un->un_column[ix].un_orig_dev = tmpdevt; 1165 } 1166 1167 if (un->un_column[ix].un_orig_dev == odev) { 1168 col = ix; 1169 break; 1170 } else { 1171 if (un->un_column[ix].un_orig_dev == NODEV64) { 1172 /* 1173 * Now we use the keys to match. 1174 * If no key found, continue. 1175 */ 1176 if (nkeys == 0) { 1177 continue; 1178 } 1179 if (un->un_column[ix].un_orig_key == devkey) { 1180 if (nkeys > 1) 1181 return (mddeverror(ep, 1182 MDE_MULTNM, odev)); 1183 col = ix; 1184 break; 1185 } 1186 } 1187 } 1188 } 1189 1190 if (col == -1) 1191 return (mdcomperror(ep, MDE_CANT_FIND_COMP, 1192 mnum, odev)); 1193 1194 if ((MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) || 1195 (raid_state_cnt(un, RCS_RESYNC) != 0)) 1196 return (mdmderror(ep, MDE_RESYNC_ACTIVE, mnum)); 1197 1198 if (un->un_state & RUS_DOI) 1199 return (mdcomperror(ep, MDE_REPL_INVAL_STATE, mnum, 1200 un->un_column[col].un_dev)); 1201 1202 if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT) || 1203 (MD_STATUS(un) & MD_UN_GROW_PENDING)) 1204 return (mdmderror(ep, MDE_IN_USE, mnum)); 1205 1206 if ((mrp->cmd == FORCE_ENABLE_COMP) || (mrp->cmd == FORCE_REPLACE_COMP)) 1207 force = 1; 1208 if ((mrp->cmd == FORCE_ENABLE_COMP) || (mrp->cmd == ENABLE_COMP)) 1209 cmd = ENABLE_COMP; 1210 if ((mrp->cmd == FORCE_REPLACE_COMP) || (mrp->cmd == REPLACE_COMP)) 1211 cmd = REPLACE_COMP; 1212 1213 if (un->un_state == RUS_LAST_ERRED) { 1214 /* Must use -f force flag for unit in LAST_ERRED state */ 1215 if (!force) 1216 return (mdmderror(ep, 1217 MDE_RAID_NEED_FORCE, mnum)); 1218 1219 /* Must use -f force flag on ERRED column first */ 1220 if (un->un_column[col].un_devstate != RCS_ERRED) { 1221 for (ix = 0; ix < un->un_totalcolumncnt; ix++) { 1222 if (un->un_column[ix].un_devstate & RCS_ERRED) 1223 return (mdcomperror(ep, 1224 MDE_RAID_COMP_ERRED, mnum, 1225 un->un_column[ix].un_dev)); 1226 } 1227 } 1228 1229 /* must use -f force flag on LAST_ERRED columns next */ 1230 if ((un->un_column[col].un_devstate != RCS_LAST_ERRED) && 1231 (un->un_column[col].un_devstate != RCS_ERRED)) 1232 return (mdcomperror(ep, MDE_RAID_COMP_ERRED, 1233 mnum, un->un_column[col].un_dev)); 1234 } 1235 1236 if (un->un_state == RUS_ERRED) { 1237 if (! (un->un_column[col].un_devstate & 1238 (RCS_ERRED | RCS_INIT_ERRED))) 1239 return (mdcomperror(ep, MDE_RAID_COMP_ERRED, 1240 mnum, un->un_column[ix].un_dev)); 1241 } 1242 1243 ASSERT(!(un->un_column[col].un_devflags & MD_RAID_ALT_ISOPEN)); 1244 ASSERT(!(un->un_column[col].un_devflags & MD_RAID_WRITE_ALT)); 1245 1246 state = un->un_column[col].un_devstate; 1247 if (state & RCS_INIT_ERRED) { 1248 MD_STATUS(un) |= MD_UN_GROW_PENDING; 1249 un->un_percent_done = 0; 1250 raid_set_state(un, col, RCS_INIT, 0); 1251 } else if (((mrp->options & MDIOCTL_NO_RESYNC_RAID) == 0) && 1252 resync_request(mnum, col, 0, ep)) 1253 return (mdmderror(ep, MDE_RESYNC_ACTIVE, mnum)); 1254 1255 1256 if (cmd == REPLACE_COMP) { 1257 md_dev64_t tmpdev = mrp->new_dev; 1258 1259 /* 1260 * open the device by device id 1261 */ 1262 tmpdev = md_resolve_bydevid(mnum, tmpdev, mrp->new_key); 1263 if (md_layered_open(mnum, &tmpdev, MD_OFLG_NULL)) { 1264 return (mdcomperror(ep, MDE_COMP_OPEN_ERR, mnum, 1265 tmpdev)); 1266 } 1267 1268 /* 1269 * If it's a metadevice, make sure it gets reparented 1270 */ 1271 if (md_getmajor(tmpdev) == md_major) { 1272 minor_t new_mnum = md_getminor(tmpdev); 1273 md_unit_t *new_un = MD_UNIT(new_mnum); 1274 1275 md_set_parent(tmpdev, MD_SID(un)); 1276 extra_recids[extra_rids++] = MD_RECID(new_un); 1277 } 1278 1279 mrp->new_dev = tmpdev; 1280 un->un_column[col].un_orig_dev = tmpdev; 1281 un->un_column[col].un_orig_key = mrp->new_key; 1282 un->un_column[col].un_orig_pwstart = mrp->start_blk; 1283 un->un_column[col].un_orig_devstart = 1284 mrp->start_blk + un->un_pwsize; 1285 1286 /* 1287 * If the old device was a metadevice, make sure to 1288 * reset its parent. 1289 */ 1290 if (md_getmajor(odev) == md_major) { 1291 minor_t old_mnum = md_getminor(odev); 1292 md_unit_t *old_un = MD_UNIT(old_mnum); 1293 1294 md_reset_parent(odev); 1295 extra_recids[extra_rids++] = 1296 MD_RECID(old_un); 1297 } 1298 1299 if (HOTSPARED(un, col)) { 1300 md_layered_close(mrp->new_dev, MD_OFLG_NULL); 1301 un->un_column[col].un_alt_dev = mrp->new_dev; 1302 un->un_column[col].un_alt_pwstart = mrp->start_blk; 1303 un->un_column[col].un_alt_devstart = 1304 mrp->start_blk + un->un_pwsize; 1305 un->un_column[col].un_devflags |= MD_RAID_COPY_RESYNC; 1306 } else { 1307 /* 1308 * not hot spared. Close the old device and 1309 * move the new device in. 1310 */ 1311 if (un->un_column[col].un_devflags & MD_RAID_DEV_ISOPEN) 1312 md_layered_close(odev, MD_OFLG_NULL); 1313 un->un_column[col].un_devflags |= MD_RAID_DEV_ISOPEN; 1314 un->un_column[col].un_dev = mrp->new_dev; 1315 un->un_column[col].un_pwstart = mrp->start_blk; 1316 un->un_column[col].un_devstart = 1317 mrp->start_blk + un->un_pwsize; 1318 if ((mrp->options & MDIOCTL_NO_RESYNC_RAID) == 0) { 1319 un->un_column[col].un_devflags |= 1320 MD_RAID_REGEN_RESYNC; 1321 } 1322 } 1323 /* 1324 * If the old device is not a metadevice then 1325 * save off the set number and key so that it 1326 * can be removed from the namespace later. 1327 */ 1328 if (md_getmajor(odev) != md_major) { 1329 sv.setno = setno; 1330 sv.key = devkey; 1331 } 1332 } 1333 1334 if (cmd == ENABLE_COMP) { 1335 md_dev64_t tmpdev = un->un_column[col].un_orig_dev; 1336 mdkey_t raidkey = un->un_column[col].un_orig_key; 1337 1338 /* 1339 * We trust the dev_t because we cannot determine the 1340 * dev_t from the device id since a new disk is in the 1341 * same location. Since this is a call from metareplace -e dx 1342 * AND it is SCSI a new dev_t is not generated. So the 1343 * dev_t from the mddb is used. Before enabling the device 1344 * we check to make sure that multiple entries for the same 1345 * device does not exist in the namespace. If they do we 1346 * fail the ioctl. 1347 * One of the many ways multiple entries in the name space 1348 * can occur is if one removed the failed component in a 1349 * RAID metadevice and put another disk that was part of 1350 * another metadevice. After reboot metadevadm would correctly 1351 * update the device name for the metadevice whose component 1352 * has moved. However now in the metadb there are two entries 1353 * for the same name (ctds) that belong to different 1354 * metadevices. One is valid, the other is a ghost or "last 1355 * know as" ctds. 1356 */ 1357 tmpdev = md_resolve_bydevid(mnum, tmpdev, raidkey); 1358 if (tmpdev == NODEV64) 1359 tmpdev = md_getdevnum(setno, side, raidkey, 1360 MD_TRUST_DEVT); 1361 /* 1362 * check for multiple entries in namespace for the 1363 * same dev 1364 */ 1365 1366 if (md_getkeyfromdev(setno, side, tmpdev, &devkey, 1367 &nkeys) != 0) 1368 return (mddeverror(ep, MDE_NAME_SPACE, tmpdev)); 1369 /* 1370 * If number of keys are greater that 1371 * 1, then we have an invalid 1372 * namespace. STOP and return. 1373 */ 1374 if (nkeys > 1) 1375 return (mddeverror(ep, MDE_MULTNM, tmpdev)); 1376 if (devkey != raidkey) 1377 return (mdcomperror(ep, MDE_CANT_FIND_COMP, 1378 mnum, tmpdev)); 1379 1380 if (un->un_column[col].un_orig_dev == NODEV64) 1381 un->un_column[col].un_orig_dev = tmpdev; 1382 1383 if (HOTSPARED(un, col)) { 1384 un->un_column[col].un_alt_dev = 1385 un->un_column[col].un_orig_dev; 1386 un->un_column[col].un_alt_pwstart = 1387 un->un_column[col].un_orig_pwstart; 1388 un->un_column[col].un_alt_devstart = 1389 un->un_column[col].un_orig_devstart; 1390 un->un_column[col].un_devflags |= MD_RAID_COPY_RESYNC; 1391 } else { 1392 if (!(un->un_column[col].un_devflags & 1393 MD_RAID_DEV_ISOPEN)) { 1394 if (md_layered_open(mnum, &tmpdev, 1395 MD_OFLG_NULL)) { 1396 un->un_column[col].un_dev = tmpdev; 1397 return (mdcomperror(ep, 1398 MDE_COMP_OPEN_ERR, mnum, tmpdev)); 1399 } 1400 ASSERT(tmpdev != NODEV64 && 1401 tmpdev != 0); 1402 1403 if ((md_getmajor(tmpdev) != md_major) && 1404 (md_devid_found(setno, side, raidkey) 1405 == 1)) { 1406 if (md_update_namespace_did(setno, side, 1407 raidkey, &mde) != 0) { 1408 cmn_err(CE_WARN, 1409 "md: could not" 1410 " update namespace\n"); 1411 } 1412 } 1413 un->un_column[col].un_dev = 1414 un->un_column[col].un_orig_dev; 1415 } 1416 un->un_column[col].un_devflags |= MD_RAID_DEV_ISOPEN; 1417 un->un_column[col].un_devflags |= MD_RAID_REGEN_RESYNC; 1418 } 1419 } 1420 if (mrp->has_label) { 1421 un->un_column[col].un_devflags |= MD_RAID_HAS_LABEL; 1422 } else { 1423 un->un_column[col].un_devflags &= ~MD_RAID_HAS_LABEL; 1424 } 1425 1426 raid_commit(un, extra_recids); 1427 1428 /* If the component has been replaced - clean up the name space */ 1429 if (sv.setno != MD_SET_BAD) { 1430 md_rem_names(&sv, 1); 1431 } 1432 1433 md_ioctl_droplocks(lock); 1434 1435 if ((cmd == ENABLE_COMP) || (cmd == FORCE_ENABLE_COMP)) { 1436 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ENABLE, SVM_TAG_METADEVICE, 1437 setno, MD_SID(un)); 1438 } else { 1439 SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REPLACE, SVM_TAG_METADEVICE, 1440 setno, MD_SID(un)); 1441 } 1442 1443 if (un->un_column[col].un_devstate & RCS_INIT) 1444 err = raid_init_unit(mnum, ep); 1445 else if ((mrp->options & MDIOCTL_NO_RESYNC_RAID) == 0) 1446 err = raid_resync_unit(mnum, ep); 1447 1448 mdclrerror(ep); 1449 if (!err) 1450 return (0); 1451 1452 /* be sure state */ 1453 /* is already set by this time */ 1454 /* fix state and commit record */ 1455 un = md_unit_writerlock(MDI_UNIT(mnum)); 1456 if (state & RCS_INIT_ERRED) 1457 raid_set_state(un, col, state, 1); 1458 else if (state & RCS_OKAY) 1459 raid_set_state(un, col, RCS_ERRED, 0); 1460 else 1461 raid_set_state(un, col, state, 1); 1462 raid_commit(un, NULL); 1463 md_unit_writerexit(MDI_UNIT(mnum)); 1464 mdclrerror(ep); 1465 return (0); 1466 } 1467 1468 1469 /* 1470 * NAME: raid_set_sync 1471 * DESCRIPTION: used to sync a component of a RAID metadevice 1472 * PARAMETERS: md_resync_ioctl_t *mrp - pointer to resync data structure 1473 * int mode - must be FWRITE 1474 * IOLOCK *lock - pointer to IOCTL lock 1475 * 1476 * LOCKS: obtains unit writer lock via IOLOCK (through raid_getun), 1477 * obtains and releases md_unit_array_rw write lock 1478 * 1479 */ 1480 static int 1481 raid_set_sync( 1482 md_resync_ioctl_t *rip, 1483 IOLOCK *lock 1484 ) 1485 { 1486 minor_t mnum = rip->ri_mnum; 1487 mr_unit_t *un; 1488 int init = 0; 1489 int resync = 0; 1490 int regen = 0; 1491 int ix; 1492 int err; 1493 1494 mdclrerror(&rip->mde); 1495 1496 if ((un = raid_getun(mnum, &rip->mde, WRITERS, lock)) == NULL) 1497 return (0); 1498 1499 if (un->un_state & RUS_DOI) 1500 return (mdmderror(&rip->mde, MDE_RAID_DOI, mnum)); 1501 1502 if (un->c.un_status & MD_UN_RESYNC_ACTIVE) 1503 return (mdmderror(&rip->mde, MDE_RESYNC_ACTIVE, mnum)); 1504 1505 /* This prevents new opens */ 1506 1507 rip->ri_flags = 0; 1508 if (un->un_state & RUS_REGEN) 1509 regen++; 1510 1511 if (raid_state_cnt(un, RCS_RESYNC)) 1512 resync++; 1513 1514 if (raid_state_cnt(un, RCS_INIT) || (un->un_state & RUS_INIT)) 1515 init++; 1516 1517 ASSERT(!(resync && init && regen)); 1518 md_ioctl_droplocks(lock); 1519 rip->ri_percent_done = 0; 1520 1521 if (init) { 1522 MD_STATUS(un) |= MD_UN_GROW_PENDING; 1523 return (raid_init_unit(mnum, &rip->mde)); 1524 } 1525 1526 /* 1527 * If resync is needed, it will call raid_internal_open forcing 1528 * replay before the open completes. 1529 * Otherwise, call raid_internal_open directly to force 1530 * replay to complete during boot (metasync -r). 1531 * NOTE: the unit writer lock must remain held while setting 1532 * MD_UN_RESYNC_ACTIVE but must be released before 1533 * calling raid_resync_unit or raid_internal_open. 1534 */ 1535 if (resync) { 1536 ASSERT(resync < 2); 1537 un = md_unit_writerlock(MDI_UNIT(mnum)); 1538 MD_STATUS(un) |= MD_UN_RESYNC_ACTIVE; 1539 /* Must release unit writer lock for resync */ 1540 /* 1541 * correctly setup the devices before trying to start the 1542 * resync operation. 1543 */ 1544 for (ix = 0; un->un_totalcolumncnt; ix++) { 1545 if (un->un_column[ix].un_devstate & RCS_RESYNC) { 1546 if ((un->un_column[ix].un_devflags & 1547 MD_RAID_COPY_RESYNC) && 1548 HOTSPARED(un, ix)) { 1549 un->un_column[ix].un_alt_dev = 1550 un->un_column[ix].un_orig_dev; 1551 un->un_column[ix].un_alt_devstart = 1552 un->un_column[ix].un_orig_devstart; 1553 un->un_column[ix].un_alt_pwstart = 1554 un->un_column[ix].un_orig_pwstart; 1555 } 1556 break; 1557 } 1558 } 1559 ASSERT(un->un_column[ix].un_devflags & 1560 (MD_RAID_COPY_RESYNC | MD_RAID_REGEN_RESYNC)); 1561 rip->ri_percent_done = 0; 1562 un->un_column[ix].un_devflags |= MD_RAID_RESYNC; 1563 (void) resync_request(mnum, ix, 0, NULL); 1564 md_unit_writerexit(MDI_UNIT(mnum)); 1565 err = raid_resync_unit(mnum, &rip->mde); 1566 return (err); 1567 } 1568 1569 if (regen) { 1570 err = raid_regen_unit(mnum, &rip->mde); 1571 return (err); 1572 } 1573 1574 /* The unit requires not work so just force replay of the device */ 1575 if (raid_internal_open(mnum, (FREAD | FWRITE), OTYP_LYR, 0)) 1576 return (mdmderror(&rip->mde, 1577 MDE_RAID_OPEN_FAILURE, mnum)); 1578 (void) raid_internal_close(mnum, OTYP_LYR, 0, 0); 1579 1580 return (0); 1581 } 1582 1583 /* 1584 * NAME: raid_get_resync 1585 * DESCRIPTION: used to check resync status on a component of a RAID metadevice 1586 * PARAMETERS: md_resync_ioctl_t *mrp - pointer to resync data structure 1587 * int mode - must be FWRITE 1588 * IOLOCK *lock - pointer to IOCTL lock 1589 * 1590 * LOCKS: none 1591 * 1592 */ 1593 static int 1594 raid_get_resync( 1595 md_resync_ioctl_t *rip, 1596 IOLOCK *lock 1597 ) 1598 { 1599 minor_t mnum = rip->ri_mnum; 1600 mr_unit_t *un; 1601 u_longlong_t percent; 1602 int cnt; 1603 int ix; 1604 uint64_t d; 1605 1606 mdclrerror(&rip->mde); 1607 1608 if ((un = raid_getun(mnum, &rip->mde, RD_LOCK, lock)) == NULL) 1609 return (0); 1610 1611 rip->ri_flags = 0; 1612 if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) { 1613 d = un->un_segsincolumn; 1614 percent = d ? ((1000 * un->un_resync_line_index) / d) : 0; 1615 if (percent > 1000) 1616 percent = 1000; /* can't go over 100% */ 1617 rip->ri_percent_done = (int)percent; 1618 rip->ri_flags |= MD_RI_INPROGRESS; 1619 } 1620 1621 if (UNIT_STATE(un) & RUS_INIT) { 1622 d = un->un_segsize * un->un_segsincolumn * 1623 un->un_totalcolumncnt; 1624 percent = 1625 d ? ((1000 * (u_longlong_t)un->un_init_iocnt) / d) : 0; 1626 if (percent > 1000) 1627 percent = 1000; /* can't go over 100% */ 1628 rip->ri_percent_done = (int)percent; 1629 rip->ri_flags |= MD_GROW_INPROGRESS; 1630 } else if (MD_STATUS(un) & MD_UN_GROW_PENDING) { 1631 d = un->un_segsize * un->un_segsincolumn * un->un_init_colcnt; 1632 percent = 1633 d ? (((u_longlong_t)un->un_init_iocnt * 1000) / d) : 0; 1634 if (percent > 1000) 1635 percent = 1000; 1636 rip->ri_percent_done = (int)percent; 1637 rip->ri_flags |= MD_GROW_INPROGRESS; 1638 } 1639 1640 if (un->un_state & RUS_REGEN) 1641 rip->ri_percent_done = un->un_percent_done; 1642 1643 cnt = 0; 1644 for (ix = 0; ix < un->un_totalcolumncnt; ix++) { 1645 switch (un->un_column[ix].un_devstate) { 1646 case RCS_INIT: 1647 case RCS_ERRED: 1648 case RCS_LAST_ERRED: 1649 cnt++; 1650 break; 1651 default: 1652 break; 1653 } 1654 } 1655 d = un->un_totalcolumncnt; 1656 rip->ri_percent_dirty = d ? (((u_longlong_t)cnt * 100) / d) : 0; 1657 return (0); 1658 } 1659 1660 /* 1661 * NAME: raid_grow 1662 * DESCRIPTION: Concatenate to a RAID metadevice 1663 * PARAMETERS: md_grow_params_t *mgp 1664 * - pointer to IOCGROW data structure 1665 * int mode - must be FWRITE 1666 * IOLOCK *lockp - IOCTL read/write and unit_array_rw lock 1667 * 1668 * LOCKS: obtains unit writer lock via IOLOCK (through raid_getun), 1669 * obtains and releases md_unit_array_rw write lock 1670 * 1671 */ 1672 static int 1673 raid_grow(void *mgp, int mode, IOLOCK *lock) 1674 { 1675 minor_t mnum; 1676 mr_unit_t *un, *new_un; 1677 mdi_unit_t *ui; 1678 mddb_type_t typ1; 1679 mddb_recid_t mr_recid; 1680 mddb_recid_t old_vtoc = 0; 1681 mddb_recid_t *recids; 1682 md_create_rec_option_t options; 1683 int err; 1684 int col, i; 1685 int64_t tb, atb; 1686 u_longlong_t unrev; 1687 int tc; 1688 int rval = 0; 1689 set_t setno; 1690 mr_column_ic_t *mrc; 1691 int num_recs, rid; 1692 md_grow_params_t *mgph = mgp; 1693 1694 1695 mnum = mgph->mnum; 1696 1697 mdclrerror(&mgph->mde); 1698 1699 ui = MDI_UNIT(mnum); 1700 un = md_unit_readerlock(ui); 1701 1702 if (MD_STATUS(un) & MD_UN_GROW_PENDING) { 1703 md_unit_readerexit(ui); 1704 return (mdmderror(&mgph->mde, MDE_IN_USE, mnum)); 1705 } 1706 1707 if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) { 1708 md_unit_readerexit(ui); 1709 return (mdmderror(&mgph->mde, MDE_RESYNC_ACTIVE, 1710 mnum)); 1711 } 1712 1713 if (UNIT_STATE(un) & RUS_LAST_ERRED) { 1714 md_unit_readerexit(ui); 1715 return (mdmderror(&mgph->mde, MDE_RAID_LAST_ERRED, 1716 mnum)); 1717 } 1718 1719 if (UNIT_STATE(un) & RUS_DOI) { 1720 md_unit_readerexit(ui); 1721 return (mdmderror(&mgph->mde, MDE_RAID_DOI, mnum)); 1722 } 1723 1724 if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT)) { 1725 md_unit_readerexit(ui); 1726 return (mdmderror(&mgph->mde, MDE_IN_USE, mnum)); 1727 } 1728 1729 md_unit_readerexit(ui); 1730 1731 if ((un = raid_getun(mnum, &mgph->mde, WRITERS, lock)) == 1732 NULL) 1733 return (0); 1734 1735 if (MD_STATUS(un) & MD_UN_GROW_PENDING) 1736 return (mdmderror(&mgph->mde, MDE_IN_USE, mnum)); 1737 1738 if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) 1739 return (mdmderror(&mgph->mde, MDE_RESYNC_ACTIVE, 1740 mnum)); 1741 1742 if (un->c.un_size >= mgph->size) 1743 return (EINVAL); 1744 1745 if (UNIT_STATE(un) & RUS_LAST_ERRED) 1746 return (mdmderror(&mgph->mde, MDE_RAID_LAST_ERRED, 1747 mnum)); 1748 1749 if (UNIT_STATE(un) & RUS_DOI) 1750 return (mdmderror(&mgph->mde, MDE_RAID_DOI, mnum)); 1751 1752 if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT)) 1753 return (mdmderror(&mgph->mde, MDE_IN_USE, mnum)); 1754 1755 setno = MD_MIN2SET(mnum); 1756 1757 typ1 = (mddb_type_t)md_getshared_key(setno, 1758 raid_md_ops.md_driver.md_drivername); 1759 1760 /* 1761 * Preserve the friendly name nature of the device that is 1762 * growing. 1763 */ 1764 options = MD_CRO_RAID; 1765 if (un->c.un_revision & MD_FN_META_DEV) 1766 options |= MD_CRO_FN; 1767 if (mgph->options & MD_CRO_64BIT) { 1768 #if defined(_ILP32) 1769 return (mdmderror(&mgph->mde, MDE_UNIT_TOO_LARGE, mnum)); 1770 #else 1771 mr_recid = mddb_createrec(mgph->size, typ1, 0, 1772 MD_CRO_64BIT | options, setno); 1773 #endif 1774 } else { 1775 mr_recid = mddb_createrec(mgph->size, typ1, 0, 1776 MD_CRO_32BIT | options, setno); 1777 } 1778 if (mr_recid < 0) { 1779 rval = mddbstatus2error(&mgph->mde, (int)mr_recid, 1780 mnum, setno); 1781 return (rval); 1782 } 1783 1784 /* get the address of the new unit */ 1785 new_un = (mr_unit_t *)mddb_getrecaddr(mr_recid); 1786 1787 /* 1788 * It is okay that we muck with the new unit here, 1789 * since no one else will know about the unit struct 1790 * until we commit it. If we crash, the record will 1791 * be automatically purged, since we haven't 1792 * committed it yet and the old unit struct will be found. 1793 */ 1794 1795 /* copy in the user's unit struct */ 1796 err = ddi_copyin((void *)(uintptr_t)mgph->mdp, new_un, 1797 mgph->size, mode); 1798 if (err) { 1799 mddb_deleterec_wrapper(mr_recid); 1800 return (EFAULT); 1801 } 1802 1803 /* make sure columns are being added */ 1804 if (un->un_totalcolumncnt >= new_un->un_totalcolumncnt) { 1805 mddb_deleterec_wrapper(mr_recid); 1806 return (EINVAL); 1807 } 1808 1809 /* 1810 * Save a few of the new unit structs fields. 1811 * Before they get clobbered. 1812 */ 1813 tc = new_un->un_totalcolumncnt; 1814 tb = new_un->c.un_total_blocks; 1815 atb = new_un->c.un_actual_tb; 1816 unrev = new_un->c.un_revision; 1817 1818 /* 1819 * Copy the old unit struct (static stuff) 1820 * into new unit struct 1821 */ 1822 bcopy((caddr_t)un, (caddr_t)new_un, un->c.un_size); 1823 1824 /* 1825 * Restore a few of the new unit struct values. 1826 */ 1827 new_un->un_totalcolumncnt = tc; 1828 new_un->c.un_actual_tb = atb; 1829 new_un->un_grow_tb = tb; 1830 new_un->c.un_revision = unrev; 1831 new_un->c.un_record_id = mr_recid; 1832 new_un->c.un_size = mgph->size; 1833 1834 ASSERT(new_un->mr_ic == un->mr_ic); 1835 1836 /* 1837 * Save old column slots 1838 */ 1839 mrc = un->un_column_ic; 1840 1841 /* 1842 * Allocate new column slot 1843 */ 1844 new_un->un_column_ic = (mr_column_ic_t *) 1845 kmem_zalloc(sizeof (mr_column_ic_t) * new_un->un_totalcolumncnt, 1846 KM_SLEEP); 1847 1848 /* 1849 * Restore old column slots 1850 * Free the old column slots 1851 */ 1852 bcopy(mrc, new_un->un_column_ic, 1853 sizeof (mr_column_ic_t) * un->un_totalcolumncnt); 1854 kmem_free(mrc, sizeof (mr_column_ic_t) * un->un_totalcolumncnt); 1855 1856 /* All 64 bit metadevices only support EFI labels. */ 1857 if (mgph->options & MD_CRO_64BIT) { 1858 new_un->c.un_flag |= MD_EFILABEL; 1859 /* 1860 * If the device was previously smaller than a terabyte, 1861 * and had a vtoc record attached to it, we remove the 1862 * vtoc record, because the layout has changed completely. 1863 */ 1864 if (((un->c.un_revision & MD_64BIT_META_DEV) == 0) && 1865 (un->c.un_vtoc_id != 0)) { 1866 old_vtoc = un->c.un_vtoc_id; 1867 new_un->c.un_vtoc_id = 1868 md_vtoc_to_efi_record(old_vtoc, setno); 1869 } 1870 } 1871 1872 1873 /* 1874 * allocate the real recids array. since we may have to commit 1875 * underlying metadevice records, we need an array of size: 1876 * total number of new components being attach + 2 (one for the 1877 * raid itself, one for the end marker). 1878 */ 1879 num_recs = new_un->un_totalcolumncnt + 2; 1880 rid = 0; 1881 recids = kmem_alloc(num_recs * sizeof (mddb_recid_t), KM_SLEEP); 1882 recids[rid++] = mr_recid; 1883 1884 for (col = un->un_totalcolumncnt; 1885 (col < new_un->un_totalcolumncnt); col++) { 1886 mr_column_t *mr_col = &new_un->un_column[col]; 1887 md_unit_t *comp_un; 1888 1889 if (raid_build_pw_reservation(new_un, col) != 0) { 1890 /* release pwslots already allocated by grow */ 1891 for (i = un->un_totalcolumncnt; i < col; i++) { 1892 raid_free_pw_reservation(new_un, i); 1893 } 1894 kmem_free(new_un->un_column_ic, 1895 sizeof (mr_column_ic_t) * 1896 new_un->un_totalcolumncnt); 1897 kmem_free(new_un->mr_ic, sizeof (*un->mr_ic)); 1898 kmem_free(recids, num_recs * sizeof (mddb_recid_t)); 1899 mddb_deleterec_wrapper(mr_recid); 1900 return (EINVAL); 1901 } 1902 /* 1903 * set parent on metadevices being added. 1904 * NOTE: currently soft partitions are the only metadevices 1905 * which can appear within a RAID metadevice. 1906 */ 1907 if (md_getmajor(mr_col->un_dev) == md_major) { 1908 comp_un = MD_UNIT(md_getminor(mr_col->un_dev)); 1909 recids[rid++] = MD_RECID(comp_un); 1910 md_set_parent(mr_col->un_dev, MD_SID(new_un)); 1911 } 1912 new_un->un_column[col].un_devflags = 0; 1913 } 1914 1915 /* set end marker */ 1916 recids[rid] = 0; 1917 1918 /* commit new unit struct */ 1919 mddb_commitrecs_wrapper(recids); 1920 1921 /* delete old unit struct */ 1922 mddb_deleterec_wrapper(un->c.un_record_id); 1923 MD_UNIT(mnum) = new_un; 1924 1925 /* 1926 * If old_vtoc has a non zero value, we know: 1927 * - This unit crossed the border from smaller to larger one TB 1928 * - There was a vtoc record for the unit, 1929 * - This vtoc record is no longer needed, because 1930 * a new efi record has been created for this un. 1931 */ 1932 if (old_vtoc != 0) { 1933 mddb_deleterec_wrapper(old_vtoc); 1934 } 1935 1936 /* free recids */ 1937 kmem_free(recids, num_recs * sizeof (mddb_recid_t)); 1938 1939 SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_GROW, SVM_TAG_METADEVICE, 1940 MD_UN2SET(new_un), MD_SID(new_un)); 1941 MD_STATUS(new_un) |= MD_UN_GROW_PENDING; 1942 1943 /* 1944 * Since the md_ioctl_writelock aquires the unit write lock 1945 * and open/close aquires the unit reader lock it is necessary 1946 * to drop the unit write lock and then reaquire it as needed 1947 * later. 1948 */ 1949 md_unit_writerexit(ui); 1950 1951 if (raid_internal_open(mnum, (FREAD | FWRITE), OTYP_LYR, 0)) { 1952 rval = mdmderror(&mgph->mde, MDE_RAID_OPEN_FAILURE, 1953 mnum); 1954 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE, 1955 MD_UN2SET(new_un), MD_SID(new_un)); 1956 return (rval); 1957 } 1958 (void) md_unit_writerlock(ui); 1959 for (i = 0; i < new_un->un_totalcolumncnt; i++) { 1960 if (new_un->un_column[i].un_devstate & RCS_OKAY) 1961 (void) init_pw_area(new_un, new_un->un_column[i].un_dev, 1962 new_un->un_column[i].un_pwstart, i); 1963 } 1964 md_unit_writerexit(ui); 1965 (void) raid_internal_close(mnum, OTYP_LYR, 0, 0); 1966 (void) md_unit_writerlock(ui); 1967 /* create a background thread to initialize the columns */ 1968 md_ioctl_droplocks(lock); 1969 1970 return (raid_init_unit(mnum, &mgph->mde)); 1971 } 1972 1973 /* 1974 * NAME: raid_reset 1975 * DESCRIPTION: used to reset (clear / remove) a RAID metadevice 1976 * PARAMETERS: md_i_reset_t *mirp - pointer to reset data structure 1977 * 1978 * LOCKS: obtains and releases md_unit_array_rw write lock 1979 * 1980 */ 1981 static int 1982 raid_reset(md_i_reset_t *mirp) 1983 { 1984 minor_t mnum = mirp->mnum; 1985 mr_unit_t *un; 1986 mdi_unit_t *ui; 1987 set_t setno = MD_MIN2SET(mnum); 1988 1989 mdclrerror(&mirp->mde); 1990 1991 rw_enter(&md_unit_array_rw.lock, RW_WRITER); 1992 /* 1993 * NOTE: need to get md_unit_writerlock to avoid conflict 1994 * with raid_init thread. 1995 */ 1996 if ((un = raid_getun(mnum, &mirp->mde, NO_LOCK, NULL)) == 1997 NULL) { 1998 rw_exit(&md_unit_array_rw.lock); 1999 return (0); 2000 } 2001 ui = MDI_UNIT(mnum); 2002 2003 if (MD_HAS_PARENT(MD_PARENT(un))) { 2004 rw_exit(&md_unit_array_rw.lock); 2005 return (mdmderror(&mirp->mde, MDE_IN_USE, mnum)); 2006 } 2007 2008 un = (mr_unit_t *)md_unit_openclose_enter(ui); 2009 if (md_unit_isopen(MDI_UNIT(mnum))) { 2010 md_unit_openclose_exit(ui); 2011 rw_exit(&md_unit_array_rw.lock); 2012 return (mdmderror(&mirp->mde, MDE_IS_OPEN, mnum)); 2013 } 2014 md_unit_openclose_exit(ui); 2015 if (UNIT_STATE(un) != RUS_OKAY && !mirp->force) { 2016 rw_exit(&md_unit_array_rw.lock); 2017 return (mdmderror(&mirp->mde, MDE_RAID_NEED_FORCE, mnum)); 2018 } 2019 2020 reset_raid(un, mnum, 1); 2021 2022 /* 2023 * Update unit availability 2024 */ 2025 md_set[setno].s_un_avail++; 2026 2027 /* 2028 * If MN set, reset s_un_next so all nodes can have 2029 * the same view of the next available slot when 2030 * nodes are -w and -j 2031 */ 2032 if (MD_MNSET_SETNO(setno)) { 2033 (void) md_upd_set_unnext(setno, MD_MIN2UNIT(mnum)); 2034 } 2035 2036 rw_exit(&md_unit_array_rw.lock); 2037 2038 return (0); 2039 } 2040 2041 /* 2042 * NAME: raid_get_geom 2043 * DESCRIPTION: used to get the geometry of a RAID metadevice 2044 * PARAMETERS: mr_unit_t *un - RAID unit to get the geometry for 2045 * struct dk_geom *gp - pointer to geometry data structure 2046 * 2047 * LOCKS: none 2048 * 2049 */ 2050 static int 2051 raid_get_geom( 2052 mr_unit_t *un, 2053 struct dk_geom *geomp 2054 ) 2055 { 2056 md_get_geom((md_unit_t *)un, geomp); 2057 2058 return (0); 2059 } 2060 2061 /* 2062 * NAME: raid_get_vtoc 2063 * DESCRIPTION: used to get the VTOC on a RAID metadevice 2064 * PARAMETERS: mr_unit_t *un - RAID unit to get the VTOC from 2065 * struct vtoc *vtocp - pointer to VTOC data structure 2066 * 2067 * LOCKS: none 2068 * 2069 */ 2070 static int 2071 raid_get_vtoc( 2072 mr_unit_t *un, 2073 struct vtoc *vtocp 2074 ) 2075 { 2076 md_get_vtoc((md_unit_t *)un, vtocp); 2077 2078 return (0); 2079 } 2080 2081 /* 2082 * NAME: raid_set_vtoc 2083 * DESCRIPTION: used to set the VTOC on a RAID metadevice 2084 * PARAMETERS: mr_unit_t *un - RAID unit to set the VTOC on 2085 * struct vtoc *vtocp - pointer to VTOC data structure 2086 * 2087 * LOCKS: none 2088 * 2089 */ 2090 static int 2091 raid_set_vtoc( 2092 mr_unit_t *un, 2093 struct vtoc *vtocp 2094 ) 2095 { 2096 return (md_set_vtoc((md_unit_t *)un, vtocp)); 2097 } 2098 2099 2100 2101 /* 2102 * NAME: raid_get_cgapart 2103 * DESCRIPTION: used to get the dk_map on a RAID metadevice 2104 * PARAMETERS: mr_unit_t *un - RAID unit to set the VTOC on 2105 * struct vtoc *dkmapp - pointer to dk_map data structure 2106 * 2107 * LOCKS: none 2108 * 2109 */ 2110 2111 static int 2112 raid_get_cgapart( 2113 mr_unit_t *un, 2114 struct dk_map *dkmapp 2115 ) 2116 { 2117 md_get_cgapart((md_unit_t *)un, dkmapp); 2118 return (0); 2119 } 2120 2121 /* 2122 * NAME: raid_getdevs 2123 * DESCRIPTION: return all devices within a RAID metadevice 2124 * PARAMETERS: md_getdevs_params_t *mgdp 2125 * - pointer to getdevs IOCTL data structure 2126 * int mode - should be FREAD 2127 * IOLOCK *lockp - IOCTL read/write lock 2128 * 2129 * LOCKS: obtains unit reader lock via IOLOCK 2130 * 2131 */ 2132 static int 2133 raid_getdevs( 2134 void *mgdp, 2135 int mode, 2136 IOLOCK *lock 2137 ) 2138 { 2139 minor_t mnum; 2140 mr_unit_t *un; 2141 md_dev64_t *udevs; 2142 int i, cnt; 2143 md_dev64_t unit_dev; 2144 md_getdevs_params_t *mgdph = mgdp; 2145 2146 2147 mnum = mgdph->mnum; 2148 2149 /* check out unit */ 2150 mdclrerror(&mgdph->mde); 2151 2152 if ((un = raid_getun(mnum, &mgdph->mde, RD_LOCK, 2153 lock)) == NULL) 2154 return (0); 2155 2156 udevs = (md_dev64_t *)(uintptr_t)mgdph->devs; 2157 2158 for (cnt = 0, i = 0; i < un->un_totalcolumncnt; i++, cnt++) { 2159 if (cnt < mgdph->cnt) { 2160 unit_dev = un->un_column[i].un_orig_dev; 2161 if (md_getmajor(unit_dev) != md_major) { 2162 if ((unit_dev = md_xlate_mini_2_targ 2163 (unit_dev)) == NODEV64) 2164 return (ENODEV); 2165 } 2166 2167 if (ddi_copyout((caddr_t)&unit_dev, 2168 (caddr_t)&udevs[cnt], sizeof (*udevs), 2169 mode) != 0) 2170 return (EFAULT); 2171 } 2172 if (HOTSPARED(un, i)) { 2173 cnt++; 2174 if (cnt >= mgdph->cnt) 2175 continue; 2176 2177 unit_dev = un->un_column[i].un_dev; 2178 if (md_getmajor(unit_dev) != md_major) { 2179 if ((unit_dev = md_xlate_mini_2_targ 2180 (unit_dev)) == NODEV64) 2181 return (ENODEV); 2182 } 2183 2184 if (ddi_copyout((caddr_t)&unit_dev, 2185 (caddr_t)&udevs[cnt], sizeof (*udevs), 2186 mode) != 0) 2187 return (EFAULT); 2188 } 2189 } 2190 mgdph->cnt = cnt; 2191 return (0); 2192 } 2193 2194 /* 2195 * NAME: raid_change 2196 * DESCRIPTION: used to change the following dynamic values: 2197 * the hot spare pool 2198 * in the unit structure of a RAID metadevice 2199 * PARAMETERS: md_change_params_t *mcp - pointer to change data structure 2200 * IOLOCK *lock - pointer to IOCTL lock 2201 * 2202 * LOCKS: obtains unit writer lock via IOLOCK (through raid_getun) 2203 * 2204 */ 2205 static int 2206 raid_change( 2207 md_raid_params_t *mrp, 2208 IOLOCK *lock 2209 ) 2210 { 2211 minor_t mnum = mrp->mnum; 2212 mr_unit_t *un; 2213 int ix; 2214 mddb_recid_t recids[3] = {0, 0, 0}; 2215 int err; 2216 int irecid; 2217 int inc_new_hsp = 0; 2218 2219 mdclrerror(&mrp->mde); 2220 2221 if ((un = raid_getun(mnum, &mrp->mde, WR_LOCK, lock)) == NULL) 2222 return (0); 2223 2224 if (!mrp->params.change_hsp_id) 2225 return (0); 2226 2227 /* verify that no hotspare is in use */ 2228 for (ix = 0; ix < un->un_totalcolumncnt; ix++) { 2229 if (HOTSPARED(un, ix)) { 2230 return (mdmderror(&mrp->mde, MDE_HS_IN_USE, mnum)); 2231 } 2232 } 2233 2234 /* replace the hot spare pool */ 2235 2236 irecid = 0; 2237 if (mrp->params.hsp_id != -1) { 2238 /* increment the reference count of the new hsp */ 2239 err = md_hot_spare_ifc(HSP_INCREF, mrp->params.hsp_id, 0, 0, 2240 &recids[0], NULL, NULL, NULL); 2241 if (err) { 2242 return (mdhsperror(&mrp->mde, MDE_INVAL_HSP, 2243 mrp->params.hsp_id)); 2244 } 2245 inc_new_hsp = 1; 2246 irecid++; 2247 } 2248 2249 if (un->un_hsp_id != -1) { 2250 /* decrement the reference count of the old hsp */ 2251 err = md_hot_spare_ifc(HSP_DECREF, un->un_hsp_id, 0, 0, 2252 &recids[irecid], NULL, NULL, NULL); 2253 if (err) { 2254 err = mdhsperror(&mrp->mde, MDE_INVAL_HSP, 2255 mrp->params.hsp_id); 2256 if (inc_new_hsp) { 2257 (void) md_hot_spare_ifc(HSP_DECREF, 2258 mrp->params.hsp_id, 0, 0, 2259 &recids[0], NULL, NULL, NULL); 2260 /* 2261 * Don't need to commit the record, 2262 * because it wasn't committed before 2263 */ 2264 } 2265 return (err); 2266 } 2267 } 2268 2269 un->un_hsp_id = mrp->params.hsp_id; 2270 2271 raid_commit(un, recids); 2272 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_CHANGE, SVM_TAG_METADEVICE, 2273 MD_UN2SET(un), MD_SID(un)); 2274 2275 /* Now trigger hot spare processing in case one is needed. */ 2276 if ((un->un_hsp_id != -1) && (un->un_state == RUS_ERRED)) 2277 (void) raid_hotspares(); 2278 2279 return (0); 2280 } 2281 2282 /* 2283 * NAME: raid_admin_ioctl 2284 * DESCRIPTION: IOCTL operations unique to metadevices and RAID 2285 * PARAMETERS: int cmd - IOCTL command to be executed 2286 * void *data - pointer to IOCTL data structure 2287 * int mode - either FREAD or FWRITE 2288 * IOLOCK *lockp - IOCTL read/write lock 2289 * 2290 * LOCKS: none 2291 * 2292 */ 2293 static int 2294 raid_admin_ioctl( 2295 int cmd, 2296 void *data, 2297 int mode, 2298 IOLOCK *lockp 2299 ) 2300 { 2301 size_t sz = 0; 2302 void *d = NULL; 2303 int err = 0; 2304 2305 /* We can only handle 32-bit clients for internal commands */ 2306 if ((mode & DATAMODEL_MASK) != DATAMODEL_ILP32) { 2307 return (EINVAL); 2308 } 2309 2310 2311 /* dispatch ioctl */ 2312 switch (cmd) { 2313 2314 case MD_IOCSET: 2315 { 2316 if (! (mode & FWRITE)) 2317 return (EACCES); 2318 2319 sz = sizeof (md_set_params_t); 2320 d = kmem_alloc(sz, KM_SLEEP); 2321 2322 if (ddi_copyin(data, d, sz, mode)) { 2323 err = EFAULT; 2324 break; 2325 } 2326 2327 err = raid_set(d, mode); 2328 break; 2329 } 2330 2331 case MD_IOCGET: 2332 { 2333 if (! (mode & FREAD)) 2334 return (EACCES); 2335 2336 sz = sizeof (md_i_get_t); 2337 d = kmem_alloc(sz, KM_SLEEP); 2338 2339 if (ddi_copyin(data, d, sz, mode)) { 2340 err = EFAULT; 2341 break; 2342 } 2343 2344 err = raid_get(d, mode, lockp); 2345 break; 2346 } 2347 2348 case MD_IOCREPLACE: 2349 { 2350 if (! (mode & FWRITE)) 2351 return (EACCES); 2352 2353 sz = sizeof (replace_params_t); 2354 d = kmem_alloc(sz, KM_SLEEP); 2355 2356 if (ddi_copyin(data, d, sz, mode)) { 2357 err = EFAULT; 2358 break; 2359 } 2360 2361 err = raid_replace((replace_params_t *)d, lockp); 2362 break; 2363 } 2364 2365 case MD_IOCSETSYNC: 2366 { 2367 if (! (mode & FWRITE)) 2368 return (EACCES); 2369 2370 sz = sizeof (md_resync_ioctl_t); 2371 d = kmem_alloc(sz, KM_SLEEP); 2372 2373 if (ddi_copyin(data, d, sz, mode)) { 2374 err = EFAULT; 2375 break; 2376 } 2377 2378 err = raid_set_sync((md_resync_ioctl_t *)d, lockp); 2379 break; 2380 } 2381 2382 case MD_IOCGETSYNC: 2383 { 2384 if (! (mode & FREAD)) 2385 return (EACCES); 2386 2387 sz = sizeof (md_resync_ioctl_t); 2388 d = kmem_alloc(sz, KM_SLEEP); 2389 2390 if (ddi_copyin(data, d, sz, mode)) { 2391 err = EFAULT; 2392 break; 2393 } 2394 err = raid_get_resync((md_resync_ioctl_t *)d, lockp); 2395 2396 break; 2397 } 2398 2399 case MD_IOCGROW: 2400 { 2401 if (! (mode & FWRITE)) 2402 return (EACCES); 2403 2404 sz = sizeof (md_grow_params_t); 2405 d = kmem_alloc(sz, KM_SLEEP); 2406 2407 if (ddi_copyin(data, d, sz, mode)) { 2408 err = EFAULT; 2409 break; 2410 } 2411 2412 err = raid_grow(d, mode, lockp); 2413 break; 2414 } 2415 2416 case MD_IOCCHANGE: 2417 { 2418 if (! (mode & FWRITE)) 2419 return (EACCES); 2420 2421 sz = sizeof (md_raid_params_t); 2422 d = kmem_alloc(sz, KM_SLEEP); 2423 2424 if (ddi_copyin(data, d, sz, mode)) { 2425 err = EFAULT; 2426 break; 2427 } 2428 2429 err = raid_change((md_raid_params_t *)d, lockp); 2430 break; 2431 } 2432 2433 case MD_IOCRESET: 2434 { 2435 if (! (mode & FWRITE)) 2436 return (EACCES); 2437 2438 sz = sizeof (md_i_reset_t); 2439 d = kmem_alloc(sz, KM_SLEEP); 2440 2441 if (ddi_copyin(data, d, sz, mode)) { 2442 err = EFAULT; 2443 break; 2444 } 2445 2446 err = raid_reset((md_i_reset_t *)d); 2447 break; 2448 } 2449 2450 case MD_IOCGET_DEVS: 2451 { 2452 if (! (mode & FREAD)) 2453 return (EACCES); 2454 2455 sz = sizeof (md_getdevs_params_t); 2456 d = kmem_alloc(sz, KM_SLEEP); 2457 2458 if (ddi_copyin(data, d, sz, mode)) { 2459 err = EFAULT; 2460 break; 2461 } 2462 2463 err = raid_getdevs(d, mode, lockp); 2464 break; 2465 } 2466 2467 case MD_IOCSETREGEN: 2468 { 2469 if (! (mode & FWRITE)) 2470 return (EACCES); 2471 2472 sz = sizeof (md_regen_param_t); 2473 d = kmem_alloc(sz, KM_SLEEP); 2474 2475 if (ddi_copyin(data, d, sz, mode)) { 2476 err = EFAULT; 2477 break; 2478 } 2479 2480 err = raid_regen((md_regen_param_t *)d, lockp); 2481 break; 2482 } 2483 2484 case MD_IOCPROBE_DEV: 2485 { 2486 md_probedev_impl_t *p = NULL; 2487 md_probedev_t *ph = NULL; 2488 daemon_queue_t *hdr = NULL; 2489 int i; 2490 size_t sz1 = 0; 2491 2492 2493 if (! (mode & FREAD)) 2494 return (EACCES); 2495 2496 sz = sizeof (md_probedev_t); 2497 2498 d = kmem_alloc(sz, KM_SLEEP); 2499 2500 /* now copy in the data */ 2501 if (ddi_copyin(data, d, sz, mode)) { 2502 err = EFAULT; 2503 goto free_mem; 2504 } 2505 2506 /* 2507 * Sanity test the args. Test name should have the keyword 2508 * probe. 2509 */ 2510 p = kmem_alloc(sizeof (md_probedev_impl_t), KM_SLEEP); 2511 p->probe_sema = NULL; 2512 p->probe_mx = NULL; 2513 p->probe.mnum_list = (uint64_t)NULL; 2514 2515 ph = (md_probedev_t *)d; 2516 p->probe.nmdevs = ph->nmdevs; 2517 (void) strcpy(p->probe.test_name, ph->test_name); 2518 bcopy(&ph->md_driver, &(p->probe.md_driver), 2519 sizeof (md_driver_t)); 2520 2521 if ((p->probe.nmdevs < 1) || 2522 (strstr(p->probe.test_name, "probe") == NULL)) { 2523 err = EINVAL; 2524 goto free_mem; 2525 } 2526 2527 sz1 = sizeof (minor_t) * p->probe.nmdevs; 2528 2529 p->probe.mnum_list = (uint64_t)(uintptr_t)kmem_alloc(sz1, 2530 KM_SLEEP); 2531 2532 if (ddi_copyin((caddr_t)(uintptr_t)ph->mnum_list, 2533 (caddr_t)(uintptr_t)p->probe.mnum_list, sz1, mode)) { 2534 err = EFAULT; 2535 goto free_mem; 2536 } 2537 2538 if (err = md_init_probereq(p, &hdr)) 2539 goto free_mem; 2540 2541 /* 2542 * put the request on the queue and wait. 2543 */ 2544 2545 daemon_request_new(&md_ff_daemonq, md_probe_one, hdr, REQ_NEW); 2546 2547 (void) IOLOCK_RETURN(0, lockp); 2548 /* wait for the events to occur */ 2549 for (i = 0; i < p->probe.nmdevs; i++) { 2550 sema_p(PROBE_SEMA(p)); 2551 } 2552 while (md_ioctl_lock_enter() == EINTR); 2553 2554 /* 2555 * clean up. The hdr list is freed in the probe routines 2556 * since the list is NULL by the time we get here. 2557 */ 2558 free_mem: 2559 if (p) { 2560 if (p->probe_sema != NULL) { 2561 sema_destroy(PROBE_SEMA(p)); 2562 kmem_free(p->probe_sema, sizeof (ksema_t)); 2563 } 2564 if (p->probe_mx != NULL) { 2565 mutex_destroy(PROBE_MX(p)); 2566 kmem_free(p->probe_mx, sizeof (kmutex_t)); 2567 } 2568 if (p->probe.mnum_list) 2569 kmem_free((caddr_t)(uintptr_t) 2570 p->probe.mnum_list, sz1); 2571 2572 kmem_free(p, sizeof (md_probedev_impl_t)); 2573 } 2574 break; 2575 } 2576 2577 default: 2578 return (ENOTTY); 2579 } 2580 2581 /* 2582 * copyout and free any args 2583 */ 2584 if (sz != 0) { 2585 if (err == 0) { 2586 if (ddi_copyout(d, data, sz, mode) != 0) { 2587 err = EFAULT; 2588 } 2589 } 2590 kmem_free(d, sz); 2591 } 2592 return (err); 2593 } 2594 2595 /* 2596 * NAME: md_raid_ioctl 2597 * DESCRIPTION: RAID metadevice IOCTL operations entry point. 2598 * PARAMETERS: md_dev64_t dev - RAID device identifier 2599 * int cmd - IOCTL command to be executed 2600 * void *data - pointer to IOCTL data structure 2601 * int mode - either FREAD or FWRITE 2602 * IOLOCK *lockp - IOCTL read/write lock 2603 * 2604 * LOCKS: none 2605 * 2606 */ 2607 int 2608 md_raid_ioctl( 2609 dev_t dev, 2610 int cmd, 2611 void *data, 2612 int mode, 2613 IOLOCK *lockp 2614 ) 2615 { 2616 minor_t mnum = getminor(dev); 2617 mr_unit_t *un; 2618 int err = 0; 2619 2620 /* handle admin ioctls */ 2621 if (mnum == MD_ADM_MINOR) 2622 return (raid_admin_ioctl(cmd, data, mode, lockp)); 2623 2624 /* check unit */ 2625 if ((MD_MIN2SET(mnum) >= md_nsets) || 2626 (MD_MIN2UNIT(mnum) >= md_nunits) || 2627 ((un = MD_UNIT(mnum)) == NULL)) 2628 return (ENXIO); 2629 2630 /* is this a supported ioctl? */ 2631 err = md_check_ioctl_against_efi(cmd, un->c.un_flag); 2632 if (err != 0) { 2633 return (err); 2634 } 2635 2636 /* dispatch ioctl */ 2637 switch (cmd) { 2638 2639 case DKIOCINFO: 2640 { 2641 struct dk_cinfo *p; 2642 2643 if (! (mode & FREAD)) 2644 return (EACCES); 2645 2646 p = kmem_alloc(sizeof (*p), KM_SLEEP); 2647 2648 get_info(p, mnum); 2649 if (ddi_copyout((caddr_t)p, data, sizeof (*p), mode) != 0) 2650 err = EFAULT; 2651 2652 kmem_free(p, sizeof (*p)); 2653 return (err); 2654 } 2655 2656 case DKIOCGMEDIAINFO: 2657 { 2658 struct dk_minfo p; 2659 2660 if (! (mode & FREAD)) 2661 return (EACCES); 2662 2663 get_minfo(&p, mnum); 2664 if (ddi_copyout(&p, data, sizeof (struct dk_minfo), mode) != 0) 2665 err = EFAULT; 2666 2667 return (err); 2668 } 2669 2670 case DKIOCGGEOM: 2671 { 2672 struct dk_geom *p; 2673 2674 if (! (mode & FREAD)) 2675 return (EACCES); 2676 2677 p = kmem_alloc(sizeof (*p), KM_SLEEP); 2678 2679 if ((err = raid_get_geom(un, p)) == 0) { 2680 if (ddi_copyout((caddr_t)p, data, sizeof (*p), 2681 mode) != 0) 2682 err = EFAULT; 2683 } 2684 2685 kmem_free(p, sizeof (*p)); 2686 return (err); 2687 } 2688 2689 case DKIOCGVTOC: 2690 { 2691 struct vtoc vtoc; 2692 2693 if (! (mode & FREAD)) 2694 return (EACCES); 2695 2696 if ((err = raid_get_vtoc(un, &vtoc)) != 0) { 2697 return (err); 2698 } 2699 2700 if ((mode & DATAMODEL_MASK) == DATAMODEL_NATIVE) { 2701 if (ddi_copyout(&vtoc, data, sizeof (vtoc), mode)) 2702 err = EFAULT; 2703 } 2704 #ifdef _SYSCALL32 2705 else { 2706 struct vtoc32 vtoc32; 2707 vtoctovtoc32(vtoc, vtoc32); 2708 if (ddi_copyout(&vtoc32, data, sizeof (vtoc32), mode)) 2709 err = EFAULT; 2710 } 2711 #endif /* _SYSCALL32 */ 2712 2713 return (err); 2714 } 2715 2716 case DKIOCSVTOC: 2717 { 2718 struct vtoc vtoc; 2719 2720 if (! (mode & FWRITE)) 2721 return (EACCES); 2722 2723 if ((mode & DATAMODEL_MASK) == DATAMODEL_NATIVE) { 2724 if (ddi_copyin(data, &vtoc, sizeof (vtoc), mode)) { 2725 err = EFAULT; 2726 } 2727 } 2728 #ifdef _SYSCALL32 2729 else { 2730 struct vtoc32 vtoc32; 2731 if (ddi_copyin(data, &vtoc32, sizeof (vtoc32), mode)) { 2732 err = EFAULT; 2733 } else { 2734 vtoc32tovtoc(vtoc32, vtoc); 2735 } 2736 } 2737 #endif /* _SYSCALL32 */ 2738 2739 if (err == 0) 2740 err = raid_set_vtoc(un, &vtoc); 2741 2742 return (err); 2743 } 2744 2745 case DKIOCGAPART: 2746 { 2747 struct dk_map dmp; 2748 2749 if ((err = raid_get_cgapart(un, &dmp)) != 0) { 2750 return (err); 2751 } 2752 2753 if ((mode & DATAMODEL_MASK) == DATAMODEL_NATIVE) { 2754 if (ddi_copyout((caddr_t)&dmp, data, sizeof (dmp), 2755 mode) != 0) 2756 err = EFAULT; 2757 } 2758 #ifdef _SYSCALL32 2759 else { 2760 struct dk_map32 dmp32; 2761 2762 dmp32.dkl_cylno = dmp.dkl_cylno; 2763 dmp32.dkl_nblk = dmp.dkl_nblk; 2764 2765 if (ddi_copyout((caddr_t)&dmp32, data, sizeof (dmp32), 2766 mode) != 0) 2767 err = EFAULT; 2768 } 2769 #endif /* _SYSCALL32 */ 2770 2771 return (err); 2772 } 2773 case DKIOCGETEFI: 2774 { 2775 /* 2776 * This one can be done centralized, 2777 * no need to put in the same code for all types of metadevices 2778 */ 2779 return (md_dkiocgetefi(mnum, data, mode)); 2780 } 2781 2782 case DKIOCSETEFI: 2783 { 2784 /* 2785 * This one can be done centralized, 2786 * no need to put in the same code for all types of metadevices 2787 */ 2788 return (md_dkiocsetefi(mnum, data, mode)); 2789 } 2790 2791 case DKIOCPARTITION: 2792 { 2793 return (md_dkiocpartition(mnum, data, mode)); 2794 } 2795 2796 default: 2797 return (ENOTTY); 2798 } 2799 } 2800 2801 /* 2802 * rename/exchange named service entry points and support functions follow. 2803 * Most functions are handled generically, except for raid-specific locking 2804 * and checking 2805 */ 2806 2807 /* 2808 * NAME: raid_may_renexch_self 2809 * DESCRIPTION: support routine for rename check ("MDRNM_CHECK") named service 2810 * PARAMETERS: mr_unit_t *un - unit struct of raid unit to be renamed 2811 * mdi_unit_t *ui - in-core unit struct of same raid unit 2812 * md_rentxn_t *rtxnp - rename transaction state 2813 * 2814 * LOCKS: none 2815 * 2816 */ 2817 static int 2818 raid_may_renexch_self( 2819 mr_unit_t *un, 2820 mdi_unit_t *ui, 2821 md_rentxn_t *rtxnp) 2822 { 2823 minor_t from_min; 2824 minor_t to_min; 2825 bool_t toplevel; 2826 bool_t related; 2827 2828 from_min = rtxnp->from.mnum; 2829 to_min = rtxnp->to.mnum; 2830 2831 if (!un || !ui) { 2832 (void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR, 2833 from_min); 2834 return (EINVAL); 2835 } 2836 2837 ASSERT(!(MD_CAPAB(un) & MD_CAN_META_CHILD)); 2838 if (MD_CAPAB(un) & MD_CAN_META_CHILD) { 2839 (void) mdmderror(&rtxnp->mde, MDE_RENAME_SOURCE_BAD, from_min); 2840 return (EINVAL); 2841 } 2842 2843 if (MD_PARENT(un) == MD_MULTI_PARENT) { 2844 (void) mdmderror(&rtxnp->mde, MDE_RENAME_SOURCE_BAD, from_min); 2845 return (EINVAL); 2846 } 2847 2848 toplevel = !MD_HAS_PARENT(MD_PARENT(un)); 2849 2850 /* we're related if trying to swap with our parent */ 2851 related = (!toplevel) && (MD_PARENT(un) == to_min); 2852 2853 switch (rtxnp->op) { 2854 case MDRNOP_EXCHANGE: 2855 2856 if (!related) { 2857 (void) mdmderror(&rtxnp->mde, 2858 MDE_RENAME_TARGET_UNRELATED, to_min); 2859 return (EINVAL); 2860 } 2861 2862 break; 2863 2864 case MDRNOP_RENAME: 2865 /* 2866 * if from is top-level and is open, then the kernel is using 2867 * the md_dev64_t. 2868 */ 2869 2870 if (toplevel && md_unit_isopen(ui)) { 2871 (void) mdmderror(&rtxnp->mde, MDE_RENAME_BUSY, 2872 from_min); 2873 return (EBUSY); 2874 } 2875 break; 2876 2877 default: 2878 (void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR, 2879 from_min); 2880 return (EINVAL); 2881 } 2882 2883 return (0); /* ok */ 2884 } 2885 2886 /* 2887 * NAME: raid_rename_check 2888 * DESCRIPTION: ("MDRNM_CHECK") rename/exchange named service entry point 2889 * PARAMETERS: md_rendelta_t *delta - describes changes to be made to this 2890 * raid device for rename transaction 2891 * md_rentxn_t *rtxnp - rename transaction state 2892 * 2893 * LOCKS: none 2894 * 2895 */ 2896 intptr_t 2897 raid_rename_check( 2898 md_rendelta_t *delta, 2899 md_rentxn_t *rtxnp) 2900 { 2901 int err = 0; 2902 int column; 2903 mr_unit_t *un; 2904 2905 ASSERT(delta); 2906 ASSERT(rtxnp); 2907 ASSERT(delta->unp); 2908 ASSERT(delta->uip); 2909 2910 if (!delta || !rtxnp || !delta->unp || !delta->uip) { 2911 (void) mdsyserror(&rtxnp->mde, EINVAL); 2912 return (EINVAL); 2913 } 2914 2915 un = (mr_unit_t *)delta->unp; 2916 2917 for (column = 0; column < un->un_totalcolumncnt; column++) { 2918 rcs_state_t colstate; 2919 2920 colstate = un->un_column[column].un_devstate; 2921 2922 if (colstate & RCS_LAST_ERRED) { 2923 (void) mdmderror(&rtxnp->mde, MDE_RAID_LAST_ERRED, 2924 md_getminor(delta->dev)); 2925 return (EINVAL); 2926 } 2927 2928 if (colstate & RCS_INIT_ERRED) { 2929 (void) mdmderror(&rtxnp->mde, MDE_RAID_DOI, 2930 md_getminor(delta->dev)); 2931 return (EINVAL); 2932 } 2933 2934 /* How did we get this far before detecting this? */ 2935 if (colstate & RCS_RESYNC) { 2936 (void) mdmderror(&rtxnp->mde, MDE_RENAME_BUSY, 2937 md_getminor(delta->dev)); 2938 return (EBUSY); 2939 } 2940 2941 if (colstate & RCS_ERRED) { 2942 (void) mdmderror(&rtxnp->mde, MDE_RAID_NOT_OKAY, 2943 md_getminor(delta->dev)); 2944 return (EINVAL); 2945 } 2946 2947 if (!(colstate & RCS_OKAY)) { 2948 (void) mdmderror(&rtxnp->mde, MDE_RAID_NOT_OKAY, 2949 md_getminor(delta->dev)); 2950 return (EINVAL); 2951 } 2952 2953 if (HOTSPARED(un, column)) { 2954 (void) mdmderror(&rtxnp->mde, MDE_RAID_NOT_OKAY, 2955 md_getminor(delta->dev)); 2956 return (EINVAL); 2957 } 2958 } 2959 2960 /* self does additional checks */ 2961 if (delta->old_role == MDRR_SELF) { 2962 err = raid_may_renexch_self((mr_unit_t *)delta->unp, 2963 delta->uip, rtxnp); 2964 } 2965 return (err); 2966 } 2967 2968 /* 2969 * NAME: raid_rename_lock 2970 * DESCRIPTION: ("MDRNM_LOCK") rename/exchange named service entry point 2971 * PARAMETERS: md_rendelta_t *delta - describes changes to be made to this 2972 * raid device for rename transaction 2973 * md_rentxn_t *rtxnp - rename transaction state 2974 * 2975 * LOCKS: io and unit locks (taken explicitly *not* via ioctl wrappers) 2976 * 2977 */ 2978 intptr_t 2979 raid_rename_lock( 2980 md_rendelta_t *delta, 2981 md_rentxn_t *rtxnp) 2982 { 2983 minor_t mnum; 2984 2985 ASSERT(delta); 2986 ASSERT(rtxnp); 2987 2988 mnum = md_getminor(delta->dev); 2989 if (mnum == rtxnp->to.mnum && rtxnp->op == MDRNOP_RENAME) { 2990 return (0); 2991 } 2992 2993 ASSERT(delta->uip); 2994 if (!delta->uip) { 2995 (void) mdmderror(&rtxnp->mde, MDE_UNIT_NOT_SETUP, mnum); 2996 return (ENODEV); 2997 } 2998 2999 ASSERT(delta->unp); 3000 if (!delta->unp) { 3001 3002 return (ENODEV); 3003 } 3004 3005 ASSERT(!IO_WRITER_HELD(delta->unp)); 3006 (void) md_io_writerlock(delta->uip); 3007 ASSERT(IO_WRITER_HELD(delta->unp)); 3008 3009 3010 ASSERT(!UNIT_WRITER_HELD(delta->unp)); 3011 (void) md_unit_writerlock(delta->uip); 3012 ASSERT(UNIT_WRITER_HELD(delta->unp)); 3013 3014 return (0); 3015 } 3016 3017 /* 3018 * NAME: raid_rename_unlock 3019 * DESCRIPTION: ("MDRNM_UNLOCK") rename/exchange named service entry point 3020 * PARAMETERS: md_rendelta_t *delta - describes changes to be made to this 3021 * raid device for rename transaction 3022 * md_rentxn_t *rtxnp - rename transaction state 3023 * 3024 * LOCKS: drops io and unit locks 3025 * 3026 */ 3027 /* ARGSUSED */ 3028 void 3029 raid_rename_unlock( 3030 md_rendelta_t *delta, 3031 md_rentxn_t *rtxnp) 3032 { 3033 mr_unit_t *un = (mr_unit_t *)delta->unp; 3034 minor_t mnum = MD_SID(un); 3035 int col; 3036 3037 ASSERT(delta); 3038 ASSERT(delta->unp); 3039 ASSERT(delta->uip); 3040 3041 ASSERT(UNIT_WRITER_HELD(delta->unp)); 3042 md_unit_writerexit(delta->uip); 3043 ASSERT(!UNIT_WRITER_HELD(delta->unp)); 3044 3045 if (! (delta->txn_stat.role_swapped) || ! (delta->txn_stat.is_open)) { 3046 goto out; 3047 } 3048 if (raid_internal_open(mnum, (FREAD | FWRITE), 3049 OTYP_LYR, MD_OFLG_ISINIT) == 0) { 3050 for (col = 0; col < un->un_totalcolumncnt; col++) { 3051 if (un->un_column[col].un_devstate & RCS_OKAY) 3052 (void) init_pw_area(un, 3053 un->un_column[col].un_dev, 3054 un->un_column[col].un_pwstart, 3055 col); 3056 } 3057 (void) raid_internal_close(mnum, OTYP_LYR, 0, 0); 3058 } 3059 3060 out: 3061 ASSERT(IO_WRITER_HELD(delta->unp)); 3062 md_io_writerexit(delta->uip); 3063 ASSERT(!IO_WRITER_HELD(delta->unp)); 3064 } 3065 /* end of rename/exchange named service and support functions */ 3066