/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Storage Volume Character and Block Driver (SV)
 *
 * This driver implements a simplistic /dev/{r}dsk/ interface to a
 * specified disk volume that is otherwise managed by the Prism
 * software.  The SV driver layers itself onto the underlying disk
 * device driver by changing function pointers in the cb_ops
 * structure.
 *
 * CONFIGURATION:
 *
 * 1. Configure the driver using the svadm utility.
 * 2. Access the device as before through /dev/rdsk/c?t?d?s?
 *
 * LIMITATIONS:
 *
 * This driver should NOT be used to share a device between a
 * DataServices user interface module (e.g., STE) and a user accessing
 * the device through the block device in O_WRITE mode.  This is
 * because writes through the block device are asynchronous (due to
 * the page cache) and so consistency between the block device user
 * and the STE user cannot be guaranteed.
 *
 * Data is copied between system struct buf(9S) and nsc_vec_t.  This
 * is wasteful and slow.
 */
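
/*
 * For illustration only (not part of the driver): once a volume has
 * been enabled, applications access it through the usual raw device
 * node.  The device path below is a hypothetical example.
 *
 *	int fd = open("/dev/rdsk/c0t0d0s0", O_RDONLY);
 *	char buf[512];
 *	ssize_t n = pread(fd, buf, sizeof (buf), 0);
 *	(void) close(fd);
 */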

#include <sys/debug.h>
#include <sys/types.h>

#include <sys/ksynch.h>
#include <sys/kmem.h>
#include <sys/errno.h>
#include <sys/varargs.h>
#include <sys/file.h>
#include <sys/open.h>
#include <sys/conf.h>
#include <sys/cred.h>
#include <sys/buf.h>
#include <sys/uio.h>
#ifndef DS_DDICT
#include <sys/pathname.h>
#endif
#include <sys/aio_req.h>
#include <sys/dkio.h>
#include <sys/vtoc.h>
#include <sys/cmn_err.h>
#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/sunldi.h>
#include <sys/nsctl/nsvers.h>

#include <sys/nsc_thread.h>
#include <sys/unistat/spcs_s.h>
#include <sys/unistat/spcs_s_k.h>
#include <sys/unistat/spcs_errors.h>

#ifdef DS_DDICT
#include "../contract.h"
#endif

#include "../nsctl.h"


#include <sys/sdt.h>		/* dtrace is S10 or later */

#include "sv.h"
#include "sv_impl.h"
#include "sv_efi.h"

#define	MAX_EINTR_COUNT	1000

/*
 * sv_mod_status
 */
#define	SV_PREVENT_UNLOAD	1
#define	SV_ALLOW_UNLOAD		2

static const int sv_major_rev = ISS_VERSION_MAJ;	/* Major number */
static const int sv_minor_rev = ISS_VERSION_MIN;	/* Minor number */
static const int sv_micro_rev = ISS_VERSION_MIC;	/* Micro number */
static const int sv_baseline_rev = ISS_VERSION_NUM;	/* Baseline number */

#ifdef DKIOCPARTITION
/*
 * CRC32 polynomial table needed for computing the checksums
 * in an EFI vtoc.
 */
static const uint32_t sv_crc32_table[256] = { CRC32_TABLE };
#endif

static clock_t sv_config_time;		/* Time of successful {en,dis}able */
static int sv_debug;			/* Set non-zero for debug to syslog */
static int sv_mod_status;		/* Set to prevent modunload */

static dev_info_t *sv_dip;		/* Single DIP for driver */
static kmutex_t sv_mutex;		/* Protect global lists, etc. */

static nsc_mem_t *sv_mem;		/* nsctl memory allocator token */


/*
 * Per device and per major state.
 */

#ifndef _SunOS_5_6
#define	UNSAFE_ENTER()
#define	UNSAFE_EXIT()
#else
#define	UNSAFE_ENTER()	mutex_enter(&unsafe_driver)
#define	UNSAFE_EXIT()	mutex_exit(&unsafe_driver)
#endif

/* hash table of major dev structures */
static sv_maj_t *sv_majors[SV_MAJOR_HASH_CNT] = {0};
static sv_dev_t *sv_devs;		/* array of per device structures */
static int sv_max_devices;		/* SV version of nsc_max_devices() */
static int sv_ndevices;			/* number of SV enabled devices */

/*
 * Threading.
 */

int sv_threads_max = 1024;		/* maximum # to dynamically alloc */
int sv_threads = 32;			/* # to pre-allocate (see sv.conf) */
int sv_threads_extra = 0;		/* addl # we would have alloc'ed */

static nstset_t *sv_tset;		/* the threadset pointer */

static int sv_threads_hysteresis = 4;	/* hysteresis for threadset resizing */
static int sv_threads_dev = 2;		/* # of threads to alloc per device */
static int sv_threads_inc = 8;		/* increment for changing the set */
static int sv_threads_needed;		/* number of threads needed */
static int sv_no_threads;		/* number of nsc_create errors */
static int sv_max_nlive;		/* max number of threads running */



/*
 * nsctl fd callbacks.
 */

static int svattach_fd(blind_t);
static int svdetach_fd(blind_t);

static nsc_def_t sv_fd_def[] = {
	{ "Attach",	(uintptr_t)svattach_fd, },
	{ "Detach",	(uintptr_t)svdetach_fd, },
	{ 0, 0, }
};

/*
 * cb_ops functions.
 */

static int svopen(dev_t *, int, int, cred_t *);
static int svclose(dev_t, int, int, cred_t *);
static int svioctl(dev_t, int, intptr_t, int, cred_t *, int *);
static int svprint(dev_t, char *);

/*
 * These next functions are layered into the underlying driver's devops.
 */

static int sv_lyr_open(dev_t *, int, int, cred_t *);
static int sv_lyr_close(dev_t, int, int, cred_t *);
static int sv_lyr_strategy(struct buf *);
static int sv_lyr_read(dev_t, struct uio *, cred_t *);
static int sv_lyr_write(dev_t, struct uio *, cred_t *);
static int sv_lyr_aread(dev_t, struct aio_req *, cred_t *);
static int sv_lyr_awrite(dev_t, struct aio_req *, cred_t *);
static int sv_lyr_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);

static struct cb_ops sv_cb_ops = {
	svopen,		/* open */
	svclose,	/* close */
	nulldev,	/* strategy */
	svprint,
	nodev,		/* dump */
	nodev,		/* read */
	nodev,		/* write */
	svioctl,
	nodev,		/* devmap */
	nodev,		/* mmap */
	nodev,		/* segmap */
	nochpoll,	/* poll */
	ddi_prop_op,
	NULL,		/* NOT a stream */
	D_NEW | D_MP | D_64BIT,
	CB_REV,
	nodev,		/* aread */
	nodev,		/* awrite */
};


/*
 * dev_ops functions.
 */

static int sv_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int sv_attach(dev_info_t *, ddi_attach_cmd_t);
static int sv_detach(dev_info_t *, ddi_detach_cmd_t);

static struct dev_ops sv_ops = {
	DEVO_REV,
	0,
	sv_getinfo,
	nulldev,	/* identify */
	nulldev,	/* probe */
	sv_attach,
	sv_detach,
	nodev,		/* reset */
	&sv_cb_ops,
	(struct bus_ops *)0
};

/*
 * Module linkage.
 */

extern struct mod_ops mod_driverops;

static struct modldrv modldrv = {
	&mod_driverops,
	"nws:Storage Volume:" ISS_VERSION_STR,
	&sv_ops
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modldrv,
	0
};


int
_init(void)
{
	int error;

	mutex_init(&sv_mutex, NULL, MUTEX_DRIVER, NULL);

	if ((error = mod_install(&modlinkage)) != 0) {
		mutex_destroy(&sv_mutex);
		return (error);
	}

#ifdef DEBUG
	cmn_err(CE_CONT, "!sv (revision %d.%d.%d.%d, %s, %s)\n",
	    sv_major_rev, sv_minor_rev, sv_micro_rev, sv_baseline_rev,
	    ISS_VERSION_STR, BUILD_DATE_STR);
#else
	if (sv_micro_rev) {
		cmn_err(CE_CONT, "!sv (revision %d.%d.%d, %s, %s)\n",
		    sv_major_rev, sv_minor_rev, sv_micro_rev,
		    ISS_VERSION_STR, BUILD_DATE_STR);
	} else {
		cmn_err(CE_CONT, "!sv (revision %d.%d, %s, %s)\n",
		    sv_major_rev, sv_minor_rev,
		    ISS_VERSION_STR, BUILD_DATE_STR);
	}
#endif

	return (error);
}


int
_fini(void)
{
	int error;

	if ((error = mod_remove(&modlinkage)) != 0)
		return (error);

	mutex_destroy(&sv_mutex);

	return (error);
}


int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}


/*
 * Locking & State.
 *
 * sv_mutex protects config information - sv_maj_t and sv_dev_t lists;
 * threadset creation and sizing; sv_ndevices.
 *
 * If we need to hold both sv_mutex and sv_lock, then the sv_mutex
 * must be acquired first.
 *
 * sv_lock protects the sv_dev_t structure for an individual device.
 *
 * sv_olock protects the otyp/open members of the sv_dev_t.  If we need
 * to hold both sv_lock and sv_olock, then the sv_lock must be acquired
 * first.
 *
 * nsc_reserve/nsc_release are used in NSC_MULTI mode to allow multiple
 * I/O operations to a device simultaneously, subject to the locking
 * described above.
 *
 * All nsc_open/nsc_close/nsc_reserve/nsc_release operations that occur
 * with sv_lock write-locked must be done with (sv_state == SV_PENDING)
 * and (sv_pending == curthread) so that any recursion through
 * sv_lyr_open/sv_lyr_close can be detected.
 */
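
/*
 * Illustrative sketch of the lock ordering described above (not part
 * of the driver).  A hypothetical code path that needs both locks
 * must take sv_mutex before sv_lock, and release in the reverse
 * order:
 *
 *	mutex_enter(&sv_mutex);
 *	rw_enter(&svp->sv_lock, RW_WRITER);
 *	... update config and per-device state ...
 *	rw_exit(&svp->sv_lock);
 *	mutex_exit(&sv_mutex);
 */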


static int
sv_init_devs(void)
{
	int i;

	ASSERT(MUTEX_HELD(&sv_mutex));

	if (sv_max_devices > 0)
		return (0);

	sv_max_devices = nsc_max_devices();

	if (sv_max_devices <= 0) {
		/* nsctl is not attached (nskernd not running) */
		if (sv_debug > 0)
			cmn_err(CE_CONT, "!sv: nsc_max_devices = 0\n");
		return (EAGAIN);
	}

	sv_devs = nsc_kmem_zalloc((sv_max_devices * sizeof (*sv_devs)),
	    KM_NOSLEEP, sv_mem);

	if (sv_devs == NULL) {
		cmn_err(CE_WARN, "!sv: could not allocate sv_devs array");
		return (ENOMEM);
	}

	for (i = 0; i < sv_max_devices; i++) {
		mutex_init(&sv_devs[i].sv_olock, NULL, MUTEX_DRIVER, NULL);
		rw_init(&sv_devs[i].sv_lock, NULL, RW_DRIVER, NULL);
	}

	if (sv_debug > 0)
		cmn_err(CE_CONT, "!sv: sv_init_devs successful\n");

	return (0);
}


static int
sv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int rc;

	switch (cmd) {

	case DDI_ATTACH:
		sv_dip = dip;

		if (ddi_create_minor_node(dip, "sv", S_IFCHR,
		    0, DDI_PSEUDO, 0) != DDI_SUCCESS)
			goto failed;

		mutex_enter(&sv_mutex);

		sv_mem = nsc_register_mem("SV", NSC_MEM_LOCAL, 0);
		if (sv_mem == NULL) {
			mutex_exit(&sv_mutex);
			goto failed;
		}

		rc = sv_init_devs();
		if (rc != 0 && rc != EAGAIN) {
			mutex_exit(&sv_mutex);
			goto failed;
		}

		mutex_exit(&sv_mutex);


		ddi_report_dev(dip);

		sv_threads = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
		    DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
		    "sv_threads", sv_threads);

		if (sv_debug > 0)
			cmn_err(CE_CONT, "!sv: sv_threads=%d\n", sv_threads);

		if (sv_threads > sv_threads_max)
			sv_threads_max = sv_threads;

		return (DDI_SUCCESS);

	default:
		return (DDI_FAILURE);
	}

failed:
	DTRACE_PROBE(sv_attach_failed);
	(void) sv_detach(dip, DDI_DETACH);
	return (DDI_FAILURE);
}


static int
sv_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	sv_dev_t *svp;
	int i;

	switch (cmd) {

	case DDI_DETACH:

		/*
		 * Check that everything is disabled.
		 */

		mutex_enter(&sv_mutex);

		if (sv_mod_status == SV_PREVENT_UNLOAD) {
			mutex_exit(&sv_mutex);
			DTRACE_PROBE(sv_detach_err_prevent);
			return (DDI_FAILURE);
		}

		for (i = 0; sv_devs && i < sv_max_devices; i++) {
			svp = &sv_devs[i];

			if (svp->sv_state != SV_DISABLE) {
				mutex_exit(&sv_mutex);
				DTRACE_PROBE(sv_detach_err_busy);
				return (DDI_FAILURE);
			}
		}


		for (i = 0; sv_devs && i < sv_max_devices; i++) {
			mutex_destroy(&sv_devs[i].sv_olock);
			rw_destroy(&sv_devs[i].sv_lock);
		}

		if (sv_devs) {
			nsc_kmem_free(sv_devs,
			    (sv_max_devices * sizeof (*sv_devs)));
			sv_devs = NULL;
		}
		sv_max_devices = 0;

		if (sv_mem) {
			nsc_unregister_mem(sv_mem);
			sv_mem = NULL;
		}

		mutex_exit(&sv_mutex);

		/*
		 * Remove all minor nodes.
		 */

		ddi_remove_minor_node(dip, NULL);
		sv_dip = NULL;

		return (DDI_SUCCESS);

	default:
		return (DDI_FAILURE);
	}
}

static sv_maj_t *
sv_getmajor(const dev_t dev)
{
	sv_maj_t **insert, *maj;
	major_t umaj = getmajor(dev);

	/*
	 * See if the hash table entry, or one of the hash chains,
	 * is already allocated for this major number.
	 */
	if ((maj = sv_majors[SV_MAJOR_HASH(umaj)]) != 0) {
		do {
			if (maj->sm_major == umaj)
				return (maj);
		} while ((maj = maj->sm_next) != 0);
	}

	/*
	 * If sv_mutex is held here, there is a design flaw, as the only
	 * callers that can reach this allocation path are sv_enable()
	 * and sv_dev_to_sv(), neither of which holds the mutex.
	 * Return an error instead of panicking the system.
	 */
	if (MUTEX_HELD(&sv_mutex)) {
		cmn_err(CE_WARN, "!sv: could not allocate sv_maj_t");
		return (NULL);
	}

	/*
	 * Determine where to allocate a new element in the hash table.
	 */
	mutex_enter(&sv_mutex);
	insert = &(sv_majors[SV_MAJOR_HASH(umaj)]);
	for (maj = *insert; maj; maj = maj->sm_next) {

		/* Did another thread beat us to it? */
		if (maj->sm_major == umaj) {
			mutex_exit(&sv_mutex);
			return (maj);
		}

		/* Remember the tail of the chain as the insert point. */
		if (maj->sm_next == NULL)
			insert = &maj->sm_next;
	}

	/*
	 * Located the new insert point.
	 */
	*insert = nsc_kmem_zalloc(sizeof (*maj), KM_NOSLEEP, sv_mem);
	if ((maj = *insert) != 0)
		maj->sm_major = umaj;
	else
		cmn_err(CE_WARN, "!sv: could not allocate sv_maj_t");

	mutex_exit(&sv_mutex);

	return (maj);
}

/* ARGSUSED */

static int
sv_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	int rc = DDI_FAILURE;

	switch (infocmd) {

	case DDI_INFO_DEVT2DEVINFO:
		*result = sv_dip;
		rc = DDI_SUCCESS;
		break;

	case DDI_INFO_DEVT2INSTANCE:
		/*
		 * We only have a single instance.
		 */
		*result = 0;
		rc = DDI_SUCCESS;
		break;

	default:
		break;
	}

	return (rc);
}


/*
 * Hashing of devices onto major device structures.
 *
 * Individual device structures are hashed onto one of the sm_hash[]
 * buckets in the relevant major device structure.
 *
 * Hash insertion and deletion -must- be done with sv_mutex held.  Hash
 * searching does not require the mutex because of the sm_seq member.
 * sm_seq is incremented on each insertion (-after- hash chain pointer
 * manipulation) and each deletion (-before- hash chain pointer
 * manipulation).  When searching the hash chain, the seq number is
 * checked before accessing each device structure; if the seq number
 * has changed, we restart the search from the top of the hash chain.
 * If we restart more than SV_HASH_RETRY times, we take sv_mutex and
 * search the hash chain (we are guaranteed that this search cannot be
 * interrupted).
 */

#define	SV_HASH_RETRY	16
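
/*
 * A minimal sketch of the lock-free lookup protocol described above
 * (illustration only; the real implementation is sv_dev_to_sv()
 * below).  Readers snapshot sm_seq, walk the chain, and restart if a
 * concurrent insert or delete has bumped the sequence number:
 *
 *	retry:
 *		seq = maj->sm_seq;
 *		for (svp = *hb; svp; svp = svp->sv_hash) {
 *			if (maj->sm_seq != seq)
 *				goto retry;	(or fall back to sv_mutex)
 *			if (svp->sv_dev == dev)
 *				return (svp);
 *		}
 */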

static sv_dev_t *
sv_dev_to_sv(const dev_t dev, sv_maj_t **majpp)
{
	minor_t umin = getminor(dev);
	sv_dev_t **hb, *next, *svp;
	sv_maj_t *maj;
	int seq;
	int try;

	/* Get major hash table */
	maj = sv_getmajor(dev);
	if (majpp)
		*majpp = maj;
	if (maj == NULL)
		return (NULL);

	if (maj->sm_inuse == 0) {
		DTRACE_PROBE1(
		    sv_dev_to_sv_end,
		    dev_t, dev);
		return (NULL);
	}

	hb = &(maj->sm_hash[SV_MINOR_HASH(umin)]);
	try = 0;

retry:
	if (try > SV_HASH_RETRY)
		mutex_enter(&sv_mutex);

	seq = maj->sm_seq;
	for (svp = *hb; svp; svp = next) {
		next = svp->sv_hash;

		nsc_membar_stld();	/* preserve register load order */

		if (maj->sm_seq != seq) {
			DTRACE_PROBE1(sv_dev_to_sv_retry, dev_t, dev);
			try++;
			goto retry;
		}

		if (svp->sv_dev == dev)
			break;
	}

	if (try > SV_HASH_RETRY)
		mutex_exit(&sv_mutex);

	return (svp);
}


/*
 * Must be called with sv_mutex held.
 */

static int
sv_get_state(const dev_t udev, sv_dev_t **svpp)
{
	sv_dev_t **hb, **insert, *svp;
	sv_maj_t *maj;
	minor_t umin;
	int i;

	/* Get major hash table */
	if ((maj = sv_getmajor(udev)) == NULL)
		return (SV_EBADDEV);

	/* Determine which minor hash table */
	umin = getminor(udev);
	hb = &(maj->sm_hash[SV_MINOR_HASH(umin)]);

	/* look for clash */

	insert = hb;

	for (svp = *hb; svp; svp = svp->sv_hash) {
		if (svp->sv_dev == udev)
			break;

		if (svp->sv_hash == NULL)
			insert = &svp->sv_hash;
	}

	if (svp) {
		DTRACE_PROBE1(
		    sv_get_state_enabled,
		    dev_t, udev);
		return (SV_EENABLED);
	}

	/* look for spare sv_devs slot */

	for (i = 0; i < sv_max_devices; i++) {
		svp = &sv_devs[i];

		if (svp->sv_state == SV_DISABLE)
			break;
	}

	if (i >= sv_max_devices) {
		DTRACE_PROBE1(
		    sv_get_state_noslots,
		    dev_t, udev);
		return (SV_ENOSLOTS);
	}

	svp->sv_state = SV_PENDING;
	svp->sv_pending = curthread;

	*insert = svp;
	svp->sv_hash = NULL;
	maj->sm_seq++;		/* must be after the store to the hash chain */

	*svpp = svp;

	/*
	 * We do not know the size of the underlying device at
	 * this stage, so initialise "nblocks" property to
	 * zero, and update it whenever we succeed in
	 * nsc_reserve'ing the underlying nsc_fd_t.
	 */

	svp->sv_nblocks = 0;

	return (0);
}


/*
 * Remove a device structure from its hash chain.
 * Must be called with sv_mutex held.
 */

static void
sv_rm_hash(sv_dev_t *svp)
{
	sv_dev_t **svpp;
	sv_maj_t *maj;

	/* Get major hash table */
	if ((maj = sv_getmajor(svp->sv_dev)) == NULL)
		return;

	/* remove svp from hash chain */

	svpp = &(maj->sm_hash[SV_MINOR_HASH(getminor(svp->sv_dev))]);
	while (*svpp) {
		if (*svpp == svp) {
			/*
			 * increment of sm_seq must be before the
			 * removal from the hash chain
			 */
			maj->sm_seq++;
			*svpp = svp->sv_hash;
			break;
		}

		svpp = &(*svpp)->sv_hash;
	}

	svp->sv_hash = NULL;
}

/*
 * Free (disable) a device structure.
 * Must be called with sv_lock(RW_WRITER) and sv_mutex held; both are
 * released before this function returns.
 */

static int
sv_free(sv_dev_t *svp, const int error)
{
	struct cb_ops *cb_ops;
	sv_maj_t *maj;

	/* Get major hash table */
	if ((maj = sv_getmajor(svp->sv_dev)) == NULL) {
		/* should not happen - drop our caller's locks and bail */
		rw_exit(&svp->sv_lock);
		mutex_exit(&sv_mutex);
		return (error);
	}

	svp->sv_state = SV_PENDING;
	svp->sv_pending = curthread;

	/*
	 * Close the fd's before removing from the hash or swapping
	 * back the cb_ops pointers so that the cache flushes before new
	 * io can come in.
	 */

	if (svp->sv_fd) {
		(void) nsc_close(svp->sv_fd);
		svp->sv_fd = 0;
	}

	sv_rm_hash(svp);

	if (error != SV_ESDOPEN &&
	    error != SV_ELYROPEN && --maj->sm_inuse == 0) {

		if (maj->sm_dev_ops)
			cb_ops = maj->sm_dev_ops->devo_cb_ops;
		else
			cb_ops = NULL;

		if (cb_ops && maj->sm_strategy != NULL) {
			cb_ops->cb_strategy = maj->sm_strategy;
			cb_ops->cb_close = maj->sm_close;
			cb_ops->cb_ioctl = maj->sm_ioctl;
			cb_ops->cb_write = maj->sm_write;
			cb_ops->cb_open = maj->sm_open;
			cb_ops->cb_read = maj->sm_read;
			cb_ops->cb_flag = maj->sm_flag;

			if (maj->sm_awrite)
				cb_ops->cb_awrite = maj->sm_awrite;

			if (maj->sm_aread)
				cb_ops->cb_aread = maj->sm_aread;

			/*
			 * corbin XXX
			 * Leave backing device ops in maj->sm_*
			 * to handle any requests that might come
			 * in during the disable.  This could be
			 * a problem however if the backing device
			 * driver is changed while we process these
			 * requests.
			 *
			 * maj->sm_strategy = 0;
			 * maj->sm_awrite = 0;
			 * maj->sm_write = 0;
			 * maj->sm_ioctl = 0;
			 * maj->sm_close = 0;
			 * maj->sm_aread = 0;
			 * maj->sm_read = 0;
			 * maj->sm_open = 0;
			 * maj->sm_flag = 0;
			 */
		}

		if (maj->sm_dev_ops) {
			maj->sm_dev_ops = 0;
		}
	}

	if (svp->sv_lh) {
		cred_t *crp = ddi_get_cred();

		/*
		 * Close the protective layered driver open using the
		 * Sun Private layered driver i/f.
		 */

		(void) ldi_close(svp->sv_lh, FREAD|FWRITE, crp);
		svp->sv_lh = NULL;
	}

	svp->sv_timestamp = nsc_lbolt();
	svp->sv_state = SV_DISABLE;
	svp->sv_pending = NULL;
	rw_exit(&svp->sv_lock);
	mutex_exit(&sv_mutex);

	return (error);
}

/*
 * Reserve the device, taking into account the possibility that
 * the reserve might have to be retried.
 */
static int
sv_reserve(nsc_fd_t *fd, int flags)
{
	int eintr_count;
	int rc;

	eintr_count = 0;
	do {
		rc = nsc_reserve(fd, flags);
		if (rc == EINTR) {
			++eintr_count;
			delay(2);
		}
	} while ((rc == EINTR) && (eintr_count < MAX_EINTR_COUNT));

	return (rc);
}

static int
sv_enable(const caddr_t path, const int flag,
    const dev_t udev, spcs_s_info_t kstatus)
{
	struct dev_ops *dev_ops;
	struct cb_ops *cb_ops;
	sv_dev_t *svp;
	sv_maj_t *maj;
	nsc_size_t nblocks;
	int rc;
	cred_t *crp;
	ldi_ident_t li;

	if (udev == (dev_t)-1 || udev == 0) {
		DTRACE_PROBE1(
		    sv_enable_err_baddev,
		    dev_t, udev);
		return (SV_EBADDEV);
	}

	if ((flag & ~(NSC_CACHE|NSC_DEVICE)) != 0) {
		DTRACE_PROBE1(sv_enable_err_amode, dev_t, udev);
		return (SV_EAMODE);
	}

	/* Get major hash table */
	if ((maj = sv_getmajor(udev)) == NULL)
		return (SV_EBADDEV);

	mutex_enter(&sv_mutex);

	rc = sv_get_state(udev, &svp);
	if (rc) {
		mutex_exit(&sv_mutex);
		DTRACE_PROBE1(sv_enable_err_state, dev_t, udev);
		return (rc);
	}

	rw_enter(&svp->sv_lock, RW_WRITER);

	/*
	 * Get real fd used for io
	 */

	svp->sv_dev = udev;
	svp->sv_flag = flag;

	/*
	 * OR in NSC_DEVICE to ensure that nskern grabs the real strategy
	 * function pointer before sv swaps them out.
	 */

	svp->sv_fd = nsc_open(path, (svp->sv_flag | NSC_DEVICE),
	    sv_fd_def, (blind_t)udev, &rc);

	if (svp->sv_fd == NULL) {
		if (kstatus)
			spcs_s_add(kstatus, rc);
		DTRACE_PROBE1(sv_enable_err_fd, dev_t, udev);
		return (sv_free(svp, SV_ESDOPEN));
	}

	/*
	 * Perform a layered driver open using the Sun Private layered
	 * driver i/f to ensure that the cb_ops structure for the driver
	 * is not detached out from under us whilst sv is enabled.
	 */

	crp = ddi_get_cred();
	svp->sv_lh = NULL;

	if ((rc = ldi_ident_from_dev(svp->sv_dev, &li)) == 0) {
		rc = ldi_open_by_dev(&svp->sv_dev,
		    OTYP_BLK, FREAD|FWRITE, crp, &svp->sv_lh, li);
	}

	if (rc != 0) {
		if (kstatus)
			spcs_s_add(kstatus, rc);
		DTRACE_PROBE1(sv_enable_err_lyr_open, dev_t, udev);
		return (sv_free(svp, SV_ELYROPEN));
	}

	/*
	 * Do layering if required - must happen after nsc_open().
	 */

	if (maj->sm_inuse++ == 0) {
		maj->sm_dev_ops = nsc_get_devops(getmajor(udev));

		if (maj->sm_dev_ops == NULL ||
		    maj->sm_dev_ops->devo_cb_ops == NULL) {
			DTRACE_PROBE1(sv_enable_err_load, dev_t, udev);
			return (sv_free(svp, SV_ELOAD));
		}

		dev_ops = maj->sm_dev_ops;
		cb_ops = dev_ops->devo_cb_ops;

		if (cb_ops->cb_strategy == NULL ||
		    cb_ops->cb_strategy == nodev ||
		    cb_ops->cb_strategy == nulldev) {
			DTRACE_PROBE1(sv_enable_err_nostrategy, dev_t, udev);
			return (sv_free(svp, SV_ELOAD));
		}

		if (cb_ops->cb_strategy == sv_lyr_strategy) {
			DTRACE_PROBE1(sv_enable_err_svstrategy, dev_t, udev);
			return (sv_free(svp, SV_ESTRATEGY));
		}

		maj->sm_strategy = cb_ops->cb_strategy;
		maj->sm_close = cb_ops->cb_close;
		maj->sm_ioctl = cb_ops->cb_ioctl;
		maj->sm_write = cb_ops->cb_write;
		maj->sm_open = cb_ops->cb_open;
		maj->sm_read = cb_ops->cb_read;
		maj->sm_flag = cb_ops->cb_flag;

		cb_ops->cb_flag = cb_ops->cb_flag | D_MP;
		cb_ops->cb_strategy = sv_lyr_strategy;
		cb_ops->cb_close = sv_lyr_close;
		cb_ops->cb_ioctl = sv_lyr_ioctl;
		cb_ops->cb_write = sv_lyr_write;
		cb_ops->cb_open = sv_lyr_open;
		cb_ops->cb_read = sv_lyr_read;

		/*
		 * Check that the driver has async I/O entry points
		 * before changing them.
		 */

		if (dev_ops->devo_rev < 3 || cb_ops->cb_rev < 1) {
			maj->sm_awrite = 0;
			maj->sm_aread = 0;
		} else {
			maj->sm_awrite = cb_ops->cb_awrite;
			maj->sm_aread = cb_ops->cb_aread;

			cb_ops->cb_awrite = sv_lyr_awrite;
			cb_ops->cb_aread = sv_lyr_aread;
		}

		/*
		 * Bug 4645743
		 *
		 * Prevent sv from ever unloading after it has interposed
		 * on a major device because there is a race between
		 * sv removing its layered entry points from the target
		 * dev_ops, a client coming in and accessing the driver,
		 * and the kernel modunloading the sv text.
		 *
		 * To allow unload, do svboot -u, which only happens in
		 * pkgrm time.
		 */
		ASSERT(MUTEX_HELD(&sv_mutex));
		sv_mod_status = SV_PREVENT_UNLOAD;
	}


	svp->sv_timestamp = nsc_lbolt();
	svp->sv_state = SV_ENABLE;
	svp->sv_pending = NULL;
	rw_exit(&svp->sv_lock);

	sv_ndevices++;
	mutex_exit(&sv_mutex);

	nblocks = 0;
	if (sv_reserve(svp->sv_fd, NSC_READ|NSC_MULTI|NSC_PCATCH) == 0) {
		nblocks = svp->sv_nblocks;
		nsc_release(svp->sv_fd);
	}

	cmn_err(CE_CONT, "!sv: rdev 0x%lx, nblocks %" NSC_SZFMT "\n",
	    svp->sv_dev, nblocks);

	return (0);
}


static int
sv_prepare_unload(void)
{
	int rc = 0;

	mutex_enter(&sv_mutex);

	if (sv_mod_status == SV_PREVENT_UNLOAD) {
		if ((sv_ndevices != 0) || (sv_tset != NULL)) {
			rc = EBUSY;
		} else {
			sv_mod_status = SV_ALLOW_UNLOAD;
			delay(SV_WAIT_UNLOAD * drv_usectohz(1000000));
		}
	}

	mutex_exit(&sv_mutex);
	return (rc);
}

static int
svattach_fd(blind_t arg)
{
	dev_t dev = (dev_t)arg;
	sv_dev_t *svp = sv_dev_to_sv(dev, NULL);
	int rc;

	if (sv_debug > 0)
		cmn_err(CE_CONT, "!svattach_fd(%p, %p)\n", arg, (void *)svp);

	if (svp == NULL) {
		cmn_err(CE_WARN, "!svattach_fd: no state (arg %p)", arg);
		return (0);
	}

	if ((rc = nsc_partsize(svp->sv_fd, &svp->sv_nblocks)) != 0) {
		cmn_err(CE_WARN,
		    "!svattach_fd: nsc_partsize() failed, rc %d", rc);
		svp->sv_nblocks = 0;
	}

	if ((rc = nsc_maxfbas(svp->sv_fd, 0, &svp->sv_maxfbas)) != 0) {
		cmn_err(CE_WARN,
		    "!svattach_fd: nsc_maxfbas() failed, rc %d", rc);
		svp->sv_maxfbas = 0;
	}

	if (sv_debug > 0) {
		cmn_err(CE_CONT,
		    "!svattach_fd(%p): size %" NSC_SZFMT ", "
		    "maxfbas %" NSC_SZFMT "\n",
		    arg, svp->sv_nblocks, svp->sv_maxfbas);
	}

	return (0);
}


static int
svdetach_fd(blind_t arg)
{
	dev_t dev = (dev_t)arg;
	sv_dev_t *svp = sv_dev_to_sv(dev, NULL);

	if (sv_debug > 0)
		cmn_err(CE_CONT, "!svdetach_fd(%p, %p)\n", arg, (void *)svp);

	/* svp can be NULL during disable of an sv */
	if (svp == NULL)
		return (0);

	svp->sv_maxfbas = 0;
	svp->sv_nblocks = 0;
	return (0);
}


/*
 * Disable an sv device.  Acquires both sv_mutex and the device's
 * sv_lock (RW_WRITER); both are released by sv_free() before this
 * function returns.
 */

/* ARGSUSED */
static int
sv_disable(dev_t dev, spcs_s_info_t kstatus)
{
	sv_dev_t *svp = sv_dev_to_sv(dev, NULL);

	if (svp == NULL) {
		DTRACE_PROBE1(sv_disable_err_nodev, sv_dev_t *, svp);
		return (SV_ENODEV);
	}

	mutex_enter(&sv_mutex);
	rw_enter(&svp->sv_lock, RW_WRITER);

	if (svp->sv_fd == NULL || svp->sv_state != SV_ENABLE) {
		rw_exit(&svp->sv_lock);
		mutex_exit(&sv_mutex);

		DTRACE_PROBE1(sv_disable_err_disabled, sv_dev_t *, svp);
		return (SV_EDISABLED);
	}


	sv_ndevices--;
	return (sv_free(svp, 0));
}


static int
sv_lyr_open(dev_t *devp, int flag, int otyp, cred_t *crp)
{
	nsc_buf_t *tmph;
	sv_dev_t *svp;
	sv_maj_t *maj;
	int (*fn)();
	dev_t odev;
	int ret;
	int rc;

	svp = sv_dev_to_sv(*devp, &maj);

	if (svp) {
		if (svp->sv_state == SV_PENDING &&
		    svp->sv_pending == curthread) {
			/*
			 * This is a recursive open from a call to
			 * ddi_lyr_open_by_devt and so we just want
			 * to pass it straight through to the
			 * underlying driver.
			 */
			DTRACE_PROBE2(sv_lyr_open_recursive,
			    sv_dev_t *, svp,
			    dev_t, *devp);
			svp = NULL;
		} else
			rw_enter(&svp->sv_lock, RW_READER);
	}

	odev = *devp;

	if (maj && (fn = maj->sm_open) != 0) {
		if (!(maj->sm_flag & D_MP)) {
			UNSAFE_ENTER();
			ret = (*fn)(devp, flag, otyp, crp);
			UNSAFE_EXIT();
		} else {
			ret = (*fn)(devp, flag, otyp, crp);
		}

		if (ret == 0) {
			/*
			 * Re-acquire svp if the driver changed *devp.
			 */

			if (*devp != odev) {
				if (svp != NULL)
					rw_exit(&svp->sv_lock);

				svp = sv_dev_to_sv(*devp, NULL);

				if (svp) {
					rw_enter(&svp->sv_lock, RW_READER);
				}
			}
		}
	} else {
		ret = ENODEV;
	}

	if (svp && ret != 0 && svp->sv_state == SV_ENABLE) {
		/*
		 * Underlying DDI open failed, but we have this
		 * device SV enabled.  If we can read some data
		 * from the device, fake a successful open (this
		 * probably means that this device is RDC'd and we
		 * are getting the data from the secondary node).
		 *
		 * The reserve must be done with NSC_TRY|NSC_NOWAIT to
		 * ensure that it does not deadlock if this open is
		 * coming from nskernd:get_bsize().
		 */
		rc = sv_reserve(svp->sv_fd,
		    NSC_TRY | NSC_NOWAIT | NSC_MULTI | NSC_PCATCH);
		if (rc == 0) {
			tmph = NULL;

			rc = nsc_alloc_buf(svp->sv_fd, 0, 1, NSC_READ, &tmph);
			if (rc <= 0) {
				/* success */
				ret = 0;
			}

			if (tmph) {
				(void) nsc_free_buf(tmph);
				tmph = NULL;
			}

			nsc_release(svp->sv_fd);

			/*
			 * Count the number of layered opens that we
			 * fake since we have to fake a matching number
			 * of closes (OTYP_LYR open/close calls must be
			 * paired).
			 */

			if (ret == 0 && otyp == OTYP_LYR) {
				mutex_enter(&svp->sv_olock);
				svp->sv_openlcnt++;
				mutex_exit(&svp->sv_olock);
			}
		}
	}

	if (svp) {
		rw_exit(&svp->sv_lock);
	}

	return (ret);
}


static int
sv_lyr_close(dev_t dev, int flag, int otyp, cred_t *crp)
{
	sv_dev_t *svp;
	sv_maj_t *maj;
	int (*fn)();
	int ret;

	svp = sv_dev_to_sv(dev, &maj);

	if (svp &&
	    svp->sv_state == SV_PENDING &&
	    svp->sv_pending == curthread) {
		/*
		 * This is a recursive close from a call to
		 * ddi_lyr_close and so we just want
		 * to pass it straight through to the
		 * underlying driver.
		 */
		DTRACE_PROBE2(sv_lyr_close_recursive, sv_dev_t *, svp,
		    dev_t, dev);
		svp = NULL;
	}

	if (svp) {
		rw_enter(&svp->sv_lock, RW_READER);

		if (otyp == OTYP_LYR) {
			mutex_enter(&svp->sv_olock);

			if (svp->sv_openlcnt) {
				/*
				 * Consume sufficient layered closes to
				 * account for the opens that we faked
				 * whilst the device was failed.
				 */
				svp->sv_openlcnt--;
				mutex_exit(&svp->sv_olock);
				rw_exit(&svp->sv_lock);

				DTRACE_PROBE1(sv_lyr_close_end, dev_t, dev);

				return (0);
			}

			mutex_exit(&svp->sv_olock);
		}
	}

	if (maj && (fn = maj->sm_close) != 0) {
		if (!(maj->sm_flag & D_MP)) {
			UNSAFE_ENTER();
			ret = (*fn)(dev, flag, otyp, crp);
			UNSAFE_EXIT();
		} else {
			ret = (*fn)(dev, flag, otyp, crp);
		}
	} else {
		ret = ENODEV;
	}

	if (svp) {
		rw_exit(&svp->sv_lock);
	}

	return (ret);
}


/*
 * Convert the specified dev_t into a locked and enabled sv_dev_t, or
 * return NULL.
 */
static sv_dev_t *
sv_find_enabled(const dev_t dev, sv_maj_t **majpp)
{
	sv_dev_t *svp;

	while ((svp = sv_dev_to_sv(dev, majpp)) != NULL) {
		rw_enter(&svp->sv_lock, RW_READER);

		if (svp->sv_state == SV_ENABLE) {
			/* locked and enabled */
			break;
		}

		/*
		 * State was changed while waiting on the lock.
		 * Wait for a stable state.
		 */
		rw_exit(&svp->sv_lock);

		DTRACE_PROBE1(sv_find_enabled_retry, dev_t, dev);

		delay(2);
	}

	return (svp);
}


static int
sv_lyr_uio(dev_t dev, uio_t *uiop, cred_t *crp, int rw)
{
	sv_dev_t *svp;
	sv_maj_t *maj;
	int (*fn)();
	int rc;

	svp = sv_find_enabled(dev, &maj);
	if (svp == NULL) {
		if (maj) {
			if (rw == NSC_READ)
				fn = maj->sm_read;
			else
				fn = maj->sm_write;

			if (fn != 0) {
				if (!(maj->sm_flag & D_MP)) {
					UNSAFE_ENTER();
					rc = (*fn)(dev, uiop, crp);
					UNSAFE_EXIT();
				} else {
					rc = (*fn)(dev, uiop, crp);
				}
			} else {
				/* no underlying entry point */
				rc = ENODEV;
			}

			return (rc);
		} else {
			return (ENODEV);
		}
	}

	ASSERT(RW_READ_HELD(&svp->sv_lock));

	if (svp->sv_flag == 0) {
		/*
		 * guard access mode
		 * - prevent user level access to the device
		 */
		DTRACE_PROBE1(sv_lyr_uio_err_guard, uio_t *, uiop);
		rc = EPERM;
		goto out;
	}

	if ((rc = sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH)) != 0) {
		DTRACE_PROBE1(sv_lyr_uio_err_rsrv, uio_t *, uiop);
		goto out;
	}

	if (rw == NSC_READ)
		rc = nsc_uread(svp->sv_fd, uiop, crp);
	else
		rc = nsc_uwrite(svp->sv_fd, uiop, crp);

	nsc_release(svp->sv_fd);

out:
	rw_exit(&svp->sv_lock);

	return (rc);
}


static int
sv_lyr_read(dev_t dev, uio_t *uiop, cred_t *crp)
{
	return (sv_lyr_uio(dev, uiop, crp, NSC_READ));
}


static int
sv_lyr_write(dev_t dev, uio_t *uiop, cred_t *crp)
{
	return (sv_lyr_uio(dev, uiop, crp, NSC_WRITE));
}


/* ARGSUSED */

static int
sv_lyr_aread(dev_t dev, struct aio_req *aio, cred_t *crp)
{
	return (aphysio(sv_lyr_strategy,
	    anocancel, dev, B_READ, minphys, aio));
}


/* ARGSUSED */

static int
sv_lyr_awrite(dev_t dev, struct aio_req *aio, cred_t *crp)
{
	return (aphysio(sv_lyr_strategy,
	    anocancel, dev, B_WRITE, minphys, aio));
}


/*
 * Set up an array containing the list of raw path names.
 * The array for the paths is svn and the size of the array is
 * in size.
 *
 * If there are more enabled devices than will fit in the array,
 * the number of extra devices is returned via "extra"; otherwise
 * zero is returned there.
 *
 * Input:
 *	svn   : array for paths
 *	size  : size of the array
 *
 * Output (extra):
 *	zero  : All paths fit in array
 *	>0    : Number of enabled devices that don't fit in the array
 */

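/*
 * Illustrative example of the overflow accounting above (hypothetical
 * numbers): with three devices enabled and a caller-supplied array of
 * size two, sv_list() fills both slots, sets *extra = 1, and, because
 * the array is full, appends no empty-path terminating entry.
 */
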
static int
sv_list(void *ptr, const int size, int *extra, const int ilp32)
{
	sv_name32_t *svn32;
	sv_name_t *svn;
	sv_dev_t *svp;
	int *mode, *nblocks;
	int i, index;
	char *path;

	*extra = 0;
	index = 0;

	if (ilp32)
		svn32 = ptr;
	else
		svn = ptr;

	mutex_enter(&sv_mutex);
	for (i = 0; i < sv_max_devices; i++) {
		svp = &sv_devs[i];

		rw_enter(&svp->sv_lock, RW_READER);

		if (svp->sv_state != SV_ENABLE) {
			rw_exit(&svp->sv_lock);
			continue;
		}

		if ((*extra) != 0 || ptr == NULL) {
			/* Another overflow entry */
			rw_exit(&svp->sv_lock);
			(*extra)++;
			continue;
		}

		if (ilp32) {
			nblocks = &svn32->svn_nblocks;
			mode = &svn32->svn_mode;
			path = svn32->svn_path;

			svn32->svn_timestamp = (uint32_t)svp->sv_timestamp;
			svn32++;
		} else {
			nblocks = &svn->svn_nblocks;
			mode = &svn->svn_mode;
			path = svn->svn_path;

			svn->svn_timestamp = svp->sv_timestamp;
			svn++;
		}

		(void) strcpy(path, nsc_pathname(svp->sv_fd));
		*nblocks = svp->sv_nblocks;
		*mode = svp->sv_flag;

		if (*nblocks == 0) {
			if (sv_debug > 3)
				cmn_err(CE_CONT, "!sv_list: need to reserve\n");

			if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) {
				*nblocks = svp->sv_nblocks;
				nsc_release(svp->sv_fd);
			}
		}

		if (++index >= size) {
			/* Out of space */
			(*extra)++;
		}

		rw_exit(&svp->sv_lock);
	}
	mutex_exit(&sv_mutex);

	if (index < size) {
		/* NULL terminated list */
		if (ilp32)
			svn32->svn_path[0] = '\0';
		else
			svn->svn_path[0] = '\0';
	}

	return (0);
}


static void
sv_thread_tune(int threads)
{
	int incr = (threads > 0) ? 1 : -1;
	int change = 0;
	int nthreads;

	ASSERT(MUTEX_HELD(&sv_mutex));

	if (sv_threads_extra) {
		/* keep track of any additional threads requested */
		if (threads > 0) {
			sv_threads_extra += threads;
			return;
		}
		threads = -threads;
		if (threads >= sv_threads_extra) {
			threads -= sv_threads_extra;
			sv_threads_extra = 0;
			/* fall through to while loop */
		} else {
			sv_threads_extra -= threads;
			return;
		}
	} else if (threads > 0) {
		/*
		 * do not increase the number of threads beyond
		 * sv_threads_max when doing dynamic thread tuning
		 */
		nthreads = nst_nthread(sv_tset);
		if ((nthreads + threads) > sv_threads_max) {
			sv_threads_extra = nthreads + threads - sv_threads_max;
			threads = sv_threads_max - nthreads;
			if (threads <= 0)
				return;
		}
	}

	if (threads < 0)
		threads = -threads;

	while (threads--) {
		nthreads = nst_nthread(sv_tset);
		sv_threads_needed += incr;

		if (sv_threads_needed >= nthreads)
			change += nst_add_thread(sv_tset, sv_threads_inc);
		else if ((sv_threads_needed <
		    (nthreads - (sv_threads_inc + sv_threads_hysteresis))) &&
		    ((nthreads - sv_threads_inc) >= sv_threads))
			change -= nst_del_thread(sv_tset, sv_threads_inc);
	}

#ifdef DEBUG
	if (change) {
		cmn_err(CE_NOTE,
		    "!sv_thread_tune: threads needed %d, nthreads %d, "
		    "nthreads change %d",
		    sv_threads_needed, nst_nthread(sv_tset), change);
	}
#endif
}


/* ARGSUSED */
static int
svopen(dev_t *devp, int flag, int otyp, cred_t *crp)
{
	int rc;

	mutex_enter(&sv_mutex);
	rc = sv_init_devs();
	mutex_exit(&sv_mutex);

	return (rc);
}


/* ARGSUSED */
static int
svclose(dev_t dev, int flag, int otyp, cred_t *crp)
{
	const int secs = HZ * 5;	/* 5 seconds, expressed in ticks */
	const int ticks = HZ / 10;
	int loops = secs / ticks;

	mutex_enter(&sv_mutex);
	while (sv_ndevices <= 0 && sv_tset != NULL && loops > 0) {
		if (nst_nlive(sv_tset) <= 0) {
			nst_destroy(sv_tset);
			sv_tset = NULL;
			break;
		}

		/* threads still active - wait for them to exit */
		mutex_exit(&sv_mutex);
		delay(ticks);
		loops--;
		mutex_enter(&sv_mutex);
	}
	mutex_exit(&sv_mutex);

	if (loops <= 0) {
		cmn_err(CE_WARN,
#ifndef DEBUG
		    /* do not write to console when non-DEBUG */
		    "!"
#endif
		    "sv:svclose: threads still active "
		    "after %d sec - leaking thread set", secs / HZ);
	}

	return (0);
}


static int
svioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *crp, int *rvalp)
{
	char itmp1[12], itmp2[12];	/* temp char array for editing ints */
	spcs_s_info_t kstatus;	/* Kernel version of spcs status */
	spcs_s_info_t ustatus;	/* Address of user version of spcs status */
	sv_list32_t svl32;	/* 32 bit Initial structure for SVIOC_LIST */
	sv_version_t svv;	/* Version structure */
	sv_conf_t svc;		/* User config structure */
	sv_list_t svl;		/* Initial structure for SVIOC_LIST */
	void *usvn;		/* Address of user sv_name_t */
	void *svn = NULL;	/* Array for SVIOC_LIST */
	uint64_t phash;		/* pathname hash */
	int rc = 0;		/* Return code -- errno */
	int size;		/* Number of items in array */
	int bytes;		/* Byte size of array */
	int ilp32;		/* Convert data structures for ilp32 userland */

	*rvalp = 0;

	/*
	 * If sv_mod_status is 0 or SV_PREVENT_UNLOAD, carry on.
	 * Otherwise it was previously SV_PREVENT_UNLOAD and is now
	 * SV_ALLOW_UNLOAD, meaning the driver is expected to unload
	 * shortly, so refuse new work.
	 *
	 * SV_ALLOW_UNLOAD is final state, so no need to grab sv_mutex.
	 */
	if (sv_mod_status == SV_ALLOW_UNLOAD) {
		return (EBUSY);
	}

	if ((cmd != SVIOC_LIST) && ((rc = drv_priv(crp)) != 0))
		return (rc);

	kstatus = spcs_s_kcreate();
	if (!kstatus) {
		DTRACE_PROBE1(sv_ioctl_err_kcreate, dev_t, dev);
		return (ENOMEM);
	}

	ilp32 = (ddi_model_convert_from((mode & FMODELS)) == DDI_MODEL_ILP32);

	switch (cmd) {

	case SVIOC_ENABLE:

		if (ilp32) {
			sv_conf32_t svc32;

			if (ddi_copyin((void *)arg, &svc32,
			    sizeof (svc32), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}

			svc.svc_error = (spcs_s_info_t)svc32.svc_error;
			(void) strcpy(svc.svc_path, svc32.svc_path);
			svc.svc_flag = svc32.svc_flag;
			svc.svc_major = svc32.svc_major;
			svc.svc_minor = svc32.svc_minor;
		} else {
			if (ddi_copyin((void *)arg, &svc,
			    sizeof (svc), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}
		}

		/* force to raw access */
		svc.svc_flag = NSC_DEVICE;

		if (sv_tset == NULL) {
			mutex_enter(&sv_mutex);

			if (sv_tset == NULL) {
				sv_tset = nst_init("sv_thr", sv_threads);
			}

			mutex_exit(&sv_mutex);

			if (sv_tset == NULL) {
				cmn_err(CE_WARN,
				    "!sv: could not allocate %d threads",
				    sv_threads);
			}
		}

		rc = sv_enable(svc.svc_path, svc.svc_flag,
		    makedevice(svc.svc_major, svc.svc_minor), kstatus);

		if (rc == 0) {
			sv_config_time = nsc_lbolt();

			mutex_enter(&sv_mutex);
			sv_thread_tune(sv_threads_dev);
			mutex_exit(&sv_mutex);
		}

		DTRACE_PROBE3(sv_ioctl_end, dev_t, dev, int, *rvalp, int, rc);

		return (spcs_s_ocopyoutf(&kstatus, svc.svc_error, rc));
		/* NOTREACHED */

	case SVIOC_DISABLE:

		if (ilp32) {
			sv_conf32_t svc32;

			if (ddi_copyin((void *)arg, &svc32,
			    sizeof (svc32), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}

			svc.svc_error = (spcs_s_info_t)svc32.svc_error;
			svc.svc_major = svc32.svc_major;
			svc.svc_minor = svc32.svc_minor;
			(void) strcpy(svc.svc_path, svc32.svc_path);
			svc.svc_flag = svc32.svc_flag;
		} else {
			if (ddi_copyin((void *)arg, &svc,
			    sizeof (svc), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}
		}

		if (svc.svc_major == (major_t)-1 &&
		    svc.svc_minor == (minor_t)-1) {
			sv_dev_t *svp;
			int i;

			/*
			 * User level could not find the minor device
			 * node, so do this the slow way by searching
			 * the entire sv config for a matching pathname.
			 */

			phash = nsc_strhash(svc.svc_path);

			mutex_enter(&sv_mutex);

			for (i = 0; i < sv_max_devices; i++) {
				svp = &sv_devs[i];

				if (svp->sv_state == SV_DISABLE ||
				    svp->sv_fd == NULL)
					continue;

				if (nsc_fdpathcmp(svp->sv_fd, phash,
				    svc.svc_path) == 0) {
					svc.svc_major = getmajor(svp->sv_dev);
					svc.svc_minor = getminor(svp->sv_dev);
					break;
				}
			}

			mutex_exit(&sv_mutex);

			if (svc.svc_major == (major_t)-1 &&
			    svc.svc_minor == (minor_t)-1)
				return (spcs_s_ocopyoutf(&kstatus,
				    svc.svc_error, SV_ENODEV));
		}

		rc = sv_disable(makedevice(svc.svc_major, svc.svc_minor),
		    kstatus);

		if (rc == 0) {
			sv_config_time = nsc_lbolt();

			mutex_enter(&sv_mutex);
			sv_thread_tune(-sv_threads_dev);
			mutex_exit(&sv_mutex);
		}

		DTRACE_PROBE3(sv_ioctl_2, dev_t, dev, int, *rvalp, int, rc);

		return (spcs_s_ocopyoutf(&kstatus, svc.svc_error, rc));
		/* NOTREACHED */

	case SVIOC_LIST:

		if (ilp32) {
			if (ddi_copyin((void *)arg, &svl32,
			    sizeof (svl32), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}

			ustatus = (spcs_s_info_t)svl32.svl_error;
			size = svl32.svl_count;
			usvn = (void *)(unsigned long)svl32.svl_names;
		} else {
			if (ddi_copyin((void *)arg, &svl,
			    sizeof (svl), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}

			ustatus = svl.svl_error;
			size = svl.svl_count;
			usvn = svl.svl_names;
		}

		/* Do some boundary checking */
		if ((size < 0) || (size > sv_max_devices)) {
			/* Array size is out of range */
			return (spcs_s_ocopyoutf(&kstatus, ustatus,
			    SV_EARRBOUNDS, "0",
			    spcs_s_inttostring(sv_max_devices, itmp1,
			    sizeof (itmp1), 0),
			    spcs_s_inttostring(size, itmp2,
			    sizeof (itmp2), 0)));
		}

		if (ilp32)
			bytes = size * sizeof (sv_name32_t);
		else
			bytes = size * sizeof (sv_name_t);

		/* Allocate memory for the array of structures */
		if (bytes != 0) {
			svn = kmem_zalloc(bytes, KM_SLEEP);
			if (!svn) {
				return (spcs_s_ocopyoutf(&kstatus,
				    ustatus, ENOMEM));
			}
		}

		rc = sv_list(svn, size, rvalp, ilp32);
		if (rc) {
			if (svn != NULL)
				kmem_free(svn, bytes);
			return (spcs_s_ocopyoutf(&kstatus, ustatus, rc));
		}

		if (ilp32) {
			svl32.svl_timestamp = (uint32_t)sv_config_time;
			svl32.svl_maxdevs = (int32_t)sv_max_devices;

			/* Return the list structure */
			if (ddi_copyout(&svl32, (void *)arg,
			    sizeof (svl32), mode) < 0) {
				spcs_s_kfree(kstatus);
				if (svn != NULL)
					kmem_free(svn, bytes);
				return (EFAULT);
			}
		} else {
			svl.svl_timestamp = sv_config_time;
			svl.svl_maxdevs = sv_max_devices;

			/* Return the list structure */
			if (ddi_copyout(&svl, (void *)arg,
			    sizeof (svl), mode) < 0) {
				spcs_s_kfree(kstatus);
				if (svn != NULL)
					kmem_free(svn, bytes);
				return (EFAULT);
			}
		}

		/* Return the array */
		if (svn != NULL) {
			if (ddi_copyout(svn, usvn, bytes, mode) < 0) {
				kmem_free(svn, bytes);
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}
			kmem_free(svn, bytes);
		}

		DTRACE_PROBE3(sv_ioctl_3, dev_t, dev, int, *rvalp, int, 0);

		return (spcs_s_ocopyoutf(&kstatus, ustatus, 0));
		/* NOTREACHED */

	case SVIOC_VERSION:

		if (ilp32) {
			sv_version32_t svv32;

			if (ddi_copyin((void *)arg, &svv32,
			    sizeof (svv32), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}

			svv32.svv_major_rev = sv_major_rev;
			svv32.svv_minor_rev = sv_minor_rev;
			svv32.svv_micro_rev = sv_micro_rev;
			svv32.svv_baseline_rev = sv_baseline_rev;

			if (ddi_copyout(&svv32, (void *)arg,
			    sizeof (svv32), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}

			ustatus = (spcs_s_info_t)svv32.svv_error;
		} else {
			if (ddi_copyin((void *)arg, &svv,
			    sizeof (svv), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}

			svv.svv_major_rev = sv_major_rev;
			svv.svv_minor_rev = sv_minor_rev;
			svv.svv_micro_rev = sv_micro_rev;
			svv.svv_baseline_rev = sv_baseline_rev;

			if (ddi_copyout(&svv, (void *)arg,
			    sizeof (svv), mode) < 0) {
				spcs_s_kfree(kstatus);
				return (EFAULT);
			}

			ustatus = svv.svv_error;
		}

		DTRACE_PROBE3(sv_ioctl_4, dev_t, dev, int, *rvalp, int, 0);

		return (spcs_s_ocopyoutf(&kstatus, ustatus, 0));
		/* NOTREACHED */

	case SVIOC_UNLOAD:
		rc = sv_prepare_unload();

		if (ddi_copyout(&rc, (void *)arg, sizeof (rc), mode) < 0) {
			rc = EFAULT;
		}

		spcs_s_kfree(kstatus);
		return (rc);

	default:
		spcs_s_kfree(kstatus);

		DTRACE_PROBE3(sv_ioctl_4, dev_t, dev, int, *rvalp, int, EINVAL);

		return (EINVAL);
		/* NOTREACHED */
	}

	/* NOTREACHED */
}


/* ARGSUSED */
static int
svprint(dev_t dev, char *str)
{
	int instance = ddi_get_instance(sv_dip);
	cmn_err(CE_WARN, "!%s%d: %s", ddi_get_name(sv_dip), instance, str);
	return (0);
}


static void
_sv_lyr_strategy(struct buf *bp)
{
	caddr_t buf_addr;		/* pointer to linear buffer in bp */
	nsc_buf_t *bufh = NULL;
	nsc_buf_t *hndl = NULL;
	sv_dev_t *svp;
	nsc_vec_t *v;
	sv_maj_t *maj;
	nsc_size_t fba_req, fba_len;	/* FBA lengths */
	nsc_off_t fba_off;		/* FBA offset */
	size_t tocopy, nbytes;		/* byte lengths */
	int rw, rc;			/* flags and return codes */
	int (*fn)();

	rc = 0;

	if (sv_debug > 5)
		cmn_err(CE_CONT, "!_sv_lyr_strategy(%p)\n", (void *)bp);

	svp = sv_find_enabled(bp->b_edev, &maj);
	if (svp == NULL) {
		if (maj && (fn = maj->sm_strategy) != 0) {
			if (!(maj->sm_flag & D_MP)) {
				UNSAFE_ENTER();
				rc = (*fn)(bp);
				UNSAFE_EXIT();
			} else {
				rc = (*fn)(bp);
			}
			return;
		} else {
			bioerror(bp, ENODEV);
			biodone(bp);
			return;
		}
	}

	ASSERT(RW_READ_HELD(&svp->sv_lock));

	if (svp->sv_flag == 0) {
		/*
		 * guard access mode
		 * - prevent user level access to the device
		 */
		DTRACE_PROBE1(sv_lyr_strategy_err_guard, struct buf *, bp);
		bioerror(bp, EPERM);
		goto out;
	}

	if ((rc = sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH)) != 0) {
		DTRACE_PROBE1(sv_lyr_strategy_err_rsrv, struct buf *, bp);

		if (rc == EINTR)
			cmn_err(CE_WARN, "!nsc_reserve() returned EINTR");
		bioerror(bp, rc);
		goto out;
	}

	if (bp->b_lblkno >= (diskaddr_t)svp->sv_nblocks) {
		DTRACE_PROBE1(sv_lyr_strategy_eof, struct buf *, bp);

		if (bp->b_flags & B_READ) {
			/* return EOF, not an error */
			bp->b_resid = bp->b_bcount;
			bioerror(bp, 0);
		} else
			bioerror(bp, EINVAL);

		goto done;
	}

	/*
	 * Preallocate a handle once per call to strategy.
	 * If this fails, then the nsc_alloc_buf() will allocate
	 * a temporary handle per allocation/free pair.
	 */

	DTRACE_PROBE1(sv_dbg_alloch_start, sv_dev_t *, svp);

	bufh = nsc_alloc_handle(svp->sv_fd, NULL, NULL, NULL);

	DTRACE_PROBE1(sv_dbg_alloch_end, sv_dev_t *, svp);

	if (bufh && (bufh->sb_flag & NSC_HACTIVE) != 0) {
		DTRACE_PROBE1(sv_lyr_strategy_err_hactive, struct buf *, bp);

		cmn_err(CE_WARN,
		    "!sv: allocated active handle (bufh %p, flags %x)",
		    (void *)bufh, bufh->sb_flag);

		bioerror(bp, ENXIO);
		goto done;
	}

	fba_req = FBA_LEN(bp->b_bcount);
	if (fba_req + bp->b_lblkno > (diskaddr_t)svp->sv_nblocks)
		fba_req = (nsc_size_t)(svp->sv_nblocks - bp->b_lblkno);

	rw = (bp->b_flags & B_READ) ? NSC_READ : NSC_WRITE;

	bp_mapin(bp);

	bp->b_resid = bp->b_bcount;
	buf_addr = bp->b_un.b_addr;
	fba_off = 0;

	/*
	 * fba_req - requested size of transfer in FBAs after
	 *		truncation to device extent, and allowing for
	 *		possible non-FBA bounded final chunk.
	 * fba_off - offset of start of chunk from start of bp in FBAs.
	 * fba_len - size of this chunk in FBAs.
	 */
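
	/*
	 * Worked example (hypothetical numbers): for a 1 MB transfer
	 * (b_bcount = 1048576, so fba_req = 2048 FBAs of 512 bytes)
	 * against a device with sv_maxfbas = 256, the loop below runs
	 * 8 times, each pass allocating, copying, and (for writes)
	 * flushing one 256-FBA chunk, then advancing fba_off by
	 * fba_len.
	 */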

loop:
	fba_len = min(fba_req, svp->sv_maxfbas);
	hndl = bufh;

	DTRACE_PROBE4(sv_dbg_allocb_start,
	    sv_dev_t *, svp,
	    uint64_t, (uint64_t)(bp->b_lblkno + fba_off),
	    uint64_t, (uint64_t)fba_len,
	    int, rw);

	rc = nsc_alloc_buf(svp->sv_fd, (nsc_off_t)(bp->b_lblkno + fba_off),
	    fba_len, rw, &hndl);

	DTRACE_PROBE1(sv_dbg_allocb_end, sv_dev_t *, svp);

	if (rc > 0) {
		DTRACE_PROBE1(sv_lyr_strategy_err_alloc, struct buf *, bp);
		bioerror(bp, rc);
		if (hndl != bufh)
			(void) nsc_free_buf(hndl);
		hndl = NULL;
		goto done;
	}

	tocopy = min(FBA_SIZE(fba_len), bp->b_resid);
	v = hndl->sb_vec;

	if (rw == NSC_WRITE && FBA_OFF(tocopy) != 0) {
		/*
		 * Not overwriting all of the last FBA, so read in the
		 * old contents now before we overwrite it with the new
		 * data.
		 */

		DTRACE_PROBE2(sv_dbg_read_start, sv_dev_t *, svp,
		    uint64_t, (uint64_t)(hndl->sb_pos + hndl->sb_len - 1));

		rc = nsc_read(hndl, (hndl->sb_pos + hndl->sb_len - 1), 1, 0);
		if (rc > 0) {
			bioerror(bp, rc);
			goto done;
		}

		DTRACE_PROBE1(sv_dbg_read_end, sv_dev_t *, svp);
	}

	DTRACE_PROBE1(sv_dbg_bcopy_start, sv_dev_t *, svp);

	while (tocopy > 0) {
		nbytes = min(tocopy, (nsc_size_t)v->sv_len);

		if (bp->b_flags & B_READ)
			(void) bcopy(v->sv_addr, buf_addr, nbytes);
		else
			(void) bcopy(buf_addr, v->sv_addr, nbytes);

		bp->b_resid -= nbytes;
		buf_addr += nbytes;
		tocopy -= nbytes;
		v++;
	}

	DTRACE_PROBE1(sv_dbg_bcopy_end, sv_dev_t *, svp);

	if ((bp->b_flags & B_READ) == 0) {
		DTRACE_PROBE3(sv_dbg_write_start, sv_dev_t *, svp,
		    uint64_t, (uint64_t)hndl->sb_pos,
		    uint64_t, (uint64_t)hndl->sb_len);

		rc = nsc_write(hndl, hndl->sb_pos, hndl->sb_len, 0);

		DTRACE_PROBE1(sv_dbg_write_end, sv_dev_t *, svp);

		if (rc > 0) {
			bioerror(bp, rc);
			goto done;
		}
	}

	/*
	 * Adjust FBA offset and requested (ie. remaining) length,
	 * loop if more data to transfer.
	 */

	fba_off += fba_len;
	fba_req -= fba_len;

	if (fba_req > 0) {
		DTRACE_PROBE1(sv_dbg_freeb_start, sv_dev_t *, svp);

		rc = nsc_free_buf(hndl);

		DTRACE_PROBE1(sv_dbg_freeb_end, sv_dev_t *, svp);

		if (rc > 0) {
			DTRACE_PROBE1(sv_lyr_strategy_err_free,
			    struct buf *, bp);
			bioerror(bp, rc);
		}

		hndl = NULL;

		if (rc <= 0)
			goto loop;
	}

done:
	if (hndl != NULL) {
		DTRACE_PROBE1(sv_dbg_freeb_start, sv_dev_t *, svp);

		rc = nsc_free_buf(hndl);

		DTRACE_PROBE1(sv_dbg_freeb_end, sv_dev_t *, svp);

		if (rc > 0) {
			DTRACE_PROBE1(sv_lyr_strategy_err_free,
			    struct buf *, bp);
			bioerror(bp, rc);
		}

		hndl = NULL;
	}

	if (bufh)
		(void) nsc_free_handle(bufh);

	DTRACE_PROBE1(sv_dbg_rlse_start, sv_dev_t *, svp);

	nsc_release(svp->sv_fd);

	DTRACE_PROBE1(sv_dbg_rlse_end, sv_dev_t *, svp);

out:
	if (sv_debug > 5) {
		cmn_err(CE_CONT,
		    "!_sv_lyr_strategy: bp %p, bufh %p, bp->b_error %d\n",
		    (void *)bp, (void *)bufh, bp->b_error);
	}

	DTRACE_PROBE2(sv_lyr_strategy_end, struct buf *, bp, int, bp->b_error);

	rw_exit(&svp->sv_lock);
	biodone(bp);
}


static void
sv_async_strategy(blind_t arg)
{
	struct buf *bp = (struct buf *)arg;
	_sv_lyr_strategy(bp);
}


static int
sv_lyr_strategy(struct buf *bp)
{
	nsthread_t *tp;
	int nlive;

	/*
	 * If B_ASYNC was part of the DDI we could use it as a hint to
	 * not create a thread for synchronous i/o.
	 */
	if (sv_dev_to_sv(bp->b_edev, NULL) == NULL) {
		/* not sv enabled - just pass through */
		DTRACE_PROBE1(sv_lyr_strategy_notsv, struct buf *, bp);
		_sv_lyr_strategy(bp);
		return (0);
	}

	if (sv_debug > 4) {
		cmn_err(CE_CONT, "!sv_lyr_strategy: nthread %d nlive %d\n",
		    nst_nthread(sv_tset), nst_nlive(sv_tset));
	}

	/*
	 * If only guard devices are enabled there will be no
	 * threadset, so don't try to use it.
	 */
	tp = NULL;
	if (sv_tset != NULL) {
		tp = nst_create(sv_tset, sv_async_strategy, (blind_t)bp, 0);
	}

	if (tp == NULL) {
		/*
		 * Out of threads, so fall back to synchronous i/o.
		 */
		if (sv_debug > 0) {
			cmn_err(CE_CONT,
			    "!sv_lyr_strategy: thread alloc failed\n");
		}

		DTRACE_PROBE1(sv_lyr_strategy_no_thread,
		    struct buf *, bp);

		_sv_lyr_strategy(bp);
		sv_no_threads++;
	} else {
		nlive = nst_nlive(sv_tset);
		if (nlive > sv_max_nlive) {
			if (sv_debug > 0) {
				cmn_err(CE_CONT,
				    "!sv_lyr_strategy: "
				    "new max nlive %d (nthread %d)\n",
				    nlive, nst_nthread(sv_tset));
			}

			sv_max_nlive = nlive;
		}
	}

	return (0);
}


#ifndef offsetof
#define	offsetof(s, m)	((size_t)(&((s *)0)->m))
#endif
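
/*
 * For illustration only (example values assumed, not taken verbatim
 * from this file): offsetof() is used below to compute the byte
 * offset of a single p_size field inside the caller's vtoc buffer,
 * so that only that field needs to be patched with ddi_copyout().
 * For an ILP32 caller asking about partition 2, for example:
 *
 *	offset = offsetof(struct vtoc32, v_part)
 *	    + sizeof (struct partition32) * 2
 *	    + offsetof(struct partition32, p_size);
 *
 * and the 32-bit p_size is then copied out to (arg + offset).
 */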

/*
 * Rewrite the size of the current partition in the vtoc returned to
 * the caller, so that the (possibly different) virtual volume size
 * is reported instead of the physical size.
 */
static int
sv_fix_dkiocgvtoc(const intptr_t arg, const int mode, sv_dev_t *svp)
{
	size_t offset;
	int ilp32;
	int pnum;
	int rc;

	ilp32 = (ddi_model_convert_from((mode & FMODELS)) == DDI_MODEL_ILP32);

	rc = nskern_partition(svp->sv_dev, &pnum);
	if (rc != 0) {
		return (rc);
	}

	if (pnum < 0 || pnum >= V_NUMPAR) {
		cmn_err(CE_WARN,
		    "!sv_gvtoc: unable to determine partition number "
		    "for dev %lx", svp->sv_dev);
		return (EINVAL);
	}

	if (ilp32) {
		int32_t p_size;

#ifdef _SunOS_5_6
		offset = offsetof(struct vtoc, v_part);
		offset += sizeof (struct partition) * pnum;
		offset += offsetof(struct partition, p_size);
#else
		offset = offsetof(struct vtoc32, v_part);
		offset += sizeof (struct partition32) * pnum;
		offset += offsetof(struct partition32, p_size);
#endif

		p_size = (int32_t)svp->sv_nblocks;
		if (p_size == 0) {
			if (sv_reserve(svp->sv_fd,
			    NSC_MULTI|NSC_PCATCH) == 0) {
				p_size = (int32_t)svp->sv_nblocks;
				nsc_release(svp->sv_fd);
			} else {
				rc = EINTR;
			}
		}

		if ((rc == 0) && ddi_copyout(&p_size, (void *)(arg + offset),
		    sizeof (p_size), mode) != 0) {
			rc = EFAULT;
		}
	} else {
		long p_size;

		offset = offsetof(struct vtoc, v_part);
		offset += sizeof (struct partition) * pnum;
		offset += offsetof(struct partition, p_size);

		p_size = (long)svp->sv_nblocks;
		if (p_size == 0) {
			if (sv_reserve(svp->sv_fd,
			    NSC_MULTI|NSC_PCATCH) == 0) {
				p_size = (long)svp->sv_nblocks;
				nsc_release(svp->sv_fd);
			} else {
				rc = EINTR;
			}
		}

		if ((rc == 0) && ddi_copyout(&p_size, (void *)(arg + offset),
		    sizeof (p_size), mode) != 0) {
			rc = EFAULT;
		}
	}

	return (rc);
}


#ifdef DKIOCPARTITION
/*
 * Rewrite the size of the current partition.
 *
 * arg is a dk_efi_t.
 *
 * dk_efi_t->dki_data = (void *)(uintptr_t)efi.dki_data_64;
 *
 * dk_efi_t->dki_data --> efi_gpt_t (label header)
 * dk_efi_t->dki_data + 1 --> efi_gpe_t[] (array of partitions)
 *
 * efi_gpt_t->efi_gpt_PartitionEntryArrayCRC32 --> CRC32 of array of parts
 * efi_gpt_t->efi_gpt_HeaderCRC32 --> CRC32 of header itself
 *
 * This assumes that sizeof (efi_gpt_t) is the same as the size of a
 * logical block on the disk.
 *
 * Everything is little endian (i.e. disk format).
 */
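/*
 * Ordering note with a small sketch (this mirrors the code below):
 * the partition entry array CRC must be recomputed before the header
 * CRC, because the header CRC covers the header including the freshly
 * stored array CRC.  Each CRC field is zeroed before computing the
 * CRC that covers it, and the result is stored inverted, little
 * endian:
 *
 *	gpt.efi_gpt_PartitionEntryArrayCRC32 = 0;
 *	CRC32(crc, gpe, sgpe, -1U, sv_crc32_table);
 *	gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
 */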
static int
sv_fix_dkiocgetefi(const intptr_t arg, const int mode, sv_dev_t *svp)
{
	dk_efi_t efi;
	efi_gpt_t gpt;
	efi_gpe_t *gpe = NULL;
	size_t sgpe;
	uint64_t p_size;	/* virtual partition size from nsctl */
	uint32_t crc;
	int unparts;		/* number of parts in user's array */
	int pnum;
	int rc;

	rc = nskern_partition(svp->sv_dev, &pnum);
	if (rc != 0) {
		return (rc);
	}

	if (pnum < 0) {
		cmn_err(CE_WARN,
		    "!sv_efi: unable to determine partition number for dev %lx",
		    svp->sv_dev);
		return (EINVAL);
	}

	if (ddi_copyin((void *)arg, &efi, sizeof (efi), mode)) {
		return (EFAULT);
	}

	efi.dki_data = (void *)(uintptr_t)efi.dki_data_64;

	/* need room for the header plus at least one partition entry */
	if (efi.dki_length < sizeof (gpt) + sizeof (*gpe)) {
		return (EINVAL);
	}

	if (ddi_copyin((void *)efi.dki_data, &gpt, sizeof (gpt), mode)) {
		rc = EFAULT;
		goto out;
	}

	if ((unparts = LE_32(gpt.efi_gpt_NumberOfPartitionEntries)) == 0)
		unparts = 1;
	else if (pnum >= unparts) {
		cmn_err(CE_WARN,
		    "!sv_efi: partition# beyond end of user array (%d >= %d)",
		    pnum, unparts);
		return (EINVAL);
	}

	sgpe = sizeof (*gpe) * unparts;
	gpe = kmem_alloc(sgpe, KM_SLEEP);

	if (ddi_copyin((void *)(efi.dki_data + 1), gpe, sgpe, mode)) {
		rc = EFAULT;
		goto out;
	}

	p_size = svp->sv_nblocks;
	if (p_size == 0) {
		if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) {
			p_size = (diskaddr_t)svp->sv_nblocks;
			nsc_release(svp->sv_fd);
		} else {
			rc = EINTR;
		}
	}

	gpe[pnum].efi_gpe_EndingLBA = LE_64(
	    LE_64(gpe[pnum].efi_gpe_StartingLBA) + p_size - 1);

	gpt.efi_gpt_PartitionEntryArrayCRC32 = 0;
	CRC32(crc, gpe, sgpe, -1U, sv_crc32_table);
	gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);

	gpt.efi_gpt_HeaderCRC32 = 0;
	CRC32(crc, &gpt, sizeof (gpt), -1U, sv_crc32_table);
	gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);

	if ((rc == 0) && ddi_copyout(&gpt, efi.dki_data, sizeof (gpt), mode)) {
		rc = EFAULT;
		goto out;
	}

	if ((rc == 0) && ddi_copyout(gpe, efi.dki_data + 1, sgpe, mode)) {
		rc = EFAULT;
		goto out;
	}

out:
	if (gpe) {
		kmem_free(gpe, sgpe);
	}

	return (rc);
}


/*
 * Rewrite the size of the partition specified by p_partno.
 *
 * Note that if a DKIOCPARTITION is issued to an fd opened against a
 * non-sv'd device, but p_partno requests the size for a different
 * device that is sv'd, this function will *not* be called, as sv is
 * not interposed on the original device (the fd).
 *
 * It would not be easy to change this: we cannot get the partition
 * number for the non-sv'd device, so we cannot compute the dev_t of
 * the (sv'd) p_partno device, and so cannot find out whether it is
 * sv'd or get its size from nsctl.
 *
 * See also the "Bug 4755783" comment in sv_lyr_ioctl().
 */
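/*
 * Illustrative sketch (this mirrors the code below and assumes, as
 * that code does, that the slices of a disk occupy consecutive minor
 * numbers): to switch from the current partition pnum to the
 * requested p64.p_partno, the sibling dev_t is computed as
 *
 *	nminor = getminor(svp->sv_dev) + (p64.p_partno - pnum);
 *	ndev = makedevice(getmajor(svp->sv_dev), nminor);
 *
 * and sv_find_enabled(ndev, NULL) then yields the sv state for that
 * slice, if it is sv enabled.
 */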
static int
sv_fix_dkiocpartition(const intptr_t arg, const int mode, sv_dev_t *svp)
{
	struct partition64 p64;
	sv_dev_t *nsvp = NULL;
	diskaddr_t p_size;
	minor_t nminor;
	int pnum, rc;
	dev_t ndev;

	rc = nskern_partition(svp->sv_dev, &pnum);
	if (rc != 0) {
		return (rc);
	}

	if (ddi_copyin((void *)arg, &p64, sizeof (p64), mode)) {
		return (EFAULT);
	}

	if (p64.p_partno != pnum) {
		/* switch to requested partition, not the current one */
		nminor = getminor(svp->sv_dev) + (p64.p_partno - pnum);
		ndev = makedevice(getmajor(svp->sv_dev), nminor);
		nsvp = sv_find_enabled(ndev, NULL);
		if (nsvp == NULL) {
			/* not sv device - just return */
			return (0);
		}

		svp = nsvp;
	}

	p_size = svp->sv_nblocks;
	if (p_size == 0) {
		if (sv_reserve(svp->sv_fd, NSC_MULTI|NSC_PCATCH) == 0) {
			p_size = (diskaddr_t)svp->sv_nblocks;
			nsc_release(svp->sv_fd);
		} else {
			rc = EINTR;
		}
	}

	if (nsvp != NULL) {
		rw_exit(&nsvp->sv_lock);
	}

	if ((rc == 0) && ddi_copyout(&p_size,
	    (void *)(arg + offsetof(struct partition64, p_size)),
	    sizeof (p_size), mode) != 0) {
		return (EFAULT);
	}

	return (rc);
}
#endif /* DKIOCPARTITION */


static int
sv_lyr_ioctl(const dev_t dev, const int cmd, const intptr_t arg,
    const int mode, cred_t *crp, int *rvalp)
{
	sv_dev_t *svp;
	sv_maj_t *maj;
	int (*fn)();
	int rc = 0;

	maj = 0;
	fn = 0;

	/*
	 * If sv_mod_status is 0 or SV_PREVENT_UNLOAD, continue as
	 * normal.  Otherwise it was previously SV_PREVENT_UNLOAD and
	 * is now SV_ALLOW_UNLOAD, meaning the driver is expected to
	 * unload eventually, so fail the ioctl with EBUSY.
	 *
	 * SV_ALLOW_UNLOAD is a final state, so there is no need to
	 * grab sv_mutex.
	 */
	if (sv_mod_status == SV_ALLOW_UNLOAD) {
		return (EBUSY);
	}

	svp = sv_find_enabled(dev, &maj);
	if (svp != NULL) {
		if (nskernd_isdaemon()) {
			/*
			 * This is nskernd which always needs to see
			 * the underlying disk device accurately.
			 *
			 * So just pass the ioctl straight through
			 * to the underlying driver as though the device
			 * was not sv enabled.
			 */
			DTRACE_PROBE2(sv_lyr_ioctl_nskernd, sv_dev_t *, svp,
			    dev_t, dev);

			rw_exit(&svp->sv_lock);
			svp = NULL;
		} else {
			ASSERT(RW_READ_HELD(&svp->sv_lock));
		}
	}

	/*
	 * We now have a locked and enabled SV device, or a non-SV device.
	 */

	switch (cmd) {
		/*
		 * DKIOCGVTOC, DKIOCSVTOC, DKIOCPARTITION, DKIOCGETEFI
		 * and DKIOCSETEFI are intercepted and faked up, as some
		 * i/o providers emulate volumes of a different size from
		 * the underlying volume.
		 *
		 * Setting the size by rewriting the vtoc is not permitted.
		 */

	case DKIOCSVTOC:
#ifdef DKIOCPARTITION
	case DKIOCSETEFI:
#endif
		if (svp == NULL) {
			/* not intercepted -- allow ioctl through */
			break;
		}

		rw_exit(&svp->sv_lock);

		DTRACE_PROBE2(sv_lyr_ioctl_svtoc, dev_t, dev, int, EPERM);

		return (EPERM);

	default:
		break;
	}

	/*
	 * Pass through the real ioctl command.
	 */

	if (maj && (fn = maj->sm_ioctl) != 0) {
		if (!(maj->sm_flag & D_MP)) {
			UNSAFE_ENTER();
			rc = (*fn)(dev, cmd, arg, mode, crp, rvalp);
			UNSAFE_EXIT();
		} else {
			rc = (*fn)(dev, cmd, arg, mode, crp, rvalp);
		}
	} else {
		rc = ENODEV;
	}

	/*
	 * Bug 4755783
	 * Fix up the size of the current partition to allow for the
	 * virtual volume being a different size from the physical
	 * volume (e.g. for II compact dependent shadows).
	 *
	 * Note that this only attempts to fix up the current partition
	 * - the one that the ioctl was issued against.  There could be
	 * other sv'd partitions in the same vtoc, but we cannot tell,
	 * so we don't attempt to fix them up.
	 */

	if (svp != NULL && rc == 0) {
		switch (cmd) {
		case DKIOCGVTOC:
			rc = sv_fix_dkiocgvtoc(arg, mode, svp);
			break;

#ifdef DKIOCPARTITION
		case DKIOCGETEFI:
			rc = sv_fix_dkiocgetefi(arg, mode, svp);
			break;

		case DKIOCPARTITION:
			rc = sv_fix_dkiocpartition(arg, mode, svp);
			break;
#endif /* DKIOCPARTITION */
		}
	}

	if (svp != NULL) {
		rw_exit(&svp->sv_lock);
	}

	return (rc);
}
