xref: /dflybsd-src/sys/kern/kern_device.c (revision 06cdb70d3c2d4585f58c8f08e050a5d105d1bbb6)
1 /*
2  * Copyright (c) 2003 Matthew Dillon <dillon@backplane.com> All rights reserved.
3  * cdevsw from kern/kern_conf.c Copyright (c) 1995 Terrence R. Lambert
4  * cdevsw from kern/kern_conf.c Copyright (c) 1995 Julian R. Elishcer,
5  *							All rights reserved.
6  * Copyright (c) 1982, 1986, 1991, 1993
7  *	The Regents of the University of California.  All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/kernel.h>
34 #include <sys/sysctl.h>
35 #include <sys/module.h>
36 #include <sys/malloc.h>
37 #include <sys/conf.h>
38 #include <sys/bio.h>
39 #include <sys/buf.h>
40 #include <sys/vnode.h>
41 #include <sys/queue.h>
42 #include <sys/device.h>
43 #include <sys/tree.h>
44 #include <sys/syslink_rpc.h>
45 #include <sys/proc.h>
46 #include <machine/stdarg.h>
47 #include <sys/devfs.h>
48 #include <sys/dsched.h>
49 
50 #include <sys/thread2.h>
51 #include <sys/mplock2.h>
52 
53 static int mpsafe_writes;
54 static int mplock_writes;
55 static int mpsafe_reads;
56 static int mplock_reads;
57 static int mpsafe_strategies;
58 static int mplock_strategies;
59 
60 SYSCTL_INT(_kern, OID_AUTO, mpsafe_writes, CTLFLAG_RD, &mpsafe_writes,
61 	   0, "mpsafe writes");
62 SYSCTL_INT(_kern, OID_AUTO, mplock_writes, CTLFLAG_RD, &mplock_writes,
63 	   0, "non-mpsafe writes");
64 SYSCTL_INT(_kern, OID_AUTO, mpsafe_reads, CTLFLAG_RD, &mpsafe_reads,
65 	   0, "mpsafe reads");
66 SYSCTL_INT(_kern, OID_AUTO, mplock_reads, CTLFLAG_RD, &mplock_reads,
67 	   0, "non-mpsafe reads");
68 SYSCTL_INT(_kern, OID_AUTO, mpsafe_strategies, CTLFLAG_RD, &mpsafe_strategies,
69 	   0, "mpsafe strategies");
70 SYSCTL_INT(_kern, OID_AUTO, mplock_strategies, CTLFLAG_RD, &mplock_strategies,
71 	   0, "non-mpsafe strategies");
72 
73 /*
74  * system link descriptors identify the command in the
75  * arguments structure.
76  */
77 #define DDESCNAME(name) __CONCAT(__CONCAT(dev_,name),_desc)
78 
79 #define DEVOP_DESC_INIT(name)						\
80 	    struct syslink_desc DDESCNAME(name) = {			\
81 		__offsetof(struct dev_ops, __CONCAT(d_, name)),	\
82 	    #name }
83 
84 DEVOP_DESC_INIT(default);
85 DEVOP_DESC_INIT(open);
86 DEVOP_DESC_INIT(close);
87 DEVOP_DESC_INIT(read);
88 DEVOP_DESC_INIT(write);
89 DEVOP_DESC_INIT(ioctl);
90 DEVOP_DESC_INIT(dump);
91 DEVOP_DESC_INIT(psize);
92 DEVOP_DESC_INIT(mmap);
93 DEVOP_DESC_INIT(strategy);
94 DEVOP_DESC_INIT(kqfilter);
95 DEVOP_DESC_INIT(revoke);
96 DEVOP_DESC_INIT(clone);
97 
98 /*
99  * Misc default ops
100  */
101 struct dev_ops dead_dev_ops;
102 
103 struct dev_ops default_dev_ops = {
104 	{ "null" },
105 	.d_default = NULL,	/* must be NULL */
106 	.d_open = noopen,
107 	.d_close = noclose,
108 	.d_read = noread,
109 	.d_write = nowrite,
110 	.d_ioctl = noioctl,
111 	.d_mmap = nommap,
112 	.d_strategy = nostrategy,
113 	.d_dump = nodump,
114 	.d_psize = nopsize,
115 	.d_kqfilter = nokqfilter,
116 	.d_revoke = norevoke,
117 	.d_clone = noclone
118 };
119 
120 static __inline
121 int
122 dev_needmplock(cdev_t dev)
123 {
124     return((dev->si_ops->head.flags & D_MPSAFE) == 0);
125 }
126 
127 /************************************************************************
128  *			GENERAL DEVICE API FUNCTIONS			*
129  ************************************************************************
130  *
131  * The MPSAFEness of these depends on dev->si_ops->head.flags
132  */
133 int
134 dev_dopen(cdev_t dev, int oflags, int devtype, struct ucred *cred)
135 {
136 	struct dev_open_args ap;
137 	int needmplock = dev_needmplock(dev);
138 	int error;
139 
140 	ap.a_head.a_desc = &dev_open_desc;
141 	ap.a_head.a_dev = dev;
142 	ap.a_oflags = oflags;
143 	ap.a_devtype = devtype;
144 	ap.a_cred = cred;
145 
146 	if (needmplock)
147 		get_mplock();
148 	error = dev->si_ops->d_open(&ap);
149 	if (needmplock)
150 		rel_mplock();
151 	return (error);
152 }
153 
154 int
155 dev_dclose(cdev_t dev, int fflag, int devtype)
156 {
157 	struct dev_close_args ap;
158 	int needmplock = dev_needmplock(dev);
159 	int error;
160 
161 	ap.a_head.a_desc = &dev_close_desc;
162 	ap.a_head.a_dev = dev;
163 	ap.a_fflag = fflag;
164 	ap.a_devtype = devtype;
165 
166 	if (needmplock)
167 		get_mplock();
168 	error = dev->si_ops->d_close(&ap);
169 	if (needmplock)
170 		rel_mplock();
171 	return (error);
172 }
173 
174 int
175 dev_dread(cdev_t dev, struct uio *uio, int ioflag)
176 {
177 	struct dev_read_args ap;
178 	int needmplock = dev_needmplock(dev);
179 	int error;
180 
181 	ap.a_head.a_desc = &dev_read_desc;
182 	ap.a_head.a_dev = dev;
183 	ap.a_uio = uio;
184 	ap.a_ioflag = ioflag;
185 
186 	if (needmplock) {
187 		get_mplock();
188 		++mplock_reads;
189 	} else {
190 		++mpsafe_reads;
191 	}
192 	error = dev->si_ops->d_read(&ap);
193 	if (needmplock)
194 		rel_mplock();
195 	if (error == 0)
196 		dev->si_lastread = time_second;
197 	return (error);
198 }
199 
200 int
201 dev_dwrite(cdev_t dev, struct uio *uio, int ioflag)
202 {
203 	struct dev_write_args ap;
204 	int needmplock = dev_needmplock(dev);
205 	int error;
206 
207 	dev->si_lastwrite = time_second;
208 	ap.a_head.a_desc = &dev_write_desc;
209 	ap.a_head.a_dev = dev;
210 	ap.a_uio = uio;
211 	ap.a_ioflag = ioflag;
212 
213 	if (needmplock) {
214 		get_mplock();
215 		++mplock_writes;
216 	} else {
217 		++mpsafe_writes;
218 	}
219 	error = dev->si_ops->d_write(&ap);
220 	if (needmplock)
221 		rel_mplock();
222 	return (error);
223 }
224 
225 int
226 dev_dioctl(cdev_t dev, u_long cmd, caddr_t data, int fflag, struct ucred *cred,
227 	   struct sysmsg *msg)
228 {
229 	struct dev_ioctl_args ap;
230 	int needmplock = dev_needmplock(dev);
231 	int error;
232 
233 	ap.a_head.a_desc = &dev_ioctl_desc;
234 	ap.a_head.a_dev = dev;
235 	ap.a_cmd = cmd;
236 	ap.a_data = data;
237 	ap.a_fflag = fflag;
238 	ap.a_cred = cred;
239 	ap.a_sysmsg = msg;
240 
241 	if (needmplock)
242 		get_mplock();
243 	error = dev->si_ops->d_ioctl(&ap);
244 	if (needmplock)
245 		rel_mplock();
246 	return (error);
247 }
248 
249 int
250 dev_dmmap(cdev_t dev, vm_offset_t offset, int nprot)
251 {
252 	struct dev_mmap_args ap;
253 	int needmplock = dev_needmplock(dev);
254 	int error;
255 
256 	ap.a_head.a_desc = &dev_mmap_desc;
257 	ap.a_head.a_dev = dev;
258 	ap.a_offset = offset;
259 	ap.a_nprot = nprot;
260 
261 	if (needmplock)
262 		get_mplock();
263 	error = dev->si_ops->d_mmap(&ap);
264 	if (needmplock)
265 		rel_mplock();
266 
267 	if (error == 0)
268 		return(ap.a_result);
269 	return(-1);
270 }
271 
272 int
273 dev_dclone(cdev_t dev)
274 {
275 	struct dev_clone_args ap;
276 	int needmplock = dev_needmplock(dev);
277 	int error;
278 
279 	ap.a_head.a_desc = &dev_clone_desc;
280 	ap.a_head.a_dev = dev;
281 
282 	if (needmplock)
283 		get_mplock();
284 	error = dev->si_ops->d_clone(&ap);
285 	if (needmplock)
286 		rel_mplock();
287 	return (error);
288 }
289 
290 int
291 dev_drevoke(cdev_t dev)
292 {
293 	struct dev_revoke_args ap;
294 	int needmplock = dev_needmplock(dev);
295 	int error;
296 
297 	ap.a_head.a_desc = &dev_revoke_desc;
298 	ap.a_head.a_dev = dev;
299 
300 	if (needmplock)
301 		get_mplock();
302 	error = dev->si_ops->d_revoke(&ap);
303 	if (needmplock)
304 		rel_mplock();
305 
306 	return (error);
307 }
308 
309 /*
310  * Core device strategy call, used to issue I/O on a device.  There are
311  * two versions, a non-chained version and a chained version.  The chained
312  * version reuses a BIO set up by vn_strategy().  The only difference is
313  * that, for now, we do not push a new tracking structure when chaining
314  * from vn_strategy.  XXX this will ultimately have to change.
315  */
316 void
317 dev_dstrategy(cdev_t dev, struct bio *bio)
318 {
319 	struct dev_strategy_args ap;
320 	struct bio_track *track;
321 	int needmplock = dev_needmplock(dev);
322 
323 	ap.a_head.a_desc = &dev_strategy_desc;
324 	ap.a_head.a_dev = dev;
325 	ap.a_bio = bio;
326 
327 	KKASSERT(bio->bio_track == NULL);
328 	KKASSERT(bio->bio_buf->b_cmd != BUF_CMD_DONE);
329 	if (bio->bio_buf->b_cmd == BUF_CMD_READ)
330 	    track = &dev->si_track_read;
331 	else
332 	    track = &dev->si_track_write;
333 	bio_track_ref(track);
334 	bio->bio_track = track;
335 
336 	if (dsched_is_clear_buf_priv(bio->bio_buf))
337 		dsched_new_buf(bio->bio_buf);
338 
339 	KKASSERT((bio->bio_flags & BIO_DONE) == 0);
340 	if (needmplock) {
341 		get_mplock();
342 		++mplock_strategies;
343 	} else {
344 		++mpsafe_strategies;
345 	}
346 	(void)dev->si_ops->d_strategy(&ap);
347 	if (needmplock)
348 		rel_mplock();
349 }
350 
351 void
352 dev_dstrategy_chain(cdev_t dev, struct bio *bio)
353 {
354 	struct dev_strategy_args ap;
355 	int needmplock = dev_needmplock(dev);
356 
357 	ap.a_head.a_desc = &dev_strategy_desc;
358 	ap.a_head.a_dev = dev;
359 	ap.a_bio = bio;
360 
361 	KKASSERT(bio->bio_track != NULL);
362 	KKASSERT((bio->bio_flags & BIO_DONE) == 0);
363 	if (needmplock)
364 		get_mplock();
365 	(void)dev->si_ops->d_strategy(&ap);
366 	if (needmplock)
367 		rel_mplock();
368 }
369 
370 /*
371  * note: the disk layer is expected to set count, blkno, and secsize before
372  * forwarding the message.
373  */
374 int
375 dev_ddump(cdev_t dev, void *virtual, vm_offset_t physical, off_t offset,
376     size_t length)
377 {
378 	struct dev_dump_args ap;
379 	int needmplock = dev_needmplock(dev);
380 	int error;
381 
382 	ap.a_head.a_desc = &dev_dump_desc;
383 	ap.a_head.a_dev = dev;
384 	ap.a_count = 0;
385 	ap.a_blkno = 0;
386 	ap.a_secsize = 0;
387 	ap.a_virtual = virtual;
388 	ap.a_physical = physical;
389 	ap.a_offset = offset;
390 	ap.a_length = length;
391 
392 	if (needmplock)
393 		get_mplock();
394 	error = dev->si_ops->d_dump(&ap);
395 	if (needmplock)
396 		rel_mplock();
397 	return (error);
398 }
399 
400 int64_t
401 dev_dpsize(cdev_t dev)
402 {
403 	struct dev_psize_args ap;
404 	int needmplock = dev_needmplock(dev);
405 	int error;
406 
407 	ap.a_head.a_desc = &dev_psize_desc;
408 	ap.a_head.a_dev = dev;
409 
410 	if (needmplock)
411 		get_mplock();
412 	error = dev->si_ops->d_psize(&ap);
413 	if (needmplock)
414 		rel_mplock();
415 
416 	if (error == 0)
417 		return (ap.a_result);
418 	return(-1);
419 }
420 
421 /*
422  * Pass-thru to the device kqfilter.
423  *
424  * NOTE: We explicitly preset a_result to 0 so d_kqfilter() functions
425  *	 which return 0 do not have to bother setting a_result.
426  */
427 int
428 dev_dkqfilter(cdev_t dev, struct knote *kn)
429 {
430 	struct dev_kqfilter_args ap;
431 	int needmplock = dev_needmplock(dev);
432 	int error;
433 
434 	ap.a_head.a_desc = &dev_kqfilter_desc;
435 	ap.a_head.a_dev = dev;
436 	ap.a_kn = kn;
437 	ap.a_result = 0;
438 
439 	if (needmplock)
440 		get_mplock();
441 	error = dev->si_ops->d_kqfilter(&ap);
442 	if (needmplock)
443 		rel_mplock();
444 
445 	if (error == 0)
446 		return(ap.a_result);
447 	else if (error == EOPNOTSUPP)
448 		return(EOPNOTSUPP);
449 	return(ENODEV);
450 }
451 
452 /************************************************************************
453  *			DEVICE HELPER FUNCTIONS				*
454  ************************************************************************/
455 
456 /*
457  * MPSAFE
458  */
459 int
460 dev_drefs(cdev_t dev)
461 {
462     return(dev->si_sysref.refcnt);
463 }
464 
465 /*
466  * MPSAFE
467  */
468 const char *
469 dev_dname(cdev_t dev)
470 {
471     return(dev->si_ops->head.name);
472 }
473 
474 /*
475  * MPSAFE
476  */
477 int
478 dev_dflags(cdev_t dev)
479 {
480     return(dev->si_ops->head.flags);
481 }
482 
483 /*
484  * MPSAFE
485  */
486 int
487 dev_dmaj(cdev_t dev)
488 {
489     return(dev->si_ops->head.maj);
490 }
491 
492 /*
493  * Used when forwarding a request through layers.  The caller adjusts
494  * ap->a_head.a_dev and then calls this function.
495  */
496 int
497 dev_doperate(struct dev_generic_args *ap)
498 {
499     int (*func)(struct dev_generic_args *);
500     int needmplock = dev_needmplock(ap->a_dev);
501     int error;
502 
503     func = *(void **)((char *)ap->a_dev->si_ops + ap->a_desc->sd_offset);
504 
505     if (needmplock)
506 	    get_mplock();
507     error = func(ap);
508     if (needmplock)
509 	    rel_mplock();
510 
511     return (error);
512 }
513 
514 /*
515  * Used by the console intercept code only.  Issue an operation through
516  * a foreign ops structure allowing the ops structure associated
517  * with the device to remain intact.
518  */
519 int
520 dev_doperate_ops(struct dev_ops *ops, struct dev_generic_args *ap)
521 {
522     int (*func)(struct dev_generic_args *);
523     int needmplock = ((ops->head.flags & D_MPSAFE) == 0);
524     int error;
525 
526     func = *(void **)((char *)ops + ap->a_desc->sd_offset);
527 
528     if (needmplock)
529 	    get_mplock();
530     error = func(ap);
531     if (needmplock)
532 	    rel_mplock();
533 
534     return (error);
535 }
536 
537 /*
538  * Convert a template dev_ops into the real thing by filling in
539  * uninitialized fields.
540  */
541 void
542 compile_dev_ops(struct dev_ops *ops)
543 {
544 	int offset;
545 
546 	for (offset = offsetof(struct dev_ops, dev_ops_first_field);
547 	     offset <= offsetof(struct dev_ops, dev_ops_last_field);
548 	     offset += sizeof(void *)
549 	) {
550 		void **func_p = (void **)((char *)ops + offset);
551 		void **def_p = (void **)((char *)&default_dev_ops + offset);
552 		if (*func_p == NULL) {
553 			if (ops->d_default)
554 				*func_p = ops->d_default;
555 			else
556 				*func_p = *def_p;
557 		}
558 	}
559 }
560 
561 /************************************************************************
562  *			MAJOR/MINOR SPACE FUNCTION 			*
563  ************************************************************************/
564 
565 /*
566  * This makes a dev_ops entry visible to userland (e.g /dev/<blah>).
567  *
568  * Disk devices typically register their major, e.g. 'ad0', and then call
569  * into the disk label management code which overloads its own onto e.g. 'ad0'
570  * to support all the various slice and partition combinations.
571  *
572  * The mask/match supplied in this call are a full 32 bits and the same
573  * mask and match must be specified in a later dev_ops_remove() call to
574  * match this add.  However, the match value for the minor number should never
575  * have any bits set in the major number's bit range (8-15).  The mask value
576  * may be conveniently specified as -1 without creating any major number
577  * interference.
578  */
579 
580 static
581 int
582 rb_dev_ops_compare(struct dev_ops_maj *a, struct dev_ops_maj *b)
583 {
584     if (a->maj < b->maj)
585 	return(-1);
586     else if (a->maj > b->maj)
587 	return(1);
588     return(0);
589 }
590 
591 RB_GENERATE2(dev_ops_rb_tree, dev_ops_maj, rbnode, rb_dev_ops_compare, int, maj);
592 
593 struct dev_ops_rb_tree dev_ops_rbhead = RB_INITIALIZER(dev_ops_rbhead);
594 
595 int
596 dev_ops_remove_all(struct dev_ops *ops)
597 {
598 	return devfs_destroy_dev_by_ops(ops, -1);
599 }
600 
601 int
602 dev_ops_remove_minor(struct dev_ops *ops, int minor)
603 {
604 	return devfs_destroy_dev_by_ops(ops, minor);
605 }
606 
607 struct dev_ops *
608 dev_ops_intercept(cdev_t dev, struct dev_ops *iops)
609 {
610 	struct dev_ops *oops = dev->si_ops;
611 
612 	compile_dev_ops(iops);
613 	iops->head.maj = oops->head.maj;
614 	iops->head.data = oops->head.data;
615 	iops->head.flags = oops->head.flags;
616 	dev->si_ops = iops;
617 	dev->si_flags |= SI_INTERCEPTED;
618 
619 	return (oops);
620 }
621 
622 void
623 dev_ops_restore(cdev_t dev, struct dev_ops *oops)
624 {
625 	struct dev_ops *iops = dev->si_ops;
626 
627 	dev->si_ops = oops;
628 	dev->si_flags &= ~SI_INTERCEPTED;
629 	iops->head.maj = 0;
630 	iops->head.data = NULL;
631 	iops->head.flags = 0;
632 }
633 
634 /************************************************************************
635  *			DEFAULT DEV OPS FUNCTIONS			*
636  ************************************************************************/
637 
638 
639 /*
640  * Unsupported devswitch functions (e.g. for writing to read-only device).
641  * XXX may belong elsewhere.
642  */
643 int
644 norevoke(struct dev_revoke_args *ap)
645 {
646 	/* take no action */
647 	return(0);
648 }
649 
650 int
651 noclone(struct dev_clone_args *ap)
652 {
653 	/* take no action */
654 	return (0);	/* allow the clone */
655 }
656 
657 int
658 noopen(struct dev_open_args *ap)
659 {
660 	return (ENODEV);
661 }
662 
663 int
664 noclose(struct dev_close_args *ap)
665 {
666 	return (ENODEV);
667 }
668 
669 int
670 noread(struct dev_read_args *ap)
671 {
672 	return (ENODEV);
673 }
674 
675 int
676 nowrite(struct dev_write_args *ap)
677 {
678 	return (ENODEV);
679 }
680 
681 int
682 noioctl(struct dev_ioctl_args *ap)
683 {
684 	return (ENODEV);
685 }
686 
687 int
688 nokqfilter(struct dev_kqfilter_args *ap)
689 {
690 	return (ENODEV);
691 }
692 
693 int
694 nommap(struct dev_mmap_args *ap)
695 {
696 	return (ENODEV);
697 }
698 
699 int
700 nostrategy(struct dev_strategy_args *ap)
701 {
702 	struct bio *bio = ap->a_bio;
703 
704 	bio->bio_buf->b_flags |= B_ERROR;
705 	bio->bio_buf->b_error = EOPNOTSUPP;
706 	biodone(bio);
707 	return(0);
708 }
709 
710 int
711 nopsize(struct dev_psize_args *ap)
712 {
713 	ap->a_result = 0;
714 	return(0);
715 }
716 
717 int
718 nodump(struct dev_dump_args *ap)
719 {
720 	return (ENODEV);
721 }
722 
723 /*
724  * XXX this is probably bogus.  Any device that uses it isn't checking the
725  * minor number.
726  */
727 int
728 nullopen(struct dev_open_args *ap)
729 {
730 	return (0);
731 }
732 
733 int
734 nullclose(struct dev_close_args *ap)
735 {
736 	return (0);
737 }
738 
739