/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Fault Management Architecture (FMA) Resource and Protocol Support
 *
 * The routines contained herein provide services to support kernel subsystems
 * in publishing fault management telemetry (see PSARC 2002/412 and 2003/089).
 *
 * Name-Value Pair Lists
 *
 * The embodiment of an FMA protocol element (event, fmri or authority) is a
 * name-value pair list (nvlist_t).  FMA-specific nvlist constructor and
 * destructor functions, fm_nvlist_create() and fm_nvlist_destroy(), are used
 * to create an nvpair list using custom allocators.  Callers may choose to
 * allocate either from the kernel memory allocator, or from a preallocated
 * buffer, useful in constrained contexts like high-level interrupt routines.
 *
 * Protocol Event and FMRI Construction
 *
 * Convenience routines are provided to construct nvlist events according to
 * the FMA Event Protocol and Naming Schema specification for ereports and
 * FMRIs for the dev, cpu, hc, mem, legacy hc and de schemes.
 *
 * ENA Manipulation
 *
 * Routines to generate ENA formats 0, 1 and 2 are available, as well as
 * routines to increment formats 1 and 2.  Individual fields within the
 * ENA are extractable via fm_ena_time_get(), fm_ena_id_get(),
 * fm_ena_format_get() and fm_ena_generation_get().
 */
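
/*
 * Illustrative sketch (not part of the build): a hypothetical caller
 * generating a format 1 ENA and taking it apart again with the field
 * accessors.  All functions used here are defined later in this file;
 * the FM_ENA_* constants come from the FM headers.
 *
 *	uint64_t ena, gen;
 *
 *	ena = fm_ena_generate(0, FM_ENA_FMT1);	(timestamp 0 -> gethrtime())
 *	ASSERT(fm_ena_format_get(ena) == FM_ENA_FMT1);
 *	gen = fm_ena_generation_get(ena);	(0 until incremented)
 *	ena = fm_ena_increment(ena);		(bumps only the gen field)
 */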

#include <sys/types.h>
#include <sys/time.h>
#include <sys/list.h>
#include <sys/nvpair.h>
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>
#include <sys/sunddi.h>
#include <sys/systeminfo.h>
#include <sys/fm/util.h>
#include <sys/fm/protocol.h>
#include <sys/kstat.h>
#include <sys/zfs_context.h>
#ifdef _KERNEL
#include <sys/atomic.h>
#include <sys/condvar.h>
#include <sys/console.h>
#include <sys/time.h>
#include <sys/zfs_ioctl.h>

int zfs_zevent_len_max = 0;
int zfs_zevent_cols = 80;
int zfs_zevent_console = 0;

static int zevent_len_cur = 0;
static int zevent_waiters = 0;
static int zevent_flags = 0;

/* Num events rate limited since the last time zfs_zevent_next() was called */
static uint64_t ratelimit_dropped = 0;

/*
 * The EID (Event IDentifier) is used to uniquely tag a zevent when it is
 * posted.  The posted EIDs are monotonically increasing but not persistent.
 * They will be reset to the initial value (1) each time the kernel module is
 * loaded.
 */
static uint64_t zevent_eid = 0;

static kmutex_t zevent_lock;
static list_t zevent_list;
static kcondvar_t zevent_cv;
#endif /* _KERNEL */


/*
 * Common fault management kstats to record event generation failures
 */

struct erpt_kstat {
	kstat_named_t	erpt_dropped;		/* num erpts dropped on post */
	kstat_named_t	erpt_set_failed;	/* num erpt set failures */
	kstat_named_t	fmri_set_failed;	/* num fmri set failures */
	kstat_named_t	payload_set_failed;	/* num payload set failures */
};

static struct erpt_kstat erpt_kstat_data = {
	{ "erpt-dropped", KSTAT_DATA_UINT64 },
	{ "erpt-set-failed", KSTAT_DATA_UINT64 },
	{ "fmri-set-failed", KSTAT_DATA_UINT64 },
	{ "payload-set-failed", KSTAT_DATA_UINT64 }
};

kstat_t *fm_ksp;

#ifdef _KERNEL

/*
 * Formatting utility function for fm_nvprintr.  We attempt to wrap chunks of
 * output so they aren't split across console lines, and return the end column.
 */
/*PRINTFLIKE4*/
static int
fm_printf(int depth, int c, int cols, const char *format, ...)
{
	va_list ap;
	int width;
	char c1;

	va_start(ap, format);
	width = vsnprintf(&c1, sizeof (c1), format, ap);
	va_end(ap);

	if (c + width >= cols) {
		console_printf("\n");
		c = 0;
		if (format[0] != ' ' && depth > 0) {
			console_printf(" ");
			c++;
		}
	}

	va_start(ap, format);
	console_vprintf(format, ap);
	va_end(ap);

	return ((c + width) % cols);
}

/*
 * Recursively print an nvlist in the specified column width and return the
 * column we end up in.  This function calls itself recursively for nested
 * nvlists and is invoked by fm_nvprint(), below.  We generically format the
 * entire nvpair using hexadecimal integers and strings, and elide any
 * integer arrays.  Arrays are basically used for cache dumps right now, so
 * we suppress them so as not to overwhelm the amount of console output we
 * produce at panic time.  This can be further enhanced as FMA technology
 * grows based upon the needs of consumers.  All FMA telemetry is logged
 * using the dump device transport, so the console output serves only as a
 * fallback in case this procedure is unsuccessful.
 */
static int
fm_nvprintr(nvlist_t *nvl, int d, int c, int cols)
{
	nvpair_t *nvp;

	for (nvp = nvlist_next_nvpair(nvl, NULL);
	    nvp != NULL; nvp = nvlist_next_nvpair(nvl, nvp)) {

		data_type_t type = nvpair_type(nvp);
		const char *name = nvpair_name(nvp);

		boolean_t b;
		uint8_t i8;
		uint16_t i16;
		uint32_t i32;
		uint64_t i64;
		char *str;
		nvlist_t *cnv;

		if (strcmp(name, FM_CLASS) == 0)
			continue; /* already printed by caller */

		c = fm_printf(d, c, cols, " %s=", name);

		switch (type) {
		case DATA_TYPE_BOOLEAN:
			c = fm_printf(d + 1, c, cols, " 1");
			break;

		case DATA_TYPE_BOOLEAN_VALUE:
			(void) nvpair_value_boolean_value(nvp, &b);
			c = fm_printf(d + 1, c, cols, b ? "1" : "0");
			break;

		case DATA_TYPE_BYTE:
			(void) nvpair_value_byte(nvp, &i8);
			c = fm_printf(d + 1, c, cols, "0x%x", i8);
			break;

		case DATA_TYPE_INT8:
			(void) nvpair_value_int8(nvp, (void *)&i8);
			c = fm_printf(d + 1, c, cols, "0x%x", i8);
			break;

		case DATA_TYPE_UINT8:
			(void) nvpair_value_uint8(nvp, &i8);
			c = fm_printf(d + 1, c, cols, "0x%x", i8);
			break;

		case DATA_TYPE_INT16:
			(void) nvpair_value_int16(nvp, (void *)&i16);
			c = fm_printf(d + 1, c, cols, "0x%x", i16);
			break;

		case DATA_TYPE_UINT16:
			(void) nvpair_value_uint16(nvp, &i16);
			c = fm_printf(d + 1, c, cols, "0x%x", i16);
			break;

		case DATA_TYPE_INT32:
			(void) nvpair_value_int32(nvp, (void *)&i32);
			c = fm_printf(d + 1, c, cols, "0x%x", i32);
			break;

		case DATA_TYPE_UINT32:
			(void) nvpair_value_uint32(nvp, &i32);
			c = fm_printf(d + 1, c, cols, "0x%x", i32);
			break;

		case DATA_TYPE_INT64:
			(void) nvpair_value_int64(nvp, (void *)&i64);
			c = fm_printf(d + 1, c, cols, "0x%llx",
			    (u_longlong_t)i64);
			break;

		case DATA_TYPE_UINT64:
			(void) nvpair_value_uint64(nvp, &i64);
			c = fm_printf(d + 1, c, cols, "0x%llx",
			    (u_longlong_t)i64);
			break;

		case DATA_TYPE_HRTIME:
			(void) nvpair_value_hrtime(nvp, (void *)&i64);
			c = fm_printf(d + 1, c, cols, "0x%llx",
			    (u_longlong_t)i64);
			break;

		case DATA_TYPE_STRING:
			(void) nvpair_value_string(nvp, &str);
			c = fm_printf(d + 1, c, cols, "\"%s\"",
			    str ? str : "<NULL>");
			break;

		case DATA_TYPE_NVLIST:
			c = fm_printf(d + 1, c, cols, "[");
			(void) nvpair_value_nvlist(nvp, &cnv);
			c = fm_nvprintr(cnv, d + 1, c, cols);
			c = fm_printf(d + 1, c, cols, " ]");
			break;

		case DATA_TYPE_NVLIST_ARRAY: {
			nvlist_t **val;
			uint_t i, nelem;

			c = fm_printf(d + 1, c, cols, "[");
			(void) nvpair_value_nvlist_array(nvp, &val, &nelem);
			for (i = 0; i < nelem; i++) {
				c = fm_nvprintr(val[i], d + 1, c, cols);
			}
			c = fm_printf(d + 1, c, cols, " ]");
			}
			break;

		case DATA_TYPE_INT8_ARRAY: {
			int8_t *val;
			uint_t i, nelem;

			c = fm_printf(d + 1, c, cols, "[ ");
			(void) nvpair_value_int8_array(nvp, &val, &nelem);
			for (i = 0; i < nelem; i++)
				c = fm_printf(d + 1, c, cols, "0x%llx ",
				    (u_longlong_t)val[i]);

			c = fm_printf(d + 1, c, cols, "]");
			break;
			}

		case DATA_TYPE_UINT8_ARRAY: {
			uint8_t *val;
			uint_t i, nelem;

			c = fm_printf(d + 1, c, cols, "[ ");
			(void) nvpair_value_uint8_array(nvp, &val, &nelem);
			for (i = 0; i < nelem; i++)
				c = fm_printf(d + 1, c, cols, "0x%llx ",
				    (u_longlong_t)val[i]);

			c = fm_printf(d + 1, c, cols, "]");
			break;
			}

		case DATA_TYPE_INT16_ARRAY: {
			int16_t *val;
			uint_t i, nelem;

			c = fm_printf(d + 1, c, cols, "[ ");
			(void) nvpair_value_int16_array(nvp, &val, &nelem);
			for (i = 0; i < nelem; i++)
				c = fm_printf(d + 1, c, cols, "0x%llx ",
				    (u_longlong_t)val[i]);

			c = fm_printf(d + 1, c, cols, "]");
			break;
			}

		case DATA_TYPE_UINT16_ARRAY: {
			uint16_t *val;
			uint_t i, nelem;

			c = fm_printf(d + 1, c, cols, "[ ");
			(void) nvpair_value_uint16_array(nvp, &val, &nelem);
			for (i = 0; i < nelem; i++)
				c = fm_printf(d + 1, c, cols, "0x%llx ",
				    (u_longlong_t)val[i]);

			c = fm_printf(d + 1, c, cols, "]");
			break;
			}

		case DATA_TYPE_INT32_ARRAY: {
			int32_t *val;
			uint_t i, nelem;

			c = fm_printf(d + 1, c, cols, "[ ");
			(void) nvpair_value_int32_array(nvp, &val, &nelem);
			for (i = 0; i < nelem; i++)
				c = fm_printf(d + 1, c, cols, "0x%llx ",
				    (u_longlong_t)val[i]);

			c = fm_printf(d + 1, c, cols, "]");
			break;
			}

		case DATA_TYPE_UINT32_ARRAY: {
			uint32_t *val;
			uint_t i, nelem;

			c = fm_printf(d + 1, c, cols, "[ ");
			(void) nvpair_value_uint32_array(nvp, &val, &nelem);
			for (i = 0; i < nelem; i++)
				c = fm_printf(d + 1, c, cols, "0x%llx ",
				    (u_longlong_t)val[i]);

			c = fm_printf(d + 1, c, cols, "]");
			break;
			}

		case DATA_TYPE_INT64_ARRAY: {
			int64_t *val;
			uint_t i, nelem;

			c = fm_printf(d + 1, c, cols, "[ ");
			(void) nvpair_value_int64_array(nvp, &val, &nelem);
			for (i = 0; i < nelem; i++)
				c = fm_printf(d + 1, c, cols, "0x%llx ",
				    (u_longlong_t)val[i]);

			c = fm_printf(d + 1, c, cols, "]");
			break;
			}

		case DATA_TYPE_UINT64_ARRAY: {
			uint64_t *val;
			uint_t i, nelem;

			c = fm_printf(d + 1, c, cols, "[ ");
			(void) nvpair_value_uint64_array(nvp, &val, &nelem);
			for (i = 0; i < nelem; i++)
				c = fm_printf(d + 1, c, cols, "0x%llx ",
				    (u_longlong_t)val[i]);

			c = fm_printf(d + 1, c, cols, "]");
			break;
			}

		case DATA_TYPE_STRING_ARRAY:
		case DATA_TYPE_BOOLEAN_ARRAY:
		case DATA_TYPE_BYTE_ARRAY:
			c = fm_printf(d + 1, c, cols, "[...]");
			break;

		case DATA_TYPE_UNKNOWN:
		case DATA_TYPE_DONTCARE:
			c = fm_printf(d + 1, c, cols, "<unknown>");
			break;
		}
	}

	return (c);
}

void
fm_nvprint(nvlist_t *nvl)
{
	char *class;
	int c = 0;

	console_printf("\n");

	if (nvlist_lookup_string(nvl, FM_CLASS, &class) == 0)
		c = fm_printf(0, c, zfs_zevent_cols, "%s", class);

	if (fm_nvprintr(nvl, 0, c, zfs_zevent_cols) != 0)
		console_printf("\n");

	console_printf("\n");
}

static zevent_t *
zfs_zevent_alloc(void)
{
	zevent_t *ev;

	ev = kmem_zalloc(sizeof (zevent_t), KM_SLEEP);

	list_create(&ev->ev_ze_list, sizeof (zfs_zevent_t),
	    offsetof(zfs_zevent_t, ze_node));
	list_link_init(&ev->ev_node);

	return (ev);
}

static void
zfs_zevent_free(zevent_t *ev)
{
	/* Run provided cleanup callback */
	ev->ev_cb(ev->ev_nvl, ev->ev_detector);

	list_destroy(&ev->ev_ze_list);
	kmem_free(ev, sizeof (zevent_t));
}

static void
zfs_zevent_drain(zevent_t *ev)
{
	zfs_zevent_t *ze;

	ASSERT(MUTEX_HELD(&zevent_lock));
	list_remove(&zevent_list, ev);

	/* Remove references to this event in all private file data */
	while ((ze = list_head(&ev->ev_ze_list)) != NULL) {
		list_remove(&ev->ev_ze_list, ze);
		ze->ze_zevent = NULL;
		ze->ze_dropped++;
	}

	zfs_zevent_free(ev);
}

void
zfs_zevent_drain_all(int *count)
{
	zevent_t *ev;

	mutex_enter(&zevent_lock);
	while ((ev = list_head(&zevent_list)) != NULL)
		zfs_zevent_drain(ev);

	*count = zevent_len_cur;
	zevent_len_cur = 0;
	mutex_exit(&zevent_lock);
}

/*
 * New zevents are inserted at the head.  If the maximum queue
 * length is exceeded, a zevent will be drained from the tail.
 * As part of this, any user space processes which currently have
 * a reference to this zevent_t in their private data will have
 * that reference set to NULL.
 */
static void
zfs_zevent_insert(zevent_t *ev)
{
	ASSERT(MUTEX_HELD(&zevent_lock));
	list_insert_head(&zevent_list, ev);

	if (zevent_len_cur >= zfs_zevent_len_max)
		zfs_zevent_drain(list_tail(&zevent_list));
	else
		zevent_len_cur++;
}

/*
 * Post a zevent. The cb will be called when nvl and detector are no longer
 * needed, i.e.:
 * - An error happened and a zevent can't be posted. In this case, cb is called
 *   before zfs_zevent_post() returns.
 * - The event is being drained and freed.
 */
int
zfs_zevent_post(nvlist_t *nvl, nvlist_t *detector, zevent_cb_t *cb)
{
	inode_timespec_t tv;
	int64_t tv_array[2];
	uint64_t eid;
	size_t nvl_size = 0;
	zevent_t *ev;
	int error;

	ASSERT(cb != NULL);

	gethrestime(&tv);
	tv_array[0] = tv.tv_sec;
	tv_array[1] = tv.tv_nsec;

	error = nvlist_add_int64_array(nvl, FM_EREPORT_TIME, tv_array, 2);
	if (error) {
		atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
		goto out;
	}

	eid = atomic_inc_64_nv(&zevent_eid);
	error = nvlist_add_uint64(nvl, FM_EREPORT_EID, eid);
	if (error) {
		atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
		goto out;
	}

	error = nvlist_size(nvl, &nvl_size, NV_ENCODE_NATIVE);
	if (error) {
		atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64);
		goto out;
	}

	if (nvl_size > ERPT_DATA_SZ || nvl_size == 0) {
		atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64);
		error = EOVERFLOW;
		goto out;
	}

	if (zfs_zevent_console)
		fm_nvprint(nvl);

	ev = zfs_zevent_alloc();
	if (ev == NULL) {
		atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64);
		error = ENOMEM;
		goto out;
	}

	ev->ev_nvl = nvl;
	ev->ev_detector = detector;
	ev->ev_cb = cb;
	ev->ev_eid = eid;

	mutex_enter(&zevent_lock);
	zfs_zevent_insert(ev);
	cv_broadcast(&zevent_cv);
	mutex_exit(&zevent_lock);

out:
	if (error)
		cb(nvl, detector);

	return (error);
}
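
/*
 * Illustrative sketch (hypothetical producer, not compiled here): posting
 * an ereport through zfs_zevent_post().  The class string, pool_guid and
 * vdev_guid below are placeholders.  The callback owns the final cleanup
 * of nvl and detector in both the error and the drain paths, so the
 * caller must not free them itself once the post has been attempted.
 *
 *	static void
 *	example_done(nvlist_t *nvl, nvlist_t *detector)
 *	{
 *		fm_nvlist_destroy(nvl, FM_NVA_FREE);
 *		if (detector != NULL)
 *			fm_nvlist_destroy(detector, FM_NVA_FREE);
 *	}
 *
 *	nvlist_t *nvl = fm_nvlist_create(NULL);
 *	nvlist_t *det = fm_nvlist_create(NULL);
 *
 *	fm_fmri_zfs_set(det, ZFS_SCHEME_VERSION0, pool_guid, vdev_guid);
 *	fm_ereport_set(nvl, FM_EREPORT_VERS0, "fs.zfs.example",
 *	    fm_ena_generate(0, FM_ENA_FMT1), det, NULL);
 *	(void) zfs_zevent_post(nvl, det, example_done);
 */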

static int
zfs_zevent_minor_to_state(minor_t minor, zfs_zevent_t **ze)
{
	*ze = zfsdev_get_state(minor, ZST_ZEVENT);
	if (*ze == NULL)
		return (SET_ERROR(EBADF));

	return (0);
}

int
zfs_zevent_fd_hold(int fd, minor_t *minorp, zfs_zevent_t **ze)
{
	int error;

	error = zfsdev_getminor(fd, minorp);
	if (error == 0)
		error = zfs_zevent_minor_to_state(*minorp, ze);

	if (error)
		zfs_zevent_fd_rele(fd);

	return (error);
}

void
zfs_zevent_fd_rele(int fd)
{
	zfs_file_put(fd);
}

/*
 * Get the next zevent in the stream and place a copy in 'event'.  This
 * may fail with ENOMEM if the encoded nvlist size exceeds the passed
 * 'event_size'.  In this case the stream pointer is not advanced and
 * 'event_size' is set to the minimum required buffer size.
 */
int
zfs_zevent_next(zfs_zevent_t *ze, nvlist_t **event, uint64_t *event_size,
    uint64_t *dropped)
{
	zevent_t *ev;
	size_t size;
	int error = 0;

	mutex_enter(&zevent_lock);
	if (ze->ze_zevent == NULL) {
		/* A new stream starts at the beginning/tail */
		ev = list_tail(&zevent_list);
		if (ev == NULL) {
			error = ENOENT;
			goto out;
		}
	} else {
		/*
		 * An existing stream continues with the next element; remove
		 * ourselves from the wait queue for the previous element.
		 */
		ev = list_prev(&zevent_list, ze->ze_zevent);
		if (ev == NULL) {
			error = ENOENT;
			goto out;
		}
	}

	VERIFY(nvlist_size(ev->ev_nvl, &size, NV_ENCODE_NATIVE) == 0);
	if (size > *event_size) {
		*event_size = size;
		error = ENOMEM;
		goto out;
	}

	if (ze->ze_zevent)
		list_remove(&ze->ze_zevent->ev_ze_list, ze);

	ze->ze_zevent = ev;
	list_insert_head(&ev->ev_ze_list, ze);
	(void) nvlist_dup(ev->ev_nvl, event, KM_SLEEP);
	*dropped = ze->ze_dropped;

#ifdef _KERNEL
	/* Include events dropped due to rate limiting */
	*dropped += ratelimit_dropped;
	ratelimit_dropped = 0;
#endif
	ze->ze_dropped = 0;
out:
	mutex_exit(&zevent_lock);

	return (error);
}
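
/*
 * Illustrative sketch (hypothetical consumer): the ENOMEM contract above
 * implies a grow-and-retry loop, since the cursor only advances on
 * success and 'event_size' is updated to the required length.
 *
 *	nvlist_t *event;
 *	uint64_t size = 1024, dropped;	(initial guess; grown on demand)
 *	int error;
 *
 *	do {
 *		error = zfs_zevent_next(ze, &event, &size, &dropped);
 *	} while (error == ENOMEM);
 *	if (error == 0) {
 *		... consume the duplicated nvlist ...
 *		nvlist_free(event);
 *	}
 */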

/*
 * Wait in an interruptible state for any new events.
 */
int
zfs_zevent_wait(zfs_zevent_t *ze)
{
	int error = EAGAIN;

	mutex_enter(&zevent_lock);
	zevent_waiters++;

	while (error == EAGAIN) {
		if (zevent_flags & ZEVENT_SHUTDOWN) {
			error = SET_ERROR(ESHUTDOWN);
			break;
		}

		error = cv_wait_sig(&zevent_cv, &zevent_lock);
		if (signal_pending(current)) {
			error = SET_ERROR(EINTR);
			break;
		} else if (!list_is_empty(&zevent_list)) {
			error = 0;
			continue;
		} else {
			error = EAGAIN;
		}
	}

	zevent_waiters--;
	mutex_exit(&zevent_lock);

	return (error);
}

/*
 * The caller may seek to a specific EID by passing that EID.  If the EID
 * is still available in the posted list of events the cursor is positioned
 * there.  Otherwise ENOENT is returned and the cursor is not moved.
 *
 * There are two reserved EIDs which may be passed and will never fail.
 * ZEVENT_SEEK_START positions the cursor at the start of the list, and
 * ZEVENT_SEEK_END positions the cursor at the end of the list.
 */
int
zfs_zevent_seek(zfs_zevent_t *ze, uint64_t eid)
{
	zevent_t *ev;
	int error = 0;

	mutex_enter(&zevent_lock);

	if (eid == ZEVENT_SEEK_START) {
		if (ze->ze_zevent)
			list_remove(&ze->ze_zevent->ev_ze_list, ze);

		ze->ze_zevent = NULL;
		goto out;
	}

	if (eid == ZEVENT_SEEK_END) {
		if (ze->ze_zevent)
			list_remove(&ze->ze_zevent->ev_ze_list, ze);

		ev = list_head(&zevent_list);
		if (ev) {
			ze->ze_zevent = ev;
			list_insert_head(&ev->ev_ze_list, ze);
		} else {
			ze->ze_zevent = NULL;
		}

		goto out;
	}

	for (ev = list_tail(&zevent_list); ev != NULL;
	    ev = list_prev(&zevent_list, ev)) {
		if (ev->ev_eid == eid) {
			if (ze->ze_zevent)
				list_remove(&ze->ze_zevent->ev_ze_list, ze);

			ze->ze_zevent = ev;
			list_insert_head(&ev->ev_ze_list, ze);
			break;
		}
	}

	if (ev == NULL)
		error = ENOENT;

out:
	mutex_exit(&zevent_lock);

	return (error);
}
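
/*
 * Illustrative sketch (hypothetical caller): skipping the backlog so only
 * newly posted events are returned, then trying to resume from a saved
 * EID ('last_eid' is a placeholder).
 *
 *	(void) zfs_zevent_seek(ze, ZEVENT_SEEK_END);
 *	...
 *	if (zfs_zevent_seek(ze, last_eid) == ENOENT)
 *		(void) zfs_zevent_seek(ze, ZEVENT_SEEK_START);
 */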

void
zfs_zevent_init(zfs_zevent_t **zep)
{
	zfs_zevent_t *ze;

	ze = *zep = kmem_zalloc(sizeof (zfs_zevent_t), KM_SLEEP);
	list_link_init(&ze->ze_node);
}

void
zfs_zevent_destroy(zfs_zevent_t *ze)
{
	mutex_enter(&zevent_lock);
	if (ze->ze_zevent)
		list_remove(&ze->ze_zevent->ev_ze_list, ze);
	mutex_exit(&zevent_lock);

	kmem_free(ze, sizeof (zfs_zevent_t));
}
#endif /* _KERNEL */

/*
 * Wrappers for FM nvlist allocators
 */
/* ARGSUSED */
static void *
i_fm_alloc(nv_alloc_t *nva, size_t size)
{
	return (kmem_zalloc(size, KM_SLEEP));
}

/* ARGSUSED */
static void
i_fm_free(nv_alloc_t *nva, void *buf, size_t size)
{
	kmem_free(buf, size);
}

const nv_alloc_ops_t fm_mem_alloc_ops = {
	.nv_ao_init = NULL,
	.nv_ao_fini = NULL,
	.nv_ao_alloc = i_fm_alloc,
	.nv_ao_free = i_fm_free,
	.nv_ao_reset = NULL
};

/*
 * Create and initialize a new nv_alloc_t for a fixed buffer, buf.  A pointer
 * to the newly allocated nv_alloc_t structure is returned upon success or NULL
 * is returned to indicate that the nv_alloc structure could not be created.
 */
nv_alloc_t *
fm_nva_xcreate(char *buf, size_t bufsz)
{
	nv_alloc_t *nvhdl = kmem_zalloc(sizeof (nv_alloc_t), KM_SLEEP);

	if (bufsz == 0 || nv_alloc_init(nvhdl, nv_fixed_ops, buf, bufsz) != 0) {
		kmem_free(nvhdl, sizeof (nv_alloc_t));
		return (NULL);
	}

	return (nvhdl);
}

/*
 * Destroy a previously allocated nv_alloc structure.  The fixed buffer
 * associated with nva must be freed by the caller.
 */
void
fm_nva_xdestroy(nv_alloc_t *nva)
{
	nv_alloc_fini(nva);
	kmem_free(nva, sizeof (nv_alloc_t));
}
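
/*
 * Illustrative sketch (hypothetical caller): building an nvlist out of a
 * preallocated buffer, as described at the top of this file for
 * constrained contexts where the kernel allocator must be avoided.
 * 'ebuf' is a placeholder for caller-owned scratch space.
 *
 *	static char ebuf[ERPT_DATA_SZ];
 *
 *	nv_alloc_t *nva = fm_nva_xcreate(ebuf, sizeof (ebuf));
 *	if (nva != NULL) {
 *		nvlist_t *nvl = fm_nvlist_create(nva);
 *		...
 *		fm_nvlist_destroy(nvl, FM_NVA_RETAIN);	(keep nva for reuse)
 *		fm_nva_xdestroy(nva);	(ebuf itself remains caller-owned)
 *	}
 */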

/*
 * Create a new nv list.  A pointer to a new nv list structure is returned
 * upon success or NULL is returned to indicate that the structure could
 * not be created.  The newly created nv list is created and managed by the
 * operations installed in nva.  If nva is NULL, the default FMA nva
 * operations are installed and used.
 *
 * When called from the kernel and nva == NULL, this function must be called
 * from passive kernel context with no locks held that can prevent a
 * sleeping memory allocation from occurring.  Otherwise, this function may
 * be called from other kernel contexts as long as a valid nva created via
 * fm_nva_xcreate() is supplied.
 */
nvlist_t *
fm_nvlist_create(nv_alloc_t *nva)
{
	int hdl_alloced = 0;
	nvlist_t *nvl;
	nv_alloc_t *nvhdl;

	if (nva == NULL) {
		nvhdl = kmem_zalloc(sizeof (nv_alloc_t), KM_SLEEP);

		if (nv_alloc_init(nvhdl, &fm_mem_alloc_ops, NULL, 0) != 0) {
			kmem_free(nvhdl, sizeof (nv_alloc_t));
			return (NULL);
		}
		hdl_alloced = 1;
	} else {
		nvhdl = nva;
	}

	if (nvlist_xalloc(&nvl, NV_UNIQUE_NAME, nvhdl) != 0) {
		if (hdl_alloced) {
			nv_alloc_fini(nvhdl);
			kmem_free(nvhdl, sizeof (nv_alloc_t));
		}
		return (NULL);
	}

	return (nvl);
}

/*
 * Destroy a previously allocated nvlist structure.  flag indicates whether
 * or not the associated nva structure should be freed (FM_NVA_FREE) or
 * retained (FM_NVA_RETAIN).  Retaining the nv alloc structure allows
 * it to be re-used for future nvlist creation operations.
 */
void
fm_nvlist_destroy(nvlist_t *nvl, int flag)
{
	nv_alloc_t *nva = nvlist_lookup_nv_alloc(nvl);

	nvlist_free(nvl);

	if (nva != NULL) {
		if (flag == FM_NVA_FREE)
			fm_nva_xdestroy(nva);
	}
}

int
i_fm_payload_set(nvlist_t *payload, const char *name, va_list ap)
{
	int nelem, ret = 0;
	data_type_t type;

	while (ret == 0 && name != NULL) {
		type = va_arg(ap, data_type_t);
		switch (type) {
		case DATA_TYPE_BYTE:
			ret = nvlist_add_byte(payload, name,
			    va_arg(ap, uint_t));
			break;
		case DATA_TYPE_BYTE_ARRAY:
			nelem = va_arg(ap, int);
			ret = nvlist_add_byte_array(payload, name,
			    va_arg(ap, uchar_t *), nelem);
			break;
		case DATA_TYPE_BOOLEAN_VALUE:
			ret = nvlist_add_boolean_value(payload, name,
			    va_arg(ap, boolean_t));
			break;
		case DATA_TYPE_BOOLEAN_ARRAY:
			nelem = va_arg(ap, int);
			ret = nvlist_add_boolean_array(payload, name,
			    va_arg(ap, boolean_t *), nelem);
			break;
		case DATA_TYPE_INT8:
			ret = nvlist_add_int8(payload, name,
			    va_arg(ap, int));
			break;
		case DATA_TYPE_INT8_ARRAY:
			nelem = va_arg(ap, int);
			ret = nvlist_add_int8_array(payload, name,
			    va_arg(ap, int8_t *), nelem);
			break;
		case DATA_TYPE_UINT8:
			ret = nvlist_add_uint8(payload, name,
			    va_arg(ap, uint_t));
			break;
		case DATA_TYPE_UINT8_ARRAY:
			nelem = va_arg(ap, int);
			ret = nvlist_add_uint8_array(payload, name,
			    va_arg(ap, uint8_t *), nelem);
			break;
		case DATA_TYPE_INT16:
			ret = nvlist_add_int16(payload, name,
			    va_arg(ap, int));
			break;
		case DATA_TYPE_INT16_ARRAY:
			nelem = va_arg(ap, int);
			ret = nvlist_add_int16_array(payload, name,
			    va_arg(ap, int16_t *), nelem);
			break;
		case DATA_TYPE_UINT16:
			ret = nvlist_add_uint16(payload, name,
			    va_arg(ap, uint_t));
			break;
		case DATA_TYPE_UINT16_ARRAY:
			nelem = va_arg(ap, int);
			ret = nvlist_add_uint16_array(payload, name,
			    va_arg(ap, uint16_t *), nelem);
			break;
		case DATA_TYPE_INT32:
			ret = nvlist_add_int32(payload, name,
			    va_arg(ap, int32_t));
			break;
		case DATA_TYPE_INT32_ARRAY:
			nelem = va_arg(ap, int);
			ret = nvlist_add_int32_array(payload, name,
			    va_arg(ap, int32_t *), nelem);
			break;
		case DATA_TYPE_UINT32:
			ret = nvlist_add_uint32(payload, name,
			    va_arg(ap, uint32_t));
			break;
		case DATA_TYPE_UINT32_ARRAY:
			nelem = va_arg(ap, int);
			ret = nvlist_add_uint32_array(payload, name,
			    va_arg(ap, uint32_t *), nelem);
			break;
		case DATA_TYPE_INT64:
			ret = nvlist_add_int64(payload, name,
			    va_arg(ap, int64_t));
			break;
		case DATA_TYPE_INT64_ARRAY:
			nelem = va_arg(ap, int);
			ret = nvlist_add_int64_array(payload, name,
			    va_arg(ap, int64_t *), nelem);
			break;
		case DATA_TYPE_UINT64:
			ret = nvlist_add_uint64(payload, name,
			    va_arg(ap, uint64_t));
			break;
		case DATA_TYPE_UINT64_ARRAY:
			nelem = va_arg(ap, int);
			ret = nvlist_add_uint64_array(payload, name,
			    va_arg(ap, uint64_t *), nelem);
			break;
		case DATA_TYPE_STRING:
			ret = nvlist_add_string(payload, name,
			    va_arg(ap, char *));
			break;
		case DATA_TYPE_STRING_ARRAY:
			nelem = va_arg(ap, int);
			ret = nvlist_add_string_array(payload, name,
			    va_arg(ap, char **), nelem);
			break;
		case DATA_TYPE_NVLIST:
			ret = nvlist_add_nvlist(payload, name,
			    va_arg(ap, nvlist_t *));
			break;
		case DATA_TYPE_NVLIST_ARRAY:
			nelem = va_arg(ap, int);
			ret = nvlist_add_nvlist_array(payload, name,
			    va_arg(ap, nvlist_t **), nelem);
			break;
		default:
			ret = EINVAL;
		}

		name = va_arg(ap, char *);
	}
	return (ret);
}

void
fm_payload_set(nvlist_t *payload, ...)
{
	int ret;
	const char *name;
	va_list ap;

	va_start(ap, payload);
	name = va_arg(ap, char *);
	ret = i_fm_payload_set(payload, name, ap);
	va_end(ap);

	if (ret)
		atomic_inc_64(&erpt_kstat_data.payload_set_failed.value.ui64);
}
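
/*
 * Illustrative sketch (hypothetical caller): the varargs stream consumed
 * by i_fm_payload_set() is a NULL-terminated sequence of
 * (name, DATA_TYPE_*, value) tuples, where array types carry an element
 * count ahead of the pointer.  'lane_ids' is a placeholder uint8_t array.
 *
 *	fm_payload_set(payload,
 *	    "retries", DATA_TYPE_UINT32, 3,
 *	    "device", DATA_TYPE_STRING, "/dev/da0",
 *	    "lanes", DATA_TYPE_UINT8_ARRAY, 4, lane_ids,
 *	    NULL);
 */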

/*
 * Set up and validate the members of an ereport event according to:
 *
 *	Member name		Type		Value
 *	====================================================
 *	class			string		ereport
 *	version			uint8_t		0
 *	ena			uint64_t	<ena>
 *	detector		nvlist_t	<detector>
 *	ereport-payload		nvlist_t	<var args>
 *
 * We don't actually add a 'version' member to the payload.  Really,
 * the version quoted to us by our caller is that of the category 1
 * "ereport" event class (and we require FM_EREPORT_VERS0) but
 * the payload version of the actual leaf class event under construction
 * may be something else.  Callers should supply a version in the varargs,
 * or (better) we could take two version arguments - one for the
 * ereport category 1 classification (expect FM_EREPORT_VERS0) and one
 * for the leaf class.
 */
void
fm_ereport_set(nvlist_t *ereport, int version, const char *erpt_class,
    uint64_t ena, const nvlist_t *detector, ...)
{
	char ereport_class[FM_MAX_CLASS];
	const char *name;
	va_list ap;
	int ret;

	if (version != FM_EREPORT_VERS0) {
		atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
		return;
	}

	(void) snprintf(ereport_class, FM_MAX_CLASS, "%s.%s",
	    FM_EREPORT_CLASS, erpt_class);
	if (nvlist_add_string(ereport, FM_CLASS, ereport_class) != 0) {
		atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
		return;
	}

	if (nvlist_add_uint64(ereport, FM_EREPORT_ENA, ena)) {
		atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
	}

	if (nvlist_add_nvlist(ereport, FM_EREPORT_DETECTOR,
	    (nvlist_t *)detector) != 0) {
		atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
	}

	va_start(ap, detector);
	name = va_arg(ap, const char *);
	ret = i_fm_payload_set(ereport, name, ap);
	va_end(ap);

	if (ret)
		atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
}

/*
 * Set up and validate the members of an hc fmri according to:
 *
 *	Member name		Type		Value
 *	===================================================
 *	version			uint8_t		0
 *	auth			nvlist_t	<auth>
 *	hc-name			string		<name>
 *	hc-id			string		<id>
 *
 * Note that auth and hc-id are optional members.
 */

#define	HC_MAXPAIRS	20
#define	HC_MAXNAMELEN	50

static int
fm_fmri_hc_set_common(nvlist_t *fmri, int version, const nvlist_t *auth)
{
	if (version != FM_HC_SCHEME_VERSION) {
		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
		return (0);
	}

	if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0 ||
	    nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC) != 0) {
		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
		return (0);
	}

	if (auth != NULL && nvlist_add_nvlist(fmri, FM_FMRI_AUTHORITY,
	    (nvlist_t *)auth) != 0) {
		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
		return (0);
	}

	return (1);
}

void
fm_fmri_hc_set(nvlist_t *fmri, int version, const nvlist_t *auth,
    nvlist_t *snvl, int npairs, ...)
{
	nv_alloc_t *nva = nvlist_lookup_nv_alloc(fmri);
	nvlist_t *pairs[HC_MAXPAIRS];
	va_list ap;
	int i;

	if (!fm_fmri_hc_set_common(fmri, version, auth))
		return;

	npairs = MIN(npairs, HC_MAXPAIRS);

	va_start(ap, npairs);
	for (i = 0; i < npairs; i++) {
		const char *name = va_arg(ap, const char *);
		uint32_t id = va_arg(ap, uint32_t);
		char idstr[11];

		(void) snprintf(idstr, sizeof (idstr), "%u", id);

		pairs[i] = fm_nvlist_create(nva);
		if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, name) != 0 ||
		    nvlist_add_string(pairs[i], FM_FMRI_HC_ID, idstr) != 0) {
			atomic_inc_64(
			    &erpt_kstat_data.fmri_set_failed.value.ui64);
		}
	}
	va_end(ap);

	if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs, npairs) != 0)
		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);

	for (i = 0; i < npairs; i++)
		fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN);

	if (snvl != NULL) {
		if (nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0) {
			atomic_inc_64(
			    &erpt_kstat_data.fmri_set_failed.value.ui64);
		}
	}
}
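
/*
 * Illustrative sketch (hypothetical caller): an hc FMRI naming a core
 * beneath a chip.  'fmri' is a placeholder nvlist; each (name, id) pair
 * becomes one element of the hc-list, capped at HC_MAXPAIRS entries.
 *
 *	fm_fmri_hc_set(fmri, FM_HC_SCHEME_VERSION, NULL, NULL, 2,
 *	    "chip", 0,
 *	    "core", 3);
 */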

void
fm_fmri_hc_create(nvlist_t *fmri, int version, const nvlist_t *auth,
    nvlist_t *snvl, nvlist_t *bboard, int npairs, ...)
{
	nv_alloc_t *nva = nvlist_lookup_nv_alloc(fmri);
	nvlist_t *pairs[HC_MAXPAIRS];
	nvlist_t **hcl;
	uint_t n;
	int i, j;
	va_list ap;
	char *hcname, *hcid;

	if (!fm_fmri_hc_set_common(fmri, version, auth))
		return;

	/*
	 * copy the bboard nvpairs to the pairs array
	 */
	if (nvlist_lookup_nvlist_array(bboard, FM_FMRI_HC_LIST, &hcl, &n)
	    != 0) {
		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
		return;
	}

	for (i = 0; i < n; i++) {
		if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_NAME,
		    &hcname) != 0) {
			atomic_inc_64(
			    &erpt_kstat_data.fmri_set_failed.value.ui64);
			return;
		}
		if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_ID, &hcid) != 0) {
			atomic_inc_64(
			    &erpt_kstat_data.fmri_set_failed.value.ui64);
			return;
		}

		pairs[i] = fm_nvlist_create(nva);
		if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, hcname) != 0 ||
		    nvlist_add_string(pairs[i], FM_FMRI_HC_ID, hcid) != 0) {
			for (j = 0; j <= i; j++) {
				if (pairs[j] != NULL)
					fm_nvlist_destroy(pairs[j],
					    FM_NVA_RETAIN);
			}
			atomic_inc_64(
			    &erpt_kstat_data.fmri_set_failed.value.ui64);
			return;
		}
	}

	/*
	 * create the pairs from passed in pairs
	 */
	npairs = MIN(npairs, HC_MAXPAIRS);

	va_start(ap, npairs);
	for (i = n; i < npairs + n; i++) {
		const char *name = va_arg(ap, const char *);
		uint32_t id = va_arg(ap, uint32_t);
		char idstr[11];
		(void) snprintf(idstr, sizeof (idstr), "%u", id);
		pairs[i] = fm_nvlist_create(nva);
		if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, name) != 0 ||
		    nvlist_add_string(pairs[i], FM_FMRI_HC_ID, idstr) != 0) {
			for (j = 0; j <= i; j++) {
				if (pairs[j] != NULL)
					fm_nvlist_destroy(pairs[j],
					    FM_NVA_RETAIN);
			}
			atomic_inc_64(
			    &erpt_kstat_data.fmri_set_failed.value.ui64);
			return;
		}
	}
	va_end(ap);

	/*
	 * Create the fmri hc list
	 */
	if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs,
	    npairs + n) != 0) {
		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
		return;
	}

	for (i = 0; i < npairs + n; i++)
		fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN);

	if (snvl != NULL) {
		if (nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0) {
			atomic_inc_64(
			    &erpt_kstat_data.fmri_set_failed.value.ui64);
			return;
		}
	}
}

/*
 * Set up and validate the members of a dev fmri according to:
 *
 *	Member name		Type		Value
 *	====================================================
 *	version			uint8_t		0
 *	auth			nvlist_t	<auth>
 *	devpath			string		<devpath>
 *	[devid]			string		<devid>
 *	[target-port-l0id]	string		<target-port-lun0-id>
 *
 * Note that auth and devid are optional members.
 */
void
fm_fmri_dev_set(nvlist_t *fmri_dev, int version, const nvlist_t *auth,
    const char *devpath, const char *devid, const char *tpl0)
{
	int err = 0;

	if (version != DEV_SCHEME_VERSION0) {
		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
		return;
	}

	err |= nvlist_add_uint8(fmri_dev, FM_VERSION, version);
	err |= nvlist_add_string(fmri_dev, FM_FMRI_SCHEME, FM_FMRI_SCHEME_DEV);

	if (auth != NULL) {
		err |= nvlist_add_nvlist(fmri_dev, FM_FMRI_AUTHORITY,
		    (nvlist_t *)auth);
	}

	err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_PATH, devpath);

	if (devid != NULL)
		err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_ID, devid);

	if (tpl0 != NULL)
		err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_TGTPTLUN0, tpl0);

	if (err)
		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
}
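
/*
 * Illustrative sketch (hypothetical caller): a minimal dev FMRI with the
 * optional devid supplied and no target-port-l0id.  The path and devid
 * strings are placeholders.
 *
 *	fm_fmri_dev_set(fmri, DEV_SCHEME_VERSION0, NULL,
 *	    "/pci@0,0/scsi@3/disk@0,0", "id1,sd@SATA_EXAMPLE", NULL);
 */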

/*
 * Set up and validate the members of a cpu fmri according to:
 *
 *	Member name		Type		Value
 *	====================================================
 *	version			uint8_t		0
 *	auth			nvlist_t	<auth>
 *	cpuid			uint32_t	<cpu_id>
 *	cpumask			uint8_t		<cpu_mask>
 *	serial			uint64_t	<serial_id>
 *
 * Note that auth, cpumask and serial are optional members.
 */
void
fm_fmri_cpu_set(nvlist_t *fmri_cpu, int version, const nvlist_t *auth,
    uint32_t cpu_id, uint8_t *cpu_maskp, const char *serial_idp)
{
	uint64_t *failedp = &erpt_kstat_data.fmri_set_failed.value.ui64;

	if (version < CPU_SCHEME_VERSION1) {
		atomic_inc_64(failedp);
		return;
	}

	if (nvlist_add_uint8(fmri_cpu, FM_VERSION, version) != 0) {
		atomic_inc_64(failedp);
		return;
	}

	if (nvlist_add_string(fmri_cpu, FM_FMRI_SCHEME,
	    FM_FMRI_SCHEME_CPU) != 0) {
		atomic_inc_64(failedp);
		return;
	}

	if (auth != NULL && nvlist_add_nvlist(fmri_cpu, FM_FMRI_AUTHORITY,
	    (nvlist_t *)auth) != 0)
		atomic_inc_64(failedp);

	if (nvlist_add_uint32(fmri_cpu, FM_FMRI_CPU_ID, cpu_id) != 0)
		atomic_inc_64(failedp);

	if (cpu_maskp != NULL && nvlist_add_uint8(fmri_cpu, FM_FMRI_CPU_MASK,
	    *cpu_maskp) != 0)
		atomic_inc_64(failedp);

	if (serial_idp == NULL || nvlist_add_string(fmri_cpu,
	    FM_FMRI_CPU_SERIAL_ID, (char *)serial_idp) != 0)
		atomic_inc_64(failedp);
}

/*
 * Set up and validate the members of a mem fmri according to:
 *
 *	Member name		Type		Value
 *	====================================================
 *	version			uint8_t		0
 *	auth			nvlist_t	<auth>		[optional]
 *	unum			string		<unum>
 *	serial			string		<serial>	[optional*]
 *	offset			uint64_t	<offset>	[optional]
 *
 *	* serial is required if offset is present
 */
void
fm_fmri_mem_set(nvlist_t *fmri, int version, const nvlist_t *auth,
    const char *unum, const char *serial, uint64_t offset)
{
	if (version != MEM_SCHEME_VERSION0) {
		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
		return;
	}

	if (!serial && (offset != (uint64_t)-1)) {
		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
		return;
	}

	if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) {
		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
		return;
	}

	if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_MEM) != 0) {
		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
		return;
	}

	if (auth != NULL) {
		if (nvlist_add_nvlist(fmri, FM_FMRI_AUTHORITY,
		    (nvlist_t *)auth) != 0) {
			atomic_inc_64(
			    &erpt_kstat_data.fmri_set_failed.value.ui64);
		}
	}

	if (nvlist_add_string(fmri, FM_FMRI_MEM_UNUM, unum) != 0) {
		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
	}

	if (serial != NULL) {
		if (nvlist_add_string_array(fmri, FM_FMRI_MEM_SERIAL_ID,
		    (char **)&serial, 1) != 0) {
			atomic_inc_64(
			    &erpt_kstat_data.fmri_set_failed.value.ui64);
		}
		if (offset != (uint64_t)-1 && nvlist_add_uint64(fmri,
		    FM_FMRI_MEM_OFFSET, offset) != 0) {
			atomic_inc_64(
			    &erpt_kstat_data.fmri_set_failed.value.ui64);
		}
	}
}

void
fm_fmri_zfs_set(nvlist_t *fmri, int version, uint64_t pool_guid,
    uint64_t vdev_guid)
{
	if (version != ZFS_SCHEME_VERSION0) {
		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
		return;
	}

	if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) {
		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
		return;
	}

	if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS) != 0) {
		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
		return;
	}

	if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_POOL, pool_guid) != 0) {
		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
	}

	if (vdev_guid != 0) {
		if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_VDEV, vdev_guid) != 0) {
			atomic_inc_64(
			    &erpt_kstat_data.fmri_set_failed.value.ui64);
		}
	}
}

uint64_t
fm_ena_increment(uint64_t ena)
{
	uint64_t new_ena;

	switch (ENA_FORMAT(ena)) {
	case FM_ENA_FMT1:
		new_ena = ena + (1 << ENA_FMT1_GEN_SHFT);
		break;
	case FM_ENA_FMT2:
		new_ena = ena + (1 << ENA_FMT2_GEN_SHFT);
		break;
	default:
		new_ena = 0;
	}

	return (new_ena);
}

uint64_t
fm_ena_generate_cpu(uint64_t timestamp, processorid_t cpuid, uchar_t format)
{
	uint64_t ena = 0;

	switch (format) {
	case FM_ENA_FMT1:
		if (timestamp) {
			ena = (uint64_t)((format & ENA_FORMAT_MASK) |
			    ((cpuid << ENA_FMT1_CPUID_SHFT) &
			    ENA_FMT1_CPUID_MASK) |
			    ((timestamp << ENA_FMT1_TIME_SHFT) &
			    ENA_FMT1_TIME_MASK));
		} else {
			ena = (uint64_t)((format & ENA_FORMAT_MASK) |
			    ((cpuid << ENA_FMT1_CPUID_SHFT) &
			    ENA_FMT1_CPUID_MASK) |
			    ((gethrtime() << ENA_FMT1_TIME_SHFT) &
			    ENA_FMT1_TIME_MASK));
		}
		break;
	case FM_ENA_FMT2:
		ena = (uint64_t)((format & ENA_FORMAT_MASK) |
		    ((timestamp << ENA_FMT2_TIME_SHFT) & ENA_FMT2_TIME_MASK));
		break;
	default:
		break;
	}

	return (ena);
}

uint64_t
fm_ena_generate(uint64_t timestamp, uchar_t format)
{
	uint64_t ena;

	kpreempt_disable();
	ena = fm_ena_generate_cpu(timestamp, getcpuid(), format);
	kpreempt_enable();

	return (ena);
}

uint64_t
fm_ena_generation_get(uint64_t ena)
{
	uint64_t gen;

	switch (ENA_FORMAT(ena)) {
	case FM_ENA_FMT1:
		gen = (ena & ENA_FMT1_GEN_MASK) >> ENA_FMT1_GEN_SHFT;
		break;
	case FM_ENA_FMT2:
		gen = (ena & ENA_FMT2_GEN_MASK) >> ENA_FMT2_GEN_SHFT;
		break;
	default:
		gen = 0;
		break;
	}

	return (gen);
}

uchar_t
fm_ena_format_get(uint64_t ena)
{
	return (ENA_FORMAT(ena));
}

uint64_t
fm_ena_id_get(uint64_t ena)
{
	uint64_t id;

	switch (ENA_FORMAT(ena)) {
	case FM_ENA_FMT1:
		id = (ena & ENA_FMT1_ID_MASK) >> ENA_FMT1_ID_SHFT;
		break;
	case FM_ENA_FMT2:
		id = (ena & ENA_FMT2_ID_MASK) >> ENA_FMT2_ID_SHFT;
		break;
	default:
		id = 0;
	}

	return (id);
}

uint64_t
fm_ena_time_get(uint64_t ena)
{
	uint64_t time;

	switch (ENA_FORMAT(ena)) {
	case FM_ENA_FMT1:
		time = (ena & ENA_FMT1_TIME_MASK) >> ENA_FMT1_TIME_SHFT;
		break;
	case FM_ENA_FMT2:
		time = (ena & ENA_FMT2_TIME_MASK) >> ENA_FMT2_TIME_SHFT;
		break;
	default:
		time = 0;
	}

	return (time);
}

#ifdef _KERNEL
/*
 * Helper function to increment the ereport dropped count.  Used by the
 * event rate limiting code to give feedback to the user about how many
 * events were rate limited by including them in the 'dropped' count.
 */
void
fm_erpt_dropped_increment(void)
{
	atomic_inc_64(&ratelimit_dropped);
}

void
fm_init(void)
{
	zevent_len_cur = 0;
	zevent_flags = 0;

	if (zfs_zevent_len_max == 0)
		zfs_zevent_len_max = ERPT_MAX_ERRS * MAX(max_ncpus, 4);

	/* Initialize zevent allocation and generation kstats */
	fm_ksp = kstat_create("zfs", 0, "fm", "misc", KSTAT_TYPE_NAMED,
	    sizeof (struct erpt_kstat) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);

	if (fm_ksp != NULL) {
		fm_ksp->ks_data = &erpt_kstat_data;
		kstat_install(fm_ksp);
	} else {
		cmn_err(CE_NOTE, "failed to create fm/misc kstat\n");
	}

	mutex_init(&zevent_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&zevent_list, sizeof (zevent_t),
	    offsetof(zevent_t, ev_node));
	cv_init(&zevent_cv, NULL, CV_DEFAULT, NULL);
}

void
fm_fini(void)
{
	int count;

	zfs_zevent_drain_all(&count);

	mutex_enter(&zevent_lock);
	cv_broadcast(&zevent_cv);

	zevent_flags |= ZEVENT_SHUTDOWN;
	while (zevent_waiters > 0) {
		mutex_exit(&zevent_lock);
		schedule();
		mutex_enter(&zevent_lock);
	}
	mutex_exit(&zevent_lock);

	cv_destroy(&zevent_cv);
	list_destroy(&zevent_list);
	mutex_destroy(&zevent_lock);

	if (fm_ksp != NULL) {
		kstat_delete(fm_ksp);
		fm_ksp = NULL;
	}
}
#endif /* _KERNEL */

ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, len_max, INT, ZMOD_RW,
	"Max event queue length");

ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, cols, INT, ZMOD_RW,
	"Max event column width");

ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, console, INT, ZMOD_RW,
	"Log events to the console");